commit 0ab7c64fbc841d82465f0d784ff4115ac38f5718
Author: busya
Date:   Sun Dec 14 19:00:23 2025 +0300

    init commit

diff --git a/domain_models.py b/domain_models.py
new file mode 100644
index 0000000..331ee77
--- /dev/null
+++ b/domain_models.py
@@ -0,0 +1,37 @@
+# [DEF:domain_models:Module]
+# @SEMANTICS: data_structures, schemas, pydantic, context_definition
+# @PURPOSE: Define strict schemas for Context and Subtitle entities.
+# @LAYER: Domain
+# @RELATION: READS_FROM -> None
+# @PUBLIC_API: TranslationContext, CharacterProfile, SubtitleLine
+
+from typing import List, Dict, Optional
+from pydantic import BaseModel, Field
+
+# [DEF:TranslationContext:DataClass]
+# @PURPOSE: Holds the semantic snapshot of the entire plot (Result of Pass 1)
+class CharacterProfile(BaseModel):
+    name: str = Field(..., description="Character name or 'Unknown'")
+    role: str = Field(..., description="Role in the story")
+    speech_style: str = Field(..., description="Formal, slang, archaic, etc.")
+
+class TranslationContext(BaseModel):
+    title: str = Field(..., description="Inferred title or topic")
+    genre: str = Field(..., description="Genre and atmosphere")
+    plot_summary: str = Field(..., description="Brief summary of the content")
+    style_guidelines: str = Field(..., description="Instructions for the translator (tone, register)")
+    terminology: Dict[str, str] = Field(default_factory=dict, description="Key terms glossary")
+    characters: List[CharacterProfile] = Field(default_factory=list)
+# [/DEF:TranslationContext]
+
+# [DEF:SubtitleLine:DataClass]
+# @PURPOSE: Atomic unit of subtitle
+class SubtitleLine(BaseModel):
+    index: int
+    start_time: str
+    end_time: str
+    original_text: str
+    translated_text: Optional[str] = None
+# [/DEF:SubtitleLine]
+
+# [/DEF:domain_models]
\ No newline at end of file
diff --git a/llm_core.py b/llm_core.py
new file mode 100644
index 0000000..6fee5cc
--- /dev/null
+++ b/llm_core.py
@@ -0,0 +1,44 @@
+# [DEF:llm_core:Module]
+import os
+import logging
+from typing import Type, TypeVar
+from pydantic import BaseModel
+
+# Pylance often fails to resolve dynamic exports in google-generativeai
+import google.generativeai as genai  # type: ignore
+
+logger = logging.getLogger("LLM_Core")
+T = TypeVar("T", bound=BaseModel)
+
+class GeminiProcessor:
+    def __init__(self, model_name: str = "gemini-1.5-flash"):
+        api_key = os.getenv("GOOGLE_API_KEY")
+        if not api_key:
+            raise ValueError("[FATAL] GOOGLE_API_KEY not found.")
+
+        # Explicit type ignore for Pylance strict mode
+        genai.configure(api_key=api_key)  # type: ignore
+        self.model = genai.GenerativeModel(model_name)  # type: ignore
+
+    def generate_structured(self, prompt: str, content: str, schema: Type[T]) -> T:
+        logger.info(f"[GeminiProcessor] Structured generation for {schema.__name__}")
+        full_prompt = f"{prompt}\n\nINPUT TEXT:\n{content}"
+
+        response = self.model.generate_content(
+            full_prompt,
+            generation_config=genai.GenerationConfig(  # type: ignore
+                response_mime_type="application/json",
+                response_schema=schema
+            )
+        )
+        return schema.model_validate_json(response.text)
+
+    def generate_text(self, system_instruction: str, user_content: str) -> str:
+        # Re-instantiate model with system instruction
+        model_w_sys = genai.GenerativeModel(  # type: ignore
+            self.model.model_name,
+            system_instruction=system_instruction
+        )
+        response = model_w_sys.generate_content(user_content)
+        return response.text
+# [/DEF:llm_core]
\ No newline at end of file
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..3597063
--- /dev/null
+++ b/main.py
@@ -0,0 +1,73 @@
+# [DEF:main:Module]
+# @PURPOSE: CLI Entry point.
+# @RELATION: CALLS -> translation_engine
+
+import sys
+import re
+from pathlib import Path
+from translation_engine import SubtitleOrchestrator
+from domain_models import SubtitleLine
+
+# [DEF:parse_srt:Function]
+# @PURPOSE: Extract raw text and structure from SRT file.
+def parse_srt(file_path: str) -> list[SubtitleLine]:
+    with open(file_path, 'r', encoding='utf-8') as f:
+        content = f.read()
+
+    # Regex to capture standard SRT blocks
+    pattern = re.compile(r'(\d+)\n(\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3})\n((?:(?!\r?\n\r?\n).)*)', re.DOTALL)
+    matches = pattern.findall(content)
+
+    lines = []
+    for idx, times, text in matches:
+        start, end = times.split(' --> ')
+        # Clean text
+        clean_text = text.replace('\n', ' ').strip()
+        lines.append(SubtitleLine(
+            index=int(idx),
+            start_time=start,
+            end_time=end,
+            original_text=clean_text
+        ))
+    return lines
+# [/DEF:parse_srt]
+
+# [DEF:save_srt:Function]
+def save_srt(lines: list[SubtitleLine], path: str):
+    with open(path, 'w', encoding='utf-8') as f:
+        for line in lines:
+            text = line.translated_text if line.translated_text else line.original_text
+            f.write(f"{line.index}\n{line.start_time} --> {line.end_time}\n{text}\n\n")
+# [/DEF:save_srt]
+
+if __name__ == "__main__":
+    # [STATE:Entry]
+    if len(sys.argv) < 2:
+        print("Usage: python main.py <input_file.srt>")
+        sys.exit(1)
+
+    input_file = sys.argv[1]
+    output_file = str(Path(input_file).with_stem(Path(input_file).stem + "_ru"))
+
+    engine = SubtitleOrchestrator()
+
+    # 1. Load Data
+    subs = parse_srt(input_file)
+    full_text = " ".join([s.original_text for s in subs])
+
+    # 2. Pass 1: Analyze
+    print("--- PASS 1: Analyzing Context ---")
+    context = engine.pass_one_analysis(full_text)
+    print(f"Detected Genre: {context.genre}")
+    print(f"Characters found: {len(context.characters)}")
+
+    # 3. Pass 2: Translate
+    print("--- PASS 2: Translating ---")
+    translated_subs = engine.pass_two_translation(subs)
+
+    # 4. Save
+    save_srt(translated_subs, output_file)
+    print(f"Saved to {output_file}")
+    # [STATE:Exit]
+
+# [/DEF:main]
\ No newline at end of file
diff --git a/translation_engine.py b/translation_engine.py
new file mode 100644
index 0000000..c57bf4b
--- /dev/null
+++ b/translation_engine.py
@@ -0,0 +1,65 @@
+# [DEF:translation_engine:Module]
+import logging
+from typing import List, Optional
+
+# Local imports - require files to be in the same folder
+try:
+    from domain_models import TranslationContext, SubtitleLine
+    from llm_core import GeminiProcessor
+except ImportError as e:
+    raise ImportError(f"Failed to import local modules: {e}. Ensure domain_models.py and llm_core.py exist.") from e
+
+logger = logging.getLogger("Engine")
+
+class SubtitleOrchestrator:
+    def __init__(self):
+        self.llm = GeminiProcessor()
+        self.context: Optional[TranslationContext] = None
+
+    def pass_one_analysis(self, full_text: str) -> TranslationContext:
+        logger.info("[PassOne] Starting Analysis...")
+        prompt = "ACT AS: Narrative Analyst. Identify Genre, Tone, Characters, and Plot."
+        self.context = self.llm.generate_structured(prompt, full_text, TranslationContext)
+        return self.context
+
+    def pass_two_translation(self, subtitle_lines: List[SubtitleLine], chunk_size: int = 20) -> List[SubtitleLine]:
+        if not self.context:
+            raise RuntimeError("Context must be analyzed before translation.")
+
+        logger.info(f"[PassTwo] Translating {len(subtitle_lines)} lines.")
+
+        # Serialize context for the LLM
+        char_desc = "\n".join([f"- {c.name}: {c.role}, {c.speech_style}" for c in self.context.characters])
+        system_instruction = (
+            f"CONTEXT:\nTitle: {self.context.title}\nGenre: {self.context.genre}\n"
+            f"Style: {self.context.style_guidelines}\n\nCHARACTERS:\n{char_desc}\n\n"
+            "TASK: Translate subtitles to Russian. Format: 'ID || Text'. Maintain timing."
+        )
+
+        translated_lines = []
+        for i in range(0, len(subtitle_lines), chunk_size):
+            chunk = subtitle_lines[i : i + chunk_size]
+            input_text = "\n".join([f"{sub.index} || {sub.original_text}" for sub in chunk])
+
+            try:
+                response_text = self.llm.generate_text(system_instruction, input_text)
+
+                # Parsing logic
+                mapping = {}
+                for line in response_text.strip().split('\n'):
+                    if " || " in line:
+                        parts = line.split(" || ", 1)
+                        if parts[0].strip().isdigit():
+                            mapping[int(parts[0].strip())] = parts[1].strip()
+
+                for sub in chunk:
+                    sub.translated_text = mapping.get(sub.index, sub.original_text)
+
+            except Exception as e:
+                logger.error(f"Chunk failed: {e}")
+                for sub in chunk: sub.translated_text = sub.original_text  # Fallback
+
+            translated_lines.extend(chunk)
+
+        return translated_lines
+# [/DEF:translation_engine]
\ No newline at end of file
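
Note on the 'ID || Text' exchange format: pass_two_translation sends each chunk to the model as one "index || original text" pair per line and expects the same shape back. The sketch below exercises that protocol in isolation; the standalone parse_llm_chunk helper and the sample Russian lines are illustrative only and do not appear in the patch (the real parsing loop lives inline in SubtitleOrchestrator.pass_two_translation).

    # Hypothetical helper mirroring the inline parsing loop in translation_engine.py.
    def parse_llm_chunk(response_text: str) -> dict[int, str]:
        """Map subtitle indices to translated lines from an 'ID || Text' response."""
        mapping: dict[int, str] = {}
        for line in response_text.strip().split("\n"):
            if " || " in line:
                idx, text = line.split(" || ", 1)
                if idx.strip().isdigit():
                    mapping[int(idx.strip())] = text.strip()
        return mapping

    # Chunk sent to the model (illustrative):
    #   12 || Where are we going?
    #   13 || You'll see soon enough.
    # Expected model response, and how it is consumed:
    sample = "12 || Куда мы едем?\n13 || Скоро увидишь."
    assert parse_llm_chunk(sample) == {12: "Куда мы едем?", 13: "Скоро увидишь."}

Lines missing from the response fall back to the original text via mapping.get(sub.index, sub.original_text), so a partially malformed chunk degrades to untranslated subtitles rather than failing the run.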