Initial commit

This commit is contained in:
2025-12-14 19:00:23 +03:00
commit 0ab7c64fbc
4 changed files with 219 additions and 0 deletions

37
domain_models.py Normal file
View File

@@ -0,0 +1,37 @@
# [DEF:domain_models:Module]
# @SEMANTICS: data_structures, schemas, pydantic, context_definition
# @PURPOSE: Define strict schemas for Context and Subtitle entities.
# @LAYER: Domain
# @RELATION: READS_FROM -> None
# @PUBLIC_API: TranslationContext, CharacterProfile, SubtitleLine
from typing import List, Dict, Optional
from pydantic import BaseModel, Field
# [DEF:CharacterProfile:DataClass]
# @PURPOSE: Profile of a single speaker: name, narrative role, and speech register,
# used by Pass 2 to keep each character's voice consistent.
class CharacterProfile(BaseModel):
    # Per the field contract, 'Unknown' is the fallback when no name can be inferred.
    name: str = Field(..., description="Character name or 'Unknown'")
    role: str = Field(..., description="Role in the story")
    speech_style: str = Field(..., description="Formal, slang, archaic, etc.")
# [/DEF:CharacterProfile]
# [DEF:TranslationContext:DataClass]
# @PURPOSE: Holds the semantic snapshot of the entire plot (Result of Pass 1).
# NOTE(review): this model is passed as a structured-output schema to the LLM
# (see llm_core.generate_structured), so the Field descriptions are presumably
# part of the prompt contract — do not edit them casually.
class TranslationContext(BaseModel):
    title: str = Field(..., description="Inferred title or topic")
    genre: str = Field(..., description="Genre and atmosphere")
    plot_summary: str = Field(..., description="Brief summary of the content")
    style_guidelines: str = Field(..., description="Instructions for the translator (tone, register)")
    # default_factory avoids the shared-mutable-default pitfall for dict/list fields.
    terminology: Dict[str, str] = Field(default_factory=dict, description="Key terms glossary")
    characters: List[CharacterProfile] = Field(default_factory=list)
# [/DEF:TranslationContext]
# [DEF:SubtitleLine:DataClass]
# @PURPOSE: Atomic unit of subtitle — one timed cue from an SRT file.
class SubtitleLine(BaseModel):
    index: int  # cue number as parsed from the SRT block header
    start_time: str  # raw timestamp string (e.g. "00:01:02,345"); never parsed
    end_time: str  # raw timestamp string, kept verbatim for round-tripping
    original_text: str  # source-language text; the parser collapses newlines to spaces
    # None until Pass 2 fills it in; writers fall back to original_text when None.
    translated_text: Optional[str] = None
# [/DEF:SubtitleLine]
# [/DEF:domain_models]

44
llm_core.py Normal file
View File

@@ -0,0 +1,44 @@
# [DEF:llm_core:Module]
import os
import logging
from typing import Type, TypeVar
from pydantic import BaseModel
# Pylance often fails to resolve dynamic exports in google-generativeai
import google.generativeai as genai # type: ignore
# Module-level logger for this wrapper.
logger = logging.getLogger("LLM_Core")
# Generic type var bound to pydantic models: generate_structured returns an
# instance of whatever schema class the caller passes in.
T = TypeVar("T", bound=BaseModel)
class GeminiProcessor:
    """Thin wrapper around the Gemini SDK.

    Offers two entry points: schema-validated JSON generation
    (``generate_structured``) and plain text generation with a system
    instruction (``generate_text``).
    """

    def __init__(self, model_name: str = "gemini-1.5-flash"):
        key = os.getenv("GOOGLE_API_KEY")
        if not key:
            raise ValueError("[FATAL] GOOGLE_API_KEY not found.")
        # Explicit type ignore for Pylance strict mode
        genai.configure(api_key=key)  # type: ignore
        self.model = genai.GenerativeModel(model_name)  # type: ignore

    def generate_structured(self, prompt: str, content: str, schema: Type[T]) -> T:
        """Generate JSON constrained to *schema* and validate it into a model."""
        logger.info(f"[GeminiProcessor] Structured generation for {schema.__name__}")
        config = genai.GenerationConfig(  # type: ignore
            response_mime_type="application/json",
            response_schema=schema,
        )
        combined = f"{prompt}\n\nINPUT TEXT:\n{content}"
        raw = self.model.generate_content(combined, generation_config=config)
        return schema.model_validate_json(raw.text)

    def generate_text(self, system_instruction: str, user_content: str) -> str:
        """Generate plain text under *system_instruction*.

        A fresh model instance is built because the SDK takes the system
        instruction at construction time, not per request.
        """
        sys_model = genai.GenerativeModel(  # type: ignore
            self.model.model_name,
            system_instruction=system_instruction,
        )
        return sys_model.generate_content(user_content).text
# [/DEF:llm_core]

73
main.py Normal file
View File

@@ -0,0 +1,73 @@
# [DEF:main:Module]
# @PURPOSE: CLI Entry point.
# @RELATION: CALLS -> translation_engine
import sys
import re
from pathlib import Path
from translation_engine import SubtitleOrchestrator
from domain_models import SubtitleLine
# [DEF:parse_srt:Function]
# @PURPOSE: Extract raw text and structure from SRT file.
def parse_srt(file_path: str) -> list[SubtitleLine]:
    """Parse the SRT file at *file_path* into a list of SubtitleLine objects.

    Each cue block (numeric index, timing line, text) yields one line;
    multi-line cue text is collapsed onto a single line.
    """
    with open(file_path, 'r', encoding='utf-8') as src:
        raw = src.read()
    # One match per SRT cue: index, 'HH:MM:SS,mmm --> HH:MM:SS,mmm', then
    # text running up to (but not including) the next blank line.
    block_re = re.compile(r'(\d+)\n(\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3})\n((?:(?!\r?\n\r?\n).)*)', re.DOTALL)
    result: list[SubtitleLine] = []
    for match in block_re.finditer(raw):
        number, timing, body = match.groups()
        start_ts, end_ts = timing.split(' --> ')
        result.append(SubtitleLine(
            index=int(number),
            start_time=start_ts,
            end_time=end_ts,
            # Collapse multi-line cue text into one line.
            original_text=body.replace('\n', ' ').strip(),
        ))
    return result
# [/DEF:parse_srt]
# [/DEF:parse_srt]
# [DEF:save_srt:Function]
def save_srt(lines: list[SubtitleLine], path: str):
    """Write cues back out in SRT format at *path*.

    Cues with no translation fall back to their original text, so the
    output file is always complete.
    """
    with open(path, 'w', encoding='utf-8') as out:
        for cue in lines:
            body = cue.translated_text if cue.translated_text else cue.original_text
            out.write(f"{cue.index}\n{cue.start_time} --> {cue.end_time}\n{body}\n\n")
# [/DEF:save_srt]
if __name__ == "__main__":
    # [STATE:Entry]
    # CLI contract: exactly one argument, the path to the input .srt file.
    if len(sys.argv) < 2:
        print("Usage: python main.py <subs.srt>")
        sys.exit(1)
    input_file = sys.argv[1]
    # Output lands next to the input as '<stem>_ru<suffix>'.
    # NOTE: Path.with_stem requires Python 3.9+.
    output_file = str(Path(input_file).with_stem(Path(input_file).stem + "_ru"))
    engine = SubtitleOrchestrator()
    # 1. Load Data
    subs = parse_srt(input_file)
    # Pass 1 analyzes the whole transcript as a single string.
    full_text = " ".join([s.original_text for s in subs])
    # 2. Pass 1: Analyze
    print("--- PASS 1: Analyzing Context ---")
    context = engine.pass_one_analysis(full_text)
    print(f"Detected Genre: {context.genre}")
    print(f"Characters found: {len(context.characters)}")
    # 3. Pass 2: Translate
    print("--- PASS 2: Translating ---")
    translated_subs = engine.pass_two_translation(subs)
    # 4. Save
    save_srt(translated_subs, output_file)
    print(f"Saved to {output_file}")
    # [STATE:Exit]
# [/DEF:main]

65
translation_engine.py Normal file
View File

@@ -0,0 +1,65 @@
# [DEF:translation_engine:Module]
import logging
from typing import List
# Local imports - require files to be in the same folder
try:
    from domain_models import TranslationContext, SubtitleLine
    from llm_core import GeminiProcessor
except ImportError as e:
    # Chain the original exception so the actual failing import stays visible.
    raise ImportError(f"Failed to import local modules: {e}. Ensure domain_models.py and llm_core.py exist.") from e
logger = logging.getLogger("Engine")
class SubtitleOrchestrator:
    """Two-pass subtitle translation.

    Pass 1 builds a TranslationContext from the full transcript; Pass 2
    translates the subtitle lines in chunks, steered by that context.
    """

    def __init__(self):
        self.llm = GeminiProcessor()
        # Populated by pass_one_analysis(); None until then.  The string
        # annotation records the honest Optional type without being
        # evaluated at runtime (original lied with ': TranslationContext').
        self.context: "TranslationContext | None" = None

    def pass_one_analysis(self, full_text: str) -> TranslationContext:
        """Analyze *full_text* and cache the resulting context on self."""
        logger.info("[PassOne] Starting Analysis...")
        prompt = "ACT AS: Narrative Analyst. Identify Genre, Tone, Characters, and Plot."
        self.context = self.llm.generate_structured(prompt, full_text, TranslationContext)
        return self.context

    def pass_two_translation(self, subtitle_lines: List[SubtitleLine], chunk_size: int = 20) -> List[SubtitleLine]:
        """Translate lines in chunks of *chunk_size*.

        Mutates each line's ``translated_text`` in place and returns the
        same objects.  Lines the model skips (or whole chunks that fail)
        keep their original text as a best-effort fallback.

        Raises:
            RuntimeError: if pass_one_analysis() has not been run first.
        """
        if self.context is None:
            raise RuntimeError("Context must be analyzed before translation.")
        # Lazy %-style args: formatting only happens if the record is emitted.
        logger.info("[PassTwo] Translating %d lines.", len(subtitle_lines))
        # Serialize context for the LLM (hoisted: invariant across chunks).
        char_desc = "\n".join(f"- {c.name}: {c.role}, {c.speech_style}" for c in self.context.characters)
        system_instruction = (
            f"CONTEXT:\nTitle: {self.context.title}\nGenre: {self.context.genre}\n"
            f"Style: {self.context.style_guidelines}\n\nCHARACTERS:\n{char_desc}\n\n"
            "TASK: Translate subtitles to Russian. Format: 'ID || Text'. Maintain timing."
        )
        translated_lines: List[SubtitleLine] = []
        for i in range(0, len(subtitle_lines), chunk_size):
            chunk = subtitle_lines[i : i + chunk_size]
            input_text = "\n".join(f"{sub.index} || {sub.original_text}" for sub in chunk)
            try:
                response_text = self.llm.generate_text(system_instruction, input_text)
                mapping = self._parse_response(response_text)
                for sub in chunk:
                    # Fall back to the source text for any line the model skipped.
                    sub.translated_text = mapping.get(sub.index, sub.original_text)
            except Exception as e:
                # Best-effort: a failed chunk ships untranslated rather than aborting.
                logger.error("Chunk failed: %s", e)
                for sub in chunk:
                    sub.translated_text = sub.original_text  # Fallback
            translated_lines.extend(chunk)
        return translated_lines

    @staticmethod
    def _parse_response(response_text: str) -> dict:
        """Parse "ID || Text" reply lines into an {id: text} mapping.

        Lines that do not match the expected format are silently ignored.
        """
        mapping: dict = {}
        for line in response_text.strip().split('\n'):
            if " || " in line:
                parts = line.split(" || ", 1)
                if parts[0].strip().isdigit():
                    mapping[int(parts[0].strip())] = parts[1].strip()
        return mapping
# [/DEF:translation_engine]