Initial commit

This commit is contained in:
2025-12-14 19:00:23 +03:00
commit 0ab7c64fbc
4 changed files with 219 additions and 0 deletions

37
domain_models.py Normal file
View File

@@ -0,0 +1,37 @@
# [DEF:domain_models:Module]
# @SEMANTICS: data_structures, schemas, pydantic, context_definition
# @PURPOSE: Define strict schemas for Context and Subtitle entities.
# @LAYER: Domain
# @RELATION: READS_FROM -> None
# @PUBLIC_API: TranslationContext, CharacterProfile, SubtitleLine
from typing import List, Dict, Optional
from pydantic import BaseModel, Field
# [DEF:CharacterProfile:DataClass]
# @PURPOSE: Profile of a single speaker: name, narrative role, and speech register,
# used by Pass 2 to keep each character's voice consistent.
class CharacterProfile(BaseModel):
    # Per the field contract, 'Unknown' is the fallback when no name can be inferred.
    name: str = Field(..., description="Character name or 'Unknown'")
    role: str = Field(..., description="Role in the story")
    speech_style: str = Field(..., description="Formal, slang, archaic, etc.")
# [/DEF:CharacterProfile]
# [DEF:TranslationContext:DataClass]
# @PURPOSE: Holds the semantic snapshot of the entire plot (Result of Pass 1).
# NOTE(review): this model is passed as a structured-output schema to the LLM
# (see llm_core.generate_structured), so the Field descriptions are presumably
# part of the prompt contract — do not edit them casually.
class TranslationContext(BaseModel):
    title: str = Field(..., description="Inferred title or topic")
    genre: str = Field(..., description="Genre and atmosphere")
    plot_summary: str = Field(..., description="Brief summary of the content")
    style_guidelines: str = Field(..., description="Instructions for the translator (tone, register)")
    # default_factory avoids the shared-mutable-default pitfall for dict/list fields.
    terminology: Dict[str, str] = Field(default_factory=dict, description="Key terms glossary")
    characters: List[CharacterProfile] = Field(default_factory=list)
# [/DEF:TranslationContext]
# [DEF:SubtitleLine:DataClass]
# @PURPOSE: Atomic unit of subtitle — one timed cue from an SRT file.
class SubtitleLine(BaseModel):
    index: int  # cue number as parsed from the SRT block header
    start_time: str  # raw timestamp string (e.g. "00:01:02,345"); never parsed
    end_time: str  # raw timestamp string, kept verbatim for round-tripping
    original_text: str  # source-language text; the parser collapses newlines to spaces
    # None until Pass 2 fills it in; writers fall back to original_text when None.
    translated_text: Optional[str] = None
# [/DEF:SubtitleLine]
# [/DEF:domain_models]

44
llm_core.py Normal file
View File

@@ -0,0 +1,44 @@
# [DEF:llm_core:Module]
import os
import logging
from typing import Type, TypeVar
from pydantic import BaseModel
# Pylance often fails to resolve dynamic exports in google-generativeai
import google.generativeai as genai # type: ignore
# Module-level logger for this wrapper.
logger = logging.getLogger("LLM_Core")
# Generic type var bound to pydantic models: generate_structured returns an
# instance of whatever schema class the caller passes in.
T = TypeVar("T", bound=BaseModel)
class GeminiProcessor:
    """Thin wrapper around the Gemini SDK.

    Offers two entry points: schema-validated JSON generation
    (``generate_structured``) and plain text generation with a system
    instruction (``generate_text``).
    """

    def __init__(self, model_name: str = "gemini-1.5-flash"):
        key = os.getenv("GOOGLE_API_KEY")
        if not key:
            raise ValueError("[FATAL] GOOGLE_API_KEY not found.")
        # Explicit type ignore for Pylance strict mode
        genai.configure(api_key=key)  # type: ignore
        self.model = genai.GenerativeModel(model_name)  # type: ignore

    def generate_structured(self, prompt: str, content: str, schema: Type[T]) -> T:
        """Generate JSON constrained to *schema* and validate it into a model."""
        logger.info(f"[GeminiProcessor] Structured generation for {schema.__name__}")
        config = genai.GenerationConfig(  # type: ignore
            response_mime_type="application/json",
            response_schema=schema,
        )
        combined = f"{prompt}\n\nINPUT TEXT:\n{content}"
        raw = self.model.generate_content(combined, generation_config=config)
        return schema.model_validate_json(raw.text)

    def generate_text(self, system_instruction: str, user_content: str) -> str:
        """Generate plain text under *system_instruction*.

        A fresh model instance is built because the SDK takes the system
        instruction at construction time, not per request.
        """
        sys_model = genai.GenerativeModel(  # type: ignore
            self.model.model_name,
            system_instruction=system_instruction,
        )
        return sys_model.generate_content(user_content).text
# [/DEF:llm_core]

73
main.py Normal file
View File

@@ -0,0 +1,73 @@
# [DEF:main:Module]
# @PURPOSE: CLI Entry point.
# @RELATION: CALLS -> translation_engine
import sys
import re
from pathlib import Path
from translation_engine import SubtitleOrchestrator
from domain_models import SubtitleLine
# [DEF:parse_srt:Function]
# @PURPOSE: Extract raw text and structure from SRT file.
def parse_srt(file_path: str) -> list[SubtitleLine]:
    """Parse the SRT file at *file_path* into a list of SubtitleLine objects.

    Each cue block (numeric index, timing line, text) yields one line;
    multi-line cue text is collapsed onto a single line.
    """
    with open(file_path, 'r', encoding='utf-8') as src:
        raw = src.read()
    # One match per SRT cue: index, 'HH:MM:SS,mmm --> HH:MM:SS,mmm', then
    # text running up to (but not including) the next blank line.
    block_re = re.compile(r'(\d+)\n(\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3})\n((?:(?!\r?\n\r?\n).)*)', re.DOTALL)
    result: list[SubtitleLine] = []
    for match in block_re.finditer(raw):
        number, timing, body = match.groups()
        start_ts, end_ts = timing.split(' --> ')
        result.append(SubtitleLine(
            index=int(number),
            start_time=start_ts,
            end_time=end_ts,
            # Collapse multi-line cue text into one line.
            original_text=body.replace('\n', ' ').strip(),
        ))
    return result
# [/DEF:parse_srt]
# [/DEF:parse_srt]
# [DEF:save_srt:Function]
def save_srt(lines: list[SubtitleLine], path: str):
    """Write cues back out in SRT format at *path*.

    Cues with no translation fall back to their original text, so the
    output file is always complete.
    """
    with open(path, 'w', encoding='utf-8') as out:
        for cue in lines:
            body = cue.translated_text if cue.translated_text else cue.original_text
            out.write(f"{cue.index}\n{cue.start_time} --> {cue.end_time}\n{body}\n\n")
# [/DEF:save_srt]
if __name__ == "__main__":
    # [STATE:Entry]
    # CLI contract: exactly one argument, the path to the input .srt file.
    if len(sys.argv) < 2:
        print("Usage: python main.py <subs.srt>")
        sys.exit(1)
    input_file = sys.argv[1]
    # Output lands next to the input as '<stem>_ru<suffix>'.
    # NOTE: Path.with_stem requires Python 3.9+.
    output_file = str(Path(input_file).with_stem(Path(input_file).stem + "_ru"))
    engine = SubtitleOrchestrator()
    # 1. Load Data
    subs = parse_srt(input_file)
    # Pass 1 analyzes the whole transcript as a single string.
    full_text = " ".join([s.original_text for s in subs])
    # 2. Pass 1: Analyze
    print("--- PASS 1: Analyzing Context ---")
    context = engine.pass_one_analysis(full_text)
    print(f"Detected Genre: {context.genre}")
    print(f"Characters found: {len(context.characters)}")
    # 3. Pass 2: Translate
    print("--- PASS 2: Translating ---")
    translated_subs = engine.pass_two_translation(subs)
    # 4. Save
    save_srt(translated_subs, output_file)
    print(f"Saved to {output_file}")
    # [STATE:Exit]
# [/DEF:main]

65
translation_engine.py Normal file
View File

@@ -0,0 +1,65 @@
# [DEF:translation_engine:Module]
import logging
from typing import List
# Local imports - require files to be in the same folder
try:
    from domain_models import TranslationContext, SubtitleLine
    from llm_core import GeminiProcessor
except ImportError as e:
    # Chain the original exception so the actual failing import stays visible.
    raise ImportError(f"Failed to import local modules: {e}. Ensure domain_models.py and llm_core.py exist.") from e
logger = logging.getLogger("Engine")
class SubtitleOrchestrator:
    """Two-pass subtitle translation.

    Pass 1 builds a TranslationContext from the full transcript; Pass 2
    translates the subtitle lines in chunks, steered by that context.
    """

    def __init__(self):
        self.llm = GeminiProcessor()
        # Populated by pass_one_analysis(); None until then.  The string
        # annotation records the honest Optional type without being
        # evaluated at runtime (original lied with ': TranslationContext').
        self.context: "TranslationContext | None" = None

    def pass_one_analysis(self, full_text: str) -> TranslationContext:
        """Analyze *full_text* and cache the resulting context on self."""
        logger.info("[PassOne] Starting Analysis...")
        prompt = "ACT AS: Narrative Analyst. Identify Genre, Tone, Characters, and Plot."
        self.context = self.llm.generate_structured(prompt, full_text, TranslationContext)
        return self.context

    def pass_two_translation(self, subtitle_lines: List[SubtitleLine], chunk_size: int = 20) -> List[SubtitleLine]:
        """Translate lines in chunks of *chunk_size*.

        Mutates each line's ``translated_text`` in place and returns the
        same objects.  Lines the model skips (or whole chunks that fail)
        keep their original text as a best-effort fallback.

        Raises:
            RuntimeError: if pass_one_analysis() has not been run first.
        """
        if self.context is None:
            raise RuntimeError("Context must be analyzed before translation.")
        # Lazy %-style args: formatting only happens if the record is emitted.
        logger.info("[PassTwo] Translating %d lines.", len(subtitle_lines))
        # Serialize context for the LLM (hoisted: invariant across chunks).
        char_desc = "\n".join(f"- {c.name}: {c.role}, {c.speech_style}" for c in self.context.characters)
        system_instruction = (
            f"CONTEXT:\nTitle: {self.context.title}\nGenre: {self.context.genre}\n"
            f"Style: {self.context.style_guidelines}\n\nCHARACTERS:\n{char_desc}\n\n"
            "TASK: Translate subtitles to Russian. Format: 'ID || Text'. Maintain timing."
        )
        translated_lines: List[SubtitleLine] = []
        for i in range(0, len(subtitle_lines), chunk_size):
            chunk = subtitle_lines[i : i + chunk_size]
            input_text = "\n".join(f"{sub.index} || {sub.original_text}" for sub in chunk)
            try:
                response_text = self.llm.generate_text(system_instruction, input_text)
                mapping = self._parse_response(response_text)
                for sub in chunk:
                    # Fall back to the source text for any line the model skipped.
                    sub.translated_text = mapping.get(sub.index, sub.original_text)
            except Exception as e:
                # Best-effort: a failed chunk ships untranslated rather than aborting.
                logger.error("Chunk failed: %s", e)
                for sub in chunk:
                    sub.translated_text = sub.original_text  # Fallback
            translated_lines.extend(chunk)
        return translated_lines

    @staticmethod
    def _parse_response(response_text: str) -> dict:
        """Parse "ID || Text" reply lines into an {id: text} mapping.

        Lines that do not match the expected format are silently ignored.
        """
        mapping: dict = {}
        for line in response_text.strip().split('\n'):
            if " || " in line:
                parts = line.split(" || ", 1)
                if parts[0].strip().isdigit():
                    mapping[int(parts[0].strip())] = parts[1].strip()
        return mapping
# [/DEF:translation_engine]