init commit

domain_models.py (new file)
# [DEF:domain_models:Module]
# @SEMANTICS: data_structures, schemas, pydantic, context_definition
# @PURPOSE: Define strict schemas for Context and Subtitle entities.
# @LAYER: Domain
# @RELATION: READS_FROM -> None
# @PUBLIC_API: TranslationContext, CharacterProfile, SubtitleLine

from typing import Dict, List, Optional

from pydantic import BaseModel, Field


# [DEF:CharacterProfile:DataClass]
# @PURPOSE: Describes one speaker so Pass 2 can keep each voice consistent.
class CharacterProfile(BaseModel):
    name: str = Field(..., description="Character name or 'Unknown'")
    role: str = Field(..., description="Role in the story")
    speech_style: str = Field(..., description="Formal, slang, archaic, etc.")
# [/DEF:CharacterProfile]


# [DEF:TranslationContext:DataClass]
# @PURPOSE: Holds the semantic snapshot of the entire plot (result of Pass 1).
class TranslationContext(BaseModel):
    title: str = Field(..., description="Inferred title or topic")
    genre: str = Field(..., description="Genre and atmosphere")
    plot_summary: str = Field(..., description="Brief summary of the content")
    style_guidelines: str = Field(..., description="Instructions for the translator (tone, register)")
    terminology: Dict[str, str] = Field(default_factory=dict, description="Key terms glossary")
    characters: List[CharacterProfile] = Field(default_factory=list)
# [/DEF:TranslationContext]


# [DEF:SubtitleLine:DataClass]
# @PURPOSE: Atomic unit of a subtitle file; translated_text is filled in by Pass 2.
class SubtitleLine(BaseModel):
    index: int
    start_time: str
    end_time: str
    original_text: str
    translated_text: Optional[str] = None
# [/DEF:SubtitleLine]


# [/DEF:domain_models]
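
For reference, a minimal sketch of how these schemas are used downstream (hypothetical sample values, not part of the commit):

# Usage sketch (hypothetical data): build the Pass 1 context and one subtitle line,
# then round-trip through pydantic v2 JSON serialization.
from domain_models import CharacterProfile, SubtitleLine, TranslationContext

ctx = TranslationContext(
    title="Night Train",
    genre="Drama",
    plot_summary="Two strangers meet on a night train.",
    style_guidelines="Neutral register, short sentences.",
    terminology={"conductor": "проводник"},
    characters=[CharacterProfile(name="Anna", role="Protagonist", speech_style="Formal")],
)
line = SubtitleLine(index=1, start_time="00:00:01,000",
                    end_time="00:00:03,000", original_text="Good evening.")
restored = TranslationContext.model_validate_json(ctx.model_dump_json())
assert restored == ctx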

llm_core.py (new file)
# [DEF:llm_core:Module]
import os
import logging
from typing import Type, TypeVar

from pydantic import BaseModel

# Pylance often fails to resolve dynamic exports in google-generativeai
import google.generativeai as genai  # type: ignore

logger = logging.getLogger("LLM_Core")
T = TypeVar("T", bound=BaseModel)


class GeminiProcessor:
    def __init__(self, model_name: str = "gemini-1.5-flash"):
        api_key = os.getenv("GOOGLE_API_KEY")
        if not api_key:
            raise ValueError("[FATAL] GOOGLE_API_KEY not found.")

        # Explicit type ignore for Pylance strict mode
        genai.configure(api_key=api_key)  # type: ignore
        self.model = genai.GenerativeModel(model_name)  # type: ignore

    def generate_structured(self, prompt: str, content: str, schema: Type[T]) -> T:
        logger.info(f"[GeminiProcessor] Structured generation for {schema.__name__}")
        full_prompt = f"{prompt}\n\nINPUT TEXT:\n{content}"

        response = self.model.generate_content(
            full_prompt,
            generation_config=genai.GenerationConfig(  # type: ignore
                response_mime_type="application/json",
                response_schema=schema
            )
        )
        return schema.model_validate_json(response.text)

    def generate_text(self, system_instruction: str, user_content: str) -> str:
        # Re-instantiate model with system instruction
        model_w_sys = genai.GenerativeModel(  # type: ignore
            self.model.model_name,
            system_instruction=system_instruction
        )
        response = model_w_sys.generate_content(user_content)
        return response.text
# [/DEF:llm_core]
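
A usage sketch for the processor (assumes GOOGLE_API_KEY is set; this performs a live Gemini call, so it is illustrative rather than a test):

# Sketch: one structured Pass 1 call (live API; requires GOOGLE_API_KEY).
from llm_core import GeminiProcessor
from domain_models import TranslationContext

proc = GeminiProcessor()  # defaults to gemini-1.5-flash
ctx = proc.generate_structured(
    prompt="ACT AS: Narrative Analyst. Identify Genre, Tone, Characters, and Plot.",
    content="INT. TRAIN - NIGHT. Two strangers share a compartment...",
    schema=TranslationContext,
)
print(ctx.genre, len(ctx.characters))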

main.py (new file)
# [DEF:main:Module]
# @PURPOSE: CLI entry point.
# @RELATION: CALLS -> translation_engine

import sys
import re
from pathlib import Path
from translation_engine import SubtitleOrchestrator
from domain_models import SubtitleLine


# [DEF:parse_srt:Function]
# @PURPOSE: Extract raw text and structure from an SRT file.
def parse_srt(file_path: str) -> list[SubtitleLine]:
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Regex to capture standard SRT blocks (\r?\n tolerates CRLF files)
    pattern = re.compile(
        r'(\d+)\r?\n(\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3})\r?\n'
        r'((?:(?!\r?\n\r?\n).)*)',
        re.DOTALL
    )
    matches = pattern.findall(content)

    lines = []
    for idx, times, text in matches:
        start, end = times.split(' --> ')
        # Collapse multi-line cues into one line and drop stray carriage returns
        clean_text = text.replace('\r', '').replace('\n', ' ').strip()
        lines.append(SubtitleLine(
            index=int(idx),
            start_time=start,
            end_time=end,
            original_text=clean_text
        ))
    return lines
# [/DEF:parse_srt]


# [DEF:save_srt:Function]
def save_srt(lines: list[SubtitleLine], path: str):
    with open(path, 'w', encoding='utf-8') as f:
        for line in lines:
            text = line.translated_text if line.translated_text else line.original_text
            f.write(f"{line.index}\n{line.start_time} --> {line.end_time}\n{text}\n\n")
# [/DEF:save_srt]


if __name__ == "__main__":
    # [STATE:Entry]
    if len(sys.argv) < 2:
        print("Usage: python main.py <subs.srt>")
        sys.exit(1)

    input_file = sys.argv[1]
    # with_stem requires Python 3.9+
    output_file = str(Path(input_file).with_stem(Path(input_file).stem + "_ru"))

    engine = SubtitleOrchestrator()

    # 1. Load data
    subs = parse_srt(input_file)
    full_text = " ".join([s.original_text for s in subs])

    # 2. Pass 1: Analyze
    print("--- PASS 1: Analyzing Context ---")
    context = engine.pass_one_analysis(full_text)
    print(f"Detected Genre: {context.genre}")
    print(f"Characters found: {len(context.characters)}")

    # 3. Pass 2: Translate
    print("--- PASS 2: Translating ---")
    translated_subs = engine.pass_two_translation(subs)

    # 4. Save
    save_srt(translated_subs, output_file)
    print(f"Saved to {output_file}")
    # [STATE:Exit]

# [/DEF:main]
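
The parser and writer can be checked offline; a minimal round-trip sketch (hypothetical two-cue sample, no LLM call, though importing main still requires google-generativeai to be installed):

# Sketch: parse_srt/save_srt round trip on a temporary file.
import os
import tempfile
from main import parse_srt, save_srt

sample = (
    "1\n00:00:01,000 --> 00:00:02,500\nHello there.\n\n"
    "2\n00:00:03,000 --> 00:00:04,000\nGeneral Kenobi!\n\n"
)
with tempfile.NamedTemporaryFile("w", suffix=".srt", delete=False, encoding="utf-8") as f:
    f.write(sample)
    path = f.name

subs = parse_srt(path)
assert [s.index for s in subs] == [1, 2]
assert subs[0].original_text == "Hello there."
save_srt(subs, path)  # translated_text is None, so originals are written back
os.unlink(path)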

translation_engine.py (new file)
# [DEF:translation_engine:Module]
import logging
from typing import List, Optional

# Local imports - require files to be in the same folder
try:
    from domain_models import TranslationContext, SubtitleLine
    from llm_core import GeminiProcessor
except ImportError as e:
    raise ImportError(
        f"Failed to import local modules: {e}. "
        "Ensure domain_models.py and llm_core.py exist."
    ) from e

logger = logging.getLogger("Engine")


class SubtitleOrchestrator:
    def __init__(self):
        self.llm = GeminiProcessor()
        self.context: Optional[TranslationContext] = None

    def pass_one_analysis(self, full_text: str) -> TranslationContext:
        logger.info("[PassOne] Starting Analysis...")
        prompt = "ACT AS: Narrative Analyst. Identify Genre, Tone, Characters, and Plot."
        self.context = self.llm.generate_structured(prompt, full_text, TranslationContext)
        return self.context

    def pass_two_translation(self, subtitle_lines: List[SubtitleLine], chunk_size: int = 20) -> List[SubtitleLine]:
        if not self.context:
            raise RuntimeError("Context must be analyzed before translation.")

        logger.info(f"[PassTwo] Translating {len(subtitle_lines)} lines.")

        # Serialize context for the LLM
        char_desc = "\n".join([f"- {c.name}: {c.role}, {c.speech_style}" for c in self.context.characters])
        system_instruction = (
            f"CONTEXT:\nTitle: {self.context.title}\nGenre: {self.context.genre}\n"
            f"Style: {self.context.style_guidelines}\n\nCHARACTERS:\n{char_desc}\n\n"
            "TASK: Translate subtitles to Russian. Format: 'ID || Text'. Maintain timing."
        )

        translated_lines = []
        for i in range(0, len(subtitle_lines), chunk_size):
            chunk = subtitle_lines[i : i + chunk_size]
            input_text = "\n".join([f"{sub.index} || {sub.original_text}" for sub in chunk])

            try:
                response_text = self.llm.generate_text(system_instruction, input_text)

                # Parse 'ID || Text' lines back into an index -> translation map
                mapping = {}
                for line in response_text.strip().split('\n'):
                    if " || " in line:
                        parts = line.split(" || ", 1)
                        if parts[0].strip().isdigit():
                            mapping[int(parts[0].strip())] = parts[1].strip()

                for sub in chunk:
                    sub.translated_text = mapping.get(sub.index, sub.original_text)

            except Exception as e:
                logger.error(f"Chunk failed: {e}")
                # Fallback: keep the original text so output stays complete
                for sub in chunk:
                    sub.translated_text = sub.original_text

            translated_lines.extend(chunk)

        return translated_lines
# [/DEF:translation_engine]
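
The 'ID || Text' parsing and the fallback path can be exercised without network access by stubbing the LLM; a sketch under that assumption (the unittest.mock patches are for illustration only):

# Sketch: stub GeminiProcessor so pass_two_translation runs offline.
from unittest.mock import patch
from domain_models import SubtitleLine, TranslationContext
import translation_engine

fake_reply = "1 || Привет.\n2 -- missing separator"
subs = [SubtitleLine(index=i, start_time="0", end_time="0", original_text=f"line {i}")
        for i in (1, 2)]

with patch.object(translation_engine.GeminiProcessor, "__init__", lambda self: None), \
     patch.object(translation_engine.GeminiProcessor, "generate_text",
                  lambda self, system, user: fake_reply):
    engine = translation_engine.SubtitleOrchestrator()
    engine.context = TranslationContext(title="t", genre="g",
                                        plot_summary="p", style_guidelines="s")
    out = engine.pass_two_translation(subs)

assert out[0].translated_text == "Привет."  # parsed from 'ID || Text'
assert out[1].translated_text == "line 2"   # no separator -> falls back to original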