+ deduplicate
This commit is contained in:
@@ -23,6 +23,7 @@ from contextlib import contextmanager
|
||||
# [IMPORTS] Third-party
|
||||
import yaml
|
||||
import shutil
|
||||
import zlib
|
||||
import tempfile
|
||||
from datetime import datetime
|
||||
|
||||
@@ -164,11 +165,31 @@ def read_dashboard_from_disk(
|
||||
|
||||
# [SECTION] Archive Management
|
||||
|
||||
def calculate_crc32(file_path: Path) -> str:
    """[HELPER] Calculates the CRC32 checksum of a file.

    @pre:
        - file_path must be a valid path to a file.
    @post:
        - Returns the CRC32 checksum as an 8-character, zero-padded,
          lowercase hexadecimal string.
    @raise:
        - FileNotFoundError: If the file does not exist.
        - Exception: For any other file I/O errors.
    """
    try:
        checksum = 0
        with open(file_path, 'rb') as f:
            # Stream in fixed-size chunks so large export archives never have
            # to fit in memory; zlib.crc32 accepts a running value to update.
            for chunk in iter(lambda: f.read(65536), b''):
                checksum = zlib.crc32(chunk, checksum)
        # zlib.crc32 returns an unsigned int on Python 3; format as
        # zero-padded 8-digit hex (equivalent to hex(v)[2:].zfill(8)).
        return format(checksum, '08x')
    except FileNotFoundError:
        raise FileNotFoundError(f"File not found: {file_path}")
    except Exception as e:
        # Chain the original exception so the real I/O cause stays in the
        # traceback instead of being swallowed by the generic wrapper.
        raise Exception(f"Error calculating CRC32 for {file_path}: {str(e)}") from e
|
||||
|
||||
def archive_exports(
|
||||
output_dir: str,
|
||||
daily_retention: int = 7,
|
||||
weekly_retention: int = 4,
|
||||
monthly_retention: int = 12,
|
||||
deduplicate: bool = False,
|
||||
logger: Optional[SupersetLogger] = None
|
||||
) -> None:
|
||||
"""[CONTRACT] Управление архивом экспортированных дашбордов
|
||||
@@ -178,29 +199,55 @@ def archive_exports(
|
||||
@post:
|
||||
- Сохраняет файлы согласно политике хранения
|
||||
- Удаляет устаревшие архивы
|
||||
- Сохраняет логическую структуру каталогов
|
||||
- Логирует все действия
|
||||
@raise:
|
||||
- ValueError: Если retention параметры некорректны
|
||||
- Exception: При любых других ошибках
|
||||
"""
|
||||
logger = logger or SupersetLogger(name="fileio", console=False)
|
||||
logger.info(f"[ARCHIVE] Старт очистки архивов в {output_dir}")
|
||||
logger.info(f"[ARCHIVE] Starting archive cleanup in {output_dir}. Deduplication: {deduplicate}")
|
||||
|
||||
# [VALIDATION] Проверка параметров
|
||||
if not all(isinstance(x, int) and x >= 0 for x in [daily_retention, weekly_retention, monthly_retention]):
|
||||
raise ValueError("[CONFIG_ERROR] Значения retention должны быть положительными")
|
||||
raise ValueError("[CONFIG_ERROR] Retention values must be positive integers.")
|
||||
|
||||
checksums = {} # Dictionary to store checksums and file paths
|
||||
try:
|
||||
export_dir = Path(output_dir)
|
||||
files_with_dates = []
|
||||
|
||||
# [PROCESSING] Сбор данных о файлах
|
||||
if not export_dir.exists():
|
||||
logger.error(f"[ARCHIVE_ERROR] Directory does not exist: {export_dir}")
|
||||
raise FileNotFoundError(f"Directory not found: {export_dir}")
|
||||
|
||||
# [PROCESSING] Сбор информации о файлах
|
||||
files_with_dates = []
|
||||
for file in export_dir.glob("*.zip"):
|
||||
try:
|
||||
timestamp_str = file.stem.split('_')[-1].split('T')[0]
|
||||
file_date = datetime.strptime(timestamp_str, "%Y%m%d").date()
|
||||
logger.debug(f"[DATE_PARSE] Файл {file.name} добавлен к анализу очистки (массив files_with_dates)")
|
||||
except (ValueError, IndexError):
|
||||
file_date = datetime.fromtimestamp(file.stat().st_mtime).date()
|
||||
logger.warning(f"[DATE_PARSE] Используется дата модификации для {file.name}")
|
||||
logger.warning(f"[DATE_PARSE] Using modification date for {file.name}")
|
||||
|
||||
files_with_dates.append((file, file_date))
|
||||
|
||||
|
||||
# [DEDUPLICATION]
|
||||
if deduplicate:
|
||||
logger.info("[DEDUPLICATION] Starting checksum-based deduplication.")
|
||||
for file in files_with_dates:
|
||||
file_path = file[0]
|
||||
try:
|
||||
crc32_checksum = calculate_crc32(file_path)
|
||||
if crc32_checksum in checksums:
|
||||
# Duplicate found, delete the older file
|
||||
logger.warning(f"[DEDUPLICATION] Duplicate found: {file_path}. Deleting.")
|
||||
file_path.unlink()
|
||||
else:
|
||||
checksums[crc32_checksum] = file_path
|
||||
except Exception as e:
|
||||
logger.error(f"[DEDUPLICATION_ERROR] Error processing {file_path}: {str(e)}", exc_info=True)
|
||||
|
||||
# [PROCESSING] Применение политик хранения
|
||||
keep_files = apply_retention_policy(
|
||||
@@ -212,13 +259,20 @@ def archive_exports(
|
||||
)
|
||||
|
||||
# [CLEANUP] Удаление устаревших файлов
|
||||
deleted_count = 0
|
||||
for file, _ in files_with_dates:
|
||||
if file not in keep_files:
|
||||
file.unlink(missing_ok=True)
|
||||
logger.info(f"[FILE_REMOVED] Удален архив: {file.name}")
|
||||
try:
|
||||
file.unlink()
|
||||
deleted_count += 1
|
||||
logger.info(f"[FILE_REMOVED] Deleted archive: {file.name}")
|
||||
except OSError as e:
|
||||
logger.error(f"[FILE_ERROR] Error deleting {file.name}: {str(e)}", exc_info=True)
|
||||
|
||||
logger.info(f"[ARCHIVE_RESULT] Cleanup completed. Deleted {deleted_count} archives.")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"[ARCHIVE_ERROR] Ошибка обработки архивов: {str(e)}", exc_info=True)
|
||||
logger.error(f"[ARCHIVE_ERROR] Critical error during archive cleanup: {str(e)}", exc_info=True)
|
||||
raise
|
||||
|
||||
def apply_retention_policy(
|
||||
|
||||
Reference in New Issue
Block a user