""" FILE: app/core/ingestion/ingestion_utils.py DESCRIPTION: Hilfswerkzeuge für JSON-Recovery, Typ-Registry und Konfigurations-Lookups. AUDIT v2.13.7: Dynamisierung von Cleanup-Patterns und Default-Typen (WP-14). """ import os import json import re import yaml from typing import Any, Optional, Dict def extract_json_from_response(text: str, registry: Optional[dict] = None) -> Any: """ Extrahiert JSON-Daten und bereinigt LLM-Steuerzeichen (v2.11.14 Logic). WP-14: Nutzt nun dynamische cleanup_patterns aus der Registry. """ if not text or not isinstance(text, str): return [] # Fallback-Patterns für die Bereinigung patterns = ["", "", "[OUT]", "[/OUT]"] # Falls keine Registry übergeben wurde, versuchen wir sie zu laden reg = registry or load_type_registry() if reg: # Lade Patterns aus llm_settings (WP-14 Erweiterung) patterns = reg.get("llm_settings", {}).get("cleanup_patterns", patterns) clean = text for p in patterns: clean = clean.replace(p, "") clean = clean.strip() match = re.search(r"```(?:json)?\s*(.*?)\s*```", clean, re.DOTALL) payload = match.group(1) if match else clean try: return json.loads(payload.strip()) except json.JSONDecodeError: # Recovery: Suche nach Liste start = payload.find('[') end = payload.rfind(']') + 1 if start != -1 and end > start: try: return json.loads(payload[start:end]) except: pass # Recovery: Suche nach Objekt start_obj = payload.find('{') end_obj = payload.rfind('}') + 1 if start_obj != -1 and end_obj > start_obj: try: return json.loads(payload[start_obj:end_obj]) except: pass return [] def load_type_registry(custom_path: Optional[str] = None) -> dict: """Lädt die types.yaml zur Steuerung der typ-spezifischen Ingestion.""" from app.config import get_settings settings = get_settings() path = custom_path or settings.MINDNET_TYPES_FILE if not os.path.exists(path): return {} try: with open(path, "r", encoding="utf-8") as f: return yaml.safe_load(f) or {} except Exception: return {} def resolve_note_type(registry: dict, requested: Optional[str]) -> str: """ Bestimmt den finalen Notiz-Typ. WP-14: Fallback wird nun über ingestion_settings.default_note_type gesteuert. """ types = registry.get("types", {}) if requested and requested in types: return requested # Dynamischer Fallback aus der Registry (Standard: 'concept') ingest_cfg = registry.get("ingestion_settings", {}) return ingest_cfg.get("default_note_type", "concept") def get_chunk_config_by_profile(registry: dict, profile_name: str, note_type: str) -> Dict[str, Any]: """Holt die Chunker-Parameter für ein spezifisches Profil aus der Registry.""" from app.core.chunking import get_chunk_config profiles = registry.get("chunking_profiles", {}) if profile_name in profiles: cfg = profiles[profile_name].copy() if "overlap" in cfg and isinstance(cfg["overlap"], list): cfg["overlap"] = tuple(cfg["overlap"]) return cfg return get_chunk_config(note_type)