# mindnet/app/core/ingestion/ingestion_utils.py
#
# 88 lines
# 3.2 KiB
# Python

"""
FILE: app/core/ingestion/ingestion_utils.py
DESCRIPTION: Hilfswerkzeuge für JSON-Recovery, Typ-Registry und Konfigurations-Lookups.
AUDIT v2.13.7: Dynamisierung von Cleanup-Patterns und Default-Typen (WP-14).
"""
import os
import json
import re
import yaml
from typing import Any, Optional, Dict
def extract_json_from_response(text: str, registry: Optional[dict] = None) -> Any:
    """
    Extract a JSON value from a raw LLM response.

    Processing steps:
      1. Remove LLM control tokens. The token list is read from
         ``registry["llm_settings"]["cleanup_patterns"]`` when that entry is a
         list/tuple (or a single string); otherwise the built-in defaults
         are used.
      2. If the cleaned text contains a Markdown code fence (optionally tagged
         ``json``), only the content of the first fence is parsed.
      3. Parse the payload as JSON. If that fails, retry with the span from
         the first ``[`` to the last ``]``, then with the span from the first
         ``{`` to the last ``}``.

    Args:
        text: Raw model output.
        registry: Type registry mapping. When omitted or empty, the registry
            is obtained from ``load_type_registry()``.

    Returns:
        The decoded JSON value, or an empty list when ``text`` is empty or not
        a string, or when no JSON could be recovered.
    """
    if not text or not isinstance(text, str):
        return []

    # Fallback patterns used when the registry does not configure any.
    patterns = ["<s>", "</s>", "[OUT]", "[/OUT]"]

    # No (or an empty) registry passed in: try to load it.
    reg = registry or load_type_registry()
    if isinstance(reg, dict):
        # ``llm_settings:`` / ``cleanup_patterns:`` with an empty YAML value
        # arrive as None; treat that the same as "not configured".
        llm_settings = reg.get("llm_settings")
        configured = (
            llm_settings.get("cleanup_patterns")
            if isinstance(llm_settings, dict)
            else None
        )
        if isinstance(configured, str):
            # A bare string is one pattern, not a sequence of characters.
            patterns = [configured]
        elif isinstance(configured, (list, tuple)):
            # An explicitly empty list disables cleanup; non-string or empty
            # entries are ignored because str.replace cannot use them.
            patterns = [p for p in configured if isinstance(p, str) and p]

    clean = text
    for pattern in patterns:
        clean = clean.replace(pattern, "")
    clean = clean.strip()

    fence = re.search(r"```(?:json)?\s*(.*?)\s*```", clean, re.DOTALL)
    payload = (fence.group(1) if fence else clean).strip()

    try:
        return json.loads(payload)
    except json.JSONDecodeError:
        pass

    # Recovery: look for an embedded list first, then for an embedded object.
    # NOTE(review): because the list is tried first, an object wrapped in
    # prose that contains a list yields the inner list - confirm this
    # preference is intended by the callers.
    for opener, closer in (("[", "]"), ("{", "}")):
        start = payload.find(opener)
        end = payload.rfind(closer) + 1
        if start != -1 and end > start:
            try:
                return json.loads(payload[start:end])
            except json.JSONDecodeError:
                continue
    return []
def load_type_registry(custom_path: Optional[str] = None) -> dict:
    """
    Load the ``types.yaml`` registry that controls type-specific ingestion.

    Args:
        custom_path: Explicit path to the registry file. When omitted, the
            path is taken from ``get_settings().MINDNET_TYPES_FILE``.

    Returns:
        The parsed registry mapping. An empty dict is returned when no path is
        configured, the path is not an existing file, the file cannot be read
        or parsed, or its top-level YAML value is not a mapping.
    """
    path = custom_path
    if not path:
        # Settings are only needed when the caller did not supply a path, so
        # the application config is imported lazily.
        from app.config import get_settings

        path = get_settings().MINDNET_TYPES_FILE

    if not path or not os.path.isfile(path):
        return {}

    try:
        with open(path, "r", encoding="utf-8") as f:
            data = yaml.safe_load(f)
    except Exception as exc:
        # Best-effort by design: a broken registry must not stop ingestion,
        # but the failure is reported instead of being silently dropped.
        import logging

        logging.getLogger(__name__).warning(
            "Could not load type registry from %s: %s", path, exc
        )
        return {}

    # An empty file yields None, and a list/scalar document is not a registry.
    return data if isinstance(data, dict) else {}
def resolve_note_type(registry: dict, requested: Optional[str]) -> str:
    """
    Determine the final note type.

    Args:
        registry: Type registry mapping. The ``types`` entry lists the known
            note types; ``ingestion_settings.default_note_type`` holds the
            configured fallback.
        requested: Note type asked for by the caller; may be None or empty.

    Returns:
        ``requested`` when it is a key of ``registry["types"]``; otherwise the
        configured default note type, or ``"concept"`` when none is
        configured.
    """
    registry = registry or {}

    # Keys with an empty YAML value arrive as None; treat them as missing.
    known_types = registry.get("types") or {}
    if requested and requested in known_types:
        return requested

    # Dynamic fallback from the registry (default: 'concept').
    ingest_cfg = registry.get("ingestion_settings")
    if not isinstance(ingest_cfg, dict):
        return "concept"
    return ingest_cfg.get("default_note_type") or "concept"
def get_chunk_config_by_profile(registry: dict, profile_name: str, note_type: str) -> Dict[str, Any]:
    """
    Fetch the chunker parameters for a specific profile from the registry.

    Args:
        registry: Type registry mapping; profiles are read from its
            ``chunking_profiles`` entry.
        profile_name: Name of the chunking profile to look up.
        note_type: Note type passed to ``get_chunk_config`` when the profile
            is not defined in the registry.

    Returns:
        A shallow copy of the profile's settings, with a list-valued
        ``overlap`` converted to a tuple. When the profile is missing (or is
        not a mapping), the result of ``get_chunk_config(note_type)``.
    """
    # ``chunking_profiles:`` with an empty YAML value arrives as None.
    profiles = (registry or {}).get("chunking_profiles") or {}
    profile_cfg = profiles.get(profile_name) if isinstance(profiles, dict) else None

    if isinstance(profile_cfg, dict):
        # Copy so the tuple conversion does not mutate the shared registry.
        cfg = dict(profile_cfg)
        if isinstance(cfg.get("overlap"), list):
            cfg["overlap"] = tuple(cfg["overlap"])
        return cfg

    # Imported lazily: only the fallback path needs the chunking module.
    from app.core.chunking import get_chunk_config

    return get_chunk_config(note_type)