mindnet/app/core/ingestion/ingestion_utils.py

69 lines
2.6 KiB
Python

"""
FILE: app/core/ingestion/ingestion_utils.py
DESCRIPTION: Hilfswerkzeuge für JSON-Recovery, Typ-Registry und Konfigurations-Lookups.
"""
import os
import json
import re
import yaml
from typing import Any, Optional, Dict
def extract_json_from_response(text: str) -> Any:
"""
Extrahiert JSON-Daten und bereinigt LLM-Steuerzeichen (v2.11.14 Logic).
Entfernt <s>, [OUT], [/OUT] und Markdown-Blöcke für maximale Robustheit.
"""
if not text or not isinstance(text, str):
return []
clean = text.replace("<s>", "").replace("</s>", "")
clean = clean.replace("[OUT]", "").replace("[/OUT]", "")
clean = clean.strip()
match = re.search(r"```(?:json)?\s*(.*?)\s*```", clean, re.DOTALL)
payload = match.group(1) if match else clean
try:
return json.loads(payload.strip())
except json.JSONDecodeError:
# Recovery: Suche nach Liste
start = payload.find('[')
end = payload.rfind(']') + 1
if start != -1 and end > start:
try: return json.loads(payload[start:end])
except: pass
# Recovery: Suche nach Objekt
start_obj = payload.find('{')
end_obj = payload.rfind('}') + 1
if start_obj != -1 and end_obj > start_obj:
try: return json.loads(payload[start_obj:end_obj])
except: pass
return []
def load_type_registry(custom_path: Optional[str] = None) -> dict:
"""Lädt die types.yaml zur Steuerung der typ-spezifischen Ingestion."""
from app.config import get_settings
settings = get_settings()
path = custom_path or settings.MINDNET_TYPES_FILE
if not os.path.exists(path): return {}
try:
with open(path, "r", encoding="utf-8") as f: return yaml.safe_load(f) or {}
except Exception: return {}
def resolve_note_type(registry: dict, requested: Optional[str]) -> str:
"""Bestimmt den finalen Notiz-Typ (Fallback auf 'concept')."""
types = registry.get("types", {})
if requested and requested in types: return requested
return "concept"
def get_chunk_config_by_profile(registry: dict, profile_name: str, note_type: str) -> Dict[str, Any]:
"""Holt die Chunker-Parameter für ein spezifisches Profil aus der Registry."""
from app.core.chunker import get_chunk_config
profiles = registry.get("chunking_profiles", {})
if profile_name in profiles:
cfg = profiles[profile_name].copy()
if "overlap" in cfg and isinstance(cfg["overlap"], list):
cfg["overlap"] = tuple(cfg["overlap"])
return cfg
return get_chunk_config(note_type)