69 lines
2.6 KiB
Python
69 lines
2.6 KiB
Python
"""
|
|
FILE: app/core/ingestion/ingestion_utils.py
|
|
DESCRIPTION: Hilfswerkzeuge für JSON-Recovery, Typ-Registry und Konfigurations-Lookups.
|
|
"""
|
|
import os
|
|
import json
|
|
import re
|
|
import yaml
|
|
from typing import Any, Optional, Dict
|
|
|
|
def extract_json_from_response(text: str) -> Any:
    """Extract JSON data from an LLM response, tolerating wrapper noise.

    The control tokens ``<s>``, ``</s>``, ``[OUT]`` and ``[/OUT]`` are removed
    first. If the remaining text contains a Markdown code fence (optionally
    tagged ``json``), only the content of the first fence is considered.

    Parsing strategy:
      1. Parse the whole payload as JSON.
      2. On failure, try the span from the first ``[`` to the last ``]`` and
         the span from the first ``{`` to the last ``}``. The span that starts
         earlier is tried first so that an enclosing structure wins over a
         fragment nested inside it.

    Args:
        text: Raw model output.

    Returns:
        The decoded JSON value, or an empty list if ``text`` is empty, not a
        string, or contains nothing that can be decoded.
    """
    if not text or not isinstance(text, str):
        return []

    clean = text
    for token in ("<s>", "</s>", "[OUT]", "[/OUT]"):
        clean = clean.replace(token, "")
    clean = clean.strip()

    fenced = re.search(r"```(?:json)?\s*(.*?)\s*```", clean, re.DOTALL)
    payload = (fenced.group(1) if fenced else clean).strip()

    try:
        return json.loads(payload)
    except json.JSONDecodeError:
        pass

    # Recovery: collect the bracket-delimited spans that exist at all.
    candidates = []
    for opener, closer in (("[", "]"), ("{", "}")):
        start = payload.find(opener)
        end = payload.rfind(closer) + 1
        if start != -1 and end > start:
            candidates.append((start, end))

    # Try the outermost (earliest-starting) span first; fall back to the other.
    for start, end in sorted(candidates):
        try:
            return json.loads(payload[start:end])
        except (ValueError, RecursionError):
            # Not decodable; keep the best-effort contract and try the next.
            continue

    return []
|
|
|
|
def load_type_registry(custom_path: Optional[str] = None) -> dict:
    """Load the types.yaml registry that controls type-specific ingestion.

    Args:
        custom_path: Optional path to the registry file. When empty or
            ``None``, ``settings.MINDNET_TYPES_FILE`` from ``app.config`` is
            used instead.

    Returns:
        The parsed YAML mapping, or an empty dict when the path is unset, the
        file does not exist, it cannot be read or parsed, or its top-level
        value is not a mapping.
    """
    if custom_path:
        path = custom_path
    else:
        # Settings are only needed when the caller gave no explicit path.
        from app.config import get_settings

        path = get_settings().MINDNET_TYPES_FILE

    if not path or not os.path.exists(path):
        return {}

    try:
        with open(path, "r", encoding="utf-8") as handle:
            data = yaml.safe_load(handle)
    except Exception as exc:  # best-effort loader: never break ingestion
        import logging

        logging.getLogger(__name__).warning(
            "Could not load type registry from %s: %s", path, exc
        )
        return {}

    # An empty file yields None; a list/scalar document is not a registry.
    return data if isinstance(data, dict) else {}
|
|
|
|
def resolve_note_type(registry: dict, requested: Optional[str]) -> str:
    """Determine the final note type, falling back to ``'concept'``.

    Args:
        registry: Type registry mapping; its ``"types"`` entry holds the
            known note types.
        requested: The note type asked for, if any.

    Returns:
        ``requested`` when it is a key of ``registry["types"]``, otherwise
        ``"concept"``.
    """
    # `or {}` covers both a missing registry and a "types" key that is
    # present but empty (None), which `.get("types", {})` would let through.
    known_types = (registry or {}).get("types") or {}
    if requested and requested in known_types:
        return requested
    return "concept"
|
|
|
|
def get_chunk_config_by_profile(registry: dict, profile_name: str, note_type: str) -> Dict[str, Any]:
    """Fetch the chunker parameters for a specific profile from the registry.

    Args:
        registry: Type registry mapping; profiles are read from its
            ``"chunking_profiles"`` entry.
        profile_name: Name of the chunking profile to look up.
        note_type: Note type passed to ``get_chunk_config`` when the profile
            is not available.

    Returns:
        A shallow copy of the profile mapping, with a list-valued
        ``"overlap"`` converted to a tuple. When the profile is missing or is
        not a mapping, the result of
        ``app.core.chunking.get_chunk_config(note_type)``.
    """
    # `or {}` also covers a "chunking_profiles" key that is present but empty.
    profiles = (registry or {}).get("chunking_profiles") or {}
    profile = profiles.get(profile_name) if isinstance(profiles, dict) else None

    if isinstance(profile, dict):
        # Copy so the tuple conversion does not mutate the shared registry.
        cfg = dict(profile)
        if isinstance(cfg.get("overlap"), list):
            cfg["overlap"] = tuple(cfg["overlap"])
        return cfg

    # Imported lazily: only the fallback path needs the chunking module.
    from app.core.chunking import get_chunk_config

    return get_chunk_config(note_type)