71 lines
2.7 KiB
Python
71 lines
2.7 KiB
Python
"""
|
|
FILE: app/core/ingestion/ingestion_utils.py
|
|
DESCRIPTION: Hilfswerkzeuge für JSON-Recovery, Typ-Registry und Konfigurations-Lookups.
|
|
AUDIT v2.13.9: Behebung des Circular Imports durch Nutzung der app.core.registry.
|
|
"""
|
|
import json
|
|
import re
|
|
from typing import Any, Optional, Dict
|
|
|
|
# KEY FIX: import the base logic from the neutral registry module.
# This breaks the circular dependency because this module no longer imports any services.
|
|
from app.core.registry import load_type_registry, clean_llm_text
|
|
|
|
def extract_json_from_response(text: str, registry: Optional[dict] = None) -> Any:
    """
    Extract a JSON value from an LLM response.

    The text is first passed through ``clean_llm_text`` (imported from
    ``app.core.registry``). If the cleaned text contains a Markdown code
    fence (optionally tagged ``json``), only the content of the first fence
    is parsed; otherwise the whole cleaned text is parsed. When strict
    parsing fails, the span from the first ``[`` to the last ``]`` is tried,
    followed by the span from the first ``{`` to the last ``}``.

    Args:
        text: Raw response text. Falsy input short-circuits to ``[]``.
        registry: Optional registry dict, forwarded unchanged to
            ``clean_llm_text``.

    Returns:
        The decoded JSON value, or ``[]`` when nothing parseable is found.
    """
    if not text:
        return []

    # 1. Central clean-up via the neutral registry module.
    clean = clean_llm_text(text, registry)

    # 2. Prefer the content of a Markdown code fence when one is present.
    match = re.search(r"```(?:json)?\s*(.*?)\s*```", clean, re.DOTALL)
    payload = match.group(1) if match else clean

    try:
        return json.loads(payload.strip())
    except json.JSONDecodeError:
        pass  # fall through to the bracket-based recovery below

    # 3. Recovery: cut out the outermost bracketed span and parse only that.
    # NOTE(review): the list is tried before the object, so an object that
    # wraps a list and is followed by trailing text yields the inner list --
    # confirm this is intended.
    for opener, closer in (("[", "]"), ("{", "}")):
        start = payload.find(opener)
        end = payload.rfind(closer) + 1
        if start == -1 or end <= start:
            continue
        try:
            return json.loads(payload[start:end])
        except json.JSONDecodeError:
            # Only decoding failures are expected here; anything else
            # (e.g. KeyboardInterrupt) must not be swallowed.
            continue

    return []
|
|
|
|
def resolve_note_type(registry: dict, requested: Optional[str]) -> str:
    """
    Determine the final note type.

    Returns ``requested`` when it is a key of ``registry["types"]``.
    Otherwise falls back to
    ``registry["ingestion_settings"]["default_note_type"]``, and to
    ``"concept"`` when that setting is missing or empty.

    Registry sections that are present but null are treated as empty, so a
    partially filled registry does not raise.

    Args:
        registry: Type registry dict.
        requested: Note type asked for by the caller; may be ``None``.

    Returns:
        The resolved note type name.
    """
    # ``or {}`` also covers keys that exist but hold None.
    types = registry.get("types") or {}
    if requested and requested in types:
        return requested

    # Dynamic fallback from the registry (default: 'concept').
    ingest_cfg = registry.get("ingestion_settings") or {}
    return ingest_cfg.get("default_note_type") or "concept"
|
|
|
|
def get_chunk_config_by_profile(registry: dict, profile_name: str, note_type: str) -> Dict[str, Any]:
    """
    Fetch the chunker parameters for a specific profile from the registry.

    When ``registry["chunking_profiles"]`` holds a dict for ``profile_name``,
    a shallow copy of it is returned, with a list-valued ``"overlap"`` entry
    converted to a tuple. Otherwise the result of
    ``app.core.chunking.get_chunk_config(note_type)`` is returned.

    Args:
        registry: Type registry dict.
        profile_name: Key to look up under ``chunking_profiles``.
        note_type: Note type handed to ``get_chunk_config`` on the fallback
            path.

    Returns:
        The chunk configuration dict.
    """
    # ``or {}`` also covers a ``chunking_profiles`` key that holds None.
    profiles = registry.get("chunking_profiles") or {}
    profile = profiles.get(profile_name)

    if isinstance(profile, dict):
        # Copy so the conversion below never mutates the registry entry.
        cfg = dict(profile)
        if isinstance(cfg.get("overlap"), list):
            cfg["overlap"] = tuple(cfg["overlap"])
        return cfg

    # Imported only on the fallback path; presumably kept function-local to
    # avoid an import cycle -- TODO confirm.
    from app.core.chunking import get_chunk_config

    return get_chunk_config(note_type)