100 lines
3.6 KiB
Python
100 lines
3.6 KiB
Python
"""
|
|
FILE: app/core/ingestion/ingestion_utils.py
DESCRIPTION: Helper utilities for JSON recovery, the type registry, and configuration lookups.
AUDIT v2.13.8: Centralized text cleanup for LLM responses.
|
|
"""
|
|
import os
|
|
import json
|
|
import re
|
|
import yaml
|
|
from typing import Any, Optional, Dict, List
|
|
|
|
def clean_llm_text(text: str, registry: Optional[dict] = None) -> str:
    """Strip LLM control tokens and similar artifacts from a text.

    The literal substrings to remove are read from
    ``registry["llm_settings"]["cleanup_patterns"]``. When that entry is
    missing, ``None``, or not a usable collection, a built-in default list
    is used instead.

    Args:
        text: Raw LLM output. Anything that is not a non-empty string
            yields ``""``.
        registry: An already loaded type registry. When falsy (``None`` or
            empty) the registry is obtained from ``load_type_registry()``.

    Returns:
        The text with every pattern removed and surrounding whitespace
        stripped.
    """
    if not text or not isinstance(text, str):
        return ""

    # Fallback patterns used when the registry does not provide any.
    default_patterns = ["<s>", "</s>", "[OUT]", "[/OUT]"]

    reg = registry or load_type_registry()
    if not isinstance(reg, dict):
        reg = {}

    # A YAML key without a value parses to None, so `.get(key, {})` alone
    # is not enough to guarantee a mapping here.
    settings = reg.get("llm_settings")
    if not isinstance(settings, dict):
        settings = {}

    patterns = settings.get("cleanup_patterns")
    if patterns is None:
        patterns = default_patterns
    elif isinstance(patterns, str):
        # A single scalar pattern must be removed as a whole, not
        # character by character.
        patterns = [patterns]
    elif not isinstance(patterns, (list, tuple, set, frozenset)):
        patterns = default_patterns

    clean = text
    for pattern in patterns:
        # Skip non-string / empty entries instead of raising in str.replace.
        if isinstance(pattern, str) and pattern:
            clean = clean.replace(pattern, "")

    return clean.strip()
|
|
|
|
def _parse_json_span(payload: str, opener: str, closer: str) -> Any:
    """Parse the span from the first *opener* to the last *closer*.

    Returns the decoded value, or ``None`` when no such span exists or it
    is not valid JSON. A successfully parsed span always starts with ``[``
    or ``{``, so ``None`` is never a legitimate result.
    """
    start = payload.find(opener)
    end = payload.rfind(closer) + 1
    if start == -1 or end <= start:
        return None
    try:
        return json.loads(payload[start:end])
    except (ValueError, RecursionError):
        # Best-effort recovery: an unparsable span is simply not a match.
        # Interrupts and system exits are intentionally not caught.
        return None


def extract_json_from_response(text: str, registry: Optional[dict] = None) -> Any:
    """Extract JSON data from an LLM response.

    Steps:
      1. Clean the text with ``clean_llm_text``.
      2. If a Markdown code fence (optionally tagged ``json``) is present,
         use the content of the first one; otherwise use the cleaned text.
      3. Parse the payload as JSON.
      4. On a decode error, retry on the span between the first ``[`` and
         the last ``]``, then on the span between the first ``{`` and the
         last ``}``. The list span is tried first.

    Args:
        text: Raw LLM response.
        registry: Optional type registry forwarded to ``clean_llm_text``.

    Returns:
        The decoded JSON value, or ``[]`` when the input is empty or
        nothing could be parsed.
    """
    if not text:
        return []

    clean = clean_llm_text(text, registry)

    fence = re.search(r"```(?:json)?\s*(.*?)\s*```", clean, re.DOTALL)
    payload = (fence.group(1) if fence else clean).strip()

    try:
        return json.loads(payload)
    except json.JSONDecodeError:
        pass

    for opener, closer in (("[", "]"), ("{", "}")):
        recovered = _parse_json_span(payload, opener, closer)
        if recovered is not None:
            return recovered

    return []
|
|
|
|
def load_type_registry(custom_path: Optional[str] = None) -> dict:
    """Load the types.yaml that controls type-specific ingestion.

    Args:
        custom_path: Explicit path to the registry file. When omitted, the
            path is taken from ``get_settings().MINDNET_TYPES_FILE``.

    Returns:
        The parsed YAML mapping, or ``{}`` when the file does not exist,
        cannot be read or parsed, or its top level is not a mapping.
    """
    path = custom_path
    if not path:
        # Settings are only needed when the caller gave no explicit path.
        from app.config import get_settings

        path = get_settings().MINDNET_TYPES_FILE

    if not path or not os.path.exists(path):
        return {}

    try:
        with open(path, "r", encoding="utf-8") as f:
            data = yaml.safe_load(f)
    except Exception:
        # Deliberately best-effort: a broken registry must not abort the
        # caller, but the failure should at least be visible in the logs.
        import logging

        logging.getLogger(__name__).warning(
            "Could not load type registry from %s", path, exc_info=True
        )
        return {}

    # safe_load may return None (empty file), a list, or a scalar; callers
    # rely on a dict.
    return data if isinstance(data, dict) else {}
|
|
|
|
def resolve_note_type(registry: dict, requested: Optional[str]) -> str:
    """Determine the final note type.

    Args:
        registry: Type registry. Its ``types`` entry lists the known
            types, and ``ingestion_settings.default_note_type`` supplies
            the fallback.
        requested: The note type asked for; may be ``None`` or empty.

    Returns:
        ``requested`` when it is a known type; otherwise the configured
        default note type, or ``"concept"`` when none is configured.
    """
    registry = registry or {}

    # A YAML key without a value parses to None, so normalise it to {}.
    known_types = registry.get("types") or {}
    if requested and requested in known_types:
        return requested

    ingest_cfg = registry.get("ingestion_settings")
    if not isinstance(ingest_cfg, dict):
        ingest_cfg = {}

    # An empty or null default_note_type falls back to "concept" so the
    # function always returns a non-empty string.
    return ingest_cfg.get("default_note_type") or "concept"
|
|
|
|
def get_chunk_config_by_profile(registry: dict, profile_name: str, note_type: str) -> Dict[str, Any]:
    """Fetch the chunker parameters for a specific profile from the registry.

    Args:
        registry: Type registry; profiles are read from its
            ``chunking_profiles`` entry.
        profile_name: Name of the profile to look up.
        note_type: Note type passed to ``get_chunk_config`` when the
            profile is not found.

    Returns:
        A shallow copy of the profile mapping, with a list-valued
        ``overlap`` converted to a tuple. When the profile is absent or is
        not a mapping, the result of ``get_chunk_config(note_type)``.
    """
    profiles = (registry or {}).get("chunking_profiles")
    profile = profiles.get(profile_name) if isinstance(profiles, dict) else None

    if isinstance(profile, dict):
        # Copy so the tuple conversion does not mutate the shared registry.
        cfg = profile.copy()
        if isinstance(cfg.get("overlap"), list):
            cfg["overlap"] = tuple(cfg["overlap"])
        return cfg

    # Imported only on the fallback path, where it is actually needed.
    from app.core.chunking import get_chunk_config

    return get_chunk_config(note_type)