Mistral-sichere Parser-Implementierung
This commit is contained in:
parent
ecfdc67485
commit
16e128668c
|
|
@ -1,10 +1,10 @@
|
||||||
"""
|
"""
|
||||||
FILE: app/core/ingestion.py
|
FILE: app/core/ingestion.py
|
||||||
DESCRIPTION: Haupt-Ingestion-Logik. Transformiert Markdown in den Graphen.
|
DESCRIPTION: Haupt-Ingestion-Logik. Transformiert Markdown in den Graphen.
|
||||||
WP-20: Optimiert für OpenRouter (openai/gpt-oss-20b:free) als Primary.
|
WP-20: Optimiert für OpenRouter (mistralai/mistral-7b-instruct:free).
|
||||||
WP-22: Content Lifecycle, Edge Registry Validation & Multi-Hash.
|
WP-22: Content Lifecycle, Edge Registry Validation & Multi-Hash.
|
||||||
FIX: Finale DoD-Härtung, Entfernung aller Shortcuts und Stabilitätspatch.
|
FIX: Finale Mistral-Härtung (<s> & [OUT] Tags), robuste JSON-Recovery & DoD-Sync.
|
||||||
VERSION: 2.11.10
|
VERSION: 2.11.11
|
||||||
STATUS: Active
|
STATUS: Active
|
||||||
DEPENDENCIES: app.core.parser, app.core.note_payload, app.core.chunker, app.services.llm_service, app.services.edge_registry
|
DEPENDENCIES: app.core.parser, app.core.note_payload, app.core.chunker, app.services.llm_service, app.services.edge_registry
|
||||||
"""
|
"""
|
||||||
|
|
@ -49,21 +49,41 @@ logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
# --- Global Helpers ---
|
# --- Global Helpers ---
|
||||||
def extract_json_from_response(text: str) -> Any:
|
def extract_json_from_response(text: str) -> Any:
|
||||||
"""Extrahiert JSON-Daten, selbst wenn sie in Markdown-Blöcken stehen."""
|
"""
|
||||||
|
Extrahiert JSON-Daten und bereinigt LLM-Steuerzeichen (Mistral/Llama).
|
||||||
|
Entfernt <s>, [OUT], [/OUT] und Markdown-Blöcke für maximale Robustheit.
|
||||||
|
"""
|
||||||
if not text: return []
|
if not text: return []
|
||||||
# Suche nach ```json ... ``` oder ``` ... ```
|
|
||||||
match = re.search(r"```(?:json)?\s*(.*?)\s*```", text, re.DOTALL)
|
# 1. Entferne Mistral/Llama Steuerzeichen und Tags
|
||||||
clean_text = match.group(1) if match else text
|
clean = text.replace("<s>", "").replace("</s>", "")
|
||||||
|
clean = clean.replace("[OUT]", "").replace("[/OUT]", "")
|
||||||
|
clean = clean.strip()
|
||||||
|
|
||||||
|
# 2. Suche nach Markdown JSON-Blöcken (```json ... ```)
|
||||||
|
match = re.search(r"```(?:json)?\s*(.*?)\s*```", clean, re.DOTALL)
|
||||||
|
payload = match.group(1) if match else clean
|
||||||
|
|
||||||
try:
|
try:
|
||||||
return json.loads(clean_text.strip())
|
return json.loads(payload.strip())
|
||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError:
|
||||||
# Versuch: Alles vor der ersten [ und nach der letzten ] entfernen (Recovery)
|
# 3. Recovery: Suche nach der ersten [ und letzten ] (Liste)
|
||||||
start = clean_text.find('[')
|
start = payload.find('[')
|
||||||
end = clean_text.rfind(']') + 1
|
end = payload.rfind(']') + 1
|
||||||
if start != -1 and end != 0:
|
if start != -1 and end > start:
|
||||||
try: return json.loads(clean_text[start:end])
|
try:
|
||||||
|
return json.loads(payload[start:end])
|
||||||
except: pass
|
except: pass
|
||||||
raise
|
|
||||||
|
# 4. Zweite Recovery: Suche nach der ersten { und letzten } (Objekt)
|
||||||
|
start_obj = payload.find('{')
|
||||||
|
end_obj = payload.rfind('}') + 1
|
||||||
|
if start_obj != -1 and end_obj > start_obj:
|
||||||
|
try:
|
||||||
|
return json.loads(payload[start_obj:end_obj])
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
return []
|
||||||
|
|
||||||
def load_type_registry(custom_path: Optional[str] = None) -> dict:
|
def load_type_registry(custom_path: Optional[str] = None) -> dict:
|
||||||
"""Lädt die types.yaml zur Steuerung der typ-spezifischen Ingestion."""
|
"""Lädt die types.yaml zur Steuerung der typ-spezifischen Ingestion."""
|
||||||
|
|
@ -121,14 +141,7 @@ class IngestionService:
|
||||||
Respektiert die Provider-Einstellung (OpenRouter Primary).
|
Respektiert die Provider-Einstellung (OpenRouter Primary).
|
||||||
"""
|
"""
|
||||||
provider = self.settings.MINDNET_LLM_PROVIDER
|
provider = self.settings.MINDNET_LLM_PROVIDER
|
||||||
|
model = self.settings.OPENROUTER_MODEL if provider == "openrouter" else self.settings.GEMINI_MODEL
|
||||||
# Modell-Zuordnung basierend auf Provider-Wahl (Keine festen Annahmen)
|
|
||||||
if provider == "openrouter":
|
|
||||||
model = self.settings.OPENROUTER_MODEL
|
|
||||||
elif provider == "gemini":
|
|
||||||
model = self.settings.GEMINI_MODEL
|
|
||||||
else:
|
|
||||||
model = self.settings.LLM_MODEL
|
|
||||||
|
|
||||||
logger.info(f"🚀 [Ingestion] Turbo-Mode: Extracting edges for '{note_id}' using {model} on {provider}")
|
logger.info(f"🚀 [Ingestion] Turbo-Mode: Extracting edges for '{note_id}' using {model} on {provider}")
|
||||||
|
|
||||||
|
|
@ -140,14 +153,14 @@ class IngestionService:
|
||||||
try:
|
try:
|
||||||
# Sicherheits-Check: Formatierung des Templates gegen KeyError schützen
|
# Sicherheits-Check: Formatierung des Templates gegen KeyError schützen
|
||||||
try:
|
try:
|
||||||
# Nutzt die ersten 6000 Zeichen als Kontext-Fenster (DoD: Explizit dokumentiert)
|
# Nutzt die ersten 6000 Zeichen als Kontext-Fenster
|
||||||
prompt = template.format(
|
prompt = template.format(
|
||||||
text=text[:6000],
|
text=text[:6000],
|
||||||
note_id=note_id,
|
note_id=note_id,
|
||||||
valid_types=valid_types_str
|
valid_types=valid_types_str
|
||||||
)
|
)
|
||||||
except KeyError as ke:
|
except KeyError as ke:
|
||||||
logger.error(f"❌ [Ingestion] Prompt-Template Fehler (Variable {ke} fehlt). Prüfe prompts.yaml Maskierung.")
|
logger.error(f"❌ [Ingestion] Prompt-Template Fehler (Variable {ke} fehlt).")
|
||||||
return []
|
return []
|
||||||
|
|
||||||
response_json = await self.llm.generate_raw_response(
|
response_json = await self.llm.generate_raw_response(
|
||||||
|
|
@ -155,7 +168,7 @@ class IngestionService:
|
||||||
provider=provider, model_override=model
|
provider=provider, model_override=model
|
||||||
)
|
)
|
||||||
|
|
||||||
# Robustes JSON-Parsing via Helper
|
# Nutzt den verbesserten Mistral-sicheren JSON-Extraktor
|
||||||
raw_data = extract_json_from_response(response_json)
|
raw_data = extract_json_from_response(response_json)
|
||||||
|
|
||||||
# Recovery: Suche nach Listen in Dictionaries (z.B. {"edges": [...]})
|
# Recovery: Suche nach Listen in Dictionaries (z.B. {"edges": [...]})
|
||||||
|
|
|
||||||
|
|
@ -3,16 +3,16 @@ FILE: app/services/semantic_analyzer.py
|
||||||
DESCRIPTION: KI-gestützte Kanten-Validierung. Nutzt LLM (Background-Priority), um Kanten präzise einem Chunk zuzuordnen.
|
DESCRIPTION: KI-gestützte Kanten-Validierung. Nutzt LLM (Background-Priority), um Kanten präzise einem Chunk zuzuordnen.
|
||||||
WP-20 Fix: Volle Kompatibilität mit der provider-basierten Routing-Logik (OpenRouter Primary).
|
WP-20 Fix: Volle Kompatibilität mit der provider-basierten Routing-Logik (OpenRouter Primary).
|
||||||
WP-22: Integration von valid_types zur Halluzinations-Vermeidung.
|
WP-22: Integration von valid_types zur Halluzinations-Vermeidung.
|
||||||
FIX: Finale DoD-Härtung, Entfernung von Hardcoded Limits und optimiertes Error-Handling.
|
FIX: Mistral-sicheres JSON-Parsing (<s> & [OUT] Handling) und 100% Logik-Erhalt.
|
||||||
VERSION: 2.2.4
|
VERSION: 2.2.6
|
||||||
STATUS: Active
|
STATUS: Active
|
||||||
DEPENDENCIES: app.services.llm_service, app.services.edge_registry, json, logging
|
DEPENDENCIES: app.services.llm_service, app.services.edge_registry, json, logging, re
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
from typing import List, Optional
|
from typing import List, Optional, Any
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
|
|
||||||
# Importe
|
# Importe
|
||||||
|
|
@ -52,6 +52,43 @@ class SemanticAnalyzer:
|
||||||
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
def _extract_json_safely(self, text: str) -> Any:
|
||||||
|
"""
|
||||||
|
Extrahiert JSON-Daten und bereinigt LLM-Steuerzeichen (Mistral/Llama).
|
||||||
|
Implementiert robuste Recovery-Logik für Cloud-Provider.
|
||||||
|
"""
|
||||||
|
if not text:
|
||||||
|
return []
|
||||||
|
|
||||||
|
# 1. Entferne Mistral/Llama Steuerzeichen und Tags
|
||||||
|
clean = text.replace("<s>", "").replace("</s>", "")
|
||||||
|
clean = clean.replace("[OUT]", "").replace("[/OUT]", "")
|
||||||
|
clean = clean.strip()
|
||||||
|
|
||||||
|
# 2. Suche nach Markdown JSON-Blöcken
|
||||||
|
match = re.search(r"```(?:json)?\s*(.*?)\s*```", clean, re.DOTALL)
|
||||||
|
payload = match.group(1) if match else clean
|
||||||
|
|
||||||
|
try:
|
||||||
|
return json.loads(payload.strip())
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
# 3. Recovery: Suche nach der ersten [ und letzten ]
|
||||||
|
start = payload.find('[')
|
||||||
|
end = payload.rfind(']') + 1
|
||||||
|
if start != -1 and end > start:
|
||||||
|
try:
|
||||||
|
return json.loads(payload[start:end])
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
# 4. Zweite Recovery: Suche nach der ersten { und letzten }
|
||||||
|
start_obj = payload.find('{')
|
||||||
|
end_obj = payload.rfind('}') + 1
|
||||||
|
if start_obj != -1 and end_obj > start_obj:
|
||||||
|
try:
|
||||||
|
return json.loads(payload[start_obj:end_obj])
|
||||||
|
except: pass
|
||||||
|
return []
|
||||||
|
|
||||||
async def assign_edges_to_chunk(self, chunk_text: str, all_edges: List[str], note_type: str) -> List[str]:
|
async def assign_edges_to_chunk(self, chunk_text: str, all_edges: List[str], note_type: str) -> List[str]:
|
||||||
"""
|
"""
|
||||||
Sendet einen Chunk und eine Liste potenzieller Kanten an das LLM.
|
Sendet einen Chunk und eine Liste potenzieller Kanten an das LLM.
|
||||||
|
|
@ -65,7 +102,7 @@ class SemanticAnalyzer:
|
||||||
provider = self.llm.settings.MINDNET_LLM_PROVIDER
|
provider = self.llm.settings.MINDNET_LLM_PROVIDER
|
||||||
model = self.llm.settings.OPENROUTER_MODEL if provider == "openrouter" else self.llm.settings.GEMINI_MODEL
|
model = self.llm.settings.OPENROUTER_MODEL if provider == "openrouter" else self.llm.settings.GEMINI_MODEL
|
||||||
|
|
||||||
# 2. Prompt laden (Provider-spezifisch)
|
# 2. Prompt laden (Provider-spezifisch via get_prompt)
|
||||||
prompt_template = self.llm.get_prompt("edge_allocation_template", provider)
|
prompt_template = self.llm.get_prompt("edge_allocation_template", provider)
|
||||||
|
|
||||||
if not prompt_template or not isinstance(prompt_template, str):
|
if not prompt_template or not isinstance(prompt_template, str):
|
||||||
|
|
@ -86,7 +123,7 @@ class SemanticAnalyzer:
|
||||||
|
|
||||||
# 4. Prompt füllen mit Format-Check (Kein Shortcut)
|
# 4. Prompt füllen mit Format-Check (Kein Shortcut)
|
||||||
try:
|
try:
|
||||||
# Wir begrenzen den Text auf eine vernünftige Länge für das Kontextfenster (ca. 10k Tokens max)
|
# Wir begrenzen den Text auf eine vernünftige Länge für das Kontextfenster
|
||||||
final_prompt = prompt_template.format(
|
final_prompt = prompt_template.format(
|
||||||
chunk_text=chunk_text[:6000],
|
chunk_text=chunk_text[:6000],
|
||||||
edge_list=edges_str,
|
edge_list=edges_str,
|
||||||
|
|
@ -108,30 +145,12 @@ class SemanticAnalyzer:
|
||||||
model_override=model
|
model_override=model
|
||||||
)
|
)
|
||||||
|
|
||||||
# 6. Bulletproof JSON Extraction (Analog zur Ingestion)
|
# 6. Mistral-sicheres JSON Parsing via Helper
|
||||||
# Entfernt Markdown-Code-Blöcke falls vorhanden
|
data = self._extract_json_safely(response_json)
|
||||||
match = re.search(r"```(?:json)?\s*(.*?)\s*```", response_json, re.DOTALL)
|
|
||||||
clean_json = match.group(1) if match else response_json
|
|
||||||
clean_json = clean_json.strip()
|
|
||||||
|
|
||||||
if not clean_json:
|
if not data:
|
||||||
return []
|
return []
|
||||||
|
|
||||||
try:
|
|
||||||
data = json.loads(clean_json)
|
|
||||||
except json.JSONDecodeError:
|
|
||||||
# Letzter Rettungsversuch: Suche nach dem ersten '[' und letzten ']'
|
|
||||||
start = clean_json.find('[')
|
|
||||||
end = clean_json.rfind(']') + 1
|
|
||||||
if start != -1 and end != 0:
|
|
||||||
try:
|
|
||||||
data = json.loads(clean_json[start:end])
|
|
||||||
except:
|
|
||||||
logger.error("❌ [SemanticAnalyzer] JSON Recovery failed.")
|
|
||||||
return []
|
|
||||||
else:
|
|
||||||
return []
|
|
||||||
|
|
||||||
# 7. Robuste Normalisierung (List vs Dict Recovery)
|
# 7. Robuste Normalisierung (List vs Dict Recovery)
|
||||||
raw_candidates = []
|
raw_candidates = []
|
||||||
if isinstance(data, list):
|
if isinstance(data, list):
|
||||||
|
|
@ -146,7 +165,9 @@ class SemanticAnalyzer:
|
||||||
if not raw_candidates:
|
if not raw_candidates:
|
||||||
for k, v in data.items():
|
for k, v in data.items():
|
||||||
if isinstance(v, str): raw_candidates.append(f"{k}:{v}")
|
if isinstance(v, str): raw_candidates.append(f"{k}:{v}")
|
||||||
elif isinstance(v, list): [raw_candidates.append(f"{k}:{i}") for i in v if isinstance(i, str)]
|
elif isinstance(v, list):
|
||||||
|
for target in v:
|
||||||
|
if isinstance(target, str): raw_candidates.append(f"{k}:{target}")
|
||||||
|
|
||||||
# 8. Strikte Validierung gegen Kanten-Format
|
# 8. Strikte Validierung gegen Kanten-Format
|
||||||
valid_edges = []
|
valid_edges = []
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user