semantic_analyzer verschachtelte Strukturen
This commit is contained in:
parent
7fc316d284
commit
d93b9b30ae
|
|
@ -1,7 +1,6 @@
|
||||||
"""
|
"""
|
||||||
app/services/semantic_analyzer.py
|
app/services/semantic_analyzer.py — Edge Validation & Filtering
|
||||||
Zweck: Asynchroner Service zur Zuweisung von Kanten zu Text-Chunks mittels LLM.
|
Version: 1.1 (Robust JSON Parsing)
|
||||||
Nutzt Templates aus prompts.yaml.
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import json
|
import json
|
||||||
|
|
@ -28,17 +27,22 @@ class SemanticAnalyzer:
|
||||||
|
|
||||||
# 1. Prompt laden
|
# 1. Prompt laden
|
||||||
prompt_template = self.llm.prompts.get("edge_allocation_template")
|
prompt_template = self.llm.prompts.get("edge_allocation_template")
|
||||||
|
|
||||||
|
# Fallback, falls Prompt nicht in YAML definiert ist (für Tests ohne volle Config)
|
||||||
if not prompt_template:
|
if not prompt_template:
|
||||||
logger.error("Prompt 'edge_allocation_template' in prompts.yaml nicht gefunden.")
|
prompt_template = (
|
||||||
return []
|
"TASK: Wähle aus den Kandidaten die relevanten Kanten für den Text.\n"
|
||||||
|
"TEXT: {chunk_text}\n"
|
||||||
|
"KANDIDATEN: {edge_list}\n"
|
||||||
|
"OUTPUT: JSON Liste von Strings [\"kind:target\"]."
|
||||||
|
)
|
||||||
|
|
||||||
# 2. Kandidaten-Liste formatieren
|
# 2. Kandidaten-Liste formatieren
|
||||||
# Wir übergeben die Kanten als einfache Liste, damit das LLM sie auswählen kann.
|
|
||||||
edges_str = "\n".join([f"- {e}" for e in all_edges])
|
edges_str = "\n".join([f"- {e}" for e in all_edges])
|
||||||
|
|
||||||
# 3. Prompt füllen
|
# 3. Prompt füllen
|
||||||
final_prompt = prompt_template.format(
|
final_prompt = prompt_template.format(
|
||||||
chunk_text=chunk_text[:3000], # Truncate safety
|
chunk_text=chunk_text[:3000],
|
||||||
edge_list=edges_str
|
edge_list=edges_str
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -49,32 +53,41 @@ class SemanticAnalyzer:
|
||||||
force_json=True
|
force_json=True
|
||||||
)
|
)
|
||||||
|
|
||||||
# 5. Parsing
|
# 5. Parsing & Cleaning
|
||||||
clean_json = response_json.replace("```json", "").replace("```", "").strip()
|
clean_json = response_json.replace("```json", "").replace("```", "").strip()
|
||||||
|
if not clean_json: return []
|
||||||
# Fallback für leere Antworten
|
|
||||||
if not clean_json:
|
|
||||||
return []
|
|
||||||
|
|
||||||
data = json.loads(clean_json)
|
data = json.loads(clean_json)
|
||||||
|
valid_edges = []
|
||||||
|
|
||||||
# 6. Validierung: Wir erwarten eine Liste von Strings
|
# 6. Robuste Validierung (List vs Dict)
|
||||||
if isinstance(data, list):
|
if isinstance(data, list):
|
||||||
# Filtern: Nur Strings zurückgeben, die auch in der Input-Liste waren (Sicherheit)
|
# Standardfall: ["kind:target", ...]
|
||||||
# oder zumindest das korrekte Format haben.
|
|
||||||
valid_edges = [str(e) for e in data if isinstance(e, str) and ":" in e]
|
valid_edges = [str(e) for e in data if isinstance(e, str) and ":" in e]
|
||||||
return valid_edges
|
|
||||||
elif isinstance(data, dict):
|
|
||||||
# Manchmal packt das LLM es in {"edges": [...]}
|
|
||||||
for key, val in data.items():
|
|
||||||
if isinstance(val, list):
|
|
||||||
return [str(e) for e in val if isinstance(e, str)]
|
|
||||||
|
|
||||||
logger.warning(f"SemanticAnalyzer: Unerwartetes JSON Format: {str(data)[:100]}")
|
elif isinstance(data, dict):
|
||||||
return []
|
# Abweichende Formate behandeln
|
||||||
|
for key, val in data.items():
|
||||||
|
# Fall A: {"edges": ["kind:target"]}
|
||||||
|
if key.lower() in ["edges", "results", "kanten"] and isinstance(val, list):
|
||||||
|
valid_edges.extend([str(e) for e in val if isinstance(e, str) and ":" in e])
|
||||||
|
|
||||||
|
# Fall B: {"kind": "target"} (Das beobachtete Format im Log)
|
||||||
|
elif isinstance(val, str):
|
||||||
|
# Wir rekonstruieren "kind:target"
|
||||||
|
valid_edges.append(f"{key}:{val}")
|
||||||
|
|
||||||
|
# Fall C: {"kind": ["target1", "target2"]}
|
||||||
|
elif isinstance(val, list):
|
||||||
|
for target in val:
|
||||||
|
if isinstance(target, str):
|
||||||
|
valid_edges.append(f"{key}:{target}")
|
||||||
|
|
||||||
|
# Safety: Filtere nur Kanten, die halbwegs valide aussehen
|
||||||
|
return [e for e in valid_edges if ":" in e]
|
||||||
|
|
||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError:
|
||||||
logger.warning("SemanticAnalyzer: LLM lieferte kein valides JSON. Keine Kanten zugewiesen.")
|
logger.warning("SemanticAnalyzer: LLM lieferte kein valides JSON. Ignoriere Zuweisung.")
|
||||||
return []
|
return []
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"SemanticAnalyzer Error: {e}")
|
logger.error(f"SemanticAnalyzer Error: {e}")
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user