Enhance logging and debugging in chunking_processor.py, graph_derive_edges.py, and ingestion_chunk_payload.py for version 4.4.0: Introduce detailed debug statements to trace chunk extraction, global scan comparisons, and payload transfers. Improve visibility into candidate pool handling and decision-making processes for callout edges, ensuring better traceability and debugging capabilities.
This commit is contained in:
parent
ee91583614
commit
c91910ee9f
|
|
@ -76,7 +76,8 @@ async def assemble_chunks(note_id: str, md_text: str, note_type: str, config: Op
|
||||||
# 5. WP-15b: Candidate Pool Aufbau (Metadaten für IngestionService)
|
# 5. WP-15b: Candidate Pool Aufbau (Metadaten für IngestionService)
|
||||||
# WP-24c v4.2.7: Markiere Callout-Kanten explizit für Chunk-Attribution
|
# WP-24c v4.2.7: Markiere Callout-Kanten explizit für Chunk-Attribution
|
||||||
# Zuerst die explizit im Text vorhandenen Kanten sammeln.
|
# Zuerst die explizit im Text vorhandenen Kanten sammeln.
|
||||||
for ch in chunks:
|
# WP-24c v4.4.0-DEBUG: Schnittstelle 1 - Extraktion
|
||||||
|
for idx, ch in enumerate(chunks):
|
||||||
# Wir extrahieren aus dem bereits (durch Propagation) angereicherten Text.
|
# Wir extrahieren aus dem bereits (durch Propagation) angereicherten Text.
|
||||||
# ch.candidate_pool wird im Modell-Konstruktor als leere Liste initialisiert.
|
# ch.candidate_pool wird im Modell-Konstruktor als leere Liste initialisiert.
|
||||||
for edge_info in parse_edges_robust(ch.text):
|
for edge_info in parse_edges_robust(ch.text):
|
||||||
|
|
@ -89,6 +90,10 @@ async def assemble_chunks(note_id: str, md_text: str, note_type: str, config: Op
|
||||||
provenance = "explicit:callout" if is_callout else "explicit"
|
provenance = "explicit:callout" if is_callout else "explicit"
|
||||||
ch.candidate_pool.append({"kind": k, "to": t, "provenance": provenance})
|
ch.candidate_pool.append({"kind": k, "to": t, "provenance": provenance})
|
||||||
|
|
||||||
|
# WP-24c v4.4.0-DEBUG: Schnittstelle 1 - Logging
|
||||||
|
if is_callout:
|
||||||
|
logger.debug(f"DEBUG-TRACER [Extraction]: Chunk Index: {idx}, Chunk ID: {ch.id}, Kind: {k}, Target: {t}, Provenance: {provenance}, Is_Callout: {is_callout}, Raw_Edge_Str: {edge_str}")
|
||||||
|
|
||||||
# 6. Global Pool (Unzugeordnete Kanten - kann mitten im Dokument oder am Ende stehen)
|
# 6. Global Pool (Unzugeordnete Kanten - kann mitten im Dokument oder am Ende stehen)
|
||||||
# WP-24c v4.2.0: Konfigurierbare Header-Namen und -Ebene via .env
|
# WP-24c v4.2.0: Konfigurierbare Header-Namen und -Ebene via .env
|
||||||
# Sucht nach ALLEN Edge-Pool Blöcken im Original-Markdown (nicht nur am Ende).
|
# Sucht nach ALLEN Edge-Pool Blöcken im Original-Markdown (nicht nur am Ende).
|
||||||
|
|
|
||||||
|
|
@ -179,9 +179,15 @@ def extract_callouts_from_markdown(
|
||||||
# Härtung: Berücksichtigt auch Sektions-Anker (sec) für Multigraph-Präzision
|
# Härtung: Berücksichtigt auch Sektions-Anker (sec) für Multigraph-Präzision
|
||||||
# Ein Callout zu "Note#Section1" ist anders als "Note#Section2" oder "Note"
|
# Ein Callout zu "Note#Section1" ist anders als "Note#Section2" oder "Note"
|
||||||
callout_key = (k, t, sec)
|
callout_key = (k, t, sec)
|
||||||
if callout_key in existing_chunk_callouts:
|
|
||||||
|
# WP-24c v4.4.0-DEBUG: Schnittstelle 3 - Global Scan Vergleich
|
||||||
|
is_blocked = callout_key in existing_chunk_callouts
|
||||||
|
logger.debug(f"DEBUG-TRACER [Global Scan Compare]: Key: ({k}, {t}, {sec}), Raw_Target: {raw_t}, In_Block_List: {is_blocked}, Block_List_Size: {len(existing_chunk_callouts) if existing_chunk_callouts else 0}")
|
||||||
|
|
||||||
|
if is_blocked:
|
||||||
# Callout ist bereits in Chunk erfasst -> überspringe (wird mit chunk-Scope angelegt)
|
# Callout ist bereits in Chunk erfasst -> überspringe (wird mit chunk-Scope angelegt)
|
||||||
# Die Sektion (sec) ist bereits im Key enthalten, daher wird Multigraph-Präzision gewährleistet
|
# Die Sektion (sec) ist bereits im Key enthalten, daher wird Multigraph-Präzision gewährleistet
|
||||||
|
logger.debug(f"DEBUG-TRACER [Global Scan Compare]: Key ({k}, {t}, {sec}) ist blockiert - überspringe")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# WP-24c v4.2.1: Callout ist NICHT in Chunks -> lege mit scope: "note" an
|
# WP-24c v4.2.1: Callout ist NICHT in Chunks -> lege mit scope: "note" an
|
||||||
|
|
@ -196,6 +202,9 @@ def extract_callouts_from_markdown(
|
||||||
if sec:
|
if sec:
|
||||||
payload["target_section"] = sec
|
payload["target_section"] = sec
|
||||||
|
|
||||||
|
# WP-24c v4.4.0-DEBUG: Schnittstelle 3 - Global Scan erstellt Note-Scope Callout
|
||||||
|
logger.debug(f"DEBUG-TRACER [Global Scan Create]: Erstelle Note-Scope Callout - Kind: {k}, Target: {t}, Section: {sec}, Raw_Target: {raw_t}, Edge_ID: {payload['edge_id']}, Confidence: {payload['confidence']}")
|
||||||
|
|
||||||
edges.append(_edge(
|
edges.append(_edge(
|
||||||
kind=k,
|
kind=k,
|
||||||
scope="note",
|
scope="note",
|
||||||
|
|
@ -344,6 +353,9 @@ def build_edges_for_note(
|
||||||
all_chunk_callout_keys.add(callout_key)
|
all_chunk_callout_keys.add(callout_key)
|
||||||
logger.debug(f"Note [{note_id}]: Callout-Key gesammelt: ({k}, {t}, {sec})")
|
logger.debug(f"Note [{note_id}]: Callout-Key gesammelt: ({k}, {t}, {sec})")
|
||||||
|
|
||||||
|
# WP-24c v4.4.0-DEBUG: Schnittstelle 3 - Synchronisation Phase 1
|
||||||
|
logger.debug(f"DEBUG-TRACER [Phase 1 Sync]: Gefundener Key im Pool: ({k}, {t}, {sec}), Raw_Target: {raw_t}, Zugeordnet zu: {cid}, Chunk_Index: {ch.get('index', '?')}, Provenance: {p}")
|
||||||
|
|
||||||
# WP-24c v4.3.0: Debug-Logik - Ausgabe der gesammelten Keys
|
# WP-24c v4.3.0: Debug-Logik - Ausgabe der gesammelten Keys
|
||||||
if all_chunk_callout_keys:
|
if all_chunk_callout_keys:
|
||||||
logger.debug(f"Note [{note_id}]: Gesammelt {len(all_chunk_callout_keys)} Callout-Keys aus candidate_pools")
|
logger.debug(f"Note [{note_id}]: Gesammelt {len(all_chunk_callout_keys)} Callout-Keys aus candidate_pools")
|
||||||
|
|
@ -474,6 +486,12 @@ def build_edges_for_note(
|
||||||
# WP-24c v4.3.0: Debug-Logik - Ausgabe vor globalem Scan
|
# WP-24c v4.3.0: Debug-Logik - Ausgabe vor globalem Scan
|
||||||
logger.debug(f"Note [{note_id}]: Starte globalen Markdown-Scan mit {len(all_chunk_callout_keys)} ausgeschlossenen Callout-Keys")
|
logger.debug(f"Note [{note_id}]: Starte globalen Markdown-Scan mit {len(all_chunk_callout_keys)} ausgeschlossenen Callout-Keys")
|
||||||
|
|
||||||
|
# WP-24c v4.4.0-DEBUG: Schnittstelle 3 - Global Scan Start
|
||||||
|
block_list = list(all_chunk_callout_keys)
|
||||||
|
logger.debug(f"DEBUG-TRACER [Global Scan Start]: Block-Liste (all_chunk_callout_keys): {block_list}, Anzahl: {len(block_list)}")
|
||||||
|
for key in block_list:
|
||||||
|
logger.debug(f"DEBUG-TRACER [Global Scan Start]: Block-Key Detail - Kind: {key[0]}, Target: {key[1]}, Section: {key[2]}")
|
||||||
|
|
||||||
callout_edges_from_markdown = extract_callouts_from_markdown(
|
callout_edges_from_markdown = extract_callouts_from_markdown(
|
||||||
markdown_body,
|
markdown_body,
|
||||||
note_id,
|
note_id,
|
||||||
|
|
@ -522,6 +540,11 @@ def build_edges_for_note(
|
||||||
# Semantischer Schlüssel: (kind, semantic_source, target_id, target_section)
|
# Semantischer Schlüssel: (kind, semantic_source, target_id, target_section)
|
||||||
semantic_key = (kind, semantic_source, target_id, target_section)
|
semantic_key = (kind, semantic_source, target_id, target_section)
|
||||||
|
|
||||||
|
# WP-24c v4.4.0-DEBUG: Schnittstelle 4 - De-Duplizierung Gruppierung
|
||||||
|
# Nur für Callout-Kanten loggen
|
||||||
|
if e.get("provenance") == "explicit:callout":
|
||||||
|
logger.debug(f"DEBUG-TRACER [Dedup Grouping]: Edge zu Gruppe - Semantic_Key: {semantic_key}, Scope: {scope}, Source_ID: {source_id}, Provenance: {e.get('provenance')}, Confidence: {e.get('confidence')}, Edge_ID: {e.get('edge_id')}")
|
||||||
|
|
||||||
if semantic_key not in semantic_groups:
|
if semantic_key not in semantic_groups:
|
||||||
semantic_groups[semantic_key] = []
|
semantic_groups[semantic_key] = []
|
||||||
semantic_groups[semantic_key].append(e)
|
semantic_groups[semantic_key].append(e)
|
||||||
|
|
@ -531,6 +554,10 @@ def build_edges_for_note(
|
||||||
final_edges: List[dict] = []
|
final_edges: List[dict] = []
|
||||||
|
|
||||||
for semantic_key, group in semantic_groups.items():
|
for semantic_key, group in semantic_groups.items():
|
||||||
|
# WP-24c v4.4.0-DEBUG: Schnittstelle 4 - De-Duplizierung Entscheidung
|
||||||
|
# Prüfe, ob diese Gruppe Callout-Kanten enthält
|
||||||
|
has_callouts = any(e.get("provenance") == "explicit:callout" for e in group)
|
||||||
|
|
||||||
if len(group) == 1:
|
if len(group) == 1:
|
||||||
# Nur eine Kante: Direkt verwenden, aber ID neu berechnen mit finalem Scope
|
# Nur eine Kante: Direkt verwenden, aber ID neu berechnen mit finalem Scope
|
||||||
winner = group[0]
|
winner = group[0]
|
||||||
|
|
@ -546,17 +573,29 @@ def build_edges_for_note(
|
||||||
if winner.get("provenance") == "explicit:callout":
|
if winner.get("provenance") == "explicit:callout":
|
||||||
logger.debug(f"Note [{note_id}]: Finale Callout-Kante (single): scope={final_scope}, source={final_source}, target={target_id}, section={target_section}")
|
logger.debug(f"Note [{note_id}]: Finale Callout-Kante (single): scope={final_scope}, source={final_source}, target={target_id}, section={target_section}")
|
||||||
|
|
||||||
|
# WP-24c v4.4.0-DEBUG: Schnittstelle 4 - Single Edge
|
||||||
|
if has_callouts:
|
||||||
|
logger.debug(f"DEBUG-TRACER [Dedup]: Gruppe: {semantic_key}, Kandidaten: [Single: scope={final_scope}/provenance={winner.get('provenance')}/confidence={winner.get('confidence')}], Gewinner: {final_edge_id}, Grund: Single-Edge")
|
||||||
|
|
||||||
final_edges.append(winner)
|
final_edges.append(winner)
|
||||||
else:
|
else:
|
||||||
# Mehrere Kanten mit gleichem semantischen Schlüssel: Scope-Entscheidung
|
# Mehrere Kanten mit gleichem semantischen Schlüssel: Scope-Entscheidung
|
||||||
# WP-24c v4.3.1: Präzision (Chunk) siegt über Globalität (Note)
|
# WP-24c v4.3.1: Präzision (Chunk) siegt über Globalität (Note)
|
||||||
winner = None
|
winner = None
|
||||||
|
|
||||||
|
# WP-24c v4.4.0-DEBUG: Schnittstelle 4 - De-Duplizierung Kandidaten-Analyse
|
||||||
|
if has_callouts:
|
||||||
|
candidates_info = []
|
||||||
|
for e in group:
|
||||||
|
candidates_info.append(f"scope={e.get('scope')}/provenance={e.get('provenance')}/confidence={e.get('confidence')}/source={e.get('source_id')}")
|
||||||
|
logger.debug(f"DEBUG-TRACER [Dedup]: Gruppe: {semantic_key}, Kandidaten: [{', '.join(candidates_info)}]")
|
||||||
|
|
||||||
# Regel 1: explicit:note_zone hat höchste Priorität (Autorität)
|
# Regel 1: explicit:note_zone hat höchste Priorität (Autorität)
|
||||||
note_zone_candidates = [e for e in group if e.get("provenance") == "explicit:note_zone"]
|
note_zone_candidates = [e for e in group if e.get("provenance") == "explicit:note_zone"]
|
||||||
if note_zone_candidates:
|
if note_zone_candidates:
|
||||||
# Wenn mehrere note_zone: Nimm die mit höchster Confidence
|
# Wenn mehrere note_zone: Nimm die mit höchster Confidence
|
||||||
winner = max(note_zone_candidates, key=lambda e: e.get("confidence", 0))
|
winner = max(note_zone_candidates, key=lambda e: e.get("confidence", 0))
|
||||||
|
decision_reason = "explicit:note_zone (höchste Priorität)"
|
||||||
else:
|
else:
|
||||||
# Regel 2: chunk-Scope ZWINGEND bevorzugen (Präzisions-Vorteil)
|
# Regel 2: chunk-Scope ZWINGEND bevorzugen (Präzisions-Vorteil)
|
||||||
# WP-24c v4.3.1: Wenn mindestens ein chunk-Kandidat existiert, muss dieser gewinnen
|
# WP-24c v4.3.1: Wenn mindestens ein chunk-Kandidat existiert, muss dieser gewinnen
|
||||||
|
|
@ -567,11 +606,13 @@ def build_edges_for_note(
|
||||||
winner = max(chunk_candidates, key=lambda e: (
|
winner = max(chunk_candidates, key=lambda e: (
|
||||||
e.get("confidence", 0) * PROVENANCE_PRIORITY.get(e.get("provenance", ""), 0.7)
|
e.get("confidence", 0) * PROVENANCE_PRIORITY.get(e.get("provenance", ""), 0.7)
|
||||||
))
|
))
|
||||||
|
decision_reason = f"chunk-Scope (Präzision, {len(chunk_candidates)} chunk-Kandidaten)"
|
||||||
else:
|
else:
|
||||||
# Regel 3: Fallback (nur wenn KEIN chunk-Kandidat vorhanden): Höchste Confidence * Priority
|
# Regel 3: Fallback (nur wenn KEIN chunk-Kandidat vorhanden): Höchste Confidence * Priority
|
||||||
winner = max(group, key=lambda e: (
|
winner = max(group, key=lambda e: (
|
||||||
e.get("confidence", 0) * PROVENANCE_PRIORITY.get(e.get("provenance", ""), 0.7)
|
e.get("confidence", 0) * PROVENANCE_PRIORITY.get(e.get("provenance", ""), 0.7)
|
||||||
))
|
))
|
||||||
|
decision_reason = "Fallback (höchste Confidence * Priority, kein chunk-Kandidat)"
|
||||||
|
|
||||||
# WP-24c v4.2.2: Berechne edge_id mit finalem Scope
|
# WP-24c v4.2.2: Berechne edge_id mit finalem Scope
|
||||||
final_scope = winner.get("scope", "chunk")
|
final_scope = winner.get("scope", "chunk")
|
||||||
|
|
@ -585,6 +626,10 @@ def build_edges_for_note(
|
||||||
if winner.get("provenance") == "explicit:callout":
|
if winner.get("provenance") == "explicit:callout":
|
||||||
logger.debug(f"Note [{note_id}]: Finale Callout-Kante (deduped, {len(group)} Kandidaten): scope={final_scope}, source={final_source}, target={target_id}, section={target_section}")
|
logger.debug(f"Note [{note_id}]: Finale Callout-Kante (deduped, {len(group)} Kandidaten): scope={final_scope}, source={final_source}, target={target_id}, section={target_section}")
|
||||||
|
|
||||||
|
# WP-24c v4.4.0-DEBUG: Schnittstelle 4 - Entscheidung
|
||||||
|
if has_callouts:
|
||||||
|
logger.debug(f"DEBUG-TRACER [Decision]: Gewinner: {final_edge_id}, Scope: {final_scope}, Source: {final_source}, Provenance: {winner.get('provenance')}, Confidence: {winner.get('confidence')}, Grund: {decision_reason}")
|
||||||
|
|
||||||
final_edges.append(winner)
|
final_edges.append(winner)
|
||||||
|
|
||||||
return final_edges
|
return final_edges
|
||||||
|
|
@ -113,6 +113,16 @@ def make_chunk_payloads(note: Dict[str, Any], note_path: str, chunks_from_chunke
|
||||||
for alias in ("chunk_num", "Chunk_Number"):
|
for alias in ("chunk_num", "Chunk_Number"):
|
||||||
pl.pop(alias, None)
|
pl.pop(alias, None)
|
||||||
|
|
||||||
|
# WP-24c v4.4.0-DEBUG: Schnittstelle 2 - Transfer
|
||||||
|
# Log-Output unmittelbar bevor das Dictionary zurückgegeben wird
|
||||||
|
pool_size = len(candidate_pool) if candidate_pool else 0
|
||||||
|
pool_content = candidate_pool if candidate_pool else []
|
||||||
|
explicit_callout_in_pool = [c for c in pool_content if isinstance(c, dict) and c.get("provenance") == "explicit:callout"]
|
||||||
|
logger.debug(f"DEBUG-TRACER [Payload]: Chunk ID: {cid}, Index: {index}, Pool-Size: {pool_size}, Pool-Inhalt: {pool_content}, Explicit-Callout-Count: {len(explicit_callout_in_pool)}, Has_Candidate_Pool_Key: {'candidate_pool' in pl}")
|
||||||
|
if explicit_callout_in_pool:
|
||||||
|
for ec in explicit_callout_in_pool:
|
||||||
|
logger.debug(f"DEBUG-TRACER [Payload]: Explicit-Callout Detail - Kind: {ec.get('kind')}, To: {ec.get('to')}, Provenance: {ec.get('provenance')}")
|
||||||
|
|
||||||
out.append(pl)
|
out.append(pl)
|
||||||
|
|
||||||
return out
|
return out
|
||||||
Loading…
Reference in New Issue
Block a user