From 2d87f9d816af466dd2cc30310460426f42831ebc Mon Sep 17 00:00:00 2001 From: Lars Date: Sun, 11 Jan 2026 15:39:03 +0100 Subject: [PATCH] Enhance compatibility in chunking and edge processing for version 4.4.1: Harmonize handling of "to" and "target_id" across chunking_processor.py, graph_derive_edges.py, and ingestion_processor.py. Ensure consistent validation and processing of explicit callouts, improving integration and reliability in edge candidate handling. --- app/core/chunking/chunking_processor.py | 9 ++++++++- app/core/graph/graph_derive_edges.py | 11 ++++++++--- app/core/ingestion/ingestion_processor.py | 7 ++++++- 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/app/core/chunking/chunking_processor.py b/app/core/chunking/chunking_processor.py index 567224f..af0afb8 100644 --- a/app/core/chunking/chunking_processor.py +++ b/app/core/chunking/chunking_processor.py @@ -87,8 +87,15 @@ async def assemble_chunks(note_id: str, md_text: str, note_type: str, config: Op if len(parts) == 2: k, t = parts # WP-24c v4.2.7: Callout-Kanten erhalten explicit:callout Provenance + # WP-24c v4.4.1: Harmonisierung - Provenance muss exakt "explicit:callout" sein provenance = "explicit:callout" if is_callout else "explicit" - ch.candidate_pool.append({"kind": k, "to": t, "provenance": provenance}) + # WP-24c v4.4.1: Verwende "to" für Kompatibilität (wird auch in graph_derive_edges.py erwartet) + # Zusätzlich "target_id" für maximale Kompatibilität mit ingestion_processor Validierung + pool_entry = {"kind": k, "to": t, "provenance": provenance} + if is_callout: + # WP-24c v4.4.1: Für Callouts auch "target_id" hinzufügen für Validierung + pool_entry["target_id"] = t + ch.candidate_pool.append(pool_entry) # WP-24c v4.4.0-DEBUG: Schnittstelle 1 - Logging if is_callout: diff --git a/app/core/graph/graph_derive_edges.py b/app/core/graph/graph_derive_edges.py index b7ee395..c56e9d9 100644 --- a/app/core/graph/graph_derive_edges.py +++ b/app/core/graph/graph_derive_edges.py @@ -340,14 +340,16 @@ def build_edges_for_note( logger.debug(f"Note [{note_id}]: Chunk [{ch.get('index', '?')}] hat {pool_size} Kanten im Candidate-Pool ({explicit_callout_count} explicit:callout)") for cand in pool: - raw_t = cand.get("to") + # WP-24c v4.4.1: Harmonisierung - akzeptiere sowohl "to" als auch "target_id" + raw_t = cand.get("to") or cand.get("target_id") k = cand.get("kind", "related_to") p = cand.get("provenance", "semantic_ai") + # WP-24c v4.4.1: String-Check - Provenance muss exakt "explicit:callout" sein (case-sensitive) # WP-24c v4.2.9 Fix B: Wenn Provenance explicit:callout, extrahiere Key # WP-24c v4.3.1: Key-Generierung gehärtet - Format (kind, target_id, target_section) # Exakt konsistent mit dem globalen Scan für zuverlässige Deduplizierung - if p == "explicit:callout": + if p == "explicit:callout" and raw_t: t, sec = parse_link_target(raw_t, note_id) if t: # Key-Format: (kind, target_id, target_section) - exakt wie im globalen Scan @@ -390,7 +392,10 @@ def build_edges_for_note( # WP-24c v4.2.9: Erstelle Kanten aus candidate_pool (Keys bereits in Phase 1 gesammelt) pool = ch.get("candidate_pool") or ch.get("candidate_edges") or [] for cand in pool: - raw_t, k, p = cand.get("to"), cand.get("kind", "related_to"), cand.get("provenance", "semantic_ai") + # WP-24c v4.4.1: Harmonisierung - akzeptiere sowohl "to" als auch "target_id" + raw_t = cand.get("to") or cand.get("target_id") + k = cand.get("kind", "related_to") + p = cand.get("provenance", "semantic_ai") t, sec = parse_link_target(raw_t, note_id) if t: # WP-24c v4.1.0: target_section fließt nun fest in die ID-Generierung ein diff --git a/app/core/ingestion/ingestion_processor.py b/app/core/ingestion/ingestion_processor.py index d803d9a..3c1ee21 100644 --- a/app/core/ingestion/ingestion_processor.py +++ b/app/core/ingestion/ingestion_processor.py @@ -242,13 +242,18 @@ class IngestionService: for ch in chunks: new_pool = [] for cand in getattr(ch, "candidate_pool", []): - t_id = cand.get('target_id') or cand.get('note_id') + # WP-24c v4.4.1: Harmonisierung - akzeptiere sowohl "to" als auch "target_id" + # Der chunking_processor verwendet "to", daher muss die Validierung beide Keys unterstützen + t_id = cand.get('target_id') or cand.get('to') or cand.get('note_id') if not self._is_valid_id(t_id): continue + # WP-24c v4.4.1: explicit:callout Kanten werden NICHT validiert (bereits präzise) + # Sie müssen den Pool passieren, damit sie in Phase 1 erkannt werden if cand.get("provenance") == "global_pool" and enable_smart: is_valid = await validate_edge_candidate(ch.text, cand, self.batch_cache, self.llm) if is_valid: new_pool.append(cand) else: + # WP-24c v4.4.1: Alle anderen Provenances (inkl. explicit:callout) passieren ohne Validierung new_pool.append(cand) ch.candidate_pool = new_pool