Enhance compatibility in chunking and edge processing for version 4.4.1: harmonize the handling of the "to" and "target_id" keys across chunking_processor.py, graph_derive_edges.py, and ingestion_processor.py. Ensure that explicit callouts are validated and processed consistently, which improves integration and reliability in edge-candidate handling.
This commit is contained in:
parent
d7d6155203
commit
2d87f9d816
|
|
@ -87,8 +87,15 @@ async def assemble_chunks(note_id: str, md_text: str, note_type: str, config: Op
|
||||||
if len(parts) == 2:
|
if len(parts) == 2:
|
||||||
k, t = parts
|
k, t = parts
|
||||||
# WP-24c v4.2.7: Callout-Kanten erhalten explicit:callout Provenance
|
# WP-24c v4.2.7: Callout-Kanten erhalten explicit:callout Provenance
|
||||||
|
# WP-24c v4.4.1: Harmonisierung - Provenance muss exakt "explicit:callout" sein
|
||||||
provenance = "explicit:callout" if is_callout else "explicit"
|
provenance = "explicit:callout" if is_callout else "explicit"
|
||||||
ch.candidate_pool.append({"kind": k, "to": t, "provenance": provenance})
|
# WP-24c v4.4.1: Verwende "to" für Kompatibilität (wird auch in graph_derive_edges.py erwartet)
|
||||||
|
# Zusätzlich "target_id" für maximale Kompatibilität mit ingestion_processor Validierung
|
||||||
|
pool_entry = {"kind": k, "to": t, "provenance": provenance}
|
||||||
|
if is_callout:
|
||||||
|
# WP-24c v4.4.1: Für Callouts auch "target_id" hinzufügen für Validierung
|
||||||
|
pool_entry["target_id"] = t
|
||||||
|
ch.candidate_pool.append(pool_entry)
|
||||||
|
|
||||||
# WP-24c v4.4.0-DEBUG: Schnittstelle 1 - Logging
|
# WP-24c v4.4.0-DEBUG: Schnittstelle 1 - Logging
|
||||||
if is_callout:
|
if is_callout:
|
||||||
|
|
|
||||||
|
|
@ -340,14 +340,16 @@ def build_edges_for_note(
|
||||||
logger.debug(f"Note [{note_id}]: Chunk [{ch.get('index', '?')}] hat {pool_size} Kanten im Candidate-Pool ({explicit_callout_count} explicit:callout)")
|
logger.debug(f"Note [{note_id}]: Chunk [{ch.get('index', '?')}] hat {pool_size} Kanten im Candidate-Pool ({explicit_callout_count} explicit:callout)")
|
||||||
|
|
||||||
for cand in pool:
|
for cand in pool:
|
||||||
raw_t = cand.get("to")
|
# WP-24c v4.4.1: Harmonisierung - akzeptiere sowohl "to" als auch "target_id"
|
||||||
|
raw_t = cand.get("to") or cand.get("target_id")
|
||||||
k = cand.get("kind", "related_to")
|
k = cand.get("kind", "related_to")
|
||||||
p = cand.get("provenance", "semantic_ai")
|
p = cand.get("provenance", "semantic_ai")
|
||||||
|
|
||||||
|
# WP-24c v4.4.1: String-Check - Provenance muss exakt "explicit:callout" sein (case-sensitive)
|
||||||
# WP-24c v4.2.9 Fix B: Wenn Provenance explicit:callout, extrahiere Key
|
# WP-24c v4.2.9 Fix B: Wenn Provenance explicit:callout, extrahiere Key
|
||||||
# WP-24c v4.3.1: Key-Generierung gehärtet - Format (kind, target_id, target_section)
|
# WP-24c v4.3.1: Key-Generierung gehärtet - Format (kind, target_id, target_section)
|
||||||
# Exakt konsistent mit dem globalen Scan für zuverlässige Deduplizierung
|
# Exakt konsistent mit dem globalen Scan für zuverlässige Deduplizierung
|
||||||
if p == "explicit:callout":
|
if p == "explicit:callout" and raw_t:
|
||||||
t, sec = parse_link_target(raw_t, note_id)
|
t, sec = parse_link_target(raw_t, note_id)
|
||||||
if t:
|
if t:
|
||||||
# Key-Format: (kind, target_id, target_section) - exakt wie im globalen Scan
|
# Key-Format: (kind, target_id, target_section) - exakt wie im globalen Scan
|
||||||
|
|
@ -390,7 +392,10 @@ def build_edges_for_note(
|
||||||
# WP-24c v4.2.9: Erstelle Kanten aus candidate_pool (Keys bereits in Phase 1 gesammelt)
|
# WP-24c v4.2.9: Erstelle Kanten aus candidate_pool (Keys bereits in Phase 1 gesammelt)
|
||||||
pool = ch.get("candidate_pool") or ch.get("candidate_edges") or []
|
pool = ch.get("candidate_pool") or ch.get("candidate_edges") or []
|
||||||
for cand in pool:
|
for cand in pool:
|
||||||
raw_t, k, p = cand.get("to"), cand.get("kind", "related_to"), cand.get("provenance", "semantic_ai")
|
# WP-24c v4.4.1: Harmonisierung - akzeptiere sowohl "to" als auch "target_id"
|
||||||
|
raw_t = cand.get("to") or cand.get("target_id")
|
||||||
|
k = cand.get("kind", "related_to")
|
||||||
|
p = cand.get("provenance", "semantic_ai")
|
||||||
t, sec = parse_link_target(raw_t, note_id)
|
t, sec = parse_link_target(raw_t, note_id)
|
||||||
if t:
|
if t:
|
||||||
# WP-24c v4.1.0: target_section fließt nun fest in die ID-Generierung ein
|
# WP-24c v4.1.0: target_section fließt nun fest in die ID-Generierung ein
|
||||||
|
|
|
||||||
|
|
@ -242,13 +242,18 @@ class IngestionService:
|
||||||
for ch in chunks:
|
for ch in chunks:
|
||||||
new_pool = []
|
new_pool = []
|
||||||
for cand in getattr(ch, "candidate_pool", []):
|
for cand in getattr(ch, "candidate_pool", []):
|
||||||
t_id = cand.get('target_id') or cand.get('note_id')
|
# WP-24c v4.4.1: Harmonisierung - akzeptiere sowohl "to" als auch "target_id"
|
||||||
|
# Der chunking_processor verwendet "to", daher muss die Validierung beide Keys unterstützen
|
||||||
|
t_id = cand.get('target_id') or cand.get('to') or cand.get('note_id')
|
||||||
if not self._is_valid_id(t_id): continue
|
if not self._is_valid_id(t_id): continue
|
||||||
|
|
||||||
|
# WP-24c v4.4.1: explicit:callout Kanten durchlaufen KEINE Smart-Validierung via validate_edge_candidate (bereits präzise); die ID-Prüfung per _is_valid_id gilt weiterhin
|
||||||
|
# Sie müssen den Pool passieren, damit sie in Phase 1 erkannt werden
|
||||||
if cand.get("provenance") == "global_pool" and enable_smart:
|
if cand.get("provenance") == "global_pool" and enable_smart:
|
||||||
is_valid = await validate_edge_candidate(ch.text, cand, self.batch_cache, self.llm)
|
is_valid = await validate_edge_candidate(ch.text, cand, self.batch_cache, self.llm)
|
||||||
if is_valid: new_pool.append(cand)
|
if is_valid: new_pool.append(cand)
|
||||||
else:
|
else:
|
||||||
|
# WP-24c v4.4.1: Alle anderen Provenances (inkl. explicit:callout) passieren ohne Validierung
|
||||||
new_pool.append(cand)
|
new_pool.append(cand)
|
||||||
ch.candidate_pool = new_pool
|
ch.candidate_pool = new_pool
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user