Neue Deduplizierung für Callout-Edges
This commit is contained in:
parent
38a61d7b50
commit
03d3173ca6
|
|
@ -4,7 +4,7 @@ DESCRIPTION: Hauptlogik zur Kanten-Aggregation und De-Duplizierung.
|
||||||
AUDIT:
|
AUDIT:
|
||||||
- Nutzt parse_link_target
|
- Nutzt parse_link_target
|
||||||
- Übergibt Section als 'variant' an ID-Gen
|
- Übergibt Section als 'variant' an ID-Gen
|
||||||
- FIXED: Semantische De-Duplizierung (ignoriert rule_id bei Konflikten)
|
- FIXED: Semantische De-Duplizierung via 'sem_key' (löst das Callout-Problem)
|
||||||
"""
|
"""
|
||||||
from typing import List, Optional, Dict, Tuple
|
from typing import List, Optional, Dict, Tuple
|
||||||
from .graph_utils import (
|
from .graph_utils import (
|
||||||
|
|
@ -63,6 +63,7 @@ def build_edges_for_note(
|
||||||
|
|
||||||
payload = {
|
payload = {
|
||||||
"chunk_id": cid,
|
"chunk_id": cid,
|
||||||
|
# Variant=sec sorgt für eindeutige ID pro Abschnitt
|
||||||
"edge_id": _mk_edge_id(k, cid, t, "chunk", "inline:rel", variant=sec),
|
"edge_id": _mk_edge_id(k, cid, t, "chunk", "inline:rel", variant=sec),
|
||||||
"provenance": "explicit", "rule_id": "inline:rel", "confidence": PROVENANCE_PRIORITY["inline:rel"]
|
"provenance": "explicit", "rule_id": "inline:rel", "confidence": PROVENANCE_PRIORITY["inline:rel"]
|
||||||
}
|
}
|
||||||
|
|
@ -127,6 +128,8 @@ def build_edges_for_note(
|
||||||
|
|
||||||
# 3) Note-Scope & De-Duplizierung
|
# 3) Note-Scope & De-Duplizierung
|
||||||
if include_note_scope_refs:
|
if include_note_scope_refs:
|
||||||
|
# refs_all ist jetzt schon gesäubert (nur Targets)
|
||||||
|
# note_level_references müssen auch gesäubert werden
|
||||||
cleaned_note_refs = [parse_link_target(r, note_id)[0] for r in (note_level_references or [])]
|
cleaned_note_refs = [parse_link_target(r, note_id)[0] for r in (note_level_references or [])]
|
||||||
refs_note = _dedupe_seq((refs_all or []) + cleaned_note_refs)
|
refs_note = _dedupe_seq((refs_all or []) + cleaned_note_refs)
|
||||||
|
|
||||||
|
|
@ -141,9 +144,12 @@ def build_edges_for_note(
|
||||||
"provenance": "rule", "confidence": PROVENANCE_PRIORITY["derived:backlink"]
|
"provenance": "rule", "confidence": PROVENANCE_PRIORITY["derived:backlink"]
|
||||||
}))
|
}))
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------------------
|
||||||
# FIX: Semantische Deduplizierung
|
# FIX: Semantische Deduplizierung
|
||||||
|
# Hier lösen wir das Problem, dass Callout-Kanten andere überschreiben.
|
||||||
# Wir nutzen einen Key aus (Source, Target, Kind, Section), um Duplikate
|
# Wir nutzen einen Key aus (Source, Target, Kind, Section), um Duplikate
|
||||||
# aus verschiedenen Regeln (z.B. callout vs. wikilink) zusammenzuführen.
|
# aus verschiedenen Regeln (z.B. callout vs. wikilink) zusammenzuführen.
|
||||||
|
# ----------------------------------------------------------------------------------
|
||||||
unique_map: Dict[str, dict] = {}
|
unique_map: Dict[str, dict] = {}
|
||||||
|
|
||||||
for e in edges:
|
for e in edges:
|
||||||
|
|
@ -153,6 +159,8 @@ def build_edges_for_note(
|
||||||
kind = e.get("kind", "")
|
kind = e.get("kind", "")
|
||||||
sec = e.get("target_section", "")
|
sec = e.get("target_section", "")
|
||||||
|
|
||||||
|
# Dieser Key ist für alle Einträge im Callout-Block UNTERSCHIEDLICH,
|
||||||
|
# da 'sec' (z. B. "1) Integrität", "3) Disziplin" ...) jeweils unterschiedlich ist.
|
||||||
sem_key = f"{src}->{tgt}:{kind}@{sec}"
|
sem_key = f"{src}->{tgt}:{kind}@{sec}"
|
||||||
|
|
||||||
if sem_key not in unique_map:
|
if sem_key not in unique_map:
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user