Fix: Semantische Deduplizierung in graph_derive_edges.py
This commit is contained in:
parent
0a429e1f7b
commit
38a61d7b50
|
|
@ -4,7 +4,7 @@ DESCRIPTION: Hauptlogik zur Kanten-Aggregation und De-Duplizierung.
|
||||||
AUDIT:
|
AUDIT:
|
||||||
- Nutzt parse_link_target
|
- Nutzt parse_link_target
|
||||||
- Übergibt Section als 'variant' an ID-Gen
|
- Übergibt Section als 'variant' an ID-Gen
|
||||||
- Dedup basiert jetzt auf Edge-ID (erlaubt Multigraph für Sections)
|
- FIXED: Semantische De-Duplizierung über (source_id, target_id, kind, target_section); rule_id und edge_id werden ignoriert, bei Konflikten gewinnt die höhere Confidence
|
||||||
"""
|
"""
|
||||||
from typing import List, Optional, Dict, Tuple
|
from typing import List, Optional, Dict, Tuple
|
||||||
from .graph_utils import (
|
from .graph_utils import (
|
||||||
|
|
@ -21,11 +21,11 @@ def build_edges_for_note(
|
||||||
note_level_references: Optional[List[str]] = None,
|
note_level_references: Optional[List[str]] = None,
|
||||||
include_note_scope_refs: bool = False,
|
include_note_scope_refs: bool = False,
|
||||||
) -> List[dict]:
|
) -> List[dict]:
|
||||||
"""Erzeugt und aggregiert alle Kanten für eine Note."""
|
"""Erzeugt und aggregiert alle Kanten für eine Note (WP-15b)."""
|
||||||
edges: List[dict] = []
|
edges: List[dict] = []
|
||||||
note_type = _get(chunks[0], "type") if chunks else "concept"
|
note_type = _get(chunks[0], "type") if chunks else "concept"
|
||||||
|
|
||||||
# 1) Struktur-Kanten
|
# 1) Struktur-Kanten (belongs_to, next/prev)
|
||||||
for idx, ch in enumerate(chunks):
|
for idx, ch in enumerate(chunks):
|
||||||
cid = _get(ch, "chunk_id", "id")
|
cid = _get(ch, "chunk_id", "id")
|
||||||
if not cid: continue
|
if not cid: continue
|
||||||
|
|
@ -55,21 +55,21 @@ def build_edges_for_note(
|
||||||
if not cid: continue
|
if not cid: continue
|
||||||
raw = _get(ch, "window") or _get(ch, "text") or ""
|
raw = _get(ch, "window") or _get(ch, "text") or ""
|
||||||
|
|
||||||
# Typed
|
# Typed & Candidate Pool (WP-15b Integration)
|
||||||
typed, rem = extract_typed_relations(raw)
|
typed, rem = extract_typed_relations(raw)
|
||||||
for k, raw_t in typed:
|
for k, raw_t in typed:
|
||||||
t, sec = parse_link_target(raw_t, note_id)
|
t, sec = parse_link_target(raw_t, note_id)
|
||||||
if not t: continue
|
if not t: continue
|
||||||
|
|
||||||
payload = {
|
payload = {
|
||||||
"chunk_id": cid,
|
"chunk_id": cid,
|
||||||
# Variant=sec sorgt für eindeutige ID pro Abschnitt
|
|
||||||
"edge_id": _mk_edge_id(k, cid, t, "chunk", "inline:rel", variant=sec),
|
"edge_id": _mk_edge_id(k, cid, t, "chunk", "inline:rel", variant=sec),
|
||||||
"provenance": "explicit", "rule_id": "inline:rel", "confidence": PROVENANCE_PRIORITY["inline:rel"]
|
"provenance": "explicit", "rule_id": "inline:rel", "confidence": PROVENANCE_PRIORITY["inline:rel"]
|
||||||
}
|
}
|
||||||
if sec: payload["target_section"] = sec
|
if sec: payload["target_section"] = sec
|
||||||
|
|
||||||
edges.append(_edge(k, "chunk", cid, t, note_id, payload))
|
edges.append(_edge(k, "chunk", cid, t, note_id, payload))
|
||||||
|
|
||||||
# Semantic AI Candidates
|
|
||||||
pool = ch.get("candidate_pool") or ch.get("candidate_edges") or []
|
pool = ch.get("candidate_pool") or ch.get("candidate_edges") or []
|
||||||
for cand in pool:
|
for cand in pool:
|
||||||
raw_t, k, p = cand.get("to"), cand.get("kind", "related_to"), cand.get("provenance", "semantic_ai")
|
raw_t, k, p = cand.get("to"), cand.get("kind", "related_to"), cand.get("provenance", "semantic_ai")
|
||||||
|
|
@ -81,38 +81,38 @@ def build_edges_for_note(
|
||||||
"provenance": p, "rule_id": f"candidate:{p}", "confidence": PROVENANCE_PRIORITY.get(p, 0.90)
|
"provenance": p, "rule_id": f"candidate:{p}", "confidence": PROVENANCE_PRIORITY.get(p, 0.90)
|
||||||
}
|
}
|
||||||
if sec: payload["target_section"] = sec
|
if sec: payload["target_section"] = sec
|
||||||
|
|
||||||
edges.append(_edge(k, "chunk", cid, t, note_id, payload))
|
edges.append(_edge(k, "chunk", cid, t, note_id, payload))
|
||||||
|
|
||||||
# Callouts
|
# Callouts & Wikilinks
|
||||||
call_pairs, rem2 = extract_callout_relations(rem)
|
call_pairs, rem2 = extract_callout_relations(rem)
|
||||||
for k, raw_t in call_pairs:
|
for k, raw_t in call_pairs:
|
||||||
t, sec = parse_link_target(raw_t, note_id)
|
t, sec = parse_link_target(raw_t, note_id)
|
||||||
if not t: continue
|
if not t: continue
|
||||||
|
|
||||||
payload = {
|
payload = {
|
||||||
"chunk_id": cid,
|
"chunk_id": cid,
|
||||||
"edge_id": _mk_edge_id(k, cid, t, "chunk", "callout:edge", variant=sec),
|
"edge_id": _mk_edge_id(k, cid, t, "chunk", "callout:edge", variant=sec),
|
||||||
"provenance": "explicit", "rule_id": "callout:edge", "confidence": PROVENANCE_PRIORITY["callout:edge"]
|
"provenance": "explicit", "rule_id": "callout:edge", "confidence": PROVENANCE_PRIORITY["callout:edge"]
|
||||||
}
|
}
|
||||||
if sec: payload["target_section"] = sec
|
if sec: payload["target_section"] = sec
|
||||||
|
|
||||||
edges.append(_edge(k, "chunk", cid, t, note_id, payload))
|
edges.append(_edge(k, "chunk", cid, t, note_id, payload))
|
||||||
|
|
||||||
# Wikilinks & Defaults
|
|
||||||
refs = extract_wikilinks(rem2)
|
refs = extract_wikilinks(rem2)
|
||||||
for raw_r in refs:
|
for raw_r in refs:
|
||||||
r, sec = parse_link_target(raw_r, note_id)
|
r, sec = parse_link_target(raw_r, note_id)
|
||||||
if not r: continue
|
if not r: continue
|
||||||
|
|
||||||
# Explicit Reference
|
|
||||||
payload = {
|
payload = {
|
||||||
"chunk_id": cid, "ref_text": raw_r,
|
"chunk_id": cid, "ref_text": raw_r,
|
||||||
"edge_id": _mk_edge_id("references", cid, r, "chunk", "explicit:wikilink", variant=sec),
|
"edge_id": _mk_edge_id("references", cid, r, "chunk", "explicit:wikilink", variant=sec),
|
||||||
"provenance": "explicit", "rule_id": "explicit:wikilink", "confidence": PROVENANCE_PRIORITY["explicit:wikilink"]
|
"provenance": "explicit", "rule_id": "explicit:wikilink", "confidence": PROVENANCE_PRIORITY["explicit:wikilink"]
|
||||||
}
|
}
|
||||||
if sec: payload["target_section"] = sec
|
if sec: payload["target_section"] = sec
|
||||||
|
|
||||||
edges.append(_edge("references", "chunk", cid, r, note_id, payload))
|
edges.append(_edge("references", "chunk", cid, r, note_id, payload))
|
||||||
|
|
||||||
# Defaults (nur einmal pro Target, Section hier irrelevant für Typ-Logik, oder?)
|
|
||||||
# Wir erzeugen Defaults auch pro Section, um Konsistenz zu wahren.
|
|
||||||
for rel in defaults:
|
for rel in defaults:
|
||||||
if rel != "references":
|
if rel != "references":
|
||||||
def_payload = {
|
def_payload = {
|
||||||
|
|
@ -141,13 +141,27 @@ def build_edges_for_note(
|
||||||
"provenance": "rule", "confidence": PROVENANCE_PRIORITY["derived:backlink"]
|
"provenance": "rule", "confidence": PROVENANCE_PRIORITY["derived:backlink"]
|
||||||
}))
|
}))
|
||||||
|
|
||||||
# Deduplizierung: Wir nutzen jetzt die EDGE-ID als Schlüssel.
|
# FIX: Semantische Deduplizierung
|
||||||
# Da die Edge-ID nun 'variant' (Section) enthält, bleiben unterschiedliche Sections erhalten.
|
# Wir nutzen einen Key aus (Source, Target, Kind, Section), um Duplikate
|
||||||
|
# aus verschiedenen Regeln (z.B. callout vs. wikilink) zusammenzuführen.
|
||||||
unique_map: Dict[str, dict] = {}
|
unique_map: Dict[str, dict] = {}
|
||||||
|
|
||||||
for e in edges:
|
for e in edges:
|
||||||
eid = e["edge_id"]
|
# Semantischer Schlüssel: Unabhängig von rule_id oder edge_id
|
||||||
# Bei Konflikt (gleiche ID = exakt gleiche Kante und Section) gewinnt die höhere Confidence
|
src = e.get("source_id", "")
|
||||||
if eid not in unique_map or e.get("confidence", 0) > unique_map[eid].get("confidence", 0):
|
tgt = e.get("target_id", "")
|
||||||
unique_map[eid] = e
|
kind = e.get("kind", "")
|
||||||
|
sec = e.get("target_section", "")
|
||||||
|
|
||||||
|
sem_key = f"{src}->{tgt}:{kind}@{sec}"
|
||||||
|
|
||||||
|
if sem_key not in unique_map:
|
||||||
|
unique_map[sem_key] = e
|
||||||
|
else:
|
||||||
|
# Konfliktlösung: Die Kante mit der höheren Confidence gewinnt; bei Gleichstand bleibt die zuerst gesehene Kante erhalten
|
||||||
|
curr_conf = unique_map[sem_key].get("confidence", 0.0)
|
||||||
|
new_conf = e.get("confidence", 0.0)
|
||||||
|
if new_conf > curr_conf:
|
||||||
|
unique_map[sem_key] = e
|
||||||
|
|
||||||
return list(unique_map.values())
|
return list(unique_map.values())
|
||||||
Loading…
Reference in New Issue
Block a user