anpassungen Kantenvergleich

This commit is contained in:
Lars 2025-12-29 11:45:25 +01:00
parent 857ba953e3
commit 0a429e1f7b
2 changed files with 43 additions and 26 deletions

View File

@ -1,7 +1,10 @@
""" """
FILE: app/core/graph/graph_derive_edges.py FILE: app/core/graph/graph_derive_edges.py
DESCRIPTION: Hauptlogik zur Kanten-Aggregation und De-Duplizierung. DESCRIPTION: Hauptlogik zur Kanten-Aggregation und De-Duplizierung.
AUDIT: Integriert parse_link_target für saubere Graphen-Topologie. AUDIT:
- Nutzt parse_link_target
- Übergibt Section als 'variant' an ID-Gen
- Dedup basiert jetzt auf Edge-ID (erlaubt Multigraph für Sections)
""" """
from typing import List, Optional, Dict, Tuple from typing import List, Optional, Dict, Tuple
from .graph_utils import ( from .graph_utils import (
@ -18,11 +21,11 @@ def build_edges_for_note(
note_level_references: Optional[List[str]] = None, note_level_references: Optional[List[str]] = None,
include_note_scope_refs: bool = False, include_note_scope_refs: bool = False,
) -> List[dict]: ) -> List[dict]:
"""Erzeugt und aggregiert alle Kanten für eine Note (WP-15b).""" """Erzeugt und aggregiert alle Kanten für eine Note."""
edges: List[dict] = [] edges: List[dict] = []
note_type = _get(chunks[0], "type") if chunks else "concept" note_type = _get(chunks[0], "type") if chunks else "concept"
# 1) Struktur-Kanten (belongs_to, next/prev) # 1) Struktur-Kanten
for idx, ch in enumerate(chunks): for idx, ch in enumerate(chunks):
cid = _get(ch, "chunk_id", "id") cid = _get(ch, "chunk_id", "id")
if not cid: continue if not cid: continue
@ -52,76 +55,78 @@ def build_edges_for_note(
if not cid: continue if not cid: continue
raw = _get(ch, "window") or _get(ch, "text") or "" raw = _get(ch, "window") or _get(ch, "text") or ""
# Typed & Candidate Pool (WP-15b Integration) # Typed
typed, rem = extract_typed_relations(raw) typed, rem = extract_typed_relations(raw)
for k, raw_t in typed: for k, raw_t in typed:
t, sec = parse_link_target(raw_t, note_id) t, sec = parse_link_target(raw_t, note_id)
if not t: continue if not t: continue
payload = { payload = {
"chunk_id": cid, "edge_id": _mk_edge_id(k, cid, t, "chunk", "inline:rel"), "chunk_id": cid,
# Variant=sec sorgt für eindeutige ID pro Abschnitt
"edge_id": _mk_edge_id(k, cid, t, "chunk", "inline:rel", variant=sec),
"provenance": "explicit", "rule_id": "inline:rel", "confidence": PROVENANCE_PRIORITY["inline:rel"] "provenance": "explicit", "rule_id": "inline:rel", "confidence": PROVENANCE_PRIORITY["inline:rel"]
} }
if sec: payload["target_section"] = sec if sec: payload["target_section"] = sec
edges.append(_edge(k, "chunk", cid, t, note_id, payload)) edges.append(_edge(k, "chunk", cid, t, note_id, payload))
# Semantic AI Candidates
pool = ch.get("candidate_pool") or ch.get("candidate_edges") or [] pool = ch.get("candidate_pool") or ch.get("candidate_edges") or []
for cand in pool: for cand in pool:
raw_t, k, p = cand.get("to"), cand.get("kind", "related_to"), cand.get("provenance", "semantic_ai") raw_t, k, p = cand.get("to"), cand.get("kind", "related_to"), cand.get("provenance", "semantic_ai")
t, sec = parse_link_target(raw_t, note_id) t, sec = parse_link_target(raw_t, note_id)
if t: if t:
payload = { payload = {
"chunk_id": cid, "edge_id": _mk_edge_id(k, cid, t, "chunk", f"candidate:{p}"), "chunk_id": cid,
"edge_id": _mk_edge_id(k, cid, t, "chunk", f"candidate:{p}", variant=sec),
"provenance": p, "rule_id": f"candidate:{p}", "confidence": PROVENANCE_PRIORITY.get(p, 0.90) "provenance": p, "rule_id": f"candidate:{p}", "confidence": PROVENANCE_PRIORITY.get(p, 0.90)
} }
if sec: payload["target_section"] = sec if sec: payload["target_section"] = sec
edges.append(_edge(k, "chunk", cid, t, note_id, payload)) edges.append(_edge(k, "chunk", cid, t, note_id, payload))
# Callouts & Wikilinks # Callouts
call_pairs, rem2 = extract_callout_relations(rem) call_pairs, rem2 = extract_callout_relations(rem)
for k, raw_t in call_pairs: for k, raw_t in call_pairs:
t, sec = parse_link_target(raw_t, note_id) t, sec = parse_link_target(raw_t, note_id)
if not t: continue if not t: continue
payload = { payload = {
"chunk_id": cid, "edge_id": _mk_edge_id(k, cid, t, "chunk", "callout:edge"), "chunk_id": cid,
"edge_id": _mk_edge_id(k, cid, t, "chunk", "callout:edge", variant=sec),
"provenance": "explicit", "rule_id": "callout:edge", "confidence": PROVENANCE_PRIORITY["callout:edge"] "provenance": "explicit", "rule_id": "callout:edge", "confidence": PROVENANCE_PRIORITY["callout:edge"]
} }
if sec: payload["target_section"] = sec if sec: payload["target_section"] = sec
edges.append(_edge(k, "chunk", cid, t, note_id, payload)) edges.append(_edge(k, "chunk", cid, t, note_id, payload))
# Wikilinks & Defaults
refs = extract_wikilinks(rem2) refs = extract_wikilinks(rem2)
for raw_r in refs: for raw_r in refs:
r, sec = parse_link_target(raw_r, note_id) r, sec = parse_link_target(raw_r, note_id)
if not r: continue if not r: continue
# Explicit Reference
payload = { payload = {
"chunk_id": cid, "ref_text": raw_r, "edge_id": _mk_edge_id("references", cid, r, "chunk", "explicit:wikilink"), "chunk_id": cid, "ref_text": raw_r,
"edge_id": _mk_edge_id("references", cid, r, "chunk", "explicit:wikilink", variant=sec),
"provenance": "explicit", "rule_id": "explicit:wikilink", "confidence": PROVENANCE_PRIORITY["explicit:wikilink"] "provenance": "explicit", "rule_id": "explicit:wikilink", "confidence": PROVENANCE_PRIORITY["explicit:wikilink"]
} }
if sec: payload["target_section"] = sec if sec: payload["target_section"] = sec
edges.append(_edge("references", "chunk", cid, r, note_id, payload)) edges.append(_edge("references", "chunk", cid, r, note_id, payload))
# Defaults (nur einmal pro Target, Section hier irrelevant für Typ-Logik, oder?)
# Wir erzeugen Defaults auch pro Section, um Konsistenz zu wahren.
for rel in defaults: for rel in defaults:
if rel != "references": if rel != "references":
def_payload = { def_payload = {
"chunk_id": cid, "edge_id": _mk_edge_id(rel, cid, r, "chunk", f"edge_defaults:{rel}"), "chunk_id": cid,
"edge_id": _mk_edge_id(rel, cid, r, "chunk", f"edge_defaults:{rel}", variant=sec),
"provenance": "rule", "rule_id": f"edge_defaults:{rel}", "confidence": PROVENANCE_PRIORITY["edge_defaults"] "provenance": "rule", "rule_id": f"edge_defaults:{rel}", "confidence": PROVENANCE_PRIORITY["edge_defaults"]
} }
if sec: def_payload["target_section"] = sec if sec: def_payload["target_section"] = sec
edges.append(_edge(rel, "chunk", cid, r, note_id, def_payload)) edges.append(_edge(rel, "chunk", cid, r, note_id, def_payload))
# Für Note-Scope Sammlung nutzen wir den Original-String zur Dedup, aber gesäubert
refs_all.extend([parse_link_target(r, note_id)[0] for r in refs]) refs_all.extend([parse_link_target(r, note_id)[0] for r in refs])
# 3) Note-Scope & De-Duplizierung # 3) Note-Scope & De-Duplizierung
if include_note_scope_refs: if include_note_scope_refs:
# refs_all ist jetzt schon gesäubert (nur Targets)
# note_level_references müssen auch gesäubert werden
cleaned_note_refs = [parse_link_target(r, note_id)[0] for r in (note_level_references or [])] cleaned_note_refs = [parse_link_target(r, note_id)[0] for r in (note_level_references or [])]
refs_note = _dedupe_seq((refs_all or []) + cleaned_note_refs) refs_note = _dedupe_seq((refs_all or []) + cleaned_note_refs)
@ -136,10 +141,13 @@ def build_edges_for_note(
"provenance": "rule", "confidence": PROVENANCE_PRIORITY["derived:backlink"] "provenance": "rule", "confidence": PROVENANCE_PRIORITY["derived:backlink"]
})) }))
unique_map: Dict[Tuple[str, str, str], dict] = {} # Deduplizierung: Wir nutzen jetzt die EDGE-ID als Schlüssel.
# Da die Edge-ID nun 'variant' (Section) enthält, bleiben unterschiedliche Sections erhalten.
unique_map: Dict[str, dict] = {}
for e in edges: for e in edges:
key = (str(e.get("source_id")), str(e.get("target_id")), str(e.get("kind"))) eid = e["edge_id"]
if key not in unique_map or e.get("confidence", 0) > unique_map[key].get("confidence", 0): # Bei Konflikt (gleiche ID = exakt gleiche Kante und Section) gewinnt die höhere Confidence
unique_map[key] = e if eid not in unique_map or e.get("confidence", 0) > unique_map[eid].get("confidence", 0):
unique_map[eid] = e
return list(unique_map.values()) return list(unique_map.values())

View File

@ -41,10 +41,19 @@ def _dedupe_seq(seq: Iterable[str]) -> List[str]:
seen.add(s); out.append(s) seen.add(s); out.append(s)
return out return out
def _mk_edge_id(kind: str, s: str, t: str, scope: str, rule_id: Optional[str] = None) -> str: def _mk_edge_id(kind: str, s: str, t: str, scope: str, rule_id: Optional[str] = None, variant: Optional[str] = None) -> str:
"""Erzeugt eine deterministische 12-Byte ID mittels BLAKE2s.""" """
Erzeugt eine deterministische 12-Byte ID mittels BLAKE2s.
WP-Fix: 'variant' (z.B. Section) fließt in den Hash ein, um mehrere Kanten
zum gleichen Target-Node (aber unterschiedlichen Abschnitten) zu unterscheiden.
"""
base = f"{kind}:{s}->{t}#{scope}" base = f"{kind}:{s}->{t}#{scope}"
if rule_id: base += f"|{rule_id}" if rule_id:
base += f"|{rule_id}"
if variant:
base += f"|{variant}" # <--- Hier entsteht die Eindeutigkeit für verschiedene Sections
return hashlib.blake2s(base.encode("utf-8"), digest_size=12).hexdigest() return hashlib.blake2s(base.encode("utf-8"), digest_size=12).hexdigest()
def _edge(kind: str, scope: str, source_id: str, target_id: str, note_id: str, extra: Optional[dict] = None) -> dict: def _edge(kind: str, scope: str, source_id: str, target_id: str, note_id: str, extra: Optional[dict] = None) -> dict: