Update graph_derive_edges.py and graph_utils.py to version 4.1.0: Enhance edge ID generation by incorporating target_section into the ID calculation, so that links to different sections of the same note produce distinct edges. Update the documentation to reflect the new ID structure and to clarify how edges are handled during de-duplication.
This commit is contained in:
parent
a852975811
commit
2da98e8e37
|
|
@ -3,7 +3,7 @@ FILE: app/core/graph/graph_derive_edges.py
|
||||||
DESCRIPTION: Hauptlogik zur Kanten-Aggregation und De-Duplizierung.
|
DESCRIPTION: Hauptlogik zur Kanten-Aggregation und De-Duplizierung.
|
||||||
WP-15b/c Audit:
|
WP-15b/c Audit:
|
||||||
- Präzises Sektions-Splitting via parse_link_target.
|
- Präzises Sektions-Splitting via parse_link_target.
|
||||||
- Eindeutige ID-Generierung pro Sektions-Variante (Multigraph).
|
- v4.1.0: Eindeutige ID-Generierung pro Sektions-Variante (Multigraph).
|
||||||
- Ermöglicht dem Retriever die Super-Edge-Aggregation.
|
- Ermöglicht dem Retriever die Super-Edge-Aggregation.
|
||||||
"""
|
"""
|
||||||
from typing import List, Optional, Dict, Tuple
|
from typing import List, Optional, Dict, Tuple
|
||||||
|
|
@ -56,7 +56,6 @@ def build_edges_for_note(
|
||||||
"provenance": "structure", "rule_id": "structure:order", "confidence": PROVENANCE_PRIORITY["structure:order"]
|
"provenance": "structure", "rule_id": "structure:order", "confidence": PROVENANCE_PRIORITY["structure:order"]
|
||||||
}))
|
}))
|
||||||
edges.append(_edge("prev", "chunk", next_id, cid, note_id, {
|
edges.append(_edge("prev", "chunk", next_id, cid, note_id, {
|
||||||
"chunk_id": next_id,
|
|
||||||
"edge_id": _mk_edge_id("prev", next_id, cid, "chunk"),
|
"edge_id": _mk_edge_id("prev", next_id, cid, "chunk"),
|
||||||
"provenance": "structure", "rule_id": "structure:order", "confidence": PROVENANCE_PRIORITY["structure:order"]
|
"provenance": "structure", "rule_id": "structure:order", "confidence": PROVENANCE_PRIORITY["structure:order"]
|
||||||
}))
|
}))
|
||||||
|
|
@ -79,8 +78,8 @@ def build_edges_for_note(
|
||||||
|
|
||||||
payload = {
|
payload = {
|
||||||
"chunk_id": cid,
|
"chunk_id": cid,
|
||||||
# WP-24c v4.0.0: variant wird nur im Payload gespeichert (target_section), fließt nicht in die ID ein
|
# WP-24c v4.1.0: target_section fließt nun fest in die ID-Generierung ein
|
||||||
"edge_id": _mk_edge_id(k, cid, t, "chunk"),
|
"edge_id": _mk_edge_id(k, cid, t, "chunk", target_section=sec),
|
||||||
"provenance": "explicit", "rule_id": "inline:rel", "confidence": PROVENANCE_PRIORITY["inline:rel"]
|
"provenance": "explicit", "rule_id": "inline:rel", "confidence": PROVENANCE_PRIORITY["inline:rel"]
|
||||||
}
|
}
|
||||||
if sec: payload["target_section"] = sec
|
if sec: payload["target_section"] = sec
|
||||||
|
|
@ -92,10 +91,10 @@ def build_edges_for_note(
|
||||||
raw_t, k, p = cand.get("to"), cand.get("kind", "related_to"), cand.get("provenance", "semantic_ai")
|
raw_t, k, p = cand.get("to"), cand.get("kind", "related_to"), cand.get("provenance", "semantic_ai")
|
||||||
t, sec = parse_link_target(raw_t, note_id)
|
t, sec = parse_link_target(raw_t, note_id)
|
||||||
if t:
|
if t:
|
||||||
# WP-24c v4.0.0: rule_id und variant werden nur im Payload gespeichert, fließen nicht in die ID ein
|
# WP-24c v4.1.0: target_section fließt nun fest in die ID-Generierung ein
|
||||||
payload = {
|
payload = {
|
||||||
"chunk_id": cid,
|
"chunk_id": cid,
|
||||||
"edge_id": _mk_edge_id(k, cid, t, "chunk"),
|
"edge_id": _mk_edge_id(k, cid, t, "chunk", target_section=sec),
|
||||||
"provenance": p, "rule_id": f"candidate:{p}", "confidence": PROVENANCE_PRIORITY.get(p, 0.90)
|
"provenance": p, "rule_id": f"candidate:{p}", "confidence": PROVENANCE_PRIORITY.get(p, 0.90)
|
||||||
}
|
}
|
||||||
if sec: payload["target_section"] = sec
|
if sec: payload["target_section"] = sec
|
||||||
|
|
@ -107,10 +106,10 @@ def build_edges_for_note(
|
||||||
t, sec = parse_link_target(raw_t, note_id)
|
t, sec = parse_link_target(raw_t, note_id)
|
||||||
if not t: continue
|
if not t: continue
|
||||||
|
|
||||||
# WP-24c v4.0.0: rule_id und variant werden nur im Payload gespeichert, fließen nicht in die ID ein
|
# WP-24c v4.1.0: target_section fließt nun fest in die ID-Generierung ein
|
||||||
payload = {
|
payload = {
|
||||||
"chunk_id": cid,
|
"chunk_id": cid,
|
||||||
"edge_id": _mk_edge_id(k, cid, t, "chunk"),
|
"edge_id": _mk_edge_id(k, cid, t, "chunk", target_section=sec),
|
||||||
"provenance": "explicit", "rule_id": "callout:edge", "confidence": PROVENANCE_PRIORITY["callout:edge"]
|
"provenance": "explicit", "rule_id": "callout:edge", "confidence": PROVENANCE_PRIORITY["callout:edge"]
|
||||||
}
|
}
|
||||||
if sec: payload["target_section"] = sec
|
if sec: payload["target_section"] = sec
|
||||||
|
|
@ -122,10 +121,10 @@ def build_edges_for_note(
|
||||||
r, sec = parse_link_target(raw_r, note_id)
|
r, sec = parse_link_target(raw_r, note_id)
|
||||||
if not r: continue
|
if not r: continue
|
||||||
|
|
||||||
# WP-24c v4.0.0: rule_id und variant werden nur im Payload gespeichert, fließen nicht in die ID ein
|
# WP-24c v4.1.0: target_section fließt nun fest in die ID-Generierung ein
|
||||||
payload = {
|
payload = {
|
||||||
"chunk_id": cid, "ref_text": raw_r,
|
"chunk_id": cid, "ref_text": raw_r,
|
||||||
"edge_id": _mk_edge_id("references", cid, r, "chunk"),
|
"edge_id": _mk_edge_id("references", cid, r, "chunk", target_section=sec),
|
||||||
"provenance": "explicit", "rule_id": "explicit:wikilink", "confidence": PROVENANCE_PRIORITY["explicit:wikilink"]
|
"provenance": "explicit", "rule_id": "explicit:wikilink", "confidence": PROVENANCE_PRIORITY["explicit:wikilink"]
|
||||||
}
|
}
|
||||||
if sec: payload["target_section"] = sec
|
if sec: payload["target_section"] = sec
|
||||||
|
|
@ -134,10 +133,10 @@ def build_edges_for_note(
|
||||||
# Automatische Kanten-Vererbung aus types.yaml
|
# Automatische Kanten-Vererbung aus types.yaml
|
||||||
for rel in defaults:
|
for rel in defaults:
|
||||||
if rel != "references":
|
if rel != "references":
|
||||||
# WP-24c v4.0.0: rule_id und variant werden nur im Payload gespeichert, fließen nicht in die ID ein
|
# WP-24c v4.1.0: target_section fließt nun fest in die ID-Generierung ein
|
||||||
def_payload = {
|
def_payload = {
|
||||||
"chunk_id": cid,
|
"chunk_id": cid,
|
||||||
"edge_id": _mk_edge_id(rel, cid, r, "chunk"),
|
"edge_id": _mk_edge_id(rel, cid, r, "chunk", target_section=sec),
|
||||||
"provenance": "rule", "rule_id": f"edge_defaults:{rel}", "confidence": PROVENANCE_PRIORITY["edge_defaults"]
|
"provenance": "rule", "rule_id": f"edge_defaults:{rel}", "confidence": PROVENANCE_PRIORITY["edge_defaults"]
|
||||||
}
|
}
|
||||||
if sec: def_payload["target_section"] = sec
|
if sec: def_payload["target_section"] = sec
|
||||||
|
|
@ -164,9 +163,9 @@ def build_edges_for_note(
|
||||||
}))
|
}))
|
||||||
|
|
||||||
# 4) De-Duplizierung (In-Place)
|
# 4) De-Duplizierung (In-Place)
|
||||||
# WP-24c v4.0.0: Da die EDGE-ID nur auf 4 Parametern basiert (kind, source, target, scope),
|
# WP-24c v4.1.0: Da die EDGE-ID nun auf 5 Parametern basiert (inkl. target_section),
|
||||||
# werden Links auf unterschiedliche Abschnitte derselben Note durch die De-Duplizierung
|
# bleiben Links auf unterschiedliche Abschnitte derselben Note als eigenständige
|
||||||
# konsolidiert. Die Sektion-Information bleibt im Payload (target_section) erhalten.
|
# Kanten erhalten. Nur identische Sektions-Links werden nach Confidence konsolidiert.
|
||||||
unique_map: Dict[str, dict] = {}
|
unique_map: Dict[str, dict] = {}
|
||||||
for e in edges:
|
for e in edges:
|
||||||
eid = e["edge_id"]
|
eid = e["edge_id"]
|
||||||
|
|
|
||||||
|
|
@ -86,7 +86,7 @@ def parse_link_target(raw: str, current_note_id: Optional[str] = None) -> Tuple[
|
||||||
|
|
||||||
return target, section
|
return target, section
|
||||||
|
|
||||||
def _mk_edge_id(kind: str, s: str, t: str, scope: str, rule_id: Optional[str] = None, variant: Optional[str] = None) -> str:
|
def _mk_edge_id(kind: str, s: str, t: str, scope: str, target_section: Optional[str] = None) -> str:
|
||||||
"""
|
"""
|
||||||
WP-24c v4.0.0: DER GLOBALE STANDARD für Kanten-IDs.
|
WP-24c v4.0.0: DER GLOBALE STANDARD für Kanten-IDs.
|
||||||
Erzeugt eine deterministische UUIDv5. Dies stellt sicher, dass manuelle Links
|
Erzeugt eine deterministische UUIDv5. Dies stellt sicher, dass manuelle Links
|
||||||
|
|
@ -109,12 +109,13 @@ def _mk_edge_id(kind: str, s: str, t: str, scope: str, rule_id: Optional[str] =
|
||||||
if not all([kind, s, t]):
|
if not all([kind, s, t]):
|
||||||
raise ValueError(f"Incomplete data for edge ID: kind={kind}, src={s}, tgt={t}")
|
raise ValueError(f"Incomplete data for edge ID: kind={kind}, src={s}, tgt={t}")
|
||||||
|
|
||||||
# GOLD-STANDARD v4.0.0: STRICT 4-Parameter-ID
|
# Der String enthält nun alle distinkten semantischen Merkmale
|
||||||
# Keine Suffixe für rule_id oder variant im Hash-String!
|
|
||||||
# Jede manuelle Änderung an diesem String-Format führt zu doppelten Kanten in der DB!
|
|
||||||
base = f"edge:{kind}:{s}:{t}:{scope}"
|
base = f"edge:{kind}:{s}:{t}:{scope}"
|
||||||
|
|
||||||
# Nutzt den URL-Namespace für deterministische Reproduzierbarkeit
|
# Wenn ein Link auf eine spezifische Sektion zeigt, ist es eine andere Relation
|
||||||
|
if target_section:
|
||||||
|
base += f":{target_section}"
|
||||||
|
|
||||||
return str(uuid.uuid5(uuid.NAMESPACE_URL, base))
|
return str(uuid.uuid5(uuid.NAMESPACE_URL, base))
|
||||||
|
|
||||||
def _edge(kind: str, scope: str, source_id: str, target_id: str, note_id: str, extra: Optional[dict] = None) -> dict:
|
def _edge(kind: str, scope: str, source_id: str, target_id: str, note_id: str, extra: Optional[dict] = None) -> dict:
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user