332 lines
14 KiB
Python
332 lines
14 KiB
Python
"""
FILE: app/core/graph/graph_derive_edges.py
DESCRIPTION: Core logic for edge aggregation and de-duplication.
WP-15b/c audit:
    - Precise section splitting via parse_link_target.
    - v4.1.0: Unique ID generation per section variant (multigraph).
    - Enables super-edge aggregation in the retriever.
WP-24c v4.2.0: Note-scope extraction zones for global references.
    - Header-based identification of note-scope zones
    - Automatic scope switch (chunk -> note)
    - Prioritisation: note-scope links take precedence among duplicates
VERSION: 4.2.0 (WP-24c: Note-Scope Zones)
STATUS: Active
"""
|
|
import re
|
|
from typing import List, Optional, Dict, Tuple
|
|
from .graph_utils import (
|
|
_get, _edge, _mk_edge_id, _dedupe_seq, parse_link_target,
|
|
PROVENANCE_PRIORITY, load_types_registry, get_edge_defaults_for
|
|
)
|
|
from .graph_extractors import (
|
|
extract_typed_relations, extract_callout_relations, extract_wikilinks
|
|
)
|
|
|
|
# WP-24c v4.2.0: Header texts that mark a note-scope zone. A "##"/"###"
# section whose title equals one of these entries (compared case-insensitively
# by extract_note_scope_zones) contributes note-level links instead of
# chunk-level ones.
NOTE_SCOPE_ZONE_HEADERS = [
    "Smart Edges",
    "Relationen",
    "Global Links",
    "Note-Level Relations",
    "Globale Verbindungen",
]
|
|
|
|
def extract_note_scope_zones(markdown_body: str) -> List[Tuple[str, str]]:
    """Collect the links found inside the note-scope zones of a Markdown body.

    WP-24c v4.2.0. A zone opens at a level-2 or level-3 header (``##`` or
    ``###``) whose text equals an entry of ``NOTE_SCOPE_ZONE_HEADERS``
    (case-insensitive) and runs until the next level-2/3 header or the end of
    the document. Headers of any other level do not match the header pattern
    and are therefore treated as ordinary zone content.

    Args:
        markdown_body: Raw Markdown text of the note. ``None`` or an empty
            string yields an empty result.

    Returns:
        ``(kind, target)`` tuples in document order. Within one zone the
        typed relations come first, then plain wikilinks (always with kind
        ``"related_to"``), then callout relations. Targets are returned raw,
        i.e. not yet passed through ``parse_link_target``.
    """
    if not markdown_body:
        return []

    # Compiled once instead of re-resolving the pattern for every line.
    header_re = re.compile(r'^#{2,3}\s+(.+?)$')

    edges: List[Tuple[str, str]] = []
    zone_lines: List[str] = []
    in_zone = False

    for line in markdown_body.split('\n'):
        header = header_re.match(line.strip())
        if header:
            # Every ##/### header closes the zone that is currently open.
            # This also covers a zone header that directly follows another
            # zone; previously the pending content was discarded in that case.
            if in_zone and zone_lines:
                edges.extend(_links_in_zone_text('\n'.join(zone_lines)))
            zone_lines = []

            title = header.group(1).strip().lower()
            in_zone = any(
                title == zone_header.lower()
                for zone_header in NOTE_SCOPE_ZONE_HEADERS
            )
            # The header line itself never belongs to the zone content.
            continue

        if in_zone:
            zone_lines.append(line)

    # A zone that runs to the end of the document has no closing header.
    if in_zone and zone_lines:
        edges.extend(_links_in_zone_text('\n'.join(zone_lines)))

    return edges


def _links_in_zone_text(zone_text: str) -> List[Tuple[str, str]]:
    """Run the three link extractors over the text of a single zone.

    The extractors are chained the same way build_edges_for_note chains them
    for chunk text: each one works on what the previous one returned as its
    second value (presumably the text with the matched relations removed —
    verify against graph_extractors). That keeps a typed or callout link from
    being reported a second time as a plain "related_to" wikilink.
    """
    typed, remainder = extract_typed_relations(zone_text)
    callouts, remainder = extract_callout_relations(remainder)
    plain = [("related_to", target) for target in extract_wikilinks(remainder)]
    # Output order is kept as before: typed, wikilinks, callouts.
    return [*typed, *plain, *callouts]
|
|
|
|
def build_edges_for_note(
    note_id: str,
    chunks: List[dict],
    note_level_references: Optional[List[str]] = None,
    include_note_scope_refs: bool = False,
    markdown_body: Optional[str] = None,
) -> List[dict]:
    """Build, aggregate and de-duplicate all edges of a single note.

    WP-24c v4.2.0: supports note-scope extraction zones.

    The edges are produced in this order, which is also the order in which
    they enter de-duplication:

    1. structure edges (``belongs_to``, ``next``, ``prev``) per chunk,
    2. content edges per chunk (inline typed relations, candidate pool,
       callouts, wikilinks plus the type defaults derived from them),
    3. optional note-level ``references`` / ``backlink`` edges,
    4. edges from the note-scope zones of ``markdown_body``.

    Args:
        note_id: ID of the note.
        chunks: Chunk payloads of the note.
        note_level_references: Optional raw note-level references; only used
            when ``include_note_scope_refs`` is true.
        include_note_scope_refs: Whether note-level ``references`` and
            ``backlink`` edges are generated.
        markdown_body: Optional original Markdown text used for the
            note-scope zone extraction.

    Returns:
        The edges with one entry per ``edge_id``.
    """
    # The first chunk's "type" selects the edge defaults (types.yaml);
    # "concept" is the fallback when the note has no chunks.
    note_type = _get(chunks[0], "type") if chunks else "concept"

    # Zone edges are extracted before chunk processing but appended last, so
    # every other edge is already present when they reach de-duplication.
    zone_edges = _note_zone_edges(note_id, markdown_body)

    edges = _structure_edges(note_id, chunks)

    content_edges, chunk_refs = _content_edges(note_id, chunks, note_type)
    edges.extend(content_edges)

    if include_note_scope_refs:
        edges.extend(
            _note_reference_edges(note_id, chunk_refs, note_level_references)
        )

    edges.extend(zone_edges)
    return _dedupe_edges(edges)


def _note_zone_edges(note_id: str, markdown_body: Optional[str]) -> List[dict]:
    """Turn the links of the note-scope zones into note-level edges."""
    if not markdown_body:
        return []

    zone_edges: List[dict] = []
    for kind, raw_target in extract_note_scope_zones(markdown_body):
        target, sec = parse_link_target(raw_target, note_id)
        if not target:
            continue

        # Note-scope links use scope "note" and the note itself as source;
        # the ID is built from the same parts so it stays consistent with the
        # symmetry check of phase 2.
        payload = {
            "edge_id": _mk_edge_id(kind, note_id, target, "note", target_section=sec),
            "provenance": "explicit:note_zone",
            "rule_id": "explicit:note_zone",
            "confidence": PROVENANCE_PRIORITY.get("explicit:note_zone", 1.0),
        }
        if sec:
            payload["target_section"] = sec

        zone_edges.append(_edge(
            kind=kind,
            scope="note",
            source_id=note_id,  # note_id, not a chunk_id
            target_id=target,
            note_id=note_id,
            extra=payload,
        ))
    return zone_edges


def _structure_edges(note_id: str, chunks: List[dict]) -> List[dict]:
    """Create belongs_to edges and the next/prev chain between chunks.

    These edges carry the provenance "structure". rule_id is stored in the
    payload only and is not part of the edge ID (WP-24c v4.0.0).
    """
    edges: List[dict] = []
    last_index = len(chunks) - 1

    for idx, ch in enumerate(chunks):
        cid = _get(ch, "chunk_id", "id")
        if not cid:
            continue

        # Chunk -> note
        edges.append(_edge("belongs_to", "chunk", cid, note_id, note_id, {
            "chunk_id": cid,
            "edge_id": _mk_edge_id("belongs_to", cid, note_id, "chunk"),
            "provenance": "structure",
            "rule_id": "structure:belongs_to",
            "confidence": PROVENANCE_PRIORITY["structure:belongs_to"],
        }))

        # Horizontal chaining with the following chunk (ordering)
        if idx >= last_index:
            continue
        next_id = _get(chunks[idx + 1], "chunk_id", "id")
        if not next_id:
            continue

        edges.append(_edge("next", "chunk", cid, next_id, note_id, {
            "chunk_id": cid,
            "edge_id": _mk_edge_id("next", cid, next_id, "chunk"),
            "provenance": "structure",
            "rule_id": "structure:order",
            "confidence": PROVENANCE_PRIORITY["structure:order"],
        }))
        # NOTE(review): unlike "next", the "prev" payload carries no
        # "chunk_id" — kept as is; confirm whether that is intentional.
        edges.append(_edge("prev", "chunk", next_id, cid, note_id, {
            "edge_id": _mk_edge_id("prev", next_id, cid, "chunk"),
            "provenance": "structure",
            "rule_id": "structure:order",
            "confidence": PROVENANCE_PRIORITY["structure:order"],
        }))
    return edges


def _chunk_edge(
    kind: str,
    cid: str,
    target: str,
    sec: Optional[str],
    note_id: str,
    provenance: str,
    rule_id: str,
    confidence: float,
    ref_text: Optional[str] = None,
) -> dict:
    """Build one chunk-scoped content edge including its payload.

    WP-24c v4.1.0: target_section is part of the edge ID, so links to
    different sections of the same target get different IDs.
    """
    payload: Dict[str, object] = {"chunk_id": cid}
    if ref_text is not None:
        payload["ref_text"] = ref_text
    payload["edge_id"] = _mk_edge_id(kind, cid, target, "chunk", target_section=sec)
    payload["provenance"] = provenance
    payload["rule_id"] = rule_id
    payload["confidence"] = confidence
    if sec:
        payload["target_section"] = sec
    return _edge(kind, "chunk", cid, target, note_id, payload)


def _content_edges(
    note_id: str,
    chunks: List[dict],
    note_type: str,
) -> Tuple[List[dict], List[str]]:
    """Create the explicit, candidate and default edges of every chunk.

    Returns:
        The edges and the parsed targets of all plain wikilinks that were
        seen, in order and including empty results (they are filtered by the
        consumer).
    """
    reg = load_types_registry()
    defaults = get_edge_defaults_for(note_type, reg)
    # "references" is always created explicitly for a wikilink, so it is
    # excluded from the inherited kinds. Filtered once, not per wikilink.
    inherited_kinds = [rel for rel in (defaults or ()) if rel != "references"]

    edges: List[dict] = []
    refs_all: List[str] = []

    for ch in chunks:
        cid = _get(ch, "chunk_id", "id")
        if not cid:
            continue
        raw = _get(ch, "window") or _get(ch, "text") or ""

        # A. Typed relations (inline [[rel:kind|target]])
        typed, rem = extract_typed_relations(raw)
        for k, raw_t in typed:
            t, sec = parse_link_target(raw_t, note_id)
            if not t:
                continue
            edges.append(_chunk_edge(
                k, cid, t, sec, note_id,
                "explicit", "inline:rel", PROVENANCE_PRIORITY["inline:rel"],
            ))

        # B. Candidate pool (WP-15b validated AI edges)
        pool = ch.get("candidate_pool") or ch.get("candidate_edges") or []
        for cand in pool:
            raw_t = cand.get("to")
            k = cand.get("kind", "related_to")
            p = cand.get("provenance", "semantic_ai")
            t, sec = parse_link_target(raw_t, note_id)
            if not t:
                continue
            edges.append(_chunk_edge(
                k, cid, t, sec, note_id,
                p, f"candidate:{p}", PROVENANCE_PRIORITY.get(p, 0.90),
            ))

        # C. Callouts (> [!edge]) — searched in what the typed pass returned
        call_pairs, rem2 = extract_callout_relations(rem)
        for k, raw_t in call_pairs:
            t, sec = parse_link_target(raw_t, note_id)
            if not t:
                continue
            edges.append(_chunk_edge(
                k, cid, t, sec, note_id,
                "explicit", "callout:edge", PROVENANCE_PRIORITY["callout:edge"],
            ))

        # D. Plain wikilinks and the type defaults derived from them
        for raw_r in extract_wikilinks(rem2):
            r, sec = parse_link_target(raw_r, note_id)
            # Reuse the parsed target instead of parsing every link twice.
            refs_all.append(r)
            if not r:
                continue

            edges.append(_chunk_edge(
                "references", cid, r, sec, note_id,
                "explicit", "explicit:wikilink",
                PROVENANCE_PRIORITY["explicit:wikilink"],
                ref_text=raw_r,
            ))

            # Automatic edge inheritance from types.yaml
            for rel in inherited_kinds:
                edges.append(_chunk_edge(
                    rel, cid, r, sec, note_id,
                    "rule", f"edge_defaults:{rel}",
                    PROVENANCE_PRIORITY["edge_defaults"],
                ))

    return edges, refs_all


def _note_reference_edges(
    note_id: str,
    chunk_refs: List[str],
    note_level_references: Optional[List[str]],
) -> List[dict]:
    """Create note-level references edges and their backlink counterparts."""
    cleaned_note_refs = [
        parse_link_target(r, note_id)[0] for r in (note_level_references or [])
    ]
    refs_note = _dedupe_seq((chunk_refs or []) + cleaned_note_refs)

    edges: List[dict] = []
    for r in refs_note:
        if not r:
            continue
        # rule_id is stored in the payload only, not in the ID (WP-24c v4.0.0)
        edges.append(_edge("references", "note", note_id, r, note_id, {
            "edge_id": _mk_edge_id("references", note_id, r, "note"),
            "provenance": "explicit",
            "rule_id": "explicit:note_scope",
            "confidence": PROVENANCE_PRIORITY["explicit:note_scope"],
        }))
        # Backlink to strengthen bidirectionality
        edges.append(_edge("backlink", "note", r, note_id, note_id, {
            "edge_id": _mk_edge_id("backlink", r, note_id, "note"),
            "provenance": "rule",
            "rule_id": "derived:backlink",
            "confidence": PROVENANCE_PRIORITY["derived:backlink"],
        }))
    return edges


def _dedupe_edges(edges: List[dict]) -> List[dict]:
    """Keep exactly one edge per edge_id.

    WP-24c v4.1.0: target_section is part of the edge ID, so links to
    different sections of the same note stay separate; only edges with an
    identical ID are consolidated here.

    Priority among edges sharing an ID (WP-24c v4.2.0):
    1. an edge with provenance "explicit:note_zone" beats any other edge,
    2. otherwise the strictly higher "confidence" wins,
    3. on equal confidence the edge seen first is kept.
    """
    unique_map: Dict[str, dict] = {}
    for e in edges:
        eid = e["edge_id"]
        if eid not in unique_map:
            unique_map[eid] = e
            continue

        existing = unique_map[eid]
        existing_is_zone = existing.get("provenance", "") == "explicit:note_zone"
        new_is_zone = e.get("provenance", "") == "explicit:note_zone"

        if new_is_zone != existing_is_zone:
            # Exactly one of the two comes from a note-scope zone: it wins.
            if new_is_zone:
                unique_map[eid] = e
        elif e.get("confidence", 0) > existing.get("confidence", 0):
            # Both or neither are zone edges: compare confidence.
            unique_map[eid] = e

    return list(unique_map.values())