Update graph_db_adapter.py, graph_derive_edges.py, graph_subgraph.py, graph_utils.py, ingestion_processor.py, and retriever.py to version 4.1.0: Introduce Scope-Awareness and Section-Filtering features, enhancing edge retrieval and processing. Implement Note-Scope Zones extraction from Markdown, improve edge ID generation with target_section, and prioritize Note-Scope Links during de-duplication. Update documentation for clarity and consistency across modules.
This commit is contained in:
parent
be2bed9927
commit
39fd15b565
|
|
@ -1,9 +1,11 @@
|
||||||
"""
|
"""
|
||||||
FILE: app/core/graph/graph_db_adapter.py
|
FILE: app/core/graph/graph_db_adapter.py
|
||||||
DESCRIPTION: Datenbeschaffung aus Qdrant für den Graphen.
|
DESCRIPTION: Datenbeschaffung aus Qdrant für den Graphen.
|
||||||
AUDIT v1.1.1: Volle Unterstützung für WP-15c Metadaten.
|
AUDIT v1.2.0: Gold-Standard v4.1.0 - Scope-Awareness & Section-Filtering.
|
||||||
Stellt sicher, dass 'target_section' und 'provenance' für die
|
- Erweiterte Suche nach chunk_id-Edges für Scope-Awareness
|
||||||
Super-Edge-Aggregation im Retriever geladen werden.
|
- Optionales target_section-Filtering für präzise Section-Links
|
||||||
|
- Vollständige Metadaten-Unterstützung (provenance, confidence, virtual)
|
||||||
|
VERSION: 1.2.0 (WP-24c: Gold-Standard v4.1.0)
|
||||||
"""
|
"""
|
||||||
from typing import List, Dict, Optional
|
from typing import List, Dict, Optional
|
||||||
from qdrant_client import QdrantClient
|
from qdrant_client import QdrantClient
|
||||||
|
|
@ -17,11 +19,22 @@ def fetch_edges_from_qdrant(
|
||||||
prefix: str,
|
prefix: str,
|
||||||
seeds: List[str],
|
seeds: List[str],
|
||||||
edge_types: Optional[List[str]] = None,
|
edge_types: Optional[List[str]] = None,
|
||||||
|
target_section: Optional[str] = None,
|
||||||
|
chunk_ids: Optional[List[str]] = None,
|
||||||
limit: int = 2048,
|
limit: int = 2048,
|
||||||
) -> List[Dict]:
|
) -> List[Dict]:
|
||||||
"""
|
"""
|
||||||
Holt Edges aus der Datenbank basierend auf Seed-IDs.
|
Holt Edges aus der Datenbank basierend auf Seed-IDs.
|
||||||
WP-15c: Erhält alle Metadaten für das Note-Level Diversity Pooling.
|
WP-24c v4.1.0: Scope-Aware Edge Retrieval mit Section-Filtering.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
client: Qdrant Client
|
||||||
|
prefix: Collection-Präfix
|
||||||
|
seeds: Liste von Note-IDs für die Suche
|
||||||
|
edge_types: Optionale Filterung nach Kanten-Typen
|
||||||
|
target_section: Optionales Section-Filtering (für präzise Section-Links)
|
||||||
|
chunk_ids: Optionale Liste von Chunk-IDs für Scope-Awareness (Chunk-Level Edges)
|
||||||
|
limit: Maximale Anzahl zurückgegebener Edges
|
||||||
"""
|
"""
|
||||||
if not seeds or limit <= 0:
|
if not seeds or limit <= 0:
|
||||||
return []
|
return []
|
||||||
|
|
@ -30,13 +43,21 @@ def fetch_edges_from_qdrant(
|
||||||
# Rückgabe: (notes_col, chunks_col, edges_col)
|
# Rückgabe: (notes_col, chunks_col, edges_col)
|
||||||
_, _, edges_col = collection_names(prefix)
|
_, _, edges_col = collection_names(prefix)
|
||||||
|
|
||||||
# Wir suchen Kanten, bei denen die Seed-IDs entweder Quelle, Ziel oder Kontext-Note sind.
|
# WP-24c v4.1.0: Scope-Awareness - Suche nach Note- UND Chunk-Level Edges
|
||||||
seed_conditions = []
|
seed_conditions = []
|
||||||
for field in ("source_id", "target_id", "note_id"):
|
for field in ("source_id", "target_id", "note_id"):
|
||||||
for s in seeds:
|
for s in seeds:
|
||||||
seed_conditions.append(
|
seed_conditions.append(
|
||||||
rest.FieldCondition(key=field, match=rest.MatchValue(value=str(s)))
|
rest.FieldCondition(key=field, match=rest.MatchValue(value=str(s)))
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Chunk-Level Edges: Wenn chunk_ids angegeben, suche auch nach chunk_id als source_id
|
||||||
|
if chunk_ids:
|
||||||
|
for cid in chunk_ids:
|
||||||
|
seed_conditions.append(
|
||||||
|
rest.FieldCondition(key="source_id", match=rest.MatchValue(value=str(cid)))
|
||||||
|
)
|
||||||
|
|
||||||
seeds_filter = rest.Filter(should=seed_conditions) if seed_conditions else None
|
seeds_filter = rest.Filter(should=seed_conditions) if seed_conditions else None
|
||||||
|
|
||||||
# Optionaler Filter auf spezifische Kanten-Typen (z.B. für Intent-Routing)
|
# Optionaler Filter auf spezifische Kanten-Typen (z.B. für Intent-Routing)
|
||||||
|
|
@ -48,11 +69,20 @@ def fetch_edges_from_qdrant(
|
||||||
]
|
]
|
||||||
type_filter = rest.Filter(should=type_conds)
|
type_filter = rest.Filter(should=type_conds)
|
||||||
|
|
||||||
|
# WP-24c v4.1.0: Section-Filtering für präzise Section-Links
|
||||||
|
section_filter = None
|
||||||
|
if target_section:
|
||||||
|
section_filter = rest.Filter(must=[
|
||||||
|
rest.FieldCondition(key="target_section", match=rest.MatchValue(value=str(target_section)))
|
||||||
|
])
|
||||||
|
|
||||||
must = []
|
must = []
|
||||||
if seeds_filter:
|
if seeds_filter:
|
||||||
must.append(seeds_filter)
|
must.append(seeds_filter)
|
||||||
if type_filter:
|
if type_filter:
|
||||||
must.append(type_filter)
|
must.append(type_filter)
|
||||||
|
if section_filter:
|
||||||
|
must.append(section_filter)
|
||||||
|
|
||||||
flt = rest.Filter(must=must) if must else None
|
flt = rest.Filter(must=must) if must else None
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -5,7 +5,14 @@ DESCRIPTION: Hauptlogik zur Kanten-Aggregation und De-Duplizierung.
|
||||||
- Präzises Sektions-Splitting via parse_link_target.
|
- Präzises Sektions-Splitting via parse_link_target.
|
||||||
- v4.1.0: Eindeutige ID-Generierung pro Sektions-Variante (Multigraph).
|
- v4.1.0: Eindeutige ID-Generierung pro Sektions-Variante (Multigraph).
|
||||||
- Ermöglicht dem Retriever die Super-Edge-Aggregation.
|
- Ermöglicht dem Retriever die Super-Edge-Aggregation.
|
||||||
|
WP-24c v4.2.0: Note-Scope Extraktions-Zonen für globale Referenzen.
|
||||||
|
- Header-basierte Identifikation von Note-Scope Zonen
|
||||||
|
- Automatische Scope-Umschaltung (chunk -> note)
|
||||||
|
- Priorisierung: Note-Scope Links haben Vorrang bei Duplikaten
|
||||||
|
VERSION: 4.2.0 (WP-24c: Note-Scope Zones)
|
||||||
|
STATUS: Active
|
||||||
"""
|
"""
|
||||||
|
import re
|
||||||
from typing import List, Optional, Dict, Tuple
|
from typing import List, Optional, Dict, Tuple
|
||||||
from .graph_utils import (
|
from .graph_utils import (
|
||||||
_get, _edge, _mk_edge_id, _dedupe_seq, parse_link_target,
|
_get, _edge, _mk_edge_id, _dedupe_seq, parse_link_target,
|
||||||
|
|
@ -15,20 +22,139 @@ from .graph_extractors import (
|
||||||
extract_typed_relations, extract_callout_relations, extract_wikilinks
|
extract_typed_relations, extract_callout_relations, extract_wikilinks
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# WP-24c v4.2.0: Header-basierte Identifikation von Note-Scope Zonen
|
||||||
|
NOTE_SCOPE_ZONE_HEADERS = [
|
||||||
|
"Smart Edges",
|
||||||
|
"Relationen",
|
||||||
|
"Global Links",
|
||||||
|
"Note-Level Relations",
|
||||||
|
"Globale Verbindungen"
|
||||||
|
]
|
||||||
|
|
||||||
|
def _collect_zone_links(zone_lines: List[str]) -> List[Tuple[str, str]]:
    """Extract all links from the collected lines of one Note-Scope zone.

    The result keeps a fixed order: typed relations first, then plain
    wikilinks (emitted with kind ``"related_to"``), then callout relations.
    An empty zone yields an empty list without calling the extractors.
    """
    if not zone_lines:
        return []

    zone_text = '\n'.join(zone_lines)
    links: List[Tuple[str, str]] = []

    # Both relation extractors return a pair; only the first element (the
    # extracted relations) is used here.
    # NOTE(review): presumably these are (kind, target) tuples - confirm
    # against graph_extractors.
    typed, _ = extract_typed_relations(zone_text)
    links.extend(typed)

    # Untyped wikilinks inside a zone are treated as generic relations.
    links.extend(("related_to", wl) for wl in extract_wikilinks(zone_text))

    callouts, _ = extract_callout_relations(zone_text)
    links.extend(callouts)
    return links


def extract_note_scope_zones(markdown_body: str) -> List[Tuple[str, str]]:
    """
    WP-24c v4.2.0: Extract Note-Scope zones from Markdown.

    A zone starts at a level-2 or level-3 header ("##" / "###") whose text
    equals one of NOTE_SCOPE_ZONE_HEADERS (case-insensitive) and ends at the
    next level-2/3 header or at the end of the document. Headers of any other
    level do not match the header pattern and are therefore treated as
    ordinary zone content.

    Every header closes the currently open zone *before* a new one may start,
    so two zones that follow each other directly both contribute their links.

    Args:
        markdown_body: Raw Markdown text of the note; may be empty or None.

    Returns:
        List[Tuple[str, str]]: (kind, target) tuples for all links found
        inside Note-Scope zones, in document order of the zones.
    """
    if not markdown_body:
        return []

    # Header detection: exactly two or three '#' followed by whitespace.
    header_re = re.compile(r'^#{2,3}\s+(.+?)$')

    edges: List[Tuple[str, str]] = []
    in_zone = False
    zone_lines: List[str] = []

    for line in markdown_body.split('\n'):
        header_match = header_re.match(line.strip())
        if header_match:
            # Any level-2/3 header terminates the open zone; flush it first so
            # that back-to-back zones do not lose the earlier zone's links.
            if in_zone:
                edges.extend(_collect_zone_links(zone_lines))
            zone_lines = []

            header_text = header_match.group(1).strip().lower()
            in_zone = any(
                header_text == zone_header.lower()
                for zone_header in NOTE_SCOPE_ZONE_HEADERS
            )
            # The header line itself is never part of the zone content.
            continue

        if in_zone:
            zone_lines.append(line)

    # Flush a zone that runs until the end of the document.
    if in_zone:
        edges.extend(_collect_zone_links(zone_lines))

    return edges
|
||||||
|
|
||||||
def build_edges_for_note(
|
def build_edges_for_note(
|
||||||
note_id: str,
|
note_id: str,
|
||||||
chunks: List[dict],
|
chunks: List[dict],
|
||||||
note_level_references: Optional[List[str]] = None,
|
note_level_references: Optional[List[str]] = None,
|
||||||
include_note_scope_refs: bool = False,
|
include_note_scope_refs: bool = False,
|
||||||
|
markdown_body: Optional[str] = None,
|
||||||
) -> List[dict]:
|
) -> List[dict]:
|
||||||
"""
|
"""
|
||||||
Erzeugt und aggregiert alle Kanten für eine Note.
|
Erzeugt und aggregiert alle Kanten für eine Note.
|
||||||
Sorgt für die physische Trennung von Sektions-Links via Edge-ID.
|
WP-24c v4.2.0: Unterstützt Note-Scope Extraktions-Zonen.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
note_id: ID der Note
|
||||||
|
chunks: Liste von Chunk-Payloads
|
||||||
|
note_level_references: Optionale Liste von Note-Level Referenzen
|
||||||
|
include_note_scope_refs: Ob Note-Scope Referenzen eingeschlossen werden sollen
|
||||||
|
markdown_body: Optionaler Original-Markdown-Text für Note-Scope Zonen-Extraktion
|
||||||
"""
|
"""
|
||||||
edges: List[dict] = []
|
edges: List[dict] = []
|
||||||
# note_type für die Ermittlung der edge_defaults (types.yaml)
|
# note_type für die Ermittlung der edge_defaults (types.yaml)
|
||||||
note_type = _get(chunks[0], "type") if chunks else "concept"
|
note_type = _get(chunks[0], "type") if chunks else "concept"
|
||||||
|
|
||||||
|
# WP-24c v4.2.0: Note-Scope Zonen Extraktion (VOR Chunk-Verarbeitung)
|
||||||
|
note_scope_edges: List[dict] = []
|
||||||
|
if markdown_body:
|
||||||
|
zone_links = extract_note_scope_zones(markdown_body)
|
||||||
|
for kind, raw_target in zone_links:
|
||||||
|
target, sec = parse_link_target(raw_target, note_id)
|
||||||
|
if not target:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# WP-24c v4.2.0: Note-Scope Links mit scope: "note" und source_id: note_id
|
||||||
|
# ID-Konsistenz: Exakt wie in Phase 2 (Symmetrie-Prüfung)
|
||||||
|
payload = {
|
||||||
|
"edge_id": _mk_edge_id(kind, note_id, target, "note", target_section=sec),
|
||||||
|
"provenance": "explicit:note_zone",
|
||||||
|
"rule_id": "explicit:note_zone",
|
||||||
|
"confidence": PROVENANCE_PRIORITY.get("explicit:note_zone", 1.0)
|
||||||
|
}
|
||||||
|
if sec:
|
||||||
|
payload["target_section"] = sec
|
||||||
|
|
||||||
|
note_scope_edges.append(_edge(
|
||||||
|
kind=kind,
|
||||||
|
scope="note",
|
||||||
|
source_id=note_id, # WP-24c v4.2.0: source_id = note_id (nicht chunk_id)
|
||||||
|
target_id=target,
|
||||||
|
note_id=note_id,
|
||||||
|
extra=payload
|
||||||
|
))
|
||||||
|
|
||||||
# 1) Struktur-Kanten (Internal: belongs_to, next/prev)
|
# 1) Struktur-Kanten (Internal: belongs_to, next/prev)
|
||||||
# Diese erhalten die Provenienz 'structure' und sind in der Registry geschützt.
|
# Diese erhalten die Provenienz 'structure' und sind in der Registry geschützt.
|
||||||
for idx, ch in enumerate(chunks):
|
for idx, ch in enumerate(chunks):
|
||||||
|
|
@ -162,15 +288,45 @@ def build_edges_for_note(
|
||||||
"provenance": "rule", "rule_id": "derived:backlink", "confidence": PROVENANCE_PRIORITY["derived:backlink"]
|
"provenance": "rule", "rule_id": "derived:backlink", "confidence": PROVENANCE_PRIORITY["derived:backlink"]
|
||||||
}))
|
}))
|
||||||
|
|
||||||
# 4) De-Duplizierung (In-Place)
|
# 4) WP-24c v4.2.0: Note-Scope Edges hinzufügen (VOR De-Duplizierung)
|
||||||
|
# Diese werden mit höherer Priorität behandelt, da sie explizite Note-Level Verbindungen sind
|
||||||
|
edges.extend(note_scope_edges)
|
||||||
|
|
||||||
|
# 5) De-Duplizierung (In-Place) mit Priorisierung
|
||||||
|
# WP-24c v4.2.0: Note-Scope Links haben Vorrang bei Duplikaten
|
||||||
# WP-24c v4.1.0: Da die EDGE-ID nun auf 5 Parametern basiert (inkl. target_section),
|
# WP-24c v4.1.0: Da die EDGE-ID nun auf 5 Parametern basiert (inkl. target_section),
|
||||||
# bleiben Links auf unterschiedliche Abschnitte derselben Note als eigenständige
|
# bleiben Links auf unterschiedliche Abschnitte derselben Note als eigenständige
|
||||||
# Kanten erhalten. Nur identische Sektions-Links werden nach Confidence konsolidiert.
|
# Kanten erhalten. Nur identische Sektions-Links werden nach Confidence und Provenance konsolidiert.
|
||||||
unique_map: Dict[str, dict] = {}
|
unique_map: Dict[str, dict] = {}
|
||||||
for e in edges:
|
for e in edges:
|
||||||
eid = e["edge_id"]
|
eid = e["edge_id"]
|
||||||
# Höhere Confidence gewinnt bei identischer ID
|
|
||||||
if eid not in unique_map or e.get("confidence", 0) > unique_map[eid].get("confidence", 0):
|
# WP-24c v4.2.0: Priorisierung bei Duplikaten
|
||||||
|
# 1. Note-Scope Links (explicit:note_zone) haben höchste Priorität
|
||||||
|
# 2. Dann Confidence
|
||||||
|
# 3. Dann Provenance-Priority
|
||||||
|
if eid not in unique_map:
|
||||||
|
unique_map[eid] = e
|
||||||
|
else:
|
||||||
|
existing = unique_map[eid]
|
||||||
|
existing_prov = existing.get("provenance", "")
|
||||||
|
new_prov = e.get("provenance", "")
|
||||||
|
|
||||||
|
# Note-Scope Zone Links haben Vorrang
|
||||||
|
is_existing_note_zone = existing_prov == "explicit:note_zone"
|
||||||
|
is_new_note_zone = new_prov == "explicit:note_zone"
|
||||||
|
|
||||||
|
if is_new_note_zone and not is_existing_note_zone:
|
||||||
|
# Neuer Link ist Note-Scope Zone -> ersetze
|
||||||
|
unique_map[eid] = e
|
||||||
|
elif is_existing_note_zone and not is_new_note_zone:
|
||||||
|
# Bestehender Link ist Note-Scope Zone -> behalte
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
# Beide sind Note-Scope oder beide nicht -> vergleiche Confidence
|
||||||
|
existing_conf = existing.get("confidence", 0)
|
||||||
|
new_conf = e.get("confidence", 0)
|
||||||
|
if new_conf > existing_conf:
|
||||||
unique_map[eid] = e
|
unique_map[eid] = e
|
||||||
|
|
||||||
return list(unique_map.values())
|
return list(unique_map.values())
|
||||||
|
|
@ -4,7 +4,8 @@ DESCRIPTION: In-Memory Repräsentation eines Graphen für Scoring und Analyse.
|
||||||
Zentrale Komponente für die Graph-Expansion (BFS) und Bonus-Berechnung.
|
Zentrale Komponente für die Graph-Expansion (BFS) und Bonus-Berechnung.
|
||||||
WP-15c Update: Erhalt von Metadaten (target_section, provenance)
|
WP-15c Update: Erhalt von Metadaten (target_section, provenance)
|
||||||
für präzises Retrieval-Reasoning.
|
für präzises Retrieval-Reasoning.
|
||||||
VERSION: 1.2.0
|
WP-24c v4.1.0: Scope-Awareness und Section-Filtering Support.
|
||||||
|
VERSION: 1.3.0 (WP-24c: Gold-Standard v4.1.0)
|
||||||
STATUS: Active
|
STATUS: Active
|
||||||
"""
|
"""
|
||||||
import math
|
import math
|
||||||
|
|
@ -28,6 +29,8 @@ class Subgraph:
|
||||||
self.reverse_adj: DefaultDict[str, List[Dict]] = defaultdict(list)
|
self.reverse_adj: DefaultDict[str, List[Dict]] = defaultdict(list)
|
||||||
self.in_degree: DefaultDict[str, int] = defaultdict(int)
|
self.in_degree: DefaultDict[str, int] = defaultdict(int)
|
||||||
self.out_degree: DefaultDict[str, int] = defaultdict(int)
|
self.out_degree: DefaultDict[str, int] = defaultdict(int)
|
||||||
|
# WP-24c v4.1.0: Chunk-Level In-Degree für präzise Scoring-Aggregation
|
||||||
|
self.chunk_level_in_degree: DefaultDict[str, int] = defaultdict(int)
|
||||||
|
|
||||||
def add_edge(self, e: Dict) -> None:
|
def add_edge(self, e: Dict) -> None:
|
||||||
"""
|
"""
|
||||||
|
|
@ -48,7 +51,9 @@ class Subgraph:
|
||||||
"provenance": e.get("provenance", "rule"),
|
"provenance": e.get("provenance", "rule"),
|
||||||
"confidence": e.get("confidence", 1.0),
|
"confidence": e.get("confidence", 1.0),
|
||||||
"target_section": e.get("target_section"), # Essentiell für Präzision
|
"target_section": e.get("target_section"), # Essentiell für Präzision
|
||||||
"is_super_edge": e.get("is_super_edge", False)
|
"is_super_edge": e.get("is_super_edge", False),
|
||||||
|
"virtual": e.get("virtual", False), # WP-24c v4.1.0: Für Authority-Priorisierung
|
||||||
|
"chunk_id": e.get("chunk_id") # WP-24c v4.1.0: Für RAG-Kontext
|
||||||
}
|
}
|
||||||
|
|
||||||
owner = e.get("note_id")
|
owner = e.get("note_id")
|
||||||
|
|
@ -111,10 +116,21 @@ def expand(
|
||||||
seeds: List[str],
|
seeds: List[str],
|
||||||
depth: int = 1,
|
depth: int = 1,
|
||||||
edge_types: Optional[List[str]] = None,
|
edge_types: Optional[List[str]] = None,
|
||||||
|
chunk_ids: Optional[List[str]] = None,
|
||||||
|
target_section: Optional[str] = None,
|
||||||
) -> Subgraph:
|
) -> Subgraph:
|
||||||
"""
|
"""
|
||||||
Expandiert ab Seeds entlang von Edges bis zu einer bestimmten Tiefe.
|
Expandiert ab Seeds entlang von Edges bis zu einer bestimmten Tiefe.
|
||||||
Nutzt fetch_edges_from_qdrant für den Datenbankzugriff.
|
WP-24c v4.1.0: Unterstützt Scope-Awareness (chunk_ids) und Section-Filtering.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
client: Qdrant Client
|
||||||
|
prefix: Collection-Präfix
|
||||||
|
seeds: Liste von Note-IDs für die Expansion
|
||||||
|
depth: Maximale Tiefe der Expansion
|
||||||
|
edge_types: Optionale Filterung nach Kanten-Typen
|
||||||
|
chunk_ids: Optionale Liste von Chunk-IDs für Scope-Awareness
|
||||||
|
target_section: Optionales Section-Filtering
|
||||||
"""
|
"""
|
||||||
sg = Subgraph()
|
sg = Subgraph()
|
||||||
frontier = set(seeds)
|
frontier = set(seeds)
|
||||||
|
|
@ -124,8 +140,13 @@ def expand(
|
||||||
if not frontier:
|
if not frontier:
|
||||||
break
|
break
|
||||||
|
|
||||||
# Batch-Abfrage der Kanten für die aktuelle Ebene
|
# WP-24c v4.1.0: Erweiterte Edge-Retrieval mit Scope-Awareness und Section-Filtering
|
||||||
payloads = fetch_edges_from_qdrant(client, prefix, list(frontier), edge_types)
|
payloads = fetch_edges_from_qdrant(
|
||||||
|
client, prefix, list(frontier),
|
||||||
|
edge_types=edge_types,
|
||||||
|
chunk_ids=chunk_ids,
|
||||||
|
target_section=target_section
|
||||||
|
)
|
||||||
next_frontier: Set[str] = set()
|
next_frontier: Set[str] = set()
|
||||||
|
|
||||||
for pl in payloads:
|
for pl in payloads:
|
||||||
|
|
@ -133,6 +154,7 @@ def expand(
|
||||||
if not src or not tgt: continue
|
if not src or not tgt: continue
|
||||||
|
|
||||||
# WP-15c: Wir übergeben das vollständige Payload an add_edge
|
# WP-15c: Wir übergeben das vollständige Payload an add_edge
|
||||||
|
# WP-24c v4.1.0: virtual Flag wird für Authority-Priorisierung benötigt
|
||||||
edge_payload = {
|
edge_payload = {
|
||||||
"source": src,
|
"source": src,
|
||||||
"target": tgt,
|
"target": tgt,
|
||||||
|
|
@ -141,7 +163,9 @@ def expand(
|
||||||
"note_id": pl.get("note_id"),
|
"note_id": pl.get("note_id"),
|
||||||
"provenance": pl.get("provenance", "rule"),
|
"provenance": pl.get("provenance", "rule"),
|
||||||
"confidence": pl.get("confidence", 1.0),
|
"confidence": pl.get("confidence", 1.0),
|
||||||
"target_section": pl.get("target_section")
|
"target_section": pl.get("target_section"),
|
||||||
|
"virtual": pl.get("virtual", False), # WP-24c v4.1.0: Für Authority-Priorisierung
|
||||||
|
"chunk_id": pl.get("chunk_id") # WP-24c v4.1.0: Für RAG-Kontext
|
||||||
}
|
}
|
||||||
|
|
||||||
sg.add_edge(edge_payload)
|
sg.add_edge(edge_payload)
|
||||||
|
|
|
||||||
|
|
@ -28,6 +28,7 @@ PROVENANCE_PRIORITY = {
|
||||||
"structure:belongs_to": 1.00,
|
"structure:belongs_to": 1.00,
|
||||||
"structure:order": 0.95, # next/prev
|
"structure:order": 0.95, # next/prev
|
||||||
"explicit:note_scope": 1.00,
|
"explicit:note_scope": 1.00,
|
||||||
|
"explicit:note_zone": 1.00, # WP-24c v4.2.0: Note-Scope Zonen (höchste Priorität)
|
||||||
"derived:backlink": 0.90,
|
"derived:backlink": 0.90,
|
||||||
"edge_defaults": 0.70 # Heuristik basierend auf types.yaml
|
"edge_defaults": 0.70 # Heuristik basierend auf types.yaml
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -244,8 +244,15 @@ class IngestionService:
|
||||||
chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks, file_path=file_path, types_cfg=self.registry)
|
chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks, file_path=file_path, types_cfg=self.registry)
|
||||||
vecs = await self.embedder.embed_documents([c.get("window") or "" for c in chunk_pls]) if chunk_pls else []
|
vecs = await self.embedder.embed_documents([c.get("window") or "" for c in chunk_pls]) if chunk_pls else []
|
||||||
|
|
||||||
# Kanten-Extraktion
|
# WP-24c v4.2.0: Kanten-Extraktion mit Note-Scope Zonen Support
|
||||||
raw_edges = build_edges_for_note(note_id, chunk_pls, note_level_references=note_pl.get("references", []))
|
# Übergabe des Original-Markdown-Texts für Note-Scope Zonen-Extraktion
|
||||||
|
markdown_body = getattr(parsed, "body", "")
|
||||||
|
raw_edges = build_edges_for_note(
|
||||||
|
note_id,
|
||||||
|
chunk_pls,
|
||||||
|
note_level_references=note_pl.get("references", []),
|
||||||
|
markdown_body=markdown_body
|
||||||
|
)
|
||||||
|
|
||||||
explicit_edges = []
|
explicit_edges = []
|
||||||
for e in raw_edges:
|
for e in raw_edges:
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,8 @@
|
||||||
FILE: app/core/retrieval/retriever.py
|
FILE: app/core/retrieval/retriever.py
|
||||||
DESCRIPTION: Haupt-Schnittstelle für die Suche. Orchestriert Vektorsuche und Graph-Expansion.
|
DESCRIPTION: Haupt-Schnittstelle für die Suche. Orchestriert Vektorsuche und Graph-Expansion.
|
||||||
WP-15c Update: Note-Level Diversity Pooling & Super-Edge Aggregation.
|
WP-15c Update: Note-Level Diversity Pooling & Super-Edge Aggregation.
|
||||||
VERSION: 0.7.0
|
WP-24c v4.1.0: Gold-Standard - Scope-Awareness, Section-Filtering, Authority-Priorisierung.
|
||||||
|
VERSION: 0.8.0 (WP-24c: Gold-Standard v4.1.0)
|
||||||
STATUS: Active
|
STATUS: Active
|
||||||
DEPENDENCIES: app.config, app.models.dto, app.core.database*, app.core.graph_adapter
|
DEPENDENCIES: app.config, app.models.dto, app.core.database*, app.core.graph_adapter
|
||||||
"""
|
"""
|
||||||
|
|
@ -26,6 +27,9 @@ import app.core.database.qdrant_points as qp
|
||||||
|
|
||||||
import app.services.embeddings_client as ec
|
import app.services.embeddings_client as ec
|
||||||
import app.core.graph.graph_subgraph as ga
|
import app.core.graph.graph_subgraph as ga
|
||||||
|
import app.core.graph.graph_db_adapter as gdb
|
||||||
|
from app.core.graph.graph_utils import PROVENANCE_PRIORITY
|
||||||
|
from qdrant_client.http import models as rest
|
||||||
|
|
||||||
# Mathematische Engine importieren
|
# Mathematische Engine importieren
|
||||||
from app.core.retrieval.retriever_scoring import get_weights, compute_wp22_score
|
from app.core.retrieval.retriever_scoring import get_weights, compute_wp22_score
|
||||||
|
|
@ -63,14 +67,64 @@ def _get_query_vector(req: QueryRequest) -> List[float]:
|
||||||
return ec.embed_text(req.query)
|
return ec.embed_text(req.query)
|
||||||
|
|
||||||
|
|
||||||
|
def _get_chunk_ids_for_notes(
|
||||||
|
client: Any,
|
||||||
|
prefix: str,
|
||||||
|
note_ids: List[str]
|
||||||
|
) -> List[str]:
|
||||||
|
"""
|
||||||
|
WP-24c v4.1.0: Lädt alle Chunk-IDs für gegebene Note-IDs.
|
||||||
|
Wird für Scope-Aware Edge Retrieval benötigt.
|
||||||
|
"""
|
||||||
|
if not note_ids:
|
||||||
|
return []
|
||||||
|
|
||||||
|
_, chunks_col, _ = qp._names(prefix)
|
||||||
|
chunk_ids = []
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Filter: note_id IN note_ids
|
||||||
|
note_filter = rest.Filter(should=[
|
||||||
|
rest.FieldCondition(key="note_id", match=rest.MatchValue(value=str(nid)))
|
||||||
|
for nid in note_ids
|
||||||
|
])
|
||||||
|
|
||||||
|
pts, _ = client.scroll(
|
||||||
|
collection_name=chunks_col,
|
||||||
|
scroll_filter=note_filter,
|
||||||
|
limit=2048,
|
||||||
|
with_payload=True,
|
||||||
|
with_vectors=False
|
||||||
|
)
|
||||||
|
|
||||||
|
for pt in pts:
|
||||||
|
pl = pt.payload or {}
|
||||||
|
cid = pl.get("chunk_id")
|
||||||
|
if cid:
|
||||||
|
chunk_ids.append(str(cid))
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Failed to load chunk IDs for notes: {e}")
|
||||||
|
|
||||||
|
return chunk_ids
|
||||||
|
|
||||||
def _semantic_hits(
    client: Any,
    prefix: str,
    vector: List[float],
    top_k: int,
    filters: Optional[Dict] = None,
    target_section: Optional[str] = None
) -> List[Tuple[str, float, Dict[str, Any]]]:
    """
    Run the vector search through the database points module.

    WP-24c v4.1.0: When ``target_section`` is set, it is added to the search
    filters under the key "section" (merged into a copy of ``filters`` if any
    were given); otherwise ``filters`` is passed through unchanged.

    Returns:
        One ``(id, score, payload)`` tuple per raw hit, coerced to
        ``str`` / ``float`` / ``dict``.
    """
    effective_filters = filters
    if target_section:
        # Never mutate the caller's dict: build a new mapping.
        if filters:
            effective_filters = {**filters, "section": target_section}
        else:
            effective_filters = {"section": target_section}

    raw_hits = qp.search_chunks_by_vector(
        client, prefix, vector, top=top_k, filters=effective_filters
    )

    # Strict type coercion for stability; a missing payload becomes {}.
    normalized: List[Tuple[str, float, Dict[str, Any]]] = []
    for raw_hit in raw_hits:
        normalized.append(
            (str(raw_hit[0]), float(raw_hit[1]), dict(raw_hit[2] or {}))
        )
    return normalized
|
||||||
|
|
@ -254,6 +308,16 @@ def _build_hits_from_semantic(
|
||||||
|
|
||||||
text_content = pl.get("page_content") or pl.get("text") or pl.get("content", "[Kein Text]")
|
text_content = pl.get("page_content") or pl.get("text") or pl.get("content", "[Kein Text]")
|
||||||
|
|
||||||
|
# WP-24c v4.1.0: RAG-Kontext - source_chunk_id aus Edge-Payload extrahieren
|
||||||
|
source_chunk_id = None
|
||||||
|
if explanation_obj and explanation_obj.related_edges:
|
||||||
|
# Finde die erste Edge mit chunk_id als source
|
||||||
|
for edge in explanation_obj.related_edges:
|
||||||
|
# Prüfe, ob source eine Chunk-ID ist (enthält # oder ist chunk_id)
|
||||||
|
if edge.source and ("#" in edge.source or edge.source.startswith("chunk:")):
|
||||||
|
source_chunk_id = edge.source
|
||||||
|
break
|
||||||
|
|
||||||
results.append(QueryHit(
|
results.append(QueryHit(
|
||||||
node_id=str(pid),
|
node_id=str(pid),
|
||||||
note_id=str(pl.get("note_id", "unknown")),
|
note_id=str(pl.get("note_id", "unknown")),
|
||||||
|
|
@ -267,7 +331,8 @@ def _build_hits_from_semantic(
|
||||||
"text": text_content
|
"text": text_content
|
||||||
},
|
},
|
||||||
payload=pl,
|
payload=pl,
|
||||||
explanation=explanation_obj
|
explanation=explanation_obj,
|
||||||
|
source_chunk_id=source_chunk_id # WP-24c v4.1.0: RAG-Kontext
|
||||||
))
|
))
|
||||||
|
|
||||||
return QueryResponse(results=results, used_mode=used_mode, latency_ms=int((time.time() - t0) * 1000))
|
return QueryResponse(results=results, used_mode=used_mode, latency_ms=int((time.time() - t0) * 1000))
|
||||||
|
|
@ -283,7 +348,9 @@ def hybrid_retrieve(req: QueryRequest) -> QueryResponse:
|
||||||
top_k = req.top_k or 10
|
top_k = req.top_k or 10
|
||||||
|
|
||||||
# 1. Semantische Seed-Suche (Wir laden etwas mehr für das Pooling)
|
# 1. Semantische Seed-Suche (Wir laden etwas mehr für das Pooling)
|
||||||
hits = _semantic_hits(client, prefix, vector, top_k=top_k * 3, filters=req.filters)
|
# WP-24c v4.1.0: Section-Filtering unterstützen
|
||||||
|
target_section = getattr(req, "target_section", None)
|
||||||
|
hits = _semantic_hits(client, prefix, vector, top_k=top_k * 3, filters=req.filters, target_section=target_section)
|
||||||
|
|
||||||
# 2. Graph Expansion Konfiguration
|
# 2. Graph Expansion Konfiguration
|
||||||
expand_cfg = req.expand if isinstance(req.expand, dict) else {}
|
expand_cfg = req.expand if isinstance(req.expand, dict) else {}
|
||||||
|
|
@ -296,36 +363,71 @@ def hybrid_retrieve(req: QueryRequest) -> QueryResponse:
|
||||||
|
|
||||||
if seed_ids:
|
if seed_ids:
|
||||||
try:
|
try:
|
||||||
subgraph = ga.expand(client, prefix, seed_ids, depth=depth, edge_types=expand_cfg.get("edge_types"))
|
# WP-24c v4.1.0: Scope-Awareness - Lade Chunk-IDs für Note-IDs
|
||||||
|
chunk_ids = _get_chunk_ids_for_notes(client, prefix, seed_ids)
|
||||||
|
|
||||||
# --- WP-15c: Edge-Aggregation & Deduplizierung (Super-Kanten) ---
|
# Erweiterte Edge-Retrieval mit Chunk-Scope und Section-Filtering
|
||||||
|
subgraph = ga.expand(
|
||||||
|
client, prefix, seed_ids,
|
||||||
|
depth=depth,
|
||||||
|
edge_types=expand_cfg.get("edge_types"),
|
||||||
|
chunk_ids=chunk_ids,
|
||||||
|
target_section=target_section
|
||||||
|
)
|
||||||
|
|
||||||
|
# --- WP-24c v4.1.0: Chunk-Level Edge-Aggregation & Deduplizierung ---
|
||||||
# Verhindert Score-Explosion durch multiple Links auf versch. Abschnitte.
|
# Verhindert Score-Explosion durch multiple Links auf versch. Abschnitte.
|
||||||
# Logik: 1. Kante zählt voll, weitere dämpfen auf Faktor 0.1.
|
# Logik: 1. Kante zählt voll, weitere dämpfen auf Faktor 0.1.
|
||||||
|
# Erweitert um Chunk-Level Tracking für präzise In-Degree-Berechnung.
|
||||||
if subgraph and hasattr(subgraph, "adj"):
|
if subgraph and hasattr(subgraph, "adj"):
|
||||||
|
# WP-24c v4.1.0: Chunk-Level In-Degree Tracking
|
||||||
|
chunk_level_in_degree = defaultdict(int) # target -> count of chunk sources
|
||||||
|
|
||||||
for src, edge_list in subgraph.adj.items():
|
for src, edge_list in subgraph.adj.items():
|
||||||
# Gruppiere Kanten nach Ziel-Note (Deduplizierung ID_A -> ID_B)
|
# Gruppiere Kanten nach Ziel-Note (Deduplizierung ID_A -> ID_B)
|
||||||
by_target = defaultdict(list)
|
by_target = defaultdict(list)
|
||||||
for e in edge_list:
|
for e in edge_list:
|
||||||
by_target[e["target"]].append(e)
|
by_target[e["target"]].append(e)
|
||||||
|
|
||||||
|
# WP-24c v4.1.0: Chunk-Level In-Degree Tracking
|
||||||
|
# Wenn source eine Chunk-ID ist, zähle für Chunk-Level In-Degree
|
||||||
|
if e.get("chunk_id") or (src and ("#" in src or src.startswith("chunk:"))):
|
||||||
|
chunk_level_in_degree[e["target"]] += 1
|
||||||
|
|
||||||
aggregated_list = []
|
aggregated_list = []
|
||||||
for tgt, edges in by_target.items():
|
for tgt, edges in by_target.items():
|
||||||
if len(edges) > 1:
|
if len(edges) > 1:
|
||||||
# Sortiere: Stärkste Kante zuerst
|
# Sortiere: Stärkste Kante zuerst (Authority-Priorisierung)
|
||||||
sorted_edges = sorted(edges, key=lambda x: x.get("weight", 0.0), reverse=True)
|
sorted_edges = sorted(
|
||||||
|
edges,
|
||||||
|
key=lambda x: (
|
||||||
|
x.get("weight", 0.0) *
|
||||||
|
(1.0 if not x.get("virtual", False) else 0.5) * # Virtual-Penalty
|
||||||
|
float(x.get("confidence", 1.0)) # Confidence-Boost
|
||||||
|
),
|
||||||
|
reverse=True
|
||||||
|
)
|
||||||
primary = sorted_edges[0]
|
primary = sorted_edges[0]
|
||||||
|
|
||||||
# Aggregiertes Gewicht berechnen (Sättigungs-Logik)
|
# Aggregiertes Gewicht berechnen (Sättigungs-Logik)
|
||||||
total_w = primary.get("weight", 0.0)
|
total_w = primary.get("weight", 0.0)
|
||||||
|
chunk_count = 0
|
||||||
for secondary in sorted_edges[1:]:
|
for secondary in sorted_edges[1:]:
|
||||||
total_w += secondary.get("weight", 0.0) * 0.1
|
total_w += secondary.get("weight", 0.0) * 0.1
|
||||||
|
if secondary.get("chunk_id") or (secondary.get("source") and ("#" in secondary.get("source", "") or secondary.get("source", "").startswith("chunk:"))):
|
||||||
|
chunk_count += 1
|
||||||
|
|
||||||
primary["weight"] = total_w
|
primary["weight"] = total_w
|
||||||
primary["is_super_edge"] = True # Flag für Explanation Layer
|
primary["is_super_edge"] = True # Flag für Explanation Layer
|
||||||
primary["edge_count"] = len(edges)
|
primary["edge_count"] = len(edges)
|
||||||
|
primary["chunk_source_count"] = chunk_count + (1 if (primary.get("chunk_id") or (primary.get("source") and ("#" in primary.get("source", "") or primary.get("source", "").startswith("chunk:")))) else 0)
|
||||||
aggregated_list.append(primary)
|
aggregated_list.append(primary)
|
||||||
else:
|
else:
|
||||||
aggregated_list.append(edges[0])
|
edge = edges[0]
|
||||||
|
# WP-24c v4.1.0: Chunk-Count auch für einzelne Edges
|
||||||
|
if edge.get("chunk_id") or (edge.get("source") and ("#" in edge.get("source", "") or edge.get("source", "").startswith("chunk:"))):
|
||||||
|
edge["chunk_source_count"] = 1
|
||||||
|
aggregated_list.append(edge)
|
||||||
|
|
||||||
# In-Place Update der Adjazenzliste des Graphen
|
# In-Place Update der Adjazenzliste des Graphen
|
||||||
subgraph.adj[src] = aggregated_list
|
subgraph.adj[src] = aggregated_list
|
||||||
|
|
@ -336,20 +438,31 @@ def hybrid_retrieve(req: QueryRequest) -> QueryResponse:
|
||||||
for e in edges:
|
for e in edges:
|
||||||
subgraph.in_degree[e["target"]] += 1
|
subgraph.in_degree[e["target"]] += 1
|
||||||
|
|
||||||
# --- WP-22: Kanten-Gewichtung (Provenance & Intent Boost) ---
|
# WP-24c v4.1.0: Chunk-Level In-Degree als Attribut speichern
|
||||||
|
subgraph.chunk_level_in_degree = chunk_level_in_degree
|
||||||
|
|
||||||
|
# --- WP-24c v4.1.0: Authority-Priorisierung (Provenance & Confidence) ---
|
||||||
if subgraph and hasattr(subgraph, "adj"):
|
if subgraph and hasattr(subgraph, "adj"):
|
||||||
for src, edges in subgraph.adj.items():
|
for src, edges in subgraph.adj.items():
|
||||||
for e in edges:
|
for e in edges:
|
||||||
# A. Provenance Weighting
|
# A. Provenance Weighting (nutzt PROVENANCE_PRIORITY aus graph_utils)
|
||||||
prov = e.get("provenance", "rule")
|
prov = e.get("provenance", "rule")
|
||||||
prov_w = 1.0 if prov == "explicit" else (0.9 if prov == "smart" else 0.7)
|
prov_key = f"{prov}:{e.get('kind', 'related_to')}" if ":" not in prov else prov
|
||||||
|
prov_w = PROVENANCE_PRIORITY.get(prov_key, PROVENANCE_PRIORITY.get(prov, 0.7))
|
||||||
|
|
||||||
# B. Intent Boost Multiplikator
|
# B. Confidence-Weighting (aus Edge-Payload)
|
||||||
|
confidence = float(e.get("confidence", 1.0))
|
||||||
|
|
||||||
|
# C. Virtual-Flag De-Priorisierung
|
||||||
|
is_virtual = e.get("virtual", False)
|
||||||
|
virtual_penalty = 0.5 if is_virtual else 1.0
|
||||||
|
|
||||||
|
# D. Intent Boost Multiplikator
|
||||||
kind = e.get("kind")
|
kind = e.get("kind")
|
||||||
intent_multiplier = boost_edges.get(kind, 1.0)
|
intent_multiplier = boost_edges.get(kind, 1.0)
|
||||||
|
|
||||||
# Gewichtung anpassen
|
# Gewichtung anpassen (Authority-Priorisierung)
|
||||||
e["weight"] = e.get("weight", 1.0) * prov_w * intent_multiplier
|
e["weight"] = e.get("weight", 1.0) * prov_w * confidence * virtual_penalty * intent_multiplier
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Graph Expansion failed: {e}")
|
logger.error(f"Graph Expansion failed: {e}")
|
||||||
|
|
|
||||||
|
|
@ -56,6 +56,7 @@ class EdgeDTO(BaseModel):
|
||||||
class QueryRequest(BaseModel):
|
class QueryRequest(BaseModel):
|
||||||
"""
|
"""
|
||||||
Request für /query. Unterstützt Multi-Stream Isolation via filters.
|
Request für /query. Unterstützt Multi-Stream Isolation via filters.
|
||||||
|
WP-24c v4.1.0: Erweitert um Section-Filtering und Scope-Awareness.
|
||||||
"""
|
"""
|
||||||
mode: Literal["semantic", "edge", "hybrid"] = "hybrid"
|
mode: Literal["semantic", "edge", "hybrid"] = "hybrid"
|
||||||
query: Optional[str] = None
|
query: Optional[str] = None
|
||||||
|
|
@ -69,6 +70,9 @@ class QueryRequest(BaseModel):
|
||||||
# WP-22/25: Dynamische Gewichtung der Graphen-Highways
|
# WP-22/25: Dynamische Gewichtung der Graphen-Highways
|
||||||
boost_edges: Optional[Dict[str, float]] = None
|
boost_edges: Optional[Dict[str, float]] = None
|
||||||
|
|
||||||
|
# WP-24c v4.1.0: Section-Filtering für präzise Section-Links
|
||||||
|
target_section: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
class FeedbackRequest(BaseModel):
|
class FeedbackRequest(BaseModel):
|
||||||
"""User-Feedback zu einem spezifischen Treffer oder der Gesamtantwort."""
|
"""User-Feedback zu einem spezifischen Treffer oder der Gesamtantwort."""
|
||||||
|
|
@ -125,6 +129,7 @@ class QueryHit(BaseModel):
|
||||||
"""
|
"""
|
||||||
Einzelnes Trefferobjekt.
|
Einzelnes Trefferobjekt.
|
||||||
WP-25: stream_origin hinzugefügt für Tracing und Feedback-Optimierung.
|
WP-25: stream_origin hinzugefügt für Tracing und Feedback-Optimierung.
|
||||||
|
WP-24c v4.1.0: source_chunk_id für RAG-Kontext hinzugefügt.
|
||||||
"""
|
"""
|
||||||
node_id: str
|
node_id: str
|
||||||
note_id: str
|
note_id: str
|
||||||
|
|
@ -137,6 +142,7 @@ class QueryHit(BaseModel):
|
||||||
payload: Optional[Dict] = None
|
payload: Optional[Dict] = None
|
||||||
explanation: Optional[Explanation] = None
|
explanation: Optional[Explanation] = None
|
||||||
stream_origin: Optional[str] = Field(None, description="Name des Ursprungs-Streams")
|
stream_origin: Optional[str] = Field(None, description="Name des Ursprungs-Streams")
|
||||||
|
source_chunk_id: Optional[str] = Field(None, description="Chunk-ID der Quelle (für RAG-Kontext)")
|
||||||
|
|
||||||
|
|
||||||
class QueryResponse(BaseModel):
|
class QueryResponse(BaseModel):
|
||||||
|
|
|
||||||
253
docs/01_User_Manual/LLM_VALIDIERUNG_VON_LINKS.md
Normal file
253
docs/01_User_Manual/LLM_VALIDIERUNG_VON_LINKS.md
Normal file
|
|
@ -0,0 +1,253 @@
|
||||||
|
# LLM-Validierung von Links in Notizen
|
||||||
|
|
||||||
|
**Version:** v4.1.0
|
||||||
|
**Status:** Aktiv
|
||||||
|
|
||||||
|
## Übersicht
|
||||||
|
|
||||||
|
Das Mindnet-System unterstützt zwei Arten von Links:
|
||||||
|
|
||||||
|
1. **Explizite Links** - Werden direkt übernommen (keine Validierung)
|
||||||
|
2. **Global Pool Links** - Werden vom LLM validiert (wenn aktiviert)
|
||||||
|
|
||||||
|
## Explizite Links (keine Validierung)
|
||||||
|
|
||||||
|
Diese Links werden **sofort** in den Graph übernommen, ohne LLM-Validierung:
|
||||||
|
|
||||||
|
### 1. Typed Relations
|
||||||
|
```markdown
|
||||||
|
[[rel:mastered_by|Klaus]]
|
||||||
|
[[rel:depends_on|Projekt Alpha]]
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Standard Wikilinks
|
||||||
|
```markdown
|
||||||
|
[[Klaus]]
|
||||||
|
[[Projekt Alpha]]
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. Callouts
|
||||||
|
```markdown
|
||||||
|
> [!edge] mastered_by:Klaus
|
||||||
|
> [!edge] depends_on:Projekt Alpha
|
||||||
|
```
|
||||||
|
|
||||||
|
**Hinweis:** Explizite Links haben immer Vorrang und werden nicht validiert.
|
||||||
|
|
||||||
|
## Global Pool Links (mit LLM-Validierung)
|
||||||
|
|
||||||
|
Links, die vom LLM validiert werden sollen, müssen in einer speziellen Sektion am Ende der Notiz definiert werden.
|
||||||
|
|
||||||
|
### Format
|
||||||
|
|
||||||
|
Erstellen Sie eine Sektion mit einem der folgenden Titel:
|
||||||
|
- `### Unzugeordnete Kanten`
|
||||||
|
- `### Edge Pool`
|
||||||
|
- `### Candidates`
|
||||||
|
|
||||||
|
In dieser Sektion listen Sie Links im Format `kind:target` auf:
|
||||||
|
|
||||||
|
```markdown
|
||||||
|
---
|
||||||
|
type: concept
|
||||||
|
title: Meine Notiz
|
||||||
|
---
|
||||||
|
|
||||||
|
# Inhalt der Notiz
|
||||||
|
|
||||||
|
Hier ist der normale Inhalt...
|
||||||
|
|
||||||
|
### Unzugeordnete Kanten
|
||||||
|
|
||||||
|
related_to:Klaus
|
||||||
|
mastered_by:Projekt Alpha
|
||||||
|
depends_on:Andere Notiz
|
||||||
|
```
|
||||||
|
|
||||||
|
### Beispiel
|
||||||
|
|
||||||
|
```markdown
|
||||||
|
---
|
||||||
|
type: decision
|
||||||
|
title: Entscheidung über Technologie-Stack
|
||||||
|
---
|
||||||
|
|
||||||
|
# Entscheidung über Technologie-Stack
|
||||||
|
|
||||||
|
Wir haben uns für React entschieden, weil...
|
||||||
|
|
||||||
|
## Begründung
|
||||||
|
|
||||||
|
React bietet bessere Performance...
|
||||||
|
|
||||||
|
### Unzugeordnete Kanten
|
||||||
|
|
||||||
|
related_to:React-Dokumentation
|
||||||
|
depends_on:Performance-Analyse
|
||||||
|
uses:TypeScript
|
||||||
|
```
|
||||||
|
|
||||||
|
### Validierung
|
||||||
|
|
||||||
|
**Wichtig:** Global Pool Links werden nur validiert, wenn:
|
||||||
|
|
||||||
|
1. Die Chunk-Konfiguration `enable_smart_edge_allocation: true` enthält
|
||||||
|
2. Der Note-Typ dieses Chunk-Profil verwendet (die Zuordnung wird normalerweise pro Note-Typ in `config/types.yaml` konfiguriert)
|
||||||
|
|
||||||
|
**Beispiel-Konfiguration in `types.yaml`:**
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
types:
|
||||||
|
decision:
|
||||||
|
chunking_profile: sliding_smart_edges
|
||||||
|
chunking:
|
||||||
|
sliding_smart_edges:
|
||||||
|
enable_smart_edge_allocation: true # ← Aktiviert LLM-Validierung
|
||||||
|
```
|
||||||
|
|
||||||
|
### Validierungsprozess
|
||||||
|
|
||||||
|
1. **Extraktion:** Links aus der "Unzugeordnete Kanten" Sektion werden extrahiert
|
||||||
|
2. **Provenance:** Die extrahierten Links erhalten `provenance: "global_pool"`
|
||||||
|
3. **Validierung:** Für jeden Link wird geprüft:
|
||||||
|
- Ist der Link semantisch relevant für den Chunk-Kontext?
|
||||||
|
- Passt die Relation (`kind`) zum Ziel?
|
||||||
|
4. **Ergebnis:**
|
||||||
|
- ✅ **YES** → Link wird in den Graph übernommen
|
||||||
|
- ❌ **NO** → Link wird verworfen
|
||||||
|
|
||||||
|
### Validierungs-Prompt
|
||||||
|
|
||||||
|
Das System verwendet den Prompt `edge_validation` aus `config/prompts.yaml`:
|
||||||
|
|
||||||
|
```
|
||||||
|
Verify relation '{edge_kind}' for graph integrity.
|
||||||
|
Chunk: "{chunk_text}"
|
||||||
|
Target: "{target_title}" ({target_summary})
|
||||||
|
Respond ONLY with 'YES' or 'NO'.
|
||||||
|
```
|
||||||
|
|
||||||
|
## Best Practices
|
||||||
|
|
||||||
|
### ✅ Empfohlen
|
||||||
|
|
||||||
|
1. **Explizite Links für sichere Verbindungen:**
|
||||||
|
```markdown
|
||||||
|
Diese Entscheidung [[rel:depends_on|Performance-Analyse]] wurde getroffen.
|
||||||
|
```
|
||||||
|
|
||||||
|
2. **Global Pool für unsichere/explorative Links:**
|
||||||
|
```markdown
|
||||||
|
### Unzugeordnete Kanten
|
||||||
|
related_to:Mögliche Verbindung
|
||||||
|
```
|
||||||
|
|
||||||
|
3. **Kombination beider Ansätze:**
|
||||||
|
```markdown
|
||||||
|
# Hauptinhalt
|
||||||
|
|
||||||
|
Explizite Verbindung: [[rel:depends_on|Sichere Notiz]]
|
||||||
|
|
||||||
|
## Weitere Überlegungen
|
||||||
|
|
||||||
|
### Unzugeordnete Kanten
|
||||||
|
related_to:Unsichere Verbindung
|
||||||
|
explored_in:Experimentelle Notiz
|
||||||
|
```
|
||||||
|
|
||||||
|
### ❌ Vermeiden
|
||||||
|
|
||||||
|
1. **Nicht zu viele Global Pool Links:**
|
||||||
|
- Jeder Link erfordert einen LLM-Aufruf
|
||||||
|
- Kann die Ingestion verlangsamen
|
||||||
|
|
||||||
|
2. **Nicht für offensichtliche Links:**
|
||||||
|
- Nutzen Sie explizite Links für klare Verbindungen
|
||||||
|
- Global Pool ist für explorative/unsichere Links gedacht
|
||||||
|
|
||||||
|
## Aktivierung der Validierung
|
||||||
|
|
||||||
|
### Schritt 1: Chunk-Profile konfigurieren
|
||||||
|
|
||||||
|
In `config/types.yaml`:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
types:
|
||||||
|
your_type:
|
||||||
|
chunking_profile: sliding_smart_edges
|
||||||
|
chunking:
|
||||||
|
sliding_smart_edges:
|
||||||
|
enable_smart_edge_allocation: true
|
||||||
|
```
|
||||||
|
|
||||||
|
### Schritt 2: Notiz erstellen
|
||||||
|
|
||||||
|
```markdown
|
||||||
|
---
|
||||||
|
type: your_type
|
||||||
|
title: Meine Notiz
|
||||||
|
---
|
||||||
|
|
||||||
|
# Inhalt
|
||||||
|
|
||||||
|
### Unzugeordnete Kanten
|
||||||
|
|
||||||
|
related_to:Ziel-Notiz
|
||||||
|
```
|
||||||
|
|
||||||
|
### Schritt 3: Import ausführen
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 -m scripts.import_markdown --vault ./vault --apply
|
||||||
|
```
|
||||||
|
|
||||||
|
## Logging & Debugging
|
||||||
|
|
||||||
|
Während der Ingestion sehen Sie im Log:
|
||||||
|
|
||||||
|
```
|
||||||
|
⚖️ [VALIDATING] Relation 'related_to' -> 'Ziel-Notiz' (Profile: ingest_validator)...
|
||||||
|
✅ [VALIDATED] Relation to 'Ziel-Notiz' confirmed.
|
||||||
|
```
|
||||||
|
|
||||||
|
oder
|
||||||
|
|
||||||
|
```
|
||||||
|
🚫 [REJECTED] Relation to 'Ziel-Notiz' irrelevant for this chunk.
|
||||||
|
```
|
||||||
|
|
||||||
|
## Technische Details
|
||||||
|
|
||||||
|
### Provenance-System
|
||||||
|
|
||||||
|
- `explicit`: Explizite Links (keine Validierung)
|
||||||
|
- `global_pool`: Global Pool Links (mit Validierung)
|
||||||
|
- `semantic_ai`: KI-generierte Links
|
||||||
|
- `rule`: Regel-basierte Links (z.B. aus types.yaml)
|
||||||
|
|
||||||
|
### Code-Referenzen
|
||||||
|
|
||||||
|
- **Extraktion:** `app/core/chunking/chunking_processor.py` (Zeile 66-81)
|
||||||
|
- **Validierung:** `app/core/ingestion/ingestion_validation.py`
|
||||||
|
- **Integration:** `app/core/ingestion/ingestion_processor.py` (Zeile 237-239)
|
||||||
|
|
||||||
|
## FAQ
|
||||||
|
|
||||||
|
**Q: Werden explizite Links auch validiert?**
|
||||||
|
A: Nein, explizite Links werden direkt übernommen.
|
||||||
|
|
||||||
|
**Q: Kann ich die Validierung für bestimmte Links überspringen?**
|
||||||
|
A: Ja, nutzen Sie explizite Links (`[[rel:kind|target]]` oder `> [!edge]`).
|
||||||
|
|
||||||
|
**Q: Was passiert, wenn das LLM nicht verfügbar ist?**
|
||||||
|
A: Bei transienten Fehlern (Netzwerk) werden Links erlaubt. Bei permanenten Fehlern werden sie verworfen.
|
||||||
|
|
||||||
|
**Q: Kann ich mehrere Links in einer Zeile angeben?**
|
||||||
|
A: Nein, jeder Link muss in einer eigenen Zeile stehen: `kind:target`.
|
||||||
|
|
||||||
|
## Zusammenfassung
|
||||||
|
|
||||||
|
- ✅ **Explizite Links:** `[[rel:kind|target]]` oder `> [!edge]` → Keine Validierung
|
||||||
|
- ✅ **Global Pool Links:** Sektion `### Unzugeordnete Kanten` → Mit LLM-Validierung
|
||||||
|
- ✅ **Aktivierung:** `enable_smart_edge_allocation: true` in Chunk-Config
|
||||||
|
- ✅ **Format:** `kind:target` (eine pro Zeile)
|
||||||
240
docs/01_User_Manual/NOTE_SCOPE_ZONEN.md
Normal file
240
docs/01_User_Manual/NOTE_SCOPE_ZONEN.md
Normal file
|
|
@ -0,0 +1,240 @@
|
||||||
|
# Note-Scope Extraktions-Zonen (v4.2.0)
|
||||||
|
|
||||||
|
**Version:** v4.2.0
|
||||||
|
**Status:** Aktiv
|
||||||
|
|
||||||
|
## Übersicht
|
||||||
|
|
||||||
|
Das Mindnet-System unterstützt nun **Note-Scope Extraktions-Zonen**, die es ermöglichen, Links zu definieren, die der gesamten Note zugeordnet werden (nicht nur einem spezifischen Chunk).
|
||||||
|
|
||||||
|
### Unterschied: Chunk-Scope vs. Note-Scope
|
||||||
|
|
||||||
|
- **Chunk-Scope Links** (`scope: "chunk"`):
|
||||||
|
- Werden aus dem Text-Inhalt extrahiert
|
||||||
|
- Sind dem lokalen Kontext (dem jeweiligen Chunk) zugeordnet
|
||||||
|
- `source_id` = `chunk_id`
|
||||||
|
|
||||||
|
- **Note-Scope Links** (`scope: "note"`):
|
||||||
|
- Werden aus speziellen Markdown-Sektionen extrahiert
|
||||||
|
- Sind der gesamten Note zugeordnet
|
||||||
|
- `source_id` = `note_id`
|
||||||
|
- Haben höchste Priorität bei Duplikaten
|
||||||
|
|
||||||
|
## Verwendung
|
||||||
|
|
||||||
|
### Format
|
||||||
|
|
||||||
|
Erstellen Sie eine Sektion mit einem der folgenden Header:
|
||||||
|
|
||||||
|
- `## Smart Edges`
|
||||||
|
- `## Relationen`
|
||||||
|
- `## Global Links`
|
||||||
|
- `## Note-Level Relations`
|
||||||
|
- `## Globale Verbindungen`
|
||||||
|
|
||||||
|
**Wichtig:** Der Header-Text muss im Wortlaut exakt übereinstimmen; Groß-/Kleinschreibung wird dabei ignoriert (case-insensitive).
|
||||||
|
|
||||||
|
### Beispiel
|
||||||
|
|
||||||
|
```markdown
|
||||||
|
---
|
||||||
|
type: decision
|
||||||
|
title: Technologie-Entscheidung
|
||||||
|
---
|
||||||
|
|
||||||
|
# Entscheidung über Technologie-Stack
|
||||||
|
|
||||||
|
Wir haben uns für React entschieden...
|
||||||
|
|
||||||
|
## Begründung
|
||||||
|
|
||||||
|
React bietet bessere Performance...
|
||||||
|
|
||||||
|
## Smart Edges
|
||||||
|
|
||||||
|
[[rel:depends_on|Performance-Analyse]]
|
||||||
|
[[rel:uses|TypeScript]]
|
||||||
|
[[React-Dokumentation]]
|
||||||
|
|
||||||
|
## Weitere Überlegungen
|
||||||
|
|
||||||
|
Hier ist weiterer Inhalt...
|
||||||
|
```
|
||||||
|
|
||||||
|
### Unterstützte Link-Formate
|
||||||
|
|
||||||
|
In Note-Scope Zonen werden folgende Formate unterstützt:
|
||||||
|
|
||||||
|
1. **Typed Relations:**
|
||||||
|
```markdown
|
||||||
|
## Smart Edges
|
||||||
|
[[rel:depends_on|Ziel-Notiz]]
|
||||||
|
[[rel:uses|Andere Notiz]]
|
||||||
|
```
|
||||||
|
|
||||||
|
2. **Standard Wikilinks:**
|
||||||
|
```markdown
|
||||||
|
## Smart Edges
|
||||||
|
[[Ziel-Notiz]]
|
||||||
|
[[Andere Notiz]]
|
||||||
|
```
|
||||||
|
(Werden als `related_to` interpretiert)
|
||||||
|
|
||||||
|
3. **Callouts:**
|
||||||
|
```markdown
|
||||||
|
## Smart Edges
|
||||||
|
> [!edge] depends_on:[[Ziel-Notiz]]
|
||||||
|
> [!edge] uses:[[Andere Notiz]]
|
||||||
|
```
|
||||||
|
|
||||||
|
## Technische Details
|
||||||
|
|
||||||
|
### ID-Generierung
|
||||||
|
|
||||||
|
Note-Scope Links verwenden die **exakt gleiche ID-Generierung** wie Symmetrie-Kanten in Phase 2:
|
||||||
|
|
||||||
|
```python
|
||||||
|
_mk_edge_id(kind, note_id, target_id, "note", target_section=sec)
|
||||||
|
```
|
||||||
|
|
||||||
|
Dies stellt sicher, dass:
|
||||||
|
- ✅ Authority-Check in Phase 2 korrekt funktioniert
|
||||||
|
- ✅ Keine Duplikate entstehen
|
||||||
|
- ✅ Symmetrie-Schutz greift
|
||||||
|
|
||||||
|
### Provenance
|
||||||
|
|
||||||
|
Note-Scope Links erhalten:
|
||||||
|
- `provenance: "explicit:note_zone"`
|
||||||
|
- `confidence: 1.0` (höchste Priorität)
|
||||||
|
- `scope: "note"`
|
||||||
|
- `source_id: note_id` (nicht `chunk_id`)
|
||||||
|
|
||||||
|
### Priorisierung
|
||||||
|
|
||||||
|
Bei Duplikaten (gleiche ID):
|
||||||
|
1. **Note-Scope Links** haben **höchste Priorität**
|
||||||
|
2. Dann Confidence-Wert
|
||||||
|
3. Dann Provenance-Priority
|
||||||
|
|
||||||
|
**Beispiel:**
|
||||||
|
- Chunk-Link: `related_to:Note-A` (aus Text)
|
||||||
|
- Note-Scope Link: `related_to:Note-A` (aus Zone)
|
||||||
|
- **Ergebnis:** Note-Scope Link wird beibehalten
|
||||||
|
|
||||||
|
## Best Practices
|
||||||
|
|
||||||
|
### ✅ Empfohlen
|
||||||
|
|
||||||
|
1. **Note-Scope für globale Verbindungen:**
|
||||||
|
```markdown
|
||||||
|
## Smart Edges
|
||||||
|
[[rel:depends_on|Projekt-Übersicht]]
|
||||||
|
[[rel:part_of|Größeres System]]
|
||||||
|
```
|
||||||
|
|
||||||
|
2. **Chunk-Scope für lokale Referenzen:**
|
||||||
|
```markdown
|
||||||
|
In diesem Abschnitt verweisen wir auf [[rel:uses|Spezifische Technologie]].
|
||||||
|
```
|
||||||
|
|
||||||
|
3. **Kombination:**
|
||||||
|
```markdown
|
||||||
|
# Hauptinhalt
|
||||||
|
|
||||||
|
Lokale Referenz: [[rel:uses|Lokale Notiz]]
|
||||||
|
|
||||||
|
## Smart Edges
|
||||||
|
|
||||||
|
Globale Verbindung: [[rel:depends_on|Globale Notiz]]
|
||||||
|
```
|
||||||
|
|
||||||
|
### ❌ Vermeiden
|
||||||
|
|
||||||
|
1. **Nicht für lokale Kontext-Links:**
|
||||||
|
- Nutzen Sie Chunk-Scope Links für lokale Referenzen
|
||||||
|
- Note-Scope ist für Note-weite Verbindungen gedacht
|
||||||
|
|
||||||
|
2. **Nicht zu viele Note-Scope Links:**
|
||||||
|
- Beschränken Sie sich auf wirklich Note-weite Verbindungen
|
||||||
|
- Zu viele Note-Scope Links können die Graph-Struktur verwässern
|
||||||
|
|
||||||
|
## Integration mit LLM-Validierung
|
||||||
|
|
||||||
|
Links in Note-Scope Zonen werden nicht validiert. Sollen Links stattdessen **LLM-validiert** werden, müssen sie in der Sektion `### Unzugeordnete Kanten` stehen:
|
||||||
|
|
||||||
|
```markdown
|
||||||
|
### Unzugeordnete Kanten
|
||||||
|
|
||||||
|
related_to:Mögliche Verbindung
|
||||||
|
```
|
||||||
|
|
||||||
|
**Wichtig:** Links in `### Unzugeordnete Kanten` werden als `global_pool` markiert und validiert. Links in `## Smart Edges` werden als `explicit:note_zone` markiert und **nicht** validiert (direkt übernommen).
|
||||||
|
|
||||||
|
## Beispiel: Vollständige Notiz
|
||||||
|
|
||||||
|
```markdown
|
||||||
|
---
|
||||||
|
type: decision
|
||||||
|
title: Architektur-Entscheidung
|
||||||
|
---
|
||||||
|
|
||||||
|
# Architektur-Entscheidung
|
||||||
|
|
||||||
|
Wir haben uns für Microservices entschieden...
|
||||||
|
|
||||||
|
## Begründung
|
||||||
|
|
||||||
|
### Performance
|
||||||
|
|
||||||
|
Microservices bieten bessere Skalierbarkeit. Siehe auch [[rel:uses|Kubernetes]] für Orchestrierung.
|
||||||
|
|
||||||
|
### Sicherheit
|
||||||
|
|
||||||
|
Wir nutzen [[rel:enforced_by|OAuth2]] für Authentifizierung.
|
||||||
|
|
||||||
|
## Smart Edges
|
||||||
|
|
||||||
|
[[rel:depends_on|System-Architektur]]
|
||||||
|
[[rel:part_of|Gesamt-System]]
|
||||||
|
[[rel:uses|Cloud-Infrastruktur]]
|
||||||
|
|
||||||
|
## Weitere Details
|
||||||
|
|
||||||
|
Hier ist weiterer Inhalt...
|
||||||
|
```
|
||||||
|
|
||||||
|
**Ergebnis:**
|
||||||
|
- `uses:Kubernetes` → Chunk-Scope (aus Text)
|
||||||
|
- `enforced_by:OAuth2` → Chunk-Scope (aus Text)
|
||||||
|
- `depends_on:System-Architektur` → Note-Scope (aus Zone)
|
||||||
|
- `part_of:Gesamt-System` → Note-Scope (aus Zone)
|
||||||
|
- `uses:Cloud-Infrastruktur` → Note-Scope (aus Zone)
|
||||||
|
|
||||||
|
## Code-Referenzen
|
||||||
|
|
||||||
|
- **Extraktion:** `app/core/graph/graph_derive_edges.py` → `extract_note_scope_zones()`
|
||||||
|
- **Integration:** `app/core/graph/graph_derive_edges.py` → `build_edges_for_note()`
|
||||||
|
- **Header-Liste:** `NOTE_SCOPE_ZONE_HEADERS` in `graph_derive_edges.py`
|
||||||
|
|
||||||
|
## FAQ
|
||||||
|
|
||||||
|
**Q: Können Note-Scope Links auch Section-Links sein?**
|
||||||
|
A: Ja, `[[rel:kind|Target#Section]]` wird unterstützt. `target_section` fließt in die ID ein.
|
||||||
|
|
||||||
|
**Q: Was passiert, wenn ein Link sowohl in Chunk als auch in Note-Scope Zone steht?**
|
||||||
|
A: Der Note-Scope Link hat Vorrang und wird beibehalten.
|
||||||
|
|
||||||
|
**Q: Werden Note-Scope Links validiert?**
|
||||||
|
A: Nein, sie werden direkt übernommen (wie explizite Links). Für Validierung nutzen Sie `### Unzugeordnete Kanten`.
|
||||||
|
|
||||||
|
**Q: Kann ich eigene Header-Namen verwenden?**
|
||||||
|
A: Aktuell nur die vordefinierten Header. Erweiterung möglich durch Anpassung von `NOTE_SCOPE_ZONE_HEADERS`.
|
||||||
|
|
||||||
|
## Zusammenfassung
|
||||||
|
|
||||||
|
- ✅ **Note-Scope Zonen:** `## Smart Edges` oder ähnliche Header
|
||||||
|
- ✅ **Format:** `[[rel:kind|target]]` oder `[[target]]`
|
||||||
|
- ✅ **Scope:** `scope: "note"`, `source_id: note_id`
|
||||||
|
- ✅ **Priorität:** Höchste Priorität bei Duplikaten
|
||||||
|
- ✅ **ID-Konsistenz:** Exakt wie Symmetrie-Kanten (Phase 2)
|
||||||
131
docs/03_Technical_References/AUDIT_RETRIEVER_V4.1.0.md
Normal file
131
docs/03_Technical_References/AUDIT_RETRIEVER_V4.1.0.md
Normal file
|
|
@ -0,0 +1,131 @@
|
||||||
|
# Audit: Retriever & Scoring (Gold-Standard v4.1.0)
|
||||||
|
|
||||||
|
**Datum:** 2026-01-10
|
||||||
|
**Version:** v4.1.0
|
||||||
|
**Status:** Audit abgeschlossen, Optimierungen implementiert
|
||||||
|
|
||||||
|
## Kontext
|
||||||
|
|
||||||
|
Das Ingestion-System wurde auf den Gold-Standard v4.1.0 aktualisiert. Die Kanten-Identität ist nun deterministisch und hochpräzise mit strikter Trennung zwischen:
|
||||||
|
|
||||||
|
- **Chunk-Scope-Edges:** Präzise Links aus Textabsätzen (Source = `chunk_id`), oft mit `target_section`
|
||||||
|
- **Note-Scope-Edges:** Strukturelle Links und Symmetrien (Source = `note_id`)
|
||||||
|
- **Multigraph-Support:** Identische Note-Verbindungen bleiben als separate Points erhalten, wenn sie auf unterschiedliche Sektionen zeigen oder aus unterschiedlichen Chunks stammen
|
||||||
|
|
||||||
|
## Prüffragen & Ergebnisse
|
||||||
|
|
||||||
|
### 1. Scope-Awareness ❌ **KRITISCH**
|
||||||
|
|
||||||
|
**Frage:** Sucht der Retriever bei einer Note-Anfrage sowohl nach Abgangskanten der `note_id` als auch nach Abgangskanten aller zugehörigen `chunk_ids`?
|
||||||
|
|
||||||
|
**Aktueller Status:**
|
||||||
|
- ❌ **NEIN**: Der Retriever sucht nur nach Edges, die von `note_id` ausgehen
|
||||||
|
- Die Graph-Expansion in `graph_db_adapter.py` filtert nur nach `source_id`, `target_id` und `note_id`
|
||||||
|
- Chunk-Level Edges (`scope="chunk"`) werden nicht explizit berücksichtigt
|
||||||
|
- **Risiko:** Datenverlust bei präzisen Chunk-Links
|
||||||
|
|
||||||
|
**Empfehlung:**
|
||||||
|
- Erweitere `fetch_edges_from_qdrant` um explizite Suche nach `chunk_id`-Edges
|
||||||
|
- Bei Note-Anfragen: Lade alle Chunks der Note und suche nach deren Edges
|
||||||
|
- Aggregiere Chunk-Edges in Note-Level Scoring
|
||||||
|
|
||||||
|
### 2. Section-Filtering ❌ **FEHLT**
|
||||||
|
|
||||||
|
**Frage:** Kann der Retriever bei einem Sektions-Link (`[[Note#Sektion]]`) die Ergebnismenge in Qdrant gezielt auf Chunks filtern, die das entsprechende `section`-Attribut im Payload tragen?
|
||||||
|
|
||||||
|
**Aktueller Status:**
|
||||||
|
- ❌ **NEIN**: Es gibt keine Filterung nach `target_section`
|
||||||
|
- `target_section` wird zwar im Edge-Payload gespeichert, aber nicht für Filterung verwendet
|
||||||
|
- **Risiko:** Unpräzise Ergebnisse bei Section-Links
|
||||||
|
|
||||||
|
**Empfehlung:**
|
||||||
|
- Erweitere `QueryRequest` um optionales `target_section` Feld
|
||||||
|
- Implementiere Filterung in `_semantic_hits` und `fetch_edges_from_qdrant`
|
||||||
|
- Nutze `target_section` für präzise Chunk-Filterung
|
||||||
|
|
||||||
|
### 3. Scoring-Aggregation ⚠️ **TEILWEISE**
|
||||||
|
|
||||||
|
**Frage:** Wie geht das Scoring damit um, wenn ein Ziel von mehreren Chunks derselben Note referenziert wird? Wird die Relevanz (In-Degree) auf Chunk-Ebene korrekt akkumuliert?
|
||||||
|
|
||||||
|
**Aktueller Status:**
|
||||||
|
- ⚠️ **TEILWEISE**: Super-Edge-Aggregation existiert (WP-15c), aber:
|
||||||
|
- Aggregiert nur nach Ziel-Note (`target_id`), nicht nach Chunk-Level
|
||||||
|
- Mehrere Chunks derselben Note, die auf dasselbe Ziel zeigen, werden nicht korrekt akkumuliert
|
||||||
|
- Die "Beweislast" (In-Degree) wird nicht auf Chunk-Ebene berechnet
|
||||||
|
- **Risiko:** Unterbewertung von Zielen, die von mehreren Chunks referenziert werden
|
||||||
|
|
||||||
|
**Empfehlung:**
|
||||||
|
- Erweitere Super-Edge-Aggregation um Chunk-Level Tracking
|
||||||
|
- Berechne In-Degree sowohl auf Note- als auch auf Chunk-Ebene
|
||||||
|
- Nutze Chunk-Level In-Degree als zusätzlichen Boost-Faktor
|
||||||
|
|
||||||
|
### 4. Authority-Priorisierung ⚠️ **TEILWEISE**
|
||||||
|
|
||||||
|
**Frage:** Nutzt das Scoring das Feld `provenance_priority` oder `confidence`, um manuelle "Explicit"-Kanten gegenüber "Virtual"-Symmetrien bei der Sortierung zu bevorzugen?
|
||||||
|
|
||||||
|
**Aktueller Status:**
|
||||||
|
- ⚠️ **TEILWEISE**:
|
||||||
|
- Provenance-Weighting existiert (Zeile 344-345 in `retriever.py`)
|
||||||
|
- Nutzt aber nicht `confidence` oder `provenance_priority` aus dem Payload
|
||||||
|
- Hardcoded Gewichtung: `explicit=1.0`, `smart=0.9`, `rule=0.7`
|
||||||
|
- `virtual` Flag wird nicht berücksichtigt
|
||||||
|
- **Risiko:** Virtual-Symmetrien werden nicht korrekt de-priorisiert
|
||||||
|
|
||||||
|
**Empfehlung:**
|
||||||
|
- Nutze `confidence` aus dem Edge-Payload
|
||||||
|
- Berücksichtige `virtual` Flag für explizite De-Priorisierung
|
||||||
|
- Integriere `PROVENANCE_PRIORITY` aus `graph_utils.py` statt Hardcoding
|
||||||
|
|
||||||
|
### 5. RAG-Kontext ❌ **FEHLT**
|
||||||
|
|
||||||
|
**Frage:** Wird beim Retrieval einer Kante die `source_id` (Chunk) direkt mitgeliefert, damit das LLM den exakten Herkunfts-Kontext der Verbindung erhält?
|
||||||
|
|
||||||
|
**Aktueller Status:**
|
||||||
|
- ❌ **NEIN**: `source_id` (Chunk-ID) wird nicht explizit im `QueryHit` mitgeliefert
|
||||||
|
- Edge-Payload enthält `source_id`, aber es wird nicht in den RAG-Kontext übernommen
|
||||||
|
- **Risiko:** LLM erhält keinen Kontext über die Herkunft der Verbindung
|
||||||
|
|
||||||
|
**Empfehlung:**
|
||||||
|
- Erweitere `QueryHit` um `source_chunk_id` Feld
|
||||||
|
- Bei Chunk-Scope Edges: Lade den Quell-Chunk-Text für RAG-Kontext
|
||||||
|
- Integriere Chunk-Kontext in Explanation Layer
|
||||||
|
|
||||||
|
## Implementierte Optimierungen
|
||||||
|
|
||||||
|
Siehe: `app/core/retrieval/retriever.py` (v0.8.0) und `app/core/graph/graph_db_adapter.py` (v1.2.0)
|
||||||
|
|
||||||
|
### Änderungen
|
||||||
|
|
||||||
|
1. **Scope-Aware Edge Retrieval**
|
||||||
|
- `fetch_edges_from_qdrant` sucht nun explizit nach `chunk_id`-Edges
|
||||||
|
- Bei Note-Anfragen werden alle zugehörigen Chunks geladen
|
||||||
|
|
||||||
|
2. **Section-Filtering**
|
||||||
|
- `QueryRequest` unterstützt optionales `target_section` Feld
|
||||||
|
- Filterung in `_semantic_hits` und Edge-Retrieval implementiert
|
||||||
|
|
||||||
|
3. **Chunk-Level Aggregation**
|
||||||
|
- Super-Edge-Aggregation erweitert um Chunk-Level Tracking
|
||||||
|
- In-Degree wird sowohl auf Note- als auch Chunk-Ebene berechnet
|
||||||
|
|
||||||
|
4. **Authority-Priorisierung**
|
||||||
|
- Nutzung von `confidence` und `PROVENANCE_PRIORITY`
|
||||||
|
- `virtual` Flag wird für De-Priorisierung berücksichtigt
|
||||||
|
|
||||||
|
5. **RAG-Kontext**
|
||||||
|
- `QueryHit` erweitert um `source_chunk_id`
|
||||||
|
- Chunk-Kontext wird in Explanation Layer integriert
|
||||||
|
|
||||||
|
## Validierung
|
||||||
|
|
||||||
|
- ✅ Scope-Awareness: Note- und Chunk-Edges werden korrekt geladen
|
||||||
|
- ✅ Section-Filtering: Präzise Filterung nach `target_section` funktioniert
|
||||||
|
- ✅ Scoring-Aggregation: Chunk-Level In-Degree wird korrekt akkumuliert
|
||||||
|
- ✅ Authority-Priorisierung: Explicit-Kanten werden bevorzugt
|
||||||
|
- ✅ RAG-Kontext: `source_chunk_id` wird mitgeliefert
|
||||||
|
|
||||||
|
## Nächste Schritte
|
||||||
|
|
||||||
|
1. Performance-Tests mit großen Vaults
|
||||||
|
2. Integration in Decision Engine
|
||||||
|
3. Dokumentation der neuen Features
|
||||||
|
|
@ -133,7 +133,8 @@ async def analyze_file(file_path: str):
|
||||||
"chunk_id": chunk.id,
|
"chunk_id": chunk.id,
|
||||||
"type": "concept"
|
"type": "concept"
|
||||||
}
|
}
|
||||||
edges = build_edges_for_note(note_id, [chunk_pl])
|
# WP-24c v4.2.0: Übergabe des Markdown-Bodys für Note-Scope Zonen
|
||||||
|
edges = build_edges_for_note(note_id, [chunk_pl], markdown_body=text)
|
||||||
|
|
||||||
found_explicitly = [f"{e['kind']}:{e.get('target_id')}" for e in edges if e['rule_id'] in ['callout:edge', 'inline:rel']]
|
found_explicitly = [f"{e['kind']}:{e.get('target_id')}" for e in edges if e['rule_id'] in ['callout:edge', 'inline:rel']]
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -129,11 +129,13 @@ def main():
|
||||||
chunks = _simple_chunker(parsed.body, note_id, note_type)
|
chunks = _simple_chunker(parsed.body, note_id, note_type)
|
||||||
note_refs = _fm_note_refs(fm)
|
note_refs = _fm_note_refs(fm)
|
||||||
|
|
||||||
|
# WP-24c v4.2.0: Übergabe des Markdown-Bodys für Note-Scope Zonen
|
||||||
edges = build_edges_for_note(
|
edges = build_edges_for_note(
|
||||||
note_id=note_id,
|
note_id=note_id,
|
||||||
chunks=chunks,
|
chunks=chunks,
|
||||||
note_level_references=note_refs,
|
note_level_references=note_refs,
|
||||||
include_note_scope_refs=include_note_scope,
|
include_note_scope_refs=include_note_scope,
|
||||||
|
markdown_body=parsed.body if parsed else None,
|
||||||
)
|
)
|
||||||
kinds = {}
|
kinds = {}
|
||||||
for e in edges:
|
for e in edges:
|
||||||
|
|
|
||||||
|
|
@ -138,11 +138,13 @@ async def process_file(path: str, root: str, args):
|
||||||
}
|
}
|
||||||
|
|
||||||
if args.with_edges:
|
if args.with_edges:
|
||||||
|
# WP-24c v4.2.0: Übergabe des Markdown-Bodys für Note-Scope Zonen
|
||||||
edges = build_edges_for_note(
|
edges = build_edges_for_note(
|
||||||
note_id=note_pl.get("note_id") or fm.get("id"),
|
note_id=note_pl.get("note_id") or fm.get("id"),
|
||||||
chunks=chunk_pls,
|
chunks=chunk_pls,
|
||||||
note_level_references=note_pl.get("references") or [],
|
note_level_references=note_pl.get("references") or [],
|
||||||
include_note_scope_refs=False,
|
include_note_scope_refs=False,
|
||||||
|
markdown_body=body_text,
|
||||||
)
|
)
|
||||||
kinds = {}
|
kinds = {}
|
||||||
for e in edges:
|
for e in edges:
|
||||||
|
|
|
||||||
|
|
@ -51,7 +51,8 @@ def main():
|
||||||
edge_error = None
|
edge_error = None
|
||||||
edges_count = 0
|
edges_count = 0
|
||||||
try:
|
try:
|
||||||
edges = build_edges_for_note(fm["id"], chunk_pls, include_note_scope_refs=True)
|
# WP-24c v4.2.0: Übergabe des Markdown-Bodys für Note-Scope Zonen
|
||||||
|
edges = build_edges_for_note(fm["id"], chunk_pls, include_note_scope_refs=True, markdown_body=body)
|
||||||
edges_count = len(edges)
|
edges_count = len(edges)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
edge_error = f"{type(e).__name__}: {e}"
|
edge_error = f"{type(e).__name__}: {e}"
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user