- Introduced configurable edge scoring with internal and external boosts for intra-note edges. - Added aggregation configuration to support note-level and chunk-level retrieval strategies. - Updated retriever and graph subgraph modules to utilize new scoring and aggregation logic. - Enhanced YAML configuration to include new parameters for edge scoring and aggregation levels. - Added boolean indexing for filtering based on edge properties in the setup script.
233 lines
8.7 KiB
Python
233 lines
8.7 KiB
Python
"""
|
|
FILE: app/core/graph/graph_subgraph.py
|
|
DESCRIPTION: In-Memory Repräsentation eines Graphen für Scoring und Analyse.
|
|
Zentrale Komponente für die Graph-Expansion (BFS) und Bonus-Berechnung.
|
|
WP-15c Update: Erhalt von Metadaten (target_section, provenance)
|
|
für präzises Retrieval-Reasoning.
|
|
WP-24c v4.1.0: Scope-Awareness und Section-Filtering Support.
|
|
WP-26 v1.0: is_internal-Boost für Intra-Note-Edges.
|
|
VERSION: 1.4.0 (WP-26: Intra-Note-Edge-Boost)
|
|
STATUS: Active
|
|
"""
|
|
import os
|
|
import math
|
|
from functools import lru_cache
|
|
from collections import defaultdict
|
|
from typing import Dict, List, Optional, DefaultDict, Any, Set
|
|
from qdrant_client import QdrantClient
|
|
|
|
try:
|
|
import yaml
|
|
except ImportError:
|
|
yaml = None
|
|
|
|
# Lokale Paket-Imports
|
|
from .graph_weights import EDGE_BASE_WEIGHTS, calculate_edge_weight
|
|
from .graph_db_adapter import fetch_edges_from_qdrant
|
|
|
|
import logging
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@lru_cache
def get_edge_scoring_config() -> Dict[str, float]:
    """
    WP-26 v1.0: Load the edge-scoring configuration from retriever.yaml.

    The file path is read from the MINDNET_RETRIEVER_CONFIG environment
    variable and falls back to "config/retriever.yaml". Values come from the
    top-level "edge_scoring" mapping of that file. The built-in defaults are
    returned when PyYAML is not importable, when the file does not exist, or
    when loading/parsing fails (a warning is logged in the last case).

    The result is memoized by lru_cache: later calls reuse the first result
    and every caller receives the same dict object, so treat it as read-only.

    Returns:
        Dict with the keys "internal_edge_boost" and "external_edge_boost".
    """
    config = {
        "internal_edge_boost": 1.2,  # +20% boost for intra-note edges
        "external_edge_boost": 1.0,  # neutral multiplier for inter-note edges
    }

    config_path = os.getenv("MINDNET_RETRIEVER_CONFIG", "config/retriever.yaml")
    if yaml and os.path.exists(config_path):
        try:
            with open(config_path, "r", encoding="utf-8") as f:
                data = yaml.safe_load(f) or {}

            # An empty "edge_scoring:" key parses to None. Treat it like a
            # missing section instead of failing on None.get(...) and logging
            # a misleading "could not be loaded" warning.
            edge_scoring = data.get("edge_scoring") or {}

            # Keys are applied in order; a value that cannot be converted to
            # float aborts here and leaves the remaining keys at their defaults.
            for key in ("internal_edge_boost", "external_edge_boost"):
                config[key] = float(edge_scoring.get(key, config[key]))
        except Exception as e:
            # Best effort: a broken config file must not break retrieval.
            logger.warning(f"Edge-Scoring-Konfiguration konnte nicht geladen werden: {e}")

    return config
|
|
|
|
class Subgraph:
    """
    Lightweight subgraph with adjacency lists and degree counters.

    Used to compute graph bonuses in the retriever.
    """

    def __init__(self) -> None:
        # Forward adjacency: source id -> list of full edge dicts (the complete
        # payload is kept, not just a (source, target, kind) triple).
        self.adj: DefaultDict[str, List[Dict]] = defaultdict(list)
        # Reverse adjacency: target id -> list of edge dicts pointing at it.
        self.reverse_adj: DefaultDict[str, List[Dict]] = defaultdict(list)
        self.in_degree: DefaultDict[str, int] = defaultdict(int)
        self.out_degree: DefaultDict[str, int] = defaultdict(int)
        # WP-24c v4.1.0: chunk-level in-degree for precise scoring aggregation.
        # No method of this class writes to it.
        self.chunk_level_in_degree: DefaultDict[str, int] = defaultdict(int)

    def add_edge(self, e: Dict) -> None:
        """
        Add an edge and update the adjacency and degree indexes.

        WP-15c: the edge metadata is stored for the explanation layer.
        WP-26 v1.0: the weight is multiplied by the internal or external
        boost from get_edge_scoring_config(), depending on "is_internal".

        Args:
            e: Edge dict. Reads "source", "target", "kind", "weight",
               "is_internal", "provenance", "confidence", "target_section",
               "is_super_edge", "virtual", "chunk_id" and "note_id".
               Edges without a truthy "source" or "target" are ignored.
        """
        src = e.get("source")
        tgt = e.get("target")
        # Validate first so that unusable edges cost nothing.
        if not src or not tgt:
            return

        kind = e.get("kind")

        # Base weight from the payload; a missing or explicit None weight
        # falls back to the per-kind base weight (0.0 for unknown kinds).
        base_weight = e.get("weight")
        if base_weight is None:
            base_weight = EDGE_BASE_WEIGHTS.get(kind, 0.0)

        # WP-26 v1.0: apply the is_internal boost.
        is_internal = e.get("is_internal", False)
        edge_scoring = get_edge_scoring_config()
        boost_key = "internal_edge_boost" if is_internal else "external_edge_boost"
        final_weight = base_weight * edge_scoring[boost_key]

        # Keep the relevant metadata on the stored edge object.
        edge_data = {
            "source": src,
            "target": tgt,
            "kind": kind,
            "weight": final_weight,
            "provenance": e.get("provenance", "rule"),
            "confidence": e.get("confidence", 1.0),
            "target_section": e.get("target_section"),  # essential for precision
            "is_super_edge": e.get("is_super_edge", False),
            "virtual": e.get("virtual", False),  # WP-24c v4.1.0: authority prioritisation
            "chunk_id": e.get("chunk_id"),  # WP-24c v4.1.0: RAG context
            "is_internal": is_internal,  # WP-26 v1.0: kept for debugging
        }

        # 1. Forward edge
        self.adj[src].append(edge_data)
        self.out_degree[src] += 1
        self.in_degree[tgt] += 1

        # 2. Reverse edge (explanation layer & backlinks); shares the same dict
        #    object as the forward entry.
        self.reverse_adj[tgt].append(edge_data)

        # 3. Context-note handling: when the owning note differs from the
        #    source, add a virtual owner -> target edge.
        owner = e.get("note_id")
        if owner and owner != src:
            ctx_edge = edge_data.copy()
            ctx_edge["source"] = owner
            ctx_edge["via_context"] = True

            self.adj[owner].append(ctx_edge)
            self.out_degree[owner] += 1
            if owner != tgt:
                self.reverse_adj[tgt].append(ctx_edge)
                # The owner's own in-degree is bumped here (not the target's),
                # which raises the owning note's centrality_bonus.
                self.in_degree[owner] += 1

    def aggregate_edge_bonus(self, node_id: str) -> float:
        """Return the sum of the weights of all outgoing edges (hub score)."""
        return sum(edge["weight"] for edge in self.adj.get(node_id, []))

    def edge_bonus(self, node_id: str) -> float:
        """Retriever-facing alias of aggregate_edge_bonus (WP-04a compatibility)."""
        return self.aggregate_edge_bonus(node_id)

    def centrality_bonus(self, node_id: str) -> float:
        """
        Return a log-damped centrality bonus based on the node's in-degree.

        Computed as log(1 + in_degree) / 10, capped at 0.15; 0.0 for nodes
        without incoming edges.
        """
        indeg = self.in_degree.get(node_id, 0)
        if indeg <= 0:
            return 0.0
        # math.log1p(x) equals log(1 + x)
        return min(math.log1p(indeg) / 10.0, 0.15)

    def get_outgoing_edges(self, node_id: str) -> List[Dict[str, Any]]:
        """
        Return all outgoing edges of a node including their metadata.

        Uses .get() so that unknown nodes yield an empty list without creating
        an entry in the defaultdict. For known nodes the stored list itself is
        returned, not a copy.
        """
        return self.adj.get(node_id, [])

    def get_incoming_edges(self, node_id: str) -> List[Dict[str, Any]]:
        """
        Return all incoming edges of a node including their metadata.

        Uses .get() so that unknown nodes yield an empty list without creating
        an entry in the defaultdict. For known nodes the stored list itself is
        returned, not a copy.
        """
        return self.reverse_adj.get(node_id, [])
|
|
|
|
|
|
def expand(
    client: QdrantClient,
    prefix: str,
    seeds: List[str],
    depth: int = 1,
    edge_types: Optional[List[str]] = None,
    chunk_ids: Optional[List[str]] = None,
    target_section: Optional[str] = None,
) -> Subgraph:
    """
    Expand from the seeds along edges up to a given depth (BFS).

    WP-24c v4.1.0: supports scope awareness (chunk_ids) and section filtering.
    WP-26 v1.0: forwards the "is_internal" flag of each payload so that
    Subgraph.add_edge can apply the intra-note boost.

    Args:
        client: Qdrant client
        prefix: Collection prefix
        seeds: List of note ids to start the expansion from
        depth: Maximum expansion depth; values <= 0 yield an empty subgraph
        edge_types: Optional filter on edge kinds
        chunk_ids: Optional list of chunk ids for scope awareness
        target_section: Optional section filter

    Returns:
        The Subgraph containing every edge fetched during the expansion.
    """
    sg = Subgraph()
    frontier = set(seeds)
    visited = set()

    for _ in range(max(depth, 0)):
        if not frontier:
            break

        # WP-24c v4.1.0: edge retrieval with scope awareness and section filtering
        payloads = fetch_edges_from_qdrant(
            client, prefix, list(frontier),
            edge_types=edge_types,
            chunk_ids=chunk_ids,
            target_section=target_section,
        )
        next_frontier: Set[str] = set()

        for pl in payloads:
            src, tgt = pl.get("source_id"), pl.get("target_id")
            if not src or not tgt:
                continue

            # WP-15c: hand the full edge metadata to add_edge.
            # NOTE(review): calculate_edge_weight is defined in graph_weights;
            # confirm it does not already apply the internal-edge boost,
            # otherwise forwarding "is_internal" would count it twice.
            edge_payload = {
                "source": src,
                "target": tgt,
                "kind": pl.get("kind", "edge"),
                "weight": calculate_edge_weight(pl),
                "note_id": pl.get("note_id"),
                "provenance": pl.get("provenance", "rule"),
                "confidence": pl.get("confidence", 1.0),
                "target_section": pl.get("target_section"),
                "virtual": pl.get("virtual", False),  # WP-24c v4.1.0: authority prioritisation
                "chunk_id": pl.get("chunk_id"),  # WP-24c v4.1.0: RAG context
                # add_edge reads both flags; without forwarding them they
                # always fell back to False and the WP-26 boost never applied.
                "is_super_edge": pl.get("is_super_edge", False),
                "is_internal": pl.get("is_internal", False),
            }

            sg.add_edge(edge_payload)

            # BFS: queue targets that have not been expanded yet.
            if tgt not in visited:
                next_frontier.add(str(tgt))

        # Nodes of the current level count as visited only after the level is
        # processed; the set difference removes them from the next frontier.
        visited |= frontier
        frontier = next_frontier - visited

    return sg