Implement WP-15c enhancements across the graph and retrieval modules, including full metadata support for Super-Edge aggregation and Note-Level Diversity Pooling. Update the scoring logic to reflect the new edge handling and improve retrieval accuracy. Bump the module versions to reflect these changes.

This commit is contained in:
Lars 2025-12-30 21:47:18 +01:00
parent cd5056d4c9
commit d35bdc64b9
5 changed files with 202 additions and 91 deletions

View File

@ -1,13 +1,15 @@
""" """
FILE: app/core/graph/graph_db_adapter.py FILE: app/core/graph/graph_db_adapter.py
DESCRIPTION: Datenbeschaffung aus Qdrant für den Graphen. DESCRIPTION: Datenbeschaffung aus Qdrant für den Graphen.
AUDIT v1.1.0: Nutzt nun die zentrale database-Infrastruktur für Namen. AUDIT v1.1.1: Volle Unterstützung für WP-15c Metadaten.
Stellt sicher, dass 'target_section' und 'provenance' für die
Super-Edge-Aggregation im Retriever geladen werden.
""" """
from typing import List, Dict, Optional from typing import List, Dict, Optional
from qdrant_client import QdrantClient from qdrant_client import QdrantClient
from qdrant_client.http import models as rest from qdrant_client.http import models as rest
# ENTSCHEIDENDER FIX: Nutzt die neue Infrastruktur für konsistente Collection-Namen # Nutzt die zentrale Infrastruktur für konsistente Collection-Namen (WP-14)
from app.core.database import collection_names from app.core.database import collection_names
def fetch_edges_from_qdrant( def fetch_edges_from_qdrant(
@ -19,14 +21,16 @@ def fetch_edges_from_qdrant(
) -> List[Dict]: ) -> List[Dict]:
""" """
Holt Edges aus der Datenbank basierend auf Seed-IDs. Holt Edges aus der Datenbank basierend auf Seed-IDs.
Filtert auf source_id, target_id oder note_id. WP-15c: Erhält alle Metadaten für das Note-Level Diversity Pooling.
""" """
if not seeds or limit <= 0: if not seeds or limit <= 0:
return [] return []
# Konsistente Namensauflösung via database-Paket # Konsistente Namensauflösung via database-Paket
# Rückgabe: (notes_col, chunks_col, edges_col)
_, _, edges_col = collection_names(prefix) _, _, edges_col = collection_names(prefix)
# Wir suchen Kanten, bei denen die Seed-IDs entweder Quelle, Ziel oder Kontext-Note sind.
seed_conditions = [] seed_conditions = []
for field in ("source_id", "target_id", "note_id"): for field in ("source_id", "target_id", "note_id"):
for s in seeds: for s in seeds:
@ -35,6 +39,7 @@ def fetch_edges_from_qdrant(
) )
seeds_filter = rest.Filter(should=seed_conditions) if seed_conditions else None seeds_filter = rest.Filter(should=seed_conditions) if seed_conditions else None
# Optionaler Filter auf spezifische Kanten-Typen (z.B. für Intent-Routing)
type_filter = None type_filter = None
if edge_types: if edge_types:
type_conds = [ type_conds = [
@ -52,6 +57,7 @@ def fetch_edges_from_qdrant(
flt = rest.Filter(must=must) if must else None flt = rest.Filter(must=must) if must else None
# Abfrage via Qdrant Scroll API # Abfrage via Qdrant Scroll API
# WICHTIG: with_payload=True lädt alle Metadaten (target_section, provenance etc.)
pts, _ = client.scroll( pts, _ = client.scroll(
collection_name=edges_col, collection_name=edges_col,
scroll_filter=flt, scroll_filter=flt,
@ -60,4 +66,6 @@ def fetch_edges_from_qdrant(
with_vectors=False, with_vectors=False,
) )
# Wir geben das vollständige Payload zurück, damit der Retriever
# alle Signale für die Super-Edge-Aggregation und das Scoring hat.
return [dict(p.payload) for p in pts if p.payload] return [dict(p.payload) for p in pts if p.payload]

View File

@ -1,10 +1,10 @@
""" """
FILE: app/core/graph/graph_derive_edges.py FILE: app/core/graph/graph_derive_edges.py
DESCRIPTION: Hauptlogik zur Kanten-Aggregation und De-Duplizierung. DESCRIPTION: Hauptlogik zur Kanten-Aggregation und De-Duplizierung.
AUDIT: WP-15b/c Audit:
- Nutzt parse_link_target - Präzises Sektions-Splitting via parse_link_target.
- Übergibt Section als 'variant' an ID-Gen - Eindeutige ID-Generierung pro Sektions-Variante (Multigraph).
- Dedup basiert jetzt auf Edge-ID (erlaubt Multigraph für Sections) - Ermöglicht dem Retriever die Super-Edge-Aggregation.
""" """
from typing import List, Optional, Dict, Tuple from typing import List, Optional, Dict, Tuple
from .graph_utils import ( from .graph_utils import (
@ -21,31 +21,45 @@ def build_edges_for_note(
note_level_references: Optional[List[str]] = None, note_level_references: Optional[List[str]] = None,
include_note_scope_refs: bool = False, include_note_scope_refs: bool = False,
) -> List[dict]: ) -> List[dict]:
"""Erzeugt und aggregiert alle Kanten für eine Note (WP-15b).""" """
Erzeugt und aggregiert alle Kanten für eine Note.
Sorgt für die physische Trennung von Sektions-Links via Edge-ID.
"""
edges: List[dict] = [] edges: List[dict] = []
# note_type für die Ermittlung der edge_defaults (types.yaml)
note_type = _get(chunks[0], "type") if chunks else "concept" note_type = _get(chunks[0], "type") if chunks else "concept"
# 1) Struktur-Kanten (belongs_to, next/prev) # 1) Struktur-Kanten (Internal: belongs_to, next/prev)
# Diese erhalten die Provenienz 'structure' und sind in der Registry geschützt.
for idx, ch in enumerate(chunks): for idx, ch in enumerate(chunks):
cid = _get(ch, "chunk_id", "id") cid = _get(ch, "chunk_id", "id")
if not cid: continue if not cid: continue
# Verbindung Chunk -> Note
edges.append(_edge("belongs_to", "chunk", cid, note_id, note_id, { edges.append(_edge("belongs_to", "chunk", cid, note_id, note_id, {
"chunk_id": cid, "edge_id": _mk_edge_id("belongs_to", cid, note_id, "chunk", "structure:belongs_to"), "chunk_id": cid,
"provenance": "structure", "rule_id": "structure:belongs_to", "confidence": PROVENANCE_PRIORITY["structure:belongs_to"] "edge_id": _mk_edge_id("belongs_to", cid, note_id, "chunk", "structure:belongs_to"),
"provenance": "structure",
"rule_id": "structure:belongs_to",
"confidence": PROVENANCE_PRIORITY["structure:belongs_to"]
})) }))
# Horizontale Verkettung (Ordnung)
if idx < len(chunks) - 1: if idx < len(chunks) - 1:
next_id = _get(chunks[idx+1], "chunk_id", "id") next_id = _get(chunks[idx+1], "chunk_id", "id")
if next_id: if next_id:
edges.append(_edge("next", "chunk", cid, next_id, note_id, { edges.append(_edge("next", "chunk", cid, next_id, note_id, {
"chunk_id": cid, "edge_id": _mk_edge_id("next", cid, next_id, "chunk", "structure:order"), "chunk_id": cid,
"edge_id": _mk_edge_id("next", cid, next_id, "chunk", "structure:order"),
"provenance": "structure", "rule_id": "structure:order", "confidence": PROVENANCE_PRIORITY["structure:order"] "provenance": "structure", "rule_id": "structure:order", "confidence": PROVENANCE_PRIORITY["structure:order"]
})) }))
edges.append(_edge("prev", "chunk", next_id, cid, note_id, { edges.append(_edge("prev", "chunk", next_id, cid, note_id, {
"chunk_id": next_id, "edge_id": _mk_edge_id("prev", next_id, cid, "chunk", "structure:order"), "chunk_id": next_id,
"edge_id": _mk_edge_id("prev", next_id, cid, "chunk", "structure:order"),
"provenance": "structure", "rule_id": "structure:order", "confidence": PROVENANCE_PRIORITY["structure:order"] "provenance": "structure", "rule_id": "structure:order", "confidence": PROVENANCE_PRIORITY["structure:order"]
})) }))
# 2) Inhaltliche Kanten # 2) Inhaltliche Kanten (Explicit & Candidate Pool)
reg = load_types_registry() reg = load_types_registry()
defaults = get_edge_defaults_for(note_type, reg) defaults = get_edge_defaults_for(note_type, reg)
refs_all: List[str] = [] refs_all: List[str] = []
@ -55,7 +69,7 @@ def build_edges_for_note(
if not cid: continue if not cid: continue
raw = _get(ch, "window") or _get(ch, "text") or "" raw = _get(ch, "window") or _get(ch, "text") or ""
# Typed & Candidate Pool (WP-15b Integration) # A. Typed Relations (Inline [[rel:kind|target]])
typed, rem = extract_typed_relations(raw) typed, rem = extract_typed_relations(raw)
for k, raw_t in typed: for k, raw_t in typed:
t, sec = parse_link_target(raw_t, note_id) t, sec = parse_link_target(raw_t, note_id)
@ -63,14 +77,14 @@ def build_edges_for_note(
payload = { payload = {
"chunk_id": cid, "chunk_id": cid,
# Variant=sec sorgt für eindeutige ID pro Abschnitt # WP-Fix: Variant=sec sorgt für eindeutige ID pro Sektion
"edge_id": _mk_edge_id(k, cid, t, "chunk", "inline:rel", variant=sec), "edge_id": _mk_edge_id(k, cid, t, "chunk", "inline:rel", variant=sec),
"provenance": "explicit", "rule_id": "inline:rel", "confidence": PROVENANCE_PRIORITY["inline:rel"] "provenance": "explicit", "rule_id": "inline:rel", "confidence": PROVENANCE_PRIORITY["inline:rel"]
} }
if sec: payload["target_section"] = sec if sec: payload["target_section"] = sec
edges.append(_edge(k, "chunk", cid, t, note_id, payload)) edges.append(_edge(k, "chunk", cid, t, note_id, payload))
# B. Candidate Pool (WP-15b Validierte KI-Kanten)
pool = ch.get("candidate_pool") or ch.get("candidate_edges") or [] pool = ch.get("candidate_pool") or ch.get("candidate_edges") or []
for cand in pool: for cand in pool:
raw_t, k, p = cand.get("to"), cand.get("kind", "related_to"), cand.get("provenance", "semantic_ai") raw_t, k, p = cand.get("to"), cand.get("kind", "related_to"), cand.get("provenance", "semantic_ai")
@ -82,10 +96,9 @@ def build_edges_for_note(
"provenance": p, "rule_id": f"candidate:{p}", "confidence": PROVENANCE_PRIORITY.get(p, 0.90) "provenance": p, "rule_id": f"candidate:{p}", "confidence": PROVENANCE_PRIORITY.get(p, 0.90)
} }
if sec: payload["target_section"] = sec if sec: payload["target_section"] = sec
edges.append(_edge(k, "chunk", cid, t, note_id, payload)) edges.append(_edge(k, "chunk", cid, t, note_id, payload))
# Callouts & Wikilinks # C. Callouts (> [!edge])
call_pairs, rem2 = extract_callout_relations(rem) call_pairs, rem2 = extract_callout_relations(rem)
for k, raw_t in call_pairs: for k, raw_t in call_pairs:
t, sec = parse_link_target(raw_t, note_id) t, sec = parse_link_target(raw_t, note_id)
@ -97,9 +110,9 @@ def build_edges_for_note(
"provenance": "explicit", "rule_id": "callout:edge", "confidence": PROVENANCE_PRIORITY["callout:edge"] "provenance": "explicit", "rule_id": "callout:edge", "confidence": PROVENANCE_PRIORITY["callout:edge"]
} }
if sec: payload["target_section"] = sec if sec: payload["target_section"] = sec
edges.append(_edge(k, "chunk", cid, t, note_id, payload)) edges.append(_edge(k, "chunk", cid, t, note_id, payload))
# D. Standard Wikilinks & Typ-Defaults
refs = extract_wikilinks(rem2) refs = extract_wikilinks(rem2)
for raw_r in refs: for raw_r in refs:
r, sec = parse_link_target(raw_r, note_id) r, sec = parse_link_target(raw_r, note_id)
@ -111,9 +124,9 @@ def build_edges_for_note(
"provenance": "explicit", "rule_id": "explicit:wikilink", "confidence": PROVENANCE_PRIORITY["explicit:wikilink"] "provenance": "explicit", "rule_id": "explicit:wikilink", "confidence": PROVENANCE_PRIORITY["explicit:wikilink"]
} }
if sec: payload["target_section"] = sec if sec: payload["target_section"] = sec
edges.append(_edge("references", "chunk", cid, r, note_id, payload)) edges.append(_edge("references", "chunk", cid, r, note_id, payload))
# Automatische Kanten-Vererbung aus types.yaml
for rel in defaults: for rel in defaults:
if rel != "references": if rel != "references":
def_payload = { def_payload = {
@ -124,13 +137,10 @@ def build_edges_for_note(
if sec: def_payload["target_section"] = sec if sec: def_payload["target_section"] = sec
edges.append(_edge(rel, "chunk", cid, r, note_id, def_payload)) edges.append(_edge(rel, "chunk", cid, r, note_id, def_payload))
# Für Note-Scope Sammlung nutzen wir den Original-String zur Dedup, aber gesäubert
refs_all.extend([parse_link_target(r, note_id)[0] for r in refs]) refs_all.extend([parse_link_target(r, note_id)[0] for r in refs])
# 3) Note-Scope & De-Duplizierung # 3) Note-Scope (Grobe Struktur-Verbindungen)
if include_note_scope_refs: if include_note_scope_refs:
# refs_all ist jetzt schon gesäubert (nur Targets)
# note_level_references müssen auch gesäubert werden
cleaned_note_refs = [parse_link_target(r, note_id)[0] for r in (note_level_references or [])] cleaned_note_refs = [parse_link_target(r, note_id)[0] for r in (note_level_references or [])]
refs_note = _dedupe_seq((refs_all or []) + cleaned_note_refs) refs_note = _dedupe_seq((refs_all or []) + cleaned_note_refs)
@ -140,17 +150,19 @@ def build_edges_for_note(
"edge_id": _mk_edge_id("references", note_id, r, "note", "explicit:note_scope"), "edge_id": _mk_edge_id("references", note_id, r, "note", "explicit:note_scope"),
"provenance": "explicit", "confidence": PROVENANCE_PRIORITY["explicit:note_scope"] "provenance": "explicit", "confidence": PROVENANCE_PRIORITY["explicit:note_scope"]
})) }))
# Backlinks zur Stärkung der Bidirektionalität
edges.append(_edge("backlink", "note", r, note_id, note_id, { edges.append(_edge("backlink", "note", r, note_id, note_id, {
"edge_id": _mk_edge_id("backlink", r, note_id, "note", "derived:backlink"), "edge_id": _mk_edge_id("backlink", r, note_id, "note", "derived:backlink"),
"provenance": "rule", "confidence": PROVENANCE_PRIORITY["derived:backlink"] "provenance": "rule", "confidence": PROVENANCE_PRIORITY["derived:backlink"]
})) }))
# Deduplizierung: Wir nutzen jetzt die EDGE-ID als Schlüssel. # 4) De-Duplizierung (In-Place)
# Da die Edge-ID nun 'variant' (Section) enthält, bleiben unterschiedliche Sections erhalten. # Da die EDGE-ID nun die Sektion (variant) enthält, bleiben Links auf
# unterschiedliche Abschnitte derselben Note erhalten.
unique_map: Dict[str, dict] = {} unique_map: Dict[str, dict] = {}
for e in edges: for e in edges:
eid = e["edge_id"] eid = e["edge_id"]
# Bei Konflikt (gleiche ID = exakt gleiche Kante und Section) gewinnt die höhere Confidence # Höhere Confidence gewinnt bei identischer ID
if eid not in unique_map or e.get("confidence", 0) > unique_map[eid].get("confidence", 0): if eid not in unique_map or e.get("confidence", 0) > unique_map[eid].get("confidence", 0):
unique_map[eid] = e unique_map[eid] = e

View File

@ -2,8 +2,9 @@
FILE: app/core/graph/graph_subgraph.py FILE: app/core/graph/graph_subgraph.py
DESCRIPTION: In-Memory Repräsentation eines Graphen für Scoring und Analyse. DESCRIPTION: In-Memory Repräsentation eines Graphen für Scoring und Analyse.
Zentrale Komponente für die Graph-Expansion (BFS) und Bonus-Berechnung. Zentrale Komponente für die Graph-Expansion (BFS) und Bonus-Berechnung.
MODULARISIERUNG: Teil des graph-Pakets (WP-14). WP-15c Update: Erhalt von Metadaten (target_section, provenance)
VERSION: 1.1.0 für präzises Retrieval-Reasoning.
VERSION: 1.2.0
STATUS: Active STATUS: Active
""" """
import math import math
@ -22,6 +23,7 @@ class Subgraph:
""" """
def __init__(self) -> None: def __init__(self) -> None:
# adj speichert nun vollständige Payloads statt nur Tripel
self.adj: DefaultDict[str, List[Dict]] = defaultdict(list) self.adj: DefaultDict[str, List[Dict]] = defaultdict(list)
self.reverse_adj: DefaultDict[str, List[Dict]] = defaultdict(list) self.reverse_adj: DefaultDict[str, List[Dict]] = defaultdict(list)
self.in_degree: DefaultDict[str, int] = defaultdict(int) self.in_degree: DefaultDict[str, int] = defaultdict(int)
@ -30,31 +32,49 @@ class Subgraph:
def add_edge(self, e: Dict) -> None: def add_edge(self, e: Dict) -> None:
""" """
Fügt eine Kante hinzu und aktualisiert Indizes. Fügt eine Kante hinzu und aktualisiert Indizes.
Unterstützt Kontext-Notes für verbesserte Graph-Konnektivität. WP-15c: Speichert das vollständige Payload für den Explanation Layer.
""" """
src = e.get("source") src = e.get("source")
tgt = e.get("target") tgt = e.get("target")
kind = e.get("kind") kind = e.get("kind")
weight = e.get("weight", EDGE_BASE_WEIGHTS.get(kind, 0.0))
# Das gesamte Payload wird als Kanten-Objekt behalten
# Wir stellen sicher, dass alle relevanten Metadaten vorhanden sind
edge_data = {
"source": src,
"target": tgt,
"kind": kind,
"weight": e.get("weight", EDGE_BASE_WEIGHTS.get(kind, 0.0)),
"provenance": e.get("provenance", "rule"),
"confidence": e.get("confidence", 1.0),
"target_section": e.get("target_section"), # Essentiell für Präzision
"is_super_edge": e.get("is_super_edge", False)
}
owner = e.get("note_id") owner = e.get("note_id")
if not src or not tgt: if not src or not tgt:
return return
# 1. Forward-Kante # 1. Forward-Kante
self.adj[src].append({"target": tgt, "kind": kind, "weight": weight}) self.adj[src].append(edge_data)
self.out_degree[src] += 1 self.out_degree[src] += 1
self.in_degree[tgt] += 1 self.in_degree[tgt] += 1
# 2. Reverse-Kante (für WP-04b Explanation Layer) # 2. Reverse-Kante (für Explanation Layer & Backlinks)
self.reverse_adj[tgt].append({"source": src, "kind": kind, "weight": weight}) self.reverse_adj[tgt].append(edge_data)
# 3. Kontext-Note Handling (erhöht die Zentralität der Parent-Note) # 3. Kontext-Note Handling (erhöht die Zentralität der Parent-Note)
if owner and owner != src: if owner and owner != src:
self.adj[owner].append({"target": tgt, "kind": kind, "weight": weight}) # Wir erstellen eine virtuelle Kontext-Kante
ctx_edge = edge_data.copy()
ctx_edge["source"] = owner
ctx_edge["via_context"] = True
self.adj[owner].append(ctx_edge)
self.out_degree[owner] += 1 self.out_degree[owner] += 1
if owner != tgt: if owner != tgt:
self.reverse_adj[tgt].append({"source": owner, "kind": kind, "weight": weight, "via_context": True}) self.reverse_adj[tgt].append(ctx_edge)
self.in_degree[owner] += 1 self.in_degree[owner] += 1
def aggregate_edge_bonus(self, node_id: str) -> float: def aggregate_edge_bonus(self, node_id: str) -> float:
@ -73,14 +93,15 @@ class Subgraph:
indeg = self.in_degree.get(node_id, 0) indeg = self.in_degree.get(node_id, 0)
if indeg <= 0: if indeg <= 0:
return 0.0 return 0.0
# math.log1p(x) entspricht log(1+x)
return min(math.log1p(indeg) / 10.0, 0.15) return min(math.log1p(indeg) / 10.0, 0.15)
def get_outgoing_edges(self, node_id: str) -> List[Dict[str, Any]]: def get_outgoing_edges(self, node_id: str) -> List[Dict[str, Any]]:
"""Gibt alle ausgehenden Kanten einer Node zurück.""" """Gibt alle ausgehenden Kanten einer Node inkl. Metadaten zurück."""
return self.adj.get(node_id, []) return self.adj.get(node_id, [])
def get_incoming_edges(self, node_id: str) -> List[Dict[str, Any]]: def get_incoming_edges(self, node_id: str) -> List[Dict[str, Any]]:
"""Gibt alle eingehenden Kanten einer Node zurück.""" """Gibt alle eingehenden Kanten einer Node inkl. Metadaten zurück."""
return self.reverse_adj.get(node_id, []) return self.reverse_adj.get(node_id, [])
@ -111,13 +132,19 @@ def expand(
src, tgt = pl.get("source_id"), pl.get("target_id") src, tgt = pl.get("source_id"), pl.get("target_id")
if not src or not tgt: continue if not src or not tgt: continue
sg.add_edge({ # WP-15c: Wir übergeben das vollständige Payload an add_edge
edge_payload = {
"source": src, "source": src,
"target": tgt, "target": tgt,
"kind": pl.get("kind", "edge"), "kind": pl.get("kind", "edge"),
"weight": calculate_edge_weight(pl), "weight": calculate_edge_weight(pl),
"note_id": pl.get("note_id"), "note_id": pl.get("note_id"),
}) "provenance": pl.get("provenance", "rule"),
"confidence": pl.get("confidence", 1.0),
"target_section": pl.get("target_section")
}
sg.add_edge(edge_payload)
# BFS Logik: Neue Ziele in die nächste Frontier aufnehmen # BFS Logik: Neue Ziele in die nächste Frontier aufnehmen
if tgt not in visited: if tgt not in visited:

View File

@ -1,9 +1,8 @@
""" """
FILE: app/core/retrieval/retriever.py FILE: app/core/retrieval/retriever.py
DESCRIPTION: Haupt-Schnittstelle für die Suche. Orchestriert Vektorsuche und Graph-Expansion. DESCRIPTION: Haupt-Schnittstelle für die Suche. Orchestriert Vektorsuche und Graph-Expansion.
Nutzt retriever_scoring.py für die WP-22 Logik. WP-15c Update: Note-Level Diversity Pooling & Super-Edge Aggregation.
MODULARISIERUNG: Verschoben in das retrieval-Paket für WP-14. VERSION: 0.7.0
VERSION: 0.6.16
STATUS: Active STATUS: Active
DEPENDENCIES: app.config, app.models.dto, app.core.database*, app.core.graph_adapter DEPENDENCIES: app.config, app.models.dto, app.core.database*, app.core.graph_adapter
""" """
@ -13,6 +12,7 @@ import os
import time import time
import logging import logging
from typing import Any, Dict, List, Tuple, Iterable, Optional from typing import Any, Dict, List, Tuple, Iterable, Optional
from collections import defaultdict
from app.config import get_settings from app.config import get_settings
from app.models.dto import ( from app.models.dto import (
@ -89,7 +89,6 @@ def _build_explanation(
) -> Explanation: ) -> Explanation:
""" """
Transformiert mathematische Scores und Graph-Signale in eine menschenlesbare Erklärung. Transformiert mathematische Scores und Graph-Signale in eine menschenlesbare Erklärung.
Behebt Pydantic ValidationErrors durch explizite String-Sicherung.
""" """
_, edge_w_cfg, _ = get_weights() _, edge_w_cfg, _ = get_weights()
base_val = scoring_debug["base_val"] base_val = scoring_debug["base_val"]
@ -116,12 +115,22 @@ def _build_explanation(
elif semantic_score > 0.70: elif semantic_score > 0.70:
reasons.append(Reason(kind="semantic", message="Inhaltliche Übereinstimmung.", score_impact=base_val)) reasons.append(Reason(kind="semantic", message="Inhaltliche Übereinstimmung.", score_impact=base_val))
# 3. Gründe für Typ und Lifecycle # 3. Gründe für Typ und Lifecycle (WP-25 Vorbereitung)
type_weight = float(payload.get("retriever_weight", 1.0)) type_weight = float(payload.get("retriever_weight", 1.0))
if type_weight != 1.0: if type_weight != 1.0:
msg = "Bevorzugt" if type_weight > 1.0 else "De-priorisiert" msg = "Bevorzugt" if type_weight > 1.0 else "De-priorisiert"
reasons.append(Reason(kind="type", message=f"{msg} durch Typ-Profil.", score_impact=base_val * (type_weight - 1.0))) reasons.append(Reason(kind="type", message=f"{msg} durch Typ-Profil.", score_impact=base_val * (type_weight - 1.0)))
# NEU: Explizite Ausweisung des Lifecycle-Status (WP-22)
status_mult = scoring_debug.get("status_multiplier", 1.0)
if status_mult != 1.0:
status_msg = "Belohnt (Stable)" if status_mult > 1.0 else "De-priorisiert (Draft)"
reasons.append(Reason(
kind="status",
message=f"{status_msg} durch Content-Lifecycle.",
score_impact=semantic_score * (status_mult - 1.0)
))
# 4. Kanten-Verarbeitung (Graph-Intelligence) # 4. Kanten-Verarbeitung (Graph-Intelligence)
if subgraph and target_note_id and scoring_debug["edge_bonus"] > 0: if subgraph and target_note_id and scoring_debug["edge_bonus"] > 0:
raw_edges = [] raw_edges = []
@ -131,7 +140,6 @@ def _build_explanation(
raw_edges.extend(subgraph.get_outgoing_edges(target_note_id) or []) raw_edges.extend(subgraph.get_outgoing_edges(target_note_id) or [])
for edge in raw_edges: for edge in raw_edges:
# FIX: Zwingende String-Konvertierung für Pydantic-Stabilität
src = str(edge.get("source") or "note_root") src = str(edge.get("source") or "note_root")
tgt = str(edge.get("target") or target_note_id or "unknown_target") tgt = str(edge.get("target") or target_note_id or "unknown_target")
kind = str(edge.get("kind", "related_to")) kind = str(edge.get("kind", "related_to"))
@ -187,10 +195,14 @@ def _build_hits_from_semantic(
explain: bool = False, explain: bool = False,
dynamic_edge_boosts: Dict[str, float] = None dynamic_edge_boosts: Dict[str, float] = None
) -> QueryResponse: ) -> QueryResponse:
"""Wandelt semantische Roh-Treffer in bewertete QueryHits um.""" """
Wandelt semantische Roh-Treffer in bewertete QueryHits um.
WP-15c: Implementiert Note-Level Diversity Pooling.
"""
t0 = time.time() t0 = time.time()
enriched = [] enriched = []
# Erstes Scoring für alle Kandidaten
for pid, semantic_score, payload in hits: for pid, semantic_score, payload in hits:
edge_bonus, cent_bonus = 0.0, 0.0 edge_bonus, cent_bonus = 0.0, 0.0
target_id = payload.get("note_id") target_id = payload.get("note_id")
@ -202,15 +214,30 @@ def _build_hits_from_semantic(
except Exception: except Exception:
pass pass
# Mathematisches Scoring via WP-22 Engine
debug_data = compute_wp22_score( debug_data = compute_wp22_score(
semantic_score, payload, edge_bonus, cent_bonus, dynamic_edge_boosts semantic_score, payload, edge_bonus, cent_bonus, dynamic_edge_boosts
) )
enriched.append((pid, semantic_score, payload, debug_data)) enriched.append((pid, semantic_score, payload, debug_data))
# Sortierung nach finalem mathematischen Score # 1. Sortierung nach finalem mathematischen Score
enriched_sorted = sorted(enriched, key=lambda h: h[3]["total"], reverse=True) enriched_sorted = sorted(enriched, key=lambda h: h[3]["total"], reverse=True)
limited_hits = enriched_sorted[: max(1, top_k)]
# 2. WP-15c: Note-Level Diversity Pooling
# Wir behalten pro note_id nur den Hit mit dem höchsten total_score.
# Dies verhindert, dass 10 Chunks derselben Note andere KeyNotes verdrängen.
unique_note_hits = []
seen_notes = set()
for item in enriched_sorted:
_, _, payload, _ = item
note_id = str(payload.get("note_id", "unknown"))
if note_id not in seen_notes:
unique_note_hits.append(item)
seen_notes.add(note_id)
# 3. Begrenzung auf top_k nach dem Diversity-Pooling
limited_hits = unique_note_hits[: max(1, top_k)]
results: List[QueryHit] = [] results: List[QueryHit] = []
for pid, s_score, pl, dbg in limited_hits: for pid, s_score, pl, dbg in limited_hits:
@ -225,7 +252,6 @@ def _build_hits_from_semantic(
applied_boosts=dynamic_edge_boosts applied_boosts=dynamic_edge_boosts
) )
# Payload Text-Feld normalisieren
text_content = pl.get("page_content") or pl.get("text") or pl.get("content", "[Kein Text]") text_content = pl.get("page_content") or pl.get("text") or pl.get("content", "[Kein Text]")
results.append(QueryHit( results.append(QueryHit(
@ -250,14 +276,14 @@ def _build_hits_from_semantic(
def hybrid_retrieve(req: QueryRequest) -> QueryResponse: def hybrid_retrieve(req: QueryRequest) -> QueryResponse:
""" """
Die Haupt-Einstiegsfunktion für die hybride Suche. Die Haupt-Einstiegsfunktion für die hybride Suche.
Kombiniert Vektorsuche mit Graph-Expansion und WP-22 Gewichtung. WP-15c: Implementiert Edge-Aggregation (Super-Kanten).
""" """
client, prefix = _get_client_and_prefix() client, prefix = _get_client_and_prefix()
vector = list(req.query_vector) if req.query_vector else _get_query_vector(req) vector = list(req.query_vector) if req.query_vector else _get_query_vector(req)
top_k = req.top_k or 10 top_k = req.top_k or 10
# 1. Semantische Seed-Suche # 1. Semantische Seed-Suche (Wir laden etwas mehr für das Pooling)
hits = _semantic_hits(client, prefix, vector, top_k=top_k, filters=req.filters) hits = _semantic_hits(client, prefix, vector, top_k=top_k * 3, filters=req.filters)
# 2. Graph Expansion Konfiguration # 2. Graph Expansion Konfiguration
expand_cfg = req.expand if isinstance(req.expand, dict) else {} expand_cfg = req.expand if isinstance(req.expand, dict) else {}
@ -266,39 +292,76 @@ def hybrid_retrieve(req: QueryRequest) -> QueryResponse:
subgraph: ga.Subgraph | None = None subgraph: ga.Subgraph | None = None
if depth > 0 and hits: if depth > 0 and hits:
# Start-IDs für den Graph-Traversal sammeln
seed_ids = list({h[2].get("note_id") for h in hits if h[2].get("note_id")}) seed_ids = list({h[2].get("note_id") for h in hits if h[2].get("note_id")})
if seed_ids: if seed_ids:
try: try:
# Subgraph aus RAM/DB laden
subgraph = ga.expand(client, prefix, seed_ids, depth=depth, edge_types=expand_cfg.get("edge_types")) subgraph = ga.expand(client, prefix, seed_ids, depth=depth, edge_types=expand_cfg.get("edge_types"))
# --- WP-22: Kanten-Gewichtung im RAM-Graphen vor Bonus-Berechnung --- # --- WP-15c: Edge-Aggregation & Deduplizierung (Super-Kanten) ---
if subgraph and hasattr(subgraph, "graph"): # Verhindert Score-Explosion durch multiple Links auf versch. Abschnitte.
for _, _, data in subgraph.graph.edges(data=True): # Logik: 1. Kante zählt voll, weitere dämpfen auf Faktor 0.1.
# A. Provenance Weighting (WP-22 Bonus für Herkunft) if subgraph and hasattr(subgraph, "adj"):
prov = data.get("provenance", "rule") for src, edge_list in subgraph.adj.items():
# Belohnung: Explizite Links (1.0) > Smart (0.9) > Rule (0.7) # Gruppiere Kanten nach Ziel-Note (Deduplizierung ID_A -> ID_B)
prov_w = 1.0 if prov == "explicit" else (0.9 if prov == "smart" else 0.7) by_target = defaultdict(list)
for e in edge_list:
by_target[e["target"]].append(e)
# B. Intent Boost Multiplikator (Vom Router dynamisch injiziert) aggregated_list = []
kind = data.get("kind") for tgt, edges in by_target.items():
intent_multiplier = boost_edges.get(kind, 1.0) if len(edges) > 1:
# Sortiere: Stärkste Kante zuerst
sorted_edges = sorted(edges, key=lambda x: x.get("weight", 0.0), reverse=True)
primary = sorted_edges[0]
# Finales Gewicht setzen (Basis * Provenance * Intent) # Aggregiertes Gewicht berechnen (Sättigungs-Logik)
data["weight"] = data.get("weight", 1.0) * prov_w * intent_multiplier total_w = primary.get("weight", 0.0)
for secondary in sorted_edges[1:]:
total_w += secondary.get("weight", 0.0) * 0.1
primary["weight"] = total_w
primary["is_super_edge"] = True # Flag für Explanation Layer
primary["edge_count"] = len(edges)
aggregated_list.append(primary)
else:
aggregated_list.append(edges[0])
# In-Place Update der Adjazenzliste des Graphen
subgraph.adj[src] = aggregated_list
# Re-Sync der In-Degrees für Centrality-Bonus (Aggregation konsistent halten)
subgraph.in_degree = defaultdict(int)
for src, edges in subgraph.adj.items():
for e in edges:
subgraph.in_degree[e["target"]] += 1
# --- WP-22: Kanten-Gewichtung (Provenance & Intent Boost) ---
if subgraph and hasattr(subgraph, "adj"):
for src, edges in subgraph.adj.items():
for e in edges:
# A. Provenance Weighting
prov = e.get("provenance", "rule")
prov_w = 1.0 if prov == "explicit" else (0.9 if prov == "smart" else 0.7)
# B. Intent Boost Multiplikator
kind = e.get("kind")
intent_multiplier = boost_edges.get(kind, 1.0)
# Gewichtung anpassen
e["weight"] = e.get("weight", 1.0) * prov_w * intent_multiplier
except Exception as e: except Exception as e:
logger.error(f"Graph Expansion failed: {e}") logger.error(f"Graph Expansion failed: {e}")
subgraph = None subgraph = None
# 3. Scoring & Explanation Generierung # 3. Scoring & Explanation Generierung
# top_k wird erst hier final angewandt
return _build_hits_from_semantic(hits, top_k, "hybrid", subgraph, req.explain, boost_edges) return _build_hits_from_semantic(hits, top_k, "hybrid", subgraph, req.explain, boost_edges)
def semantic_retrieve(req: QueryRequest) -> QueryResponse: def semantic_retrieve(req: QueryRequest) -> QueryResponse:
"""Standard Vektorsuche ohne Graph-Einfluss (WP-02 Fallback).""" """Standard Vektorsuche ohne Graph-Einfluss."""
client, prefix = _get_client_and_prefix() client, prefix = _get_client_and_prefix()
vector = _get_query_vector(req) vector = _get_query_vector(req)
hits = _semantic_hits(client, prefix, vector, req.top_k or 10, req.filters) hits = _semantic_hits(client, prefix, vector, req.top_k or 10, req.filters)
@ -308,5 +371,4 @@ def semantic_retrieve(req: QueryRequest) -> QueryResponse:
class Retriever: class Retriever:
"""Schnittstelle für die asynchrone Suche.""" """Schnittstelle für die asynchrone Suche."""
async def search(self, request: QueryRequest) -> QueryResponse: async def search(self, request: QueryRequest) -> QueryResponse:
"""Führt eine hybride Suche aus."""
return hybrid_retrieve(request) return hybrid_retrieve(request)

View File

@ -1,11 +1,10 @@
""" """
FILE: app/core/retrieval/retriever_scoring.py FILE: app/core/retrieval/retriever_scoring.py
DESCRIPTION: Mathematische Kern-Logik für das WP-22 Scoring. DESCRIPTION: Mathematische Kern-Logik für das WP-22/WP-15c Scoring.
Berechnet Relevanz-Scores basierend auf Semantik, Graph-Intelligence und Content Lifecycle. Berechnet Relevanz-Scores basierend auf Semantik, Graph-Intelligence und Content Lifecycle.
MODULARISIERUNG: Verschoben in das retrieval-Paket für WP-14. FIX v1.0.3: Optimierte Interaktion zwischen Typ-Boost und Status-Dämpfung.
VERSION: 1.0.2 VERSION: 1.0.3
STATUS: Active STATUS: Active
DEPENDENCIES: app.config, typing
""" """
import os import os
import logging import logging
@ -23,10 +22,6 @@ logger = logging.getLogger(__name__)
def get_weights() -> Tuple[float, float, float]: def get_weights() -> Tuple[float, float, float]:
""" """
Liefert die Basis-Gewichtung (semantic, edge, centrality) aus der Konfiguration. Liefert die Basis-Gewichtung (semantic, edge, centrality) aus der Konfiguration.
Priorität:
1. config/retriever.yaml (Scoring-Sektion)
2. Umgebungsvariablen (RETRIEVER_W_*)
3. System-Defaults (1.0, 0.0, 0.0)
""" """
from app.config import get_settings from app.config import get_settings
settings = get_settings() settings = get_settings()
@@ -58,7 +53,7 @@ def get_status_multiplier(payload: Dict[str, Any]) -> float:
- stable: 1.2 (Belohnung für verifiziertes Wissen) - stable: 1.2 (Belohnung für verifiziertes Wissen)
- active: 1.0 (Standard-Gewichtung) - active: 1.0 (Standard-Gewichtung)
- draft: 0.5 (Bestrafung für unfertige Fragmente) - draft: 0.5 (Dämpfung für unfertige Fragmente)
""" """
status = str(payload.get("status", "active")).lower().strip() status = str(payload.get("status", "active")).lower().strip()
if status == "stable": if status == "stable":
@@ -75,35 +70,42 @@ def compute_wp22_score(
dynamic_edge_boosts: Optional[Dict[str, float]] = None dynamic_edge_boosts: Optional[Dict[str, float]] = None
) -> Dict[str, Any]: ) -> Dict[str, Any]:
""" """
Die zentrale mathematische Scoring-Formel der Mindnet Intelligence. Die zentrale mathematische Scoring-Formel (WP-15c optimiert).
Implementiert das WP-22 Hybrid-Scoring (Semantic * Lifecycle * Graph). Implementiert das Hybrid-Scoring (Semantic * Lifecycle * Graph).
FORMEL: LOGIK:
Score = (Similarity * StatusMult) * (1 + (TypeWeight - 1) + ((EdgeW * EB + CentW * CB) * IntentBoost)) 1. Base = Similarity * StatusMult (Lifecycle-Filter).
2. Boosts = (TypeBoost - 1) + (GraphBoni * IntentFactor).
3. Final = Base * (1 + Boosts).
Returns: Der edge_bonus_raw enthält bereits die Super-Edge-Aggregation (WP-15c).
Dict mit dem finalen 'total' Score und allen mathematischen Zwischenwerten für den Explanation Layer.
""" """
sem_w, edge_w_cfg, cent_w_cfg = get_weights() sem_w, edge_w_cfg, cent_w_cfg = get_weights()
status_mult = get_status_multiplier(payload) status_mult = get_status_multiplier(payload)
# Retriever Weight (Type Boost aus types.yaml, z.B. 1.1 für Decisions) # Retriever Weight (Typ-Boost aus types.yaml, z.B. 1.1 für Decisions)
node_weight = float(payload.get("retriever_weight", 1.0)) node_weight = float(payload.get("retriever_weight", 1.0))
# 1. Berechnung des Base Scores (Semantik gewichtet durch Lifecycle-Status) # 1. Berechnung des Base Scores (Semantik gewichtet durch Lifecycle-Status)
# WICHTIG: Der Status wirkt hier als Multiplikator auf die Basis-Relevanz.
base_val = float(semantic_score) * status_mult base_val = float(semantic_score) * status_mult
# 2. Graph Boost Factor (Teil C: Intent-spezifische Verstärkung) # 2. Graph Boost Factor (Intent-spezifische Verstärkung aus decision_engine.yaml)
# Erhöht das Gewicht des gesamten Graphen um 50%, wenn ein spezifischer Intent vorliegt. # Erhöht das Gewicht des gesamten Graphen um 50%, wenn ein spezifischer Intent vorliegt.
graph_boost_factor = 1.5 if dynamic_edge_boosts and (edge_bonus_raw > 0 or cent_bonus_raw > 0) else 1.0 graph_boost_factor = 1.5 if dynamic_edge_boosts and (edge_bonus_raw > 0 or cent_bonus_raw > 0) else 1.0
# 3. Einzelne Graph-Komponenten berechnen # 3. Einzelne Graph-Komponenten berechnen
# WP-15c Hinweis: edge_bonus_raw ist durch den retriever.py bereits gedämpft/aggregiert.
edge_impact_final = (edge_w_cfg * edge_bonus_raw) * graph_boost_factor edge_impact_final = (edge_w_cfg * edge_bonus_raw) * graph_boost_factor
cent_impact_final = (cent_w_cfg * cent_bonus_raw) * graph_boost_factor cent_impact_final = (cent_w_cfg * cent_bonus_raw) * graph_boost_factor
# 4. Finales Zusammenführen (Merging) # 4. Finales Zusammenführen (Merging)
# (node_weight - 1.0) sorgt dafür, dass ein Gewicht von 1.0 keinen Einfluss hat (neutral). # (node_weight - 1.0) wandelt das Gewicht in einen relativen Bonus um (z.B. 1.2 -> +0.2).
total = base_val * (1.0 + (node_weight - 1.0) + edge_impact_final + cent_impact_final) # Alle Boni werden addiert und wirken dann auf den base_val.
type_impact = node_weight - 1.0
total_boost = 1.0 + type_impact + edge_impact_final + cent_impact_final
total = base_val * total_boost
# Sicherstellen, dass der Score niemals 0 oder negativ ist (Floor) # Sicherstellen, dass der Score niemals 0 oder negativ ist (Floor)
final_score = max(0.0001, float(total)) final_score = max(0.0001, float(total))
@@ -114,7 +116,7 @@ def compute_wp22_score(
"cent_bonus": float(cent_bonus_raw), "cent_bonus": float(cent_bonus_raw),
"status_multiplier": status_mult, "status_multiplier": status_mult,
"graph_boost_factor": graph_boost_factor, "graph_boost_factor": graph_boost_factor,
"type_impact": node_weight - 1.0, "type_impact": type_impact,
"base_val": base_val, "base_val": base_val,
"edge_impact_final": edge_impact_final, "edge_impact_final": edge_impact_final,
"cent_impact_final": cent_impact_final "cent_impact_final": cent_impact_final