From e47241740d4da270171b6be72c7998963fba5d80 Mon Sep 17 00:00:00 2001 From: Lars Date: Thu, 18 Dec 2025 14:30:24 +0100 Subject: [PATCH] letzte bereinigungen --- app/core/ingestion.py | 67 +++++++++++++++++++---------------- app/core/retriever.py | 82 ++++++++++++++++++++++++------------------- app/models/dto.py | 12 ++++--- 3 files changed, 90 insertions(+), 71 deletions(-) diff --git a/app/core/ingestion.py b/app/core/ingestion.py index b1a43cc..c7e8d05 100644 --- a/app/core/ingestion.py +++ b/app/core/ingestion.py @@ -1,10 +1,11 @@ """ FILE: app/core/ingestion.py -DESCRIPTION: Haupt-Ingestion-Logik. +DESCRIPTION: Haupt-Ingestion-Logik. Transformiert Markdown in den Graphen (Notes, Chunks, Edges). FIX: Korrekte Priorisierung von Frontmatter für chunk_profile und retriever_weight. Lade Chunk-Config basierend auf dem effektiven Profil, nicht nur dem Notiz-Typ. - WP-22: Integration von Content Lifecycle (Status) und Edge Registry. -VERSION: 2.8.5 (WP-22 Lifecycle & Registry) + WP-22: Integration von Content Lifecycle (Status Gate) und Edge Registry Validation. + WP-22: Multi-Hash Refresh für konsistente Change Detection. +VERSION: 2.8.6 (WP-22 Lifecycle & Registry) STATUS: Active DEPENDENCIES: app.core.parser, app.core.note_payload, app.core.chunker, app.core.derive_edges, app.core.qdrant*, app.services.embeddings_client, app.services.edge_registry EXTERNAL_CONFIG: config/types.yaml @@ -46,6 +47,7 @@ logger = logging.getLogger(__name__) # --- Helper --- def load_type_registry(custom_path: Optional[str] = None) -> dict: + """Lädt die types.yaml zur Steuerung der typ-spezifischen Ingestion.""" import yaml path = custom_path or os.getenv("MINDNET_TYPES_FILE", "config/types.yaml") if not os.path.exists(path): return {} @@ -54,14 +56,15 @@ def load_type_registry(custom_path: Optional[str] = None) -> dict: except Exception: return {} def resolve_note_type(requested: Optional[str], reg: dict) -> str: + """Bestimmt den finalen Notiz-Typ (Fallback auf 'concept').""" types = reg.get("types", {}) if requested and requested in types: return requested return "concept" def effective_chunk_profile_name(fm: dict, note_type: str, reg: dict) -> str: """ - Ermittelt den Namen des Chunk-Profils. - Prio: 1. Frontmatter -> 2. Type-Config -> 3. Default + Ermittelt den Namen des zu nutzenden Chunk-Profils. + Priorität: 1. Frontmatter Override -> 2. Type Config -> 3. Global Default """ # 1. Frontmatter Override override = fm.get("chunking_profile") or fm.get("chunk_profile") @@ -79,8 +82,8 @@ def effective_chunk_profile_name(fm: dict, note_type: str, reg: dict) -> str: def effective_retriever_weight(fm: dict, note_type: str, reg: dict) -> float: """ - Ermittelt das Retriever Weight. - Prio: 1. Frontmatter -> 2. Type-Config -> 3. Default + Ermittelt das effektive retriever_weight für das Scoring. + Priorität: 1. Frontmatter Override -> 2. Type Config -> 3. Global Default """ # 1. Frontmatter Override override = fm.get("retriever_weight") @@ -109,7 +112,7 @@ class IngestionService: self.registry = load_type_registry() self.embedder = EmbeddingsClient() - # ACTIVE HASH MODE aus ENV lesen (Default: full) + # Change Detection Modus (full oder body) self.active_hash_mode = os.getenv("MINDNET_CHANGE_DETECTION_MODE", "full") try: @@ -119,20 +122,13 @@ class IngestionService: logger.warning(f"DB init warning: {e}") def _get_chunk_config_by_profile(self, profile_name: str, note_type: str) -> Dict[str, Any]: - """ - Lädt die konkrete Config (target, max, overlap) für einen Profilnamen. - """ - # Suche direkt in den definierten Profilen der Registry + """Holt die Chunker-Parameter (max, target, overlap) für ein spezifisches Profil.""" profiles = self.registry.get("chunking_profiles", {}) if profile_name in profiles: cfg = profiles[profile_name].copy() - # Tuple-Fix für Overlap (wie in chunker.py) if "overlap" in cfg and isinstance(cfg["overlap"], list): cfg["overlap"] = tuple(cfg["overlap"]) return cfg - - # Fallback: Wenn Profilname unbekannt, nutze Standard für den Typ via Chunker - logger.warning(f"Profile '{profile_name}' not found in registry. Falling back to type defaults.") return get_chunk_config(note_type) async def process_file( @@ -146,7 +142,10 @@ class IngestionService: hash_source: str = "parsed", hash_normalize: str = "canonical" ) -> Dict[str, Any]: - + """ + Verarbeitet eine Markdown-Datei und schreibt sie in den Graphen. + Folgt dem 14-Schritte-Workflow. + """ result = {"path": file_path, "status": "skipped", "changed": False, "error": None} # 1. Parse & Frontmatter Validation @@ -162,25 +161,22 @@ class IngestionService: # --- WP-22: Content Lifecycle Gate (Teil A) --- status = fm.get("status", "draft").lower().strip() - # Hard Skip für System-Dateien + # Hard Skip für System- oder Archiv-Dateien if status in ["system", "template", "archive", "hidden"]: logger.info(f"Skipping file {file_path} (Status: {status})") return {**result, "status": "skipped", "reason": f"lifecycle_status_{status}"} - # 2. Type & Config Resolution (FIXED) - # Wir ermitteln erst den Typ + # 2. Type & Config Resolution note_type = resolve_note_type(fm.get("type"), self.registry) fm["type"] = note_type - # Dann ermitteln wir die effektiven Werte unter Berücksichtigung des Frontmatters! effective_profile = effective_chunk_profile_name(fm, note_type, self.registry) effective_weight = effective_retriever_weight(fm, note_type, self.registry) - # Wir schreiben die effektiven Werte zurück ins FM, damit note_payload sie sicher hat fm["chunk_profile"] = effective_profile fm["retriever_weight"] = effective_weight - # 3. Build Note Payload + # 3. Build Note Payload (Inkl. Multi-Hash für WP-22) try: note_pl = make_note_payload( parsed, @@ -192,10 +188,10 @@ class IngestionService: # Text Body Fallback if not note_pl.get("fulltext"): note_pl["fulltext"] = getattr(parsed, "body", "") or "" - # Update Payload with explicit effective values (Sicherheit) + # Sicherstellen der effektiven Werte im Payload note_pl["retriever_weight"] = effective_weight note_pl["chunk_profile"] = effective_profile - # WP-22: Status speichern für Dynamic Scoring + # WP-22: Status speichern note_pl["status"] = status note_id = note_pl["note_id"] @@ -209,6 +205,7 @@ class IngestionService: old_payload = self._fetch_note_payload(note_id) has_old = old_payload is not None + # Prüfung gegen den aktuell konfigurierten Hash-Modus (body oder full) check_key = f"{self.active_hash_mode}:{hash_source}:{hash_normalize}" old_hashes = (old_payload or {}).get("hashes") @@ -228,16 +225,16 @@ class IngestionService: if not apply: return {**result, "status": "dry-run", "changed": True, "note_id": note_id} - # 5. Processing + # 5. Processing (Chunking, Embedding, Edge Generation) try: body_text = getattr(parsed, "body", "") or "" - # FIX: Wir laden jetzt die Config für das SPEZIFISCHE Profil + # Konfiguration für das spezifische Profil laden chunk_config = self._get_chunk_config_by_profile(effective_profile, note_type) chunks = await assemble_chunks(fm["id"], body_text, fm["type"], config=chunk_config) - # chunk_payloads werden mit den aktualisierten FM-Werten gebaut + # Chunks mit Metadaten anreichern chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks, note_text=body_text) vecs = [] @@ -254,7 +251,7 @@ class IngestionService: logger.error(f"Embedding failed: {e}") raise RuntimeError(f"Embedding failed: {e}") - # Raw Edges generieren + # Kanten generieren try: raw_edges = build_edges_for_note( note_id, @@ -270,7 +267,7 @@ class IngestionService: if raw_edges: for edge in raw_edges: original_kind = edge.get("kind", "related_to") - # Resolve via Registry (Canonical mapping + Unknown Logging) + # Normalisierung über die Registry (Alias-Auflösung) canonical_kind = edge_registry.resolve(original_kind) edge["kind"] = canonical_kind edges.append(edge) @@ -279,18 +276,22 @@ class IngestionService: logger.error(f"Processing failed: {e}", exc_info=True) return {**result, "error": f"Processing failed: {str(e)}"} - # 6. Upsert + # 6. Upsert in Qdrant try: + # Alte Fragmente löschen, um "Geister-Chunks" zu vermeiden if purge_before and has_old: self._purge_artifacts(note_id) + # Note Metadaten n_name, n_pts = points_for_note(self.prefix, note_pl, None, self.dim) upsert_batch(self.client, n_name, n_pts) + # Chunks (Vektoren) if chunk_pls and vecs: c_name, c_pts = points_for_chunks(self.prefix, chunk_pls, vecs) upsert_batch(self.client, c_name, c_pts) + # Kanten if edges: e_name, e_pts = points_for_edges(self.prefix, edges) upsert_batch(self.client, e_name, e_pts) @@ -308,6 +309,7 @@ class IngestionService: return {**result, "error": f"DB Upsert failed: {e}"} def _fetch_note_payload(self, note_id: str) -> Optional[dict]: + """Holt das aktuelle Payload einer Note aus Qdrant.""" from qdrant_client.http import models as rest col = f"{self.prefix}_notes" try: @@ -317,6 +319,7 @@ class IngestionService: except: return None def _artifacts_missing(self, note_id: str) -> Tuple[bool, bool]: + """Prüft, ob Chunks oder Kanten für eine Note fehlen (Integritätscheck).""" from qdrant_client.http import models as rest c_col = f"{self.prefix}_chunks" e_col = f"{self.prefix}_edges" @@ -328,6 +331,7 @@ class IngestionService: except: return True, True def _purge_artifacts(self, note_id: str): + """Löscht alle Chunks und Edges einer Note (vor dem Neu-Schreiben).""" from qdrant_client.http import models as rest f = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))]) selector = rest.FilterSelector(filter=f) @@ -337,6 +341,7 @@ class IngestionService: except Exception: pass async def create_from_text(self, markdown_content: str, filename: str, vault_root: str, folder: str = "00_Inbox") -> Dict[str, Any]: + """Hilfsmethode zur Erstellung einer Note aus einem Textstream (Editor-Save).""" target_dir = os.path.join(vault_root, folder) os.makedirs(target_dir, exist_ok=True) file_path = os.path.join(target_dir, filename) diff --git a/app/core/retriever.py b/app/core/retriever.py index a537a0c..4a718f3 100644 --- a/app/core/retriever.py +++ b/app/core/retriever.py @@ -1,8 +1,8 @@ """ FILE: app/core/retriever.py DESCRIPTION: Implementiert die Hybrid-Suche (Vektor + Graph-Expansion) und das Scoring-Modell (Explainability). - WP-22 Update: Dynamic Edge Boosting & Lifecycle Scoring. -VERSION: 0.6.5 (WP-22 Scoring Formula) + WP-22 Update: Dynamic Edge Boosting, Lifecycle Scoring & Provenance Awareness. +VERSION: 0.6.6 (WP-22 Scoring & Provenance) STATUS: Active DEPENDENCIES: app.config, app.models.dto, app.core.qdrant*, app.services.embeddings_client, app.core.graph_adapter LAST_ANALYSIS: 2025-12-18 @@ -122,7 +122,7 @@ def _compute_total_score( Hierbei gilt: - BaseScore: semantic_similarity * status_multiplier - - ConfigWeight: retriever_weight (Type Boost) + - ConfigWeight: retriever_weight (Type Boost) - 1.0 - DynamicBoost: (edge_weight * edge_bonus) + (centrality_weight * centrality_bonus) """ @@ -131,13 +131,14 @@ def _compute_total_score( base_score = float(semantic_score) * status_mult # 2. Config Weight (Static Type Boost) - config_weight = float(payload.get("retriever_weight", 1.0)) - 1.0 # 1.0 ist neutral + # Ein neutrales retriever_weight von 1.0 ergibt 0.0 Einfluss. + config_weight = float(payload.get("retriever_weight", 1.0)) - 1.0 # 3. Dynamic Boost (Graph-Signale) _sem_w, edge_w_cfg, cent_w_cfg = _get_scoring_weights() dynamic_boost = (edge_w_cfg * edge_bonus_raw) + (cent_w_cfg * cent_bonus_raw) - # Falls Intent-Boosts vorliegen, verstärken wir den Dynamic Boost + # Falls Intent-Boosts vorliegen, verstärken wir den Dynamic Boost global if dynamic_edge_boosts and (edge_bonus_raw > 0 or cent_bonus_raw > 0): dynamic_boost *= 1.5 @@ -155,14 +156,14 @@ def _build_explanation( subgraph: Optional[ga.Subgraph], node_key: Optional[str] ) -> Explanation: - """Erstellt ein Explanation-Objekt (WP-04b).""" + """Erstellt ein Explanation-Objekt mit Provenance-Details.""" _, edge_w_cfg, cent_w_cfg = _get_scoring_weights() type_weight = float(payload.get("retriever_weight", 1.0)) status_mult = _get_status_multiplier(payload) note_type = payload.get("type", "unknown") - # Breakdown für Explanation (Muss die Scoring Formel spiegeln) + # Breakdown für Explanation config_w_impact = type_weight - 1.0 dynamic_b_impact = (edge_w_cfg * edge_bonus) + (cent_w_cfg * cent_bonus) base_val = semantic_score * status_mult @@ -170,7 +171,7 @@ def _build_explanation( breakdown = ScoreBreakdown( semantic_contribution=base_val, edge_contribution=base_val * dynamic_b_impact, - centrality_contribution=0.0, # In dynamic_b_impact enthalten + centrality_contribution=0.0, raw_semantic=semantic_score, raw_edge_bonus=edge_bonus, raw_centrality=cent_bonus, @@ -189,35 +190,34 @@ def _build_explanation( msg = "Bevorzugt" if type_weight > 1.0 else "Leicht abgewertet" reasons.append(Reason(kind="type", message=f"{msg} aufgrund des Typs '{note_type}'.", score_impact=base_val * config_w_impact)) + # WP-22: Lifecycle Grund hinzufügen if status_mult != 1.0: msg = "Status-Bonus" if status_mult > 1.0 else "Status-Malus" reasons.append(Reason(kind="lifecycle", message=f"{msg} ({payload.get('status', 'unknown')}).", score_impact=0.0)) if subgraph and node_key and edge_bonus > 0: - if hasattr(subgraph, "get_outgoing_edges"): - outgoing = subgraph.get_outgoing_edges(node_key) - for edge in outgoing: - target = edge.get("target", "Unknown") - kind = edge.get("kind", "edge") - weight = edge.get("weight", 0.0) - if weight > 0.05: - edges_dto.append(EdgeDTO(id=f"{node_key}->{target}:{kind}", kind=kind, source=node_key, target=target, weight=weight, direction="out")) + # WP-22: Detaillierte Provenance-Gründe (Basis für WP-08) + incoming_raw = subgraph.get_incoming_edges(node_key) or [] + for edge in incoming_raw: + src = edge.get("source", "Unknown") + k = edge.get("kind", "edge") + prov = edge.get("provenance", "rule") + conf = float(edge.get("confidence", 1.0)) + + edges_dto.append(EdgeDTO( + id=f"{src}->{node_key}:{k}", kind=k, source=src, target=node_key, + weight=conf, direction="in", provenance=prov, confidence=conf + )) - if hasattr(subgraph, "get_incoming_edges"): - incoming = subgraph.get_incoming_edges(node_key) - for edge in incoming: - src = edge.get("source", "Unknown") - kind = edge.get("kind", "edge") - weight = edge.get("weight", 0.0) - if weight > 0.05: - edges_dto.append(EdgeDTO(id=f"{src}->{node_key}:{kind}", kind=kind, source=src, target=node_key, weight=weight, direction="in")) - - all_edges = sorted(edges_dto, key=lambda e: e.weight, reverse=True) + all_edges = sorted(edges_dto, key=lambda e: e.confidence, reverse=True) for top_edge in all_edges[:3]: - impact = edge_w_cfg * top_edge.weight - dir_txt = "Verweist auf" if top_edge.direction == "out" else "Referenziert von" - tgt_txt = top_edge.target if top_edge.direction == "out" else top_edge.source - reasons.append(Reason(kind="edge", message=f"{dir_txt} '{tgt_txt}' via '{top_edge.kind}'", score_impact=impact, details={"kind": top_edge.kind})) + prov_txt = "Bestätigt durch" if top_edge.provenance == "explicit" else "Vermutet durch" + reasons.append(Reason( + kind="edge", + message=f"{prov_txt} Kante '{top_edge.kind}' von '{top_edge.source}'.", + score_impact=edge_w_cfg * top_edge.confidence, + details={"provenance": top_edge.provenance} + )) if cent_bonus > 0.01: reasons.append(Reason(kind="centrality", message="Knoten liegt zentral im Kontext.", score_impact=cent_w_cfg * cent_bonus)) @@ -270,6 +270,7 @@ def _build_hits_from_semantic( if subgraph is not None and node_key: try: + # WP-22: edge_bonus nutzt intern bereits die confidence-gewichteten Pfade edge_bonus = float(subgraph.edge_bonus(node_key)) except Exception: edge_bonus = 0.0 @@ -364,14 +365,23 @@ def hybrid_retrieve(req: QueryRequest) -> QueryResponse: # Subgraph laden subgraph = ga.expand(client, prefix, seed_ids, depth=depth, edge_types=edge_types) - # --- WP-22: Kanten-Boosts im RAM-Graphen anwenden --- - # Dies manipuliert die Gewichte im Graphen, bevor der 'edge_bonus' berechnet wird. - if boost_edges and subgraph and hasattr(subgraph, "graph"): + # --- WP-22: Kanten-Boosts & Provenance-Weighting im RAM-Graphen --- + if subgraph and hasattr(subgraph, "graph"): for u, v, data in subgraph.graph.edges(data=True): + # 1. Herkunfts-Basisgewichtung (Concept 2.6) + prov = data.get("provenance", "rule") + prov_weight = 1.0 + if prov == "smart": prov_weight = 0.9 + elif prov == "rule": prov_weight = 0.7 + + # 2. Intent-basierter Multiplikator (Teil C) k = data.get("kind") - if k in boost_edges: - # Gewicht multiplizieren (z.B. caused_by * 3.0) - data["weight"] = data.get("weight", 1.0) * boost_edges[k] + intent_boost = 1.0 + if boost_edges and k in boost_edges: + intent_boost = boost_edges[k] + + # Finales Gewicht im Graphen setzen + data["weight"] = data.get("weight", 1.0) * prov_weight * intent_boost except Exception: subgraph = None diff --git a/app/models/dto.py b/app/models/dto.py index 4267028..9a2f8e3 100644 --- a/app/models/dto.py +++ b/app/models/dto.py @@ -1,10 +1,10 @@ """ FILE: app/models/dto.py DESCRIPTION: Pydantic-Modelle (DTOs) für Request/Response Bodies. Definiert das API-Schema. -VERSION: 0.6.3 (WP-22 Semantic Graph Routing & Lifecycle) +VERSION: 0.6.4 (WP-22 Semantic Graph Routing, Lifecycle & Provenance) STATUS: Active DEPENDENCIES: pydantic, typing, uuid -LAST_ANALYSIS: 2025-12-15 +LAST_ANALYSIS: 2025-12-18 """ from __future__ import annotations @@ -12,6 +12,7 @@ from pydantic import BaseModel, Field from typing import List, Literal, Optional, Dict, Any import uuid +# WP-22: Definition der gültigen Kanten-Typen gemäß Manual EdgeKind = Literal["references", "references_at", "backlink", "next", "prev", "belongs_to", "depends_on", "related_to", "similar_to", "caused_by", "derived_from", "based_on", "solves", "blocks", "uses", "guides"] @@ -40,6 +41,9 @@ class EdgeDTO(BaseModel): target: str weight: float direction: Literal["out", "in", "undirected"] = "out" + # WP-22: Provenance Tracking (Herkunft und Vertrauen) + provenance: Optional[Literal["explicit", "rule", "smart", "structure"]] = "explicit" + confidence: float = 1.0 # --- Request Models --- @@ -65,7 +69,7 @@ class QueryRequest(BaseModel): class FeedbackRequest(BaseModel): """ - User-Feedback zu einem spezifischen Treffer oder der Gesamtantwort. + User-Feedback zu einem spezifischen Treffer oder der Gesamtantwort (Basis für WP-08). """ query_id: str = Field(..., description="ID der ursprünglichen Suche") # node_id ist optional: Wenn leer oder "generated_answer", gilt es für die Antwort. @@ -90,7 +94,7 @@ class ChatRequest(BaseModel): # --- WP-04b Explanation Models --- class ScoreBreakdown(BaseModel): - """Aufschlüsselung der Score-Komponenten.""" + """Aufschlüsselung der Score-Komponenten nach der WP-22 Formel.""" semantic_contribution: float edge_contribution: float centrality_contribution: float