From 765fad6a8da713572160e5fd32115ad4a600e0f8 Mon Sep 17 00:00:00 2001 From: Lars Date: Thu, 11 Dec 2025 07:25:24 +0100 Subject: [PATCH] neuer discovery mode --- app/services/discovery.py | 137 ++++++++++++++++++++++++-------------- 1 file changed, 86 insertions(+), 51 deletions(-) diff --git a/app/services/discovery.py b/app/services/discovery.py index 6612e64..7036382 100644 --- a/app/services/discovery.py +++ b/app/services/discovery.py @@ -1,10 +1,13 @@ """ app/services/discovery.py Service für Link-Vorschläge und Knowledge-Discovery (WP-11). +Analysiert Drafts auf Keywords und semantische Ähnlichkeiten. +Implementiert 'Late Binding' für Edge-Typen via types.yaml. """ import logging -from typing import List, Dict, Any, Set -from qdrant_client.http import models as rest +import os +from typing import List, Dict, Any, Optional +import yaml from app.core.qdrant import QdrantConfig, get_client from app.models.dto import QueryRequest @@ -14,20 +17,27 @@ logger = logging.getLogger(__name__) class DiscoveryService: def __init__(self, collection_prefix: str = None): + # 1. Config laden self.cfg = QdrantConfig.from_env() # Prefix Priorität: Argument > Env > Default self.prefix = collection_prefix or self.cfg.prefix or "mindnet" self.client = get_client(self.cfg) + + # 2. Registry für Late Binding laden (Edge Defaults) + self.registry = self._load_type_registry() async def analyze_draft(self, text: str, current_type: str) -> Dict[str, Any]: """ Analysiert einen Draft-Text und schlägt Verlinkungen vor. - Kombiniert Exact Match (Titel/Alias) und Semantic Match (Vektor). + Nutzt 'types.yaml' um den passenden Edge-Typ vorzuschlagen. """ suggestions = [] + # Welcher Edge-Typ ist für diesen Draft-Typ (z.B. 'project') der Standard? + # Late Binding: Wir schauen in die Config, statt es zu hardcoden. + default_edge_type = self._get_default_edge_type(current_type) + # 1. Exact Match: Finde Begriffe im Text, die als Notiz-Titel existieren - # (Holt alle Titel aus Qdrant - bei riesigen Vaults später cachen) known_entities = self._fetch_all_titles_and_aliases() found_entities = self._find_entities_in_text(text, known_entities) @@ -35,43 +45,96 @@ class DiscoveryService: for entity in found_entities: existing_target_ids.add(entity["id"]) + + # Vorschlag generieren + target_title = entity["title"] + # Markdown-Vorschlag: [[rel:depends_on Ziel]] + suggested_md = f"[[rel:{default_edge_type} {target_title}]]" + suggestions.append({ "type": "exact_match", "text_found": entity["match"], - "target_title": entity["title"], + "target_title": target_title, "target_id": entity["id"], + "suggested_edge_type": default_edge_type, + "suggested_markdown": suggested_md, "confidence": 1.0, - "reason": "Existierender Notiz-Titel/Alias" + "reason": f"Existierender Titel (Default für '{current_type}': {default_edge_type})" }) - # 2. Semantic Match: Finde inhaltlich ähnliche Notizen via Vektor-Suche + # 2. Semantic Match: Finde inhaltlich ähnliche Notizen semantic_hits = self._get_semantic_suggestions(text) for hit in semantic_hits: - # Duplikate vermeiden (wenn wir es schon per Titel gefunden haben) if hit.node_id in existing_target_ids: continue - # Schwellwert: Nur relevante Vorschläge - # total_score beinhaltet bereits Typ-Gewichte aus dem Retriever if hit.total_score > 0.65: + # Bei semantischen Treffern ist 'related_to' oft sicherer als 'depends_on', + # es sei denn, die Config erzwingt etwas anderes. + # Wir bleiben hier beim Config-Default, um konsistent zu sein. + target_title = hit.payload.get("title", "Unbekannt") + suggested_md = f"[[rel:{default_edge_type} {target_title}]]" + suggestions.append({ "type": "semantic_match", "text_found": (hit.source.get("text") or "")[:50] + "...", - "target_title": hit.payload.get("title", "Unbekannt"), + "target_title": target_title, "target_id": hit.node_id, + "suggested_edge_type": default_edge_type, + "suggested_markdown": suggested_md, "confidence": round(hit.total_score, 2), - "reason": f"Inhaltliche Ähnlichkeit (Score: {round(hit.total_score, 2)})" + "reason": f"Semantische Ähnlichkeit (Score: {round(hit.total_score, 2)})" }) return { "draft_length": len(text), + "draft_type": current_type, + "default_strategy": default_edge_type, "suggestions_count": len(suggestions), "suggestions": suggestions } + # --- Configuration & Late Binding Helpers --- + + def _load_type_registry(self) -> dict: + """Lädt die types.yaml für Konfigurations-Zugriffe.""" + path = os.getenv("MINDNET_TYPES_FILE", "config/types.yaml") + if not os.path.exists(path): + # Fallback relative Pfade + if os.path.exists("types.yaml"): path = "types.yaml" + elif os.path.exists("../config/types.yaml"): path = "../config/types.yaml" + else: return {} + + try: + with open(path, "r", encoding="utf-8") as f: + return yaml.safe_load(f) or {} + except Exception as e: + logger.warning(f"Failed to load types registry: {e}") + return {} + + def _get_default_edge_type(self, note_type: str) -> str: + """ + Ermittelt den bevorzugten Kanten-Typ für einen gegebenen Notiz-Typ. + Logik: types.yaml -> types -> {note_type} -> edge_defaults[0] + Fallback: 'related_to' + """ + # 1. Config für den Typ laden + types_cfg = self.registry.get("types", {}) + type_def = types_cfg.get(note_type, {}) + + # 2. Defaults prüfen + defaults = type_def.get("edge_defaults") + if defaults and isinstance(defaults, list) and len(defaults) > 0: + # Wir nehmen den ersten Default als "Haupt-Beziehung" + return defaults[0] + + # 3. Fallback, falls nichts konfiguriert ist + return "related_to" + + # --- Core Logic (Unverändert) --- + def _fetch_all_titles_and_aliases(self) -> List[Dict]: - """Lädt alle Titel und Aliases aus der Notes-Collection.""" notes = [] next_page = None col_name = f"{self.prefix}_notes" @@ -87,8 +150,6 @@ class DiscoveryService: ) for point in res: pl = point.payload or {} - - # Aliases robust lesen aliases = pl.get("aliases") or [] if isinstance(aliases, str): aliases = [aliases] @@ -97,57 +158,31 @@ class DiscoveryService: "title": pl.get("title"), "aliases": aliases }) - - if next_page is None: - break + if next_page is None: break except Exception as e: logger.error(f"Error fetching titles: {e}") return [] - return notes def _find_entities_in_text(self, text: str, entities: List[Dict]) -> List[Dict]: - """ - Sucht Vorkommen von Titeln/Alias im Text (Case-Insensitive). - """ found = [] text_lower = text.lower() - for entity in entities: - # 1. Titel prüfen title = entity.get("title") if title and title.lower() in text_lower: - found.append({ - "match": title, - "title": title, - "id": entity["id"] - }) - continue # Wenn Titel gefunden, Aliases nicht mehr prüfen - - # 2. Aliases prüfen - aliases = entity.get("aliases") - if aliases and isinstance(aliases, list): - for alias in aliases: - if alias and str(alias).lower() in text_lower: - found.append({ - "match": alias, - "title": title, # Target ist immer der Haupt-Titel - "id": entity["id"] - }) - break + found.append({"match": title, "title": title, "id": entity["id"]}) + continue + aliases = entity.get("aliases", []) + for alias in aliases: + if alias and str(alias).lower() in text_lower: + found.append({"match": alias, "title": title, "id": entity["id"]}) + break return found def _get_semantic_suggestions(self, text: str): - """Wrapper um den Hybrid Retriever.""" - req = QueryRequest( - query=text, - top_k=5, - explain=False - ) + req = QueryRequest(query=text, top_k=5, explain=False) try: - # hybrid_retrieve nutzen (sync Wrapper) res = hybrid_retrieve(req) return res.results - except Exception as e: - logger.error(f"Semantic suggestion failed: {e}") + except Exception: return [] \ No newline at end of file