From 0170d9291fd3fa20700d7f27013ee7ef758ada3e Mon Sep 17 00:00:00 2001 From: Lars Date: Wed, 10 Dec 2025 22:05:35 +0100 Subject: [PATCH] erster Stand --- app/core/ingestion.py | 40 +++++++++- app/main.py | 11 ++- app/routers/ingest.py | 89 ++++++++++++++++++++++ app/services/discovery.py | 151 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 286 insertions(+), 5 deletions(-) create mode 100644 app/routers/ingest.py create mode 100644 app/services/discovery.py diff --git a/app/core/ingestion.py b/app/core/ingestion.py index 32c82e3..992c29d 100644 --- a/app/core/ingestion.py +++ b/app/core/ingestion.py @@ -261,4 +261,42 @@ class IngestionService: try: self.client.delete(collection_name=f"{self.prefix}_{suffix}", points_selector=selector) except Exception: - pass \ No newline at end of file + pass + + def create_from_text( + self, + markdown_content: str, + filename: str, + vault_root: str, + folder: str = "Inbox" # Standard-Ordner für neue Files + ) -> Dict[str, Any]: + """ + WP-11: Schreibt Text in eine physische Datei und indiziert sie sofort. + """ + # 1. Pfad vorbereiten + target_dir = os.path.join(vault_root, folder) + os.makedirs(target_dir, exist_ok=True) + + # Dateiname bereinigen (Sicherheit) + safe_filename = os.path.basename(filename) + if not safe_filename.endswith(".md"): + safe_filename += ".md" + + file_path = os.path.join(target_dir, safe_filename) + + # 2. Schreiben (Write to Disk - Single Source of Truth) + try: + with open(file_path, "w", encoding="utf-8") as f: + f.write(markdown_content) + except Exception as e: + return {"status": "error", "error": f"Disk write failed: {str(e)}"} + + # 3. Indizieren (Ingest) + # Wir rufen einfach die existierende Logik auf! 
+ return self.process_file( + file_path=file_path, + vault_root=vault_root, + apply=True, # Sofort schreiben + force_replace=True, # Da neu, erzwingen wir Update + purge_before=True # Sauberer Start + ) \ No newline at end of file diff --git a/app/main.py b/app/main.py index 3afd514..98e7ea5 100644 --- a/app/main.py +++ b/app/main.py @@ -11,16 +11,18 @@ from .routers.query import router as query_router from .routers.graph import router as graph_router from .routers.tools import router as tools_router from .routers.feedback import router as feedback_router -# NEU: Chat Router (WP-05) from .routers.chat import router as chat_router +# NEU: Ingest Router (WP-11) +from .routers.ingest import router as ingest_router + try: from .routers.admin import router as admin_router except Exception: admin_router = None def create_app() -> FastAPI: - app = FastAPI(title="mindnet API", version="0.5.0") # Version bump WP-05 + app = FastAPI(title="mindnet API", version="0.6.0") # Version bump WP-11 s = get_settings() @app.get("/healthz") @@ -34,9 +36,10 @@ def create_app() -> FastAPI: app.include_router(graph_router, prefix="/graph", tags=["graph"]) app.include_router(tools_router, prefix="/tools", tags=["tools"]) app.include_router(feedback_router, prefix="/feedback", tags=["feedback"]) - - # NEU: Chat Endpoint app.include_router(chat_router, prefix="/chat", tags=["chat"]) + + # NEU: Registrierung des Ingest-Routers + app.include_router(ingest_router, prefix="/ingest", tags=["ingest"]) if admin_router: app.include_router(admin_router, prefix="/admin", tags=["admin"]) diff --git a/app/routers/ingest.py b/app/routers/ingest.py new file mode 100644 index 0000000..4e9a7a0 --- /dev/null +++ b/app/routers/ingest.py @@ -0,0 +1,89 @@ +""" +app/routers/ingest.py +API-Endpunkte für WP-11 (Discovery & Persistence). 
+""" +import os +import time +from fastapi import APIRouter, HTTPException +from pydantic import BaseModel +from typing import Optional, List, Dict, Any + +from app.core.ingestion import IngestionService +from app.services.discovery import DiscoveryService + +router = APIRouter() + +# --- DTOs --- + +class AnalyzeRequest(BaseModel): + text: str + type: str = "concept" + +class SaveRequest(BaseModel): + markdown_content: str + filename: Optional[str] = None # Optional, fallback auf Timestamp/Titel + folder: str = "00_Inbox" # Zielordner im Vault + +class SaveResponse(BaseModel): + status: str + file_path: str + note_id: str + stats: Dict[str, Any] + +# --- Services --- +# Instanzierung hier oder via Dependency Injection +discovery_service = DiscoveryService() + +@router.post("/analyze") +async def analyze_draft(req: AnalyzeRequest): + """ + WP-11 Intelligence: Analysiert einen Entwurf und liefert Link-Vorschläge. + """ + try: + result = await discovery_service.analyze_draft(req.text, req.type) + return result + except Exception as e: + raise HTTPException(status_code=500, detail=f"Analysis failed: {str(e)}") + +@router.post("/save", response_model=SaveResponse) +async def save_note(req: SaveRequest): + """ + WP-11 Persistence: Speichert Markdown physisch und indiziert es in Qdrant. + """ + # 1. Vault Root ermitteln + vault_root = os.getenv("MINDNET_VAULT_ROOT", "./vault") + if not os.path.exists(vault_root): + # Fallback für Dev-Umgebungen + if os.path.exists("../vault"): + vault_root = "../vault" + else: + raise HTTPException(status_code=500, detail="Vault root not configured or missing") + + # 2. Filename generieren falls fehlend + final_filename = req.filename + if not final_filename: + # Einfacher Fallback: Timestamp + final_filename = f"draft_{int(time.time())}.md" + + # 3. 
"""
app/services/discovery.py
Service for link suggestions and knowledge discovery (WP-11).
Analyzes drafts for keywords and semantic similarity.
"""
import logging
import re
from typing import List, Dict, Any, Set
from qdrant_client.http import models as rest

from app.core.qdrant import QdrantConfig, get_client
from app.models.dto import QueryRequest
from app.core.retriever import hybrid_retrieve

logger = logging.getLogger(__name__)

# Minimum hybrid score for a semantic suggestion to be surfaced.
SEMANTIC_THRESHOLD = 0.65

class DiscoveryService:
    """Suggests links for draft notes via exact-title and semantic matching."""

    def __init__(self, collection_prefix: str = "mindnet"):
        self.prefix = collection_prefix
        self.cfg = QdrantConfig.from_env()
        self.cfg.prefix = collection_prefix
        self.client = get_client(self.cfg)

    async def analyze_draft(self, text: str, current_type: str) -> Dict[str, Any]:
        """
        Analyze a draft text and suggest links to existing notes.

        Combines exact matches (note titles/aliases found verbatim in the
        text) with semantic matches from the hybrid vector retriever.
        """
        suggestions = []

        # 1. Exact match: find terms in the text that exist as note titles.
        #    (For very large vaults >10k notes this should be cached.)
        known_entities = self._fetch_all_titles_and_aliases()
        found_entities = self._find_entities_in_text(text, known_entities)

        existing_target_ids = set()

        for entity in found_entities:
            existing_target_ids.add(entity["id"])
            suggestions.append({
                "type": "exact_match",
                "text_found": entity["match"],
                "target_title": entity["title"],
                "target_id": entity["id"],
                "confidence": 1.0,
                "reason": "Existierender Notiz-Titel"
            })

        # 2. Semantic match: find related notes via vector search, skipping
        #    anything already covered by an exact match.
        semantic_hits = self._get_semantic_suggestions(text)

        for hit in semantic_hits:
            if hit.node_id in existing_target_ids:
                continue

            # Only show sufficiently relevant hits; total_score already
            # includes the type weights.
            if hit.total_score > SEMANTIC_THRESHOLD:
                suggestions.append({
                    "type": "semantic_match",
                    "text_found": (hit.source.get("text") or "")[:50] + "...",  # snippet
                    "target_title": hit.payload.get("title", "Unbekannt"),
                    "target_id": hit.node_id,
                    "confidence": round(hit.total_score, 2),
                    "reason": f"Inhaltliche Ähnlichkeit (Score: {round(hit.total_score, 2)})"
                })

        return {
            "draft_length": len(text),
            "suggestions_count": len(suggestions),
            "suggestions": suggestions
        }

    def _fetch_all_titles_and_aliases(self) -> List[Dict]:
        """Load all titles and aliases from the notes collection.

        Best-effort: if scrolling fails mid-way, the entries fetched so far
        are returned instead of being discarded.
        """
        notes = []
        next_page = None
        col_name = f"{self.prefix}_notes"

        try:
            while True:
                # Use the scroll API to page through payloads efficiently.
                res, next_page = self.client.scroll(
                    collection_name=col_name,
                    limit=1000,
                    offset=next_page,
                    with_payload=True,
                    with_vectors=False
                )
                for point in res:
                    pl = point.payload or {}
                    notes.append({
                        "id": pl.get("note_id"),
                        "title": pl.get("title"),
                        "aliases": pl.get("aliases", [])
                    })

                if next_page is None:
                    break
        except Exception as e:
            # Degrade gracefully: partial data still yields some suggestions.
            logger.error(f"Error fetching titles: {e}")

        return notes

    def _find_entities_in_text(self, text: str, entities: List[Dict]) -> List[Dict]:
        """
        Find occurrences of titles/aliases in the text.

        Matching is case-insensitive and restricted to whole words, so a
        short title like "AI" no longer matches inside "maintain".
        """
        found = []

        def _occurs(term: str) -> bool:
            # Word-boundary search; re.escape guards titles containing
            # regex metacharacters.
            return re.search(rf"\b{re.escape(term)}\b", text, re.IGNORECASE) is not None

        for entity in entities:
            # 1. Check the title first (priority over aliases).
            title = entity.get("title")
            if title and _occurs(title):
                found.append({
                    "match": title,
                    "title": title,
                    "id": entity["id"]
                })
                continue  # title found — no need to check aliases

            # 2. Check aliases; the link target is always the main title.
            aliases = entity.get("aliases")
            if aliases and isinstance(aliases, list):
                for alias in aliases:
                    if alias and _occurs(str(alias)):
                        found.append({
                            "match": alias,
                            "title": title,
                            "id": entity["id"]
                        })
                        break
        return found

    def _get_semantic_suggestions(self, text: str):
        """Thin wrapper around the hybrid retriever (top-5, no explain)."""
        req = QueryRequest(
            query=text,
            top_k=5,
            explain=False
        )
        try:
            # hybrid_retrieve is sync but fast enough for this context.
            res = hybrid_retrieve(req)
            return res.results
        except Exception as e:
            logger.error(f"Semantic suggestion failed: {e}")
            return []