diff --git a/app/core/edges.py b/app/core/edges.py deleted file mode 100644 index c8b7fd7..0000000 --- a/app/core/edges.py +++ /dev/null @@ -1,296 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Modul: app/core/edges.py -Version: 2.0.0 (V2‑superset, rückwärtskompatibel zu v1 vom 2025‑09‑09) - -Zweck ------ -Bewahrt die bestehende Edgelogik (belongs_to, prev/next, references, backlink) -und ergänzt V2‑Felder + Typ‑Default‑Kanten gemäß config/types.yaml (edge_defaults). -Die Funktion ist **idempotent** und **rückwärtskompatibel** zur bisherigen Signatur. - -Kompatibilitätsgarantien (gegenüber v1): -- **Input**: akzeptiert identische Chunk‑Payloads wie v1: - * `id` (Chunk‑ID), `note_id` (Owner), `neighbors.prev|next` (optional), - `references: [{target_id: ...}]` (optional), - alternativ: `chunk_id`, `chunk_index|ord`, `window|text` -- **Output (v1‑Felder)**: `kind`, `source_id`, `target_id`, `scope`, `note_id`, `edge_id` -- **Neu (v2‑Felder)**: `relation`, `src_note_id`, `src_chunk_id?`, `dst_note_id`, `dst_chunk_id?`, - `provenance` (`explicit|rule`), `rule_id?`, `confidence?` - -Regeln ------- -- Deduplizierungsschlüssel: (source_id, target_id, relation, rule_id) -- Strukturkanten: - * belongs_to: 1× pro Chunk - * next/prev: Sequenz der Chunks; nutzt bevorzugt neighbors; sonst ord/chunk_index -- Explizite Referenzen: - * aus Chunk: `references[].target_id` (falls vorhanden) - * Fallback: Wikilinks in `window|text`: [[Some Title|some-id]] oder [[some-id]] -- Note‑Scope: - * backlink immer; references nur, wenn include_note_scope_refs=True -- Typ‑Defaults (edge_defaults aus config/types.yaml des **Quell‑Notiztyps**): - * Für jede explizite Referenz wird je default‑Relation eine Regel‑Kante erzeugt - * rule_id: "type_default:{note_type}:{relation}:v1", provenance="rule" - -Konfiguration -------------- -- ENV MINDNET_TYPES_FILE (Default: ./config/types.yaml) - -Lizenz/Autor ------------- -- Erstimplementierung v1 (2025‑09‑09) — Projekt Mindnet -- Erweiterung v2 (2025‑11‑11) — kompatible Superset‑Implementierung -""" -from __future__ import annotations - -import os -import re -from typing import Dict, Iterable, List, Optional, Tuple, Set - -try: - import yaml # optional, nur für types.yaml -except Exception: # pragma: no cover - yaml = None - -# ------------------------------------------------------------ -# Hilfen: types.yaml laden (edge_defaults) -# ------------------------------------------------------------ - -def _types_path() -> str: - return os.getenv("MINDNET_TYPES_FILE") or "./config/types.yaml" - -def _load_types() -> Dict[str, dict]: - p = _types_path() - if not os.path.isfile(p) or yaml is None: - return {} - try: - with open(p, "r", encoding="utf-8") as f: - data = yaml.safe_load(f) or {} - if isinstance(data, dict) and "types" in data and isinstance(data["types"], dict): - return data["types"] - return data if isinstance(data, dict) else {} - except Exception: - return {} - -def _edge_defaults_for(note_type: Optional[str]) -> List[str]: - types = _load_types() - t = (note_type or "").strip().lower() - cfg = types.get(t) or {} - defaults = cfg.get("edge_defaults") or [] - if isinstance(defaults, str): - defaults = [defaults] - return [str(x) for x in defaults if isinstance(x, (str, int, float))] - -# ------------------------------------------------------------ -# Wikilink‑Parser (Fallback, wenn ch["references"] fehlt) -# ------------------------------------------------------------ - -_WIKILINK_RE = re.compile(r"\[\[(?:[^\|\]]+\|)?([a-zA-Z0-9_\-#:. ]+)\]\]") - -def _extract_wikilinks(text: str) -> List[str]: - ids: List[str] = [] - for m in _WIKILINK_RE.finditer(text or ""): - ids.append(m.group(1).strip()) - return ids - -# ------------------------------------------------------------ -# Utility -# ------------------------------------------------------------ - -def _mk_edge_id(kind: str, s: str, t: str, scope: str, rule_id: Optional[str] = None) -> str: - base = f"{kind}:{s}->{t}#{scope}" - if rule_id: - base += f"|{rule_id}" - try: - import hashlib - return hashlib.blake2s(base.encode("utf-8"), digest_size=12).hexdigest() - except Exception: # pragma: no cover - return base - -def _dedupe(edges: List[Dict]) -> List[Dict]: - seen: Set[Tuple[str,str,str,str]] = set() - out: List[Dict] = [] - for e in edges: - s = str(e.get("source_id") or "") - t = str(e.get("target_id") or "") - rel = str(e.get("relation") or e.get("kind") or "edge") - rule = str(e.get("rule_id") or "") - key = (s, t, rel, rule) - if key in seen: - continue - seen.add(key) - out.append(e) - return out - -def _first(v: dict, *keys, default=None): - for k in keys: - if k in v and v[k] is not None: - return v[k] - return default - -# ------------------------------------------------------------ -# Hauptfunktion -# ------------------------------------------------------------ - -def build_edges_for_note( - note_id: str, - chunk_payloads: List[Dict], - note_level_refs: Optional[List[str]] = None, - *, - include_note_scope_refs: bool = False, -) -> List[Dict]: - edges: List[Dict] = [] - chunks = list(chunk_payloads or []) - # Notiztyp aus erstem Chunk ableiten (kompatibel zu existierenden Payloads) - note_type = (chunks[0].get("type") if chunks else None) or (chunks[0].get("note_type") if chunks else None) - - # --- Strukturkanten ------------------------------------------------------ - # belongs_to - for ch in chunks: - cid = _first(ch, "id", "chunk_id") - if not cid: - continue - owner = ch.get("note_id") or note_id - e = { - "edge_id": _mk_edge_id("belongs_to", cid, note_id, "chunk", "structure:belongs_to:v1"), - "kind": "belongs_to", - "relation": "belongs_to", - "scope": "chunk", - "source_id": cid, - "target_id": note_id, - "note_id": owner, # v1-Kompat - # v2 - "src_note_id": owner, - "src_chunk_id": cid, - "dst_note_id": note_id, - "provenance": "rule", - "rule_id": "structure:belongs_to:v1", - "confidence": 1.0, - } - edges.append(e) - - # next/prev — bevorzugt neighbors.prev/next; sonst via ord/chunk_index - # Map der Chunks nach Index - ordered = list(chunks) - def _idx(c): - return _first(c, "chunk_index", "ord", default=0) - ordered.sort(key=_idx) - - for i, ch in enumerate(ordered): - cid = _first(ch, "id", "chunk_id") - if not cid: - continue - owner = ch.get("note_id") or note_id - nb = ch.get("neighbors") or {} - prev_id = nb.get("prev") - next_id = nb.get("next") - # Fallback-Reihenfolge - if prev_id is None and i > 0: - prev_id = _first(ordered[i-1], "id", "chunk_id") - if next_id is None and i+1 < len(ordered): - next_id = _first(ordered[i+1], "id", "chunk_id") - - if prev_id: - edges.append({ - "edge_id": _mk_edge_id("prev", cid, prev_id, "chunk", "structure:order:v1"), - "kind": "prev", "relation": "prev", "scope": "chunk", - "source_id": cid, "target_id": prev_id, "note_id": owner, - "src_note_id": owner, "src_chunk_id": cid, - "dst_note_id": owner, "dst_chunk_id": prev_id, - "provenance": "rule", "rule_id": "structure:order:v1", "confidence": 0.95, - }) - edges.append({ - "edge_id": _mk_edge_id("next", prev_id, cid, "chunk", "structure:order:v1"), - "kind": "next", "relation": "next", "scope": "chunk", - "source_id": prev_id, "target_id": cid, "note_id": owner, - "src_note_id": owner, "src_chunk_id": prev_id, - "dst_note_id": owner, "dst_chunk_id": cid, - "provenance": "rule", "rule_id": "structure:order:v1", "confidence": 0.95, - }) - - # --- Explizite Referenzen (Chunk‑Scope) --------------------------------- - explicit_refs: List[Dict] = [] - for ch in chunks: - cid = _first(ch, "id", "chunk_id") - if not cid: - continue - owner = ch.get("note_id") or note_id - # 1) bevorzugt vorhandene ch["references"] - refs = ch.get("references") or [] - targets = [r.get("target_id") for r in refs if isinstance(r, dict) and r.get("target_id")] - # 2) Fallback: Wikilinks aus Text - if not targets: - text = _first(ch, "window", "text", default="") or "" - targets = _extract_wikilinks(text) - for tid in targets: - if not isinstance(tid, str) or not tid.strip(): - continue - e = { - "edge_id": _mk_edge_id("references", cid, tid, "chunk"), - "kind": "references", - "relation": "references", - "scope": "chunk", - "source_id": cid, - "target_id": tid, - "note_id": owner, - # v2 - "src_note_id": owner, - "src_chunk_id": cid, - "dst_note_id": tid, - "provenance": "explicit", - "rule_id": "", - "confidence": 1.0, - } - edges.append(e) - explicit_refs.append(e) - - # --- Note‑Scope: references (optional) + backlink (immer) ---------------- - unique_refs = [] - if note_level_refs: - seen = set() - for tid in note_level_refs: - if isinstance(tid, str) and tid.strip() and tid not in seen: - unique_refs.append(tid); seen.add(tid) - - for tid in unique_refs: - if include_note_scope_refs: - edges.append({ - "edge_id": _mk_edge_id("references", note_id, tid, "note"), - "kind": "references", "relation": "references", "scope": "note", - "source_id": note_id, "target_id": tid, "note_id": note_id, - "src_note_id": note_id, "dst_note_id": tid, - "provenance": "explicit", "rule_id": "", "confidence": 1.0, - }) - edges.append({ - "edge_id": _mk_edge_id("backlink", tid, note_id, "note", "derived:backlink:v1"), - "kind": "backlink", "relation": "backlink", "scope": "note", - "source_id": tid, "target_id": note_id, "note_id": note_id, - "src_note_id": tid, "dst_note_id": note_id, - "provenance": "rule", "rule_id": "derived:backlink:v1", "confidence": 0.9, - }) - - # --- Type‑Defaults je expliziter Referenz -------------------------------- - defaults = [d for d in _edge_defaults_for(note_type) if d and d != "references"] - if defaults: - for e in explicit_refs + ([ ] if not include_note_scope_refs else []): - # wir nutzen die bereits erzeugten explicit‑Edges als Vorlage - src = e["source_id"]; tgt = e["target_id"] - scope = e.get("scope", "chunk") - s_note = e.get("src_note_id") or note_id - s_chunk = e.get("src_chunk_id") - t_note = e.get("dst_note_id") or tgt - for rel in defaults: - rule_id = f"type_default:{(note_type or 'unknown')}:{rel}:v1" - edges.append({ - "edge_id": _mk_edge_id(rel, src, tgt, scope, rule_id), - "kind": rel, "relation": rel, "scope": scope, - "source_id": src, "target_id": tgt, "note_id": s_note, - "src_note_id": s_note, "src_chunk_id": s_chunk, - "dst_note_id": t_note, - "provenance": "rule", "rule_id": rule_id, "confidence": 0.7, - }) - - # --- Dedupe & Return ----------------------------------------------------- - return _dedupe(edges) diff --git a/app/core/edges_writer.py b/app/core/edges_writer.py deleted file mode 100644 index 066e44a..0000000 --- a/app/core/edges_writer.py +++ /dev/null @@ -1,94 +0,0 @@ -# app/core/edges_writer.py -from __future__ import annotations -import hashlib -from typing import Dict, List, Iterable, Tuple - -try: - # Dein Modul mit der Schemadefinition und der Builder-Funktion - from app.core.edges import build_edges_for_note # noqa: F401 -except Exception as e: - raise RuntimeError("Konnte app.core.edges nicht importieren. " - "Bitte sicherstellen, dass app/core/edges.py vorhanden ist.") from e - -def _edge_uid(kind: str, source_id: str, target_id: str, scope: str) -> str: - """ - Deterministische, kurze ID für eine Edge. - Kollisionen sind praktisch ausgeschlossen (BLAKE2s über den Kanonischen Schlüssel). - """ - key = f"{kind}|{source_id}|{target_id}|{scope}" - return hashlib.blake2s(key.encode("utf-8"), digest_size=12).hexdigest() - -def ensure_edges_collection(qdrant_client, collection: str) -> None: - """ - Legt die Edge-Collection an, falls sie nicht existiert. - Minimal: 1D-Vector (Dummy), Cosine. Payload-only-Collections sind je nach Qdrant-Version heikel. - """ - from qdrant_client.http import models as qm - - existing = [c.name for c in qdrant_client.get_collections().collections] - if collection in existing: - return - - qdrant_client.recreate_collection( - collection_name=collection, - vectors_config=qm.VectorParams(size=1, distance=qm.Distance.COSINE), - on_disk_payload=True, - ) - -def edges_from_note( - note_id: str, - chunk_payloads: List[Dict], - note_level_refs: Iterable[str] | None, - *, - include_note_scope_refs: bool = False, -) -> List[Dict]: - """ - Ruft deinen Edge-Builder auf und gibt die (deduplizierten) Edge-Payloads zurück. - Keine Schemaänderung – exakt das aus app/core/edges.py. - """ - return build_edges_for_note( - note_id=note_id, - chunk_payloads=chunk_payloads, - note_level_refs=list(note_level_refs or []), - include_note_scope_refs=include_note_scope_refs, - ) - -def upsert_edges( - qdrant_client, - collection: str, - edge_payloads: List[Dict], -) -> Tuple[int, int]: - """ - Schreibt Edges als Points in Qdrant. - - id: deterministisch aus (kind, source_id, target_id, scope) - - vector: [0.0] Dummy - - payload: Edge-Dict (unverändert, siehe Schema in app/core/edges.py) - Gibt (anzahl_points, anzahl_unique_keys) zurück. - """ - from qdrant_client.models import PointStruct - - if not edge_payloads: - return 0, 0 - - points = [] - seen = set() - for e in edge_payloads: - key = (e.get("kind"), e.get("source_id"), e.get("target_id"), e.get("scope")) - if key in seen: - continue - seen.add(key) - eid = _edge_uid(*key) - points.append( - PointStruct( - id=eid, - vector=[0.0], - payload=e, - ) - ) - - if not points: - return 0, 0 - - ensure_edges_collection(qdrant_client, collection) - qdrant_client.upsert(collection_name=collection, points=points) - return len(points), len(seen) diff --git a/app/core/env_vars.py b/app/core/env_vars.py deleted file mode 100644 index b1f2d4b..0000000 --- a/app/core/env_vars.py +++ /dev/null @@ -1,103 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Datei: app/core/env_vars.py -Version: 1.1.0 (2025-11-08) - -Zweck - Einheitliche Auflösung von ENV-Variablen (Prefix, Qdrant, Embeddings, Hashing) - mit Abwärtskompatibilität. - -Grundsatz - - Für Qdrant-Funktionen ist 'COLLECTION_PREFIX' der Primärschlüssel. - - 'MINDNET_PREFIX' bleibt für App-/UI-/Exporter-Kontexte nutzbar. - - Fallbacks sorgen dafür, dass ältere Umgebungen weiter funktionieren. - -Wichtig - - Lädt optional eine .env (wenn python-dotenv verfügbar ist). - - Überschreibt keine bereits gesetzten OS-Variablen (override=False). -""" -from __future__ import annotations - -import os -from typing import Optional, Dict - -# Optional: .env automatisch laden (ohne Hard-Fail, falls nicht vorhanden) -try: - from dotenv import load_dotenv, find_dotenv # type: ignore - _p = find_dotenv() - if _p: - load_dotenv(_p, override=False) -except Exception: - pass - -# -------- Prefix-Auflösung -------- - -def get_collection_prefix(cli_override: Optional[str] = None) -> str: - """ - Für Qdrant-relevante Funktionen: - 1) CLI-Override (--prefix) - 2) ENV COLLECTION_PREFIX - 3) ENV MINDNET_PREFIX (Fallback) - 4) 'mindnet' (Default) - """ - if cli_override and str(cli_override).strip(): - return str(cli_override).strip() - return ( - os.getenv("COLLECTION_PREFIX") - or os.getenv("MINDNET_PREFIX") - or "mindnet" - ) - -def get_mindnet_prefix(cli_override: Optional[str] = None) -> str: - """ - Für App-/UI-/Exporter-Kontexte: - 1) CLI-Override (--prefix) - 2) ENV MINDNET_PREFIX - 3) ENV COLLECTION_PREFIX (Fallback) - 4) 'mindnet' - """ - if cli_override and str(cli_override).strip(): - return str(cli_override).strip() - return ( - os.getenv("MINDNET_PREFIX") - or os.getenv("COLLECTION_PREFIX") - or "mindnet" - ) - -def get_prefix(cli_override: Optional[str] = None, target: str = "qdrant") -> str: - """ - Universelle Hülle (abwärtskompatibel): - target='qdrant' -> get_collection_prefix - target='app' -> get_mindnet_prefix - """ - if target.lower() == "app": - return get_mindnet_prefix(cli_override) - return get_collection_prefix(cli_override) - -# -------- Qdrant / Embeddings / Hashing -------- - -def get_qdrant_url(default: str = "http://127.0.0.1:6333") -> str: - return os.getenv("QDRANT_URL", default) - -def get_qdrant_api_key(default: str = "") -> str: - return os.getenv("QDRANT_API_KEY", default) - -def get_vector_dim(default: int = 384) -> int: - try: - return int(os.getenv("VECTOR_DIM", str(default))) - except Exception: - return default - -def get_embed_url(default: Optional[str] = None) -> Optional[str]: - return os.getenv("EMBED_URL", default) - -def get_hash_env() -> Dict[str, str]: - """ - Liefert die Hash-Konfiguration (nur Aggregation; die Auswertung bleibt in den Skripten). - """ - return { - "MINDNET_HASH_COMPARE": os.getenv("MINDNET_HASH_COMPARE", ""), - "MINDNET_HASH_SOURCE": os.getenv("MINDNET_HASH_SOURCE", ""), - "MINDNET_HASH_NORMALIZE": os.getenv("MINDNET_HASH_NORMALIZE", ""), - } diff --git a/app/core/ranking.py b/app/core/ranking.py deleted file mode 100644 index e9105f3..0000000 --- a/app/core/ranking.py +++ /dev/null @@ -1,56 +0,0 @@ -""" -app/core/ranking.py — Kombiniertes Scoring (WP-04) - -Zweck: - Zusammenführen von semantischem Score (normalisiert), Edge-Bonus und - Centrality-Bonus in einen Gesamtscore für die Ergebnisreihung. -Kompatibilität: - Python 3.12+ -Version: - 0.1.0 (Erstanlage) -Stand: - 2025-10-07 -Bezug: - WP-04 Ranking-Formel (w_sem, w_edge, w_cent) -Nutzung: - from app.core.ranking import combine_scores -Änderungsverlauf: - 0.1.0 (2025-10-07) – Erstanlage. -""" - -from __future__ import annotations -from typing import List, Tuple, Dict - - -def normalize_scores(values: List[float]) -> List[float]: - """Min-Max-Normalisierung über die Kandidatenmenge (Fallback 0.5 bei Konstanz).""" - if not values: - return values - lo, hi = min(values), max(values) - if hi - lo < 1e-9: - return [0.5] * len(values) - return [(v - lo) / (hi - lo) for v in values] - - -def combine_scores( - hits: List[Tuple[str, float, dict]], # (id, semantic_score, payload) - edge_bonus_map: Dict[str, float], - centrality_map: Dict[str, float], - w_sem: float = 0.70, - w_edge: float = 0.25, - w_cent: float = 0.05, -) -> List[Tuple[str, float, float, float, float]]: - """ - Liefert Liste von (point_id, total_score, edge_bonus, centrality_bonus, raw_semantic_score), - absteigend nach total_score sortiert. - """ - sem = [h[1] for h in hits] - sem_n = normalize_scores(sem) - out = [] - for (pid, s, payload), s_norm in zip(hits, sem_n): - e = edge_bonus_map.get(pid, 0.0) - c = centrality_map.get(pid, 0.0) - total = w_sem * s_norm + w_edge * e + w_cent * c - out.append((pid, total, e, c, s)) - out.sort(key=lambda t: t[1], reverse=True) - return out diff --git a/app/core/retriever_config.py b/app/core/retriever_config.py deleted file mode 100644 index fcf7bc7..0000000 --- a/app/core/retriever_config.py +++ /dev/null @@ -1,116 +0,0 @@ -"""app/core/retriever_config.py ---------------------------------- -Zentrale Konfiguration für den mindnet-Retriever (WP-04). - -Zweck: - - Lädt config/retriever.yaml (falls vorhanden) oder nutzt sinnvolle Defaults. - - Bietet einen gecachten Zugriff auf die Retriever-Config für - andere Module (z. B. graph_adapter, retriever). - -Hinweis zur Weiterentwicklung (Selbstjustierung): - - Die hier definierten Parameter sind so gewählt, dass sie später - durch ein Feedback-/Learning-to-Rank-Modell überschrieben werden - können, ohne die restliche Architektur anzupassen. -""" - -from __future__ import annotations - -import os -from dataclasses import dataclass -from functools import lru_cache -from pathlib import Path -from typing import Dict - -try: - import yaml # type: ignore -except Exception: # pragma: no cover - Fallback, falls PyYAML nicht installiert ist. - yaml = None # type: ignore - -@dataclass(frozen=True) -class RetrieverConfig: - semantic_scale: float - edge_scale: float - centrality_scale: float - edge_weights: Dict[str, float] - -@lru_cache -def get_retriever_config() -> RetrieverConfig: - """Lädt die Retriever-Konfiguration (YAML + Defaults). - - Reihenfolge: - 1. Defaults (sinnvoll gewählte Startwerte). - 2. Optional: config/retriever.yaml bzw. Pfad aus ENV - MINDNET_RETRIEVER_CONFIG überschreibt die Defaults. - - Die Funktion ist bewusst gecached, da sich die Konfiguration zur - Laufzeit üblicherweise nicht ändert. Für dynamisches Nachladen - könnte der Cache explizit geleert werden. - """ - - # 1) Defaults – bewusst konservativ gewählt. - semantic_scale = 1.0 - edge_scale = 1.0 - centrality_scale = 1.0 - - edge_weights: Dict[str, float] = { - # Wissens-Kanten - "depends_on": 1.0, - "related_to": 0.7, - "similar_to": 0.7, - "references": 0.5, - # Struktur-Kanten - "belongs_to": 0.2, - "next": 0.1, - "prev": 0.1, - # Sonstige / technische Kanten - "backlink": 0.2, - "references_at": 0.2, - } - - # 2) Optional: YAML-Konfiguration laden - cfg_path_env = os.getenv("MINDNET_RETRIEVER_CONFIG") - if cfg_path_env: - cfg_path = Path(cfg_path_env) - else: - # Project-Root = zwei Ebenen über app/core/ - cfg_path = Path(__file__).resolve().parents[2] / "config" / "retriever.yaml" - - if yaml is not None and cfg_path.exists(): - try: - data = yaml.safe_load(cfg_path.read_text(encoding="utf-8")) or {} - except Exception: - data = {} - - retr = data.get("retriever") or {} - - # Skalenwerte überschreiben, falls angegeben - try: - semantic_scale = float(retr.get("semantic_scale", semantic_scale)) - except (TypeError, ValueError): - pass - - try: - edge_scale = float(retr.get("edge_scale", edge_scale)) - except (TypeError, ValueError): - pass - - try: - centrality_scale = float(retr.get("centrality_scale", centrality_scale)) - except (TypeError, ValueError): - pass - - # Edge-Gewichte je Kanten-Typ - ew_cfg = retr.get("edge_weights") or {} - if isinstance(ew_cfg, dict): - for k, v in ew_cfg.items(): - try: - edge_weights[str(k)] = float(v) - except (TypeError, ValueError): - continue - - return RetrieverConfig( - semantic_scale=semantic_scale, - edge_scale=edge_scale, - centrality_scale=centrality_scale, - edge_weights=edge_weights, - ) diff --git a/app/core/schema_loader.py b/app/core/schema_loader.py deleted file mode 100644 index 019755f..0000000 --- a/app/core/schema_loader.py +++ /dev/null @@ -1,22 +0,0 @@ -from __future__ import annotations -import json -import os -from functools import lru_cache -from jsonschema import Draft202012Validator, RefResolver - -SCHEMAS_DIR = os.getenv("SCHEMAS_DIR", os.path.join(os.path.dirname(os.path.dirname(__file__)), "..", "schemas")) - -@lru_cache(maxsize=16) -def load_schema(name: str) -> dict: - # name: "note.schema.json" | "chunk.schema.json" | "edge.schema.json" - path = os.path.join(SCHEMAS_DIR, name) - if not os.path.isfile(path): - raise FileNotFoundError(f"Schema not found: {path}") - with open(path, "r", encoding="utf-8") as f: - return json.load(f) - -@lru_cache(maxsize=16) -def get_validator(name: str) -> Draft202012Validator: - schema = load_schema(name) - resolver = RefResolver.from_schema(schema) - return Draft202012Validator(schema, resolver=resolver) diff --git a/app/core/validate_note.py b/app/core/validate_note.py deleted file mode 100644 index 1fa77a6..0000000 --- a/app/core/validate_note.py +++ /dev/null @@ -1,16 +0,0 @@ -from __future__ import annotations -from typing import Dict -from jsonschema import ValidationError -from .schema_loader import get_validator - -NOTE_SCHEMA_NAME = "note.schema.json" - -def validate_note_payload(payload: Dict) -> None: - validator = get_validator(NOTE_SCHEMA_NAME) - errors = sorted(validator.iter_errors(payload), key=lambda e: e.path) - if errors: - msgs = [] - for e in errors: - loc = ".".join([str(x) for x in e.path]) or "" - msgs.append(f"{loc}: {e.message}") - raise ValidationError(" | ".join(msgs)) diff --git a/app/embed_server.py b/app/embed_server.py deleted file mode 100644 index fdc3213..0000000 --- a/app/embed_server.py +++ /dev/null @@ -1,40 +0,0 @@ -""" -Version 1 -""" -from __future__ import annotations -from fastapi import FastAPI, HTTPException -from pydantic import BaseModel -from typing import List, Optional -from sentence_transformers import SentenceTransformer - -app = FastAPI(title="mindnet-embed", version="1.0") - -MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2" # 384-dim -_model: SentenceTransformer | None = None - -class EmbedIn(BaseModel): - model: Optional[str] = None - inputs: List[str] - -class EmbedOut(BaseModel): - embeddings: List[List[float]] - -@app.on_event("startup") -def _load_model(): - global _model - _model = SentenceTransformer(MODEL_NAME) - -@app.get("/health") -def health(): - return {"ok": True, "model": MODEL_NAME, "dim": 384} - -@app.post("/embed", response_model=EmbedOut) -def embed(payload: EmbedIn) -> EmbedOut: - if _model is None: - raise HTTPException(status_code=503, detail="Model not loaded") - if not payload.inputs: - return EmbedOut(embeddings=[]) - vecs = _model.encode(payload.inputs, normalize_embeddings=False).tolist() - if any(len(v) != 384 for v in vecs): - raise HTTPException(status_code=500, detail="Embedding size mismatch (expected 384)") - return EmbedOut(embeddings=vecs) diff --git a/app/graph/service.py b/app/graph/service.py deleted file mode 100644 index 40b430e..0000000 --- a/app/graph/service.py +++ /dev/null @@ -1,172 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Modul: app/graph/service.py -Version: 0.1.0 -Datum: 2025-09-10 - -Zweck ------ -Leichtgewichtiger Graph-Layer über Qdrant: - - get_note(note_id) - - get_chunks(note_id) - - neighbors(source_id, kinds=[...], scope=['note','chunk'], depth=1) - - walk_bfs(source_id, kinds, max_depth) - - context_for_note(note_id, max_neighbors): heuristische Kontextsammlung - -Hinweise --------- -- Nutzt die bestehenden Collections _notes/_chunks/_edges. -- Edges werden über Payload-Felder (`kind`, `source_id`, `target_id`) abgefragt. -""" -from __future__ import annotations -from typing import List, Dict, Any, Optional, Iterable, Set, Tuple -from qdrant_client.http import models as rest -from app.core.qdrant import QdrantConfig, get_client - -def _cols(prefix: str): - return f"{prefix}_notes", f"{prefix}_chunks", f"{prefix}_edges" - -class GraphService: - def __init__(self, cfg: Optional[QdrantConfig] = None, prefix: Optional[str] = None): - self.cfg = cfg or QdrantConfig.from_env() - if prefix: - self.cfg.prefix = prefix - self.client = get_client(self.cfg) - self.notes_col, self.chunks_col, self.edges_col = _cols(self.cfg.prefix) - - # ------------------------ fetch helpers ------------------------ - def _scroll(self, col: str, flt: Optional[rest.Filter] = None, limit: int = 256): - out = [] - nextp = None - while True: - pts, nextp = self.client.scroll( - collection_name=col, - with_payload=True, - with_vectors=False, - limit=limit, - offset=nextp, - scroll_filter=flt, - ) - if not pts: - break - out.extend(pts) - if nextp is None: - break - return out - - # ------------------------ public API --------------------------- - def get_note(self, note_id: str) -> Optional[Dict[str, Any]]: - f = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))]) - pts, _ = self.client.scroll(self.notes_col, with_payload=True, with_vectors=False, limit=1, scroll_filter=f) - return (pts[0].payload or None) if pts else None - - def get_chunks(self, note_id: str) -> List[Dict[str, Any]]: - f = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))]) - pts = self._scroll(self.chunks_col, f) - # Sortierung analog Export - def key(pl): - p = pl.payload or {} - s = p.get("seq") or 0 - ci = p.get("chunk_index") or 0 - n = 0 - cid = p.get("chunk_id") or "" - if isinstance(cid, str) and "#" in cid: - try: - n = int(cid.rsplit("#", 1)[-1]) - except Exception: - n = 0 - return (int(s), int(ci), n) - pts_sorted = sorted(pts, key=key) - return [p.payload or {} for p in pts_sorted] - - def neighbors(self, source_id: str, kinds: Optional[Iterable[str]] = None, - scope: Optional[Iterable[str]] = None, depth: int = 1) -> Dict[str, List[Dict[str, Any]]]: - """ - Liefert eingehende & ausgehende Nachbarn (nur nach kind gefiltert). - depth==1: direkte Kanten. - """ - kinds = list(kinds) if kinds else None - must = [rest.FieldCondition(key="source_id", match=rest.MatchValue(value=source_id))] - if kinds: - must.append(rest.FieldCondition(key="kind", match=rest.MatchAny(any=kinds))) - f = rest.Filter(must=must) - edges = self._scroll(self.edges_col, f) - out = {"out": [], "in": []} - for e in edges: - out["out"].append(e.payload or {}) - # Inverse Richtung (eingehend) - must_in = [rest.FieldCondition(key="target_id", match=rest.MatchValue(value=source_id))] - if kinds: - must_in.append(rest.FieldCondition(key="kind", match=rest.MatchAny(any=kinds))) - f_in = rest.Filter(must=must_in) - edges_in = self._scroll(self.edges_col, f_in) - for e in edges_in: - out["in"].append(e.payload or {}) - return out - - def walk_bfs(self, source_id: str, kinds: Iterable[str], max_depth: int = 2) -> Set[str]: - visited: Set[str] = {source_id} - frontier: Set[str] = {source_id} - kinds = list(kinds) - for _ in range(max_depth): - nxt: Set[str] = set() - for s in frontier: - neigh = self.neighbors(s, kinds=kinds) - for e in neigh["out"]: - t = e.get("target_id") - if isinstance(t, str) and t not in visited: - visited.add(t) - nxt.add(t) - frontier = nxt - if not frontier: - break - return visited - - def context_for_note(self, note_id: str, kinds: Iterable[str] = ("references","backlink"), max_neighbors: int = 12) -> Dict[str, Any]: - """ - Heuristischer Kontext: eigene Chunks + Nachbarn nach Kantenarten, dedupliziert. - """ - note = self.get_note(note_id) or {} - chunks = self.get_chunks(note_id) - neigh = self.neighbors(note_id, kinds=list(kinds)) - targets = [] - for e in neigh["out"]: - t = e.get("target_id") - if isinstance(t, str): - targets.append(t) - for e in neigh["in"]: - s = e.get("source_id") - if isinstance(s, str): - targets.append(s) - # de-dupe - seen = set() - uniq = [] - for t in targets: - if t not in seen: - seen.add(t) - uniq.append(t) - uniq = uniq[:max_neighbors] - neighbor_notes = [self.get_note(t) for t in uniq] - return { - "note": note, - "chunks": chunks, - "neighbors": [n for n in neighbor_notes if n], - "edges_out": neigh["out"], - "edges_in": neigh["in"], - } - -# Optional: Mini-CLI -if __name__ == "__main__": # pragma: no cover - import argparse, json - ap = argparse.ArgumentParser() - ap.add_argument("--prefix", help="Collection-Prefix (überschreibt ENV)") - ap.add_argument("--note-id", required=True) - ap.add_argument("--neighbors", action="store_true", help="Nur Nachbarn anzeigen") - args = ap.parse_args() - svc = GraphService(prefix=args.prefix) - if args.neighbors: - out = svc.neighbors(args.note_id, kinds=["references","backlink","prev","next","belongs_to"]) - else: - out = svc.context_for_note(args.note_id) - print(json.dumps(out, ensure_ascii=False, indent=2)) diff --git a/app/main.py b/app/main.py index 521bc9a..1e2969c 100644 --- a/app/main.py +++ b/app/main.py @@ -10,8 +10,8 @@ LAST_ANALYSIS: 2025-12-15 from __future__ import annotations from fastapi import FastAPI from .config import get_settings -from .routers.embed_router import router as embed_router -from .routers.qdrant_router import router as qdrant_router +#from .routers.embed_router import router as embed_router +#from .routers.qdrant_router import router as qdrant_router from .routers.query import router as query_router from .routers.graph import router as graph_router @@ -35,8 +35,8 @@ def create_app() -> FastAPI: def healthz(): return {"status": "ok", "qdrant": s.QDRANT_URL, "prefix": s.COLLECTION_PREFIX} - app.include_router(embed_router) - app.include_router(qdrant_router) +# app.include_router(embed_router) +# app.include_router(qdrant_router) app.include_router(query_router, prefix="/query", tags=["query"]) app.include_router(graph_router, prefix="/graph", tags=["graph"])