löschen von Zombies
This commit is contained in:
parent
9025af62f0
commit
60092b378b
|
|
@ -1,296 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Modul: app/core/edges.py
|
||||
Version: 2.0.0 (V2‑superset, rückwärtskompatibel zu v1 vom 2025‑09‑09)
|
||||
|
||||
Zweck
|
||||
-----
|
||||
Bewahrt die bestehende Edgelogik (belongs_to, prev/next, references, backlink)
|
||||
und ergänzt V2‑Felder + Typ‑Default‑Kanten gemäß config/types.yaml (edge_defaults).
|
||||
Die Funktion ist **idempotent** und **rückwärtskompatibel** zur bisherigen Signatur.
|
||||
|
||||
Kompatibilitätsgarantien (gegenüber v1):
|
||||
- **Input**: akzeptiert identische Chunk‑Payloads wie v1:
|
||||
* `id` (Chunk‑ID), `note_id` (Owner), `neighbors.prev|next` (optional),
|
||||
`references: [{target_id: ...}]` (optional),
|
||||
alternativ: `chunk_id`, `chunk_index|ord`, `window|text`
|
||||
- **Output (v1‑Felder)**: `kind`, `source_id`, `target_id`, `scope`, `note_id`, `edge_id`
|
||||
- **Neu (v2‑Felder)**: `relation`, `src_note_id`, `src_chunk_id?`, `dst_note_id`, `dst_chunk_id?`,
|
||||
`provenance` (`explicit|rule`), `rule_id?`, `confidence?`
|
||||
|
||||
Regeln
|
||||
------
|
||||
- Deduplizierungsschlüssel: (source_id, target_id, relation, rule_id)
|
||||
- Strukturkanten:
|
||||
* belongs_to: 1× pro Chunk
|
||||
* next/prev: Sequenz der Chunks; nutzt bevorzugt neighbors; sonst ord/chunk_index
|
||||
- Explizite Referenzen:
|
||||
* aus Chunk: `references[].target_id` (falls vorhanden)
|
||||
* Fallback: Wikilinks in `window|text`: [[Some Title|some-id]] oder [[some-id]]
|
||||
- Note‑Scope:
|
||||
* backlink immer; references nur, wenn include_note_scope_refs=True
|
||||
- Typ‑Defaults (edge_defaults aus config/types.yaml des **Quell‑Notiztyps**):
|
||||
* Für jede explizite Referenz wird je default‑Relation eine Regel‑Kante erzeugt
|
||||
* rule_id: "type_default:{note_type}:{relation}:v1", provenance="rule"
|
||||
|
||||
Konfiguration
|
||||
-------------
|
||||
- ENV MINDNET_TYPES_FILE (Default: ./config/types.yaml)
|
||||
|
||||
Lizenz/Autor
|
||||
------------
|
||||
- Erstimplementierung v1 (2025‑09‑09) — Projekt Mindnet
|
||||
- Erweiterung v2 (2025‑11‑11) — kompatible Superset‑Implementierung
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import re
|
||||
from typing import Dict, Iterable, List, Optional, Tuple, Set
|
||||
|
||||
try:
|
||||
import yaml # optional, nur für types.yaml
|
||||
except Exception: # pragma: no cover
|
||||
yaml = None
|
||||
|
||||
# ------------------------------------------------------------
|
||||
# Hilfen: types.yaml laden (edge_defaults)
|
||||
# ------------------------------------------------------------
|
||||
|
||||
def _types_path() -> str:
|
||||
return os.getenv("MINDNET_TYPES_FILE") or "./config/types.yaml"
|
||||
|
||||
def _load_types() -> Dict[str, dict]:
    """Load the note-type registry from types.yaml.

    Returns {} on any problem: missing file, PyYAML not installed, parse error,
    or a YAML document that is not a mapping.
    """
    path = _types_path()
    if yaml is None or not os.path.isfile(path):
        return {}
    try:
        with open(path, "r", encoding="utf-8") as fh:
            parsed = yaml.safe_load(fh) or {}
    except Exception:
        return {}
    if not isinstance(parsed, dict):
        return {}
    # A top-level "types" mapping wins; otherwise the whole document is the registry.
    nested = parsed.get("types")
    if isinstance(nested, dict):
        return nested
    return parsed
|
||||
|
||||
def _edge_defaults_for(note_type: Optional[str]) -> List[str]:
    """Return the edge_defaults configured for *note_type* (case-insensitive; [] if unknown).

    A scalar string in the config is treated as a one-element list; non-scalar
    entries are silently dropped.
    """
    registry = _load_types()
    key = (note_type or "").strip().lower()
    type_cfg = registry.get(key) or {}
    raw = type_cfg.get("edge_defaults") or []
    if isinstance(raw, str):
        raw = [raw]
    result: List[str] = []
    for item in raw:
        if isinstance(item, (str, int, float)):
            result.append(str(item))
    return result
|
||||
|
||||
# ------------------------------------------------------------
|
||||
# Wikilink‑Parser (Fallback, wenn ch["references"] fehlt)
|
||||
# ------------------------------------------------------------
|
||||
|
||||
_WIKILINK_RE = re.compile(r"\[\[(?:[^\|\]]+\|)?([a-zA-Z0-9_\-#:. ]+)\]\]")
|
||||
|
||||
def _extract_wikilinks(text: str) -> List[str]:
    """Collect wikilink targets from *text*: [[Title|some-id]] and [[some-id]] both yield "some-id"."""
    return [match.group(1).strip() for match in _WIKILINK_RE.finditer(text or "")]
|
||||
|
||||
# ------------------------------------------------------------
|
||||
# Utility
|
||||
# ------------------------------------------------------------
|
||||
|
||||
def _mk_edge_id(kind: str, s: str, t: str, scope: str, rule_id: Optional[str] = None) -> str:
|
||||
base = f"{kind}:{s}->{t}#{scope}"
|
||||
if rule_id:
|
||||
base += f"|{rule_id}"
|
||||
try:
|
||||
import hashlib
|
||||
return hashlib.blake2s(base.encode("utf-8"), digest_size=12).hexdigest()
|
||||
except Exception: # pragma: no cover
|
||||
return base
|
||||
|
||||
def _dedupe(edges: List[Dict]) -> List[Dict]:
|
||||
seen: Set[Tuple[str,str,str,str]] = set()
|
||||
out: List[Dict] = []
|
||||
for e in edges:
|
||||
s = str(e.get("source_id") or "")
|
||||
t = str(e.get("target_id") or "")
|
||||
rel = str(e.get("relation") or e.get("kind") or "edge")
|
||||
rule = str(e.get("rule_id") or "")
|
||||
key = (s, t, rel, rule)
|
||||
if key in seen:
|
||||
continue
|
||||
seen.add(key)
|
||||
out.append(e)
|
||||
return out
|
||||
|
||||
def _first(v: dict, *keys, default=None):
|
||||
for k in keys:
|
||||
if k in v and v[k] is not None:
|
||||
return v[k]
|
||||
return default
|
||||
|
||||
# ------------------------------------------------------------
|
||||
# Hauptfunktion
|
||||
# ------------------------------------------------------------
|
||||
|
||||
def build_edges_for_note(
    note_id: str,
    chunk_payloads: List[Dict],
    note_level_refs: Optional[List[str]] = None,
    *,
    include_note_scope_refs: bool = False,
) -> List[Dict]:
    """Build all edges for one note.

    Produces structural edges (belongs_to, prev/next), explicit chunk-scope
    references (from `references[].target_id` or wikilinks in `window|text`),
    note-scope backlinks (always) and note-scope references (only when
    *include_note_scope_refs* is True), plus one rule edge per type-default
    relation for every explicit reference. Idempotent: the result is
    deduplicated on (source_id, target_id, relation, rule_id).

    Parameters
    ----------
    note_id: owning note id.
    chunk_payloads: v1-compatible chunk dicts (`id|chunk_id`, `note_id`,
        `neighbors.prev|next`, `references`, `chunk_index|ord`, `window|text`).
    note_level_refs: target note ids referenced at note scope.
    include_note_scope_refs: also emit note-scope "references" edges.
    """
    edges: List[Dict] = []
    chunks = list(chunk_payloads or [])
    # Derive the note type from the first chunk (compatible with existing payloads).
    note_type = (chunks[0].get("type") if chunks else None) or (chunks[0].get("note_type") if chunks else None)

    # --- structural edges ----------------------------------------------------
    # belongs_to: exactly one per chunk
    for ch in chunks:
        cid = _first(ch, "id", "chunk_id")
        if not cid:
            continue
        owner = ch.get("note_id") or note_id
        edges.append({
            "edge_id": _mk_edge_id("belongs_to", cid, note_id, "chunk", "structure:belongs_to:v1"),
            "kind": "belongs_to",
            "relation": "belongs_to",
            "scope": "chunk",
            "source_id": cid,
            "target_id": note_id,
            "note_id": owner,  # v1 compat
            # v2
            "src_note_id": owner,
            "src_chunk_id": cid,
            "dst_note_id": note_id,
            "provenance": "rule",
            "rule_id": "structure:belongs_to:v1",
            "confidence": 1.0,
        })

    # next/prev — prefer neighbors.prev; otherwise fall back to ord/chunk_index.
    # Each prev link also yields the symmetric next link, so only `prev` is resolved.
    ordered = list(chunks)

    def _idx(c):
        # Sort key: explicit chunk_index wins, then ord, then 0.
        return _first(c, "chunk_index", "ord", default=0)

    ordered.sort(key=_idx)

    for i, ch in enumerate(ordered):
        cid = _first(ch, "id", "chunk_id")
        if not cid:
            continue
        owner = ch.get("note_id") or note_id
        nb = ch.get("neighbors") or {}
        prev_id = nb.get("prev")
        # Fallback ordering when neighbors metadata is absent.
        if prev_id is None and i > 0:
            prev_id = _first(ordered[i - 1], "id", "chunk_id")

        if prev_id:
            edges.append({
                "edge_id": _mk_edge_id("prev", cid, prev_id, "chunk", "structure:order:v1"),
                "kind": "prev", "relation": "prev", "scope": "chunk",
                "source_id": cid, "target_id": prev_id, "note_id": owner,
                "src_note_id": owner, "src_chunk_id": cid,
                "dst_note_id": owner, "dst_chunk_id": prev_id,
                "provenance": "rule", "rule_id": "structure:order:v1", "confidence": 0.95,
            })
            edges.append({
                "edge_id": _mk_edge_id("next", prev_id, cid, "chunk", "structure:order:v1"),
                "kind": "next", "relation": "next", "scope": "chunk",
                "source_id": prev_id, "target_id": cid, "note_id": owner,
                "src_note_id": owner, "src_chunk_id": prev_id,
                "dst_note_id": owner, "dst_chunk_id": cid,
                "provenance": "rule", "rule_id": "structure:order:v1", "confidence": 0.95,
            })

    # --- explicit references (chunk scope) -----------------------------------
    explicit_refs: List[Dict] = []
    for ch in chunks:
        cid = _first(ch, "id", "chunk_id")
        if not cid:
            continue
        owner = ch.get("note_id") or note_id
        # 1) prefer existing ch["references"]
        refs = ch.get("references") or []
        targets = [r.get("target_id") for r in refs if isinstance(r, dict) and r.get("target_id")]
        # 2) fallback: wikilinks parsed from the chunk text
        if not targets:
            text = _first(ch, "window", "text", default="") or ""
            targets = _extract_wikilinks(text)
        for tid in targets:
            if not isinstance(tid, str) or not tid.strip():
                continue
            e = {
                "edge_id": _mk_edge_id("references", cid, tid, "chunk"),
                "kind": "references",
                "relation": "references",
                "scope": "chunk",
                "source_id": cid,
                "target_id": tid,
                "note_id": owner,
                # v2
                "src_note_id": owner,
                "src_chunk_id": cid,
                "dst_note_id": tid,
                "provenance": "explicit",
                "rule_id": "",
                "confidence": 1.0,
            }
            edges.append(e)
            explicit_refs.append(e)

    # --- note scope: references (optional) + backlink (always) ---------------
    unique_refs = []
    if note_level_refs:
        seen = set()
        for tid in note_level_refs:
            if isinstance(tid, str) and tid.strip() and tid not in seen:
                unique_refs.append(tid)
                seen.add(tid)

    for tid in unique_refs:
        if include_note_scope_refs:
            edges.append({
                "edge_id": _mk_edge_id("references", note_id, tid, "note"),
                "kind": "references", "relation": "references", "scope": "note",
                "source_id": note_id, "target_id": tid, "note_id": note_id,
                "src_note_id": note_id, "dst_note_id": tid,
                "provenance": "explicit", "rule_id": "", "confidence": 1.0,
            })
        edges.append({
            "edge_id": _mk_edge_id("backlink", tid, note_id, "note", "derived:backlink:v1"),
            "kind": "backlink", "relation": "backlink", "scope": "note",
            "source_id": tid, "target_id": note_id, "note_id": note_id,
            "src_note_id": tid, "dst_note_id": note_id,
            "provenance": "rule", "rule_id": "derived:backlink:v1", "confidence": 0.9,
        })

    # --- type defaults per explicit reference --------------------------------
    defaults = [d for d in _edge_defaults_for(note_type) if d and d != "references"]
    if defaults:
        # BUGFIX: the original iterated
        # `explicit_refs + ([ ] if not include_note_scope_refs else [])`,
        # whose second operand was [] in BOTH branches — a dead expression.
        # The already-built explicit edges serve as templates.
        for e in explicit_refs:
            src = e["source_id"]
            tgt = e["target_id"]
            scope = e.get("scope", "chunk")
            s_note = e.get("src_note_id") or note_id
            s_chunk = e.get("src_chunk_id")
            t_note = e.get("dst_note_id") or tgt
            for rel in defaults:
                rule_id = f"type_default:{(note_type or 'unknown')}:{rel}:v1"
                edges.append({
                    "edge_id": _mk_edge_id(rel, src, tgt, scope, rule_id),
                    "kind": rel, "relation": rel, "scope": scope,
                    "source_id": src, "target_id": tgt, "note_id": s_note,
                    "src_note_id": s_note, "src_chunk_id": s_chunk,
                    "dst_note_id": t_note,
                    "provenance": "rule", "rule_id": rule_id, "confidence": 0.7,
                })

    # --- dedupe & return ------------------------------------------------------
    return _dedupe(edges)
|
||||
|
|
@ -1,94 +0,0 @@
|
|||
# app/core/edges_writer.py
|
||||
from __future__ import annotations
|
||||
import hashlib
|
||||
from typing import Dict, List, Iterable, Tuple
|
||||
|
||||
try:
|
||||
# Dein Modul mit der Schemadefinition und der Builder-Funktion
|
||||
from app.core.edges import build_edges_for_note # noqa: F401
|
||||
except Exception as e:
|
||||
raise RuntimeError("Konnte app.core.edges nicht importieren. "
|
||||
"Bitte sicherstellen, dass app/core/edges.py vorhanden ist.") from e
|
||||
|
||||
def _edge_uid(kind: str, source_id: str, target_id: str, scope: str) -> str:
|
||||
"""
|
||||
Deterministische, kurze ID für eine Edge.
|
||||
Kollisionen sind praktisch ausgeschlossen (BLAKE2s über den Kanonischen Schlüssel).
|
||||
"""
|
||||
key = f"{kind}|{source_id}|{target_id}|{scope}"
|
||||
return hashlib.blake2s(key.encode("utf-8"), digest_size=12).hexdigest()
|
||||
|
||||
def ensure_edges_collection(qdrant_client, collection: str) -> None:
    """Create the edge collection if it does not exist yet.

    Minimal setup: 1-dim dummy vector with cosine distance, payload stored on
    disk. Payload-only collections are fragile across Qdrant versions, hence
    the dummy vector.
    """
    from qdrant_client.http import models as qm

    names = {c.name for c in qdrant_client.get_collections().collections}
    if collection in names:
        return

    qdrant_client.recreate_collection(
        collection_name=collection,
        vectors_config=qm.VectorParams(size=1, distance=qm.Distance.COSINE),
        on_disk_payload=True,
    )
|
||||
|
||||
def edges_from_note(
    note_id: str,
    chunk_payloads: List[Dict],
    note_level_refs: Iterable[str] | None,
    *,
    include_note_scope_refs: bool = False,
) -> List[Dict]:
    """Thin wrapper around build_edges_for_note.

    Returns the deduplicated edge payloads unchanged — exactly the schema from
    app/core/edges.py.
    """
    refs = list(note_level_refs) if note_level_refs else []
    return build_edges_for_note(
        note_id=note_id,
        chunk_payloads=chunk_payloads,
        note_level_refs=refs,
        include_note_scope_refs=include_note_scope_refs,
    )
|
||||
|
||||
def upsert_edges(
    qdrant_client,
    collection: str,
    edge_payloads: List[Dict],
) -> Tuple[int, int]:
    """Write edges as points into Qdrant.

    - id: deterministic from (kind, source_id, target_id, scope)
    - vector: [0.0] dummy
    - payload: the edge dict, unchanged (schema: app/core/edges.py)

    Returns (point_count, unique_key_count). Duplicate keys within the batch
    are dropped before writing.
    """
    from qdrant_client.models import PointStruct

    if not edge_payloads:
        return 0, 0

    unique_keys = set()
    points = []
    for payload in edge_payloads:
        key = (payload.get("kind"), payload.get("source_id"), payload.get("target_id"), payload.get("scope"))
        if key in unique_keys:
            continue
        unique_keys.add(key)
        points.append(PointStruct(id=_edge_uid(*key), vector=[0.0], payload=payload))

    if not points:
        return 0, 0

    ensure_edges_collection(qdrant_client, collection)
    qdrant_client.upsert(collection_name=collection, points=points)
    return len(points), len(unique_keys)
|
||||
|
|
@ -1,103 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Datei: app/core/env_vars.py
|
||||
Version: 1.1.0 (2025-11-08)
|
||||
|
||||
Zweck
|
||||
Einheitliche Auflösung von ENV-Variablen (Prefix, Qdrant, Embeddings, Hashing)
|
||||
mit Abwärtskompatibilität.
|
||||
|
||||
Grundsatz
|
||||
- Für Qdrant-Funktionen ist 'COLLECTION_PREFIX' der Primärschlüssel.
|
||||
- 'MINDNET_PREFIX' bleibt für App-/UI-/Exporter-Kontexte nutzbar.
|
||||
- Fallbacks sorgen dafür, dass ältere Umgebungen weiter funktionieren.
|
||||
|
||||
Wichtig
|
||||
- Lädt optional eine .env (wenn python-dotenv verfügbar ist).
|
||||
- Überschreibt keine bereits gesetzten OS-Variablen (override=False).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from typing import Optional, Dict
|
||||
|
||||
# Optional: .env automatisch laden (ohne Hard-Fail, falls nicht vorhanden)
|
||||
try:
|
||||
from dotenv import load_dotenv, find_dotenv # type: ignore
|
||||
_p = find_dotenv()
|
||||
if _p:
|
||||
load_dotenv(_p, override=False)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# -------- Prefix-Auflösung --------
|
||||
|
||||
def get_collection_prefix(cli_override: Optional[str] = None) -> str:
    """Resolve the prefix for Qdrant-facing code.

    Order: CLI override (--prefix) > ENV COLLECTION_PREFIX >
    ENV MINDNET_PREFIX (fallback) > 'mindnet' (default).
    """
    if cli_override:
        stripped = str(cli_override).strip()
        if stripped:
            return stripped
    for var in ("COLLECTION_PREFIX", "MINDNET_PREFIX"):
        value = os.getenv(var)
        if value:
            return value
    return "mindnet"
|
||||
|
||||
def get_mindnet_prefix(cli_override: Optional[str] = None) -> str:
    """Resolve the prefix for app/UI/exporter contexts.

    Order: CLI override (--prefix) > ENV MINDNET_PREFIX >
    ENV COLLECTION_PREFIX (fallback) > 'mindnet'.
    """
    if cli_override:
        stripped = str(cli_override).strip()
        if stripped:
            return stripped
    for var in ("MINDNET_PREFIX", "COLLECTION_PREFIX"):
        value = os.getenv(var)
        if value:
            return value
    return "mindnet"
|
||||
|
||||
def get_prefix(cli_override: Optional[str] = None, target: str = "qdrant") -> str:
    """Backward-compatible dispatcher.

    target='app'    -> get_mindnet_prefix
    anything else   -> get_collection_prefix (Qdrant)
    """
    resolver = get_mindnet_prefix if target.lower() == "app" else get_collection_prefix
    return resolver(cli_override)
|
||||
|
||||
# -------- Qdrant / Embeddings / Hashing --------
|
||||
|
||||
def get_qdrant_url(default: str = "http://127.0.0.1:6333") -> str:
    """QDRANT_URL from the environment, else *default*."""
    return os.environ.get("QDRANT_URL", default)
|
||||
|
||||
def get_qdrant_api_key(default: str = "") -> str:
    """QDRANT_API_KEY from the environment, else *default* (empty by default)."""
    return os.environ.get("QDRANT_API_KEY", default)
|
||||
|
||||
def get_vector_dim(default: int = 384) -> int:
    """VECTOR_DIM from the environment as int; missing or invalid values fall back to *default*."""
    raw = os.getenv("VECTOR_DIM", str(default))
    try:
        return int(raw)
    except (TypeError, ValueError):
        return default
|
||||
|
||||
def get_embed_url(default: Optional[str] = None) -> Optional[str]:
    """EMBED_URL from the environment, else *default* (may be None)."""
    return os.environ.get("EMBED_URL", default)
|
||||
|
||||
def get_hash_env() -> Dict[str, str]:
    """Aggregate the hash-related ENV variables.

    Pure aggregation — interpretation of the values stays in the calling
    scripts. Unset variables are reported as empty strings.
    """
    keys = ("MINDNET_HASH_COMPARE", "MINDNET_HASH_SOURCE", "MINDNET_HASH_NORMALIZE")
    return {key: os.getenv(key, "") for key in keys}
|
||||
|
|
@ -1,56 +0,0 @@
|
|||
"""
|
||||
app/core/ranking.py — Kombiniertes Scoring (WP-04)
|
||||
|
||||
Zweck:
|
||||
Zusammenführen von semantischem Score (normalisiert), Edge-Bonus und
|
||||
Centrality-Bonus in einen Gesamtscore für die Ergebnisreihung.
|
||||
Kompatibilität:
|
||||
Python 3.12+
|
||||
Version:
|
||||
0.1.0 (Erstanlage)
|
||||
Stand:
|
||||
2025-10-07
|
||||
Bezug:
|
||||
WP-04 Ranking-Formel (w_sem, w_edge, w_cent)
|
||||
Nutzung:
|
||||
from app.core.ranking import combine_scores
|
||||
Änderungsverlauf:
|
||||
0.1.0 (2025-10-07) – Erstanlage.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import List, Tuple, Dict
|
||||
|
||||
|
||||
def normalize_scores(values: List[float]) -> List[float]:
    """Min-max normalize over the candidate set.

    A (near-)constant input maps to 0.5 everywhere; an empty input is
    returned unchanged.
    """
    if not values:
        return values
    low = min(values)
    span = max(values) - low
    if span < 1e-9:
        return [0.5 for _ in values]
    return [(v - low) / span for v in values]
|
||||
|
||||
|
||||
def combine_scores(
    hits: List[Tuple[str, float, dict]],  # (id, semantic_score, payload)
    edge_bonus_map: Dict[str, float],
    centrality_map: Dict[str, float],
    w_sem: float = 0.70,
    w_edge: float = 0.25,
    w_cent: float = 0.05,
) -> List[Tuple[str, float, float, float, float]]:
    """Combine normalized semantic score, edge bonus and centrality bonus.

    Returns (point_id, total_score, edge_bonus, centrality_bonus,
    raw_semantic_score) tuples, sorted descending by total_score.
    """
    normalized = normalize_scores([score for _, score, _ in hits])
    ranked = []
    for (point_id, raw_score, _payload), norm_score in zip(hits, normalized):
        edge_bonus = edge_bonus_map.get(point_id, 0.0)
        cent_bonus = centrality_map.get(point_id, 0.0)
        total = w_sem * norm_score + w_edge * edge_bonus + w_cent * cent_bonus
        ranked.append((point_id, total, edge_bonus, cent_bonus, raw_score))
    ranked.sort(key=lambda row: row[1], reverse=True)
    return ranked
|
||||
|
|
@ -1,116 +0,0 @@
|
|||
"""app/core/retriever_config.py
|
||||
---------------------------------
|
||||
Zentrale Konfiguration für den mindnet-Retriever (WP-04).
|
||||
|
||||
Zweck:
|
||||
- Lädt config/retriever.yaml (falls vorhanden) oder nutzt sinnvolle Defaults.
|
||||
- Bietet einen gecachten Zugriff auf die Retriever-Config für
|
||||
andere Module (z. B. graph_adapter, retriever).
|
||||
|
||||
Hinweis zur Weiterentwicklung (Selbstjustierung):
|
||||
- Die hier definierten Parameter sind so gewählt, dass sie später
|
||||
durch ein Feedback-/Learning-to-Rank-Modell überschrieben werden
|
||||
können, ohne die restliche Architektur anzupassen.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from functools import lru_cache
|
||||
from pathlib import Path
|
||||
from typing import Dict
|
||||
|
||||
try:
|
||||
import yaml # type: ignore
|
||||
except Exception: # pragma: no cover - Fallback, falls PyYAML nicht installiert ist.
|
||||
yaml = None # type: ignore
|
||||
|
||||
@dataclass(frozen=True)
class RetrieverConfig:
    """Immutable retriever scoring configuration (built by get_retriever_config)."""
    # Global multipliers applied to each score component.
    semantic_scale: float
    edge_scale: float
    centrality_scale: float
    # Weight per edge kind, used when aggregating the edge bonus.
    edge_weights: Dict[str, float]
|
||||
|
||||
@lru_cache
def get_retriever_config() -> RetrieverConfig:
    """Load the retriever configuration (YAML on top of defaults).

    Order:
        1. Defaults (deliberately conservative start values).
        2. Optional: config/retriever.yaml, or the path from ENV
           MINDNET_RETRIEVER_CONFIG, overrides the defaults.

    Deliberately cached: the configuration normally does not change at
    runtime. For dynamic reloading, clear the cache explicitly.
    """

    def _coerce_float(value, fallback: float) -> float:
        # Invalid YAML values silently keep the current value.
        try:
            return float(value)
        except (TypeError, ValueError):
            return fallback

    # 1) Defaults
    semantic_scale = 1.0
    edge_scale = 1.0
    centrality_scale = 1.0

    edge_weights: Dict[str, float] = {
        # knowledge edges
        "depends_on": 1.0,
        "related_to": 0.7,
        "similar_to": 0.7,
        "references": 0.5,
        # structural edges
        "belongs_to": 0.2,
        "next": 0.1,
        "prev": 0.1,
        # other / technical edges
        "backlink": 0.2,
        "references_at": 0.2,
    }

    # 2) Optional: load the YAML configuration
    cfg_path_env = os.getenv("MINDNET_RETRIEVER_CONFIG")
    if cfg_path_env:
        cfg_path = Path(cfg_path_env)
    else:
        # Project root = two levels above app/core/
        cfg_path = Path(__file__).resolve().parents[2] / "config" / "retriever.yaml"

    if yaml is not None and cfg_path.exists():
        try:
            data = yaml.safe_load(cfg_path.read_text(encoding="utf-8")) or {}
        except Exception:
            data = {}
        # ROBUSTNESS: a YAML document whose root is not a mapping (e.g. a list)
        # previously crashed on data.get(); treat it as empty instead.
        if not isinstance(data, dict):
            data = {}

        retr = data.get("retriever") or {}
        if not isinstance(retr, dict):
            retr = {}

        # Override scale values where given (invalid entries keep the default).
        semantic_scale = _coerce_float(retr.get("semantic_scale", semantic_scale), semantic_scale)
        edge_scale = _coerce_float(retr.get("edge_scale", edge_scale), edge_scale)
        centrality_scale = _coerce_float(retr.get("centrality_scale", centrality_scale), centrality_scale)

        # Edge weights per edge kind
        ew_cfg = retr.get("edge_weights") or {}
        if isinstance(ew_cfg, dict):
            for kind, weight in ew_cfg.items():
                try:
                    edge_weights[str(kind)] = float(weight)
                except (TypeError, ValueError):
                    continue

    return RetrieverConfig(
        semantic_scale=semantic_scale,
        edge_scale=edge_scale,
        centrality_scale=centrality_scale,
        edge_weights=edge_weights,
    )
|
||||
|
|
@ -1,22 +0,0 @@
|
|||
from __future__ import annotations
|
||||
import json
|
||||
import os
|
||||
from functools import lru_cache
|
||||
from jsonschema import Draft202012Validator, RefResolver
|
||||
|
||||
SCHEMAS_DIR = os.getenv("SCHEMAS_DIR", os.path.join(os.path.dirname(os.path.dirname(__file__)), "..", "schemas"))
|
||||
|
||||
@lru_cache(maxsize=16)
def load_schema(name: str) -> dict:
    """Load a JSON schema by file name from SCHEMAS_DIR.

    Expected names: "note.schema.json" | "chunk.schema.json" | "edge.schema.json".
    Raises FileNotFoundError with the resolved path when the file is missing.
    """
    schema_path = os.path.join(SCHEMAS_DIR, name)
    if not os.path.isfile(schema_path):
        raise FileNotFoundError(f"Schema not found: {schema_path}")
    with open(schema_path, "r", encoding="utf-8") as fh:
        return json.load(fh)
|
||||
|
||||
@lru_cache(maxsize=16)
def get_validator(name: str) -> Draft202012Validator:
    """Build (and cache) a Draft 2020-12 validator for the named schema, with $ref resolution rooted at the schema itself."""
    schema = load_schema(name)
    return Draft202012Validator(schema, resolver=RefResolver.from_schema(schema))
|
||||
|
|
@ -1,16 +0,0 @@
|
|||
from __future__ import annotations
|
||||
from typing import Dict
|
||||
from jsonschema import ValidationError
|
||||
from .schema_loader import get_validator
|
||||
|
||||
NOTE_SCHEMA_NAME = "note.schema.json"
|
||||
|
||||
def validate_note_payload(payload: Dict) -> None:
    """Validate *payload* against the note schema.

    Raises a single ValidationError that lists every violation as
    "<json.path>: <message>", joined with " | "; returns None when valid.
    """
    validator = get_validator(NOTE_SCHEMA_NAME)
    errors = sorted(validator.iter_errors(payload), key=lambda err: err.path)
    if not errors:
        return
    messages = []
    for err in errors:
        location = ".".join(str(part) for part in err.path) or "<root>"
        messages.append(f"{location}: {err.message}")
    raise ValidationError(" | ".join(messages))
|
||||
|
|
@ -1,40 +0,0 @@
|
|||
"""
|
||||
Version 1
|
||||
"""
|
||||
from __future__ import annotations
|
||||
from fastapi import FastAPI, HTTPException
|
||||
from pydantic import BaseModel
|
||||
from typing import List, Optional
|
||||
from sentence_transformers import SentenceTransformer
|
||||
|
||||
app = FastAPI(title="mindnet-embed", version="1.0")
|
||||
|
||||
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2" # 384-dim
|
||||
_model: SentenceTransformer | None = None
|
||||
|
||||
class EmbedIn(BaseModel):
    """Request body for POST /embed."""
    # Optional model override; currently ignored — the service always uses MODEL_NAME.
    model: Optional[str] = None
    # Texts to embed; an empty list yields an empty response.
    inputs: List[str]
|
||||
|
||||
class EmbedOut(BaseModel):
    """Response body for POST /embed: one 384-dim vector per input, in order."""
    embeddings: List[List[float]]
|
||||
|
||||
@app.on_event("startup")
|
||||
def _load_model():
|
||||
global _model
|
||||
_model = SentenceTransformer(MODEL_NAME)
|
||||
|
||||
@app.get("/health")
|
||||
def health():
|
||||
return {"ok": True, "model": MODEL_NAME, "dim": 384}
|
||||
|
||||
@app.post("/embed", response_model=EmbedOut)
|
||||
def embed(payload: EmbedIn) -> EmbedOut:
|
||||
if _model is None:
|
||||
raise HTTPException(status_code=503, detail="Model not loaded")
|
||||
if not payload.inputs:
|
||||
return EmbedOut(embeddings=[])
|
||||
vecs = _model.encode(payload.inputs, normalize_embeddings=False).tolist()
|
||||
if any(len(v) != 384 for v in vecs):
|
||||
raise HTTPException(status_code=500, detail="Embedding size mismatch (expected 384)")
|
||||
return EmbedOut(embeddings=vecs)
|
||||
|
|
@ -1,172 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Modul: app/graph/service.py
|
||||
Version: 0.1.0
|
||||
Datum: 2025-09-10
|
||||
|
||||
Zweck
|
||||
-----
|
||||
Leichtgewichtiger Graph-Layer über Qdrant:
|
||||
- get_note(note_id)
|
||||
- get_chunks(note_id)
|
||||
- neighbors(source_id, kinds=[...], scope=['note','chunk'], depth=1)
|
||||
- walk_bfs(source_id, kinds, max_depth)
|
||||
- context_for_note(note_id, max_neighbors): heuristische Kontextsammlung
|
||||
|
||||
Hinweise
|
||||
--------
|
||||
- Nutzt die bestehenden Collections <prefix>_notes/_chunks/_edges.
|
||||
- Edges werden über Payload-Felder (`kind`, `source_id`, `target_id`) abgefragt.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
from typing import List, Dict, Any, Optional, Iterable, Set, Tuple
|
||||
from qdrant_client.http import models as rest
|
||||
from app.core.qdrant import QdrantConfig, get_client
|
||||
|
||||
def _cols(prefix: str):
|
||||
return f"{prefix}_notes", f"{prefix}_chunks", f"{prefix}_edges"
|
||||
|
||||
class GraphService:
    """Lightweight graph layer over the Qdrant collections <prefix>_notes/_chunks/_edges.

    Edges are queried via payload fields (`kind`, `source_id`, `target_id`).
    """

    def __init__(self, cfg: Optional[QdrantConfig] = None, prefix: Optional[str] = None):
        """Bind to <prefix>_notes/_chunks/_edges; an explicit *prefix* overrides cfg.prefix."""
        self.cfg = cfg or QdrantConfig.from_env()
        if prefix:
            self.cfg.prefix = prefix
        self.client = get_client(self.cfg)
        self.notes_col, self.chunks_col, self.edges_col = _cols(self.cfg.prefix)

    # ------------------------ fetch helpers ------------------------
    def _scroll(self, col: str, flt: Optional[rest.Filter] = None, limit: int = 256):
        """Scroll *col* to exhaustion (payloads only, no vectors) and return every matching point."""
        out = []
        nextp = None
        while True:
            pts, nextp = self.client.scroll(
                collection_name=col,
                with_payload=True,
                with_vectors=False,
                limit=limit,
                offset=nextp,
                scroll_filter=flt,
            )
            if not pts:
                break
            out.extend(pts)
            # Qdrant signals the last page with a None next-page offset.
            if nextp is None:
                break
        return out

    # ------------------------ public API ---------------------------
    def get_note(self, note_id: str) -> Optional[Dict[str, Any]]:
        """Return the payload of the note with *note_id*, or None if not found."""
        f = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))])
        pts, _ = self.client.scroll(self.notes_col, with_payload=True, with_vectors=False, limit=1, scroll_filter=f)
        return (pts[0].payload or None) if pts else None

    def get_chunks(self, note_id: str) -> List[Dict[str, Any]]:
        """Return all chunk payloads of a note, sorted like the exporter emits them."""
        f = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))])
        pts = self._scroll(self.chunks_col, f)
        # Sort order mirrors the export: (seq, chunk_index, trailing "#<n>" of chunk_id).
        def key(pl):
            p = pl.payload or {}
            s = p.get("seq") or 0
            ci = p.get("chunk_index") or 0
            n = 0
            cid = p.get("chunk_id") or ""
            # A trailing "#<n>" in the chunk id acts as final tie-breaker.
            if isinstance(cid, str) and "#" in cid:
                try:
                    n = int(cid.rsplit("#", 1)[-1])
                except Exception:
                    n = 0
            return (int(s), int(ci), n)
        pts_sorted = sorted(pts, key=key)
        return [p.payload or {} for p in pts_sorted]

    def neighbors(self, source_id: str, kinds: Optional[Iterable[str]] = None,
                  scope: Optional[Iterable[str]] = None, depth: int = 1) -> Dict[str, List[Dict[str, Any]]]:
        """
        Return incoming & outgoing neighbour edges (filtered by kind only).

        depth==1: direct edges. NOTE(review): *scope* and *depth* are accepted
        but not evaluated here — only direct edges are fetched.
        """
        kinds = list(kinds) if kinds else None
        # Outgoing: edges whose source_id matches.
        must = [rest.FieldCondition(key="source_id", match=rest.MatchValue(value=source_id))]
        if kinds:
            must.append(rest.FieldCondition(key="kind", match=rest.MatchAny(any=kinds)))
        f = rest.Filter(must=must)
        edges = self._scroll(self.edges_col, f)
        out = {"out": [], "in": []}
        for e in edges:
            out["out"].append(e.payload or {})
        # Inverse direction (incoming): edges whose target_id matches.
        must_in = [rest.FieldCondition(key="target_id", match=rest.MatchValue(value=source_id))]
        if kinds:
            must_in.append(rest.FieldCondition(key="kind", match=rest.MatchAny(any=kinds)))
        f_in = rest.Filter(must=must_in)
        edges_in = self._scroll(self.edges_col, f_in)
        for e in edges_in:
            out["in"].append(e.payload or {})
        return out

    def walk_bfs(self, source_id: str, kinds: Iterable[str], max_depth: int = 2) -> Set[str]:
        """Breadth-first walk along outgoing edges of the given kinds; returns all visited ids (including the start)."""
        visited: Set[str] = {source_id}
        frontier: Set[str] = {source_id}
        kinds = list(kinds)
        for _ in range(max_depth):
            nxt: Set[str] = set()
            for s in frontier:
                neigh = self.neighbors(s, kinds=kinds)
                # Only outgoing edges expand the frontier.
                for e in neigh["out"]:
                    t = e.get("target_id")
                    if isinstance(t, str) and t not in visited:
                        visited.add(t)
                        nxt.add(t)
            frontier = nxt
            if not frontier:
                break
        return visited

    def context_for_note(self, note_id: str, kinds: Iterable[str] = ("references","backlink"), max_neighbors: int = 12) -> Dict[str, Any]:
        """
        Heuristic context: the note's own chunks plus neighbours by edge kind, de-duplicated.
        """
        note = self.get_note(note_id) or {}
        chunks = self.get_chunks(note_id)
        neigh = self.neighbors(note_id, kinds=list(kinds))
        # Candidate neighbour ids: targets of outgoing and sources of incoming edges.
        targets = []
        for e in neigh["out"]:
            t = e.get("target_id")
            if isinstance(t, str):
                targets.append(t)
        for e in neigh["in"]:
            s = e.get("source_id")
            if isinstance(s, str):
                targets.append(s)
        # de-dupe (order-preserving), then cap at max_neighbors
        seen = set()
        uniq = []
        for t in targets:
            if t not in seen:
                seen.add(t)
                uniq.append(t)
        uniq = uniq[:max_neighbors]
        neighbor_notes = [self.get_note(t) for t in uniq]
        return {
            "note": note,
            "chunks": chunks,
            "neighbors": [n for n in neighbor_notes if n],
            "edges_out": neigh["out"],
            "edges_in": neigh["in"],
        }
|
||||
|
||||
# Optional: mini CLI — prints either a note's direct neighbours or its full
# heuristic context as JSON to stdout.
if __name__ == "__main__":  # pragma: no cover
    import argparse, json
    ap = argparse.ArgumentParser()
    ap.add_argument("--prefix", help="Collection-Prefix (überschreibt ENV)")
    ap.add_argument("--note-id", required=True)
    ap.add_argument("--neighbors", action="store_true", help="Nur Nachbarn anzeigen")
    args = ap.parse_args()
    svc = GraphService(prefix=args.prefix)
    if args.neighbors:
        # Neighbours across every known edge kind.
        out = svc.neighbors(args.note_id, kinds=["references","backlink","prev","next","belongs_to"])
    else:
        out = svc.context_for_note(args.note_id)
    print(json.dumps(out, ensure_ascii=False, indent=2))
|
||||
|
|
@ -10,8 +10,8 @@ LAST_ANALYSIS: 2025-12-15
|
|||
from __future__ import annotations
|
||||
from fastapi import FastAPI
|
||||
from .config import get_settings
|
||||
from .routers.embed_router import router as embed_router
|
||||
from .routers.qdrant_router import router as qdrant_router
|
||||
#from .routers.embed_router import router as embed_router
|
||||
#from .routers.qdrant_router import router as qdrant_router
|
||||
|
||||
from .routers.query import router as query_router
|
||||
from .routers.graph import router as graph_router
|
||||
|
|
@ -35,8 +35,8 @@ def create_app() -> FastAPI:
|
|||
def healthz():
|
||||
return {"status": "ok", "qdrant": s.QDRANT_URL, "prefix": s.COLLECTION_PREFIX}
|
||||
|
||||
app.include_router(embed_router)
|
||||
app.include_router(qdrant_router)
|
||||
# app.include_router(embed_router)
|
||||
# app.include_router(qdrant_router)
|
||||
|
||||
app.include_router(query_router, prefix="/query", tags=["query"])
|
||||
app.include_router(graph_router, prefix="/graph", tags=["graph"])
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user