scriptAudit #11
|
|
@ -1,17 +1,10 @@
|
|||
"""
|
||||
app — mindnet API package
|
||||
|
||||
Zweck:
|
||||
Markiert 'app/' als Python-Paket, damit 'from app.main import create_app'
|
||||
in Tests und Skripten funktioniert.
|
||||
Kompatibilität:
|
||||
Python 3.12+
|
||||
Version:
|
||||
0.1.0 (Erstanlage)
|
||||
Stand:
|
||||
2025-10-07
|
||||
Hinweise:
|
||||
Keine Logik – nur Paketinitialisierung.
|
||||
FILE: app/__init__.py
|
||||
DESCRIPTION: Paket-Initialisierung.
|
||||
VERSION: 0.1.0
|
||||
STATUS: Active
|
||||
DEPENDENCIES: None
|
||||
LAST_ANALYSIS: 2025-12-15
|
||||
"""
|
||||
|
||||
__version__ = "0.1.0"
|
||||
|
|
|
|||
|
|
@ -1,6 +1,10 @@
|
|||
"""
|
||||
app/config.py — zentrale Konfiguration
|
||||
Version: 0.4.0 (WP-06 Complete)
|
||||
FILE: app/config.py
|
||||
DESCRIPTION: Zentrale Pydantic-Konfiguration (Env-Vars für Qdrant, LLM, Retriever).
|
||||
VERSION: 0.4.0
|
||||
STATUS: Active
|
||||
DEPENDENCIES: os, functools, pathlib
|
||||
LAST_ANALYSIS: 2025-12-15
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import os
|
||||
|
|
|
|||
|
|
@ -1,11 +1,13 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
app/core/chunk_payload.py (Mindnet V2 — types.yaml authoritative)
|
||||
- neighbors_prev / neighbors_next sind Listen ([], [id]).
|
||||
- retriever_weight / chunk_profile kommen aus types.yaml (Frontmatter wird ignoriert).
|
||||
- Fallbacks: defaults.* in types.yaml; sonst 1.0 / "default".
|
||||
- WP-11 Update: Injects 'title' into chunk payload for Discovery Service.
|
||||
FILE: app/core/chunk_payload.py
|
||||
DESCRIPTION: Baut das JSON-Objekt für 'mindnet_chunks'.
|
||||
FEATURES:
|
||||
- Inkludiert Nachbarschafts-IDs (prev/next) und Titel.
|
||||
- FIX 3: Robuste Erkennung des Inputs (Frontmatter-Dict vs. Note-Objekt), damit Overrides ankommen.
|
||||
VERSION: 2.3.0
|
||||
STATUS: Active
|
||||
DEPENDENCIES: yaml, os
|
||||
EXTERNAL_CONFIG: config/types.yaml
|
||||
"""
|
||||
from __future__ import annotations
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
|
@ -36,42 +38,48 @@ def _get_defaults(reg: dict) -> dict:
|
|||
return {}
|
||||
|
||||
def _as_float(x: Any):
|
||||
try:
|
||||
return float(x)
|
||||
except Exception:
|
||||
return None
|
||||
try: return float(x)
|
||||
except Exception: return None
|
||||
|
||||
def _resolve_chunk_profile(note_type: str, reg: dict) -> str:
|
||||
def _resolve_chunk_profile_from_config(note_type: str, reg: dict) -> Optional[str]:
|
||||
# 1. Type Level
|
||||
types = _get_types_map(reg)
|
||||
if isinstance(types, dict):
|
||||
t = types.get(note_type, {})
|
||||
if isinstance(t, dict) and isinstance(t.get("chunk_profile"), str):
|
||||
return t["chunk_profile"]
|
||||
if isinstance(t, dict):
|
||||
cp = t.get("chunking_profile") or t.get("chunk_profile")
|
||||
if isinstance(cp, str) and cp: return cp
|
||||
# 2. Defaults Level
|
||||
defs = _get_defaults(reg)
|
||||
if isinstance(defs, dict) and isinstance(defs.get("chunk_profile"), str):
|
||||
return defs["chunk_profile"]
|
||||
return "default"
|
||||
if isinstance(defs, dict):
|
||||
cp = defs.get("chunking_profile") or defs.get("chunk_profile")
|
||||
if isinstance(cp, str) and cp: return cp
|
||||
return None
|
||||
|
||||
def _resolve_retriever_weight(note_type: str, reg: dict) -> float:
|
||||
def _resolve_retriever_weight_from_config(note_type: str, reg: dict) -> float:
|
||||
"""
|
||||
Liest Weight nur aus Config (Type > Default).
|
||||
Wird aufgerufen, wenn im Frontmatter nichts steht.
|
||||
"""
|
||||
# 1. Type Level
|
||||
types = _get_types_map(reg)
|
||||
if isinstance(types, dict):
|
||||
t = types.get(note_type, {})
|
||||
if isinstance(t, dict) and (t.get("retriever_weight") is not None):
|
||||
v = _as_float(t.get("retriever_weight"))
|
||||
if v is not None:
|
||||
return float(v)
|
||||
if v is not None: return float(v)
|
||||
|
||||
# 2. Defaults Level
|
||||
defs = _get_defaults(reg)
|
||||
if isinstance(defs, dict) and (defs.get("retriever_weight") is not None):
|
||||
v = _as_float(defs.get("retriever_weight"))
|
||||
if v is not None:
|
||||
return float(v)
|
||||
if v is not None: return float(v)
|
||||
|
||||
return 1.0
|
||||
|
||||
def _as_list(x):
|
||||
if x is None:
|
||||
return []
|
||||
if isinstance(x, list):
|
||||
return x
|
||||
if x is None: return []
|
||||
if isinstance(x, list): return x
|
||||
return [x]
|
||||
|
||||
def make_chunk_payloads(note: Dict[str, Any],
|
||||
|
|
@ -81,18 +89,49 @@ def make_chunk_payloads(note: Dict[str, Any],
|
|||
note_text: str = "",
|
||||
types_cfg: Optional[dict] = None,
|
||||
file_path: Optional[str] = None) -> List[Dict[str, Any]]:
|
||||
fm = (note or {}).get("frontmatter", {}) or {}
|
||||
"""
|
||||
Erstellt die Payloads für die Chunks.
|
||||
|
||||
Argument 'note' kann sein:
|
||||
A) Ein komplexes Objekt/Dict mit Key "frontmatter" (Legacy / Tests)
|
||||
B) Direkt das Frontmatter-Dictionary (Call aus ingestion.py)
|
||||
"""
|
||||
|
||||
# --- FIX 3: Intelligente Erkennung der Input-Daten ---
|
||||
# Wir prüfen: Ist 'note' ein Container MIT 'frontmatter', oder IST es das 'frontmatter'?
|
||||
if isinstance(note, dict) and "frontmatter" in note and isinstance(note["frontmatter"], dict):
|
||||
# Fall A: Container (wir müssen auspacken)
|
||||
fm = note["frontmatter"]
|
||||
else:
|
||||
# Fall B: Direktes Dict (so ruft ingestion.py es auf!)
|
||||
fm = note or {}
|
||||
|
||||
note_type = fm.get("type") or note.get("type") or "concept"
|
||||
|
||||
# WP-11 FIX: Title Extraction für Discovery Service
|
||||
# Wir holen den Titel aus Frontmatter oder Fallback ID/Untitled
|
||||
# Title Extraction (Fallback Chain)
|
||||
title = fm.get("title") or note.get("title") or fm.get("id") or "Untitled"
|
||||
|
||||
reg = types_cfg if isinstance(types_cfg, dict) else _load_types()
|
||||
|
||||
# types.yaml authoritative
|
||||
cp = _resolve_chunk_profile(note_type, reg)
|
||||
rw = _resolve_retriever_weight(note_type, reg)
|
||||
# --- Profil-Ermittlung ---
|
||||
# Da wir 'fm' jetzt korrekt haben, funktionieren diese lookups:
|
||||
cp = fm.get("chunking_profile") or fm.get("chunk_profile")
|
||||
|
||||
if not cp:
|
||||
cp = _resolve_chunk_profile_from_config(note_type, reg)
|
||||
if not cp:
|
||||
cp = "sliding_standard"
|
||||
|
||||
# --- Retriever Weight Ermittlung ---
|
||||
rw = fm.get("retriever_weight")
|
||||
|
||||
if rw is None:
|
||||
rw = _resolve_retriever_weight_from_config(note_type, reg)
|
||||
|
||||
try:
|
||||
rw = float(rw)
|
||||
except Exception:
|
||||
rw = 1.0
|
||||
|
||||
tags = fm.get("tags") or []
|
||||
if isinstance(tags, str):
|
||||
|
|
@ -100,7 +139,7 @@ def make_chunk_payloads(note: Dict[str, Any],
|
|||
|
||||
out: List[Dict[str, Any]] = []
|
||||
for idx, ch in enumerate(chunks_from_chunker):
|
||||
# Attribute oder Keys (Chunk-Objekt oder Dict)
|
||||
# Attribute extrahieren
|
||||
cid = getattr(ch, "id", None) or (ch.get("id") if isinstance(ch, dict) else None)
|
||||
nid = getattr(ch, "note_id", None) or (ch.get("note_id") if isinstance(ch, dict) else fm.get("id"))
|
||||
index = getattr(ch, "index", None) or (ch.get("index") if isinstance(ch, dict) else idx)
|
||||
|
|
@ -112,7 +151,7 @@ def make_chunk_payloads(note: Dict[str, Any],
|
|||
pl: Dict[str, Any] = {
|
||||
"note_id": nid,
|
||||
"chunk_id": cid,
|
||||
"title": title, # <--- HIER: Titel in Payload einfügen
|
||||
"title": title,
|
||||
"index": int(index),
|
||||
"ord": int(index) + 1,
|
||||
"type": note_type,
|
||||
|
|
@ -125,9 +164,10 @@ def make_chunk_payloads(note: Dict[str, Any],
|
|||
"path": note_path,
|
||||
"source_path": file_path or note_path,
|
||||
"retriever_weight": float(rw),
|
||||
"chunk_profile": cp,
|
||||
"chunk_profile": cp, # Jetzt endlich mit dem Override-Wert!
|
||||
}
|
||||
# Aufräumen von Alt-Feldern
|
||||
|
||||
# Cleanup
|
||||
for alias in ("chunk_num", "Chunk_Number"):
|
||||
pl.pop(alias, None)
|
||||
|
||||
|
|
|
|||
|
|
@ -1,3 +1,13 @@
|
|||
"""
|
||||
FILE: app/core/chunker.py
|
||||
DESCRIPTION: Zerlegt Texte in Chunks (Sliding Window oder nach Headings). Orchestriert die Smart-Edge-Allocation via SemanticAnalyzer.
|
||||
VERSION: 2.9.0 (Feat: Hybrid Strict Splitting with Size Safety)
|
||||
STATUS: Active
|
||||
DEPENDENCIES: app.services.semantic_analyzer, app.core.derive_edges, markdown_it, yaml, asyncio
|
||||
EXTERNAL_CONFIG: config/types.yaml
|
||||
LAST_ANALYSIS: 2025-12-16
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Dict, Optional, Tuple, Any, Set
|
||||
|
|
@ -5,8 +15,6 @@ import re
|
|||
import math
|
||||
import yaml
|
||||
from pathlib import Path
|
||||
from markdown_it import MarkdownIt
|
||||
from markdown_it.token import Token
|
||||
import asyncio
|
||||
import logging
|
||||
|
||||
|
|
@ -93,7 +101,7 @@ class Chunk:
|
|||
suggested_edges: Optional[List[str]] = None
|
||||
|
||||
# ==========================================
|
||||
# 3. PARSING & STRATEGIES (SYNCHRON)
|
||||
# 3. PARSING & STRATEGIES
|
||||
# ==========================================
|
||||
|
||||
def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
|
||||
|
|
@ -115,7 +123,13 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
|
|||
for line in lines:
|
||||
stripped = line.strip()
|
||||
if stripped.startswith('# '):
|
||||
continue
|
||||
if buffer:
|
||||
content = "\n".join(buffer).strip()
|
||||
if content:
|
||||
blocks.append(RawBlock("paragraph", content, None, section_path, current_h2))
|
||||
buffer = []
|
||||
blocks.append(RawBlock("heading", stripped, 1, section_path, current_h2))
|
||||
|
||||
elif stripped.startswith('## '):
|
||||
if buffer:
|
||||
content = "\n".join(buffer).strip()
|
||||
|
|
@ -125,6 +139,15 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
|
|||
current_h2 = stripped[3:].strip()
|
||||
section_path = f"/{current_h2}"
|
||||
blocks.append(RawBlock("heading", stripped, 2, section_path, current_h2))
|
||||
|
||||
elif stripped.startswith('### '):
|
||||
if buffer:
|
||||
content = "\n".join(buffer).strip()
|
||||
if content:
|
||||
blocks.append(RawBlock("paragraph", content, None, section_path, current_h2))
|
||||
buffer = []
|
||||
blocks.append(RawBlock("heading", stripped, 3, section_path, current_h2))
|
||||
|
||||
elif not stripped:
|
||||
if buffer:
|
||||
content = "\n".join(buffer).strip()
|
||||
|
|
@ -141,6 +164,15 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
|
|||
|
||||
return blocks, h1_title
|
||||
|
||||
def _create_chunk_obj(chunks_list: List[Chunk], note_id: str, txt: str, win: str, sec: Optional[str], path: str):
    """Append a new ``Chunk`` to ``chunks_list`` (mutated in place).

    The chunk index is the current length of ``chunks_list``; the chunk id is
    ``"<note_id>#cNN"`` with the index zero-padded to two digits.

    Args:
        chunks_list: Target list; the new chunk is appended to it.
        note_id: Id of the owning note.
        txt: Chunk text; also used for the token estimate.
        win: Window text stored alongside the chunk text.
        sec: Section title, or ``None``.
        path: Section path.
    """
    idx = len(chunks_list)
    chunks_list.append(
        Chunk(
            id=f"{note_id}#c{idx:02d}",
            note_id=note_id,
            index=idx,
            text=txt,
            window=win,
            token_count=estimate_tokens(txt),
            section_title=sec,
            section_path=path,
            # Neighbors are left unset here; this function does not link chunks.
            neighbors_prev=None,
            neighbors_next=None,
            suggested_edges=[],
        )
    )
|
||||
|
||||
def _strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], note_id: str, doc_title: str = "", context_prefix: str = "") -> List[Chunk]:
|
||||
target = config.get("target", 400)
|
||||
max_tokens = config.get("max", 600)
|
||||
|
|
@ -148,15 +180,6 @@ def _strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], not
|
|||
overlap = sum(overlap_val) // 2 if isinstance(overlap_val, tuple) else overlap_val
|
||||
chunks = []; buf = []
|
||||
|
||||
def _create_chunk(txt, win, sec, path):
|
||||
idx = len(chunks)
|
||||
chunks.append(Chunk(
|
||||
id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx,
|
||||
text=txt, window=win, token_count=estimate_tokens(txt),
|
||||
section_title=sec, section_path=path, neighbors_prev=None, neighbors_next=None,
|
||||
suggested_edges=[]
|
||||
))
|
||||
|
||||
def flush_buffer():
|
||||
nonlocal buf
|
||||
if not buf: return
|
||||
|
|
@ -164,9 +187,14 @@ def _strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], not
|
|||
text_body = "\n\n".join([b.text for b in buf])
|
||||
win_body = f"{context_prefix}\n{text_body}".strip() if context_prefix else text_body
|
||||
|
||||
# Basis-Info vom ersten Block im Buffer
|
||||
sec = buf[0].section_title if buf else None
|
||||
path = buf[0].section_path if buf else "/"
|
||||
|
||||
if estimate_tokens(text_body) <= max_tokens:
|
||||
_create_chunk(text_body, win_body, buf[-1].section_title, buf[-1].section_path)
|
||||
_create_chunk_obj(chunks, note_id, text_body, win_body, sec, path)
|
||||
else:
|
||||
# Fallback: Wenn Block zu groß, intern splitten (Sentence-Level)
|
||||
sentences = split_sentences(text_body)
|
||||
current_chunk_sents = []
|
||||
current_len = 0
|
||||
|
|
@ -176,7 +204,7 @@ def _strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], not
|
|||
if current_len + sent_len > target and current_chunk_sents:
|
||||
c_txt = " ".join(current_chunk_sents)
|
||||
c_win = f"{context_prefix}\n{c_txt}".strip() if context_prefix else c_txt
|
||||
_create_chunk(c_txt, c_win, buf[-1].section_title, buf[-1].section_path)
|
||||
_create_chunk_obj(chunks, note_id, c_txt, c_win, sec, path)
|
||||
|
||||
overlap_sents = []
|
||||
ov_len = 0
|
||||
|
|
@ -197,27 +225,111 @@ def _strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], not
|
|||
if current_chunk_sents:
|
||||
c_txt = " ".join(current_chunk_sents)
|
||||
c_win = f"{context_prefix}\n{c_txt}".strip() if context_prefix else c_txt
|
||||
_create_chunk(c_txt, c_win, buf[-1].section_title, buf[-1].section_path)
|
||||
_create_chunk_obj(chunks, note_id, c_txt, c_win, sec, path)
|
||||
|
||||
buf = []
|
||||
|
||||
for b in blocks:
|
||||
if b.kind == "heading": continue
|
||||
if b.kind == "heading":
|
||||
flush_buffer()
|
||||
|
||||
current_buf_text = "\n\n".join([x.text for x in buf])
|
||||
if estimate_tokens(current_buf_text) + estimate_tokens(b.text) >= target:
|
||||
if buf and (estimate_tokens(current_buf_text) + estimate_tokens(b.text) >= target):
|
||||
flush_buffer()
|
||||
|
||||
buf.append(b)
|
||||
if estimate_tokens(b.text) >= target:
|
||||
flush_buffer()
|
||||
|
||||
flush_buffer()
|
||||
return chunks
|
||||
|
||||
def _strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id: str, doc_title: str = "") -> List[Chunk]:
    """Chunk ``blocks`` along headings (structured / heading-split mode).

    Config keys read (with defaults):
        split_level (2): headings with ``level <= split_level`` are split points.
        target (400): soft size; in non-strict mode a split-level heading only
            starts a new chunk once the current chunk has reached this size.
        max (600): hard size used for the safety split on content blocks.
        strict_heading_split (False):
            True  - start a new chunk at every split point, unless the pending
                    chunk holds nothing but headings.
            False - start a new chunk at headings above ``split_level``, or at
                    split-level headings once ``target`` is reached.

    Independently of the mode, a non-splitting block that would push a
    non-empty pending chunk above ``max`` forces a flush first. A single block
    that is larger than ``max`` on its own is not subdivided here.

    Args:
        blocks: Parsed blocks in document order.
        config: Chunking profile settings (see keys above).
        note_id: Id of the owning note, used for chunk ids.
        doc_title: Document title; ``"# <doc_title>"`` is prepended to every
            chunk's window text.

    Returns:
        The list of created chunks.
    """
    split_level = config.get("split_level", 2)
    target = config.get("target", 400)
    max_limit = config.get("max", 600)
    strict_mode = config.get("strict_heading_split", False)

    chunks: List[Chunk] = []
    current_chunk_blocks: List[RawBlock] = []
    context_prefix = f"# {doc_title}"

    def has_content(blk_list):
        # True when at least one block is not a heading.
        return any(blk.kind != "heading" for blk in blk_list)

    def get_current_size():
        # Token estimate of the pending chunk as it would be emitted.
        return estimate_tokens("\n\n".join([blk.text for blk in current_chunk_blocks]))

    def flush_current_chunk():
        # Emit the pending blocks as one chunk and reset the buffer.
        nonlocal current_chunk_blocks
        if not current_chunk_blocks:
            return
        text_body = "\n\n".join([blk.text for blk in current_chunk_blocks])
        win_body = f"{context_prefix}\n{text_body}".strip()
        # Section metadata is taken from the first block of the chunk.
        first_b = current_chunk_blocks[0]
        _create_chunk_obj(chunks, note_id, text_body, win_body, first_b.section_title, first_b.section_path)
        current_chunk_blocks = []

    for b in blocks:
        is_splitter = b.kind == "heading" and b.level is not None and b.level <= split_level

        if is_splitter:
            if strict_mode:
                # Strict: always split, except when only headings are pending
                # (so consecutive headings stay attached to their content).
                must_flush = has_content(current_chunk_blocks)
            else:
                # Soft: split on a hierarchy change, or once the chunk is full.
                is_higher_hierarchy = b.level < split_level
                must_flush = is_higher_hierarchy or (
                    bool(current_chunk_blocks) and get_current_size() >= target
                )
        else:
            # Safety split: never let a non-empty chunk grow beyond `max`,
            # even when no heading is present.
            must_flush = bool(current_chunk_blocks) and (
                get_current_size() + estimate_tokens(b.text) > max_limit
            )

        if must_flush:
            flush_current_chunk()
        current_chunk_blocks.append(b)

    # Emit whatever is left.
    flush_current_chunk()
    return chunks
|
||||
|
||||
# ==========================================
|
||||
# 4. ORCHESTRATION (ASYNC) - WP-15 CORE
|
||||
# 4. ORCHESTRATION (ASYNC)
|
||||
# ==========================================
|
||||
|
||||
async def assemble_chunks(note_id: str, md_text: str, note_type: str, config: Optional[Dict] = None) -> List[Chunk]:
|
||||
|
|
@ -245,7 +357,6 @@ async def assemble_chunks(note_id: str, md_text: str, note_type: str, config: Op
|
|||
return []
|
||||
|
||||
if enable_smart_edges:
|
||||
# Hier rufen wir nun die Smart Edge Allocation auf
|
||||
chunks = await _run_smart_edge_allocation(chunks, md_text, note_id, note_type)
|
||||
|
||||
for i, ch in enumerate(chunks):
|
||||
|
|
@ -255,56 +366,40 @@ async def assemble_chunks(note_id: str, md_text: str, note_type: str, config: Op
|
|||
return chunks
|
||||
|
||||
def _extract_all_edges_from_md(md_text: str, note_id: str, note_type: str) -> List[str]:
    """Collect every non-structural edge candidate of a note as ``"kind:target"``.

    A single dummy chunk holding the whole note text is passed to
    ``build_edges_for_note``; the text is supplied under the ``text``,
    ``content`` and ``window`` keys.

    Args:
        md_text: Full markdown text of the note.
        note_id: Id of the note; the dummy chunk id is ``"<note_id>#full"``.
        note_type: Note type, stored on the dummy chunk as ``type``.

    Returns:
        De-duplicated ``"kind:target_id"`` strings in sorted order. Edges of
        kind ``belongs_to``, ``next``, ``prev`` and ``backlink`` and edges
        without a ``target_id`` are excluded.
    """
    dummy_chunk = {
        "chunk_id": f"{note_id}#full",
        "text": md_text,
        "content": md_text,
        "window": md_text,
        "type": note_type,
    }

    raw_edges = build_edges_for_note(
        note_id,
        [dummy_chunk],
        note_level_references=None,
        include_note_scope_refs=False,
    )

    structural_kinds = ["belongs_to", "next", "prev", "backlink"]
    all_candidates = set()
    for e in raw_edges:
        kind = e.get("kind")
        target = e.get("target_id")
        if target and kind not in structural_kinds:
            all_candidates.add(f"{kind}:{target}")

    # Sorted so the candidate order is stable between runs (set order is not).
    return sorted(all_candidates)
|
||||
|
||||
async def _run_smart_edge_allocation(chunks: List[Chunk], full_text: str, note_id: str, note_type: str) -> List[Chunk]:
|
||||
analyzer = get_semantic_analyzer()
|
||||
|
||||
# A. Alle potenziellen Kanten der Notiz sammeln (über den Dummy-Chunk Trick)
|
||||
candidate_list = _extract_all_edges_from_md(full_text, note_id, note_type)
|
||||
|
||||
if not candidate_list:
|
||||
return chunks
|
||||
|
||||
# B. LLM Filterung pro Chunk (Parallel)
|
||||
tasks = []
|
||||
for chunk in chunks:
|
||||
tasks.append(analyzer.assign_edges_to_chunk(chunk.text, candidate_list, note_type))
|
||||
|
||||
results_per_chunk = await asyncio.gather(*tasks)
|
||||
|
||||
# C. Injection & Fallback
|
||||
assigned_edges_global = set()
|
||||
|
||||
for i, confirmed_edges in enumerate(results_per_chunk):
|
||||
|
|
@ -317,7 +412,6 @@ async def _run_smart_edge_allocation(chunks: List[Chunk], full_text: str, note_i
|
|||
chunk.text += injection_str
|
||||
chunk.window += injection_str
|
||||
|
||||
# D. Fallback: Unassigned Kanten überall hin
|
||||
unassigned = set(candidate_list) - assigned_edges_global
|
||||
if unassigned:
|
||||
fallback_str = "\n" + " ".join([f"[[rel:{e.split(':')[0]}|{e.split(':')[1]}]]" for e in unassigned if ':' in e])
|
||||
|
|
|
|||
|
|
@ -1,26 +1,11 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Modul: app/core/derive_edges.py
|
||||
Zweck:
|
||||
- Bewahrt bestehende Edgelogik (belongs_to, prev/next, references, backlink)
|
||||
- Ergänzt typenbasierte Default-Kanten (edge_defaults aus config/types.yaml)
|
||||
- Unterstützt "typed inline relations":
|
||||
* [[rel:KIND | Target]]
|
||||
* [[rel:KIND Target]]
|
||||
* rel: KIND [[Target]]
|
||||
- Unterstützt Obsidian-Callouts:
|
||||
* > [!edge] KIND: [[Target]] [[Target2]] ...
|
||||
Kompatibilität:
|
||||
- build_edges_for_note(...) Signatur unverändert
|
||||
- rule_id Werte:
|
||||
* structure:belongs_to
|
||||
* structure:order
|
||||
* explicit:wikilink
|
||||
* inline:rel
|
||||
* callout:edge
|
||||
* edge_defaults:<type>:<relation>
|
||||
* derived:backlink
|
||||
FILE: app/core/derive_edges.py
|
||||
DESCRIPTION: Extrahiert Graph-Kanten aus Text. Unterstützt Wikilinks, Inline-Relations ([[rel:type|target]]) und Obsidian Callouts.
|
||||
VERSION: 2.0.0
|
||||
STATUS: Active
|
||||
DEPENDENCIES: re, os, yaml, typing
|
||||
EXTERNAL_CONFIG: config/types.yaml
|
||||
LAST_ANALYSIS: 2025-12-15
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
|
|
|||
|
|
@ -1,296 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Modul: app/core/edges.py
|
||||
Version: 2.0.0 (V2‑superset, rückwärtskompatibel zu v1 vom 2025‑09‑09)
|
||||
|
||||
Zweck
|
||||
-----
|
||||
Bewahrt die bestehende Edgelogik (belongs_to, prev/next, references, backlink)
|
||||
und ergänzt V2‑Felder + Typ‑Default‑Kanten gemäß config/types.yaml (edge_defaults).
|
||||
Die Funktion ist **idempotent** und **rückwärtskompatibel** zur bisherigen Signatur.
|
||||
|
||||
Kompatibilitätsgarantien (gegenüber v1):
|
||||
- **Input**: akzeptiert identische Chunk‑Payloads wie v1:
|
||||
* `id` (Chunk‑ID), `note_id` (Owner), `neighbors.prev|next` (optional),
|
||||
`references: [{target_id: ...}]` (optional),
|
||||
alternativ: `chunk_id`, `chunk_index|ord`, `window|text`
|
||||
- **Output (v1‑Felder)**: `kind`, `source_id`, `target_id`, `scope`, `note_id`, `edge_id`
|
||||
- **Neu (v2‑Felder)**: `relation`, `src_note_id`, `src_chunk_id?`, `dst_note_id`, `dst_chunk_id?`,
|
||||
`provenance` (`explicit|rule`), `rule_id?`, `confidence?`
|
||||
|
||||
Regeln
|
||||
------
|
||||
- Deduplizierungsschlüssel: (source_id, target_id, relation, rule_id)
|
||||
- Strukturkanten:
|
||||
* belongs_to: 1× pro Chunk
|
||||
* next/prev: Sequenz der Chunks; nutzt bevorzugt neighbors; sonst ord/chunk_index
|
||||
- Explizite Referenzen:
|
||||
* aus Chunk: `references[].target_id` (falls vorhanden)
|
||||
* Fallback: Wikilinks in `window|text`: [[Some Title|some-id]] oder [[some-id]]
|
||||
- Note‑Scope:
|
||||
* backlink immer; references nur, wenn include_note_scope_refs=True
|
||||
- Typ‑Defaults (edge_defaults aus config/types.yaml des **Quell‑Notiztyps**):
|
||||
* Für jede explizite Referenz wird je default‑Relation eine Regel‑Kante erzeugt
|
||||
* rule_id: "type_default:{note_type}:{relation}:v1", provenance="rule"
|
||||
|
||||
Konfiguration
|
||||
-------------
|
||||
- ENV MINDNET_TYPES_FILE (Default: ./config/types.yaml)
|
||||
|
||||
Lizenz/Autor
|
||||
------------
|
||||
- Erstimplementierung v1 (2025‑09‑09) — Projekt Mindnet
|
||||
- Erweiterung v2 (2025‑11‑11) — kompatible Superset‑Implementierung
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import re
|
||||
from typing import Dict, Iterable, List, Optional, Tuple, Set
|
||||
|
||||
try:
|
||||
import yaml # optional, nur für types.yaml
|
||||
except Exception: # pragma: no cover
|
||||
yaml = None
|
||||
|
||||
# ------------------------------------------------------------
|
||||
# Hilfen: types.yaml laden (edge_defaults)
|
||||
# ------------------------------------------------------------
|
||||
|
||||
def _types_path() -> str:
|
||||
return os.getenv("MINDNET_TYPES_FILE") or "./config/types.yaml"
|
||||
|
||||
def _load_types() -> Dict[str, dict]:
    """Load the type definitions from the file named by ``_types_path()``.

    Returns:
        The mapping under the top-level ``types`` key when that key holds a
        dict; otherwise the whole document if it is a dict. Returns ``{}``
        when the file is missing, ``yaml`` is unavailable, the document is not
        a mapping, or reading/parsing fails.
    """
    p = _types_path()
    if yaml is None or not os.path.isfile(p):
        return {}

    try:
        with open(p, "r", encoding="utf-8") as f:
            data = yaml.safe_load(f) or {}
    except Exception:
        # Deliberate best effort: an unreadable or malformed config is
        # treated the same as no config.
        return {}

    if not isinstance(data, dict):
        return {}
    types = data.get("types")
    return types if isinstance(types, dict) else data
|
||||
|
||||
def _edge_defaults_for(note_type: Optional[str]) -> List[str]:
    """Return the ``edge_defaults`` relations configured for ``note_type``.

    The type name is stripped and lower-cased before the lookup in the
    mapping returned by ``_load_types()``.

    Args:
        note_type: Note type name; ``None`` or empty yields ``[]`` unless the
            config has an entry for the empty key.

    Returns:
        The configured relations as strings. A single string value is treated
        as a one-element list; entries that are not str/int/float are dropped.
        A type entry that is not a mapping, or an ``edge_defaults`` value that
        is not iterable, yields ``[]`` instead of raising.
    """
    type_key = str(note_type or "").strip().lower()
    cfg = _load_types().get(type_key)
    if not isinstance(cfg, dict):
        return []

    defaults = cfg.get("edge_defaults") or []
    if isinstance(defaults, str):
        defaults = [defaults]
    else:
        try:
            defaults = list(defaults)
        except TypeError:
            # e.g. a bare number in the YAML file
            return []
    return [str(x) for x in defaults if isinstance(x, (str, int, float))]
|
||||
|
||||
# ------------------------------------------------------------
|
||||
# Wikilink‑Parser (Fallback, wenn ch["references"] fehlt)
|
||||
# ------------------------------------------------------------
|
||||
|
||||
_WIKILINK_RE = re.compile(r"\[\[(?:[^\|\]]+\|)?([a-zA-Z0-9_\-#:. ]+)\]\]")
|
||||
|
||||
def _extract_wikilinks(text: str) -> List[str]:
|
||||
ids: List[str] = []
|
||||
for m in _WIKILINK_RE.finditer(text or ""):
|
||||
ids.append(m.group(1).strip())
|
||||
return ids
|
||||
|
||||
# ------------------------------------------------------------
|
||||
# Utility
|
||||
# ------------------------------------------------------------
|
||||
|
||||
def _mk_edge_id(kind: str, s: str, t: str, scope: str, rule_id: Optional[str] = None) -> str:
|
||||
base = f"{kind}:{s}->{t}#{scope}"
|
||||
if rule_id:
|
||||
base += f"|{rule_id}"
|
||||
try:
|
||||
import hashlib
|
||||
return hashlib.blake2s(base.encode("utf-8"), digest_size=12).hexdigest()
|
||||
except Exception: # pragma: no cover
|
||||
return base
|
||||
|
||||
def _dedupe(edges: List[Dict]) -> List[Dict]:
|
||||
seen: Set[Tuple[str,str,str,str]] = set()
|
||||
out: List[Dict] = []
|
||||
for e in edges:
|
||||
s = str(e.get("source_id") or "")
|
||||
t = str(e.get("target_id") or "")
|
||||
rel = str(e.get("relation") or e.get("kind") or "edge")
|
||||
rule = str(e.get("rule_id") or "")
|
||||
key = (s, t, rel, rule)
|
||||
if key in seen:
|
||||
continue
|
||||
seen.add(key)
|
||||
out.append(e)
|
||||
return out
|
||||
|
||||
def _first(v: dict, *keys, default=None):
|
||||
for k in keys:
|
||||
if k in v and v[k] is not None:
|
||||
return v[k]
|
||||
return default
|
||||
|
||||
# ------------------------------------------------------------
|
||||
# Hauptfunktion
|
||||
# ------------------------------------------------------------
|
||||
|
||||
def build_edges_for_note(
    note_id: str,
    chunk_payloads: List[Dict],
    note_level_refs: Optional[List[str]] = None,
    *,
    include_note_scope_refs: bool = False,
) -> List[Dict]:
    """Build the de-duplicated edge payloads for one note.

    Edges are produced in this order:
      1. ``belongs_to`` (chunk -> note), one per chunk that has an id.
      2. ``prev`` / ``next`` pairs between a chunk and its predecessor. The
         predecessor is ``neighbors.prev`` when present, otherwise the
         previous chunk after sorting by ``chunk_index`` / ``ord``.
      3. Chunk-scope ``references`` from ``references[].target_id``; if a
         chunk has none, from wikilinks found in its ``window`` / ``text``.
      4. Note-scope ``references`` (only with ``include_note_scope_refs``) and
         a ``backlink`` for every unique entry in ``note_level_refs``.
      5. For every chunk-scope reference, one rule edge per relation from the
         note type's ``edge_defaults`` (excluding ``references``).

    Args:
        note_id: Id of the note the chunks belong to.
        chunk_payloads: Chunk payload dicts; ``None`` is treated as empty.
            The note type is read from the first chunk (``type`` or
            ``note_type``).
        note_level_refs: Target ids referenced at note level.
        include_note_scope_refs: Also emit note-scope ``references`` edges.

    Returns:
        Edge dicts, de-duplicated via ``_dedupe``.
    """
    edges: List[Dict] = []
    chunks = list(chunk_payloads or [])

    # Note type is derived from the first chunk.
    head = chunks[0] if chunks else {}
    note_type = head.get("type") or head.get("note_type")

    # --- Structural edges: belongs_to -----------------------------------
    for ch in chunks:
        cid = _first(ch, "id", "chunk_id")
        if not cid:
            continue
        owner = ch.get("note_id") or note_id
        edges.append({
            "edge_id": _mk_edge_id("belongs_to", cid, note_id, "chunk", "structure:belongs_to:v1"),
            "kind": "belongs_to",
            "relation": "belongs_to",
            "scope": "chunk",
            "source_id": cid,
            "target_id": note_id,
            "note_id": owner,  # v1 compatibility
            # v2 fields
            "src_note_id": owner,
            "src_chunk_id": cid,
            "dst_note_id": note_id,
            "provenance": "rule",
            "rule_id": "structure:belongs_to:v1",
            "confidence": 1.0,
        })

    # --- Structural edges: prev / next ----------------------------------
    ordered = sorted(chunks, key=lambda c: _first(c, "chunk_index", "ord", default=0))
    for i, ch in enumerate(ordered):
        cid = _first(ch, "id", "chunk_id")
        if not cid:
            continue
        owner = ch.get("note_id") or note_id
        # NOTE(review): only neighbors.prev is consulted; every "next" edge is
        # emitted as the mirror of a "prev" edge, so an explicit
        # neighbors.next value has no effect - confirm this is intended.
        prev_id = (ch.get("neighbors") or {}).get("prev")
        if prev_id is None and i > 0:
            prev_id = _first(ordered[i - 1], "id", "chunk_id")
        if not prev_id:
            continue

        edges.append({
            "edge_id": _mk_edge_id("prev", cid, prev_id, "chunk", "structure:order:v1"),
            "kind": "prev", "relation": "prev", "scope": "chunk",
            "source_id": cid, "target_id": prev_id, "note_id": owner,
            "src_note_id": owner, "src_chunk_id": cid,
            "dst_note_id": owner, "dst_chunk_id": prev_id,
            "provenance": "rule", "rule_id": "structure:order:v1", "confidence": 0.95,
        })
        edges.append({
            "edge_id": _mk_edge_id("next", prev_id, cid, "chunk", "structure:order:v1"),
            "kind": "next", "relation": "next", "scope": "chunk",
            "source_id": prev_id, "target_id": cid, "note_id": owner,
            "src_note_id": owner, "src_chunk_id": prev_id,
            "dst_note_id": owner, "dst_chunk_id": cid,
            "provenance": "rule", "rule_id": "structure:order:v1", "confidence": 0.95,
        })

    # --- Explicit references (chunk scope) ------------------------------
    explicit_refs: List[Dict] = []
    for ch in chunks:
        cid = _first(ch, "id", "chunk_id")
        if not cid:
            continue
        owner = ch.get("note_id") or note_id
        # 1) prefer references already present on the chunk
        refs = ch.get("references") or []
        targets = [r.get("target_id") for r in refs if isinstance(r, dict) and r.get("target_id")]
        # 2) fallback: wikilinks in the chunk text
        if not targets:
            text = _first(ch, "window", "text", default="") or ""
            targets = _extract_wikilinks(text)
        for tid in targets:
            if not isinstance(tid, str) or not tid.strip():
                continue
            e = {
                "edge_id": _mk_edge_id("references", cid, tid, "chunk"),
                "kind": "references",
                "relation": "references",
                "scope": "chunk",
                "source_id": cid,
                "target_id": tid,
                "note_id": owner,
                # v2 fields
                "src_note_id": owner,
                "src_chunk_id": cid,
                "dst_note_id": tid,
                "provenance": "explicit",
                "rule_id": "",
                "confidence": 1.0,
            }
            edges.append(e)
            explicit_refs.append(e)

    # --- Note scope: references (optional) + backlink (always) ----------
    unique_refs: List[str] = []
    if note_level_refs:
        seen = set()
        for tid in note_level_refs:
            if isinstance(tid, str) and tid.strip() and tid not in seen:
                unique_refs.append(tid)
                seen.add(tid)

    for tid in unique_refs:
        if include_note_scope_refs:
            edges.append({
                "edge_id": _mk_edge_id("references", note_id, tid, "note"),
                "kind": "references", "relation": "references", "scope": "note",
                "source_id": note_id, "target_id": tid, "note_id": note_id,
                "src_note_id": note_id, "dst_note_id": tid,
                "provenance": "explicit", "rule_id": "", "confidence": 1.0,
            })
        edges.append({
            "edge_id": _mk_edge_id("backlink", tid, note_id, "note", "derived:backlink:v1"),
            "kind": "backlink", "relation": "backlink", "scope": "note",
            "source_id": tid, "target_id": note_id, "note_id": note_id,
            "src_note_id": tid, "dst_note_id": note_id,
            "provenance": "rule", "rule_id": "derived:backlink:v1", "confidence": 0.9,
        })

    # --- Type defaults per explicit (chunk-scope) reference -------------
    defaults = [d for d in _edge_defaults_for(note_type) if d and d != "references"]
    if defaults:
        # The explicit edges built above serve as templates.
        for e in explicit_refs:
            src = e["source_id"]
            tgt = e["target_id"]
            scope = e.get("scope", "chunk")
            s_note = e.get("src_note_id") or note_id
            s_chunk = e.get("src_chunk_id")
            t_note = e.get("dst_note_id") or tgt
            for rel in defaults:
                rule_id = f"type_default:{(note_type or 'unknown')}:{rel}:v1"
                edges.append({
                    "edge_id": _mk_edge_id(rel, src, tgt, scope, rule_id),
                    "kind": rel, "relation": rel, "scope": scope,
                    "source_id": src, "target_id": tgt, "note_id": s_note,
                    "src_note_id": s_note, "src_chunk_id": s_chunk,
                    "dst_note_id": t_note,
                    "provenance": "rule", "rule_id": rule_id, "confidence": 0.7,
                })

    # --- Dedupe & return -------------------------------------------------
    return _dedupe(edges)
|
||||
|
|
@ -1,94 +0,0 @@
|
|||
# app/core/edges_writer.py
|
||||
from __future__ import annotations
|
||||
import hashlib
|
||||
from typing import Dict, List, Iterable, Tuple
|
||||
|
||||
try:
|
||||
# Dein Modul mit der Schemadefinition und der Builder-Funktion
|
||||
from app.core.edges import build_edges_for_note # noqa: F401
|
||||
except Exception as e:
|
||||
raise RuntimeError("Konnte app.core.edges nicht importieren. "
|
||||
"Bitte sicherstellen, dass app/core/edges.py vorhanden ist.") from e
|
||||
|
||||
def _edge_uid(kind: str, source_id: str, target_id: str, scope: str) -> str:
    """
    Build a short, deterministic identifier for an edge.

    The four fields are joined with '|' into a canonical key, which is hashed
    with BLAKE2s (12-byte digest) and returned as a 24-character hex string.
    """
    canonical = "|".join(map(str, (kind, source_id, target_id, scope)))
    digest = hashlib.blake2s(canonical.encode("utf-8"), digest_size=12)
    return digest.hexdigest()
|
||||
|
||||
def ensure_edges_collection(qdrant_client, collection: str) -> None:
    """
    Create the edge collection if it does not exist yet.

    The collection is created with a 1-dimensional cosine vector as a
    placeholder and with the payload stored on disk.

    Args:
        qdrant_client: Client object exposing ``get_collections`` and
            ``create_collection``.
        collection: Name of the edge collection.
    """
    existing = {c.name for c in qdrant_client.get_collections().collections}
    if collection in existing:
        return

    # Imported lazily: the "already exists" path needs no model classes.
    from qdrant_client.http import models as qm

    # create_collection instead of recreate_collection: the latter deletes an
    # existing collection before creating it, so a collection created by
    # another process after the check above would silently lose its data.
    qdrant_client.create_collection(
        collection_name=collection,
        vectors_config=qm.VectorParams(size=1, distance=qm.Distance.COSINE),
        on_disk_payload=True,
    )
|
||||
|
||||
def edges_from_note(
    note_id: str,
    chunk_payloads: List[Dict],
    note_level_refs: Iterable[str] | None,
    *,
    include_note_scope_refs: bool = False,
) -> List[Dict]:
    """
    Delegate to ``build_edges_for_note`` and return the edge payloads it produces.

    ``note_level_refs`` is materialized into a list first; ``None`` (or any
    other falsy value) becomes an empty list. The edge schema is whatever
    ``build_edges_for_note`` returns; nothing is changed here.
    """
    refs = list(note_level_refs) if note_level_refs else []
    return build_edges_for_note(
        note_id=note_id,
        chunk_payloads=chunk_payloads,
        note_level_refs=refs,
        include_note_scope_refs=include_note_scope_refs,
    )
|
||||
|
||||
def upsert_edges(
    qdrant_client,
    collection: str,
    edge_payloads: List[Dict],
) -> Tuple[int, int]:
    """
    Write edges as points into Qdrant.

    - id: deterministic UUID derived from (kind, source_id, target_id, scope)
    - vector: [0.0] placeholder
    - payload: the edge dict, unchanged

    Edges sharing the same (kind, source_id, target_id, scope) key are written
    only once; the first occurrence wins.

    Args:
        qdrant_client: Client object exposing ``upsert`` (and what
            ``ensure_edges_collection`` needs).
        collection: Name of the edge collection.
        edge_payloads: Edge dicts to store.

    Returns:
        ``(number_of_points, number_of_unique_keys)``; ``(0, 0)`` when there
        is nothing to write.
    """
    if not edge_payloads:
        return 0, 0

    import uuid

    from qdrant_client.models import PointStruct

    points = []
    seen = set()
    for e in edge_payloads:
        key = (e.get("kind"), e.get("source_id"), e.get("target_id"), e.get("scope"))
        if key in seen:
            continue
        seen.add(key)
        # Qdrant point ids must be unsigned integers or UUIDs. The 24-char hex
        # digest from _edge_uid is neither, so it is mapped deterministically
        # onto a UUIDv5.
        point_id = str(uuid.uuid5(uuid.NAMESPACE_URL, _edge_uid(*key)))
        points.append(PointStruct(id=point_id, vector=[0.0], payload=e))

    if not points:
        return 0, 0

    ensure_edges_collection(qdrant_client, collection)
    qdrant_client.upsert(collection_name=collection, points=points)
    return len(points), len(seen)
|
||||
|
|
@ -1,82 +0,0 @@
|
|||
from __future__ import annotations
|
||||
import os, time, json
|
||||
import urllib.request
|
||||
from typing import List, Dict, Any
|
||||
|
||||
# Backend-Auswahl:
|
||||
# - EMBED_BACKEND=ollama -> EMBED_URL=/api/embeddings (Ollama), EMBED_MODEL=z.B. nomic-embed-text
|
||||
# - EMBED_BACKEND=mini -> EMBED_URL=/embed (unser MiniLM-Server), EMBED_MODEL=minilm-384
|
||||
# Backend selector, compared against "ollama" in embed_texts; anything else uses the mini backend.
EMBED_BACKEND = os.environ.get("EMBED_BACKEND", "mini").lower()
# Endpoint every embedding request is POSTed to.
EMBED_URL = os.environ.get("EMBED_URL", "http://127.0.0.1:8990/embed")
# Model name sent along with each request.
EMBED_MODEL = os.environ.get("EMBED_MODEL", "minilm-384")
# Default number of texts per request.
EMBED_BATCH = int(os.environ.get("EMBED_BATCH", "64"))
# Timeout value handed to urlopen for each request.
TIMEOUT = 60
|
||||
|
||||
class EmbedError(RuntimeError):
    """Raised when an embedding backend returns a response that cannot be used."""
|
||||
|
||||
def _post_json(url: str, payload: Dict[str, Any]) -> Dict[str, Any]:
    """POST *payload* as UTF-8 encoded JSON to *url* and return the decoded JSON reply."""
    request = urllib.request.Request(
        url,
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(request, timeout=TIMEOUT) as response:
        raw = response.read()
        return json.loads(raw.decode("utf-8"))
|
||||
|
||||
def _embed_mini(inputs: List[str], model: str, batch: int) -> List[List[float]]:
    """
    Embed *inputs* in batches through the endpoint at ``EMBED_URL``.

    Each batch is sent as ``{"model": model, "inputs": [...]}``; the vectors
    are read from the first non-empty of the response keys ``embeddings``,
    ``vectors`` or ``data``. A failing batch is retried up to five times with
    a growing pause; the last failure is re-raised.

    Args:
        inputs: Texts to embed.
        model: Model name forwarded to the server.
        batch: Texts per request; values below 1 are treated as 1.

    Returns:
        One vector per input, in input order.

    Raises:
        EmbedError: If the response carries no vector list, or a different
            number of vectors than texts were sent (after all retries).
    """
    # A batch size < 1 would never advance the window (endless loop).
    step = max(1, batch)
    out: List[List[float]] = []
    for start in range(0, len(inputs), step):
        chunk = inputs[start:start + step]
        for attempt in range(5):
            try:
                resp = _post_json(EMBED_URL, {"model": model, "inputs": chunk})
                vecs = resp.get("embeddings") or resp.get("vectors") or resp.get("data")
                if not isinstance(vecs, list):
                    raise EmbedError(f"Bad embed response keys: {list(resp.keys())}")
                # Without this check a short response would silently shift
                # every following vector onto the wrong text.
                if len(vecs) != len(chunk):
                    raise EmbedError(
                        f"Embed response has {len(vecs)} vectors for {len(chunk)} inputs"
                    )
                out.extend(vecs)
                break
            except Exception:
                if attempt == 4:
                    raise
                time.sleep(1.5 * (attempt + 1))
    return out
|
||||
|
||||
def _embed_ollama(inputs: List[str], model: str, batch: int) -> List[List[float]]:
    """
    Embed *inputs* one text per request through an Ollama endpoint at ``EMBED_URL``.

    The request carries the text under both ``prompt`` and ``input``: the
    legacy ``/api/embeddings`` endpoint reads ``prompt`` (and answers with
    ``{"embedding": [...]}``), the newer ``/api/embed`` endpoint reads
    ``input`` (and answers with ``{"embeddings": [[...]]}``). Sending only
    ``input`` to the legacy endpoint yields an empty embedding.

    Each request is retried up to five times with a growing pause; the last
    failure is re-raised.

    Args:
        inputs: Texts to embed.
        model: Ollama model name.
        batch: Unused; kept so both backends share one signature.

    Returns:
        One vector per input, in input order.

    Raises:
        EmbedError: If the response has neither key, or the vector is empty
            (after all retries).
    """
    out: List[List[float]] = []
    for text in inputs:
        for attempt in range(5):
            try:
                resp = _post_json(
                    EMBED_URL, {"model": model, "prompt": text, "input": text}
                )
                if isinstance(resp.get("embedding"), list):
                    vec = resp["embedding"]
                elif isinstance(resp.get("embeddings"), list):
                    # Array-style response: take the first (only) element.
                    vecs = resp["embeddings"]
                    vec = vecs[0] if vecs else []
                else:
                    raise EmbedError(f"Ollama response unexpected keys: {list(resp.keys())}")
                # An empty vector cannot be indexed; fail loudly instead of
                # passing it on.
                if not vec:
                    raise EmbedError("Ollama returned an empty embedding")
                out.append(vec)
                break
            except Exception:
                if attempt == 4:
                    raise
                time.sleep(1.5 * (attempt + 1))
    return out
|
||||
|
||||
def embed_texts(texts: List[str], model: str | None = None, batch_size: int | None = None) -> List[List[float]]:
    """
    Embed *texts* with the backend selected by ``EMBED_BACKEND``.

    A missing (or falsy) ``model`` / ``batch_size`` falls back to
    ``EMBED_MODEL`` / ``EMBED_BATCH``. An empty input returns an empty list
    without contacting any backend. ``"ollama"`` selects the Ollama backend;
    every other value uses the mini backend.
    """
    effective_model = model if model else EMBED_MODEL
    effective_batch = batch_size if batch_size else EMBED_BATCH
    if not texts:
        return []
    backend = _embed_ollama if EMBED_BACKEND == "ollama" else _embed_mini
    return backend(texts, effective_model, effective_batch)
|
||||
|
||||
def embed_one(text: str, model: str | None = None) -> List[float]:
    """Embed a single text via ``embed_texts`` and return its vector."""
    vectors = embed_texts([text], model=model, batch_size=1)
    return vectors[0]
|
||||
|
|
@ -1,103 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Datei: app/core/env_vars.py
|
||||
Version: 1.1.0 (2025-11-08)
|
||||
|
||||
Zweck
|
||||
Einheitliche Auflösung von ENV-Variablen (Prefix, Qdrant, Embeddings, Hashing)
|
||||
mit Abwärtskompatibilität.
|
||||
|
||||
Grundsatz
|
||||
- Für Qdrant-Funktionen ist 'COLLECTION_PREFIX' der Primärschlüssel.
|
||||
- 'MINDNET_PREFIX' bleibt für App-/UI-/Exporter-Kontexte nutzbar.
|
||||
- Fallbacks sorgen dafür, dass ältere Umgebungen weiter funktionieren.
|
||||
|
||||
Wichtig
|
||||
- Lädt optional eine .env (wenn python-dotenv verfügbar ist).
|
||||
- Überschreibt keine bereits gesetzten OS-Variablen (override=False).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from typing import Optional, Dict
|
||||
|
||||
# Optional: .env automatisch laden (ohne Hard-Fail, falls nicht vorhanden)
|
||||
try:
|
||||
from dotenv import load_dotenv, find_dotenv # type: ignore
|
||||
_p = find_dotenv()
|
||||
if _p:
|
||||
load_dotenv(_p, override=False)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# -------- Prefix-Auflösung --------
|
||||
|
||||
def get_collection_prefix(cli_override: Optional[str] = None) -> str:
    """
    Resolve the prefix for Qdrant-related functions.

    Order of precedence:
      1) CLI override (stripped; ignored when blank)
      2) ENV COLLECTION_PREFIX
      3) ENV MINDNET_PREFIX (fallback)
      4) 'mindnet' (default)
    """
    if cli_override:
        cleaned = str(cli_override).strip()
        if cleaned:
            return cleaned
    for env_name in ("COLLECTION_PREFIX", "MINDNET_PREFIX"):
        value = os.getenv(env_name)
        if value:
            return value
    return "mindnet"
|
||||
|
||||
def get_mindnet_prefix(cli_override: Optional[str] = None) -> str:
    """
    Resolve the prefix for app / UI / exporter contexts.

    Order of precedence:
      1) CLI override (stripped; ignored when blank)
      2) ENV MINDNET_PREFIX
      3) ENV COLLECTION_PREFIX (fallback)
      4) 'mindnet'
    """
    if cli_override:
        cleaned = str(cli_override).strip()
        if cleaned:
            return cleaned
    for env_name in ("MINDNET_PREFIX", "COLLECTION_PREFIX"):
        value = os.getenv(env_name)
        if value:
            return value
    return "mindnet"
|
||||
|
||||
def get_prefix(cli_override: Optional[str] = None, target: str = "qdrant") -> str:
    """
    Backward-compatible wrapper around the two prefix resolvers.

    ``target='app'`` (case-insensitive) delegates to ``get_mindnet_prefix``;
    every other target delegates to ``get_collection_prefix``.
    """
    wants_app_prefix = target.lower() == "app"
    resolver = get_mindnet_prefix if wants_app_prefix else get_collection_prefix
    return resolver(cli_override)
|
||||
|
||||
# -------- Qdrant / Embeddings / Hashing --------
|
||||
|
||||
def get_qdrant_url(default: str = "http://127.0.0.1:6333") -> str:
    """Return ENV QDRANT_URL, or *default* when the variable is not set."""
    url = os.environ.get("QDRANT_URL", default)
    return url
|
||||
|
||||
def get_qdrant_api_key(default: str = "") -> str:
    """Return ENV QDRANT_API_KEY, or *default* when the variable is not set."""
    api_key = os.environ.get("QDRANT_API_KEY", default)
    return api_key
|
||||
|
||||
def get_vector_dim(default: int = 384) -> int:
    """
    Return ENV VECTOR_DIM parsed as an integer.

    Falls back to *default* when the variable is unset or cannot be parsed.
    """
    try:
        raw = os.getenv("VECTOR_DIM", str(default))
        parsed = int(raw)
    except Exception:
        return default
    return parsed
|
||||
|
||||
def get_embed_url(default: Optional[str] = None) -> Optional[str]:
    """Return ENV EMBED_URL, or *default* (``None`` unless given) when the variable is not set."""
    embed_url = os.environ.get("EMBED_URL", default)
    return embed_url
|
||||
|
||||
def get_hash_env() -> Dict[str, str]:
    """
    Collect the hash-related environment variables into one dict.

    Only aggregates the raw values (empty string when unset); interpreting
    them is left to the caller.
    """
    names = ("MINDNET_HASH_COMPARE", "MINDNET_HASH_SOURCE", "MINDNET_HASH_NORMALIZE")
    return {name: os.getenv(name, "") for name in names}
|
||||
|
|
@ -1,17 +1,10 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
app/core/graph_adapter.py — Adjazenzaufbau & Subgraph-Expansion
|
||||
|
||||
Zweck:
|
||||
Baut aus Qdrant-Edges (Collection: *_edges) einen leichten In-Memory-Graph.
|
||||
|
||||
Kompatibilität:
|
||||
- WP-04a: Liefert Scores (edge_bonus, centrality).
|
||||
- WP-04b: Liefert jetzt auch Struktur-Daten für Erklärungen (Reverse-Lookup).
|
||||
|
||||
Version:
|
||||
0.4.0 (Update für WP-04b: Reverse Adjacency für Explainability)
|
||||
FILE: app/core/graph_adapter.py
|
||||
DESCRIPTION: Lädt Kanten aus Qdrant und baut einen In-Memory Subgraphen für Scoring (Centrality) und Explanation.
|
||||
VERSION: 0.4.0
|
||||
STATUS: Active
|
||||
DEPENDENCIES: qdrant_client, app.core.qdrant
|
||||
LAST_ANALYSIS: 2025-12-15
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
|
|
|||
|
|
@ -1,8 +1,12 @@
|
|||
"""
|
||||
app/core/ingestion.py
|
||||
|
||||
Zentraler Service für die Transformation von Markdown-Dateien in Qdrant-Objekte.
|
||||
Version: 2.5.2 (Full Feature: Change Detection + Robust IO + Clean Config)
|
||||
FILE: app/core/ingestion.py
|
||||
DESCRIPTION: Haupt-Ingestion-Logik.
|
||||
FIX: Korrekte Priorisierung von Frontmatter für chunk_profile und retriever_weight.
|
||||
Lade Chunk-Config basierend auf dem effektiven Profil, nicht nur dem Notiz-Typ.
|
||||
VERSION: 2.7.0 (Fix: Frontmatter Overrides & Config Loading)
|
||||
STATUS: Active
|
||||
DEPENDENCIES: app.core.parser, app.core.note_payload, app.core.chunker, app.core.derive_edges, app.core.qdrant*, app.services.embeddings_client
|
||||
EXTERNAL_CONFIG: config/types.yaml
|
||||
"""
|
||||
import os
|
||||
import logging
|
||||
|
|
@ -52,16 +56,42 @@ def resolve_note_type(requested: Optional[str], reg: dict) -> str:
|
|||
if requested and requested in types: return requested
|
||||
return "concept"
|
||||
|
||||
def effective_chunk_profile(note_type: str, reg: dict) -> str:
    """
    Return the chunk profile for *note_type*.

    Uses ``types.<note_type>.chunk_profile`` when set to a truthy value,
    otherwise ``defaults.chunk_profile``, otherwise ``"default"``.
    """
    type_cfg = reg.get("types", {}).get(note_type, {})
    profile = type_cfg.get("chunk_profile") if type_cfg else None
    if profile:
        return profile
    return reg.get("defaults", {}).get("chunk_profile", "default")
|
||||
def effective_chunk_profile_name(fm: dict, note_type: str, reg: dict) -> str:
    """
    Determine the name of the chunk profile to use for a note.

    Priority: 1. frontmatter -> 2. type config -> 3. registry defaults ->
    4. ``"sliding_standard"``. At every level the key ``chunking_profile`` is
    preferred and the legacy key ``chunk_profile`` is accepted as a fallback.

    Args:
        fm: Note frontmatter.
        note_type: Resolved note type used to look up the type config.
        reg: Type registry with optional ``types`` and ``defaults`` sections.

    Returns:
        The effective profile name.
    """
    # 1. Frontmatter override (only a non-empty string counts)
    override = fm.get("chunking_profile") or fm.get("chunk_profile")
    if override and isinstance(override, str):
        return override

    # 2. Type config ("or {}" tolerates sections that are present but null)
    type_cfg = (reg.get("types") or {}).get(note_type) or {}
    profile = type_cfg.get("chunking_profile") or type_cfg.get("chunk_profile")
    if profile:
        return profile

    # 3. Global default, accepting the legacy key like the levels above
    defaults = reg.get("defaults") or {}
    return (
        defaults.get("chunking_profile")
        or defaults.get("chunk_profile")
        or "sliding_standard"
    )
|
||||
|
||||
def effective_retriever_weight(fm: dict, note_type: str, reg: dict) -> float:
    """
    Determine the retriever weight for a note.

    Priority: 1. frontmatter -> 2. type config -> 3. registry defaults -> 1.0.

    Args:
        fm: Note frontmatter.
        note_type: Resolved note type used to look up the type config.
        reg: Type registry with optional ``types`` and ``defaults`` sections.

    Returns:
        The effective weight as a float.

    Raises:
        TypeError, ValueError: If the weight configured in the registry
            (type config or defaults) is not convertible to float.
    """
    # 1. Frontmatter override; an unusable value falls through to the config
    override = fm.get("retriever_weight")
    if override is not None:
        try:
            return float(override)
        except (TypeError, ValueError):
            # Only conversion failures are expected here; a bare except would
            # also swallow KeyboardInterrupt / SystemExit.
            pass

    # 2. Type config ("or {}" tolerates sections that are present but null)
    type_cfg = (reg.get("types") or {}).get(note_type) or {}
    if "retriever_weight" in type_cfg:
        return float(type_cfg["retriever_weight"])

    # 3. Global default
    defaults = reg.get("defaults") or {}
    return float(defaults.get("retriever_weight", 1.0))
|
||||
|
||||
|
||||
|
|
@ -77,12 +107,32 @@ class IngestionService:
|
|||
self.registry = load_type_registry()
|
||||
self.embedder = EmbeddingsClient()
|
||||
|
||||
# ACTIVE HASH MODE aus ENV lesen (Default: full)
|
||||
self.active_hash_mode = os.getenv("MINDNET_CHANGE_DETECTION_MODE", "full")
|
||||
|
||||
try:
|
||||
ensure_collections(self.client, self.prefix, self.dim)
|
||||
ensure_payload_indexes(self.client, self.prefix)
|
||||
except Exception as e:
|
||||
logger.warning(f"DB init warning: {e}")
|
||||
|
||||
def _get_chunk_config_by_profile(self, profile_name: str, note_type: str) -> Dict[str, Any]:
    """
    Load the concrete chunking config for a profile name.

    Looks the name up in the registry's ``chunking_profiles`` section and
    returns a shallow copy of the entry, with a list-valued ``overlap``
    converted to a tuple. An unknown profile name is logged as a warning and
    the result of ``get_chunk_config(note_type)`` is returned instead.
    """
    known_profiles = self.registry.get("chunking_profiles", {})
    if profile_name not in known_profiles:
        # Unknown profile: fall back to the type-based config from the chunker.
        logger.warning(f"Profile '{profile_name}' not found in registry. Falling back to type defaults.")
        return get_chunk_config(note_type)

    config = known_profiles[profile_name].copy()
    overlap = config.get("overlap")
    if isinstance(overlap, list):
        config["overlap"] = tuple(overlap)
    return config
|
||||
|
||||
async def process_file(
|
||||
self,
|
||||
file_path: str,
|
||||
|
|
@ -91,72 +141,71 @@ class IngestionService:
|
|||
apply: bool = False,
|
||||
purge_before: bool = False,
|
||||
note_scope_refs: bool = False,
|
||||
hash_mode: str = "body",
|
||||
hash_source: str = "parsed",
|
||||
hash_normalize: str = "canonical"
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Verarbeitet eine einzelne Datei (ASYNC).
|
||||
Inklusive Change Detection (Hash-Check) gegen Qdrant.
|
||||
"""
|
||||
result = {
|
||||
"path": file_path,
|
||||
"status": "skipped",
|
||||
"changed": False,
|
||||
"error": None
|
||||
}
|
||||
|
||||
result = {"path": file_path, "status": "skipped", "changed": False, "error": None}
|
||||
|
||||
# 1. Parse & Frontmatter Validation
|
||||
try:
|
||||
parsed = read_markdown(file_path)
|
||||
if not parsed:
|
||||
return {**result, "error": "Empty or unreadable file"}
|
||||
|
||||
if not parsed: return {**result, "error": "Empty or unreadable file"}
|
||||
fm = normalize_frontmatter(parsed.frontmatter)
|
||||
validate_required_frontmatter(fm)
|
||||
except Exception as e:
|
||||
logger.error(f"Validation failed for {file_path}: {e}")
|
||||
return {**result, "error": f"Validation failed: {str(e)}"}
|
||||
|
||||
# 2. Type & Config Resolution
|
||||
# 2. Type & Config Resolution (FIXED)
|
||||
# Wir ermitteln erst den Typ
|
||||
note_type = resolve_note_type(fm.get("type"), self.registry)
|
||||
fm["type"] = note_type
|
||||
fm["chunk_profile"] = effective_chunk_profile(note_type, self.registry)
|
||||
|
||||
weight = fm.get("retriever_weight")
|
||||
if weight is None:
|
||||
weight = effective_retriever_weight(note_type, self.registry)
|
||||
fm["retriever_weight"] = float(weight)
|
||||
# Dann ermitteln wir die effektiven Werte unter Berücksichtigung des Frontmatters!
|
||||
# Hier lag der Fehler: Vorher wurde einfach überschrieben.
|
||||
effective_profile = effective_chunk_profile_name(fm, note_type, self.registry)
|
||||
effective_weight = effective_retriever_weight(fm, note_type, self.registry)
|
||||
|
||||
# Wir schreiben die effektiven Werte zurück ins FM, damit note_payload sie sicher hat
|
||||
fm["chunk_profile"] = effective_profile
|
||||
fm["retriever_weight"] = effective_weight
|
||||
|
||||
# 3. Build Note Payload
|
||||
try:
|
||||
note_pl = make_note_payload(
|
||||
parsed,
|
||||
vault_root=vault_root,
|
||||
hash_mode=hash_mode,
|
||||
hash_normalize=hash_normalize,
|
||||
hash_source=hash_source,
|
||||
file_path=file_path
|
||||
)
|
||||
if not note_pl.get("fulltext"):
|
||||
note_pl["fulltext"] = getattr(parsed, "body", "") or ""
|
||||
note_pl["retriever_weight"] = fm["retriever_weight"]
|
||||
# Text Body Fallback
|
||||
if not note_pl.get("fulltext"): note_pl["fulltext"] = getattr(parsed, "body", "") or ""
|
||||
|
||||
# Update Payload with explicit effective values (Sicherheit)
|
||||
note_pl["retriever_weight"] = effective_weight
|
||||
note_pl["chunk_profile"] = effective_profile
|
||||
|
||||
note_id = note_pl["note_id"]
|
||||
except Exception as e:
|
||||
logger.error(f"Payload build failed: {e}")
|
||||
return {**result, "error": f"Payload build failed: {str(e)}"}
|
||||
|
||||
# 4. Change Detection (Das fehlende Stück!)
|
||||
# 4. Change Detection
|
||||
old_payload = None
|
||||
if not force_replace:
|
||||
old_payload = self._fetch_note_payload(note_id)
|
||||
|
||||
has_old = old_payload is not None
|
||||
key_current = f"{hash_mode}:{hash_source}:{hash_normalize}"
|
||||
old_hash = (old_payload or {}).get("hashes", {}).get(key_current)
|
||||
new_hash = note_pl.get("hashes", {}).get(key_current)
|
||||
check_key = f"{self.active_hash_mode}:{hash_source}:{hash_normalize}"
|
||||
|
||||
old_hashes = (old_payload or {}).get("hashes")
|
||||
if isinstance(old_hashes, dict): old_hash = old_hashes.get(check_key)
|
||||
elif isinstance(old_hashes, str) and self.active_hash_mode == "body": old_hash = old_hashes
|
||||
else: old_hash = None
|
||||
|
||||
new_hash = note_pl.get("hashes", {}).get(check_key)
|
||||
hash_changed = (old_hash != new_hash)
|
||||
chunks_missing, edges_missing = self._artifacts_missing(note_id)
|
||||
|
||||
|
|
@ -168,18 +217,19 @@ class IngestionService:
|
|||
if not apply:
|
||||
return {**result, "status": "dry-run", "changed": True, "note_id": note_id}
|
||||
|
||||
# 5. Processing (Chunking, Embedding, Edges)
|
||||
# 5. Processing
|
||||
try:
|
||||
body_text = getattr(parsed, "body", "") or ""
|
||||
|
||||
# --- Config Loading (Clean) ---
|
||||
chunk_config = get_chunk_config(note_type)
|
||||
# Hier greift die Logik aus types.yaml (smart=True/False)
|
||||
# FIX: Wir laden jetzt die Config für das SPEZIFISCHE Profil
|
||||
# (z.B. wenn User "sliding_short" wollte, laden wir dessen Params)
|
||||
chunk_config = self._get_chunk_config_by_profile(effective_profile, note_type)
|
||||
|
||||
chunks = await assemble_chunks(fm["id"], body_text, fm["type"], config=chunk_config)
|
||||
|
||||
# chunk_payloads werden mit den aktualisierten FM-Werten gebaut
|
||||
chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks, note_text=body_text)
|
||||
|
||||
# Embedding
|
||||
vecs = []
|
||||
if chunk_pls:
|
||||
texts = [c.get("window") or c.get("text") or "" for c in chunk_pls]
|
||||
|
|
@ -194,7 +244,6 @@ class IngestionService:
|
|||
logger.error(f"Embedding failed: {e}")
|
||||
raise RuntimeError(f"Embedding failed: {e}")
|
||||
|
||||
# Edges
|
||||
try:
|
||||
edges = build_edges_for_note(
|
||||
note_id,
|
||||
|
|
@ -209,7 +258,7 @@ class IngestionService:
|
|||
logger.error(f"Processing failed: {e}", exc_info=True)
|
||||
return {**result, "error": f"Processing failed: {str(e)}"}
|
||||
|
||||
# 6. Upsert Action
|
||||
# 6. Upsert
|
||||
try:
|
||||
if purge_before and has_old:
|
||||
self._purge_artifacts(note_id)
|
||||
|
|
@ -237,8 +286,7 @@ class IngestionService:
|
|||
logger.error(f"Upsert failed: {e}", exc_info=True)
|
||||
return {**result, "error": f"DB Upsert failed: {e}"}
|
||||
|
||||
# --- Qdrant Helper (Restored) ---
|
||||
|
||||
# ... (Restliche Methoden wie _fetch_note_payload bleiben unverändert) ...
|
||||
def _fetch_note_payload(self, note_id: str) -> Optional[dict]:
|
||||
from qdrant_client.http import models as rest
|
||||
col = f"{self.prefix}_notes"
|
||||
|
|
@ -268,38 +316,17 @@ class IngestionService:
|
|||
self.client.delete(collection_name=f"{self.prefix}_{suffix}", points_selector=selector)
|
||||
except Exception: pass
|
||||
|
||||
async def create_from_text(
|
||||
self,
|
||||
markdown_content: str,
|
||||
filename: str,
|
||||
vault_root: str,
|
||||
folder: str = "00_Inbox"
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
WP-11 Persistence API Entrypoint.
|
||||
"""
|
||||
async def create_from_text(self, markdown_content: str, filename: str, vault_root: str, folder: str = "00_Inbox") -> Dict[str, Any]:
|
||||
target_dir = os.path.join(vault_root, folder)
|
||||
os.makedirs(target_dir, exist_ok=True)
|
||||
|
||||
file_path = os.path.join(target_dir, filename)
|
||||
|
||||
try:
|
||||
# Robust Write: Ensure Flush & Sync
|
||||
with open(file_path, "w", encoding="utf-8") as f:
|
||||
f.write(markdown_content)
|
||||
f.flush()
|
||||
os.fsync(f.fileno())
|
||||
|
||||
await asyncio.sleep(0.1)
|
||||
|
||||
logger.info(f"Written file to {file_path}")
|
||||
except Exception as e:
|
||||
return {"status": "error", "error": f"Disk write failed: {str(e)}"}
|
||||
|
||||
return await self.process_file(
|
||||
file_path=file_path,
|
||||
vault_root=vault_root,
|
||||
apply=True,
|
||||
force_replace=True,
|
||||
purge_before=True
|
||||
)
|
||||
return await self.process_file(file_path=file_path, vault_root=vault_root, apply=True, force_replace=True, purge_before=True)
|
||||
|
|
@ -1,17 +1,13 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Modul: app/core/note_payload.py
|
||||
Version: 2.1.0 (WP-11 Update: Aliases support)
|
||||
|
||||
Zweck
|
||||
-----
|
||||
Erzeugt ein robustes Note-Payload. Werte wie `retriever_weight`, `chunk_profile`
|
||||
und `edge_defaults` werden in folgender Priorität bestimmt:
|
||||
1) Frontmatter (Note)
|
||||
2) Typ-Registry (config/types.yaml: types.<type>.*)
|
||||
3) Registry-Defaults (config/types.yaml: defaults.*)
|
||||
4) ENV-Defaults (MINDNET_DEFAULT_RETRIEVER_WEIGHT / MINDNET_DEFAULT_CHUNK_PROFILE)
|
||||
FILE: app/core/note_payload.py
|
||||
DESCRIPTION: Baut das JSON-Objekt.
|
||||
FEATURES:
|
||||
1. Multi-Hash: Berechnet immer 'body' AND 'full' Hashes für flexible Change Detection.
|
||||
2. Config-Fix: Liest korrekt 'chunking_profile' aus types.yaml (statt Legacy 'chunk_profile').
|
||||
VERSION: 2.3.0
|
||||
STATUS: Active
|
||||
DEPENDENCIES: yaml, os, json, pathlib, hashlib
|
||||
EXTERNAL_CONFIG: config/types.yaml
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
|
@ -20,6 +16,7 @@ from typing import Any, Dict, Tuple, Optional
|
|||
import os
|
||||
import json
|
||||
import pathlib
|
||||
import hashlib
|
||||
|
||||
try:
|
||||
import yaml # type: ignore
|
||||
|
|
@ -37,7 +34,6 @@ def _as_dict(x) -> Dict[str, Any]:
|
|||
return dict(x)
|
||||
|
||||
out: Dict[str, Any] = {}
|
||||
# bekannte Attribute übernehmen, sofern vorhanden
|
||||
for attr in (
|
||||
"frontmatter",
|
||||
"body",
|
||||
|
|
@ -56,7 +52,6 @@ def _as_dict(x) -> Dict[str, Any]:
|
|||
if val is not None:
|
||||
out[attr] = val
|
||||
|
||||
# Fallback: wenn immer noch leer, raw speichern
|
||||
if not out:
|
||||
out["raw"] = str(x)
|
||||
|
||||
|
|
@ -64,14 +59,12 @@ def _as_dict(x) -> Dict[str, Any]:
|
|||
|
||||
|
||||
def _pick_args(*args, **kwargs) -> Tuple[Optional[str], Optional[dict]]:
|
||||
"""Extrahiert optionale Zusatzargumente wie path und types_cfg."""
|
||||
path = kwargs.get("path") or (args[0] if args else None)
|
||||
types_cfg = kwargs.get("types_cfg") or kwargs.get("types") or None
|
||||
return path, types_cfg
|
||||
|
||||
|
||||
def _env_float(name: str, default: float) -> float:
|
||||
"""Liest einen Float-Wert aus der Umgebung, mit robustem Fallback."""
|
||||
try:
|
||||
return float(os.environ.get(name, default))
|
||||
except Exception:
|
||||
|
|
@ -79,7 +72,6 @@ def _env_float(name: str, default: float) -> float:
|
|||
|
||||
|
||||
def _ensure_list(x) -> list:
|
||||
"""Garantiert eine String-Liste."""
|
||||
if x is None:
|
||||
return []
|
||||
if isinstance(x, list):
|
||||
|
|
@ -88,13 +80,44 @@ def _ensure_list(x) -> list:
|
|||
return [str(i) for i in x]
|
||||
return [str(x)]
|
||||
|
||||
# --- Hash Logic ---
|
||||
def _compute_hash(content: str) -> str:
    """Return the SHA-256 hex digest of *content* (UTF-8 encoded); empty content yields ""."""
    if content:
        hasher = hashlib.sha256()
        hasher.update(content.encode("utf-8"))
        return hasher.hexdigest()
    return ""
|
||||
|
||||
def _get_hash_source_content(n: Dict[str, Any], mode: str) -> str:
    """
    Assemble the string that gets hashed for change detection.

    - ``mode == "full"``: the tracked frontmatter fields that are not None,
      rendered as ``key:value`` in alphabetical key order and joined with
      ``|``, followed by ``||`` and the body.
    - any other mode (including ``"body"``): the body only.
    """
    body = str(n.get("body") or "")
    if mode != "full":
        return body

    fm = n.get("frontmatter") or {}
    # Fields whose change should trigger re-indexing. Kept in sorted order so
    # the resulting string is deterministic.
    tracked_keys = (
        "chunk_profile",
        "chunking_profile",
        "retriever_weight",
        "status",
        "tags",
        "title",
        "type",
    )
    meta_str = "|".join(
        f"{key}:{fm.get(key)}" for key in tracked_keys if fm.get(key) is not None
    )
    return f"{meta_str}||{body}"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Type-Registry laden
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _load_types_config(explicit_cfg: Optional[dict] = None) -> dict:
|
||||
"""Lädt die Type-Registry aus YAML/JSON oder nutzt ein explizit übergebenes Dict."""
|
||||
if explicit_cfg and isinstance(explicit_cfg, dict):
|
||||
return explicit_cfg
|
||||
|
||||
|
|
@ -111,7 +134,6 @@ def _load_types_config(explicit_cfg: Optional[dict] = None) -> dict:
|
|||
|
||||
|
||||
def _cfg_for_type(note_type: str, reg: dict) -> dict:
|
||||
"""Liefert die Konfiguration für einen konkreten Notiztyp aus der Registry."""
|
||||
if not isinstance(reg, dict):
|
||||
return {}
|
||||
types = reg.get("types") if isinstance(reg.get("types"), dict) else reg
|
||||
|
|
@ -119,7 +141,6 @@ def _cfg_for_type(note_type: str, reg: dict) -> dict:
|
|||
|
||||
|
||||
def _cfg_defaults(reg: dict) -> dict:
|
||||
"""Liefert den Default-Block aus der Registry (defaults/global)."""
|
||||
if not isinstance(reg, dict):
|
||||
return {}
|
||||
for key in ("defaults", "default", "global"):
|
||||
|
|
@ -136,21 +157,16 @@ def _cfg_defaults(reg: dict) -> dict:
|
|||
def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]:
|
||||
"""
|
||||
Baut das Note-Payload für mindnet_notes auf.
|
||||
|
||||
Erwartete Felder im Payload:
|
||||
- note_id: stabile ID aus Frontmatter (id) oder Note-Objekt
|
||||
- title: Titel der Notiz
|
||||
- type: Notiztyp (z. B. concept, project, journal, ...)
|
||||
- path: Pfad im Vault
|
||||
- retriever_weight: effektives Gewicht für den Retriever
|
||||
- chunk_profile: Profil für Chunking (short|medium|long|default|...)
|
||||
- edge_defaults: Liste von Kanten-Typen, die als Defaults gelten
|
||||
- aliases: Liste von Synonymen (WP-11)
|
||||
Inkludiert Hash-Berechnung (Body & Full) und korrigierte Config-Lookups.
|
||||
"""
|
||||
n = _as_dict(note)
|
||||
path_arg, types_cfg_explicit = _pick_args(*args, **kwargs)
|
||||
reg = _load_types_config(types_cfg_explicit)
|
||||
|
||||
# Hash Config (Parameter für Source/Normalize, Mode ist hardcoded auf 'beide')
|
||||
hash_source = kwargs.get("hash_source", "parsed")
|
||||
hash_normalize = kwargs.get("hash_normalize", "canonical")
|
||||
|
||||
fm = n.get("frontmatter") or {}
|
||||
fm_type = fm.get("type") or n.get("type") or "concept"
|
||||
note_type = str(fm_type)
|
||||
|
|
@ -158,7 +174,7 @@ def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]:
|
|||
cfg_type = _cfg_for_type(note_type, reg)
|
||||
cfg_def = _cfg_defaults(reg)
|
||||
|
||||
# --- retriever_weight: Frontmatter > Typ-Config > Registry-Defaults > ENV ---
|
||||
# --- retriever_weight ---
|
||||
default_rw = _env_float("MINDNET_DEFAULT_RETRIEVER_WEIGHT", 1.0)
|
||||
retriever_weight = fm.get("retriever_weight")
|
||||
if retriever_weight is None:
|
||||
|
|
@ -171,20 +187,23 @@ def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]:
|
|||
except Exception:
|
||||
retriever_weight = default_rw
|
||||
|
||||
# --- chunk_profile: Frontmatter > Typ-Config > Registry-Defaults > ENV ---
|
||||
chunk_profile = fm.get("chunk_profile")
|
||||
if chunk_profile is None:
|
||||
chunk_profile = cfg_type.get(
|
||||
"chunk_profile",
|
||||
cfg_def.get(
|
||||
"chunk_profile",
|
||||
os.environ.get("MINDNET_DEFAULT_CHUNK_PROFILE", "medium"),
|
||||
),
|
||||
)
|
||||
if not isinstance(chunk_profile, str):
|
||||
chunk_profile = "medium"
|
||||
# --- chunk_profile (FIXED LOGIC) ---
|
||||
# 1. Frontmatter Override (beide Schreibweisen erlaubt)
|
||||
chunk_profile = fm.get("chunking_profile") or fm.get("chunk_profile")
|
||||
|
||||
# --- edge_defaults: Frontmatter > Typ-Config > Registry-Defaults ---
|
||||
# 2. Type Config (Korrekter Key 'chunking_profile' aus types.yaml)
|
||||
if chunk_profile is None:
|
||||
chunk_profile = cfg_type.get("chunking_profile")
|
||||
|
||||
# 3. Default Config (Fallback auf sliding_standard statt medium)
|
||||
if chunk_profile is None:
|
||||
chunk_profile = cfg_def.get("chunking_profile", "sliding_standard")
|
||||
|
||||
# 4. Safety Fallback
|
||||
if not isinstance(chunk_profile, str) or not chunk_profile:
|
||||
chunk_profile = "sliding_standard"
|
||||
|
||||
# --- edge_defaults ---
|
||||
edge_defaults = fm.get("edge_defaults")
|
||||
if edge_defaults is None:
|
||||
edge_defaults = cfg_type.get(
|
||||
|
|
@ -193,7 +212,7 @@ def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]:
|
|||
)
|
||||
edge_defaults = _ensure_list(edge_defaults)
|
||||
|
||||
# --- Basis-Metadaten (IDs, Titel, Pfad) ---
|
||||
# --- Basis-Metadaten ---
|
||||
note_id = n.get("note_id") or n.get("id") or fm.get("id")
|
||||
title = n.get("title") or fm.get("title") or ""
|
||||
path = n.get("path") or path_arg
|
||||
|
|
@ -208,29 +227,42 @@ def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]:
|
|||
"retriever_weight": retriever_weight,
|
||||
"chunk_profile": chunk_profile,
|
||||
"edge_defaults": edge_defaults,
|
||||
"hashes": {} # Init Hash Dict
|
||||
}
|
||||
|
||||
# Tags / Keywords übernehmen
|
||||
# --- MULTI-HASH CALCULATION (Strategy Decoupling) ---
|
||||
# Wir berechnen immer BEIDE Strategien und speichern sie.
|
||||
# ingestion.py entscheidet dann anhand der ENV-Variable, welcher verglichen wird.
|
||||
modes_to_calc = ["body", "full"]
|
||||
|
||||
for mode in modes_to_calc:
|
||||
content_to_hash = _get_hash_source_content(n, mode)
|
||||
computed_hash = _compute_hash(content_to_hash)
|
||||
# Key Schema: mode:source:normalize (z.B. "full:parsed:canonical")
|
||||
key = f"{mode}:{hash_source}:{hash_normalize}"
|
||||
payload["hashes"][key] = computed_hash
|
||||
|
||||
# Tags / Keywords
|
||||
tags = fm.get("tags") or fm.get("keywords") or n.get("tags")
|
||||
if tags:
|
||||
payload["tags"] = _ensure_list(tags)
|
||||
|
||||
# WP-11: Aliases übernehmen (für Discovery Service)
|
||||
# Aliases
|
||||
aliases = fm.get("aliases")
|
||||
if aliases:
|
||||
payload["aliases"] = _ensure_list(aliases)
|
||||
|
||||
# Zeitliche Metadaten (sofern vorhanden)
|
||||
# Zeit
|
||||
for k in ("created", "modified", "date"):
|
||||
v = fm.get(k) or n.get(k)
|
||||
if v:
|
||||
payload[k] = str(v)
|
||||
|
||||
# Fulltext (Fallback, falls body im Input)
|
||||
# Fulltext
|
||||
if "body" in n and n["body"]:
|
||||
payload["fulltext"] = str(n["body"])
|
||||
|
||||
# JSON-Roundtrip zur harten Validierung (ASCII beibehalten)
|
||||
# JSON Validation
|
||||
json.loads(json.dumps(payload, ensure_ascii=False))
|
||||
|
||||
return payload
|
||||
|
|
@ -1,43 +1,10 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Modul: app/core/parser.py
|
||||
Version: 1.7.1 (fault-tolerant, API-kompatibel)
|
||||
Datum: 2025-10-01
|
||||
|
||||
Zweck
|
||||
-----
|
||||
Fehlertolerantes Einlesen von Markdown-Dateien mit YAML-Frontmatter.
|
||||
Kompatibel zur bisherigen Parser-API, aber robust gegenüber Nicht-UTF-8-Dateien:
|
||||
- Versucht nacheinander: utf-8 → utf-8-sig → cp1252 → latin-1.
|
||||
- Bei Fallback wird ein JSON-Warnhinweis auf stdout ausgegeben; der Import bricht NICHT ab.
|
||||
- YAML-Frontmatter wird mit '---' am Anfang und '---' als Abschluss erkannt.
|
||||
- extract_wikilinks() normalisiert [[id#anchor|label]] → 'id'.
|
||||
|
||||
Öffentliche API (kompatibel):
|
||||
- class ParsedNote(frontmatter: dict, body: str, path: str)
|
||||
- read_markdown(path) -> ParsedNote | None
|
||||
- normalize_frontmatter(fm) -> dict
|
||||
- validate_required_frontmatter(fm, required: tuple[str,...]=("id","title")) -> None
|
||||
- extract_wikilinks(text) -> list[str]
|
||||
- FRONTMATTER_RE (Kompatibilitäts-Konstante; Regex für '---'-Zeilen)
|
||||
|
||||
Beispiele
|
||||
---------
|
||||
from app.core.parser import read_markdown, normalize_frontmatter, validate_required_frontmatter
|
||||
parsed = read_markdown("./vault/30_projects/project-demo.md")
|
||||
fm = normalize_frontmatter(parsed.frontmatter)
|
||||
validate_required_frontmatter(fm)
|
||||
body = parsed.body
|
||||
|
||||
from app.core.parser import extract_wikilinks
|
||||
links = extract_wikilinks(body)
|
||||
|
||||
Abhängigkeiten
|
||||
--------------
|
||||
- PyYAML (yaml)
|
||||
|
||||
Lizenz: MIT (projektintern)
|
||||
FILE: app/core/parser.py
|
||||
DESCRIPTION: Liest Markdown-Dateien fehlertolerant (Encoding-Fallback). Trennt Frontmatter (YAML) vom Body.
|
||||
VERSION: 1.7.1
|
||||
STATUS: Active
|
||||
DEPENDENCIES: yaml, re, dataclasses, json, io, os
|
||||
LAST_ANALYSIS: 2025-12-15
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
|
|
|
|||
|
|
@ -1,28 +1,10 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
app/core/qdrant.py
|
||||
Version: 2.2.0 (2025-11-11)
|
||||
|
||||
Aufgabe
|
||||
-------
|
||||
- Zentraler Qdrant-Zugriff (Client, Config)
|
||||
- Collection-Anlage (notes/chunks/edges)
|
||||
- **Payload-Indizes sicherstellen** (idempotent)
|
||||
|
||||
Hinweis
|
||||
-------
|
||||
Diese Datei ist als Drop-in-Ersatz gedacht, falls in deinem Projekt noch keine
|
||||
robuste ensure_payload_indexes()-Implementierung vorliegt. Die Signaturen
|
||||
bleiben kompatibel zu scripts.import_markdown und scripts.reset_qdrant.
|
||||
|
||||
API-Notizen
|
||||
-----------
|
||||
- Payload-Indizes werden mit `create_payload_index` angelegt.
|
||||
- Typen stammen aus `qdrant_client.http.models.PayloadSchemaType`:
|
||||
KEYWORD | TEXT | INTEGER | FLOAT | BOOL | GEO | DATETIME
|
||||
- Für häufige Filterfelder (note_id, kind, scope, type, tags, ...) legen wir
|
||||
Indizes an. Das ist laut Qdrant-Doku Best Practice für performante Filter.
|
||||
FILE: app/core/qdrant.py
|
||||
DESCRIPTION: Qdrant-Client Factory und Schema-Management. Erstellt Collections und Payload-Indizes.
|
||||
VERSION: 2.2.0
|
||||
STATUS: Active
|
||||
DEPENDENCIES: qdrant_client, dataclasses, os
|
||||
LAST_ANALYSIS: 2025-12-15
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
|
|
|
|||
|
|
@ -1,18 +1,10 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
app/core/qdrant_points.py - robust points helpers for Qdrant
|
||||
|
||||
- Single source of truth for building PointStruct for notes/chunks/edges
|
||||
- Backward-compatible payloads for edges
|
||||
- Handles both Single-Vector and Named-Vector collections
|
||||
- Deterministic overrides via ENV to avoid auto-detection traps:
|
||||
* NOTES_VECTOR_NAME, CHUNKS_VECTOR_NAME, EDGES_VECTOR_NAME
|
||||
* MINDNET_VECTOR_NAME (fallback)
|
||||
> Set to a concrete name (e.g. "text") to force Named-Vector with that name
|
||||
> Set to "__single__" (or "single") to force Single-Vector
|
||||
|
||||
Version: 1.5.0 (2025-11-08)
|
||||
FILE: app/core/qdrant_points.py
|
||||
DESCRIPTION: Object-Mapper für Qdrant. Konvertiert JSON-Payloads (Notes, Chunks, Edges) in PointStructs und generiert deterministische UUIDs.
|
||||
VERSION: 1.5.0
|
||||
STATUS: Active
|
||||
DEPENDENCIES: qdrant_client, uuid, os
|
||||
LAST_ANALYSIS: 2025-12-15
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import os
|
||||
|
|
|
|||
|
|
@ -1,56 +0,0 @@
|
|||
"""
|
||||
app/core/ranking.py — Kombiniertes Scoring (WP-04)
|
||||
|
||||
Zweck:
|
||||
Zusammenführen von semantischem Score (normalisiert), Edge-Bonus und
|
||||
Centrality-Bonus in einen Gesamtscore für die Ergebnisreihung.
|
||||
Kompatibilität:
|
||||
Python 3.12+
|
||||
Version:
|
||||
0.1.0 (Erstanlage)
|
||||
Stand:
|
||||
2025-10-07
|
||||
Bezug:
|
||||
WP-04 Ranking-Formel (w_sem, w_edge, w_cent)
|
||||
Nutzung:
|
||||
from app.core.ranking import combine_scores
|
||||
Änderungsverlauf:
|
||||
0.1.0 (2025-10-07) – Erstanlage.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import List, Tuple, Dict
|
||||
|
||||
|
||||
def normalize_scores(values: List[float]) -> List[float]:
    """Min-max normalize scores over the candidate set.

    Args:
        values: Raw scores. Any iterable of numbers is accepted; it is
            materialized once so generators and tuples work as well.

    Returns:
        A new list with each score mapped into [0, 1]. An empty input yields
        an empty list; if all scores are (almost) equal, every entry is 0.5.
        The input object is never returned or modified.
    """
    # Materialize once: min()/max() would otherwise exhaust a one-shot
    # iterable before the normalization pass, and returning the caller's own
    # object for empty input would alias it.
    scores = list(values)
    if not scores:
        return []
    lo, hi = min(scores), max(scores)
    span = hi - lo
    if span < 1e-9:
        # Constant scores carry no ranking information -> neutral midpoint.
        return [0.5] * len(scores)
    return [(v - lo) / span for v in scores]
|
||||
|
||||
|
||||
def combine_scores(
    hits: List[Tuple[str, float, dict]],  # (id, semantic_score, payload)
    edge_bonus_map: Dict[str, float],
    centrality_map: Dict[str, float],
    w_sem: float = 0.70,
    w_edge: float = 0.25,
    w_cent: float = 0.05,
) -> List[Tuple[str, float, float, float, float]]:
    """Blend semantic score, edge bonus and centrality bonus into one ranking.

    The semantic scores are normalized over all hits first; the bonuses are
    looked up per point id and default to 0.0 when a point has no entry.

    Returns:
        Tuples ``(point_id, total_score, edge_bonus, centrality_bonus,
        raw_semantic_score)`` ordered by ``total_score``, highest first.
    """
    normalized = normalize_scores([raw for _, raw, _ in hits])

    ranked = []
    for (point_id, raw_score, _payload), sem_norm in zip(hits, normalized):
        edge_bonus = edge_bonus_map.get(point_id, 0.0)
        centrality_bonus = centrality_map.get(point_id, 0.0)
        total_score = w_sem * sem_norm + w_edge * edge_bonus + w_cent * centrality_bonus
        ranked.append((point_id, total_score, edge_bonus, centrality_bonus, raw_score))

    # Stable sort: hits with equal totals keep their incoming order.
    return sorted(ranked, key=lambda row: row[1], reverse=True)
|
||||
|
|
@ -1,8 +1,10 @@
|
|||
"""
|
||||
app/core/retriever.py — Hybrider Such-Algorithmus
|
||||
|
||||
Version:
|
||||
0.5.3 (WP-06 Fix: Populate 'payload' in QueryHit for meta-data access)
|
||||
FILE: app/core/retriever.py
|
||||
DESCRIPTION: Implementiert die Hybrid-Suche (Vektor + Graph-Expansion) und das Scoring-Modell (Explainability).
|
||||
VERSION: 0.5.3
|
||||
STATUS: Active
|
||||
DEPENDENCIES: app.config, app.models.dto, app.core.qdrant*, app.services.embeddings_client, app.core.graph_adapter
|
||||
LAST_ANALYSIS: 2025-12-15
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
|
|
|
|||
|
|
@ -1,116 +0,0 @@
|
|||
"""app/core/retriever_config.py
|
||||
---------------------------------
|
||||
Zentrale Konfiguration für den mindnet-Retriever (WP-04).
|
||||
|
||||
Zweck:
|
||||
- Lädt config/retriever.yaml (falls vorhanden) oder nutzt sinnvolle Defaults.
|
||||
- Bietet einen gecachten Zugriff auf die Retriever-Config für
|
||||
andere Module (z. B. graph_adapter, retriever).
|
||||
|
||||
Hinweis zur Weiterentwicklung (Selbstjustierung):
|
||||
- Die hier definierten Parameter sind so gewählt, dass sie später
|
||||
durch ein Feedback-/Learning-to-Rank-Modell überschrieben werden
|
||||
können, ohne die restliche Architektur anzupassen.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from functools import lru_cache
|
||||
from pathlib import Path
|
||||
from typing import Dict
|
||||
|
||||
try:
|
||||
import yaml # type: ignore
|
||||
except Exception: # pragma: no cover - Fallback, falls PyYAML nicht installiert ist.
|
||||
yaml = None # type: ignore
|
||||
|
||||
@dataclass(frozen=True)
class RetrieverConfig:
    """Immutable bundle of the retriever's scoring parameters.

    Instances are produced by ``get_retriever_config``; the field names
    mirror the keys read from the ``retriever`` section of the YAML file.
    """

    # Scale factor for the semantic part of the score.
    semantic_scale: float
    # Scale factor for the edge-based bonus.
    edge_scale: float
    # Scale factor for the centrality bonus.
    centrality_scale: float
    # Weight per edge kind (e.g. "depends_on", "references").
    edge_weights: Dict[str, float]
|
||||
|
||||
@lru_cache
def get_retriever_config() -> RetrieverConfig:
    """Load the retriever configuration (defaults, optionally overridden by YAML).

    Order of precedence:
      1. Built-in defaults.
      2. ``config/retriever.yaml`` below the project root, or the file named
         by the ``MINDNET_RETRIEVER_CONFIG`` environment variable; values in
         its ``retriever`` section override the defaults.

    The YAML file is optional and read on a best-effort basis: a missing
    PyYAML module, a missing file, unparsable content, a root or ``retriever``
    section that is not a mapping, or individual non-numeric values all fall
    back to the defaults instead of raising.

    The result is cached for the lifetime of the process; call
    ``get_retriever_config.cache_clear()`` to force a reload.

    Returns:
        The effective ``RetrieverConfig``.
    """

    def _as_float(raw, fallback: float) -> float:
        """Convert *raw* to float, keeping *fallback* when it is not numeric."""
        try:
            return float(raw)
        except (TypeError, ValueError):
            return fallback

    # 1) Defaults - deliberately conservative.
    semantic_scale = 1.0
    edge_scale = 1.0
    centrality_scale = 1.0

    edge_weights: Dict[str, float] = {
        # Knowledge edges
        "depends_on": 1.0,
        "related_to": 0.7,
        "similar_to": 0.7,
        "references": 0.5,
        # Structural edges
        "belongs_to": 0.2,
        "next": 0.1,
        "prev": 0.1,
        # Other / technical edges
        "backlink": 0.2,
        "references_at": 0.2,
    }

    # 2) Optional: locate the YAML configuration.
    cfg_path_env = os.getenv("MINDNET_RETRIEVER_CONFIG")
    if cfg_path_env:
        cfg_path = Path(cfg_path_env)
    else:
        # Project root = two levels above app/core/
        cfg_path = Path(__file__).resolve().parents[2] / "config" / "retriever.yaml"

    if yaml is not None and cfg_path.exists():
        try:
            data = yaml.safe_load(cfg_path.read_text(encoding="utf-8")) or {}
        except Exception:
            # Deliberate best-effort: an unreadable or broken file must not
            # take the retriever down; the defaults stay in effect.
            data = {}

        # safe_load may return a list or scalar for a malformed file; only a
        # mapping can carry a "retriever" section.
        retr = data.get("retriever") if isinstance(data, dict) else None
        if not isinstance(retr, dict):
            retr = {}

        # Override the scale values where a usable number is given.
        semantic_scale = _as_float(retr.get("semantic_scale", semantic_scale), semantic_scale)
        edge_scale = _as_float(retr.get("edge_scale", edge_scale), edge_scale)
        centrality_scale = _as_float(retr.get("centrality_scale", centrality_scale), centrality_scale)

        # Per-kind edge weights; entries with non-numeric values are skipped.
        ew_cfg = retr.get("edge_weights") or {}
        if isinstance(ew_cfg, dict):
            for k, v in ew_cfg.items():
                try:
                    edge_weights[str(k)] = float(v)
                except (TypeError, ValueError):
                    continue

    return RetrieverConfig(
        semantic_scale=semantic_scale,
        edge_scale=edge_scale,
        centrality_scale=centrality_scale,
        edge_weights=edge_weights,
    )
|
||||
|
|
@ -1,22 +0,0 @@
|
|||
from __future__ import annotations
|
||||
import json
|
||||
import os
|
||||
from functools import lru_cache
|
||||
from jsonschema import Draft202012Validator, RefResolver
|
||||
|
||||
# Directory holding the JSON schema files; overridable via the SCHEMAS_DIR
# environment variable, otherwise "<two dirs above this file>/../schemas".
SCHEMAS_DIR = os.getenv(
    "SCHEMAS_DIR",
    os.path.join(os.path.dirname(os.path.dirname(__file__)), "..", "schemas"),
)


@lru_cache(maxsize=16)
def load_schema(name: str) -> dict:
    """Read and parse one JSON schema file from ``SCHEMAS_DIR``.

    Args:
        name: File name of the schema, e.g. "note.schema.json",
            "chunk.schema.json" or "edge.schema.json".

    Returns:
        The parsed schema. Results are cached per name, so repeated calls
        return the same dict object.

    Raises:
        FileNotFoundError: If no regular file of that name exists.
    """
    schema_path = os.path.join(SCHEMAS_DIR, name)
    if os.path.isfile(schema_path):
        with open(schema_path, "r", encoding="utf-8") as handle:
            return json.load(handle)
    raise FileNotFoundError(f"Schema not found: {schema_path}")
|
||||
|
||||
@lru_cache(maxsize=16)
def get_validator(name: str) -> Draft202012Validator:
    """Build (and cache per schema name) a Draft 2020-12 validator.

    The schema is loaded through ``load_schema`` and a ``RefResolver``
    derived from that same schema is attached to the validator.
    """
    schema_doc = load_schema(name)
    return Draft202012Validator(
        schema_doc,
        resolver=RefResolver.from_schema(schema_doc),
    )
|
||||
|
|
@ -1,30 +1,11 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Modul: app/core/type_registry.py
|
||||
Version: 1.0.0
|
||||
Datum: 2025-11-08
|
||||
|
||||
Zweck
|
||||
-----
|
||||
Lädt eine optionale Typ-Registry (config/types.yaml) und stellt
|
||||
komfortable Zugriffsfunktionen bereit. Die Registry ist *optional*:
|
||||
- Fehlt die Datei oder ist das YAML defekt, wird ein konservativer
|
||||
Default (Typ "concept") verwendet und es wird eine Warnung ausgegeben.
|
||||
- Änderungen an der Datei greifen nach einem Neustart des Prozesses.
|
||||
|
||||
Öffentliche API
|
||||
---------------
|
||||
- load_type_registry(path: str = "config/types.yaml") -> dict
|
||||
- get_type_config(note_type: str, reg: dict) -> dict
|
||||
- resolve_note_type(fm_type: str | None, reg: dict) -> str
|
||||
- effective_chunk_profile(note_type: str, reg: dict) -> str | None
|
||||
- profile_overlap(profile: str | None) -> tuple[int,int] # nur Overlap-Empfehlung
|
||||
|
||||
Hinweis
|
||||
-------
|
||||
Die Registry steuert KEINE Breaking Changes. Ohne Datei/Typ bleibt das
|
||||
Verhalten exakt wie im Release-Stand 20251105.
|
||||
FILE: app/core/type_registry.py
|
||||
DESCRIPTION: Loader für types.yaml. Achtung: Wird in der aktuellen Pipeline meist durch lokale Loader in 'ingestion.py' oder 'note_payload.py' umgangen.
|
||||
VERSION: 1.0.0
|
||||
STATUS: Deprecated (Redundant)
|
||||
DEPENDENCIES: yaml, os, functools
|
||||
EXTERNAL_CONFIG: config/types.yaml
|
||||
LAST_ANALYSIS: 2025-12-15
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
|
|
|
|||
|
|
@ -1,16 +0,0 @@
|
|||
from __future__ import annotations
|
||||
from typing import Dict
|
||||
from jsonschema import ValidationError
|
||||
from .schema_loader import get_validator
|
||||
|
||||
NOTE_SCHEMA_NAME = "note.schema.json"
|
||||
|
||||
def validate_note_payload(payload: Dict) -> None:
    """Validate a note payload against the note JSON schema.

    Returns ``None`` when the payload is valid.

    Raises:
        ValidationError: If at least one violation is found. The message
            joins all violations, ordered by their path, as
            ``"<dotted.path>: <message>"`` separated by " | "; violations at
            the document root use ``<root>`` as the path.
    """
    found = list(get_validator(NOTE_SCHEMA_NAME).iter_errors(payload))
    if not found:
        return

    found.sort(key=lambda err: err.path)
    parts = [
        f"{'.'.join(str(seg) for seg in err.path) or '<root>'}: {err.message}"
        for err in found
    ]
    raise ValidationError(" | ".join(parts))
|
||||
|
|
@ -1,40 +0,0 @@
|
|||
"""
|
||||
Version 1
|
||||
"""
|
||||
from __future__ import annotations
|
||||
from fastapi import FastAPI, HTTPException
|
||||
from pydantic import BaseModel
|
||||
from typing import List, Optional
|
||||
from sentence_transformers import SentenceTransformer
|
||||
|
||||
app = FastAPI(title="mindnet-embed", version="1.0")
|
||||
|
||||
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2" # 384-dim
|
||||
_model: SentenceTransformer | None = None
|
||||
|
||||
class EmbedIn(BaseModel):
    """Request body of ``POST /embed``."""

    # Optional model name supplied by the client; the /embed handler in this
    # file does not read it.
    model: Optional[str] = None
    # Texts to embed.
    inputs: List[str]
|
||||
|
||||
class EmbedOut(BaseModel):
    """Response body of ``POST /embed``."""

    # Embedding vectors as produced by the model for the request inputs.
    embeddings: List[List[float]]
|
||||
|
||||
@app.on_event("startup")
def _load_model():
    """Startup hook: instantiate the model and store it in the module-global ``_model``."""
    global _model
    _model = SentenceTransformer(MODEL_NAME)
|
||||
|
||||
@app.get("/health")
def health():
    """Report service status together with the configured model name and vector size."""
    return dict(ok=True, model=MODEL_NAME, dim=384)
|
||||
|
||||
@app.post("/embed", response_model=EmbedOut)
def embed(payload: EmbedIn) -> EmbedOut:
    """Embed the given texts with the loaded model.

    Raises:
        HTTPException: 503 while the model has not been loaded yet; 500 if
            any produced vector does not have 384 entries.
    """
    encoder = _model
    if encoder is None:
        raise HTTPException(status_code=503, detail="Model not loaded")

    texts = payload.inputs
    if not texts:
        # Nothing to encode - answer with an empty result.
        return EmbedOut(embeddings=[])

    vectors = encoder.encode(texts, normalize_embeddings=False).tolist()
    for vec in vectors:
        if len(vec) != 384:
            raise HTTPException(status_code=500, detail="Embedding size mismatch (expected 384)")
    return EmbedOut(embeddings=vectors)
|
||||
|
|
@ -1,6 +1,10 @@
|
|||
"""
|
||||
Version 0.1
|
||||
|
||||
FILE: app/embeddings.py
|
||||
DESCRIPTION: Lokaler Wrapper für SentenceTransformer Embeddings.
|
||||
VERSION: 0.1.0
|
||||
STATUS: Active (Bestätigung durch Aufrufer erforderlich)
|
||||
DEPENDENCIES: app.config, sentence_transformers
|
||||
LAST_ANALYSIS: 2025-12-15
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
|
|
|||
|
|
@ -1,3 +1,12 @@
|
|||
"""
|
||||
FILE: app/frontend/ui.py
|
||||
DESCRIPTION: Main Entrypoint für Streamlit. Router, der basierend auf Sidebar-Auswahl die Module (Chat, Editor, Graph) lädt.
|
||||
VERSION: 2.6.0
|
||||
STATUS: Active
|
||||
DEPENDENCIES: streamlit, ui_config, ui_sidebar, ui_chat, ui_editor, ui_graph_service, ui_graph*, ui_graph_cytoscape
|
||||
LAST_ANALYSIS: 2025-12-15
|
||||
"""
|
||||
|
||||
import streamlit as st
|
||||
import uuid
|
||||
|
||||
|
|
|
|||
|
|
@ -1,3 +1,12 @@
|
|||
"""
|
||||
FILE: app/frontend/ui_api.py
|
||||
DESCRIPTION: Wrapper für Backend-Calls (Chat, Ingest, Feedback). Kapselt requests und Error-Handling.
|
||||
VERSION: 2.6.0
|
||||
STATUS: Active
|
||||
DEPENDENCIES: requests, streamlit, ui_config
|
||||
LAST_ANALYSIS: 2025-12-15
|
||||
"""
|
||||
|
||||
import requests
|
||||
import streamlit as st
|
||||
from ui_config import CHAT_ENDPOINT, INGEST_ANALYZE_ENDPOINT, INGEST_SAVE_ENDPOINT, FEEDBACK_ENDPOINT, API_TIMEOUT
|
||||
|
|
|
|||
|
|
@ -1,3 +1,12 @@
|
|||
"""
|
||||
FILE: app/frontend/ui_callbacks.py
|
||||
DESCRIPTION: Event-Handler für UI-Interaktionen. Implementiert den Übergang vom Graphen zum Editor (State Transfer).
|
||||
VERSION: 2.6.0
|
||||
STATUS: Active
|
||||
DEPENDENCIES: streamlit, os, ui_utils
|
||||
LAST_ANALYSIS: 2025-12-15
|
||||
"""
|
||||
|
||||
import streamlit as st
|
||||
import os
|
||||
from ui_utils import build_markdown_doc
|
||||
|
|
|
|||
|
|
@ -1,3 +1,12 @@
|
|||
"""
|
||||
FILE: app/frontend/ui_chat.py
|
||||
DESCRIPTION: Chat-UI. Rendert Nachrichtenverlauf, Quellen-Expanders mit Feedback-Buttons und delegiert bei Bedarf an den Editor.
|
||||
VERSION: 2.6.0
|
||||
STATUS: Active
|
||||
DEPENDENCIES: streamlit, ui_api, ui_editor
|
||||
LAST_ANALYSIS: 2025-12-15
|
||||
"""
|
||||
|
||||
import streamlit as st
|
||||
from ui_api import send_chat_message, submit_feedback
|
||||
from ui_editor import render_draft_editor
|
||||
|
|
|
|||
|
|
@ -1,3 +1,12 @@
|
|||
"""
|
||||
FILE: app/frontend/ui_config.py
|
||||
DESCRIPTION: Zentrale Konfiguration für das Frontend. Definiert API-Endpoints, Timeouts und Graph-Styles (Farben).
|
||||
VERSION: 2.6.0
|
||||
STATUS: Active
|
||||
DEPENDENCIES: os, hashlib, dotenv, pathlib
|
||||
LAST_ANALYSIS: 2025-12-15
|
||||
"""
|
||||
|
||||
import os
|
||||
import hashlib
|
||||
from dotenv import load_dotenv
|
||||
|
|
|
|||
|
|
@ -1,3 +1,11 @@
|
|||
"""
|
||||
FILE: app/frontend/ui_editor.py
|
||||
DESCRIPTION: Markdown-Editor mit Live-Vorschau.
|
||||
Refactored für WP-14: Asynchrones Feedback-Handling (Queued State).
|
||||
VERSION: 2.7.0 (Fix: Async Save UI)
|
||||
STATUS: Active
|
||||
DEPENDENCIES: streamlit, uuid, re, datetime, ui_utils, ui_api
|
||||
"""
|
||||
import streamlit as st
|
||||
import uuid
|
||||
import re
|
||||
|
|
@ -68,14 +76,11 @@ def render_draft_editor(msg):
|
|||
|
||||
# --- UI LAYOUT ---
|
||||
|
||||
# Header Info (Debug Pfad anzeigen, damit wir sicher sind)
|
||||
origin_fname = st.session_state.get(f"{key_base}_origin_filename")
|
||||
|
||||
if origin_fname:
|
||||
# Dateiname extrahieren für saubere Anzeige
|
||||
display_name = str(origin_fname).split("/")[-1]
|
||||
st.success(f"📂 **Update-Modus**: `{display_name}`")
|
||||
# Debugging: Zeige vollen Pfad im Expander
|
||||
with st.expander("Dateipfad Details", expanded=False):
|
||||
st.code(origin_fname)
|
||||
st.markdown(f'<div class="draft-box" style="border-left: 5px solid #ff9f43;">', unsafe_allow_html=True)
|
||||
|
|
@ -165,21 +170,33 @@ def render_draft_editor(msg):
|
|||
save_label = "💾 Update speichern" if origin_fname else "💾 Neu anlegen & Indizieren"
|
||||
|
||||
if st.button(save_label, type="primary", key=f"{key_base}_save"):
|
||||
with st.spinner("Speichere im Vault..."):
|
||||
with st.spinner("Sende an Backend..."):
|
||||
if origin_fname:
|
||||
# UPDATE: Ziel ist der exakte Pfad
|
||||
target_file = origin_fname
|
||||
else:
|
||||
# CREATE: Neuer Dateiname
|
||||
raw_title = final_meta.get("title", "draft")
|
||||
target_file = f"{datetime.now().strftime('%Y%m%d')}-{slugify(raw_title)[:60]}.md"
|
||||
|
||||
result = save_draft_to_vault(final_doc, filename=target_file)
|
||||
|
||||
# --- WP-14 CHANGE START: Handling Async Response ---
|
||||
if "error" in result:
|
||||
st.error(f"Fehler: {result['error']}")
|
||||
else:
|
||||
st.success(f"Gespeichert: {result.get('file_path')}")
|
||||
status = result.get("status", "success")
|
||||
file_path = result.get("file_path", "unbekannt")
|
||||
|
||||
if status == "queued":
|
||||
# Neuer Status für Async Processing
|
||||
st.info(f"✅ **Eingereiht:** Datei `{file_path}` wurde gespeichert.")
|
||||
st.caption("Die KI-Analyse und Indizierung läuft im Hintergrund. Du kannst weiterarbeiten.")
|
||||
else:
|
||||
# Legacy / Synchroner Fall
|
||||
st.success(f"Gespeichert: {file_path}")
|
||||
|
||||
st.balloons()
|
||||
# --- WP-14 CHANGE END ---
|
||||
|
||||
with b2:
|
||||
if st.button("📋 Code anzeigen", key=f"{key_base}_btn_copy"):
|
||||
st.code(final_doc, language="markdown")
|
||||
|
|
@ -189,25 +206,18 @@ def render_draft_editor(msg):
|
|||
def render_manual_editor():
|
||||
"""
|
||||
Rendert den manuellen Editor.
|
||||
PRÜFT, ob eine Edit-Anfrage aus dem Graphen vorliegt!
|
||||
"""
|
||||
|
||||
target_msg = None
|
||||
|
||||
# 1. Prüfen: Gibt es Nachrichten im Verlauf?
|
||||
if st.session_state.messages:
|
||||
last_msg = st.session_state.messages[-1]
|
||||
|
||||
# 2. Ist die letzte Nachricht eine Edit-Anfrage? (Erkennbar am query_id prefix 'edit_')
|
||||
qid = str(last_msg.get("query_id", ""))
|
||||
if qid.startswith("edit_"):
|
||||
target_msg = last_msg
|
||||
|
||||
# 3. Fallback: Leeres Template, falls keine Edit-Anfrage vorliegt
|
||||
if not target_msg:
|
||||
target_msg = {
|
||||
"content": "---\ntype: concept\ntitle: Neue Notiz\nstatus: draft\ntags: []\n---\n# Titel\n",
|
||||
"query_id": f"manual_{uuid.uuid4()}" # Eigene ID, damit neuer State entsteht
|
||||
"query_id": f"manual_{uuid.uuid4()}"
|
||||
}
|
||||
|
||||
render_draft_editor(target_msg)
|
||||
|
|
@ -1,3 +1,12 @@
|
|||
"""
|
||||
FILE: app/frontend/ui_graph.py
|
||||
DESCRIPTION: Legacy Graph-Explorer (Streamlit-Agraph). Implementiert Physik-Simulation (BarnesHut) und direkten Editor-Sprung.
|
||||
VERSION: 2.6.0
|
||||
STATUS: Maintenance (Active Fallback)
|
||||
DEPENDENCIES: streamlit, streamlit_agraph, qdrant_client, ui_config, ui_callbacks
|
||||
LAST_ANALYSIS: 2025-12-15
|
||||
"""
|
||||
|
||||
import streamlit as st
|
||||
from streamlit_agraph import agraph, Config
|
||||
from qdrant_client import models
|
||||
|
|
|
|||
|
|
@ -1,3 +1,12 @@
|
|||
"""
|
||||
FILE: app/frontend/ui_graph_cytoscape.py
|
||||
DESCRIPTION: Moderner Graph-Explorer (Cytoscape.js). Features: COSE-Layout, Deep-Linking (URL Params), Active Inspector Pattern (CSS-Styling ohne Re-Render).
|
||||
VERSION: 2.6.0
|
||||
STATUS: Active
|
||||
DEPENDENCIES: streamlit, st_cytoscape, qdrant_client, ui_config, ui_callbacks
|
||||
LAST_ANALYSIS: 2025-12-15
|
||||
"""
|
||||
|
||||
import streamlit as st
|
||||
from st_cytoscape import cytoscape
|
||||
from qdrant_client import models
|
||||
|
|
|
|||
|
|
@ -1,3 +1,12 @@
|
|||
"""
|
||||
FILE: app/frontend/ui_graph_service.py
|
||||
DESCRIPTION: Data Layer für den Graphen. Greift direkt auf Qdrant zu (Performance), um Knoten/Kanten zu laden und Texte zu rekonstruieren ("Stitching").
|
||||
VERSION: 2.6.0
|
||||
STATUS: Active
|
||||
DEPENDENCIES: qdrant_client, streamlit_agraph, ui_config, re
|
||||
LAST_ANALYSIS: 2025-12-15
|
||||
"""
|
||||
|
||||
import re
|
||||
from qdrant_client import QdrantClient, models
|
||||
from streamlit_agraph import Node, Edge
|
||||
|
|
|
|||
|
|
@ -1,3 +1,12 @@
|
|||
"""
|
||||
FILE: app/frontend/ui_sidebar.py
|
||||
DESCRIPTION: Rendert die Sidebar. Steuert den Modus-Wechsel (Chat/Editor/Graph) und globale Settings (Top-K).
|
||||
VERSION: 2.6.0
|
||||
STATUS: Active
|
||||
DEPENDENCIES: streamlit, ui_utils, ui_config
|
||||
LAST_ANALYSIS: 2025-12-15
|
||||
"""
|
||||
|
||||
import streamlit as st
|
||||
from ui_utils import load_history_from_logs
|
||||
from ui_config import HISTORY_FILE
|
||||
|
|
|
|||
|
|
@ -1,3 +1,12 @@
|
|||
"""
|
||||
FILE: app/frontend/ui_utils.py
|
||||
DESCRIPTION: String-Utilities. Parser für Markdown/YAML (LLM-Healing) und Helper für History-Loading.
|
||||
VERSION: 2.6.0
|
||||
STATUS: Active
|
||||
DEPENDENCIES: re, yaml, unicodedata, json, datetime
|
||||
LAST_ANALYSIS: 2025-12-15
|
||||
"""
|
||||
|
||||
import re
|
||||
import yaml
|
||||
import unicodedata
|
||||
|
|
|
|||
|
|
@ -1,172 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Modul: app/graph/service.py
|
||||
Version: 0.1.0
|
||||
Datum: 2025-09-10
|
||||
|
||||
Zweck
|
||||
-----
|
||||
Leichtgewichtiger Graph-Layer über Qdrant:
|
||||
- get_note(note_id)
|
||||
- get_chunks(note_id)
|
||||
- neighbors(source_id, kinds=[...], scope=['note','chunk'], depth=1)
|
||||
- walk_bfs(source_id, kinds, max_depth)
|
||||
- context_for_note(note_id, max_neighbors): heuristische Kontextsammlung
|
||||
|
||||
Hinweise
|
||||
--------
|
||||
- Nutzt die bestehenden Collections <prefix>_notes/_chunks/_edges.
|
||||
- Edges werden über Payload-Felder (`kind`, `source_id`, `target_id`) abgefragt.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
from typing import List, Dict, Any, Optional, Iterable, Set, Tuple
|
||||
from qdrant_client.http import models as rest
|
||||
from app.core.qdrant import QdrantConfig, get_client
|
||||
|
||||
def _cols(prefix: str):
|
||||
return f"{prefix}_notes", f"{prefix}_chunks", f"{prefix}_edges"
|
||||
|
||||
class GraphService:
    """Lightweight graph layer over the Qdrant collections of one prefix.

    Works on ``<prefix>_notes``, ``<prefix>_chunks`` and ``<prefix>_edges``
    and only issues scroll requests. Edges are found through their payload
    fields ``kind``, ``source_id`` and ``target_id``.
    """

    def __init__(self, cfg: "Optional[QdrantConfig]" = None, prefix: Optional[str] = None):
        """Create the service.

        Args:
            cfg: Qdrant configuration; read from the environment when omitted.
            prefix: Optional collection prefix that overrides the one in *cfg*.
        """
        import copy  # local import so the block needs no new file-level import

        effective = cfg or QdrantConfig.from_env()
        if prefix:
            if cfg is not None:
                # Do not re-point a config object the caller still holds.
                # NOTE(review): assumes QdrantConfig is shallow-copyable - confirm.
                effective = copy.copy(effective)
            effective.prefix = prefix
        self.cfg = effective
        self.client = get_client(self.cfg)
        self.notes_col, self.chunks_col, self.edges_col = _cols(self.cfg.prefix)

    # ------------------------ fetch helpers ------------------------
    def _scroll(self, col: str, flt: "Optional[rest.Filter]" = None, limit: int = 256):
        """Collect all points of *col* matching *flt*, paging *limit* points at a time."""
        out = []
        nextp = None
        while True:
            pts, nextp = self.client.scroll(
                collection_name=col,
                with_payload=True,
                with_vectors=False,
                limit=limit,
                offset=nextp,
                scroll_filter=flt,
            )
            if not pts:
                break
            out.extend(pts)
            if nextp is None:
                # No further page offset returned -> done.
                break
        return out

    @staticmethod
    def _chunk_sort_key(payload: Optional[Dict[str, Any]]) -> Tuple[int, int, int]:
        """Sort key ``(seq, chunk_index, numeric suffix after '#' in chunk_id)``.

        Missing or non-numeric parts count as 0 so a single odd payload does
        not abort the whole sort.
        """
        data = payload or {}

        def _as_int(raw) -> int:
            try:
                return int(raw or 0)
            except (TypeError, ValueError):
                return 0

        suffix = 0
        cid = data.get("chunk_id") or ""
        if isinstance(cid, str) and "#" in cid:
            suffix = _as_int(cid.rsplit("#", 1)[-1])
        return (_as_int(data.get("seq")), _as_int(data.get("chunk_index")), suffix)

    def _edge_payloads(self, field: str, node_id: str, kinds: Optional[List[str]]) -> List[Dict[str, Any]]:
        """Payloads of all edges whose *field* equals *node_id*, optionally limited to *kinds*."""
        must = [rest.FieldCondition(key=field, match=rest.MatchValue(value=node_id))]
        if kinds:
            must.append(rest.FieldCondition(key="kind", match=rest.MatchAny(any=kinds)))
        edges = self._scroll(self.edges_col, rest.Filter(must=must))
        return [e.payload or {} for e in edges]

    # ------------------------ public API ---------------------------
    def get_note(self, note_id: str) -> Optional[Dict[str, Any]]:
        """Return the payload of the note with *note_id*, or ``None`` if absent or empty."""
        f = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))])
        pts, _ = self.client.scroll(self.notes_col, with_payload=True, with_vectors=False, limit=1, scroll_filter=f)
        return (pts[0].payload or None) if pts else None

    def get_chunks(self, note_id: str) -> List[Dict[str, Any]]:
        """Return the chunk payloads of a note, ordered by ``_chunk_sort_key``."""
        f = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))])
        pts = self._scroll(self.chunks_col, f)
        pts_sorted = sorted(pts, key=lambda pt: self._chunk_sort_key(pt.payload))
        return [p.payload or {} for p in pts_sorted]

    def neighbors(self, source_id: str, kinds: Optional[Iterable[str]] = None,
                  scope: Optional[Iterable[str]] = None, depth: int = 1) -> Dict[str, List[Dict[str, Any]]]:
        """Return the direct outgoing and incoming edges of *source_id*.

        Args:
            source_id: Id matched against ``source_id`` (outgoing) and
                ``target_id`` (incoming) of the edges.
            kinds: Edge kinds to keep; all kinds when empty or omitted.
            scope: Accepted for interface compatibility; not evaluated here.
            depth: Accepted for interface compatibility; only direct edges
                are returned.

        Returns:
            ``{"out": [...], "in": [...]}`` with the edge payloads.
        """
        kind_list = list(kinds) if kinds else None
        return {
            "out": self._edge_payloads("source_id", source_id, kind_list),
            "in": self._edge_payloads("target_id", source_id, kind_list),
        }

    def walk_bfs(self, source_id: str, kinds: Iterable[str], max_depth: int = 2) -> Set[str]:
        """Breadth-first walk along outgoing edges; returns all ids reached, including the start."""
        visited: Set[str] = {source_id}
        frontier: Set[str] = {source_id}
        kinds = list(kinds)
        for _ in range(max_depth):
            nxt: Set[str] = set()
            for s in frontier:
                for e in self.neighbors(s, kinds=kinds)["out"]:
                    t = e.get("target_id")
                    if isinstance(t, str) and t not in visited:
                        visited.add(t)
                        nxt.add(t)
            frontier = nxt
            if not frontier:
                break
        return visited

    def context_for_note(self, note_id: str, kinds: Iterable[str] = ("references","backlink"), max_neighbors: int = 12) -> Dict[str, Any]:
        """Heuristic context: the note, its chunks and its de-duplicated neighbor notes.

        Neighbors are the targets of outgoing edges followed by the sources
        of incoming edges, first occurrence wins, cut to *max_neighbors*.
        """
        note = self.get_note(note_id) or {}
        chunks = self.get_chunks(note_id)
        neigh = self.neighbors(note_id, kinds=list(kinds))

        candidates = [e.get("target_id") for e in neigh["out"]]
        candidates += [e.get("source_id") for e in neigh["in"]]
        # dict.fromkeys keeps first-seen order while dropping duplicates.
        uniq = list(dict.fromkeys(c for c in candidates if isinstance(c, str)))[:max_neighbors]

        neighbor_notes = [self.get_note(t) for t in uniq]
        return {
            "note": note,
            "chunks": chunks,
            "neighbors": [n for n in neighbor_notes if n],
            "edges_out": neigh["out"],
            "edges_in": neigh["in"],
        }
|
||||
|
||||
# Optional: Mini-CLI
|
||||
if __name__ == "__main__":  # pragma: no cover
    # Mini CLI: print either the neighbors or the full context of one note as JSON.
    import argparse
    import json

    parser = argparse.ArgumentParser()
    parser.add_argument("--prefix", help="Collection-Prefix (überschreibt ENV)")
    parser.add_argument("--note-id", required=True)
    parser.add_argument("--neighbors", action="store_true", help="Nur Nachbarn anzeigen")
    cli = parser.parse_args()

    service = GraphService(prefix=cli.prefix)
    result = (
        service.neighbors(cli.note_id, kinds=["references","backlink","prev","next","belongs_to"])
        if cli.neighbors
        else service.context_for_note(cli.note_id)
    )
    print(json.dumps(result, ensure_ascii=False, indent=2))
|
||||
16
app/main.py
16
app/main.py
|
|
@ -1,11 +1,17 @@
|
|||
"""
|
||||
app/main.py — mindnet API bootstrap
|
||||
FILE: app/main.py
|
||||
DESCRIPTION: Bootstrap der FastAPI Anwendung. Inkludiert Router und Middleware.
|
||||
VERSION: 0.6.0
|
||||
STATUS: Active
|
||||
DEPENDENCIES: app.config, app.routers.* (embed, qdrant, query, graph, tools, feedback, chat, ingest, admin)
|
||||
LAST_ANALYSIS: 2025-12-15
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
from fastapi import FastAPI
|
||||
from .config import get_settings
|
||||
from .routers.embed_router import router as embed_router
|
||||
from .routers.qdrant_router import router as qdrant_router
|
||||
#from .routers.embed_router import router as embed_router
|
||||
#from .routers.qdrant_router import router as qdrant_router
|
||||
|
||||
from .routers.query import router as query_router
|
||||
from .routers.graph import router as graph_router
|
||||
|
|
@ -29,8 +35,8 @@ def create_app() -> FastAPI:
|
|||
def healthz():
|
||||
return {"status": "ok", "qdrant": s.QDRANT_URL, "prefix": s.COLLECTION_PREFIX}
|
||||
|
||||
app.include_router(embed_router)
|
||||
app.include_router(qdrant_router)
|
||||
# app.include_router(embed_router)
|
||||
# app.include_router(qdrant_router)
|
||||
|
||||
app.include_router(query_router, prefix="/query", tags=["query"])
|
||||
app.include_router(graph_router, prefix="/graph", tags=["graph"])
|
||||
|
|
|
|||
|
|
@ -1,14 +1,10 @@
|
|||
"""
|
||||
app/models/dto.py — Pydantic-Modelle (DTOs) für WP-04/WP-05/WP-06
|
||||
|
||||
Zweck:
|
||||
Laufzeit-Modelle für FastAPI (Requests/Responses).
|
||||
WP-06 Update: Intent & Intent-Source in ChatResponse.
|
||||
|
||||
Version:
|
||||
0.6.2 (WP-06: Decision Engine Transparency, Erweiterung des Feedback Request)
|
||||
Stand:
|
||||
2025-12-09
|
||||
FILE: app/models/dto.py
|
||||
DESCRIPTION: Pydantic-Modelle (DTOs) für Request/Response Bodies. Definiert das API-Schema.
|
||||
VERSION: 0.6.2
|
||||
STATUS: Active
|
||||
DEPENDENCIES: pydantic, typing, uuid
|
||||
LAST_ANALYSIS: 2025-12-15
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
|
|
|||
|
|
@ -1,20 +1,10 @@
|
|||
"""
|
||||
app/routers/admin.py — Admin-/Monitoring-Endpunkte (optional)
|
||||
|
||||
Zweck:
|
||||
Liefert einfache Kennzahlen zu Collections (Counts) und Config.
|
||||
Kompatibilität:
|
||||
Python 3.12+, FastAPI 0.110+, qdrant-client 1.x
|
||||
Version:
|
||||
0.1.0 (Erstanlage)
|
||||
Stand:
|
||||
2025-10-07
|
||||
Bezug:
|
||||
- Qdrant Collections: *_notes, *_chunks, *_edges
|
||||
Nutzung:
|
||||
app.include_router(admin.router, prefix="/admin", tags=["admin"])
|
||||
Änderungsverlauf:
|
||||
0.1.0 (2025-10-07) – Erstanlage.
|
||||
FILE: app/routers/admin.py
|
||||
DESCRIPTION: Monitoring-Endpunkt. Zeigt Qdrant-Collection-Counts und geladene Config.
|
||||
VERSION: 0.1.0
|
||||
STATUS: Active (Optional)
|
||||
DEPENDENCIES: qdrant_client, app.config
|
||||
LAST_ANALYSIS: 2025-12-15
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
|
|
|||
|
|
@ -1,6 +1,11 @@
|
|||
"""
|
||||
app/routers/chat.py — RAG Endpunkt
|
||||
Version: 2.5.0 (Fix: Question Detection protects against False-Positive Interviews)
|
||||
FILE: app/routers/chat.py
|
||||
DESCRIPTION: Haupt-Chat-Interface (RAG & Interview). Enthält Intent-Router (Keywords/LLM) und Prompt-Construction.
|
||||
VERSION: 2.5.0
|
||||
STATUS: Active
|
||||
DEPENDENCIES: app.config, app.models.dto, app.services.llm_service, app.core.retriever, app.services.feedback_service
|
||||
EXTERNAL_CONFIG: config/decision_engine.yaml, config/types.yaml
|
||||
LAST_ANALYSIS: 2025-12-15
|
||||
"""
|
||||
|
||||
from fastapi import APIRouter, HTTPException, Depends
|
||||
|
|
|
|||
|
|
@ -1,5 +1,10 @@
|
|||
"""
|
||||
Version 0.1
|
||||
FILE: app/routers/embed_router.py
|
||||
DESCRIPTION: Exponiert die lokale Embedding-Funktion als API-Endpunkt.
|
||||
VERSION: 0.1.0
|
||||
STATUS: Active
|
||||
DEPENDENCIES: app.embeddings, pydantic
|
||||
LAST_ANALYSIS: 2025-12-15
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
|
|
|||
|
|
@ -1,6 +1,10 @@
|
|||
"""
|
||||
app/routers/feedback.py
|
||||
Endpunkt für User-Feedback (WP-04c).
|
||||
FILE: app/routers/feedback.py
|
||||
DESCRIPTION: Endpunkt für explizites User-Feedback (WP-04c).
|
||||
VERSION: 0.1.0
|
||||
STATUS: Active
|
||||
DEPENDENCIES: app.models.dto, app.services.feedback_service
|
||||
LAST_ANALYSIS: 2025-12-15
|
||||
"""
|
||||
from fastapi import APIRouter, HTTPException
|
||||
from app.models.dto import FeedbackRequest
|
||||
|
|
|
|||
|
|
@ -1,21 +1,10 @@
|
|||
"""
|
||||
app/routers/graph.py — Graph-Endpunkte (WP-04)
|
||||
|
||||
Zweck:
|
||||
Liefert die Nachbarschaft einer Note/ID als JSON-Graph (Nodes/Edges/Stats).
|
||||
Kompatibilität:
|
||||
Python 3.12+, FastAPI 0.110+, qdrant-client 1.x
|
||||
Version:
|
||||
0.1.0 (Erstanlage)
|
||||
Stand:
|
||||
2025-10-07
|
||||
Bezug:
|
||||
- app/core/graph_adapter.py
|
||||
- app/models/dto.py
|
||||
Nutzung:
|
||||
app.include_router(graph.router, prefix="/graph", tags=["graph"])
|
||||
Änderungsverlauf:
|
||||
0.1.0 (2025-10-07) – Erstanlage.
|
||||
FILE: app/routers/graph.py
|
||||
DESCRIPTION: Liefert Graph-Daten (Knoten/Kanten) für UI-Visualisierungen basierend auf einer Seed-ID. (WP4)
|
||||
VERSION: 0.1.0
|
||||
STATUS: Active
|
||||
DEPENDENCIES: qdrant_client, app.models.dto, app.core.graph_adapter, app.config
|
||||
LAST_ANALYSIS: 2025-12-15
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
|
|
|||
|
|
@ -1,12 +1,17 @@
|
|||
"""
|
||||
app/routers/ingest.py
|
||||
API-Endpunkte für WP-11 (Discovery & Persistence).
|
||||
Delegiert an Services.
|
||||
FILE: app/routers/ingest.py
|
||||
DESCRIPTION: Endpunkte für WP-11. Nimmt Markdown entgegen.
|
||||
Refactored für WP-14: Nutzt BackgroundTasks für non-blocking Save.
|
||||
VERSION: 0.7.0 (Fix: Timeout WP-14)
|
||||
STATUS: Active
|
||||
DEPENDENCIES: app.core.ingestion, app.services.discovery, fastapi, pydantic
|
||||
"""
|
||||
|
||||
import os
|
||||
import time
|
||||
import logging
|
||||
from fastapi import APIRouter, HTTPException
|
||||
import asyncio
|
||||
from fastapi import APIRouter, HTTPException, BackgroundTasks
|
||||
from pydantic import BaseModel
|
||||
from typing import Optional, Dict, Any
|
||||
|
||||
|
|
@ -16,7 +21,7 @@ from app.services.discovery import DiscoveryService
|
|||
logger = logging.getLogger(__name__)
|
||||
router = APIRouter()
|
||||
|
||||
# Services Init (Global oder via Dependency Injection)
|
||||
# Services Init
|
||||
discovery_service = DiscoveryService()
|
||||
|
||||
class AnalyzeRequest(BaseModel):
|
||||
|
|
@ -32,7 +37,32 @@ class SaveResponse(BaseModel):
|
|||
status: str
|
||||
file_path: str
|
||||
note_id: str
|
||||
stats: Dict[str, Any]
|
||||
message: str # Neu für UX Feedback
|
||||
stats: Dict[str, Any] # Kann leer sein bei async processing
|
||||
|
||||
# --- Background Task Wrapper ---
|
||||
async def run_ingestion_task(markdown_content: str, filename: str, vault_root: str, folder: str):
    """
    Run the ingestion in the background so the HTTP request is not blocked.

    Creates a fresh IngestionService, ingests the given markdown and logs the
    outcome. Never raises: every failure is logged instead, because there is
    no caller left to receive an exception once the response has been sent.
    """
    logger.info(f"🔄 Background Task started: Ingesting {filename}...")
    try:
        outcome = await IngestionService().create_from_text(
            markdown_content=markdown_content,
            filename=filename,
            vault_root=vault_root,
            folder=folder
        )
        # Hook point: notification services (e.g. websockets) could be triggered here later.
        succeeded = outcome.get("status") != "error"
        if succeeded:
            logger.info(f"✅ Background Task finished: {filename} ({outcome.get('chunks_count')} Chunks)")
        else:
            logger.error(f"❌ Background Ingestion Error for {filename}: {outcome.get('error')}")
    except Exception as exc:
        logger.error(f"❌ Critical Background Task Failure: {exc}", exc_info=True)
|
||||
|
||||
|
||||
@router.post("/analyze")
|
||||
async def analyze_draft(req: AnalyzeRequest):
|
||||
|
|
@ -40,7 +70,6 @@ async def analyze_draft(req: AnalyzeRequest):
|
|||
WP-11 Intelligence: Liefert Link-Vorschläge via DiscoveryService.
|
||||
"""
|
||||
try:
|
||||
# Hier rufen wir jetzt den verbesserten Service auf
|
||||
result = await discovery_service.analyze_draft(req.text, req.type)
|
||||
return result
|
||||
except Exception as e:
|
||||
|
|
@ -48,9 +77,10 @@ async def analyze_draft(req: AnalyzeRequest):
|
|||
return {"suggestions": [], "error": str(e)}
|
||||
|
||||
@router.post("/save", response_model=SaveResponse)
|
||||
async def save_note(req: SaveRequest):
|
||||
async def save_note(req: SaveRequest, background_tasks: BackgroundTasks):
|
||||
"""
|
||||
WP-11 Persistence: Speichert und indiziert.
|
||||
WP-14 Fix: Startet Ingestion im Hintergrund (Fire & Forget).
|
||||
Verhindert Timeouts bei aktiver Smart-Edge-Allocation (WP-15).
|
||||
"""
|
||||
try:
|
||||
vault_root = os.getenv("MINDNET_VAULT_ROOT", "./vault")
|
||||
|
|
@ -61,29 +91,31 @@ async def save_note(req: SaveRequest):
|
|||
except: pass
|
||||
|
||||
final_filename = req.filename or f"draft_{int(time.time())}.md"
|
||||
ingest_service = IngestionService()
|
||||
|
||||
# Async Call
|
||||
result = await ingest_service.create_from_text(
|
||||
# Wir geben sofort eine ID zurück (optimistisch),
|
||||
# auch wenn die echte ID erst nach dem Parsing feststeht.
|
||||
# Für UI-Feedback nutzen wir den Filename.
|
||||
|
||||
# Task in die Queue schieben
|
||||
background_tasks.add_task(
|
||||
run_ingestion_task,
|
||||
markdown_content=req.markdown_content,
|
||||
filename=final_filename,
|
||||
vault_root=abs_vault_root,
|
||||
folder=req.folder
|
||||
)
|
||||
|
||||
if result.get("status") == "error":
|
||||
raise HTTPException(status_code=500, detail=result.get("error"))
|
||||
|
||||
return SaveResponse(
|
||||
status="success",
|
||||
file_path=result.get("path", "unknown"),
|
||||
note_id=result.get("note_id", "unknown"),
|
||||
status="queued",
|
||||
file_path=os.path.join(req.folder, final_filename),
|
||||
note_id="pending",
|
||||
message="Speicherung & KI-Analyse im Hintergrund gestartet.",
|
||||
stats={
|
||||
"chunks": result.get("chunks_count", 0),
|
||||
"edges": result.get("edges_count", 0)
|
||||
"chunks": -1, # Indikator für Async
|
||||
"edges": -1
|
||||
}
|
||||
)
|
||||
except HTTPException as he: raise he
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Save failed: {e}", exc_info=True)
|
||||
raise HTTPException(status_code=500, detail=f"Save failed: {str(e)}")
|
||||
logger.error(f"Save dispatch failed: {e}", exc_info=True)
|
||||
raise HTTPException(status_code=500, detail=f"Save dispatch failed: {str(e)}")
|
||||
|
|
@ -1,160 +0,0 @@
|
|||
"""
|
||||
Version 0.1
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, Optional, List
|
||||
import uuid
|
||||
|
||||
from fastapi import APIRouter
|
||||
from pydantic import BaseModel, Field
|
||||
from qdrant_client import QdrantClient
|
||||
from qdrant_client.http.models import (
|
||||
Distance,
|
||||
VectorParams,
|
||||
PointStruct,
|
||||
Filter,
|
||||
FieldCondition,
|
||||
MatchValue,
|
||||
)
|
||||
|
||||
from ..config import get_settings
|
||||
from ..embeddings import embed_texts
|
||||
|
||||
router = APIRouter(prefix="/qdrant", tags=["qdrant"])
|
||||
|
||||
def _client() -> QdrantClient:
    """Build a Qdrant client from the configured URL and API key."""
    settings = get_settings()
    return QdrantClient(api_key=settings.QDRANT_API_KEY, url=settings.QDRANT_URL)
|
||||
|
||||
def _col(name: str) -> str:
    """Return the full collection name: the configured prefix, an underscore, then ``name``."""
    prefix = get_settings().COLLECTION_PREFIX
    return f"{prefix}_{name}"
|
||||
|
||||
def _uuid5(s: str) -> str:
    """Derive a deterministic UUIDv5 string from ``s``; used as the server-side point id."""
    derived = uuid.uuid5(uuid.NAMESPACE_URL, s)
    return str(derived)
|
||||
|
||||
# --- Models ---
|
||||
class BaseMeta(BaseModel):
    # Metadata shared by note and chunk upserts. Field order is kept as-is
    # because it determines the key order of the dumped payload.
    note_id: str = Field(default=..., description="Stable ID of the note (e.g., hash of vault-relative path)")
    title: Optional[str] = Field(default=None, description="Note or chunk title")
    path: Optional[str] = Field(default=None, description="Vault-relative path to the .md file")
    Typ: Optional[str] = None
    Status: Optional[str] = None
    tags: Optional[List[str]] = None
    # A note may carry several roles, hence a list.
    Rolle: Optional[List[str]] = None
|
||||
|
||||
class UpsertChunkRequest(BaseMeta):
    # Request body for upserting one chunk; extends the shared metadata.
    chunk_id: str = Field(default=..., description="Stable ID of the chunk within the note")
    text: str = Field(default=..., description="Chunk text content")
    links: Optional[List[str]] = Field(None, description="Outbound links detected in the chunk")
|
||||
|
||||
class UpsertNoteRequest(BaseMeta):
    # Request body for upserting a whole note; the text itself is optional.
    text: Optional[str] = Field(default=None, description="Full note text (optional)")
|
||||
|
||||
class UpsertEdgeRequest(BaseModel):
    # Request body for one graph edge. Only the source note is mandatory;
    # destination and chunk-level endpoints are optional.
    src_note_id: str
    dst_note_id: Optional[str] = None
    src_chunk_id: Optional[str] = None
    dst_chunk_id: Optional[str] = None
    relation: str = "links_to"
    link_text: Optional[str] = None
|
||||
|
||||
class QueryRequest(BaseModel):
    # Request body for the vector query: free-text query, result limit
    # and optional payload filters.
    query: str
    limit: int = Field(default=5)
    note_id: Optional[str] = None
    path: Optional[str] = None
    tags: Optional[List[str]] = None
|
||||
|
||||
# --- Helpers ---
|
||||
def _ensure_collection(cli, collection: str, size: int) -> None:
    """Create ``collection`` with cosine vectors of ``size`` if it cannot be looked up; never drops data."""
    try:
        cli.get_collection(collection)
        return
    except Exception:
        # The lookup failed: the collection is missing OR the request itself failed
        # (timeout, auth, ...). From here we cannot tell which, so the follow-up
        # must be non-destructive.
        pass
    try:
        # create_collection refuses to touch an existing collection, unlike
        # recreate_collection, which deletes it first.
        cli.create_collection(
            collection_name=collection,
            vectors_config=VectorParams(size=size, distance=Distance.COSINE),
        )
    except Exception:
        # Creation failed, e.g. because a concurrent request created the collection
        # in the meantime. Accept that if it exists now; otherwise this lookup
        # raises and surfaces the real problem.
        cli.get_collection(collection)


def _ensure_collections():
    """
    Make sure the chunks, notes and edges collections exist.

    Chunks and notes use the configured vector size. Edges carry no embedding
    and get a dummy vector of size 1. Existing collections are left untouched.

    Raises:
        Whatever the Qdrant client raises when a collection can neither be
        found nor created.
    """
    s = get_settings()
    cli = _client()
    wanted = (
        ("chunks", s.VECTOR_SIZE),
        ("notes", s.VECTOR_SIZE),
        ("edges", 1),  # dummy vector of size 1
    )
    for suffix, size in wanted:
        _ensure_collection(cli, _col(suffix), size)
|
||||
|
||||
@router.post("/upsert_chunk", summary="Upsert a chunk into mindnet_chunks")
def upsert_chunk(req: UpsertChunkRequest) -> dict:
    # Embeds the chunk text and stores it under a deterministic point id.
    # The payload carries the metadata plus a short preview, not the full text.
    _ensure_collections()
    client = _client()
    embedding = embed_texts([req.text])[0]

    payload = {key: value for key, value in req.model_dump().items() if key != "text"}
    body = req.text
    # Previews longer than 240 characters are cut and marked with an ellipsis.
    payload["preview"] = body if len(body) <= 240 else body[:240] + "…"

    point_id = _uuid5(f"chunk:{req.chunk_id}")
    client.upsert(
        collection_name=_col("chunks"),
        points=[PointStruct(id=point_id, vector=embedding, payload=payload)],
    )
    return {"status": "ok", "id": point_id}
|
||||
|
||||
@router.post("/upsert_note", summary="Upsert a note into mindnet_notes")
def upsert_note(req: UpsertNoteRequest) -> dict:
    # Embeds the note and stores its metadata under a deterministic point id.
    _ensure_collections()
    client = _client()

    # Embed the full text if present; otherwise fall back to title, then note id.
    embedding_source = req.text or req.title or req.note_id
    embedding = embed_texts([embedding_source])[0]

    payload = {key: value for key, value in req.model_dump().items() if key != "text"}
    point_id = _uuid5(f"note:{req.note_id}")
    client.upsert(
        collection_name=_col("notes"),
        points=[PointStruct(id=point_id, vector=embedding, payload=payload)],
    )
    return {"status": "ok", "id": point_id}
|
||||
|
||||
@router.post("/upsert_edge", summary="Upsert a graph edge into mindnet_edges")
def upsert_edge(req: UpsertEdgeRequest) -> dict:
    # Stores an edge as a point with a dummy vector; the point id is derived
    # from both endpoints and the relation, so re-sending an edge overwrites it.
    _ensure_collections()
    client = _client()

    source_part = f"{req.src_note_id}|{req.src_chunk_id or ''}"
    target_part = f"{req.dst_note_id or ''}|{req.dst_chunk_id or ''}|{req.relation}"
    point_id = _uuid5(f"edge:{source_part}->{target_part}")

    edge_point = PointStruct(id=point_id, vector=[0.0], payload=req.model_dump())
    client.upsert(collection_name=_col("edges"), points=[edge_point])
    return {"status": "ok", "id": point_id}
|
||||
|
||||
@router.post("/query", summary="Vector query over mindnet_chunks with optional filters")
def query(req: QueryRequest) -> dict:
    # Embeds the query text and searches the chunks collection. note_id, path
    # and every given tag are combined as AND conditions.
    _ensure_collections()
    client = _client()
    query_vector = embed_texts([req.query])[0]

    conditions = []
    if req.note_id:
        conditions.append(FieldCondition(key="note_id", match=MatchValue(value=req.note_id)))
    if req.path:
        conditions.append(FieldCondition(key="path", match=MatchValue(value=req.path)))
    conditions.extend(
        FieldCondition(key="tags", match=MatchValue(value=tag)) for tag in (req.tags or [])
    )
    # No conditions means an unfiltered search.
    query_filter = Filter(must=conditions) if conditions else None

    found = client.search(
        collection_name=_col("chunks"),
        query_vector=query_vector,
        limit=req.limit,
        with_payload=True,
        with_vectors=False,
        query_filter=query_filter,
    )

    def _as_hit(point) -> dict:
        # Flatten one scored point into the response shape.
        data = point.payload or {}
        return {
            "chunk_id": point.id,
            "score": point.score,
            "note_id": data.get("note_id"),
            "title": data.get("title"),
            "path": data.get("path"),
            "preview": data.get("preview"),
            "tags": data.get("tags"),
        }

    return {"results": [_as_hit(point) for point in found]}
|
||||
|
|
@ -1,23 +1,12 @@
|
|||
"""
|
||||
app/routers/query.py — Query-Endpunkte (WP-04)
|
||||
FILE: app/routers/query.py
|
||||
DESCRIPTION: Klassische Such-Endpunkte (Semantic & Hybrid). Initiiert asynchrones Feedback-Logging und ruft den richtigen Retriever-Modus auf.
|
||||
VERSION: 0.2.0
|
||||
STATUS: Active
|
||||
DEPENDENCIES: app.models.dto, app.core.retriever, app.services.feedback_service
|
||||
LAST_ANALYSIS: 2025-12-15
|
||||
"""
|
||||
|
||||
Zweck:
|
||||
Stellt POST /query bereit und ruft den passenden Retriever-Modus auf.
|
||||
Kompatibilität:
|
||||
Python 3.12+, FastAPI 0.110+
|
||||
Version:
|
||||
0.1.0 (Erstanlage)
|
||||
Stand:
|
||||
2025-10-07
|
||||
Bezug:
|
||||
- app/core/retriever.py
|
||||
- app/models/dto.py
|
||||
Nutzung:
|
||||
app.include_router(query.router, prefix="/query", tags=["query"])
|
||||
Änderungsverlauf:
|
||||
0.2.0 (2025-12-07) - Update für WP04c Feedback
|
||||
0.1.0 (2025-10-07) – Erstanlage.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
from fastapi import APIRouter, HTTPException, BackgroundTasks
|
||||
from app.models.dto import QueryRequest, QueryResponse
|
||||
|
|
|
|||
|
|
@ -1,21 +1,10 @@
|
|||
"""
|
||||
app/routers/tools.py — Tool-Definitionen für Ollama/n8n/MCP (read-only)
|
||||
|
||||
Zweck:
|
||||
Liefert Funktions-Schemas (OpenAI-/Ollama-kompatibles Tool-JSON) für:
|
||||
- mindnet_query -> POST /query
|
||||
- mindnet_subgraph -> GET /graph/{note_id}
|
||||
Kompatibilität:
|
||||
Python 3.12+, FastAPI 0.110+
|
||||
Version:
|
||||
0.1.1 (query ODER query_vector möglich)
|
||||
Stand:
|
||||
2025-10-07
|
||||
Nutzung:
|
||||
app.include_router(tools.router, prefix="/tools", tags=["tools"])
|
||||
Änderungsverlauf:
|
||||
0.1.1 (2025-10-07) – mindnet_query: oneOf(query, query_vector).
|
||||
0.1.0 (2025-10-07) – Erstanlage.
|
||||
FILE: app/routers/tools.py
|
||||
DESCRIPTION: Liefert JSON-Schemas für die Integration als 'Tools' in Agents (Ollama/OpenAI). Read-Only.
|
||||
VERSION: 0.1.1
|
||||
STATUS: Active
|
||||
DEPENDENCIES: fastapi
|
||||
LAST_ANALYSIS: 2025-12-15
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
|
|
|||
|
|
@ -1,12 +1,11 @@
|
|||
"""
|
||||
app/services/discovery.py
|
||||
Service für Link-Vorschläge und Knowledge-Discovery (WP-11).
|
||||
|
||||
Features:
|
||||
- Sliding Window Analyse für lange Texte.
|
||||
- Footer-Scan für Projekt-Referenzen.
|
||||
- 'Matrix-Logic' für intelligente Kanten-Typen (Experience -> Value = based_on).
|
||||
- Async & Nomic-Embeddings kompatibel.
|
||||
FILE: app/services/discovery.py
|
||||
DESCRIPTION: Service für WP-11. Analysiert Texte, findet Entitäten und schlägt typisierte Verbindungen vor ("Matrix-Logic").
|
||||
VERSION: 0.6.0
|
||||
STATUS: Active
|
||||
DEPENDENCIES: app.core.qdrant, app.models.dto, app.core.retriever
|
||||
EXTERNAL_CONFIG: config/types.yaml
|
||||
LAST_ANALYSIS: 2025-12-15
|
||||
"""
|
||||
import logging
|
||||
import asyncio
|
||||
|
|
|
|||
|
|
@ -1,12 +1,10 @@
|
|||
"""
|
||||
app/services/embeddings_client.py — Text→Embedding Service
|
||||
|
||||
Zweck:
|
||||
Einheitlicher Client für Embeddings via Ollama (Nomic).
|
||||
Stellt sicher, dass sowohl Async (Ingestion) als auch Sync (Retriever)
|
||||
denselben Vektorraum (768 Dim) nutzen.
|
||||
|
||||
Version: 2.5.0 (Unified Ollama)
|
||||
FILE: app/services/embeddings_client.py
|
||||
DESCRIPTION: Unified Embedding Client. Nutzt Ollama API (HTTP). Ersetzt lokale sentence-transformers.
|
||||
VERSION: 2.5.0
|
||||
STATUS: Active
|
||||
DEPENDENCIES: httpx, requests, app.config
|
||||
LAST_ANALYSIS: 2025-12-15
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import os
|
||||
|
|
|
|||
|
|
@ -1,9 +1,10 @@
|
|||
"""
|
||||
app/services/feedback_service.py
|
||||
Service zum Loggen von Suchanfragen und Feedback (WP-04c).
|
||||
Speichert Daten als JSONL für späteres Self-Tuning (WP-08).
|
||||
|
||||
Version: 1.1 (Chat-Support)
|
||||
FILE: app/services/feedback_service.py
|
||||
DESCRIPTION: Schreibt Search- und Feedback-Logs in JSONL-Dateien.
|
||||
VERSION: 1.1
|
||||
STATUS: Active
|
||||
DEPENDENCIES: app.models.dto
|
||||
LAST_ANALYSIS: 2025-12-15
|
||||
"""
|
||||
import json
|
||||
import os
|
||||
|
|
|
|||
|
|
@ -1,88 +0,0 @@
|
|||
"""
|
||||
app/services/llm_ollama.py — Ollama-Integration & Prompt-Bau (WP-04)
|
||||
|
||||
Zweck:
|
||||
Prompt-Template & (optionaler) lokaler Aufruf von Ollama. Der Aufruf ist
|
||||
bewusst gekapselt und kann gefahrlos deaktiviert bleiben, bis ihr ein
|
||||
konkretes Modell konfigurieren wollt.
|
||||
Kompatibilität:
|
||||
Python 3.12+
|
||||
Version:
|
||||
0.1.0 (Erstanlage)
|
||||
Stand:
|
||||
2025-10-07
|
||||
Bezug:
|
||||
WP-04/05 Kontextbereitstellung für LLM
|
||||
Nutzung:
|
||||
from app.services.llm_ollama import build_prompt, call_ollama
|
||||
Änderungsverlauf:
|
||||
0.1.0 (2025-10-07) – Erstanlage.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import List, Dict, Optional
|
||||
import subprocess
|
||||
import json
|
||||
|
||||
PROMPT_TEMPLATE = """System: You are a helpful expert.
|
||||
User: {question}
|
||||
|
||||
Context (ranked):
|
||||
{contexts}
|
||||
|
||||
Task: Answer precisely. At the end, list sources (note title + section) and important edge paths.
|
||||
"""
|
||||
|
||||
|
||||
def build_context_block(items: List[Dict]) -> str:
    """
    Format the top-k contexts (chunks) for the prompt.

    Each item becomes ``"<rank>) <note> — <section> [score=<x.xx>]\\n<text>\\n"``;
    entries are joined with a newline. Fallbacks per item:
      - note: ``note_title``, else ``note_id``
      - section: ``section``, else ``section_title``
      - text: ``text``, else ``body``
    Missing or ``None`` values render as empty strings. A missing, ``None`` or
    non-numeric score renders as ``0.00`` instead of raising.

    Args:
        items: Ranked context dicts.

    Returns:
        The formatted block; an empty string for an empty list.
    """
    blocks = []
    for rank, item in enumerate(items, 1):
        note = item.get("note_title") or item.get("note_id")
        section = item.get("section") or item.get("section_title")
        text = item.get("text") or item.get("body") or ""
        # An explicit None would otherwise show up as the literal "None" in the prompt.
        if note is None:
            note = ""
        if section is None:
            section = ""
        try:
            score = float(item.get("score") or 0)
        except (TypeError, ValueError):
            # An unusable score must not break prompt construction.
            score = 0.0
        blocks.append(f"{rank}) {note} — {section} [score={score:.2f}]\n{text}\n")
    return "\n".join(blocks)
|
||||
|
||||
|
||||
def build_prompt(question: str, contexts: List[Dict]) -> str:
    """Fill the shared prompt template with the question and the formatted contexts."""
    rendered_contexts = build_context_block(contexts)
    return PROMPT_TEMPLATE.format(contexts=rendered_contexts, question=question)
|
||||
|
||||
|
||||
def _collect_ollama_text(raw: str) -> str:
    """Extract the generated text from raw ``ollama run`` output, whether JSON lines or plain text."""
    pieces = []
    saw_json_response = False
    for line in raw.splitlines():
        line = line.strip()
        if not line:
            continue
        try:
            obj = json.loads(line)
        except ValueError:
            # Not JSON: keep the line as plain text.
            pieces.append(line)
            continue
        if isinstance(obj, dict):
            response = obj.get("response")
            if isinstance(response, str):
                pieces.append(response)
                saw_json_response = True
            # Dicts without a "response" (e.g. status records) carry no text.
        else:
            # A bare JSON scalar or list is ordinary text that happens to parse.
            pieces.append(line)
    if saw_json_response:
        # Streamed fragments belong together without a separator.
        return "".join(pieces).strip()
    # Plain-text output: return it unchanged so line breaks survive.
    return raw.strip()


def call_ollama(prompt: str, model: str = "llama3.1:8b", timeout_s: int = 120) -> Optional[str]:
    """
    Optional local call of ``ollama run``.

    Only use this when Ollama is installed and configured locally.

    Args:
        prompt: Prompt text, sent to the process on stdin as UTF-8.
        model: Model name passed to ``ollama run``.
        timeout_s: Seconds to wait before the call is abandoned.

    Returns:
        The generated text, or None on any failure: binary missing, timeout,
        or a non-zero exit code.
    """
    try:
        proc = subprocess.run(
            ["ollama", "run", model],
            input=prompt.encode("utf-8"),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            timeout=timeout_s,
            check=False,
        )
    except Exception:
        # Deliberate best-effort: this optional helper never raises to its caller.
        return None
    if proc.returncode != 0:
        # A failed run must not be mistaken for an empty or partial answer.
        return None
    return _collect_ollama_text(proc.stdout.decode("utf-8", errors="replace"))
|
||||
|
|
@ -1,6 +1,11 @@
|
|||
"""
|
||||
app/services/llm_service.py — LLM Client
|
||||
Version: 2.8.0 (Configurable Concurrency Limit)
|
||||
FILE: app/services/llm_service.py
|
||||
DESCRIPTION: Asynchroner Client für Ollama. Verwaltet Prompts und Background-Last (Semaphore).
|
||||
VERSION: 2.8.0
|
||||
STATUS: Active
|
||||
DEPENDENCIES: httpx, yaml, asyncio, app.config
|
||||
EXTERNAL_CONFIG: config/prompts.yaml
|
||||
LAST_ANALYSIS: 2025-12-15
|
||||
"""
|
||||
|
||||
import httpx
|
||||
|
|
|
|||
|
|
@ -1,6 +1,10 @@
|
|||
"""
|
||||
app/services/semantic_analyzer.py — Edge Validation & Filtering
|
||||
Version: 2.0 (Update: Background Priority for Batch Jobs)
|
||||
FILE: app/services/semantic_analyzer.py
|
||||
DESCRIPTION: KI-gestützte Kanten-Validierung. Nutzt LLM (Background-Priority), um Kanten präzise einem Chunk zuzuordnen.
|
||||
VERSION: 2.1.0 (Fix: Strict Edge String Validation against LLM Hallucinations)
|
||||
STATUS: Active
|
||||
DEPENDENCIES: app.services.llm_service, json, logging
|
||||
LAST_ANALYSIS: 2025-12-16
|
||||
"""
|
||||
|
||||
import json
|
||||
|
|
@ -17,6 +21,34 @@ class SemanticAnalyzer:
|
|||
def __init__(self):
|
||||
self.llm = LLMService()
|
||||
|
||||
def _is_valid_edge_string(self, edge_str: str) -> bool:
    """
    Check whether a string is a plausible edge in the form 'kind:target'.

    Keeps LLM chatter ("Here is the list: ...") from slipping through as an
    edge. Only the first ':' separates kind from target, so a target may
    itself contain colons.

    Args:
        edge_str: Candidate edge string; non-strings are rejected.

    Returns:
        True only if the kind (before the first ':', stripped) contains no
        whitespace and is 2 to 40 characters long, and the target (after the
        first ':', stripped) is non-empty.
    """
    if not isinstance(edge_str, str):
        return False

    kind, separator, target = edge_str.partition(":")
    if not separator:
        return False
    kind = kind.strip()
    target = target.strip()

    # Rule 1: a relation type is a single token such as "derived_from".
    # Any inner whitespace (space, tab, newline, ...) marks prose, not a kind.
    if any(ch.isspace() for ch in kind):
        return False

    # Rule 2: plausible length for the relation type.
    if not 2 <= len(kind) <= 40:
        return False

    # Rule 3: the target must not be empty.
    return bool(target)
|
||||
|
||||
async def assign_edges_to_chunk(self, chunk_text: str, all_edges: List[str], note_type: str) -> List[str]:
|
||||
"""
|
||||
Sendet einen Chunk und eine Liste potenzieller Kanten an das LLM.
|
||||
|
|
@ -55,14 +87,13 @@ class SemanticAnalyzer:
|
|||
)
|
||||
|
||||
try:
|
||||
# 4. LLM Call mit Traffic Control (NEU: priority="background")
|
||||
# Wir nutzen die "Slow Lane", damit der User im Chat nicht warten muss.
|
||||
# 4. LLM Call mit Traffic Control
|
||||
response_json = await self.llm.generate_raw_response(
|
||||
prompt=final_prompt,
|
||||
force_json=True,
|
||||
max_retries=5,
|
||||
base_delay=5.0,
|
||||
priority="background" # <--- WICHTIG: Drosselung aktivieren
|
||||
priority="background"
|
||||
)
|
||||
|
||||
# LOG: Raw Response Preview
|
||||
|
|
@ -87,30 +118,38 @@ class SemanticAnalyzer:
|
|||
valid_edges = []
|
||||
|
||||
# 6. Robuste Validierung (List vs Dict)
|
||||
# Wir sammeln erst alle Strings ein
|
||||
raw_candidates = []
|
||||
|
||||
if isinstance(data, list):
|
||||
# Standardfall: ["kind:target", ...]
|
||||
valid_edges = [str(e) for e in data if isinstance(e, str) and ":" in e]
|
||||
raw_candidates = data
|
||||
|
||||
elif isinstance(data, dict):
|
||||
# Abweichende Formate behandeln
|
||||
logger.info(f"ℹ️ [SemanticAnalyzer] LLM lieferte Dict statt Liste. Versuche Reparatur. Keys: {list(data.keys())}")
|
||||
|
||||
for key, val in data.items():
|
||||
# Fall A: {"edges": ["kind:target"]}
|
||||
if key.lower() in ["edges", "results", "kanten", "matches"] and isinstance(val, list):
|
||||
valid_edges.extend([str(e) for e in val if isinstance(e, str) and ":" in e])
|
||||
raw_candidates.extend(val)
|
||||
|
||||
# Fall B: {"kind": "target"}
|
||||
# Fall B: {"kind": "target"} (Beziehung als Key)
|
||||
elif isinstance(val, str):
|
||||
valid_edges.append(f"{key}:{val}")
|
||||
raw_candidates.append(f"{key}:{val}")
|
||||
|
||||
# Fall C: {"kind": ["target1", "target2"]}
|
||||
elif isinstance(val, list):
|
||||
for target in val:
|
||||
if isinstance(target, str):
|
||||
valid_edges.append(f"{key}:{target}")
|
||||
raw_candidates.append(f"{key}:{target}")
|
||||
|
||||
# Safety: Filtere nur Kanten, die halbwegs valide aussehen
|
||||
# 7. Strict Validation Loop
|
||||
for e in raw_candidates:
|
||||
e_str = str(e)
|
||||
if self._is_valid_edge_string(e_str):
|
||||
valid_edges.append(e_str)
|
||||
else:
|
||||
logger.debug(f" [SemanticAnalyzer] Invalid edge format rejected: '{e_str}'")
|
||||
|
||||
# Safety: Filtere nur Kanten, die halbwegs valide aussehen (Doppelcheck)
|
||||
final_result = [e for e in valid_edges if ":" in e]
|
||||
|
||||
# LOG: Ergebnis
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
version: 2.4.0 # Optimized for Async Intelligence & Hybrid Router
|
||||
version: 2.6.0 # Final WP-15 Config: Smart Edges & Strict/Soft Chunking
|
||||
|
||||
# ==============================================================================
|
||||
# 1. CHUNKING PROFILES
|
||||
|
|
@ -7,7 +7,6 @@ version: 2.4.0 # Optimized for Async Intelligence & Hybrid Router
|
|||
chunking_profiles:
|
||||
|
||||
# A. SHORT & FAST
|
||||
# Für Glossar, Tasks, Risiken. Kleine Schnipsel.
|
||||
sliding_short:
|
||||
strategy: sliding_window
|
||||
enable_smart_edge_allocation: false
|
||||
|
|
@ -16,7 +15,6 @@ chunking_profiles:
|
|||
overlap: [30, 50]
|
||||
|
||||
# B. STANDARD & FAST
|
||||
# Der "Traktor": Robust für Quellen, Journal, Daily Logs.
|
||||
sliding_standard:
|
||||
strategy: sliding_window
|
||||
enable_smart_edge_allocation: false
|
||||
|
|
@ -24,10 +22,8 @@ chunking_profiles:
|
|||
max: 650
|
||||
overlap: [50, 100]
|
||||
|
||||
# C. SMART FLOW (Performance-Safe Mode)
|
||||
# Für Konzepte, Projekte, Erfahrungen.
|
||||
# HINWEIS: 'enable_smart_edge_allocation' ist vorerst FALSE, um Ollama
|
||||
# bei der Generierung nicht zu überlasten. Später wieder aktivieren.
|
||||
# C. SMART FLOW (Text-Fluss)
|
||||
# Nutzt Sliding Window, aber mit LLM-Kanten-Analyse.
|
||||
sliding_smart_edges:
|
||||
strategy: sliding_window
|
||||
enable_smart_edge_allocation: true
|
||||
|
|
@ -35,12 +31,38 @@ chunking_profiles:
|
|||
max: 600
|
||||
overlap: [50, 80]
|
||||
|
||||
# D. SMART STRUCTURE
|
||||
# Für Profile, Werte, Prinzipien. Trennt hart an Überschriften (H2).
|
||||
# D. SMART STRUCTURE (Soft Split)
|
||||
# Trennt bevorzugt an H2, fasst aber kleine Abschnitte zusammen ("Soft Mode").
|
||||
structured_smart_edges:
|
||||
strategy: by_heading
|
||||
enable_smart_edge_allocation: true
|
||||
split_level: 2
|
||||
strict_heading_split: false
|
||||
max: 600
|
||||
target: 400
|
||||
overlap: [50, 80]
|
||||
|
||||
# E. SMART STRUCTURE STRICT (H2 Hard Split)
|
||||
# Trennt ZWINGEND an jeder H2.
|
||||
# Verhindert, dass "Vater" und "Partner" (Profile) oder Werte verschmelzen.
|
||||
structured_smart_edges_strict:
|
||||
strategy: by_heading
|
||||
enable_smart_edge_allocation: true
|
||||
split_level: 2
|
||||
strict_heading_split: true # Hard Mode
|
||||
max: 600
|
||||
target: 400
|
||||
overlap: [50, 80]
|
||||
|
||||
# F. SMART STRUCTURE DEEP (H3 Hard Split + Merge-Check)
|
||||
# Spezialfall für "Leitbild Prinzipien":
|
||||
# - Trennt H1, H2, H3 hart.
|
||||
# - Aber: Merged "leere" H2 (Tier 2) mit der folgenden H3 (MP1).
|
||||
structured_smart_edges_strict_L3:
|
||||
strategy: by_heading
|
||||
enable_smart_edge_allocation: true
|
||||
split_level: 3
|
||||
strict_heading_split: true
|
||||
max: 600
|
||||
target: 400
|
||||
overlap: [50, 80]
|
||||
|
|
@ -59,24 +81,13 @@ defaults:
|
|||
|
||||
types:
|
||||
|
||||
# --- KERNTYPEN (Hoch priorisiert & Smart) ---
|
||||
# --- KERNTYPEN ---
|
||||
|
||||
experience:
|
||||
chunking_profile: sliding_smart_edges
|
||||
retriever_weight: 0.90
|
||||
edge_defaults: ["derived_from", "references"]
|
||||
# Hybrid Classifier: Wenn diese Worte fallen, ist es eine Experience
|
||||
detection_keywords:
|
||||
- "passiert"
|
||||
- "erlebt"
|
||||
- "gefühl"
|
||||
- "situation"
|
||||
- "stolz"
|
||||
- "geärgert"
|
||||
- "reaktion"
|
||||
- "moment"
|
||||
- "konflikt"
|
||||
# Ghostwriter Schema: Sprechende Anweisungen für besseren Textfluss
|
||||
detection_keywords: ["passiert", "erlebt", "gefühl", "situation", "reaktion"]
|
||||
schema:
|
||||
- "Situation (Was ist passiert?)"
|
||||
- "Meine Reaktion (Was habe ich getan?)"
|
||||
|
|
@ -87,48 +98,37 @@ types:
|
|||
chunking_profile: sliding_smart_edges
|
||||
retriever_weight: 0.97
|
||||
edge_defaults: ["references", "depends_on"]
|
||||
detection_keywords:
|
||||
- "projekt"
|
||||
- "vorhaben"
|
||||
- "ziel ist"
|
||||
- "meilenstein"
|
||||
- "planen"
|
||||
- "starten"
|
||||
- "mission"
|
||||
detection_keywords: ["projekt", "vorhaben", "ziel ist", "planen", "starten"]
|
||||
schema:
|
||||
- "Mission & Zielsetzung"
|
||||
- "Aktueller Status & Blockaden"
|
||||
- "Nächste konkrete Schritte"
|
||||
- "Stakeholder & Ressourcen"
|
||||
|
||||
decision:
|
||||
chunking_profile: structured_smart_edges
|
||||
retriever_weight: 1.00 # MAX: Entscheidungen sind Gesetz
|
||||
# Strict, damit jede Entscheidung atomar bleibt
|
||||
chunking_profile: structured_smart_edges_strict
|
||||
retriever_weight: 1.00
|
||||
edge_defaults: ["caused_by", "references"]
|
||||
detection_keywords:
|
||||
- "entschieden"
|
||||
- "wahl"
|
||||
- "optionen"
|
||||
- "alternativen"
|
||||
- "beschluss"
|
||||
- "adr"
|
||||
detection_keywords: ["entschieden", "wahl", "optionen", "alternativen", "adr"]
|
||||
schema:
|
||||
- "Kontext & Problemstellung"
|
||||
- "Betrachtete Optionen (Alternativen)"
|
||||
- "Betrachtete Optionen"
|
||||
- "Die Entscheidung"
|
||||
- "Begründung (Warum diese Wahl?)"
|
||||
- "Begründung"
|
||||
|
||||
# --- PERSÖNLICHKEIT & IDENTITÄT ---
|
||||
|
||||
value:
|
||||
chunking_profile: structured_smart_edges
|
||||
# Strict, damit Werte nicht verschwimmen
|
||||
chunking_profile: structured_smart_edges_strict
|
||||
retriever_weight: 1.00
|
||||
edge_defaults: ["related_to"]
|
||||
detection_keywords: ["wert", "wichtig ist", "moral", "ethik"]
|
||||
schema: ["Definition", "Warum mir das wichtig ist", "Leitsätze für den Alltag"]
|
||||
schema: ["Definition", "Warum mir das wichtig ist", "Leitsätze"]
|
||||
|
||||
principle:
|
||||
chunking_profile: structured_smart_edges
|
||||
# L3 Strict für P3/P3a und Tier2/MP1 Logik
|
||||
chunking_profile: structured_smart_edges_strict_L3
|
||||
retriever_weight: 0.95
|
||||
edge_defaults: ["derived_from", "references"]
|
||||
detection_keywords: ["prinzip", "regel", "grundsatz", "leitlinie"]
|
||||
|
|
@ -138,11 +138,11 @@ types:
|
|||
chunking_profile: sliding_short
|
||||
retriever_weight: 0.90
|
||||
edge_defaults: ["related_to"]
|
||||
detection_keywords: ["glaube", "überzeugung", "denke dass", "meinung"]
|
||||
schema: ["Der Glaubenssatz", "Ursprung & Reflexion"]
|
||||
|
||||
profile:
|
||||
chunking_profile: structured_smart_edges
|
||||
# Strict: Jede Rolle (H2) muss ein eigener Chunk sein
|
||||
chunking_profile: structured_smart_edges_strict
|
||||
retriever_weight: 0.70
|
||||
edge_defaults: ["references", "related_to"]
|
||||
schema: ["Rolle / Identität", "Fakten & Daten", "Historie"]
|
||||
|
|
@ -159,8 +159,8 @@ types:
|
|||
chunking_profile: sliding_short
|
||||
retriever_weight: 0.85
|
||||
edge_defaults: ["related_to", "blocks"]
|
||||
detection_keywords: ["risiko", "gefahr", "bedrohung", "problem", "angst"]
|
||||
schema: ["Beschreibung des Risikos", "Mögliche Auswirkungen", "Gegenmaßnahmen"]
|
||||
detection_keywords: ["risiko", "gefahr", "bedrohung"]
|
||||
schema: ["Beschreibung des Risikos", "Auswirkungen", "Gegenmaßnahmen"]
|
||||
|
||||
# --- BASIS & WISSEN ---
|
||||
|
||||
|
|
@ -168,10 +168,7 @@ types:
|
|||
chunking_profile: sliding_smart_edges
|
||||
retriever_weight: 0.60
|
||||
edge_defaults: ["references", "related_to"]
|
||||
schema:
|
||||
- "Definition"
|
||||
- "Kontext & Hintergrund"
|
||||
- "Verwandte Konzepte"
|
||||
schema: ["Definition", "Kontext", "Verwandte Konzepte"]
|
||||
|
||||
task:
|
||||
chunking_profile: sliding_short
|
||||
|
|
@ -183,19 +180,36 @@ types:
|
|||
chunking_profile: sliding_standard
|
||||
retriever_weight: 0.80
|
||||
edge_defaults: ["references", "related_to"]
|
||||
schema: ["Log-Eintrag", "Gedanken & Erkenntnisse"]
|
||||
schema: ["Log-Eintrag", "Gedanken"]
|
||||
|
||||
source:
|
||||
chunking_profile: sliding_standard
|
||||
retriever_weight: 0.50
|
||||
edge_defaults: []
|
||||
schema:
|
||||
- "Metadaten (Autor, URL, Datum)"
|
||||
- "Kernaussage / Zusammenfassung"
|
||||
- "Zitate & Notizen"
|
||||
schema: ["Metadaten", "Zusammenfassung", "Zitate"]
|
||||
|
||||
glossary:
|
||||
chunking_profile: sliding_short
|
||||
retriever_weight: 0.40
|
||||
edge_defaults: ["related_to"]
|
||||
schema: ["Begriff", "Definition"]
|
||||
|
||||
person:
|
||||
chunking_profile: sliding_standard
|
||||
retriever_weight: 0.50
|
||||
edge_defaults: ["related_to"]
|
||||
schema: ["Rolle", "Beziehung", "Kontext"]
|
||||
|
||||
event:
|
||||
chunking_profile: sliding_standard
|
||||
retriever_weight: 0.60
|
||||
edge_defaults: ["related_to"]
|
||||
schema: ["Datum & Ort", "Teilnehmer", "Ergebnisse"]
|
||||
|
||||
# --- FALLBACK ---
|
||||
|
||||
default:
|
||||
chunking_profile: sliding_standard
|
||||
retriever_weight: 1.00
|
||||
edge_defaults: ["references"]
|
||||
schema: ["Inhalt"]
|
||||
|
|
@ -58,6 +58,7 @@ Das Repository ist in **logische Domänen** unterteilt.
|
|||
| Datei | Inhalt & Zweck |
|
||||
| :--- | :--- |
|
||||
| `05_developer_guide.md` | **Workflow.** Hardware-Setup (Win/Pi/Beelink), Git-Flow, Test-Befehle, Modul-Interna. |
|
||||
| `05_genai_best_practices.md` | **AI Workflow.** Prompt-Library, Templates und Best Practices für die Entwicklung mit LLMs. |
|
||||
|
||||
### 📂 06_Roadmap & 99_Archive
|
||||
*Zielgruppe: Projektleitung*
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@
|
|||
doc_type: glossary
|
||||
audience: all
|
||||
status: active
|
||||
version: 2.6
|
||||
version: 2.6.0
|
||||
context: "Definitionen zentraler Begriffe und Entitäten im Mindnet-System."
|
||||
---
|
||||
|
||||
|
|
@ -13,24 +13,26 @@ context: "Definitionen zentraler Begriffe und Entitäten im Mindnet-System."
|
|||
## Kern-Entitäten
|
||||
|
||||
* **Note:** Repräsentiert eine Markdown-Datei. Die fachliche Haupteinheit.
|
||||
* **Chunk:** Ein Textabschnitt einer Note (meist 512 Tokens). Die technische Sucheinheit (Vektor).
|
||||
* **Chunk:** Ein Textabschnitt einer Note. Die technische Sucheinheit (Vektor). Durch neue Strategien kann dies ein Fließtext-Abschnitt oder ein logisches Kapitel (Heading) sein.
|
||||
* **Edge:** Eine gerichtete Verbindung zwischen zwei Knoten (Chunks oder Notes).
|
||||
* **Vault:** Der lokale Ordner mit den Markdown-Dateien (Source of Truth).
|
||||
* **Frontmatter:** Der YAML-Header am Anfang einer Notiz (enthält `id`, `type`, `title`).
|
||||
|
||||
## Komponenten
|
||||
|
||||
* **Importer:** Das Python-Skript (`ingestion.py`), das Markdown liest und in Qdrant schreibt.
|
||||
* **Importer:** Das Python-Skript (`import_markdown.py`), das Markdown liest und in Qdrant schreibt.
|
||||
* **Retriever:** Die Komponente, die sucht. Nutzt hybrides Scoring (Semantik + Graph).
|
||||
* **Decision Engine:** Teil des Routers, der entscheidet, wie auf eine Anfrage reagiert wird (z.B. Strategie wählen).
|
||||
* **Hybrid Router v5:** Die Logik, die erkennt, ob der User eine Frage stellt (`RAG`) oder einen Befehl gibt (`INTERVIEW`).
|
||||
* **Draft Editor:** Die Web-UI-Komponente, in der generierte Notizen bearbeitet werden.
|
||||
* **Traffic Control:** Ein Mechanismus im `LLMService`, der Chat-Anfragen priorisiert und Hintergrund-Jobs (wie Import) drosselt.
|
||||
* **Traffic Control (WP15):** Ein Mechanismus im `LLMService`, der Prioritäten verwaltet (`realtime` für Chat vs. `background` für Import) und Hintergrund-Tasks mittels Semaphoren drosselt.
|
||||
|
||||
## Konzepte & Features
|
||||
|
||||
* **Active Intelligence:** Feature im Web-Editor, das während des Schreibens automatisch Links vorschlägt.
|
||||
* **Smart Edge Allocation (WP15):** Ein KI-Verfahren, das prüft, ob ein Link in einer Notiz für einen spezifischen Textabschnitt relevant ist.
|
||||
* **Smart Edge Allocation (WP15):** Ein KI-Verfahren, das prüft, ob ein Link in einer Notiz für einen spezifischen Textabschnitt relevant ist, statt ihn blind allen Chunks zuzuordnen.
|
||||
* **Strict Heading Split:** Chunking-Strategie, bei der Überschriften (z.B. H2) als harte Grenzen dienen. Verhindert das Vermischen von Themen (z.B. zwei unterschiedliche Rollen in einem Chunk). Besitzt ein "Safety Net" für zu lange Abschnitte.
|
||||
* **Soft Heading Split:** Chunking-Strategie, die Überschriften respektiert, aber kleine Abschnitte zusammenfasst, um Vektor-Kontext zu füllen ("Fuller Chunks").
|
||||
* **Healing Parser:** UI-Funktion, die fehlerhaften Output des LLMs (z.B. defektes YAML) automatisch repariert.
|
||||
* **Explanation Layer:** Die Schicht, die dem Nutzer erklärt, *warum* ein Suchergebnis gefunden wurde (z.B. "Weil Projekt X davon abhängt").
|
||||
* **Provenance:** Die Herkunft einer Kante.
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@ doc_type: user_manual
|
|||
audience: user, author
|
||||
scope: vault, markdown, schema
|
||||
status: active
|
||||
version: 2.6
|
||||
version: 2.7.0
|
||||
context: "Regelwerk für das Erstellen von Notizen im Vault. Die 'Source of Truth' für Autoren."
|
||||
---
|
||||
|
||||
|
|
@ -46,11 +46,12 @@ tags: [ki, entwicklung] # Taxonomie
|
|||
---
|
||||
```
|
||||
|
||||
**Optionale Felder:**
|
||||
**Optionale Felder & Overrides (Advanced):**
|
||||
* `aliases`: [Alpha Projekt] – Wichtig für "Active Intelligence" (Exact Match).
|
||||
* `visibility`: internal (default) / public.
|
||||
|
||||
> **Wichtig:** Felder wie `retriever_weight` oder `chunk_profile` werden zentral über `types.yaml` gesteuert und müssen nicht mehr manuell gesetzt werden (Virtual Schema Layer).
|
||||
* **NEU:** Du kannst die KI-Steuerung manuell überschreiben, wenn dir der Standard für den Typ nicht passt:
|
||||
* `chunking_profile`: Zwingt den Chunker in einen Modus (z.B. `structured_smart_edges_strict`).
|
||||
* `retriever_weight`: Setzt die Wichtigkeit manuell hoch/runter (z.B. `1.5` für extrem wichtig).
|
||||
|
||||
---
|
||||
|
||||
|
|
@ -136,11 +137,9 @@ Ich habe gelernt: Das ist oft das Zeichen kurz vor dem Durchbruch.
|
|||
|
||||
---
|
||||
|
||||
## 6. Best Practices & Beispiele (Klassik)
|
||||
## 6. Best Practices & Beispiele
|
||||
|
||||
Hier sind vollständige Vorlagen für häufige Typen.
|
||||
|
||||
### 6.1 Beispiel: Projekt-Notiz
|
||||
### 6.1 Beispiel: Projekt-Notiz (Standard)
|
||||
Projekte profitieren von `depends_on`, um Abhängigkeiten zu klären.
|
||||
|
||||
```markdown
|
||||
|
|
@ -159,11 +158,11 @@ Wir bauen ein persönliches Wissensnetz.
|
|||
Wir nutzen [[rel:depends_on Qdrant]] für die Vektorsuche und [[rel:depends_on FastAPI]] für das Backend.
|
||||
|
||||
## Architektur
|
||||
Das Konzept basiert auf [[RAG Architecture]]. (Automatisch 'depends_on' durch Typ-Default, falls konfiguriert).
|
||||
Das Konzept basiert auf [[RAG Architecture]].
|
||||
```
|
||||
|
||||
### 6.2 Beispiel: Entscheidung (Decision Record)
|
||||
Entscheidungen sind hoch gewichtet (`retriever_weight: 1.0`).
|
||||
### 6.2 Beispiel: Advanced Tuning (Manuelles Override)
|
||||
Hier zwingen wir das System, eine Entscheidung extrem kleinteilig (`strict`) zu zerlegen und in der Suche maximal zu priorisieren.
|
||||
|
||||
```markdown
|
||||
---
|
||||
|
|
@ -172,6 +171,9 @@ title: ADR: Wahl von Qdrant
|
|||
type: decision
|
||||
status: final
|
||||
tags: [architektur, db]
|
||||
# OVERRIDES: Wir wollen diese Notiz extrem wichtig machen und strikt trennen
|
||||
chunking_profile: structured_smart_edges_strict
|
||||
retriever_weight: 1.5
|
||||
---
|
||||
|
||||
# Entscheidung: Qdrant
|
||||
|
|
@ -180,13 +182,11 @@ Wir haben uns für Qdrant entschieden.
|
|||
|
||||
## Alternativen
|
||||
Wir haben auch [[rel:similar_to Pinecone]] und [[rel:similar_to Weaviate]] betrachtet.
|
||||
|
||||
## Begründung
|
||||
Qdrant erlaubt lokalen Betrieb und [[rel:solves Payload Filtering Requirements]].
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 7. Langfristige Stabilität
|
||||
## 7. Virtual Schema Layer
|
||||
|
||||
Wir nutzen das Prinzip des **Virtual Schema Layers**. Wir kodieren keine Logik (wie `chunk_size`) in die Notizen. Das wird zentral in der `types.yaml` verwaltet. Das bedeutet für dich: Du kannst dich rein auf den Inhalt konzentrieren. Wenn wir die Chunking-Strategie ändern, müssen wir nicht 1000 Markdown-Dateien anfassen.
|
||||
Grundsätzlich gilt das Prinzip des **Virtual Schema Layers**. Die Logik (wie `chunk_size`) wird zentral in der `types.yaml` verwaltet.
|
||||
**Aber:** Als Power-User hast du über die oben genannten Overrides (`chunking_profile`) jederzeit die Möglichkeit, aus diesem Standard auszubrechen, wenn eine spezifische Notiz eine Sonderbehandlung benötigt.
|
||||
|
|
@ -3,7 +3,7 @@ doc_type: technical_reference
|
|||
audience: developer, admin
|
||||
scope: configuration, env
|
||||
status: active
|
||||
version: 2.6
|
||||
version: 2.7.0
|
||||
context: "Referenztabellen für Umgebungsvariablen und YAML-Konfigurationen."
|
||||
---
|
||||
|
||||
|
|
@ -30,10 +30,9 @@ Diese Variablen steuern die Infrastruktur, Timeouts und Feature-Flags.
|
|||
| `MINDNET_OLLAMA_URL` | `http://127.0.0.1:11434`| URL zum LLM-Server. |
|
||||
| `MINDNET_LLM_TIMEOUT` | `300.0` | Timeout in Sekunden (Erhöht für CPU Cold-Starts). |
|
||||
| `MINDNET_API_TIMEOUT` | `300.0` | Frontend Timeout (Erhöht für Smart Edge Wartezeiten). |
|
||||
| `MINDNET_LLM_BACKGROUND_LIMIT`| `2` | **Traffic Control:** Max. parallele Import-Tasks. |
|
||||
| `MINDNET_LLM_BACKGROUND_LIMIT`| `2` | **Traffic Control (Neu):** Max. parallele Import-Tasks (Semaphore). |
|
||||
| `MINDNET_VAULT_ROOT` | `./vault` | Pfad für Write-Back Operationen (Drafts). |
|
||||
| `MINDNET_HASH_COMPARE` | `Body` | Import-Strategie: `Body`, `Frontmatter` oder `Full`. |
|
||||
| `MINDNET_HASH_SOURCE` | `parsed` | Hash-Quelle: `parsed`, `raw` oder `file`. |
|
||||
| `MINDNET_CHANGE_DETECTION_MODE` | `full` | **Change Detection (Neu):** `full` (Text + Meta) oder `body` (nur Text). |
|
||||
|
||||
---
|
||||
|
||||
|
|
@ -41,22 +40,34 @@ Diese Variablen steuern die Infrastruktur, Timeouts und Feature-Flags.
|
|||
|
||||
Steuert das Import-Verhalten, Chunking und die Kanten-Logik pro Typ.
|
||||
|
||||
**Referenztabelle (Stand v2.6):**
|
||||
### 2.1 Konfigurations-Hierarchie (Override-Logik)
|
||||
Seit Version 2.7.0 gilt für `chunking_profile` und `retriever_weight` folgende Priorität:
|
||||
|
||||
| Typ (`type`) | Chunk Profile | Retriever Weight | Smart Edges? | Beschreibung |
|
||||
1. **Frontmatter (Höchste Prio):** Ein Wert direkt in der Markdown-Datei überschreibt alles.
|
||||
* *Beispiel:* `chunking_profile: structured_smart_edges_strict` im Header einer Notiz erzwingt diesen Splitter, egal welcher Typ eingestellt ist.
|
||||
2. **Type Config:** Der Standardwert für den `type` (z.B. `concept`) aus `types.yaml`.
|
||||
3. **Global Default:** Fallback aus `defaults` in `types.yaml`.
|
||||
|
||||
### 2.2 Typ-Referenztabelle
|
||||
|
||||
| Typ (`type`) | Chunk Profile (Standard) | Retriever Weight | Smart Edges? | Beschreibung |
|
||||
| :--- | :--- | :--- | :--- | :--- |
|
||||
| **concept** | `sliding_smart_edges` | 0.60 | Ja | Abstrakte Begriffe. |
|
||||
| **project** | `sliding_smart_edges` | 0.97 | Ja | Aktive Vorhaben. |
|
||||
| **decision** | `structured_smart_edges` | 1.00 | Ja | Entscheidungen (ADRs). |
|
||||
| **decision** | `structured_smart_edges_strict` | 1.00 | Ja | Entscheidungen (ADRs). Atomar. |
|
||||
| **experience** | `sliding_smart_edges` | 0.90 | Ja | Persönliche Learnings. |
|
||||
| **journal** | `sliding_standard` | 0.80 | Nein | Logs / Dailies. |
|
||||
| **value** | `structured_smart_edges` | 1.00 | Ja | Werte/Prinzipien. |
|
||||
| **value** | `structured_smart_edges_strict` | 1.00 | Ja | Werte/Prinzipien. Atomar. |
|
||||
| **risk** | `sliding_short` | 0.85 | Nein | Risiken. |
|
||||
| **person** | `sliding_standard` | 0.50 | Nein | Profile. |
|
||||
| **source** | `sliding_standard` | 0.50 | Nein | Externe Quellen. |
|
||||
| **event** | `sliding_standard` | 0.60 | Nein | Meetings. |
|
||||
| **goal** | `sliding_standard` | 0.95 | Nein | Strategische Ziele. |
|
||||
| **goal** | `sliding_smart_edges` | 0.95 | Ja | Strategische Ziele. |
|
||||
| **belief** | `sliding_short` | 0.90 | Nein | Glaubenssätze. |
|
||||
| **profile** | `structured_smart_edges_strict` | 0.70 | Ja | Rollenprofile. Strict Split. |
|
||||
| **principle** | `structured_smart_edges_strict_L3`| 0.95 | Ja | Prinzipien. Tiefer Split (H3) für Mikro-Prinzipien. |
|
||||
| **task** | `sliding_short` | 0.80 | Nein | Aufgaben. |
|
||||
| **glossary** | `sliding_short` | 0.40 | Nein | Begriffsdefinitionen. |
|
||||
| **default** | `sliding_standard` | 1.00 | Nein | Fallback. |
|
||||
|
||||
*Hinweis: `Smart Edges?` entspricht dem YAML-Key `enable_smart_edge_allocation: true`.*
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@ doc_type: technical_reference
|
|||
audience: developer, architect
|
||||
scope: database, qdrant, schema
|
||||
status: active
|
||||
version: 2.6
|
||||
version: 2.7.0
|
||||
context: "Exakte Definition der Datenmodelle (Payloads) in Qdrant und Index-Anforderungen."
|
||||
---
|
||||
|
||||
|
|
@ -31,13 +31,20 @@ Repräsentiert die Metadaten einer Markdown-Datei (1:1 Beziehung).
|
|||
"note_id": "string (keyword)", // UUIDv5 (deterministisch) oder Slug
|
||||
"title": "string (text)", // Titel aus Frontmatter
|
||||
"type": "string (keyword)", // Logischer Typ (z.B. 'project', 'concept')
|
||||
"retriever_weight": "float", // Numerische Wichtigkeit (0.0-1.0), aus types.yaml
|
||||
"chunk_profile": "string", // Genutztes Profil (z.B. 'sliding_smart_edges')
|
||||
"retriever_weight": "float", // Effektive Wichtigkeit (Frontmatter > Type > Default)
|
||||
"chunk_profile": "string", // Effektives Profil (Frontmatter > Type > Default)
|
||||
"edge_defaults": ["string"], // Liste der aktiven Default-Kanten
|
||||
"tags": ["string"], // Liste von Tags aus Frontmatter
|
||||
"aliases": ["string"], // Synonyme für Discovery (WP-11)
|
||||
"created": "string (iso-date)", // Erstellungsdatum
|
||||
"updated": "integer", // Timestamp (File Modification Time)
|
||||
"fulltext": "string (no-index)" // Gesamter Text (nur für Recovery/Export)
|
||||
"fulltext": "string (no-index)", // Gesamter Text (nur für Recovery/Export)
|
||||
|
||||
// NEU in v2.7: Multi-Hash für flexible Change Detection
|
||||
"hashes": {
|
||||
"body:parsed:canonical": "string", // Hash nur über den Text-Body
|
||||
"full:parsed:canonical": "string" // Hash über Text + Metadaten (Tags, Title, Config)
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
|
|
@ -68,10 +75,12 @@ Die atomare Sucheinheit. Enthält den Vektor.
|
|||
"text": "string (text)", // Reintext für Anzeige (ohne Overlap)
|
||||
"window": "string (text)", // Text + Overlap (Basis für Embedding)
|
||||
"ord": "integer", // Laufende Nummer (1..N) für Sortierung
|
||||
"retriever_weight": "float", // Kopie aus Note (für Query-Speed)
|
||||
"chunk_profile": "string", // Vererbt von Note
|
||||
"retriever_weight": "float", // Geerbt von Note (für schnelles Re-Ranking)
|
||||
"chunk_profile": "string", // Geerbt von Note (für Debugging/Filtering)
|
||||
"neighbors_prev": ["string"], // ID des Vorgängers (Linked List)
|
||||
"neighbors_next": ["string"] // ID des Nachfolgers
|
||||
"neighbors_next": ["string"], // ID des Nachfolgers
|
||||
"section": "string", // Pfad/Überschrift, zu der der Chunk gehört
|
||||
"source_path": "string" // Relativer Pfad zur Datei
|
||||
}
|
||||
```
|
||||
|
||||
|
|
@ -99,7 +108,7 @@ Gerichtete Kanten zwischen Knoten. Stark erweitert in v2.6 für Provenienz-Track
|
|||
"note_id": "string (keyword)", // Owner Note ID (Ursprung der Kante)
|
||||
|
||||
// Provenance & Quality (WP03/WP15)
|
||||
"provenance": "keyword", // 'explicit', 'rule', 'smart' (NEU)
|
||||
"provenance": "keyword", // 'explicit', 'rule', 'smart', 'structure'
|
||||
"rule_id": "string (keyword)", // Traceability: 'inline:rel', 'explicit:wikilink', 'smart:llm'
|
||||
"confidence": "float" // Vertrauenswürdigkeit (0.0 - 1.0)
|
||||
}
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@ doc_type: technical_reference
|
|||
audience: developer, frontend_architect
|
||||
scope: architecture, graph_viz, state_management
|
||||
status: active
|
||||
version: 2.6
|
||||
version: 2.7.0
|
||||
context: "Technische Dokumentation des modularen Streamlit-Frontends, der Graph-Engines und des Editors."
|
||||
---
|
||||
|
||||
|
|
@ -28,7 +28,7 @@ Seit Version 2.6 ist das Frontend (`app/frontend/`) kein Monolith mehr, sondern
|
|||
| `ui_utils.py` | **Helper.** Markdown-Parsing (`parse_markdown_draft`) und String-Normalisierung. |
|
||||
| `ui_graph_service.py`| **Data Logic.** Holt Daten aus Qdrant und bereitet Nodes/Edges auf (unabhängig von der Vis-Library). |
|
||||
| `ui_graph_cytoscape.py`| **View: Graph.** Implementierung mit `st-cytoscape` (COSE Layout). |
|
||||
| `ui_editor.py` | **View: Editor.** Logik für Drafts und manuelles Editieren. |
|
||||
| `ui_editor.py` | **View: Editor.** Logik für Drafts, manuelles Editieren und **Async Feedback**. |
|
||||
|
||||
### 1.2 Konfiguration (`ui_config.py`)
|
||||
|
||||
|
|
@ -117,7 +117,19 @@ Der `switch_to_editor_callback` in `ui_callbacks.py` implementiert folgende Kask
|
|||
})
|
||||
st.session_state["sidebar_mode_selection"] = "📝 Manueller Editor"
|
||||
|
||||
Dies garantiert, dass der Editor immer den **echten, aktuellen Stand** der Markdown-Datei anzeigt.
|
||||
### 3.3 Async Save Pattern (Neu in v2.7 / WP-14)
|
||||
Um Timeouts bei der Smart-Edge-Berechnung zu vermeiden, nutzt der Editor ein **"Fire & Forget"**-Muster.
|
||||
|
||||
1. **Request:** UI sendet Markdown an `/ingest/save`.
|
||||
2. **Backend:**
|
||||
* Validiert Request.
|
||||
* Speichert Datei auf Disk (Persistenz garantiert).
|
||||
* Startet `BackgroundTasks` für LLM-Analyse und Embedding.
|
||||
* Gibt sofort `status: queued` zurück.
|
||||
3. **UI Feedback:**
|
||||
* Editor zeigt "Erfolgreich eingereiht".
|
||||
* User muss nicht warten.
|
||||
* (ToDo: WebSocket Notification bei Abschluss).
|
||||
|
||||
---
|
||||
|
||||
|
|
|
|||
|
|
@ -3,36 +3,45 @@ doc_type: technical_reference
|
|||
audience: developer, devops
|
||||
scope: backend, ingestion, smart_edges
|
||||
status: active
|
||||
version: 2.6
|
||||
context: "Detaillierte technische Beschreibung der Import-Pipeline, Quality Gates und CLI-Befehle."
|
||||
version: 2.7.0
|
||||
context: "Detaillierte technische Beschreibung der Import-Pipeline, Chunking-Strategien und CLI-Befehle."
|
||||
---
|
||||
|
||||
# Ingestion Pipeline & Smart Processing
|
||||
|
||||
**Quellen:** `pipeline_playbook.md`, `Handbuch.md`
|
||||
|
||||
Die Ingestion transformiert Markdown in den Graphen. Entrypoint: `scripts/import_markdown.py`.
|
||||
Die Ingestion transformiert Markdown in den Graphen. Entrypoint: `scripts/import_markdown.py` (CLI) oder `routers/ingest.py` (API).
|
||||
|
||||
## 1. Der Import-Prozess (13-Schritte-Workflow)
|
||||
## 1. Der Import-Prozess (14-Schritte-Workflow)
|
||||
|
||||
Der Prozess ist **asynchron** und **idempotent**.
|
||||
|
||||
1. **Markdown lesen:** Rekursives Scannen des Vaults.
|
||||
2. **Frontmatter extrahieren:** Validierung von Pflichtfeldern (`id`, `type`, `title`).
|
||||
3. **Typauflösung:** Bestimmung des `type` via `types.yaml`.
|
||||
4. **Note-Payload generieren:** Erstellen des JSON-Objekts für `mindnet_notes`.
|
||||
5. **Chunking anwenden:** Zerlegung des Textes basierend auf dem `chunk_profile` (siehe unten).
|
||||
6. **Smart Edge Allocation (WP15):**
|
||||
1. **Trigger & Async Dispatch:**
|
||||
* **API (`/save`):** Nimmt Request entgegen, validiert und startet Background-Task ("Fire & Forget"). Antwortet sofort mit `202/Queued`.
|
||||
* **CLI:** Iteriert über Dateien und nutzt `asyncio.Semaphore` zur Drosselung.
|
||||
2. **Markdown lesen:** Rekursives Scannen des Vaults.
|
||||
3. **Frontmatter extrahieren:** Validierung von Pflichtfeldern (`id`, `type`, `title`).
|
||||
4. **Config Resolution:**
|
||||
* Bestimmung von `chunking_profile` und `retriever_weight`.
|
||||
* **Priorität:** 1. Frontmatter (Override) -> 2. `types.yaml` (Type) -> 3. Default.
|
||||
5. **Note-Payload generieren:**
|
||||
* Erstellen des JSON-Objekts für `mindnet_notes`.
|
||||
* **Multi-Hash Calculation:** Berechnet Hashwerte für `body` (nur Text) und `full` (Text + Metadaten).
|
||||
6. **Change Detection:**
|
||||
* Vergleich des Hashes mit Qdrant.
|
||||
* Strategie wählbar via ENV `MINDNET_CHANGE_DETECTION_MODE` (`full` oder `body`).
|
||||
7. **Chunking anwenden:** Zerlegung des Textes basierend auf dem ermittelten Profil (siehe Kap. 3).
|
||||
8. **Smart Edge Allocation (WP15):**
|
||||
* Wenn `enable_smart_edge_allocation: true`: Der `SemanticAnalyzer` sendet Chunks an das LLM.
|
||||
* **Traffic Control:** Request nutzt `priority="background"`. Semaphore (Limit: 2) drosselt die Last.
|
||||
* **Traffic Control:** Request nutzt `priority="background"`. Semaphore (Limit via `.env`) drosselt die Last.
|
||||
* **Resilienz:** Bei Timeout (Ollama) greift ein Fallback (Broadcasting an alle Chunks).
|
||||
7. **Inline-Kanten finden:** Parsing von `[[rel:...]]`.
|
||||
8. **Callout-Kanten finden:** Parsing von `> [!edge]`.
|
||||
9. **Default-Edges erzeugen:** Anwendung der `edge_defaults` aus Registry.
|
||||
10. **Strukturkanten erzeugen:** `belongs_to`, `next`, `prev`.
|
||||
11. **Embedding (Async):** Generierung via `nomic-embed-text` (768d).
|
||||
12. **Strict Mode:** Abbruch bei leeren Embeddings oder Dimension 0.
|
||||
13. **Diagnose:** Integritäts-Check nach dem Lauf.
|
||||
9. **Inline-Kanten finden:** Parsing von `[[rel:...]]`.
|
||||
10. **Callout-Kanten finden:** Parsing von `> [!edge]`.
|
||||
11. **Default-Edges erzeugen:** Anwendung der `edge_defaults` aus Registry.
|
||||
12. **Strukturkanten erzeugen:** `belongs_to`, `next`, `prev`.
|
||||
13. **Embedding (Async):** Generierung via `nomic-embed-text` (768 Dim).
|
||||
14. **Diagnose:** Integritäts-Check nach dem Lauf.
|
||||
|
||||
---
|
||||
|
||||
|
|
@ -44,6 +53,8 @@ Für regelmäßige Updates (Cronjob). Erkennt Änderungen via Hash.
|
|||
```bash
|
||||
export QDRANT_URL="http://localhost:6333"
|
||||
export COLLECTION_PREFIX="mindnet"
|
||||
# Steuert, wann eine Datei als "geändert" gilt
|
||||
export MINDNET_CHANGE_DETECTION_MODE="full"
|
||||
|
||||
# Nutzt das Venv der Produktionsumgebung
|
||||
/home/llmadmin/mindnet/.venv/bin/python3 -m scripts.import_markdown \
|
||||
|
|
@ -58,7 +69,7 @@ export COLLECTION_PREFIX="mindnet"
|
|||
> Das Flag `--purge-before-upsert` ist kritisch. Es löscht vor dem Schreiben einer Note ihre alten Chunks/Edges. Ohne dieses Flag entstehen **"Geister-Chunks"** (alte Textabschnitte, die im Markdown gelöscht wurden, aber im Index verbleiben).
|
||||
|
||||
### 2.2 Full Rebuild (Clean Slate)
|
||||
Notwendig bei Änderungen an `types.yaml` oder Modell-Wechsel.
|
||||
Notwendig bei Änderungen an `types.yaml` (z.B. neue Chunking-Profile) oder Modell-Wechsel.
|
||||
|
||||
```bash
|
||||
# 0. Modell sicherstellen
|
||||
|
|
@ -68,6 +79,7 @@ ollama pull nomic-embed-text
|
|||
python3 -m scripts.reset_qdrant --mode wipe --prefix "mindnet" --yes
|
||||
|
||||
# 2. Vollständiger Import (Force)
|
||||
# --force ignoriert alle Hashes und schreibt alles neu
|
||||
python3 -m scripts.import_markdown --vault ./vault --prefix "mindnet" --apply --force
|
||||
```
|
||||
|
||||
|
|
@ -75,18 +87,38 @@ python3 -m scripts.import_markdown --vault ./vault --prefix "mindnet" --apply --
|
|||
|
||||
## 3. Chunking & Payload
|
||||
|
||||
Das Chunking ist profilbasiert.
|
||||
Das Chunking ist profilbasiert und in `types.yaml` konfiguriert.
|
||||
|
||||
| Profil | Max Token | Overlap | Einsatz |
|
||||
### 3.1 Profile und Strategien
|
||||
|
||||
| Profil | Strategie | Parameter | Einsatzgebiet |
|
||||
| :--- | :--- | :--- | :--- |
|
||||
| `sliding_short` | 128 | 20 | Logs, Chats. |
|
||||
| `sliding_standard` | 512 | 50 | Massendaten. |
|
||||
| `sliding_smart_edges`| 512 | 50 | Wichtige Inhalte (Experience, Project). |
|
||||
| `structured_smart` | n/a | n/a | Trennt strikt an Headings (für ADRs). |
|
||||
| `sliding_short` | `sliding_window` | Max: 350, Target: 200 | Kurze Logs, Chats, Risiken. |
|
||||
| `sliding_standard` | `sliding_window` | Max: 650, Target: 450 | Massendaten (Journal, Quellen). |
|
||||
| `sliding_smart_edges`| `sliding_window` | Max: 600, Target: 400 | Fließtexte mit hohem Wert (Projekte, Erfahrungen). |
|
||||
| `structured_smart_edges` | `by_heading` | `strict: false` (Soft) | Strukturierte Texte, wo kleine Abschnitte gemergt werden dürfen. |
|
||||
| `structured_smart_edges_strict` | `by_heading` | `strict: true` (Hard) | **Atomare Einheiten**: Entscheidungen, Werte, Profile. |
|
||||
| `structured_smart_edges_strict_L3`| `by_heading` | `strict: true`, `level: 3` | Tief geschachtelte Prinzipien (Tier 2/MP1 Logik). |
|
||||
|
||||
**Payload-Felder:**
|
||||
* `text`: Der reine Inhalt (Anzeige).
|
||||
* `window`: Inhalt plus Overlap (für Embedding).
|
||||
### 3.2 Die `by_heading` Logik (v2.9 Hybrid)
|
||||
|
||||
Die Strategie `by_heading` zerlegt Texte anhand ihrer Struktur (Überschriften). Sie unterstützt seit v2.9 ein "Safety Net" gegen zu große Chunks.
|
||||
|
||||
* **Split Level:** Definiert die Tiefe (z.B. `2` = H1 & H2 triggern Split).
|
||||
* **Modus "Strict" (`strict_heading_split: true`):**
|
||||
* Jede Überschrift (`<= split_level`) erzwingt einen neuen Chunk.
|
||||
* *Merge-Check:* Wenn der vorherige Chunk leer war (nur Überschriften), wird gemergt (verhindert verwaiste Überschriften).
|
||||
* *Safety Net:* Wird ein Abschnitt zu lang (> `max` Token), wird auch ohne Überschrift getrennt.
|
||||
* **Modus "Soft" (`strict_heading_split: false`):**
|
||||
* **Hierarchie-Check:** Überschriften *oberhalb* des Split-Levels (z.B. H1 bei Level 2) erzwingen **immer** einen Split.
|
||||
* **Füll-Logik:** Überschriften *auf* dem Split-Level (z.B. H2) lösen nur dann einen neuen Chunk aus, wenn der aktuelle Chunk die `target`-Größe erreicht hat.
|
||||
* *Safety Net:* Auch hier greift das `max` Token Limit.
|
||||
|
||||
### 3.3 Payload-Felder (Qdrant)
|
||||
|
||||
* `text`: Der reine Inhalt (Anzeige im UI). Überschriften bleiben erhalten.
|
||||
* `window`: Inhalt plus Overlap (für Embedding). Bei `by_heading` wird der Kontext (Eltern-Überschrift) oft vorangestellt.
|
||||
* `chunk_profile`: Das effektiv genutzte Profil (zur Nachverfolgung).
|
||||
|
||||
---
|
||||
|
||||
|
|
|
|||
|
|
@ -1,76 +0,0 @@
|
|||
# SYSTEM-ANWEISUNG: SICHERES MARKDOWN-RENDERING
|
||||
|
||||
Du agierst als technischer Dokumentations-Assistent. Deine Aufgabe ist das Erstellen von Markdown-Dateien (`.md`), die oft selbst Code-Blöcke (Python, JSON, YAML, Bash) enthalten.
|
||||
|
||||
**DAS PROBLEM:**
|
||||
Wenn du eine Markdown-Datei generierst, die Code-Blöcke (```) enthält, und diese Ausgabe selbst in einen Code-Block packst, interpretiert das Chat-Interface das erste innere ` ``` ` oft fälschlicherweise als das Ende der Ausgabe. Das "zerreißt" die Datei und macht das Kopieren unmöglich.
|
||||
|
||||
**DIE REGEL (STRIKT BEFOLGEN):**
|
||||
Um eine ununterbrochene Darstellung zu garantieren, musst du zwingend eine der folgenden Kapselungs-Methoden anwenden:
|
||||
|
||||
### Methode A: Die 4-Backtick-Methode (Bevorzugt)
|
||||
Umschließe den **gesamten** Datei-Inhalt mit **4 Backticks** statt 3.
|
||||
Dies erlaubt dir, innerhalb der Datei normale 3 Backticks zu verwenden.
|
||||
|
||||
Beispiel für deinen Output:
|
||||
````markdown
|
||||
---
|
||||
title: Beispiel
|
||||
---
|
||||
Hier ist Python Code:
|
||||
```python
|
||||
print("Hello")
|
||||
### Methode B: Die 4-Space-Einrückung (Alternative)
|
||||
Wenn du außen 3 Backticks verwendest, darfst du im Inneren **KEINE** Backticks verwenden.
|
||||
Stattdessen müssen alle inneren Code-Beispiele mit **4 Leerzeichen (Spaces)** eingerückt werden.
|
||||
|
||||
Beispiel für deinen Output:
|
||||
```markdown
|
||||
Hier ist Python Code:
|
||||
|
||||
    print("Hello")
|
||||
```
|
||||
|
||||
**ZUSAMMENFASSUNG:**
|
||||
Generiere niemals verschachtelte 3-Backtick-Blöcke innerhalb von 3-Backtick-Blöcken. Nutze immer **4 Backticks** für den äußersten Container.
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
Du agierst als **Technical Documentation Lead**.
|
||||
|
||||
**Kontext:**
|
||||
Wir haben soeben das Workpackage (WP) abgeschlossen. Der Code ist implementiert, getestet und die Änderungen sind im Chat-Verlauf dokumentiert.
|
||||
Jetzt müssen wir die Systemdokumentation (Mindnet v2.6 Modular Docs) aktualisieren, um den neuen Stand widerzuspiegeln.
|
||||
|
||||
**Deine Aufgabe - Phase 1: Identifikation**
|
||||
Analysiere die durchgeführten Änderungen dieses Workpackages.
|
||||
Nutze die beiliegende `00_documentation_map.md`, um zu identifizieren, welche Dokumentations-Module von diesen Änderungen betroffen sind.
|
||||
|
||||
**Beachte die Mapping-Logik:**
|
||||
* Haben wir neue Features/Konzepte eingeführt? -> `00_glossary.md`, `02_Concepts/*`
|
||||
* Haben wir die Datenbank/Payloads geändert? -> `03_tech_data_model.md`
|
||||
* Hat sich der Import/Algorithmus geändert? -> `03_tech_ingestion_pipeline.md`, `03_tech_retrieval_scoring.md`
|
||||
* Muss der Admin etwas Neues konfigurieren? -> `03_tech_configuration.md`, `04_admin_operations.md`
|
||||
* Ändert sich etwas für den Nutzer/Autor? -> `01_User_Manual/*`
|
||||
|
||||
**Output für Phase 1:**
|
||||
Erstelle eine **Liste der betroffenen Dateien** mit einer kurzen Begründung pro Datei (z.B. "Muss neuen Parameter X aufnehmen").
|
||||
Fordere mich dann explizit auf, dir diese spezifischen Dateien hochzuladen.
|
||||
|
||||
---
|
||||
|
||||
**Deine Aufgabe - Phase 2: Sequenzielle Bearbeitung (Warte auf Dateien)**
|
||||
Sobald ich die Dateien hochgeladen habe, aktualisieren wir sie **Schritt für Schritt**.
|
||||
1. Nimm dir **eine** Datei aus der Liste vor.
|
||||
2. Schreibe den kompletten, aktualisierten Inhalt dieser Datei als Markdown-Codeblock.
|
||||
* *Wichtig:* Halte dich strikt an den bestehenden Stil (Frontmatter, JSON-Beispiele, Warnhinweise).
|
||||
* *Wichtig:* Füge Änderungen nahtlos ein, lösche nichts Relevantes.
|
||||
3. **Warte** nach der Ausgabe der Datei auf mein "OK" oder "Weiter", bevor du die nächste Datei bearbeitest.
|
||||
|
||||
**Sonderaufgabe Roadmap:**
|
||||
Aktualisiere am Ende immer die `06_active_roadmap.md`:
|
||||
* Setze den Status des aktuellen WPs auf "Fertig/Live".
|
||||
* Verschiebe Details in die Historie-Tabelle (falls relevant).
|
||||
|
||||
**Bist du bereit für die Analyse? (Ich habe Map und Roadmap hochgeladen).**
|
||||
|
|
@ -3,151 +3,292 @@ doc_type: developer_guide
|
|||
audience: developer
|
||||
scope: workflow, testing, architecture, modules
|
||||
status: active
|
||||
version: 2.6
|
||||
version: 2.6.1
|
||||
context: "Umfassender Guide für Entwickler: Architektur, Modul-Interna (Deep Dive), Setup, Git-Workflow und Erweiterungs-Anleitungen."
|
||||
---
|
||||
|
||||
# Mindnet Developer Guide & Workflow
|
||||
|
||||
**Quellen:** `developer_guide.md`, `dev_workflow.md`
|
||||
**Quellen:** `developer_guide.md`, `dev_workflow.md`, `Architecture_Audit_v2.6`
|
||||
|
||||
Dieser Guide vereint das technische Verständnis der Module mit dem operativen Workflow zwischen Windows (Dev) und Linux (Runtime).
|
||||
Dieser Guide ist die zentrale technische Referenz für Mindnet v2.6. Er vereint das technische Verständnis der Module mit dem operativen Workflow zwischen Windows (Dev) und Linux (Runtime).
|
||||
|
||||
---
|
||||
|
||||
## 1. Die physische Architektur
|
||||
# Inhaltsverzeichnis
|
||||
- [Mindnet Developer Guide \& Workflow](#mindnet-developer-guide--workflow)
|
||||
- [Inhaltsverzeichnis](#inhaltsverzeichnis)
|
||||
- [1. Einführung \& Systemüberblick](#1-einführung--systemüberblick)
|
||||
- [Was ist Mindnet?](#was-ist-mindnet)
|
||||
- [Kern-Philosophie](#kern-philosophie)
|
||||
- [2. Architektur](#2-architektur)
|
||||
- [2.1 High-Level Übersicht](#21-high-level-übersicht)
|
||||
- [2.2 Datenfluss-Muster](#22-datenfluss-muster)
|
||||
- [A. Ingestion (Write)](#a-ingestion-write)
|
||||
- [B. Retrieval (Read)](#b-retrieval-read)
|
||||
- [C. Visualisierung (Graph)](#c-visualisierung-graph)
|
||||
- [3. Physische Architektur](#3-physische-architektur)
|
||||
- [4. Projektstruktur \& Modul-Referenz (Deep Dive)](#4-projektstruktur--modul-referenz-deep-dive)
|
||||
- [4.1 Verzeichnisbaum](#41-verzeichnisbaum)
|
||||
- [4.2 Frontend Architecture (`app/frontend/`)](#42-frontend-architecture-appfrontend)
|
||||
- [Frontend Design Patterns (Wichtig!)](#frontend-design-patterns-wichtig)
|
||||
- [4.3 Backend Architecture (`app/`)](#43-backend-architecture-app)
|
||||
- [4.4 Scripts \& Tooling (Die Admin-Toolbox)](#44-scripts--tooling-die-admin-toolbox)
|
||||
- [1. Script-Übersicht](#1-script-übersicht)
|
||||
- [2. Einsatzszenarien \& Bewertung](#2-einsatzszenarien--bewertung)
|
||||
- [5. Maintenance \& "Kill List"](#5-maintenance--kill-list)
|
||||
- [6. Lokales Setup (Development)](#6-lokales-setup-development)
|
||||
- [7. Der Entwicklungs-Zyklus (Workflow)](#7-der-entwicklungs-zyklus-workflow)
|
||||
- [Phase 1: Windows (Code)](#phase-1-windows-code)
|
||||
- [Phase 2: Beelink (Test / Dev)](#phase-2-beelink-test--dev)
|
||||
- [Phase 3: Release \& Deployment (Prod)](#phase-3-release--deployment-prod)
|
||||
- [8. Erweiterungs-Guide: "Teach-the-AI"](#8-erweiterungs-guide-teach-the-ai)
|
||||
- [Workflow A: Neuen Typ implementieren (z. B. `type: risk`)](#workflow-a-neuen-typ-implementieren-z-b-type-risk)
|
||||
- [Workflow B: Graph-Farben ändern](#workflow-b-graph-farben-ändern)
|
||||
- [9. Tests \& Debugging](#9-tests--debugging)
|
||||
- [10. Troubleshooting \& One-Liners](#10-troubleshooting--one-liners)
|
||||
|
||||
---
|
||||
|
||||
## 1. Einführung & Systemüberblick
|
||||
|
||||
### Was ist Mindnet?
|
||||
Mindnet ist ein **hybrides Knowledge-Management-System**, das klassische Notizen (Markdown) mit KI-gestützter Analyse verbindet. Es kombiniert **RAG** (Retrieval-Augmented Generation) mit einem **Wissensgraphen**, dessen Knoten und Kanten in der Vektor-Datenbank Qdrant gespeichert sind, um Wissen nicht nur semantisch zu finden, sondern auch strukturell zu vernetzen.
|
||||
|
||||
### Kern-Philosophie
|
||||
1. **Filesystem First:** Die Wahrheit liegt immer auf der Festplatte (Markdown-Dateien). Die Datenbank ist ein abgeleiteter Index.
|
||||
2. **Hybrid Retrieval:** Relevanz entsteht aus Textähnlichkeit (Semantik) + Graphen-Verbindungen (Edges) + Wichtigkeit (Centrality).
|
||||
3. **Active Intelligence:** Das System wartet nicht nur auf Anfragen, sondern schlägt beim Schreiben proaktiv Verbindungen vor ("Matrix Logic").
|
||||
4. **Local Privacy:** Alle KI-Berechnungen (Ollama) laufen lokal. Keine Cloud-Abhängigkeit für Inference.
|
||||
|
||||
---
|
||||
|
||||
## 2. Architektur
|
||||
|
||||
### 2.1 High-Level Übersicht
|
||||
Das System trennt Frontend (Streamlit) und Backend (FastAPI): Chat, Speichern und Feedback laufen ausschließlich über die REST-API. Einzige Ausnahme ist die Graph-Visualisierung, die aus Performance-Gründen direkt aus Qdrant liest und die API umgeht.
|
||||
|
||||
```mermaid
|
||||
graph TD
|
||||
User((User))
|
||||
|
||||
subgraph "Frontend Layer (Streamlit)"
|
||||
UI["ui.py Router"]
|
||||
ViewChat["Chat View"]
|
||||
ViewGraph["Graph View"]
|
||||
ViewEditor["Editor View"]
|
||||
Logic["Callbacks & State"]
|
||||
end
|
||||
|
||||
subgraph "Backend Layer (FastAPI)"
|
||||
API["main.py"]
|
||||
RouterChat["Chat / RAG"]
|
||||
RouterIngest["Ingest / Write"]
|
||||
CoreRet["Retriever Engine"]
|
||||
CoreIngest["Ingestion Pipeline"]
|
||||
end
|
||||
|
||||
subgraph "Infrastructure & Services"
|
||||
LLM["Ollama (Phi3/Nomic)"]
|
||||
DB[("Qdrant Vector DB")]
|
||||
FS["File System (.md)"]
|
||||
end
|
||||
|
||||
User <--> UI
|
||||
UI -- "REST (Chat, Save, Feedback)" --> API
|
||||
UI -. "Direct Read (Graph Viz Performance)" .-> DB
|
||||
API -- "Embeddings & Completion" --> LLM
|
||||
API -- "Read/Write" --> DB
|
||||
API -- "Read/Write (Source of Truth)" --> FS
|
||||
```
|
||||
|
||||
### 2.2 Datenfluss-Muster
|
||||
|
||||
#### A. Ingestion (Write)
|
||||
Vom Markdown zur Vektor-Datenbank.
|
||||
```mermaid
|
||||
graph LR
|
||||
MD["Markdown File"] --> Parser("Parser")
|
||||
Parser --> Chunker("Chunker")
|
||||
Chunker -- "Text Chunks" --> SemAn{"SemanticAnalyzer<br/>(LLM)"}
|
||||
SemAn -- "Smart Edges" --> Embedder("Embedder")
|
||||
Embedder --> DB[("Qdrant<br/>Points")]
|
||||
|
||||
style DB fill:#f9f,stroke:#333,stroke-width:2px
|
||||
style SemAn fill:#ff9,stroke:#333,stroke-width:2px
|
||||
```
|
||||
|
||||
#### B. Retrieval (Read)
|
||||
Die hybride Suche für Chat & RAG.
|
||||
```mermaid
|
||||
graph LR
|
||||
Query(["Query"]) --> Embed("Embedding")
|
||||
Embed --> Hybrid{"Hybrid Search"}
|
||||
|
||||
subgraph Search Components
|
||||
Vec["Vector Score"]
|
||||
Graph["Graph/Edge Bonus"]
|
||||
end
|
||||
|
||||
Vec --> Hybrid
|
||||
Graph --> Hybrid
|
||||
|
||||
Hybrid --> Rank("Re-Ranking")
|
||||
Rank --> Ctx["LLM Context"]
|
||||
```
|
||||
|
||||
#### C. Visualisierung (Graph)
|
||||
Der optimierte Pfad für das Frontend.
|
||||
```mermaid
|
||||
graph LR
|
||||
UI["Frontend UI"] --> Service("GraphService")
|
||||
Service -- "Direct Read" --> DB[("Qdrant<br/>Edges Collection")]
|
||||
DB --> Cyto["Cytoscape<br/>Rendering"]
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 3. Physische Architektur
|
||||
|
||||
Mindnet läuft in einer verteilten Umgebung (Post-WP15 Setup).
|
||||
|
||||
* **Windows 11 (VS Code):** Hier schreibst du Code. **Nie** direkt auf `main` arbeiten!
|
||||
* **Beelink (Runtime):** Der Server. Hier läuft die Software. Wir nutzen **Systemd-Services**:
|
||||
* **PROD:** API (8001) + UI (8501). Ordner: `~/mindnet`.
|
||||
* **DEV:** API (8002) + UI (8502). Ordner: `~/mindnet_dev`.
|
||||
* **Gitea:** Der "Safe" (Raspberry Pi). Speichert den Code und verwaltet Versionen.
|
||||
* **Windows 11 (VS Code):** Entwicklungsumgebung. **Nie** direkt auf `main` arbeiten!
|
||||
* **Beelink (Runtime):** Der Server hostet zwei Instanzen via Systemd:
|
||||
* **PROD:** API (Port 8001) + UI (Port 8501). Home: `~/mindnet`.
|
||||
* **DEV:** API (Port 8002) + UI (Port 8502). Home: `~/mindnet_dev`.
|
||||
* **Gitea (Raspberry Pi):** Versionskontrolle ("Safe"). Speichert den Code.
|
||||
|
||||
---
|
||||
|
||||
## 2. Projektstruktur & Referenz
|
||||
## 4. Projektstruktur & Modul-Referenz (Deep Dive)
|
||||
|
||||
### 2.1 Verzeichnisbaum
|
||||
Das System ist modular aufgebaut. Hier ist die detaillierte Analyse aller Komponenten.
|
||||
|
||||
### 4.1 Verzeichnisbaum
|
||||
|
||||
```text
|
||||
mindnet/
|
||||
├── app/
|
||||
│ ├── core/ # Ingestion, Chunker, Qdrant Wrapper
|
||||
│ ├── routers/ # FastAPI Endpoints
|
||||
│ ├── services/ # Ollama Client, Traffic Control
|
||||
│ ├── core/ # Business Logic & Algorithms
|
||||
│ ├── routers/ # API Interface (FastAPI)
|
||||
│ ├── services/ # External Integrations (LLM, DB)
|
||||
│ ├── models/ # Pydantic DTOs
|
||||
│ └── frontend/ # Streamlit UI Module
|
||||
├── config/ # YAML Configs (Single Source of Truth)
|
||||
├── scripts/ # CLI Tools (Import, Diagnose, Reset)
|
||||
├── tests/ # Pytest Suite & Smoke Scripts
|
||||
└── vault/ # Lokaler Test-Content
|
||||
│ └── frontend/ # UI Logic (Streamlit)
|
||||
├── config/ # Configuration Files (YAML)
|
||||
├── scripts/ # CLI Tools (Ops & Maintenance)
|
||||
└── vault/ # Local Content Storage
|
||||
```
|
||||
|
||||
### 2.2 Vollständige Datei-Referenz (Auto-Scan)
|
||||
### 4.2 Frontend Architecture (`app/frontend/`)
|
||||
|
||||
Eine Übersicht aller Skripte und Module im System.
|
||||
Das Frontend ist eine Streamlit-App, die sich wie eine Single-Page-Application (SPA) verhält.
|
||||
|
||||
| Datei/Pfad | Typ | Beschreibung |
|
||||
| Modul | Status | Verantwortung |
|
||||
| :--- | :--- | :--- |
|
||||
| **Backend Core** | | |
|
||||
| `app/main.py` | Skript | Bootstrap der FastAPI API. |
|
||||
| `app/config.py` | Config | Zentrale Konfiguration (Pydantic Settings). |
|
||||
| `app/core/ingestion.py` | Core Modul | Async Ingestion Service & Change Detection. |
|
||||
| `app/core/chunker.py` | Core Modul | Smart Chunker Orchestrator. |
|
||||
| `app/core/retriever.py` | Core Modul | Hybrider Such-Algorithmus (Semantik + Graph). |
|
||||
| `app/core/ranking.py` | Core Modul | Kombiniertes Scoring (WP-04). |
|
||||
| `app/core/graph_adapter.py` | Core Modul | Adjazenzaufbau & Subgraph-Expansion. |
|
||||
| `app/core/qdrant.py` | Core Modul | Qdrant Client Wrapper. |
|
||||
| `app/core/qdrant_points.py` | Core Modul | Robuste Point-Helper für Qdrant (Retry-Logik). |
|
||||
| `app/core/derive_edges.py` | Core Modul | Edge-Erzeugung aus Markdown. |
|
||||
| `app/core/edges.py` | Core Modul | Datenstrukturen für Kanten. |
|
||||
| `app/core/edges_writer.py` | Core Modul | Schreibt Kanten in die DB. |
|
||||
| `app/core/note_payload.py` | Core Modul | Builder für Note-Metadaten. |
|
||||
| `app/core/chunk_payload.py` | Core Modul | Builder für Chunk-Payloads. |
|
||||
| `app/core/type_registry.py` | Core Modul | Logik zum Laden der `types.yaml`. |
|
||||
| `app/core/schema_loader.py` | Core Modul | Lädt JSON-Schemas für Validierung. |
|
||||
| `app/core/env_vars.py` | Core Modul | Environment-Variablen Konstanten. |
|
||||
| **API Router** | | |
|
||||
| `app/routers/chat.py` | API Router | RAG Endpunkt & Hybrid Router. |
|
||||
| `app/routers/query.py` | API Router | Query-Endpunkte (WP-04). |
|
||||
| `app/routers/graph.py` | API Router | Graph-Endpunkte (WP-04). |
|
||||
| `app/routers/ingest.py` | API Router | Ingestion-Trigger & Analyse. |
|
||||
| `app/routers/feedback.py` | API Router | Feedback-Endpunkt. |
|
||||
| `app/routers/tools.py` | API Router | Tool-Definitionen für Ollama/n8n/MCP. |
|
||||
| `app/routers/admin.py` | API Router | Admin-/Monitoring-Endpunkte. |
|
||||
| **Services** | | |
|
||||
| `app/services/llm_service.py` | Service | LLM Client mit Traffic Control. |
|
||||
| `app/services/llm_ollama.py` | Service | Legacy: Ollama-Integration & Prompt-Bau. |
|
||||
| `app/services/embeddings_client.py` | Service | Async Text→Embedding Service. |
|
||||
| `app/services/semantic_analyzer.py` | Service | Smart Edge Validation & Filtering. |
|
||||
| `app/services/discovery.py` | Service | Backend Intelligence (Matrix-Logik). |
|
||||
| `app/services/feedback_service.py` | Service | Schreibt JSONL-Logs. |
|
||||
| **Frontend** | | |
|
||||
| `app/frontend/ui.py` | Frontend | Entrypoint (Streamlit). |
|
||||
| `app/frontend/ui_editor.py` | Frontend | Editor-View & Logic. |
|
||||
| `app/frontend/ui_chat.py` | Frontend | Chat-View. |
|
||||
| `app/frontend/ui_graph_cytoscape.py` | Frontend | Graph-Visualisierung (Modern). |
|
||||
| `app/frontend/ui_graph.py` | Frontend | Graph-Visualisierung (Legacy). |
|
||||
| `app/frontend/ui_graph_service.py` | Frontend | Datenaufbereitung für Graphen. |
|
||||
| `app/frontend/ui_callbacks.py` | Frontend | Event-Handler. |
|
||||
| `app/frontend/ui_api.py` | Frontend | Backend-Bridge. |
|
||||
| `app/frontend/ui_utils.py` | Frontend | Helper (Healing Parser). |
|
||||
| `app/frontend/ui_config.py` | Frontend | Konstanten (Farben, URLs). |
|
||||
| **CLI & Scripts** | | |
|
||||
| `scripts/import_markdown.py` | Skript | Haupt-Importer CLI. |
|
||||
| `scripts/reset_qdrant.py` | Skript | Löscht Collections (`--mode wipe`). |
|
||||
| `scripts/payload_dryrun.py` | Skript | Zeigt Payloads VOR dem Upsert. |
|
||||
| `scripts/edges_dryrun.py` | Skript | Erzeugt Edges ohne DB-Write. |
|
||||
| `scripts/edges_full_check.py` | Skript | Prüft Graph-Integrität. |
|
||||
| `scripts/resolve_unresolved_references.py`| Skript | Löst Wikilinks nachträglich auf. |
|
||||
| `scripts/audit_vault_vs_qdrant.py` | Skript | Konsistenz-Check File vs. DB. |
|
||||
| `scripts/audit_edges_vs_expectations.py`| Skript | Prüft Kanten gegen Erwartungswert. |
|
||||
| `scripts/setup_mindnet_collections.py` | Skript | Richtet Collections initial ein. |
|
||||
| `scripts/export_markdown.py` | Skript | Exportiert Qdrant zurück zu Markdown. |
|
||||
| `scripts/wp04_smoketest.py` | Skript | E2E-Schnelltest der WP-04 Endpunkte. |
|
||||
| `scripts/health_check_mindnet.py` | Skript | System Health Check. |
|
||||
| `scripts/report_hashes.py` | Skript | Übersicht bei Mehrfach-Hashes. |
|
||||
| `scripts/make_test_vault.py` | Skript | Erzeugt minimalen Test-Vault. |
|
||||
| `scripts/ollama_tool_runner.py` | Skript | Minimaler Tool-Caller für Ollama. |
|
||||
| **`ui.py`** | 🟢 **Core** | **Main Router.** Initialisiert Session-State und entscheidet anhand der Sidebar-Auswahl, welche View gerendert wird. |
|
||||
| **`ui_config.py`** | 🟢 **Config** | **Constants.** Zentraler Ort für Farben (`GRAPH_COLORS`), API-URLs und Timeouts. Änderungen am Look & Feel passieren hier. |
|
||||
| **`ui_chat.py`** | 🟢 **View** | **Chat UI.** Rendert Nachrichtenverlauf, Intent-Badges, Quellen-Expanders und Feedback-Buttons. |
|
||||
| **`ui_editor.py`** | 🟢 **View** | **Editor UI.** Markdown-Editor mit Live-Vorschau. Integriert "Intelligence" (KI-Link-Vorschläge). |
|
||||
| **`ui_graph_cytoscape.py`**| 🟢 **View** | **Modern Graph.** Interaktiver Graph basierend auf Cytoscape.js (COSE Layout). |
|
||||
| **`ui_graph.py`** | 🟡 **Legacy** | **Graph UI (Fallback).** Alte Implementierung mittels `streamlit-agraph`. |
|
||||
| **`ui_callbacks.py`** | 🟢 **Logic** | **State Controller.** Handhabt komplexe State-Übergänge (z.B. Graph -> Editor). |
|
||||
| **`ui_utils.py`** | 🟢 **Logic** | **Helper.** Enthält den **Healing Parser** (`parse_markdown_draft`), der defektes JSON/YAML von LLMs repariert. |
|
||||
| **`ui_api.py`** | 🟢 **Data** | **API Client.** Wrapper für Backend REST-Calls. |
|
||||
| **`ui_graph_service.py`** | 🟢 **Data** | **Performance Hack.** Greift direkt auf Qdrant zu (bypass API), um Graphen schnell zu laden. |
|
||||
|
||||
#### Frontend Design Patterns (Wichtig!)
|
||||
|
||||
1. **Active Inspector Pattern (`ui_graph_cytoscape.py`)**
|
||||
Um Re-Renders im Graphen zu vermeiden, nutzen wir CSS-Klassen. Wird ein Knoten angeklickt, ändert sich nur die CSS-Klasse (`.inspected`), aber die Physik-Simulation startet nicht neu. Das sorgt für ein stabiles UI-Gefühl.
|
||||
|
||||
2. **Resurrection Pattern (`ui_editor.py`)**
|
||||
Streamlit neigt dazu, Eingaben bei Re-Runs zu "vergessen". Der Editor synchronisiert seinen Inhalt aggressiv in den `session_state`.
|
||||
* Logik: `if widget_key not in session_state: restore_from_data_key()`.
|
||||
* Ergebnis: Texteingaben überleben Tab-Wechsel.
|
||||
|
||||
3. **Filesystem First (`ui_callbacks.py`)**
|
||||
Wenn man im Graphen auf "Bearbeiten" klickt:
|
||||
1. Versucht das System, die **echte Datei** von der Festplatte zu lesen.
|
||||
2. Nur wenn das fehlschlägt, wird der Text aus den Datenbank-Chunks rekonstruiert ("Stitching").
|
||||
Dies verhindert, dass veraltete Datenbank-Stände die echten Dateien überschreiben.
|
||||
|
||||
### 4.3 Backend Architecture (`app/`)
|
||||
|
||||
Das Backend ist das Herzstück. Es stellt die Logik via REST-API bereit.
|
||||
|
||||
| Layer | Datei | Status | Verantwortung |
|
||||
| :--- | :--- | :--- | :--- |
|
||||
| **Entry** | `app/main.py` | 🟢 **Core** | **Entrypoint.** Initialisiert FastAPI, CORS, und bindet alle Router ein. |
|
||||
| **Config** | `app/config.py` | 🟢 **Core** | **Settings.** Zentrale Konfiguration (Pydantic). Lädt Env-Vars für Qdrant, LLM und Pfade. |
|
||||
| **Router** | `app/routers/chat.py` | 🟢 **API** | **Conversation API.** Haupt-Endpunkt für Chat. Entscheidet zwischen Interview- und RAG-Modus. |
|
||||
| | `app/routers/ingest.py` | 🟢 **API** | **Write API.** Nimmt Markdown entgegen, steuert Ingestion und Discovery-Analyse. |
|
||||
| | `app/routers/query.py` | 🟢 **API** | **Search API.** Klassischer Hybrid-Retriever Endpunkt. |
|
||||
| | `app/routers/graph.py` | 🟢 **API** | **Viz API.** Liefert Knoten/Kanten für Frontend-Graphen (Cytoscape). |
|
||||
| | `app/routers/tools.py` | 🟢 **API** | **Agent Specs.** Liefert JSON-Schemas für die Integration in externe Agenten (Ollama/MCP). |
|
||||
| **Engine** | `app/core/ingestion.py` | ⚙️ **Core** | **Pipeline Controller.** Koordiniert Parsing, Hashing (Change-Detection) und DB-Upserts. |
|
||||
| | `app/core/retriever.py` | ⚙️ **Core** | **Search Engine.** Berechnet Scores (Vektor + Graph + Centrality) und baut Erklärungen. |
|
||||
| | `app/core/chunker.py` | ⚙️ **Core** | **Segmentation.** Zerlegt Text intelligent. Orchestriert `SemanticAnalyzer` für Smart Edges. |
|
||||
| | `app/core/parser.py` | ⚙️ **Core** | **I/O.** Liest Markdown robust (Encoding-Fallback), trennt Frontmatter/Body. |
|
||||
| | `app/core/derive_edges.py`| ⚙️ **Core** | **Link Extractor.** Findet Wikilinks, Callouts und Typed Relations im Text. |
|
||||
| | `app/core/note_payload.py`| ⚙️ **Core** | **Builder.** Erzeugt JSON für `mindnet_notes`. Vererbt Configs (Frontmatter > Type > Default). |
|
||||
| | `app/core/qdrant_points.py`| ⚙️ **Core** | **Object Mapper.** Wandelt Payloads in Qdrant `PointStruct`s um. Handhabt UUIDs. |
|
||||
| **Services** | `app/services/llm_service.py`| 🧠 **AI** | **AI Client.** Async Client für Ollama. Verwaltet Concurrency (Semaphore). |
|
||||
| | `app/services/embeddings_client.py`| 🧠 **AI** | **Vector Client.** Unified Client für Embeddings (Ollama/Nomic). Ersetzt lokale Modelle. |
|
||||
| | `app/services/discovery.py`| 🧠 **AI** | **Intelligence.** "Matrix Logic" für Link-Vorschläge (WP-11). |
|
||||
| | `app/services/semantic_analyzer.py`| 🧠 **AI** | **Filter.** KI-Validierung von Kanten im Hintergrund (Background Priority). |
|
||||
|
||||
### 4.4 Scripts & Tooling (Die Admin-Toolbox)
|
||||
|
||||
Der Ordner `scripts/` enthält verifizierte Werkzeuge für den Betrieb.
|
||||
|
||||
#### 1. Script-Übersicht
|
||||
|
||||
| Skript | Status | Zweck | Argumente & Parameter | Beispielaufruf |
|
||||
| :--- | :--- | :--- | :--- | :--- |
|
||||
| **`import_markdown.py`** | 🟢 **Core** | **Ingestion.** Scannt den Vault und importiert Dateien asynchron in Qdrant. Beachtet Hashes. | `--vault PATH` (Quellordner)<br>`--prefix TEXT` (Collection Prefix)<br>`--force` (Erzwingt Re-Index)<br>`--apply` (Schreibt in DB; sonst Dry-Run) | `python3 scripts/import_markdown.py --vault ./vault --apply` |
|
||||
| **`reset_qdrant.py`** | 🟢 **Core** | **DB-Reset.** Löscht Collections (`wipe`) oder leert sie (`truncate`). Legt Indizes neu an. | `--mode {wipe,truncate}`<br>`--prefix TEXT`<br>`--yes` (Keine Rückfrage)<br>`--no-indexes` (Skip Index-Erstellung) | `python3 -m scripts.reset_qdrant --mode wipe --yes` |
|
||||
| **`health_check_mindnet.py`** | 🟢 **Ops** | **Monitoring.** Prüft API-Verfügbarkeit (`/query`) und Antwortqualität. Standalone (keine Imports). | `--url URL`<br>`--query TEXT`<br>`--top-k INT`<br>`--strict` (Exit-Code 1 bei Warnings) | `python3 scripts/health_check_mindnet.py --strict` |
|
||||
| **`payload_dryrun.py`** | 🟡 **Dev** | **Debugger.** Simuliert die JSON-Erstellung (Payloads) ohne DB-Schreibzugriff. Prüft `types.yaml`. | `--vault PATH`<br>`--note-id ID` (Filter)<br>`--with-edges` (Zeigt Kanten) | `python3 -m scripts.payload_dryrun --vault ./vault --note-id "projekt-alpha"` |
|
||||
| **`edges_full_check.py`** | 🟡 **Audit** | **Integritäts-Check.** Zählt Kanten in der DB und prüft logische Konsistenz (z.B. Chunk-Reihenfolge). | Keine (Liest Konfiguration nur aus Umgebungsvariablen `.env`) | `python3 -m scripts.edges_full_check` |
|
||||
| **`resolve_unresolved_references.py`** | 🔵 **Maint** | **Link-Healer.** Repariert "tote" Links in der DB nachträglich und erzeugt Backlinks. | `--prefix TEXT`<br>`--limit INT`<br>`--apply` (Schreibt Änderungen) | `python3 -m scripts.resolve_unresolved_references --apply` |
|
||||
| **`export_markdown.py`** | ⚪ **Utility** | **Reverse ETL.** Exportiert den Datenbank-Inhalt zurück in Markdown-Dateien (Backup/Recovery). | `--out PATH` (Ziel)<br>`--note-id ID`<br>`--include-edges {yaml,footer}`<br>`--flatten-paths` | `python3 -m scripts.export_markdown --out ./backup` |
|
||||
|
||||
#### 2. Einsatzszenarien & Bewertung
|
||||
|
||||
* **🟢 Essentiell für den Betrieb (Must-Have):**
|
||||
* **`import_markdown.py`**: Das Arbeitspferd. Ohne dieses Skript kommen keine Daten ins System.
|
||||
* **`reset_qdrant.py`**: Zwingend notwendig für CI/CD-Pipelines.
|
||||
* **`health_check_mindnet.py`**: Ideal für Docker-Healthchecks.
|
||||
|
||||
* **🟡 Hilfreich für Entwicklung & Debugging (Should-Have):**
|
||||
* **`payload_dryrun.py`**: Wertvoll, wenn man an der `config/types.yaml` arbeitet.
|
||||
* **`edges_full_check.py`**: Wichtiges Diagnose-Tool für den Graphen.
|
||||
|
||||
* **🔵 Wartung & Spezialfälle (Nice-to-Have):**
|
||||
* **`resolve_unresolved_references.py`**: Sinnvoll in einem "Knowledge Garden", wo oft Links auf noch nicht existierende Notizen gesetzt werden ("Red Links"). Dieses Skript "heilt" den Graphen nachträglich.
|
||||
* **`export_markdown.py`**: Ein Notfall-Tool. Da Mindnet nach dem Prinzip "Filesystem First" arbeitet, ist ein Export aus der DB selten nötig, kann aber bei versehentlichem Löschen von Dateien lebensrettend sein.
|
||||
|
||||
---
|
||||
|
||||
## 3. Core-Module im Detail (Architektur)
|
||||
## 5. Maintenance & "Kill List"
|
||||
|
||||
Hier wird erklärt, *wie* die wichtigsten Komponenten unter der Haube arbeiten.
|
||||
Folgende Dateien wurden im Audit v2.6 als veraltet, redundant oder "Zombie-Code" identifiziert und sollten entfernt werden.
|
||||
|
||||
### 3.1 Der Importer (`scripts.import_markdown`)
|
||||
Dies ist das komplexeste Modul.
|
||||
* **Orchestrierung:** Es ruft `app.core.chunker` für die Textzerlegung und `app.services.semantic_analyzer` für Smart Edges auf.
|
||||
* **Idempotenz:** Der Importer kann beliebig oft laufen. Er nutzt deterministische IDs (UUIDv5) und überschreibt vorhandene Einträge konsistent.
|
||||
* **Robustheit:** In `ingestion.py` sind Mechanismen wie Change Detection (Hash-Vergleich) und Robust File I/O implementiert.
|
||||
|
||||
### 3.2 Der Hybrid Router (`app.routers.chat`)
|
||||
Hier liegt die Logik für Intent Detection (WP06) und Interview-Modus (WP07).
|
||||
* **Question Detection:** Prüft zuerst regelbasiert, ob der Input eine Frage ist (`?`, W-Wörter). Falls ja -> RAG.
|
||||
* **Keyword Match:** Prüft Keywords aus `decision_engine.yaml` und `types.yaml`.
|
||||
* **Priority:** Ruft `llm_service` mit `priority="realtime"` auf, um die Import-Warteschlange zu umgehen.
|
||||
|
||||
### 3.3 Der Retriever (`app.core.retriever`)
|
||||
Hier passiert das Scoring (WP04a).
|
||||
* **Hybrid Search:** Der Chat-Endpoint erzwingt `mode="hybrid"`.
|
||||
* **Strategic Retrieval:** In `chat.py` wird der Retriever ggf. *zweimal* aufgerufen, wenn ein Intent (z.B. `DECISION`) eine Injection (`value`) erfordert.
|
||||
|
||||
### 3.4 Das Frontend (`app.frontend.ui`)
|
||||
Eine Streamlit-App (WP10/19).
|
||||
* **Resurrection Pattern:** Das UI nutzt ein spezielles State-Management (`st.session_state`), um Eingaben bei Tab-Wechseln (Chat <-> Editor) zu erhalten. Widgets synchronisieren sich via Callbacks.
|
||||
* **Healing Parser:** Die Funktion `parse_markdown_draft` repariert defekte YAML-Frontmatter (z.B. fehlendes `---`) vom LLM automatisch.
|
||||
|
||||
### 3.5 Traffic Control (`app.services.llm_service`)
|
||||
Neu in v2.6. Stellt sicher, dass Batch-Prozesse (Import) den Live-Chat nicht ausbremsen.
|
||||
* **Methode:** `generate_raw_response(..., priority="background")` aktiviert eine Semaphore.
|
||||
* **Limit:** Konfigurierbar über `MINDNET_LLM_BACKGROUND_LIMIT` (Default: 2).
|
||||
| Datei | Diagnose | Empfohlene Aktion |
|
||||
| :--- | :--- | :--- |
|
||||
| `app/embed_server.py` | **Zombie.** Alter Standalone-Server. | 🗑️ Löschen |
|
||||
| `app/embeddings.py` | **Zombie.** Veraltete lokale Lib. **(Achtung: Erst Importe in `main.py` entfernen!)** | 🗑️ Löschen |
|
||||
| `app/routers/embed_router.py` | **Zombie.** Nutzt `embeddings.py`. | 🗑️ Löschen |
|
||||
| `app/routers/qdrant_router.py`| **Deprecated.** Keine Logik, nur CRUD. | 🗑️ Löschen |
|
||||
| `app/core/edges.py` | **Redundant.** Ersetzt durch `derive_edges.py`. | 🗑️ Löschen |
|
||||
| `app/core/ranking.py` | **Redundant.** Logik in `retriever.py` integriert. | 🗑️ Löschen |
|
||||
| `app/core/type_registry.py` | **Redundant.** Logik in `ingestion.py` integriert. | 🗑️ Löschen |
|
||||
| `app/core/env_vars.py` | **Veraltet.** Ersetzt durch `config.py`. | 🗑️ Löschen |
|
||||
| `app/services/llm_ollama.py` | **Veraltet.** Ersetzt durch `llm_service.py`. | 🗑️ Löschen |
|
||||
|
||||
---
|
||||
|
||||
## 4. Lokales Setup (Development)
|
||||
## 6. Lokales Setup (Development)
|
||||
|
||||
**Voraussetzungen:** Python 3.10+, Docker, Ollama.
|
||||
|
||||
|
|
@ -170,16 +311,18 @@ ollama pull nomic-embed-text
|
|||
**Konfiguration (`.env`):**
|
||||
```ini
|
||||
QDRANT_URL="http://localhost:6333"
|
||||
MINDNET_OLLAMA_URL="http://localhost:11434"
|
||||
MINDNET_LLM_MODEL="phi3:mini"
|
||||
MINDNET_EMBEDDING_MODEL="nomic-embed-text"
|
||||
COLLECTION_PREFIX="mindnet_dev"
|
||||
VECTOR_DIM=768
|
||||
MINDNET_LLM_BACKGROUND_LIMIT=2
|
||||
MINDNET_API_URL="http://localhost:8002"
|
||||
MINDNET_LLM_TIMEOUT=300.0
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 5. Der Entwicklungs-Zyklus (Workflow)
|
||||
## 7. Der Entwicklungs-Zyklus (Workflow)
|
||||
|
||||
### Phase 1: Windows (Code)
|
||||
1. **Basis aktualisieren:** `git checkout main && git pull`.
|
||||
|
|
@ -219,7 +362,7 @@ Wenn alles getestet ist:
|
|||
|
||||
---
|
||||
|
||||
## 6. Erweiterungs-Guide: "Teach-the-AI"
|
||||
## 8. Erweiterungs-Guide: "Teach-the-AI"
|
||||
|
||||
Mindnet lernt nicht durch Training (Fine-Tuning), sondern durch **Konfiguration** und **Vernetzung**.
|
||||
|
||||
|
|
@ -227,6 +370,7 @@ Mindnet lernt nicht durch Training (Fine-Tuning), sondern durch **Konfiguration*
|
|||
1. **Physik (`config/types.yaml`):**
|
||||
```yaml
|
||||
risk:
|
||||
chunk_profile: sliding_short
|
||||
retriever_weight: 0.90 # Sehr wichtig
|
||||
edge_defaults: ["blocks"] # Automatische Kante
|
||||
detection_keywords: ["gefahr", "risiko"]
|
||||
|
|
@ -238,21 +382,20 @@ Mindnet lernt nicht durch Training (Fine-Tuning), sondern durch **Konfiguration*
|
|||
```
|
||||
*Ergebnis:* Wenn der Intent `DECISION` erkannt wird, sucht das System nun auch aktiv nach Risiken.
|
||||
|
||||
### Workflow B: Interview-Schema anpassen (WP07)
|
||||
Wenn Mindnet neue Fragen stellen soll (z.B. "Budget" bei Projekten):
|
||||
1. **Schema (`config/types.yaml`):**
|
||||
```yaml
|
||||
project:
|
||||
schema:
|
||||
- "Titel"
|
||||
- "Ziel"
|
||||
- "Budget (Neu)"
|
||||
```
|
||||
2. **Kein Code nötig:** Der `One-Shot Extractor` (Prompt Template) liest diese Liste dynamisch.
|
||||
### Workflow B: Graph-Farben ändern
|
||||
1. Öffne `app/frontend/ui_config.py`.
|
||||
2. Bearbeite das Dictionary `GRAPH_COLORS`.
|
||||
|
||||
```python
|
||||
GRAPH_COLORS = {
|
||||
"project": "#FF4B4B",
|
||||
"risk": "#8B0000" # Neu
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 7. Tests & Debugging
|
||||
## 9. Tests & Debugging
|
||||
|
||||
**Unit Tests (Pytest):**
|
||||
```bash
|
||||
|
|
@ -280,7 +423,7 @@ python tests/test_feedback_smoke.py --url http://localhost:8002/query
|
|||
|
||||
---
|
||||
|
||||
## 8. Troubleshooting & One-Liners
|
||||
## 10. Troubleshooting & One-Liners
|
||||
|
||||
**DB komplett zurücksetzen (Vorsicht!):**
|
||||
```bash
|
||||
|
|
@ -288,6 +431,12 @@ python tests/test_feedback_smoke.py --url http://localhost:8002/query
|
|||
python3 -m scripts.reset_qdrant --mode wipe --prefix "mindnet_dev" --yes
|
||||
```
|
||||
|
||||
**Graphen reparieren (Red Links auflösen):**
|
||||
Nutze dies, wenn Kanten im Graphen ins Leere zeigen (weil die Notiz beim Import noch nicht da war).
|
||||
```bash
|
||||
python3 -m scripts.resolve_unresolved_references --apply
|
||||
```
|
||||
|
||||
**Eine einzelne Datei inspizieren (Parser-Sicht):**
|
||||
```bash
|
||||
python3 tests/inspect_one_note.py --file ./vault/MeinFile.md
|
||||
|
|
@ -300,5 +449,9 @@ journalctl -u mindnet-ui-dev -f
|
|||
```
|
||||
|
||||
**"UnicodeDecodeError in .env":**
|
||||
* Ursache: Umlaute oder Sonderzeichen in der `.env`.
|
||||
* Lösung: Datei bereinigen (nur ASCII) und sicherstellen, dass UTF-8 ohne BOM genutzt wird.
|
||||
* **Ursache:** Umlaute oder Sonderzeichen in der `.env`.
|
||||
* **Lösung:** Datei bereinigen (nur ASCII) und sicherstellen, dass UTF-8 ohne BOM genutzt wird.
|
||||
|
||||
**"Read timed out" im Frontend:**
|
||||
* **Ursache:** Smart Edges brauchen länger als 60s.
|
||||
* **Lösung:** `MINDNET_API_TIMEOUT=300.0` in `.env`.
|
||||
152
docs/05_Development/05_genai_best_practices.md
Normal file
152
docs/05_Development/05_genai_best_practices.md
Normal file
|
|
@ -0,0 +1,152 @@
|
|||
---
|
||||
doc_type: developer_guide
|
||||
audience: developer, architect
|
||||
scope: genai, prompting, workflow
|
||||
status: active
|
||||
version: 1.0
|
||||
context: "Leitfaden für die effiziente Softwareentwicklung mit LLMs im Mindnet-Projekt."
|
||||
---
|
||||
|
||||
# GenAI Development Best Practices & Prompt Library
|
||||
|
||||
Dieser Leitfaden definiert Standards für die Zusammenarbeit mit KI-Modellen (ChatGPT, Claude, Gemini) im Rahmen der Mindnet-Entwicklung. Ziel ist es, Halluzinationen zu minimieren, den Kontext effizient zu nutzen und die Dokumentation synchron zum Code zu halten.
|
||||
|
||||
---
|
||||
|
||||
## 1. Grundprinzipien
|
||||
|
||||
### 1.1 Context is King (aber teuer)
|
||||
LLMs haben ein begrenztes Kontext-Fenster.
|
||||
* **Don't:** "Hier ist mein ganzer Code, fix den Bug." (Führt zu Vergessen von Details).
|
||||
* **Do:** Nutze die **"Map & Fetch" Strategie**:
|
||||
1. Gib der KI eine Inhaltsübersicht (z.B. `project_scan_report.json` oder `tree`).
|
||||
2. Lass die KI entscheiden, welche Dateien sie für die Aufgabe benötigt.
|
||||
3. Lade nur diese Dateien hoch.
|
||||
|
||||
### 1.2 Trust but Verify (Validierung)
|
||||
* **Code:** Führe generierten Code **immer** lokal aus (Unit Tests oder Smoke Tests), bevor du ihn committest.
|
||||
* **Pfade:** KIs erfinden gerne Pfade (z.B. `app/utils.py`, obwohl es `app/core/utils.py` ist). Prüfe Importe immer gegen die Projektstruktur.
|
||||
* **Security:** Achte darauf, dass keine Secrets (API-Keys) in den Prompts landen und keine Secrets vom LLM halluziniert und hardcodiert werden.
|
||||
|
||||
### 1.3 Atomic Chats
|
||||
Nutze für verschiedene Aufgaben frische Chat-Kontexte.
|
||||
* Ein Chat für "Frontend Refactoring".
|
||||
* Ein neuer Chat für "Documentation Update".
|
||||
* *Grund:* Alte Chats akkumulieren "Rauschen" und führen zu Fehlern.
|
||||
|
||||
---
|
||||
|
||||
## 2. Prompt Library (Standard-Vorlagen)
|
||||
|
||||
Nutze diese Prompts, um konsistente Ergebnisse zu erzielen.
|
||||
|
||||
### 2.1 Der "Render-Safe" Prompt (System Instruction)
|
||||
**Wann nutzen?** Immer am Anfang eines Chats, wenn die KI Markdown-Dateien oder Code generieren soll.
|
||||
**Zweck:** Verhindert, dass die Antwort im Chat-Fenster abbricht, weil die KI Code-Blöcke falsch verschachtelt.
|
||||
|
||||
```text
|
||||
# SYSTEM-ANWEISUNG: SICHERES MARKDOWN-RENDERING
|
||||
|
||||
Du agierst als technischer Assistent. Deine Aufgabe ist das Erstellen von Markdown-Dateien, die oft selbst Code-Blöcke enthalten.
|
||||
|
||||
**DAS PROBLEM:**
|
||||
Wenn du eine Markdown-Datei generierst, die Code-Blöcke (```) enthält, und diese Ausgabe selbst in einen Code-Block packst, interpretiert das Chat-Interface das erste innere ``` oft fälschlicherweise als das Ende der Ausgabe.
|
||||
|
||||
**DIE REGEL (STRIKT BEFOLGEN):**
|
||||
Um eine ununterbrochene Darstellung zu garantieren, musst du zwingend eine der folgenden Kapselungs-Methoden anwenden:
|
||||
|
||||
### Methode A: Die 4-Backtick-Methode (Bevorzugt)
|
||||
Umschließe den **gesamten** Datei-Inhalt mit **4 Backticks** (````).
|
||||
Dies erlaubt dir, innerhalb der Datei normale 3 Backticks zu verwenden.
|
||||
|
||||
### Methode B: Die 4-Space-Einrückung (Alternative)
|
||||
Wenn du außen 3 Backticks verwendest, darfst du im Inneren **KEINE** Backticks verwenden.
|
||||
Stattdessen müssen alle inneren Code-Beispiele mit **4 Leerzeichen (Spaces)** eingerückt werden.
|
||||
|
||||
**ZUSAMMENFASSUNG:**
|
||||
Generiere niemals verschachtelte 3-Backtick-Blöcke innerhalb von 3-Backtick-Blöcken.
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 2.2 Der "Doku-Update" Prompt (Nach WP-Abschluss)
|
||||
**Wann nutzen?** Wenn ein Feature fertig codiert ist und die Doku (`docs/`) nachgezogen werden muss.
|
||||
**Zweck:** Automatische Identifikation der betroffenen Doku-Dateien ohne manuelles Suchen.
|
||||
|
||||
**Vorbedingung:** Lade `docs/00_General/00_documentation_map.md` und `docs/06_Roadmap/06_active_roadmap.md` hoch.
|
||||
|
||||
```text
|
||||
Du agierst als **Technical Documentation Lead**.
|
||||
|
||||
**Kontext:**
|
||||
Wir haben soeben ein Workpackage (WP) abgeschlossen. Der Code ist implementiert.
|
||||
Jetzt müssen wir die Systemdokumentation (Mindnet v2.6 Modular Docs) aktualisieren.
|
||||
|
||||
**Deine Aufgabe - Phase 1: Identifikation**
|
||||
Analysiere die durchgeführten Änderungen dieses Workpackages (aus dem Chat-Verlauf).
|
||||
Nutze die beiliegende `00_documentation_map.md`, um zu identifizieren, welche Dokumentations-Module betroffen sind.
|
||||
|
||||
**Mapping-Logik:**
|
||||
* Neue Features? -> `00_glossary.md`, `02_Concepts/*`
|
||||
* DB/Payloads geändert? -> `03_tech_data_model.md`
|
||||
* Import/Algorithmus geändert? -> `03_tech_ingestion_pipeline.md`, `03_tech_retrieval_scoring.md`
|
||||
* Neue Configs? -> `03_tech_configuration.md`, `04_admin_operations.md`
|
||||
* UI/UX geändert? -> `01_User_Manual/*`, `03_tech_frontend.md`
|
||||
|
||||
**Output für Phase 1:**
|
||||
Erstelle eine **Liste der betroffenen Dateien** mit Begründung.
|
||||
Fordere mich explizit auf, dir diese Dateien hochzuladen.
|
||||
|
||||
---
|
||||
|
||||
**Deine Aufgabe - Phase 2: Sequenzielle Bearbeitung**
|
||||
Sobald ich die Dateien hochgeladen habe:
|
||||
1. Nimm dir **eine** Datei aus der Liste vor.
|
||||
2. Schreibe den kompletten, aktualisierten Inhalt (Markdown).
|
||||
* *Wichtig:* Halte dich an den bestehenden Stil und die "Render-Safe"-Regel (4 Backticks).
|
||||
3. **Warte** nach der Ausgabe auf mein "OK", bevor du die nächste Datei bearbeitest.
|
||||
|
||||
**Bist du bereit für die Analyse?**
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 2.3 Der "Code Architect" Prompt (Refactoring & Analyse)
|
||||
**Wann nutzen?** Wenn du dich in den Code einarbeiten willst oder aufräumen musst.
|
||||
**Vorbedingung:** Führe lokal `deep_scan.py` aus und lade `project_scan_report.json` hoch.
|
||||
|
||||
```text
|
||||
Du agierst als **Senior Software Architect**.
|
||||
|
||||
**INPUT:**
|
||||
Ich habe dir die Datei `project_scan_report.json` hochgeladen. Sie enthält eine Liste aller Dateien und ihrer Import-Beziehungen.
|
||||
|
||||
**DIE SOLL-STRUKTUR (4 SÄULEN):**
|
||||
Jede Datei muss einem dieser Zweige zugeordnet werden können:
|
||||
1. **Backend:** via `app/main.py`
|
||||
2. **Frontend:** via `app/frontend/ui.py`
|
||||
3. **Batch/Ops:** via `scripts/` (Produktions-Tools)
|
||||
4. **Tests:** via `tests/`
|
||||
|
||||
**DEINE AUFGABE:**
|
||||
Analysiere das JSON. Identifiziere "Zombies" (Dateien, die nirgends importiert werden und keinen klaren Entrypoint haben).
|
||||
Erstelle eine:
|
||||
1. **Modul-Tabelle** (Wer ruft wen auf?).
|
||||
2. **Lösch-Vorschlagsliste** (Dead Code).
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 3. Workflow für ein Workpackage (WP)
|
||||
|
||||
Ein typisches Mindnet-Feature wird so entwickelt:
|
||||
|
||||
1. **Start:** Neuer Chat. Prompt **2.1 (Render-Safe)** eingeben.
|
||||
2. **Kontext:** `project_scan_report.json` oder relevante Core-Dateien hochladen.
|
||||
3. **Code:** Feature implementieren (Iterativ).
|
||||
4. **Test:** Code lokal validieren.
|
||||
5. **Doku:**
|
||||
* Neuer Chat (optional, für sauberen Kontext).
|
||||
* Prompt **2.1** + Prompt **2.2 (Doku-Update)** eingeben.
|
||||
* Doku aktualisieren lassen.
|
||||
6. **Commit:** Code + Doku zusammen pushen.
|
||||
|
|
@ -2,18 +2,18 @@
|
|||
doc_type: roadmap
|
||||
audience: product_owner, developer
|
||||
status: active
|
||||
version: 2.6
|
||||
version: 2.7
|
||||
context: "Aktuelle Planung für kommende Features (ab WP16), Release-Strategie und Historie der abgeschlossenen WPs."
|
||||
---
|
||||
|
||||
# Mindnet Active Roadmap
|
||||
|
||||
**Aktueller Stand:** v2.6.0 (Post-WP19)
|
||||
**Fokus:** Visualisierung, Exploration & Deep Search.
|
||||
**Aktueller Stand:** v2.6.0 (Post-WP15/WP19)
|
||||
**Fokus:** Visualisierung, Exploration & Intelligent Ingestion.
|
||||
|
||||
## 1. Programmstatus
|
||||
|
||||
Wir haben mit der Implementierung des Graph Explorers (WP19) einen Meilenstein in **Phase E (Maintenance & Scaling)** erreicht. Die Architektur ist nun modular. Der nächste Schritt (WP19a) vertieft die Analyse-Fähigkeiten.
|
||||
Wir haben mit der Implementierung des Graph Explorers (WP19) und der Smart Edge Allocation (WP15) die Basis für ein intelligentes, robustes System gelegt. Der nächste Schritt (WP19a) vertieft die Analyse, während WP16 die "Eingangs-Intelligenz" erhöht.
|
||||
|
||||
| Phase | Fokus | Status |
|
||||
| :--- | :--- | :--- |
|
||||
|
|
@ -43,14 +43,14 @@ Eine Übersicht der implementierten Features zum schnellen Auffinden von Funktio
|
|||
| **WP-10** | Web UI | Streamlit-Frontend als Ersatz für das Terminal. |
|
||||
| **WP-10a**| Draft Editor | GUI-Komponente zum Bearbeiten und Speichern generierter Notizen. |
|
||||
| **WP-11** | Backend Intelligence | `nomic-embed-text` (768d) und Matrix-Logik für Kanten-Typisierung. |
|
||||
| **WP-15** | Smart Edge Allocation | LLM-Filter für Kanten in Chunks + Traffic Control (Semaphore). |
|
||||
| **WP-15** | Smart Edge Allocation | LLM-Filter für Kanten in Chunks + Traffic Control (Semaphore) + Strict Chunking. |
|
||||
| **WP-19** | Graph Visualisierung | **Frontend Modularisierung:** Umbau auf `ui_*.py`.<br>**Graph Engines:** Parallelbetrieb von Cytoscape (COSE) und Agraph.<br>**Tools:** "Single Source of Truth" Editor, Persistenz via URL. |
|
||||
|
||||
---
|
||||
|
||||
## 3. Offene Workpackages (Planung)
|
||||
|
||||
Diese Features stehen als nächstes an.
|
||||
Diese Features stehen als nächstes an oder befinden sich in der Umsetzung.
|
||||
|
||||
### WP-19a – Graph Intelligence & Discovery (Sprint-Fokus)
|
||||
**Status:** 🚀 Startklar
|
||||
|
|
@ -59,11 +59,26 @@ Diese Features stehen als nächstes an.
|
|||
* **Filter-Logik:** "Zeige nur Wege, die zu `type:decision` führen".
|
||||
* **Chunk Inspection:** Umschaltbare Granularität (Notiz vs. Chunk) zur Validierung des Smart Chunkers.
|
||||
|
||||
### WP-16 – Auto-Discovery & Enrichment
|
||||
### WP-14 – Review / Refactoring / Dokumentation
|
||||
**Status:** 🟡 Laufend (Phase E)
|
||||
**Ziel:** Technische Schulden abbauen, die durch schnelle Feature-Entwicklung (WP15/WP19) entstanden sind.
|
||||
* **Refactoring `chunker.py`:** Die Datei ist monolithisch geworden (Parsing, Strategien, LLM-Orchestrierung).
|
||||
* *Lösung:* Aufteilung in ein Package `app/core/chunking/` mit Modulen (`strategies.py`, `orchestration.py`, `utils.py`).
|
||||
* **Dokumentation:** Kontinuierliche Synchronisation von Code und Docs (v2.6 Stand).
|
||||
|
||||
### WP-16 – Auto-Discovery & Intelligent Ingestion
|
||||
**Status:** 🟡 Geplant
|
||||
**Ziel:** Automatisches Erkennen von fehlenden Kanten in "dummem" Text *vor* der Speicherung.
|
||||
* **Problem:** Nutzer vergessen Wikilinks.
|
||||
* **Lösung:** Ein "Enricher" scannt Text vor dem Import, findet Keywords (z.B. "Mindnet") und schlägt Links vor (`[[Mindnet]]`).
|
||||
**Ziel:** Das System soll "dumme" Textdateien beim Import automatisch analysieren, strukturieren und anreichern, bevor sie gespeichert werden.
|
||||
**Kern-Features:**
|
||||
1. **Smart Link Enricher:** Automatisches Erkennen von fehlenden Kanten in Texten ohne explizite Wikilinks. Ein "Enricher" scannt Text vor dem Import, findet Keywords (z.B. "Mindnet") und schlägt Links vor (`[[Mindnet]]`).
|
||||
2. **Structure Analyzer (Auto-Strategy):**
|
||||
* *Problem:* Manuelle Zuweisung von `chunk_profile` in `types.yaml` ist starr.
|
||||
* *Lösung:* Vorschalten einer Analysestufe im Importer (`chunker.py`), die die Text-Topologie prüft und die Strategie wählt.
|
||||
* *Metrik 1 (Heading Density):* Verhältnis `Anzahl Überschriften / Wortanzahl`. Hohe Dichte (> 1/200) -> Indikator für `structured_smart_edges`. Niedrige Dichte -> `sliding_smart_edges`.
|
||||
* *Metrik 2 (Variance):* Regelmäßigkeit der Abstände zwischen Headings.
|
||||
3. **Context-Aware Hierarchy Merging:**
|
||||
* *Problem:* Leere Zwischenüberschriften (z.B. "Tier 2") gingen früher als bedeutungslose Chunks verloren oder wurden isoliert.
|
||||
* *Lösung:* Generalisierung der Logik aus WP-15, die leere Eltern-Elemente automatisch mit dem ersten Kind-Element verschmilzt ("Tier 2 + MP1"), um den Kontext für das Embedding zu wahren.
|
||||
|
||||
### WP-17 – Conversational Memory (Gedächtnis)
|
||||
**Status:** 🟡 Geplant
|
||||
|
|
@ -77,6 +92,11 @@ Diese Features stehen als nächstes an.
|
|||
* **Feature:** Cronjob `check_graph_integrity.py`.
|
||||
* **Funktion:** Findet "Dangling Edges" (Links auf gelöschte Notizen) und repariert/löscht sie.
|
||||
|
||||
### WP-13 – MCP-Integration & Agenten-Layer
|
||||
**Status:** 🟡 Geplant
|
||||
**Ziel:** mindnet als MCP-Server bereitstellen, damit Agenten (Claude Desktop, OpenAI) standardisierte Tools nutzen können.
|
||||
* **Umfang:** MCP-Server mit Tools (`mindnet_query`, `mindnet_explain`, etc.).
|
||||
|
||||
### WP-20 – Cloud Hybrid Mode (Optional)
|
||||
**Status:** ⚪ Optional
|
||||
**Ziel:** "Turbo-Modus" für Massen-Imports.
|
||||
|
|
@ -91,4 +111,6 @@ graph TD
|
|||
WP19(Graph Viz) --> WP19a(Discovery)
|
||||
WP19a --> WP17(Memory)
|
||||
WP15(Smart Edges) --> WP16(Auto-Discovery)
|
||||
WP15 --> WP14(Refactoring)
|
||||
WP03(Import) --> WP18(Health Check)
|
||||
WP03/WP04 --> WP13(MCP)
|
||||
202
docs/06_Roadmap/06_handover_prompts.md
Normal file
202
docs/06_Roadmap/06_handover_prompts.md
Normal file
|
|
@ -0,0 +1,202 @@
|
|||
---
|
||||
doc_type: operations
|
||||
audience: developer, admin
|
||||
status: active
|
||||
version: 1.0
|
||||
context: "Sammlung von Initialisierungs-Prompts für neue Chat-Sessions. Jeder Prompt entspricht dem Projektauftrag für ein spezifisches Workpackage."
|
||||
---
|
||||
|
||||
# Mindnet WP-Handover Prompts
|
||||
|
||||
**Verwendung:**
|
||||
Kopiere den entsprechenden Block in ein **neues** Chat-Fenster, um die KI-Instanz exakt auf den Kontext und die Ziele des Workpackages einzustellen.
|
||||
|
||||
---
|
||||
|
||||
## WP-19a: Graph Intelligence & Discovery
|
||||
|
||||
**Status:** 🚀 Startklar
|
||||
**Fokus:** Frontend-Erweiterung, Semantische Suche, Filterung.
|
||||
|
||||
```text
|
||||
Du bist der Lead Developer für "Mindnet", ein lokales RAG-System (Python/FastAPI/Streamlit/Qdrant).
|
||||
Wir starten jetzt **WP-19a: Graph Intelligence & Discovery**.
|
||||
|
||||
**Status Quo (v2.6.0):**
|
||||
- Backend: Async API mit `/query` (Hybrid Search) und `/chat`.
|
||||
- Frontend: Modularisiertes Streamlit (`ui.py`, `ui_graph.py`).
|
||||
- Daten: Qdrant Indizes (`notes`, `chunks`, `edges`) sind gefüllt.
|
||||
|
||||
**Dein Auftrag (WP-19a):**
|
||||
Implementiere "Deep Dive" Werkzeuge im Frontend, um den Graphen nicht nur zu sehen, sondern zu verstehen.
|
||||
1. **Neues UI-Modul:** Erstelle `app/frontend/ui_discovery.py`.
|
||||
2. **Discovery Tab:** Implementiere eine Oberfläche für:
|
||||
- Semantische Suche (ohne Chat-Modus).
|
||||
- Wildcard-Filter ("Zeige alle Notes vom Typ 'decision'").
|
||||
- Pfad-Analyse ("Wie sind Note A und Note B verbunden?").
|
||||
3. **Chunk Inspection:** Baue einen Toggle, der im Graph-Explorer zwischen "Note-View" (grob) und "Chunk-View" (fein) umschaltet.
|
||||
|
||||
**Regeln:**
|
||||
- Nutze `st.session_state` für Persistenz zwischen Re-Renders.
|
||||
- Halte die Business-Logik im Backend (ggf. neue Endpoints in `app/routers/query.py`).
|
||||
- Gib vollständigen, lauffähigen Code aus.
|
||||
|
||||
Bitte bestätige die Übernahme und skizziere die Architektur für `ui_discovery.py`.
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## WP-13: MCP Integration & Agenten-Layer
|
||||
|
||||
**Status:** 🟡 Geplant
|
||||
**Fokus:** Schnittstelle für externe Agenten (Claude Desktop).
|
||||
|
||||
```text
|
||||
Du bist der Lead Developer für "Mindnet" (Python/FastAPI/Qdrant).
|
||||
Wir starten jetzt **WP-13: MCP Integration**.
|
||||
|
||||
**Status Quo (v2.6.0):**
|
||||
- Das System läuft stabil asynchron.
|
||||
- `types.yaml` steuert die Logik.
|
||||
- Es existieren Services für Retrieval und Graph-Access.
|
||||
|
||||
**Dein Auftrag (WP-13):**
|
||||
Implementiere einen MCP-Server (Model Context Protocol), der Mindnet als "Tool" für Claude Desktop verfügbar macht.
|
||||
1. **Server:** Erstelle `app/mcp_server.py` basierend auf dem `mcp`-SDK.
|
||||
2. **Tools:** Implementiere folgende Tools:
|
||||
- `search_notes(query)`: Nutzt unseren Hybriden Retriever.
|
||||
- `read_note(id)`: Liest Fulltext einer Notiz.
|
||||
- `list_connections(id)`: Zeigt Edges an.
|
||||
- `Notes(content)`: (Optional) Nutzt die Ingestion-Pipeline.
|
||||
3. **Integration:** Nutze die bestehenden Services (`Retriever`, `QdrantClient`) wieder – kein redundanter Code!
|
||||
4. **Async:** Achte auf Kompatibilität zwischen MCP und unserem `asyncio` Core.
|
||||
|
||||
**Regeln:**
|
||||
- Halte dich strikt an die Architektur in `mindnet_technical_architecture.md`.
|
||||
- Aktualisiere den `admin_guide.md` mit Anweisungen zur Einbindung in Claude Desktop config.
|
||||
|
||||
Bitte bestätige die Übernahme und zeige einen ersten Entwurf für `app/mcp_server.py`.
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## WP-14: Review & Refactoring
|
||||
|
||||
**Status:** 🟡 Laufend
|
||||
**Fokus:** Code-Qualität, Modularisierung, Technische Schulden.
|
||||
|
||||
```text
|
||||
Du bist der Software-Architekt für "Mindnet".
|
||||
Wir starten **WP-14: Review & Refactoring**.
|
||||
|
||||
**Status Quo (v2.6.0):**
|
||||
- Das System ist funktional mächtig, aber einige Dateien (z.B. `chunker.py`) sind monolithisch geworden.
|
||||
- Dokumentation und Code müssen synchronisiert werden.
|
||||
|
||||
**Dein Auftrag (WP-14):**
|
||||
1. **Refactoring `chunker.py`:** Zerlege den Monolithen in ein sauberes Package `app/core/chunking/`.
|
||||
- `strategies.py`: Enthält `sliding_window` und `by_heading` Logik.
|
||||
- `orchestration.py`: Enthält `assemble_chunks` und Smart-Edge-Flow.
|
||||
- `utils.py`: Helper.
|
||||
2. **Cleanup:** Entferne veraltete / auskommentierte Code-Blöcke im gesamten Projekt.
|
||||
3. **Doc-Sync:** Prüfe, ob alle Parameter in `types.yaml` auch im Code verwendet werden (Dead Config Detection).
|
||||
|
||||
**Regeln:**
|
||||
- Funktionalität darf sich nicht ändern (Regression Tests!).
|
||||
- Imports in `scripts/import_markdown.py` müssen angepasst werden.
|
||||
|
||||
Bitte bestätige und beginne mit der Strukturierung des neuen `chunking` Packages.
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## WP-16: Auto-Discovery & Intelligent Ingestion
|
||||
|
||||
**Status:** 🟡 Geplant
|
||||
**Fokus:** Automatisierung beim Import, "Smarter Text".
|
||||
|
||||
```text
|
||||
Du bist der Lead Developer für "Mindnet".
|
||||
Wir starten **WP-16: Auto-Discovery & Intelligent Ingestion**.
|
||||
|
||||
**Status Quo (v2.6.0):**
|
||||
- Import verlässt sich auf explizite Wikilinks und manuelle Profil-Wahl in `types.yaml`.
|
||||
- Smart Edges filtern nur vorhandene Links.
|
||||
|
||||
**Dein Auftrag (WP-16):**
|
||||
Mache den Import intelligenter, bevor Daten gespeichert werden.
|
||||
1. **Structure Analyzer:** Erweitere den Chunker um eine Vor-Analyse.
|
||||
- Berechne "Heading Density" (Überschriften pro Wort).
|
||||
- Wähle automatisch `structured` (hohe Dichte) oder `sliding` (niedrige Dichte), wenn `profile: auto` gesetzt ist.
|
||||
2. **Smart Link Enricher:** Implementiere einen Service, der im Text nach Keywords sucht, die als Titel anderer Notizen existieren (Exact Match & Fuzzy Match).
|
||||
- Schlage diese als `suggested_edges` vor.
|
||||
3. **Hierarchy Merging:** Generalisiere die Logik, dass leere Überschriften ("Tier 2") automatisch mit dem folgenden Inhalt verschmelzen.
|
||||
|
||||
**Regeln:**
|
||||
- Performance beachten! Der Import darf nicht ewig dauern.
|
||||
- Änderungen am Content nur im RAM, Original-Datei bleibt unangetastet (außer User will Writeback).
|
||||
|
||||
Bitte bestätige und skizziere den Algorithmus für den Structure Analyzer.
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## WP-17: Conversational Memory
|
||||
|
||||
**Status:** 🟡 Geplant
|
||||
**Fokus:** Dialog-Qualität, Kontext.
|
||||
|
||||
```text
|
||||
Du bist der AI-Engineer für "Mindnet".
|
||||
Wir starten **WP-17: Conversational Memory**.
|
||||
|
||||
**Status Quo (v2.6.0):**
|
||||
- Chat ist "stateless". Jede Anfrage wird isoliert betrachtet.
|
||||
- RAG funktioniert, aber Rückfragen ("Was meinst du damit?") scheitern.
|
||||
|
||||
**Dein Auftrag (WP-17):**
|
||||
Implementiere ein Kurzzeitgedächtnis für den Chat.
|
||||
1. **API Update:** Erweitere `ChatRequest` (DTO) um ein Feld `history: List[Message]`.
|
||||
2. **Frontend:** Passe `ui.py` an, um die letzten N Nachrichten mitzusenden.
|
||||
3. **Token Management:** Implementiere eine Logik im `LLMService`, die das Kontext-Fenster (z.B. 4k Token) balanciert:
|
||||
- System Prompt (fest)
|
||||
- RAG Chunks (hoch gewichtet)
|
||||
- Chat History (auffüllen bis Limit)
|
||||
4. **Prompting:** Integriere `{chat_history}` in das Template in `prompts.yaml`.
|
||||
|
||||
**Regeln:**
|
||||
- Nutze effiziente Truncation (älteste Nachrichten zuerst entfernen).
|
||||
- History darf RAG-Wissen nicht verdrängen (Reserviere min. 60% für RAG).
|
||||
|
||||
Bitte bestätige und zeige das aktualisierte Pydantic-Modell für `ChatRequest`.
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## WP-18: Graph Health & Maintenance
|
||||
|
||||
**Status:** 🟡 Geplant
|
||||
**Fokus:** Datenintegrität, Garbage Collection.
|
||||
|
||||
```text
|
||||
Du bist der DevOps Engineer für "Mindnet".
|
||||
Wir starten **WP-18: Graph Health & Maintenance**.
|
||||
|
||||
**Status Quo (v2.6.0):**
|
||||
- Import ist asynchron. Bei Timeouts kann der DB-Stand vom File-System abweichen (Hash-Mismatch).
|
||||
- Gelöschte Notizen hinterlassen "Dangling Edges".
|
||||
|
||||
**Dein Auftrag (WP-18):**
|
||||
1. **Transactional Ingestion:** Implementiere einen "Two-Phase Commit" für den Import.
|
||||
- Der File-Hash für die Änderungserkennung darf erst aktualisiert werden, wenn der Qdrant-Upsert *bestätigt* erfolgreich war.
|
||||
2. **Integrity Script:** Erstelle `scripts/check_graph_integrity.py`.
|
||||
- Prüfe: Gibt es Edges, deren `target_id` nicht in `notes` existiert?
|
||||
- Aktion: Report oder Auto-Delete.
|
||||
3. **Resolve References:** Erweitere das Skript um Logik, die "Unresolved Targets" (Text-Links) nachträglich in echte UUID-Links wandelt, wenn die Ziel-Notiz später importiert wurde.
|
||||
|
||||
**Regeln:**
|
||||
- Sicherheit geht vor Geschwindigkeit.
|
||||
- Keine Datenlöschung ohne Log-Eintrag.
|
||||
|
||||
Bitte bestätige und skizziere die Logik für den Transactional Hash Update.
|
||||
```
|
||||
|
|
@ -1,90 +0,0 @@
|
|||
{
|
||||
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
||||
"title": "mindnet_chunk",
|
||||
"type": "object",
|
||||
"description": "Chunk-Payload (Qdrant). Kompatibel mit Alt-Feldern und neuen Feldern für Export/Roundtrip.",
|
||||
"required": ["id", "note_id", "chunk_index", "path"],
|
||||
"properties": {
|
||||
"id": { "type": "string" },
|
||||
"scope": { "type": "string", "enum": ["chunk"] },
|
||||
"note_id": { "type": "string" },
|
||||
"note_title": { "type": "string" },
|
||||
"note_type": { "type": "string" },
|
||||
"note_status": { "type": "string" },
|
||||
"type": { "type": "string", "description": "Legacy: früherer Chunk-Typ; kann dem Note-Typ entsprechen" },
|
||||
"area": { "type": "string" },
|
||||
"project": { "type": "string" },
|
||||
"tags": { "type": "array", "items": { "type": "string" } },
|
||||
|
||||
"note_path": { "type": "string" },
|
||||
"path": { "type": "string" },
|
||||
|
||||
"chunk_index": { "type": "integer" },
|
||||
"section_title":{ "type": ["string","null"] },
|
||||
"section_path": { "type": ["string","null"] },
|
||||
|
||||
"char_start": { "type": ["integer","null"] },
|
||||
"char_end": { "type": ["integer","null"] },
|
||||
"char_len": { "type": "integer" },
|
||||
|
||||
"token_count": { "type": "integer", "description": "Legacy: frühere Token-Zahl" },
|
||||
"token_est": { "type": "integer", "description": "Neue grobe Token-Schätzung (≈ len(text)/4)" },
|
||||
|
||||
"neighbors": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"prev": { "type": ["string","null"] },
|
||||
"next": { "type": ["string","null"] }
|
||||
},
|
||||
"additionalProperties": false
|
||||
},
|
||||
|
||||
"text": { "type": "string" },
|
||||
"text_sha256": { "type": "string", "pattern": "^sha256:[0-9a-fA-F]{64}$" },
|
||||
"lang": { "type": "string" },
|
||||
|
||||
"wikilinks": { "type": "array", "items": { "type": "string" } },
|
||||
"external_links": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"anyOf": [
|
||||
{ "type": "string" },
|
||||
{
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"href": { "type": "string" },
|
||||
"label": { "type": ["string","null"] }
|
||||
},
|
||||
"required": ["href"],
|
||||
"additionalProperties": false
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"references": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"target_id": { "type": "string" },
|
||||
"kind": { "type": "string" }
|
||||
},
|
||||
"required": ["target_id","kind"],
|
||||
"additionalProperties": true
|
||||
}
|
||||
},
|
||||
|
||||
"embed_model": { "type": "string" },
|
||||
"embed_dim": { "type": "integer" },
|
||||
"embed_version": { "type": "integer" },
|
||||
|
||||
"created_at": { "type": "string" }
|
||||
},
|
||||
|
||||
"allOf": [
|
||||
{ "anyOf": [ { "required": ["token_count"] }, { "required": ["token_est"] } ] },
|
||||
{ "anyOf": [ { "required": ["type"] }, { "required": ["note_type"] } ] }
|
||||
],
|
||||
|
||||
"additionalProperties": true
|
||||
}
|
||||
|
|
@ -1,31 +0,0 @@
|
|||
{
|
||||
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
||||
"title": "mindnet_edge",
|
||||
"type": "object",
|
||||
"description": "Edge-Payload (Qdrant). Unterstützt Legacy (edge_type/src_id/dst_id) und neues Schema (kind/source_id/target_id/note_id/status).",
|
||||
|
||||
"properties": {
|
||||
"scope": { "type": "string", "enum": ["note","chunk"] },
|
||||
|
||||
"edge_type": { "type": "string", "description": "Legacy: z. B. references/backlink/belongs_to/prev/next" },
|
||||
"src_id": { "type": "string", "description": "Legacy: source_id" },
|
||||
"dst_id": { "type": "string", "description": "Legacy: target_id" },
|
||||
|
||||
"kind": { "type": "string", "description": "Neu: z. B. references/backlink/belongs_to/prev/next" },
|
||||
"source_id": { "type": "string" },
|
||||
"target_id": { "type": "string" },
|
||||
"note_id": { "type": "string", "description": "Owner-Note für diesen Edge (Filter/Purge)" },
|
||||
"status": { "type": "string", "description": "optional, z. B. 'unresolved'" },
|
||||
|
||||
"weight": { "type": "number" },
|
||||
"meta": { "type": "object" },
|
||||
"created_at":{ "type": "string" }
|
||||
},
|
||||
|
||||
"anyOf": [
|
||||
{ "required": ["src_id", "dst_id", "edge_type", "scope"] },
|
||||
{ "required": ["source_id", "target_id", "kind", "scope"] }
|
||||
],
|
||||
|
||||
"additionalProperties": true
|
||||
}
|
||||
|
|
@ -1,45 +0,0 @@
|
|||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||
"title": "mindnet note payload",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"note_id": { "type": "string" },
|
||||
"title": { "type": ["string","null"] },
|
||||
"type": { "type": ["string","null"] },
|
||||
"status": { "type": ["string","null"] },
|
||||
"created": { "type": ["string","null"] },
|
||||
"updated": { "type": ["string","null"] },
|
||||
"path": { "type": ["string","null"] },
|
||||
"tags": { "type": ["array","null"], "items": { "type": "string" } },
|
||||
"area": { "type": ["string","null"] },
|
||||
"project": { "type": ["string","null"] },
|
||||
"source": { "type": ["string","null"] },
|
||||
"lang": { "type": ["string","null"] },
|
||||
"slug": { "type": ["string","null"] },
|
||||
"aliases": { "type": ["array","null"], "items": { "type": "string" } },
|
||||
|
||||
"fulltext": { "type": ["string","null"] },
|
||||
"references": { "type": ["array","null"], "items": { "type": "string" } },
|
||||
|
||||
"hash_fulltext": { "type": ["string","null"], "pattern": "^[a-f0-9]{64}$" },
|
||||
"hash_signature": { "type": ["string","null"] },
|
||||
|
||||
"hash_body": { "type": ["string","null"], "pattern": "^[a-f0-9]{64}$" },
|
||||
"hash_frontmatter": { "type": ["string","null"], "pattern": "^[a-f0-9]{64}$" },
|
||||
"hash_full": { "type": ["string","null"], "pattern": "^[a-f0-9]{64}$" },
|
||||
|
||||
"hashes": {
|
||||
"type": ["object","null"],
|
||||
"description": "Mapping: <mode>:<source>:<normalize> -> sha256 hex",
|
||||
"patternProperties": {
|
||||
"^(body|frontmatter|full):(parsed|raw):(canonical|none)$": {
|
||||
"type": "string",
|
||||
"pattern": "^[a-f0-9]{64}$"
|
||||
}
|
||||
},
|
||||
"additionalProperties": false
|
||||
}
|
||||
},
|
||||
"required": ["note_id"],
|
||||
"additionalProperties": true
|
||||
}
|
||||
|
|
@ -3,241 +3,222 @@
|
|||
"""
|
||||
resolve_unresolved_references.py — Unaufgelöste Wikilinks in Qdrant nachträglich auflösen
|
||||
|
||||
Version: 1.0.0 (2025-09-05)
|
||||
Version: 1.1.0 (Fixed for v2.6 Architecture)
|
||||
|
||||
Zweck
|
||||
------
|
||||
- Findet Edges in {prefix}_edges mit payload.status=="unresolved" und versucht, den Zielknoten
|
||||
anhand bereits vorhandener Notes in {prefix}_notes aufzulösen.
|
||||
- Aktualisiert die Edges (setzt target_id, entfernt status, setzt resolution), und erzeugt
|
||||
– NUR für Note-Level 'references' – die symmetrische 'backlink'-Kante.
|
||||
|
||||
Warum?
|
||||
------
|
||||
- Beim ersten Import können Links auf (noch) nicht existierende Notizen zeigen.
|
||||
- Sobald die Zielnotiz später existiert, kann dieses Skript die Kanten reparieren.
|
||||
- Findet Edges in {prefix}_edges mit payload.status=="unresolved".
|
||||
- Baut einen In-Memory Index aller Notizen (Titel/Alias -> ID).
|
||||
- Aktualisiert die Edges (setzt target_id, entfernt status).
|
||||
- Erzeugt symmetrische 'backlink'-Kanten für 'references'.
|
||||
|
||||
Aufruf
|
||||
------
|
||||
# Dry-Run (Standard):
|
||||
python3 -m scripts.resolve_unresolved_references --prefix mindnet
|
||||
|
||||
# Anwenden:
|
||||
python3 -m scripts.resolve_unresolved_references --prefix mindnet --apply
|
||||
|
||||
# Optional: nur X Edges anfassen
|
||||
python3 -m scripts.resolve_unresolved_references --prefix mindnet --apply --limit 500
|
||||
|
||||
Parameter
|
||||
---------
|
||||
--prefix : Collection-Prefix (Default: aus Env COLLECTION_PREFIX oder "mindnet")
|
||||
--apply : Änderungen tatsächlich schreiben (ohne --apply = Dry-Run)
|
||||
--limit : Max. Anzahl unaufgelöster Edges, die in diesem Lauf bearbeitet werden (Default: keine Begrenzung)
|
||||
--batch : Upsert-Batchgröße (Default: 512)
|
||||
|
||||
Voraussetzungen / Hinweise
|
||||
--------------------------
|
||||
- Bitte im aktivierten venv laufen lassen (deine Umgebung: `.venv`).
|
||||
- Qdrant-URL/Key/Prefix/Vektor-Dim werden wie üblich aus ENV gelesen (siehe app/core/qdrant.py). # noqa
|
||||
- Nutzt die vorhandenen Utilities:
|
||||
- app/core/qdrant.py (Client/Collections)
|
||||
- app/core/qdrant_points.py (points_for_edges/upsert_batch)
|
||||
- app/core/derive_edges.py (build_note_index/resolve_target)
|
||||
|
||||
Änderungshistorie
|
||||
-----------------
|
||||
1.0.0 Erstveröffentlichung.
|
||||
python3 -m scripts.resolve_unresolved_references --apply
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import json
|
||||
from typing import Any, Dict, List, Tuple, Iterable
|
||||
import uuid
|
||||
from typing import List, Dict, Any, Iterable
|
||||
|
||||
from qdrant_client import QdrantClient
|
||||
from qdrant_client.http import models as rest
|
||||
|
||||
from app.core.qdrant import QdrantConfig, get_client, ensure_collections, collection_names # :contentReference[oaicite:3]{index=3}
|
||||
from app.core.qdrant_points import points_for_edges, upsert_batch # :contentReference[oaicite:4]{index=4}
|
||||
from app.core.derive_edges import build_note_index, resolve_target # :contentReference[oaicite:5]{index=5}
|
||||
|
||||
|
||||
def _scroll(client: QdrantClient, **kwargs):
|
||||
"""
|
||||
Wrapper um qdrant_client.scroll() für unterschiedliche Client-Versionen:
|
||||
neuere: (points, next_offset)
|
||||
ältere: (points, next_page_offset, _)
|
||||
"""
|
||||
res = client.scroll(**kwargs)
|
||||
if isinstance(res, tuple):
|
||||
if len(res) == 2:
|
||||
points, next_off = res
|
||||
else:
|
||||
# ältere Signatur: (points, next_off, _)
|
||||
points, next_off, _ = res[0], res[1], res[2]
|
||||
else:
|
||||
# sehr alte Clients -> konservativ behandeln
|
||||
points, next_off = res, None
|
||||
return points, next_off
|
||||
|
||||
|
||||
def _load_all_notes(client: QdrantClient, notes_col: str) -> List[Dict[str, Any]]:
    """Return the payloads of all points in ``notes_col`` that carry a ``note_id``.

    Pages through the collection 1024 points at a time (payload only, no
    vectors) until the scroll helper reports no further offset.
    """
    collected: List[Dict[str, Any]] = []
    cursor = None
    while True:
        page, cursor = _scroll(
            client,
            collection_name=notes_col,
            with_payload=True,
            with_vectors=False,
            limit=1024,
            offset=cursor,
        )
        payloads = (getattr(point, "payload", {}) or {} for point in page or [])
        # Only payloads that identify a note are useful for resolution.
        collected.extend(payload for payload in payloads if payload.get("note_id"))
        if not cursor:
            return collected
|
||||
|
||||
|
||||
def _iter_unresolved_edges(client: QdrantClient, edges_col: str) -> Iterable[rest.Record]:
    """Yield every edge record whose payload is unresolved and has a string label.

    The scroll is filtered on ``status == "unresolved"``; records are yielded
    only when their payload's ``target_label`` is a ``str``.
    """
    unresolved_only = rest.Filter(
        must=[
            rest.FieldCondition(key="status", match=rest.MatchValue(value="unresolved")),
        ]
    )
    cursor = None
    has_more = True
    while has_more:
        page, cursor = _scroll(
            client,
            collection_name=edges_col,
            scroll_filter=unresolved_only,
            with_payload=True,
            with_vectors=False,
            limit=1024,
            offset=cursor,
        )
        for point in page or []:
            payload = getattr(point, "payload", {}) or {}
            if isinstance(payload.get("target_label"), str):
                yield point
        # A falsy offset means the last page has been consumed.
        has_more = bool(cursor)
|
||||
from qdrant_client import models
|
||||
from app.core.qdrant import QdrantConfig, get_client
|
||||
from app.core.qdrant_points import points_for_edges
|
||||
|
||||
# Logging setup: timestamped INFO-level output for this script's progress messages.
|
||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def _make_backlink(source_note_id: str, target_note_id: str, extra: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""
|
||||
Baue eine 'backlink'-Edge-Payload source <- target (note-level).
|
||||
Hilfsfunktion: Erzeugt die Payload für den Backlink.
|
||||
"""
|
||||
e = {
|
||||
"kind": "backlink",
|
||||
return {
|
||||
"source_id": target_note_id,
|
||||
"target_id": source_note_id,
|
||||
"kind": "backlink",
|
||||
"scope": "note",
|
||||
"text": f"Backlink from {extra.get('alias') or 'note'}",
|
||||
"rule_id": "derived:backlink",
|
||||
"confidence": 0.9
|
||||
}
|
||||
# Metafelder aus dem Original übernehmen (ohne status)
|
||||
copy_keys = ["raw", "alias", "heading", "resolution"]
|
||||
for k in copy_keys:
|
||||
if k in extra:
|
||||
e[k] = extra[k]
|
||||
return e
|
||||
|
||||
def build_lookup_index(client, collection_name: str) -> Dict[str, str]:
    """Build a case-insensitive lookup from note titles and aliases to note ids.

    Scrolls through all points of ``collection_name`` (1000 per page, payload
    only) and maps the lower-cased, stripped title and every alias of a note
    to its ``note_id``. Payloads without a ``note_id`` are skipped. When two
    notes share a term, the note seen last wins.

    Args:
        client: Object with a Qdrant-style ``scroll`` method returning
            ``(records, next_offset)``.
        collection_name: Name of the notes collection to scan.

    Returns:
        Mapping of normalised term -> note_id.
    """
    logger.info("Building lookup index from existing notes...")
    lookup: Dict[str, str] = {}
    note_ids = set()
    next_offset = None

    while True:
        records, next_offset = client.scroll(
            collection_name=collection_name,
            limit=1000,
            offset=next_offset,
            with_payload=True,
            with_vectors=False,
        )

        for record in records:
            payload = record.payload or {}
            note_id = payload.get("note_id")
            if not note_id:
                continue
            note_ids.add(note_id)

            # The payload may carry "aliases": null, so .get() with a default
            # is not enough; a single string is treated as a one-item list.
            aliases = payload.get("aliases") or []
            if isinstance(aliases, str):
                aliases = [aliases]

            # Title first, then aliases, so an alias overrides an equal title.
            for term in (payload.get("title"), *aliases):
                if not term:
                    # Skip None/empty entries instead of indexing "none" or "".
                    continue
                key = str(term).lower().strip()
                if key:
                    lookup[key] = note_id

        if next_offset is None:
            break

    logger.info(f"Index built. Mapped {len(lookup)} terms to {len(note_ids)} unique notes.")
    return lookup
|
||||
|
||||
def main():
    """Resolve unresolved edges against the notes currently stored in Qdrant.

    Steps:
      1. Build a title/alias -> note_id index from ``{prefix}_notes``.
      2. Scroll ``{prefix}_edges`` for points whose payload has
         ``status == "unresolved"`` (optionally capped by ``--limit``).
      3. Look up each edge's ``target_id`` in the index; on a hit, prepare an
         updated payload and, for ``references`` edges, a backlink payload.
      4. With ``--apply`` write the updated edges and the backlinks; without
         it only log what would be changed (dry run).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--prefix", default=None, help="Collection prefix")
    parser.add_argument("--apply", action="store_true", help="Write changes to DB")
    parser.add_argument("--limit", type=int, default=0, help="Max edges to process (0=all)")
    parser.add_argument("--batch", type=int, default=100, help="Upsert batch size")
    args = parser.parse_args()
    # A batch size below 1 would make range() fail or never advance.
    batch_size = max(1, args.batch)

    cfg = QdrantConfig.from_env()
    if args.prefix:
        cfg.prefix = args.prefix

    client = get_client(cfg)
    edges_col = f"{cfg.prefix}_edges"
    notes_col = f"{cfg.prefix}_notes"

    # 1. Build the lookup index
    try:
        lookup_index = build_lookup_index(client, notes_col)
    except Exception as e:
        logger.error(f"Failed to build index: {e}")
        return

    # 2. Collect unresolved edges
    logger.info(f"Scanning for unresolved edges in {edges_col}...")

    scroll_filter = models.Filter(
        must=[
            models.FieldCondition(key="status", match=models.MatchValue(value="unresolved"))
        ]
    )

    unresolved_edges = []
    next_page = None
    while True:
        res, next_page = client.scroll(
            collection_name=edges_col,
            scroll_filter=scroll_filter,
            limit=500,
            with_payload=True,
            offset=next_page,
        )
        unresolved_edges.extend(res)
        if next_page is None or (args.limit > 0 and len(unresolved_edges) >= args.limit):
            break

    if args.limit > 0:
        unresolved_edges = unresolved_edges[:args.limit]

    logger.info(f"Found {len(unresolved_edges)} unresolved edges.")

    # 3. Resolve
    to_fix = []
    backlinks = []
    resolved_count = 0

    for pt in unresolved_edges:
        pl = pt.payload or {}

        # For unresolved links the parser stores the looked-up name/slug in
        # 'target_id', so that is the term matched against the index.
        candidate = pl.get("target_id")
        if not candidate:
            # Without a term there is nothing to resolve; str(None) must not
            # be looked up as the title "none".
            continue

        target_nid = lookup_index.get(str(candidate).lower().strip())
        if not target_nid:
            continue

        new_pl = dict(pl)
        new_pl["target_id"] = target_nid
        new_pl.pop("status", None)  # no status -> the edge counts as resolved
        new_pl["resolution"] = "healed_by_script"

        # The existing point id is kept, so the edge is updated in place.
        to_fix.append({"id": pt.id, "payload": new_pl})

        # Only note-level references get a symmetric backlink.
        if pl.get("kind") == "references":
            backlinks.append(_make_backlink(
                source_note_id=pl.get("source_id"),
                target_note_id=target_nid,
                extra={"alias": candidate},
            ))

        resolved_count += 1

    logger.info(f"Resolvable: {resolved_count}/{len(unresolved_edges)}")

    if not args.apply:
        logger.info("DRY RUN. Use --apply to execute.")
        return

    # 4. Write
    if to_fix:
        logger.info(f"Updating {len(to_fix)} edges...")
        # NOTE(review): the points are upserted with an empty vector mapping;
        # confirm this does not drop the vector stored for the edge points.
        points_to_upsert = [
            models.PointStruct(id=u["id"], payload=u["payload"], vector={})
            for u in to_fix
        ]
        for i in range(0, len(points_to_upsert), batch_size):
            batch = points_to_upsert[i:i + batch_size]
            client.upsert(collection_name=edges_col, points=batch)

    if backlinks:
        logger.info(f"Creating {len(backlinks)} backlinks...")
        # points_for_edges takes the prefix first, then the edge payloads, and
        # returns the collection name together with the points to write.
        col, bl_points = points_for_edges(cfg.prefix, backlinks)
        for i in range(0, len(bl_points), batch_size):
            batch = bl_points[i:i + batch_size]
            client.upsert(collection_name=col, points=batch)

    logger.info("Done.")
|
||||
|
||||
# Run the resolver only when this file is executed as a script, not on import.
if __name__ == "__main__":
|
||||
main()
|
||||
Loading…
Reference in New Issue
Block a user