Merge pull request 'WP15b' (#15) from WP15b into main
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 4s
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 4s
Reviewed-on: #15 #### PR-Zusammenfassung: WP-14 Modularisierung & WP-15b Two-Pass Ingestion Dieser Merge schließt die technische Konsolidierung der Architektur (WP-14) und die Optimierung der Ingestion-Pipeline (WP-15b) ab. Das System wurde von einer monolithischen Struktur in eine domänengesteuerte Paket-Hierarchie überführt. **Kernänderungen:** * **WP-14 (Modularisierung):** * Aufteilung von `app/core/` in spezialisierte Pakete: `database/`, `ingestion/`, `retrieval/` und `graph/`. * Einführung von Proxy-Modulen (z.B. `graph_adapter.py`, `retriever.py`) zur Sicherstellung der Abwärtskompatibilität. * Zentralisierung neutraler Logik in `app/core/registry.py` zur Beseitigung von Zirkelbezügen. * **WP-15b (Intelligence 2.0):** * Umstellung der Ingestion auf einen **Two-Pass Workflow**. * **Pass 1:** Globaler Pre-Scan zur Befüllung des `LocalBatchCache`. * **Pass 2:** Binäre semantische Validierung von Kanten gegen den Kontext des Caches zur Eliminierung von Halluzinationen. **Betroffene Komponenten:** * `app.core.database`: Qdrant-Infrastruktur & Point-Mapping. * `app.core.retrieval`: Scoring-Engine (WP-22) & Orchestrierung. * `app.core.graph`: Subgraph-Modell & Traversierung. * Sämtliche Dokumentations-Module (v2.9.1 Update). **Teststatus:** ✅ Inkrementelle Ingestion (Pass 2 Skip) verifiziert. ✅ Hybrid-Scoring (WP-22) via isolated package verifiziert. ✅ Circular Import Audit erfolgreich abgeschlossen.
This commit is contained in:
commit
23b1cb2966
|
|
@ -1,176 +0,0 @@
|
|||
"""
|
||||
FILE: app/core/chunk_payload.py
|
||||
DESCRIPTION: Baut das JSON-Objekt für 'mindnet_chunks'.
|
||||
FEATURES:
|
||||
- Inkludiert Nachbarschafts-IDs (prev/next) und Titel.
|
||||
- FIX 3: Robuste Erkennung des Inputs (Frontmatter-Dict vs. Note-Objekt), damit Overrides ankommen.
|
||||
VERSION: 2.3.0
|
||||
STATUS: Active
|
||||
DEPENDENCIES: yaml, os
|
||||
EXTERNAL_CONFIG: config/types.yaml
|
||||
"""
|
||||
from __future__ import annotations
|
||||
from typing import Any, Dict, List, Optional
|
||||
import os, yaml
|
||||
|
||||
def _env(n: str, d: Optional[str]=None) -> str:
|
||||
v = os.getenv(n)
|
||||
return v if v is not None else (d or "")
|
||||
|
||||
def _load_types() -> dict:
    """Load the type registry from MINDNET_TYPES_FILE; return {} on any failure."""
    path = _env("MINDNET_TYPES_FILE", "./config/types.yaml")
    try:
        with open(path, "r", encoding="utf-8") as handle:
            data = yaml.safe_load(handle)
    except Exception:
        # Missing or unreadable file -> behave as if the registry were empty.
        return {}
    return data or {}
|
||||
|
||||
def _get_types_map(reg: dict) -> dict:
|
||||
if isinstance(reg, dict) and isinstance(reg.get("types"), dict):
|
||||
return reg["types"]
|
||||
return reg if isinstance(reg, dict) else {}
|
||||
|
||||
def _get_defaults(reg: dict) -> dict:
|
||||
if isinstance(reg, dict) and isinstance(reg.get("defaults"), dict):
|
||||
return reg["defaults"]
|
||||
if isinstance(reg, dict) and isinstance(reg.get("global"), dict):
|
||||
return reg["global"]
|
||||
return {}
|
||||
|
||||
def _as_float(x: Any):
|
||||
try: return float(x)
|
||||
except Exception: return None
|
||||
|
||||
def _resolve_chunk_profile_from_config(note_type: str, reg: dict) -> Optional[str]:
    """
    Resolve the chunking profile name from the registry.

    Lookup order: type-level entry for *note_type*, then the defaults
    section; both accept either 'chunking_profile' or 'chunk_profile'.
    Returns None when no non-empty string profile is configured.
    """
    candidates = (
        _get_types_map(reg).get(note_type, {}),  # 1. type level
        _get_defaults(reg),                       # 2. defaults level
    )
    for source in candidates:
        if not isinstance(source, dict):
            continue
        profile = source.get("chunking_profile") or source.get("chunk_profile")
        if isinstance(profile, str) and profile:
            return profile
    return None
|
||||
|
||||
def _resolve_retriever_weight_from_config(note_type: str, reg: dict) -> float:
    """
    Read the retriever weight from config only (type level > defaults).

    Called when the frontmatter carries no weight. Entries that are set
    but not convertible to float are skipped; falls back to 1.0.
    """
    candidates = (
        _get_types_map(reg).get(note_type, {}),  # 1. type level
        _get_defaults(reg),                       # 2. defaults level
    )
    for source in candidates:
        if isinstance(source, dict) and source.get("retriever_weight") is not None:
            weight = _as_float(source.get("retriever_weight"))
            if weight is not None:
                return float(weight)
    return 1.0
|
||||
|
||||
def _as_list(x):
|
||||
if x is None: return []
|
||||
if isinstance(x, list): return x
|
||||
return [x]
|
||||
|
||||
def make_chunk_payloads(note: Dict[str, Any],
                        note_path: str,
                        chunks_from_chunker: List[Any],
                        *,
                        note_text: str = "",
                        types_cfg: Optional[dict] = None,
                        file_path: Optional[str] = None) -> List[Dict[str, Any]]:
    """
    Build the JSON payloads for the chunks ('mindnet_chunks' collection).

    Argument 'note' can be:
      A) a complex object/dict with a "frontmatter" key (legacy / tests)
      B) the frontmatter dictionary itself (call from ingestion.py)

    Precedence for profile and retriever weight: frontmatter override ->
    types registry (type level -> defaults) -> hard-coded fallback.
    Returns one payload dict per entry in *chunks_from_chunker*.
    """

    # --- FIX 3: intelligent detection of the input shape ---
    # Is 'note' a container WITH 'frontmatter', or IS it the frontmatter?
    if isinstance(note, dict) and "frontmatter" in note and isinstance(note["frontmatter"], dict):
        # Case A: container (unwrap it)
        fm = note["frontmatter"]
    else:
        # Case B: direct dict (this is how ingestion.py calls it!)
        fm = note or {}

    note_type = fm.get("type") or note.get("type") or "concept"

    # Title extraction (fallback chain).
    title = fm.get("title") or note.get("title") or fm.get("id") or "Untitled"

    # Registry: explicit override from the caller, else load from disk.
    reg = types_cfg if isinstance(types_cfg, dict) else _load_types()

    # --- Profile resolution ---
    # With 'fm' correctly resolved, frontmatter overrides work again:
    cp = fm.get("chunking_profile") or fm.get("chunk_profile")

    if not cp:
        cp = _resolve_chunk_profile_from_config(note_type, reg)
    if not cp:
        cp = "sliding_standard"

    # --- Retriever weight resolution ---
    rw = fm.get("retriever_weight")

    if rw is None:
        rw = _resolve_retriever_weight_from_config(note_type, reg)

    try:
        rw = float(rw)
    except Exception:
        rw = 1.0

    tags = fm.get("tags") or []
    if isinstance(tags, str):
        tags = [tags]

    out: List[Dict[str, Any]] = []
    for idx, ch in enumerate(chunks_from_chunker):
        # Chunks may be objects (dataclass) or plain dicts — read both shapes.
        cid = getattr(ch, "id", None) or (ch.get("id") if isinstance(ch, dict) else None)
        nid = getattr(ch, "note_id", None) or (ch.get("note_id") if isinstance(ch, dict) else fm.get("id"))
        index = getattr(ch, "index", None) or (ch.get("index") if isinstance(ch, dict) else idx)
        text = getattr(ch, "text", None) or (ch.get("text") if isinstance(ch, dict) else "")
        window = getattr(ch, "window", None) or (ch.get("window") if isinstance(ch, dict) else text)
        prev_id = getattr(ch, "neighbors_prev", None) or (ch.get("neighbors_prev") if isinstance(ch, dict) else None)
        next_id = getattr(ch, "neighbors_next", None) or (ch.get("neighbors_next") if isinstance(ch, dict) else None)

        pl: Dict[str, Any] = {
            "note_id": nid,
            "chunk_id": cid,
            "title": title,
            "index": int(index),
            "ord": int(index) + 1,  # 1-based ordinal
            "type": note_type,
            "tags": tags,
            "text": text,
            "window": window,
            "neighbors_prev": _as_list(prev_id),
            "neighbors_next": _as_list(next_id),
            "section": getattr(ch, "section", None) or (ch.get("section") if isinstance(ch, dict) else ""),
            "path": note_path,
            "source_path": file_path or note_path,
            "retriever_weight": float(rw),
            "chunk_profile": cp,  # now finally carries the override value!
        }

        # Cleanup of legacy alias keys (defensive — not set above).
        for alias in ("chunk_num", "Chunk_Number"):
            pl.pop(alias, None)

        out.append(pl)

    return out
|
||||
|
|
@ -1,474 +0,0 @@
|
|||
"""
|
||||
FILE: app/core/chunker.py
|
||||
DESCRIPTION: Zerlegt Texte in Chunks (Sliding Window oder nach Headings).
|
||||
Orchestriert die Smart-Edge-Allocation via SemanticAnalyzer.
|
||||
FIX V3: Support für mehrzeilige Callouts und Section-Propagation.
|
||||
VERSION: 3.1.0 (Full Compatibility Merge)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Dict, Optional, Tuple, Any, Set
|
||||
import re
|
||||
import math
|
||||
import yaml
|
||||
from pathlib import Path
|
||||
import asyncio
|
||||
import logging
|
||||
|
||||
# Services
|
||||
from app.services.semantic_analyzer import get_semantic_analyzer
|
||||
|
||||
# Core Imports
|
||||
# Wir importieren build_edges_for_note nur, um kompatibel zur Signatur zu bleiben
|
||||
# oder für den Fallback.
|
||||
try:
|
||||
from app.core.derive_edges import build_edges_for_note
|
||||
except ImportError:
|
||||
# Mock für Tests
|
||||
def build_edges_for_note(note_id, chunks, note_level_references=None, include_note_scope_refs=False): return []
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ==========================================
|
||||
# 1. HELPER & CONFIG
|
||||
# ==========================================
|
||||
|
||||
BASE_DIR = Path(__file__).resolve().parent.parent.parent
|
||||
CONFIG_PATH = BASE_DIR / "config" / "types.yaml"
|
||||
# Fallback Default, falls types.yaml fehlt
|
||||
DEFAULT_PROFILE = {"strategy": "sliding_window", "target": 400, "max": 600, "overlap": (50, 80)}
|
||||
_CONFIG_CACHE = None
|
||||
|
||||
def _load_yaml_config() -> Dict[str, Any]:
    """Load and memoize config/types.yaml.

    Returns {} when the file is missing, unreadable, or empty. The parsed
    mapping is cached in the module-level _CONFIG_CACHE so the file is
    read at most once per process.
    """
    global _CONFIG_CACHE
    if _CONFIG_CACHE is not None:
        return _CONFIG_CACHE
    if not CONFIG_PATH.exists():
        return {}
    try:
        with open(CONFIG_PATH, "r", encoding="utf-8") as f:
            # FIX: an empty YAML file makes safe_load return None, which
            # previously leaked out (violating the return type) and was
            # cached as None, forcing a re-read on every call.
            data = yaml.safe_load(f) or {}
        _CONFIG_CACHE = data
        return data
    except Exception:
        return {}
|
||||
|
||||
def get_chunk_config(note_type: str) -> Dict[str, Any]:
    """
    Load the chunking strategy for *note_type* from types.yaml.

    Resolution: type-level 'chunking_profile' -> defaults section ->
    'sliding_standard'; an unknown profile name falls back to
    DEFAULT_PROFILE. Keeps compatibility with WP-15 profiles.
    """
    full_config = _load_yaml_config()
    profiles = full_config.get("chunking_profiles", {})
    type_def = full_config.get("types", {}).get(note_type.lower(), {})

    # Which profile does this type use? (e.g. 'sliding_smart_edges')
    profile_name = type_def.get("chunking_profile")

    if not profile_name:
        profile_name = full_config.get("defaults", {}).get("chunking_profile", "sliding_standard")

    # Copy so callers can mutate the dict without poisoning the cache.
    config = profiles.get(profile_name, DEFAULT_PROFILE).copy()

    # YAML parses the overlap pair as a list; the chunker expects a tuple.
    if "overlap" in config and isinstance(config["overlap"], list):
        config["overlap"] = tuple(config["overlap"])

    return config
|
||||
|
||||
def extract_frontmatter_from_text(md_text: str) -> Tuple[Dict[str, Any], str]:
    """Split *md_text* into (frontmatter dict, stripped body).

    Returns ({}, md_text) when no leading '---' frontmatter block exists.
    Frontmatter that is invalid YAML or not a mapping yields {}.
    """
    fm_match = re.match(r'^\s*---\s*\n(.*?)\n---', md_text, re.DOTALL)
    if not fm_match:
        return {}, md_text
    try:
        frontmatter = yaml.safe_load(fm_match.group(1))
        if not isinstance(frontmatter, dict):
            frontmatter = {}
    except yaml.YAMLError:
        frontmatter = {}
    # FIX: slice at the existing match's end instead of re-scanning the whole
    # document with re.sub — identical result (the pattern is anchored at the
    # start), one regex pass instead of two.
    return frontmatter, md_text[fm_match.end():].strip()
|
||||
|
||||
# ==========================================
|
||||
# 2. DATA CLASSES & TEXT TOOLS
|
||||
# ==========================================
|
||||
|
||||
_SENT_SPLIT = re.compile(r'(?<=[.!?])\s+(?=[A-ZÄÖÜ0-9„(])'); _WS = re.compile(r'\s+')
|
||||
|
||||
def estimate_tokens(text: str) -> int:
    """Rough token estimate: ~4 characters per token, minimum 1."""
    char_count = len(text.strip())
    return max(1, math.ceil(char_count / 4))
|
||||
|
||||
def split_sentences(text: str) -> list[str]:
    """Split whitespace-normalized text into sentences via the module regexes."""
    normalized = _WS.sub(' ', text.strip())
    if not normalized:
        return []
    return [piece.strip() for piece in _SENT_SPLIT.split(normalized) if piece.strip()]
|
||||
|
||||
@dataclass
class RawBlock:
    # One logical markdown unit produced by parse_blocks.
    # kind: "paragraph" or "heading"; level: heading level (None for paragraphs);
    # section_path/section_title describe the enclosing H2 section.
    kind: str; text: str; level: Optional[int]; section_path: str; section_title: Optional[str]
|
||||
|
||||
@dataclass
class Chunk:
    # Final chunk for embedding and graph storage. `text` is the stored body;
    # `window` is text plus a context prefix (e.g. the H1 title) used for
    # embedding quality. neighbors_prev/next are sibling chunk ids filled in
    # after assembly; suggested_edges holds confirmed "kind:target" strings.
    id: str; note_id: str; index: int; text: str; window: str; token_count: int
    section_title: Optional[str]; section_path: str
    neighbors_prev: Optional[str]; neighbors_next: Optional[str]
    suggested_edges: Optional[List[str]] = None
|
||||
|
||||
# ==========================================
|
||||
# 3. PARSING & STRATEGIES
|
||||
# ==========================================
|
||||
|
||||
def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
    """
    Split markdown text into logical blocks (paragraphs, headings).

    Important for the 'by_heading' strategy: each H2 opens a new section
    and the paragraphs below inherit its path/title.
    Returns (blocks, h1_title) where h1_title is the document title.
    """
    blocks = []
    h1_title = "Dokument"
    section_path = "/"
    current_h2 = None

    # Frontmatter is metadata, not content — strip before parsing.
    fm, text_without_fm = extract_frontmatter_from_text(md_text)

    # First H1 becomes the document title (the H1 line itself is skipped below).
    h1_match = re.search(r'^#\s+(.*)', text_without_fm, re.MULTILINE)
    if h1_match:
        h1_title = h1_match.group(1).strip()

    lines = text_without_fm.split('\n')
    buffer = []

    for line in lines:
        stripped = line.strip()
        if stripped.startswith('# '):
            # H1 already captured as the title.
            continue
        elif stripped.startswith('## '):
            # New H2: flush the buffered paragraph into the previous section.
            if buffer:
                content = "\n".join(buffer).strip()
                if content:
                    blocks.append(RawBlock("paragraph", content, None, section_path, current_h2))
                buffer = []
            current_h2 = stripped[3:].strip()
            section_path = f"/{current_h2}"
            blocks.append(RawBlock("heading", stripped, 2, section_path, current_h2))
        elif not stripped:
            # Blank line terminates the current paragraph.
            if buffer:
                content = "\n".join(buffer).strip()
                if content:
                    blocks.append(RawBlock("paragraph", content, None, section_path, current_h2))
                buffer = []
        else:
            buffer.append(line)

    # Flush the trailing paragraph, if any.
    if buffer:
        content = "\n".join(buffer).strip()
        if content:
            blocks.append(RawBlock("paragraph", content, None, section_path, current_h2))

    return blocks, h1_title
|
||||
|
||||
def _strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], note_id: str, doc_title: str = "", context_prefix: str = "") -> List[Chunk]:
    """
    The default strategy from WP-15.

    Accumulates blocks and cuts at ~'target' tokens (sentence-aware).
    Accumulations above 'max' tokens are re-split at sentence boundaries
    with a token overlap carried into the next chunk. *context_prefix*
    (e.g. the H1 title) is prepended to each chunk's embedding window
    only — never to the stored text.
    """
    target = config.get("target", 400)
    max_tokens = config.get("max", 600)
    overlap_val = config.get("overlap", (50, 80))
    # Overlap may be a (min, max) tuple — use the midpoint as the budget.
    overlap = sum(overlap_val) // 2 if isinstance(overlap_val, tuple) else overlap_val
    chunks = []; buf = []

    def _create_chunk(txt, win, sec, path):
        # Append a Chunk; the id encodes the running index ("<note>#c00", ...).
        idx = len(chunks)
        chunks.append(Chunk(
            id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx,
            text=txt, window=win, token_count=estimate_tokens(txt),
            section_title=sec, section_path=path, neighbors_prev=None, neighbors_next=None,
            suggested_edges=[]
        ))

    def flush_buffer():
        # Emit the buffered blocks as one chunk — or several, if oversized.
        nonlocal buf
        if not buf: return

        text_body = "\n\n".join([b.text for b in buf])
        sec_title = buf[-1].section_title if buf else None
        sec_path = buf[-1].section_path if buf else "/"

        # Prepend the context prefix (e.g. H1) for embedding quality.
        win_body = f"{context_prefix}\n{text_body}".strip() if context_prefix else text_body

        if estimate_tokens(text_body) <= max_tokens:
            _create_chunk(text_body, win_body, sec_title, sec_path)
        else:
            # Too large -> sentence-wise split.
            sentences = split_sentences(text_body)
            current_chunk_sents = []
            current_len = 0

            for sent in sentences:
                sent_len = estimate_tokens(sent)
                if current_len + sent_len > target and current_chunk_sents:
                    c_txt = " ".join(current_chunk_sents)
                    c_win = f"{context_prefix}\n{c_txt}".strip() if context_prefix else c_txt
                    _create_chunk(c_txt, c_win, sec_title, sec_path)

                    # Build the overlap for the next chunk from trailing sentences.
                    overlap_sents = []
                    ov_len = 0
                    for s in reversed(current_chunk_sents):
                        if ov_len + estimate_tokens(s) < overlap:
                            overlap_sents.insert(0, s)
                            ov_len += estimate_tokens(s)
                        else:
                            break

                    current_chunk_sents = list(overlap_sents)
                    current_chunk_sents.append(sent)
                    current_len = ov_len + sent_len
                else:
                    current_chunk_sents.append(sent)
                    current_len += sent_len

            # Remainder.
            if current_chunk_sents:
                c_txt = " ".join(current_chunk_sents)
                c_win = f"{context_prefix}\n{c_txt}".strip() if context_prefix else c_txt
                _create_chunk(c_txt, c_win, sec_title, sec_path)

        buf = []

    for b in blocks:
        if b.kind == "heading": continue  # headings only steer sectioning
        current_buf_text = "\n\n".join([x.text for x in buf])
        if estimate_tokens(current_buf_text) + estimate_tokens(b.text) >= target:
            flush_buffer()
        buf.append(b)
        if estimate_tokens(b.text) >= target:
            # A single oversized block is flushed immediately on its own.
            flush_buffer()

    flush_buffer()
    return chunks
|
||||
|
||||
def _strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id: str, doc_title: str = "") -> List[Chunk]:
    """
    Strategy for structured notes (profiles, value lists).

    Delegates to the sliding-window strategy; parse_blocks has already
    forced cuts at headings, and the document title is passed as the
    embedding-context prefix.
    """
    title_prefix = f"# {doc_title}"
    return _strategy_sliding_window(blocks, config, note_id, doc_title, context_prefix=title_prefix)
|
||||
|
||||
# ==========================================
|
||||
# 4. ROBUST EDGE PARSING & PROPAGATION (NEU)
|
||||
# ==========================================
|
||||
|
||||
def _parse_edges_robust(text: str) -> Set[str]:
|
||||
"""
|
||||
NEU: Findet Kanten im Text, auch wenn sie mehrzeilig oder 'kaputt' formatiert sind.
|
||||
Erkennt:
|
||||
> [!edge] type
|
||||
> [[Link]]
|
||||
Returns: Set von Strings "kind:target"
|
||||
"""
|
||||
found_edges = set()
|
||||
|
||||
# A. Inline [[rel:type|target]] (Standard)
|
||||
inlines = re.findall(r'\[\[rel:([^\|\]]+)\|?([^\]]*)\]\]', text)
|
||||
for kind, target in inlines:
|
||||
k = kind.strip()
|
||||
t = target.strip()
|
||||
if k and t: found_edges.add(f"{k}:{t}")
|
||||
|
||||
# B. Multiline Callouts Parsing (Der Fix für dein Problem)
|
||||
lines = text.split('\n')
|
||||
current_edge_type = None
|
||||
|
||||
for line in lines:
|
||||
stripped = line.strip()
|
||||
|
||||
# 1. Start Blockquote: > [!edge] type
|
||||
# (Erlaubt optionalen Doppelpunkt)
|
||||
callout_match = re.match(r'>\s*\[!edge\]\s*([^:\s]+)', stripped)
|
||||
if callout_match:
|
||||
current_edge_type = callout_match.group(1).strip()
|
||||
|
||||
# Check: Sind Links noch in der GLEICHEN Zeile?
|
||||
links = re.findall(r'\[\[([^\]]+)\]\]', stripped)
|
||||
for l in links:
|
||||
if "rel:" not in l:
|
||||
found_edges.add(f"{current_edge_type}:{l}")
|
||||
continue
|
||||
|
||||
# 2. Continuation Line: > [[Target]]
|
||||
# Wenn wir noch im 'edge mode' sind und die Zeile ein Zitat ist
|
||||
if current_edge_type and stripped.startswith('>'):
|
||||
links = re.findall(r'\[\[([^\]]+)\]\]', stripped)
|
||||
for l in links:
|
||||
if "rel:" not in l:
|
||||
found_edges.add(f"{current_edge_type}:{l}")
|
||||
|
||||
# 3. End of Blockquote (kein '>') -> Reset Type
|
||||
elif not stripped.startswith('>'):
|
||||
current_edge_type = None
|
||||
|
||||
return found_edges
|
||||
|
||||
def _propagate_section_edges(chunks: List[Chunk]) -> List[Chunk]:
    """
    NEW: distribute edges within a section.

    Solves the problem that a callout at the top of a chapter applies to
    all chunks below it: edges found anywhere in a section are injected
    as inline [[rel:kind|target]] tokens into every chunk of that section.
    """
    # Step 1: collect edges per section.
    section_map = {}  # path -> set("kind:target")

    for ch in chunks:
        # Root level "/" is usually skipped — too global to propagate.
        if not ch.section_path or ch.section_path == "/": continue

        edges = _parse_edges_robust(ch.text)
        if edges:
            if ch.section_path not in section_map:
                section_map[ch.section_path] = set()
            section_map[ch.section_path].update(edges)

    # Step 2: inject (broadcast) into every chunk of the section.
    for ch in chunks:
        if ch.section_path in section_map:
            edges_to_add = section_map[ch.section_path]
            if not edges_to_add: continue

            injections = []
            for e_str in edges_to_add:
                kind, target = e_str.split(':', 1)
                # Skip edges already present as inline tokens in this chunk.
                token = f"[[rel:{kind}|{target}]]"
                if token not in ch.text:
                    injections.append(token)

            if injections:
                # Write the edges "physically" into the text so that
                # derive_edges.py is guaranteed to find them later.
                block = "\n\n\n" + " ".join(injections)
                ch.text += block
                # Also into the window for embedding context.
                ch.window += block

    return chunks
|
||||
|
||||
# ==========================================
|
||||
# 5. ORCHESTRATION (ASYNC)
|
||||
# ==========================================
|
||||
|
||||
async def assemble_chunks(note_id: str, md_text: str, note_type: str, config: Optional[Dict] = None) -> List[Chunk]:
    """
    Main entry point. Wires together parsing, splitting and edge allocation.

    Pipeline: load config -> parse blocks -> split (strategy) -> propagate
    section edges -> optional LLM smart-edge allocation -> link neighbor
    ids. Returns [] when no chunks were produced.
    """
    # 1. Load config (WP-15 compatibility).
    if config is None:
        config = get_chunk_config(note_type)

    fm, body_text = extract_frontmatter_from_text(md_text)
    note_status = fm.get("status", "").lower()

    primary_strategy = config.get("strategy", "sliding_window")
    enable_smart_edges = config.get("enable_smart_edge_allocation", False)

    # Drafts skip the LLM to save cost/time.
    if enable_smart_edges and note_status in ["draft", "initial_gen"]:
        logger.info(f"Chunker: Skipping Smart Edges for draft '{note_id}'.")
        enable_smart_edges = False

    # 2. Parsing & splitting (strategies are CPU-bound -> worker thread).
    blocks, doc_title = parse_blocks(md_text)

    if primary_strategy == "by_heading":
        chunks = await asyncio.to_thread(_strategy_by_heading, blocks, config, note_id, doc_title)
    else:
        chunks = await asyncio.to_thread(_strategy_sliding_window, blocks, config, note_id, doc_title)

    if not chunks:
        return []

    # 3. NEW: propagation BEFORE smart edge allocation —
    # repairs the edges missing from multiline callouts.
    chunks = _propagate_section_edges(chunks)

    # 4. Smart edges (LLM).
    if enable_smart_edges:
        chunks = await _run_smart_edge_allocation(chunks, md_text, note_id, note_type)

    # 5. Neighbor linking (prev/next chunk ids).
    for i, ch in enumerate(chunks):
        ch.neighbors_prev = chunks[i-1].id if i > 0 else None
        ch.neighbors_next = chunks[i+1].id if i < len(chunks)-1 else None

    return chunks
|
||||
|
||||
def _extract_all_edges_from_md(md_text: str, note_id: str, note_type: str) -> List[str]:
    """
    Helper: collect ALL edge candidates for the LLM candidate pool.

    Combines the standard derive_edges extraction (run over the full note
    as one dummy chunk) with the robust multiline-callout parser. Returns
    deduplicated "kind:target" strings; structural kinds
    (belongs_to/next/prev/backlink) are excluded.
    """
    # A. Via derive_edges (standard path).
    dummy_chunk = {
        "chunk_id": f"{note_id}#full",
        "text": md_text,
        "content": md_text,
        "window": md_text,
        "type": note_type
    }
    # Mind the adjusted signature (WP-15 fix).
    raw_edges = build_edges_for_note(
        note_id,
        [dummy_chunk],
        note_level_references=None,
        include_note_scope_refs=False
    )
    all_candidates = set()
    for e in raw_edges:
        kind = e.get("kind")
        target = e.get("target_id")
        if target and kind not in ["belongs_to", "next", "prev", "backlink"]:
            all_candidates.add(f"{kind}:{target}")

    # B. Via the robust parser (NEW) — catches the multiline callouts.
    robust_edges = _parse_edges_robust(md_text)
    all_candidates.update(robust_edges)

    return list(all_candidates)
|
||||
|
||||
async def _run_smart_edge_allocation(chunks: List[Chunk], full_text: str, note_id: str, note_type: str) -> List[Chunk]:
    """
    The LLM step (WP-15): filter irrelevant edges per chunk.

    Each chunk is checked concurrently against the full candidate pool;
    confirmed edges are written physically into text/window as
    [[rel:kind|target]] tokens. Candidates the LLM assigned to no chunk
    are broadcast to ALL chunks as a safety fallback so nothing is lost.
    """
    analyzer = get_semantic_analyzer()
    candidate_list = _extract_all_edges_from_md(full_text, note_id, note_type)

    if not candidate_list:
        return chunks

    # One LLM call per chunk, gathered concurrently.
    tasks = []
    for chunk in chunks:
        tasks.append(analyzer.assign_edges_to_chunk(chunk.text, candidate_list, note_type))

    results_per_chunk = await asyncio.gather(*tasks)

    assigned_edges_global = set()

    for i, confirmed_edges in enumerate(results_per_chunk):
        chunk = chunks[i]
        chunk.suggested_edges = confirmed_edges
        assigned_edges_global.update(confirmed_edges)

        if confirmed_edges:
            # Smart edges are also written physically into the text.
            # NOTE(review): e.split(':')[1] keeps only the first segment after
            # the colon — verify that targets never contain ':' themselves.
            injection_str = "\n" + " ".join([f"[[rel:{e.split(':')[0]}|{e.split(':')[1]}]]" for e in confirmed_edges if ':' in e])
            chunk.text += injection_str
            chunk.window += injection_str

    # Fallback for edges the LLM assigned nowhere
    # (safety net so nothing gets lost).
    unassigned = set(candidate_list) - assigned_edges_global
    if unassigned:
        fallback_str = "\n" + " ".join([f"[[rel:{e.split(':')[0]}|{e.split(':')[1]}]]" for e in unassigned if ':' in e])
        for chunk in chunks:
            chunk.text += fallback_str
            chunk.window += fallback_str
            if chunk.suggested_edges is None: chunk.suggested_edges = []
            chunk.suggested_edges.extend(list(unassigned))

    return chunks
|
||||
10
app/core/chunking/__init__.py
Normal file
10
app/core/chunking/__init__.py
Normal file
|
|
@ -0,0 +1,10 @@
|
|||
"""
|
||||
FILE: app/core/chunking/__init__.py
|
||||
DESCRIPTION: Package-Einstiegspunkt für Chunking. Exportiert assemble_chunks.
|
||||
VERSION: 3.3.0
|
||||
"""
|
||||
from .chunking_processor import assemble_chunks
|
||||
from .chunking_utils import get_chunk_config, extract_frontmatter_from_text
|
||||
from .chunking_models import Chunk
|
||||
|
||||
__all__ = ["assemble_chunks", "get_chunk_config", "extract_frontmatter_from_text", "Chunk"]
|
||||
31
app/core/chunking/chunking_models.py
Normal file
31
app/core/chunking/chunking_models.py
Normal file
|
|
@ -0,0 +1,31 @@
|
|||
"""
|
||||
FILE: app/core/chunking/chunking_models.py
|
||||
DESCRIPTION: Datenklassen für das Chunking-System.
|
||||
"""
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List, Dict, Optional, Any
|
||||
|
||||
@dataclass
class RawBlock:
    """Represents one logical block produced by markdown parsing."""
    kind: str                     # "paragraph" or "heading"
    text: str                     # raw block content
    level: Optional[int]          # heading level (None for paragraphs)
    section_path: str             # path of the enclosing section, e.g. "/Intro"
    section_title: Optional[str]  # title of the enclosing H2 section
|
||||
|
||||
@dataclass
class Chunk:
    """The final chunk object for embedding and graph storage."""
    id: str                        # chunk id, e.g. "<note_id>#c00"
    note_id: str                   # owning note
    index: int                     # zero-based position within the note
    text: str                      # chunk body as stored
    window: str                    # text plus context prefix, used for embedding
    token_count: int               # estimated token count of `text`
    section_title: Optional[str]   # title of the enclosing H2 section
    section_path: str              # path of the enclosing section
    neighbors_prev: Optional[str]  # id of the previous chunk, if any
    neighbors_next: Optional[str]  # id of the next chunk, if any
    candidate_pool: List[Dict[str, Any]] = field(default_factory=list)  # WP-15b edge candidates
    suggested_edges: Optional[List[str]] = None  # confirmed "kind:target" strings
|
||||
93
app/core/chunking/chunking_parser.py
Normal file
93
app/core/chunking/chunking_parser.py
Normal file
|
|
@ -0,0 +1,93 @@
|
|||
"""
|
||||
FILE: app/core/chunking/chunking_parser.py
|
||||
DESCRIPTION: Zerlegt Markdown in Blöcke und extrahiert Kanten-Strings.
|
||||
"""
|
||||
import re
|
||||
from typing import List, Tuple, Set
|
||||
from .chunking_models import RawBlock
|
||||
from .chunking_utils import extract_frontmatter_from_text
|
||||
|
||||
_WS = re.compile(r'\s+')
|
||||
_SENT_SPLIT = re.compile(r'(?<=[.!?])\s+(?=[A-ZÄÖÜ0-9„(])')
|
||||
|
||||
def split_sentences(text: str) -> list[str]:
    """Split whitespace-normalized text into sentences."""
    collapsed = _WS.sub(' ', text.strip())
    if not collapsed:
        return []
    pieces = _SENT_SPLIT.split(collapsed)
    return [piece.strip() for piece in pieces if piece.strip()]
|
||||
|
||||
def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
    """Split text into logical units (paragraphs, headings).

    Returns (blocks, h1_title). H2 headings open a new section; H3-H6
    keep the parent H2's section_path but record their own level.
    """
    blocks = []
    h1_title = "Dokument"; section_path = "/"; current_h2 = None
    fm, text_without_fm = extract_frontmatter_from_text(md_text)
    # First H1 becomes the document title.
    h1_match = re.search(r'^#\s+(.*)', text_without_fm, re.MULTILINE)
    if h1_match: h1_title = h1_match.group(1).strip()
    lines = text_without_fm.split('\n')
    buffer = []

    for line in lines:
        stripped = line.strip()

        # Ignore H1 (it is the doc title).
        if stripped.startswith('# '):
            continue

        # Generic heading detection (H2 to H6) for flexible split levels.
        heading_match = re.match(r'^(#{2,6})\s+(.*)', stripped)
        if heading_match:
            # Flush the buffer (close the previous paragraph).
            if buffer:
                content = "\n".join(buffer).strip()
                if content: blocks.append(RawBlock("paragraph", content, None, section_path, current_h2))
                buffer = []

            level = len(heading_match.group(1))
            title = heading_match.group(2).strip()

            # Path logic: H2 sets the main path.
            if level == 2:
                current_h2 = title
                section_path = f"/{current_h2}"
            # For H3+ the section_path stays with the parent, only the level is recorded.

            blocks.append(RawBlock("heading", stripped, level, section_path, current_h2))

        elif not stripped:
            # Blank line terminates the current paragraph.
            if buffer:
                content = "\n".join(buffer).strip()
                if content: blocks.append(RawBlock("paragraph", content, None, section_path, current_h2))
                buffer = []
        else:
            buffer.append(line)

    # Flush the trailing paragraph, if any.
    if buffer:
        content = "\n".join(buffer).strip()
        if content: blocks.append(RawBlock("paragraph", content, None, section_path, current_h2))
    return blocks, h1_title
|
||||
|
||||
def parse_edges_robust(text: str) -> Set[str]:
    """Extract edge candidates (inline wikilinks and [!edge] callouts).

    Edge kinds are normalized to lower case. Returns "kind:target" strings.
    """
    edges = set()

    # Inline [[rel:kind|target]] links.
    for raw_kind, raw_target in re.findall(r'\[\[rel:([^\|\]]+)\|?([^\]]*)\]\]', text):
        kind = raw_kind.strip().lower()
        target = raw_target.strip()
        if kind and target:
            edges.add(f"{kind}:{target}")

    # Multiline callout blocks.
    active_type = None
    for raw_line in text.split('\n'):
        line = raw_line.strip()
        header = re.match(r'>\s*\[!edge\]\s*([^:\s]+)', line)
        if header:
            active_type = header.group(1).strip().lower()
            # Links may appear on the header line itself.
            for link in re.findall(r'\[\[([^\]]+)\]\]', line):
                if "rel:" not in link:
                    edges.add(f"{active_type}:{link}")
            continue
        if active_type and line.startswith('>'):
            # Continuation line inside the blockquote.
            for link in re.findall(r'\[\[([^\]]+)\]\]', line):
                if "rel:" not in link:
                    edges.add(f"{active_type}:{link}")
        elif not line.startswith('>'):
            # Blockquote ended -> leave edge mode.
            active_type = None

    return edges
|
||||
94
app/core/chunking/chunking_processor.py
Normal file
94
app/core/chunking/chunking_processor.py
Normal file
|
|
@ -0,0 +1,94 @@
|
|||
"""
|
||||
FILE: app/core/chunking/chunking_processor.py
|
||||
DESCRIPTION: Der zentrale Orchestrator für das Chunking-System.
|
||||
AUDIT v3.3.3: Wiederherstellung der "Gold-Standard" Qualität.
|
||||
- Integriert physikalische Kanten-Injektion (Propagierung).
|
||||
- Stellt H1-Kontext-Fenster sicher.
|
||||
- Baut den Candidate-Pool für die WP-15b Ingestion auf.
|
||||
"""
|
||||
import asyncio
|
||||
import re
|
||||
import logging
|
||||
from typing import List, Dict, Optional
|
||||
from .chunking_models import Chunk
|
||||
from .chunking_utils import get_chunk_config, extract_frontmatter_from_text
|
||||
from .chunking_parser import parse_blocks, parse_edges_robust
|
||||
from .chunking_strategies import strategy_sliding_window, strategy_by_heading
|
||||
from .chunking_propagation import propagate_section_edges
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
async def assemble_chunks(note_id: str, md_text: str, note_type: str, config: Optional[Dict] = None) -> List[Chunk]:
    """
    Split a note into chunks and enrich them for ingestion.

    Combines the splitting strategies with physical edge-context enrichment
    and builds the WP-15b candidate pool on every chunk.
    """
    # 1. Configuration & parsing
    cfg = config if config is not None else get_chunk_config(note_type)
    _fm, body_text = extract_frontmatter_from_text(md_text)
    blocks, doc_title = parse_blocks(md_text)

    # H1 prefix used for the embedding windows.
    h1_prefix = f"# {doc_title}" if doc_title else ""

    # 2. Apply the splitting strategy (CPU-bound, run in a worker thread).
    if cfg.get("strategy") == "by_heading":
        chunks = await asyncio.to_thread(strategy_by_heading, blocks, cfg, note_id, doc_title)
    else:
        # sliding_window uses the context prefix for the window field.
        chunks = await asyncio.to_thread(strategy_sliding_window, blocks, cfg, note_id, context_prefix=h1_prefix)

    if not chunks:
        return []

    # 3. Physical context enrichment: writes callout/inline edges into the text.
    chunks = propagate_section_edges(chunks)

    def _append_candidates(chunk, edge_strings, provenance):
        # Turn "kind:target" strings into candidate-pool entries.
        for edge in edge_strings:
            parts = edge.split(':', 1)
            if len(parts) == 2:
                chunk.candidate_pool.append({"kind": parts[0], "to": parts[1], "provenance": provenance})

    # 4. WP-15b candidate pool: explicit edges from the (enriched) chunk text.
    for chunk in chunks:
        _append_candidates(chunk, parse_edges_robust(chunk.text), "explicit")

    # 5. Global pool: unassigned edges collected at the end of the document.
    pool_match = re.search(
        r'###?\s*(?:Unzugeordnete Kanten|Edge Pool|Candidates)\s*\n(.*?)(?:\n#|$)',
        body_text,
        re.DOTALL | re.IGNORECASE
    )
    if pool_match:
        # These edges are flagged as "global_pool" for the later AI validation.
        global_edges = parse_edges_robust(pool_match.group(1))
        for chunk in chunks:
            _append_candidates(chunk, global_edges, "global_pool")

    # 6. De-duplicate each candidate pool.
    for chunk in chunks:
        seen = set()
        deduped = []
        for cand in chunk.candidate_pool:
            key = (cand["kind"], cand["to"], cand["provenance"])
            if key in seen:
                continue
            seen.add(key)
            deduped.append(cand)
        chunk.candidate_pool = deduped

    # Link neighbor chunk IDs for graph traversal.
    last = len(chunks) - 1
    for i, chunk in enumerate(chunks):
        chunk.neighbors_prev = chunks[i - 1].id if i > 0 else None
        chunk.neighbors_next = chunks[i + 1].id if i < last else None

    return chunks
|
||||
59
app/core/chunking/chunking_propagation.py
Normal file
59
app/core/chunking/chunking_propagation.py
Normal file
|
|
@ -0,0 +1,59 @@
|
|||
"""
|
||||
FILE: app/core/chunking/chunking_propagation.py
|
||||
DESCRIPTION: Injiziert Sektions-Kanten physisch in den Text (Embedding-Enrichment).
|
||||
Stellt die "Gold-Standard"-Qualität von v3.1.0 wieder her.
|
||||
VERSION: 3.3.1
|
||||
STATUS: Active
|
||||
"""
|
||||
from typing import List, Dict, Set
|
||||
from .chunking_models import Chunk
|
||||
from .chunking_parser import parse_edges_robust
|
||||
|
||||
def propagate_section_edges(chunks: List[Chunk]) -> List[Chunk]:
    """
    Aggregate explicit edges per section and broadcast them into every chunk
    of that section (text AND window). Essential so relations end up in the
    embedding vectors.
    """
    # Phase 1: collect all explicit edges per section path.
    edges_by_section: Dict[str, Set[str]] = {}
    for chunk in chunks:
        # Root level "/" is ignored (too global); focus on concrete chapters.
        path = chunk.section_path
        if not path or path == "/":
            continue
        found = parse_edges_robust(chunk.text)
        if found:
            edges_by_section.setdefault(path, set()).update(found)

    # Phase 2: inject the aggregated edges back into every chunk of the section.
    for chunk in chunks:
        section_edges = edges_by_section.get(chunk.section_path)
        if not section_edges:
            continue

        missing_tokens = []
        for edge in section_edges:
            kind, target = edge.split(':', 1)
            token = f"[[rel:{kind}|{target}]]"
            # Only inject edges that are not already present in the text.
            if token not in chunk.text:
                missing_tokens.append(token)

        if not missing_tokens:
            continue

        # Physical enrichment (the v3.1.0 quality fix); the triple newline
        # keeps the injected block cleanly separated in the embedding window.
        addition = "\n\n\n" + " ".join(missing_tokens)
        chunk.text += addition

        # Crucial: the window is what Qdrant searches, so enrich it as well.
        if chunk.window:
            chunk.window += addition
        else:
            chunk.window = chunk.text

    return chunks
|
||||
142
app/core/chunking/chunking_strategies.py
Normal file
142
app/core/chunking/chunking_strategies.py
Normal file
|
|
@ -0,0 +1,142 @@
|
|||
"""
|
||||
FILE: app/core/chunking/chunking_strategies.py
|
||||
DESCRIPTION: Mathematische Splitting-Strategien.
|
||||
AUDIT v3.3.2: 100% Konformität zur 'by_heading' Spezifikation.
|
||||
- Implementiert Hybrid-Safety-Net (Sliding Window für Übergrößen).
|
||||
- Breadcrumb-Kontext im Window (H1 > H2).
|
||||
- Sliding Window mit H1-Kontext (Gold-Standard v3.1.0).
|
||||
"""
|
||||
from typing import List, Dict, Any, Optional
|
||||
from .chunking_models import RawBlock, Chunk
|
||||
from .chunking_utils import estimate_tokens
|
||||
from .chunking_parser import split_sentences
|
||||
|
||||
def _create_context_win(doc_title: str, sec_title: Optional[str], text: str) -> str:
|
||||
"""Baut den Breadcrumb-Kontext für das Embedding-Fenster."""
|
||||
parts = []
|
||||
if doc_title: parts.append(doc_title)
|
||||
if sec_title and sec_title != doc_title: parts.append(sec_title)
|
||||
prefix = " > ".join(parts)
|
||||
return f"{prefix}\n{text}".strip() if prefix else text
|
||||
|
||||
def strategy_sliding_window(blocks: List[RawBlock],
                            config: Dict[str, Any],
                            note_id: str,
                            context_prefix: str = "") -> List[Chunk]:
    """
    Merge consecutive blocks and cut at roughly 'target' tokens.

    Heading blocks are skipped during splitting to keep paragraph context
    together; oversized buffers are re-split on sentence boundaries with a
    token overlap carried between successive chunks.

    Args:
        blocks: Parsed raw blocks of the note (paragraphs and headings).
        config: Chunking profile; reads 'target', 'max' and 'overlap'.
        note_id: ID of the parent note, used to derive chunk IDs.
        context_prefix: Optional H1 prefix prepended to each chunk's window.

    Returns:
        Ordered list of Chunk objects (neighbor links are set by the caller).
    """
    target = config.get("target", 400)
    max_tokens = config.get("max", 600)
    overlap_val = config.get("overlap", (50, 80))
    # Overlap may be configured as a (min, max) tuple; use its midpoint.
    overlap = sum(overlap_val) // 2 if isinstance(overlap_val, tuple) else overlap_val

    chunks: List[Chunk] = []
    buf: List[RawBlock] = []

    def _add(txt, sec, path):
        # Create one chunk; the window gets the H1 context prefix for embedding.
        idx = len(chunks)
        win = f"{context_prefix}\n{txt}".strip() if context_prefix else txt
        chunks.append(Chunk(
            id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx,
            text=txt, window=win, token_count=estimate_tokens(txt),
            section_title=sec, section_path=path,
            neighbors_prev=None, neighbors_next=None
        ))

    def flush():
        # Emit the buffered blocks as one chunk, or sentence-split when too big.
        nonlocal buf
        if not buf: return
        text_body = "\n\n".join([b.text for b in buf])
        # Section metadata is taken from the last buffered block.
        sec_title = buf[-1].section_title; sec_path = buf[-1].section_path

        if estimate_tokens(text_body) <= max_tokens:
            _add(text_body, sec_title, sec_path)
        else:
            # Oversized buffer: re-split on sentences, carrying ~'overlap'
            # tokens of trailing context into each following chunk.
            sents = split_sentences(text_body); cur_sents = []; cur_len = 0
            for s in sents:
                slen = estimate_tokens(s)
                if cur_len + slen > target and cur_sents:
                    _add(" ".join(cur_sents), sec_title, sec_path)
                    # Collect trailing sentences up to (strictly under) 'overlap' tokens.
                    ov_s = []; ov_l = 0
                    for os in reversed(cur_sents):
                        if ov_l + estimate_tokens(os) < overlap:
                            ov_s.insert(0, os); ov_l += estimate_tokens(os)
                        else: break
                    cur_sents = list(ov_s); cur_sents.append(s); cur_len = ov_l + slen
                else:
                    cur_sents.append(s); cur_len += slen
            if cur_sents:
                _add(" ".join(cur_sents), sec_title, sec_path)
        buf = []

    for b in blocks:
        # H2 headings are ignored so surrounding paragraphs keep their context.
        if b.kind == "heading": continue
        # Flush before the buffer would reach the target size.
        if estimate_tokens("\n\n".join([x.text for x in buf])) + estimate_tokens(b.text) >= target:
            flush()
        buf.append(b)
    flush()
    return chunks
|
||||
|
||||
def strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id: str, doc_title: str = "") -> List[Chunk]:
    """
    Split text on markdown headings with a hybrid safety net.

    Headings above 'split_level' always flush the buffer; headings at
    'split_level' flush either always (strict mode) or once 'target' tokens
    have accumulated. Buffers that still exceed 'max' tokens are re-split on
    sentence boundaries with a token overlap.

    Args:
        blocks: Parsed raw blocks (headings and paragraphs) in document order.
        config: Chunking profile; reads 'strict_heading_split', 'target',
            'max', 'split_level' and 'overlap'.
        note_id: ID of the parent note, used to derive chunk IDs.
        doc_title: Document (H1) title used for the breadcrumb window.

    Returns:
        Ordered list of Chunk objects (neighbor links are set by the caller).
    """
    strict = config.get("strict_heading_split", False)
    target = config.get("target", 400)
    max_tokens = config.get("max", 600)
    split_level = config.get("split_level", 2)
    # Midpoint of the configured (min, max) overlap range.
    overlap = sum(config.get("overlap", (50, 80))) // 2

    chunks: List[Chunk] = []
    buf: List[str] = []
    cur_tokens = 0

    def _add_to_chunks(txt, title, path):
        # Create one chunk with a breadcrumb context window (H1 > H2).
        idx = len(chunks)
        win = _create_context_win(doc_title, title, txt)
        chunks.append(Chunk(
            id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx,
            text=txt, window=win, token_count=estimate_tokens(txt),
            section_title=title, section_path=path,
            neighbors_prev=None, neighbors_next=None
        ))

    def _flush(title, path):
        # Emit the buffered text; sentence-split with overlap when oversized.
        nonlocal buf, cur_tokens
        if not buf: return
        full_text = "\n\n".join(buf)
        if estimate_tokens(full_text) <= max_tokens:
            _add_to_chunks(full_text, title, path)
        else:
            sents = split_sentences(full_text); cur_sents = []; sub_len = 0
            for s in sents:
                slen = estimate_tokens(s)
                if sub_len + slen > target and cur_sents:
                    _add_to_chunks(" ".join(cur_sents), title, path)
                    # Carry up to (strictly under) 'overlap' trailing tokens forward.
                    ov_s = []; ov_l = 0
                    for os in reversed(cur_sents):
                        if ov_l + estimate_tokens(os) < overlap:
                            ov_s.insert(0, os); ov_l += estimate_tokens(os)
                        else: break
                    cur_sents = list(ov_s); cur_sents.append(s); sub_len = ov_l + slen
                else: cur_sents.append(s); sub_len += slen
            if cur_sents: _add_to_chunks(" ".join(cur_sents), title, path)
        buf = []; cur_tokens = 0

    for b in blocks:
        if b.kind == "heading":
            # Higher-level headings always flush; split-level headings flush
            # in strict mode or once the target size has been reached.
            if b.level < split_level: _flush(b.section_title, b.section_path)
            elif b.level == split_level:
                if strict or cur_tokens >= target: _flush(b.section_title, b.section_path)
            continue
        bt = estimate_tokens(b.text)
        # Safety net: never let the buffer grow past 'max' tokens.
        if cur_tokens + bt > max_tokens and buf: _flush(b.section_title, b.section_path)
        buf.append(b.text); cur_tokens += bt
    if buf:
        # Final flush, attributed to the last block's section (if any).
        last_b = blocks[-1] if blocks else None
        _flush(last_b.section_title if last_b else None, last_b.section_path if last_b else "/")
    return chunks
|
||||
55
app/core/chunking/chunking_utils.py
Normal file
55
app/core/chunking/chunking_utils.py
Normal file
|
|
@ -0,0 +1,55 @@
|
|||
"""
|
||||
FILE: app/core/chunking/chunking_utils.py
|
||||
DESCRIPTION: Hilfswerkzeuge für Token-Schätzung und YAML-Konfiguration.
|
||||
"""
|
||||
import math
|
||||
import yaml
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any, Tuple
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Project base directory (four `.parent` hops up from this file).
BASE_DIR = Path(__file__).resolve().parent.parent.parent.parent
# Central chunking/type configuration file.
CONFIG_PATH = BASE_DIR / "config" / "types.yaml"
# Fallback profile used when types.yaml is missing or has no matching profile.
DEFAULT_PROFILE = {"strategy": "sliding_window", "target": 400, "max": 600, "overlap": (50, 80)}

# Process-wide cache for the parsed YAML config (filled on first successful load).
_CONFIG_CACHE = None
|
||||
|
||||
def load_yaml_config() -> Dict[str, Any]:
    """
    Load and memoize the global YAML configuration (config/types.yaml).

    Returns:
        The parsed configuration dict. Returns {} when the file is missing,
        empty, unreadable or does not contain a mapping. The result is
        cached process-wide after the first successful load.
    """
    global _CONFIG_CACHE
    if _CONFIG_CACHE is not None:
        return _CONFIG_CACHE
    if not CONFIG_PATH.exists():
        return {}
    try:
        with open(CONFIG_PATH, "r", encoding="utf-8") as f:
            # An empty YAML document parses to None; normalize to {} so the
            # declared Dict return type always holds and caching takes effect
            # (previously None was returned and never cached).
            data = yaml.safe_load(f) or {}
    except Exception as e:
        # Keep the original best-effort behavior, but leave a trace.
        logger.warning("Failed to load config %s: %s", CONFIG_PATH, e)
        return {}
    if not isinstance(data, dict):
        # A non-mapping top level (e.g. a list) would break all callers.
        data = {}
    _CONFIG_CACHE = data
    return data
|
||||
|
||||
def get_chunk_config(note_type: str) -> Dict[str, Any]:
    """Resolve the chunking profile for the given note type."""
    cfg = load_yaml_config()
    profiles = cfg.get("chunking_profiles", {})
    type_def = cfg.get("types", {}).get(note_type.lower(), {})
    # The per-type profile wins; otherwise fall back to the global default.
    default_name = cfg.get("defaults", {}).get("chunking_profile", "sliding_standard")
    profile_name = type_def.get("chunking_profile") or default_name
    resolved = profiles.get(profile_name, DEFAULT_PROFILE).copy()
    # YAML lists become tuples so overlap can be treated as a (min, max) pair.
    overlap = resolved.get("overlap")
    if isinstance(overlap, list):
        resolved["overlap"] = tuple(overlap)
    return resolved
|
||||
|
||||
def estimate_tokens(text: str) -> int:
    """Rough token estimate: ~4 characters per token, never below 1."""
    stripped_len = len(text.strip())
    approx = math.ceil(stripped_len / 4)
    return approx if approx >= 1 else 1
|
||||
|
||||
def extract_frontmatter_from_text(md_text: str) -> Tuple[Dict[str, Any], str]:
    """
    Split YAML frontmatter from a markdown document.

    Args:
        md_text: Raw markdown, optionally starting with a '--- ... ---' block.

    Returns:
        (frontmatter, body): the parsed frontmatter dict ({} when absent or
        invalid) and the remaining text with the frontmatter stripped.
    """
    import re
    fm_match = re.match(r'^\s*---\s*\n(.*?)\n---', md_text, re.DOTALL)
    if not fm_match:
        return {}, md_text
    try:
        frontmatter = yaml.safe_load(fm_match.group(1))
        if not isinstance(frontmatter, dict):
            frontmatter = {}
    except Exception:
        frontmatter = {}
    # Slice at the match end instead of running the regex a second time —
    # the previous re.sub pass duplicated the work of re.match above.
    return frontmatter, md_text[fm_match.end():].strip()
|
||||
35
app/core/database/__init__.py
Normal file
35
app/core/database/__init__.py
Normal file
|
|
@ -0,0 +1,35 @@
|
|||
"""
|
||||
PACKAGE: app.core.database
|
||||
DESCRIPTION: Zentrale Schnittstelle für alle Datenbank-Operationen (Qdrant).
|
||||
Bündelt Client-Initialisierung und Point-Konvertierung.
|
||||
"""
|
||||
from .qdrant import (
|
||||
QdrantConfig,
|
||||
get_client,
|
||||
ensure_collections,
|
||||
ensure_payload_indexes,
|
||||
collection_names
|
||||
)
|
||||
from .qdrant_points import (
|
||||
points_for_note,
|
||||
points_for_chunks,
|
||||
points_for_edges,
|
||||
upsert_batch,
|
||||
get_edges_for_sources,
|
||||
search_chunks_by_vector
|
||||
)
|
||||
|
||||
# Public export surface for the whole system.
__all__ = [
    "QdrantConfig",
    "get_client",
    "ensure_collections",
    "ensure_payload_indexes",
    "collection_names",
    "points_for_note",
    "points_for_chunks",
    "points_for_edges",
    "upsert_batch",
    "get_edges_for_sources",
    "search_chunks_by_vector"
]
|
||||
169
app/core/database/qdrant.py
Normal file
169
app/core/database/qdrant.py
Normal file
|
|
@ -0,0 +1,169 @@
|
|||
"""
|
||||
FILE: app/core/database/qdrant.py
|
||||
DESCRIPTION: Qdrant-Client Factory und Schema-Management.
|
||||
Erstellt Collections und Payload-Indizes.
|
||||
MODULARISIERUNG: Verschoben in das database-Paket für WP-14.
|
||||
VERSION: 2.2.1
|
||||
STATUS: Active
|
||||
DEPENDENCIES: qdrant_client, dataclasses, os
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import logging
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional, Tuple, Dict, List
|
||||
|
||||
from qdrant_client import QdrantClient
|
||||
from qdrant_client.http import models as rest
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Konfiguration
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@dataclass
class QdrantConfig:
    """Connection and schema settings for the Qdrant backend."""
    host: Optional[str] = None
    port: Optional[int] = None
    url: Optional[str] = None
    api_key: Optional[str] = None
    prefix: str = "mindnet"
    dim: int = 384
    distance: str = "Cosine"  # Cosine | Dot | Euclid
    on_disk_payload: bool = True

    @classmethod
    def from_env(cls) -> "QdrantConfig":
        """Build the configuration from environment variables.

        Either QDRANT_URL or QDRANT_HOST/QDRANT_PORT is used; the API key
        is optional.
        """
        env = os.getenv
        raw_port = env("QDRANT_PORT")
        return cls(
            host=env("QDRANT_HOST") or None,
            port=int(raw_port) if raw_port else None,
            url=env("QDRANT_URL") or None,
            api_key=env("QDRANT_API_KEY") or None,
            prefix=env("COLLECTION_PREFIX") or "mindnet",
            dim=int(env("VECTOR_DIM") or 384),
            distance=env("DISTANCE", "Cosine"),
            on_disk_payload=env("ON_DISK_PAYLOAD", "true").lower() == "true",
        )
|
||||
|
||||
|
||||
def get_client(cfg: QdrantConfig) -> QdrantClient:
    """Create a Qdrant client from the configuration (URL wins over host/port)."""
    if cfg.url:
        return QdrantClient(url=cfg.url, api_key=cfg.api_key, timeout=60.0)
    host = cfg.host or "127.0.0.1"
    port = cfg.port or 6333
    return QdrantClient(host=host, port=port, api_key=cfg.api_key, timeout=60.0)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Collections
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def collection_names(prefix: str) -> Tuple[str, str, str]:
    """Return the standardized (notes, chunks, edges) collection names."""
    return tuple(f"{prefix}_{suffix}" for suffix in ("notes", "chunks", "edges"))
|
||||
|
||||
|
||||
def _vector_params(dim: int, distance: str) -> rest.VectorParams:
    """Build the vector parameters for a collection schema."""
    # Distance: "Cosine" | "Dot" | "Euclid"; unknown names fall back to COSINE.
    # NOTE(review): qdrant enum attributes are upper-case (COSINE/DOT/EUCLID),
    # so the capitalized lookup may always hit the fallback — confirm.
    resolved = getattr(rest.Distance, distance.capitalize(), rest.Distance.COSINE)
    return rest.VectorParams(size=dim, distance=resolved)
|
||||
|
||||
|
||||
def ensure_collections(client: QdrantClient, prefix: str, dim: int) -> None:
    """Create the notes, chunks and edges collections if they do not exist."""
    notes, chunks, edges = collection_names(prefix)
    distance = os.getenv("DISTANCE", "Cosine")

    # Edges only carry a 1-dim dummy vector; they are queried via payload filters.
    plan = (
        (notes, _vector_params(dim, distance)),
        (chunks, _vector_params(dim, distance)),
        (edges, _vector_params(1, "Dot")),
    )
    for name, vector_cfg in plan:
        if not client.collection_exists(name):
            client.create_collection(
                collection_name=name,
                vectors_config=vector_cfg,
                on_disk_payload=True,
            )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Payload-Indizes
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _ensure_index(client: QdrantClient, collection: str, field: str, schema: rest.PayloadSchemaType) -> None:
    """Idempotently create a payload index for a single field."""
    try:
        client.create_payload_index(
            collection_name=collection,
            field_name=field,
            field_schema=schema,
            wait=True,
        )
    except Exception as e:
        # Qdrant raises when the index already exists; treat that as a no-op.
        logger.debug(f"Index check for {field} in {collection}: {e}")
|
||||
|
||||
|
||||
def ensure_payload_indexes(client: QdrantClient, prefix: str) -> None:
    """
    Ensure all payload indexes needed for search exist.

    - notes: note_id, type, title, updated, tags
    - chunks: note_id, chunk_id, index, type, tags
    - edges: note_id, kind, scope, source_id, target_id, chunk_id
    """
    notes, chunks, edges = collection_names(prefix)
    keyword = rest.PayloadSchemaType.KEYWORD
    text = rest.PayloadSchemaType.TEXT
    integer = rest.PayloadSchemaType.INTEGER

    # Table-driven: one (field, schema) list per collection.
    index_plan = {
        notes: (
            ("note_id", keyword), ("type", keyword), ("title", text),
            ("updated", integer), ("tags", keyword),
        ),
        chunks: (
            ("note_id", keyword), ("chunk_id", keyword), ("index", integer),
            ("type", keyword), ("tags", keyword),
        ),
        edges: (
            ("note_id", keyword), ("kind", keyword), ("scope", keyword),
            ("source_id", keyword), ("target_id", keyword), ("chunk_id", keyword),
        ),
    }
    for collection, fields in index_plan.items():
        for field, schema in fields:
            _ensure_index(client, collection, field, schema)
|
||||
|
||||
|
||||
# Public API of this module: client factory plus schema management.
__all__ = [
    "QdrantConfig",
    "get_client",
    "ensure_collections",
    "ensure_payload_indexes",
    "collection_names",
]
|
||||
296
app/core/database/qdrant_points.py
Normal file
296
app/core/database/qdrant_points.py
Normal file
|
|
@ -0,0 +1,296 @@
|
|||
"""
|
||||
FILE: app/core/database/qdrant_points.py
|
||||
DESCRIPTION: Object-Mapper für Qdrant. Konvertiert JSON-Payloads (Notes, Chunks, Edges) in PointStructs und generiert deterministische UUIDs.
|
||||
VERSION: 1.5.0
|
||||
STATUS: Active
|
||||
DEPENDENCIES: qdrant_client, uuid, os
|
||||
LAST_ANALYSIS: 2025-12-15
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import os
|
||||
import uuid
|
||||
from typing import List, Tuple, Iterable, Optional, Dict, Any
|
||||
|
||||
from qdrant_client.http import models as rest
|
||||
from qdrant_client import QdrantClient
|
||||
|
||||
# --------------------- ID helpers ---------------------
|
||||
|
||||
def _to_uuid(stable_key: str) -> str:
|
||||
return str(uuid.uuid5(uuid.NAMESPACE_URL, stable_key))
|
||||
|
||||
def _names(prefix: str) -> Tuple[str, str, str]:
|
||||
return f"{prefix}_notes", f"{prefix}_chunks", f"{prefix}_edges"
|
||||
|
||||
# --------------------- Points builders ---------------------
|
||||
|
||||
def points_for_note(prefix: str, note_payload: dict, note_vec: List[float] | None, dim: int) -> Tuple[str, List[rest.PointStruct]]:
    """Map a note payload (plus optional vector) onto a single PointStruct."""
    notes_col = _names(prefix)[0]
    # Missing embeddings become a zero vector of the configured dimension.
    vector = [0.0] * int(dim) if note_vec is None else note_vec
    stable_id = note_payload.get("note_id") or note_payload.get("id") or "missing-note-id"
    point = rest.PointStruct(id=_to_uuid(stable_id), vector=vector, payload=note_payload)
    return notes_col, [point]
|
||||
|
||||
def points_for_chunks(prefix: str, chunk_payloads: List[dict], vectors: List[List[float]]) -> Tuple[str, List[rest.PointStruct]]:
    """Map chunk payloads and their vectors onto PointStructs for <prefix>_chunks."""
    chunks_col = _names(prefix)[1]
    points: List[rest.PointStruct] = []
    for position, (payload, vec) in enumerate(zip(chunk_payloads, vectors), start=1):
        chunk_id = payload.get("chunk_id") or payload.get("id")
        if not chunk_id:
            # Derive a stable fallback ID from the parent note and the position.
            parent = payload.get("note_id") or payload.get("parent_note_id") or "missing-note"
            chunk_id = f"{parent}#{position}"
        # Normalize: make sure the payload carries its chunk_id.
        payload["chunk_id"] = chunk_id
        points.append(rest.PointStruct(id=_to_uuid(chunk_id), vector=vec, payload=payload))
    return chunks_col, points
|
||||
|
||||
def _normalize_edge_payload(pl: dict) -> dict:
|
||||
kind = pl.get("kind") or pl.get("edge_type") or "edge"
|
||||
source_id = pl.get("source_id") or pl.get("src_id") or "unknown-src"
|
||||
target_id = pl.get("target_id") or pl.get("dst_id") or "unknown-tgt"
|
||||
seq = pl.get("seq") or pl.get("order") or pl.get("index")
|
||||
|
||||
pl.setdefault("kind", kind)
|
||||
pl.setdefault("source_id", source_id)
|
||||
pl.setdefault("target_id", target_id)
|
||||
if seq is not None and "seq" not in pl:
|
||||
pl["seq"] = seq
|
||||
return pl
|
||||
|
||||
def points_for_edges(prefix: str, edge_payloads: List[dict]) -> Tuple[str, List[rest.PointStruct]]:
    """Map edge payloads onto PointStructs (1-dim dummy vector; payload-driven)."""
    edges_col = _names(prefix)[2]
    points: List[rest.PointStruct] = []
    for raw in edge_payloads:
        pl = _normalize_edge_payload(raw)
        edge_id = pl.get("edge_id")
        if not edge_id:
            # Deterministic fallback ID: kind:source->target#seq
            edge_id = "{}:{}->{}#{}".format(
                pl.get("kind", "edge"),
                pl.get("source_id", "unknown-src"),
                pl.get("target_id", "unknown-tgt"),
                pl.get("seq") or "",
            )
        pl["edge_id"] = edge_id
        points.append(rest.PointStruct(id=_to_uuid(edge_id), vector=[0.0], payload=pl))
    return edges_col, points
|
||||
|
||||
# --------------------- Vector schema & overrides ---------------------
|
||||
|
||||
def _preferred_name(candidates: List[str]) -> str:
|
||||
for k in ("text", "default", "embedding", "content"):
|
||||
if k in candidates:
|
||||
return k
|
||||
return sorted(candidates)[0]
|
||||
|
||||
def _env_override_for_collection(collection: str) -> Optional[str]:
|
||||
"""
|
||||
Returns:
|
||||
- "__single__" to force single-vector
|
||||
- concrete name (str) to force named-vector with that name
|
||||
- None to auto-detect
|
||||
"""
|
||||
base = os.getenv("MINDNET_VECTOR_NAME")
|
||||
if collection.endswith("_notes"):
|
||||
base = os.getenv("NOTES_VECTOR_NAME", base)
|
||||
elif collection.endswith("_chunks"):
|
||||
base = os.getenv("CHUNKS_VECTOR_NAME", base)
|
||||
elif collection.endswith("_edges"):
|
||||
base = os.getenv("EDGES_VECTOR_NAME", base)
|
||||
|
||||
if not base:
|
||||
return None
|
||||
val = base.strip()
|
||||
if val.lower() in ("__single__", "single"):
|
||||
return "__single__"
|
||||
return val # concrete name
|
||||
|
||||
def _get_vector_schema(client: QdrantClient, collection_name: str) -> dict:
    """
    Inspect a collection's vector layout.

    Returns {"kind": "single", "size": int|None} or
    {"kind": "named", "names": [...], "primary": str}.
    """
    try:
        info = client.get_collection(collection_name=collection_name)
        vecs = getattr(info, "vectors", None)
        # A single-vector schema exposes an integer .size directly.
        size = getattr(vecs, "size", None)
        if isinstance(size, int):
            return {"kind": "single", "size": size}
        # A named-vector schema carries a dict-like .config mapping.
        cfg = getattr(vecs, "config", None)
        if isinstance(cfg, dict) and cfg:
            names = list(cfg)
            return {"kind": "named", "names": names, "primary": _preferred_name(names)}
    except Exception:
        # Best effort: fall through to the single-vector default below.
        pass
    return {"kind": "single", "size": None}
|
||||
|
||||
def _as_named(points: List[rest.PointStruct], name: str) -> List[rest.PointStruct]:
    """Rewrap point vectors as named vectors under *name* (best effort)."""
    converted: List[rest.PointStruct] = []
    for pt in points:
        vec = getattr(pt, "vector", None)
        if isinstance(vec, dict):
            if name in vec:
                converted.append(pt)
                continue
            # Reuse any existing named entry; an empty dict falls back to [0.0].
            try:
                fallback = list(next(iter(vec.values())))
            except Exception:
                fallback = [0.0]
            converted.append(rest.PointStruct(id=pt.id, vector={name: fallback}, payload=pt.payload))
        elif vec is not None:
            converted.append(rest.PointStruct(id=pt.id, vector={name: vec}, payload=pt.payload))
        else:
            converted.append(pt)
    return converted
|
||||
|
||||
# --------------------- Qdrant ops ---------------------
|
||||
|
||||
def upsert_batch(client: QdrantClient, collection: str, points: List[rest.PointStruct]) -> None:
    """
    Upsert points, adapting to the collection's vector schema.

    Resolution order: explicit ENV override, then auto-detected schema,
    then a plain single-vector upsert as the fallback.
    """
    if not points:
        return

    def _do(batch):
        client.upsert(collection_name=collection, points=batch, wait=True)

    # 1) ENV overrides come first.
    override = _env_override_for_collection(collection)
    if override == "__single__":
        _do(points)
        return
    if isinstance(override, str):
        _do(_as_named(points, override))
        return

    # 2) Auto-detect the collection schema.
    schema = _get_vector_schema(client, collection)
    if schema.get("kind") == "named":
        name = schema.get("primary") or _preferred_name(schema.get("names") or [])
        _do(_as_named(points, name))
        return

    # 3) Fallback: single-vector collection.
    _do(points)
|
||||
|
||||
# --- Optional search helpers ---
|
||||
|
||||
def _filter_any(field: str, values: Iterable[str]) -> rest.Filter:
    """Build an OR-filter matching *field* against any of *values*."""
    conditions = [rest.FieldCondition(key=field, match=rest.MatchValue(value=v)) for v in values]
    return rest.Filter(should=conditions)
|
||||
|
||||
def _merge_filters(*filters: Optional[rest.Filter]) -> Optional[rest.Filter]:
    """AND-combine filters; 'should' groups are nested to preserve OR semantics."""
    present = [f for f in filters if f is not None]
    if not present:
        return None
    if len(present) == 1:
        return present[0]
    combined = []
    for f in present:
        if getattr(f, "must", None):
            combined.extend(f.must)
        if getattr(f, "should", None):
            combined.append(rest.Filter(should=f.should))
    return rest.Filter(must=combined)
|
||||
|
||||
def _filter_from_dict(filters: Optional[Dict[str, Any]]) -> Optional[rest.Filter]:
    """Translate a simple {field: value-or-values} mapping into a Qdrant filter."""
    if not filters:
        return None
    parts = []
    for field, value in filters.items():
        if isinstance(value, (list, tuple, set)):
            # Multiple values: OR across them.
            parts.append(_filter_any(field, [str(x) for x in value]))
        else:
            parts.append(rest.Filter(must=[rest.FieldCondition(key=field, match=rest.MatchValue(value=value))]))
    return _merge_filters(*parts)
|
||||
|
||||
def search_chunks_by_vector(client: QdrantClient, prefix: str, vector: List[float], top: int = 10, filters: Optional[Dict[str, Any]] = None) -> List[Tuple[str, float, dict]]:
    """Vector search over <prefix>_chunks; returns (id, score, payload) triples."""
    chunks_col = _names(prefix)[1]
    hits = client.search(
        collection_name=chunks_col,
        query_vector=vector,
        limit=top,
        with_payload=True,
        with_vectors=False,
        query_filter=_filter_from_dict(filters),
    )
    return [(str(h.id), float(h.score), dict(h.payload or {})) for h in hits]
|
||||
|
||||
|
||||
# --- Edge retrieval helper ---
|
||||
|
||||
def get_edges_for_sources(
    client: QdrantClient,
    prefix: str,
    source_ids: Iterable[str],
    edge_types: Optional[Iterable[str]] = None,
    limit: int = 2048,
) -> List[Dict[str, Any]]:
    """Retrieve edge payloads from the <prefix>_edges collection.

    Edges are matched on ``source_id`` (typically chunk_ids or note_ids) and
    optionally restricted to the given ``edge_types`` via their ``kind`` field.
    At most ``limit`` payload dicts are returned; each dict carries keys such
    as ``note_id``, ``chunk_id``, ``kind``, ``scope``, ``source_id``,
    ``target_id``, ``rule_id`` and ``confidence``.
    """
    ids = [str(s) for s in source_ids]
    if not ids or limit <= 0:
        return []

    # Resolve the edges collection name for this prefix.
    _, _, edges_col = _names(prefix)

    # source_id IN ids, optionally AND kind IN edge_types.
    source_filter = _filter_any("source_id", ids)
    kind_filter = _filter_any("kind", [str(k) for k in edge_types]) if edge_types else None
    scroll_filter = _merge_filters(source_filter, kind_filter)

    payloads: List[Dict[str, Any]] = []
    offset = None
    budget = int(limit)

    # Paginated scroll; we only need payloads, never vectors.
    while budget > 0:
        records, offset = client.scroll(
            collection_name=edges_col,
            scroll_filter=scroll_filter,
            limit=min(256, budget),
            with_payload=True,
            with_vectors=False,
            offset=offset,
        )
        if not records:
            break
        for record in records:
            payloads.append(dict(record.payload or {}))
            budget -= 1
            if budget <= 0:
                break
        if offset is None:
            # Qdrant signals "no more pages" with a None offset.
            break

    return payloads
|
||||
|
|
@ -1,420 +1,10 @@
|
|||
"""
|
||||
FILE: app/core/derive_edges.py
|
||||
DESCRIPTION: Extrahiert Graph-Kanten aus Text. Unterstützt Wikilinks, Inline-Relations ([[rel:type|target]]) und Obsidian Callouts.
|
||||
VERSION: 2.0.0
|
||||
STATUS: Active
|
||||
DEPENDENCIES: re, os, yaml, typing
|
||||
EXTERNAL_CONFIG: config/types.yaml
|
||||
LAST_ANALYSIS: 2025-12-15
|
||||
DESCRIPTION: Facade für das neue graph Package.
|
||||
WP-14: Modularisierung abgeschlossen.
|
||||
VERSION: 2.2.0
|
||||
"""
|
||||
from .graph.graph_derive_edges import build_edges_for_note
|
||||
from .graph.graph_utils import PROVENANCE_PRIORITY
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import re
|
||||
from typing import Iterable, List, Optional, Tuple, Set, Dict
|
||||
|
||||
try:
|
||||
import yaml # optional, nur für types.yaml
|
||||
except Exception: # pragma: no cover
|
||||
yaml = None
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Utilities
|
||||
# --------------------------------------------------------------------------- #
|
||||
|
||||
def _get(d: dict, *keys, default=None):
|
||||
for k in keys:
|
||||
if isinstance(d, dict) and k in d and d[k] is not None:
|
||||
return d[k]
|
||||
return default
|
||||
|
||||
def _chunk_text_for_refs(chunk: dict) -> str:
    """Pick the best text field of a chunk: window > text > content > raw, else ""."""
    for field in ("window", "text", "content", "raw"):
        value = _get(chunk, field)
        if value:
            return value
    return ""
|
||||
|
||||
def _dedupe_seq(seq: Iterable[str]) -> List[str]:
|
||||
seen: Set[str] = set()
|
||||
out: List[str] = []
|
||||
for s in seq:
|
||||
if s not in seen:
|
||||
seen.add(s)
|
||||
out.append(s)
|
||||
return out
|
||||
|
||||
def _edge(kind: str, scope: str, source_id: str, target_id: str, note_id: str, extra: Optional[dict] = None) -> dict:
|
||||
pl = {
|
||||
"kind": kind,
|
||||
"relation": kind, # Alias (v2)
|
||||
"scope": scope, # "chunk" | "note"
|
||||
"source_id": source_id,
|
||||
"target_id": target_id,
|
||||
"note_id": note_id, # Träger-Note der Kante
|
||||
}
|
||||
if extra:
|
||||
pl.update(extra)
|
||||
return pl
|
||||
|
||||
def _mk_edge_id(kind: str, s: str, t: str, scope: str, rule_id: Optional[str] = None) -> str:
|
||||
base = f"{kind}:{s}->{t}#{scope}"
|
||||
if rule_id:
|
||||
base += f"|{rule_id}"
|
||||
try:
|
||||
import hashlib
|
||||
return hashlib.blake2s(base.encode("utf-8"), digest_size=12).hexdigest()
|
||||
except Exception: # pragma: no cover
|
||||
return base
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Typen-Registry (types.yaml)
|
||||
# --------------------------------------------------------------------------- #
|
||||
|
||||
def _env(n: str, default: Optional[str] = None) -> str:
|
||||
v = os.getenv(n)
|
||||
return v if v is not None else (default or "")
|
||||
|
||||
def _load_types_registry() -> dict:
|
||||
"""Lädt die YAML-Registry aus MINDNET_TYPES_FILE oder ./config/types.yaml"""
|
||||
p = _env("MINDNET_TYPES_FILE", "./config/types.yaml")
|
||||
if not os.path.isfile(p) or yaml is None:
|
||||
return {}
|
||||
try:
|
||||
with open(p, "r", encoding="utf-8") as f:
|
||||
data = yaml.safe_load(f) or {}
|
||||
return data
|
||||
except Exception:
|
||||
return {}
|
||||
|
||||
def _get_types_map(reg: dict) -> dict:
|
||||
if isinstance(reg, dict) and isinstance(reg.get("types"), dict):
|
||||
return reg["types"]
|
||||
return reg if isinstance(reg, dict) else {}
|
||||
|
||||
def _edge_defaults_for(note_type: Optional[str], reg: dict) -> List[str]:
    """Resolve the edge_defaults list for *note_type*.

    Lookup order:
      1. reg['types'][note_type]['edge_defaults']
      2. reg['defaults'|'default'|'global']['edge_defaults']
      3. [] when nothing matches
    """
    types_map = _get_types_map(reg)
    if note_type and isinstance(types_map, dict):
        entry = types_map.get(note_type)
        if isinstance(entry, dict) and isinstance(entry.get("edge_defaults"), list):
            return [str(item) for item in entry["edge_defaults"] if isinstance(item, str)]
    for fallback_key in ("defaults", "default", "global"):
        section = reg.get(fallback_key)
        if isinstance(section, dict) and isinstance(section.get("edge_defaults"), list):
            return [str(item) for item in section["edge_defaults"] if isinstance(item, str)]
    return []
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Parser für Links / Relationen
|
||||
# --------------------------------------------------------------------------- #
|
||||
|
||||
# Normale Wikilinks (Fallback)
|
||||
_WIKILINK_RE = re.compile(r"\[\[(?:[^\|\]]+\|)?([a-zA-Z0-9_\-#:. ]+)\]\]")
|
||||
|
||||
# Getypte Inline-Relationen:
|
||||
# [[rel:KIND | Target]]
|
||||
# [[rel:KIND Target]]
|
||||
_REL_PIPE = re.compile(r"\[\[\s*rel:(?P<kind>[a-z_]+)\s*\|\s*(?P<target>[^\]]+?)\s*\]\]", re.IGNORECASE)
|
||||
_REL_SPACE = re.compile(r"\[\[\s*rel:(?P<kind>[a-z_]+)\s+(?P<target>[^\]]+?)\s*\]\]", re.IGNORECASE)
|
||||
# rel: KIND [[Target]] (reines Textmuster)
|
||||
_REL_TEXT = re.compile(r"rel\s*:\s*(?P<kind>[a-z_]+)\s*\[\[\s*(?P<target>[^\]]+?)\s*\]\]", re.IGNORECASE)
|
||||
|
||||
def _extract_typed_relations(text: str) -> Tuple[List[Tuple[str,str]], str]:
|
||||
"""
|
||||
Gibt Liste (kind, target) zurück und den Text mit entfernten getypten Relation-Links,
|
||||
damit die generische Wikilink-Erkennung sie nicht doppelt zählt.
|
||||
Unterstützt drei Varianten:
|
||||
- [[rel:KIND | Target]]
|
||||
- [[rel:KIND Target]]
|
||||
- rel: KIND [[Target]]
|
||||
"""
|
||||
pairs: List[Tuple[str,str]] = []
|
||||
def _collect(m):
|
||||
k = (m.group("kind") or "").strip().lower()
|
||||
t = (m.group("target") or "").strip()
|
||||
if k and t:
|
||||
pairs.append((k, t))
|
||||
return "" # Link entfernen
|
||||
|
||||
text = _REL_PIPE.sub(_collect, text)
|
||||
text = _REL_SPACE.sub(_collect, text)
|
||||
text = _REL_TEXT.sub(_collect, text)
|
||||
return pairs, text
|
||||
|
||||
# Obsidian Callout Parser
|
||||
_CALLOUT_START = re.compile(r"^\s*>\s*\[!edge\]\s*(.*)$", re.IGNORECASE)
|
||||
_REL_LINE = re.compile(r"^(?P<kind>[a-z_]+)\s*:\s*(?P<targets>.+?)\s*$", re.IGNORECASE)
|
||||
_WIKILINKS_IN_LINE = re.compile(r"\[\[([^\]]+)\]\]")
|
||||
|
||||
def _extract_callout_relations(text: str) -> Tuple[List[Tuple[str,str]], str]:
|
||||
"""
|
||||
Findet [!edge]-Callouts und extrahiert (kind, target). Entfernt den gesamten
|
||||
Callout-Block aus dem Text (damit Wikilinks daraus nicht zusätzlich als
|
||||
"references" gezählt werden).
|
||||
"""
|
||||
if not text:
|
||||
return [], text
|
||||
|
||||
lines = text.splitlines()
|
||||
out_pairs: List[Tuple[str,str]] = []
|
||||
keep_lines: List[str] = []
|
||||
i = 0
|
||||
|
||||
while i < len(lines):
|
||||
m = _CALLOUT_START.match(lines[i])
|
||||
if not m:
|
||||
keep_lines.append(lines[i])
|
||||
i += 1
|
||||
continue
|
||||
|
||||
block_lines: List[str] = []
|
||||
first_rest = m.group(1) or ""
|
||||
if first_rest.strip():
|
||||
block_lines.append(first_rest)
|
||||
|
||||
i += 1
|
||||
while i < len(lines) and lines[i].lstrip().startswith('>'):
|
||||
block_lines.append(lines[i].lstrip()[1:].lstrip())
|
||||
i += 1
|
||||
|
||||
for bl in block_lines:
|
||||
mrel = _REL_LINE.match(bl)
|
||||
if not mrel:
|
||||
continue
|
||||
kind = (mrel.group("kind") or "").strip().lower()
|
||||
targets = mrel.group("targets") or ""
|
||||
found = _WIKILINKS_IN_LINE.findall(targets)
|
||||
if found:
|
||||
for t in found:
|
||||
t = t.strip()
|
||||
if t:
|
||||
out_pairs.append((kind, t))
|
||||
else:
|
||||
for raw in re.split(r"[,;]", targets):
|
||||
t = raw.strip()
|
||||
if t:
|
||||
out_pairs.append((kind, t))
|
||||
|
||||
# Callout wird NICHT in keep_lines übernommen
|
||||
continue
|
||||
|
||||
remainder = "\n".join(keep_lines)
|
||||
return out_pairs, remainder
|
||||
|
||||
def _extract_wikilinks(text: str) -> List[str]:
    """Return the (stripped) targets of all plain wikilinks in *text*."""
    return [match.group(1).strip() for match in _WIKILINK_RE.finditer(text or "")]
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Hauptfunktion
|
||||
# --------------------------------------------------------------------------- #
|
||||
|
||||
def build_edges_for_note(
    note_id: str,
    chunks: List[dict],
    note_level_references: Optional[List[str]] = None,
    include_note_scope_refs: bool = False,
) -> List[dict]:
    """
    Build all edges for one note.

    - belongs_to: for every chunk (chunk -> note)
    - next / prev: between consecutive chunks
    - references: per chunk from window/text (via wikilinks)
    - typed inline relations: [[rel:KIND | Target]] / [[rel:KIND Target]] / rel: KIND [[Target]]
    - Obsidian callouts: > [!edge] KIND: [[Target]] [[Target2]]
    - optional note-scope references/backlinks: deduplicated over all chunk hits + note_level_references
    - type-based default edges (edge_defaults) per found reference

    Returns the final, de-duplicated list of edge payload dicts.
    """
    edges: List[dict] = []

    # Note type (expected on the first chunk; None when absent)
    note_type = None
    if chunks:
        note_type = _get(chunks[0], "type")

    # 1) belongs_to: one structural edge per chunk pointing at its note
    for ch in chunks:
        cid = _get(ch, "chunk_id", "id")
        if not cid:
            continue
        edges.append(_edge("belongs_to", "chunk", cid, note_id, note_id, {
            "chunk_id": cid,
            "edge_id": _mk_edge_id("belongs_to", cid, note_id, "chunk", "structure:belongs_to"),
            "provenance": "rule",
            "rule_id": "structure:belongs_to",
            "confidence": 1.0,
        }))

    # 2) next / prev: both directions between consecutive chunks
    for i in range(len(chunks) - 1):
        a, b = chunks[i], chunks[i + 1]
        a_id = _get(a, "chunk_id", "id")
        b_id = _get(b, "chunk_id", "id")
        if not a_id or not b_id:
            continue
        edges.append(_edge("next", "chunk", a_id, b_id, note_id, {
            "chunk_id": a_id,
            "edge_id": _mk_edge_id("next", a_id, b_id, "chunk", "structure:order"),
            "provenance": "rule",
            "rule_id": "structure:order",
            "confidence": 0.95,
        }))
        edges.append(_edge("prev", "chunk", b_id, a_id, note_id, {
            "chunk_id": b_id,
            "edge_id": _mk_edge_id("prev", b_id, a_id, "chunk", "structure:order"),
            "provenance": "rule",
            "rule_id": "structure:order",
            "confidence": 0.95,
        }))

    # 3) references + typed inline + callouts + defaults (chunk scope)
    reg = _load_types_registry()
    defaults = _edge_defaults_for(note_type, reg)
    refs_all: List[str] = []

    for ch in chunks:
        cid = _get(ch, "chunk_id", "id")
        if not cid:
            continue
        raw = _chunk_text_for_refs(ch)

        # 3a) typed inline relations; matched links are stripped from the text
        typed, remainder = _extract_typed_relations(raw)
        for kind, target in typed:
            kind = kind.strip().lower()
            if not kind or not target:
                continue
            edges.append(_edge(kind, "chunk", cid, target, note_id, {
                "chunk_id": cid,
                "edge_id": _mk_edge_id(kind, cid, target, "chunk", "inline:rel"),
                "provenance": "explicit",
                "rule_id": "inline:rel",
                "confidence": 0.95,
            }))
            # symmetric relation kinds get a mirrored edge
            if kind in {"related_to", "similar_to"}:
                edges.append(_edge(kind, "chunk", target, cid, note_id, {
                    "chunk_id": cid,
                    "edge_id": _mk_edge_id(kind, target, cid, "chunk", "inline:rel"),
                    "provenance": "explicit",
                    "rule_id": "inline:rel",
                    "confidence": 0.95,
                }))

        # 3b) Obsidian callouts; the callout block is stripped from the text
        call_pairs, remainder2 = _extract_callout_relations(remainder)
        for kind, target in call_pairs:
            k = (kind or "").strip().lower()
            if not k or not target:
                continue
            edges.append(_edge(k, "chunk", cid, target, note_id, {
                "chunk_id": cid,
                "edge_id": _mk_edge_id(k, cid, target, "chunk", "callout:edge"),
                "provenance": "explicit",
                "rule_id": "callout:edge",
                "confidence": 0.95,
            }))
            if k in {"related_to", "similar_to"}:
                edges.append(_edge(k, "chunk", target, cid, note_id, {
                    "chunk_id": cid,
                    "edge_id": _mk_edge_id(k, target, cid, "chunk", "callout:edge"),
                    "provenance": "explicit",
                    "rule_id": "callout:edge",
                    "confidence": 0.95,
                }))

        # 3c) generic wikilinks -> references (+ type-based default edges per ref)
        refs = _extract_wikilinks(remainder2)
        for r in refs:
            edges.append(_edge("references", "chunk", cid, r, note_id, {
                "chunk_id": cid,
                "ref_text": r,
                "edge_id": _mk_edge_id("references", cid, r, "chunk", "explicit:wikilink"),
                "provenance": "explicit",
                "rule_id": "explicit:wikilink",
                "confidence": 1.0,
            }))
            for rel in defaults:
                if rel == "references":
                    continue
                edges.append(_edge(rel, "chunk", cid, r, note_id, {
                    "chunk_id": cid,
                    "edge_id": _mk_edge_id(rel, cid, r, "chunk", f"edge_defaults:{note_type}:{rel}"),
                    "provenance": "rule",
                    "rule_id": f"edge_defaults:{note_type}:{rel}",
                    "confidence": 0.7,
                }))
                if rel in {"related_to", "similar_to"}:
                    edges.append(_edge(rel, "chunk", r, cid, note_id, {
                        "chunk_id": cid,
                        "edge_id": _mk_edge_id(rel, r, cid, "chunk", f"edge_defaults:{note_type}:{rel}"),
                        "provenance": "rule",
                        "rule_id": f"edge_defaults:{note_type}:{rel}",
                        "confidence": 0.7,
                    }))

        refs_all.extend(refs)

    # 4) optional note-scope refs/backlinks (+ defaults), de-duplicated
    if include_note_scope_refs:
        refs_note = list(refs_all or [])
        if note_level_references:
            refs_note.extend([r for r in note_level_references if isinstance(r, str) and r])
        refs_note = _dedupe_seq(refs_note)
        for r in refs_note:
            edges.append(_edge("references", "note", note_id, r, note_id, {
                "edge_id": _mk_edge_id("references", note_id, r, "note", "explicit:note_scope"),
                "provenance": "explicit",
                "rule_id": "explicit:note_scope",
                "confidence": 1.0,
            }))
            edges.append(_edge("backlink", "note", r, note_id, note_id, {
                "edge_id": _mk_edge_id("backlink", r, note_id, "note", "derived:backlink"),
                "provenance": "rule",
                "rule_id": "derived:backlink",
                "confidence": 0.9,
            }))
            for rel in defaults:
                if rel == "references":
                    continue
                edges.append(_edge(rel, "note", note_id, r, note_id, {
                    "edge_id": _mk_edge_id(rel, note_id, r, "note", f"edge_defaults:{note_type}:{rel}"),
                    "provenance": "rule",
                    "rule_id": f"edge_defaults:{note_type}:{rel}",
                    "confidence": 0.7,
                }))
                if rel in {"related_to", "similar_to"}:
                    edges.append(_edge(rel, "note", r, note_id, note_id, {
                        "edge_id": _mk_edge_id(rel, r, note_id, "note", f"edge_defaults:{note_type}:{rel}"),
                        "provenance": "rule",
                        "rule_id": f"edge_defaults:{note_type}:{rel}",
                        "confidence": 0.7,
                    }))

    # 5) de-dupe on (source_id, target_id, relation, rule_id); first occurrence wins
    seen: Set[Tuple[str,str,str,str]] = set()
    out: List[dict] = []
    for e in edges:
        s = str(e.get("source_id") or "")
        t = str(e.get("target_id") or "")
        rel = str(e.get("relation") or e.get("kind") or "edge")
        rule = str(e.get("rule_id") or "")
        key = (s, t, rel, rule)
        if key in seen:
            continue
        seen.add(key)
        out.append(e)
    return out
|
||||
__all__ = ["build_edges_for_note", "PROVENANCE_PRIORITY"]
|
||||
16
app/core/graph/__init__.py
Normal file
16
app/core/graph/__init__.py
Normal file
|
|
@ -0,0 +1,16 @@
|
|||
"""
|
||||
FILE: app/core/graph/__init__.py
|
||||
DESCRIPTION: Unified Graph Package. Exportiert Kanten-Ableitung und Graph-Adapter.
|
||||
"""
|
||||
from .graph_derive_edges import build_edges_for_note
|
||||
from .graph_utils import PROVENANCE_PRIORITY
|
||||
from .graph_subgraph import Subgraph, expand
|
||||
from .graph_weights import EDGE_BASE_WEIGHTS
|
||||
|
||||
__all__ = [
|
||||
"build_edges_for_note",
|
||||
"PROVENANCE_PRIORITY",
|
||||
"Subgraph",
|
||||
"expand",
|
||||
"EDGE_BASE_WEIGHTS"
|
||||
]
|
||||
63
app/core/graph/graph_db_adapter.py
Normal file
63
app/core/graph/graph_db_adapter.py
Normal file
|
|
@ -0,0 +1,63 @@
|
|||
"""
|
||||
FILE: app/core/graph/graph_db_adapter.py
|
||||
DESCRIPTION: Datenbeschaffung aus Qdrant für den Graphen.
|
||||
AUDIT v1.1.0: Nutzt nun die zentrale database-Infrastruktur für Namen.
|
||||
"""
|
||||
from typing import List, Dict, Optional
|
||||
from qdrant_client import QdrantClient
|
||||
from qdrant_client.http import models as rest
|
||||
|
||||
# ENTSCHEIDENDER FIX: Nutzt die neue Infrastruktur für konsistente Collection-Namen
|
||||
from app.core.database import collection_names
|
||||
|
||||
def fetch_edges_from_qdrant(
    client: QdrantClient,
    prefix: str,
    seeds: List[str],
    edge_types: Optional[List[str]] = None,
    limit: int = 2048,
) -> List[Dict]:
    """Fetch edge payloads whose source_id, target_id or note_id hits a seed.

    Optionally restricted to the given edge kinds; at most *limit* payloads.
    """
    if not seeds or limit <= 0:
        return []

    # Consistent name resolution via the database package.
    _, _, edges_col = collection_names(prefix)

    # Seed match: OR over (source_id | target_id | note_id) x seeds.
    seed_conditions = [
        rest.FieldCondition(key=field, match=rest.MatchValue(value=str(seed)))
        for field in ("source_id", "target_id", "note_id")
        for seed in seeds
    ]
    clauses = [rest.Filter(should=seed_conditions)] if seed_conditions else []

    # Optional restriction on the edge kind.
    if edge_types:
        kind_conditions = [
            rest.FieldCondition(key="kind", match=rest.MatchValue(value=str(kind)))
            for kind in edge_types
        ]
        clauses.append(rest.Filter(should=kind_conditions))

    scroll_filter = rest.Filter(must=clauses) if clauses else None

    # Single-page scroll; payloads only, no vectors.
    records, _ = client.scroll(
        collection_name=edges_col,
        scroll_filter=scroll_filter,
        limit=limit,
        with_payload=True,
        with_vectors=False,
    )
    return [dict(record.payload) for record in records if record.payload]
|
||||
112
app/core/graph/graph_derive_edges.py
Normal file
112
app/core/graph/graph_derive_edges.py
Normal file
|
|
@ -0,0 +1,112 @@
|
|||
"""
|
||||
FILE: app/core/graph/graph_derive_edges.py
|
||||
DESCRIPTION: Hauptlogik zur Kanten-Aggregation und De-Duplizierung.
|
||||
"""
|
||||
from typing import List, Optional, Dict, Tuple
|
||||
from .graph_utils import (
|
||||
_get, _edge, _mk_edge_id, _dedupe_seq,
|
||||
PROVENANCE_PRIORITY, load_types_registry, get_edge_defaults_for
|
||||
)
|
||||
from .graph_extractors import (
|
||||
extract_typed_relations, extract_callout_relations, extract_wikilinks
|
||||
)
|
||||
|
||||
def build_edges_for_note(
    note_id: str,
    chunks: List[dict],
    note_level_references: Optional[List[str]] = None,
    include_note_scope_refs: bool = False,
) -> List[dict]:
    """Build and aggregate all edges for one note (WP-15b).

    Sources of edges: structural order (belongs_to/next/prev), typed inline
    relations, the per-chunk candidate pool, [!edge] callouts, plain wikilinks
    (plus type-based default edges per reference) and, optionally, note-scope
    references/backlinks.  Duplicates on (source, target, kind) are collapsed,
    keeping the highest-confidence edge.
    """
    edges: List[dict] = []
    # Note type is taken from the first chunk; "concept" is the fallback type.
    note_type = _get(chunks[0], "type") if chunks else "concept"

    # 1) Structural edges (belongs_to, next/prev between consecutive chunks)
    # NOTE(review): confidences come from PROVENANCE_PRIORITY["structure:*"] —
    # assumes those keys exist in graph_utils; a missing key raises KeyError.
    for idx, ch in enumerate(chunks):
        cid = _get(ch, "chunk_id", "id")
        if not cid: continue
        edges.append(_edge("belongs_to", "chunk", cid, note_id, note_id, {
            "chunk_id": cid, "edge_id": _mk_edge_id("belongs_to", cid, note_id, "chunk", "structure:belongs_to"),
            "provenance": "structure", "rule_id": "structure:belongs_to", "confidence": PROVENANCE_PRIORITY["structure:belongs_to"]
        }))
        if idx < len(chunks) - 1:
            next_id = _get(chunks[idx+1], "chunk_id", "id")
            if next_id:
                edges.append(_edge("next", "chunk", cid, next_id, note_id, {
                    "chunk_id": cid, "edge_id": _mk_edge_id("next", cid, next_id, "chunk", "structure:order"),
                    "provenance": "structure", "rule_id": "structure:order", "confidence": PROVENANCE_PRIORITY["structure:order"]
                }))
                edges.append(_edge("prev", "chunk", next_id, cid, note_id, {
                    "chunk_id": next_id, "edge_id": _mk_edge_id("prev", next_id, cid, "chunk", "structure:order"),
                    "provenance": "structure", "rule_id": "structure:order", "confidence": PROVENANCE_PRIORITY["structure:order"]
                }))

    # 2) Content edges (typed relations, AI candidates, callouts, wikilinks)
    reg = load_types_registry()
    defaults = get_edge_defaults_for(note_type, reg)
    refs_all: List[str] = []

    for ch in chunks:
        cid = _get(ch, "chunk_id", "id")
        if not cid: continue
        # Only "window"/"text" are considered here (unlike the legacy facade,
        # which also fell back to "content"/"raw").
        raw = _get(ch, "window") or _get(ch, "text") or ""

        # Typed inline relations & candidate pool (WP-15b integration)
        typed, rem = extract_typed_relations(raw)
        for k, t in typed:
            edges.append(_edge(k, "chunk", cid, t, note_id, {
                "chunk_id": cid, "edge_id": _mk_edge_id(k, cid, t, "chunk", "inline:rel"),
                "provenance": "explicit", "rule_id": "inline:rel", "confidence": PROVENANCE_PRIORITY["inline:rel"]
            }))

        # Pre-validated semantic candidates attached by the ingestion pipeline.
        # NOTE(review): candidate dicts are assumed to carry "to"/"kind"/"provenance" —
        # confirm against the Pass-2 writer.
        pool = ch.get("candidate_pool") or ch.get("candidate_edges") or []
        for cand in pool:
            t, k, p = cand.get("to"), cand.get("kind", "related_to"), cand.get("provenance", "semantic_ai")
            if t:
                edges.append(_edge(k, "chunk", cid, t, note_id, {
                    "chunk_id": cid, "edge_id": _mk_edge_id(k, cid, t, "chunk", f"candidate:{p}"),
                    "provenance": p, "rule_id": f"candidate:{p}", "confidence": PROVENANCE_PRIORITY.get(p, 0.90)
                }))

        # Callouts & plain wikilinks (callout blocks are stripped before the
        # wikilink pass so their links are not double-counted)
        call_pairs, rem2 = extract_callout_relations(rem)
        for k, t in call_pairs:
            edges.append(_edge(k, "chunk", cid, t, note_id, {
                "chunk_id": cid, "edge_id": _mk_edge_id(k, cid, t, "chunk", "callout:edge"),
                "provenance": "explicit", "rule_id": "callout:edge", "confidence": PROVENANCE_PRIORITY["callout:edge"]
            }))

        refs = extract_wikilinks(rem2)
        for r in refs:
            edges.append(_edge("references", "chunk", cid, r, note_id, {
                "chunk_id": cid, "ref_text": r, "edge_id": _mk_edge_id("references", cid, r, "chunk", "explicit:wikilink"),
                "provenance": "explicit", "rule_id": "explicit:wikilink", "confidence": PROVENANCE_PRIORITY["explicit:wikilink"]
            }))
            # Type-based default edges for every reference (except plain "references")
            for rel in defaults:
                if rel != "references":
                    edges.append(_edge(rel, "chunk", cid, r, note_id, {
                        "chunk_id": cid, "edge_id": _mk_edge_id(rel, cid, r, "chunk", f"edge_defaults:{rel}"),
                        "provenance": "rule", "rule_id": f"edge_defaults:{rel}", "confidence": PROVENANCE_PRIORITY["edge_defaults"]
                    }))
        refs_all.extend(refs)

    # 3) Note scope & de-duplication
    if include_note_scope_refs:
        refs_note = _dedupe_seq((refs_all or []) + (note_level_references or []))
        for r in refs_note:
            # NOTE(review): unlike the chunk-scope edges above, these payloads
            # carry no "rule_id" key — confirm downstream consumers tolerate that.
            edges.append(_edge("references", "note", note_id, r, note_id, {
                "edge_id": _mk_edge_id("references", note_id, r, "note", "explicit:note_scope"),
                "provenance": "explicit", "confidence": PROVENANCE_PRIORITY["explicit:note_scope"]
            }))
            edges.append(_edge("backlink", "note", r, note_id, note_id, {
                "edge_id": _mk_edge_id("backlink", r, note_id, "note", "derived:backlink"),
                "provenance": "rule", "confidence": PROVENANCE_PRIORITY["derived:backlink"]
            }))

    # Collapse duplicates on (source, target, kind); highest confidence wins.
    unique_map: Dict[Tuple[str, str, str], dict] = {}
    for e in edges:
        key = (str(e.get("source_id")), str(e.get("target_id")), str(e.get("kind")))
        if key not in unique_map or e.get("confidence", 0) > unique_map[key].get("confidence", 0):
            unique_map[key] = e

    return list(unique_map.values())
|
||||
55
app/core/graph/graph_extractors.py
Normal file
55
app/core/graph/graph_extractors.py
Normal file
|
|
@ -0,0 +1,55 @@
|
|||
"""
|
||||
FILE: app/core/graph/graph_extractors.py
|
||||
DESCRIPTION: Regex-basierte Extraktion von Relationen aus Text.
|
||||
"""
|
||||
import re
|
||||
from typing import List, Tuple
|
||||
|
||||
# Plain wikilinks: [[Target]] or [[Alias|Target]] (captures the part after "|")
_WIKILINK_RE = re.compile(r"\[\[(?:[^\|\]]+\|)?([a-zA-Z0-9_\-#:. ]+)\]\]")
# Typed inline relation with pipe: [[rel:KIND | Target]]
_REL_PIPE = re.compile(r"\[\[\s*rel:(?P<kind>[a-z_]+)\s*\|\s*(?P<target>[^\]]+?)\s*\]\]", re.IGNORECASE)
# Typed inline relation without pipe: [[rel:KIND Target]]
_REL_SPACE = re.compile(r"\[\[\s*rel:(?P<kind>[a-z_]+)\s+(?P<target>[^\]]+?)\s*\]\]", re.IGNORECASE)
# Plain-text form outside the brackets: rel: KIND [[Target]]
_REL_TEXT = re.compile(r"rel\s*:\s*(?P<kind>[a-z_]+)\s*\[\[\s*(?P<target>[^\]]+?)\s*\]\]", re.IGNORECASE)

# Obsidian callout header: "> [!edge] ..." (case-insensitive)
_CALLOUT_START = re.compile(r"^\s*>\s*\[!edge\]\s*(.*)$", re.IGNORECASE)
# Relation line inside a callout body: "KIND: targets..."
_REL_LINE = re.compile(r"^(?P<kind>[a-z_]+)\s*:\s*(?P<targets>.+?)\s*$", re.IGNORECASE)
# All wikilinks within a single targets line
_WIKILINKS_IN_LINE = re.compile(r"\[\[([^\]]+)\]\]")
|
||||
|
||||
def extract_typed_relations(text: str) -> Tuple[List[Tuple[str,str]], str]:
    """Extrahiert [[rel:KIND|Target]]."""
    found: List[Tuple[str, str]] = []

    def _take(match):
        kind = (match.group("kind") or "").strip().lower()
        target = (match.group("target") or "").strip()
        if kind and target:
            found.append((kind, target))
        return ""  # strip the matched link from the text

    for pattern in (_REL_PIPE, _REL_SPACE, _REL_TEXT):
        text = pattern.sub(_take, text)
    return found, text
|
||||
|
||||
def extract_callout_relations(text: str) -> Tuple[List[Tuple[str,str]], str]:
    """Verarbeitet Obsidian [!edge]-Callouts."""
    if not text:
        return [], text

    out_pairs: List[Tuple[str, str]] = []
    keep_lines: List[str] = []
    lines = text.splitlines()
    idx = 0

    while idx < len(lines):
        start = _CALLOUT_START.match(lines[idx])
        if start is None:
            keep_lines.append(lines[idx])
            idx += 1
            continue

        # Collect the quoted callout body (plus any text after the header tag).
        head = start.group(1)
        body = [head] if head.strip() else []
        idx += 1
        while idx < len(lines) and lines[idx].lstrip().startswith('>'):
            body.append(lines[idx].lstrip()[1:].lstrip())
            idx += 1

        for line in body:
            rel = _REL_LINE.match(line)
            if rel is None:
                continue
            kind = rel.group("kind").strip().lower()
            targets = rel.group("targets") or ""
            links = _WIKILINKS_IN_LINE.findall(targets)
            if links:
                for link in links:
                    out_pairs.append((kind, link.strip()))
            else:
                for piece in re.split(r"[,;]", targets):
                    if piece.strip():
                        out_pairs.append((kind, piece.strip()))
        # Callout lines are dropped from the remaining text.

    return out_pairs, "\n".join(keep_lines)
|
||||
|
||||
def extract_wikilinks(text: str) -> List[str]:
    """Extrahiert Standard-Wikilinks."""
    targets: List[str] = []
    for match in _WIKILINK_RE.finditer(text or ""):
        targets.append(match.group(1).strip())
    return targets
|
||||
129
app/core/graph/graph_subgraph.py
Normal file
129
app/core/graph/graph_subgraph.py
Normal file
|
|
@ -0,0 +1,129 @@
|
|||
"""
|
||||
FILE: app/core/graph/graph_subgraph.py
|
||||
DESCRIPTION: In-Memory Repräsentation eines Graphen für Scoring und Analyse.
|
||||
Zentrale Komponente für die Graph-Expansion (BFS) und Bonus-Berechnung.
|
||||
MODULARISIERUNG: Teil des graph-Pakets (WP-14).
|
||||
VERSION: 1.1.0
|
||||
STATUS: Active
|
||||
"""
|
||||
import math
|
||||
from collections import defaultdict
|
||||
from typing import Dict, List, Optional, DefaultDict, Any, Set
|
||||
from qdrant_client import QdrantClient
|
||||
|
||||
# Lokale Paket-Imports
|
||||
from .graph_weights import EDGE_BASE_WEIGHTS, calculate_edge_weight
|
||||
from .graph_db_adapter import fetch_edges_from_qdrant
|
||||
|
||||
class Subgraph:
    """
    Lightweight subgraph with adjacency lists and degree counters.

    Used by the retriever to compute graph bonuses (hub score, centrality).
    Nodes are plain string ids; edges are stored as small dicts.
    """

    def __init__(self) -> None:
        # adj[src]     -> outgoing edges: {"target", "kind", "weight"}
        self.adj: DefaultDict[str, List[Dict]] = defaultdict(list)
        # reverse_adj[tgt] -> incoming edges: {"source", "kind", "weight"[, "via_context"]}
        self.reverse_adj: DefaultDict[str, List[Dict]] = defaultdict(list)
        self.in_degree: DefaultDict[str, int] = defaultdict(int)
        self.out_degree: DefaultDict[str, int] = defaultdict(int)

    def add_edge(self, e: Dict) -> None:
        """
        Add one edge and update all indices.

        Expects keys "source", "target", "kind" and optionally "weight" and
        "note_id".  When "note_id" (the owning note) differs from the source,
        an extra context edge owner->target is added to raise the centrality
        of the parent note.
        """
        src = e.get("source")
        tgt = e.get("target")
        kind = e.get("kind")
        # Missing weight falls back to the per-kind base weight (0.0 for unknown kinds).
        weight = e.get("weight", EDGE_BASE_WEIGHTS.get(kind, 0.0))
        owner = e.get("note_id")

        if not src or not tgt:
            return

        # 1. Forward edge
        self.adj[src].append({"target": tgt, "kind": kind, "weight": weight})
        self.out_degree[src] += 1
        self.in_degree[tgt] += 1

        # 2. Reverse edge (for the WP-04b explanation layer)
        self.reverse_adj[tgt].append({"source": src, "kind": kind, "weight": weight})

        # 3. Context-note handling (raises the centrality of the parent note)
        if owner and owner != src:
            self.adj[owner].append({"target": tgt, "kind": kind, "weight": weight})
            self.out_degree[owner] += 1
            if owner != tgt:
                self.reverse_adj[tgt].append({"source": owner, "kind": kind, "weight": weight, "via_context": True})
                # NOTE(review): this bumps the OWNER's in-degree although the
                # reverse edge just added points INTO tgt — confirm whether
                # in_degree[tgt] was intended here.
                self.in_degree[owner] += 1

    def aggregate_edge_bonus(self, node_id: str) -> float:
        """Sum of outgoing edge weights (hub score)."""
        return sum(edge["weight"] for edge in self.adj.get(node_id, []))

    def edge_bonus(self, node_id: str) -> float:
        """Retriever-facing alias (WP-04a compatibility)."""
        return self.aggregate_edge_bonus(node_id)

    def centrality_bonus(self, node_id: str) -> float:
        """
        Log-damped centrality based on the in-degree.

        Capped at a maximum boost of 0.15; returns 0.0 for unknown nodes.
        """
        indeg = self.in_degree.get(node_id, 0)
        if indeg <= 0:
            return 0.0
        return min(math.log1p(indeg) / 10.0, 0.15)

    def get_outgoing_edges(self, node_id: str) -> List[Dict[str, Any]]:
        """Return all outgoing edges of a node ([] for unknown nodes)."""
        return self.adj.get(node_id, [])

    def get_incoming_edges(self, node_id: str) -> List[Dict[str, Any]]:
        """Return all incoming edges of a node ([] for unknown nodes)."""
        return self.reverse_adj.get(node_id, [])
|
||||
|
||||
|
||||
def expand(
    client: QdrantClient,
    prefix: str,
    seeds: List[str],
    depth: int = 1,
    edge_types: Optional[List[str]] = None,
) -> Subgraph:
    """
    Breadth-first expansion from the seed nodes, up to ``depth`` levels.

    Edges are loaded in batches via fetch_edges_from_qdrant.
    """
    graph = Subgraph()
    frontier: Set[str] = set(seeds)
    seen: Set[str] = set()

    for _ in range(max(depth, 0)):
        if not frontier:
            break

        # One batched edge query per BFS level
        next_level: Set[str] = set()
        for payload in fetch_edges_from_qdrant(client, prefix, list(frontier), edge_types):
            src = payload.get("source_id")
            tgt = payload.get("target_id")
            if not src or not tgt:
                continue  # skip malformed edge payloads

            graph.add_edge({
                "source": src,
                "target": tgt,
                "kind": payload.get("kind", "edge"),
                "weight": calculate_edge_weight(payload),
                "note_id": payload.get("note_id"),
            })

            # BFS bookkeeping: queue unvisited targets for the next level
            if tgt not in seen:
                next_level.add(str(tgt))

        seen |= frontier
        frontier = next_level - seen

    return graph
|
||||
81
app/core/graph/graph_utils.py
Normal file
81
app/core/graph/graph_utils.py
Normal file
|
|
@ -0,0 +1,81 @@
|
|||
"""
|
||||
FILE: app/core/graph/graph_utils.py
|
||||
DESCRIPTION: Basale Werkzeuge, ID-Generierung und Provenance-Konfiguration für den Graphen.
|
||||
"""
|
||||
import os
|
||||
import hashlib
|
||||
from typing import Iterable, List, Optional, Set, Any
|
||||
|
||||
try:
|
||||
import yaml
|
||||
except ImportError:
|
||||
yaml = None
|
||||
|
||||
# WP-15b: Prioritäten-Ranking für die De-Duplizierung
|
||||
PROVENANCE_PRIORITY = {
|
||||
"explicit:wikilink": 1.00,
|
||||
"inline:rel": 0.95,
|
||||
"callout:edge": 0.90,
|
||||
"semantic_ai": 0.90, # Validierte KI-Kanten
|
||||
"structure:belongs_to": 1.00,
|
||||
"structure:order": 0.95, # next/prev
|
||||
"explicit:note_scope": 1.00,
|
||||
"derived:backlink": 0.90,
|
||||
"edge_defaults": 0.70 # Heuristik (types.yaml)
|
||||
}
|
||||
|
||||
def _get(d: dict, *keys, default=None):
|
||||
"""Sicherer Zugriff auf verschachtelte Keys."""
|
||||
for k in keys:
|
||||
if isinstance(d, dict) and k in d and d[k] is not None:
|
||||
return d[k]
|
||||
return default
|
||||
|
||||
def _dedupe_seq(seq: Iterable[str]) -> List[str]:
|
||||
"""Dedupliziert Strings unter Beibehaltung der Reihenfolge."""
|
||||
seen: Set[str] = set()
|
||||
out: List[str] = []
|
||||
for s in seq:
|
||||
if s not in seen:
|
||||
seen.add(s); out.append(s)
|
||||
return out
|
||||
|
||||
def _mk_edge_id(kind: str, s: str, t: str, scope: str, rule_id: Optional[str] = None) -> str:
|
||||
"""Erzeugt eine deterministische 12-Byte ID mittels BLAKE2s."""
|
||||
base = f"{kind}:{s}->{t}#{scope}"
|
||||
if rule_id: base += f"|{rule_id}"
|
||||
return hashlib.blake2s(base.encode("utf-8"), digest_size=12).hexdigest()
|
||||
|
||||
def _edge(kind: str, scope: str, source_id: str, target_id: str, note_id: str, extra: Optional[dict] = None) -> dict:
|
||||
"""Konstruiert ein Kanten-Payload für Qdrant."""
|
||||
pl = {
|
||||
"kind": kind,
|
||||
"relation": kind,
|
||||
"scope": scope,
|
||||
"source_id": source_id,
|
||||
"target_id": target_id,
|
||||
"note_id": note_id,
|
||||
}
|
||||
if extra: pl.update(extra)
|
||||
return pl
|
||||
|
||||
def load_types_registry() -> dict:
    """Load the YAML type registry; returns {} when missing, unparsable, or yaml is absent."""
    path = os.getenv("MINDNET_TYPES_FILE", "./config/types.yaml")
    if not os.path.isfile(path) or yaml is None:
        return {}
    try:
        with open(path, "r", encoding="utf-8") as fh:
            data = yaml.safe_load(fh)
        return data or {}
    except Exception:
        # Best effort: a broken registry file must never break the graph build.
        return {}
||||
|
||||
def get_edge_defaults_for(note_type: Optional[str], reg: dict) -> List[str]:
    """
    Resolve the default edge kinds for a note type.

    Lookup order:
      1. ``reg["types"][note_type]["edge_defaults"]`` (or flat ``reg[note_type]``
         when no "types" key exists),
      2. a global fallback under "defaults" / "default" / "global",
      3. an empty list.

    Non-string entries in edge_defaults lists are filtered out.
    """
    # FIX: the fallback loop below calls reg.get(...); previously a non-dict
    # reg (which the types_map guard shows is an expected input) raised
    # AttributeError instead of returning the documented empty default.
    if not isinstance(reg, dict):
        return []

    types_map = reg.get("types", reg)
    if note_type and isinstance(types_map, dict):
        t = types_map.get(note_type)
        if isinstance(t, dict) and isinstance(t.get("edge_defaults"), list):
            return [str(x) for x in t["edge_defaults"] if isinstance(x, str)]

    # Global fallback sections, checked in priority order.
    for key in ("defaults", "default", "global"):
        v = reg.get(key)
        if isinstance(v, dict) and isinstance(v.get("edge_defaults"), list):
            return [str(x) for x in v["edge_defaults"] if isinstance(x, str)]
    return []
||||
39
app/core/graph/graph_weights.py
Normal file
39
app/core/graph/graph_weights.py
Normal file
|
|
@ -0,0 +1,39 @@
|
|||
"""
|
||||
FILE: app/core/graph/graph_weights.py
|
||||
DESCRIPTION: Definition der Basisgewichte und Berechnung der Kanteneffektivität.
|
||||
"""
|
||||
from typing import Dict
|
||||
|
||||
# Base weight per edge type (WP-04a configuration)
EDGE_BASE_WEIGHTS: Dict[str, float] = {
    # Structural edges
    "belongs_to": 0.10,
    "next": 0.06,
    "prev": 0.06,
    "backlink": 0.04,
    "references_at": 0.08,
    # Knowledge edges
    "references": 0.20,
    "depends_on": 0.18,
    "related_to": 0.15,
    "similar_to": 0.12,
}


def calculate_edge_weight(pl: Dict) -> float:
    """Effective edge weight: base weight for the kind, scaled by a clamped confidence."""
    base = EDGE_BASE_WEIGHTS.get(pl.get("kind", "edge"), 0.0)

    raw = pl.get("confidence")
    if raw is None:
        return base
    try:
        confidence = float(raw)
    except Exception:
        # Unparsable confidence is ignored rather than failing the pipeline.
        return base

    # Clamp confidence into [0, 1] before scaling
    return base * max(0.0, min(1.0, confidence))
||||
|
|
@ -1,249 +1,10 @@
|
|||
"""
|
||||
FILE: app/core/graph_adapter.py
|
||||
DESCRIPTION: Lädt Kanten aus Qdrant und baut einen In-Memory Subgraphen für Scoring (Centrality) und Explanation.
|
||||
VERSION: 0.4.0
|
||||
STATUS: Active
|
||||
DEPENDENCIES: qdrant_client, app.core.qdrant
|
||||
LAST_ANALYSIS: 2025-12-15
|
||||
DESCRIPTION: Facade für das neue graph Package (Adapter-Teil).
|
||||
WP-14: Modularisierung abgeschlossen.
|
||||
VERSION: 0.5.0
|
||||
"""
|
||||
from .graph.graph_subgraph import Subgraph, expand
|
||||
from .graph.graph_weights import EDGE_BASE_WEIGHTS
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Dict, List, Optional, DefaultDict, Any
|
||||
from collections import defaultdict
|
||||
|
||||
from qdrant_client import QdrantClient
|
||||
from qdrant_client.http import models as rest
|
||||
|
||||
from app.core.qdrant import collection_names
|
||||
|
||||
# Legacy-Import Fallback
|
||||
try: # pragma: no cover
|
||||
from app.core.qdrant_points import get_edges_for_sources # type: ignore
|
||||
except Exception: # pragma: no cover
|
||||
get_edges_for_sources = None # type: ignore
|
||||
|
||||
|
||||
# Basisgewichte je Edge-Typ (WP-04a Config)
|
||||
EDGE_BASE_WEIGHTS: Dict[str, float] = {
|
||||
# Struktur
|
||||
"belongs_to": 0.10,
|
||||
"next": 0.06,
|
||||
"prev": 0.06,
|
||||
"backlink": 0.04,
|
||||
"references_at": 0.08,
|
||||
|
||||
# Wissen
|
||||
"references": 0.20,
|
||||
"depends_on": 0.18,
|
||||
"related_to": 0.15,
|
||||
"similar_to": 0.12,
|
||||
}
|
||||
|
||||
|
||||
def _edge_weight(pl: Dict) -> float:
|
||||
"""Berechnet das effektive Edge-Gewicht aus kind + confidence."""
|
||||
kind = pl.get("kind", "edge")
|
||||
base = EDGE_BASE_WEIGHTS.get(kind, 0.0)
|
||||
|
||||
conf_raw = pl.get("confidence", None)
|
||||
try:
|
||||
conf = float(conf_raw) if conf_raw is not None else None
|
||||
except Exception:
|
||||
conf = None
|
||||
|
||||
if conf is None:
|
||||
return base
|
||||
|
||||
if conf < 0.0: conf = 0.0
|
||||
if conf > 1.0: conf = 1.0
|
||||
|
||||
return base * conf
|
||||
|
||||
|
||||
def _fetch_edges(
    client: QdrantClient,
    prefix: str,
    seeds: List[str],
    edge_types: Optional[List[str]] = None,
    limit: int = 2048,
) -> List[Dict]:
    """
    Fetch edge payloads directly from the *_edges collection.

    Filter semantics: (source_id IN seeds OR target_id IN seeds OR
    note_id IN seeds) AND optionally (kind IN edge_types).

    Args:
        client: Connected Qdrant client.
        prefix: Collection-name prefix, resolved via collection_names().
        seeds: Node ids matched against the source/target/owner fields.
        edge_types: Optional whitelist of edge kinds.
        limit: Maximum number of points returned by the single scroll call.

    Returns:
        List of raw edge payload dicts; empty payloads are dropped.
    """
    if not seeds or limit <= 0:
        return []

    # collection_names() is expected to yield (notes, chunks, edges);
    # only the edges collection is used here.
    _, _, edges_col = collection_names(prefix)

    # OR-match every seed against each of the three id fields.
    seed_conditions = []
    for field in ("source_id", "target_id", "note_id"):
        for s in seeds:
            seed_conditions.append(
                rest.FieldCondition(key=field, match=rest.MatchValue(value=str(s)))
            )
    seeds_filter = rest.Filter(should=seed_conditions) if seed_conditions else None

    # Optional OR-filter restricting the edge kind.
    type_filter = None
    if edge_types:
        type_conds = [
            rest.FieldCondition(key="kind", match=rest.MatchValue(value=str(k)))
            for k in edge_types
        ]
        type_filter = rest.Filter(should=type_conds)

    # AND-combine the two sub-filters (each itself an OR group).
    must = []
    if seeds_filter: must.append(seeds_filter)
    if type_filter: must.append(type_filter)

    flt = rest.Filter(must=must) if must else None

    # Vectors are not needed for graph expansion — payload only.
    pts, _ = client.scroll(
        collection_name=edges_col,
        scroll_filter=flt,
        limit=limit,
        with_payload=True,
        with_vectors=False,
    )

    out: List[Dict] = []
    for p in pts or []:
        pl = dict(p.payload or {})
        if pl:
            out.append(pl)
    return out
||||
|
||||
|
||||
class Subgraph:
    """Lightweight in-memory subgraph with adjacency lists and degree counters."""

    def __init__(self) -> None:
        # Forward adjacency: source -> list of outgoing edge dicts
        self.adj: DefaultDict[str, List[Dict]] = defaultdict(list)
        # Reverse adjacency: target -> list of incoming edge dicts (WP-04b explanation)
        self.reverse_adj: DefaultDict[str, List[Dict]] = defaultdict(list)
        self.in_degree: DefaultDict[str, int] = defaultdict(int)
        self.out_degree: DefaultDict[str, int] = defaultdict(int)

    def add_edge(self, e: Dict) -> None:
        """
        Add one edge and keep the forward/reverse indexes in sync.

        Expected keys in ``e``: source, target, kind, weight (optional),
        note_id (optional context owner).
        """
        src = e.get("source")
        tgt = e.get("target")
        if not src or not tgt:
            return

        kind = e.get("kind")
        # Fall back to the configured base weight only when no explicit
        # "weight" key was supplied in the payload.
        weight = e["weight"] if "weight" in e else EDGE_BASE_WEIGHTS.get(kind, 0.0)
        owner = e.get("note_id")

        # Forward index (the same dict object is reused for the owner below)
        edge_data = {"target": tgt, "kind": kind, "weight": weight}
        self.adj[src].append(edge_data)
        self.out_degree[src] += 1
        self.in_degree[tgt] += 1

        # Reverse index: remember where the edge came from.
        self.reverse_adj[tgt].append({"source": src, "kind": kind, "weight": weight})

        # Context-note handling: when an edge is defined "in the context of"
        # an owner note, credit the note so the retriever finds note-level scores.
        if owner and owner != src:
            self.adj[owner].append(edge_data)
            self.out_degree[owner] += 1
            if owner != tgt:
                self.reverse_adj[tgt].append(
                    {"source": owner, "kind": kind, "weight": weight, "via_context": True}
                )
                self.in_degree[owner] += 1  # slight centrality boost for the owner

    def aggregate_edge_bonus(self, node_id: str) -> float:
        """Sum of outgoing edge weights (hub score)."""
        weights = [edge["weight"] for edge in self.adj.get(node_id, [])]
        return sum(weights)

    def edge_bonus(self, node_id: str) -> float:
        """Retriever-facing alias (WP-04a compatibility)."""
        return self.aggregate_edge_bonus(node_id)

    def centrality_bonus(self, node_id: str) -> float:
        """Log-damped centrality derived from the in-degree, capped at 0.15."""
        import math
        indeg = self.in_degree.get(node_id, 0)
        return min(math.log1p(indeg) / 10.0, 0.15) if indeg > 0 else 0.0

    # --- WP-04b explanation helpers ---

    def get_outgoing_edges(self, node_id: str) -> List[Dict[str, Any]]:
        """List every target this node points at."""
        return self.adj.get(node_id, [])

    def get_incoming_edges(self, node_id: str) -> List[Dict[str, Any]]:
        """List every source pointing at this node."""
        return self.reverse_adj.get(node_id, [])
||||
|
||||
|
||||
def expand(
    client: QdrantClient,
    prefix: str,
    seeds: List[str],
    depth: int = 1,
    edge_types: Optional[List[str]] = None,
) -> Subgraph:
    """Breadth-first expansion from the seeds along stored edges, up to ``depth`` levels."""
    sg = Subgraph()
    frontier = set(seeds)
    visited = set()

    for _ in range(max(depth, 0)):
        if not frontier:
            break

        # One batched edge fetch per BFS level
        payloads = _fetch_edges(
            client=client,
            prefix=prefix,
            seeds=list(frontier),
            edge_types=edge_types,
            limit=2048,
        )

        next_frontier = set()
        for pl in payloads:
            src, tgt = pl.get("source_id"), pl.get("target_id")
            if not src or not tgt:
                continue  # skip malformed edges

            sg.add_edge({
                "source": src,
                "target": tgt,
                "kind": pl.get("kind", "edge"),
                "weight": _edge_weight(pl),
                "note_id": pl.get("note_id"),
            })

            # Only continue the search through unvisited targets
            if tgt and tgt not in visited:
                next_frontier.add(tgt)

        visited |= frontier
        frontier = next_frontier - visited

    return sg
||||
__all__ = ["Subgraph", "expand", "EDGE_BASE_WEIGHTS"]
|
||||
|
|
@ -1,390 +0,0 @@
|
|||
"""
|
||||
FILE: app/core/ingestion.py
|
||||
DESCRIPTION: Haupt-Ingestion-Logik. Transformiert Markdown in den Graphen.
|
||||
WP-20: Optimiert für OpenRouter (mistralai/mistral-7b-instruct:free).
|
||||
WP-22: Content Lifecycle, Edge Registry Validation & Multi-Hash.
|
||||
FIX: Deep Fallback Logic (v2.11.14). Erkennt Policy Violations auch in validen
|
||||
JSON-Objekten und erzwingt den lokalen Ollama-Sprung, um Kantenverlust
|
||||
bei umfangreichen Protokollen zu verhindern.
|
||||
VERSION: 2.11.14
|
||||
STATUS: Active
|
||||
DEPENDENCIES: app.core.parser, app.core.note_payload, app.core.chunker, app.services.llm_service, app.services.edge_registry
|
||||
"""
|
||||
import os
|
||||
import json
|
||||
import re
|
||||
import logging
|
||||
import asyncio
|
||||
import time
|
||||
from typing import Dict, List, Optional, Tuple, Any
|
||||
|
||||
# Core Module Imports
|
||||
from app.core.parser import (
|
||||
read_markdown,
|
||||
normalize_frontmatter,
|
||||
validate_required_frontmatter,
|
||||
extract_edges_with_context,
|
||||
)
|
||||
from app.core.note_payload import make_note_payload
|
||||
from app.core.chunker import assemble_chunks, get_chunk_config
|
||||
from app.core.chunk_payload import make_chunk_payloads
|
||||
|
||||
# Fallback für Edges
|
||||
try:
|
||||
from app.core.derive_edges import build_edges_for_note
|
||||
except ImportError:
|
||||
def build_edges_for_note(*args, **kwargs): return []
|
||||
|
||||
from app.core.qdrant import QdrantConfig, get_client, ensure_collections, ensure_payload_indexes
|
||||
from app.core.qdrant_points import (
|
||||
points_for_chunks,
|
||||
points_for_note,
|
||||
points_for_edges,
|
||||
upsert_batch,
|
||||
)
|
||||
|
||||
from app.services.embeddings_client import EmbeddingsClient
|
||||
from app.services.edge_registry import registry as edge_registry
|
||||
from app.services.llm_service import LLMService
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# --- Global Helpers ---
|
||||
def extract_json_from_response(text: str) -> Any:
    """
    Extract JSON data from a raw LLM response string.

    Strips Mistral/Llama control tokens (<s>, </s>, [OUT], [/OUT]) and
    markdown code fences, then attempts increasingly permissive parses:
      1. the cleaned payload as-is,
      2. the outermost ``[...]`` slice (list recovery),
      3. the outermost ``{...}`` slice (object recovery).

    Returns:
        The parsed JSON value, or ``[]`` when nothing parseable is found
        or *text* is empty / not a string.
    """
    if not text or not isinstance(text, str):
        return []

    # 1. Remove Mistral/Llama control tokens and tags
    clean = text.replace("<s>", "").replace("</s>", "")
    clean = clean.replace("[OUT]", "").replace("[/OUT]", "")
    clean = clean.strip()

    # 2. Prefer the content of a fenced ```json block when present
    match = re.search(r"```(?:json)?\s*(.*?)\s*```", clean, re.DOTALL)
    payload = match.group(1) if match else clean

    try:
        return json.loads(payload.strip())
    except json.JSONDecodeError:
        pass

    # 3. Recovery: outermost list slice.
    # FIX: these recovery attempts previously used bare `except:` clauses
    # that swallowed every exception (including KeyboardInterrupt); they
    # now catch only JSON decode failures.
    start = payload.find('[')
    end = payload.rfind(']') + 1
    if start != -1 and end > start:
        try:
            return json.loads(payload[start:end])
        except json.JSONDecodeError:
            pass

    # 4. Second recovery: outermost object slice
    start_obj = payload.find('{')
    end_obj = payload.rfind('}') + 1
    if start_obj != -1 and end_obj > start_obj:
        try:
            return json.loads(payload[start_obj:end_obj])
        except json.JSONDecodeError:
            pass

    return []
||||
|
||||
def load_type_registry(custom_path: Optional[str] = None) -> dict:
    """Load types.yaml, which drives type-specific ingestion; returns {} on any failure."""
    import yaml
    from app.config import get_settings

    registry_path = custom_path or get_settings().MINDNET_TYPES_FILE
    if not os.path.exists(registry_path):
        return {}
    try:
        with open(registry_path, "r", encoding="utf-8") as fh:
            return yaml.safe_load(fh) or {}
    except Exception:
        # Best effort: a broken registry file must not abort ingestion.
        return {}
||||
|
||||
# --- Service Class ---
|
||||
class IngestionService:
|
||||
def __init__(self, collection_prefix: str = None):
|
||||
from app.config import get_settings
|
||||
self.settings = get_settings()
|
||||
|
||||
self.prefix = collection_prefix or self.settings.COLLECTION_PREFIX
|
||||
self.cfg = QdrantConfig.from_env()
|
||||
self.cfg.prefix = self.prefix
|
||||
self.client = get_client(self.cfg)
|
||||
self.dim = self.settings.VECTOR_SIZE
|
||||
self.registry = load_type_registry()
|
||||
self.embedder = EmbeddingsClient()
|
||||
self.llm = LLMService()
|
||||
|
||||
self.active_hash_mode = self.settings.CHANGE_DETECTION_MODE
|
||||
|
||||
try:
|
||||
ensure_collections(self.client, self.prefix, self.dim)
|
||||
ensure_payload_indexes(self.client, self.prefix)
|
||||
except Exception as e:
|
||||
logger.warning(f"DB init warning: {e}")
|
||||
|
||||
def _resolve_note_type(self, requested: Optional[str]) -> str:
|
||||
"""Bestimmt den finalen Notiz-Typ (Fallback auf 'concept')."""
|
||||
types = self.registry.get("types", {})
|
||||
if requested and requested in types: return requested
|
||||
return "concept"
|
||||
|
||||
def _get_chunk_config_by_profile(self, profile_name: str, note_type: str) -> Dict[str, Any]:
|
||||
"""Holt die Chunker-Parameter für ein spezifisches Profil aus der Registry."""
|
||||
profiles = self.registry.get("chunking_profiles", {})
|
||||
if profile_name in profiles:
|
||||
cfg = profiles[profile_name].copy()
|
||||
if "overlap" in cfg and isinstance(cfg["overlap"], list):
|
||||
cfg["overlap"] = tuple(cfg["overlap"])
|
||||
return cfg
|
||||
return get_chunk_config(note_type)
|
||||
|
||||
async def _perform_smart_edge_allocation(self, text: str, note_id: str) -> List[Dict]:
|
||||
"""
|
||||
KI-Extraktion mit Deep-Fallback Logik.
|
||||
Erzwingt den lokalen Ollama-Sprung, wenn die Cloud-Antwort keine verwertbaren
|
||||
Kanten liefert (häufig bei Policy Violations auf OpenRouter).
|
||||
"""
|
||||
provider = self.settings.MINDNET_LLM_PROVIDER
|
||||
model = self.settings.OPENROUTER_MODEL if provider == "openrouter" else self.settings.GEMINI_MODEL
|
||||
|
||||
logger.info(f"🚀 [Ingestion] Turbo-Mode: Extracting edges for '{note_id}' using {model} on {provider}")
|
||||
|
||||
edge_registry.ensure_latest()
|
||||
valid_types_str = ", ".join(sorted(list(edge_registry.valid_types)))
|
||||
|
||||
template = self.llm.get_prompt("edge_extraction", provider)
|
||||
|
||||
try:
|
||||
try:
|
||||
# Wir begrenzen den Kontext auf 6000 Zeichen (ca. 1500 Token)
|
||||
prompt = template.format(
|
||||
text=text[:6000],
|
||||
note_id=note_id,
|
||||
valid_types=valid_types_str
|
||||
)
|
||||
except KeyError as ke:
|
||||
logger.error(f"❌ [Ingestion] Prompt-Template Fehler (Variable {ke} fehlt).")
|
||||
return []
|
||||
|
||||
# 1. Versuch: Anfrage an den primären Cloud-Provider
|
||||
response_json = await self.llm.generate_raw_response(
|
||||
prompt=prompt, priority="background", force_json=True,
|
||||
provider=provider, model_override=model
|
||||
)
|
||||
|
||||
# Initiales Parsing
|
||||
raw_data = extract_json_from_response(response_json)
|
||||
|
||||
# 2. Dictionary Recovery (Versuche Liste aus Dict zu extrahieren)
|
||||
candidates = []
|
||||
if isinstance(raw_data, list):
|
||||
candidates = raw_data
|
||||
elif isinstance(raw_data, dict):
|
||||
logger.info(f"ℹ️ [Ingestion] LLM returned dict, checking for embedded lists in {note_id}")
|
||||
for k in ["edges", "links", "results", "kanten", "matches", "edge_list"]:
|
||||
if k in raw_data and isinstance(raw_data[k], list):
|
||||
candidates = raw_data[k]
|
||||
break
|
||||
# Wenn immer noch keine Liste gefunden, versuche Key-Value Paare (Dict Recovery)
|
||||
if not candidates:
|
||||
for k, v in raw_data.items():
|
||||
if isinstance(v, str): candidates.append(f"{k}:{v}")
|
||||
elif isinstance(v, list): [candidates.append(f"{k}:{i}") for i in v if isinstance(i, str)]
|
||||
|
||||
# 3. DEEP FALLBACK: Wenn nach allen Recovery-Versuchen die Liste leer ist UND wir in der Cloud waren
|
||||
# Triggert den Fallback bei "Data Policy Violations" (leere oder Fehler-JSONs).
|
||||
if not candidates and provider != "ollama" and self.settings.LLM_FALLBACK_ENABLED:
|
||||
logger.warning(
|
||||
f"🛑 [Ingestion] Cloud-Antwort für {note_id} lieferte keine verwertbaren Kanten. "
|
||||
f"Mögliche Policy Violation oder Refusal. Erzwinge LOKALEN FALLBACK via Ollama..."
|
||||
)
|
||||
response_json_local = await self.llm.generate_raw_response(
|
||||
prompt=prompt, priority="background", force_json=True, provider="ollama"
|
||||
)
|
||||
raw_data_local = extract_json_from_response(response_json_local)
|
||||
|
||||
# Wiederhole Recovery für lokale Antwort
|
||||
if isinstance(raw_data_local, list):
|
||||
candidates = raw_data_local
|
||||
elif isinstance(raw_data_local, dict):
|
||||
for k in ["edges", "links", "results"]:
|
||||
if k in raw_data_local and isinstance(raw_data_local[k], list):
|
||||
candidates = raw_data_local[k]; break
|
||||
|
||||
if not candidates:
|
||||
logger.warning(f"⚠️ [Ingestion] Auch nach Fallback keine extrahierbaren Kanten für {note_id}")
|
||||
return []
|
||||
|
||||
processed = []
|
||||
for item in candidates:
|
||||
if isinstance(item, dict) and "to" in item:
|
||||
item["provenance"] = "semantic_ai"
|
||||
item["line"] = f"ai-{provider}"
|
||||
processed.append(item)
|
||||
elif isinstance(item, str) and ":" in item:
|
||||
parts = item.split(":", 1)
|
||||
processed.append({
|
||||
"to": parts[1].strip(),
|
||||
"kind": parts[0].strip(),
|
||||
"provenance": "semantic_ai",
|
||||
"line": f"ai-{provider}"
|
||||
})
|
||||
return processed
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"⚠️ [Ingestion] Smart Edge Allocation failed for {note_id}: {e}")
|
||||
return []
|
||||
|
||||
async def process_file(
|
||||
self, file_path: str, vault_root: str,
|
||||
force_replace: bool = False, apply: bool = False, purge_before: bool = False,
|
||||
note_scope_refs: bool = False, hash_source: str = "parsed", hash_normalize: str = "canonical"
|
||||
) -> Dict[str, Any]:
|
||||
"""Transformiert eine Markdown-Datei in den Graphen (Notes, Chunks, Edges)."""
|
||||
result = {"path": file_path, "status": "skipped", "changed": False, "error": None}
|
||||
|
||||
# 1. Parse & Lifecycle Gate
|
||||
try:
|
||||
parsed = read_markdown(file_path)
|
||||
if not parsed: return {**result, "error": "Empty file"}
|
||||
fm = normalize_frontmatter(parsed.frontmatter)
|
||||
validate_required_frontmatter(fm)
|
||||
except Exception as e:
|
||||
return {**result, "error": f"Validation failed: {str(e)}"}
|
||||
|
||||
# WP-22: Filter für Systemdateien und Entwürfe
|
||||
status = fm.get("status", "draft").lower().strip()
|
||||
if status in ["system", "template", "archive", "hidden"]:
|
||||
return {**result, "status": "skipped", "reason": f"lifecycle_{status}"}
|
||||
|
||||
# 2. Config Resolution & Payload Construction
|
||||
note_type = self._resolve_note_type(fm.get("type"))
|
||||
fm["type"] = note_type
|
||||
|
||||
try:
|
||||
note_pl = make_note_payload(parsed, vault_root=vault_root, hash_normalize=hash_normalize, hash_source=hash_source, file_path=file_path)
|
||||
note_id = note_pl["note_id"]
|
||||
except Exception as e:
|
||||
return {**result, "error": f"Payload failed: {str(e)}"}
|
||||
|
||||
# 3. Change Detection (Strikte DoD Umsetzung)
|
||||
old_payload = None if force_replace else self._fetch_note_payload(note_id)
|
||||
check_key = f"{self.active_hash_mode}:{hash_source}:{hash_normalize}"
|
||||
old_hash = (old_payload or {}).get("hashes", {}).get(check_key)
|
||||
new_hash = note_pl.get("hashes", {}).get(check_key)
|
||||
|
||||
# Prüfung auf fehlende Artefakte in Qdrant
|
||||
chunks_missing, edges_missing = self._artifacts_missing(note_id)
|
||||
|
||||
should_write = force_replace or (not old_payload) or (old_hash != new_hash) or chunks_missing or edges_missing
|
||||
|
||||
if not should_write:
|
||||
return {**result, "status": "unchanged", "note_id": note_id}
|
||||
|
||||
if not apply:
|
||||
return {**result, "status": "dry-run", "changed": True, "note_id": note_id}
|
||||
|
||||
# 4. Processing (Chunking, Embedding, AI Edges)
|
||||
try:
|
||||
body_text = getattr(parsed, "body", "") or ""
|
||||
edge_registry.ensure_latest()
|
||||
|
||||
# Profil-gesteuertes Chunking
|
||||
profile = fm.get("chunk_profile") or fm.get("chunking_profile") or "sliding_standard"
|
||||
chunk_cfg = self._get_chunk_config_by_profile(profile, note_type)
|
||||
chunks = await assemble_chunks(fm["id"], body_text, fm["type"], config=chunk_cfg)
|
||||
chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks, note_text=body_text)
|
||||
|
||||
# Vektorisierung
|
||||
vecs = []
|
||||
if chunk_pls:
|
||||
texts = [c.get("window") or c.get("text") or "" for c in chunk_pls]
|
||||
vecs = await self.embedder.embed_documents(texts)
|
||||
|
||||
# Kanten-Extraktion
|
||||
edges = []
|
||||
context = {"file": file_path, "note_id": note_id}
|
||||
|
||||
# A. Explizite Kanten (User / Wikilinks)
|
||||
for e in extract_edges_with_context(parsed):
|
||||
e["kind"] = edge_registry.resolve(edge_type=e["kind"], provenance="explicit", context={**context, "line": e.get("line")})
|
||||
edges.append(e)
|
||||
|
||||
# B. KI Kanten (Turbo Mode mit v2.11.14 Fallback)
|
||||
ai_edges = await self._perform_smart_edge_allocation(body_text, note_id)
|
||||
for e in ai_edges:
|
||||
valid_kind = edge_registry.resolve(edge_type=e.get("kind"), provenance="semantic_ai", context={**context, "line": e.get("line")})
|
||||
e["kind"] = valid_kind
|
||||
edges.append(e)
|
||||
|
||||
# C. System Kanten (Struktur)
|
||||
try:
|
||||
sys_edges = build_edges_for_note(note_id, chunk_pls, note_level_references=note_pl.get("references", []), include_note_scope_refs=note_scope_refs)
|
||||
except:
|
||||
sys_edges = build_edges_for_note(note_id, chunk_pls)
|
||||
|
||||
for e in sys_edges:
|
||||
valid_kind = edge_registry.resolve(edge_type=e.get("kind", "belongs_to"), provenance="structure", context={**context, "line": "system"})
|
||||
if valid_kind:
|
||||
e["kind"] = valid_kind
|
||||
edges.append(e)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Processing failed for {file_path}: {e}", exc_info=True)
|
||||
return {**result, "error": f"Processing failed: {str(e)}"}
|
||||
|
||||
# 5. DB Upsert
|
||||
try:
|
||||
if purge_before and old_payload: self._purge_artifacts(note_id)
|
||||
|
||||
n_name, n_pts = points_for_note(self.prefix, note_pl, None, self.dim)
|
||||
upsert_batch(self.client, n_name, n_pts)
|
||||
|
||||
if chunk_pls and vecs:
|
||||
c_name, c_pts = points_for_chunks(self.prefix, chunk_pls, vecs)
|
||||
upsert_batch(self.client, c_name, c_pts)
|
||||
|
||||
if edges:
|
||||
e_name, e_pts = points_for_edges(self.prefix, edges)
|
||||
upsert_batch(self.client, e_name, e_pts)
|
||||
|
||||
return {"path": file_path, "status": "success", "changed": True, "note_id": note_id, "chunks_count": len(chunk_pls), "edges_count": len(edges)}
|
||||
except Exception as e:
|
||||
return {**result, "error": f"DB Upsert failed: {e}"}
|
||||
|
||||
def _fetch_note_payload(self, note_id: str) -> Optional[dict]:
|
||||
"""Holt die Metadaten einer Note aus Qdrant."""
|
||||
from qdrant_client.http import models as rest
|
||||
try:
|
||||
f = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))])
|
||||
pts, _ = self.client.scroll(collection_name=f"{self.prefix}_notes", scroll_filter=f, limit=1, with_payload=True)
|
||||
return pts[0].payload if pts else None
|
||||
except: return None
|
||||
|
||||
def _artifacts_missing(self, note_id: str) -> Tuple[bool, bool]:
|
||||
"""Prüft Qdrant aktiv auf vorhandene Chunks und Edges."""
|
||||
from qdrant_client.http import models as rest
|
||||
try:
|
||||
f = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))])
|
||||
c_pts, _ = self.client.scroll(collection_name=f"{self.prefix}_chunks", scroll_filter=f, limit=1)
|
||||
e_pts, _ = self.client.scroll(collection_name=f"{self.prefix}_edges", scroll_filter=f, limit=1)
|
||||
return (not bool(c_pts)), (not bool(e_pts))
|
||||
except: return True, True
|
||||
|
||||
def _purge_artifacts(self, note_id: str):
|
||||
"""Löscht verwaiste Chunks/Edges vor einem Re-Import."""
|
||||
from qdrant_client.http import models as rest
|
||||
f = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))])
|
||||
for suffix in ["chunks", "edges"]:
|
||||
try: self.client.delete(collection_name=f"{self.prefix}_{suffix}", points_selector=rest.FilterSelector(filter=f))
|
||||
except: pass
|
||||
|
||||
async def create_from_text(self, markdown_content: str, filename: str, vault_root: str, folder: str = "00_Inbox") -> Dict[str, Any]:
    """Persist *markdown_content* as a vault note and ingest it.

    Writes the file below ``vault_root/folder`` (creating the folder if
    needed) and then runs the full ingestion pipeline with replace
    semantics. Returns the ingestion result dict.
    """
    destination = os.path.join(vault_root, folder)
    os.makedirs(destination, exist_ok=True)
    note_path = os.path.join(destination, filename)

    with open(note_path, "w", encoding="utf-8") as handle:
        handle.write(markdown_content)

    # Brief pause before the pipeline re-reads the freshly written file.
    await asyncio.sleep(0.1)
    return await self.process_file(
        file_path=note_path,
        vault_root=vault_root,
        apply=True,
        force_replace=True,
        purge_before=True,
    )
|
||||
26
app/core/ingestion/__init__.py
Normal file
26
app/core/ingestion/__init__.py
Normal file
|
|
@ -0,0 +1,26 @@
|
|||
"""
|
||||
FILE: app/core/ingestion/__init__.py
|
||||
DESCRIPTION: Package-Einstiegspunkt für Ingestion. Exportiert den IngestionService.
|
||||
AUDIT v2.13.10: Abschluss der Modularisierung (WP-14).
|
||||
Bricht Zirkelbezüge durch Nutzung der neutralen registry.py auf.
|
||||
VERSION: 2.13.10
|
||||
"""
|
||||
# Der IngestionService ist der primäre Orchestrator für den Datenimport
|
||||
from .ingestion_processor import IngestionService
|
||||
|
||||
# Hilfswerkzeuge für JSON-Verarbeitung und Konfigurations-Management
|
||||
# load_type_registry wird hier re-exportiert, um die Abwärtskompatibilität zu wahren,
|
||||
# obwohl die Implementierung nun in app.core.registry liegt.
|
||||
from .ingestion_utils import (
|
||||
extract_json_from_response,
|
||||
load_type_registry,
|
||||
resolve_note_type
|
||||
)
|
||||
|
||||
# Public API of the package (names exported via `from app.core.ingestion import *`).
__all__ = [
    "IngestionService",
    "extract_json_from_response",
    "load_type_registry",
    "resolve_note_type"
]
|
||||
114
app/core/ingestion/ingestion_chunk_payload.py
Normal file
114
app/core/ingestion/ingestion_chunk_payload.py
Normal file
|
|
@ -0,0 +1,114 @@
|
|||
"""
|
||||
FILE: app/core/ingestion/ingestion_chunk_payload.py
|
||||
DESCRIPTION: Baut das JSON-Objekt für 'mindnet_chunks'.
|
||||
Fix v2.4.3: Integration der zentralen Registry (WP-14) für konsistente Defaults.
|
||||
VERSION: 2.4.3
|
||||
STATUS: Active
|
||||
"""
|
||||
from __future__ import annotations
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
# ENTSCHEIDENDER FIX: Import der neutralen Registry-Logik zur Vermeidung von Circular Imports
|
||||
from app.core.registry import load_type_registry
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Resolution Helpers (Audited)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _as_list(x):
|
||||
"""Sichert die Listen-Integrität für Metadaten wie Tags."""
|
||||
if x is None: return []
|
||||
return x if isinstance(x, list) else [x]
|
||||
|
||||
def _resolve_val(note_type: str, reg: dict, key: str, default: Any) -> Any:
|
||||
"""
|
||||
Hierarchische Suche in der Registry: Type-Spezifisch > Globaler Default.
|
||||
WP-14: Erlaubt dynamische Konfiguration via types.yaml.
|
||||
"""
|
||||
types = reg.get("types", {})
|
||||
if isinstance(types, dict):
|
||||
t_cfg = types.get(note_type, {})
|
||||
if isinstance(t_cfg, dict):
|
||||
# Fallback für Key-Varianten (z.B. chunking_profile vs chunk_profile)
|
||||
val = t_cfg.get(key) or t_cfg.get(key.replace("ing", ""))
|
||||
if val is not None: return val
|
||||
|
||||
defs = reg.get("defaults", {}) or reg.get("global", {})
|
||||
if isinstance(defs, dict):
|
||||
val = defs.get(key) or defs.get(key.replace("ing", ""))
|
||||
if val is not None: return val
|
||||
|
||||
return default
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Haupt-API
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def make_chunk_payloads(note: Dict[str, Any], note_path: str, chunks_from_chunker: List[Any], **kwargs) -> List[Dict[str, Any]]:
    """Build the Qdrant payload dicts for every chunk of one note.

    Args:
        note: either a parsed note dict containing a "frontmatter" key or a
            bare frontmatter dict (both shapes accepted so overrides work).
        note_path: logical vault path stored on each chunk.
        chunks_from_chunker: chunk objects or dicts produced by the chunker.
        **kwargs: optional "types_cfg" (pre-loaded registry) and
            "file_path" (physical source path).

    Returns:
        One payload dict per chunk; type/weight/profile are resolved via
        the hierarchy: frontmatter override > registry > hard default.
    """
    # Robust input detection: full note object vs. bare frontmatter dict.
    if isinstance(note, dict) and "frontmatter" in note:
        fm = note["frontmatter"]
    else:
        fm = note or {}

    # WP-14: use the caller-supplied registry or load the global one.
    reg = kwargs.get("types_cfg") or load_type_registry()

    note_type = fm.get("type") or "concept"
    title = fm.get("title") or fm.get("id") or "Untitled"
    tags = _as_list(fm.get("tags") or [])

    # Resolution hierarchy: frontmatter override first, then registry.
    cp = fm.get("chunking_profile") or fm.get("chunk_profile")
    if not cp:
        cp = _resolve_val(note_type, reg, "chunking_profile", "sliding_standard")

    rw = fm.get("retriever_weight")
    if rw is None:
        rw = _resolve_val(note_type, reg, "retriever_weight", 1.0)
    try:
        rw = float(rw)
    except (TypeError, ValueError):
        # Fix: catch only conversion failures; the previous bare except
        # also masked unrelated errors (and KeyboardInterrupt).
        rw = 1.0

    out: List[Dict[str, Any]] = []
    for idx, ch in enumerate(chunks_from_chunker):
        # Chunks may be dicts or chunker objects; read both uniformly.
        is_dict = isinstance(ch, dict)
        cid = getattr(ch, "id", None) if not is_dict else ch.get("id")
        nid = getattr(ch, "note_id", None) if not is_dict else ch.get("note_id")
        index = getattr(ch, "index", idx) if not is_dict else ch.get("index", idx)
        text = getattr(ch, "text", "") if not is_dict else ch.get("text", "")
        window = getattr(ch, "window", text) if not is_dict else ch.get("window", text)
        prev_id = getattr(ch, "neighbors_prev", None) if not is_dict else ch.get("neighbors_prev")
        next_id = getattr(ch, "neighbors_next", None) if not is_dict else ch.get("neighbors_next")
        section = getattr(ch, "section_title", "") if not is_dict else ch.get("section", "")

        pl: Dict[str, Any] = {
            "note_id": nid or fm.get("id"),
            "chunk_id": cid,
            "title": title,
            "index": int(index),
            "ord": int(index) + 1,  # 1-based ordinal alongside the 0-based index
            "type": note_type,
            "tags": tags,
            "text": text,
            "window": window,
            "neighbors_prev": _as_list(prev_id),
            "neighbors_next": _as_list(next_id),
            "section": section,
            "path": note_path,
            "source_path": kwargs.get("file_path") or note_path,
            "retriever_weight": rw,
            "chunk_profile": cp
        }
        # Fix: removed the dead "cleanup pop" loop - the payload is built
        # locally above and never contains the legacy alias keys it popped.
        out.append(pl)

    return out
|
||||
39
app/core/ingestion/ingestion_db.py
Normal file
39
app/core/ingestion/ingestion_db.py
Normal file
|
|
@ -0,0 +1,39 @@
|
|||
"""
|
||||
FILE: app/core/ingestion/ingestion_db.py
|
||||
DESCRIPTION: Datenbank-Schnittstelle für Note-Metadaten und Artefakt-Prüfung.
|
||||
WP-14: Umstellung auf zentrale database-Infrastruktur.
|
||||
"""
|
||||
from typing import Optional, Tuple
|
||||
from qdrant_client import QdrantClient
|
||||
from qdrant_client.http import models as rest
|
||||
|
||||
# Import der modularisierten Namen-Logik zur Sicherstellung der Konsistenz
|
||||
from app.core.database import collection_names
|
||||
|
||||
def fetch_note_payload(client: QdrantClient, prefix: str, note_id: str) -> Optional[dict]:
    """Fetch the stored metadata payload of a note from Qdrant via scroll.

    Returns the payload of the first matching point in the notes
    collection, or None when the note does not exist or the lookup fails.
    """
    notes_col, _, _ = collection_names(prefix)
    try:
        f = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))])
        pts, _ = client.scroll(collection_name=notes_col, scroll_filter=f, limit=1, with_payload=True)
        return pts[0].payload if pts else None
    except Exception:
        # Fix: a bare `except:` also swallowed SystemExit/KeyboardInterrupt.
        return None
|
||||
|
||||
def artifacts_missing(client: QdrantClient, prefix: str, note_id: str) -> Tuple[bool, bool]:
    """Actively probe Qdrant for existing chunks and edges of *note_id*.

    Returns ``(chunks_missing, edges_missing)``; on any probe failure both
    are reported missing so callers re-ingest defensively.
    """
    _, chunks_col, edges_col = collection_names(prefix)
    try:
        f = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))])
        c_pts, _ = client.scroll(collection_name=chunks_col, scroll_filter=f, limit=1)
        e_pts, _ = client.scroll(collection_name=edges_col, scroll_filter=f, limit=1)
        return (not bool(c_pts)), (not bool(e_pts))
    except Exception:
        # Fix: narrowed from a bare `except:` so interrupts still propagate.
        return True, True
|
||||
|
||||
def purge_artifacts(client: QdrantClient, prefix: str, note_id: str):
    """Delete orphaned chunks/edges of *note_id* before a re-import.

    Best-effort per collection: a failing delete (e.g. collection not yet
    created) is ignored so the import itself is not aborted.
    """
    _, chunks_col, edges_col = collection_names(prefix)
    f = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))])
    # Iterate over the centrally managed collection names (WP-14).
    for col in [chunks_col, edges_col]:
        try:
            client.delete(collection_name=col, points_selector=rest.FilterSelector(filter=f))
        except Exception:
            # Fix: narrowed from a bare `except:` (kept deliberately best-effort).
            pass
|
||||
160
app/core/ingestion/ingestion_note_payload.py
Normal file
160
app/core/ingestion/ingestion_note_payload.py
Normal file
|
|
@ -0,0 +1,160 @@
|
|||
"""
|
||||
FILE: app/core/ingestion/ingestion_note_payload.py
|
||||
DESCRIPTION: Baut das JSON-Objekt für mindnet_notes.
|
||||
FEATURES:
|
||||
- Multi-Hash (body/full) für flexible Change Detection.
|
||||
- Fix v2.4.4: Integration der zentralen Registry (WP-14) für konsistente Defaults.
|
||||
VERSION: 2.4.4
|
||||
STATUS: Active
|
||||
"""
|
||||
from __future__ import annotations
|
||||
from typing import Any, Dict, Tuple, Optional
|
||||
import os
|
||||
import json
|
||||
import pathlib
|
||||
import hashlib
|
||||
|
||||
# Import der zentralen Registry-Logik
|
||||
from app.core.registry import load_type_registry
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helper
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _as_dict(x) -> Dict[str, Any]:
|
||||
"""Versucht, ein Objekt in ein Dict zu überführen."""
|
||||
if isinstance(x, dict): return dict(x)
|
||||
out: Dict[str, Any] = {}
|
||||
for attr in ("frontmatter", "body", "id", "note_id", "title", "path", "tags", "type", "created", "modified", "date"):
|
||||
if hasattr(x, attr):
|
||||
val = getattr(x, attr)
|
||||
if val is not None: out[attr] = val
|
||||
if not out: out["raw"] = str(x)
|
||||
return out
|
||||
|
||||
def _ensure_list(x) -> list:
|
||||
"""Sichert String-Listen Integrität."""
|
||||
if x is None: return []
|
||||
if isinstance(x, list): return [str(i) for i in x]
|
||||
if isinstance(x, (set, tuple)): return [str(i) for i in x]
|
||||
return [str(x)]
|
||||
|
||||
def _compute_hash(content: str) -> str:
|
||||
"""SHA-256 Hash-Berechnung."""
|
||||
if not content: return ""
|
||||
return hashlib.sha256(content.encode("utf-8")).hexdigest()
|
||||
|
||||
def _get_hash_source_content(n: Dict[str, Any], mode: str) -> str:
|
||||
"""Generiert den Hash-Input-String basierend auf Body oder Metadaten."""
|
||||
body = str(n.get("body") or "")
|
||||
if mode == "body": return body
|
||||
if mode == "full":
|
||||
fm = n.get("frontmatter") or {}
|
||||
meta_parts = []
|
||||
# Sortierte Liste für deterministische Hashes
|
||||
for k in sorted(["title", "type", "status", "tags", "chunking_profile", "chunk_profile", "retriever_weight"]):
|
||||
val = fm.get(k)
|
||||
if val is not None: meta_parts.append(f"{k}:{val}")
|
||||
return f"{'|'.join(meta_parts)}||{body}"
|
||||
return body
|
||||
|
||||
def _cfg_for_type(note_type: str, reg: dict) -> dict:
|
||||
"""Extrahiert Typ-spezifische Config aus der Registry."""
|
||||
if not isinstance(reg, dict): return {}
|
||||
types = reg.get("types") if isinstance(reg.get("types"), dict) else reg
|
||||
return types.get(note_type, {}) if isinstance(types, dict) else {}
|
||||
|
||||
def _cfg_defaults(reg: dict) -> dict:
|
||||
"""Extrahiert globale Default-Werte aus der Registry."""
|
||||
if not isinstance(reg, dict): return {}
|
||||
for key in ("defaults", "default", "global"):
|
||||
v = reg.get(key)
|
||||
if isinstance(v, dict): return v
|
||||
return {}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Haupt-API
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]:
    """Build the Qdrant payload for one note, including multi-hash data.

    Resolution order for configurable fields: frontmatter override >
    type-specific registry config > global registry defaults >
    environment/hard default.

    Keyword args:
        types_cfg: pre-loaded registry dict (loaded globally when absent).
        file_path: source path fallback when the note carries no path.
        hash_source / hash_normalize: labels baked into the hash keys.

    Returns:
        A JSON-serializable payload dict. The final ``json.loads(dumps(...))``
        round-trip raises if any value is not serializable.
    """
    n = _as_dict(note)

    # Use the caller-supplied registry or load the global one.
    reg = kwargs.get("types_cfg") or load_type_registry()
    hash_source = kwargs.get("hash_source", "parsed")
    hash_normalize = kwargs.get("hash_normalize", "canonical")

    fm = n.get("frontmatter") or {}
    note_type = str(fm.get("type") or n.get("type") or "concept")

    cfg_type = _cfg_for_type(note_type, reg)
    cfg_def = _cfg_defaults(reg)
    ingest_cfg = reg.get("ingestion_settings", {})

    # --- retriever_weight ---
    # Priority: frontmatter -> type config -> global config -> env var.
    default_rw = float(os.environ.get("MINDNET_DEFAULT_RETRIEVER_WEIGHT", 1.0))
    retriever_weight = fm.get("retriever_weight")
    if retriever_weight is None:
        retriever_weight = cfg_type.get("retriever_weight", cfg_def.get("retriever_weight", default_rw))
    try:
        retriever_weight = float(retriever_weight)
    except (TypeError, ValueError):
        # Fix: catch only conversion failures; the previous bare except
        # silently masked unrelated errors.
        retriever_weight = default_rw

    # --- chunk_profile ---
    # Primarily driven by ingestion_settings in the registry.
    chunk_profile = fm.get("chunking_profile") or fm.get("chunk_profile")
    if chunk_profile is None:
        chunk_profile = cfg_type.get("chunking_profile") or cfg_type.get("chunk_profile")
    if chunk_profile is None:
        chunk_profile = ingest_cfg.get("default_chunk_profile", cfg_def.get("chunking_profile", "sliding_standard"))

    # --- edge_defaults ---
    edge_defaults = fm.get("edge_defaults")
    if edge_defaults is None:
        edge_defaults = cfg_type.get("edge_defaults", cfg_def.get("edge_defaults", []))
    edge_defaults = _ensure_list(edge_defaults)

    # --- base metadata ---
    note_id = n.get("note_id") or n.get("id") or fm.get("id")
    title = n.get("title") or fm.get("title") or ""
    path = n.get("path") or kwargs.get("file_path") or ""
    if isinstance(path, pathlib.Path):
        path = str(path)

    payload: Dict[str, Any] = {
        "note_id": note_id,
        "title": title,
        "type": note_type,
        "path": path,
        "retriever_weight": retriever_weight,
        "chunk_profile": chunk_profile,
        "edge_defaults": edge_defaults,
        "hashes": {}
    }

    # --- multi-hash ---
    # One hash per mode so change detection can switch modes later.
    for mode in ["body", "full"]:
        content = _get_hash_source_content(n, mode)
        payload["hashes"][f"{mode}:{hash_source}:{hash_normalize}"] = _compute_hash(content)

    # Metadata enrichment.
    tags = fm.get("tags") or fm.get("keywords") or n.get("tags")
    if tags:
        payload["tags"] = _ensure_list(tags)
    if fm.get("aliases"):
        payload["aliases"] = _ensure_list(fm.get("aliases"))

    for k in ("created", "modified", "date"):
        v = fm.get(k) or n.get(k)
        if v:
            payload[k] = str(v)

    if n.get("body"):
        payload["fulltext"] = str(n["body"])

    # Final JSON serializability audit (raises on non-serializable values).
    json.loads(json.dumps(payload, ensure_ascii=False))

    return payload
|
||||
220
app/core/ingestion/ingestion_processor.py
Normal file
220
app/core/ingestion/ingestion_processor.py
Normal file
|
|
@ -0,0 +1,220 @@
|
|||
"""
|
||||
FILE: app/core/ingestion/ingestion_processor.py
|
||||
DESCRIPTION: Der zentrale IngestionService (Orchestrator).
|
||||
WP-14: Modularisierung der Datenbank-Ebene (app.core.database).
|
||||
WP-15b: Two-Pass Workflow mit globalem Kontext-Cache.
|
||||
WP-20/22: Cloud-Resilienz und Content-Lifecycle integriert.
|
||||
AUDIT v2.13.10: Umstellung auf app.core.database Infrastruktur.
|
||||
VERSION: 2.13.10
|
||||
STATUS: Active
|
||||
"""
|
||||
import logging
|
||||
import asyncio
|
||||
import os
|
||||
from typing import Dict, List, Optional, Tuple, Any
|
||||
|
||||
# Core Module Imports
|
||||
from app.core.parser import (
|
||||
read_markdown, pre_scan_markdown, normalize_frontmatter,
|
||||
validate_required_frontmatter, NoteContext
|
||||
)
|
||||
from app.core.chunking import assemble_chunks
|
||||
|
||||
# MODULARISIERUNG: Neue Import-Pfade für die Datenbank-Ebene
|
||||
from app.core.database.qdrant import QdrantConfig, get_client, ensure_collections, ensure_payload_indexes
|
||||
from app.core.database.qdrant_points import points_for_chunks, points_for_note, points_for_edges, upsert_batch
|
||||
|
||||
# Services
|
||||
from app.services.embeddings_client import EmbeddingsClient
|
||||
from app.services.edge_registry import registry as edge_registry
|
||||
from app.services.llm_service import LLMService
|
||||
|
||||
# Package-Interne Imports (Refactoring WP-14)
|
||||
from .ingestion_utils import load_type_registry, resolve_note_type, get_chunk_config_by_profile
|
||||
from .ingestion_db import fetch_note_payload, artifacts_missing, purge_artifacts
|
||||
from .ingestion_validation import validate_edge_candidate
|
||||
from .ingestion_note_payload import make_note_payload
|
||||
from .ingestion_chunk_payload import make_chunk_payloads
|
||||
|
||||
# Fallback für Edges (Struktur-Verknüpfung)
|
||||
try:
|
||||
from app.core.derive_edges import build_edges_for_note
|
||||
except ImportError:
|
||||
def build_edges_for_note(*args, **kwargs): return []
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class IngestionService:
    """Central ingestion orchestrator (WP-14/WP-15b).

    Transforms markdown notes into graph artifacts (note payload, chunks,
    edges) and upserts them into Qdrant. `run_batch` implements the
    two-pass workflow: pass 1 pre-scans all files into the LocalBatchCache,
    pass 2 processes each file with semantic edge validation against it.
    """

    def __init__(self, collection_prefix: str = None):
        """Initialize the service on top of the app.core.database infrastructure.

        Args:
            collection_prefix: Qdrant collection prefix; falls back to the
                configured COLLECTION_PREFIX when None.
        """
        # Function-scope import of the settings accessor.
        from app.config import get_settings
        self.settings = get_settings()

        self.prefix = collection_prefix or self.settings.COLLECTION_PREFIX
        self.cfg = QdrantConfig.from_env()
        # Keep the Qdrant config in sync with the instance prefix.
        self.cfg.prefix = self.prefix
        self.client = get_client(self.cfg)
        self.dim = self.settings.VECTOR_SIZE
        self.registry = load_type_registry()
        self.embedder = EmbeddingsClient()
        self.llm = LLMService()

        self.active_hash_mode = self.settings.CHANGE_DETECTION_MODE
        self.batch_cache: Dict[str, NoteContext] = {}  # WP-15b LocalBatchCache

        try:
            # Modularized schema bootstrap (collections + payload indexes).
            ensure_collections(self.client, self.prefix, self.dim)
            ensure_payload_indexes(self.client, self.prefix)
        except Exception as e:
            logger.warning(f"DB initialization warning: {e}")

    async def run_batch(self, file_paths: List[str], vault_root: str) -> List[Dict[str, Any]]:
        """WP-15b: run the two-pass ingestion workflow over *file_paths*.

        Pass 1: pre-scan fills the context cache (3-way indexing).
        Pass 2: per-file processing uses the cache for semantic checks.

        Returns one result dict per file (same order as *file_paths*).
        """
        logger.info(f"🔍 [Pass 1] Pre-Scanning {len(file_paths)} files for Context Cache...")
        for path in file_paths:
            try:
                # Hand the registry over so the scan depth is configurable.
                ctx = pre_scan_markdown(path, registry=self.registry)
                if ctx:
                    # Index under note id, title AND file stem for robust lookup.
                    self.batch_cache[ctx.note_id] = ctx
                    self.batch_cache[ctx.title] = ctx
                    fname = os.path.splitext(os.path.basename(path))[0]
                    self.batch_cache[fname] = ctx
            except Exception as e:
                logger.warning(f"⚠️ Pre-scan failed for {path}: {e}")

        logger.info(f"🚀 [Pass 2] Semantic Processing of {len(file_paths)} files...")
        return [await self.process_file(p, vault_root, apply=True, purge_before=True) for p in file_paths]

    async def process_file(self, file_path: str, vault_root: str, **kwargs) -> Dict[str, Any]:
        """Transform one markdown file into graph artifacts and upsert them.

        Keyword args: apply (dry-run gate), force_replace, purge_before,
        note_scope_refs, hash_source, hash_normalize.

        Returns a result dict with "status" of "skipped" / "unchanged" /
        "dry-run" / "success", or an "error" message on failure.
        """
        apply = kwargs.get("apply", False)
        force_replace = kwargs.get("force_replace", False)
        purge_before = kwargs.get("purge_before", False)
        note_scope_refs = kwargs.get("note_scope_refs", False)
        hash_source = kwargs.get("hash_source", "parsed")
        hash_normalize = kwargs.get("hash_normalize", "canonical")

        result = {"path": file_path, "status": "skipped", "changed": False, "error": None}

        # 1. Parse & lifecycle gate.
        try:
            parsed = read_markdown(file_path)
            if not parsed: return {**result, "error": "Empty file"}
            fm = normalize_frontmatter(parsed.frontmatter)
            validate_required_frontmatter(fm)
        except Exception as e:
            return {**result, "error": f"Validation failed: {str(e)}"}

        # Dynamic lifecycle filter from the registry (WP-14).
        ingest_cfg = self.registry.get("ingestion_settings", {})
        ignore_list = ingest_cfg.get("ignore_statuses", ["system", "template", "archive", "hidden"])

        current_status = fm.get("status", "draft").lower().strip()
        if current_status in ignore_list:
            return {**result, "status": "skipped", "reason": "lifecycle_filter"}

        # 2. Payload & change detection (multi-hash).
        note_type = resolve_note_type(self.registry, fm.get("type"))
        note_pl = make_note_payload(
            parsed, vault_root=vault_root, file_path=file_path,
            hash_source=hash_source, hash_normalize=hash_normalize,
            types_cfg=self.registry
        )
        note_id = note_pl["note_id"]

        old_payload = None if force_replace else fetch_note_payload(self.client, self.prefix, note_id)
        check_key = f"{self.active_hash_mode}:{hash_source}:{hash_normalize}"
        old_hash = (old_payload or {}).get("hashes", {}).get(check_key)
        new_hash = note_pl.get("hashes", {}).get(check_key)

        # Re-ingest when forced, new, changed, or artifacts are missing.
        c_miss, e_miss = artifacts_missing(self.client, self.prefix, note_id)
        if not (force_replace or not old_payload or old_hash != new_hash or c_miss or e_miss):
            return {**result, "status": "unchanged", "note_id": note_id}

        if not apply:
            return {**result, "status": "dry-run", "changed": True, "note_id": note_id}

        # 3. Deep processing (chunking, validation, embedding).
        try:
            body_text = getattr(parsed, "body", "") or ""
            edge_registry.ensure_latest()
            profile = fm.get("chunk_profile") or fm.get("chunking_profile") or "sliding_standard"
            chunk_cfg = get_chunk_config_by_profile(self.registry, profile, note_type)
            enable_smart = chunk_cfg.get("enable_smart_edge_allocation", False)

            # WP-15b: the chunker prepares the per-chunk candidate pool.
            chunks = await assemble_chunks(note_id, body_text, note_type, config=chunk_cfg)
            for ch in chunks:
                filtered = []
                for cand in getattr(ch, "candidate_pool", []):
                    # WP-15b: only global_pool candidates need binary validation.
                    if cand.get("provenance") == "global_pool" and enable_smart:
                        if await validate_edge_candidate(ch.text, cand, self.batch_cache, self.llm, self.settings.MINDNET_LLM_PROVIDER):
                            filtered.append(cand)
                    else:
                        filtered.append(cand)
                ch.candidate_pool = filtered

            # Payload construction via the package-internal modules.
            chunk_pls = make_chunk_payloads(
                fm, note_pl["path"], chunks, file_path=file_path,
                types_cfg=self.registry
            )
            vecs = await self.embedder.embed_documents([c.get("window") or "" for c in chunk_pls]) if chunk_pls else []

            # Edge aggregation.
            edges = build_edges_for_note(
                note_id, chunk_pls,
                note_level_references=note_pl.get("references", []),
                include_note_scope_refs=note_scope_refs
            )
            for e in edges:
                e["kind"] = edge_registry.resolve(
                    e.get("kind", "related_to"),
                    provenance=e.get("provenance", "explicit"),
                    context={"file": file_path, "note_id": note_id, "line": e.get("line", "system")}
                )

            # 4. DB upsert via the modularized points logic.
            if purge_before and old_payload:
                purge_artifacts(self.client, self.prefix, note_id)

            n_name, n_pts = points_for_note(self.prefix, note_pl, None, self.dim)
            upsert_batch(self.client, n_name, n_pts)

            if chunk_pls and vecs:
                c_pts = points_for_chunks(self.prefix, chunk_pls, vecs)[1]
                upsert_batch(self.client, f"{self.prefix}_chunks", c_pts)

            if edges:
                e_pts = points_for_edges(self.prefix, edges)[1]
                upsert_batch(self.client, f"{self.prefix}_edges", e_pts)

            return {
                "path": file_path,
                "status": "success",
                "changed": True,
                "note_id": note_id,
                "chunks_count": len(chunk_pls),
                "edges_count": len(edges)
            }
        except Exception as e:
            logger.error(f"Processing failed: {e}", exc_info=True)
            return {**result, "error": str(e)}

    async def create_from_text(self, markdown_content: str, filename: str, vault_root: str, folder: str = "00_Inbox") -> Dict[str, Any]:
        """Create a note file from a text stream and trigger its ingestion.

        Writes below ``vault_root/folder`` and ingests with replace semantics.
        """
        target_path = os.path.join(vault_root, folder, filename)
        os.makedirs(os.path.dirname(target_path), exist_ok=True)
        with open(target_path, "w", encoding="utf-8") as f:
            f.write(markdown_content)
        # Brief pause before the pipeline re-reads the freshly written file.
        await asyncio.sleep(0.1)
        return await self.process_file(file_path=target_path, vault_root=vault_root, apply=True, force_replace=True, purge_before=True)
|
||||
71
app/core/ingestion/ingestion_utils.py
Normal file
71
app/core/ingestion/ingestion_utils.py
Normal file
|
|
@ -0,0 +1,71 @@
|
|||
"""
|
||||
FILE: app/core/ingestion/ingestion_utils.py
|
||||
DESCRIPTION: Hilfswerkzeuge für JSON-Recovery, Typ-Registry und Konfigurations-Lookups.
|
||||
AUDIT v2.13.9: Behebung des Circular Imports durch Nutzung der app.core.registry.
|
||||
"""
|
||||
import json
|
||||
import re
|
||||
from typing import Any, Optional, Dict
|
||||
|
||||
# ENTSCHEIDENDER FIX: Import der Basis-Logik aus dem neutralen Registry-Modul.
|
||||
# Dies bricht den Zirkelbezug auf, da dieses Modul keine Services mehr importiert.
|
||||
from app.core.registry import load_type_registry, clean_llm_text
|
||||
|
||||
def extract_json_from_response(text: str, registry: Optional[dict] = None) -> Any:
    """Extract a JSON value from a raw LLM response.

    The text is first cleaned of LLM control characters (WP-14 central
    ``clean_llm_text``), then a fenced ```json``` block is preferred.
    Failing a direct parse, recovery tries the outermost ``[...]`` list
    and finally the outermost ``{...}`` object. Returns [] when nothing
    parses (or the input is empty).
    """
    if not text:
        return []

    # 1. Central text cleanup via the neutral registry module.
    clean = clean_llm_text(text, registry)

    # 2. Prefer the content of a fenced markdown code block.
    match = re.search(r"```(?:json)?\s*(.*?)\s*```", clean, re.DOTALL)
    payload = match.group(1) if match else clean

    try:
        return json.loads(payload.strip())
    except json.JSONDecodeError:
        pass

    # Recovery: outermost JSON list.
    start = payload.find('[')
    end = payload.rfind(']') + 1
    if start != -1 and end > start:
        try:
            return json.loads(payload[start:end])
        except json.JSONDecodeError:
            # Fix: catch only parse errors instead of a bare except.
            pass

    # Recovery: outermost JSON object.
    start_obj = payload.find('{')
    end_obj = payload.rfind('}') + 1
    if start_obj != -1 and end_obj > start_obj:
        try:
            return json.loads(payload[start_obj:end_obj])
        except json.JSONDecodeError:
            pass
    return []
|
||||
|
||||
def resolve_note_type(registry: dict, requested: Optional[str]) -> str:
    """Resolve the effective note type.

    A requested type is honoured only when it is declared in the
    registry; otherwise the fallback configured under
    ``ingestion_settings.default_note_type`` is used ('concept' default).
    """
    known_types = registry.get("types", {})
    if requested and requested in known_types:
        return requested

    fallback_cfg = registry.get("ingestion_settings", {})
    return fallback_cfg.get("default_note_type", "concept")
|
||||
|
||||
def get_chunk_config_by_profile(registry: dict, profile_name: str, note_type: str) -> Dict[str, Any]:
    """Look up the chunker parameters for *profile_name* in the registry.

    Returns a copy of the profile dict (so callers can mutate it safely),
    with a YAML list "overlap" normalized to a tuple. Falls back to the
    type-based default configuration when the profile is not defined.

    Fix: the project import is now lazy - it only runs on the fallback
    path, so registry-defined profiles resolve without importing the
    chunking module.
    """
    profiles = registry.get("chunking_profiles", {})
    if profile_name in profiles:
        cfg = profiles[profile_name].copy()
        # YAML serializes tuples as lists; restore the expected tuple form.
        if "overlap" in cfg and isinstance(cfg["overlap"], list):
            cfg["overlap"] = tuple(cfg["overlap"])
        return cfg

    # Fallback: derive the config from the note type (lazy import).
    from app.core.chunking import get_chunk_config
    return get_chunk_config(note_type)
|
||||
67
app/core/ingestion/ingestion_validation.py
Normal file
67
app/core/ingestion/ingestion_validation.py
Normal file
|
|
@ -0,0 +1,67 @@
|
|||
"""
|
||||
FILE: app/core/ingestion/ingestion_validation.py
|
||||
DESCRIPTION: WP-15b semantische Validierung von Kanten gegen den LocalBatchCache.
|
||||
AUDIT v2.12.3: Integration der zentralen Text-Bereinigung (WP-14).
|
||||
"""
|
||||
import logging
|
||||
from typing import Dict, Any
|
||||
from app.core.parser import NoteContext
|
||||
|
||||
# ENTSCHEIDENDER FIX: Import der neutralen Bereinigungs-Logik zur Vermeidung von Circular Imports
|
||||
from app.core.registry import clean_llm_text
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
async def validate_edge_candidate(
    chunk_text: str,
    edge: Dict,
    batch_cache: Dict[str, NoteContext],
    llm_service: Any,
    provider: str
) -> bool:
    """WP-15b: semantically validate an edge candidate against its cached target.

    The LLM is asked for a binary verdict using the target's cached
    title/summary. Fail-open semantics: when the target is unknown, the
    edge has no target, or the LLM call errors out, the edge is allowed
    so links are never silently lost.
    """
    target_id = edge.get("to")

    # Fix: an edge without a target previously crashed on the
    # `"#" in target_id` anchor check below (TypeError on None).
    if not target_id:
        logger.info("ℹ️ [VALIDATION SKIP] Edge has no target - allowing link.")
        return True

    target_ctx = batch_cache.get(target_id)

    # Robust lookup fix (v2.12.2): support anchored ids ("note#section").
    if not target_ctx and "#" in target_id:
        base_id = target_id.split("#")[0]
        target_ctx = batch_cache.get(base_id)

    # Safety fallback (hard-link integrity): unknown target -> allow.
    if not target_ctx:
        logger.info(f"ℹ️ [VALIDATION SKIP] No context for '{target_id}' - allowing link.")
        return True

    template = llm_service.get_prompt("edge_validation", provider)

    try:
        logger.info(f"⚖️ [VALIDATING] Relation '{edge.get('kind')}' -> '{target_id}'...")
        prompt = template.format(
            chunk_text=chunk_text[:1500],
            target_title=target_ctx.title,
            target_summary=target_ctx.summary,
            edge_kind=edge.get("kind", "related_to")
        )

        # Request the raw verdict from the LLM service.
        raw_response = await llm_service.generate_raw_response(prompt, priority="background")

        # WP-14 fix: strip control characters before interpreting the verdict.
        response = clean_llm_text(raw_response)

        # Binary check: any "YES" in the answer confirms the relation.
        is_valid = "YES" in response.upper()

        if is_valid:
            logger.info(f"✅ [VALIDATED] Relation to '{target_id}' confirmed.")
        else:
            logger.info(f"🚫 [REJECTED] Relation to '{target_id}' irrelevant for this chunk.")
        return is_valid
    except Exception as e:
        logger.warning(f"⚠️ Validation error for {target_id}: {e}")
        # On doubt (timeout/error) allow the edge to avoid data loss.
        return True
|
||||
|
|
@ -1,268 +0,0 @@
|
|||
"""
|
||||
FILE: app/core/note_payload.py
|
||||
DESCRIPTION: Baut das JSON-Objekt.
|
||||
FEATURES:
|
||||
1. Multi-Hash: Berechnet immer 'body' AND 'full' Hashes für flexible Change Detection.
|
||||
2. Config-Fix: Liest korrekt 'chunking_profile' aus types.yaml (statt Legacy 'chunk_profile').
|
||||
VERSION: 2.3.0
|
||||
STATUS: Active
|
||||
DEPENDENCIES: yaml, os, json, pathlib, hashlib
|
||||
EXTERNAL_CONFIG: config/types.yaml
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, Dict, Tuple, Optional
|
||||
import os
|
||||
import json
|
||||
import pathlib
|
||||
import hashlib
|
||||
|
||||
try:
|
||||
import yaml # type: ignore
|
||||
except Exception:
|
||||
yaml = None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helper
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _as_dict(x) -> Dict[str, Any]:
|
||||
"""Versucht, ein ParsedMarkdown-ähnliches Objekt in ein Dict zu überführen."""
|
||||
if isinstance(x, dict):
|
||||
return dict(x)
|
||||
|
||||
out: Dict[str, Any] = {}
|
||||
for attr in (
|
||||
"frontmatter",
|
||||
"body",
|
||||
"id",
|
||||
"note_id",
|
||||
"title",
|
||||
"path",
|
||||
"tags",
|
||||
"type",
|
||||
"created",
|
||||
"modified",
|
||||
"date",
|
||||
):
|
||||
if hasattr(x, attr):
|
||||
val = getattr(x, attr)
|
||||
if val is not None:
|
||||
out[attr] = val
|
||||
|
||||
if not out:
|
||||
out["raw"] = str(x)
|
||||
|
||||
return out
|
||||
|
||||
|
||||
def _pick_args(*args, **kwargs) -> Tuple[Optional[str], Optional[dict]]:
|
||||
path = kwargs.get("path") or (args[0] if args else None)
|
||||
types_cfg = kwargs.get("types_cfg") or kwargs.get("types") or None
|
||||
return path, types_cfg
|
||||
|
||||
|
||||
def _env_float(name: str, default: float) -> float:
|
||||
try:
|
||||
return float(os.environ.get(name, default))
|
||||
except Exception:
|
||||
return default
|
||||
|
||||
|
||||
def _ensure_list(x) -> list:
|
||||
if x is None:
|
||||
return []
|
||||
if isinstance(x, list):
|
||||
return [str(i) for i in x]
|
||||
if isinstance(x, (set, tuple)):
|
||||
return [str(i) for i in x]
|
||||
return [str(x)]
|
||||
|
||||
# --- Hash Logic ---
|
||||
def _compute_hash(content: str) -> str:
|
||||
"""Berechnet einen SHA-256 Hash für den gegebenen String."""
|
||||
if not content:
|
||||
return ""
|
||||
return hashlib.sha256(content.encode("utf-8")).hexdigest()
|
||||
|
||||
def _get_hash_source_content(n: Dict[str, Any], mode: str) -> str:
|
||||
"""
|
||||
Stellt den String zusammen, der gehasht werden soll.
|
||||
"""
|
||||
body = str(n.get("body") or "")
|
||||
|
||||
if mode == "body":
|
||||
return body
|
||||
|
||||
if mode == "full":
|
||||
fm = n.get("frontmatter") or {}
|
||||
# Wichtig: Sortierte Keys für deterministisches Verhalten!
|
||||
# Wir nehmen alle steuernden Metadaten auf
|
||||
meta_parts = []
|
||||
# Hier checken wir keys, die eine Neu-Indizierung rechtfertigen würden
|
||||
for k in sorted(["title", "type", "status", "tags", "chunking_profile", "chunk_profile", "retriever_weight"]):
|
||||
val = fm.get(k)
|
||||
if val is not None:
|
||||
meta_parts.append(f"{k}:{val}")
|
||||
|
||||
meta_str = "|".join(meta_parts)
|
||||
return f"{meta_str}||{body}"
|
||||
|
||||
return body
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Type-Registry laden
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _load_types_config(explicit_cfg: Optional[dict] = None) -> dict:
|
||||
if explicit_cfg and isinstance(explicit_cfg, dict):
|
||||
return explicit_cfg
|
||||
|
||||
path = os.getenv("MINDNET_TYPES_FILE") or "./config/types.yaml"
|
||||
if not os.path.isfile(path) or yaml is None:
|
||||
return {}
|
||||
|
||||
try:
|
||||
with open(path, "r", encoding="utf-8") as f:
|
||||
data = yaml.safe_load(f) or {}
|
||||
return data if isinstance(data, dict) else {}
|
||||
except Exception:
|
||||
return {}
|
||||
|
||||
|
||||
def _cfg_for_type(note_type: str, reg: dict) -> dict:
|
||||
if not isinstance(reg, dict):
|
||||
return {}
|
||||
types = reg.get("types") if isinstance(reg.get("types"), dict) else reg
|
||||
return types.get(note_type, {}) if isinstance(types, dict) else {}
|
||||
|
||||
|
||||
def _cfg_defaults(reg: dict) -> dict:
|
||||
if not isinstance(reg, dict):
|
||||
return {}
|
||||
for key in ("defaults", "default", "global"):
|
||||
v = reg.get(key)
|
||||
if isinstance(v, dict):
|
||||
return v
|
||||
return {}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Haupt-API
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]:
    """
    Build the payload dict for the mindnet_notes collection.

    Includes multi-strategy hash computation ('body' AND 'full') and the
    corrected chunking-profile lookup chain (frontmatter > type config >
    defaults > 'sliding_standard').
    """
    data = _as_dict(note)
    path_arg, explicit_types = _pick_args(*args, **kwargs)
    registry = _load_types_config(explicit_types)

    # Hash parameters; the mode itself is fixed (both strategies are stored).
    hash_source = kwargs.get("hash_source", "parsed")
    hash_normalize = kwargs.get("hash_normalize", "canonical")

    fm = data.get("frontmatter") or {}
    note_type = str(fm.get("type") or data.get("type") or "concept")

    type_cfg = _cfg_for_type(note_type, registry)
    default_cfg = _cfg_defaults(registry)

    # --- retriever_weight: frontmatter > type config > defaults > env ---
    env_weight = _env_float("MINDNET_DEFAULT_RETRIEVER_WEIGHT", 1.0)
    weight = fm.get("retriever_weight")
    if weight is None:
        weight = type_cfg.get("retriever_weight",
                              default_cfg.get("retriever_weight", env_weight))
    try:
        weight = float(weight)
    except Exception:
        weight = env_weight

    # --- chunk profile: both spellings accepted (FIXED lookup chain) ---
    profile = fm.get("chunking_profile") or fm.get("chunk_profile")
    if profile is None:
        profile = type_cfg.get("chunking_profile")
    if profile is None:
        profile = default_cfg.get("chunking_profile", "sliding_standard")
    if not isinstance(profile, str) or not profile:
        profile = "sliding_standard"

    # --- edge defaults ---
    edge_defaults = fm.get("edge_defaults")
    if edge_defaults is None:
        edge_defaults = type_cfg.get("edge_defaults",
                                     default_cfg.get("edge_defaults", []))
    edge_defaults = _ensure_list(edge_defaults)

    # --- base metadata ---
    note_id = data.get("note_id") or data.get("id") or fm.get("id")
    title = data.get("title") or fm.get("title") or ""
    note_path = data.get("path") or path_arg
    if isinstance(note_path, pathlib.Path):
        note_path = str(note_path)

    payload: Dict[str, Any] = {
        "note_id": note_id,
        "title": title,
        "type": note_type,
        "path": note_path or "",
        "retriever_weight": weight,
        "chunk_profile": profile,
        "edge_defaults": edge_defaults,
        "hashes": {},
    }

    # Both hash strategies are always stored; the ingestion step decides via
    # ENV which one is compared (strategy decoupling).
    for mode in ("body", "full"):
        content = _get_hash_source_content(data, mode)
        key = f"{mode}:{hash_source}:{hash_normalize}"
        payload["hashes"][key] = _compute_hash(content)

    tags = fm.get("tags") or fm.get("keywords") or data.get("tags")
    if tags:
        payload["tags"] = _ensure_list(tags)

    aliases = fm.get("aliases")
    if aliases:
        payload["aliases"] = _ensure_list(aliases)

    # Timestamps are carried over as strings.
    for key in ("created", "modified", "date"):
        value = fm.get(key) or data.get(key)
        if value:
            payload[key] = str(value)

    if "body" in data and data["body"]:
        payload["fulltext"] = str(data["body"])

    # Round-trip to guarantee the payload is JSON-serializable.
    json.loads(json.dumps(payload, ensure_ascii=False))
    return payload
|
||||
|
|
@ -1,257 +0,0 @@
|
|||
"""
|
||||
FILE: app/core/parser.py
|
||||
DESCRIPTION: Liest Markdown-Dateien fehlertolerant (Encoding-Fallback). Trennt Frontmatter (YAML) vom Body.
|
||||
WP-22 Erweiterung: Kanten-Extraktion mit Zeilennummern für die EdgeRegistry.
|
||||
VERSION: 1.8.0
|
||||
STATUS: Active
|
||||
DEPENDENCIES: yaml, re, dataclasses, json, io, os
|
||||
LAST_ANALYSIS: 2025-12-23
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, Optional, Tuple, Iterable, List
|
||||
import io
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
|
||||
try:
|
||||
import yaml # PyYAML
|
||||
except Exception as e: # pragma: no cover
|
||||
yaml = None # Fehler wird zur Laufzeit geworfen, falls wirklich benötigt
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------
|
||||
# Datamodell
|
||||
# ---------------------------------------------------------------------
|
||||
|
||||
@dataclass
class ParsedNote:
    """Container for a fully parsed Markdown file."""
    frontmatter: Dict[str, Any]  # parsed YAML frontmatter ({} when absent or invalid)
    body: str                    # Markdown body with the frontmatter block removed
    path: str                    # source path as passed to read_markdown()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------
|
||||
# Frontmatter-Erkennung
|
||||
# ---------------------------------------------------------------------
|
||||
|
||||
# Öffentliche Kompatibilitäts-Konstante: frühere Skripte importieren FRONTMATTER_RE
|
||||
FRONTMATTER_RE = re.compile(r"^\s*---\s*$") # <— public
|
||||
# Zusätzlich interner Alias (falls jemand ihn referenziert)
|
||||
FRONTMATTER_END = FRONTMATTER_RE # <— public alias
|
||||
|
||||
# interne Namen bleiben bestehen
|
||||
_FRONTMATTER_HEAD = FRONTMATTER_RE
|
||||
_FRONTMATTER_END = FRONTMATTER_RE
|
||||
|
||||
|
||||
def _split_frontmatter(text: str) -> Tuple[Dict[str, Any], str]:
    """
    Split *text* into (frontmatter dict, body string).

    Frontmatter is only recognized when the first line is '---' and a second
    '---' follows within the first 2000 lines. YAML errors do NOT abort; they
    are logged and an empty dict is used instead.
    """
    lines = text.splitlines(True)  # keep line endings for exact reassembly
    if not lines or not _FRONTMATTER_HEAD.match(lines[0]):
        # No opening marker -> the whole text is body.
        return {}, text

    close_idx = next(
        (i for i in range(1, min(len(lines), 2000)) if _FRONTMATTER_END.match(lines[i])),
        None,
    )
    if close_idx is None:
        # Unterminated frontmatter block -> treat everything as body.
        return {}, text

    raw_yaml = "".join(lines[1:close_idx])
    body = "".join(lines[close_idx + 1:])

    if yaml is None:
        raise RuntimeError("PyYAML ist nicht installiert (pip install pyyaml).")

    try:
        parsed = yaml.safe_load(raw_yaml) or {}
        meta: Dict[str, Any] = parsed if isinstance(parsed, dict) else {}
    except Exception as e:
        # A broken frontmatter must not kill ingestion; log and continue.
        print(json.dumps({"warn": "frontmatter_yaml_parse_failed", "error": str(e)}))
        meta = {}

    # Cosmetic: drop a single leading blank line from the body.
    if body.startswith("\n"):
        body = body[1:]

    return meta, body
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------
|
||||
# Robustes Lesen mit Encoding-Fallback
|
||||
# ---------------------------------------------------------------------
|
||||
|
||||
_FALLBACK_ENCODINGS: Tuple[str, ...] = ("utf-8", "utf-8-sig", "cp1252", "latin-1")
|
||||
|
||||
|
||||
def _read_text_with_fallback(path: str) -> Tuple[str, str, bool]:
    """
    Read a file, trying several encodings in order.

    Returns (text, used_encoding, had_fallback); had_fallback is True for any
    encoding other than plain 'utf-8' (including the benign 'utf-8-sig').
    """
    last_err: Optional[str] = None
    for encoding in _FALLBACK_ENCODINGS:
        try:
            with io.open(path, "r", encoding=encoding, errors="strict") as handle:
                return handle.read(), encoding, (encoding != "utf-8")
        except UnicodeDecodeError as exc:
            last_err = f"{type(exc).__name__}: {exc}"

    # Absolute last resort: lossy UTF-8 decode that can never raise.
    with open(path, "rb") as handle:
        blob = handle.read()
    text = blob.decode("utf-8", errors="replace")
    print(json.dumps({
        "path": path,
        "warn": "encoding_fallback_exhausted",
        "info": last_err or "unknown"
    }, ensure_ascii=False))
    return text, "utf-8(replace)", True
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------
|
||||
# Öffentliche API
|
||||
# ---------------------------------------------------------------------
|
||||
|
||||
def read_markdown(path: str) -> Optional[ParsedNote]:
    """Read one Markdown file fault-tolerantly; returns None if *path* is missing."""
    if not os.path.exists(path):
        return None

    text, used_enc, fallback_used = _read_text_with_fallback(path)
    if fallback_used:
        # Surface non-UTF-8 sources so operators can fix their files.
        print(json.dumps({"path": path, "warn": "encoding_fallback_used", "used": used_enc}, ensure_ascii=False))

    meta, body = _split_frontmatter(text)
    return ParsedNote(frontmatter=meta or {}, body=body or "", path=path)
|
||||
|
||||
|
||||
def validate_required_frontmatter(fm: Dict[str, Any],
                                  required: Tuple[str, ...] = ("id", "title")) -> None:
    """
    Ensure all mandatory frontmatter fields are present and non-blank.

    Raises ValueError listing the missing fields; additionally rejects a
    'tags' value that is neither a list nor a tuple.
    """
    fm = fm or {}
    missing = []
    for key in required:
        value = fm.get(key)
        # Missing entirely, or a whitespace-only string, both count as absent.
        if value is None or (isinstance(value, str) and not value.strip()):
            missing.append(key)
    if missing:
        raise ValueError(f"Missing required frontmatter fields: {', '.join(missing)}")

    if "tags" in fm and fm["tags"] not in (None, "") and not isinstance(fm["tags"], (list, tuple)):
        raise ValueError("frontmatter 'tags' must be a list of strings")
|
||||
|
||||
|
||||
def normalize_frontmatter(fm: Dict[str, Any]) -> Dict[str, Any]:
    """Normalize 'tags' into a list of stripped strings; coerce flags to bool."""
    result = dict(fm or {})

    if "tags" in result:
        tags = result["tags"]
        if isinstance(tags, str):
            stripped = tags.strip()
            result["tags"] = [stripped] if stripped else []
        elif isinstance(tags, list):
            result["tags"] = [str(t).strip() for t in tags if t is not None]
        elif tags in (None, ""):
            result["tags"] = []
        else:
            # Scalar fallback: wrap the stringified value.
            result["tags"] = [str(tags).strip()]

    if "embedding_exclude" in result:
        result["embedding_exclude"] = bool(result["embedding_exclude"])
    return result
|
||||
|
||||
|
||||
# ------------------------------ Wikilinks ---------------------------- #
|
||||
|
||||
_WIKILINK_RE = re.compile(r"\[\[([^\]]+)\]\]")
|
||||
|
||||
|
||||
def extract_wikilinks(text: str) -> List[str]:
    """Extract wikilink targets ([[id]], [[id|label]], [[id#anchor]]) as plain IDs."""
    if not text:
        return []
    targets: List[str] = []
    for match in _WIKILINK_RE.finditer(text):
        inner = (match.group(1) or "").strip()
        if not inner:
            continue
        # Drop the alias part and any section anchor; only the note ID matters.
        inner = inner.split("|", 1)[0].strip()
        inner = inner.split("#", 1)[0].strip()
        if inner:
            targets.append(inner)
    return targets
|
||||
|
||||
|
||||
def extract_edges_with_context(parsed: ParsedNote) -> List[Dict[str, Any]]:
    """
    WP-22: Extract wikilinks [[target|kind]] from the body together with the
    1-based line number, ready for direct consumption by the ingestion.
    """
    results: List[Dict[str, Any]] = []
    if not parsed or not parsed.body:
        return results

    for line_no, line in enumerate(parsed.body.splitlines(), start=1):
        for match in _WIKILINK_RE.finditer(line):
            inner = (match.group(1) or "").strip()
            if not inner:
                continue

            # [[target|kind]] syntax; the kind falls back to 'related_to'.
            if "|" in inner:
                head, tail = inner.split("|", 1)
                target, kind = head.strip(), tail.strip()
            else:
                target, kind = inner, "related_to"

            # Anchors address sections; relations live at note (ID) level.
            target = target.split("#", 1)[0].strip()
            if target:
                results.append({
                    "to": target,
                    "kind": kind,
                    "line": line_no,
                    "provenance": "explicit",
                })
    return results
|
||||
22
app/core/parser/__init__.py
Normal file
22
app/core/parser/__init__.py
Normal file
|
|
@ -0,0 +1,22 @@
|
|||
"""
|
||||
FILE: app/core/parser/__init__.py
|
||||
DESCRIPTION: Package-Einstiegspunkt für den Parser.
|
||||
Ermöglicht das Löschen der parser.py Facade.
|
||||
VERSION: 1.10.0
|
||||
"""
|
||||
from .parsing_models import ParsedNote, NoteContext
|
||||
from .parsing_utils import (
|
||||
FRONTMATTER_RE, validate_required_frontmatter,
|
||||
normalize_frontmatter, extract_wikilinks, extract_edges_with_context
|
||||
)
|
||||
from .parsing_markdown import read_markdown
|
||||
from .parsing_scanner import pre_scan_markdown
|
||||
|
||||
# Kompatibilitäts-Alias
|
||||
FRONTMATTER_END = FRONTMATTER_RE
|
||||
|
||||
__all__ = [
|
||||
"ParsedNote", "NoteContext", "FRONTMATTER_RE", "FRONTMATTER_END",
|
||||
"read_markdown", "pre_scan_markdown", "validate_required_frontmatter",
|
||||
"normalize_frontmatter", "extract_wikilinks", "extract_edges_with_context"
|
||||
]
|
||||
60
app/core/parser/parsing_markdown.py
Normal file
60
app/core/parser/parsing_markdown.py
Normal file
|
|
@ -0,0 +1,60 @@
|
|||
"""
|
||||
FILE: app/core/parsing/parsing_markdown.py
|
||||
DESCRIPTION: Fehlertolerantes Einlesen von Markdown und Frontmatter-Splitting.
|
||||
"""
|
||||
import io
|
||||
import os
|
||||
import json
|
||||
from typing import Any, Dict, Optional, Tuple
|
||||
from .parsing_models import ParsedNote
|
||||
from .parsing_utils import FRONTMATTER_RE
|
||||
|
||||
try:
|
||||
import yaml
|
||||
except ImportError:
|
||||
yaml = None
|
||||
|
||||
_FALLBACK_ENCODINGS: Tuple[str, ...] = ("utf-8", "utf-8-sig", "cp1252", "latin-1")
|
||||
|
||||
def _split_frontmatter(text: str) -> Tuple[Dict[str, Any], str]:
    """Split raw text into a (frontmatter dict, body string) pair.

    Frontmatter requires '---' on line 1 and a closing '---' within the first
    2000 lines; YAML errors are logged and yield an empty dict.
    """
    lines = text.splitlines(True)
    if not lines or not FRONTMATTER_RE.match(lines[0]):
        return {}, text

    close_idx = next(
        (i for i in range(1, min(len(lines), 2000)) if FRONTMATTER_RE.match(lines[i])),
        None,
    )
    if close_idx is None:
        # Unterminated block -> everything is body.
        return {}, text

    raw_yaml = "".join(lines[1:close_idx])
    body = "".join(lines[close_idx + 1:])

    if yaml is None:
        raise RuntimeError("PyYAML not installed.")

    try:
        parsed = yaml.safe_load(raw_yaml) or {}
        meta: Dict[str, Any] = parsed if isinstance(parsed, dict) else {}
    except Exception as e:
        # Non-fatal: log and continue with empty frontmatter.
        print(json.dumps({"warn": "frontmatter_yaml_parse_failed", "error": str(e)}))
        meta = {}

    # Drop a single cosmetic leading blank line from the body.
    if body.startswith("\n"):
        body = body[1:]
    return meta, body
|
||||
|
||||
def _read_text_with_fallback(path: str) -> Tuple[str, str, bool]:
    """
    Read *path* trying each encoding in _FALLBACK_ENCODINGS in order.

    Returns (text, used_encoding, had_fallback); had_fallback is True for any
    encoding other than plain 'utf-8'. If every strict decode fails, a lossy
    UTF-8 'replace' decode is used so the function never raises.
    """
    last_err = None
    for enc in _FALLBACK_ENCODINGS:
        try:
            with io.open(path, "r", encoding=enc, errors="strict") as f:
                return f.read(), enc, (enc != "utf-8")
        except UnicodeDecodeError as e:
            last_err = str(e)
            continue
    # Last resort: lossy decode that cannot raise.
    with open(path, "rb") as fb:
        text = fb.read().decode("utf-8", errors="replace")
    # FIX: last_err was collected but never reported after the WP-14 split;
    # restore the legacy diagnostic so exhausted fallbacks remain visible.
    print(json.dumps({
        "path": path,
        "warn": "encoding_fallback_exhausted",
        "info": last_err or "unknown"
    }, ensure_ascii=False))
    return text, "utf-8(replace)", True
|
||||
|
||||
def read_markdown(path: str) -> Optional[ParsedNote]:
    """Public API: read a Markdown file fault-tolerantly; None when *path* is missing."""
    if not os.path.exists(path):
        return None
    text, enc, had_fb = _read_text_with_fallback(path)
    # FIX: 'enc'/'had_fb' were unpacked but silently discarded during the
    # WP-14 split; restore the legacy warning so operators can spot
    # mis-encoded source files.
    if had_fb:
        print(json.dumps({"path": path, "warn": "encoding_fallback_used", "used": enc}, ensure_ascii=False))
    fm, body = _split_frontmatter(text)
    return ParsedNote(frontmatter=fm or {}, body=body or "", path=path)
|
||||
22
app/core/parser/parsing_models.py
Normal file
22
app/core/parser/parsing_models.py
Normal file
|
|
@ -0,0 +1,22 @@
|
|||
"""
|
||||
FILE: app/core/parsing/parsing_models.py
|
||||
DESCRIPTION: Datenklassen für das Parsing-System.
|
||||
"""
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, List
|
||||
|
||||
@dataclass
class ParsedNote:
    """Container for a fully parsed Markdown file."""
    frontmatter: Dict[str, Any]  # parsed YAML frontmatter ({} when absent or invalid)
    body: str                    # Markdown body with the frontmatter block removed
    path: str                    # source path as passed to read_markdown()
|
||||
|
||||
@dataclass
class NoteContext:
    """Metadata container for the ephemeral LocalBatchCache (pass 1)."""
    note_id: str     # frontmatter 'id', or the file's basename as fallback
    title: str       # frontmatter 'title', defaults to note_id
    type: str        # note type, defaults to 'concept'
    summary: str     # cleaned, truncated body excerpt used for edge validation
    tags: List[str]  # frontmatter tags (empty list when missing or non-list)
|
||||
40
app/core/parser/parsing_scanner.py
Normal file
40
app/core/parser/parsing_scanner.py
Normal file
|
|
@ -0,0 +1,40 @@
|
|||
"""
|
||||
FILE: app/core/parsing/parsing_scanner.py
|
||||
DESCRIPTION: Pre-Scan für den LocalBatchCache (Pass 1).
|
||||
AUDIT v1.1.0: Dynamisierung der Scan-Parameter (WP-14).
|
||||
"""
|
||||
import os
|
||||
import re
|
||||
from typing import Optional, Dict, Any
|
||||
from .parsing_models import NoteContext
|
||||
from .parsing_markdown import read_markdown
|
||||
|
||||
def pre_scan_markdown(path: str, registry: Optional[Dict[str, Any]] = None) -> Optional[NoteContext]:
    """
    Pass 1: extract identity and a short context snippet for edge validation.

    WP-14: scan depth and summary length are configurable via the registry's
    'summary_settings' section; sensible defaults apply otherwise.
    """
    parsed = read_markdown(path)
    if not parsed:
        return None

    # Configuration with defaults (WP-14 dynamization).
    settings = (registry or {}).get("summary_settings", {})
    scan_depth = settings.get("pre_scan_depth", 600)
    max_len = settings.get("max_summary_length", 500)

    fm = parsed.frontmatter
    # Identity: explicit frontmatter id, else the file's basename.
    fallback_id = os.path.splitext(os.path.basename(path))[0]
    note_id = str(fm.get("id") or fallback_id)

    # Strip basic Markdown markup and truncate to the configured limits.
    snippet = re.sub(r'[#*`>]', '', parsed.body[:scan_depth]).strip()
    if len(snippet) > max_len:
        snippet = snippet[:max_len] + "..."

    raw_tags = fm.get("tags")
    return NoteContext(
        note_id=note_id,
        title=str(fm.get("title", note_id)),
        type=str(fm.get("type", "concept")),
        summary=snippet,
        tags=raw_tags if isinstance(raw_tags, list) else [],
    )
|
||||
69
app/core/parser/parsing_utils.py
Normal file
69
app/core/parser/parsing_utils.py
Normal file
|
|
@ -0,0 +1,69 @@
|
|||
"""
|
||||
FILE: app/core/parsing/parsing_utils.py
|
||||
DESCRIPTION: Werkzeuge zur Validierung, Normalisierung und Wikilink-Extraktion.
|
||||
"""
|
||||
import re
|
||||
from typing import Any, Dict, List, Tuple, Optional
|
||||
from .parsing_models import ParsedNote
|
||||
|
||||
# Öffentliche Konstanten für Abwärtskompatibilität
|
||||
FRONTMATTER_RE = re.compile(r"^\s*---\s*$")
|
||||
_WIKILINK_RE = re.compile(r"\[\[([^\]]+)\]\]")
|
||||
|
||||
def validate_required_frontmatter(fm: Dict[str, Any], required: Tuple[str, ...] = ("id", "title")) -> None:
    """Ensure all mandatory frontmatter fields are present and non-blank.

    Raises ValueError listing missing fields; also rejects a 'tags' value
    that is neither a list nor a tuple.
    """
    fm = fm or {}
    missing = []
    for key in required:
        value = fm.get(key)
        # Absent, None, or whitespace-only strings all count as missing.
        if value is None or (isinstance(value, str) and not value.strip()):
            missing.append(key)
    if missing:
        raise ValueError(f"Missing required frontmatter fields: {', '.join(missing)}")
    if "tags" in fm and fm["tags"] not in (None, "") and not isinstance(fm["tags"], (list, tuple)):
        raise ValueError("frontmatter 'tags' must be a list of strings")
|
||||
|
||||
def normalize_frontmatter(fm: Dict[str, Any]) -> Dict[str, Any]:
    """Normalize 'tags' into a list of stripped strings; coerce flags to bool."""
    result = dict(fm or {})

    if "tags" in result:
        tags = result["tags"]
        if isinstance(tags, str):
            stripped = tags.strip()
            result["tags"] = [stripped] if stripped else []
        elif isinstance(tags, list):
            result["tags"] = [str(t).strip() for t in tags if t is not None]
        elif tags in (None, ""):
            result["tags"] = []
        else:
            # Scalar fallback: wrap the stringified value.
            result["tags"] = [str(tags).strip()]

    if "embedding_exclude" in result:
        result["embedding_exclude"] = bool(result["embedding_exclude"])
    return result
|
||||
|
||||
def extract_wikilinks(text: str) -> List[str]:
    """Extract wikilink targets as a flat list of note IDs."""
    if not text:
        return []
    targets: List[str] = []
    for match in _WIKILINK_RE.finditer(text):
        inner = (match.group(1) or "").strip()
        if not inner:
            continue
        # Drop the alias part and any section anchor; only the note ID matters.
        inner = inner.split("|", 1)[0].strip()
        inner = inner.split("#", 1)[0].strip()
        if inner:
            targets.append(inner)
    return targets
|
||||
|
||||
def extract_edges_with_context(parsed: ParsedNote) -> List[Dict[str, Any]]:
    """WP-22: Extract wikilinks plus their 1-based line numbers for the EdgeRegistry."""
    results: List[Dict[str, Any]] = []
    if not parsed or not parsed.body:
        return results

    for line_no, line in enumerate(parsed.body.splitlines(), start=1):
        for match in _WIKILINK_RE.finditer(line):
            inner = (match.group(1) or "").strip()
            if not inner:
                continue

            # [[target|kind]] syntax; the kind falls back to 'related_to'.
            if "|" in inner:
                head, tail = inner.split("|", 1)
                target, kind = head.strip(), tail.strip()
            else:
                target, kind = inner, "related_to"

            # Anchors address sections; relations live at note (ID) level.
            target = target.split("#", 1)[0].strip()
            if target:
                results.append({"to": target, "kind": kind, "line": line_no, "provenance": "explicit"})
    return results
|
||||
|
|
@ -1,157 +1,18 @@
|
|||
"""
|
||||
FILE: app/core/qdrant.py
|
||||
DESCRIPTION: Qdrant-Client Factory und Schema-Management. Erstellt Collections und Payload-Indizes.
|
||||
VERSION: 2.2.0
|
||||
STATUS: Active
|
||||
DEPENDENCIES: qdrant_client, dataclasses, os
|
||||
LAST_ANALYSIS: 2025-12-15
|
||||
DESCRIPTION: Proxy-Modul zur Aufrechterhaltung der Abwärtskompatibilität (WP-14).
|
||||
Leitet alle Aufrufe an das neue database-Paket weiter.
|
||||
STATUS: Proxy (Legacy-Support)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional, Tuple, Dict, List
|
||||
|
||||
from qdrant_client import QdrantClient
|
||||
from qdrant_client.http import models as rest
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Konfiguration
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@dataclass
|
||||
class QdrantConfig:
|
||||
host: Optional[str] = None
|
||||
port: Optional[int] = None
|
||||
url: Optional[str] = None
|
||||
api_key: Optional[str] = None
|
||||
prefix: str = "mindnet"
|
||||
dim: int = 384
|
||||
distance: str = "Cosine" # Cosine | Dot | Euclid
|
||||
on_disk_payload: bool = True
|
||||
|
||||
@classmethod
|
||||
def from_env(cls) -> "QdrantConfig":
|
||||
# Entweder URL ODER Host/Port, API-Key optional
|
||||
url = os.getenv("QDRANT_URL") or None
|
||||
host = os.getenv("QDRANT_HOST") or None
|
||||
port = os.getenv("QDRANT_PORT")
|
||||
port = int(port) if port else None
|
||||
api_key = os.getenv("QDRANT_API_KEY") or None
|
||||
prefix = os.getenv("COLLECTION_PREFIX") or "mindnet"
|
||||
dim = int(os.getenv("VECTOR_DIM") or 384)
|
||||
distance = os.getenv("DISTANCE", "Cosine")
|
||||
on_disk_payload = (os.getenv("ON_DISK_PAYLOAD", "true").lower() == "true")
|
||||
return cls(
|
||||
host=host, port=port, url=url, api_key=api_key,
|
||||
prefix=prefix, dim=dim, distance=distance, on_disk_payload=on_disk_payload
|
||||
from .database.qdrant import (
|
||||
QdrantConfig,
|
||||
get_client,
|
||||
ensure_collections,
|
||||
ensure_payload_indexes,
|
||||
collection_names
|
||||
)
|
||||
|
||||
|
||||
def get_client(cfg: QdrantConfig) -> QdrantClient:
    """Build a QdrantClient from either a full URL or host/port settings."""
    if cfg.url:
        return QdrantClient(url=cfg.url, api_key=cfg.api_key, timeout=60.0)
    # Host/port mode with local defaults.
    return QdrantClient(
        host=cfg.host or "127.0.0.1",
        port=cfg.port or 6333,
        api_key=cfg.api_key,
        timeout=60.0,
    )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Collections
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def collection_names(prefix: str) -> Tuple[str, str, str]:
    """Return the (notes, chunks, edges) collection names for *prefix*."""
    notes = f"{prefix}_notes"
    chunks = f"{prefix}_chunks"
    edges = f"{prefix}_edges"
    return notes, chunks, edges
|
||||
|
||||
|
||||
def _vector_params(dim: int, distance: str) -> rest.VectorParams:
    """Map a distance name ('Cosine' | 'Dot' | 'Euclid') to VectorParams."""
    # Unknown names silently fall back to cosine distance.
    resolved = getattr(rest.Distance, distance.capitalize(), rest.Distance.COSINE)
    return rest.VectorParams(size=dim, distance=resolved)
|
||||
|
||||
|
||||
def ensure_collections(client: QdrantClient, prefix: str, dim: int) -> None:
    """Create mindnet_notes, mindnet_chunks and mindnet_edges if missing."""
    notes, chunks, edges = collection_names(prefix)
    distance = os.getenv("DISTANCE", "Cosine")

    # notes and chunks share the embedding dimensionality.
    for name in (notes, chunks):
        if not client.collection_exists(name):
            client.create_collection(
                collection_name=name,
                vectors_config=_vector_params(dim, distance),
                on_disk_payload=True,
            )

    # edges only carry a dummy 1-dim vector; filtering happens via payload.
    if not client.collection_exists(edges):
        client.create_collection(
            collection_name=edges,
            vectors_config=_vector_params(1, "Dot"),
            on_disk_payload=True,
        )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Payload-Indizes
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _ensure_index(client: QdrantClient, collection: str, field: str, schema: rest.PayloadSchemaType) -> None:
    """Idempotently create a payload index for one field.

    Errors (typically "already indexed") are deliberately swallowed so the
    call stays safe to repeat on every startup.
    """
    try:
        client.create_payload_index(collection_name=collection, field_name=field, field_schema=schema, wait=True)
    except Exception:
        # Duplicate-index responses are expected; add logging here if needed.
        pass
|
||||
|
||||
|
||||
def ensure_payload_indexes(client: QdrantClient, prefix: str) -> None:
    """
    Ensure that every required payload index exists.

    - notes:  note_id(KEYWORD), type(KEYWORD), title(TEXT), updated(INTEGER), tags(KEYWORD)
    - chunks: note_id(KEYWORD), chunk_id(KEYWORD), index(INTEGER), type(KEYWORD), tags(KEYWORD)
    - edges:  note_id(KEYWORD), kind(KEYWORD), scope(KEYWORD), source_id(KEYWORD),
              target_id(KEYWORD), chunk_id(KEYWORD)
    """
    notes, chunks, edges = collection_names(prefix)

    kw = rest.PayloadSchemaType.KEYWORD
    txt = rest.PayloadSchemaType.TEXT
    integer = rest.PayloadSchemaType.INTEGER

    # Data-driven index specification per collection.
    index_spec = {
        notes: [
            ("note_id", kw),
            ("type", kw),
            ("title", txt),
            ("updated", integer),
            ("tags", kw),
        ],
        chunks: [
            ("note_id", kw),
            ("chunk_id", kw),
            ("index", integer),
            ("type", kw),
            ("tags", kw),
        ],
        edges: [
            ("note_id", kw),
            ("kind", kw),
            ("scope", kw),
            ("source_id", kw),
            ("target_id", kw),
            ("chunk_id", kw),
        ],
    }

    for collection, fields in index_spec.items():
        for field_name, schema in fields:
            _ensure_index(client, collection, field_name, schema)
|
||||
|
||||
|
||||
# Re-Export für 100% Kompatibilität
|
||||
__all__ = [
|
||||
"QdrantConfig",
|
||||
"get_client",
|
||||
|
|
|
|||
|
|
@ -1,292 +1,24 @@
|
|||
"""
FILE: app/core/qdrant_points.py
DESCRIPTION: Proxy module that preserves backward compatibility (WP-14).
    Forwards point operations to the new database package.
    (Legacy role: object mapper for Qdrant — converted JSON payloads for
    notes, chunks and edges into PointStructs with deterministic UUIDs.)
VERSION: 1.5.0
STATUS: Proxy (Legacy-Support)
DEPENDENCIES: qdrant_client, uuid, os
LAST_ANALYSIS: 2025-12-15
"""
|
||||
from __future__ import annotations
|
||||
import os
|
||||
import uuid
|
||||
from typing import List, Tuple, Iterable, Optional, Dict, Any
|
||||
|
||||
from qdrant_client.http import models as rest
|
||||
from qdrant_client import QdrantClient
|
||||
|
||||
# --------------------- ID helpers ---------------------
|
||||
|
||||
def _to_uuid(stable_key: str) -> str:
|
||||
return str(uuid.uuid5(uuid.NAMESPACE_URL, stable_key))
|
||||
|
||||
def _names(prefix: str) -> Tuple[str, str, str]:
|
||||
return f"{prefix}_notes", f"{prefix}_chunks", f"{prefix}_edges"
|
||||
|
||||
# --------------------- Points builders ---------------------
|
||||
|
||||
def points_for_note(prefix: str, note_payload: dict, note_vec: List[float] | None, dim: int) -> Tuple[str, List[rest.PointStruct]]:
    """Map a note payload (plus optional embedding) to a single PointStruct.

    Falls back to a zero vector of length *dim* when no embedding is given,
    and to "missing-note-id" when the payload carries no id. Returns the
    target collection name and a one-element point list.
    """
    notes_col = _names(prefix)[0]
    if note_vec is None:
        vector = [0.0] * int(dim)
    else:
        vector = note_vec
    raw_note_id = note_payload.get("note_id") or note_payload.get("id") or "missing-note-id"
    point = rest.PointStruct(id=_to_uuid(raw_note_id), vector=vector, payload=note_payload)
    return notes_col, [point]
|
||||
|
||||
def points_for_chunks(prefix: str, chunk_payloads: List[dict], vectors: List[List[float]]) -> Tuple[str, List[rest.PointStruct]]:
    """Map chunk payloads and their embeddings to PointStructs.

    Payloads lacking a chunk_id get a synthetic "<note_id>#<index>" id
    (1-based position); the resolved id is always written back into the
    payload under "chunk_id".
    """
    chunks_col = _names(prefix)[1]
    points: List[rest.PointStruct] = []
    for position, (payload, vec) in enumerate(zip(chunk_payloads, vectors), start=1):
        cid = payload.get("chunk_id") or payload.get("id")
        if not cid:
            parent = payload.get("note_id") or payload.get("parent_note_id") or "missing-note"
            cid = f"{parent}#{position}"
        payload["chunk_id"] = cid
        points.append(rest.PointStruct(id=_to_uuid(cid), vector=vec, payload=payload))
    return chunks_col, points
|
||||
|
||||
def _normalize_edge_payload(pl: dict) -> dict:
|
||||
kind = pl.get("kind") or pl.get("edge_type") or "edge"
|
||||
source_id = pl.get("source_id") or pl.get("src_id") or "unknown-src"
|
||||
target_id = pl.get("target_id") or pl.get("dst_id") or "unknown-tgt"
|
||||
seq = pl.get("seq") or pl.get("order") or pl.get("index")
|
||||
|
||||
pl.setdefault("kind", kind)
|
||||
pl.setdefault("source_id", source_id)
|
||||
pl.setdefault("target_id", target_id)
|
||||
if seq is not None and "seq" not in pl:
|
||||
pl["seq"] = seq
|
||||
return pl
|
||||
|
||||
def points_for_edges(prefix: str, edge_payloads: List[dict]) -> Tuple[str, List[rest.PointStruct]]:
    """Map edge payloads to PointStructs for <prefix>_edges.

    Edges carry a 1-dim dummy vector ([0.0]); retrieval is payload/filter
    based. A missing edge_id is synthesized as "<kind>:<src>-><tgt>#<seq>"
    and written back into the payload.
    """
    _, _, edges_col = _names(prefix)
    points: List[rest.PointStruct] = []
    for raw in edge_payloads:
        pl = _normalize_edge_payload(raw)
        edge_id = pl.get("edge_id")
        if not edge_id:
            kind = pl.get("kind", "edge")
            s = pl.get("source_id", "unknown-src")
            t = pl.get("target_id", "unknown-tgt")
            # FIX: explicit None check keeps seq == 0 in the synthetic id
            # (previously `or ""` dropped it, risking edge_id collisions).
            seq = pl.get("seq")
            seq = "" if seq is None else seq
            edge_id = f"{kind}:{s}->{t}#{seq}"
        pl["edge_id"] = edge_id
        point_id = _to_uuid(edge_id)
        points.append(rest.PointStruct(id=point_id, vector=[0.0], payload=pl))
    return edges_col, points
|
||||
|
||||
# --------------------- Vector schema & overrides ---------------------
|
||||
|
||||
def _preferred_name(candidates: List[str]) -> str:
|
||||
for k in ("text", "default", "embedding", "content"):
|
||||
if k in candidates:
|
||||
return k
|
||||
return sorted(candidates)[0]
|
||||
|
||||
def _env_override_for_collection(collection: str) -> Optional[str]:
|
||||
"""
|
||||
Returns:
|
||||
- "__single__" to force single-vector
|
||||
- concrete name (str) to force named-vector with that name
|
||||
- None to auto-detect
|
||||
"""
|
||||
base = os.getenv("MINDNET_VECTOR_NAME")
|
||||
if collection.endswith("_notes"):
|
||||
base = os.getenv("NOTES_VECTOR_NAME", base)
|
||||
elif collection.endswith("_chunks"):
|
||||
base = os.getenv("CHUNKS_VECTOR_NAME", base)
|
||||
elif collection.endswith("_edges"):
|
||||
base = os.getenv("EDGES_VECTOR_NAME", base)
|
||||
|
||||
if not base:
|
||||
return None
|
||||
val = base.strip()
|
||||
if val.lower() in ("__single__", "single"):
|
||||
return "__single__"
|
||||
return val # concrete name
|
||||
|
||||
def _get_vector_schema(client: QdrantClient, collection_name: str) -> dict:
    """
    Return {"kind": "single", "size": int} or {"kind": "named", "names": [...], "primary": str}.

    Best-effort introspection of the collection's vector configuration.
    Any error (missing collection, unexpected client response shape) falls
    back to {"kind": "single", "size": None}, which callers treat as a
    plain single-vector collection.
    """
    try:
        info = client.get_collection(collection_name=collection_name)
        vecs = getattr(info, "vectors", None)
        # Single-vector config: the object exposes an integer .size directly.
        if hasattr(vecs, "size") and isinstance(vecs.size, int):
            return {"kind": "single", "size": vecs.size}
        # Named-vectors config (dict-like mapping name -> params in .config).
        cfg = getattr(vecs, "config", None)
        if isinstance(cfg, dict) and cfg:
            names = list(cfg.keys())
            if names:
                return {"kind": "named", "names": names, "primary": _preferred_name(names)}
    except Exception:
        # Deliberate best-effort: fall through to the single-vector default.
        pass
    return {"kind": "single", "size": None}
|
||||
|
||||
def _as_named(points: List[rest.PointStruct], name: str) -> List[rest.PointStruct]:
    """Rewrap every point's vector under the named-vector key *name*.

    Per point:
    - dict vector already containing *name*: kept as-is.
    - dict vector without *name*: an arbitrary existing entry is reused
      (or [0.0] if the dict is empty) and re-keyed under *name*.
    - plain list vector: wrapped as {name: vector}.
    - missing vector: point passed through unchanged.
    """
    out: List[rest.PointStruct] = []
    for pt in points:
        vec = getattr(pt, "vector", None)
        if isinstance(vec, dict):
            if name in vec:
                out.append(pt)
            else:
                # take any existing entry; if empty dict fallback to [0.0]
                fallback_vec = None
                try:
                    fallback_vec = list(next(iter(vec.values())))
                except Exception:
                    fallback_vec = [0.0]
                out.append(rest.PointStruct(id=pt.id, vector={name: fallback_vec}, payload=pt.payload))
        elif vec is not None:
            out.append(rest.PointStruct(id=pt.id, vector={name: vec}, payload=pt.payload))
        else:
            out.append(pt)
    return out
|
||||
|
||||
# --------------------- Qdrant ops ---------------------
|
||||
|
||||
def upsert_batch(client: QdrantClient, collection: str, points: List[rest.PointStruct]) -> None:
    """Upsert points, adapting them to the collection's vector schema.

    Resolution order:
      1. ENV override (see _env_override_for_collection) — forces single
         ("__single__") or a concrete named vector.
      2. Auto-detected schema via _get_vector_schema.
      3. Fallback: plain single-vector upsert.

    No-op for an empty point list. All upserts use wait=True (synchronous).
    """
    if not points:
        return

    # 1) ENV overrides come first
    override = _env_override_for_collection(collection)
    if override == "__single__":
        client.upsert(collection_name=collection, points=points, wait=True)
        return
    elif isinstance(override, str):
        client.upsert(collection_name=collection, points=_as_named(points, override), wait=True)
        return

    # 2) Auto-detect schema
    schema = _get_vector_schema(client, collection)
    if schema.get("kind") == "named":
        name = schema.get("primary") or _preferred_name(schema.get("names") or [])
        client.upsert(collection_name=collection, points=_as_named(points, name), wait=True)
        return

    # 3) Fallback single-vector
    client.upsert(collection_name=collection, points=points, wait=True)
|
||||
|
||||
# --- Optional search helpers ---
|
||||
|
||||
def _filter_any(field: str, values: Iterable[str]) -> rest.Filter:
    """Build an OR filter: payload[field] matches any of *values*."""
    conditions = []
    for candidate in values:
        conditions.append(rest.FieldCondition(key=field, match=rest.MatchValue(value=candidate)))
    return rest.Filter(should=conditions)
|
||||
|
||||
def _merge_filters(*filters: Optional[rest.Filter]) -> Optional[rest.Filter]:
    """AND-combine several optional filters into one.

    None entries are dropped; a single survivor is returned unchanged.
    For multiple filters, their `must` clauses are concatenated and each
    `should` group is wrapped in a nested Filter so its OR semantics are
    preserved inside the outer AND.
    """
    fs = [f for f in filters if f is not None]
    if not fs:
        return None
    if len(fs) == 1:
        return fs[0]
    must = []
    for f in fs:
        if getattr(f, "must", None):
            must.extend(f.must)
        if getattr(f, "should", None):
            # Keep OR groups intact by nesting them as a sub-filter.
            must.append(rest.Filter(should=f.should))
    return rest.Filter(must=must)
|
||||
|
||||
def _filter_from_dict(filters: Optional[Dict[str, Any]]) -> Optional[rest.Filter]:
    """Translate a plain {field: value | [values]} mapping into a Qdrant Filter.

    List/tuple/set values become OR conditions (values coerced to str);
    scalar values become exact-match MUST conditions. Returns None for an
    empty or None mapping.
    """
    if not filters:
        return None
    parts = []
    for k, v in filters.items():
        if isinstance(v, (list, tuple, set)):
            parts.append(_filter_any(k, [str(x) for x in v]))
        else:
            parts.append(rest.Filter(must=[rest.FieldCondition(key=k, match=rest.MatchValue(value=v))]))
    return _merge_filters(*parts)
|
||||
|
||||
def search_chunks_by_vector(client: QdrantClient, prefix: str, vector: List[float], top: int = 10, filters: Optional[Dict[str, Any]] = None) -> List[Tuple[str, float, dict]]:
    """Vector search over the <prefix>_chunks collection.

    Args:
        client: Qdrant client.
        prefix: Collection prefix.
        vector: Query embedding.
        top: Maximum number of hits.
        filters: Optional {field: value | [values]} payload filter
            (see _filter_from_dict).

    Returns:
        List of (point_id, score, payload) tuples; vectors are not returned.
    """
    _, chunks_col, _ = _names(prefix)
    flt = _filter_from_dict(filters)
    res = client.search(collection_name=chunks_col, query_vector=vector, limit=top, with_payload=True, with_vectors=False, query_filter=flt)
    out: List[Tuple[str, float, dict]] = []
    for r in res:
        out.append((str(r.id), float(r.score), dict(r.payload or {})))
    return out
|
||||
|
||||
|
||||
# --- Edge retrieval helper ---
|
||||
|
||||
def get_edges_for_sources(
    client: QdrantClient,
    prefix: str,
    source_ids: Iterable[str],
    edge_types: Optional[Iterable[str]] = None,
    limit: int = 2048,
) -> List[Dict[str, Any]]:
    """Retrieve edge payloads from the <prefix>_edges collection.

    Args:
        client: QdrantClient instance.
        prefix: Mindnet collection prefix (e.g. "mindnet").
        source_ids: Iterable of source_id values (typically chunk_ids or note_ids).
        edge_types: Optional iterable of edge kinds (e.g. ["references", "depends_on"]).
            If None, all kinds are returned.
        limit: Maximum number of edge payloads to return.

    Returns:
        A list of edge payload dicts, e.g.:
        {
            "note_id": "...",
            "chunk_id": "...",
            "kind": "references" | "depends_on" | ...,
            "scope": "chunk",
            "source_id": "...",
            "target_id": "...",
            "rule_id": "...",
            "confidence": 0.7,
            ...
        }
    """
    source_ids = list(source_ids)
    if not source_ids or limit <= 0:
        return []

    # Resolve collection name
    _, _, edges_col = _names(prefix)

    # Build filter: source_id IN source_ids
    src_filter = _filter_any("source_id", [str(s) for s in source_ids])

    # Optional: kind IN edge_types
    kind_filter = None
    if edge_types:
        kind_filter = _filter_any("kind", [str(k) for k in edge_types])

    flt = _merge_filters(src_filter, kind_filter)

    out: List[Dict[str, Any]] = []
    next_page = None
    remaining = int(limit)

    # Use paginated scroll API; we don't need vectors, only payloads.
    while remaining > 0:
        batch_limit = min(256, remaining)
        # FIX: the scroll() call was left unterminated by a botched merge —
        # an unrelated import block had been spliced into its argument list.
        res, next_page = client.scroll(
            collection_name=edges_col,
            scroll_filter=flt,
            limit=batch_limit,
            with_payload=True,
            with_vectors=False,
            offset=next_page,
        )

        if not res:
            break

        for r in res:
            out.append(dict(r.payload or {}))
            remaining -= 1
            if remaining <= 0:
                break

        if next_page is None or remaining <= 0:
            break

    return out
|
||||
# Re-export list for 100% backward compatibility.
__all__ = [
    "points_for_note",
    "points_for_chunks",
    "points_for_edges",
    "upsert_batch",
    "get_edges_for_sources",
    "search_chunks_by_vector",
]
|
||||
43
app/core/registry.py
Normal file
43
app/core/registry.py
Normal file
|
|
@ -0,0 +1,43 @@
|
|||
"""
|
||||
FILE: app/core/registry.py
|
||||
DESCRIPTION: Zentraler Base-Layer für Konfigurations-Loading und Text-Bereinigung.
|
||||
Bricht Zirkelbezüge zwischen Ingestion und LLMService auf.
|
||||
VERSION: 1.0.0
|
||||
"""
|
||||
import os
|
||||
import yaml
|
||||
from typing import Optional, List
|
||||
|
||||
def load_type_registry(custom_path: Optional[str] = None) -> dict:
    """Load types.yaml, which drives the type-specific logic.

    Returns an empty dict when the file is missing, unreadable, or empty
    (best-effort semantics).
    """
    # Settings are imported lazily right here to avoid circular imports.
    from app.config import get_settings

    registry_path = custom_path if custom_path else get_settings().MINDNET_TYPES_FILE
    if not os.path.exists(registry_path):
        return {}
    try:
        with open(registry_path, "r", encoding="utf-8") as handle:
            loaded = yaml.safe_load(handle)
    except Exception:
        return {}
    return loaded if loaded else {}
|
||||
|
||||
def clean_llm_text(text: str, registry: Optional[dict] = None) -> str:
    """
    Strip LLM control tokens (<s>, [OUT], etc.) from *text*.

    Used both for JSON parsing and for chat answers. The pattern list comes
    from the registry's llm_settings.cleanup_patterns (WP-14); when absent,
    a built-in default set applies. Non-string or empty input yields "".
    Note: a falsy registry (None or {}) triggers a fresh registry load.
    """
    if not text or not isinstance(text, str):
        return ""

    fallback_patterns = ["<s>", "</s>", "[OUT]", "[/OUT]"]
    active_registry = registry if registry else load_type_registry()

    # Patterns configured under llm_settings (WP-14), else the defaults.
    patterns: List[str] = active_registry.get("llm_settings", {}).get("cleanup_patterns", fallback_patterns)

    result = text
    for token in patterns:
        result = result.replace(token, "")
    return result.strip()
|
||||
25
app/core/retrieval/__init__.py
Normal file
25
app/core/retrieval/__init__.py
Normal file
|
|
@ -0,0 +1,25 @@
|
|||
"""
|
||||
PACKAGE: app.core.retrieval
|
||||
DESCRIPTION: Zentrale Schnittstelle für Retrieval-Operationen (Vektor- & Graph-Suche).
|
||||
Bündelt Suche und mathematische Scoring-Engine.
|
||||
"""
|
||||
from .retriever import (
|
||||
Retriever,
|
||||
hybrid_retrieve,
|
||||
semantic_retrieve
|
||||
)
|
||||
|
||||
from .retriever_scoring import (
|
||||
get_weights,
|
||||
compute_wp22_score,
|
||||
get_status_multiplier
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"Retriever",
|
||||
"hybrid_retrieve",
|
||||
"semantic_retrieve",
|
||||
"get_weights",
|
||||
"compute_wp22_score",
|
||||
"get_status_multiplier"
|
||||
]
|
||||
312
app/core/retrieval/retriever.py
Normal file
312
app/core/retrieval/retriever.py
Normal file
|
|
@ -0,0 +1,312 @@
|
|||
"""
|
||||
FILE: app/core/retrieval/retriever.py
|
||||
DESCRIPTION: Haupt-Schnittstelle für die Suche. Orchestriert Vektorsuche und Graph-Expansion.
|
||||
Nutzt retriever_scoring.py für die WP-22 Logik.
|
||||
MODULARISIERUNG: Verschoben in das retrieval-Paket für WP-14.
|
||||
VERSION: 0.6.16
|
||||
STATUS: Active
|
||||
DEPENDENCIES: app.config, app.models.dto, app.core.database*, app.core.graph_adapter
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import time
|
||||
import logging
|
||||
from typing import Any, Dict, List, Tuple, Iterable, Optional
|
||||
|
||||
from app.config import get_settings
|
||||
from app.models.dto import (
|
||||
QueryRequest, QueryResponse, QueryHit,
|
||||
Explanation, ScoreBreakdown, Reason, EdgeDTO
|
||||
)
|
||||
|
||||
# MODULARISIERUNG: Neue Import-Pfade für die Datenbank-Ebene
|
||||
import app.core.database.qdrant as qdr
|
||||
import app.core.database.qdrant_points as qp
|
||||
|
||||
import app.services.embeddings_client as ec
|
||||
import app.core.graph_adapter as ga
|
||||
|
||||
# Mathematische Engine importieren (Bleibt vorerst in app.core)
|
||||
from app.core.retriever_scoring import get_weights, compute_wp22_score
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ==============================================================================
|
||||
# 1. CORE HELPERS & CONFIG LOADERS
|
||||
# ==============================================================================
|
||||
|
||||
def _get_client_and_prefix() -> Tuple[Any, str]:
    """Build a Qdrant client from the environment and return it with the collection prefix."""
    config = qdr.QdrantConfig.from_env()
    client = qdr.get_client(config)
    return client, config.prefix
|
||||
|
||||
|
||||
def _get_query_vector(req: QueryRequest) -> List[float]:
    """
    Vectorize the request.

    Prefers an explicitly supplied query_vector; otherwise embeds req.query
    via the embeddings client.
    FIX: contains a try-except block for differing signatures of ec.embed_text.

    Raises:
        ValueError: when neither query text nor a query vector is provided.
    """
    if req.query_vector:
        return list(req.query_vector)
    if not req.query:
        raise ValueError("Kein Text oder Vektor für die Suche angegeben.")

    settings = get_settings()

    try:
        # Attempt the modern interface (WP-03 compatible).
        return ec.embed_text(req.query, model_name=settings.MODEL_NAME)
    except TypeError:
        # Fallback for signatures that do not accept 'model_name' as a keyword.
        logger.debug("ec.embed_text does not accept 'model_name' keyword. Falling back.")
        return ec.embed_text(req.query)
|
||||
|
||||
|
||||
def _semantic_hits(
    client: Any,
    prefix: str,
    vector: List[float],
    top_k: int,
    filters: Optional[Dict] = None
) -> List[Tuple[str, float, Dict[str, Any]]]:
    """Run the vector search through the database points module.

    Results are strictly normalized to (str id, float score, dict payload)
    tuples for downstream stability.
    """
    raw_hits = qp.search_chunks_by_vector(client, prefix, vector, top=top_k, filters=filters)
    normalized: List[Tuple[str, float, Dict[str, Any]]] = []
    for point_id, score, payload in raw_hits:
        normalized.append((str(point_id), float(score), dict(payload or {})))
    return normalized
|
||||
|
||||
# ==============================================================================
|
||||
# 2. EXPLANATION LAYER (DEBUG & VERIFIABILITY)
|
||||
# ==============================================================================
|
||||
|
||||
def _build_explanation(
    semantic_score: float,
    payload: Dict[str, Any],
    scoring_debug: Dict[str, Any],
    subgraph: Optional[ga.Subgraph],
    target_note_id: Optional[str],
    applied_boosts: Optional[Dict[str, float]] = None
) -> Explanation:
    """
    Transform mathematical scores and graph signals into a human-readable explanation.

    Avoids Pydantic ValidationErrors by forcing string conversion on edge fields.

    Args:
        semantic_score: Raw similarity score of the hit.
        payload: Chunk payload (reads retriever_weight, among others).
        scoring_debug: Intermediate values produced by compute_wp22_score
            (base_val, edge_impact_final, cent_impact_final, edge_bonus,
            cent_bonus, status_multiplier, graph_boost_factor).
        subgraph: Optional expanded subgraph used to collect related edges.
        target_note_id: Note id of the hit within the subgraph.
        applied_boosts: Optional {edge_kind: multiplier} injected by the router.
    """
    _, edge_w_cfg, _ = get_weights()
    base_val = scoring_debug["base_val"]

    # 1. Detailed mathematical breakdown of the final score.
    breakdown = ScoreBreakdown(
        semantic_contribution=base_val,
        edge_contribution=base_val * scoring_debug["edge_impact_final"],
        centrality_contribution=base_val * scoring_debug["cent_impact_final"],
        raw_semantic=semantic_score,
        raw_edge_bonus=scoring_debug["edge_bonus"],
        raw_centrality=scoring_debug["cent_bonus"],
        node_weight=float(payload.get("retriever_weight", 1.0)),
        status_multiplier=scoring_debug["status_multiplier"],
        graph_boost_factor=scoring_debug["graph_boost_factor"]
    )

    reasons: List[Reason] = []
    edges_dto: List[EdgeDTO] = []

    # 2. Reasons derived from semantic similarity (thresholds 0.85 / 0.70).
    if semantic_score > 0.85:
        reasons.append(Reason(kind="semantic", message="Sehr hohe textuelle Übereinstimmung.", score_impact=base_val))
    elif semantic_score > 0.70:
        reasons.append(Reason(kind="semantic", message="Inhaltliche Übereinstimmung.", score_impact=base_val))

    # 3. Reasons derived from the type profile (retriever_weight != 1.0).
    type_weight = float(payload.get("retriever_weight", 1.0))
    if type_weight != 1.0:
        msg = "Bevorzugt" if type_weight > 1.0 else "De-priorisiert"
        reasons.append(Reason(kind="type", message=f"{msg} durch Typ-Profil.", score_impact=base_val * (type_weight - 1.0)))

    # 4. Edge processing (graph intelligence): collect incoming and outgoing
    #    edges of the target note, if the subgraph exposes those accessors.
    if subgraph and target_note_id and scoring_debug["edge_bonus"] > 0:
        raw_edges = []
        if hasattr(subgraph, "get_incoming_edges"):
            raw_edges.extend(subgraph.get_incoming_edges(target_note_id) or [])
        if hasattr(subgraph, "get_outgoing_edges"):
            raw_edges.extend(subgraph.get_outgoing_edges(target_note_id) or [])

        for edge in raw_edges:
            # FIX: forced string conversion for Pydantic stability.
            src = str(edge.get("source") or "note_root")
            tgt = str(edge.get("target") or target_note_id or "unknown_target")
            kind = str(edge.get("kind", "related_to"))
            prov = str(edge.get("provenance", "rule"))
            conf = float(edge.get("confidence", 1.0))

            # Direction relative to the target note of this hit.
            direction = "in" if tgt == target_note_id else "out"

            edge_obj = EdgeDTO(
                id=f"{src}->{tgt}:{kind}",
                kind=kind,
                source=src,
                target=tgt,
                weight=conf,
                direction=direction,
                provenance=prov,
                confidence=conf
            )
            edges_dto.append(edge_obj)

        # Phrase the three most confident edges as human-readable reasons.
        top_edges = sorted(edges_dto, key=lambda e: e.confidence, reverse=True)
        for e in top_edges[:3]:
            peer = e.source if e.direction == "in" else e.target
            prov_txt = "Bestätigte" if e.provenance == "explicit" else "KI-basierte"
            boost_txt = f" [Boost x{applied_boosts.get(e.kind)}]" if applied_boosts and e.kind in applied_boosts else ""

            reasons.append(Reason(
                kind="edge",
                message=f"{prov_txt} Kante '{e.kind}'{boost_txt} von/zu '{peer}'.",
                score_impact=edge_w_cfg * e.confidence
            ))

    # Centrality only surfaces as a reason above a small noise threshold.
    if scoring_debug["cent_bonus"] > 0.01:
        reasons.append(Reason(kind="centrality", message="Die Notiz ist ein zentraler Informations-Hub.", score_impact=breakdown.centrality_contribution))

    return Explanation(
        breakdown=breakdown,
        reasons=reasons,
        related_edges=edges_dto if edges_dto else None,
        applied_boosts=applied_boosts
    )
|
||||
|
||||
# ==============================================================================
|
||||
# 3. CORE RETRIEVAL PIPELINE
|
||||
# ==============================================================================
|
||||
|
||||
def _build_hits_from_semantic(
    hits: Iterable[Tuple[str, float, Dict[str, Any]]],
    top_k: int,
    used_mode: str,
    subgraph: ga.Subgraph | None = None,
    explain: bool = False,
    dynamic_edge_boosts: Optional[Dict[str, float]] = None
) -> QueryResponse:
    """Convert raw semantic hits into scored QueryHits.

    Applies the WP-22 scoring engine per hit (optionally enriched with
    edge/centrality bonuses from *subgraph*), sorts by the final score,
    truncates to *top_k*, and optionally attaches an Explanation per hit.

    Returns a QueryResponse carrying the hits, the mode label and the
    elapsed latency in milliseconds.
    """
    t0 = time.time()
    enriched = []

    for pid, semantic_score, payload in hits:
        edge_bonus, cent_bonus = 0.0, 0.0
        target_id = payload.get("note_id")

        # Graph bonuses are best-effort; a failing subgraph lookup must not
        # break the whole result list.
        if subgraph and target_id:
            try:
                edge_bonus = float(subgraph.edge_bonus(target_id))
                cent_bonus = float(subgraph.centrality_bonus(target_id))
            except Exception:
                pass

        # Mathematical scoring via the WP-22 engine.
        debug_data = compute_wp22_score(
            semantic_score, payload, edge_bonus, cent_bonus, dynamic_edge_boosts
        )
        enriched.append((pid, semantic_score, payload, debug_data))

    # Sort by the final mathematical score; always keep at least one hit.
    enriched_sorted = sorted(enriched, key=lambda h: h[3]["total"], reverse=True)
    limited_hits = enriched_sorted[: max(1, top_k)]

    results: List[QueryHit] = []
    for pid, s_score, pl, dbg in limited_hits:
        explanation_obj = None
        if explain:
            explanation_obj = _build_explanation(
                semantic_score=float(s_score),
                payload=pl,
                scoring_debug=dbg,
                subgraph=subgraph,
                target_note_id=pl.get("note_id"),
                applied_boosts=dynamic_edge_boosts
            )

        # Normalize the payload's text field across known key variants.
        text_content = pl.get("page_content") or pl.get("text") or pl.get("content", "[Kein Text]")

        results.append(QueryHit(
            node_id=str(pid),
            note_id=str(pl.get("note_id", "unknown")),
            semantic_score=float(s_score),
            edge_bonus=dbg["edge_bonus"],
            centrality_bonus=dbg["cent_bonus"],
            total_score=dbg["total"],
            source={
                "path": pl.get("path"),
                "section": pl.get("section") or pl.get("section_title"),
                "text": text_content
            },
            payload=pl,
            explanation=explanation_obj
        ))

    return QueryResponse(results=results, used_mode=used_mode, latency_ms=int((time.time() - t0) * 1000))
|
||||
|
||||
|
||||
def hybrid_retrieve(req: QueryRequest) -> QueryResponse:
    """
    Main entry point for hybrid search.

    Combines vector search with graph expansion and WP-22 weighting:
    seed hits come from the vector index, their note ids seed a subgraph
    expansion, and edge weights are adjusted by provenance and intent
    boosts before the final scoring pass.
    """
    client, prefix = _get_client_and_prefix()
    vector = list(req.query_vector) if req.query_vector else _get_query_vector(req)
    top_k = req.top_k or 10

    # 1. Semantic seed search
    hits = _semantic_hits(client, prefix, vector, top_k=top_k, filters=req.filters)

    # 2. Graph expansion configuration
    expand_cfg = req.expand if isinstance(req.expand, dict) else {}
    depth = int(expand_cfg.get("depth", 1))
    boost_edges = getattr(req, "boost_edges", {}) or {}

    subgraph: ga.Subgraph | None = None
    if depth > 0 and hits:
        # Collect start ids for the graph traversal (deduplicated via set).
        seed_ids = list({h[2].get("note_id") for h in hits if h[2].get("note_id")})

        if seed_ids:
            try:
                # Load the subgraph from RAM/DB.
                subgraph = ga.expand(client, prefix, seed_ids, depth=depth, edge_types=expand_cfg.get("edge_types"))

                # --- WP-22: weight edges in the in-memory graph before the
                # bonus computation ---
                if subgraph and hasattr(subgraph, "graph"):
                    for _, _, data in subgraph.graph.edges(data=True):
                        # A. Provenance weighting (WP-22 bonus for origin):
                        # explicit links (1.0) > smart (0.9) > rule (0.7).
                        prov = data.get("provenance", "rule")
                        prov_w = 1.0 if prov == "explicit" else (0.9 if prov == "smart" else 0.7)

                        # B. Intent boost multiplier (injected dynamically by
                        # the router via req.boost_edges).
                        kind = data.get("kind")
                        intent_multiplier = boost_edges.get(kind, 1.0)

                        # Final weight = base * provenance * intent.
                        data["weight"] = data.get("weight", 1.0) * prov_w * intent_multiplier

            except Exception as e:
                # Graph failures degrade gracefully to pure semantic ranking.
                logger.error(f"Graph Expansion failed: {e}")
                subgraph = None

    # 3. Scoring & explanation generation
    return _build_hits_from_semantic(hits, top_k, "hybrid", subgraph, req.explain, boost_edges)
|
||||
|
||||
|
||||
def semantic_retrieve(req: QueryRequest) -> QueryResponse:
    """Plain vector search without graph influence (WP-02 fallback)."""
    client, prefix = _get_client_and_prefix()
    limit = req.top_k or 10
    query_vec = _get_query_vector(req)
    semantic = _semantic_hits(client, prefix, query_vec, limit, req.filters)
    return _build_hits_from_semantic(semantic, limit, "semantic", explain=req.explain)
|
||||
|
||||
|
||||
class Retriever:
    """Async-facing facade over the hybrid retrieval pipeline."""

    async def search(self, request: QueryRequest) -> QueryResponse:
        """Run a hybrid (vector + graph) search for *request*."""
        response = hybrid_retrieve(request)
        return response
|
||||
121
app/core/retrieval/retriever_scoring.py
Normal file
121
app/core/retrieval/retriever_scoring.py
Normal file
|
|
@ -0,0 +1,121 @@
|
|||
"""
|
||||
FILE: app/core/retrieval/retriever_scoring.py
|
||||
DESCRIPTION: Mathematische Kern-Logik für das WP-22 Scoring.
|
||||
Berechnet Relevanz-Scores basierend auf Semantik, Graph-Intelligence und Content Lifecycle.
|
||||
MODULARISIERUNG: Verschoben in das retrieval-Paket für WP-14.
|
||||
VERSION: 1.0.2
|
||||
STATUS: Active
|
||||
DEPENDENCIES: app.config, typing
|
||||
"""
|
||||
import os
|
||||
import logging
|
||||
from functools import lru_cache
|
||||
from typing import Any, Dict, Tuple, Optional
|
||||
|
||||
try:
|
||||
import yaml
|
||||
except ImportError:
|
||||
yaml = None
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@lru_cache
def get_weights() -> Tuple[float, float, float]:
    """
    Return the base weighting (semantic, edge, centrality) from configuration.

    Priority:
        1. config/retriever.yaml (scoring section)
        2. environment variables (RETRIEVER_W_*) via Settings
        3. system defaults (1.0, 0.0, 0.0)

    Cached for the process lifetime via lru_cache — configuration changes
    require a restart to take effect.
    """
    from app.config import get_settings
    settings = get_settings()

    # Load defaults from settings (which themselves read RETRIEVER_W_* env vars).
    sem = float(getattr(settings, "RETRIEVER_W_SEM", 1.0))
    edge = float(getattr(settings, "RETRIEVER_W_EDGE", 0.0))
    cent = float(getattr(settings, "RETRIEVER_W_CENT", 0.0))

    # Optional override via YAML; skipped when PyYAML is unavailable
    # (yaml is None) or the file does not exist.
    config_path = os.getenv("MINDNET_RETRIEVER_CONFIG", "config/retriever.yaml")
    if yaml and os.path.exists(config_path):
        try:
            with open(config_path, "r", encoding="utf-8") as f:
                data = yaml.safe_load(f) or {}
            scoring = data.get("scoring", {})
            sem = float(scoring.get("semantic_weight", sem))
            edge = float(scoring.get("edge_weight", edge))
            cent = float(scoring.get("centrality_weight", cent))
        except Exception as e:
            # Best-effort: a broken YAML falls back to settings/env defaults.
            logger.warning(f"Retriever Configuration could not be fully loaded from {config_path}: {e}")

    return sem, edge, cent
|
||||
|
||||
def get_status_multiplier(payload: Dict[str, Any]) -> float:
    """
    WP-22 A: content-lifecycle multiplier.

    Ranks information by maturity:
      - stable -> 1.2 (reward for verified knowledge)
      - active -> 1.0 (default weighting; also any unknown status)
      - draft  -> 0.5 (penalty for unfinished fragments)
    """
    lifecycle = str(payload.get("status", "active")).lower().strip()
    multipliers = {"stable": 1.2, "draft": 0.5}
    return multipliers.get(lifecycle, 1.0)
|
||||
|
||||
def compute_wp22_score(
|
||||
semantic_score: float,
|
||||
payload: Dict[str, Any],
|
||||
edge_bonus_raw: float = 0.0,
|
||||
cent_bonus_raw: float = 0.0,
|
||||
dynamic_edge_boosts: Optional[Dict[str, float]] = None
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Die zentrale mathematische Scoring-Formel der Mindnet Intelligence.
|
||||
Implementiert das WP-22 Hybrid-Scoring (Semantic * Lifecycle * Graph).
|
||||
|
||||
FORMEL:
|
||||
Score = (Similarity * StatusMult) * (1 + (TypeWeight - 1) + ((EdgeW * EB + CentW * CB) * IntentBoost))
|
||||
|
||||
Returns:
|
||||
Dict mit dem finalen 'total' Score und allen mathematischen Zwischenwerten für den Explanation Layer.
|
||||
"""
|
||||
sem_w, edge_w_cfg, cent_w_cfg = get_weights()
|
||||
status_mult = get_status_multiplier(payload)
|
||||
|
||||
# Retriever Weight (Type Boost aus types.yaml, z.B. 1.1 für Decisions)
|
||||
node_weight = float(payload.get("retriever_weight", 1.0))
|
||||
|
||||
# 1. Berechnung des Base Scores (Semantik gewichtet durch Lifecycle-Status)
|
||||
base_val = float(semantic_score) * status_mult
|
||||
|
||||
# 2. Graph Boost Factor (Teil C: Intent-spezifische Verstärkung)
|
||||
# Erhöht das Gewicht des gesamten Graphen um 50%, wenn ein spezifischer Intent vorliegt.
|
||||
graph_boost_factor = 1.5 if dynamic_edge_boosts and (edge_bonus_raw > 0 or cent_bonus_raw > 0) else 1.0
|
||||
|
||||
# 3. Einzelne Graph-Komponenten berechnen
|
||||
edge_impact_final = (edge_w_cfg * edge_bonus_raw) * graph_boost_factor
|
||||
cent_impact_final = (cent_w_cfg * cent_bonus_raw) * graph_boost_factor
|
||||
|
||||
# 4. Finales Zusammenführen (Merging)
|
||||
# (node_weight - 1.0) sorgt dafür, dass ein Gewicht von 1.0 keinen Einfluss hat (neutral).
|
||||
total = base_val * (1.0 + (node_weight - 1.0) + edge_impact_final + cent_impact_final)
|
||||
|
||||
# Sicherstellen, dass der Score niemals 0 oder negativ ist (Floor)
|
||||
final_score = max(0.0001, float(total))
|
||||
|
||||
return {
|
||||
"total": final_score,
|
||||
"edge_bonus": float(edge_bonus_raw),
|
||||
"cent_bonus": float(cent_bonus_raw),
|
||||
"status_multiplier": status_mult,
|
||||
"graph_boost_factor": graph_boost_factor,
|
||||
"type_impact": node_weight - 1.0,
|
||||
"base_val": base_val,
|
||||
"edge_impact_final": edge_impact_final,
|
||||
"cent_impact_final": cent_impact_final
|
||||
}
|
||||
|
|
@ -1,310 +1,14 @@
|
|||
"""
|
||||
FILE: app/core/retriever.py
|
||||
DESCRIPTION: Haupt-Schnittstelle für die Suche. Orchestriert Vektorsuche und Graph-Expansion.
|
||||
Nutzt retriever_scoring.py für die WP-22 Logik.
|
||||
FIX: TypeError in embed_text (model_name) behoben.
|
||||
FIX: Pydantic ValidationError (Target/Source) behoben.
|
||||
VERSION: 0.6.15 (WP-22 Full & Stable)
|
||||
STATUS: Active
|
||||
DEPENDENCIES: app.config, app.models.dto, app.core.qdrant*, app.core.graph_adapter, app.core.retriever_scoring
|
||||
DESCRIPTION: Proxy-Modul zur Aufrechterhaltung der Abwärtskompatibilität (WP-14).
|
||||
Leitet Retrieval-Anfragen an das neue retrieval-Paket weiter.
|
||||
STATUS: Proxy (Legacy-Support)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import time
|
||||
import logging
|
||||
from typing import Any, Dict, List, Tuple, Iterable, Optional
|
||||
|
||||
from app.config import get_settings
|
||||
from app.models.dto import (
|
||||
QueryRequest, QueryResponse, QueryHit,
|
||||
Explanation, ScoreBreakdown, Reason, EdgeDTO
|
||||
)
|
||||
import app.core.qdrant as qdr
|
||||
import app.core.qdrant_points as qp
|
||||
import app.services.embeddings_client as ec
|
||||
import app.core.graph_adapter as ga
|
||||
|
||||
# Mathematische Engine importieren
|
||||
from app.core.retriever_scoring import get_weights, compute_wp22_score
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ==============================================================================
|
||||
# 1. CORE HELPERS & CONFIG LOADERS
|
||||
# ==============================================================================
|
||||
|
||||
def _get_client_and_prefix() -> Tuple[Any, str]:
|
||||
"""Initialisiert Qdrant Client und lädt Collection-Prefix."""
|
||||
cfg = qdr.QdrantConfig.from_env()
|
||||
return qdr.get_client(cfg), cfg.prefix
|
||||
|
||||
|
||||
def _get_query_vector(req: QueryRequest) -> List[float]:
|
||||
"""
|
||||
Vektorisiert die Anfrage.
|
||||
FIX: Enthält try-except Block für unterschiedliche Signaturen von ec.embed_text.
|
||||
"""
|
||||
if req.query_vector:
|
||||
return list(req.query_vector)
|
||||
if not req.query:
|
||||
raise ValueError("Kein Text oder Vektor für die Suche angegeben.")
|
||||
|
||||
settings = get_settings()
|
||||
|
||||
try:
|
||||
# Versuch mit modernem Interface (WP-03 kompatibel)
|
||||
return ec.embed_text(req.query, model_name=settings.MODEL_NAME)
|
||||
except TypeError:
|
||||
# Fallback für Signaturen, die 'model_name' nicht als Keyword akzeptieren
|
||||
logger.debug("ec.embed_text does not accept 'model_name' keyword. Falling back.")
|
||||
return ec.embed_text(req.query)
|
||||
|
||||
|
||||
def _semantic_hits(
|
||||
client: Any,
|
||||
prefix: str,
|
||||
vector: List[float],
|
||||
top_k: int,
|
||||
filters: Optional[Dict] = None
|
||||
) -> List[Tuple[str, float, Dict[str, Any]]]:
|
||||
"""Führt die Vektorsuche durch und konvertiert Qdrant-Points in ein einheitliches Format."""
|
||||
raw_hits = qp.search_chunks_by_vector(client, prefix, vector, top=top_k, filters=filters)
|
||||
# Strikte Typkonvertierung für Stabilität
|
||||
return [(str(hit[0]), float(hit[1]), dict(hit[2] or {})) for hit in raw_hits]
|
||||
|
||||
# ==============================================================================
|
||||
# 2. EXPLANATION LAYER (DEBUG & VERIFIABILITY)
|
||||
# ==============================================================================
|
||||
|
||||
def _build_explanation(
|
||||
semantic_score: float,
|
||||
payload: Dict[str, Any],
|
||||
scoring_debug: Dict[str, Any],
|
||||
subgraph: Optional[ga.Subgraph],
|
||||
target_note_id: Optional[str],
|
||||
applied_boosts: Optional[Dict[str, float]] = None
|
||||
) -> Explanation:
|
||||
"""
|
||||
Transformiert mathematische Scores und Graph-Signale in eine menschenlesbare Erklärung.
|
||||
Behebt Pydantic ValidationErrors durch explizite String-Sicherung.
|
||||
"""
|
||||
_, edge_w_cfg, _ = get_weights()
|
||||
base_val = scoring_debug["base_val"]
|
||||
|
||||
# 1. Detaillierter mathematischer Breakdown
|
||||
breakdown = ScoreBreakdown(
|
||||
semantic_contribution=base_val,
|
||||
edge_contribution=base_val * scoring_debug["edge_impact_final"],
|
||||
centrality_contribution=base_val * scoring_debug["cent_impact_final"],
|
||||
raw_semantic=semantic_score,
|
||||
raw_edge_bonus=scoring_debug["edge_bonus"],
|
||||
raw_centrality=scoring_debug["cent_bonus"],
|
||||
node_weight=float(payload.get("retriever_weight", 1.0)),
|
||||
status_multiplier=scoring_debug["status_multiplier"],
|
||||
graph_boost_factor=scoring_debug["graph_boost_factor"]
|
||||
from .retrieval.retriever import (
|
||||
Retriever,
|
||||
hybrid_retrieve,
|
||||
semantic_retrieve
|
||||
)
|
||||
|
||||
reasons: List[Reason] = []
|
||||
edges_dto: List[EdgeDTO] = []
|
||||
|
||||
# 2. Gründe für Semantik hinzufügen
|
||||
if semantic_score > 0.85:
|
||||
reasons.append(Reason(kind="semantic", message="Sehr hohe textuelle Übereinstimmung.", score_impact=base_val))
|
||||
elif semantic_score > 0.70:
|
||||
reasons.append(Reason(kind="semantic", message="Inhaltliche Übereinstimmung.", score_impact=base_val))
|
||||
|
||||
# 3. Gründe für Typ und Lifecycle
|
||||
type_weight = float(payload.get("retriever_weight", 1.0))
|
||||
if type_weight != 1.0:
|
||||
msg = "Bevorzugt" if type_weight > 1.0 else "De-priorisiert"
|
||||
reasons.append(Reason(kind="type", message=f"{msg} durch Typ-Profil.", score_impact=base_val * (type_weight - 1.0)))
|
||||
|
||||
# 4. Kanten-Verarbeitung (Graph-Intelligence)
|
||||
if subgraph and target_note_id and scoring_debug["edge_bonus"] > 0:
|
||||
raw_edges = []
|
||||
if hasattr(subgraph, "get_incoming_edges"):
|
||||
raw_edges.extend(subgraph.get_incoming_edges(target_note_id) or [])
|
||||
if hasattr(subgraph, "get_outgoing_edges"):
|
||||
raw_edges.extend(subgraph.get_outgoing_edges(target_note_id) or [])
|
||||
|
||||
for edge in raw_edges:
|
||||
# FIX: Zwingende String-Konvertierung für Pydantic-Stabilität
|
||||
src = str(edge.get("source") or "note_root")
|
||||
tgt = str(edge.get("target") or target_note_id or "unknown_target")
|
||||
kind = str(edge.get("kind", "related_to"))
|
||||
prov = str(edge.get("provenance", "rule"))
|
||||
conf = float(edge.get("confidence", 1.0))
|
||||
|
||||
direction = "in" if tgt == target_note_id else "out"
|
||||
|
||||
edge_obj = EdgeDTO(
|
||||
id=f"{src}->{tgt}:{kind}",
|
||||
kind=kind,
|
||||
source=src,
|
||||
target=tgt,
|
||||
weight=conf,
|
||||
direction=direction,
|
||||
provenance=prov,
|
||||
confidence=conf
|
||||
)
|
||||
edges_dto.append(edge_obj)
|
||||
|
||||
# Die 3 wichtigsten Kanten als Begründung formulieren
|
||||
top_edges = sorted(edges_dto, key=lambda e: e.confidence, reverse=True)
|
||||
for e in top_edges[:3]:
|
||||
peer = e.source if e.direction == "in" else e.target
|
||||
prov_txt = "Bestätigte" if e.provenance == "explicit" else "KI-basierte"
|
||||
boost_txt = f" [Boost x{applied_boosts.get(e.kind)}]" if applied_boosts and e.kind in applied_boosts else ""
|
||||
|
||||
reasons.append(Reason(
|
||||
kind="edge",
|
||||
message=f"{prov_txt} Kante '{e.kind}'{boost_txt} von/zu '{peer}'.",
|
||||
score_impact=edge_w_cfg * e.confidence
|
||||
))
|
||||
|
||||
if scoring_debug["cent_bonus"] > 0.01:
|
||||
reasons.append(Reason(kind="centrality", message="Die Notiz ist ein zentraler Informations-Hub.", score_impact=breakdown.centrality_contribution))
|
||||
|
||||
return Explanation(
|
||||
breakdown=breakdown,
|
||||
reasons=reasons,
|
||||
related_edges=edges_dto if edges_dto else None,
|
||||
applied_boosts=applied_boosts
|
||||
)
|
||||
|
||||
# ==============================================================================
|
||||
# 3. CORE RETRIEVAL PIPELINE
|
||||
# ==============================================================================
|
||||
|
||||
def _build_hits_from_semantic(
|
||||
hits: Iterable[Tuple[str, float, Dict[str, Any]]],
|
||||
top_k: int,
|
||||
used_mode: str,
|
||||
subgraph: ga.Subgraph | None = None,
|
||||
explain: bool = False,
|
||||
dynamic_edge_boosts: Dict[str, float] = None
|
||||
) -> QueryResponse:
|
||||
"""Wandelt semantische Roh-Treffer in hochgeladene, bewertete QueryHits um."""
|
||||
t0 = time.time()
|
||||
enriched = []
|
||||
|
||||
for pid, semantic_score, payload in hits:
|
||||
edge_bonus, cent_bonus = 0.0, 0.0
|
||||
target_id = payload.get("note_id")
|
||||
|
||||
if subgraph and target_id:
|
||||
try:
|
||||
edge_bonus = float(subgraph.edge_bonus(target_id))
|
||||
cent_bonus = float(subgraph.centrality_bonus(target_id))
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Mathematisches Scoring via WP-22 Engine
|
||||
debug_data = compute_wp22_score(
|
||||
semantic_score, payload, edge_bonus, cent_bonus, dynamic_edge_boosts
|
||||
)
|
||||
enriched.append((pid, semantic_score, payload, debug_data))
|
||||
|
||||
# Sortierung nach finalem mathematischen Score
|
||||
enriched_sorted = sorted(enriched, key=lambda h: h[3]["total"], reverse=True)
|
||||
limited_hits = enriched_sorted[: max(1, top_k)]
|
||||
|
||||
results: List[QueryHit] = []
|
||||
for pid, s_score, pl, dbg in limited_hits:
|
||||
explanation_obj = None
|
||||
if explain:
|
||||
explanation_obj = _build_explanation(
|
||||
semantic_score=float(s_score),
|
||||
payload=pl,
|
||||
scoring_debug=dbg,
|
||||
subgraph=subgraph,
|
||||
target_note_id=pl.get("note_id"),
|
||||
applied_boosts=dynamic_edge_boosts
|
||||
)
|
||||
|
||||
# Payload Text-Feld normalisieren
|
||||
text_content = pl.get("page_content") or pl.get("text") or pl.get("content", "[Kein Text]")
|
||||
|
||||
results.append(QueryHit(
|
||||
node_id=str(pid),
|
||||
note_id=str(pl.get("note_id", "unknown")),
|
||||
semantic_score=float(s_score),
|
||||
edge_bonus=dbg["edge_bonus"],
|
||||
centrality_bonus=dbg["cent_bonus"],
|
||||
total_score=dbg["total"],
|
||||
source={
|
||||
"path": pl.get("path"),
|
||||
"section": pl.get("section") or pl.get("section_title"),
|
||||
"text": text_content
|
||||
},
|
||||
payload=pl,
|
||||
explanation=explanation_obj
|
||||
))
|
||||
|
||||
return QueryResponse(results=results, used_mode=used_mode, latency_ms=int((time.time() - t0) * 1000))
|
||||
|
||||
|
||||
def hybrid_retrieve(req: QueryRequest) -> QueryResponse:
|
||||
"""
|
||||
Die Haupt-Einstiegsfunktion für die hybride Suche.
|
||||
Kombiniert Vektorsuche mit Graph-Expansion, Provenance-Weighting und Intent-Boosting.
|
||||
"""
|
||||
client, prefix = _get_client_and_prefix()
|
||||
vector = list(req.query_vector) if req.query_vector else _get_query_vector(req)
|
||||
top_k = req.top_k or 10
|
||||
|
||||
# 1. Semantische Seed-Suche
|
||||
hits = _semantic_hits(client, prefix, vector, top_k=top_k, filters=req.filters)
|
||||
|
||||
# 2. Graph Expansion Konfiguration
|
||||
expand_cfg = req.expand if isinstance(req.expand, dict) else {}
|
||||
depth = int(expand_cfg.get("depth", 1))
|
||||
boost_edges = getattr(req, "boost_edges", {}) or {}
|
||||
|
||||
subgraph: ga.Subgraph | None = None
|
||||
if depth > 0 and hits:
|
||||
# Start-IDs für den Graph-Traversal sammeln
|
||||
seed_ids = list({h[2].get("note_id") for h in hits if h[2].get("note_id")})
|
||||
|
||||
if seed_ids:
|
||||
try:
|
||||
# Subgraph aus RAM/DB laden
|
||||
subgraph = ga.expand(client, prefix, seed_ids, depth=depth, edge_types=expand_cfg.get("edge_types"))
|
||||
|
||||
# --- WP-22: Kanten-Gewichtung im RAM-Graphen vor Bonus-Berechnung ---
|
||||
if subgraph and hasattr(subgraph, "graph"):
|
||||
for _, _, data in subgraph.graph.edges(data=True):
|
||||
# A. Provenance Weighting (WP-22 Bonus für Herkunft)
|
||||
prov = data.get("provenance", "rule")
|
||||
# Belohnung: Explizite Links (1.0) > Smart (0.9) > Rule (0.7)
|
||||
prov_w = 1.0 if prov == "explicit" else (0.9 if prov == "smart" else 0.7)
|
||||
|
||||
# B. Intent Boost Multiplikator (Vom Router dynamisch injiziert)
|
||||
kind = data.get("kind")
|
||||
intent_multiplier = boost_edges.get(kind, 1.0)
|
||||
|
||||
# Finales Gewicht setzen (Basis * Provenance * Intent)
|
||||
data["weight"] = data.get("weight", 1.0) * prov_w * intent_multiplier
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Graph Expansion failed: {e}")
|
||||
subgraph = None
|
||||
|
||||
# 3. Scoring & Explanation Generierung
|
||||
return _build_hits_from_semantic(hits, top_k, "hybrid", subgraph, req.explain, boost_edges)
|
||||
|
||||
|
||||
def semantic_retrieve(req: QueryRequest) -> QueryResponse:
|
||||
"""Standard Vektorsuche ohne Graph-Einfluss (WP-02 Fallback)."""
|
||||
client, prefix = _get_client_and_prefix()
|
||||
vector = _get_query_vector(req)
|
||||
hits = _semantic_hits(client, prefix, vector, req.top_k or 10, req.filters)
|
||||
return _build_hits_from_semantic(hits, req.top_k or 10, "semantic", explain=req.explain)
|
||||
|
||||
|
||||
class Retriever:
|
||||
"""Schnittstelle für die asynchrone Suche."""
|
||||
async def search(self, request: QueryRequest) -> QueryResponse:
|
||||
"""Führt eine hybride Suche aus."""
|
||||
return hybrid_retrieve(request)
|
||||
# Re-Export für 100% Kompatibilität
|
||||
__all__ = ["Retriever", "hybrid_retrieve", "semantic_retrieve"]
|
||||
|
|
@ -1,120 +1,18 @@
|
|||
"""
|
||||
FILE: app/core/retriever_scoring.py
|
||||
DESCRIPTION: Mathematische Kern-Logik für das WP-22 Scoring.
|
||||
Berechnet Relevanz-Scores basierend auf Semantik, Graph-Intelligence und Content Lifecycle.
|
||||
VERSION: 1.0.1 (WP-22 Full Math Engine)
|
||||
STATUS: Active
|
||||
DEPENDENCIES: app.config, typing
|
||||
DESCRIPTION: Proxy-Modul zur Aufrechterhaltung der Abwärtskompatibilität (WP-14).
|
||||
Leitet Scoring-Berechnungen an das neue retrieval-Paket weiter.
|
||||
STATUS: Proxy (Legacy-Support)
|
||||
"""
|
||||
import os
|
||||
import logging
|
||||
from functools import lru_cache
|
||||
from typing import Any, Dict, Tuple, Optional
|
||||
from .retrieval.retriever_scoring import (
|
||||
get_weights,
|
||||
compute_wp22_score,
|
||||
get_status_multiplier
|
||||
)
|
||||
|
||||
try:
|
||||
import yaml
|
||||
except ImportError:
|
||||
yaml = None
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@lru_cache
|
||||
def get_weights() -> Tuple[float, float, float]:
|
||||
"""
|
||||
Liefert die Basis-Gewichtung (semantic, edge, centrality) aus der Konfiguration.
|
||||
Priorität:
|
||||
1. config/retriever.yaml (Scoring-Sektion)
|
||||
2. Umgebungsvariablen (RETRIEVER_W_*)
|
||||
3. System-Defaults (1.0, 0.0, 0.0)
|
||||
"""
|
||||
from app.config import get_settings
|
||||
settings = get_settings()
|
||||
|
||||
# Defaults aus Settings laden
|
||||
sem = float(getattr(settings, "RETRIEVER_W_SEM", 1.0))
|
||||
edge = float(getattr(settings, "RETRIEVER_W_EDGE", 0.0))
|
||||
cent = float(getattr(settings, "RETRIEVER_W_CENT", 0.0))
|
||||
|
||||
# Optionaler Override via YAML
|
||||
config_path = os.getenv("MINDNET_RETRIEVER_CONFIG", "config/retriever.yaml")
|
||||
if yaml and os.path.exists(config_path):
|
||||
try:
|
||||
with open(config_path, "r", encoding="utf-8") as f:
|
||||
data = yaml.safe_load(f) or {}
|
||||
scoring = data.get("scoring", {})
|
||||
sem = float(scoring.get("semantic_weight", sem))
|
||||
edge = float(scoring.get("edge_weight", edge))
|
||||
cent = float(scoring.get("centrality_weight", cent))
|
||||
except Exception as e:
|
||||
logger.warning(f"Retriever Configuration could not be fully loaded from {config_path}: {e}")
|
||||
|
||||
return sem, edge, cent
|
||||
|
||||
def get_status_multiplier(payload: Dict[str, Any]) -> float:
|
||||
"""
|
||||
WP-22 A: Content Lifecycle Multiplier.
|
||||
Steuert das Ranking basierend auf dem Reifegrad der Information.
|
||||
|
||||
- stable: 1.2 (Belohnung für verifiziertes Wissen)
|
||||
- active: 1.0 (Standard-Gewichtung)
|
||||
- draft: 0.5 (Bestrafung für unfertige Fragmente)
|
||||
"""
|
||||
status = str(payload.get("status", "active")).lower().strip()
|
||||
if status == "stable":
|
||||
return 1.2
|
||||
if status == "draft":
|
||||
return 0.5
|
||||
return 1.0
|
||||
|
||||
def compute_wp22_score(
|
||||
semantic_score: float,
|
||||
payload: Dict[str, Any],
|
||||
edge_bonus_raw: float = 0.0,
|
||||
cent_bonus_raw: float = 0.0,
|
||||
dynamic_edge_boosts: Optional[Dict[str, float]] = None
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Die zentrale mathematische Scoring-Formel der Mindnet Intelligence.
|
||||
Implementiert das WP-22 Hybrid-Scoring (Semantic * Lifecycle * Graph).
|
||||
|
||||
FORMEL:
|
||||
Score = (Similarity * StatusMult) * (1 + (TypeWeight - 1) + ((EdgeW * EB + CentW * CB) * IntentBoost))
|
||||
|
||||
Returns:
|
||||
Dict mit dem finalen 'total' Score und allen mathematischen Zwischenwerten für den Explanation Layer.
|
||||
"""
|
||||
sem_w, edge_w_cfg, cent_w_cfg = get_weights()
|
||||
status_mult = get_status_multiplier(payload)
|
||||
|
||||
# Retriever Weight (Type Boost aus types.yaml, z.B. 1.1 für Decisions)
|
||||
node_weight = float(payload.get("retriever_weight", 1.0))
|
||||
|
||||
# 1. Berechnung des Base Scores (Semantik gewichtet durch Lifecycle-Status)
|
||||
base_val = float(semantic_score) * status_mult
|
||||
|
||||
# 2. Graph Boost Factor (Teil C: Intent-spezifische Verstärkung)
|
||||
# Erhöht das Gewicht des gesamten Graphen um 50%, wenn ein spezifischer Intent vorliegt.
|
||||
graph_boost_factor = 1.5 if dynamic_edge_boosts and (edge_bonus_raw > 0 or cent_bonus_raw > 0) else 1.0
|
||||
|
||||
# 3. Einzelne Graph-Komponenten berechnen
|
||||
edge_impact_final = (edge_w_cfg * edge_bonus_raw) * graph_boost_factor
|
||||
cent_impact_final = (cent_w_cfg * cent_bonus_raw) * graph_boost_factor
|
||||
|
||||
# 4. Finales Zusammenführen (Merging)
|
||||
# (node_weight - 1.0) sorgt dafür, dass ein Gewicht von 1.0 keinen Einfluss hat (neutral).
|
||||
total = base_val * (1.0 + (node_weight - 1.0) + edge_impact_final + cent_impact_final)
|
||||
|
||||
# Sicherstellen, dass der Score niemals 0 oder negativ ist (Floor)
|
||||
final_score = max(0.0001, float(total))
|
||||
|
||||
return {
|
||||
"total": final_score,
|
||||
"edge_bonus": float(edge_bonus_raw),
|
||||
"cent_bonus": float(cent_bonus_raw),
|
||||
"status_multiplier": status_mult,
|
||||
"graph_boost_factor": graph_boost_factor,
|
||||
"type_impact": node_weight - 1.0,
|
||||
"base_val": base_val,
|
||||
"edge_impact_final": edge_impact_final,
|
||||
"cent_impact_final": cent_impact_final
|
||||
}
|
||||
# Re-Export für 100% Kompatibilität
|
||||
__all__ = [
|
||||
"get_weights",
|
||||
"compute_wp22_score",
|
||||
"get_status_multiplier"
|
||||
]
|
||||
|
|
@ -1,11 +1,14 @@
|
|||
"""
|
||||
FILE: app/services/edge_registry.py
|
||||
DESCRIPTION: Single Source of Truth für Kanten-Typen mit dynamischem Reload.
|
||||
WP-15b: Erweiterte Provenance-Prüfung für die Candidate-Validation.
|
||||
Sichert die Graph-Integrität durch strikte Trennung von System- und Inhaltskanten.
|
||||
WP-22: Fix für absolute Pfade außerhalb des Vaults (Prod-Dictionary).
|
||||
WP-20: Synchronisation mit zentralen Settings (v0.6.2).
|
||||
VERSION: 0.7.5
|
||||
VERSION: 0.8.0
|
||||
STATUS: Active
|
||||
DEPENDENCIES: re, os, json, logging, time, app.config
|
||||
LAST_ANALYSIS: 2025-12-26
|
||||
"""
|
||||
import re
|
||||
import os
|
||||
|
|
@ -19,7 +22,12 @@ from app.config import get_settings
|
|||
logger = logging.getLogger(__name__)
|
||||
|
||||
class EdgeRegistry:
|
||||
"""
|
||||
Zentraler Verwalter für das Kanten-Vokabular.
|
||||
Implementiert das Singleton-Pattern für konsistente Validierung über alle Services.
|
||||
"""
|
||||
_instance = None
|
||||
# System-Kanten, die nicht durch User oder KI gesetzt werden dürfen
|
||||
FORBIDDEN_SYSTEM_EDGES = {"next", "prev", "belongs_to"}
|
||||
|
||||
def __new__(cls, *args, **kwargs):
|
||||
|
|
@ -51,7 +59,7 @@ class EdgeRegistry:
|
|||
def ensure_latest(self):
|
||||
"""
|
||||
Prüft den Zeitstempel der Vokabular-Datei und lädt bei Bedarf neu.
|
||||
Verhindert den AttributeError in der Ingestion-Pipeline.
|
||||
Verhindert Inkonsistenzen bei Laufzeit-Updates des Dictionaries.
|
||||
"""
|
||||
if not os.path.exists(self.full_vocab_path):
|
||||
logger.error(f"!!! [EDGE-REGISTRY ERROR] File not found: {self.full_vocab_path} !!!")
|
||||
|
|
@ -66,7 +74,10 @@ class EdgeRegistry:
|
|||
logger.error(f"!!! [EDGE-REGISTRY] Error checking file time: {e}")
|
||||
|
||||
def _load_vocabulary(self):
|
||||
"""Parst das Markdown-Wörterbuch und baut die Canonical-Map auf."""
|
||||
"""
|
||||
Parst das Markdown-Wörterbuch und baut die Canonical-Map auf.
|
||||
Erkennt Tabellen-Strukturen und extrahiert fettgedruckte System-Typen.
|
||||
"""
|
||||
self.canonical_map.clear()
|
||||
self.valid_types.clear()
|
||||
|
||||
|
|
@ -101,8 +112,8 @@ class EdgeRegistry:
|
|||
|
||||
def resolve(self, edge_type: str, provenance: str = "explicit", context: dict = None) -> str:
|
||||
"""
|
||||
Validiert einen Kanten-Typ gegen das Vokabular.
|
||||
Loggt unbekannte Typen für die spätere manuelle Pflege.
|
||||
WP-15b: Validiert einen Kanten-Typ gegen das Vokabular und prüft Berechtigungen.
|
||||
Sichert, dass nur strukturelle Prozesse System-Kanten setzen dürfen.
|
||||
"""
|
||||
self.ensure_latest()
|
||||
if not edge_type:
|
||||
|
|
@ -112,20 +123,23 @@ class EdgeRegistry:
|
|||
clean_type = edge_type.lower().strip().replace(" ", "_").replace("-", "_")
|
||||
ctx = context or {}
|
||||
|
||||
# System-Kanten dürfen nicht manuell vergeben werden
|
||||
if provenance == "explicit" and clean_type in self.FORBIDDEN_SYSTEM_EDGES:
|
||||
self._log_issue(clean_type, "forbidden_system_usage", ctx)
|
||||
# WP-15b: System-Kanten dürfen weder manuell noch durch KI/Vererbung gesetzt werden.
|
||||
# Nur Provenienz 'structure' (interne Prozesse) ist autorisiert.
|
||||
# Wir blockieren hier alle Provenienzen außer 'structure'.
|
||||
restricted_provenance = ["explicit", "semantic_ai", "inherited", "global_pool", "rule"]
|
||||
if provenance in restricted_provenance and clean_type in self.FORBIDDEN_SYSTEM_EDGES:
|
||||
self._log_issue(clean_type, f"forbidden_usage_by_{provenance}", ctx)
|
||||
return "related_to"
|
||||
|
||||
# System-Kanten sind nur bei struktureller Provenienz erlaubt
|
||||
# System-Kanten sind NUR bei struktureller Provenienz erlaubt
|
||||
if provenance == "structure" and clean_type in self.FORBIDDEN_SYSTEM_EDGES:
|
||||
return clean_type
|
||||
|
||||
# Mapping auf kanonischen Namen
|
||||
# Mapping auf kanonischen Namen (Alias-Auflösung)
|
||||
if clean_type in self.canonical_map:
|
||||
return self.canonical_map[clean_type]
|
||||
|
||||
# Fallback und Logging
|
||||
# Fallback und Logging unbekannter Typen für Admin-Review
|
||||
self._log_issue(clean_type, "unknown_type", ctx)
|
||||
return clean_type
|
||||
|
||||
|
|
@ -139,12 +153,13 @@ class EdgeRegistry:
|
|||
"error": error_kind,
|
||||
"file": ctx.get("file", "unknown"),
|
||||
"line": ctx.get("line", "unknown"),
|
||||
"note_id": ctx.get("note_id", "unknown")
|
||||
"note_id": ctx.get("note_id", "unknown"),
|
||||
"provenance": ctx.get("provenance", "unknown")
|
||||
}
|
||||
with open(self.unknown_log_path, "a", encoding="utf-8") as f:
|
||||
f.write(json.dumps(entry) + "\n")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Singleton Export
|
||||
# Singleton Export für systemweiten Zugriff
|
||||
registry = EdgeRegistry()
|
||||
|
|
@ -6,12 +6,11 @@ DESCRIPTION: Hybrid-Client für Ollama, Google GenAI (Gemini) und OpenRouter.
|
|||
WP-20 Fix: Bulletproof Prompt-Auflösung für format() Aufrufe.
|
||||
WP-22/JSON: Optionales JSON-Schema + strict (für OpenRouter structured outputs).
|
||||
FIX: Intelligente Rate-Limit Erkennung (429 Handling), v1-API Sync & Timeouts.
|
||||
VERSION: 3.3.7
|
||||
VERSION: 3.3.9
|
||||
STATUS: Active
|
||||
FIX:
|
||||
- Implementiert striktes max_retries Handling für alle Provider (v.a. für Chat-Stabilität).
|
||||
- Synchronisiert Rate-Limit Retries mit dem max_retries Parameter.
|
||||
- Optimiert Logging für sofortige Fehlererkennung.
|
||||
- Importiert clean_llm_text von app.core.registry zur Vermeidung von Circular Imports.
|
||||
- Wendet clean_llm_text auf Text-Antworten in generate_raw_response an.
|
||||
"""
|
||||
import httpx
|
||||
import yaml
|
||||
|
|
@ -25,6 +24,9 @@ from pathlib import Path
|
|||
from typing import Optional, Dict, Any, Literal
|
||||
from app.config import get_settings
|
||||
|
||||
# ENTSCHEIDENDER FIX: Import der neutralen Bereinigungs-Logik (WP-14)
|
||||
from app.core.registry import clean_llm_text
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
|
|
@ -119,22 +121,27 @@ class LLMService:
|
|||
) -> str:
|
||||
"""
|
||||
Haupteinstiegspunkt für LLM-Anfragen mit Priorisierung.
|
||||
Wendet die Bereinigung auf Text-Antworten an.
|
||||
"""
|
||||
target_provider = provider or self.settings.MINDNET_LLM_PROVIDER
|
||||
|
||||
if priority == "background":
|
||||
async with LLMService._background_semaphore:
|
||||
return await self._dispatch(
|
||||
res = await self._dispatch(
|
||||
target_provider, prompt, system, force_json,
|
||||
max_retries, base_delay, model_override,
|
||||
json_schema, json_schema_name, strict_json_schema
|
||||
)
|
||||
# WP-14 Fix: Bereinige Text-Antworten vor Rückgabe
|
||||
return clean_llm_text(res) if not force_json else res
|
||||
|
||||
return await self._dispatch(
|
||||
res = await self._dispatch(
|
||||
target_provider, prompt, system, force_json,
|
||||
max_retries, base_delay, model_override,
|
||||
json_schema, json_schema_name, strict_json_schema
|
||||
)
|
||||
# WP-14 Fix: Bereinige Text-Antworten vor Rückgabe
|
||||
return clean_llm_text(res) if not force_json else res
|
||||
|
||||
async def _dispatch(
|
||||
self,
|
||||
|
|
@ -297,6 +304,7 @@ class LLMService:
|
|||
final_prompt = rag_template.format(context_str=context_str, query=query)
|
||||
|
||||
# RAG Aufrufe im Chat nutzen nun standardmäßig max_retries=2 (überschreibbar)
|
||||
# Durch den Aufruf von generate_raw_response wird die Bereinigung automatisch angewendet.
|
||||
return await self.generate_raw_response(
|
||||
final_prompt,
|
||||
system=system_prompt,
|
||||
|
|
|
|||
|
|
@ -1,199 +0,0 @@
|
|||
"""
|
||||
FILE: app/services/semantic_analyzer.py
|
||||
DESCRIPTION: KI-gestützte Kanten-Validierung. Nutzt LLM (Background-Priority), um Kanten präzise einem Chunk zuzuordnen.
|
||||
WP-20 Fix: Volle Kompatibilität mit der provider-basierten Routing-Logik (OpenRouter Primary).
|
||||
WP-22: Integration von valid_types zur Halluzinations-Vermeidung.
|
||||
FIX: Mistral-sicheres JSON-Parsing (<s> & [OUT] Handling) und 100% Logik-Erhalt.
|
||||
VERSION: 2.2.6
|
||||
STATUS: Active
|
||||
DEPENDENCIES: app.services.llm_service, app.services.edge_registry, json, logging, re
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from typing import List, Optional, Any
|
||||
from dataclasses import dataclass
|
||||
|
||||
# Importe
|
||||
from app.services.llm_service import LLMService
|
||||
# WP-22: Registry für Vokabular-Erzwingung
|
||||
from app.services.edge_registry import registry as edge_registry
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class SemanticAnalyzer:
|
||||
def __init__(self):
|
||||
self.llm = LLMService()
|
||||
|
||||
def _is_valid_edge_string(self, edge_str: str) -> bool:
|
||||
"""
|
||||
Prüft, ob ein String eine valide Kante im Format 'kind:target' ist.
|
||||
Verhindert, dass LLM-Geschwätz als Kante durchrutscht.
|
||||
"""
|
||||
if not isinstance(edge_str, str) or ":" not in edge_str:
|
||||
return False
|
||||
|
||||
parts = edge_str.split(":", 1)
|
||||
kind = parts[0].strip()
|
||||
target = parts[1].strip()
|
||||
|
||||
# Regel 1: Ein 'kind' (Beziehungstyp) darf keine Leerzeichen enthalten.
|
||||
if " " in kind:
|
||||
return False
|
||||
|
||||
# Regel 2: Plausible Länge für den Typ (Vermeidet Sätze als Typ)
|
||||
if len(kind) > 40 or len(kind) < 2:
|
||||
return False
|
||||
|
||||
# Regel 3: Target darf nicht leer sein
|
||||
if not target:
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
def _extract_json_safely(self, text: str) -> Any:
|
||||
"""
|
||||
Extrahiert JSON-Daten und bereinigt LLM-Steuerzeichen (Mistral/Llama).
|
||||
Implementiert robuste Recovery-Logik für Cloud-Provider.
|
||||
"""
|
||||
if not text:
|
||||
return []
|
||||
|
||||
# 1. Entferne Mistral/Llama Steuerzeichen und Tags
|
||||
clean = text.replace("<s>", "").replace("</s>", "")
|
||||
clean = clean.replace("[OUT]", "").replace("[/OUT]", "")
|
||||
clean = clean.strip()
|
||||
|
||||
# 2. Suche nach Markdown JSON-Blöcken
|
||||
match = re.search(r"```(?:json)?\s*(.*?)\s*```", clean, re.DOTALL)
|
||||
payload = match.group(1) if match else clean
|
||||
|
||||
try:
|
||||
return json.loads(payload.strip())
|
||||
except json.JSONDecodeError:
|
||||
# 3. Recovery: Suche nach der ersten [ und letzten ]
|
||||
start = payload.find('[')
|
||||
end = payload.rfind(']') + 1
|
||||
if start != -1 and end > start:
|
||||
try:
|
||||
return json.loads(payload[start:end])
|
||||
except: pass
|
||||
|
||||
# 4. Zweite Recovery: Suche nach der ersten { und letzten }
|
||||
start_obj = payload.find('{')
|
||||
end_obj = payload.rfind('}') + 1
|
||||
if start_obj != -1 and end_obj > start_obj:
|
||||
try:
|
||||
return json.loads(payload[start_obj:end_obj])
|
||||
except: pass
|
||||
return []
|
||||
|
||||
async def assign_edges_to_chunk(self, chunk_text: str, all_edges: List[str], note_type: str) -> List[str]:
    """
    Send a chunk and a list of candidate edges to the LLM.

    The LLM filters out which of the candidate edges are relevant for this
    chunk. WP-20: primarily uses the configured provider (e.g. OpenRouter).

    Args:
        chunk_text: Plain text of the chunk (truncated to 6000 chars for the prompt).
        all_edges: Candidate edge strings, expected in 'kind:target' format.
        note_type: Logical note type; accepted for interface parity but not
            read inside this method.

    Returns:
        The subset of candidates the LLM confirmed, re-validated against the
        'kind:target' format; [] on empty input or any failure.
    """
    if not all_edges:
        return []

    # 1. Resolve provider and model dynamically from settings.
    provider = self.llm.settings.MINDNET_LLM_PROVIDER
    model = self.llm.settings.OPENROUTER_MODEL if provider == "openrouter" else self.llm.settings.GEMINI_MODEL

    # 2. Load the prompt (provider-specific via get_prompt).
    prompt_template = self.llm.get_prompt("edge_allocation_template", provider)

    if not prompt_template or not isinstance(prompt_template, str):
        # Config did not yield a usable template string; fall back to a
        # minimal built-in recovery template so ingestion keeps working.
        logger.warning("⚠️ [SemanticAnalyzer] Prompt 'edge_allocation_template' ungültig. Nutze Recovery-Template.")
        prompt_template = (
            "TASK: Wähle aus den Kandidaten die relevanten Kanten für den Text.\n"
            "TEXT: {chunk_text}\n"
            "KANDIDATEN: {edge_list}\n"
            "OUTPUT: JSON Liste von Strings [\"kind:target\"]."
        )

    # 3. Prepare template data (vocabulary check against the edge registry).
    edge_registry.ensure_latest()
    valid_types_str = ", ".join(sorted(list(edge_registry.valid_types)))
    edges_str = "\n".join([f"- {e}" for e in all_edges])

    logger.debug(f"🔍 [SemanticAnalyzer] Request: {len(chunk_text)} chars Text, {len(all_edges)} Candidates.")

    # 4. Fill the prompt with an explicit format check (no shortcut): a
    #    malformed template (unknown placeholder) must not crash ingestion.
    try:
        # Cap the text at a reasonable length for the context window.
        final_prompt = prompt_template.format(
            chunk_text=chunk_text[:6000],
            edge_list=edges_str,
            valid_types=valid_types_str
        )
    except Exception as format_err:
        logger.error(f"❌ [SemanticAnalyzer] Prompt Formatting failed: {format_err}")
        return []

    try:
        # 5. LLM call with background priority & semaphore control.
        response_json = await self.llm.generate_raw_response(
            prompt=final_prompt,
            force_json=True,
            max_retries=3,
            base_delay=2.0,
            priority="background",
            provider=provider,
            model_override=model
        )

        # 6. Mistral-safe JSON parsing via helper.
        data = self._extract_json_safely(response_json)

        if not data:
            return []

        # 7. Robust normalisation (list vs dict recovery): some models wrap
        #    the list in an object instead of returning a bare array.
        raw_candidates = []
        if isinstance(data, list):
            raw_candidates = data
        elif isinstance(data, dict):
            logger.info(f"ℹ️ [SemanticAnalyzer] LLM returned dict, trying recovery.")
            for key in ["edges", "results", "kanten", "matches"]:
                if key in data and isinstance(data[key], list):
                    raw_candidates.extend(data[key])
                    break
            # Still empty: fall back to treating key/value pairs as edges.
            if not raw_candidates:
                for k, v in data.items():
                    if isinstance(v, str): raw_candidates.append(f"{k}:{v}")
                    elif isinstance(v, list):
                        for target in v:
                            if isinstance(target, str): raw_candidates.append(f"{k}:{target}")

        # 8. Strict validation against the 'kind:target' edge format.
        valid_edges = []
        for e in raw_candidates:
            e_str = str(e).strip()
            if self._is_valid_edge_string(e_str):
                valid_edges.append(e_str)
            else:
                logger.debug(f" [SemanticAnalyzer] Rejected invalid edge format: '{e_str}'")

        if valid_edges:
            logger.info(f"✅ [SemanticAnalyzer] Assigned {len(valid_edges)} edges to chunk.")
            return valid_edges

    except Exception as e:
        logger.error(f"💥 [SemanticAnalyzer] Critical error during analysis: {e}", exc_info=True)
    # Reached when the LLM confirmed nothing or an error was logged above.
    return []
|
||||
|
||||
async def close(self):
    """Shut down the underlying LLM client, if one is attached."""
    client = self.llm
    if not client:
        return
    await client.close()
|
||||
|
||||
# Lazily-created module-level singleton.
_analyzer_instance = None

def get_semantic_analyzer():
    """Return the shared SemanticAnalyzer, constructing it on first use."""
    global _analyzer_instance
    instance = _analyzer_instance
    if instance is None:
        instance = SemanticAnalyzer()
        _analyzer_instance = instance
    return instance
|
||||
|
|
@ -1,6 +1,7 @@
|
|||
# config/prompts.yaml — Final V2.5.5 (OpenRouter Hardening)
|
||||
# config/prompts.yaml — Final V2.6.0 (WP-15b Candidate-Validation)
|
||||
# WP-20: Optimierte Cloud-Templates zur Unterdrückung von Modell-Geschwätz.
|
||||
# FIX: Explizite Verbote für Einleitungstexte zur Vermeidung von JSON-Parsing-Fehlern.
|
||||
# WP-15b: Integration der binären edge_validation für den Two-Pass Workflow.
|
||||
# OLLAMA: UNVERÄNDERT laut Benutzeranweisung.
|
||||
|
||||
system_prompt: |
|
||||
|
|
@ -215,7 +216,7 @@ edge_extraction:
|
|||
4. Antworte AUSSCHLIESSLICH in validem JSON als Liste von Objekten.
|
||||
|
||||
BEISPIEL:
|
||||
[[ {{"to": "Ziel-Konzept", "kind": "beziehungs_typ"}} ]]
|
||||
[[ {{"to": "Ziel-Konzept", "kind": "beziehungs_typ"}} ]]
|
||||
|
||||
TEXT:
|
||||
"""
|
||||
|
|
@ -227,13 +228,46 @@ edge_extraction:
|
|||
Analysiere '{note_id}'. Extrahiere semantische Beziehungen.
|
||||
ERLAUBTE TYPEN: {valid_types}
|
||||
TEXT: {text}
|
||||
OUTPUT: STRIKT JSON-Array von Objekten: [[{{"to":"Ziel","kind":"typ"}}]]. Kein Text davor/danach. Wenn nichts: [].
|
||||
OUTPUT: STRIKT JSON-Array von Objekten: [[{{"to":"Ziel","kind":"typ"}}]]. Kein Text davor/danach. Wenn nichts: [].
|
||||
openrouter: |
|
||||
TASK: Extrahiere semantische Relationen für '{note_id}'.
|
||||
ERLAUBTE TYPEN: {valid_types}
|
||||
TEXT: {text}
|
||||
ANWEISUNG: Antworte AUSSCHLIESSLICH mit einem JSON-Array von Objekten.
|
||||
FORMAT: [[{{"to":"Ziel-Begriff","kind":"typ"}}]]
|
||||
FORMAT: [[{{"to":"Ziel-Begriff","kind":"typ"}}]]
|
||||
STRIKTES VERBOT: Schreibe keine Einleitung, keine Analyse und keine Erklärungen.
|
||||
Wenn keine Relationen existieren, antworte NUR mit: []
|
||||
OUTPUT:
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# 8. WP-15b: EDGE VALIDATION (Intent: VALIDATE)
|
||||
# ---------------------------------------------------------
|
||||
edge_validation:
|
||||
gemini: |
|
||||
Bewerte die semantische Validität dieser Verbindung im Wissensgraph.
|
||||
|
||||
KONTEXT DER QUELLE (Chunk):
|
||||
"{chunk_text}"
|
||||
|
||||
ZIEL-NOTIZ: "{target_title}"
|
||||
ZIEL-BESCHREIBUNG (Zusammenfassung):
|
||||
"{target_summary}"
|
||||
|
||||
GEPLANTE RELATION: "{edge_kind}"
|
||||
|
||||
FRAGE: Bestätigt der Kontext der Quelle die Beziehung '{edge_kind}' zum Ziel?
|
||||
REGEL: Antworte NUR mit 'YES' oder 'NO'. Keine Erklärungen oder Smalltalk.
|
||||
openrouter: |
|
||||
Verify semantic relation for graph construction.
|
||||
Source Context: {chunk_text}
|
||||
Target Note: {target_title}
|
||||
Target Summary: {target_summary}
|
||||
Proposed Relation: {edge_kind}
|
||||
Instruction: Does the source context support this relation to the target?
|
||||
Result: Respond ONLY with 'YES' or 'NO'.
|
||||
ollama: |
|
||||
Bewerte die semantische Korrektheit dieser Verbindung.
|
||||
QUELLE: {chunk_text}
|
||||
ZIEL: {target_title} ({target_summary})
|
||||
BEZIEHUNG: {edge_kind}
|
||||
Ist diese Verbindung valide? Antworte NUR mit YES oder NO.
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
version: 2.6.0 # Final WP-15 Config: Smart Edges & Strict/Soft Chunking
|
||||
version: 2.7.0 # WP-14 Update: Dynamisierung der Ingestion-Pipeline
|
||||
|
||||
# ==============================================================================
|
||||
# 1. CHUNKING PROFILES
|
||||
|
|
@ -76,7 +76,32 @@ defaults:
|
|||
edge_defaults: []
|
||||
|
||||
# ==============================================================================
|
||||
# 3. TYPE DEFINITIONS
|
||||
# 3. INGESTION SETTINGS (WP-14 Dynamization)
|
||||
# ==============================================================================
|
||||
# Steuert, welche Notizen verarbeitet werden und wie Fallbacks aussehen.
|
||||
ingestion_settings:
|
||||
# Liste der Status-Werte, die beim Import ignoriert werden sollen.
|
||||
ignore_statuses: ["system", "template", "archive", "hidden"]
|
||||
# Standard-Typ, falls kein Typ im Frontmatter angegeben ist.
|
||||
default_note_type: "concept"
|
||||
|
||||
# ==============================================================================
|
||||
# 4. SUMMARY & SCAN SETTINGS
|
||||
# ==============================================================================
|
||||
# Steuert die Tiefe des Pre-Scans für den Context-Cache.
|
||||
summary_settings:
|
||||
max_summary_length: 500
|
||||
pre_scan_depth: 600
|
||||
|
||||
# ==============================================================================
|
||||
# 5. LLM SETTINGS
|
||||
# ==============================================================================
|
||||
# Steuerzeichen und Patterns zur Bereinigung der LLM-Antworten.
|
||||
llm_settings:
|
||||
cleanup_patterns: ["<s>", "</s>", "[OUT]", "[/OUT]", "```json", "```"]
|
||||
|
||||
# ==============================================================================
|
||||
# 6. TYPE DEFINITIONS
|
||||
# ==============================================================================
|
||||
|
||||
types:
|
||||
|
|
|
|||
|
|
@ -2,13 +2,13 @@
|
|||
doc_type: glossary
|
||||
audience: all
|
||||
status: active
|
||||
version: 2.8.0
|
||||
context: "Zentrales Glossar für Mindnet v2.8. Enthält Definitionen zu Hybrid-Cloud Resilienz, WP-76 Quoten-Steuerung und Mistral-safe Parsing."
|
||||
version: 2.8.1
|
||||
context: "Zentrales Glossar für Mindnet v2.8. Enthält Definitionen zu Hybrid-Cloud Resilienz, WP-14 Modularisierung, WP-15b Two-Pass Ingestion und Mistral-safe Parsing."
|
||||
---
|
||||
|
||||
# Mindnet Glossar
|
||||
|
||||
**Quellen:** `01_edge_vocabulary.md`, `llm_service.py`, `ingestion.py`, `edge_registry.py`
|
||||
**Quellen:** `01_edge_vocabulary.md`, `llm_service.py`, `ingestion.py`, `edge_registry.py`, `registry.py`, `qdrant.py`
|
||||
|
||||
## Kern-Entitäten
|
||||
|
||||
|
|
@ -21,11 +21,13 @@ context: "Zentrales Glossar für Mindnet v2.8. Enthält Definitionen zu Hybrid-C
|
|||
## Komponenten
|
||||
|
||||
* **Edge Registry:** Der zentrale Dienst (SSOT), der Kanten-Typen validiert und Aliase in kanonische Typen auflöst. Nutzt `01_edge_vocabulary.md` als Basis.
|
||||
* **LLM Service:** Der Hybrid-Client (v3.3.6), der Anfragen zwischen OpenRouter, Google Gemini und lokalem Ollama routet. Verwaltet Cloud-Timeouts und Quoten-Management.
|
||||
* **Retriever:** Besteht in v2.7+ aus der Orchestrierung (`retriever.py`) und der mathematischen Scoring-Engine (`retriever_scoring.py`).
|
||||
* **LLM Service:** Der Hybrid-Client (v3.3.6), der Anfragen zwischen OpenRouter, Google Gemini und lokalem Ollama routet. Verwaltet Cloud-Timeouts und Quoten-Management. Nutzt zur Text-Bereinigung nun die neutrale `registry.py`, um Circular Imports zu vermeiden.
|
||||
* **Retriever:** Besteht in v2.7+ aus der Orchestrierung (`retriever.py`) und der mathematischen Scoring-Engine (`retriever_scoring.py`). Seit WP-14 im Paket `app.core.retrieval` gekapselt.
|
||||
* **Decision Engine:** Teil des Routers, der Intents erkennt und entsprechende **Boost-Faktoren** für das Retrieval injiziert.
|
||||
* **Traffic Control:** Verwaltet Prioritäten und drosselt Hintergrund-Tasks (z.B. Smart Edges) mittels Semaphoren und Timeouts (45s) zur Vermeidung von System-Hangs.
|
||||
* **Unknown Edges Log:** Die Datei `unknown_edges.jsonl`, in der das System Kanten-Typen protokolliert, die nicht im Dictionary gefunden wurden.
|
||||
* **Database Package (WP-14):** Zentralisiertes Infrastruktur-Paket (`app.core.database`), das den Qdrant-Client (`qdrant.py`) und das Point-Mapping (`qdrant_points.py`) verwaltet.
|
||||
* **LocalBatchCache (WP-15b):** Ein globaler In-Memory-Index, der während des Pass 1 Scans aufgebaut wird und Metadaten (IDs, Titel, Summaries) aller Notizen für die Kantenvalidierung bereithält.
|
||||
|
||||
## Konzepte & Features
|
||||
|
||||
|
|
@ -40,5 +42,9 @@ context: "Zentrales Glossar für Mindnet v2.8. Enthält Definitionen zu Hybrid-C
|
|||
* `explicit`: Vom Mensch gesetzt (Prio 1).
|
||||
* `semantic_ai`: Von der KI im Turbo-Mode extrahiert und validiert (Prio 2).
|
||||
* `structure`: Durch System-Regeln/Matrix erzeugt (Prio 3).
|
||||
* **Smart Edge Allocation:** KI-Verfahren zur Relevanzprüfung von Links für spezifische Textabschnitte.
|
||||
* **Smart Edge Allocation (WP-15b):** KI-Verfahren zur Relevanzprüfung von Links für spezifische Textabschnitte. Validiert Kandidaten semantisch gegen das Ziel im LocalBatchCache.
|
||||
* **Matrix Logic:** Bestimmung des Kanten-Typs basierend auf Quell- und Ziel-Entität (z.B. Erfahrung -> Wert = `based_on`).
|
||||
* **Two-Pass Workflow (WP-15b):** Optimiertes Ingestion-Verfahren:
|
||||
* **Pass 1 (Pre-Scan):** Schnelles Scannen aller Dateien zur Befüllung des LocalBatchCache.
|
||||
* **Pass 2 (Semantic Processing):** Tiefenverarbeitung (Chunking, Embedding, Validierung) nur für geänderte Dateien.
|
||||
* **Circular Import Registry (WP-14):** Entkopplung von Kern-Logik (wie Textbereinigung) in eine neutrale `registry.py`, um Abhängigkeitsschleifen zwischen Diensten und Ingestion-Utilities zu verhindern.
|
||||
|
|
@ -1,19 +1,19 @@
|
|||
---
|
||||
doc_type: technical_reference
|
||||
audience: developer, admin
|
||||
scope: configuration, env, registry, scoring, resilience
|
||||
scope: configuration, env, registry, scoring, resilience, modularization
|
||||
status: active
|
||||
version: 2.8.0
|
||||
context: "Umfassende Referenztabellen für Umgebungsvariablen (inkl. Hybrid-Cloud & WP-76), YAML-Konfigurationen und die Edge Registry Struktur."
|
||||
version: 2.9.1
|
||||
context: "Umfassende Referenztabellen für Umgebungsvariablen (inkl. Hybrid-Cloud & WP-76), YAML-Konfigurationen und die Edge Registry Struktur unter Berücksichtigung von WP-14."
|
||||
---
|
||||
|
||||
# Konfigurations-Referenz
|
||||
|
||||
Dieses Dokument beschreibt alle Steuerungsdateien von Mindnet. In der Version 2.8 wurde die Konfiguration professionalisiert, um die Edge Registry, dynamische Scoring-Parameter (Lifecycle & Intent) sowie die neue Hybrid-Cloud-Resilienz zu unterstützen.
|
||||
Dieses Dokument beschreibt alle Steuerungsdateien von Mindnet. In der Version 2.9.1 wurde die Konfiguration professionalisiert, um die Edge Registry, dynamische Scoring-Parameter (Lifecycle & Intent), die neue Hybrid-Cloud-Resilienz sowie die modulare Datenbank-Infrastruktur (WP-14) zu unterstützen.
|
||||
|
||||
## 1. Environment Variablen (`.env`)
|
||||
|
||||
Diese Variablen steuern die Infrastruktur, Pfade und globale Timeouts.
|
||||
Diese Variablen steuern die Infrastruktur, Pfade und globale Timeouts. Seit der Modularisierung in WP-14 unterstützen sie zudem die explizite Benennung von Vektoren für verschiedene Collections.
|
||||
|
||||
| Variable | Default | Beschreibung |
|
||||
| :--- | :--- | :--- |
|
||||
|
|
@ -21,6 +21,10 @@ Diese Variablen steuern die Infrastruktur, Pfade und globale Timeouts.
|
|||
| `QDRANT_API_KEY` | *(leer)* | Optionaler Key für Absicherung. |
|
||||
| `COLLECTION_PREFIX` | `mindnet` | Namensraum für Collections (erzeugt `{prefix}_notes` etc). |
|
||||
| `VECTOR_DIM` | `768` | **Muss 768 sein** (für Nomic Embeddings). |
|
||||
| `MINDNET_VECTOR_NAME` | `default` | **Neu (WP-14):** Basis-Vektorname für Named Vectors Support. |
|
||||
| `NOTES_VECTOR_NAME` | *(leer)* | **Neu (WP-14):** Spezifischer Vektorname für die Notes-Collection (Override). |
|
||||
| `CHUNKS_VECTOR_NAME` | *(leer)* | **Neu (WP-14):** Spezifischer Vektorname für die Chunks-Collection (Override). |
|
||||
| `EDGES_VECTOR_NAME` | *(leer)* | **Neu (WP-14):** Spezifischer Vektorname für die Edges-Collection (Override). |
|
||||
| `MINDNET_VOCAB_PATH` | *(Pfad)* | **Neu (WP-22):** Absoluter Pfad zur `01_edge_vocabulary.md`. Definiert den Ort des Dictionarys. |
|
||||
| `MINDNET_VAULT_ROOT` | `./vault` | Basis-Pfad für Datei-Operationen. |
|
||||
| `MINDNET_TYPES_FILE` | `config/types.yaml` | Pfad zur Typ-Registry. |
|
||||
|
|
@ -38,23 +42,25 @@ Diese Variablen steuern die Infrastruktur, Pfade und globale Timeouts.
|
|||
| `MINDNET_LLM_MODEL` | `phi3:mini` | Name des lokalen Chat-Modells (Ollama). |
|
||||
| `MINDNET_EMBEDDING_MODEL` | `nomic-embed-text` | Name des Embedding-Modells (Ollama). |
|
||||
| `MINDNET_OLLAMA_URL` | `http://127.0.0.1:11434`| URL zum lokalen LLM-Server. |
|
||||
| `MAX_OLLAMA_CHARS` | `10000`| Maximale Länge des Kontext-Strings, der an das lokale Modell gesendet wird. Verhindert Batch-Decoding-Fehler bei sehr großen Notiz-Historien. |
|
||||
| `MAX_OLLAMA_CHARS` | `10000`| Maximale Länge des Kontext-Strings, der an das lokale Modell gesendet wird. |
|
||||
| `MINDNET_LLM_TIMEOUT` | `300.0` | Timeout in Sekunden für LLM-Anfragen. |
|
||||
| `MINDNET_API_TIMEOUT` | `300.0` | Globales API-Timeout für das Frontend. |
|
||||
| `MINDNET_LL_BACKGROUND_LIMIT`| `2` | **Traffic Control:** Max. parallele Hintergrund-Tasks (Semaphore). |
|
||||
| `MINDNET_CHANGE_DETECTION_MODE` | `full` | `full` (Text + Meta) oder `body` (nur Text). |
|
||||
| `MINDNET_DEFAULT_RETRIEVER_WEIGHT` | `1.0` | **Neu (WP-22):** Systemweiter Standard für das Retriever-Gewicht einer Notiz. |
|
||||
|
||||
---
|
||||
|
||||
## 2. Typ-Registry (`types.yaml`)
|
||||
|
||||
Steuert das Import-Verhalten, Chunking und die Kanten-Logik pro Typ.
|
||||
Steuert das Import-Verhalten, Chunking und die Kanten-Logik pro Typ. Die Auflösung erfolgt zentral über die modularisierte Registry in `app.core.registry`.
|
||||
|
||||
### 2.1 Konfigurations-Hierarchie (Override-Logik)
|
||||
Seit Version 2.7.0 gilt für `chunking_profile` und `retriever_weight` folgende Priorität:
|
||||
1. **Frontmatter (Höchste Prio):** Ein Wert direkt in der Markdown-Datei überschreibt alles.
|
||||
2. **Type Config:** Der Standardwert für den `type` aus `types.yaml`.
|
||||
3. **Global Default:** Fallback aus `defaults` in `types.yaml`.
|
||||
3. **Ingestion Settings (Neu WP-14):** Globale Konfiguration wie `default_chunk_profile` innerhalb des `ingestion_settings` Blocks.
|
||||
4. **Global Default:** Fallback aus `defaults` in `types.yaml`.
|
||||
|
||||
|
||||
## 2.2 Typ-Referenz & Stream-Logik (Vollständige Liste: 28 Typen)
|
||||
|
|
@ -113,7 +119,7 @@ Dieser Stream speichert deine Erlebnisse, Fakten und externes Wissen als Belege.
|
|||
|
||||
## 3. Retriever Config (`retriever.yaml`)
|
||||
|
||||
Steuert die Gewichtung der Scoring-Formel und die neuen Lifecycle-Modifier.
|
||||
Steuert die Gewichtung der Scoring-Formel und die neuen Lifecycle-Modifier. Seit WP-14 ist die mathematische Engine im Paket `app.core.retrieval` gekapselt.
|
||||
|
||||
```yaml
|
||||
version: 1.2
|
||||
|
|
@ -140,43 +146,36 @@ lifecycle_weights:
|
|||
system: 0.0 # Hard Skip via Ingestion
|
||||
|
||||
# Die nachfolgenden Werte überschreiben die Defaults aus app/core/retriever_config.
|
||||
# Wenn neue Kantentypen, z.B. durch Referenzierung innerhalb einer md-Datei im vault anders gewichtet werden sollen, dann muss hier die Konfiguration erfolgen
|
||||
edge_types:
|
||||
# --- KATEGORIE 1: LOGIK-BOOSTS (Relevanz-Treiber) ---
|
||||
# Diese Kanten haben die Kraft, das semantische Ranking aktiv umzugestalten.
|
||||
blocks: 1.6 # Kritisch: Risiken/Blocker müssen sofort sichtbar sein.
|
||||
solves: 1.5 # Zielführend: Lösungen sind primäre Suchziele.
|
||||
depends_on: 1.4 # Logisch: Harte fachliche Abhängigkeit.
|
||||
resulted_in: 1.4 # Kausal: Ergebnisse und unmittelbare Konsequenzen.
|
||||
followed_by: 1.3 # Sequenziell (User): Bewusst gesteuerte Wissenspfade.
|
||||
caused_by: 1.2 # Kausal: Ursachen-Bezug (Basis für Intent-Boost).
|
||||
preceded_by: 1.1 # Sequenziell (User): Rückwärts-Bezug in Logik-Ketten.
|
||||
blocks: 1.6
|
||||
solves: 1.5
|
||||
depends_on: 1.4
|
||||
resulted_in: 1.4
|
||||
followed_by: 1.3
|
||||
caused_by: 1.2
|
||||
preceded_by: 1.1
|
||||
|
||||
# --- KATEGORIE 2: QUALITATIVER KONTEXT (Stabilitäts-Stützen) ---
|
||||
# Diese Kanten liefern wichtigen Kontext, ohne das Ergebnis zu verfälschen.
|
||||
guides: 1.1 # Qualitativ: Prinzipien oder Werte leiten das Thema.
|
||||
part_of: 1.1 # Strukturell: Zieht übergeordnete Kontexte (Parents) mit hoch.
|
||||
based_on: 0.8 # Fundament: Bezug auf Basis-Werte (kalibriert auf Safe-Retrieval).
|
||||
derived_from: 0.6 # Historisch: Dokumentiert die Herkunft von Wissen.
|
||||
uses: 0.6 # Instrumentell: Genutzte Werkzeuge, Methoden oder Ressourcen.
|
||||
guides: 1.1
|
||||
part_of: 1.1
|
||||
based_on: 0.8
|
||||
derived_from: 0.6
|
||||
uses: 0.6
|
||||
|
||||
# --- KATEGORIE 3: THEMATISCHE NÄHE (Ähnlichkeits-Signal) ---
|
||||
# Diese Werte verhindern den "Drift" in fachfremde Bereiche.
|
||||
similar_to: 0.4 # Analytisch: Thematische Nähe (oft KI-generiert).
|
||||
similar_to: 0.4
|
||||
|
||||
# --- KATEGORIE 4: SYSTEM-NUDGES (Technische Struktur) ---
|
||||
# Reine Orientierungshilfen für das System; fast kein Einfluss auf das Ranking.
|
||||
belongs_to: 0.2 # System: Verknüpft Chunks mit der Note (Metadaten-Träger).
|
||||
next: 0.1 # System: Technische Lesereihenfolge der Absätze.
|
||||
prev: 0.1 # System: Technische Lesereihenfolge der Absätze.
|
||||
belongs_to: 0.2
|
||||
next: 0.1
|
||||
prev: 0.1
|
||||
|
||||
# --- KATEGORIE 5: WEICHE ASSOZIATIONEN (Rausch-Unterdrückung) ---
|
||||
# Verhindert, dass lose Verknüpfungen das Ergebnis "verwässern".
|
||||
references: 0.1 # Assoziativ: Einfacher Querverweis oder Erwähnung.
|
||||
related_to: 0.05 # Minimal: Schwächste thematische Verbindung.
|
||||
references: 0.1
|
||||
related_to: 0.05
|
||||
```
|
||||
|
||||
|
||||
---
|
||||
|
||||
## 4. Edge Typen & Registry Referenz
|
||||
|
|
@ -185,7 +184,7 @@ Die `EdgeRegistry` ist die **Single Source of Truth** für das Vokabular.
|
|||
|
||||
### 4.1 Dateistruktur & Speicherort
|
||||
Die Registry erwartet eine Markdown-Datei an folgendem Ort:
|
||||
* **Standard-Pfad:** `<MINDNET_VAULT_ROOT>/01_User_Manual/01_edge_vocabulary.md`.
|
||||
* **Standard-Pfad:** `<MINDNET_VAULT_ROOT>/_system/dictionary/edge_vocabulary.md`.
|
||||
* **Custom-Pfad:** Kann via `.env` Variable `MINDNET_VOCAB_PATH` überschrieben werden.
|
||||
|
||||
### 4.2 Aufbau des Dictionaries (Markdown-Schema)
|
||||
|
|
@ -199,15 +198,10 @@ Die Datei muss eine Markdown-Tabelle enthalten, die vom Regex-Parser gelesen wir
|
|||
| **`caused_by`** | `ausgelöst_durch`, `wegen` | Kausalität: A löst B aus. |
|
||||
```
|
||||
|
||||
**Regeln für die Spalten:**
|
||||
1. **Canonical:** Muss fett gedruckt sein (`**type**` oder `**`type`**`). Dies ist der Wert, der in der DB landet.
|
||||
2. **Aliasse:** Kommagetrennte Liste von Synonymen. Diese werden beim Import automatisch zum Canonical aufgelöst.
|
||||
3. **Beschreibung:** Rein informativ für den Nutzer.
|
||||
|
||||
### 4.3 Verfügbare Kanten-Typen (System-Standard)
|
||||
|
||||
| System-Typ (Canonical) | Erlaubte Aliasse (User) | Beschreibung |
|
||||
| :--------------------- | :--------------------------------------------------- | :-------------------------------------- |
|
||||
| :--- | :--- | :--- |
|
||||
| **`caused_by`** | `ausgelöst_durch`, `wegen`, `ursache_ist` | Kausalität: A löst B aus. |
|
||||
| **`derived_from`** | `abgeleitet_von`, `quelle`, `inspiriert_durch` | Herkunft: A stammt von B. |
|
||||
| **`based_on`** | `basiert_auf`, `fundament`, `grundlage` | Fundament: B baut auf A auf. |
|
||||
|
|
@ -224,12 +218,10 @@ Die Datei muss eine Markdown-Tabelle enthalten, die vom Regex-Parser gelesen wir
|
|||
| **`references`** | *(Kein Alias)* | Standard-Verweis (Fallback). |
|
||||
| **`resulted_in`** | `ergebnis`, `resultat`, `erzeugt` | Herkunft: A erzeugt Ergebnis B |
|
||||
|
||||
**ACHTUNG!** Die Kantentypen
|
||||
**belongs_to**, **next** und **prev** dürfen nicht vom Nutzer gesetzt werden
|
||||
**ACHTUNG!** Die Kantentypen **belongs_to**, **next** und **prev** dürfen nicht vom Nutzer gesetzt werden.
|
||||
|
||||
---
|
||||
|
||||
|
||||
## 5. Decision Engine (`decision_engine.yaml`)
|
||||
|
||||
Die Decision Engine fungiert als zentraler Orchestrator für die Intent-Erkennung und das dynamische Retrieval-Routing. Sie bestimmt, wie das System auf eine Nutzeranfrage reagiert, welche Informationstypen bevorzugt werden und wie der Wissensgraph für die spezifische Situation verformt wird.
|
||||
|
|
@ -323,7 +315,4 @@ strategies:
|
|||
BITTE WÄGE FAKTEN GEGEN FOLGENDE WERTE, PRINZIPIEN UND ZIELE AB:
|
||||
|
||||
# 3. Empathie / "Ich"-Modus
|
||||
|
||||
```
|
||||
|
||||
*Richtwert für Kanten-Boosts: 0.1 (Abwertung) bis 3.0+ (Dominanz gegenüber Text-Match).*
|
||||
|
|
@ -3,15 +3,15 @@ doc_type: technical_reference
|
|||
audience: developer, architect
|
||||
scope: database, qdrant, schema
|
||||
status: active
|
||||
version: 2.7.0
|
||||
context: "Exakte Definition der Datenmodelle (Payloads) in Qdrant und Index-Anforderungen."
|
||||
version: 2.8.0
|
||||
context: "Exakte Definition der Datenmodelle (Payloads) in Qdrant und Index-Anforderungen. Berücksichtigt WP-14 Modularisierung und WP-15b Multi-Hashes."
|
||||
---
|
||||
|
||||
# Technisches Datenmodell (Qdrant Schema)
|
||||
|
||||
## 1. Collections & Namenskonvention
|
||||
|
||||
Mindnet speichert Daten in drei getrennten Qdrant-Collections. Der Prefix ist via ENV `COLLECTION_PREFIX` konfigurierbar (Default: `mindnet`).
|
||||
Mindnet speichert Daten in drei getrennten Qdrant-Collections. Der Prefix ist via ENV `COLLECTION_PREFIX` konfigurierbar (Default: `mindnet`). Die Auflösung erfolgt zentral über `app.core.database.collection_names`.
|
||||
|
||||
Das System nutzt folgende drei Collections:
|
||||
* `{prefix}_notes`: Metadaten der Dateien.
|
||||
|
|
@ -28,9 +28,10 @@ Repräsentiert die Metadaten einer Markdown-Datei (1:1 Beziehung).
|
|||
|
||||
```json
|
||||
{
|
||||
"note_id": "string (keyword)", // UUIDv5 (deterministisch) oder Slug
|
||||
"note_id": "string (keyword)", // UUIDv5 (deterministisch via NAMESPACE_URL)
|
||||
"title": "string (text)", // Titel aus Frontmatter
|
||||
"type": "string (keyword)", // Logischer Typ (z.B. 'project', 'concept')
|
||||
"status": "string (keyword)", // Lifecycle: 'stable', 'active', 'draft', 'system' (WP-22)
|
||||
"retriever_weight": "float", // Effektive Wichtigkeit (Frontmatter > Type > Default)
|
||||
"chunk_profile": "string", // Effektives Profil (Frontmatter > Type > Default)
|
||||
"edge_defaults": ["string"], // Liste der aktiven Default-Kanten
|
||||
|
|
@ -40,7 +41,7 @@ Repräsentiert die Metadaten einer Markdown-Datei (1:1 Beziehung).
|
|||
"updated": "integer", // Timestamp (File Modification Time)
|
||||
"fulltext": "string (no-index)", // Gesamter Text (nur für Recovery/Export)
|
||||
|
||||
// NEU in v2.7: Multi-Hash für flexible Change Detection
|
||||
// Multi-Hash für flexible Change Detection (WP-15b)
|
||||
"hashes": {
|
||||
"body:parsed:canonical": "string", // Hash nur über den Text-Body
|
||||
"full:parsed:canonical": "string" // Hash über Text + Metadaten (Tags, Title, Config)
|
||||
|
|
@ -52,6 +53,7 @@ Repräsentiert die Metadaten einer Markdown-Datei (1:1 Beziehung).
|
|||
Es müssen Payload-Indizes für folgende Felder existieren:
|
||||
* `note_id`
|
||||
* `type`
|
||||
* `status`
|
||||
* `tags`
|
||||
|
||||
---
|
||||
|
|
@ -61,7 +63,7 @@ Es müssen Payload-Indizes für folgende Felder existieren:
|
|||
Die atomare Sucheinheit. Enthält den Vektor.
|
||||
|
||||
**Vektor-Konfiguration:**
|
||||
* Modell: `nomic-embed-text`
|
||||
* Modell: `nomic-embed-text` (via Ollama oder Cloud)
|
||||
* Dimension: **768**
|
||||
* Metrik: Cosine Similarity
|
||||
|
||||
|
|
@ -69,7 +71,7 @@ Die atomare Sucheinheit. Enthält den Vektor.
|
|||
|
||||
```json
|
||||
{
|
||||
"chunk_id": "string (keyword)", // Format: {note_id}#c{index} (z.B. 'abc-123#c01')
|
||||
"chunk_id": "string (keyword)", // Format: UUIDv5 aus {note_id}#c{index}
|
||||
"note_id": "string (keyword)", // Foreign Key zur Note
|
||||
"type": "string (keyword)", // Kopie aus Note (Denormalisiert für Filterung)
|
||||
"text": "string (text)", // Reintext für Anzeige (ohne Overlap)
|
||||
|
|
@ -121,3 +123,4 @@ Es müssen Payload-Indizes für folgende Felder existieren:
|
|||
* `kind`
|
||||
* `scope`
|
||||
* `note_id`
|
||||
```
|
||||
|
|
@ -1,71 +1,77 @@
|
|||
---
|
||||
doc_type: technical_reference
|
||||
audience: developer, devops
|
||||
scope: backend, ingestion, smart_edges, edge_registry
|
||||
scope: backend, ingestion, smart_edges, edge_registry, modularization
|
||||
status: active
|
||||
version: 2.8.1
|
||||
context: "Detaillierte technische Beschreibung der Import-Pipeline, Mistral-safe Parsing und Deep Fallback Resilienz."
|
||||
version: 2.9.0
|
||||
context: "Detaillierte technische Beschreibung der Import-Pipeline, Two-Pass-Workflow (WP-15b) und modularer Datenbank-Architektur (WP-14). Integriert Mistral-safe Parsing und Deep Fallback."
|
||||
---
|
||||
|
||||
# Ingestion Pipeline & Smart Processing
|
||||
|
||||
**Quellen:** `pipeline_playbook.md`, `ingestion.py`, `edge_registry.py`, `01_edge_vocabulary.md`, `llm_service.py`
|
||||
**Quellen:** `pipeline_playbook.md`, `ingestion_processor.py`, `ingestion_db.py`, `ingestion_validation.py`, `registry.py`, `edge_registry.py`
|
||||
|
||||
Die Ingestion transformiert Markdown in den Graphen. Entrypoint: `scripts/import_markdown.py` (CLI) oder `routers/ingest.py` (API). Seit v2.9 nutzt dieser Prozess ein hocheffizientes **Two-Pass-Verfahren**, um globale Kontext-Informationen für die semantische Validierung bereitzustellen, ohne die Idempotenz oder die Change-Detection zu verletzen.
|
||||
|
||||
|
||||
Die Ingestion transformiert Markdown in den Graphen. Entrypoint: `scripts/import_markdown.py` (CLI) oder `routers/ingest.py` (API). Seit v2.8 integriert dieser Prozess eine **intelligente Quoten-Steuerung** (WP-20) und ein **robustes JSON-Parsing** für Cloud-Modelle (Mistral/Gemini).
|
||||
|
||||
## 1. Der Import-Prozess (16-Schritte-Workflow)
|
||||
|
||||
Der Prozess ist **asynchron** und **idempotent**.
|
||||
Der Prozess ist **asynchron**, **idempotent** und wird nun in zwei logische Durchläufe (Passes) unterteilt, um die semantische Genauigkeit zu maximieren.
|
||||
|
||||
### Phase 1: Pre-Scan & Context (Pass 1)
|
||||
1. **Trigger & Async Dispatch:**
|
||||
* **API (`/save`):** Nimmt Request entgegen, validiert und startet Background-Task ("Fire & Forget"). Antwortet sofort mit `202/Queued`.
|
||||
* **CLI:** Iteriert über Dateien und nutzt `asyncio.Semaphore` zur Drosselung.
|
||||
2. **Markdown lesen:** Rekursives Scannen des Vaults.
|
||||
2. **Markdown lesen:** Rekursives Scannen des Vaults zur Erstellung des Dateiinventars.
|
||||
3. **Frontmatter Check & Hard Skip (WP-22):**
|
||||
* Extraktion von `status` und `type`.
|
||||
* **Hard Skip Rule:** Wenn `status` in `['system', 'template', 'archive', 'hidden']` ist, wird die Datei **sofort übersprungen**. Sie wird weder vektorisiert noch in den Graphen aufgenommen.
|
||||
* **Hard Skip Rule:** Wenn `status` in `['system', 'template', 'archive', 'hidden']` ist, wird die Datei für das Deep-Processing übersprungen, ihre Metadaten werden jedoch für den Kontext-Cache erfasst.
|
||||
* Validierung der Pflichtfelder (`id`, `title`) für alle anderen Dateien.
|
||||
4. **Edge Registry Initialisierung (WP-22):**
|
||||
* Laden der Singleton-Instanz der `EdgeRegistry`.
|
||||
* Validierung der Vokabular-Datei unter `MINDNET_VOCAB_PATH`.
|
||||
5. **Config Resolution:**
|
||||
* Bestimmung von `chunking_profile` und `retriever_weight`.
|
||||
5. **Config Resolution (WP-14):**
|
||||
* Bestimmung von `chunking_profile` und `retriever_weight` via zentraler `TypeRegistry`.
|
||||
* **Priorität:** 1. Frontmatter (Override) -> 2. `types.yaml` (Type) -> 3. Global Default.
|
||||
6. **Note-Payload generieren:**
|
||||
* Erstellen des JSON-Objekts inklusive `status` (für Scoring).
|
||||
* **Multi-Hash Calculation:** Berechnet Hashtabellen für `body` (nur Text) und `full` (Text + Metadaten).
|
||||
7. **Change Detection:**
|
||||
* Vergleich des Hashes mit Qdrant.
|
||||
* Strategie wählbar via ENV `MINDNET_CHANGE_DETECTION_MODE` (`full` oder `body`).
|
||||
8. **Chunking anwenden:** Zerlegung des Textes basierend auf dem ermittelten Profil (siehe Kap. 3).
|
||||
9. **Smart Edge Allocation (WP-20):**
|
||||
* Wenn `enable_smart_edge_allocation: true`: Der `SemanticAnalyzer` sendet Chunks an das LLM.
|
||||
* **Traffic Control:** Request nutzt `priority="background"`. Semaphore drosselt die Last.
|
||||
* **Resilienz (Quota Handling):** Erkennt HTTP 429 (Rate-Limit) und pausiert kontrolliert (via `LLM_RATE_LIMIT_WAIT`), bevor ein Cloud-Retry erfolgt.
|
||||
* **Mistral-safe Parsing:** Automatisierte Bereinigung von BOS-Tokens (`<s>`) und Framework-Tags (`[OUT]`) sowie Recovery-Logik für Dictionaries (Suche nach `edges`, `links`, `results`, `kanten`).
|
||||
* **Deep Fallback (v2.11.14):** Erkennt "Silent Refusals" (Data Policy Violations). Liefert die Cloud trotz erfolgreicher Verbindung keine verwertbaren Kanten, wird ein lokaler Fallback via Ollama erzwungen, um Kantenverlust zu vermeiden.
|
||||
10. **Inline-Kanten finden:** Parsing von `[[rel:...]]`.
|
||||
11. **Alias-Auflösung & Kanonisierung (WP-22):**
|
||||
* Jede Kante wird via `edge_registry.resolve()` normalisiert.
|
||||
* Aliase (z.B. `basiert_auf`) werden zu kanonischen Typen (z.B. `based_on`) aufgelöst.
|
||||
6. **LocalBatchCache & Summary Generation (WP-15b):**
|
||||
* Erstellung von Kurz-Zusammenfassungen für jede Note.
|
||||
* Speicherung im `batch_cache` als Referenzrahmen für die spätere Kantenvalidierung.
|
||||
|
||||
### Phase 2: Semantic Processing & Persistence (Pass 2)
|
||||
7. **Note-Payload & Multi-Hash (WP-15b):**
|
||||
* Erstellen des JSON-Objekts inklusive `status`.
|
||||
* **Multi-Hash Calculation:** Berechnet Hashtabellen für `body` (nur Text) und `full` (Text + Metadaten) zur präzisen Änderungskontrolle.
|
||||
8. **Change Detection:**
|
||||
* Vergleich des aktuellen Hashes mit den Daten in Qdrant (Collection `{prefix}_notes`).
|
||||
* Strategie wählbar via ENV `MINDNET_CHANGE_DETECTION_MODE` (`full` oder `body`). Unveränderte Dateien werden hier final übersprungen.
|
||||
9. **Purge Old Artifacts (WP-14):**
|
||||
* Bei Änderungen löscht `purge_artifacts()` via `app.core.ingestion.ingestion_db` alle alten Chunks und Edges der Note.
|
||||
* Die Namensauflösung erfolgt nun über das modularisierte `database`-Paket.
|
||||
10. **Chunking anwenden:** Zerlegung des Textes basierend auf dem ermittelten Profil (siehe Kap. 3).
|
||||
11. **Smart Edge Allocation & Semantic Validation (WP-15b):**
|
||||
* Der `SemanticAnalyzer` schlägt Kanten-Kandidaten vor.
|
||||
* **Validierung:** Jeder Kandidat wird durch das LLM semantisch gegen das Ziel im **LocalBatchCache** geprüft.
|
||||
* **Traffic Control:** Nutzung der neutralen `clean_llm_text` Funktion zur Bereinigung von Steuerzeichen (`<s>`, `[OUT]`).
|
||||
* **Deep Fallback (v2.11.14):** Erkennt "Silent Refusals". Liefert die Cloud keine verwertbaren Kanten, wird ein lokaler Fallback via Ollama erzwungen.
|
||||
12. **Inline-Kanten finden:** Parsing von `[[rel:...]]` und Callouts.
|
||||
13. **Alias-Auflösung & Kanonisierung (WP-22):**
|
||||
* Jede Kante wird via `EdgeRegistry` normalisiert (z.B. `basiert_auf` -> `based_on`).
|
||||
* Unbekannte Typen werden in `unknown_edges.jsonl` protokolliert.
|
||||
12. **Callout-Kanten finden:** Parsing von `> [!edge]`.
|
||||
13. **Default- & Matrix-Edges erzeugen:** Anwendung der `edge_defaults` aus Registry und Matrix-Logik.
|
||||
14. **Strukturkanten erzeugen:** `belongs_to`, `next`, `prev`.
|
||||
15. **Embedding (Async):** Generierung via `nomic-embed-text` (768 Dim).
|
||||
16. **Diagnose:** Integritäts-Check nach dem Lauf.
|
||||
14. **Default- & Strukturkanten:** Anwendung der `edge_defaults` und Erzeugung von Systemkanten (`belongs_to`, `next`, `prev`).
|
||||
15. **Embedding (Async):** Generierung der Vektoren via `nomic-embed-text` (768 Dimensionen).
|
||||
16. **Database Sync (WP-14):** Batch-Upsert aller Points in die Collections `{prefix}_chunks` und `{prefix}_edges` über die zentrale Infrastruktur.
|
||||
|
||||
---
|
||||
|
||||
## 2. Betrieb & CLI Befehle
|
||||
|
||||
### 2.1 Standard-Betrieb (Inkrementell)
|
||||
Für regelmäßige Updates (Cronjob). Erkennt Änderungen via Hash.
|
||||
Erkennt Änderungen via Multi-Hash.
|
||||
|
||||
```bash
|
||||
export QDRANT_URL="http://localhost:6333"
|
||||
export COLLECTION_PREFIX="mindnet"
|
||||
# Steuert, wann eine Datei als "geändert" gilt
|
||||
export MINDNET_CHANGE_DETECTION_MODE="full"
|
||||
|
||||
# Nutzt das Venv der Produktionsumgebung
|
||||
|
|
@ -78,20 +84,13 @@ export MINDNET_CHANGE_DETECTION_MODE="full"
|
|||
```
|
||||
|
||||
> **[!WARNING] Purge-Before-Upsert**
|
||||
> Das Flag `--purge-before-upsert` ist kritisch. Es löscht vor dem Schreiben einer Note ihre alten Chunks/Edges. Ohne dieses Flag entstehen **"Geister-Chunks"** (alte Textabschnitte, die im Markdown gelöscht wurden, aber im Index verbleiben).
|
||||
> Das Flag `--purge-before-upsert` nutzt nun `ingestion_db.purge_artifacts`. Es ist kritisch, um "Geister-Chunks" (verwaiste Daten nach Textlöschung) konsistent aus den spezialisierten Collections zu entfernen.
|
||||
|
||||
### 2.2 Full Rebuild (Clean Slate)
|
||||
Notwendig bei Änderungen an `types.yaml` (z.B. neue Chunking-Profile), der Registry oder Modell-Wechsel.
|
||||
Notwendig bei Änderungen an `types.yaml`, der Registry oder Modell-Wechsel.
|
||||
|
||||
```bash
|
||||
# 0. Modell sicherstellen
|
||||
ollama pull nomic-embed-text
|
||||
|
||||
# 1. Qdrant Collections löschen (Wipe)
|
||||
python3 -m scripts.reset_qdrant --mode wipe --prefix "mindnet" --yes
|
||||
|
||||
# 2. Vollständiger Import (Force)
|
||||
# --force ignoriert alle Hashes und schreibt alles neu
|
||||
# --force ignoriert alle Hashes und erzwingt den vollständigen Two-Pass Workflow
|
||||
python3 -m scripts.import_markdown --vault ./vault --prefix "mindnet" --apply --force
|
||||
```
|
||||
|
||||
|
|
@ -99,22 +98,20 @@ python3 -m scripts.import_markdown --vault ./vault --prefix "mindnet" --apply --
|
|||
|
||||
## 3. Chunking & Payload
|
||||
|
||||
Das Chunking ist profilbasiert und in `types.yaml` konfiguriert.
|
||||
Das Chunking ist profilbasiert und bezieht seine Konfiguration dynamisch aus der `TypeRegistry`.
|
||||
|
||||
### 3.1 Profile und Strategien (Vollständige Referenz)
|
||||
### 3.1 Profile und Strategien
|
||||
|
||||
| Profil | Strategie | Parameter | Einsatzgebiet |
|
||||
| :--- | :--- | :--- | :--- |
|
||||
| `sliding_short` | `sliding_window` | Max: 350, Target: 200 | Kurze Logs, Chats, Risiken. |
|
||||
| `sliding_standard` | `sliding_window` | Max: 650, Target: 450 | Massendaten (Journal, Quellen). |
|
||||
| `sliding_smart_edges`| `sliding_window` | Max: 600, Target: 400 | Fließtexte mit hohem Wert (Projekte). |
|
||||
| `structured_smart_edges` | `by_heading` | `strict: false` (Soft) | Strukturierte Texte, Merging erlaubt. |
|
||||
| `structured_smart_edges_strict` | `by_heading` | `strict: true` (Hard) | **Atomare Einheiten**: Entscheidungen, Werte. |
|
||||
| `structured_smart_edges_strict_L3`| `by_heading` | `strict: true`, `level: 3` | Tief geschachtelte Prinzipien (Tier 2/MP1). |
|
||||
| `sliding_short` | `sliding_window` | Max: 350, Target: 200 | Kurze Logs, Chats. |
|
||||
| `sliding_standard` | `sliding_window` | Max: 650, Target: 450 | Standard-Wissen. |
|
||||
| `sliding_smart_edges`| `sliding_window` | Max: 600, Target: 400 | Fließtexte (Projekte). |
|
||||
| `structured_smart_edges` | `by_heading` | `strict: false` | Strukturierte Texte. |
|
||||
|
||||
### 3.2 Die `by_heading` Logik (v2.9 Hybrid)
|
||||
|
||||
Die Strategie `by_heading` zerlegt Texte anhand ihrer Struktur (Überschriften). Sie unterstützt seit v2.9 ein "Safety Net" gegen zu große Chunks.
|
||||
Die Strategie `by_heading` zerlegt Texte anhand ihrer Struktur (Überschriften). Sie unterstützt ein "Safety Net" gegen zu große Chunks.
|
||||
|
||||
* **Split Level:** Definiert die Tiefe (z.B. `2` = H1 & H2 triggern Split).
|
||||
* **Modus "Strict" (`strict_heading_split: true`):**
|
||||
|
|
@ -126,12 +123,6 @@ Die Strategie `by_heading` zerlegt Texte anhand ihrer Struktur (Überschriften).
|
|||
* **Füll-Logik:** Überschriften *auf* dem Split-Level lösen nur dann einen neuen Chunk aus, wenn der aktuelle Chunk die `target`-Größe erreicht hat.
|
||||
* *Safety Net:* Auch hier greift das `max` Token Limit.
|
||||
|
||||
### 3.3 Payload-Felder (Qdrant)
|
||||
|
||||
* `text`: Der reine Inhalt (Anzeige im UI).
|
||||
* `window`: Inhalt plus Overlap (für Embedding).
|
||||
* `chunk_profile`: Das effektiv genutzte Profil (zur Nachverfolgung).
|
||||
|
||||
---
|
||||
|
||||
## 4. Edge-Erzeugung & Prioritäten (Provenance)
|
||||
|
|
@ -143,7 +134,7 @@ Kanten werden nach Vertrauenswürdigkeit (`provenance`) priorisiert. Die höhere
|
|||
| **1** | Wikilink | `explicit:wikilink` | **1.00** | Harte menschliche Setzung. |
|
||||
| **2** | Inline | `inline:rel` | **0.95** | Typisierte menschliche Kante. |
|
||||
| **3** | Callout | `callout:edge` | **0.90** | Explizite Meta-Information. |
|
||||
| **4** | Semantic AI | `semantic_ai` | **0.90** | KI-extrahierte Verbindung (Mistral-safe). |
|
||||
| **4** | Semantic AI | `semantic_ai` | **0.90** | KI-validiert gegen LocalBatchCache. |
|
||||
| **5** | Type Default | `edge_defaults` | **0.70** | Heuristik aus der Registry. |
|
||||
| **6** | Struktur | `structure` | **1.00** | System-interne Verkettung (`belongs_to`). |
|
||||
|
||||
|
|
@ -151,18 +142,8 @@ Kanten werden nach Vertrauenswürdigkeit (`provenance`) priorisiert. Die höhere
|
|||
|
||||
## 5. Quality Gates & Monitoring
|
||||
|
||||
In v2.7+ wurden Tools zur Überwachung der Datenqualität integriert:
|
||||
**1. Registry Review (WP-14):** Prüfung der `data/logs/unknown_edges.jsonl`. Die zentrale Auflösung via `registry.py` verhindert Inkonsistenzen zwischen Import und Retrieval.
|
||||
|
||||
**1. Registry Review:** Prüfung der `data/logs/unknown_edges.jsonl`. Administratoren sollten hier gelistete Begriffe als Aliase in die `01_edge_vocabulary.md` aufnehmen.
|
||||
**2. Mistral-safe Parsing:** Automatisierte Bereinigung von LLM-Antworten in `ingestion_validation.py`. Stellt sicher, dass semantische Entscheidungen ("YES"/"NO") nicht durch technische Header verfälscht werden.
|
||||
|
||||
**2. Payload Dryrun (Schema-Check):**
|
||||
Simuliert Import, prüft JSON-Schema Konformität.
|
||||
```bash
|
||||
python3 -m scripts.payload_dryrun --vault ./test_vault
|
||||
```
|
||||
|
||||
**3. Full Edge Check (Graph-Integrität):**
|
||||
Prüft Invarianten (z.B. `next` muss reziprok zu `prev` sein).
|
||||
```bash
|
||||
python3 -m scripts.edges_full_check
|
||||
```
|
||||
**3. Purge Integrity:** Validierung, dass vor jedem Upsert alle assoziierten Artefakte in den Collections `{prefix}_chunks` und `{prefix}_edges` gelöscht wurden, um Daten-Duplikate zu vermeiden.
|
||||
|
|
@ -3,13 +3,13 @@ doc_type: technical_reference
|
|||
audience: developer, data_scientist
|
||||
scope: backend, retrieval, scoring, modularization
|
||||
status: active
|
||||
version: 2.7.1
|
||||
context: "Detaillierte Dokumentation der Scoring-Algorithmen, inklusive WP-22 Lifecycle-Modifier, Intent-Boosting und Modularisierung."
|
||||
version: 2.9.0
|
||||
context: "Detaillierte Dokumentation der Scoring-Algorithmen, inklusive WP-22 Lifecycle-Modifier, Intent-Boosting und WP-14 Modularisierung."
|
||||
---
|
||||
|
||||
# Retrieval & Scoring Algorithmen
|
||||
|
||||
Der Retriever unterstützt **Semantic Search** und **Hybrid Search**. Seit v2.4 nutzt Mindnet ein gewichtetes Scoring-Modell, das Semantik, Graphentheorie und Metadaten kombiniert. Mit Version 2.7 (WP-22) wurde dieses Modell um **Lifecycle-Faktoren** und **Intent-Boosting** erweitert sowie die Architektur modularisiert.
|
||||
Der Retriever unterstützt **Semantic Search** und **Hybrid Search**. Seit v2.4 nutzt Mindnet ein gewichtetes Scoring-Modell, das Semantik, Graphentheorie und Metadaten kombiniert. Mit Version 2.7 (WP-22) wurde dieses Modell um **Lifecycle-Faktoren** und **Intent-Boosting** erweitert sowie die Architektur modularisiert (WP-14).
|
||||
|
||||
## 1. Scoring Formel (v2.7.0)
|
||||
|
||||
|
|
@ -37,18 +37,19 @@ $$
|
|||
* **Zweck:** Belohnt Chunks, die "im Thema" vernetzt sind.
|
||||
|
||||
**4. Centrality Bonus ($B_{cent}$):**
|
||||
* **Kontext:** Berechnet im lokalen Subgraphen.
|
||||
* **Kontext:** Berechnet im lokalen Subgraphen via `graph_subgraph.centrality_bonus`.
|
||||
* **Logik:** Vereinfachte PageRank-Metrik (Degree Centrality).
|
||||
* **Zweck:** Belohnt "Hubs" mit vielen Verbindungen zu anderen Treffern.
|
||||
|
||||
### Die WP-22 Erweiterungen (v2.7.0)
|
||||
|
||||
**5. Status Modifier ($M_{status}$):**
|
||||
* **Herkunft:** Feld `status` aus dem Frontmatter.
|
||||
* **Herkunft:** Feld `status` aus dem Frontmatter (verarbeitet in `retriever_scoring.get_status_multiplier`).
|
||||
* **Zweck:** Bestraft unfertiges Wissen (Drafts) oder bevorzugt stabiles Wissen.
|
||||
* **Werte (Auftrag WP-22):** * `stable`: **1.2** (Bonus für Qualität).
|
||||
* `draft`: **0.5** (Malus für Entwürfe).
|
||||
* `system`: Exkludiert (siehe Ingestion).
|
||||
* **Werte (Auftrag WP-22):**
* `stable`: **1.2** (Belohnung für verifiziertes Wissen).
|
||||
* `active`: **1.0** (Standard-Gewichtung).
|
||||
* `draft`: **0.5** (Malus für unfertige Fragmente).
|
||||
* `system`: Exkludiert (siehe Ingestion Lifecycle Filter).
|
||||
|
||||
**6. Intent Boost ($B_{intent}$):**
|
||||
* **Herkunft:** Dynamische Injektion durch die Decision Engine basierend auf der Nutzerfrage.
|
||||
|
|
@ -56,47 +57,61 @@ $$
|
|||
|
||||
---
|
||||
|
||||
## 2. Hybrid Retrieval Flow & Modularisierung
|
||||
## 2. Hybrid Retrieval Flow & Modularisierung (WP-14)
|
||||
|
||||
In v2.7 wurde die Engine in einen Orchestrator (`retriever.py`) und eine Scoring-Engine (`retriever_scoring.py`) aufgeteilt.
|
||||
Seit v2.9 ist die Retrieval-Engine im spezialisierten Paket `app.core.retrieval` gekapselt. Die Zuständigkeiten sind strikt zwischen Orchestrierung und mathematischer Bewertung getrennt.
|
||||
|
||||
**Phase 1: Vector Search (Seed Generation)**
|
||||
* Der Orchestrator sucht Top-K (Standard: 20) Kandidaten via Embeddings in Qdrant.
|
||||
* Der Orchestrator (`retriever.py`) sucht Top-K (Standard: 20) Kandidaten via Embeddings in Qdrant über das modularisierte `app.core.database` Paket.
|
||||
* Diese bilden die "Seeds" für den Graphen.
|
||||
|
||||
**Phase 2: Graph Expansion**
|
||||
* Nutze `graph_adapter.expand(seeds, depth=1)`.
|
||||
* Lade direkte Nachbarn aus der `_edges` Collection.
|
||||
* Konstruiere einen `NetworkX`-Graphen im Speicher.
|
||||
* Nutze die Fassade `app.core.graph_adapter.expand(seeds, depth=1)`.
|
||||
* Diese delegiert an `app.core.graph.graph_subgraph`, um direkte Nachbarn aus der `_edges` Collection zu laden.
|
||||
* Konstruktion eines in-memory Graphen zur Berechnung topologischer Boni.
|
||||
|
||||
**Phase 3: Re-Ranking (Modular)**
|
||||
* Der Orchestrator übergibt den Graphen und die Seeds an die `ScoringEngine`.
|
||||
* Berechne Boni ($B_{edge}$, $B_{cent}$) sowie die neuen Lifecycle- und Intent-Modifier.
|
||||
* Sortierung absteigend nach `TotalScore` und Limitierung auf Top-Resultate (z.B. 5).
|
||||
* Der Orchestrator übergibt den Graphen und die Seeds an die `ScoringEngine` (`retriever_scoring.py`).
|
||||
* Berechnung der finalen Scores unter Berücksichtigung von $B_{edge}$, $B_{cent}$ sowie der Lifecycle- und Intent-Modifier.
|
||||
* Sortierung absteigend nach `TotalScore` und Limitierung auf die angeforderten Top-Resultate.
|
||||
|
||||
---
|
||||
|
||||
## 3. Explanation Layer (WP-22 Update)
|
||||
|
||||
Bei `explain=True` generiert das System eine detaillierte Begründung.
|
||||
Bei `explain=True` generiert das System eine detaillierte Begründung inklusive Provenienz-Informationen.
|
||||
|
||||
**Erweiterte JSON-Struktur:**
|
||||
|
||||
```json
|
||||
{
|
||||
"score_breakdown": {
|
||||
"semantic": 0.85,
|
||||
"type_boost": 1.0,
|
||||
"lifecycle_modifier": 0.5,
|
||||
"edge_bonus": 0.4,
|
||||
"intent_boost": 0.5,
|
||||
"centrality": 0.1
|
||||
"semantic_contribution": 0.85,
|
||||
"edge_contribution": 0.4,
|
||||
"centrality_contribution": 0.1,
|
||||
"raw_semantic": 0.85,
|
||||
"raw_edge_bonus": 0.3,
|
||||
"raw_centrality": 0.1,
|
||||
"node_weight": 1.0,
|
||||
"status_multiplier": 1.2,
|
||||
"graph_boost_factor": 1.5
|
||||
},
|
||||
"reasons": [
|
||||
"Hohe textuelle Übereinstimmung (>0.85).",
|
||||
"Status 'draft' reduziert Relevanz (Modifier 0.5).",
|
||||
"Wird referenziert via 'caused_by' (Intent-Bonus 0.5).",
|
||||
"Bevorzugt, da Typ 'decision' (Gewicht 1.0)."
|
||||
{
|
||||
"kind": "semantic",
|
||||
"message": "Hohe textuelle Übereinstimmung (>0.85).",
|
||||
"score_impact": 0.85
|
||||
},
|
||||
{
|
||||
"kind": "type",
|
||||
"message": "Bevorzugt durch Typ-Profil.",
|
||||
"score_impact": 0.1
|
||||
},
|
||||
{
|
||||
"kind": "edge",
|
||||
"message": "Bestätigte Kante 'caused_by' [Boost x1.5] von 'Note-A'.",
|
||||
"score_impact": 0.4
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
|
@ -105,18 +120,18 @@ Bei `explain=True` generiert das System eine detaillierte Begründung.
|
|||
|
||||
## 4. Konfiguration (`retriever.yaml`)
|
||||
|
||||
Steuert die Gewichtung der mathematischen Komponenten.
|
||||
Steuert die globale Gewichtung der mathematischen Komponenten.
|
||||
|
||||
```yaml
|
||||
scoring:
|
||||
semantic_weight: 1.0 # Basis-Relevanz
|
||||
edge_weight: 0.7 # Graphen-Einfluss
|
||||
centrality_weight: 0.5 # Hub-Einfluss
|
||||
semantic_weight: 1.0 # Basis-Relevanz (W_sem)
|
||||
edge_weight: 0.7 # Graphen-Einfluss (W_edge)
|
||||
centrality_weight: 0.5 # Hub-Einfluss (W_cent)
|
||||
|
||||
# WP-22 Lifecycle Konfiguration (Abgleich mit Auftrag)
|
||||
# WP-22 Lifecycle Konfiguration
|
||||
lifecycle_weights:
|
||||
stable: 1.2 # Bonus für Qualität
|
||||
draft: 0.5 # Malus für Entwürfe
|
||||
stable: 1.2 # Modifier für Qualität
|
||||
draft: 0.5 # Modifier für Entwürfe
|
||||
|
||||
# Kanten-Gewichtung für den Edge-Bonus (Basis)
|
||||
edge_weights:
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
---
|
||||
doc_type: developer_guide
|
||||
audience: developer
|
||||
scope: workflow, testing, architecture, modules
|
||||
scope: workflow, testing, architecture, modules, modularization
|
||||
status: active
|
||||
version: 2.6.1
|
||||
context: "Umfassender Guide für Entwickler: Architektur, Modul-Interna (Deep Dive), Setup, Git-Workflow und Erweiterungs-Anleitungen."
|
||||
version: 2.9.1
|
||||
context: "Umfassender Guide für Entwickler: Modularisierte Architektur (WP-14), Two-Pass Ingestion (WP-15b), Modul-Interna, Setup und Git-Workflow."
|
||||
---
|
||||
|
||||
# Mindnet Developer Guide & Workflow
|
||||
|
|
@ -23,8 +23,6 @@ Dieser Guide ist die zentrale technische Referenz für Mindnet v2.6. Er vereint
|
|||
- [Kern-Philosophie](#kern-philosophie)
|
||||
- [2. Architektur](#2-architektur)
|
||||
- [2.1 High-Level Übersicht](#21-high-level-übersicht)
|
||||
- [2.2 Datenfluss-Muster](#22-datenfluss-muster)
|
||||
- [A. Ingestion (Write)](#a-ingestion-write)
|
||||
- [B. Retrieval (Read)](#b-retrieval-read)
|
||||
- [C. Visualisierung (Graph)](#c-visualisierung-graph)
|
||||
- [3. Physische Architektur](#3-physische-architektur)
|
||||
|
|
@ -84,23 +82,28 @@ graph TD
|
|||
API["main.py"]
|
||||
RouterChat["Chat / RAG"]
|
||||
RouterIngest["Ingest / Write"]
|
||||
CoreRet["Retriever Engine"]
|
||||
CoreIngest["Ingestion Pipeline"]
|
||||
|
||||
subgraph "Core Packages (WP-14)"
|
||||
PkgRet["retrieval/ (Search)"]
|
||||
PkgIng["ingestion/ (Import)"]
|
||||
PkgGra["graph/ (Logic)"]
|
||||
PkgDb["database/ (Infrastr.)"]
|
||||
Registry["registry.py (Neutral)"]
|
||||
end
|
||||
end
|
||||
|
||||
subgraph "Infrastructure & Services"
|
||||
LLM["Ollama (Phi3/Nomic)"]
|
||||
LLM["Ollama / Cloud (Hybrid)"]
|
||||
DB[("Qdrant Vector DB")]
|
||||
FS["File System (.md)"]
|
||||
end
|
||||
|
||||
User <--> UI
|
||||
UI -- "REST (Chat, Save, Feedback)" --> API
|
||||
UI -. "Direct Read (Graph Viz Performance)" .-> DB
|
||||
API -- "Embeddings & Completion" --> LLM
|
||||
API -- "Read/Write" --> DB
|
||||
API -- "Read/Write (Source of Truth)" --> FS
|
||||
```
|
||||
UI -- "REST Call" --> API
|
||||
PkgRet -- "Direct Query" --> PkgDb
|
||||
PkgIng -- "Process & Write" --> PkgDb
|
||||
PkgDb -- "API" --> DB
|
||||
API -- "Inference" --> LLM
```
|
||||
|
||||
### 2.2 Datenfluss-Muster
|
||||
|
||||
|
|
@ -108,14 +111,12 @@ graph TD
|
|||
Vom Markdown zur Vektor-Datenbank.
|
||||
```mermaid
|
||||
graph LR
|
||||
MD["Markdown File"] --> Parser("Parser")
|
||||
Parser --> Chunker("Chunker")
|
||||
Chunker -- "Text Chunks" --> SemAn{"SemanticAnalyzer<br/>(LLM)"}
|
||||
SemAn -- "Smart Edges" --> Embedder("Embedder")
|
||||
Embedder --> DB[("Qdrant<br/>Points")]
|
||||
|
||||
style DB fill:#f9f,stroke:#333,stroke-width:2px
|
||||
style SemAn fill:#ff9,stroke:#333,stroke-width:2px
|
||||
MD["Markdown File"] --> Pass1["Pass 1: Pre-Scan"]
|
||||
Pass1 --> Cache[("LocalBatchCache<br/>(Titles/Summaries)")]
|
||||
MD --> Pass2["Pass 2: Processing"]
|
||||
Cache -- "Context" --> SmartEdges{"Smart Edge<br/>Validation"}
|
||||
SmartEdges --> Embedder("Embedder")
|
||||
Embedder --> DB[("Qdrant Points")]
|
||||
```
|
||||
|
||||
#### B. Retrieval (Read)
|
||||
|
|
@ -123,17 +124,10 @@ Die hybride Suche für Chat & RAG.
|
|||
```mermaid
|
||||
graph LR
|
||||
Query(["Query"]) --> Embed("Embedding")
|
||||
Embed --> Hybrid{"Hybrid Search"}
|
||||
|
||||
subgraph Search Components
|
||||
Vec["Vector Score"]
|
||||
Graph["Graph/Edge Bonus"]
|
||||
end
|
||||
|
||||
Vec --> Hybrid
|
||||
Graph --> Hybrid
|
||||
|
||||
Hybrid --> Rank("Re-Ranking")
|
||||
Embed --> Seed["Seed Search (Vector)"]
|
||||
Seed --> Expand{"Graph Expansion"}
|
||||
Expand --> Scoring["Scoring Engine (WP-22)"]
|
||||
Scoring --> Rank("Final Ranking")
|
||||
Rank --> Ctx["LLM Context"]
|
||||
```
|
||||
|
||||
|
|
@ -170,6 +164,12 @@ Das System ist modular aufgebaut. Hier ist die detaillierte Analyse aller Kompon
|
|||
mindnet/
|
||||
├── app/
|
||||
│ ├── core/ # Business Logic & Algorithms
|
||||
│ │ ├── database/ # WP-14: Qdrant Client & Point Mapping
|
||||
│ │ ├── ingestion/ # WP-14: Pipeline, Multi-Hash, Validation
|
||||
│ │ ├── retrieval/ # WP-14: Search Orchestrator & Scoring
|
||||
│ │ ├── graph/ # WP-14: Subgraph-Logik & Weights
|
||||
│ │ ├── registry.py # SSOT: Circular Import Fix & Text Cleanup
|
||||
│ │ └── *.py (Proxy) # Legacy Bridges für Abwärtskompatibilität
|
||||
│ ├── routers/ # API Interface (FastAPI)
|
||||
│ ├── services/ # External Integrations (LLM, DB)
|
||||
│ ├── models/ # Pydantic DTOs
|
||||
|
|
@ -285,6 +285,8 @@ Folgende Dateien wurden im Audit v2.6 als veraltet, redundant oder "Zombie-Code"
|
|||
| `app/core/type_registry.py` | **Redundant.** Logik in `ingestion.py` integriert. | 🗑️ Löschen |
|
||||
| `app/core/env_vars.py` | **Veraltet.** Ersetzt durch `config.py`. | 🗑️ Löschen |
|
||||
| `app/services/llm_ollama.py` | **Veraltet.** Ersetzt durch `llm_service.py`. | 🗑️ Löschen |
|
||||
| `app/core/type_registry.py` | **Redundant.** Logik in `app/core/registry.py` integriert. | 🗑️ Löschen |
|
||||
| `app/core/ranking.py` | **Redundant.** Logik in `retrieval/retriever_scoring.py` integriert. | 🗑️ Löschen |
|
||||
|
||||
---
|
||||
|
||||
|
|
|
|||
|
|
@ -2,18 +2,14 @@
|
|||
doc_type: roadmap
|
||||
audience: product_owner, developer
|
||||
status: active
|
||||
version: 2.8.0
|
||||
context: "Aktuelle Planung für kommende Features (ab WP16), Release-Strategie und Historie der abgeschlossenen WPs."
|
||||
version: 2.9.1
|
||||
context: "Aktuelle Planung für kommende Features (ab WP16), Release-Strategie und Historie der abgeschlossenen WPs nach WP-14/15b."
|
||||
---
|
||||
|
||||
# Mindnet Active Roadmap
|
||||
|
||||
**Aktueller Stand:** v2.8.0 (Post-WP20/WP76)
|
||||
**Fokus:** Visualisierung, Exploration & Cloud-Resilienz.
|
||||
|
||||
## 1. Programmstatus
|
||||
|
||||
Wir haben mit der Implementierung des Graph Explorers (WP19), der Smart Edge Allocation (WP15) und der hybriden Cloud-Resilienz (WP20) die Basis für ein intelligentes, robustes System gelegt. Der nächste Schritt (WP19a) vertieft die Analyse, während WP16 die "Eingangs-Intelligenz" erhöht.
|
||||
**Aktueller Stand:** v2.9.1 (Post-WP14 / WP-15b)
|
||||
**Fokus:** Modularisierung, Two-Pass Ingestion & Graph Intelligence.
|
||||
|
||||
| Phase | Fokus | Status |
|
||||
| :--- | :--- | :--- |
|
||||
|
|
@ -45,6 +41,8 @@ Eine Übersicht der implementierten Features zum schnellen Auffinden von Funktio
|
|||
| **WP-10** | Web UI | Streamlit-Frontend als Ersatz für das Terminal. |
|
||||
| **WP-10a**| Draft Editor | GUI-Komponente zum Bearbeiten und Speichern generierter Notizen. |
|
||||
| **WP-11** | Backend Intelligence | `nomic-embed-text` (768d) und Matrix-Logik für Kanten-Typisierung. |
|
||||
| **WP-14** | **Modularisierung & Refactoring** | **Ergebnis:** Aufteilung in domänenspezifische Pakete (`database`, `ingestion`, `retrieval`, `graph`). Implementierung von Proxy-Adaptern für Abwärtskompatibilität und `registry.py` zur Lösung von Zirkelbezügen. |
|
||||
| **WP-15b**| **Candidate-Based Validation** | **Ergebnis:** Implementierung des **Two-Pass Workflows**. Einführung des `LocalBatchCache` und binäre semantische Validierung von Kanten-Kandidaten zur Vermeidung von Halluzinationen. |
|
||||
| **WP-15** | Smart Edge Allocation | LLM-Filter für Kanten in Chunks + Traffic Control (Semaphore) + Strict Chunking. |
|
||||
| **WP-19** | Graph Visualisierung | **Frontend Modularisierung:** Umbau auf `ui_*.py`.<br>**Graph Engines:** Parallelbetrieb von Cytoscape (COSE) und Agraph.<br>**Tools:** "Single Source of Truth" Editor, Persistenz via URL. |
|
||||
| **WP-20** | **Cloud Hybrid Mode & Resilienz** | **Ergebnis:** Integration von OpenRouter (Mistral 7B) & Gemini 2.5 Lite. Implementierung von WP-76 (Rate-Limit Wait) & Mistral-safe JSON Parsing. |
|
||||
|
|
@ -59,6 +57,10 @@ Eine Übersicht der implementierten Features zum schnellen Auffinden von Funktio
|
|||
* **Quoten-Management:** Die Nutzung von Free-Tier Modellen (Mistral/OpenRouter) erfordert zwingend eine intelligente Rate-Limit-Erkennung (HTTP 429) mit automatisierten Wartezyklen, um Batch-Prozesse stabil zu halten.
|
||||
* **Parser-Robustheit:** Cloud-Modelle betten JSON oft in technische Steuerzeichen (`<s>`, `[OUT]`) ein. Ein robuster Extraktor mit Recovery-Logik ist essentiell zur Vermeidung von Datenverlust.
|
||||
|
||||
### 2.3 WP-14 & WP-15b Lessons Learned
|
||||
* **Performance:** Der Pre-Scan (Pass 1) ist minimal invasiv, ermöglicht aber in Pass 2 eine drastische Reduktion der LLM-Kosten, da nur noch binär validiert werden muss, anstatt komplexe Extraktionen durchzuführen.
|
||||
* **Wartbarkeit:** Durch die Paket-Struktur können DB-Adapter (z.B. für Qdrant) nun unabhängig von der Business-Logik (Scoring) aktualisiert werden.
|
||||
*
|
||||
---
|
||||
|
||||
## 3. Offene Workpackages (Planung)
|
||||
|
|
@ -93,6 +95,20 @@ Diese Features stehen als nächstes an oder befinden sich in der Umsetzung.
|
|||
- Aufwand: Mittel
|
||||
- Komplexität: Niedrig/Mittel
|
||||
|
||||
|
||||
|
||||
### WP-13 – MCP-Integration & Agenten-Layer
|
||||
**Status:** 🟡 Geplant
|
||||
**Ziel:** mindnet als MCP-Server bereitstellen, damit Agenten (Claude Desktop, OpenAI) standardisierte Tools nutzen können.
|
||||
* **Umfang:** MCP-Server mit Tools (`mindnet_query`, `mindnet_explain`, etc.).
|
||||
|
||||
### WP-14 – Review / Refactoring / Dokumentation
|
||||
**Status:** 🟡 Laufend (Phase E)
|
||||
**Ziel:** Technische Schulden abbauen, die durch schnelle Feature-Entwicklung (WP15/WP19) entstanden sind.
|
||||
* **Refactoring `chunker.py`:** Die Datei ist monolithisch geworden (Parsing, Strategien, LLM-Orchestrierung).
|
||||
* *Lösung:* Aufteilung in ein Package `app/core/chunking/` mit Modulen (`strategies.py`, `orchestration.py`, `utils.py`).
|
||||
* **Dokumentation:** Kontinuierliche Synchronisation von Code und Docs (v2.8 Stand).
|
||||
|
||||
### WP-15b – Candidate-Based Edge Validation & Inheritance
|
||||
**Phase:** B/E (Refactoring & Semantic)
|
||||
**Status:** 🚀 Startklar (Ersatz für WP-15 Logik)
|
||||
|
|
@ -113,19 +129,6 @@ Der bisherige WP-15 Ansatz litt unter Halluzinationen (erfundene Kantentypen), h
|
|||
* **Chunker-Update:** Implementierung einer `propagate_edges`-Logik für "by_heading" und "sliding_window" Strategien.
|
||||
* **Ingestion-Update:** Umstellung von `_perform_smart_edge_allocation` auf einen binären Validierungs-Prompt (VALID/INVALID).
|
||||
|
||||
### WP-19a – Graph Intelligence & Discovery (Sprint-Fokus)
|
||||
**Status:** 🚀 Startklar
|
||||
**Ziel:** Vom "Anschauen" zum "Verstehen". Deep-Dive Werkzeuge für den Graphen.
|
||||
* **Discovery Screen:** Neuer Tab für semantische Suche ("Finde Notizen über Vaterschaft") und Wildcard-Filter.
|
||||
* **Filter-Logik:** "Zeige nur Wege, die zu `type:decision` führen".
|
||||
* **Chunk Inspection:** Umschaltbare Granularität (Notiz vs. Chunk) zur Validierung des Smart Chunkers.
|
||||
|
||||
### WP-14 – Review / Refactoring / Dokumentation
|
||||
**Status:** 🟡 Laufend (Phase E)
|
||||
**Ziel:** Technische Schulden abbauen, die durch schnelle Feature-Entwicklung (WP15/WP19) entstanden sind.
|
||||
* **Refactoring `chunker.py`:** Die Datei ist monolithisch geworden (Parsing, Strategien, LLM-Orchestrierung).
|
||||
* *Lösung:* Aufteilung in ein Package `app/core/chunking/` mit Modulen (`strategies.py`, `orchestration.py`, `utils.py`).
|
||||
* **Dokumentation:** Kontinuierliche Synchronisation von Code und Docs (v2.8 Stand).
|
||||
|
||||
### WP-16 – Auto-Discovery & Intelligent Ingestion
|
||||
**Status:** 🟡 Geplant
|
||||
|
|
@ -153,10 +156,13 @@ Der bisherige WP-15 Ansatz litt unter Halluzinationen (erfundene Kantentypen), h
|
|||
* **Feature:** Cronjob `check_graph_integrity.py`.
|
||||
* **Funktion:** Findet "Dangling Edges" (Links auf gelöschte Notizen) und repariert/löscht sie.
|
||||
|
||||
### WP-13 – MCP-Integration & Agenten-Layer
|
||||
**Status:** 🟡 Geplant
|
||||
**Ziel:** mindnet als MCP-Server bereitstellen, damit Agenten (Claude Desktop, OpenAI) standardisierte Tools nutzen können.
|
||||
* **Umfang:** MCP-Server mit Tools (`mindnet_query`, `mindnet_explain`, etc.).
|
||||
### WP-19a – Graph Intelligence & Discovery (Sprint-Fokus)
|
||||
**Status:** 🚀 Startklar
|
||||
**Ziel:** Vom "Anschauen" zum "Verstehen". Deep-Dive Werkzeuge für den Graphen.
|
||||
* **Discovery Screen:** Neuer Tab für semantische Suche ("Finde Notizen über Vaterschaft") und Wildcard-Filter.
|
||||
* **Filter-Logik:** "Zeige nur Wege, die zu `type:decision` führen".
|
||||
* **Chunk Inspection:** Umschaltbare Granularität (Notiz vs. Chunk) zur Validierung des Smart Chunkers.
|
||||
|
||||
|
||||
### WP-21 – Semantic Graph Routing & Canonical Edges
|
||||
**Status:** 🟡 Geplant
|
||||
|
|
@ -185,44 +191,42 @@ Der bisherige WP-15 Ansatz litt unter Halluzinationen (erfundene Kantentypen), h
|
|||
2. **Single Source of Truth (SSOT):** Die Registry nutzt `01_edge_vocabulary.md` als führende Konfiguration.
|
||||
3. **Self-Learning Loop:** Protokollierung unbekannter Kanten in `unknown_edges.jsonl`.
|
||||
|
||||
## 23: Agentic Multi-Stream Reasoning (Mindnet 2025)
|
||||
### WP-23: Agentic Multi-Stream Reasoning (Mindnet 2025)
|
||||
|
||||
### 1. Zielsetzung & Problemstellung
|
||||
#### 1. Zielsetzung & Problemstellung
|
||||
Das bisherige System basiert auf einem globalen Scoring-Modell, bei dem Notizen unterschiedlicher Typen (z. B. `insight` vs. `belief`) in einem einzigen Retrieval-Topf konkurrieren. Dies führt dazu, dass leiser gewichtete, aber fundamentale Identitätsmerkmale oft durch hochgewichtete aktuelle Erkenntnisse verdrängt werden. Ziel dieses Pakets ist die Einführung einer parallelen **Stream-Architektur**, um die Vielschichtigkeit menschlicher Entscheidungsprozesse (Werte + Erfahrung + Absicht) im LLM-Kontext zu garantieren.
|
||||
|
||||
---
|
||||
|
||||
### 2. Funktionsbeschreibung: Die Streams
|
||||
#### 2. Funktionsbeschreibung: Die Streams
|
||||
Die Daten aus der `types.yaml` werden in drei logische Verarbeitungseinheiten unterteilt:
|
||||
|
||||
#### A. Identity Stream (Die Wahrheitsebene)
|
||||
##### A. Identity Stream (Die Wahrheitsebene)
|
||||
* **Inhalt:** `value`, `belief`, `trait`, `principle`, `need`, `boundary`, `bias`.
|
||||
* **Zweck:** Definition des moralischen Kompasses, der psychologischen Grundbedürfnisse und kognitiven Muster.
|
||||
* **Wirkung:** Liefert das "Warum" hinter jeder Handlung.
|
||||
|
||||
#### B. History Stream (Die Evidenzebene)
|
||||
##### B. History Stream (Die Evidenzebene)
|
||||
* **Inhalt:** `experience`, `event`, `source`, `journal`, `person`.
|
||||
* **Zweck:** Bereitstellung empirischer Belege aus der Vergangenheit und sozialer Kontexte.
|
||||
* **Wirkung:** Verankert die Antwort in real erlebten Mustern und Fakten.
|
||||
|
||||
#### C. Action Stream (Die Dynamikebene)
|
||||
##### C. Action Stream (Die Dynamikebene)
|
||||
* **Inhalt:** `project`, `decision`, `goal`, `task`, `risk`, `motivation`, `habit`, `state`.
|
||||
* **Zweck:** Analyse der aktuellen Richtung, geplanter Vorhaben und des gegenwärtigen Zustands.
|
||||
* **Wirkung:** Liefert den Kontext für die Umsetzung und zukünftige Ziele.
|
||||
|
||||
|
||||
### 3. Technische Wirkungsweise (Solution Sketch)
|
||||
#### 3. Technische Wirkungsweise (Solution Sketch)
|
||||
|
||||
#### Schritt 1: Query-Decomposition
|
||||
##### Schritt 1: Query-Decomposition
|
||||
Ein initialer Klassifizierungs-Agent analysiert die Nutzeranfrage und bestimmt, welcher Stream primär angesprochen werden muss (z. B. "Wie soll ich mich entscheiden?" boostet den Identity Stream).
|
||||
|
||||
#### Schritt 2: Parallel Stream Retrieval
|
||||
##### Schritt 2: Parallel Stream Retrieval
|
||||
Anstelle einer einzelnen Suche werden drei unabhängige Vektor-Suchen mit Typ-Filtern durchgeführt:
|
||||
* **Search_A (Identity):** Top-5 Ergebnisse aus Identitäts-Notizen.
|
||||
* **Search_B (History):** Top-5 Ergebnisse aus biografischen/externen Notizen.
|
||||
* **Search_C (Action):** Top-5 Ergebnisse aus operativen/strategischen Notizen.
|
||||
|
||||
#### Schritt 3: Agentic Synthesis (The Reasoning)
|
||||
##### Schritt 3: Agentic Synthesis (The Reasoning)
|
||||
Ein Synthese-Agent (LLM) erhält die aggregierten Ergebnisse in getrennten Sektionen. Die Anweisung lautet:
|
||||
1. **Prüfung:** Steht das aktuelle Vorhaben (Action) im Einklang mit den Werten (Identity)?
|
||||
2. **Abgleich:** Welche vergangenen Erfahrungen (History) stützen oder widersprechen diesem Weg?
|
||||
|
|
@ -230,12 +234,39 @@ Ein Synthese-Agent (LLM) erhält die aggregierten Ergebnisse in getrennten Sekti
|
|||
|
||||
|
||||
|
||||
### 4. Erwartete Ergebnisse
|
||||
#### 4. Erwartete Ergebnisse
|
||||
* **Höhere Resonanz:** Antworten wirken authentischer, da sie explizit auf das Wertesystem des Nutzers Bezug nehmen.
|
||||
* **Widerspruchs-Erkennung:** Das System kann den Nutzer aktiv warnen, wenn ein Projekt gegen seine `principles` oder `needs` verstößt.
|
||||
* **Robustes Retrieval:** Wichtige Identitäts-Informationen gehen nicht mehr im "Rauschen" von hunderten Journal-Einträgen verloren.
|
||||
---
|
||||
|
||||
### WP-24 – Proactive Discovery & Agentic Knowledge Mining
|
||||
**Status:** 🚀 In Planung (Nächster Architektur-Sprung)
|
||||
**Ziel:** Transformation von Mindnet von einem reaktiven Archiv zu einem aktiven Denkpartner. Das System soll aktiv Wissenslücken schließen und verborgene Querverbindungen in großen Vaults sowie in Chat-Dialogen aufspüren.
|
||||
|
||||
**Herausforderung:**
|
||||
1. **Silo-Effekt:** Bei wachsenden Vaults vergisst der Nutzer existierende Notizen und erstellt redundante Inhalte ohne Verknüpfung.
|
||||
2. **Insight-Verlust:** Im Chat entstehen wertvolle Synthesen, die momentan im flüchtigen Chat-Log vergraben bleiben.
|
||||
|
||||
**Lösungsskizze & Strategie:**
|
||||
|
||||
#### A. Proactive Discovery (Vault-Scanning)
|
||||
Das System nutzt die existierende `candidate_pool` Logik aus WP-15b, befüllt diese jedoch automatisiert:
|
||||
* **Vector Similarity Search**: Beim Import einer Note (oder als periodischer Hintergrundprozess) sucht der neue `RecommenderService` in Qdrant nach den Top-X semantisch ähnlichsten Chunks im gesamten Vault.
|
||||
* **Auto-Injection**: Diese Funde werden automatisch als `related_to` Kandidaten in den `candidate_pool` der neuen Note injiziert.
|
||||
* **WP-15b Filter**: Das LLM validiert diese Vorschläge im zweiten Pass der Ingestion gegen den Kontext. Nur was semantisch wirklich passt, wird als Kante im Graphen persistiert.
|
||||
|
||||
#### B. Agentic Knowledge Mining (Chat-to-Vault)
|
||||
Integration von Informationen aus dem Dialog direkt in den Graphen:
|
||||
* **Intent Detection**: Das Chat-Backend erkennt „notierwürdige“ Informationen (z.B. neue Prinzipien, Strategie-Entwürfe oder Werte-Anpassungen).
|
||||
* **Auto-Drafting**: Das LLM nutzt das `interview_template`, um aus dem Chat-Fragment eine valide Markdown-Datei mit Frontmatter (Status: `draft`) zu generieren.
|
||||
* **Real-Time Linking**: Die neue Datei wird sofort dem „Discovery-Lauf“ (Teil A) unterzogen, um sie mit dem bestehenden Wissensschatz zu vernetzen.
|
||||
* **User Review**: Die generierte Notiz erscheint im `00_Inbox` Ordner. Der Nutzer muss lediglich den Status auf `stable` setzen, um die Entdeckungen final zu integrieren.
|
||||
|
||||
**Erwartete Ergebnisse:**
|
||||
* Eliminierung von Wissens-Silos durch automatische Vernetzung.
|
||||
* Nahtloser Übergang von der Exploration (Chat) zur Konsolidierung (Vault).
|
||||
* Vermeidung von Dubletten durch Ähnlichkeits-Warnungen beim Import.
|
||||
## 4. Abhängigkeiten & Release-Plan
|
||||
|
||||
```mermaid
|
||||
|
|
@ -244,6 +275,8 @@ graph TD
|
|||
WP19a --> WP17(Memory)
|
||||
WP15(Smart Edges) --> WP16(Auto-Discovery)
|
||||
WP15 --> WP14(Refactoring)
|
||||
WP15(Smart Edges) --> WP15b(Candidate Validation)
|
||||
WP15b --> WP24(Proactive Discovery)
|
||||
WP03(Import) --> WP18(Health Check)
|
||||
WP03 --> WP13(MCP)
|
||||
WP04 --> WP13(MCP)
|
||||
|
|
@ -253,4 +286,5 @@ graph TD
|
|||
WP22 --> WP14
|
||||
WP15(Smart Edges) --> WP21
|
||||
WP20(Cloud Hybrid) --> WP15b
|
||||
WP24 --> WP23(Multi-Stream Reasoning)
|
||||
```
|
||||
|
|
@ -316,3 +316,44 @@ Die Gewichtung findet **Pre-Retrieval** (im Scoring-Algorithmus) statt, **nicht*
|
|||
3. Zeige die Erweiterung in `scoring.py` (Status-Gewicht & Dynamic Edge Boosting).
|
||||
|
||||
Bitte bestätige die Übernahme dieses Architektur-Pakets.
|
||||
|
||||
---
|
||||
|
||||
# Übergabe Arbeitspaket: WP-24 – Proactive Discovery & Agentic Knowledge Mining
|
||||
|
||||
## 1. Projekt-Kontext
|
||||
Wir arbeiten an **Mindnet**, einem System für einen "digitalen Zwilling". Das System nutzt einen Wissensgraphen (Qdrant), asynchrone Ingestion und eine hybride LLM-Infrastruktur (Cloud/Lokal).
|
||||
|
||||
## 2. Status Quo (Abgeschlossen: WP-15b)
|
||||
Das Arbeitspaket **WP-15b (Candidate-Based Validation)** wurde gerade erfolgreich implementiert.
|
||||
* **Two-Pass Workflow:** In Pass 1 wird ein globaler `LocalBatchCache` aufgebaut (ID, Titel, Dateiname). In Pass 2 findet eine semantische binäre Validierung (YES/NO) statt.
|
||||
* **Edge Inheritance:** Kanten werden aus Sektionen und Frontmatter an Chunks vererbt.
|
||||
* **Candidate Pool:** Nur Kanten in der Sektion `## Unzugeordnete Kanten` (Provenienz: `global_pool`) werden vom LLM geprüft. Explizite Kanten (`[!edge]` im Text) werden direkt übernommen.
|
||||
|
||||
## 3. Auftrag: WP-24 – Proactive Discovery & Agentic Knowledge Mining
|
||||
Das Ziel ist die Transformation von Mindnet zu einem aktiven Denkpartner.
|
||||
|
||||
### Teil A: Proactive Discovery (Vault-Scanning)
|
||||
* **Mechanismus:** Automatisches Befüllen des `candidate_pool` via Vektor-Ähnlichkeit.
|
||||
* **Logik:** Beim Import einer Note sucht ein neuer Service in Qdrant nach den semantisch ähnlichsten Chunks im Vault und fügt diese als `related_to` Kandidaten hinzu.
|
||||
* **Filter:** Die WP-15b Validierungs-Logik filtert diese Vorschläge anschließend.
|
||||
|
||||
### Teil B: Agentic Knowledge Mining (Chat-to-Vault)
|
||||
* **Mechanismus:** Extraktion notierwürdiger Informationen aus dem Chat.
|
||||
* **Logik:** Erstellung von Markdown-Drafts im `00_Inbox` Ordner basierend auf dem Chat-Kontext unter Nutzung des `interview_template`.
|
||||
|
||||
## 4. Erforderliche Code-Basis (Dateien)
|
||||
Stelle sicher, dass dir folgende Dateien vorliegen, um die Logik zu verstehen und zu erweitern:
|
||||
|
||||
1. **`app/core/ingestion.py` (v2.12.2):** Zentraler Two-Pass Workflow und Validierungsgate.
|
||||
2. **`app/core/chunker.py` (v3.2.0):** Vorbereitung des Candidate-Pools und Vererbungslogik.
|
||||
3. **`scripts/import_markdown.py` (v2.4.1):** Entry-Point und Pre-Scan Harvester für den Cache.
|
||||
4. **`app/core/derive_edges.py` (v2.1.0):** Aggregator für Kanten mit Provenance-Priorisierung.
|
||||
5. **`app/services/edge_registry.py` (v0.8.0):** Validierung gegen das Kanten-Vokabular.
|
||||
6. **`config/prompts.yaml` (v2.6.0):** Enthält die `edge_validation` und `interview_template` Prompts.
|
||||
7. **`06_active_roadmap.md` (v2.9.0):** Enthält die detaillierte Planung für WP-24.
|
||||
|
||||
## 5. Nächste technische Schritte
|
||||
1. Entwurf eines `RecommenderService` für die Vektor-Suche in Qdrant.
|
||||
2. Integration des Services in die `ingestion.py` zur automatischen Befüllung des `candidate_pool`.
|
||||
3. Erweiterung des Chat-Backends um die "Capture-to-Vault" Funktionalität.
|
||||
|
|
@ -92,3 +92,22 @@ Dieses Dokument dient als Referenz für die Entstehungsgeschichte von Mindnet v2
|
|||
* **Graph Explorer:** Einführung von `st-cytoscape` für stabile, nicht-überlappende Layouts (COSE) als Ergänzung zur Legacy-Engine (Agraph).
|
||||
* **Single Source of Truth:** Der Editor lädt Inhalte nun direkt vom Dateisystem statt aus (potenziell veralteten) Vektor-Payloads.
|
||||
* **UX:** Einführung von URL-Persistenz für Layout-Settings und CSS-basiertes Highlighting zur Vermeidung von Re-Renders.
|
||||
|
||||
|
||||
## Phase E+: Architektur-Konsolidierung (WP-14)
|
||||
|
||||
### WP-14 – Modularisierung & Paket-Struktur
|
||||
* **Ziel:** Auflösung technischer Schulden und Beseitigung von Zirkelbezügen (Circular Imports).
|
||||
* **Ergebnis:**
|
||||
* **Domänen-Pakete:** Aufteilung der monolithischen `app/core/` Struktur in spezialisierte Pakete: `database/`, `ingestion/`, `retrieval/` und `graph/`.
|
||||
* **Proxy-Pattern:** Einsatz von Fassaden-Modulen (z. B. `graph_adapter.py`) zur Aufrechterhaltung der Abwärtskompatibilität für bestehende API-Endpunkte.
|
||||
* **Registry-Zentralisierung:** Auslagerung neutraler Hilfsfunktionen (wie `clean_llm_text`) in eine unabhängige `registry.py`, um Abhängigkeitsschleifen zwischen Diensten zu brechen.
|
||||
* **Tech:** Einführung von `__init__.py` Exporten zur Definition sauberer Paket-Schnittstellen.
|
||||
|
||||
### WP-15b – Two-Pass Ingestion & Candidate Validation
|
||||
* **Problem:** Die ursprüngliche Smart Edge Extraktion (WP-15) war teuer und neigte zu Halluzinationen, da sie ohne globalen Kontext operierte.
|
||||
* **Lösung:** Implementierung eines **Two-Pass Workflows**.
|
||||
* **Pass 1 (Pre-Scan):** Schnelles Einlesen aller Notizen zur Erstellung eines `LocalBatchCache` (Metadaten & Summaries).
|
||||
* **Pass 2 (Processing):** Gezielte semantische Verarbeitung nur für geänderte Dateien.
|
||||
* **Feature:** **Binary Validation Gate**. Statt Kanten frei zu erfinden, validiert das LLM nun Kanten-Kandidaten aus einem Pool gegen den Kontext des `LocalBatchCache`. Dies garantiert 100% Konformität mit der Edge Registry.
|
||||
* **Ergebnis:** Höhere Geschwindigkeit durch Reduktion komplexer LLM-Prompts auf binäre Entscheidungen (VALID/INVALID).
|
||||
|
|
@ -2,7 +2,7 @@
|
|||
from __future__ import annotations
|
||||
import argparse, os, json, glob, statistics as stats
|
||||
from app.core.parser import read_markdown, normalize_frontmatter, validate_required_frontmatter
|
||||
from app.core.chunker import assemble_chunks
|
||||
from app.core.chunking import assemble_chunks
|
||||
|
||||
def iter_md(root: str):
|
||||
for p in glob.glob(os.path.join(root, "**", "*.md"), recursive=True):
|
||||
|
|
|
|||
|
|
@ -6,7 +6,7 @@ from pathlib import Path
|
|||
# Pfad-Setup
|
||||
sys.path.insert(0, os.path.abspath("."))
|
||||
|
||||
from app.core.chunker import assemble_chunks, _extract_all_edges_from_md
|
||||
from app.core.chunking import assemble_chunks, _extract_all_edges_from_md
|
||||
from app.core.derive_edges import build_edges_for_note
|
||||
|
||||
# Mock für Settings, falls nötig
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@
|
|||
from __future__ import annotations
|
||||
import argparse, os, glob
|
||||
from app.core.parser import read_markdown, normalize_frontmatter, validate_required_frontmatter
|
||||
from app.core.chunker import assemble_chunks
|
||||
from app.core.chunking import assemble_chunks
|
||||
|
||||
def iter_md(root: str):
|
||||
return [p for p in glob.glob(os.path.join(root, "**", "*.md"), recursive=True)]
|
||||
|
|
|
|||
|
|
@ -7,7 +7,7 @@ from slugify import slugify
|
|||
from app.core.parser import read_markdown, normalize_frontmatter
|
||||
from app.core.parser import FRONTMATTER_RE # für Re-Inject
|
||||
from app.core.validate_note import validate_note_payload
|
||||
from app.core.note_payload import make_note_payload
|
||||
from app.core.ingestion.ingestion_note_payload import make_note_payload
|
||||
|
||||
DATE_IN_NAME = re.compile(r"(?P<y>\d{4})[-_\.]?(?P<m>\d{2})[-_\.]?(?P<d>\d{2})")
|
||||
|
||||
|
|
|
|||
|
|
@ -2,7 +2,10 @@
|
|||
"""
|
||||
scripts/import_markdown.py
|
||||
CLI-Tool zum Importieren von Markdown-Dateien in Qdrant.
|
||||
Updated for Mindnet v2.3.6 (Async Ingestion Support).
|
||||
WP-15b: Implementiert den Two-Pass Workflow (Pre-Scan + Processing).
|
||||
Sorgt dafür, dass der LocalBatchCache vor der Verarbeitung robust gefüllt wird.
|
||||
Indiziert Notizen nach ID, Titel und Dateiname für maximale Link-Kompatibilität.
|
||||
VERSION: 2.4.1
|
||||
"""
|
||||
import asyncio
|
||||
import os
|
||||
|
|
@ -11,21 +14,16 @@ import logging
|
|||
from pathlib import Path
|
||||
from dotenv import load_dotenv
|
||||
|
||||
import logging
|
||||
# Setzt das Level global auf INFO, damit Sie den Fortschritt sehen
|
||||
# Setzt das Level global auf INFO, damit der Fortschritt im Log sichtbar ist
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s')
|
||||
|
||||
# Wenn Sie TIEFE Einblicke wollen, setzen Sie den SemanticAnalyzer spezifisch auf DEBUG:
|
||||
logging.getLogger("app.services.semantic_analyzer").setLevel(logging.DEBUG)
|
||||
|
||||
# Importiere den neuen Async Service
|
||||
# Stellen wir sicher, dass der Pfad stimmt (Pythonpath)
|
||||
# Importiere den neuen Async Service und stelle Python-Pfad sicher
|
||||
import sys
|
||||
sys.path.append(os.getcwd())
|
||||
|
||||
from app.core.ingestion import IngestionService
|
||||
from app.core.parser import pre_scan_markdown
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
|
||||
logger = logging.getLogger("importer")
|
||||
|
||||
async def main_async(args):
|
||||
|
|
@ -34,7 +32,7 @@ async def main_async(args):
|
|||
logger.error(f"Vault path does not exist: {vault_path}")
|
||||
return
|
||||
|
||||
# Service initialisieren (startet Async Clients)
|
||||
# 1. Service initialisieren
|
||||
logger.info(f"Initializing IngestionService (Prefix: {args.prefix})")
|
||||
service = IngestionService(collection_prefix=args.prefix)
|
||||
|
||||
|
|
@ -46,14 +44,42 @@ async def main_async(args):
|
|||
|
||||
logger.info(f"Found {len(files)} markdown files.")
|
||||
|
||||
stats = {"processed": 0, "skipped": 0, "errors": 0}
|
||||
# =========================================================================
|
||||
# PASS 1: Global Pre-Scan (WP-15b Harvester)
|
||||
# Füllt den LocalBatchCache für die semantische Kanten-Validierung.
|
||||
# Nutzt ID, Titel und Filename für robusten Look-up.
|
||||
# =========================================================================
|
||||
logger.info(f"🔍 [Pass 1] Pre-scanning {len(files)} files for global context cache...")
|
||||
for f_path in files:
|
||||
try:
|
||||
ctx = pre_scan_markdown(str(f_path))
|
||||
if ctx:
|
||||
# 1. Look-up via Note ID (UUID oder Frontmatter ID)
|
||||
service.batch_cache[ctx.note_id] = ctx
|
||||
|
||||
# Wir nutzen eine Semaphore, um nicht zu viele Files gleichzeitig zu öffnen/embedden
|
||||
sem = asyncio.Semaphore(5) # Max 5 concurrent files to avoid OOM or Rate Limit
|
||||
# 2. Look-up via Titel (Wichtig für Wikilinks [[Titel]])
|
||||
service.batch_cache[ctx.title] = ctx
|
||||
|
||||
# 3. Look-up via Dateiname (Wichtig für Wikilinks [[Filename]])
|
||||
fname = os.path.splitext(f_path.name)[0]
|
||||
service.batch_cache[fname] = ctx
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"⚠️ Could not pre-scan {f_path.name}: {e}")
|
||||
|
||||
logger.info(f"✅ Context Cache populated for {len(files)} notes.")
|
||||
|
||||
# =========================================================================
|
||||
# PASS 2: Processing (Semantic Batch-Verarbeitung)
|
||||
# Nutzt den gefüllten Cache zur binären Validierung semantischer Kanten.
|
||||
# =========================================================================
|
||||
stats = {"processed": 0, "skipped": 0, "errors": 0}
|
||||
sem = asyncio.Semaphore(5) # Max 5 parallele Dateien für Cloud-Stabilität
|
||||
|
||||
async def process_with_limit(f_path):
|
||||
async with sem:
|
||||
try:
|
||||
# Nutzt den nun gefüllten Batch-Cache in der process_file Logik
|
||||
res = await service.process_file(
|
||||
file_path=str(f_path),
|
||||
vault_root=str(vault_path),
|
||||
|
|
@ -65,8 +91,8 @@ async def main_async(args):
|
|||
except Exception as e:
|
||||
return {"status": "error", "error": str(e), "path": str(f_path)}
|
||||
|
||||
# Batch Processing
|
||||
# Wir verarbeiten in Chunks, um den Progress zu sehen
|
||||
logger.info(f"🚀 [Pass 2] Starting semantic processing in batches...")
|
||||
|
||||
batch_size = 20
|
||||
for i in range(0, len(files), batch_size):
|
||||
batch = files[i:i+batch_size]
|
||||
|
|
@ -92,7 +118,7 @@ def main():
|
|||
load_dotenv()
|
||||
default_prefix = os.getenv("COLLECTION_PREFIX", "mindnet")
|
||||
|
||||
parser = argparse.ArgumentParser(description="Import Vault to Qdrant (Async)")
|
||||
parser = argparse.ArgumentParser(description="Two-Pass Markdown Ingestion for Mindnet")
|
||||
parser.add_argument("--vault", default="./vault", help="Path to vault root")
|
||||
parser.add_argument("--prefix", default=default_prefix, help="Collection prefix")
|
||||
parser.add_argument("--force", action="store_true", help="Force re-index all files")
|
||||
|
|
@ -100,7 +126,7 @@ def main():
|
|||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Starte den Async Loop
|
||||
# Starte den asynchronen Haupt-Loop
|
||||
asyncio.run(main_async(args))
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
|
|
@ -8,6 +8,8 @@ from jsonschema import ValidationError
|
|||
from app.core.parser import read_markdown, validate_required_frontmatter, normalize_frontmatter
|
||||
from app.core.note_payload import make_note_payload
|
||||
from app.core.validate_note import validate_note_payload
|
||||
from app.core.ingestion.ingestion_note_payload import make_note_payload
|
||||
|
||||
|
||||
def iter_md_files(root: str, include: str, exclude: list[str]) -> list[str]:
|
||||
# include z.B. "**/*.md"
|
||||
|
|
|
|||
|
|
@ -10,9 +10,9 @@ import argparse, os, json
|
|||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from app.core.parser import read_markdown, normalize_frontmatter, validate_required_frontmatter
|
||||
from app.core.note_payload import make_note_payload
|
||||
from app.core.chunker import assemble_chunks
|
||||
from app.core.chunk_payload import make_chunk_payloads
|
||||
from app.core.chunking import assemble_chunks
|
||||
from app.core.ingestion.ingestion_note_payload import make_note_payload
|
||||
from app.core.ingestion.ingestion_chunk_payload import make_chunk_payloads
|
||||
try:
|
||||
from app.core.derive_edges import build_edges_for_note
|
||||
except Exception:
|
||||
|
|
|
|||
|
|
@ -2,9 +2,10 @@
|
|||
from __future__ import annotations
|
||||
import argparse, os, glob, json
|
||||
from app.core.parser import read_markdown, normalize_frontmatter, validate_required_frontmatter
|
||||
from app.core.chunker import assemble_chunks
|
||||
from app.core.chunk_payload import make_chunk_payloads
|
||||
from app.core.note_payload import make_note_payload
|
||||
from app.core.chunking import assemble_chunks
|
||||
from app.core.ingestion.ingestion_note_payload import make_note_payload
|
||||
from app.core.ingestion.ingestion_chunk_payload import make_chunk_payloads
|
||||
|
||||
|
||||
def iter_md(root: str) -> list[str]:
|
||||
return [p for p in glob.glob(os.path.join(root, "**", "*.md"), recursive=True)]
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user