WP4d #16
|
|
@ -1,6 +1,8 @@
|
||||||
"""
|
"""
|
||||||
FILE: app/core/chunking/chunking_parser.py
|
FILE: app/core/chunking/chunking_parser.py
|
||||||
DESCRIPTION: Zerlegt Markdown in Blöcke und extrahiert Kanten-Strings.
|
DESCRIPTION: Zerlegt Markdown in logische Einheiten (RawBlocks).
|
||||||
|
Hält alle Überschriftenebenen (H1-H6) im Stream.
|
||||||
|
Stellt die Funktion parse_edges_robust zur Verfügung.
|
||||||
"""
|
"""
|
||||||
import re
|
import re
|
||||||
from typing import List, Tuple, Set
|
from typing import List, Tuple, Set
|
||||||
|
|
@ -11,69 +13,86 @@ _WS = re.compile(r'\s+')
|
||||||
_SENT_SPLIT = re.compile(r'(?<=[.!?])\s+(?=[A-ZÄÖÜ0-9„(])')
|
_SENT_SPLIT = re.compile(r'(?<=[.!?])\s+(?=[A-ZÄÖÜ0-9„(])')
|
||||||
|
|
||||||
def split_sentences(text: str) -> list[str]:
|
def split_sentences(text: str) -> list[str]:
|
||||||
"""Teilt Text in Sätze auf."""
|
"""Teilt Text in Sätze auf unter Berücksichtigung deutscher Interpunktion."""
|
||||||
text = _WS.sub(' ', text.strip())
|
text = _WS.sub(' ', text.strip())
|
||||||
if not text: return []
|
if not text: return []
|
||||||
|
# Splittet bei Punkt, Ausrufezeichen oder Fragezeichen, gefolgt von Leerzeichen und Großbuchstabe
|
||||||
return [p.strip() for p in _SENT_SPLIT.split(text) if p.strip()]
|
return [p.strip() for p in _SENT_SPLIT.split(text) if p.strip()]
|
||||||
|
|
||||||
def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
|
def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
|
||||||
"""Zerlegt Text in logische Einheiten."""
|
"""Zerlegt Text in logische Einheiten (RawBlocks), inklusive H1-H6."""
|
||||||
blocks = []
|
blocks = []
|
||||||
h1_title = "Dokument"; section_path = "/"; current_h2 = None
|
h1_title = "Dokument"
|
||||||
|
section_path = "/"
|
||||||
|
current_section_title = None
|
||||||
|
|
||||||
|
# Frontmatter entfernen
|
||||||
fm, text_without_fm = extract_frontmatter_from_text(md_text)
|
fm, text_without_fm = extract_frontmatter_from_text(md_text)
|
||||||
|
|
||||||
|
# H1 für Note-Titel extrahieren (Metadaten-Zweck)
|
||||||
h1_match = re.search(r'^#\s+(.*)', text_without_fm, re.MULTILINE)
|
h1_match = re.search(r'^#\s+(.*)', text_without_fm, re.MULTILINE)
|
||||||
if h1_match: h1_title = h1_match.group(1).strip()
|
if h1_match:
|
||||||
|
h1_title = h1_match.group(1).strip()
|
||||||
|
|
||||||
lines = text_without_fm.split('\n')
|
lines = text_without_fm.split('\n')
|
||||||
buffer = []
|
buffer = []
|
||||||
|
|
||||||
for line in lines:
|
for line in lines:
|
||||||
stripped = line.strip()
|
stripped = line.strip()
|
||||||
|
|
||||||
# H1 ignorieren (ist Doc Title)
|
# Heading-Erkennung (H1 bis H6)
|
||||||
if stripped.startswith('# '):
|
heading_match = re.match(r'^(#{1,6})\s+(.*)', stripped)
|
||||||
continue
|
|
||||||
|
|
||||||
# Generische Heading-Erkennung (H2 bis H6) für flexible Split-Levels
|
|
||||||
heading_match = re.match(r'^(#{2,6})\s+(.*)', stripped)
|
|
||||||
if heading_match:
|
if heading_match:
|
||||||
# Buffer leeren (vorherigen Text abschließen)
|
# Vorherigen Text-Block abschließen
|
||||||
if buffer:
|
if buffer:
|
||||||
content = "\n".join(buffer).strip()
|
content = "\n".join(buffer).strip()
|
||||||
if content: blocks.append(RawBlock("paragraph", content, None, section_path, current_h2))
|
if content:
|
||||||
|
blocks.append(RawBlock("paragraph", content, None, section_path, current_section_title))
|
||||||
buffer = []
|
buffer = []
|
||||||
|
|
||||||
level = len(heading_match.group(1))
|
level = len(heading_match.group(1))
|
||||||
title = heading_match.group(2).strip()
|
title = heading_match.group(2).strip()
|
||||||
|
|
||||||
# Pfad-Logik: H2 setzt den Haupt-Pfad
|
# Pfad- und Titel-Update für die Metadaten der folgenden Blöcke
|
||||||
if level == 2:
|
if level == 1:
|
||||||
current_h2 = title
|
current_section_title = title; section_path = "/"
|
||||||
section_path = f"/{current_h2}"
|
elif level == 2:
|
||||||
# Bei H3+ bleibt der section_path beim Parent, aber das Level wird korrekt gesetzt
|
current_section_title = title; section_path = f"/{current_section_title}"
|
||||||
|
|
||||||
blocks.append(RawBlock("heading", stripped, level, section_path, current_h2))
|
# Die Überschrift selbst als regulären Block hinzufügen
|
||||||
|
blocks.append(RawBlock("heading", stripped, level, section_path, current_section_title))
|
||||||
elif not stripped:
|
continue
|
||||||
|
|
||||||
|
# Trenner (---) oder Leerzeilen beenden Blöcke, außer innerhalb von Callouts
|
||||||
|
if (not stripped or stripped == "---") and not line.startswith('>'):
|
||||||
if buffer:
|
if buffer:
|
||||||
content = "\n".join(buffer).strip()
|
content = "\n".join(buffer).strip()
|
||||||
if content: blocks.append(RawBlock("paragraph", content, None, section_path, current_h2))
|
if content:
|
||||||
|
blocks.append(RawBlock("paragraph", content, None, section_path, current_section_title))
|
||||||
buffer = []
|
buffer = []
|
||||||
|
if stripped == "---":
|
||||||
|
blocks.append(RawBlock("separator", "---", None, section_path, current_section_title))
|
||||||
else:
|
else:
|
||||||
buffer.append(line)
|
buffer.append(line)
|
||||||
|
|
||||||
if buffer:
|
if buffer:
|
||||||
content = "\n".join(buffer).strip()
|
content = "\n".join(buffer).strip()
|
||||||
if content: blocks.append(RawBlock("paragraph", content, None, section_path, current_h2))
|
if content:
|
||||||
|
blocks.append(RawBlock("paragraph", content, None, section_path, current_section_title))
|
||||||
|
|
||||||
return blocks, h1_title
|
return blocks, h1_title
|
||||||
|
|
||||||
def parse_edges_robust(text: str) -> Set[str]:
|
def parse_edges_robust(text: str) -> Set[str]:
|
||||||
"""Extrahiert Kanten-Kandidaten (Wikilinks, Callouts)."""
|
"""Extrahiert Kanten-Kandidaten aus Wikilinks und Callouts."""
|
||||||
found_edges = set()
|
found_edges = set()
|
||||||
|
# 1. Wikilinks [[rel:kind|target]]
|
||||||
inlines = re.findall(r'\[\[rel:([^\|\]]+)\|?([^\]]*)\]\]', text)
|
inlines = re.findall(r'\[\[rel:([^\|\]]+)\|?([^\]]*)\]\]', text)
|
||||||
for kind, target in inlines:
|
for kind, target in inlines:
|
||||||
k = kind.strip().lower()
|
k = kind.strip().lower()
|
||||||
t = target.strip()
|
t = target.strip()
|
||||||
if k and t: found_edges.add(f"{k}:{t}")
|
if k and t: found_edges.add(f"{k}:{t}")
|
||||||
|
|
||||||
|
# 2. Callout Edges > [!edge] kind
|
||||||
lines = text.split('\n')
|
lines = text.split('\n')
|
||||||
current_edge_type = None
|
current_edge_type = None
|
||||||
for line in lines:
|
for line in lines:
|
||||||
|
|
@ -81,13 +100,16 @@ def parse_edges_robust(text: str) -> Set[str]:
|
||||||
callout_match = re.match(r'>\s*\[!edge\]\s*([^:\s]+)', stripped)
|
callout_match = re.match(r'>\s*\[!edge\]\s*([^:\s]+)', stripped)
|
||||||
if callout_match:
|
if callout_match:
|
||||||
current_edge_type = callout_match.group(1).strip().lower()
|
current_edge_type = callout_match.group(1).strip().lower()
|
||||||
|
# Links in der gleichen Zeile des Callouts
|
||||||
links = re.findall(r'\[\[([^\]]+)\]\]', stripped)
|
links = re.findall(r'\[\[([^\]]+)\]\]', stripped)
|
||||||
for l in links:
|
for l in links:
|
||||||
if "rel:" not in l: found_edges.add(f"{current_edge_type}:{l}")
|
if "rel:" not in l: found_edges.add(f"{current_edge_type}:{l}")
|
||||||
continue
|
continue
|
||||||
|
# Links in Folgezeilen des Callouts
|
||||||
if current_edge_type and stripped.startswith('>'):
|
if current_edge_type and stripped.startswith('>'):
|
||||||
links = re.findall(r'\[\[([^\]]+)\]\]', stripped)
|
links = re.findall(r'\[\[([^\]]+)\]\]', stripped)
|
||||||
for l in links:
|
for l in links:
|
||||||
if "rel:" not in l: found_edges.add(f"{current_edge_type}:{l}")
|
if "rel:" not in l: found_edges.add(f"{current_edge_type}:{l}")
|
||||||
elif not stripped.startswith('>'): current_edge_type = None
|
elif not stripped.startswith('>'):
|
||||||
|
current_edge_type = None
|
||||||
return found_edges
|
return found_edges
|
||||||
|
|
@ -1,7 +1,8 @@
|
||||||
"""
|
"""
|
||||||
FILE: app/core/chunking/chunking_processor.py
|
FILE: app/core/chunking/chunking_processor.py
|
||||||
DESCRIPTION: Der zentrale Orchestrator für das Chunking-System.
|
DESCRIPTION: Der zentrale Orchestrator für das Chunking-System.
|
||||||
AUDIT v3.3.3: Wiederherstellung der "Gold-Standard" Qualität.
|
AUDIT v3.3.4: Wiederherstellung der "Gold-Standard" Qualität.
|
||||||
|
- Fix: Synchronisierung der Parameter (context_prefix) für alle Strategien.
|
||||||
- Integriert physikalische Kanten-Injektion (Propagierung).
|
- Integriert physikalische Kanten-Injektion (Propagierung).
|
||||||
- Stellt H1-Kontext-Fenster sicher.
|
- Stellt H1-Kontext-Fenster sicher.
|
||||||
- Baut den Candidate-Pool für die WP-15b Ingestion auf.
|
- Baut den Candidate-Pool für die WP-15b Ingestion auf.
|
||||||
|
|
@ -30,16 +31,19 @@ async def assemble_chunks(note_id: str, md_text: str, note_type: str, config: Op
|
||||||
fm, body_text = extract_frontmatter_from_text(md_text)
|
fm, body_text = extract_frontmatter_from_text(md_text)
|
||||||
blocks, doc_title = parse_blocks(md_text)
|
blocks, doc_title = parse_blocks(md_text)
|
||||||
|
|
||||||
# Vorbereitung des H1-Präfix für die Embedding-Fenster
|
# Vorbereitung des H1-Präfix für die Embedding-Fenster (Breadcrumbs)
|
||||||
h1_prefix = f"# {doc_title}" if doc_title else ""
|
h1_prefix = f"# {doc_title}" if doc_title else ""
|
||||||
|
|
||||||
# 2. Anwendung der Splitting-Strategie
|
# 2. Anwendung der Splitting-Strategie
|
||||||
# Wir übergeben den Dokument-Titel/Präfix für die Window-Bildung.
|
# Alle Strategien nutzen nun einheitlich context_prefix für die Window-Bildung.
|
||||||
if config.get("strategy") == "by_heading":
|
if config.get("strategy") == "by_heading":
|
||||||
chunks = await asyncio.to_thread(strategy_by_heading, blocks, config, note_id, doc_title)
|
chunks = await asyncio.to_thread(
|
||||||
|
strategy_by_heading, blocks, config, note_id, context_prefix=h1_prefix
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
# sliding_window nutzt nun den context_prefix für das Window-Feld.
|
chunks = await asyncio.to_thread(
|
||||||
chunks = await asyncio.to_thread(strategy_sliding_window, blocks, config, note_id, context_prefix=h1_prefix)
|
strategy_sliding_window, blocks, config, note_id, context_prefix=h1_prefix
|
||||||
|
)
|
||||||
|
|
||||||
if not chunks:
|
if not chunks:
|
||||||
return []
|
return []
|
||||||
|
|
@ -52,6 +56,7 @@ async def assemble_chunks(note_id: str, md_text: str, note_type: str, config: Op
|
||||||
# Zuerst die explizit im Text vorhandenen Kanten sammeln.
|
# Zuerst die explizit im Text vorhandenen Kanten sammeln.
|
||||||
for ch in chunks:
|
for ch in chunks:
|
||||||
# Wir extrahieren aus dem bereits (durch Propagation) angereicherten Text.
|
# Wir extrahieren aus dem bereits (durch Propagation) angereicherten Text.
|
||||||
|
# ch.candidate_pool wird im Modell-Konstruktor als leere Liste initialisiert.
|
||||||
for e_str in parse_edges_robust(ch.text):
|
for e_str in parse_edges_robust(ch.text):
|
||||||
parts = e_str.split(':', 1)
|
parts = e_str.split(':', 1)
|
||||||
if len(parts) == 2:
|
if len(parts) == 2:
|
||||||
|
|
@ -71,7 +76,7 @@ async def assemble_chunks(note_id: str, md_text: str, note_type: str, config: Op
|
||||||
parts = e_str.split(':', 1)
|
parts = e_str.split(':', 1)
|
||||||
if len(parts) == 2:
|
if len(parts) == 2:
|
||||||
k, t = parts
|
k, t = parts
|
||||||
# Diese Kanten werden als "Global Pool" markiert für die spätere KI-Prüfung.
|
# Diese Kanten werden als "global_pool" markiert für die spätere KI-Prüfung.
|
||||||
for ch in chunks:
|
for ch in chunks:
|
||||||
ch.candidate_pool.append({"kind": k, "to": t, "provenance": "global_pool"})
|
ch.candidate_pool.append({"kind": k, "to": t, "provenance": "global_pool"})
|
||||||
|
|
||||||
|
|
@ -80,6 +85,7 @@ async def assemble_chunks(note_id: str, md_text: str, note_type: str, config: Op
|
||||||
seen = set()
|
seen = set()
|
||||||
unique = []
|
unique = []
|
||||||
for c in ch.candidate_pool:
|
for c in ch.candidate_pool:
|
||||||
|
# Eindeutigkeit über Typ, Ziel und Herkunft (Provenance)
|
||||||
key = (c["kind"], c["to"], c["provenance"])
|
key = (c["kind"], c["to"], c["provenance"])
|
||||||
if key not in seen:
|
if key not in seen:
|
||||||
seen.add(key)
|
seen.add(key)
|
||||||
|
|
|
||||||
|
|
@ -1,9 +1,8 @@
|
||||||
"""
|
"""
|
||||||
FILE: app/core/chunking/chunking_propagation.py
|
FILE: app/core/chunking/chunking_propagation.py
|
||||||
DESCRIPTION: Injiziert Sektions-Kanten physisch in den Text (Embedding-Enrichment).
|
DESCRIPTION: Injiziert Sektions-Kanten physisch in den Text (Embedding-Enrichment).
|
||||||
Stellt die "Gold-Standard"-Qualität von v3.1.0 wieder her.
|
Fix v3.3.6: Nutzt robustes Parsing zur Erkennung vorhandener Kanten,
|
||||||
VERSION: 3.3.1
|
um Dopplungen direkt hinter [!edge] Callouts format-agnostisch zu verhindern.
|
||||||
STATUS: Active
|
|
||||||
"""
|
"""
|
||||||
from typing import List, Dict, Set
|
from typing import List, Dict, Set
|
||||||
from .chunking_models import Chunk
|
from .chunking_models import Chunk
|
||||||
|
|
@ -12,7 +11,7 @@ from .chunking_parser import parse_edges_robust
|
||||||
def propagate_section_edges(chunks: List[Chunk]) -> List[Chunk]:
|
def propagate_section_edges(chunks: List[Chunk]) -> List[Chunk]:
|
||||||
"""
|
"""
|
||||||
Sammelt Kanten pro Sektion und schreibt sie hart in den Text und das Window.
|
Sammelt Kanten pro Sektion und schreibt sie hart in den Text und das Window.
|
||||||
Dies ist essenziell für die Vektorisierung der Beziehungen.
|
Verhindert Dopplungen, wenn Kanten bereits via [!edge] Callout vorhanden sind.
|
||||||
"""
|
"""
|
||||||
# 1. Sammeln: Alle expliziten Kanten pro Sektions-Pfad aggregieren
|
# 1. Sammeln: Alle expliziten Kanten pro Sektions-Pfad aggregieren
|
||||||
section_map: Dict[str, Set[str]] = {} # path -> set(kind:target)
|
section_map: Dict[str, Set[str]] = {} # path -> set(kind:target)
|
||||||
|
|
@ -36,21 +35,28 @@ def propagate_section_edges(chunks: List[Chunk]) -> List[Chunk]:
|
||||||
if not edges_to_add:
|
if not edges_to_add:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# Vorhandene Kanten (Typ:Ziel) in DIESEM Chunk ermitteln,
|
||||||
|
# um Dopplungen (z.B. durch Callouts) zu vermeiden.
|
||||||
|
existing_edges = parse_edges_robust(ch.text)
|
||||||
|
|
||||||
injections = []
|
injections = []
|
||||||
for e_str in edges_to_add:
|
# Sortierung für deterministische Ergebnisse
|
||||||
|
for e_str in sorted(list(edges_to_add)):
|
||||||
|
# Wenn die Kante (Typ + Ziel) bereits vorhanden ist (egal welches Format),
|
||||||
|
# überspringen wir die Injektion für diesen Chunk.
|
||||||
|
if e_str in existing_edges:
|
||||||
|
continue
|
||||||
|
|
||||||
kind, target = e_str.split(':', 1)
|
kind, target = e_str.split(':', 1)
|
||||||
# Nur injizieren, wenn die Kante nicht bereits im Text steht
|
injections.append(f"[[rel:{kind}|{target}]]")
|
||||||
token = f"[[rel:{kind}|{target}]]"
|
|
||||||
if token not in ch.text:
|
|
||||||
injections.append(token)
|
|
||||||
|
|
||||||
if injections:
|
if injections:
|
||||||
# Physische Anreicherung (Der v3.1.0 Qualitäts-Fix)
|
# Physische Anreicherung
|
||||||
# Triple-Newline für saubere Trennung im Embedding-Fenster
|
# Triple-Newline für saubere Trennung im Embedding-Fenster
|
||||||
block = "\n\n\n" + " ".join(injections)
|
block = "\n\n\n" + " ".join(injections)
|
||||||
ch.text += block
|
ch.text += block
|
||||||
|
|
||||||
# ENTSCHEIDEND: Auch ins Window schreiben, da Qdrant hier sucht!
|
# Auch ins Window schreiben, da Qdrant hier sucht!
|
||||||
if ch.window:
|
if ch.window:
|
||||||
ch.window += block
|
ch.window += block
|
||||||
else:
|
else:
|
||||||
|
|
|
||||||
|
|
@ -1,142 +1,166 @@
|
||||||
"""
|
"""
|
||||||
FILE: app/core/chunking/chunking_strategies.py
|
FILE: app/core/chunking/chunking_strategies.py
|
||||||
DESCRIPTION: Mathematische Splitting-Strategien.
|
DESCRIPTION: Strategien für atomares Sektions-Chunking v3.9.9.
|
||||||
AUDIT v3.3.2: 100% Konformität zur 'by_heading' Spezifikation.
|
Implementiert das 'Pack-and-Carry-Over' Verfahren nach Regel 1-3.
|
||||||
- Implementiert Hybrid-Safety-Net (Sliding Window für Übergrößen).
|
- Keine redundante Kanten-Injektion.
|
||||||
- Breadcrumb-Kontext im Window (H1 > H2).
|
- Strikte Einhaltung von Sektionsgrenzen via Look-Ahead.
|
||||||
- Sliding Window mit H1-Kontext (Gold-Standard v3.1.0).
|
- Fix: Synchronisierung der Parameter mit dem Orchestrator (context_prefix).
|
||||||
"""
|
"""
|
||||||
from typing import List, Dict, Any, Optional
|
from typing import List, Dict, Any, Optional
|
||||||
from .chunking_models import RawBlock, Chunk
|
from .chunking_models import RawBlock, Chunk
|
||||||
from .chunking_utils import estimate_tokens
|
from .chunking_utils import estimate_tokens
|
||||||
from .chunking_parser import split_sentences
|
from .chunking_parser import split_sentences
|
||||||
|
|
||||||
def _create_context_win(doc_title: str, sec_title: Optional[str], text: str) -> str:
|
def _create_win(context_prefix: str, sec_title: Optional[str], text: str) -> str:
|
||||||
"""Baut den Breadcrumb-Kontext für das Embedding-Fenster."""
|
"""Baut den Breadcrumb-Kontext für das Embedding-Fenster."""
|
||||||
parts = []
|
parts = [context_prefix] if context_prefix else []
|
||||||
if doc_title: parts.append(doc_title)
|
# Verhindert Dopplung, falls der Context-Prefix (H1) bereits den Sektionsnamen enthält
|
||||||
if sec_title and sec_title != doc_title: parts.append(sec_title)
|
if sec_title and f"# {sec_title}" != context_prefix and sec_title not in (context_prefix or ""):
|
||||||
|
parts.append(sec_title)
|
||||||
prefix = " > ".join(parts)
|
prefix = " > ".join(parts)
|
||||||
return f"{prefix}\n{text}".strip() if prefix else text
|
return f"{prefix}\n{text}".strip() if prefix else text
|
||||||
|
|
||||||
def strategy_sliding_window(blocks: List[RawBlock],
|
def strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id: str, context_prefix: str = "") -> List[Chunk]:
|
||||||
config: Dict[str, Any],
|
|
||||||
note_id: str,
|
|
||||||
context_prefix: str = "") -> List[Chunk]:
|
|
||||||
"""
|
"""
|
||||||
Fasst Blöcke zusammen und schneidet bei 'target' Tokens.
|
Universelle Heading-Strategie mit Carry-Over Logik.
|
||||||
Ignoriert H2-Überschriften beim Splitting, um Kontext zu wahren.
|
Synchronisiert auf context_prefix für Kompatibilität mit dem Orchestrator.
|
||||||
"""
|
|
||||||
target = config.get("target", 400)
|
|
||||||
max_tokens = config.get("max", 600)
|
|
||||||
overlap_val = config.get("overlap", (50, 80))
|
|
||||||
overlap = sum(overlap_val) // 2 if isinstance(overlap_val, tuple) else overlap_val
|
|
||||||
|
|
||||||
chunks: List[Chunk] = []
|
|
||||||
buf: List[RawBlock] = []
|
|
||||||
|
|
||||||
def _add(txt, sec, path):
|
|
||||||
idx = len(chunks)
|
|
||||||
# H1-Kontext Präfix für das Window-Feld
|
|
||||||
win = f"{context_prefix}\n{txt}".strip() if context_prefix else txt
|
|
||||||
chunks.append(Chunk(
|
|
||||||
id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx,
|
|
||||||
text=txt, window=win, token_count=estimate_tokens(txt),
|
|
||||||
section_title=sec, section_path=path,
|
|
||||||
neighbors_prev=None, neighbors_next=None
|
|
||||||
))
|
|
||||||
|
|
||||||
def flush():
|
|
||||||
nonlocal buf
|
|
||||||
if not buf: return
|
|
||||||
text_body = "\n\n".join([b.text for b in buf])
|
|
||||||
sec_title = buf[-1].section_title; sec_path = buf[-1].section_path
|
|
||||||
|
|
||||||
if estimate_tokens(text_body) <= max_tokens:
|
|
||||||
_add(text_body, sec_title, sec_path)
|
|
||||||
else:
|
|
||||||
sents = split_sentences(text_body); cur_sents = []; cur_len = 0
|
|
||||||
for s in sents:
|
|
||||||
slen = estimate_tokens(s)
|
|
||||||
if cur_len + slen > target and cur_sents:
|
|
||||||
_add(" ".join(cur_sents), sec_title, sec_path)
|
|
||||||
ov_s = []; ov_l = 0
|
|
||||||
for os in reversed(cur_sents):
|
|
||||||
if ov_l + estimate_tokens(os) < overlap:
|
|
||||||
ov_s.insert(0, os); ov_l += estimate_tokens(os)
|
|
||||||
else: break
|
|
||||||
cur_sents = list(ov_s); cur_sents.append(s); cur_len = ov_l + slen
|
|
||||||
else:
|
|
||||||
cur_sents.append(s); cur_len += slen
|
|
||||||
if cur_sents:
|
|
||||||
_add(" ".join(cur_sents), sec_title, sec_path)
|
|
||||||
buf = []
|
|
||||||
|
|
||||||
for b in blocks:
|
|
||||||
# H2-Überschriften werden ignoriert, um den Zusammenhang zu wahren
|
|
||||||
if b.kind == "heading": continue
|
|
||||||
if estimate_tokens("\n\n".join([x.text for x in buf])) + estimate_tokens(b.text) >= target:
|
|
||||||
flush()
|
|
||||||
buf.append(b)
|
|
||||||
flush()
|
|
||||||
return chunks
|
|
||||||
|
|
||||||
def strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id: str, doc_title: str = "") -> List[Chunk]:
|
|
||||||
"""
|
|
||||||
Splittet Text basierend auf Markdown-Überschriften mit Hybrid-Safety-Net.
|
|
||||||
"""
|
"""
|
||||||
|
smart_edge = config.get("enable_smart_edge_allocation", True)
|
||||||
strict = config.get("strict_heading_split", False)
|
strict = config.get("strict_heading_split", False)
|
||||||
target = config.get("target", 400)
|
target = config.get("target", 400)
|
||||||
max_tokens = config.get("max", 600)
|
max_tokens = config.get("max", 600)
|
||||||
split_level = config.get("split_level", 2)
|
split_level = config.get("split_level", 2)
|
||||||
overlap = sum(config.get("overlap", (50, 80))) // 2
|
overlap_cfg = config.get("overlap", (50, 80))
|
||||||
|
overlap = sum(overlap_cfg) // 2 if isinstance(overlap_cfg, (list, tuple)) else overlap_cfg
|
||||||
|
|
||||||
chunks: List[Chunk] = []
|
chunks: List[Chunk] = []
|
||||||
buf: List[str] = []
|
|
||||||
cur_tokens = 0
|
|
||||||
|
|
||||||
def _add_to_chunks(txt, title, path):
|
def _emit(txt, title, path):
|
||||||
|
"""Schreibt den finalen Chunk ohne Text-Modifikationen."""
|
||||||
idx = len(chunks)
|
idx = len(chunks)
|
||||||
win = _create_context_win(doc_title, title, txt)
|
win = _create_win(context_prefix, title, txt)
|
||||||
chunks.append(Chunk(
|
chunks.append(Chunk(
|
||||||
id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx,
|
id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx,
|
||||||
text=txt, window=win, token_count=estimate_tokens(txt),
|
text=txt, window=win, token_count=estimate_tokens(txt),
|
||||||
section_title=title, section_path=path,
|
section_title=title, section_path=path, neighbors_prev=None, neighbors_next=None
|
||||||
neighbors_prev=None, neighbors_next=None
|
|
||||||
))
|
))
|
||||||
|
|
||||||
def _flush(title, path):
|
# --- SCHRITT 1: Gruppierung in atomare Sektions-Einheiten ---
|
||||||
nonlocal buf, cur_tokens
|
sections: List[Dict[str, Any]] = []
|
||||||
if not buf: return
|
curr_blocks = []
|
||||||
full_text = "\n\n".join(buf)
|
|
||||||
if estimate_tokens(full_text) <= max_tokens:
|
|
||||||
_add_to_chunks(full_text, title, path)
|
|
||||||
else:
|
|
||||||
sents = split_sentences(full_text); cur_sents = []; sub_len = 0
|
|
||||||
for s in sents:
|
|
||||||
slen = estimate_tokens(s)
|
|
||||||
if sub_len + slen > target and cur_sents:
|
|
||||||
_add_to_chunks(" ".join(cur_sents), title, path)
|
|
||||||
ov_s = []; ov_l = 0
|
|
||||||
for os in reversed(cur_sents):
|
|
||||||
if ov_l + estimate_tokens(os) < overlap:
|
|
||||||
ov_s.insert(0, os); ov_l += estimate_tokens(os)
|
|
||||||
else: break
|
|
||||||
cur_sents = list(ov_s); cur_sents.append(s); sub_len = ov_l + slen
|
|
||||||
else: cur_sents.append(s); sub_len += slen
|
|
||||||
if cur_sents: _add_to_chunks(" ".join(cur_sents), title, path)
|
|
||||||
buf = []; cur_tokens = 0
|
|
||||||
|
|
||||||
for b in blocks:
|
for b in blocks:
|
||||||
if b.kind == "heading":
|
if b.kind == "heading" and b.level <= split_level:
|
||||||
if b.level < split_level: _flush(b.section_title, b.section_path)
|
if curr_blocks:
|
||||||
elif b.level == split_level:
|
sections.append({
|
||||||
if strict or cur_tokens >= target: _flush(b.section_title, b.section_path)
|
"text": "\n\n".join([x.text for x in curr_blocks]),
|
||||||
|
"meta": curr_blocks[0],
|
||||||
|
"is_empty": len(curr_blocks) == 1 and curr_blocks[0].kind == "heading"
|
||||||
|
})
|
||||||
|
curr_blocks = [b]
|
||||||
|
else:
|
||||||
|
curr_blocks.append(b)
|
||||||
|
if curr_blocks:
|
||||||
|
sections.append({
|
||||||
|
"text": "\n\n".join([x.text for x in curr_blocks]),
|
||||||
|
"meta": curr_blocks[0],
|
||||||
|
"is_empty": len(curr_blocks) == 1 and curr_blocks[0].kind == "heading"
|
||||||
|
})
|
||||||
|
|
||||||
|
# --- SCHRITT 2: Verarbeitung der Queue ---
|
||||||
|
queue = list(sections)
|
||||||
|
current_chunk_text = ""
|
||||||
|
current_meta = {"title": None, "path": "/"}
|
||||||
|
|
||||||
|
# Bestimmung des Modus: Hard-Split wenn smart_edge=False ODER strict=True
|
||||||
|
is_hard_split_mode = (not smart_edge) or (strict)
|
||||||
|
|
||||||
|
while queue:
|
||||||
|
item = queue.pop(0)
|
||||||
|
item_text = item["text"]
|
||||||
|
|
||||||
|
# Initialisierung für neuen Chunk
|
||||||
|
if not current_chunk_text:
|
||||||
|
current_meta["title"] = item["meta"].section_title
|
||||||
|
current_meta["path"] = item["meta"].section_path
|
||||||
|
|
||||||
|
# FALL A: HARD SPLIT MODUS
|
||||||
|
if is_hard_split_mode:
|
||||||
|
# Leere Überschriften (z.B. H1 direkt vor H2) verbleiben am nächsten Chunk
|
||||||
|
if item.get("is_empty", False) and queue:
|
||||||
|
current_chunk_text = (current_chunk_text + "\n\n" + item_text).strip()
|
||||||
|
continue
|
||||||
|
|
||||||
|
combined = (current_chunk_text + "\n\n" + item_text).strip()
|
||||||
|
# Wenn durch Verschmelzung das Limit gesprengt würde, vorher flashen
|
||||||
|
if estimate_tokens(combined) > max_tokens and current_chunk_text:
|
||||||
|
_emit(current_chunk_text, current_meta["title"], current_meta["path"])
|
||||||
|
current_chunk_text = item_text
|
||||||
|
else:
|
||||||
|
current_chunk_text = combined
|
||||||
|
|
||||||
|
# Im Hard-Split wird nach jeder Sektion geflasht
|
||||||
|
_emit(current_chunk_text, current_meta["title"], current_meta["path"])
|
||||||
|
current_chunk_text = ""
|
||||||
continue
|
continue
|
||||||
bt = estimate_tokens(b.text)
|
|
||||||
if cur_tokens + bt > max_tokens and buf: _flush(b.section_title, b.section_path)
|
# FALL B: SMART MODE (Regel 1-3)
|
||||||
buf.append(b.text); cur_tokens += bt
|
combined_text = (current_chunk_text + "\n\n" + item_text).strip() if current_chunk_text else item_text
|
||||||
|
combined_est = estimate_tokens(combined_text)
|
||||||
|
|
||||||
|
if combined_est <= max_tokens:
|
||||||
|
# Regel 1 & 2: Passt rein laut Schätzung -> Aufnehmen
|
||||||
|
current_chunk_text = combined_text
|
||||||
|
else:
|
||||||
|
if current_chunk_text:
|
||||||
|
# Regel 2: Flashen an Sektionsgrenze, Item zurücklegen
|
||||||
|
_emit(current_chunk_text, current_meta["title"], current_meta["path"])
|
||||||
|
current_chunk_text = ""
|
||||||
|
queue.insert(0, item)
|
||||||
|
else:
|
||||||
|
# Regel 3: Einzelne Sektion zu groß -> Smart Zerlegung
|
||||||
|
sents = split_sentences(item_text)
|
||||||
|
header_prefix = item["meta"].text if item["meta"].kind == "heading" else ""
|
||||||
|
|
||||||
|
take_sents = []; take_len = 0
|
||||||
|
while sents:
|
||||||
|
s = sents.pop(0); slen = estimate_tokens(s)
|
||||||
|
if take_len + slen > target and take_sents:
|
||||||
|
sents.insert(0, s); break
|
||||||
|
take_sents.append(s); take_len += slen
|
||||||
|
|
||||||
|
_emit(" ".join(take_sents), current_meta["title"], current_meta["path"])
|
||||||
|
|
||||||
|
if sents:
|
||||||
|
remainder = " ".join(sents)
|
||||||
|
# Kontext-Erhalt: Überschrift für den Rest wiederholen
|
||||||
|
if header_prefix and not remainder.startswith(header_prefix):
|
||||||
|
remainder = header_prefix + "\n\n" + remainder
|
||||||
|
# Carry-Over: Rest wird vorne in die Queue geschoben
|
||||||
|
queue.insert(0, {"text": remainder, "meta": item["meta"], "is_split": True})
|
||||||
|
|
||||||
|
if current_chunk_text:
|
||||||
|
_emit(current_chunk_text, current_meta["title"], current_meta["path"])
|
||||||
|
|
||||||
|
return chunks
|
||||||
|
|
||||||
|
def strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], note_id: str, context_prefix: str = "") -> List[Chunk]:
|
||||||
|
"""Standard-Sliding-Window für flache Texte ohne Sektionsfokus."""
|
||||||
|
target = config.get("target", 400); max_tokens = config.get("max", 600)
|
||||||
|
chunks: List[Chunk] = []; buf: List[RawBlock] = []
|
||||||
|
|
||||||
|
for b in blocks:
|
||||||
|
b_tokens = estimate_tokens(b.text)
|
||||||
|
curr_tokens = sum(estimate_tokens(x.text) for x in buf) if buf else 0
|
||||||
|
if curr_tokens + b_tokens > max_tokens and buf:
|
||||||
|
txt = "\n\n".join([x.text for x in buf]); idx = len(chunks)
|
||||||
|
win = _create_win(context_prefix, buf[0].section_title, txt)
|
||||||
|
chunks.append(Chunk(id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx, text=txt, window=win, token_count=curr_tokens, section_title=buf[0].section_title, section_path=buf[0].section_path, neighbors_prev=None, neighbors_next=None))
|
||||||
|
buf = []
|
||||||
|
buf.append(b)
|
||||||
|
|
||||||
if buf:
|
if buf:
|
||||||
last_b = blocks[-1] if blocks else None
|
txt = "\n\n".join([x.text for x in buf]); idx = len(chunks)
|
||||||
_flush(last_b.section_title if last_b else None, last_b.section_path if last_b else "/")
|
win = _create_win(context_prefix, buf[0].section_title, txt)
|
||||||
|
chunks.append(Chunk(id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx, text=txt, window=win, token_count=estimate_tokens(txt), section_title=buf[0].section_title, section_path=buf[0].section_path, neighbors_prev=None, neighbors_next=None))
|
||||||
|
|
||||||
return chunks
|
return chunks
|
||||||
|
|
@ -3,7 +3,7 @@ FILE: app/core/database/qdrant.py
|
||||||
DESCRIPTION: Qdrant-Client Factory und Schema-Management.
|
DESCRIPTION: Qdrant-Client Factory und Schema-Management.
|
||||||
Erstellt Collections und Payload-Indizes.
|
Erstellt Collections und Payload-Indizes.
|
||||||
MODULARISIERUNG: Verschoben in das database-Paket für WP-14.
|
MODULARISIERUNG: Verschoben in das database-Paket für WP-14.
|
||||||
VERSION: 2.2.1
|
VERSION: 2.2.2 (WP-Fix: Index für target_section)
|
||||||
STATUS: Active
|
STATUS: Active
|
||||||
DEPENDENCIES: qdrant_client, dataclasses, os
|
DEPENDENCIES: qdrant_client, dataclasses, os
|
||||||
"""
|
"""
|
||||||
|
|
@ -124,7 +124,7 @@ def ensure_payload_indexes(client: QdrantClient, prefix: str) -> None:
|
||||||
Stellt sicher, dass alle benötigten Payload-Indizes für die Suche existieren.
|
Stellt sicher, dass alle benötigten Payload-Indizes für die Suche existieren.
|
||||||
- notes: note_id, type, title, updated, tags
|
- notes: note_id, type, title, updated, tags
|
||||||
- chunks: note_id, chunk_id, index, type, tags
|
- chunks: note_id, chunk_id, index, type, tags
|
||||||
- edges: note_id, kind, scope, source_id, target_id, chunk_id
|
- edges: note_id, kind, scope, source_id, target_id, chunk_id, target_section
|
||||||
"""
|
"""
|
||||||
notes, chunks, edges = collection_names(prefix)
|
notes, chunks, edges = collection_names(prefix)
|
||||||
|
|
||||||
|
|
@ -156,6 +156,8 @@ def ensure_payload_indexes(client: QdrantClient, prefix: str) -> None:
|
||||||
("source_id", rest.PayloadSchemaType.KEYWORD),
|
("source_id", rest.PayloadSchemaType.KEYWORD),
|
||||||
("target_id", rest.PayloadSchemaType.KEYWORD),
|
("target_id", rest.PayloadSchemaType.KEYWORD),
|
||||||
("chunk_id", rest.PayloadSchemaType.KEYWORD),
|
("chunk_id", rest.PayloadSchemaType.KEYWORD),
|
||||||
|
# NEU: Index für Section-Links (WP-15b)
|
||||||
|
("target_section", rest.PayloadSchemaType.KEYWORD),
|
||||||
]:
|
]:
|
||||||
_ensure_index(client, edges, field, schema)
|
_ensure_index(client, edges, field, schema)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,10 +1,10 @@
|
||||||
"""
|
"""
|
||||||
FILE: app/core/database/qdrant_points.py
|
FILE: app/core/database/qdrant_points.py
|
||||||
DESCRIPTION: Object-Mapper für Qdrant. Konvertiert JSON-Payloads (Notes, Chunks, Edges) in PointStructs und generiert deterministische UUIDs.
|
DESCRIPTION: Object-Mapper für Qdrant. Konvertiert JSON-Payloads (Notes, Chunks, Edges) in PointStructs und generiert deterministische UUIDs.
|
||||||
VERSION: 1.5.0
|
VERSION: 1.5.1 (WP-Fix: Explicit Target Section Support)
|
||||||
STATUS: Active
|
STATUS: Active
|
||||||
DEPENDENCIES: qdrant_client, uuid, os
|
DEPENDENCIES: qdrant_client, uuid, os
|
||||||
LAST_ANALYSIS: 2025-12-15
|
LAST_ANALYSIS: 2025-12-29
|
||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
import os
|
import os
|
||||||
|
|
@ -46,16 +46,25 @@ def points_for_chunks(prefix: str, chunk_payloads: List[dict], vectors: List[Lis
|
||||||
return chunks_col, points
|
return chunks_col, points
|
||||||
|
|
||||||
def _normalize_edge_payload(pl: dict) -> dict:
|
def _normalize_edge_payload(pl: dict) -> dict:
|
||||||
|
"""Normalisiert Edge-Felder und sichert Schema-Konformität."""
|
||||||
kind = pl.get("kind") or pl.get("edge_type") or "edge"
|
kind = pl.get("kind") or pl.get("edge_type") or "edge"
|
||||||
source_id = pl.get("source_id") or pl.get("src_id") or "unknown-src"
|
source_id = pl.get("source_id") or pl.get("src_id") or "unknown-src"
|
||||||
target_id = pl.get("target_id") or pl.get("dst_id") or "unknown-tgt"
|
target_id = pl.get("target_id") or pl.get("dst_id") or "unknown-tgt"
|
||||||
seq = pl.get("seq") or pl.get("order") or pl.get("index")
|
seq = pl.get("seq") or pl.get("order") or pl.get("index")
|
||||||
|
|
||||||
|
# WP-Fix: target_section explizit durchreichen
|
||||||
|
target_section = pl.get("target_section")
|
||||||
|
|
||||||
pl.setdefault("kind", kind)
|
pl.setdefault("kind", kind)
|
||||||
pl.setdefault("source_id", source_id)
|
pl.setdefault("source_id", source_id)
|
||||||
pl.setdefault("target_id", target_id)
|
pl.setdefault("target_id", target_id)
|
||||||
|
|
||||||
if seq is not None and "seq" not in pl:
|
if seq is not None and "seq" not in pl:
|
||||||
pl["seq"] = seq
|
pl["seq"] = seq
|
||||||
|
|
||||||
|
if target_section is not None:
|
||||||
|
pl["target_section"] = target_section
|
||||||
|
|
||||||
return pl
|
return pl
|
||||||
|
|
||||||
def points_for_edges(prefix: str, edge_payloads: List[dict]) -> Tuple[str, List[rest.PointStruct]]:
|
def points_for_edges(prefix: str, edge_payloads: List[dict]) -> Tuple[str, List[rest.PointStruct]]:
|
||||||
|
|
|
||||||
|
|
@ -1,10 +1,14 @@
|
||||||
"""
|
"""
|
||||||
FILE: app/core/graph/graph_derive_edges.py
|
FILE: app/core/graph/graph_derive_edges.py
|
||||||
DESCRIPTION: Hauptlogik zur Kanten-Aggregation und De-Duplizierung.
|
DESCRIPTION: Hauptlogik zur Kanten-Aggregation und De-Duplizierung.
|
||||||
|
AUDIT:
|
||||||
|
- Nutzt parse_link_target
|
||||||
|
- Übergibt Section als 'variant' an ID-Gen
|
||||||
|
- Dedup basiert jetzt auf Edge-ID (erlaubt Multigraph für Sections)
|
||||||
"""
|
"""
|
||||||
from typing import List, Optional, Dict, Tuple
|
from typing import List, Optional, Dict, Tuple
|
||||||
from .graph_utils import (
|
from .graph_utils import (
|
||||||
_get, _edge, _mk_edge_id, _dedupe_seq,
|
_get, _edge, _mk_edge_id, _dedupe_seq, parse_link_target,
|
||||||
PROVENANCE_PRIORITY, load_types_registry, get_edge_defaults_for
|
PROVENANCE_PRIORITY, load_types_registry, get_edge_defaults_for
|
||||||
)
|
)
|
||||||
from .graph_extractors import (
|
from .graph_extractors import (
|
||||||
|
|
@ -53,47 +57,85 @@ def build_edges_for_note(
|
||||||
|
|
||||||
# Typed & Candidate Pool (WP-15b Integration)
|
# Typed & Candidate Pool (WP-15b Integration)
|
||||||
typed, rem = extract_typed_relations(raw)
|
typed, rem = extract_typed_relations(raw)
|
||||||
for k, t in typed:
|
for k, raw_t in typed:
|
||||||
edges.append(_edge(k, "chunk", cid, t, note_id, {
|
t, sec = parse_link_target(raw_t, note_id)
|
||||||
"chunk_id": cid, "edge_id": _mk_edge_id(k, cid, t, "chunk", "inline:rel"),
|
if not t: continue
|
||||||
|
|
||||||
|
payload = {
|
||||||
|
"chunk_id": cid,
|
||||||
|
# Variant=sec sorgt für eindeutige ID pro Abschnitt
|
||||||
|
"edge_id": _mk_edge_id(k, cid, t, "chunk", "inline:rel", variant=sec),
|
||||||
"provenance": "explicit", "rule_id": "inline:rel", "confidence": PROVENANCE_PRIORITY["inline:rel"]
|
"provenance": "explicit", "rule_id": "inline:rel", "confidence": PROVENANCE_PRIORITY["inline:rel"]
|
||||||
}))
|
}
|
||||||
|
if sec: payload["target_section"] = sec
|
||||||
|
|
||||||
|
edges.append(_edge(k, "chunk", cid, t, note_id, payload))
|
||||||
|
|
||||||
pool = ch.get("candidate_pool") or ch.get("candidate_edges") or []
|
pool = ch.get("candidate_pool") or ch.get("candidate_edges") or []
|
||||||
for cand in pool:
|
for cand in pool:
|
||||||
t, k, p = cand.get("to"), cand.get("kind", "related_to"), cand.get("provenance", "semantic_ai")
|
raw_t, k, p = cand.get("to"), cand.get("kind", "related_to"), cand.get("provenance", "semantic_ai")
|
||||||
|
t, sec = parse_link_target(raw_t, note_id)
|
||||||
if t:
|
if t:
|
||||||
edges.append(_edge(k, "chunk", cid, t, note_id, {
|
payload = {
|
||||||
"chunk_id": cid, "edge_id": _mk_edge_id(k, cid, t, "chunk", f"candidate:{p}"),
|
"chunk_id": cid,
|
||||||
|
"edge_id": _mk_edge_id(k, cid, t, "chunk", f"candidate:{p}", variant=sec),
|
||||||
"provenance": p, "rule_id": f"candidate:{p}", "confidence": PROVENANCE_PRIORITY.get(p, 0.90)
|
"provenance": p, "rule_id": f"candidate:{p}", "confidence": PROVENANCE_PRIORITY.get(p, 0.90)
|
||||||
}))
|
}
|
||||||
|
if sec: payload["target_section"] = sec
|
||||||
|
|
||||||
|
edges.append(_edge(k, "chunk", cid, t, note_id, payload))
|
||||||
|
|
||||||
# Callouts & Wikilinks
|
# Callouts & Wikilinks
|
||||||
call_pairs, rem2 = extract_callout_relations(rem)
|
call_pairs, rem2 = extract_callout_relations(rem)
|
||||||
for k, t in call_pairs:
|
for k, raw_t in call_pairs:
|
||||||
edges.append(_edge(k, "chunk", cid, t, note_id, {
|
t, sec = parse_link_target(raw_t, note_id)
|
||||||
"chunk_id": cid, "edge_id": _mk_edge_id(k, cid, t, "chunk", "callout:edge"),
|
if not t: continue
|
||||||
|
|
||||||
|
payload = {
|
||||||
|
"chunk_id": cid,
|
||||||
|
"edge_id": _mk_edge_id(k, cid, t, "chunk", "callout:edge", variant=sec),
|
||||||
"provenance": "explicit", "rule_id": "callout:edge", "confidence": PROVENANCE_PRIORITY["callout:edge"]
|
"provenance": "explicit", "rule_id": "callout:edge", "confidence": PROVENANCE_PRIORITY["callout:edge"]
|
||||||
}))
|
}
|
||||||
|
if sec: payload["target_section"] = sec
|
||||||
|
|
||||||
|
edges.append(_edge(k, "chunk", cid, t, note_id, payload))
|
||||||
|
|
||||||
refs = extract_wikilinks(rem2)
|
refs = extract_wikilinks(rem2)
|
||||||
for r in refs:
|
for raw_r in refs:
|
||||||
edges.append(_edge("references", "chunk", cid, r, note_id, {
|
r, sec = parse_link_target(raw_r, note_id)
|
||||||
"chunk_id": cid, "ref_text": r, "edge_id": _mk_edge_id("references", cid, r, "chunk", "explicit:wikilink"),
|
if not r: continue
|
||||||
|
|
||||||
|
payload = {
|
||||||
|
"chunk_id": cid, "ref_text": raw_r,
|
||||||
|
"edge_id": _mk_edge_id("references", cid, r, "chunk", "explicit:wikilink", variant=sec),
|
||||||
"provenance": "explicit", "rule_id": "explicit:wikilink", "confidence": PROVENANCE_PRIORITY["explicit:wikilink"]
|
"provenance": "explicit", "rule_id": "explicit:wikilink", "confidence": PROVENANCE_PRIORITY["explicit:wikilink"]
|
||||||
}))
|
}
|
||||||
|
if sec: payload["target_section"] = sec
|
||||||
|
|
||||||
|
edges.append(_edge("references", "chunk", cid, r, note_id, payload))
|
||||||
|
|
||||||
for rel in defaults:
|
for rel in defaults:
|
||||||
if rel != "references":
|
if rel != "references":
|
||||||
edges.append(_edge(rel, "chunk", cid, r, note_id, {
|
def_payload = {
|
||||||
"chunk_id": cid, "edge_id": _mk_edge_id(rel, cid, r, "chunk", f"edge_defaults:{rel}"),
|
"chunk_id": cid,
|
||||||
|
"edge_id": _mk_edge_id(rel, cid, r, "chunk", f"edge_defaults:{rel}", variant=sec),
|
||||||
"provenance": "rule", "rule_id": f"edge_defaults:{rel}", "confidence": PROVENANCE_PRIORITY["edge_defaults"]
|
"provenance": "rule", "rule_id": f"edge_defaults:{rel}", "confidence": PROVENANCE_PRIORITY["edge_defaults"]
|
||||||
}))
|
}
|
||||||
refs_all.extend(refs)
|
if sec: def_payload["target_section"] = sec
|
||||||
|
edges.append(_edge(rel, "chunk", cid, r, note_id, def_payload))
|
||||||
|
|
||||||
|
# Für Note-Scope Sammlung nutzen wir den Original-String zur Dedup, aber gesäubert
|
||||||
|
refs_all.extend([parse_link_target(r, note_id)[0] for r in refs])
|
||||||
|
|
||||||
# 3) Note-Scope & De-Duplizierung
|
# 3) Note-Scope & De-Duplizierung
|
||||||
if include_note_scope_refs:
|
if include_note_scope_refs:
|
||||||
refs_note = _dedupe_seq((refs_all or []) + (note_level_references or []))
|
# refs_all ist jetzt schon gesäubert (nur Targets)
|
||||||
|
# note_level_references müssen auch gesäubert werden
|
||||||
|
cleaned_note_refs = [parse_link_target(r, note_id)[0] for r in (note_level_references or [])]
|
||||||
|
refs_note = _dedupe_seq((refs_all or []) + cleaned_note_refs)
|
||||||
|
|
||||||
for r in refs_note:
|
for r in refs_note:
|
||||||
|
if not r: continue
|
||||||
edges.append(_edge("references", "note", note_id, r, note_id, {
|
edges.append(_edge("references", "note", note_id, r, note_id, {
|
||||||
"edge_id": _mk_edge_id("references", note_id, r, "note", "explicit:note_scope"),
|
"edge_id": _mk_edge_id("references", note_id, r, "note", "explicit:note_scope"),
|
||||||
"provenance": "explicit", "confidence": PROVENANCE_PRIORITY["explicit:note_scope"]
|
"provenance": "explicit", "confidence": PROVENANCE_PRIORITY["explicit:note_scope"]
|
||||||
|
|
@ -103,10 +145,13 @@ def build_edges_for_note(
|
||||||
"provenance": "rule", "confidence": PROVENANCE_PRIORITY["derived:backlink"]
|
"provenance": "rule", "confidence": PROVENANCE_PRIORITY["derived:backlink"]
|
||||||
}))
|
}))
|
||||||
|
|
||||||
unique_map: Dict[Tuple[str, str, str], dict] = {}
|
# Deduplizierung: Wir nutzen jetzt die EDGE-ID als Schlüssel.
|
||||||
|
# Da die Edge-ID nun 'variant' (Section) enthält, bleiben unterschiedliche Sections erhalten.
|
||||||
|
unique_map: Dict[str, dict] = {}
|
||||||
for e in edges:
|
for e in edges:
|
||||||
key = (str(e.get("source_id")), str(e.get("target_id")), str(e.get("kind")))
|
eid = e["edge_id"]
|
||||||
if key not in unique_map or e.get("confidence", 0) > unique_map[key].get("confidence", 0):
|
# Bei Konflikt (gleiche ID = exakt gleiche Kante und Section) gewinnt die höhere Confidence
|
||||||
unique_map[key] = e
|
if eid not in unique_map or e.get("confidence", 0) > unique_map[eid].get("confidence", 0):
|
||||||
|
unique_map[eid] = e
|
||||||
|
|
||||||
return list(unique_map.values())
|
return list(unique_map.values())
|
||||||
|
|
@ -1,25 +1,36 @@
|
||||||
"""
|
"""
|
||||||
FILE: app/core/graph/graph_extractors.py
|
FILE: app/core/graph/graph_extractors.py
|
||||||
DESCRIPTION: Regex-basierte Extraktion von Relationen aus Text.
|
DESCRIPTION: Regex-basierte Extraktion von Relationen aus Text.
|
||||||
|
AUDIT:
|
||||||
|
- Regex für Wikilinks liberalisiert (Umlaute, Sonderzeichen).
|
||||||
|
- Callout-Parser erweitert für Multi-Line-Listen und Header-Typen.
|
||||||
"""
|
"""
|
||||||
import re
|
import re
|
||||||
from typing import List, Tuple
|
from typing import List, Tuple
|
||||||
|
|
||||||
_WIKILINK_RE = re.compile(r"\[\[(?:[^\|\]]+\|)?([a-zA-Z0-9_\-#:. ]+)\]\]")
|
# Erlaube alle Zeichen außer ']' im Target (fängt Umlaute, Emojis, '&', '#' ab)
|
||||||
|
_WIKILINK_RE = re.compile(r"\[\[(?:[^\|\]]+\|)?([^\]]+)\]\]")
|
||||||
|
|
||||||
_REL_PIPE = re.compile(r"\[\[\s*rel:(?P<kind>[a-z_]+)\s*\|\s*(?P<target>[^\]]+?)\s*\]\]", re.IGNORECASE)
|
_REL_PIPE = re.compile(r"\[\[\s*rel:(?P<kind>[a-z_]+)\s*\|\s*(?P<target>[^\]]+?)\s*\]\]", re.IGNORECASE)
|
||||||
_REL_SPACE = re.compile(r"\[\[\s*rel:(?P<kind>[a-z_]+)\s+(?P<target>[^\]]+?)\s*\]\]", re.IGNORECASE)
|
_REL_SPACE = re.compile(r"\[\[\s*rel:(?P<kind>[a-z_]+)\s+(?P<target>[^\]]+?)\s*\]\]", re.IGNORECASE)
|
||||||
_REL_TEXT = re.compile(r"rel\s*:\s*(?P<kind>[a-z_]+)\s*\[\[\s*(?P<target>[^\]]+?)\s*\]\]", re.IGNORECASE)
|
_REL_TEXT = re.compile(r"rel\s*:\s*(?P<kind>[a-z_]+)\s*\[\[\s*(?P<target>[^\]]+?)\s*\]\]", re.IGNORECASE)
|
||||||
|
|
||||||
_CALLOUT_START = re.compile(r"^\s*>\s*\[!edge\]\s*(.*)$", re.IGNORECASE)
|
_CALLOUT_START = re.compile(r"^\s*>\s*\[!edge\]\s*(.*)$", re.IGNORECASE)
|
||||||
|
# Erkennt "kind: targets..."
|
||||||
_REL_LINE = re.compile(r"^(?P<kind>[a-z_]+)\s*:\s*(?P<targets>.+?)\s*$", re.IGNORECASE)
|
_REL_LINE = re.compile(r"^(?P<kind>[a-z_]+)\s*:\s*(?P<targets>.+?)\s*$", re.IGNORECASE)
|
||||||
_WIKILINKS_IN_LINE = re.compile(r"\[\[([^\]]+)\]\]")
|
# Erkennt reine Typen (z.B. "depends_on" im Header)
|
||||||
|
_SIMPLE_KIND = re.compile(r"^[a-z_]+$", re.IGNORECASE)
|
||||||
|
|
||||||
def extract_typed_relations(text: str) -> Tuple[List[Tuple[str,str]], str]:
|
def extract_typed_relations(text: str) -> Tuple[List[Tuple[str, str]], str]:
|
||||||
"""Extrahiert [[rel:KIND|Target]]."""
|
"""
|
||||||
|
Findet Inline-Relationen wie [[rel:depends_on Target]].
|
||||||
|
Gibt (Liste[(kind, target)], bereinigter_text) zurück.
|
||||||
|
"""
|
||||||
|
if not text: return [], ""
|
||||||
pairs = []
|
pairs = []
|
||||||
def _collect(m):
|
def _collect(m):
|
||||||
k, t = (m.group("kind") or "").strip().lower(), (m.group("target") or "").strip()
|
k, t = m.group("kind").strip().lower(), m.group("target").strip()
|
||||||
if k and t: pairs.append((k, t))
|
pairs.append((k, t))
|
||||||
return ""
|
return ""
|
||||||
text = _REL_PIPE.sub(_collect, text)
|
text = _REL_PIPE.sub(_collect, text)
|
||||||
text = _REL_SPACE.sub(_collect, text)
|
text = _REL_SPACE.sub(_collect, text)
|
||||||
|
|
@ -27,29 +38,90 @@ def extract_typed_relations(text: str) -> Tuple[List[Tuple[str,str]], str]:
|
||||||
return pairs, text
|
return pairs, text
|
||||||
|
|
||||||
def extract_callout_relations(text: str) -> Tuple[List[Tuple[str,str]], str]:
|
def extract_callout_relations(text: str) -> Tuple[List[Tuple[str,str]], str]:
|
||||||
"""Verarbeitet Obsidian [!edge]-Callouts."""
|
"""
|
||||||
|
Verarbeitet Obsidian [!edge]-Callouts.
|
||||||
|
Unterstützt zwei Formate:
|
||||||
|
1. Explizit: "kind: [[Target]]"
|
||||||
|
2. Implizit (Header): "> [!edge] kind" gefolgt von "[[Target]]" Zeilen
|
||||||
|
"""
|
||||||
if not text: return [], text
|
if not text: return [], text
|
||||||
lines = text.splitlines(); out_pairs, keep_lines, i = [], [], 0
|
lines = text.splitlines()
|
||||||
|
out_pairs = []
|
||||||
|
keep_lines = []
|
||||||
|
i = 0
|
||||||
|
|
||||||
while i < len(lines):
|
while i < len(lines):
|
||||||
m = _CALLOUT_START.match(lines[i])
|
line = lines[i]
|
||||||
|
m = _CALLOUT_START.match(line)
|
||||||
if not m:
|
if not m:
|
||||||
keep_lines.append(lines[i]); i += 1; continue
|
keep_lines.append(line)
|
||||||
block_lines = [m.group(1)] if m.group(1).strip() else []
|
i += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Callout-Block gefunden. Wir sammeln alle relevanten Zeilen.
|
||||||
|
block_lines = []
|
||||||
|
|
||||||
|
# Header Content prüfen (z.B. "type" aus "> [!edge] type")
|
||||||
|
header_raw = m.group(1).strip()
|
||||||
|
if header_raw:
|
||||||
|
block_lines.append(header_raw)
|
||||||
|
|
||||||
i += 1
|
i += 1
|
||||||
while i < len(lines) and lines[i].lstrip().startswith('>'):
|
while i < len(lines) and lines[i].lstrip().startswith('>'):
|
||||||
block_lines.append(lines[i].lstrip()[1:].lstrip()); i += 1
|
# Entferne '>' und führende Leerzeichen
|
||||||
|
content = lines[i].lstrip()[1:].lstrip()
|
||||||
|
if content:
|
||||||
|
block_lines.append(content)
|
||||||
|
i += 1
|
||||||
|
|
||||||
|
# Verarbeitung des Blocks
|
||||||
|
current_kind = None
|
||||||
|
|
||||||
|
# Heuristik: Ist die allererste Zeile (meist aus dem Header) ein reiner Typ?
|
||||||
|
# Dann setzen wir diesen als Default für den Block.
|
||||||
|
if block_lines:
|
||||||
|
first = block_lines[0]
|
||||||
|
# Wenn es NICHT wie "Key: Value" aussieht, aber wie ein Wort:
|
||||||
|
if not _REL_LINE.match(first) and _SIMPLE_KIND.match(first):
|
||||||
|
current_kind = first.lower()
|
||||||
|
|
||||||
for bl in block_lines:
|
for bl in block_lines:
|
||||||
|
# 1. Prüfen auf explizites "Kind: Targets" (überschreibt Header-Typ für diese Zeile)
|
||||||
mrel = _REL_LINE.match(bl)
|
mrel = _REL_LINE.match(bl)
|
||||||
if not mrel: continue
|
if mrel:
|
||||||
kind, targets = mrel.group("kind").strip().lower(), mrel.group("targets") or ""
|
line_kind = mrel.group("kind").strip().lower()
|
||||||
found = _WIKILINKS_IN_LINE.findall(targets)
|
targets = mrel.group("targets")
|
||||||
|
|
||||||
|
# Links extrahieren
|
||||||
|
found = _WIKILINK_RE.findall(targets)
|
||||||
|
if found:
|
||||||
|
for t in found: out_pairs.append((line_kind, t.strip()))
|
||||||
|
else:
|
||||||
|
# Fallback für kommagetrennten Plaintext
|
||||||
|
for raw in re.split(r"[,;]", targets):
|
||||||
|
if raw.strip(): out_pairs.append((line_kind, raw.strip()))
|
||||||
|
|
||||||
|
# Wenn wir eine explizite Zeile gefunden haben, aktualisieren wir NICHT
|
||||||
|
# den current_kind für nachfolgende Zeilen (Design-Entscheidung: lokal scope),
|
||||||
|
# oder wir machen es doch?
|
||||||
|
# Üblicher ist: Header setzt Default, Zeile überschreibt lokal.
|
||||||
|
# Wir lassen current_kind also unangetastet.
|
||||||
|
continue
|
||||||
|
|
||||||
|
# 2. Kein Key:Value Muster -> Prüfen auf Links, die den current_kind nutzen
|
||||||
|
found = _WIKILINK_RE.findall(bl)
|
||||||
if found:
|
if found:
|
||||||
for t in found: out_pairs.append((kind, t.strip()))
|
if current_kind:
|
||||||
else:
|
for t in found: out_pairs.append((current_kind, t.strip()))
|
||||||
for raw in re.split(r"[,;]", targets):
|
else:
|
||||||
if raw.strip(): out_pairs.append((kind, raw.strip()))
|
# Link ohne Typ und ohne Header-Typ.
|
||||||
|
# Wird ignoriert oder könnte als 'related_to' fallback dienen.
|
||||||
|
# Aktuell: Ignorieren, um False Positives zu vermeiden.
|
||||||
|
pass
|
||||||
|
|
||||||
return out_pairs, "\n".join(keep_lines)
|
return out_pairs, "\n".join(keep_lines)
|
||||||
|
|
||||||
def extract_wikilinks(text: str) -> List[str]:
|
def extract_wikilinks(text: str) -> List[str]:
|
||||||
"""Extrahiert Standard-Wikilinks."""
|
"""Findet Standard-Wikilinks [[Target]] oder [[Alias|Target]]."""
|
||||||
return [m.group(1).strip() for m in _WIKILINK_RE.finditer(text or "")]
|
if not text: return []
|
||||||
|
return [m.strip() for m in _WIKILINK_RE.findall(text) if m.strip()]
|
||||||
|
|
@ -1,10 +1,11 @@
|
||||||
"""
|
"""
|
||||||
FILE: app/core/graph/graph_utils.py
|
FILE: app/core/graph/graph_utils.py
|
||||||
DESCRIPTION: Basale Werkzeuge, ID-Generierung und Provenance-Konfiguration für den Graphen.
|
DESCRIPTION: Basale Werkzeuge, ID-Generierung und Provenance-Konfiguration für den Graphen.
|
||||||
|
AUDIT: Erweitert um parse_link_target für sauberes Section-Splitting (WP-Fix).
|
||||||
"""
|
"""
|
||||||
import os
|
import os
|
||||||
import hashlib
|
import hashlib
|
||||||
from typing import Iterable, List, Optional, Set, Any
|
from typing import Iterable, List, Optional, Set, Any, Tuple
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import yaml
|
import yaml
|
||||||
|
|
@ -40,10 +41,19 @@ def _dedupe_seq(seq: Iterable[str]) -> List[str]:
|
||||||
seen.add(s); out.append(s)
|
seen.add(s); out.append(s)
|
||||||
return out
|
return out
|
||||||
|
|
||||||
def _mk_edge_id(kind: str, s: str, t: str, scope: str, rule_id: Optional[str] = None) -> str:
|
def _mk_edge_id(kind: str, s: str, t: str, scope: str, rule_id: Optional[str] = None, variant: Optional[str] = None) -> str:
|
||||||
"""Erzeugt eine deterministische 12-Byte ID mittels BLAKE2s."""
|
"""
|
||||||
|
Erzeugt eine deterministische 12-Byte ID mittels BLAKE2s.
|
||||||
|
|
||||||
|
WP-Fix: 'variant' (z.B. Section) fließt in den Hash ein, um mehrere Kanten
|
||||||
|
zum gleichen Target-Node (aber unterschiedlichen Abschnitten) zu unterscheiden.
|
||||||
|
"""
|
||||||
base = f"{kind}:{s}->{t}#{scope}"
|
base = f"{kind}:{s}->{t}#{scope}"
|
||||||
if rule_id: base += f"|{rule_id}"
|
if rule_id:
|
||||||
|
base += f"|{rule_id}"
|
||||||
|
if variant:
|
||||||
|
base += f"|{variant}" # <--- Hier entsteht die Eindeutigkeit für verschiedene Sections
|
||||||
|
|
||||||
return hashlib.blake2s(base.encode("utf-8"), digest_size=12).hexdigest()
|
return hashlib.blake2s(base.encode("utf-8"), digest_size=12).hexdigest()
|
||||||
|
|
||||||
def _edge(kind: str, scope: str, source_id: str, target_id: str, note_id: str, extra: Optional[dict] = None) -> dict:
|
def _edge(kind: str, scope: str, source_id: str, target_id: str, note_id: str, extra: Optional[dict] = None) -> dict:
|
||||||
|
|
@ -59,6 +69,27 @@ def _edge(kind: str, scope: str, source_id: str, target_id: str, note_id: str, e
|
||||||
if extra: pl.update(extra)
|
if extra: pl.update(extra)
|
||||||
return pl
|
return pl
|
||||||
|
|
||||||
|
def parse_link_target(raw: str, current_note_id: Optional[str] = None) -> Tuple[str, Optional[str]]:
|
||||||
|
"""
|
||||||
|
Zerlegt einen Link (z.B. 'Note#Section') in Target-ID und Section.
|
||||||
|
Behandelt Self-Links ('#Section'), indem current_note_id eingesetzt wird.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
(target_id, target_section)
|
||||||
|
"""
|
||||||
|
if not raw:
|
||||||
|
return "", None
|
||||||
|
|
||||||
|
parts = raw.split("#", 1)
|
||||||
|
target = parts[0].strip()
|
||||||
|
section = parts[1].strip() if len(parts) > 1 else None
|
||||||
|
|
||||||
|
# Handle Self-Link [[#Section]] -> target wird zu current_note_id
|
||||||
|
if not target and section and current_note_id:
|
||||||
|
target = current_note_id
|
||||||
|
|
||||||
|
return target, section
|
||||||
|
|
||||||
def load_types_registry() -> dict:
|
def load_types_registry() -> dict:
|
||||||
"""Lädt die YAML-Registry."""
|
"""Lädt die YAML-Registry."""
|
||||||
p = os.getenv("MINDNET_TYPES_FILE", "./config/types.yaml")
|
p = os.getenv("MINDNET_TYPES_FILE", "./config/types.yaml")
|
||||||
|
|
|
||||||
|
|
@ -3,9 +3,8 @@ FILE: app/core/ingestion/ingestion_note_payload.py
|
||||||
DESCRIPTION: Baut das JSON-Objekt für mindnet_notes.
|
DESCRIPTION: Baut das JSON-Objekt für mindnet_notes.
|
||||||
FEATURES:
|
FEATURES:
|
||||||
- Multi-Hash (body/full) für flexible Change Detection.
|
- Multi-Hash (body/full) für flexible Change Detection.
|
||||||
- Fix v2.4.4: Integration der zentralen Registry (WP-14) für konsistente Defaults.
|
- Fix v2.4.5: Präzise Hash-Logik für Profil-Änderungen.
|
||||||
VERSION: 2.4.4
|
- Integration der zentralen Registry (WP-14).
|
||||||
STATUS: Active
|
|
||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
from typing import Any, Dict, Tuple, Optional
|
from typing import Any, Dict, Tuple, Optional
|
||||||
|
|
@ -45,14 +44,23 @@ def _compute_hash(content: str) -> str:
|
||||||
return hashlib.sha256(content.encode("utf-8")).hexdigest()
|
return hashlib.sha256(content.encode("utf-8")).hexdigest()
|
||||||
|
|
||||||
def _get_hash_source_content(n: Dict[str, Any], mode: str) -> str:
|
def _get_hash_source_content(n: Dict[str, Any], mode: str) -> str:
|
||||||
"""Generiert den Hash-Input-String basierend auf Body oder Metadaten."""
|
"""
|
||||||
body = str(n.get("body") or "")
|
Generiert den Hash-Input-String basierend auf Body oder Metadaten.
|
||||||
|
Fix: Inkludiert nun alle entscheidungsrelevanten Profil-Parameter.
|
||||||
|
"""
|
||||||
|
body = str(n.get("body") or "").strip()
|
||||||
if mode == "body": return body
|
if mode == "body": return body
|
||||||
if mode == "full":
|
if mode == "full":
|
||||||
fm = n.get("frontmatter") or {}
|
fm = n.get("frontmatter") or {}
|
||||||
meta_parts = []
|
meta_parts = []
|
||||||
# Sortierte Liste für deterministische Hashes
|
# Wir inkludieren alle Felder, die das Chunking oder Retrieval beeinflussen
|
||||||
for k in sorted(["title", "type", "status", "tags", "chunking_profile", "chunk_profile", "retriever_weight"]):
|
# Jede Änderung hier führt nun zwingend zu einem neuen Full-Hash
|
||||||
|
keys = [
|
||||||
|
"title", "type", "status", "tags",
|
||||||
|
"chunking_profile", "chunk_profile",
|
||||||
|
"retriever_weight", "split_level", "strict_heading_split"
|
||||||
|
]
|
||||||
|
for k in sorted(keys):
|
||||||
val = fm.get(k)
|
val = fm.get(k)
|
||||||
if val is not None: meta_parts.append(f"{k}:{val}")
|
if val is not None: meta_parts.append(f"{k}:{val}")
|
||||||
return f"{'|'.join(meta_parts)}||{body}"
|
return f"{'|'.join(meta_parts)}||{body}"
|
||||||
|
|
@ -79,11 +87,11 @@ def _cfg_defaults(reg: dict) -> dict:
|
||||||
def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]:
|
def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
Baut das Note-Payload inklusive Multi-Hash und Audit-Validierung.
|
Baut das Note-Payload inklusive Multi-Hash und Audit-Validierung.
|
||||||
WP-14: Nutzt nun die zentrale Registry für alle Fallbacks.
|
WP-14: Nutzt die zentrale Registry für alle Fallbacks.
|
||||||
"""
|
"""
|
||||||
n = _as_dict(note)
|
n = _as_dict(note)
|
||||||
|
|
||||||
# Nutzt übergebene Registry oder lädt sie global
|
# Registry & Context Settings
|
||||||
reg = kwargs.get("types_cfg") or load_type_registry()
|
reg = kwargs.get("types_cfg") or load_type_registry()
|
||||||
hash_source = kwargs.get("hash_source", "parsed")
|
hash_source = kwargs.get("hash_source", "parsed")
|
||||||
hash_normalize = kwargs.get("hash_normalize", "canonical")
|
hash_normalize = kwargs.get("hash_normalize", "canonical")
|
||||||
|
|
@ -96,7 +104,6 @@ def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]:
|
||||||
ingest_cfg = reg.get("ingestion_settings", {})
|
ingest_cfg = reg.get("ingestion_settings", {})
|
||||||
|
|
||||||
# --- retriever_weight Audit ---
|
# --- retriever_weight Audit ---
|
||||||
# Priorität: Frontmatter -> Typ-Config -> globale Config -> Env-Var
|
|
||||||
default_rw = float(os.environ.get("MINDNET_DEFAULT_RETRIEVER_WEIGHT", 1.0))
|
default_rw = float(os.environ.get("MINDNET_DEFAULT_RETRIEVER_WEIGHT", 1.0))
|
||||||
retriever_weight = fm.get("retriever_weight")
|
retriever_weight = fm.get("retriever_weight")
|
||||||
if retriever_weight is None:
|
if retriever_weight is None:
|
||||||
|
|
@ -107,14 +114,13 @@ def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]:
|
||||||
retriever_weight = default_rw
|
retriever_weight = default_rw
|
||||||
|
|
||||||
# --- chunk_profile Audit ---
|
# --- chunk_profile Audit ---
|
||||||
# Nutzt nun primär die ingestion_settings aus der Registry
|
|
||||||
chunk_profile = fm.get("chunking_profile") or fm.get("chunk_profile")
|
chunk_profile = fm.get("chunking_profile") or fm.get("chunk_profile")
|
||||||
if chunk_profile is None:
|
if chunk_profile is None:
|
||||||
chunk_profile = cfg_type.get("chunking_profile") or cfg_type.get("chunk_profile")
|
chunk_profile = cfg_type.get("chunking_profile") or cfg_type.get("chunk_profile")
|
||||||
if chunk_profile is None:
|
if chunk_profile is None:
|
||||||
chunk_profile = ingest_cfg.get("default_chunk_profile", cfg_def.get("chunking_profile", "sliding_standard"))
|
chunk_profile = ingest_cfg.get("default_chunk_profile", cfg_def.get("chunking_profile", "sliding_standard"))
|
||||||
|
|
||||||
# --- edge_defaults ---
|
# --- edge_defaults Audit ---
|
||||||
edge_defaults = fm.get("edge_defaults")
|
edge_defaults = fm.get("edge_defaults")
|
||||||
if edge_defaults is None:
|
if edge_defaults is None:
|
||||||
edge_defaults = cfg_type.get("edge_defaults", cfg_def.get("edge_defaults", []))
|
edge_defaults = cfg_type.get("edge_defaults", cfg_def.get("edge_defaults", []))
|
||||||
|
|
@ -138,21 +144,24 @@ def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]:
|
||||||
}
|
}
|
||||||
|
|
||||||
# --- MULTI-HASH ---
|
# --- MULTI-HASH ---
|
||||||
# Generiert Hashes für Change Detection
|
# Generiert Hashes für Change Detection (WP-15b)
|
||||||
for mode in ["body", "full"]:
|
for mode in ["body", "full"]:
|
||||||
content = _get_hash_source_content(n, mode)
|
content = _get_hash_source_content(n, mode)
|
||||||
payload["hashes"][f"{mode}:{hash_source}:{hash_normalize}"] = _compute_hash(content)
|
payload["hashes"][f"{mode}:{hash_source}:{hash_normalize}"] = _compute_hash(content)
|
||||||
|
|
||||||
# Metadaten Anreicherung
|
# Metadaten Anreicherung (Tags, Aliases, Zeitstempel)
|
||||||
tags = fm.get("tags") or fm.get("keywords") or n.get("tags")
|
tags = fm.get("tags") or fm.get("keywords") or n.get("tags")
|
||||||
if tags: payload["tags"] = _ensure_list(tags)
|
if tags: payload["tags"] = _ensure_list(tags)
|
||||||
if fm.get("aliases"): payload["aliases"] = _ensure_list(fm.get("aliases"))
|
|
||||||
|
aliases = fm.get("aliases")
|
||||||
|
if aliases: payload["aliases"] = _ensure_list(aliases)
|
||||||
|
|
||||||
for k in ("created", "modified", "date"):
|
for k in ("created", "modified", "date"):
|
||||||
v = fm.get(k) or n.get(k)
|
v = fm.get(k) or n.get(k)
|
||||||
if v: payload[k] = str(v)
|
if v: payload[k] = str(v)
|
||||||
|
|
||||||
if n.get("body"): payload["fulltext"] = str(n["body"])
|
if n.get("body"):
|
||||||
|
payload["fulltext"] = str(n["body"])
|
||||||
|
|
||||||
# Final JSON Validation Audit
|
# Final JSON Validation Audit
|
||||||
json.loads(json.dumps(payload, ensure_ascii=False))
|
json.loads(json.dumps(payload, ensure_ascii=False))
|
||||||
|
|
|
||||||
|
|
@ -4,8 +4,8 @@ DESCRIPTION: Der zentrale IngestionService (Orchestrator).
|
||||||
WP-14: Modularisierung der Datenbank-Ebene (app.core.database).
|
WP-14: Modularisierung der Datenbank-Ebene (app.core.database).
|
||||||
WP-15b: Two-Pass Workflow mit globalem Kontext-Cache.
|
WP-15b: Two-Pass Workflow mit globalem Kontext-Cache.
|
||||||
WP-20/22: Cloud-Resilienz und Content-Lifecycle integriert.
|
WP-20/22: Cloud-Resilienz und Content-Lifecycle integriert.
|
||||||
AUDIT v2.13.10: Umstellung auf app.core.database Infrastruktur.
|
AUDIT v2.13.12: Synchronisierung der Profil-Auflösung mit Registry-Defaults.
|
||||||
VERSION: 2.13.10
|
VERSION: 2.13.12
|
||||||
STATUS: Active
|
STATUS: Active
|
||||||
"""
|
"""
|
||||||
import logging
|
import logging
|
||||||
|
|
@ -60,6 +60,7 @@ class IngestionService:
|
||||||
self.embedder = EmbeddingsClient()
|
self.embedder = EmbeddingsClient()
|
||||||
self.llm = LLMService()
|
self.llm = LLMService()
|
||||||
|
|
||||||
|
# Festlegen, welcher Hash für die Change-Detection maßgeblich ist
|
||||||
self.active_hash_mode = self.settings.CHANGE_DETECTION_MODE
|
self.active_hash_mode = self.settings.CHANGE_DETECTION_MODE
|
||||||
self.batch_cache: Dict[str, NoteContext] = {} # WP-15b LocalBatchCache
|
self.batch_cache: Dict[str, NoteContext] = {} # WP-15b LocalBatchCache
|
||||||
|
|
||||||
|
|
@ -130,12 +131,18 @@ class IngestionService:
|
||||||
)
|
)
|
||||||
note_id = note_pl["note_id"]
|
note_id = note_pl["note_id"]
|
||||||
|
|
||||||
|
# Abgleich mit der Datenbank (Qdrant)
|
||||||
old_payload = None if force_replace else fetch_note_payload(self.client, self.prefix, note_id)
|
old_payload = None if force_replace else fetch_note_payload(self.client, self.prefix, note_id)
|
||||||
|
|
||||||
|
# Prüfung gegen den konfigurierten Hash-Modus (body vs. full)
|
||||||
check_key = f"{self.active_hash_mode}:{hash_source}:{hash_normalize}"
|
check_key = f"{self.active_hash_mode}:{hash_source}:{hash_normalize}"
|
||||||
old_hash = (old_payload or {}).get("hashes", {}).get(check_key)
|
old_hash = (old_payload or {}).get("hashes", {}).get(check_key)
|
||||||
new_hash = note_pl.get("hashes", {}).get(check_key)
|
new_hash = note_pl.get("hashes", {}).get(check_key)
|
||||||
|
|
||||||
|
# Check ob Chunks oder Kanten in der DB fehlen (Reparatur-Modus)
|
||||||
c_miss, e_miss = artifacts_missing(self.client, self.prefix, note_id)
|
c_miss, e_miss = artifacts_missing(self.client, self.prefix, note_id)
|
||||||
|
|
||||||
|
# Wenn Hash identisch und Artefakte vorhanden -> Skip
|
||||||
if not (force_replace or not old_payload or old_hash != new_hash or c_miss or e_miss):
|
if not (force_replace or not old_payload or old_hash != new_hash or c_miss or e_miss):
|
||||||
return {**result, "status": "unchanged", "note_id": note_id}
|
return {**result, "status": "unchanged", "note_id": note_id}
|
||||||
|
|
||||||
|
|
@ -146,36 +153,49 @@ class IngestionService:
|
||||||
try:
|
try:
|
||||||
body_text = getattr(parsed, "body", "") or ""
|
body_text = getattr(parsed, "body", "") or ""
|
||||||
edge_registry.ensure_latest()
|
edge_registry.ensure_latest()
|
||||||
profile = fm.get("chunk_profile") or fm.get("chunking_profile") or "sliding_standard"
|
|
||||||
|
# Profil-Auflösung via Registry
|
||||||
|
# FIX: Wir nutzen das Profil, das bereits in make_note_payload unter
|
||||||
|
# Berücksichtigung der types.yaml (Registry) ermittelt wurde.
|
||||||
|
profile = note_pl.get("chunk_profile", "sliding_standard")
|
||||||
|
|
||||||
chunk_cfg = get_chunk_config_by_profile(self.registry, profile, note_type)
|
chunk_cfg = get_chunk_config_by_profile(self.registry, profile, note_type)
|
||||||
enable_smart = chunk_cfg.get("enable_smart_edge_allocation", False)
|
enable_smart = chunk_cfg.get("enable_smart_edge_allocation", False)
|
||||||
|
|
||||||
# WP-15b: Chunker-Aufruf bereitet Candidate-Pool vor
|
# WP-15b: Chunker-Aufruf bereitet den Candidate-Pool pro Chunk vor.
|
||||||
|
# assemble_chunks führt intern auch die Propagierung durch.
|
||||||
chunks = await assemble_chunks(note_id, body_text, note_type, config=chunk_cfg)
|
chunks = await assemble_chunks(note_id, body_text, note_type, config=chunk_cfg)
|
||||||
|
|
||||||
|
# Semantische Kanten-Validierung (Smart Edge Allocation)
|
||||||
for ch in chunks:
|
for ch in chunks:
|
||||||
filtered = []
|
filtered = []
|
||||||
for cand in getattr(ch, "candidate_pool", []):
|
for cand in getattr(ch, "candidate_pool", []):
|
||||||
# WP-15b: Nur global_pool Kandidaten erfordern binäre Validierung
|
# Nur global_pool Kandidaten (aus dem Pool am Ende) erfordern KI-Validierung
|
||||||
if cand.get("provenance") == "global_pool" and enable_smart:
|
if cand.get("provenance") == "global_pool" and enable_smart:
|
||||||
if await validate_edge_candidate(ch.text, cand, self.batch_cache, self.llm, self.settings.MINDNET_LLM_PROVIDER):
|
if await validate_edge_candidate(ch.text, cand, self.batch_cache, self.llm, self.settings.MINDNET_LLM_PROVIDER):
|
||||||
filtered.append(cand)
|
filtered.append(cand)
|
||||||
else:
|
else:
|
||||||
|
# Explizite Kanten (Wikilinks/Callouts) werden ungeprüft übernommen
|
||||||
filtered.append(cand)
|
filtered.append(cand)
|
||||||
ch.candidate_pool = filtered
|
ch.candidate_pool = filtered
|
||||||
|
|
||||||
# Payload-Erstellung via interne Module
|
# Payload-Erstellung für die Chunks
|
||||||
chunk_pls = make_chunk_payloads(
|
chunk_pls = make_chunk_payloads(
|
||||||
fm, note_pl["path"], chunks, file_path=file_path,
|
fm, note_pl["path"], chunks, file_path=file_path,
|
||||||
types_cfg=self.registry
|
types_cfg=self.registry
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Vektorisierung der Fenster-Texte
|
||||||
vecs = await self.embedder.embed_documents([c.get("window") or "" for c in chunk_pls]) if chunk_pls else []
|
vecs = await self.embedder.embed_documents([c.get("window") or "" for c in chunk_pls]) if chunk_pls else []
|
||||||
|
|
||||||
# Kanten-Aggregation
|
# Aggregation aller finalen Kanten (Edges)
|
||||||
edges = build_edges_for_note(
|
edges = build_edges_for_note(
|
||||||
note_id, chunk_pls,
|
note_id, chunk_pls,
|
||||||
note_level_references=note_pl.get("references", []),
|
note_level_references=note_pl.get("references", []),
|
||||||
include_note_scope_refs=note_scope_refs
|
include_note_scope_refs=note_scope_refs
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Kanten-Typen via Registry validieren/auflösen
|
||||||
for e in edges:
|
for e in edges:
|
||||||
e["kind"] = edge_registry.resolve(
|
e["kind"] = edge_registry.resolve(
|
||||||
e.get("kind", "related_to"),
|
e.get("kind", "related_to"),
|
||||||
|
|
@ -184,16 +204,20 @@ class IngestionService:
|
||||||
)
|
)
|
||||||
|
|
||||||
# 4. DB Upsert via modularisierter Points-Logik
|
# 4. DB Upsert via modularisierter Points-Logik
|
||||||
|
# WICHTIG: Wenn sich der Inhalt geändert hat, löschen wir erst alle alten Fragmente.
|
||||||
if purge_before and old_payload:
|
if purge_before and old_payload:
|
||||||
purge_artifacts(self.client, self.prefix, note_id)
|
purge_artifacts(self.client, self.prefix, note_id)
|
||||||
|
|
||||||
|
# Speichern der Haupt-Note
|
||||||
n_name, n_pts = points_for_note(self.prefix, note_pl, None, self.dim)
|
n_name, n_pts = points_for_note(self.prefix, note_pl, None, self.dim)
|
||||||
upsert_batch(self.client, n_name, n_pts)
|
upsert_batch(self.client, n_name, n_pts)
|
||||||
|
|
||||||
|
# Speichern der Chunks
|
||||||
if chunk_pls and vecs:
|
if chunk_pls and vecs:
|
||||||
c_pts = points_for_chunks(self.prefix, chunk_pls, vecs)[1]
|
c_pts = points_for_chunks(self.prefix, chunk_pls, vecs)[1]
|
||||||
upsert_batch(self.client, f"{self.prefix}_chunks", c_pts)
|
upsert_batch(self.client, f"{self.prefix}_chunks", c_pts)
|
||||||
|
|
||||||
|
# Speichern der Kanten
|
||||||
if edges:
|
if edges:
|
||||||
e_pts = points_for_edges(self.prefix, edges)[1]
|
e_pts = points_for_edges(self.prefix, edges)[1]
|
||||||
upsert_batch(self.client, f"{self.prefix}_edges", e_pts)
|
upsert_batch(self.client, f"{self.prefix}_edges", e_pts)
|
||||||
|
|
@ -217,4 +241,5 @@ class IngestionService:
|
||||||
with open(target_path, "w", encoding="utf-8") as f:
|
with open(target_path, "w", encoding="utf-8") as f:
|
||||||
f.write(markdown_content)
|
f.write(markdown_content)
|
||||||
await asyncio.sleep(0.1)
|
await asyncio.sleep(0.1)
|
||||||
|
# Triggert sofortigen Import mit force_replace/purge_before
|
||||||
return await self.process_file(file_path=target_path, vault_root=vault_root, apply=True, force_replace=True, purge_before=True)
|
return await self.process_file(file_path=target_path, vault_root=vault_root, apply=True, force_replace=True, purge_before=True)
|
||||||
|
|
@ -1,10 +1,10 @@
|
||||||
"""
|
"""
|
||||||
FILE: app/frontend/ui_graph_service.py
|
FILE: app/frontend/ui_graph_service.py
|
||||||
DESCRIPTION: Data Layer für den Graphen. Greift direkt auf Qdrant zu (Performance), um Knoten/Kanten zu laden und Texte zu rekonstruieren ("Stitching").
|
DESCRIPTION: Data Layer für den Graphen. Greift direkt auf Qdrant zu (Performance), um Knoten/Kanten zu laden und Texte zu rekonstruieren ("Stitching").
|
||||||
VERSION: 2.6.1 (Fix: Anchor-Link & Fragment Resolution)
|
VERSION: 2.6.0
|
||||||
STATUS: Active
|
STATUS: Active
|
||||||
DEPENDENCIES: qdrant_client, streamlit_agraph, ui_config, re
|
DEPENDENCIES: qdrant_client, streamlit_agraph, ui_config, re
|
||||||
LAST_ANALYSIS: 2025-12-28
|
LAST_ANALYSIS: 2025-12-15
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import re
|
import re
|
||||||
|
|
@ -24,7 +24,6 @@ class GraphExplorerService:
|
||||||
self.chunks_col = f"{self.prefix}_chunks"
|
self.chunks_col = f"{self.prefix}_chunks"
|
||||||
self.edges_col = f"{self.prefix}_edges"
|
self.edges_col = f"{self.prefix}_edges"
|
||||||
self._note_cache = {}
|
self._note_cache = {}
|
||||||
self._ref_resolution_cache = {}
|
|
||||||
|
|
||||||
def get_note_with_full_content(self, note_id):
|
def get_note_with_full_content(self, note_id):
|
||||||
"""
|
"""
|
||||||
|
|
@ -38,7 +37,8 @@ class GraphExplorerService:
|
||||||
# 2. Volltext aus Chunks bauen
|
# 2. Volltext aus Chunks bauen
|
||||||
full_text = self._fetch_full_text_stitched(note_id)
|
full_text = self._fetch_full_text_stitched(note_id)
|
||||||
|
|
||||||
# 3. Ergebnis kombinieren (Kopie zurückgeben)
|
# 3. Ergebnis kombinieren (Wir überschreiben das 'fulltext' Feld mit dem frischen Stitching)
|
||||||
|
# Wir geben eine Kopie zurück, um den Cache nicht zu verfälschen
|
||||||
complete_note = meta.copy()
|
complete_note = meta.copy()
|
||||||
if full_text:
|
if full_text:
|
||||||
complete_note['fulltext'] = full_text
|
complete_note['fulltext'] = full_text
|
||||||
|
|
@ -61,7 +61,7 @@ class GraphExplorerService:
|
||||||
# Initialset für Suche
|
# Initialset für Suche
|
||||||
level_1_ids = {center_note_id}
|
level_1_ids = {center_note_id}
|
||||||
|
|
||||||
# Suche Kanten für Center (L1) inkl. Titel für Anchor-Suche
|
# Suche Kanten für Center (L1)
|
||||||
l1_edges = self._find_connected_edges([center_note_id], center_note.get("title"))
|
l1_edges = self._find_connected_edges([center_note_id], center_note.get("title"))
|
||||||
|
|
||||||
for edge_data in l1_edges:
|
for edge_data in l1_edges:
|
||||||
|
|
@ -84,6 +84,7 @@ class GraphExplorerService:
|
||||||
if center_note_id in nodes_dict:
|
if center_note_id in nodes_dict:
|
||||||
orig_title = nodes_dict[center_note_id].title
|
orig_title = nodes_dict[center_note_id].title
|
||||||
clean_full = self._clean_markdown(center_text[:2000])
|
clean_full = self._clean_markdown(center_text[:2000])
|
||||||
|
# Wir packen den Text in den Tooltip (title attribute)
|
||||||
nodes_dict[center_note_id].title = f"{orig_title}\n\n📄 INHALT:\n{clean_full}..."
|
nodes_dict[center_note_id].title = f"{orig_title}\n\n📄 INHALT:\n{clean_full}..."
|
||||||
|
|
||||||
# B. Previews für alle Nachbarn holen (Batch)
|
# B. Previews für alle Nachbarn holen (Batch)
|
||||||
|
|
@ -103,6 +104,8 @@ class GraphExplorerService:
|
||||||
prov = data['provenance']
|
prov = data['provenance']
|
||||||
color = get_edge_color(kind)
|
color = get_edge_color(kind)
|
||||||
is_smart = (prov != "explicit" and prov != "rule")
|
is_smart = (prov != "explicit" and prov != "rule")
|
||||||
|
|
||||||
|
# Label Logik
|
||||||
label_text = kind if show_labels else " "
|
label_text = kind if show_labels else " "
|
||||||
|
|
||||||
final_edges.append(Edge(
|
final_edges.append(Edge(
|
||||||
|
|
@ -113,11 +116,15 @@ class GraphExplorerService:
|
||||||
return list(nodes_dict.values()), final_edges
|
return list(nodes_dict.values()), final_edges
|
||||||
|
|
||||||
def _clean_markdown(self, text):
|
def _clean_markdown(self, text):
|
||||||
"""Entfernt Markdown-Sonderzeichen für saubere Tooltips."""
|
"""Entfernt Markdown-Sonderzeichen für saubere Tooltips im Browser."""
|
||||||
if not text: return ""
|
if not text: return ""
|
||||||
|
# Entferne Header Marker (## )
|
||||||
text = re.sub(r'#+\s', '', text)
|
text = re.sub(r'#+\s', '', text)
|
||||||
|
# Entferne Bold/Italic (** oder *)
|
||||||
text = re.sub(r'\*\*|__|\*|_', '', text)
|
text = re.sub(r'\*\*|__|\*|_', '', text)
|
||||||
|
# Entferne Links [Text](Url) -> Text
|
||||||
text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)
|
text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)
|
||||||
|
# Entferne Wikilinks [[Link]] -> Link
|
||||||
text = re.sub(r'\[\[([^\]]+)\]\]', r'\1', text)
|
text = re.sub(r'\[\[([^\]]+)\]\]', r'\1', text)
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
|
@ -127,47 +134,52 @@ class GraphExplorerService:
|
||||||
scroll_filter = models.Filter(
|
scroll_filter = models.Filter(
|
||||||
must=[models.FieldCondition(key="note_id", match=models.MatchValue(value=note_id))]
|
must=[models.FieldCondition(key="note_id", match=models.MatchValue(value=note_id))]
|
||||||
)
|
)
|
||||||
|
# Limit hoch genug setzen
|
||||||
chunks, _ = self.client.scroll(self.chunks_col, scroll_filter=scroll_filter, limit=100, with_payload=True)
|
chunks, _ = self.client.scroll(self.chunks_col, scroll_filter=scroll_filter, limit=100, with_payload=True)
|
||||||
|
# Sortieren nach 'ord' (Reihenfolge im Dokument)
|
||||||
chunks.sort(key=lambda x: x.payload.get('ord', 999))
|
chunks.sort(key=lambda x: x.payload.get('ord', 999))
|
||||||
full_text = [c.payload.get('text', '') for c in chunks if c.payload.get('text')]
|
|
||||||
|
full_text = []
|
||||||
|
for c in chunks:
|
||||||
|
# 'text' ist der reine Inhalt ohne Overlap
|
||||||
|
txt = c.payload.get('text', '')
|
||||||
|
if txt: full_text.append(txt)
|
||||||
|
|
||||||
return "\n\n".join(full_text)
|
return "\n\n".join(full_text)
|
||||||
except:
|
except:
|
||||||
return "Fehler beim Laden des Volltexts."
|
return "Fehler beim Laden des Volltexts."
|
||||||
|
|
||||||
def _fetch_previews_for_nodes(self, node_ids):
|
def _fetch_previews_for_nodes(self, node_ids):
|
||||||
"""
|
"""Holt Batch-weise den ersten Chunk für eine Liste von Nodes."""
|
||||||
Holt Batch-weise den ersten relevanten Textabschnitt für eine Liste von Nodes.
|
if not node_ids: return {}
|
||||||
Optimiert die Ladezeit durch Reduzierung der API-Calls.
|
|
||||||
"""
|
|
||||||
if not node_ids:
|
|
||||||
return {}
|
|
||||||
previews = {}
|
previews = {}
|
||||||
try:
|
try:
|
||||||
scroll_filter = models.Filter(
|
scroll_filter = models.Filter(must=[models.FieldCondition(key="note_id", match=models.MatchAny(any=node_ids))])
|
||||||
must=[models.FieldCondition(key="note_id", match=models.MatchAny(any=node_ids))]
|
# Limit = Anzahl Nodes * 3 (Puffer)
|
||||||
)
|
|
||||||
# Genügend Chunks laden, um für jede ID eine Vorschau zu finden
|
|
||||||
chunks, _ = self.client.scroll(self.chunks_col, scroll_filter=scroll_filter, limit=len(node_ids)*3, with_payload=True)
|
chunks, _ = self.client.scroll(self.chunks_col, scroll_filter=scroll_filter, limit=len(node_ids)*3, with_payload=True)
|
||||||
|
|
||||||
for c in chunks:
|
for c in chunks:
|
||||||
nid = c.payload.get("note_id")
|
nid = c.payload.get("note_id")
|
||||||
# Wir nehmen den ersten gefundenen Chunk
|
# Nur den ersten gefundenen Chunk pro Note nehmen
|
||||||
if nid and nid not in previews:
|
if nid and nid not in previews:
|
||||||
previews[nid] = c.payload.get("window") or c.payload.get("text") or ""
|
previews[nid] = c.payload.get("window") or c.payload.get("text") or ""
|
||||||
except Exception:
|
except: pass
|
||||||
pass
|
|
||||||
return previews
|
return previews
|
||||||
|
|
||||||
def _find_connected_edges(self, note_ids, note_title=None):
|
def _find_connected_edges(self, note_ids, note_title=None):
|
||||||
"""
|
"""
|
||||||
Findet ein- und ausgehende Kanten für eine Liste von IDs.
|
Findet eingehende und ausgehende Kanten.
|
||||||
Implementiert den Fix für Anker-Links [[Titel#Abschnitt]] durch Präfix-Suche in der target_id.
|
|
||||||
|
WICHTIG: target_id enthält nur den Titel (ohne #Abschnitt).
|
||||||
|
target_section ist ein separates Feld für Abschnitt-Informationen.
|
||||||
"""
|
"""
|
||||||
results = []
|
results = []
|
||||||
if not note_ids:
|
if not note_ids:
|
||||||
return results
|
return results
|
||||||
|
|
||||||
# 1. AUSGEHENDE KANTEN (Outgoing)
|
# 1. OUTGOING EDGES (Der "Owner"-Fix)
|
||||||
# Suche über 'note_id' als Besitzer der Kante.
|
# Wir suchen Kanten, die im Feld 'note_id' (Owner) eine unserer Notizen haben.
|
||||||
|
# Das findet ALLE ausgehenden Kanten, egal ob sie an einem Chunk oder der Note hängen.
|
||||||
out_filter = models.Filter(must=[
|
out_filter = models.Filter(must=[
|
||||||
models.FieldCondition(key="note_id", match=models.MatchAny(any=note_ids)),
|
models.FieldCondition(key="note_id", match=models.MatchAny(any=note_ids)),
|
||||||
models.FieldCondition(key="kind", match=models.MatchExcept(**{"except": SYSTEM_EDGES}))
|
models.FieldCondition(key="kind", match=models.MatchExcept(**{"except": SYSTEM_EDGES}))
|
||||||
|
|
@ -175,71 +187,79 @@ class GraphExplorerService:
|
||||||
res_out, _ = self.client.scroll(self.edges_col, scroll_filter=out_filter, limit=2000, with_payload=True)
|
res_out, _ = self.client.scroll(self.edges_col, scroll_filter=out_filter, limit=2000, with_payload=True)
|
||||||
results.extend(res_out)
|
results.extend(res_out)
|
||||||
|
|
||||||
# 2. EINGEHENDE KANTEN (Incoming)
|
# 2. INCOMING EDGES (Ziel = Chunk ID, Note ID oder Titel)
|
||||||
# Suche über target_id (Ziel der Kante).
|
# WICHTIG: target_id enthält nur den Titel, target_section ist separat
|
||||||
|
|
||||||
# Sammele alle Chunk-IDs für exakte Treffer auf Segment-Ebene
|
# Chunk IDs der aktuellen Notes holen
|
||||||
c_filter = models.Filter(must=[models.FieldCondition(key="note_id", match=models.MatchAny(any=note_ids))])
|
c_filter = models.Filter(must=[models.FieldCondition(key="note_id", match=models.MatchAny(any=note_ids))])
|
||||||
chunks, _ = self.client.scroll(self.chunks_col, scroll_filter=c_filter, limit=1000, with_payload=False)
|
chunks, _ = self.client.scroll(self.chunks_col, scroll_filter=c_filter, limit=1000, with_payload=False)
|
||||||
chunk_ids = [c.id for c in chunks]
|
chunk_ids = [c.id for c in chunks]
|
||||||
|
|
||||||
should_conditions = []
|
shoulds = []
|
||||||
|
# Case A: Edge zeigt auf einen unserer Chunks
|
||||||
if chunk_ids:
|
if chunk_ids:
|
||||||
should_conditions.append(models.FieldCondition(key="target_id", match=models.MatchAny(any=chunk_ids)))
|
shoulds.append(models.FieldCondition(key="target_id", match=models.MatchAny(any=chunk_ids)))
|
||||||
should_conditions.append(models.FieldCondition(key="target_id", match=models.MatchAny(any=note_ids)))
|
|
||||||
|
|
||||||
# TITEL-BASIERTE SUCHE (Inkl. Anker-Fix)
|
|
||||||
titles_to_check = []
|
|
||||||
if note_title:
|
|
||||||
titles_to_check.append(note_title)
|
|
||||||
# Aliase laden für robuste Verlinkung
|
|
||||||
for nid in note_ids:
|
|
||||||
note = self._fetch_note_cached(nid)
|
|
||||||
if note:
|
|
||||||
aliases = note.get("aliases", [])
|
|
||||||
if isinstance(aliases, str): aliases = [aliases]
|
|
||||||
titles_to_check.extend([a for a in aliases if a not in titles_to_check])
|
|
||||||
|
|
||||||
# Exakte Titel-Matches hinzufügen
|
|
||||||
for t in titles_to_check:
|
|
||||||
should_conditions.append(models.FieldCondition(key="target_id", match=models.MatchValue(value=t)))
|
|
||||||
|
|
||||||
if should_conditions:
|
# Case B: Edge zeigt direkt auf unsere Note ID
|
||||||
|
shoulds.append(models.FieldCondition(key="target_id", match=models.MatchAny(any=note_ids)))
|
||||||
|
|
||||||
|
# Case C: Edge zeigt auf unseren Titel
|
||||||
|
# WICHTIG: target_id enthält nur den Titel (z.B. "Meine Prinzipien 2025")
|
||||||
|
# target_section enthält die Abschnitt-Information (z.B. "P3 – Disziplin"), wenn gesetzt
|
||||||
|
|
||||||
|
# Sammle alle relevanten Titel (inkl. Aliase)
|
||||||
|
titles_to_search = []
|
||||||
|
if note_title:
|
||||||
|
titles_to_search.append(note_title)
|
||||||
|
|
||||||
|
# Lade auch Titel aus den Notes selbst (falls note_title nicht übergeben wurde)
|
||||||
|
for nid in note_ids:
|
||||||
|
note = self._fetch_note_cached(nid)
|
||||||
|
if note:
|
||||||
|
note_title_from_db = note.get("title")
|
||||||
|
if note_title_from_db and note_title_from_db not in titles_to_search:
|
||||||
|
titles_to_search.append(note_title_from_db)
|
||||||
|
# Aliase hinzufügen
|
||||||
|
aliases = note.get("aliases", [])
|
||||||
|
if isinstance(aliases, str):
|
||||||
|
aliases = [aliases]
|
||||||
|
for alias in aliases:
|
||||||
|
if alias and alias not in titles_to_search:
|
||||||
|
titles_to_search.append(alias)
|
||||||
|
|
||||||
|
# Für jeden Titel: Suche nach exaktem Match
|
||||||
|
# target_id enthält nur den Titel, daher reicht MatchValue
|
||||||
|
for title in titles_to_search:
|
||||||
|
shoulds.append(models.FieldCondition(key="target_id", match=models.MatchValue(value=title)))
|
||||||
|
|
||||||
|
if shoulds:
|
||||||
in_filter = models.Filter(
|
in_filter = models.Filter(
|
||||||
must=[models.FieldCondition(key="kind", match=models.MatchExcept(**{"except": SYSTEM_EDGES}))],
|
must=[models.FieldCondition(key="kind", match=models.MatchExcept(**{"except": SYSTEM_EDGES}))],
|
||||||
should=should_conditions
|
should=shoulds
|
||||||
)
|
)
|
||||||
res_in, _ = self.client.scroll(self.edges_col, scroll_filter=in_filter, limit=2000, with_payload=True)
|
res_in, _ = self.client.scroll(self.edges_col, scroll_filter=in_filter, limit=2000, with_payload=True)
|
||||||
results.extend(res_in)
|
results.extend(res_in)
|
||||||
|
|
||||||
# FIX FÜR [[Titel#Abschnitt]]: Suche nach Fragmenten
|
|
||||||
if titles_to_check:
|
|
||||||
for t in titles_to_check:
|
|
||||||
anchor_filter = models.Filter(must=[
|
|
||||||
models.FieldCondition(key="target_id", match=models.MatchText(text=t)),
|
|
||||||
models.FieldCondition(key="kind", match=models.MatchExcept(**{"except": SYSTEM_EDGES}))
|
|
||||||
])
|
|
||||||
res_anchor, _ = self.client.scroll(self.edges_col, scroll_filter=anchor_filter, limit=1000, with_payload=True)
|
|
||||||
|
|
||||||
existing_ids = {r.id for r in results}
|
|
||||||
for edge in res_anchor:
|
|
||||||
tgt = edge.payload.get("target_id", "")
|
|
||||||
# Client-seitige Filterung: Nur Kanten nehmen, die mit Titel# beginnen
|
|
||||||
if edge.id not in existing_ids and (tgt == t or tgt.startswith(f"{t}#")):
|
|
||||||
results.append(edge)
|
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
def _find_connected_edges_batch(self, note_ids):
|
def _find_connected_edges_batch(self, note_ids):
|
||||||
"""Wrapper für die Suche in tieferen Ebenen des Graphen."""
|
"""
|
||||||
first_note = self._fetch_note_cached(note_ids[0]) if note_ids else None
|
Wrapper für Level 2 Suche.
|
||||||
title = first_note.get("title") if first_note else None
|
Lädt Titel der ersten Note für Titel-basierte Suche.
|
||||||
return self._find_connected_edges(note_ids, note_title=title)
|
"""
|
||||||
|
if not note_ids:
|
||||||
|
return []
|
||||||
|
first_note = self._fetch_note_cached(note_ids[0])
|
||||||
|
note_title = first_note.get("title") if first_note else None
|
||||||
|
return self._find_connected_edges(note_ids, note_title=note_title)
|
||||||
|
|
||||||
def _process_edge(self, record, nodes_dict, unique_edges, current_depth):
|
def _process_edge(self, record, nodes_dict, unique_edges, current_depth):
|
||||||
"""
|
"""
|
||||||
Verarbeitet eine rohe Kante, löst Quell- und Ziel-Referenzen auf
|
Verarbeitet eine rohe Edge, löst IDs auf und fügt sie den Dictionaries hinzu.
|
||||||
und fügt sie den Dictionaries für den Graphen hinzu.
|
|
||||||
|
WICHTIG: Beide Richtungen werden unterstützt:
|
||||||
|
- Ausgehende Kanten: source_id gehört zu unserer Note (via note_id Owner)
|
||||||
|
- Eingehende Kanten: target_id zeigt auf unsere Note (via target_id Match)
|
||||||
"""
|
"""
|
||||||
if not record or not record.payload:
|
if not record or not record.payload:
|
||||||
return None, None
|
return None, None
|
||||||
|
|
@ -250,10 +270,13 @@ class GraphExplorerService:
|
||||||
kind = payload.get("kind")
|
kind = payload.get("kind")
|
||||||
provenance = payload.get("provenance", "explicit")
|
provenance = payload.get("provenance", "explicit")
|
||||||
|
|
||||||
|
# Prüfe, ob beide Referenzen vorhanden sind
|
||||||
if not src_ref or not tgt_ref:
|
if not src_ref or not tgt_ref:
|
||||||
return None, None
|
return None, None
|
||||||
|
|
||||||
# IDs zu Notes auflösen (Hier greift der Fragment-Fix)
|
# IDs zu Notes auflösen
|
||||||
|
# WICHTIG: source_id kann Chunk-ID (note_id#c01), Note-ID oder Titel sein
|
||||||
|
# WICHTIG: target_id kann Chunk-ID, Note-ID oder Titel sein (ohne #Abschnitt)
|
||||||
src_note = self._resolve_note_from_ref(src_ref)
|
src_note = self._resolve_note_from_ref(src_ref)
|
||||||
tgt_note = self._resolve_note_from_ref(tgt_ref)
|
tgt_note = self._resolve_note_from_ref(tgt_ref)
|
||||||
|
|
||||||
|
|
@ -261,118 +284,159 @@ class GraphExplorerService:
|
||||||
src_id = src_note.get('note_id')
|
src_id = src_note.get('note_id')
|
||||||
tgt_id = tgt_note.get('note_id')
|
tgt_id = tgt_note.get('note_id')
|
||||||
|
|
||||||
if src_id and tgt_id and src_id != tgt_id:
|
# Prüfe, ob beide IDs vorhanden sind
|
||||||
# Knoten zum Set hinzufügen
|
if not src_id or not tgt_id:
|
||||||
|
return None, None
|
||||||
|
|
||||||
|
if src_id != tgt_id:
|
||||||
|
# Nodes hinzufügen
|
||||||
self._add_node_to_dict(nodes_dict, src_note, level=current_depth)
|
self._add_node_to_dict(nodes_dict, src_note, level=current_depth)
|
||||||
self._add_node_to_dict(nodes_dict, tgt_note, level=current_depth)
|
self._add_node_to_dict(nodes_dict, tgt_note, level=current_depth)
|
||||||
|
|
||||||
# Kante registrieren (Deduplizierung)
|
# Kante hinzufügen (mit Deduplizierung)
|
||||||
key = (src_id, tgt_id)
|
key = (src_id, tgt_id)
|
||||||
existing = unique_edges.get(key)
|
existing = unique_edges.get(key)
|
||||||
|
|
||||||
is_current_explicit = (provenance in ["explicit", "rule"])
|
|
||||||
should_update = True
|
should_update = True
|
||||||
|
# Bevorzuge explizite Kanten vor Smart Kanten
|
||||||
|
is_current_explicit = (provenance in ["explicit", "rule"])
|
||||||
if existing:
|
if existing:
|
||||||
is_existing_explicit = (existing.get('provenance', '') in ["explicit", "rule"])
|
is_existing_explicit = (existing.get('provenance', '') in ["explicit", "rule"])
|
||||||
if is_existing_explicit and not is_current_explicit:
|
if is_existing_explicit and not is_current_explicit:
|
||||||
should_update = False
|
should_update = False
|
||||||
|
|
||||||
if should_update:
|
if should_update:
|
||||||
unique_edges[key] = {
|
unique_edges[key] = {"source": src_id, "target": tgt_id, "kind": kind, "provenance": provenance}
|
||||||
"source": src_id,
|
|
||||||
"target": tgt_id,
|
|
||||||
"kind": kind,
|
|
||||||
"provenance": provenance
|
|
||||||
}
|
|
||||||
return src_id, tgt_id
|
return src_id, tgt_id
|
||||||
return None, None
|
return None, None
|
||||||
|
|
||||||
def _fetch_note_cached(self, note_id):
|
def _fetch_note_cached(self, note_id):
|
||||||
"""Lädt eine Note aus Qdrant mit Session-Caching."""
|
if note_id in self._note_cache: return self._note_cache[note_id]
|
||||||
if not note_id:
|
res, _ = self.client.scroll(
|
||||||
return None
|
collection_name=self.notes_col,
|
||||||
if note_id in self._note_cache:
|
scroll_filter=models.Filter(must=[models.FieldCondition(key="note_id", match=models.MatchValue(value=note_id))]),
|
||||||
return self._note_cache[note_id]
|
limit=1, with_payload=True
|
||||||
|
)
|
||||||
try:
|
if res:
|
||||||
res, _ = self.client.scroll(
|
self._note_cache[note_id] = res[0].payload
|
||||||
collection_name=self.notes_col,
|
return res[0].payload
|
||||||
scroll_filter=models.Filter(must=[
|
|
||||||
models.FieldCondition(key="note_id", match=models.MatchValue(value=note_id))
|
|
||||||
]),
|
|
||||||
limit=1, with_payload=True
|
|
||||||
)
|
|
||||||
if res and res[0].payload:
|
|
||||||
payload = res[0].payload
|
|
||||||
self._note_cache[note_id] = payload
|
|
||||||
return payload
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def _resolve_note_from_ref(self, ref_str):
|
def _resolve_note_from_ref(self, ref_str):
|
||||||
"""
|
"""
|
||||||
Löst eine Referenz (ID, Chunk-ID oder Wikilink mit Anker) auf eine Note auf.
|
Löst eine Referenz zu einer Note Payload auf.
|
||||||
Bereinigt Anker (#) vor der Suche.
|
|
||||||
|
WICHTIG: Wenn ref_str ein Titel#Abschnitt Format hat, wird nur der Titel-Teil verwendet.
|
||||||
|
Unterstützt:
|
||||||
|
- Note-ID: "20250101-meine-note"
|
||||||
|
- Chunk-ID: "20250101-meine-note#c01"
|
||||||
|
- Titel: "Meine Prinzipien 2025"
|
||||||
|
- Titel#Abschnitt: "Meine Prinzipien 2025#P3 – Disziplin" (trennt Abschnitt ab, sucht nur nach Titel)
|
||||||
"""
|
"""
|
||||||
if not ref_str:
|
if not ref_str:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
if ref_str in self._ref_resolution_cache:
|
# Fall A: Enthält # (kann Chunk-ID oder Titel#Abschnitt sein)
|
||||||
return self._ref_resolution_cache[ref_str]
|
|
||||||
|
|
||||||
# Fragment-Behandlung: Trenne Anker ab
|
|
||||||
base_ref = ref_str.split("#")[0].strip()
|
|
||||||
|
|
||||||
# 1. Versuch: Direkte Note-ID Suche
|
|
||||||
note = self._fetch_note_cached(base_ref)
|
|
||||||
if note:
|
|
||||||
self._ref_resolution_cache[ref_str] = note
|
|
||||||
return note
|
|
||||||
|
|
||||||
# 2. Versuch: Titel-Suche (Keyword-Match)
|
|
||||||
try:
|
|
||||||
res, _ = self.client.scroll(
|
|
||||||
collection_name=self.notes_col,
|
|
||||||
scroll_filter=models.Filter(must=[
|
|
||||||
models.FieldCondition(key="title", match=models.MatchValue(value=base_ref))
|
|
||||||
]),
|
|
||||||
limit=1, with_payload=True
|
|
||||||
)
|
|
||||||
if res and res[0].payload:
|
|
||||||
payload = res[0].payload
|
|
||||||
self._ref_resolution_cache[ref_str] = payload
|
|
||||||
return payload
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
# 3. Versuch: Auflösung über Chunks
|
|
||||||
if "#" in ref_str:
|
if "#" in ref_str:
|
||||||
try:
|
try:
|
||||||
res_chunk = self.client.retrieve(self.chunks_col, ids=[ref_str], with_payload=True)
|
# Versuch 1: Chunk ID direkt (Format: note_id#c01)
|
||||||
if res_chunk and res_chunk[0].payload:
|
res = self.client.retrieve(self.chunks_col, ids=[ref_str], with_payload=True)
|
||||||
note_id = res_chunk[0].payload.get("note_id")
|
if res and res[0].payload:
|
||||||
note = self._fetch_note_cached(note_id)
|
note_id = res[0].payload.get("note_id")
|
||||||
if note:
|
if note_id:
|
||||||
self._ref_resolution_cache[ref_str] = note
|
return self._fetch_note_cached(note_id)
|
||||||
return note
|
except:
|
||||||
except Exception:
|
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
# Versuch 2: NoteID#Section (Hash abtrennen und als Note-ID versuchen)
|
||||||
|
# z.B. "20250101-meine-note#Abschnitt" -> "20250101-meine-note"
|
||||||
|
possible_note_id = ref_str.split("#")[0].strip()
|
||||||
|
note = self._fetch_note_cached(possible_note_id)
|
||||||
|
if note:
|
||||||
|
return note
|
||||||
|
|
||||||
|
# Versuch 3: Titel#Abschnitt (Hash abtrennen und als Titel suchen)
|
||||||
|
# z.B. "Meine Prinzipien 2025#P3 – Disziplin" -> "Meine Prinzipien 2025"
|
||||||
|
# WICHTIG: target_id enthält nur den Titel, daher suchen wir nur nach dem Titel-Teil
|
||||||
|
possible_title = ref_str.split("#")[0].strip()
|
||||||
|
if possible_title:
|
||||||
|
res, _ = self.client.scroll(
|
||||||
|
collection_name=self.notes_col,
|
||||||
|
scroll_filter=models.Filter(must=[
|
||||||
|
models.FieldCondition(key="title", match=models.MatchValue(value=possible_title))
|
||||||
|
]),
|
||||||
|
limit=1, with_payload=True
|
||||||
|
)
|
||||||
|
if res and res[0].payload:
|
||||||
|
payload = res[0].payload
|
||||||
|
self._note_cache[payload['note_id']] = payload
|
||||||
|
return payload
|
||||||
|
|
||||||
|
# Fallback: Text-Suche für Fuzzy-Matching
|
||||||
|
res, _ = self.client.scroll(
|
||||||
|
collection_name=self.notes_col,
|
||||||
|
scroll_filter=models.Filter(must=[
|
||||||
|
models.FieldCondition(key="title", match=models.MatchText(text=possible_title))
|
||||||
|
]),
|
||||||
|
limit=10, with_payload=True
|
||||||
|
)
|
||||||
|
if res:
|
||||||
|
# Nimm das erste Ergebnis, das exakt oder beginnend mit possible_title übereinstimmt
|
||||||
|
for r in res:
|
||||||
|
if r.payload:
|
||||||
|
note_title = r.payload.get("title", "")
|
||||||
|
if note_title == possible_title or note_title.startswith(possible_title):
|
||||||
|
payload = r.payload
|
||||||
|
self._note_cache[payload['note_id']] = payload
|
||||||
|
return payload
|
||||||
|
|
||||||
|
# Fall B: Note ID direkt
|
||||||
|
note = self._fetch_note_cached(ref_str)
|
||||||
|
if note:
|
||||||
|
return note
|
||||||
|
|
||||||
|
# Fall C: Titel (exakte Übereinstimmung)
|
||||||
|
res, _ = self.client.scroll(
|
||||||
|
collection_name=self.notes_col,
|
||||||
|
scroll_filter=models.Filter(must=[
|
||||||
|
models.FieldCondition(key="title", match=models.MatchValue(value=ref_str))
|
||||||
|
]),
|
||||||
|
limit=1, with_payload=True
|
||||||
|
)
|
||||||
|
if res and res[0].payload:
|
||||||
|
payload = res[0].payload
|
||||||
|
self._note_cache[payload['note_id']] = payload
|
||||||
|
return payload
|
||||||
|
|
||||||
|
# Fall D: Titel (Text-Suche für Fuzzy-Matching)
|
||||||
|
res, _ = self.client.scroll(
|
||||||
|
collection_name=self.notes_col,
|
||||||
|
scroll_filter=models.Filter(must=[
|
||||||
|
models.FieldCondition(key="title", match=models.MatchText(text=ref_str))
|
||||||
|
]),
|
||||||
|
limit=1, with_payload=True
|
||||||
|
)
|
||||||
|
if res and res[0].payload:
|
||||||
|
payload = res[0].payload
|
||||||
|
self._note_cache[payload['note_id']] = payload
|
||||||
|
return payload
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def _add_node_to_dict(self, node_dict, note_payload, level=1):
|
def _add_node_to_dict(self, node_dict, note_payload, level=1):
|
||||||
"""Erstellt ein Node-Objekt für streamlit-agraph mit Styling."""
|
|
||||||
nid = note_payload.get("note_id")
|
nid = note_payload.get("note_id")
|
||||||
if not nid or nid in node_dict:
|
if not nid or nid in node_dict: return
|
||||||
return
|
|
||||||
|
|
||||||
ntype = note_payload.get("type", "default")
|
ntype = note_payload.get("type", "default")
|
||||||
color = GRAPH_COLORS.get(ntype, GRAPH_COLORS.get("default", "#8395a7"))
|
color = GRAPH_COLORS.get(ntype, GRAPH_COLORS["default"])
|
||||||
|
|
||||||
|
# Basis-Tooltip (wird später erweitert)
|
||||||
tooltip = f"Titel: {note_payload.get('title')}\nTyp: {ntype}"
|
tooltip = f"Titel: {note_payload.get('title')}\nTyp: {ntype}"
|
||||||
|
|
||||||
size = 45 if level == 0 else (25 if level == 1 else 15)
|
if level == 0: size = 45
|
||||||
|
elif level == 1: size = 25
|
||||||
|
else: size = 15
|
||||||
|
|
||||||
node_dict[nid] = Node(
|
node_dict[nid] = Node(
|
||||||
id=nid,
|
id=nid,
|
||||||
label=note_payload.get('title', nid),
|
label=note_payload.get('title', nid),
|
||||||
|
|
|
||||||
|
|
@ -1,10 +1,10 @@
|
||||||
"""
|
"""
|
||||||
FILE: app/models/dto.py
|
FILE: app/models/dto.py
|
||||||
DESCRIPTION: Pydantic-Modelle (DTOs) für Request/Response Bodies. Definiert das API-Schema.
|
DESCRIPTION: Pydantic-Modelle (DTOs) für Request/Response Bodies. Definiert das API-Schema.
|
||||||
VERSION: 0.6.6 (WP-22 Debug & Stability Update)
|
VERSION: 0.6.7 (WP-Fix: Target Section Support)
|
||||||
STATUS: Active
|
STATUS: Active
|
||||||
DEPENDENCIES: pydantic, typing, uuid
|
DEPENDENCIES: pydantic, typing, uuid
|
||||||
LAST_ANALYSIS: 2025-12-18
|
LAST_ANALYSIS: 2025-12-29
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
@ -43,6 +43,7 @@ class EdgeDTO(BaseModel):
|
||||||
direction: Literal["out", "in", "undirected"] = "out"
|
direction: Literal["out", "in", "undirected"] = "out"
|
||||||
provenance: Optional[Literal["explicit", "rule", "smart", "structure"]] = "explicit"
|
provenance: Optional[Literal["explicit", "rule", "smart", "structure"]] = "explicit"
|
||||||
confidence: float = 1.0
|
confidence: float = 1.0
|
||||||
|
target_section: Optional[str] = None # Neu: Speichert den Anker (z.B. #Abschnitt)
|
||||||
|
|
||||||
|
|
||||||
# --- Request Models ---
|
# --- Request Models ---
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,7 @@
|
||||||
doc_type: glossary
|
doc_type: glossary
|
||||||
audience: all
|
audience: all
|
||||||
status: active
|
status: active
|
||||||
version: 2.8.1
|
version: 2.9.1
|
||||||
context: "Zentrales Glossar für Mindnet v2.8. Enthält Definitionen zu Hybrid-Cloud Resilienz, WP-14 Modularisierung, WP-15b Two-Pass Ingestion und Mistral-safe Parsing."
|
context: "Zentrales Glossar für Mindnet v2.8. Enthält Definitionen zu Hybrid-Cloud Resilienz, WP-14 Modularisierung, WP-15b Two-Pass Ingestion und Mistral-safe Parsing."
|
||||||
---
|
---
|
||||||
|
|
||||||
|
|
@ -14,7 +14,7 @@ context: "Zentrales Glossar für Mindnet v2.8. Enthält Definitionen zu Hybrid-C
|
||||||
|
|
||||||
* **Note:** Repräsentiert eine Markdown-Datei. Die fachliche Haupteinheit. Verfügt über einen **Status** (stable, draft, system), der das Scoring beeinflusst.
|
* **Note:** Repräsentiert eine Markdown-Datei. Die fachliche Haupteinheit. Verfügt über einen **Status** (stable, draft, system), der das Scoring beeinflusst.
|
||||||
* **Chunk:** Ein Textabschnitt einer Note. Die technische Sucheinheit (Vektor).
|
* **Chunk:** Ein Textabschnitt einer Note. Die technische Sucheinheit (Vektor).
|
||||||
* **Edge:** Eine gerichtete Verbindung zwischen zwei Knoten. Wird in WP-22 durch die Registry validiert.
|
* **Edge:** Eine gerichtete Verbindung zwischen zwei Knoten. Wird in WP-22 durch die Registry validiert. Seit v2.9.1 unterstützt Edges **Section-basierte Links** (`target_section`), sodass mehrere Kanten zwischen denselben Knoten existieren können, wenn sie auf verschiedene Abschnitte zeigen.
|
||||||
* **Vault:** Der lokale Ordner mit den Markdown-Dateien (Source of Truth).
|
* **Vault:** Der lokale Ordner mit den Markdown-Dateien (Source of Truth).
|
||||||
* **Frontmatter:** Der YAML-Header am Anfang einer Notiz (enthält `id`, `type`, `title`, `status`).
|
* **Frontmatter:** Der YAML-Header am Anfang einer Notiz (enthält `id`, `type`, `title`, `status`).
|
||||||
|
|
||||||
|
|
@ -47,4 +47,7 @@ context: "Zentrales Glossar für Mindnet v2.8. Enthält Definitionen zu Hybrid-C
|
||||||
* **Two-Pass Workflow (WP-15b):** Optimiertes Ingestion-Verfahren:
|
* **Two-Pass Workflow (WP-15b):** Optimiertes Ingestion-Verfahren:
|
||||||
* **Pass 1 (Pre-Scan):** Schnelles Scannen aller Dateien zur Befüllung des LocalBatchCache.
|
* **Pass 1 (Pre-Scan):** Schnelles Scannen aller Dateien zur Befüllung des LocalBatchCache.
|
||||||
* **Pass 2 (Semantic Processing):** Tiefenverarbeitung (Chunking, Embedding, Validierung) nur für geänderte Dateien.
|
* **Pass 2 (Semantic Processing):** Tiefenverarbeitung (Chunking, Embedding, Validierung) nur für geänderte Dateien.
|
||||||
* **Circular Import Registry (WP-14):** Entkopplung von Kern-Logik (wie Textbereinigung) in eine neutrale `registry.py`, um Abhängigkeitsschleifen zwischen Diensten und Ingestion-Utilities zu verhindern.
|
* **Circular Import Registry (WP-14):** Entkopplung von Kern-Logik (wie Textbereinigung) in eine neutrale `registry.py`, um Abhängigkeitsschleifen zwischen Diensten und Ingestion-Utilities zu verhindern.
|
||||||
|
* **Deep-Link / Section-basierter Link:** Ein Link wie `[[Note#Section]]`, der auf einen spezifischen Abschnitt innerhalb einer Note verweist. Seit v2.9.1 wird dieser in `target_id="Note"` und `target_section="Section"` aufgeteilt, um "Phantom-Knoten" zu vermeiden und Multigraph-Support zu ermöglichen.
|
||||||
|
* **Atomic Section Logic (v3.9.9):** Chunking-Verfahren, das Sektions-Überschriften und deren Inhalte atomar in Chunks hält (Pack-and-Carry-Over). Verhindert, dass Überschriften über Chunk-Grenzen hinweg getrennt werden.
|
||||||
|
* **Registry-First Profiling (v2.13.12):** Hierarchische Auflösung des Chunking-Profils: Frontmatter > types.yaml Typ-Config > Global Defaults. Stellt sicher, dass Note-Typen automatisch das korrekte Profil erhalten.
|
||||||
|
|
@ -3,7 +3,7 @@ doc_type: user_manual
|
||||||
audience: user, author
|
audience: user, author
|
||||||
scope: vault, markdown, schema
|
scope: vault, markdown, schema
|
||||||
status: active
|
status: active
|
||||||
version: 2.8.0
|
version: 2.9.1
|
||||||
context: "Regelwerk für das Erstellen von Notizen im Vault. Die 'Source of Truth' für Autoren."
|
context: "Regelwerk für das Erstellen von Notizen im Vault. Die 'Source of Truth' für Autoren."
|
||||||
---
|
---
|
||||||
|
|
||||||
|
|
@ -208,6 +208,12 @@ Dies ist die **mächtigste** Methode. Du sagst dem System explizit, **wie** Ding
|
||||||
> "Daher [[rel:depends_on Qdrant]]."
|
> "Daher [[rel:depends_on Qdrant]]."
|
||||||
> "Dieses Konzept ist [[rel:similar_to Pinecone]]."
|
> "Dieses Konzept ist [[rel:similar_to Pinecone]]."
|
||||||
|
|
||||||
|
**Deep-Links zu Abschnitten (v2.9.1):**
|
||||||
|
Du kannst auch auf spezifische Abschnitte innerhalb einer Note verlinken:
|
||||||
|
> "Siehe [[rel:based_on Mein Leitbild#P3 – Disziplin]]."
|
||||||
|
|
||||||
|
Das System trennt automatisch den Note-Namen (`Mein Leitbild`) vom Abschnitts-Namen (`P3 – Disziplin`), sodass mehrere Links zur gleichen Note möglich sind, wenn sie auf verschiedene Abschnitte zeigen.
|
||||||
|
|
||||||
**Gültige Relationen:**
|
**Gültige Relationen:**
|
||||||
* `depends_on`: Hängt ab von / Benötigt.
|
* `depends_on`: Hängt ab von / Benötigt.
|
||||||
* `blocks`: Blockiert oder gefährdet (z.B. Risiko -> Projekt).
|
* `blocks`: Blockiert oder gefährdet (z.B. Risiko -> Projekt).
|
||||||
|
|
@ -226,6 +232,12 @@ Für Zusammenfassungen am Ende einer Notiz, oder eines Absatzes:
|
||||||
> [[AI Agents]]
|
> [[AI Agents]]
|
||||||
```
|
```
|
||||||
|
|
||||||
|
**Multi-Line Support (v2.9.1):**
|
||||||
|
Callout-Blocks mit mehreren Zeilen werden korrekt verarbeitet. Das System erkennt automatisch, wenn mehrere Links im gleichen Callout-Block stehen, und erstellt für jeden Link eine separate Kante (auch bei Deep-Links zu verschiedenen Sections).
|
||||||
|
|
||||||
|
**Format-agnostische De-Duplizierung:**
|
||||||
|
Wenn Kanten bereits via `[!edge]` Callout vorhanden sind, werden sie nicht mehrfach injiziert. Das System erkennt vorhandene Kanten unabhängig vom Format (Inline, Callout, Wikilink).
|
||||||
|
|
||||||
### 4.3 Implizite Bidirektionalität (Edger-Logik) [NEU] [PRÜFEN!]
|
### 4.3 Implizite Bidirektionalität (Edger-Logik) [NEU] [PRÜFEN!]
|
||||||
In Mindnet musst du Kanten **nicht** manuell in beide Richtungen pflegen. Der **Edger** übernimmt die Paarbildung automatisch im Hintergrund.
|
In Mindnet musst du Kanten **nicht** manuell in beide Richtungen pflegen. Der **Edger** übernimmt die Paarbildung automatisch im Hintergrund.
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -3,7 +3,7 @@ doc_type: concept
|
||||||
audience: architect, product_owner
|
audience: architect, product_owner
|
||||||
scope: graph, logic, provenance
|
scope: graph, logic, provenance
|
||||||
status: active
|
status: active
|
||||||
version: 2.7.0
|
version: 2.9.1
|
||||||
context: "Fachliche Beschreibung des Wissensgraphen: Knoten, Kanten, Provenance, Matrix-Logik und WP-22 Scoring-Prinzipien."
|
context: "Fachliche Beschreibung des Wissensgraphen: Knoten, Kanten, Provenance, Matrix-Logik und WP-22 Scoring-Prinzipien."
|
||||||
---
|
---
|
||||||
|
|
||||||
|
|
@ -118,8 +118,30 @@ Der Intent-Router injiziert spezifische Multiplikatoren für kanonische Typen:
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## 6. Idempotenz & Konsistenz
|
## 6. Section-basierte Links & Multigraph-Support
|
||||||
|
|
||||||
|
Seit v2.9.1 unterstützt Mindnet **Deep-Links** zu spezifischen Abschnitten innerhalb einer Note.
|
||||||
|
|
||||||
|
### 6.1 Link-Parsing
|
||||||
|
Links wie `[[Note#Section]]` werden in zwei Komponenten aufgeteilt:
|
||||||
|
* **`target_id`:** Enthält nur den Note-Namen (z.B. "Mein Leitbild")
|
||||||
|
* **`target_section`:** Enthält den Abschnitts-Namen (z.B. "P3 – Disziplin")
|
||||||
|
|
||||||
|
**Vorteil:** Verhindert "Phantom-Knoten", die durch das Einbeziehen des Anchors in die `target_id` entstanden wären.
|
||||||
|
|
||||||
|
### 6.2 Multigraph-Support
|
||||||
|
Die Edge-ID enthält nun einen `variant`-Parameter (die Section), sodass mehrere Kanten zwischen denselben Knoten existieren können, wenn sie auf verschiedene Sections zeigen:
|
||||||
|
* `[[Note#Section1]]` → Edge-ID: `src->tgt:kind@Section1`
|
||||||
|
* `[[Note#Section2]]` → Edge-ID: `src->tgt:kind@Section2`
|
||||||
|
|
||||||
|
### 6.3 Semantische Deduplizierung
|
||||||
|
Die Deduplizierung basiert auf dem `src->tgt:kind@sec` Key, um sicherzustellen, dass identische Links (gleiche Quelle, Ziel, Typ und Section) nicht mehrfach erstellt werden.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 7. Idempotenz & Konsistenz
|
||||||
|
|
||||||
Das System garantiert fachliche Konsistenz auch bei mehrfachen Importen.
|
Das System garantiert fachliche Konsistenz auch bei mehrfachen Importen.
|
||||||
* **Stabile IDs:** Deterministische IDs verhindern Duplikate bei Re-Imports.
|
* **Stabile IDs:** Deterministische IDs verhindern Duplikate bei Re-Imports.
|
||||||
* **Deduplizierung:** Kanten werden anhand ihrer Identität erkannt. Die "stärkere" Provenance gewinnt.
|
* **Deduplizierung:** Kanten werden anhand ihrer Identität (inkl. Section) erkannt. Die "stärkere" Provenance gewinnt.
|
||||||
|
* **Format-agnostische Erkennung:** Kanten werden unabhängig vom Format (Inline, Callout, Wikilink) erkannt, um Dopplungen zu vermeiden.
|
||||||
|
|
@ -144,8 +144,11 @@ Lädt den Subgraphen um eine Note herum.
|
||||||
"kind": "depends_on",
|
"kind": "depends_on",
|
||||||
"source": "uuid",
|
"source": "uuid",
|
||||||
"target": "uuid",
|
"target": "uuid",
|
||||||
|
"target_section": "P3 – Disziplin", // Optional: Abschnitts-Name bei Deep-Links
|
||||||
"weight": 1.4,
|
"weight": 1.4,
|
||||||
"direction": "out"
|
"direction": "out",
|
||||||
|
"provenance": "explicit",
|
||||||
|
"confidence": 1.0
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"stats": {
|
"stats": {
|
||||||
|
|
|
||||||
|
|
@ -3,7 +3,7 @@ doc_type: technical_reference
|
||||||
audience: developer, architect
|
audience: developer, architect
|
||||||
scope: database, qdrant, schema
|
scope: database, qdrant, schema
|
||||||
status: active
|
status: active
|
||||||
version: 2.8.0
|
version: 2.9.1
|
||||||
context: "Exakte Definition der Datenmodelle (Payloads) in Qdrant und Index-Anforderungen. Berücksichtigt WP-14 Modularisierung und WP-15b Multi-Hashes."
|
context: "Exakte Definition der Datenmodelle (Payloads) in Qdrant und Index-Anforderungen. Berücksichtigt WP-14 Modularisierung und WP-15b Multi-Hashes."
|
||||||
---
|
---
|
||||||
|
|
||||||
|
|
@ -96,15 +96,19 @@ Es müssen Payload-Indizes für folgende Felder existieren:
|
||||||
|
|
||||||
## 4. Edge Payload (`mindnet_edges`)
|
## 4. Edge Payload (`mindnet_edges`)
|
||||||
|
|
||||||
Gerichtete Kanten zwischen Knoten. Stark erweitert in v2.6 für Provenienz-Tracking.
|
Gerichtete Kanten zwischen Knoten. Stark erweitert in v2.6 für Provenienz-Tracking. Seit v2.9.1 unterstützt das System **Section-basierte Links** (`[[Note#Section]]`), die in `target_id` und `target_section` aufgeteilt werden.
|
||||||
|
|
||||||
**JSON-Schema:**
|
**JSON-Schema:**
|
||||||
|
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"edge_id": "string (keyword)", // Deterministischer Hash aus (src, dst, kind)
|
"edge_id": "string (keyword)", // Deterministischer Hash aus (src, dst, kind, variant)
|
||||||
|
// variant = target_section (erlaubt Multigraph für Sections)
|
||||||
"source_id": "string (keyword)", // Chunk-ID (Start)
|
"source_id": "string (keyword)", // Chunk-ID (Start)
|
||||||
"target_id": "string (keyword)", // Chunk-ID oder Note-Titel (bei Unresolved)
|
"target_id": "string (keyword)", // Chunk-ID oder Note-Titel (bei Unresolved)
|
||||||
|
// WICHTIG: Enthält NUR den Note-Namen, KEINE Section-Info
|
||||||
|
"target_section": "string (keyword)", // Optional: Abschnitts-Name (z.B. "P3 – Disziplin")
|
||||||
|
// Wird aus [[Note#Section]] extrahiert
|
||||||
"kind": "string (keyword)", // Beziehungsart (z.B. 'depends_on')
|
"kind": "string (keyword)", // Beziehungsart (z.B. 'depends_on')
|
||||||
"scope": "string (keyword)", // Immer 'chunk' (Legacy-Support: 'note')
|
"scope": "string (keyword)", // Immer 'chunk' (Legacy-Support: 'note')
|
||||||
"note_id": "string (keyword)", // Owner Note ID (Ursprung der Kante)
|
"note_id": "string (keyword)", // Owner Note ID (Ursprung der Kante)
|
||||||
|
|
@ -116,10 +120,16 @@ Gerichtete Kanten zwischen Knoten. Stark erweitert in v2.6 für Provenienz-Track
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
|
**Section-Support:**
|
||||||
|
* Links wie `[[Note#Section]]` werden in `target_id="Note"` und `target_section="Section"` aufgeteilt.
|
||||||
|
* Die Edge-ID enthält die Section als `variant`, sodass mehrere Kanten zwischen denselben Knoten existieren können, wenn sie auf verschiedene Sections zeigen.
|
||||||
|
* Semantische Deduplizierung basiert auf `src->tgt:kind@sec` Key, um "Phantom-Knoten" zu vermeiden.
|
||||||
|
|
||||||
**Erforderliche Indizes:**
|
**Erforderliche Indizes:**
|
||||||
Es müssen Payload-Indizes für folgende Felder existieren:
|
Es müssen Payload-Indizes für folgende Felder existieren:
|
||||||
* `source_id`
|
* `source_id`
|
||||||
* `target_id`
|
* `target_id`
|
||||||
|
* `target_section` (neu: Keyword-Index für Section-basierte Filterung)
|
||||||
* `kind`
|
* `kind`
|
||||||
* `scope`
|
* `scope`
|
||||||
* `note_id`
|
* `note_id`
|
||||||
|
|
|
||||||
|
|
@ -3,7 +3,7 @@ doc_type: technical_reference
|
||||||
audience: developer, frontend_architect
|
audience: developer, frontend_architect
|
||||||
scope: architecture, graph_viz, state_management
|
scope: architecture, graph_viz, state_management
|
||||||
status: active
|
status: active
|
||||||
version: 2.7.0
|
version: 2.9.1
|
||||||
context: "Technische Dokumentation des modularen Streamlit-Frontends, der Graph-Engines und des Editors."
|
context: "Technische Dokumentation des modularen Streamlit-Frontends, der Graph-Engines und des Editors."
|
||||||
---
|
---
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -3,7 +3,7 @@ doc_type: technical_reference
|
||||||
audience: developer, devops
|
audience: developer, devops
|
||||||
scope: backend, ingestion, smart_edges, edge_registry, modularization
|
scope: backend, ingestion, smart_edges, edge_registry, modularization
|
||||||
status: active
|
status: active
|
||||||
version: 2.9.0
|
version: 2.13.12
|
||||||
context: "Detaillierte technische Beschreibung der Import-Pipeline, Two-Pass-Workflow (WP-15b) und modularer Datenbank-Architektur (WP-14). Integriert Mistral-safe Parsing und Deep Fallback."
|
context: "Detaillierte technische Beschreibung der Import-Pipeline, Two-Pass-Workflow (WP-15b) und modularer Datenbank-Architektur (WP-14). Integriert Mistral-safe Parsing und Deep Fallback."
|
||||||
---
|
---
|
||||||
|
|
||||||
|
|
@ -31,9 +31,10 @@ Der Prozess ist **asynchron**, **idempotent** und wird nun in zwei logische Durc
|
||||||
4. **Edge Registry Initialisierung (WP-22):**
|
4. **Edge Registry Initialisierung (WP-22):**
|
||||||
* Laden der Singleton-Instanz der `EdgeRegistry`.
|
* Laden der Singleton-Instanz der `EdgeRegistry`.
|
||||||
* Validierung der Vokabular-Datei unter `MINDNET_VOCAB_PATH`.
|
* Validierung der Vokabular-Datei unter `MINDNET_VOCAB_PATH`.
|
||||||
5. **Config Resolution (WP-14):**
|
5. **Config Resolution (WP-14 / v2.13.12):**
|
||||||
* Bestimmung von `chunking_profile` und `retriever_weight` via zentraler `TypeRegistry`.
|
* Bestimmung von `chunking_profile` und `retriever_weight` via zentraler `TypeRegistry`.
|
||||||
* **Priorität:** 1. Frontmatter (Override) -> 2. `types.yaml` (Type) -> 3. Global Default.
|
* **Priorität:** 1. Frontmatter (Override) -> 2. `types.yaml` (Type) -> 3. Global Default.
|
||||||
|
* **Registry-First Profiling:** Automatische Anwendung der korrekten Profile basierend auf dem Note-Typ (z.B. `value` nutzt automatisch `structured_smart_edges_strict`).
|
||||||
6. **LocalBatchCache & Summary Generation (WP-15b):**
|
6. **LocalBatchCache & Summary Generation (WP-15b):**
|
||||||
* Erstellung von Kurz-Zusammenfassungen für jede Note.
|
* Erstellung von Kurz-Zusammenfassungen für jede Note.
|
||||||
* Speicherung im `batch_cache` als Referenzrahmen für die spätere Kantenvalidierung.
|
* Speicherung im `batch_cache` als Referenzrahmen für die spätere Kantenvalidierung.
|
||||||
|
|
@ -126,19 +127,44 @@ Das Chunking ist profilbasiert und bezieht seine Konfiguration dynamisch aus der
|
||||||
| `sliding_smart_edges`| `sliding_window` | Max: 600, Target: 400 | Fließtexte (Projekte). |
|
| `sliding_smart_edges`| `sliding_window` | Max: 600, Target: 400 | Fließtexte (Projekte). |
|
||||||
| `structured_smart_edges` | `by_heading` | `strict: false` | Strukturierte Texte. |
|
| `structured_smart_edges` | `by_heading` | `strict: false` | Strukturierte Texte. |
|
||||||
|
|
||||||
### 3.2 Die `by_heading` Logik (v2.9 Hybrid)
|
### 3.2 Die `by_heading` Logik (v3.9.9 Atomic Section Logic)
|
||||||
|
|
||||||
Die Strategie `by_heading` zerlegt Texte anhand ihrer Struktur (Überschriften). Sie unterstützt ein "Safety Net" gegen zu große Chunks.
|
Die Strategie `by_heading` implementiert seit v3.9.9 das **"Pack-and-Carry-Over"** Verfahren (Regel 1-3), um Sektions-Überschriften und deren Inhalte atomar in Chunks zu halten.
|
||||||
|
|
||||||
* **Split Level:** Definiert die Tiefe (z.B. `2` = H1 & H2 triggern Split).
|
**Kernprinzipien:**
|
||||||
* **Modus "Strict" (`strict_heading_split: true`):**
|
* **Atomic Section Logic:** Überschriften und deren Inhalte werden als atomare Einheiten behandelt und nicht über Chunk-Grenzen hinweg getrennt.
|
||||||
* Jede Überschrift (`<= split_level`) erzwingt einen neuen Chunk.
|
* **H1-Context Preservation:** Der Dokumenttitel (H1) wird zuverlässig als Breadcrumb in das Embedding-Fenster (`window`) aller Chunks injiziert.
|
||||||
* *Merge-Check:* Wenn der vorherige Chunk leer war (nur Überschriften), wird gemergt.
|
* **Signature Alignment:** Parameter-Synchronisierung zwischen Orchestrator und Strategien (`context_prefix` statt `doc_title`).
|
||||||
* *Safety Net:* Wird ein Abschnitt zu lang (> `max` Token), wird auch ohne Überschrift getrennt.
|
|
||||||
* **Modus "Soft" (`strict_heading_split: false`):**
|
**Split Level:** Definiert die Tiefe (z.B. `2` = H1 & H2 triggern Split).
|
||||||
* **Hierarchie-Check:** Überschriften *oberhalb* des Split-Levels erzwingen **immer** einen Split.
|
|
||||||
* **Füll-Logik:** Überschriften *auf* dem Split-Level lösen nur dann einen neuen Chunk aus, wenn der aktuelle Chunk die `target`-Größe erreicht hat.
|
**Modus "Strict" (`strict_heading_split: true`):**
|
||||||
* *Safety Net:* Auch hier greift das `max` Token Limit.
|
* Jede Überschrift (`<= split_level`) erzwingt einen neuen Chunk.
|
||||||
|
* *Merge-Check:* Wenn der vorherige Chunk leer war (nur Überschriften), wird gemergt.
|
||||||
|
* *Safety Net:* Wird ein Abschnitt zu lang (> `max` Token), wird auch ohne Überschrift getrennt.
|
||||||
|
|
||||||
|
**Modus "Soft" (`strict_heading_split: false`):**
|
||||||
|
* **Hierarchie-Check:** Überschriften *oberhalb* des Split-Levels erzwingen **immer** einen Split.
|
||||||
|
* **Füll-Logik:** Überschriften *auf* dem Split-Level lösen nur dann einen neuen Chunk aus, wenn der aktuelle Chunk die `target`-Größe erreicht hat.
|
||||||
|
* **Pack-and-Carry-Over:** Wenn ein Abschnitt zu groß ist, wird er intelligent zerlegt, wobei der Rest (mit Überschrift) zurück in die Queue gelegt wird.
|
||||||
|
* *Safety Net:* Auch hier greift das `max` Token Limit.
|
||||||
|
|
||||||
|
### 3.3 Registry-First Profiling (v2.13.12)
|
||||||
|
|
||||||
|
Seit v2.13.12 nutzt der `IngestionService` die korrekte Hierarchie zur Ermittlung des Chunking-Profils:
|
||||||
|
|
||||||
|
**Priorität:**
|
||||||
|
1. **Frontmatter** (Override) - Explizite `chunking_profile` Angabe
|
||||||
|
2. **`types.yaml` Typ-Config** - Profil basierend auf `type`
|
||||||
|
3. **Global Defaults** - Fallback auf `sliding_standard`
|
||||||
|
|
||||||
|
**Wichtig:** Ein Hard-Fallback auf `sliding_standard` erfolgt nur noch, wenn keine Konfiguration existiert. Dies stellt sicher, dass Note-Typen wie `value` automatisch das korrekte Profil (z.B. `structured_smart_edges_strict`) erhalten.
|
||||||
|
|
||||||
|
### 3.4 Deterministic Hashing (v2.13.12)
|
||||||
|
|
||||||
|
Der `full`-Hash inkludiert nun alle strategischen Parameter (z.B. `split_level`, `strict_heading_split`), sodass Konfigurationsänderungen im Frontmatter zwingend einen Re-Import auslösen.
|
||||||
|
|
||||||
|
**Impact:** Änderungen an Chunking-Parametern werden zuverlässig erkannt, auch wenn der Text unverändert bleibt.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -279,4 +279,11 @@ python3 -m scripts.reset_qdrant --mode wipe --prefix "mindnet" --yes
|
||||||
|
|
||||||
# 2. Neu importieren (Force Hash recalculation)
|
# 2. Neu importieren (Force Hash recalculation)
|
||||||
python3 -m scripts.import_markdown --vault ./vault --prefix "mindnet" --apply --force
|
python3 -m scripts.import_markdown --vault ./vault --prefix "mindnet" --apply --force
|
||||||
```
|
```
|
||||||
|
|
||||||
|
**Wichtig (v2.9.1 Migration):**
|
||||||
|
Nach dem Update auf v2.9.1 (Section-basierte Links, Multigraph-Support) ist ein vollständiger Re-Import erforderlich, um "Phantom-Knoten" zu beheben und die neue Edge-Struktur zu konsolidieren:
|
||||||
|
```bash
|
||||||
|
python3 -m scripts.import_markdown --vault ./vault --prefix "mindnet" --apply --force
|
||||||
|
```
|
||||||
|
Dies stellt sicher, dass alle bestehenden Links korrekt in `target_id` und `target_section` aufgeteilt werden.
|
||||||
99
docs/99_Archive/WP4d_merge_commit.md
Normal file
99
docs/99_Archive/WP4d_merge_commit.md
Normal file
|
|
@ -0,0 +1,99 @@
|
||||||
|
# Branch Merge Commit Message: WP4d
|
||||||
|
|
||||||
|
```
|
||||||
|
feat: Section-basierte Links, Atomic Section Chunking & Registry-First Profiling (v2.9.1)
|
||||||
|
|
||||||
|
## Graph Topology & Edge Management
|
||||||
|
|
||||||
|
### Section-basierte Links (Multigraph-Support)
|
||||||
|
- Split `[[Note#Section]]` Links in `target_id="Note"` und `target_section="Section"`
|
||||||
|
- Edge-ID enthält nun `variant` (Section), ermöglicht mehrere Kanten zwischen denselben Knoten
|
||||||
|
- Semantische Deduplizierung basiert auf `src->tgt:kind@sec` Key
|
||||||
|
- Behebt "Phantom-Knoten" durch korrekte Trennung von Note-Name und Abschnitt
|
||||||
|
|
||||||
|
**Geänderte Dateien:**
|
||||||
|
- `app/core/graph/graph_utils.py`: `parse_link_target()` für Section-Extraktion
|
||||||
|
- `app/core/graph/graph_derive_edges.py`: `target_section` in Edge-Payload
|
||||||
|
- `app/core/database/qdrant.py`: Keyword-Index für `target_section`
|
||||||
|
- `app/core/database/qdrant_points.py`: Explizites Durchreichen von `target_section`
|
||||||
|
- `app/models/dto.py`: `EdgeDTO` mit `target_section` Feld
|
||||||
|
|
||||||
|
### Extraction & Parsing Verbesserungen
|
||||||
|
- Multi-line Callout-Blocks korrekt verarbeitet (stop-check logic)
|
||||||
|
- Robuster Fallback für "headless" Blocks (split chunks)
|
||||||
|
- Liberalisierte Regex für Umlaute und Sonderzeichen in Targets
|
||||||
|
|
||||||
|
**Geänderte Dateien:**
|
||||||
|
- `app/core/graph/graph_extractors.py`: Multi-line Callout-Parser, erweiterte Regex
|
||||||
|
|
||||||
|
## Chunking & Ingestion (v3.9.9 / v2.13.12)
|
||||||
|
|
||||||
|
### Atomic Section Logic (v3.9.9)
|
||||||
|
- Vollständige Implementierung des "Pack-and-Carry-Over" Verfahrens (Regel 1-3)
|
||||||
|
- Sektions-Überschriften und Inhalte bleiben atomar in Chunks
|
||||||
|
- H1-Context Preservation: Dokumenttitel als Breadcrumb in Embedding-Fenster
|
||||||
|
- Signature Alignment: Parameter-Synchronisierung (`context_prefix` statt `doc_title`)
|
||||||
|
|
||||||
|
**Geänderte Dateien:**
|
||||||
|
- `app/core/chunking/chunking_strategies.py`: Atomic Section Logic implementiert
|
||||||
|
|
||||||
|
### Format-agnostische De-Duplizierung
|
||||||
|
- Prüfung auf vorhandene Kanten basiert auf Ziel (`target`), nicht String-Match
|
||||||
|
- Verhindert Dopplung von Kanten, die bereits via `[!edge]` Callout vorhanden sind
|
||||||
|
- Global Pool Integration für unzugeordnete Kanten
|
||||||
|
|
||||||
|
**Geänderte Dateien:**
|
||||||
|
- `app/core/chunking/chunking_propagation.py`: Ziel-basierte Prüfung
|
||||||
|
|
||||||
|
### Registry-First Profiling (v2.13.12)
|
||||||
|
- Korrekte Hierarchie: Frontmatter > types.yaml Typ-Config > Global Defaults
|
||||||
|
- Hard-Fallback auf `sliding_standard` nur wenn keine Konfiguration existiert
|
||||||
|
- Automatische Anwendung korrekter Profile basierend auf Note-Typ
|
||||||
|
|
||||||
|
### Deterministic Hashing
|
||||||
|
- `full`-Hash inkludiert strategische Parameter (`split_level`, `strict_heading_split`)
|
||||||
|
- Konfigurationsänderungen im Frontmatter lösen zwingend Re-Import aus
|
||||||
|
|
||||||
|
**Geänderte Dateien:**
|
||||||
|
- `app/core/ingestion/ingestion_processor.py`: Registry-First Profiling, Deterministic Hashing
|
||||||
|
|
||||||
|
## Impact & Breaking Changes
|
||||||
|
|
||||||
|
### Migration erforderlich
|
||||||
|
**WICHTIG:** Vollständiger Re-Import erforderlich für bestehende Vaults:
|
||||||
|
```bash
|
||||||
|
python3 -m scripts.import_markdown --vault ./vault --prefix "mindnet" --apply --force
|
||||||
|
```
|
||||||
|
|
||||||
|
**Grund:**
|
||||||
|
- Behebt "Phantom-Knoten" durch korrekte Aufteilung von `[[Note#Section]]` Links
|
||||||
|
- Konsolidiert Edge-Struktur mit `target_section` Feld
|
||||||
|
- Aktualisiert Chunking basierend auf neuen Strategien
|
||||||
|
|
||||||
|
### Fixes
|
||||||
|
- ✅ Resolves: Mehrere Links zur gleichen Note in einem Callout-Block wurden zu einer Kante gemergt
|
||||||
|
- ✅ Resolves: "Phantom-Knoten" durch Einbeziehung des Anchors in `target_id`
|
||||||
|
- ✅ Resolves: Redundante `[[rel:...]]` Links in Chunks
|
||||||
|
- ✅ Resolves: Inkonsistente Metadaten in Qdrant durch Registry-First Profiling
|
||||||
|
|
||||||
|
## Dokumentation
|
||||||
|
|
||||||
|
Alle relevanten Dokumente aktualisiert:
|
||||||
|
- `03_tech_data_model.md`: Edge Payload Schema mit `target_section`
|
||||||
|
- `02_concept_graph_logic.md`: Section-basierte Links & Multigraph-Support
|
||||||
|
- `03_tech_ingestion_pipeline.md`: Chunking-Strategien, Registry-First Profiling
|
||||||
|
- `03_tech_api_reference.md`: EdgeDTO mit `target_section`
|
||||||
|
- `01_knowledge_design.md`: Deep-Links dokumentiert
|
||||||
|
- `00_glossary.md`: Neue Begriffe ergänzt
|
||||||
|
- `04_admin_operations.md`: Migration-Hinweis
|
||||||
|
|
||||||
|
## Versionen
|
||||||
|
|
||||||
|
- Graph Topology: v2.9.1
|
||||||
|
- Chunking Strategies: v3.9.9
|
||||||
|
- Ingestion Processor: v2.13.12
|
||||||
|
- API DTO: v0.6.7
|
||||||
|
|
||||||
|
Closes #[issue-number]
|
||||||
|
```
|
||||||
|
|
||||||
236
docs/99_Archive/WP4d_release_notes.md
Normal file
236
docs/99_Archive/WP4d_release_notes.md
Normal file
|
|
@ -0,0 +1,236 @@
|
||||||
|
# Release Notes: Mindnet v2.9.1 (WP4d)
|
||||||
|
|
||||||
|
**Release Date:** 2025-01-XX
|
||||||
|
**Type:** Feature Release mit Breaking Changes
|
||||||
|
**Branch:** WP4d
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🎯 Übersicht
|
||||||
|
|
||||||
|
Diese Version führt **Section-basierte Links** ein, verbessert das Chunking durch **Atomic Section Logic** und implementiert **Registry-First Profiling** für konsistentere Konfigurationsauflösung. Die Änderungen erfordern einen **vollständigen Re-Import** bestehender Vaults.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## ✨ Neue Features
|
||||||
|
|
||||||
|
### Section-basierte Links (Deep-Links)
|
||||||
|
|
||||||
|
Mindnet unterstützt nun **Deep-Links** zu spezifischen Abschnitten innerhalb einer Note:
|
||||||
|
|
||||||
|
```markdown
|
||||||
|
[[rel:based_on Mein Leitbild#P3 – Disziplin]]
|
||||||
|
```
|
||||||
|
|
||||||
|
**Vorteile:**
|
||||||
|
- Mehrere Links zur gleichen Note möglich (verschiedene Sections)
|
||||||
|
- Präzise Kontext-Ladung (nur relevanter Abschnitt)
|
||||||
|
- Keine "Phantom-Knoten" mehr durch korrekte Trennung von Note-Name und Abschnitt
|
||||||
|
|
||||||
|
**Technische Details:**
|
||||||
|
- Links werden in `target_id="Note"` und `target_section="Section"` aufgeteilt
|
||||||
|
- Edge-ID enthält `variant` (Section) für Multigraph-Support
|
||||||
|
- Semantische Deduplizierung basiert auf `src->tgt:kind@sec` Key
|
||||||
|
|
||||||
|
### Atomic Section Logic (Chunking v3.9.9)
|
||||||
|
|
||||||
|
Das Chunking hält nun Sektions-Überschriften und deren Inhalte **atomar** zusammen:
|
||||||
|
|
||||||
|
**"Pack-and-Carry-Over" Verfahren:**
|
||||||
|
- Regel 1 & 2: Sektionen werden zusammengepackt, wenn sie in das Token-Limit passen
|
||||||
|
- Regel 3: Zu große Sektionen werden intelligent zerlegt, Rest wird zurück in Queue gelegt
|
||||||
|
- H1-Context Preservation: Dokumenttitel wird als Breadcrumb in alle Chunks injiziert
|
||||||
|
|
||||||
|
**Vorteile:**
|
||||||
|
- Keine getrennten Überschriften mehr
|
||||||
|
- Bessere semantische Kohärenz in Chunks
|
||||||
|
- Verbesserte Retrieval-Qualität durch vollständigen Kontext
|
||||||
|
|
||||||
|
### Registry-First Profiling (v2.13.12)
|
||||||
|
|
||||||
|
Die Konfigurationsauflösung folgt nun einer klaren Hierarchie:
|
||||||
|
|
||||||
|
1. **Frontmatter** (höchste Priorität)
|
||||||
|
2. **types.yaml Typ-Config**
|
||||||
|
3. **Global Defaults**
|
||||||
|
|
||||||
|
**Impact:**
|
||||||
|
- Note-Typen wie `value` erhalten automatisch das korrekte Profil (`structured_smart_edges_strict`)
|
||||||
|
- Keine manuellen Overrides mehr nötig für Standard-Typen
|
||||||
|
- Konsistente Metadaten in Qdrant
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🔧 Verbesserungen
|
||||||
|
|
||||||
|
### Extraction & Parsing
|
||||||
|
|
||||||
|
- **Multi-line Callout-Blocks:** Korrekte Verarbeitung von mehrzeiligen `[!edge]` Callouts
|
||||||
|
- **Robuste Fallbacks:** "Headless" Blocks werden korrekt behandelt
|
||||||
|
- **Liberalisierte Regex:** Unterstützung für Umlaute und Sonderzeichen in Link-Targets
|
||||||
|
|
||||||
|
### Format-agnostische De-Duplizierung
|
||||||
|
|
||||||
|
- Kanten werden unabhängig vom Format (Inline, Callout, Wikilink) erkannt
|
||||||
|
- Verhindert Dopplungen, wenn Kanten bereits via `[!edge]` Callout vorhanden sind
|
||||||
|
- Ziel-basierte Prüfung statt String-Match
|
||||||
|
|
||||||
|
### Deterministic Hashing
|
||||||
|
|
||||||
|
- `full`-Hash inkludiert strategische Parameter (`split_level`, `strict_heading_split`)
|
||||||
|
- Konfigurationsänderungen im Frontmatter lösen zwingend Re-Import aus
|
||||||
|
- Zuverlässigere Change Detection
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🐛 Bugfixes
|
||||||
|
|
||||||
|
- ✅ **Behoben:** Mehrere Links zur gleichen Note in einem Callout-Block wurden zu einer Kante gemergt
|
||||||
|
- ✅ **Behoben:** "Phantom-Knoten" durch Einbeziehung des Anchors in `target_id`
|
||||||
|
- ✅ **Behoben:** Redundante `[[rel:...]]` Links in Chunks
|
||||||
|
- ✅ **Behoben:** Inkonsistente Metadaten in Qdrant durch fehlerhafte Profil-Auflösung
|
||||||
|
- ✅ **Behoben:** `TypeError` durch Parameter-Mismatch zwischen Orchestrator und Strategien
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## ⚠️ Breaking Changes & Migration
|
||||||
|
|
||||||
|
### Migration erforderlich
|
||||||
|
|
||||||
|
**WICHTIG:** Nach dem Update auf v2.9.1 ist ein **vollständiger Re-Import** erforderlich:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 -m scripts.import_markdown --vault ./vault --prefix "mindnet" --apply --force
|
||||||
|
```
|
||||||
|
|
||||||
|
**Warum?**
|
||||||
|
- Behebt "Phantom-Knoten" durch korrekte Aufteilung von `[[Note#Section]]` Links
|
||||||
|
- Konsolidiert Edge-Struktur mit neuem `target_section` Feld
|
||||||
|
- Aktualisiert Chunking basierend auf Atomic Section Logic
|
||||||
|
|
||||||
|
**Was passiert beim Re-Import?**
|
||||||
|
- Alle bestehenden Links werden neu geparst und in `target_id` + `target_section` aufgeteilt
|
||||||
|
- Chunks werden mit neuer Atomic Section Logic neu generiert
|
||||||
|
- Edge-Struktur wird konsolidiert (Multigraph-Support)
|
||||||
|
|
||||||
|
**Dauer:** Abhängig von der Vault-Größe (typischerweise 5-30 Minuten)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 📚 API-Änderungen
|
||||||
|
|
||||||
|
### EdgeDTO erweitert
|
||||||
|
|
||||||
|
```python
|
||||||
|
class EdgeDTO(BaseModel):
|
||||||
|
# ... bestehende Felder ...
|
||||||
|
target_section: Optional[str] = None # Neu: Abschnitts-Name
|
||||||
|
```
|
||||||
|
|
||||||
|
**Impact für API-Consumer:**
|
||||||
|
- Graph-Endpunkte (`/graph/{note_id}`) enthalten nun `target_section` in Edge-Objekten
|
||||||
|
- Frontend kann Section-Informationen für präzisere Visualisierung nutzen
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 📖 Dokumentation
|
||||||
|
|
||||||
|
Alle relevanten Dokumente wurden aktualisiert:
|
||||||
|
|
||||||
|
- ✅ `03_tech_data_model.md`: Edge Payload Schema mit `target_section`
|
||||||
|
- ✅ `02_concept_graph_logic.md`: Section-basierte Links & Multigraph-Support
|
||||||
|
- ✅ `03_tech_ingestion_pipeline.md`: Chunking-Strategien, Registry-First Profiling
|
||||||
|
- ✅ `03_tech_api_reference.md`: EdgeDTO mit `target_section`
|
||||||
|
- ✅ `01_knowledge_design.md`: Deep-Links dokumentiert
|
||||||
|
- ✅ `00_glossary.md`: Neue Begriffe ergänzt
|
||||||
|
- ✅ `04_admin_operations.md`: Migration-Hinweis
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🔄 Technische Details
|
||||||
|
|
||||||
|
### Geänderte Module
|
||||||
|
|
||||||
|
**Graph Topology:**
|
||||||
|
- `app/core/graph/graph_utils.py`: `parse_link_target()` für Section-Extraktion
|
||||||
|
- `app/core/graph/graph_derive_edges.py`: `target_section` in Edge-Payload
|
||||||
|
- `app/core/graph/graph_extractors.py`: Multi-line Callout-Parser
|
||||||
|
|
||||||
|
**Chunking:**
|
||||||
|
- `app/core/chunking/chunking_strategies.py`: Atomic Section Logic (v3.9.9)
|
||||||
|
- `app/core/chunking/chunking_propagation.py`: Format-agnostische De-Duplizierung
|
||||||
|
|
||||||
|
**Ingestion:**
|
||||||
|
- `app/core/ingestion/ingestion_processor.py`: Registry-First Profiling (v2.13.12), Deterministic Hashing
|
||||||
|
|
||||||
|
**Database:**
|
||||||
|
- `app/core/database/qdrant.py`: Keyword-Index für `target_section`
|
||||||
|
- `app/core/database/qdrant_points.py`: Explizites Durchreichen von `target_section`
|
||||||
|
|
||||||
|
**API:**
|
||||||
|
- `app/models/dto.py`: `EdgeDTO` mit `target_section` Feld (v0.6.7)
|
||||||
|
|
||||||
|
### Versionsnummern
|
||||||
|
|
||||||
|
- Graph Topology: **v2.9.1**
|
||||||
|
- Chunking Strategies: **v3.9.9**
|
||||||
|
- Ingestion Processor: **v2.13.12**
|
||||||
|
- API DTO: **v0.6.7**
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🚀 Upgrade-Pfad
|
||||||
|
|
||||||
|
### Für Administratoren
|
||||||
|
|
||||||
|
1. **Backup erstellen:**
|
||||||
|
```bash
|
||||||
|
docker stop qdrant
|
||||||
|
tar -czf qdrant_backup_$(date +%F).tar.gz ./qdrant_data
|
||||||
|
docker start qdrant
|
||||||
|
```
|
||||||
|
|
||||||
|
2. **Code aktualisieren:**
|
||||||
|
```bash
|
||||||
|
git pull origin main
|
||||||
|
source .venv/bin/activate
|
||||||
|
pip install -r requirements.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
3. **Re-Import durchführen:**
|
||||||
|
```bash
|
||||||
|
python3 -m scripts.import_markdown --vault ./vault --prefix "mindnet" --apply --force
|
||||||
|
```
|
||||||
|
|
||||||
|
4. **Services neu starten:**
|
||||||
|
```bash
|
||||||
|
sudo systemctl restart mindnet-prod
|
||||||
|
sudo systemctl restart mindnet-ui-prod
|
||||||
|
```
|
||||||
|
|
||||||
|
### Für Entwickler
|
||||||
|
|
||||||
|
- Keine Code-Änderungen erforderlich, wenn nur API genutzt wird
|
||||||
|
- Frontend kann `target_section` Feld in Edge-Objekten nutzen (optional)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 📝 Bekannte Einschränkungen
|
||||||
|
|
||||||
|
- **Migration-Dauer:** Große Vaults (>10.000 Notizen) können 30+ Minuten benötigen
|
||||||
|
- **Temporärer Speicher:** Während des Re-Imports kann der Qdrant-Speicherverbrauch temporär ansteigen
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🙏 Danksagungen
|
||||||
|
|
||||||
|
Diese Version wurde durch umfangreiche Code-Analyse und Dokumentationsprüfung ermöglicht. Besonderer Fokus lag auf:
|
||||||
|
- Konsistenz zwischen Code und Dokumentation
|
||||||
|
- Vollständige Abdeckung aller Rollen (Entwickler, Administratoren, Anwender, Tester, Deployment)
|
||||||
|
- Klare Migration-Pfade
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Vollständiger Changelog:** Siehe Git-Commits für detaillierte Änderungen
|
||||||
|
**Support:** Bei Fragen zur Migration siehe [Admin Operations Guide](../04_Operations/04_admin_operations.md)
|
||||||
|
|
||||||
Loading…
Reference in New Issue
Block a user