smarter chunker initial

This commit is contained in:
Lars 2025-12-11 22:39:35 +01:00
parent c741cc7d1b
commit a1cd0741c9
3 changed files with 274 additions and 183 deletions

View File

@ -1,13 +0,0 @@
# Per-note-type chunk sizing: token target range, hard max, and overlap range.
TYPE_SIZES = {
    "thought":    {"target": (150, 250), "max": 300, "overlap": (30, 40)},
    "experience": {"target": (250, 350), "max": 450, "overlap": (40, 60)},
    "journal":    {"target": (200, 300), "max": 400, "overlap": (30, 50)},
    "task":       {"target": (120, 200), "max": 250, "overlap": (20, 30)},
    "project":    {"target": (300, 450), "max": 600, "overlap": (50, 70)},
    "concept":    {"target": (250, 400), "max": 550, "overlap": (40, 60)},
    "source":     {"target": (200, 350), "max": 500, "overlap": (30, 50)},
}

# Profile used for any note type not listed above.
DEFAULT = {"target": (250, 350), "max": 500, "overlap": (40, 60)}


def get_sizes(note_type: str):
    """Return the size profile for *note_type* (case-insensitive), or DEFAULT."""
    key = str(note_type).lower()
    return TYPE_SIZES.get(key, DEFAULT)

View File

@ -1,41 +1,119 @@
from __future__ import annotations from __future__ import annotations
from dataclasses import dataclass from dataclasses import dataclass
from typing import List, Dict, Optional, Tuple from typing import List, Dict, Optional, Tuple, Any
import re import re
import math import math
import yaml
from pathlib import Path
from markdown_it import MarkdownIt from markdown_it import MarkdownIt
from markdown_it.token import Token from markdown_it.token import Token
from .chunk_config import get_sizes
# ==========================================
# 1. CONFIGURATION LOADER (formerly chunk_config.py)
# ==========================================

# Resolve the path to types.yaml (two levels above app/core/).
BASE_DIR = Path(__file__).resolve().parent.parent.parent
CONFIG_PATH = BASE_DIR / "types.yaml"

# Fallback profile used when types.yaml is missing or a profile name is unknown.
DEFAULT_PROFILE = {
    "strategy": "sliding_window",
    "target": 400,
    "max": 600,
    "overlap": (50, 80)
}

# Lazily-filled module-level cache for the parsed YAML config.
_CONFIG_CACHE = None
def _load_yaml_config() -> Dict[str, Any]:
    """Load types.yaml once and cache the parsed mapping.

    Returns:
        The parsed config dict; an empty dict when the file is missing,
        unreadable, or empty. The result (including the empty-dict fallback)
        is cached so the warning and the disk check are not repeated on
        every call.
    """
    global _CONFIG_CACHE
    if _CONFIG_CACHE is not None:
        return _CONFIG_CACHE

    if not CONFIG_PATH.exists():
        print(f"WARNUNG: types.yaml nicht gefunden unter {CONFIG_PATH}. Nutze Defaults.")
        # BUG FIX: previously this path was never cached, so the warning
        # fired on every lookup.
        _CONFIG_CACHE = {}
        return _CONFIG_CACHE

    try:
        with open(CONFIG_PATH, "r", encoding="utf-8") as f:
            # BUG FIX: safe_load returns None for an empty document, which
            # callers would then .get() on and crash; normalize to {}.
            data = yaml.safe_load(f) or {}
    except Exception as e:
        print(f"FEHLER beim Laden von types.yaml: {e}")
        data = {}

    _CONFIG_CACHE = data
    return _CONFIG_CACHE
def get_chunk_config(note_type: str) -> Dict[str, Any]:
    """Resolve note type -> profile name -> chunking configuration.

    Falls back to the configured default profile name, then to
    DEFAULT_PROFILE when the profile itself is unknown.
    """
    full_config = _load_yaml_config()

    # 1. All declared chunking profiles.
    profiles = full_config.get("chunking_profiles", {})

    # 2. The type definition. BUG FIX: coerce with str() so None or
    #    non-string note types fall through to the default profile instead
    #    of raising, matching the legacy get_sizes behavior.
    type_def = full_config.get("types", {}).get(str(note_type).lower(), {})

    # 3. Profile name, falling back to the global default.
    profile_name = type_def.get("chunking_profile")
    if not profile_name:
        profile_name = full_config.get("defaults", {}).get("chunking_profile", "sliding_standard")

    # 4. Build the config; copy so callers cannot mutate the cached profile.
    config = profiles.get(profile_name, DEFAULT_PROFILE).copy()

    # YAML lists become tuples for the overlap range.
    if "overlap" in config and isinstance(config["overlap"], list):
        config["overlap"] = tuple(config["overlap"])
    return config
# Legacy support for callers that still expect the old sizes-dict shape.
def get_sizes(note_type: str):
    """Adapt the profile config to the legacy (target, max, overlap) layout."""
    profile = get_chunk_config(note_type)
    midpoint = profile["target"]
    return {
        "target": (midpoint, midpoint),
        "max": profile["max"],
        "overlap": profile["overlap"],
    }
# ==========================================
# 2. CHUNKING LOGIC & PARSER
# ==========================================
# --- Text helpers ---
# Sentence boundary: terminal punctuation, whitespace, then an uppercase
# letter (incl. German umlauts), digit, German opening quote, or paren.
_SENT_SPLIT = re.compile(r'(?<=[.!?])\s+(?=[A-ZÄÖÜ0-9„(])')
_WS = re.compile(r'\s+')


def estimate_tokens(text: str) -> int:
    """Cheap token estimate: roughly one token per four characters, minimum 1."""
    char_count = len(text.strip())
    return max(1, math.ceil(char_count / 4))


def split_sentences(text: str) -> list[str]:
    """Collapse whitespace and split *text* into non-empty sentences."""
    normalized = _WS.sub(' ', text.strip())
    if not normalized:
        return []
    return [seg.strip() for seg in _SENT_SPLIT.split(normalized) if seg.strip()]
@dataclass
class RawBlock:
    # One parsed markdown block with its position in the heading hierarchy.
    kind: str                     # block kind, e.g. "heading", "paragraph", "bullet_list", "fence"
    text: str                     # plain-text content of the block
    level: Optional[int]          # heading level (1, 2, 3, ...) or None for non-headings
    section_path: str             # breadcrumb path such as "/H2 title/H3 title"
    section_title: Optional[str]  # nearest enclosing H3 (else H2) title, if any
@dataclass
class Chunk:
    # One retrievable chunk assembled from RawBlocks.
    id: str                        # "<note_id>#cNN"
    note_id: str
    index: int                     # position within the note's chunk list
    text: str                      # plain text for display
    window: str                    # text plus context header, used for embeddings
    token_count: int               # estimated token count of `text`
    section_title: Optional[str]
    section_path: str
    # NOTE(review): these two fields are elided by the diff hunk boundary but
    # every constructor call passes them by keyword — reconstructed here;
    # confirm names/order against the real file.
    neighbors_prev: Optional[str]  # id of the previous chunk, linked after assembly
    neighbors_next: Optional[str]  # id of the next chunk
    char_start: int
    char_end: int
# --- Markdown Parser ---
def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
    """Parse markdown into RawBlocks and also return the H1 document title.

    H2/H3 headings update the running section path; every other supported
    block type is captured as one RawBlock carrying that path and the
    nearest section title.
    """
    md = MarkdownIt("commonmark").enable("table")
    tokens: List[Token] = md.parse(md_text)
    blocks: List[RawBlock] = []
    h1_title = "Dokument"  # fallback title when the document has no H1
    h2, h3 = None, None
    section_path = "/"

    def get_inline_content(idx, tokens):
        # Concatenate inline-token content up to the next heading_close.
        # idx is taken by value, so the caller's index is not advanced.
        txt = ""
        while idx < len(tokens) and tokens[idx].type != "heading_close":
            if tokens[idx].type == "inline":
                txt += tokens[idx].content
            idx += 1
        return txt.strip()

    i = 0
    while i < len(tokens):
        t = tokens[i]
        if t.type == "heading_open":
            lvl = int(t.tag[1])  # tag is "h1".."h6" -> numeric level
            i += 1
            title_txt = get_inline_content(i, tokens)

            if lvl == 1:
                h1_title = title_txt
            elif lvl == 2:
                h2, h3 = title_txt, None
                section_path = f"/{h2}"
            elif lvl == 3:
                h3 = title_txt
                section_path = f"/{h2}/{h3}" if h2 else f"/{h3}"

            blocks.append(RawBlock("heading", title_txt, lvl, section_path, title_txt))
            # Advance to the heading_close token; the trailing i += 1 at the
            # bottom of the loop then steps past it.
            while i < len(tokens) and tokens[i].type != "heading_close": i += 1
        elif t.type in ("paragraph_open", "bullet_list_open", "ordered_list_open",
                        "fence", "code_block", "blockquote_open", "table_open", "hr"):
            kind = t.type.replace("_open", "")
            content = ""
            if t.type in ("fence", "code_block"):
                # Code blocks carry their content on the same token.
                content = t.content or ""
            else:
                # Collect inline content until the matching close token at
                # the same nesting level (handles nested containers).
                i += 1
                start_level = t.level
                while i < len(tokens):
                    tk = tokens[i]
                    if tk.type.replace("_close", "") == kind and tk.level == start_level and tk.type.endswith("_close"):
                        break
                    if tk.type == "inline": content += tk.content
                    elif tk.type in ("fence", "code_block"): content += "\n" + tk.content
                    elif tk.type in ("softbreak", "hardbreak"): content += "\n"
                    i += 1

            # "hr" produces no content and is therefore dropped here.
            if content.strip():
                current_sec_title = h3 if h3 else (h2 if h2 else None)
                blocks.append(RawBlock(kind, content.strip(), None, section_path, current_sec_title))
        i += 1
    return blocks, h1_title
# --- Strategies ---
def _strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], note_id: str, context_prefix: str = "") -> List[Chunk]:
    """Greedy sliding-window chunking over a flat list of RawBlocks.

    Blocks are buffered up to roughly `target` estimated tokens and flushed
    as one chunk; oversized flushes are re-split on sentence boundaries with
    a character-approximated token overlap. When `context_prefix` is given
    it is prepended to each chunk's embedding window (not to its text).
    """
    target = config.get("target", 400)
    max_tokens = config.get("max", 600)
    overlap_val = config.get("overlap", (50, 80))
    # Mean of the overlap range; a plain int is taken as-is.
    overlap = sum(overlap_val) // 2 if isinstance(overlap_val, tuple) else overlap_val

    chunks: List[Chunk] = []
    buf: List[RawBlock] = []

    def flush_buffer():
        # Emit the buffered blocks as one chunk (or several sentence-split
        # chunks when over max_tokens), then clear the buffer.
        nonlocal buf
        if not buf: return
        text_body = "\n\n".join([b.text for b in buf])
        # Section info is taken from the last buffered block.
        sec_title = buf[-1].section_title
        sec_path = buf[-1].section_path
        window_body = f"{context_prefix}\n{text_body}".strip() if context_prefix else text_body

        if estimate_tokens(text_body) > max_tokens:
            # Soft re-split on sentence boundaries.
            sentences = split_sentences(text_body)
            current_sents = []
            cur_toks = 0
            for s in sentences:
                st = estimate_tokens(s)
                if cur_toks + st > target and current_sents:
                    txt = "\n".join(current_sents)
                    win = f"{context_prefix}\n{txt}".strip() if context_prefix else txt
                    _add_chunk(txt, win, sec_title, sec_path)
                    # Carry over a tail of ~overlap tokens (4 chars/token heuristic).
                    ov_txt = " ".join(current_sents)[-overlap*4:]
                    current_sents = [ov_txt, s] if ov_txt else [s]
                    cur_toks = estimate_tokens(" ".join(current_sents))
                else:
                    current_sents.append(s)
                    cur_toks += st
            if current_sents:
                txt = "\n".join(current_sents)
                win = f"{context_prefix}\n{txt}".strip() if context_prefix else txt
                _add_chunk(txt, win, sec_title, sec_path)
        else:
            _add_chunk(text_body, window_body, sec_title, sec_path)
        buf = []

    def _add_chunk(txt, win, sec, path):
        # Append a Chunk with a sequential id; neighbors/char offsets are
        # filled in by the caller (char offsets are currently unused).
        idx = len(chunks)
        chunks.append(Chunk(
            id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx,
            text=txt, window=win, token_count=estimate_tokens(txt),
            section_title=sec, section_path=path,
            neighbors_prev=None, neighbors_next=None, char_start=0, char_end=0
        ))

    for b in blocks:
        # Flush before appending when adding this block would reach target.
        if estimate_tokens("\n\n".join([x.text for x in buf] + [b.text])) >= target:
            flush_buffer()
        buf.append(b)
    flush_buffer()
    return chunks
def _strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id: str, doc_title: str) -> List[Chunk]:
    """Hard-split chunking: one chunk per section path, in document order.

    Heading blocks themselves are skipped; their text survives via the
    breadcrumb context header prepended to each chunk's window. Sections
    exceeding `max` tokens are sub-chunked with the sliding-window strategy
    and re-indexed into the global sequence.
    """
    chunks: List[Chunk] = []
    sections: Dict[str, List[RawBlock]] = {}
    ordered = []  # section paths in first-seen (document) order
    for b in blocks:
        if b.kind == "heading": continue
        if b.section_path not in sections:
            sections[b.section_path] = []
            ordered.append(b.section_path)
        sections[b.section_path].append(b)

    for path in ordered:
        s_blocks = sections[path]
        # "/A/B" -> "A > B" breadcrumb for the context header.
        breadcrumbs = path.strip("/").replace("/", " > ")
        context_header = f"# {doc_title}\n## {breadcrumbs}"
        full_text = "\n\n".join([b.text for b in s_blocks])

        if estimate_tokens(full_text) <= config.get("max", 600):
            chunks.append(Chunk(
                id=f"{note_id}#c{len(chunks):02d}", note_id=note_id, index=len(chunks),
                text=full_text, window=f"{context_header}\n{full_text}",
                token_count=estimate_tokens(full_text),
                section_title=s_blocks[0].section_title, section_path=path,
                neighbors_prev=None, neighbors_next=None, char_start=0, char_end=0
            ))
        else:
            # Section too large: sub-chunk it, then renumber ids/indices so
            # they continue the global sequence.
            sub = _strategy_sliding_window(s_blocks, config, note_id, context_prefix=context_header)
            base = len(chunks)
            for i, sc in enumerate(sub):
                sc.index = base + i
                sc.id = f"{note_id}#c{sc.index:02d}"
                chunks.append(sc)
    return chunks
# --- Main Entry Point ---
def assemble_chunks(note_id: str, md_text: str, note_type: str) -> List[Chunk]:
    """Chunk a markdown note according to its type's chunking profile.

    Parses the markdown, dispatches to the configured strategy
    ("by_heading" or the sliding-window default), then links prev/next
    neighbor ids between consecutive chunks.
    """
    config = get_chunk_config(note_type)
    strategy = config.get("strategy", "sliding_window")
    blocks, doc_title = parse_blocks(md_text)

    if strategy == "by_heading":
        chunks = _strategy_by_heading(blocks, config, note_id, doc_title)
    else:
        chunks = _strategy_sliding_window(blocks, config, note_id)

    # Link neighbor ids for prev/next navigation.
    for i, ch in enumerate(chunks):
        ch.neighbors_prev = chunks[i - 1].id if i > 0 else None
        ch.neighbors_next = chunks[i + 1].id if i < len(chunks) - 1 else None

    # BUG FIX: the diffed version ended after the loop without returning,
    # so callers received None. (If a `return` existed beyond the diff
    # context, this is a no-op.)
    return chunks

View File

@ -1,86 +1,116 @@
version: 1.2 # Update for smart chunking config

# --- CHUNKING DEFINITIONS ---
# Central definitions of the technical chunking strategies.
chunking_profiles:
  # Standard profiles for running text (sliding window)
  sliding_short:
    strategy: sliding_window
    target: 200
    max: 350
    overlap: [30, 50]
  sliding_standard:
    strategy: sliding_window
    target: 400
    max: 600
    overlap: [50, 80]
  sliding_large:
    strategy: sliding_window
    target: 500
    max: 800
    overlap: [60, 100]
  # Smart chunking for structured notes (hard splits at headings)
  structured_strict:
    strategy: by_heading
    split_level: 2
    max: 600      # fallback limit
    target: 400   # fallback target for sub-chunking
    overlap: [50, 80] # overlap during sub-chunking

defaults:
  retriever_weight: 1.0
  chunking_profile: sliding_standard # fallback profile
  edge_defaults: []

types:
  # --- KNOWLEDGE BUILDING BLOCKS ---
  concept:
    chunking_profile: sliding_standard
    retriever_weight: 0.60
    edge_defaults: ["references", "related_to"]
  source:
    chunking_profile: sliding_standard
    retriever_weight: 0.50
    edge_defaults: []
  glossary:
    chunking_profile: sliding_short
    retriever_weight: 0.40
    edge_defaults: ["related_to"]
  # --- IDENTITY & PERSONALITY ---
  profile:
    chunking_profile: structured_strict # H2 splits matter for profiles
    retriever_weight: 0.70
    edge_defaults: ["references", "related_to"]
  value:
    chunking_profile: structured_strict
    retriever_weight: 1.00
    edge_defaults: ["related_to"]
  principle:
    chunking_profile: structured_strict
    retriever_weight: 0.95
    edge_defaults: ["derived_from", "references"]
  belief:
    chunking_profile: sliding_short
    retriever_weight: 0.90
    edge_defaults: ["related_to"]
  experience:
    chunking_profile: sliding_standard
    retriever_weight: 0.90
    edge_defaults: ["derived_from", "references"]
  # --- STRATEGY & DECISIONS ---
  goal:
    chunking_profile: sliding_standard
    retriever_weight: 0.95
    edge_defaults: ["depends_on", "related_to"]
  decision:
    chunking_profile: structured_strict # ADRs are usually well-structured
    retriever_weight: 1.00
    edge_defaults: ["caused_by", "references"]
  risk:
    chunking_profile: sliding_short
    retriever_weight: 0.85
    edge_defaults: ["related_to", "blocks"]
  milestone:
    chunking_profile: sliding_short
    retriever_weight: 0.70
    edge_defaults: ["related_to", "part_of"]
  # --- OPERATIONS ---
  project:
    chunking_profile: sliding_large # projects carry a lot of text
    retriever_weight: 0.97
    edge_defaults: ["references", "depends_on"]
  task:
    chunking_profile: sliding_short
    retriever_weight: 0.80
    edge_defaults: ["depends_on", "part_of"]
  journal:
    chunking_profile: sliding_standard
    retriever_weight: 0.80
    edge_defaults: ["references", "related_to"]