Aktualisierung des Chunking-Parsers zur Verbesserung der Blockverarbeitung und Beschreibung. Anpassungen an der atomaren Sektions-Chunking-Strategie mit optimierter Token-Schätzung und neuen Hilfsfunktionen zur besseren Handhabung von großen Sektionen. Einführung einer präziseren Schätzung für deutsche Texte und Anpassungen an der Logik zur Handhabung von Sektionen.
This commit is contained in:
parent
96b4f65cd1
commit
680c36ab59
|
|
@ -1,7 +1,7 @@
|
||||||
"""
|
"""
|
||||||
FILE: app/core/chunking/chunking_parser.py
|
FILE: app/core/chunking/chunking_parser.py
|
||||||
DESCRIPTION: Zerlegt Markdown in logische Einheiten (RawBlocks).
|
DESCRIPTION: Zerlegt Markdown in logische Blöcke (RawBlocks).
|
||||||
Hält H1-Überschriften im Inhalts-Stream.
|
Gewährleistet, dass H1 und Trenner im Stream verbleiben.
|
||||||
"""
|
"""
|
||||||
import re
|
import re
|
||||||
from typing import List, Tuple, Set
|
from typing import List, Tuple, Set
|
||||||
|
|
@ -20,28 +20,21 @@ def split_sentences(text: str) -> list[str]:
|
||||||
def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
|
def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
|
||||||
"""Zerlegt Text in logische Einheiten (RawBlocks), inklusive H1-H6."""
|
"""Zerlegt Text in logische Einheiten (RawBlocks), inklusive H1-H6."""
|
||||||
blocks = []
|
blocks = []
|
||||||
h1_title = "Dokument"
|
h1_title = "Dokument"; section_path = "/"; current_section_title = None
|
||||||
section_path = "/"
|
|
||||||
current_section_title = None
|
|
||||||
|
|
||||||
# Frontmatter entfernen
|
|
||||||
fm, text_without_fm = extract_frontmatter_from_text(md_text)
|
fm, text_without_fm = extract_frontmatter_from_text(md_text)
|
||||||
|
|
||||||
# H1 für Note-Titel extrahieren
|
# H1 für Metadaten extrahieren
|
||||||
h1_match = re.search(r'^#\s+(.*)', text_without_fm, re.MULTILINE)
|
h1_match = re.search(r'^#\s+(.*)', text_without_fm, re.MULTILINE)
|
||||||
if h1_match:
|
if h1_match: h1_title = h1_match.group(1).strip()
|
||||||
h1_title = h1_match.group(1).strip()
|
|
||||||
|
|
||||||
lines = text_without_fm.split('\n')
|
lines = text_without_fm.split('\n')
|
||||||
buffer = []
|
buffer = []
|
||||||
|
|
||||||
for line in lines:
|
for line in lines:
|
||||||
stripped = line.strip()
|
stripped = line.strip()
|
||||||
|
|
||||||
# Heading-Erkennung (H1 bis H6)
|
|
||||||
heading_match = re.match(r'^(#{1,6})\s+(.*)', stripped)
|
heading_match = re.match(r'^(#{1,6})\s+(.*)', stripped)
|
||||||
|
|
||||||
if heading_match:
|
if heading_match:
|
||||||
# Vorherigen Text-Block abschließen
|
|
||||||
if buffer:
|
if buffer:
|
||||||
content = "\n".join(buffer).strip()
|
content = "\n".join(buffer).strip()
|
||||||
if content:
|
if content:
|
||||||
|
|
@ -51,19 +44,14 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
|
||||||
level = len(heading_match.group(1))
|
level = len(heading_match.group(1))
|
||||||
title = heading_match.group(2).strip()
|
title = heading_match.group(2).strip()
|
||||||
|
|
||||||
# Pfad- und Titel-Update für die Metadaten der folgenden Blöcke
|
|
||||||
if level == 1:
|
if level == 1:
|
||||||
current_section_title = title
|
current_section_title = title; section_path = "/"
|
||||||
section_path = "/"
|
|
||||||
elif level == 2:
|
elif level == 2:
|
||||||
current_section_title = title
|
current_section_title = title; section_path = f"/{current_section_title}"
|
||||||
section_path = f"/{current_section_title}"
|
|
||||||
|
|
||||||
# Die Überschrift selbst als Block hinzufügen
|
|
||||||
blocks.append(RawBlock("heading", stripped, level, section_path, current_section_title))
|
blocks.append(RawBlock("heading", stripped, level, section_path, current_section_title))
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Trenner oder Leerzeilen beenden Blöcke, außer innerhalb von Callouts
|
|
||||||
if (not stripped or stripped == "---") and not line.startswith('>'):
|
if (not stripped or stripped == "---") and not line.startswith('>'):
|
||||||
if buffer:
|
if buffer:
|
||||||
content = "\n".join(buffer).strip()
|
content = "\n".join(buffer).strip()
|
||||||
|
|
@ -77,22 +65,19 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
|
||||||
|
|
||||||
if buffer:
|
if buffer:
|
||||||
content = "\n".join(buffer).strip()
|
content = "\n".join(buffer).strip()
|
||||||
if content:
|
if content: blocks.append(RawBlock("paragraph", content, None, section_path, current_section_title))
|
||||||
blocks.append(RawBlock("paragraph", content, None, section_path, current_section_title))
|
|
||||||
|
|
||||||
return blocks, h1_title
|
return blocks, h1_title
|
||||||
|
|
||||||
def parse_edges_robust(text: str) -> Set[str]:
|
def parse_edges_robust(text: str) -> Set[str]:
|
||||||
"""Extrahiert Kanten-Kandidaten aus Wikilinks und Callouts."""
|
"""Extrahiert Kanten aus Wikilinks und Callouts."""
|
||||||
found_edges = set()
|
found_edges = set()
|
||||||
inlines = re.findall(r'\[\[rel:([^\|\]]+)\|?([^\]]*)\]\]', text)
|
inlines = re.findall(r'\[\[rel:([^\|\]]+)\|?([^\]]*)\]\]', text)
|
||||||
for kind, target in inlines:
|
for kind, target in inlines:
|
||||||
k = kind.strip().lower()
|
k = kind.strip().lower(); t = target.strip()
|
||||||
t = target.strip()
|
|
||||||
if k and t: found_edges.add(f"{k}:{t}")
|
if k and t: found_edges.add(f"{k}:{t}")
|
||||||
|
|
||||||
lines = text.split('\n')
|
lines = text.split('\n'); current_edge_type = None
|
||||||
current_edge_type = None
|
|
||||||
for line in lines:
|
for line in lines:
|
||||||
stripped = line.strip()
|
stripped = line.strip()
|
||||||
callout_match = re.match(r'>\s*\[!edge\]\s*([^:\s]+)', stripped)
|
callout_match = re.match(r'>\s*\[!edge\]\s*([^:\s]+)', stripped)
|
||||||
|
|
@ -106,6 +91,5 @@ def parse_edges_robust(text: str) -> Set[str]:
|
||||||
links = re.findall(r'\[\[([^\]]+)\]\]', stripped)
|
links = re.findall(r'\[\[([^\]]+)\]\]', stripped)
|
||||||
for l in links:
|
for l in links:
|
||||||
if "rel:" not in l: found_edges.add(f"{current_edge_type}:{l}")
|
if "rel:" not in l: found_edges.add(f"{current_edge_type}:{l}")
|
||||||
elif not stripped.startswith('>'):
|
elif not stripped.startswith('>'): current_edge_type = None
|
||||||
current_edge_type = None
|
|
||||||
return found_edges
|
return found_edges
|
||||||
|
|
@ -1,15 +1,18 @@
|
||||||
"""
|
"""
|
||||||
FILE: app/core/chunking/chunking_strategies.py
|
FILE: app/core/chunking/chunking_strategies.py
|
||||||
DESCRIPTION: Universelle Strategie für atomares Sektions-Chunking v3.5.0.
|
DESCRIPTION: Universelle Strategie für atomares Sektions-Chunking v3.6.0.
|
||||||
Garantiert Sektions-Integrität durch präventives Chunk-Management.
|
Garantiert Sektions-Integrität durch präventives Chunk-Management.
|
||||||
"""
|
"""
|
||||||
|
import math
|
||||||
from typing import List, Dict, Any, Optional
|
from typing import List, Dict, Any, Optional
|
||||||
from .chunking_models import RawBlock, Chunk
|
from .chunking_models import RawBlock, Chunk
|
||||||
from .chunking_utils import estimate_tokens
|
|
||||||
from .chunking_parser import split_sentences
|
from .chunking_parser import split_sentences
|
||||||
|
|
||||||
|
def _accurate_estimate_tokens(text: str) -> int:
|
||||||
|
"""Konservative Schätzung für deutschen Text (len/2.5 statt len/4)."""
|
||||||
|
return max(1, math.ceil(len(text.strip()) / 2.5))
|
||||||
|
|
||||||
def _create_context_win(doc_title: str, sec_title: Optional[str], text: str) -> str:
|
def _create_context_win(doc_title: str, sec_title: Optional[str], text: str) -> str:
|
||||||
"""Baut den Breadcrumb-Kontext für das Embedding-Fenster."""
|
|
||||||
parts = []
|
parts = []
|
||||||
if doc_title: parts.append(doc_title)
|
if doc_title: parts.append(doc_title)
|
||||||
if sec_title and sec_title != doc_title: parts.append(sec_title)
|
if sec_title and sec_title != doc_title: parts.append(sec_title)
|
||||||
|
|
@ -18,8 +21,8 @@ def _create_context_win(doc_title: str, sec_title: Optional[str], text: str) ->
|
||||||
|
|
||||||
def strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id: str, doc_title: str = "") -> List[Chunk]:
|
def strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id: str, doc_title: str = "") -> List[Chunk]:
|
||||||
"""
|
"""
|
||||||
Universelles Sektions-Chunking: Packt Sektionen in Chunks.
|
Sektions-Chunking: Packt komplette Abschnitte in Chunks.
|
||||||
Bei Überlauf wird die komplette Sektion in den nächsten Chunk geschoben.
|
Bei Überlauf wird die Sektion ohne Ausnahme in den nächsten Chunk geschoben.
|
||||||
"""
|
"""
|
||||||
target = config.get("target", 400)
|
target = config.get("target", 400)
|
||||||
max_tokens = config.get("max", 600)
|
max_tokens = config.get("max", 600)
|
||||||
|
|
@ -29,8 +32,8 @@ def strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id:
|
||||||
|
|
||||||
chunks: List[Chunk] = []
|
chunks: List[Chunk] = []
|
||||||
|
|
||||||
# --- HILFSFUNKTION: Erzeugt einen Chunk aus einer Blockliste ---
|
def _emit_chunk(block_list: List[RawBlock]):
|
||||||
def _create_chunk_from_blocks(block_list: List[RawBlock]):
|
"""Schreibt eine Liste von Blöcken als einen einzigen, ungeteilten Chunk."""
|
||||||
if not block_list: return
|
if not block_list: return
|
||||||
txt = "\n\n".join([b.text for b in block_list])
|
txt = "\n\n".join([b.text for b in block_list])
|
||||||
idx = len(chunks)
|
idx = len(chunks)
|
||||||
|
|
@ -39,13 +42,13 @@ def strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id:
|
||||||
win = _create_context_win(doc_title, title, txt)
|
win = _create_context_win(doc_title, title, txt)
|
||||||
chunks.append(Chunk(
|
chunks.append(Chunk(
|
||||||
id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx,
|
id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx,
|
||||||
text=txt, window=win, token_count=estimate_tokens(txt),
|
text=txt, window=win, token_count=_accurate_estimate_tokens(txt),
|
||||||
section_title=title, section_path=path,
|
section_title=title, section_path=path,
|
||||||
neighbors_prev=None, neighbors_next=None
|
neighbors_prev=None, neighbors_next=None
|
||||||
))
|
))
|
||||||
|
|
||||||
# --- HILFSFUNKTION: Splittet eine einzelne Sektion, die > max ist ---
|
|
||||||
def _split_giant_section(sec_blocks: List[RawBlock]):
|
def _split_giant_section(sec_blocks: List[RawBlock]):
|
||||||
|
"""Notfall-Split: Nur wenn eine EINZELNE Sektion bereits > max ist."""
|
||||||
full_text = "\n\n".join([b.text for b in sec_blocks])
|
full_text = "\n\n".join([b.text for b in sec_blocks])
|
||||||
main_title = sec_blocks[0].section_title
|
main_title = sec_blocks[0].section_title
|
||||||
main_path = sec_blocks[0].section_path
|
main_path = sec_blocks[0].section_path
|
||||||
|
|
@ -55,85 +58,70 @@ def strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id:
|
||||||
cur_sents = []; sub_len = 0
|
cur_sents = []; sub_len = 0
|
||||||
|
|
||||||
for s in sents:
|
for s in sents:
|
||||||
slen = estimate_tokens(s)
|
slen = _accurate_estimate_tokens(s)
|
||||||
if sub_len + slen > target and cur_sents:
|
if sub_len + slen > target and cur_sents:
|
||||||
combined_text = " ".join(cur_sents)
|
_emit_chunk([RawBlock("paragraph", " ".join(cur_sents), None, main_path, main_title)])
|
||||||
_create_chunk_from_blocks([RawBlock("paragraph", combined_text, None, main_path, main_title)])
|
|
||||||
|
|
||||||
# Context Injection: Überschrift für den nächsten Teil-Chunk
|
|
||||||
ov_s = [header_text] if header_text else []
|
ov_s = [header_text] if header_text else []
|
||||||
ov_l = estimate_tokens(header_text) if header_text else 0
|
ov_l = _accurate_estimate_tokens(header_text) if header_text else 0
|
||||||
for os in reversed(cur_sents):
|
for os in reversed(cur_sents):
|
||||||
if os == header_text: continue
|
if os == header_text: continue
|
||||||
t_len = estimate_tokens(os)
|
t_len = _accurate_estimate_tokens(os)
|
||||||
if ov_l + t_len < overlap:
|
if ov_l + t_len < overlap:
|
||||||
ov_s.insert(len(ov_s)-1 if header_text else 0, os)
|
ov_s.insert(len(ov_s)-1 if header_text else 0, os)
|
||||||
ov_l += t_len
|
ov_l += t_len
|
||||||
else: break
|
else: break
|
||||||
cur_sents = list(ov_s); cur_sents.append(s); sub_len = ov_l + slen
|
cur_sents = list(ov_s); cur_sents.append(s); sub_len = ov_l + slen
|
||||||
else:
|
else: cur_sents.append(s); sub_len += slen
|
||||||
cur_sents.append(s); sub_len += slen
|
|
||||||
|
|
||||||
if cur_sents:
|
if cur_sents: _emit_chunk([RawBlock("paragraph", " ".join(cur_sents), None, main_path, main_title)])
|
||||||
_create_chunk_from_blocks([RawBlock("paragraph", " ".join(cur_sents), None, main_path, main_title)])
|
|
||||||
|
|
||||||
# 1. SCHRITT: Gruppierung in atomare Sektions-Einheiten
|
# 1. Gruppierung in atomare Einheiten
|
||||||
sections: List[List[RawBlock]] = []
|
sections: List[List[RawBlock]] = []
|
||||||
curr_sec: List[RawBlock] = []
|
curr_sec: List[RawBlock] = []
|
||||||
for b in blocks:
|
for b in blocks:
|
||||||
# Eine neue Überschrift auf oder unter dem split_level startet eine neue Sektion
|
|
||||||
if b.kind == "heading" and b.level <= split_level:
|
if b.kind == "heading" and b.level <= split_level:
|
||||||
if curr_sec: sections.append(curr_sec)
|
if curr_sec: sections.append(curr_sec)
|
||||||
curr_sec = [b]
|
curr_sec = [b]
|
||||||
else:
|
else: curr_sec.append(b)
|
||||||
curr_sec.append(b)
|
|
||||||
if curr_sec: sections.append(curr_sec)
|
if curr_sec: sections.append(curr_sec)
|
||||||
|
|
||||||
# 2. SCHRITT: Sektionen in Chunks packen (Das universelle Pack-Verfahren)
|
# 2. Das Pack-Verfahren (Kein Zerschneiden beim Flushen!)
|
||||||
candidate_chunk: List[RawBlock] = []
|
candidate_chunk: List[RawBlock] = []
|
||||||
candidate_tokens = 0
|
candidate_tokens = 0
|
||||||
|
|
||||||
for sec in sections:
|
for sec in sections:
|
||||||
sec_text = "\n\n".join([b.text for b in sec])
|
sec_text = "\n\n".join([b.text for b in sec])
|
||||||
sec_tokens = estimate_tokens(sec_text)
|
sec_tokens = _accurate_estimate_tokens(sec_text)
|
||||||
|
|
||||||
# Passt diese gesamte Sektion noch in den laufenden Chunk?
|
# Prüfung: Passt die Sektion noch dazu?
|
||||||
if candidate_tokens + sec_tokens <= max_tokens:
|
if candidate_tokens + sec_tokens <= max_tokens:
|
||||||
candidate_chunk.extend(sec)
|
candidate_chunk.extend(sec)
|
||||||
candidate_tokens = estimate_tokens("\n\n".join([b.text for b in candidate_chunk]))
|
candidate_tokens = _accurate_estimate_tokens("\n\n".join([b.text for b in candidate_chunk]))
|
||||||
else:
|
else:
|
||||||
# Falls der aktuelle Chunk nicht leer ist: Raus damit, bevor die neue Sektion kommt
|
# Chunk ist voll -> Abschluss an Sektionsgrenze
|
||||||
if candidate_chunk:
|
if candidate_chunk:
|
||||||
_create_chunk_from_blocks(candidate_chunk)
|
_emit_chunk(candidate_chunk)
|
||||||
candidate_chunk = []
|
candidate_chunk = []
|
||||||
candidate_tokens = 0
|
candidate_tokens = 0
|
||||||
|
|
||||||
# Die neue Sektion ist nun allein. Ist sie selbst zu groß?
|
# Neue Sektion allein prüfen
|
||||||
if sec_tokens > max_tokens:
|
if sec_tokens > max_tokens:
|
||||||
_split_giant_section(sec)
|
_split_giant_section(sec)
|
||||||
else:
|
else:
|
||||||
candidate_chunk = list(sec)
|
candidate_chunk = list(sec)
|
||||||
candidate_tokens = sec_tokens
|
candidate_tokens = sec_tokens
|
||||||
|
|
||||||
# Letzten Rest wegschreiben
|
if candidate_chunk: _emit_chunk(candidate_chunk)
|
||||||
if candidate_chunk:
|
|
||||||
_create_chunk_from_blocks(candidate_chunk)
|
|
||||||
|
|
||||||
return chunks
|
return chunks
|
||||||
|
|
||||||
def strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], note_id: str, context_prefix: str = "") -> List[Chunk]:
|
def strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], note_id: str, context_prefix: str = "") -> List[Chunk]:
|
||||||
"""Standard-Sliding-Window für flache Texte ohne Sektionsfokus."""
|
target = config.get("target", 400); max_tokens = config.get("max", 600)
|
||||||
target = config.get("target", 400)
|
chunks: List[Chunk] = []; buf: List[RawBlock] = []
|
||||||
max_tokens = config.get("max", 600)
|
|
||||||
chunks: List[Chunk] = []
|
|
||||||
buf: List[RawBlock] = []
|
|
||||||
|
|
||||||
for b in blocks:
|
for b in blocks:
|
||||||
b_tokens = estimate_tokens(b.text)
|
b_tokens = _accurate_estimate_tokens(b.text)
|
||||||
current_tokens = sum(estimate_tokens(x.text) for x in buf) if buf else 0
|
current_tokens = sum(_accurate_estimate_tokens(x.text) for x in buf) if buf else 0
|
||||||
if current_tokens + b_tokens > max_tokens and buf:
|
if current_tokens + b_tokens > max_tokens and buf:
|
||||||
txt = "\n\n".join([x.text for x in buf])
|
txt = "\n\n".join([x.text for x in buf]); idx = len(chunks)
|
||||||
idx = len(chunks)
|
|
||||||
win = f"{context_prefix}\n{txt}".strip() if context_prefix else txt
|
win = f"{context_prefix}\n{txt}".strip() if context_prefix else txt
|
||||||
chunks.append(Chunk(id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx, text=txt, window=win, token_count=current_tokens, section_title=buf[0].section_title, section_path=buf[0].section_path, neighbors_prev=None, neighbors_next=None))
|
chunks.append(Chunk(id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx, text=txt, window=win, token_count=current_tokens, section_title=buf[0].section_title, section_path=buf[0].section_path, neighbors_prev=None, neighbors_next=None))
|
||||||
buf = []
|
buf = []
|
||||||
|
|
@ -141,5 +129,5 @@ def strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], note
|
||||||
if buf:
|
if buf:
|
||||||
txt = "\n\n".join([x.text for x in buf]); idx = len(chunks)
|
txt = "\n\n".join([x.text for x in buf]); idx = len(chunks)
|
||||||
win = f"{context_prefix}\n{txt}".strip() if context_prefix else txt
|
win = f"{context_prefix}\n{txt}".strip() if context_prefix else txt
|
||||||
chunks.append(Chunk(id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx, text=txt, window=win, token_count=estimate_tokens(txt), section_title=buf[0].section_title, section_path=buf[0].section_path, neighbors_prev=None, neighbors_next=None))
|
chunks.append(Chunk(id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx, text=txt, window=win, token_count=_accurate_estimate_tokens(txt), section_title=buf[0].section_title, section_path=buf[0].section_path, neighbors_prev=None, neighbors_next=None))
|
||||||
return chunks
|
return chunks
|
||||||
Loading…
Reference in New Issue
Block a user