"""
FILE: app/core/chunking/chunking_parser.py
DESCRIPTION: Zerlegt Markdown in atomare Blöcke. Hält H1-Überschriften im Stream
und gewährleistet die strukturelle Integrität von Callouts.
"""
import re
from typing import List, Set, Tuple

from .chunking_models import RawBlock
from .chunking_utils import extract_frontmatter_from_text


def split_sentences(text: str) -> List[str]:
    """Splits text into sentences, accounting for German punctuation."""
    text = re.sub(r'\s+', ' ', text.strip())
    if not text:
        return []
    # Split at sentence-ending punctuation followed by whitespace and an
    # uppercase letter, digit, opening quote, or parenthesis.
    sentences = re.split(r'(?<=[.!?])\s+(?=[A-ZÄÖÜ0-9„(])', text)
    return [s.strip() for s in sentences if s.strip()]
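
# A minimal usage sketch (the sample string below is illustrative only):
#
#   >>> split_sentences("Das ist Satz eins. Und das ist Satz zwei!")
#   ['Das ist Satz eins.', 'Und das ist Satz zwei!']
#
# Note that abbreviations such as "z.B." followed by a capitalized word
# would also trigger a split; the heuristic trades precision for simplicity.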


def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
    """Splits text into logical units (RawBlocks), including H1-H6 headings."""
    blocks = []
    h1_title = "Dokument"  # default title when no H1 is present
    section_path = "/"
    current_section_title = None

    # Strip the frontmatter (its contents are not needed here).
    fm, text_without_fm = extract_frontmatter_from_text(md_text)

    # Extract the H1 heading as the note title.
    h1_match = re.search(r'^#\s+(.*)', text_without_fm, re.MULTILINE)
    if h1_match:
        h1_title = h1_match.group(1).strip()

    lines = text_without_fm.split('\n')
    buffer = []
    for line in lines:
        stripped = line.strip()

        # Heading detection (H1 through H6).
        heading_match = re.match(r'^(#{1,6})\s+(.*)', stripped)
        if heading_match:
            if buffer:
                content = "\n".join(buffer).strip()
                if content:
                    blocks.append(RawBlock("paragraph", content, None, section_path, current_section_title))
                buffer = []
            level = len(heading_match.group(1))
            title = heading_match.group(2).strip()
            if level == 1:
                current_section_title = title
                section_path = "/"
            elif level == 2:
                current_section_title = title
                section_path = f"/{current_section_title}"
            blocks.append(RawBlock("heading", stripped, level, section_path, current_section_title))
            continue

        # Separators (---) end the current block; blank lines do so only
        # when we are not inside a callout.
        if stripped == "---" and not line.startswith('>'):
            if buffer:
                content = "\n".join(buffer).strip()
                if content:
                    blocks.append(RawBlock("paragraph", content, None, section_path, current_section_title))
                buffer = []
            blocks.append(RawBlock("separator", "---", None, section_path, current_section_title))
        elif not stripped and not line.startswith('>'):
            if buffer:
                content = "\n".join(buffer).strip()
                if content:
                    blocks.append(RawBlock("paragraph", content, None, section_path, current_section_title))
                buffer = []
        else:
            buffer.append(line)

    # Flush any trailing paragraph.
    if buffer:
        content = "\n".join(buffer).strip()
        if content:
            blocks.append(RawBlock("paragraph", content, None, section_path, current_section_title))
    return blocks, h1_title
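
# A minimal sketch of the expected shape (illustrative input; the RawBlock
# field order follows the constructor calls above):
#
#   blocks, title = parse_blocks("# Titel\n\n## Abschnitt\nEin Absatz.\n")
#   # title == "Titel"
#   # blocks -> heading (level 1, path "/"),
#   #           heading (level 2, path "/Abschnitt"),
#   #           paragraph ("Ein Absatz.", path "/Abschnitt")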


def parse_edges_robust(text: str) -> Set[str]:
    """Extracts edge candidates from wikilinks and callouts."""
    found_edges = set()

    # Inline form: [[rel:<kind>|<target>]]
    inlines = re.findall(r'\[\[rel:([^\|\]]+)\|?([^\]]*)\]\]', text)
    for kind, target in inlines:
        k = kind.strip().lower()
        t = target.strip()
        if k and t:
            found_edges.add(f"{k}:{t}")

    # Callout form: a "> [!edge] <type>" line opens a scope; quoted lines
    # below it contribute their [[target]] wikilinks as edges of that type.
    lines = text.split('\n')
    current_edge_type = None
    for line in lines:
        stripped = line.strip()
        callout_match = re.match(r'>\s*\[!edge\]\s*([^:\s]+)', stripped)
        if callout_match:
            current_edge_type = callout_match.group(1).strip().lower()
            links = re.findall(r'\[\[([^\]]+)\]\]', stripped)
            for link in links:
                if "rel:" not in link:
                    found_edges.add(f"{current_edge_type}:{link}")
            continue
        if current_edge_type and stripped.startswith('>'):
            links = re.findall(r'\[\[([^\]]+)\]\]', stripped)
            for link in links:
                if "rel:" not in link:
                    found_edges.add(f"{current_edge_type}:{link}")
        elif not stripped.startswith('>'):
            # A non-quoted line ends the callout scope.
            current_edge_type = None
    return found_edges
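
# A minimal sketch (illustrative input; edge strings have the form
# "<type>:<target>"):
#
#   text = (
#       "Siehe [[rel:supports|Hypothese A]].\n"
#       "> [!edge] contradicts\n"
#       "> [[Hypothese B]]\n"
#   )
#   parse_edges_robust(text)
#   # -> {"supports:Hypothese A", "contradicts:Hypothese B"}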