mindnet/app/core/chunking/chunking_parser.py

93 lines
3.7 KiB
Python

"""
FILE: app/core/chunking/chunking_parser.py
DESCRIPTION: Zerlegt Markdown in Blöcke und extrahiert Kanten-Strings.
"""
import re
from typing import List, Tuple, Set
from .chunking_models import RawBlock
from .chunking_utils import extract_frontmatter_from_text
# Matches any run of whitespace characters (spaces, tabs, newlines).
# split_sentences() uses it to collapse such runs into a single space.
_WS = re.compile(r'\s+')
# Matches the whitespace between two sentences: it must be preceded by '.',
# '!' or '?' and followed by an uppercase letter (A-Z, Ä, Ö, Ü), a digit,
# an opening low quote '„' or an opening parenthesis '('.
_SENT_SPLIT = re.compile(r'(?<=[.!?])\s+(?=[A-ZÄÖÜ0-9„(])')
def split_sentences(text: str) -> list[str]:
    """Split text into sentences.

    Leading/trailing whitespace is removed and every inner whitespace run is
    collapsed to a single space before the text is cut at each position
    matched by the module-level ``_SENT_SPLIT`` pattern.

    Args:
        text: The text to split.

    Returns:
        The stripped, non-empty sentence strings in their original order.
        An empty list if ``text`` is empty or consists only of whitespace.
    """
    normalized = _WS.sub(' ', text.strip())
    if not normalized:
        return []

    sentences = []
    for piece in _SENT_SPLIT.split(normalized):
        trimmed = piece.strip()
        if trimmed:
            sentences.append(trimmed)
    return sentences
# Matches an ATX heading (H1-H6) on an already stripped line:
# group 1 is the run of '#', group 2 the heading text.
_HEADING_RE = re.compile(r'(#{1,6})\s+(.*)')
# Matches a code-fence marker at the start of a stripped line:
# group 1 is the run of backticks or tildes, group 2 whatever follows it.
_FENCE_RE = re.compile(r'(`{3,}|~{3,})(.*)')


def _fence_transition(stripped: str, open_fence):
    """Return the code-fence state after the line ``stripped`` was seen.

    Args:
        stripped: The current line with surrounding whitespace removed.
        open_fence: The marker string (e.g. "```") that opened the fence we
            are currently inside of, or ``None`` when outside any fence.

    Returns:
        The marker of the fence that is open after this line, or ``None``
        when no fence is open.
    """
    match = _FENCE_RE.match(stripped)
    if match is None:
        return open_fence

    marker, rest = match.group(1), match.group(2)
    if open_fence is None:
        # A backtick fence must not contain backticks after the marker;
        # such a line is inline code (```x```), not a fence opener.
        if marker[0] == '`' and '`' in rest:
            return None
        return marker

    # A fence is only closed by a marker of the same character that is at
    # least as long as the opener and is followed by nothing else.
    if marker[0] == open_fence[0] and len(marker) >= len(open_fence) and not rest:
        return None
    return open_fence


def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
    """Split Markdown text into heading and paragraph blocks.

    The frontmatter is removed via ``extract_frontmatter_from_text`` and the
    remaining text is processed line by line:

    * H1 lines are not emitted as blocks; the text of the first one becomes
      the document title.
    * H2-H6 lines each produce a ``RawBlock("heading", ...)`` carrying the
      stripped heading line and its level. Only an H2 changes the section
      path (``"/<H2 title>"``); deeper headings keep the path of their
      parent H2.
    * Any other non-blank lines are collected and emitted as one
      ``RawBlock("paragraph", ...)`` when a blank line, a heading or the end
      of the text is reached.
    * Lines inside fenced code blocks (``` or ~~~) are never interpreted as
      headings, so e.g. ``# comment`` lines in code samples stay part of the
      paragraph content. Blank lines still end a paragraph inside a fence.

    Args:
        md_text: The complete Markdown source, optionally with frontmatter.

    Returns:
        A tuple ``(blocks, h1_title)``. ``h1_title`` is ``"Dokument"`` when
        the text contains no H1 heading.
    """
    blocks: List[RawBlock] = []
    buffer: List[str] = []
    h1_title = "Dokument"
    title_found = False
    section_path = "/"
    current_h2 = None
    open_fence = None

    def flush() -> None:
        # Emit the collected lines as one paragraph under the section that is
        # current at the time of the call, then start a new paragraph.
        content = "\n".join(buffer).strip()
        if content:
            blocks.append(RawBlock("paragraph", content, None, section_path, current_h2))
        buffer.clear()

    # The frontmatter itself is not needed here, only the remaining text.
    _, text_without_fm = extract_frontmatter_from_text(md_text)

    for line in text_without_fm.split('\n'):
        stripped = line.strip()

        was_in_fence = open_fence is not None
        open_fence = _fence_transition(stripped, open_fence)
        if was_in_fence or open_fence is not None:
            # Fence markers and code lines are plain paragraph content.
            if stripped:
                buffer.append(line)
            else:
                flush()
            continue

        heading = _HEADING_RE.match(stripped)
        if heading:
            level = len(heading.group(1))
            title = heading.group(2).strip()
            # A heading always terminates the paragraph before it; flush
            # first so that paragraph keeps the previous section path.
            flush()
            if level == 1:
                # H1 is the document title, not a block of its own.
                if not title_found:
                    h1_title = title
                    title_found = True
                continue
            if level == 2:
                # Only H2 defines the main section path.
                current_h2 = title
                section_path = f"/{current_h2}"
            # H3+ keep the parent's section path but report their own level.
            blocks.append(RawBlock("heading", stripped, level, section_path, current_h2))
        elif not stripped:
            flush()
        else:
            buffer.append(line)

    flush()
    return blocks, h1_title
# Inline typed link: [[rel:<kind>|<target>]] (group 1 = kind, group 2 = target).
_REL_LINK_RE = re.compile(r'\[\[rel:([^\|\]]+)\|?([^\]]*)\]\]')
# Any wikilink: [[<content>]] (group 1 = content between the brackets).
_WIKILINK_RE = re.compile(r'\[\[([^\]]+)\]\]')
# Edge callout header on a stripped line: "> [!edge] <type>" (group 1 = type).
_EDGE_CALLOUT_RE = re.compile(r'>\s*\[!edge\]\s*([^:\s]+)')


def _callout_edges(edge_type: str, stripped: str) -> Set[str]:
    """Return ``"<edge_type>:<target>"`` for every plain wikilink in a line."""
    edges: Set[str] = set()
    for raw_target in _WIKILINK_RE.findall(stripped):
        target = raw_target.strip()
        # Typed links ([[rel:...]]) are handled by the inline pass. Test the
        # prefix only, so targets that merely contain "rel:" somewhere
        # (e.g. "Barrel: Notes") are not dropped.
        if not target or target.startswith("rel:"):
            continue
        edges.add(f"{edge_type}:{target}")
    return edges


def parse_edges_robust(text: str) -> Set[str]:
    """Extract edge candidates from inline typed links and edge callouts.

    Two notations are recognised:

    * Inline typed links ``[[rel:<kind>|<target>]]`` anywhere in the text.
      The kind is lower-cased; entries with an empty kind or target are
      skipped.
    * Edge callouts: a line starting with ``> [!edge] <type>`` sets the
      current edge type (lower-cased). Every plain wikilink ``[[target]]``
      on that line and on the directly following lines that start with
      ``>`` is recorded with that type. The first line that does not start
      with ``>`` ends the callout.

    Args:
        text: The Markdown text to scan.

    Returns:
        A set of strings of the form ``"<kind>:<target>"``; empty if nothing
        was found.
    """
    found_edges: Set[str] = set()

    for kind, target in _REL_LINK_RE.findall(text):
        kind = kind.strip().lower()
        target = target.strip()
        if kind and target:
            found_edges.add(f"{kind}:{target}")

    current_edge_type = None
    for line in text.split('\n'):
        stripped = line.strip()
        callout = _EDGE_CALLOUT_RE.match(stripped)
        if callout:
            current_edge_type = callout.group(1).strip().lower()
        elif not stripped.startswith('>'):
            # Leaving the blockquote ends the current callout.
            current_edge_type = None
            continue
        elif current_edge_type is None:
            # A quoted line that does not belong to an edge callout.
            continue
        found_edges.update(_callout_edges(current_edge_type, stripped))

    return found_edges