diff --git a/app/core/chunking/chunking_parser.py b/app/core/chunking/chunking_parser.py index 0524484..3d56f55 100644 --- a/app/core/chunking/chunking_parser.py +++ b/app/core/chunking/chunking_parser.py @@ -25,23 +25,42 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]: if h1_match: h1_title = h1_match.group(1).strip() lines = text_without_fm.split('\n') buffer = [] + for line in lines: stripped = line.strip() - if stripped.startswith('# '): continue - elif stripped.startswith('## '): + + # H1 ignorieren (ist Doc Title) + if stripped.startswith('# '): + continue + + # Generische Heading-Erkennung (H2 bis H6) für flexible Split-Levels + heading_match = re.match(r'^(#{2,6})\s+(.*)', stripped) + if heading_match: + # Buffer leeren (vorherigen Text abschließen) if buffer: content = "\n".join(buffer).strip() if content: blocks.append(RawBlock("paragraph", content, None, section_path, current_h2)) buffer = [] - current_h2 = stripped[3:].strip() - section_path = f"/{current_h2}" - blocks.append(RawBlock("heading", stripped, 2, section_path, current_h2)) + + level = len(heading_match.group(1)) + title = heading_match.group(2).strip() + + # Pfad-Logik: H2 setzt den Haupt-Pfad + if level == 2: + current_h2 = title + section_path = f"/{current_h2}" + # Bei H3+ bleibt der section_path beim Parent, aber das Level wird korrekt gesetzt + + blocks.append(RawBlock("heading", stripped, level, section_path, current_h2)) + elif not stripped: if buffer: content = "\n".join(buffer).strip() if content: blocks.append(RawBlock("paragraph", content, None, section_path, current_h2)) buffer = [] - else: buffer.append(line) + else: + buffer.append(line) + if buffer: content = "\n".join(buffer).strip() if content: blocks.append(RawBlock("paragraph", content, None, section_path, current_h2))