mindnet/app/core/chunking/chunking_processor.py
2025-12-27 10:30:09 +01:00

53 lines
2.2 KiB
Python

"""
FILE: app/core/chunking/chunking_processor.py
DESCRIPTION: Hauptlogik für das Zerlegen von Markdown in Chunks.
"""
import asyncio
import re
from typing import List, Dict, Optional
from .chunking_models import Chunk
from .chunking_utils import get_chunk_config, extract_frontmatter_from_text
from .chunking_parser import parse_blocks, parse_edges_robust
from .chunking_strategies import strategy_sliding_window, strategy_by_heading
from .chunking_propagation import propagate_section_edges
# Heading ("##" or "###", case-insensitive) that opens the note-wide edge pool.
# The captured section runs until the next line starting with "#" or the end
# of the text.
_GLOBAL_POOL_RE = re.compile(
    r'###?\s*(?:Unzugeordnete Kanten|Edge Pool|Candidates)\s*\n(.*?)(?:\n#|$)',
    re.DOTALL | re.IGNORECASE,
)


def _iter_edge_pairs(edge_strings):
    """Yield ``(kind, target)`` for each ``"kind:target"`` string.

    Entries without a ``:`` separator are skipped with a warning instead of
    raising, so a single malformed edge cannot abort chunking of a note.
    """
    for edge in edge_strings:
        kind, sep, target = edge.partition(":")
        if not sep:
            # Function-scope import keeps this block self-contained.
            import logging

            logging.getLogger(__name__).warning(
                "Skipping malformed edge %r (expected 'kind:target')", edge
            )
            continue
        yield kind, target


async def assemble_chunks(note_id: str, md_text: str, note_type: str, config: Optional[Dict] = None) -> List[Chunk]:
    """Orchestrate chunking of a Markdown note and build each chunk's candidate pool.

    Steps:
      1. Resolve the chunk config (``get_chunk_config(note_type)`` when
         ``config`` is ``None``) and parse ``md_text`` into blocks.
      2. Run the chunking strategy in a worker thread via
         ``asyncio.to_thread``: ``strategy_by_heading`` when
         ``config["strategy"] == "by_heading"``, otherwise
         ``strategy_sliding_window``.
      3. Pass the chunks through ``propagate_section_edges``.
      4. Append edges parsed from each chunk's own text with provenance
         ``"explicit"``.
      5. Append edges found under an "Unzugeordnete Kanten" / "Edge Pool" /
         "Candidates" heading in the body to *every* chunk with provenance
         ``"global_pool"``.
      6. De-duplicate each pool by ``(kind, to)``, keeping the first entry.
      7. Link every chunk to the ids of its previous/next neighbour.

    Args:
        note_id: Identifier of the note, forwarded to the chunking strategy.
        md_text: Full Markdown text of the note.
        note_type: Note type used to look up the default chunk config.
        config: Optional explicit chunk config; overrides the lookup.

    Returns:
        The list of chunks, or an empty list if the strategy produced none.
    """
    if config is None:
        config = get_chunk_config(note_type)

    # Only the body is needed here; the frontmatter part is discarded.
    _frontmatter, body_text = extract_frontmatter_from_text(md_text)
    # NOTE(review): parse_blocks receives the full md_text (including any
    # frontmatter), not body_text - presumably it handles frontmatter itself;
    # confirm against chunking_parser.
    blocks, doc_title = parse_blocks(md_text)

    if config.get("strategy") == "by_heading":
        chunks = await asyncio.to_thread(strategy_by_heading, blocks, config, note_id, doc_title)
    else:
        chunks = await asyncio.to_thread(strategy_sliding_window, blocks, config, note_id)

    if not chunks:
        return []

    # WP-15b: build the candidate pool.
    chunks = propagate_section_edges(chunks, blocks)
    for ch in chunks:
        for kind, target in _iter_edge_pairs(parse_edges_robust(ch.text)):
            ch.candidate_pool.append({"kind": kind, "to": target, "provenance": "explicit"})

    # Global pool (unassigned edges): offered to every chunk as a candidate.
    pool_match = _GLOBAL_POOL_RE.search(body_text)
    if pool_match:
        for kind, target in _iter_edge_pairs(parse_edges_robust(pool_match.group(1))):
            for ch in chunks:
                # A fresh dict per chunk so pools never share mutable entries.
                ch.candidate_pool.append({"kind": kind, "to": target, "provenance": "global_pool"})

    # De-duplication: the first occurrence of a (kind, to) pair wins, so
    # entries added earlier take precedence over later "global_pool" copies.
    for ch in chunks:
        seen = set()
        unique = []
        for cand in ch.candidate_pool:
            key = (cand["kind"], cand["to"])
            if key not in seen:
                seen.add(key)
                unique.append(cand)
        ch.candidate_pool = unique

    # Neighbourhood links: first chunk has no previous, last chunk has no next.
    last_index = len(chunks) - 1
    for i, ch in enumerate(chunks):
        ch.neighbors_prev = chunks[i - 1].id if i > 0 else None
        ch.neighbors_next = chunks[i + 1].id if i < last_index else None

    return chunks