""" FILE: app/core/chunker.py DESCRIPTION: Facade für das Chunking-Package. Stellt 100% Abwärtskompatibilität sicher. WP-14: Modularisierung abgeschlossen. WP-15b: Edge-Inheritance und Candidate-Pool Logik integriert. Verwendet neue 'chunking_' Präfixe für Untermodule. VERSION: 3.3.0 STATUS: Active """ import asyncio import re import logging from typing import List, Dict, Optional # Interne Package-Imports mit neuer Präfix-Konvention from .chunking.chunking_models import Chunk, RawBlock from .chunking.chunking_utils import get_chunk_config, extract_frontmatter_from_text from .chunking.chunking_parser import parse_blocks, parse_edges_robust from .chunking.chunking_strategies import strategy_sliding_window, strategy_by_heading from .chunking.chunking_propagation import propagate_section_edges logger = logging.getLogger(__name__) # Legacy Support für SemanticAnalyzer (Optional für andere Skripte) try: from app.services.semantic_analyzer import get_semantic_analyzer except ImportError: def get_semantic_analyzer(): return None async def assemble_chunks(note_id: str, md_text: str, note_type: str, config: Optional[Dict] = None) -> List[Chunk]: """ Hauptfunktion zur Chunk-Generierung. Orchestriert die modularisierten Komponenten. Sichert die Kompatibilität zum bestehenden Ingestion-Prozess. """ if config is None: config = get_chunk_config(note_type) fm, body_text = extract_frontmatter_from_text(md_text) primary_strategy = config.get("strategy", "sliding_window") # 1. Parsing blocks, doc_title = parse_blocks(md_text) # 2. Splitting via Thread-Offloading if primary_strategy == "by_heading": chunks = await asyncio.to_thread(strategy_by_heading, blocks, config, note_id, doc_title) else: chunks = await asyncio.to_thread(strategy_sliding_window, blocks, config, note_id) if not chunks: return [] # 3. WP-15b: Candidate Pool Vorbereitung # A. Edge Inheritance (Sektions-Propagation) chunks = propagate_section_edges(chunks, blocks) # B. Explicit Edges (Direkt im Chunk-Text) for ch in chunks: explicit = parse_edges_robust(ch.text) for e_str in explicit: kind, target = e_str.split(':', 1) ch.candidate_pool.append({"kind": kind, "to": target, "provenance": "explicit"}) # C. Global Pool Detection (Sektion 'Unzugeordnete Kanten') pool_match = re.search(r'###?\s*(?:Unzugeordnete Kanten|Edge Pool|Candidates)\s*\n(.*?)(?:\n#|$)', body_text, re.DOTALL | re.IGNORECASE) if pool_match: unassigned = parse_edges_robust(pool_match.group(1)) for ch in chunks: for e_str in unassigned: kind, target = e_str.split(':', 1) ch.candidate_pool.append({"kind": kind, "to": target, "provenance": "global_pool"}) # D. Eindeutigkeit sicherstellen for ch in chunks: seen = set(); unique_pool = [] for cand in ch.candidate_pool: key = (cand["kind"], cand["to"]) if key not in seen: seen.add(key); unique_pool.append(cand) ch.candidate_pool = unique_pool # 4. Graph-Struktur (Nachbarschaft) for i, ch in enumerate(chunks): ch.neighbors_prev = chunks[i-1].id if i > 0 else None ch.neighbors_next = chunks[i+1].id if i < len(chunks)-1 else None return chunks