From 1b7b8091a3849621576e56f9da18dbb99b536f90 Mon Sep 17 00:00:00 2001 From: Lars Date: Sat, 27 Dec 2025 10:30:09 +0100 Subject: [PATCH] bug Fix --- app/core/chunker.py | 82 +------------------------ app/core/chunking/__init__.py | 10 +++ app/core/chunking/chunking_processor.py | 53 ++++++++++++++++ app/core/ingestion/__init__.py | 9 +++ 4 files changed, 75 insertions(+), 79 deletions(-) create mode 100644 app/core/chunking/chunking_processor.py diff --git a/app/core/chunker.py b/app/core/chunker.py index d8ea589..4a624e2 100644 --- a/app/core/chunker.py +++ b/app/core/chunker.py @@ -1,86 +1,10 @@ """ FILE: app/core/chunker.py DESCRIPTION: Facade für das Chunking-Package. Stellt 100% Abwärtskompatibilität sicher. - WP-14: Modularisierung abgeschlossen. - WP-15b: Edge-Inheritance und Candidate-Pool Logik integriert. - Verwendet neue 'chunking_' Präfixe für Untermodule. VERSION: 3.3.0 -STATUS: Active """ -import asyncio -import re -import logging -from typing import List, Dict, Optional - -# Interne Package-Imports mit neuer Präfix-Konvention -from .chunking.chunking_models import Chunk, RawBlock +from .chunking.chunking_processor import assemble_chunks from .chunking.chunking_utils import get_chunk_config, extract_frontmatter_from_text -from .chunking.chunking_parser import parse_blocks, parse_edges_robust -from .chunking.chunking_strategies import strategy_sliding_window, strategy_by_heading -from .chunking.chunking_propagation import propagate_section_edges +from .chunking.chunking_models import Chunk -logger = logging.getLogger(__name__) - -# Legacy Support für SemanticAnalyzer (Optional für andere Skripte) -try: - from app.services.semantic_analyzer import get_semantic_analyzer -except ImportError: - def get_semantic_analyzer(): return None - -async def assemble_chunks(note_id: str, md_text: str, note_type: str, config: Optional[Dict] = None) -> List[Chunk]: - """ - Hauptfunktion zur Chunk-Generierung. Orchestriert die modularisierten Komponenten. - Sichert die Kompatibilität zum bestehenden Ingestion-Prozess. - """ - if config is None: - config = get_chunk_config(note_type) - - fm, body_text = extract_frontmatter_from_text(md_text) - primary_strategy = config.get("strategy", "sliding_window") - - # 1. Parsing - blocks, doc_title = parse_blocks(md_text) - - # 2. Splitting via Thread-Offloading - if primary_strategy == "by_heading": - chunks = await asyncio.to_thread(strategy_by_heading, blocks, config, note_id, doc_title) - else: - chunks = await asyncio.to_thread(strategy_sliding_window, blocks, config, note_id) - - if not chunks: return [] - - # 3. WP-15b: Candidate Pool Vorbereitung - # A. Edge Inheritance (Sektions-Propagation) - chunks = propagate_section_edges(chunks, blocks) - - # B. Explicit Edges (Direkt im Chunk-Text) - for ch in chunks: - explicit = parse_edges_robust(ch.text) - for e_str in explicit: - kind, target = e_str.split(':', 1) - ch.candidate_pool.append({"kind": kind, "to": target, "provenance": "explicit"}) - - # C. Global Pool Detection (Sektion 'Unzugeordnete Kanten') - pool_match = re.search(r'###?\s*(?:Unzugeordnete Kanten|Edge Pool|Candidates)\s*\n(.*?)(?:\n#|$)', body_text, re.DOTALL | re.IGNORECASE) - if pool_match: - unassigned = parse_edges_robust(pool_match.group(1)) - for ch in chunks: - for e_str in unassigned: - kind, target = e_str.split(':', 1) - ch.candidate_pool.append({"kind": kind, "to": target, "provenance": "global_pool"}) - - # D. Eindeutigkeit sicherstellen - for ch in chunks: - seen = set(); unique_pool = [] - for cand in ch.candidate_pool: - key = (cand["kind"], cand["to"]) - if key not in seen: - seen.add(key); unique_pool.append(cand) - ch.candidate_pool = unique_pool - - # 4. Graph-Struktur (Nachbarschaft) - for i, ch in enumerate(chunks): - ch.neighbors_prev = chunks[i-1].id if i > 0 else None - ch.neighbors_next = chunks[i+1].id if i < len(chunks)-1 else None - - return chunks \ No newline at end of file +__all__ = ["assemble_chunks", "get_chunk_config", "extract_frontmatter_from_text", "Chunk"] \ No newline at end of file diff --git a/app/core/chunking/__init__.py b/app/core/chunking/__init__.py index e69de29..0d8c4bc 100644 --- a/app/core/chunking/__init__.py +++ b/app/core/chunking/__init__.py @@ -0,0 +1,10 @@ +""" +FILE: app/core/chunking/__init__.py +DESCRIPTION: Package-Einstiegspunkt für Chunking. Exportiert assemble_chunks. +VERSION: 3.3.0 +""" +from .chunking_processor import assemble_chunks +from .chunking_utils import get_chunk_config, extract_frontmatter_from_text +from .chunking_models import Chunk + +__all__ = ["assemble_chunks", "get_chunk_config", "extract_frontmatter_from_text", "Chunk"] \ No newline at end of file diff --git a/app/core/chunking/chunking_processor.py b/app/core/chunking/chunking_processor.py new file mode 100644 index 0000000..12c9a7b --- /dev/null +++ b/app/core/chunking/chunking_processor.py @@ -0,0 +1,53 @@ +""" +FILE: app/core/chunking/chunking_processor.py +DESCRIPTION: Hauptlogik für das Zerlegen von Markdown in Chunks. +""" +import asyncio +import re +from typing import List, Dict, Optional +from .chunking_models import Chunk +from .chunking_utils import get_chunk_config, extract_frontmatter_from_text +from .chunking_parser import parse_blocks, parse_edges_robust +from .chunking_strategies import strategy_sliding_window, strategy_by_heading +from .chunking_propagation import propagate_section_edges + +async def assemble_chunks(note_id: str, md_text: str, note_type: str, config: Optional[Dict] = None) -> List[Chunk]: + """Orchestriert das Chunking und baut den Candidate-Pool auf.""" + if config is None: config = get_chunk_config(note_type) + fm, body_text = extract_frontmatter_from_text(md_text) + blocks, doc_title = parse_blocks(md_text) + + if config.get("strategy") == "by_heading": + chunks = await asyncio.to_thread(strategy_by_heading, blocks, config, note_id, doc_title) + else: + chunks = await asyncio.to_thread(strategy_sliding_window, blocks, config, note_id) + + if not chunks: return [] + + # WP-15b: Candidate Pool Aufbau + chunks = propagate_section_edges(chunks, blocks) + for ch in chunks: + for e_str in parse_edges_robust(ch.text): + k, t = e_str.split(':', 1) + ch.candidate_pool.append({"kind": k, "to": t, "provenance": "explicit"}) + + # Global Pool (Unzugeordnete Kanten) + pool_match = re.search(r'###?\s*(?:Unzugeordnete Kanten|Edge Pool|Candidates)\s*\n(.*?)(?:\n#|$)', body_text, re.DOTALL | re.IGNORECASE) + if pool_match: + for e_str in parse_edges_robust(pool_match.group(1)): + k, t = e_str.split(':', 1) + for ch in chunks: ch.candidate_pool.append({"kind": k, "to": t, "provenance": "global_pool"}) + + # De-Duplikation + for ch in chunks: + seen = set(); unique = [] + for c in ch.candidate_pool: + if (c["kind"], c["to"]) not in seen: + seen.add((c["kind"], c["to"])); unique.append(c) + ch.candidate_pool = unique + + # Nachbarschaften + for i, ch in enumerate(chunks): + ch.neighbors_prev = chunks[i-1].id if i > 0 else None + ch.neighbors_next = chunks[i+1].id if i < len(chunks)-1 else None + return chunks \ No newline at end of file diff --git a/app/core/ingestion/__init__.py b/app/core/ingestion/__init__.py index e69de29..6b1f0db 100644 --- a/app/core/ingestion/__init__.py +++ b/app/core/ingestion/__init__.py @@ -0,0 +1,9 @@ +""" +FILE: app/core/ingestion/__init__.py +DESCRIPTION: Package-Einstiegspunkt für Ingestion. Exportiert den IngestionService. +VERSION: 2.13.0 +""" +from .ingestion_processor import IngestionService +from .ingestion_utils import extract_json_from_response, load_type_registry + +__all__ = ["IngestionService", "extract_json_from_response", "load_type_registry"] \ No newline at end of file