From 1b7b8091a3849621576e56f9da18dbb99b536f90 Mon Sep 17 00:00:00 2001
From: Lars <Lars@stommer.de>
Date: Sat, 27 Dec 2025 10:30:09 +0100
Subject: [PATCH] Move assemble_chunks into chunking_processor; add chunking and ingestion package exports

---
 app/core/chunker.py                     | 82 +------------------------
 app/core/chunking/__init__.py           | 10 +++
 app/core/chunking/chunking_processor.py | 53 ++++++++++++++++
 app/core/ingestion/__init__.py          |  9 +++
 4 files changed, 75 insertions(+), 79 deletions(-)
 create mode 100644 app/core/chunking/chunking_processor.py

diff --git a/app/core/chunker.py b/app/core/chunker.py
index d8ea589..4a624e2 100644
--- a/app/core/chunker.py
+++ b/app/core/chunker.py
@@ -1,86 +1,10 @@
 """
 FILE: app/core/chunker.py
 DESCRIPTION: Facade für das Chunking-Package. Stellt 100% Abwärtskompatibilität sicher.
-             WP-14: Modularisierung abgeschlossen.
-             WP-15b: Edge-Inheritance und Candidate-Pool Logik integriert.
-             Verwendet neue 'chunking_' Präfixe für Untermodule.
 VERSION: 3.3.0
-STATUS: Active
 """
-import asyncio
-import re
-import logging
-from typing import List, Dict, Optional
-
-# Interne Package-Imports mit neuer Präfix-Konvention
-from .chunking.chunking_models import Chunk, RawBlock
+from .chunking.chunking_processor import assemble_chunks
 from .chunking.chunking_utils import get_chunk_config, extract_frontmatter_from_text
-from .chunking.chunking_parser import parse_blocks, parse_edges_robust
-from .chunking.chunking_strategies import strategy_sliding_window, strategy_by_heading
-from .chunking.chunking_propagation import propagate_section_edges
+from .chunking.chunking_models import Chunk
 
-logger = logging.getLogger(__name__)
-
-# Legacy Support für SemanticAnalyzer (Optional für andere Skripte)
-try:
-    from app.services.semantic_analyzer import get_semantic_analyzer
-except ImportError:
-    def get_semantic_analyzer(): return None
-
-async def assemble_chunks(note_id: str, md_text: str, note_type: str, config: Optional[Dict] = None) -> List[Chunk]:
-    """
-    Hauptfunktion zur Chunk-Generierung. Orchestriert die modularisierten Komponenten.
-    Sichert die Kompatibilität zum bestehenden Ingestion-Prozess.
-    """
-    if config is None:
-        config = get_chunk_config(note_type)
-        
-    fm, body_text = extract_frontmatter_from_text(md_text)
-    primary_strategy = config.get("strategy", "sliding_window")
-
-    # 1. Parsing
-    blocks, doc_title = parse_blocks(md_text)
-    
-    # 2. Splitting via Thread-Offloading
-    if primary_strategy == "by_heading":
-        chunks = await asyncio.to_thread(strategy_by_heading, blocks, config, note_id, doc_title)
-    else:
-        chunks = await asyncio.to_thread(strategy_sliding_window, blocks, config, note_id)
-
-    if not chunks: return []
-
-    # 3. WP-15b: Candidate Pool Vorbereitung
-    # A. Edge Inheritance (Sektions-Propagation)
-    chunks = propagate_section_edges(chunks, blocks)
-    
-    # B. Explicit Edges (Direkt im Chunk-Text)
-    for ch in chunks:
-        explicit = parse_edges_robust(ch.text)
-        for e_str in explicit:
-            kind, target = e_str.split(':', 1)
-            ch.candidate_pool.append({"kind": kind, "to": target, "provenance": "explicit"})
-
-    # C. Global Pool Detection (Sektion 'Unzugeordnete Kanten')
-    pool_match = re.search(r'###?\s*(?:Unzugeordnete Kanten|Edge Pool|Candidates)\s*\n(.*?)(?:\n#|$)', body_text, re.DOTALL | re.IGNORECASE)
-    if pool_match:
-        unassigned = parse_edges_robust(pool_match.group(1))
-        for ch in chunks:
-            for e_str in unassigned:
-                kind, target = e_str.split(':', 1)
-                ch.candidate_pool.append({"kind": kind, "to": target, "provenance": "global_pool"})
-
-    # D. Eindeutigkeit sicherstellen
-    for ch in chunks:
-        seen = set(); unique_pool = []
-        for cand in ch.candidate_pool:
-            key = (cand["kind"], cand["to"])
-            if key not in seen:
-                seen.add(key); unique_pool.append(cand)
-        ch.candidate_pool = unique_pool
-
-    # 4. Graph-Struktur (Nachbarschaft)
-    for i, ch in enumerate(chunks):
-        ch.neighbors_prev = chunks[i-1].id if i > 0 else None
-        ch.neighbors_next = chunks[i+1].id if i < len(chunks)-1 else None
-
-    return chunks
\ No newline at end of file
+__all__ = ["assemble_chunks", "get_chunk_config", "extract_frontmatter_from_text", "Chunk"]
\ No newline at end of file
diff --git a/app/core/chunking/__init__.py b/app/core/chunking/__init__.py
index e69de29..0d8c4bc 100644
--- a/app/core/chunking/__init__.py
+++ b/app/core/chunking/__init__.py
@@ -0,0 +1,10 @@
+"""
+FILE: app/core/chunking/__init__.py
+DESCRIPTION: Package-Einstiegspunkt für Chunking. Exportiert assemble_chunks.
+VERSION: 3.3.0
+"""
+from .chunking_processor import assemble_chunks
+from .chunking_utils import get_chunk_config, extract_frontmatter_from_text
+from .chunking_models import Chunk
+
+__all__ = ["assemble_chunks", "get_chunk_config", "extract_frontmatter_from_text", "Chunk"]
\ No newline at end of file
diff --git a/app/core/chunking/chunking_processor.py b/app/core/chunking/chunking_processor.py
new file mode 100644
index 0000000..12c9a7b
--- /dev/null
+++ b/app/core/chunking/chunking_processor.py
@@ -0,0 +1,53 @@
+"""
+FILE: app/core/chunking/chunking_processor.py
+DESCRIPTION: Hauptlogik für das Zerlegen von Markdown in Chunks.
+"""
+import asyncio
+import re
+from typing import List, Dict, Optional
+from .chunking_models import Chunk
+from .chunking_utils import get_chunk_config, extract_frontmatter_from_text
+from .chunking_parser import parse_blocks, parse_edges_robust
+from .chunking_strategies import strategy_sliding_window, strategy_by_heading
+from .chunking_propagation import propagate_section_edges
+
+async def assemble_chunks(note_id: str, md_text: str, note_type: str, config: Optional[Dict] = None) -> List[Chunk]:
+    """Split a Markdown note into chunks, fill each chunk's candidate pool, and link neighbouring chunks by id."""
+    if config is None: config = get_chunk_config(note_type)  # no explicit config: look it up by note type
+    fm, body_text = extract_frontmatter_from_text(md_text)  # fm is not used below; body_text is searched for the global pool
+    blocks, doc_title = parse_blocks(md_text)
+    
+    if config.get("strategy") == "by_heading":
+        chunks = await asyncio.to_thread(strategy_by_heading, blocks, config, note_id, doc_title)
+    else:  # any other (or missing) strategy value uses the sliding window
+        chunks = await asyncio.to_thread(strategy_sliding_window, blocks, config, note_id)
+
+    if not chunks: return []
+
+    # WP-15b: build the candidate pool (section propagation first, then edges found in each chunk's own text)
+    chunks = propagate_section_edges(chunks, blocks)
+    for ch in chunks:
+        for e_str in parse_edges_robust(ch.text):
+            k, t = e_str.split(':', 1)  # split on the first ':' only; an entry without ':' raises ValueError
+            ch.candidate_pool.append({"kind": k, "to": t, "provenance": "explicit"})
+
+    # Global pool: edges under an "Unzugeordnete Kanten" / "Edge Pool" / "Candidates" heading go to every chunk
+    pool_match = re.search(r'###?\s*(?:Unzugeordnete Kanten|Edge Pool|Candidates)\s*\n(.*?)(?:\n#|$)', body_text, re.DOTALL | re.IGNORECASE)
+    if pool_match:
+        for e_str in parse_edges_robust(pool_match.group(1)):
+            k, t = e_str.split(':', 1)
+            for ch in chunks: ch.candidate_pool.append({"kind": k, "to": t, "provenance": "global_pool"})
+
+    # De-duplicate per chunk by (kind, to); the first occurrence is kept, so earlier provenance wins
+    for ch in chunks:
+        seen = set(); unique = []
+        for c in ch.candidate_pool:
+            if (c["kind"], c["to"]) not in seen:
+                seen.add((c["kind"], c["to"])); unique.append(c)
+        ch.candidate_pool = unique
+
+    # Neighbourhood: store the previous/next chunk id on each chunk (None at either end of the list)
+    for i, ch in enumerate(chunks):
+        ch.neighbors_prev = chunks[i-1].id if i > 0 else None
+        ch.neighbors_next = chunks[i+1].id if i < len(chunks)-1 else None
+    return chunks
\ No newline at end of file
diff --git a/app/core/ingestion/__init__.py b/app/core/ingestion/__init__.py
index e69de29..6b1f0db 100644
--- a/app/core/ingestion/__init__.py
+++ b/app/core/ingestion/__init__.py
@@ -0,0 +1,9 @@
+"""
+FILE: app/core/ingestion/__init__.py
+DESCRIPTION: Package-Einstiegspunkt für Ingestion. Exportiert den IngestionService.
+VERSION: 2.13.0
+"""
+from .ingestion_processor import IngestionService
+from .ingestion_utils import extract_json_from_response, load_type_registry
+
+__all__ = ["IngestionService", "extract_json_from_response", "load_type_registry"]
\ No newline at end of file