Bug fix
This commit is contained in:
parent
94e5ebf577
commit
1b7b8091a3
|
|
@ -1,86 +1,10 @@
|
|||
"""
|
||||
FILE: app/core/chunker.py
|
||||
DESCRIPTION: Facade für das Chunking-Package. Stellt 100% Abwärtskompatibilität sicher.
|
||||
WP-14: Modularisierung abgeschlossen.
|
||||
WP-15b: Edge-Inheritance und Candidate-Pool Logik integriert.
|
||||
Verwendet neue 'chunking_' Präfixe für Untermodule.
|
||||
VERSION: 3.3.0
|
||||
STATUS: Active
|
||||
"""
|
||||
import asyncio
|
||||
import re
|
||||
import logging
|
||||
from typing import List, Dict, Optional
|
||||
|
||||
# Interne Package-Imports mit neuer Präfix-Konvention
|
||||
from .chunking.chunking_models import Chunk, RawBlock
|
||||
from .chunking.chunking_processor import assemble_chunks
|
||||
from .chunking.chunking_utils import get_chunk_config, extract_frontmatter_from_text
|
||||
from .chunking.chunking_parser import parse_blocks, parse_edges_robust
|
||||
from .chunking.chunking_strategies import strategy_sliding_window, strategy_by_heading
|
||||
from .chunking.chunking_propagation import propagate_section_edges
|
||||
from .chunking.chunking_models import Chunk
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Legacy Support für SemanticAnalyzer (Optional für andere Skripte)
|
||||
try:
|
||||
from app.services.semantic_analyzer import get_semantic_analyzer
|
||||
except ImportError:
|
||||
def get_semantic_analyzer(): return None
|
||||
|
||||
async def assemble_chunks(note_id: str, md_text: str, note_type: str, config: Optional[Dict] = None) -> List[Chunk]:
    """Generate chunks for a note by orchestrating the modular chunking components.

    Facade entry point; kept for backward compatibility with the existing
    ingestion process.

    Args:
        note_id: Identifier of the note being chunked.
        md_text: Raw markdown text (may include YAML frontmatter).
        note_type: Note type used to look up the default chunk configuration.
        config: Optional explicit chunk configuration; when None, the
            type-specific defaults from ``get_chunk_config`` are used.

    Returns:
        Ordered list of ``Chunk`` objects with candidate-pool edges and
        prev/next neighbor links populated; empty list if nothing was chunked.
    """
    if config is None:
        config = get_chunk_config(note_type)

    # Frontmatter is stripped only for the global-pool search below;
    # the block parser intentionally sees the full original text.
    _fm, body_text = extract_frontmatter_from_text(md_text)
    primary_strategy = config.get("strategy", "sliding_window")

    # 1. Parsing
    blocks, doc_title = parse_blocks(md_text)

    # 2. Splitting, offloaded to a worker thread so the event loop stays responsive
    if primary_strategy == "by_heading":
        chunks = await asyncio.to_thread(strategy_by_heading, blocks, config, note_id, doc_title)
    else:
        chunks = await asyncio.to_thread(strategy_sliding_window, blocks, config, note_id)

    if not chunks:
        return []

    # 3. WP-15b: candidate-pool preparation
    # A. Edge inheritance (section propagation)
    chunks = propagate_section_edges(chunks, blocks)

    # B. Explicit edges found directly in the chunk text
    for ch in chunks:
        for e_str in parse_edges_robust(ch.text):
            # BUGFIX: a malformed entry without ':' previously raised
            # ValueError on unpacking and aborted the whole ingestion;
            # skip such entries instead.
            if ':' not in e_str:
                logger.warning("Skipping malformed explicit edge entry: %r", e_str)
                continue
            kind, target = e_str.split(':', 1)
            ch.candidate_pool.append({"kind": kind, "to": target, "provenance": "explicit"})

    # C. Global pool detection (section 'Unzugeordnete Kanten' / 'Edge Pool' / 'Candidates')
    pool_match = re.search(
        r'###?\s*(?:Unzugeordnete Kanten|Edge Pool|Candidates)\s*\n(.*?)(?:\n#|$)',
        body_text, re.DOTALL | re.IGNORECASE)
    if pool_match:
        # Filter malformed entries once, and split each edge only once
        # instead of re-splitting it per chunk.
        unassigned = [e for e in parse_edges_robust(pool_match.group(1)) if ':' in e]
        for e_str in unassigned:
            kind, target = e_str.split(':', 1)
            for ch in chunks:
                # Fresh dict per chunk so pools never share mutable state.
                ch.candidate_pool.append({"kind": kind, "to": target, "provenance": "global_pool"})

    # D. De-duplicate each chunk's pool, preserving first occurrence order
    for ch in chunks:
        seen = set()
        unique_pool = []
        for cand in ch.candidate_pool:
            key = (cand["kind"], cand["to"])
            if key not in seen:
                seen.add(key)
                unique_pool.append(cand)
        ch.candidate_pool = unique_pool

    # 4. Graph structure: link each chunk to its neighbors
    for i, ch in enumerate(chunks):
        ch.neighbors_prev = chunks[i - 1].id if i > 0 else None
        ch.neighbors_next = chunks[i + 1].id if i < len(chunks) - 1 else None

    return chunks


__all__ = ["assemble_chunks", "get_chunk_config", "extract_frontmatter_from_text", "Chunk"]
|
||||
|
|
@ -0,0 +1,10 @@
|
|||
"""
|
||||
FILE: app/core/chunking/__init__.py
|
||||
DESCRIPTION: Package-Einstiegspunkt für Chunking. Exportiert assemble_chunks.
|
||||
VERSION: 3.3.0
|
||||
"""
|
||||
from .chunking_processor import assemble_chunks
|
||||
from .chunking_utils import get_chunk_config, extract_frontmatter_from_text
|
||||
from .chunking_models import Chunk
|
||||
|
||||
__all__ = ["assemble_chunks", "get_chunk_config", "extract_frontmatter_from_text", "Chunk"]
|
||||
53
app/core/chunking/chunking_processor.py
Normal file
53
app/core/chunking/chunking_processor.py
Normal file
|
|
@ -0,0 +1,53 @@
|
|||
"""
|
||||
FILE: app/core/chunking/chunking_processor.py
|
||||
DESCRIPTION: Hauptlogik für das Zerlegen von Markdown in Chunks.
|
||||
"""
|
||||
import asyncio
|
||||
import re
|
||||
from typing import List, Dict, Optional
|
||||
from .chunking_models import Chunk
|
||||
from .chunking_utils import get_chunk_config, extract_frontmatter_from_text
|
||||
from .chunking_parser import parse_blocks, parse_edges_robust
|
||||
from .chunking_strategies import strategy_sliding_window, strategy_by_heading
|
||||
from .chunking_propagation import propagate_section_edges
|
||||
|
||||
async def assemble_chunks(note_id: str, md_text: str, note_type: str, config: Optional[Dict] = None) -> List[Chunk]:
    """Orchestrate chunking and build each chunk's candidate pool.

    Args:
        note_id: Identifier of the note being chunked.
        md_text: Raw markdown text (may include YAML frontmatter).
        note_type: Note type used to look up the default chunk configuration.
        config: Optional explicit chunk configuration; when None, the
            type-specific defaults from ``get_chunk_config`` are used.

    Returns:
        Ordered list of ``Chunk`` objects with candidate-pool edges and
        prev/next neighbor links populated; empty list if nothing was chunked.
    """
    if config is None:
        config = get_chunk_config(note_type)

    # Frontmatter is stripped only for the global-pool search below;
    # the block parser intentionally sees the full original text.
    _fm, body_text = extract_frontmatter_from_text(md_text)
    blocks, doc_title = parse_blocks(md_text)

    # Splitting is offloaded to a worker thread to keep the event loop responsive.
    if config.get("strategy") == "by_heading":
        chunks = await asyncio.to_thread(strategy_by_heading, blocks, config, note_id, doc_title)
    else:
        chunks = await asyncio.to_thread(strategy_sliding_window, blocks, config, note_id)

    if not chunks:
        return []

    # WP-15b: candidate pool build-up — start with inherited section edges.
    chunks = propagate_section_edges(chunks, blocks)

    # Explicit edges found directly in the chunk text.
    for ch in chunks:
        for e_str in parse_edges_robust(ch.text):
            # BUGFIX: a malformed entry without ':' previously raised
            # ValueError on unpacking and aborted chunking; skip it instead.
            if ':' not in e_str:
                continue
            k, t = e_str.split(':', 1)
            ch.candidate_pool.append({"kind": k, "to": t, "provenance": "explicit"})

    # Global pool (section 'Unzugeordnete Kanten' / 'Edge Pool' / 'Candidates').
    pool_match = re.search(
        r'###?\s*(?:Unzugeordnete Kanten|Edge Pool|Candidates)\s*\n(.*?)(?:\n#|$)',
        body_text, re.DOTALL | re.IGNORECASE)
    if pool_match:
        for e_str in parse_edges_robust(pool_match.group(1)):
            if ':' not in e_str:  # skip malformed entries (see BUGFIX above)
                continue
            k, t = e_str.split(':', 1)
            for ch in chunks:
                # Fresh dict per chunk so pools never share mutable state.
                ch.candidate_pool.append({"kind": k, "to": t, "provenance": "global_pool"})

    # De-duplicate each pool, preserving first occurrence order.
    for ch in chunks:
        seen = set()
        unique = []
        for c in ch.candidate_pool:
            key = (c["kind"], c["to"])
            if key not in seen:
                seen.add(key)
                unique.append(c)
        ch.candidate_pool = unique

    # Neighborhood links for the graph structure.
    for i, ch in enumerate(chunks):
        ch.neighbors_prev = chunks[i - 1].id if i > 0 else None
        ch.neighbors_next = chunks[i + 1].id if i < len(chunks) - 1 else None
    return chunks
|
||||
|
|
@ -0,0 +1,9 @@
|
|||
"""
|
||||
FILE: app/core/ingestion/__init__.py
|
||||
DESCRIPTION: Package-Einstiegspunkt für Ingestion. Exportiert den IngestionService.
|
||||
VERSION: 2.13.0
|
||||
"""
|
||||
from .ingestion_processor import IngestionService
|
||||
from .ingestion_utils import extract_json_from_response, load_type_registry
|
||||
|
||||
__all__ = ["IngestionService", "extract_json_from_response", "load_type_registry"]
|
||||
Loading…
Reference in New Issue
Block a user