From 7fc316d2841aabadd324f9466c78c6ae62fd8197 Mon Sep 17 00:00:00 2001 From: Lars Date: Fri, 12 Dec 2025 12:04:31 +0100 Subject: [PATCH] bug --- app/core/chunker.py | 64 ++++++++++++++++++++++++++++++--------------- 1 file changed, 43 insertions(+), 21 deletions(-) diff --git a/app/core/chunker.py b/app/core/chunker.py index e93bceb..88966dd 100644 --- a/app/core/chunker.py +++ b/app/core/chunker.py @@ -17,8 +17,8 @@ from app.services.semantic_analyzer import get_semantic_analyzer try: from app.core.derive_edges import build_edges_for_note except ImportError: - # Mock für Tests: Signatur muss mit dem Aufruf übereinstimmen - def build_edges_for_note(text, note_id, note_type, chunks=[], references=[]): return [] + # Mock für Tests + def build_edges_for_note(note_id, chunks, note_level_references=None, include_note_scope_refs=False): return [] logger = logging.getLogger(__name__) @@ -105,7 +105,6 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]: fm, text_without_fm = extract_frontmatter_from_text(md_text) - # H1 suchen h1_match = re.search(r'^#\s+(.*)', text_without_fm, re.MULTILINE) if h1_match: h1_title = h1_match.group(1).strip() @@ -218,7 +217,7 @@ def _strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id return _strategy_sliding_window(blocks, config, note_id, doc_title, context_prefix=f"# {doc_title}") # ========================================== -# 4. ORCHESTRATION (ASYNC) +# 4. ORCHESTRATION (ASYNC) - WP-15 CORE # ========================================== async def assemble_chunks(note_id: str, md_text: str, note_type: str, config: Optional[Dict] = None) -> List[Chunk]: @@ -246,6 +245,7 @@ async def assemble_chunks(note_id: str, md_text: str, note_type: str, config: Op return [] if enable_smart_edges: + # Hier rufen wir nun die Smart Edge Allocation auf chunks = await _run_smart_edge_allocation(chunks, md_text, note_id, note_type) for i, ch in enumerate(chunks): @@ -254,36 +254,57 @@ async def assemble_chunks(note_id: str, md_text: str, note_type: str, config: Op return chunks +def _extract_all_edges_from_md(md_text: str, note_id: str, note_type: str) -> List[str]: + """ + Hilfsfunktion: Erstellt einen Dummy-Chunk für den gesamten Text und ruft + den Edge-Parser auf, um ALLE Kanten der Notiz zu finden. + """ + # 1. Dummy Chunk erstellen, der den gesamten Text enthält + # Das ist notwendig, da build_edges_for_note Kanten nur aus Chunks extrahiert. + dummy_chunk = { + "chunk_id": f"{note_id}#full", + "text": md_text, # Der Parser schaut in 'text' (oder 'window', 'content') + "type": note_type + } + + # 2. Aufruf des Parsers mit dem Dummy-Chunk + # WICHTIG: Argumentreihenfolge aus derive_edges.py beachten: + # note_id, chunks, note_level_references=None, include_note_scope_refs=False + raw_edges = build_edges_for_note( + note_id, + [dummy_chunk], + note_level_references=None, + include_note_scope_refs=False + ) + + # 3. Kanten extrahieren und formatieren + all_candidates = set() + for e in raw_edges: + # Wir ignorieren Strukturkanten, die wir für den Dummy erstellt haben + kind = e.get("kind") + target = e.get("target_id") + if target and kind not in ["belongs_to", "next", "prev"]: + all_candidates.add(f"{kind}:{target}") + + return list(all_candidates) + async def _run_smart_edge_allocation(chunks: List[Chunk], full_text: str, note_id: str, note_type: str) -> List[Chunk]: analyzer = get_semantic_analyzer() - # FIX: Nutzung von positional arguments für die ersten 3 Parameter - # Dies verhindert den "multiple values for argument" Fehler - raw_edges = build_edges_for_note( - full_text, - note_id, - note_type, - chunks=[], - references=[] - ) - - all_candidates = set() - if raw_edges: - for e in raw_edges: - if e.get("target_id") and e.get("kind") not in ["next", "prev", "belongs_to"]: - all_candidates.add(f"{e['kind']}:{e['target_id']}") - - candidate_list = list(all_candidates) + # A. Alle potenziellen Kanten der Notiz sammeln (über den Dummy-Chunk Trick) + candidate_list = _extract_all_edges_from_md(full_text, note_id, note_type) if not candidate_list: return chunks + # B. LLM Filterung pro Chunk (Parallel) tasks = [] for chunk in chunks: tasks.append(analyzer.assign_edges_to_chunk(chunk.text, candidate_list, note_type)) results_per_chunk = await asyncio.gather(*tasks) + # C. Injection & Fallback assigned_edges_global = set() for i, confirmed_edges in enumerate(results_per_chunk): @@ -296,6 +317,7 @@ async def _run_smart_edge_allocation(chunks: List[Chunk], full_text: str, note_i chunk.text += injection_str chunk.window += injection_str + # D. Fallback: Unassigned Kanten überall hin unassigned = set(candidate_list) - assigned_edges_global if unassigned: fallback_str = "\n" + " ".join([f"[[rel:{e.split(':')[0]}|{e.split(':')[1]}]]" for e in unassigned if ':' in e])