diff --git a/app/core/chunker.py b/app/core/chunker.py index 88966dd..9e0c5fa 100644 --- a/app/core/chunker.py +++ b/app/core/chunker.py @@ -263,27 +263,27 @@ def _extract_all_edges_from_md(md_text: str, note_id: str, note_type: str) -> Li # Das ist notwendig, da build_edges_for_note Kanten nur aus Chunks extrahiert. dummy_chunk = { "chunk_id": f"{note_id}#full", - "text": md_text, # Der Parser schaut in 'text' (oder 'window', 'content') + "text": md_text, + "content": md_text, # Sicherstellen, dass der Parser Text findet + "window": md_text, "type": note_type } - # 2. Aufruf des Parsers mit dem Dummy-Chunk - # WICHTIG: Argumentreihenfolge aus derive_edges.py beachten: - # note_id, chunks, note_level_references=None, include_note_scope_refs=False + # 2. Aufruf des Parsers (Signatur-Fix!) + # derive_edges.py: build_edges_for_note(note_id, chunks, note_level_references=None, include_note_scope_refs=False) raw_edges = build_edges_for_note( note_id, [dummy_chunk], - note_level_references=None, + note_level_references=None, include_note_scope_refs=False ) - # 3. Kanten extrahieren und formatieren + # 3. Kanten extrahieren all_candidates = set() for e in raw_edges: - # Wir ignorieren Strukturkanten, die wir für den Dummy erstellt haben kind = e.get("kind") target = e.get("target_id") - if target and kind not in ["belongs_to", "next", "prev"]: + if target and kind not in ["belongs_to", "next", "prev", "backlink"]: all_candidates.add(f"{kind}:{target}") return list(all_candidates) diff --git a/app/core/ingestion.py b/app/core/ingestion.py index cd6b293..716237e 100644 --- a/app/core/ingestion.py +++ b/app/core/ingestion.py @@ -5,7 +5,7 @@ Zentraler Service für die Transformation von Markdown-Dateien in Qdrant-Objekte Dient als Shared Logic für: 1. CLI-Imports (scripts/import_markdown.py) 2. API-Uploads (WP-11) -Refactored for Async Embedding Support. +Refactored for Async Embedding & Async Chunking (WP-15). """ import os import logging @@ -18,6 +18,7 @@ from app.core.parser import ( validate_required_frontmatter, ) from app.core.note_payload import make_note_payload +# ASYNC CHUNKER (WP-15) from app.core.chunker import assemble_chunks from app.core.chunk_payload import make_chunk_payloads @@ -193,10 +194,15 @@ class IngestionService: # 5. Processing (Chunking, Embedding, Edges) try: body_text = getattr(parsed, "body", "") or "" - chunks = assemble_chunks(fm["id"], body_text, fm["type"]) + + # --- FIX: AWAIT ASYNC CHUNKER (WP-15 Update) --- + # assemble_chunks ist jetzt eine Coroutine und muss mit await aufgerufen werden. + chunks = await assemble_chunks(fm["id"], body_text, fm["type"]) + # ----------------------------------------------- + chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks, note_text=body_text) - # --- EMBEDDING FIX (ASYNC) --- + # --- EMBEDDING (ASYNC) --- vecs = [] if chunk_pls: texts = [c.get("window") or c.get("text") or "" for c in chunk_pls]