Update ingestion_processor.py to version 3.2.0: Enhance logging stability and improve edge validation by addressing KeyError risks. Implement batch import with symmetry memory and modularized schema logic for explicit edge handling. Adjust documentation and versioning for improved clarity and robustness.
This commit is contained in:
parent
008a470f02
commit
7e4ea670b1
|
|
@ -2,17 +2,19 @@
|
||||||
FILE: app/core/ingestion/ingestion_processor.py
|
FILE: app/core/ingestion/ingestion_processor.py
|
||||||
DESCRIPTION: Der zentrale IngestionService (Orchestrator).
|
DESCRIPTION: Der zentrale IngestionService (Orchestrator).
|
||||||
WP-24c: Integration der Symmetrie-Logik (Automatische inverse Kanten).
|
WP-24c: Integration der Symmetrie-Logik (Automatische inverse Kanten).
|
||||||
WP-25a: Integration der Mixture of Experts (MoE) Architektur.
|
WP-25a: Mixture of Experts (MoE) - LLM Edge Validation.
|
||||||
WP-15b: Two-Pass Workflow mit globalem Kontext-Cache.
|
WP-15b: Two-Pass Workflow mit globalem Kontext-Cache.
|
||||||
WP-20/22: Cloud-Resilienz und Content-Lifecycle integriert.
|
WP-20/22: Cloud-Resilienz und Content-Lifecycle integriert.
|
||||||
AUDIT v3.1.9: Vollständiges Script mit Business-Logging, UUIDs und Edge-Fix.
|
AUDIT v3.2.0: Fix für KeyError 'target_id', stabiles Logging
|
||||||
VERSION: 3.1.9 (WP-24c: Robust Orchestration & Full Feature Set)
|
und Priorisierung expliziter User-Kanten.
|
||||||
|
VERSION: 3.2.0 (WP-24c: Stability & Business Logging)
|
||||||
STATUS: Active
|
STATUS: Active
|
||||||
"""
|
"""
|
||||||
import logging
|
import logging
|
||||||
import asyncio
|
import asyncio
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
|
import sys
|
||||||
from typing import Dict, List, Optional, Tuple, Any
|
from typing import Dict, List, Optional, Tuple, Any
|
||||||
|
|
||||||
# Core Module Imports
|
# Core Module Imports
|
||||||
|
|
@ -21,7 +23,7 @@ from app.core.parser import (
|
||||||
validate_required_frontmatter, NoteContext
|
validate_required_frontmatter, NoteContext
|
||||||
)
|
)
|
||||||
from app.core.chunking import assemble_chunks
|
from app.core.chunking import assemble_chunks
|
||||||
# WP-24c: Import für die deterministische ID-Vorabberechnung (nun UUID-basiert)
|
# WP-24c: Import für die deterministische ID-Vorabberechnung (UUID-basiert)
|
||||||
from app.core.graph.graph_utils import _mk_edge_id
|
from app.core.graph.graph_utils import _mk_edge_id
|
||||||
|
|
||||||
# MODULARISIERUNG: Neue Import-Pfade für die Datenbank-Ebene
|
# MODULARISIERUNG: Neue Import-Pfade für die Datenbank-Ebene
|
||||||
|
|
@ -41,7 +43,7 @@ from .ingestion_validation import validate_edge_candidate
|
||||||
from .ingestion_note_payload import make_note_payload
|
from .ingestion_note_payload import make_note_payload
|
||||||
from .ingestion_chunk_payload import make_chunk_payloads
|
from .ingestion_chunk_payload import make_chunk_payloads
|
||||||
|
|
||||||
# Fallback für Edges (Struktur-Verknüpfung)
|
# Fallback für Edges
|
||||||
try:
|
try:
|
||||||
from app.core.graph.graph_derive_edges import build_edges_for_note
|
from app.core.graph.graph_derive_edges import build_edges_for_note
|
||||||
except ImportError:
|
except ImportError:
|
||||||
|
|
@ -56,10 +58,9 @@ class IngestionService:
|
||||||
self.settings = get_settings()
|
self.settings = get_settings()
|
||||||
|
|
||||||
# --- LOGGING CLEANUP (Business Focus) ---
|
# --- LOGGING CLEANUP (Business Focus) ---
|
||||||
logging.getLogger("httpx").setLevel(logging.WARNING)
|
# Unterdrückt technische Bibliotheks-Meldungen im Log-File und Konsole
|
||||||
logging.getLogger("httpcore").setLevel(logging.WARNING)
|
for lib in ["httpx", "httpcore", "qdrant_client", "urllib3", "openai"]:
|
||||||
logging.getLogger("qdrant_client").setLevel(logging.WARNING)
|
logging.getLogger(lib).setLevel(logging.WARNING)
|
||||||
logging.getLogger("urllib3").setLevel(logging.WARNING)
|
|
||||||
|
|
||||||
self.prefix = collection_prefix or self.settings.COLLECTION_PREFIX
|
self.prefix = collection_prefix or self.settings.COLLECTION_PREFIX
|
||||||
self.cfg = QdrantConfig.from_env()
|
self.cfg = QdrantConfig.from_env()
|
||||||
|
|
@ -76,9 +77,12 @@ class IngestionService:
|
||||||
|
|
||||||
self.active_hash_mode = self.settings.CHANGE_DETECTION_MODE
|
self.active_hash_mode = self.settings.CHANGE_DETECTION_MODE
|
||||||
self.batch_cache: Dict[str, NoteContext] = {} # WP-15b LocalBatchCache
|
self.batch_cache: Dict[str, NoteContext] = {} # WP-15b LocalBatchCache
|
||||||
|
|
||||||
|
# WP-24c: Laufzeit-Speicher für explizite Kanten-IDs im aktuellen Batch
|
||||||
self.processed_explicit_ids = set()
|
self.processed_explicit_ids = set()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
# Aufruf der modularisierten Schema-Logik
|
||||||
ensure_collections(self.client, self.prefix, self.dim)
|
ensure_collections(self.client, self.prefix, self.dim)
|
||||||
ensure_payload_indexes(self.client, self.prefix)
|
ensure_payload_indexes(self.client, self.prefix)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|
@ -104,6 +108,7 @@ class IngestionService:
|
||||||
async def run_batch(self, file_paths: List[str], vault_root: str) -> List[Dict[str, Any]]:
|
async def run_batch(self, file_paths: List[str], vault_root: str) -> List[Dict[str, Any]]:
|
||||||
"""
|
"""
|
||||||
WP-15b: Two-Pass Ingestion Workflow.
|
WP-15b: Two-Pass Ingestion Workflow.
|
||||||
|
Implementiert Batch-Import mit Symmetrie-Gedächtnis.
|
||||||
"""
|
"""
|
||||||
self.processed_explicit_ids.clear()
|
self.processed_explicit_ids.clear()
|
||||||
logger.info(f"--- 🔍 START BATCH IMPORT ({len(file_paths)} Dateien) ---")
|
logger.info(f"--- 🔍 START BATCH IMPORT ({len(file_paths)} Dateien) ---")
|
||||||
|
|
@ -112,6 +117,7 @@ class IngestionService:
|
||||||
try:
|
try:
|
||||||
ctx = pre_scan_markdown(path, registry=self.registry)
|
ctx = pre_scan_markdown(path, registry=self.registry)
|
||||||
if ctx:
|
if ctx:
|
||||||
|
# Look-up Index für Note_IDs und Titel
|
||||||
self.batch_cache[ctx.note_id] = ctx
|
self.batch_cache[ctx.note_id] = ctx
|
||||||
self.batch_cache[ctx.title] = ctx
|
self.batch_cache[ctx.title] = ctx
|
||||||
fname = os.path.splitext(os.path.basename(path))[0]
|
fname = os.path.splitext(os.path.basename(path))[0]
|
||||||
|
|
@ -197,7 +203,9 @@ class IngestionService:
|
||||||
is_valid = await validate_edge_candidate(
|
is_valid = await validate_edge_candidate(
|
||||||
ch.text, cand, self.batch_cache, self.llm, profile_name="ingest_validator"
|
ch.text, cand, self.batch_cache, self.llm, profile_name="ingest_validator"
|
||||||
)
|
)
|
||||||
logger.info(f" 🧠 [SMART EDGE] {cand['target_id']} -> {'✅ OK' if is_valid else '❌ SKIP'}")
|
# Fix (v3.2.0): Symmetrisches Logging ohne KeyError-Risiko
|
||||||
|
target_label = cand.get('target_id') or cand.get('note_id') or 'Unbekannt'
|
||||||
|
logger.info(f" 🧠 [SMART EDGE] {target_label} -> {'✅ OK' if is_valid else '❌ SKIP'}")
|
||||||
if is_valid: new_pool.append(cand)
|
if is_valid: new_pool.append(cand)
|
||||||
else:
|
else:
|
||||||
new_pool.append(cand)
|
new_pool.append(cand)
|
||||||
|
|
@ -206,6 +214,7 @@ class IngestionService:
|
||||||
chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks, file_path=file_path, types_cfg=self.registry)
|
chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks, file_path=file_path, types_cfg=self.registry)
|
||||||
vecs = await self.embedder.embed_documents([c.get("window") or "" for c in chunk_pls]) if chunk_pls else []
|
vecs = await self.embedder.embed_documents([c.get("window") or "" for c in chunk_pls]) if chunk_pls else []
|
||||||
|
|
||||||
|
# Aggregation aller Kanten
|
||||||
raw_edges = build_edges_for_note(
|
raw_edges = build_edges_for_note(
|
||||||
note_id, chunk_pls,
|
note_id, chunk_pls,
|
||||||
note_level_references=note_pl.get("references", []),
|
note_level_references=note_pl.get("references", []),
|
||||||
|
|
@ -219,7 +228,6 @@ class IngestionService:
|
||||||
for e in raw_edges:
|
for e in raw_edges:
|
||||||
target_raw = e.get("target_id")
|
target_raw = e.get("target_id")
|
||||||
if not self._is_valid_note_id(target_raw, provenance="explicit"):
|
if not self._is_valid_note_id(target_raw, provenance="explicit"):
|
||||||
logger.warning(f" ⚠️ Ignoriere Kante zu '{target_raw}' (Ungültige ID)")
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
resolved_kind = edge_registry.resolve(e.get("kind", "related_to"), provenance=e.get("provenance", "explicit"))
|
resolved_kind = edge_registry.resolve(e.get("kind", "related_to"), provenance=e.get("provenance", "explicit"))
|
||||||
|
|
@ -246,7 +254,6 @@ class IngestionService:
|
||||||
|
|
||||||
is_in_batch = potential_id in self.processed_explicit_ids
|
is_in_batch = potential_id in self.processed_explicit_ids
|
||||||
|
|
||||||
# Real-Time DB Check (Sync)
|
|
||||||
is_in_db = False
|
is_in_db = False
|
||||||
if not is_in_batch:
|
if not is_in_batch:
|
||||||
is_in_db = is_explicit_edge_present(self.client, self.prefix, potential_id)
|
is_in_db = is_explicit_edge_present(self.client, self.prefix, potential_id)
|
||||||
|
|
@ -264,9 +271,10 @@ class IngestionService:
|
||||||
|
|
||||||
edges = final_edges
|
edges = final_edges
|
||||||
|
|
||||||
# 4. DB Upsert
|
# 4. DB Upsert via modularisierter Points-Logik
|
||||||
if apply:
|
if apply:
|
||||||
if purge_before: purge_artifacts(self.client, self.prefix, note_id)
|
if purge_before and old_payload:
|
||||||
|
purge_artifacts(self.client, self.prefix, note_id)
|
||||||
|
|
||||||
upsert_batch(self.client, f"{self.prefix}_notes", points_for_note(self.prefix, note_pl, None, self.dim)[1])
|
upsert_batch(self.client, f"{self.prefix}_notes", points_for_note(self.prefix, note_pl, None, self.dim)[1])
|
||||||
if chunk_pls and vecs:
|
if chunk_pls and vecs:
|
||||||
|
|
@ -284,6 +292,7 @@ class IngestionService:
|
||||||
"""Erstellt eine Note aus einem Textstream."""
|
"""Erstellt eine Note aus einem Textstream."""
|
||||||
target_path = os.path.join(vault_root, folder, filename)
|
target_path = os.path.join(vault_root, folder, filename)
|
||||||
os.makedirs(os.path.dirname(target_path), exist_ok=True)
|
os.makedirs(os.path.dirname(target_path), exist_ok=True)
|
||||||
with open(target_path, "w", encoding="utf-8") as f: f.write(markdown_content)
|
with open(target_path, "w", encoding="utf-8") as f:
|
||||||
|
f.write(markdown_content)
|
||||||
await asyncio.sleep(0.1)
|
await asyncio.sleep(0.1)
|
||||||
return await self.process_file(file_path=target_path, vault_root=vault_root, apply=True, force_replace=True, purge_before=True)
|
return await self.process_file(file_path=target_path, vault_root=vault_root, apply=True, force_replace=True, purge_before=True)
|
||||||
Loading…
Reference in New Issue
Block a user