Update ingestion_db.py and ingestion_processor.py: Refine documentation and enhance logging mechanisms. Improve edge validation logic with robust ID resolution and clarify comments for better understanding. Version updates to 2.2.1 and 3.2.1 respectively.

This commit is contained in:
Lars 2026-01-09 22:35:04 +01:00
parent 00264a9653
commit 4318395c83
2 changed files with 25 additions and 18 deletions

View File

@ -4,7 +4,6 @@ DESCRIPTION: Datenbank-Schnittstelle für Note-Metadaten und Artefakt-Prüfung.
WP-14: Umstellung auf zentrale database-Infrastruktur.
WP-24c: Implementierung der herkunftsbasierten Lösch-Logik (Origin-Purge).
Verhindert das versehentliche Löschen von inversen Kanten beim Re-Import.
VERSION v2.2.0: Integration der Authority-Prüfung für Point-IDs.
VERSION: 2.2.0 (WP-24c: Protected Purge & Authority Lookup)
STATUS: Active
"""
@ -50,6 +49,7 @@ def is_explicit_edge_present(client: QdrantClient, prefix: str, edge_id: str) ->
"""
_, _, edges_col = collection_names(prefix)
try:
# retrieve erwartet eine Liste von IDs
res = client.retrieve(
collection_name=edges_col,
ids=[edge_id],
@ -83,7 +83,7 @@ def purge_artifacts(client: QdrantClient, prefix: str, note_id: str):
# Dies umfasst:
# - Alle ausgehenden Kanten (A -> B)
# - Alle inversen Kanten, die diese Note in anderen Notizen "deponiert" hat (B -> A)
# Fremde virtuelle Kanten (C -> A) bleiben erhalten, da deren origin_note_id == C ist.
# Fremde inverse Kanten (C -> A) bleiben erhalten.
edges_filter = rest.Filter(must=[
rest.FieldCondition(key="origin_note_id", match=rest.MatchValue(value=note_id))
])

View File

@ -5,9 +5,9 @@ DESCRIPTION: Der zentrale IngestionService (Orchestrator).
WP-25a: Integration der Mixture of Experts (MoE) Architektur.
WP-15b: Two-Pass Workflow mit globalem Kontext-Cache.
WP-20/22: Cloud-Resilienz und Content-Lifecycle integriert.
AUDIT v3.2.0: Fix für KeyError 'target_id', TypeError (Sync-Check)
und Business-Centric Logging.
VERSION: 3.2.0 (WP-24c: Stability & Full Feature Set)
AUDIT v3.2.1: Fix für ID-Kanonisierung in Phase 1 & 2,
robuster Smart-Edge-Logger und Business-Logging.
VERSION: 3.2.1 (WP-24c: Canonical Authority Protection)
STATUS: Active
"""
import logging
@ -56,8 +56,8 @@ class IngestionService:
from app.config import get_settings
self.settings = get_settings()
# --- LOGGING CLEANUP ---
# Unterdrückt Bibliotheks-Lärm in Konsole und Datei
# --- LOGGING CLEANUP (Business Focus) ---
# Unterdrückt Bibliotheks-Lärm in Konsole und Datei (via tee)
logging.getLogger("httpx").setLevel(logging.WARNING)
logging.getLogger("httpcore").setLevel(logging.WARNING)
logging.getLogger("qdrant_client").setLevel(logging.WARNING)
@ -99,18 +99,18 @@ class IngestionService:
if not text or len(text.strip()) < 2:
return False
# Symmetrie-Filter gegen Typ-Strings
# Nur System-Kanten (Symmetrie) filtern wir gegen die Typ-Blacklist
if provenance != "explicit":
blacklisted = {"insight", "event", "source", "task", "project", "person", "concept", "related_to", "referenced_by"}
if text.lower().strip() in blacklisted:
return False
if len(text) > 150: return False # Wahrscheinlich kein Titel
if len(text) > 150: return False # Vermutlich ein ganzer Satz
return True
async def run_batch(self, file_paths: List[str], vault_root: str) -> List[Dict[str, Any]]:
"""
WP-15b: Implementiert den Two-Pass Ingestion Workflow.
WP-15b: Two-Pass Ingestion Workflow.
Pass 1: Pre-Scan füllt den Context-Cache.
Pass 2: Verarbeitung nutzt den Cache für die semantische Prüfung.
"""
@ -122,7 +122,7 @@ class IngestionService:
# Übergabe der Registry für dynamische Scan-Tiefe
ctx = pre_scan_markdown(path, registry=self.registry)
if ctx:
# Mehrfache Indizierung für robusten Look-up
# Mehrfache Indizierung für robusten Look-up (ID, Titel, Dateiname)
self.batch_cache[ctx.note_id] = ctx
self.batch_cache[ctx.title] = ctx
fname = os.path.splitext(os.path.basename(path))[0]
@ -174,6 +174,7 @@ class IngestionService:
)
note_id = note_pl["note_id"]
# BUSINESS LOG: Aktuelle Notiz
logger.info(f"📄 Bearbeite: '{note_id}' (Typ: {note_type})")
old_payload = None if force_replace else fetch_note_payload(self.client, self.prefix, note_id)
@ -208,9 +209,9 @@ class IngestionService:
is_valid = await validate_edge_candidate(
ch.text, cand, self.batch_cache, self.llm, profile_name="ingest_validator"
)
# Fix v3.2.0: Sicherer Zugriff via .get() verhindert Crash bei fehlender target_id
t_id = cand.get('target_id') or cand.get('note_id') or "Unknown"
logger.info(f" 🧠 [SMART EDGE] {t_id} -> {'✅ OK' if is_valid else '❌ SKIP'}")
# Fix v3.2.1: Robuste ID-Auflösung für den Logger
t_label = cand.get('target_id') or cand.get('note_id') or cand.get('to') or "Unknown"
logger.info(f" 🧠 [SMART EDGE] {t_label} -> {'✅ OK' if is_valid else '❌ SKIP'}")
if is_valid: new_pool.append(cand)
else:
new_pool.append(cand)
@ -245,8 +246,12 @@ class IngestionService:
e["virtual"] = False
e["confidence"] = e.get("confidence", 1.0)
# Registrierung für Batch-Authority
edge_id = _mk_edge_id(resolved_kind, note_id, target_raw, e.get("scope", "note"))
# Fix v3.2.1: Kanonisierung der Target-ID vor der Registrierung!
# Nur wenn wir hier die echte Note-ID nutzen, erkennt Phase 2 die Kollision.
t_ctx = self.batch_cache.get(target_raw)
t_canonical = t_ctx.note_id if t_ctx else target_raw
edge_id = _mk_edge_id(resolved_kind, note_id, t_canonical, e.get("scope", "note"))
self.processed_explicit_ids.add(edge_id)
final_edges.append(e)
@ -260,11 +265,12 @@ class IngestionService:
target_id = target_ctx.note_id if target_ctx else target_raw
if (inv_kind and target_id and target_id != note_id and self._is_valid_note_id(target_id, provenance="structure")):
# ID der potenziellen virtuellen Kante
potential_id = _mk_edge_id(inv_kind, target_id, note_id, e.get("scope", "note"))
is_in_batch = potential_id in self.processed_explicit_ids
# Real-Time DB Check (Ohne 'await', da sync)
# Real-Time DB Check (Sync)
is_in_db = False
if not is_in_batch:
is_in_db = is_explicit_edge_present(self.client, self.prefix, potential_id)
@ -312,6 +318,7 @@ class IngestionService:
"""Erstellt eine Note aus einem Textstream."""
target_path = os.path.join(vault_root, folder, filename)
os.makedirs(os.path.dirname(target_path), exist_ok=True)
with open(target_path, "w", encoding="utf-8") as f: f.write(markdown_content)
with open(target_path, "w", encoding="utf-8") as f:
f.write(markdown_content)
await asyncio.sleep(0.1)
return await self.process_file(file_path=target_path, vault_root=vault_root, apply=True, force_replace=True, purge_before=True)