Update ingestion_db.py and ingestion_processor.py: refine documentation and enhance logging. Improve edge validation logic with robust ID resolution, and clarify comments for better understanding. Bump versions to 2.2.1 and 3.2.1, respectively.
This commit is contained in:
parent
00264a9653
commit
4318395c83
|
|
@ -4,7 +4,6 @@ DESCRIPTION: Datenbank-Schnittstelle für Note-Metadaten und Artefakt-Prüfung.
|
|||
WP-14: Umstellung auf zentrale database-Infrastruktur.
|
||||
WP-24c: Implementierung der herkunftsbasierten Lösch-Logik (Origin-Purge).
|
||||
Verhindert das versehentliche Löschen von inversen Kanten beim Re-Import.
|
||||
VERSION v2.2.0: Integration der Authority-Prüfung für Point-IDs.
|
||||
VERSION: 2.2.0 (WP-24c: Protected Purge & Authority Lookup)
|
||||
STATUS: Active
|
||||
"""
|
||||
|
|
@ -50,6 +49,7 @@ def is_explicit_edge_present(client: QdrantClient, prefix: str, edge_id: str) ->
|
|||
"""
|
||||
_, _, edges_col = collection_names(prefix)
|
||||
try:
|
||||
# retrieve erwartet eine Liste von IDs
|
||||
res = client.retrieve(
|
||||
collection_name=edges_col,
|
||||
ids=[edge_id],
|
||||
|
|
@ -83,7 +83,7 @@ def purge_artifacts(client: QdrantClient, prefix: str, note_id: str):
|
|||
# Dies umfasst:
|
||||
# - Alle ausgehenden Kanten (A -> B)
|
||||
# - Alle inversen Kanten, die diese Note in anderen Notizen "deponiert" hat (B -> A)
|
||||
# Fremde virtuelle Kanten (C -> A) bleiben erhalten, da deren origin_note_id == C ist.
|
||||
# Fremde inverse Kanten (C -> A) bleiben erhalten.
|
||||
edges_filter = rest.Filter(must=[
|
||||
rest.FieldCondition(key="origin_note_id", match=rest.MatchValue(value=note_id))
|
||||
])
|
||||
|
|
|
|||
|
|
@ -5,9 +5,9 @@ DESCRIPTION: Der zentrale IngestionService (Orchestrator).
|
|||
WP-25a: Integration der Mixture of Experts (MoE) Architektur.
|
||||
WP-15b: Two-Pass Workflow mit globalem Kontext-Cache.
|
||||
WP-20/22: Cloud-Resilienz und Content-Lifecycle integriert.
|
||||
AUDIT v3.2.0: Fix für KeyError 'target_id', TypeError (Sync-Check)
|
||||
und Business-Centric Logging.
|
||||
VERSION: 3.2.0 (WP-24c: Stability & Full Feature Set)
|
||||
AUDIT v3.2.1: Fix für ID-Kanonisierung in Phase 1 & 2,
|
||||
robuster Smart-Edge-Logger und Business-Logging.
|
||||
VERSION: 3.2.1 (WP-24c: Canonical Authority Protection)
|
||||
STATUS: Active
|
||||
"""
|
||||
import logging
|
||||
|
|
@ -56,8 +56,8 @@ class IngestionService:
|
|||
from app.config import get_settings
|
||||
self.settings = get_settings()
|
||||
|
||||
# --- LOGGING CLEANUP ---
|
||||
# Unterdrückt Bibliotheks-Lärm in Konsole und Datei
|
||||
# --- LOGGING CLEANUP (Business Focus) ---
|
||||
# Unterdrückt Bibliotheks-Lärm in Konsole und Datei (via tee)
|
||||
logging.getLogger("httpx").setLevel(logging.WARNING)
|
||||
logging.getLogger("httpcore").setLevel(logging.WARNING)
|
||||
logging.getLogger("qdrant_client").setLevel(logging.WARNING)
|
||||
|
|
@ -99,18 +99,18 @@ class IngestionService:
|
|||
if not text or len(text.strip()) < 2:
|
||||
return False
|
||||
|
||||
# Symmetrie-Filter gegen Typ-Strings
|
||||
# Nur System-Kanten (Symmetrie) filtern wir gegen die Typ-Blacklist
|
||||
if provenance != "explicit":
|
||||
blacklisted = {"insight", "event", "source", "task", "project", "person", "concept", "related_to", "referenced_by"}
|
||||
if text.lower().strip() in blacklisted:
|
||||
return False
|
||||
|
||||
if len(text) > 150: return False # Wahrscheinlich kein Titel
|
||||
if len(text) > 150: return False # Vermutlich ein ganzer Satz
|
||||
return True
|
||||
|
||||
async def run_batch(self, file_paths: List[str], vault_root: str) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
WP-15b: Implementiert den Two-Pass Ingestion Workflow.
|
||||
WP-15b: Two-Pass Ingestion Workflow.
|
||||
Pass 1: Pre-Scan füllt den Context-Cache.
|
||||
Pass 2: Verarbeitung nutzt den Cache für die semantische Prüfung.
|
||||
"""
|
||||
|
|
@ -122,7 +122,7 @@ class IngestionService:
|
|||
# Übergabe der Registry für dynamische Scan-Tiefe
|
||||
ctx = pre_scan_markdown(path, registry=self.registry)
|
||||
if ctx:
|
||||
# Mehrfache Indizierung für robusten Look-up
|
||||
# Mehrfache Indizierung für robusten Look-up (ID, Titel, Dateiname)
|
||||
self.batch_cache[ctx.note_id] = ctx
|
||||
self.batch_cache[ctx.title] = ctx
|
||||
fname = os.path.splitext(os.path.basename(path))[0]
|
||||
|
|
@ -174,6 +174,7 @@ class IngestionService:
|
|||
)
|
||||
note_id = note_pl["note_id"]
|
||||
|
||||
# BUSINESS LOG: Aktuelle Notiz
|
||||
logger.info(f"📄 Bearbeite: '{note_id}' (Typ: {note_type})")
|
||||
|
||||
old_payload = None if force_replace else fetch_note_payload(self.client, self.prefix, note_id)
|
||||
|
|
@ -208,9 +209,9 @@ class IngestionService:
|
|||
is_valid = await validate_edge_candidate(
|
||||
ch.text, cand, self.batch_cache, self.llm, profile_name="ingest_validator"
|
||||
)
|
||||
# Fix v3.2.0: Sicherer Zugriff via .get() verhindert Crash bei fehlender target_id
|
||||
t_id = cand.get('target_id') or cand.get('note_id') or "Unknown"
|
||||
logger.info(f" 🧠 [SMART EDGE] {t_id} -> {'✅ OK' if is_valid else '❌ SKIP'}")
|
||||
# Fix v3.2.1: Robuste ID-Auflösung für den Logger
|
||||
t_label = cand.get('target_id') or cand.get('note_id') or cand.get('to') or "Unknown"
|
||||
logger.info(f" 🧠 [SMART EDGE] {t_label} -> {'✅ OK' if is_valid else '❌ SKIP'}")
|
||||
if is_valid: new_pool.append(cand)
|
||||
else:
|
||||
new_pool.append(cand)
|
||||
|
|
@ -245,8 +246,12 @@ class IngestionService:
|
|||
e["virtual"] = False
|
||||
e["confidence"] = e.get("confidence", 1.0)
|
||||
|
||||
# Registrierung für Batch-Authority
|
||||
edge_id = _mk_edge_id(resolved_kind, note_id, target_raw, e.get("scope", "note"))
|
||||
# Fix v3.2.1: Kanonisierung der Target-ID vor der Registrierung!
|
||||
# Nur wenn wir hier die echte Note-ID nutzen, erkennt Phase 2 die Kollision.
|
||||
t_ctx = self.batch_cache.get(target_raw)
|
||||
t_canonical = t_ctx.note_id if t_ctx else target_raw
|
||||
|
||||
edge_id = _mk_edge_id(resolved_kind, note_id, t_canonical, e.get("scope", "note"))
|
||||
self.processed_explicit_ids.add(edge_id)
|
||||
final_edges.append(e)
|
||||
|
||||
|
|
@ -260,11 +265,12 @@ class IngestionService:
|
|||
target_id = target_ctx.note_id if target_ctx else target_raw
|
||||
|
||||
if (inv_kind and target_id and target_id != note_id and self._is_valid_note_id(target_id, provenance="structure")):
|
||||
# ID der potenziellen virtuellen Kante
|
||||
potential_id = _mk_edge_id(inv_kind, target_id, note_id, e.get("scope", "note"))
|
||||
|
||||
is_in_batch = potential_id in self.processed_explicit_ids
|
||||
|
||||
# Real-Time DB Check (Ohne 'await', da sync)
|
||||
# Real-Time DB Check (Sync)
|
||||
is_in_db = False
|
||||
if not is_in_batch:
|
||||
is_in_db = is_explicit_edge_present(self.client, self.prefix, potential_id)
|
||||
|
|
@ -312,6 +318,7 @@ class IngestionService:
|
|||
"""Erstellt eine Note aus einem Textstream."""
|
||||
target_path = os.path.join(vault_root, folder, filename)
|
||||
os.makedirs(os.path.dirname(target_path), exist_ok=True)
|
||||
with open(target_path, "w", encoding="utf-8") as f: f.write(markdown_content)
|
||||
with open(target_path, "w", encoding="utf-8") as f:
|
||||
f.write(markdown_content)
|
||||
await asyncio.sleep(0.1)
|
||||
return await self.process_file(file_path=target_path, vault_root=vault_root, apply=True, force_replace=True, purge_before=True)
|
||||
Loading…
Reference in New Issue
Block a user