Update ingestion_db.py, ingestion_processor.py, and import_markdown.py: Enhance documentation and logging clarity, improve artifact purging and symmetry injection logic, and implement stricter authority checks. Update versioning to 2.6.0 and 3.3.7 to reflect changes in functionality and maintain compatibility with the ingestion service.

This commit is contained in:
Lars 2026-01-10 08:06:07 +01:00
parent 57656bbaaf
commit ec89d83916
3 changed files with 117 additions and 91 deletions

View File

@@ -2,11 +2,11 @@
FILE: app/core/ingestion/ingestion_db.py FILE: app/core/ingestion/ingestion_db.py
DESCRIPTION: Datenbank-Schnittstelle für Note-Metadaten und Artefakt-Prüfung. DESCRIPTION: Datenbank-Schnittstelle für Note-Metadaten und Artefakt-Prüfung.
WP-14: Umstellung auf zentrale database-Infrastruktur. WP-14: Umstellung auf zentrale database-Infrastruktur.
WP-20/22: Integration von Cloud-Resilienz und Fehlerbehandlung. WP-20/22: Cloud-Resilienz und Fehlerbehandlung.
WP-24c: Implementierung der herkunftsbasierten Lösch-Logik (Origin-Purge). WP-24c: Implementierung der herkunftsbasierten Lösch-Logik (Origin-Purge).
Verhindert das versehentliche Löschen von inversen Kanten beim Re-Import. Verhindert das versehentliche Löschen von inversen Kanten beim Re-Import.
Integration der Authority-Prüfung für Point-IDs zur Symmetrie-Validierung. Integration der Authority-Prüfung für Point-IDs zur Symmetrie-Validierung.
VERSION: 2.2.0 (WP-24c: Protected Purge & Authority Lookup) VERSION: 2.2.1 (WP-24c: Robust Authority Lookup)
STATUS: Active STATUS: Active
""" """
import logging import logging
@@ -45,26 +45,57 @@ def artifacts_missing(client: QdrantClient, prefix: str, note_id: str) -> Tuple[
def is_explicit_edge_present(client: QdrantClient, prefix: str, edge_id: str) -> bool: def is_explicit_edge_present(client: QdrantClient, prefix: str, edge_id: str) -> bool:
""" """
WP-24c: Prüft via Point-ID, ob bereits eine explizite Kante existiert. WP-24c: Prüft via Point-ID, ob bereits eine explizite Kante existiert.
Wird vom IngestionProcessor genutzt, um das Überschreiben von manuellem Wissen Wird vom IngestionProcessor in Phase 2 genutzt, um das Überschreiben
durch virtuelle Symmetrie-Kanten zu verhindern. von manuellem Wissen durch virtuelle Symmetrie-Kanten zu verhindern.
""" """
if not edge_id: return False
_, _, edges_col = collection_names(prefix) _, _, edges_col = collection_names(prefix)
try: try:
res = client.retrieve(collection_name=edges_col, ids=[edge_id], with_payload=True) # retrieve ist der schnellste Weg, um einen spezifischen Punkt via ID zu laden
if res and not res[0].payload.get("virtual", False): res = client.retrieve(
collection_name=edges_col,
ids=[edge_id],
with_payload=True
)
# Wenn der Punkt existiert und NICHT virtuell ist, handelt es sich um eine Nutzer-Autorität
if res and len(res) > 0:
payload = res[0].payload
if not payload.get("virtual", False):
return True return True
return False return False
except Exception: except Exception as e:
logger.debug(f"Authority check for {edge_id} failed: {e}")
return False return False
def purge_artifacts(client: QdrantClient, prefix: str, note_id: str): def purge_artifacts(client: QdrantClient, prefix: str, note_id: str):
"""WP-24c: Selektives Löschen von Artefakten (Origin-Purge).""" """
WP-24c: Selektives Löschen von Artefakten vor einem Re-Import.
Implementiert das Origin-Purge-Prinzip zur Sicherung der bidirektionalen Graph-Integrität.
"""
_, chunks_col, edges_col = collection_names(prefix) _, chunks_col, edges_col = collection_names(prefix)
try: try:
chunks_filter = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))]) # 1. Chunks löschen (immer fest an die note_id gebunden)
client.delete(collection_name=chunks_col, points_selector=rest.FilterSelector(filter=chunks_filter)) chunks_filter = rest.Filter(must=[
edges_filter = rest.Filter(must=[rest.FieldCondition(key="origin_note_id", match=rest.MatchValue(value=note_id))]) rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))
client.delete(collection_name=edges_col, points_selector=rest.FilterSelector(filter=edges_filter)) ])
client.delete(
collection_name=chunks_col,
points_selector=rest.FilterSelector(filter=chunks_filter)
)
# 2. WP-24c: Kanten löschen (HERKUNFTS-BASIERT via origin_note_id)
# Wir löschen alle Kanten, die von DIESER Note erzeugt wurden.
edges_filter = rest.Filter(must=[
rest.FieldCondition(key="origin_note_id", match=rest.MatchValue(value=note_id))
])
client.delete(
collection_name=edges_col,
points_selector=rest.FilterSelector(filter=edges_filter)
)
logger.info(f"🧹 [PURGE] Global artifacts owned by '{note_id}' cleared.") logger.info(f"🧹 [PURGE] Global artifacts owned by '{note_id}' cleared.")
except Exception as e: except Exception as e:
logger.error(f"❌ [PURGE ERROR] Failed to clear artifacts for {note_id}: {e}") logger.error(f"❌ [PURGE ERROR] Failed to clear artifacts for {note_id}: {e}")

View File

@@ -5,10 +5,10 @@ DESCRIPTION: Der zentrale IngestionService (Orchestrator).
WP-25a: Integration der Mixture of Experts (MoE) Architektur. WP-25a: Integration der Mixture of Experts (MoE) Architektur.
WP-15b: Two-Pass Workflow mit globalem Kontext-Cache. WP-15b: Two-Pass Workflow mit globalem Kontext-Cache.
WP-20/22: Cloud-Resilienz und Content-Lifecycle integriert. WP-20/22: Cloud-Resilienz und Content-Lifecycle integriert.
AUDIT v3.3.6: Strikte Phasentrennung (Phase 2 global am Ende). AUDIT v3.3.7: Strikte globale Phasentrennung.
Fix für .trash-Folder und Pydantic 'None'-Crash. Fix für Pydantic Crash (None-ID Guard Clauses).
Vollständige Wiederherstellung des Business-Loggings. Erzwingung der Konsistenz (wait=True).
VERSION: 3.3.6 (WP-24c: Full Transparency Orchestration) VERSION: 3.3.7 (WP-24c: Strict Authority Commitment)
STATUS: Active STATUS: Active
""" """
import logging import logging
@@ -26,7 +26,7 @@ from app.core.chunking import assemble_chunks
# WP-24c: Import für die deterministische UUID-Vorabberechnung # WP-24c: Import für die deterministische UUID-Vorabberechnung
from app.core.graph.graph_utils import _mk_edge_id from app.core.graph.graph_utils import _mk_edge_id
# Datenbank-Ebene (Modularisierte database-Infrastruktur) # MODULARISIERUNG: Neue Import-Pfade für die Datenbank-Ebene
from app.core.database.qdrant import QdrantConfig, get_client, ensure_collections, ensure_payload_indexes from app.core.database.qdrant import QdrantConfig, get_client, ensure_collections, ensure_payload_indexes
from app.core.database.qdrant_points import points_for_chunks, points_for_note, points_for_edges, upsert_batch from app.core.database.qdrant_points import points_for_chunks, points_for_note, points_for_edges, upsert_batch
from qdrant_client.http import models as rest from qdrant_client.http import models as rest
@@ -53,12 +53,12 @@ logger = logging.getLogger(__name__)
class IngestionService: class IngestionService:
def __init__(self, collection_prefix: str = None): def __init__(self, collection_prefix: str = None):
"""Initialisiert den Service und bereinigt das technische Logging.""" """Initialisiert den Service und nutzt die neue database-Infrastruktur."""
from app.config import get_settings from app.config import get_settings
self.settings = get_settings() self.settings = get_settings()
# --- LOGGING CLEANUP (Business Focus) --- # --- LOGGING CLEANUP (Business Focus) ---
# Unterdrückt HTTP-Bibliotheks-Lärm, erhält aber inhaltliche Service-Logs # Unterdrückt technische Bibliotheks-Header, erhält aber inhaltliche Service-Logs
for lib in ["httpx", "httpcore", "qdrant_client", "urllib3", "openai"]: for lib in ["httpx", "httpcore", "qdrant_client", "urllib3", "openai"]:
logging.getLogger(lib).setLevel(logging.WARNING) logging.getLogger(lib).setLevel(logging.WARNING)
@@ -71,47 +71,49 @@ class IngestionService:
self.embedder = EmbeddingsClient() self.embedder = EmbeddingsClient()
self.llm = LLMService() self.llm = LLMService()
# WP-25a: Dimensionen über das LLM-Profil auflösen # WP-25a: Auflösung der Dimension über das Embedding-Profil (MoE)
embed_cfg = self.llm.profiles.get("embedding_expert", {}) embed_cfg = self.llm.profiles.get("embedding_expert", {})
self.dim = embed_cfg.get("dimensions") or self.settings.VECTOR_SIZE self.dim = embed_cfg.get("dimensions") or self.settings.VECTOR_SIZE
# Festlegen des Change-Detection Modus
self.active_hash_mode = self.settings.CHANGE_DETECTION_MODE self.active_hash_mode = self.settings.CHANGE_DETECTION_MODE
# WP-15b: Kontext-Gedächtnis für ID-Auflösung (Global) # WP-15b: Kontext-Gedächtnis für ID-Auflösung
self.batch_cache: Dict[str, NoteContext] = {} self.batch_cache: Dict[str, NoteContext] = {}
# WP-24c: Puffer für Phase 2 (Symmetrie-Injektion am Ende des gesamten Imports) # WP-24c: Puffer für Phase 2 (Symmetrie-Injektion nach dem gesamten Import)
self.symmetry_buffer: List[Dict[str, Any]] = [] self.symmetry_buffer: List[Dict[str, Any]] = []
try: try:
# Schema-Prüfung und Initialisierung # Aufruf der modularisierten Schema-Logik
ensure_collections(self.client, self.prefix, self.dim) ensure_collections(self.client, self.prefix, self.dim)
ensure_payload_indexes(self.client, self.prefix) ensure_payload_indexes(self.client, self.prefix)
except Exception as e: except Exception as e:
logger.warning(f"DB initialization warning: {e}") logger.warning(f"DB initialization warning: {e}")
def _is_valid_note_id(self, text: str) -> bool: def _is_valid_note_id(self, text: Optional[str]) -> bool:
""" """
WP-24c: Prüft Ziel-Strings auf fachliche Validität. WP-24c: Prüft Ziel-Strings auf fachliche Validität.
Verhindert Müll-Kanten zu reinen System-Platzhaltern. Verhindert Müll-Kanten zu System-Platzhaltern.
""" """
if not text or len(text.strip()) < 2: return False if not text or not isinstance(text, str) or len(text.strip()) < 2:
blacklisted = {"insight", "event", "source", "task", "project", "person", "concept", "related_to", "referenced_by"} return False
if text.lower().strip() in blacklisted: return False
blacklisted = {"insight", "event", "source", "task", "project", "person", "concept", "related_to", "referenced_by", "none", "unknown"}
if text.lower().strip() in blacklisted:
return False
if len(text) > 200: return False if len(text) > 200: return False
return True return True
async def run_batch(self, file_paths: List[str], vault_root: str) -> Dict[str, Any]: async def run_batch(self, file_paths: List[str], vault_root: str) -> Dict[str, Any]:
""" """
WP-15b: Two-Pass Ingestion Workflow (PHASE 1). WP-15b: Two-Pass Ingestion Workflow (PHASE 1).
Füllt den Cache und verarbeitet Dateien batchweise. Verarbeitet Batches und schreibt NUR Nutzer-Autorität in die DB.
Gibt ein Dictionary zurück, um Kompatibilität zum Orchestrator zu wahren.
""" """
self.batch_cache.clear() self.batch_cache.clear()
logger.info(f"--- 🔍 START BATCH PHASE 1 ({len(file_paths)} Dateien) ---") logger.info(f"--- 🔍 START BATCH PHASE 1 ({len(file_paths)} Dateien) ---")
# 1. Schritt: Pre-Scan (Context-Cache befüllen) # 1. Schritt: Pre-Scan (Context-Cache füllen)
for path in file_paths: for path in file_paths:
try: try:
ctx = pre_scan_markdown(path, registry=self.registry) ctx = pre_scan_markdown(path, registry=self.registry)
@@ -123,7 +125,7 @@ class IngestionService:
except Exception as e: except Exception as e:
logger.warning(f" ⚠️ Pre-scan fehlgeschlagen für {path}: {e}") logger.warning(f" ⚠️ Pre-scan fehlgeschlagen für {path}: {e}")
# 2. Schritt: Batch-Verarbeitung (Authority Only) # 2. Schritt: PROCESSING
processed_count = 0 processed_count = 0
success_count = 0 success_count = 0
for p in file_paths: for p in file_paths:
@@ -132,52 +134,56 @@ class IngestionService:
if res.get("status") == "success": if res.get("status") == "success":
success_count += 1 success_count += 1
logger.info(f"--- ✅ Batch Phase 1 abgeschlossen ({success_count}/{processed_count}) ---") logger.info(f"--- ✅ Batch Phase 1 beendet ({success_count}/{processed_count}) ---")
return { return {
"status": "success", "status": "success",
"processed": processed_count, "processed": processed_count,
"success": success_count, "success": success_count,
"buffered_virtuals": len(self.symmetry_buffer) "buffered_symmetries": len(self.symmetry_buffer)
} }
async def commit_vault_symmetries(self) -> Dict[str, Any]: async def commit_vault_symmetries(self) -> Dict[str, Any]:
""" """
WP-24c: Führt PHASE 2 (Symmetrie-Injektion) für den gesamten Vault aus. WP-24c: Führt PHASE 2 (Globale Symmetrie-Injektion) aus.
Wird nach Abschluss aller Batches einmalig aufgerufen. Wird einmalig am Ende des gesamten Imports aufgerufen.
Vergleicht gepufferte Kanten gegen die Instance-of-Truth in Qdrant. Sorgt dafür, dass virtuelle Kanten erst NACH der Nutzer-Autorität geschrieben werden.
""" """
if not self.symmetry_buffer: if not self.symmetry_buffer:
logger.info("⏭️ Symmetrie-Puffer ist leer. Keine Aktion erforderlich.") logger.info("⏭️ Symmetrie-Puffer leer. Keine Aktion erforderlich.")
return {"status": "skipped", "reason": "buffer_empty"} return {"status": "skipped", "reason": "buffer_empty"}
logger.info(f"🔄 PHASE 2: Validiere {len(self.symmetry_buffer)} Symmetrie-Kanten gegen die Instance-of-Truth...") logger.info(f"🔄 PHASE 2: Validiere {len(self.symmetry_buffer)} Symmetrie-Vorschläge gegen Live-DB...")
final_virtuals = [] final_virtuals = []
for v_edge in self.symmetry_buffer: for v_edge in self.symmetry_buffer:
# Deterministische ID der potenziellen Symmetrie berechnen # Sicherheits-Check: Keine Kanten ohne Ziele zulassen
if not v_edge.get("target_id") or v_edge.get("target_id") == "None":
continue
# ID der potenziellen Symmetrie berechnen
v_id = _mk_edge_id(v_edge["kind"], v_edge["note_id"], v_edge["target_id"], v_edge.get("scope", "note")) v_id = _mk_edge_id(v_edge["kind"], v_edge["note_id"], v_edge["target_id"], v_edge.get("scope", "note"))
# AUTHORITY-CHECK: Nur schreiben, wenn KEINE manuelle Kante in der DB existiert # AUTHORITY-CHECK: Nur schreiben, wenn KEINE manuelle Kante in der DB existiert
if not is_explicit_edge_present(self.client, self.prefix, v_id): if not is_explicit_edge_present(self.client, self.prefix, v_id):
final_virtuals.append(v_edge) final_virtuals.append(v_edge)
# Detailliertes Logging für volle Transparenz logger.info(f" 🔄 [SYMMETRY] Erzeuge Gegenkante: {v_edge['note_id']} --({v_edge['kind']})--> {v_edge['target_id']}")
logger.info(f" 🔄 [SYMMETRY] Add inverse: {v_edge['note_id']} --({v_edge['kind']})--> {v_edge['target_id']}")
else: else:
logger.debug(f" 🛡️ Schutz: Manuelle Kante belegt ID {v_id}. Symmetrie verworfen.") logger.debug(f" 🛡️ Schutz: Manuelle Kante belegt ID {v_id}. Symmetrie verworfen.")
added_count = 0 added_count = 0
if final_virtuals: if final_virtuals:
logger.info(f"📤 Schreibe {len(final_virtuals)} validierte Symmetrie-Kanten in den Graphen.") logger.info(f"📤 Schreibe {len(final_virtuals)} geschützte Symmetrie-Kanten in Qdrant.")
e_pts = points_for_edges(self.prefix, final_virtuals)[1] e_pts = points_for_edges(self.prefix, final_virtuals)[1]
upsert_batch(self.client, f"{self.prefix}_edges", e_pts) upsert_batch(self.client, f"{self.prefix}_edges", e_pts, wait=True)
added_count = len(final_virtuals) added_count = len(final_virtuals)
self.symmetry_buffer.clear() # Puffer nach erfolgreichem Commit leeren self.symmetry_buffer.clear() # Puffer leeren
return {"status": "success", "added": added_count} return {"status": "success", "added": added_count}
async def process_file(self, file_path: str, vault_root: str, **kwargs) -> Dict[str, Any]: async def process_file(self, file_path: str, vault_root: str, **kwargs) -> Dict[str, Any]:
""" """
Transformiert eine Markdown-Datei in Phase 1 (Authority First). Transformiert eine Markdown-Datei.
Implementiert Ordner-Blacklists, Pydantic-Safety und MoE-Validierung. Schreibt Notes/Chunks/Explicit Edges sofort (Phase 1).
Befüllt den Symmetrie-Puffer für die globale Phase 2.
""" """
apply = kwargs.get("apply", False) apply = kwargs.get("apply", False)
force_replace = kwargs.get("force_replace", False) force_replace = kwargs.get("force_replace", False)
@@ -186,7 +192,7 @@ class IngestionService:
result = {"path": file_path, "status": "skipped", "changed": False, "error": None} result = {"path": file_path, "status": "skipped", "changed": False, "error": None}
try: try:
# --- ORDNER-FILTER (Fix für .trash und .obsidian Junk) --- # --- ORDNER-FILTER (.trash) ---
if any(part.startswith('.') for part in file_path.split(os.sep)): if any(part.startswith('.') for part in file_path.split(os.sep)):
return {**result, "status": "skipped", "reason": "hidden_folder"} return {**result, "status": "skipped", "reason": "hidden_folder"}
@@ -195,7 +201,6 @@ class IngestionService:
if any(folder in file_path for folder in ignore_folders): if any(folder in file_path for folder in ignore_folders):
return {**result, "status": "skipped", "reason": "folder_blacklist"} return {**result, "status": "skipped", "reason": "folder_blacklist"}
# Datei einlesen und validieren
parsed = read_markdown(file_path) parsed = read_markdown(file_path)
if not parsed: return {**result, "error": "Empty file"} if not parsed: return {**result, "error": "Empty file"}
fm = normalize_frontmatter(parsed.frontmatter) fm = normalize_frontmatter(parsed.frontmatter)
@@ -205,7 +210,7 @@ class IngestionService:
note_pl = make_note_payload(parsed, vault_root=vault_root, file_path=file_path, types_cfg=self.registry) note_pl = make_note_payload(parsed, vault_root=vault_root, file_path=file_path, types_cfg=self.registry)
note_id = note_pl.get("note_id") note_id = note_pl.get("note_id")
# --- FIX: Guard Clause gegen 'None' IDs (Verhindert Pydantic Crash) --- # --- GUARD CLAUSE: Fehlende IDs verhindern PointStruct-Crash ---
if not note_id: if not note_id:
logger.warning(f" ⚠️ Fehlende note_id in '{file_path}'. Datei wird ignoriert.") logger.warning(f" ⚠️ Fehlende note_id in '{file_path}'. Datei wird ignoriert.")
return {**result, "status": "error", "error": "missing_note_id"} return {**result, "status": "error", "error": "missing_note_id"}
@@ -221,7 +226,7 @@ class IngestionService:
if not apply: if not apply:
return {**result, "status": "dry-run", "changed": True, "note_id": note_id} return {**result, "status": "dry-run", "changed": True, "note_id": note_id}
# Chunks erzeugen und semantisch validieren (MoE) # Deep Processing & MoE (LLM Validierung)
profile = note_pl.get("chunk_profile", "sliding_standard") profile = note_pl.get("chunk_profile", "sliding_standard")
chunk_cfg = get_chunk_config_by_profile(self.registry, profile, note_type) chunk_cfg = get_chunk_config_by_profile(self.registry, profile, note_type)
enable_smart = chunk_cfg.get("enable_smart_edge_allocation", False) enable_smart = chunk_cfg.get("enable_smart_edge_allocation", False)
@@ -230,45 +235,45 @@ class IngestionService:
for ch in chunks: for ch in chunks:
new_pool = [] new_pool = []
for cand in getattr(ch, "candidate_pool", []): for cand in getattr(ch, "candidate_pool", []):
# --- GUARD: Ungültige Ziele im Candidate-Pool filtern ---
t_id = cand.get('target_id') or cand.get('note_id')
if not self._is_valid_note_id(t_id):
continue
if cand.get("provenance") == "global_pool" and enable_smart: if cand.get("provenance") == "global_pool" and enable_smart:
# Detailliertes Business-Logging für LLM-Aktivitäten logger.info(f" ⚖️ [VALIDATING] Relation to '{t_id}' via Expert-LLM...")
target_label = cand.get('target_id') or cand.get('note_id') or "Unknown"
logger.info(f" ⚖️ [VALIDATING] Relation to '{target_label}' via Expert-LLM...")
is_valid = await validate_edge_candidate(ch.text, cand, self.batch_cache, self.llm) is_valid = await validate_edge_candidate(ch.text, cand, self.batch_cache, self.llm)
logger.info(f" 🧠 [SMART EDGE] {t_id} -> {'✅ OK' if is_valid else '❌ SKIP'}")
logger.info(f" 🧠 [SMART EDGE] {target_label} -> {'✅ OK' if is_valid else '❌ SKIP'}")
if is_valid: new_pool.append(cand) if is_valid: new_pool.append(cand)
else: else:
new_pool.append(cand) new_pool.append(cand)
ch.candidate_pool = new_pool ch.candidate_pool = new_pool
# Embeddings und Payloads
chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks, file_path=file_path, types_cfg=self.registry) chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks, file_path=file_path, types_cfg=self.registry)
vecs = await self.embedder.embed_documents([c.get("window") or "" for c in chunk_pls]) if chunk_pls else [] vecs = await self.embedder.embed_documents([c.get("window") or "" for c in chunk_pls]) if chunk_pls else []
# Kanten-Extraktion mit ID-Kanonisierung # Kanten-Logik (Kanonisierung via batch_cache)
raw_edges = build_edges_for_note(note_id, chunk_pls, note_level_references=note_pl.get("references", [])) raw_edges = build_edges_for_note(note_id, chunk_pls, note_level_references=note_pl.get("references", []))
explicit_edges = [] explicit_edges = []
for e in raw_edges: for e in raw_edges:
target_raw = e.get("target_id") target_raw = e.get("target_id")
# Auflösung von Titeln/Dateinamen zu echten IDs über den globalen Cache # ID-Resolution über den Context-Cache
target_ctx = self.batch_cache.get(target_raw) t_ctx = self.batch_cache.get(target_raw)
target_id = target_ctx.note_id if target_ctx else target_raw target_id = t_ctx.note_id if t_ctx else target_raw
if not self._is_valid_note_id(target_id): continue if not self._is_valid_note_id(target_id): continue
resolved_kind = edge_registry.resolve(e.get("kind", "related_to"), provenance=e.get("provenance", "explicit")) resolved_kind = edge_registry.resolve(e.get("kind", "related_to"), provenance=e.get("provenance", "explicit"))
# Echte physische Kante markieren (Phase 1 Autorität) # Echte physische Kante markieren (Phase 1 Authority)
e.update({ e.update({
"kind": resolved_kind, "target_id": target_id, "kind": resolved_kind, "target_id": target_id,
"origin_note_id": note_id, "virtual": False, "confidence": 1.0 "origin_note_id": note_id, "virtual": False, "confidence": 1.0
}) })
explicit_edges.append(e) explicit_edges.append(e)
# Symmetrie-Kandidat für die globale Phase 2 puffern # Symmetrie-Kandidat puffern
inv_kind = edge_registry.get_inverse(resolved_kind) inv_kind = edge_registry.get_inverse(resolved_kind)
if inv_kind and target_id != note_id: if inv_kind and target_id != note_id:
v_edge = e.copy() v_edge = e.copy()
@@ -287,7 +292,8 @@ class IngestionService:
if chunk_pls and vecs: if chunk_pls and vecs:
upsert_batch(self.client, f"{self.prefix}_chunks", points_for_chunks(self.prefix, chunk_pls, vecs)[1]) upsert_batch(self.client, f"{self.prefix}_chunks", points_for_chunks(self.prefix, chunk_pls, vecs)[1])
if explicit_edges: if explicit_edges:
upsert_batch(self.client, f"{self.prefix}_edges", points_for_edges(self.prefix, explicit_edges)[1]) # Wichtig: wait=True stellt sicher, dass die Kanten in Phase 2 searchable sind
upsert_batch(self.client, f"{self.prefix}_edges", points_for_edges(self.prefix, explicit_edges)[1], wait=True)
logger.info(f" ✨ Phase 1 fertig: {len(chunk_pls)} Chunks, {len(explicit_edges)} explizite Kanten.") logger.info(f" ✨ Phase 1 fertig: {len(chunk_pls)} Chunks, {len(explicit_edges)} explizite Kanten.")
return {"status": "success", "note_id": note_id, "edges_count": len(explicit_edges)} return {"status": "success", "note_id": note_id, "edges_count": len(explicit_edges)}
@@ -300,7 +306,6 @@ class IngestionService:
"""Erstellt eine Note aus einem Textstream.""" """Erstellt eine Note aus einem Textstream."""
target_path = os.path.join(vault_root, folder, filename) target_path = os.path.join(vault_root, folder, filename)
os.makedirs(os.path.dirname(target_path), exist_ok=True) os.makedirs(os.path.dirname(target_path), exist_ok=True)
with open(target_path, "w", encoding="utf-8") as f: with open(target_path, "w", encoding="utf-8") as f: f.write(markdown_content)
f.write(markdown_content)
await asyncio.sleep(0.1) await asyncio.sleep(0.1)
return await self.process_file(file_path=target_path, vault_root=vault_root, apply=True, force_replace=True, purge_before=True) return await self.process_file(file_path=target_path, vault_root=vault_root, apply=True, force_replace=True, purge_before=True)

View File

@@ -2,19 +2,11 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
""" """
FILE: scripts/import_markdown.py FILE: scripts/import_markdown.py
VERSION: 2.5.0 (2026-01-10) VERSION: 2.6.0 (2026-01-10)
STATUS: Active (Core) STATUS: Active (Core)
COMPATIBILITY: IngestionProcessor v3.3.5+ COMPATIBILITY: IngestionProcessor v3.3.7+
Zweck: Hauptwerkzeug zum Importieren von Markdown-Dateien.
Zweck:
-------
Hauptwerkzeug zum Importieren von Markdown-Dateien aus einem Vault in Qdrant.
Implementiert die globale 2-Phasen-Schreibstrategie. Implementiert die globale 2-Phasen-Schreibstrategie.
Änderungen v2.5.0:
------------------
- Globale Phasentrennung: commit_vault_symmetries() wird erst am Ende aufgerufen.
- Erweiterter Ordner-Filter: Schließt .trash und andere Systemordner aus.
""" """
import asyncio import asyncio
import os import os
@@ -24,10 +16,8 @@ import sys
from pathlib import Path from pathlib import Path
from dotenv import load_dotenv from dotenv import load_dotenv
# Setzt das Level global auf INFO # Root Logger Setup
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s') logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s')
# Stelle sicher, dass das Root-Verzeichnis im Python-Pfad ist
sys.path.append(os.getcwd()) sys.path.append(os.getcwd())
from app.core.ingestion import IngestionService from app.core.ingestion import IngestionService
@@ -41,14 +31,13 @@ async def main_async(args):
logger.error(f"Vault path does not exist: {vault_path}") logger.error(f"Vault path does not exist: {vault_path}")
return return
# 1. Service initialisieren
logger.info(f"Initializing IngestionService (Prefix: {args.prefix})") logger.info(f"Initializing IngestionService (Prefix: {args.prefix})")
service = IngestionService(collection_prefix=args.prefix) service = IngestionService(collection_prefix=args.prefix)
logger.info(f"Scanning {vault_path}...") logger.info(f"Scanning {vault_path}...")
all_files = list(vault_path.rglob("*.md")) all_files = list(vault_path.rglob("*.md"))
# --- ORDNER-FILTER --- # --- GLOBALER ORDNER-FILTER ---
files = [] files = []
ignore_folders = [".trash", ".obsidian", ".sync", "templates", "_system"] ignore_folders = [".trash", ".obsidian", ".sync", "templates", "_system"]
for f in all_files: for f in all_files:
@@ -74,7 +63,7 @@ async def main_async(args):
except Exception: pass except Exception: pass
# ========================================================================= # =========================================================================
# PHASE 1: Batch-Import (Explicit Edges only) # PHASE 1: Batch-Import (Notes & Explicit Edges)
# ========================================================================= # =========================================================================
stats = {"processed": 0, "skipped": 0, "errors": 0} stats = {"processed": 0, "skipped": 0, "errors": 0}
sem = asyncio.Semaphore(5) sem = asyncio.Semaphore(5)
@@ -82,6 +71,7 @@ async def main_async(args):
async def process_with_limit(f_path): async def process_with_limit(f_path):
async with sem: async with sem:
try: try:
# Nutzt process_file (v3.3.7)
return await service.process_file( return await service.process_file(
file_path=str(f_path), vault_root=str(vault_path), file_path=str(f_path), vault_root=str(vault_path),
force_replace=args.force, apply=args.apply, purge_before=True force_replace=args.force, apply=args.apply, purge_before=True
@@ -101,15 +91,15 @@ async def main_async(args):
else: stats["skipped"] += 1 else: stats["skipped"] += 1
# ========================================================================= # =========================================================================
# PHASE 2: Global Symmetry Injection # PHASE 2: Global Symmetry Injection (Nach Abschluss aller Batches)
# ========================================================================= # =========================================================================
if args.apply: if args.apply:
logger.info(f"🔄 [Phase 2] Starting global symmetry injection...") logger.info(f"🔄 [Phase 2] Starting global symmetry injection for the entire vault...")
sym_res = await service.commit_vault_symmetries() sym_res = await service.commit_vault_symmetries()
if sym_res.get("status") == "success": if sym_res.get("status") == "success":
logger.info(f"Added {sym_res.get('added', 0)} protected symmetry edges.") logger.info(f"Finished global symmetry injection. Added: {sym_res.get('added', 0)}")
logger.info(f"Done. Final Stats: {stats}") logger.info(f"Final Stats: {stats}")
def main(): def main():
load_dotenv() load_dotenv()