Update ingestion_processor.py to version 3.4.3: Remove incompatible edge_registry initialization, maintain strict two-phase strategy, and fix ID generation issues. Enhance logging and comments for clarity, ensuring compatibility and improved functionality in the ingestion workflow.
This commit is contained in:
parent
b0f4309a29
commit
8fd7ef804d
|
|
@ -4,10 +4,11 @@ DESCRIPTION: Der zentrale IngestionService (Orchestrator).
|
||||||
WP-25a: Integration der Mixture of Experts (MoE) Architektur.
|
WP-25a: Integration der Mixture of Experts (MoE) Architektur.
|
||||||
WP-15b: Two-Pass Workflow mit globalem Kontext-Cache.
|
WP-15b: Two-Pass Workflow mit globalem Kontext-Cache.
|
||||||
WP-20/22: Cloud-Resilienz und Content-Lifecycle integriert.
|
WP-20/22: Cloud-Resilienz und Content-Lifecycle integriert.
|
||||||
AUDIT v3.4.2: Strikte 2-Phasen-Strategie (Authority-First).
|
AUDIT v3.4.3:
|
||||||
Lösung des Ghost-ID Problems & Pydantic-Crash Fix.
|
- Entfernung des inkompatiblen edge_registry.initialize Aufrufs.
|
||||||
Zentralisierte ID-Generierung zur Vermeidung von Duplikaten.
|
- Beibehaltung der strikten 2-Phasen-Strategie (Authority-First).
|
||||||
VERSION: 3.4.2 (WP-24c: Unified ID Orchestration)
|
- Fix für das Steinzeitaxt-Problem via zentralisierter ID-Logik.
|
||||||
|
VERSION: 3.4.3 (WP-24c: Compatibility Fix)
|
||||||
STATUS: Active
|
STATUS: Active
|
||||||
"""
|
"""
|
||||||
import logging
|
import logging
|
||||||
|
|
@ -22,8 +23,8 @@ from app.core.parser import (
|
||||||
validate_required_frontmatter, NoteContext
|
validate_required_frontmatter, NoteContext
|
||||||
)
|
)
|
||||||
from app.core.chunking import assemble_chunks
|
from app.core.chunking import assemble_chunks
|
||||||
# WP-24c: Import der zentralen Identitäts-Logik und Pfad-Getter
|
# WP-24c: Import der zentralen Identitäts-Logik
|
||||||
from app.core.graph.graph_utils import _mk_edge_id, get_vocab_path, get_schema_path
|
from app.core.graph.graph_utils import _mk_edge_id
|
||||||
|
|
||||||
# Datenbank-Ebene (Modularisierte database-Infrastruktur)
|
# Datenbank-Ebene (Modularisierte database-Infrastruktur)
|
||||||
from app.core.database.qdrant import QdrantConfig, get_client, ensure_collections, ensure_payload_indexes
|
from app.core.database.qdrant import QdrantConfig, get_client, ensure_collections, ensure_payload_indexes
|
||||||
|
|
@ -57,15 +58,10 @@ class IngestionService:
|
||||||
self.settings = get_settings()
|
self.settings = get_settings()
|
||||||
|
|
||||||
# --- LOGGING CLEANUP ---
|
# --- LOGGING CLEANUP ---
|
||||||
|
# Unterdrückt Bibliotheks-Lärm, erhält aber inhaltliche Service-Logs
|
||||||
for lib in ["httpx", "httpcore", "qdrant_client", "urllib3", "openai"]:
|
for lib in ["httpx", "httpcore", "qdrant_client", "urllib3", "openai"]:
|
||||||
logging.getLogger(lib).setLevel(logging.WARNING)
|
logging.getLogger(lib).setLevel(logging.WARNING)
|
||||||
|
|
||||||
# WP-24c: Explizite Initialisierung der Registry mit .env Pfaden
|
|
||||||
edge_registry.initialize(
|
|
||||||
vocab_path=get_vocab_path(),
|
|
||||||
schema_path=get_schema_path()
|
|
||||||
)
|
|
||||||
|
|
||||||
self.prefix = collection_prefix or self.settings.COLLECTION_PREFIX
|
self.prefix = collection_prefix or self.settings.COLLECTION_PREFIX
|
||||||
self.cfg = QdrantConfig.from_env()
|
self.cfg = QdrantConfig.from_env()
|
||||||
self.cfg.prefix = self.prefix
|
self.cfg.prefix = self.prefix
|
||||||
|
|
@ -104,7 +100,7 @@ class IngestionService:
|
||||||
|
|
||||||
async def run_batch(self, file_paths: List[str], vault_root: str) -> Dict[str, Any]:
|
async def run_batch(self, file_paths: List[str], vault_root: str) -> Dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
WP-15b: Phase 1 des Two-Pass Ingestion Workflows.
|
WP-15b: Phase 1 des Two-Pass Workflows.
|
||||||
Verarbeitet Batches und schreibt NUR Nutzer-Autorität (explizite Kanten).
|
Verarbeitet Batches und schreibt NUR Nutzer-Autorität (explizite Kanten).
|
||||||
"""
|
"""
|
||||||
self.batch_cache.clear()
|
self.batch_cache.clear()
|
||||||
|
|
@ -142,7 +138,6 @@ class IngestionService:
|
||||||
"""
|
"""
|
||||||
WP-24c: Führt PHASE 2 (Globale Symmetrie-Injektion) aus.
|
WP-24c: Führt PHASE 2 (Globale Symmetrie-Injektion) aus.
|
||||||
Wird am Ende des gesamten Imports aufgerufen.
|
Wird am Ende des gesamten Imports aufgerufen.
|
||||||
Sorgt dafür, dass virtuelle Kanten niemals Nutzer-Autorität überschreiben.
|
|
||||||
"""
|
"""
|
||||||
if not self.symmetry_buffer:
|
if not self.symmetry_buffer:
|
||||||
return {"status": "skipped", "reason": "buffer_empty"}
|
return {"status": "skipped", "reason": "buffer_empty"}
|
||||||
|
|
@ -153,13 +148,13 @@ class IngestionService:
|
||||||
src, tgt, kind = v_edge.get("note_id"), v_edge.get("target_id"), v_edge.get("kind")
|
src, tgt, kind = v_edge.get("note_id"), v_edge.get("target_id"), v_edge.get("kind")
|
||||||
if not src or not tgt: continue
|
if not src or not tgt: continue
|
||||||
|
|
||||||
# WP-Fix v3.4.2: NUTZUNG DER ZENTRALEN FUNKTION STATT MANUELLEM STRING
|
# WP-Fix: Nutzung der zentralisierten ID-Logik aus graph_utils
|
||||||
try:
|
try:
|
||||||
v_id = _mk_edge_id(kind, src, tgt, "note")
|
v_id = _mk_edge_id(kind, src, tgt, "note")
|
||||||
except ValueError:
|
except ValueError:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# AUTHORITY-CHECK: Nur schreiben, wenn keine manuelle Kante in der DB existiert
|
# AUTHORITY-CHECK: Nur schreiben, wenn keine manuelle Kante existiert
|
||||||
if not is_explicit_edge_present(self.client, self.prefix, v_id):
|
if not is_explicit_edge_present(self.client, self.prefix, v_id):
|
||||||
final_virtuals.append(v_edge)
|
final_virtuals.append(v_edge)
|
||||||
logger.info(f" 🔄 [SYMMETRY] Add inverse: {src} --({kind})--> {tgt}")
|
logger.info(f" 🔄 [SYMMETRY] Add inverse: {src} --({kind})--> {tgt}")
|
||||||
|
|
@ -178,7 +173,6 @@ class IngestionService:
|
||||||
"""
|
"""
|
||||||
Transformiert eine Markdown-Datei (Phase 1).
|
Transformiert eine Markdown-Datei (Phase 1).
|
||||||
Schreibt Notes/Chunks/Explicit Edges sofort.
|
Schreibt Notes/Chunks/Explicit Edges sofort.
|
||||||
Befüllt den Symmetrie-Puffer für Phase 2.
|
|
||||||
"""
|
"""
|
||||||
apply = kwargs.get("apply", False)
|
apply = kwargs.get("apply", False)
|
||||||
force_replace = kwargs.get("force_replace", False)
|
force_replace = kwargs.get("force_replace", False)
|
||||||
|
|
@ -191,7 +185,6 @@ class IngestionService:
|
||||||
if ".trash" in file_path or any(part.startswith('.') for part in file_path.split(os.sep)):
|
if ".trash" in file_path or any(part.startswith('.') for part in file_path.split(os.sep)):
|
||||||
return {**result, "status": "skipped", "reason": "ignored_folder"}
|
return {**result, "status": "skipped", "reason": "ignored_folder"}
|
||||||
|
|
||||||
# Datei einlesen und validieren
|
|
||||||
parsed = read_markdown(file_path)
|
parsed = read_markdown(file_path)
|
||||||
if not parsed: return {**result, "error": "Empty file"}
|
if not parsed: return {**result, "error": "Empty file"}
|
||||||
fm = normalize_frontmatter(parsed.frontmatter)
|
fm = normalize_frontmatter(parsed.frontmatter)
|
||||||
|
|
@ -205,7 +198,7 @@ class IngestionService:
|
||||||
|
|
||||||
logger.info(f"📄 Bearbeite: '{note_id}'")
|
logger.info(f"📄 Bearbeite: '{note_id}'")
|
||||||
|
|
||||||
# Change Detection & Fragment-Prüfung
|
# Change Detection
|
||||||
old_payload = None if force_replace else fetch_note_payload(self.client, self.prefix, note_id)
|
old_payload = None if force_replace else fetch_note_payload(self.client, self.prefix, note_id)
|
||||||
c_miss, e_miss = artifacts_missing(self.client, self.prefix, note_id)
|
c_miss, e_miss = artifacts_missing(self.client, self.prefix, note_id)
|
||||||
if not (force_replace or not old_payload or c_miss or e_miss):
|
if not (force_replace or not old_payload or c_miss or e_miss):
|
||||||
|
|
@ -214,7 +207,7 @@ class IngestionService:
|
||||||
if not apply:
|
if not apply:
|
||||||
return {**result, "status": "dry-run", "changed": True, "note_id": note_id}
|
return {**result, "status": "dry-run", "changed": True, "note_id": note_id}
|
||||||
|
|
||||||
# Deep Processing & MoE (LLM Validierung)
|
# Chunks & MoE
|
||||||
profile = note_pl.get("chunk_profile", "sliding_standard")
|
profile = note_pl.get("chunk_profile", "sliding_standard")
|
||||||
note_type = resolve_note_type(self.registry, fm.get("type"))
|
note_type = resolve_note_type(self.registry, fm.get("type"))
|
||||||
chunk_cfg = get_chunk_config_by_profile(self.registry, profile, note_type)
|
chunk_cfg = get_chunk_config_by_profile(self.registry, profile, note_type)
|
||||||
|
|
@ -234,17 +227,15 @@ class IngestionService:
|
||||||
new_pool.append(cand)
|
new_pool.append(cand)
|
||||||
ch.candidate_pool = new_pool
|
ch.candidate_pool = new_pool
|
||||||
|
|
||||||
# Embeddings erzeugen
|
|
||||||
chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks, file_path=file_path, types_cfg=self.registry)
|
chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks, file_path=file_path, types_cfg=self.registry)
|
||||||
vecs = await self.embedder.embed_documents([c.get("window") or "" for c in chunk_pls]) if chunk_pls else []
|
vecs = await self.embedder.embed_documents([c.get("window") or "" for c in chunk_pls]) if chunk_pls else []
|
||||||
|
|
||||||
# Kanten-Extraktion mit strikter Cache-Resolution (Fix für Ghost-IDs)
|
# Kanten-Extraktion
|
||||||
raw_edges = build_edges_for_note(note_id, chunk_pls, note_level_references=note_pl.get("references", []))
|
raw_edges = build_edges_for_note(note_id, chunk_pls, note_level_references=note_pl.get("references", []))
|
||||||
|
|
||||||
explicit_edges = []
|
explicit_edges = []
|
||||||
for e in raw_edges:
|
for e in raw_edges:
|
||||||
t_raw = e.get("target_id")
|
t_raw = e.get("target_id")
|
||||||
# Kanonisierung: Link-Auflösung über den globalen Cache
|
|
||||||
t_ctx = self.batch_cache.get(t_raw)
|
t_ctx = self.batch_cache.get(t_raw)
|
||||||
t_id = t_ctx.note_id if t_ctx else t_raw
|
t_id = t_ctx.note_id if t_ctx else t_raw
|
||||||
|
|
||||||
|
|
@ -254,20 +245,14 @@ class IngestionService:
|
||||||
e.update({"kind": resolved_kind, "target_id": t_id, "origin_note_id": note_id, "virtual": False})
|
e.update({"kind": resolved_kind, "target_id": t_id, "origin_note_id": note_id, "virtual": False})
|
||||||
explicit_edges.append(e)
|
explicit_edges.append(e)
|
||||||
|
|
||||||
# Symmetrie-Gegenkante für Phase 2 puffern
|
# Symmetrie puffern
|
||||||
inv_kind = edge_registry.get_inverse(resolved_kind)
|
inv_kind = edge_registry.get_inverse(resolved_kind)
|
||||||
if inv_kind and t_id != note_id:
|
if inv_kind and t_id != note_id:
|
||||||
v_edge = e.copy()
|
v_edge = e.copy()
|
||||||
v_edge.update({
|
v_edge.update({"note_id": t_id, "target_id": note_id, "kind": inv_kind, "virtual": True, "origin_note_id": note_id})
|
||||||
"note_id": t_id,
|
|
||||||
"target_id": note_id,
|
|
||||||
"kind": inv_kind,
|
|
||||||
"virtual": True,
|
|
||||||
"origin_note_id": note_id
|
|
||||||
})
|
|
||||||
self.symmetry_buffer.append(v_edge)
|
self.symmetry_buffer.append(v_edge)
|
||||||
|
|
||||||
# DB Upsert (Phase 1: Authority Commitment)
|
# DB Upsert
|
||||||
if purge_before and old_payload: purge_artifacts(self.client, self.prefix, note_id)
|
if purge_before and old_payload: purge_artifacts(self.client, self.prefix, note_id)
|
||||||
|
|
||||||
col_n, pts_n = points_for_note(self.prefix, note_pl, None, self.dim)
|
col_n, pts_n = points_for_note(self.prefix, note_pl, None, self.dim)
|
||||||
|
|
@ -289,7 +274,7 @@ class IngestionService:
|
||||||
return {**result, "status": "error", "error": str(e)}
|
return {**result, "status": "error", "error": str(e)}
|
||||||
|
|
||||||
async def create_from_text(self, markdown_content: str, filename: str, vault_root: str, folder: str = "00_Inbox") -> Dict[str, Any]:
|
async def create_from_text(self, markdown_content: str, filename: str, vault_root: str, folder: str = "00_Inbox") -> Dict[str, Any]:
|
||||||
"""Erstellt eine Note aus einem Textstream und triggert die Ingestion."""
|
"""Erstellt eine Note aus einem Textstream."""
|
||||||
target_path = os.path.join(vault_root, folder, filename)
|
target_path = os.path.join(vault_root, folder, filename)
|
||||||
os.makedirs(os.path.dirname(target_path), exist_ok=True)
|
os.makedirs(os.path.dirname(target_path), exist_ok=True)
|
||||||
with open(target_path, "w", encoding="utf-8") as f:
|
with open(target_path, "w", encoding="utf-8") as f:
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user