Refactor ingestion_db.py and ingestion_processor.py: Enhance documentation for clarity, improve symmetry injection logic, and refine artifact purging process. Update versioning to 3.3.5 to reflect changes in functionality and maintainability, ensuring robust handling of explicit edges and authority checks.
This commit is contained in:
parent
29e334625e
commit
3f528f2184
|
|
@ -14,7 +14,7 @@ from app.core.database import collection_names
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
def fetch_note_payload(client: QdrantClient, prefix: str, note_id: str) -> Optional[dict]:
|
def fetch_note_payload(client: QdrantClient, prefix: str, note_id: str) -> Optional[dict]:
|
||||||
"""Holt die Metadaten einer Note aus Qdrant."""
|
"""Holt die Metadaten einer Note aus Qdrant via Scroll."""
|
||||||
notes_col, _, _ = collection_names(prefix)
|
notes_col, _, _ = collection_names(prefix)
|
||||||
try:
|
try:
|
||||||
f = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))])
|
f = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))])
|
||||||
|
|
@ -25,7 +25,7 @@ def fetch_note_payload(client: QdrantClient, prefix: str, note_id: str) -> Optio
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def artifacts_missing(client: QdrantClient, prefix: str, note_id: str) -> Tuple[bool, bool]:
|
def artifacts_missing(client: QdrantClient, prefix: str, note_id: str) -> Tuple[bool, bool]:
|
||||||
"""Prüft auf vorhandene Chunks und Edges."""
|
"""Prüft Qdrant aktiv auf vorhandene Chunks und Edges für eine Note."""
|
||||||
_, chunks_col, edges_col = collection_names(prefix)
|
_, chunks_col, edges_col = collection_names(prefix)
|
||||||
try:
|
try:
|
||||||
f = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))])
|
f = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))])
|
||||||
|
|
@ -38,12 +38,11 @@ def artifacts_missing(client: QdrantClient, prefix: str, note_id: str) -> Tuple[
|
||||||
|
|
||||||
def is_explicit_edge_present(client: QdrantClient, prefix: str, edge_id: str) -> bool:
|
def is_explicit_edge_present(client: QdrantClient, prefix: str, edge_id: str) -> bool:
|
||||||
"""
|
"""
|
||||||
WP-24c: Prüft, ob eine Kante mit der gegebenen ID bereits als 'explizit' existiert.
|
WP-24c: Prüft via Point-ID, ob bereits eine explizite Kante existiert.
|
||||||
Verhindert das Überschreiben von manuellem Wissen durch Symmetrie-Kanten.
|
Verhindert das Überschreiben von manuellem Wissen durch Symmetrien.
|
||||||
"""
|
"""
|
||||||
_, _, edges_col = collection_names(prefix)
|
_, _, edges_col = collection_names(prefix)
|
||||||
try:
|
try:
|
||||||
# retrieve ist der schnellste Weg, um einen Punkt via ID zu laden
|
|
||||||
res = client.retrieve(collection_name=edges_col, ids=[edge_id], with_payload=True)
|
res = client.retrieve(collection_name=edges_col, ids=[edge_id], with_payload=True)
|
||||||
if res and not res[0].payload.get("virtual", False):
|
if res and not res[0].payload.get("virtual", False):
|
||||||
return True
|
return True
|
||||||
|
|
@ -52,13 +51,12 @@ def is_explicit_edge_present(client: QdrantClient, prefix: str, edge_id: str) ->
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def purge_artifacts(client: QdrantClient, prefix: str, note_id: str):
|
def purge_artifacts(client: QdrantClient, prefix: str, note_id: str):
|
||||||
"""Löscht Artefakte basierend auf ihrer Herkunft (Origin)."""
|
"""Löscht Artefakte basierend auf ihrer Herkunft (Origin-Purge)."""
|
||||||
_, chunks_col, edges_col = collection_names(prefix)
|
_, chunks_col, edges_col = collection_names(prefix)
|
||||||
try:
|
try:
|
||||||
chunks_filter = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))])
|
chunks_filter = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))])
|
||||||
client.delete(collection_name=chunks_col, points_selector=rest.FilterSelector(filter=chunks_filter))
|
client.delete(collection_name=chunks_col, points_selector=rest.FilterSelector(filter=chunks_filter))
|
||||||
|
|
||||||
# Origin-basiertes Löschen schützt fremde inverse Kanten
|
|
||||||
edges_filter = rest.Filter(must=[rest.FieldCondition(key="origin_note_id", match=rest.MatchValue(value=note_id))])
|
edges_filter = rest.Filter(must=[rest.FieldCondition(key="origin_note_id", match=rest.MatchValue(value=note_id))])
|
||||||
client.delete(collection_name=edges_col, points_selector=rest.FilterSelector(filter=edges_filter))
|
client.delete(collection_name=edges_col, points_selector=rest.FilterSelector(filter=edges_filter))
|
||||||
logger.info(f"🧹 [PURGE] Global artifacts owned by '{note_id}' cleared.")
|
logger.info(f"🧹 [PURGE] Global artifacts owned by '{note_id}' cleared.")
|
||||||
|
|
|
||||||
|
|
@ -4,10 +4,9 @@ DESCRIPTION: Der zentrale IngestionService (Orchestrator).
|
||||||
WP-24c: Integration der Symmetrie-Logik (Automatische inverse Kanten).
|
WP-24c: Integration der Symmetrie-Logik (Automatische inverse Kanten).
|
||||||
WP-25a: Integration der Mixture of Experts (MoE) Architektur.
|
WP-25a: Integration der Mixture of Experts (MoE) Architektur.
|
||||||
WP-15b: Two-Pass Workflow mit globalem Kontext-Cache.
|
WP-15b: Two-Pass Workflow mit globalem Kontext-Cache.
|
||||||
WP-20/22: Cloud-Resilienz und Content-Lifecycle integriert.
|
AUDIT v3.3.5: 2-Phasen-Strategie (Phase 2 erst nach allen Batches).
|
||||||
AUDIT v3.3.2: 2-Phasen-Schreibstrategie & API-Kompatibilitäts Fix.
|
API-Fix für Dictionary-Rückgabe. Vollständiger Umfang.
|
||||||
Garantiert Datenhoheit expliziter Kanten.
|
VERSION: 3.3.5 (WP-24c: Global Symmetry Commitment)
|
||||||
VERSION: 3.3.2 (WP-24c: Authority-First Batch Orchestration)
|
|
||||||
STATUS: Active
|
STATUS: Active
|
||||||
"""
|
"""
|
||||||
import logging
|
import logging
|
||||||
|
|
@ -22,7 +21,7 @@ from app.core.parser import (
|
||||||
validate_required_frontmatter, NoteContext
|
validate_required_frontmatter, NoteContext
|
||||||
)
|
)
|
||||||
from app.core.chunking import assemble_chunks
|
from app.core.chunking import assemble_chunks
|
||||||
# WP-24c: Import für die deterministische ID-Vorabberechnung
|
# WP-24c: Import für die deterministische UUID-Vorabberechnung
|
||||||
from app.core.graph.graph_utils import _mk_edge_id
|
from app.core.graph.graph_utils import _mk_edge_id
|
||||||
|
|
||||||
# Datenbank-Ebene (Modularisierte database-Infrastruktur)
|
# Datenbank-Ebene (Modularisierte database-Infrastruktur)
|
||||||
|
|
@ -52,12 +51,11 @@ logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
class IngestionService:
|
class IngestionService:
|
||||||
def __init__(self, collection_prefix: str = None):
|
def __init__(self, collection_prefix: str = None):
|
||||||
"""Initialisiert den Service und nutzt die neue database-Infrastruktur."""
|
"""Initialisiert den Service und bereinigt das Logging."""
|
||||||
from app.config import get_settings
|
from app.config import get_settings
|
||||||
self.settings = get_settings()
|
self.settings = get_settings()
|
||||||
|
|
||||||
# --- LOGGING CLEANUP (Business Focus) ---
|
# --- LOGGING CLEANUP (Business Focus) ---
|
||||||
# Unterdrückt Bibliotheks-Lärm in Konsole und Datei (via tee)
|
|
||||||
logging.getLogger("httpx").setLevel(logging.WARNING)
|
logging.getLogger("httpx").setLevel(logging.WARNING)
|
||||||
logging.getLogger("httpcore").setLevel(logging.WARNING)
|
logging.getLogger("httpcore").setLevel(logging.WARNING)
|
||||||
logging.getLogger("qdrant_client").setLevel(logging.WARNING)
|
logging.getLogger("qdrant_client").setLevel(logging.WARNING)
|
||||||
|
|
@ -65,7 +63,6 @@ class IngestionService:
|
||||||
|
|
||||||
self.prefix = collection_prefix or self.settings.COLLECTION_PREFIX
|
self.prefix = collection_prefix or self.settings.COLLECTION_PREFIX
|
||||||
self.cfg = QdrantConfig.from_env()
|
self.cfg = QdrantConfig.from_env()
|
||||||
# Synchronisierung der Konfiguration mit dem Instanz-Präfix
|
|
||||||
self.cfg.prefix = self.prefix
|
self.cfg.prefix = self.prefix
|
||||||
self.client = get_client(self.cfg)
|
self.client = get_client(self.cfg)
|
||||||
|
|
||||||
|
|
@ -73,58 +70,44 @@ class IngestionService:
|
||||||
self.embedder = EmbeddingsClient()
|
self.embedder = EmbeddingsClient()
|
||||||
self.llm = LLMService()
|
self.llm = LLMService()
|
||||||
|
|
||||||
# WP-25a: Auflösung der Dimension über das Embedding-Profil (MoE)
|
|
||||||
embed_cfg = self.llm.profiles.get("embedding_expert", {})
|
embed_cfg = self.llm.profiles.get("embedding_expert", {})
|
||||||
self.dim = embed_cfg.get("dimensions") or self.settings.VECTOR_SIZE
|
self.dim = embed_cfg.get("dimensions") or self.settings.VECTOR_SIZE
|
||||||
|
|
||||||
# Festlegen, welcher Hash für die Change-Detection maßgeblich ist
|
|
||||||
self.active_hash_mode = self.settings.CHANGE_DETECTION_MODE
|
self.active_hash_mode = self.settings.CHANGE_DETECTION_MODE
|
||||||
|
|
||||||
# WP-15b: Kontext-Gedächtnis für ID-Auflösung
|
# WP-15b: Kontext-Gedächtnis für ID-Auflösung
|
||||||
self.batch_cache: Dict[str, NoteContext] = {}
|
self.batch_cache: Dict[str, NoteContext] = {}
|
||||||
|
|
||||||
# WP-24c: Puffer für Phase 2 (Symmetrie-Injektion nach Persistierung)
|
# WP-24c: Puffer für Phase 2 (Symmetrie-Injektion am Ende des gesamten Imports)
|
||||||
self.symmetry_buffer: List[Dict[str, Any]] = []
|
self.symmetry_buffer: List[Dict[str, Any]] = []
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Aufruf der modularisierten Schema-Logik
|
|
||||||
ensure_collections(self.client, self.prefix, self.dim)
|
ensure_collections(self.client, self.prefix, self.dim)
|
||||||
ensure_payload_indexes(self.client, self.prefix)
|
ensure_payload_indexes(self.client, self.prefix)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"DB initialization warning: {e}")
|
logger.warning(f"DB initialization warning: {e}")
|
||||||
|
|
||||||
def _is_valid_note_id(self, text: str) -> bool:
|
def _is_valid_note_id(self, text: str) -> bool:
|
||||||
"""
|
"""WP-24c: Verhindert Müll-Kanten zu System-Platzhaltern."""
|
||||||
WP-24c: Prüft Ziel-Strings auf fachliche Validität.
|
if not text or len(text.strip()) < 2: return False
|
||||||
Verhindert Müll-Kanten zu System-Platzhaltern.
|
|
||||||
"""
|
|
||||||
if not text or len(text.strip()) < 2:
|
|
||||||
return False
|
|
||||||
|
|
||||||
# Blacklist für Begriffe, die keine echten Notizen sind
|
|
||||||
blacklisted = {"insight", "event", "source", "task", "project", "person", "concept", "related_to", "referenced_by"}
|
blacklisted = {"insight", "event", "source", "task", "project", "person", "concept", "related_to", "referenced_by"}
|
||||||
if text.lower().strip() in blacklisted:
|
if text.lower().strip() in blacklisted: return False
|
||||||
return False
|
|
||||||
|
|
||||||
# Längere Titel zulassen (z.B. für Hubs), aber keine ganzen Sätze
|
|
||||||
if len(text) > 200: return False
|
if len(text) > 200: return False
|
||||||
return True
|
return True
|
||||||
|
|
||||||
async def run_batch(self, file_paths: List[str], vault_root: str) -> Dict[str, Any]:
|
async def run_batch(self, file_paths: List[str], vault_root: str) -> Dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
WP-15b: Two-Pass Ingestion Workflow mit 2-Phasen-Schreibstrategie.
|
WP-15b: Two-Pass Ingestion Workflow (PHASE 1).
|
||||||
Fix: Gibt Dictionary zurück, um Kompatibilität zum Importer-Script zu wahren.
|
Fix: Gibt Dictionary zurück, um Kompatibilität zum Importer-Script zu wahren.
|
||||||
"""
|
"""
|
||||||
self.batch_cache.clear()
|
self.batch_cache.clear()
|
||||||
self.symmetry_buffer.clear()
|
logger.info(f"--- 🔍 START BATCH (Phase 1) ---")
|
||||||
logger.info(f"--- 🔍 START BATCH IMPORT ({len(file_paths)} Dateien) ---")
|
|
||||||
|
|
||||||
# 1. Schritt: Pre-Scan (Context-Cache füllen)
|
# 1. Pre-Scan (Context-Cache füllen)
|
||||||
for path in file_paths:
|
for path in file_paths:
|
||||||
try:
|
try:
|
||||||
ctx = pre_scan_markdown(path, registry=self.registry)
|
ctx = pre_scan_markdown(path, registry=self.registry)
|
||||||
if ctx:
|
if ctx:
|
||||||
# Look-up Index für Note_IDs und Titel
|
|
||||||
self.batch_cache[ctx.note_id] = ctx
|
self.batch_cache[ctx.note_id] = ctx
|
||||||
self.batch_cache[ctx.title] = ctx
|
self.batch_cache[ctx.title] = ctx
|
||||||
fname = os.path.splitext(os.path.basename(path))[0]
|
fname = os.path.splitext(os.path.basename(path))[0]
|
||||||
|
|
@ -132,8 +115,7 @@ class IngestionService:
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f" ⚠️ Pre-scan fehlgeschlagen für {path}: {e}")
|
logger.warning(f" ⚠️ Pre-scan fehlgeschlagen für {path}: {e}")
|
||||||
|
|
||||||
# 2. Schritt: PROCESSING (PHASE 1: AUTHORITY)
|
# 2. Schritt: PROCESSING (NUR AUTHORITY)
|
||||||
# Verarbeitet alle Dateien und schreibt NUR explizite Kanten in die DB.
|
|
||||||
processed_count = 0
|
processed_count = 0
|
||||||
success_count = 0
|
success_count = 0
|
||||||
for p in file_paths:
|
for p in file_paths:
|
||||||
|
|
@ -142,108 +124,87 @@ class IngestionService:
|
||||||
if res.get("status") == "success":
|
if res.get("status") == "success":
|
||||||
success_count += 1
|
success_count += 1
|
||||||
|
|
||||||
# 3. Schritt: SYMMETRY INJECTION (PHASE 2)
|
logger.info(f"--- ✅ Batch Phase 1 abgeschlossen ({success_count}/{processed_count}) ---")
|
||||||
# Erst jetzt, wo alle manuellen Kanten in Qdrant liegen, schreiben wir die Symmetrien.
|
|
||||||
if self.symmetry_buffer:
|
|
||||||
logger.info(f"🔄 PHASE 2: Validiere {len(self.symmetry_buffer)} Symmetrie-Kanten gegen Live-DB...")
|
|
||||||
final_virtuals = []
|
|
||||||
for v_edge in self.symmetry_buffer:
|
|
||||||
# Eindeutige ID der potenziellen Symmetrie-Kante berechnen
|
|
||||||
v_id = _mk_edge_id(v_edge["kind"], v_edge["note_id"], v_edge["target_id"], v_edge.get("scope", "note"))
|
|
||||||
|
|
||||||
# Nur schreiben, wenn Qdrant sagt: "Keine manuelle Kante für diese ID vorhanden"
|
|
||||||
if not is_explicit_edge_present(self.client, self.prefix, v_id):
|
|
||||||
final_virtuals.append(v_edge)
|
|
||||||
else:
|
|
||||||
logger.debug(f" 🛡️ Symmetrie unterdrückt (Manuelle Kante existiert): {v_id}")
|
|
||||||
|
|
||||||
if final_virtuals:
|
|
||||||
logger.info(f"📤 Schreibe {len(final_virtuals)} geschützte Symmetrie-Kanten.")
|
|
||||||
e_pts = points_for_edges(self.prefix, final_virtuals)[1]
|
|
||||||
upsert_batch(self.client, f"{self.prefix}_edges", e_pts)
|
|
||||||
|
|
||||||
logger.info(f"--- ✅ BATCH IMPORT BEENDET ---")
|
|
||||||
return {
|
return {
|
||||||
"status": "success",
|
"status": "success",
|
||||||
"processed": processed_count,
|
"processed": processed_count,
|
||||||
"success": success_count,
|
"success": success_count,
|
||||||
"virtuals_added": len(self.symmetry_buffer)
|
"buffered_virtuals": len(self.symmetry_buffer)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async def commit_vault_symmetries(self) -> Dict[str, Any]:
|
||||||
|
"""
|
||||||
|
WP-24c: Führt PHASE 2 für den gesamten Vault aus.
|
||||||
|
Wird nach allen run_batch Aufrufen einmalig getriggert.
|
||||||
|
"""
|
||||||
|
if not self.symmetry_buffer:
|
||||||
|
return {"status": "skipped", "reason": "buffer_empty"}
|
||||||
|
|
||||||
|
logger.info(f"🔄 PHASE 2: Validiere {len(self.symmetry_buffer)} Symmetrie-Kanten gegen die Instance-of-Truth...")
|
||||||
|
final_virtuals = []
|
||||||
|
for v_edge in self.symmetry_buffer:
|
||||||
|
# ID der potenziellen Symmetrie berechnen
|
||||||
|
v_id = _mk_edge_id(v_edge["kind"], v_edge["note_id"], v_edge["target_id"], v_edge.get("scope", "note"))
|
||||||
|
|
||||||
|
# Nur schreiben, wenn KEINE manuelle Kante in der DB existiert
|
||||||
|
if not is_explicit_edge_present(self.client, self.prefix, v_id):
|
||||||
|
final_virtuals.append(v_edge)
|
||||||
|
else:
|
||||||
|
logger.debug(f" 🛡️ Schutz: Manuelle Kante verhindert Symmetrie {v_id}")
|
||||||
|
|
||||||
|
added_count = 0
|
||||||
|
if final_virtuals:
|
||||||
|
logger.info(f"📤 Schreibe {len(final_virtuals)} geschützte Symmetrie-Kanten.")
|
||||||
|
e_pts = points_for_edges(self.prefix, final_virtuals)[1]
|
||||||
|
upsert_batch(self.client, f"{self.prefix}_edges", e_pts)
|
||||||
|
added_count = len(final_virtuals)
|
||||||
|
|
||||||
|
self.symmetry_buffer.clear() # Puffer leeren
|
||||||
|
return {"status": "success", "added": added_count}
|
||||||
|
|
||||||
async def process_file(self, file_path: str, vault_root: str, **kwargs) -> Dict[str, Any]:
|
async def process_file(self, file_path: str, vault_root: str, **kwargs) -> Dict[str, Any]:
|
||||||
"""
|
"""Transformiert Datei und befüllt den Symmetry-Buffer."""
|
||||||
Transformiert eine Markdown-Datei.
|
|
||||||
Schreibt Notes/Chunks/Explicit Edges sofort (Phase 1).
|
|
||||||
Befüllt den Symmetrie-Puffer für Phase 2.
|
|
||||||
"""
|
|
||||||
apply = kwargs.get("apply", False)
|
apply = kwargs.get("apply", False)
|
||||||
force_replace = kwargs.get("force_replace", False)
|
force_replace = kwargs.get("force_replace", False)
|
||||||
purge_before = kwargs.get("purge_before", False)
|
purge_before = kwargs.get("purge_before", False)
|
||||||
note_scope_refs = kwargs.get("note_scope_refs", False)
|
|
||||||
hash_source = kwargs.get("hash_source", "parsed")
|
|
||||||
hash_normalize = kwargs.get("hash_normalize", "canonical")
|
|
||||||
|
|
||||||
result = {"path": file_path, "status": "skipped", "changed": False, "error": None}
|
result = {"path": file_path, "status": "skipped", "changed": False, "error": None}
|
||||||
|
|
||||||
# 1. Parse & Lifecycle Gate
|
|
||||||
try:
|
try:
|
||||||
|
# --- ORDNER-FILTER (.trash) ---
|
||||||
|
if any(part.startswith('.') for part in file_path.split(os.sep)):
|
||||||
|
return {**result, "status": "skipped", "reason": "hidden_folder"}
|
||||||
|
|
||||||
|
ingest_cfg = self.registry.get("ingestion_settings", {})
|
||||||
|
ignore_folders = ingest_cfg.get("ignore_folders", [".trash", ".obsidian", "templates"])
|
||||||
|
if any(folder in file_path for folder in ignore_folders):
|
||||||
|
return {**result, "status": "skipped", "reason": "folder_blacklist"}
|
||||||
|
|
||||||
parsed = read_markdown(file_path)
|
parsed = read_markdown(file_path)
|
||||||
if not parsed: return {**result, "error": "Empty file"}
|
if not parsed: return {**result, "error": "Empty file"}
|
||||||
fm = normalize_frontmatter(parsed.frontmatter)
|
fm = normalize_frontmatter(parsed.frontmatter)
|
||||||
validate_required_frontmatter(fm)
|
note_type = resolve_note_type(self.registry, fm.get("type"))
|
||||||
except Exception as e:
|
note_pl = make_note_payload(parsed, vault_root=vault_root, file_path=file_path, types_cfg=self.registry)
|
||||||
return {**result, "error": f"Validation failed: {str(e)}"}
|
note_id = note_pl["note_id"]
|
||||||
|
|
||||||
ingest_cfg = self.registry.get("ingestion_settings", {})
|
logger.info(f"📄 Bearbeite: '{note_id}' (Typ: {note_type})")
|
||||||
ignore_list = ingest_cfg.get("ignore_statuses", ["system", "template", "archive", "hidden"])
|
|
||||||
|
|
||||||
current_status = fm.get("status", "draft").lower().strip()
|
# Change Detection
|
||||||
if current_status in ignore_list:
|
old_payload = None if force_replace else fetch_note_payload(self.client, self.prefix, note_id)
|
||||||
return {**result, "status": "skipped", "reason": "lifecycle_filter"}
|
c_miss, e_miss = artifacts_missing(self.client, self.prefix, note_id)
|
||||||
|
if not (force_replace or not old_payload or c_miss or e_miss):
|
||||||
|
return {**result, "status": "unchanged", "note_id": note_id}
|
||||||
|
|
||||||
# 2. Payload & Change Detection
|
# Deep Processing & MoE
|
||||||
note_type = resolve_note_type(self.registry, fm.get("type"))
|
|
||||||
note_pl = make_note_payload(
|
|
||||||
parsed, vault_root=vault_root, file_path=file_path,
|
|
||||||
hash_source=hash_source, hash_normalize=hash_normalize,
|
|
||||||
types_cfg=self.registry
|
|
||||||
)
|
|
||||||
note_id = note_pl["note_id"]
|
|
||||||
|
|
||||||
logger.info(f"📄 Bearbeite: '{note_id}' (Typ: {note_type})")
|
|
||||||
|
|
||||||
old_payload = None if force_replace else fetch_note_payload(self.client, self.prefix, note_id)
|
|
||||||
check_key = f"{self.active_hash_mode}:{hash_source}:{hash_normalize}"
|
|
||||||
old_hash = (old_payload or {}).get("hashes", {}).get(check_key)
|
|
||||||
new_hash = note_pl.get("hashes", {}).get(check_key)
|
|
||||||
|
|
||||||
c_miss, e_miss = artifacts_missing(self.client, self.prefix, note_id)
|
|
||||||
|
|
||||||
if not (force_replace or not old_payload or old_hash != new_hash or c_miss or e_miss):
|
|
||||||
return {**result, "status": "unchanged", "note_id": note_id}
|
|
||||||
|
|
||||||
if not apply:
|
|
||||||
return {**result, "status": "dry-run", "changed": True, "note_id": note_id}
|
|
||||||
|
|
||||||
# 3. Deep Processing (Chunking, Validation, Embedding)
|
|
||||||
try:
|
|
||||||
body_text = getattr(parsed, "body", "") or ""
|
|
||||||
edge_registry.ensure_latest()
|
|
||||||
profile = note_pl.get("chunk_profile", "sliding_standard")
|
profile = note_pl.get("chunk_profile", "sliding_standard")
|
||||||
|
|
||||||
chunk_cfg = get_chunk_config_by_profile(self.registry, profile, note_type)
|
chunk_cfg = get_chunk_config_by_profile(self.registry, profile, note_type)
|
||||||
enable_smart = chunk_cfg.get("enable_smart_edge_allocation", False)
|
chunks = await assemble_chunks(note_id, getattr(parsed, "body", ""), note_type, config=chunk_cfg)
|
||||||
|
|
||||||
chunks = await assemble_chunks(note_id, body_text, note_type, config=chunk_cfg)
|
|
||||||
|
|
||||||
# --- WP-25a: MoE Semantische Kanten-Validierung ---
|
|
||||||
for ch in chunks:
|
for ch in chunks:
|
||||||
new_pool = []
|
new_pool = []
|
||||||
for cand in getattr(ch, "candidate_pool", []):
|
for cand in getattr(ch, "candidate_pool", []):
|
||||||
if cand.get("provenance") == "global_pool" and enable_smart:
|
if cand.get("provenance") == "global_pool" and chunk_cfg.get("enable_smart_edge_allocation"):
|
||||||
is_valid = await validate_edge_candidate(
|
is_valid = await validate_edge_candidate(ch.text, cand, self.batch_cache, self.llm)
|
||||||
ch.text, cand, self.batch_cache, self.llm, profile_name="ingest_validator"
|
|
||||||
)
|
|
||||||
t_id = cand.get('target_id') or cand.get('note_id') or "Unknown"
|
t_id = cand.get('target_id') or cand.get('note_id') or "Unknown"
|
||||||
logger.info(f" 🧠 [SMART EDGE] {t_id} -> {'✅ OK' if is_valid else '❌ SKIP'}")
|
logger.info(f" 🧠 [SMART EDGE] {t_id} -> {'✅ OK' if is_valid else '❌ SKIP'}")
|
||||||
if is_valid: new_pool.append(cand)
|
if is_valid: new_pool.append(cand)
|
||||||
|
|
@ -254,30 +215,20 @@ class IngestionService:
|
||||||
chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks, file_path=file_path, types_cfg=self.registry)
|
chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks, file_path=file_path, types_cfg=self.registry)
|
||||||
vecs = await self.embedder.embed_documents([c.get("window") or "" for c in chunk_pls]) if chunk_pls else []
|
vecs = await self.embedder.embed_documents([c.get("window") or "" for c in chunk_pls]) if chunk_pls else []
|
||||||
|
|
||||||
# Aggregation aller Kanten
|
# Kanten-Logik (Kanonisierung)
|
||||||
raw_edges = build_edges_for_note(
|
raw_edges = build_edges_for_note(note_id, chunk_pls, note_level_references=note_pl.get("references", []))
|
||||||
note_id, chunk_pls,
|
|
||||||
note_level_references=note_pl.get("references", []),
|
|
||||||
include_note_scope_refs=note_scope_refs
|
|
||||||
)
|
|
||||||
|
|
||||||
# --- WP-24c: Symmetrie-Injektion (Authority Implementation) ---
|
|
||||||
explicit_edges = []
|
explicit_edges = []
|
||||||
for e in raw_edges:
|
for e in raw_edges:
|
||||||
target_raw = e.get("target_id")
|
target_raw = e.get("target_id")
|
||||||
# ID-Resolution über den Context-Cache (Titel -> Note_ID)
|
t_ctx = self.batch_cache.get(target_raw)
|
||||||
target_ctx = self.batch_cache.get(target_raw)
|
target_id = t_ctx.note_id if t_ctx else target_raw
|
||||||
target_id = target_ctx.note_id if target_ctx else target_raw
|
|
||||||
|
|
||||||
if not self._is_valid_note_id(target_id): continue
|
if not self._is_valid_note_id(target_id): continue
|
||||||
|
|
||||||
resolved_kind = edge_registry.resolve(e.get("kind", "related_to"), provenance=e.get("provenance", "explicit"))
|
resolved_kind = edge_registry.resolve(e.get("kind", "related_to"), provenance=e.get("provenance", "explicit"))
|
||||||
|
|
||||||
# Echte physische Kante markieren (Phase 1)
|
# Echte physische Kante markieren (Phase 1)
|
||||||
e.update({
|
e.update({"kind": resolved_kind, "target_id": target_id, "origin_note_id": note_id, "virtual": False, "confidence": 1.0})
|
||||||
"kind": resolved_kind, "target_id": target_id,
|
|
||||||
"origin_note_id": note_id, "virtual": False, "confidence": 1.0
|
|
||||||
})
|
|
||||||
explicit_edges.append(e)
|
explicit_edges.append(e)
|
||||||
|
|
||||||
# Symmetrie-Kandidat für Phase 2 puffern
|
# Symmetrie-Kandidat für Phase 2 puffern
|
||||||
|
|
@ -291,28 +242,19 @@ class IngestionService:
|
||||||
})
|
})
|
||||||
self.symmetry_buffer.append(v_edge)
|
self.symmetry_buffer.append(v_edge)
|
||||||
|
|
||||||
# 4. DB Upsert (Phase 1: Authority)
|
# 4. DB Upsert (Phase 1: Authority Only)
|
||||||
if apply:
|
if apply:
|
||||||
if purge_before and old_payload:
|
if purge_before and old_payload: purge_artifacts(self.client, self.prefix, note_id)
|
||||||
purge_artifacts(self.client, self.prefix, note_id)
|
|
||||||
|
|
||||||
# Speichern der Haupt-Note
|
|
||||||
n_name, n_pts = points_for_note(self.prefix, note_pl, None, self.dim)
|
n_name, n_pts = points_for_note(self.prefix, note_pl, None, self.dim)
|
||||||
upsert_batch(self.client, n_name, n_pts)
|
upsert_batch(self.client, n_name, n_pts)
|
||||||
|
|
||||||
if chunk_pls and vecs:
|
if chunk_pls and vecs:
|
||||||
c_pts = points_for_chunks(self.prefix, chunk_pls, vecs)[1]
|
upsert_batch(self.client, f"{self.prefix}_chunks", points_for_chunks(self.prefix, chunk_pls, vecs)[1])
|
||||||
upsert_batch(self.client, f"{self.prefix}_chunks", c_pts)
|
|
||||||
|
|
||||||
if explicit_edges:
|
if explicit_edges:
|
||||||
e_pts = points_for_edges(self.prefix, explicit_edges)[1]
|
upsert_batch(self.client, f"{self.prefix}_edges", points_for_edges(self.prefix, explicit_edges)[1])
|
||||||
upsert_batch(self.client, f"{self.prefix}_edges", e_pts)
|
|
||||||
|
|
||||||
logger.info(f" ✨ Phase 1 fertig: {len(chunk_pls)} Chunks, {len(explicit_edges)} explizite Kanten.")
|
logger.info(f" ✨ Phase 1 fertig: {len(chunk_pls)} Chunks, {len(explicit_edges)} explizite Kanten.")
|
||||||
return {
|
return {"status": "success", "note_id": note_id, "edges_count": len(explicit_edges)}
|
||||||
"path": file_path, "status": "success", "changed": True, "note_id": note_id,
|
|
||||||
"chunks_count": len(chunk_pls), "edges_count": len(explicit_edges)
|
|
||||||
}
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"❌ Fehler bei {file_path}: {e}", exc_info=True)
|
logger.error(f"❌ Fehler bei {file_path}: {e}", exc_info=True)
|
||||||
return {**result, "error": str(e)}
|
return {**result, "error": str(e)}
|
||||||
|
|
@ -321,7 +263,6 @@ class IngestionService:
|
||||||
"""Erstellt eine Note aus einem Textstream."""
|
"""Erstellt eine Note aus einem Textstream."""
|
||||||
target_path = os.path.join(vault_root, folder, filename)
|
target_path = os.path.join(vault_root, folder, filename)
|
||||||
os.makedirs(os.path.dirname(target_path), exist_ok=True)
|
os.makedirs(os.path.dirname(target_path), exist_ok=True)
|
||||||
with open(target_path, "w", encoding="utf-8") as f:
|
with open(target_path, "w", encoding="utf-8") as f: f.write(markdown_content)
|
||||||
f.write(markdown_content)
|
|
||||||
await asyncio.sleep(0.1)
|
await asyncio.sleep(0.1)
|
||||||
return await self.process_file(file_path=target_path, vault_root=vault_root, apply=True, force_replace=True, purge_before=True)
|
return await self.process_file(file_path=target_path, vault_root=vault_root, apply=True, force_replace=True, purge_before=True)
|
||||||
Loading…
Reference in New Issue
Block a user