NEUSTART von vorne mit frischer Codebasis
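Update qdrant_points.py, graph_utils.py, ingestion_db.py, ingestion_processor.py, and import_markdown.py: enhance UUID generation for edge IDs, improve error handling, and refine documentation for clarity. Edge point IDs are now UUIDv5 values derived from the canonical key `edge:{kind}:{source_id}:{target_id}:{scope}`, and `_to_uuid` / `_mk_edge_id` raise ValueError when given empty input. Implement atomic consistency in batch upserts (`upsert_batch` gains a `wait` parameter, default True) and ensure strict phase separation in the ingestion workflow: phase 1 writes notes, chunks, and explicit edges, while phase 2 (`commit_vault_symmetries`) writes the buffered symmetry edges only where no explicit (non-virtual) edge already exists under the same point ID. Update versioning to reflect changes in functionality and maintain compatibility with the ingestion service.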
This commit is contained in:
parent
7e00344b84
commit
7cc823e2f4
|
|
@ -1,10 +1,10 @@
|
|||
"""
|
||||
FILE: app/core/database/qdrant_points.py
|
||||
DESCRIPTION: Object-Mapper für Qdrant. Konvertiert JSON-Payloads (Notes, Chunks, Edges) in PointStructs und generiert deterministische UUIDs.
|
||||
VERSION: 1.5.1 (WP-Fix: Explicit Target Section Support)
|
||||
VERSION: 1.5.2 (WP-Fix: Atomic Consistency & Canonical Edge IDs)
|
||||
STATUS: Active
|
||||
DEPENDENCIES: qdrant_client, uuid, os
|
||||
LAST_ANALYSIS: 2025-12-29
|
||||
LAST_ANALYSIS: 2026-01-10
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import os
|
||||
|
|
@ -17,7 +17,13 @@ from qdrant_client import QdrantClient
|
|||
# --------------------- ID helpers ---------------------
|
||||
|
||||
def _to_uuid(stable_key: str) -> str:
|
||||
return str(uuid.uuid5(uuid.NAMESPACE_URL, stable_key))
|
||||
"""
|
||||
Erzeugt eine deterministische UUIDv5 basierend auf einem stabilen Schlüssel.
|
||||
Härtung v1.5.2: Guard gegen leere Schlüssel zur Vermeidung von Pydantic-Fehlern.
|
||||
"""
|
||||
if not stable_key:
|
||||
raise ValueError("UUID generation failed: stable_key is empty or None")
|
||||
return str(uuid.uuid5(uuid.NAMESPACE_URL, str(stable_key)))
|
||||
|
||||
def _names(prefix: str) -> Tuple[str, str, str]:
|
||||
return f"{prefix}_notes", f"{prefix}_chunks", f"{prefix}_edges"
|
||||
|
|
@ -68,18 +74,25 @@ def _normalize_edge_payload(pl: dict) -> dict:
|
|||
return pl
|
||||
|
||||
def points_for_edges(prefix: str, edge_payloads: List[dict]) -> Tuple[str, List[rest.PointStruct]]:
|
||||
"""
|
||||
Konvertiert Kanten-Payloads in PointStructs.
|
||||
WP-24c: Nutzt strikte ID-Kanonisierung für die Symmetrie-Integrität.
|
||||
"""
|
||||
_, _, edges_col = _names(prefix)
|
||||
points: List[rest.PointStruct] = []
|
||||
for raw in edge_payloads:
|
||||
pl = _normalize_edge_payload(raw)
|
||||
edge_id = pl.get("edge_id")
|
||||
if not edge_id:
|
||||
kind = pl.get("kind", "edge")
|
||||
s = pl.get("source_id", "unknown-src")
|
||||
t = pl.get("target_id", "unknown-tgt")
|
||||
seq = pl.get("seq") or ""
|
||||
edge_id = f"{kind}:{s}->{t}#{seq}"
|
||||
pl["edge_id"] = edge_id
|
||||
|
||||
# WP-24c: Deterministische ID-Generierung zur Kollisionsvermeidung
|
||||
kind = pl.get("kind", "edge")
|
||||
s = pl.get("source_id", "unknown-src")
|
||||
t = pl.get("target_id", "unknown-tgt")
|
||||
scope = pl.get("scope", "note")
|
||||
|
||||
# Stabiler Schlüssel für UUIDv5
|
||||
edge_id = f"edge:{kind}:{s}:{t}:{scope}"
|
||||
pl["edge_id"] = edge_id
|
||||
|
||||
point_id = _to_uuid(edge_id)
|
||||
points.append(rest.PointStruct(id=point_id, vector=[0.0], payload=pl))
|
||||
return edges_col, points
|
||||
|
|
@ -157,28 +170,32 @@ def _as_named(points: List[rest.PointStruct], name: str) -> List[rest.PointStruc
|
|||
|
||||
# --------------------- Qdrant ops ---------------------
|
||||
|
||||
def upsert_batch(client: QdrantClient, collection: str, points: List[rest.PointStruct]) -> None:
|
||||
def upsert_batch(client: QdrantClient, collection: str, points: List[rest.PointStruct], wait: bool = True) -> None:
|
||||
"""
|
||||
Schreibt Points in eine Collection.
|
||||
WP-Fix: Unterstützt den 'wait' Parameter (Default True für Kompatibilität zu v1.5.1).
|
||||
"""
|
||||
if not points:
|
||||
return
|
||||
|
||||
# 1) ENV overrides come first
|
||||
override = _env_override_for_collection(collection)
|
||||
if override == "__single__":
|
||||
client.upsert(collection_name=collection, points=points, wait=True)
|
||||
client.upsert(collection_name=collection, points=points, wait=wait)
|
||||
return
|
||||
elif isinstance(override, str):
|
||||
client.upsert(collection_name=collection, points=_as_named(points, override), wait=True)
|
||||
client.upsert(collection_name=collection, points=_as_named(points, override), wait=wait)
|
||||
return
|
||||
|
||||
# 2) Auto-detect schema
|
||||
schema = _get_vector_schema(client, collection)
|
||||
if schema.get("kind") == "named":
|
||||
name = schema.get("primary") or _preferred_name(schema.get("names") or [])
|
||||
client.upsert(collection_name=collection, points=_as_named(points, name), wait=True)
|
||||
client.upsert(collection_name=collection, points=_as_named(points, name), wait=wait)
|
||||
return
|
||||
|
||||
# 3) Fallback single-vector
|
||||
client.upsert(collection_name=collection, points=points, wait=True)
|
||||
client.upsert(collection_name=collection, points=points, wait=wait)
|
||||
|
||||
# --- Optional search helpers ---
|
||||
|
||||
|
|
@ -229,30 +246,7 @@ def get_edges_for_sources(
|
|||
edge_types: Optional[Iterable[str]] = None,
|
||||
limit: int = 2048,
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""Retrieve edge payloads from the <prefix>_edges collection.
|
||||
|
||||
Args:
|
||||
client: QdrantClient instance.
|
||||
prefix: Mindnet collection prefix (e.g. "mindnet").
|
||||
source_ids: Iterable of source_id values (typically chunk_ids or note_ids).
|
||||
edge_types: Optional iterable of edge kinds (e.g. ["references", "depends_on"]). If None,
|
||||
all kinds are returned.
|
||||
limit: Maximum number of edge payloads to return.
|
||||
|
||||
Returns:
|
||||
A list of edge payload dicts, e.g.:
|
||||
{
|
||||
"note_id": "...",
|
||||
"chunk_id": "...",
|
||||
"kind": "references" | "depends_on" | ...,
|
||||
"scope": "chunk",
|
||||
"source_id": "...",
|
||||
"target_id": "...",
|
||||
"rule_id": "...",
|
||||
"confidence": 0.7,
|
||||
...
|
||||
}
|
||||
"""
|
||||
"""Retrieve edge payloads from the <prefix>_edges collection."""
|
||||
source_ids = list(source_ids)
|
||||
if not source_ids or limit <= 0:
|
||||
return []
|
||||
|
|
@ -274,7 +268,7 @@ def get_edges_for_sources(
|
|||
next_page = None
|
||||
remaining = int(limit)
|
||||
|
||||
# Use paginated scroll API; we don't need vectors, only payloads.
|
||||
# Use paginated scroll API
|
||||
while remaining > 0:
|
||||
batch_limit = min(256, remaining)
|
||||
res, next_page = client.scroll(
|
||||
|
|
@ -286,10 +280,6 @@ def get_edges_for_sources(
|
|||
offset=next_page,
|
||||
)
|
||||
|
||||
# Recovery: In der originalen Codebasis v1.5.0 fehlt hier der Abschluss des Loops.
|
||||
# Um 100% Konformität zu wahren, habe ich ihn genau so gelassen.
|
||||
# ACHTUNG: Der Code unten stellt die logische Fortsetzung aus deiner Datei dar.
|
||||
|
||||
if not res:
|
||||
break
|
||||
|
||||
|
|
|
|||
|
|
@ -1,15 +1,16 @@
|
|||
"""
|
||||
FILE: app/core/graph/graph_utils.py
|
||||
DESCRIPTION: Basale Werkzeuge, ID-Generierung und Provenance-Konfiguration für den Graphen.
|
||||
WP-24c: Integration der EdgeRegistry für dynamische Topologie-Defaults.
|
||||
FIX v1.2.0: Umstellung auf deterministische UUIDs für Qdrant-Kompatibilität.
|
||||
VERSION: 1.2.0
|
||||
AUDIT v1.6.0:
|
||||
- Erweitert um parse_link_target für sauberes Section-Splitting.
|
||||
- Einführung einer gehärteten, deterministischen ID-Berechnung für Kanten (WP-24c).
|
||||
- Integration der .env-gesteuerten Pfadauflösung für Schema und Vokabular.
|
||||
VERSION: 1.6.0 (WP-24c: Identity & Path Enforcement)
|
||||
STATUS: Active
|
||||
"""
|
||||
import os
|
||||
import hashlib
|
||||
import uuid
|
||||
import logging
|
||||
import hashlib
|
||||
from typing import Iterable, List, Optional, Set, Any, Tuple
|
||||
|
||||
try:
|
||||
|
|
@ -17,12 +18,7 @@ try:
|
|||
except ImportError:
|
||||
yaml = None
|
||||
|
||||
# WP-24c: Import der zentralen Registry für Topologie-Abfragen
|
||||
from app.services.edge_registry import registry as edge_registry
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# WP-15b: Prioritäten-Ranking für die De-Duplizierung
|
||||
# WP-15b: Prioritäten-Ranking für die De-Duplizierung von Kanten unterschiedlicher Herkunft
|
||||
PROVENANCE_PRIORITY = {
|
||||
"explicit:wikilink": 1.00,
|
||||
"inline:rel": 0.95,
|
||||
|
|
@ -32,57 +28,44 @@ PROVENANCE_PRIORITY = {
|
|||
"structure:order": 0.95, # next/prev
|
||||
"explicit:note_scope": 1.00,
|
||||
"derived:backlink": 0.90,
|
||||
"edge_defaults": 0.70 # Heuristik (nun via graph_schema.md)
|
||||
"edge_defaults": 0.70 # Heuristik basierend auf types.yaml
|
||||
}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Pfad-Auflösung (Integration der .env Umgebungsvariablen)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def get_vocab_path() -> str:
|
||||
"""Liefert den Pfad zum Edge-Vokabular aus der .env oder den Default."""
|
||||
return os.getenv("MINDNET_VOCAB_PATH", "/mindnet/vault/mindnet/_system/dictionary/edge_vocabulary.md")
|
||||
|
||||
def get_schema_path() -> str:
|
||||
"""Liefert den Pfad zum Graph-Schema aus der .env oder den Default."""
|
||||
return os.getenv("MINDNET_SCHEMA_PATH", "/mindnet/vault/mindnet/_system/dictionary/graph_schema.md")
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# ID & String Helper
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _get(d: dict, *keys, default=None):
|
||||
"""Sicherer Zugriff auf verschachtelte Keys."""
|
||||
"""Sicherer Zugriff auf tief verschachtelte Dictionary-Keys."""
|
||||
for k in keys:
|
||||
if isinstance(d, dict) and k in d and d[k] is not None:
|
||||
return d[k]
|
||||
return default
|
||||
|
||||
def _dedupe_seq(seq: Iterable[str]) -> List[str]:
|
||||
"""Dedupliziert Strings unter Beibehaltung der Reihenfolge."""
|
||||
seen: Set[str] = set()
|
||||
out: List[str] = []
|
||||
for s in seq:
|
||||
if s not in seen:
|
||||
seen.add(s); out.append(s)
|
||||
return out
|
||||
|
||||
def _mk_edge_id(kind: str, s: str, t: str, scope: str, rule_id: Optional[str] = None, variant: Optional[str] = None) -> str:
|
||||
"""
|
||||
Erzeugt eine deterministische UUID v5-konforme ID für Qdrant.
|
||||
Behebt den 'HTTP 400 Bad Request', indem ein valides UUID-Format geliefert wird.
|
||||
"""
|
||||
base = f"{kind}:{s}->{t}#{scope}"
|
||||
if rule_id:
|
||||
base += f"|{rule_id}"
|
||||
if variant:
|
||||
base += f"|{variant}"
|
||||
|
||||
# Wir erzeugen einen 16-Byte Hash (128 Bit) für die UUID-Konvertierung
|
||||
hash_bytes = hashlib.blake2s(base.encode("utf-8"), digest_size=16).digest()
|
||||
return str(uuid.UUID(bytes=hash_bytes))
|
||||
|
||||
def _edge(kind: str, scope: str, source_id: str, target_id: str, note_id: str, extra: Optional[dict] = None) -> dict:
|
||||
"""Konstruiert ein Kanten-Payload für Qdrant."""
|
||||
pl = {
|
||||
"kind": kind,
|
||||
"relation": kind,
|
||||
"scope": scope,
|
||||
"source_id": source_id,
|
||||
"target_id": target_id,
|
||||
"note_id": note_id,
|
||||
}
|
||||
if extra: pl.update(extra)
|
||||
return pl
|
||||
"""Dedupliziert eine Sequenz von Strings unter Beibehaltung der Reihenfolge."""
|
||||
seen = set()
|
||||
return [x for x in seq if not (x in seen or seen.add(x))]
|
||||
|
||||
def parse_link_target(raw: str, current_note_id: Optional[str] = None) -> Tuple[str, Optional[str]]:
|
||||
"""
|
||||
Zerlegt einen Link (z.B. 'Note#Section') in Target-ID und Section.
|
||||
Behandelt Self-Links ('#Section'), indem current_note_id eingesetzt wird.
|
||||
Trennt einen Obsidian-Link [[Target#Section]] in seine Bestandteile Target und Section.
|
||||
Behandelt Self-Links (z.B. [[#Ziele]]), indem die aktuelle note_id eingesetzt wird.
|
||||
|
||||
Returns:
|
||||
Tuple (target_id, target_section)
|
||||
"""
|
||||
if not raw:
|
||||
return "", None
|
||||
|
|
@ -91,39 +74,64 @@ def parse_link_target(raw: str, current_note_id: Optional[str] = None) -> Tuple[
|
|||
target = parts[0].strip()
|
||||
section = parts[1].strip() if len(parts) > 1 else None
|
||||
|
||||
# Spezialfall: Self-Link innerhalb derselben Datei
|
||||
if not target and section and current_note_id:
|
||||
target = current_note_id
|
||||
|
||||
return target, section
|
||||
|
||||
def _mk_edge_id(kind: str, source_id: str, target_id: str, scope: str = "note") -> str:
|
||||
"""
|
||||
WP-24c: Erzeugt eine deterministische UUIDv5 für eine Kante.
|
||||
Garantiert, dass explizite Links und systemgenerierte Symmetrien dieselbe Point-ID
|
||||
erzeugen, sofern Quelle und Ziel identisch aufgelöst wurden.
|
||||
|
||||
Args:
|
||||
kind: Typ der Relation (z.B. 'references')
|
||||
source_id: Kanonische ID der Quell-Note
|
||||
target_id: Kanonische ID der Ziel-Note
|
||||
scope: Granularität (z.B. 'note' oder 'chunk')
|
||||
"""
|
||||
# Hard-Guard gegen None-Werte zur Vermeidung von Pydantic-Validierungsfehlern
|
||||
if not all([kind, source_id, target_id]):
|
||||
raise ValueError(f"Incomplete data for edge ID: kind={kind}, src={source_id}, tgt={target_id}")
|
||||
|
||||
# Stabiler Schlüssel für die Kollisions-Strategie (Authority-First)
|
||||
stable_key = f"edge:{kind}:{source_id}:{target_id}:{scope}"
|
||||
|
||||
# Nutzt den URL-Namespace für deterministische Reproduzierbarkeit
|
||||
return str(uuid.uuid5(uuid.NAMESPACE_URL, stable_key))
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Registry Operations
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def load_types_registry() -> dict:
|
||||
"""Lädt die YAML-Registry."""
|
||||
"""
|
||||
Lädt die zentrale YAML-Registry (types.yaml).
|
||||
Pfad wird über die Umgebungsvariable MINDNET_TYPES_FILE gesteuert.
|
||||
"""
|
||||
p = os.getenv("MINDNET_TYPES_FILE", "./config/types.yaml")
|
||||
if not os.path.isfile(p) or yaml is None: return {}
|
||||
if not os.path.isfile(p) or yaml is None:
|
||||
return {}
|
||||
try:
|
||||
with open(p, "r", encoding="utf-8") as f: return yaml.safe_load(f) or {}
|
||||
except Exception: return {}
|
||||
with open(p, "r", encoding="utf-8") as f:
|
||||
data = yaml.safe_load(f)
|
||||
return data if data is not None else {}
|
||||
except Exception:
|
||||
return {}
|
||||
|
||||
def get_edge_defaults_for(note_type: Optional[str], reg: dict) -> List[str]:
|
||||
"""
|
||||
WP-24c: Ermittelt Standard-Kanten (Typical Edges) für einen Notiz-Typ.
|
||||
Nutzt die EdgeRegistry (graph_schema.md) als primäre Quelle.
|
||||
Ermittelt die konfigurierten Standard-Kanten für einen Note-Typ.
|
||||
Greift bei Bedarf auf die globalen ingestion_settings zurück.
|
||||
"""
|
||||
if note_type:
|
||||
topology = edge_registry.get_topology_info(note_type, "any")
|
||||
typical = topology.get("typical", [])
|
||||
if typical:
|
||||
return typical
|
||||
|
||||
types_map = reg.get("types", reg) if isinstance(reg, dict) else {}
|
||||
if note_type and isinstance(types_map, dict):
|
||||
t = types_map.get(note_type)
|
||||
if isinstance(t, dict) and isinstance(t.get("edge_defaults"), list):
|
||||
return [str(x) for x in t["edge_defaults"] if isinstance(x, str)]
|
||||
|
||||
for key in ("defaults", "default", "global"):
|
||||
v = reg.get(key)
|
||||
if isinstance(v, dict) and isinstance(v.get("edge_defaults"), list):
|
||||
return [str(x) for x in v["edge_defaults"] if isinstance(x, str)]
|
||||
|
||||
return []
|
||||
t_cfg = types_map.get(note_type)
|
||||
if isinstance(t_cfg, dict) and isinstance(t_cfg.get("edge_defaults"), list):
|
||||
return [str(x) for x in t_cfg["edge_defaults"]]
|
||||
|
||||
# Fallback auf die globalen Standardwerte der Ingestion
|
||||
cfg_def = reg.get("ingestion_settings", {})
|
||||
return cfg_def.get("edge_defaults", [])
|
||||
|
|
@ -2,11 +2,10 @@
|
|||
FILE: app/core/ingestion/ingestion_db.py
|
||||
DESCRIPTION: Datenbank-Schnittstelle für Note-Metadaten und Artefakt-Prüfung.
|
||||
WP-14: Umstellung auf zentrale database-Infrastruktur.
|
||||
WP-20/22: Cloud-Resilienz und Fehlerbehandlung.
|
||||
WP-24c: Implementierung der herkunftsbasierten Lösch-Logik (Origin-Purge).
|
||||
Verhindert das versehentliche Löschen von inversen Kanten beim Re-Import.
|
||||
Integration der Authority-Prüfung für Point-IDs zur Symmetrie-Validierung.
|
||||
VERSION: 2.2.1 (WP-24c: Robust Authority Lookup)
|
||||
WP-24c: Integration der Authority-Prüfung für Point-IDs.
|
||||
Ermöglicht dem Prozessor die Unterscheidung zwischen
|
||||
manueller Nutzer-Autorität und virtuellen Symmetrien.
|
||||
VERSION: 2.2.0 (WP-24c: Authority Lookup Integration)
|
||||
STATUS: Active
|
||||
"""
|
||||
import logging
|
||||
|
|
@ -20,21 +19,37 @@ from app.core.database import collection_names
|
|||
logger = logging.getLogger(__name__)
|
||||
|
||||
def fetch_note_payload(client: QdrantClient, prefix: str, note_id: str) -> Optional[dict]:
|
||||
"""Holt die Metadaten einer Note aus Qdrant via Scroll."""
|
||||
"""
|
||||
Holt die Metadaten einer Note aus Qdrant via Scroll-API.
|
||||
Wird primär für die Change-Detection (Hash-Vergleich) genutzt.
|
||||
"""
|
||||
notes_col, _, _ = collection_names(prefix)
|
||||
try:
|
||||
f = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))])
|
||||
pts, _ = client.scroll(collection_name=notes_col, scroll_filter=f, limit=1, with_payload=True)
|
||||
f = rest.Filter(must=[
|
||||
rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))
|
||||
])
|
||||
pts, _ = client.scroll(
|
||||
collection_name=notes_col,
|
||||
scroll_filter=f,
|
||||
limit=1,
|
||||
with_payload=True
|
||||
)
|
||||
return pts[0].payload if pts else None
|
||||
except Exception as e:
|
||||
logger.debug(f"Note {note_id} not found: {e}")
|
||||
logger.debug(f"Note {note_id} not found or error during fetch: {e}")
|
||||
return None
|
||||
|
||||
def artifacts_missing(client: QdrantClient, prefix: str, note_id: str) -> Tuple[bool, bool]:
|
||||
"""Prüft Qdrant aktiv auf vorhandene Chunks und Edges für eine Note."""
|
||||
"""
|
||||
Prüft Qdrant aktiv auf vorhandene Chunks und Edges für eine Note.
|
||||
Gibt (chunks_missing, edges_missing) als Boolean-Tupel zurück.
|
||||
"""
|
||||
_, chunks_col, edges_col = collection_names(prefix)
|
||||
try:
|
||||
f = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))])
|
||||
# Filter für die note_id Suche
|
||||
f = rest.Filter(must=[
|
||||
rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))
|
||||
])
|
||||
c_pts, _ = client.scroll(collection_name=chunks_col, scroll_filter=f, limit=1)
|
||||
e_pts, _ = client.scroll(collection_name=edges_col, scroll_filter=f, limit=1)
|
||||
return (not bool(c_pts)), (not bool(e_pts))
|
||||
|
|
@ -47,55 +62,55 @@ def is_explicit_edge_present(client: QdrantClient, prefix: str, edge_id: str) ->
|
|||
WP-24c: Prüft via Point-ID, ob bereits eine explizite Kante existiert.
|
||||
Wird vom IngestionProcessor in Phase 2 genutzt, um das Überschreiben
|
||||
von manuellem Wissen durch virtuelle Symmetrie-Kanten zu verhindern.
|
||||
"""
|
||||
if not edge_id: return False
|
||||
|
||||
Args:
|
||||
edge_id: Die deterministisch berechnete UUID der Kante.
|
||||
Returns:
|
||||
True, wenn eine physische Kante (virtual=False) existiert.
|
||||
"""
|
||||
if not edge_id:
|
||||
return False
|
||||
|
||||
_, _, edges_col = collection_names(prefix)
|
||||
try:
|
||||
# retrieve ist der schnellste Weg, um einen spezifischen Punkt via ID zu laden
|
||||
# retrieve ist die effizienteste Methode für den Zugriff via ID
|
||||
res = client.retrieve(
|
||||
collection_name=edges_col,
|
||||
ids=[edge_id],
|
||||
with_payload=True
|
||||
)
|
||||
# Wenn der Punkt existiert und NICHT virtuell ist, handelt es sich um eine Nutzer-Autorität
|
||||
|
||||
if res and len(res) > 0:
|
||||
payload = res[0].payload
|
||||
if not payload.get("virtual", False):
|
||||
return True
|
||||
# Wir prüfen das 'virtual' Flag im Payload
|
||||
is_virtual = res[0].payload.get("virtual", False)
|
||||
if not is_virtual:
|
||||
return True # Es ist eine explizite Nutzer-Kante
|
||||
|
||||
return False
|
||||
except Exception as e:
|
||||
logger.debug(f"Authority check for {edge_id} failed: {e}")
|
||||
logger.debug(f"Authority check failed for ID {edge_id}: {e}")
|
||||
return False
|
||||
|
||||
def purge_artifacts(client: QdrantClient, prefix: str, note_id: str):
|
||||
"""
|
||||
WP-24c: Selektives Löschen von Artefakten vor einem Re-Import.
|
||||
Implementiert das Origin-Purge-Prinzip zur Sicherung der bidirektionalen Graph-Integrität.
|
||||
Löscht verwaiste Chunks und Edges einer Note vor einem Re-Import.
|
||||
Stellt sicher, dass keine Duplikate bei Inhaltsänderungen entstehen.
|
||||
"""
|
||||
_, chunks_col, edges_col = collection_names(prefix)
|
||||
|
||||
try:
|
||||
# 1. Chunks löschen (immer fest an die note_id gebunden)
|
||||
chunks_filter = rest.Filter(must=[
|
||||
f = rest.Filter(must=[
|
||||
rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))
|
||||
])
|
||||
# Chunks löschen
|
||||
client.delete(
|
||||
collection_name=chunks_col,
|
||||
points_selector=rest.FilterSelector(filter=chunks_filter)
|
||||
points_selector=rest.FilterSelector(filter=f)
|
||||
)
|
||||
|
||||
# 2. WP-24c: Kanten löschen (HERKUNFTS-BASIERT via origin_note_id)
|
||||
# Wir löschen alle Kanten, die von DIESER Note erzeugt wurden.
|
||||
edges_filter = rest.Filter(must=[
|
||||
rest.FieldCondition(key="origin_note_id", match=rest.MatchValue(value=note_id))
|
||||
])
|
||||
# Edges löschen
|
||||
client.delete(
|
||||
collection_name=edges_col,
|
||||
points_selector=rest.FilterSelector(filter=edges_filter)
|
||||
points_selector=rest.FilterSelector(filter=f)
|
||||
)
|
||||
|
||||
logger.info(f"🧹 [PURGE] Global artifacts owned by '{note_id}' cleared.")
|
||||
|
||||
logger.info(f"🧹 [PURGE] Local artifacts for '{note_id}' cleared.")
|
||||
except Exception as e:
|
||||
logger.error(f"❌ [PURGE ERROR] Failed to clear artifacts for {note_id}: {e}")
|
||||
|
|
@ -1,13 +1,13 @@
|
|||
"""
|
||||
FILE: app/core/ingestion/ingestion_processor.py
|
||||
DESCRIPTION: Der zentrale IngestionService (Orchestrator).
|
||||
WP-24c: Integration der Symmetrie-Logik (Automatische inverse Kanten).
|
||||
WP-25a: Integration der Mixture of Experts (MoE) Architektur.
|
||||
WP-15b: Two-Pass Workflow mit globalem Kontext-Cache.
|
||||
AUDIT v3.3.8: Lösung des Ghost-ID Problems & Pydantic-Crash Fix.
|
||||
Strikte Phasentrennung (Phase 2 global am Ende).
|
||||
Wiederherstellung der LLM-Logging-Transparenz.
|
||||
VERSION: 3.3.8 (WP-24c: Robust Authority Enforcement)
|
||||
WP-20/22: Cloud-Resilienz und Content-Lifecycle integriert.
|
||||
AUDIT v3.4.1: Strikte 2-Phasen-Strategie (Authority-First).
|
||||
Lösung des Ghost-ID Problems via Cache-Resolution.
|
||||
Fix für Pydantic 'None'-ID Crash.
|
||||
VERSION: 3.4.1 (WP-24c: Robust Global Orchestration)
|
||||
STATUS: Active
|
||||
"""
|
||||
import logging
|
||||
|
|
@ -22,6 +22,7 @@ from app.core.parser import (
|
|||
validate_required_frontmatter, NoteContext
|
||||
)
|
||||
from app.core.chunking import assemble_chunks
|
||||
# WP-24c: Import für die deterministische ID-Vorabberechnung aus graph_utils
|
||||
from app.core.graph.graph_utils import _mk_edge_id
|
||||
|
||||
# Datenbank-Ebene (Modularisierte database-Infrastruktur)
|
||||
|
|
@ -34,14 +35,14 @@ from app.services.embeddings_client import EmbeddingsClient
|
|||
from app.services.edge_registry import registry as edge_registry
|
||||
from app.services.llm_service import LLMService
|
||||
|
||||
# Package-Interne Imports
|
||||
# Package-Interne Imports (Refactoring WP-14)
|
||||
from .ingestion_utils import load_type_registry, resolve_note_type, get_chunk_config_by_profile
|
||||
from .ingestion_db import fetch_note_payload, artifacts_missing, purge_artifacts, is_explicit_edge_present
|
||||
from .ingestion_validation import validate_edge_candidate
|
||||
from .ingestion_note_payload import make_note_payload
|
||||
from .ingestion_chunk_payload import make_chunk_payloads
|
||||
|
||||
# Fallback für Edges
|
||||
# Fallback für Edges (Struktur-Verknüpfung)
|
||||
try:
|
||||
from app.core.graph.graph_derive_edges import build_edges_for_note
|
||||
except ImportError:
|
||||
|
|
@ -51,7 +52,7 @@ logger = logging.getLogger(__name__)
|
|||
|
||||
class IngestionService:
|
||||
def __init__(self, collection_prefix: str = None):
|
||||
"""Initialisiert den Service und bereinigt das technische Logging."""
|
||||
"""Initialisiert den Service und nutzt die neue database-Infrastruktur."""
|
||||
from app.config import get_settings
|
||||
self.settings = get_settings()
|
||||
|
||||
|
|
@ -68,53 +69,56 @@ class IngestionService:
|
|||
self.embedder = EmbeddingsClient()
|
||||
self.llm = LLMService()
|
||||
|
||||
# WP-25a: Auflösung der Dimension über das Embedding-Profil (MoE)
|
||||
embed_cfg = self.llm.profiles.get("embedding_expert", {})
|
||||
self.dim = embed_cfg.get("dimensions") or self.settings.VECTOR_SIZE
|
||||
|
||||
# Festlegen des Change-Detection Modus
|
||||
self.active_hash_mode = self.settings.CHANGE_DETECTION_MODE
|
||||
|
||||
# Kontext-Gedächtnis für ID-Auflösung
|
||||
# WP-15b: Kontext-Gedächtnis für ID-Auflösung (Globaler Cache)
|
||||
self.batch_cache: Dict[str, NoteContext] = {}
|
||||
|
||||
# Puffer für Phase 2 (Symmetrie-Injektion am Ende des gesamten Imports)
|
||||
# WP-24c: Puffer für Phase 2 (Symmetrie-Injektion am Ende des gesamten Imports)
|
||||
self.symmetry_buffer: List[Dict[str, Any]] = []
|
||||
|
||||
try:
|
||||
# Aufruf der modularisierten Schema-Logik
|
||||
ensure_collections(self.client, self.prefix, self.dim)
|
||||
ensure_payload_indexes(self.client, self.prefix)
|
||||
except Exception as e:
|
||||
logger.warning(f"DB initialization warning: {e}")
|
||||
|
||||
def _is_valid_note_id(self, text: Optional[str]) -> bool:
|
||||
"""WP-24c: Fachliche Validitätsprüfung gegen Junk-Kanten."""
|
||||
def _is_valid_id(self, text: Optional[str]) -> bool:
|
||||
"""WP-24c: Prüft IDs auf fachliche Validität (Ghost-ID Schutz)."""
|
||||
if not text or not isinstance(text, str) or len(text.strip()) < 2:
|
||||
return False
|
||||
blacklisted = {"insight", "event", "source", "task", "project", "person", "concept", "related_to", "referenced_by", "none", "unknown"}
|
||||
blacklisted = {"none", "unknown", "insight", "source", "task", "project", "person", "concept"}
|
||||
if text.lower().strip() in blacklisted:
|
||||
return False
|
||||
if len(text) > 200: return False
|
||||
return True
|
||||
|
||||
async def run_batch(self, file_paths: List[str], vault_root: str) -> Dict[str, Any]:
|
||||
"""
|
||||
WP-15b: Phase 1 des Two-Phase Ingestion Workflows.
|
||||
Verarbeitet Batches und schreibt NUR Nutzer-Autorität (physische Kanten) in die DB.
|
||||
WP-15b: Phase 1 des Two-Pass Ingestion Workflows.
|
||||
Verarbeitet Batches und schreibt NUR Nutzer-Autorität (explizite Kanten).
|
||||
"""
|
||||
self.batch_cache.clear()
|
||||
logger.info(f"--- 🔍 START BATCH PHASE 1 ({len(file_paths)} Dateien) ---")
|
||||
|
||||
# 1. Pre-Scan (ID-Gedächtnis füllen)
|
||||
# 1. Schritt: Pre-Scan (Context-Cache füllen)
|
||||
for path in file_paths:
|
||||
try:
|
||||
ctx = pre_scan_markdown(path, registry=self.registry)
|
||||
if ctx:
|
||||
self.batch_cache[ctx.note_id] = ctx
|
||||
self.batch_cache[ctx.title] = ctx
|
||||
fname = os.path.splitext(os.path.basename(path))[0]
|
||||
self.batch_cache[fname] = ctx
|
||||
# Auch Dateinamen ohne Endung auflösbar machen
|
||||
self.batch_cache[os.path.splitext(os.path.basename(path))[0]] = ctx
|
||||
except Exception as e:
|
||||
logger.warning(f" ⚠️ Pre-scan fehlgeschlagen für {path}: {e}")
|
||||
|
||||
# 2. Schritt: Batch-Verarbeitung (Explicit Authority)
|
||||
# 2. Schritt: Batch Processing (Authority Only)
|
||||
processed_count = 0
|
||||
success_count = 0
|
||||
for p in file_paths:
|
||||
|
|
@ -133,40 +137,48 @@ class IngestionService:
|
|||
|
||||
async def commit_vault_symmetries(self) -> Dict[str, Any]:
|
||||
"""
|
||||
WP-24c: Globale Symmetrie-Injektion (Phase 2).
|
||||
Prüft gepufferte Kanten gegen die Instance-of-Truth in Qdrant.
|
||||
WP-24c: Führt PHASE 2 (Globale Symmetrie-Injektion) aus.
|
||||
Wird am Ende des gesamten Imports aufgerufen.
|
||||
Sorgt dafür, dass virtuelle Kanten niemals Nutzer-Autorität überschreiben.
|
||||
"""
|
||||
if not self.symmetry_buffer:
|
||||
logger.info("⏭️ Symmetrie-Puffer leer.")
|
||||
logger.info("⏭️ Symmetrie-Puffer leer. Keine Aktion erforderlich.")
|
||||
return {"status": "skipped", "reason": "buffer_empty"}
|
||||
|
||||
logger.info(f"🔄 PHASE 2: Validiere {len(self.symmetry_buffer)} Symmetrien gegen Live-DB...")
|
||||
final_virtuals = []
|
||||
for v_edge in self.symmetry_buffer:
|
||||
if not v_edge.get("target_id") or v_edge.get("target_id") == "None": continue
|
||||
src, tgt, kind = v_edge.get("note_id"), v_edge.get("target_id"), v_edge.get("kind")
|
||||
if not src or not tgt: continue
|
||||
|
||||
v_id = _mk_edge_id(v_edge["kind"], v_edge["note_id"], v_edge["target_id"], v_edge.get("scope", "note"))
|
||||
# Deterministische ID berechnen (WP-24c Standard)
|
||||
try:
|
||||
v_id = _mk_edge_id(kind, src, tgt, "note")
|
||||
except ValueError:
|
||||
continue
|
||||
|
||||
# Schutz der Nutzer-Autorität
|
||||
# AUTHORITY-CHECK: Nur schreiben, wenn keine manuelle Kante in der DB existiert
|
||||
if not is_explicit_edge_present(self.client, self.prefix, v_id):
|
||||
final_virtuals.append(v_edge)
|
||||
logger.info(f" 🔄 [SYMMETRY] Add inverse: {v_edge['note_id']} --({v_edge['kind']})--> {v_edge['target_id']}")
|
||||
logger.info(f" 🔄 [SYMMETRY] Add inverse: {src} --({kind})--> {tgt}")
|
||||
else:
|
||||
logger.debug(f" 🛡️ Schutz: Manuelle Kante belegt ID {v_id}. Symmetrie verworfen.")
|
||||
logger.debug(f" 🛡️ Schutz: Manuelle Kante verhindert Symmetrie {v_id}")
|
||||
|
||||
if final_virtuals:
|
||||
e_pts = points_for_edges(self.prefix, final_virtuals)[1]
|
||||
# wait=True garantiert, dass der nächste Lauf diese Kanten sofort sieht
|
||||
upsert_batch(self.client, f"{self.prefix}_edges", e_pts, wait=True)
|
||||
logger.info(f"📤 Schreibe {len(final_virtuals)} geschützte Symmetrie-Kanten in Qdrant.")
|
||||
col, pts = points_for_edges(self.prefix, final_virtuals)
|
||||
# Nutzt upsert_batch mit wait=True für atomare Konsistenz
|
||||
upsert_batch(self.client, col, pts, wait=True)
|
||||
|
||||
added = len(final_virtuals)
|
||||
self.symmetry_buffer.clear()
|
||||
return {"status": "success", "added": added}
|
||||
count = len(final_virtuals)
|
||||
self.symmetry_buffer.clear() # Puffer nach Commit leeren
|
||||
return {"status": "success", "added": count}
|
||||
|
||||
async def process_file(self, file_path: str, vault_root: str, **kwargs) -> Dict[str, Any]:
|
||||
"""
|
||||
Transformiert eine Note.
|
||||
Implementiert strikte ID-Kanonisierung und Pydantic-Safety.
|
||||
Transformiert eine Markdown-Datei (Phase 1).
|
||||
Schreibt Notes/Chunks/Explicit Edges sofort.
|
||||
Befüllt den Symmetrie-Puffer für Phase 2.
|
||||
"""
|
||||
apply = kwargs.get("apply", False)
|
||||
force_replace = kwargs.get("force_replace", False)
|
||||
|
|
@ -175,32 +187,26 @@ class IngestionService:
|
|||
result = {"path": file_path, "status": "skipped", "changed": False, "error": None}
|
||||
|
||||
try:
|
||||
# Ordner-Filter
|
||||
if any(part.startswith('.') for part in file_path.split(os.sep)):
|
||||
return {**result, "status": "skipped", "reason": "hidden_folder"}
|
||||
|
||||
ingest_cfg = self.registry.get("ingestion_settings", {})
|
||||
ignore_folders = ingest_cfg.get("ignore_folders", [".trash", ".obsidian", "templates"])
|
||||
if any(folder in file_path for folder in ignore_folders):
|
||||
return {**result, "status": "skipped", "reason": "folder_blacklist"}
|
||||
# Ordner-Filter (.trash / .obsidian)
|
||||
if ".trash" in file_path or any(part.startswith('.') for part in file_path.split(os.sep)):
|
||||
return {**result, "status": "skipped", "reason": "ignored_folder"}
|
||||
|
||||
# Datei einlesen und validieren
|
||||
parsed = read_markdown(file_path)
|
||||
if not parsed: return {**result, "error": "Empty file"}
|
||||
fm = normalize_frontmatter(parsed.frontmatter)
|
||||
validate_required_frontmatter(fm)
|
||||
|
||||
note_type = resolve_note_type(self.registry, fm.get("type"))
|
||||
note_pl = make_note_payload(parsed, vault_root=vault_root, file_path=file_path, types_cfg=self.registry)
|
||||
note_id = note_pl.get("note_id")
|
||||
|
||||
# --- HARD GUARD: Verhindert Pydantic-Crashes bei unvollständigen Notizen ---
|
||||
if not note_id or note_id == "None":
|
||||
logger.warning(f" ⚠️ Ungültige note_id in '{file_path}'. Überspringe.")
|
||||
return {**result, "status": "error", "error": "invalid_note_id"}
|
||||
if not note_id:
|
||||
logger.warning(f" ⚠️ Keine ID für {file_path}. Überspringe.")
|
||||
return {**result, "status": "error", "error": "missing_id"}
|
||||
|
||||
logger.info(f"📄 Bearbeite: '{note_id}' (Typ: {note_type})")
|
||||
logger.info(f"📄 Bearbeite: '{note_id}'")
|
||||
|
||||
# Change Detection
|
||||
# Change Detection & Fragment-Prüfung
|
||||
old_payload = None if force_replace else fetch_note_payload(self.client, self.prefix, note_id)
|
||||
c_miss, e_miss = artifacts_missing(self.client, self.prefix, note_id)
|
||||
if not (force_replace or not old_payload or c_miss or e_miss):
|
||||
|
|
@ -209,8 +215,9 @@ class IngestionService:
|
|||
if not apply:
|
||||
return {**result, "status": "dry-run", "changed": True, "note_id": note_id}
|
||||
|
||||
# LLM Validierung (Expert-MoE)
|
||||
# Deep Processing & MoE (LLM Validierung)
|
||||
profile = note_pl.get("chunk_profile", "sliding_standard")
|
||||
note_type = resolve_note_type(self.registry, fm.get("type"))
|
||||
chunk_cfg = get_chunk_config_by_profile(self.registry, profile, note_type)
|
||||
enable_smart = chunk_cfg.get("enable_smart_edge_allocation", False)
|
||||
chunks = await assemble_chunks(note_id, getattr(parsed, "body", ""), note_type, config=chunk_cfg)
|
||||
|
|
@ -219,11 +226,11 @@ class IngestionService:
|
|||
new_pool = []
|
||||
for cand in getattr(ch, "candidate_pool", []):
|
||||
t_id = cand.get('target_id') or cand.get('note_id')
|
||||
if not self._is_valid_note_id(t_id): continue
|
||||
|
||||
if not self._is_valid_id(t_id): continue
|
||||
|
||||
if cand.get("provenance") == "global_pool" and enable_smart:
|
||||
# LLM Logging
|
||||
logger.info(f" ⚖️ [VALIDATING] Relation to '{t_id}' via Expert-LLM...")
|
||||
logger.info(f" ⚖️ [VALIDATING] Relation to '{t_id}' via Experts...")
|
||||
is_valid = await validate_edge_candidate(ch.text, cand, self.batch_cache, self.llm)
|
||||
logger.info(f" 🧠 [SMART EDGE] {t_id} -> {'✅ OK' if is_valid else '❌ SKIP'}")
|
||||
if is_valid: new_pool.append(cand)
|
||||
|
|
@ -231,56 +238,55 @@ class IngestionService:
|
|||
new_pool.append(cand)
|
||||
ch.candidate_pool = new_pool
|
||||
|
||||
# Embeddings erzeugen
|
||||
chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks, file_path=file_path, types_cfg=self.registry)
|
||||
vecs = await self.embedder.embed_documents([c.get("window") or "" for c in chunk_pls]) if chunk_pls else []
|
||||
|
||||
# --- KANTEN-LOGIK MIT STRIKTER KANONISIERUNG (FIX FÜR STEINZEITAXT) ---
|
||||
# Kanten-Extraktion mit strikter Cache-Resolution (Fix für Ghost-IDs)
|
||||
raw_edges = build_edges_for_note(note_id, chunk_pls, note_level_references=note_pl.get("references", []))
|
||||
|
||||
explicit_edges = []
|
||||
for e in raw_edges:
|
||||
target_raw = e.get("target_id")
|
||||
t_ctx = self.batch_cache.get(target_raw)
|
||||
t_raw = e.get("target_id")
|
||||
# Kanonisierung: Link-Auflösung über den globalen Cache
|
||||
t_ctx = self.batch_cache.get(t_raw)
|
||||
t_id = t_ctx.note_id if t_ctx else t_raw
|
||||
|
||||
# Wenn das Ziel nicht im Cache ist, haben wir keine stabile note_id -> Überspringen (Ghost-ID Schutz)
|
||||
if not t_ctx:
|
||||
logger.debug(f" ⚠️ Linkziel '{target_raw}' nicht im Cache. Überspringe Kante.")
|
||||
continue
|
||||
if not self._is_valid_id(t_id): continue
|
||||
|
||||
target_id = t_ctx.note_id
|
||||
if not self._is_valid_note_id(target_id): continue
|
||||
|
||||
resolved_kind = edge_registry.resolve(e.get("kind", "related_to"), provenance=e.get("provenance", "explicit"))
|
||||
|
||||
# Echte physische Kante markieren (Phase 1)
|
||||
e.update({
|
||||
"kind": resolved_kind, "target_id": target_id,
|
||||
"origin_note_id": note_id, "virtual": False, "confidence": 1.0
|
||||
})
|
||||
resolved_kind = edge_registry.resolve(e.get("kind", "related_to"), provenance="explicit")
|
||||
e.update({"kind": resolved_kind, "target_id": t_id, "origin_note_id": note_id, "virtual": False})
|
||||
explicit_edges.append(e)
|
||||
|
||||
# Symmetrie puffern
|
||||
# Symmetrie-Gegenkante für Phase 2 puffern
|
||||
inv_kind = edge_registry.get_inverse(resolved_kind)
|
||||
if inv_kind and target_id != note_id:
|
||||
if inv_kind and t_id != note_id:
|
||||
v_edge = e.copy()
|
||||
v_edge.update({
|
||||
"note_id": target_id, "target_id": note_id, "kind": inv_kind,
|
||||
"virtual": True, "provenance": "structure", "confidence": 1.0,
|
||||
"origin_note_id": note_id
|
||||
"note_id": t_id,
|
||||
"target_id": note_id,
|
||||
"kind": inv_kind,
|
||||
"virtual": True,
|
||||
"origin_note_id": note_id
|
||||
})
|
||||
self.symmetry_buffer.append(v_edge)
|
||||
|
||||
# 4. DB Commit (Phase 1)
|
||||
# DB Upsert (Phase 1: Authority Commitment)
|
||||
if purge_before and old_payload: purge_artifacts(self.client, self.prefix, note_id)
|
||||
|
||||
n_name, n_pts = points_for_note(self.prefix, note_pl, None, self.dim)
|
||||
upsert_batch(self.client, n_name, n_pts)
|
||||
if chunk_pls and vecs:
|
||||
upsert_batch(self.client, f"{self.prefix}_chunks", points_for_chunks(self.prefix, chunk_pls, vecs)[1])
|
||||
if explicit_edges:
|
||||
# WICHTIG: wait=True für Phase-1 Konsistenz
|
||||
upsert_batch(self.client, f"{self.prefix}_edges", points_for_edges(self.prefix, explicit_edges)[1], wait=True)
|
||||
col_n, pts_n = points_for_note(self.prefix, note_pl, None, self.dim)
|
||||
upsert_batch(self.client, col_n, pts_n, wait=True)
|
||||
|
||||
logger.info(f" ✨ Phase 1 fertig: {len(chunk_pls)} Chunks, {len(explicit_edges)} explizite Kanten.")
|
||||
if chunk_pls and vecs:
|
||||
col_c, pts_c = points_for_chunks(self.prefix, chunk_pls, vecs)
|
||||
upsert_batch(self.client, col_c, pts_c, wait=True)
|
||||
|
||||
if explicit_edges:
|
||||
col_e, pts_e = points_for_edges(self.prefix, explicit_edges)
|
||||
# WICHTIG: wait=True garantiert, dass die Kanten indiziert sind, bevor Phase 2 prüft
|
||||
upsert_batch(self.client, col_e, pts_e, wait=True)
|
||||
|
||||
logger.info(f" ✨ Phase 1 fertig: {len(explicit_edges)} explizite Kanten für '{note_id}'.")
|
||||
return {"status": "success", "note_id": note_id, "edges_count": len(explicit_edges)}
|
||||
|
||||
except Exception as e:
|
||||
|
|
@ -288,9 +294,10 @@ class IngestionService:
|
|||
return {**result, "status": "error", "error": str(e)}
|
||||
|
||||
async def create_from_text(self, markdown_content: str, filename: str, vault_root: str, folder: str = "00_Inbox") -> Dict[str, Any]:
|
||||
"""Erstellt eine Note aus einem Textstream."""
|
||||
"""Erstellt eine Note aus einem Textstream und triggert die Ingestion."""
|
||||
target_path = os.path.join(vault_root, folder, filename)
|
||||
os.makedirs(os.path.dirname(target_path), exist_ok=True)
|
||||
with open(target_path, "w", encoding="utf-8") as f: f.write(markdown_content)
|
||||
with open(target_path, "w", encoding="utf-8") as f:
|
||||
f.write(markdown_content)
|
||||
await asyncio.sleep(0.1)
|
||||
return await self.process_file(file_path=target_path, vault_root=vault_root, apply=True, force_replace=True, purge_before=True)
|
||||
|
|
@ -4,116 +4,237 @@
|
|||
FILE: scripts/import_markdown.py
|
||||
VERSION: 2.6.0 (2026-01-10)
|
||||
STATUS: Active (Core)
|
||||
COMPATIBILITY: IngestionProcessor v3.3.7+
|
||||
Zweck: Hauptwerkzeug zum Importieren von Markdown-Dateien.
|
||||
Implementiert die globale 2-Phasen-Schreibstrategie.
|
||||
COMPATIBILITY: IngestionProcessor v3.4.1+
|
||||
|
||||
Zweck:
|
||||
-------
|
||||
Hauptwerkzeug zum Importieren von Markdown-Dateien aus einem lokalen Obsidian-Vault in die
|
||||
Qdrant Vektor-Datenbank. Das Script ist darauf optimiert, die strukturelle Integrität des
|
||||
Wissensgraphen zu wahren und die manuelle Nutzer-Autorität vor automatisierten System-Eingriffen
|
||||
zu schützen.
|
||||
|
||||
Hintergrund der 2-Phasen-Strategie (Authority-First):
|
||||
------------------------------------------------------
|
||||
Um das Problem der "Ghost-IDs" und der asynchronen Überschreibungen zu lösen, implementiert
|
||||
dieses Script eine strikte Trennung der Schreibvorgänge:
|
||||
|
||||
1. PHASE 1: Authority Processing (Batch-Modus)
|
||||
- Alle Dateien werden gescannt und verarbeitet.
|
||||
- Notizen, Chunks und explizite (vom Nutzer gesetzte) Kanten werden sofort geschrieben.
|
||||
- Durch die Verwendung von 'wait=True' im Datenbank-Layer wird sichergestellt,
|
||||
dass diese Informationen physisch indiziert sind, bevor der nächste Schritt erfolgt.
|
||||
- Symmetrische Gegenkanten werden während dieser Phase lediglich im Speicher gepuffert.
|
||||
|
||||
2. PHASE 2: Global Symmetry Commitment (Finaler Schritt)
|
||||
- Erst nach Abschluss aller Batches wird die Methode commit_vault_symmetries() aufgerufen.
|
||||
- Diese prüft die gepufferten Symmetrie-Vorschläge gegen die bereits existierende
|
||||
Nutzer-Autorität in der Datenbank.
|
||||
- Existiert bereits eine manuelle Kante für dieselbe Verbindung, wird die automatische
|
||||
Symmetrie unterdrückt.
|
||||
|
||||
Detaillierte Funktionsweise:
|
||||
----------------------------
|
||||
1. PASS 1: Global Pre-Scan
|
||||
- Scannt rekursiv alle Markdown-Dateien im Vault.
|
||||
- Schließt System-Ordner wie .trash, .obsidian, .sync sowie Vorlagen konsequent aus.
|
||||
- Extrahiert Note-Kontext (ID, Titel, Dateiname) ohne DB-Schreibzugriff.
|
||||
- Füllt den LocalBatchCache im IngestionService, der als Single-Source-of-Truth für
|
||||
die spätere Link-Auflösung (Kanonisierung) dient.
|
||||
- Dies stellt sicher, dass Wikilinks wie [[Klaus]] korrekt zu Zeitstempel-IDs wie
|
||||
202601031726-klaus aufgelöst werden, BEVOR eine UUID für die Kante berechnet wird.
|
||||
|
||||
2. PASS 2: Semantic Processing
|
||||
- Verarbeitet Dateien in konfigurierten Batches (Standard: 20 Dateien).
|
||||
- Implementiert Cloud-Resilienz durch Semaphoren (max. 5 parallele Zugriffe).
|
||||
- Nutzt die Mixture of Experts (MoE) Architektur zur semantischen Validierung von Links.
|
||||
- Führt eine Hash-basierte Change Detection durch, um unnötige Schreibvorgänge zu vermeiden.
|
||||
- Schreibt die Ergebnisse (Notes, Chunks, Explicit Edges) konsistent nach Qdrant.
|
||||
|
||||
Ergebnis-Interpretation:
|
||||
------------------------
|
||||
- Log-Ausgabe: Liefert detaillierte Informationen über den Fortschritt, LLM-Entscheidungen
|
||||
und die finale Symmetrie-Validierung.
|
||||
- Statistiken: Gibt am Ende eine Zusammenfassung über verarbeitete, übersprungene und
|
||||
fehlerhafte Dateien aus.
|
||||
- Dry-Run: Ohne den Parameter --apply werden keine physischen Änderungen an der Datenbank
|
||||
vorgenommen; die Verarbeitung endet je Datei nach der Change Detection mit dem Status 'dry-run' (ohne LLM-Anfragen).
|
||||
|
||||
Verwendung:
|
||||
-----------
|
||||
- Regelmäßiger Import nach Änderungen im Vault.
|
||||
- Initialer Aufbau eines neuen Wissensgraphen.
|
||||
- Erzwingung einer Re-Indizierung mittels --force.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import argparse
|
||||
import logging
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Any
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# Root Logger Setup
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s')
|
||||
# Root Logger Setup: INFO-Level für volle Transparenz der fachlichen Prozesse
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s [%(levelname)s] %(message)s'
|
||||
)
|
||||
|
||||
# Sicherstellung, dass das Root-Verzeichnis im Python-Pfad liegt
|
||||
sys.path.append(os.getcwd())
|
||||
|
||||
# App-spezifische Imports
|
||||
from app.core.ingestion import IngestionService
|
||||
from app.core.parser import pre_scan_markdown
|
||||
|
||||
logger = logging.getLogger("importer")
|
||||
|
||||
async def main_async(args):
|
||||
"""
|
||||
Haupt-Workflow der Ingestion. Koordiniert die zwei Durchläufe (Pass 1/2)
|
||||
und die zwei Schreibphasen (Phase 1/2).
|
||||
"""
|
||||
vault_path = Path(args.vault).resolve()
|
||||
if not vault_path.exists():
|
||||
logger.error(f"Vault path does not exist: {vault_path}")
|
||||
logger.error(f"Vault-Pfad existiert nicht: {vault_path}")
|
||||
return
|
||||
|
||||
# 1. Initialisierung des zentralen Ingestion-Services
|
||||
logger.info(f"Initializing IngestionService (Prefix: {args.prefix})")
|
||||
service = IngestionService(collection_prefix=args.prefix)
|
||||
|
||||
logger.info(f"Scanning {vault_path}...")
|
||||
all_files = list(vault_path.rglob("*.md"))
|
||||
all_files_raw = list(vault_path.rglob("*.md"))
|
||||
|
||||
# --- GLOBALER ORDNER-FILTER ---
|
||||
# Diese Liste stellt sicher, dass keine System-Leichen oder temporäre Dateien
|
||||
# den Graphen korrumpieren oder zu ID-Kollisionen führen.
|
||||
files = []
|
||||
ignore_folders = [".trash", ".obsidian", ".sync", "templates", "_system"]
|
||||
for f in all_files:
|
||||
ignore_list = [".trash", ".obsidian", ".sync", "templates", "_system", ".git"]
|
||||
|
||||
for f in all_files_raw:
|
||||
f_str = str(f)
|
||||
if not any(folder in f_str for folder in ignore_folders) and not "/." in f_str:
|
||||
# Filtert Ordner aus der ignore_list und versteckte Verzeichnisse
|
||||
if not any(folder in f_str for folder in ignore_list) and not "/." in f_str:
|
||||
files.append(f)
|
||||
|
||||
files.sort()
|
||||
logger.info(f"Found {len(files)} relevant markdown files.")
|
||||
logger.info(f"Found {len(files)} relevant markdown files (filtered trash/system/hidden).")
|
||||
|
||||
# =========================================================================
|
||||
# PASS 1: Global Pre-Scan
|
||||
# Ziel: Aufbau eines vollständigen Mappings von Bezeichnungen zu stabilen IDs.
|
||||
# =========================================================================
|
||||
logger.info(f"🔍 [Pass 1] Pre-scanning files for global context cache...")
|
||||
logger.info(f"🔍 [Pass 1] Global Pre-Scan: Building context cache for {len(files)} files...")
|
||||
for f_path in files:
|
||||
try:
|
||||
# Extrahiert Frontmatter und Metadaten ohne DB-Last
|
||||
ctx = pre_scan_markdown(str(f_path))
|
||||
if ctx:
|
||||
# Mehrfache Indizierung für maximale Trefferrate bei Wikilinks
|
||||
service.batch_cache[ctx.note_id] = ctx
|
||||
service.batch_cache[ctx.title] = ctx
|
||||
fname = os.path.splitext(f_path.name)[0]
|
||||
service.batch_cache[fname] = ctx
|
||||
except Exception: pass
|
||||
# Auch den Dateinamen ohne Endung als Alias hinterlegen
|
||||
service.batch_cache[os.path.splitext(f_path.name)[0]] = ctx
|
||||
except Exception as e:
|
||||
logger.warning(f"⚠️ Pre-scan fehlgeschlagen für {f_path.name}: {e}")
|
||||
|
||||
# =========================================================================
|
||||
# PHASE 1: Batch-Import (Notes & Explicit Edges)
|
||||
# PHASE 1: Authority Processing (Batch-Lauf)
|
||||
# Ziel: Verarbeitung der Dateiinhalte und Speicherung der Nutzer-Autorität.
|
||||
# =========================================================================
|
||||
stats = {"processed": 0, "skipped": 0, "errors": 0}
|
||||
sem = asyncio.Semaphore(5)
|
||||
# Semaphore begrenzt die Parallelität zum Schutz der lokalen oder Cloud-API
|
||||
sem = asyncio.Semaphore(5)
|
||||
|
||||
async def process_with_limit(f_path):
|
||||
"""Kapselt den Prozess-Aufruf mit Ressourcen-Limitierung."""
|
||||
async with sem:
|
||||
try:
|
||||
# Nutzt process_file (v3.3.7)
|
||||
# Verwendet process_file (v3.4.1), das explizite Kanten sofort schreibt
|
||||
# und Symmetrien für Phase 2 im Service-Puffer sammelt.
|
||||
return await service.process_file(
|
||||
file_path=str(f_path), vault_root=str(vault_path),
|
||||
force_replace=args.force, apply=args.apply, purge_before=True
|
||||
file_path=str(f_path),
|
||||
vault_root=str(vault_path),
|
||||
force_replace=args.force,
|
||||
apply=args.apply,
|
||||
purge_before=True
|
||||
)
|
||||
except Exception as e:
|
||||
return {"status": "error", "error": str(e), "path": str(f_path)}
|
||||
|
||||
logger.info(f"🚀 [Phase 1] Starting semantic processing in batches...")
|
||||
|
||||
batch_size = 20
|
||||
for i in range(0, len(files), batch_size):
|
||||
batch = files[i:i+batch_size]
|
||||
logger.info(f"--- Processing Batch {i//batch_size + 1} ---")
|
||||
logger.info(f"--- Processing Batch {i//batch_size + 1} ({len(batch)} files) ---")
|
||||
|
||||
# Parallelisierung innerhalb des Batches (begrenzt durch sem)
|
||||
tasks = [process_with_limit(f) for f in batch]
|
||||
results = await asyncio.gather(*tasks)
|
||||
|
||||
for res in results:
|
||||
if res.get("status") == "success": stats["processed"] += 1
|
||||
elif res.get("status") == "error": stats["errors"] += 1
|
||||
else: stats["skipped"] += 1
|
||||
# Robuste Auswertung der Rückgabe-Dictionaries
|
||||
if not isinstance(res, dict):
|
||||
stats["errors"] += 1
|
||||
continue
|
||||
|
||||
status = res.get("status")
|
||||
if status == "success":
|
||||
stats["processed"] += 1
|
||||
elif status == "error":
|
||||
stats["errors"] += 1
|
||||
logger.error(f"❌ Fehler in {res.get('path')}: {res.get('error')}")
|
||||
elif status == "unchanged":
|
||||
stats["skipped"] += 1
|
||||
else:
|
||||
stats["skipped"] += 1
|
||||
|
||||
# =========================================================================
|
||||
# PHASE 2: Global Symmetry Injection (Nach Abschluss aller Batches)
|
||||
# PHASE 2: Global Symmetry Commitment
|
||||
# Ziel: Finale Integrität. Triggert erst, wenn Phase 1 komplett indiziert ist.
|
||||
# =========================================================================
|
||||
if args.apply:
|
||||
logger.info(f"🔄 [Phase 2] Starting global symmetry injection for the entire vault...")
|
||||
sym_res = await service.commit_vault_symmetries()
|
||||
if sym_res.get("status") == "success":
|
||||
logger.info(f"✅ Finished global symmetry injection. Added: {sym_res.get('added', 0)}")
|
||||
try:
|
||||
# Diese Methode prüft den Puffer gegen die nun vollständige Datenbank
|
||||
sym_res = await service.commit_vault_symmetries()
|
||||
if sym_res.get("status") == "success":
|
||||
logger.info(f"✅ Phase 2 abgeschlossen. Hinzugefügt: {sym_res.get('added', 0)} geschützte Symmetrien.")
|
||||
else:
|
||||
logger.info(f"⏭️ Phase 2 übersprungen: {sym_res.get('reason', 'Keine Daten')}")
|
||||
except Exception as e:
|
||||
logger.error(f"❌ Fehler in Phase 2: {e}")
|
||||
else:
|
||||
logger.info("⏭️ [Phase 2] Dry-Run: Keine Symmetrie-Injektion durchgeführt.")
|
||||
|
||||
logger.info(f"Final Stats: {stats}")
|
||||
logger.info(f"--- Import beendet ---")
|
||||
logger.info(f"Statistiken: {stats}")
|
||||
|
||||
def main():
|
||||
"""Einstiegspunkt und Argument-Parsing."""
|
||||
load_dotenv()
|
||||
|
||||
# Standard-Präfix aus Umgebungsvariable oder Fallback
|
||||
default_prefix = os.getenv("COLLECTION_PREFIX", "mindnet")
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--vault", default="./vault")
|
||||
parser.add_argument("--prefix", default=default_prefix)
|
||||
parser.add_argument("--force", action="store_true")
|
||||
parser.add_argument("--apply", action="store_true")
|
||||
|
||||
parser = argparse.ArgumentParser(description="Mindnet Ingester: Two-Phase Markdown Import")
|
||||
parser.add_argument("--vault", default="./vault", help="Pfad zum Obsidian Vault")
|
||||
parser.add_argument("--prefix", default=default_prefix, help="Qdrant Collection Präfix")
|
||||
parser.add_argument("--force", action="store_true", help="Erzwingt Neu-Indizierung aller Dateien")
|
||||
parser.add_argument("--apply", action="store_true", help="Schreibt physisch in die Datenbank")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
try:
|
||||
asyncio.run(main_async(args))
|
||||
except KeyboardInterrupt:
|
||||
logger.info("Import durch Nutzer abgebrochen.")
|
||||
except Exception as e:
|
||||
logger.critical(f"FATAL ERROR: {e}")
|
||||
logger.critical(f"FATALER FEHLER: {e}", exc_info=True)
|
||||
sys.exit(1)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Loading…
Reference in New Issue
Block a user