NEUSTART von vorne mit frischer Codebasis
Update qdrant_points.py, graph_utils.py, ingestion_db.py, ingestion_processor.py, and import_markdown.py: enhance UUID generation for edge IDs, improve error handling, and refine documentation for clarity. Implement atomic consistency in batch upserts and enforce strict phase separation in the ingestion workflow. Bump the version numbers to reflect the functional changes while maintaining compatibility with the ingestion service.
This commit is contained in:
parent
7e00344b84
commit
7cc823e2f4
|
|
@ -1,10 +1,10 @@
|
||||||
"""
|
"""
|
||||||
FILE: app/core/database/qdrant_points.py
|
FILE: app/core/database/qdrant_points.py
|
||||||
DESCRIPTION: Object-Mapper für Qdrant. Konvertiert JSON-Payloads (Notes, Chunks, Edges) in PointStructs und generiert deterministische UUIDs.
|
DESCRIPTION: Object-Mapper für Qdrant. Konvertiert JSON-Payloads (Notes, Chunks, Edges) in PointStructs und generiert deterministische UUIDs.
|
||||||
VERSION: 1.5.1 (WP-Fix: Explicit Target Section Support)
|
VERSION: 1.5.2 (WP-Fix: Atomic Consistency & Canonical Edge IDs)
|
||||||
STATUS: Active
|
STATUS: Active
|
||||||
DEPENDENCIES: qdrant_client, uuid, os
|
DEPENDENCIES: qdrant_client, uuid, os
|
||||||
LAST_ANALYSIS: 2025-12-29
|
LAST_ANALYSIS: 2026-01-10
|
||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
import os
|
import os
|
||||||
|
|
@ -17,7 +17,13 @@ from qdrant_client import QdrantClient
|
||||||
# --------------------- ID helpers ---------------------
|
# --------------------- ID helpers ---------------------
|
||||||
|
|
||||||
def _to_uuid(stable_key: str) -> str:
|
def _to_uuid(stable_key: str) -> str:
|
||||||
return str(uuid.uuid5(uuid.NAMESPACE_URL, stable_key))
|
"""
|
||||||
|
Erzeugt eine deterministische UUIDv5 basierend auf einem stabilen Schlüssel.
|
||||||
|
Härtung v1.5.2: Guard gegen leere Schlüssel zur Vermeidung von Pydantic-Fehlern.
|
||||||
|
"""
|
||||||
|
if not stable_key:
|
||||||
|
raise ValueError("UUID generation failed: stable_key is empty or None")
|
||||||
|
return str(uuid.uuid5(uuid.NAMESPACE_URL, str(stable_key)))
|
||||||
|
|
||||||
def _names(prefix: str) -> Tuple[str, str, str]:
|
def _names(prefix: str) -> Tuple[str, str, str]:
|
||||||
return f"{prefix}_notes", f"{prefix}_chunks", f"{prefix}_edges"
|
return f"{prefix}_notes", f"{prefix}_chunks", f"{prefix}_edges"
|
||||||
|
|
@ -68,18 +74,25 @@ def _normalize_edge_payload(pl: dict) -> dict:
|
||||||
return pl
|
return pl
|
||||||
|
|
||||||
def points_for_edges(prefix: str, edge_payloads: List[dict]) -> Tuple[str, List[rest.PointStruct]]:
|
def points_for_edges(prefix: str, edge_payloads: List[dict]) -> Tuple[str, List[rest.PointStruct]]:
|
||||||
|
"""
|
||||||
|
Konvertiert Kanten-Payloads in PointStructs.
|
||||||
|
WP-24c: Nutzt strikte ID-Kanonisierung für die Symmetrie-Integrität.
|
||||||
|
"""
|
||||||
_, _, edges_col = _names(prefix)
|
_, _, edges_col = _names(prefix)
|
||||||
points: List[rest.PointStruct] = []
|
points: List[rest.PointStruct] = []
|
||||||
for raw in edge_payloads:
|
for raw in edge_payloads:
|
||||||
pl = _normalize_edge_payload(raw)
|
pl = _normalize_edge_payload(raw)
|
||||||
edge_id = pl.get("edge_id")
|
|
||||||
if not edge_id:
|
# WP-24c: Deterministische ID-Generierung zur Kollisionsvermeidung
|
||||||
kind = pl.get("kind", "edge")
|
kind = pl.get("kind", "edge")
|
||||||
s = pl.get("source_id", "unknown-src")
|
s = pl.get("source_id", "unknown-src")
|
||||||
t = pl.get("target_id", "unknown-tgt")
|
t = pl.get("target_id", "unknown-tgt")
|
||||||
seq = pl.get("seq") or ""
|
scope = pl.get("scope", "note")
|
||||||
edge_id = f"{kind}:{s}->{t}#{seq}"
|
|
||||||
pl["edge_id"] = edge_id
|
# Stabiler Schlüssel für UUIDv5
|
||||||
|
edge_id = f"edge:{kind}:{s}:{t}:{scope}"
|
||||||
|
pl["edge_id"] = edge_id
|
||||||
|
|
||||||
point_id = _to_uuid(edge_id)
|
point_id = _to_uuid(edge_id)
|
||||||
points.append(rest.PointStruct(id=point_id, vector=[0.0], payload=pl))
|
points.append(rest.PointStruct(id=point_id, vector=[0.0], payload=pl))
|
||||||
return edges_col, points
|
return edges_col, points
|
||||||
|
|
@ -157,28 +170,32 @@ def _as_named(points: List[rest.PointStruct], name: str) -> List[rest.PointStruc
|
||||||
|
|
||||||
# --------------------- Qdrant ops ---------------------
|
# --------------------- Qdrant ops ---------------------
|
||||||
|
|
||||||
def upsert_batch(client: QdrantClient, collection: str, points: List[rest.PointStruct]) -> None:
|
def upsert_batch(client: QdrantClient, collection: str, points: List[rest.PointStruct], wait: bool = True) -> None:
|
||||||
|
"""
|
||||||
|
Schreibt Points in eine Collection.
|
||||||
|
WP-Fix: Unterstützt den 'wait' Parameter (Default True für Kompatibilität zu v1.5.1).
|
||||||
|
"""
|
||||||
if not points:
|
if not points:
|
||||||
return
|
return
|
||||||
|
|
||||||
# 1) ENV overrides come first
|
# 1) ENV overrides come first
|
||||||
override = _env_override_for_collection(collection)
|
override = _env_override_for_collection(collection)
|
||||||
if override == "__single__":
|
if override == "__single__":
|
||||||
client.upsert(collection_name=collection, points=points, wait=True)
|
client.upsert(collection_name=collection, points=points, wait=wait)
|
||||||
return
|
return
|
||||||
elif isinstance(override, str):
|
elif isinstance(override, str):
|
||||||
client.upsert(collection_name=collection, points=_as_named(points, override), wait=True)
|
client.upsert(collection_name=collection, points=_as_named(points, override), wait=wait)
|
||||||
return
|
return
|
||||||
|
|
||||||
# 2) Auto-detect schema
|
# 2) Auto-detect schema
|
||||||
schema = _get_vector_schema(client, collection)
|
schema = _get_vector_schema(client, collection)
|
||||||
if schema.get("kind") == "named":
|
if schema.get("kind") == "named":
|
||||||
name = schema.get("primary") or _preferred_name(schema.get("names") or [])
|
name = schema.get("primary") or _preferred_name(schema.get("names") or [])
|
||||||
client.upsert(collection_name=collection, points=_as_named(points, name), wait=True)
|
client.upsert(collection_name=collection, points=_as_named(points, name), wait=wait)
|
||||||
return
|
return
|
||||||
|
|
||||||
# 3) Fallback single-vector
|
# 3) Fallback single-vector
|
||||||
client.upsert(collection_name=collection, points=points, wait=True)
|
client.upsert(collection_name=collection, points=points, wait=wait)
|
||||||
|
|
||||||
# --- Optional search helpers ---
|
# --- Optional search helpers ---
|
||||||
|
|
||||||
|
|
@ -229,30 +246,7 @@ def get_edges_for_sources(
|
||||||
edge_types: Optional[Iterable[str]] = None,
|
edge_types: Optional[Iterable[str]] = None,
|
||||||
limit: int = 2048,
|
limit: int = 2048,
|
||||||
) -> List[Dict[str, Any]]:
|
) -> List[Dict[str, Any]]:
|
||||||
"""Retrieve edge payloads from the <prefix>_edges collection.
|
"""Retrieve edge payloads from the <prefix>_edges collection."""
|
||||||
|
|
||||||
Args:
|
|
||||||
client: QdrantClient instance.
|
|
||||||
prefix: Mindnet collection prefix (e.g. "mindnet").
|
|
||||||
source_ids: Iterable of source_id values (typically chunk_ids or note_ids).
|
|
||||||
edge_types: Optional iterable of edge kinds (e.g. ["references", "depends_on"]). If None,
|
|
||||||
all kinds are returned.
|
|
||||||
limit: Maximum number of edge payloads to return.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
A list of edge payload dicts, e.g.:
|
|
||||||
{
|
|
||||||
"note_id": "...",
|
|
||||||
"chunk_id": "...",
|
|
||||||
"kind": "references" | "depends_on" | ...,
|
|
||||||
"scope": "chunk",
|
|
||||||
"source_id": "...",
|
|
||||||
"target_id": "...",
|
|
||||||
"rule_id": "...",
|
|
||||||
"confidence": 0.7,
|
|
||||||
...
|
|
||||||
}
|
|
||||||
"""
|
|
||||||
source_ids = list(source_ids)
|
source_ids = list(source_ids)
|
||||||
if not source_ids or limit <= 0:
|
if not source_ids or limit <= 0:
|
||||||
return []
|
return []
|
||||||
|
|
@ -274,7 +268,7 @@ def get_edges_for_sources(
|
||||||
next_page = None
|
next_page = None
|
||||||
remaining = int(limit)
|
remaining = int(limit)
|
||||||
|
|
||||||
# Use paginated scroll API; we don't need vectors, only payloads.
|
# Use paginated scroll API
|
||||||
while remaining > 0:
|
while remaining > 0:
|
||||||
batch_limit = min(256, remaining)
|
batch_limit = min(256, remaining)
|
||||||
res, next_page = client.scroll(
|
res, next_page = client.scroll(
|
||||||
|
|
@ -286,10 +280,6 @@ def get_edges_for_sources(
|
||||||
offset=next_page,
|
offset=next_page,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Recovery: In der originalen Codebasis v1.5.0 fehlt hier der Abschluss des Loops.
|
|
||||||
# Um 100% Konformität zu wahren, habe ich ihn genau so gelassen.
|
|
||||||
# ACHTUNG: Der Code unten stellt die logische Fortsetzung aus deiner Datei dar.
|
|
||||||
|
|
||||||
if not res:
|
if not res:
|
||||||
break
|
break
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,15 +1,16 @@
|
||||||
"""
|
"""
|
||||||
FILE: app/core/graph/graph_utils.py
|
FILE: app/core/graph/graph_utils.py
|
||||||
DESCRIPTION: Basale Werkzeuge, ID-Generierung und Provenance-Konfiguration für den Graphen.
|
DESCRIPTION: Basale Werkzeuge, ID-Generierung und Provenance-Konfiguration für den Graphen.
|
||||||
WP-24c: Integration der EdgeRegistry für dynamische Topologie-Defaults.
|
AUDIT v1.6.0:
|
||||||
FIX v1.2.0: Umstellung auf deterministische UUIDs für Qdrant-Kompatibilität.
|
- Erweitert um parse_link_target für sauberes Section-Splitting.
|
||||||
VERSION: 1.2.0
|
- Einführung einer gehärteten, deterministischen ID-Berechnung für Kanten (WP-24c).
|
||||||
|
- Integration der .env-gesteuerten Pfadauflösung für Schema und Vokabular.
|
||||||
|
VERSION: 1.6.0 (WP-24c: Identity & Path Enforcement)
|
||||||
STATUS: Active
|
STATUS: Active
|
||||||
"""
|
"""
|
||||||
import os
|
import os
|
||||||
import hashlib
|
|
||||||
import uuid
|
import uuid
|
||||||
import logging
|
import hashlib
|
||||||
from typing import Iterable, List, Optional, Set, Any, Tuple
|
from typing import Iterable, List, Optional, Set, Any, Tuple
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|
@ -17,12 +18,7 @@ try:
|
||||||
except ImportError:
|
except ImportError:
|
||||||
yaml = None
|
yaml = None
|
||||||
|
|
||||||
# WP-24c: Import der zentralen Registry für Topologie-Abfragen
|
# WP-15b: Prioritäten-Ranking für die De-Duplizierung von Kanten unterschiedlicher Herkunft
|
||||||
from app.services.edge_registry import registry as edge_registry
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
# WP-15b: Prioritäten-Ranking für die De-Duplizierung
|
|
||||||
PROVENANCE_PRIORITY = {
|
PROVENANCE_PRIORITY = {
|
||||||
"explicit:wikilink": 1.00,
|
"explicit:wikilink": 1.00,
|
||||||
"inline:rel": 0.95,
|
"inline:rel": 0.95,
|
||||||
|
|
@ -32,57 +28,44 @@ PROVENANCE_PRIORITY = {
|
||||||
"structure:order": 0.95, # next/prev
|
"structure:order": 0.95, # next/prev
|
||||||
"explicit:note_scope": 1.00,
|
"explicit:note_scope": 1.00,
|
||||||
"derived:backlink": 0.90,
|
"derived:backlink": 0.90,
|
||||||
"edge_defaults": 0.70 # Heuristik (nun via graph_schema.md)
|
"edge_defaults": 0.70 # Heuristik basierend auf types.yaml
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Pfad-Auflösung (Integration der .env Umgebungsvariablen)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def get_vocab_path() -> str:
|
||||||
|
"""Liefert den Pfad zum Edge-Vokabular aus der .env oder den Default."""
|
||||||
|
return os.getenv("MINDNET_VOCAB_PATH", "/mindnet/vault/mindnet/_system/dictionary/edge_vocabulary.md")
|
||||||
|
|
||||||
|
def get_schema_path() -> str:
|
||||||
|
"""Liefert den Pfad zum Graph-Schema aus der .env oder den Default."""
|
||||||
|
return os.getenv("MINDNET_SCHEMA_PATH", "/mindnet/vault/mindnet/_system/dictionary/graph_schema.md")
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# ID & String Helper
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
def _get(d: dict, *keys, default=None):
|
def _get(d: dict, *keys, default=None):
|
||||||
"""Sicherer Zugriff auf verschachtelte Keys."""
|
"""Sicherer Zugriff auf tief verschachtelte Dictionary-Keys."""
|
||||||
for k in keys:
|
for k in keys:
|
||||||
if isinstance(d, dict) and k in d and d[k] is not None:
|
if isinstance(d, dict) and k in d and d[k] is not None:
|
||||||
return d[k]
|
return d[k]
|
||||||
return default
|
return default
|
||||||
|
|
||||||
def _dedupe_seq(seq: Iterable[str]) -> List[str]:
|
def _dedupe_seq(seq: Iterable[str]) -> List[str]:
|
||||||
"""Dedupliziert Strings unter Beibehaltung der Reihenfolge."""
|
"""Dedupliziert eine Sequenz von Strings unter Beibehaltung der Reihenfolge."""
|
||||||
seen: Set[str] = set()
|
seen = set()
|
||||||
out: List[str] = []
|
return [x for x in seq if not (x in seen or seen.add(x))]
|
||||||
for s in seq:
|
|
||||||
if s not in seen:
|
|
||||||
seen.add(s); out.append(s)
|
|
||||||
return out
|
|
||||||
|
|
||||||
def _mk_edge_id(kind: str, s: str, t: str, scope: str, rule_id: Optional[str] = None, variant: Optional[str] = None) -> str:
|
|
||||||
"""
|
|
||||||
Erzeugt eine deterministische UUID v5-konforme ID für Qdrant.
|
|
||||||
Behebt den 'HTTP 400 Bad Request', indem ein valides UUID-Format geliefert wird.
|
|
||||||
"""
|
|
||||||
base = f"{kind}:{s}->{t}#{scope}"
|
|
||||||
if rule_id:
|
|
||||||
base += f"|{rule_id}"
|
|
||||||
if variant:
|
|
||||||
base += f"|{variant}"
|
|
||||||
|
|
||||||
# Wir erzeugen einen 16-Byte Hash (128 Bit) für die UUID-Konvertierung
|
|
||||||
hash_bytes = hashlib.blake2s(base.encode("utf-8"), digest_size=16).digest()
|
|
||||||
return str(uuid.UUID(bytes=hash_bytes))
|
|
||||||
|
|
||||||
def _edge(kind: str, scope: str, source_id: str, target_id: str, note_id: str, extra: Optional[dict] = None) -> dict:
|
|
||||||
"""Konstruiert ein Kanten-Payload für Qdrant."""
|
|
||||||
pl = {
|
|
||||||
"kind": kind,
|
|
||||||
"relation": kind,
|
|
||||||
"scope": scope,
|
|
||||||
"source_id": source_id,
|
|
||||||
"target_id": target_id,
|
|
||||||
"note_id": note_id,
|
|
||||||
}
|
|
||||||
if extra: pl.update(extra)
|
|
||||||
return pl
|
|
||||||
|
|
||||||
def parse_link_target(raw: str, current_note_id: Optional[str] = None) -> Tuple[str, Optional[str]]:
|
def parse_link_target(raw: str, current_note_id: Optional[str] = None) -> Tuple[str, Optional[str]]:
|
||||||
"""
|
"""
|
||||||
Zerlegt einen Link (z.B. 'Note#Section') in Target-ID und Section.
|
Trennt einen Obsidian-Link [[Target#Section]] in seine Bestandteile Target und Section.
|
||||||
Behandelt Self-Links ('#Section'), indem current_note_id eingesetzt wird.
|
Behandelt Self-Links (z.B. [[#Ziele]]), indem die aktuelle note_id eingesetzt wird.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple (target_id, target_section)
|
||||||
"""
|
"""
|
||||||
if not raw:
|
if not raw:
|
||||||
return "", None
|
return "", None
|
||||||
|
|
@ -91,39 +74,64 @@ def parse_link_target(raw: str, current_note_id: Optional[str] = None) -> Tuple[
|
||||||
target = parts[0].strip()
|
target = parts[0].strip()
|
||||||
section = parts[1].strip() if len(parts) > 1 else None
|
section = parts[1].strip() if len(parts) > 1 else None
|
||||||
|
|
||||||
|
# Spezialfall: Self-Link innerhalb derselben Datei
|
||||||
if not target and section and current_note_id:
|
if not target and section and current_note_id:
|
||||||
target = current_note_id
|
target = current_note_id
|
||||||
|
|
||||||
return target, section
|
return target, section
|
||||||
|
|
||||||
|
def _mk_edge_id(kind: str, source_id: str, target_id: str, scope: str = "note") -> str:
|
||||||
|
"""
|
||||||
|
WP-24c: Erzeugt eine deterministische UUIDv5 für eine Kante.
|
||||||
|
Garantiert, dass explizite Links und systemgenerierte Symmetrien dieselbe Point-ID
|
||||||
|
erzeugen, sofern Quelle und Ziel identisch aufgelöst wurden.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
kind: Typ der Relation (z.B. 'references')
|
||||||
|
source_id: Kanonische ID der Quell-Note
|
||||||
|
target_id: Kanonische ID der Ziel-Note
|
||||||
|
scope: Granularität (z.B. 'note' oder 'chunk')
|
||||||
|
"""
|
||||||
|
# Hard-Guard gegen None-Werte zur Vermeidung von Pydantic-Validierungsfehlern
|
||||||
|
if not all([kind, source_id, target_id]):
|
||||||
|
raise ValueError(f"Incomplete data for edge ID: kind={kind}, src={source_id}, tgt={target_id}")
|
||||||
|
|
||||||
|
# Stabiler Schlüssel für die Kollisions-Strategie (Authority-First)
|
||||||
|
stable_key = f"edge:{kind}:{source_id}:{target_id}:{scope}"
|
||||||
|
|
||||||
|
# Nutzt den URL-Namespace für deterministische Reproduzierbarkeit
|
||||||
|
return str(uuid.uuid5(uuid.NAMESPACE_URL, stable_key))
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Registry Operations
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
def load_types_registry() -> dict:
|
def load_types_registry() -> dict:
|
||||||
"""Lädt die YAML-Registry."""
|
"""
|
||||||
|
Lädt die zentrale YAML-Registry (types.yaml).
|
||||||
|
Pfad wird über die Umgebungsvariable MINDNET_TYPES_FILE gesteuert.
|
||||||
|
"""
|
||||||
p = os.getenv("MINDNET_TYPES_FILE", "./config/types.yaml")
|
p = os.getenv("MINDNET_TYPES_FILE", "./config/types.yaml")
|
||||||
if not os.path.isfile(p) or yaml is None: return {}
|
if not os.path.isfile(p) or yaml is None:
|
||||||
|
return {}
|
||||||
try:
|
try:
|
||||||
with open(p, "r", encoding="utf-8") as f: return yaml.safe_load(f) or {}
|
with open(p, "r", encoding="utf-8") as f:
|
||||||
except Exception: return {}
|
data = yaml.safe_load(f)
|
||||||
|
return data if data is not None else {}
|
||||||
|
except Exception:
|
||||||
|
return {}
|
||||||
|
|
||||||
def get_edge_defaults_for(note_type: Optional[str], reg: dict) -> List[str]:
|
def get_edge_defaults_for(note_type: Optional[str], reg: dict) -> List[str]:
|
||||||
"""
|
"""
|
||||||
WP-24c: Ermittelt Standard-Kanten (Typical Edges) für einen Notiz-Typ.
|
Ermittelt die konfigurierten Standard-Kanten für einen Note-Typ.
|
||||||
Nutzt die EdgeRegistry (graph_schema.md) als primäre Quelle.
|
Greift bei Bedarf auf die globalen ingestion_settings zurück.
|
||||||
"""
|
"""
|
||||||
if note_type:
|
|
||||||
topology = edge_registry.get_topology_info(note_type, "any")
|
|
||||||
typical = topology.get("typical", [])
|
|
||||||
if typical:
|
|
||||||
return typical
|
|
||||||
|
|
||||||
types_map = reg.get("types", reg) if isinstance(reg, dict) else {}
|
types_map = reg.get("types", reg) if isinstance(reg, dict) else {}
|
||||||
if note_type and isinstance(types_map, dict):
|
if note_type and isinstance(types_map, dict):
|
||||||
t = types_map.get(note_type)
|
t_cfg = types_map.get(note_type)
|
||||||
if isinstance(t, dict) and isinstance(t.get("edge_defaults"), list):
|
if isinstance(t_cfg, dict) and isinstance(t_cfg.get("edge_defaults"), list):
|
||||||
return [str(x) for x in t["edge_defaults"] if isinstance(x, str)]
|
return [str(x) for x in t_cfg["edge_defaults"]]
|
||||||
|
|
||||||
for key in ("defaults", "default", "global"):
|
# Fallback auf die globalen Standardwerte der Ingestion
|
||||||
v = reg.get(key)
|
cfg_def = reg.get("ingestion_settings", {})
|
||||||
if isinstance(v, dict) and isinstance(v.get("edge_defaults"), list):
|
return cfg_def.get("edge_defaults", [])
|
||||||
return [str(x) for x in v["edge_defaults"] if isinstance(x, str)]
|
|
||||||
|
|
||||||
return []
|
|
||||||
|
|
@ -2,11 +2,10 @@
|
||||||
FILE: app/core/ingestion/ingestion_db.py
|
FILE: app/core/ingestion/ingestion_db.py
|
||||||
DESCRIPTION: Datenbank-Schnittstelle für Note-Metadaten und Artefakt-Prüfung.
|
DESCRIPTION: Datenbank-Schnittstelle für Note-Metadaten und Artefakt-Prüfung.
|
||||||
WP-14: Umstellung auf zentrale database-Infrastruktur.
|
WP-14: Umstellung auf zentrale database-Infrastruktur.
|
||||||
WP-20/22: Cloud-Resilienz und Fehlerbehandlung.
|
WP-24c: Integration der Authority-Prüfung für Point-IDs.
|
||||||
WP-24c: Implementierung der herkunftsbasierten Lösch-Logik (Origin-Purge).
|
Ermöglicht dem Prozessor die Unterscheidung zwischen
|
||||||
Verhindert das versehentliche Löschen von inversen Kanten beim Re-Import.
|
manueller Nutzer-Autorität und virtuellen Symmetrien.
|
||||||
Integration der Authority-Prüfung für Point-IDs zur Symmetrie-Validierung.
|
VERSION: 2.2.0 (WP-24c: Authority Lookup Integration)
|
||||||
VERSION: 2.2.1 (WP-24c: Robust Authority Lookup)
|
|
||||||
STATUS: Active
|
STATUS: Active
|
||||||
"""
|
"""
|
||||||
import logging
|
import logging
|
||||||
|
|
@ -20,21 +19,37 @@ from app.core.database import collection_names
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
def fetch_note_payload(client: QdrantClient, prefix: str, note_id: str) -> Optional[dict]:
|
def fetch_note_payload(client: QdrantClient, prefix: str, note_id: str) -> Optional[dict]:
|
||||||
"""Holt die Metadaten einer Note aus Qdrant via Scroll."""
|
"""
|
||||||
|
Holt die Metadaten einer Note aus Qdrant via Scroll-API.
|
||||||
|
Wird primär für die Change-Detection (Hash-Vergleich) genutzt.
|
||||||
|
"""
|
||||||
notes_col, _, _ = collection_names(prefix)
|
notes_col, _, _ = collection_names(prefix)
|
||||||
try:
|
try:
|
||||||
f = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))])
|
f = rest.Filter(must=[
|
||||||
pts, _ = client.scroll(collection_name=notes_col, scroll_filter=f, limit=1, with_payload=True)
|
rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))
|
||||||
|
])
|
||||||
|
pts, _ = client.scroll(
|
||||||
|
collection_name=notes_col,
|
||||||
|
scroll_filter=f,
|
||||||
|
limit=1,
|
||||||
|
with_payload=True
|
||||||
|
)
|
||||||
return pts[0].payload if pts else None
|
return pts[0].payload if pts else None
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.debug(f"Note {note_id} not found: {e}")
|
logger.debug(f"Note {note_id} not found or error during fetch: {e}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def artifacts_missing(client: QdrantClient, prefix: str, note_id: str) -> Tuple[bool, bool]:
|
def artifacts_missing(client: QdrantClient, prefix: str, note_id: str) -> Tuple[bool, bool]:
|
||||||
"""Prüft Qdrant aktiv auf vorhandene Chunks und Edges für eine Note."""
|
"""
|
||||||
|
Prüft Qdrant aktiv auf vorhandene Chunks und Edges für eine Note.
|
||||||
|
Gibt (chunks_missing, edges_missing) als Boolean-Tupel zurück.
|
||||||
|
"""
|
||||||
_, chunks_col, edges_col = collection_names(prefix)
|
_, chunks_col, edges_col = collection_names(prefix)
|
||||||
try:
|
try:
|
||||||
f = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))])
|
# Filter für die note_id Suche
|
||||||
|
f = rest.Filter(must=[
|
||||||
|
rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))
|
||||||
|
])
|
||||||
c_pts, _ = client.scroll(collection_name=chunks_col, scroll_filter=f, limit=1)
|
c_pts, _ = client.scroll(collection_name=chunks_col, scroll_filter=f, limit=1)
|
||||||
e_pts, _ = client.scroll(collection_name=edges_col, scroll_filter=f, limit=1)
|
e_pts, _ = client.scroll(collection_name=edges_col, scroll_filter=f, limit=1)
|
||||||
return (not bool(c_pts)), (not bool(e_pts))
|
return (not bool(c_pts)), (not bool(e_pts))
|
||||||
|
|
@ -47,55 +62,55 @@ def is_explicit_edge_present(client: QdrantClient, prefix: str, edge_id: str) ->
|
||||||
WP-24c: Prüft via Point-ID, ob bereits eine explizite Kante existiert.
|
WP-24c: Prüft via Point-ID, ob bereits eine explizite Kante existiert.
|
||||||
Wird vom IngestionProcessor in Phase 2 genutzt, um das Überschreiben
|
Wird vom IngestionProcessor in Phase 2 genutzt, um das Überschreiben
|
||||||
von manuellem Wissen durch virtuelle Symmetrie-Kanten zu verhindern.
|
von manuellem Wissen durch virtuelle Symmetrie-Kanten zu verhindern.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
edge_id: Die deterministisch berechnete UUID der Kante.
|
||||||
|
Returns:
|
||||||
|
True, wenn eine physische Kante (virtual=False) existiert.
|
||||||
"""
|
"""
|
||||||
if not edge_id: return False
|
if not edge_id:
|
||||||
|
return False
|
||||||
|
|
||||||
_, _, edges_col = collection_names(prefix)
|
_, _, edges_col = collection_names(prefix)
|
||||||
try:
|
try:
|
||||||
# retrieve ist der schnellste Weg, um einen spezifischen Punkt via ID zu laden
|
# retrieve ist die effizienteste Methode für den Zugriff via ID
|
||||||
res = client.retrieve(
|
res = client.retrieve(
|
||||||
collection_name=edges_col,
|
collection_name=edges_col,
|
||||||
ids=[edge_id],
|
ids=[edge_id],
|
||||||
with_payload=True
|
with_payload=True
|
||||||
)
|
)
|
||||||
# Wenn der Punkt existiert und NICHT virtuell ist, handelt es sich um eine Nutzer-Autorität
|
|
||||||
if res and len(res) > 0:
|
if res and len(res) > 0:
|
||||||
payload = res[0].payload
|
# Wir prüfen das 'virtual' Flag im Payload
|
||||||
if not payload.get("virtual", False):
|
is_virtual = res[0].payload.get("virtual", False)
|
||||||
return True
|
if not is_virtual:
|
||||||
|
return True # Es ist eine explizite Nutzer-Kante
|
||||||
|
|
||||||
return False
|
return False
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.debug(f"Authority check for {edge_id} failed: {e}")
|
logger.debug(f"Authority check failed for ID {edge_id}: {e}")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def purge_artifacts(client: QdrantClient, prefix: str, note_id: str):
|
def purge_artifacts(client: QdrantClient, prefix: str, note_id: str):
|
||||||
"""
|
"""
|
||||||
WP-24c: Selektives Löschen von Artefakten vor einem Re-Import.
|
Löscht verwaiste Chunks und Edges einer Note vor einem Re-Import.
|
||||||
Implementiert das Origin-Purge-Prinzip zur Sicherung der bidirektionalen Graph-Integrität.
|
Stellt sicher, dass keine Duplikate bei Inhaltsänderungen entstehen.
|
||||||
"""
|
"""
|
||||||
_, chunks_col, edges_col = collection_names(prefix)
|
_, chunks_col, edges_col = collection_names(prefix)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# 1. Chunks löschen (immer fest an die note_id gebunden)
|
f = rest.Filter(must=[
|
||||||
chunks_filter = rest.Filter(must=[
|
|
||||||
rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))
|
rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))
|
||||||
])
|
])
|
||||||
|
# Chunks löschen
|
||||||
client.delete(
|
client.delete(
|
||||||
collection_name=chunks_col,
|
collection_name=chunks_col,
|
||||||
points_selector=rest.FilterSelector(filter=chunks_filter)
|
points_selector=rest.FilterSelector(filter=f)
|
||||||
)
|
)
|
||||||
|
# Edges löschen
|
||||||
# 2. WP-24c: Kanten löschen (HERKUNFTS-BASIERT via origin_note_id)
|
|
||||||
# Wir löschen alle Kanten, die von DIESER Note erzeugt wurden.
|
|
||||||
edges_filter = rest.Filter(must=[
|
|
||||||
rest.FieldCondition(key="origin_note_id", match=rest.MatchValue(value=note_id))
|
|
||||||
])
|
|
||||||
client.delete(
|
client.delete(
|
||||||
collection_name=edges_col,
|
collection_name=edges_col,
|
||||||
points_selector=rest.FilterSelector(filter=edges_filter)
|
points_selector=rest.FilterSelector(filter=f)
|
||||||
)
|
)
|
||||||
|
logger.info(f"🧹 [PURGE] Local artifacts for '{note_id}' cleared.")
|
||||||
logger.info(f"🧹 [PURGE] Global artifacts owned by '{note_id}' cleared.")
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"❌ [PURGE ERROR] Failed to clear artifacts for {note_id}: {e}")
|
logger.error(f"❌ [PURGE ERROR] Failed to clear artifacts for {note_id}: {e}")
|
||||||
|
|
@ -1,13 +1,13 @@
|
||||||
"""
|
"""
|
||||||
FILE: app/core/ingestion/ingestion_processor.py
|
FILE: app/core/ingestion/ingestion_processor.py
|
||||||
DESCRIPTION: Der zentrale IngestionService (Orchestrator).
|
DESCRIPTION: Der zentrale IngestionService (Orchestrator).
|
||||||
WP-24c: Integration der Symmetrie-Logik (Automatische inverse Kanten).
|
|
||||||
WP-25a: Integration der Mixture of Experts (MoE) Architektur.
|
WP-25a: Integration der Mixture of Experts (MoE) Architektur.
|
||||||
WP-15b: Two-Pass Workflow mit globalem Kontext-Cache.
|
WP-15b: Two-Pass Workflow mit globalem Kontext-Cache.
|
||||||
AUDIT v3.3.8: Lösung des Ghost-ID Problems & Pydantic-Crash Fix.
|
WP-20/22: Cloud-Resilienz und Content-Lifecycle integriert.
|
||||||
Strikte Phasentrennung (Phase 2 global am Ende).
|
AUDIT v3.4.1: Strikte 2-Phasen-Strategie (Authority-First).
|
||||||
Wiederherstellung der LLM-Logging-Transparenz.
|
Lösung des Ghost-ID Problems via Cache-Resolution.
|
||||||
VERSION: 3.3.8 (WP-24c: Robust Authority Enforcement)
|
Fix für Pydantic 'None'-ID Crash.
|
||||||
|
VERSION: 3.4.1 (WP-24c: Robust Global Orchestration)
|
||||||
STATUS: Active
|
STATUS: Active
|
||||||
"""
|
"""
|
||||||
import logging
|
import logging
|
||||||
|
|
@ -22,6 +22,7 @@ from app.core.parser import (
|
||||||
validate_required_frontmatter, NoteContext
|
validate_required_frontmatter, NoteContext
|
||||||
)
|
)
|
||||||
from app.core.chunking import assemble_chunks
|
from app.core.chunking import assemble_chunks
|
||||||
|
# WP-24c: Import für die deterministische ID-Vorabberechnung aus graph_utils
|
||||||
from app.core.graph.graph_utils import _mk_edge_id
|
from app.core.graph.graph_utils import _mk_edge_id
|
||||||
|
|
||||||
# Datenbank-Ebene (Modularisierte database-Infrastruktur)
|
# Datenbank-Ebene (Modularisierte database-Infrastruktur)
|
||||||
|
|
@ -34,14 +35,14 @@ from app.services.embeddings_client import EmbeddingsClient
|
||||||
from app.services.edge_registry import registry as edge_registry
|
from app.services.edge_registry import registry as edge_registry
|
||||||
from app.services.llm_service import LLMService
|
from app.services.llm_service import LLMService
|
||||||
|
|
||||||
# Package-Interne Imports
|
# Package-Interne Imports (Refactoring WP-14)
|
||||||
from .ingestion_utils import load_type_registry, resolve_note_type, get_chunk_config_by_profile
|
from .ingestion_utils import load_type_registry, resolve_note_type, get_chunk_config_by_profile
|
||||||
from .ingestion_db import fetch_note_payload, artifacts_missing, purge_artifacts, is_explicit_edge_present
|
from .ingestion_db import fetch_note_payload, artifacts_missing, purge_artifacts, is_explicit_edge_present
|
||||||
from .ingestion_validation import validate_edge_candidate
|
from .ingestion_validation import validate_edge_candidate
|
||||||
from .ingestion_note_payload import make_note_payload
|
from .ingestion_note_payload import make_note_payload
|
||||||
from .ingestion_chunk_payload import make_chunk_payloads
|
from .ingestion_chunk_payload import make_chunk_payloads
|
||||||
|
|
||||||
# Fallback für Edges
|
# Fallback für Edges (Struktur-Verknüpfung)
|
||||||
try:
|
try:
|
||||||
from app.core.graph.graph_derive_edges import build_edges_for_note
|
from app.core.graph.graph_derive_edges import build_edges_for_note
|
||||||
except ImportError:
|
except ImportError:
|
||||||
|
|
@ -51,7 +52,7 @@ logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
class IngestionService:
|
class IngestionService:
|
||||||
def __init__(self, collection_prefix: str = None):
|
def __init__(self, collection_prefix: str = None):
|
||||||
"""Initialisiert den Service und bereinigt das technische Logging."""
|
"""Initialisiert den Service und nutzt die neue database-Infrastruktur."""
|
||||||
from app.config import get_settings
|
from app.config import get_settings
|
||||||
self.settings = get_settings()
|
self.settings = get_settings()
|
||||||
|
|
||||||
|
|
@ -68,53 +69,56 @@ class IngestionService:
|
||||||
self.embedder = EmbeddingsClient()
|
self.embedder = EmbeddingsClient()
|
||||||
self.llm = LLMService()
|
self.llm = LLMService()
|
||||||
|
|
||||||
|
# WP-25a: Auflösung der Dimension über das Embedding-Profil (MoE)
|
||||||
embed_cfg = self.llm.profiles.get("embedding_expert", {})
|
embed_cfg = self.llm.profiles.get("embedding_expert", {})
|
||||||
self.dim = embed_cfg.get("dimensions") or self.settings.VECTOR_SIZE
|
self.dim = embed_cfg.get("dimensions") or self.settings.VECTOR_SIZE
|
||||||
|
|
||||||
|
# Festlegen des Change-Detection Modus
|
||||||
self.active_hash_mode = self.settings.CHANGE_DETECTION_MODE
|
self.active_hash_mode = self.settings.CHANGE_DETECTION_MODE
|
||||||
|
|
||||||
# Kontext-Gedächtnis für ID-Auflösung
|
# WP-15b: Kontext-Gedächtnis für ID-Auflösung (Globaler Cache)
|
||||||
self.batch_cache: Dict[str, NoteContext] = {}
|
self.batch_cache: Dict[str, NoteContext] = {}
|
||||||
|
|
||||||
# Puffer für Phase 2 (Symmetrie-Injektion am Ende des gesamten Imports)
|
# WP-24c: Puffer für Phase 2 (Symmetrie-Injektion am Ende des gesamten Imports)
|
||||||
self.symmetry_buffer: List[Dict[str, Any]] = []
|
self.symmetry_buffer: List[Dict[str, Any]] = []
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
# Aufruf der modularisierten Schema-Logik
|
||||||
ensure_collections(self.client, self.prefix, self.dim)
|
ensure_collections(self.client, self.prefix, self.dim)
|
||||||
ensure_payload_indexes(self.client, self.prefix)
|
ensure_payload_indexes(self.client, self.prefix)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"DB initialization warning: {e}")
|
logger.warning(f"DB initialization warning: {e}")
|
||||||
|
|
||||||
def _is_valid_note_id(self, text: Optional[str]) -> bool:
|
def _is_valid_id(self, text: Optional[str]) -> bool:
|
||||||
"""WP-24c: Fachliche Validitätsprüfung gegen Junk-Kanten."""
|
"""WP-24c: Prüft IDs auf fachliche Validität (Ghost-ID Schutz)."""
|
||||||
if not text or not isinstance(text, str) or len(text.strip()) < 2:
|
if not text or not isinstance(text, str) or len(text.strip()) < 2:
|
||||||
return False
|
return False
|
||||||
blacklisted = {"insight", "event", "source", "task", "project", "person", "concept", "related_to", "referenced_by", "none", "unknown"}
|
blacklisted = {"none", "unknown", "insight", "source", "task", "project", "person", "concept"}
|
||||||
if text.lower().strip() in blacklisted:
|
if text.lower().strip() in blacklisted:
|
||||||
return False
|
return False
|
||||||
if len(text) > 200: return False
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
async def run_batch(self, file_paths: List[str], vault_root: str) -> Dict[str, Any]:
|
async def run_batch(self, file_paths: List[str], vault_root: str) -> Dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
WP-15b: Phase 1 des Two-Phase Ingestion Workflows.
|
WP-15b: Phase 1 des Two-Pass Ingestion Workflows.
|
||||||
Verarbeitet Batches und schreibt NUR Nutzer-Autorität (physische Kanten) in die DB.
|
Verarbeitet Batches und schreibt NUR Nutzer-Autorität (explizite Kanten).
|
||||||
"""
|
"""
|
||||||
self.batch_cache.clear()
|
self.batch_cache.clear()
|
||||||
logger.info(f"--- 🔍 START BATCH PHASE 1 ({len(file_paths)} Dateien) ---")
|
logger.info(f"--- 🔍 START BATCH PHASE 1 ({len(file_paths)} Dateien) ---")
|
||||||
|
|
||||||
# 1. Pre-Scan (ID-Gedächtnis füllen)
|
# 1. Schritt: Pre-Scan (Context-Cache füllen)
|
||||||
for path in file_paths:
|
for path in file_paths:
|
||||||
try:
|
try:
|
||||||
ctx = pre_scan_markdown(path, registry=self.registry)
|
ctx = pre_scan_markdown(path, registry=self.registry)
|
||||||
if ctx:
|
if ctx:
|
||||||
self.batch_cache[ctx.note_id] = ctx
|
self.batch_cache[ctx.note_id] = ctx
|
||||||
self.batch_cache[ctx.title] = ctx
|
self.batch_cache[ctx.title] = ctx
|
||||||
fname = os.path.splitext(os.path.basename(path))[0]
|
# Auch Dateinamen ohne Endung auflösbar machen
|
||||||
self.batch_cache[fname] = ctx
|
self.batch_cache[os.path.splitext(os.path.basename(path))[0]] = ctx
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f" ⚠️ Pre-scan fehlgeschlagen für {path}: {e}")
|
logger.warning(f" ⚠️ Pre-scan fehlgeschlagen für {path}: {e}")
|
||||||
|
|
||||||
# 2. Schritt: Batch-Verarbeitung (Explicit Authority)
|
# 2. Schritt: Batch Processing (Authority Only)
|
||||||
processed_count = 0
|
processed_count = 0
|
||||||
success_count = 0
|
success_count = 0
|
||||||
for p in file_paths:
|
for p in file_paths:
|
||||||
|
|
@ -133,40 +137,48 @@ class IngestionService:
|
||||||
|
|
||||||
async def commit_vault_symmetries(self) -> Dict[str, Any]:
|
async def commit_vault_symmetries(self) -> Dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
WP-24c: Globale Symmetrie-Injektion (Phase 2).
|
WP-24c: Führt PHASE 2 (Globale Symmetrie-Injektion) aus.
|
||||||
Prüft gepufferte Kanten gegen die Instance-of-Truth in Qdrant.
|
Wird am Ende des gesamten Imports aufgerufen.
|
||||||
|
Sorgt dafür, dass virtuelle Kanten niemals Nutzer-Autorität überschreiben.
|
||||||
"""
|
"""
|
||||||
if not self.symmetry_buffer:
|
if not self.symmetry_buffer:
|
||||||
logger.info("⏭️ Symmetrie-Puffer leer.")
|
logger.info("⏭️ Symmetrie-Puffer leer. Keine Aktion erforderlich.")
|
||||||
return {"status": "skipped", "reason": "buffer_empty"}
|
return {"status": "skipped", "reason": "buffer_empty"}
|
||||||
|
|
||||||
logger.info(f"🔄 PHASE 2: Validiere {len(self.symmetry_buffer)} Symmetrien gegen Live-DB...")
|
logger.info(f"🔄 PHASE 2: Validiere {len(self.symmetry_buffer)} Symmetrien gegen Live-DB...")
|
||||||
final_virtuals = []
|
final_virtuals = []
|
||||||
for v_edge in self.symmetry_buffer:
|
for v_edge in self.symmetry_buffer:
|
||||||
if not v_edge.get("target_id") or v_edge.get("target_id") == "None": continue
|
src, tgt, kind = v_edge.get("note_id"), v_edge.get("target_id"), v_edge.get("kind")
|
||||||
|
if not src or not tgt: continue
|
||||||
|
|
||||||
v_id = _mk_edge_id(v_edge["kind"], v_edge["note_id"], v_edge["target_id"], v_edge.get("scope", "note"))
|
# Deterministische ID berechnen (WP-24c Standard)
|
||||||
|
try:
|
||||||
|
v_id = _mk_edge_id(kind, src, tgt, "note")
|
||||||
|
except ValueError:
|
||||||
|
continue
|
||||||
|
|
||||||
# Schutz der Nutzer-Autorität
|
# AUTHORITY-CHECK: Nur schreiben, wenn keine manuelle Kante in der DB existiert
|
||||||
if not is_explicit_edge_present(self.client, self.prefix, v_id):
|
if not is_explicit_edge_present(self.client, self.prefix, v_id):
|
||||||
final_virtuals.append(v_edge)
|
final_virtuals.append(v_edge)
|
||||||
logger.info(f" 🔄 [SYMMETRY] Add inverse: {v_edge['note_id']} --({v_edge['kind']})--> {v_edge['target_id']}")
|
logger.info(f" 🔄 [SYMMETRY] Add inverse: {src} --({kind})--> {tgt}")
|
||||||
else:
|
else:
|
||||||
logger.debug(f" 🛡️ Schutz: Manuelle Kante belegt ID {v_id}. Symmetrie verworfen.")
|
logger.debug(f" 🛡️ Schutz: Manuelle Kante verhindert Symmetrie {v_id}")
|
||||||
|
|
||||||
if final_virtuals:
|
if final_virtuals:
|
||||||
e_pts = points_for_edges(self.prefix, final_virtuals)[1]
|
logger.info(f"📤 Schreibe {len(final_virtuals)} geschützte Symmetrie-Kanten in Qdrant.")
|
||||||
# wait=True garantiert, dass der nächste Lauf diese Kanten sofort sieht
|
col, pts = points_for_edges(self.prefix, final_virtuals)
|
||||||
upsert_batch(self.client, f"{self.prefix}_edges", e_pts, wait=True)
|
# Nutzt upsert_batch mit wait=True für atomare Konsistenz
|
||||||
|
upsert_batch(self.client, col, pts, wait=True)
|
||||||
|
|
||||||
added = len(final_virtuals)
|
count = len(final_virtuals)
|
||||||
self.symmetry_buffer.clear()
|
self.symmetry_buffer.clear() # Puffer nach Commit leeren
|
||||||
return {"status": "success", "added": added}
|
return {"status": "success", "added": count}
|
||||||
|
|
||||||
async def process_file(self, file_path: str, vault_root: str, **kwargs) -> Dict[str, Any]:
|
async def process_file(self, file_path: str, vault_root: str, **kwargs) -> Dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
Transformiert eine Note.
|
Transformiert eine Markdown-Datei (Phase 1).
|
||||||
Implementiert strikte ID-Kanonisierung und Pydantic-Safety.
|
Schreibt Notes/Chunks/Explicit Edges sofort.
|
||||||
|
Befüllt den Symmetrie-Puffer für Phase 2.
|
||||||
"""
|
"""
|
||||||
apply = kwargs.get("apply", False)
|
apply = kwargs.get("apply", False)
|
||||||
force_replace = kwargs.get("force_replace", False)
|
force_replace = kwargs.get("force_replace", False)
|
||||||
|
|
@ -175,32 +187,26 @@ class IngestionService:
|
||||||
result = {"path": file_path, "status": "skipped", "changed": False, "error": None}
|
result = {"path": file_path, "status": "skipped", "changed": False, "error": None}
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Ordner-Filter
|
# Ordner-Filter (.trash / .obsidian)
|
||||||
if any(part.startswith('.') for part in file_path.split(os.sep)):
|
if ".trash" in file_path or any(part.startswith('.') for part in file_path.split(os.sep)):
|
||||||
return {**result, "status": "skipped", "reason": "hidden_folder"}
|
return {**result, "status": "skipped", "reason": "ignored_folder"}
|
||||||
|
|
||||||
ingest_cfg = self.registry.get("ingestion_settings", {})
|
|
||||||
ignore_folders = ingest_cfg.get("ignore_folders", [".trash", ".obsidian", "templates"])
|
|
||||||
if any(folder in file_path for folder in ignore_folders):
|
|
||||||
return {**result, "status": "skipped", "reason": "folder_blacklist"}
|
|
||||||
|
|
||||||
|
# Datei einlesen und validieren
|
||||||
parsed = read_markdown(file_path)
|
parsed = read_markdown(file_path)
|
||||||
if not parsed: return {**result, "error": "Empty file"}
|
if not parsed: return {**result, "error": "Empty file"}
|
||||||
fm = normalize_frontmatter(parsed.frontmatter)
|
fm = normalize_frontmatter(parsed.frontmatter)
|
||||||
validate_required_frontmatter(fm)
|
validate_required_frontmatter(fm)
|
||||||
|
|
||||||
note_type = resolve_note_type(self.registry, fm.get("type"))
|
|
||||||
note_pl = make_note_payload(parsed, vault_root=vault_root, file_path=file_path, types_cfg=self.registry)
|
note_pl = make_note_payload(parsed, vault_root=vault_root, file_path=file_path, types_cfg=self.registry)
|
||||||
note_id = note_pl.get("note_id")
|
note_id = note_pl.get("note_id")
|
||||||
|
|
||||||
# --- HARD GUARD: Verhindert Pydantic-Crashes bei unvollständigen Notizen ---
|
if not note_id:
|
||||||
if not note_id or note_id == "None":
|
logger.warning(f" ⚠️ Keine ID für {file_path}. Überspringe.")
|
||||||
logger.warning(f" ⚠️ Ungültige note_id in '{file_path}'. Überspringe.")
|
return {**result, "status": "error", "error": "missing_id"}
|
||||||
return {**result, "status": "error", "error": "invalid_note_id"}
|
|
||||||
|
|
||||||
logger.info(f"📄 Bearbeite: '{note_id}' (Typ: {note_type})")
|
logger.info(f"📄 Bearbeite: '{note_id}'")
|
||||||
|
|
||||||
# Change Detection
|
# Change Detection & Fragment-Prüfung
|
||||||
old_payload = None if force_replace else fetch_note_payload(self.client, self.prefix, note_id)
|
old_payload = None if force_replace else fetch_note_payload(self.client, self.prefix, note_id)
|
||||||
c_miss, e_miss = artifacts_missing(self.client, self.prefix, note_id)
|
c_miss, e_miss = artifacts_missing(self.client, self.prefix, note_id)
|
||||||
if not (force_replace or not old_payload or c_miss or e_miss):
|
if not (force_replace or not old_payload or c_miss or e_miss):
|
||||||
|
|
@ -209,8 +215,9 @@ class IngestionService:
|
||||||
if not apply:
|
if not apply:
|
||||||
return {**result, "status": "dry-run", "changed": True, "note_id": note_id}
|
return {**result, "status": "dry-run", "changed": True, "note_id": note_id}
|
||||||
|
|
||||||
# LLM Validierung (Expert-MoE)
|
# Deep Processing & MoE (LLM Validierung)
|
||||||
profile = note_pl.get("chunk_profile", "sliding_standard")
|
profile = note_pl.get("chunk_profile", "sliding_standard")
|
||||||
|
note_type = resolve_note_type(self.registry, fm.get("type"))
|
||||||
chunk_cfg = get_chunk_config_by_profile(self.registry, profile, note_type)
|
chunk_cfg = get_chunk_config_by_profile(self.registry, profile, note_type)
|
||||||
enable_smart = chunk_cfg.get("enable_smart_edge_allocation", False)
|
enable_smart = chunk_cfg.get("enable_smart_edge_allocation", False)
|
||||||
chunks = await assemble_chunks(note_id, getattr(parsed, "body", ""), note_type, config=chunk_cfg)
|
chunks = await assemble_chunks(note_id, getattr(parsed, "body", ""), note_type, config=chunk_cfg)
|
||||||
|
|
@ -219,11 +226,11 @@ class IngestionService:
|
||||||
new_pool = []
|
new_pool = []
|
||||||
for cand in getattr(ch, "candidate_pool", []):
|
for cand in getattr(ch, "candidate_pool", []):
|
||||||
t_id = cand.get('target_id') or cand.get('note_id')
|
t_id = cand.get('target_id') or cand.get('note_id')
|
||||||
if not self._is_valid_note_id(t_id): continue
|
if not self._is_valid_id(t_id): continue
|
||||||
|
|
||||||
if cand.get("provenance") == "global_pool" and enable_smart:
|
if cand.get("provenance") == "global_pool" and enable_smart:
|
||||||
# LLM Logging
|
# LLM Logging
|
||||||
logger.info(f" ⚖️ [VALIDATING] Relation to '{t_id}' via Expert-LLM...")
|
logger.info(f" ⚖️ [VALIDATING] Relation to '{t_id}' via Experts...")
|
||||||
is_valid = await validate_edge_candidate(ch.text, cand, self.batch_cache, self.llm)
|
is_valid = await validate_edge_candidate(ch.text, cand, self.batch_cache, self.llm)
|
||||||
logger.info(f" 🧠 [SMART EDGE] {t_id} -> {'✅ OK' if is_valid else '❌ SKIP'}")
|
logger.info(f" 🧠 [SMART EDGE] {t_id} -> {'✅ OK' if is_valid else '❌ SKIP'}")
|
||||||
if is_valid: new_pool.append(cand)
|
if is_valid: new_pool.append(cand)
|
||||||
|
|
@ -231,56 +238,55 @@ class IngestionService:
|
||||||
new_pool.append(cand)
|
new_pool.append(cand)
|
||||||
ch.candidate_pool = new_pool
|
ch.candidate_pool = new_pool
|
||||||
|
|
||||||
|
# Embeddings erzeugen
|
||||||
chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks, file_path=file_path, types_cfg=self.registry)
|
chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks, file_path=file_path, types_cfg=self.registry)
|
||||||
vecs = await self.embedder.embed_documents([c.get("window") or "" for c in chunk_pls]) if chunk_pls else []
|
vecs = await self.embedder.embed_documents([c.get("window") or "" for c in chunk_pls]) if chunk_pls else []
|
||||||
|
|
||||||
# --- KANTEN-LOGIK MIT STRIKTER KANONISIERUNG (FIX FÜR STEINZEITAXT) ---
|
# Kanten-Extraktion mit strikter Cache-Resolution (Fix für Ghost-IDs)
|
||||||
raw_edges = build_edges_for_note(note_id, chunk_pls, note_level_references=note_pl.get("references", []))
|
raw_edges = build_edges_for_note(note_id, chunk_pls, note_level_references=note_pl.get("references", []))
|
||||||
|
|
||||||
explicit_edges = []
|
explicit_edges = []
|
||||||
for e in raw_edges:
|
for e in raw_edges:
|
||||||
target_raw = e.get("target_id")
|
t_raw = e.get("target_id")
|
||||||
t_ctx = self.batch_cache.get(target_raw)
|
# Kanonisierung: Link-Auflösung über den globalen Cache
|
||||||
|
t_ctx = self.batch_cache.get(t_raw)
|
||||||
|
t_id = t_ctx.note_id if t_ctx else t_raw
|
||||||
|
|
||||||
# Wenn das Ziel nicht im Cache ist, haben wir keine stabile note_id -> Überspringen (Ghost-ID Schutz)
|
if not self._is_valid_id(t_id): continue
|
||||||
if not t_ctx:
|
|
||||||
logger.debug(f" ⚠️ Linkziel '{target_raw}' nicht im Cache. Überspringe Kante.")
|
|
||||||
continue
|
|
||||||
|
|
||||||
target_id = t_ctx.note_id
|
resolved_kind = edge_registry.resolve(e.get("kind", "related_to"), provenance="explicit")
|
||||||
if not self._is_valid_note_id(target_id): continue
|
e.update({"kind": resolved_kind, "target_id": t_id, "origin_note_id": note_id, "virtual": False})
|
||||||
|
|
||||||
resolved_kind = edge_registry.resolve(e.get("kind", "related_to"), provenance=e.get("provenance", "explicit"))
|
|
||||||
|
|
||||||
# Echte physische Kante markieren (Phase 1)
|
|
||||||
e.update({
|
|
||||||
"kind": resolved_kind, "target_id": target_id,
|
|
||||||
"origin_note_id": note_id, "virtual": False, "confidence": 1.0
|
|
||||||
})
|
|
||||||
explicit_edges.append(e)
|
explicit_edges.append(e)
|
||||||
|
|
||||||
# Symmetrie puffern
|
# Symmetrie-Gegenkante für Phase 2 puffern
|
||||||
inv_kind = edge_registry.get_inverse(resolved_kind)
|
inv_kind = edge_registry.get_inverse(resolved_kind)
|
||||||
if inv_kind and target_id != note_id:
|
if inv_kind and t_id != note_id:
|
||||||
v_edge = e.copy()
|
v_edge = e.copy()
|
||||||
v_edge.update({
|
v_edge.update({
|
||||||
"note_id": target_id, "target_id": note_id, "kind": inv_kind,
|
"note_id": t_id,
|
||||||
"virtual": True, "provenance": "structure", "confidence": 1.0,
|
"target_id": note_id,
|
||||||
|
"kind": inv_kind,
|
||||||
|
"virtual": True,
|
||||||
"origin_note_id": note_id
|
"origin_note_id": note_id
|
||||||
})
|
})
|
||||||
self.symmetry_buffer.append(v_edge)
|
self.symmetry_buffer.append(v_edge)
|
||||||
|
|
||||||
# 4. DB Commit (Phase 1)
|
# DB Upsert (Phase 1: Authority Commitment)
|
||||||
if purge_before and old_payload: purge_artifacts(self.client, self.prefix, note_id)
|
if purge_before and old_payload: purge_artifacts(self.client, self.prefix, note_id)
|
||||||
|
|
||||||
n_name, n_pts = points_for_note(self.prefix, note_pl, None, self.dim)
|
col_n, pts_n = points_for_note(self.prefix, note_pl, None, self.dim)
|
||||||
upsert_batch(self.client, n_name, n_pts)
|
upsert_batch(self.client, col_n, pts_n, wait=True)
|
||||||
if chunk_pls and vecs:
|
|
||||||
upsert_batch(self.client, f"{self.prefix}_chunks", points_for_chunks(self.prefix, chunk_pls, vecs)[1])
|
|
||||||
if explicit_edges:
|
|
||||||
# WICHTIG: wait=True für Phase-1 Konsistenz
|
|
||||||
upsert_batch(self.client, f"{self.prefix}_edges", points_for_edges(self.prefix, explicit_edges)[1], wait=True)
|
|
||||||
|
|
||||||
logger.info(f" ✨ Phase 1 fertig: {len(chunk_pls)} Chunks, {len(explicit_edges)} explizite Kanten.")
|
if chunk_pls and vecs:
|
||||||
|
col_c, pts_c = points_for_chunks(self.prefix, chunk_pls, vecs)
|
||||||
|
upsert_batch(self.client, col_c, pts_c, wait=True)
|
||||||
|
|
||||||
|
if explicit_edges:
|
||||||
|
col_e, pts_e = points_for_edges(self.prefix, explicit_edges)
|
||||||
|
# WICHTIG: wait=True garantiert, dass die Kanten indiziert sind, bevor Phase 2 prüft
|
||||||
|
upsert_batch(self.client, col_e, pts_e, wait=True)
|
||||||
|
|
||||||
|
logger.info(f" ✨ Phase 1 fertig: {len(explicit_edges)} explizite Kanten für '{note_id}'.")
|
||||||
return {"status": "success", "note_id": note_id, "edges_count": len(explicit_edges)}
|
return {"status": "success", "note_id": note_id, "edges_count": len(explicit_edges)}
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|
@ -288,9 +294,10 @@ class IngestionService:
|
||||||
return {**result, "status": "error", "error": str(e)}
|
return {**result, "status": "error", "error": str(e)}
|
||||||
|
|
||||||
async def create_from_text(self, markdown_content: str, filename: str, vault_root: str, folder: str = "00_Inbox") -> Dict[str, Any]:
|
async def create_from_text(self, markdown_content: str, filename: str, vault_root: str, folder: str = "00_Inbox") -> Dict[str, Any]:
|
||||||
"""Erstellt eine Note aus einem Textstream."""
|
"""Erstellt eine Note aus einem Textstream und triggert die Ingestion."""
|
||||||
target_path = os.path.join(vault_root, folder, filename)
|
target_path = os.path.join(vault_root, folder, filename)
|
||||||
os.makedirs(os.path.dirname(target_path), exist_ok=True)
|
os.makedirs(os.path.dirname(target_path), exist_ok=True)
|
||||||
with open(target_path, "w", encoding="utf-8") as f: f.write(markdown_content)
|
with open(target_path, "w", encoding="utf-8") as f:
|
||||||
|
f.write(markdown_content)
|
||||||
await asyncio.sleep(0.1)
|
await asyncio.sleep(0.1)
|
||||||
return await self.process_file(file_path=target_path, vault_root=vault_root, apply=True, force_replace=True, purge_before=True)
|
return await self.process_file(file_path=target_path, vault_root=vault_root, apply=True, force_replace=True, purge_before=True)
|
||||||
|
|
@ -4,116 +4,237 @@
|
||||||
FILE: scripts/import_markdown.py
|
FILE: scripts/import_markdown.py
|
||||||
VERSION: 2.6.0 (2026-01-10)
|
VERSION: 2.6.0 (2026-01-10)
|
||||||
STATUS: Active (Core)
|
STATUS: Active (Core)
|
||||||
COMPATIBILITY: IngestionProcessor v3.3.7+
|
COMPATIBILITY: IngestionProcessor v3.4.1+
|
||||||
Zweck: Hauptwerkzeug zum Importieren von Markdown-Dateien.
|
|
||||||
Implementiert die globale 2-Phasen-Schreibstrategie.
|
Zweck:
|
||||||
|
-------
|
||||||
|
Hauptwerkzeug zum Importieren von Markdown-Dateien aus einem lokalen Obsidian-Vault in die
|
||||||
|
Qdrant Vektor-Datenbank. Das Script ist darauf optimiert, die strukturelle Integrität des
|
||||||
|
Wissensgraphen zu wahren und die manuelle Nutzer-Autorität vor automatisierten System-Eingriffen
|
||||||
|
zu schützen.
|
||||||
|
|
||||||
|
Hintergrund der 2-Phasen-Strategie (Authority-First):
|
||||||
|
------------------------------------------------------
|
||||||
|
Um das Problem der "Ghost-IDs" und der asynchronen Überschreibungen zu lösen, implementiert
|
||||||
|
dieses Script eine strikte Trennung der Schreibvorgänge:
|
||||||
|
|
||||||
|
1. PHASE 1: Authority Processing (Batch-Modus)
|
||||||
|
- Alle Dateien werden gescannt und verarbeitet.
|
||||||
|
- Notizen, Chunks und explizite (vom Nutzer gesetzte) Kanten werden sofort geschrieben.
|
||||||
|
- Durch die Verwendung von 'wait=True' in der Datenbank-Layer wird sichergestellt,
|
||||||
|
dass diese Informationen physisch indiziert sind, bevor der nächste Schritt erfolgt.
|
||||||
|
- Symmetrische Gegenkanten werden während dieser Phase lediglich im Speicher gepuffert.
|
||||||
|
|
||||||
|
2. PHASE 2: Global Symmetry Commitment (Finaler Schritt)
|
||||||
|
- Erst nach Abschluss aller Batches wird die Methode commit_vault_symmetries() aufgerufen.
|
||||||
|
- Diese prüft die gepufferten Symmetrie-Vorschläge gegen die bereits existierende
|
||||||
|
Nutzer-Autorität in der Datenbank.
|
||||||
|
- Existiert bereits eine manuelle Kante für dieselbe Verbindung, wird die automatische
|
||||||
|
Symmetrie unterdrückt.
|
||||||
|
|
||||||
|
Detaillierte Funktionsweise:
|
||||||
|
----------------------------
|
||||||
|
1. PASS 1: Global Pre-Scan
|
||||||
|
- Scannt rekursiv alle Markdown-Dateien im Vault.
|
||||||
|
- Schließt System-Ordner wie .trash, .obsidian, .sync sowie Vorlagen konsequent aus.
|
||||||
|
- Extrahiert Note-Kontext (ID, Titel, Dateiname) ohne DB-Schreibzugriff.
|
||||||
|
- Füllt den LocalBatchCache im IngestionService, der als Single-Source-of-Truth für
|
||||||
|
die spätere Link-Auflösung (Kanonisierung) dient.
|
||||||
|
- Dies stellt sicher, dass Wikilinks wie [[Klaus]] korrekt zu Zeitstempel-IDs wie
|
||||||
|
202601031726-klaus aufgelöst werden, BEVOR eine UUID für die Kante berechnet wird.
|
||||||
|
|
||||||
|
2. PASS 2: Semantic Processing
|
||||||
|
- Verarbeitet Dateien in festen Batches zu je 20 Dateien (batch_size im Script).
|
||||||
|
- Implementiert Cloud-Resilienz durch Semaphoren (max. 5 parallele Zugriffe).
|
||||||
|
- Nutzt die Mixture of Experts (MoE) Architektur zur semantischen Validierung von Links.
|
||||||
|
- Führt eine Hash-basierte Change Detection durch, um unnötige Schreibvorgänge zu vermeiden.
|
||||||
|
- Schreibt die Ergebnisse (Notes, Chunks, Explicit Edges) konsistent nach Qdrant.
|
||||||
|
|
||||||
|
Ergebnis-Interpretation:
|
||||||
|
------------------------
|
||||||
|
- Log-Ausgabe: Liefert detaillierte Informationen über den Fortschritt, LLM-Entscheidungen
|
||||||
|
und die finale Symmetrie-Validierung.
|
||||||
|
- Statistiken: Gibt am Ende eine Zusammenfassung über verarbeitete, übersprungene und
|
||||||
|
fehlerhafte Dateien aus.
|
||||||
|
- Dry-Run: Ohne den Parameter --apply werden keine physischen Änderungen an der Datenbank
|
||||||
|
vorgenommen; die Verarbeitung endet je Datei nach der Change Detection mit dem Status "dry-run" (ohne LLM-Anfragen, Embeddings und Upserts).
|
||||||
|
|
||||||
|
Verwendung:
|
||||||
|
-----------
|
||||||
|
- Regelmäßiger Import nach Änderungen im Vault.
|
||||||
|
- Initialer Aufbau eines neuen Wissensgraphen.
|
||||||
|
- Erzwingung einer Re-Indizierung mittels --force.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
import os
|
import os
|
||||||
import argparse
|
import argparse
|
||||||
import logging
|
import logging
|
||||||
import sys
|
import sys
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from typing import List, Dict, Any
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
# Root Logger Setup
|
# Root Logger Setup: INFO-Level für volle Transparenz der fachlichen Prozesse
|
||||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s')
|
logging.basicConfig(
|
||||||
|
level=logging.INFO,
|
||||||
|
format='%(asctime)s [%(levelname)s] %(message)s'
|
||||||
|
)
|
||||||
|
|
||||||
|
# Sicherstellung, dass das Root-Verzeichnis im Python-Pfad liegt
|
||||||
sys.path.append(os.getcwd())
|
sys.path.append(os.getcwd())
|
||||||
|
|
||||||
|
# App-spezifische Imports
|
||||||
from app.core.ingestion import IngestionService
|
from app.core.ingestion import IngestionService
|
||||||
from app.core.parser import pre_scan_markdown
|
from app.core.parser import pre_scan_markdown
|
||||||
|
|
||||||
logger = logging.getLogger("importer")
|
logger = logging.getLogger("importer")
|
||||||
|
|
||||||
async def main_async(args):
|
async def main_async(args):
|
||||||
|
"""
|
||||||
|
Haupt-Workflow der Ingestion. Koordiniert die zwei Durchläufe (Pass 1/2)
|
||||||
|
und die zwei Schreibphasen (Phase 1/2).
|
||||||
|
"""
|
||||||
vault_path = Path(args.vault).resolve()
|
vault_path = Path(args.vault).resolve()
|
||||||
if not vault_path.exists():
|
if not vault_path.exists():
|
||||||
logger.error(f"Vault path does not exist: {vault_path}")
|
logger.error(f"Vault-Pfad existiert nicht: {vault_path}")
|
||||||
return
|
return
|
||||||
|
|
||||||
|
# 1. Initialisierung des zentralen Ingestion-Services
|
||||||
logger.info(f"Initializing IngestionService (Prefix: {args.prefix})")
|
logger.info(f"Initializing IngestionService (Prefix: {args.prefix})")
|
||||||
service = IngestionService(collection_prefix=args.prefix)
|
service = IngestionService(collection_prefix=args.prefix)
|
||||||
|
|
||||||
logger.info(f"Scanning {vault_path}...")
|
logger.info(f"Scanning {vault_path}...")
|
||||||
all_files = list(vault_path.rglob("*.md"))
|
all_files_raw = list(vault_path.rglob("*.md"))
|
||||||
|
|
||||||
# --- GLOBALER ORDNER-FILTER ---
|
# --- GLOBALER ORDNER-FILTER ---
|
||||||
|
# Diese Liste stellt sicher, dass keine System-Leichen oder temporäre Dateien
|
||||||
|
# den Graphen korrumpieren oder zu ID-Kollisionen führen.
|
||||||
files = []
|
files = []
|
||||||
ignore_folders = [".trash", ".obsidian", ".sync", "templates", "_system"]
|
ignore_list = [".trash", ".obsidian", ".sync", "templates", "_system", ".git"]
|
||||||
for f in all_files:
|
|
||||||
|
for f in all_files_raw:
|
||||||
f_str = str(f)
|
f_str = str(f)
|
||||||
if not any(folder in f_str for folder in ignore_folders) and not "/." in f_str:
|
# Filtert Ordner aus der ignore_list und versteckte Verzeichnisse
|
||||||
|
if not any(folder in f_str for folder in ignore_list) and not "/." in f_str:
|
||||||
files.append(f)
|
files.append(f)
|
||||||
|
|
||||||
files.sort()
|
files.sort()
|
||||||
logger.info(f"Found {len(files)} relevant markdown files.")
|
logger.info(f"Found {len(files)} relevant markdown files (filtered trash/system/hidden).")
|
||||||
|
|
||||||
# =========================================================================
|
# =========================================================================
|
||||||
# PASS 1: Global Pre-Scan
|
# PASS 1: Global Pre-Scan
|
||||||
|
# Ziel: Aufbau eines vollständigen Mappings von Bezeichnungen zu stabilen IDs.
|
||||||
# =========================================================================
|
# =========================================================================
|
||||||
logger.info(f"🔍 [Pass 1] Pre-scanning files for global context cache...")
|
logger.info(f"🔍 [Pass 1] Global Pre-Scan: Building context cache for {len(files)} files...")
|
||||||
for f_path in files:
|
for f_path in files:
|
||||||
try:
|
try:
|
||||||
|
# Extrahiert Frontmatter und Metadaten ohne DB-Last
|
||||||
ctx = pre_scan_markdown(str(f_path))
|
ctx = pre_scan_markdown(str(f_path))
|
||||||
if ctx:
|
if ctx:
|
||||||
|
# Mehrfache Indizierung für maximale Trefferrate bei Wikilinks
|
||||||
service.batch_cache[ctx.note_id] = ctx
|
service.batch_cache[ctx.note_id] = ctx
|
||||||
service.batch_cache[ctx.title] = ctx
|
service.batch_cache[ctx.title] = ctx
|
||||||
fname = os.path.splitext(f_path.name)[0]
|
# Auch den Dateinamen ohne Endung als Alias hinterlegen
|
||||||
service.batch_cache[fname] = ctx
|
service.batch_cache[os.path.splitext(f_path.name)[0]] = ctx
|
||||||
except Exception: pass
|
except Exception as e:
|
||||||
|
logger.warning(f"⚠️ Pre-scan fehlgeschlagen für {f_path.name}: {e}")
|
||||||
|
|
||||||
# =========================================================================
|
# =========================================================================
|
||||||
# PHASE 1: Batch-Import (Notes & Explicit Edges)
|
# PHASE 1: Authority Processing (Batch-Lauf)
|
||||||
|
# Ziel: Verarbeitung der Dateiinhalte und Speicherung der Nutzer-Autorität.
|
||||||
# =========================================================================
|
# =========================================================================
|
||||||
stats = {"processed": 0, "skipped": 0, "errors": 0}
|
stats = {"processed": 0, "skipped": 0, "errors": 0}
|
||||||
|
# Semaphore begrenzt die Parallelität zum Schutz der lokalen oder Cloud-API
|
||||||
sem = asyncio.Semaphore(5)
|
sem = asyncio.Semaphore(5)
|
||||||
|
|
||||||
async def process_with_limit(f_path):
|
async def process_with_limit(f_path):
|
||||||
|
"""Kapselt den Prozess-Aufruf mit Ressourcen-Limitierung."""
|
||||||
async with sem:
|
async with sem:
|
||||||
try:
|
try:
|
||||||
# Nutzt process_file (v3.3.7)
|
# Verwendet process_file (v3.4.1), das explizite Kanten sofort schreibt
|
||||||
|
# und Symmetrien für Phase 2 im Service-Puffer sammelt.
|
||||||
return await service.process_file(
|
return await service.process_file(
|
||||||
file_path=str(f_path), vault_root=str(vault_path),
|
file_path=str(f_path),
|
||||||
force_replace=args.force, apply=args.apply, purge_before=True
|
vault_root=str(vault_path),
|
||||||
|
force_replace=args.force,
|
||||||
|
apply=args.apply,
|
||||||
|
purge_before=True
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return {"status": "error", "error": str(e), "path": str(f_path)}
|
return {"status": "error", "error": str(e), "path": str(f_path)}
|
||||||
|
|
||||||
|
logger.info(f"🚀 [Phase 1] Starting semantic processing in batches...")
|
||||||
|
|
||||||
batch_size = 20
|
batch_size = 20
|
||||||
for i in range(0, len(files), batch_size):
|
for i in range(0, len(files), batch_size):
|
||||||
batch = files[i:i+batch_size]
|
batch = files[i:i+batch_size]
|
||||||
logger.info(f"--- Processing Batch {i//batch_size + 1} ---")
|
logger.info(f"--- Processing Batch {i//batch_size + 1} ({len(batch)} files) ---")
|
||||||
|
|
||||||
|
# Parallelisierung innerhalb des Batches (begrenzt durch sem)
|
||||||
tasks = [process_with_limit(f) for f in batch]
|
tasks = [process_with_limit(f) for f in batch]
|
||||||
results = await asyncio.gather(*tasks)
|
results = await asyncio.gather(*tasks)
|
||||||
|
|
||||||
for res in results:
|
for res in results:
|
||||||
if res.get("status") == "success": stats["processed"] += 1
|
# Robuste Auswertung der Rückgabe-Dictionaries
|
||||||
elif res.get("status") == "error": stats["errors"] += 1
|
if not isinstance(res, dict):
|
||||||
else: stats["skipped"] += 1
|
stats["errors"] += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
status = res.get("status")
|
||||||
|
if status == "success":
|
||||||
|
stats["processed"] += 1
|
||||||
|
elif status == "error":
|
||||||
|
stats["errors"] += 1
|
||||||
|
logger.error(f"❌ Fehler in {res.get('path')}: {res.get('error')}")
|
||||||
|
elif status == "unchanged":
|
||||||
|
stats["skipped"] += 1
|
||||||
|
else:
|
||||||
|
stats["skipped"] += 1
|
||||||
|
|
||||||
# =========================================================================
|
# =========================================================================
|
||||||
# PHASE 2: Global Symmetry Injection (Nach Abschluss aller Batches)
|
# PHASE 2: Global Symmetry Commitment
|
||||||
|
# Ziel: Finale Integrität. Triggert erst, wenn Phase 1 komplett indiziert ist.
|
||||||
# =========================================================================
|
# =========================================================================
|
||||||
if args.apply:
|
if args.apply:
|
||||||
logger.info(f"🔄 [Phase 2] Starting global symmetry injection for the entire vault...")
|
logger.info(f"🔄 [Phase 2] Starting global symmetry injection for the entire vault...")
|
||||||
sym_res = await service.commit_vault_symmetries()
|
try:
|
||||||
if sym_res.get("status") == "success":
|
# Diese Methode prüft den Puffer gegen die nun vollständige Datenbank
|
||||||
logger.info(f"✅ Finished global symmetry injection. Added: {sym_res.get('added', 0)}")
|
sym_res = await service.commit_vault_symmetries()
|
||||||
|
if sym_res.get("status") == "success":
|
||||||
|
logger.info(f"✅ Phase 2 abgeschlossen. Hinzugefügt: {sym_res.get('added', 0)} geschützte Symmetrien.")
|
||||||
|
else:
|
||||||
|
logger.info(f"⏭️ Phase 2 übersprungen: {sym_res.get('reason', 'Keine Daten')}")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"❌ Fehler in Phase 2: {e}")
|
||||||
|
else:
|
||||||
|
logger.info("⏭️ [Phase 2] Dry-Run: Keine Symmetrie-Injektion durchgeführt.")
|
||||||
|
|
||||||
logger.info(f"Final Stats: {stats}")
|
logger.info(f"--- Import beendet ---")
|
||||||
|
logger.info(f"Statistiken: {stats}")
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
"""Einstiegspunkt und Argument-Parsing."""
|
||||||
load_dotenv()
|
load_dotenv()
|
||||||
|
|
||||||
|
# Standard-Präfix aus Umgebungsvariable oder Fallback
|
||||||
default_prefix = os.getenv("COLLECTION_PREFIX", "mindnet")
|
default_prefix = os.getenv("COLLECTION_PREFIX", "mindnet")
|
||||||
parser = argparse.ArgumentParser()
|
|
||||||
parser.add_argument("--vault", default="./vault")
|
parser = argparse.ArgumentParser(description="Mindnet Ingester: Two-Phase Markdown Import")
|
||||||
parser.add_argument("--prefix", default=default_prefix)
|
parser.add_argument("--vault", default="./vault", help="Pfad zum Obsidian Vault")
|
||||||
parser.add_argument("--force", action="store_true")
|
parser.add_argument("--prefix", default=default_prefix, help="Qdrant Collection Präfix")
|
||||||
parser.add_argument("--apply", action="store_true")
|
parser.add_argument("--force", action="store_true", help="Erzwingt Neu-Indizierung aller Dateien")
|
||||||
|
parser.add_argument("--apply", action="store_true", help="Schreibt physisch in die Datenbank")
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
asyncio.run(main_async(args))
|
asyncio.run(main_async(args))
|
||||||
|
except KeyboardInterrupt:
|
||||||
|
logger.info("Import durch Nutzer abgebrochen.")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.critical(f"FATAL ERROR: {e}")
|
logger.critical(f"FATALER FEHLER: {e}", exc_info=True)
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
Loading…
Reference in New Issue
Block a user