WP24c - Agentic Edge Validation & Chunk-Aware Multigraph-System (v4.5.8) #22
|
|
@ -2,81 +2,32 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
"""
|
"""
|
||||||
FILE: scripts/import_markdown.py
|
FILE: scripts/import_markdown.py
|
||||||
VERSION: 2.4.1 (2025-12-15)
|
VERSION: 2.5.0 (2026-01-10)
|
||||||
STATUS: Active (Core)
|
STATUS: Active (Core)
|
||||||
COMPATIBILITY: v2.9.1 (Post-WP14/WP-15b)
|
COMPATIBILITY: IngestionProcessor v3.3.5+
|
||||||
|
|
||||||
Zweck:
|
Zweck:
|
||||||
-------
|
-------
|
||||||
Hauptwerkzeug zum Importieren von Markdown-Dateien aus einem Vault in Qdrant.
|
Hauptwerkzeug zum Importieren von Markdown-Dateien aus einem Vault in Qdrant.
|
||||||
Implementiert den Two-Pass Workflow (WP-15b) für robuste Edge-Validierung.
|
Implementiert die globale 2-Phasen-Schreibstrategie.
|
||||||
|
|
||||||
Funktionsweise:
|
Änderungen v2.5.0:
|
||||||
---------------
|
------------------
|
||||||
1. PASS 1: Global Pre-Scan
|
- Globale Phasentrennung: commit_vault_symmetries() wird erst am Ende aufgerufen.
|
||||||
- Scannt alle Markdown-Dateien im Vault
|
- Erweiterter Ordner-Filter: Schließt .trash und andere Systemordner aus.
|
||||||
- Extrahiert Note-Kontext (ID, Titel, Dateiname)
|
|
||||||
- Füllt LocalBatchCache für semantische Edge-Validierung
|
|
||||||
- Indiziert nach ID, Titel und Dateiname für Link-Auflösung
|
|
||||||
|
|
||||||
2. PASS 2: Semantic Processing
|
|
||||||
- Verarbeitet Dateien in Batches (20 Dateien, max. 5 parallel)
|
|
||||||
- Nutzt gefüllten Cache für binäre Edge-Validierung
|
|
||||||
- Erzeugt Notes, Chunks und Edges in Qdrant
|
|
||||||
- Respektiert Hash-basierte Change Detection
|
|
||||||
|
|
||||||
Ergebnis-Interpretation:
|
|
||||||
------------------------
|
|
||||||
- Log-Ausgabe: Fortschritt und Statistiken
|
|
||||||
- Stats: processed, skipped, errors
|
|
||||||
- Exit-Code 0: Erfolgreich (auch wenn einzelne Dateien Fehler haben)
|
|
||||||
- Ohne --apply: Dry-Run (keine DB-Änderungen)
|
|
||||||
|
|
||||||
Verwendung:
|
|
||||||
-----------
|
|
||||||
- Regelmäßiger Import nach Vault-Änderungen
|
|
||||||
- Initial-Import eines neuen Vaults
|
|
||||||
- Re-Indexierung mit --force
|
|
||||||
|
|
||||||
Hinweise:
|
|
||||||
---------
|
|
||||||
- Two-Pass Workflow sorgt für robuste Edge-Validierung
|
|
||||||
- Change Detection verhindert unnötige Re-Indexierung
|
|
||||||
- Parallele Verarbeitung für Performance (max. 5 gleichzeitig)
|
|
||||||
- Cloud-Resilienz durch Semaphore-Limits
|
|
||||||
|
|
||||||
Aufruf:
|
|
||||||
-------
|
|
||||||
python3 -m scripts.import_markdown --vault ./vault --apply
|
|
||||||
python3 -m scripts.import_markdown --vault ./vault --prefix mindnet_dev --force --apply
|
|
||||||
|
|
||||||
Parameter:
|
|
||||||
----------
|
|
||||||
--vault PATH Pfad zum Vault-Verzeichnis (Default: ./vault)
|
|
||||||
--prefix TEXT Collection-Präfix (Default: ENV COLLECTION_PREFIX oder mindnet)
|
|
||||||
--force Erzwingt Re-Indexierung aller Dateien (ignoriert Hashes)
|
|
||||||
--apply Führt tatsächliche DB-Schreibvorgänge durch (sonst Dry-Run)
|
|
||||||
|
|
||||||
Änderungen:
|
|
||||||
-----------
|
|
||||||
v2.4.1 (2025-12-15): WP-15b Two-Pass Workflow
|
|
||||||
- Implementiert Pre-Scan für LocalBatchCache
|
|
||||||
- Indizierung nach ID, Titel und Dateiname
|
|
||||||
- Batch-Verarbeitung mit Semaphore-Limits
|
|
||||||
v2.0.0: Initial Release
|
|
||||||
"""
|
"""
|
||||||
import asyncio
|
import asyncio
|
||||||
import os
|
import os
|
||||||
import argparse
|
import argparse
|
||||||
import logging
|
import logging
|
||||||
|
import sys
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
# Setzt das Level global auf INFO, damit der Fortschritt im Log sichtbar ist
|
# Setzt das Level global auf INFO
|
||||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s')
|
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s')
|
||||||
|
|
||||||
# Importiere den neuen Async Service und stelle Python-Pfad sicher
|
# Stelle sicher, dass das Root-Verzeichnis im Python-Pfad ist
|
||||||
import sys
|
|
||||||
sys.path.append(os.getcwd())
|
sys.path.append(os.getcwd())
|
||||||
|
|
||||||
from app.core.ingestion import IngestionService
|
from app.core.ingestion import IngestionService
|
||||||
|
|
@ -95,97 +46,84 @@ async def main_async(args):
|
||||||
service = IngestionService(collection_prefix=args.prefix)
|
service = IngestionService(collection_prefix=args.prefix)
|
||||||
|
|
||||||
logger.info(f"Scanning {vault_path}...")
|
logger.info(f"Scanning {vault_path}...")
|
||||||
files = list(vault_path.rglob("*.md"))
|
all_files = list(vault_path.rglob("*.md"))
|
||||||
# Exclude .obsidian folder if present
|
|
||||||
files = [f for f in files if ".obsidian" not in str(f)]
|
# --- ORDNER-FILTER ---
|
||||||
|
files = []
|
||||||
|
ignore_folders = [".trash", ".obsidian", ".sync", "templates", "_system"]
|
||||||
|
for f in all_files:
|
||||||
|
f_str = str(f)
|
||||||
|
if not any(folder in f_str for folder in ignore_folders) and not "/." in f_str:
|
||||||
|
files.append(f)
|
||||||
|
|
||||||
files.sort()
|
files.sort()
|
||||||
|
logger.info(f"Found {len(files)} relevant markdown files.")
|
||||||
logger.info(f"Found {len(files)} markdown files.")
|
|
||||||
|
|
||||||
# =========================================================================
|
# =========================================================================
|
||||||
# PASS 1: Global Pre-Scan (WP-15b Harvester)
|
# PASS 1: Global Pre-Scan
|
||||||
# Füllt den LocalBatchCache für die semantische Kanten-Validierung.
|
|
||||||
# Nutzt ID, Titel und Filename für robusten Look-up.
|
|
||||||
# =========================================================================
|
# =========================================================================
|
||||||
logger.info(f"🔍 [Pass 1] Pre-scanning {len(files)} files for global context cache...")
|
logger.info(f"🔍 [Pass 1] Pre-scanning files for global context cache...")
|
||||||
for f_path in files:
|
for f_path in files:
|
||||||
try:
|
try:
|
||||||
ctx = pre_scan_markdown(str(f_path))
|
ctx = pre_scan_markdown(str(f_path))
|
||||||
if ctx:
|
if ctx:
|
||||||
# 1. Look-up via Note ID (UUID oder Frontmatter ID)
|
|
||||||
service.batch_cache[ctx.note_id] = ctx
|
service.batch_cache[ctx.note_id] = ctx
|
||||||
|
|
||||||
# 2. Look-up via Titel (Wichtig für Wikilinks [[Titel]])
|
|
||||||
service.batch_cache[ctx.title] = ctx
|
service.batch_cache[ctx.title] = ctx
|
||||||
|
|
||||||
# 3. Look-up via Dateiname (Wichtig für Wikilinks [[Filename]])
|
|
||||||
fname = os.path.splitext(f_path.name)[0]
|
fname = os.path.splitext(f_path.name)[0]
|
||||||
service.batch_cache[fname] = ctx
|
service.batch_cache[fname] = ctx
|
||||||
|
except Exception: pass
|
||||||
except Exception as e:
|
|
||||||
logger.warning(f"⚠️ Could not pre-scan {f_path.name}: {e}")
|
|
||||||
|
|
||||||
logger.info(f"✅ Context Cache populated for {len(files)} notes.")
|
|
||||||
|
|
||||||
# =========================================================================
|
# =========================================================================
|
||||||
# PASS 2: Processing (Semantic Batch-Verarbeitung)
|
# PHASE 1: Batch-Import (Explicit Edges only)
|
||||||
# Nutzt den gefüllten Cache zur binären Validierung semantischer Kanten.
|
|
||||||
# =========================================================================
|
# =========================================================================
|
||||||
stats = {"processed": 0, "skipped": 0, "errors": 0}
|
stats = {"processed": 0, "skipped": 0, "errors": 0}
|
||||||
sem = asyncio.Semaphore(5) # Max 5 parallele Dateien für Cloud-Stabilität
|
sem = asyncio.Semaphore(5)
|
||||||
|
|
||||||
async def process_with_limit(f_path):
|
async def process_with_limit(f_path):
|
||||||
async with sem:
|
async with sem:
|
||||||
try:
|
try:
|
||||||
# Nutzt den nun gefüllten Batch-Cache in der process_file Logik
|
return await service.process_file(
|
||||||
res = await service.process_file(
|
file_path=str(f_path), vault_root=str(vault_path),
|
||||||
file_path=str(f_path),
|
force_replace=args.force, apply=args.apply, purge_before=True
|
||||||
vault_root=str(vault_path),
|
|
||||||
force_replace=args.force,
|
|
||||||
apply=args.apply,
|
|
||||||
purge_before=True
|
|
||||||
)
|
)
|
||||||
return res
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return {"status": "error", "error": str(e), "path": str(f_path)}
|
return {"status": "error", "error": str(e), "path": str(f_path)}
|
||||||
|
|
||||||
logger.info(f"🚀 [Pass 2] Starting semantic processing in batches...")
|
|
||||||
|
|
||||||
batch_size = 20
|
batch_size = 20
|
||||||
for i in range(0, len(files), batch_size):
|
for i in range(0, len(files), batch_size):
|
||||||
batch = files[i:i+batch_size]
|
batch = files[i:i+batch_size]
|
||||||
logger.info(f"Processing batch {i} to {i+len(batch)}...")
|
logger.info(f"--- Processing Batch {i//batch_size + 1} ---")
|
||||||
|
|
||||||
tasks = [process_with_limit(f) for f in batch]
|
tasks = [process_with_limit(f) for f in batch]
|
||||||
results = await asyncio.gather(*tasks)
|
results = await asyncio.gather(*tasks)
|
||||||
|
|
||||||
for res in results:
|
for res in results:
|
||||||
if res.get("status") == "success":
|
if res.get("status") == "success": stats["processed"] += 1
|
||||||
stats["processed"] += 1
|
elif res.get("status") == "error": stats["errors"] += 1
|
||||||
elif res.get("status") == "error":
|
else: stats["skipped"] += 1
|
||||||
stats["errors"] += 1
|
|
||||||
logger.error(f"Error in {res.get('path')}: {res.get('error')}")
|
|
||||||
else:
|
|
||||||
stats["skipped"] += 1
|
|
||||||
|
|
||||||
logger.info(f"Done. Stats: {stats}")
|
# =========================================================================
|
||||||
if not args.apply:
|
# PHASE 2: Global Symmetry Injection
|
||||||
logger.info("DRY RUN. Use --apply to write to DB.")
|
# =========================================================================
|
||||||
|
if args.apply:
|
||||||
|
logger.info(f"🔄 [Phase 2] Starting global symmetry injection...")
|
||||||
|
sym_res = await service.commit_vault_symmetries()
|
||||||
|
if sym_res.get("status") == "success":
|
||||||
|
logger.info(f"✅ Added {sym_res.get('added', 0)} protected symmetry edges.")
|
||||||
|
|
||||||
|
logger.info(f"Done. Final Stats: {stats}")
|
||||||
|
|
||||||
def main():
    """CLI entry point: parse arguments and run the async import workflow.

    Reads environment defaults via .env, builds the argument parser, and
    drives ``main_async``. Exits with a non-zero status on fatal errors so
    cron jobs / CI can detect a failed run (previously the error was logged
    but the process still exited 0).
    """
    load_dotenv()  # pull COLLECTION_PREFIX / Qdrant settings from .env if present
    default_prefix = os.getenv("COLLECTION_PREFIX", "mindnet")

    # Restored description/help texts (present in v2.4.1, dropped in v2.5.0);
    # purely additive and backward-compatible for all existing invocations.
    parser = argparse.ArgumentParser(description="Two-pass Markdown ingestion into Qdrant")
    parser.add_argument("--vault", default="./vault", help="Path to the vault root directory")
    parser.add_argument("--prefix", default=default_prefix, help="Collection prefix (default: ENV COLLECTION_PREFIX or 'mindnet')")
    parser.add_argument("--force", action="store_true", help="Force re-indexing of all files (ignore content hashes)")
    parser.add_argument("--apply", action="store_true", help="Perform actual DB writes (otherwise dry-run)")
    args = parser.parse_args()

    try:
        # Run the asynchronous two-pass import loop.
        asyncio.run(main_async(args))
    except Exception as e:
        logger.critical(f"FATAL ERROR: {e}")
        # Fix: propagate failure to the shell instead of silently exiting 0.
        raise SystemExit(1)


if __name__ == "__main__":
    main()
|
||||||
Loading…
Reference in New Issue
Block a user