129 lines
4.6 KiB
Python
129 lines
4.6 KiB
Python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
FILE: scripts/import_markdown.py
VERSION: 2.5.0 (2026-01-10)
STATUS: Active (Core)
COMPATIBILITY: IngestionProcessor v3.3.5+

Purpose:
--------
Main tool for importing markdown files from a vault into Qdrant.
Implements the global two-phase write strategy.

Changes v2.5.0:
------------------
- Global phase separation: commit_vault_symmetries() is only called at the very end.
- Extended folder filter: excludes .trash and other system folders.
"""
import asyncio
import os
import argparse
import logging
import sys
from pathlib import Path
from dotenv import load_dotenv

# Sets the logging level globally to INFO
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s')

# Make sure the project root directory is on the Python path
# (required so the `app.core.*` imports below resolve when the script
# is run from the repository root).
sys.path.append(os.getcwd())

from app.core.ingestion import IngestionService
from app.core.parser import pre_scan_markdown

logger = logging.getLogger("importer")
|
|
|
|
async def main_async(args):
|
|
vault_path = Path(args.vault).resolve()
|
|
if not vault_path.exists():
|
|
logger.error(f"Vault path does not exist: {vault_path}")
|
|
return
|
|
|
|
# 1. Service initialisieren
|
|
logger.info(f"Initializing IngestionService (Prefix: {args.prefix})")
|
|
service = IngestionService(collection_prefix=args.prefix)
|
|
|
|
logger.info(f"Scanning {vault_path}...")
|
|
all_files = list(vault_path.rglob("*.md"))
|
|
|
|
# --- ORDNER-FILTER ---
|
|
files = []
|
|
ignore_folders = [".trash", ".obsidian", ".sync", "templates", "_system"]
|
|
for f in all_files:
|
|
f_str = str(f)
|
|
if not any(folder in f_str for folder in ignore_folders) and not "/." in f_str:
|
|
files.append(f)
|
|
|
|
files.sort()
|
|
logger.info(f"Found {len(files)} relevant markdown files.")
|
|
|
|
# =========================================================================
|
|
# PASS 1: Global Pre-Scan
|
|
# =========================================================================
|
|
logger.info(f"🔍 [Pass 1] Pre-scanning files for global context cache...")
|
|
for f_path in files:
|
|
try:
|
|
ctx = pre_scan_markdown(str(f_path))
|
|
if ctx:
|
|
service.batch_cache[ctx.note_id] = ctx
|
|
service.batch_cache[ctx.title] = ctx
|
|
fname = os.path.splitext(f_path.name)[0]
|
|
service.batch_cache[fname] = ctx
|
|
except Exception: pass
|
|
|
|
# =========================================================================
|
|
# PHASE 1: Batch-Import (Explicit Edges only)
|
|
# =========================================================================
|
|
stats = {"processed": 0, "skipped": 0, "errors": 0}
|
|
sem = asyncio.Semaphore(5)
|
|
|
|
async def process_with_limit(f_path):
|
|
async with sem:
|
|
try:
|
|
return await service.process_file(
|
|
file_path=str(f_path), vault_root=str(vault_path),
|
|
force_replace=args.force, apply=args.apply, purge_before=True
|
|
)
|
|
except Exception as e:
|
|
return {"status": "error", "error": str(e), "path": str(f_path)}
|
|
|
|
batch_size = 20
|
|
for i in range(0, len(files), batch_size):
|
|
batch = files[i:i+batch_size]
|
|
logger.info(f"--- Processing Batch {i//batch_size + 1} ---")
|
|
tasks = [process_with_limit(f) for f in batch]
|
|
results = await asyncio.gather(*tasks)
|
|
for res in results:
|
|
if res.get("status") == "success": stats["processed"] += 1
|
|
elif res.get("status") == "error": stats["errors"] += 1
|
|
else: stats["skipped"] += 1
|
|
|
|
# =========================================================================
|
|
# PHASE 2: Global Symmetry Injection
|
|
# =========================================================================
|
|
if args.apply:
|
|
logger.info(f"🔄 [Phase 2] Starting global symmetry injection...")
|
|
sym_res = await service.commit_vault_symmetries()
|
|
if sym_res.get("status") == "success":
|
|
logger.info(f"✅ Added {sym_res.get('added', 0)} protected symmetry edges.")
|
|
|
|
logger.info(f"Done. Final Stats: {stats}")
|
|
|
|
def main():
    """CLI entry point: parse arguments and run the async importer.

    Exits with status 1 on a fatal error so callers (cron, CI) can
    detect the failure.
    """
    load_dotenv()  # allow COLLECTION_PREFIX to come from a .env file
    default_prefix = os.getenv("COLLECTION_PREFIX", "mindnet")

    parser = argparse.ArgumentParser(
        description="Import markdown files from a vault into Qdrant "
                    "(global two-phase write strategy)."
    )
    parser.add_argument("--vault", default="./vault",
                        help="Path to the vault root directory.")
    parser.add_argument("--prefix", default=default_prefix,
                        help="Collection prefix (default: $COLLECTION_PREFIX or 'mindnet').")
    parser.add_argument("--force", action="store_true",
                        help="Force replacement of already existing notes.")
    parser.add_argument("--apply", action="store_true",
                        help="Apply writes (also enables Phase 2 symmetry injection).")
    args = parser.parse_args()

    try:
        asyncio.run(main_async(args))
    except Exception as e:
        # FIX: include the traceback (exc_info) and exit non-zero —
        # previously only str(e) was logged and the process exited 0.
        logger.critical(f"FATAL ERROR: {e}", exc_info=True)
        sys.exit(1)
|
|
|
|
# Script entry point when run directly (not on import).
if __name__ == "__main__":
    main()