mindnet/scripts/import_markdown.py
Lars fb4bf79841
Some checks failed
Deploy mindnet to llm-node / deploy (push) Failing after 1s
scripts/import_markdown.py aktualisiert
2025-09-05 11:35:46 +02:00

186 lines
7.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Script: scripts/import_markdown.py
Version: v2.3.1 (2025-09-05)
Beschreibung
Importiert Markdown-Notizen in Qdrant (Notes, Chunks, Edges).
- Chunking + Embedding (MiniLM 384d, externer Embed-Server)
- Edges direkt beim Import aus Wikilinks ([[…]]) ableiten
- Idempotente UUIDv5-IDs; Collections werden bereitgestellt
Aufruf
python3 -m scripts.import_markdown --vault ./vault [--apply] [--note-id ID] [--embed-note] [--force-replace]
Parameter
--vault Pfad zum Obsidian-Vault (erforderlich)
--apply Ohne Flag: Dry-Run (nur JSON-Zeilen). Mit Flag: schreibt in Qdrant.
--note-id Nur eine spezifische Note-ID verarbeiten
--embed-note Optional: Note-Volltext zusätzlich einbetten
--force-replace Vor Upsert: zugehörige Edges der Quell-Note in Qdrant löschen (harte Ersetzung)
Hinweise
- Im venv laufen: `source .venv/bin/activate`
- Erwartet laufenden Embed-Server (http://127.0.0.1:8990)
- Qdrant via env: QDRANT_URL, QDRANT_API_KEY, COLLECTION_PREFIX, VECTOR_DIM
Changelog
v2.3.1: FIX Für derive_wikilink_edges werden jetzt echte Chunk-Texte übergeben
(chunks_for_links mit {"chunk_id","text"}), damit `references_at` erzeugt werden.
v2.3.0: Umstellung auf app.core.derive_edges; Edge-IDs unterstützen Occurrence.
v2.2.x: Fix Filter-API (pydantic v2) bei Purge; Kleinkosmetik.
"""
from __future__ import annotations
import argparse, os, glob, json, sys
from dotenv import load_dotenv
from app.core.parser import read_markdown, normalize_frontmatter, validate_required_frontmatter
from app.core.note_payload import make_note_payload
from app.core.validate_note import validate_note_payload
from app.core.chunker import assemble_chunks
from app.core.chunk_payload import make_chunk_payloads
from app.core.embed import embed_texts, embed_one
from app.core.qdrant import QdrantConfig, ensure_collections, get_client
from app.core.qdrant_points import points_for_chunks, points_for_note, points_for_edges, upsert_batch
from app.core.derive_edges import build_note_index, derive_wikilink_edges # nutzt 'text' je Chunk :contentReference[oaicite:1]{index=1}
from qdrant_client.http import models as rest
def iter_md(root: str, exclude_dirs=("/.obsidian/", "/_backup_frontmatter/", "/_imported/")):
    """Recursively collect Markdown files under *root*.

    Args:
        root: Vault root directory to scan.
        exclude_dirs: Forward-slash path fragments; any file whose
            normalized path contains one of them is skipped.

    Returns:
        List of paths (OS-native separators) to ``*.md`` files.
    """
    out = []
    # Single pass over the glob results; the original wrapped glob.glob in a
    # redundant list comprehension and materialized an intermediate list.
    for p in glob.glob(os.path.join(root, "**", "*.md"), recursive=True):
        # Normalize to forward slashes so exclusion fragments match on Windows too.
        pn = p.replace("\\", "/")
        if any(ex in pn for ex in exclude_dirs):
            continue
        out.append(p)
    return out
def purge_note_edges(client, prefix: str, source_note_id: str):
    """Delete all edges belonging to a source note.

    Matches every edge whose ``source_id`` equals *source_note_id*, plus
    backlink edges pointing back at it (``kind == "backlink"`` and
    ``target_id == source_note_id``), and removes them from the
    ``{prefix}_edges`` collection.
    """
    outgoing = rest.FieldCondition(
        key="source_id", match=rest.MatchValue(value=source_note_id)
    )
    incoming_backlinks = rest.Filter(
        must=[
            rest.FieldCondition(key="kind", match=rest.MatchValue(value="backlink")),
            rest.FieldCondition(
                key="target_id", match=rest.MatchValue(value=source_note_id)
            ),
        ]
    )
    selector = rest.Filter(should=[outgoing, incoming_backlinks])
    # pydantic v2 client API: client.delete(...) replaced delete_points(...)
    client.delete(
        collection_name=f"{prefix}_edges", points_selector=selector, wait=True
    )
def main():
    """CLI entry point: import Markdown notes from an Obsidian vault into Qdrant.

    Dry run by default (one JSON summary line per note on stdout); writes
    notes, chunks and edges to Qdrant only when ``--apply`` is given.
    Connection settings come from the environment (``.env`` supported).
    """
    load_dotenv()  # pick up QDRANT_URL, QDRANT_API_KEY, etc. from a local .env
    ap = argparse.ArgumentParser()
    ap.add_argument("--vault", required=True, help="Pfad zum Obsidian Vault (z.B. ./vault)")
    ap.add_argument("--apply", action="store_true", help="Schreibt in Qdrant (sonst Dry-Run)")
    ap.add_argument("--note-id", help="Nur eine Note-ID verarbeiten")
    ap.add_argument("--embed-note", action="store_true", help="Auch Note-Volltext einbetten (optional)")
    ap.add_argument("--force-replace", action="store_true", help="Vor Upsert alte Edges der Note löschen")
    args = ap.parse_args()
    # Qdrant configuration from environment, with local-dev defaults
    cfg = QdrantConfig(
        url=os.getenv("QDRANT_URL", "http://127.0.0.1:6333"),
        api_key=os.getenv("QDRANT_API_KEY") or None,
        prefix=os.getenv("COLLECTION_PREFIX", "mindnet"),
        dim=int(os.getenv("VECTOR_DIM","384")),
    )
    client = get_client(cfg)
    ensure_collections(client, cfg.prefix, cfg.dim)
    root = os.path.abspath(args.vault)
    files = iter_md(root)
    if not files:
        print("Keine Markdown-Dateien gefunden.", file=sys.stderr); sys.exit(2)
    # --- Note index (for robust wikilink target resolution) ---
    # First pass: collect id/title/path stubs of every valid note so that
    # link targets can be resolved regardless of processing order.
    note_stubs = []
    for path in files:
        parsed = read_markdown(path)
        fm = normalize_frontmatter(parsed.frontmatter)
        try:
            validate_required_frontmatter(fm)
        except Exception:
            # Notes with missing/invalid frontmatter are skipped silently.
            continue
        if args.note_id and fm.get("id") != args.note_id:
            # NOTE(review): with --note-id the index contains only that one
            # note, so links to other notes may not resolve — confirm intended.
            continue
        rel = os.path.relpath(parsed.path, root).replace("\\","/")
        note_stubs.append({"note_id": fm["id"], "title": fm.get("title",""), "path": rel})
    note_index = build_note_index(note_stubs)
    total_notes = 0
    # Second pass: chunk, embed, derive edges, and (with --apply) upsert.
    for path in files:
        parsed = read_markdown(path)
        fm = normalize_frontmatter(parsed.frontmatter)
        try:
            validate_required_frontmatter(fm)
        except Exception:
            continue
        if args.note_id and fm.get("id") != args.note_id:
            continue
        total_notes += 1
        # Note payload
        note_pl = make_note_payload(parsed, vault_root=root)
        validate_note_payload(note_pl)
        note_pl["fulltext"] = parsed.body  # required by derive_edges
        # Chunks
        chunks = assemble_chunks(fm["id"], parsed.body, fm.get("type", "concept"))
        chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks)
        # Chunk embeddings via the external embed server
        texts = [ch.text for ch in chunks]
        vectors = embed_texts(texts)
        # Optional: embed the whole note body as well
        note_vec = embed_one(parsed.body) if args.embed_note else None
        # --- IMPORTANT: pass real chunk texts to derive_wikilink_edges ---
        # (v2.3.1 fix: enables `references_at` on the derived edges)
        chunks_for_links = [
            {"chunk_id": (pl.get("chunk_id") or pl.get("id") or f"{fm['id']}#{i+1}"),
             "text": chunks[i].text}
            for i, pl in enumerate(chunk_pls)
            if i < len(chunks)
        ]
        # Edges (note- and chunk-level)
        edges = derive_wikilink_edges(note_pl, chunks_for_links, note_index)
        # Dry-run output: one JSON summary line per processed note
        print(json.dumps({
            "note_id": fm["id"],
            "title": fm["title"],
            "chunks": len(chunk_pls),
            "edges": len(edges),
            "path": note_pl["path"]
        }, ensure_ascii=False))
        if args.apply:
            if args.force_replace:
                # Hard replacement: drop this note's existing edges first.
                purge_note_edges(client, cfg.prefix, fm["id"])
            # Notes upsert
            notes_col, note_pts = points_for_note(cfg.prefix, note_pl, note_vec, cfg.dim)
            upsert_batch(client, notes_col, note_pts)
            # Chunks upsert
            chunks_col, chunk_pts = points_for_chunks(cfg.prefix, chunk_pls, vectors)
            upsert_batch(client, chunks_col, chunk_pts)
            # Edges upsert
            edges_col, edge_pts = points_for_edges(cfg.prefix, edges)
            upsert_batch(client, edges_col, edge_pts)
    print(f"Done. Processed notes: {total_notes}")
# Script entry point (also runnable as `python3 -m scripts.import_markdown`).
if __name__ == "__main__":
    main()