scripts/import_markdown.py aktualisiert
Some checks failed
Deploy mindnet to llm-node / deploy (push) Failing after 1s

This commit is contained in:
Lars 2025-09-05 07:41:28 +02:00
parent f7b1995b08
commit 364502244a

View File

@ -2,20 +2,22 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
""" """
Name: scripts/import_markdown.py Name: scripts/import_markdown.py
Version: v2.1.1 (2025-09-05) Version: v2.2.0 (2025-09-05)
Kurzbeschreibung: Kurzbeschreibung:
Importiert Obsidian-Markdown-Notes in Qdrant (Notes/Chunks/Edges). Importiert Obsidian-Markdown-Notes in Qdrant (Notes/Chunks/Edges).
Leitet Wikilink-Edges (references/backlink/references_at) direkt beim Import - Leitet Wikilink-Edges (references/backlink/references_at) direkt aus Volltext + echten Chunk-Texten ab.
aus Volltext + echten Chunk-Texten ab. Keine Abhängigkeit zu edges.py (Legacy). - Idempotenz: Ermittelt hash_fulltext; bei Änderung werden alte Chunks/Edges der Note entfernt (Replace-on-Change).
- Unveränderte Noten werden übersprungen (schnell).
Aufruf (aus Projekt-Root, im venv): Aufruf (aus Projekt-Root, im venv):
python3 -m scripts.import_markdown --vault ./vault [--apply] [--note-id NOTE_ID] [--embed-note] python3 -m scripts.import_markdown --vault ./vault [--apply] [--note-id NOTE_ID] [--embed-note] [--force-replace]
Parameter: Parameter:
--vault Pfad zum Vault (z. B. ./vault) --vault Pfad zum Vault (z. B. ./vault)
--apply Führt Upserts in Qdrant aus (ohne Flag = Dry-Run mit JSON-Summaries) --apply Führt Upserts in Qdrant aus (ohne Flag = Dry-Run mit JSON-Summaries)
--note-id Bearbeite nur eine konkrete Note-ID --note-id Bearbeite nur eine konkrete Note-ID
--embed-note Optional: Note-Vektor (Volltext) zusätzlich einbetten --embed-note Optional: Note-Vektor (Volltext) zusätzlich einbetten
--force-replace Erzwingt Purge & Neuaufbau auch ohne Hash-Änderung (Debug)
Umgebungsvariablen (optional): Umgebungsvariablen (optional):
QDRANT_URL, QDRANT_API_KEY, COLLECTION_PREFIX, VECTOR_DIM (Default 384) QDRANT_URL, QDRANT_API_KEY, COLLECTION_PREFIX, VECTOR_DIM (Default 384)
@ -23,49 +25,39 @@ Umgebungsvariablen (optional):
Exitcodes: Exitcodes:
0 = OK, 2 = keine Markdown-Dateien gefunden 0 = OK, 2 = keine Markdown-Dateien gefunden
Wichtige Hinweise: Hinweise:
- Verwendet ausschließlich app.core.derive_edges für Wikilink-Kanten. - Wikilink-Ableitung basiert auf app.core.derive_edges (Slug-/ID-Auflösung, unresolved-Status).
- Chunk-Texte werden an derive_wikilink_edges übergeben; nur so entstehen references_at. - Für references_at werden echte Chunk-Texte übergeben (sonst würden sie fehlen).
- Edges-IDs sind deterministisch (UUIDv5 in qdrant_points), idempotent. - Purge verwendet Qdrant-Filter:
- Edges-Collection bleibt 1D-Dummy-Vektor (DOT), reine Payload-Nutzung. * Chunks: payload.note_id == NOTE_ID
* Edges : (source_id == NOTE_ID) OR (target_id == NOTE_ID) OR (source_id startswith NOTE_ID + "#")
- Notes/Chunks/Edges bleiben 1:1 kompatibel zu Validator & Backfill.
Changelog: Changelog:
v2.1.1: Entfernt Abhängigkeit auf app.core.edges; direkte Nutzung von derive_wikilink_edges; v2.2.0: Hash-basierte Replace-on-Change-Logik; Purge pro Note; Skip unverändert.
sichert Übergabe der Chunk-Texte references_at werden erzeugt. v2.1.1: Sicherstellung references_at durch Übergabe echter Chunk-Texte.
v2.1.0: Note-Index über gesamten Vault; erste Fassung mit direkter Edge-Ableitung. v2.1.0: Vorab-Note-Index über Vault; direkte Edge-Ableitung.
v2.0.x: Import Notes/Chunks/Embeddings, Edges via Backfill (separat).
""" """
from __future__ import annotations from __future__ import annotations
import argparse import argparse, os, glob, json, sys, hashlib
import os from typing import List, Dict, Tuple
import glob
import json
import sys
from typing import List, Dict
from dotenv import load_dotenv from dotenv import load_dotenv
from qdrant_client import QdrantClient
from qdrant_client.http import models as rest
# Core-Module (gemäß Projektstruktur) from app.core.parser import read_markdown, normalize_frontmatter, validate_required_frontmatter
from app.core.parser import (
read_markdown,
normalize_frontmatter,
validate_required_frontmatter,
)
from app.core.note_payload import make_note_payload from app.core.note_payload import make_note_payload
from app.core.validate_note import validate_note_payload from app.core.validate_note import validate_note_payload
from app.core.chunker import assemble_chunks from app.core.chunker import assemble_chunks
from app.core.chunk_payload import make_chunk_payloads from app.core.chunk_payload import make_chunk_payloads
from app.core.embed import embed_texts, embed_one from app.core.embed import embed_texts, embed_one
from app.core.qdrant import QdrantConfig, ensure_collections, get_client from app.core.qdrant import QdrantConfig, ensure_collections, get_client
from app.core.qdrant_points import ( from app.core.qdrant_points import points_for_chunks, points_for_note, points_for_edges, upsert_batch
points_for_chunks, from app.core.derive_edges import build_note_index, derive_wikilink_edges # Wikilinks
points_for_note,
points_for_edges,
upsert_batch,
)
# Nur Wikilink-Ableitung (keine Kollision mit edges.py):
from app.core.derive_edges import build_note_index, derive_wikilink_edges
# -------------------
# Utility / Helpers
# -------------------
def iter_md(root: str, exclude=("/.obsidian/", "/_backup_frontmatter/", "/_imported/")) -> List[str]: def iter_md(root: str, exclude=("/.obsidian/", "/_backup_frontmatter/", "/_imported/")) -> List[str]:
files = [p for p in glob.glob(os.path.join(root, "**", "*.md"), recursive=True)] files = [p for p in glob.glob(os.path.join(root, "**", "*.md"), recursive=True)]
@ -77,31 +69,70 @@ def iter_md(root: str, exclude=("/.obsidian/", "/_backup_frontmatter/", "/_impor
out.append(p) out.append(p)
return out return out
def minimal_note_index_payload(abs_path: str, vault_root: str) -> Dict:
    """Build the minimal payload the wikilink resolver needs (id/title/path)."""
    note = read_markdown(abs_path)
    meta = normalize_frontmatter(note.frontmatter)
    validate_required_frontmatter(meta)
    rel = os.path.relpath(abs_path, vault_root).replace("\\", "/")
    return {
        "note_id": meta.get("id") or meta.get("note_id"),
        "title": meta.get("title"),
        "path": rel,
    }
def compute_hash_fulltext(text: str) -> str:
    """Return the SHA-256 hex digest of *text* (UTF-8); used for change detection."""
    digest = hashlib.sha256()
    digest.update(text.encode("utf-8"))
    return digest.hexdigest()
def note_uuid5(note_id: str) -> str:
    """Deterministic Qdrant point id for a note: UUIDv5 over the URL namespace."""
    import uuid  # local import kept to match the module's original layout

    return str(uuid.uuid5(uuid.NAMESPACE_URL, note_id))
def fetch_existing_note_payload(client: QdrantClient, notes_col: str, note_id: str) -> Dict | None:
    """Fetch the stored payload of a note via its deterministic UUIDv5 point id.

    Returns ``None`` when the point does not exist or the lookup fails —
    deliberately best-effort: any client error is treated as "not present".
    """
    point_id = note_uuid5(note_id)
    try:
        hits = client.retrieve(
            collection_name=notes_col,
            ids=[point_id],
            with_payload=True,
            with_vectors=False,
        )
        if not hits:
            return None
        return hits[0].payload or {}
    except Exception:
        # Swallow everything: callers interpret None as "note is new".
        return None
def purge_note(client: QdrantClient, cfg: QdrantConfig, note_id: str) -> None:
    """Delete all chunks and edges of *note_id* (replace-on-change).

    Chunks are matched via ``payload.note_id == note_id``. Edges are matched
    when ``source_id == note_id`` OR ``target_id == note_id`` OR ``source_id``
    full-text-matches ``note_id + "#"`` (chunk-level sources).
    """
    chunks_col = f"{cfg.prefix}_chunks"
    edges_col = f"{cfg.prefix}_edges"

    def _delete(collection: str, flt: rest.Filter) -> None:
        # Older client versions expose delete_points; newer ones use delete.
        if hasattr(client, "delete_points"):
            client.delete_points(collection, flt, wait=True)
        else:
            client.delete(collection, flt, wait=True)

    # Chunks: payload.note_id == note_id
    _delete(
        chunks_col,
        rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))]),
    )

    # Edges: Qdrant's `should` already means "at least one condition matches";
    # the former `minimum_should=1` kwarg is not a valid rest.Filter field and
    # made the model construction fail, so the edge purge never ran.
    # NOTE(review): MatchText is a full-text match (requires a text index on
    # source_id), not a true prefix match — TODO confirm the index exists.
    _delete(
        edges_col,
        rest.Filter(should=[
            rest.FieldCondition(key="source_id", match=rest.MatchValue(value=note_id)),
            rest.FieldCondition(key="target_id", match=rest.MatchValue(value=note_id)),
            rest.FieldCondition(key="source_id", match=rest.MatchText(text=f"{note_id}#")),
        ]),
    )
# -------------------
# Main
# -------------------
def main(): def main():
load_dotenv() load_dotenv()
ap = argparse.ArgumentParser() ap = argparse.ArgumentParser()
ap.add_argument("--vault", required=True, help="Obsidian Vault Pfad (z.B. mindnet/vault)") ap.add_argument("--vault", required=True, help="Obsidian Vault Pfad (z.B. mindnet/vault)")
ap.add_argument("--apply", action="store_true", help="Schreibt in Qdrant (sonst Dry-Run)") ap.add_argument("--apply", action="store_true", help="Schreibt in Qdrant (sonst Dry-Run)")
ap.add_argument("--note-id", help="Nur eine Note-ID verarbeiten") ap.add_argument("--note-id", help="Nur eine Note-ID verarbeiten")
ap.add_argument("--embed-note", action="store_true", help="Auch Note-Volltext einbetten (optional)") ap.add_argument("--embed-note", action="store_true", help="Auch Note-Volltext einbetten (optional)")
ap.add_argument("--force-replace", action="store_true", help="Purge & Neuaufbau erzwingen (Debug)")
args = ap.parse_args() args = ap.parse_args()
# Qdrant-Konfiguration (+ Collections sicherstellen) # Qdrant
cfg = QdrantConfig( cfg = QdrantConfig(
url=os.getenv("QDRANT_URL", "http://127.0.0.1:6333"), url=os.getenv("QDRANT_URL", "http://127.0.0.1:6333"),
api_key=os.getenv("QDRANT_API_KEY") or None, api_key=os.getenv("QDRANT_API_KEY") or None,
@ -114,10 +145,9 @@ def main():
root = os.path.abspath(args.vault) root = os.path.abspath(args.vault)
files = iter_md(root) files = iter_md(root)
if not files: if not files:
print("Keine Markdown-Dateien gefunden.", file=sys.stderr) print("Keine Markdown-Dateien gefunden.", file=sys.stderr); sys.exit(2)
sys.exit(2)
# (1) Vorab-Lauf: Note-Index (robuste Auflösung via ID/Titel-Slug/File-Slug) # 1) Vorab-Lauf: globaler Note-Index für robuste Auflösung
index_payloads: List[Dict] = [] index_payloads: List[Dict] = []
for path in files: for path in files:
try: try:
@ -128,13 +158,13 @@ def main():
continue continue
index_payloads.append(pl) index_payloads.append(pl)
except Exception: except Exception:
# Einzeldefekte nicht fatal
continue continue
note_index = build_note_index(index_payloads) # by_id/by_slug/by_file_slug :contentReference[oaicite:3]{index=3}
note_index = build_note_index(index_payloads) notes_col = f"{cfg.prefix}_notes"
# (2) Hauptlauf: Import + direkte Wikilink-Edge-Ableitung
total_notes = 0 total_notes = 0
# 2) Hauptlauf pro Datei
for path in files: for path in files:
parsed = read_markdown(path) parsed = read_markdown(path)
fm = normalize_frontmatter(parsed.frontmatter) fm = normalize_frontmatter(parsed.frontmatter)
@ -147,11 +177,13 @@ def main():
total_notes += 1 total_notes += 1
# Note-Payload (vollständig für notes-Collection) # Note-Payload inkl. hash_fulltext
note_pl = make_note_payload(parsed, vault_root=root) note_pl = make_note_payload(parsed, vault_root=root)
validate_note_payload(note_pl) validate_note_payload(note_pl)
h = compute_hash_fulltext(parsed.body)
note_pl["hash_fulltext"] = h # im Schema vorgesehen :contentReference[oaicite:4]{index=4}
# Chunks aus Body gemäß Chunking-Strategie # Chunks + Payloads
chunks = assemble_chunks(fm["id"], parsed.body, fm.get("type", "concept")) chunks = assemble_chunks(fm["id"], parsed.body, fm.get("type", "concept"))
chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks) chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks)
@ -159,49 +191,53 @@ def main():
# Embed every chunk text; order matters so vectors align with chunk_pls.
texts = [ch.text for ch in chunks]
vectors = embed_texts(texts)

# Optionally embed the whole note body as a document-level vector.
note_vec = embed_one(parsed.body) if args.embed_note else None

# Derive wikilink edges from the fulltext plus the *real* chunk texts —
# references_at entries are only produced when chunk texts are supplied.
note_pl_for_edges = {
    "note_id": fm["id"],
    "title": fm.get("title"),
    "path": note_pl["path"],
    "fulltext": parsed.body,
}
chunks_for_links = [
    {
        "chunk_id": chunk_pl.get("chunk_id") or chunk_pl.get("id"),
        # Defensive: pad with "" if payloads outnumber chunks.
        "text": chunks[idx].text if idx < len(chunks) else "",
    }
    for idx, chunk_pl in enumerate(chunk_pls)
]
edges = derive_wikilink_edges(note_pl_for_edges, chunks_for_links, note_index)
# Compare the stored fulltext hash with the freshly computed one (h) to
# decide whether this note changed (replace-on-change); a missing or
# malformed stored payload counts as "new".
# (Removed the dead placeholder assignment that referenced a nonexistent
# args attribute — it was never evaluated but was pure noise.)
existing = fetch_existing_note_payload(client, notes_col, fm["id"])
if isinstance(existing, dict) and existing:
    changed = existing.get("hash_fulltext") != h
else:
    changed = True  # note not present yet

# Summary line (printed in dry-run and apply mode alike).
# BUGFIX: argparse stores --force-replace as args.force_replace; the previous
# args.force_replaces spelling raised AttributeError at runtime.
print(json.dumps({
    "note_id": fm["id"],
    "title": fm["title"],
    "chunks": len(chunk_pls),
    "edges": len(edges),
    "changed": changed or args.force_replace,
    "path": note_pl["path"],
}, ensure_ascii=False))

if args.apply:
    # Replace-on-change: purge the note's previous chunks/edges first.
    if changed or args.force_replace:
        purge_note(client, cfg, fm["id"])

    # Idempotent upserts (deterministic point ids from qdrant_points).
    notes_col_name, note_pts = points_for_note(cfg.prefix, note_pl, note_vec, cfg.dim)
    upsert_batch(client, notes_col_name, note_pts)
    chunks_col, chunk_pts = points_for_chunks(cfg.prefix, chunk_pls, vectors)
    upsert_batch(client, chunks_col, chunk_pts)
    edges_col, edge_pts = points_for_edges(cfg.prefix, edges)
    upsert_batch(client, edges_col, edge_pts)

# Final tally (after the per-file loop in the full script).
print(f"Done. Processed notes: {total_notes}")