scripts/import_markdown.py aktualisiert
Some checks failed
Deploy mindnet to llm-node / deploy (push) Failing after 1s

This commit is contained in:
Lars 2025-09-05 07:47:37 +02:00
parent 364502244a
commit 12dd67fbb5

View File

@ -2,7 +2,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
""" """
Name: scripts/import_markdown.py Name: scripts/import_markdown.py
Version: v2.2.0 (2025-09-05) Version: v2.2.1 (2025-09-05)
Kurzbeschreibung: Kurzbeschreibung:
Importiert Obsidian-Markdown-Notes in Qdrant (Notes/Chunks/Edges). Importiert Obsidian-Markdown-Notes in Qdrant (Notes/Chunks/Edges).
- Leitet Wikilink-Edges (references/backlink/references_at) direkt aus Volltext + echten Chunk-Texten ab. - Leitet Wikilink-Edges (references/backlink/references_at) direkt aus Volltext + echten Chunk-Texten ab.
@ -13,11 +13,11 @@ Aufruf (aus Projekt-Root, im venv):
python3 -m scripts.import_markdown --vault ./vault [--apply] [--note-id NOTE_ID] [--embed-note] [--force-replace] python3 -m scripts.import_markdown --vault ./vault [--apply] [--note-id NOTE_ID] [--embed-note] [--force-replace]
Parameter: Parameter:
--vault Pfad zum Vault (z. B. ./vault) --vault Pfad zum Vault (z. B. ./vault)
--apply Führt Upserts in Qdrant aus (ohne Flag = Dry-Run mit JSON-Summaries) --apply Führt Upserts in Qdrant aus (ohne Flag = Dry-Run mit JSON-Summaries)
--note-id Bearbeite nur eine konkrete Note-ID --note-id Bearbeite nur eine konkrete Note-ID
--embed-note Optional: Note-Vektor (Volltext) zusätzlich einbetten --embed-note Optional: Note-Vektor (Volltext) zusätzlich einbetten
--force-replace Erzwingt Purge & Neuaufbau auch ohne Hash-Änderung (Debug) --force-replace Erzwingt Purge & Neuaufbau auch ohne Hash-Änderung (Debug)
Umgebungsvariablen (optional): Umgebungsvariablen (optional):
QDRANT_URL, QDRANT_API_KEY, COLLECTION_PREFIX, VECTOR_DIM (Default 384) QDRANT_URL, QDRANT_API_KEY, COLLECTION_PREFIX, VECTOR_DIM (Default 384)
@ -32,14 +32,16 @@ Hinweise:
* Chunks: payload.note_id == NOTE_ID * Chunks: payload.note_id == NOTE_ID
* Edges : (source_id == NOTE_ID) OR (target_id == NOTE_ID) OR (source_id startswith NOTE_ID + "#") * Edges : (source_id == NOTE_ID) OR (target_id == NOTE_ID) OR (source_id startswith NOTE_ID + "#")
- Notes/Chunks/Edges bleiben 1:1 kompatibel zu Validator & Backfill. - Notes/Chunks/Edges bleiben 1:1 kompatibel zu Validator & Backfill.
Changelog: Changelog:
v2.2.1: Bugfix Tippfehler (args.force_replace statt args.force_replaces).
v2.2.0: Hash-basierte Replace-on-Change-Logik; Purge pro Note; Skip unverändert. v2.2.0: Hash-basierte Replace-on-Change-Logik; Purge pro Note; Skip unverändert.
v2.1.1: Sicherstellung references_at durch Übergabe echter Chunk-Texte. v2.1.1: Sicherstellung references_at durch Übergabe echter Chunk-Texte.
v2.1.0: Vorab-Note-Index über Vault; direkte Edge-Ableitung. v2.1.0: Vorab-Note-Index über Vault; direkte Edge-Ableitung.
""" """
from __future__ import annotations from __future__ import annotations
import argparse, os, glob, json, sys, hashlib import argparse, os, glob, json, sys, hashlib
from typing import List, Dict, Tuple from typing import List, Dict
from dotenv import load_dotenv from dotenv import load_dotenv
from qdrant_client import QdrantClient from qdrant_client import QdrantClient
@ -159,7 +161,7 @@ def main():
index_payloads.append(pl) index_payloads.append(pl)
except Exception: except Exception:
continue continue
note_index = build_note_index(index_payloads) # by_id/by_slug/by_file_slug :contentReference[oaicite:3]{index=3} note_index = build_note_index(index_payloads) # by_id/by_slug/by_file_slug
notes_col = f"{cfg.prefix}_notes" notes_col = f"{cfg.prefix}_notes"
total_notes = 0 total_notes = 0
@ -181,7 +183,7 @@ def main():
note_pl = make_note_payload(parsed, vault_root=root) note_pl = make_note_payload(parsed, vault_root=root)
validate_note_payload(note_pl) validate_note_payload(note_pl)
h = compute_hash_fulltext(parsed.body) h = compute_hash_fulltext(parsed.body)
note_pl["hash_fulltext"] = h # im Schema vorgesehen :contentReference[oaicite:4]{index=4} note_pl["hash_fulltext"] = h
# Chunks + Payloads # Chunks + Payloads
chunks = assemble_chunks(fm["id"], parsed.body, fm.get("type", "concept")) chunks = assemble_chunks(fm["id"], parsed.body, fm.get("type", "concept"))
@ -194,7 +196,7 @@ def main():
# Optional: Note-Vektor # Optional: Note-Vektor
note_vec = embed_one(parsed.body) if args.embed_note else None note_vec = embed_one(parsed.body) if args.embed_note else None
# Edges (aus Volltext + echten Chunk-Texten) :contentReference[oaicite:5]{index=5} # Edges (aus Volltext + echten Chunk-Texten)
note_pl_for_edges = {"note_id": fm["id"], "title": fm.get("title"), "path": note_pl["path"], "fulltext": parsed.body} note_pl_for_edges = {"note_id": fm["id"], "title": fm.get("title"), "path": note_pl["path"], "fulltext": parsed.body}
chunks_for_links = [] chunks_for_links = []
for i, pl in enumerate(chunk_pls): for i, pl in enumerate(chunk_pls):
@ -205,7 +207,7 @@ def main():
# Bestehende Note laden (für Hash-Vergleich) # Bestehende Note laden (für Hash-Vergleich)
existing = fetch_existing_note_payload(client, notes_col, fm["id"]) existing = fetch_existing_note_payload(client, notes_col, fm["id"])
changed = args.force_replaces if False else False # placeholder, fixed below changed = False
if existing and isinstance(existing, dict): if existing and isinstance(existing, dict):
old_h = existing.get("hash_fulltext") old_h = existing.get("hash_fulltext")
changed = (old_h != h) changed = (old_h != h)
@ -218,7 +220,7 @@ def main():
"title": fm["title"], "title": fm["title"],
"chunks": len(chunk_pls), "chunks": len(chunk_pls),
"edges": len(edges), "edges": len(edges),
"changed": changed or args.force_replaces, "changed": changed or args.force_replace,
"path": note_pl["path"], "path": note_pl["path"],
}, ensure_ascii=False)) }, ensure_ascii=False))
@ -226,7 +228,7 @@ def main():
continue continue
# Replace-on-Change: vorherige Artefakte der Note löschen # Replace-on-Change: vorherige Artefakte der Note löschen
if changed or args.force_replaces: if changed or args.force_replace:
purge_note(client, cfg, fm["id"]) purge_note(client, cfg, fm["id"])
# Upserts # Upserts