scripts/import_markdown.py aktualisiert
Some checks failed
Deploy mindnet to llm-node / deploy (push) Failing after 1s
Some checks failed
Deploy mindnet to llm-node / deploy (push) Failing after 1s
This commit is contained in:
parent
364502244a
commit
12dd67fbb5
|
|
@@ -2,7 +2,7 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
"""
|
"""
|
||||||
Name: scripts/import_markdown.py
|
Name: scripts/import_markdown.py
|
||||||
Version: v2.2.0 (2025-09-05)
|
Version: v2.2.1 (2025-09-05)
|
||||||
Kurzbeschreibung:
|
Kurzbeschreibung:
|
||||||
Importiert Obsidian-Markdown-Notes in Qdrant (Notes/Chunks/Edges).
|
Importiert Obsidian-Markdown-Notes in Qdrant (Notes/Chunks/Edges).
|
||||||
- Leitet Wikilink-Edges (references/backlink/references_at) direkt aus Volltext + echten Chunk-Texten ab.
|
- Leitet Wikilink-Edges (references/backlink/references_at) direkt aus Volltext + echten Chunk-Texten ab.
|
||||||
|
|
@@ -13,11 +13,11 @@ Aufruf (aus Projekt-Root, im venv):
|
||||||
python3 -m scripts.import_markdown --vault ./vault [--apply] [--note-id NOTE_ID] [--embed-note] [--force-replace]
|
python3 -m scripts.import_markdown --vault ./vault [--apply] [--note-id NOTE_ID] [--embed-note] [--force-replace]
|
||||||
|
|
||||||
Parameter:
|
Parameter:
|
||||||
--vault Pfad zum Vault (z. B. ./vault)
|
--vault Pfad zum Vault (z. B. ./vault)
|
||||||
--apply Führt Upserts in Qdrant aus (ohne Flag = Dry-Run mit JSON-Summaries)
|
--apply Führt Upserts in Qdrant aus (ohne Flag = Dry-Run mit JSON-Summaries)
|
||||||
--note-id Bearbeite nur eine konkrete Note-ID
|
--note-id Bearbeite nur eine konkrete Note-ID
|
||||||
--embed-note Optional: Note-Vektor (Volltext) zusätzlich einbetten
|
--embed-note Optional: Note-Vektor (Volltext) zusätzlich einbetten
|
||||||
--force-replace Erzwingt Purge & Neuaufbau auch ohne Hash-Änderung (Debug)
|
--force-replace Erzwingt Purge & Neuaufbau auch ohne Hash-Änderung (Debug)
|
||||||
|
|
||||||
Umgebungsvariablen (optional):
|
Umgebungsvariablen (optional):
|
||||||
QDRANT_URL, QDRANT_API_KEY, COLLECTION_PREFIX, VECTOR_DIM (Default 384)
|
QDRANT_URL, QDRANT_API_KEY, COLLECTION_PREFIX, VECTOR_DIM (Default 384)
|
||||||
|
|
@@ -32,14 +32,16 @@ Hinweise:
|
||||||
* Chunks: payload.note_id == NOTE_ID
|
* Chunks: payload.note_id == NOTE_ID
|
||||||
* Edges : (source_id == NOTE_ID) OR (target_id == NOTE_ID) OR (source_id startswith NOTE_ID + "#")
|
* Edges : (source_id == NOTE_ID) OR (target_id == NOTE_ID) OR (source_id startswith NOTE_ID + "#")
|
||||||
- Notes/Chunks/Edges bleiben 1:1 kompatibel zu Validator & Backfill.
|
- Notes/Chunks/Edges bleiben 1:1 kompatibel zu Validator & Backfill.
|
||||||
|
|
||||||
Changelog:
|
Changelog:
|
||||||
|
v2.2.1: Bugfix Tippfehler (args.force_replace statt args.force_replaces).
|
||||||
v2.2.0: Hash-basierte Replace-on-Change-Logik; Purge pro Note; Skip unverändert.
|
v2.2.0: Hash-basierte Replace-on-Change-Logik; Purge pro Note; Skip unverändert.
|
||||||
v2.1.1: Sicherstellung references_at durch Übergabe echter Chunk-Texte.
|
v2.1.1: Sicherstellung references_at durch Übergabe echter Chunk-Texte.
|
||||||
v2.1.0: Vorab-Note-Index über Vault; direkte Edge-Ableitung.
|
v2.1.0: Vorab-Note-Index über Vault; direkte Edge-Ableitung.
|
||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
import argparse, os, glob, json, sys, hashlib
|
import argparse, os, glob, json, sys, hashlib
|
||||||
from typing import List, Dict, Tuple
|
from typing import List, Dict
|
||||||
|
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
from qdrant_client import QdrantClient
|
from qdrant_client import QdrantClient
|
||||||
|
|
@@ -159,7 +161,7 @@ def main():
|
||||||
index_payloads.append(pl)
|
index_payloads.append(pl)
|
||||||
except Exception:
|
except Exception:
|
||||||
continue
|
continue
|
||||||
note_index = build_note_index(index_payloads) # by_id/by_slug/by_file_slug :contentReference[oaicite:3]{index=3}
|
note_index = build_note_index(index_payloads) # by_id/by_slug/by_file_slug
|
||||||
|
|
||||||
notes_col = f"{cfg.prefix}_notes"
|
notes_col = f"{cfg.prefix}_notes"
|
||||||
total_notes = 0
|
total_notes = 0
|
||||||
|
|
@@ -181,7 +183,7 @@ def main():
|
||||||
note_pl = make_note_payload(parsed, vault_root=root)
|
note_pl = make_note_payload(parsed, vault_root=root)
|
||||||
validate_note_payload(note_pl)
|
validate_note_payload(note_pl)
|
||||||
h = compute_hash_fulltext(parsed.body)
|
h = compute_hash_fulltext(parsed.body)
|
||||||
note_pl["hash_fulltext"] = h # im Schema vorgesehen :contentReference[oaicite:4]{index=4}
|
note_pl["hash_fulltext"] = h
|
||||||
|
|
||||||
# Chunks + Payloads
|
# Chunks + Payloads
|
||||||
chunks = assemble_chunks(fm["id"], parsed.body, fm.get("type", "concept"))
|
chunks = assemble_chunks(fm["id"], parsed.body, fm.get("type", "concept"))
|
||||||
|
|
@@ -194,7 +196,7 @@ def main():
|
||||||
# Optional: Note-Vektor
|
# Optional: Note-Vektor
|
||||||
note_vec = embed_one(parsed.body) if args.embed_note else None
|
note_vec = embed_one(parsed.body) if args.embed_note else None
|
||||||
|
|
||||||
# Edges (aus Volltext + echten Chunk-Texten) :contentReference[oaicite:5]{index=5}
|
# Edges (aus Volltext + echten Chunk-Texten)
|
||||||
note_pl_for_edges = {"note_id": fm["id"], "title": fm.get("title"), "path": note_pl["path"], "fulltext": parsed.body}
|
note_pl_for_edges = {"note_id": fm["id"], "title": fm.get("title"), "path": note_pl["path"], "fulltext": parsed.body}
|
||||||
chunks_for_links = []
|
chunks_for_links = []
|
||||||
for i, pl in enumerate(chunk_pls):
|
for i, pl in enumerate(chunk_pls):
|
||||||
|
|
@@ -205,7 +207,7 @@ def main():
|
||||||
|
|
||||||
# Bestehende Note laden (für Hash-Vergleich)
|
# Bestehende Note laden (für Hash-Vergleich)
|
||||||
existing = fetch_existing_note_payload(client, notes_col, fm["id"])
|
existing = fetch_existing_note_payload(client, notes_col, fm["id"])
|
||||||
changed = args.force_replaces if False else False # placeholder, fixed below
|
changed = False
|
||||||
if existing and isinstance(existing, dict):
|
if existing and isinstance(existing, dict):
|
||||||
old_h = existing.get("hash_fulltext")
|
old_h = existing.get("hash_fulltext")
|
||||||
changed = (old_h != h)
|
changed = (old_h != h)
|
||||||
|
|
@@ -218,7 +220,7 @@ def main():
|
||||||
"title": fm["title"],
|
"title": fm["title"],
|
||||||
"chunks": len(chunk_pls),
|
"chunks": len(chunk_pls),
|
||||||
"edges": len(edges),
|
"edges": len(edges),
|
||||||
"changed": changed or args.force_replaces,
|
"changed": changed or args.force_replace,
|
||||||
"path": note_pl["path"],
|
"path": note_pl["path"],
|
||||||
}, ensure_ascii=False))
|
}, ensure_ascii=False))
|
||||||
|
|
||||||
|
|
@@ -226,7 +228,7 @@ def main():
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Replace-on-Change: vorherige Artefakte der Note löschen
|
# Replace-on-Change: vorherige Artefakte der Note löschen
|
||||||
if changed or args.force_replaces:
|
if changed or args.force_replace:
|
||||||
purge_note(client, cfg, fm["id"])
|
purge_note(client, cfg, fm["id"])
|
||||||
|
|
||||||
# Upserts
|
# Upserts
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user