scripts/import_markdown.py aktualisiert
Some checks failed
Deploy mindnet to llm-node / deploy (push) Failing after 1s

This commit is contained in:
Lars 2025-09-05 07:54:33 +02:00
parent 12dd67fbb5
commit cb30dbb23c

View File

@ -2,12 +2,13 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
""" """
Name: scripts/import_markdown.py Name: scripts/import_markdown.py
Version: v2.2.1 (2025-09-05) Version: v2.2.2 (2025-09-05)
Kurzbeschreibung: Kurzbeschreibung:
Importiert Obsidian-Markdown-Notes in Qdrant (Notes/Chunks/Edges). Importiert Obsidian-Markdown-Notes in Qdrant (Notes/Chunks/Edges).
- Leitet Wikilink-Edges (references/backlink/references_at) direkt aus Volltext + echten Chunk-Texten ab. - Leitet Wikilink-Edges (references/backlink/references_at) direkt aus Volltext + echten Chunk-Texten ab.
- Idempotenz: Ermittelt hash_fulltext; bei Änderung werden alte Chunks/Edges der Note entfernt (Replace-on-Change). - Idempotenz: Ermittelt hash_fulltext; bei Änderung werden alte Chunks/Edges der Note entfernt (Replace-on-Change).
- Unveränderte Noten werden übersprungen (schnell). - Unveränderte Noten werden übersprungen (schnell).
- Purge für Edges robust gegen Client-/API-Unterschiede (Filter + Scroll-Delete Fallback).
Aufruf (aus Projekt-Root, im venv): Aufruf (aus Projekt-Root, im venv):
python3 -m scripts.import_markdown --vault ./vault [--apply] [--note-id NOTE_ID] [--embed-note] [--force-replace] python3 -m scripts.import_markdown --vault ./vault [--apply] [--note-id NOTE_ID] [--embed-note] [--force-replace]
@ -26,22 +27,24 @@ Exitcodes:
0 = OK, 2 = keine Markdown-Dateien gefunden 0 = OK, 2 = keine Markdown-Dateien gefunden
Hinweise: Hinweise:
- Wikilink-Ableitung basiert auf app.core.derive_edges (Slug-/ID-Auflösung, unresolved-Status). - Wikilink-Ableitung basiert auf app.core.derive_edges (Slug-/ID-Auflösung, unresolved-Status).
- Für references_at werden echte Chunk-Texte übergeben (sonst würden sie fehlen). - Für references_at werden echte Chunk-Texte übergeben (sonst würden sie fehlen).
- Purge verwendet Qdrant-Filter: - Purge verwendet:
* Chunks: payload.note_id == NOTE_ID * Chunks: payload.note_id == NOTE_ID
* Edges : (source_id == NOTE_ID) OR (target_id == NOTE_ID) OR (source_id startswith NOTE_ID + "#") * Edges : (source_id == NOTE_ID) OR (target_id == NOTE_ID) OR (source_id startswith NOTE_ID + "#")
- Notes/Chunks/Edges bleiben 1:1 kompatibel zu Validator & Backfill. Falls MatchText/Prefix nicht unterstützt: Scroll & Delete-by-IDs als Fallback.
- Notes/Chunks/Edges bleiben kompatibel zu Validator & Backfill (UUIDv5, 1D-Edges).
Changelog: Changelog:
v2.2.1: Bugfix Tippfehler (args.force_replace statt args.force_replaces). v2.2.2: Entfernt minimum_should (Kompatibilität); Edge-Purge mit Scroll-Fallback für source_id-Prefix.
v2.2.0: Hash-basierte Replace-on-Change-Logik; Purge pro Note; Skip unverändert. v2.2.1: Bugfix Tippfehler (args.force_replace).
v2.1.1: Sicherstellung references_at durch Übergabe echter Chunk-Texte. v2.2.0: Hash-basierte Replace-on-Change-Logik; Purge pro Note; Skip unverändert.
v2.1.0: Vorab-Note-Index über Vault; direkte Edge-Ableitung. v2.1.1: Sicherstellung references_at durch Übergabe echter Chunk-Texte.
v2.1.0: Vorab-Note-Index über Vault; direkte Edge-Ableitung.
""" """
from __future__ import annotations from __future__ import annotations
import argparse, os, glob, json, sys, hashlib import argparse, os, glob, json, sys, hashlib
from typing import List, Dict from typing import List, Dict, Optional
from dotenv import load_dotenv from dotenv import load_dotenv
from qdrant_client import QdrantClient from qdrant_client import QdrantClient
@ -55,7 +58,7 @@ from app.core.chunk_payload import make_chunk_payloads
from app.core.embed import embed_texts, embed_one from app.core.embed import embed_texts, embed_one
from app.core.qdrant import QdrantConfig, ensure_collections, get_client from app.core.qdrant import QdrantConfig, ensure_collections, get_client
from app.core.qdrant_points import points_for_chunks, points_for_note, points_for_edges, upsert_batch from app.core.qdrant_points import points_for_chunks, points_for_note, points_for_edges, upsert_batch
from app.core.derive_edges import build_note_index, derive_wikilink_edges  # Wikilinks from app.core.derive_edges import build_note_index, derive_wikilink_edges  # Wikilinks
# ------------------- # -------------------
# Utility / Helpers # Utility / Helpers
def note_uuid5(note_id: str) -> str:
    """Return the deterministic UUIDv5 (URL namespace) for a note id, as a string.

    Used as the stable Qdrant point ID for a note, so re-imports hit the
    same point instead of creating duplicates.
    """
    from uuid import NAMESPACE_URL, uuid5

    return str(uuid5(NAMESPACE_URL, note_id))
def fetch_existing_note_payload(client: QdrantClient, notes_col: str, note_id: str) -> Dict | None: def fetch_existing_note_payload(client: QdrantClient, notes_col: str, note_id: str) -> Optional[Dict]:
"""Hole bestehende Note (per deterministischem UUIDv5) oder None.""" """Hole bestehende Note (per deterministischem UUIDv5) oder None."""
pid = note_uuid5(note_id) pid = note_uuid5(note_id)
try: try:
@ -97,28 +100,64 @@ def fetch_existing_note_payload(client: QdrantClient, notes_col: str, note_id: s
except Exception: except Exception:
return None return None
def _client_delete_points(client: QdrantClient, collection: str, selector_or_filter):
"""Kompat-Schicht: delete_points (neu) oder delete (alt)."""
if hasattr(client, "delete_points"):
return client.delete_points(collection, selector_or_filter, wait=True)
return client.delete(collection, selector_or_filter, wait=True)
def _scroll_edge_ids_by_source_prefix(client: QdrantClient, edges_col: str, source_prefix: str, batch: int = 1000) -> List[int]:
"""Sucht Edge-Point-IDs, deren payload.source_id mit 'source_prefix' beginnt (via Scroll)."""
# Wir holen alle Edges (mit payload) und filtern lokal robust gegen fehlende Prefix-Operatoren.
next_offset = None
ids: List[int] = []
while True:
points, next_offset = client.scroll(
collection_name=edges_col,
limit=batch,
with_payload=True,
with_vectors=False,
offset=next_offset,
)
for p in points:
pl = p.payload or {}
sid = pl.get("source_id") or ""
if isinstance(sid, str) and sid.startswith(source_prefix):
if hasattr(p, "id") and isinstance(p.id, int):
ids.append(p.id)
if next_offset is None:
break
return ids
def purge_note(client: QdrantClient, cfg: QdrantConfig, note_id: str) -> None:
    """Delete all chunks & edges of a note (replace-on-change).

    Deletion targets:
      * Chunks: ``payload.note_id == note_id``
      * Edges : ``source_id == note_id`` OR ``target_id == note_id`` OR
                ``source_id`` starts with ``f"{note_id}#"`` (chunk-level sources)

    The prefix match uses ``rest.MatchText`` when the client/server supports
    it; otherwise it falls back to a scroll + delete-by-IDs pass.
    """
    chunks_col = f"{cfg.prefix}_chunks"
    edges_col = f"{cfg.prefix}_edges"

    # Chunks: payload.note_id == NOTE_ID
    f_chunks = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))])
    _client_delete_points(client, chunks_col, f_chunks)

    # Exact-match edge conditions — these work on every client version.
    base_conds = [
        rest.FieldCondition(key="source_id", match=rest.MatchValue(value=note_id)),
        rest.FieldCondition(key="target_id", match=rest.MatchValue(value=note_id)),
    ]
    try:
        # Preferred path: one server-side delete including a MatchText
        # condition for the "NOTE_ID#" source prefix. Note: no
        # minimum_should — not supported by the client version in use.
        conds = base_conds + [rest.FieldCondition(key="source_id", match=rest.MatchText(text=f"{note_id}#"))]
        _client_delete_points(client, edges_col, rest.Filter(should=conds))
    except Exception:
        # Fallback: delete exact matches with the base conditions ONLY.
        # (Previously the fallback reused the extended list, so it still
        # contained the MatchText condition that just failed.)
        _client_delete_points(client, edges_col, rest.Filter(should=base_conds))
        # Then find "NOTE_ID#*" edges via scroll and delete them by ID.
        ids = _scroll_edge_ids_by_source_prefix(client, edges_col, f"{note_id}#")
        if ids:
            _client_delete_points(client, edges_col, rest.PointIdsList(points=ids))
# ------------------- # -------------------
# Main # Main
@ -161,7 +200,7 @@ def main():
index_payloads.append(pl) index_payloads.append(pl)
except Exception: except Exception:
continue continue
note_index = build_note_index(index_payloads) # by_id/by_slug/by_file_slug note_index = build_note_index(index_payloads) # by_id/by_slug/by_file_slug :contentReference[oaicite:10]{index=10}
notes_col = f"{cfg.prefix}_notes" notes_col = f"{cfg.prefix}_notes"
total_notes = 0 total_notes = 0
@ -183,7 +222,7 @@ def main():
note_pl = make_note_payload(parsed, vault_root=root) note_pl = make_note_payload(parsed, vault_root=root)
validate_note_payload(note_pl) validate_note_payload(note_pl)
h = compute_hash_fulltext(parsed.body) h = compute_hash_fulltext(parsed.body)
note_pl["hash_fulltext"] = h note_pl["hash_fulltext"] = h # Feld ist im Schema vorgesehen :contentReference[oaicite:11]{index=11}
# Chunks + Payloads # Chunks + Payloads
chunks = assemble_chunks(fm["id"], parsed.body, fm.get("type", "concept")) chunks = assemble_chunks(fm["id"], parsed.body, fm.get("type", "concept"))
@ -196,7 +235,7 @@ def main():
# Optional: Note-Vektor # Optional: Note-Vektor
note_vec = embed_one(parsed.body) if args.embed_note else None note_vec = embed_one(parsed.body) if args.embed_note else None
# Edges (aus Volltext + echten Chunk-Texten) # Edges (aus Volltext + echten Chunk-Texten) :contentReference[oaicite:12]{index=12}
note_pl_for_edges = {"note_id": fm["id"], "title": fm.get("title"), "path": note_pl["path"], "fulltext": parsed.body} note_pl_for_edges = {"note_id": fm["id"], "title": fm.get("title"), "path": note_pl["path"], "fulltext": parsed.body}
chunks_for_links = [] chunks_for_links = []
for i, pl in enumerate(chunk_pls): for i, pl in enumerate(chunk_pls):