scripts/import_markdown.py aktualisiert
Some checks failed
Deploy mindnet to llm-node / deploy (push) Failing after 2s

This commit is contained in:
Lars 2025-09-24 12:14:56 +02:00
parent 13e510dd9c
commit 54197995f5

View File

@ -2,8 +2,8 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
""" """
Script: scripts/import_markdown.py Markdown Qdrant (Notes, Chunks, Edges) Script: scripts/import_markdown.py Markdown Qdrant (Notes, Chunks, Edges)
Version: 3.6.2 Version: 3.7.0
Datum: 2025-09-09 Datum: 2025-09-10
Kurzbeschreibung Kurzbeschreibung
---------------- ----------------
@ -11,32 +11,34 @@ Kurzbeschreibung
- Robuste Änderungserkennung: Mehrfach-Hashes werden parallel in der Note gespeichert - Robuste Änderungserkennung: Mehrfach-Hashes werden parallel in der Note gespeichert
(Option C). Vergleich erfolgt **modusgenau** anhand von `hashes[<mode>:<source>:<normalize>]`. (Option C). Vergleich erfolgt **modusgenau** anhand von `hashes[<mode>:<source>:<normalize>]`.
Ein Wechsel des Vergleichsmodus führt so **nicht** zu Massenänderungen. Ein Wechsel des Vergleichsmodus führt so **nicht** zu Massenänderungen.
- **Fix (v3.6.2):** Bei **erstem Import** (kein Alt-Payload) wird die Note als **geändert** - **Erstimport-Fix:** Bei leerem Qdrant gilt Create-Fall automatisch als geändert.
behandelt Create (Notes/Chunks/Edges) findet zuverlässig statt. - **--baseline-modes:** fehlende Hash-Varianten still nachtragen (nur Notes upserten).
- Baseline-Modus: Mit `--baseline-modes` werden **fehlende** Hash-Varianten - **--sync-deletes:** Punkte in Qdrant, die im Vault fehlen, sicher löschen (Dry-Run + Apply).
im Feld `hashes` still nachgetragen (Upsert NUR Notes; Legacy-Hashfelder bleiben unangetastet). - **--prefix**: CLI-Override für COLLECTION_PREFIX.
Konfiguration Hash/Compare Hash/Compare Konfiguration
-------------------------- --------------------------
- Vergleichsmodus: `--hash-mode body|frontmatter|full` - Vergleichsmodus: `--hash-mode body|frontmatter|full` oder ENV `MINDNET_HASH_MODE|MINDNET_HASH_COMPARE`
- oder ENV: `MINDNET_HASH_MODE` / `MINDNET_HASH_COMPARE` (Body|Frontmatter|Full) - Quelle: `--hash-source parsed|raw` (ENV: MINDNET_HASH_SOURCE, Default parsed)
- Quelle: `--hash-source parsed|raw` (ENV: `MINDNET_HASH_SOURCE`, Default parsed) - Normalisierung: `--hash-normalize canonical|none` (ENV: MINDNET_HASH_NORMALIZE, Default canonical)
- Normalisierung: `--hash-normalize canonical|none` (ENV: `MINDNET_HASH_NORMALIZE`, Default canonical)
- Optional: `--compare-text` (oder `MINDNET_COMPARE_TEXT=true`) vergleicht zusätzlich den parsed Body-Text direkt. - Optional: `--compare-text` (oder `MINDNET_COMPARE_TEXT=true`) vergleicht zusätzlich den parsed Body-Text direkt.
Weitere ENV / Qdrant Qdrant / ENV
-------------------- ------------
- QDRANT_URL | QDRANT_HOST/QDRANT_PORT | QDRANT_API_KEY - QDRANT_URL | QDRANT_HOST/QDRANT_PORT | QDRANT_API_KEY
- COLLECTION_PREFIX (Default: mindnet) - COLLECTION_PREFIX (Default: mindnet), via `--prefix` überschreibbar
- VECTOR_DIM (Default: 384) - VECTOR_DIM (Default: 384)
- MINDNET_NOTE_SCOPE_REFS: true|false (Default: false) - MINDNET_NOTE_SCOPE_REFS: true|false (Default: false)
Aufruf-Beispiele Beispiele
---------------- ---------
# Standard (Body, parsed, canonical) # Standard (Body, parsed, canonical)
python3 -m scripts.import_markdown --vault ./vault python3 -m scripts.import_markdown --vault ./vault
# Baseline (füllt hashes für parsed:canonical Tripel „still“ auf) # Erstimport nach truncate (Create-Fall)
python3 -m scripts.import_markdown --vault ./vault --apply --purge-before-upsert
# Baseline für parsed:canonical „still“ auffüllen
MINDNET_HASH_SOURCE=parsed MINDNET_HASH_NORMALIZE=canonical \ MINDNET_HASH_SOURCE=parsed MINDNET_HASH_NORMALIZE=canonical \
python3 -m scripts.import_markdown --vault ./vault --apply --baseline-modes python3 -m scripts.import_markdown --vault ./vault --apply --baseline-modes
@ -44,18 +46,20 @@ Aufruf-Beispiele
MINDNET_HASH_COMPARE=Frontmatter \ MINDNET_HASH_COMPARE=Frontmatter \
python3 -m scripts.import_markdown --vault ./vault python3 -m scripts.import_markdown --vault ./vault
# Sehr sensibel (raw + none) und direkten Textvergleich # Sync-Deletes (Dry-Run → Apply)
python3 -m scripts.import_markdown --vault ./vault --apply --hash-source raw --hash-normalize none --compare-text python3 -m scripts.import_markdown --vault ./vault --sync-deletes
python3 -m scripts.import_markdown --vault ./vault --sync-deletes --apply
# Prefix explizit setzen
python3 -m scripts.import_markdown --vault ./vault --prefix mindnet
""" """
from __future__ import annotations from __future__ import annotations
import argparse import argparse
import difflib
import json import json
import os import os
import sys import sys
from typing import Dict, List, Optional, Tuple, Any from typing import Dict, List, Optional, Tuple, Any, Set
from collections.abc import Mapping
from dotenv import load_dotenv from dotenv import load_dotenv
from qdrant_client.http import models as rest from qdrant_client.http import models as rest
@ -68,7 +72,11 @@ from app.core.parser import (
from app.core.note_payload import make_note_payload from app.core.note_payload import make_note_payload
from app.core.chunker import assemble_chunks from app.core.chunker import assemble_chunks
from app.core.chunk_payload import make_chunk_payloads from app.core.chunk_payload import make_chunk_payloads
from app.core.edges import build_edges_for_note # Kompatibel zu beiden Modulnamen:
try:
from app.core.derive_edges import build_edges_for_note
except Exception: # pragma: no cover
from app.core.edges import build_edges_for_note # type: ignore
from app.core.qdrant import ( from app.core.qdrant import (
QdrantConfig, QdrantConfig,
get_client, get_client,
@ -87,9 +95,8 @@ try:
except Exception: except Exception:
embed_texts = None embed_texts = None
# --------------------------------------------------------------------- # ---------------------------------------------------------------------
# Helpers # Helper
# --------------------------------------------------------------------- # ---------------------------------------------------------------------
def iter_md(root: str) -> List[str]: def iter_md(root: str) -> List[str]:
@ -122,6 +129,30 @@ def fetch_existing_note_payload(client, prefix: str, note_id: str) -> Optional[D
return None return None
return points[0].payload or {} return points[0].payload or {}
def list_qdrant_note_ids(client, prefix: str) -> Set[str]:
    """Collect every ``note_id`` stored in the Notes collection.

    Pages through the collection with Qdrant's scroll API (payload only,
    vectors skipped) and gathers all string-valued ``note_id`` payload
    fields into a set. Stops when a page comes back empty or the scroll
    cursor is exhausted.
    """
    notes_col, _, _ = collections(prefix)
    note_ids: Set[str] = set()
    cursor = None
    while True:
        batch, cursor = client.scroll(
            collection_name=notes_col,
            with_payload=True,
            with_vectors=False,
            limit=256,
            offset=cursor,
        )
        if not batch:
            break
        for point in batch:
            candidate = (point.payload or {}).get("note_id")
            if isinstance(candidate, str):
                note_ids.add(candidate)
        if cursor is None:
            break
    return note_ids
def purge_note_artifacts(client, prefix: str, note_id: str) -> None: def purge_note_artifacts(client, prefix: str, note_id: str) -> None:
""" """
Löscht alle Chunks & Edges zu einer Note mittels Filter-Selector (kompatibel mit aktuellen Qdrant-Clients). Löscht alle Chunks & Edges zu einer Note mittels Filter-Selector (kompatibel mit aktuellen Qdrant-Clients).
@ -147,12 +178,20 @@ def purge_note_artifacts(client, prefix: str, note_id: str) -> None:
except Exception as e: except Exception as e:
print(json.dumps({"note_id": note_id, "warn": f"delete edges via filter failed: {e}"})) print(json.dumps({"note_id": note_id, "warn": f"delete edges via filter failed: {e}"}))
def _normalize_rel_path(abs_path: str, vault_root: str) -> str: def delete_note_everywhere(client, prefix: str, note_id: str) -> None:
try: """Löscht Note + zugehörige Chunks + Edges per Filter."""
rel = os.path.relpath(abs_path, vault_root) notes_col, chunks_col, edges_col = collections(prefix)
except Exception: filt = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))])
rel = abs_path # Reihenfolge: erst edges/chunks, dann note
return rel.replace("\\", "/").lstrip("/") for col in (edges_col, chunks_col, notes_col):
try:
client.delete(
collection_name=col,
points_selector=rest.FilterSelector(filter=filt),
wait=True
)
except Exception as e:
print(json.dumps({"note_id": note_id, "warn": f"delete in {col} failed: {e}"}))
def _resolve_mode(val: Optional[str]) -> str: def _resolve_mode(val: Optional[str]) -> str:
v = (val or os.environ.get("MINDNET_HASH_MODE") or os.environ.get("MINDNET_HASH_COMPARE") or "body").strip().lower() v = (val or os.environ.get("MINDNET_HASH_MODE") or os.environ.get("MINDNET_HASH_COMPARE") or "body").strip().lower()
@ -165,7 +204,6 @@ def _resolve_mode(val: Optional[str]) -> str:
def _env(key: str, default: str) -> str: def _env(key: str, default: str) -> str:
return (os.environ.get(key) or default).strip().lower() return (os.environ.get(key) or default).strip().lower()
# --------------------------------------------------------------------- # ---------------------------------------------------------------------
# Main # Main
# --------------------------------------------------------------------- # ---------------------------------------------------------------------
@ -189,11 +227,14 @@ def main() -> None:
ap.add_argument("--note-scope-refs", action="store_true", ap.add_argument("--note-scope-refs", action="store_true",
help="(Optional) erzeugt zusätzlich references:note (Default: aus)") help="(Optional) erzeugt zusätzlich references:note (Default: aus)")
ap.add_argument("--debug-hash-diff", action="store_true", ap.add_argument("--debug-hash-diff", action="store_true",
help="Zeigt einen kurzen Diff zwischen altem und neuem Body") help="(reserviert) optionaler Body-Diff (nicht nötig bei Option C)")
ap.add_argument("--compare-text", action="store_true", ap.add_argument("--compare-text", action="store_true",
help="Parsed fulltext zusätzlich direkt vergleichen (über Hash hinaus)") help="Parsed fulltext zusätzlich direkt vergleichen (über Hash hinaus)")
ap.add_argument("--baseline-modes", action="store_true", ap.add_argument("--baseline-modes", action="store_true",
help="Fehlende Hash-Varianten im Feld 'hashes' still nachtragen (Upsert NUR Notes)") help="Fehlende Hash-Varianten im Feld 'hashes' still nachtragen (Upsert NUR Notes)")
ap.add_argument("--sync-deletes", action="store_true",
help="Notes/Chunks/Edges löschen, die in Qdrant existieren aber im Vault fehlen (Dry-Run; mit --apply ausführen)")
ap.add_argument("--prefix", help="Collection-Prefix (überschreibt ENV COLLECTION_PREFIX)")
args = ap.parse_args() args = ap.parse_args()
mode = _resolve_mode(args.hash_mode) # body|frontmatter|full mode = _resolve_mode(args.hash_mode) # body|frontmatter|full
@ -204,6 +245,8 @@ def main() -> None:
compare_text = args.compare_text or (_env("MINDNET_COMPARE_TEXT", "false") == "true") compare_text = args.compare_text or (_env("MINDNET_COMPARE_TEXT", "false") == "true")
cfg = QdrantConfig.from_env() cfg = QdrantConfig.from_env()
if args.prefix:
cfg.prefix = args.prefix.strip()
client = get_client(cfg) client = get_client(cfg)
ensure_collections(client, cfg.prefix, cfg.dim) ensure_collections(client, cfg.prefix, cfg.dim)
ensure_payload_indexes(client, cfg.prefix) ensure_payload_indexes(client, cfg.prefix)
@ -214,6 +257,38 @@ def main() -> None:
print("Keine Markdown-Dateien gefunden.", file=sys.stderr) print("Keine Markdown-Dateien gefunden.", file=sys.stderr)
sys.exit(2) sys.exit(2)
# Optional: Sync-Deletes vorab
if args.sync_deletes:
# Vault-Note-IDs sammeln
vault_note_ids: Set[str] = set()
for path in files:
try:
parsed = read_markdown(path)
if not parsed:
continue
fm = normalize_frontmatter(parsed.frontmatter)
nid = fm.get("id")
if isinstance(nid, str):
vault_note_ids.add(nid)
except Exception:
continue
# Qdrant-Note-IDs sammeln
qdrant_note_ids = list_qdrant_note_ids(client, cfg.prefix)
to_delete = sorted(qdrant_note_ids - vault_note_ids)
print(json.dumps({
"action": "sync-deletes",
"prefix": cfg.prefix,
"qdrant_total": len(qdrant_note_ids),
"vault_total": len(vault_note_ids),
"to_delete_count": len(to_delete),
"to_delete": to_delete[:50] + ([""] if len(to_delete) > 50 else [])
}, ensure_ascii=False))
if args.apply and to_delete:
for nid in to_delete:
print(json.dumps({"action": "delete", "note_id": nid, "decision": "apply"}))
delete_note_everywhere(client, cfg.prefix, nid)
# Danach normal mit Import fortfahren (z. B. neue Vault-Notes anlegen)
key_current = f"{mode}:{src}:{norm}" key_current = f"{mode}:{src}:{norm}"
processed = 0 processed = 0
@ -337,6 +412,7 @@ def main() -> None:
"hash_mode": mode, "hash_mode": mode,
"hash_normalize": norm, "hash_normalize": norm,
"hash_source": src, "hash_source": src,
"prefix": cfg.prefix,
} }
print(json.dumps(summary, ensure_ascii=False)) print(json.dumps(summary, ensure_ascii=False))