scripts/import_markdown.py updated

Lars 2025-09-24 12:14:56 +02:00
parent 13e510dd9c
commit 54197995f5


@@ -2,8 +2,8 @@
# -*- coding: utf-8 -*-
"""
Script: scripts/import_markdown.py - Markdown → Qdrant (Notes, Chunks, Edges)
Version: 3.6.2
Date: 2025-09-09
Version: 3.7.0
Date: 2025-09-10
Overview
--------
@@ -11,32 +11,34 @@ Overview
- Robust change detection: multiple hashes are stored side by side on the note
(Option C). Comparison is **mode-exact** against `hashes[<mode>:<source>:<normalize>]`,
so switching the comparison mode does **not** trigger mass updates (an illustrative payload sketch follows this list).
- **Fix (v3.6.2):** On the **first import** (no previous payload) the note is treated as **changed**,
so the create step (Notes/Chunks/Edges) runs reliably.
- Baseline mode: `--baseline-modes` silently backfills **missing** hash variants
in the `hashes` field (upserts Notes ONLY; legacy hash fields stay untouched).
- **First-import fix:** With an empty Qdrant, the create case automatically counts as changed.
- **--baseline-modes:** silently backfill missing hash variants (upsert Notes only).
- **--sync-deletes:** safely delete points in Qdrant that are missing from the vault (dry run + apply).
- **--prefix**: CLI override for COLLECTION_PREFIX.
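An illustrative sketch of the stored payload field (the keys follow the
<mode>:<source>:<normalize> scheme described above; the values are placeholder digests):

    hashes = {
        "body:parsed:canonical": "<sha256>",
        "frontmatter:parsed:canonical": "<sha256>",
        "full:raw:none": "<sha256>",
    }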
Configuration: hash/compare
Hash/compare configuration
--------------------------
- Comparison mode: `--hash-mode body|frontmatter|full`
- or ENV: `MINDNET_HASH_MODE` / `MINDNET_HASH_COMPARE` (Body|Frontmatter|Full)
- Source: `--hash-source parsed|raw` (ENV: `MINDNET_HASH_SOURCE`, default parsed)
- Normalization: `--hash-normalize canonical|none` (ENV: `MINDNET_HASH_NORMALIZE`, default canonical)
- Comparison mode: `--hash-mode body|frontmatter|full` or ENV `MINDNET_HASH_MODE|MINDNET_HASH_COMPARE` (see the example below this list)
- Source: `--hash-source parsed|raw` (ENV: MINDNET_HASH_SOURCE, default parsed)
- Normalization: `--hash-normalize canonical|none` (ENV: MINDNET_HASH_NORMALIZE, default canonical)
- Optional: `--compare-text` (or `MINDNET_COMPARE_TEXT=true`) additionally compares the parsed body text directly.
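Example (illustrative; the resulting key corresponds to `key_current` in main()):

    --hash-mode body --hash-source parsed --hash-normalize canonical
    -> comparison runs against hashes["body:parsed:canonical"]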
Other ENV / Qdrant
--------------------
Qdrant / ENV
------------
- QDRANT_URL | QDRANT_HOST/QDRANT_PORT | QDRANT_API_KEY
- COLLECTION_PREFIX (Default: mindnet)
- COLLECTION_PREFIX (default: mindnet), can be overridden via `--prefix` (illustrative values below)
- VECTOR_DIM (Default: 384)
- MINDNET_NOTE_SCOPE_REFS: true|false (Default: false)
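Illustrative environment (placeholder values; the URL is only an assumed local default):

    QDRANT_URL=http://localhost:6333
    COLLECTION_PREFIX=mindnet
    VECTOR_DIM=384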
Invocation examples
----------------
Examples
---------
# Standard (Body, parsed, canonical)
python3 -m scripts.import_markdown --vault ./vault
# Baseline (silently backfills `hashes` for the parsed:canonical triple)
# First import after truncate (create case)
python3 -m scripts.import_markdown --vault ./vault --apply --purge-before-upsert
# Silently backfill baseline hashes for parsed:canonical
MINDNET_HASH_SOURCE=parsed MINDNET_HASH_NORMALIZE=canonical \
python3 -m scripts.import_markdown --vault ./vault --apply --baseline-modes
@@ -44,18 +46,20 @@ Invocation examples
MINDNET_HASH_COMPARE=Frontmatter \
python3 -m scripts.import_markdown --vault ./vault
# Very sensitive (raw + none) with direct text comparison
python3 -m scripts.import_markdown --vault ./vault --apply --hash-source raw --hash-normalize none --compare-text
# Sync deletes (dry run → apply)
python3 -m scripts.import_markdown --vault ./vault --sync-deletes
python3 -m scripts.import_markdown --vault ./vault --sync-deletes --apply
# Set the prefix explicitly
python3 -m scripts.import_markdown --vault ./vault --prefix mindnet
"""
from __future__ import annotations
import argparse
import difflib
import json
import os
import sys
from typing import Dict, List, Optional, Tuple, Any
from collections.abc import Mapping
from typing import Dict, List, Optional, Tuple, Any, Set
from dotenv import load_dotenv
from qdrant_client.http import models as rest
@ -68,7 +72,11 @@ from app.core.parser import (
from app.core.note_payload import make_note_payload
from app.core.chunker import assemble_chunks
from app.core.chunk_payload import make_chunk_payloads
from app.core.edges import build_edges_for_note
# Compatible with both module names:
try:
from app.core.derive_edges import build_edges_for_note
except Exception: # pragma: no cover
from app.core.edges import build_edges_for_note # type: ignore
from app.core.qdrant import (
QdrantConfig,
get_client,
@@ -87,9 +95,8 @@ try:
except Exception:
embed_texts = None
# ---------------------------------------------------------------------
# Helpers
# Helper
# ---------------------------------------------------------------------
def iter_md(root: str) -> List[str]:
@@ -122,6 +129,30 @@ def fetch_existing_note_payload(client, prefix: str, note_id: str) -> Optional[D
return None
return points[0].payload or {}
def list_qdrant_note_ids(client, prefix: str) -> Set[str]:
"""Liest alle note_ids aus der Notes-Collection (per Scroll)."""
notes_col, _, _ = collections(prefix)
out: Set[str] = set()
next_page = None
while True:
pts, next_page = client.scroll(
collection_name=notes_col,
with_payload=True,
with_vectors=False,
limit=256,
offset=next_page,
)
if not pts:
break
for p in pts:
pl = p.payload or {}
nid = pl.get("note_id")
if isinstance(nid, str):
out.add(nid)
if next_page is None:
break
return out
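# Illustrative use of list_qdrant_note_ids, mirroring the --sync-deletes flow in main()
# (the variable names below are only for illustration):
#   existing_ids = list_qdrant_note_ids(client, cfg.prefix)
#   orphans = existing_ids - vault_note_ids  # present in Qdrant, missing from the vault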
def purge_note_artifacts(client, prefix: str, note_id: str) -> None:
"""
Delete all chunks & edges belonging to a note via a filter selector (compatible with current Qdrant clients).
@@ -147,12 +178,20 @@ def purge_note_artifacts(client, prefix: str, note_id: str) -> None:
except Exception as e:
print(json.dumps({"note_id": note_id, "warn": f"delete edges via filter failed: {e}"}))
def _normalize_rel_path(abs_path: str, vault_root: str) -> str:
"""Return the path relative to the vault root, with forward slashes and no leading slash."""
try:
rel = os.path.relpath(abs_path, vault_root)
except Exception:
rel = abs_path
return rel.replace("\\", "/").lstrip("/")
def delete_note_everywhere(client, prefix: str, note_id: str) -> None:
"""Löscht Note + zugehörige Chunks + Edges per Filter."""
notes_col, chunks_col, edges_col = collections(prefix)
filt = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))])
# Order: edges/chunks first, then the note
for col in (edges_col, chunks_col, notes_col):
try:
client.delete(
collection_name=col,
points_selector=rest.FilterSelector(filter=filt),
wait=True
)
except Exception as e:
print(json.dumps({"note_id": note_id, "warn": f"delete in {col} failed: {e}"}))
def _resolve_mode(val: Optional[str]) -> str:
v = (val or os.environ.get("MINDNET_HASH_MODE") or os.environ.get("MINDNET_HASH_COMPARE") or "body").strip().lower()
@@ -165,7 +204,6 @@ def _resolve_mode(val: Optional[str]) -> str:
def _env(key: str, default: str) -> str:
return (os.environ.get(key) or default).strip().lower()
# ---------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------
@@ -189,11 +227,14 @@ def main() -> None:
ap.add_argument("--note-scope-refs", action="store_true",
help="(Optional) erzeugt zusätzlich references:note (Default: aus)")
ap.add_argument("--debug-hash-diff", action="store_true",
help="Zeigt einen kurzen Diff zwischen altem und neuem Body")
help="(reserviert) optionaler Body-Diff (nicht nötig bei Option C)")
ap.add_argument("--compare-text", action="store_true",
help="Parsed fulltext zusätzlich direkt vergleichen (über Hash hinaus)")
ap.add_argument("--baseline-modes", action="store_true",
help="Fehlende Hash-Varianten im Feld 'hashes' still nachtragen (Upsert NUR Notes)")
ap.add_argument("--sync-deletes", action="store_true",
help="Notes/Chunks/Edges löschen, die in Qdrant existieren aber im Vault fehlen (Dry-Run; mit --apply ausführen)")
ap.add_argument("--prefix", help="Collection-Prefix (überschreibt ENV COLLECTION_PREFIX)")
args = ap.parse_args()
mode = _resolve_mode(args.hash_mode) # body|frontmatter|full
@@ -204,6 +245,8 @@ def main() -> None:
compare_text = args.compare_text or (_env("MINDNET_COMPARE_TEXT", "false") == "true")
cfg = QdrantConfig.from_env()
if args.prefix:
cfg.prefix = args.prefix.strip()
client = get_client(cfg)
ensure_collections(client, cfg.prefix, cfg.dim)
ensure_payload_indexes(client, cfg.prefix)
@@ -214,6 +257,38 @@ def main() -> None:
print("Keine Markdown-Dateien gefunden.", file=sys.stderr)
sys.exit(2)
# Optional: run sync-deletes first
if args.sync_deletes:
# Collect note IDs from the vault
vault_note_ids: Set[str] = set()
for path in files:
try:
parsed = read_markdown(path)
if not parsed:
continue
fm = normalize_frontmatter(parsed.frontmatter)
nid = fm.get("id")
if isinstance(nid, str):
vault_note_ids.add(nid)
except Exception:
continue
# Collect note IDs from Qdrant
qdrant_note_ids = list_qdrant_note_ids(client, cfg.prefix)
to_delete = sorted(qdrant_note_ids - vault_note_ids)
print(json.dumps({
"action": "sync-deletes",
"prefix": cfg.prefix,
"qdrant_total": len(qdrant_note_ids),
"vault_total": len(vault_note_ids),
"to_delete_count": len(to_delete),
"to_delete": to_delete[:50] + ([""] if len(to_delete) > 50 else [])
}, ensure_ascii=False))
if args.apply and to_delete:
for nid in to_delete:
print(json.dumps({"action": "delete", "note_id": nid, "decision": "apply"}))
delete_note_everywhere(client, cfg.prefix, nid)
# Then continue with the normal import (e.g. create new vault notes)
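# Illustrative dry-run output of the block above (counts and IDs are placeholders):
#   {"action": "sync-deletes", "prefix": "mindnet", "qdrant_total": 120,
#    "vault_total": 118, "to_delete_count": 2, "to_delete": ["note-a", "note-b"]}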
key_current = f"{mode}:{src}:{norm}"
processed = 0
@@ -337,6 +412,7 @@ def main() -> None:
"hash_mode": mode,
"hash_normalize": norm,
"hash_source": src,
"prefix": cfg.prefix,
}
print(json.dumps(summary, ensure_ascii=False))