From f4abb1d873bcb32a02949157b50ea856dff525be Mon Sep 17 00:00:00 2001 From: Lars Date: Tue, 9 Sep 2025 11:52:16 +0200 Subject: [PATCH] scripts/import_markdown.py aktualisiert --- scripts/import_markdown.py | 132 ++++++++++++++++--------------------- 1 file changed, 56 insertions(+), 76 deletions(-) diff --git a/scripts/import_markdown.py b/scripts/import_markdown.py index 180a1a9..6500e0c 100644 --- a/scripts/import_markdown.py +++ b/scripts/import_markdown.py @@ -2,47 +2,45 @@ # -*- coding: utf-8 -*- """ Script: import_markdown.py — Markdown → Qdrant (Notes, Chunks, Edges) -Version: 3.2.0 +Version: 3.3.1 Datum: 2025-09-09 Kurzbeschreibung ---------------- -Liest Markdown-Dateien aus einem Vault ein und schreibt Notes, Chunks und Edges -idempotent nach Qdrant. Change-Detection basiert standardmäßig auf dem **Body-Hash**. -Neu: Hash-Modus und Normalisierung sind auch per **CLI** steuerbar. +- Liest Markdown-Dateien ein, erzeugt Notes/Chunks/Edges idempotent. +- Change-Detection über Body-Hash (CLI/ENV steuerbar). +- Edges werden zentral über app.core.edges.build_edges_for_note erzeugt + (neues Schema; plus note_id als Owner). +- Legt bei Start sinnvolle Payload-Indizes in Qdrant an. ENV / Qdrant ------------ -- QDRANT_URL (oder QDRANT_HOST/QDRANT_PORT) -- QDRANT_API_KEY (optional) +- QDRANT_URL | QDRANT_HOST/QDRANT_PORT | QDRANT_API_KEY - COLLECTION_PREFIX (Default: mindnet) - VECTOR_DIM (Default: 384) -- MINDNET_HASH_MODE: "body" (Default) | "frontmatter" | "body+frontmatter" -- MINDNET_HASH_NORMALIZE: "canonical" (Default) | "none" +- MINDNET_HASH_MODE: "body" | "frontmatter" | "body+frontmatter" (Default: body) +- MINDNET_HASH_NORMALIZE: "canonical" | "none" (Default: canonical) +- MINDNET_NOTE_SCOPE_REFS: "true"|"false" (Default: false) CLI (übersteuert ENV) --------------------- - --hash-mode body|frontmatter|body+frontmatter - --hash-normalize canonical|none + --hash-mode body|frontmatter|body+frontmatter + --hash-normalize canonical|none + --note-scope-refs Aufruf ------ - python3 -m scripts.import_markdown --vault ./vault - python3 -m scripts.import_markdown --vault ./vault --apply - python3 -m scripts.import_markdown --vault ./vault --apply --purge-before-upsert - python3 -m scripts.import_markdown --vault ./vault --note-id 20250821-foo --apply - python3 -m scripts.import_markdown --vault ./vault --apply --embed-note - # Feingranulare Erkennung (jede Kleinigkeit im Body zählt): - python3 -m scripts.import_markdown --vault ./vault --hash-normalize none + python3 -m scripts.import_markdown --vault ./vault --apply + python3 -m scripts.import_markdown --vault ./vault --apply --purge-before-upsert + python3 -m scripts.import_markdown --vault ./vault --apply --hash-normalize none """ from __future__ import annotations - import argparse import json import os import sys -from typing import Dict, List, Optional, Tuple +from typing import List, Tuple, Optional from dotenv import load_dotenv from qdrant_client.http import models as rest @@ -51,12 +49,17 @@ from app.core.parser import ( read_markdown, normalize_frontmatter, validate_required_frontmatter, - extract_wikilinks, ) from app.core.note_payload import make_note_payload from app.core.chunker import assemble_chunks from app.core.chunk_payload import make_chunk_payloads -from app.core.qdrant import QdrantConfig, get_client, ensure_collections +from app.core.edges import build_edges_for_note +from app.core.qdrant import ( + QdrantConfig, + get_client, + ensure_collections, + ensure_payload_indexes, +) from app.core.qdrant_points import ( points_for_chunks, points_for_note, @@ -71,9 +74,9 @@ except Exception: embed_one = None -# ----------------------------------------------------------------------------- -# Hilfsfunktionen -# ----------------------------------------------------------------------------- +# ----------------------------- +# Utils +# ----------------------------- def iter_md(root: str) -> List[str]: out: List[str] = [] @@ -94,8 +97,7 @@ def collections(prefix: str) -> Tuple[str, str, str]: def fetch_existing_note_hash(client, prefix: str, note_id: str) -> Optional[str]: notes_col, _, _ = collections(prefix) f = rest.Filter(must=[rest.FieldCondition( - key="note_id", - match=rest.MatchValue(value=note_id), + key="note_id", match=rest.MatchValue(value=note_id) )]) points, _ = client.scroll( collection_name=notes_col, @@ -106,21 +108,7 @@ def fetch_existing_note_hash(client, prefix: str, note_id: str) -> Optional[str] ) if not points: return None - pl = points[0].payload or {} - return pl.get("hash_fulltext") - -def purge_note_artifacts(client, prefix: str, note_id: str) -> None: - _, chunks_col, edges_col = collections(prefix) - f_chunks = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))]) - client.delete(collection_name=chunks_col, points_selector=f_chunks, wait=True) - should = [ - rest.FieldCondition(key="source_id", match=rest.MatchText(text=f"{note_id}#")), - rest.FieldCondition(key="target_id", match=rest.MatchText(text=f"{note_id}#")), - rest.FieldCondition(key="source_id", match=rest.MatchValue(value=note_id)), - rest.FieldCondition(key="target_id", match=rest.MatchValue(value=note_id)), - ] - f_edges = rest.Filter(should=should) - client.delete(collection_name=edges_col, points_selector=f_edges, wait=True) + return (points[0].payload or {}).get("hash_fulltext") def _normalize_rel_path(abs_path: str, vault_root: str) -> str: try: @@ -130,9 +118,9 @@ def _normalize_rel_path(abs_path: str, vault_root: str) -> str: return rel.replace("\\", "/").lstrip("/") -# ----------------------------------------------------------------------------- +# ----------------------------- # Main -# ----------------------------------------------------------------------------- +# ----------------------------- def main() -> None: load_dotenv() @@ -145,15 +133,19 @@ def main() -> None: ap.add_argument("--embed-note", action="store_true", help="Optional: Note-Volltext einbetten") ap.add_argument("--force-replace", action="store_true", help="Änderungserkennung ignorieren und immer upserten (+ optional Purge)") - # NEU: Hash-Steuerung per CLI ap.add_argument("--hash-mode", choices=["body", "frontmatter", "body+frontmatter"], default=None) ap.add_argument("--hash-normalize", choices=["canonical", "none"], default=None) + ap.add_argument("--note-scope-refs", action="store_true", + help="(Optional) erzeugt zusätzlich references:note (Default: aus)") args = ap.parse_args() + note_scope_refs_env = (os.environ.get("MINDNET_NOTE_SCOPE_REFS", "false").strip().lower() == "true") + note_scope_refs = args.note_scope_refs or note_scope_refs_env + cfg = QdrantConfig.from_env() client = get_client(cfg) ensure_collections(client, cfg.prefix, cfg.dim) - notes_col, chunks_col, edges_col = collections(cfg.prefix) + ensure_payload_indexes(client, cfg.prefix) # <— Neu: Indizes root = os.path.abspath(args.vault) files = iter_md(root) @@ -177,9 +169,9 @@ def main() -> None: processed += 1 - # Note-Payload (mit expliziten Hash-Parametern) + # Note-Payload inkl. Hash-Steuerung (per CLI/ENV) note_pl = make_note_payload(parsed, vault_root=root, - hash_mode=args.hash_mode, hash_normalize=args.hash_normalize) + hash_mode=args.hash_mode, hash_normalize=args.hash_normalize) # type: ignore[arg-type] if "fulltext" not in (note_pl or {}): note_pl["fulltext"] = parsed.body or "" if note_pl.get("path"): @@ -200,35 +192,18 @@ def main() -> None: chunks = assemble_chunks(fm["id"], parsed.body, fm.get("type", "concept")) chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks) if embed_texts: - vecs = embed_texts([getattr(c, "text", "") for c in chunks]) + vecs = embed_texts([getattr(c, "text", "") for c in chunks]) # type: ignore[attr-defined] else: vecs = [[0.0] * cfg.dim for _ in chunks] - # Edges (leichtgewichtig direkt hier ableiten) - # belongs_to / prev/next aus Chunk-Nachbarschaften + Wikilinks als references - edges: List[Dict] = [] - for ch in chunk_pls: - cid = ch["id"] - edges.append({"src_id": cid, "dst_id": note_id, "edge_type": "belongs_to", "scope": "chunk"}) - nb = ch.get("neighbors") or {} - if nb.get("prev"): - edges.append({"src_id": nb["prev"], "dst_id": cid, "edge_type": "next", "scope": "chunk"}) - edges.append({"src_id": cid, "dst_id": nb["prev"], "edge_type": "prev", "scope": "chunk"}) - if nb.get("next"): - edges.append({"src_id": cid, "dst_id": nb["next"], "edge_type": "next", "scope": "chunk"}) - edges.append({"src_id": nb["next"], "dst_id": cid, "edge_type": "prev", "scope": "chunk"}) - for ref in (ch.get("references") or []): - tid = ref.get("target_id") - if tid: - edges.append({"src_id": cid, "dst_id": tid, "edge_type": "references", "scope": "chunk"}) - for tid in (note_pl.get("references") or []): - edges.append({"src_id": note_id, "dst_id": tid, "edge_type": "references", "scope": "note"}) - edges.append({"src_id": tid, "dst_id": note_id, "edge_type": "backlink", "scope": "note"}) - # Dedupe - _uniq = {} - for e in edges: - _uniq[(e["src_id"], e["dst_id"], e["edge_type"], e.get("scope", ""))] = e - edges = list(_uniq.values()) + # Edges (nur NEUES Schema, mit note_id als Owner) + note_refs = note_pl.get("references") or [] + edges = build_edges_for_note( + note_id, + chunk_pls, + note_refs, + include_note_scope_refs=note_scope_refs, + ) # Zusammenfassung summary = { @@ -243,8 +218,7 @@ def main() -> None: "path": note_pl["path"], "hash_mode": args.hash_mode or os.environ.get("MINDNET_HASH_MODE", "body"), "hash_normalize": args.hash_normalize or os.environ.get("MINDNET_HASH_NORMALIZE", "canonical"), - "hash_old": old_hash, - "hash_new": new_hash, + "note_scope_refs": note_scope_refs, } print(json.dumps(summary, ensure_ascii=False)) @@ -252,8 +226,14 @@ def main() -> None: continue if changed and args.purge_before_upsert: - purge_note_artifacts(client, cfg.prefix, note_id) + # gezieltes Löschen: jetzt performant per note_id-Index + _, chunks_col, edges_col = collections(cfg.prefix) + f_chunks = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))]) + client.delete(collection_name=chunks_col, points_selector=f_chunks, wait=True) + f_edges = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))]) + client.delete(collection_name=edges_col, points_selector=f_edges, wait=True) + # Upsert Notes / Chunks / Edges notes_name, note_pts = points_for_note(cfg.prefix, note_pl, None, cfg.dim) upsert_batch(client, notes_name, note_pts)