scripts/import_markdown.py aktualisiert
Some checks failed
Deploy mindnet to llm-node / deploy (push) Failing after 2s

This commit is contained in:
Lars 2025-09-09 11:52:16 +02:00
parent 8f48701ea0
commit f4abb1d873

View File

@ -2,47 +2,45 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
""" """
Script: import_markdown.py Markdown → Qdrant (Notes, Chunks, Edges) Script: import_markdown.py Markdown → Qdrant (Notes, Chunks, Edges)
Version: 3.2.0 Version: 3.3.1
Datum: 2025-09-09 Datum: 2025-09-09
Kurzbeschreibung Kurzbeschreibung
---------------- ----------------
Liest Markdown-Dateien aus einem Vault ein und schreibt Notes, Chunks und Edges - Liest Markdown-Dateien ein, erzeugt Notes/Chunks/Edges idempotent.
idempotent nach Qdrant. Change-Detection basiert standardmäßig auf dem **Body-Hash**. - Change-Detection über Body-Hash (CLI/ENV steuerbar).
Neu: Hash-Modus und Normalisierung sind auch per **CLI** steuerbar. - Edges werden zentral über app.core.edges.build_edges_for_note erzeugt
(neues Schema; plus note_id als Owner).
- Legt bei Start sinnvolle Payload-Indizes in Qdrant an.
ENV / Qdrant ENV / Qdrant
------------ ------------
- QDRANT_URL (oder QDRANT_HOST/QDRANT_PORT) - QDRANT_URL | QDRANT_HOST/QDRANT_PORT | QDRANT_API_KEY
- QDRANT_API_KEY (optional)
- COLLECTION_PREFIX (Default: mindnet) - COLLECTION_PREFIX (Default: mindnet)
- VECTOR_DIM (Default: 384) - VECTOR_DIM (Default: 384)
- MINDNET_HASH_MODE: "body" (Default) | "frontmatter" | "body+frontmatter" - MINDNET_HASH_MODE: "body" | "frontmatter" | "body+frontmatter" (Default: body)
- MINDNET_HASH_NORMALIZE: "canonical" (Default) | "none" - MINDNET_HASH_NORMALIZE: "canonical" | "none" (Default: canonical)
- MINDNET_NOTE_SCOPE_REFS: "true"|"false" (Default: false)
CLI (übersteuert ENV) CLI (übersteuert ENV)
--------------------- ---------------------
--hash-mode body|frontmatter|body+frontmatter --hash-mode body|frontmatter|body+frontmatter
--hash-normalize canonical|none --hash-normalize canonical|none
--note-scope-refs
Aufruf Aufruf
------ ------
python3 -m scripts.import_markdown --vault ./vault python3 -m scripts.import_markdown --vault ./vault --apply
python3 -m scripts.import_markdown --vault ./vault --apply python3 -m scripts.import_markdown --vault ./vault --apply --purge-before-upsert
python3 -m scripts.import_markdown --vault ./vault --apply --purge-before-upsert python3 -m scripts.import_markdown --vault ./vault --apply --hash-normalize none
python3 -m scripts.import_markdown --vault ./vault --note-id 20250821-foo --apply
python3 -m scripts.import_markdown --vault ./vault --apply --embed-note
# Feingranulare Erkennung (jede Kleinigkeit im Body zählt):
python3 -m scripts.import_markdown --vault ./vault --hash-normalize none
""" """
from __future__ import annotations from __future__ import annotations
import argparse import argparse
import json import json
import os import os
import sys import sys
from typing import Dict, List, Optional, Tuple from typing import List, Tuple, Optional
from dotenv import load_dotenv from dotenv import load_dotenv
from qdrant_client.http import models as rest from qdrant_client.http import models as rest
@ -51,12 +49,17 @@ from app.core.parser import (
read_markdown, read_markdown,
normalize_frontmatter, normalize_frontmatter,
validate_required_frontmatter, validate_required_frontmatter,
extract_wikilinks,
) )
from app.core.note_payload import make_note_payload from app.core.note_payload import make_note_payload
from app.core.chunker import assemble_chunks from app.core.chunker import assemble_chunks
from app.core.chunk_payload import make_chunk_payloads from app.core.chunk_payload import make_chunk_payloads
from app.core.qdrant import QdrantConfig, get_client, ensure_collections from app.core.edges import build_edges_for_note
from app.core.qdrant import (
QdrantConfig,
get_client,
ensure_collections,
ensure_payload_indexes,
)
from app.core.qdrant_points import ( from app.core.qdrant_points import (
points_for_chunks, points_for_chunks,
points_for_note, points_for_note,
@ -71,9 +74,9 @@ except Exception:
embed_one = None embed_one = None
# ----------------------------------------------------------------------------- # -----------------------------
# Hilfsfunktionen # Utils
# ----------------------------------------------------------------------------- # -----------------------------
def iter_md(root: str) -> List[str]: def iter_md(root: str) -> List[str]:
out: List[str] = [] out: List[str] = []
@ -94,8 +97,7 @@ def collections(prefix: str) -> Tuple[str, str, str]:
def fetch_existing_note_hash(client, prefix: str, note_id: str) -> Optional[str]: def fetch_existing_note_hash(client, prefix: str, note_id: str) -> Optional[str]:
notes_col, _, _ = collections(prefix) notes_col, _, _ = collections(prefix)
f = rest.Filter(must=[rest.FieldCondition( f = rest.Filter(must=[rest.FieldCondition(
key="note_id", key="note_id", match=rest.MatchValue(value=note_id)
match=rest.MatchValue(value=note_id),
)]) )])
points, _ = client.scroll( points, _ = client.scroll(
collection_name=notes_col, collection_name=notes_col,
@ -106,21 +108,7 @@ def fetch_existing_note_hash(client, prefix: str, note_id: str) -> Optional[str]
) )
if not points: if not points:
return None return None
pl = points[0].payload or {} return (points[0].payload or {}).get("hash_fulltext")
return pl.get("hash_fulltext")
def purge_note_artifacts(client, prefix: str, note_id: str) -> None:
    """Delete the chunk and edge points stored for a single note.

    Two delete requests are issued, both with ``wait=True``:

    * chunks collection: every point whose ``note_id`` equals *note_id*;
    * edges collection: every point for which at least one of the listed
      conditions holds, i.e. ``source_id`` or ``target_id`` either equals
      *note_id* or text-matches ``"<note_id>#"``.

    Args:
        client: Client object exposing ``delete(collection_name=...,
            points_selector=..., wait=...)``.
        prefix: Collection prefix handed to ``collections()`` to resolve the
            chunk and edge collection names.
        note_id: Identifier of the note whose artifacts are removed.
    """
    _, chunks_col, edges_col = collections(prefix)

    # Chunks are selected by an exact match on their owning note.
    owned_by_note = rest.FieldCondition(
        key="note_id",
        match=rest.MatchValue(value=note_id),
    )
    client.delete(
        collection_name=chunks_col,
        points_selector=rest.Filter(must=[owned_by_note]),
        wait=True,
    )

    # Edges are selected if either endpoint refers to the note itself or
    # matches the "<note_id>#" text (presumably chunk ids of the note;
    # NOTE(review): confirm against the edge id scheme).
    chunk_ref_text = f"{note_id}#"
    edge_conditions = [
        rest.FieldCondition(key="source_id", match=rest.MatchText(text=chunk_ref_text)),
        rest.FieldCondition(key="target_id", match=rest.MatchText(text=chunk_ref_text)),
        rest.FieldCondition(key="source_id", match=rest.MatchValue(value=note_id)),
        rest.FieldCondition(key="target_id", match=rest.MatchValue(value=note_id)),
    ]
    client.delete(
        collection_name=edges_col,
        points_selector=rest.Filter(should=edge_conditions),
        wait=True,
    )
def _normalize_rel_path(abs_path: str, vault_root: str) -> str: def _normalize_rel_path(abs_path: str, vault_root: str) -> str:
try: try:
@ -130,9 +118,9 @@ def _normalize_rel_path(abs_path: str, vault_root: str) -> str:
return rel.replace("\\", "/").lstrip("/") return rel.replace("\\", "/").lstrip("/")
# ----------------------------------------------------------------------------- # -----------------------------
# Main # Main
# ----------------------------------------------------------------------------- # -----------------------------
def main() -> None: def main() -> None:
load_dotenv() load_dotenv()
@ -145,15 +133,19 @@ def main() -> None:
ap.add_argument("--embed-note", action="store_true", help="Optional: Note-Volltext einbetten") ap.add_argument("--embed-note", action="store_true", help="Optional: Note-Volltext einbetten")
ap.add_argument("--force-replace", action="store_true", ap.add_argument("--force-replace", action="store_true",
help="Änderungserkennung ignorieren und immer upserten (+ optional Purge)") help="Änderungserkennung ignorieren und immer upserten (+ optional Purge)")
# NEU: Hash-Steuerung per CLI
ap.add_argument("--hash-mode", choices=["body", "frontmatter", "body+frontmatter"], default=None) ap.add_argument("--hash-mode", choices=["body", "frontmatter", "body+frontmatter"], default=None)
ap.add_argument("--hash-normalize", choices=["canonical", "none"], default=None) ap.add_argument("--hash-normalize", choices=["canonical", "none"], default=None)
ap.add_argument("--note-scope-refs", action="store_true",
help="(Optional) erzeugt zusätzlich references:note (Default: aus)")
args = ap.parse_args() args = ap.parse_args()
note_scope_refs_env = (os.environ.get("MINDNET_NOTE_SCOPE_REFS", "false").strip().lower() == "true")
note_scope_refs = args.note_scope_refs or note_scope_refs_env
cfg = QdrantConfig.from_env() cfg = QdrantConfig.from_env()
client = get_client(cfg) client = get_client(cfg)
ensure_collections(client, cfg.prefix, cfg.dim) ensure_collections(client, cfg.prefix, cfg.dim)
notes_col, chunks_col, edges_col = collections(cfg.prefix) ensure_payload_indexes(client, cfg.prefix) # <— Neu: Indizes
root = os.path.abspath(args.vault) root = os.path.abspath(args.vault)
files = iter_md(root) files = iter_md(root)
@ -177,9 +169,9 @@ def main() -> None:
processed += 1 processed += 1
# Note-Payload (mit expliziten Hash-Parametern) # Note-Payload inkl. Hash-Steuerung (per CLI/ENV)
note_pl = make_note_payload(parsed, vault_root=root, note_pl = make_note_payload(parsed, vault_root=root,
hash_mode=args.hash_mode, hash_normalize=args.hash_normalize) hash_mode=args.hash_mode, hash_normalize=args.hash_normalize) # type: ignore[arg-type]
if "fulltext" not in (note_pl or {}): if "fulltext" not in (note_pl or {}):
note_pl["fulltext"] = parsed.body or "" note_pl["fulltext"] = parsed.body or ""
if note_pl.get("path"): if note_pl.get("path"):
@ -200,35 +192,18 @@ def main() -> None:
chunks = assemble_chunks(fm["id"], parsed.body, fm.get("type", "concept")) chunks = assemble_chunks(fm["id"], parsed.body, fm.get("type", "concept"))
chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks) chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks)
if embed_texts: if embed_texts:
vecs = embed_texts([getattr(c, "text", "") for c in chunks]) vecs = embed_texts([getattr(c, "text", "") for c in chunks]) # type: ignore[attr-defined]
else: else:
vecs = [[0.0] * cfg.dim for _ in chunks] vecs = [[0.0] * cfg.dim for _ in chunks]
# Edges (leichtgewichtig direkt hier ableiten) # Edges (nur NEUES Schema, mit note_id als Owner)
# belongs_to / prev/next aus Chunk-Nachbarschaften + Wikilinks als references note_refs = note_pl.get("references") or []
edges: List[Dict] = [] edges = build_edges_for_note(
for ch in chunk_pls: note_id,
cid = ch["id"] chunk_pls,
edges.append({"src_id": cid, "dst_id": note_id, "edge_type": "belongs_to", "scope": "chunk"}) note_refs,
nb = ch.get("neighbors") or {} include_note_scope_refs=note_scope_refs,
if nb.get("prev"): )
edges.append({"src_id": nb["prev"], "dst_id": cid, "edge_type": "next", "scope": "chunk"})
edges.append({"src_id": cid, "dst_id": nb["prev"], "edge_type": "prev", "scope": "chunk"})
if nb.get("next"):
edges.append({"src_id": cid, "dst_id": nb["next"], "edge_type": "next", "scope": "chunk"})
edges.append({"src_id": nb["next"], "dst_id": cid, "edge_type": "prev", "scope": "chunk"})
for ref in (ch.get("references") or []):
tid = ref.get("target_id")
if tid:
edges.append({"src_id": cid, "dst_id": tid, "edge_type": "references", "scope": "chunk"})
for tid in (note_pl.get("references") or []):
edges.append({"src_id": note_id, "dst_id": tid, "edge_type": "references", "scope": "note"})
edges.append({"src_id": tid, "dst_id": note_id, "edge_type": "backlink", "scope": "note"})
# Dedupe
_uniq = {}
for e in edges:
_uniq[(e["src_id"], e["dst_id"], e["edge_type"], e.get("scope", ""))] = e
edges = list(_uniq.values())
# Zusammenfassung # Zusammenfassung
summary = { summary = {
@ -243,8 +218,7 @@ def main() -> None:
"path": note_pl["path"], "path": note_pl["path"],
"hash_mode": args.hash_mode or os.environ.get("MINDNET_HASH_MODE", "body"), "hash_mode": args.hash_mode or os.environ.get("MINDNET_HASH_MODE", "body"),
"hash_normalize": args.hash_normalize or os.environ.get("MINDNET_HASH_NORMALIZE", "canonical"), "hash_normalize": args.hash_normalize or os.environ.get("MINDNET_HASH_NORMALIZE", "canonical"),
"hash_old": old_hash, "note_scope_refs": note_scope_refs,
"hash_new": new_hash,
} }
print(json.dumps(summary, ensure_ascii=False)) print(json.dumps(summary, ensure_ascii=False))
@ -252,8 +226,14 @@ def main() -> None:
continue continue
if changed and args.purge_before_upsert: if changed and args.purge_before_upsert:
purge_note_artifacts(client, cfg.prefix, note_id) # gezieltes Löschen: jetzt performant per note_id-Index
_, chunks_col, edges_col = collections(cfg.prefix)
f_chunks = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))])
client.delete(collection_name=chunks_col, points_selector=f_chunks, wait=True)
f_edges = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))])
client.delete(collection_name=edges_col, points_selector=f_edges, wait=True)
# Upsert Notes / Chunks / Edges
notes_name, note_pts = points_for_note(cfg.prefix, note_pl, None, cfg.dim) notes_name, note_pts = points_for_note(cfg.prefix, note_pl, None, cfg.dim)
upsert_batch(client, notes_name, note_pts) upsert_batch(client, notes_name, note_pts)