scripts/import_markdown.py aktualisiert
Some checks failed
Deploy mindnet to llm-node / deploy (push) Failing after 2s

This commit is contained in:
Lars 2025-09-09 12:40:54 +02:00
parent 5bf5316af5
commit d7e5f398f0

View File

@ -1,46 +1,43 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
""" """
Script: import_markdown.py Markdown → Qdrant (Notes, Chunks, Edges) Script: scripts/import_markdown.py Markdown → Qdrant (Notes, Chunks, Edges)
Version: 3.3.1 Version: 3.4.1
Datum: 2025-09-09 Datum: 2025-09-09
Kurzbeschreibung Kurzbeschreibung
---------------- ----------------
- Liest Markdown-Dateien ein, erzeugt Notes/Chunks/Edges idempotent. - Liest Markdown-Dateien ein, erzeugt Notes/Chunks/Edges **idempotent**.
- Change-Detection über Body-Hash (CLI/ENV steuerbar). - Change-Detection (nur **Inhalte**, keine FS-Zeitstempel) konfigurierbar:
- Edges werden zentral über app.core.edges.build_edges_for_note erzeugt * ``--hash-mode``: body | frontmatter | body+frontmatter | full (Alias)
(neues Schema; plus note_id als Owner). - Env: ``MINDNET_HASH_MODE`` **oder** ``MINDNET_HASH_COMPARE`` (Body/Frontmatter/Full)
- Legt bei Start sinnvolle Payload-Indizes in Qdrant an. * ``--hash-normalize``: canonical | none (Default: canonical)
* ``--hash-source``: parsed (Default) | raw
- "raw" hasht den **ungeparsten** Body aus der Datei (Frontmatter vorher entfernt).
- Optionales Diff-Logging: ``--debug-hash-diff`` zeigt bei Bedarf einen kompakten Diff.
ENV / Qdrant ENV / Qdrant
------------ ------------
- QDRANT_URL | QDRANT_HOST/QDRANT_PORT | QDRANT_API_KEY - QDRANT_URL | QDRANT_HOST/QDRANT_PORT | QDRANT_API_KEY
- COLLECTION_PREFIX (Default: mindnet) - COLLECTION_PREFIX (Default: mindnet)
- VECTOR_DIM (Default: 384) - VECTOR_DIM (Default: 384)
- MINDNET_HASH_MODE: "body" | "frontmatter" | "body+frontmatter" (Default: body) - MINDNET_NOTE_SCOPE_REFS: true|false (Default: false)
- MINDNET_HASH_NORMALIZE: "canonical" | "none" (Default: canonical)
- MINDNET_NOTE_SCOPE_REFS: "true"|"false" (Default: false)
CLI (übersteuert ENV)
---------------------
--hash-mode body|frontmatter|body+frontmatter
--hash-normalize canonical|none
--note-scope-refs
Aufruf Aufruf
------ ------
python3 -m scripts.import_markdown --vault ./vault --apply python3 -m scripts.import_markdown --vault ./vault
python3 -m scripts.import_markdown --vault ./vault --apply --purge-before-upsert python3 -m scripts.import_markdown --vault ./vault --apply
python3 -m scripts.import_markdown --vault ./vault --apply --hash-normalize none python3 -m scripts.import_markdown --vault ./vault --apply --hash-source raw --hash-normalize none
MINDNET_HASH_COMPARE=Full python3 -m scripts.import_markdown --vault ./vault --apply
""" """
from __future__ import annotations from __future__ import annotations
import argparse import argparse
import difflib
import json import json
import os import os
import sys import sys
from typing import List, Tuple, Optional from typing import Dict, List, Optional, Tuple
from dotenv import load_dotenv from dotenv import load_dotenv
from qdrant_client.http import models as rest from qdrant_client.http import models as rest
@ -68,15 +65,14 @@ from app.core.qdrant_points import (
) )
try: try:
from app.core.embed import embed_texts, embed_one # optional from app.core.embed import embed_texts # optional
except Exception: except Exception:
embed_texts = None embed_texts = None
embed_one = None
# ----------------------------- # ---------------------------------------------------------------------
# Utils # Helpers
# ----------------------------- # ---------------------------------------------------------------------
def iter_md(root: str) -> List[str]: def iter_md(root: str) -> List[str]:
out: List[str] = [] out: List[str] = []
@ -94,11 +90,9 @@ def iter_md(root: str) -> List[str]:
def collections(prefix: str) -> Tuple[str, str, str]:
    """Map a collection *prefix* to its (notes, chunks, edges) Qdrant collection names."""
    notes_col = f"{prefix}_notes"
    chunks_col = f"{prefix}_chunks"
    edges_col = f"{prefix}_edges"
    return notes_col, chunks_col, edges_col
def fetch_existing_note_hash(client, prefix: str, note_id: str) -> Optional[str]: def fetch_existing_note_payload(client, prefix: str, note_id: str) -> Optional[Dict]:
notes_col, _, _ = collections(prefix) notes_col, _, _ = collections(prefix)
f = rest.Filter(must=[rest.FieldCondition( f = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))])
key="note_id", match=rest.MatchValue(value=note_id)
)])
points, _ = client.scroll( points, _ = client.scroll(
collection_name=notes_col, collection_name=notes_col,
scroll_filter=f, scroll_filter=f,
@ -108,7 +102,14 @@ def fetch_existing_note_hash(client, prefix: str, note_id: str) -> Optional[str]
) )
if not points: if not points:
return None return None
return (points[0].payload or {}).get("hash_fulltext") return points[0].payload or {}
def purge_note_artifacts(client, prefix: str, note_id: str) -> None:
    """Delete all chunk and edge points owned by *note_id*.

    Called before re-upserting a changed note so stale chunks/edges do not
    linger. The notes collection itself is left untouched — the note point
    is overwritten by the subsequent upsert.

    Args:
        client: Qdrant client instance.
        prefix: Collection prefix (e.g. ``mindnet``).
        note_id: Owner id stored in the ``note_id`` payload field.
    """
    _, chunks_col, edges_col = collections(prefix)
    # Both collections key their points on the same owner field, so a single
    # filter object serves both delete calls (original built it twice).
    owner_filter = rest.Filter(
        must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))]
    )
    for col in (chunks_col, edges_col):
        client.delete(collection_name=col, points_selector=owner_filter, wait=True)
def _normalize_rel_path(abs_path: str, vault_root: str) -> str: def _normalize_rel_path(abs_path: str, vault_root: str) -> str:
try: try:
@ -118,9 +119,9 @@ def _normalize_rel_path(abs_path: str, vault_root: str) -> str:
return rel.replace("\\", "/").lstrip("/") return rel.replace("\\", "/").lstrip("/")
# ----------------------------- # ---------------------------------------------------------------------
# Main # Main
# ----------------------------- # ---------------------------------------------------------------------
def main() -> None: def main() -> None:
load_dotenv() load_dotenv()
@ -133,19 +134,25 @@ def main() -> None:
ap.add_argument("--embed-note", action="store_true", help="Optional: Note-Volltext einbetten") ap.add_argument("--embed-note", action="store_true", help="Optional: Note-Volltext einbetten")
ap.add_argument("--force-replace", action="store_true", ap.add_argument("--force-replace", action="store_true",
help="Änderungserkennung ignorieren und immer upserten (+ optional Purge)") help="Änderungserkennung ignorieren und immer upserten (+ optional Purge)")
ap.add_argument("--hash-mode", choices=["body", "frontmatter", "body+frontmatter"], default=None) ap.add_argument("--hash-mode", choices=["body", "frontmatter", "body+frontmatter", "full"], default=None,
help="Vergleichsmodus: Body | Frontmatter | body+frontmatter (Alias: full)")
ap.add_argument("--hash-normalize", choices=["canonical", "none"], default=None) ap.add_argument("--hash-normalize", choices=["canonical", "none"], default=None)
ap.add_argument("--hash-source", choices=["parsed", "raw"], default=None,
help="Quelle für die Hash-Berechnung (Default: parsed)")
ap.add_argument("--note-scope-refs", action="store_true", ap.add_argument("--note-scope-refs", action="store_true",
help="(Optional) erzeugt zusätzlich references:note (Default: aus)") help="(Optional) erzeugt zusätzlich references:note (Default: aus)")
ap.add_argument("--debug-hash-diff", action="store_true",
help="Zeigt bei Bedarf einen kurzen Diff zwischen altem und neuem Body")
args = ap.parse_args() args = ap.parse_args()
note_scope_refs_env = (os.environ.get("MINDNET_NOTE_SCOPE_REFS", "false").strip().lower() == "true") note_scope_refs_env = (os.environ.get("MINDNET_NOTE_SCOPE_REFS", "false").strip().lower() == "true")
note_scope_refs = args.note_scope_refs or note_scope_refs_env note_scope_refs = args.note_sCope_refs if hasattr(args, "note_sCope_refs") else args.note_scope_refs # defensive
note_scope_refs = note_scope_refs or note_scope_refs_env
cfg = QdrantConfig.from_env() cfg = QdrantConfig.from_env()
client = get_client(cfg) client = get_client(cfg)
ensure_collections(client, cfg.prefix, cfg.dim) ensure_collections(client, cfg.prefix, cfg.dim)
ensure_payload_indexes(client, cfg.prefix) # <— Neu: Indizes ensure_payload_indexes(client, cfg.prefix)
root = os.path.abspath(args.vault) root = os.path.abspath(args.vault)
files = iter_md(root) files = iter_md(root)
@ -169,9 +176,16 @@ def main() -> None:
processed += 1 processed += 1
# Note-Payload inkl. Hash-Steuerung (per CLI/ENV) # Note-Payload (inkl. Hash-Steuerung & Quelle)
note_pl = make_note_payload(parsed, vault_root=root, note_pl = make_note_payload(
hash_mode=args.hash_mode, hash_normalize=args.hash_normalize) # type: ignore[arg-type] parsed,
vault_root=root,
hash_mode=args.hash_mode,
hash_normalize=args.hash_normalize,
hash_source=args.hash_source,
file_path=path,
)
if "fulltext" not in (note_pl or {}): if "fulltext" not in (note_pl or {}):
note_pl["fulltext"] = parsed.body or "" note_pl["fulltext"] = parsed.body or ""
if note_pl.get("path"): if note_pl.get("path"):
@ -183,11 +197,35 @@ def main() -> None:
note_id = note_pl["note_id"] note_id = note_pl["note_id"]
# Change-Detection # Change-Detection (nur Inhalte, keine FS-Timestamps)
old_payload = None if args.force_replace else fetch_existing_note_payload(client, cfg.prefix, note_id)
old_hash = None if not old_payload else old_payload.get("hash_fulltext")
new_hash = note_pl.get("hash_fulltext") new_hash = note_pl.get("hash_fulltext")
old_hash = None if args.force_replace else fetch_existing_note_hash(client, cfg.prefix, note_id)
changed = args.force_replace or (old_hash != new_hash) changed = args.force_replace or (old_hash != new_hash)
# Optionales Debugging: kompakten Diff anzeigen
if args.debug_hash_diff:
old_text = (old_payload or {}).get("fulltext") or ""
new_text = note_pl.get("fulltext") or ""
# Wenn Hash gleich, aber Text verschieden → Hinweis auf Normalisierung/Quelle
if (old_hash == new_hash) and old_text != new_text:
print(json.dumps({
"debug": "hash_equal_but_text_differs",
"note_id": note_id,
"hash_mode": args.hash_mode or os.environ.get("MINDNET_HASH_MODE") or os.environ.get("MINDNET_HASH_COMPARE", "body"),
"hash_normalize": args.hash_normalize or os.environ.get("MINDNET_HASH_NORMALIZE", "canonical"),
"hash_source": args.hash_source or os.environ.get("MINDNET_HASH_SOURCE", "parsed"),
}, ensure_ascii=False))
if old_text and new_text:
ud = list(difflib.unified_diff(
old_text.splitlines(), new_text.splitlines(),
fromfile="qdrant_fulltext(old)", tofile="vault_body(new)",
n=3
))
if ud:
preview = "\n".join(ud[:50])
print(json.dumps({"note_id": note_id, "diff_preview": preview}, ensure_ascii=False))
# Chunks + Embeddings (Nullvektor-Fallback) # Chunks + Embeddings (Nullvektor-Fallback)
chunks = assemble_chunks(fm["id"], parsed.body, fm.get("type", "concept")) chunks = assemble_chunks(fm["id"], parsed.body, fm.get("type", "concept"))
chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks) chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks)
@ -196,7 +234,7 @@ def main() -> None:
else: else:
vecs = [[0.0] * cfg.dim for _ in chunks] vecs = [[0.0] * cfg.dim for _ in chunks]
# Edges (nur NEUES Schema, mit note_id als Owner) # Edges (neues Schema, mit note_id als Owner)
note_refs = note_pl.get("references") or [] note_refs = note_pl.get("references") or []
edges = build_edges_for_note( edges = build_edges_for_note(
note_id, note_id,
@ -205,7 +243,7 @@ def main() -> None:
include_note_scope_refs=note_scope_refs, include_note_scope_refs=note_scope_refs,
) )
# Zusammenfassung # Zusammenfassung pro Datei
summary = { summary = {
"note_id": note_id, "note_id": note_id,
"title": fm.get("title"), "title": fm.get("title"),
@ -216,9 +254,9 @@ def main() -> None:
"apply-skip-unchanged" if args.apply and not changed else "apply-skip-unchanged" if args.apply and not changed else
"dry-run"), "dry-run"),
"path": note_pl["path"], "path": note_pl["path"],
"hash_mode": args.hash_mode or os.environ.get("MINDNET_HASH_MODE", "body"), "hash_mode": args.hash_mode or os.environ.get("MINDNET_HASH_MODE") or os.environ.get("MINDNET_HASH_COMPARE", "body"),
"hash_normalize": args.hash_normalize or os.environ.get("MINDNET_HASH_NORMALIZE", "canonical"), "hash_normalize": args.hash_normalize or os.environ.get("MINDNET_HASH_NORMALIZE", "canonical"),
"note_scope_refs": note_scope_refs, "hash_source": args.hash_source or os.environ.get("MINDNET_HASH_SOURCE", "parsed"),
} }
print(json.dumps(summary, ensure_ascii=False)) print(json.dumps(summary, ensure_ascii=False))
@ -226,14 +264,9 @@ def main() -> None:
continue continue
if changed and args.purge_before_upsert: if changed and args.purge_before_upsert:
# gezieltes Löschen: jetzt performant per note_id-Index purge_note_artifacts(client, cfg.prefix, note_id)
_, chunks_col, edges_col = collections(cfg.prefix)
f_chunks = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))])
client.delete(collection_name=chunks_col, points_selector=f_chunks, wait=True)
f_edges = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))])
client.delete(collection_name=edges_col, points_selector=f_edges, wait=True)
# Upsert Notes / Chunks / Edges # Upserts
notes_name, note_pts = points_for_note(cfg.prefix, note_pl, None, cfg.dim) notes_name, note_pts = points_for_note(cfg.prefix, note_pl, None, cfg.dim)
upsert_batch(client, notes_name, note_pts) upsert_batch(client, notes_name, note_pts)