scripts/import_markdown.py aktualisiert
Some checks failed
Deploy mindnet to llm-node / deploy (push) Failing after 2s
Some checks failed
Deploy mindnet to llm-node / deploy (push) Failing after 2s
This commit is contained in:
parent
9ef2e8d397
commit
f5e6fcc097
|
|
@@ -2,7 +2,7 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
"""
|
"""
|
||||||
Script: scripts/import_markdown.py — Markdown → Qdrant (Notes, Chunks, Edges)
|
Script: scripts/import_markdown.py — Markdown → Qdrant (Notes, Chunks, Edges)
|
||||||
Version: 3.5.0
|
Version: 3.5.1
|
||||||
Datum: 2025-09-09
|
Datum: 2025-09-09
|
||||||
|
|
||||||
Kurzbeschreibung
|
Kurzbeschreibung
|
||||||
|
|
@@ -14,12 +14,11 @@ Kurzbeschreibung
|
||||||
* ``--hash-normalize``: canonical | none (Default: canonical)
|
* ``--hash-normalize``: canonical | none (Default: canonical)
|
||||||
* ``--hash-source``: parsed (Default) | raw
|
* ``--hash-source``: parsed (Default) | raw
|
||||||
- "raw" hasht den **ungeparsten** Body (Frontmatter via Regex entfernt).
|
- "raw" hasht den **ungeparsten** Body (Frontmatter via Regex entfernt).
|
||||||
* **NEU**: ``--compare-text`` (oder ENV ``MINDNET_COMPARE_TEXT=true``)
|
* Optional: ``--compare-text`` (oder ENV ``MINDNET_COMPARE_TEXT=true``)
|
||||||
- Zusätzlich zum Hash wird der *parsed* ``fulltext`` direkt verglichen.
|
- parsed ``fulltext`` zusätzlich direkt vergleichen (falls Normalisierung Unterschiede glättet).
|
||||||
- Erkennt Änderungen auch dann, wenn eine Normalisierung Unterschiede glättet.
|
* Signaturabgleich:
|
||||||
* **NEU**: Signaturabgleich:
|
- Wenn sich die Signatur (z. B. body→full, parsed→raw, canonical→none) zwischen Alt/Neu unterscheidet,
|
||||||
- Falls sich die Signatur (z. B. body→full, parsed→raw, canonical→none) zwischen Alt/Neu unterscheidet,
|
gilt die Note als **geändert** (Einmal-Update, um die neue Signatur zu persistieren).
|
||||||
gilt die Note als **geändert** (Einmal-Update, um neue Signatur zu persistieren).
|
|
||||||
|
|
||||||
ENV / Qdrant
|
ENV / Qdrant
|
||||||
------------
|
------------
|
||||||
|
|
@@ -40,7 +39,7 @@ Aufruf-Beispiele
|
||||||
# Full-Vergleich (Body+Frontmatter)
|
# Full-Vergleich (Body+Frontmatter)
|
||||||
MINDNET_HASH_COMPARE=Full python3 -m scripts.import_markdown --vault ./vault --apply
|
MINDNET_HASH_COMPARE=Full python3 -m scripts.import_markdown --vault ./vault --apply
|
||||||
|
|
||||||
# Zusätzlich Body-Text direkt vergleichen (maximale Sicherheit)
|
# Zusätzlich Body-Text direkt vergleichen (max. Sicherheit)
|
||||||
python3 -m scripts.import_markdown --vault ./vault --apply --compare-text
|
python3 -m scripts.import_markdown --vault ./vault --apply --compare-text
|
||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
@@ -177,8 +176,21 @@ def main() -> None:
|
||||||
|
|
||||||
processed = 0
|
processed = 0
|
||||||
for path in files:
|
for path in files:
|
||||||
parsed = read_markdown(path)
|
# ----------------- robustes Parsing -----------------
|
||||||
fm = normalize_frontmatter(parsed.frontmatter)
|
try:
|
||||||
|
parsed = read_markdown(path)
|
||||||
|
except Exception as e:
|
||||||
|
print(json.dumps({"path": path, "error": f"read_markdown failed: {e}"}))
|
||||||
|
continue
|
||||||
|
if parsed is None:
|
||||||
|
print(json.dumps({"path": path, "error": "read_markdown returned None"}))
|
||||||
|
continue
|
||||||
|
|
||||||
|
try:
|
||||||
|
fm = normalize_frontmatter(parsed.frontmatter)
|
||||||
|
except Exception as e:
|
||||||
|
print(json.dumps({"path": path, "error": f"normalize_frontmatter failed: {e}"}))
|
||||||
|
continue
|
||||||
|
|
||||||
try:
|
try:
|
||||||
validate_required_frontmatter(fm)
|
validate_required_frontmatter(fm)
|
||||||
|
|
@@ -191,41 +203,50 @@ def main() -> None:
|
||||||
|
|
||||||
processed += 1
|
processed += 1
|
||||||
|
|
||||||
# Note-Payload (inkl. Hash-Steuerung & Quelle)
|
# -------------- Note-Payload (defensiv) --------------
|
||||||
note_pl = make_note_payload(
|
try:
|
||||||
parsed,
|
note_pl = make_note_payload(
|
||||||
vault_root=root,
|
parsed,
|
||||||
hash_mode=args.hash_mode,
|
vault_root=root,
|
||||||
hash_normalize=args.hash_normalize,
|
hash_mode=args.hash_mode,
|
||||||
hash_source=args.hash_source,
|
hash_normalize=args.hash_normalize,
|
||||||
file_path=path,
|
hash_source=args.hash_source,
|
||||||
)
|
file_path=path,
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
print(json.dumps({"path": path, "note_id": fm.get("id"), "error": f"make_note_payload failed: {e}"}))
|
||||||
|
continue
|
||||||
|
|
||||||
if "fulltext" not in (note_pl or {}):
|
if not isinstance(note_pl, dict):
|
||||||
note_pl["fulltext"] = parsed.body or ""
|
print(json.dumps({"path": path, "note_id": fm.get("id"), "error": "make_note_payload returned non-dict"}))
|
||||||
|
continue
|
||||||
|
|
||||||
|
# fulltext sicherstellen + Pfad normalisieren
|
||||||
|
if not note_pl.get("fulltext"):
|
||||||
|
note_pl["fulltext"] = getattr(parsed, "body", "") or ""
|
||||||
if note_pl.get("path"):
|
if note_pl.get("path"):
|
||||||
note_pl["path"] = _normalize_rel_path(
|
note_pl["path"] = _normalize_rel_path(
|
||||||
os.path.join(root, note_pl["path"]) if not os.path.isabs(note_pl["path"]) else note_pl["path"], root
|
os.path.join(root, note_pl["path"]) if not os.path.isabs(note_pl["path"]) else note_pl["path"], root
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
note_pl["path"] = _normalize_rel_path(parsed.path, root)
|
p_path = getattr(parsed, "path", None) or path
|
||||||
|
note_pl["path"] = _normalize_rel_path(p_path, root)
|
||||||
|
|
||||||
note_id = note_pl["note_id"]
|
note_id = note_pl.get("note_id") or fm.get("id")
|
||||||
|
if not note_id:
|
||||||
|
print(json.dumps({"path": path, "error": "Missing note_id after payload build"}))
|
||||||
|
continue
|
||||||
|
|
||||||
# Change-Detection (nur Inhalte, keine FS-Timestamps)
|
# -------------- Change-Detection --------------
|
||||||
old_payload = None if args.force_replace else fetch_existing_note_payload(client, cfg.prefix, note_id)
|
old_payload = None if args.force_replace else fetch_existing_note_payload(client, cfg.prefix, note_id)
|
||||||
old_hash = None if not old_payload else old_payload.get("hash_fulltext")
|
old_hash = None if not old_payload else old_payload.get("hash_fulltext")
|
||||||
old_sig = (old_payload or {}).get("hash_signature")
|
old_sig = (old_payload or {}).get("hash_signature")
|
||||||
new_hash = note_pl.get("hash_fulltext")
|
new_hash = note_pl.get("hash_fulltext")
|
||||||
new_sig = note_pl.get("hash_signature")
|
new_sig = note_pl.get("hash_signature")
|
||||||
|
|
||||||
# 1) Signaturwechsel → als geändert behandeln (Einmal-Update)
|
|
||||||
sig_changed = bool(old_sig) and bool(new_sig) and (old_sig.split(":")[:3] != new_sig.split(":")[:3])
|
sig_changed = bool(old_sig) and bool(new_sig) and (old_sig.split(":")[:3] != new_sig.split(":")[:3])
|
||||||
|
|
||||||
# 2) Hash-Vergleich
|
|
||||||
hash_changed = (old_hash != new_hash)
|
hash_changed = (old_hash != new_hash)
|
||||||
|
|
||||||
# 3) Optional: Parsed-Text direkt vergleichen (zusätzlich)
|
|
||||||
text_changed = False
|
text_changed = False
|
||||||
if compare_text:
|
if compare_text:
|
||||||
old_text = (old_payload or {}).get("fulltext") or ""
|
old_text = (old_payload or {}).get("fulltext") or ""
|
||||||
|
|
@@ -234,49 +255,41 @@ def main() -> None:
|
||||||
|
|
||||||
changed = args.force_replace or sig_changed or hash_changed or text_changed
|
changed = args.force_replace or sig_changed or hash_changed or text_changed
|
||||||
|
|
||||||
# Optionales Debugging: kompakten Diff anzeigen
|
# -------------- Chunks / Embeddings / Edges --------------
|
||||||
if args.debug_hash_diff:
|
try:
|
||||||
old_text = (old_payload or {}).get("fulltext") or ""
|
chunks = assemble_chunks(fm["id"], getattr(parsed, "body", "") or "", fm.get("type", "concept"))
|
||||||
new_text = note_pl.get("fulltext") or ""
|
except Exception as e:
|
||||||
# Hinweis, wenn Hash gleich aber Text verschieden (oder Signaturwechsel)
|
print(json.dumps({"path": path, "note_id": note_id, "error": f"assemble_chunks failed: {e}"}))
|
||||||
if (not hash_changed) and (old_text != new_text or sig_changed):
|
continue
|
||||||
print(json.dumps({
|
|
||||||
"debug": "hash_equal_but_text_or_signature_differs",
|
try:
|
||||||
"note_id": note_id,
|
chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks)
|
||||||
"old_sig": old_sig,
|
except Exception as e:
|
||||||
"new_sig": new_sig,
|
print(json.dumps({"path": path, "note_id": note_id, "error": f"make_chunk_payloads failed: {e}"}))
|
||||||
"hash_mode": args.hash_mode or os.environ.get("MINDNET_HASH_MODE") or os.environ.get("MINDNET_HASH_COMPARE", "body"),
|
continue
|
||||||
"hash_normalize": args.hash_normalize or os.environ.get("MINDNET_HASH_NORMALIZE", "canonical"),
|
|
||||||
"hash_source": args.hash_source or os.environ.get("MINDNET_HASH_SOURCE", "parsed"),
|
|
||||||
}, ensure_ascii=False))
|
|
||||||
if old_text and new_text:
|
|
||||||
ud = list(difflib.unified_diff(
|
|
||||||
old_text.splitlines(), new_text.splitlines(),
|
|
||||||
fromfile="qdrant_fulltext(old)", tofile="vault_body(new)",
|
|
||||||
n=3
|
|
||||||
))
|
|
||||||
if ud:
|
|
||||||
preview = "\n".join(ud[:50])
|
|
||||||
print(json.dumps({"note_id": note_id, "diff_preview": preview}, ensure_ascii=False))
|
|
||||||
|
|
||||||
# Chunks + Embeddings (Nullvektor-Fallback)
|
|
||||||
chunks = assemble_chunks(fm["id"], parsed.body, fm.get("type", "concept"))
|
|
||||||
chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks)
|
|
||||||
if embed_texts:
|
if embed_texts:
|
||||||
vecs = embed_texts([getattr(c, "text", "") for c in chunks]) # type: ignore[attr-defined]
|
try:
|
||||||
|
vecs = embed_texts([getattr(c, "text", "") for c in chunks]) # type: ignore[attr-defined]
|
||||||
|
except Exception as e:
|
||||||
|
print(json.dumps({"path": path, "note_id": note_id, "warn": f"embed_texts failed, using zeros: {e}"}))
|
||||||
|
vecs = [[0.0] * cfg.dim for _ in chunks]
|
||||||
else:
|
else:
|
||||||
vecs = [[0.0] * cfg.dim for _ in chunks]
|
vecs = [[0.0] * cfg.dim for _ in chunks]
|
||||||
|
|
||||||
# Edges (neues Schema, mit note_id als Owner)
|
try:
|
||||||
note_refs = note_pl.get("references") or []
|
note_refs = note_pl.get("references") or []
|
||||||
edges = build_edges_for_note(
|
edges = build_edges_for_note(
|
||||||
note_id,
|
note_id,
|
||||||
chunk_pls,
|
chunk_pls,
|
||||||
note_refs,
|
note_refs,
|
||||||
include_note_scope_refs=note_scope_refs,
|
include_note_scope_refs=note_scope_refs,
|
||||||
)
|
)
|
||||||
|
except Exception as e:
|
||||||
|
print(json.dumps({"path": path, "note_id": note_id, "error": f"build_edges_for_note failed: {e}"}))
|
||||||
|
continue
|
||||||
|
|
||||||
# Zusammenfassung pro Datei
|
# -------------- Zusammenfassung / Ausgabe --------------
|
||||||
summary = {
|
summary = {
|
||||||
"note_id": note_id,
|
"note_id": note_id,
|
||||||
"title": fm.get("title"),
|
"title": fm.get("title"),
|
||||||
|
|
@@ -297,21 +310,25 @@ def main() -> None:
|
||||||
}
|
}
|
||||||
print(json.dumps(summary, ensure_ascii=False))
|
print(json.dumps(summary, ensure_ascii=False))
|
||||||
|
|
||||||
|
# -------------- Upserts --------------
|
||||||
if not args.apply:
|
if not args.apply:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if changed and args.purge_before_upsert:
|
try:
|
||||||
purge_note_artifacts(client, cfg.prefix, note_id)
|
if changed and args.purge_before_upsert:
|
||||||
|
purge_note_artifacts(client, cfg.prefix, note_id)
|
||||||
|
except Exception as e:
|
||||||
|
print(json.dumps({"path": path, "note_id": note_id, "warn": f"purge failed: {e}"}))
|
||||||
|
|
||||||
# Upserts
|
try:
|
||||||
notes_name, note_pts = points_for_note(cfg.prefix, note_pl, None, cfg.dim)
|
notes_name, note_pts = points_for_note(cfg.prefix, note_pl, None, cfg.dim)
|
||||||
upsert_batch(client, notes_name, note_pts)
|
upsert_batch(client, notes_name, note_pts)
|
||||||
|
chunks_name, chunk_pts = points_for_chunks(cfg.prefix, chunk_pls, vecs)
|
||||||
chunks_name, chunk_pts = points_for_chunks(cfg.prefix, chunk_pls, vecs)
|
upsert_batch(client, chunks_name, chunk_pts)
|
||||||
upsert_batch(client, chunks_name, chunk_pts)
|
edges_name, edge_pts = points_for_edges(cfg.prefix, edges)
|
||||||
|
upsert_batch(client, edges_name, edge_pts)
|
||||||
edges_name, edge_pts = points_for_edges(cfg.prefix, edges)
|
except Exception as e:
|
||||||
upsert_batch(client, edges_name, edge_pts)
|
print(json.dumps({"path": path, "note_id": note_id, "error": f"upsert failed: {e}"}))
|
||||||
|
|
||||||
print(f"Done. Processed notes: {processed}")
|
print(f"Done. Processed notes: {processed}")
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user