scripts/import_markdown.py aktualisiert
Some checks failed
Deploy mindnet to llm-node / deploy (push) Failing after 2s
Some checks failed
Deploy mindnet to llm-node / deploy (push) Failing after 2s
This commit is contained in:
parent
9362064ad1
commit
43a8c80508
|
|
@ -2,19 +2,24 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
"""
|
"""
|
||||||
Script: scripts/import_markdown.py — Markdown → Qdrant (Notes, Chunks, Edges)
|
Script: scripts/import_markdown.py — Markdown → Qdrant (Notes, Chunks, Edges)
|
||||||
Version: 3.4.1
|
Version: 3.5.0
|
||||||
Datum: 2025-09-09
|
Datum: 2025-09-09
|
||||||
|
|
||||||
Kurzbeschreibung
|
Kurzbeschreibung
|
||||||
----------------
|
----------------
|
||||||
- Liest Markdown-Dateien ein, erzeugt Notes/Chunks/Edges **idempotent**.
|
- Liest Markdown-Dateien ein, erzeugt Notes/Chunks/Edges **idempotent**.
|
||||||
- Change-Detection (nur **Inhalte**, keine FS-Zeitstempel) konfigurierbar:
|
- **Änderungserkennung** (nur Inhalte, keine FS-Zeitstempel) konfigurierbar:
|
||||||
* ``--hash-mode``: body | frontmatter | body+frontmatter | full (Alias)
|
* ``--hash-mode``: body | frontmatter | body+frontmatter | full (Alias)
|
||||||
- Env: ``MINDNET_HASH_MODE`` **oder** ``MINDNET_HASH_COMPARE`` (Body/Frontmatter/Full)
|
- Env: ``MINDNET_HASH_MODE`` **oder** ``MINDNET_HASH_COMPARE`` (Body/Frontmatter/Full)
|
||||||
* ``--hash-normalize``: canonical | none (Default: canonical)
|
* ``--hash-normalize``: canonical | none (Default: canonical)
|
||||||
* ``--hash-source``: parsed (Default) | raw
|
* ``--hash-source``: parsed (Default) | raw
|
||||||
- "raw" hasht den **ungeparsten** Body aus der Datei (Frontmatter vorher entfernt).
|
- "raw" hasht den **ungeparsten** Body (Frontmatter via Regex entfernt).
|
||||||
- Optionales Diff-Logging: ``--debug-hash-diff`` zeigt bei Bedarf einen kompakten Diff.
|
* **NEU**: ``--compare-text`` (oder ENV ``MINDNET_COMPARE_TEXT=true``)
|
||||||
|
- Zusätzlich zum Hash wird der *parsed* ``fulltext`` direkt verglichen.
|
||||||
|
- Erkennt Änderungen auch dann, wenn eine Normalisierung Unterschiede glättet.
|
||||||
|
* **NEU**: Signaturabgleich:
|
||||||
|
- Falls sich die Signatur (z. B. body→full, parsed→raw, canonical→none) zwischen Alt/Neu unterscheidet,
|
||||||
|
gilt die Note als **geändert** (Einmal-Update, um neue Signatur zu persistieren).
|
||||||
|
|
||||||
ENV / Qdrant
|
ENV / Qdrant
|
||||||
------------
|
------------
|
||||||
|
|
@ -22,15 +27,22 @@ ENV / Qdrant
|
||||||
- COLLECTION_PREFIX (Default: mindnet)
|
- COLLECTION_PREFIX (Default: mindnet)
|
||||||
- VECTOR_DIM (Default: 384)
|
- VECTOR_DIM (Default: 384)
|
||||||
- MINDNET_NOTE_SCOPE_REFS: true|false (Default: false)
|
- MINDNET_NOTE_SCOPE_REFS: true|false (Default: false)
|
||||||
|
- MINDNET_COMPARE_TEXT: true|false (Default: false)
|
||||||
|
|
||||||
Aufruf
|
Aufruf-Beispiele
|
||||||
------
|
----------------
|
||||||
|
# Standard (nur Body, kanonisiert, parsed-Quelle)
|
||||||
python3 -m scripts.import_markdown --vault ./vault
|
python3 -m scripts.import_markdown --vault ./vault
|
||||||
python3 -m scripts.import_markdown --vault ./vault --apply
|
|
||||||
python3 -m scripts.import_markdown --vault ./vault --apply --hash-source raw --hash-normalize none
|
|
||||||
MINDNET_HASH_COMPARE=Full python3 -m scripts.import_markdown --vault ./vault --apply
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
# Sensibel (jede Kleinigkeit): raw-Quelle + keine Normalisierung
|
||||||
|
python3 -m scripts.import_markdown --vault ./vault --apply --hash-source raw --hash-normalize none
|
||||||
|
|
||||||
|
# Full-Vergleich (Body+Frontmatter)
|
||||||
|
MINDNET_HASH_COMPARE=Full python3 -m scripts.import_markdown --vault ./vault --apply
|
||||||
|
|
||||||
|
# Zusätzlich Body-Text direkt vergleichen (maximale Sicherheit)
|
||||||
|
python3 -m scripts.import_markdown --vault ./vault --apply --compare-text
|
||||||
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
import argparse
|
import argparse
|
||||||
import difflib
|
import difflib
|
||||||
|
|
@ -143,11 +155,14 @@ def main() -> None:
|
||||||
help="(Optional) erzeugt zusätzlich references:note (Default: aus)")
|
help="(Optional) erzeugt zusätzlich references:note (Default: aus)")
|
||||||
ap.add_argument("--debug-hash-diff", action="store_true",
|
ap.add_argument("--debug-hash-diff", action="store_true",
|
||||||
help="Zeigt bei Bedarf einen kurzen Diff zwischen altem und neuem Body")
|
help="Zeigt bei Bedarf einen kurzen Diff zwischen altem und neuem Body")
|
||||||
|
ap.add_argument("--compare-text", action="store_true",
|
||||||
|
help="Parsed fulltext zusätzlich direkt vergleichen (über Hash hinaus)")
|
||||||
args = ap.parse_args()
|
args = ap.parse_args()
|
||||||
|
|
||||||
note_scope_refs_env = (os.environ.get("MINDNET_NOTE_SCOPE_REFS", "false").strip().lower() == "true")
|
note_scope_refs_env = (os.environ.get("MINDNET_NOTE_SCOPE_REFS", "false").strip().lower() == "true")
|
||||||
note_scope_refs = args.note_sCope_refs if hasattr(args, "note_sCope_refs") else args.note_scope_refs # defensive
|
note_scope_refs = args.note_scope_refs or note_scope_refs_env
|
||||||
note_scope_refs = note_scope_refs or note_scope_refs_env
|
|
||||||
|
compare_text = args.compare_text or (os.environ.get("MINDNET_COMPARE_TEXT", "false").strip().lower() == "true")
|
||||||
|
|
||||||
cfg = QdrantConfig.from_env()
|
cfg = QdrantConfig.from_env()
|
||||||
client = get_client(cfg)
|
client = get_client(cfg)
|
||||||
|
|
@ -200,18 +215,36 @@ def main() -> None:
|
||||||
# Change-Detection (nur Inhalte, keine FS-Timestamps)
|
# Change-Detection (nur Inhalte, keine FS-Timestamps)
|
||||||
old_payload = None if args.force_replace else fetch_existing_note_payload(client, cfg.prefix, note_id)
|
old_payload = None if args.force_replace else fetch_existing_note_payload(client, cfg.prefix, note_id)
|
||||||
old_hash = None if not old_payload else old_payload.get("hash_fulltext")
|
old_hash = None if not old_payload else old_payload.get("hash_fulltext")
|
||||||
|
old_sig = (old_payload or {}).get("hash_signature")
|
||||||
new_hash = note_pl.get("hash_fulltext")
|
new_hash = note_pl.get("hash_fulltext")
|
||||||
changed = args.force_replace or (old_hash != new_hash)
|
new_sig = note_pl.get("hash_signature")
|
||||||
|
|
||||||
|
# 1) Signaturwechsel → als geändert behandeln (Einmal-Update)
|
||||||
|
sig_changed = bool(old_sig) and bool(new_sig) and (old_sig.split(":")[:3] != new_sig.split(":")[:3])
|
||||||
|
|
||||||
|
# 2) Hash-Vergleich
|
||||||
|
hash_changed = (old_hash != new_hash)
|
||||||
|
|
||||||
|
# 3) Optional: Parsed-Text direkt vergleichen (zusätzlich)
|
||||||
|
text_changed = False
|
||||||
|
if compare_text:
|
||||||
|
old_text = (old_payload or {}).get("fulltext") or ""
|
||||||
|
new_text = note_pl.get("fulltext") or ""
|
||||||
|
text_changed = (old_text != new_text)
|
||||||
|
|
||||||
|
changed = args.force_replace or sig_changed or hash_changed or text_changed
|
||||||
|
|
||||||
# Optionales Debugging: kompakten Diff anzeigen
|
# Optionales Debugging: kompakten Diff anzeigen
|
||||||
if args.debug_hash_diff:
|
if args.debug_hash_diff:
|
||||||
old_text = (old_payload or {}).get("fulltext") or ""
|
old_text = (old_payload or {}).get("fulltext") or ""
|
||||||
new_text = note_pl.get("fulltext") or ""
|
new_text = note_pl.get("fulltext") or ""
|
||||||
# Wenn Hash gleich, aber Text verschieden → Hinweis auf Normalisierung/Quelle
|
# Hinweis, wenn Hash gleich aber Text verschieden (oder Signaturwechsel)
|
||||||
if (old_hash == new_hash) and old_text != new_text:
|
if (not hash_changed) and (old_text != new_text or sig_changed):
|
||||||
print(json.dumps({
|
print(json.dumps({
|
||||||
"debug": "hash_equal_but_text_differs",
|
"debug": "hash_equal_but_text_or_signature_differs",
|
||||||
"note_id": note_id,
|
"note_id": note_id,
|
||||||
|
"old_sig": old_sig,
|
||||||
|
"new_sig": new_sig,
|
||||||
"hash_mode": args.hash_mode or os.environ.get("MINDNET_HASH_MODE") or os.environ.get("MINDNET_HASH_COMPARE", "body"),
|
"hash_mode": args.hash_mode or os.environ.get("MINDNET_HASH_MODE") or os.environ.get("MINDNET_HASH_COMPARE", "body"),
|
||||||
"hash_normalize": args.hash_normalize or os.environ.get("MINDNET_HASH_NORMALIZE", "canonical"),
|
"hash_normalize": args.hash_normalize or os.environ.get("MINDNET_HASH_NORMALIZE", "canonical"),
|
||||||
"hash_source": args.hash_source or os.environ.get("MINDNET_HASH_SOURCE", "parsed"),
|
"hash_source": args.hash_source or os.environ.get("MINDNET_HASH_SOURCE", "parsed"),
|
||||||
|
|
@ -257,6 +290,10 @@ def main() -> None:
|
||||||
"hash_mode": args.hash_mode or os.environ.get("MINDNET_HASH_MODE") or os.environ.get("MINDNET_HASH_COMPARE", "body"),
|
"hash_mode": args.hash_mode or os.environ.get("MINDNET_HASH_MODE") or os.environ.get("MINDNET_HASH_COMPARE", "body"),
|
||||||
"hash_normalize": args.hash_normalize or os.environ.get("MINDNET_HASH_NORMALIZE", "canonical"),
|
"hash_normalize": args.hash_normalize or os.environ.get("MINDNET_HASH_NORMALIZE", "canonical"),
|
||||||
"hash_source": args.hash_source or os.environ.get("MINDNET_HASH_SOURCE", "parsed"),
|
"hash_source": args.hash_source or os.environ.get("MINDNET_HASH_SOURCE", "parsed"),
|
||||||
|
"hash_signature": note_pl.get("hash_signature"),
|
||||||
|
"sig_changed": sig_changed,
|
||||||
|
"hash_changed": hash_changed,
|
||||||
|
"text_changed": text_changed,
|
||||||
}
|
}
|
||||||
print(json.dumps(summary, ensure_ascii=False))
|
print(json.dumps(summary, ensure_ascii=False))
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user