Dateien nach "scripts" hochladen
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 3s

This commit is contained in:
Lars 2025-11-08 17:00:27 +01:00
parent 2794d26181
commit 3b192e2eed
2 changed files with 263 additions and 389 deletions

View File

@ -1,180 +1,227 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
""" """
Modul: scripts/export_markdown.py Script: scripts/export_markdown.py — Qdrant → Markdown (Vault)
Version: 1.6.1 Version: 1.4.1
Datum: 2025-11-07 Datum: 2025-09-10
Zweck Funktion
----- --------
Exportiert Notes aus Qdrant zurück in Markdown-Dateien (verlustarm): Exportiert Notes (Frontmatter + Body) aus Qdrant in einen Zielordner. Der Body wird
Pfade relativieren, Backslashes → Slashes bevorzugt aus dem Feld `fulltext` rekonstruiert; falls leer/nicht vorhanden, aus Chunks
Body aus 'fulltext' (falls vorhanden) oder Rekonstruktion via Chunks (seq/chunk_index) (Sortierung: seq → chunk_index → Nummer in chunk_id). Pfade werden **relativ** geschrieben.
Optional: vorhandene Edges pro Note mit exportieren (--include-edges yaml|footer)
CLI Optionen
--------
--out PATH Zielordner (erforderlich)
--prefix TEXT Collection-Prefix (CLI überschreibt ENV COLLECTION_PREFIX)
--note-id ID Nur eine Note exportieren
--overwrite Existierende Dateien überschreiben
--include-edges MODE none|yaml|footer (Default: none)
--flatten-paths Alle Dateien flach schreiben; Originalpfad in FM: orig_path
ENV
--- ---
COLLECTION_PREFIX, QDRANT_URL | QDRANT_HOST/QDRANT_PORT | QDRANT_API_KEY
Beispiele
---------
export COLLECTION_PREFIX="mindnet" export COLLECTION_PREFIX="mindnet"
python3 -m scripts.export_markdown --out ./_exportVault python3 -m scripts.export_markdown --out ./_exportVault
python3 -m scripts.export_markdown --out ./_exportVault --note-id <ID>
python3 -m scripts.export_markdown --out ./_exportVault --overwrite
python3 -m scripts.export_markdown --out ./_exportVault --include-edges yaml
python3 -m scripts.export_markdown --out ./_exportVault --include-edges footer
Parameter # Nur eine Note, Edges als YAML-Feld 'references'
--------- python3 -m scripts.export_markdown --out ./_exportVault --note-id concept-alpha --include-edges yaml
--out Zielwurzel (Ordner wird angelegt)
--prefix überschreibt ENV COLLECTION_PREFIX (Default: mindnet) # Flach schreiben mit Überschreiben
--note-id nur eine bestimmte Note exportieren python3 -m scripts.export_markdown --out ./_exportVault --flatten-paths --overwrite
--overwrite vorhandene Dateien überschreiben
--include-edges none|yaml|footer (Default: none)
""" """
from __future__ import annotations from __future__ import annotations
import argparse import argparse
import os import os
import json import json
from pathlib import Path from typing import Dict, List, Optional, Tuple, Any
from typing import Dict, List, Tuple, Optional
from app.core.qdrant import ( import yaml
QdrantConfig, from qdrant_client.http import models as rest
get_client,
fetch_all_notes,
fetch_chunks_for_note,
fetch_edges_for_note, # <— jetzt angebunden
ensure_collections,
)
def _normalize_rel_path(p: str) -> str: from app.core.qdrant import QdrantConfig, get_client
p = (p or "").replace("\\", "/") from app.core.qdrant import ensure_collections # safety
while p.startswith("/"):
p = p[1:]
return p
def _ensure_parent(p: Path): # ---------------------------------------------------------------------
p.parent.mkdir(parents=True, exist_ok=True) # Helpers
# ---------------------------------------------------------------------
def _yaml_frontmatter(d: Dict) -> str: def collections(prefix: str) -> Tuple[str, str, str]:
import io return f"{prefix}_notes", f"{prefix}_chunks", f"{prefix}_edges"
def _ser(obj):
if isinstance(obj, str):
if any(ch in obj for ch in [":", "-", "{", "}", "[", "]", ",", "#", "&", "*", "!", "|", ">", "'", "\"", "%", "@", "`"]):
return '"' + obj.replace('"', '\\"') + '"'
return obj
if isinstance(obj, bool):
return "true" if obj else "false"
if obj is None:
return "null"
if isinstance(obj, (int, float)):
return str(obj)
if isinstance(obj, list):
return "[" + ", ".join(_ser(x) for x in obj) + "]"
if isinstance(obj, dict):
inner = []
for k in sorted(obj.keys()):
inner.append(f"{k}: {_ser(obj[k])}")
return "{ " + ", ".join(inner) + " }"
return '"' + str(obj).replace('"', '\\"') + '"'
buf = io.StringIO() def _norm_rel_path(path: str) -> str:
buf.write("---\n") p = (path or "").replace("\\", "/").lstrip("/")
for k in sorted(d.keys()): return p if p else ""
buf.write(f"{k}: {_ser(d[k])}\n")
buf.write("---\n")
return buf.getvalue()
def _reconstruct_body_from_chunks(chunks: List[Dict]) -> str: def _ensure_dir(path: str) -> None:
if not chunks: d = os.path.dirname(path)
return "" if d and not os.path.exists(d):
def _num_from_chunk_id(cid: str) -> int: os.makedirs(d, exist_ok=True)
try:
if "#" in cid: def _yaml_dump(data: Dict[str, Any]) -> str:
return int(cid.split("#", 1)[1]) return yaml.safe_dump(data, allow_unicode=True, sort_keys=False).strip()
return 0
except Exception: def _frontmatter_block(fm: Dict[str, Any]) -> str:
return 0 y = _yaml_dump(fm)
chunks_sorted = sorted( return f"---\n{y}\n---\n"
chunks,
key=lambda c: ( def _scroll_all(client, collection: str, flt: Optional[rest.Filter] = None) -> List[Any]:
int(c.get("seq", c.get("chunk_index", 0))), out = []
int(c.get("chunk_index", 0)), nextp = None
_num_from_chunk_id(str(c.get("chunk_id", ""))), while True:
pts, nextp = client.scroll(
collection_name=collection,
with_payload=True,
with_vectors=False,
limit=256,
scroll_filter=flt,
offset=nextp,
) )
) if not pts:
body = "".join(c.get("text") or "" for c in chunks_sorted) break
return body out.extend(pts)
if nextp is None:
break
return out
def parse_args() -> argparse.Namespace: def _reconstruct_body_from_chunks(chunks: List[Any]) -> str:
p = argparse.ArgumentParser(prog="export_markdown.py", description="Exportiert Notes aus Qdrant in Markdown.") def seq_key(pl: Dict[str, Any]) -> Tuple[int, int, int]:
p.add_argument("--out", required=True, help="Zielordner") s = pl.get("seq")
p.add_argument("--prefix", default="", help="Collections-Prefix; überschreibt ENV COLLECTION_PREFIX") ci = pl.get("chunk_index")
p.add_argument("--note-id", default="", help="nur eine Note exportieren") cid = pl.get("chunk_id") or ""
p.add_argument("--overwrite", action="store_true", help="vorhandene Dateien überschreiben") n = 0
p.add_argument("--include-edges", default="none", choices=["none", "yaml", "footer"], help="Edges im Export anzeigen") if isinstance(cid, str) and "#" in cid:
return p.parse_args() try:
n = int(cid.rsplit("#", 1)[-1])
except Exception:
n = 0
return (int(s) if isinstance(s, int) else 0,
int(ci) if isinstance(ci, int) else 0,
n)
chunks_sorted = sorted(chunks, key=lambda p: seq_key(p.payload or {}))
texts: List[str] = []
for p in chunks_sorted:
pl = p.payload or {}
t = pl.get("text") or pl.get("content") or pl.get("raw") or ""
if isinstance(t, str) and t:
texts.append(t)
return "\n".join(texts).strip()
def main(): def _collect_forward_refs_from_edges(edges: List[Any]) -> List[str]:
args = parse_args() refs = []
out_root = Path(args.out).resolve() for p in edges:
out_root.mkdir(parents=True, exist_ok=True) pl = p.payload or {}
if pl.get("kind") == "references" and isinstance(pl.get("target_id"), str):
refs.append(pl["target_id"])
# de-dupe, preserve order
seen = set()
out = []
for r in refs:
if r not in seen:
seen.add(r)
out.append(r)
return out
prefix = args.prefix.strip() or os.environ.get("COLLECTION_PREFIX", "").strip() or "mindnet" # ---------------------------------------------------------------------
cfg = QdrantConfig.from_env(prefix=prefix) # Main
# ---------------------------------------------------------------------
def main() -> None:
ap = argparse.ArgumentParser()
ap.add_argument("--out", required=True, help="Zielordner für exportierte Markdown-Dateien")
ap.add_argument("--prefix", help="Collection-Prefix (überschreibt ENV COLLECTION_PREFIX)")
ap.add_argument("--note-id", help="Nur eine bestimmte Note-ID exportieren")
ap.add_argument("--overwrite", action="store_true", help="Existierende Dateien überschreiben")
ap.add_argument("--include-edges", choices=["none", "yaml", "footer"], default="none",
help="Forward-Links mit exportieren (aus Edges oder Note-Payload)")
ap.add_argument("--flatten-paths", action="store_true", help="Alle Dateien flach schreiben (orig_path in Frontmatter)")
args = ap.parse_args()
cfg = QdrantConfig.from_env()
if args.prefix:
cfg.prefix = args.prefix.strip()
client = get_client(cfg) client = get_client(cfg)
ensure_collections(client, cfg) ensure_collections(client, cfg.prefix, cfg.dim)
out_root = os.path.abspath(args.out)
os.makedirs(out_root, exist_ok=True)
notes_col, chunks_col, edges_col = collections(cfg.prefix)
# Filter nach note-id (optional)
flt = None
if args.note_id: if args.note_id:
notes = fetch_all_notes(client, cfg, only_note_id=args.note_id) flt = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=args.note_id))])
else:
notes = fetch_all_notes(client, cfg)
exported = 0 notes = _scroll_all(client, notes_col, flt)
total = 0
for n in notes: for n in notes:
note_id = n.get("note_id") or n.get("id") pl = n.payload or {}
if not note_id: nid = pl.get("note_id")
continue rel_path = _norm_rel_path(pl.get("path") or "")
if args.flatten_paths or not rel_path:
fname = f"{(nid or 'note')}.md"
out_path = os.path.join(out_root, fname)
else:
out_path = os.path.join(out_root, rel_path)
out_path = out_path.replace("\\", "/")
_ensure_dir(out_path)
rel = _normalize_rel_path(str(n.get("path") or f"{note_id}.md")) # Frontmatter aufbauen (nur sinnvolle Felder)
dst = out_root.joinpath(rel) fm_fields = ["id","title","type","status","created","updated","tags","area","project","source","lang","slug","aliases"]
fm: Dict[str, Any] = {}
fm["id"] = nid
for k in fm_fields:
if k == "id":
continue
if k in pl and pl[k] is not None:
fm[k] = pl[k]
if args.flatten_paths and rel_path:
fm["orig_path"] = rel_path
body = str(n.get("fulltext") or "") # Body ermitteln (fulltext oder Chunks)
body = (pl.get("fulltext") or "").strip()
if not body: if not body:
chunks = fetch_chunks_for_note(client, cfg, note_id) flt_chunks = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=nid))])
chunks = _scroll_all(client, chunks_col, flt_chunks)
body = _reconstruct_body_from_chunks(chunks) body = _reconstruct_body_from_chunks(chunks)
fm = {} # Edges (optional)
for k in ("id", "title", "type", "status", "created", "tags", "priority", "due", "effort_min", "values", "goals", "embedding_exclude"): refs: List[str] = []
if k in n: if args.include_edges != "none":
fm[k] = n[k] # aus Note-Payload, falls vorhanden
for k in ("hash_signature", "hash_fulltext", "hash_body", "hash_frontmatter"): if isinstance(pl.get("references"), list) and pl["references"]:
if k in n: refs = [r for r in pl["references"] if isinstance(r, str)]
fm[k] = n[k] else:
flt_edges = rest.Filter(must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=nid))])
edges = _scroll_all(client, edges_col, flt_edges)
refs = _collect_forward_refs_from_edges(edges)
if args.include_edges == "yaml" and refs:
fm["references"] = refs
edges_block = "" # Datei schreiben
if args.include_edges in ("yaml", "footer"): if (not args.overwrite) and os.path.exists(out_path):
try: print(json.dumps({"note_id": nid, "path": out_path, "decision": "skip-exists"}))
edges = fetch_edges_for_note(client, cfg, note_id) or [] continue
if args.include_edges == "yaml":
fm["_edges"] = edges
else:
edges_block = "\n\n---\n_edges_:\n" + json.dumps(edges, ensure_ascii=False, indent=2) + "\n"
except Exception:
pass
if dst.exists() and not args.overwrite: content = _frontmatter_block(fm) + (body + "\n" if body else "")
decision = "skip" if args.include_edges == "footer" and refs:
else: content += "\n---\nLinks:\n" + "\n".join(f"- [[{r}]]" for r in refs) + "\n"
_ensure_parent(dst)
content = _yaml_frontmatter(fm) + (body or "") + edges_block
dst.write_text(content, encoding="utf-8")
decision = "write"
print(json.dumps({"note_id": note_id, "path": str(dst), "decision": decision}, ensure_ascii=False)) with open(out_path, "w", encoding="utf-8") as f:
if decision == "write": f.write(content)
exported += 1
print(json.dumps({"note_id": nid, "path": out_path, "decision": "write"}))
total += 1
print(f"Done. Exported notes: {total}")
print(f"Done. Exported notes: {exported}")
if __name__ == "__main__": if __name__ == "__main__":
main() main()

View File

@ -2,16 +2,59 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
""" """
Script: scripts/import_markdown.py — Markdown → Qdrant (Notes, Chunks, Edges) Script: scripts/import_markdown.py — Markdown → Qdrant (Notes, Chunks, Edges)
Version: 3.8.5 Version: 3.7.2
Date: 2025-11-08 Datum: 2025-09-30
Notes Kurzbeschreibung
----- ----------------
- Uses compatibility wrappers for ensure_collections and payload index creation. - Liest Markdown-Dateien ein und erzeugt Notes/Chunks/Edges **idempotent**.
- Provides robust local fallbacks for qdrant_points helpers. - Änderungserkennung Option C: mehrere Hash-Varianten werden parallel in der Note
- Generates valid Qdrant point IDs (int or UUIDv5) if none provided. gespeichert (Feld `hashes` mit Schlüsseln `<mode>:<source>:<normalize>`). Der Vergleich
- Detects Named-Vector schema and coerces points accordingly. nutzt NUR den aktuellen Modus-Key – ein Moduswechsel triggert keine Massenänderungen mehr.
- Integrates Type-Registry without breaking older behavior. - Erstimport-Fix: Bei leerem Qdrant gilt Create-Fall automatisch als geändert.
- `--baseline-modes`: fehlende Hash-Varianten still nachtragen (nur Notes upserten).
- `--sync-deletes`: gezielte Lösch-Synchronisation (Dry-Run + Apply).
- `--only-path`: exakt **eine** Datei (Pfad) importieren – nützlich für Diagnosefälle.
Neu in 3.7.1/3.7.2
------------------
- Chunk-Payloads: `window` (für Embeddings), `text` (überlappungsfrei, verlustfrei rekonstruierbar),
`start/end/overlap_*`. Embeddings nutzen `window`.
- **3.7.2:** Edges-Fehler führen nicht mehr zum Abbruch der gesamten Note; Note/Chunks werden trotzdem geschrieben.
Hash/Compare Konfiguration
--------------------------
- Vergleichsmodus:
--hash-mode body|frontmatter|full
oder ENV: MINDNET_HASH_MODE | MINDNET_HASH_COMPARE
- Quelle:
--hash-source parsed|raw (ENV: MINDNET_HASH_SOURCE, Default parsed)
- Normalisierung:
--hash-normalize canonical|none (ENV: MINDNET_HASH_NORMALIZE, Default canonical)
- Optional: --compare-text (oder ENV MINDNET_COMPARE_TEXT=true) vergleicht zusätzlich
den parsed Body-Text direkt.
Qdrant / ENV
------------
- QDRANT_URL | QDRANT_HOST/QDRANT_PORT | QDRANT_API_KEY
- COLLECTION_PREFIX (Default: mindnet), via --prefix überschreibbar
- VECTOR_DIM (Default: 384)
- MINDNET_NOTE_SCOPE_REFS: true|false (Default: false)
Beispiele
---------
# Standard (Body, parsed, canonical)
python3 -m scripts.import_markdown --vault ./vault
# Erstimport nach truncate (Create-Fall)
python3 -m scripts.import_markdown --vault ./vault --apply --purge-before-upsert
# Nur eine Datei (Diagnose)
python3 -m scripts.import_markdown --vault ./vault --only-path ./vault/30_projects/project-demo.md --apply
# Sync-Deletes (Dry-Run → Apply)
python3 -m scripts.import_markdown --vault ./vault --sync-deletes
python3 -m scripts.import_markdown --vault ./vault --sync-deletes --apply
""" """
from __future__ import annotations from __future__ import annotations
@ -19,19 +62,11 @@ import argparse
import json import json
import os import os
import sys import sys
import uuid as _uuid
from typing import Dict, List, Optional, Tuple, Any, Set from typing import Dict, List, Optional, Tuple, Any, Set
from dotenv import load_dotenv from dotenv import load_dotenv
from qdrant_client.http import models as rest from qdrant_client.http import models as rest
# Deterministic UUIDv5 for stable Qdrant point IDs
_MN_NAMESPACE = _uuid.uuid5(_uuid.NAMESPACE_URL, "mindnet-default-namespace")
def _uuid5_deterministic(*parts: str) -> str:
base = ":".join(str(p) for p in parts if p is not None)
return str(_uuid.uuid5(_MN_NAMESPACE, base))
# --- Project imports (as in stable 20251105) ---
from app.core.parser import ( from app.core.parser import (
read_markdown, read_markdown,
normalize_frontmatter, normalize_frontmatter,
@ -44,125 +79,18 @@ try:
from app.core.derive_edges import build_edges_for_note from app.core.derive_edges import build_edges_for_note
except Exception: # pragma: no cover except Exception: # pragma: no cover
from app.core.edges import build_edges_for_note # type: ignore from app.core.edges import build_edges_for_note # type: ignore
from app.core.qdrant import ( from app.core.qdrant import (
QdrantConfig, QdrantConfig,
get_client, get_client,
ensure_collections, # used only via wrapper ensure_collections,
ensure_payload_indexes,
)
from app.core.qdrant_points import (
points_for_chunks,
points_for_note,
points_for_edges,
upsert_batch,
) )
# Backward-compatible import for payload index creation
try:
from app.core.qdrant import ensure_payload_indexes as _ensure_payload_indexes
except Exception:
try:
from app.core.qdrant import ensure_payload_indices as _ensure_payload_indexes # older name
except Exception:
def _ensure_payload_indexes(*_args, **_kwargs):
# No-Op for older releases without explicit payload index creation
return None
# Qdrant points helpers (try project first, then safe local fallbacks)
try:
from app.core.qdrant_points import (
points_for_chunks as _points_for_chunks,
points_for_note as _points_for_note,
points_for_edges as _points_for_edges,
upsert_batch as _upsert_batch,
)
except Exception:
# ---- Local fallbacks ----
def _collection_names(prefix: str):
return f"{prefix}_notes", f"{prefix}_chunks", f"{prefix}_edges"
def _points_for_note(prefix: str, note_payload: dict, note_vec, dim: int):
notes_col, _, _ = _collection_names(prefix)
raw = (
note_payload.get("point_id")
or note_payload.get("qdrant_id")
or note_payload.get("note_id")
or note_payload.get("id")
or note_payload.get("path")
)
pid: Any = None
if isinstance(raw, int):
pid = raw
elif isinstance(raw, str) and raw.isdigit():
try:
pid = int(raw)
except Exception:
pid = None
if pid is None:
pid = _uuid5_deterministic("note", str(raw or ""))
vec = note_vec if note_vec is not None else [0.0] * int(dim)
pt = rest.PointStruct(id=pid, vector=vec, payload=note_payload)
return notes_col, [pt]
def _points_for_chunks(prefix: str, chunk_payloads: list[dict], vectors: list[list[float]]):
_, chunks_col, _ = _collection_names(prefix)
pts = []
for i, pl in enumerate(chunk_payloads):
raw = (
pl.get("point_id")
or pl.get("qdrant_id")
or pl.get("chunk_id")
or pl.get("id")
or f"{pl.get('note_id','missing')}#{i+1}"
)
pid: Any = None
if isinstance(raw, int):
pid = raw
elif isinstance(raw, str) and raw.isdigit():
try:
pid = int(raw)
except Exception:
pid = None
if pid is None:
pid = _uuid5_deterministic("chunk", str(raw))
vec = vectors[i] if i < len(vectors) else None
if vec is None:
continue
pts.append(rest.PointStruct(id=pid, vector=vec, payload=pl))
return chunks_col, pts
def _points_for_edges(prefix: str, edges: list[dict]):
_, _, edges_col = _collection_names(prefix)
pts = []
for i, e in enumerate(edges):
src_id = e.get("source_id") or e.get("src_id") or "src"
dst_id = e.get("target_id") or e.get("dst_id") or "dst"
kind = e.get("kind") or e.get("edge_type") or "edge"
nid = e.get("note_id") or "note"
raw = e.get("point_id") or e.get("qdrant_id")
if raw is None:
raw = f"{nid}:{kind}:{src_id}->{dst_id}:{i}"
pid: Any = None
if isinstance(raw, int):
pid = raw
elif isinstance(raw, str) and raw.isdigit():
try:
pid = int(raw)
except Exception:
pid = None
if pid is None:
pid = _uuid5_deterministic("edge", str(raw))
pts.append(rest.PointStruct(id=pid, vector=None, payload=e))
return edges_col, pts
def _upsert_batch(client, collection_name: str, points: list):
if not points:
return
# Collections are single-vector (size/distance) per original setup script.
client.upsert(collection_name=collection_name, points=points, wait=True)
# Type-Registry (optional)
try:
from app.core.type_registry import load_type_registry, resolve_note_type, get_type_config, effective_chunk_profile
except Exception:
load_type_registry = None # type: ignore
resolve_note_type = None # type: ignore
get_type_config = None # type: ignore
effective_chunk_profile = None # type: ignore
try: try:
from app.core.embed import embed_texts # optional from app.core.embed import embed_texts # optional
@ -272,76 +200,6 @@ def _env(key: str, default: str) -> str:
return (os.environ.get(key) or default).strip().lower() return (os.environ.get(key) or default).strip().lower()
def _resolve_dim(cfg) -> int:
# Try common attribute names on QdrantConfig
for attr in ("dim", "vector_dim", "dimension", "dimensions", "embedding_dim", "embed_dim", "vector_size", "size"):
try:
v = getattr(cfg, attr)
if isinstance(v, int) and v > 0:
return v
except Exception:
pass
# Try environment fallbacks
for key in ("MINDNET_DIM", "EMBED_DIM", "EMBEDDING_DIM", "QDRANT_VECTOR_DIM", "QDRANT_DIM", "VECTOR_DIM", "DIM"):
try:
v = int(os.environ.get(key, "").strip() or "0")
if v > 0:
return v
except Exception:
continue
# Conservative default
return 384
# ---- Compatibility wrappers (no direct calls to project-specific signatures) ----
def _ensure_collections_compat(client, cfg, dim):
    """
    Invoke ensure_collections with whichever call signature it accepts.

    Signatures are tried in this order, moving on whenever a call raises
    TypeError:
      1. ensure_collections(client, cfg)
      2. ensure_collections(client, cfg.prefix, dim)
      3. ensure_collections(client, cfg.prefix)
      4. ensure_collections(client)

    Returns the result of the first call that does not raise TypeError, or
    None when every variant is rejected.
    """
    # Argument tuples are built lazily so that cfg.prefix is only read once
    # the corresponding variant is actually attempted.
    arg_builders = (
        lambda: (client, cfg),
        lambda: (client, cfg.prefix, dim),
        lambda: (client, cfg.prefix),
        lambda: (client,),
    )
    for build_args in arg_builders:
        try:
            return ensure_collections(*build_args())
        except TypeError:
            continue
    # If everything fails, do nothing
    return None
def _ensure_payload_indexes_compat(client, cfg):
    """
    Create payload indexes via _ensure_payload_indexes, first passing cfg and
    then cfg's prefix (None if absent). A TypeError or AttributeError moves on
    to the next variant; if both are rejected the call is silently skipped.
    """
    # Second argument is resolved lazily, only when its variant is attempted.
    second_arg_variants = (
        lambda: cfg,
        lambda: getattr(cfg, "prefix", None),
    )
    for pick_arg in second_arg_variants:
        try:
            _ensure_payload_indexes(client, pick_arg())
        except (TypeError, AttributeError):
            continue
        return
    # final no-op
    return
# --------------------------------------------------------------------- # ---------------------------------------------------------------------
# Main # Main
# --------------------------------------------------------------------- # ---------------------------------------------------------------------
@ -380,25 +238,15 @@ def main() -> None:
src = _env("MINDNET_HASH_SOURCE", args.hash_source or "parsed") # parsed|raw src = _env("MINDNET_HASH_SOURCE", args.hash_source or "parsed") # parsed|raw
norm = _env("MINDNET_HASH_NORMALIZE", args.hash_normalize or "canonical") # canonical|none norm = _env("MINDNET_HASH_NORMALIZE", args.hash_normalize or "canonical") # canonical|none
note_scope_refs_env = (_env("MINDNET_NOTE_SCOPE_REFS", "false") == "true") note_scope_refs_env = (_env("MINDNET_NOTE_SCOPE_REFS", "false") == "true")
note_scope_refs_flag = args.note_scope_refs or note_scope_refs_env note_scope_refs = args.note_scope_refs or note_scope_refs_env
compare_text = args.compare_text or (_env("MINDNET_COMPARE_TEXT", "false") == "true") compare_text = args.compare_text or (_env("MINDNET_COMPARE_TEXT", "false") == "true")
cfg = QdrantConfig.from_env() cfg = QdrantConfig.from_env()
if args.prefix: if args.prefix:
cfg.prefix = args.prefix.strip() cfg.prefix = args.prefix.strip()
client = get_client(cfg) client = get_client(cfg)
dim = _resolve_dim(cfg) ensure_collections(client, cfg.prefix, cfg.dim)
ensure_payload_indexes(client, cfg.prefix)
# Collections & Indexe (nur über Wrapper)
_ensure_collections_compat(client, cfg, dim)
_ensure_payload_indexes_compat(client, cfg)
# Type-Registry laden (optional)
reg = None
if load_type_registry is not None:
reg = load_type_registry()
if reg.get("_using_defaults"):
print(json.dumps({"warn": "type_registry_missing_or_invalid", "info": reg.get("_warning")}))
root = os.path.abspath(args.vault) root = os.path.abspath(args.vault)
@ -467,16 +315,6 @@ def main() -> None:
processed += 1 processed += 1
# -------- Type-Registry: Typvalidierung & Konfiguration --------
fm_type = (fm.get("type") or "concept")
if resolve_note_type is not None:
resolved_type = resolve_note_type(fm_type, reg or {})
else:
resolved_type = (fm_type or "concept")
type_cfg = get_type_config(resolved_type, reg or {"types":{"concept":{}}}) if get_type_config else {}
chunk_profile = effective_chunk_profile(resolved_type, reg or {}) if effective_chunk_profile else None
retriever_weight = type_cfg.get("retriever_weight")
# -------- Build new payload (includes 'hashes') -------- # -------- Build new payload (includes 'hashes') --------
note_pl = make_note_payload( note_pl = make_note_payload(
parsed, parsed,
@ -489,10 +327,6 @@ def main() -> None:
if not note_pl.get("fulltext"): if not note_pl.get("fulltext"):
note_pl["fulltext"] = getattr(parsed, "body", "") or "" note_pl["fulltext"] = getattr(parsed, "body", "") or ""
# retriever_weight (optional) persistieren
if isinstance(retriever_weight, (int, float)):
note_pl["retriever_weight"] = float(retriever_weight)
note_id = note_pl.get("note_id") or fm.get("id") note_id = note_pl.get("note_id") or fm.get("id")
if not note_id: if not note_id:
print(json.dumps({"path": path, "error": "Missing note_id after payload build"})) print(json.dumps({"path": path, "error": "Missing note_id after payload build"}))
@ -522,15 +356,13 @@ def main() -> None:
chunk_pls: List[Dict[str, Any]] = [] chunk_pls: List[Dict[str, Any]] = []
try: try:
body_text = getattr(parsed, "body", "") or "" body_text = getattr(parsed, "body", "") or ""
# assemble_chunks nutzt weiterhin den Note-Typ (keine Breaking Changes) chunks = assemble_chunks(fm["id"], body_text, fm.get("type", "concept"))
chunks = assemble_chunks(fm["id"], body_text, resolved_type) chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks, note_text=body_text)
# chunk_profile beeinflusst ggf. nur die Fenster-Overlap-Synthese
chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks, note_text=body_text, chunk_profile=chunk_profile)
except Exception as e: except Exception as e:
print(json.dumps({"path": path, "note_id": note_id, "error": f"chunk build failed: {type(e).__name__}: {e}"})) print(json.dumps({"path": path, "note_id": note_id, "error": f"chunk build failed: {type(e).__name__}: {e}"}))
continue continue
vecs: List[List[float]] = [[0.0] * dim for _ in chunk_pls] vecs: List[List[float]] = [[0.0] * cfg.dim for _ in chunk_pls]
if embed_texts and chunk_pls: if embed_texts and chunk_pls:
try: try:
texts_for_embed = [(pl.get("window") or pl.get("text") or "") for pl in chunk_pls] texts_for_embed = [(pl.get("window") or pl.get("text") or "") for pl in chunk_pls]
@ -544,27 +376,22 @@ def main() -> None:
if changed and (not do_baseline_only): if changed and (not do_baseline_only):
try: try:
note_refs = note_pl.get("references") or [] note_refs = note_pl.get("references") or []
# Registry kann note-scope references additiv anschalten
edge_defaults = [e for e in (type_cfg.get("edge_defaults") or []) if isinstance(e, str)]
eff_note_scope_refs = bool(note_scope_refs_flag or ("references" in edge_defaults))
edges = build_edges_for_note( edges = build_edges_for_note(
note_id, note_id,
chunk_pls, chunk_pls,
note_level_references=note_refs, note_level_references=note_refs,
include_note_scope_refs=eff_note_scope_refs, include_note_scope_refs=note_scope_refs,
) )
except Exception as e: except Exception as e:
edges_failed = True edges_failed = True
edges = [] edges = []
# WICHTIG: Wir brechen NICHT mehr ab — Note & Chunks werden geschrieben.
print(json.dumps({"path": path, "note_id": note_id, "warn": f"build_edges_for_note failed, skipping edges: {type(e).__name__}: {e}"})) print(json.dumps({"path": path, "note_id": note_id, "warn": f"build_edges_for_note failed, skipping edges: {type(e).__name__}: {e}"}))
# -------- Summary -------- # -------- Summary --------
summary = { summary = {
"note_id": note_id, "note_id": note_id,
"title": fm.get("title"), "title": fm.get("title"),
"type": resolved_type,
"chunk_profile": chunk_profile,
"retriever_weight": retriever_weight,
"chunks": len(chunk_pls), "chunks": len(chunk_pls),
"edges": len(edges), "edges": len(edges),
"edges_failed": edges_failed, "edges_failed": edges_failed,
@ -590,12 +417,12 @@ def main() -> None:
merged_hashes = {} merged_hashes = {}
merged_hashes.update(old_hashes) merged_hashes.update(old_hashes)
merged_hashes.update(note_pl.get("hashes") or {}) merged_hashes.update(note_pl.get("hashes") or {})
if has_old and old_payload: if old_payload:
note_pl["hash_fulltext"] = old_payload.get("hash_fulltext", note_pl.get("hash_fulltext")) note_pl["hash_fulltext"] = old_payload.get("hash_fulltext", note_pl.get("hash_fulltext"))
note_pl["hash_signature"] = old_payload.get("hash_signature", note_pl.get("hash_signature")) note_pl["hash_signature"] = old_payload.get("hash_signature", note_pl.get("hash_signature"))
note_pl["hashes"] = merged_hashes note_pl["hashes"] = merged_hashes
notes_name, note_pts = _points_for_note(cfg.prefix, note_pl, None, dim) notes_name, note_pts = points_for_note(cfg.prefix, note_pl, None, cfg.dim)
_upsert_batch(client, notes_name, note_pts) upsert_batch(client, notes_name, note_pts)
continue continue
if not changed: if not changed:
@ -607,17 +434,17 @@ def main() -> None:
except Exception as e: except Exception as e:
print(json.dumps({"path": path, "note_id": note_id, "warn": f"purge failed: {e}"})) print(json.dumps({"path": path, "note_id": note_id, "warn": f"purge failed: {e}"}))
notes_name, note_pts = _points_for_note(cfg.prefix, note_pl, None, dim) notes_name, note_pts = points_for_note(cfg.prefix, note_pl, None, cfg.dim)
_upsert_batch(client, notes_name, note_pts) upsert_batch(client, notes_name, note_pts)
if chunk_pls: if chunk_pls:
chunks_name, chunk_pts = _points_for_chunks(cfg.prefix, chunk_pls, vecs) chunks_name, chunk_pts = points_for_chunks(cfg.prefix, chunk_pls, vecs)
_upsert_batch(client, chunks_name, chunk_pts) upsert_batch(client, chunks_name, chunk_pts)
if edges: if edges:
edges_name, edge_pts = _points_for_edges(cfg.prefix, edges) edges_name, edge_pts = points_for_edges(cfg.prefix, edges)
_upsert_batch(client, edges_name, edge_pts) upsert_batch(client, edges_name, edge_pts)
print(f"Done. Processed notes: {processed}") print(f"Done. Processed notes: {processed}")
if __name__ == "__main__": if __name__ == "__main__":
main() main()