scripts/import_markdown.py aktualisiert

2025-11-08 15:35:00 +01:00 · 2025-11-08 15:35:00 +01:00 · 3282f85007
commit 3282f85007
parent b186569750
1 changed files with 38 additions and 10 deletions
--- a/scripts/import_markdown.py
+++ b/scripts/import_markdown.py
@@ -219,6 +219,26 @@ def _resolve_mode(val: Optional[str]) -> str:
 def _env(key: str, default: str) -> str:
 def _resolve_dim(cfg) -> int:
    """Determine the embedding vector dimension to use.

    Resolution order:
      1. The first attribute on ``cfg`` (probed under several common names)
         that holds a positive integer.
      2. The first environment variable (probed under several common names)
         whose value parses as a positive integer.
      3. The fallback value 384.

    Args:
        cfg: Configuration object; it is only probed with ``getattr``, so any
            object works (the original comment names QdrantConfig).

    Returns:
        A positive ``int`` vector dimension.
    """
    attr_names = (
        "dim", "vector_dim", "dimension", "dimensions",
        "embedding_dim", "embed_dim", "vector_size", "size",
    )
    env_keys = (
        "MINDNET_DIM", "EMBED_DIM", "EMBEDDING_DIM", "QDRANT_VECTOR_DIM",
        "QDRANT_DIM", "VECTOR_DIM", "DIM",
    )

    # Try common attribute names on the config object.
    for attr in attr_names:
        try:
            value = getattr(cfg, attr, None)
        except Exception:
            # Best effort: a property that raises must not abort the lookup.
            continue
        # bool is a subclass of int; without this guard a flag set to True
        # would be returned as the vector dimension.
        if isinstance(value, int) and not isinstance(value, bool) and value > 0:
            return value

    # Try environment fallbacks; unset, blank, non-numeric, and non-positive
    # values are skipped.
    for key in env_keys:
        raw = os.environ.get(key, "").strip()
        if not raw:
            continue
        try:
            value = int(raw)
        except ValueError:
            continue
        if value > 0:
            return value

    # Conservative default: 384 dimensions (MiniLM, per the original note the
    # usual choice in this project).
    return 384
    return (os.environ.get(key) or default).strip().lower()
@@ -267,7 +287,15 @@ def main() -> None:
    if args.prefix:
        cfg.prefix = args.prefix.strip()
    client = get_client(cfg)
-    ensure_collections(client, cfg.prefix, cfg.dim)
+    dim = _resolve_dim(cfg)
    # ensure_collections signature compatibility
    try:
        ensure_collections(client, cfg.prefix, dim)
    except TypeError:
        try:
            ensure_collections(client, cfg.prefix)
        except TypeError:
            ensure_collections(client)
    # abwärtskompatible Index-Erstellung
    _ensure_payload_indexes(client, cfg.prefix)
@@ -408,7 +436,7 @@ def main() -> None:
            print(json.dumps({"path": path, "note_id": note_id, "error": f"chunk build failed: {type(e).__name__}: {e}"}))
            continue
-        vecs: List[List[float]] = [[0.0] * cfg.dim for _ in chunk_pls]
+        vecs: List[List[float]] = [[0.0] * dim for _ in chunk_pls]
        if embed_texts and chunk_pls:
            try:
                texts_for_embed = [(pl.get("window") or pl.get("text") or "") for pl in chunk_pls]
@@ -473,8 +501,8 @@ def main() -> None:
                note_pl["hash_fulltext"] = old_payload.get("hash_fulltext", note_pl.get("hash_fulltext"))
                note_pl["hash_signature"] = old_payload.get("hash_signature", note_pl.get("hash_signature"))
            note_pl["hashes"] = merged_hashes
-            notes_name, note_pts = points_for_note(cfg.prefix, note_pl, None, cfg.dim)
+            notes_name, note_pts = points_for_note(cfg.prefix, note_pl, None, dim)
-            upsert_batch(client, notes_name, note_pts)
+            _upsert_batch(client, notes_name, note_pts)
            continue
        if not changed:
@@ -486,14 +514,14 @@ def main() -> None:
            except Exception as e:
                print(json.dumps({"path": path, "note_id": note_id, "warn": f"purge failed: {e}"}))
-        notes_name, note_pts = points_for_note(cfg.prefix, note_pl, None, cfg.dim)
+        notes_name, note_pts = points_for_note(cfg.prefix, note_pl, None, dim)
-        upsert_batch(client, notes_name, note_pts)
+        _upsert_batch(client, notes_name, note_pts)
        if chunk_pls:
-            chunks_name, chunk_pts = points_for_chunks(cfg.prefix, chunk_pls, vecs)
+            chunks_name, chunk_pts = _points_for_chunks(cfg.prefix, chunk_pls, vecs)
-            upsert_batch(client, chunks_name, chunk_pts)
+            _upsert_batch(client, chunks_name, chunk_pts)
        if edges:
-            edges_name, edge_pts = points_for_edges(cfg.prefix, edges)
+            edges_name, edge_pts = _points_for_edges(cfg.prefix, edges)
-            upsert_batch(client, edges_name, edge_pts)
+            _upsert_batch(client, edges_name, edge_pts)
    print(f"Done. Processed notes: {processed}")