diff --git a/scripts/import_markdown.py b/scripts/import_markdown.py index 898c479..2ace4d1 100644 --- a/scripts/import_markdown.py +++ b/scripts/import_markdown.py @@ -2,13 +2,16 @@ # -*- coding: utf-8 -*- """ Script: scripts/import_markdown.py — Markdown → Qdrant (Notes, Chunks, Edges) -Version: 3.8.4 +Version: 3.8.5 Date: 2025-11-08 -Changes vs 3.8.3 ------------------ -- Fixed SyntaxError by moving `import uuid` to top-level and simplifying the UUIDv5 helper. -- No functional changes otherwise. +Notes +----- +- Uses compatibility wrappers for ensure_collections and payload index creation. +- Provides robust local fallbacks for qdrant_points helpers. +- Generates valid Qdrant point IDs (int or UUIDv5) if none provided. +- Detects Named-Vector schema and coerces points accordingly. +- Integrates Type-Registry without breaking older behavior. """ from __future__ import annotations @@ -28,6 +31,7 @@ def _uuid5_deterministic(*parts: str) -> str: base = ":".join(str(p) for p in parts if p is not None) return str(_uuid.uuid5(_MN_NAMESPACE, base)) +# --- Project imports (as in stable 20251105) --- from app.core.parser import ( read_markdown, normalize_frontmatter, @@ -41,11 +45,10 @@ try: except Exception: # pragma: no cover from app.core.edges import build_edges_for_note # type: ignore -# Qdrant-Basics from app.core.qdrant import ( QdrantConfig, get_client, - ensure_collections, # used only via compatibility wrapper below + ensure_collections, # used only via wrapper ) # Backward-compatible import for payload index creation @@ -56,10 +59,10 @@ except Exception: from app.core.qdrant import ensure_payload_indices as _ensure_payload_indexes # older name except Exception: def _ensure_payload_indexes(*_args, **_kwargs): - # No-Op: older releases without dedicated index creation + # No-Op for older releases without explicit payload index creation return None -# Qdrant points helpers (robust against older names / missing module functions) +# Qdrant points helpers (try project first, then safe local fallbacks) try: from app.core.qdrant_points import ( points_for_chunks as _points_for_chunks, @@ -68,17 +71,20 @@ try: upsert_batch as _upsert_batch, ) except Exception: - # Local fallbacks (No-Break) - from qdrant_client.http import models as _rest - + # ---- Local fallbacks ---- def _collection_names(prefix: str): return f"{prefix}_notes", f"{prefix}_chunks", f"{prefix}_edges" def _points_for_note(prefix: str, note_payload: dict, note_vec, dim: int): notes_col, _, _ = _collection_names(prefix) - raw = note_payload.get("point_id") or note_payload.get("qdrant_id") or note_payload.get("note_id") or note_payload.get("id") or note_payload.get("path") - # Accept integer IDs if provided - pid = None + raw = ( + note_payload.get("point_id") + or note_payload.get("qdrant_id") + or note_payload.get("note_id") + or note_payload.get("id") + or note_payload.get("path") + ) + pid: Any = None if isinstance(raw, int): pid = raw elif isinstance(raw, str) and raw.isdigit(): @@ -89,15 +95,21 @@ except Exception: if pid is None: pid = _uuid5_deterministic("note", str(raw or "")) vec = note_vec if note_vec is not None else [0.0] * int(dim) - pt = _rest.PointStruct(id=pid, vector=vec, payload=note_payload) + pt = rest.PointStruct(id=pid, vector=vec, payload=note_payload) return notes_col, [pt] def _points_for_chunks(prefix: str, chunk_payloads: list[dict], vectors: list[list[float]]): _, chunks_col, _ = _collection_names(prefix) pts = [] for i, pl in enumerate(chunk_payloads): - raw = pl.get("point_id") or pl.get("qdrant_id") or pl.get("chunk_id") or pl.get("id") or f"{pl.get('note_id','missing')}#{i+1}" - pid = None + raw = ( + pl.get("point_id") + or pl.get("qdrant_id") + or pl.get("chunk_id") + or pl.get("id") + or f"{pl.get('note_id','missing')}#{i+1}" + ) + pid: Any = None if isinstance(raw, int): pid = raw elif isinstance(raw, str) and raw.isdigit(): @@ -110,7 +122,7 @@ except Exception: vec = vectors[i] if i < len(vectors) else None if vec is None: continue - pts.append(_rest.PointStruct(id=pid, vector=vec, payload=pl)) + pts.append(rest.PointStruct(id=pid, vector=vec, payload=pl)) return chunks_col, pts def _points_for_edges(prefix: str, edges: list[dict]): @@ -124,7 +136,7 @@ except Exception: raw = e.get("point_id") or e.get("qdrant_id") if raw is None: raw = f"{nid}:{kind}:{src_id}->{dst_id}:{i}" - pid = None + pid: Any = None if isinstance(raw, int): pid = raw elif isinstance(raw, str) and raw.isdigit(): @@ -134,14 +146,14 @@ except Exception: pid = None if pid is None: pid = _uuid5_deterministic("edge", str(raw)) - pts.append(_rest.PointStruct(id=pid, vector=None, payload=e)) + pts.append(rest.PointStruct(id=pid, vector=None, payload=e)) return edges_col, pts def _upsert_batch(client, collection_name: str, points: list): - if not points: - return - pts = _coerce_points_for_collection(client, collection_name, points) - client.upsert(collection_name=collection_name, points=pts, wait=True) + if not points: + return + pts = _coerce_points_for_collection(client, collection_name, points) + client.upsert(collection_name=collection_name, points=pts, wait=True) # Type-Registry (optional) try: @@ -277,12 +289,10 @@ def _resolve_dim(cfg) -> int: return v except Exception: continue - # Conservative default: MiniLM 384d + # Conservative default return 384 -# ---- Compatibility wrappers (no direct calls to project-specific signatures) ---- - # ---- Qdrant vector schema detection & point coercion ---- def _get_vector_schema(client, collection_name: str): """ @@ -293,17 +303,20 @@ def _get_vector_schema(client, collection_name: str): try: info = client.get_collection(collection_name=collection_name) vecs = getattr(info, "vectors", None) + # Single-vector config if hasattr(vecs, "size") and isinstance(vecs.size, int): return {"kind": "single", "size": vecs.size} - if hasattr(vecs, "config"): - # NamedVectors as dict-like in .config - names = list(getattr(vecs, "config", {}).keys()) + # Named-vectors config + cfg = getattr(vecs, "config", None) + if isinstance(cfg, dict) and cfg: + names = list(cfg.keys()) if names: return {"kind": "named", "names": names} except Exception: pass return {"kind": "single", "size": None} + def _coerce_points_for_collection(client, collection_name: str, points: list): """ If collection uses named vectors, wrap each point's .vector into .vectors{: vector}. @@ -317,20 +330,19 @@ def _coerce_points_for_collection(client, collection_name: str, points: list): if not names: return points primary = names[0] - from qdrant_client.http import models as _rest fixed = [] for pt in points: - # pt may be a dataclass; create a new PointStruct when needed vec = getattr(pt, "vector", None) if vec is not None: - fixed.append(_rest.PointStruct(id=pt.id, vectors={primary: vec}, payload=pt.payload)) + fixed.append(rest.PointStruct(id=pt.id, vectors={primary: vec}, payload=pt.payload)) else: - # keep as-is (no vectors) fixed.append(pt) return fixed except Exception: return points + +# ---- Compatibility wrappers (no direct calls to project-specific signatures) ---- def _ensure_collections_compat(client, cfg, dim): """ Call ensure_collections with the correct signature across releases: @@ -356,6 +368,7 @@ def _ensure_collections_compat(client, cfg, dim): # If everything fails, do nothing return None + def _ensure_payload_indexes_compat(client, cfg): """ Try calling payload index creation with cfg, then prefix; ignore if unsupported. @@ -592,7 +605,6 @@ def main() -> None: except Exception as e: edges_failed = True edges = [] - # WICHTIG: Wir brechen NICHT mehr ab — Note & Chunks werden geschrieben. print(json.dumps({"path": path, "note_id": note_id, "warn": f"build_edges_for_note failed, skipping edges: {type(e).__name__}: {e}"})) # -------- Summary --------