mindnet/scripts/setup_mindnet_collections.py
Lars c5215e22e7 Implement WP-26 v1.0 - Phase 2: Enhance edge scoring and aggregation configuration
- Introduced configurable edge scoring with internal and external boosts for intra-note edges.
- Added aggregation configuration to support note-level and chunk-level retrieval strategies.
- Updated retriever and graph subgraph modules to utilize new scoring and aggregation logic.
- Enhanced YAML configuration to include new parameters for edge scoring and aggregation levels.
- Added boolean indexing for filtering based on edge properties in the setup script.
2026-01-25 21:06:13 +01:00

204 lines
7.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
FILE: scripts/setup_mindnet_collections.py
VERSION: 2.2.0 (2026-01-25)
STATUS: Active
COMPATIBILITY: v2.9.1 (Post-WP14/WP-15b), WP-26 (Intra-Note-Edges)
Zweck:
-------
Richtet die Qdrant-Collections für das mindnet-Projekt ein.
Legt Collections und Payload-Indizes an (idempotent).
Funktionsweise:
---------------
1. Prüft Qdrant-Verfügbarkeit (optional /ready Endpoint)
2. Legt drei Collections an:
- {prefix}_chunks: Semantische Suche über Text-Chunks (Vektor: dim/Cosine)
- {prefix}_notes: Metadaten pro Notiz (Vektor: dim/Cosine)
- {prefix}_edges: Link-Kanten (Dummy-Vektor size=1, Filter über Payload)
3. Richtet Payload-Indizes ein:
- Keyword-Indizes für häufige Filter-Felder
- Text-Index für Volltextsuche (chunks.text)
- Bool-Index für Intra-Note-Kanten (edges.is_internal, WP-26)
Ergebnis-Interpretation:
------------------------
- Ausgabe: Status pro Collection ([+] angelegt, [=] existiert bereits)
- Abschluss: JSON-Liste aller vorhandenen Collections
- Exit-Code 0: Erfolgreich
- Exit-Code 1: Fehler (z.B. Qdrant nicht erreichbar)
Verwendung:
-----------
- Initial-Setup einer neuen mindnet-Instanz
- Nach Qdrant-Reset oder Migration
- Validierung der Collection-Struktur
Hinweise:
---------
- Idempotent: Überspringt existierende Collections
- Nutzt HTTP-API direkt (kein qdrant-client)
- Legt nur Basis-Indizes an (erweiterte Indizes via ensure_payload_indexes)
Aufruf:
-------
python3 -m scripts.setup_mindnet_collections --qdrant-url http://127.0.0.1:6333 --prefix mindnet --dim 768
python3 -m scripts.setup_mindnet_collections --prefix mindnet_dev --dim 768 --distance Cosine
Parameter:
----------
--qdrant-url URL Qdrant-URL (Default: http://127.0.0.1:6333)
--prefix TEXT Collection-Präfix (Default: mindnet)
--dim INT Vektor-Dimension (Default: 384, empfohlen: 768 für nomic)
--distance METRIC Cosine | Euclid | Dot (Default: Cosine)
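Änderungen:
-----------
v2.2.0 (2026-01-25): WP-26 v1.0 (Intra-Note-Edges)
- Bool-Index auf edges.is_internal
- Zusätzliche Keyword-Indizes für chunks (note_type, type) und edges (kind, source_id, target_id, scope, provenance)
v2.1.0 (2025-12-15): Kompatibilität mit WP-14 Modularisierung
- Dokumentation aktualisiert
v1.0.0: Initial Release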
"""
from __future__ import annotations
import argparse
import json
import sys
from dataclasses import dataclass
from typing import Any, Dict
import requests
@dataclass
class QdrantHTTP:
    """Thin wrapper around the Qdrant REST API, built directly on ``requests``.

    Every method issues exactly one HTTP request, except
    ``create_collection``, which first checks for existence.
    """

    # Root URL of the Qdrant server; a trailing slash is tolerated.
    base_url: str

    def _url(self, path: str) -> str:
        """Return the absolute URL for ``path`` (which must start with '/')."""
        return self.base_url.rstrip("/") + path

    def _send(self, method: str, path: str, **kwargs) -> requests.Response:
        """Send one request and return the response without checking its status.

        A 20 second timeout is applied unless the caller supplies its own.
        """
        kwargs.setdefault("timeout", 20)
        return requests.request(method, self._url(path), **kwargs)

    def _checked(self, method: str, path: str, r: requests.Response) -> requests.Response:
        """Return ``r`` unchanged, or raise ``RuntimeError`` for a non-2xx status."""
        if not r.ok:
            raise RuntimeError(f"{method} {self._url(path)} -> {r.status_code} {r.text}")
        return r

    def rq(self, method: str, path: str, **kwargs) -> requests.Response:
        """Send a request and return the response.

        Extra keyword arguments are forwarded to ``requests.request``.

        Raises:
            RuntimeError: if the server answers with a non-2xx status.
        """
        return self._checked(method, path, self._send(method, path, **kwargs))

    def collection_exists(self, name: str) -> bool:
        """Return True if the collection ``name`` exists on the server.

        A 404 answer means "does not exist" and yields False; it must not be
        routed through ``rq``, which would turn it into an exception and make
        it impossible to ever create a missing collection. Any successful
        answer counts as "exists", regardless of the reported health status,
        so that a non-green collection is not re-created.

        Raises:
            RuntimeError: for any non-2xx status other than 404.
        """
        path = f"/collections/{name}"
        r = self._send("GET", path)
        if r.status_code == 404:
            return False
        self._checked("GET", path, r)
        return True

    def create_collection(self, name: str, size: int, distance: str = "Cosine") -> None:
        """Create collection ``name`` unless it already exists (idempotent).

        Args:
            name: Collection name.
            size: Vector dimension.
            distance: Distance metric name sent to the server.
        """
        if self.collection_exists(name):
            print(f"[=] Collection '{name}' existiert bereits überspringe Anlage.")
            return
        payload = {"vectors": {"size": size, "distance": distance}}
        self.rq("PUT", f"/collections/{name}", json=payload)
        print(f"[+] Collection '{name}' angelegt (size={size}, distance={distance}).")

    def create_keyword_index(self, collection: str, field: str) -> None:
        """Request a ``keyword`` payload index on ``collection.field``."""
        payload = {"field_name": field, "field_schema": "keyword"}
        self.rq("PUT", f"/collections/{collection}/index", json=payload)
        print(f"[+] Index keyword on {collection}.{field}")

    def create_text_index(self, collection: str, field: str = "text") -> None:
        """Request a full-text payload index on ``collection.field``."""
        payload = {"field_name": field, "field_schema": {"type": "text"}}
        self.rq("PUT", f"/collections/{collection}/index", json=payload)
        print(f"[+] Index text on {collection}.{field}")

    def create_bool_index(self, collection: str, field: str) -> None:
        """WP-26 v1.0: request a ``bool`` payload index (e.g. for is_internal)."""
        payload = {"field_name": field, "field_schema": "bool"}
        self.rq("PUT", f"/collections/{collection}/index", json=payload)
        print(f"[+] Index bool on {collection}.{field}")

    def list_collections(self) -> list[Dict[str, Any]]:
        """Return the ``result.collections`` list from ``GET /collections``."""
        r = self.rq("GET", "/collections")
        return r.json().get("result", {}).get("collections", [])
def setup_mindnet_collections(q: QdrantHTTP, prefix: str, dim: int, distance: str) -> None:
    """Create the three mindnet collections and their payload indexes.

    Args:
        q: Client used to issue the create-collection and create-index calls.
        prefix: Prefix of the collection names ``{prefix}_chunks``,
            ``{prefix}_notes`` and ``{prefix}_edges``.
        dim: Vector size for the chunks and notes collections.
        distance: Distance metric passed to all three collections.
    """
    chunks_name, notes_name, edges_name = (
        f"{prefix}_{suffix}" for suffix in ("chunks", "notes", "edges")
    )

    # 1) Collections; the edges collection only gets a size-1 dummy vector.
    for coll_name, vector_size in ((chunks_name, dim), (notes_name, dim), (edges_name, 1)):
        q.create_collection(coll_name, size=vector_size, distance=distance)

    # 2) Keyword-indexed fields per collection, in creation order.
    chunk_keywords = (
        # frequent filters
        "note_id", "Status", "Typ", "title", "path",
        "tags", "Rolle", "links",
        # WP-26 v1.0: note_type for filtering (section type vs. note type)
        "note_type",
        "type",  # effective type (section_type || note_type)
    )
    note_keywords = ("note_id", "title", "path", "Typ", "Status", "tags", "Rolle")
    edge_keywords = (
        "src_note_id",
        "dst_note_id",
        "src_chunk_id",
        "dst_chunk_id",
        "link_text",
        "relation",
        "kind",        # WP-26 v1.0: edge type for filtering
        "source_id",   # WP-26 v1.0: source id for graph queries
        "target_id",   # WP-26 v1.0: target id for graph queries
        "scope",       # WP-26 v1.0: "chunk" or "note"
        "provenance",  # WP-26 v1.0: origin of the edge
    )

    # chunks: keyword filters followed by the full-text index on the text field
    for field_name in chunk_keywords:
        q.create_keyword_index(chunks_name, field_name)
    q.create_text_index(chunks_name, "text")

    # notes: note metadata
    for field_name in note_keywords:
        q.create_keyword_index(notes_name, field_name)

    # edges: graph edges (filter-only)
    for field_name in edge_keywords:
        q.create_keyword_index(edges_name, field_name)

    # WP-26 v1.0: boolean index for is_internal (intra-note edge filtering)
    q.create_bool_index(edges_name, "is_internal")
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command-line options of the setup script.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is the previous behaviour; passing
            a list allows programmatic use.

    Returns:
        Namespace with ``qdrant_url``, ``prefix``, ``dim`` and ``distance``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--qdrant-url", default="http://127.0.0.1:6333", help="z.B. http://127.0.0.1:6333")
    ap.add_argument("--prefix", default="mindnet", help="Collection-Präfix (default: mindnet)")
    ap.add_argument("--dim", type=int, default=384, help="Embedding-Dimension (z.B. 384 für MiniLM)")
    ap.add_argument("--distance", default="Cosine", choices=["Cosine", "Euclid", "Dot"], help="Distanzmetrik")
    return ap.parse_args(argv)
def main() -> int:
    """Run the collection setup and return the process exit code.

    Returns:
        0 on success, 1 if any step other than the readiness probe raised.
    """
    cli = parse_args()
    client = QdrantHTTP(cli.qdrant_url)
    exit_code = 0
    try:
        # Readiness probe is best-effort: a failure is reported as a warning
        # and does not abort the setup.
        try:
            ready_text = client.rq("GET", "/ready").text.strip()
            if ready_text:
                print(f"[ready] {ready_text}")
        except Exception as probe_error:
            print(f"[warn] /ready nicht erreichbar oder kein Text: {probe_error}")

        setup_mindnet_collections(
            client,
            prefix=cli.prefix,
            dim=cli.dim,
            distance=cli.distance,
        )

        existing = client.list_collections()
        print("\n[Info] Collections vorhanden:")
        print(json.dumps(existing, indent=2, ensure_ascii=False))
    except Exception as failure:
        # Top-level boundary: report on stderr and signal failure via exit code.
        print(f"[ERROR] {failure}", file=sys.stderr)
        exit_code = 1
    return exit_code
# Script entry point: propagate main()'s return value as the process exit code.
if __name__ == "__main__":
    sys.exit(main())