# mindnet/scripts/import_markdown.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Script: scripts/import_markdown.py — Markdown → Qdrant (Notes, Chunks, Edges)
Version: 3.7.1
Date: 2025-09-30
(Change: embeddings are computed from payload['window']; chunk payloads carry offsets/overlap)
"""
from __future__ import annotations

import argparse
import json
import os
import sys
from typing import Dict, List, Optional, Tuple, Any, Set

from dotenv import load_dotenv
from qdrant_client.http import models as rest

from app.core.parser import (
    read_markdown,
    normalize_frontmatter,
    validate_required_frontmatter,
)
from app.core.note_payload import make_note_payload
from app.core.chunker import assemble_chunks
from app.core.chunk_payload import make_chunk_payloads
try:
    from app.core.derive_edges import build_edges_for_note
except Exception:  # pragma: no cover
    from app.core.edges import build_edges_for_note  # type: ignore
from app.core.qdrant import (
    QdrantConfig,
    get_client,
    ensure_collections,
    ensure_payload_indexes,
)
from app.core.qdrant_points import (
    points_for_chunks,
    points_for_note,
    points_for_edges,
    upsert_batch,
)

try:
    from app.core.embed import embed_texts  # optional
except Exception:
    embed_texts = None

# (… unchanged: helper functions list_qdrant_note_ids, purge_note_artifacts, etc. …)
# --- Note: omitted here for brevity; keep using your existing v3.7.0 file ---
# --- and adapt ONLY the marked places below. The complete file can be provided again on request. ---

def main() -> None:
    # CLI entry point of the Markdown -> Qdrant importer.
    #
    # Visible flow: load .env, parse CLI arguments, build the Qdrant config and
    # client, make sure collections and payload indexes exist, then (per note)
    # assemble chunks, build chunk payloads and compute one embedding vector
    # per chunk payload.
    #
    # NOTE(review): this listing is a patch sketch. File iteration, parsing,
    # edge building, change detection and the actual upserts are elided and
    # only hinted at in comments; the indented per-note section below belongs
    # to a `for path in files:` loop that is not present here, so the function
    # does not parse as written. Merge it into the full v3.7.0 file before use.
    load_dotenv()
    ap = argparse.ArgumentParser()
    # (… all arguments as in v3.7.0 …)
    ap.add_argument("--vault", required=True)
    ap.add_argument("--apply", action="store_true")
    ap.add_argument("--purge-before-upsert", action="store_true")
    ap.add_argument("--note-id")
    ap.add_argument("--embed-note", action="store_true")
    ap.add_argument("--force-replace", action="store_true")
    ap.add_argument("--hash-mode", choices=["body", "frontmatter", "full"], default=None)
    ap.add_argument("--hash-normalize", choices=["canonical", "none"], default=None)
    ap.add_argument("--hash-source", choices=["parsed", "raw"], default=None)
    ap.add_argument("--note-scope-refs", action="store_true")
    ap.add_argument("--debug-hash-diff", action="store_true")
    ap.add_argument("--compare-text", action="store_true")
    ap.add_argument("--baseline-modes", action="store_true")
    ap.add_argument("--sync-deletes", action="store_true")
    ap.add_argument("--prefix")
    args = ap.parse_args()

    # (… configuration / Qdrant setup as before …)
    # NOTE(review): QdrantConfig and get_client are already imported at module
    # level; this local import looks redundant — confirm and remove.
    from app.core.qdrant import QdrantConfig, get_client
    cfg = QdrantConfig.from_env()
    # A --prefix given on the command line overrides the configured prefix
    # (surrounding whitespace is stripped).
    if args.prefix:
        cfg.prefix = args.prefix.strip()
    client = get_client(cfg)
    ensure_collections(client, cfg.prefix, cfg.dim)
    ensure_payload_indexes(client, cfg.prefix)

    # Absolute path of the vault directory passed via --vault.
    root = os.path.abspath(args.vault)
    # (… file iteration, parsing, validation …)

    # Counter of processed notes; presumably incremented inside the elided
    # per-file loop — TODO confirm against the full file.
    processed = 0
    # (… for path in files: …)
        # parsed = read_markdown(path)  -> provides .frontmatter, .body
        # fm = normalize_frontmatter(parsed.frontmatter); validate_required_frontmatter(fm)
        # note_pl = make_note_payload(…)

        # -------- Chunks & Payloads --------
        # A missing or empty body is treated as ""; the note type falls back
        # to "concept" when the frontmatter has no "type" key.
        chunks = assemble_chunks(fm["id"], getattr(parsed, "body", "") or "", fm.get("type", "concept"))
        # NEW: pass note_text to make_chunk_payloads so that the offsets are correct
        chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks, note_text=(getattr(parsed, "body", "") or ""))

        # -------- Embeddings --------
        # Default: one zero vector of length cfg.dim per chunk payload. These
        # are kept when no embedder could be imported, when there are no
        # chunks, or when the embedding call raises.
        vecs = [[0.0] * cfg.dim for _ in chunk_pls]
        if embed_texts and chunk_pls:
            try:
                # NEW: embed the 'window' field (incl. overlap/context); fall
                # back to 'text', then to an empty string.
                texts_for_embed = [ (pl.get("window") or pl.get("text") or "") for pl in chunk_pls ]
                # NOTE(review): the number of returned vectors is not checked
                # against len(chunk_pls) — confirm embed_texts guarantees it.
                vecs = embed_texts(texts_for_embed)
            except Exception as e:
                # Best effort: report the failure as a JSON line on stdout and
                # continue with the zero vectors assigned above.
                print(json.dumps({"note_id": fm["id"], "warn": f"embed_texts failed, using zeros: {e}"}))

        # (… edges, changed-detection logic, writes as before …)
        # upsert: points_for_chunks(prefix, chunk_pls, vecs)
        #         points_for_note(prefix, note_pl, None, cfg.dim)
        #         points_for_edges(prefix, edges)

    # (elided final summary output)
    # print("Done. Processed notes: …")

# Run the importer only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()