# mindnet/scripts/backfill_edges.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
FILE: scripts/backfill_edges.py
VERSION: 2.1.0 (2025-12-15)
STATUS: Active
COMPATIBILITY: v2.9.1 (Post-WP14/WP-15b)

Zweck:
-------
Füllt fehlende Edges nach, indem es Wikilinks aus dem Vault extrahiert
und in Qdrant schreibt. Nützlich nach Migrationen oder wenn Edges verloren gingen.

Funktionsweise:
---------------
1. Scannt alle Markdown-Dateien im Vault
2. Für jede Datei:
   - Erstellt Note-Stub (minimaler Payload)
   - Erstellt einen Chunk mit dem gesamten Body
   - Extrahiert Wikilinks via derive_wikilink_edges
3. Baut Lookup-Index (Titel/Alias -> note_id)
4. Löst Wikilinks auf und erzeugt Edges
5. Schreibt Edges in Qdrant

Ergebnis-Interpretation:
------------------------
- Ausgabe: JSON mit Statistiken als letzte Zeile auf stdout (übersprungene Dateien erscheinen davor als 'skip ...'-Zeilen)
  * notes_scanned: Anzahl verarbeiteter Notizen
  * edges_upserted: Anzahl geschriebener Edges
- Exit-Code 0: Erfolgreich

Verwendung:
-----------
- Nach Migrationen oder Datenverlust
- Wenn Edges fehlen oder unvollständig sind
- Zur Reparatur des Graphen

Hinweise:
---------
- Überschreibt existierende Edges (kein Merge)
- Nutzt vereinfachtes Chunking (1 Chunk = gesamter Body)
- Erzeugt nur Wikilink-Edges, keine anderen Edge-Typen

Aufruf:
-------
python3 -m scripts.backfill_edges --vault ./vault
python3 -m scripts.backfill_edges --vault ./vault --exclude /.obsidian/ /_backup/

Parameter:
----------
--vault PATH    Pfad zum Vault-Verzeichnis (erforderlich)
--exclude STR [STR ...]  Teilstrings zum Ausschließen: Dateien, deren Pfad einen davon enthält,
                         werden übersprungen (Default: /.obsidian/, /_backup_frontmatter/)

Änderungen:
-----------
v2.1.0 (2025-12-15): Dokumentation aktualisiert
v1.0.0: Initial Release
"""
from __future__ import annotations
import argparse, glob, json, os
from typing import List, Tuple

from app.core.parser import read_markdown  # gibt je nach Implementierung ein Objekt ODER ein (fm, body)-Tuple
from app.core.database.qdrant import QdrantConfig, get_client, ensure_collections
from app.core.database.qdrant_points import points_for_edges, upsert_batch
# TODO: build_note_index und derive_wikilink_edges müssen noch implementiert werden;
#       main() benötigt beide Funktionen und ist erst lauffähig, wenn sie importierbar sind.
# from app.core.graph.graph_derive_edges import build_note_index, derive_wikilink_edges


def _coerce_parsed(note_or_tuple):
    """
    Unterstützt beide Varianten von read_markdown:
      - ParsedNote-ähnlich: hat .frontmatter, .body, .path?
      - Tuple: (frontmatter_dict, body_str)
    Gibt (frontmatter: dict, body: str, path: str) zurück.
    """
    fm, body, path = None, None, None

    # Objekt mit Attributen?
    if hasattr(note_or_tuple, "frontmatter") and hasattr(note_or_tuple, "body"):
        fm = getattr(note_or_tuple, "frontmatter") or {}
        body = getattr(note_or_tuple, "body") or ""
        # manche Implementationen haben .path (voll) oder .relpath
        if hasattr(note_or_tuple, "path"):
            path = getattr(note_or_tuple, "path")
        elif hasattr(note_or_tuple, "relpath"):
            path = getattr(note_or_tuple, "relpath")

    # Tuple?
    elif isinstance(note_or_tuple, (tuple, list)) and len(note_or_tuple) >= 2:
        fm = note_or_tuple[0] or {}
        body = note_or_tuple[1] or ""
        # Pfad ist in dieser Variante unbekannt, wird extern gesetzt
    else:
        raise TypeError("Unsupported return type from read_markdown")

    return fm, body, path


def make_note_stub(path: str, fm: dict, body: str) -> dict:
    """
    Build the minimal note payload needed for link resolution.

    Args:
        path: Path of the note; backslashes are converted to forward slashes
            in the returned payload.
        fm: Frontmatter mapping. ``id`` is preferred as the note id, with
            ``note_id`` as fallback; ``title`` is used when present.
        body: Note body, stored as ``fulltext``.

    Returns:
        A dict with the keys ``note_id``, ``title``, ``path`` and ``fulltext``.
        ``note_id`` is ``None`` when the frontmatter provides neither key. When
        the frontmatter has no title, the file name up to its last dot is used.
    """
    identifier = fm.get("id")
    if not identifier:
        identifier = fm.get("note_id")

    heading = fm.get("title")
    if not heading:
        # Fall back to the file name without its last extension.
        filename = os.path.basename(path)
        heading = filename.rsplit(".", 1)[0]

    normalized_path = path.replace("\\", "/")
    return {
        "note_id": identifier,
        "title": heading,
        "path": normalized_path,
        "fulltext": body,
    }


def iter_notes(vault: str, excludes: List[str]) -> List[Tuple[dict, List[dict]]]:
    """
    Collect ``(note_stub, chunks_for_link_scan)`` pairs for the Markdown files of a vault.

    For the backfill a single chunk per note (the whole body) is enough to
    find ``[[...]]`` links.

    Args:
        vault: Root directory that is searched recursively for ``*.md`` files.
        excludes: Substrings written with forward slashes (e.g. ``/.obsidian/``).
            A file is skipped silently when its path contains any of them. The
            path is compared in slash-normalized form so the patterns also
            match paths that use backslash separators.

    Returns:
        One ``(stub, [chunk])`` tuple per accepted note. Files without an id in
        the frontmatter, and files whose processing raises, are reported with a
        ``skip ...`` line on stdout and left out.
    """
    out: List[Tuple[dict, List[dict]]] = []
    for abs_path in glob.glob(os.path.join(vault, "**/*.md"), recursive=True):
        # Exclude patterns use "/" while glob yields OS-native separators;
        # normalize the same way make_note_stub does before matching.
        match_path = abs_path.replace("\\", "/")
        if any(ex in match_path for ex in excludes):
            continue
        try:
            parsed = read_markdown(abs_path)
            fm, body, p = _coerce_parsed(parsed)

            # If read_markdown does not provide a path, build one relative to the vault.
            # NOTE(review): a parser-provided path is used as-is and may be absolute — confirm.
            rel = p if p else os.path.relpath(abs_path, vault)

            stub = make_note_stub(path=rel, fm=fm, body=body)
            if not stub.get("note_id"):
                # Without a stable id, edges cannot be referenced reliably.
                print(f"skip {rel}: missing note_id in frontmatter")
                continue

            chunk = {
                "chunk_id": f"{stub['note_id']}#1",
                "note_id": stub["note_id"],
                "text": body,
            }
            out.append((stub, [chunk]))
        except Exception as e:
            # Deliberate best-effort: one unreadable note must not abort the whole scan.
            print(f"skip {abs_path}: {e}")
    return out


def main():
    """
    CLI entry point: derive wikilink edges from a vault and upsert them into Qdrant.

    Steps:
      1. Parse ``--vault`` (required) and ``--exclude`` (substring filters).
      2. Resolve the edge helpers ``build_note_index`` and
         ``derive_wikilink_edges`` before any other work is done.
      3. Create the Qdrant client from the environment and call
         ``ensure_collections``.
      4. Collect note stubs, build the lookup index, derive the edges and
         upsert them.
      5. Print a JSON summary (``notes_scanned``, ``edges_upserted``) to stdout.

    Raises:
        SystemExit: with an explanatory message when the edge helpers cannot be
            imported. The module does not import them at file level, so without
            this check the run would end in a NameError only after the database
            setup and the full vault scan.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--vault", required=True)
    ap.add_argument("--exclude", nargs="*", default=["/.obsidian/", "/_backup_frontmatter/"])
    args = ap.parse_args()

    # Fail fast: both helpers are required further down and are not imported at module level.
    try:
        from app.core.graph.graph_derive_edges import build_note_index, derive_wikilink_edges
    except ImportError as exc:
        raise SystemExit(
            "backfill_edges: build_note_index / derive_wikilink_edges could not be imported "
            f"from app.core.graph.graph_derive_edges ({exc}); nothing was written."
        ) from exc

    cfg = QdrantConfig.from_env()
    client = get_client(cfg)
    ensure_collections(client, cfg.prefix, cfg.dim)

    # 1) Collect notes (stubs) plus one chunk per note for the link scan
    note_tuples = iter_notes(args.vault, args.exclude)
    note_payloads = [n for n, _ in note_tuples]

    # 2) Index used to resolve link targets
    idx = build_note_index(note_payloads)

    # 3) Derive edges
    all_edges = []
    for note_stub, chunks in note_tuples:
        edges = derive_wikilink_edges(note_stub, chunks, idx)
        all_edges.extend(edges)

    # 4) Upsert
    edges_col, edge_pts = points_for_edges(cfg.prefix, all_edges)
    upsert_batch(client, edges_col, edge_pts)

    print(json.dumps({"notes_scanned": len(note_tuples), "edges_upserted": len(edge_pts)}, ensure_ascii=False))


# Run the backfill only when this file is executed as a script
# (e.g. ``python3 -m scripts.backfill_edges``), not when it is imported.
if __name__ == "__main__":
    main()