scripts/import_markdown.py aktualisiert

2025-09-03 07:58:57 +02:00 · 2025-09-03 07:58:57 +02:00 · 5254eb22ed
commit 5254eb22ed
parent 309462dfa9
1 changed files with 94 additions and 142 deletions
--- a/scripts/import_markdown.py
+++ b/scripts/import_markdown.py
@@ -1,157 +1,109 @@
-
 #!/usr/bin/env python3
 from __future__ import annotations
-import argparse, hashlib, os, re, sys, time
-from typing import List, Tuple, Dict, Any
-import requests
-import frontmatter
-from tqdm import tqdm
+import argparse, os, glob, json, sys
+from dotenv import load_dotenv
+from qdrant_client import QdrantClient

-WIKI_LINK_RE = re.compile(r"\[\[([^\]]+)\]\]")
-MD_LINK_RE = re.compile(r"\[([^\]]+)\]\(([^)]+)\)")
+from app.core.parser import read_markdown, normalize_frontmatter, validate_required_frontmatter
+from app.core.note_payload import make_note_payload
+from app.core.validate_note import validate_note_payload
+from app.core.chunker import assemble_chunks
+from app.core.chunk_payload import make_chunk_payloads
+from app.core.embed import embed_texts, embed_one
+from app.core.qdrant import QdrantConfig, ensure_collections, get_client
+from app.core.qdrant_points import points_for_chunks, points_for_note, points_for_edges, upsert_batch
+from app.core.edges import deriv_edges_for_note

-def stable_id(s: str) -> str:
-    return hashlib.sha1(s.encode("utf-8")).hexdigest()
-
-def chunk_text(text: str, max_chars: int = 1200, overlap: int = 200) -> List[str]:
-    # Prefer splitting on headings and paragraphs, then length bucket
-    parts: List[str] = []
-    # Split by headings while keeping content
-    blocks = re.split(r"(?m)^#{1,6}\s.*$", text)
-    for block in blocks:
-        block = block.strip()
-        if not block:
+def iter_md(root: str, exclude=(" /.obsidian/ ", " /_backup_frontmatter/ ", " /_imported/ ")):  # list *.md files under root, skipping paths that contain an exclude fragment
+    files = glob.glob(os.path.join(root, "**", "*.md"), recursive=True)  # "**" with recursive=True descends into subdirectories
+    out = []
+    for p in files:
+        pn = p.replace("\\","/")  # normalise Windows separators so the "/dir/" fragments can match
+        if any(frag and frag in pn for frag in map(str.strip, exclude)):  # use the caller's exclude list (was hard-coded); strip() removes the padding in the defaults, blank fragments are ignored
            continue
-        # further split by double-newline paragraphs
-        paras = [p.strip() for p in block.split("\n\n") if p.strip()]
-        buf = ""
-        for p in paras:
-            if len(buf) + len(p) + 2 <= max_chars:
-                buf = f"{buf}\n\n{p}" if buf else p
-            else:
-                if buf:
-                    parts.append(buf)
-                # if paragraph itself is very long, hard-split
-                while len(p) > max_chars:
-                    parts.append(p[:max_chars])
-                    p = p[max_chars - overlap:]
-                buf = p
-        if buf:
-            parts.append(buf)
-            buf = ""
-    # merge with overlap
-    merged: List[str] = []
-    for i, part in enumerate(parts):
-        if not merged:
-            merged.append(part)
-        else:
-            prev = merged[-1]
-            if len(prev) + len(part) <= max_chars:
-                merged[-1] = prev + "\n\n" + part
-            else:
-                # add overlap from prev end to next start
-                tail = prev[-overlap:] if len(prev) > overlap else prev
-                merged.append(tail + "\n\n" + part)
-    return merged if merged else [text[:max_chars]]
-
-def extract_links(md_text: str) -> List[str]:
-    links = []
-    links += WIKI_LINK_RE.findall(md_text)
-    links += [m[1] for m in MD_LINK_RE.findall(md_text)]
-    return list(dict.fromkeys(links))  # de-dup, preserve order
-
-def import_note(path: str, vault_root: str, api_base: str, prefix: str, dry_run: bool = False) -> Dict[str, Any]:
-    rel_path = os.path.relpath(path, vault_root).replace("\\", "/")
-    with open(path, "r", encoding="utf-8") as f:
-        post = frontmatter.load(f)
-    content = post.content
-    meta = post.metadata or {}
-    title = meta.get("title") or os.path.splitext(os.path.basename(path))[0]
-    note_id = meta.get("note_id") or stable_id(rel_path)
-
-    # Upsert note
-    note_payload = {
-        "note_id": note_id,
-        "title": title,
-        "path": rel_path,
-        "Typ": meta.get("Typ"),
-        "Status": meta.get("Status"),
-        "tags": meta.get("tags"),
-        "Rolle": meta.get("Rolle"),
-        "text": content,
-    }
-    if not dry_run:
-        r = requests.post(f"{api_base}/qdrant/upsert_note", json=note_payload, timeout=60)
-        r.raise_for_status()
-
-    # Chunks
-    chunks = chunk_text(content)
-    results = {"chunks": 0, "edges": 0}
-    for idx, chunk in enumerate(chunks, start=1):
-        chunk_id = stable_id(f"{rel_path}#chunk-{idx}")
-        links = extract_links(chunk)
-        chunk_payload = {
-            "chunk_id": chunk_id,
-            "note_id": note_id,
-            "title": title,
-            "path": rel_path,
-            "Typ": meta.get("Typ"),
-            "Status": meta.get("Status"),
-            "tags": meta.get("tags"),
-            "Rolle": meta.get("Rolle"),
-            "text": chunk,
-            "links": links,
-        }
-        if not dry_run:
-            r = requests.post(f"{api_base}/qdrant/upsert_chunk", json=chunk_payload, timeout=60)
-            r.raise_for_status()
-        results["chunks"] += 1
-
-        # Edges from links
-        for link in links:
-            edge_payload = {
-                "src_note_id": note_id,
-                "dst_note_id": stable_id(link),  # naive target id from link text or URL
-                "src_chunk_id": chunk_id,
-                "dst_chunk_id": None,
-                "relation": "links_to",
-                "link_text": link,
-            }
-            if not dry_run:
-                r = requests.post(f"{api_base}/qdrant/upsert_edge", json=edge_payload, timeout=60)
-                r.raise_for_status()
-            results["edges"] += 1
-
-    return {"note_id": note_id, "title": title, "path": rel_path, **results}
+        out.append(p)
+    return out

 def main():
+    load_dotenv()
    ap = argparse.ArgumentParser()
-    ap.add_argument("--vault", required=True, help="Path to Obsidian vault root")
-    ap.add_argument("--api-base", default="http://127.0.0.1:8001", help="mindnet API base URL")
-    ap.add_argument("--prefix", default="mindnet", help="Collection prefix (kept for future use)")
-    ap.add_argument("--dry-run", action="store_true", help="Parse and show, but do not upsert")
+    ap.add_argument("--vault", required=True, help="Obsidian Vault Pfad (z.B. mindnet/vault)")
+    ap.add_argument("--apply", action="store_true", help="Schreibt in Qdrant (sonst Dry-Run)")
+    ap.add_argument("--note-id", help="Nur eine Note-ID verarbeiten")
+    ap.add_argument("--embed-note", action="store_true", help="Auch Note-Volltext einbetten (optional)")
    args = ap.parse_args()

-    md_files = []
-    for root, _, files in os.walk(args.vault):
-        for fn in files:
-            if fn.lower().endswith(".md"):
-                md_files.append(os.path.join(root, fn))
+    # Qdrant settings: read from the environment (load_dotenv() ran above), falling back to local defaults
+    cfg = QdrantConfig(
+        url=os.getenv("QDRANT_URL", "http://127.0.0.1:6333"),
+        api_key=os.getenv("QDRANT_API_KEY") or None,  # an unset or empty variable becomes None
+        prefix=os.getenv("COLLECTION_PREFIX", "mindnet"),
+        dim=int(os.getenv("VECTOR_DIM","384")),  # raises ValueError if VECTOR_DIM is not an integer
+    )
+    client = get_client(cfg)
+    ensure_collections(cfg)  # NOTE(review): runs even without --apply; confirm it does not write to Qdrant during a dry run

-    if not md_files:
-        print("No .md files found.")
-        return 0
+    root = os.path.abspath(args.vault)
+    files = iter_md(root)
+    if not files:
+        print("Keine Markdown-Dateien gefunden.", file=sys.stderr); sys.exit(2)

-    print(f"Found {len(md_files)} markdown files.")
-    stats = {"notes": 0, "chunks": 0, "edges": 0}
-    for path in tqdm(md_files, desc="Importing"):
-        res = import_note(path, args.vault, args.api_base, args.prefix, args.dry_run)
-        stats["notes"] += 1
-        stats["chunks"] += res["chunks"]
-        stats["edges"] += res["edges"]
+    total_notes = 0  # counts only notes that pass validation and the optional --note-id filter
+    for path in files:
+        parsed = read_markdown(path)
+        fm = normalize_frontmatter(parsed.frontmatter)
+        try:
+            validate_required_frontmatter(fm)
+        except Exception as e:  # invalid frontmatter is still skipped, but no longer silently
+            print(f"Skipping {path}: invalid frontmatter ({e})", file=sys.stderr); continue
+        if args.note_id and fm.get("id") != args.note_id:  # --note-id restricts the run to a single note
+            continue

-    print(f"Done. Notes: {stats['notes']}  Chunks: {stats['chunks']}  Edges: {stats['edges']}")
-    return 0
+        total_notes += 1
+        # Note payload
+        note_pl = make_note_payload(parsed, vault_root=root)
+        validate_note_payload(note_pl)
+
+        # Chunks; notes without a "type" in their frontmatter are chunked as "concept"
+        chunks = assemble_chunks(fm["id"], parsed.body, fm.get("type", "concept"))
+        chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks)
+
+        # Embeddings, one text per chunk
+        texts = [ch.text for ch in chunks]
+        vectors = embed_texts(texts)
+
+        # Optional: vector for the whole note body (only with --embed-note)
+        note_vec = None
+        if args.embed_note:
+            note_vec = embed_one(parsed.body)
+
+        # Edges
+        edges = deriv_edges_for_note(fm, chunk_pls)
+
+        # Per-note summary as one JSON line on stdout (printed in dry-run and --apply mode alike)
+        summary = {
+            "note_id": fm["id"],
+            "title": fm["title"],
+            "chunks": len(chunk_pls),
+            "edges": len(edges),
+            "path": note_pl["path"]
+        }
+        print(json.dumps(summary, ensure_ascii=False))
+
+        if args.apply:
+            # Upsert the note
+            notes_col, note_pts = points_for_note(cfg.prefix, note_pl, note_vec)
+            upsert_batch(client, notes_col, note_pts)
+
+            # Upsert the chunks
+            chunks_col, chunk_pts = points_for_chunks(cfg.prefix, chunk_pls, vectors)
+            upsert_batch(client, chunks_col, chunk_pts)
+
+            # Upsert the edges
+            edges_col, edge_pts = points_for_edges(cfg.prefix, edges)
+            upsert_batch(client, edges_col, edge_pts)
+
+    print(f"Done. Processed notes: {total_notes}")

 if __name__ == "__main__":
-    raise SystemExit(main())
+    main()