scripts/import_markdown.py hinzugefügt

2025-09-01 17:37:39 +02:00 · 2025-09-01 17:37:39 +02:00 · f248ba962b
commit f248ba962b
parent 1c484e1ca0
1 changed files with 157 additions and 0 deletions
--- a/scripts/import_markdown.py
+++ b/scripts/import_markdown.py
@@ -0,0 +1,157 @@
+
+#!/usr/bin/env python3
+from __future__ import annotations
+import argparse, hashlib, os, re, sys, time
+from typing import List, Tuple, Dict, Any
+import requests
+import frontmatter
+from tqdm import tqdm
+
+# Wiki-style links: captures everything between "[[" and "]]" as one group.
+WIKI_LINK_RE = re.compile(r"\[\[([^\]]+)\]\]")
+# Inline Markdown links "[text](target)": group 1 is the text, group 2 the target.
+MD_LINK_RE = re.compile(r"\[([^\]]+)\]\(([^)]+)\)")
+
+def stable_id(s: str) -> str:
+    """Return the hexadecimal SHA-1 digest of *s* encoded as UTF-8.
+
+    The same input string always yields the same 40-character identifier.
+    """
+    digest = hashlib.sha1()
+    digest.update(s.encode("utf-8"))
+    return digest.hexdigest()
+
+def chunk_text(text: str, max_chars: int = 1200, overlap: int = 200) -> List[str]:
+    """Split Markdown *text* into chunks of roughly *max_chars* characters.
+
+    The text is cut at lines that start with one to six ``#`` characters
+    followed by whitespace. The heading lines themselves are consumed by the
+    split and do not appear in any chunk. Each section is then packed
+    paragraph by paragraph (paragraphs are separated by a blank line) into
+    buffers of at most *max_chars* characters; a paragraph longer than
+    *max_chars* is hard-split into windows that overlap by *overlap*
+    characters.
+
+    In a final pass, neighbouring pieces are joined with a blank line while
+    their combined length (separator not counted) fits into *max_chars*.
+    Otherwise the last *overlap* characters of the previous chunk are
+    prepended to the next one, so returned chunks may be longer than
+    *max_chars*.
+
+    Args:
+        text: Markdown body to split.
+        max_chars: Target chunk size in characters; must be at least 1.
+        overlap: Number of characters shared between neighbouring chunks.
+            Negative values are treated as 0. If it is not smaller than
+            *max_chars*, over-long paragraphs are hard-split without overlap.
+
+    Returns:
+        The list of chunks. If the text yields no non-empty paragraph, a
+        single-element list holding the first *max_chars* characters of *text*.
+
+    Raises:
+        ValueError: If *max_chars* is smaller than 1.
+    """
+    if max_chars < 1:
+        raise ValueError("max_chars must be a positive integer")
+    # A negative overlap would make the hard split skip characters.
+    overlap = max(overlap, 0)
+    # The hard-split window must advance by at least one character per step,
+    # otherwise the loop below never terminates.
+    step = max_chars - overlap if overlap < max_chars else max_chars
+
+    parts: List[str] = []
+    # Split at heading lines; re.split drops the matched heading text.
+    for block in re.split(r"(?m)^#{1,6}\s.*$", text):
+        block = block.strip()
+        if not block:
+            continue
+        # Further split by double-newline paragraphs.
+        paras = [p.strip() for p in block.split("\n\n") if p.strip()]
+        buf = ""
+        for p in paras:
+            if len(buf) + len(p) + 2 <= max_chars:
+                buf = f"{buf}\n\n{p}" if buf else p
+                continue
+            if buf:
+                parts.append(buf)
+            # If the paragraph itself is too long, hard-split it.
+            while len(p) > max_chars:
+                parts.append(p[:max_chars])
+                p = p[step:]
+            buf = p
+        if buf:
+            parts.append(buf)
+
+    # Merge small neighbours; otherwise carry an overlap tail forward.
+    merged: List[str] = []
+    for part in parts:
+        if merged and len(merged[-1]) + len(part) <= max_chars:
+            merged[-1] = merged[-1] + "\n\n" + part
+        elif merged and overlap:
+            # overlap > 0 here: prev[-0:] would be the whole previous chunk.
+            merged.append(merged[-1][-overlap:] + "\n\n" + part)
+        else:
+            merged.append(part)
+    return merged if merged else [text[:max_chars]]
+
+def extract_links(md_text: str) -> List[str]:
+    """Collect link targets found in *md_text*, without duplicates.
+
+    Wiki-link contents (everything between ``[[`` and ``]]``) come first,
+    followed by the targets of inline ``[text](target)`` links. Each distinct
+    string is returned once, at the position of its first occurrence.
+    """
+    seen: Dict[str, None] = {}
+    for target in WIKI_LINK_RE.findall(md_text):
+        seen.setdefault(target, None)
+    for _label, target in MD_LINK_RE.findall(md_text):
+        seen.setdefault(target, None)
+    # Dict keys keep insertion order, which gives ordered de-duplication.
+    return list(seen)
+
+def import_note(path: str, vault_root: str, api_base: str, prefix: str, dry_run: bool = False) -> Dict[str, Any]:
+    """Import one Markdown note: upsert the note, its chunks and its link edges.
+
+    The file is parsed with ``frontmatter``; the title falls back to the file
+    name without extension and the note id to ``stable_id`` of the path
+    relative to *vault_root* (with forward slashes). The note, every chunk
+    produced by ``chunk_text`` and one edge per link found in a chunk are
+    POSTed to ``{api_base}/qdrant/upsert_note``, ``.../upsert_chunk`` and
+    ``.../upsert_edge`` respectively.
+
+    Args:
+        path: Path of the Markdown file to import.
+        vault_root: Root directory used to compute the relative note path.
+        api_base: Base URL of the API receiving the upserts.
+        prefix: Not used by this function.
+        dry_run: If true, nothing is sent; parsing and counting still happen.
+
+    Returns:
+        A dict with ``note_id``, ``title``, ``path`` and the number of
+        ``chunks`` and ``edges`` processed.
+
+    Raises:
+        Whatever ``requests.post`` or ``Response.raise_for_status`` raise;
+        the first failing request aborts the import of this note.
+    """
+
+    def _post(endpoint: str, payload: Dict[str, Any]) -> None:
+        # Single place for the dry-run switch and the HTTP error check.
+        if dry_run:
+            return
+        resp = requests.post(f"{api_base}/qdrant/{endpoint}", json=payload, timeout=60)
+        resp.raise_for_status()
+
+    rel_path = os.path.relpath(path, vault_root).replace("\\", "/")
+    with open(path, "r", encoding="utf-8") as fh:
+        post = frontmatter.load(fh)
+    body = post.content
+    meta = post.metadata or {}
+    title = meta.get("title") or os.path.splitext(os.path.basename(path))[0]
+    note_id = meta.get("note_id") or stable_id(rel_path)
+
+    # Fields that the note payload and every chunk payload have in common.
+    shared = {
+        "title": title,
+        "path": rel_path,
+        "Typ": meta.get("Typ"),
+        "Status": meta.get("Status"),
+        "tags": meta.get("tags"),
+        "Rolle": meta.get("Rolle"),
+    }
+
+    # Upsert the note itself with the full text.
+    _post("upsert_note", {"note_id": note_id, **shared, "text": body})
+
+    chunk_count = 0
+    edge_count = 0
+    for number, piece in enumerate(chunk_text(body), start=1):
+        chunk_id = stable_id(f"{rel_path}#chunk-{number}")
+        links = extract_links(piece)
+        _post(
+            "upsert_chunk",
+            {"chunk_id": chunk_id, "note_id": note_id, **shared, "text": piece, "links": links},
+        )
+        chunk_count += 1
+
+        # One edge per link found in this chunk.
+        for link in links:
+            _post(
+                "upsert_edge",
+                {
+                    "src_note_id": note_id,
+                    "dst_note_id": stable_id(link),  # naive target id from link text or URL
+                    "src_chunk_id": chunk_id,
+                    "dst_chunk_id": None,
+                    "relation": "links_to",
+                    "link_text": link,
+                },
+            )
+            edge_count += 1
+
+    return {
+        "note_id": note_id,
+        "title": title,
+        "path": rel_path,
+        "chunks": chunk_count,
+        "edges": edge_count,
+    }
+
+def main():
+    """Command-line entry point: import every ``.md`` file below ``--vault``.
+
+    Parses the command line, collects all files whose name ends in ``.md``
+    (case-insensitive) below the vault directory, imports them one by one via
+    ``import_note`` and prints the accumulated note/chunk/edge counts.
+
+    Returns:
+        0 on completion (also when no Markdown file was found).
+
+    Raises:
+        SystemExit: Raised by argparse on a usage error, including a
+            ``--vault`` path that is not an existing directory.
+    """
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--vault", required=True, help="Path to Obsidian vault root")
+    ap.add_argument("--api-base", default="http://127.0.0.1:8001", help="mindnet API base URL")
+    ap.add_argument("--prefix", default="mindnet", help="Collection prefix (kept for future use)")
+    ap.add_argument("--dry-run", action="store_true", help="Parse and show, but do not upsert")
+    args = ap.parse_args()
+
+    # os.walk silently yields nothing for a missing path, which would make a
+    # mistyped --vault look like an empty but successful import.
+    if not os.path.isdir(args.vault):
+        ap.error(f"vault path is not a directory: {args.vault}")
+
+    # Sorted so that repeated runs process the notes in the same order.
+    md_files = sorted(
+        os.path.join(root, fn)
+        for root, _, files in os.walk(args.vault)
+        for fn in files
+        if fn.lower().endswith(".md")
+    )
+
+    if not md_files:
+        print("No .md files found.")
+        return 0
+
+    print(f"Found {len(md_files)} markdown files.")
+    stats = {"notes": 0, "chunks": 0, "edges": 0}
+    for path in tqdm(md_files, desc="Importing"):
+        res = import_note(path, args.vault, args.api_base, args.prefix, args.dry_run)
+        stats["notes"] += 1
+        stats["chunks"] += res["chunks"]
+        stats["edges"] += res["edges"]
+
+    print(f"Done. Notes: {stats['notes']}  Chunks: {stats['chunks']}  Edges: {stats['edges']}")
+    return 0
+
+# Run the importer only when executed as a script; main()'s return value
+# is passed to SystemExit and thereby becomes the process exit status.
+if __name__ == "__main__":
+    raise SystemExit(main())