From f248ba962bf6279bb727f33350039f4f26c84b59 Mon Sep 17 00:00:00 2001 From: Lars Date: Mon, 1 Sep 2025 17:37:39 +0200 Subject: [PATCH] =?UTF-8?q?scripts/import=5Fmarkdown.py=20hinzugef=C3=BCgt?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- scripts/import_markdown.py | 157 +++++++++++++++++++++++++++++++++++++ 1 file changed, 157 insertions(+) create mode 100644 scripts/import_markdown.py diff --git a/scripts/import_markdown.py b/scripts/import_markdown.py new file mode 100644 index 0000000..b332913 --- /dev/null +++ b/scripts/import_markdown.py @@ -0,0 +1,157 @@ + +#!/usr/bin/env python3 +from __future__ import annotations +import argparse, hashlib, os, re, sys, time +from typing import List, Tuple, Dict, Any +import requests +import frontmatter +from tqdm import tqdm + +WIKI_LINK_RE = re.compile(r"\[\[([^\]]+)\]\]") +MD_LINK_RE = re.compile(r"\[([^\]]+)\]\(([^)]+)\)") + +def stable_id(s: str) -> str: + return hashlib.sha1(s.encode("utf-8")).hexdigest() + +def chunk_text(text: str, max_chars: int = 1200, overlap: int = 200) -> List[str]: + # Prefer splitting on headings and paragraphs, then length bucket + parts: List[str] = [] + # Split by headings while keeping content + blocks = re.split(r"(?m)^#{1,6}\s.*$", text) + for block in blocks: + block = block.strip() + if not block: + continue + # further split by double-newline paragraphs + paras = [p.strip() for p in block.split("\n\n") if p.strip()] + buf = "" + for p in paras: + if len(buf) + len(p) + 2 <= max_chars: + buf = f"{buf}\n\n{p}" if buf else p + else: + if buf: + parts.append(buf) + # if paragraph itself is very long, hard-split + while len(p) > max_chars: + parts.append(p[:max_chars]) + p = p[max_chars - overlap:] + buf = p + if buf: + parts.append(buf) + buf = "" + # merge with overlap + merged: List[str] = [] + for i, part in enumerate(parts): + if not merged: + merged.append(part) + else: + prev = merged[-1] + if len(prev) + len(part) <= max_chars: + merged[-1] = prev + "\n\n" + part + else: + # add overlap from prev end to next start + tail = prev[-overlap:] if len(prev) > overlap else prev + merged.append(tail + "\n\n" + part) + return merged if merged else [text[:max_chars]] + +def extract_links(md_text: str) -> List[str]: + links = [] + links += WIKI_LINK_RE.findall(md_text) + links += [m[1] for m in MD_LINK_RE.findall(md_text)] + return list(dict.fromkeys(links)) # de-dup, preserve order + +def import_note(path: str, vault_root: str, api_base: str, prefix: str, dry_run: bool = False) -> Dict[str, Any]: + rel_path = os.path.relpath(path, vault_root).replace("\\", "/") + with open(path, "r", encoding="utf-8") as f: + post = frontmatter.load(f) + content = post.content + meta = post.metadata or {} + title = meta.get("title") or os.path.splitext(os.path.basename(path))[0] + note_id = meta.get("note_id") or stable_id(rel_path) + + # Upsert note + note_payload = { + "note_id": note_id, + "title": title, + "path": rel_path, + "Typ": meta.get("Typ"), + "Status": meta.get("Status"), + "tags": meta.get("tags"), + "Rolle": meta.get("Rolle"), + "text": content, + } + if not dry_run: + r = requests.post(f"{api_base}/qdrant/upsert_note", json=note_payload, timeout=60) + r.raise_for_status() + + # Chunks + chunks = chunk_text(content) + results = {"chunks": 0, "edges": 0} + for idx, chunk in enumerate(chunks, start=1): + chunk_id = stable_id(f"{rel_path}#chunk-{idx}") + links = extract_links(chunk) + chunk_payload = { + "chunk_id": chunk_id, + "note_id": note_id, + "title": title, + "path": rel_path, + "Typ": meta.get("Typ"), + "Status": meta.get("Status"), + "tags": meta.get("tags"), + "Rolle": meta.get("Rolle"), + "text": chunk, + "links": links, + } + if not dry_run: + r = requests.post(f"{api_base}/qdrant/upsert_chunk", json=chunk_payload, timeout=60) + r.raise_for_status() + results["chunks"] += 1 + + # Edges from links + for link in links: + edge_payload = { + "src_note_id": note_id, + "dst_note_id": stable_id(link), # naive target id from link text or URL + "src_chunk_id": chunk_id, + "dst_chunk_id": None, + "relation": "links_to", + "link_text": link, + } + if not dry_run: + r = requests.post(f"{api_base}/qdrant/upsert_edge", json=edge_payload, timeout=60) + r.raise_for_status() + results["edges"] += 1 + + return {"note_id": note_id, "title": title, "path": rel_path, **results} + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--vault", required=True, help="Path to Obsidian vault root") + ap.add_argument("--api-base", default="http://127.0.0.1:8001", help="mindnet API base URL") + ap.add_argument("--prefix", default="mindnet", help="Collection prefix (kept for future use)") + ap.add_argument("--dry-run", action="store_true", help="Parse and show, but do not upsert") + args = ap.parse_args() + + md_files = [] + for root, _, files in os.walk(args.vault): + for fn in files: + if fn.lower().endswith(".md"): + md_files.append(os.path.join(root, fn)) + + if not md_files: + print("No .md files found.") + return 0 + + print(f"Found {len(md_files)} markdown files.") + stats = {"notes": 0, "chunks": 0, "edges": 0} + for path in tqdm(md_files, desc="Importing"): + res = import_note(path, args.vault, args.api_base, args.prefix, args.dry_run) + stats["notes"] += 1 + stats["chunks"] += res["chunks"] + stats["edges"] += res["edges"] + + print(f"Done. Notes: {stats['notes']} Chunks: {stats['chunks']} Edges: {stats['edges']}") + return 0 + +if __name__ == "__main__": + raise SystemExit(main())