#!/usr/bin/env python3 from __future__ import annotations import argparse, hashlib, os, re, sys, time from typing import List, Tuple, Dict, Any import requests import frontmatter from tqdm import tqdm WIKI_LINK_RE = re.compile(r"\[\[([^\]]+)\]\]") MD_LINK_RE = re.compile(r"\[([^\]]+)\]\(([^)]+)\)") def stable_id(s: str) -> str: return hashlib.sha1(s.encode("utf-8")).hexdigest() def chunk_text(text: str, max_chars: int = 1200, overlap: int = 200) -> List[str]: # Prefer splitting on headings and paragraphs, then length bucket parts: List[str] = [] # Split by headings while keeping content blocks = re.split(r"(?m)^#{1,6}\s.*$", text) for block in blocks: block = block.strip() if not block: continue # further split by double-newline paragraphs paras = [p.strip() for p in block.split("\n\n") if p.strip()] buf = "" for p in paras: if len(buf) + len(p) + 2 <= max_chars: buf = f"{buf}\n\n{p}" if buf else p else: if buf: parts.append(buf) # if paragraph itself is very long, hard-split while len(p) > max_chars: parts.append(p[:max_chars]) p = p[max_chars - overlap:] buf = p if buf: parts.append(buf) buf = "" # merge with overlap merged: List[str] = [] for i, part in enumerate(parts): if not merged: merged.append(part) else: prev = merged[-1] if len(prev) + len(part) <= max_chars: merged[-1] = prev + "\n\n" + part else: # add overlap from prev end to next start tail = prev[-overlap:] if len(prev) > overlap else prev merged.append(tail + "\n\n" + part) return merged if merged else [text[:max_chars]] def extract_links(md_text: str) -> List[str]: links = [] links += WIKI_LINK_RE.findall(md_text) links += [m[1] for m in MD_LINK_RE.findall(md_text)] return list(dict.fromkeys(links)) # de-dup, preserve order def import_note(path: str, vault_root: str, api_base: str, prefix: str, dry_run: bool = False) -> Dict[str, Any]: rel_path = os.path.relpath(path, vault_root).replace("\\", "/") with open(path, "r", encoding="utf-8") as f: post = frontmatter.load(f) content = post.content meta = post.metadata or {} title = meta.get("title") or os.path.splitext(os.path.basename(path))[0] note_id = meta.get("note_id") or stable_id(rel_path) # Upsert note note_payload = { "note_id": note_id, "title": title, "path": rel_path, "Typ": meta.get("Typ"), "Status": meta.get("Status"), "tags": meta.get("tags"), "Rolle": meta.get("Rolle"), "text": content, } if not dry_run: r = requests.post(f"{api_base}/qdrant/upsert_note", json=note_payload, timeout=60) r.raise_for_status() # Chunks chunks = chunk_text(content) results = {"chunks": 0, "edges": 0} for idx, chunk in enumerate(chunks, start=1): chunk_id = stable_id(f"{rel_path}#chunk-{idx}") links = extract_links(chunk) chunk_payload = { "chunk_id": chunk_id, "note_id": note_id, "title": title, "path": rel_path, "Typ": meta.get("Typ"), "Status": meta.get("Status"), "tags": meta.get("tags"), "Rolle": meta.get("Rolle"), "text": chunk, "links": links, } if not dry_run: r = requests.post(f"{api_base}/qdrant/upsert_chunk", json=chunk_payload, timeout=60) r.raise_for_status() results["chunks"] += 1 # Edges from links for link in links: edge_payload = { "src_note_id": note_id, "dst_note_id": stable_id(link), # naive target id from link text or URL "src_chunk_id": chunk_id, "dst_chunk_id": None, "relation": "links_to", "link_text": link, } if not dry_run: r = requests.post(f"{api_base}/qdrant/upsert_edge", json=edge_payload, timeout=60) r.raise_for_status() results["edges"] += 1 return {"note_id": note_id, "title": title, "path": rel_path, **results} def main(): ap = argparse.ArgumentParser() ap.add_argument("--vault", required=True, help="Path to Obsidian vault root") ap.add_argument("--api-base", default="http://127.0.0.1:8001", help="mindnet API base URL") ap.add_argument("--prefix", default="mindnet", help="Collection prefix (kept for future use)") ap.add_argument("--dry-run", action="store_true", help="Parse and show, but do not upsert") args = ap.parse_args() md_files = [] for root, _, files in os.walk(args.vault): for fn in files: if fn.lower().endswith(".md"): md_files.append(os.path.join(root, fn)) if not md_files: print("No .md files found.") return 0 print(f"Found {len(md_files)} markdown files.") stats = {"notes": 0, "chunks": 0, "edges": 0} for path in tqdm(md_files, desc="Importing"): res = import_note(path, args.vault, args.api_base, args.prefix, args.dry_run) stats["notes"] += 1 stats["chunks"] += res["chunks"] stats["edges"] += res["edges"] print(f"Done. Notes: {stats['notes']} Chunks: {stats['chunks']} Edges: {stats['edges']}") return 0 if __name__ == "__main__": raise SystemExit(main())