diff --git a/scripts/import_markdown.py b/scripts/import_markdown.py index b332913..0bab8f9 100644 --- a/scripts/import_markdown.py +++ b/scripts/import_markdown.py @@ -1,157 +1,109 @@ - #!/usr/bin/env python3 from __future__ import annotations -import argparse, hashlib, os, re, sys, time -from typing import List, Tuple, Dict, Any -import requests -import frontmatter -from tqdm import tqdm +import argparse, os, glob, json, sys +from dotenv import load_dotenv +from qdrant_client import QdrantClient -WIKI_LINK_RE = re.compile(r"\[\[([^\]]+)\]\]") -MD_LINK_RE = re.compile(r"\[([^\]]+)\]\(([^)]+)\)") +from app.core.parser import read_markdown, normalize_frontmatter, validate_required_frontmatter +from app.core.note_payload import make_note_payload +from app.core.validate_note import validate_note_payload +from app.core.chunker import assemble_chunks +from app.core.chunk_payload import make_chunk_payloads +from app.core.embed import embed_texts, embed_one +from app.core.qdrant import QdrantConfig, ensure_collections, get_client +from app.core.qdrant_points import points_for_chunks, points_for_note, points_for_edges, upsert_batch +from app.core.edges import deriv_edges_for_note -def stable_id(s: str) -> str: - return hashlib.sha1(s.encode("utf-8")).hexdigest() - -def chunk_text(text: str, max_chars: int = 1200, overlap: int = 200) -> List[str]: - # Prefer splitting on headings and paragraphs, then length bucket - parts: List[str] = [] - # Split by headings while keeping content - blocks = re.split(r"(?m)^#{1,6}\s.*$", text) - for block in blocks: - block = block.strip() - if not block: +def iter_md(root: str, exclude=(" /.obsidian/ ", " /_backup_frontmatter/ ", " /_imported/ ")): + files = [p for p in glob.glob(os.path.join(root, "**", "*.md"), recursive=True)] + out = [] + for p in files: + pn = p.replace("\\","/") + if any(ex.strip() in pn for ex in ["/.obsidian/", "/_backup_frontmatter/", "/_imported/"]): continue - # further split by double-newline paragraphs - paras = [p.strip() for p in block.split("\n\n") if p.strip()] - buf = "" - for p in paras: - if len(buf) + len(p) + 2 <= max_chars: - buf = f"{buf}\n\n{p}" if buf else p - else: - if buf: - parts.append(buf) - # if paragraph itself is very long, hard-split - while len(p) > max_chars: - parts.append(p[:max_chars]) - p = p[max_chars - overlap:] - buf = p - if buf: - parts.append(buf) - buf = "" - # merge with overlap - merged: List[str] = [] - for i, part in enumerate(parts): - if not merged: - merged.append(part) - else: - prev = merged[-1] - if len(prev) + len(part) <= max_chars: - merged[-1] = prev + "\n\n" + part - else: - # add overlap from prev end to next start - tail = prev[-overlap:] if len(prev) > overlap else prev - merged.append(tail + "\n\n" + part) - return merged if merged else [text[:max_chars]] - -def extract_links(md_text: str) -> List[str]: - links = [] - links += WIKI_LINK_RE.findall(md_text) - links += [m[1] for m in MD_LINK_RE.findall(md_text)] - return list(dict.fromkeys(links)) # de-dup, preserve order - -def import_note(path: str, vault_root: str, api_base: str, prefix: str, dry_run: bool = False) -> Dict[str, Any]: - rel_path = os.path.relpath(path, vault_root).replace("\\", "/") - with open(path, "r", encoding="utf-8") as f: - post = frontmatter.load(f) - content = post.content - meta = post.metadata or {} - title = meta.get("title") or os.path.splitext(os.path.basename(path))[0] - note_id = meta.get("note_id") or stable_id(rel_path) - - # Upsert note - note_payload = { - "note_id": note_id, - "title": title, - "path": rel_path, - "Typ": meta.get("Typ"), - "Status": meta.get("Status"), - "tags": meta.get("tags"), - "Rolle": meta.get("Rolle"), - "text": content, - } - if not dry_run: - r = requests.post(f"{api_base}/qdrant/upsert_note", json=note_payload, timeout=60) - r.raise_for_status() - - # Chunks - chunks = chunk_text(content) - results = {"chunks": 0, "edges": 0} - for idx, chunk in enumerate(chunks, start=1): - chunk_id = stable_id(f"{rel_path}#chunk-{idx}") - links = extract_links(chunk) - chunk_payload = { - "chunk_id": chunk_id, - "note_id": note_id, - "title": title, - "path": rel_path, - "Typ": meta.get("Typ"), - "Status": meta.get("Status"), - "tags": meta.get("tags"), - "Rolle": meta.get("Rolle"), - "text": chunk, - "links": links, - } - if not dry_run: - r = requests.post(f"{api_base}/qdrant/upsert_chunk", json=chunk_payload, timeout=60) - r.raise_for_status() - results["chunks"] += 1 - - # Edges from links - for link in links: - edge_payload = { - "src_note_id": note_id, - "dst_note_id": stable_id(link), # naive target id from link text or URL - "src_chunk_id": chunk_id, - "dst_chunk_id": None, - "relation": "links_to", - "link_text": link, - } - if not dry_run: - r = requests.post(f"{api_base}/qdrant/upsert_edge", json=edge_payload, timeout=60) - r.raise_for_status() - results["edges"] += 1 - - return {"note_id": note_id, "title": title, "path": rel_path, **results} + out.append(p) + return out def main(): + load_dotenv() ap = argparse.ArgumentParser() - ap.add_argument("--vault", required=True, help="Path to Obsidian vault root") - ap.add_argument("--api-base", default="http://127.0.0.1:8001", help="mindnet API base URL") - ap.add_argument("--prefix", default="mindnet", help="Collection prefix (kept for future use)") - ap.add_argument("--dry-run", action="store_true", help="Parse and show, but do not upsert") + ap.add_argument("--vault", required=True, help="Obsidian Vault Pfad (z.B. mindnet/vault)") + ap.add_argument("--apply", action="store_true", help="Schreibt in Qdrant (sonst Dry-Run)") + ap.add_argument("--note-id", help="Nur eine Note-ID verarbeiten") + ap.add_argument("--embed-note", action="store_true", help="Auch Note-Volltext einbetten (optional)") args = ap.parse_args() - md_files = [] - for root, _, files in os.walk(args.vault): - for fn in files: - if fn.lower().endswith(".md"): - md_files.append(os.path.join(root, fn)) + # Qdrant + cfg = QdrantConfig( + url=os.getenv("QDRANT_URL", "http://127.0.0.1:6333"), + api_key=os.getenv("QDRANT_API_KEY") or None, + prefix=os.getenv("COLLECTION_PREFIX", "mindnet"), + dim=int(os.getenv("VECTOR_DIM","384")), + ) + client = get_client(cfg) + ensure_collections(cfg) - if not md_files: - print("No .md files found.") - return 0 + root = os.path.abspath(args.vault) + files = iter_md(root) + if not files: + print("Keine Markdown-Dateien gefunden.", file=sys.stderr); sys.exit(2) - print(f"Found {len(md_files)} markdown files.") - stats = {"notes": 0, "chunks": 0, "edges": 0} - for path in tqdm(md_files, desc="Importing"): - res = import_note(path, args.vault, args.api_base, args.prefix, args.dry_run) - stats["notes"] += 1 - stats["chunks"] += res["chunks"] - stats["edges"] += res["edges"] + total_notes = 0 + for path in files: + parsed = read_markdown(path) + fm = normalize_frontmatter(parsed.frontmatter) + try: + validate_required_frontmatter(fm) + except Exception: + continue + if args.note_id and fm.get("id") != args.note_id: + continue - print(f"Done. Notes: {stats['notes']} Chunks: {stats['chunks']} Edges: {stats['edges']}") - return 0 + total_notes += 1 + # Note-Payload + note_pl = make_note_payload(parsed, vault_root=root) + validate_note_payload(note_pl) + + # Chunks + chunks = assemble_chunks(fm["id"], parsed.body, fm.get("type", "concept")) + chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks) + + # Embeddings (Chunks) + texts = [c for c in (ch.text for ch in chunks)] + vectors = embed_texts(texts) + + # Optional: Note-Vektor + note_vec = None + if args.embed_note: + note_vec = embed_one(parsed.body) + + # Kanten + edges = deriv_edges_for_note(fm, chunk_pls) + + # Dry-Run-Ausgabe + summary = { + "note_id": fm["id"], + "title": fm["title"], + "chunks": len(chunk_pls), + "edges": len(edges), + "path": note_pl["path"] + } + print(json.dumps(summary, ensure_ascii=False)) + + if args.apply: + # Notes upsert + notes_col, note_pts = points_for_note(cfg.prefix, note_pl, note_vec) + upsert_batch(client, notes_col, note_pts) + + # Chunks upsert + chunks_col, chunk_pts = points_for_chunks(cfg.prefix, chunk_pls, vectors) + upsert_batch(client, chunks_col, chunk_pts) + + # Edges upsert + edges_col, edge_pts = points_for_edges(cfg.prefix, edges) + upsert_batch(client, edges_col, edge_pts) + + print(f"Done. Processed notes: {total_notes}") if __name__ == "__main__": - raise SystemExit(main()) + main()