scripts/import_markdown.py hinzugefügt
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 2s

This commit is contained in:
Lars 2025-09-01 17:37:39 +02:00
parent 1c484e1ca0
commit f248ba962b

157
scripts/import_markdown.py Normal file
View File

@ -0,0 +1,157 @@
#!/usr/bin/env python3
from __future__ import annotations
import argparse, hashlib, os, re, sys, time
from typing import List, Tuple, Dict, Any
import requests
import frontmatter
from tqdm import tqdm
# Matches [[wiki-style links]]; group 1 captures the target text between the brackets.
WIKI_LINK_RE = re.compile(r"\[\[([^\]]+)\]\]")
# Matches [label](target) markdown links; group 1 = label, group 2 = target/URL.
MD_LINK_RE = re.compile(r"\[([^\]]+)\]\(([^)]+)\)")
def stable_id(s: str) -> str:
    """Return a deterministic hex identifier for *s* (SHA-1 of its UTF-8 bytes)."""
    digest = hashlib.sha1(s.encode("utf-8"))
    return digest.hexdigest()
def chunk_text(text: str, max_chars: int = 1200, overlap: int = 200) -> List[str]:
    """Split markdown *text* into chunks of roughly at most *max_chars* characters.

    Strategy: heading lines act as split points (and are dropped), paragraphs
    are gathered into buffers of at most max_chars, oversized paragraphs are
    hard-split with *overlap* characters of carry-over, and finally adjacent
    small pieces are merged back together.

    NOTE: a chunk that starts with an overlap tail from its predecessor may
    exceed max_chars by up to overlap + 2 characters — the context overlap is
    prepended on purpose.

    Raises:
        ValueError: if overlap >= max_chars (the hard-split loop could not
            make progress and would run forever on a long paragraph).
    """
    if overlap >= max_chars:
        raise ValueError("overlap must be smaller than max_chars")
    parts: List[str] = []
    # Split on markdown heading lines; the headings themselves are discarded.
    for block in re.split(r"(?m)^#{1,6}\s.*$", text):
        block = block.strip()
        if not block:
            continue
        # Further split into double-newline paragraphs.
        paras = [p.strip() for p in block.split("\n\n") if p.strip()]
        buf = ""
        for p in paras:
            if len(buf) + len(p) + 2 <= max_chars:
                # Paragraph still fits in the buffer (incl. "\n\n" joiner).
                buf = f"{buf}\n\n{p}" if buf else p
            else:
                if buf:
                    parts.append(buf)
                # Hard-split paragraphs longer than max_chars, carrying an
                # overlap so context at the cut is not lost entirely.
                while len(p) > max_chars:
                    parts.append(p[:max_chars])
                    p = p[max_chars - overlap:]
                buf = p
        if buf:
            parts.append(buf)
    # Merge adjacent parts that fit together; otherwise start a new chunk
    # seeded with the tail of the previous one as context overlap.
    merged: List[str] = []
    for part in parts:
        if not merged:
            merged.append(part)
        elif len(merged[-1]) + len(part) + 2 <= max_chars:
            # BUGFIX: "+ 2" accounts for the "\n\n" separator; the previous
            # condition let merged chunks exceed max_chars by two characters.
            merged[-1] = merged[-1] + "\n\n" + part
        else:
            prev = merged[-1]
            tail = prev[-overlap:] if len(prev) > overlap else prev
            merged.append(tail + "\n\n" + part)
    return merged if merged else [text[:max_chars]]
def extract_links(md_text: str) -> List[str]:
    """Collect link targets from markdown text.

    Gathers wiki-style ``[[...]]`` targets first, then the URL part of
    standard ``[text](url)`` links, de-duplicated while preserving
    first-seen order.
    """
    seen: Dict[str, None] = {}
    for target in WIKI_LINK_RE.findall(md_text):
        seen.setdefault(target, None)
    for _label, url in MD_LINK_RE.findall(md_text):
        seen.setdefault(url, None)
    return list(seen)
def import_note(path: str, vault_root: str, api_base: str, prefix: str, dry_run: bool = False) -> Dict[str, Any]:
    """Import a single markdown note into the mindnet API.

    Parses the note's frontmatter, upserts the note itself, then every text
    chunk, then one ``links_to`` edge per link found in each chunk. With
    ``dry_run=True`` everything is parsed and counted but no HTTP requests
    are made. ``prefix`` is accepted for future use and currently unused.

    Returns a summary dict: note_id, title, path, chunks, edges.
    """
    rel_path = os.path.relpath(path, vault_root).replace("\\", "/")
    with open(path, "r", encoding="utf-8") as handle:
        post = frontmatter.load(handle)
    body = post.content
    meta = post.metadata or {}
    title = meta.get("title") or os.path.splitext(os.path.basename(path))[0]
    # Prefer an explicit note_id from frontmatter; otherwise derive a stable
    # one from the vault-relative path.
    note_id = meta.get("note_id") or stable_id(rel_path)
    note_payload = {
        "note_id": note_id,
        "title": title,
        "path": rel_path,
        "Typ": meta.get("Typ"),
        "Status": meta.get("Status"),
        "tags": meta.get("tags"),
        "Rolle": meta.get("Rolle"),
        "text": body,
    }
    if not dry_run:
        resp = requests.post(f"{api_base}/qdrant/upsert_note", json=note_payload, timeout=60)
        resp.raise_for_status()
    summary = {"chunks": 0, "edges": 0}
    for idx, chunk in enumerate(chunk_text(body), start=1):
        chunk_id = stable_id(f"{rel_path}#chunk-{idx}")
        links = extract_links(chunk)
        chunk_payload = {
            "chunk_id": chunk_id,
            "note_id": note_id,
            "title": title,
            "path": rel_path,
            "Typ": meta.get("Typ"),
            "Status": meta.get("Status"),
            "tags": meta.get("tags"),
            "Rolle": meta.get("Rolle"),
            "text": chunk,
            "links": links,
        }
        if not dry_run:
            resp = requests.post(f"{api_base}/qdrant/upsert_chunk", json=chunk_payload, timeout=60)
            resp.raise_for_status()
        summary["chunks"] += 1
        # One edge per outgoing link in this chunk.
        for link in links:
            edge_payload = {
                "src_note_id": note_id,
                "dst_note_id": stable_id(link),  # naive target id from link text or URL
                "src_chunk_id": chunk_id,
                "dst_chunk_id": None,
                "relation": "links_to",
                "link_text": link,
            }
            if not dry_run:
                resp = requests.post(f"{api_base}/qdrant/upsert_edge", json=edge_payload, timeout=60)
                resp.raise_for_status()
            summary["edges"] += 1
    return {"note_id": note_id, "title": title, "path": rel_path, **summary}
def main() -> int:
    """CLI entry point: walk the vault, import every .md file, print totals."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--vault", required=True, help="Path to Obsidian vault root")
    parser.add_argument("--api-base", default="http://127.0.0.1:8001", help="mindnet API base URL")
    parser.add_argument("--prefix", default="mindnet", help="Collection prefix (kept for future use)")
    parser.add_argument("--dry-run", action="store_true", help="Parse and show, but do not upsert")
    opts = parser.parse_args()
    # Recursively collect every markdown file under the vault root.
    md_files = [
        os.path.join(dirpath, name)
        for dirpath, _, names in os.walk(opts.vault)
        for name in names
        if name.lower().endswith(".md")
    ]
    if not md_files:
        print("No .md files found.")
        return 0
    print(f"Found {len(md_files)} markdown files.")
    totals = {"notes": 0, "chunks": 0, "edges": 0}
    for md_path in tqdm(md_files, desc="Importing"):
        summary = import_note(md_path, opts.vault, opts.api_base, opts.prefix, opts.dry_run)
        totals["notes"] += 1
        totals["chunks"] += summary["chunks"]
        totals["edges"] += summary["edges"]
    print(f"Done. Notes: {totals['notes']} Chunks: {totals['chunks']} Edges: {totals['edges']}")
    return 0
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit code.
    sys.exit(main())