scripts/import_markdown.py aktualisiert
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 3s

This commit is contained in:
Lars 2025-09-03 07:58:57 +02:00
parent 309462dfa9
commit 5254eb22ed

View File

@ -1,157 +1,109 @@
#!/usr/bin/env python3
from __future__ import annotations
import argparse, hashlib, os, re, sys, time
from typing import List, Tuple, Dict, Any
import requests
import frontmatter
from tqdm import tqdm
import argparse, os, glob, json, sys
from dotenv import load_dotenv
from qdrant_client import QdrantClient
WIKI_LINK_RE = re.compile(r"\[\[([^\]]+)\]\]")
MD_LINK_RE = re.compile(r"\[([^\]]+)\]\(([^)]+)\)")
from app.core.parser import read_markdown, normalize_frontmatter, validate_required_frontmatter
from app.core.note_payload import make_note_payload
from app.core.validate_note import validate_note_payload
from app.core.chunker import assemble_chunks
from app.core.chunk_payload import make_chunk_payloads
from app.core.embed import embed_texts, embed_one
from app.core.qdrant import QdrantConfig, ensure_collections, get_client
from app.core.qdrant_points import points_for_chunks, points_for_note, points_for_edges, upsert_batch
from app.core.edges import deriv_edges_for_note
def stable_id(s: str) -> str:
    """Return a deterministic identifier derived from *s*.

    The identifier is the hexadecimal SHA-1 digest of the UTF-8 encoding of
    *s*, so equal inputs always produce the same 40-character string.
    """
    return hashlib.sha1(s.encode("utf-8")).hexdigest()
def chunk_text(text: str, max_chars: int = 1200, overlap: int = 200) -> List[str]:
    """Split Markdown *text* into chunks of roughly *max_chars* characters.

    The text is first cut at ATX heading lines (``#`` .. ``######``); the
    heading lines themselves are consumed by the split and do not appear in
    the output. Each remaining block is split into paragraphs on blank lines,
    and paragraphs are packed greedily into buffers of at most *max_chars*
    characters. A single paragraph longer than *max_chars* is hard-split into
    windows that share *overlap* characters.

    A second pass joins neighbouring parts that still fit together, and
    otherwise prefixes a part with the last *overlap* characters of the
    previous chunk. Because of that prefix a returned chunk can be up to
    ``overlap + 2`` characters longer than *max_chars*.

    Args:
        text: Markdown body to split.
        max_chars: Target size of a chunk; must be at least 1.
        overlap: Number of characters repeated between neighbouring chunks.
            Values outside ``0 .. max_chars - 1`` are clamped into that range
            so that the hard-split loop always advances.

    Returns:
        A non-empty list of chunks. If no paragraph text is found, the list
        holds the single element ``text[:max_chars]``.

    Raises:
        ValueError: If *max_chars* is smaller than 1.
    """
    if max_chars < 1:
        raise ValueError("max_chars must be a positive integer")
    # overlap >= max_chars would leave the hard-split window in place forever;
    # a negative overlap would silently skip characters between windows.
    overlap = max(0, min(overlap, max_chars - 1))
    step = max_chars - overlap

    parts: List[str] = []
    for block in re.split(r"(?m)^#{1,6}\s.*$", text):
        paragraphs = [p.strip() for p in block.strip().split("\n\n") if p.strip()]
        buf = ""
        for para in paragraphs:
            if len(buf) + len(para) + 2 <= max_chars:
                buf = f"{buf}\n\n{para}" if buf else para
                continue
            if buf:
                parts.append(buf)
            # A paragraph that is too long on its own is cut into windows.
            while len(para) > max_chars:
                parts.append(para[:max_chars])
                para = para[step:]
            buf = para
        if buf:
            parts.append(buf)

    merged: List[str] = []
    for part in parts:
        if not merged:
            merged.append(part)
            continue
        prev = merged[-1]
        # Count the two-character separator so a joined chunk stays in budget.
        if len(prev) + 2 + len(part) <= max_chars:
            merged[-1] = f"{prev}\n\n{part}"
            continue
        # prev[-0:] would be the whole string, so overlap == 0 needs its own case.
        tail = prev[-overlap:] if overlap else ""
        merged.append(f"{tail}\n\n{part}" if tail else part)
    return merged if merged else [text[:max_chars]]


def extract_links(md_text: str) -> List[str]:
    """Return the link targets found in *md_text*, without duplicates.

    Matches of ``WIKI_LINK_RE`` (the text between ``[[`` and ``]]``) come
    first, followed by the parenthesised target of every ``MD_LINK_RE`` match
    (``[label](target)``). The first occurrence of each target determines its
    position in the result.
    """
    wiki_targets = WIKI_LINK_RE.findall(md_text)
    inline_targets = [target for _label, target in MD_LINK_RE.findall(md_text)]
    # dict.fromkeys keeps insertion order, which gives an ordered de-duplication.
    return list(dict.fromkeys(wiki_targets + inline_targets))


def _upsert(api_base: str, endpoint: str, payload: Dict[str, Any], dry_run: bool) -> None:
    """POST *payload* as JSON to ``{api_base}/qdrant/{endpoint}`` unless *dry_run* is set.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    if dry_run:
        return
    response = requests.post(f"{api_base}/qdrant/{endpoint}", json=payload, timeout=60)
    response.raise_for_status()


def import_note(path: str, vault_root: str, api_base: str, prefix: str, dry_run: bool = False) -> Dict[str, Any]:
    """Import one Markdown note: upsert the note, its chunks and its link edges.

    The file is parsed with ``frontmatter``; the note id is the ``note_id``
    front-matter value or, if that is missing, ``stable_id`` of the path
    relative to *vault_root*. The body is split with ``chunk_text``; every
    chunk is upserted together with the links found in it, and every link
    yields one ``links_to`` edge.

    Args:
        path: Path of the Markdown file.
        vault_root: Vault root used to compute the stored relative path.
        api_base: Base URL of the API that receives the upserts.
        prefix: Accepted for interface compatibility; not used here.
        dry_run: When true, nothing is sent; counts are still computed.

    Returns:
        A dict with ``note_id``, ``title``, ``path`` and the number of
        ``chunks`` and ``edges`` processed.

    Raises:
        requests.HTTPError: If any upsert request fails (not in dry-run mode).
    """
    rel_path = os.path.relpath(path, vault_root).replace("\\", "/")
    with open(path, "r", encoding="utf-8") as f:
        post = frontmatter.load(f)
    content = post.content
    meta = post.metadata or {}
    title = meta.get("title") or os.path.splitext(os.path.basename(path))[0]
    note_id = meta.get("note_id") or stable_id(rel_path)

    # Fields that are sent identically with the note and with each chunk.
    shared = {
        "title": title,
        "path": rel_path,
        "Typ": meta.get("Typ"),
        "Status": meta.get("Status"),
        "tags": meta.get("tags"),
        "Rolle": meta.get("Rolle"),
    }
    _upsert(api_base, "upsert_note", {"note_id": note_id, **shared, "text": content}, dry_run)

    results = {"chunks": 0, "edges": 0}
    for idx, chunk in enumerate(chunk_text(content), start=1):
        chunk_id = stable_id(f"{rel_path}#chunk-{idx}")
        links = extract_links(chunk)
        chunk_payload = {
            "chunk_id": chunk_id,
            "note_id": note_id,
            **shared,
            "text": chunk,
            "links": links,
        }
        _upsert(api_base, "upsert_chunk", chunk_payload, dry_run)
        results["chunks"] += 1

        for link in links:
            edge_payload = {
                "src_note_id": note_id,
                # Naive target id: a hash of the raw link text or URL.
                "dst_note_id": stable_id(link),
                "src_chunk_id": chunk_id,
                "dst_chunk_id": None,
                "relation": "links_to",
                "link_text": link,
            }
            _upsert(api_base, "upsert_edge", edge_payload, dry_run)
            results["edges"] += 1
    return {"note_id": note_id, "title": title, "path": rel_path, **results}


def iter_md(root: str, exclude=(" /.obsidian/ ", " /_backup_frontmatter/ ", " /_imported/ ")):
    """Return the ``*.md`` files below *root*, skipping excluded directories.

    Args:
        root: Directory that is searched recursively.
        exclude: Path fragments to skip. Each entry is stripped of surrounding
            whitespace and compared against the file path with backslashes
            replaced by ``/``. Entries that are empty after stripping are
            ignored, because an empty fragment would match every path.

    Returns:
        The matching paths as produced by ``glob``, in sorted order so that
        repeated runs process files in the same sequence.
    """
    markers = [marker for marker in (ex.strip() for ex in exclude) if marker]
    pattern = os.path.join(root, "**", "*.md")
    out = []
    for p in sorted(glob.glob(pattern, recursive=True)):
        pn = p.replace("\\", "/")
        if any(marker in pn for marker in markers):
            continue
        out.append(p)
    return out
# NOTE(review): this span appears to contain two revisions of main() interleaved
# line by line, with all indentation lost, so it is documented in place and the
# code is left untouched.
#   * One revision collects *.md files with os.walk, calls import_note() for each
#     file (HTTP upserts against --api-base) and prints Notes/Chunks/Edges totals.
#   * The other revision lists files with iter_md(), builds note/chunk payloads,
#     embeddings and edges through the app.core helpers, prints a JSON summary per
#     note and writes to Qdrant only when --apply is given.
def main():
load_dotenv()
ap = argparse.ArgumentParser()
# Options that are read by the os.walk / import_note() revision.
ap.add_argument("--vault", required=True, help="Path to Obsidian vault root")
ap.add_argument("--api-base", default="http://127.0.0.1:8001", help="mindnet API base URL")
ap.add_argument("--prefix", default="mindnet", help="Collection prefix (kept for future use)")
ap.add_argument("--dry-run", action="store_true", help="Parse and show, but do not upsert")
# Options that are read by the iter_md() / Qdrant revision.
# NOTE(review): "--vault" is registered a second time here; on a single parser
# argparse rejects a repeated option string, which supports reading these lines
# as belonging to a different revision.
ap.add_argument("--vault", required=True, help="Obsidian Vault Pfad (z.B. mindnet/vault)")
ap.add_argument("--apply", action="store_true", help="Schreibt in Qdrant (sonst Dry-Run)")
ap.add_argument("--note-id", help="Nur eine Note-ID verarbeiten")
ap.add_argument("--embed-note", action="store_true", help="Auch Note-Volltext einbetten (optional)")
args = ap.parse_args()
# Collect every file below the vault whose name ends in ".md" (case-insensitive).
md_files = []
for root, _, files in os.walk(args.vault):
for fn in files:
if fn.lower().endswith(".md"):
md_files.append(os.path.join(root, fn))
# Qdrant configuration, read from environment variables with local defaults.
cfg = QdrantConfig(
url=os.getenv("QDRANT_URL", "http://127.0.0.1:6333"),
api_key=os.getenv("QDRANT_API_KEY") or None,
prefix=os.getenv("COLLECTION_PREFIX", "mindnet"),
dim=int(os.getenv("VECTOR_DIM","384")),
)
client = get_client(cfg)
ensure_collections(cfg)
# os.walk revision: nothing to do when no Markdown file was found.
if not md_files:
print("No .md files found.")
return 0
# iter_md revision: resolve the vault path and list its Markdown files;
# an empty result is reported on stderr and ends the process with status 2.
root = os.path.abspath(args.vault)
files = iter_md(root)
if not files:
print("Keine Markdown-Dateien gefunden.", file=sys.stderr); sys.exit(2)
# os.walk revision: import each file and accumulate the per-note counts.
print(f"Found {len(md_files)} markdown files.")
stats = {"notes": 0, "chunks": 0, "edges": 0}
for path in tqdm(md_files, desc="Importing"):
res = import_note(path, args.vault, args.api_base, args.prefix, args.dry_run)
stats["notes"] += 1
stats["chunks"] += res["chunks"]
stats["edges"] += res["edges"]
# iter_md revision: process each listed file.
total_notes = 0
for path in files:
parsed = read_markdown(path)
fm = normalize_frontmatter(parsed.frontmatter)
# NOTE(review): any exception raised by the validation skips the file
# without reporting which file was skipped or why.
try:
validate_required_frontmatter(fm)
except Exception:
continue
# With --note-id set, only the note whose front-matter "id" matches is processed.
if args.note_id and fm.get("id") != args.note_id:
continue
# os.walk revision: final totals and exit status.
print(f"Done. Notes: {stats['notes']} Chunks: {stats['chunks']} Edges: {stats['edges']}")
return 0
total_notes += 1
# Note payload
note_pl = make_note_payload(parsed, vault_root=root)
validate_note_payload(note_pl)
# Chunks
chunks = assemble_chunks(fm["id"], parsed.body, fm.get("type", "concept"))
chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks)
# Embeddings (chunks): one text per chunk, taken from its .text attribute.
texts = [c for c in (ch.text for ch in chunks)]
vectors = embed_texts(texts)
# Optional: note vector, computed from the note body only when --embed-note is set.
note_vec = None
if args.embed_note:
note_vec = embed_one(parsed.body)
# Edges
edges = deriv_edges_for_note(fm, chunk_pls)
# Dry-run output: the summary is printed before the --apply check.
summary = {
"note_id": fm["id"],
"title": fm["title"],
"chunks": len(chunk_pls),
"edges": len(edges),
"path": note_pl["path"]
}
print(json.dumps(summary, ensure_ascii=False))
# Writes to Qdrant happen only when --apply was given.
if args.apply:
# Notes upsert
notes_col, note_pts = points_for_note(cfg.prefix, note_pl, note_vec)
upsert_batch(client, notes_col, note_pts)
# Chunks upsert
chunks_col, chunk_pts = points_for_chunks(cfg.prefix, chunk_pls, vectors)
upsert_batch(client, chunks_col, chunk_pts)
# Edges upsert
edges_col, edge_pts = points_for_edges(cfg.prefix, edges)
upsert_batch(client, edges_col, edge_pts)
print(f"Done. Processed notes: {total_notes}")
# Script entry point. NOTE(review): the two call lines below look like the two
# revisions of the same statement: one exits with main()'s return value, the
# other calls main() and ignores it.
if __name__ == "__main__":
raise SystemExit(main())
main()