scripts/import_markdown.py aktualisiert
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 3s
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 3s
This commit is contained in:
parent
309462dfa9
commit
5254eb22ed
|
|
@@ -1,157 +1,109 @@
|
||||||
|
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
import argparse, hashlib, os, re, sys, time
|
import argparse, os, glob, json, sys
|
||||||
from typing import List, Tuple, Dict, Any
|
from dotenv import load_dotenv
|
||||||
import requests
|
from qdrant_client import QdrantClient
|
||||||
import frontmatter
|
|
||||||
from tqdm import tqdm
|
|
||||||
|
|
||||||
WIKI_LINK_RE = re.compile(r"\[\[([^\]]+)\]\]")
|
from app.core.parser import read_markdown, normalize_frontmatter, validate_required_frontmatter
|
||||||
MD_LINK_RE = re.compile(r"\[([^\]]+)\]\(([^)]+)\)")
|
from app.core.note_payload import make_note_payload
|
||||||
|
from app.core.validate_note import validate_note_payload
|
||||||
|
from app.core.chunker import assemble_chunks
|
||||||
|
from app.core.chunk_payload import make_chunk_payloads
|
||||||
|
from app.core.embed import embed_texts, embed_one
|
||||||
|
from app.core.qdrant import QdrantConfig, ensure_collections, get_client
|
||||||
|
from app.core.qdrant_points import points_for_chunks, points_for_note, points_for_edges, upsert_batch
|
||||||
|
from app.core.edges import deriv_edges_for_note
|
||||||
|
|
||||||
def stable_id(s: str) -> str:
|
def iter_md(root: str, exclude=(" /.obsidian/ ", " /_backup_frontmatter/ ", " /_imported/ ")):
|
||||||
return hashlib.sha1(s.encode("utf-8")).hexdigest()
|
files = [p for p in glob.glob(os.path.join(root, "**", "*.md"), recursive=True)]
|
||||||
|
out = []
|
||||||
def chunk_text(text: str, max_chars: int = 1200, overlap: int = 200) -> List[str]:
|
for p in files:
|
||||||
# Prefer splitting on headings and paragraphs, then length bucket
|
pn = p.replace("\\","/")
|
||||||
parts: List[str] = []
|
if any(ex.strip() in pn for ex in ["/.obsidian/", "/_backup_frontmatter/", "/_imported/"]):
|
||||||
# Split by headings while keeping content
|
|
||||||
blocks = re.split(r"(?m)^#{1,6}\s.*$", text)
|
|
||||||
for block in blocks:
|
|
||||||
block = block.strip()
|
|
||||||
if not block:
|
|
||||||
continue
|
continue
|
||||||
# further split by double-newline paragraphs
|
out.append(p)
|
||||||
paras = [p.strip() for p in block.split("\n\n") if p.strip()]
|
return out
|
||||||
buf = ""
|
|
||||||
for p in paras:
|
|
||||||
if len(buf) + len(p) + 2 <= max_chars:
|
|
||||||
buf = f"{buf}\n\n{p}" if buf else p
|
|
||||||
else:
|
|
||||||
if buf:
|
|
||||||
parts.append(buf)
|
|
||||||
# if paragraph itself is very long, hard-split
|
|
||||||
while len(p) > max_chars:
|
|
||||||
parts.append(p[:max_chars])
|
|
||||||
p = p[max_chars - overlap:]
|
|
||||||
buf = p
|
|
||||||
if buf:
|
|
||||||
parts.append(buf)
|
|
||||||
buf = ""
|
|
||||||
# merge with overlap
|
|
||||||
merged: List[str] = []
|
|
||||||
for i, part in enumerate(parts):
|
|
||||||
if not merged:
|
|
||||||
merged.append(part)
|
|
||||||
else:
|
|
||||||
prev = merged[-1]
|
|
||||||
if len(prev) + len(part) <= max_chars:
|
|
||||||
merged[-1] = prev + "\n\n" + part
|
|
||||||
else:
|
|
||||||
# add overlap from prev end to next start
|
|
||||||
tail = prev[-overlap:] if len(prev) > overlap else prev
|
|
||||||
merged.append(tail + "\n\n" + part)
|
|
||||||
return merged if merged else [text[:max_chars]]
|
|
||||||
|
|
||||||
def extract_links(md_text: str) -> List[str]:
|
|
||||||
links = []
|
|
||||||
links += WIKI_LINK_RE.findall(md_text)
|
|
||||||
links += [m[1] for m in MD_LINK_RE.findall(md_text)]
|
|
||||||
return list(dict.fromkeys(links)) # de-dup, preserve order
|
|
||||||
|
|
||||||
def import_note(path: str, vault_root: str, api_base: str, prefix: str, dry_run: bool = False) -> Dict[str, Any]:
|
|
||||||
rel_path = os.path.relpath(path, vault_root).replace("\\", "/")
|
|
||||||
with open(path, "r", encoding="utf-8") as f:
|
|
||||||
post = frontmatter.load(f)
|
|
||||||
content = post.content
|
|
||||||
meta = post.metadata or {}
|
|
||||||
title = meta.get("title") or os.path.splitext(os.path.basename(path))[0]
|
|
||||||
note_id = meta.get("note_id") or stable_id(rel_path)
|
|
||||||
|
|
||||||
# Upsert note
|
|
||||||
note_payload = {
|
|
||||||
"note_id": note_id,
|
|
||||||
"title": title,
|
|
||||||
"path": rel_path,
|
|
||||||
"Typ": meta.get("Typ"),
|
|
||||||
"Status": meta.get("Status"),
|
|
||||||
"tags": meta.get("tags"),
|
|
||||||
"Rolle": meta.get("Rolle"),
|
|
||||||
"text": content,
|
|
||||||
}
|
|
||||||
if not dry_run:
|
|
||||||
r = requests.post(f"{api_base}/qdrant/upsert_note", json=note_payload, timeout=60)
|
|
||||||
r.raise_for_status()
|
|
||||||
|
|
||||||
# Chunks
|
|
||||||
chunks = chunk_text(content)
|
|
||||||
results = {"chunks": 0, "edges": 0}
|
|
||||||
for idx, chunk in enumerate(chunks, start=1):
|
|
||||||
chunk_id = stable_id(f"{rel_path}#chunk-{idx}")
|
|
||||||
links = extract_links(chunk)
|
|
||||||
chunk_payload = {
|
|
||||||
"chunk_id": chunk_id,
|
|
||||||
"note_id": note_id,
|
|
||||||
"title": title,
|
|
||||||
"path": rel_path,
|
|
||||||
"Typ": meta.get("Typ"),
|
|
||||||
"Status": meta.get("Status"),
|
|
||||||
"tags": meta.get("tags"),
|
|
||||||
"Rolle": meta.get("Rolle"),
|
|
||||||
"text": chunk,
|
|
||||||
"links": links,
|
|
||||||
}
|
|
||||||
if not dry_run:
|
|
||||||
r = requests.post(f"{api_base}/qdrant/upsert_chunk", json=chunk_payload, timeout=60)
|
|
||||||
r.raise_for_status()
|
|
||||||
results["chunks"] += 1
|
|
||||||
|
|
||||||
# Edges from links
|
|
||||||
for link in links:
|
|
||||||
edge_payload = {
|
|
||||||
"src_note_id": note_id,
|
|
||||||
"dst_note_id": stable_id(link), # naive target id from link text or URL
|
|
||||||
"src_chunk_id": chunk_id,
|
|
||||||
"dst_chunk_id": None,
|
|
||||||
"relation": "links_to",
|
|
||||||
"link_text": link,
|
|
||||||
}
|
|
||||||
if not dry_run:
|
|
||||||
r = requests.post(f"{api_base}/qdrant/upsert_edge", json=edge_payload, timeout=60)
|
|
||||||
r.raise_for_status()
|
|
||||||
results["edges"] += 1
|
|
||||||
|
|
||||||
return {"note_id": note_id, "title": title, "path": rel_path, **results}
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
load_dotenv()
|
||||||
ap = argparse.ArgumentParser()
|
ap = argparse.ArgumentParser()
|
||||||
ap.add_argument("--vault", required=True, help="Path to Obsidian vault root")
|
ap.add_argument("--vault", required=True, help="Obsidian Vault Pfad (z.B. mindnet/vault)")
|
||||||
ap.add_argument("--api-base", default="http://127.0.0.1:8001", help="mindnet API base URL")
|
ap.add_argument("--apply", action="store_true", help="Schreibt in Qdrant (sonst Dry-Run)")
|
||||||
ap.add_argument("--prefix", default="mindnet", help="Collection prefix (kept for future use)")
|
ap.add_argument("--note-id", help="Nur eine Note-ID verarbeiten")
|
||||||
ap.add_argument("--dry-run", action="store_true", help="Parse and show, but do not upsert")
|
ap.add_argument("--embed-note", action="store_true", help="Auch Note-Volltext einbetten (optional)")
|
||||||
args = ap.parse_args()
|
args = ap.parse_args()
|
||||||
|
|
||||||
md_files = []
|
# Qdrant
|
||||||
for root, _, files in os.walk(args.vault):
|
cfg = QdrantConfig(
|
||||||
for fn in files:
|
url=os.getenv("QDRANT_URL", "http://127.0.0.1:6333"),
|
||||||
if fn.lower().endswith(".md"):
|
api_key=os.getenv("QDRANT_API_KEY") or None,
|
||||||
md_files.append(os.path.join(root, fn))
|
prefix=os.getenv("COLLECTION_PREFIX", "mindnet"),
|
||||||
|
dim=int(os.getenv("VECTOR_DIM","384")),
|
||||||
|
)
|
||||||
|
client = get_client(cfg)
|
||||||
|
ensure_collections(cfg)
|
||||||
|
|
||||||
if not md_files:
|
root = os.path.abspath(args.vault)
|
||||||
print("No .md files found.")
|
files = iter_md(root)
|
||||||
return 0
|
if not files:
|
||||||
|
print("Keine Markdown-Dateien gefunden.", file=sys.stderr); sys.exit(2)
|
||||||
|
|
||||||
print(f"Found {len(md_files)} markdown files.")
|
total_notes = 0
|
||||||
stats = {"notes": 0, "chunks": 0, "edges": 0}
|
for path in files:
|
||||||
for path in tqdm(md_files, desc="Importing"):
|
parsed = read_markdown(path)
|
||||||
res = import_note(path, args.vault, args.api_base, args.prefix, args.dry_run)
|
fm = normalize_frontmatter(parsed.frontmatter)
|
||||||
stats["notes"] += 1
|
try:
|
||||||
stats["chunks"] += res["chunks"]
|
validate_required_frontmatter(fm)
|
||||||
stats["edges"] += res["edges"]
|
except Exception:
|
||||||
|
continue
|
||||||
|
if args.note_id and fm.get("id") != args.note_id:
|
||||||
|
continue
|
||||||
|
|
||||||
print(f"Done. Notes: {stats['notes']} Chunks: {stats['chunks']} Edges: {stats['edges']}")
|
total_notes += 1
|
||||||
return 0
|
# Note-Payload
|
||||||
|
note_pl = make_note_payload(parsed, vault_root=root)
|
||||||
|
validate_note_payload(note_pl)
|
||||||
|
|
||||||
|
# Chunks
|
||||||
|
chunks = assemble_chunks(fm["id"], parsed.body, fm.get("type", "concept"))
|
||||||
|
chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks)
|
||||||
|
|
||||||
|
# Embeddings (Chunks)
|
||||||
|
texts = [c for c in (ch.text for ch in chunks)]
|
||||||
|
vectors = embed_texts(texts)
|
||||||
|
|
||||||
|
# Optional: Note-Vektor
|
||||||
|
note_vec = None
|
||||||
|
if args.embed_note:
|
||||||
|
note_vec = embed_one(parsed.body)
|
||||||
|
|
||||||
|
# Kanten
|
||||||
|
edges = deriv_edges_for_note(fm, chunk_pls)
|
||||||
|
|
||||||
|
# Dry-Run-Ausgabe
|
||||||
|
summary = {
|
||||||
|
"note_id": fm["id"],
|
||||||
|
"title": fm["title"],
|
||||||
|
"chunks": len(chunk_pls),
|
||||||
|
"edges": len(edges),
|
||||||
|
"path": note_pl["path"]
|
||||||
|
}
|
||||||
|
print(json.dumps(summary, ensure_ascii=False))
|
||||||
|
|
||||||
|
if args.apply:
|
||||||
|
# Notes upsert
|
||||||
|
notes_col, note_pts = points_for_note(cfg.prefix, note_pl, note_vec)
|
||||||
|
upsert_batch(client, notes_col, note_pts)
|
||||||
|
|
||||||
|
# Chunks upsert
|
||||||
|
chunks_col, chunk_pts = points_for_chunks(cfg.prefix, chunk_pls, vectors)
|
||||||
|
upsert_batch(client, chunks_col, chunk_pts)
|
||||||
|
|
||||||
|
# Edges upsert
|
||||||
|
edges_col, edge_pts = points_for_edges(cfg.prefix, edges)
|
||||||
|
upsert_batch(client, edges_col, edge_pts)
|
||||||
|
|
||||||
|
print(f"Done. Processed notes: {total_notes}")
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
raise SystemExit(main())
|
main()
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user