#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
import_pdf_file.py

Import a single PDF file:
- extract its text via pdfplumber
- split the text into chunks with sentence overlap
- POST the chunks to the /embed endpoint
- move the file into the ``_imported`` sub-directory

Usage:
    python import_pdf_file.py CATEGORY FILENAME [topic]

The file is looked up as ``~/knowledge/CATEGORY/FILENAME``. ``topic``
defaults to ``"default"`` when omitted.
"""

import os
import shutil
import sys
from datetime import datetime, timezone
from uuid import uuid4  # noqa: F401  (not referenced below; kept from the original import list)

import pdfplumber
import requests

from chunking_utils import chunk_with_sentence_overlap

# Configuration
API_URL = "http://localhost:8000/embed"
MAX_LENGTH = 500
OVERLAP_SENTENCES = 2
# (connect, read) timeout in seconds. Without a timeout, requests waits
# indefinitely on an unresponsive server. The read timeout is generous on the
# assumption that embedding many chunks can be slow -- tune as needed.
REQUEST_TIMEOUT = (10, 300)


def _extract_pdf_text(file_path: str) -> str:
    """Return the text of every page, each page followed by a blank line."""
    with pdfplumber.open(file_path) as pdf:
        # extract_text() may yield None (handled with ``or ""``), e.g. for
        # pages without a text layer.
        return "".join(
            (page.extract_text() or "") + "\n\n" for page in pdf.pages
        )


def _build_payload(chunks, category: str, filename: str, topic: str) -> dict:
    """Build the JSON body for the embed endpoint.

    Every chunk carries the same metadata; only ``text`` and ``chunk_index``
    differ. ``imported_at`` is one shared timezone-aware UTC ISO timestamp.
    """
    imported_at = datetime.now(timezone.utc).isoformat()
    title = os.path.splitext(filename)[0]
    return {
        "collection": category,
        "chunks": [
            {
                "text": chunk,
                "source": filename,
                "source_type": "pdf",
                "title": title,
                "version": "v1.0",
                "related_to": category,
                "tags": [category],
                "owner": "karate-agent",
                "context_tag": topic,
                "imported_at": imported_at,
                "chunk_index": idx,
                "category": category,
            }
            for idx, chunk in enumerate(chunks)
        ],
    }


def main(argv=None) -> int:
    """Run the import and return the process exit code (0 on success).

    Args:
        argv: Argument vector including the program name; defaults to
            ``sys.argv``.
    """
    args = sys.argv if argv is None else argv
    if len(args) < 3:
        print("? Usage: python import_pdf_file.py CATEGORY FILENAME [topic]")
        return 1

    category = args[1]
    filename = args[2]
    topic = args[3] if len(args) > 3 else "default"

    source_dir = os.path.expanduser(f"~/knowledge/{category}")
    file_path = os.path.join(source_dir, filename)
    archive_dir = os.path.join(source_dir, "_imported")

    if not os.path.isfile(file_path):
        print(f"? Datei nicht gefunden: {file_path}")
        return 1

    os.makedirs(archive_dir, exist_ok=True)
    print(f"?? PDF-Import: {filename} in Kategorie {category} (Topic={topic})")

    # 1) Extract the PDF text
    text = _extract_pdf_text(file_path)

    # 2) Chunking
    chunks = chunk_with_sentence_overlap(
        text,
        max_length=MAX_LENGTH,
        overlap_sents=OVERLAP_SENTENCES,
    )
    print(f"?? {len(chunks)} Chunks erzeugt.")

    # Nothing to embed (e.g. a PDF without extractable text): do not send an
    # empty payload and do not archive the file, otherwise it would be marked
    # as imported although nothing was stored.
    if not chunks:
        print("Keine Chunks erzeugt - Datei wird nicht archiviert.")
        return 1

    # 3) Build the payload
    payload = _build_payload(chunks, category, filename, topic)

    # 4) Send
    try:
        res = requests.post(API_URL, json=payload, timeout=REQUEST_TIMEOUT)
        res.raise_for_status()
    except requests.RequestException as e:
        # Covers connection errors, timeouts and HTTP error statuses raised
        # by raise_for_status(); the file stays in place for a retry.
        print(f"? Fehler beim Senden: {e}")
        return 1
    print(f"? {len(chunks)} Chunks erfolgreich eingebettet.")

    # 5) Archive: only reached after a successful upload
    shutil.move(file_path, os.path.join(archive_dir, filename))
    print("?? PDF verschoben nach _imported/")
    return 0


if __name__ == "__main__":
    sys.exit(main())