Trainer_LLM/scripts/import_pdf_file.py

93 lines
2.3 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
import_pdf_file.py
Importiert eine einzelne PDF-Datei:
- Text-Extraktion via pdfplumber
- Chunking mit Satz-Overlap
- POST an /embed
- Verschieben nach _imported
"""
import os
import sys
import shutil
import requests
from uuid import uuid4
from datetime import datetime, timezone
import pdfplumber
from chunking_utils import chunk_with_sentence_overlap
# Configuration
API_URL = "http://localhost:8000/embed"
MAX_LENGTH = 500
# (connect, read) timeout in seconds for the /embed request. Without a timeout
# requests waits indefinitely if the server accepts the connection but never
# answers, which would hang the whole import.
REQUEST_TIMEOUT = (10, 300)


def extract_pdf_text(file_path):
    """Return the text of all pages of the PDF at *file_path*.

    Each page's text is followed by a blank line ("\n\n"). Pages for which
    pdfplumber returns no text contribute only that separator.
    """
    with pdfplumber.open(file_path) as pdf:
        # Single join instead of repeated string concatenation.
        return "".join((page.extract_text() or "") + "\n\n" for page in pdf.pages)


def build_payload(chunks, category, filename, topic, imported_at):
    """Build the JSON body for the /embed endpoint.

    Args:
        chunks: Sequence of text chunks, in document order.
        category: Used as collection name, tag, ``related_to`` and ``category``.
        filename: Name of the source file; the title is the name without its
            last extension.
        topic: Value stored as ``context_tag``.
        imported_at: Timestamp string stored on every chunk.

    Returns:
        A dict with the keys ``collection`` and ``chunks``; ``chunks`` holds
        one metadata dict per input chunk.
    """
    title = os.path.splitext(filename)[0]
    return {
        "collection": category,
        "chunks": [
            {
                "text": chunk,
                "source": filename,
                "source_type": "pdf",
                "title": title,
                "version": "v1.0",
                "related_to": category,
                "tags": [category],
                "owner": "karate-agent",
                "context_tag": topic,
                "imported_at": imported_at,
                "chunk_index": idx,
                "category": category,
            }
            for idx, chunk in enumerate(chunks)
        ],
    }


def main(argv=None):
    """Import one PDF: extract text, chunk it, POST to /embed, archive the file.

    Args:
        argv: Argument vector in ``sys.argv`` layout
            (``[script, category, filename, [topic]]``). Defaults to
            ``sys.argv``.

    Exits with status 1 on wrong usage, a missing file, a PDF that yields no
    chunks, or a failed request. The source file is only moved to
    ``_imported`` after the request succeeded.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 3:
        print("? Usage: python import_pdf_file.py <category> <filename> [topic]")
        sys.exit(1)

    category = argv[1]
    filename = argv[2]
    topic = argv[3] if len(argv) > 3 else "default"

    source_dir = os.path.expanduser(f"~/knowledge/{category}")
    file_path = os.path.join(source_dir, filename)
    archive_dir = os.path.join(source_dir, "_imported")

    if not os.path.isfile(file_path):
        print(f"? Datei nicht gefunden: {file_path}")
        sys.exit(1)

    os.makedirs(archive_dir, exist_ok=True)
    print(f"?? PDF-Import: {filename} in Kategorie {category} (Topic={topic})")

    # 1) Extract the PDF text
    text = extract_pdf_text(file_path)

    # 2) Chunking
    chunks = chunk_with_sentence_overlap(
        text,
        max_length=MAX_LENGTH,
        overlap_sents=2,
    )
    print(f"?? {len(chunks)} Chunks erzeugt.")

    # A PDF without extractable text (e.g. a pure scan) produces no chunks.
    # Sending an empty payload and archiving the file would mark it as
    # imported although nothing was embedded, so stop and leave it in place.
    if not chunks:
        print(f"Keine Chunks erzeugt - nichts gesendet, Datei bleibt liegen: {file_path}")
        sys.exit(1)

    # 3) Build the payload
    now = datetime.now(timezone.utc).isoformat()
    payload = build_payload(chunks, category, filename, topic, now)

    # 4) Send
    try:
        res = requests.post(API_URL, json=payload, timeout=REQUEST_TIMEOUT)
        res.raise_for_status()
    except requests.RequestException as e:
        print(f"? Fehler beim Senden: {e}")
        sys.exit(1)
    print(f"? {len(chunks)} Chunks erfolgreich eingebettet.")

    # 5) Archive: only reached after a successful request
    shutil.move(file_path, os.path.join(archive_dir, filename))
    print("?? PDF verschoben nach _imported/")


if __name__ == "__main__":
    main()