93 lines
2.3 KiB
Python
93 lines
2.3 KiB
Python
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
"""
|
|
import_pdf_file.py

Imports a single PDF file:
- text extraction via pdfplumber
- chunking with sentence overlap
- POST to /embed
- move to _imported
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import shutil
|
|
import requests
|
|
from uuid import uuid4
|
|
from datetime import datetime, timezone
|
|
|
|
import pdfplumber
|
|
from chunking_utils import chunk_with_sentence_overlap
|
|
|
|
# Configuration
# Endpoint that receives the chunk payload. The EMBED_API_URL environment
# variable overrides the built-in default, so a non-local embedding service
# can be targeted without editing this script.
API_URL = os.environ.get("EMBED_API_URL", "http://localhost:8000/embed")

# Forwarded as max_length to chunk_with_sentence_overlap. The unit (characters
# vs. tokens) is defined by that helper -- TODO confirm against chunking_utils.
MAX_LENGTH = 500
|
|
|
|
# Command line: <category> and <filename> are mandatory, [topic] is optional.
cli_args = sys.argv[1:]
if len(cli_args) < 2:
    print("? Usage: python import_pdf_file.py <category> <filename> [topic]")
    sys.exit(1)

category, filename = cli_args[0], cli_args[1]
# Without a third argument the topic is the literal string "default".
topic = cli_args[2] if len(cli_args) >= 3 else "default"
|
|
|
|
# All files of a category live in ~/knowledge/<category>; the "_imported"
# subfolder of that directory is the archive destination.
source_dir = os.path.expanduser(f"~/knowledge/{category}")
archive_dir = os.path.join(source_dir, "_imported")
file_path = os.path.join(source_dir, filename)

# Stop before creating any directory when the requested PDF does not exist.
pdf_exists = os.path.isfile(file_path)
if not pdf_exists:
    print(f"? Datei nicht gefunden: {file_path}")
    sys.exit(1)

os.makedirs(archive_dir, exist_ok=True)
print(f"?? PDF-Import: {filename} in Kategorie {category} (Topic={topic})")
|
|
|
|
# 1) Extract the PDF text
# Every page contributes its text followed by a blank-line separator. A page
# for which extract_text() returns a falsy value (None or "") contributes only
# the separator. The pieces are joined in one pass instead of growing the
# string with += per page, which avoids repeated copying on large PDFs.
with pdfplumber.open(file_path) as pdf:
    text = "".join(
        (page.extract_text() or "") + "\n\n" for page in pdf.pages
    )
|
|
|
|
# 2) Chunking
# Split the extracted text into chunks bounded by MAX_LENGTH, repeating two
# sentences between neighbouring chunks (overlap_sents=2). The exact splitting
# rules are defined by chunk_with_sentence_overlap in chunking_utils.
chunks = chunk_with_sentence_overlap(
    text,
    max_length=MAX_LENGTH,
    overlap_sents=2,
)
print(f"?? {len(chunks)} Chunks erzeugt.")

# With no chunks there is nothing to embed. Abort here so that an empty
# payload is not posted and the PDF is not moved to _imported as if it had
# been imported. Presumably this happens for PDFs without a text layer
# (e.g. scans) -- verify against the helper's behaviour for empty text.
if not chunks:
    print(f"? Keine Chunks erzeugt, Import abgebrochen: {filename}")
    sys.exit(1)
|
|
|
|
# 3) Build the payload
# One UTC timestamp and one title (file name without extension) are shared by
# every chunk record of this import run.
now = datetime.now(timezone.utc).isoformat()
doc_title = os.path.splitext(filename)[0]

# The category doubles as the target collection, the relation, the single tag
# and the category field of each record; chunk_index is the chunk's position.
payload = {
    "collection": category,
    "chunks": [
        {
            "text": chunk_text,
            "source": filename,
            "source_type": "pdf",
            "title": doc_title,
            "version": "v1.0",
            "related_to": category,
            "tags": [category],
            "owner": "karate-agent",
            "context_tag": topic,
            "imported_at": now,
            "chunk_index": position,
            "category": category,
        }
        for position, chunk_text in enumerate(chunks)
    ],
}
|
|
|
|
# 4) Send
# (connect, read) timeouts in seconds. Without a timeout requests waits
# indefinitely, so an unresponsive service would hang the import forever.
# The read limit is deliberately generous because the server presumably
# embeds all chunks before answering -- tune to the real service.
REQUEST_TIMEOUT = (10, 300)

try:
    res = requests.post(API_URL, json=payload, timeout=REQUEST_TIMEOUT)
    # Turn 4xx/5xx responses into an exception so they are reported below.
    res.raise_for_status()
except requests.RequestException as e:
    # Covers connection errors, timeouts and HTTP error statuses. Exiting here
    # keeps the PDF in place so the import can be retried.
    print(f"? Fehler beim Senden: {e}")
    sys.exit(1)
else:
    print(f"? {len(chunks)} Chunks erfolgreich eingebettet.")
|
|
|
|
# 5) Archive
# Only reached after a successful upload: the failure path above exits first.
archive_target = os.path.join(archive_dir, filename)
shutil.move(file_path, archive_target)
print("?? PDF verschoben nach _imported/")
|