# Trainer_LLM/scripts/import_single_file.py
# (repository viewer metadata: 86 lines, 2.2 KiB, Python)

import os
import sys
import shutil
import requests
from datetime import datetime, timezone
#from text_chunker import chunk_text_paragraphs
from uuid import uuid4
from chunking_utils import (
chunk_text_paragraphs,
chunk_by_sentences,
chunk_with_sentence_overlap
)
# Configuration
# Endpoint that receives the chunk payload via HTTP POST.
API_URL: str = "http://localhost:8000/embed"
# Passed as max_length to chunk_with_sentence_overlap.
CHUNK_SIZE: int = 500
# NOTE(review): not referenced anywhere in the visible script (the chunker call
# hard-codes overlap_sents=2) — confirm whether this is still needed.
OVERLAP: int = 100
# Input parameters: category, file name, optional topic metadata
if len(sys.argv) < 3:
    print("? Aufruf: python import_single_file.py <category> <filename> [topic]")
    sys.exit(1)

CATEGORY = sys.argv[1]
FILENAME = sys.argv[2]
TOPIC = sys.argv[3] if len(sys.argv) > 3 else None

# Source files are looked up in ~/knowledge/<category>; the "_imported"
# subdirectory next to them is used as the archive location.
SOURCE_DIR = os.path.expanduser(f"~/knowledge/{CATEGORY}")
ARCHIVE_DIR = os.path.join(SOURCE_DIR, "_imported")
FILEPATH = os.path.join(SOURCE_DIR, FILENAME)

# Validation
# isfile() rather than exists(): a directory with that name must be rejected
# here, otherwise open() further down fails with an unhandled OSError.
if not os.path.isfile(FILEPATH):
    print(f"? Datei nicht gefunden: {FILEPATH}")
    sys.exit(1)

os.makedirs(ARCHIVE_DIR, exist_ok=True)
print(f"?? Importiere Datei: {FILENAME} aus Kategorie: {CATEGORY}")
# Read the file content and split it into chunks
with open(FILEPATH, "r", encoding="utf-8") as source_file:
    content = source_file.read()

chunks = chunk_with_sentence_overlap(
    content,
    max_length=CHUNK_SIZE,
    overlap_sents=2,
)
print(f"?? {len(chunks)} Textabschnitte erzeugt.")
# Prepare metadata
# One UTC timestamp shared by every chunk of this import run.
now = datetime.now(timezone.utc).isoformat()

# Strip only a trailing ".txt"; str.replace() would also remove ".txt" from the
# middle of a name such as "notes.txt.backup.txt". Computed once because it is
# identical for every chunk.
title = FILENAME[: -len(".txt")] if FILENAME.endswith(".txt") else FILENAME

# Request body: one metadata record per chunk plus the target collection.
payload = {
    "chunks": [
        {
            "text": chunk,
            "source": FILENAME,
            "source_type": "file",
            "title": title,
            "version": "v1.0",
            "related_to": CATEGORY,
            "tags": [CATEGORY],
            "owner": "karate-agent",
            "context_tag": TOPIC or "default",
            "imported_at": now,
            "chunk_index": i,
            "category": CATEGORY,
        }
        for i, chunk in enumerate(chunks)
    ],
    "collection": CATEGORY,
}
# Send to the API
# (connect, read) timeouts in seconds. Without a timeout, requests waits
# indefinitely on an unresponsive server. The read timeout is kept generous on
# the assumption that embedding a whole file can take a while — tune as needed.
REQUEST_TIMEOUT = (10, 300)

try:
    res = requests.post(API_URL, json=payload, timeout=REQUEST_TIMEOUT)
    # Turn HTTP 4xx/5xx responses into an exception so they abort the import.
    res.raise_for_status()
except requests.RequestException as e:
    # Exit before the archive step so a failed upload leaves the file in place.
    print(f"? Fehler beim Senden: {e}")
    sys.exit(1)
else:
    print(f"? {len(chunks)} Abschnitte erfolgreich eingebettet.")
# Archive the file: move it out of the source directory into the archive directory
archived_path = os.path.join(ARCHIVE_DIR, FILENAME)
shutil.move(FILEPATH, archived_path)
print("?? Datei nach _imported verschoben.")