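# import_single_file.py
# Chunk one text file from ~/knowledge/<category>, send the chunks to the
# embedding API, then move the file into the category's _imported folder.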
import os
import shutil
import sys
from datetime import datetime, timezone
from uuid import uuid4

import requests

# Earlier import, kept for reference (chunk_text_paragraphs now comes from
# chunking_utils):
# from text_chunker import chunk_text_paragraphs
from chunking_utils import (
    chunk_by_sentences,
    chunk_text_paragraphs,
    chunk_with_sentence_overlap,
)


# Configuration
API_URL = "http://localhost:8000/embed"  # endpoint that receives the chunk payload
CHUNK_SIZE = 500  # passed to the chunker as max_length
OVERLAP = 100  # NOTE(review): not referenced below; the chunker is called with overlap_sents=2
REQUEST_TIMEOUT = 60  # seconds; without a timeout requests.post can block forever


def derive_title(filename):
    """Return *filename* without a trailing ``.txt`` extension.

    Only a real ``.txt`` suffix is removed; any other name is returned
    unchanged, and a ``.txt`` occurring in the middle of the name is kept.
    """
    suffix = ".txt"
    if filename.endswith(suffix):
        return filename[: -len(suffix)]
    return filename


def build_payload(chunks, category, filename, topic=None, imported_at=None):
    """Build the JSON-serialisable request body for the embed API.

    Args:
        chunks: Iterable of text chunks, in document order.
        category: Category name; used as collection, tag, ``related_to``
            and ``category`` of every chunk.
        filename: Name of the source file; also the basis of the title.
        topic: Optional context tag; ``"default"`` is used when falsy.
        imported_at: ISO-8601 timestamp string shared by all chunks;
            defaults to the current UTC time.

    Returns:
        A dict with the keys ``"chunks"`` (one metadata dict per chunk) and
        ``"collection"``.
    """
    if imported_at is None:
        imported_at = datetime.now(timezone.utc).isoformat()
    title = derive_title(filename)
    context_tag = topic or "default"
    return {
        "chunks": [
            {
                "text": chunk,
                "source": filename,
                "source_type": "file",
                "title": title,
                "version": "v1.0",
                "related_to": category,
                "tags": [category],
                "owner": "karate-agent",
                "context_tag": context_tag,
                "imported_at": imported_at,
                "chunk_index": index,
                "category": category,
            }
            for index, chunk in enumerate(chunks)
        ],
        "collection": category,
    }


def main(argv=None):
    """Import one file: read it, chunk it, post the chunks, archive the file.

    Args:
        argv: Argument vector ``[prog, category, filename, [topic]]``;
            defaults to ``sys.argv``.

    Returns:
        Process exit code: ``0`` on success, ``1`` on a usage error, a
        missing file, or a failed request.
    """
    argv = sys.argv if argv is None else argv

    # Input parameters: category, file name, optional topic
    if len(argv) < 3:
        print("? Aufruf: python import_single_file.py <category> <filename> [topic]")
        return 1

    category = argv[1]
    filename = argv[2]
    topic = argv[3] if len(argv) > 3 else None

    source_dir = os.path.expanduser(f"~/knowledge/{category}")
    archive_dir = os.path.join(source_dir, "_imported")
    filepath = os.path.join(source_dir, filename)

    # Validation: isfile() rather than exists(), so a directory is rejected
    # here instead of failing later in open().
    if not os.path.isfile(filepath):
        print(f"? Datei nicht gefunden: {filepath}")
        return 1

    os.makedirs(archive_dir, exist_ok=True)

    print(f"?? Importiere Datei: {filename} aus Kategorie: {category}")

    # Read the content and split it into chunks
    with open(filepath, "r", encoding="utf-8") as f:
        content = f.read()

    chunks = chunk_with_sentence_overlap(content, max_length=CHUNK_SIZE, overlap_sents=2)
    print(f"?? {len(chunks)} Textabschnitte erzeugt.")

    # Prepare metadata
    payload = build_payload(chunks, category, filename, topic)

    # Send to the API; only transport/HTTP failures are handled here so that
    # programming errors still surface with a traceback.
    try:
        res = requests.post(API_URL, json=payload, timeout=REQUEST_TIMEOUT)
        res.raise_for_status()
    except requests.RequestException as e:
        print(f"? Fehler beim Senden: {e}")
        return 1
    print(f"? {len(chunks)} Abschnitte erfolgreich eingebettet.")

    # Archive the file only after the upload succeeded
    shutil.move(filepath, os.path.join(archive_dir, filename))
    print("?? Datei nach _imported verschoben.")
    return 0


if __name__ == "__main__":
    sys.exit(main())