# Trainer_LLM/scripts/archiv/import_textfile.py

import requests
import os
import sys

# 📌 Configuration
API_URL = "http://localhost:8000/embed"
CHUNK_SIZE = 500
OVERLAP = 100

def chunk_text(text, size=CHUNK_SIZE, overlap=OVERLAP):
    """
    Split a text into overlapping chunks.

    Consecutive chunks start ``size - overlap`` characters apart, so each
    chunk repeats the last ``overlap`` characters of its predecessor.

    Args:
        text: The string to split.
        size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by neighbouring chunks; must
            satisfy ``0 <= overlap < size``.

    Returns:
        A list of chunks in document order; empty when ``text`` is empty.

    Raises:
        ValueError: If ``size`` is not positive, or ``overlap`` is negative
            or not smaller than ``size`` (the window would never advance).
    """
    if size <= 0:
        raise ValueError("size must be a positive integer")
    if not 0 <= overlap < size:
        raise ValueError("overlap must satisfy 0 <= overlap < size")

    step = size - overlap
    chunks = []
    for start in range(0, len(text), step):
        chunks.append(text[start:start + size])
        # Stop as soon as a chunk reaches the end of the text; any further
        # window would lie entirely inside this chunk and only duplicate it.
        if start + size >= len(text):
            break
    return chunks

def read_text_file(path):
    """
    Return the complete contents of the file at ``path``, decoded as UTF-8.
    """
    with open(path, encoding="utf-8", mode="r") as source:
        contents = source.read()
    return contents

def import_chunks(chunks, collection, timeout=(10, 600)):
    """
    Send the text chunks to the embedding API in a single POST request.

    Args:
        chunks: List of text chunks, sent under the ``"texts"`` key.
        collection: Name of the target collection, sent under the
            ``"collection"`` key.
        timeout: Passed through to ``requests.post``. Either a number of
            seconds or a ``(connect, read)`` tuple. The default fails fast
            when the server is unreachable but leaves generous time for the
            server to respond to a large batch.

    Returns:
        The decoded JSON body of the API response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    # Without an explicit timeout, requests waits indefinitely on a stalled
    # or unreachable server.
    response = requests.post(
        API_URL,
        json={"texts": chunks, "collection": collection},
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()

if __name__ == "__main__":
    # Command-line entry point: <collection> <path to .txt file>.
    if len(sys.argv) != 3:
        print("❌ Nutzung: python import_textfile.py <collection> <pfad_zur_txt_datei>")
        sys.exit(1)

    collection = sys.argv[1]
    # Expand a leading "~" so home-relative paths work.
    filepath = os.path.expanduser(sys.argv[2])

    if not os.path.isfile(filepath):
        print(f"❌ Datei nicht gefunden: {filepath}")
        sys.exit(1)

    print(f"📄 Lade Datei: {filepath}")
    try:
        text = read_text_file(filepath)
    except (OSError, UnicodeDecodeError) as e:
        # Unreadable or non-UTF-8 files are reported like every other
        # failure instead of surfacing as a raw traceback.
        print(f"❌ Datei konnte nicht gelesen werden: {e}")
        sys.exit(1)
    chunks = chunk_text(text)

    print(f"📦 {len(chunks)} Abschnitte vorbereitet – sende an Collection '{collection}'...")
    try:
        result = import_chunks(chunks, collection)
    except Exception as e:
        # Top-level boundary: report any failure and signal it through the
        # exit status so calling shells and scripts can detect it.
        print(f"❌ Fehler beim Import: {e}")
        sys.exit(1)
    print(f"✅ Import erfolgreich: {result}")