Trainer_LLM/scripts/archiv/import_textfile.py

62 lines
1.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import requests
import os
import sys
# Configuration
API_URL = "http://localhost:8000/embed"
CHUNK_SIZE = 500  # default chunk length used by chunk_text
OVERLAP = 100  # default number of items shared by consecutive chunks


def chunk_text(text, size=CHUNK_SIZE, overlap=OVERLAP):
    """Split *text* into consecutive, overlapping chunks.

    Chunks start every ``size - overlap`` positions and are at most ``size``
    long. The last chunk may be shorter than ``size``; when the text ends
    exactly on a chunk boundary it can consist solely of the overlap region
    of the previous chunk. A negative ``overlap`` is accepted and leaves a
    gap between chunks, as it always did.

    Args:
        text: The text to split.
        size: Maximum length of each chunk. Must be positive.
        overlap: Number of positions shared by consecutive chunks. Must be
            smaller than ``size``.

    Returns:
        A list of chunks in order; an empty list for empty text.

    Raises:
        ValueError: If ``size`` is not positive or ``overlap >= size``.
            Either case would keep the start position from advancing, so
            the chunking could never finish.
    """
    if size <= 0:
        raise ValueError(f"size must be positive, got {size}")
    if overlap >= size:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than size ({size})"
        )

    step = size - overlap
    # Slicing clamps at the end of the text, so no explicit min() is needed.
    return [text[start:start + size] for start in range(0, len(text), step)]
def read_text_file(path):
    """Read and return the entire contents of a UTF-8 text file.

    The file is decoded with ``utf-8-sig`` so that a leading byte-order mark
    (as written by some editors) is dropped instead of ending up as a stray
    U+FEFF character at the start of the text. Files without a BOM are read
    exactly as with plain ``utf-8``.

    Args:
        path: Path of the file to read.

    Returns:
        The decoded file contents as a single string.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(path, "r", encoding="utf-8-sig") as f:
        return f.read()
def import_chunks(chunks, collection, timeout=(10, None)):
    """Send the text chunks to the API at ``API_URL`` in one POST request.

    Args:
        chunks: The chunks to send; transmitted under the ``"texts"`` key of
            the JSON request body.
        collection: Transmitted under the ``"collection"`` key of the JSON
            request body.
        timeout: Passed through to ``requests.post``. The default applies a
            10 second connect timeout so an unreachable host fails instead
            of blocking forever, while leaving the wait for the response
            unlimited as before. Pass a number or a ``(connect, read)``
            tuple to bound the response time as well.

    Returns:
        The response body decoded from JSON.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    """
    payload = {
        "texts": chunks,
        "collection": collection,
    }
    response = requests.post(API_URL, json=payload, timeout=timeout)
    response.raise_for_status()
    return response.json()
def main(argv=None):
    """Command-line entry point: chunk a text file and send it to the API.

    Args:
        argv: Argument list laid out like ``sys.argv`` (program name first,
            then the collection name and the path of the text file).
            Defaults to ``sys.argv``.

    Returns:
        The process exit status: 0 after a successful import, 1 on wrong
        usage, a missing file, or a failed import.
    """
    args = sys.argv if argv is None else argv
    if len(args) != 3:
        print("❌ Nutzung: python import_textfile.py <collection> <pfad_zur_txt_datei>")
        return 1

    collection = args[1]
    filepath = os.path.expanduser(args[2])
    if not os.path.isfile(filepath):
        print(f"❌ Datei nicht gefunden: {filepath}")
        return 1

    print(f"📄 Lade Datei: {filepath}")
    text = read_text_file(filepath)
    chunks = chunk_text(text)
    print(f"📦 {len(chunks)} Abschnitte vorbereitet sende an Collection '{collection}'...")

    # Top-level boundary: report any failure of the API call, and signal it
    # through the exit status so calling shells and scripts can detect it.
    try:
        result = import_chunks(chunks, collection)
    except Exception as e:
        print(f"❌ Fehler beim Import: {e}")
        return 1

    print(f"✅ Import erfolgreich: {result}")
    return 0


if __name__ == "__main__":
    sys.exit(main())