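# import_textfile.py: split a UTF-8 text file into overlapping chunks and
# send them to the embedding API.
# Usage: python import_textfile.py <collection> <path_to_txt_file>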
import requests
|
||
import os
|
||
import sys
|
||
|
||
# 📌 Configuration
API_URL = "http://localhost:8000/embed"  # endpoint that import_chunks() posts to
CHUNK_SIZE = 500  # default chunk length, in characters
OVERLAP = 100  # default number of characters shared by neighbouring chunks


def chunk_text(text, size=CHUNK_SIZE, overlap=OVERLAP):
    """Split *text* into overlapping chunks.

    Consecutive chunks start ``size - overlap`` characters apart, so each
    chunk repeats the last ``overlap`` characters of the previous one.

    Args:
        text: The string to split.
        size: Maximum length of a chunk in characters; must be positive.
        overlap: Number of characters repeated at the start of the next
            chunk; must satisfy ``0 <= overlap < size``.

    Returns:
        A list of chunks in order of appearance. Empty text yields ``[]``.

    Raises:
        ValueError: If ``size`` is not positive or ``overlap`` is outside
            ``0 <= overlap < size``.
    """
    if size <= 0:
        raise ValueError(f"size must be positive, got {size}")
    if not 0 <= overlap < size:
        # overlap >= size would give a step <= 0 and the loop would never
        # advance; a negative overlap would silently skip text between chunks.
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < size, got {overlap} (size={size})"
        )

    step = size - overlap
    chunks = []
    for start in range(0, len(text), step):
        chunks.append(text[start:start + size])
        if start + size >= len(text):
            # This window already reaches the end of the text; another chunk
            # would only repeat a tail that is fully contained in this one.
            break
    return chunks
|
||
|
||
def read_text_file(path):
    """Return the entire contents of the text file at *path*, decoded as UTF-8."""
    with open(path, mode="r", encoding="utf-8") as handle:
        content = handle.read()
    return content
|
||
|
||
def import_chunks(chunks, collection, timeout=(5, 300)):
    """Send the text chunks to the API in a single POST request.

    Args:
        chunks: List of text chunks; sent as the ``"texts"`` field.
        collection: Sent as the ``"collection"`` field of the request body.
        timeout: Passed through to ``requests.post``. Either a single number
            of seconds, a ``(connect, read)`` tuple, or ``None`` to wait
            indefinitely. The default gives up after 5 s if no connection can
            be made and after 300 s without a response.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    payload = {
        "texts": chunks,
        "collection": collection,
    }
    # Without a timeout requests waits forever on a stalled server.
    response = requests.post(API_URL, json=payload, timeout=timeout)
    response.raise_for_status()
    return response.json()
|
||
|
||
def main(argv=None):
    """Run the command-line import: read a text file, chunk it, send it to the API.

    Args:
        argv: Full argument vector including the program name
            (``[prog, collection, path]``). Defaults to ``sys.argv``.

    Returns:
        The process exit status: 0 on success, 1 on a usage error, a missing
        or unreadable file, or a failed import.
    """
    args = sys.argv if argv is None else argv
    if len(args) != 3:
        print("❌ Nutzung: python import_textfile.py <collection> <pfad_zur_txt_datei>")
        return 1

    collection = args[1]
    filepath = os.path.expanduser(args[2])

    if not os.path.isfile(filepath):
        print(f"❌ Datei nicht gefunden: {filepath}")
        return 1

    print(f"📄 Lade Datei: {filepath}")
    try:
        text = read_text_file(filepath)
    except (OSError, UnicodeDecodeError) as e:
        # The file can vanish, be unreadable, or not be valid UTF-8 even
        # though the isfile() check above passed.
        print(f"❌ Datei konnte nicht gelesen werden: {e}")
        return 1
    chunks = chunk_text(text)

    print(f"📦 {len(chunks)} Abschnitte vorbereitet – sende an Collection '{collection}'...")
    try:
        result = import_chunks(chunks, collection)
    except (requests.RequestException, ValueError) as e:
        # RequestException covers connection, timeout and HTTP status errors;
        # ValueError covers a response body that is not valid JSON.
        print(f"❌ Fehler beim Import: {e}")
        # Report the failure to the calling shell instead of exiting with 0.
        return 1

    print(f"✅ Import erfolgreich: {result}")
    return 0


if __name__ == "__main__":
    sys.exit(main())
|