67 lines
1.9 KiB
Python
67 lines
1.9 KiB
Python
import os
|
||
import sys
|
||
import requests
|
||
|
||
# 📌 Configuration
API_URL = "http://localhost:8000/embed"  # endpoint the chunks are POSTed to
CHUNK_SIZE = 500  # default chunk length, in characters
OVERLAP = 100  # default number of characters shared by consecutive chunks
|
||
|
||
# 📁 Parse command-line arguments
# Exactly one argument is expected: the category name. It selects both the
# source folder (~/knowledge/<category>) and the target collection.
if len(sys.argv) != 2:
    print("❌ Bitte gib eine Kategorie an, z. B.: python index_documents.py karatetrainer")
    sys.exit(1)

CATEGORY = sys.argv[1]
SOURCE_DIR = os.path.expanduser(f"~/knowledge/{CATEGORY}")
COLLECTION = CATEGORY

# isdir() rather than exists(): a regular file at this path would pass an
# existence check and then fail later when the folder is listed.
if not os.path.isdir(SOURCE_DIR):
    print(f"❌ Der Ordner '{SOURCE_DIR}' existiert nicht.")
    sys.exit(1)

print(f"📁 Lese Dokumente aus: {SOURCE_DIR}")
print(f"🎯 Ziel-Collection: {COLLECTION}")
|
||
|
||
# 🔧 Split text into overlapping chunks
def chunk_text(text, size=CHUNK_SIZE, overlap=OVERLAP):
    """Split *text* into chunks of at most *size* characters.

    Each chunk starts ``size - overlap`` characters after the previous one,
    so neighbouring chunks share *overlap* characters.

    Args:
        text: The string to split.
        size: Maximum length of a chunk; must be positive.
        overlap: Number of characters repeated between neighbouring chunks;
            must be smaller than *size*.

    Returns:
        A list of substrings of *text* in order; empty if *text* is empty.

    Raises:
        ValueError: If *size* is not positive or *overlap* is not smaller
            than *size* (the window would never advance).
    """
    if size <= 0:
        raise ValueError(f"size must be positive, got {size}")
    if overlap >= size:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than size ({size})"
        )

    step = size - overlap
    total = len(text)
    chunks = []
    start = 0
    while start < total:
        end = min(start + size, total)
        chunks.append(text[start:end])
        if end == total:
            # The text is fully covered; another step would only yield a
            # fragment already contained in this chunk.
            break
        start += step
    return chunks
|
||
|
||
# 📚 Read all .txt files in the folder
def read_all_text_files(folder):
    """Read every ``.txt`` file directly inside *folder* and chunk its content.

    Files are decoded as UTF-8 and split with ``chunk_text`` using its
    default settings. Subfolders are not descended into.

    Args:
        folder: Path of the directory to scan.

    Returns:
        A flat list with the chunks of all files, in file-name order.

    Raises:
        OSError: If the folder cannot be listed or a file cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    chunks = []
    # os.listdir() returns entries in arbitrary order; sort them so that
    # repeated runs produce the chunks in the same order.
    for filename in sorted(os.listdir(folder)):
        if not filename.endswith(".txt"):
            continue
        path = os.path.join(folder, filename)
        if not os.path.isfile(path):
            # A directory that merely has a .txt name cannot be opened.
            continue
        with open(path, "r", encoding="utf-8") as f:
            chunks.extend(chunk_text(f.read()))
    return chunks
|
||
|
||
# 📤 Send to the API
def embed_chunks(chunks, collection, timeout=(10, 600)):
    """POST *chunks* to ``API_URL`` and return the decoded JSON response.

    Args:
        chunks: List of text chunks, sent under the ``"texts"`` key.
        collection: Collection name, sent under the ``"collection"`` key.
        timeout: Passed to ``requests.post``; either a single number of
            seconds or a ``(connect, read)`` tuple. Without a timeout the
            call could block forever if the server stops responding.

    Returns:
        The response body parsed as JSON.

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            4xx/5xx status (via ``raise_for_status``).
    """
    response = requests.post(
        API_URL,
        json={"texts": chunks, "collection": collection},
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()
|
||
|
||
# 🚀 Main logic
if __name__ == "__main__":
    texts = read_all_text_files(SOURCE_DIR)
    if not texts:
        # Nothing to index is not an error, so exit successfully.
        print("⚠️ Keine Textabschnitte gefunden.")
        sys.exit(0)

    print(f"📦 {len(texts)} Textabschnitte gefunden, sende an {API_URL}...")
    try:
        result = embed_chunks(texts, COLLECTION)
    except Exception as e:
        # Top-level boundary: report the failure and exit non-zero so that
        # shells and schedulers can detect that indexing did not succeed.
        print(f"❌ Fehler beim Senden: {e}")
        sys.exit(1)
    else:
        print(f"✅ Ergebnis: {result}")
|