Trainer_LLM/scripts/archiv/index_documents.py

67 lines
1.9 KiB
Python
Raw Permalink Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
import sys
import requests
# 📌 Configuration
API_URL = "http://localhost:8000/embed"  # endpoint the chunks are POSTed to
CHUNK_SIZE = 500  # default chunk length, in characters
OVERLAP = 100  # default number of characters shared by consecutive chunks

# 📁 Evaluate the command-line parameters: exactly one argument (the category)
# is expected; it selects both the source folder and the target collection.
if len(sys.argv) != 2:
    print("❌ Bitte gib eine Kategorie an, z.B.: python index_documents.py karatetrainer")
    sys.exit(1)

CATEGORY = sys.argv[1]
SOURCE_DIR = os.path.expanduser(f"~/knowledge/{CATEGORY}")
COLLECTION = CATEGORY

# Require an actual directory: a regular file at this path would pass a plain
# existence check and then make the later directory listing fail.
if not os.path.isdir(SOURCE_DIR):
    print(f"❌ Der Ordner '{SOURCE_DIR}' existiert nicht.")
    sys.exit(1)

print(f"📁 Lese Dokumente aus: {SOURCE_DIR}")
print(f"🎯 Ziel-Collection: {COLLECTION}")
# 🔧 Split text into overlapping chunks
def chunk_text(text, size=CHUNK_SIZE, overlap=OVERLAP):
    """Split *text* into chunks of at most *size* characters.

    Each chunk starts ``size - overlap`` characters after the previous one,
    so neighbouring chunks share *overlap* characters.

    Args:
        text: The string to split.
        size: Maximum length of a chunk, in characters. Must be positive.
        overlap: Number of characters repeated at the start of the next
            chunk. Must satisfy ``0 <= overlap < size``.

    Returns:
        A list of chunk strings in document order; empty if *text* is empty.

    Raises:
        ValueError: If *size* is not positive or *overlap* is outside
            ``[0, size)``. Without this check the window would never
            advance and the loop would not terminate.
    """
    if size <= 0:
        raise ValueError(f"size must be positive, got {size}")
    if not 0 <= overlap < size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < size, got overlap={overlap}, size={size}"
        )

    step = size - overlap
    total = len(text)
    chunks = []
    start = 0
    while start < total:
        end = min(start + size, total)
        chunks.append(text[start:end])
        if end == total:
            # The text is exhausted. Advancing once more would only emit a
            # tail that is already fully contained in this chunk.
            break
        start += step
    return chunks
# 📚 Read all .txt files in the folder
def read_all_text_files(folder):
    """Read every ``.txt`` file directly inside *folder* and chunk its text.

    Files are decoded as UTF-8 and split with ``chunk_text`` using its
    default size and overlap. Subfolders are not descended into.

    Args:
        folder: Path of the directory to scan.

    Returns:
        A flat list with the chunks of all files, ordered by file name and,
        within a file, by position in the text.

    Raises:
        OSError: If *folder* cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    chunks = []
    # os.listdir returns entries in arbitrary order; sorting keeps the chunk
    # order reproducible between runs and machines.
    for filename in sorted(os.listdir(folder)):
        if not filename.endswith(".txt"):
            continue
        path = os.path.join(folder, filename)
        if not os.path.isfile(path):
            # A directory whose name ends in ".txt" cannot be opened as text.
            continue
        with open(path, "r", encoding="utf-8") as f:
            text = f.read()
        chunks.extend(chunk_text(text))
    return chunks
# 📤 Send to the API
def embed_chunks(chunks, collection, timeout=(10, 300)):
    """POST *chunks* to ``API_URL`` and return the decoded JSON response.

    The request body is a JSON object with the keys ``"texts"`` (the chunks)
    and ``"collection"`` (the target collection name).

    Args:
        chunks: List of text chunks to send.
        collection: Name of the collection passed along with the chunks.
        timeout: Passed to ``requests.post``; either a single number of
            seconds or a ``(connect, read)`` tuple. Without a timeout the
            call can block indefinitely if the server never answers.

    Returns:
        Whatever the server's JSON response decodes to.

    Raises:
        requests.RequestException: On connection problems, timeouts, or an
            HTTP error status (via ``raise_for_status``).
        ValueError: If the response body is not valid JSON.
    """
    payload = {"texts": chunks, "collection": collection}
    response = requests.post(API_URL, json=payload, timeout=timeout)
    response.raise_for_status()
    return response.json()
# 🚀 Main logic: read and chunk all documents, then send them in one request.
if __name__ == "__main__":
    texts = read_all_text_files(SOURCE_DIR)
    if not texts:
        # Nothing to index is not treated as an error.
        print("⚠️ Keine Textabschnitte gefunden.")
        sys.exit(0)

    print(f"📦 {len(texts)} Textabschnitte gefunden, sende an {API_URL}...")
    try:
        result = embed_chunks(texts, COLLECTION)
    except (requests.RequestException, ValueError) as e:
        # RequestException covers connection/HTTP failures; ValueError covers
        # a response body that is not valid JSON.
        print(f"❌ Fehler beim Senden: {e}")
        # Exit non-zero so shells and schedulers can detect the failed upload.
        sys.exit(1)
    print(f"✅ Ergebnis: {result}")