Trainer_LLM/scripts/archiv/index_documents.py

67 lines
1.9 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
import sys
import requests
# Configuration: embedding endpoint and the chunking parameters used as
# defaults by chunk_text (chunk length and overlap, counted in characters).
API_URL = "http://localhost:8000/embed"
CHUNK_SIZE = 500
OVERLAP = 100

# Command-line handling: exactly one positional argument, the category name.
# The category selects the source folder (~/knowledge/<category>) and is
# also used verbatim as the name of the target collection.
if len(sys.argv) != 2:
    print("❌ Bitte gib eine Kategorie an, z.B.: python index_documents.py karatetrainer")
    sys.exit(1)

CATEGORY = sys.argv[1]
SOURCE_DIR = os.path.expanduser(f"~/knowledge/{CATEGORY}")
COLLECTION = CATEGORY

# Use isdir rather than exists: a regular file at this path would pass a
# plain existence check and only fail later when the folder is listed.
if not os.path.isdir(SOURCE_DIR):
    print(f"❌ Der Ordner '{SOURCE_DIR}' existiert nicht.")
    sys.exit(1)

print(f"📁 Lese Dokumente aus: {SOURCE_DIR}")
print(f"🎯 Ziel-Collection: {COLLECTION}")
# Split text into overlapping chunks.
def chunk_text(text, size=CHUNK_SIZE, overlap=OVERLAP):
    """Split *text* into consecutive chunks that overlap by *overlap* characters.

    Args:
        text: The string to split.
        size: Maximum length of each chunk; must be positive.
        overlap: Number of characters at the end of one chunk that are
            repeated at the start of the next; must satisfy
            ``0 <= overlap < size``.

    Returns:
        A list of substrings of *text*, in order. Every chunk except
        possibly the last has length *size*. An empty *text* yields ``[]``.

    Raises:
        ValueError: If *size* is not positive or *overlap* is outside
            ``[0, size)``. With ``overlap >= size`` the start index would
            never advance and the loop would not terminate.
    """
    if size <= 0:
        raise ValueError(f"size must be positive, got {size}")
    if not 0 <= overlap < size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < size, got overlap={overlap}, size={size}"
        )

    step = size - overlap
    chunks = []
    start = 0
    while start < len(text):
        end = min(start + size, len(text))
        chunks.append(text[start:end])
        if end == len(text):
            # The text is exhausted. Advancing by `step` again would only
            # emit a tail fragment that this chunk already contains in full.
            break
        start += step
    return chunks
# Read all .txt files in the folder.
def read_all_text_files(folder):
    """Read every ``.txt`` file directly inside *folder* and return its chunks.

    Entries are processed in sorted name order so that the resulting chunk
    sequence is reproducible between runs (``os.listdir`` returns entries in
    arbitrary order). Entries whose name ends in ``.txt`` but which are not
    regular files (for example a directory) are skipped instead of failing
    in ``open``. Sub-folders are not searched.

    Args:
        folder: Path of the directory to scan.

    Returns:
        A flat list with the chunks that ``chunk_text`` produces for each
        file, concatenated in file order. Empty when no file matches.

    Raises:
        OSError: If *folder* cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    chunks = []
    for filename in sorted(os.listdir(folder)):
        if not filename.endswith(".txt"):
            continue
        path = os.path.join(folder, filename)
        if not os.path.isfile(path):
            continue
        with open(path, "r", encoding="utf-8") as f:
            chunks.extend(chunk_text(f.read()))
    return chunks
# Send to the API.
def embed_chunks(chunks, collection, timeout=(10, 600)):
    """POST *chunks* to the embedding endpoint in a single request.

    Args:
        chunks: List of text chunks; sent under the JSON key ``"texts"``.
        collection: Target collection name; sent under the JSON key
            ``"collection"``.
        timeout: Passed through to ``requests.post``. The default is a
            ``(connect, read)`` pair in seconds, with a generous read limit
            because the server handles all chunks in one request. Pass
            ``None`` to wait indefinitely, which was the previous behaviour.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.RequestException: On connection failure, timeout, or when
            ``raise_for_status`` rejects the response status.
        ValueError: If the response body is not valid JSON.
    """
    response = requests.post(
        API_URL,
        json={"texts": chunks, "collection": collection},
        # Without a timeout a stalled server would block the script forever.
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()
# Main logic: collect the chunks of all documents and upload them in one call.
if __name__ == "__main__":
    texts = read_all_text_files(SOURCE_DIR)
    if not texts:
        # Nothing to index is not an error, so exit successfully.
        print("⚠️ Keine Textabschnitte gefunden.")
        sys.exit(0)

    print(f"📦 {len(texts)} Textabschnitte gefunden, sende an {API_URL}...")
    try:
        result = embed_chunks(texts, COLLECTION)
    except Exception as e:
        # Top-level boundary: report any failure, then signal it through the
        # exit status so that shells and schedulers can detect a failed run.
        print(f"❌ Fehler beim Senden: {e}")
        sys.exit(1)
    print(f"✅ Ergebnis: {result}")