"""Import a single text file from a knowledge category into the embedding API.

Usage:
    python import_single_file.py <category> <filename> [topic]

The file is read from ``~/knowledge/<category>/<filename>``, split into
chunks, posted to ``API_URL`` and, once the request succeeded, moved into the
category's ``_imported`` sub-directory.
"""
import os
import sys
import shutil
import requests
from datetime import datetime, timezone
from uuid import uuid4
from chunking_utils import (
    chunk_text_paragraphs,
    chunk_by_sentences,
    chunk_with_sentence_overlap
)

# Configuration
API_URL = "http://localhost:8000/embed"
CHUNK_SIZE = 500
OVERLAP = 100  # NOTE: not used by the chunking call below
# (connect, read) timeout in seconds so a stalled service cannot hang the
# script forever. NOTE(review): the read timeout is a guess -- tune it to how
# long the embedding endpoint really needs for large files.
REQUEST_TIMEOUT = (5, 300)


def _strip_txt_suffix(filename):
    """Return *filename* without a trailing ``.txt`` extension, if present."""
    if filename.endswith(".txt"):
        return filename[:-len(".txt")]
    return filename


def _build_payload(chunks, category, filename, topic, imported_at):
    """Build the JSON body for the embed endpoint.

    Args:
        chunks: Text chunks in document order.
        category: Knowledge category; also used as the target collection.
        filename: Name of the source file (without directory).
        topic: Optional context tag; falls back to ``"default"`` when falsy.
        imported_at: ISO-8601 timestamp shared by all chunks of this import.

    Returns:
        A dict with the keys ``"chunks"`` and ``"collection"``.
    """
    title = _strip_txt_suffix(filename)
    return {
        "chunks": [
            {
                "text": chunk,
                "source": filename,
                "source_type": "file",
                "title": title,
                "version": "v1.0",
                "related_to": category,
                "tags": [category],
                "owner": "karate-agent",
                "context_tag": topic or "default",
                "imported_at": imported_at,
                "chunk_index": index,
                "category": category,
            }
            for index, chunk in enumerate(chunks)
        ],
        "collection": category,
    }


def main(argv=None):
    """Run the import for the file named on the command line.

    Args:
        argv: Argument vector including the program name; defaults to
            ``sys.argv``.

    Returns:
        Process exit code: ``0`` on success, ``1`` on a usage error, a
        missing file or a failed API request.
    """
    args = sys.argv if argv is None else argv

    # Input parameters: category, file name, optional topic
    if len(args) < 3:
        print("? Aufruf: python import_single_file.py <category> <filename> [topic]")
        return 1

    category = args[1]
    filename = args[2]
    topic = args[3] if len(args) > 3 else None

    source_dir = os.path.expanduser(f"~/knowledge/{category}")
    archive_dir = os.path.join(source_dir, "_imported")
    filepath = os.path.join(source_dir, filename)

    # Validation: isfile() also rejects directories, which open() cannot read
    if not os.path.isfile(filepath):
        print(f"? Datei nicht gefunden: {filepath}")
        return 1

    os.makedirs(archive_dir, exist_ok=True)

    print(f"?? Importiere Datei: {filename} aus Kategorie: {category}")

    # Read the content and split it into chunks
    with open(filepath, "r", encoding="utf-8") as f:
        content = f.read()

    chunks = chunk_with_sentence_overlap(content, max_length=CHUNK_SIZE, overlap_sents=2)
    print(f"?? {len(chunks)} Textabschnitte erzeugt.")

    # Prepare metadata; one timestamp for the whole import
    now = datetime.now(timezone.utc).isoformat()
    payload = _build_payload(chunks, category, filename, topic, now)

    # Send to the API; only transport/HTTP errors are expected here
    try:
        res = requests.post(API_URL, json=payload, timeout=REQUEST_TIMEOUT)
        res.raise_for_status()
    except requests.RequestException as e:
        print(f"? Fehler beim Senden: {e}")
        return 1
    print(f"? {len(chunks)} Abschnitte erfolgreich eingebettet.")

    # Archive the file only after a successful import.
    # NOTE: shutil.move may overwrite an existing archive file of the same
    # name, depending on the platform's rename semantics.
    shutil.move(filepath, os.path.join(archive_dir, filename))
    print("?? Datei nach _imported verschoben.")
    return 0


if __name__ == "__main__":
    sys.exit(main())