import re


def chunk_text_paragraphs(text, max_length=500, overlap=1):
    """Split ``text`` into paragraph-based chunks of at most ``max_length`` characters.

    Paragraphs are separated by two or more consecutive newlines. Each
    paragraph is stripped and empty paragraphs are discarded. Consecutive
    paragraphs are packed into a chunk (joined by a blank line) as long as
    the joined text does not exceed ``max_length``.

    Args:
        text: The input text to split.
        max_length: Maximum number of characters per chunk; must be >= 1.
        overlap: Number of trailing paragraphs of a finished chunk that are
            repeated at the start of the next chunk. Values <= 0 disable
            the overlap. Carried-over paragraphs are dropped (oldest first)
            when they would not fit together with the next paragraph.

    Returns:
        A list of chunk strings, each at most ``max_length`` characters long.
        A single paragraph longer than ``max_length`` is cut into
        ``max_length``-sized pieces; its last piece may be combined with
        the following paragraphs.

    Raises:
        ValueError: If ``max_length`` is smaller than 1.
    """
    if max_length < 1:
        # A non-positive limit can never be satisfied by a non-empty chunk.
        raise ValueError("max_length must be at least 1")

    separator = "\n\n"
    sep_len = len(separator)
    paragraphs = [p.strip() for p in re.split(r"\n{2,}", text) if p.strip()]

    chunks = []
    current = []
    # Invariant: current_len == len(separator.join(current)).
    current_len = 0

    for para in paragraphs:
        if current and current_len + sep_len + len(para) > max_length:
            # The paragraph does not fit: emit the chunk built so far.
            chunks.append(separator.join(current))

            # Carry the last `overlap` paragraphs over into the next chunk.
            current = current[-overlap:] if overlap > 0 else []
            current_len = (
                sum(map(len, current)) + sep_len * max(len(current) - 1, 0)
            )

            # Drop carried-over paragraphs that leave no room for the new
            # one; otherwise the same chunk would be emitted over and over.
            while current and current_len + sep_len + len(para) > max_length:
                dropped = current.pop(0)
                current_len -= len(dropped) + (sep_len if current else 0)

        if not current and len(para) > max_length:
            # A single paragraph exceeds the limit: hard-split it. All full
            # pieces become chunks of their own; the last piece starts the
            # next chunk so it can still be combined with what follows.
            pieces = [
                para[start:start + max_length]
                for start in range(0, len(para), max_length)
            ]
            chunks.extend(pieces[:-1])
            para = pieces[-1]

        # The separator is only counted between paragraphs, not before the
        # first one, so the tracked length matches the joined string exactly.
        current_len += len(para) + (sep_len if current else 0)
        current.append(para)

    if current:
        chunks.append(separator.join(current))
    return chunks