47 lines
1.5 KiB
Python
47 lines
1.5 KiB
Python
import re
|
|
|
|
def chunk_text_paragraphs(text, max_length=500, overlap=1):
    """Split text paragraph-wise into chunks of at most ``max_length`` characters.

    Paragraphs are separated by two or more consecutive newlines; each
    paragraph is stripped and empty ones are discarded. Paragraphs are packed
    greedily into chunks and joined with a blank line (``"\\n\\n"``).

    Args:
        text: The input text. Empty or ``None`` input yields ``[]``.
        max_length: Maximum length of each returned chunk, in characters.
        overlap: Number of trailing paragraphs of the previous chunk to repeat
            at the start of the next chunk. Values ``<= 0`` disable overlap.
            Overlap paragraphs are dropped (oldest first) when they would not
            fit into ``max_length`` together with the next paragraph.

    Returns:
        A list of chunk strings, each no longer than ``max_length``. Every
        paragraph appears in at least one chunk. A single paragraph longer
        than ``max_length`` is hard-split into consecutive slices of
        ``max_length`` characters.

    Raises:
        ValueError: If ``max_length`` is not positive (hard-splitting could
            never make progress).
    """
    if max_length <= 0:
        raise ValueError("max_length must be a positive integer")
    if not text:
        return []

    separator = "\n\n"
    sep_len = len(separator)
    keep = max(overlap, 0)

    paragraphs = [p.strip() for p in re.split(r'\n{2,}', text) if p.strip()]

    chunks = []
    current = []     # paragraphs of the chunk currently being built
    current_len = 0  # always equals len(separator.join(current))

    for para in paragraphs:
        # The separator is only needed when the chunk already has content.
        added = len(para) + (sep_len if current else 0)
        if current_len + added <= max_length:
            current.append(para)
            current_len += added
            continue

        # The paragraph does not fit: close the running chunk and remember
        # the trailing paragraphs that may be carried over as overlap.
        carried = []
        if current:
            chunks.append(separator.join(current))
            if keep:  # guard: current[-0:] would copy the whole list
                carried = current[-keep:]

        if len(para) > max_length:
            # A single paragraph exceeds the limit: hard-split it. Full-size
            # slices become chunks of their own (no room for overlap); the
            # last slice seeds the next chunk so later paragraphs can join it.
            pieces = [
                para[start:start + max_length]
                for start in range(0, len(para), max_length)
            ]
            chunks.extend(pieces[:-1])
            current = [pieces[-1]]
            current_len = len(pieces[-1])
            continue

        # Drop the oldest overlap paragraphs until overlap + paragraph fits,
        # so the size limit is never violated by the carried-over context.
        while carried and (
            len(separator.join(carried)) + sep_len + len(para) > max_length
        ):
            del carried[0]

        current = carried + [para]
        current_len = len(separator.join(current))

    if current:
        chunks.append(separator.join(current))

    return chunks
|