35 lines
1.1 KiB
Python
35 lines
1.1 KiB
Python
import re
|
|
|
|
def chunk_text_paragraphs(text: str, max_length: int = 500) -> list[str]:
    """Split *text* into chunks of at most *max_length* characters.

    The text is first split into paragraphs (runs of text separated by at
    least one blank or whitespace-only line). Paragraphs are then packed
    greedily: consecutive paragraphs are joined with a blank line between
    them for as long as the joined chunk stays within *max_length*.

    A paragraph that is longer than *max_length* on its own is cut into
    fixed-size slices of *max_length* characters (the last slice may be
    shorter). These slices are emitted as separate chunks and are never
    merged with the paragraphs before or after them.

    Args:
        text: The input text; leading/trailing whitespace is ignored.
        max_length: Upper bound for the length of every returned chunk.
            Must be at least 1.

    Returns:
        The chunks in document order. Empty or whitespace-only input
        yields an empty list.

    Raises:
        ValueError: If *max_length* is smaller than 1.
    """
    # A non-positive limit would either crash inside range() (step 0) or,
    # worse, silently drop every paragraph (negative step -> empty range).
    if max_length < 1:
        raise ValueError(
            f"max_length must be a positive integer, got {max_length!r}"
        )

    chunks: list[str] = []
    current_chunk = ""

    # Split into paragraphs on blank (or whitespace-only) lines.
    for para in re.split(r"\n\s*\n", text.strip()):
        para = para.strip()
        if not para:
            continue

        # Does the paragraph still fit onto the current chunk? The +2
        # accounts for the "\n\n" separator inserted between paragraphs.
        if current_chunk and len(current_chunk) + 2 + len(para) <= max_length:
            current_chunk = f"{current_chunk}\n\n{para}"
            continue

        # It does not fit: close the chunk collected so far.
        if current_chunk:
            chunks.append(current_chunk)
            current_chunk = ""

        if len(para) <= max_length:
            # Paragraph fits on its own and starts a new chunk.
            current_chunk = para
        else:
            # Paragraph alone exceeds the limit: hard-split it into slices.
            chunks.extend(
                para[start:start + max_length]
                for start in range(0, len(para), max_length)
            )

    # Do not forget the chunk that is still being collected.
    if current_chunk:
        chunks.append(current_chunk)

    return chunks
|