Trainer_LLM/scripts/archiv/text_chunker.py

47 lines
1.5 KiB
Python

import re
def chunk_text_paragraphs(text, max_length=500, overlap=1):
    """Split text into paragraph-aligned chunks of at most ``max_length`` characters.

    Paragraphs are the stripped, non-empty pieces between runs of two or
    more newlines. They are packed greedily into chunks and joined with a
    blank line. When a chunk is full, its last ``overlap`` paragraphs are
    repeated at the start of the next chunk, as far as they still fit in
    front of the next paragraph. A paragraph longer than ``max_length`` is
    cut into ``max_length``-sized pieces; its final, shorter piece is packed
    like a normal paragraph.

    Args:
        text: Input text. Empty or ``None`` input yields an empty list.
        max_length: Maximum number of characters per chunk, separators
            included. Must be positive.
        overlap: Number of trailing paragraphs carried over into the next
            chunk. Values of zero or less disable the overlap.

    Returns:
        List of chunk strings in document order.

    Raises:
        ValueError: If ``max_length`` is not positive.
    """
    if max_length <= 0:
        # A non-positive limit can never be satisfied and would make the
        # hard-split step below loop without consuming any text.
        raise ValueError("max_length must be a positive integer")
    if not text:
        return []

    separator = "\n\n"
    sep_len = len(separator)
    paragraphs = [p.strip() for p in re.split(r'\n{2,}', text) if p.strip()]

    chunks = []
    current_chunk = []
    current_len = 0  # invariant: len(separator.join(current_chunk))

    for para in paragraphs:
        if current_chunk and current_len + sep_len + len(para) > max_length:
            # The paragraph does not fit: emit the chunk built so far.
            chunks.append(separator.join(current_chunk))

            # Carry the last `overlap` paragraphs into the next chunk.
            carried = current_chunk[-overlap:] if overlap > 0 else []
            carried_len = (
                sum(len(p) for p in carried)
                + sep_len * max(len(carried) - 1, 0)
            )
            # Drop the oldest carried paragraphs until the new paragraph
            # fits behind them. Without this, an overlap that leaves no
            # room for the next paragraph would be re-emitted forever.
            while carried and carried_len + sep_len + len(para) > max_length:
                dropped = carried.pop(0)
                carried_len -= len(dropped) + (sep_len if carried else 0)

            current_chunk = carried
            current_len = carried_len

        if not current_chunk:
            # A single paragraph larger than the limit is hard-split; only
            # full-size pieces are emitted here, so no empty remainder can
            # ever be produced.
            while len(para) > max_length:
                chunks.append(para[:max_length])
                para = para[max_length:]

        # The separator only exists between paragraphs, not before the first.
        current_len += len(para) + (sep_len if current_chunk else 0)
        current_chunk.append(para)

    if current_chunk:
        chunks.append(separator.join(current_chunk))
    return chunks