Enhance chunking functionality in version 4.2.8: update the callout pattern so that edge and abstract callouts may carry additional text inside the brackets after the callout type; modify get_chunk_config to fall back to chunk_profile when chunking_profile is empty (the lookup is still guarded by the presence of the chunking_profile key in the frontmatter); pass chunk_profile explicitly to make_chunk_payloads so the profile name reaches the chunk payload; and import Optional in chunking_parser for its type hints.

This commit is contained in:
Lars 2026-01-11 11:49:16 +01:00
parent 1d66ca0649
commit 20fb1e92e2
4 changed files with 9 additions and 4 deletions

View File

@ -8,7 +8,7 @@ DESCRIPTION: Zerlegt Markdown in logische Einheiten (RawBlocks).
"""
import re
import os
from typing import List, Tuple, Set, Dict, Any
from typing import List, Tuple, Set, Dict, Any, Optional
from .chunking_models import RawBlock
from .chunking_utils import extract_frontmatter_from_text

View File

@ -147,7 +147,9 @@ async def assemble_chunks(note_id: str, md_text: str, note_type: str, config: Op
cleaned_lines = []
i = 0
callout_start_pattern = re.compile(r'^\s*>{1,}\s*\[!(edge|abstract)\]', re.IGNORECASE)
# NEU (v4.2.8):
# WP-24c v4.2.8: Callout-Pattern für Edge und Abstract
callout_start_pattern = re.compile(r'^>\s*\[!(edge|abstract)[^\]]*\]', re.IGNORECASE)
while i < len(lines):
line = lines[i]

View File

@ -46,7 +46,7 @@ def get_chunk_config(note_type: str, frontmatter: Optional[Dict[str, Any]] = Non
# WP-24c v4.2.5: Priorität: Frontmatter > Type-Def > Defaults
profile_name = None
if frontmatter and "chunking_profile" in frontmatter:
profile_name = frontmatter.get("chunking_profile")
profile_name = frontmatter.get("chunking_profile") or frontmatter.get("chunk_profile")
if not profile_name:
profile_name = type_def.get("chunking_profile")
if not profile_name:

View File

@ -252,7 +252,10 @@ class IngestionService:
new_pool.append(cand)
ch.candidate_pool = new_pool
chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks, file_path=file_path, types_cfg=self.registry)
# chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks, file_path=file_path, types_cfg=self.registry)
# v4.2.8 Fix C: Explizite Übergabe des Profil-Namens für den Chunk-Payload
chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks, file_path=file_path, types_cfg=self.registry, chunk_profile=profile)
vecs = await self.embedder.embed_documents([c.get("window") or "" for c in chunk_pls]) if chunk_pls else []
# WP-24c v4.2.0: Kanten-Extraktion mit Note-Scope Zonen Support