Enhance chunking functionality in version 4.2.8: Update the callout pattern so that edge and abstract callouts may carry additional text inside the brackets after the callout type. Modify get_chunk_config to fall back to chunk_profile when the frontmatter's chunking_profile entry is present but empty. Pass chunk_profile explicitly to make_chunk_payloads for improved payload handling. Update type hints in chunking_parser for better clarity.
This commit is contained in:
parent
1d66ca0649
commit
20fb1e92e2
|
|
@ -8,7 +8,7 @@ DESCRIPTION: Zerlegt Markdown in logische Einheiten (RawBlocks).
|
||||||
"""
|
"""
|
||||||
import re
|
import re
|
||||||
import os
|
import os
|
||||||
from typing import List, Tuple, Set, Dict, Any
|
from typing import List, Tuple, Set, Dict, Any, Optional
|
||||||
from .chunking_models import RawBlock
|
from .chunking_models import RawBlock
|
||||||
from .chunking_utils import extract_frontmatter_from_text
|
from .chunking_utils import extract_frontmatter_from_text
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -147,7 +147,9 @@ async def assemble_chunks(note_id: str, md_text: str, note_type: str, config: Op
|
||||||
cleaned_lines = []
|
cleaned_lines = []
|
||||||
i = 0
|
i = 0
|
||||||
|
|
||||||
callout_start_pattern = re.compile(r'^\s*>{1,}\s*\[!(edge|abstract)\]', re.IGNORECASE)
|
# NEU (v4.2.8):
|
||||||
|
# WP-24c v4.2.8: Callout-Pattern für Edge und Abstract
|
||||||
|
callout_start_pattern = re.compile(r'^>\s*\[!(edge|abstract)[^\]]*\]', re.IGNORECASE)
|
||||||
|
|
||||||
while i < len(lines):
|
while i < len(lines):
|
||||||
line = lines[i]
|
line = lines[i]
|
||||||
|
|
|
||||||
|
|
@ -46,7 +46,7 @@ def get_chunk_config(note_type: str, frontmatter: Optional[Dict[str, Any]] = Non
|
||||||
# WP-24c v4.2.5: Priorität: Frontmatter > Type-Def > Defaults
|
# WP-24c v4.2.5: Priorität: Frontmatter > Type-Def > Defaults
|
||||||
profile_name = None
|
profile_name = None
|
||||||
if frontmatter and "chunking_profile" in frontmatter:
|
if frontmatter and "chunking_profile" in frontmatter:
|
||||||
profile_name = frontmatter.get("chunking_profile")
|
profile_name = frontmatter.get("chunking_profile") or frontmatter.get("chunk_profile")
|
||||||
if not profile_name:
|
if not profile_name:
|
||||||
profile_name = type_def.get("chunking_profile")
|
profile_name = type_def.get("chunking_profile")
|
||||||
if not profile_name:
|
if not profile_name:
|
||||||
|
|
|
||||||
|
|
@ -252,7 +252,10 @@ class IngestionService:
|
||||||
new_pool.append(cand)
|
new_pool.append(cand)
|
||||||
ch.candidate_pool = new_pool
|
ch.candidate_pool = new_pool
|
||||||
|
|
||||||
chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks, file_path=file_path, types_cfg=self.registry)
|
# chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks, file_path=file_path, types_cfg=self.registry)
|
||||||
|
# v4.2.8 Fix C: Explizite Übergabe des Profil-Namens für den Chunk-Payload
|
||||||
|
chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks, file_path=file_path, types_cfg=self.registry, chunk_profile=profile)
|
||||||
|
|
||||||
vecs = await self.embedder.embed_documents([c.get("window") or "" for c in chunk_pls]) if chunk_pls else []
|
vecs = await self.embedder.embed_documents([c.get("window") or "" for c in chunk_pls]) if chunk_pls else []
|
||||||
|
|
||||||
# WP-24c v4.2.0: Kanten-Extraktion mit Note-Scope Zonen Support
|
# WP-24c v4.2.0: Kanten-Extraktion mit Note-Scope Zonen Support
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user