scriptAudit #11
|
|
@ -1,7 +1,7 @@
|
||||||
"""
|
"""
|
||||||
FILE: app/core/chunker.py
|
FILE: app/core/chunker.py
|
||||||
DESCRIPTION: Zerlegt Texte in Chunks (Sliding Window oder nach Headings). Orchestriert die Smart-Edge-Allocation via SemanticAnalyzer.
|
DESCRIPTION: Zerlegt Texte in Chunks (Sliding Window oder nach Headings). Orchestriert die Smart-Edge-Allocation via SemanticAnalyzer.
|
||||||
VERSION: 2.6.0 (Fix: Strict Heading Split & Header Retention)
|
VERSION: 2.9.0 (Feat: Hybrid Strict Splitting with Size Safety)
|
||||||
STATUS: Active
|
STATUS: Active
|
||||||
DEPENDENCIES: app.services.semantic_analyzer, app.core.derive_edges, markdown_it, yaml, asyncio
|
DEPENDENCIES: app.services.semantic_analyzer, app.core.derive_edges, markdown_it, yaml, asyncio
|
||||||
EXTERNAL_CONFIG: config/types.yaml
|
EXTERNAL_CONFIG: config/types.yaml
|
||||||
|
|
@ -25,7 +25,7 @@ from app.services.semantic_analyzer import get_semantic_analyzer
|
||||||
try:
|
try:
|
||||||
from app.core.derive_edges import build_edges_for_note
|
from app.core.derive_edges import build_edges_for_note
|
||||||
except ImportError:
|
except ImportError:
|
||||||
# Mock für Tests, falls Module fehlen
|
# Mock für Tests
|
||||||
def build_edges_for_note(note_id, chunks, note_level_references=None, include_note_scope_refs=False): return []
|
def build_edges_for_note(note_id, chunks, note_level_references=None, include_note_scope_refs=False): return []
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
@ -123,9 +123,13 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
|
||||||
for line in lines:
|
for line in lines:
|
||||||
stripped = line.strip()
|
stripped = line.strip()
|
||||||
if stripped.startswith('# '):
|
if stripped.startswith('# '):
|
||||||
# H1 wird für den Titel genutzt, aber nicht als Block für sliding window
|
if buffer:
|
||||||
# (Außer es ist H1 im Body, aber wir ignorieren H1 hier meist als Title)
|
content = "\n".join(buffer).strip()
|
||||||
continue
|
if content:
|
||||||
|
blocks.append(RawBlock("paragraph", content, None, section_path, current_h2))
|
||||||
|
buffer = []
|
||||||
|
blocks.append(RawBlock("heading", stripped, 1, section_path, current_h2))
|
||||||
|
|
||||||
elif stripped.startswith('## '):
|
elif stripped.startswith('## '):
|
||||||
if buffer:
|
if buffer:
|
||||||
content = "\n".join(buffer).strip()
|
content = "\n".join(buffer).strip()
|
||||||
|
|
@ -134,8 +138,16 @@ def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
|
||||||
buffer = []
|
buffer = []
|
||||||
current_h2 = stripped[3:].strip()
|
current_h2 = stripped[3:].strip()
|
||||||
section_path = f"/{current_h2}"
|
section_path = f"/{current_h2}"
|
||||||
# WICHTIG: Die Überschrift selbst als Block speichern!
|
|
||||||
blocks.append(RawBlock("heading", stripped, 2, section_path, current_h2))
|
blocks.append(RawBlock("heading", stripped, 2, section_path, current_h2))
|
||||||
|
|
||||||
|
elif stripped.startswith('### '):
|
||||||
|
if buffer:
|
||||||
|
content = "\n".join(buffer).strip()
|
||||||
|
if content:
|
||||||
|
blocks.append(RawBlock("paragraph", content, None, section_path, current_h2))
|
||||||
|
buffer = []
|
||||||
|
blocks.append(RawBlock("heading", stripped, 3, section_path, current_h2))
|
||||||
|
|
||||||
elif not stripped:
|
elif not stripped:
|
||||||
if buffer:
|
if buffer:
|
||||||
content = "\n".join(buffer).strip()
|
content = "\n".join(buffer).strip()
|
||||||
|
|
@ -175,19 +187,18 @@ def _strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], not
|
||||||
text_body = "\n\n".join([b.text for b in buf])
|
text_body = "\n\n".join([b.text for b in buf])
|
||||||
win_body = f"{context_prefix}\n{text_body}".strip() if context_prefix else text_body
|
win_body = f"{context_prefix}\n{text_body}".strip() if context_prefix else text_body
|
||||||
|
|
||||||
if estimate_tokens(text_body) <= max_tokens:
|
|
||||||
sec = buf[0].section_title if buf else None
|
|
||||||
path = buf[0].section_path if buf else "/"
|
|
||||||
_create_chunk_obj(chunks, note_id, text_body, win_body, sec, path)
|
|
||||||
else:
|
|
||||||
sentences = split_sentences(text_body)
|
|
||||||
current_chunk_sents = []
|
|
||||||
current_len = 0
|
|
||||||
|
|
||||||
# Basis-Info vom ersten Block im Buffer
|
# Basis-Info vom ersten Block im Buffer
|
||||||
sec = buf[0].section_title if buf else None
|
sec = buf[0].section_title if buf else None
|
||||||
path = buf[0].section_path if buf else "/"
|
path = buf[0].section_path if buf else "/"
|
||||||
|
|
||||||
|
if estimate_tokens(text_body) <= max_tokens:
|
||||||
|
_create_chunk_obj(chunks, note_id, text_body, win_body, sec, path)
|
||||||
|
else:
|
||||||
|
# Fallback: Wenn Block zu groß, intern splitten (Sentence-Level)
|
||||||
|
sentences = split_sentences(text_body)
|
||||||
|
current_chunk_sents = []
|
||||||
|
current_len = 0
|
||||||
|
|
||||||
for sent in sentences:
|
for sent in sentences:
|
||||||
sent_len = estimate_tokens(sent)
|
sent_len = estimate_tokens(sent)
|
||||||
if current_len + sent_len > target and current_chunk_sents:
|
if current_len + sent_len > target and current_chunk_sents:
|
||||||
|
|
@ -219,11 +230,7 @@ def _strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], not
|
||||||
buf = []
|
buf = []
|
||||||
|
|
||||||
for b in blocks:
|
for b in blocks:
|
||||||
# Bei Sliding Window ignorieren wir Heading-Blocks als Split-Trigger NICHT zwingend,
|
|
||||||
# aber wir wollen Headings oft nicht "allein" stehen haben.
|
|
||||||
# Hier einfache Logik:
|
|
||||||
if b.kind == "heading":
|
if b.kind == "heading":
|
||||||
# Optional: Buffer flushen bei neuem Header, um Kontextwechsel sauberer zu machen
|
|
||||||
flush_buffer()
|
flush_buffer()
|
||||||
|
|
||||||
current_buf_text = "\n\n".join([x.text for x in buf])
|
current_buf_text = "\n\n".join([x.text for x in buf])
|
||||||
|
|
@ -237,30 +244,34 @@ def _strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], not
|
||||||
|
|
||||||
def _strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id: str, doc_title: str = "") -> List[Chunk]:
|
def _strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id: str, doc_title: str = "") -> List[Chunk]:
|
||||||
"""
|
"""
|
||||||
STRICT HEADING SPLIT (Fix v2.6.0):
|
MODUS: Structured / Heading Split
|
||||||
Trennt den Text konsequent an jeder Überschrift der definierten Ebene.
|
- split_level: Ebene für logische Trennung (z.B. H2).
|
||||||
Behält Überschriften als Teil (erste Zeile) des Chunks bei.
|
- strict_heading_split:
|
||||||
Kein Merging kleiner Abschnitte über Header-Grenzen hinweg.
|
True: Trennt an jedem Header <= split_level.
|
||||||
|
NEU v2.9: Wenn Inhalt > max_tokens, wird trotzdem gesplittet (Safety Split).
|
||||||
|
False: Fasst zusammen bis 'target' erreicht ist.
|
||||||
"""
|
"""
|
||||||
split_level = config.get("split_level", 2)
|
split_level = config.get("split_level", 2)
|
||||||
chunks = []
|
target = config.get("target", 400)
|
||||||
|
max_limit = config.get("max", 600)
|
||||||
|
strict_mode = config.get("strict_heading_split", False)
|
||||||
|
|
||||||
# Temporärer Speicher für den aktuellen Chunk
|
chunks = []
|
||||||
current_chunk_blocks = []
|
current_chunk_blocks = []
|
||||||
|
|
||||||
context_prefix = f"# {doc_title}"
|
context_prefix = f"# {doc_title}"
|
||||||
|
|
||||||
|
def has_content(blk_list):
|
||||||
|
return any(b.kind != "heading" for b in blk_list)
|
||||||
|
|
||||||
def flush_current_chunk():
|
def flush_current_chunk():
|
||||||
nonlocal current_chunk_blocks
|
nonlocal current_chunk_blocks
|
||||||
if not current_chunk_blocks:
|
if not current_chunk_blocks:
|
||||||
return
|
return
|
||||||
|
|
||||||
# Text zusammenbauen
|
|
||||||
text_body = "\n\n".join([b.text for b in current_chunk_blocks])
|
text_body = "\n\n".join([b.text for b in current_chunk_blocks])
|
||||||
# Window bauen (hier einfach Text, da Kontext via Header implizit ist)
|
|
||||||
win_body = f"{context_prefix}\n{text_body}".strip()
|
win_body = f"{context_prefix}\n{text_body}".strip()
|
||||||
|
|
||||||
# Metadaten vom ersten Block (üblicherweise der Header) nehmen
|
|
||||||
first_b = current_chunk_blocks[0]
|
first_b = current_chunk_blocks[0]
|
||||||
sec = first_b.section_title
|
sec = first_b.section_title
|
||||||
path = first_b.section_path
|
path = first_b.section_path
|
||||||
|
|
@ -268,18 +279,48 @@ def _strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id
|
||||||
_create_chunk_obj(chunks, note_id, text_body, win_body, sec, path)
|
_create_chunk_obj(chunks, note_id, text_body, win_body, sec, path)
|
||||||
current_chunk_blocks = []
|
current_chunk_blocks = []
|
||||||
|
|
||||||
|
def get_current_size():
|
||||||
|
txt = "\n\n".join([b.text for b in current_chunk_blocks])
|
||||||
|
return estimate_tokens(txt)
|
||||||
|
|
||||||
for b in blocks:
|
for b in blocks:
|
||||||
# Prüfen, ob dieser Block ein Trenner (Header auf Split-Level) ist
|
# 1. Header Logic (Struktur-Trigger)
|
||||||
is_splitter = (b.kind == "heading" and b.level == split_level)
|
is_splitter = (b.kind == "heading" and b.level is not None and b.level <= split_level)
|
||||||
|
|
||||||
if is_splitter:
|
if is_splitter:
|
||||||
# 1. Den bisherigen Chunk abschließen (falls vorhanden)
|
is_higher_hierarchy = (b.level < split_level)
|
||||||
flush_current_chunk()
|
|
||||||
|
|
||||||
# 2. Den neuen Chunk mit diesem Header beginnen
|
if strict_mode:
|
||||||
|
# STRICT:
|
||||||
|
# Wir splitten immer, außer der Vor-Chunk ist leer.
|
||||||
|
if current_chunk_blocks and has_content(current_chunk_blocks):
|
||||||
|
flush_current_chunk()
|
||||||
|
current_chunk_blocks.append(b)
|
||||||
|
else:
|
||||||
|
# SOFT:
|
||||||
|
# Split bei Hierarchie-Wechsel ODER wenn voll.
|
||||||
|
if is_higher_hierarchy:
|
||||||
|
flush_current_chunk()
|
||||||
|
current_chunk_blocks.append(b)
|
||||||
|
elif current_chunk_blocks and get_current_size() >= target:
|
||||||
|
flush_current_chunk()
|
||||||
|
current_chunk_blocks.append(b)
|
||||||
|
else:
|
||||||
|
current_chunk_blocks.append(b)
|
||||||
|
else:
|
||||||
|
# 2. Content Logic (Safety Trigger für Monster-Abschnitte)
|
||||||
|
# Bevor wir den Block anhängen: Würde er das Fass zum Überlaufen bringen?
|
||||||
|
# Wir nutzen hier 'max' als harte Grenze für den Safety-Split.
|
||||||
|
current_size = get_current_size()
|
||||||
|
block_size = estimate_tokens(b.text)
|
||||||
|
|
||||||
|
if current_chunk_blocks and (current_size + block_size > max_limit):
|
||||||
|
# NOTBREMSE: Chunk wird zu groß.
|
||||||
|
# Wir splitten hier, auch wenn kein Header da ist.
|
||||||
|
# Der Kontext (Section Title) bleibt erhalten, da er aus `current_h2` kommt (siehe parse_blocks).
|
||||||
|
flush_current_chunk()
|
||||||
current_chunk_blocks.append(b)
|
current_chunk_blocks.append(b)
|
||||||
else:
|
else:
|
||||||
# Einfach anhängen
|
|
||||||
current_chunk_blocks.append(b)
|
current_chunk_blocks.append(b)
|
||||||
|
|
||||||
# Letzten Rest flushen
|
# Letzten Rest flushen
|
||||||
|
|
@ -301,14 +342,12 @@ async def assemble_chunks(note_id: str, md_text: str, note_type: str, config: Op
|
||||||
primary_strategy = config.get("strategy", "sliding_window")
|
primary_strategy = config.get("strategy", "sliding_window")
|
||||||
enable_smart_edges = config.get("enable_smart_edge_allocation", False)
|
enable_smart_edges = config.get("enable_smart_edge_allocation", False)
|
||||||
|
|
||||||
# Performance/Cost-Guard: Bei Entwürfen keine Smart Edges
|
|
||||||
if enable_smart_edges and note_status in ["draft", "initial_gen"]:
|
if enable_smart_edges and note_status in ["draft", "initial_gen"]:
|
||||||
logger.info(f"Chunker: Skipping Smart Edges for draft '{note_id}'.")
|
logger.info(f"Chunker: Skipping Smart Edges for draft '{note_id}'.")
|
||||||
enable_smart_edges = False
|
enable_smart_edges = False
|
||||||
|
|
||||||
blocks, doc_title = parse_blocks(md_text)
|
blocks, doc_title = parse_blocks(md_text)
|
||||||
|
|
||||||
# Strategie-Auswahl
|
|
||||||
if primary_strategy == "by_heading":
|
if primary_strategy == "by_heading":
|
||||||
chunks = await asyncio.to_thread(_strategy_by_heading, blocks, config, note_id, doc_title)
|
chunks = await asyncio.to_thread(_strategy_by_heading, blocks, config, note_id, doc_title)
|
||||||
else:
|
else:
|
||||||
|
|
@ -317,11 +356,9 @@ async def assemble_chunks(note_id: str, md_text: str, note_type: str, config: Op
|
||||||
if not chunks:
|
if not chunks:
|
||||||
return []
|
return []
|
||||||
|
|
||||||
# Smart Edge Allocation (WP-15)
|
|
||||||
if enable_smart_edges:
|
if enable_smart_edges:
|
||||||
chunks = await _run_smart_edge_allocation(chunks, md_text, note_id, note_type)
|
chunks = await _run_smart_edge_allocation(chunks, md_text, note_id, note_type)
|
||||||
|
|
||||||
# Verkettung der Chunks (next/prev)
|
|
||||||
for i, ch in enumerate(chunks):
|
for i, ch in enumerate(chunks):
|
||||||
ch.neighbors_prev = chunks[i-1].id if i > 0 else None
|
ch.neighbors_prev = chunks[i-1].id if i > 0 else None
|
||||||
ch.neighbors_next = chunks[i+1].id if i < len(chunks)-1 else None
|
ch.neighbors_next = chunks[i+1].id if i < len(chunks)-1 else None
|
||||||
|
|
@ -329,10 +366,6 @@ async def assemble_chunks(note_id: str, md_text: str, note_type: str, config: Op
|
||||||
return chunks
|
return chunks
|
||||||
|
|
||||||
def _extract_all_edges_from_md(md_text: str, note_id: str, note_type: str) -> List[str]:
|
def _extract_all_edges_from_md(md_text: str, note_id: str, note_type: str) -> List[str]:
|
||||||
"""
|
|
||||||
Hilfsfunktion: Erstellt einen Dummy-Chunk für den gesamten Text und ruft
|
|
||||||
den Edge-Parser auf, um ALLE Kanten der Notiz zu finden.
|
|
||||||
"""
|
|
||||||
dummy_chunk = {
|
dummy_chunk = {
|
||||||
"chunk_id": f"{note_id}#full",
|
"chunk_id": f"{note_id}#full",
|
||||||
"text": md_text,
|
"text": md_text,
|
||||||
|
|
@ -340,7 +373,6 @@ def _extract_all_edges_from_md(md_text: str, note_id: str, note_type: str) -> Li
|
||||||
"window": md_text,
|
"window": md_text,
|
||||||
"type": note_type
|
"type": note_type
|
||||||
}
|
}
|
||||||
# Parsing aller Kanten (Inline, Wikilinks, Callouts)
|
|
||||||
raw_edges = build_edges_for_note(
|
raw_edges = build_edges_for_note(
|
||||||
note_id,
|
note_id,
|
||||||
[dummy_chunk],
|
[dummy_chunk],
|
||||||
|
|
@ -351,29 +383,23 @@ def _extract_all_edges_from_md(md_text: str, note_id: str, note_type: str) -> Li
|
||||||
for e in raw_edges:
|
for e in raw_edges:
|
||||||
kind = e.get("kind")
|
kind = e.get("kind")
|
||||||
target = e.get("target_id")
|
target = e.get("target_id")
|
||||||
# Struktur-Kanten ignorieren wir für die Verteilung
|
|
||||||
if target and kind not in ["belongs_to", "next", "prev", "backlink"]:
|
if target and kind not in ["belongs_to", "next", "prev", "backlink"]:
|
||||||
all_candidates.add(f"{kind}:{target}")
|
all_candidates.add(f"{kind}:{target}")
|
||||||
|
|
||||||
return list(all_candidates)
|
return list(all_candidates)
|
||||||
|
|
||||||
async def _run_smart_edge_allocation(chunks: List[Chunk], full_text: str, note_id: str, note_type: str) -> List[Chunk]:
|
async def _run_smart_edge_allocation(chunks: List[Chunk], full_text: str, note_id: str, note_type: str) -> List[Chunk]:
|
||||||
analyzer = get_semantic_analyzer()
|
analyzer = get_semantic_analyzer()
|
||||||
|
|
||||||
# A. Alle potenziellen Kanten der Notiz sammeln
|
|
||||||
candidate_list = _extract_all_edges_from_md(full_text, note_id, note_type)
|
candidate_list = _extract_all_edges_from_md(full_text, note_id, note_type)
|
||||||
|
|
||||||
if not candidate_list:
|
if not candidate_list:
|
||||||
return chunks
|
return chunks
|
||||||
|
|
||||||
# B. LLM Filterung pro Chunk (Parallel)
|
|
||||||
tasks = []
|
tasks = []
|
||||||
for chunk in chunks:
|
for chunk in chunks:
|
||||||
tasks.append(analyzer.assign_edges_to_chunk(chunk.text, candidate_list, note_type))
|
tasks.append(analyzer.assign_edges_to_chunk(chunk.text, candidate_list, note_type))
|
||||||
|
|
||||||
results_per_chunk = await asyncio.gather(*tasks)
|
results_per_chunk = await asyncio.gather(*tasks)
|
||||||
|
|
||||||
# C. Injection & Fallback Tracking
|
|
||||||
assigned_edges_global = set()
|
assigned_edges_global = set()
|
||||||
|
|
||||||
for i, confirmed_edges in enumerate(results_per_chunk):
|
for i, confirmed_edges in enumerate(results_per_chunk):
|
||||||
|
|
@ -381,18 +407,13 @@ async def _run_smart_edge_allocation(chunks: List[Chunk], full_text: str, note_i
|
||||||
chunk.suggested_edges = confirmed_edges
|
chunk.suggested_edges = confirmed_edges
|
||||||
assigned_edges_global.update(confirmed_edges)
|
assigned_edges_global.update(confirmed_edges)
|
||||||
|
|
||||||
# Injection: Wir hängen die bestätigten Edges unsichtbar (fürs Embedding) oder sichtbar an
|
|
||||||
# Hier als "Pseudo-Code" im Text, damit sie embedded werden.
|
|
||||||
if confirmed_edges:
|
if confirmed_edges:
|
||||||
# Format: [[rel:kind|target]]
|
|
||||||
injection_str = "\n" + " ".join([f"[[rel:{e.split(':')[0]}|{e.split(':')[1]}]]" for e in confirmed_edges if ':' in e])
|
injection_str = "\n" + " ".join([f"[[rel:{e.split(':')[0]}|{e.split(':')[1]}]]" for e in confirmed_edges if ':' in e])
|
||||||
chunk.text += injection_str
|
chunk.text += injection_str
|
||||||
chunk.window += injection_str
|
chunk.window += injection_str
|
||||||
|
|
||||||
# D. Fallback: Kanten, die NIRGENDS zugewiesen wurden, werden JEDEM Chunk angehängt (Sicherheit)
|
|
||||||
unassigned = set(candidate_list) - assigned_edges_global
|
unassigned = set(candidate_list) - assigned_edges_global
|
||||||
if unassigned:
|
if unassigned:
|
||||||
logger.info(f"Chunker: {len(unassigned)} unassigned edges in {note_id}. Distributing to all chunks.")
|
|
||||||
fallback_str = "\n" + " ".join([f"[[rel:{e.split(':')[0]}|{e.split(':')[1]}]]" for e in unassigned if ':' in e])
|
fallback_str = "\n" + " ".join([f"[[rel:{e.split(':')[0]}|{e.split(':')[1]}]]" for e in unassigned if ':' in e])
|
||||||
for chunk in chunks:
|
for chunk in chunks:
|
||||||
chunk.text += fallback_str
|
chunk.text += fallback_str
|
||||||
|
|
|
||||||
|
|
@ -1,10 +1,10 @@
|
||||||
"""
|
"""
|
||||||
FILE: app/services/semantic_analyzer.py
|
FILE: app/services/semantic_analyzer.py
|
||||||
DESCRIPTION: KI-gestützte Kanten-Validierung. Nutzt LLM (Background-Priority), um Kanten präzise einem Chunk zuzuordnen.
|
DESCRIPTION: KI-gestützte Kanten-Validierung. Nutzt LLM (Background-Priority), um Kanten präzise einem Chunk zuzuordnen.
|
||||||
VERSION: 2.0.0
|
VERSION: 2.1.0 (Fix: Strict Edge String Validation against LLM Hallucinations)
|
||||||
STATUS: Active
|
STATUS: Active
|
||||||
DEPENDENCIES: app.services.llm_service, json, logging
|
DEPENDENCIES: app.services.llm_service, json, logging
|
||||||
LAST_ANALYSIS: 2025-12-15
|
LAST_ANALYSIS: 2025-12-16
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import json
|
import json
|
||||||
|
|
@ -21,6 +21,34 @@ class SemanticAnalyzer:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.llm = LLMService()
|
self.llm = LLMService()
|
||||||
|
|
||||||
|
def _is_valid_edge_string(self, edge_str: str) -> bool:
|
||||||
|
"""
|
||||||
|
Prüft, ob ein String eine valide Kante im Format 'kind:target' ist.
|
||||||
|
Verhindert, dass LLM-Geschwätz ("Here is the list: ...") als Kante durchrutscht.
|
||||||
|
"""
|
||||||
|
if not isinstance(edge_str, str) or ":" not in edge_str:
|
||||||
|
return False
|
||||||
|
|
||||||
|
parts = edge_str.split(":", 1)
|
||||||
|
kind = parts[0].strip()
|
||||||
|
target = parts[1].strip()
|
||||||
|
|
||||||
|
# Regel 1: Ein 'kind' (Beziehungstyp) darf keine Leerzeichen enthalten.
|
||||||
|
# Erlaubt: "derived_from", "related_to"
|
||||||
|
# Verboten: "derived end of instruction", "Here is the list"
|
||||||
|
if " " in kind:
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Regel 2: Plausible Länge für den Typ
|
||||||
|
if len(kind) > 40 or len(kind) < 2:
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Regel 3: Target darf nicht leer sein
|
||||||
|
if not target:
|
||||||
|
return False
|
||||||
|
|
||||||
|
return True
|
||||||
|
|
||||||
async def assign_edges_to_chunk(self, chunk_text: str, all_edges: List[str], note_type: str) -> List[str]:
|
async def assign_edges_to_chunk(self, chunk_text: str, all_edges: List[str], note_type: str) -> List[str]:
|
||||||
"""
|
"""
|
||||||
Sendet einen Chunk und eine Liste potenzieller Kanten an das LLM.
|
Sendet einen Chunk und eine Liste potenzieller Kanten an das LLM.
|
||||||
|
|
@ -59,14 +87,13 @@ class SemanticAnalyzer:
|
||||||
)
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# 4. LLM Call mit Traffic Control (NEU: priority="background")
|
# 4. LLM Call mit Traffic Control
|
||||||
# Wir nutzen die "Slow Lane", damit der User im Chat nicht warten muss.
|
|
||||||
response_json = await self.llm.generate_raw_response(
|
response_json = await self.llm.generate_raw_response(
|
||||||
prompt=final_prompt,
|
prompt=final_prompt,
|
||||||
force_json=True,
|
force_json=True,
|
||||||
max_retries=5,
|
max_retries=5,
|
||||||
base_delay=5.0,
|
base_delay=5.0,
|
||||||
priority="background" # <--- WICHTIG: Drosselung aktivieren
|
priority="background"
|
||||||
)
|
)
|
||||||
|
|
||||||
# LOG: Raw Response Preview
|
# LOG: Raw Response Preview
|
||||||
|
|
@ -91,30 +118,38 @@ class SemanticAnalyzer:
|
||||||
valid_edges = []
|
valid_edges = []
|
||||||
|
|
||||||
# 6. Robuste Validierung (List vs Dict)
|
# 6. Robuste Validierung (List vs Dict)
|
||||||
|
# Wir sammeln erst alle Strings ein
|
||||||
|
raw_candidates = []
|
||||||
|
|
||||||
if isinstance(data, list):
|
if isinstance(data, list):
|
||||||
# Standardfall: ["kind:target", ...]
|
raw_candidates = data
|
||||||
valid_edges = [str(e) for e in data if isinstance(e, str) and ":" in e]
|
|
||||||
|
|
||||||
elif isinstance(data, dict):
|
elif isinstance(data, dict):
|
||||||
# Abweichende Formate behandeln
|
|
||||||
logger.info(f"ℹ️ [SemanticAnalyzer] LLM lieferte Dict statt Liste. Versuche Reparatur. Keys: {list(data.keys())}")
|
logger.info(f"ℹ️ [SemanticAnalyzer] LLM lieferte Dict statt Liste. Versuche Reparatur. Keys: {list(data.keys())}")
|
||||||
|
|
||||||
for key, val in data.items():
|
for key, val in data.items():
|
||||||
# Fall A: {"edges": ["kind:target"]}
|
# Fall A: {"edges": ["kind:target"]}
|
||||||
if key.lower() in ["edges", "results", "kanten", "matches"] and isinstance(val, list):
|
if key.lower() in ["edges", "results", "kanten", "matches"] and isinstance(val, list):
|
||||||
valid_edges.extend([str(e) for e in val if isinstance(e, str) and ":" in e])
|
raw_candidates.extend(val)
|
||||||
|
|
||||||
# Fall B: {"kind": "target"}
|
# Fall B: {"kind": "target"} (Beziehung als Key)
|
||||||
elif isinstance(val, str):
|
elif isinstance(val, str):
|
||||||
valid_edges.append(f"{key}:{val}")
|
raw_candidates.append(f"{key}:{val}")
|
||||||
|
|
||||||
# Fall C: {"kind": ["target1", "target2"]}
|
# Fall C: {"kind": ["target1", "target2"]}
|
||||||
elif isinstance(val, list):
|
elif isinstance(val, list):
|
||||||
for target in val:
|
for target in val:
|
||||||
if isinstance(target, str):
|
if isinstance(target, str):
|
||||||
valid_edges.append(f"{key}:{target}")
|
raw_candidates.append(f"{key}:{target}")
|
||||||
|
|
||||||
# Safety: Filtere nur Kanten, die halbwegs valide aussehen
|
# 7. Strict Validation Loop
|
||||||
|
for e in raw_candidates:
|
||||||
|
e_str = str(e)
|
||||||
|
if self._is_valid_edge_string(e_str):
|
||||||
|
valid_edges.append(e_str)
|
||||||
|
else:
|
||||||
|
logger.debug(f" [SemanticAnalyzer] Invalid edge format rejected: '{e_str}'")
|
||||||
|
|
||||||
|
# Safety: Filtere nur Kanten, die halbwegs valide aussehen (Doppelcheck)
|
||||||
final_result = [e for e in valid_edges if ":" in e]
|
final_result = [e for e in valid_edges if ":" in e]
|
||||||
|
|
||||||
# LOG: Ergebnis
|
# LOG: Ergebnis
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,4 @@
|
||||||
version: 2.4.0 # Optimized for Async Intelligence & Hybrid Router
|
version: 2.6.0 # Final WP-15 Config: Smart Edges & Strict/Soft Chunking
|
||||||
|
|
||||||
# ==============================================================================
|
# ==============================================================================
|
||||||
# 1. CHUNKING PROFILES
|
# 1. CHUNKING PROFILES
|
||||||
|
|
@ -7,7 +7,6 @@ version: 2.4.0 # Optimized for Async Intelligence & Hybrid Router
|
||||||
chunking_profiles:
|
chunking_profiles:
|
||||||
|
|
||||||
# A. SHORT & FAST
|
# A. SHORT & FAST
|
||||||
# Für Glossar, Tasks, Risiken. Kleine Schnipsel.
|
|
||||||
sliding_short:
|
sliding_short:
|
||||||
strategy: sliding_window
|
strategy: sliding_window
|
||||||
enable_smart_edge_allocation: false
|
enable_smart_edge_allocation: false
|
||||||
|
|
@ -16,7 +15,6 @@ chunking_profiles:
|
||||||
overlap: [30, 50]
|
overlap: [30, 50]
|
||||||
|
|
||||||
# B. STANDARD & FAST
|
# B. STANDARD & FAST
|
||||||
# Der "Traktor": Robust für Quellen, Journal, Daily Logs.
|
|
||||||
sliding_standard:
|
sliding_standard:
|
||||||
strategy: sliding_window
|
strategy: sliding_window
|
||||||
enable_smart_edge_allocation: false
|
enable_smart_edge_allocation: false
|
||||||
|
|
@ -24,10 +22,8 @@ chunking_profiles:
|
||||||
max: 650
|
max: 650
|
||||||
overlap: [50, 100]
|
overlap: [50, 100]
|
||||||
|
|
||||||
# C. SMART FLOW (Performance-Safe Mode)
|
# C. SMART FLOW (Text-Fluss)
|
||||||
# Für Konzepte, Projekte, Erfahrungen.
|
# Nutzt Sliding Window, aber mit LLM-Kanten-Analyse.
|
||||||
# HINWEIS: 'enable_smart_edge_allocation' ist vorerst FALSE, um Ollama
|
|
||||||
# bei der Generierung nicht zu überlasten. Später wieder aktivieren.
|
|
||||||
sliding_smart_edges:
|
sliding_smart_edges:
|
||||||
strategy: sliding_window
|
strategy: sliding_window
|
||||||
enable_smart_edge_allocation: true
|
enable_smart_edge_allocation: true
|
||||||
|
|
@ -35,12 +31,38 @@ chunking_profiles:
|
||||||
max: 600
|
max: 600
|
||||||
overlap: [50, 80]
|
overlap: [50, 80]
|
||||||
|
|
||||||
# D. SMART STRUCTURE
|
# D. SMART STRUCTURE (Soft Split)
|
||||||
# Für Profile, Werte, Prinzipien. Trennt hart an Überschriften (H2).
|
# Trennt bevorzugt an H2, fasst aber kleine Abschnitte zusammen ("Soft Mode").
|
||||||
structured_smart_edges:
|
structured_smart_edges:
|
||||||
strategy: by_heading
|
strategy: by_heading
|
||||||
enable_smart_edge_allocation: true
|
enable_smart_edge_allocation: true
|
||||||
split_level: 2
|
split_level: 2
|
||||||
|
strict_heading_split: false
|
||||||
|
max: 600
|
||||||
|
target: 400
|
||||||
|
overlap: [50, 80]
|
||||||
|
|
||||||
|
# E. SMART STRUCTURE STRICT (H2 Hard Split)
|
||||||
|
# Trennt ZWINGEND an jeder H2.
|
||||||
|
# Verhindert, dass "Vater" und "Partner" (Profile) oder Werte verschmelzen.
|
||||||
|
structured_smart_edges_strict:
|
||||||
|
strategy: by_heading
|
||||||
|
enable_smart_edge_allocation: true
|
||||||
|
split_level: 2
|
||||||
|
strict_heading_split: true # Hard Mode
|
||||||
|
max: 600
|
||||||
|
target: 400
|
||||||
|
overlap: [50, 80]
|
||||||
|
|
||||||
|
# F. SMART STRUCTURE DEEP (H3 Hard Split + Merge-Check)
|
||||||
|
# Spezialfall für "Leitbild Prinzipien":
|
||||||
|
# - Trennt H1, H2, H3 hart.
|
||||||
|
# - Aber: Merged "leere" H2 (Tier 2) mit der folgenden H3 (MP1).
|
||||||
|
structured_smart_edges_strict_L3:
|
||||||
|
strategy: by_heading
|
||||||
|
enable_smart_edge_allocation: true
|
||||||
|
split_level: 3
|
||||||
|
strict_heading_split: true
|
||||||
max: 600
|
max: 600
|
||||||
target: 400
|
target: 400
|
||||||
overlap: [50, 80]
|
overlap: [50, 80]
|
||||||
|
|
@ -59,24 +81,13 @@ defaults:
|
||||||
|
|
||||||
types:
|
types:
|
||||||
|
|
||||||
# --- KERNTYPEN (Hoch priorisiert & Smart) ---
|
# --- KERNTYPEN ---
|
||||||
|
|
||||||
experience:
|
experience:
|
||||||
chunking_profile: sliding_smart_edges
|
chunking_profile: sliding_smart_edges
|
||||||
retriever_weight: 0.90
|
retriever_weight: 0.90
|
||||||
edge_defaults: ["derived_from", "references"]
|
edge_defaults: ["derived_from", "references"]
|
||||||
# Hybrid Classifier: Wenn diese Worte fallen, ist es eine Experience
|
detection_keywords: ["passiert", "erlebt", "gefühl", "situation", "reaktion"]
|
||||||
detection_keywords:
|
|
||||||
- "passiert"
|
|
||||||
- "erlebt"
|
|
||||||
- "gefühl"
|
|
||||||
- "situation"
|
|
||||||
- "stolz"
|
|
||||||
- "geärgert"
|
|
||||||
- "reaktion"
|
|
||||||
- "moment"
|
|
||||||
- "konflikt"
|
|
||||||
# Ghostwriter Schema: Sprechende Anweisungen für besseren Textfluss
|
|
||||||
schema:
|
schema:
|
||||||
- "Situation (Was ist passiert?)"
|
- "Situation (Was ist passiert?)"
|
||||||
- "Meine Reaktion (Was habe ich getan?)"
|
- "Meine Reaktion (Was habe ich getan?)"
|
||||||
|
|
@ -87,48 +98,37 @@ types:
|
||||||
chunking_profile: sliding_smart_edges
|
chunking_profile: sliding_smart_edges
|
||||||
retriever_weight: 0.97
|
retriever_weight: 0.97
|
||||||
edge_defaults: ["references", "depends_on"]
|
edge_defaults: ["references", "depends_on"]
|
||||||
detection_keywords:
|
detection_keywords: ["projekt", "vorhaben", "ziel ist", "planen", "starten"]
|
||||||
- "projekt"
|
|
||||||
- "vorhaben"
|
|
||||||
- "ziel ist"
|
|
||||||
- "meilenstein"
|
|
||||||
- "planen"
|
|
||||||
- "starten"
|
|
||||||
- "mission"
|
|
||||||
schema:
|
schema:
|
||||||
- "Mission & Zielsetzung"
|
- "Mission & Zielsetzung"
|
||||||
- "Aktueller Status & Blockaden"
|
- "Aktueller Status & Blockaden"
|
||||||
- "Nächste konkrete Schritte"
|
- "Nächste konkrete Schritte"
|
||||||
- "Stakeholder & Ressourcen"
|
|
||||||
|
|
||||||
decision:
|
decision:
|
||||||
chunking_profile: structured_smart_edges
|
# Strict, damit jede Entscheidung atomar bleibt
|
||||||
retriever_weight: 1.00 # MAX: Entscheidungen sind Gesetz
|
chunking_profile: structured_smart_edges_strict
|
||||||
|
retriever_weight: 1.00
|
||||||
edge_defaults: ["caused_by", "references"]
|
edge_defaults: ["caused_by", "references"]
|
||||||
detection_keywords:
|
detection_keywords: ["entschieden", "wahl", "optionen", "alternativen", "adr"]
|
||||||
- "entschieden"
|
|
||||||
- "wahl"
|
|
||||||
- "optionen"
|
|
||||||
- "alternativen"
|
|
||||||
- "beschluss"
|
|
||||||
- "adr"
|
|
||||||
schema:
|
schema:
|
||||||
- "Kontext & Problemstellung"
|
- "Kontext & Problemstellung"
|
||||||
- "Betrachtete Optionen (Alternativen)"
|
- "Betrachtete Optionen"
|
||||||
- "Die Entscheidung"
|
- "Die Entscheidung"
|
||||||
- "Begründung (Warum diese Wahl?)"
|
- "Begründung"
|
||||||
|
|
||||||
# --- PERSÖNLICHKEIT & IDENTITÄT ---
|
# --- PERSÖNLICHKEIT & IDENTITÄT ---
|
||||||
|
|
||||||
value:
|
value:
|
||||||
chunking_profile: structured_smart_edges
|
# Strict, damit Werte nicht verschwimmen
|
||||||
|
chunking_profile: structured_smart_edges_strict
|
||||||
retriever_weight: 1.00
|
retriever_weight: 1.00
|
||||||
edge_defaults: ["related_to"]
|
edge_defaults: ["related_to"]
|
||||||
detection_keywords: ["wert", "wichtig ist", "moral", "ethik"]
|
detection_keywords: ["wert", "wichtig ist", "moral", "ethik"]
|
||||||
schema: ["Definition", "Warum mir das wichtig ist", "Leitsätze für den Alltag"]
|
schema: ["Definition", "Warum mir das wichtig ist", "Leitsätze"]
|
||||||
|
|
||||||
principle:
|
principle:
|
||||||
chunking_profile: structured_smart_edges
|
# L3 Strict für P3/P3a und Tier2/MP1 Logik
|
||||||
|
chunking_profile: structured_smart_edges_strict_L3
|
||||||
retriever_weight: 0.95
|
retriever_weight: 0.95
|
||||||
edge_defaults: ["derived_from", "references"]
|
edge_defaults: ["derived_from", "references"]
|
||||||
detection_keywords: ["prinzip", "regel", "grundsatz", "leitlinie"]
|
detection_keywords: ["prinzip", "regel", "grundsatz", "leitlinie"]
|
||||||
|
|
@ -138,11 +138,11 @@ types:
|
||||||
chunking_profile: sliding_short
|
chunking_profile: sliding_short
|
||||||
retriever_weight: 0.90
|
retriever_weight: 0.90
|
||||||
edge_defaults: ["related_to"]
|
edge_defaults: ["related_to"]
|
||||||
detection_keywords: ["glaube", "überzeugung", "denke dass", "meinung"]
|
|
||||||
schema: ["Der Glaubenssatz", "Ursprung & Reflexion"]
|
schema: ["Der Glaubenssatz", "Ursprung & Reflexion"]
|
||||||
|
|
||||||
profile:
|
profile:
|
||||||
chunking_profile: structured_smart_edges
|
# Strict: Jede Rolle (H2) muss ein eigener Chunk sein
|
||||||
|
chunking_profile: structured_smart_edges_strict
|
||||||
retriever_weight: 0.70
|
retriever_weight: 0.70
|
||||||
edge_defaults: ["references", "related_to"]
|
edge_defaults: ["references", "related_to"]
|
||||||
schema: ["Rolle / Identität", "Fakten & Daten", "Historie"]
|
schema: ["Rolle / Identität", "Fakten & Daten", "Historie"]
|
||||||
|
|
@ -159,8 +159,8 @@ types:
|
||||||
chunking_profile: sliding_short
|
chunking_profile: sliding_short
|
||||||
retriever_weight: 0.85
|
retriever_weight: 0.85
|
||||||
edge_defaults: ["related_to", "blocks"]
|
edge_defaults: ["related_to", "blocks"]
|
||||||
detection_keywords: ["risiko", "gefahr", "bedrohung", "problem", "angst"]
|
detection_keywords: ["risiko", "gefahr", "bedrohung"]
|
||||||
schema: ["Beschreibung des Risikos", "Mögliche Auswirkungen", "Gegenmaßnahmen"]
|
schema: ["Beschreibung des Risikos", "Auswirkungen", "Gegenmaßnahmen"]
|
||||||
|
|
||||||
# --- BASIS & WISSEN ---
|
# --- BASIS & WISSEN ---
|
||||||
|
|
||||||
|
|
@ -168,10 +168,7 @@ types:
|
||||||
chunking_profile: sliding_smart_edges
|
chunking_profile: sliding_smart_edges
|
||||||
retriever_weight: 0.60
|
retriever_weight: 0.60
|
||||||
edge_defaults: ["references", "related_to"]
|
edge_defaults: ["references", "related_to"]
|
||||||
schema:
|
schema: ["Definition", "Kontext", "Verwandte Konzepte"]
|
||||||
- "Definition"
|
|
||||||
- "Kontext & Hintergrund"
|
|
||||||
- "Verwandte Konzepte"
|
|
||||||
|
|
||||||
task:
|
task:
|
||||||
chunking_profile: sliding_short
|
chunking_profile: sliding_short
|
||||||
|
|
@ -183,19 +180,36 @@ types:
|
||||||
chunking_profile: sliding_standard
|
chunking_profile: sliding_standard
|
||||||
retriever_weight: 0.80
|
retriever_weight: 0.80
|
||||||
edge_defaults: ["references", "related_to"]
|
edge_defaults: ["references", "related_to"]
|
||||||
schema: ["Log-Eintrag", "Gedanken & Erkenntnisse"]
|
schema: ["Log-Eintrag", "Gedanken"]
|
||||||
|
|
||||||
source:
|
source:
|
||||||
chunking_profile: sliding_standard
|
chunking_profile: sliding_standard
|
||||||
retriever_weight: 0.50
|
retriever_weight: 0.50
|
||||||
edge_defaults: []
|
edge_defaults: []
|
||||||
schema:
|
schema: ["Metadaten", "Zusammenfassung", "Zitate"]
|
||||||
- "Metadaten (Autor, URL, Datum)"
|
|
||||||
- "Kernaussage / Zusammenfassung"
|
|
||||||
- "Zitate & Notizen"
|
|
||||||
|
|
||||||
glossary:
|
glossary:
|
||||||
chunking_profile: sliding_short
|
chunking_profile: sliding_short
|
||||||
retriever_weight: 0.40
|
retriever_weight: 0.40
|
||||||
edge_defaults: ["related_to"]
|
edge_defaults: ["related_to"]
|
||||||
schema: ["Begriff", "Definition"]
|
schema: ["Begriff", "Definition"]
|
||||||
|
|
||||||
|
person:
|
||||||
|
chunking_profile: sliding_standard
|
||||||
|
retriever_weight: 0.50
|
||||||
|
edge_defaults: ["related_to"]
|
||||||
|
schema: ["Rolle", "Beziehung", "Kontext"]
|
||||||
|
|
||||||
|
event:
|
||||||
|
chunking_profile: sliding_standard
|
||||||
|
retriever_weight: 0.60
|
||||||
|
edge_defaults: ["related_to"]
|
||||||
|
schema: ["Datum & Ort", "Teilnehmer", "Ergebnisse"]
|
||||||
|
|
||||||
|
# --- FALLBACK ---
|
||||||
|
|
||||||
|
default:
|
||||||
|
chunking_profile: sliding_standard
|
||||||
|
retriever_weight: 1.00
|
||||||
|
edge_defaults: ["references"]
|
||||||
|
schema: ["Inhalt"]
|
||||||
Loading…
Reference in New Issue
Block a user