WP24c - Agentic Edge Validation & Chunk-Aware Multigraph-System (v4.5.8) #22
|
|
@ -208,8 +208,9 @@ def extract_llm_validation_zones(markdown_body: str) -> List[Tuple[str, str]]:
|
|||
llm_validation_headers = get_llm_validation_zone_headers()
|
||||
|
||||
for i, line in enumerate(lines):
|
||||
# Prüfe auf Header
|
||||
# Prüfe auf Header (konfiguriertes Level aus MINDNET_LLM_VALIDATION_HEADER_LEVEL)
|
||||
header_match = re.match(header_pattern, line.strip())
|
||||
|
||||
if header_match:
|
||||
header_text = header_match.group(1).strip()
|
||||
|
||||
|
|
@ -266,11 +267,16 @@ def extract_callouts_from_markdown(
|
|||
) -> List[dict]:
|
||||
"""
|
||||
WP-24c v4.2.1: Extrahiert Callouts aus dem Original-Markdown.
|
||||
WP-24c v4.5.6: Header-Status-Maschine für korrekte Zonen-Erkennung.
|
||||
|
||||
Smart Logic: Nur Callouts, die NICHT in Chunks vorkommen (z.B. in Edge-Zonen),
|
||||
werden mit scope: "note" angelegt. Callouts, die bereits in Chunks erfasst wurden,
|
||||
werden übersprungen, um Duplikate zu vermeiden.
|
||||
|
||||
WP-24c v4.5.6: Prüft für jeden Callout, ob er in einer LLM-Validierungs-Zone liegt.
|
||||
- In LLM-Validierungs-Zone: rule_id = "candidate:explicit:callout"
|
||||
- In Standard-Zone: rule_id = "explicit:callout" (ohne candidate:)
|
||||
|
||||
Args:
|
||||
markdown_body: Original-Markdown-Text (vor Chunking-Filterung)
|
||||
note_id: ID der Note
|
||||
|
|
@ -287,52 +293,207 @@ def extract_callouts_from_markdown(
|
|||
|
||||
edges: List[dict] = []
|
||||
|
||||
# Extrahiere alle Callouts aus dem gesamten Markdown
|
||||
call_pairs, _ = extract_callout_relations(markdown_body)
|
||||
# WP-24c v4.5.6: Header-Status-Maschine - Baue Mapping von Zeilen zu Zonen-Status
|
||||
import os
|
||||
import re
|
||||
|
||||
for k, raw_t in call_pairs:
|
||||
t, sec = parse_link_target(raw_t, note_id)
|
||||
if not t:
|
||||
llm_validation_headers = get_llm_validation_zone_headers()
|
||||
llm_validation_level = int(os.getenv("MINDNET_LLM_VALIDATION_HEADER_LEVEL", "3"))
|
||||
# WP-24c v4.5.6: Konfigurierbare Header-Ebene (vollständig über .env steuerbar)
|
||||
header_level_pattern = "#" * llm_validation_level
|
||||
header_pattern = rf'^{re.escape(header_level_pattern)}\s+(.+?)$'
|
||||
|
||||
lines = markdown_body.split('\n')
|
||||
current_zone_is_llm_validation = False
|
||||
|
||||
# WP-24c v4.5.6: Zeile-für-Zeile Verarbeitung mit Zonen-Tracking
|
||||
# Extrahiere Callouts direkt während des Durchlaufs, um Zonen-Kontext zu behalten
|
||||
current_kind = None
|
||||
in_callout_block = False
|
||||
callout_block_lines = [] # Sammle Zeilen eines Callout-Blocks
|
||||
|
||||
for i, line in enumerate(lines):
|
||||
stripped = line.strip()
|
||||
|
||||
# WP-24c v4.5.6: Prüfe auf Header (Zonen-Wechsel)
|
||||
# Verwendet das konfigurierte Level aus MINDNET_LLM_VALIDATION_HEADER_LEVEL
|
||||
header_match = re.match(header_pattern, stripped)
|
||||
|
||||
if header_match:
|
||||
header_text = header_match.group(1).strip()
|
||||
# Prüfe, ob dieser Header eine LLM-Validierungs-Zone startet
|
||||
# WP-24c v4.5.6: Header-Status-Maschine - korrekte Zonen-Erkennung
|
||||
current_zone_is_llm_validation = any(
|
||||
header_text.lower() == llm_header.lower()
|
||||
for llm_header in llm_validation_headers
|
||||
)
|
||||
logger.debug(f"DEBUG-TRACER [Zone-Change]: Header '{header_text}' (Level {llm_validation_level}) -> LLM-Validierung: {current_zone_is_llm_validation}")
|
||||
# Beende aktuellen Callout-Block bei Header-Wechsel
|
||||
if in_callout_block:
|
||||
# Verarbeite gesammelten Callout-Block VOR dem Zonen-Wechsel
|
||||
if callout_block_lines:
|
||||
block_text = '\n'.join([lines[j] for j in callout_block_lines])
|
||||
block_call_pairs, _ = extract_callout_relations(block_text)
|
||||
|
||||
# Verarbeite jeden Callout mit Zonen-Kontext
|
||||
# WICHTIG: Verwende den Zonen-Status VOR dem Header-Wechsel
|
||||
zone_before_header = current_zone_is_llm_validation
|
||||
|
||||
for k, raw_t in block_call_pairs:
|
||||
t, sec = parse_link_target(raw_t, note_id)
|
||||
if not t:
|
||||
continue
|
||||
|
||||
callout_key = (k, t, sec)
|
||||
is_blocked = callout_key in existing_chunk_callouts
|
||||
|
||||
if is_blocked:
|
||||
continue
|
||||
|
||||
# WP-24c v4.5.6: Bestimme rule_id basierend auf Zonen-Status VOR Header
|
||||
if zone_before_header:
|
||||
rule_id = "candidate:explicit:callout"
|
||||
provenance = "explicit:callout"
|
||||
else:
|
||||
rule_id = "explicit:callout" # KEIN candidate: für Standard-Zonen
|
||||
provenance = "explicit:callout"
|
||||
|
||||
payload = {
|
||||
"edge_id": _mk_edge_id(k, note_id, t, "note", target_section=sec),
|
||||
"provenance": provenance,
|
||||
"rule_id": rule_id,
|
||||
"confidence": 0.7
|
||||
}
|
||||
if sec:
|
||||
payload["target_section"] = sec
|
||||
|
||||
logger.debug(f"DEBUG-TRACER [Zone-Check]: Callout in {'LLM-Validierungs' if zone_before_header else 'Standard'}-Zone (Zeile {callout_block_lines[0]}) -> rule_id: {rule_id}")
|
||||
|
||||
edges.append(_edge(
|
||||
kind=k,
|
||||
scope="note",
|
||||
source_id=note_id,
|
||||
target_id=t,
|
||||
note_id=note_id,
|
||||
extra=payload
|
||||
))
|
||||
|
||||
# Reset für nächsten Block
|
||||
in_callout_block = False
|
||||
current_kind = None
|
||||
callout_block_lines = []
|
||||
continue
|
||||
|
||||
# WP-24c v4.2.2: Prüfe, ob dieser Callout bereits in einem Chunk vorkommt
|
||||
# Härtung: Berücksichtigt auch Sektions-Anker (sec) für Multigraph-Präzision
|
||||
# Ein Callout zu "Note#Section1" ist anders als "Note#Section2" oder "Note"
|
||||
callout_key = (k, t, sec)
|
||||
|
||||
# WP-24c v4.4.0-DEBUG: Schnittstelle 3 - Global Scan Vergleich
|
||||
is_blocked = callout_key in existing_chunk_callouts
|
||||
logger.debug(f"DEBUG-TRACER [Global Scan Compare]: Key: ({k}, {t}, {sec}), Raw_Target: {raw_t}, In_Block_List: {is_blocked}, Block_List_Size: {len(existing_chunk_callouts) if existing_chunk_callouts else 0}")
|
||||
|
||||
if is_blocked:
|
||||
# Callout ist bereits in Chunk erfasst -> überspringe (wird mit chunk-Scope angelegt)
|
||||
# Die Sektion (sec) ist bereits im Key enthalten, daher wird Multigraph-Präzision gewährleistet
|
||||
logger.debug(f"DEBUG-TRACER [Global Scan Compare]: Key ({k}, {t}, {sec}) ist blockiert - überspringe")
|
||||
# WP-24c v4.5.6: Prüfe auf Callout-Start
|
||||
callout_start_match = re.match(r'^\s*>{1,}\s*\[!edge\]\s*(.*)$', stripped, re.IGNORECASE)
|
||||
if callout_start_match:
|
||||
in_callout_block = True
|
||||
callout_block_lines = [i] # Start-Zeile
|
||||
header_content = callout_start_match.group(1).strip()
|
||||
# Prüfe, ob Header einen Typ enthält
|
||||
if header_content and re.match(r'^[a-z_]+$', header_content, re.IGNORECASE):
|
||||
current_kind = header_content.lower()
|
||||
continue
|
||||
|
||||
# WP-24c v4.2.1: Callout ist NICHT in Chunks -> lege mit scope: "note" an
|
||||
# (typischerweise in Edge-Zonen, die nicht gechunkt werden)
|
||||
# WP-24c v4.3.1: Confidence auf 0.7 gesenkt, damit chunk-Scope (1.0) gewinnt
|
||||
payload = {
|
||||
"edge_id": _mk_edge_id(k, note_id, t, "note", target_section=sec),
|
||||
"provenance": "explicit:callout",
|
||||
"rule_id": "callout:edge",
|
||||
"confidence": 0.7 # WP-24c v4.3.1: Niedrigere Confidence für Note-Scope Callouts
|
||||
}
|
||||
if sec:
|
||||
payload["target_section"] = sec
|
||||
# WP-24c v4.5.6: Sammle Callout-Block-Zeilen
|
||||
if in_callout_block:
|
||||
if stripped.startswith('>'):
|
||||
callout_block_lines.append(i)
|
||||
else:
|
||||
# Callout-Block beendet - verarbeite gesammelte Zeilen
|
||||
if callout_block_lines:
|
||||
# Extrahiere Callouts aus diesem Block
|
||||
block_text = '\n'.join([lines[j] for j in callout_block_lines])
|
||||
block_call_pairs, _ = extract_callout_relations(block_text)
|
||||
|
||||
# Verarbeite jeden Callout mit Zonen-Kontext
|
||||
for k, raw_t in block_call_pairs:
|
||||
t, sec = parse_link_target(raw_t, note_id)
|
||||
if not t:
|
||||
continue
|
||||
|
||||
callout_key = (k, t, sec)
|
||||
is_blocked = callout_key in existing_chunk_callouts
|
||||
|
||||
if is_blocked:
|
||||
continue
|
||||
|
||||
# WP-24c v4.5.6: Bestimme rule_id basierend auf Zonen-Status
|
||||
if current_zone_is_llm_validation:
|
||||
rule_id = "candidate:explicit:callout"
|
||||
provenance = "explicit:callout"
|
||||
else:
|
||||
rule_id = "explicit:callout" # KEIN candidate: für Standard-Zonen
|
||||
provenance = "explicit:callout"
|
||||
|
||||
payload = {
|
||||
"edge_id": _mk_edge_id(k, note_id, t, "note", target_section=sec),
|
||||
"provenance": provenance,
|
||||
"rule_id": rule_id,
|
||||
"confidence": 0.7
|
||||
}
|
||||
if sec:
|
||||
payload["target_section"] = sec
|
||||
|
||||
logger.debug(f"DEBUG-TRACER [Zone-Check]: Callout in {'LLM-Validierungs' if current_zone_is_llm_validation else 'Standard'}-Zone (Zeile {callout_block_lines[0]}) -> rule_id: {rule_id}")
|
||||
|
||||
edges.append(_edge(
|
||||
kind=k,
|
||||
scope="note",
|
||||
source_id=note_id,
|
||||
target_id=t,
|
||||
note_id=note_id,
|
||||
extra=payload
|
||||
))
|
||||
|
||||
# Reset für nächsten Block
|
||||
in_callout_block = False
|
||||
current_kind = None
|
||||
callout_block_lines = []
|
||||
|
||||
# WP-24c v4.5.6: Verarbeite letzten Callout-Block (falls am Ende)
|
||||
if in_callout_block and callout_block_lines:
|
||||
block_text = '\n'.join([lines[j] for j in callout_block_lines])
|
||||
block_call_pairs, _ = extract_callout_relations(block_text)
|
||||
|
||||
# WP-24c v4.4.0-DEBUG: Schnittstelle 3 - Global Scan erstellt Note-Scope Callout
|
||||
logger.debug(f"DEBUG-TRACER [Global Scan Create]: Erstelle Note-Scope Callout - Kind: {k}, Target: {t}, Section: {sec}, Raw_Target: {raw_t}, Edge_ID: {payload['edge_id']}, Confidence: {payload['confidence']}")
|
||||
|
||||
edges.append(_edge(
|
||||
kind=k,
|
||||
scope="note",
|
||||
source_id=note_id,
|
||||
target_id=t,
|
||||
note_id=note_id,
|
||||
extra=payload
|
||||
))
|
||||
for k, raw_t in block_call_pairs:
|
||||
t, sec = parse_link_target(raw_t, note_id)
|
||||
if not t:
|
||||
continue
|
||||
|
||||
callout_key = (k, t, sec)
|
||||
is_blocked = callout_key in existing_chunk_callouts
|
||||
|
||||
if is_blocked:
|
||||
continue
|
||||
|
||||
# WP-24c v4.5.6: Bestimme rule_id basierend auf Zonen-Status
|
||||
if current_zone_is_llm_validation:
|
||||
rule_id = "candidate:explicit:callout"
|
||||
provenance = "explicit:callout"
|
||||
else:
|
||||
rule_id = "explicit:callout"
|
||||
provenance = "explicit:callout"
|
||||
|
||||
payload = {
|
||||
"edge_id": _mk_edge_id(k, note_id, t, "note", target_section=sec),
|
||||
"provenance": provenance,
|
||||
"rule_id": rule_id,
|
||||
"confidence": 0.7
|
||||
}
|
||||
if sec:
|
||||
payload["target_section"] = sec
|
||||
|
||||
logger.debug(f"DEBUG-TRACER [Zone-Check]: Callout in {'LLM-Validierungs' if current_zone_is_llm_validation else 'Standard'}-Zone (Zeile {callout_block_lines[0]}) -> rule_id: {rule_id}")
|
||||
|
||||
edges.append(_edge(
|
||||
kind=k,
|
||||
scope="note",
|
||||
source_id=note_id,
|
||||
target_id=t,
|
||||
note_id=note_id,
|
||||
extra=payload
|
||||
))
|
||||
|
||||
return edges
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user