Implement WP-26 v1.3 (Phase 3): Enhance graph schema validation and edge handling
- Introduced a new function `load_graph_schema_full` to parse and cache both typical and prohibited edge types from the graph schema.
- Updated `load_graph_schema` to utilize the full schema for improved edge type extraction.
- Added `get_topology_info` to retrieve typical and prohibited edges for source/target pairs.
- Implemented `validate_intra_note_edge` and `validate_edge_against_schema` for schema validation of intra-note edges.
- Enhanced logging for schema validation outcomes and edge handling.
- Updated documentation to reflect new validation features and testing procedures.
This commit is contained in:
parent
c5215e22e7
commit
509efc9393
|
|
@ -281,6 +281,8 @@ def get_edge_defaults_for(note_type: Optional[str], reg: dict) -> List[str]:
|
|||
|
||||
# Cache für geladenes Schema (vermeidet mehrfaches Parsen)
|
||||
_GRAPH_SCHEMA_CACHE: Optional[Dict[str, Dict[str, List[str]]]] = None
|
||||
# WP-26 v1.3: Erweitertes Schema mit prohibited edges
|
||||
_GRAPH_SCHEMA_FULL_CACHE: Optional[Dict[str, Dict[str, Dict[str, List[str]]]]] = None
|
||||
|
||||
def load_graph_schema() -> Dict[str, Dict[str, List[str]]]:
|
||||
"""
|
||||
|
|
@ -300,6 +302,31 @@ def load_graph_schema() -> Dict[str, Dict[str, List[str]]]:
|
|||
if _GRAPH_SCHEMA_CACHE is not None:
|
||||
return _GRAPH_SCHEMA_CACHE
|
||||
|
||||
# Nutze das erweiterte Schema und extrahiere nur typical
|
||||
full_schema = load_graph_schema_full()
|
||||
|
||||
schema: Dict[str, Dict[str, List[str]]] = {}
|
||||
for source_type, targets in full_schema.items():
|
||||
schema[source_type] = {}
|
||||
for target_type, edge_info in targets.items():
|
||||
schema[source_type][target_type] = edge_info.get("typical", [])
|
||||
|
||||
_GRAPH_SCHEMA_CACHE = schema
|
||||
return schema
|
||||
|
||||
|
||||
def load_graph_schema_full() -> Dict[str, Dict[str, Dict[str, List[str]]]]:
|
||||
"""
|
||||
WP-26 v1.3: Parst das graph_schema.md und extrahiert sowohl Typical als auch Prohibited Edge-Types.
|
||||
|
||||
Returns:
|
||||
Dict[source_type, Dict[target_type, {"typical": [...], "prohibited": [...]}]]
|
||||
Beispiel: {"experience": {"event": {"typical": ["caused_by"], "prohibited": ["consists_of"]}}}
|
||||
"""
|
||||
global _GRAPH_SCHEMA_FULL_CACHE
|
||||
if _GRAPH_SCHEMA_FULL_CACHE is not None:
|
||||
return _GRAPH_SCHEMA_FULL_CACHE
|
||||
|
||||
import re
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
|
|
@ -331,10 +358,10 @@ def load_graph_schema() -> Dict[str, Dict[str, List[str]]]:
|
|||
|
||||
if not content:
|
||||
logger.warning("Graph-Schema nicht gefunden. Fallback auf leeres Schema.")
|
||||
_GRAPH_SCHEMA_CACHE = {}
|
||||
return _GRAPH_SCHEMA_CACHE
|
||||
_GRAPH_SCHEMA_FULL_CACHE = {}
|
||||
return _GRAPH_SCHEMA_FULL_CACHE
|
||||
|
||||
schema: Dict[str, Dict[str, List[str]]] = {}
|
||||
schema: Dict[str, Dict[str, Dict[str, List[str]]]] = {}
|
||||
current_source = None
|
||||
|
||||
# Regex für Source-Header: ## Source: `experience`
|
||||
|
|
@ -346,6 +373,8 @@ def load_graph_schema() -> Dict[str, Dict[str, List[str]]]:
|
|||
r'^\|\s*`(\w+)`\s*\|\s*([^|]+)\s*\|\s*([^|]*)\s*\|'
|
||||
)
|
||||
|
||||
edge_pattern = re.compile(r'`(\w+)`')
|
||||
|
||||
for line in content.split('\n'):
|
||||
stripped = line.strip()
|
||||
|
||||
|
|
@ -363,19 +392,59 @@ def load_graph_schema() -> Dict[str, Dict[str, List[str]]]:
|
|||
if row_match:
|
||||
target_type = row_match.group(1).lower()
|
||||
typical_edges_raw = row_match.group(2).strip()
|
||||
prohibited_edges_raw = row_match.group(3).strip()
|
||||
|
||||
# Parse die Edge-Types (können mit Backticks und Kommas getrennt sein)
|
||||
# Format: `caused_by`, `resulted_in` oder `caused_by`
|
||||
edge_pattern = re.compile(r'`(\w+)`')
|
||||
# Parse die Edge-Types
|
||||
typical_edges = edge_pattern.findall(typical_edges_raw)
|
||||
prohibited_edges = edge_pattern.findall(prohibited_edges_raw)
|
||||
|
||||
if typical_edges:
|
||||
schema[current_source][target_type] = typical_edges
|
||||
schema[current_source][target_type] = {
|
||||
"typical": typical_edges,
|
||||
"prohibited": prohibited_edges
|
||||
}
|
||||
|
||||
logger.info(f"Graph-Schema geladen: {len(schema)} Source-Types")
|
||||
_GRAPH_SCHEMA_CACHE = schema
|
||||
logger.info(f"Graph-Schema (full) geladen: {len(schema)} Source-Types")
|
||||
_GRAPH_SCHEMA_FULL_CACHE = schema
|
||||
return schema
|
||||
|
||||
|
||||
def get_topology_info(source_type: str, target_type: str) -> Dict[str, List[str]]:
    """
    WP-26 v1.3: Look up the typical and prohibited edge types for a type pair.

    Resolution order (first hit wins):
        1. exact ``source -> target`` entry
        2. ``source -> "any"``
        3. ``"default" -> target``
        4. ``"default" -> "any"``
        5. hard-coded fallback: ``related_to`` / ``references`` are typical,
           nothing is prohibited

    Args:
        source_type: Type of the source section (e.g. "experience").
            A falsy value is treated as "default".
        target_type: Type of the target section (e.g. "insight").
            A falsy value is treated as "any".

    Returns:
        Dict with the keys "typical" and "prohibited", each a list of edge types.
        Example: {"typical": ["resulted_in"], "prohibited": ["solves"]}
        The dict and its lists are fresh copies, so callers may mutate the
        result without corrupting the cached schema.
    """
    schema = load_graph_schema_full()

    source_key = source_type.lower() if source_type else "default"
    target_key = target_type.lower() if target_type else "any"

    # Try the concrete source first, then the "default" source; within each,
    # prefer the concrete target over the "any" wildcard.
    for candidate_source in (source_key, "default"):
        targets = schema.get(candidate_source)
        if not targets:
            continue
        for candidate_target in (target_key, "any"):
            if candidate_target in targets:
                entry = targets[candidate_target]
                # Copy the lists: the entry lives in the module-level cache.
                return {
                    "typical": list(entry.get("typical", [])),
                    "prohibited": list(entry.get("prohibited", [])),
                }

    # Absolute fallback: generic edges are typical, nothing is prohibited.
    return {"typical": ["related_to", "references"], "prohibited": []}
|
||||
|
||||
def get_typical_edge_for(source_type: str, target_type: str) -> Optional[str]:
|
||||
"""
|
||||
WP-26 v1.1: Ermittelt den ersten "Typical Edge-Type" für ein Typ-Paar.
|
||||
|
|
@ -424,6 +493,8 @@ def clear_graph_schema_cache():
|
|||
"""
|
||||
WP-26 v1.1: Löscht den Cache für das Graph-Schema.
|
||||
Nützlich für Tests oder wenn das Schema neu geladen werden soll.
|
||||
WP-26 v1.3: Löscht auch den erweiterten Schema-Cache.
|
||||
"""
|
||||
global _GRAPH_SCHEMA_CACHE
|
||||
_GRAPH_SCHEMA_CACHE = None
|
||||
global _GRAPH_SCHEMA_CACHE, _GRAPH_SCHEMA_FULL_CACHE
|
||||
_GRAPH_SCHEMA_CACHE = None
|
||||
_GRAPH_SCHEMA_FULL_CACHE = None
|
||||
|
|
@ -41,7 +41,7 @@ from app.services.llm_service import LLMService
|
|||
# Package-Interne Imports (Refactoring WP-14)
|
||||
from .ingestion_utils import load_type_registry, resolve_note_type, get_chunk_config_by_profile
|
||||
from .ingestion_db import fetch_note_payload, artifacts_missing, purge_artifacts, is_explicit_edge_present
|
||||
from .ingestion_validation import validate_edge_candidate
|
||||
from .ingestion_validation import validate_edge_candidate, validate_edge_against_schema
|
||||
from .ingestion_note_payload import make_note_payload
|
||||
from .ingestion_chunk_payload import make_chunk_payloads
|
||||
|
||||
|
|
@ -621,6 +621,31 @@ class IngestionService:
|
|||
v_edge["target_section"] = target_section
|
||||
self.symmetry_buffer.append(v_edge)
|
||||
|
||||
# WP-26 v1.3: Schema-Validierung für Intra-Note-Edges (FA-12)
|
||||
# Prüfe is_internal Edges gegen graph_schema.md
|
||||
if explicit_edges:
|
||||
chunks_by_id = {c.get("chunk_id", c.get("id", "")): c for c in chunk_pls}
|
||||
schema_validated_edges = []
|
||||
schema_rejected_count = 0
|
||||
|
||||
for e in explicit_edges:
|
||||
is_valid, updated_edge = validate_edge_against_schema(
|
||||
edge=e,
|
||||
chunks_by_id=chunks_by_id,
|
||||
strict_mode=False # Im normalen Modus: atypische Edges erlaubt mit reduzierter Confidence
|
||||
)
|
||||
|
||||
if is_valid:
|
||||
schema_validated_edges.append(updated_edge)
|
||||
else:
|
||||
schema_rejected_count += 1
|
||||
logger.info(f"🚫 [SCHEMA-VALIDATION] Edge abgelehnt: {e.get('source_id')} -> {e.get('target_id')} ({e.get('kind')})")
|
||||
|
||||
if schema_rejected_count > 0:
|
||||
logger.info(f"📊 [SCHEMA-VALIDATION] {schema_rejected_count} Intra-Note-Edges aufgrund von Schema-Verletzungen abgelehnt")
|
||||
|
||||
explicit_edges = schema_validated_edges
|
||||
|
||||
# DB Upsert
|
||||
if purge_before and old_payload: purge_artifacts(self.client, self.prefix, note_id)
|
||||
|
||||
|
|
|
|||
|
|
@ -3,24 +3,145 @@ FILE: app/core/ingestion/ingestion_validation.py
|
|||
DESCRIPTION: WP-15b semantische Validierung von Kanten gegen den LocalBatchCache.
|
||||
WP-24c: Erweiterung um automatische Symmetrie-Generierung (Inverse Kanten).
|
||||
WP-25b: Konsequente Lazy-Prompt-Orchestration (prompt_key + variables).
|
||||
VERSION: 3.0.0 (WP-24c: Symmetric Edge Management)
|
||||
WP-26 v1.3: Schema-Validierung für Intra-Note-Edges gegen graph_schema.md.
|
||||
VERSION: 3.1.0 (WP-26: Intra-Note-Edge Schema-Validation)
|
||||
STATUS: Active
|
||||
FIX:
|
||||
- WP-24c: Integration der EdgeRegistry zur dynamischen Inversions-Ermittlung.
|
||||
- WP-24c: Implementierung von validate_and_symmetrize für bidirektionale Graphen.
|
||||
- WP-25b: Beibehaltung der hierarchischen Prompt-Resolution und Modell-Spezi-Logik.
|
||||
- WP-26: FA-12 Schema-Validierung gegen effektiven Chunk-Typ.
|
||||
"""
|
||||
import logging
|
||||
from typing import Dict, Any, Optional, List
|
||||
from typing import Dict, Any, Optional, List, Tuple
|
||||
from app.core.parser import NoteContext
|
||||
|
||||
# Import der neutralen Bereinigungs-Logik zur Vermeidung von Circular Imports
|
||||
from app.core.registry import clean_llm_text
|
||||
# WP-24c: Zugriff auf das dynamische Vokabular
|
||||
from app.services.edge_registry import registry as edge_registry
|
||||
# WP-26 v1.3: Graph-Schema für Validierung
|
||||
from app.core.graph.graph_utils import get_topology_info
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ==============================================================================
|
||||
# WP-26 v1.3: Schema-Validierung für Intra-Note-Edges (FA-12)
|
||||
# ==============================================================================
|
||||
|
||||
def validate_intra_note_edge(
    edge: Dict[str, Any],
    source_chunk: Dict[str, Any],
    target_chunk: Dict[str, Any],
    strict_mode: bool = False
) -> Tuple[bool, float, Optional[str]]:
    """
    WP-26 v1.3 (FA-12): Check an intra-note edge against the graph schema.

    Both chunks are classified by their effective type: the chunk's "type"
    value, else its "note_type", else "default". The edge kind defaults to
    "related_to" when the edge has no "kind" key.

    Args:
        edge: Edge dict; only "kind" is read here.
        source_chunk: Chunk payload of the edge source.
        target_chunk: Chunk payload of the edge target.
        strict_mode: When True, atypical edges are rejected instead of
            being accepted with reduced confidence.

    Returns:
        Tuple (is_valid, confidence, reason):
        - prohibited edge: (False, 0.0, reason)
        - typical edge: (True, 1.0, None)
        - atypical edge: (False, 0.0, reason) in strict mode,
          otherwise (True, 0.7, reason)
    """
    def effective_type(chunk: Dict[str, Any]) -> str:
        # "type" wins over "note_type"; "default" when neither is set.
        return chunk.get("type") or chunk.get("note_type") or "default"

    src_type = effective_type(source_chunk)
    dst_type = effective_type(target_chunk)
    kind = edge.get("kind", "related_to")

    # Schema lookup for this type pair.
    topo = get_topology_info(src_type, dst_type)
    typical = topo.get("typical", [])
    prohibited = topo.get("prohibited", [])

    # 1. Prohibited edges are always rejected.
    if kind in prohibited:
        why = f"Edge '{kind}' von {src_type} → {dst_type} ist verboten (prohibited)"
        logger.warning(f"🚫 [SCHEMA-VALIDATION] {why}")
        return (False, 0.0, why)

    # 2. Typical edges keep full confidence.
    if kind in typical:
        logger.debug(f"✅ [SCHEMA-VALIDATION] Edge '{kind}' von {src_type} → {dst_type} ist typisch")
        return (True, 1.0, None)

    # 3. Neither typical nor prohibited: the edge is atypical.
    why = f"Edge '{kind}' von {src_type} → {dst_type} ist atypisch (nicht in typical: {typical})"

    if not strict_mode:
        # Normal mode: keep the edge, but with reduced confidence (0.7).
        logger.info(f"ℹ️ [SCHEMA-VALIDATION] {why} - erlaubt mit reduzierter Confidence")
        return (True, 0.7, why)

    # Strict mode: atypical edges are rejected.
    logger.warning(f"⚠️ [SCHEMA-VALIDATION] {why} - ABGELEHNT (strict_mode)")
    return (False, 0.0, why)
|
||||
|
||||
|
||||
def validate_edge_against_schema(
    edge: Dict[str, Any],
    chunks_by_id: Dict[str, Dict[str, Any]],
    strict_mode: bool = False
) -> Tuple[bool, Dict[str, Any]]:
    """
    WP-26 v1.3: Wrapper around the schema validation that looks up the chunks.

    Only edges flagged ``is_internal`` are validated; all other edges pass
    through untouched. If either endpoint is missing from ``chunks_by_id``
    the edge is allowed as well (integrity before precision).

    Args:
        edge: The edge dict; "source_id", "target_id", "is_internal" and
            "confidence" are read here.
        chunks_by_id: Mapping of chunk_id to chunk payload.
        strict_mode: When True, atypical edges are rejected.

    Returns:
        Tuple (is_valid, updated_edge):
        - is_valid: True when the edge is allowed.
        - updated_edge: The original edge object when nothing was validated
          or the edge was rejected; otherwise a shallow copy whose
          "confidence" is capped at the validation confidence and which
          carries a "schema_validation_note" when the confidence was reduced.
    """
    # Only intra-note edges are validated.
    if not edge.get("is_internal", False):
        return (True, edge)

    source_id = edge.get("source_id", "")
    target_id = edge.get("target_id", "")

    # Look up both endpoints.
    source_chunk = chunks_by_id.get(source_id, {})
    target_chunk = chunks_by_id.get(target_id, {})

    # Missing chunks -> allow the edge (integrity before precision).
    if not source_chunk or not target_chunk:
        logger.debug(f"[SCHEMA-VALIDATION] Chunks nicht gefunden für {source_id} / {target_id} - Edge erlaubt")
        return (True, edge)

    is_valid, confidence, reason = validate_intra_note_edge(
        edge=edge,
        source_chunk=source_chunk,
        target_chunk=target_chunk,
        strict_mode=strict_mode
    )

    if not is_valid:
        return (False, edge)

    # Work on a copy so the caller's edge dict is never modified.
    updated_edge = edge.copy()
    if confidence < 1.0:
        original_confidence = edge.get("confidence")
        if original_confidence is None:
            # An explicit None would make min() raise TypeError; treat it
            # like a missing value (full confidence).
            original_confidence = 1.0
        updated_edge["confidence"] = min(original_confidence, confidence)
        updated_edge["schema_validation_note"] = reason

    return (True, updated_edge)
|
||||
|
||||
async def validate_edge_candidate(
|
||||
chunk_text: str,
|
||||
edge: Dict,
|
||||
|
|
|
|||
|
|
@ -1,8 +1,8 @@
|
|||
# WP-26 Manuelle Testszenarien
|
||||
|
||||
**Version:** 1.0
|
||||
**Version:** 1.3
|
||||
**Datum:** 25. Januar 2026
|
||||
**Status:** Phase 1 Implementierung abgeschlossen
|
||||
**Status:** Alle Phasen (Phase 1-3) implementiert
|
||||
|
||||
---
|
||||
|
||||
|
|
@ -271,14 +271,117 @@ python -m pytest tests/test_wp26_section_types.py -v
|
|||
|
||||
---
|
||||
|
||||
## 7. Nächste Schritte (Phase 2)
|
||||
## 7. Phase 2: Retriever-Anpassungen
|
||||
|
||||
Nach erfolgreicher Validierung von Phase 1:
|
||||
### 7.1 is_internal-Boost
|
||||
|
||||
1. **Retriever-Anpassung:** Path-Bonus für Intra-Note-Edges
|
||||
2. **Graph-Exploration:** Navigation entlang `typical edges` aus `graph_schema.md`
|
||||
3. **Schema-Validierung:** Agentic Validation gegen effektive Chunk-Typen
|
||||
**Konfiguration:** `config/retriever.yaml`
|
||||
|
||||
```yaml
|
||||
edge_scoring:
|
||||
internal_edge_boost: 1.2 # +20% Boost für Intra-Note-Edges
|
||||
external_edge_boost: 1.0 # Standard für Inter-Note-Edges
|
||||
```
|
||||
|
||||
**Manuelle Prüfung:**
|
||||
|
||||
1. Führe eine Suche durch, die eine Note mit internen Edges trifft
|
||||
2. Prüfe im Debug-Log, dass `is_internal: True` Edges höheres Gewicht erhalten
|
||||
|
||||
### 7.2 Aggregation-Level
|
||||
|
||||
**Konfiguration:** `config/retriever.yaml`
|
||||
|
||||
```yaml
|
||||
aggregation:
|
||||
level: note # "note" (default) oder "chunk"
|
||||
max_chunks_per_note: 3 # Limit bei "note"-Level
|
||||
```
|
||||
|
||||
**Test mit Chunk-Level:**
|
||||
|
||||
1. Setze `level: chunk` in `retriever.yaml`
|
||||
2. Führe Suche durch
|
||||
3. Prüfe, dass mehrere Chunks derselben Note zurückgegeben werden (keine Deduplizierung)
|
||||
|
||||
### 7.3 Unit-Tests Phase 2
|
||||
|
||||
```bash
|
||||
python -m pytest tests/test_wp26_phase2_retriever.py -v
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
**Ende der Testdokumentation**
|
||||
## 8. Phase 3: Schema-Validierung (FA-12)
|
||||
|
||||
### 8.1 get_topology_info()
|
||||
|
||||
Die neue Funktion ermittelt typische und verbotene Edge-Types für ein Source/Target-Typ-Paar.
|
||||
|
||||
**Beispiel:**
|
||||
|
||||
```python
|
||||
from app.core.graph.graph_utils import get_topology_info
|
||||
|
||||
topology = get_topology_info("experience", "insight")
|
||||
# Ergebnis: {"typical": ["resulted_in", ...], "prohibited": [...]}
|
||||
```
|
||||
|
||||
### 8.2 validate_intra_note_edge()
|
||||
|
||||
Validiert Intra-Note-Edges gegen das `graph_schema.md`.
|
||||
|
||||
**Verhalten:**
|
||||
|
||||
| Edge-Typ | Ergebnis | Confidence |
|
||||
|----------|----------|------------|
|
||||
| In `typical` | ✅ Erlaubt | 1.0 |
|
||||
| Nicht in `typical`, nicht in `prohibited` | ✅ Erlaubt (atypisch) | 0.7 |
|
||||
| In `prohibited` | ❌ Abgelehnt | 0.0 |
|
||||
|
||||
### 8.3 Manuelle Prüfung
|
||||
|
||||
1. Erstelle eine Note mit einer verbotenen Edge-Kombination
|
||||
2. Führe Ingestion durch
|
||||
3. Prüfe, dass die Edge abgelehnt wurde (Log: `🚫 [SCHEMA-VALIDATION]`)
|
||||
|
||||
### 8.4 Unit-Tests Phase 3
|
||||
|
||||
```bash
|
||||
python -m pytest tests/test_wp26_phase3_validation.py -v
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 9. Alle WP-26 Tests ausführen
|
||||
|
||||
```bash
|
||||
# Alle WP-26 Unit-Tests
|
||||
python -m pytest tests/test_wp26_section_types.py tests/test_wp26_phase2_retriever.py tests/test_wp26_phase3_validation.py -v
|
||||
|
||||
# Nur fehlgeschlagene Tests erneut ausführen
|
||||
python -m pytest --lf -v
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 10. Bekannte Einschränkungen
|
||||
|
||||
1. **Block-ID-Stability:** Obsidian aktualisiert Block-IDs nicht automatisch bei Umbenennung von Überschriften.
|
||||
2. **Heading-Links:** Links wie `[[#Section Name]]` werden unterstützt, aber Block-References (`[[#^id]]`) werden bevorzugt.
|
||||
3. **Nested Callouts:** Verschachtelte Callouts (`>> [!edge]`) werden korrekt verarbeitet.
|
||||
4. **Strict-Mode:** `strict_mode=True` in der Validierung lehnt atypische Edges ab (Standard: `False`).
|
||||
|
||||
---
|
||||
|
||||
## 11. Zusammenfassung
|
||||
|
||||
| Phase | Status | Beschreibung |
|
||||
|-------|--------|--------------|
|
||||
| Phase 1 | ✅ | Section-Types, Block-IDs, Intra-Note-Edges |
|
||||
| Phase 2 | ✅ | is_internal-Boost, Aggregation-Level |
|
||||
| Phase 3 | ✅ | Schema-Validierung (FA-12) |
|
||||
|
||||
---
|
||||
|
||||
**Ende der Testdokumentation (WP-26 v1.3)**
|
||||
|
|
|
|||
331
tests/test_wp26_phase3_validation.py
Normal file
331
tests/test_wp26_phase3_validation.py
Normal file
|
|
@ -0,0 +1,331 @@
|
|||
"""
|
||||
FILE: tests/test_wp26_phase3_validation.py
|
||||
DESCRIPTION: Unit-Tests für WP-26 Phase 3: Schema-Validierung für Intra-Note-Edges
|
||||
- FA-12: Validierung gegen effektiven Chunk-Typ
|
||||
- get_topology_info() Funktion
|
||||
- validate_intra_note_edge() Funktion
|
||||
VERSION: 1.0.0
|
||||
"""
|
||||
import pytest
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
|
||||
class TestLoadGraphSchemaFull:
    """UT-25: Loading the extended schema including prohibited edges."""

    def test_load_graph_schema_full_returns_dict(self):
        """A freshly loaded full schema is a dict."""
        from app.core.graph.graph_utils import load_graph_schema_full, clear_graph_schema_cache

        clear_graph_schema_cache()
        assert isinstance(load_graph_schema_full(), dict)

    def test_schema_contains_typical_and_prohibited(self):
        """Every schema entry carries a "typical" and a "prohibited" list."""
        from app.core.graph.graph_utils import load_graph_schema_full, clear_graph_schema_cache

        clear_graph_schema_cache()
        full_schema = load_graph_schema_full()

        # Nothing to check when no schema is available.
        if not full_schema:
            return

        for targets in full_schema.values():
            for info in targets.values():
                assert "typical" in info
                assert "prohibited" in info
                assert isinstance(info["typical"], list)
                assert isinstance(info["prohibited"], list)
|
||||
|
||||
|
||||
class TestGetTopologyInfo:
    """UT-26: get_topology_info()."""

    def test_get_topology_info_returns_dict(self):
        """The result is a dict with the keys "typical" and "prohibited"."""
        from app.core.graph.graph_utils import get_topology_info, clear_graph_schema_cache

        clear_graph_schema_cache()
        info = get_topology_info("experience", "insight")

        assert isinstance(info, dict)
        for key in ("typical", "prohibited"):
            assert key in info

    def test_get_topology_info_fallback(self):
        """Unknown types still yield list-valued "typical" / "prohibited" entries."""
        from app.core.graph.graph_utils import get_topology_info, clear_graph_schema_cache

        clear_graph_schema_cache()
        info = get_topology_info("unknown_type_xyz", "another_unknown")

        # Only the shape is checked here, not the concrete fallback edges.
        for key in ("typical", "prohibited"):
            assert isinstance(info[key], list)

    def test_get_topology_info_experience_to_insight(self):
        """experience -> insight: typical edges exist, or nothing is prohibited."""
        from app.core.graph.graph_utils import get_topology_info, clear_graph_schema_cache

        clear_graph_schema_cache()
        info = get_topology_info("experience", "insight")

        # Depends on the content of graph_schema.md.
        has_typical = len(info["typical"]) > 0
        assert has_typical or len(info["prohibited"]) == 0
|
||||
|
||||
|
||||
class TestValidateIntraNoteEdge:
    """UT-27: validate_intra_note_edge()."""

    @staticmethod
    def _validate(kind, source_chunk, target_chunk, strict):
        """Run validate_intra_note_edge for a chunk1 -> chunk2 edge of the given kind."""
        from app.core.ingestion.ingestion_validation import validate_intra_note_edge

        return validate_intra_note_edge(
            edge={"kind": kind, "source_id": "chunk1", "target_id": "chunk2"},
            source_chunk=source_chunk,
            target_chunk=target_chunk,
            strict_mode=strict,
        )

    def test_validate_typical_edge_returns_true(self):
        """A typical edge is accepted."""
        valid, conf, _why = self._validate(
            "resulted_in", {"type": "experience"}, {"type": "insight"}, False
        )

        assert valid is True
        # At least 0.7 (atypical) or 1.0 (typical), depending on the schema.
        assert conf >= 0.7

    def test_validate_atypical_edge_reduced_confidence(self):
        """An atypical edge is accepted with reduced confidence."""
        valid, conf, why = self._validate(
            "very_unusual_edge_type_xyz", {"type": "experience"}, {"type": "insight"}, False
        )

        assert valid is True
        assert conf == 0.7
        assert why is not None

    def test_validate_atypical_edge_strict_mode_rejected(self):
        """In strict mode an atypical edge is rejected."""
        valid, conf, _why = self._validate(
            "very_unusual_edge_type_xyz", {"type": "experience"}, {"type": "insight"}, True
        )

        assert valid is False
        assert conf == 0.0

    def test_validate_uses_effective_type(self):
        """Validation runs with chunks that carry both "type" and "note_type"."""
        # Both chunks carry "type" (effective) as well as "note_type".
        valid, _conf, _why = self._validate(
            "related_to",
            {"type": "insight", "note_type": "experience"},
            {"type": "decision", "note_type": "experience"},
            False,
        )

        # The edge is expected to be checked as insight -> decision.
        assert valid is True
|
||||
|
||||
|
||||
class TestValidateEdgeAgainstSchema:
    """UT-28: validate_edge_against_schema() wrapper."""

    def test_non_internal_edge_passes(self):
        """Non-internal edges pass without a schema check."""
        from app.core.ingestion.ingestion_validation import validate_edge_against_schema

        external_edge = {
            "kind": "references",
            "source_id": "note1#chunk1",
            "target_id": "note2#chunk1",
            "is_internal": False,
        }

        valid, result = validate_edge_against_schema(
            edge=external_edge, chunks_by_id={}, strict_mode=False
        )

        assert valid is True
        assert result == external_edge

    def test_internal_edge_validated(self):
        """Internal edges are validated against the schema."""
        from app.core.ingestion.ingestion_validation import validate_edge_against_schema

        internal_edge = {
            "kind": "derived_from",
            "source_id": "chunk1",
            "target_id": "chunk2",
            "is_internal": True,
            "confidence": 1.0,
        }
        chunks = {"chunk1": {"type": "insight"}, "chunk2": {"type": "experience"}}

        valid, _result = validate_edge_against_schema(
            edge=internal_edge, chunks_by_id=chunks, strict_mode=False
        )

        assert valid is True

    def test_missing_chunks_passes(self):
        """An edge whose chunks are unknown is allowed (integrity before precision)."""
        from app.core.ingestion.ingestion_validation import validate_edge_against_schema

        internal_edge = {
            "kind": "derived_from",
            "source_id": "chunk1",
            "target_id": "chunk2",
            "is_internal": True,
        }

        # No chunks registered at all.
        valid, _result = validate_edge_against_schema(
            edge=internal_edge, chunks_by_id={}, strict_mode=False
        )

        assert valid is True
|
||||
|
||||
|
||||
class TestSchemaValidationIntegration:
    """UT-29: Integration of the schema validation."""

    def test_clear_cache_clears_both_caches(self):
        """clear_graph_schema_cache() resets both module-level caches."""
        # The cache globals are read through the module object below. A
        # from-import of them would only bind a snapshot taken at import
        # time and never reflect the reset.
        import app.core.graph.graph_utils as utils_module
        from app.core.graph.graph_utils import (
            load_graph_schema,
            load_graph_schema_full,
            clear_graph_schema_cache,
        )

        # Load both schemas.
        load_graph_schema()
        load_graph_schema_full()

        # Clear the caches.
        clear_graph_schema_cache()

        assert utils_module._GRAPH_SCHEMA_CACHE is None
        assert utils_module._GRAPH_SCHEMA_FULL_CACHE is None

    def test_topology_info_consistent_with_typical_edges(self):
        """get_topology_info() agrees with get_typical_edge_for()."""
        from app.core.graph.graph_utils import (
            get_topology_info,
            get_typical_edge_for,
            clear_graph_schema_cache
        )

        clear_graph_schema_cache()

        # Checked for experience -> insight.
        topology = get_topology_info("experience", "insight")
        typical_edge = get_typical_edge_for("experience", "insight")

        # If get_typical_edge_for returns a value, it must be listed as typical.
        if typical_edge and topology["typical"]:
            assert typical_edge in topology["typical"]
|
||||
|
||||
|
||||
class TestConfidenceAdjustment:
    """UT-30: Confidence adjustment for atypical edges."""

    @staticmethod
    def _validate_atypical_edge():
        """Validate an internal experience -> insight edge with an unknown kind."""
        from app.core.ingestion.ingestion_validation import validate_edge_against_schema

        atypical_edge = {
            "kind": "completely_unknown_edge_type_xyz123",
            "source_id": "chunk1",
            "target_id": "chunk2",
            "is_internal": True,
            "confidence": 1.0,
        }
        chunks = {"chunk1": {"type": "experience"}, "chunk2": {"type": "insight"}}

        return validate_edge_against_schema(
            edge=atypical_edge, chunks_by_id=chunks, strict_mode=False
        )

    def test_atypical_edge_confidence_reduced(self):
        """An atypical edge gets the reduced confidence (0.7)."""
        valid, result = self._validate_atypical_edge()

        assert valid is True
        # Reduced to 0.7 because the edge is atypical.
        assert result.get("confidence") == 0.7

    def test_schema_validation_note_added(self):
        """An atypical edge gets a validation note attached."""
        _valid, result = self._validate_atypical_edge()

        assert "schema_validation_note" in result
|
||||
|
||||
|
||||
# Script entry point: when this file is executed directly, run it through
# pytest in verbose mode.
if __name__ == "__main__":
    pytest.main([__file__, "-v"])
|
||||
Loading…
Reference in New Issue
Block a user