WP22_Korrektur #13

Merged
Lars merged 3 commits from WP22_Korrektur into main 2025-12-23 12:26:44 +01:00
3 changed files with 128 additions and 43 deletions

View File

@ -4,8 +4,9 @@ DESCRIPTION: Haupt-Ingestion-Logik. Transformiert Markdown in den Graphen (Notes
FIX: Korrekte Priorisierung von Frontmatter für chunk_profile und retriever_weight. FIX: Korrekte Priorisierung von Frontmatter für chunk_profile und retriever_weight.
Lade Chunk-Config basierend auf dem effektiven Profil, nicht nur dem Notiz-Typ. Lade Chunk-Config basierend auf dem effektiven Profil, nicht nur dem Notiz-Typ.
WP-22: Integration von Content Lifecycle (Status Gate) und Edge Registry Validation. WP-22: Integration von Content Lifecycle (Status Gate) und Edge Registry Validation.
WP-22: Kontextsensitive Kanten-Validierung mit Fundort-Reporting (Zeilennummern).
WP-22: Multi-Hash Refresh für konsistente Change Detection. WP-22: Multi-Hash Refresh für konsistente Change Detection.
VERSION: 2.8.6 (WP-22 Lifecycle & Registry) VERSION: 2.9.0 (WP-22 Full Integration: Context-Aware Registry)
STATUS: Active STATUS: Active
DEPENDENCIES: app.core.parser, app.core.note_payload, app.core.chunker, app.core.derive_edges, app.core.qdrant*, app.services.embeddings_client, app.services.edge_registry DEPENDENCIES: app.core.parser, app.core.note_payload, app.core.chunker, app.core.derive_edges, app.core.qdrant*, app.services.embeddings_client, app.services.edge_registry
EXTERNAL_CONFIG: config/types.yaml EXTERNAL_CONFIG: config/types.yaml
@ -21,6 +22,7 @@ from app.core.parser import (
read_markdown, read_markdown,
normalize_frontmatter, normalize_frontmatter,
validate_required_frontmatter, validate_required_frontmatter,
extract_edges_with_context, # WP-22: Neue Funktion für Zeilennummern
) )
from app.core.note_payload import make_note_payload from app.core.note_payload import make_note_payload
from app.core.chunker import assemble_chunks, get_chunk_config from app.core.chunker import assemble_chunks, get_chunk_config
@ -158,7 +160,7 @@ class IngestionService:
logger.error(f"Validation failed for {file_path}: {e}") logger.error(f"Validation failed for {file_path}: {e}")
return {**result, "error": f"Validation failed: {str(e)}"} return {**result, "error": f"Validation failed: {str(e)}"}
# --- WP-22: Content Lifecycle Gate (Teil A) --- # --- WP-22: Content Lifecycle Gate ---
status = fm.get("status", "draft").lower().strip() status = fm.get("status", "draft").lower().strip()
# Hard Skip für System- oder Archiv-Dateien # Hard Skip für System- oder Archiv-Dateien
@ -229,6 +231,9 @@ class IngestionService:
try: try:
body_text = getattr(parsed, "body", "") or "" body_text = getattr(parsed, "body", "") or ""
# WP-22: Sicherstellen, dass die Registry aktuell ist (Lazy Reload)
edge_registry.ensure_latest()
# Konfiguration für das spezifische Profil laden # Konfiguration für das spezifische Profil laden
chunk_config = self._get_chunk_config_by_profile(effective_profile, note_type) chunk_config = self._get_chunk_config_by_profile(effective_profile, note_type)
@ -251,26 +256,47 @@ class IngestionService:
logger.error(f"Embedding failed: {e}") logger.error(f"Embedding failed: {e}")
raise RuntimeError(f"Embedding failed: {e}") raise RuntimeError(f"Embedding failed: {e}")
# Kanten generieren # --- WP-22: Kanten-Extraktion & Validierung ---
# A. Explizite User-Kanten mit Zeilennummern extrahieren
explicit_edges = extract_edges_with_context(parsed)
# B. System-Kanten generieren (Struktur: belongs_to, next, prev)
try: try:
raw_edges = build_edges_for_note( raw_system_edges = build_edges_for_note(
note_id, note_id,
chunk_pls, chunk_pls,
note_level_references=note_pl.get("references", []), note_level_references=note_pl.get("references", []),
include_note_scope_refs=note_scope_refs include_note_scope_refs=note_scope_refs
) )
except TypeError: except TypeError:
raw_edges = build_edges_for_note(note_id, chunk_pls) raw_system_edges = build_edges_for_note(note_id, chunk_pls)
# --- WP-22: Edge Registry Validation (Teil B) --- # C. Alle Kanten validieren und über die Registry mappen
edges = [] edges = []
if raw_edges: context = {"file": file_path, "note_id": note_id}
for edge in raw_edges:
original_kind = edge.get("kind", "related_to") # Zuerst User-Kanten (provenance="explicit")
# Normalisierung über die Registry (Alias-Auflösung) for e in explicit_edges:
canonical_kind = edge_registry.resolve(original_kind) valid_kind = edge_registry.resolve(
edge["kind"] = canonical_kind edge_type=e["kind"],
edges.append(edge) provenance="explicit",
context={**context, "line": e.get("line")}
)
e["kind"] = valid_kind
edges.append(e)
# Dann System-Kanten (provenance="structure")
for e in raw_system_edges:
# Sicherstellen, dass System-Kanten korrekt markiert sind
valid_kind = edge_registry.resolve(
edge_type=e.get("kind", "belongs_to"),
provenance="structure",
context={**context, "line": "system"}
)
e["kind"] = valid_kind
# Nur hinzufügen, wenn die Registry einen validen Typ zurückgibt
if valid_kind:
edges.append(e)
except Exception as e: except Exception as e:
logger.error(f"Processing failed: {e}", exc_info=True) logger.error(f"Processing failed: {e}", exc_info=True)

View File

@ -231,3 +231,20 @@ def extract_wikilinks(text: str) -> List[str]:
if raw: if raw:
out.append(raw) out.append(raw)
return out return out
# Matches typed relation links of the form [[rel:<kind> <target>]].
# Compiled once at import time instead of on every call.
_REL_EDGE_PATTERN = re.compile(r"\[\[rel:([a-zA-Z0-9_-]+)\s+([^\]|#]+)\]\]")


def extract_edges_with_context(note: "ParsedNote") -> List[Dict[str, Any]]:
    """Extract explicit, typed edges together with the line they appear on.

    Scans the note body line by line for ``[[rel:<kind> <target>]]`` links.

    Args:
        note: Parsed note object; only its ``body`` attribute is read. A
            missing or ``None`` body is treated as empty text.

    Returns:
        One dict per link, in document order, with the keys ``kind``,
        ``target``, ``line`` (1-based line number within the body) and
        ``provenance`` (always ``"explicit"``).
    """
    # The body may be None for notes without content; never crash on that.
    body = getattr(note, "body", None) or ""

    edges: List[Dict[str, Any]] = []
    for line_no, line in enumerate(body.splitlines(), start=1):
        for match in _REL_EDGE_PATTERN.finditer(line):
            kind = match.group(1).strip()
            target = match.group(2).strip()
            # The pattern can match a whitespace-only target ("[[rel:x   ]]");
            # an edge without a target is meaningless, so drop it.
            if not target:
                continue
            edges.append({
                "kind": kind,
                "target": target,
                "line": line_no,
                "provenance": "explicit",
            })
    return edges

View File

@ -1,16 +1,15 @@
""" """
FILE: app/services/edge_registry.py FILE: app/services/edge_registry.py
DESCRIPTION: Single Source of Truth für Kanten-Typen. DESCRIPTION: Single Source of Truth für Kanten-Typen mit dynamischem Reload.
FIX: Regex angepasst auf Format **`canonical`** (Bold + Backticks). WP-22: Transparente Status-Meldungen für Dev-Umgebungen.
VERSION: 0.6.10 (Regex Precision Update) VERSION: 0.7.2 (Fix: Restore Console Visibility & Entry Counts)
""" """
import re import re
import os import os
import json import json
import logging import logging
from typing import Dict, Optional, Set import time
from typing import Dict, Optional, Set, Tuple
print(">>> MODULE_LOAD: edge_registry.py initialized <<<", flush=True)
from app.config import get_settings from app.config import get_settings
@ -18,6 +17,8 @@ logger = logging.getLogger(__name__)
class EdgeRegistry: class EdgeRegistry:
_instance = None _instance = None
# System-Kanten, die NIEMALS manuell im Markdown stehen dürfen
FORBIDDEN_SYSTEM_EDGES = {"next", "prev", "belongs_to"}
def __new__(cls, *args, **kwargs): def __new__(cls, *args, **kwargs):
if cls._instance is None: if cls._instance is None:
@ -33,31 +34,50 @@ class EdgeRegistry:
env_vocab_path = os.getenv("MINDNET_VOCAB_PATH") env_vocab_path = os.getenv("MINDNET_VOCAB_PATH")
env_vault_root = os.getenv("MINDNET_VAULT_ROOT") or getattr(settings, "MINDNET_VAULT_ROOT", "./vault") env_vault_root = os.getenv("MINDNET_VAULT_ROOT") or getattr(settings, "MINDNET_VAULT_ROOT", "./vault")
# Pfad-Priorität: 1. ENV -> 2. _system/dictionary -> 3. 01_User_Manual
if env_vocab_path: if env_vocab_path:
self.full_vocab_path = os.path.abspath(env_vocab_path) self.full_vocab_path = os.path.abspath(env_vocab_path)
else: else:
self.full_vocab_path = os.path.abspath( possible_paths = [
os.path.join(env_vault_root, "_system", "dictionary", "edge_vocabulary.md"),
os.path.join(env_vault_root, "01_User_Manual", "01_edge_vocabulary.md") os.path.join(env_vault_root, "01_User_Manual", "01_edge_vocabulary.md")
) ]
self.full_vocab_path = None
for p in possible_paths:
if os.path.exists(p):
self.full_vocab_path = os.path.abspath(p)
break
if not self.full_vocab_path:
self.full_vocab_path = os.path.abspath(possible_paths[0])
self.unknown_log_path = "data/logs/unknown_edges.jsonl" self.unknown_log_path = "data/logs/unknown_edges.jsonl"
self.canonical_map: Dict[str, str] = {} self.canonical_map: Dict[str, str] = {}
self.valid_types: Set[str] = set() self.valid_types: Set[str] = set()
self._last_mtime = 0.0
self._load_vocabulary() # Initialer Lade-Versuch mit Konsolen-Feedback
print(f"\n>>> [EDGE-REGISTRY] Initializing with Path: {self.full_vocab_path}", flush=True)
self.ensure_latest()
self.initialized = True self.initialized = True
def ensure_latest(self):
    """Reload the vocabulary when the file on disk has changed.

    Compares the file's modification time with the one recorded at the last
    load and calls ``_load_vocabulary()`` when they differ. If the file cannot
    be stat'ed, an error is printed and the currently loaded vocabulary is
    left untouched.
    """
    # Ask for the mtime directly instead of exists() + getmtime(): the file
    # may disappear between the two calls.
    try:
        current_mtime = os.path.getmtime(self.full_vocab_path)
    except OSError:
        print(f"!!! [EDGE-REGISTRY ERROR] File not found: {self.full_vocab_path} !!!", flush=True)
        return

    # Any change counts, not only a newer timestamp: a file restored from a
    # backup or checked out from VCS may carry an older mtime.
    if current_mtime != self._last_mtime:
        self._load_vocabulary()
        self._last_mtime = current_mtime
def _load_vocabulary(self):
"""Parst das Wörterbuch und meldet die Anzahl der gelesenen Einträge."""
self.canonical_map.clear()
self.valid_types.clear()
# Regex deckt | **canonical** | Aliase | ab
pattern = re.compile(r"\|\s*\*\*`?([a-zA-Z0-9_-]+)`?\*\*\s*\|\s*([^|]+)\|") pattern = re.compile(r"\|\s*\*\*`?([a-zA-Z0-9_-]+)`?\*\*\s*\|\s*([^|]+)\|")
try: try:
@ -74,36 +94,58 @@ class EdgeRegistry:
c_types += 1 c_types += 1
if aliases_str and "Kein Alias" not in aliases_str: if aliases_str and "Kein Alias" not in aliases_str:
# Aliase säubern (entfernt Backticks auch hier)
aliases = [a.strip() for a in aliases_str.split(",") if a.strip()] aliases = [a.strip() for a in aliases_str.split(",") if a.strip()]
for alias in aliases: for alias in aliases:
# Normalisierung: Kleinschreibung und Unterstriche
clean_alias = alias.replace("`", "").lower().strip().replace(" ", "_") clean_alias = alias.replace("`", "").lower().strip().replace(" ", "_")
self.canonical_map[clean_alias] = canonical self.canonical_map[clean_alias] = canonical
c_aliases += 1 c_aliases += 1
if c_types == 0: # Erfolgskontrolle für das Dev-Terminal
print("!!! [DICT-WARN] Pattern mismatch! Ensure types are **`canonical`** or **canonical**. !!!", flush=True) print(f"=== [EDGE-REGISTRY SUCCESS] Loaded {c_types} Canonical Types and {c_aliases} Aliases ===", flush=True)
else: logger.info(f"Registry reloaded from {self.full_vocab_path}")
print(f"=== [DICT-SUCCESS] Registered {c_types} Canonical Types and {c_aliases} Aliases ===", flush=True)
except Exception as e: except Exception as e:
print(f"!!! [DICT-FATAL] Error reading file: {e} !!!", flush=True) print(f"!!! [EDGE-REGISTRY FATAL] Error reading file: {e} !!!", flush=True)
logger.error(f"Error reading vocabulary: {e}")
def resolve(self, edge_type: str, provenance: str = "explicit", context: Optional[dict] = None) -> str:
    """Normalize an edge type and map it to its canonical name.

    Args:
        edge_type: Raw edge type as written by the user or produced by the
            system. Empty, ``None`` or whitespace-only values fall back to
            ``"related_to"``.
        provenance: ``"explicit"`` for edges written in Markdown,
            ``"structure"`` for edges generated by the system.
        context: Optional location info passed through to the issue log
            (the log reads the keys ``file``, ``line`` and ``note_id``).

    Returns:
        The canonical edge type, ``"related_to"`` for empty input or for a
        system edge used explicitly, or the normalized input when the type
        is unknown (unknown types are logged).
    """
    self.ensure_latest()

    # Normalize before the emptiness check so that whitespace-only input is
    # treated like a missing type instead of yielding an empty edge kind.
    clean_type = (edge_type or "").lower().strip().replace(" ", "_").replace("-", "_")
    if not clean_type:
        return "related_to"

    ctx = context or {}

    if clean_type in self.FORBIDDEN_SYSTEM_EDGES:
        # 1. System edges must never be written manually: log and downgrade.
        if provenance == "explicit":
            self._log_issue(clean_type, "forbidden_system_usage", ctx)
            return "related_to"
        # 2. Structural edges generated by the system are accepted as-is.
        if provenance == "structure":
            return clean_type

    # 3. Alias/canonical mapping via the vocabulary.
    if clean_type in self.canonical_map:
        return self.canonical_map[clean_type]

    # 4. Unknown edge type: record where it was found, pass it through.
    self._log_issue(clean_type, "unknown_type", ctx)
    return clean_type
def _log_issue(self, edge_type: str, error_kind: str, ctx: dict):
    """Append one JSONL record describing a problematic edge type.

    Logging is best-effort: a failure to write never propagates to the
    caller, but it is reported through the module logger instead of being
    dropped silently.

    Args:
        edge_type: The normalized edge type that caused the issue.
        error_kind: Short machine-readable reason, e.g. ``"unknown_type"``.
        ctx: Location info; the keys ``file``, ``line`` and ``note_id`` are
            read, each defaulting to ``"unknown"``.
    """
    ctx = ctx or {}
    entry = {
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
        "edge_type": edge_type,
        "error": error_kind,
        "file": ctx.get("file", "unknown"),
        "line": ctx.get("line", "unknown"),
        "note_id": ctx.get("note_id", "unknown"),
    }
    try:
        # dirname is "" for a bare file name; os.makedirs("") would raise.
        log_dir = os.path.dirname(self.unknown_log_path)
        if log_dir:
            os.makedirs(log_dir, exist_ok=True)
        with open(self.unknown_log_path, "a", encoding="utf-8") as f:
            # default=str: context values such as pathlib.Path are not JSON
            # serializable; stringify them rather than lose the record.
            f.write(json.dumps(entry, default=str) + "\n")
    except Exception as exc:
        # Deliberately broad: issue logging must never break ingestion.
        logging.getLogger(__name__).warning(
            "Could not write edge issue log %s: %s", self.unknown_log_path, exc
        )