Compare commits

..

No commits in common. "c68c7404cfafac819d5185faaefec8a6d2adc34c" and "2d43e0596c99a9bacbfc7eac078f93ba073af32b" have entirely different histories.

3 changed files with 43 additions and 128 deletions

View File

@ -4,9 +4,8 @@ DESCRIPTION: Haupt-Ingestion-Logik. Transformiert Markdown in den Graphen (Notes
FIX: Korrekte Priorisierung von Frontmatter für chunk_profile und retriever_weight. FIX: Korrekte Priorisierung von Frontmatter für chunk_profile und retriever_weight.
Lade Chunk-Config basierend auf dem effektiven Profil, nicht nur dem Notiz-Typ. Lade Chunk-Config basierend auf dem effektiven Profil, nicht nur dem Notiz-Typ.
WP-22: Integration von Content Lifecycle (Status Gate) und Edge Registry Validation. WP-22: Integration von Content Lifecycle (Status Gate) und Edge Registry Validation.
WP-22: Kontextsensitive Kanten-Validierung mit Fundort-Reporting (Zeilennummern).
WP-22: Multi-Hash Refresh für konsistente Change Detection. WP-22: Multi-Hash Refresh für konsistente Change Detection.
VERSION: 2.9.0 (WP-22 Full Integration: Context-Aware Registry) VERSION: 2.8.6 (WP-22 Lifecycle & Registry)
STATUS: Active STATUS: Active
DEPENDENCIES: app.core.parser, app.core.note_payload, app.core.chunker, app.core.derive_edges, app.core.qdrant*, app.services.embeddings_client, app.services.edge_registry DEPENDENCIES: app.core.parser, app.core.note_payload, app.core.chunker, app.core.derive_edges, app.core.qdrant*, app.services.embeddings_client, app.services.edge_registry
EXTERNAL_CONFIG: config/types.yaml EXTERNAL_CONFIG: config/types.yaml
@ -22,7 +21,6 @@ from app.core.parser import (
read_markdown, read_markdown,
normalize_frontmatter, normalize_frontmatter,
validate_required_frontmatter, validate_required_frontmatter,
extract_edges_with_context, # WP-22: Neue Funktion für Zeilennummern
) )
from app.core.note_payload import make_note_payload from app.core.note_payload import make_note_payload
from app.core.chunker import assemble_chunks, get_chunk_config from app.core.chunker import assemble_chunks, get_chunk_config
@ -160,7 +158,7 @@ class IngestionService:
logger.error(f"Validation failed for {file_path}: {e}") logger.error(f"Validation failed for {file_path}: {e}")
return {**result, "error": f"Validation failed: {str(e)}"} return {**result, "error": f"Validation failed: {str(e)}"}
# --- WP-22: Content Lifecycle Gate --- # --- WP-22: Content Lifecycle Gate (Teil A) ---
status = fm.get("status", "draft").lower().strip() status = fm.get("status", "draft").lower().strip()
# Hard Skip für System- oder Archiv-Dateien # Hard Skip für System- oder Archiv-Dateien
@ -231,9 +229,6 @@ class IngestionService:
try: try:
body_text = getattr(parsed, "body", "") or "" body_text = getattr(parsed, "body", "") or ""
# WP-22: Sicherstellen, dass die Registry aktuell ist (Lazy Reload)
edge_registry.ensure_latest()
# Konfiguration für das spezifische Profil laden # Konfiguration für das spezifische Profil laden
chunk_config = self._get_chunk_config_by_profile(effective_profile, note_type) chunk_config = self._get_chunk_config_by_profile(effective_profile, note_type)
@ -256,47 +251,26 @@ class IngestionService:
logger.error(f"Embedding failed: {e}") logger.error(f"Embedding failed: {e}")
raise RuntimeError(f"Embedding failed: {e}") raise RuntimeError(f"Embedding failed: {e}")
# --- WP-22: Kanten-Extraktion & Validierung --- # Kanten generieren
# A. Explizite User-Kanten mit Zeilennummern extrahieren
explicit_edges = extract_edges_with_context(parsed)
# B. System-Kanten generieren (Struktur: belongs_to, next, prev)
try: try:
raw_system_edges = build_edges_for_note( raw_edges = build_edges_for_note(
note_id, note_id,
chunk_pls, chunk_pls,
note_level_references=note_pl.get("references", []), note_level_references=note_pl.get("references", []),
include_note_scope_refs=note_scope_refs include_note_scope_refs=note_scope_refs
) )
except TypeError: except TypeError:
raw_system_edges = build_edges_for_note(note_id, chunk_pls) raw_edges = build_edges_for_note(note_id, chunk_pls)
# C. Alle Kanten validieren und über die Registry mappen # --- WP-22: Edge Registry Validation (Teil B) ---
edges = [] edges = []
context = {"file": file_path, "note_id": note_id} if raw_edges:
for edge in raw_edges:
# Zuerst User-Kanten (provenance="explicit") original_kind = edge.get("kind", "related_to")
for e in explicit_edges: # Normalisierung über die Registry (Alias-Auflösung)
valid_kind = edge_registry.resolve( canonical_kind = edge_registry.resolve(original_kind)
edge_type=e["kind"], edge["kind"] = canonical_kind
provenance="explicit", edges.append(edge)
context={**context, "line": e.get("line")}
)
e["kind"] = valid_kind
edges.append(e)
# Dann System-Kanten (provenance="structure")
for e in raw_system_edges:
# Sicherstellen, dass System-Kanten korrekt markiert sind
valid_kind = edge_registry.resolve(
edge_type=e.get("kind", "belongs_to"),
provenance="structure",
context={**context, "line": "system"}
)
e["kind"] = valid_kind
# Nur hinzufügen, wenn die Registry einen validen Typ zurückgibt
if valid_kind:
edges.append(e)
except Exception as e: except Exception as e:
logger.error(f"Processing failed: {e}", exc_info=True) logger.error(f"Processing failed: {e}", exc_info=True)

View File

@ -231,20 +231,3 @@ def extract_wikilinks(text: str) -> List[str]:
if raw: if raw:
out.append(raw) out.append(raw)
return out return out
def extract_edges_with_context(note: "ParsedNote") -> List[Dict[str, Any]]:
    """Extract explicit relation edges from a note body, with line numbers.

    Scans the body line by line for inline relation links of the form
    ``[[rel:<kind> <target>]]`` and returns one dict per match.

    Args:
        note: Parsed note whose ``body`` attribute holds the markdown text.
            A missing or ``None`` body is treated as empty text.

    Returns:
        A list of dicts in document order, each with the keys ``kind``
        (relation type as written), ``target`` (link target, stripped),
        ``line`` (1-based line number within the body) and ``provenance``
        (always ``"explicit"``).
    """
    # Guard against a missing/None body so an empty note yields no edges
    # instead of raising AttributeError on ``None.splitlines()``.
    body = getattr(note, "body", "") or ""

    # Matches [[rel:type Target]].
    # NOTE(review): the target class excludes "|" and "#", but the pattern
    # requires "]]" directly after the target, so links carrying an alias or
    # heading anchor are not matched at all - confirm this is intended.
    rel_pattern = re.compile(r"\[\[rel:([a-zA-Z0-9_-]+)\s+([^\]|#]+)\]\]")

    return [
        {
            "kind": match.group(1).strip(),
            "target": match.group(2).strip(),
            "line": line_no,
            "provenance": "explicit",
        }
        for line_no, line in enumerate(body.splitlines(), start=1)
        for match in rel_pattern.finditer(line)
    ]

View File

@ -1,15 +1,16 @@
""" """
FILE: app/services/edge_registry.py FILE: app/services/edge_registry.py
DESCRIPTION: Single Source of Truth für Kanten-Typen mit dynamischem Reload. DESCRIPTION: Single Source of Truth für Kanten-Typen.
WP-22: Transparente Status-Meldungen für Dev-Umgebungen. FIX: Regex angepasst auf Format **`canonical`** (Bold + Backticks).
VERSION: 0.7.2 (Fix: Restore Console Visibility & Entry Counts) VERSION: 0.6.10 (Regex Precision Update)
""" """
import re import re
import os import os
import json import json
import logging import logging
import time from typing import Dict, Optional, Set
from typing import Dict, Optional, Set, Tuple
print(">>> MODULE_LOAD: edge_registry.py initialized <<<", flush=True)
from app.config import get_settings from app.config import get_settings
@ -17,8 +18,6 @@ logger = logging.getLogger(__name__)
class EdgeRegistry: class EdgeRegistry:
_instance = None _instance = None
# System-Kanten, die NIEMALS manuell im Markdown stehen dürfen
FORBIDDEN_SYSTEM_EDGES = {"next", "prev", "belongs_to"}
def __new__(cls, *args, **kwargs): def __new__(cls, *args, **kwargs):
if cls._instance is None: if cls._instance is None:
@ -34,50 +33,31 @@ class EdgeRegistry:
env_vocab_path = os.getenv("MINDNET_VOCAB_PATH") env_vocab_path = os.getenv("MINDNET_VOCAB_PATH")
env_vault_root = os.getenv("MINDNET_VAULT_ROOT") or getattr(settings, "MINDNET_VAULT_ROOT", "./vault") env_vault_root = os.getenv("MINDNET_VAULT_ROOT") or getattr(settings, "MINDNET_VAULT_ROOT", "./vault")
# Pfad-Priorität: 1. ENV -> 2. _system/dictionary -> 3. 01_User_Manual
if env_vocab_path: if env_vocab_path:
self.full_vocab_path = os.path.abspath(env_vocab_path) self.full_vocab_path = os.path.abspath(env_vocab_path)
else: else:
possible_paths = [ self.full_vocab_path = os.path.abspath(
os.path.join(env_vault_root, "_system", "dictionary", "edge_vocabulary.md"),
os.path.join(env_vault_root, "01_User_Manual", "01_edge_vocabulary.md") os.path.join(env_vault_root, "01_User_Manual", "01_edge_vocabulary.md")
] )
self.full_vocab_path = None
for p in possible_paths:
if os.path.exists(p):
self.full_vocab_path = os.path.abspath(p)
break
if not self.full_vocab_path:
self.full_vocab_path = os.path.abspath(possible_paths[0])
self.unknown_log_path = "data/logs/unknown_edges.jsonl" self.unknown_log_path = "data/logs/unknown_edges.jsonl"
self.canonical_map: Dict[str, str] = {} self.canonical_map: Dict[str, str] = {}
self.valid_types: Set[str] = set() self.valid_types: Set[str] = set()
self._last_mtime = 0.0
# Initialer Lade-Versuch mit Konsolen-Feedback self._load_vocabulary()
print(f"\n>>> [EDGE-REGISTRY] Initializing with Path: {self.full_vocab_path}", flush=True)
self.ensure_latest()
self.initialized = True self.initialized = True
def ensure_latest(self): def _load_vocabulary(self):
"""Prüft den Zeitstempel und lädt bei Bedarf neu.""" """Parst die Markdown-Tabelle im Vault."""
print(f">>> CHECK: Loading Vocabulary from {self.full_vocab_path}", flush=True)
if not os.path.exists(self.full_vocab_path): if not os.path.exists(self.full_vocab_path):
print(f"!!! [EDGE-REGISTRY ERROR] File not found: {self.full_vocab_path} !!!", flush=True) print(f"!!! [DICT-ERROR] File not found: {self.full_vocab_path} !!!", flush=True)
return return
current_mtime = os.path.getmtime(self.full_vocab_path) # WP-22 Precision Regex:
if current_mtime > self._last_mtime: # Sucht nach | **`typ`** | oder | **typ** |
self._load_vocabulary() # Die Backticks `? sind jetzt optional enthalten.
self._last_mtime = current_mtime
def _load_vocabulary(self):
"""Parst das Wörterbuch und meldet die Anzahl der gelesenen Einträge."""
self.canonical_map.clear()
self.valid_types.clear()
# Regex deckt | **canonical** | Aliase | ab
pattern = re.compile(r"\|\s*\*\*`?([a-zA-Z0-9_-]+)`?\*\*\s*\|\s*([^|]+)\|") pattern = re.compile(r"\|\s*\*\*`?([a-zA-Z0-9_-]+)`?\*\*\s*\|\s*([^|]+)\|")
try: try:
@ -94,58 +74,36 @@ class EdgeRegistry:
c_types += 1 c_types += 1
if aliases_str and "Kein Alias" not in aliases_str: if aliases_str and "Kein Alias" not in aliases_str:
# Aliase säubern (entfernt Backticks auch hier)
aliases = [a.strip() for a in aliases_str.split(",") if a.strip()] aliases = [a.strip() for a in aliases_str.split(",") if a.strip()]
for alias in aliases: for alias in aliases:
# Normalisierung: Kleinschreibung und Unterstriche
clean_alias = alias.replace("`", "").lower().strip().replace(" ", "_") clean_alias = alias.replace("`", "").lower().strip().replace(" ", "_")
self.canonical_map[clean_alias] = canonical self.canonical_map[clean_alias] = canonical
c_aliases += 1 c_aliases += 1
# Erfolgskontrolle für das Dev-Terminal if c_types == 0:
print(f"=== [EDGE-REGISTRY SUCCESS] Loaded {c_types} Canonical Types and {c_aliases} Aliases ===", flush=True) print("!!! [DICT-WARN] Pattern mismatch! Ensure types are **`canonical`** or **canonical**. !!!", flush=True)
logger.info(f"Registry reloaded from {self.full_vocab_path}") else:
print(f"=== [DICT-SUCCESS] Registered {c_types} Canonical Types and {c_aliases} Aliases ===", flush=True)
except Exception as e: except Exception as e:
print(f"!!! [EDGE-REGISTRY FATAL] Error reading file: {e} !!!", flush=True) print(f"!!! [DICT-FATAL] Error reading file: {e} !!!", flush=True)
logger.error(f"Error reading vocabulary: {e}")
def resolve(self, edge_type: str, provenance: str = "explicit", context: dict = None) -> str: def resolve(self, edge_type: str) -> str:
"""Validierung mit Fundort-Logging.""" """Normalisiert Kanten-Typen via Registry oder loggt Unbekannte."""
self.ensure_latest()
if not edge_type: return "related_to" if not edge_type: return "related_to"
clean_type = edge_type.lower().strip().replace(" ", "_").replace("-", "_") clean_type = edge_type.lower().strip().replace(" ", "_").replace("-", "_")
ctx = context or {}
# 1. Schutz der Systemkanten (Verbot für manuelle Nutzung)
if provenance == "explicit" and clean_type in self.FORBIDDEN_SYSTEM_EDGES:
self._log_issue(clean_type, "forbidden_system_usage", ctx)
return "related_to"
# 2. Akzeptanz interner Strukturkanten
if provenance == "structure" and clean_type in self.FORBIDDEN_SYSTEM_EDGES:
return clean_type
# 3. Mapping via Wörterbuch
if clean_type in self.canonical_map: if clean_type in self.canonical_map:
return self.canonical_map[clean_type] return self.canonical_map[clean_type]
# 4. Unbekannte Kante self._log_unknown(clean_type)
self._log_issue(clean_type, "unknown_type", ctx)
return clean_type return clean_type
def _log_issue(self, edge_type: str, error_kind: str, ctx: dict): def _log_unknown(self, edge_type: str):
"""Detailliertes JSONL-Logging für Debugging."""
try: try:
os.makedirs(os.path.dirname(self.unknown_log_path), exist_ok=True) os.makedirs(os.path.dirname(self.unknown_log_path), exist_ok=True)
entry = { entry = {"unknown_type": edge_type, "status": "new"}
"timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
"edge_type": edge_type,
"error": error_kind,
"file": ctx.get("file", "unknown"),
"line": ctx.get("line", "unknown"),
"note_id": ctx.get("note_id", "unknown")
}
with open(self.unknown_log_path, "a", encoding="utf-8") as f: with open(self.unknown_log_path, "a", encoding="utf-8") as f:
f.write(json.dumps(entry) + "\n") f.write(json.dumps(entry) + "\n")
except Exception: pass except Exception: pass