neue parser logik

This commit is contained in:
Lars 2025-12-23 07:31:33 +01:00
parent 2d43e0596c
commit 99482ad65a
3 changed files with 107 additions and 48 deletions

View File

@ -4,8 +4,9 @@ DESCRIPTION: Haupt-Ingestion-Logik. Transformiert Markdown in den Graphen (Notes
FIX: Korrekte Priorisierung von Frontmatter für chunk_profile und retriever_weight. FIX: Korrekte Priorisierung von Frontmatter für chunk_profile und retriever_weight.
Lade Chunk-Config basierend auf dem effektiven Profil, nicht nur dem Notiz-Typ. Lade Chunk-Config basierend auf dem effektiven Profil, nicht nur dem Notiz-Typ.
WP-22: Integration von Content Lifecycle (Status Gate) und Edge Registry Validation. WP-22: Integration von Content Lifecycle (Status Gate) und Edge Registry Validation.
WP-22: Kontextsensitive Kanten-Validierung mit Fundort-Reporting (Zeilennummern).
WP-22: Multi-Hash Refresh für konsistente Change Detection. WP-22: Multi-Hash Refresh für konsistente Change Detection.
VERSION: 2.8.6 (WP-22 Lifecycle & Registry) VERSION: 2.9.0 (WP-22 Full Integration: Context-Aware Registry)
STATUS: Active STATUS: Active
DEPENDENCIES: app.core.parser, app.core.note_payload, app.core.chunker, app.core.derive_edges, app.core.qdrant*, app.services.embeddings_client, app.services.edge_registry DEPENDENCIES: app.core.parser, app.core.note_payload, app.core.chunker, app.core.derive_edges, app.core.qdrant*, app.services.embeddings_client, app.services.edge_registry
EXTERNAL_CONFIG: config/types.yaml EXTERNAL_CONFIG: config/types.yaml
@ -21,6 +22,7 @@ from app.core.parser import (
read_markdown, read_markdown,
normalize_frontmatter, normalize_frontmatter,
validate_required_frontmatter, validate_required_frontmatter,
extract_edges_with_context, # WP-22: Neue Funktion für Zeilennummern
) )
from app.core.note_payload import make_note_payload from app.core.note_payload import make_note_payload
from app.core.chunker import assemble_chunks, get_chunk_config from app.core.chunker import assemble_chunks, get_chunk_config
@ -158,7 +160,7 @@ class IngestionService:
logger.error(f"Validation failed for {file_path}: {e}") logger.error(f"Validation failed for {file_path}: {e}")
return {**result, "error": f"Validation failed: {str(e)}"} return {**result, "error": f"Validation failed: {str(e)}"}
# --- WP-22: Content Lifecycle Gate (Teil A) --- # --- WP-22: Content Lifecycle Gate ---
status = fm.get("status", "draft").lower().strip() status = fm.get("status", "draft").lower().strip()
# Hard Skip für System- oder Archiv-Dateien # Hard Skip für System- oder Archiv-Dateien
@ -229,6 +231,9 @@ class IngestionService:
try: try:
body_text = getattr(parsed, "body", "") or "" body_text = getattr(parsed, "body", "") or ""
# WP-22: Sicherstellen, dass die Registry aktuell ist (Lazy Reload)
edge_registry.ensure_latest()
# Konfiguration für das spezifische Profil laden # Konfiguration für das spezifische Profil laden
chunk_config = self._get_chunk_config_by_profile(effective_profile, note_type) chunk_config = self._get_chunk_config_by_profile(effective_profile, note_type)
@ -251,26 +256,47 @@ class IngestionService:
logger.error(f"Embedding failed: {e}") logger.error(f"Embedding failed: {e}")
raise RuntimeError(f"Embedding failed: {e}") raise RuntimeError(f"Embedding failed: {e}")
# Kanten generieren # --- WP-22: Kanten-Extraktion & Validierung ---
# A. Explizite User-Kanten mit Zeilennummern extrahieren
explicit_edges = extract_edges_with_context(parsed)
# B. System-Kanten generieren (Struktur: belongs_to, next, prev)
try: try:
raw_edges = build_edges_for_note( raw_system_edges = build_edges_for_note(
note_id, note_id,
chunk_pls, chunk_pls,
note_level_references=note_pl.get("references", []), note_level_references=note_pl.get("references", []),
include_note_scope_refs=note_scope_refs include_note_scope_refs=note_scope_refs
) )
except TypeError: except TypeError:
raw_edges = build_edges_for_note(note_id, chunk_pls) raw_system_edges = build_edges_for_note(note_id, chunk_pls)
# --- WP-22: Edge Registry Validation (Teil B) --- # C. Alle Kanten validieren und über die Registry mappen
edges = [] edges = []
if raw_edges: context = {"file": file_path, "note_id": note_id}
for edge in raw_edges:
original_kind = edge.get("kind", "related_to") # Zuerst User-Kanten (provenance="explicit")
# Normalisierung über die Registry (Alias-Auflösung) for e in explicit_edges:
canonical_kind = edge_registry.resolve(original_kind) valid_kind = edge_registry.resolve(
edge["kind"] = canonical_kind edge_type=e["kind"],
edges.append(edge) provenance="explicit",
context={**context, "line": e.get("line")}
)
e["kind"] = valid_kind
edges.append(e)
# Dann System-Kanten (provenance="structure")
for e in raw_system_edges:
# Sicherstellen, dass System-Kanten korrekt markiert sind
valid_kind = edge_registry.resolve(
edge_type=e.get("kind", "belongs_to"),
provenance="structure",
context={**context, "line": "system"}
)
e["kind"] = valid_kind
# Nur hinzufügen, wenn die Registry einen validen Typ zurückgibt
if valid_kind:
edges.append(e)
except Exception as e: except Exception as e:
logger.error(f"Processing failed: {e}", exc_info=True) logger.error(f"Processing failed: {e}", exc_info=True)

View File

@ -231,3 +231,20 @@ def extract_wikilinks(text: str) -> List[str]:
if raw: if raw:
out.append(raw) out.append(raw)
return out return out
# Matches explicit relation links of the form [[rel:<type> <target>]].
# Target may not contain ']', '|' or '#' (pipe/anchor syntax is excluded).
_REL_EDGE_PATTERN = re.compile(r"\[\[rel:([a-zA-Z0-9_-]+)\s+([^\]|#]+)\]\]")


def extract_edges_with_context(note: "ParsedNote") -> List[Dict[str, Any]]:
    """Extract explicit edge declarations from a note body with source locations.

    Scans every line of ``note.body`` for ``[[rel:type target]]`` markers and
    returns one dict per match with:
      - ``kind``: the raw edge type (unresolved; registry mapping happens later)
      - ``target``: the link target, stripped of surrounding whitespace
      - ``line``: 1-based line number within the body (for error reporting)
      - ``provenance``: always ``"explicit"`` (user-authored, not system-generated)

    Returns an empty list when the note has no body (missing or None attribute),
    mirroring the defensive ``getattr(..., "body", "") or ""`` access used by
    the ingestion pipeline.
    """
    edges: List[Dict[str, Any]] = []
    body = getattr(note, "body", "") or ""
    for line_no, line in enumerate(body.splitlines(), start=1):
        for match in _REL_EDGE_PATTERN.finditer(line):
            edges.append({
                "kind": match.group(1).strip(),
                "target": match.group(2).strip(),
                "line": line_no,
                "provenance": "explicit",
            })
    return edges

View File

@ -1,16 +1,9 @@
"""
FILE: app/services/edge_registry.py
DESCRIPTION: Single Source of Truth für Kanten-Typen.
FIX: Regex angepasst auf Format **`canonical`** (Bold + Backticks).
VERSION: 0.6.10 (Regex Precision Update)
"""
import re import re
import os import os
import json import json
import logging import logging
from typing import Dict, Optional, Set import time
from typing import Dict, Optional, Set, Tuple
print(">>> MODULE_LOAD: edge_registry.py initialized <<<", flush=True)
from app.config import get_settings from app.config import get_settings
@ -18,6 +11,8 @@ logger = logging.getLogger(__name__)
class EdgeRegistry: class EdgeRegistry:
_instance = None _instance = None
# System-Kanten, die NIEMALS manuell im Markdown stehen dürfen
FORBIDDEN_SYSTEM_EDGES = {"next", "prev", "belongs_to"}
def __new__(cls, *args, **kwargs): def __new__(cls, *args, **kwargs):
if cls._instance is None: if cls._instance is None:
@ -43,26 +38,30 @@ class EdgeRegistry:
self.unknown_log_path = "data/logs/unknown_edges.jsonl" self.unknown_log_path = "data/logs/unknown_edges.jsonl"
self.canonical_map: Dict[str, str] = {} self.canonical_map: Dict[str, str] = {}
self.valid_types: Set[str] = set() self.valid_types: Set[str] = set()
self._last_mtime = 0.0 # Für dynamisches Neuladen
self._load_vocabulary() self.ensure_latest()
self.initialized = True self.initialized = True
def _load_vocabulary(self): def ensure_latest(self):
"""Parst die Markdown-Tabelle im Vault.""" """Prüft den Zeitstempel der Datei und lädt ggf. neu."""
print(f">>> CHECK: Loading Vocabulary from {self.full_vocab_path}", flush=True)
if not os.path.exists(self.full_vocab_path): if not os.path.exists(self.full_vocab_path):
print(f"!!! [DICT-ERROR] File not found: {self.full_vocab_path} !!!", flush=True)
return return
# WP-22 Precision Regex: current_mtime = os.path.getmtime(self.full_vocab_path)
# Sucht nach | **`typ`** | oder | **typ** | if current_mtime > self._last_mtime:
# Die Backticks `? sind jetzt optional enthalten. self._load_vocabulary()
self._last_mtime = current_mtime
def _load_vocabulary(self):
"""Parst das Wörterbuch und baut die Canonical-Map auf."""
self.canonical_map.clear()
self.valid_types.clear()
pattern = re.compile(r"\|\s*\*\*`?([a-zA-Z0-9_-]+)`?\*\*\s*\|\s*([^|]+)\|") pattern = re.compile(r"\|\s*\*\*`?([a-zA-Z0-9_-]+)`?\*\*\s*\|\s*([^|]+)\|")
try: try:
with open(self.full_vocab_path, "r", encoding="utf-8") as f: with open(self.full_vocab_path, "r", encoding="utf-8") as f:
c_types, c_aliases = 0, 0
for line in f: for line in f:
match = pattern.search(line) match = pattern.search(line)
if match: if match:
@ -71,39 +70,56 @@ class EdgeRegistry:
self.valid_types.add(canonical) self.valid_types.add(canonical)
self.canonical_map[canonical] = canonical self.canonical_map[canonical] = canonical
c_types += 1
if aliases_str and "Kein Alias" not in aliases_str: if aliases_str and "Kein Alias" not in aliases_str:
# Aliase säubern (entfernt Backticks auch hier)
aliases = [a.strip() for a in aliases_str.split(",") if a.strip()] aliases = [a.strip() for a in aliases_str.split(",") if a.strip()]
for alias in aliases: for alias in aliases:
clean_alias = alias.replace("`", "").lower().strip().replace(" ", "_") clean_alias = alias.replace("`", "").lower().strip().replace(" ", "_")
self.canonical_map[clean_alias] = canonical self.canonical_map[clean_alias] = canonical
c_aliases += 1 logger.info(f"EdgeRegistry reloaded: {len(self.valid_types)} types.")
if c_types == 0:
print("!!! [DICT-WARN] Pattern mismatch! Ensure types are **`canonical`** or **canonical**. !!!", flush=True)
else:
print(f"=== [DICT-SUCCESS] Registered {c_types} Canonical Types and {c_aliases} Aliases ===", flush=True)
except Exception as e: except Exception as e:
print(f"!!! [DICT-FATAL] Error reading file: {e} !!!", flush=True) logger.error(f"Error reading vocabulary: {e}")
def resolve(self, edge_type: str) -> str: def resolve(self, edge_type: str, provenance: str = "explicit", context: dict = None) -> str:
"""Normalisiert Kanten-Typen via Registry oder loggt Unbekannte.""" """
Validiert Kanten gegen System-Regeln und Wörterbuch.
- provenance: 'explicit' (User/Markdown) oder 'structure' (System-Pipeline)
- context: {'file': str, 'line': int}
"""
self.ensure_latest()
if not edge_type: return "related_to" if not edge_type: return "related_to"
clean_type = edge_type.lower().strip().replace(" ", "_").replace("-", "_")
clean_type = edge_type.lower().strip().replace(" ", "_").replace("-", "_")
ctx = context or {}
# 1. Check auf "verbotene" manuelle Nutzung von Systemkanten
if provenance == "explicit" and clean_type in self.FORBIDDEN_SYSTEM_EDGES:
self._log_issue(clean_type, "forbidden_system_usage", ctx)
return "related_to" # Fallback, um System-Integrität zu schützen
# 2. Stillschweigende Akzeptanz für echte System-Struktur
if provenance == "structure" and clean_type in self.FORBIDDEN_SYSTEM_EDGES:
return clean_type
# 3. Mapping gegen das Wörterbuch
if clean_type in self.canonical_map: if clean_type in self.canonical_map:
return self.canonical_map[clean_type] return self.canonical_map[clean_type]
self._log_unknown(clean_type) # 4. Unbekannte Kante loggen
self._log_issue(clean_type, "unknown_type", ctx)
return clean_type return clean_type
def _log_unknown(self, edge_type: str): def _log_issue(self, edge_type: str, error_kind: str, ctx: dict):
"""Schreibt detaillierte Fehler mit Fundort."""
try: try:
os.makedirs(os.path.dirname(self.unknown_log_path), exist_ok=True) os.makedirs(os.path.dirname(self.unknown_log_path), exist_ok=True)
entry = {"unknown_type": edge_type, "status": "new"} entry = {
"timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
"edge_type": edge_type,
"error": error_kind,
"file": ctx.get("file", "unknown"),
"line": ctx.get("line", "unknown")
}
with open(self.unknown_log_path, "a", encoding="utf-8") as f: with open(self.unknown_log_path, "a", encoding="utf-8") as f:
f.write(json.dumps(entry) + "\n") f.write(json.dumps(entry) + "\n")
except Exception: pass except Exception: pass