app/core/type_registry.py aktualisiert
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 3s
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 3s
This commit is contained in:
parent
53591b6f27
commit
f4be219790
|
|
@@ -1,111 +1,117 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
"""
|
"""
|
||||||
type_registry.py v1.0.0
|
Modul: app/core/type_registry.py
|
||||||
|
Version: 1.0.0
|
||||||
|
Datum: 2025-11-08
|
||||||
|
|
||||||
Zweck:
|
Zweck
|
||||||
- Optionale, konfigurierbare Type-Registry laden (YAML/JSON), um pro "type"
|
-----
|
||||||
(aus Frontmatter) Chunk-Profile, Default-Edges und optionale
|
Lädt eine optionale Typ-Registry (config/types.yaml) und stellt
|
||||||
Retriever-Gewichte bereitzustellen – ohne bestehende Funktionen zu brechen.
|
komfortable Zugriffsfunktionen bereit. Die Registry ist *optional*:
|
||||||
|
- Fehlt die Datei oder ist das YAML defekt, wird ein konservativer
|
||||||
|
Default (Typ "concept") verwendet und es wird eine Warnung ausgegeben.
|
||||||
|
- Änderungen an der Datei greifen nach einem Neustart des Prozesses.
|
||||||
|
|
||||||
Kompatibilität:
|
Öffentliche API
|
||||||
- Keine Abhängigkeiten von anderen Modulen.
|
---------------
|
||||||
- Keine harten Fehler, wenn Registry fehlt oder unvollständig ist.
|
- load_type_registry(path: str = "config/types.yaml") -> dict
|
||||||
|
- get_type_config(note_type: str, reg: dict) -> dict
|
||||||
|
- resolve_note_type(fm_type: str | None, reg: dict) -> str
|
||||||
|
- effective_chunk_profile(note_type: str, reg: dict) -> str | None
|
||||||
|
- profile_overlap(profile: str | None) -> tuple[int,int] # nur Overlap-Empfehlung
|
||||||
|
|
||||||
Nutzung:
|
Hinweis
|
||||||
from app.core.type_registry import (
|
-------
|
||||||
load_type_registry, resolve_chunk_profile, get_edge_defaults_for_type,
|
Die Registry steuert KEINE Breaking Changes. Ohne Datei/Typ bleibt das
|
||||||
get_retriever_weight_for_type
|
Verhalten exakt wie im Release-Stand 20251105.
|
||||||
)
|
|
||||||
|
|
||||||
Umgebungsvariablen:
|
|
||||||
TYPE_REGISTRY_PATH (default: "config/types.yaml")
|
|
||||||
|
|
||||||
Format (Beispiel):
|
|
||||||
version: 1.0
|
|
||||||
types:
|
|
||||||
concept:
|
|
||||||
chunk_profile: long
|
|
||||||
edge_defaults: [references, related_to]
|
|
||||||
retriever_weight: 1.0
|
|
||||||
task:
|
|
||||||
chunk_profile: short
|
|
||||||
edge_defaults: [depends_on, belongs_to]
|
|
||||||
retriever_weight: 0.8
|
|
||||||
experience:
|
|
||||||
chunk_profile: medium
|
|
||||||
edge_defaults: [derived_from, inspired_by]
|
|
||||||
retriever_weight: 0.9
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from functools import lru_cache
|
||||||
|
from typing import Dict, Any, Optional, Tuple
|
||||||
import os
|
import os
|
||||||
import json
|
|
||||||
from typing import Any, Dict, List, Optional
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import yaml # type: ignore
|
import yaml # PyYAML
|
||||||
except Exception: # yaml ist optional; JSON wird ebenfalls unterstützt
|
except Exception:
|
||||||
yaml = None # type: ignore
|
yaml = None # wird erst benötigt, wenn eine Datei gelesen werden soll
|
||||||
|
|
||||||
_CACHE: Dict[str, Dict[str, Any]] = {}
|
# Konservativer Default – bewusst minimal
|
||||||
|
_DEFAULT_REGISTRY: Dict[str, Any] = {
|
||||||
|
"version": "1.0",
|
||||||
|
"types": {
|
||||||
|
"concept": {
|
||||||
|
"chunk_profile": "medium",
|
||||||
|
"edge_defaults": ["references", "related_to"],
|
||||||
|
"retriever_weight": 1.0,
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"_using_defaults": True,
|
||||||
|
"_warning": "types.yaml missing or invalid – using built-in defaults (type=concept).",
|
||||||
|
}
|
||||||
|
|
||||||
|
# Chunk-Profile → Overlap-Empfehlungen (nur für synthetische Fensterbildung)
|
||||||
|
# Die absoluten Chunk-Längen bleiben Aufgabe des Chunkers (assemble_chunks).
|
||||||
|
_PROFILE_TO_OVERLAP: Dict[str, Tuple[int, int]] = {
|
||||||
|
"short": (20, 30),
|
||||||
|
"medium": (40, 60),
|
||||||
|
"long": (60, 80),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache(maxsize=1)
|
||||||
|
def load_type_registry(path: str = "config/types.yaml") -> Dict[str, Any]:
|
||||||
|
"""
|
||||||
|
Lädt die Registry aus 'path'. Bei Fehlern wird ein konservativer Default geliefert.
|
||||||
|
Die Rückgabe ist *prozessweit* gecached.
|
||||||
|
"""
|
||||||
|
if not path:
|
||||||
|
return dict(_DEFAULT_REGISTRY)
|
||||||
|
|
||||||
|
if not os.path.isfile(path):
|
||||||
|
return dict(_DEFAULT_REGISTRY)
|
||||||
|
|
||||||
|
if yaml is None:
|
||||||
|
# PyYAML fehlt → auf Default zurückfallen
|
||||||
|
return dict(_DEFAULT_REGISTRY)
|
||||||
|
|
||||||
def _safe_load_yaml_or_json(path: str) -> Dict[str, Any]:
|
|
||||||
if not os.path.exists(path):
|
|
||||||
return {}
|
|
||||||
try:
|
try:
|
||||||
with open(path, "r", encoding="utf-8") as f:
|
with open(path, "r", encoding="utf-8") as f:
|
||||||
data = f.read()
|
data = yaml.safe_load(f) or {}
|
||||||
|
# Minimal validieren
|
||||||
|
if not isinstance(data, dict) or "types" not in data or not isinstance(data["types"], dict):
|
||||||
|
return dict(_DEFAULT_REGISTRY)
|
||||||
|
data.setdefault("version", "1.0")
|
||||||
|
data.setdefault("_using_defaults", False)
|
||||||
|
return data
|
||||||
except Exception:
|
except Exception:
|
||||||
return {}
|
return dict(_DEFAULT_REGISTRY)
|
||||||
# YAML bevorzugen, wenn verfügbar und Datei nach YAML aussieht
|
|
||||||
if path.lower().endswith((".yaml", ".yml")) and yaml is not None:
|
|
||||||
try:
|
|
||||||
return yaml.safe_load(data) or {}
|
|
||||||
except Exception:
|
|
||||||
return {}
|
|
||||||
# JSON fallback
|
|
||||||
try:
|
|
||||||
return json.loads(data)
|
|
||||||
except Exception:
|
|
||||||
return {}
|
|
||||||
|
|
||||||
def load_type_registry(path: Optional[str] = None) -> Dict[str, Any]:
|
|
||||||
"""
|
|
||||||
Lädt einmalig die Registry und cached sie.
|
|
||||||
Fehlt sie, wird ein leeres Dict geliefert (keine Fehler).
|
|
||||||
"""
|
|
||||||
key = path or os.getenv("TYPE_REGISTRY_PATH", "config/types.yaml")
|
|
||||||
if key in _CACHE:
|
|
||||||
return _CACHE[key]
|
|
||||||
obj = _safe_load_yaml_or_json(key)
|
|
||||||
if not isinstance(obj, dict):
|
|
||||||
obj = {}
|
|
||||||
_CACHE[key] = obj
|
|
||||||
return obj
|
|
||||||
|
|
||||||
def _types_map(reg: Dict[str, Any]) -> Dict[str, Any]:
|
def get_type_config(note_type: Optional[str], reg: Dict[str, Any]) -> Dict[str, Any]:
|
||||||
return reg.get("types", {}) if isinstance(reg, dict) else {}
|
t = (note_type or "concept").strip().lower()
|
||||||
|
types = (reg or {}).get("types", {}) if isinstance(reg, dict) else {}
|
||||||
|
return types.get(t) or types.get("concept") or _DEFAULT_REGISTRY["types"]["concept"]
|
||||||
|
|
||||||
def resolve_chunk_profile(note_type: str, default_profile: str = "default") -> str:
|
|
||||||
reg = load_type_registry()
|
|
||||||
tmap = _types_map(reg)
|
|
||||||
entry = tmap.get(note_type, {})
|
|
||||||
return str(entry.get("chunk_profile", default_profile))
|
|
||||||
|
|
||||||
def get_edge_defaults_for_type(note_type: str) -> List[str]:
|
def resolve_note_type(fm_type: Optional[str], reg: Dict[str, Any]) -> str:
|
||||||
reg = load_type_registry()
|
"""Liefert einen gültigen Typ (unbekannt → 'concept')."""
|
||||||
tmap = _types_map(reg)
|
t = (fm_type or "concept").strip().lower()
|
||||||
entry = tmap.get(note_type, {})
|
types = (reg or {}).get("types", {}) if isinstance(reg, dict) else {}
|
||||||
v = entry.get("edge_defaults", [])
|
return t if t in types else "concept"
|
||||||
if not isinstance(v, list):
|
|
||||||
return []
|
|
||||||
return [str(x) for x in v]
|
|
||||||
|
|
||||||
def get_retriever_weight_for_type(note_type: str) -> Optional[float]:
|
|
||||||
reg = load_type_registry()
|
def effective_chunk_profile(note_type: Optional[str], reg: Dict[str, Any]) -> Optional[str]:
|
||||||
tmap = _types_map(reg)
|
cfg = get_type_config(note_type, reg)
|
||||||
entry = tmap.get(note_type, {})
|
prof = cfg.get("chunk_profile")
|
||||||
v = entry.get("retriever_weight", None)
|
if isinstance(prof, str) and prof.strip():
|
||||||
try:
|
return prof.strip().lower()
|
||||||
return float(v) if v is not None else None
|
|
||||||
except Exception:
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def profile_overlap(profile: Optional[str]) -> Tuple[int, int]:
|
||||||
|
"""Gibt eine Overlap-Empfehlung (low, high) für das Profil zurück."""
|
||||||
|
if not profile:
|
||||||
|
return _PROFILE_TO_OVERLAP["medium"]
|
||||||
|
return _PROFILE_TO_OVERLAP.get(profile.strip().lower(), _PROFILE_TO_OVERLAP["medium"])
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user