Parametrisierung der wesentlichen Einstellwerte in der types.yaml
This commit is contained in:
parent
8b8baa27b3
commit
cd5383432e
|
|
@ -4,8 +4,8 @@ DESCRIPTION: Der zentrale IngestionService (Orchestrator).
|
|||
WP-14: Vollständig modularisiert.
|
||||
WP-15b: Two-Pass Workflow mit globalem Kontext-Cache.
|
||||
WP-20/22: Cloud-Resilienz und Content-Lifecycle integriert.
|
||||
AUDIT v2.13.4: 100% Logik-Erhalt (Parameters, Registry-Context, DB-Points).
|
||||
VERSION: 2.13.4
|
||||
AUDIT v2.13.7: Synchronisierung des Context-Scanners mit der Registry (WP-14).
|
||||
VERSION: 2.13.7
|
||||
STATUS: Active
|
||||
"""
|
||||
import logging
|
||||
|
|
@ -75,7 +75,9 @@ class IngestionService:
|
|||
logger.info(f"🔍 [Pass 1] Pre-Scanning {len(file_paths)} files for Context Cache...")
|
||||
for path in file_paths:
|
||||
try:
|
||||
ctx = pre_scan_markdown(path)
|
||||
# ANPASSUNG: Übergabe der Registry für dynamische Scan-Parameter (WP-14)
|
||||
# Ermöglicht die Nutzung von summary_settings aus types.yaml
|
||||
ctx = pre_scan_markdown(path, registry=self.registry)
|
||||
if ctx:
|
||||
# Mehrfache Indizierung für robusten Look-up (ID, Titel, Dateiname)
|
||||
self.batch_cache[ctx.note_id] = ctx
|
||||
|
|
@ -108,7 +110,12 @@ class IngestionService:
|
|||
except Exception as e:
|
||||
return {**result, "error": f"Validation failed: {str(e)}"}
|
||||
|
||||
if fm.get("status", "draft").lower().strip() in ["system", "template", "archive", "hidden"]:
|
||||
# Dynamischer Lifecycle-Filter aus der Registry
|
||||
ingest_cfg = self.registry.get("ingestion_settings", {})
|
||||
ignore_list = ingest_cfg.get("ignore_statuses", ["system", "template", "archive", "hidden"])
|
||||
|
||||
current_status = fm.get("status", "draft").lower().strip()
|
||||
if current_status in ignore_list:
|
||||
return {**result, "status": "skipped", "reason": "lifecycle_filter"}
|
||||
|
||||
# 2. Payload & Change Detection (Multi-Hash)
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
"""
|
||||
FILE: app/core/ingestion/ingestion_utils.py
|
||||
DESCRIPTION: Hilfswerkzeuge für JSON-Recovery, Typ-Registry und Konfigurations-Lookups.
|
||||
AUDIT v2.13.7: Dynamisierung von Cleanup-Patterns und Default-Typen (WP-14).
|
||||
"""
|
||||
import os
|
||||
import json
|
||||
|
|
@ -8,16 +9,27 @@ import re
|
|||
import yaml
|
||||
from typing import Any, Optional, Dict
|
||||
|
||||
def extract_json_from_response(text: str) -> Any:
|
||||
def extract_json_from_response(text: str, registry: Optional[dict] = None) -> Any:
|
||||
"""
|
||||
Extrahiert JSON-Daten und bereinigt LLM-Steuerzeichen (v2.11.14 Logic).
|
||||
Entfernt <s>, [OUT], [/OUT] und Markdown-Blöcke für maximale Robustheit.
|
||||
WP-14: Nutzt nun dynamische cleanup_patterns aus der Registry.
|
||||
"""
|
||||
if not text or not isinstance(text, str):
|
||||
return []
|
||||
|
||||
clean = text.replace("<s>", "").replace("</s>", "")
|
||||
clean = clean.replace("[OUT]", "").replace("[/OUT]", "")
|
||||
# Fallback-Patterns für die Bereinigung
|
||||
patterns = ["<s>", "</s>", "[OUT]", "[/OUT]"]
|
||||
|
||||
# Falls keine Registry übergeben wurde, versuchen wir sie zu laden
|
||||
reg = registry or load_type_registry()
|
||||
if reg:
|
||||
# Lade Patterns aus llm_settings (WP-14 Erweiterung)
|
||||
patterns = reg.get("llm_settings", {}).get("cleanup_patterns", patterns)
|
||||
|
||||
clean = text
|
||||
for p in patterns:
|
||||
clean = clean.replace(p, "")
|
||||
|
||||
clean = clean.strip()
|
||||
|
||||
match = re.search(r"```(?:json)?\s*(.*?)\s*```", clean, re.DOTALL)
|
||||
|
|
@ -52,10 +64,17 @@ def load_type_registry(custom_path: Optional[str] = None) -> dict:
|
|||
except Exception: return {}
|
||||
|
||||
def resolve_note_type(registry: dict, requested: Optional[str]) -> str:
|
||||
"""Bestimmt den finalen Notiz-Typ (Fallback auf 'concept')."""
|
||||
"""
|
||||
Bestimmt den finalen Notiz-Typ.
|
||||
WP-14: Fallback wird nun über ingestion_settings.default_note_type gesteuert.
|
||||
"""
|
||||
types = registry.get("types", {})
|
||||
if requested and requested in types: return requested
|
||||
return "concept"
|
||||
if requested and requested in types:
|
||||
return requested
|
||||
|
||||
# Dynamischer Fallback aus der Registry (Standard: 'concept')
|
||||
ingest_cfg = registry.get("ingestion_settings", {})
|
||||
return ingest_cfg.get("default_note_type", "concept")
|
||||
|
||||
def get_chunk_config_by_profile(registry: dict, profile_name: str, note_type: str) -> Dict[str, Any]:
|
||||
"""Holt die Chunker-Parameter für ein spezifisches Profil aus der Registry."""
|
||||
|
|
|
|||
|
|
@ -1,21 +1,36 @@
|
|||
"""
|
||||
FILE: app/core/parsing/parsing_scanner.py
|
||||
DESCRIPTION: Pre-Scan für den LocalBatchCache (Pass 1).
|
||||
AUDIT v1.1.0: Dynamisierung der Scan-Parameter (WP-14).
|
||||
"""
|
||||
import os
|
||||
import re
|
||||
from typing import Optional
|
||||
from typing import Optional, Dict, Any
|
||||
from .parsing_models import NoteContext
|
||||
from .parsing_markdown import read_markdown
|
||||
|
||||
def pre_scan_markdown(path: str) -> Optional[NoteContext]:
|
||||
"""Extrahiert Identität und Kurz-Kontext zur Validierung."""
|
||||
def pre_scan_markdown(path: str, registry: Optional[Dict[str, Any]] = None) -> Optional[NoteContext]:
|
||||
"""
|
||||
Extrahiert Identität und Kurz-Kontext zur Validierung.
|
||||
WP-14: Scan-Tiefe und Summary-Länge sind nun über die Registry steuerbar.
|
||||
"""
|
||||
parsed = read_markdown(path)
|
||||
if not parsed: return None
|
||||
|
||||
# WP-14: Konfiguration laden oder Standardwerte nutzen
|
||||
reg = registry or {}
|
||||
summary_cfg = reg.get("summary_settings", {})
|
||||
scan_depth = summary_cfg.get("pre_scan_depth", 600)
|
||||
max_len = summary_cfg.get("max_summary_length", 500)
|
||||
|
||||
fm = parsed.frontmatter
|
||||
# ID-Findung: Frontmatter ID oder Dateiname als Fallback
|
||||
note_id = str(fm.get("id") or os.path.splitext(os.path.basename(path))[0])
|
||||
clean_body = re.sub(r'[#*`>]', '', parsed.body[:600]).strip()
|
||||
summary = clean_body[:500] + "..." if len(clean_body) > 500 else clean_body
|
||||
|
||||
# Erstelle Kurz-Zusammenfassung mit dynamischen Limits
|
||||
clean_body = re.sub(r'[#*`>]', '', parsed.body[:scan_depth]).strip()
|
||||
summary = clean_body[:max_len] + "..." if len(clean_body) > max_len else clean_body
|
||||
|
||||
return NoteContext(
|
||||
note_id=note_id,
|
||||
title=str(fm.get("title", note_id)),
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
version: 2.6.0 # Final WP-15 Config: Smart Edges & Strict/Soft Chunking
|
||||
version: 2.7.0 # WP-14 Update: Dynamisierung der Ingestion-Pipeline
|
||||
|
||||
# ==============================================================================
|
||||
# 1. CHUNKING PROFILES
|
||||
|
|
@ -76,7 +76,32 @@ defaults:
|
|||
edge_defaults: []
|
||||
|
||||
# ==============================================================================
|
||||
# 3. TYPE DEFINITIONS
|
||||
# 3. INGESTION SETTINGS (WP-14 Dynamization)
|
||||
# ==============================================================================
|
||||
# Steuert, welche Notizen verarbeitet werden und wie Fallbacks aussehen.
|
||||
ingestion_settings:
|
||||
# Liste der Status-Werte, die beim Import ignoriert werden sollen.
|
||||
ignore_statuses: ["system", "template", "archive", "hidden"]
|
||||
# Standard-Typ, falls im Frontmatter kein oder ein unbekannter Typ angegeben ist.
|
||||
default_note_type: "concept"
|
||||
|
||||
# ==============================================================================
|
||||
# 4. SUMMARY & SCAN SETTINGS
|
||||
# ==============================================================================
|
||||
# Steuert die Tiefe des Pre-Scans für den Context-Cache.
|
||||
summary_settings:
|
||||
max_summary_length: 500
|
||||
pre_scan_depth: 600
|
||||
|
||||
# ==============================================================================
|
||||
# 5. LLM SETTINGS
|
||||
# ==============================================================================
|
||||
# Steuerzeichen und Patterns zur Bereinigung der LLM-Antworten.
|
||||
llm_settings:
|
||||
cleanup_patterns: ["<s>", "</s>", "[OUT]", "[/OUT]", "```json", "```"]
|
||||
|
||||
# ==============================================================================
|
||||
# 6. TYPE DEFINITIONS
|
||||
# ==============================================================================
|
||||
|
||||
types:
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user