Parametrisierung der wesentlichen Einstellwerte in der types.yaml
This commit is contained in:
parent
8b8baa27b3
commit
cd5383432e
|
|
@ -4,8 +4,8 @@ DESCRIPTION: Der zentrale IngestionService (Orchestrator).
|
||||||
WP-14: Vollständig modularisiert.
|
WP-14: Vollständig modularisiert.
|
||||||
WP-15b: Two-Pass Workflow mit globalem Kontext-Cache.
|
WP-15b: Two-Pass Workflow mit globalem Kontext-Cache.
|
||||||
WP-20/22: Cloud-Resilienz und Content-Lifecycle integriert.
|
WP-20/22: Cloud-Resilienz und Content-Lifecycle integriert.
|
||||||
AUDIT v2.13.4: 100% Logik-Erhalt (Parameters, Registry-Context, DB-Points).
|
AUDIT v2.13.7: Synchronisierung des Context-Scanners mit der Registry (WP-14).
|
||||||
VERSION: 2.13.4
|
VERSION: 2.13.7
|
||||||
STATUS: Active
|
STATUS: Active
|
||||||
"""
|
"""
|
||||||
import logging
|
import logging
|
||||||
|
|
@ -75,7 +75,9 @@ class IngestionService:
|
||||||
logger.info(f"🔍 [Pass 1] Pre-Scanning {len(file_paths)} files for Context Cache...")
|
logger.info(f"🔍 [Pass 1] Pre-Scanning {len(file_paths)} files for Context Cache...")
|
||||||
for path in file_paths:
|
for path in file_paths:
|
||||||
try:
|
try:
|
||||||
ctx = pre_scan_markdown(path)
|
# ANPASSUNG: Übergabe der Registry für dynamische Scan-Parameter (WP-14)
|
||||||
|
# Ermöglicht die Nutzung von summary_settings aus types.yaml
|
||||||
|
ctx = pre_scan_markdown(path, registry=self.registry)
|
||||||
if ctx:
|
if ctx:
|
||||||
# Mehrfache Indizierung für robusten Look-up (ID, Titel, Dateiname)
|
# Mehrfache Indizierung für robusten Look-up (ID, Titel, Dateiname)
|
||||||
self.batch_cache[ctx.note_id] = ctx
|
self.batch_cache[ctx.note_id] = ctx
|
||||||
|
|
@ -108,7 +110,12 @@ class IngestionService:
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return {**result, "error": f"Validation failed: {str(e)}"}
|
return {**result, "error": f"Validation failed: {str(e)}"}
|
||||||
|
|
||||||
if fm.get("status", "draft").lower().strip() in ["system", "template", "archive", "hidden"]:
|
# Dynamischer Lifecycle-Filter aus der Registry
|
||||||
|
ingest_cfg = self.registry.get("ingestion_settings", {})
|
||||||
|
ignore_list = ingest_cfg.get("ignore_statuses", ["system", "template", "archive", "hidden"])
|
||||||
|
|
||||||
|
current_status = fm.get("status", "draft").lower().strip()
|
||||||
|
if current_status in ignore_list:
|
||||||
return {**result, "status": "skipped", "reason": "lifecycle_filter"}
|
return {**result, "status": "skipped", "reason": "lifecycle_filter"}
|
||||||
|
|
||||||
# 2. Payload & Change Detection (Multi-Hash)
|
# 2. Payload & Change Detection (Multi-Hash)
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,7 @@
|
||||||
"""
|
"""
|
||||||
FILE: app/core/ingestion/ingestion_utils.py
|
FILE: app/core/ingestion/ingestion_utils.py
|
||||||
DESCRIPTION: Hilfswerkzeuge für JSON-Recovery, Typ-Registry und Konfigurations-Lookups.
|
DESCRIPTION: Hilfswerkzeuge für JSON-Recovery, Typ-Registry und Konfigurations-Lookups.
|
||||||
|
AUDIT v2.13.7: Dynamisierung von Cleanup-Patterns und Default-Typen (WP-14).
|
||||||
"""
|
"""
|
||||||
import os
|
import os
|
||||||
import json
|
import json
|
||||||
|
|
@ -8,16 +9,27 @@ import re
|
||||||
import yaml
|
import yaml
|
||||||
from typing import Any, Optional, Dict
|
from typing import Any, Optional, Dict
|
||||||
|
|
||||||
def extract_json_from_response(text: str) -> Any:
|
def extract_json_from_response(text: str, registry: Optional[dict] = None) -> Any:
|
||||||
"""
|
"""
|
||||||
Extrahiert JSON-Daten und bereinigt LLM-Steuerzeichen (v2.11.14 Logic).
|
Extrahiert JSON-Daten und bereinigt LLM-Steuerzeichen (v2.11.14 Logic).
|
||||||
Entfernt <s>, [OUT], [/OUT] und Markdown-Blöcke für maximale Robustheit.
|
WP-14: Nutzt nun dynamische cleanup_patterns aus der Registry.
|
||||||
"""
|
"""
|
||||||
if not text or not isinstance(text, str):
|
if not text or not isinstance(text, str):
|
||||||
return []
|
return []
|
||||||
|
|
||||||
clean = text.replace("<s>", "").replace("</s>", "")
|
# Fallback-Patterns für die Bereinigung
|
||||||
clean = clean.replace("[OUT]", "").replace("[/OUT]", "")
|
patterns = ["<s>", "</s>", "[OUT]", "[/OUT]"]
|
||||||
|
|
||||||
|
# Falls keine Registry übergeben wurde, versuchen wir sie zu laden
|
||||||
|
reg = registry or load_type_registry()
|
||||||
|
if reg:
|
||||||
|
# Lade Patterns aus llm_settings (WP-14 Erweiterung)
|
||||||
|
patterns = reg.get("llm_settings", {}).get("cleanup_patterns", patterns)
|
||||||
|
|
||||||
|
clean = text
|
||||||
|
for p in patterns:
|
||||||
|
clean = clean.replace(p, "")
|
||||||
|
|
||||||
clean = clean.strip()
|
clean = clean.strip()
|
||||||
|
|
||||||
match = re.search(r"```(?:json)?\s*(.*?)\s*```", clean, re.DOTALL)
|
match = re.search(r"```(?:json)?\s*(.*?)\s*```", clean, re.DOTALL)
|
||||||
|
|
@ -52,10 +64,17 @@ def load_type_registry(custom_path: Optional[str] = None) -> dict:
|
||||||
except Exception: return {}
|
except Exception: return {}
|
||||||
|
|
||||||
def resolve_note_type(registry: dict, requested: Optional[str]) -> str:
|
def resolve_note_type(registry: dict, requested: Optional[str]) -> str:
|
||||||
"""Bestimmt den finalen Notiz-Typ (Fallback auf 'concept')."""
|
"""
|
||||||
|
Bestimmt den finalen Notiz-Typ.
|
||||||
|
WP-14: Fallback wird nun über ingestion_settings.default_note_type gesteuert.
|
||||||
|
"""
|
||||||
types = registry.get("types", {})
|
types = registry.get("types", {})
|
||||||
if requested and requested in types: return requested
|
if requested and requested in types:
|
||||||
return "concept"
|
return requested
|
||||||
|
|
||||||
|
# Dynamischer Fallback aus der Registry (Standard: 'concept')
|
||||||
|
ingest_cfg = registry.get("ingestion_settings", {})
|
||||||
|
return ingest_cfg.get("default_note_type", "concept")
|
||||||
|
|
||||||
def get_chunk_config_by_profile(registry: dict, profile_name: str, note_type: str) -> Dict[str, Any]:
|
def get_chunk_config_by_profile(registry: dict, profile_name: str, note_type: str) -> Dict[str, Any]:
|
||||||
"""Holt die Chunker-Parameter für ein spezifisches Profil aus der Registry."""
|
"""Holt die Chunker-Parameter für ein spezifisches Profil aus der Registry."""
|
||||||
|
|
|
||||||
|
|
@ -1,21 +1,36 @@
|
||||||
"""
|
"""
|
||||||
FILE: app/core/parsing/parsing_scanner.py
|
FILE: app/core/parsing/parsing_scanner.py
|
||||||
DESCRIPTION: Pre-Scan für den LocalBatchCache (Pass 1).
|
DESCRIPTION: Pre-Scan für den LocalBatchCache (Pass 1).
|
||||||
|
AUDIT v1.1.0: Dynamisierung der Scan-Parameter (WP-14).
|
||||||
"""
|
"""
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
from typing import Optional
|
from typing import Optional, Dict, Any
|
||||||
from .parsing_models import NoteContext
|
from .parsing_models import NoteContext
|
||||||
from .parsing_markdown import read_markdown
|
from .parsing_markdown import read_markdown
|
||||||
|
|
||||||
def pre_scan_markdown(path: str) -> Optional[NoteContext]:
|
def pre_scan_markdown(path: str, registry: Optional[Dict[str, Any]] = None) -> Optional[NoteContext]:
|
||||||
"""Extrahiert Identität und Kurz-Kontext zur Validierung."""
|
"""
|
||||||
|
Extrahiert Identität und Kurz-Kontext zur Validierung.
|
||||||
|
WP-14: Scan-Tiefe und Summary-Länge sind nun über die Registry steuerbar.
|
||||||
|
"""
|
||||||
parsed = read_markdown(path)
|
parsed = read_markdown(path)
|
||||||
if not parsed: return None
|
if not parsed: return None
|
||||||
|
|
||||||
|
# WP-14: Konfiguration laden oder Standardwerte nutzen
|
||||||
|
reg = registry or {}
|
||||||
|
summary_cfg = reg.get("summary_settings", {})
|
||||||
|
scan_depth = summary_cfg.get("pre_scan_depth", 600)
|
||||||
|
max_len = summary_cfg.get("max_summary_length", 500)
|
||||||
|
|
||||||
fm = parsed.frontmatter
|
fm = parsed.frontmatter
|
||||||
|
# ID-Findung: Frontmatter ID oder Dateiname als Fallback
|
||||||
note_id = str(fm.get("id") or os.path.splitext(os.path.basename(path))[0])
|
note_id = str(fm.get("id") or os.path.splitext(os.path.basename(path))[0])
|
||||||
clean_body = re.sub(r'[#*`>]', '', parsed.body[:600]).strip()
|
|
||||||
summary = clean_body[:500] + "..." if len(clean_body) > 500 else clean_body
|
# Erstelle Kurz-Zusammenfassung mit dynamischen Limits
|
||||||
|
clean_body = re.sub(r'[#*`>]', '', parsed.body[:scan_depth]).strip()
|
||||||
|
summary = clean_body[:max_len] + "..." if len(clean_body) > max_len else clean_body
|
||||||
|
|
||||||
return NoteContext(
|
return NoteContext(
|
||||||
note_id=note_id,
|
note_id=note_id,
|
||||||
title=str(fm.get("title", note_id)),
|
title=str(fm.get("title", note_id)),
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,4 @@
|
||||||
version: 2.6.0 # Final WP-15 Config: Smart Edges & Strict/Soft Chunking
|
version: 2.7.0 # WP-14 Update: Dynamisierung der Ingestion-Pipeline
|
||||||
|
|
||||||
# ==============================================================================
|
# ==============================================================================
|
||||||
# 1. CHUNKING PROFILES
|
# 1. CHUNKING PROFILES
|
||||||
|
|
@ -76,7 +76,32 @@ defaults:
|
||||||
edge_defaults: []
|
edge_defaults: []
|
||||||
|
|
||||||
# ==============================================================================
|
# ==============================================================================
|
||||||
# 3. TYPE DEFINITIONS
|
# 3. INGESTION SETTINGS (WP-14 Dynamization)
|
||||||
|
# ==============================================================================
|
||||||
|
# Steuert, welche Notizen verarbeitet werden und wie Fallbacks aussehen.
|
||||||
|
ingestion_settings:
|
||||||
|
# Liste der Status-Werte, die beim Import ignoriert werden sollen.
|
||||||
|
ignore_statuses: ["system", "template", "archive", "hidden"]
|
||||||
|
# Standard-Typ, falls im Frontmatter kein Typ oder ein nicht in 'types' definierter Typ angegeben ist.
|
||||||
|
default_note_type: "concept"
|
||||||
|
|
||||||
|
# ==============================================================================
|
||||||
|
# 4. SUMMARY & SCAN SETTINGS
|
||||||
|
# ==============================================================================
|
||||||
|
# Steuert die Tiefe des Pre-Scans für den Context-Cache.
|
||||||
|
summary_settings:
|
||||||
|
max_summary_length: 500
|
||||||
|
pre_scan_depth: 600
|
||||||
|
|
||||||
|
# ==============================================================================
|
||||||
|
# 5. LLM SETTINGS
|
||||||
|
# ==============================================================================
|
||||||
|
# Steuerzeichen und Patterns zur Bereinigung der LLM-Antworten.
|
||||||
|
llm_settings:
|
||||||
|
cleanup_patterns: ["<s>", "</s>", "[OUT]", "[/OUT]", "```json", "```"]
|
||||||
|
|
||||||
|
# ==============================================================================
|
||||||
|
# 6. TYPE DEFINITIONS
|
||||||
# ==============================================================================
|
# ==============================================================================
|
||||||
|
|
||||||
types:
|
types:
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user