aufräumen und löschen von Alt-Scripten WP19b
This commit is contained in:
parent
f08a331bc6
commit
e3858e8bc3
|
|
@ -1,176 +0,0 @@
|
||||||
"""
|
|
||||||
FILE: app/core/chunk_payload.py
|
|
||||||
DESCRIPTION: Baut das JSON-Objekt für 'mindnet_chunks'.
|
|
||||||
FEATURES:
|
|
||||||
- Inkludiert Nachbarschafts-IDs (prev/next) und Titel.
|
|
||||||
- FIX 3: Robuste Erkennung des Inputs (Frontmatter-Dict vs. Note-Objekt), damit Overrides ankommen.
|
|
||||||
VERSION: 2.3.0
|
|
||||||
STATUS: Active
|
|
||||||
DEPENDENCIES: yaml, os
|
|
||||||
EXTERNAL_CONFIG: config/types.yaml
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
from typing import Any, Dict, List, Optional
|
|
||||||
import os, yaml
|
|
||||||
|
|
||||||
def _env(n: str, d: Optional[str]=None) -> str:
|
|
||||||
v = os.getenv(n)
|
|
||||||
return v if v is not None else (d or "")
|
|
||||||
|
|
||||||
def _load_types() -> dict:
    """Load the type registry YAML named by MINDNET_TYPES_FILE.

    Defaults to ./config/types.yaml. Returns {} on any failure
    (missing file, parse error, empty document).
    """
    registry_path = _env("MINDNET_TYPES_FILE", "./config/types.yaml")
    try:
        with open(registry_path, "r", encoding="utf-8") as handle:
            data = yaml.safe_load(handle)
    except Exception:
        return {}
    return data or {}
|
|
||||||
|
|
||||||
def _get_types_map(reg: dict) -> dict:
|
|
||||||
if isinstance(reg, dict) and isinstance(reg.get("types"), dict):
|
|
||||||
return reg["types"]
|
|
||||||
return reg if isinstance(reg, dict) else {}
|
|
||||||
|
|
||||||
def _get_defaults(reg: dict) -> dict:
|
|
||||||
if isinstance(reg, dict) and isinstance(reg.get("defaults"), dict):
|
|
||||||
return reg["defaults"]
|
|
||||||
if isinstance(reg, dict) and isinstance(reg.get("global"), dict):
|
|
||||||
return reg["global"]
|
|
||||||
return {}
|
|
||||||
|
|
||||||
def _as_float(x: Any):
|
|
||||||
try: return float(x)
|
|
||||||
except Exception: return None
|
|
||||||
|
|
||||||
def _resolve_chunk_profile_from_config(note_type: str, reg: dict) -> Optional[str]:
    """Resolve the chunking profile for *note_type* from the registry.

    Lookup order: the type-level entry first, then registry defaults.
    Both the current key 'chunking_profile' and the legacy 'chunk_profile'
    are honoured. Returns None when no non-empty string is configured.
    """
    types_map = _get_types_map(reg)
    type_entry = types_map.get(note_type, {}) if isinstance(types_map, dict) else {}
    for entry in (type_entry, _get_defaults(reg)):
        if not isinstance(entry, dict):
            continue
        profile = entry.get("chunking_profile") or entry.get("chunk_profile")
        if isinstance(profile, str) and profile:
            return profile
    return None
|
|
||||||
|
|
||||||
def _resolve_retriever_weight_from_config(note_type: str, reg: dict) -> float:
    """Resolve the retriever weight from config only (type level > defaults).

    Called when the frontmatter carries no explicit weight.
    Falls back to 1.0 when neither level provides a usable number.
    """
    types_map = _get_types_map(reg)
    type_entry = types_map.get(note_type, {}) if isinstance(types_map, dict) else {}
    for entry in (type_entry, _get_defaults(reg)):
        if isinstance(entry, dict) and entry.get("retriever_weight") is not None:
            weight = _as_float(entry.get("retriever_weight"))
            if weight is not None:
                return float(weight)
    return 1.0
|
|
||||||
|
|
||||||
def _as_list(x):
|
|
||||||
if x is None: return []
|
|
||||||
if isinstance(x, list): return x
|
|
||||||
return [x]
|
|
||||||
|
|
||||||
def make_chunk_payloads(note: Dict[str, Any],
                        note_path: str,
                        chunks_from_chunker: List[Any],
                        *,
                        note_text: str = "",
                        types_cfg: Optional[dict] = None,
                        file_path: Optional[str] = None) -> List[Dict[str, Any]]:
    """Build the payload dicts stored in 'mindnet_chunks', one per chunk.

    The 'note' argument can be either:
      A) a container object/dict with a "frontmatter" key (legacy / tests), or
      B) the frontmatter dictionary itself (how ingestion.py calls it).

    Args:
        note: Frontmatter dict or container holding one (see above).
        note_path: Logical note path stored under "path".
        chunks_from_chunker: Chunk objects or dicts produced by the chunker.
        note_text: Unused here; accepted for call-site compatibility.
        types_cfg: Pre-loaded type registry; loaded from disk when absent.
        file_path: Physical source path; falls back to note_path.

    Returns:
        List of payload dicts, one per input chunk, in input order.
    """
    # --- FIX 3: intelligent detection of the input shape ---
    # Is 'note' a container WITH 'frontmatter', or IS it the frontmatter?
    if isinstance(note, dict) and "frontmatter" in note and isinstance(note["frontmatter"], dict):
        # Case A: container — unwrap it
        fm = note["frontmatter"]
    else:
        # Case B: plain frontmatter dict (ingestion.py calls it this way)
        fm = note or {}

    note_type = fm.get("type") or note.get("type") or "concept"

    # Title extraction (fallback chain: frontmatter title > note title > id)
    title = fm.get("title") or note.get("title") or fm.get("id") or "Untitled"

    reg = types_cfg if isinstance(types_cfg, dict) else _load_types()

    # --- Profile resolution ---
    # 'fm' is now correct, so frontmatter overrides are honoured here:
    cp = fm.get("chunking_profile") or fm.get("chunk_profile")
    if not cp:
        cp = _resolve_chunk_profile_from_config(note_type, reg)
    if not cp:
        cp = "sliding_standard"

    # --- Retriever weight resolution (frontmatter > config > 1.0) ---
    rw = fm.get("retriever_weight")
    if rw is None:
        rw = _resolve_retriever_weight_from_config(note_type, reg)
    try:
        rw = float(rw)
    except Exception:
        rw = 1.0

    tags = fm.get("tags") or []
    if isinstance(tags, str):
        tags = [tags]

    out: List[Dict[str, Any]] = []
    for idx, ch in enumerate(chunks_from_chunker):
        # Extract attributes; chunks may be objects or plain dicts.
        # NOTE(review): these `or` fallbacks treat falsy values (0, "") as
        # missing — e.g. an object chunk whose index is 0 silently falls
        # back to the enumerate index. Confirm chunker never emits falsy
        # values that must be preserved.
        cid = getattr(ch, "id", None) or (ch.get("id") if isinstance(ch, dict) else None)
        nid = getattr(ch, "note_id", None) or (ch.get("note_id") if isinstance(ch, dict) else fm.get("id"))
        index = getattr(ch, "index", None) or (ch.get("index") if isinstance(ch, dict) else idx)
        text = getattr(ch, "text", None) or (ch.get("text") if isinstance(ch, dict) else "")
        window = getattr(ch, "window", None) or (ch.get("window") if isinstance(ch, dict) else text)
        prev_id = getattr(ch, "neighbors_prev", None) or (ch.get("neighbors_prev") if isinstance(ch, dict) else None)
        next_id = getattr(ch, "neighbors_next", None) or (ch.get("neighbors_next") if isinstance(ch, dict) else None)

        pl: Dict[str, Any] = {
            "note_id": nid,
            "chunk_id": cid,
            "title": title,
            "index": int(index),
            "ord": int(index) + 1,  # 1-based position for display
            "type": note_type,
            "tags": tags,
            "text": text,
            "window": window,
            "neighbors_prev": _as_list(prev_id),
            "neighbors_next": _as_list(next_id),
            "section": getattr(ch, "section", None) or (ch.get("section") if isinstance(ch, dict) else ""),
            "path": note_path,
            "source_path": file_path or note_path,
            "retriever_weight": float(rw),
            "chunk_profile": cp,  # finally carries the override value
        }

        # Cleanup: drop legacy alias keys (defensive; never set above)
        for alias in ("chunk_num", "Chunk_Number"):
            pl.pop(alias, None)

        out.append(pl)

    return out
|
|
||||||
|
|
@ -1,10 +0,0 @@
|
||||||
"""
|
|
||||||
FILE: app/core/chunker.py
|
|
||||||
DESCRIPTION: Facade für das Chunking-Package. Stellt 100% Abwärtskompatibilität sicher.
|
|
||||||
VERSION: 3.3.0
|
|
||||||
"""
|
|
||||||
from .chunking.chunking_processor import assemble_chunks
|
|
||||||
from .chunking.chunking_utils import get_chunk_config, extract_frontmatter_from_text
|
|
||||||
from .chunking.chunking_models import Chunk
|
|
||||||
|
|
||||||
__all__ = ["assemble_chunks", "get_chunk_config", "extract_frontmatter_from_text", "Chunk"]
|
|
||||||
|
|
@ -1,15 +0,0 @@
|
||||||
"""
|
|
||||||
FILE: app/core/ingestion.py
|
|
||||||
DESCRIPTION: Facade für das Ingestion-Package. Stellt 100% Abwärtskompatibilität sicher.
|
|
||||||
WP-14: Modularisierung der Ingestion-Pipeline abgeschlossen.
|
|
||||||
Nutzt interne Module mit 'ingestion_' Präfix für maximale Wartbarkeit.
|
|
||||||
VERSION: 2.13.0
|
|
||||||
STATUS: Active
|
|
||||||
"""
|
|
||||||
# Export der Hauptklasse für externe Module (z.B. scripts/import_markdown.py)
|
|
||||||
from .ingestion.ingestion_processor import IngestionService
|
|
||||||
|
|
||||||
# Export der Hilfsfunktionen für Abwärtskompatibilität
|
|
||||||
from .ingestion.ingestion_utils import extract_json_from_response, load_type_registry
|
|
||||||
|
|
||||||
__all__ = ["IngestionService", "extract_json_from_response", "load_type_registry"]
|
|
||||||
|
|
@ -18,7 +18,7 @@ from app.core.parser import (
|
||||||
read_markdown, pre_scan_markdown, normalize_frontmatter,
|
read_markdown, pre_scan_markdown, normalize_frontmatter,
|
||||||
validate_required_frontmatter, NoteContext
|
validate_required_frontmatter, NoteContext
|
||||||
)
|
)
|
||||||
from app.core.chunker import assemble_chunks
|
from app.core.chunking import assemble_chunks
|
||||||
from app.core.qdrant import QdrantConfig, get_client, ensure_collections, ensure_payload_indexes
|
from app.core.qdrant import QdrantConfig, get_client, ensure_collections, ensure_payload_indexes
|
||||||
from app.core.qdrant_points import points_for_chunks, points_for_note, points_for_edges, upsert_batch
|
from app.core.qdrant_points import points_for_chunks, points_for_note, points_for_edges, upsert_batch
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -59,7 +59,7 @@ def resolve_note_type(registry: dict, requested: Optional[str]) -> str:
|
||||||
|
|
||||||
def get_chunk_config_by_profile(registry: dict, profile_name: str, note_type: str) -> Dict[str, Any]:
|
def get_chunk_config_by_profile(registry: dict, profile_name: str, note_type: str) -> Dict[str, Any]:
|
||||||
"""Holt die Chunker-Parameter für ein spezifisches Profil aus der Registry."""
|
"""Holt die Chunker-Parameter für ein spezifisches Profil aus der Registry."""
|
||||||
from app.core.chunker import get_chunk_config
|
from app.core.chunking import get_chunk_config
|
||||||
profiles = registry.get("chunking_profiles", {})
|
profiles = registry.get("chunking_profiles", {})
|
||||||
if profile_name in profiles:
|
if profile_name in profiles:
|
||||||
cfg = profiles[profile_name].copy()
|
cfg = profiles[profile_name].copy()
|
||||||
|
|
|
||||||
|
|
@ -1,268 +0,0 @@
|
||||||
"""
|
|
||||||
FILE: app/core/note_payload.py
|
|
||||||
DESCRIPTION: Baut das JSON-Objekt.
|
|
||||||
FEATURES:
|
|
||||||
1. Multi-Hash: Berechnet immer 'body' AND 'full' Hashes für flexible Change Detection.
|
|
||||||
2. Config-Fix: Liest korrekt 'chunking_profile' aus types.yaml (statt Legacy 'chunk_profile').
|
|
||||||
VERSION: 2.3.0
|
|
||||||
STATUS: Active
|
|
||||||
DEPENDENCIES: yaml, os, json, pathlib, hashlib
|
|
||||||
EXTERNAL_CONFIG: config/types.yaml
|
|
||||||
"""
|
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from typing import Any, Dict, Tuple, Optional
|
|
||||||
import os
|
|
||||||
import json
|
|
||||||
import pathlib
|
|
||||||
import hashlib
|
|
||||||
|
|
||||||
try:
|
|
||||||
import yaml # type: ignore
|
|
||||||
except Exception:
|
|
||||||
yaml = None
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Helper
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
def _as_dict(x) -> Dict[str, Any]:
|
|
||||||
"""Versucht, ein ParsedMarkdown-ähnliches Objekt in ein Dict zu überführen."""
|
|
||||||
if isinstance(x, dict):
|
|
||||||
return dict(x)
|
|
||||||
|
|
||||||
out: Dict[str, Any] = {}
|
|
||||||
for attr in (
|
|
||||||
"frontmatter",
|
|
||||||
"body",
|
|
||||||
"id",
|
|
||||||
"note_id",
|
|
||||||
"title",
|
|
||||||
"path",
|
|
||||||
"tags",
|
|
||||||
"type",
|
|
||||||
"created",
|
|
||||||
"modified",
|
|
||||||
"date",
|
|
||||||
):
|
|
||||||
if hasattr(x, attr):
|
|
||||||
val = getattr(x, attr)
|
|
||||||
if val is not None:
|
|
||||||
out[attr] = val
|
|
||||||
|
|
||||||
if not out:
|
|
||||||
out["raw"] = str(x)
|
|
||||||
|
|
||||||
return out
|
|
||||||
|
|
||||||
|
|
||||||
def _pick_args(*args, **kwargs) -> Tuple[Optional[str], Optional[dict]]:
|
|
||||||
path = kwargs.get("path") or (args[0] if args else None)
|
|
||||||
types_cfg = kwargs.get("types_cfg") or kwargs.get("types") or None
|
|
||||||
return path, types_cfg
|
|
||||||
|
|
||||||
|
|
||||||
def _env_float(name: str, default: float) -> float:
|
|
||||||
try:
|
|
||||||
return float(os.environ.get(name, default))
|
|
||||||
except Exception:
|
|
||||||
return default
|
|
||||||
|
|
||||||
|
|
||||||
def _ensure_list(x) -> list:
|
|
||||||
if x is None:
|
|
||||||
return []
|
|
||||||
if isinstance(x, list):
|
|
||||||
return [str(i) for i in x]
|
|
||||||
if isinstance(x, (set, tuple)):
|
|
||||||
return [str(i) for i in x]
|
|
||||||
return [str(x)]
|
|
||||||
|
|
||||||
# --- Hash Logic ---
|
|
||||||
def _compute_hash(content: str) -> str:
|
|
||||||
"""Berechnet einen SHA-256 Hash für den gegebenen String."""
|
|
||||||
if not content:
|
|
||||||
return ""
|
|
||||||
return hashlib.sha256(content.encode("utf-8")).hexdigest()
|
|
||||||
|
|
||||||
def _get_hash_source_content(n: Dict[str, Any], mode: str) -> str:
|
|
||||||
"""
|
|
||||||
Stellt den String zusammen, der gehasht werden soll.
|
|
||||||
"""
|
|
||||||
body = str(n.get("body") or "")
|
|
||||||
|
|
||||||
if mode == "body":
|
|
||||||
return body
|
|
||||||
|
|
||||||
if mode == "full":
|
|
||||||
fm = n.get("frontmatter") or {}
|
|
||||||
# Wichtig: Sortierte Keys für deterministisches Verhalten!
|
|
||||||
# Wir nehmen alle steuernden Metadaten auf
|
|
||||||
meta_parts = []
|
|
||||||
# Hier checken wir keys, die eine Neu-Indizierung rechtfertigen würden
|
|
||||||
for k in sorted(["title", "type", "status", "tags", "chunking_profile", "chunk_profile", "retriever_weight"]):
|
|
||||||
val = fm.get(k)
|
|
||||||
if val is not None:
|
|
||||||
meta_parts.append(f"{k}:{val}")
|
|
||||||
|
|
||||||
meta_str = "|".join(meta_parts)
|
|
||||||
return f"{meta_str}||{body}"
|
|
||||||
|
|
||||||
return body
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Type-Registry laden
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
def _load_types_config(explicit_cfg: Optional[dict] = None) -> dict:
|
|
||||||
if explicit_cfg and isinstance(explicit_cfg, dict):
|
|
||||||
return explicit_cfg
|
|
||||||
|
|
||||||
path = os.getenv("MINDNET_TYPES_FILE") or "./config/types.yaml"
|
|
||||||
if not os.path.isfile(path) or yaml is None:
|
|
||||||
return {}
|
|
||||||
|
|
||||||
try:
|
|
||||||
with open(path, "r", encoding="utf-8") as f:
|
|
||||||
data = yaml.safe_load(f) or {}
|
|
||||||
return data if isinstance(data, dict) else {}
|
|
||||||
except Exception:
|
|
||||||
return {}
|
|
||||||
|
|
||||||
|
|
||||||
def _cfg_for_type(note_type: str, reg: dict) -> dict:
|
|
||||||
if not isinstance(reg, dict):
|
|
||||||
return {}
|
|
||||||
types = reg.get("types") if isinstance(reg.get("types"), dict) else reg
|
|
||||||
return types.get(note_type, {}) if isinstance(types, dict) else {}
|
|
||||||
|
|
||||||
|
|
||||||
def _cfg_defaults(reg: dict) -> dict:
|
|
||||||
if not isinstance(reg, dict):
|
|
||||||
return {}
|
|
||||||
for key in ("defaults", "default", "global"):
|
|
||||||
v = reg.get(key)
|
|
||||||
if isinstance(v, dict):
|
|
||||||
return v
|
|
||||||
return {}
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Haupt-API
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]:
    """Build the note payload for 'mindnet_notes'.

    Includes multi-strategy hash calculation (body & full) and the corrected
    config lookups ('chunking_profile' key from types.yaml).

    Args:
        note: Dict or ParsedMarkdown-like object (coerced via _as_dict).
        *args: Optional positional path.
        **kwargs: 'path', 'types_cfg'/'types', 'hash_source', 'hash_normalize'.

    Returns:
        JSON-serializable payload dict (verified via a json round-trip).
    """
    n = _as_dict(note)
    path_arg, types_cfg_explicit = _pick_args(*args, **kwargs)
    reg = _load_types_config(types_cfg_explicit)

    # Hash config (parameters for source/normalize; the mode itself is
    # hardcoded to "both" — see the multi-hash loop below)
    hash_source = kwargs.get("hash_source", "parsed")
    hash_normalize = kwargs.get("hash_normalize", "canonical")

    fm = n.get("frontmatter") or {}
    fm_type = fm.get("type") or n.get("type") or "concept"
    note_type = str(fm_type)

    cfg_type = _cfg_for_type(note_type, reg)
    cfg_def = _cfg_defaults(reg)

    # --- retriever_weight: frontmatter > type config > defaults > env default ---
    default_rw = _env_float("MINDNET_DEFAULT_RETRIEVER_WEIGHT", 1.0)
    retriever_weight = fm.get("retriever_weight")
    if retriever_weight is None:
        retriever_weight = cfg_type.get(
            "retriever_weight",
            cfg_def.get("retriever_weight", default_rw),
        )
    try:
        retriever_weight = float(retriever_weight)
    except Exception:
        retriever_weight = default_rw

    # --- chunk_profile (FIXED LOGIC) ---
    # 1. Frontmatter override (both spellings allowed)
    chunk_profile = fm.get("chunking_profile") or fm.get("chunk_profile")

    # 2. Type config (correct key 'chunking_profile' from types.yaml)
    if chunk_profile is None:
        chunk_profile = cfg_type.get("chunking_profile")

    # 3. Default config (fallback to sliding_standard instead of legacy 'medium')
    if chunk_profile is None:
        chunk_profile = cfg_def.get("chunking_profile", "sliding_standard")

    # 4. Safety fallback for non-string / empty values
    if not isinstance(chunk_profile, str) or not chunk_profile:
        chunk_profile = "sliding_standard"

    # --- edge_defaults: frontmatter > type config > defaults > [] ---
    edge_defaults = fm.get("edge_defaults")
    if edge_defaults is None:
        edge_defaults = cfg_type.get(
            "edge_defaults",
            cfg_def.get("edge_defaults", []),
        )
    edge_defaults = _ensure_list(edge_defaults)

    # --- Base metadata ---
    note_id = n.get("note_id") or n.get("id") or fm.get("id")
    title = n.get("title") or fm.get("title") or ""
    path = n.get("path") or path_arg
    if isinstance(path, pathlib.Path):
        path = str(path)

    payload: Dict[str, Any] = {
        "note_id": note_id,
        "title": title,
        "type": note_type,
        "path": path or "",
        "retriever_weight": retriever_weight,
        "chunk_profile": chunk_profile,
        "edge_defaults": edge_defaults,
        "hashes": {}  # initialised here, filled by the multi-hash loop below
    }

    # --- MULTI-HASH CALCULATION (strategy decoupling) ---
    # Always compute BOTH strategies and store them; ingestion.py then
    # decides via its ENV variable which one to compare.
    modes_to_calc = ["body", "full"]

    for mode in modes_to_calc:
        content_to_hash = _get_hash_source_content(n, mode)
        computed_hash = _compute_hash(content_to_hash)
        # Key schema: mode:source:normalize (e.g. "full:parsed:canonical")
        key = f"{mode}:{hash_source}:{hash_normalize}"
        payload["hashes"][key] = computed_hash

    # Tags / keywords (first non-empty source wins)
    tags = fm.get("tags") or fm.get("keywords") or n.get("tags")
    if tags:
        payload["tags"] = _ensure_list(tags)

    # Aliases
    aliases = fm.get("aliases")
    if aliases:
        payload["aliases"] = _ensure_list(aliases)

    # Timestamps (stringified; frontmatter takes precedence)
    for k in ("created", "modified", "date"):
        v = fm.get(k) or n.get(k)
        if v:
            payload[k] = str(v)

    # Fulltext
    if "body" in n and n["body"]:
        payload["fulltext"] = str(n["body"])

    # JSON round-trip as a cheap serializability check (raises on failure)
    json.loads(json.dumps(payload, ensure_ascii=False))

    return payload
|
|
||||||
|
|
@ -1,199 +0,0 @@
|
||||||
"""
|
|
||||||
FILE: app/services/semantic_analyzer.py
|
|
||||||
DESCRIPTION: KI-gestützte Kanten-Validierung. Nutzt LLM (Background-Priority), um Kanten präzise einem Chunk zuzuordnen.
|
|
||||||
WP-20 Fix: Volle Kompatibilität mit der provider-basierten Routing-Logik (OpenRouter Primary).
|
|
||||||
WP-22: Integration von valid_types zur Halluzinations-Vermeidung.
|
|
||||||
FIX: Mistral-sicheres JSON-Parsing (<s> & [OUT] Handling) und 100% Logik-Erhalt.
|
|
||||||
VERSION: 2.2.6
|
|
||||||
STATUS: Active
|
|
||||||
DEPENDENCIES: app.services.llm_service, app.services.edge_registry, json, logging, re
|
|
||||||
"""
|
|
||||||
|
|
||||||
import json
|
|
||||||
import logging
|
|
||||||
import re
|
|
||||||
from typing import List, Optional, Any
|
|
||||||
from dataclasses import dataclass
|
|
||||||
|
|
||||||
# Importe
|
|
||||||
from app.services.llm_service import LLMService
|
|
||||||
# WP-22: Registry für Vokabular-Erzwingung
|
|
||||||
from app.services.edge_registry import registry as edge_registry
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
class SemanticAnalyzer:
    """AI-assisted edge validation.

    Uses the LLM (background priority) to precisely assign candidate edges
    to a chunk. WP-20: compatible with the provider-based routing logic
    (OpenRouter primary). WP-22: integrates valid_types from the edge
    registry to avoid hallucinated relation kinds.
    """

    def __init__(self):
        self.llm = LLMService()

    def _is_valid_edge_string(self, edge_str: str) -> bool:
        """Return True if *edge_str* is a valid edge of the form 'kind:target'.

        Prevents LLM chatter from slipping through as an edge.
        """
        if not isinstance(edge_str, str) or ":" not in edge_str:
            return False

        parts = edge_str.split(":", 1)
        kind = parts[0].strip()
        target = parts[1].strip()

        # Rule 1: a 'kind' (relation type) must not contain spaces.
        if " " in kind:
            return False

        # Rule 2: plausible length for the kind (rejects whole sentences).
        if len(kind) > 40 or len(kind) < 2:
            return False

        # Rule 3: the target must not be empty.
        if not target:
            return False

        return True

    def _extract_json_safely(self, text: str) -> Any:
        """Extract JSON data from an LLM response, stripping control tokens.

        Handles Mistral/Llama markers (<s>, [OUT]) and fenced Markdown
        blocks, with two levels of bracket-based recovery for cloud
        providers. Returns [] when nothing parseable is found.
        """
        if not text:
            return []

        # 1. Strip Mistral/Llama control tokens and tags.
        clean = text.replace("<s>", "").replace("</s>", "")
        clean = clean.replace("[OUT]", "").replace("[/OUT]", "")
        clean = clean.strip()

        # 2. Prefer a fenced Markdown JSON block when present.
        match = re.search(r"```(?:json)?\s*(.*?)\s*```", clean, re.DOTALL)
        payload = match.group(1) if match else clean

        try:
            return json.loads(payload.strip())
        except json.JSONDecodeError:
            # 3. Recovery: substring from the first '[' to the last ']'.
            start = payload.find('[')
            end = payload.rfind(']') + 1
            if start != -1 and end > start:
                try:
                    return json.loads(payload[start:end])
                except json.JSONDecodeError:  # FIX: was a bug-hiding bare 'except'
                    pass

            # 4. Second recovery: from the first '{' to the last '}'.
            start_obj = payload.find('{')
            end_obj = payload.rfind('}') + 1
            if start_obj != -1 and end_obj > start_obj:
                try:
                    return json.loads(payload[start_obj:end_obj])
                except json.JSONDecodeError:  # FIX: was a bug-hiding bare 'except'
                    pass
            return []

    async def assign_edges_to_chunk(self, chunk_text: str, all_edges: List[str], note_type: str) -> List[str]:
        """Ask the LLM which of *all_edges* are relevant for *chunk_text*.

        WP-20: primarily uses the configured provider (e.g. OpenRouter).

        Returns:
            A (possibly empty) list of validated 'kind:target' strings.
            FIX: previously the method fell through and implicitly returned
            None when no candidate survived validation, violating the
            declared List[str] return type.
        """
        if not all_edges:
            return []

        # 1. Determine provider and model dynamically from settings.
        provider = self.llm.settings.MINDNET_LLM_PROVIDER
        model = self.llm.settings.OPENROUTER_MODEL if provider == "openrouter" else self.llm.settings.GEMINI_MODEL

        # 2. Load the provider-specific prompt template.
        prompt_template = self.llm.get_prompt("edge_allocation_template", provider)

        if not prompt_template or not isinstance(prompt_template, str):
            logger.warning("⚠️ [SemanticAnalyzer] Prompt 'edge_allocation_template' ungültig. Nutze Recovery-Template.")
            prompt_template = (
                "TASK: Wähle aus den Kandidaten die relevanten Kanten für den Text.\n"
                "TEXT: {chunk_text}\n"
                "KANDIDATEN: {edge_list}\n"
                "OUTPUT: JSON Liste von Strings [\"kind:target\"]."
            )

        # 3. Prepare template data (WP-22 vocabulary enforcement).
        edge_registry.ensure_latest()
        valid_types_str = ", ".join(sorted(list(edge_registry.valid_types)))
        edges_str = "\n".join([f"- {e}" for e in all_edges])

        logger.debug(f"🔍 [SemanticAnalyzer] Request: {len(chunk_text)} chars Text, {len(all_edges)} Candidates.")

        # 4. Fill the prompt with an explicit format check (no shortcut).
        try:
            # Cap the text at a sensible length for the context window.
            final_prompt = prompt_template.format(
                chunk_text=chunk_text[:6000],
                edge_list=edges_str,
                valid_types=valid_types_str
            )
        except Exception as format_err:
            logger.error(f"❌ [SemanticAnalyzer] Prompt Formatting failed: {format_err}")
            return []

        try:
            # 5. LLM call with background priority & semaphore control.
            response_json = await self.llm.generate_raw_response(
                prompt=final_prompt,
                force_json=True,
                max_retries=3,
                base_delay=2.0,
                priority="background",
                provider=provider,
                model_override=model
            )

            # 6. Mistral-safe JSON parsing via helper.
            data = self._extract_json_safely(response_json)
            if not data:
                return []

            # 7. Robust normalisation (list vs dict recovery).
            raw_candidates = []
            if isinstance(data, list):
                raw_candidates = data
            elif isinstance(data, dict):
                logger.info("ℹ️ [SemanticAnalyzer] LLM returned dict, trying recovery.")
                for key in ["edges", "results", "kanten", "matches"]:
                    if key in data and isinstance(data[key], list):
                        raw_candidates.extend(data[key])
                        break
                # Still empty: fall back to key/value pairs as makeshift edges.
                if not raw_candidates:
                    for k, v in data.items():
                        if isinstance(v, str):
                            raw_candidates.append(f"{k}:{v}")
                        elif isinstance(v, list):
                            for target in v:
                                if isinstance(target, str):
                                    raw_candidates.append(f"{k}:{target}")

            # 8. Strict validation against the edge format.
            valid_edges = []
            for e in raw_candidates:
                e_str = str(e).strip()
                if self._is_valid_edge_string(e_str):
                    valid_edges.append(e_str)
                else:
                    logger.debug(f" [SemanticAnalyzer] Rejected invalid edge format: '{e_str}'")

            if valid_edges:
                logger.info(f"✅ [SemanticAnalyzer] Assigned {len(valid_edges)} edges to chunk.")
            # FIX: always return the (possibly empty) list instead of
            # falling off the end and returning None.
            return valid_edges

        except Exception as e:
            logger.error(f"💥 [SemanticAnalyzer] Critical error during analysis: {e}", exc_info=True)
            return []

    async def close(self):
        """Release the underlying LLM client."""
        if self.llm:
            await self.llm.close()
|
|
||||||
|
|
||||||
# Module-level singleton: one shared SemanticAnalyzer per process,
# created lazily on first use.
_analyzer_instance = None

def get_semantic_analyzer():
    """Return the process-wide SemanticAnalyzer, instantiating it on first call."""
    global _analyzer_instance
    if _analyzer_instance is None:
        _analyzer_instance = SemanticAnalyzer()
    return _analyzer_instance
|
|
||||||
|
|
@ -2,7 +2,7 @@
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
import argparse, os, json, glob, statistics as stats
|
import argparse, os, json, glob, statistics as stats
|
||||||
from app.core.parser import read_markdown, normalize_frontmatter, validate_required_frontmatter
|
from app.core.parser import read_markdown, normalize_frontmatter, validate_required_frontmatter
|
||||||
from app.core.chunker import assemble_chunks
|
from app.core.chunking import assemble_chunks
|
||||||
|
|
||||||
def iter_md(root: str):
|
def iter_md(root: str):
|
||||||
for p in glob.glob(os.path.join(root, "**", "*.md"), recursive=True):
|
for p in glob.glob(os.path.join(root, "**", "*.md"), recursive=True):
|
||||||
|
|
|
||||||
|
|
@ -6,7 +6,7 @@ from pathlib import Path
|
||||||
# Pfad-Setup
|
# Pfad-Setup
|
||||||
sys.path.insert(0, os.path.abspath("."))
|
sys.path.insert(0, os.path.abspath("."))
|
||||||
|
|
||||||
from app.core.chunker import assemble_chunks, _extract_all_edges_from_md
|
from app.core.chunking import assemble_chunks, _extract_all_edges_from_md
|
||||||
from app.core.derive_edges import build_edges_for_note
|
from app.core.derive_edges import build_edges_for_note
|
||||||
|
|
||||||
# Mock für Settings, falls nötig
|
# Mock für Settings, falls nötig
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,7 @@
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
import argparse, os, glob
|
import argparse, os, glob
|
||||||
from app.core.parser import read_markdown, normalize_frontmatter, validate_required_frontmatter
|
from app.core.parser import read_markdown, normalize_frontmatter, validate_required_frontmatter
|
||||||
from app.core.chunker import assemble_chunks
|
from app.core.chunking import assemble_chunks
|
||||||
|
|
||||||
def iter_md(root: str):
|
def iter_md(root: str):
|
||||||
return [p for p in glob.glob(os.path.join(root, "**", "*.md"), recursive=True)]
|
return [p for p in glob.glob(os.path.join(root, "**", "*.md"), recursive=True)]
|
||||||
|
|
|
||||||
|
|
@ -7,7 +7,7 @@ from slugify import slugify
|
||||||
from app.core.parser import read_markdown, normalize_frontmatter
|
from app.core.parser import read_markdown, normalize_frontmatter
|
||||||
from app.core.parser import FRONTMATTER_RE # für Re-Inject
|
from app.core.parser import FRONTMATTER_RE # für Re-Inject
|
||||||
from app.core.validate_note import validate_note_payload
|
from app.core.validate_note import validate_note_payload
|
||||||
from app.core.note_payload import make_note_payload
|
from app.core.ingestion.ingestion_note_payload import make_note_payload
|
||||||
|
|
||||||
DATE_IN_NAME = re.compile(r"(?P<y>\d{4})[-_\.]?(?P<m>\d{2})[-_\.]?(?P<d>\d{2})")
|
DATE_IN_NAME = re.compile(r"(?P<y>\d{4})[-_\.]?(?P<m>\d{2})[-_\.]?(?P<d>\d{2})")
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -8,6 +8,8 @@ from jsonschema import ValidationError
|
||||||
from app.core.parser import read_markdown, validate_required_frontmatter, normalize_frontmatter
|
from app.core.parser import read_markdown, validate_required_frontmatter, normalize_frontmatter
|
||||||
from app.core.note_payload import make_note_payload
|
from app.core.note_payload import make_note_payload
|
||||||
from app.core.validate_note import validate_note_payload
|
from app.core.validate_note import validate_note_payload
|
||||||
|
from app.core.ingestion.ingestion_note_payload import make_note_payload
|
||||||
|
|
||||||
|
|
||||||
def iter_md_files(root: str, include: str, exclude: list[str]) -> list[str]:
|
def iter_md_files(root: str, include: str, exclude: list[str]) -> list[str]:
|
||||||
# include z.B. "**/*.md"
|
# include z.B. "**/*.md"
|
||||||
|
|
|
||||||
|
|
@ -10,9 +10,9 @@ import argparse, os, json
|
||||||
from typing import Any, Dict, List, Optional
|
from typing import Any, Dict, List, Optional
|
||||||
|
|
||||||
from app.core.parser import read_markdown, normalize_frontmatter, validate_required_frontmatter
|
from app.core.parser import read_markdown, normalize_frontmatter, validate_required_frontmatter
|
||||||
from app.core.note_payload import make_note_payload
|
from app.core.chunking import assemble_chunks
|
||||||
from app.core.chunker import assemble_chunks
|
from app.core.ingestion.ingestion_note_payload import make_note_payload
|
||||||
from app.core.chunk_payload import make_chunk_payloads
|
from app.core.ingestion.ingestion_chunk_payload import make_chunk_payloads
|
||||||
try:
|
try:
|
||||||
from app.core.derive_edges import build_edges_for_note
|
from app.core.derive_edges import build_edges_for_note
|
||||||
except Exception:
|
except Exception:
|
||||||
|
|
|
||||||
|
|
@ -2,9 +2,10 @@
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
import argparse, os, glob, json
|
import argparse, os, glob, json
|
||||||
from app.core.parser import read_markdown, normalize_frontmatter, validate_required_frontmatter
|
from app.core.parser import read_markdown, normalize_frontmatter, validate_required_frontmatter
|
||||||
from app.core.chunker import assemble_chunks
|
from app.core.chunking import assemble_chunks
|
||||||
from app.core.chunk_payload import make_chunk_payloads
|
from app.core.ingestion.ingestion_note_payload import make_note_payload
|
||||||
from app.core.note_payload import make_note_payload
|
from app.core.ingestion.ingestion_chunk_payload import make_chunk_payloads
|
||||||
|
|
||||||
|
|
||||||
def iter_md(root: str) -> list[str]:
|
def iter_md(root: str) -> list[str]:
|
||||||
return [p for p in glob.glob(os.path.join(root, "**", "*.md"), recursive=True)]
|
return [p for p in glob.glob(os.path.join(root, "**", "*.md"), recursive=True)]
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user