diff --git a/app/core/chunk_payload.py b/app/core/chunk_payload.py deleted file mode 100644 index 9058753..0000000 --- a/app/core/chunk_payload.py +++ /dev/null @@ -1,176 +0,0 @@ -""" -FILE: app/core/chunk_payload.py -DESCRIPTION: Baut das JSON-Objekt für 'mindnet_chunks'. -FEATURES: - - Inkludiert Nachbarschafts-IDs (prev/next) und Titel. - - FIX 3: Robuste Erkennung des Inputs (Frontmatter-Dict vs. Note-Objekt), damit Overrides ankommen. -VERSION: 2.3.0 -STATUS: Active -DEPENDENCIES: yaml, os -EXTERNAL_CONFIG: config/types.yaml -""" -from __future__ import annotations -from typing import Any, Dict, List, Optional -import os, yaml - -def _env(n: str, d: Optional[str]=None) -> str: - v = os.getenv(n) - return v if v is not None else (d or "") - -def _load_types() -> dict: - p = _env("MINDNET_TYPES_FILE", "./config/types.yaml") - try: - with open(p, "r", encoding="utf-8") as f: - return yaml.safe_load(f) or {} - except Exception: - return {} - -def _get_types_map(reg: dict) -> dict: - if isinstance(reg, dict) and isinstance(reg.get("types"), dict): - return reg["types"] - return reg if isinstance(reg, dict) else {} - -def _get_defaults(reg: dict) -> dict: - if isinstance(reg, dict) and isinstance(reg.get("defaults"), dict): - return reg["defaults"] - if isinstance(reg, dict) and isinstance(reg.get("global"), dict): - return reg["global"] - return {} - -def _as_float(x: Any): - try: return float(x) - except Exception: return None - -def _resolve_chunk_profile_from_config(note_type: str, reg: dict) -> Optional[str]: - # 1. Type Level - types = _get_types_map(reg) - if isinstance(types, dict): - t = types.get(note_type, {}) - if isinstance(t, dict): - cp = t.get("chunking_profile") or t.get("chunk_profile") - if isinstance(cp, str) and cp: return cp - # 2. Defaults Level - defs = _get_defaults(reg) - if isinstance(defs, dict): - cp = defs.get("chunking_profile") or defs.get("chunk_profile") - if isinstance(cp, str) and cp: return cp - return None - -def _resolve_retriever_weight_from_config(note_type: str, reg: dict) -> float: - """ - Liest Weight nur aus Config (Type > Default). - Wird aufgerufen, wenn im Frontmatter nichts steht. - """ - # 1. Type Level - types = _get_types_map(reg) - if isinstance(types, dict): - t = types.get(note_type, {}) - if isinstance(t, dict) and (t.get("retriever_weight") is not None): - v = _as_float(t.get("retriever_weight")) - if v is not None: return float(v) - - # 2. Defaults Level - defs = _get_defaults(reg) - if isinstance(defs, dict) and (defs.get("retriever_weight") is not None): - v = _as_float(defs.get("retriever_weight")) - if v is not None: return float(v) - - return 1.0 - -def _as_list(x): - if x is None: return [] - if isinstance(x, list): return x - return [x] - -def make_chunk_payloads(note: Dict[str, Any], - note_path: str, - chunks_from_chunker: List[Any], - *, - note_text: str = "", - types_cfg: Optional[dict] = None, - file_path: Optional[str] = None) -> List[Dict[str, Any]]: - """ - Erstellt die Payloads für die Chunks. - - Argument 'note' kann sein: - A) Ein komplexes Objekt/Dict mit Key "frontmatter" (Legacy / Tests) - B) Direkt das Frontmatter-Dictionary (Call aus ingestion.py) - """ - - # --- FIX 3: Intelligente Erkennung der Input-Daten --- - # Wir prüfen: Ist 'note' ein Container MIT 'frontmatter', oder IST es das 'frontmatter'? - if isinstance(note, dict) and "frontmatter" in note and isinstance(note["frontmatter"], dict): - # Fall A: Container (wir müssen auspacken) - fm = note["frontmatter"] - else: - # Fall B: Direktes Dict (so ruft ingestion.py es auf!) - fm = note or {} - - note_type = fm.get("type") or note.get("type") or "concept" - - # Title Extraction (Fallback Chain) - title = fm.get("title") or note.get("title") or fm.get("id") or "Untitled" - - reg = types_cfg if isinstance(types_cfg, dict) else _load_types() - - # --- Profil-Ermittlung --- - # Da wir 'fm' jetzt korrekt haben, funktionieren diese lookups: - cp = fm.get("chunking_profile") or fm.get("chunk_profile") - - if not cp: - cp = _resolve_chunk_profile_from_config(note_type, reg) - if not cp: - cp = "sliding_standard" - - # --- Retriever Weight Ermittlung --- - rw = fm.get("retriever_weight") - - if rw is None: - rw = _resolve_retriever_weight_from_config(note_type, reg) - - try: - rw = float(rw) - except Exception: - rw = 1.0 - - tags = fm.get("tags") or [] - if isinstance(tags, str): - tags = [tags] - - out: List[Dict[str, Any]] = [] - for idx, ch in enumerate(chunks_from_chunker): - # Attribute extrahieren - cid = getattr(ch, "id", None) or (ch.get("id") if isinstance(ch, dict) else None) - nid = getattr(ch, "note_id", None) or (ch.get("note_id") if isinstance(ch, dict) else fm.get("id")) - index = getattr(ch, "index", None) or (ch.get("index") if isinstance(ch, dict) else idx) - text = getattr(ch, "text", None) or (ch.get("text") if isinstance(ch, dict) else "") - window = getattr(ch, "window", None) or (ch.get("window") if isinstance(ch, dict) else text) - prev_id = getattr(ch, "neighbors_prev", None) or (ch.get("neighbors_prev") if isinstance(ch, dict) else None) - next_id = getattr(ch, "neighbors_next", None) or (ch.get("neighbors_next") if isinstance(ch, dict) else None) - - pl: Dict[str, Any] = { - "note_id": nid, - "chunk_id": cid, - "title": title, - "index": int(index), - "ord": int(index) + 1, - "type": note_type, - "tags": tags, - "text": text, - "window": window, - "neighbors_prev": _as_list(prev_id), - "neighbors_next": _as_list(next_id), - "section": getattr(ch, "section", None) or (ch.get("section") if isinstance(ch, dict) else ""), - "path": note_path, - "source_path": file_path or note_path, - "retriever_weight": float(rw), - "chunk_profile": cp, # Jetzt endlich mit dem Override-Wert! - } - - # Cleanup - for alias in ("chunk_num", "Chunk_Number"): - pl.pop(alias, None) - - out.append(pl) - - return out \ No newline at end of file diff --git a/app/core/chunker.py b/app/core/chunker.py deleted file mode 100644 index 4a624e2..0000000 --- a/app/core/chunker.py +++ /dev/null @@ -1,10 +0,0 @@ -""" -FILE: app/core/chunker.py -DESCRIPTION: Facade für das Chunking-Package. Stellt 100% Abwärtskompatibilität sicher. -VERSION: 3.3.0 -""" -from .chunking.chunking_processor import assemble_chunks -from .chunking.chunking_utils import get_chunk_config, extract_frontmatter_from_text -from .chunking.chunking_models import Chunk - -__all__ = ["assemble_chunks", "get_chunk_config", "extract_frontmatter_from_text", "Chunk"] \ No newline at end of file diff --git a/app/core/ingestion.py b/app/core/ingestion.py deleted file mode 100644 index a140178..0000000 --- a/app/core/ingestion.py +++ /dev/null @@ -1,15 +0,0 @@ -""" -FILE: app/core/ingestion.py -DESCRIPTION: Facade für das Ingestion-Package. Stellt 100% Abwärtskompatibilität sicher. - WP-14: Modularisierung der Ingestion-Pipeline abgeschlossen. - Nutzt interne Module mit 'ingestion_' Präfix für maximale Wartbarkeit. -VERSION: 2.13.0 -STATUS: Active -""" -# Export der Hauptklasse für externe Module (z.B. scripts/import_markdown.py) -from .ingestion.ingestion_processor import IngestionService - -# Export der Hilfsfunktionen für Abwärtskompatibilität -from .ingestion.ingestion_utils import extract_json_from_response, load_type_registry - -__all__ = ["IngestionService", "extract_json_from_response", "load_type_registry"] \ No newline at end of file diff --git a/app/core/ingestion/ingestion_processor.py b/app/core/ingestion/ingestion_processor.py index fc9923f..268b47c 100644 --- a/app/core/ingestion/ingestion_processor.py +++ b/app/core/ingestion/ingestion_processor.py @@ -18,7 +18,7 @@ from app.core.parser import ( read_markdown, pre_scan_markdown, normalize_frontmatter, validate_required_frontmatter, NoteContext ) -from app.core.chunker import assemble_chunks +from app.core.chunking import assemble_chunks from app.core.qdrant import QdrantConfig, get_client, ensure_collections, ensure_payload_indexes from app.core.qdrant_points import points_for_chunks, points_for_note, points_for_edges, upsert_batch diff --git a/app/core/ingestion/ingestion_utils.py b/app/core/ingestion/ingestion_utils.py index dadba30..c3b6068 100644 --- a/app/core/ingestion/ingestion_utils.py +++ b/app/core/ingestion/ingestion_utils.py @@ -59,7 +59,7 @@ def resolve_note_type(registry: dict, requested: Optional[str]) -> str: def get_chunk_config_by_profile(registry: dict, profile_name: str, note_type: str) -> Dict[str, Any]: """Holt die Chunker-Parameter für ein spezifisches Profil aus der Registry.""" - from app.core.chunker import get_chunk_config + from app.core.chunking import get_chunk_config profiles = registry.get("chunking_profiles", {}) if profile_name in profiles: cfg = profiles[profile_name].copy() diff --git a/app/core/note_payload.py b/app/core/note_payload.py deleted file mode 100644 index 957a97e..0000000 --- a/app/core/note_payload.py +++ /dev/null @@ -1,268 +0,0 @@ -""" -FILE: app/core/note_payload.py -DESCRIPTION: Baut das JSON-Objekt. -FEATURES: - 1. Multi-Hash: Berechnet immer 'body' AND 'full' Hashes für flexible Change Detection. - 2. Config-Fix: Liest korrekt 'chunking_profile' aus types.yaml (statt Legacy 'chunk_profile'). -VERSION: 2.3.0 -STATUS: Active -DEPENDENCIES: yaml, os, json, pathlib, hashlib -EXTERNAL_CONFIG: config/types.yaml -""" - -from __future__ import annotations - -from typing import Any, Dict, Tuple, Optional -import os -import json -import pathlib -import hashlib - -try: - import yaml # type: ignore -except Exception: - yaml = None - - -# --------------------------------------------------------------------------- -# Helper -# --------------------------------------------------------------------------- - -def _as_dict(x) -> Dict[str, Any]: - """Versucht, ein ParsedMarkdown-ähnliches Objekt in ein Dict zu überführen.""" - if isinstance(x, dict): - return dict(x) - - out: Dict[str, Any] = {} - for attr in ( - "frontmatter", - "body", - "id", - "note_id", - "title", - "path", - "tags", - "type", - "created", - "modified", - "date", - ): - if hasattr(x, attr): - val = getattr(x, attr) - if val is not None: - out[attr] = val - - if not out: - out["raw"] = str(x) - - return out - - -def _pick_args(*args, **kwargs) -> Tuple[Optional[str], Optional[dict]]: - path = kwargs.get("path") or (args[0] if args else None) - types_cfg = kwargs.get("types_cfg") or kwargs.get("types") or None - return path, types_cfg - - -def _env_float(name: str, default: float) -> float: - try: - return float(os.environ.get(name, default)) - except Exception: - return default - - -def _ensure_list(x) -> list: - if x is None: - return [] - if isinstance(x, list): - return [str(i) for i in x] - if isinstance(x, (set, tuple)): - return [str(i) for i in x] - return [str(x)] - -# --- Hash Logic --- -def _compute_hash(content: str) -> str: - """Berechnet einen SHA-256 Hash für den gegebenen String.""" - if not content: - return "" - return hashlib.sha256(content.encode("utf-8")).hexdigest() - -def _get_hash_source_content(n: Dict[str, Any], mode: str) -> str: - """ - Stellt den String zusammen, der gehasht werden soll. - """ - body = str(n.get("body") or "") - - if mode == "body": - return body - - if mode == "full": - fm = n.get("frontmatter") or {} - # Wichtig: Sortierte Keys für deterministisches Verhalten! - # Wir nehmen alle steuernden Metadaten auf - meta_parts = [] - # Hier checken wir keys, die eine Neu-Indizierung rechtfertigen würden - for k in sorted(["title", "type", "status", "tags", "chunking_profile", "chunk_profile", "retriever_weight"]): - val = fm.get(k) - if val is not None: - meta_parts.append(f"{k}:{val}") - - meta_str = "|".join(meta_parts) - return f"{meta_str}||{body}" - - return body - - -# --------------------------------------------------------------------------- -# Type-Registry laden -# --------------------------------------------------------------------------- - -def _load_types_config(explicit_cfg: Optional[dict] = None) -> dict: - if explicit_cfg and isinstance(explicit_cfg, dict): - return explicit_cfg - - path = os.getenv("MINDNET_TYPES_FILE") or "./config/types.yaml" - if not os.path.isfile(path) or yaml is None: - return {} - - try: - with open(path, "r", encoding="utf-8") as f: - data = yaml.safe_load(f) or {} - return data if isinstance(data, dict) else {} - except Exception: - return {} - - -def _cfg_for_type(note_type: str, reg: dict) -> dict: - if not isinstance(reg, dict): - return {} - types = reg.get("types") if isinstance(reg.get("types"), dict) else reg - return types.get(note_type, {}) if isinstance(types, dict) else {} - - -def _cfg_defaults(reg: dict) -> dict: - if not isinstance(reg, dict): - return {} - for key in ("defaults", "default", "global"): - v = reg.get(key) - if isinstance(v, dict): - return v - return {} - - -# --------------------------------------------------------------------------- -# Haupt-API -# --------------------------------------------------------------------------- - -def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]: - """ - Baut das Note-Payload für mindnet_notes auf. - Inkludiert Hash-Berechnung (Body & Full) und korrigierte Config-Lookups. - """ - n = _as_dict(note) - path_arg, types_cfg_explicit = _pick_args(*args, **kwargs) - reg = _load_types_config(types_cfg_explicit) - - # Hash Config (Parameter für Source/Normalize, Mode ist hardcoded auf 'beide') - hash_source = kwargs.get("hash_source", "parsed") - hash_normalize = kwargs.get("hash_normalize", "canonical") - - fm = n.get("frontmatter") or {} - fm_type = fm.get("type") or n.get("type") or "concept" - note_type = str(fm_type) - - cfg_type = _cfg_for_type(note_type, reg) - cfg_def = _cfg_defaults(reg) - - # --- retriever_weight --- - default_rw = _env_float("MINDNET_DEFAULT_RETRIEVER_WEIGHT", 1.0) - retriever_weight = fm.get("retriever_weight") - if retriever_weight is None: - retriever_weight = cfg_type.get( - "retriever_weight", - cfg_def.get("retriever_weight", default_rw), - ) - try: - retriever_weight = float(retriever_weight) - except Exception: - retriever_weight = default_rw - - # --- chunk_profile (FIXED LOGIC) --- - # 1. Frontmatter Override (beide Schreibweisen erlaubt) - chunk_profile = fm.get("chunking_profile") or fm.get("chunk_profile") - - # 2. Type Config (Korrekter Key 'chunking_profile' aus types.yaml) - if chunk_profile is None: - chunk_profile = cfg_type.get("chunking_profile") - - # 3. Default Config (Fallback auf sliding_standard statt medium) - if chunk_profile is None: - chunk_profile = cfg_def.get("chunking_profile", "sliding_standard") - - # 4. Safety Fallback - if not isinstance(chunk_profile, str) or not chunk_profile: - chunk_profile = "sliding_standard" - - # --- edge_defaults --- - edge_defaults = fm.get("edge_defaults") - if edge_defaults is None: - edge_defaults = cfg_type.get( - "edge_defaults", - cfg_def.get("edge_defaults", []), - ) - edge_defaults = _ensure_list(edge_defaults) - - # --- Basis-Metadaten --- - note_id = n.get("note_id") or n.get("id") or fm.get("id") - title = n.get("title") or fm.get("title") or "" - path = n.get("path") or path_arg - if isinstance(path, pathlib.Path): - path = str(path) - - payload: Dict[str, Any] = { - "note_id": note_id, - "title": title, - "type": note_type, - "path": path or "", - "retriever_weight": retriever_weight, - "chunk_profile": chunk_profile, - "edge_defaults": edge_defaults, - "hashes": {} # Init Hash Dict - } - - # --- MULTI-HASH CALCULATION (Strategy Decoupling) --- - # Wir berechnen immer BEIDE Strategien und speichern sie. - # ingestion.py entscheidet dann anhand der ENV-Variable, welcher verglichen wird. - modes_to_calc = ["body", "full"] - - for mode in modes_to_calc: - content_to_hash = _get_hash_source_content(n, mode) - computed_hash = _compute_hash(content_to_hash) - # Key Schema: mode:source:normalize (z.B. "full:parsed:canonical") - key = f"{mode}:{hash_source}:{hash_normalize}" - payload["hashes"][key] = computed_hash - - # Tags / Keywords - tags = fm.get("tags") or fm.get("keywords") or n.get("tags") - if tags: - payload["tags"] = _ensure_list(tags) - - # Aliases - aliases = fm.get("aliases") - if aliases: - payload["aliases"] = _ensure_list(aliases) - - # Zeit - for k in ("created", "modified", "date"): - v = fm.get(k) or n.get(k) - if v: - payload[k] = str(v) - - # Fulltext - if "body" in n and n["body"]: - payload["fulltext"] = str(n["body"]) - - # JSON Validation - json.loads(json.dumps(payload, ensure_ascii=False)) - - return payload \ No newline at end of file diff --git a/app/services/semantic_analyzer.py b/app/services/semantic_analyzer.py deleted file mode 100644 index 2d492a5..0000000 --- a/app/services/semantic_analyzer.py +++ /dev/null @@ -1,199 +0,0 @@ -""" -FILE: app/services/semantic_analyzer.py -DESCRIPTION: KI-gestützte Kanten-Validierung. Nutzt LLM (Background-Priority), um Kanten präzise einem Chunk zuzuordnen. - WP-20 Fix: Volle Kompatibilität mit der provider-basierten Routing-Logik (OpenRouter Primary). - WP-22: Integration von valid_types zur Halluzinations-Vermeidung. -FIX: Mistral-sicheres JSON-Parsing ( & [OUT] Handling) und 100% Logik-Erhalt. -VERSION: 2.2.6 -STATUS: Active -DEPENDENCIES: app.services.llm_service, app.services.edge_registry, json, logging, re -""" - -import json -import logging -import re -from typing import List, Optional, Any -from dataclasses import dataclass - -# Importe -from app.services.llm_service import LLMService -# WP-22: Registry für Vokabular-Erzwingung -from app.services.edge_registry import registry as edge_registry - -logger = logging.getLogger(__name__) - -class SemanticAnalyzer: - def __init__(self): - self.llm = LLMService() - - def _is_valid_edge_string(self, edge_str: str) -> bool: - """ - Prüft, ob ein String eine valide Kante im Format 'kind:target' ist. - Verhindert, dass LLM-Geschwätz als Kante durchrutscht. - """ - if not isinstance(edge_str, str) or ":" not in edge_str: - return False - - parts = edge_str.split(":", 1) - kind = parts[0].strip() - target = parts[1].strip() - - # Regel 1: Ein 'kind' (Beziehungstyp) darf keine Leerzeichen enthalten. - if " " in kind: - return False - - # Regel 2: Plausible Länge für den Typ (Vermeidet Sätze als Typ) - if len(kind) > 40 or len(kind) < 2: - return False - - # Regel 3: Target darf nicht leer sein - if not target: - return False - - return True - - def _extract_json_safely(self, text: str) -> Any: - """ - Extrahiert JSON-Daten und bereinigt LLM-Steuerzeichen (Mistral/Llama). - Implementiert robuste Recovery-Logik für Cloud-Provider. - """ - if not text: - return [] - - # 1. Entferne Mistral/Llama Steuerzeichen und Tags - clean = text.replace("", "").replace("", "") - clean = clean.replace("[OUT]", "").replace("[/OUT]", "") - clean = clean.strip() - - # 2. Suche nach Markdown JSON-Blöcken - match = re.search(r"```(?:json)?\s*(.*?)\s*```", clean, re.DOTALL) - payload = match.group(1) if match else clean - - try: - return json.loads(payload.strip()) - except json.JSONDecodeError: - # 3. Recovery: Suche nach der ersten [ und letzten ] - start = payload.find('[') - end = payload.rfind(']') + 1 - if start != -1 and end > start: - try: - return json.loads(payload[start:end]) - except: pass - - # 4. Zweite Recovery: Suche nach der ersten { und letzten } - start_obj = payload.find('{') - end_obj = payload.rfind('}') + 1 - if start_obj != -1 and end_obj > start_obj: - try: - return json.loads(payload[start_obj:end_obj]) - except: pass - return [] - - async def assign_edges_to_chunk(self, chunk_text: str, all_edges: List[str], note_type: str) -> List[str]: - """ - Sendet einen Chunk und eine Liste potenzieller Kanten an das LLM. - Das LLM filtert heraus, welche Kanten für diesen Chunk relevant sind. - WP-20: Nutzt primär den konfigurierten Provider (z.B. OpenRouter). - """ - if not all_edges: - return [] - - # 1. Bestimmung des Providers und Modells (Dynamisch über Settings) - provider = self.llm.settings.MINDNET_LLM_PROVIDER - model = self.llm.settings.OPENROUTER_MODEL if provider == "openrouter" else self.llm.settings.GEMINI_MODEL - - # 2. Prompt laden (Provider-spezifisch via get_prompt) - prompt_template = self.llm.get_prompt("edge_allocation_template", provider) - - if not prompt_template or not isinstance(prompt_template, str): - logger.warning("⚠️ [SemanticAnalyzer] Prompt 'edge_allocation_template' ungültig. Nutze Recovery-Template.") - prompt_template = ( - "TASK: Wähle aus den Kandidaten die relevanten Kanten für den Text.\n" - "TEXT: {chunk_text}\n" - "KANDIDATEN: {edge_list}\n" - "OUTPUT: JSON Liste von Strings [\"kind:target\"]." - ) - - # 3. Daten für Template vorbereiten (Vokabular-Check) - edge_registry.ensure_latest() - valid_types_str = ", ".join(sorted(list(edge_registry.valid_types))) - edges_str = "\n".join([f"- {e}" for e in all_edges]) - - logger.debug(f"🔍 [SemanticAnalyzer] Request: {len(chunk_text)} chars Text, {len(all_edges)} Candidates.") - - # 4. Prompt füllen mit Format-Check (Kein Shortcut) - try: - # Wir begrenzen den Text auf eine vernünftige Länge für das Kontextfenster - final_prompt = prompt_template.format( - chunk_text=chunk_text[:6000], - edge_list=edges_str, - valid_types=valid_types_str - ) - except Exception as format_err: - logger.error(f"❌ [SemanticAnalyzer] Prompt Formatting failed: {format_err}") - return [] - - try: - # 5. LLM Call mit Background Priority & Semaphore Control - response_json = await self.llm.generate_raw_response( - prompt=final_prompt, - force_json=True, - max_retries=3, - base_delay=2.0, - priority="background", - provider=provider, - model_override=model - ) - - # 6. Mistral-sicheres JSON Parsing via Helper - data = self._extract_json_safely(response_json) - - if not data: - return [] - - # 7. Robuste Normalisierung (List vs Dict Recovery) - raw_candidates = [] - if isinstance(data, list): - raw_candidates = data - elif isinstance(data, dict): - logger.info(f"ℹ️ [SemanticAnalyzer] LLM returned dict, trying recovery.") - for key in ["edges", "results", "kanten", "matches"]: - if key in data and isinstance(data[key], list): - raw_candidates.extend(data[key]) - break - # Falls immer noch leer, nutze Schlüssel-Wert Paare als Behelf - if not raw_candidates: - for k, v in data.items(): - if isinstance(v, str): raw_candidates.append(f"{k}:{v}") - elif isinstance(v, list): - for target in v: - if isinstance(target, str): raw_candidates.append(f"{k}:{target}") - - # 8. Strikte Validierung gegen Kanten-Format - valid_edges = [] - for e in raw_candidates: - e_str = str(e).strip() - if self._is_valid_edge_string(e_str): - valid_edges.append(e_str) - else: - logger.debug(f" [SemanticAnalyzer] Rejected invalid edge format: '{e_str}'") - - if valid_edges: - logger.info(f"✅ [SemanticAnalyzer] Assigned {len(valid_edges)} edges to chunk.") - return valid_edges - - except Exception as e: - logger.error(f"💥 [SemanticAnalyzer] Critical error during analysis: {e}", exc_info=True) - return [] - - async def close(self): - if self.llm: - await self.llm.close() - -# Singleton Instanziierung -_analyzer_instance = None -def get_semantic_analyzer(): - global _analyzer_instance - if _analyzer_instance is None: - _analyzer_instance = SemanticAnalyzer() - return _analyzer_instance \ No newline at end of file diff --git a/scripts/audit_chunks.py b/scripts/audit_chunks.py index 6311141..65ac7a1 100644 --- a/scripts/audit_chunks.py +++ b/scripts/audit_chunks.py @@ -2,7 +2,7 @@ from __future__ import annotations import argparse, os, json, glob, statistics as stats from app.core.parser import read_markdown, normalize_frontmatter, validate_required_frontmatter -from app.core.chunker import assemble_chunks +from app.core.chunking import assemble_chunks def iter_md(root: str): for p in glob.glob(os.path.join(root, "**", "*.md"), recursive=True): diff --git a/scripts/debug_edge_loss.py b/scripts/debug_edge_loss.py index e88d2f3..ed91423 100644 --- a/scripts/debug_edge_loss.py +++ b/scripts/debug_edge_loss.py @@ -6,7 +6,7 @@ from pathlib import Path # Pfad-Setup sys.path.insert(0, os.path.abspath(".")) -from app.core.chunker import assemble_chunks, _extract_all_edges_from_md +from app.core.chunking import assemble_chunks, _extract_all_edges_from_md from app.core.derive_edges import build_edges_for_note # Mock für Settings, falls nötig diff --git a/scripts/dump_note_chunks.py b/scripts/dump_note_chunks.py index 8aba330..54b8514 100644 --- a/scripts/dump_note_chunks.py +++ b/scripts/dump_note_chunks.py @@ -2,7 +2,7 @@ from __future__ import annotations import argparse, os, glob from app.core.parser import read_markdown, normalize_frontmatter, validate_required_frontmatter -from app.core.chunker import assemble_chunks +from app.core.chunking import assemble_chunks def iter_md(root: str): return [p for p in glob.glob(os.path.join(root, "**", "*.md"), recursive=True)] diff --git a/scripts/fix_frontmatter.py b/scripts/fix_frontmatter.py index fa9edc1..b5f04d0 100644 --- a/scripts/fix_frontmatter.py +++ b/scripts/fix_frontmatter.py @@ -7,7 +7,7 @@ from slugify import slugify from app.core.parser import read_markdown, normalize_frontmatter from app.core.parser import FRONTMATTER_RE # für Re-Inject from app.core.validate_note import validate_note_payload -from app.core.note_payload import make_note_payload +from app.core.ingestion.ingestion_note_payload import make_note_payload DATE_IN_NAME = re.compile(r"(?P\d{4})[-_\.]?(?P\d{2})[-_\.]?(?P\d{2})") diff --git a/scripts/parse_validate_notes.py b/scripts/parse_validate_notes.py index 1fc5f66..d341fed 100644 --- a/scripts/parse_validate_notes.py +++ b/scripts/parse_validate_notes.py @@ -8,6 +8,8 @@ from jsonschema import ValidationError from app.core.parser import read_markdown, validate_required_frontmatter, normalize_frontmatter from app.core.note_payload import make_note_payload from app.core.validate_note import validate_note_payload +from app.core.ingestion.ingestion_note_payload import make_note_payload + def iter_md_files(root: str, include: str, exclude: list[str]) -> list[str]: # include z.B. "**/*.md" diff --git a/scripts/payload_dryrun.py b/scripts/payload_dryrun.py index ce3980a..f2ee242 100644 --- a/scripts/payload_dryrun.py +++ b/scripts/payload_dryrun.py @@ -10,9 +10,9 @@ import argparse, os, json from typing import Any, Dict, List, Optional from app.core.parser import read_markdown, normalize_frontmatter, validate_required_frontmatter -from app.core.note_payload import make_note_payload -from app.core.chunker import assemble_chunks -from app.core.chunk_payload import make_chunk_payloads +from app.core.chunking import assemble_chunks +from app.core.ingestion.ingestion_note_payload import make_note_payload +from app.core.ingestion.ingestion_chunk_payload import make_chunk_payloads try: from app.core.derive_edges import build_edges_for_note except Exception: diff --git a/scripts/preview_chunks.py b/scripts/preview_chunks.py index 9046d2a..25bb25a 100644 --- a/scripts/preview_chunks.py +++ b/scripts/preview_chunks.py @@ -2,9 +2,10 @@ from __future__ import annotations import argparse, os, glob, json from app.core.parser import read_markdown, normalize_frontmatter, validate_required_frontmatter -from app.core.chunker import assemble_chunks -from app.core.chunk_payload import make_chunk_payloads -from app.core.note_payload import make_note_payload +from app.core.chunking import assemble_chunks +from app.core.ingestion.ingestion_note_payload import make_note_payload +from app.core.ingestion.ingestion_chunk_payload import make_chunk_payloads + def iter_md(root: str) -> list[str]: return [p for p in glob.glob(os.path.join(root, "**", "*.md"), recursive=True)]