aufräumen und löschen von Alt-Scripten WP19b

This commit is contained in:
Lars 2025-12-27 14:15:22 +01:00
parent f08a331bc6
commit e3858e8bc3
14 changed files with 15 additions and 680 deletions

View File

@ -1,176 +0,0 @@
"""
FILE: app/core/chunk_payload.py
DESCRIPTION: Baut das JSON-Objekt für 'mindnet_chunks'.
FEATURES:
- Inkludiert Nachbarschafts-IDs (prev/next) und Titel.
- FIX 3: Robuste Erkennung des Inputs (Frontmatter-Dict vs. Note-Objekt), damit Overrides ankommen.
VERSION: 2.3.0
STATUS: Active
DEPENDENCIES: yaml, os
EXTERNAL_CONFIG: config/types.yaml
"""
from __future__ import annotations
from typing import Any, Dict, List, Optional
import os, yaml
def _env(n: str, d: Optional[str]=None) -> str:
v = os.getenv(n)
return v if v is not None else (d or "")
def _load_types() -> dict:
    """Load the type registry from the YAML file named by ``MINDNET_TYPES_FILE``.

    Falls back to ``./config/types.yaml`` when the variable is unset.

    Returns:
        The parsed YAML document when its root is a mapping, otherwise an
        empty dict. Loading is best-effort: any error while opening or
        parsing the file also yields an empty dict.
    """
    path = _env("MINDNET_TYPES_FILE", "./config/types.yaml")
    try:
        with open(path, "r", encoding="utf-8") as handle:
            data = yaml.safe_load(handle)
    except Exception:
        # Deliberate best-effort: a missing or broken registry must not
        # abort payload construction; callers fall back to built-in defaults.
        return {}
    # A YAML root that is a list or scalar would violate the ``dict`` contract.
    return data if isinstance(data, dict) else {}
def _get_types_map(reg: dict) -> dict:
if isinstance(reg, dict) and isinstance(reg.get("types"), dict):
return reg["types"]
return reg if isinstance(reg, dict) else {}
def _get_defaults(reg: dict) -> dict:
if isinstance(reg, dict) and isinstance(reg.get("defaults"), dict):
return reg["defaults"]
if isinstance(reg, dict) and isinstance(reg.get("global"), dict):
return reg["global"]
return {}
def _as_float(x: Any):
try: return float(x)
except Exception: return None
def _resolve_chunk_profile_from_config(note_type: str, reg: dict) -> Optional[str]:
    """Look up the chunking profile for ``note_type`` in the registry.

    The type-level entry is consulted first, then the defaults section. At
    each level ``"chunking_profile"`` takes precedence over the alternative
    spelling ``"chunk_profile"``. Only non-empty strings are accepted.

    Returns:
        The profile name, or ``None`` when neither level provides one.
    """
    type_map = _get_types_map(reg)
    type_entry = type_map.get(note_type, {}) if isinstance(type_map, dict) else None
    # Precedence: type level before defaults level.
    for section in (type_entry, _get_defaults(reg)):
        if not isinstance(section, dict):
            continue
        profile = section.get("chunking_profile") or section.get("chunk_profile")
        if isinstance(profile, str) and profile:
            return profile
    return None
def _resolve_retriever_weight_from_config(note_type: str, reg: dict) -> float:
    """Read the retriever weight for ``note_type`` from the registry only.

    Intended for the case where the frontmatter carries no weight. The
    type-level entry wins over the defaults section; a level is skipped when
    its ``"retriever_weight"`` is missing, ``None`` or not convertible to
    float.

    Returns:
        The first usable weight, or ``1.0`` when no level provides one.
    """
    type_map = _get_types_map(reg)
    type_entry = type_map.get(note_type, {}) if isinstance(type_map, dict) else None
    # Precedence: type level before defaults level.
    for section in (type_entry, _get_defaults(reg)):
        if not isinstance(section, dict):
            continue
        raw = section.get("retriever_weight")
        if raw is None:
            continue
        weight = _as_float(raw)
        if weight is not None:
            return float(weight)
    return 1.0
def _as_list(x):
if x is None: return []
if isinstance(x, list): return x
return [x]
def _chunk_field(ch: Any, name: str) -> Any:
    """Return field ``name`` of a chunk given as object or dict, or ``None`` if absent."""
    if isinstance(ch, dict):
        return ch.get(name)
    return getattr(ch, name, None)


def make_chunk_payloads(note: Dict[str, Any],
                        note_path: str,
                        chunks_from_chunker: List[Any],
                        *,
                        note_text: str = "",
                        types_cfg: Optional[dict] = None,
                        file_path: Optional[str] = None) -> List[Dict[str, Any]]:
    """Build one payload dict per chunk.

    Args:
        note: Either a container dict holding the frontmatter under the key
            ``"frontmatter"``, or the frontmatter dict itself. ``None`` is
            treated as an empty frontmatter.
        note_path: Stored as ``"path"`` and, when ``file_path`` is not given,
            as ``"source_path"``.
        chunks_from_chunker: Chunks as objects (fields read as attributes) or
            dicts (fields read as keys).
        note_text: Not used by this function; kept for call compatibility.
        types_cfg: Type registry; when it is not a dict the registry is
            loaded via ``_load_types()``.
        file_path: Optional value for ``"source_path"``.

    Returns:
        A list with one payload dict per chunk, in input order.
    """
    # Detect whether ``note`` wraps the frontmatter or is the frontmatter.
    container = note or {}
    nested_fm = container.get("frontmatter") if isinstance(container, dict) else None
    fm = nested_fm if isinstance(nested_fm, dict) else container

    note_type = fm.get("type") or container.get("type") or "concept"
    # Title fallback chain: frontmatter, container, note id, placeholder.
    title = fm.get("title") or container.get("title") or fm.get("id") or "Untitled"

    reg = types_cfg if isinstance(types_cfg, dict) else _load_types()

    # Chunk profile: frontmatter override > registry > hard-coded fallback.
    cp = fm.get("chunking_profile") or fm.get("chunk_profile")
    if not cp:
        cp = _resolve_chunk_profile_from_config(note_type, reg)
    if not cp:
        cp = "sliding_standard"

    # Retriever weight: frontmatter override > registry; unparsable -> 1.0.
    rw = fm.get("retriever_weight")
    if rw is None:
        rw = _resolve_retriever_weight_from_config(note_type, reg)
    try:
        rw = float(rw)
    except (TypeError, ValueError, OverflowError):
        rw = 1.0

    tags = fm.get("tags") or []
    if isinstance(tags, str):
        tags = [tags]

    out: List[Dict[str, Any]] = []
    for idx, ch in enumerate(chunks_from_chunker):
        # An index of 0 is valid, so only a missing index falls back to the
        # enumeration position (a plain ``or`` would discard 0).
        index = _chunk_field(ch, "index")
        if index is None:
            index = idx
        text = _chunk_field(ch, "text") or ""

        out.append({
            "note_id": _chunk_field(ch, "note_id") or fm.get("id"),
            "chunk_id": _chunk_field(ch, "id") or None,
            "title": title,
            "index": int(index),
            "ord": int(index) + 1,
            "type": note_type,
            "tags": tags,
            "text": text,
            # An empty or missing window falls back to the chunk text.
            "window": _chunk_field(ch, "window") or text,
            "neighbors_prev": _as_list(_chunk_field(ch, "neighbors_prev") or None),
            "neighbors_next": _as_list(_chunk_field(ch, "neighbors_next") or None),
            "section": _chunk_field(ch, "section") or "",
            "path": note_path,
            "source_path": file_path or note_path,
            "retriever_weight": rw,
            "chunk_profile": cp,
        })
    return out

View File

@ -1,10 +0,0 @@
"""
FILE: app/core/chunker.py
DESCRIPTION: Facade für das Chunking-Package. Stellt 100% Abwärtskompatibilität sicher.
VERSION: 3.3.0
"""
from .chunking.chunking_processor import assemble_chunks
from .chunking.chunking_utils import get_chunk_config, extract_frontmatter_from_text
from .chunking.chunking_models import Chunk
__all__ = ["assemble_chunks", "get_chunk_config", "extract_frontmatter_from_text", "Chunk"]

View File

@ -1,15 +0,0 @@
"""
FILE: app/core/ingestion.py
DESCRIPTION: Facade für das Ingestion-Package. Stellt 100% Abwärtskompatibilität sicher.
WP-14: Modularisierung der Ingestion-Pipeline abgeschlossen.
Nutzt interne Module mit 'ingestion_' Präfix für maximale Wartbarkeit.
VERSION: 2.13.0
STATUS: Active
"""
# Export der Hauptklasse für externe Module (z.B. scripts/import_markdown.py)
from .ingestion.ingestion_processor import IngestionService
# Export der Hilfsfunktionen für Abwärtskompatibilität
from .ingestion.ingestion_utils import extract_json_from_response, load_type_registry
__all__ = ["IngestionService", "extract_json_from_response", "load_type_registry"]

View File

@ -18,7 +18,7 @@ from app.core.parser import (
read_markdown, pre_scan_markdown, normalize_frontmatter, read_markdown, pre_scan_markdown, normalize_frontmatter,
validate_required_frontmatter, NoteContext validate_required_frontmatter, NoteContext
) )
from app.core.chunker import assemble_chunks from app.core.chunking import assemble_chunks
from app.core.qdrant import QdrantConfig, get_client, ensure_collections, ensure_payload_indexes from app.core.qdrant import QdrantConfig, get_client, ensure_collections, ensure_payload_indexes
from app.core.qdrant_points import points_for_chunks, points_for_note, points_for_edges, upsert_batch from app.core.qdrant_points import points_for_chunks, points_for_note, points_for_edges, upsert_batch

View File

@ -59,7 +59,7 @@ def resolve_note_type(registry: dict, requested: Optional[str]) -> str:
def get_chunk_config_by_profile(registry: dict, profile_name: str, note_type: str) -> Dict[str, Any]: def get_chunk_config_by_profile(registry: dict, profile_name: str, note_type: str) -> Dict[str, Any]:
"""Holt die Chunker-Parameter für ein spezifisches Profil aus der Registry.""" """Holt die Chunker-Parameter für ein spezifisches Profil aus der Registry."""
from app.core.chunker import get_chunk_config from app.core.chunking import get_chunk_config
profiles = registry.get("chunking_profiles", {}) profiles = registry.get("chunking_profiles", {})
if profile_name in profiles: if profile_name in profiles:
cfg = profiles[profile_name].copy() cfg = profiles[profile_name].copy()

View File

@ -1,268 +0,0 @@
"""
FILE: app/core/note_payload.py
DESCRIPTION: Baut das JSON-Objekt (Payload) für 'mindnet_notes'.
FEATURES:
1. Multi-Hash: Berechnet immer 'body' AND 'full' Hashes für flexible Change Detection.
2. Config-Fix: Liest korrekt 'chunking_profile' aus types.yaml (statt Legacy 'chunk_profile').
VERSION: 2.3.0
STATUS: Active
DEPENDENCIES: yaml, os, json, pathlib, hashlib
EXTERNAL_CONFIG: config/types.yaml
"""
from __future__ import annotations
from typing import Any, Dict, Tuple, Optional
import os
import json
import pathlib
import hashlib
try:
import yaml # type: ignore
except Exception:
yaml = None
# ---------------------------------------------------------------------------
# Helper
# ---------------------------------------------------------------------------
def _as_dict(x) -> Dict[str, Any]:
"""Versucht, ein ParsedMarkdown-ähnliches Objekt in ein Dict zu überführen."""
if isinstance(x, dict):
return dict(x)
out: Dict[str, Any] = {}
for attr in (
"frontmatter",
"body",
"id",
"note_id",
"title",
"path",
"tags",
"type",
"created",
"modified",
"date",
):
if hasattr(x, attr):
val = getattr(x, attr)
if val is not None:
out[attr] = val
if not out:
out["raw"] = str(x)
return out
def _pick_args(*args, **kwargs) -> Tuple[Optional[str], Optional[dict]]:
path = kwargs.get("path") or (args[0] if args else None)
types_cfg = kwargs.get("types_cfg") or kwargs.get("types") or None
return path, types_cfg
def _env_float(name: str, default: float) -> float:
try:
return float(os.environ.get(name, default))
except Exception:
return default
def _ensure_list(x) -> list:
if x is None:
return []
if isinstance(x, list):
return [str(i) for i in x]
if isinstance(x, (set, tuple)):
return [str(i) for i in x]
return [str(x)]
# --- Hash Logic ---
def _compute_hash(content: str) -> str:
"""Berechnet einen SHA-256 Hash für den gegebenen String."""
if not content:
return ""
return hashlib.sha256(content.encode("utf-8")).hexdigest()
def _get_hash_source_content(n: Dict[str, Any], mode: str) -> str:
"""
Stellt den String zusammen, der gehasht werden soll.
"""
body = str(n.get("body") or "")
if mode == "body":
return body
if mode == "full":
fm = n.get("frontmatter") or {}
# Wichtig: Sortierte Keys für deterministisches Verhalten!
# Wir nehmen alle steuernden Metadaten auf
meta_parts = []
# Hier checken wir keys, die eine Neu-Indizierung rechtfertigen würden
for k in sorted(["title", "type", "status", "tags", "chunking_profile", "chunk_profile", "retriever_weight"]):
val = fm.get(k)
if val is not None:
meta_parts.append(f"{k}:{val}")
meta_str = "|".join(meta_parts)
return f"{meta_str}||{body}"
return body
# ---------------------------------------------------------------------------
# Type-Registry laden
# ---------------------------------------------------------------------------
def _load_types_config(explicit_cfg: Optional[dict] = None) -> dict:
if explicit_cfg and isinstance(explicit_cfg, dict):
return explicit_cfg
path = os.getenv("MINDNET_TYPES_FILE") or "./config/types.yaml"
if not os.path.isfile(path) or yaml is None:
return {}
try:
with open(path, "r", encoding="utf-8") as f:
data = yaml.safe_load(f) or {}
return data if isinstance(data, dict) else {}
except Exception:
return {}
def _cfg_for_type(note_type: str, reg: dict) -> dict:
if not isinstance(reg, dict):
return {}
types = reg.get("types") if isinstance(reg.get("types"), dict) else reg
return types.get(note_type, {}) if isinstance(types, dict) else {}
def _cfg_defaults(reg: dict) -> dict:
if not isinstance(reg, dict):
return {}
for key in ("defaults", "default", "global"):
v = reg.get(key)
if isinstance(v, dict):
return v
return {}
# ---------------------------------------------------------------------------
# Haupt-API
# ---------------------------------------------------------------------------
def _first_float(candidates, default: float) -> float:
    """Return the first candidate that is not ``None`` and converts to float, else ``default``."""
    for candidate in candidates:
        if candidate is None:
            continue
        try:
            return float(candidate)
        except (TypeError, ValueError, OverflowError):
            continue
    return default


def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]:
    """Build the note payload dict, including body and full hashes.

    Args:
        note: A dict or an object that ``_as_dict`` can convert.
        *args: The first positional argument, if any, is the note path.
        **kwargs: ``path``; ``types_cfg`` or ``types`` (explicit registry);
            ``hash_source`` (default ``"parsed"``) and ``hash_normalize``
            (default ``"canonical"``), which only become part of the hash keys.

    Returns:
        The payload dict. ``"tags"``, ``"aliases"``, ``"created"``,
        ``"modified"``, ``"date"`` and ``"fulltext"`` are present only when a
        truthy value is available.

    Raises:
        TypeError: If the payload is not JSON-serializable (from the final
            ``json.dumps`` round trip).
    """
    n = _as_dict(note)
    path_arg, types_cfg_explicit = _pick_args(*args, **kwargs)
    reg = _load_types_config(types_cfg_explicit)

    # These two only label the hash keys; both hash modes are always computed.
    hash_source = kwargs.get("hash_source", "parsed")
    hash_normalize = kwargs.get("hash_normalize", "canonical")

    fm = n.get("frontmatter") or {}
    note_type = str(fm.get("type") or n.get("type") or "concept")

    cfg_type = _cfg_for_type(note_type, reg)
    if not isinstance(cfg_type, dict):
        # A registry entry that is not a mapping carries no usable settings.
        cfg_type = {}
    cfg_def = _cfg_defaults(reg)

    # --- retriever_weight: frontmatter > type config > defaults > env default.
    default_rw = _env_float("MINDNET_DEFAULT_RETRIEVER_WEIGHT", 1.0)
    fm_weight = fm.get("retriever_weight")
    if fm_weight is not None:
        # A frontmatter override is final; an unparsable one yields the default.
        retriever_weight = _first_float((fm_weight,), default_rw)
    else:
        # Skip levels whose value is null or unparsable instead of letting
        # them mask the next level.
        retriever_weight = _first_float(
            (cfg_type.get("retriever_weight"), cfg_def.get("retriever_weight")),
            default_rw,
        )

    # --- chunk_profile: the first non-empty string wins. Order: frontmatter
    # (both spellings), type config, defaults config, hard-coded fallback.
    chunk_profile = "sliding_standard"
    for candidate in (
        fm.get("chunking_profile"),
        fm.get("chunk_profile"),
        cfg_type.get("chunking_profile"),
        cfg_def.get("chunking_profile"),
    ):
        if isinstance(candidate, str) and candidate:
            chunk_profile = candidate
            break

    # --- edge_defaults: frontmatter > type config > defaults config.
    edge_defaults = fm.get("edge_defaults")
    if edge_defaults is None:
        edge_defaults = cfg_type.get(
            "edge_defaults",
            cfg_def.get("edge_defaults", []),
        )
    edge_defaults = _ensure_list(edge_defaults)

    # --- base metadata
    note_id = n.get("note_id") or n.get("id") or fm.get("id")
    title = n.get("title") or fm.get("title") or ""
    path = n.get("path") or path_arg
    if isinstance(path, pathlib.Path):
        path = str(path)

    payload: Dict[str, Any] = {
        "note_id": note_id,
        "title": title,
        "type": note_type,
        "path": path or "",
        "retriever_weight": retriever_weight,
        "chunk_profile": chunk_profile,
        "edge_defaults": edge_defaults,
        "hashes": {},
    }

    # Both hash strategies are stored; key schema is mode:source:normalize,
    # e.g. "full:parsed:canonical".
    for mode in ("body", "full"):
        content_to_hash = _get_hash_source_content(n, mode)
        payload["hashes"][f"{mode}:{hash_source}:{hash_normalize}"] = _compute_hash(content_to_hash)

    # Tags / keywords
    tags = fm.get("tags") or fm.get("keywords") or n.get("tags")
    if tags:
        payload["tags"] = _ensure_list(tags)

    aliases = fm.get("aliases")
    if aliases:
        payload["aliases"] = _ensure_list(aliases)

    # Timestamps are stored as strings.
    for k in ("created", "modified", "date"):
        v = fm.get(k) or n.get(k)
        if v:
            payload[k] = str(v)

    if "body" in n and n["body"]:
        payload["fulltext"] = str(n["body"])

    # Round trip through JSON to fail early on non-serializable values.
    json.loads(json.dumps(payload, ensure_ascii=False))
    return payload

View File

@ -1,199 +0,0 @@
"""
FILE: app/services/semantic_analyzer.py
DESCRIPTION: KI-gestützte Kanten-Validierung. Nutzt LLM (Background-Priority), um Kanten präzise einem Chunk zuzuordnen.
WP-20 Fix: Volle Kompatibilität mit der provider-basierten Routing-Logik (OpenRouter Primary).
WP-22: Integration von valid_types zur Halluzinations-Vermeidung.
FIX: Mistral-sicheres JSON-Parsing (<s> & [OUT] Handling) und 100% Logik-Erhalt.
VERSION: 2.2.6
STATUS: Active
DEPENDENCIES: app.services.llm_service, app.services.edge_registry, json, logging, re
"""
import json
import logging
import re
from typing import List, Optional, Any
from dataclasses import dataclass
# Importe
from app.services.llm_service import LLMService
# WP-22: Registry für Vokabular-Erzwingung
from app.services.edge_registry import registry as edge_registry
logger = logging.getLogger(__name__)
class SemanticAnalyzer:
    """LLM-backed filter that decides which candidate edges belong to a chunk."""

    def __init__(self):
        self.llm = LLMService()

    def _is_valid_edge_string(self, edge_str: str) -> bool:
        """Check whether ``edge_str`` has the form ``kind:target``.

        Rejects LLM chatter: the part before the first colon must contain no
        space and be 2 to 40 characters long, and the part after it must be
        non-empty (both parts are stripped before checking).
        """
        if not isinstance(edge_str, str) or ":" not in edge_str:
            return False
        kind, target = (part.strip() for part in edge_str.split(":", 1))
        # A relation type is a single token, not a phrase or sentence.
        if " " in kind:
            return False
        if not 2 <= len(kind) <= 40:
            return False
        return bool(target)

    def _extract_json_safely(self, text: str) -> Any:
        """Parse JSON out of a raw LLM response.

        Steps: strip the markers ``<s>``, ``</s>``, ``[OUT]``, ``[/OUT]``;
        prefer the content of a fenced code block if one is present; try to
        parse that; on failure retry with the span from the first ``[`` to
        the last ``]``, then with the span from the first ``{`` to the last
        ``}``.

        Returns:
            The decoded JSON value, or an empty list when ``text`` is empty
            or nothing could be parsed.
        """
        if not text:
            return []

        clean = text
        for marker in ("<s>", "</s>", "[OUT]", "[/OUT]"):
            clean = clean.replace(marker, "")
        clean = clean.strip()

        match = re.search(r"```(?:json)?\s*(.*?)\s*```", clean, re.DOTALL)
        payload = match.group(1) if match else clean

        try:
            return json.loads(payload.strip())
        except json.JSONDecodeError:
            pass

        # Recovery: array span first, then object span. Only decode errors
        # are swallowed, so interrupts are no longer caught by accident.
        for opener, closer in (("[", "]"), ("{", "}")):
            start = payload.find(opener)
            end = payload.rfind(closer) + 1
            if start != -1 and end > start:
                try:
                    return json.loads(payload[start:end])
                except json.JSONDecodeError:
                    continue
        return []

    def _collect_candidates(self, data: Any) -> List[Any]:
        """Turn decoded LLM output into a flat list of edge candidates.

        A list is returned unchanged. For a dict, the first list found under
        ``edges``, ``results``, ``kanten`` or ``matches`` is used; if that
        gives nothing, ``key:value`` strings are built from string values and
        from the string items of list values. Other types yield an empty list.
        """
        if isinstance(data, list):
            return data
        if not isinstance(data, dict):
            return []

        logger.info(" [SemanticAnalyzer] LLM returned dict, trying recovery.")
        candidates: List[Any] = []
        for key in ("edges", "results", "kanten", "matches"):
            if key in data and isinstance(data[key], list):
                candidates.extend(data[key])
                break

        if not candidates:
            # Last resort: interpret the dict itself as {kind: target(s)}.
            for k, v in data.items():
                if isinstance(v, str):
                    candidates.append(f"{k}:{v}")
                elif isinstance(v, list):
                    candidates.extend(f"{k}:{target}" for target in v if isinstance(target, str))
        return candidates

    async def assign_edges_to_chunk(self, chunk_text: str, all_edges: List[str], note_type: str) -> List[str]:
        """Ask the LLM which of ``all_edges`` are relevant for ``chunk_text``.

        Args:
            chunk_text: Chunk content; only the first 6000 characters are
                placed in the prompt.
            all_edges: Candidate edges offered to the LLM.
            note_type: Not used by this method.

        Returns:
            The returned candidates that pass ``_is_valid_edge_string``. An
            empty list is returned when there are no candidates, the prompt
            cannot be formatted, the response yields no data, or any
            exception occurs during the call (the exception is logged).
        """
        if not all_edges:
            return []

        # 1. Provider and model come from the LLM service settings.
        settings = self.llm.settings
        provider = settings.MINDNET_LLM_PROVIDER
        model = settings.OPENROUTER_MODEL if provider == "openrouter" else settings.GEMINI_MODEL

        # 2. Provider-specific prompt, with a built-in recovery template.
        prompt_template = self.llm.get_prompt("edge_allocation_template", provider)
        if not prompt_template or not isinstance(prompt_template, str):
            logger.warning("⚠️ [SemanticAnalyzer] Prompt 'edge_allocation_template' ungültig. Nutze Recovery-Template.")
            prompt_template = (
                "TASK: Wähle aus den Kandidaten die relevanten Kanten für den Text.\n"
                "TEXT: {chunk_text}\n"
                "KANDIDATEN: {edge_list}\n"
                "OUTPUT: JSON Liste von Strings [\"kind:target\"]."
            )

        # 3. Template data, including the currently valid edge vocabulary.
        edge_registry.ensure_latest()
        valid_types_str = ", ".join(sorted(edge_registry.valid_types))
        edges_str = "\n".join(f"- {e}" for e in all_edges)

        logger.debug(f"🔍 [SemanticAnalyzer] Request: {len(chunk_text)} chars Text, {len(all_edges)} Candidates.")

        # 4. Fill the prompt; a template with unknown placeholders is reported.
        try:
            final_prompt = prompt_template.format(
                chunk_text=chunk_text[:6000],
                edge_list=edges_str,
                valid_types=valid_types_str,
            )
        except Exception as format_err:
            logger.error(f"❌ [SemanticAnalyzer] Prompt Formatting failed: {format_err}")
            return []

        try:
            # 5. LLM call with background priority.
            response_json = await self.llm.generate_raw_response(
                prompt=final_prompt,
                force_json=True,
                max_retries=3,
                base_delay=2.0,
                priority="background",
                provider=provider,
                model_override=model,
            )

            # 6. Tolerant JSON parsing.
            data = self._extract_json_safely(response_json)
            if not data:
                return []

            # 7. Normalize list/dict output into a flat candidate list.
            raw_candidates = self._collect_candidates(data)

            # 8. Keep only strings in strict ``kind:target`` form.
            valid_edges = []
            for e in raw_candidates:
                e_str = str(e).strip()
                if self._is_valid_edge_string(e_str):
                    valid_edges.append(e_str)
                else:
                    logger.debug(f" [SemanticAnalyzer] Rejected invalid edge format: '{e_str}'")

            if valid_edges:
                logger.info(f"✅ [SemanticAnalyzer] Assigned {len(valid_edges)} edges to chunk.")
            return valid_edges

        except Exception as e:
            # Best-effort boundary: a failed analysis must not abort the caller.
            logger.error(f"💥 [SemanticAnalyzer] Critical error during analysis: {e}", exc_info=True)
            return []

    async def close(self):
        """Close the underlying LLM service, if one is set."""
        if self.llm:
            await self.llm.close()
# Lazily created, module-wide analyzer instance
_analyzer_instance = None


def get_semantic_analyzer():
    """Return the shared ``SemanticAnalyzer``, creating it on first use."""
    global _analyzer_instance
    if _analyzer_instance is not None:
        return _analyzer_instance
    _analyzer_instance = SemanticAnalyzer()
    return _analyzer_instance

View File

@ -2,7 +2,7 @@
from __future__ import annotations from __future__ import annotations
import argparse, os, json, glob, statistics as stats import argparse, os, json, glob, statistics as stats
from app.core.parser import read_markdown, normalize_frontmatter, validate_required_frontmatter from app.core.parser import read_markdown, normalize_frontmatter, validate_required_frontmatter
from app.core.chunker import assemble_chunks from app.core.chunking import assemble_chunks
def iter_md(root: str): def iter_md(root: str):
for p in glob.glob(os.path.join(root, "**", "*.md"), recursive=True): for p in glob.glob(os.path.join(root, "**", "*.md"), recursive=True):

View File

@ -6,7 +6,7 @@ from pathlib import Path
# Pfad-Setup # Pfad-Setup
sys.path.insert(0, os.path.abspath(".")) sys.path.insert(0, os.path.abspath("."))
from app.core.chunker import assemble_chunks, _extract_all_edges_from_md from app.core.chunking import assemble_chunks, _extract_all_edges_from_md
from app.core.derive_edges import build_edges_for_note from app.core.derive_edges import build_edges_for_note
# Mock für Settings, falls nötig # Mock für Settings, falls nötig

View File

@ -2,7 +2,7 @@
from __future__ import annotations from __future__ import annotations
import argparse, os, glob import argparse, os, glob
from app.core.parser import read_markdown, normalize_frontmatter, validate_required_frontmatter from app.core.parser import read_markdown, normalize_frontmatter, validate_required_frontmatter
from app.core.chunker import assemble_chunks from app.core.chunking import assemble_chunks
def iter_md(root: str): def iter_md(root: str):
return [p for p in glob.glob(os.path.join(root, "**", "*.md"), recursive=True)] return [p for p in glob.glob(os.path.join(root, "**", "*.md"), recursive=True)]

View File

@ -7,7 +7,7 @@ from slugify import slugify
from app.core.parser import read_markdown, normalize_frontmatter from app.core.parser import read_markdown, normalize_frontmatter
from app.core.parser import FRONTMATTER_RE # für Re-Inject from app.core.parser import FRONTMATTER_RE # für Re-Inject
from app.core.validate_note import validate_note_payload from app.core.validate_note import validate_note_payload
from app.core.note_payload import make_note_payload from app.core.ingestion.ingestion_note_payload import make_note_payload
DATE_IN_NAME = re.compile(r"(?P<y>\d{4})[-_\.]?(?P<m>\d{2})[-_\.]?(?P<d>\d{2})") DATE_IN_NAME = re.compile(r"(?P<y>\d{4})[-_\.]?(?P<m>\d{2})[-_\.]?(?P<d>\d{2})")

View File

@ -8,6 +8,8 @@ from jsonschema import ValidationError
from app.core.parser import read_markdown, validate_required_frontmatter, normalize_frontmatter from app.core.parser import read_markdown, validate_required_frontmatter, normalize_frontmatter
from app.core.note_payload import make_note_payload from app.core.note_payload import make_note_payload
from app.core.validate_note import validate_note_payload from app.core.validate_note import validate_note_payload
from app.core.ingestion.ingestion_note_payload import make_note_payload
def iter_md_files(root: str, include: str, exclude: list[str]) -> list[str]: def iter_md_files(root: str, include: str, exclude: list[str]) -> list[str]:
# include z.B. "**/*.md" # include z.B. "**/*.md"

View File

@ -10,9 +10,9 @@ import argparse, os, json
from typing import Any, Dict, List, Optional from typing import Any, Dict, List, Optional
from app.core.parser import read_markdown, normalize_frontmatter, validate_required_frontmatter from app.core.parser import read_markdown, normalize_frontmatter, validate_required_frontmatter
from app.core.note_payload import make_note_payload from app.core.chunking import assemble_chunks
from app.core.chunker import assemble_chunks from app.core.ingestion.ingestion_note_payload import make_note_payload
from app.core.chunk_payload import make_chunk_payloads from app.core.ingestion.ingestion_chunk_payload import make_chunk_payloads
try: try:
from app.core.derive_edges import build_edges_for_note from app.core.derive_edges import build_edges_for_note
except Exception: except Exception:

View File

@ -2,9 +2,10 @@
from __future__ import annotations from __future__ import annotations
import argparse, os, glob, json import argparse, os, glob, json
from app.core.parser import read_markdown, normalize_frontmatter, validate_required_frontmatter from app.core.parser import read_markdown, normalize_frontmatter, validate_required_frontmatter
from app.core.chunker import assemble_chunks from app.core.chunking import assemble_chunks
from app.core.chunk_payload import make_chunk_payloads from app.core.ingestion.ingestion_note_payload import make_note_payload
from app.core.note_payload import make_note_payload from app.core.ingestion.ingestion_chunk_payload import make_chunk_payloads
def iter_md(root: str) -> list[str]: def iter_md(root: str) -> list[str]:
return [p for p in glob.glob(os.path.join(root, "**", "*.md"), recursive=True)] return [p for p in glob.glob(os.path.join(root, "**", "*.md"), recursive=True)]