WP15b #15
|
|
@ -1,52 +1,68 @@
|
||||||
"""
|
"""
|
||||||
FILE: app/core/ingestion/ingestion_chunk_payload.py
|
FILE: app/core/ingestion/ingestion_chunk_payload.py
|
||||||
DESCRIPTION: Baut das JSON-Objekt für 'mindnet_chunks'.
|
DESCRIPTION: Baut das JSON-Objekt für 'mindnet_chunks'.
|
||||||
Fix v2.4.1: Behebt AttributeError bei Zugriff auf Chunk-Objekte.
|
Fix v2.4.2: Audit-Check (Cleanup pop, Config-Resolution Hierarchie).
|
||||||
VERSION: 2.4.1
|
VERSION: 2.4.2
|
||||||
STATUS: Active
|
STATUS: Active
|
||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
from typing import Any, Dict, List, Optional
|
from typing import Any, Dict, List, Optional
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Resolution Helpers (Audited)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
def _as_list(x):
|
def _as_list(x):
|
||||||
if x is None: return []
|
if x is None: return []
|
||||||
return x if isinstance(x, list) else [x]
|
return x if isinstance(x, list) else [x]
|
||||||
|
|
||||||
def make_chunk_payloads(note: Dict[str, Any],
|
def _resolve_val(note_type: str, reg: dict, key: str, default: Any) -> Any:
|
||||||
note_path: str,
|
"""Hierarchische Suche: Type > Default."""
|
||||||
chunks_from_chunker: List[Any],
|
types = reg.get("types", {})
|
||||||
**kwargs) -> List[Dict[str, Any]]:
|
if isinstance(types, dict):
|
||||||
"""
|
t_cfg = types.get(note_type, {})
|
||||||
Erstellt die Payloads für die Chunks eines Dokuments.
|
if isinstance(t_cfg, dict):
|
||||||
Robust gegenüber Chunk-Objekten (Dataclasses) und Dictionaries.
|
val = t_cfg.get(key) or t_cfg.get(key.replace("ing", "")) # chunking_ vs chunk_
|
||||||
"""
|
if val is not None: return val
|
||||||
# Frontmatter Extraktion
|
defs = reg.get("defaults", {}) or reg.get("global", {})
|
||||||
if isinstance(note, dict) and "frontmatter" in note:
|
if isinstance(defs, dict):
|
||||||
fm = note["frontmatter"]
|
val = defs.get(key) or defs.get(key.replace("ing", ""))
|
||||||
else:
|
if val is not None: return val
|
||||||
fm = note or {}
|
return default
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Haupt-API
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def make_chunk_payloads(note: Dict[str, Any], note_path: str, chunks_from_chunker: List[Any], **kwargs) -> List[Dict[str, Any]]:
|
||||||
|
"""Erstellt die Payloads für die Chunks inklusive Audit-Resolution."""
|
||||||
|
if isinstance(note, dict) and "frontmatter" in note: fm = note["frontmatter"]
|
||||||
|
else: fm = note or {}
|
||||||
|
|
||||||
|
reg = kwargs.get("types_cfg") or {}
|
||||||
note_type = fm.get("type") or "concept"
|
note_type = fm.get("type") or "concept"
|
||||||
title = fm.get("title") or fm.get("id") or "Untitled"
|
title = fm.get("title") or fm.get("id") or "Untitled"
|
||||||
tags = _as_list(fm.get("tags") or [])
|
tags = _as_list(fm.get("tags") or [])
|
||||||
cp = fm.get("chunking_profile") or fm.get("chunk_profile") or "sliding_standard"
|
|
||||||
rw = float(fm.get("retriever_weight", 1.0))
|
# Audit: Resolution Hierarchie
|
||||||
|
cp = fm.get("chunking_profile") or fm.get("chunk_profile")
|
||||||
|
if not cp: cp = _resolve_val(note_type, reg, "chunking_profile", "sliding_standard")
|
||||||
|
|
||||||
|
rw = fm.get("retriever_weight")
|
||||||
|
if rw is None: rw = _resolve_val(note_type, reg, "retriever_weight", 1.0)
|
||||||
|
try: rw = float(rw)
|
||||||
|
except: rw = 1.0
|
||||||
|
|
||||||
out: List[Dict[str, Any]] = []
|
out: List[Dict[str, Any]] = []
|
||||||
for idx, ch in enumerate(chunks_from_chunker):
|
for idx, ch in enumerate(chunks_from_chunker):
|
||||||
# Dynamische Extraktion basierend auf Typ (Objekt vs Dict)
|
|
||||||
is_dict = isinstance(ch, dict)
|
is_dict = isinstance(ch, dict)
|
||||||
|
|
||||||
cid = getattr(ch, "id", None) if not is_dict else ch.get("id")
|
cid = getattr(ch, "id", None) if not is_dict else ch.get("id")
|
||||||
nid = getattr(ch, "note_id", None) if not is_dict else ch.get("note_id")
|
nid = getattr(ch, "note_id", None) if not is_dict else ch.get("note_id")
|
||||||
index = getattr(ch, "index", idx) if not is_dict else ch.get("index", idx)
|
index = getattr(ch, "index", idx) if not is_dict else ch.get("index", idx)
|
||||||
text = getattr(ch, "text", "") if not is_dict else ch.get("text", "")
|
text = getattr(ch, "text", "") if not is_dict else ch.get("text", "")
|
||||||
window = getattr(ch, "window", text) if not is_dict else ch.get("window", text)
|
window = getattr(ch, "window", text) if not is_dict else ch.get("window", text)
|
||||||
|
|
||||||
prev_id = getattr(ch, "neighbors_prev", None) if not is_dict else ch.get("neighbors_prev")
|
prev_id = getattr(ch, "neighbors_prev", None) if not is_dict else ch.get("neighbors_prev")
|
||||||
next_id = getattr(ch, "neighbors_next", None) if not is_dict else ch.get("neighbors_next")
|
next_id = getattr(ch, "neighbors_next", None) if not is_dict else ch.get("neighbors_next")
|
||||||
|
|
||||||
# Korrektur des AttributeError: Nutzt getattr für Objekte, .get für Dicts
|
|
||||||
section = getattr(ch, "section_title", "") if not is_dict else ch.get("section", "")
|
section = getattr(ch, "section_title", "") if not is_dict else ch.get("section", "")
|
||||||
|
|
||||||
pl: Dict[str, Any] = {
|
pl: Dict[str, Any] = {
|
||||||
|
|
@ -67,6 +83,10 @@ def make_chunk_payloads(note: Dict[str, Any],
|
||||||
"retriever_weight": rw,
|
"retriever_weight": rw,
|
||||||
"chunk_profile": cp
|
"chunk_profile": cp
|
||||||
}
|
}
|
||||||
out.append(pl)
|
|
||||||
|
|
||||||
|
# Audit: Cleanup Pop (Alias Felder entfernen)
|
||||||
|
for alias in ("chunk_num", "Chunk_Number"):
|
||||||
|
pl.pop(alias, None)
|
||||||
|
|
||||||
|
out.append(pl)
|
||||||
return out
|
return out
|
||||||
|
|
@ -3,8 +3,8 @@ FILE: app/core/ingestion/ingestion_note_payload.py
|
||||||
DESCRIPTION: Baut das JSON-Objekt für mindnet_notes.
|
DESCRIPTION: Baut das JSON-Objekt für mindnet_notes.
|
||||||
FEATURES:
|
FEATURES:
|
||||||
- Multi-Hash (body/full) für flexible Change Detection.
|
- Multi-Hash (body/full) für flexible Change Detection.
|
||||||
- Fix v2.4.2: edge_defaults Logik wiederhergestellt (DoD-Korrektur).
|
- Fix v2.4.3: Vollständiger Audit-Check (Env-Vars, JSON-Validation, Edge-Defaults).
|
||||||
VERSION: 2.4.2
|
VERSION: 2.4.3
|
||||||
STATUS: Active
|
STATUS: Active
|
||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
@ -13,14 +13,13 @@ import os
|
||||||
import json
|
import json
|
||||||
import pathlib
|
import pathlib
|
||||||
import hashlib
|
import hashlib
|
||||||
import yaml
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Helper
|
# Helper
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
def _as_dict(x) -> Dict[str, Any]:
|
def _as_dict(x) -> Dict[str, Any]:
|
||||||
"""Versucht, ein ParsedMarkdown-ähnliches Objekt in ein Dict zu überführen."""
|
"""Versucht, ein Objekt in ein Dict zu überführen."""
|
||||||
if isinstance(x, dict): return dict(x)
|
if isinstance(x, dict): return dict(x)
|
||||||
out: Dict[str, Any] = {}
|
out: Dict[str, Any] = {}
|
||||||
for attr in ("frontmatter", "body", "id", "note_id", "title", "path", "tags", "type", "created", "modified", "date"):
|
for attr in ("frontmatter", "body", "id", "note_id", "title", "path", "tags", "type", "created", "modified", "date"):
|
||||||
|
|
@ -31,25 +30,24 @@ def _as_dict(x) -> Dict[str, Any]:
|
||||||
return out
|
return out
|
||||||
|
|
||||||
def _ensure_list(x) -> list:
|
def _ensure_list(x) -> list:
|
||||||
"""Sichert, dass das Ergebnis eine Liste von Strings ist."""
|
"""Sichert String-Listen Integrität."""
|
||||||
if x is None: return []
|
if x is None: return []
|
||||||
if isinstance(x, list): return [str(i) for i in x]
|
if isinstance(x, list): return [str(i) for i in x]
|
||||||
if isinstance(x, (set, tuple)): return [str(i) for i in x]
|
if isinstance(x, (set, tuple)): return [str(i) for i in x]
|
||||||
return [str(x)]
|
return [str(x)]
|
||||||
|
|
||||||
def _compute_hash(content: str) -> str:
|
def _compute_hash(content: str) -> str:
|
||||||
"""Berechnet einen SHA-256 Hash."""
|
"""SHA-256 Hash-Berechnung."""
|
||||||
if not content: return ""
|
if not content: return ""
|
||||||
return hashlib.sha256(content.encode("utf-8")).hexdigest()
|
return hashlib.sha256(content.encode("utf-8")).hexdigest()
|
||||||
|
|
||||||
def _get_hash_source_content(n: Dict[str, Any], mode: str) -> str:
|
def _get_hash_source_content(n: Dict[str, Any], mode: str) -> str:
|
||||||
"""Stellt den zu hashenden Content deterministisch zusammen."""
|
"""Generiert den Hash-Input-String."""
|
||||||
body = str(n.get("body") or "")
|
body = str(n.get("body") or "")
|
||||||
if mode == "body": return body
|
if mode == "body": return body
|
||||||
if mode == "full":
|
if mode == "full":
|
||||||
fm = n.get("frontmatter") or {}
|
fm = n.get("frontmatter") or {}
|
||||||
meta_parts = []
|
meta_parts = []
|
||||||
# Steuernde Metadaten für Change Detection
|
|
||||||
for k in sorted(["title", "type", "status", "tags", "chunking_profile", "chunk_profile", "retriever_weight"]):
|
for k in sorted(["title", "type", "status", "tags", "chunking_profile", "chunk_profile", "retriever_weight"]):
|
||||||
val = fm.get(k)
|
val = fm.get(k)
|
||||||
if val is not None: meta_parts.append(f"{k}:{val}")
|
if val is not None: meta_parts.append(f"{k}:{val}")
|
||||||
|
|
@ -57,13 +55,13 @@ def _get_hash_source_content(n: Dict[str, Any], mode: str) -> str:
|
||||||
return body
|
return body
|
||||||
|
|
||||||
def _cfg_for_type(note_type: str, reg: dict) -> dict:
|
def _cfg_for_type(note_type: str, reg: dict) -> dict:
|
||||||
"""Holt die typ-spezifische Konfiguration."""
|
"""Extrahiert Typ-spezifische Config."""
|
||||||
if not isinstance(reg, dict): return {}
|
if not isinstance(reg, dict): return {}
|
||||||
types = reg.get("types") if isinstance(reg.get("types"), dict) else reg
|
types = reg.get("types") if isinstance(reg.get("types"), dict) else reg
|
||||||
return types.get(note_type, {}) if isinstance(types, dict) else {}
|
return types.get(note_type, {}) if isinstance(types, dict) else {}
|
||||||
|
|
||||||
def _cfg_defaults(reg: dict) -> dict:
|
def _cfg_defaults(reg: dict) -> dict:
|
||||||
"""Holt die globalen Default-Werte aus der Registry."""
|
"""Extrahiert globale Default-Werte."""
|
||||||
if not isinstance(reg, dict): return {}
|
if not isinstance(reg, dict): return {}
|
||||||
for key in ("defaults", "default", "global"):
|
for key in ("defaults", "default", "global"):
|
||||||
v = reg.get(key)
|
v = reg.get(key)
|
||||||
|
|
@ -75,9 +73,7 @@ def _cfg_defaults(reg: dict) -> dict:
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]:
|
def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]:
|
||||||
"""
|
"""Baut das Note-Payload inklusive Multi-Hash und Audit-Validierung."""
|
||||||
Baut das Note-Payload inklusive Multi-Hash und edge_defaults.
|
|
||||||
"""
|
|
||||||
n = _as_dict(note)
|
n = _as_dict(note)
|
||||||
reg = kwargs.get("types_cfg") or {}
|
reg = kwargs.get("types_cfg") or {}
|
||||||
hash_source = kwargs.get("hash_source", "parsed")
|
hash_source = kwargs.get("hash_source", "parsed")
|
||||||
|
|
@ -89,19 +85,22 @@ def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]:
|
||||||
cfg_type = _cfg_for_type(note_type, reg)
|
cfg_type = _cfg_for_type(note_type, reg)
|
||||||
cfg_def = _cfg_defaults(reg)
|
cfg_def = _cfg_defaults(reg)
|
||||||
|
|
||||||
# --- retriever_weight ---
|
# --- retriever_weight Audit ---
|
||||||
|
default_rw = float(os.environ.get("MINDNET_DEFAULT_RETRIEVER_WEIGHT", 1.0))
|
||||||
retriever_weight = fm.get("retriever_weight")
|
retriever_weight = fm.get("retriever_weight")
|
||||||
if retriever_weight is None:
|
if retriever_weight is None:
|
||||||
retriever_weight = cfg_type.get("retriever_weight", cfg_def.get("retriever_weight", 1.0))
|
retriever_weight = cfg_type.get("retriever_weight", cfg_def.get("retriever_weight", default_rw))
|
||||||
try: retriever_weight = float(retriever_weight)
|
try: retriever_weight = float(retriever_weight)
|
||||||
except: retriever_weight = 1.0
|
except: retriever_weight = default_rw
|
||||||
|
|
||||||
# --- chunk_profile ---
|
# --- chunk_profile Audit ---
|
||||||
chunk_profile = fm.get("chunking_profile") or fm.get("chunk_profile")
|
chunk_profile = fm.get("chunking_profile") or fm.get("chunk_profile")
|
||||||
if chunk_profile is None:
|
if chunk_profile is None:
|
||||||
chunk_profile = cfg_type.get("chunking_profile", cfg_def.get("chunking_profile", "sliding_standard"))
|
chunk_profile = cfg_type.get("chunking_profile")
|
||||||
|
if chunk_profile is None:
|
||||||
|
chunk_profile = cfg_def.get("chunking_profile", "sliding_standard")
|
||||||
|
|
||||||
# --- edge_defaults (WIEDERHERGESTELLT) ---
|
# --- edge_defaults ---
|
||||||
edge_defaults = fm.get("edge_defaults")
|
edge_defaults = fm.get("edge_defaults")
|
||||||
if edge_defaults is None:
|
if edge_defaults is None:
|
||||||
edge_defaults = cfg_type.get("edge_defaults", cfg_def.get("edge_defaults", []))
|
edge_defaults = cfg_type.get("edge_defaults", cfg_def.get("edge_defaults", []))
|
||||||
|
|
@ -110,29 +109,35 @@ def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]:
|
||||||
# --- Basis-Metadaten ---
|
# --- Basis-Metadaten ---
|
||||||
note_id = n.get("note_id") or n.get("id") or fm.get("id")
|
note_id = n.get("note_id") or n.get("id") or fm.get("id")
|
||||||
title = n.get("title") or fm.get("title") or ""
|
title = n.get("title") or fm.get("title") or ""
|
||||||
|
path = n.get("path") or kwargs.get("file_path") or ""
|
||||||
|
if isinstance(path, pathlib.Path): path = str(path)
|
||||||
|
|
||||||
payload: Dict[str, Any] = {
|
payload: Dict[str, Any] = {
|
||||||
"note_id": note_id,
|
"note_id": note_id,
|
||||||
"title": title,
|
"title": title,
|
||||||
"type": note_type,
|
"type": note_type,
|
||||||
"path": str(n.get("path") or kwargs.get("file_path") or ""),
|
"path": path,
|
||||||
"retriever_weight": retriever_weight,
|
"retriever_weight": retriever_weight,
|
||||||
"chunk_profile": chunk_profile,
|
"chunk_profile": chunk_profile,
|
||||||
"edge_defaults": edge_defaults, # Feld jetzt wieder enthalten
|
"edge_defaults": edge_defaults,
|
||||||
"hashes": {}
|
"hashes": {}
|
||||||
}
|
}
|
||||||
|
|
||||||
# --- MULTI-HASH ---
|
# --- MULTI-HASH ---
|
||||||
for mode in ["body", "full"]:
|
for mode in ["body", "full"]:
|
||||||
key = f"{mode}:{hash_source}:{hash_normalize}"
|
content = _get_hash_source_content(n, mode)
|
||||||
payload["hashes"][key] = _compute_hash(_get_hash_source_content(n, mode))
|
payload["hashes"][f"{mode}:{hash_source}:{hash_normalize}"] = _compute_hash(content)
|
||||||
|
|
||||||
# Metadaten-Felder
|
# Metadaten
|
||||||
if fm.get("tags") or n.get("tags"): payload["tags"] = _ensure_list(fm.get("tags") or n.get("tags"))
|
tags = fm.get("tags") or fm.get("keywords") or n.get("tags")
|
||||||
|
if tags: payload["tags"] = _ensure_list(tags)
|
||||||
if fm.get("aliases"): payload["aliases"] = _ensure_list(fm.get("aliases"))
|
if fm.get("aliases"): payload["aliases"] = _ensure_list(fm.get("aliases"))
|
||||||
for k in ("created", "modified", "date"):
|
for k in ("created", "modified", "date"):
|
||||||
v = fm.get(k) or n.get(k)
|
v = fm.get(k) or n.get(k)
|
||||||
if v: payload[k] = str(v)
|
if v: payload[k] = str(v)
|
||||||
if n.get("body"): payload["fulltext"] = str(n["body"])
|
if n.get("body"): payload["fulltext"] = str(n["body"])
|
||||||
|
|
||||||
|
# Final JSON Validation Audit
|
||||||
|
json.loads(json.dumps(payload, ensure_ascii=False))
|
||||||
|
|
||||||
return payload
|
return payload
|
||||||
|
|
@ -1,9 +1,11 @@
|
||||||
"""
|
"""
|
||||||
FILE: app/core/ingestion/ingestion_processor.py
|
FILE: app/core/ingestion/ingestion_processor.py
|
||||||
DESCRIPTION: Orchestriert den Ingestion-Prozess (Parsing -> Chunking -> Validierung -> DB).
|
DESCRIPTION: Der zentrale IngestionService (Orchestrator).
|
||||||
WP-14: Modularisiert. Nutzt interne Module für DB, Validierung und Payloads.
|
WP-14: Vollständig modularisiert.
|
||||||
WP-15b: Implementiert den Two-Pass Workflow via run_batch.
|
WP-15b: Two-Pass Workflow mit globalem Kontext-Cache.
|
||||||
VERSION: 2.13.2
|
WP-20/22: Cloud-Resilienz und Content-Lifecycle integriert.
|
||||||
|
AUDIT v2.13.4: 100% Logik-Erhalt (Parameters, Registry-Context, DB-Points).
|
||||||
|
VERSION: 2.13.4
|
||||||
STATUS: Active
|
STATUS: Active
|
||||||
"""
|
"""
|
||||||
import logging
|
import logging
|
||||||
|
|
@ -67,7 +69,7 @@ class IngestionService:
|
||||||
async def run_batch(self, file_paths: List[str], vault_root: str) -> List[Dict[str, Any]]:
|
async def run_batch(self, file_paths: List[str], vault_root: str) -> List[Dict[str, Any]]:
|
||||||
"""
|
"""
|
||||||
WP-15b: Implementiert den Two-Pass Ingestion Workflow.
|
WP-15b: Implementiert den Two-Pass Ingestion Workflow.
|
||||||
Pass 1: Pre-Scan füllt den Context-Cache.
|
Pass 1: Pre-Scan füllt den Context-Cache (3-Wege-Indexierung).
|
||||||
Pass 2: Verarbeitung nutzt den Cache für die semantische Prüfung.
|
Pass 2: Verarbeitung nutzt den Cache für die semantische Prüfung.
|
||||||
"""
|
"""
|
||||||
logger.info(f"🔍 [Pass 1] Pre-Scanning {len(file_paths)} files for Context Cache...")
|
logger.info(f"🔍 [Pass 1] Pre-Scanning {len(file_paths)} files for Context Cache...")
|
||||||
|
|
@ -91,6 +93,7 @@ class IngestionService:
|
||||||
apply = kwargs.get("apply", False)
|
apply = kwargs.get("apply", False)
|
||||||
force_replace = kwargs.get("force_replace", False)
|
force_replace = kwargs.get("force_replace", False)
|
||||||
purge_before = kwargs.get("purge_before", False)
|
purge_before = kwargs.get("purge_before", False)
|
||||||
|
note_scope_refs = kwargs.get("note_scope_refs", False)
|
||||||
hash_source = kwargs.get("hash_source", "parsed")
|
hash_source = kwargs.get("hash_source", "parsed")
|
||||||
hash_normalize = kwargs.get("hash_normalize", "canonical")
|
hash_normalize = kwargs.get("hash_normalize", "canonical")
|
||||||
|
|
||||||
|
|
@ -110,7 +113,11 @@ class IngestionService:
|
||||||
|
|
||||||
# 2. Payload & Change Detection (Multi-Hash)
|
# 2. Payload & Change Detection (Multi-Hash)
|
||||||
note_type = resolve_note_type(self.registry, fm.get("type"))
|
note_type = resolve_note_type(self.registry, fm.get("type"))
|
||||||
note_pl = make_note_payload(parsed, vault_root=vault_root, file_path=file_path, hash_source=hash_source, hash_normalize=hash_normalize)
|
note_pl = make_note_payload(
|
||||||
|
parsed, vault_root=vault_root, file_path=file_path,
|
||||||
|
hash_source=hash_source, hash_normalize=hash_normalize,
|
||||||
|
types_cfg=self.registry
|
||||||
|
)
|
||||||
note_id = note_pl["note_id"]
|
note_id = note_pl["note_id"]
|
||||||
|
|
||||||
old_payload = None if force_replace else fetch_note_payload(self.client, self.prefix, note_id)
|
old_payload = None if force_replace else fetch_note_payload(self.client, self.prefix, note_id)
|
||||||
|
|
@ -134,11 +141,11 @@ class IngestionService:
|
||||||
enable_smart = chunk_cfg.get("enable_smart_edge_allocation", False)
|
enable_smart = chunk_cfg.get("enable_smart_edge_allocation", False)
|
||||||
|
|
||||||
# WP-15b: Chunker-Aufruf bereitet Candidate-Pool vor
|
# WP-15b: Chunker-Aufruf bereitet Candidate-Pool vor
|
||||||
chunks = await assemble_chunks(fm["id"], body_text, note_type, config=chunk_cfg)
|
chunks = await assemble_chunks(note_id, body_text, note_type, config=chunk_cfg)
|
||||||
for ch in chunks:
|
for ch in chunks:
|
||||||
filtered = []
|
filtered = []
|
||||||
for cand in getattr(ch, "candidate_pool", []):
|
for cand in getattr(ch, "candidate_pool", []):
|
||||||
# Nur global_pool Kandidaten erfordern binäre Validierung
|
# WP-15b: Nur global_pool Kandidaten erfordern binäre Validierung
|
||||||
if cand.get("provenance") == "global_pool" and enable_smart:
|
if cand.get("provenance") == "global_pool" and enable_smart:
|
||||||
if await validate_edge_candidate(ch.text, cand, self.batch_cache, self.llm, self.settings.MINDNET_LLM_PROVIDER):
|
if await validate_edge_candidate(ch.text, cand, self.batch_cache, self.llm, self.settings.MINDNET_LLM_PROVIDER):
|
||||||
filtered.append(cand)
|
filtered.append(cand)
|
||||||
|
|
@ -147,16 +154,23 @@ class IngestionService:
|
||||||
ch.candidate_pool = filtered
|
ch.candidate_pool = filtered
|
||||||
|
|
||||||
# Payload-Erstellung via interne Module
|
# Payload-Erstellung via interne Module
|
||||||
chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks, file_path=file_path)
|
chunk_pls = make_chunk_payloads(
|
||||||
|
fm, note_pl["path"], chunks, file_path=file_path,
|
||||||
|
types_cfg=self.registry
|
||||||
|
)
|
||||||
vecs = await self.embedder.embed_documents([c.get("window") or "" for c in chunk_pls]) if chunk_pls else []
|
vecs = await self.embedder.embed_documents([c.get("window") or "" for c in chunk_pls]) if chunk_pls else []
|
||||||
|
|
||||||
# Kanten-Aggregation
|
# Kanten-Aggregation
|
||||||
edges = build_edges_for_note(note_id, chunk_pls, note_level_references=note_pl.get("references", []))
|
edges = build_edges_for_note(
|
||||||
|
note_id, chunk_pls,
|
||||||
|
note_level_references=note_pl.get("references", []),
|
||||||
|
include_note_scope_refs=note_scope_refs
|
||||||
|
)
|
||||||
for e in edges:
|
for e in edges:
|
||||||
e["kind"] = edge_registry.resolve(
|
e["kind"] = edge_registry.resolve(
|
||||||
e.get("kind", "related_to"),
|
e.get("kind", "related_to"),
|
||||||
provenance=e.get("provenance", "explicit"),
|
provenance=e.get("provenance", "explicit"),
|
||||||
context={"file": file_path, "note_id": note_id}
|
context={"file": file_path, "note_id": note_id, "line": e.get("line", "system")}
|
||||||
)
|
)
|
||||||
|
|
||||||
# 4. DB Upsert
|
# 4. DB Upsert
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user