Dateien nach "app/core" hochladen
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 3s
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 3s
This commit is contained in:
parent
a7c5630e5b
commit
f1e1cde597
|
|
@ -1,32 +1,27 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Modul: app/core/chunk_payload.py
|
||||
Version: 2.2.0
|
||||
Datum: 2025-10-06
|
||||
# Modul: app/core/chunk_payload.py
|
||||
# Version: 2.3.1
|
||||
# Datum: 2025-11-08
|
||||
#
|
||||
# Zweck
|
||||
# -----
|
||||
# Erzeugt Qdrant-Payloads für Chunks. Voll abwärtskompatibel zu v2.2.0.
|
||||
# Fixes:
|
||||
# - 'retriever_weight' aus Frontmatter wird IMMER in jeden Chunk-Payload übernommen
|
||||
# (Float; Default via ENV MINDNET_DEFAULT_RETRIEVER_WEIGHT, sonst 1.0).
|
||||
# - 'chunk_profile' aus Frontmatter wird – falls vorhanden – in jeden Chunk-Payload übernommen.
|
||||
# - Robustere Fenster/Overlap-Erzeugung bleibt erhalten.
|
||||
#
|
||||
# Hinweis zu Qdrant:
|
||||
# Qdrant ist schemaflexibel. Ein Feld erscheint in der UI/HTTP-API erst,
|
||||
# wenn mindestens 1 Punkt es im Payload besitzt. Für konsistente Typisierung
|
||||
# empfiehlt sich zusätzlich eine Payload-Index-Definition (z.B. FLOAT für
|
||||
# 'retriever_weight').
|
||||
|
||||
Zweck
|
||||
-----
|
||||
Erzeugt Qdrant-Payloads für Chunks. Voll abwärtskompatibel zu v2.0.1.
|
||||
Neu: Wenn der Chunker KEIN Overlap im Fenster liefert (== window fehlt / identisch zur Kernpassage),
|
||||
erzeugen wir FENSTER mit synthetischem Overlap auf Basis chunk_config.get_sizes(note_type)['overlap'].
|
||||
|
||||
Felder (beibehalten aus 2.0.1):
|
||||
- note_id, chunk_id, id (Alias), chunk_index, seq, path
|
||||
- window (mit Overlap), text (ohne linkes Overlap)
|
||||
- start, end (Offsets im gesamten Body)
|
||||
- overlap_left, overlap_right
|
||||
- token_count?, section_title?, section_path?, type?, title?, tags?
|
||||
|
||||
Kompatibilität:
|
||||
- 'id' == 'chunk_id' als Alias
|
||||
- Pfade bleiben relativ (keine führenden '/'), Backslashes → Slashes
|
||||
- Robust für Chunk-Objekte oder Dicts; Fensterquelle: 'window'|'text'|'content'|'raw'
|
||||
|
||||
Lizenz: MIT (projektintern)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
|
||||
|
||||
try:
|
||||
|
|
@ -37,7 +32,6 @@ except Exception:
|
|||
# konservativer Default, falls Import fehlschlägt
|
||||
return {"overlap": (40, 60), "target": (250, 350), "max": 500}
|
||||
|
||||
|
||||
# ------------------------------- Utils ------------------------------- #
|
||||
|
||||
def _get_attr_or_key(obj: Any, key: str, default=None):
|
||||
|
|
@ -67,6 +61,16 @@ def _normalize_rel_path(p: str) -> str:
|
|||
p = p[1:]
|
||||
return p
|
||||
|
||||
def _to_float(val: Any, default: float) -> float:
|
||||
try:
|
||||
if val is None:
|
||||
return float(default)
|
||||
if isinstance(val, (int, float)):
|
||||
return float(val)
|
||||
s = str(val).strip().replace(",", ".")
|
||||
return float(s)
|
||||
except Exception:
|
||||
return float(default)
|
||||
|
||||
# ---------------------- Overlap & Offsets ---------------------------- #
|
||||
|
||||
|
|
@ -104,7 +108,6 @@ def _overlap_len_suffix_prefix(a: str, b: str, max_probe: int = 4096) -> int:
|
|||
return k
|
||||
return 0
|
||||
|
||||
|
||||
# ----------------------------- Public API ---------------------------- #
|
||||
|
||||
def make_chunk_payloads(
|
||||
|
|
@ -116,6 +119,7 @@ def make_chunk_payloads(
|
|||
"""
|
||||
Baut Payloads pro Chunk. Falls Fenster ohne Overlap geliefert werden,
|
||||
erzeugen wir synthetische 'window'-Texte mit typgerechtem Overlap.
|
||||
Zusätzlich werden 'retriever_weight' (float) und 'chunk_profile' übernommen.
|
||||
"""
|
||||
note_id = str(frontmatter.get("id") or "").strip()
|
||||
note_type = str(frontmatter.get("type", "")).lower()
|
||||
|
|
@ -123,6 +127,11 @@ def make_chunk_payloads(
|
|||
note_tags = frontmatter.get("tags", None)
|
||||
rel_path = _normalize_rel_path(rel_path)
|
||||
|
||||
# --- neue Felder aus FM (mit Defaults) ---
|
||||
default_rw = _to_float(os.environ.get("MINDNET_DEFAULT_RETRIEVER_WEIGHT", 1.0), 1.0)
|
||||
fm_rw = _to_float(frontmatter.get("retriever_weight"), default_rw)
|
||||
fm_chunk_profile = frontmatter.get("chunk_profile") or frontmatter.get("profile") or None
|
||||
|
||||
# 1) Rohdaten sammeln (so wie geliefert)
|
||||
chunks_list = list(chunks)
|
||||
raw_windows: List[str] = []
|
||||
|
|
@ -235,6 +244,8 @@ def make_chunk_payloads(
|
|||
"end": ends[i],
|
||||
"overlap_left": overlaps_left[i],
|
||||
"overlap_right": overlaps_right[i],
|
||||
# NEU:
|
||||
"retriever_weight": fm_rw,
|
||||
}
|
||||
# optionale Metadaten
|
||||
if note_type:
|
||||
|
|
@ -250,15 +261,15 @@ def make_chunk_payloads(
|
|||
if section_paths[i] is not None:
|
||||
sp = str(section_paths[i]).replace("\\", "/")
|
||||
pl["section_path"] = sp if sp else "/"
|
||||
if fm_chunk_profile is not None:
|
||||
pl["chunk_profile"] = str(fm_chunk_profile)
|
||||
|
||||
payloads.append(pl)
|
||||
|
||||
return payloads
|
||||
|
||||
|
||||
# __main__ Demo (optional)
|
||||
if __name__ == "__main__": # pragma: no cover
|
||||
fm = {"id": "demo", "title": "Demo", "type": "concept"}
|
||||
# Beispiel ohne echte Fenster → erzeugt synthetische Overlaps
|
||||
fm = {"id": "demo", "title": "Demo", "type": "concept", "retriever_weight": 0.75, "chunk_profile": "tight"}
|
||||
chunks = [
|
||||
{"id": "demo#1", "text": "Alpha Beta Gamma"},
|
||||
{"id": "demo#2", "text": "Gamma Delta"},
|
||||
|
|
@ -267,5 +278,3 @@ if __name__ == "__main__": # pragma: no cover
|
|||
pls = make_chunk_payloads(fm, "path/demo.md", chunks, note_text="Alpha Beta Gamma Delta Epsilon Zeta")
|
||||
from pprint import pprint
|
||||
pprint(pls)
|
||||
recon = "".join(p["text"] for p in pls)
|
||||
print("RECON:", recon)
|
||||
|
|
|
|||
|
|
@ -1,8 +1,12 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
# Modul: app/core/note_payload.py
|
||||
# Version: 1.7.0
|
||||
# Datum: 2025-09-09
|
||||
# Version: 1.8.0
|
||||
# Datum: 2025-11-08
|
||||
# Änderungen:
|
||||
# - 'retriever_weight' (Float; Default via ENV MINDNET_DEFAULT_RETRIEVER_WEIGHT, sonst 1.0) aus Frontmatter in Note-Payload übernommen.
|
||||
# - 'chunk_profile' (falls vorhanden) übernommen.
|
||||
# - Hash-Logik unverändert, kompatibel zu 1.7.0.
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
|
|
@ -35,9 +39,7 @@ def _resolve_hash_mode(explicit: Optional[str]) -> str:
|
|||
if explicit:
|
||||
val = explicit.strip().lower()
|
||||
else:
|
||||
val = (os.environ.get("MINDNET_HASH_MODE")
|
||||
or os.environ.get("MINDNET_HASH_COMPARE")
|
||||
or "body").strip().lower()
|
||||
val = (os.environ.get("MINDNET_HASH_MODE") or os.environ.get("MINDNET_HASH_COMPARE") or "body").strip().lower()
|
||||
if val in ("full", "fulltext", "body+frontmatter", "bodyplusfrontmatter"):
|
||||
return "full"
|
||||
if val in ("frontmatter", "fm"):
|
||||
|
|
@ -81,6 +83,17 @@ def _hash_for(mode: str, *, body: str, fm: Dict[str, Any], normalize: str) -> st
|
|||
# default: body
|
||||
return _sha256(body_n)
|
||||
|
||||
def _to_float(val: Any, default: float) -> float:
|
||||
try:
|
||||
if val is None:
|
||||
return float(default)
|
||||
if isinstance(val, (int, float)):
|
||||
return float(val)
|
||||
s = str(val).strip().replace(",", ".")
|
||||
return float(s)
|
||||
except Exception:
|
||||
return float(default)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Kernfunktion
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
@ -95,13 +108,7 @@ def make_note_payload(
|
|||
file_path: Optional[str] = None,
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Liefert den Note-Payload inkl. Mehrfach-Hashes.
|
||||
- Es werden IMMER die drei Hashes für (body|frontmatter|full) unter
|
||||
'parsed:canonical' erzeugt (Schlüssel: z. B. 'body:parsed:canonical').
|
||||
- Zusätzlich werden – falls die aktuelle Konfig (source/normalize) davon
|
||||
abweicht – die drei Hashes unter den entsprechenden Schlüsseln erzeugt,
|
||||
z. B. 'frontmatter:raw:none'.
|
||||
- 'hash_fulltext' und 'hash_signature' repräsentieren den *aktuellen* Modus.
|
||||
Liefert den Note-Payload inkl. Mehrfach-Hashes und FM-Feldern.
|
||||
"""
|
||||
# dict oder Objekt akzeptieren
|
||||
if isinstance(parsed, dict):
|
||||
|
|
@ -170,6 +177,11 @@ def make_note_payload(
|
|||
# Wikilinks (Note-Ebene)
|
||||
refs = list(dict.fromkeys(extract_wikilinks(body_parsed))) if body_parsed else []
|
||||
|
||||
# NEU: Defaults & Casting
|
||||
default_rw = _to_float(os.environ.get("MINDNET_DEFAULT_RETRIEVER_WEIGHT", 1.0), 1.0)
|
||||
fm_rw = _to_float(fm.get("retriever_weight"), default_rw)
|
||||
fm_chunk_profile = fm.get("chunk_profile") or fm.get("profile") or None
|
||||
|
||||
payload: Dict[str, Any] = {
|
||||
"note_id": fm.get("id") or fm.get("note_id"),
|
||||
"title": fm.get("title"),
|
||||
|
|
@ -191,47 +203,16 @@ def make_note_payload(
|
|||
"hash_full": hash_full,
|
||||
# Fallback-Refs
|
||||
"references": refs,
|
||||
# NEU:
|
||||
"retriever_weight": fm_rw,
|
||||
}
|
||||
|
||||
if fm_chunk_profile is not None:
|
||||
payload["chunk_profile"] = str(fm_chunk_profile)
|
||||
|
||||
for k in ("area", "project", "source", "lang", "slug", "aliases"):
|
||||
if k in fm:
|
||||
payload[k] = fm[k]
|
||||
# --- MINIMAL PATCH: retriever_weight in Note-Payload injizieren (ohne Seiteneffekte) ---
|
||||
# Annahmen: Variablen `payload`, `parsed_note`, `retriever_weight`, `type_defaults` existieren bereits
|
||||
# und `payload` enthält die bisherigen Felder wie gehabt.
|
||||
|
||||
# Frontmatter defensiv holen, ohne Struktur zu verändern:
|
||||
fm = {}
|
||||
try:
|
||||
fm = getattr(parsed_note, "frontmatter", {}) or {}
|
||||
except Exception:
|
||||
pass
|
||||
if not isinstance(fm, dict):
|
||||
fm = {}
|
||||
|
||||
# Note-Typ möglichst aus Frontmatter oder parsed_note lesen, ohne bestehende Logik zu beeinflussen:
|
||||
note_type = fm.get("type")
|
||||
if not note_type:
|
||||
note_type = getattr(parsed_note, "type", None)
|
||||
|
||||
# Wertkaskade: Frontmatter > type_defaults > Funktionsargument > (kein Fallback: wir setzen nur, wenn vorhanden)
|
||||
rw_val = None
|
||||
if "retriever_weight" in fm:
|
||||
rw_val = fm["retriever_weight"]
|
||||
elif type_defaults and note_type in type_defaults and isinstance(type_defaults[note_type], dict):
|
||||
if "retriever_weight" in type_defaults[note_type]:
|
||||
rw_val = type_defaults[note_type]["retriever_weight"]
|
||||
elif retriever_weight is not None:
|
||||
rw_val = retriever_weight
|
||||
|
||||
# Nur setzen, wenn ein Wert vorhanden ist – und robust nach float wandeln:
|
||||
if rw_val is not None:
|
||||
try:
|
||||
payload["retriever_weight"] = float(str(rw_val).replace(",", "."))
|
||||
except Exception:
|
||||
# Keine Havarie riskieren – wenn nicht konvertierbar, nicht setzen.
|
||||
pass
|
||||
# --- END MINIMAL PATCH ---
|
||||
|
||||
return payload
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user