app/core/chunk_payload.py aktualisiert
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 3s
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 3s
This commit is contained in:
parent
2ddf034983
commit
2de786fc64
|
|
@@ -1,158 +1,220 @@
|
|||
# app/core/chunk_payload.py
|
||||
# Line count: 214
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import Any, Dict, Iterable, List, Optional, Union
|
||||
|
||||
# ---- Helpers ----
|
||||
def _coerce_float(val: Any) -> Optional[float]:
|
||||
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
|
||||
|
||||
|
||||
def _get(obj: Any, key: str, default: Any = None) -> Any:
|
||||
if obj is None:
|
||||
return default
|
||||
if hasattr(obj, key):
|
||||
try:
|
||||
val = getattr(obj, key)
|
||||
return val if val is not None else default
|
||||
except Exception:
|
||||
pass
|
||||
if isinstance(obj, dict):
|
||||
if key in obj:
|
||||
val = obj.get(key, default)
|
||||
return val if val is not None else default
|
||||
return default
|
||||
|
||||
|
||||
def _get_frontmatter(note: Any) -> Dict[str, Any]:
    """Return the frontmatter dict of ``note``.

    Two locations are checked, in this order:

    1. ``note.frontmatter`` / ``note["frontmatter"]``
    2. ``note.meta["frontmatter"]`` / ``note["meta"]["frontmatter"]``

    Args:
        note: A dict or object describing a note.

    Returns:
        The frontmatter dict itself (not a copy), or a new empty dict when
        neither location holds a dict.
    """
    direct = _get(note, "frontmatter", None)
    if isinstance(direct, dict):
        return direct

    meta = _get(note, "meta", None)
    if isinstance(meta, dict):
        nested = meta.get("frontmatter")
        if isinstance(nested, dict):
            return nested

    return {}
|
||||
|
||||
|
||||
def _get_from_frontmatter(fm: Dict[str, Any], key: str, default: Any = None) -> Any:
|
||||
if not isinstance(fm, dict):
|
||||
return default
|
||||
if key in fm:
|
||||
val = fm.get(key, default)
|
||||
return val if val is not None else default
|
||||
return default
|
||||
|
||||
|
||||
def _coerce_tags(val: Any) -> List[str]:
|
||||
if val is None:
|
||||
return None
|
||||
try:
|
||||
if isinstance(val, (int, float)):
|
||||
return float(val)
|
||||
if isinstance(val, str):
|
||||
v = val.strip()
|
||||
if not v:
|
||||
return None
|
||||
return float(v.replace(",", "."))
|
||||
except Exception:
|
||||
return None
|
||||
return None
|
||||
|
||||
def _extract_weight(frontmatter: Dict[str, Any], explicit: Optional[float]) -> Optional[float]:
    """Determine the retriever weight for a note.

    Precedence:

    1. ``explicit`` when it is not ``None``
    2. ``frontmatter["retriever_weight"]`` when that key exists
    3. the nested form ``frontmatter["retriever"]["weight"]``

    Every candidate is passed through ``_coerce_float``.

    Args:
        frontmatter: Note frontmatter; non-dict values are treated as empty.
        explicit: Weight supplied by the caller, if any.

    Returns:
        Whatever ``_coerce_float`` returns for the selected candidate, or
        ``None`` when no candidate exists.
    """
    if explicit is not None:
        return _coerce_float(explicit)

    # isinstance instead of an "is None" test: a string or list frontmatter
    # would otherwise reach .get() below and raise AttributeError.
    if not isinstance(frontmatter, dict):
        return None

    if "retriever_weight" in frontmatter:
        return _coerce_float(frontmatter.get("retriever_weight"))

    # Also accept the nested style: retriever: { weight: 0.8 }
    retriever = frontmatter.get("retriever")
    if isinstance(retriever, dict) and "weight" in retriever:
        return _coerce_float(retriever.get("weight"))

    return None
|
||||
|
||||
def _ensure_list(x: Any) -> List[Any]:
|
||||
if x is None:
|
||||
return []
|
||||
if isinstance(x, list):
|
||||
return x
|
||||
return [x]
|
||||
if isinstance(val, list):
|
||||
return [str(x) for x in val]
|
||||
if isinstance(val, str):
|
||||
parts = [t.strip() for t in val.split(",")]
|
||||
return [p for p in parts if p]
|
||||
return []
|
||||
|
||||
def _resolve_note_id(frontmatter: Dict[str, Any], kw_note_id: Optional[str]) -> Optional[str]:
|
||||
if kw_note_id:
|
||||
return kw_note_id
|
||||
if not isinstance(frontmatter, dict):
|
||||
return None
|
||||
return frontmatter.get("id") or frontmatter.get("note_id")
|
||||
|
||||
def _base_fields(frontmatter: Dict[str, Any], note_id: Optional[str], path: str) -> Dict[str, Any]:
|
||||
title = None
|
||||
typ = None
|
||||
tags = None
|
||||
if isinstance(frontmatter, dict):
|
||||
title = frontmatter.get("title")
|
||||
typ = frontmatter.get("type") or frontmatter.get("note_type")
|
||||
# tags can be list[str] or comma separated string
|
||||
tags = frontmatter.get("tags")
|
||||
if isinstance(tags, str):
|
||||
tags = [t.strip() for t in tags.split(",") if t.strip()]
|
||||
def _resolve_retriever_weight(
|
||||
fm: Dict[str, Any],
|
||||
explicit: Optional[float],
|
||||
) -> Optional[float]:
|
||||
if explicit is not None:
|
||||
return explicit
|
||||
val = _get_from_frontmatter(fm, "retriever_weight", None)
|
||||
if isinstance(val, (int, float)):
|
||||
return float(val)
|
||||
retr = fm.get("retriever")
|
||||
if isinstance(retr, dict):
|
||||
v = retr.get("weight")
|
||||
if isinstance(v, (int, float)):
|
||||
return float(v)
|
||||
return None
|
||||
|
||||
|
||||
def _resolve_note_fields(note: Any) -> Dict[str, Any]:
    """Collect the note-level metadata that is copied onto every chunk.

    Each field is read from the frontmatter first and falls back to
    keys/attributes of ``note`` itself:

    * ``note_id``: frontmatter ``id``, then ``note.note_id``, then ``note.id``
    * ``title``:   frontmatter ``title``, then ``note.title``
    * ``type``:    frontmatter ``type``, then ``note.type``
    * ``tags``:    frontmatter ``tags``, then ``note.tags``; the result is
      passed through ``_coerce_tags``
    * ``path``:    frontmatter ``path``, then ``note.path``, ``note.source``,
      ``note.filepath``

    Args:
        note: A dict or object describing a note.

    Returns:
        A dict with the keys ``note_id``, ``title``, ``type``, ``tags``,
        ``path`` and ``frontmatter`` (the frontmatter dict that was used).
    """
    fm = _get_frontmatter(note)

    def _pick(fm_key: str, *fallback_names: str) -> Any:
        # Frontmatter wins; otherwise take the first non-None fallback.
        value = _get_from_frontmatter(fm, fm_key, None)
        for name in fallback_names:
            if value is not None:
                break
            value = _get(note, name, None)
        return value

    return {
        "note_id": _pick("id", "note_id", "id"),
        "title": _pick("title", "title"),
        "type": _pick("type", "type"),
        "tags": _coerce_tags(_pick("tags", "tags")),
        "path": _pick("path", "path", "source", "filepath"),
        "frontmatter": fm,
    }
|
||||
|
||||
# ---- Public API ----
|
||||
|
||||
def _extract_chunk_text_and_index(
|
||||
chunk: Any,
|
||||
fallback_index: int,
|
||||
) -> Tuple[str, int]:
|
||||
"""
|
||||
Akzeptiert verschiedene Chunk-Formate:
|
||||
- str (reiner Text)
|
||||
- dict mit keys: text | window | body | content
|
||||
- Objekt mit Attributen: text | window | body | content
|
||||
- (text, idx) Tuple
|
||||
"""
|
||||
# Tuple (text, idx)
|
||||
if isinstance(chunk, tuple) and len(chunk) == 2 and isinstance(chunk[0], str):
|
||||
txt, idx = chunk
|
||||
try:
|
||||
idx_int = int(idx)
|
||||
except Exception:
|
||||
idx_int = fallback_index
|
||||
return txt, idx_int
|
||||
|
||||
# String
|
||||
if isinstance(chunk, str):
|
||||
return chunk, fallback_index
|
||||
|
||||
# Dict
|
||||
if isinstance(chunk, dict):
|
||||
txt = (
|
||||
chunk.get("text")
|
||||
or chunk.get("window")
|
||||
or chunk.get("body")
|
||||
or chunk.get("content")
|
||||
)
|
||||
if isinstance(txt, str):
|
||||
idx = chunk.get("index")
|
||||
try:
|
||||
idx_int = int(idx) if idx is not None else fallback_index
|
||||
except Exception:
|
||||
idx_int = fallback_index
|
||||
return txt, idx_int
|
||||
|
||||
# Objekt mit Attributen
|
||||
for attr in ("text", "window", "body", "content"):
|
||||
if hasattr(chunk, attr):
|
||||
try:
|
||||
txt = getattr(chunk, attr)
|
||||
except Exception:
|
||||
txt = None
|
||||
if isinstance(txt, str):
|
||||
# Optionale "index"-Quelle
|
||||
idx = None
|
||||
if hasattr(chunk, "index"):
|
||||
try:
|
||||
idx = getattr(chunk, "index")
|
||||
except Exception:
|
||||
idx = None
|
||||
try:
|
||||
idx_int = int(idx) if idx is not None else fallback_index
|
||||
except Exception:
|
||||
idx_int = fallback_index
|
||||
return txt, idx_int
|
||||
|
||||
# Wenn nichts passt -> klarer Fehler
|
||||
raise ValueError("Unsupported chunk format: cannot extract text/index")
|
||||
|
||||
|
||||
def make_chunk_payloads(
    note: Any,
    chunks: Iterable[Any],
    *,
    retriever_weight: Optional[float] = None,
    base_payload: Optional[Dict[str, Any]] = None,
) -> List[Dict[str, Any]]:
    """Build the Qdrant payload dicts for the chunk points of one note.

    Every payload carries:

    * the non-``None`` entries of ``base_payload``,
    * the note metadata ``note_id`` / ``title`` / ``type`` / ``tags`` /
      ``path`` (each only when available; these override ``base_payload``),
    * ``retriever_weight`` when given explicitly or found in the frontmatter,
    * ``chunk_index`` and ``text`` of the individual chunk.

    Args:
        note: A dict or object describing the note (see
            ``_resolve_note_fields``).
        chunks: The chunks, each in a format accepted by
            ``_extract_chunk_text_and_index``. ``None`` is treated as empty.
        retriever_weight: Explicit weight; takes precedence over frontmatter.
        base_payload: Extra fields to seed every payload with.

    Returns:
        One payload dict per chunk, in input order.

    Raises:
        ValueError: If a chunk has an unsupported format.
    """
    note_fields = _resolve_note_fields(note)
    rw = _resolve_retriever_weight(note_fields["frontmatter"], retriever_weight)

    # Base fields that every chunk should carry
    common: Dict[str, Any] = {}
    if base_payload:
        common.update({k: v for k, v in base_payload.items() if v is not None})

    for key in ("note_id", "title", "type", "tags", "path"):
        value = note_fields.get(key)
        # Tags are only written when non-empty; other fields when not None.
        if value is None or (key == "tags" and not value):
            continue
        common[key] = value

    if rw is not None:
        common["retriever_weight"] = rw

    out: List[Dict[str, Any]] = []
    if chunks is None:
        return out

    for i, ch in enumerate(chunks):
        text, idx = _extract_chunk_text_and_index(ch, i)
        payload = dict(common)  # copy, so payloads do not share one dict
        payload["chunk_index"] = idx
        payload["text"] = text
        out.append(payload)

    return out
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user