diff --git a/scripts/import_markdown.py b/scripts/import_markdown.py index 57988bb..336408f 100644 --- a/scripts/import_markdown.py +++ b/scripts/import_markdown.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ -scripts/import_markdown.py +scripts/import_markdown.py (V2.3.1) Zweck ----- @@ -11,6 +11,12 @@ Zweck - Integriert eine optionale Type-Registry (types.yaml), um z. B. chunk_profile und retriever_weight pro Notiz-Typ zu steuern. +Fix in V2.3.1 +------------- +- `retriever_weight` wird nun **immer deterministisch** gesetzt: + Frontmatter-Override > types.yaml > Default=1.0 (falls nichts konfiguriert). + Das Feld wird **zwingend** in mindnet_notes **und** mindnet_chunks geschrieben. + Kompatibilität & Fixes ---------------------- - Unterstützt sowohl app.core.derive_edges (bevorzugt) als auch app.core.edges als Fallback @@ -18,7 +24,6 @@ Kompatibilität & Fixes nicht zu TypeError führen. - `scroll_filter` wird für alle Scrolls verwendet (Qdrant >= 1.7.x). - `--purge-before-upsert` entfernt alte Chunks/Edges einer Note, wenn sich die Note geändert hat. -- `retriever_weight` aus types.yaml bzw. Frontmatter wird in Note- und Chunk-Payload gespiegelt. - Baseline-Hash-Strategie: hash_mode (body|frontmatter|full), hash_source (parsed|raw), hash_normalize (canonical|none). 
def compute_effective_retriever_weight(fm: Dict[str, Any], note_type: str, reg: dict) -> float:
    """Resolve the final ``retriever_weight`` for a note.

    Candidates are tried in priority order and the first usable one wins:

    1. ``fm["retriever_weight"]`` (frontmatter override)
    2. the value returned by ``effective_retriever_weight_from_registry``
       for ``note_type`` / ``reg``
    3. the default ``1.0``

    A candidate is usable when it is not ``None``, converts via ``float()``
    and is a finite number. Unusable candidates are skipped rather than
    raised, so the function always returns a float.

    Args:
        fm: Frontmatter mapping of the note; only the key
            ``"retriever_weight"`` is read.
        note_type: Note type passed on to the registry lookup.
        reg: Type registry passed on to the registry lookup.

    Returns:
        The resolved weight as a finite ``float``.
    """
    # Local import: the file's top-level import block is not visible here.
    import math

    def _usable(value: Any) -> Optional[float]:
        """Return *value* as a finite float, or None if it cannot be used."""
        if value is None:
            return None
        try:
            weight = float(value)
        except (TypeError, ValueError, OverflowError):
            # Not numeric (or too large for a float): let the next source decide.
            return None
        # NaN / +-inf convert without error but have no strict-JSON
        # representation and are meaningless as a weight; treat as unset.
        return weight if math.isfinite(weight) else None

    # 1) Frontmatter override
    override = _usable(fm.get("retriever_weight"))
    if override is not None:
        return override

    # 2) Registry value for this note type
    from_registry = _usable(effective_retriever_weight_from_registry(note_type, reg))
    if from_registry is not None:
        return from_registry

    # 3) Default
    return 1.0
Hashes) --- try: @@ -369,12 +390,11 @@ def main() -> None: if not note_pl.get("fulltext"): note_pl["fulltext"] = getattr(parsed, "body", "") or "" - # retriever_weight sicher in Note-Payload spiegeln - if "retriever_weight" not in note_pl and fm.get("retriever_weight") is not None: - try: - note_pl["retriever_weight"] = float(fm.get("retriever_weight")) - except Exception: - pass + # NEU: retriever_weight **immer** in Note-Payload setzen + try: + note_pl["retriever_weight"] = float(rw) + except Exception: + note_pl["retriever_weight"] = 1.0 note_id = note_pl.get("note_id") or fm.get("id") if not note_id: @@ -410,15 +430,13 @@ def main() -> None: print(json.dumps({"path": path, "note_id": note_id, "error": f"chunk build failed: {type(e).__name__}: {e}"})) continue - # retriever_weight auf Chunk-Payload spiegeln - if fm.get("retriever_weight") is not None: - try: - rw = float(fm.get("retriever_weight")) - for pl in chunk_pls: - if "retriever_weight" not in pl: - pl["retriever_weight"] = rw - except Exception: - pass + # NEU: retriever_weight **immer** auf Chunk-Payload spiegeln + try: + rwf = float(rw) + except Exception: + rwf = 1.0 + for pl in chunk_pls: + pl["retriever_weight"] = rwf # Embeddings (fallback: Nullvektoren) vecs: List[List[float]] = [[0.0] * int(cfg.dim) for _ in chunk_pls] @@ -450,6 +468,8 @@ def main() -> None: summary = { "note_id": note_id, "title": fm.get("title"), + "type": fm.get("type"), + "rw": rw, "chunks": len(chunk_pls), "edges": len(edges), "edges_failed": edges_failed,