From 7b56f696d6f1dfb1bdb46730e9adf595a9ababea Mon Sep 17 00:00:00 2001 From: Lars Date: Sat, 8 Nov 2025 16:58:47 +0100 Subject: [PATCH] Dateien nach "app/core" hochladen --- app/core/chunk_payload.py | 58 +++++++++++++++---------------------- app/core/derive_edges.py | 61 ++++++++++++++++----------------------- 2 files changed, 48 insertions(+), 71 deletions(-) diff --git a/app/core/chunk_payload.py b/app/core/chunk_payload.py index b1d893c..5676190 100644 --- a/app/core/chunk_payload.py +++ b/app/core/chunk_payload.py @@ -2,19 +2,28 @@ # -*- coding: utf-8 -*- """ Modul: app/core/chunk_payload.py -Version: 2.3.0 -Datum: 2025-11-08 +Version: 2.2.0 +Datum: 2025-10-06 -Änderungen ggü. 2.2.0 ----------------------- -- Optionaler Parameter `chunk_profile: str | None` (abwärtskompatibel). -- Bei fehlenden *echten* Fenstern (kein Overlap geliefert) wird das synthetische - Overlap anhand des Chunk-Profils (short|medium|long) gewählt. Ohne Profil - bleibt das bisherige Verhalten bestehen (Übernahme aus get_sizes(note_type)). +Zweck +----- +Erzeugt Qdrant-Payloads für Chunks. Voll abwärtskompatibel zu v2.0.1. +Neu: Wenn der Chunker KEIN Overlap im Fenster liefert (== window fehlt / identisch zur Kernpassage), +erzeugen wir FENSTER mit synthetischem Overlap auf Basis chunk_config.get_sizes(note_type)['overlap']. -Hinweis ------- -IDs, Felder und Vektoren bleiben unverändert. +Felder (beibehalten aus 2.0.1): + - note_id, chunk_id, id (Alias), chunk_index, seq, path + - window (mit Overlap), text (ohne linkes Overlap) + - start, end (Offsets im gesamten Body) + - overlap_left, overlap_right + - token_count?, section_title?, section_path?, type?, title?, tags? + +Kompatibilität: + - 'id' == 'chunk_id' als Alias + - Pfade bleiben relativ (keine führenden '/'), Backslashes → Slashes + - Robust für Chunk-Objekte oder Dicts; Fensterquelle: 'window'|'text'|'content'|'raw' + +Lizenz: MIT (projektintern) """ from __future__ import annotations @@ -28,13 +37,6 @@ except Exception: # konservativer Default, falls Import fehlschlägt return {"overlap": (40, 60), "target": (250, 350), "max": 500} -# NEU: optionaler Import – Overlap-Empfehlungen aus der Type-Registry -try: - from app.core.type_registry import profile_overlap as _profile_overlap -except Exception: - def _profile_overlap(_profile: Optional[str]) -> tuple[int,int]: - return (40, 60) - # ------------------------------- Utils ------------------------------- # @@ -110,8 +112,6 @@ def make_chunk_payloads( rel_path: str, chunks: Iterable[Union[Dict[str, Any], Any]], note_text: Optional[str] = None, - *, # neue, nur-keyword Parameter bleiben abwärtskompatibel - chunk_profile: Optional[str] = None, ) -> List[Dict[str, Any]]: """ Baut Payloads pro Chunk. Falls Fenster ohne Overlap geliefert werden, @@ -158,28 +158,15 @@ def make_chunk_payloads( windows_final = raw_windows[:] # bereits mit Overlap geliefert else: # Keine echten Fenster → Segmente sind identisch zu "Fenstern" (bisher), - # wir erzeugen synthetische Fenster mit Overlap. + # wir erzeugen synthetische Fenster mit Overlap gemäß Typ segments = [w or "" for w in raw_windows] overlaps_left = [] windows_final = [] recon = "" - try: - # Bisheriges Verhalten: aus get_sizes(note_type) overlap_low, overlap_high = tuple(_get_sizes(note_type).get("overlap", (40, 60))) except Exception: overlap_low, overlap_high = (40, 60) - - # Registry-Profil (falls vorhanden) übersteuert *nur* den Overlap defensiv - if isinstance(chunk_profile, str) and chunk_profile.strip(): - try: - o_low, o_high = _profile_overlap(chunk_profile.strip().lower()) - # defensiver Clamp: niemals größer als 3x Default - overlap_low = max(0, min(o_low, overlap_low * 3)) - overlap_high = max(overlap_low, min(o_high, overlap_high * 3)) - except Exception: - pass - overlap_target = int(overlap_low) for i, seg in enumerate(segments): @@ -268,6 +255,7 @@ def make_chunk_payloads( return payloads +# __main__ Demo (optional) if __name__ == "__main__": # pragma: no cover fm = {"id": "demo", "title": "Demo", "type": "concept"} # Beispiel ohne echte Fenster → erzeugt synthetische Overlaps @@ -276,7 +264,7 @@ if __name__ == "__main__": # pragma: no cover {"id": "demo#2", "text": "Gamma Delta"}, {"id": "demo#3", "text": "Delta Epsilon Zeta"}, ] - pls = make_chunk_payloads(fm, "path/demo.md", chunks, note_text="Alpha Beta Gamma Delta Epsilon Zeta", chunk_profile="long") + pls = make_chunk_payloads(fm, "path/demo.md", chunks, note_text="Alpha Beta Gamma Delta Epsilon Zeta") from pprint import pprint pprint(pls) recon = "".join(p["text"] for p in pls) diff --git a/app/core/derive_edges.py b/app/core/derive_edges.py index 24ee1c7..82ae58b 100644 --- a/app/core/derive_edges.py +++ b/app/core/derive_edges.py @@ -2,17 +2,32 @@ # -*- coding: utf-8 -*- """ Modul: app/core/derive_edges.py -Version: 1.5.0 -Datum: 2025-11-08 +Version: 1.4.0 +Datum: 2025-10-01 -Änderung --------- -- Integration der Type-Registry (optional): Ist im Typ die Default-Kante - "references" enthalten, werden Note-Scope-References/Backlinks **additiv** - aktiviert – auch wenn `include_note_scope_refs=False` übergeben wurde. - (Keine Breaking Changes: bestehende Parameter bleiben erhalten.) +Zweck +----- +Robuste Kantenbildung für mindnet (Notes/Chunks): +- belongs_to (chunk -> note) +- next / prev (chunk-Kette) +- references (chunk-scope) aus Chunk.window/text +- optional references/backlink (note-scope) -Weitere Logik (belongs_to/prev/next & chunk-scope references) bleibt unverändert. +Wichtig: Wikilinks werden mit der Parser-Funktion `extract_wikilinks` extrahiert, +damit Varianten wie [[id#anchor]] oder [[id|label]] korrekt auf 'id' reduziert werden. + +Erwartete Chunk-Payload-Felder: + { + "note_id": "...", + "chunk_id": "...", # Alias "id" ist zulässig + "id": "...", + "chunk_index": int, + "seq": int, + "window": str, + "text": str, + "path": "rel/path.md", + ... + } """ from __future__ import annotations @@ -21,16 +36,6 @@ from typing import Dict, List, Optional, Iterable # WICHTIG: benutze die Parser-Extraktion für saubere Wikilinks from app.core.parser import extract_wikilinks -# optional: Type-Registry (Fallback: deaktiviert) -try: - from app.core.type_registry import load_type_registry, get_type_config - _REG = load_type_registry() # prozessweiter Cache -except Exception: # pragma: no cover - _REG = {"types": {"concept": {"edge_defaults": ["references"]}}} - def get_type_config(_t, _r): # type: ignore - return {"edge_defaults": ["references"]} - - def _get(d: dict, *keys, default=None): for k in keys: if k in d and d[k] is not None: @@ -81,25 +86,9 @@ def build_edges_for_note( - next / prev: zwischen aufeinanderfolgenden Chunks - references: pro Chunk aus window/text - optional note-scope references/backlinks: dedupliziert über alle Chunk-Funde + note_level_references - - Type-Registry-Erweiterung (additiv): - - Wenn der *Note-Typ* 'references' in seinen edge_defaults hat, werden - note-scope references/backlinks zusätzlich aktiviert. """ edges: List[dict] = [] - # Typ aus Chunk-Payloads ableiten (falls vorhanden) - note_type = None - for ch in chunks: - nt = ch.get("type") - if isinstance(nt, str) and nt.strip(): - note_type = nt.strip().lower() - break - type_cfg = get_type_config(note_type, _REG) - edge_defaults = [e for e in (type_cfg.get("edge_defaults") or []) if isinstance(e, str)] - - want_note_scope_refs = bool(include_note_scope_refs) or ("references" in edge_defaults) - # belongs_to for ch in chunks: cid = _get(ch, "chunk_id", "id") @@ -130,7 +119,7 @@ def build_edges_for_note( refs_all.extend(refs) # optional: note-scope references/backlinks - if want_note_scope_refs: + if include_note_scope_refs: refs_note = refs_all[:] if note_level_references: refs_note.extend([r for r in note_level_references if isinstance(r, str) and r])