Dateien nach "app/core" hochladen
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 3s

This commit is contained in:
Lars 2025-11-08 16:58:47 +01:00
parent ead1b8c1bc
commit 7b56f696d6
2 changed files with 48 additions and 71 deletions

View File

@ -2,19 +2,28 @@
# -*- coding: utf-8 -*-
"""
Modul: app/core/chunk_payload.py
Version: 2.3.0
Datum: 2025-11-08
Version: 2.2.0
Datum: 2025-10-06
Änderungen ggü. 2.2.0
----------------------
- Optionaler Parameter `chunk_profile: str | None` (abwärtskompatibel).
- Bei fehlenden *echten* Fenstern (kein Overlap geliefert) wird das synthetische
Overlap anhand des Chunk-Profils (short|medium|long) gewählt. Ohne Profil
bleibt das bisherige Verhalten bestehen (Übernahme aus get_sizes(note_type)).
Zweck
-----
Erzeugt Qdrant-Payloads für Chunks. Voll abwärtskompatibel zu v2.0.1.
Neu: Wenn der Chunker KEIN Overlap im Fenster liefert (== window fehlt / identisch zur Kernpassage),
erzeugen wir FENSTER mit synthetischem Overlap auf Basis chunk_config.get_sizes(note_type)['overlap'].
Hinweis
------
IDs, Felder und Vektoren bleiben unverändert.
Felder (beibehalten aus 2.0.1):
- note_id, chunk_id, id (Alias), chunk_index, seq, path
- window (mit Overlap), text (ohne linkes Overlap)
- start, end (Offsets im gesamten Body)
- overlap_left, overlap_right
- token_count?, section_title?, section_path?, type?, title?, tags?
Kompatibilität:
- 'id' == 'chunk_id' als Alias
- Pfade bleiben relativ (keine führenden '/'), Backslashes → Slashes
- Robust für Chunk-Objekte oder Dicts; Fensterquelle: 'window'|'text'|'content'|'raw'
Lizenz: MIT (projektintern)
"""
from __future__ import annotations
@ -28,13 +37,6 @@ except Exception:
# konservativer Default, falls Import fehlschlägt
return {"overlap": (40, 60), "target": (250, 350), "max": 500}
# NEU: optionaler Import – Overlap-Empfehlungen aus der Type-Registry
try:
from app.core.type_registry import profile_overlap as _profile_overlap
except Exception:
def _profile_overlap(_profile: Optional[str]) -> tuple[int,int]:
return (40, 60)
# ------------------------------- Utils ------------------------------- #
@ -110,8 +112,6 @@ def make_chunk_payloads(
rel_path: str,
chunks: Iterable[Union[Dict[str, Any], Any]],
note_text: Optional[str] = None,
*, # neue, nur-keyword Parameter bleiben abwärtskompatibel
chunk_profile: Optional[str] = None,
) -> List[Dict[str, Any]]:
"""
Baut Payloads pro Chunk. Falls Fenster ohne Overlap geliefert werden,
@ -158,28 +158,15 @@ def make_chunk_payloads(
windows_final = raw_windows[:] # bereits mit Overlap geliefert
else:
# Keine echten Fenster → Segmente sind identisch zu "Fenstern" (bisher),
# wir erzeugen synthetische Fenster mit Overlap.
# wir erzeugen synthetische Fenster mit Overlap gemäß Typ
segments = [w or "" for w in raw_windows]
overlaps_left = []
windows_final = []
recon = ""
try:
# Bisheriges Verhalten: aus get_sizes(note_type)
overlap_low, overlap_high = tuple(_get_sizes(note_type).get("overlap", (40, 60)))
except Exception:
overlap_low, overlap_high = (40, 60)
# Registry-Profil (falls vorhanden) übersteuert *nur* den Overlap – defensiv
if isinstance(chunk_profile, str) and chunk_profile.strip():
try:
o_low, o_high = _profile_overlap(chunk_profile.strip().lower())
# defensiver Clamp: niemals größer als 3x Default
overlap_low = max(0, min(o_low, overlap_low * 3))
overlap_high = max(overlap_low, min(o_high, overlap_high * 3))
except Exception:
pass
overlap_target = int(overlap_low)
for i, seg in enumerate(segments):
@ -268,6 +255,7 @@ def make_chunk_payloads(
return payloads
# __main__ Demo (optional)
if __name__ == "__main__": # pragma: no cover
fm = {"id": "demo", "title": "Demo", "type": "concept"}
# Beispiel ohne echte Fenster → erzeugt synthetische Overlaps
@ -276,7 +264,7 @@ if __name__ == "__main__": # pragma: no cover
{"id": "demo#2", "text": "Gamma Delta"},
{"id": "demo#3", "text": "Delta Epsilon Zeta"},
]
pls = make_chunk_payloads(fm, "path/demo.md", chunks, note_text="Alpha Beta Gamma Delta Epsilon Zeta", chunk_profile="long")
pls = make_chunk_payloads(fm, "path/demo.md", chunks, note_text="Alpha Beta Gamma Delta Epsilon Zeta")
from pprint import pprint
pprint(pls)
recon = "".join(p["text"] for p in pls)

View File

@ -2,17 +2,32 @@
# -*- coding: utf-8 -*-
"""
Modul: app/core/derive_edges.py
Version: 1.5.0
Datum: 2025-11-08
Version: 1.4.0
Datum: 2025-10-01
Änderung
--------
- Integration der Type-Registry (optional): Ist im Typ die Default-Kante
"references" enthalten, werden Note-Scope-References/Backlinks **additiv**
aktiviert – auch wenn `include_note_scope_refs=False` übergeben wurde.
(Keine Breaking Changes: bestehende Parameter bleiben erhalten.)
Zweck
-----
Robuste Kantenbildung für mindnet (Notes/Chunks):
- belongs_to (chunk -> note)
- next / prev (chunk-Kette)
- references (chunk-scope) aus Chunk.window/text
- optional references/backlink (note-scope)
Weitere Logik (belongs_to/prev/next & chunk-scope references) bleibt unverändert.
Wichtig: Wikilinks werden mit der Parser-Funktion `extract_wikilinks` extrahiert,
damit Varianten wie [[id#anchor]] oder [[id|label]] korrekt auf 'id' reduziert werden.
Erwartete Chunk-Payload-Felder:
{
"note_id": "...",
"chunk_id": "...", # Alias "id" ist zulässig
"id": "...",
"chunk_index": int,
"seq": int,
"window": str,
"text": str,
"path": "rel/path.md",
...
}
"""
from __future__ import annotations
@ -21,16 +36,6 @@ from typing import Dict, List, Optional, Iterable
# WICHTIG: benutze die Parser-Extraktion für saubere Wikilinks
from app.core.parser import extract_wikilinks
# optional: Type-Registry (Fallback: deaktiviert)
try:
from app.core.type_registry import load_type_registry, get_type_config
_REG = load_type_registry() # prozessweiter Cache
except Exception: # pragma: no cover
_REG = {"types": {"concept": {"edge_defaults": ["references"]}}}
def get_type_config(_t, _r): # type: ignore
return {"edge_defaults": ["references"]}
def _get(d: dict, *keys, default=None):
for k in keys:
if k in d and d[k] is not None:
@ -81,25 +86,9 @@ def build_edges_for_note(
- next / prev: zwischen aufeinanderfolgenden Chunks
- references: pro Chunk aus window/text
- optional note-scope references/backlinks: dedupliziert über alle Chunk-Funde + note_level_references
Type-Registry-Erweiterung (additiv):
- Wenn der *Note-Typ* 'references' in seinen edge_defaults hat, werden
note-scope references/backlinks zusätzlich aktiviert.
"""
edges: List[dict] = []
# Typ aus Chunk-Payloads ableiten (falls vorhanden)
note_type = None
for ch in chunks:
nt = ch.get("type")
if isinstance(nt, str) and nt.strip():
note_type = nt.strip().lower()
break
type_cfg = get_type_config(note_type, _REG)
edge_defaults = [e for e in (type_cfg.get("edge_defaults") or []) if isinstance(e, str)]
want_note_scope_refs = bool(include_note_scope_refs) or ("references" in edge_defaults)
# belongs_to
for ch in chunks:
cid = _get(ch, "chunk_id", "id")
@ -130,7 +119,7 @@ def build_edges_for_note(
refs_all.extend(refs)
# optional: note-scope references/backlinks
if want_note_scope_refs:
if include_note_scope_refs:
refs_note = refs_all[:]
if note_level_references:
refs_note.extend([r for r in note_level_references if isinstance(r, str) and r])