Dateien nach "app/core" hochladen
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 3s
All checks were successful
Deploy mindnet to llm-node / deploy (push) Successful in 3s
This commit is contained in:
parent
ead1b8c1bc
commit
7b56f696d6
|
|
@ -2,19 +2,28 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
"""
|
"""
|
||||||
Modul: app/core/chunk_payload.py
|
Modul: app/core/chunk_payload.py
|
||||||
Version: 2.3.0
|
Version: 2.2.0
|
||||||
Datum: 2025-11-08
|
Datum: 2025-10-06
|
||||||
|
|
||||||
Änderungen ggü. 2.2.0
|
Zweck
|
||||||
----------------------
|
-----
|
||||||
- Optionaler Parameter `chunk_profile: str | None` (abwärtskompatibel).
|
Erzeugt Qdrant-Payloads für Chunks. Voll abwärtskompatibel zu v2.0.1.
|
||||||
- Bei fehlenden *echten* Fenstern (kein Overlap geliefert) wird das synthetische
|
Neu: Wenn der Chunker KEIN Overlap im Fenster liefert (== window fehlt / identisch zur Kernpassage),
|
||||||
Overlap anhand des Chunk-Profils (short|medium|long) gewählt. Ohne Profil
|
erzeugen wir FENSTER mit synthetischem Overlap auf Basis chunk_config.get_sizes(note_type)['overlap'].
|
||||||
bleibt das bisherige Verhalten bestehen (Übernahme aus get_sizes(note_type)).
|
|
||||||
|
|
||||||
Hinweis
|
Felder (beibehalten aus 2.0.1):
|
||||||
------
|
- note_id, chunk_id, id (Alias), chunk_index, seq, path
|
||||||
IDs, Felder und Vektoren bleiben unverändert.
|
- window (mit Overlap), text (ohne linkes Overlap)
|
||||||
|
- start, end (Offsets im gesamten Body)
|
||||||
|
- overlap_left, overlap_right
|
||||||
|
- token_count?, section_title?, section_path?, type?, title?, tags?
|
||||||
|
|
||||||
|
Kompatibilität:
|
||||||
|
- 'id' == 'chunk_id' als Alias
|
||||||
|
- Pfade bleiben relativ (keine führenden '/'), Backslashes → Slashes
|
||||||
|
- Robust für Chunk-Objekte oder Dicts; Fensterquelle: 'window'|'text'|'content'|'raw'
|
||||||
|
|
||||||
|
Lizenz: MIT (projektintern)
|
||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
|
@ -28,13 +37,6 @@ except Exception:
|
||||||
# konservativer Default, falls Import fehlschlägt
|
# konservativer Default, falls Import fehlschlägt
|
||||||
return {"overlap": (40, 60), "target": (250, 350), "max": 500}
|
return {"overlap": (40, 60), "target": (250, 350), "max": 500}
|
||||||
|
|
||||||
# NEU: optionaler Import – Overlap-Empfehlungen aus der Type-Registry
|
|
||||||
try:
|
|
||||||
from app.core.type_registry import profile_overlap as _profile_overlap
|
|
||||||
except Exception:
|
|
||||||
def _profile_overlap(_profile: Optional[str]) -> tuple[int,int]:
|
|
||||||
return (40, 60)
|
|
||||||
|
|
||||||
|
|
||||||
# ------------------------------- Utils ------------------------------- #
|
# ------------------------------- Utils ------------------------------- #
|
||||||
|
|
||||||
|
|
@ -110,8 +112,6 @@ def make_chunk_payloads(
|
||||||
rel_path: str,
|
rel_path: str,
|
||||||
chunks: Iterable[Union[Dict[str, Any], Any]],
|
chunks: Iterable[Union[Dict[str, Any], Any]],
|
||||||
note_text: Optional[str] = None,
|
note_text: Optional[str] = None,
|
||||||
*, # neue, nur-keyword Parameter bleiben abwärtskompatibel
|
|
||||||
chunk_profile: Optional[str] = None,
|
|
||||||
) -> List[Dict[str, Any]]:
|
) -> List[Dict[str, Any]]:
|
||||||
"""
|
"""
|
||||||
Baut Payloads pro Chunk. Falls Fenster ohne Overlap geliefert werden,
|
Baut Payloads pro Chunk. Falls Fenster ohne Overlap geliefert werden,
|
||||||
|
|
@ -158,28 +158,15 @@ def make_chunk_payloads(
|
||||||
windows_final = raw_windows[:] # bereits mit Overlap geliefert
|
windows_final = raw_windows[:] # bereits mit Overlap geliefert
|
||||||
else:
|
else:
|
||||||
# Keine echten Fenster → Segmente sind identisch zu "Fenstern" (bisher),
|
# Keine echten Fenster → Segmente sind identisch zu "Fenstern" (bisher),
|
||||||
# wir erzeugen synthetische Fenster mit Overlap.
|
# wir erzeugen synthetische Fenster mit Overlap gemäß Typ
|
||||||
segments = [w or "" for w in raw_windows]
|
segments = [w or "" for w in raw_windows]
|
||||||
overlaps_left = []
|
overlaps_left = []
|
||||||
windows_final = []
|
windows_final = []
|
||||||
recon = ""
|
recon = ""
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Bisheriges Verhalten: aus get_sizes(note_type)
|
|
||||||
overlap_low, overlap_high = tuple(_get_sizes(note_type).get("overlap", (40, 60)))
|
overlap_low, overlap_high = tuple(_get_sizes(note_type).get("overlap", (40, 60)))
|
||||||
except Exception:
|
except Exception:
|
||||||
overlap_low, overlap_high = (40, 60)
|
overlap_low, overlap_high = (40, 60)
|
||||||
|
|
||||||
# Registry-Profil (falls vorhanden) übersteuert *nur* den Overlap defensiv
|
|
||||||
if isinstance(chunk_profile, str) and chunk_profile.strip():
|
|
||||||
try:
|
|
||||||
o_low, o_high = _profile_overlap(chunk_profile.strip().lower())
|
|
||||||
# defensiver Clamp: niemals größer als 3x Default
|
|
||||||
overlap_low = max(0, min(o_low, overlap_low * 3))
|
|
||||||
overlap_high = max(overlap_low, min(o_high, overlap_high * 3))
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
overlap_target = int(overlap_low)
|
overlap_target = int(overlap_low)
|
||||||
|
|
||||||
for i, seg in enumerate(segments):
|
for i, seg in enumerate(segments):
|
||||||
|
|
@ -268,6 +255,7 @@ def make_chunk_payloads(
|
||||||
return payloads
|
return payloads
|
||||||
|
|
||||||
|
|
||||||
|
# __main__ Demo (optional)
|
||||||
if __name__ == "__main__": # pragma: no cover
|
if __name__ == "__main__": # pragma: no cover
|
||||||
fm = {"id": "demo", "title": "Demo", "type": "concept"}
|
fm = {"id": "demo", "title": "Demo", "type": "concept"}
|
||||||
# Beispiel ohne echte Fenster → erzeugt synthetische Overlaps
|
# Beispiel ohne echte Fenster → erzeugt synthetische Overlaps
|
||||||
|
|
@ -276,7 +264,7 @@ if __name__ == "__main__": # pragma: no cover
|
||||||
{"id": "demo#2", "text": "Gamma Delta"},
|
{"id": "demo#2", "text": "Gamma Delta"},
|
||||||
{"id": "demo#3", "text": "Delta Epsilon Zeta"},
|
{"id": "demo#3", "text": "Delta Epsilon Zeta"},
|
||||||
]
|
]
|
||||||
pls = make_chunk_payloads(fm, "path/demo.md", chunks, note_text="Alpha Beta Gamma Delta Epsilon Zeta", chunk_profile="long")
|
pls = make_chunk_payloads(fm, "path/demo.md", chunks, note_text="Alpha Beta Gamma Delta Epsilon Zeta")
|
||||||
from pprint import pprint
|
from pprint import pprint
|
||||||
pprint(pls)
|
pprint(pls)
|
||||||
recon = "".join(p["text"] for p in pls)
|
recon = "".join(p["text"] for p in pls)
|
||||||
|
|
|
||||||
|
|
@ -2,17 +2,32 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
"""
|
"""
|
||||||
Modul: app/core/derive_edges.py
|
Modul: app/core/derive_edges.py
|
||||||
Version: 1.5.0
|
Version: 1.4.0
|
||||||
Datum: 2025-11-08
|
Datum: 2025-10-01
|
||||||
|
|
||||||
Änderung
|
Zweck
|
||||||
--------
|
-----
|
||||||
- Integration der Type-Registry (optional): Ist im Typ die Default-Kante
|
Robuste Kantenbildung für mindnet (Notes/Chunks):
|
||||||
"references" enthalten, werden Note-Scope-References/Backlinks **additiv**
|
- belongs_to (chunk -> note)
|
||||||
aktiviert – auch wenn `include_note_scope_refs=False` übergeben wurde.
|
- next / prev (chunk-Kette)
|
||||||
(Keine Breaking Changes: bestehende Parameter bleiben erhalten.)
|
- references (chunk-scope) aus Chunk.window/text
|
||||||
|
- optional references/backlink (note-scope)
|
||||||
|
|
||||||
Weitere Logik (belongs_to/prev/next & chunk-scope references) bleibt unverändert.
|
Wichtig: Wikilinks werden mit der Parser-Funktion `extract_wikilinks` extrahiert,
|
||||||
|
damit Varianten wie [[id#anchor]] oder [[id|label]] korrekt auf 'id' reduziert werden.
|
||||||
|
|
||||||
|
Erwartete Chunk-Payload-Felder:
|
||||||
|
{
|
||||||
|
"note_id": "...",
|
||||||
|
"chunk_id": "...", # Alias "id" ist zulässig
|
||||||
|
"id": "...",
|
||||||
|
"chunk_index": int,
|
||||||
|
"seq": int,
|
||||||
|
"window": str,
|
||||||
|
"text": str,
|
||||||
|
"path": "rel/path.md",
|
||||||
|
...
|
||||||
|
}
|
||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
|
@ -21,16 +36,6 @@ from typing import Dict, List, Optional, Iterable
|
||||||
# WICHTIG: benutze die Parser-Extraktion für saubere Wikilinks
|
# WICHTIG: benutze die Parser-Extraktion für saubere Wikilinks
|
||||||
from app.core.parser import extract_wikilinks
|
from app.core.parser import extract_wikilinks
|
||||||
|
|
||||||
# optional: Type-Registry (Fallback: deaktiviert)
|
|
||||||
try:
|
|
||||||
from app.core.type_registry import load_type_registry, get_type_config
|
|
||||||
_REG = load_type_registry() # prozessweiter Cache
|
|
||||||
except Exception: # pragma: no cover
|
|
||||||
_REG = {"types": {"concept": {"edge_defaults": ["references"]}}}
|
|
||||||
def get_type_config(_t, _r): # type: ignore
|
|
||||||
return {"edge_defaults": ["references"]}
|
|
||||||
|
|
||||||
|
|
||||||
def _get(d: dict, *keys, default=None):
|
def _get(d: dict, *keys, default=None):
|
||||||
for k in keys:
|
for k in keys:
|
||||||
if k in d and d[k] is not None:
|
if k in d and d[k] is not None:
|
||||||
|
|
@ -81,25 +86,9 @@ def build_edges_for_note(
|
||||||
- next / prev: zwischen aufeinanderfolgenden Chunks
|
- next / prev: zwischen aufeinanderfolgenden Chunks
|
||||||
- references: pro Chunk aus window/text
|
- references: pro Chunk aus window/text
|
||||||
- optional note-scope references/backlinks: dedupliziert über alle Chunk-Funde + note_level_references
|
- optional note-scope references/backlinks: dedupliziert über alle Chunk-Funde + note_level_references
|
||||||
|
|
||||||
Type-Registry-Erweiterung (additiv):
|
|
||||||
- Wenn der *Note-Typ* 'references' in seinen edge_defaults hat, werden
|
|
||||||
note-scope references/backlinks zusätzlich aktiviert.
|
|
||||||
"""
|
"""
|
||||||
edges: List[dict] = []
|
edges: List[dict] = []
|
||||||
|
|
||||||
# Typ aus Chunk-Payloads ableiten (falls vorhanden)
|
|
||||||
note_type = None
|
|
||||||
for ch in chunks:
|
|
||||||
nt = ch.get("type")
|
|
||||||
if isinstance(nt, str) and nt.strip():
|
|
||||||
note_type = nt.strip().lower()
|
|
||||||
break
|
|
||||||
type_cfg = get_type_config(note_type, _REG)
|
|
||||||
edge_defaults = [e for e in (type_cfg.get("edge_defaults") or []) if isinstance(e, str)]
|
|
||||||
|
|
||||||
want_note_scope_refs = bool(include_note_scope_refs) or ("references" in edge_defaults)
|
|
||||||
|
|
||||||
# belongs_to
|
# belongs_to
|
||||||
for ch in chunks:
|
for ch in chunks:
|
||||||
cid = _get(ch, "chunk_id", "id")
|
cid = _get(ch, "chunk_id", "id")
|
||||||
|
|
@ -130,7 +119,7 @@ def build_edges_for_note(
|
||||||
refs_all.extend(refs)
|
refs_all.extend(refs)
|
||||||
|
|
||||||
# optional: note-scope references/backlinks
|
# optional: note-scope references/backlinks
|
||||||
if want_note_scope_refs:
|
if include_note_scope_refs:
|
||||||
refs_note = refs_all[:]
|
refs_note = refs_all[:]
|
||||||
if note_level_references:
|
if note_level_references:
|
||||||
refs_note.extend([r for r in note_level_references if isinstance(r, str) and r])
|
refs_note.extend([r for r in note_level_references if isinstance(r, str) and r])
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user