Implement chunk-aware graph traversal in hybrid_retrieve: Extract both note_id and chunk_id from hits to enhance seed coverage for edge retrieval. Combine direct and additional chunk IDs for improved accuracy in subgraph expansion. Update debug logging to reflect the new seed and chunk ID handling, ensuring better traceability in graph retrieval processes.

2026-01-11 17:48:30 +01:00 · 2026-01-11 17:48:30 +01:00 · 2445f7cb2b
commit 2445f7cb2b
parent 47fdcf8eed
1 changed files with 26 additions and 8 deletions
--- a/app/core/retrieval/retriever.py
+++ b/app/core/retrieval/retriever.py
@@ -396,22 +396,40 @@ def hybrid_retrieve(req: QueryRequest) -> QueryResponse:
    subgraph: ga.Subgraph | None = None
    if depth > 0 and hits:
-        seed_ids = list({h[2].get("note_id") for h in hits if h[2].get("note_id")})
+        # WP-24c v4.5.2: Chunk-Aware Graph Traversal
        # Extrahiere sowohl note_id als auch chunk_id (pid) direkt aus den Hits
        # Dies stellt sicher, dass Chunk-Scope Edges gefunden werden
        seed_note_ids = list({h[2].get("note_id") for h in hits if h[2].get("note_id")})
        seed_chunk_ids = list({h[0] for h in hits if h[0]})  # pid ist die Chunk-ID
-        if seed_ids:
+        # Kombiniere beide Sets für vollständige Seed-Abdeckung
        # Chunk-IDs können auch als Note-IDs fungieren (für Note-Scope Edges)
        all_seed_ids = list(set(seed_note_ids + seed_chunk_ids))
        if all_seed_ids:
            try:
-                # WP-24c v4.1.0: Scope-Awareness - Lade Chunk-IDs für Note-IDs
+                # WP-24c v4.5.2: Chunk-IDs sind bereits aus Hits extrahiert
-                chunk_ids = _get_chunk_ids_for_notes(client, prefix, seed_ids)
+                # Zusätzlich können wir noch weitere Chunk-IDs für die Note-IDs laden
                # (für den Fall, dass nicht alle Chunks in den Top-K Hits sind)
                additional_chunk_ids = _get_chunk_ids_for_notes(client, prefix, seed_note_ids)
                # Kombiniere direkte Chunk-IDs aus Hits mit zusätzlich geladenen
                all_chunk_ids = list(set(seed_chunk_ids + additional_chunk_ids))
-                # Erweiterte Edge-Retrieval mit Chunk-Scope und Section-Filtering
+                # WP-24c v4.5.2: Erweiterte Edge-Retrieval mit Chunk-Scope und Section-Filtering
                # Verwende all_seed_ids (enthält sowohl note_id als auch chunk_id)
                # und all_chunk_ids für explizite Chunk-Scope Edge-Suche
                subgraph = ga.expand(
-                    client, prefix, seed_ids, 
+                    client, prefix, all_seed_ids, 
                    depth=depth, 
                    edge_types=expand_cfg.get("edge_types"),
-                    chunk_ids=chunk_ids,
+                    chunk_ids=all_chunk_ids,
                    target_section=target_section
                )
                # WP-24c v4.5.2: Debug-Logging für Chunk-Awareness
                logger.debug(f"🔍 [SEEDS] Note-IDs: {len(seed_note_ids)}, Chunk-IDs: {len(seed_chunk_ids)}, Total Seeds: {len(all_seed_ids)}")
                logger.debug(f"   -> Zusätzliche Chunk-IDs geladen: {len(additional_chunk_ids)}, Total Chunk-IDs: {len(all_chunk_ids)}")
                # --- WP-24c v4.1.0: Chunk-Level Edge-Aggregation & Deduplizierung ---
                # Verhindert Score-Explosion durch multiple Links auf versch. Abschnitte.
                # Logik: 1. Kante zählt voll, weitere dämpfen auf Faktor 0.1.
@@ -512,7 +530,7 @@ def hybrid_retrieve(req: QueryRequest) -> QueryResponse:
        # WP-24c v4.5.1: Subgraph hat kein .edges Attribut, sondern .adj (Adjazenzliste)
        # Zähle alle Kanten aus der Adjazenzliste
        edge_count = sum(len(edges) for edges in subgraph.adj.values()) if hasattr(subgraph, 'adj') else 0
-        logger.debug(f"📊 [GRAPH] Subgraph enthält {edge_count} Kanten für {len(seed_ids)} Seed-Notizen")
+        logger.debug(f"📊 [GRAPH] Subgraph enthält {edge_count} Kanten")
    else:
        logger.debug(f"📊 [GRAPH] Kein Subgraph (depth=0 oder keine Seed-IDs)")