From bd45abd781cf77d34a193f644e68c39fa0c8eeed Mon Sep 17 00:00:00 2001 From: Lars Date: Thu, 4 Sep 2025 08:22:01 +0200 Subject: [PATCH] app/core/qdrant_points.py aktualisiert --- app/core/qdrant_points.py | 45 ++++++++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/app/core/qdrant_points.py b/app/core/qdrant_points.py index b5759dc..4fe5290 100644 --- a/app/core/qdrant_points.py +++ b/app/core/qdrant_points.py @@ -9,10 +9,7 @@ def _names(prefix: str) -> Tuple[str, str, str]: def _to_uuid(stable_key: str) -> str: - """ - Erzeuge eine stabile UUIDv5 aus einem stabilen String-Key (z. B. note_id, chunk_id, edge_id). - Wir verwenden NAMESPACE_URL, damit die UUIDs deterministisch sind. - """ + """Stabile UUIDv5 aus einem String-Key (deterministisch).""" return str(uuid.uuid5(uuid.NAMESPACE_URL, stable_key)) @@ -22,14 +19,12 @@ def points_for_note( note_vec: List[float] | None, dim: int, ) -> Tuple[str, List[rest.PointStruct]]: - """ - (collection_name, [PointStruct]) für die Notes-Collection. - Falls kein Note-Embedding vorhanden -> Nullvektor der Länge `dim`. - """ + """Notes-Collection: falls kein Note-Embedding -> Nullvektor der Länge dim.""" notes_col, _, _ = _names(prefix) vector = note_vec if note_vec is not None else [0.0] * int(dim) - # Qdrant-Point-ID MUSS int oder UUID sein -> aus note_id eine UUIDv5 machen - point_id = _to_uuid(note_payload["note_id"]) + # Qdrant-Point-ID MUSS int/UUID sein + raw_note_id = note_payload.get("note_id") or note_payload.get("id") or "missing-note-id" + point_id = _to_uuid(raw_note_id) pt = rest.PointStruct(id=point_id, vector=vector, payload=note_payload) return notes_col, [pt] @@ -40,26 +35,42 @@ def points_for_chunks( vectors: List[List[float]], ) -> Tuple[str, List[rest.PointStruct]]: """ - (collection_name, [PointStruct]) für die Chunks-Collection. - Erwartet pro Chunk einen Vektor (oder Nullvektor, wenn --skip-embed). + Chunks-Collection: erwartet pro Chunk einen Vektor. + Robustheit: + - Fehlt 'chunk_id', nutze 'id', sonst baue '${note_id}#${i}' (1-basiert). + - Schreibe die abgeleitete ID zurück in die Payload (pl['chunk_id']). """ _, chunks_col, _ = _names(prefix) points: List[rest.PointStruct] = [] - for pl, vec in zip(chunk_payloads, vectors): - point_id = _to_uuid(pl["chunk_id"]) + for i, (pl, vec) in enumerate(zip(chunk_payloads, vectors), start=1): + chunk_id = pl.get("chunk_id") or pl.get("id") + if not chunk_id: + note_id = pl.get("note_id") or pl.get("parent_note_id") or "missing-note" + chunk_id = f"{note_id}#{i}" + pl["chunk_id"] = chunk_id # persistenter Fallback in Payload + point_id = _to_uuid(chunk_id) points.append(rest.PointStruct(id=point_id, vector=vec, payload=pl)) return chunks_col, points def points_for_edges(prefix: str, edge_payloads: List[dict]) -> Tuple[str, List[rest.PointStruct]]: """ - (collection_name, [PointStruct]) für die Edges-Collection. - Edges-Collection ist ohne Vektor angelegt -> nur Payload + UUID-IDs. + Edges-Collection ohne Vektor. + Robustheit: + - Fehlt 'edge_id', dann konstruiere aus (kind, source_id, target_id, seq) eine stabile ID. """ _, _, edges_col = _names(prefix) points: List[rest.PointStruct] = [] for pl in edge_payloads: - point_id = _to_uuid(pl["edge_id"]) + edge_id = pl.get("edge_id") + if not edge_id: + kind = pl.get("kind", "edge") + s = pl.get("source_id", "unknown-src") + t = pl.get("target_id", "unknown-tgt") + seq = pl.get("seq") or pl.get("order") or "" + edge_id = f"{kind}:{s}->{t}#{seq}" + pl["edge_id"] = edge_id + point_id = _to_uuid(edge_id) points.append(rest.PointStruct(id=point_id, payload=pl)) return edges_col, points