Compare commits
967 Commits
_basis_1_s
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
| 0d61a9e191 | |||
| 55d1a7e290 | |||
| 4537e65428 | |||
| 43327c1f6d | |||
| 39a6998123 | |||
| 273c4c6919 | |||
| 2ed4488cf6 | |||
| 36490425c5 | |||
| b8cb8bb89b | |||
| 6d268d9dfb | |||
| df5f9b3fe4 | |||
| 5e67cd470c | |||
| 0b2a1f1a63 | |||
| d0012355b9 | |||
| 1056078e6a | |||
| c42a76b3d7 | |||
| ec9b3c68af | |||
| f9118a36f8 | |||
| e52eed40ca | |||
| 43641441ef | |||
| c613d81846 | |||
| de5db09b51 | |||
| 7cb8fd6602 | |||
| 6047e94964 | |||
| 78fbc9b31b | |||
| 742792770c | |||
| b19f91c3ee | |||
| 9b0d8c18cb | |||
| f2a2f4d2df | |||
| ea0fd951f2 | |||
| c8c828c8a8 | |||
| 716a063849 | |||
| 3dc81ade0f | |||
| 1df89205ac | |||
| 2445f7cb2b | |||
| 47fdcf8eed | |||
| 3e27c72b80 | |||
| 2d87f9d816 | |||
| d7d6155203 | |||
| f8506c0bb2 | |||
| c91910ee9f | |||
| ee91583614 | |||
| 3a17b646e1 | |||
| 727de50290 | |||
| a780104b3c | |||
| f51e1cb2c4 | |||
| 20fb1e92e2 | |||
| 1d66ca0649 | |||
| 55b64c331a | |||
| 4d43cc526e | |||
| 6131b315d7 | |||
| dfff46e45c | |||
| 003a270548 | |||
| 39fd15b565 | |||
| be2bed9927 | |||
| 2da98e8e37 | |||
| a852975811 | |||
| 8fd7ef804d | |||
| b0f4309a29 | |||
| c33b1c644a | |||
| 7cc823e2f4 | |||
| 7e00344b84 | |||
| ec89d83916 | |||
| 57656bbaaf | |||
| 7953acf3ee | |||
| 3f528f2184 | |||
| 29e334625e | |||
| 114cea80de | |||
| 981b0cba1f | |||
| e2c40666d1 | |||
| c9ae58725c | |||
| 4318395c83 | |||
| 00264a9653 | |||
| 7e4ea670b1 | |||
| 008a470f02 | |||
| 7ed82ad82e | |||
| 72cf71fa87 | |||
| 9cb08777fa | |||
| 2c18f8b3de | |||
| d5d6987ce2 | |||
| 61a319a049 | |||
| a392dc2786 | |||
| 5e2a074019 | |||
| 9b3fd7723e | |||
| 4802eba27b | |||
| 745352ff3f | |||
| 13f0a0c9bc | |||
| 4a404d74de | |||
| 8ed4efaadc | |||
| d17c966301 | |||
| 548c503e7c | |||
| 277444ec0a | |||
| 62a00d1ac3 | |||
| 8505538b34 | |||
| a9d0874fe9 | |||
| 1563ebbdf9 | |||
| 38fac89f73 | |||
| 7026fc4fed | |||
| d41da670fc | |||
| 5541ceb13d | |||
| ac26cc4940 | |||
| 9b906bbabf | |||
| 9a98093e70 | |||
| de05784428 | |||
| f62983b08f | |||
| d0eae8e43c | |||
| 3d2f3d12d9 | |||
| 0d2469f8fa | |||
| 124849c580 | |||
| ea38743a2a | |||
| 5ab01c5150 | |||
| ed3f3e5588 | |||
| bb6959a090 | |||
| d49d509451 | |||
| 008167268f | |||
| 67d7154328 | |||
| 4de9a4f649 | |||
| d35bdc64b9 | |||
| 3a768be488 | |||
| 1965c984f4 | |||
| e3e1700de5 | |||
| f05d766b64 | |||
| 58e414041a | |||
| cd5056d4c9 | |||
| 39fb821481 | |||
| beb87a8c43 | |||
| 4327fc939c | |||
| ef1046c6f5 | |||
| ef8cf719f2 | |||
| 6aa6b32a6c | |||
| 33dff04d47 | |||
| 65d697b7be | |||
| 06fc42ed37 | |||
| 3c5c567077 | |||
| 8f65e550c8 | |||
| 6b83879741 | |||
| be265e9cc0 | |||
| 680c36ab59 | |||
| 96b4f65cd1 | |||
| b1a897e51c | |||
| e5a34efee9 | |||
| f9ac4e4dbf | |||
| 1b40e29f40 | |||
| 7eba1fb487 | |||
| 838083b909 | |||
| 8f5eb36b5f | |||
| b7d1bcce3d | |||
| 03d3173ca6 | |||
| 38a61d7b50 | |||
| 0a429e1f7b | |||
| 857ba953e3 | |||
| e180018c99 | |||
| ac9956bf00 | |||
| 62b5a8bf65 | |||
| 303efefcb7 | |||
| feeb7c2d92 | |||
| ea9a54421a | |||
| fdf99b2bb0 | |||
| c7cd641f89 | |||
| 18b90c8df3 | |||
| 8d3bc1c2e2 | |||
| 079d988034 | |||
| aa9d388337 | |||
| 92bd3d9a47 | |||
| 53058d1504 | |||
| 3fe8463a03 | |||
| 5c4ce5d727 | |||
| 459193e7b1 | |||
| 98f21323fb | |||
| 515248d438 | |||
| b0c69ad3e0 | |||
| c5f29ab4ae | |||
| 876ee898d8 | |||
| e93bab6ea7 | |||
| 5225090490 | |||
| e9532e8878 | |||
| 23b1cb2966 | |||
| fa909e2e7d | |||
| 7fa9ce81bd | |||
| 8490911958 | |||
| 19d899b277 | |||
| 37ec8b614e | |||
| e045371969 | |||
| cd5383432e | |||
| 8b8baa27b3 | |||
| 386fa3ef0c | |||
| 19c96fd00f | |||
| ecb35fb869 | |||
| 21cda0072a | |||
| e3858e8bc3 | |||
| f08a331bc6 | |||
| cfcaa926cd | |||
| 8ade34af0a | |||
| a6d37c92d2 | |||
| 1b7b8091a3 | |||
| 94e5ebf577 | |||
| cf302e8334 | |||
| 82c7752266 | |||
| c676c8263f | |||
| f6b2375d65 | |||
| d1a065fec8 | |||
| f686ecf947 | |||
| 6ac1f318d0 | |||
| cbaf664123 | |||
| 470e653da6 | |||
| 43d3d8f7f3 | |||
| 83c0c9944d | |||
| f6f3213b84 | |||
| e5db7011f3 | |||
| b0bc8518ed | |||
| 046b648286 | |||
| e04aecb0c5 | |||
| d64ba06809 | |||
| b0d73cb053 | |||
| 5213d262a2 | |||
| 2985f5288b | |||
| 27b404560c | |||
| 16e128668c | |||
| ecfdc67485 | |||
| c9cf1b7e4c | |||
| fa6eb0795a | |||
| 4ac504a4ae | |||
| 56dd1bcd84 | |||
| b4a07a05af | |||
| 5c55229376 | |||
| f3fd71b828 | |||
| 079cf174d4 | |||
| 4ab44e36a2 | |||
| 5278c75ac1 | |||
| a908853c30 | |||
| 867a7a8b44 | |||
| 2c073c7d3c | |||
| 0157faab89 | |||
| f1bfa40b5b | |||
| dcc3083455 | |||
| a5b4dfb31f | |||
| a733212c0f | |||
| 49b454d2ec | |||
| 18780e5330 | |||
| 36fb27edf0 | |||
| c60aba63a4 | |||
| 2a98c37ca1 | |||
| 0ac8a14ea7 | |||
| 234949800b | |||
| c68c7404cf | |||
| f8ed7bb62e | |||
| 81d005d969 | |||
| 99482ad65a | |||
| 2d43e0596c | |||
| 74cac7e16c | |||
| 0c2dc61cb5 | |||
| 97c3461685 | |||
| 41e7a49d52 | |||
| 725dc6bda4 | |||
| 5da8eb6543 | |||
| 4f2296b9b0 | |||
| cb9addb1b6 | |||
| c5198a80d8 | |||
| 0df266b2e9 | |||
| 444f64e94c | |||
| 59bcd227f9 | |||
| 7f707cffb9 | |||
| a50c4494c6 | |||
| 418612cd10 | |||
| 828fe1f8ce | |||
| c184c3cae2 | |||
| 2cd3605017 | |||
| 0d71f41a13 | |||
| f5bfb0cfb4 | |||
| f7ab32ebf4 | |||
| 64dbd57fc5 | |||
| ba46957556 | |||
| 33b0c83c87 | |||
| 5dd58f49f0 | |||
| cc12dcf993 | |||
| cbfdd96152 | |||
| babab3167b | |||
| c61d9c8236 | |||
| e47241740d | |||
| 6db1714449 | |||
| 136c3bb43f | |||
| 2c3ee8efd6 | |||
| 48729e6f5d | |||
| 3eac646cb6 | |||
| 9a18f3cc8b | |||
| 5dd20d683f | |||
| 8af744fc97 | |||
| 342d3e5103 | |||
| 7f7d8c87db | |||
| 43f695de54 | |||
| 20b219d86c | |||
| 0ff39d7b14 | |||
| e2ee5df815 | |||
| 77a6db7e92 | |||
| d53912a8f1 | |||
| cdcaff184f | |||
| 25ec3880bb | |||
| edbd8f0ca8 | |||
| a4272c17a9 | |||
| c8cdf218f2 | |||
| b3833f2051 | |||
| a272c39613 | |||
| 6df9b54626 | |||
| 156c2c2fd5 | |||
| 6011b96fc1 | |||
| 7639bb8472 | |||
| c61b66b49d | |||
| cf49715c66 | |||
| bf8a814c58 | |||
| 8fadec5c2c | |||
| 7263fee4c7 | |||
| 4204c2c974 | |||
| 9372cfb8ca | |||
| 179949b289 | |||
| cd3946bd11 | |||
| 12e374bc05 | |||
| fd2fac7112 | |||
| 60092b378b | |||
| 9025af62f0 | |||
| 83bb18b6a7 | |||
| ec759dd1dc | |||
| c454fc40bb | |||
| bfb42cfc24 | |||
| 49dd91fee0 | |||
| 56c1862205 | |||
| dbefd30dce | |||
| bc06ce5b90 | |||
| b26d9fd10e | |||
| 909ee338d2 | |||
| 84e319a42f | |||
| efd9c74cd3 | |||
| eccdf43cd3 | |||
| 9e25e5b26b | |||
| cee8fc05c2 | |||
| 9a55d45832 | |||
| 48de6aed8c | |||
| 8772271648 | |||
| 0cb216c2bc | |||
| eb45d78c47 | |||
| 582aed61ec | |||
| 0294414d26 | |||
| 3a2a78e0fd | |||
| 245a96809a | |||
| bb41b11cce | |||
| af73ab9371 | |||
| cba1337066 | |||
| d50ed70467 | |||
| f313f0873b | |||
| f7a4dab707 | |||
| 3861246ac6 | |||
| 053c22bc15 | |||
| bbfa2049eb | |||
| 0b47ffdcb6 | |||
| cf91730e45 | |||
| ae0768e4de | |||
| ccc848f2e2 | |||
| c6ccad1d18 | |||
| 4e8512c812 | |||
| f1bcbb1543 | |||
| 537a2883bf | |||
| 72a3988e49 | |||
| 93b8bc48e4 | |||
| 19f8d76e21 | |||
| 772a202d6e | |||
| dc16bbf8a4 | |||
| 028ad7e941 | |||
| 620858a575 | |||
| 0bb5f57cc6 | |||
| 0a4374dc8c | |||
| 8afa402118 | |||
| 1eee189a49 | |||
| d2e0b48aa5 | |||
| 302e2a3d24 | |||
| faaa3ef55f | |||
| 70ffa5cd4e | |||
| 65bcea71ee | |||
| 16af07dd21 | |||
| fc4c868eaa | |||
| 1fe9582cbe | |||
| d34f388ee1 | |||
| 4cd82d1d2b | |||
| fb6e35ed01 | |||
| 0ba8ae8d1e | |||
| 1121d63383 | |||
| 0b8f0a6c22 | |||
| 30047f8e00 | |||
| 7b6cc8da7c | |||
| c5e613d2b1 | |||
| d965d96cde | |||
| 65bc7622f8 | |||
| 844d13316a | |||
| a403d8baf6 | |||
| d25d623b9c | |||
| 1e3fcc1633 | |||
| 0d28a6c1a7 | |||
| 87083355ee | |||
| 7e9e496d86 | |||
| d93b9b30ae | |||
| 7fc316d284 | |||
| e27b1f4621 | |||
| 3c19e192bc | |||
| df971f9c56 | |||
| ecc2b60427 | |||
| 2826cbadbc | |||
| a2856bfe87 | |||
| 69617802c3 | |||
| 135c02bc9a | |||
| cd28a9d35c | |||
| 49cdc9a13f | |||
| 3629bc3fb9 | |||
| 37c0f526ec | |||
| bf40169662 | |||
| 9fbf0b7c91 | |||
| 652d22e8e8 | |||
| 94dbaafc72 | |||
| 69ad7bc823 | |||
| d59bd1885d | |||
| f6d8751f23 | |||
| 9a38daafc0 | |||
| e5bc77b93e | |||
| 2bcf1930fe | |||
| 2d58220a3c | |||
| 8436f4192d | |||
| 13ab2b7d68 | |||
| 714763f92f | |||
| a1cd0741c9 | |||
| c741cc7d1b | |||
| e8b8c9a350 | |||
| f016a16c68 | |||
| 5bf91d51da | |||
| 00aecf692d | |||
| b815f6235f | |||
| a1a58727fd | |||
| b1cf89982b | |||
| 5fa02aed2d | |||
| 1fde4ed72a | |||
| 52210a91fd | |||
| 9d792e11ce | |||
| 2677ad7269 | |||
| 98c395b5de | |||
| b4ebbe5c28 | |||
| 88e2cbf254 | |||
| b153571933 | |||
| aba6f0c38b | |||
| f4299db347 | |||
| 06f77fe8b7 | |||
| 54e38d58c3 | |||
| fd47a954bd | |||
| 5aae33f578 | |||
| 4ed04039e5 | |||
| e7cec6acc6 | |||
| a7cda3f51c | |||
| d2ee48555a | |||
| 8639791914 | |||
| 765fad6a8d | |||
| 408c4ace93 | |||
| 565d34fb7b | |||
| 0170d9291f | |||
| 865b261294 | |||
| 62aa24a4bc | |||
| f2de556c16 | |||
| 67d865d373 | |||
| 5c0a36c9ea | |||
| f50ae4c934 | |||
| 1fefc538ac | |||
| a520b62d08 | |||
| 1063c94f5d | |||
| 954e21ca81 | |||
| fd39211801 | |||
| ff23ce6f00 | |||
| 86464cec11 | |||
| ba57e8b43f | |||
| be406d230f | |||
| 38abde8516 | |||
| 9bb6566419 | |||
| 2acaf3c060 | |||
| ec33163d98 | |||
| 2d0913cf3b | |||
| 76ea8e3350 | |||
| bbc1e589f5 | |||
| 987e297c07 | |||
| 046aa2cf48 | |||
| 0bc0e66a85 | |||
| 2310633de3 | |||
| bd36d78025 | |||
| d2270fafdd | |||
| 86faf5b8f0 | |||
| 902f26ee95 | |||
| ea9fe1a6ed | |||
| 6c2074166c | |||
| bd44af2b68 | |||
| 972cd0dfac | |||
| 9cc16bb220 | |||
| 03594424a1 | |||
| 97985371ca | |||
| 3c5c01998f | |||
| 2fa24cb1bd | |||
| 0304bae9f4 | |||
| ac6d34731a | |||
| be5ad7e080 | |||
| 32128c06c1 | |||
| 07b7f419de | |||
| 4d6fce2d93 | |||
| b398168b1f | |||
| 6b397fea2e | |||
| d20386e0a7 | |||
| fcb90db2d0 | |||
| 5d8e96372f | |||
| 0d1a98279e | |||
| 73641abf8a | |||
| a4873b3d45 | |||
| 4f171e5245 | |||
| d48b671314 | |||
| b9d73af062 | |||
| 21904e62ee | |||
| f9af30b195 | |||
| 531e3790b3 | |||
| 7745b71832 | |||
| b39663408c | |||
| 9c4696e9d5 | |||
| cf5e53f341 | |||
| e512f768ff | |||
| c3ef65d069 | |||
| 90f0be6baf | |||
| eba51eba0f | |||
| fbaa07ba88 | |||
| d0ecc5316f | |||
| 9ee39277ae | |||
| 9d3fd21c88 | |||
| d7217672a3 | |||
| e39b685b1e | |||
| 5b9399361d | |||
| 7631b93900 | |||
| f98eb8fc16 | |||
| ab60106fb9 | |||
| caab41a760 | |||
| 78d8ab11ed | |||
| d49da1b5fe | |||
| 27497fd6be | |||
| 466ce3ca82 | |||
| 8b5af5ec65 | |||
| 34cbc0c0eb | |||
| 52ff4bbff5 | |||
| 9e7104e362 | |||
| 882ae5368f | |||
| 12a424ce23 | |||
| c3230761e8 | |||
| f5104008c2 | |||
| 013e8c6d0c | |||
| 603bba533b | |||
| e202a8d88b | |||
| c56a31c6bc | |||
| b73b18bf9a | |||
| 0f351252b2 | |||
| d7d7411fa0 | |||
| 45af7f074c | |||
| 70ab118612 | |||
| ba0cdea9d0 | |||
| 09500ff764 | |||
| b29ce4a0a6 | |||
| 311e598b69 | |||
| 413dca770c | |||
| 9d9239b11e | |||
| a571c41c61 | |||
| ba1fab9dcf | |||
| 5dc9c13a71 | |||
| 70821bae5a | |||
| 486abf82dd | |||
| 19afe3af33 | |||
| 113042b7f4 | |||
| f8eebd91cb | |||
| 00297fc354 | |||
| 5baac082f9 | |||
| 611b0aa54e | |||
| 110f0dbcf5 | |||
| 1688cf0a1e | |||
| 13b7c8858a | |||
| b8b6f243d1 | |||
| eaa6e805ce | |||
| 05dbb21c58 | |||
| f1f4358dee | |||
| 3374c41f07 | |||
| 95affe87e5 | |||
| 5e86398fcd | |||
| 844f0cbe42 | |||
| 4d5d7847e5 | |||
| 06795feed6 | |||
| a4548a7ee1 | |||
| 42216865e2 | |||
| e48bdb2401 | |||
| c67eb4b2f3 | |||
| fafa74b0b8 | |||
| 049a39aba3 | |||
| cd26430a61 | |||
| aab010ff17 | |||
| 9a7bc32051 | |||
| bfdd8ac6be | |||
| 018669fb87 | |||
| fb2de78d62 | |||
| 7e8d6c1954 | |||
| d21230c6c3 | |||
| 4e98d5bfa1 | |||
| 8998269377 | |||
| 6e9a227ad2 | |||
| 4e7506da5a | |||
| 06ad93c77c | |||
| 0bb7b34259 | |||
| 859c17b49d | |||
| 46b26c9624 | |||
| fd215c18e4 | |||
| efa3afc7dc | |||
| e03dd66051 | |||
| 3476fe5fae | |||
| 3e08c8347e | |||
| c691123d2d | |||
| a4b81535e8 | |||
| a898cf4b65 | |||
| 10009a0a43 | |||
| 4dfe47ff99 | |||
| bd997e61d6 | |||
| 23d0670126 | |||
| 12c600edbe | |||
| e4115e705e | |||
| 0cbc4a2d72 | |||
| cea6d35729 | |||
| 4c56918d8a | |||
| f17263417e | |||
| 6f0b463489 | |||
| 28c01caa1a | |||
| 95b59e9b0a | |||
| bc967f1f6e | |||
| 4228b8a74f | |||
| 300086fc83 | |||
| 688eb262a9 | |||
| bfb6f5dede | |||
| 88c0fbe56b | |||
| 3d74eff224 | |||
| 22d08afe2d | |||
| a97f757e34 | |||
| bbc8f13944 | |||
| f18a40d76c | |||
| c7644a36aa | |||
| f394f31c12 | |||
| 0513673c2a | |||
| a73542a391 | |||
| c465797654 | |||
| 7ce3f982f2 | |||
| 6311456917 | |||
| 3c06559c5d | |||
| 5c9f2e8c83 | |||
| 62902faaca | |||
| d22fd77b22 | |||
| 2f86f493d6 | |||
| eafa0fad85 | |||
| 2e7c497b69 | |||
| 79dcfdcf9a | |||
| 487e28a904 | |||
| c492f97a67 | |||
| ec2ff74514 | |||
| c501f8d6e6 | |||
| b4287cbfda | |||
| 26724a1db1 | |||
| d6368ee709 | |||
| ea211a5c0b | |||
| 9e8b433c95 | |||
| 2a1c62aeed | |||
| 948d6f4b47 | |||
| 725271a7da | |||
| 4ea62e6886 | |||
| c01cf3b078 | |||
| c67777a6b9 | |||
| e5e5e7560c | |||
| 6e1cd5fbbc | |||
| 8bd8f7c0bb | |||
| 1c42cd1f78 | |||
| be090a43f2 | |||
| 2c81fa0cb5 | |||
| c3815afcd4 | |||
| 8800160564 | |||
| bdbc4a1bf7 | |||
| 597090bc45 | |||
| af36c410b4 | |||
| 4dcd606c10 | |||
| bbd5a7fa48 | |||
| 6dc37ccb66 | |||
| b84906283e | |||
| af0dcdf76a | |||
| f1e1cde597 | |||
| a7c5630e5b | |||
| 82151d6bc3 | |||
| c93e7ad598 | |||
| 290b271cf6 | |||
| a686bdbeaf | |||
| 2b84c62875 | |||
| 2de786fc64 | |||
| 2ddf034983 | |||
| b2043f4f84 | |||
| 52eae52061 | |||
| e451ea64ae | |||
| eb6e4028ff | |||
| ea45493941 | |||
| da70f0e00c | |||
| 8bf3707511 | |||
| 804eeef858 | |||
| 36658f12fe | |||
| f0e675ffd8 | |||
| 90911c1074 | |||
| 1729e0b40d | |||
| 70b3169e7f | |||
| 315a1c0945 | |||
| b9316c693d | |||
| 820e52d869 | |||
| 444e8e4206 | |||
| e6f2c8b4e8 | |||
| f6b7652219 | |||
| 3cd00ab5cf | |||
| 0feae50d90 | |||
| d79c55fc64 | |||
| 98a6f5c480 | |||
| 0fa4aa7280 | |||
| 3b192e2eed | |||
| 2794d26181 | |||
| 7b56f696d6 | |||
| ead1b8c1bc | |||
| b8126e00c4 | |||
| 62f8f4b313 | |||
| 21924541dc | |||
| 197242ca42 | |||
| 7922734dce | |||
| afd7675425 | |||
| 8b3b343645 | |||
| 6713f342c3 | |||
| 79bb8a143c | |||
| d03a181536 | |||
| 8e5304c156 | |||
| c487db6534 | |||
| 0ca54f429e | |||
| 6c62f64784 | |||
| 3282f85007 | |||
| b186569750 | |||
| 7cde2462ec | |||
| c0b96f7c05 | |||
| 3a88caac69 | |||
| 31457e9240 | |||
| 307335a733 | |||
| 4eb5e34ea7 | |||
| f4be219790 | |||
| 53591b6f27 | |||
| a14d0bb7cb | |||
| c2802e7cb3 | |||
| c7f2815f1d | |||
| afc90fe992 | |||
| 2f9ce824a0 | |||
| a39b2a6950 | |||
| 70fa95966c | |||
| 10479014d5 | |||
| 148e22aa7d | |||
| a9fe6196f7 | |||
| ae20665a66 | |||
| ccbf078463 | |||
| cd1e46784f | |||
| b49d97b8d7 | |||
| 3d44de9d87 | |||
| 987b3c1770 | |||
| 25bc6544c4 | |||
| b241ae753b | |||
| 2041771b14 | |||
| b5958a9f63 | |||
| c05dbd4b3b | |||
| 44e468fc21 | |||
| 5f358d329b | |||
| 59fe8154f2 | |||
| d5b2e9bb8f | |||
| 3a3907e1b6 | |||
| f66cdc70b2 | |||
| e299a497a7 | |||
| eefb29d2ad | |||
| 668c485037 | |||
| c83d5bb970 | |||
| 8638b3c95b | |||
| c9b55c16f7 | |||
| a3b9d53d06 | |||
| 215915cee2 | |||
| df9271ed81 | |||
| 97e73a0fa0 | |||
| e2ebe98e7f | |||
| 345e30de71 | |||
| 0084160e12 | |||
| 7280c111f0 | |||
| 763de11d2b | |||
| 9c0de3eeb2 | |||
| 9c81e9fbf1 | |||
| 40603f1652 | |||
| 4a34da31ad | |||
| 6e394faab1 | |||
| 25d0895cac | |||
| f5f93cfb12 | |||
| cf1ebe9d6f | |||
| dd0d4ff8b1 | |||
| 98103f3270 | |||
| a740ebb42e | |||
| 7c79f820cb | |||
| f3b2326304 | |||
| 4eded247c5 | |||
| 3ffb857181 | |||
| ec92906399 | |||
| 87671640ea | |||
| c725e984bb | |||
| 8f05b57744 | |||
| 120b0147f2 | |||
| ae695b4b15 | |||
| 89fee685c3 | |||
| 23eb9a21c8 | |||
| a009efe91c | |||
| dbd3111adf | |||
| 2b35e0c07a | |||
| ba3354df03 | |||
| 7c8c19f416 | |||
| 35f83241cb | |||
| 5297f9b47e | |||
| cbc50ec4eb | |||
| 198cc291e2 | |||
| cb8c3a6ea6 | |||
| 66f597bbba | |||
| fe237c8b4b | |||
| ec7b08a625 | |||
| 9dd728579d | |||
| 0a6ca00f64 | |||
| 6e856ce05f | |||
| 1b8b78cfe9 | |||
| 49508d9f87 | |||
| 63a660223e | |||
| 2cbc2e2f43 | |||
| eb2ad44755 | |||
| c3979142a6 | |||
| 1a30de6bfc | |||
| 8c0df47966 | |||
| 4c177378ae | |||
| a2db175819 | |||
| 2d0f050524 | |||
| 2d2b78edb3 | |||
| 627fa80f3e | |||
| 96cbbb093e | |||
| 6551308f14 | |||
| 9ccc2badd4 | |||
| 07a83bdeb7 | |||
| 224dfb6588 | |||
| d8d6c68ac2 | |||
| 3e369e5464 | |||
| abaaba97f9 | |||
| bd49467d26 | |||
| b44524e1a9 | |||
| 0319667f8e | |||
| 4c3ad1f83c | |||
| a7a845187a | |||
| e14a9a11d2 | |||
| d7ce757d65 | |||
| 6f5447c316 | |||
| 727b67811c | |||
| a85fb68015 | |||
| 2dfe7328cc | |||
| cfc0a0ca8f | |||
| 2b1c7fe20d | |||
| d5e1d9c09b | |||
| 554540451e | |||
| c10134c22b | |||
| 9175b3a045 | |||
| 0ecae44888 | |||
| a9c2a10fb5 | |||
| e11094e5fa | |||
| 38913dc3e1 | |||
| dd2b7932a2 | |||
| 2051e187ba | |||
| ca817a8cc4 | |||
| 2d815f5902 | |||
| 0abdc647c9 | |||
| fded2db73e | |||
| 2274c9faf5 | |||
| 280b236490 | |||
| ef2c144cdd | |||
| 634cf0ff1b | |||
| 2dc3d5c169 | |||
| fcad1fdfaa | |||
| 5c6cdfb5e1 | |||
| 1d6ba761a8 | |||
| 893ba6c8f4 | |||
| a835eff14b | |||
| 029b4d8a6e | |||
| 0053b45691 | |||
| f46812d1d2 | |||
| 8495df961b | |||
| abc8be1077 | |||
| 84baf27f7e | |||
| 1a08d904f9 | |||
| 31ca9a95d7 | |||
| ade91b26c3 | |||
| d797957abc | |||
| 104ddddb96 | |||
| c8c4c7fa00 | |||
| 3111a26229 | |||
| 26ef257e06 | |||
| 6864ea7b53 | |||
| c2ecc15d89 | |||
| 9bbcc0952b | |||
| 179ec4d38b | |||
| 02a61cdf0d | |||
| 6a84b4e5aa | |||
| 497a9b9d58 | |||
| c5972be618 | |||
| 43e77c4c6b | |||
| d66985e597 | |||
| ff4bed99c9 | |||
| 2b4c1b83bd | |||
| f85aa8f604 | |||
| e5ca3c4177 | |||
| 0f2f8208e3 | |||
| ba17516814 | |||
| c0979b69e2 | |||
| 4b5c33bb90 | |||
| 1ea4d53061 | |||
| 17b548e9b0 | |||
| 5755eb1f08 | |||
| e2f15324d7 | |||
| b0b0802a56 | |||
| e14cc8b758 | |||
| 55c6a5058c | |||
| 52d5d01337 | |||
| 93ceff7f17 | |||
| 0a891cfd95 | |||
| 66adc6a354 | |||
| e99f1c2ba0 | |||
| f52a9face8 | |||
| 4fcf22131c | |||
| d27324f8dd | |||
| dd86a15555 | |||
| 28d173b8fd | |||
| b4b7ea76ab | |||
| 6ea452cc3f | |||
| 6e0e965883 | |||
| 6b4b3ce2ba | |||
| 18cdbab885 | |||
| ebca9b4caf | |||
| 6945ef8ddc | |||
| f20bf03ee1 | |||
| 7cda15553d | |||
| 3c67fd5f9b | |||
| 0150931df4 | |||
| 5ff1b06009 | |||
| 5279bcae18 | |||
| daa58e6b27 | |||
| 589d021744 | |||
| e58b9c8ada | |||
| 507c28e006 | |||
| 2d74fe4057 | |||
| e8312268d2 | |||
| f4cd1e51c0 | |||
| 60e707e7df | |||
| 0d6eb4dac4 | |||
| 80d8f986bd | |||
| 9ee3bb9007 | |||
| 8a07010704 | |||
| 314fc8ac94 | |||
| b46e5461a7 | |||
| ada7c1797d | |||
| 54197995f5 | |||
| 13e510dd9c | |||
| 5c937fbffe |
|
|
@ -1,22 +1,36 @@
|
|||
# .gitea/workflows/deploy.yml
|
||||
name: Deploy mindnet to llm-node
|
||||
on: { push: { branches: [ "main" ] } }
|
||||
# on: {}
|
||||
|
||||
concurrency: { group: deploy-mindnet, cancel-in-progress: false }
|
||||
on:
|
||||
push:
|
||||
branches: ["main"]
|
||||
|
||||
concurrency:
|
||||
group: deploy-mindnet
|
||||
cancel-in-progress: false
|
||||
|
||||
jobs:
|
||||
deploy:
|
||||
runs-on: linux_host
|
||||
env:
|
||||
DEPLOY_DIRS: "app scripts schemas docker requirements.txt README.md"
|
||||
TARGET_BASE: "/home/llmadmin/mindnet"
|
||||
TARGET_BASE: /home/llmadmin/mindnet
|
||||
DEPLOY_DIRS: "app scripts schemas docker tests config requirements.txt README.md"
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: https://github.com/actions/checkout@v4
|
||||
- name: Ensure target base
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Ensure target base exists
|
||||
run: install -d "$TARGET_BASE"
|
||||
- name: Deploy whitelisted
|
||||
|
||||
- name: Stop API (graceful)
|
||||
continue-on-error: true
|
||||
run: |
|
||||
if systemctl --user list-unit-files | grep -q '^mindnet-api.service'; then
|
||||
systemctl --user stop mindnet-api.service || true
|
||||
fi
|
||||
|
||||
- name: Deploy whitelisted (rsync)
|
||||
run: |
|
||||
set -euo pipefail
|
||||
IFS=' ' read -r -a DIRS <<< "$DEPLOY_DIRS"
|
||||
|
|
@ -24,22 +38,27 @@ jobs:
|
|||
if [ -e "$d" ]; then
|
||||
if [ -d "$d" ]; then
|
||||
install -d "$TARGET_BASE/$d"
|
||||
rsync -a --delete --exclude='.git' \
|
||||
--exclude='.env' --exclude='.env.*' --exclude='**/.env*' \
|
||||
rsync -a --delete \
|
||||
--exclude='.git' \
|
||||
--exclude='.env*' \
|
||||
--exclude='.venv' \
|
||||
--exclude='hf_cache' --exclude='hf_cache/**' \
|
||||
"$d"/ "$TARGET_BASE/$d"/
|
||||
else
|
||||
rsync -a "$d" "$TARGET_BASE/$d"
|
||||
fi
|
||||
fi
|
||||
done
|
||||
- name: Python venv (idempotent)
|
||||
|
||||
- name: Python venv & requirements
|
||||
run: |
|
||||
cd "$TARGET_BASE"
|
||||
[ -d .venv ] || python3 -m venv .venv
|
||||
source .venv/bin/activate
|
||||
pip install --upgrade pip
|
||||
[ -f requirements.txt ] && pip install -r requirements.txt || true
|
||||
- name: Optional — restart mindnet-api
|
||||
|
||||
- name: Start/Restart mindnet-api
|
||||
continue-on-error: true
|
||||
run: |
|
||||
if systemctl --user list-unit-files | grep -q '^mindnet-api.service'; then
|
||||
|
|
|
|||
3
.vscode/settings.json
vendored
Normal file
3
.vscode/settings.json
vendored
Normal file
|
|
@ -0,0 +1,3 @@
|
|||
{
|
||||
"CodeGPT.apiKey": "Google AI Studio"
|
||||
}
|
||||
237
ANALYSE_TYPES_YAML_ZUGRIFFE.md
Normal file
237
ANALYSE_TYPES_YAML_ZUGRIFFE.md
Normal file
|
|
@ -0,0 +1,237 @@
|
|||
# Analyse: Zugriffe auf config/types.yaml
|
||||
|
||||
## Zusammenfassung
|
||||
|
||||
Diese Analyse prüft, welche Scripte auf `config/types.yaml` zugreifen und ob sie auf Elemente zugreifen, die in der aktuellen `types.yaml` nicht mehr vorhanden sind.
|
||||
|
||||
**Datum:** 2025-01-XX
|
||||
**Version types.yaml:** 2.7.0
|
||||
|
||||
---
|
||||
|
||||
## ❌ KRITISCHE PROBLEME
|
||||
|
||||
### 1. `edge_defaults` fehlt in types.yaml, wird aber im Code verwendet
|
||||
|
||||
**Status:** ⚠️ **PROBLEM** - Code sucht nach `edge_defaults` in types.yaml, aber dieses Feld existiert nicht mehr.
|
||||
|
||||
**Betroffene Dateien:**
|
||||
|
||||
#### a) `app/core/graph/graph_utils.py` (Zeilen 101-112)
|
||||
```python
|
||||
def get_edge_defaults_for(note_type: Optional[str], reg: dict) -> List[str]:
|
||||
"""Ermittelt Standard-Kanten für einen Typ."""
|
||||
types_map = reg.get("types", reg) if isinstance(reg, dict) else {}
|
||||
if note_type and isinstance(types_map, dict):
|
||||
t = types_map.get(note_type)
|
||||
if isinstance(t, dict) and isinstance(t.get("edge_defaults"), list): # ❌ Sucht nach edge_defaults
|
||||
return [str(x) for x in t["edge_defaults"] if isinstance(x, str)]
|
||||
for key in ("defaults", "default", "global"):
|
||||
v = reg.get(key)
|
||||
if isinstance(v, dict) and isinstance(v.get("edge_defaults"), list): # ❌ Sucht nach edge_defaults
|
||||
return [str(x) for x in v["edge_defaults"] if isinstance(x, str)]
|
||||
return []
|
||||
```
|
||||
**Problem:** Funktion gibt immer `[]` zurück, da `edge_defaults` nicht in types.yaml existiert.
|
||||
|
||||
#### b) `app/core/graph/graph_derive_edges.py` (Zeile 64)
|
||||
```python
|
||||
defaults = get_edge_defaults_for(note_type, reg) # ❌ Wird verwendet, liefert aber []
|
||||
```
|
||||
**Problem:** Keine automatischen Default-Kanten werden mehr erzeugt.
|
||||
|
||||
#### c) `app/services/discovery.py` (Zeile 212)
|
||||
```python
|
||||
defaults = type_def.get("edge_defaults") # ❌ Sucht nach edge_defaults
|
||||
return defaults[0] if defaults else "related_to"
|
||||
```
|
||||
**Problem:** Fallback funktioniert, aber nutzt nicht die neue dynamische Lösung.
|
||||
|
||||
#### d) `tests/check_types_registry_edges.py` (Zeile 170)
|
||||
```python
|
||||
eddefs = (tdef or {}).get("edge_defaults") or [] # ❌ Sucht nach edge_defaults
|
||||
```
|
||||
**Problem:** Test findet keine `edge_defaults` mehr und gibt Warnung aus.
|
||||
|
||||
**✅ Lösung bereits implementiert:**
|
||||
- `app/core/ingestion/ingestion_note_payload.py` (WP-24c, Zeilen 124-134) nutzt bereits die neue dynamische Lösung über `edge_registry.get_topology_info()`.
|
||||
|
||||
**Empfehlung:**
|
||||
- `get_edge_defaults_for()` in `graph_utils.py` sollte auf die EdgeRegistry umgestellt werden.
|
||||
- `discovery.py` sollte ebenfalls die EdgeRegistry nutzen.
|
||||
|
||||
---
|
||||
|
||||
### 2. Inkonsistenz: `chunk_profile` vs `chunking_profile`
|
||||
|
||||
**Status:** ⚠️ **WARNUNG** - Meistens abgefangen durch Fallback-Logik.
|
||||
|
||||
**Problem:**
|
||||
- In `types.yaml` heißt es: `chunking_profile` ✅
|
||||
- `app/core/type_registry.py` (Zeile 88) sucht nach: `chunk_profile` ❌
|
||||
|
||||
```python
|
||||
def effective_chunk_profile(note_type: Optional[str], reg: Dict[str, Any]) -> Optional[str]:
|
||||
cfg = get_type_config(note_type, reg)
|
||||
prof = cfg.get("chunk_profile") # ❌ Sucht nach "chunk_profile", aber types.yaml hat "chunking_profile"
|
||||
if isinstance(prof, str) and prof.strip():
|
||||
return prof.strip().lower()
|
||||
return None
|
||||
```
|
||||
|
||||
**Betroffene Dateien:**
|
||||
- `app/core/type_registry.py` (Zeile 88) - verwendet `chunk_profile` statt `chunking_profile`
|
||||
|
||||
**✅ Gut gehandhabt:**
|
||||
- `app/core/ingestion/ingestion_chunk_payload.py` (Zeile 33) - hat Fallback: `t_cfg.get(key) or t_cfg.get(key.replace("ing", ""))`
|
||||
- `app/core/ingestion/ingestion_note_payload.py` (Zeile 120) - prüft beide Varianten
|
||||
|
||||
**Empfehlung:**
|
||||
- `type_registry.py` sollte auch `chunking_profile` prüfen (oder beide Varianten).
|
||||
|
||||
---
|
||||
|
||||
## ✅ KORREKT VERWENDETE ELEMENTE
|
||||
|
||||
### 1. `chunking_profiles` ✅
|
||||
- **Verwendet in:**
|
||||
- `app/core/chunking/chunking_utils.py` (Zeile 33) ✅
|
||||
- **Status:** Korrekt vorhanden in types.yaml
|
||||
|
||||
### 2. `defaults` ✅
|
||||
- **Verwendet in:**
|
||||
- `app/core/ingestion/ingestion_chunk_payload.py` (Zeile 36) ✅
|
||||
- `app/core/ingestion/ingestion_note_payload.py` (Zeile 104) ✅
|
||||
- `app/core/chunking/chunking_utils.py` (Zeile 35) ✅
|
||||
- **Status:** Korrekt vorhanden in types.yaml
|
||||
|
||||
### 3. `ingestion_settings` ✅
|
||||
- **Verwendet in:**
|
||||
- `app/core/ingestion/ingestion_note_payload.py` (Zeile 105) ✅
|
||||
- **Status:** Korrekt vorhanden in types.yaml
|
||||
|
||||
### 4. `llm_settings` ✅
|
||||
- **Verwendet in:**
|
||||
- `app/core/registry.py` (Zeile 37) ✅
|
||||
- **Status:** Korrekt vorhanden in types.yaml
|
||||
|
||||
### 5. `types` (Hauptstruktur) ✅
|
||||
- **Verwendet in:** Viele Dateien
|
||||
- **Status:** Korrekt vorhanden in types.yaml
|
||||
|
||||
### 6. `types[].chunking_profile` ✅
|
||||
- **Verwendet in:**
|
||||
- `app/core/chunking/chunking_utils.py` (Zeile 35) ✅
|
||||
- `app/core/ingestion/ingestion_chunk_payload.py` (Zeile 67) ✅
|
||||
- `app/core/ingestion/ingestion_note_payload.py` (Zeile 120) ✅
|
||||
- **Status:** Korrekt vorhanden in types.yaml
|
||||
|
||||
### 7. `types[].retriever_weight` ✅
|
||||
- **Verwendet in:**
|
||||
- `app/core/ingestion/ingestion_chunk_payload.py` (Zeile 71) ✅
|
||||
- `app/core/ingestion/ingestion_note_payload.py` (Zeile 111) ✅
|
||||
- `app/core/retrieval/retriever_scoring.py` (Zeile 87) ✅
|
||||
- **Status:** Korrekt vorhanden in types.yaml
|
||||
|
||||
### 8. `types[].detection_keywords` ✅
|
||||
- **Verwendet in:**
|
||||
- `app/routers/chat.py` (Zeilen 104, 150) ✅
|
||||
- **Status:** Korrekt vorhanden in types.yaml
|
||||
|
||||
### 9. `types[].schema` ✅
|
||||
- **Verwendet in:**
|
||||
- `app/routers/chat.py` (vermutlich) ✅
|
||||
- **Status:** Korrekt vorhanden in types.yaml
|
||||
|
||||
---
|
||||
|
||||
## 📋 ZUSAMMENFASSUNG DER ZUGRIFFE
|
||||
|
||||
### Dateien, die auf types.yaml zugreifen:
|
||||
|
||||
1. **app/core/type_registry.py** ⚠️
|
||||
- Verwendet: `types`, `chunk_profile` (sollte `chunking_profile` sein)
|
||||
- Problem: Sucht nach `chunk_profile` statt `chunking_profile`
|
||||
|
||||
2. **app/core/registry.py** ✅
|
||||
- Verwendet: `llm_settings.cleanup_patterns`
|
||||
- Status: OK
|
||||
|
||||
3. **app/core/ingestion/ingestion_chunk_payload.py** ✅
|
||||
- Verwendet: `types`, `defaults`, `chunking_profile`, `retriever_weight`
|
||||
- Status: OK (hat Fallback für chunk_profile/chunking_profile)
|
||||
|
||||
4. **app/core/ingestion/ingestion_note_payload.py** ✅
|
||||
- Verwendet: `types`, `defaults`, `ingestion_settings`, `chunking_profile`, `retriever_weight`
|
||||
- Status: OK (nutzt neue EdgeRegistry für edge_defaults)
|
||||
|
||||
5. **app/core/chunking/chunking_utils.py** ✅
|
||||
- Verwendet: `chunking_profiles`, `types`, `defaults.chunking_profile`
|
||||
- Status: OK
|
||||
|
||||
6. **app/core/retrieval/retriever_scoring.py** ✅
|
||||
- Verwendet: `retriever_weight` (aus Payload, kommt ursprünglich aus types.yaml)
|
||||
- Status: OK
|
||||
|
||||
7. **app/core/graph/graph_utils.py** ❌
|
||||
- Verwendet: `types[].edge_defaults` (existiert nicht mehr!)
|
||||
- Problem: Sucht nach `edge_defaults` in types.yaml
|
||||
|
||||
8. **app/core/graph/graph_derive_edges.py** ❌
|
||||
- Verwendet: `get_edge_defaults_for()` → sucht nach `edge_defaults`
|
||||
- Problem: Keine Default-Kanten mehr
|
||||
|
||||
9. **app/services/discovery.py** ⚠️
|
||||
- Verwendet: `types[].edge_defaults` (existiert nicht mehr!)
|
||||
- Problem: Fallback funktioniert, aber nutzt nicht neue Lösung
|
||||
|
||||
10. **app/routers/chat.py** ✅
|
||||
- Verwendet: `types[].detection_keywords`
|
||||
- Status: OK
|
||||
|
||||
11. **tests/test_type_registry.py** ⚠️
|
||||
- Verwendet: `types[].chunk_profile`, `types[].edge_defaults`
|
||||
- Problem: Test verwendet alte Struktur
|
||||
|
||||
12. **tests/check_types_registry_edges.py** ❌
|
||||
- Verwendet: `types[].edge_defaults` (existiert nicht mehr!)
|
||||
- Problem: Test findet keine edge_defaults
|
||||
|
||||
13. **scripts/payload_dryrun.py** ✅
|
||||
- Verwendet: Indirekt über `make_note_payload()` und `make_chunk_payloads()`
|
||||
- Status: OK
|
||||
|
||||
---
|
||||
|
||||
## 🔧 EMPFOHLENE FIXES
|
||||
|
||||
### Priorität 1 (Kritisch):
|
||||
|
||||
1. **`app/core/graph/graph_utils.py` - `get_edge_defaults_for()`**
|
||||
- Sollte auf `edge_registry.get_topology_info()` umgestellt werden
|
||||
- Oder: Rückwärtskompatibilität beibehalten, aber EdgeRegistry als primäre Quelle nutzen
|
||||
|
||||
2. **`app/core/graph/graph_derive_edges.py`**
|
||||
- Nutzt `get_edge_defaults_for()`, sollte nach Fix von graph_utils.py funktionieren
|
||||
|
||||
3. **`app/services/discovery.py`**
|
||||
- Sollte EdgeRegistry für `edge_defaults` nutzen
|
||||
|
||||
### Priorität 2 (Warnung):
|
||||
|
||||
4. **`app/core/type_registry.py` - `effective_chunk_profile()`**
|
||||
- Sollte auch `chunking_profile` prüfen (nicht nur `chunk_profile`)
|
||||
|
||||
5. **`tests/test_type_registry.py`**
|
||||
- Test sollte aktualisiert werden, um `chunking_profile` statt `chunk_profile` zu verwenden
|
||||
|
||||
6. **`tests/check_types_registry_edges.py`**
|
||||
- Test sollte auf EdgeRegistry umgestellt werden oder als deprecated markiert werden
|
||||
|
||||
---
|
||||
|
||||
## 📝 HINWEISE
|
||||
|
||||
- **WP-24c** hat bereits eine Lösung für `edge_defaults` implementiert: Dynamische Abfrage über `edge_registry.get_topology_info()`
|
||||
- Die alte Lösung (statische `edge_defaults` in types.yaml) wurde durch die dynamische Lösung ersetzt
|
||||
- Code-Stellen, die noch die alte Lösung verwenden, sollten migriert werden
|
||||
|
|
@ -1,4 +1,3 @@
|
|||
|
||||
# mindnet API (bundle)
|
||||
|
||||
This bundle provides a minimal FastAPI app for embeddings and Qdrant upserts/queries plus a Markdown importer.
|
||||
|
|
@ -33,3 +32,7 @@ python3 scripts/import_markdown.py --vault /path/to/Obsidian
|
|||
- `POST /qdrant/query` → semantic search over chunks with optional filters
|
||||
|
||||
See `scripts/quick_test.sh` for a runnable example.
|
||||
|
||||
>Anmerkung:
|
||||
Diese Datei ist veraltet und muss auf Stand 2.6.0 gebracht werden
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,10 @@
|
|||
"""
|
||||
FILE: app/__init__.py
|
||||
DESCRIPTION: Paket-Initialisierung.
|
||||
VERSION: 0.1.0
|
||||
STATUS: Active
|
||||
DEPENDENCIES: None
|
||||
LAST_ANALYSIS: 2025-12-15
|
||||
"""
|
||||
|
||||
__version__ = "0.1.0"
|
||||
105
app/config.py
105
app/config.py
|
|
@ -1,22 +1,115 @@
|
|||
"""
|
||||
Version 0.1
|
||||
FILE: app/config.py
|
||||
DESCRIPTION: Zentrale Pydantic-Konfiguration.
|
||||
WP-20: Hybrid-Cloud Modus Support (OpenRouter/Gemini/Ollama).
|
||||
FIX: Einführung von Parametern zur intelligenten Rate-Limit Steuerung (429 Handling).
|
||||
VERSION: 0.6.7
|
||||
STATUS: Active
|
||||
DEPENDENCIES: os, functools, pathlib, python-dotenv
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import os
|
||||
from functools import lru_cache
|
||||
from pathlib import Path
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# WP-20: Lade Umgebungsvariablen aus der .env Datei
|
||||
# override=True garantiert, dass Änderungen in der .env immer Vorrang haben.
|
||||
# WP-24c v4.5.10: Expliziter Pfad für .env-Datei, um Probleme mit Arbeitsverzeichnis zu vermeiden
|
||||
# Suche .env im Projekt-Root (3 Ebenen über app/config.py: app/config.py -> app/ -> root/)
|
||||
_project_root = Path(__file__).parent.parent.parent
|
||||
_env_file = _project_root / ".env"
|
||||
_env_loaded = False
|
||||
|
||||
# Versuche zuerst expliziten Pfad
|
||||
if _env_file.exists():
|
||||
_env_loaded = load_dotenv(_env_file, override=True)
|
||||
if _env_loaded:
|
||||
# Optional: Logging (nur wenn logging bereits initialisiert ist)
|
||||
try:
|
||||
import logging
|
||||
_logger = logging.getLogger(__name__)
|
||||
_logger.debug(f"✅ .env geladen von: {_env_file}")
|
||||
except:
|
||||
pass # Logging noch nicht initialisiert
|
||||
|
||||
# Fallback: Automatische Suche (für Dev/Test oder wenn .env an anderer Stelle liegt)
|
||||
if not _env_loaded:
|
||||
_env_loaded = load_dotenv(override=True)
|
||||
if _env_loaded:
|
||||
try:
|
||||
import logging
|
||||
_logger = logging.getLogger(__name__)
|
||||
_logger.debug(f"✅ .env geladen via automatische Suche (cwd: {Path.cwd()})")
|
||||
except:
|
||||
pass
|
||||
|
||||
class Settings:
|
||||
# Qdrant
|
||||
# --- Qdrant Datenbank ---
|
||||
QDRANT_URL: str = os.getenv("QDRANT_URL", "http://127.0.0.1:6333")
|
||||
QDRANT_API_KEY: str | None = os.getenv("QDRANT_API_KEY")
|
||||
COLLECTION_PREFIX: str = os.getenv("MINDNET_PREFIX", "mindnet")
|
||||
VECTOR_SIZE: int = int(os.getenv("MINDNET_VECTOR_SIZE", "384"))
|
||||
# WP-24c v4.5.10: Harmonisierung - Unterstützt beide Umgebungsvariablen für Abwärtskompatibilität
|
||||
# COLLECTION_PREFIX hat Priorität, MINDNET_PREFIX als Fallback
|
||||
# WP-24c v4.5.10-FIX: Default auf "mindnet" (Prod) statt "mindnet_dev" (Dev)
|
||||
# Dev muss explizit COLLECTION_PREFIX=mindnet_dev in .env setzen
|
||||
COLLECTION_PREFIX: str = os.getenv("COLLECTION_PREFIX") or os.getenv("MINDNET_PREFIX") or "mindnet"
|
||||
|
||||
# WP-22: Vektor-Dimension für das Embedding-Modell (nomic)
|
||||
VECTOR_SIZE: int = int(os.getenv("VECTOR_DIM", "768"))
|
||||
DISTANCE: str = os.getenv("MINDNET_DISTANCE", "Cosine")
|
||||
# Embeddings
|
||||
|
||||
# --- Lokale Embeddings (Ollama & Sentence-Transformers) ---
|
||||
EMBEDDING_MODEL: str = os.getenv("MINDNET_EMBEDDING_MODEL", "nomic-embed-text")
|
||||
MODEL_NAME: str = os.getenv("MINDNET_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
|
||||
# API
|
||||
|
||||
# --- WP-20 Hybrid LLM Provider ---
|
||||
# Erlaubt: "ollama" | "gemini" | "openrouter"
|
||||
MINDNET_LLM_PROVIDER: str = os.getenv("MINDNET_LLM_PROVIDER", "openrouter").lower()
|
||||
# Standardwert 10000, falls nichts in der .env steht
|
||||
MAX_OLLAMA_CHARS: int = int(os.getenv("MAX_OLLAMA_CHARS", 10000))
|
||||
|
||||
# Google AI Studio (2025er Lite-Modell für höhere Kapazität)
|
||||
GOOGLE_API_KEY: str | None = os.getenv("GOOGLE_API_KEY")
|
||||
GEMINI_MODEL: str = os.getenv("MINDNET_GEMINI_MODEL", "gemini-2.5-flash-lite")
|
||||
|
||||
# OpenRouter Integration (Verfügbares Free-Modell 2025)
|
||||
OPENROUTER_API_KEY: str | None = os.getenv("OPENROUTER_API_KEY")
|
||||
OPENROUTER_MODEL: str = os.getenv("OPENROUTER_MODEL", "mistralai/mistral-7b-instruct:free")
|
||||
|
||||
LLM_FALLBACK_ENABLED: bool = os.getenv("MINDNET_LLM_FALLBACK", "true").lower() == "true"
|
||||
|
||||
# --- NEU: Intelligente Rate-Limit Steuerung ---
|
||||
# Dauer der Wartezeit in Sekunden, wenn ein HTTP 429 (Rate Limit) auftritt
|
||||
LLM_RATE_LIMIT_WAIT: float = float(os.getenv("MINDNET_LLM_RATE_LIMIT_WAIT", "60.0"))
|
||||
# Anzahl der Cloud-Retries bei 429, bevor Ollama-Fallback greift
|
||||
LLM_RATE_LIMIT_RETRIES: int = int(os.getenv("MINDNET_LLM_RATE_LIMIT_RETRIES", "3"))
|
||||
|
||||
# --- WP-05 Lokales LLM (Ollama) ---
|
||||
OLLAMA_URL: str = os.getenv("MINDNET_OLLAMA_URL", "http://127.0.0.1:11434")
|
||||
LLM_MODEL: str = os.getenv("MINDNET_LLM_MODEL", "phi3:mini")
|
||||
PROMPTS_PATH: str = os.getenv("MINDNET_PROMPTS_PATH", "config/prompts.yaml")
|
||||
|
||||
# --- WP-06 / WP-14 Performance & Last-Steuerung ---
|
||||
LLM_TIMEOUT: float = float(os.getenv("MINDNET_LLM_TIMEOUT", "300.0"))
|
||||
DECISION_CONFIG_PATH: str = os.getenv("MINDNET_DECISION_CONFIG", "config/decision_engine.yaml")
|
||||
BACKGROUND_LIMIT: int = int(os.getenv("MINDNET_LLM_BACKGROUND_LIMIT", "2"))
|
||||
|
||||
# --- System-Pfade & Ingestion-Logik ---
|
||||
DEBUG: bool = os.getenv("DEBUG", "false").lower() == "true"
|
||||
MINDNET_VAULT_ROOT: str = os.getenv("MINDNET_VAULT_ROOT", "./vault_master")
|
||||
MINDNET_TYPES_FILE: str = os.getenv("MINDNET_TYPES_FILE", "config/types.yaml")
|
||||
MINDNET_VOCAB_PATH: str = os.getenv("MINDNET_VOCAB_PATH", "/mindnet/vault/mindnet/_system/dictionary/edge_vocabulary.md")
|
||||
CHANGE_DETECTION_MODE: str = os.getenv("MINDNET_CHANGE_DETECTION_MODE", "full")
|
||||
|
||||
# --- WP-04 Retriever Gewichte ---
|
||||
RETRIEVER_W_SEM: float = float(os.getenv("MINDNET_WP04_W_SEM", "0.70"))
|
||||
RETRIEVER_W_EDGE: float = float(os.getenv("MINDNET_WP04_W_EDGE", "0.25"))
|
||||
RETRIEVER_W_CENT: float = float(os.getenv("MINDNET_WP04_W_CENT", "0.05"))
|
||||
RETRIEVER_TOP_K: int = int(os.getenv("MINDNET_WP04_TOP_K", "10"))
|
||||
RETRIEVER_EXPAND_DEPTH: int = int(os.getenv("MINDNET_WP04_EXPAND_DEPTH", "1"))
|
||||
RETRIEVER_EDGE_TYPES: str = os.getenv("MINDNET_WP04_EDGE_TYPES", "references,belongs_to,prev,next")
|
||||
|
||||
@lru_cache
|
||||
def get_settings() -> Settings:
|
||||
"""Gibt die zentralen Einstellungen als Singleton zurück."""
|
||||
return Settings()
|
||||
|
|
@ -1,13 +0,0 @@
|
|||
TYPE_SIZES = {
|
||||
"thought": {"target": (150, 250), "max": 300, "overlap": (30, 40)},
|
||||
"experience":{"target": (250, 350), "max": 450, "overlap": (40, 60)},
|
||||
"journal": {"target": (200, 300), "max": 400, "overlap": (30, 50)},
|
||||
"task": {"target": (120, 200), "max": 250, "overlap": (20, 30)},
|
||||
"project": {"target": (300, 450), "max": 600, "overlap": (50, 70)},
|
||||
"concept": {"target": (250, 400), "max": 550, "overlap": (40, 60)},
|
||||
"source": {"target": (200, 350), "max": 500, "overlap": (30, 50)},
|
||||
}
|
||||
DEFAULT = {"target": (250, 350), "max": 500, "overlap": (40, 60)}
|
||||
|
||||
def get_sizes(note_type: str):
|
||||
return TYPE_SIZES.get(str(note_type).lower(), DEFAULT)
|
||||
|
|
@ -1,253 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Modul: app/core/chunk_payload.py
|
||||
Version: 2.0.0
|
||||
Datum: 2025-09-09
|
||||
|
||||
Kurzbeschreibung
|
||||
----------------
|
||||
Erzeugt **Chunk-Payloads** für die Qdrant-Collection `<prefix>_chunks` auf Basis der
|
||||
vom Chunker gelieferten `Chunk`-Objekte. Ziel:
|
||||
- *Verlustfreie Rekonstruktion*: Jeder Chunk enthält seinen **Text** (payload["text"]).
|
||||
- *Schnelle Abfragen*: Wichtige Note-Metadaten werden gespiegelt, um Filter ohne Join zu erlauben.
|
||||
- *Graph-Kompatibilität*: Wikilinks und externe Links werden extrahiert; Nachbarschaften werden übernommen.
|
||||
- *Monitoring*: Token- und Längenmetriken sowie Text-Hash erleichtern Audits und Re-Embeddings.
|
||||
|
||||
Kompatibilität
|
||||
--------------
|
||||
- **Abwärtskompatibel** zur bisherigen `make_chunk_payloads`-Signatur.
|
||||
- Zusätzliche Felder stören bestehende Upserts nicht (Payload ist schema-flexibel).
|
||||
- Erwartet, dass `Chunk` u. a. die Attribute `id`, `index`, `text`, `char_start`, `char_end`,
|
||||
`section_title`, `section_path`, `neighbors_prev`, `neighbors_next` liefert.
|
||||
|
||||
CLI (Schnelltest)
|
||||
-----------------
|
||||
# Preview aus einer Markdown-Datei
|
||||
python3 -m app.core.chunk_payload --from-file ./test_vault/20_experiences/exp-two.md --vault-root ./test_vault
|
||||
|
||||
# Nur IDs & Tokenmengen
|
||||
python3 -m app.core.chunk_payload --from-file ./test_vault/20_experiences/exp-two.md --vault-root ./test_vault --summary
|
||||
|
||||
Felder (Auszug)
|
||||
---------------
|
||||
id : "<note_id>#cNN"
|
||||
scope : "chunk"
|
||||
note_id : "<note_id>"
|
||||
note_title : str
|
||||
note_type : str
|
||||
note_status : str
|
||||
area, project : optional
|
||||
tags : list[str]
|
||||
note_path : str (relativ, Slashes normalisiert)
|
||||
chunk_index : int
|
||||
section_title : str | None
|
||||
section_path : str | None
|
||||
char_start : int | None
|
||||
char_end : int | None
|
||||
char_len : int
|
||||
token_est : int (≈ len(text)/4)
|
||||
neighbors : {"prev": str|None, "next": str|None}
|
||||
text : str (Chunk-Text, **Pflicht**)
|
||||
text_sha256 : str "sha256:<hex>"
|
||||
lang : optional
|
||||
wikilinks : list[str]
|
||||
external_links : list[{"href": str, "label": str|None}]
|
||||
references : list[{"target_id": str, "kind": "wikilink"}]
|
||||
embed_model : optional (durchreichbar)
|
||||
embed_dim : optional
|
||||
embed_version : optional
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
try:
|
||||
# Paket-Import (normaler Betrieb)
|
||||
from app.core.chunker import Chunk
|
||||
from app.core.parser import extract_wikilinks, read_markdown, normalize_frontmatter, validate_required_frontmatter
|
||||
except Exception: # pragma: no cover
|
||||
# Relativ (lokale Tests)
|
||||
from .chunker import Chunk # type: ignore
|
||||
from .parser import extract_wikilinks, read_markdown, normalize_frontmatter, validate_required_frontmatter # type: ignore
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Utilities
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
RE_MD_LINK = re.compile(r"\[([^\]]*)\]\(([^)\s]+)(?:\s+\"([^\"]+)\")?\)")
|
||||
RE_HTTP_SCHEMES = ("http://", "https://", "mailto:", "obsidian://", "tel:")
|
||||
|
||||
def _estimate_tokens(text: str) -> int:
|
||||
"""Grobe Token-Schätzung (≈ 1 Token pro 4 Zeichen)."""
|
||||
return max(0, int(round(len((text or '').strip()) / 4)))
|
||||
|
||||
def _sha256_text(text: str) -> str:
|
||||
h = hashlib.sha256()
|
||||
h.update((text or "").encode("utf-8"))
|
||||
return "sha256:" + h.hexdigest()
|
||||
|
||||
def _normalize_rel_path(path: Optional[str], vault_root: Optional[str]) -> Optional[str]:
|
||||
if not path:
|
||||
return None
|
||||
p = str(path)
|
||||
p = p.replace("\\", "/")
|
||||
if vault_root and os.path.isabs(p):
|
||||
try:
|
||||
p = os.path.relpath(p, vault_root)
|
||||
except Exception:
|
||||
pass
|
||||
p = p.replace("\\", "/").lstrip("/")
|
||||
return p
|
||||
|
||||
def _extract_external_links(text: str) -> List[Dict[str, Optional[str]]]:
|
||||
"""Findet Markdown-Links [label](href "title") mit erlaubten Schemes."""
|
||||
out: List[Dict[str, Optional[str]]] = []
|
||||
if not text:
|
||||
return out
|
||||
for m in RE_MD_LINK.finditer(text):
|
||||
label = (m.group(1) or "").strip() or None
|
||||
href = (m.group(2) or "").strip()
|
||||
title = (m.group(3) or "").strip() or None
|
||||
if any(href.startswith(s) for s in RE_HTTP_SCHEMES):
|
||||
out.append({"href": href, "label": label or title})
|
||||
return out
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Public API
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def make_chunk_payloads(note_meta: Dict, path: str, chunks: List[Chunk]) -> List[Dict]:
|
||||
"""
|
||||
Erzeugt Payload-Dicts für alle Chunks einer Note.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
note_meta : Dict
|
||||
Normalisierte Frontmatter der Note (mind.: id, title, type, status, tags, [area, project, lang]).
|
||||
path : str
|
||||
Pfad zur Note (sollte relativ zum Vault sein; wird hier zur Sicherheit normalisiert).
|
||||
chunks : List[Chunk]
|
||||
Vom Chunker erzeugte Chunks.
|
||||
|
||||
Returns
|
||||
-------
|
||||
List[Dict]
|
||||
Liste von Payloads (ein Eintrag pro Chunk).
|
||||
"""
|
||||
res: List[Dict] = []
|
||||
rel_path = _normalize_rel_path(path, vault_root=None)
|
||||
|
||||
for ch in chunks:
|
||||
text: str = getattr(ch, "text", "") or ""
|
||||
wikilinks = extract_wikilinks(text)
|
||||
ext_links = _extract_external_links(text)
|
||||
|
||||
payload: Dict = {
|
||||
"id": getattr(ch, "id", None),
|
||||
"scope": "chunk",
|
||||
"note_id": note_meta.get("id"),
|
||||
"note_title": note_meta.get("title"),
|
||||
# gespiegelt für schnelle Filter:
|
||||
"note_type": note_meta.get("type"),
|
||||
"note_status": note_meta.get("status"),
|
||||
"area": note_meta.get("area"),
|
||||
"project": note_meta.get("project"),
|
||||
"tags": note_meta.get("tags"),
|
||||
# Pfad
|
||||
"note_path": rel_path,
|
||||
"path": rel_path, # Back-compat
|
||||
# Reihenfolge & Section
|
||||
"chunk_index": getattr(ch, "index", None),
|
||||
"section_title": getattr(ch, "section_title", None),
|
||||
"section_path": getattr(ch, "section_path", None),
|
||||
# Position
|
||||
"char_start": getattr(ch, "char_start", None),
|
||||
"char_end": getattr(ch, "char_end", None),
|
||||
"char_len": max(0, int(getattr(ch, "char_end", 0) or 0) - int(getattr(ch, "char_start", 0) or 0)) or len(text),
|
||||
# Nachbarn
|
||||
"neighbors": {
|
||||
"prev": getattr(ch, "neighbors_prev", None),
|
||||
"next": getattr(ch, "neighbors_next", None),
|
||||
},
|
||||
# Inhalt & Metrik
|
||||
"text": text,
|
||||
"text_sha256": _sha256_text(text),
|
||||
"token_est": _estimate_tokens(text),
|
||||
# Sprache
|
||||
"lang": note_meta.get("lang"),
|
||||
# Links
|
||||
"wikilinks": wikilinks,
|
||||
"external_links": ext_links,
|
||||
"references": [{"target_id": w, "kind": "wikilink"} for w in wikilinks],
|
||||
}
|
||||
|
||||
# Entferne Nones/Leeres, aber **text** bleibt (darf leer sein z. B. bei Bild-Only-Chunks)
|
||||
cleaned = {}
|
||||
for k, v in payload.items():
|
||||
if v in (None, [], {}):
|
||||
# immer behalten:
|
||||
if k in ("text", "neighbors"):
|
||||
cleaned[k] = v
|
||||
else:
|
||||
continue
|
||||
else:
|
||||
cleaned[k] = v
|
||||
|
||||
res.append(cleaned)
|
||||
|
||||
return res
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# CLI zum schnellen Testen
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _cli() -> None:
|
||||
ap = argparse.ArgumentParser(description="Chunk-Payloads aus einer einzelnen Markdown-Datei erzeugen")
|
||||
ap.add_argument("--from-file", required=True, help="Pfad zur Markdown-Datei")
|
||||
ap.add_argument("--vault-root", default=None, help="Vault-Wurzel (zur Pfad-Relativierung)")
|
||||
ap.add_argument("--summary", action="store_true", help="Nur kurze Übersicht je Chunk ausgeben")
|
||||
args = ap.parse_args()
|
||||
|
||||
parsed = read_markdown(args.from_file)
|
||||
fm = normalize_frontmatter(parsed.frontmatter)
|
||||
validate_required_frontmatter(fm)
|
||||
|
||||
# lazy import, um Zyklen zu vermeiden
|
||||
try:
|
||||
from app.core.chunker import assemble_chunks
|
||||
except Exception:
|
||||
from .chunker import assemble_chunks # type: ignore
|
||||
|
||||
chunks = assemble_chunks(fm["id"], parsed.body, fm.get("type", "concept"))
|
||||
rel = _normalize_rel_path(parsed.path, args.vault_root)
|
||||
|
||||
pls = make_chunk_payloads(fm, rel or parsed.path, chunks)
|
||||
|
||||
if args.summary:
|
||||
out = []
|
||||
for p in pls:
|
||||
out.append({
|
||||
"id": p.get("id"),
|
||||
"chunk_index": p.get("chunk_index"),
|
||||
"token_est": p.get("token_est"),
|
||||
"wikilinks": p.get("wikilinks"),
|
||||
"ext_links": [e.get("href") for e in p.get("external_links", [])],
|
||||
"prev": (p.get("neighbors") or {}).get("prev"),
|
||||
"next": (p.get("neighbors") or {}).get("next"),
|
||||
})
|
||||
print(json.dumps(out, ensure_ascii=False, indent=2))
|
||||
else:
|
||||
print(json.dumps(pls, ensure_ascii=False, indent=2))
|
||||
|
||||
|
||||
if __name__ == "__main__": # pragma: no cover
|
||||
_cli()
|
||||
|
|
@ -1,226 +0,0 @@
|
|||
from __future__ import annotations
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Dict, Optional, Tuple
|
||||
import re
|
||||
import math
|
||||
from markdown_it import MarkdownIt
|
||||
from markdown_it.token import Token
|
||||
from .chunk_config import get_sizes
|
||||
|
||||
# --- Hilfen ---
|
||||
_SENT_SPLIT = re.compile(r'(?<=[.!?])\s+(?=[A-ZÄÖÜ0-9„(])')
|
||||
_WS = re.compile(r'\s+')
|
||||
|
||||
def estimate_tokens(text: str) -> int:
|
||||
# leichte Approximation: 1 Token ≈ 4 Zeichen; robust + schnell
|
||||
t = len(text.strip())
|
||||
return max(1, math.ceil(t / 4))
|
||||
|
||||
def split_sentences(text: str) -> list[str]:
|
||||
text = _WS.sub(' ', text.strip())
|
||||
if not text:
|
||||
return []
|
||||
parts = _SENT_SPLIT.split(text)
|
||||
return [p.strip() for p in parts if p.strip()]
|
||||
|
||||
@dataclass
|
||||
class RawBlock:
|
||||
kind: str # "heading" | "paragraph" | "list" | "code" | "table" | "thematic_break" | "blockquote"
|
||||
text: str
|
||||
level: Optional[int] # heading level (2,3,...) or None
|
||||
section_path: str # e.g., "/H2 Title/H3 Subtitle"
|
||||
|
||||
@dataclass
|
||||
class Chunk:
|
||||
id: str
|
||||
note_id: str
|
||||
index: int
|
||||
text: str
|
||||
token_count: int
|
||||
section_title: Optional[str]
|
||||
section_path: str
|
||||
neighbors_prev: Optional[str]
|
||||
neighbors_next: Optional[str]
|
||||
char_start: int
|
||||
char_end: int
|
||||
|
||||
# --- Markdown zu RawBlocks: H2/H3 als Sections, andere Blöcke gruppiert ---
|
||||
def parse_blocks(md_text: str) -> List[RawBlock]:
|
||||
md = MarkdownIt("commonmark").enable("table")
|
||||
tokens: List[Token] = md.parse(md_text)
|
||||
|
||||
blocks: List[RawBlock] = []
|
||||
h2, h3 = None, None
|
||||
section_path = "/"
|
||||
cur_text = []
|
||||
cur_kind = None
|
||||
|
||||
def push(kind: str, txt: str, lvl: Optional[int]):
|
||||
nonlocal section_path
|
||||
txt = txt.strip()
|
||||
if not txt:
|
||||
return
|
||||
title = None
|
||||
if kind == "heading" and lvl:
|
||||
title = txt
|
||||
blocks.append(RawBlock(kind=kind, text=txt, level=lvl, section_path=section_path))
|
||||
|
||||
i = 0
|
||||
while i < len(tokens):
|
||||
t = tokens[i]
|
||||
if t.type == "heading_open":
|
||||
lvl = int(t.tag[1])
|
||||
# Sammle heading inline
|
||||
i += 1
|
||||
title_txt = ""
|
||||
while i < len(tokens) and tokens[i].type != "heading_close":
|
||||
if tokens[i].type == "inline":
|
||||
title_txt += tokens[i].content
|
||||
i += 1
|
||||
title_txt = title_txt.strip()
|
||||
# Section-Pfad aktualisieren
|
||||
if lvl == 2:
|
||||
h2, h3 = title_txt, None
|
||||
section_path = f"/{h2}"
|
||||
elif lvl == 3:
|
||||
h3 = title_txt
|
||||
section_path = f"/{h2}/{h3}" if h2 else f"/{h3}"
|
||||
push("heading", title_txt, lvl)
|
||||
elif t.type in ("paragraph_open", "bullet_list_open", "ordered_list_open",
|
||||
"fence", "code_block", "blockquote_open", "table_open", "hr"):
|
||||
kind = {
|
||||
"paragraph_open": "paragraph",
|
||||
"bullet_list_open": "list",
|
||||
"ordered_list_open": "list",
|
||||
"fence": "code",
|
||||
"code_block": "code",
|
||||
"blockquote_open": "blockquote",
|
||||
"table_open": "table",
|
||||
"hr": "thematic_break",
|
||||
}[t.type]
|
||||
|
||||
if t.type in ("fence", "code_block"):
|
||||
# Codeblock hat eigenen content im selben Token
|
||||
content = t.content or ""
|
||||
push(kind, content, None)
|
||||
else:
|
||||
# inline sammeln bis close
|
||||
content = ""
|
||||
i += 1
|
||||
depth = 1
|
||||
while i < len(tokens) and depth > 0:
|
||||
tk = tokens[i]
|
||||
if tk.type.endswith("_open"):
|
||||
depth += 1
|
||||
elif tk.type.endswith("_close"):
|
||||
depth -= 1
|
||||
elif tk.type == "inline":
|
||||
content += tk.content
|
||||
i += 1
|
||||
push(kind, content, None)
|
||||
continue # wir sind schon auf nächstem Token
|
||||
i += 1
|
||||
|
||||
return blocks
|
||||
|
||||
def assemble_chunks(note_id: str, md_text: str, note_type: str) -> List[Chunk]:
|
||||
sizes = get_sizes(note_type)
|
||||
target = sum(sizes["target"]) // 2 # mittlerer Zielwert
|
||||
max_tokens = sizes["max"]
|
||||
ov_min, ov_max = sizes["overlap"]
|
||||
overlap = (ov_min + ov_max) // 2
|
||||
|
||||
blocks = parse_blocks(md_text)
|
||||
|
||||
chunks: List[Chunk] = []
|
||||
buf: List[Tuple[str, str, str]] = [] # (text, section_title, section_path)
|
||||
char_pos = 0
|
||||
|
||||
def flush_buffer(force=False):
|
||||
nonlocal buf, chunks, char_pos
|
||||
if not buf:
|
||||
return
|
||||
text = "\n\n".join([b[0] for b in buf]).strip()
|
||||
if not text:
|
||||
buf = []
|
||||
return
|
||||
|
||||
# Wenn zu groß, satzbasiert weich umbrechen
|
||||
toks = estimate_tokens(text)
|
||||
if toks > max_tokens:
|
||||
sentences = split_sentences(text)
|
||||
cur = []
|
||||
cur_tokens = 0
|
||||
for s in sentences:
|
||||
st = estimate_tokens(s)
|
||||
if cur_tokens + st > target and cur:
|
||||
_emit("\n".join(cur))
|
||||
# Overlap: letzte Sätze wiederverwenden
|
||||
ov_text = " ".join(cur)[-overlap*4:] # 4 chars/token Heuristik
|
||||
cur = [ov_text, s] if ov_text else [s]
|
||||
cur_tokens = estimate_tokens(" ".join(cur))
|
||||
else:
|
||||
cur.append(s)
|
||||
cur_tokens += st
|
||||
if cur:
|
||||
_emit("\n".join(cur))
|
||||
else:
|
||||
_emit(text)
|
||||
buf = []
|
||||
|
||||
def _emit(text_block: str):
|
||||
nonlocal chunks, char_pos
|
||||
idx = len(chunks)
|
||||
chunk_id = f"{note_id}#c{idx:02d}"
|
||||
token_count = estimate_tokens(text_block)
|
||||
# section aus letztem buffer-entry ableiten
|
||||
sec_title = buf[-1][1] if buf else None
|
||||
sec_path = buf[-1][2] if buf else "/"
|
||||
start = char_pos
|
||||
end = start + len(text_block)
|
||||
chunks.append(Chunk(
|
||||
id=chunk_id,
|
||||
note_id=note_id,
|
||||
index=idx,
|
||||
text=text_block,
|
||||
token_count=token_count,
|
||||
section_title=sec_title,
|
||||
section_path=sec_path,
|
||||
neighbors_prev=None,
|
||||
neighbors_next=None,
|
||||
char_start=start,
|
||||
char_end=end
|
||||
))
|
||||
char_pos = end + 1
|
||||
|
||||
# Blocks in Puffer sammeln; bei Überschreiten Zielbereich flushen
|
||||
cur_sec_title = None
|
||||
for b in blocks:
|
||||
if b.kind == "heading" and b.level in (2, 3):
|
||||
# Sectionwechsel ⇒ Buffer flushen
|
||||
flush_buffer()
|
||||
cur_sec_title = b.text.strip()
|
||||
# Heading selbst nicht als Chunk, aber als Kontexttitel nutzen
|
||||
continue
|
||||
|
||||
txt = b.text.strip()
|
||||
if not txt:
|
||||
continue
|
||||
|
||||
tentative = "\n\n".join([*(x[0] for x in buf), txt]).strip()
|
||||
if estimate_tokens(tentative) > max(get_sizes(note_type)["target"]):
|
||||
# weicher Schnitt vor Hinzufügen
|
||||
flush_buffer()
|
||||
buf.append((txt, cur_sec_title, b.section_path))
|
||||
|
||||
# bei Erreichen ~Target flushen
|
||||
if estimate_tokens("\n\n".join([x[0] for x in buf])) >= target:
|
||||
flush_buffer()
|
||||
|
||||
flush_buffer(force=True)
|
||||
|
||||
# neighbors setzen
|
||||
for i, ch in enumerate(chunks):
|
||||
ch.neighbors_prev = chunks[i-1].id if i > 0 else None
|
||||
ch.neighbors_next = chunks[i+1].id if i < len(chunks)-1 else None
|
||||
return chunks
|
||||
10
app/core/chunking/__init__.py
Normal file
10
app/core/chunking/__init__.py
Normal file
|
|
@ -0,0 +1,10 @@
|
|||
"""
|
||||
FILE: app/core/chunking/__init__.py
|
||||
DESCRIPTION: Package-Einstiegspunkt für Chunking. Exportiert assemble_chunks.
|
||||
VERSION: 3.3.0
|
||||
"""
|
||||
from .chunking_processor import assemble_chunks
|
||||
from .chunking_utils import get_chunk_config, extract_frontmatter_from_text
|
||||
from .chunking_models import Chunk
|
||||
|
||||
__all__ = ["assemble_chunks", "get_chunk_config", "extract_frontmatter_from_text", "Chunk"]
|
||||
33
app/core/chunking/chunking_models.py
Normal file
33
app/core/chunking/chunking_models.py
Normal file
|
|
@ -0,0 +1,33 @@
|
|||
"""
|
||||
FILE: app/core/chunking/chunking_models.py
|
||||
DESCRIPTION: Datenklassen für das Chunking-System.
|
||||
"""
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List, Dict, Optional, Any
|
||||
|
||||
@dataclass
|
||||
class RawBlock:
|
||||
"""Repräsentiert einen logischen Block aus dem Markdown-Parsing."""
|
||||
kind: str
|
||||
text: str
|
||||
level: Optional[int]
|
||||
section_path: str
|
||||
section_title: Optional[str]
|
||||
exclude_from_chunking: bool = False # WP-24c v4.2.0: Flag für Edge-Zonen, die nicht gechunkt werden sollen
|
||||
is_meta_content: bool = False # WP-24c v4.2.6: Flag für Meta-Content (Callouts), der später entfernt wird
|
||||
|
||||
@dataclass
|
||||
class Chunk:
|
||||
"""Das finale Chunk-Objekt für Embedding und Graph-Speicherung."""
|
||||
id: str
|
||||
note_id: str
|
||||
index: int
|
||||
text: str
|
||||
window: str
|
||||
token_count: int
|
||||
section_title: Optional[str]
|
||||
section_path: str
|
||||
neighbors_prev: Optional[str]
|
||||
neighbors_next: Optional[str]
|
||||
candidate_pool: List[Dict[str, Any]] = field(default_factory=list)
|
||||
suggested_edges: Optional[List[str]] = None
|
||||
251
app/core/chunking/chunking_parser.py
Normal file
251
app/core/chunking/chunking_parser.py
Normal file
|
|
@ -0,0 +1,251 @@
|
|||
"""
|
||||
FILE: app/core/chunking/chunking_parser.py
|
||||
DESCRIPTION: Zerlegt Markdown in logische Einheiten (RawBlocks).
|
||||
Hält alle Überschriftenebenen (H1-H6) im Stream.
|
||||
Stellt die Funktion parse_edges_robust zur Verfügung.
|
||||
WP-24c v4.2.0: Identifiziert Edge-Zonen und markiert sie für Chunking-Ausschluss.
|
||||
WP-24c v4.2.5: Callout-Exclusion - Callouts werden als separate RawBlocks identifiziert und ausgeschlossen.
|
||||
"""
|
||||
import re
|
||||
import os
|
||||
from typing import List, Tuple, Set, Dict, Any, Optional
|
||||
from .chunking_models import RawBlock
|
||||
from .chunking_utils import extract_frontmatter_from_text
|
||||
|
||||
_WS = re.compile(r'\s+')
|
||||
_SENT_SPLIT = re.compile(r'(?<=[.!?])\s+(?=[A-ZÄÖÜ0-9„(])')
|
||||
|
||||
def split_sentences(text: str) -> list[str]:
|
||||
"""Teilt Text in Sätze auf unter Berücksichtigung deutscher Interpunktion."""
|
||||
text = _WS.sub(' ', text.strip())
|
||||
if not text: return []
|
||||
# Splittet bei Punkt, Ausrufezeichen oder Fragezeichen, gefolgt von Leerzeichen und Großbuchstabe
|
||||
return [p.strip() for p in _SENT_SPLIT.split(text) if p.strip()]
|
||||
|
||||
def parse_blocks(md_text: str) -> Tuple[List[RawBlock], str]:
|
||||
"""
|
||||
Zerlegt Text in logische Einheiten (RawBlocks), inklusive H1-H6.
|
||||
WP-24c v4.2.0: Identifiziert Edge-Zonen (LLM-Validierung & Note-Scope) und markiert sie für Chunking-Ausschluss.
|
||||
WP-24c v4.2.6: Callouts werden mit is_meta_content=True markiert (werden gechunkt, aber später entfernt).
|
||||
"""
|
||||
blocks = []
|
||||
h1_title = "Dokument"
|
||||
section_path = "/"
|
||||
current_section_title = None
|
||||
|
||||
# Frontmatter entfernen
|
||||
fm, text_without_fm = extract_frontmatter_from_text(md_text)
|
||||
|
||||
# WP-24c v4.2.0: Konfigurierbare Header-Namen und -Ebenen
|
||||
llm_validation_headers = os.getenv(
|
||||
"MINDNET_LLM_VALIDATION_HEADERS",
|
||||
"Unzugeordnete Kanten,Edge Pool,Candidates"
|
||||
)
|
||||
llm_validation_header_list = [h.strip() for h in llm_validation_headers.split(",") if h.strip()]
|
||||
if not llm_validation_header_list:
|
||||
llm_validation_header_list = ["Unzugeordnete Kanten", "Edge Pool", "Candidates"]
|
||||
|
||||
note_scope_headers = os.getenv(
|
||||
"MINDNET_NOTE_SCOPE_ZONE_HEADERS",
|
||||
"Smart Edges,Relationen,Global Links,Note-Level Relations,Globale Verbindungen"
|
||||
)
|
||||
note_scope_header_list = [h.strip() for h in note_scope_headers.split(",") if h.strip()]
|
||||
if not note_scope_header_list:
|
||||
note_scope_header_list = ["Smart Edges", "Relationen", "Global Links", "Note-Level Relations", "Globale Verbindungen"]
|
||||
|
||||
# Header-Ebenen konfigurierbar (Default: LLM=3, Note-Scope=2)
|
||||
llm_validation_level = int(os.getenv("MINDNET_LLM_VALIDATION_HEADER_LEVEL", "3"))
|
||||
note_scope_level = int(os.getenv("MINDNET_NOTE_SCOPE_HEADER_LEVEL", "2"))
|
||||
|
||||
# Status-Tracking für Edge-Zonen
|
||||
in_exclusion_zone = False
|
||||
exclusion_zone_type = None # "llm_validation" oder "note_scope"
|
||||
|
||||
# H1 für Note-Titel extrahieren (Metadaten-Zweck)
|
||||
h1_match = re.search(r'^#\s+(.*)', text_without_fm, re.MULTILINE)
|
||||
if h1_match:
|
||||
h1_title = h1_match.group(1).strip()
|
||||
|
||||
lines = text_without_fm.split('\n')
|
||||
buffer = []
|
||||
|
||||
# WP-24c v4.2.5: Callout-Erkennung (auch verschachtelt: >>)
|
||||
# Regex für Callouts: >\s*[!edge] oder >\s*[!abstract] (auch mit mehreren >)
|
||||
callout_pattern = re.compile(r'^\s*>{1,}\s*\[!(edge|abstract)\]', re.IGNORECASE)
|
||||
|
||||
# WP-24c v4.2.5: Markiere verarbeitete Zeilen, um sie zu überspringen
|
||||
processed_indices = set()
|
||||
|
||||
for i, line in enumerate(lines):
|
||||
if i in processed_indices:
|
||||
continue
|
||||
|
||||
stripped = line.strip()
|
||||
|
||||
# WP-24c v4.2.5: Callout-Erkennung (VOR Heading-Erkennung)
|
||||
# Prüfe, ob diese Zeile ein Callout startet
|
||||
callout_match = callout_pattern.match(line)
|
||||
if callout_match:
|
||||
# Vorherigen Text-Block abschließen
|
||||
if buffer:
|
||||
content = "\n".join(buffer).strip()
|
||||
if content:
|
||||
blocks.append(RawBlock(
|
||||
"paragraph", content, None, section_path, current_section_title,
|
||||
exclude_from_chunking=in_exclusion_zone
|
||||
))
|
||||
buffer = []
|
||||
|
||||
# Sammle alle Zeilen des Callout-Blocks
|
||||
callout_lines = [line]
|
||||
leading_gt_count = len(line) - len(line.lstrip('>'))
|
||||
processed_indices.add(i)
|
||||
|
||||
# Sammle alle Zeilen, die zum Callout gehören (gleiche oder höhere Einrückung)
|
||||
j = i + 1
|
||||
while j < len(lines):
|
||||
next_line = lines[j]
|
||||
if not next_line.strip().startswith('>'):
|
||||
break
|
||||
next_leading_gt = len(next_line) - len(next_line.lstrip('>'))
|
||||
if next_leading_gt < leading_gt_count:
|
||||
break
|
||||
callout_lines.append(next_line)
|
||||
processed_indices.add(j)
|
||||
j += 1
|
||||
|
||||
# WP-24c v4.2.6: Erstelle Callout-Block mit is_meta_content = True
|
||||
# Callouts werden gechunkt (für Chunk-Attribution), aber später entfernt (Clean-Context)
|
||||
callout_content = "\n".join(callout_lines)
|
||||
blocks.append(RawBlock(
|
||||
"callout", callout_content, None, section_path, current_section_title,
|
||||
exclude_from_chunking=in_exclusion_zone, # Nur Edge-Zonen werden ausgeschlossen
|
||||
is_meta_content=True # WP-24c v4.2.6: Markierung für spätere Entfernung
|
||||
))
|
||||
continue
|
||||
|
||||
# Heading-Erkennung (H1 bis H6)
|
||||
heading_match = re.match(r'^(#{1,6})\s+(.*)', stripped)
|
||||
if heading_match:
|
||||
# Vorherigen Text-Block abschließen
|
||||
if buffer:
|
||||
content = "\n".join(buffer).strip()
|
||||
if content:
|
||||
blocks.append(RawBlock(
|
||||
"paragraph", content, None, section_path, current_section_title,
|
||||
exclude_from_chunking=in_exclusion_zone
|
||||
))
|
||||
buffer = []
|
||||
|
||||
level = len(heading_match.group(1))
|
||||
title = heading_match.group(2).strip()
|
||||
|
||||
# WP-24c v4.2.0: Prüfe, ob dieser Header eine Edge-Zone startet
|
||||
is_llm_validation_zone = (
|
||||
level == llm_validation_level and
|
||||
any(title.lower() == h.lower() for h in llm_validation_header_list)
|
||||
)
|
||||
is_note_scope_zone = (
|
||||
level == note_scope_level and
|
||||
any(title.lower() == h.lower() for h in note_scope_header_list)
|
||||
)
|
||||
|
||||
if is_llm_validation_zone:
|
||||
in_exclusion_zone = True
|
||||
exclusion_zone_type = "llm_validation"
|
||||
elif is_note_scope_zone:
|
||||
in_exclusion_zone = True
|
||||
exclusion_zone_type = "note_scope"
|
||||
elif in_exclusion_zone:
|
||||
# Neuer Header gefunden, der keine Edge-Zone ist -> Zone beendet
|
||||
in_exclusion_zone = False
|
||||
exclusion_zone_type = None
|
||||
|
||||
# Pfad- und Titel-Update für die Metadaten der folgenden Blöcke
|
||||
if level == 1:
|
||||
current_section_title = title; section_path = "/"
|
||||
elif level == 2:
|
||||
current_section_title = title; section_path = f"/{current_section_title}"
|
||||
|
||||
# Die Überschrift selbst als regulären Block hinzufügen (auch markiert, wenn in Zone)
|
||||
blocks.append(RawBlock(
|
||||
"heading", stripped, level, section_path, current_section_title,
|
||||
exclude_from_chunking=in_exclusion_zone
|
||||
))
|
||||
continue
|
||||
|
||||
# Trenner (---) oder Leerzeilen beenden Blöcke, außer innerhalb von Callouts
|
||||
if (not stripped or stripped == "---") and not line.startswith('>'):
|
||||
if buffer:
|
||||
content = "\n".join(buffer).strip()
|
||||
if content:
|
||||
blocks.append(RawBlock(
|
||||
"paragraph", content, None, section_path, current_section_title,
|
||||
exclude_from_chunking=in_exclusion_zone
|
||||
))
|
||||
buffer = []
|
||||
if stripped == "---":
|
||||
blocks.append(RawBlock(
|
||||
"separator", "---", None, section_path, current_section_title,
|
||||
exclude_from_chunking=in_exclusion_zone
|
||||
))
|
||||
else:
|
||||
buffer.append(line)
|
||||
|
||||
if buffer:
|
||||
content = "\n".join(buffer).strip()
|
||||
if content:
|
||||
blocks.append(RawBlock(
|
||||
"paragraph", content, None, section_path, current_section_title,
|
||||
exclude_from_chunking=in_exclusion_zone
|
||||
))
|
||||
|
||||
return blocks, h1_title
|
||||
|
||||
def parse_edges_robust(text: str) -> List[Dict[str, Any]]:
    """
    Extract edge candidates from inline wikilinks and [!edge] callouts.

    WP-24c v4.2.7: returns a list of dicts with an is_callout flag for
    chunk attribution.
    WP-24c v4.2.9 Fix A: current_edge_type survives blank lines so that
    every link inside a callout block is processed.

    Args:
        text: Markdown text of a chunk or pool zone.

    Returns:
        List[Dict] with keys: "edge" (str: "kind:target"), "is_callout" (bool).
    """
    # Fix: annotate with typing.Any — the builtin `any` is a function, not a type.
    found_edges: List[Dict[str, Any]] = []

    # 1. Inline wikilinks of the form [[rel:kind|target]]
    inlines = re.findall(r'\[\[rel:([^\|\]]+)\|?([^\]]*)\]\]', text)
    for kind, target in inlines:
        k = kind.strip().lower()
        t = target.strip()
        if k and t:
            found_edges.append({"edge": f"{k}:{t}", "is_callout": False})

    # 2. Callout edges: "> [!edge] kind"
    lines = text.split('\n')
    current_edge_type = None
    for line in lines:
        stripped = line.strip()
        callout_match = re.match(r'>+\s*\[!edge\]\s*([^:\s]+)', stripped)
        if callout_match:
            current_edge_type = callout_match.group(1).strip().lower()
            # Links on the callout's own line (skip [[rel:...]] — already
            # captured by the inline pass above).
            links = re.findall(r'\[\[([^\]]+)\]\]', stripped)
            for l in links:
                if "rel:" not in l:
                    found_edges.append({"edge": f"{current_edge_type}:{l}", "is_callout": True})
            continue
        # Continuation lines of the callout block.
        # WP-24c v4.2.9 Fix A: current_edge_type is kept across blank lines
        # within a callout block so all links are processed.
        if current_edge_type and stripped.startswith('>'):
            links = re.findall(r'\[\[([^\]]+)\]\]', stripped)
            for l in links:
                if "rel:" not in l:
                    found_edges.append({"edge": f"{current_edge_type}:{l}", "is_callout": True})
        elif current_edge_type and not stripped.startswith('>') and stripped:
            # A non-callout line with content terminates the callout block.
            current_edge_type = None
        # Blank lines are ignored — current_edge_type survives them.
    return found_edges
|
||||
204
app/core/chunking/chunking_processor.py
Normal file
204
app/core/chunking/chunking_processor.py
Normal file
|
|
@ -0,0 +1,204 @@
|
|||
"""
|
||||
FILE: app/core/chunking/chunking_processor.py
|
||||
DESCRIPTION: Der zentrale Orchestrator für das Chunking-System.
|
||||
AUDIT v3.3.4: Wiederherstellung der "Gold-Standard" Qualität.
|
||||
- Fix: Synchronisierung der Parameter (context_prefix) für alle Strategien.
|
||||
- Integriert physikalische Kanten-Injektion (Propagierung).
|
||||
- Stellt H1-Kontext-Fenster sicher.
|
||||
- Baut den Candidate-Pool für die WP-15b Ingestion auf.
|
||||
WP-24c v4.2.0: Konfigurierbare Header-Namen für LLM-Validierung.
|
||||
WP-24c v4.2.5: Wiederherstellung der Chunking-Präzision
|
||||
- Frontmatter-Override für chunking_profile
|
||||
- Callout-Exclusion aus Chunks
|
||||
- Strict-Mode ohne Carry-Over
|
||||
WP-24c v4.2.6: Finale Härtung - "Semantic First, Clean Second"
|
||||
- Callouts werden gechunkt (Chunk-Attribution), aber später entfernt (Clean-Context)
|
||||
- remove_callouts_from_text erst nach propagate_section_edges und Candidate Pool
|
||||
WP-24c v4.2.7: Wiederherstellung der Chunk-Attribution
|
||||
- Callout-Kanten erhalten explicit:callout Provenance im candidate_pool
|
||||
- graph_derive_edges.py erkennt diese und verhindert Note-Scope Duplikate
|
||||
"""
|
||||
import asyncio
|
||||
import re
|
||||
import os
|
||||
import logging
|
||||
from typing import List, Dict, Optional
|
||||
from .chunking_models import Chunk
|
||||
from .chunking_utils import get_chunk_config, extract_frontmatter_from_text
|
||||
from .chunking_parser import parse_blocks, parse_edges_robust
|
||||
from .chunking_strategies import strategy_sliding_window, strategy_by_heading
|
||||
from .chunking_propagation import propagate_section_edges
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
async def assemble_chunks(note_id: str, md_text: str, note_type: str, config: Optional[Dict] = None) -> List[Chunk]:
    """
    Main entry point for splitting a note into chunks.

    Combines the splitting strategies with physical context enrichment
    (edge propagation, candidate pool, clean-context callout removal).
    WP-24c v4.2.5: honors a frontmatter override for chunking_profile.

    Args:
        note_id: Id of the note; used for chunk ids.
        md_text: Full markdown text including frontmatter.
        note_type: Note type used to resolve the chunking profile.
        config: Optional pre-resolved chunking config; resolved from
            note_type/frontmatter when None.

    Returns:
        The list of enriched, neighbor-linked Chunk objects (empty if the
        note produced no chunks).
    """
    # 1. WP-24c v4.2.5: extract frontmatter BEFORE config resolution (override).
    fm, body_text = extract_frontmatter_from_text(md_text)

    # 2. Configuration with frontmatter override.
    if config is None:
        config = get_chunk_config(note_type, frontmatter=fm)

    blocks, doc_title = parse_blocks(md_text)

    # WP-24c v4.2.6: filter ONLY exclusion zones (LLM validation & note scope).
    # Callouts must pass through so chunk attribution is preserved.
    blocks_for_chunking = [b for b in blocks if not getattr(b, 'exclude_from_chunking', False)]

    # H1 prefix for the embedding windows (breadcrumbs).
    h1_prefix = f"# {doc_title}" if doc_title else ""

    # 3. Apply the splitting strategy. All strategies share context_prefix
    # for window construction.
    if config.get("strategy") == "by_heading":
        chunks = await asyncio.to_thread(
            strategy_by_heading, blocks_for_chunking, config, note_id, context_prefix=h1_prefix
        )
    else:
        chunks = await asyncio.to_thread(
            strategy_sliding_window, blocks_for_chunking, config, note_id, context_prefix=h1_prefix
        )

    if not chunks:
        return []

    # 4. Physical context enrichment on the original text incl. callouts
    # (WP-24c v4.2.6) — writes section edges hard into the text for Qdrant.
    chunks = propagate_section_edges(chunks)

    # 5. WP-15b: candidate pool construction (metadata for the IngestionService).
    # WP-24c v4.2.7: callout edges get explicit:callout provenance.
    for idx, ch in enumerate(chunks):
        # Extract from the already (propagation-)enriched text.
        # ch.candidate_pool is initialized as an empty list by the model.
        for edge_info in parse_edges_robust(ch.text):
            edge_str = edge_info["edge"]
            is_callout = edge_info.get("is_callout", False)
            parts = edge_str.split(':', 1)
            if len(parts) == 2:
                k, t = parts
                # WP-24c v4.4.1: provenance must be exactly "explicit:callout".
                provenance = "explicit:callout" if is_callout else "explicit"
                # "to" for compatibility with graph_derive_edges.py;
                # "target_id" additionally for ingestion_processor validation.
                pool_entry = {"kind": k, "to": t, "provenance": provenance}
                if is_callout:
                    pool_entry["target_id"] = t
                ch.candidate_pool.append(pool_entry)

                # WP-24c v4.4.0-DEBUG: interface 1 — extraction logging.
                if is_callout:
                    logger.debug(f"DEBUG-TRACER [Extraction]: Chunk Index: {idx}, Chunk ID: {ch.id}, Kind: {k}, Target: {t}, Provenance: {provenance}, Is_Callout: {is_callout}, Raw_Edge_Str: {edge_str}")

    # 6. Global pool (unassigned edges — may appear anywhere in the document).
    # WP-24c v4.2.0: configurable header names and level via .env.
    llm_validation_headers = os.getenv(
        "MINDNET_LLM_VALIDATION_HEADERS",
        "Unzugeordnete Kanten,Edge Pool,Candidates"
    )
    header_list = [h.strip() for h in llm_validation_headers.split(",") if h.strip()]
    # Fall back to defaults when the env value is empty.
    if not header_list:
        header_list = ["Unzugeordnete Kanten", "Edge Pool", "Candidates"]

    # Header level is configurable (default: 3 for ###).
    llm_validation_level = int(os.getenv("MINDNET_LLM_VALIDATION_HEADER_LEVEL", "3"))
    header_level_pattern = "#" * llm_validation_level

    # A zone ends at the next header (any level) or at the end of the document.
    # FIX: under re.MULTILINE, `$` matches at EVERY line end, so the lazy
    # `(.*?)` previously captured only the first line of each zone. `\Z`
    # anchors to the true end of the document.
    header_pattern = "|".join(re.escape(h) for h in header_list)
    zone_pattern = rf'^{re.escape(header_level_pattern)}\s*(?:{header_pattern})\s*\n(.*?)(?=\n#|\Z)'

    for pool_match in re.finditer(zone_pattern, body_text, re.DOTALL | re.IGNORECASE | re.MULTILINE):
        global_edges = parse_edges_robust(pool_match.group(1))
        for edge_info in global_edges:
            edge_str = edge_info["edge"]
            parts = edge_str.split(':', 1)
            if len(parts) == 2:
                k, t = parts
                # Marked as "global_pool" for the later LLM validation step.
                for ch in chunks:
                    ch.candidate_pool.append({"kind": k, "to": t, "provenance": "global_pool"})

    # 7. De-duplicate the pool (uniqueness over kind, target and provenance).
    for ch in chunks:
        seen = set()
        unique = []
        for c in ch.candidate_pool:
            key = (c["kind"], c["to"], c["provenance"])
            if key not in seen:
                seen.add(key)
                unique.append(c)
        ch.candidate_pool = unique

    # 8. WP-24c v4.2.6: clean context — strip callout syntax from chunk text.
    # IMPORTANT: happens AFTER propagate_section_edges and pool construction
    # so that chunk attribution and edge extraction stay intact.
    # Callouts may span multiple lines (also nested: >>).
    def remove_callouts_from_text(text: str) -> str:
        """Remove all callout blocks (> [!edge] or > [!abstract]) from the text."""
        if not text:
            return text

        lines = text.split('\n')
        cleaned_lines = []
        i = 0

        # WP-24c v4.2.8: callout pattern for edge and abstract.
        callout_start_pattern = re.compile(r'^>\s*\[!(edge|abstract)[^\]]*\]', re.IGNORECASE)

        while i < len(lines):
            line = lines[i]
            callout_match = callout_start_pattern.match(line)

            if callout_match:
                # Callout found: skip every line belonging to the block.
                leading_gt_count = len(line) - len(line.lstrip('>'))
                i += 1

                # Lines belong to the callout while they start with '>' at the
                # same or a deeper nesting level.
                while i < len(lines):
                    next_line = lines[i]
                    if not next_line.strip().startswith('>'):
                        break
                    next_leading_gt = len(next_line) - len(next_line.lstrip('>'))
                    if next_leading_gt < leading_gt_count:
                        break
                    i += 1
            else:
                # Regular line: keep.
                cleaned_lines.append(line)
                i += 1

        # Normalize blank lines (max. 2 consecutive).
        result = '\n'.join(cleaned_lines)
        result = re.sub(r'\n\s*\n\s*\n+', '\n\n', result)
        return result

    for ch in chunks:
        ch.text = remove_callouts_from_text(ch.text)
        if ch.window:
            ch.window = remove_callouts_from_text(ch.window)

    # Link neighbors for graph traversal.
    for i, ch in enumerate(chunks):
        ch.neighbors_prev = chunks[i-1].id if i > 0 else None
        ch.neighbors_next = chunks[i+1].id if i < len(chunks)-1 else None

    return chunks
|
||||
69
app/core/chunking/chunking_propagation.py
Normal file
69
app/core/chunking/chunking_propagation.py
Normal file
|
|
@ -0,0 +1,69 @@
|
|||
"""
|
||||
FILE: app/core/chunking/chunking_propagation.py
|
||||
DESCRIPTION: Injiziert Sektions-Kanten physisch in den Text (Embedding-Enrichment).
|
||||
Fix v3.3.6: Nutzt robustes Parsing zur Erkennung vorhandener Kanten,
|
||||
um Dopplungen direkt hinter [!edge] Callouts format-agnostisch zu verhindern.
|
||||
"""
|
||||
from typing import List, Dict, Set
|
||||
from .chunking_models import Chunk
|
||||
from .chunking_parser import parse_edges_robust
|
||||
|
||||
def propagate_section_edges(chunks: List[Chunk]) -> List[Chunk]:
    """
    Aggregate explicit edges per section and write them physically into the
    text and window of every chunk of that section.

    Edges a chunk already contains (e.g. via [!edge] callouts) are skipped,
    preventing duplicates regardless of their original format.
    """
    # Phase 1: collect all "kind:target" strings per section path.
    section_map: Dict[str, Set[str]] = {}
    for chunk in chunks:
        path = chunk.section_path
        # The root level "/" is too global — only specific chapters propagate.
        if not path or path == "/":
            continue
        # Robust parsing (WP-24c v4.2.7: returns a list of dicts).
        collected = {info["edge"] for info in parse_edges_robust(chunk.text)}
        if collected:
            section_map.setdefault(path, set()).update(collected)

    # Phase 2: broadcast missing edges back into each chunk of the section.
    for chunk in chunks:
        edges_to_add = section_map.get(chunk.section_path)
        if not edges_to_add:
            continue

        # Edges already present in THIS chunk (any format) are not re-injected.
        already_present = {info["edge"] for info in parse_edges_robust(chunk.text)}

        # Sorted for deterministic output.
        injections = []
        for edge_str in sorted(edges_to_add):
            if edge_str in already_present:
                continue
            kind, target = edge_str.split(':', 1)
            injections.append(f"[[rel:{kind}|{target}]]")

        if not injections:
            continue

        # Physical enrichment: triple newline keeps the injected edges
        # cleanly separated inside the embedding window.
        block = "\n\n\n" + " ".join(injections)
        chunk.text += block

        # The window is what Qdrant searches — enrich it as well.
        if chunk.window:
            chunk.window += block
        else:
            chunk.window = chunk.text

    return chunks
|
||||
190
app/core/chunking/chunking_strategies.py
Normal file
190
app/core/chunking/chunking_strategies.py
Normal file
|
|
@ -0,0 +1,190 @@
|
|||
"""
|
||||
FILE: app/core/chunking/chunking_strategies.py
|
||||
DESCRIPTION: Strategien für atomares Sektions-Chunking v3.9.9.
|
||||
Implementiert das 'Pack-and-Carry-Over' Verfahren nach Regel 1-3.
|
||||
- Keine redundante Kanten-Injektion.
|
||||
- Strikte Einhaltung von Sektionsgrenzen via Look-Ahead.
|
||||
- Fix: Synchronisierung der Parameter mit dem Orchestrator (context_prefix).
|
||||
WP-24c v4.2.5: Strict-Mode ohne Carry-Over - Bei strict_heading_split wird nach jeder Sektion geflasht.
|
||||
"""
|
||||
from typing import List, Dict, Any, Optional
|
||||
from .chunking_models import RawBlock, Chunk
|
||||
from .chunking_utils import estimate_tokens
|
||||
from .chunking_parser import split_sentences
|
||||
|
||||
def _create_win(context_prefix: str, sec_title: Optional[str], text: str) -> str:
|
||||
"""Baut den Breadcrumb-Kontext für das Embedding-Fenster."""
|
||||
parts = [context_prefix] if context_prefix else []
|
||||
# Verhindert Dopplung, falls der Context-Prefix (H1) bereits den Sektionsnamen enthält
|
||||
if sec_title and f"# {sec_title}" != context_prefix and sec_title not in (context_prefix or ""):
|
||||
parts.append(sec_title)
|
||||
prefix = " > ".join(parts)
|
||||
return f"{prefix}\n{text}".strip() if prefix else text
|
||||
|
||||
def strategy_by_heading(blocks: List[RawBlock], config: Dict[str, Any], note_id: str, context_prefix: str = "") -> List[Chunk]:
    """
    Universal heading strategy with carry-over ('Pack-and-Carry-Over') logic.

    Groups blocks into atomic sections at `split_level`, then packs sections
    into chunks of at most `max` estimated tokens. In hard-split mode
    (WP-24c v4.2.5: strict_heading_split, or smart allocation disabled) every
    section becomes its own chunk without carry-over.

    Args:
        blocks: Parsed raw blocks in document order.
        config: Chunking profile (target, max, split_level, strict_heading_split,
            enable_smart_edge_allocation).
        note_id: Note id used to derive chunk ids.
        context_prefix: H1 breadcrumb prefix for the embedding windows.

    Returns:
        List of Chunk objects (neighbor links are set later by the orchestrator).
    """
    smart_edge = config.get("enable_smart_edge_allocation", True)
    strict = config.get("strict_heading_split", False)
    target = config.get("target", 400)
    max_tokens = config.get("max", 600)
    split_level = config.get("split_level", 2)
    # NOTE: the former `overlap` computation was removed — it was never used.

    chunks: List[Chunk] = []

    def _emit(txt, title, path):
        """Append the finished chunk without modifying its text."""
        idx = len(chunks)
        win = _create_win(context_prefix, title, txt)
        chunks.append(Chunk(
            id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx,
            text=txt, window=win, token_count=estimate_tokens(txt),
            section_title=title, section_path=path, neighbors_prev=None, neighbors_next=None
        ))

    # --- STEP 1: group blocks into atomic section units ---
    sections: List[Dict[str, Any]] = []
    curr_blocks = []
    for b in blocks:
        if b.kind == "heading" and b.level <= split_level:
            if curr_blocks:
                sections.append({
                    "text": "\n\n".join([x.text for x in curr_blocks]),
                    "meta": curr_blocks[0],
                    "is_empty": len(curr_blocks) == 1 and curr_blocks[0].kind == "heading"
                })
            curr_blocks = [b]
        else:
            curr_blocks.append(b)
    if curr_blocks:
        sections.append({
            "text": "\n\n".join([x.text for x in curr_blocks]),
            "meta": curr_blocks[0],
            "is_empty": len(curr_blocks) == 1 and curr_blocks[0].kind == "heading"
        })

    # --- STEP 2: process the queue ---
    queue = list(sections)
    current_chunk_text = ""
    current_meta = {"title": None, "path": "/"}

    # Mode selection: hard split when smart allocation is off OR strict is on.
    is_hard_split_mode = (not smart_edge) or (strict)

    while queue:
        item = queue.pop(0)
        item_text = item["text"]

        # Initialize metadata for a fresh chunk.
        if not current_chunk_text:
            current_meta["title"] = item["meta"].section_title
            current_meta["path"] = item["meta"].section_path

        # CASE A: HARD SPLIT MODE (WP-24c v4.2.5: strict mode, no carry-over)
        if is_hard_split_mode:
            # Flush after EVERY section; no carry-over is allowed here,
            # not even for empty headings.
            if current_chunk_text:
                _emit(current_chunk_text, current_meta["title"], current_meta["path"])
                current_chunk_text = ""

            # New section: initialize metadata.
            current_meta["title"] = item["meta"].section_title
            current_meta["path"] = item["meta"].section_path

            # WP-24c v4.2.5: empty sections (heading only, no content) still
            # become their own chunk.
            if item.get("is_empty", False):
                _emit(item_text, current_meta["title"], current_meta["path"])
            else:
                if estimate_tokens(item_text) > max_tokens:
                    # Section too large: sentence-level packing, still emitted
                    # as separate chunks for this section only.
                    # (The former unused `header_prefix` local was removed.)
                    sents = split_sentences(item_text)

                    take_sents = []; take_len = 0
                    while sents:
                        s = sents.pop(0); slen = estimate_tokens(s)
                        if take_len + slen > target and take_sents:
                            _emit(" ".join(take_sents), current_meta["title"], current_meta["path"])
                            take_sents = [s]; take_len = slen
                        else:
                            take_sents.append(s); take_len += slen

                    if take_sents:
                        _emit(" ".join(take_sents), current_meta["title"], current_meta["path"])
                else:
                    # Section fits: emit directly.
                    _emit(item_text, current_meta["title"], current_meta["path"])

            current_chunk_text = ""
            continue

        # CASE B: SMART MODE (rules 1-3)
        combined_text = (current_chunk_text + "\n\n" + item_text).strip() if current_chunk_text else item_text
        combined_est = estimate_tokens(combined_text)

        if combined_est <= max_tokens:
            # Rules 1 & 2: fits according to the estimate -> absorb.
            current_chunk_text = combined_text
        else:
            if current_chunk_text:
                # Rule 2: flush at the section boundary, requeue the item.
                _emit(current_chunk_text, current_meta["title"], current_meta["path"])
                current_chunk_text = ""
                queue.insert(0, item)
            else:
                # Rule 3: a single oversized section -> sentence-level split.
                sents = split_sentences(item_text)
                header_prefix = item["meta"].text if item["meta"].kind == "heading" else ""

                take_sents = []; take_len = 0
                while sents:
                    s = sents.pop(0); slen = estimate_tokens(s)
                    if take_len + slen > target and take_sents:
                        sents.insert(0, s); break
                    take_sents.append(s); take_len += slen

                _emit(" ".join(take_sents), current_meta["title"], current_meta["path"])

                if sents:
                    remainder = " ".join(sents)
                    # Context preservation: repeat the heading for the rest.
                    if header_prefix and not remainder.startswith(header_prefix):
                        remainder = header_prefix + "\n\n" + remainder
                    # Carry-over: push the remainder to the queue front.
                    queue.insert(0, {"text": remainder, "meta": item["meta"], "is_split": True})

    if current_chunk_text:
        _emit(current_chunk_text, current_meta["title"], current_meta["path"])

    return chunks
|
||||
|
||||
def strategy_sliding_window(blocks: List[RawBlock], config: Dict[str, Any], note_id: str, context_prefix: str = "") -> List[Chunk]:
    """
    Standard sliding-window strategy for flat texts without a section focus.

    Packs consecutive blocks until the estimated token count would exceed
    `max`, then flushes a chunk. (The former unused `target` local was removed.)

    Args:
        blocks: Parsed raw blocks in document order.
        config: Chunking profile; only "max" (default 600) is used here.
        note_id: Note id used to derive chunk ids.
        context_prefix: H1 breadcrumb prefix for the embedding windows.

    Returns:
        List of Chunk objects (neighbors are linked later by the orchestrator).
    """
    max_tokens = config.get("max", 600)
    chunks: List[Chunk] = []
    buf: List[RawBlock] = []

    def _flush(token_count: Optional[int] = None) -> None:
        """Turn the buffered blocks into one chunk; estimate tokens if not given."""
        txt = "\n\n".join([x.text for x in buf])
        idx = len(chunks)
        win = _create_win(context_prefix, buf[0].section_title, txt)
        chunks.append(Chunk(
            id=f"{note_id}#c{idx:02d}", note_id=note_id, index=idx,
            text=txt, window=win,
            token_count=token_count if token_count is not None else estimate_tokens(txt),
            section_title=buf[0].section_title, section_path=buf[0].section_path,
            neighbors_prev=None, neighbors_next=None
        ))

    for b in blocks:
        b_tokens = estimate_tokens(b.text)
        curr_tokens = sum(estimate_tokens(x.text) for x in buf) if buf else 0
        if curr_tokens + b_tokens > max_tokens and buf:
            # Flush with the per-block token sum (matches historical payloads).
            _flush(curr_tokens)
            buf = []
        buf.append(b)

    if buf:
        # Final flush: token count estimated from the joined text.
        _flush()

    return chunks
|
||||
74
app/core/chunking/chunking_utils.py
Normal file
74
app/core/chunking/chunking_utils.py
Normal file
|
|
@ -0,0 +1,74 @@
|
|||
"""
|
||||
FILE: app/core/chunking/chunking_utils.py
|
||||
DESCRIPTION: Hilfswerkzeuge für Token-Schätzung und YAML-Konfiguration.
|
||||
"""
|
||||
import math
|
||||
import yaml
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any, Tuple, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Project root: four levels up from app/core/chunking/chunking_utils.py.
BASE_DIR = Path(__file__).resolve().parent.parent.parent.parent
# Central type/profile configuration file.
CONFIG_PATH = BASE_DIR / "config" / "types.yaml"
# Fallback profile used when types.yaml is missing or has no matching entry.
DEFAULT_PROFILE = {"strategy": "sliding_window", "target": 400, "max": 600, "overlap": (50, 80)}

# Process-wide cache for the parsed YAML config (filled on first successful load).
_CONFIG_CACHE = None
|
||||
|
||||
def load_yaml_config() -> Dict[str, Any]:
    """Load and cache config/types.yaml; degrades to {} on any failure.

    Fix: yaml.safe_load returns None for an empty file (and may return a
    non-dict for scalar/list content). Previously that value was cached and
    returned, and callers crash on the immediate .get() call.
    """
    global _CONFIG_CACHE
    if _CONFIG_CACHE is not None: return _CONFIG_CACHE
    if not CONFIG_PATH.exists(): return {}
    try:
        with open(CONFIG_PATH, "r", encoding="utf-8") as f:
            data = yaml.safe_load(f)
        # Guard: only a mapping is a usable config.
        if not isinstance(data, dict):
            data = {}
        _CONFIG_CACHE = data
        return data
    except Exception: return {}
|
||||
|
||||
def get_chunk_config(note_type: str, frontmatter: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """
    Resolve the chunking strategy for a note type.

    WP-24c v4.2.5: a frontmatter override for chunking_profile has the
    highest priority (frontmatter > type definition > global defaults).

    Args:
        note_type: The note type (e.g. "decision", "experience").
        frontmatter: Optional frontmatter dict; "chunking_profile" (or the
            legacy alias "chunk_profile") overrides everything else.

    Returns:
        A copy of the resolved chunking configuration dict.
    """
    full_config = load_yaml_config()
    profiles = full_config.get("chunking_profiles", {})
    type_def = full_config.get("types", {}).get(note_type.lower(), {})

    # 1. Frontmatter override (highest priority).
    # Fix: the legacy "chunk_profile" alias was previously only consulted when
    # the "chunking_profile" key was also present (dead fallback) — now both
    # keys are honored.
    profile_name = None
    if frontmatter:
        profile_name = frontmatter.get("chunking_profile") or frontmatter.get("chunk_profile")
    # 2. Type definition from types.yaml.
    if not profile_name:
        profile_name = type_def.get("chunking_profile")
    # 3. Global default.
    if not profile_name:
        profile_name = full_config.get("defaults", {}).get("chunking_profile", "sliding_standard")

    # Copy so callers cannot mutate the cached profile definition.
    config = profiles.get(profile_name, DEFAULT_PROFILE).copy()
    if "overlap" in config and isinstance(config["overlap"], list):
        config["overlap"] = tuple(config["overlap"])
    return config
|
||||
|
||||
def estimate_tokens(text: str) -> int:
    """Rough token estimate: ~4 characters per token, never less than 1."""
    char_count = len(text.strip())
    tokens, remainder = divmod(char_count, 4)
    if remainder:
        tokens += 1
    return tokens if tokens > 0 else 1
|
||||
|
||||
def extract_frontmatter_from_text(md_text: str) -> Tuple[Dict[str, Any], str]:
    """Split YAML frontmatter from a markdown text.

    Args:
        md_text: Full markdown text, possibly starting with a ---/--- block.

    Returns:
        (frontmatter, body): the parsed frontmatter dict ({} when absent or
        invalid) and the remaining text, stripped. When no frontmatter is
        found the original text is returned unmodified.
    """
    import re
    fm_match = re.match(r'^\s*---\s*\n(.*?)\n---', md_text, re.DOTALL)
    if not fm_match: return {}, md_text
    try:
        frontmatter = yaml.safe_load(fm_match.group(1))
        if not isinstance(frontmatter, dict): frontmatter = {}
    except Exception: frontmatter = {}
    # Fix: slice at the match end instead of re-running the same regex via
    # re.sub — identical result (the pattern is anchored at the start), one pass.
    return frontmatter, md_text[fm_match.end():].strip()
|
||||
35
app/core/database/__init__.py
Normal file
35
app/core/database/__init__.py
Normal file
|
|
@ -0,0 +1,35 @@
|
|||
"""
|
||||
PACKAGE: app.core.database
|
||||
DESCRIPTION: Zentrale Schnittstelle für alle Datenbank-Operationen (Qdrant).
|
||||
Bündelt Client-Initialisierung und Point-Konvertierung.
|
||||
"""
|
||||
from .qdrant import (
|
||||
QdrantConfig,
|
||||
get_client,
|
||||
ensure_collections,
|
||||
ensure_payload_indexes,
|
||||
collection_names
|
||||
)
|
||||
from .qdrant_points import (
|
||||
points_for_note,
|
||||
points_for_chunks,
|
||||
points_for_edges,
|
||||
upsert_batch,
|
||||
get_edges_for_sources,
|
||||
search_chunks_by_vector
|
||||
)
|
||||
|
||||
# Öffentlicher Export für das Gesamtsystem
|
||||
__all__ = [
|
||||
"QdrantConfig",
|
||||
"get_client",
|
||||
"ensure_collections",
|
||||
"ensure_payload_indexes",
|
||||
"collection_names",
|
||||
"points_for_note",
|
||||
"points_for_chunks",
|
||||
"points_for_edges",
|
||||
"upsert_batch",
|
||||
"get_edges_for_sources",
|
||||
"search_chunks_by_vector"
|
||||
]
|
||||
173
app/core/database/qdrant.py
Normal file
173
app/core/database/qdrant.py
Normal file
|
|
@ -0,0 +1,173 @@
|
|||
"""
|
||||
FILE: app/core/database/qdrant.py
|
||||
DESCRIPTION: Qdrant-Client Factory und Schema-Management.
|
||||
Erstellt Collections und Payload-Indizes.
|
||||
MODULARISIERUNG: Verschoben in das database-Paket für WP-14.
|
||||
VERSION: 2.2.2 (WP-Fix: Index für target_section)
|
||||
STATUS: Active
|
||||
DEPENDENCIES: qdrant_client, dataclasses, os
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import logging
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional, Tuple, Dict, List
|
||||
|
||||
from qdrant_client import QdrantClient
|
||||
from qdrant_client.http import models as rest
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Konfiguration
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@dataclass
class QdrantConfig:
    """Connection and schema settings for the Qdrant backend."""
    host: Optional[str] = None        # used when no URL is given
    port: Optional[int] = None
    url: Optional[str] = None         # takes precedence over host/port
    api_key: Optional[str] = None
    prefix: str = "mindnet"           # collection name prefix
    dim: int = 384                    # embedding vector dimension
    distance: str = "Cosine"          # Cosine | Dot | Euclid
    on_disk_payload: bool = True

    @classmethod
    def from_env(cls) -> "QdrantConfig":
        """Build the configuration from environment variables."""
        env = os.getenv
        # Either URL or host/port; the API key is optional.
        raw_port = env("QDRANT_PORT")
        # WP-24c v4.5.10: COLLECTION_PREFIX wins; MINDNET_PREFIX remains as
        # the backward-compatible fallback.
        return cls(
            host=env("QDRANT_HOST") or None,
            port=int(raw_port) if raw_port else None,
            url=env("QDRANT_URL") or None,
            api_key=env("QDRANT_API_KEY") or None,
            prefix=env("COLLECTION_PREFIX") or env("MINDNET_PREFIX") or "mindnet",
            dim=int(env("VECTOR_DIM") or 384),
            distance=env("DISTANCE", "Cosine"),
            on_disk_payload=(env("ON_DISK_PAYLOAD", "true").lower() == "true"),
        )
|
||||
|
||||
|
||||
def get_client(cfg: QdrantConfig) -> QdrantClient:
    """Build a QdrantClient from the given configuration."""
    # QdrantClient accepts either url=... or host/port; a full URL wins.
    if cfg.url:
        return QdrantClient(url=cfg.url, api_key=cfg.api_key, timeout=60.0)
    return QdrantClient(
        host=cfg.host or "127.0.0.1",
        port=cfg.port or 6333,
        api_key=cfg.api_key,
        timeout=60.0,
    )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Collections
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def collection_names(prefix: str) -> Tuple[str, str, str]:
    """Return the standardized (notes, chunks, edges) collection names."""
    suffixes = ("notes", "chunks", "edges")
    return tuple(f"{prefix}_{suffix}" for suffix in suffixes)
|
||||
|
||||
|
||||
def _vector_params(dim: int, distance: str) -> rest.VectorParams:
    """Create vector parameters for a collection schema.

    Args:
        dim: Vector dimensionality.
        distance: "Cosine" | "Dot" | "Euclid" (case-insensitive); unknown
            values fall back to cosine.
    """
    # Fix: qdrant_client's Distance enum members are upper-case
    # (COSINE/DOT/EUCLID), so the former `distance.capitalize()` lookup never
    # matched and silently forced COSINE for every configured distance.
    dist = getattr(rest.Distance, distance.upper(), rest.Distance.COSINE)
    return rest.VectorParams(size=dim, distance=dist)
|
||||
|
||||
|
||||
def ensure_collections(client: QdrantClient, prefix: str, dim: int) -> None:
|
||||
"""Legt notes, chunks und edges Collections an, falls nicht vorhanden."""
|
||||
notes, chunks, edges = collection_names(prefix)
|
||||
|
||||
# notes
|
||||
if not client.collection_exists(notes):
|
||||
client.create_collection(
|
||||
collection_name=notes,
|
||||
vectors_config=_vector_params(dim, os.getenv("DISTANCE", "Cosine")),
|
||||
on_disk_payload=True,
|
||||
)
|
||||
# chunks
|
||||
if not client.collection_exists(chunks):
|
||||
client.create_collection(
|
||||
collection_name=chunks,
|
||||
vectors_config=_vector_params(dim, os.getenv("DISTANCE", "Cosine")),
|
||||
on_disk_payload=True,
|
||||
)
|
||||
# edges (Dummy-Vektor, da primär via Payload gefiltert wird)
|
||||
if not client.collection_exists(edges):
|
||||
client.create_collection(
|
||||
collection_name=edges,
|
||||
vectors_config=_vector_params(1, "Dot"),
|
||||
on_disk_payload=True,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Payload-Indizes
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _ensure_index(client: QdrantClient, collection: str, field: str, schema: rest.PayloadSchemaType) -> None:
|
||||
"""Idempotentes Anlegen eines Payload-Indexes für ein spezifisches Feld."""
|
||||
try:
|
||||
client.create_payload_index(collection_name=collection, field_name=field, field_schema=schema, wait=True)
|
||||
except Exception as e:
|
||||
# Fehler ignorieren, falls Index bereits existiert
|
||||
logger.debug(f"Index check for {field} in {collection}: {e}")
|
||||
|
||||
|
||||
def ensure_payload_indexes(client: QdrantClient, prefix: str) -> None:
|
||||
"""
|
||||
Stellt sicher, dass alle benötigten Payload-Indizes für die Suche existieren.
|
||||
- notes: note_id, type, title, updated, tags
|
||||
- chunks: note_id, chunk_id, index, type, tags
|
||||
- edges: note_id, kind, scope, source_id, target_id, chunk_id, target_section
|
||||
"""
|
||||
notes, chunks, edges = collection_names(prefix)
|
||||
|
||||
# NOTES
|
||||
for field, schema in [
|
||||
("note_id", rest.PayloadSchemaType.KEYWORD),
|
||||
("type", rest.PayloadSchemaType.KEYWORD),
|
||||
("title", rest.PayloadSchemaType.TEXT),
|
||||
("updated", rest.PayloadSchemaType.INTEGER),
|
||||
("tags", rest.PayloadSchemaType.KEYWORD),
|
||||
]:
|
||||
_ensure_index(client, notes, field, schema)
|
||||
|
||||
# CHUNKS
|
||||
for field, schema in [
|
||||
("note_id", rest.PayloadSchemaType.KEYWORD),
|
||||
("chunk_id", rest.PayloadSchemaType.KEYWORD),
|
||||
("index", rest.PayloadSchemaType.INTEGER),
|
||||
("type", rest.PayloadSchemaType.KEYWORD),
|
||||
("tags", rest.PayloadSchemaType.KEYWORD),
|
||||
]:
|
||||
_ensure_index(client, chunks, field, schema)
|
||||
|
||||
# EDGES
|
||||
for field, schema in [
|
||||
("note_id", rest.PayloadSchemaType.KEYWORD),
|
||||
("kind", rest.PayloadSchemaType.KEYWORD),
|
||||
("scope", rest.PayloadSchemaType.KEYWORD),
|
||||
("source_id", rest.PayloadSchemaType.KEYWORD),
|
||||
("target_id", rest.PayloadSchemaType.KEYWORD),
|
||||
("chunk_id", rest.PayloadSchemaType.KEYWORD),
|
||||
# NEU: Index für Section-Links (WP-15b)
|
||||
("target_section", rest.PayloadSchemaType.KEYWORD),
|
||||
]:
|
||||
_ensure_index(client, edges, field, schema)
|
||||
|
||||
|
||||
__all__ = [
|
||||
"QdrantConfig",
|
||||
"get_client",
|
||||
"ensure_collections",
|
||||
"ensure_payload_indexes",
|
||||
"collection_names",
|
||||
]
|
||||
354
app/core/database/qdrant_points.py
Normal file
354
app/core/database/qdrant_points.py
Normal file
|
|
@ -0,0 +1,354 @@
|
|||
"""
|
||||
FILE: app/core/database/qdrant_points.py
|
||||
DESCRIPTION: Object-Mapper für Qdrant. Konvertiert JSON-Payloads (Notes, Chunks, Edges)
|
||||
in PointStructs und generiert deterministische UUIDs.
|
||||
VERSION: 4.1.0 (WP-24c: Gold-Standard Identity v4.1.0 - target_section Support)
|
||||
STATUS: Active
|
||||
DEPENDENCIES: qdrant_client, uuid, os, app.core.graph.graph_utils
|
||||
LAST_ANALYSIS: 2026-01-10
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import os
|
||||
import uuid
|
||||
from typing import List, Tuple, Iterable, Optional, Dict, Any
|
||||
|
||||
from qdrant_client.http import models as rest
|
||||
from qdrant_client import QdrantClient
|
||||
|
||||
# WP-24c: Import der zentralen Identitäts-Logik zur Vermeidung von ID-Drift
|
||||
from app.core.graph.graph_utils import _mk_edge_id
|
||||
|
||||
# --------------------- ID helpers ---------------------
|
||||
|
||||
def _to_uuid(stable_key: str) -> str:
|
||||
"""
|
||||
Erzeugt eine deterministische UUIDv5 basierend auf einem stabilen Schlüssel.
|
||||
Härtung v1.5.2: Guard gegen leere Schlüssel zur Vermeidung von Pydantic-Fehlern.
|
||||
"""
|
||||
if not stable_key:
|
||||
raise ValueError("UUID generation failed: stable_key is empty or None")
|
||||
return str(uuid.uuid5(uuid.NAMESPACE_URL, str(stable_key)))
|
||||
|
||||
def _names(prefix: str) -> Tuple[str, str, str]:
|
||||
"""Interne Auflösung der Collection-Namen basierend auf dem Präfix."""
|
||||
return f"{prefix}_notes", f"{prefix}_chunks", f"{prefix}_edges"
|
||||
|
||||
# --------------------- Points builders ---------------------
|
||||
|
||||
def points_for_note(prefix: str, note_payload: dict, note_vec: List[float] | None, dim: int) -> Tuple[str, List[rest.PointStruct]]:
|
||||
"""Konvertiert Note-Metadaten in Qdrant Points."""
|
||||
notes_col, _, _ = _names(prefix)
|
||||
# Nutzt Null-Vektor als Fallback, falls kein Embedding vorhanden ist
|
||||
vector = note_vec if note_vec is not None else [0.0] * int(dim)
|
||||
|
||||
raw_note_id = note_payload.get("note_id") or note_payload.get("id") or "missing-note-id"
|
||||
point_id = _to_uuid(raw_note_id)
|
||||
|
||||
pt = rest.PointStruct(
|
||||
id=point_id,
|
||||
vector=vector,
|
||||
payload=note_payload
|
||||
)
|
||||
return notes_col, [pt]
|
||||
|
||||
def points_for_chunks(prefix: str, chunk_payloads: List[dict], vectors: List[List[float]]) -> Tuple[str, List[rest.PointStruct]]:
|
||||
"""Konvertiert Chunks und deren Vektoren in Qdrant Points."""
|
||||
_, chunks_col, _ = _names(prefix)
|
||||
points: List[rest.PointStruct] = []
|
||||
for i, (pl, vec) in enumerate(zip(chunk_payloads, vectors), start=1):
|
||||
chunk_id = pl.get("chunk_id") or pl.get("id")
|
||||
if not chunk_id:
|
||||
note_id = pl.get("note_id") or pl.get("parent_note_id") or "missing-note"
|
||||
chunk_id = f"{note_id}#{i}"
|
||||
pl["chunk_id"] = chunk_id
|
||||
|
||||
point_id = _to_uuid(chunk_id)
|
||||
points.append(rest.PointStruct(
|
||||
id=point_id,
|
||||
vector=vec,
|
||||
payload=pl
|
||||
))
|
||||
return chunks_col, points
|
||||
|
||||
def _normalize_edge_payload(pl: dict) -> dict:
|
||||
"""Normalisiert Edge-Felder und sichert Schema-Konformität."""
|
||||
kind = pl.get("kind") or pl.get("edge_type") or "edge"
|
||||
source_id = pl.get("source_id") or pl.get("src_id") or "unknown-src"
|
||||
target_id = pl.get("target_id") or pl.get("dst_id") or "unknown-tgt"
|
||||
seq = pl.get("seq") or pl.get("order") or pl.get("index")
|
||||
|
||||
# WP-Fix: target_section explizit durchreichen
|
||||
target_section = pl.get("target_section")
|
||||
|
||||
pl.setdefault("kind", kind)
|
||||
pl.setdefault("source_id", source_id)
|
||||
pl.setdefault("target_id", target_id)
|
||||
|
||||
if seq is not None and "seq" not in pl:
|
||||
pl["seq"] = seq
|
||||
|
||||
if target_section is not None:
|
||||
pl["target_section"] = target_section
|
||||
|
||||
return pl
|
||||
|
||||
def points_for_edges(prefix: str, edge_payloads: List[dict]) -> Tuple[str, List[rest.PointStruct]]:
|
||||
"""
|
||||
Konvertiert Kanten-Payloads in PointStructs.
|
||||
WP-24c v4.1.0: Nutzt die zentrale _mk_edge_id Funktion aus graph_utils.
|
||||
Dies eliminiert den ID-Drift zwischen manuellen und virtuellen Kanten.
|
||||
|
||||
GOLD-STANDARD v4.1.0: Die ID-Generierung verwendet 4 Parameter + optional target_section
|
||||
(kind, source_id, target_id, scope, target_section).
|
||||
rule_id und variant werden ignoriert, target_section fließt ein (Multigraph-Support).
|
||||
"""
|
||||
_, _, edges_col = _names(prefix)
|
||||
points: List[rest.PointStruct] = []
|
||||
|
||||
for raw in edge_payloads:
|
||||
pl = _normalize_edge_payload(raw)
|
||||
|
||||
# Extraktion der Identitäts-Parameter (GOLD-STANDARD v4.1.0)
|
||||
kind = pl.get("kind", "edge")
|
||||
s = pl.get("source_id", "unknown-src")
|
||||
t = pl.get("target_id", "unknown-tgt")
|
||||
scope = pl.get("scope", "note")
|
||||
target_section = pl.get("target_section") # WP-24c v4.1.0: target_section für Section-Links
|
||||
|
||||
# Hinweis: rule_id und variant werden im Payload gespeichert,
|
||||
# fließen aber NICHT in die ID-Generierung ein (v4.0.0 Standard)
|
||||
# target_section fließt in die ID ein (v4.1.0: Multigraph-Support für Section-Links)
|
||||
|
||||
try:
|
||||
# Aufruf der Single-Source-of-Truth für IDs
|
||||
# GOLD-STANDARD v4.1.0: 4 Parameter + optional target_section
|
||||
point_id = _mk_edge_id(
|
||||
kind=kind,
|
||||
s=s,
|
||||
t=t,
|
||||
scope=scope,
|
||||
target_section=target_section
|
||||
)
|
||||
|
||||
# Synchronisierung des Payloads mit der berechneten ID
|
||||
pl["edge_id"] = point_id
|
||||
|
||||
points.append(rest.PointStruct(
|
||||
id=point_id,
|
||||
vector=[0.0],
|
||||
payload=pl
|
||||
))
|
||||
except ValueError as e:
|
||||
# Fehlerhaft definierte Kanten werden übersprungen, um Pydantic-Crashes zu vermeiden
|
||||
continue
|
||||
|
||||
return edges_col, points
|
||||
|
||||
# --------------------- Vector schema & overrides ---------------------
|
||||
|
||||
def _preferred_name(candidates: List[str]) -> str:
|
||||
"""Ermittelt den primären Vektor-Namen aus einer Liste von Kandidaten."""
|
||||
for k in ("text", "default", "embedding", "content"):
|
||||
if k in candidates:
|
||||
return k
|
||||
return sorted(candidates)[0]
|
||||
|
||||
def _env_override_for_collection(collection: str) -> Optional[str]:
|
||||
"""
|
||||
Prüft auf Umgebungsvariablen-Overrides für Vektor-Namen.
|
||||
Returns:
|
||||
- "__single__" für erzwungenen Single-Vector Modus
|
||||
- Name (str) für spezifischen Named-Vector
|
||||
- None für automatische Erkennung
|
||||
"""
|
||||
base = os.getenv("MINDNET_VECTOR_NAME")
|
||||
if collection.endswith("_notes"):
|
||||
base = os.getenv("NOTES_VECTOR_NAME", base)
|
||||
elif collection.endswith("_chunks"):
|
||||
base = os.getenv("CHUNKS_VECTOR_NAME", base)
|
||||
elif collection.endswith("_edges"):
|
||||
base = os.getenv("EDGES_VECTOR_NAME", base)
|
||||
|
||||
if not base:
|
||||
return None
|
||||
val = base.strip()
|
||||
if val.lower() in ("__single__", "single"):
|
||||
return "__single__"
|
||||
return val
|
||||
|
||||
def _get_vector_schema(client: QdrantClient, collection_name: str) -> dict:
|
||||
"""Ermittelt das Vektor-Schema einer existierenden Collection via API."""
|
||||
try:
|
||||
info = client.get_collection(collection_name=collection_name)
|
||||
vecs = getattr(info, "vectors", None)
|
||||
# Prüfung auf Single-Vector Konfiguration
|
||||
if hasattr(vecs, "size") and isinstance(vecs.size, int):
|
||||
return {"kind": "single", "size": vecs.size}
|
||||
# Prüfung auf Named-Vectors Konfiguration
|
||||
cfg = getattr(vecs, "config", None)
|
||||
if isinstance(cfg, dict) and cfg:
|
||||
names = list(cfg.keys())
|
||||
if names:
|
||||
return {"kind": "named", "names": names, "primary": _preferred_name(names)}
|
||||
except Exception:
|
||||
pass
|
||||
return {"kind": "single", "size": None}
|
||||
|
||||
def _as_named(points: List[rest.PointStruct], name: str) -> List[rest.PointStruct]:
|
||||
"""Transformiert PointStructs in das Named-Vector Format."""
|
||||
out: List[rest.PointStruct] = []
|
||||
for pt in points:
|
||||
vec = getattr(pt, "vector", None)
|
||||
if isinstance(vec, dict):
|
||||
if name in vec:
|
||||
out.append(pt)
|
||||
else:
|
||||
fallback_vec = None
|
||||
try:
|
||||
fallback_vec = list(next(iter(vec.values())))
|
||||
except Exception:
|
||||
fallback_vec = [0.0]
|
||||
out.append(rest.PointStruct(id=pt.id, vector={name: fallback_vec}, payload=pt.payload))
|
||||
elif vec is not None:
|
||||
out.append(rest.PointStruct(id=pt.id, vector={name: vec}, payload=pt.payload))
|
||||
else:
|
||||
out.append(pt)
|
||||
return out
|
||||
|
||||
# --------------------- Qdrant ops ---------------------
|
||||
|
||||
def upsert_batch(client: QdrantClient, collection: str, points: List[rest.PointStruct], wait: bool = True) -> None:
|
||||
"""
|
||||
Schreibt Points hocheffizient in eine Collection.
|
||||
Unterstützt automatische Schema-Erkennung und Named-Vector Transformation.
|
||||
WP-Fix: 'wait=True' ist Default für Datenkonsistenz zwischen den Ingest-Phasen.
|
||||
"""
|
||||
if not points:
|
||||
return
|
||||
|
||||
# 1) ENV overrides prüfen
|
||||
override = _env_override_for_collection(collection)
|
||||
if override == "__single__":
|
||||
client.upsert(collection_name=collection, points=points, wait=wait)
|
||||
return
|
||||
elif isinstance(override, str):
|
||||
client.upsert(collection_name=collection, points=_as_named(points, override), wait=wait)
|
||||
return
|
||||
|
||||
# 2) Automatische Schema-Erkennung (Live-Check)
|
||||
schema = _get_vector_schema(client, collection)
|
||||
if schema.get("kind") == "named":
|
||||
name = schema.get("primary") or _preferred_name(schema.get("names") or [])
|
||||
client.upsert(collection_name=collection, points=_as_named(points, name), wait=wait)
|
||||
return
|
||||
|
||||
# 3) Fallback: Single-Vector Upsert
|
||||
client.upsert(collection_name=collection, points=points, wait=wait)
|
||||
|
||||
# --- Optional search helpers ---
|
||||
|
||||
def _filter_any(field: str, values: Iterable[str]) -> rest.Filter:
|
||||
"""Hilfsfunktion für händische Filter-Konstruktion (Logical OR)."""
|
||||
return rest.Filter(should=[rest.FieldCondition(key=field, match=rest.MatchValue(value=v)) for v in values])
|
||||
|
||||
def _merge_filters(*filters: Optional[rest.Filter]) -> Optional[rest.Filter]:
|
||||
"""Führt mehrere Filter-Objekte zu einem konsolidierten Filter zusammen."""
|
||||
fs = [f for f in filters if f is not None]
|
||||
if not fs:
|
||||
return None
|
||||
if len(fs) == 1:
|
||||
return fs[0]
|
||||
must = []
|
||||
for f in fs:
|
||||
if getattr(f, "must", None):
|
||||
must.extend(f.must)
|
||||
if getattr(f, "should", None):
|
||||
must.append(rest.Filter(should=f.should))
|
||||
return rest.Filter(must=must)
|
||||
|
||||
def _filter_from_dict(filters: Optional[Dict[str, Any]]) -> Optional[rest.Filter]:
|
||||
"""Konvertiert ein Python-Dict in ein Qdrant-Filter Objekt."""
|
||||
if not filters:
|
||||
return None
|
||||
parts = []
|
||||
for k, v in filters.items():
|
||||
if isinstance(v, (list, tuple, set)):
|
||||
parts.append(_filter_any(k, [str(x) for x in v]))
|
||||
else:
|
||||
parts.append(rest.Filter(must=[rest.FieldCondition(key=k, match=rest.MatchValue(value=v))]))
|
||||
return _merge_filters(*parts)
|
||||
|
||||
def search_chunks_by_vector(client: QdrantClient, prefix: str, vector: List[float], top: int = 10, filters: Optional[Dict[str, Any]] = None) -> List[Tuple[str, float, dict]]:
|
||||
"""Sucht semantisch ähnliche Chunks in der Vektordatenbank."""
|
||||
_, chunks_col, _ = _names(prefix)
|
||||
flt = _filter_from_dict(filters)
|
||||
res = client.search(
|
||||
collection_name=chunks_col,
|
||||
query_vector=vector,
|
||||
limit=top,
|
||||
with_payload=True,
|
||||
with_vectors=False,
|
||||
query_filter=flt
|
||||
)
|
||||
out: List[Tuple[str, float, dict]] = []
|
||||
for r in res:
|
||||
out.append((str(r.id), float(r.score), dict(r.payload or {})))
|
||||
return out
|
||||
|
||||
|
||||
# --- Edge retrieval helper ---
|
||||
|
||||
def get_edges_for_sources(
|
||||
client: QdrantClient,
|
||||
prefix: str,
|
||||
source_ids: Iterable[str],
|
||||
edge_types: Optional[Iterable[str]] = None,
|
||||
limit: int = 2048,
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""Ruft alle Kanten ab, die von einer Menge von Quell-Notizen ausgehen."""
|
||||
source_ids = list(source_ids)
|
||||
if not source_ids or limit <= 0:
|
||||
return []
|
||||
|
||||
# Namen der Edges-Collection auflösen
|
||||
_, _, edges_col = _names(prefix)
|
||||
|
||||
# Filter-Bau: source_id IN source_ids
|
||||
src_filter = _filter_any("source_id", [str(s) for s in source_ids])
|
||||
|
||||
# Optionaler Filter auf den Kanten-Typ
|
||||
kind_filter = None
|
||||
if edge_types:
|
||||
kind_filter = _filter_any("kind", [str(k) for k in edge_types])
|
||||
|
||||
flt = _merge_filters(src_filter, kind_filter)
|
||||
|
||||
out: List[Dict[str, Any]] = []
|
||||
next_page = None
|
||||
remaining = int(limit)
|
||||
|
||||
# Paginated Scroll API (NUR Payload, keine Vektoren)
|
||||
while remaining > 0:
|
||||
batch_limit = min(256, remaining)
|
||||
res, next_page = client.scroll(
|
||||
collection_name=edges_col,
|
||||
scroll_filter=flt,
|
||||
limit=batch_limit,
|
||||
with_payload=True,
|
||||
with_vectors=False,
|
||||
offset=next_page,
|
||||
)
|
||||
|
||||
if not res:
|
||||
break
|
||||
|
||||
for r in res:
|
||||
out.append(dict(r.payload or {}))
|
||||
remaining -= 1
|
||||
if remaining <= 0:
|
||||
break
|
||||
|
||||
if next_page is None or remaining <= 0:
|
||||
break
|
||||
|
||||
return out
|
||||
|
|
@ -1,126 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Name: app/core/derive_edges.py
|
||||
Version: v1.1.0 (2025-09-05)
|
||||
|
||||
Kurzbeschreibung
|
||||
Leitet Edges aus Wikilinks ([[…]]) ab und löst Zielnoten robust auf.
|
||||
Erzeugt:
|
||||
- "references" (Note -> Note) mit seq="body", pro Match eine eigene Occurrence 'occ'
|
||||
- "backlink" (inverse zu "references", gleiche seq/occ)
|
||||
- "references_at" (Chunk -> Note) mit seq=<chunk_index> und eigener 'occ' je Match
|
||||
|
||||
Aufruf
|
||||
from app.core.derive_edges import build_note_index, derive_wikilink_edges
|
||||
|
||||
Parameter / Felder
|
||||
- note_payload: {"note_id","title","path","fulltext": <body> , …}
|
||||
- chunks_payloads: [{"chunk_id","text",…}, …]
|
||||
- note_index: build_note_index([...]) -> (by_id, by_slug, by_file_slug)
|
||||
|
||||
Kompatibilität
|
||||
- Rückwärtskompatible Payload-Felder, nur erweitert um 'seq' und 'occ'.
|
||||
|
||||
Changelog
|
||||
v1.1.0: Occurrence-Zählung ('occ') je Match; 'seq="body"' für references.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
import re
|
||||
import unicodedata
|
||||
from typing import Dict, List, Tuple
|
||||
|
||||
# [[Ziel]], [[Ziel|Alias]], [[Ziel#Heading]], [[Ziel#Heading|Alias]]
|
||||
WIKILINK_RE = re.compile(r"\[\[([^\]|#]+)(?:#([^\]|]+))?(?:\|([^\]]+))?\]\]")
|
||||
|
||||
def _slug(s: str) -> str:
|
||||
s = s.strip()
|
||||
if s.endswith(".md"):
|
||||
s = s[:-3]
|
||||
s = unicodedata.normalize("NFKD", s)
|
||||
s = "".join(ch for ch in s if not unicodedata.combining(ch))
|
||||
s = s.replace("\\", "/")
|
||||
s = s.split("/")[-1]
|
||||
s = s.lower().replace(" ", "-")
|
||||
s = re.sub(r"[^a-z0-9\-]+", "", s)
|
||||
s = re.sub(r"-{2,}", "-", s).strip("-")
|
||||
return s
|
||||
|
||||
def build_note_index(notes_payloads: List[dict]) -> Tuple[Dict[str, dict], Dict[str, dict], Dict[str, dict]]:
|
||||
by_id: Dict[str, dict] = {}
|
||||
by_slug: Dict[str, dict] = {}
|
||||
by_file_slug: Dict[str, dict] = {}
|
||||
for n in notes_payloads:
|
||||
nid = n.get("note_id") or n.get("id")
|
||||
if not nid:
|
||||
continue
|
||||
by_id[nid] = n
|
||||
title = n.get("title", "")
|
||||
path = n.get("path", "")
|
||||
file_slug = _slug(path.split("/")[-1]) if path else ""
|
||||
if title:
|
||||
by_slug[_slug(title)] = n
|
||||
if file_slug:
|
||||
by_file_slug[file_slug] = n
|
||||
return by_id, by_slug, by_file_slug
|
||||
|
||||
def resolve_target(note_like: str, idx: Tuple[Dict[str,dict],Dict[str,dict],Dict[str,dict]]):
|
||||
by_id, by_slug, by_file_slug = idx
|
||||
key = note_like.strip()
|
||||
if key in by_id:
|
||||
return by_id[key]["note_id"], "by_id"
|
||||
s = _slug(key)
|
||||
if s in by_slug:
|
||||
return by_slug[s]["note_id"], "by_slug"
|
||||
if s in by_file_slug:
|
||||
return by_file_slug[s]["note_id"], "by_file_slug"
|
||||
return None, "unresolved"
|
||||
|
||||
def derive_wikilink_edges(note_payload: dict, chunks_payloads: List[dict], note_index) -> List[dict]:
|
||||
edges: List[dict] = []
|
||||
source_note_id = note_payload["note_id"]
|
||||
|
||||
def _make_edge(kind: str, src: str, tgt: str, seq=None, occ=None, extra: dict|None=None):
|
||||
e = {"edge_id": None, "kind": kind, "source_id": src, "target_id": tgt}
|
||||
if seq is not None:
|
||||
e["seq"] = seq
|
||||
if occ is not None:
|
||||
e["occ"] = occ
|
||||
if extra:
|
||||
e.update(extra)
|
||||
return e
|
||||
|
||||
# Volltext (Note-Ebene)
|
||||
fulltext = note_payload.get("fulltext") or note_payload.get("body") or ""
|
||||
if fulltext:
|
||||
for k, m in enumerate(WIKILINK_RE.finditer(fulltext), start=1):
|
||||
raw_target, heading, alias = m.groups()
|
||||
target_id, how = resolve_target(raw_target, note_index)
|
||||
extra = {"raw": raw_target, "alias": alias, "heading": heading, "resolution": how}
|
||||
if target_id:
|
||||
edges.append(_make_edge("references", source_note_id, target_id, seq="body", occ=k, extra=extra))
|
||||
edges.append(_make_edge("backlink", target_id, source_note_id, seq="body", occ=k, extra=extra))
|
||||
else:
|
||||
extra["status"] = "unresolved"
|
||||
extra["target_label"] = raw_target
|
||||
edges.append(_make_edge("references", source_note_id, raw_target, seq="body", occ=k, extra=extra))
|
||||
|
||||
# Chunks (Chunk-Ebene)
|
||||
for i, ch in enumerate(chunks_payloads, start=1):
|
||||
txt = ch.get("text") or ch.get("content") or ""
|
||||
if not txt:
|
||||
continue
|
||||
occ = 0
|
||||
for m in WIKILINK_RE.finditer(txt):
|
||||
occ += 1
|
||||
raw_target, heading, alias = m.groups()
|
||||
target_id, how = resolve_target(raw_target, note_index)
|
||||
extra = {"raw": raw_target, "alias": alias, "heading": heading, "resolution": how}
|
||||
if target_id:
|
||||
edges.append(_make_edge("references_at", ch["chunk_id"], target_id, seq=i, occ=occ, extra=extra))
|
||||
else:
|
||||
extra["status"] = "unresolved"
|
||||
extra["target_label"] = raw_target
|
||||
edges.append(_make_edge("references_at", ch["chunk_id"], raw_target, seq=i, occ=occ, extra=extra))
|
||||
return edges
|
||||
|
|
@ -1,120 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Modul: app/core/edges.py
|
||||
Version: 1.0.0
|
||||
Datum: 2025-09-09
|
||||
|
||||
Zweck
|
||||
-----
|
||||
Zentrale, konsistente Erzeugung von Edge-Payloads im **neuen Schema**:
|
||||
- kind : "belongs_to" | "next" | "prev" | "references" | "backlink"
|
||||
- source_id : ID des Quellknotens (Chunk- oder Note-ID)
|
||||
- target_id : ID des Zielknotens
|
||||
- scope : "chunk" | "note"
|
||||
- note_id : Owner-Note (für performantes Filtern/Löschen)
|
||||
- seq : optional (z. B. Reihenfolge von Vorkommen)
|
||||
|
||||
Hinweise
|
||||
--------
|
||||
- Edges werden dedupliziert (key=(kind,source_id,target_id,scope)).
|
||||
- Für Chunk-Edges wird `note_id` aus dem Chunk-Payload entnommen.
|
||||
- Für Note-Scope-Edges ist `note_id` die Quell-Note-ID.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import Dict, List
|
||||
|
||||
def build_edges_for_note(
|
||||
note_id: str,
|
||||
chunk_payloads: List[Dict],
|
||||
note_level_refs: List[str] | None,
|
||||
*,
|
||||
include_note_scope_refs: bool = False,
|
||||
) -> List[Dict]:
|
||||
edges: List[Dict] = []
|
||||
|
||||
# Chunk-Scope: belongs_to / prev / next / references
|
||||
for ch in chunk_payloads:
|
||||
cid = ch["id"]
|
||||
owner = ch.get("note_id") or note_id
|
||||
# belongs_to
|
||||
edges.append({
|
||||
"kind": "belongs_to",
|
||||
"source_id": cid,
|
||||
"target_id": note_id,
|
||||
"scope": "chunk",
|
||||
"note_id": owner,
|
||||
})
|
||||
# Nachbarn
|
||||
nb = ch.get("neighbors") or {}
|
||||
prev_id = nb.get("prev")
|
||||
next_id = nb.get("next")
|
||||
if prev_id:
|
||||
edges.append({
|
||||
"kind": "prev",
|
||||
"source_id": cid,
|
||||
"target_id": prev_id,
|
||||
"scope": "chunk",
|
||||
"note_id": owner,
|
||||
})
|
||||
edges.append({
|
||||
"kind": "next",
|
||||
"source_id": prev_id,
|
||||
"target_id": cid,
|
||||
"scope": "chunk",
|
||||
"note_id": owner,
|
||||
})
|
||||
if next_id:
|
||||
edges.append({
|
||||
"kind": "next",
|
||||
"source_id": cid,
|
||||
"target_id": next_id,
|
||||
"scope": "chunk",
|
||||
"note_id": owner,
|
||||
})
|
||||
edges.append({
|
||||
"kind": "prev",
|
||||
"source_id": next_id,
|
||||
"target_id": cid,
|
||||
"scope": "chunk",
|
||||
"note_id": owner,
|
||||
})
|
||||
# references aus Chunk
|
||||
for ref in (ch.get("references") or []):
|
||||
tid = ref.get("target_id")
|
||||
if not tid:
|
||||
continue
|
||||
edges.append({
|
||||
"kind": "references",
|
||||
"source_id": cid,
|
||||
"target_id": tid,
|
||||
"scope": "chunk",
|
||||
"note_id": owner,
|
||||
})
|
||||
|
||||
# Note-Scope: backlink (immer); references (optional)
|
||||
unique_refs = list(dict.fromkeys(note_level_refs or []))
|
||||
for tid in unique_refs:
|
||||
if include_note_scope_refs:
|
||||
edges.append({
|
||||
"kind": "references",
|
||||
"source_id": note_id,
|
||||
"target_id": tid,
|
||||
"scope": "note",
|
||||
"note_id": note_id,
|
||||
})
|
||||
edges.append({
|
||||
"kind": "backlink",
|
||||
"source_id": tid,
|
||||
"target_id": note_id,
|
||||
"scope": "note",
|
||||
"note_id": note_id,
|
||||
})
|
||||
|
||||
# Dedupe
|
||||
dedup = {}
|
||||
for e in edges:
|
||||
k = (e["kind"], e["source_id"], e["target_id"], e.get("scope", ""))
|
||||
dedup[k] = e
|
||||
return list(dedup.values())
|
||||
|
|
@ -1,82 +0,0 @@
|
|||
from __future__ import annotations
|
||||
import os, time, json
|
||||
import urllib.request
|
||||
from typing import List, Dict, Any
|
||||
|
||||
# Backend-Auswahl:
|
||||
# - EMBED_BACKEND=ollama -> EMBED_URL=/api/embeddings (Ollama), EMBED_MODEL=z.B. nomic-embed-text
|
||||
# - EMBED_BACKEND=mini -> EMBED_URL=/embed (unser MiniLM-Server), EMBED_MODEL=minilm-384
|
||||
EMBED_BACKEND = os.getenv("EMBED_BACKEND", "mini").lower()
|
||||
EMBED_URL = os.getenv("EMBED_URL", "http://127.0.0.1:8990/embed")
|
||||
EMBED_MODEL = os.getenv("EMBED_MODEL", "minilm-384")
|
||||
EMBED_BATCH = int(os.getenv("EMBED_BATCH", "64"))
|
||||
TIMEOUT = 60
|
||||
|
||||
class EmbedError(RuntimeError): ...
|
||||
|
||||
def _post_json(url: str, payload: Dict[str, Any]) -> Dict[str, Any]:
|
||||
data = json.dumps(payload).encode("utf-8")
|
||||
req = urllib.request.Request(url, data=data, headers={"Content-Type": "application/json"})
|
||||
with urllib.request.urlopen(req, timeout=TIMEOUT) as resp:
|
||||
return json.loads(resp.read().decode("utf-8"))
|
||||
|
||||
def _embed_mini(inputs: List[str], model: str, batch: int) -> List[List[float]]:
|
||||
out: List[List[float]] = []
|
||||
i = 0
|
||||
while i < len(inputs):
|
||||
chunk = inputs[i:i+batch]
|
||||
# einfache Retries
|
||||
for attempt in range(5):
|
||||
try:
|
||||
resp = _post_json(EMBED_URL, {"model": model, "inputs": chunk})
|
||||
vecs = resp.get("embeddings") or resp.get("vectors") or resp.get("data")
|
||||
if not isinstance(vecs, list):
|
||||
raise EmbedError(f"Bad embed response keys: {list(resp.keys())}")
|
||||
out.extend(vecs)
|
||||
break
|
||||
except Exception:
|
||||
if attempt == 4:
|
||||
raise
|
||||
time.sleep(1.5 * (attempt + 1))
|
||||
i += batch
|
||||
return out
|
||||
|
||||
def _embed_ollama(inputs: List[str], model: str, batch: int) -> List[List[float]]:
|
||||
# Ollama /api/embeddings akzeptiert "input" als String ODER Array.
|
||||
# Die Response enthält:
|
||||
# - für single input: {"embedding":[...], "model":"...", ...}
|
||||
# - für array input: {"embeddings":[[...],[...],...], "model":"...", ...} (je nach Version)
|
||||
# Um maximal kompatibel zu sein, rufen wir pro Text einzeln auf.
|
||||
out: List[List[float]] = []
|
||||
for text in inputs:
|
||||
# Retries
|
||||
for attempt in range(5):
|
||||
try:
|
||||
resp = _post_json(EMBED_URL, {"model": model, "input": text})
|
||||
if "embedding" in resp and isinstance(resp["embedding"], list):
|
||||
out.append(resp["embedding"])
|
||||
elif "embeddings" in resp and isinstance(resp["embeddings"], list):
|
||||
# Falls Server array zurückgibt, nimm das erste Element
|
||||
vecs = resp["embeddings"]
|
||||
out.append(vecs[0] if vecs else [])
|
||||
else:
|
||||
raise EmbedError(f"Ollama response unexpected keys: {list(resp.keys())}")
|
||||
break
|
||||
except Exception:
|
||||
if attempt == 4:
|
||||
raise
|
||||
time.sleep(1.5 * (attempt + 1))
|
||||
return out
|
||||
|
||||
def embed_texts(texts: List[str], model: str | None = None, batch_size: int | None = None) -> List[List[float]]:
|
||||
model = model or EMBED_MODEL
|
||||
batch = batch_size or EMBED_BATCH
|
||||
if not texts:
|
||||
return []
|
||||
if EMBED_BACKEND == "ollama":
|
||||
return _embed_ollama(texts, model, batch)
|
||||
# default: mini
|
||||
return _embed_mini(texts, model, batch)
|
||||
|
||||
def embed_one(text: str, model: str | None = None) -> List[float]:
|
||||
return embed_texts([text], model=model, batch_size=1)[0]
|
||||
16
app/core/graph/__init__.py
Normal file
16
app/core/graph/__init__.py
Normal file
|
|
@ -0,0 +1,16 @@
|
|||
"""
|
||||
FILE: app/core/graph/__init__.py
|
||||
DESCRIPTION: Unified Graph Package. Exportiert Kanten-Ableitung und Graph-Adapter.
|
||||
"""
|
||||
from .graph_derive_edges import build_edges_for_note
|
||||
from .graph_utils import PROVENANCE_PRIORITY
|
||||
from .graph_subgraph import Subgraph, expand
|
||||
from .graph_weights import EDGE_BASE_WEIGHTS
|
||||
|
||||
__all__ = [
|
||||
"build_edges_for_note",
|
||||
"PROVENANCE_PRIORITY",
|
||||
"Subgraph",
|
||||
"expand",
|
||||
"EDGE_BASE_WEIGHTS"
|
||||
]
|
||||
101
app/core/graph/graph_db_adapter.py
Normal file
101
app/core/graph/graph_db_adapter.py
Normal file
|
|
@ -0,0 +1,101 @@
|
|||
"""
|
||||
FILE: app/core/graph/graph_db_adapter.py
|
||||
DESCRIPTION: Datenbeschaffung aus Qdrant für den Graphen.
|
||||
AUDIT v1.2.0: Gold-Standard v4.1.0 - Scope-Awareness & Section-Filtering.
|
||||
- Erweiterte Suche nach chunk_id-Edges für Scope-Awareness
|
||||
- Optionales target_section-Filtering für präzise Section-Links
|
||||
- Vollständige Metadaten-Unterstützung (provenance, confidence, virtual)
|
||||
VERSION: 1.2.0 (WP-24c: Gold-Standard v4.1.0)
|
||||
"""
|
||||
from typing import List, Dict, Optional
|
||||
from qdrant_client import QdrantClient
|
||||
from qdrant_client.http import models as rest
|
||||
|
||||
# Nutzt die zentrale Infrastruktur für konsistente Collection-Namen (WP-14)
|
||||
from app.core.database import collection_names
|
||||
|
||||
def fetch_edges_from_qdrant(
    client: QdrantClient,
    prefix: str,
    seeds: List[str],
    edge_types: Optional[List[str]] = None,
    target_section: Optional[str] = None,
    chunk_ids: Optional[List[str]] = None,
    limit: int = 2048,
) -> List[Dict]:
    """Fetch edge payloads from Qdrant for a set of seed IDs.

    WP-24c v4.1.0: scope-aware edge retrieval with optional section filtering.

    Args:
        client: Qdrant client instance.
        prefix: Collection prefix (resolved via ``collection_names``).
        seeds: Note IDs matched against source_id / target_id / note_id.
        edge_types: Optional restriction to specific edge kinds.
        target_section: Optional filter on the ``target_section`` payload field.
        chunk_ids: Optional chunk IDs matched as ``source_id`` (chunk-level edges).
        limit: Maximum number of edges returned.
    """
    if not seeds or limit <= 0:
        return []

    # Consistent name resolution via the database package.
    # collection_names returns (notes_col, chunks_col, edges_col).
    edges_collection = collection_names(prefix)[2]

    # WP-24c v4.1.0: scope awareness - match note- AND chunk-level edges.
    should_clauses = [
        rest.FieldCondition(key=field, match=rest.MatchValue(value=str(seed)))
        for field in ("source_id", "target_id", "note_id")
        for seed in seeds
    ]

    # Chunk-level edges: chunk IDs may additionally appear as edge sources.
    for chunk_id in (chunk_ids or []):
        should_clauses.append(
            rest.FieldCondition(key="source_id", match=rest.MatchValue(value=str(chunk_id)))
        )

    must_clauses = []
    if should_clauses:
        must_clauses.append(rest.Filter(should=should_clauses))

    # Optional restriction to specific edge kinds (e.g. for intent routing).
    if edge_types:
        kind_clauses = [
            rest.FieldCondition(key="kind", match=rest.MatchValue(value=str(kind)))
            for kind in edge_types
        ]
        must_clauses.append(rest.Filter(should=kind_clauses))

    # WP-24c v4.1.0: section filtering for precise section links.
    if target_section:
        must_clauses.append(rest.Filter(must=[
            rest.FieldCondition(key="target_section", match=rest.MatchValue(value=str(target_section)))
        ]))

    scroll_filter = rest.Filter(must=must_clauses) if must_clauses else None

    # Qdrant scroll API; with_payload=True loads all metadata
    # (target_section, provenance, confidence, ...).
    points, _ = client.scroll(
        collection_name=edges_collection,
        scroll_filter=scroll_filter,
        limit=limit,
        with_payload=True,
        with_vectors=False,
    )

    # Return full payloads so the retriever has every signal for
    # super-edge aggregation and scoring.
    return [dict(point.payload) for point in points if point.payload]
|
||||
1008
app/core/graph/graph_derive_edges.py
Normal file
1008
app/core/graph/graph_derive_edges.py
Normal file
File diff suppressed because it is too large
Load Diff
164
app/core/graph/graph_extractors.py
Normal file
164
app/core/graph/graph_extractors.py
Normal file
|
|
@ -0,0 +1,164 @@
|
|||
"""
|
||||
FILE: app/core/graph/graph_extractors.py
|
||||
DESCRIPTION: Regex-basierte Extraktion von Relationen aus Text.
|
||||
AUDIT:
|
||||
- Regex für Wikilinks liberalisiert (Umlaute, Sonderzeichen).
|
||||
- Callout-Parser erweitert für Multi-Line-Listen und Header-Typen.
|
||||
"""
|
||||
import re
|
||||
from typing import List, Tuple
|
||||
|
||||
# Erlaube alle Zeichen außer ']' im Target (fängt Umlaute, Emojis, '&', '#' ab)
|
||||
_WIKILINK_RE = re.compile(r"\[\[(?:[^\|\]]+\|)?([^\]]+)\]\]")
|
||||
|
||||
_REL_PIPE = re.compile(r"\[\[\s*rel:(?P<kind>[a-z_]+)\s*\|\s*(?P<target>[^\]]+?)\s*\]\]", re.IGNORECASE)
|
||||
_REL_SPACE = re.compile(r"\[\[\s*rel:(?P<kind>[a-z_]+)\s+(?P<target>[^\]]+?)\s*\]\]", re.IGNORECASE)
|
||||
_REL_TEXT = re.compile(r"rel\s*:\s*(?P<kind>[a-z_]+)\s*\[\[\s*(?P<target>[^\]]+?)\s*\]\]", re.IGNORECASE)
|
||||
|
||||
# Erkennt [!edge] Callouts mit einem oder mehreren '>' am Anfang (für verschachtelte Callouts)
|
||||
_CALLOUT_START = re.compile(r"^\s*>{1,}\s*\[!edge\]\s*(.*)$", re.IGNORECASE)
|
||||
# Erkennt "kind: targets..."
|
||||
_REL_LINE = re.compile(r"^(?P<kind>[a-z_]+)\s*:\s*(?P<targets>.+?)\s*$", re.IGNORECASE)
|
||||
# Erkennt reine Typen (z.B. "depends_on" im Header)
|
||||
_SIMPLE_KIND = re.compile(r"^[a-z_]+$", re.IGNORECASE)
|
||||
|
||||
def extract_typed_relations(text: str) -> Tuple[List[Tuple[str, str]], str]:
    """Extract inline typed relations such as ``[[rel:depends_on Target]]``.

    Supported spellings: ``[[rel:kind|Target]]``, ``[[rel:kind Target]]``
    and ``rel:kind [[Target]]``.

    Returns:
        ``(pairs, cleaned_text)`` where ``pairs`` is a list of
        ``(kind, target)`` tuples and ``cleaned_text`` is the input with
        the relation markup stripped out.
    """
    if not text:
        return [], ""

    collected: List[Tuple[str, str]] = []

    def _consume(match) -> str:
        kind = match.group("kind").strip().lower()
        target = match.group("target").strip()
        collected.append((kind, target))
        # Matched markup is removed from the text.
        return ""

    # Apply the three spellings in the same order as before: pipe, space, text.
    for pattern in (_REL_PIPE, _REL_SPACE, _REL_TEXT):
        text = pattern.sub(_consume, text)

    return collected, text
|
||||
|
||||
def extract_callout_relations(text: str) -> Tuple[List[Tuple[str,str]], str]:
    """
    Process Obsidian ``[!edge]`` callouts and extract (kind, target) pairs.

    Supported shapes:
    1. Explicit: "kind: [[Target]]" lines inside the callout.
    2. Implicit (header): "> [!edge] kind" followed by plain "[[Target]]" lines.
    3. Nested: ">> [!edge] kind" inside nested callouts.

    Returns:
        (pairs, remaining_text) where remaining_text contains every line
        that was NOT consumed as part of a callout block.
    """
    if not text: return [], text
    lines = text.splitlines()
    # Collected (kind, target) pairs.
    out_pairs: List[Tuple[str, str]] = []
    # Lines outside of callout blocks are preserved verbatim.
    keep_lines: List[str] = []
    i = 0

    while i < len(lines):
        line = lines[i]
        m = _CALLOUT_START.match(line)
        if not m:
            # Not a callout start: keep the line and move on.
            keep_lines.append(line)
            i += 1
            continue

        # Callout block found; gather its relevant lines here.
        block_lines = []

        # Header content after "[!edge]" (e.g. "type" from "> [!edge] type").
        header_raw = m.group(1).strip()
        if header_raw:
            block_lines.append(header_raw)

        # Nesting level = number of leading '>' characters.
        # NOTE(review): lstrip('>') only strips at the very start of the
        # string, so a line that begins with spaces (e.g. "  > x") yields a
        # count of 0 here even though _CALLOUT_START allows leading
        # whitespace — TODO confirm this is intended.
        leading_gt_count = len(line) - len(line.lstrip('>'))
        if leading_gt_count == 0:
            leading_gt_count = 1  # Fallback in case no leading '>' was counted

        i += 1
        # Collect all subsequent lines that start with at least the same
        # number of '>' characters (i.e. same or deeper nesting).
        while i < len(lines):
            next_line = lines[i]
            stripped = next_line.lstrip()
            # A line without a leading '>' ends the callout block.
            if not stripped.startswith('>'):
                break
            next_leading_gt_count = len(next_line) - len(next_line.lstrip('>'))
            # Shallower nesting means we left the block.
            # NOTE(review): the same lstrip('>') caveat applies — an indented
            # continuation line ("  > x") counts as 0 and terminates the block.
            if next_leading_gt_count < leading_gt_count:
                break
            # Strip exactly leading_gt_count '>' markers (from the
            # whitespace-stripped line) plus any following spaces.
            if next_leading_gt_count >= leading_gt_count:
                content = stripped[leading_gt_count:].lstrip()
                if content:
                    block_lines.append(content)
            i += 1

        # ---- Process the collected block ----
        current_kind = None

        # Heuristic: if the very first line (usually the header) is a bare
        # kind token (no "key: value" shape), use it as the block default.
        if block_lines:
            first = block_lines[0]
            if not _REL_LINE.match(first) and _SIMPLE_KIND.match(first):
                current_kind = first.lower()

        for bl in block_lines:
            # Nested "[!edge]" marker inside the block: may reset the kind.
            edge_match = re.match(r"^\s*\[!edge\]\s*(.*)$", bl, re.IGNORECASE)
            if edge_match:
                edge_content = edge_match.group(1).strip()
                if edge_content:
                    # Either "kind: targets" ...
                    mrel = _REL_LINE.match(edge_content)
                    if mrel:
                        current_kind = mrel.group("kind").strip().lower()
                        targets = mrel.group("targets")
                        # Extract wikilink targets from the same line.
                        found = _WIKILINK_RE.findall(targets)
                        if found:
                            for t in found: out_pairs.append((current_kind, t.strip()))
                    elif _SIMPLE_KIND.match(edge_content):
                        # ... or a bare kind without targets.
                        current_kind = edge_content.lower()
                continue

            # 1. Explicit "kind: targets" (overrides the header kind for
            #    this line and becomes the new default afterwards).
            mrel = _REL_LINE.match(bl)
            if mrel:
                line_kind = mrel.group("kind").strip().lower()
                targets = mrel.group("targets")

                # Prefer wikilink targets ...
                found = _WIKILINK_RE.findall(targets)
                if found:
                    for t in found: out_pairs.append((line_kind, t.strip()))
                else:
                    # ... fall back to comma/semicolon separated plain text.
                    for raw in re.split(r"[,;]", targets):
                        if raw.strip(): out_pairs.append((line_kind, raw.strip()))

                # Subsequent bare-link lines inherit this kind.
                current_kind = line_kind
                continue

            # 2. No "key: value" shape -> bare links use the current kind.
            found = _WIKILINK_RE.findall(bl)
            if found:
                if current_kind:
                    for t in found: out_pairs.append((current_kind, t.strip()))
                else:
                    # Link without any kind (neither inline nor header).
                    # Deliberately ignored to avoid false positives; could
                    # serve as a 'related_to' fallback in the future.
                    pass

    return out_pairs, "\n".join(keep_lines)
|
||||
|
||||
def extract_wikilinks(text: str) -> List[str]:
    """Return the targets of standard wikilinks [[Target]] or [[Alias|Target]]."""
    if not text:
        return []
    results: List[str] = []
    for raw in _WIKILINK_RE.findall(text):
        target = raw.strip()
        if target:
            results.append(target)
    return results
|
||||
180
app/core/graph/graph_subgraph.py
Normal file
180
app/core/graph/graph_subgraph.py
Normal file
|
|
@ -0,0 +1,180 @@
|
|||
"""
|
||||
FILE: app/core/graph/graph_subgraph.py
|
||||
DESCRIPTION: In-Memory Repräsentation eines Graphen für Scoring und Analyse.
|
||||
Zentrale Komponente für die Graph-Expansion (BFS) und Bonus-Berechnung.
|
||||
WP-15c Update: Erhalt von Metadaten (target_section, provenance)
|
||||
für präzises Retrieval-Reasoning.
|
||||
WP-24c v4.1.0: Scope-Awareness und Section-Filtering Support.
|
||||
VERSION: 1.3.0 (WP-24c: Gold-Standard v4.1.0)
|
||||
STATUS: Active
|
||||
"""
|
||||
import math
|
||||
from collections import defaultdict
|
||||
from typing import Dict, List, Optional, DefaultDict, Any, Set
|
||||
from qdrant_client import QdrantClient
|
||||
|
||||
# Lokale Paket-Imports
|
||||
from .graph_weights import EDGE_BASE_WEIGHTS, calculate_edge_weight
|
||||
from .graph_db_adapter import fetch_edges_from_qdrant
|
||||
|
||||
class Subgraph:
    """
    Lightweight in-memory subgraph with adjacency lists and degree counters.
    Used by the retriever to compute graph bonuses (hub score, centrality).
    """

    def __init__(self) -> None:
        # adj stores full edge payload dicts (not just triples), keyed by source.
        self.adj: DefaultDict[str, List[Dict]] = defaultdict(list)
        # Incoming edges keyed by target (for backlinks / explanation layer).
        self.reverse_adj: DefaultDict[str, List[Dict]] = defaultdict(list)
        self.in_degree: DefaultDict[str, int] = defaultdict(int)
        self.out_degree: DefaultDict[str, int] = defaultdict(int)
        # WP-24c v4.1.0: chunk-level in-degree for precise scoring aggregation.
        # NOTE(review): nothing in this class ever increments this counter.
        self.chunk_level_in_degree: DefaultDict[str, int] = defaultdict(int)

    def add_edge(self, e: Dict) -> None:
        """
        Add one edge and update all indices.
        WP-15c: the full payload is retained for the explanation layer.
        Edges missing "source" or "target" are silently ignored.
        """
        src = e.get("source")
        tgt = e.get("target")
        kind = e.get("kind")

        # Keep the whole payload as the edge object, filling in defaults
        # so all relevant metadata is always present.
        edge_data = {
            "source": src,
            "target": tgt,
            "kind": kind,
            # Default weight comes from the per-kind base weight table.
            "weight": e.get("weight", EDGE_BASE_WEIGHTS.get(kind, 0.0)),
            "provenance": e.get("provenance", "rule"),
            "confidence": e.get("confidence", 1.0),
            "target_section": e.get("target_section"),  # essential for precision
            "is_super_edge": e.get("is_super_edge", False),
            "virtual": e.get("virtual", False),  # WP-24c v4.1.0: authority prioritization
            "chunk_id": e.get("chunk_id")  # WP-24c v4.1.0: RAG context
        }

        owner = e.get("note_id")

        if not src or not tgt:
            return

        # 1. Forward edge.
        self.adj[src].append(edge_data)
        self.out_degree[src] += 1
        self.in_degree[tgt] += 1

        # 2. Reverse edge (explanation layer & backlinks).
        self.reverse_adj[tgt].append(edge_data)

        # 3. Context-note handling (boosts the parent note's centrality).
        if owner and owner != src:
            # Create a virtual context edge owner -> tgt.
            ctx_edge = edge_data.copy()
            ctx_edge["source"] = owner
            ctx_edge["via_context"] = True

            self.adj[owner].append(ctx_edge)
            self.out_degree[owner] += 1
            if owner != tgt:
                self.reverse_adj[tgt].append(ctx_edge)
                # NOTE(review): this increments in_degree of the OWNER, not
                # of tgt, even though the context edge points owner -> tgt.
                # Presumably intentional per the "boost parent centrality"
                # comment above — confirm before changing.
                self.in_degree[owner] += 1

    def aggregate_edge_bonus(self, node_id: str) -> float:
        """Sum of outgoing edge weights (hub score)."""
        return sum(edge["weight"] for edge in self.adj.get(node_id, []))

    def edge_bonus(self, node_id: str) -> float:
        """Retriever-facing alias (WP-04a compatibility)."""
        return self.aggregate_edge_bonus(node_id)

    def centrality_bonus(self, node_id: str) -> float:
        """
        Log-damped centrality based on the in-degree.
        Capped at a maximum boost of 0.15.
        """
        indeg = self.in_degree.get(node_id, 0)
        if indeg <= 0:
            return 0.0
        # math.log1p(x) is log(1 + x)
        return min(math.log1p(indeg) / 10.0, 0.15)

    def get_outgoing_edges(self, node_id: str) -> List[Dict[str, Any]]:
        """Return all outgoing edges of a node, including metadata."""
        return self.adj.get(node_id, [])

    def get_incoming_edges(self, node_id: str) -> List[Dict[str, Any]]:
        """Return all incoming edges of a node, including metadata."""
        return self.reverse_adj.get(node_id, [])
|
||||
|
||||
|
||||
def expand(
    client: QdrantClient,
    prefix: str,
    seeds: List[str],
    depth: int = 1,
    edge_types: Optional[List[str]] = None,
    chunk_ids: Optional[List[str]] = None,
    target_section: Optional[str] = None,
) -> Subgraph:
    """Breadth-first expansion from seed notes along stored edges.

    WP-24c v4.1.0: supports scope awareness (chunk_ids) and section filtering.

    Args:
        client: Qdrant client.
        prefix: Collection prefix.
        seeds: Note IDs to start the expansion from.
        depth: Maximum number of BFS rounds.
        edge_types: Optional restriction to specific edge kinds.
        chunk_ids: Optional chunk IDs for scope awareness.
        target_section: Optional section filter.
    """
    graph = Subgraph()
    frontier: Set[str] = set(seeds)
    visited: Set[str] = set()

    for _ in range(max(depth, 0)):
        if not frontier:
            break

        # WP-24c v4.1.0: extended edge retrieval with scope awareness
        # and section filtering.
        payloads = fetch_edges_from_qdrant(
            client, prefix, list(frontier),
            edge_types=edge_types,
            chunk_ids=chunk_ids,
            target_section=target_section
        )

        discovered: Set[str] = set()
        for payload in payloads:
            source = payload.get("source_id")
            target = payload.get("target_id")
            if not source or not target:
                continue

            # WP-15c: pass the full payload into add_edge.
            # WP-24c v4.1.0: the virtual flag feeds authority prioritization.
            graph.add_edge({
                "source": source,
                "target": target,
                "kind": payload.get("kind", "edge"),
                "weight": calculate_edge_weight(payload),
                "note_id": payload.get("note_id"),
                "provenance": payload.get("provenance", "rule"),
                "confidence": payload.get("confidence", 1.0),
                "target_section": payload.get("target_section"),
                "virtual": payload.get("virtual", False),
                "chunk_id": payload.get("chunk_id")
            })

            # BFS: unseen targets become the next frontier.
            if target not in visited:
                discovered.add(str(target))

        visited |= frontier
        frontier = discovered - visited

    return graph
|
||||
177
app/core/graph/graph_utils.py
Normal file
177
app/core/graph/graph_utils.py
Normal file
|
|
@ -0,0 +1,177 @@
|
|||
"""
|
||||
FILE: app/core/graph/graph_utils.py
|
||||
DESCRIPTION: Basale Werkzeuge, ID-Generierung und Provenance-Konfiguration für den Graphen.
|
||||
AUDIT v4.0.0:
|
||||
- GOLD-STANDARD v4.0.0: Strikte 4-Parameter-ID für Kanten (kind, source, target, scope).
|
||||
- Eliminiert ID-Inkonsistenz zwischen Phase 1 (Autorität) und Phase 2 (Symmetrie).
|
||||
- rule_id und variant werden ignoriert in der ID-Generierung (nur im Payload gespeichert).
|
||||
- Fix für das "Steinzeitaxt"-Problem durch konsistente ID-Generierung.
|
||||
VERSION: 4.0.0 (WP-24c: Gold-Standard Identity)
|
||||
STATUS: Active
|
||||
"""
|
||||
import os
|
||||
import uuid
|
||||
import hashlib
|
||||
from typing import Iterable, List, Optional, Set, Any, Tuple
|
||||
|
||||
try:
|
||||
import yaml
|
||||
except ImportError:
|
||||
yaml = None
|
||||
|
||||
# WP-15b: Prioritäten-Ranking für die De-Duplizierung von Kanten unterschiedlicher Herkunft
|
||||
# WP-15b: priority ranking used to de-duplicate edges of different provenance.
# Higher values win when the same logical edge exists from several sources.
PROVENANCE_PRIORITY = {
    "explicit:wikilink": 1.00,
    "inline:rel": 0.95,
    "callout:edge": 0.90,
    "explicit:callout": 0.90,  # WP-24c v4.2.7: callout edges from candidate_pool
    "semantic_ai": 0.90,  # validated AI-derived edges
    "structure:belongs_to": 1.00,
    "structure:order": 0.95,  # next/prev
    "explicit:note_scope": 1.00,
    "explicit:note_zone": 1.00,  # WP-24c v4.2.0: note-scope zones (highest priority)
    "derived:backlink": 0.90,
    "edge_defaults": 0.70  # heuristic based on types.yaml
}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Pfad-Auflösung (Integration der .env Umgebungsvariablen)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def get_vocab_path() -> str:
    """Path to the edge vocabulary; overridable via MINDNET_VOCAB_PATH."""
    default = "/mindnet/vault/mindnet/_system/dictionary/edge_vocabulary.md"
    return os.getenv("MINDNET_VOCAB_PATH", default)
|
||||
|
||||
def get_schema_path() -> str:
    """Path to the graph schema; overridable via MINDNET_SCHEMA_PATH."""
    default = "/mindnet/vault/mindnet/_system/dictionary/graph_schema.md"
    return os.getenv("MINDNET_SCHEMA_PATH", default)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# ID & String Helper
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _get(d: dict, *keys, default=None):
|
||||
"""Sicherer Zugriff auf tief verschachtelte Dictionary-Keys."""
|
||||
for k in keys:
|
||||
if isinstance(d, dict) and k in d and d[k] is not None:
|
||||
return d[k]
|
||||
return default
|
||||
|
||||
def _dedupe_seq(seq: Iterable[str]) -> List[str]:
|
||||
"""Dedupliziert eine Sequenz von Strings unter Beibehaltung der Reihenfolge."""
|
||||
seen: Set[str] = set()
|
||||
out: List[str] = []
|
||||
for s in seq:
|
||||
if s not in seen:
|
||||
seen.add(s)
|
||||
out.append(s)
|
||||
return out
|
||||
|
||||
def parse_link_target(raw: str, current_note_id: Optional[str] = None) -> Tuple[str, Optional[str]]:
    """Split an Obsidian link ``[[Target#Section]]`` into target and section.

    Self-links like ``[[#Goals]]`` resolve to ``current_note_id``.

    Returns:
        ``(target_id, target_section)``; the section is None when the link
        carries no ``#`` part.
    """
    if not raw:
        return "", None

    target, hash_sep, section_part = raw.partition("#")
    target = target.strip()
    section = section_part.strip() if hash_sep else None

    # Special case: a "#Section" link inside the same file points at
    # the current note.
    if not target and section and current_note_id:
        target = current_note_id

    return target, section
|
||||
|
||||
def _mk_edge_id(kind: str, s: str, t: str, scope: str, target_section: Optional[str] = None) -> str:
|
||||
"""
|
||||
WP-24c v4.0.0: DER GLOBALE STANDARD für Kanten-IDs.
|
||||
Erzeugt eine deterministische UUIDv5. Dies stellt sicher, dass manuelle Links
|
||||
und systemgenerierte Symmetrien dieselbe Point-ID in Qdrant erhalten.
|
||||
|
||||
GOLD-STANDARD v4.0.0: Die ID basiert STRICT auf vier Parametern:
|
||||
f"edge:{kind}:{source}:{target}:{scope}"
|
||||
|
||||
Die Parameter rule_id und variant werden IGNORIERT und fließen NICHT in die ID ein.
|
||||
Sie können weiterhin im Payload gespeichert werden, haben aber keinen Einfluss auf die Identität.
|
||||
|
||||
Args:
|
||||
kind: Typ der Relation (z.B. 'mastered_by')
|
||||
s: Kanonische ID der Quell-Note
|
||||
t: Kanonische ID der Ziel-Note
|
||||
scope: Granularität (Standard: 'note')
|
||||
rule_id: Optionale ID der Regel (aus graph_derive_edges) - IGNORIERT in ID-Generierung
|
||||
variant: Optionale Variante für multiple Links zum selben Ziel - IGNORIERT in ID-Generierung
|
||||
"""
|
||||
if not all([kind, s, t]):
|
||||
raise ValueError(f"Incomplete data for edge ID: kind={kind}, src={s}, tgt={t}")
|
||||
|
||||
# Der String enthält nun alle distinkten semantischen Merkmale
|
||||
base = f"edge:{kind}:{s}:{t}:{scope}"
|
||||
|
||||
# Wenn ein Link auf eine spezifische Sektion zeigt, ist es eine andere Relation
|
||||
if target_section:
|
||||
base += f":{target_section}"
|
||||
|
||||
return str(uuid.uuid5(uuid.NAMESPACE_URL, base))
|
||||
|
||||
def _edge(kind: str, scope: str, source_id: str, target_id: str, note_id: str, extra: Optional[dict] = None) -> dict:
|
||||
"""
|
||||
Konstruiert ein standardisiertes Kanten-Payload für Qdrant.
|
||||
Wird von graph_derive_edges.py benötigt.
|
||||
"""
|
||||
pl = {
|
||||
"kind": kind,
|
||||
"relation": kind,
|
||||
"scope": scope,
|
||||
"source_id": source_id,
|
||||
"target_id": target_id,
|
||||
"note_id": note_id,
|
||||
"virtual": False # Standardmäßig explizit, solange nicht anders in Phase 2 gesetzt
|
||||
}
|
||||
if extra:
|
||||
pl.update(extra)
|
||||
return pl
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Registry Operations
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def load_types_registry() -> dict:
    """Load the central YAML registry (types.yaml).

    The location is controlled via the MINDNET_TYPES_FILE environment
    variable. A missing file, an unavailable yaml module, or any parse
    error all degrade to an empty dict.
    """
    path = os.getenv("MINDNET_TYPES_FILE", "./config/types.yaml")
    # Check the file first so `yaml` is never touched when there is
    # nothing to parse.
    if not os.path.isfile(path):
        return {}
    if yaml is None:
        return {}
    try:
        with open(path, "r", encoding="utf-8") as fh:
            parsed = yaml.safe_load(fh)
    except Exception:
        return {}
    return parsed if parsed is not None else {}
|
||||
|
||||
def get_edge_defaults_for(note_type: Optional[str], reg: dict) -> List[str]:
    """Resolve the configured default edges for a note type.

    Tries the type-specific ``edge_defaults`` first, then falls back to
    registry-wide defaults under "defaults"/"default"/"global". Note that
    the type-specific branch stringifies every entry, while the global
    fallback only keeps entries that already are strings.
    """
    types_map = reg.get("types", reg) if isinstance(reg, dict) else {}

    if note_type and isinstance(types_map, dict):
        type_cfg = types_map.get(note_type)
        if isinstance(type_cfg, dict):
            configured = type_cfg.get("edge_defaults")
            if isinstance(configured, list):
                return [str(item) for item in configured]

    # Fall back to global defaults.
    for section in ("defaults", "default", "global"):
        candidate = reg.get(section)
        if isinstance(candidate, dict) and isinstance(candidate.get("edge_defaults"), list):
            return [str(item) for item in candidate["edge_defaults"] if isinstance(item, str)]

    return []
|
||||
39
app/core/graph/graph_weights.py
Normal file
39
app/core/graph/graph_weights.py
Normal file
|
|
@ -0,0 +1,39 @@
|
|||
"""
|
||||
FILE: app/core/graph/graph_weights.py
|
||||
DESCRIPTION: Definition der Basisgewichte und Berechnung der Kanteneffektivität.
|
||||
"""
|
||||
from typing import Dict
|
||||
|
||||
# Basisgewichte je Edge-Typ (WP-04a Config)
|
||||
# Base weight per edge type (WP-04a config).
# Kinds not listed here default to 0.0 in calculate_edge_weight.
EDGE_BASE_WEIGHTS: Dict[str, float] = {
    # Structural edges
    "belongs_to": 0.10,
    "next": 0.06,
    "prev": 0.06,
    "backlink": 0.04,
    "references_at": 0.08,

    # Knowledge edges
    "references": 0.20,
    "depends_on": 0.18,
    "related_to": 0.15,
    "similar_to": 0.12,
}
|
||||
|
||||
def calculate_edge_weight(pl: Dict) -> float:
    """Effective edge weight: per-kind base weight scaled by confidence.

    Unknown kinds weigh 0.0. A missing or unparsable confidence leaves
    the base weight unchanged; otherwise confidence is clamped to [0, 1].
    """
    base = EDGE_BASE_WEIGHTS.get(pl.get("kind", "edge"), 0.0)

    raw_confidence = pl.get("confidence", None)
    if raw_confidence is None:
        return base
    try:
        confidence = float(raw_confidence)
    except Exception:
        # Non-numeric confidence: treat as absent.
        return base

    return base * max(0.0, min(1.0, confidence))
|
||||
26
app/core/ingestion/__init__.py
Normal file
26
app/core/ingestion/__init__.py
Normal file
|
|
@ -0,0 +1,26 @@
|
|||
"""
|
||||
FILE: app/core/ingestion/__init__.py
|
||||
DESCRIPTION: Package-Einstiegspunkt für Ingestion. Exportiert den IngestionService.
|
||||
AUDIT v2.13.10: Abschluss der Modularisierung (WP-14).
|
||||
Bricht Zirkelbezüge durch Nutzung der neutralen registry.py auf.
|
||||
VERSION: 2.13.10
|
||||
"""
|
||||
# Der IngestionService ist der primäre Orchestrator für den Datenimport
|
||||
from .ingestion_processor import IngestionService
|
||||
|
||||
# Hilfswerkzeuge für JSON-Verarbeitung und Konfigurations-Management
|
||||
# load_type_registry wird hier re-exportiert, um die Abwärtskompatibilität zu wahren,
|
||||
# obwohl die Implementierung nun in app.core.registry liegt.
|
||||
from .ingestion_utils import (
|
||||
extract_json_from_response,
|
||||
load_type_registry,
|
||||
resolve_note_type
|
||||
)
|
||||
|
||||
# Öffentliche API des Pakets
|
||||
__all__ = [
|
||||
"IngestionService",
|
||||
"extract_json_from_response",
|
||||
"load_type_registry",
|
||||
"resolve_note_type"
|
||||
]
|
||||
131
app/core/ingestion/ingestion_chunk_payload.py
Normal file
131
app/core/ingestion/ingestion_chunk_payload.py
Normal file
|
|
@ -0,0 +1,131 @@
|
|||
"""
|
||||
FILE: app/core/ingestion/ingestion_chunk_payload.py
|
||||
DESCRIPTION: Baut das JSON-Objekt für 'mindnet_chunks'.
|
||||
Fix v2.4.3: Integration der zentralen Registry (WP-14) für konsistente Defaults.
|
||||
WP-24c v4.3.0: candidate_pool wird explizit übernommen für Chunk-Attribution.
|
||||
VERSION: 2.4.4 (WP-24c v4.3.0)
|
||||
STATUS: Active
|
||||
"""
|
||||
from __future__ import annotations
|
||||
from typing import Any, Dict, List, Optional
|
||||
import logging
|
||||
|
||||
# ENTSCHEIDENDER FIX: Import der neutralen Registry-Logik zur Vermeidung von Circular Imports
|
||||
from app.core.registry import load_type_registry
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Resolution Helpers (Audited)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _as_list(x):
|
||||
"""Sichert die Listen-Integrität für Metadaten wie Tags."""
|
||||
if x is None: return []
|
||||
return x if isinstance(x, list) else [x]
|
||||
|
||||
def _resolve_val(note_type: str, reg: dict, key: str, default: Any) -> Any:
|
||||
"""
|
||||
Hierarchische Suche in der Registry: Type-Spezifisch > Globaler Default.
|
||||
WP-14: Erlaubt dynamische Konfiguration via types.yaml.
|
||||
"""
|
||||
types = reg.get("types", {})
|
||||
if isinstance(types, dict):
|
||||
t_cfg = types.get(note_type, {})
|
||||
if isinstance(t_cfg, dict):
|
||||
# Fallback für Key-Varianten (z.B. chunking_profile vs chunk_profile)
|
||||
val = t_cfg.get(key) or t_cfg.get(key.replace("ing", ""))
|
||||
if val is not None: return val
|
||||
|
||||
defs = reg.get("defaults", {}) or reg.get("global", {})
|
||||
if isinstance(defs, dict):
|
||||
val = defs.get(key) or defs.get(key.replace("ing", ""))
|
||||
if val is not None: return val
|
||||
|
||||
return default
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Haupt-API
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def make_chunk_payloads(note: Dict[str, Any], note_path: str, chunks_from_chunker: List[Any], **kwargs) -> List[Dict[str, Any]]:
    """Build the Qdrant payload dicts for a note's chunks.

    Resolution hierarchy for profile and weight: frontmatter > type
    registry (types.yaml via WP-14) > hard-coded default. Chunk inputs may
    be plain dicts or attribute objects; both shapes are handled.

    Args:
        note: Note dict; if it contains a "frontmatter" key, that sub-dict
            is used as the metadata source.
        note_path: Logical path of the note (stored under "path").
        chunks_from_chunker: Chunk dicts or objects produced by the chunker.
        **kwargs: Optional "types_cfg" (pre-loaded registry, avoids a global
            load) and "file_path" (overrides "source_path").

    Returns:
        One payload dict per chunk, in input order.
    """
    if isinstance(note, dict) and "frontmatter" in note:
        fm = note["frontmatter"]
    else:
        fm = note or {}

    # WP-14 fix: use the caller-provided registry or load it centrally.
    reg = kwargs.get("types_cfg") or load_type_registry()

    note_type = fm.get("type") or "concept"
    title = fm.get("title") or fm.get("id") or "Untitled"
    tags = _as_list(fm.get("tags") or [])

    # Audit: resolution hierarchy (frontmatter > registry).
    cp = fm.get("chunking_profile") or fm.get("chunk_profile")
    if not cp:
        cp = _resolve_val(note_type, reg, "chunking_profile", "sliding_standard")

    rw = fm.get("retriever_weight")
    if rw is None:
        rw = _resolve_val(note_type, reg, "retriever_weight", 1.0)
    try:
        rw = float(rw)
    except (TypeError, ValueError):
        # FIX: was a bare `except:` which also swallowed SystemExit and
        # KeyboardInterrupt; only conversion failures should fall back.
        rw = 1.0

    out: List[Dict[str, Any]] = []
    for idx, ch in enumerate(chunks_from_chunker):
        # Chunks may arrive as dicts or as objects with attributes.
        is_dict = isinstance(ch, dict)
        cid = getattr(ch, "id", None) if not is_dict else ch.get("id")
        nid = getattr(ch, "note_id", None) if not is_dict else ch.get("note_id")
        index = getattr(ch, "index", idx) if not is_dict else ch.get("index", idx)
        text = getattr(ch, "text", "") if not is_dict else ch.get("text", "")
        window = getattr(ch, "window", text) if not is_dict else ch.get("window", text)
        prev_id = getattr(ch, "neighbors_prev", None) if not is_dict else ch.get("neighbors_prev")
        next_id = getattr(ch, "neighbors_next", None) if not is_dict else ch.get("neighbors_next")
        # NOTE(review): attribute objects expose "section_title" while dict
        # chunks use "section" — confirm the chunker emits both shapes.
        section = getattr(ch, "section_title", "") if not is_dict else ch.get("section", "")
        # WP-24c v4.3.0: candidate_pool must survive for chunk attribution.
        candidate_pool = getattr(ch, "candidate_pool", []) if not is_dict else ch.get("candidate_pool", [])

        pl: Dict[str, Any] = {
            "note_id": nid or fm.get("id"),
            "chunk_id": cid,
            "title": title,
            "index": int(index),
            "ord": int(index) + 1,
            "type": note_type,
            "tags": tags,
            "text": text,
            "window": window,
            "neighbors_prev": _as_list(prev_id),
            "neighbors_next": _as_list(next_id),
            "section": section,
            "path": note_path,
            "source_path": kwargs.get("file_path") or note_path,
            "retriever_weight": rw,
            "chunk_profile": cp,
            "candidate_pool": candidate_pool  # WP-24c v4.3.0: critical for chunk attribution
        }

        # Audit: remove redundant alias fields should they ever sneak in.
        for alias in ("chunk_num", "Chunk_Number"):
            pl.pop(alias, None)

        # WP-24c v4.4.0-DEBUG: interface 2 - transfer.
        # Trace output immediately before the dict is appended.
        pool_size = len(candidate_pool) if candidate_pool else 0
        pool_content = candidate_pool if candidate_pool else []
        explicit_callout_in_pool = [c for c in pool_content if isinstance(c, dict) and c.get("provenance") == "explicit:callout"]
        logger.debug(f"DEBUG-TRACER [Payload]: Chunk ID: {cid}, Index: {index}, Pool-Size: {pool_size}, Pool-Inhalt: {pool_content}, Explicit-Callout-Count: {len(explicit_callout_in_pool)}, Has_Candidate_Pool_Key: {'candidate_pool' in pl}")
        if explicit_callout_in_pool:
            for ec in explicit_callout_in_pool:
                logger.debug(f"DEBUG-TRACER [Payload]: Explicit-Callout Detail - Kind: {ec.get('kind')}, To: {ec.get('to')}, Provenance: {ec.get('provenance')}")

        out.append(pl)

    return out
|
||||
116
app/core/ingestion/ingestion_db.py
Normal file
116
app/core/ingestion/ingestion_db.py
Normal file
|
|
@ -0,0 +1,116 @@
|
|||
"""
|
||||
FILE: app/core/ingestion/ingestion_db.py
|
||||
DESCRIPTION: Datenbank-Schnittstelle für Note-Metadaten und Artefakt-Prüfung.
|
||||
WP-14: Umstellung auf zentrale database-Infrastruktur.
|
||||
WP-24c: Integration der Authority-Prüfung für Point-IDs.
|
||||
Ermöglicht dem Prozessor die Unterscheidung zwischen
|
||||
manueller Nutzer-Autorität und virtuellen Symmetrien.
|
||||
VERSION: 2.2.0 (WP-24c: Authority Lookup Integration)
|
||||
STATUS: Active
|
||||
"""
|
||||
import logging
|
||||
from typing import Optional, Tuple, List
|
||||
from qdrant_client import QdrantClient
|
||||
from qdrant_client.http import models as rest
|
||||
|
||||
# Import der modularisierten Namen-Logik zur Sicherstellung der Konsistenz
|
||||
from app.core.database import collection_names
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def fetch_note_payload(client: QdrantClient, prefix: str, note_id: str) -> Optional[dict]:
    """Fetch a note's metadata payload from Qdrant via the scroll API.

    Primarily used for change detection (hash comparison).
    Returns None when the note does not exist or the lookup fails.
    """
    notes_col, _, _ = collection_names(prefix)
    try:
        note_filter = rest.Filter(
            must=[rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))]
        )
        points, _ = client.scroll(
            collection_name=notes_col,
            scroll_filter=note_filter,
            limit=1,
            with_payload=True,
        )
    except Exception as e:
        logger.debug(f"Note {note_id} not found or error during fetch: {e}")
        return None
    if not points:
        return None
    return points[0].payload
|
||||
|
||||
def artifacts_missing(client: QdrantClient, prefix: str, note_id: str) -> Tuple[bool, bool]:
    """Actively probe Qdrant for existing chunks and edges of a note.

    Returns a (chunks_missing, edges_missing) boolean tuple. On any error
    both artifacts are reported as missing, so callers re-ingest.
    """
    _, chunks_col, edges_col = collection_names(prefix)
    try:
        note_filter = rest.Filter(must=[
            rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))
        ])
        found = []
        for col in (chunks_col, edges_col):
            pts, _ = client.scroll(collection_name=col, scroll_filter=note_filter, limit=1)
            found.append(bool(pts))
        return (not found[0]), (not found[1])
    except Exception as e:
        logger.error(f"Error checking artifacts for {note_id}: {e}")
        return True, True
|
||||
|
||||
def is_explicit_edge_present(client: QdrantClient, prefix: str, edge_id: str) -> bool:
    """
    WP-24c: Check via point ID whether an explicit edge already exists.

    Used by the IngestionProcessor in phase 2 to prevent manual user
    knowledge from being overwritten by virtual symmetry edges.

    Args:
        client: Connected Qdrant client.
        prefix: Collection prefix used to resolve the edges collection.
        edge_id: The deterministically computed UUID of the edge.
    Returns:
        True if a physical edge (virtual=False) exists; False otherwise
        (including lookup failures, which are logged at debug level).
    """
    if not edge_id:
        return False

    _, _, edges_col = collection_names(prefix)
    try:
        # retrieve is the most efficient access path for a lookup by ID
        res = client.retrieve(
            collection_name=edges_col,
            ids=[edge_id],
            with_payload=True
        )

        if res:
            # FIX: a point may come back with payload=None, which previously
            # raised AttributeError (swallowed by the except below). Treat a
            # missing payload as "not explicit", matching the old net result
            # without the spurious exception path.
            payload = res[0].payload
            if payload is not None and not payload.get("virtual", False):
                return True  # It is an explicit user edge

        return False
    except Exception as e:
        logger.debug(f"Authority check failed for ID {edge_id}: {e}")
        return False
|
||||
|
||||
def purge_artifacts(client: QdrantClient, prefix: str, note_id: str):
    """Delete orphaned chunks and edges of a note before a re-import.

    Ensures content changes do not leave duplicate artifacts behind.
    Errors are logged, never raised.
    """
    _, chunks_col, edges_col = collection_names(prefix)
    try:
        note_filter = rest.Filter(must=[
            rest.FieldCondition(key="note_id", match=rest.MatchValue(value=note_id))
        ])
        selector = rest.FilterSelector(filter=note_filter)
        # Chunks first, then edges — both selected by the same note_id filter.
        for col in (chunks_col, edges_col):
            client.delete(collection_name=col, points_selector=selector)
        logger.info(f"🧹 [PURGE] Local artifacts for '{note_id}' cleared.")
    except Exception as e:
        logger.error(f"❌ [PURGE ERROR] Failed to clear artifacts for {note_id}: {e}")
|
||||
176
app/core/ingestion/ingestion_note_payload.py
Normal file
176
app/core/ingestion/ingestion_note_payload.py
Normal file
|
|
@ -0,0 +1,176 @@
|
|||
"""
|
||||
FILE: app/core/ingestion/ingestion_note_payload.py
|
||||
DESCRIPTION: Baut das JSON-Objekt für mindnet_notes.
|
||||
WP-14: Integration der zentralen Registry.
|
||||
WP-24c: Dynamische Ermittlung von edge_defaults aus dem Graph-Schema.
|
||||
VERSION: 2.5.0 (WP-24c: Dynamic Topology Integration)
|
||||
STATUS: Active
|
||||
"""
|
||||
from __future__ import annotations
|
||||
from typing import Any, Dict, Tuple, Optional
|
||||
import os
|
||||
import json
|
||||
import pathlib
|
||||
import hashlib
|
||||
|
||||
# Import der zentralen Registry-Logik
|
||||
from app.core.registry import load_type_registry
|
||||
# WP-24c: Zugriff auf das dynamische Graph-Schema
|
||||
from app.services.edge_registry import registry as edge_registry
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helper
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _as_dict(x) -> Dict[str, Any]:
|
||||
"""Versucht, ein Objekt in ein Dict zu überführen."""
|
||||
if isinstance(x, dict): return dict(x)
|
||||
out: Dict[str, Any] = {}
|
||||
for attr in ("frontmatter", "body", "id", "note_id", "title", "path", "tags", "type", "created", "modified", "date"):
|
||||
if hasattr(x, attr):
|
||||
val = getattr(x, attr)
|
||||
if val is not None: out[attr] = val
|
||||
if not out: out["raw"] = str(x)
|
||||
return out
|
||||
|
||||
def _ensure_list(x) -> list:
|
||||
"""Sichert String-Listen Integrität."""
|
||||
if x is None: return []
|
||||
if isinstance(x, list): return [str(i) for i in x]
|
||||
if isinstance(x, (set, tuple)): return [str(i) for i in x]
|
||||
return [str(x)]
|
||||
|
||||
def _compute_hash(content: str) -> str:
|
||||
"""SHA-256 Hash-Berechnung."""
|
||||
if not content: return ""
|
||||
return hashlib.sha256(content.encode("utf-8")).hexdigest()
|
||||
|
||||
def _get_hash_source_content(n: Dict[str, Any], mode: str) -> str:
|
||||
"""
|
||||
Generiert den Hash-Input-String basierend auf Body oder Metadaten.
|
||||
Inkludiert alle entscheidungsrelevanten Profil-Parameter.
|
||||
"""
|
||||
body = str(n.get("body") or "").strip()
|
||||
if mode == "body": return body
|
||||
if mode == "full":
|
||||
fm = n.get("frontmatter") or {}
|
||||
meta_parts = []
|
||||
# Alle Felder, die das Chunking oder Retrieval beeinflussen
|
||||
keys = [
|
||||
"title", "type", "status", "tags",
|
||||
"chunking_profile", "chunk_profile",
|
||||
"retriever_weight", "split_level", "strict_heading_split"
|
||||
]
|
||||
for k in sorted(keys):
|
||||
val = fm.get(k)
|
||||
if val is not None: meta_parts.append(f"{k}:{val}")
|
||||
return f"{'|'.join(meta_parts)}||{body}"
|
||||
return body
|
||||
|
||||
def _cfg_for_type(note_type: str, reg: dict) -> dict:
|
||||
"""Extrahiert Typ-spezifische Config aus der Registry."""
|
||||
if not isinstance(reg, dict): return {}
|
||||
types = reg.get("types") if isinstance(reg.get("types"), dict) else reg
|
||||
return types.get(note_type, {}) if isinstance(types, dict) else {}
|
||||
|
||||
def _cfg_defaults(reg: dict) -> dict:
|
||||
"""Extrahiert globale Default-Werte aus der Registry."""
|
||||
if not isinstance(reg, dict): return {}
|
||||
for key in ("defaults", "default", "global"):
|
||||
v = reg.get(key)
|
||||
if isinstance(v, dict): return v
|
||||
return {}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Haupt-API
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def make_note_payload(note: Any, *args, **kwargs) -> Dict[str, Any]:
    """
    Build the note payload including multi-hash and audit validation.

    WP-24c: Uses the EdgeRegistry to dynamically resolve typical edges.

    Keyword Args:
        types_cfg: Pre-loaded type registry; falls back to load_type_registry().
        hash_source: Label embedded in the hash keys (default "parsed").
        hash_normalize: Label embedded in the hash keys (default "canonical").
        file_path: Fallback source path when the note carries none.

    Returns:
        A JSON-serializable payload dict for the notes collection.
    """
    n = _as_dict(note)

    # Registry & context settings
    reg = kwargs.get("types_cfg") or load_type_registry()
    hash_source = kwargs.get("hash_source", "parsed")
    hash_normalize = kwargs.get("hash_normalize", "canonical")

    fm = n.get("frontmatter") or {}
    note_type = str(fm.get("type") or n.get("type") or "concept")

    cfg_type = _cfg_for_type(note_type, reg)
    cfg_def = _cfg_defaults(reg)
    ingest_cfg = reg.get("ingestion_settings", {})

    # --- retriever_weight audit ---
    default_rw = float(os.environ.get("MINDNET_DEFAULT_RETRIEVER_WEIGHT", 1.0))
    retriever_weight = fm.get("retriever_weight")
    if retriever_weight is None:
        retriever_weight = cfg_type.get("retriever_weight", cfg_def.get("retriever_weight", default_rw))
    try:
        retriever_weight = float(retriever_weight)
    except (TypeError, ValueError):
        # FIX: was a bare `except:` which also swallowed SystemExit /
        # KeyboardInterrupt; only conversion failures should fall back.
        retriever_weight = default_rw

    # --- chunk_profile audit ---
    chunk_profile = fm.get("chunking_profile") or fm.get("chunk_profile")
    if chunk_profile is None:
        chunk_profile = cfg_type.get("chunking_profile") or cfg_type.get("chunk_profile")
    if chunk_profile is None:
        chunk_profile = ingest_cfg.get("default_chunk_profile", cfg_def.get("chunking_profile", "sliding_standard"))

    # --- WP-24c: dynamic edge_defaults resolution ---
    # 1st priority: manual definition in the frontmatter
    edge_defaults = fm.get("edge_defaults")

    # 2nd priority: dynamic lookup of the 'typical edges' from the graph schema
    if edge_defaults is None:
        topology = edge_registry.get_topology_info(note_type, "any")
        edge_defaults = topology.get("typical", [])

    # 3rd priority: empty list when no schema entry exists
    edge_defaults = _ensure_list(edge_defaults)

    # --- base metadata ---
    note_id = n.get("note_id") or n.get("id") or fm.get("id")
    title = n.get("title") or fm.get("title") or ""
    path = n.get("path") or kwargs.get("file_path") or ""
    if isinstance(path, pathlib.Path):
        path = str(path)

    payload: Dict[str, Any] = {
        "note_id": note_id,
        "title": title,
        "type": note_type,
        "path": path,
        "retriever_weight": retriever_weight,
        "chunk_profile": chunk_profile,
        "edge_defaults": edge_defaults,
        "hashes": {}
    }

    # --- MULTI-HASH ---
    # Generates hashes for change detection (WP-15b)
    for mode in ["body", "full"]:
        content = _get_hash_source_content(n, mode)
        payload["hashes"][f"{mode}:{hash_source}:{hash_normalize}"] = _compute_hash(content)

    # Metadata enrichment (tags, aliases, timestamps)
    tags = fm.get("tags") or fm.get("keywords") or n.get("tags")
    if tags:
        payload["tags"] = _ensure_list(tags)

    aliases = fm.get("aliases")
    if aliases:
        payload["aliases"] = _ensure_list(aliases)

    for k in ("created", "modified", "date"):
        v = fm.get(k) or n.get(k)
        if v:
            payload[k] = str(v)

    if n.get("body"):
        payload["fulltext"] = str(n["body"])

    # Final JSON round-trip guarantees the payload is serializable.
    json.loads(json.dumps(payload, ensure_ascii=False))

    return payload
|
||||
652
app/core/ingestion/ingestion_processor.py
Normal file
652
app/core/ingestion/ingestion_processor.py
Normal file
|
|
@ -0,0 +1,652 @@
|
|||
"""
|
||||
FILE: app/core/ingestion/ingestion_processor.py
|
||||
DESCRIPTION: Der zentrale IngestionService (Orchestrator).
|
||||
WP-25a: Integration der Mixture of Experts (MoE) Architektur.
|
||||
WP-15b: Two-Pass Workflow mit globalem Kontext-Cache.
|
||||
WP-20/22: Cloud-Resilienz und Content-Lifecycle integriert.
|
||||
AUDIT v4.2.4:
|
||||
- GOLD-STANDARD v4.2.4: Hash-basierte Change-Detection (MINDNET_CHANGE_DETECTION_MODE).
|
||||
- Wiederherstellung des iterativen Abgleichs basierend auf Inhalts-Hashes.
|
||||
- Phase 2 verwendet exakt dieselbe ID-Generierung wie Phase 1 (inkl. target_section).
|
||||
- Authority-Check in Phase 2 prüft mit konsistenter ID-Generierung.
|
||||
- Eliminiert Duplikate durch inkonsistente ID-Generierung (Steinzeitaxt-Problem).
|
||||
VERSION: 4.2.4 (WP-24c: Hash-Integrität)
|
||||
STATUS: Active
|
||||
"""
|
||||
import logging
|
||||
import asyncio
|
||||
import os
|
||||
import re
|
||||
from typing import Dict, List, Optional, Tuple, Any
|
||||
|
||||
# Core Module Imports
|
||||
from app.core.parser import (
|
||||
read_markdown, pre_scan_markdown, normalize_frontmatter,
|
||||
validate_required_frontmatter, NoteContext
|
||||
)
|
||||
from app.core.chunking import assemble_chunks
|
||||
# WP-24c: Import der zentralen Identitäts-Logik
|
||||
from app.core.graph.graph_utils import _mk_edge_id
|
||||
|
||||
# Datenbank-Ebene (Modularisierte database-Infrastruktur)
|
||||
from app.core.database.qdrant import QdrantConfig, get_client, ensure_collections, ensure_payload_indexes
|
||||
from app.core.database.qdrant_points import points_for_chunks, points_for_note, points_for_edges, upsert_batch
|
||||
from qdrant_client.http import models as rest
|
||||
|
||||
# Services
|
||||
from app.services.embeddings_client import EmbeddingsClient
|
||||
from app.services.edge_registry import registry as edge_registry
|
||||
from app.services.llm_service import LLMService
|
||||
|
||||
# Package-Interne Imports (Refactoring WP-14)
|
||||
from .ingestion_utils import load_type_registry, resolve_note_type, get_chunk_config_by_profile
|
||||
from .ingestion_db import fetch_note_payload, artifacts_missing, purge_artifacts, is_explicit_edge_present
|
||||
from .ingestion_validation import validate_edge_candidate
|
||||
from .ingestion_note_payload import make_note_payload
|
||||
from .ingestion_chunk_payload import make_chunk_payloads
|
||||
|
||||
# Fallback für Edges (Struktur-Verknüpfung)
|
||||
try:
|
||||
from app.core.graph.graph_derive_edges import build_edges_for_note
|
||||
except ImportError:
|
||||
def build_edges_for_note(*args, **kwargs): return []
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class IngestionService:
|
||||
def __init__(self, collection_prefix: str = None):
    """Initialize the service using the central database infrastructure.

    Sets up settings, Qdrant config/client, type registry, embeddings and
    LLM services, resolves the embedding vector dimension, and best-effort
    creates collections and payload indexes (failures only log a warning).

    Args:
        collection_prefix: Optional collection prefix override; defaults
            to settings.COLLECTION_PREFIX when None.
    """
    # NOTE(review): local import — presumably avoids a circular import at
    # module load time; confirm against app.config.
    from app.config import get_settings
    self.settings = get_settings()

    # --- LOGGING CLEANUP ---
    # Suppresses library noise while keeping meaningful service logs.
    for lib in ["httpx", "httpcore", "qdrant_client", "urllib3", "openai"]:
        logging.getLogger(lib).setLevel(logging.WARNING)

    self.prefix = collection_prefix or self.settings.COLLECTION_PREFIX
    self.cfg = QdrantConfig.from_env()
    self.cfg.prefix = self.prefix
    self.client = get_client(self.cfg)

    self.registry = load_type_registry()
    self.embedder = EmbeddingsClient()
    self.llm = LLMService()

    # WP-25a: resolve the vector dimension via the embedding profile (MoE);
    # falls back to the configured VECTOR_SIZE when the profile has none.
    embed_cfg = self.llm.profiles.get("embedding_expert", {})
    self.dim = embed_cfg.get("dimensions") or self.settings.VECTOR_SIZE

    # Hash mode driving content change detection during process_file.
    self.active_hash_mode = self.settings.CHANGE_DETECTION_MODE

    # WP-15b: context memory for ID resolution (global cache)
    self.batch_cache: Dict[str, NoteContext] = {}

    # WP-24c: buffer for phase 2 (symmetry injection at the end of the whole import)
    self.symmetry_buffer: List[Dict[str, Any]] = []

    try:
        ensure_collections(self.client, self.prefix, self.dim)
        ensure_payload_indexes(self.client, self.prefix)
    except Exception as e:
        logger.warning(f"DB initialization warning: {e}")
|
||||
|
||||
def _log_id_collision(
    self,
    note_id: str,
    existing_path: str,
    conflicting_path: str,
    action: str = "ERROR"
) -> None:
    """
    WP-24c v4.5.10: Log ID collisions to a dedicated log file.

    Writes every collision to logs/id_collisions.log for manual analysis.
    Format: JSONL (one collision per line) with all relevant metadata.
    Write failures are downgraded to a warning (best effort).

    Args:
        note_id: The duplicated note_id.
        existing_path: Path of the file already registered under this id.
        conflicting_path: Path of the colliding file.
        action: Chosen action (e.g. "ERROR", "SKIPPED").
    """
    import json
    from datetime import datetime

    # FIX: makedirs(exist_ok=True) replaces the exists()+makedirs() pair,
    # closing the check-then-create race under concurrent runs.
    log_dir = "logs"
    os.makedirs(log_dir, exist_ok=True)

    log_file = os.path.join(log_dir, "id_collisions.log")

    # Assemble the entry with all information relevant for forensics.
    log_entry = {
        "timestamp": datetime.now().isoformat(),
        "note_id": note_id,
        "existing_file": {
            "path": existing_path,
            "filename": os.path.basename(existing_path) if existing_path else None
        },
        "conflicting_file": {
            "path": conflicting_path,
            "filename": os.path.basename(conflicting_path) if conflicting_path else None
        },
        "action": action,
        "collection_prefix": self.prefix
    }

    # Append as JSONL (one line per entry).
    try:
        with open(log_file, "a", encoding="utf-8") as f:
            f.write(json.dumps(log_entry, ensure_ascii=False) + "\n")
    except Exception as e:
        logger.warning(f"⚠️ Konnte ID-Kollision nicht in Log-Datei schreiben: {e}")
|
||||
|
||||
def _persist_rejected_edges(self, note_id: str, rejected_edges: List[Dict[str, Any]]) -> None:
    """
    WP-24c v4.5.9: Persist rejected edges for audit purposes.

    Appends rejected_edges as JSONL to logs/rejected_edges.log, enabling
    analysis of rejection reasons and improvement of validation logic.
    No-op for an empty list; write failures are logged as errors.

    Args:
        note_id: ID of the note the rejected edges belong to.
        rejected_edges: List of rejected edge dicts.
    """
    if not rejected_edges:
        return

    import json
    from datetime import datetime
    # FIX: removed the redundant local `import os` (os is imported at
    # module level) and the exists()+makedirs() race via exist_ok=True.

    log_dir = "logs"
    os.makedirs(log_dir, exist_ok=True)

    log_file = os.path.join(log_dir, "rejected_edges.log")

    # WP-24c v4.5.9: write as JSONL (one edge per line)
    try:
        with open(log_file, "a", encoding="utf-8") as f:
            for edge in rejected_edges:
                log_entry = {
                    "timestamp": datetime.now().isoformat(),
                    "note_id": note_id,
                    "edge": {
                        "kind": edge.get("kind", "unknown"),
                        "source_id": edge.get("source_id", "unknown"),
                        "target_id": edge.get("target_id") or edge.get("to", "unknown"),
                        "scope": edge.get("scope", "unknown"),
                        "provenance": edge.get("provenance", "unknown"),
                        "rule_id": edge.get("rule_id", "unknown"),
                        "confidence": edge.get("confidence", 0.0),
                        "target_section": edge.get("target_section")
                    }
                }
                f.write(json.dumps(log_entry, ensure_ascii=False) + "\n")

        logger.debug(f"📝 [AUDIT] {len(rejected_edges)} abgelehnte Kanten für '{note_id}' in {log_file} gespeichert")
    except Exception as e:
        logger.error(f"❌ [AUDIT] Fehler beim Speichern der rejected_edges: {e}")
|
||||
|
||||
def _is_valid_id(self, text: Optional[str]) -> bool:
|
||||
"""WP-24c: Prüft IDs auf fachliche Validität (Ghost-ID Schutz)."""
|
||||
if not text or not isinstance(text, str) or len(text.strip()) < 2:
|
||||
return False
|
||||
blacklisted = {"none", "unknown", "insight", "source", "task", "project", "person", "concept"}
|
||||
if text.lower().strip() in blacklisted:
|
||||
return False
|
||||
return True
|
||||
|
||||
async def run_batch(self, file_paths: List[str], vault_root: str) -> Dict[str, Any]:
    """
    WP-15b: Phase 1 of the two-pass workflow.

    Pre-scans all files to fill the context cache, then processes each
    file writing ONLY user authority (explicit edges). Symmetries are
    buffered for phase 2 (see commit_vault_symmetries).
    """
    self.batch_cache.clear()
    logger.info(f"--- 🔍 START BATCH PHASE 1 ({len(file_paths)} Dateien) ---")

    # Step 1: pre-scan — cache each context under note id, title, and
    # the bare filename stem so later ID resolution finds it.
    for path in file_paths:
        try:
            ctx = pre_scan_markdown(path, registry=self.registry)
            if ctx:
                stem = os.path.splitext(os.path.basename(path))[0]
                for cache_key in (ctx.note_id, ctx.title, stem):
                    self.batch_cache[cache_key] = ctx
        except Exception as e:
            logger.warning(f" ⚠️ Pre-scan fehlgeschlagen für {path}: {e}")

    # Step 2: batch processing (authority only)
    processed_count = 0
    success_count = 0
    for p in file_paths:
        processed_count += 1
        res = await self.process_file(p, vault_root, apply=True, purge_before=True)
        if res.get("status") == "success":
            success_count += 1

    logger.info(f"--- ✅ Batch Phase 1 abgeschlossen ({success_count}/{processed_count}) ---")
    return {
        "status": "success",
        "processed": processed_count,
        "success": success_count,
        "buffered_symmetries": len(self.symmetry_buffer),
    }
|
||||
|
||||
async def commit_vault_symmetries(self) -> Dict[str, Any]:
    """
    WP-24c: Execute PHASE 2 (global symmetry injection).

    Called once at the end of the whole import: re-validates every
    buffered inverse edge against the live DB and writes only those not
    shadowed by an explicit user edge. Clears the buffer afterwards.
    """
    if not self.symmetry_buffer:
        return {"status": "skipped", "reason": "buffer_empty"}

    logger.info(f"🔄 PHASE 2: Validiere {len(self.symmetry_buffer)} Symmetrien gegen Live-DB...")
    accepted: List[Dict[str, Any]] = []
    for edge in self.symmetry_buffer:
        # WP-24c v4.1.0: identity parameters (source_id wins over note_id)
        src = edge.get("source_id") or edge.get("note_id")
        tgt = edge.get("target_id")
        kind = edge.get("kind")
        edge_scope = edge.get("scope", "note")
        target_section = edge.get("target_section")  # WP-24c v4.1.0

        if not all([src, tgt, kind]):
            continue

        # GOLD-STANDARD v4.1.0: the ID must be generated exactly as in
        # phase 1 (including target_section) so the authority check below
        # matches the IDs that phase 1 actually wrote.
        try:
            edge_uuid = _mk_edge_id(kind, src, tgt, edge_scope, target_section=target_section)
        except ValueError:
            continue

        # AUTHORITY CHECK: write only when no manual edge owns this ID.
        if is_explicit_edge_present(self.client, self.prefix, edge_uuid):
            logger.info(f" 🛡️ [PROTECTED] Manuelle Kante gefunden. Symmetrie für {kind} unterdrückt.")
        else:
            accepted.append(edge)
            section_info = f" (section: {target_section})" if target_section else ""
            logger.info(f" 🔄 [SYMMETRY] Add inverse: {src} --({kind})--> {tgt}{section_info}")

    if accepted:
        col, pts = points_for_edges(self.prefix, accepted)
        upsert_batch(self.client, col, pts, wait=True)

    count = len(accepted)
    self.symmetry_buffer.clear()
    return {"status": "success", "added": count}
|
||||
|
||||
async def process_file(self, file_path: str, vault_root: str, **kwargs) -> Dict[str, Any]:
|
||||
"""
|
||||
Transformiert eine Markdown-Datei (Phase 1).
|
||||
Schreibt Notes/Chunks/Explicit Edges sofort.
|
||||
"""
|
||||
apply = kwargs.get("apply", False)
|
||||
force_replace = kwargs.get("force_replace", False)
|
||||
purge_before = kwargs.get("purge_before", False)
|
||||
|
||||
result = {"path": file_path, "status": "skipped", "changed": False, "error": None}
|
||||
|
||||
try:
|
||||
# Ordner-Filter (.trash / .obsidian)
|
||||
if ".trash" in file_path or any(part.startswith('.') for part in file_path.split(os.sep)):
|
||||
return {**result, "status": "skipped", "reason": "ignored_folder"}
|
||||
|
||||
# WP-24c v4.5.9: Path-Normalization für konsistente Hash-Prüfung
|
||||
# Normalisiere file_path zu absolutem Pfad für konsistente Verarbeitung
|
||||
normalized_file_path = os.path.abspath(file_path) if not os.path.isabs(file_path) else file_path
|
||||
|
||||
parsed = read_markdown(normalized_file_path)
|
||||
if not parsed: return {**result, "error": "Empty file"}
|
||||
fm = normalize_frontmatter(parsed.frontmatter)
|
||||
validate_required_frontmatter(fm)
|
||||
|
||||
note_pl = make_note_payload(parsed, vault_root=vault_root, file_path=normalized_file_path, types_cfg=self.registry)
|
||||
note_id = note_pl.get("note_id")
|
||||
|
||||
if not note_id:
|
||||
return {**result, "status": "error", "error": "missing_id"}
|
||||
|
||||
logger.info(f"📄 Bearbeite: '{note_id}' | Pfad: {normalized_file_path} | Title: {note_pl.get('title', 'N/A')}")
|
||||
|
||||
# WP-24c v4.5.9: Strikte Change Detection (Hash-basierte Inhaltsprüfung)
|
||||
# Prüft Hash VOR der Verarbeitung, um redundante Ingestion zu vermeiden
|
||||
old_payload = None if force_replace else fetch_note_payload(self.client, self.prefix, note_id)
|
||||
|
||||
# WP-24c v4.5.10: Prüfe auf ID-Kollisionen (zwei Dateien mit derselben note_id)
|
||||
if old_payload and not force_replace:
|
||||
old_path = old_payload.get("path", "")
|
||||
if old_path and old_path != normalized_file_path:
|
||||
# ID-Kollision erkannt: Zwei verschiedene Dateien haben dieselbe note_id
|
||||
# Logge die Kollision in dedizierte Log-Datei
|
||||
self._log_id_collision(
|
||||
note_id=note_id,
|
||||
existing_path=old_path,
|
||||
conflicting_path=normalized_file_path,
|
||||
action="ERROR"
|
||||
)
|
||||
logger.error(
|
||||
f"❌ [ID-KOLLISION] Kritischer Fehler: Die note_id '{note_id}' wird bereits von einer anderen Datei verwendet!\n"
|
||||
f" Bereits vorhanden: '{old_path}'\n"
|
||||
f" Konflikt mit: '{normalized_file_path}'\n"
|
||||
f" Lösung: Bitte ändern Sie die 'id' im Frontmatter einer der beiden Dateien, um eine eindeutige ID zu gewährleisten.\n"
|
||||
f" Details wurden in logs/id_collisions.log gespeichert."
|
||||
)
|
||||
return {**result, "status": "error", "error": "id_collision", "note_id": note_id, "existing_path": old_path, "conflicting_path": normalized_file_path}
|
||||
|
||||
logger.debug(f"🔍 [CHANGE-DETECTION] Start für '{note_id}': force_replace={force_replace}, old_payload={old_payload is not None}")
|
||||
|
||||
content_changed = True
|
||||
hash_match = False
|
||||
if old_payload and not force_replace:
|
||||
# Nutzt die über MINDNET_CHANGE_DETECTION_MODE gesteuerte Genauigkeit
|
||||
# Mapping: 'full' -> 'full:parsed:canonical', 'body' -> 'body:parsed:canonical'
|
||||
h_key = f"{self.active_hash_mode or 'full'}:parsed:canonical"
|
||||
new_h = note_pl.get("hashes", {}).get(h_key)
|
||||
old_h = old_payload.get("hashes", {}).get(h_key)
|
||||
|
||||
# WP-24c v4.5.9-DEBUG: Detaillierte Hash-Diagnose (INFO-Level)
|
||||
logger.info(f"🔍 [CHANGE-DETECTION] Hash-Vergleich für '{note_id}':")
|
||||
logger.debug(f" -> Hash-Key: '{h_key}'")
|
||||
logger.debug(f" -> Active Hash-Mode: '{self.active_hash_mode or 'full'}'")
|
||||
logger.debug(f" -> New Hash vorhanden: {bool(new_h)}")
|
||||
logger.debug(f" -> Old Hash vorhanden: {bool(old_h)}")
|
||||
if new_h:
|
||||
logger.debug(f" -> New Hash (erste 32 Zeichen): {new_h[:32]}...")
|
||||
if old_h:
|
||||
logger.debug(f" -> Old Hash (erste 32 Zeichen): {old_h[:32]}...")
|
||||
logger.debug(f" -> Verfügbare Hash-Keys in new: {list(note_pl.get('hashes', {}).keys())}")
|
||||
logger.debug(f" -> Verfügbare Hash-Keys in old: {list(old_payload.get('hashes', {}).keys())}")
|
||||
|
||||
if new_h and old_h:
|
||||
hash_match = (new_h == old_h)
|
||||
if hash_match:
|
||||
content_changed = False
|
||||
logger.info(f"🔍 [CHANGE-DETECTION] ✅ Hash identisch für '{note_id}': {h_key} = {new_h[:16]}...")
|
||||
else:
|
||||
logger.warning(f"🔍 [CHANGE-DETECTION] ❌ Hash geändert für '{note_id}': alt={old_h[:16]}..., neu={new_h[:16]}...")
|
||||
# Finde erste unterschiedliche Position
|
||||
diff_pos = next((i for i, (a, b) in enumerate(zip(new_h, old_h)) if a != b), None)
|
||||
if diff_pos is not None:
|
||||
logger.debug(f" -> Hash-Unterschied: Erste unterschiedliche Position: {diff_pos}")
|
||||
else:
|
||||
logger.debug(f" -> Hash-Unterschied: Längen unterschiedlich (new={len(new_h)}, old={len(old_h)})")
|
||||
|
||||
# WP-24c v4.5.10: Logge Hash-Input für Diagnose (DEBUG-Level)
|
||||
# WICHTIG: _get_hash_source_content benötigt ein Dictionary, nicht das ParsedNote-Objekt!
|
||||
from app.core.ingestion.ingestion_note_payload import _get_hash_source_content, _as_dict
|
||||
hash_mode = self.active_hash_mode or 'full'
|
||||
# Konvertiere parsed zu Dictionary für _get_hash_source_content
|
||||
parsed_dict = _as_dict(parsed)
|
||||
hash_input = _get_hash_source_content(parsed_dict, hash_mode)
|
||||
logger.debug(f" -> Hash-Input (erste 200 Zeichen): {hash_input[:200]}...")
|
||||
logger.debug(f" -> Hash-Input Länge: {len(hash_input)}")
|
||||
|
||||
# WP-24c v4.5.10: Vergleiche auch Body-Länge und Frontmatter (DEBUG-Level)
|
||||
# Verwende parsed.body statt note_pl.get("body")
|
||||
new_body = str(getattr(parsed, "body", "") or "").strip()
|
||||
old_body = str(old_payload.get("body", "")).strip() if old_payload else ""
|
||||
logger.debug(f" -> Body-Länge: new={len(new_body)}, old={len(old_body)}")
|
||||
if len(new_body) != len(old_body):
|
||||
logger.debug(f" -> ⚠️ Body-Länge unterschiedlich! Mögliche Ursache: Parsing-Unterschiede")
|
||||
|
||||
# Verwende parsed.frontmatter statt note_pl.get("frontmatter")
|
||||
new_fm = getattr(parsed, "frontmatter", {}) or {}
|
||||
old_fm = old_payload.get("frontmatter", {}) if old_payload else {}
|
||||
logger.debug(f" -> Frontmatter-Keys: new={sorted(new_fm.keys())}, old={sorted(old_fm.keys())}")
|
||||
# Prüfe relevante Frontmatter-Felder
|
||||
relevant_keys = ["title", "type", "status", "tags", "chunking_profile", "chunk_profile", "retriever_weight", "split_level", "strict_heading_split"]
|
||||
for key in relevant_keys:
|
||||
new_val = new_fm.get(key) if isinstance(new_fm, dict) else getattr(new_fm, key, None)
|
||||
old_val = old_fm.get(key) if isinstance(old_fm, dict) else None
|
||||
if new_val != old_val:
|
||||
logger.debug(f" -> ⚠️ Frontmatter '{key}' unterschiedlich: new={new_val}, old={old_val}")
|
||||
else:
|
||||
# WP-24c v4.5.10: Wenn Hash fehlt, als geändert behandeln (Sicherheit)
|
||||
logger.debug(f"⚠️ [CHANGE-DETECTION] Hash fehlt für '{note_id}': new_h={bool(new_h)}, old_h={bool(old_h)}")
|
||||
logger.debug(f" -> Grund: Hash wird als 'geändert' behandelt, da Hash-Werte fehlen")
|
||||
else:
|
||||
if force_replace:
|
||||
logger.debug(f"🔍 [CHANGE-DETECTION] '{note_id}': force_replace=True -> überspringe Hash-Check")
|
||||
elif not old_payload:
|
||||
logger.debug(f"🔍 [CHANGE-DETECTION] '{note_id}': ⚠️ Keine alte Payload gefunden -> erste Verarbeitung oder gelöscht")
|
||||
|
||||
# WP-24c v4.5.9: Strikte Logik - überspringe komplett wenn Hash identisch
|
||||
# WICHTIG: Artifact-Check NACH Hash-Check, da purge_before die Artefakte löschen kann
|
||||
# Wenn Hash identisch ist, sind die Artefakte entweder vorhanden oder werden gerade neu geschrieben
|
||||
if not force_replace and hash_match and old_payload:
|
||||
# WP-24c v4.5.9: Hash identisch -> überspringe komplett (auch wenn Artefakte nach PURGE fehlen)
|
||||
# Der Hash ist die autoritative Quelle für "Inhalt unverändert"
|
||||
# Artefakte werden beim nächsten normalen Import wieder erstellt, wenn nötig
|
||||
logger.info(f"⏭️ [SKIP] '{note_id}' unverändert (Hash identisch - überspringe komplett, auch wenn Artefakte fehlen)")
|
||||
return {**result, "status": "unchanged", "note_id": note_id, "reason": "hash_identical"}
|
||||
elif not force_replace and old_payload and not hash_match:
|
||||
# WP-24c v4.5.10: Hash geändert - erlaube Verarbeitung (DEBUG-Level)
|
||||
logger.debug(f"🔍 [CHANGE-DETECTION] '{note_id}': Hash geändert -> erlaube Verarbeitung")
|
||||
|
||||
# WP-24c v4.5.10: Hash geändert oder keine alte Payload - prüfe Artefakte für normale Verarbeitung
|
||||
c_miss, e_miss = artifacts_missing(self.client, self.prefix, note_id)
|
||||
logger.debug(f"🔍 [CHANGE-DETECTION] '{note_id}': Artifact-Check: c_miss={c_miss}, e_miss={e_miss}")
|
||||
|
||||
if not apply:
|
||||
return {**result, "status": "dry-run", "changed": True, "note_id": note_id}
|
||||
|
||||
# Chunks & MoE
|
||||
profile = note_pl.get("chunk_profile", "sliding_standard")
|
||||
note_type = resolve_note_type(self.registry, fm.get("type"))
|
||||
chunk_cfg = get_chunk_config_by_profile(self.registry, profile, note_type)
|
||||
enable_smart = chunk_cfg.get("enable_smart_edge_allocation", False)
|
||||
chunks = await assemble_chunks(note_id, getattr(parsed, "body", ""), note_type, config=chunk_cfg)
|
||||
|
||||
# WP-24c v4.5.8: Validierung in Chunk-Schleife entfernt
|
||||
# Alle candidate: Kanten werden jetzt in Phase 3 (nach build_edges_for_note) validiert
|
||||
# Dies stellt sicher, dass auch Note-Scope Kanten aus LLM-Validierungs-Zonen geprüft werden
|
||||
# Der candidate_pool wird unverändert weitergegeben, damit build_edges_for_note alle Kanten erkennt
|
||||
# WP-24c v4.5.8: Nur ID-Validierung bleibt (Ghost-ID Schutz), keine LLM-Validierung mehr hier
|
||||
for ch in chunks:
|
||||
new_pool = []
|
||||
for cand in getattr(ch, "candidate_pool", []):
|
||||
# WP-24c v4.5.8: Nur ID-Validierung (Ghost-ID Schutz)
|
||||
t_id = cand.get('target_id') or cand.get('to') or cand.get('note_id')
|
||||
if not self._is_valid_id(t_id):
|
||||
continue
|
||||
# WP-24c v4.5.8: Alle Kanten gehen durch - LLM-Validierung erfolgt in Phase 3
|
||||
new_pool.append(cand)
|
||||
ch.candidate_pool = new_pool
|
||||
|
||||
# chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks, file_path=file_path, types_cfg=self.registry)
|
||||
# v4.2.8 Fix C: Explizite Übergabe des Profil-Namens für den Chunk-Payload
|
||||
chunk_pls = make_chunk_payloads(fm, note_pl["path"], chunks, file_path=file_path, types_cfg=self.registry, chunk_profile=profile)
|
||||
|
||||
vecs = await self.embedder.embed_documents([c.get("window") or "" for c in chunk_pls]) if chunk_pls else []
|
||||
|
||||
# WP-24c v4.2.0: Kanten-Extraktion mit Note-Scope Zonen Support
|
||||
# Übergabe des Original-Markdown-Texts für Note-Scope Zonen-Extraktion
|
||||
markdown_body = getattr(parsed, "body", "")
|
||||
raw_edges = build_edges_for_note(
|
||||
note_id,
|
||||
chunk_pls,
|
||||
note_level_references=note_pl.get("references", []),
|
||||
markdown_body=markdown_body
|
||||
)
|
||||
|
||||
# WP-24c v4.5.8: Phase 3 - Finaler Validierungs-Gate für candidate: Kanten
|
||||
# Prüfe alle Kanten mit rule_id ODER provenance beginnend mit "candidate:"
|
||||
# Dies schließt alle Kandidaten ein, unabhängig von ihrer Herkunft (global_pool, explicit:callout, etc.)
|
||||
|
||||
# WP-24c v4.5.8: Kontext-Optimierung für Note-Scope Kanten
|
||||
# Aggregiere den gesamten Note-Text für bessere Validierungs-Entscheidungen
|
||||
note_text = markdown_body or " ".join([c.get("text", "") or c.get("window", "") for c in chunk_pls])
|
||||
# Erstelle eine Note-Summary aus den wichtigsten Chunks (für bessere Kontext-Qualität)
|
||||
note_summary = " ".join([c.get("window", "") or c.get("text", "") for c in chunk_pls[:5]]) # Top 5 Chunks
|
||||
|
||||
validated_edges = []
|
||||
rejected_edges = []
|
||||
|
||||
for e in raw_edges:
|
||||
rule_id = e.get("rule_id", "")
|
||||
provenance = e.get("provenance", "")
|
||||
|
||||
# WP-24c v4.5.8: Trigger-Kriterium - rule_id ODER provenance beginnt mit "candidate:"
|
||||
is_candidate = (rule_id and rule_id.startswith("candidate:")) or (provenance and provenance.startswith("candidate:"))
|
||||
|
||||
if is_candidate:
|
||||
# Extrahiere target_id für Validierung (aus verschiedenen möglichen Feldern)
|
||||
target_id = e.get("target_id") or e.get("to")
|
||||
if not target_id:
|
||||
# Fallback: Versuche aus Payload zu extrahieren
|
||||
payload = e.get("extra", {}) if isinstance(e.get("extra"), dict) else {}
|
||||
target_id = payload.get("target_id") or payload.get("to")
|
||||
|
||||
if not target_id:
|
||||
logger.warning(f"⚠️ [PHASE 3] Keine target_id gefunden für Kante: {e}")
|
||||
rejected_edges.append(e)
|
||||
continue
|
||||
|
||||
kind = e.get("kind", "related_to")
|
||||
source_id = e.get("source_id", note_id)
|
||||
scope = e.get("scope", "chunk")
|
||||
|
||||
# WP-24c v4.5.8: Kontext-Optimierung für Note-Scope Kanten
|
||||
# Für scope: note verwende Note-Summary oder gesamten Note-Text
|
||||
# Für scope: chunk verwende den spezifischen Chunk-Text (falls verfügbar)
|
||||
if scope == "note":
|
||||
validation_text = note_summary or note_text
|
||||
context_info = "Note-Scope (aggregiert)"
|
||||
else:
|
||||
# Für Chunk-Scope: Versuche Chunk-Text zu finden, sonst Note-Text
|
||||
chunk_id = e.get("chunk_id") or source_id
|
||||
chunk_text = None
|
||||
for ch in chunk_pls:
|
||||
if ch.get("chunk_id") == chunk_id or ch.get("id") == chunk_id:
|
||||
chunk_text = ch.get("text") or ch.get("window", "")
|
||||
break
|
||||
validation_text = chunk_text or note_text
|
||||
context_info = f"Chunk-Scope ({chunk_id})"
|
||||
|
||||
# Erstelle Edge-Dict für Validierung (kompatibel mit validate_edge_candidate)
|
||||
edge_for_validation = {
|
||||
"kind": kind,
|
||||
"to": target_id, # validate_edge_candidate erwartet "to"
|
||||
"target_id": target_id,
|
||||
"provenance": provenance if not provenance.startswith("candidate:") else provenance.replace("candidate:", "").strip(),
|
||||
"confidence": e.get("confidence", 0.9)
|
||||
}
|
||||
|
||||
logger.info(f"🚀 [PHASE 3] Validierung: {source_id} -> {target_id} ({kind}) | Scope: {scope} | Kontext: {context_info}")
|
||||
|
||||
# WP-24c v4.5.8: Validiere gegen optimierten Kontext
|
||||
is_valid = await validate_edge_candidate(
|
||||
chunk_text=validation_text,
|
||||
edge=edge_for_validation,
|
||||
batch_cache=self.batch_cache,
|
||||
llm_service=self.llm,
|
||||
profile_name="ingest_validator"
|
||||
)
|
||||
|
||||
if is_valid:
|
||||
# WP-24c v4.5.8: Entferne candidate: Präfix (Kante wird zum Fakt)
|
||||
new_rule_id = rule_id.replace("candidate:", "").strip() if rule_id else provenance.replace("candidate:", "").strip() if provenance.startswith("candidate:") else provenance
|
||||
if not new_rule_id:
|
||||
new_rule_id = e.get("provenance", "explicit").replace("candidate:", "").strip()
|
||||
|
||||
# Aktualisiere rule_id und provenance im Edge
|
||||
e["rule_id"] = new_rule_id
|
||||
if provenance.startswith("candidate:"):
|
||||
e["provenance"] = provenance.replace("candidate:", "").strip()
|
||||
|
||||
validated_edges.append(e)
|
||||
logger.info(f"✅ [PHASE 3] VERIFIED: {source_id} -> {target_id} ({kind}) | rule_id: {new_rule_id}")
|
||||
else:
|
||||
# WP-24c v4.5.8: Kante ablehnen (nicht zu validated_edges hinzufügen)
|
||||
rejected_edges.append(e)
|
||||
logger.info(f"🚫 [PHASE 3] REJECTED: {source_id} -> {target_id} ({kind})")
|
||||
else:
|
||||
# WP-24c v4.5.8: Keine candidate: Kante -> direkt übernehmen
|
||||
validated_edges.append(e)
|
||||
|
||||
# WP-24c v4.5.8: Phase 3 abgeschlossen - rejected_edges werden NICHT weiterverarbeitet
|
||||
# WP-24c v4.5.9: Persistierung von rejected_edges für Audit-Zwecke
|
||||
if rejected_edges:
|
||||
logger.info(f"🚫 [PHASE 3] {len(rejected_edges)} Kanten abgelehnt und werden nicht in die DB geschrieben")
|
||||
self._persist_rejected_edges(note_id, rejected_edges)
|
||||
|
||||
# WP-24c v4.5.8: Verwende validated_edges statt raw_edges für weitere Verarbeitung
|
||||
# Nur verified Kanten (ohne candidate: Präfix) werden in Phase 2 (Symmetrie) verarbeitet
|
||||
explicit_edges = []
|
||||
for e in validated_edges:
|
||||
t_raw = e.get("target_id")
|
||||
t_ctx = self.batch_cache.get(t_raw)
|
||||
t_id = t_ctx.note_id if t_ctx else t_raw
|
||||
|
||||
if not self._is_valid_id(t_id): continue
|
||||
|
||||
resolved_kind = edge_registry.resolve(e.get("kind", "related_to"), provenance="explicit")
|
||||
# WP-24c v4.1.0: target_section aus dem Edge-Payload extrahieren und beibehalten
|
||||
target_section = e.get("target_section")
|
||||
e.update({
|
||||
"kind": resolved_kind,
|
||||
"relation": resolved_kind, # Konsistenz: kind und relation identisch
|
||||
"target_id": t_id,
|
||||
"source_id": e.get("source_id") or note_id, # Sicherstellen, dass source_id gesetzt ist
|
||||
"origin_note_id": note_id,
|
||||
"virtual": False
|
||||
})
|
||||
explicit_edges.append(e)
|
||||
|
||||
# Symmetrie puffern (WP-24c v4.1.0: Korrekte Symmetrie-Integrität)
|
||||
inv_kind = edge_registry.get_inverse(resolved_kind)
|
||||
if inv_kind and t_id != note_id:
|
||||
# GOLD-STANDARD v4.1.0: Symmetrie-Integrität
|
||||
v_edge = {
|
||||
"note_id": t_id, # Besitzer-Wechsel: Symmetrie gehört zum Link-Ziel
|
||||
"source_id": t_id, # Neue Quelle ist das Link-Ziel
|
||||
"target_id": note_id, # Ziel ist die ursprüngliche Quelle
|
||||
"kind": inv_kind, # Inverser Kanten-Typ
|
||||
"relation": inv_kind, # Konsistenz: kind und relation identisch
|
||||
"scope": "note", # Symmetrien sind immer Note-Level
|
||||
"virtual": True,
|
||||
"origin_note_id": note_id, # Tracking: Woher kommt die Symmetrie
|
||||
}
|
||||
# target_section beibehalten, falls vorhanden (für Section-Links)
|
||||
if target_section:
|
||||
v_edge["target_section"] = target_section
|
||||
self.symmetry_buffer.append(v_edge)
|
||||
|
||||
# DB Upsert
|
||||
if purge_before and old_payload: purge_artifacts(self.client, self.prefix, note_id)
|
||||
|
||||
col_n, pts_n = points_for_note(self.prefix, note_pl, None, self.dim)
|
||||
upsert_batch(self.client, col_n, pts_n, wait=True)
|
||||
|
||||
if chunk_pls and vecs:
|
||||
col_c, pts_c = points_for_chunks(self.prefix, chunk_pls, vecs)
|
||||
upsert_batch(self.client, col_c, pts_c, wait=True)
|
||||
|
||||
if explicit_edges:
|
||||
col_e, pts_e = points_for_edges(self.prefix, explicit_edges)
|
||||
upsert_batch(self.client, col_e, pts_e, wait=True)
|
||||
|
||||
logger.info(f" ✨ Phase 1 fertig: {len(explicit_edges)} explizite Kanten für '{note_id}'.")
|
||||
return {"status": "success", "note_id": note_id}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"❌ Fehler bei {file_path}: {e}", exc_info=True)
|
||||
return {**result, "status": "error", "error": str(e)}
|
||||
|
||||
async def create_from_text(self, markdown_content: str, filename: str, vault_root: str, folder: str = "00_Inbox") -> Dict[str, Any]:
    """Write markdown text to a new vault file and ingest it immediately.

    The note is stored under ``<vault_root>/<folder>/<filename>`` and then fed
    through ``process_file`` with force-replace/purge semantics.
    """
    destination = os.path.join(vault_root, folder, filename)
    os.makedirs(os.path.dirname(destination), exist_ok=True)
    with open(destination, "w", encoding="utf-8") as handle:
        handle.write(markdown_content)
    # Short pause so the freshly written file is settled before ingestion starts.
    await asyncio.sleep(0.1)
    return await self.process_file(
        file_path=destination,
        vault_root=vault_root,
        apply=True,
        force_replace=True,
        purge_before=True,
    )
|
||||
71
app/core/ingestion/ingestion_utils.py
Normal file
71
app/core/ingestion/ingestion_utils.py
Normal file
|
|
@ -0,0 +1,71 @@
|
|||
"""
|
||||
FILE: app/core/ingestion/ingestion_utils.py
|
||||
DESCRIPTION: Hilfswerkzeuge für JSON-Recovery, Typ-Registry und Konfigurations-Lookups.
|
||||
AUDIT v2.13.9: Behebung des Circular Imports durch Nutzung der app.core.registry.
|
||||
"""
|
||||
import json
|
||||
import re
|
||||
from typing import Any, Optional, Dict
|
||||
|
||||
# ENTSCHEIDENDER FIX: Import der Basis-Logik aus dem neutralen Registry-Modul.
|
||||
# Dies bricht den Zirkelbezug auf, da dieses Modul keine Services mehr importiert.
|
||||
from app.core.registry import load_type_registry, clean_llm_text
|
||||
|
||||
def extract_json_from_response(text: str, registry: Optional[dict] = None) -> Any:
    """
    Extract JSON data from an LLM response, cleaning control characters first.

    WP-14: uses the central clean_llm_text function from app.core.registry.

    Args:
        text: Raw LLM response (may contain markdown fences and noise).
        registry: Optional registry forwarded to clean_llm_text.

    Returns:
        The parsed JSON value, or [] when nothing parseable is found.
    """
    if not text:
        return []

    # 1. Centralised text cleanup via the neutral registry module
    clean = clean_llm_text(text, registry)

    # 2. Prefer the content of a ```json ... ``` fenced block, if present
    match = re.search(r"```(?:json)?\s*(.*?)\s*```", clean, re.DOTALL)
    payload = match.group(1) if match else clean

    try:
        return json.loads(payload.strip())
    except json.JSONDecodeError:
        # Recovery 1: widest bracketed slice that might be a JSON list.
        # FIX: previously a bare `except:` swallowed *all* exceptions
        # (including KeyboardInterrupt/SystemExit); narrowed to JSONDecodeError.
        start = payload.find('[')
        end = payload.rfind(']') + 1
        if start != -1 and end > start:
            try:
                return json.loads(payload[start:end])
            except json.JSONDecodeError:
                pass

        # Recovery 2: widest braced slice that might be a JSON object
        start_obj = payload.find('{')
        end_obj = payload.rfind('}') + 1
        if start_obj != -1 and end_obj > start_obj:
            try:
                return json.loads(payload[start_obj:end_obj])
            except json.JSONDecodeError:
                pass
    return []
|
||||
|
||||
def resolve_note_type(registry: dict, requested: Optional[str]) -> str:
    """
    Determine the final note type for a parsed note.

    Returns ``requested`` when it names a known type; otherwise falls back to
    ``ingestion_settings.default_note_type`` (WP-14), defaulting to 'concept'.
    """
    if requested and requested in registry.get("types", {}):
        return requested

    # Dynamic fallback from the registry (default: 'concept')
    fallback_cfg = registry.get("ingestion_settings", {})
    return fallback_cfg.get("default_note_type", "concept")
|
||||
|
||||
def get_chunk_config_by_profile(registry: dict, profile_name: str, note_type: str) -> Dict[str, Any]:
    """
    Fetch the chunker parameters for a specific profile from the registry.

    Args:
        registry: Registry dict with an optional 'chunking_profiles' mapping.
        profile_name: Name of the chunking profile to look up.
        note_type: Note type used for the fallback default config.

    Returns:
        A copy of the profile config ('overlap' lists converted to tuples),
        or the note-type default config when the profile is unknown.
    """
    profiles = registry.get("chunking_profiles", {})
    if profile_name in profiles:
        # Copy so callers can mutate the result without corrupting the registry.
        cfg = profiles[profile_name].copy()
        # Chunkers expect 'overlap' as a tuple; YAML/JSON deliver it as a list.
        if "overlap" in cfg and isinstance(cfg["overlap"], list):
            cfg["overlap"] = tuple(cfg["overlap"])
        return cfg

    # FIX: lazy import moved into the fallback branch — previously it ran on
    # every call even when the registry profile existed, paying the (circular-
    # import-prone) dependency cost unnecessarily.
    from app.core.chunking import get_chunk_config
    return get_chunk_config(note_type)
|
||||
150
app/core/ingestion/ingestion_validation.py
Normal file
150
app/core/ingestion/ingestion_validation.py
Normal file
|
|
@ -0,0 +1,150 @@
|
|||
"""
|
||||
FILE: app/core/ingestion/ingestion_validation.py
|
||||
DESCRIPTION: WP-15b semantische Validierung von Kanten gegen den LocalBatchCache.
|
||||
WP-24c: Erweiterung um automatische Symmetrie-Generierung (Inverse Kanten).
|
||||
WP-25b: Konsequente Lazy-Prompt-Orchestration (prompt_key + variables).
|
||||
VERSION: 3.0.0 (WP-24c: Symmetric Edge Management)
|
||||
STATUS: Active
|
||||
FIX:
|
||||
- WP-24c: Integration der EdgeRegistry zur dynamischen Inversions-Ermittlung.
|
||||
- WP-24c: Implementierung von validate_and_symmetrize für bidirektionale Graphen.
|
||||
- WP-25b: Beibehaltung der hierarchischen Prompt-Resolution und Modell-Spezi-Logik.
|
||||
"""
|
||||
import logging
|
||||
from typing import Dict, Any, Optional, List
|
||||
from app.core.parser import NoteContext
|
||||
|
||||
# Import der neutralen Bereinigungs-Logik zur Vermeidung von Circular Imports
|
||||
from app.core.registry import clean_llm_text
|
||||
# WP-24c: Zugriff auf das dynamische Vokabular
|
||||
from app.services.edge_registry import registry as edge_registry
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
async def validate_edge_candidate(
    chunk_text: str,
    edge: Dict,
    batch_cache: Dict[str, NoteContext],
    llm_service: Any,
    provider: Optional[str] = None,
    profile_name: str = "ingest_validator"
) -> bool:
    """
    WP-15b/25b: semantically validate an edge candidate against the cached target.

    Uses lazy prompt loading (PROMPT-TRACE) for deterministic YES/NO decisions.
    Returns True when the relation is confirmed, or when it cannot be checked
    safely (missing context, transient infrastructure errors).
    """
    target_id = edge.get("to")
    ctx = batch_cache.get(target_id)

    # Robust lookup fix (v2.12.2): support anchored targets (Note#Section)
    if not ctx and "#" in str(target_id):
        ctx = batch_cache.get(target_id.split("#")[0])

    # Safety fallback (hard-link integrity): an uncached target cannot be
    # judged, so the edge is allowed in order to preserve the link.
    if not ctx:
        logger.info(f"ℹ️ [VALIDATION SKIP] No context for '{target_id}' - allowing link.")
        return True

    try:
        logger.info(f"⚖️ [VALIDATING] Relation '{edge.get('kind')}' -> '{target_id}' (Profile: {profile_name})...")

        # WP-25b: lazy-prompt call — prompt_key plus variables lets the
        # service format the prompt per model.
        reply = await llm_service.generate_raw_response(
            prompt_key="edge_validation",
            variables={
                "chunk_text": chunk_text[:1500],
                "target_title": ctx.title,
                "target_summary": ctx.summary,
                "edge_kind": edge.get("kind", "related_to")
            },
            priority="background",
            profile_name=profile_name
        )

        # Cleanup to keep the answer interpretable (Mistral/Qwen safe)
        cleaned = clean_llm_text(reply)

        # Semantic check of the result (the model is expected to answer YES/NO)
        verdict = "YES" in cleaned.upper()

        if verdict:
            logger.info(f"✅ [VALIDATED] Relation to '{target_id}' confirmed.")
        else:
            logger.info(f"🚫 [REJECTED] Relation to '{target_id}' irrelevant for this chunk.")
        return verdict

    except Exception as e:
        lowered = str(e).lower()
        error_type = type(e).__name__

        # WP-25b: distinguish transient from permanent failures.
        # Transient (network) errors -> allow (integrity over precision).
        transient_markers = ("timeout", "connection", "network", "unreachable", "refused")
        if any(marker in lowered for marker in transient_markers):
            logger.warning(f"⚠️ Transient error for {target_id}: {error_type} - {e}. Allowing edge.")
            return True

        # Permanent failures -> reject (protect graph quality)
        logger.error(f"❌ Permanent validation error for {target_id}: {error_type} - {e}")
        return False
|
||||
|
||||
async def validate_and_symmetrize(
    chunk_text: str,
    edge: Dict,
    source_id: str,
    batch_cache: Dict[str, NoteContext],
    llm_service: Any,
    profile_name: str = "ingest_validator"
) -> List[Dict]:
    """
    WP-24c: extended validation gateway.

    Validates the primary edge and, on success, automatically derives the
    inverse edge (single source of truth: the EdgeRegistry).

    Args:
        chunk_text: Source text the edge candidate was extracted from.
        edge: Candidate edge dict (expects 'to', 'kind', optional 'confidence').
        source_id: ID of the note the primary edge originates from.
        batch_cache: Note-context cache used for semantic validation.
        llm_service: LLM backend forwarded to validate_edge_candidate.
        profile_name: Validation profile forwarded to the LLM call.

    Returns:
        List[Dict]: 0 edges (rejected), 1 (primary only) or 2 (primary + inverse).
    """
    # 1. Semantic check of the primary edge (A -> B)
    is_valid = await validate_edge_candidate(
        chunk_text=chunk_text,
        edge=edge,
        batch_cache=batch_cache,
        llm_service=llm_service,
        profile_name=profile_name
    )

    if not is_valid:
        return []

    validated_edges = [edge]

    # 2. WP-24c: symmetry generation (B -> A).
    # The inverse type is resolved dynamically from the EdgeRegistry.
    original_kind = edge.get("kind", "related_to")
    inverse_kind = edge_registry.get_inverse(original_kind)
    target_id = edge.get("to")

    # FIX: only build an inverse edge when the registry actually yields an
    # inverse type AND both endpoints are known. Previously a falsy
    # inverse_kind (no sensible inverse) still produced an edge with
    # kind=None, contradicting the documented contract.
    if inverse_kind and target_id and source_id:
        # The inverse edge points from the primary target back to the source.
        # It is flagged 'virtual' so retrieval/UI can identify it.
        inverse_edge = {
            "to": source_id,
            "kind": inverse_kind,
            "provenance": "structure",  # System-generated, protected by firewall
            "confidence": edge.get("confidence", 0.9) * 0.9,  # Slight damping for virtual paths
            "virtual": True,
            "note_id": target_id,  # The note the inverse edge originates from
            "rule_id": f"symmetry:{original_kind}"
        }

        # Only add the symmetry when it provides real value
        # (avoid redundant related_to -> related_to loops)
        if inverse_kind != original_kind or original_kind not in ["related_to", "references"]:
            validated_edges.append(inverse_edge)
            logger.info(f"🔄 [SYMMETRY] Generated inverse edge: '{target_id}' --({inverse_kind})--> '{source_id}'")

    return validated_edges
|
||||
53
app/core/logging_setup.py
Normal file
53
app/core/logging_setup.py
Normal file
|
|
@ -0,0 +1,53 @@
|
|||
import logging
|
||||
import os
|
||||
from logging.handlers import RotatingFileHandler
|
||||
|
||||
def setup_logging(log_level: int = None):
    """
    Configure the logging system with a file and a console handler.

    WP-24c v4.4.0-DEBUG: supports DEBUG level for end-to-end tracing.

    Args:
        log_level: Optional log level (logging.DEBUG, logging.INFO, etc.).
                   When omitted, it is derived from the DEBUG environment variable.
    """
    # 1. Determine the log level
    if log_level is None:
        # WP-24c v4.4.0-DEBUG: DEBUG level support via environment variable
        debug_mode = os.getenv("DEBUG", "false").lower() == "true"
        log_level = logging.DEBUG if debug_mode else logging.INFO

    # 2. Create the log directory.
    # FIX: exist_ok=True replaces the check-then-create pattern, which could
    # race with a concurrent process creating the directory.
    log_dir = "logs"
    os.makedirs(log_dir, exist_ok=True)

    log_file = os.path.join(log_dir, "mindnet.log")

    # 3. Formatter (timestamp | level | module | message)
    formatter = logging.Formatter(
        '%(asctime)s | %(levelname)-8s | %(name)s | %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S'
    )

    # 4. File handler: rotating log file (max. 5MB per file, 5 backups kept)
    file_handler = RotatingFileHandler(
        log_file, maxBytes=5*1024*1024, backupCount=5, encoding='utf-8'
    )
    file_handler.setFormatter(formatter)
    file_handler.setLevel(log_level)  # WP-24c v4.4.0-DEBUG: honours log_level

    # 5. Stream handler: keep writing to the console
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(formatter)
    console_handler.setLevel(log_level)  # WP-24c v4.4.0-DEBUG: honours log_level

    # 6. Configure the root logger
    logging.basicConfig(
        level=log_level,
        handlers=[file_handler, console_handler],
        force=True  # Overrides any existing configuration
    )

    # FIX: report the actual level name — previously every level except DEBUG
    # was labelled "INFO", even WARNING/ERROR.
    level_name = logging.getLevelName(log_level)
    logging.info(f"📝 Logging initialized (Level: {level_name}). Writing to {log_file}")
|
||||
|
|
@ -1,229 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
# Modul: app/core/note_payload.py
|
||||
# Version: 1.7.0
|
||||
# Datum: 2025-09-09
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
from typing import Any, Dict, Optional, Tuple
|
||||
|
||||
try:
|
||||
from app.core.parser import read_markdown, extract_wikilinks, FRONTMATTER_RE
|
||||
except Exception: # pragma: no cover
|
||||
from .parser import read_markdown, extract_wikilinks, FRONTMATTER_RE # type: ignore
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _canon_frontmatter(fm: Dict[str, Any]) -> str:
|
||||
return json.dumps(fm or {}, ensure_ascii=False, separators=(",", ":"), sort_keys=True)
|
||||
|
||||
def _normalize_body(body: str, mode: str) -> str:
|
||||
if mode == "none":
|
||||
return body if body is not None else ""
|
||||
text = (body or "").replace("\r\n", "\n").replace("\r", "\n")
|
||||
text = "\n".join(line.rstrip() for line in text.split("\n"))
|
||||
return text
|
||||
|
||||
def _resolve_hash_mode(explicit: Optional[str]) -> str:
|
||||
if explicit:
|
||||
val = explicit.strip().lower()
|
||||
else:
|
||||
val = (os.environ.get("MINDNET_HASH_MODE")
|
||||
or os.environ.get("MINDNET_HASH_COMPARE")
|
||||
or "body").strip().lower()
|
||||
if val in ("full", "fulltext", "body+frontmatter", "bodyplusfrontmatter"):
|
||||
return "full"
|
||||
if val in ("frontmatter", "fm"):
|
||||
return "frontmatter"
|
||||
return "body"
|
||||
|
||||
def _read_raw_body_from_file(file_path: Optional[str]) -> Tuple[str, Dict[str, Any]]:
    """Read a markdown file and split it into (raw body, frontmatter dict).

    Returns ("", {}) when the path is missing, not given, or unreadable.
    YAML parse failures yield an empty frontmatter dict but keep the body.
    """
    if not file_path or not os.path.exists(file_path):
        return "", {}

    try:
        with open(file_path, "r", encoding="utf-8") as f:
            raw = f.read()
    except Exception:
        return "", {}

    m = FRONTMATTER_RE.match(raw)
    if not m:
        return raw, {}

    fm: Dict[str, Any] = {}
    try:
        import yaml  # lazy
        fm = yaml.safe_load(m.group(1)) or {}
    except Exception:
        fm = {}
    return raw[m.end():], fm
|
||||
|
||||
def _sha256(s: str) -> str:
|
||||
h = hashlib.sha256()
|
||||
h.update(s.encode("utf-8"))
|
||||
return h.hexdigest()
|
||||
|
||||
def _hash_for(mode: str, *, body: str, fm: Dict[str, Any], normalize: str) -> str:
    """Compute the content hash for one mode ('body' | 'frontmatter' | 'full')."""
    fm_canon = _canon_frontmatter(fm or {})
    if mode == "frontmatter":
        return _sha256(fm_canon)
    body_norm = _normalize_body(body or "", normalize)
    if mode == "full":
        # Body and frontmatter joined with a fixed sentinel between them
        return _sha256(body_norm + "\n--FM--\n" + fm_canon)
    # default: body
    return _sha256(body_norm)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Kernfunktion
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def make_note_payload(
    parsed: Any,
    vault_root: Optional[str] = None,
    *,
    hash_mode: Optional[str] = None,
    hash_normalize: Optional[str] = None,
    hash_source: Optional[str] = None,
    file_path: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Build the note payload including multiple content hashes.

    - The three hashes for (body|frontmatter|full) under 'parsed:canonical'
      are ALWAYS produced (keys such as 'body:parsed:canonical').
    - Additionally — if the current config (source/normalize) differs — the
      three hashes are produced under the matching keys, e.g.
      'frontmatter:raw:none'.
    - 'hash_fulltext' and 'hash_signature' represent the *current* mode.

    Args:
        parsed: Parsed note, either a dict or an object with
            frontmatter/body/path attributes.
        vault_root: When given, 'path' is stored relative to this root.
        hash_mode: Explicit hash mode; otherwise resolved from environment.
        hash_normalize: 'canonical' or 'none'; env fallback MINDNET_HASH_NORMALIZE.
        hash_source: 'parsed' or 'raw'; env fallback MINDNET_HASH_SOURCE.
        file_path: On-disk path used when hashing the raw file content.

    Returns:
        The note payload dict with identity fields, fulltext, hashes and refs.
    """
    # Accept both a dict and an object with attributes
    if isinstance(parsed, dict):
        fm = parsed.get("frontmatter") or {}
        body_parsed = parsed.get("body") or ""
        path = parsed.get("path") or ""
    else:
        fm = getattr(parsed, "frontmatter", {}) or {}
        body_parsed = getattr(parsed, "body", "") or ""
        path = getattr(parsed, "path", "") or ""

    # Make the target path relative to the vault root (best-effort)
    rel_path = path
    try:
        if vault_root:
            rel = os.path.relpath(path, vault_root)
            # Normalise to forward slashes without a leading separator
            rel = rel.replace("\\", "/").lstrip("/")
            rel_path = rel
    except Exception:
        # relpath can fail (e.g. different drives on Windows) — keep the original
        pass

    # Resolve configuration
    mode_resolved = _resolve_hash_mode(hash_mode)  # body|frontmatter|full
    src = (hash_source or os.environ.get("MINDNET_HASH_SOURCE", "parsed")).strip().lower()  # parsed|raw
    norm = (hash_normalize or os.environ.get("MINDNET_HASH_NORMALIZE", "canonical")).strip().lower()  # canonical|none

    # Load the body source for hashing
    raw_body, raw_fm = ("", {})
    if src == "raw":
        raw_body, raw_fm = _read_raw_body_from_file(file_path or path)
        if isinstance(raw_fm, dict) and raw_fm:
            # Merge: parsed frontmatter wins, raw file only fills gaps
            merged_fm = dict(fm)
            for k, v in raw_fm.items():
                merged_fm.setdefault(k, v)
            fm = merged_fm
        body_for_hash = raw_body
    else:
        body_for_hash = body_parsed

    # --- 1) Standard triple (parsed:canonical) is always produced ---
    std_src = "parsed"
    std_norm = "canonical"
    std_hashes: Dict[str, str] = {}
    for m in ("body", "frontmatter", "full"):
        std_hashes[f"{m}:{std_src}:{std_norm}"] = _hash_for(
            m, body=body_parsed, fm=fm, normalize=std_norm
        )

    # Convenience fields (for tooling)
    hash_body = std_hashes["body:parsed:canonical"]
    hash_frontmatter = std_hashes["frontmatter:parsed:canonical"]
    hash_full = std_hashes["full:parsed:canonical"]

    # --- 2) Hashes for the *current* configuration (if it differs) ---
    cur_hashes: Dict[str, str] = {}
    if not (src == std_src and norm == std_norm):
        for m in ("body", "frontmatter", "full"):
            cur_hashes[f"{m}:{src}:{norm}"] = _hash_for(
                m, body=body_for_hash, fm=fm, normalize=norm
            )

    # --- 3) Current mode for backwards-compat fields ---
    current_hash = _hash_for(mode_resolved, body=body_for_hash, fm=fm, normalize=norm)
    hash_signature = f"{mode_resolved}:{src}:{norm}:{current_hash}"

    # Wikilinks (note level), de-duplicated while preserving order
    refs = list(dict.fromkeys(extract_wikilinks(body_parsed))) if body_parsed else []

    payload: Dict[str, Any] = {
        "note_id": fm.get("id") or fm.get("note_id"),
        "title": fm.get("title"),
        "type": fm.get("type"),
        "status": fm.get("status"),
        "created": fm.get("created"),
        "updated": fm.get("updated"),
        "path": rel_path or fm.get("path"),
        "tags": fm.get("tags"),
        # Full text for lossless export
        "fulltext": body_parsed,
        # Backwards compat:
        "hash_fulltext": current_hash,
        "hash_signature": hash_signature,
        # Option C: multiple hashes
        "hashes": {**std_hashes, **cur_hashes},
        "hash_body": hash_body,
        "hash_frontmatter": hash_frontmatter,
        "hash_full": hash_full,
        # Fallback refs
        "references": refs,
    }

    # Pass through selected optional frontmatter fields verbatim
    for k in ("area", "project", "source", "lang", "slug", "aliases"):
        if k in fm:
            payload[k] = fm[k]

    return payload
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# CLI – Sichtprüfung
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _cli() -> None:
|
||||
ap = argparse.ArgumentParser(description="Note-Payload aus Markdown erzeugen und anzeigen")
|
||||
ap.add_argument("--from-file", dest="src", required=True)
|
||||
ap.add_argument("--vault-root", dest="vault_root", default=None)
|
||||
ap.add_argument("--print", dest="do_print", action="store_true")
|
||||
ap.add_argument("--hash-mode", choices=["body", "frontmatter", "full"], default=None)
|
||||
ap.add_argument("--hash-normalize", choices=["canonical", "none"], default=None)
|
||||
ap.add_argument("--hash-source", choices=["parsed", "raw"], default=None)
|
||||
args = ap.parse_args()
|
||||
|
||||
parsed = read_markdown(args.src)
|
||||
payload = make_note_payload(
|
||||
parsed,
|
||||
vault_root=args.vault_root,
|
||||
hash_mode=args.hash_mode,
|
||||
hash_normalize=args.hash_normalize,
|
||||
hash_source=args.hash_source,
|
||||
file_path=args.src,
|
||||
)
|
||||
if args.do_print:
|
||||
print(json.dumps(payload, ensure_ascii=False, indent=2))
|
||||
|
||||
if __name__ == "__main__": # pragma: no cover
|
||||
_cli()
|
||||
|
|
@ -1,67 +0,0 @@
|
|||
from __future__ import annotations
|
||||
from dataclasses import dataclass
|
||||
import re
|
||||
import os
|
||||
import unicodedata
|
||||
import yaml
|
||||
from typing import Tuple, Dict
|
||||
|
||||
FRONTMATTER_RE = re.compile(r"^---\s*\n(.*?)\n---\s*\n?", re.DOTALL) # YAML-Frontmatter am Anfang
|
||||
|
||||
@dataclass
|
||||
class ParsedNote:
|
||||
frontmatter: Dict
|
||||
body: str
|
||||
path: str
|
||||
|
||||
def _strip_bom(text: str) -> str:
|
||||
return text.lstrip("\ufeff")
|
||||
|
||||
def _normalize_text(t: str) -> str:
|
||||
# Unicode-NFKC + vereinheitlichte Zeilenenden + Trim
|
||||
t = unicodedata.normalize("NFKC", t)
|
||||
t = t.replace("\r\n", "\n").replace("\r", "\n")
|
||||
return t
|
||||
|
||||
def read_markdown(path: str) -> ParsedNote:
|
||||
with open(path, "r", encoding="utf-8") as f:
|
||||
raw = _strip_bom(f.read())
|
||||
raw = _normalize_text(raw)
|
||||
|
||||
m = FRONTMATTER_RE.match(raw)
|
||||
front, body = {}, raw
|
||||
if m:
|
||||
yaml_block = m.group(1)
|
||||
body = raw[m.end():]
|
||||
try:
|
||||
front = yaml.safe_load(yaml_block) or {}
|
||||
if not isinstance(front, dict):
|
||||
raise ValueError("Frontmatter must be a mapping")
|
||||
except yaml.YAMLError as e:
|
||||
raise ValueError(f"Invalid YAML frontmatter in {path}: {e}") from e
|
||||
|
||||
return ParsedNote(frontmatter=front, body=body, path=path)
|
||||
|
||||
RE_WIKILINK = re.compile(r"\[\[([^\]\|#]+)(?:#[^\]]+)?(?:\|[^\]]+)?\]\]") # [[id]] | [[id#anchor]] | [[id|label]]
|
||||
|
||||
def extract_wikilinks(text: str) -> list[str]:
|
||||
return list({m.group(1).strip() for m in RE_WIKILINK.finditer(text)})
|
||||
|
||||
def validate_required_frontmatter(fm: dict, required=("title","id","type","status","created")):
|
||||
missing = [k for k in required if k not in fm or fm[k] in (None, "")]
|
||||
if missing:
|
||||
raise ValueError(f"Missing required frontmatter fields: {', '.join(missing)}")
|
||||
|
||||
# einfache Plausibilitäten
|
||||
if not isinstance(fm.get("tags", []), (list, tuple)):
|
||||
if "tags" in fm and fm["tags"] not in (None, ""):
|
||||
raise ValueError("tags must be a list of strings")
|
||||
|
||||
def normalize_frontmatter(fm: dict) -> dict:
|
||||
# kleinere Normalisierungen ohne die Semantik zu verändern
|
||||
out = dict(fm)
|
||||
if "tags" in out and isinstance(out["tags"], list):
|
||||
out["tags"] = [str(t).strip() for t in out["tags"]]
|
||||
if "embedding_exclude" in out:
|
||||
out["embedding_exclude"] = bool(out["embedding_exclude"])
|
||||
return out
|
||||
22
app/core/parser/__init__.py
Normal file
22
app/core/parser/__init__.py
Normal file
|
|
@ -0,0 +1,22 @@
|
|||
"""
|
||||
FILE: app/core/parser/__init__.py
|
||||
DESCRIPTION: Package-Einstiegspunkt für den Parser.
|
||||
Ermöglicht das Löschen der parser.py Facade.
|
||||
VERSION: 1.10.0
|
||||
"""
|
||||
from .parsing_models import ParsedNote, NoteContext
|
||||
from .parsing_utils import (
|
||||
FRONTMATTER_RE, validate_required_frontmatter,
|
||||
normalize_frontmatter, extract_wikilinks, extract_edges_with_context
|
||||
)
|
||||
from .parsing_markdown import read_markdown
|
||||
from .parsing_scanner import pre_scan_markdown
|
||||
|
||||
# Kompatibilitäts-Alias
|
||||
FRONTMATTER_END = FRONTMATTER_RE
|
||||
|
||||
__all__ = [
|
||||
"ParsedNote", "NoteContext", "FRONTMATTER_RE", "FRONTMATTER_END",
|
||||
"read_markdown", "pre_scan_markdown", "validate_required_frontmatter",
|
||||
"normalize_frontmatter", "extract_wikilinks", "extract_edges_with_context"
|
||||
]
|
||||
60
app/core/parser/parsing_markdown.py
Normal file
60
app/core/parser/parsing_markdown.py
Normal file
|
|
@ -0,0 +1,60 @@
|
|||
"""
|
||||
FILE: app/core/parsing/parsing_markdown.py
|
||||
DESCRIPTION: Fehlertolerantes Einlesen von Markdown und Frontmatter-Splitting.
|
||||
"""
|
||||
import io
|
||||
import os
|
||||
import json
|
||||
from typing import Any, Dict, Optional, Tuple
|
||||
from .parsing_models import ParsedNote
|
||||
from .parsing_utils import FRONTMATTER_RE
|
||||
|
||||
try:
|
||||
import yaml
|
||||
except ImportError:
|
||||
yaml = None
|
||||
|
||||
_FALLBACK_ENCODINGS: Tuple[str, ...] = ("utf-8", "utf-8-sig", "cp1252", "latin-1")
|
||||
|
||||
def _split_frontmatter(text: str) -> Tuple[Dict[str, Any], str]:
|
||||
"""Zerlegt Text in Frontmatter-Dict und Body."""
|
||||
lines = text.splitlines(True)
|
||||
if not lines or not FRONTMATTER_RE.match(lines[0]):
|
||||
return {}, text
|
||||
end_idx = None
|
||||
for i in range(1, min(len(lines), 2000)):
|
||||
if FRONTMATTER_RE.match(lines[i]):
|
||||
end_idx = i
|
||||
break
|
||||
if end_idx is None: return {}, text
|
||||
fm_raw = "".join(lines[1:end_idx])
|
||||
body = "".join(lines[end_idx + 1:])
|
||||
if yaml is None: raise RuntimeError("PyYAML not installed.")
|
||||
try:
|
||||
loaded = yaml.safe_load(fm_raw) or {}
|
||||
data = loaded if isinstance(loaded, dict) else {}
|
||||
except Exception as e:
|
||||
print(json.dumps({"warn": "frontmatter_yaml_parse_failed", "error": str(e)}))
|
||||
data = {}
|
||||
if body.startswith("\n"): body = body[1:]
|
||||
return data, body
|
||||
|
||||
def _read_text_with_fallback(path: str) -> Tuple[str, str, bool]:
|
||||
"""Liest Datei mit Encoding-Fallback-Kette."""
|
||||
last_err = None
|
||||
for enc in _FALLBACK_ENCODINGS:
|
||||
try:
|
||||
with io.open(path, "r", encoding=enc, errors="strict") as f:
|
||||
return f.read(), enc, (enc != "utf-8")
|
||||
except UnicodeDecodeError as e:
|
||||
last_err = str(e); continue
|
||||
with open(path, "rb") as fb:
|
||||
text = fb.read().decode("utf-8", errors="replace")
|
||||
return text, "utf-8(replace)", True
|
||||
|
||||
def read_markdown(path: str) -> Optional[ParsedNote]:
|
||||
"""Öffentliche API zum Einlesen einer Datei."""
|
||||
if not os.path.exists(path): return None
|
||||
text, enc, had_fb = _read_text_with_fallback(path)
|
||||
fm, body = _split_frontmatter(text)
|
||||
return ParsedNote(frontmatter=fm or {}, body=body or "", path=path)
|
||||
22
app/core/parser/parsing_models.py
Normal file
22
app/core/parser/parsing_models.py
Normal file
|
|
@ -0,0 +1,22 @@
|
|||
"""
|
||||
FILE: app/core/parsing/parsing_models.py
|
||||
DESCRIPTION: Datenklassen für das Parsing-System.
|
||||
"""
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, List
|
||||
|
||||
@dataclass
|
||||
class ParsedNote:
|
||||
"""Container für eine vollständig eingelesene Markdown-Datei."""
|
||||
frontmatter: Dict[str, Any]
|
||||
body: str
|
||||
path: str
|
||||
|
||||
@dataclass
|
||||
class NoteContext:
|
||||
"""Metadaten-Container für den flüchtigen LocalBatchCache (Pass 1)."""
|
||||
note_id: str
|
||||
title: str
|
||||
type: str
|
||||
summary: str
|
||||
tags: List[str]
|
||||
40
app/core/parser/parsing_scanner.py
Normal file
40
app/core/parser/parsing_scanner.py
Normal file
|
|
@ -0,0 +1,40 @@
|
|||
"""
|
||||
FILE: app/core/parsing/parsing_scanner.py
|
||||
DESCRIPTION: Pre-Scan für den LocalBatchCache (Pass 1).
|
||||
AUDIT v1.1.0: Dynamisierung der Scan-Parameter (WP-14).
|
||||
"""
|
||||
import os
|
||||
import re
|
||||
from typing import Optional, Dict, Any
|
||||
from .parsing_models import NoteContext
|
||||
from .parsing_markdown import read_markdown
|
||||
|
||||
def pre_scan_markdown(path: str, registry: Optional[Dict[str, Any]] = None) -> Optional[NoteContext]:
|
||||
"""
|
||||
Extrahiert Identität und Kurz-Kontext zur Validierung.
|
||||
WP-14: Scan-Tiefe und Summary-Länge sind nun über die Registry steuerbar.
|
||||
"""
|
||||
parsed = read_markdown(path)
|
||||
if not parsed: return None
|
||||
|
||||
# WP-14: Konfiguration laden oder Standardwerte nutzen
|
||||
reg = registry or {}
|
||||
summary_cfg = reg.get("summary_settings", {})
|
||||
scan_depth = summary_cfg.get("pre_scan_depth", 600)
|
||||
max_len = summary_cfg.get("max_summary_length", 500)
|
||||
|
||||
fm = parsed.frontmatter
|
||||
# ID-Findung: Frontmatter ID oder Dateiname als Fallback
|
||||
note_id = str(fm.get("id") or os.path.splitext(os.path.basename(path))[0])
|
||||
|
||||
# Erstelle Kurz-Zusammenfassung mit dynamischen Limits
|
||||
clean_body = re.sub(r'[#*`>]', '', parsed.body[:scan_depth]).strip()
|
||||
summary = clean_body[:max_len] + "..." if len(clean_body) > max_len else clean_body
|
||||
|
||||
return NoteContext(
|
||||
note_id=note_id,
|
||||
title=str(fm.get("title", note_id)),
|
||||
type=str(fm.get("type", "concept")),
|
||||
summary=summary,
|
||||
tags=fm.get("tags", []) if isinstance(fm.get("tags"), list) else []
|
||||
)
|
||||
69
app/core/parser/parsing_utils.py
Normal file
69
app/core/parser/parsing_utils.py
Normal file
|
|
@ -0,0 +1,69 @@
|
|||
"""
|
||||
FILE: app/core/parsing/parsing_utils.py
|
||||
DESCRIPTION: Werkzeuge zur Validierung, Normalisierung und Wikilink-Extraktion.
|
||||
"""
|
||||
import re
|
||||
from typing import Any, Dict, List, Tuple, Optional
|
||||
from .parsing_models import ParsedNote
|
||||
|
||||
# Öffentliche Konstanten für Abwärtskompatibilität
|
||||
FRONTMATTER_RE = re.compile(r"^\s*---\s*$")
|
||||
_WIKILINK_RE = re.compile(r"\[\[([^\]]+)\]\]")
|
||||
|
||||
def validate_required_frontmatter(fm: Dict[str, Any], required: Tuple[str, ...] = ("id", "title")) -> None:
|
||||
"""Prüft, ob alle Pflichtfelder vorhanden sind."""
|
||||
if fm is None: fm = {}
|
||||
missing = []
|
||||
for k in required:
|
||||
v = fm.get(k)
|
||||
if v is None or (isinstance(v, str) and not v.strip()):
|
||||
missing.append(k)
|
||||
if missing:
|
||||
raise ValueError(f"Missing required frontmatter fields: {', '.join(missing)}")
|
||||
if "tags" in fm and fm["tags"] not in (None, "") and not isinstance(fm["tags"], (list, tuple)):
|
||||
raise ValueError("frontmatter 'tags' must be a list of strings")
|
||||
|
||||
def normalize_frontmatter(fm: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Normalisierung von Tags und Boolean-Feldern."""
|
||||
out = dict(fm or {})
|
||||
if "tags" in out:
|
||||
if isinstance(out["tags"], str):
|
||||
out["tags"] = [out["tags"].strip()] if out["tags"].strip() else []
|
||||
elif isinstance(out["tags"], list):
|
||||
out["tags"] = [str(t).strip() for t in out["tags"] if t is not None]
|
||||
else:
|
||||
out["tags"] = [str(out["tags"]).strip()] if out["tags"] not in (None, "") else []
|
||||
if "embedding_exclude" in out:
|
||||
out["embedding_exclude"] = bool(out["embedding_exclude"])
|
||||
return out
|
||||
|
||||
def extract_wikilinks(text: str) -> List[str]:
|
||||
"""Extrahiert Wikilinks als einfache Liste von IDs."""
|
||||
if not text: return []
|
||||
out: List[str] = []
|
||||
for m in _WIKILINK_RE.finditer(text):
|
||||
raw = (m.group(1) or "").strip()
|
||||
if not raw: continue
|
||||
if "|" in raw: raw = raw.split("|", 1)[0].strip()
|
||||
if "#" in raw: raw = raw.split("#", 1)[0].strip()
|
||||
if raw: out.append(raw)
|
||||
return out
|
||||
|
||||
def extract_edges_with_context(parsed: ParsedNote) -> List[Dict[str, Any]]:
|
||||
"""WP-22: Extrahiert Wikilinks mit Zeilennummern für die EdgeRegistry."""
|
||||
edges = []
|
||||
if not parsed or not parsed.body: return edges
|
||||
lines = parsed.body.splitlines()
|
||||
for line_num, line_content in enumerate(lines, 1):
|
||||
for match in _WIKILINK_RE.finditer(line_content):
|
||||
raw = (match.group(1) or "").strip()
|
||||
if not raw: continue
|
||||
if "|" in raw:
|
||||
parts = raw.split("|", 1)
|
||||
target, kind = parts[0].strip(), parts[1].strip()
|
||||
else:
|
||||
target, kind = raw.strip(), "related_to"
|
||||
if "#" in target: target = target.split("#", 1)[0].strip()
|
||||
if target:
|
||||
edges.append({"to": target, "kind": kind, "line": line_num, "provenance": "explicit"})
|
||||
return edges
|
||||
|
|
@ -1,124 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Name: app/core/qdrant.py
|
||||
Version: v1.4.0 (2025-09-09)
|
||||
|
||||
Kurzbeschreibung:
|
||||
Qdrant-Client & Collection-Setup für mindnet.
|
||||
- Stellt sicher, dass {prefix}_notes / {prefix}_chunks / {prefix}_edges existieren.
|
||||
- Edges-Collection nutzt 1D Dummy-Vektor.
|
||||
- NEW: ensure_payload_indexes(...) legt sinnvolle Payload-Indizes an.
|
||||
|
||||
Aufruf:
|
||||
from app.core.qdrant import QdrantConfig, get_client, ensure_collections, ensure_payload_indexes
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional, Tuple
|
||||
|
||||
from qdrant_client import QdrantClient
|
||||
from qdrant_client.http import models as rest
|
||||
|
||||
|
||||
@dataclass
|
||||
class QdrantConfig:
|
||||
url: str
|
||||
api_key: Optional[str]
|
||||
prefix: str
|
||||
dim: int
|
||||
|
||||
@staticmethod
|
||||
def from_env() -> "QdrantConfig":
|
||||
url = os.getenv("QDRANT_URL")
|
||||
if not url:
|
||||
host = os.getenv("QDRANT_HOST", "127.0.0.1")
|
||||
port = int(os.getenv("QDRANT_PORT", "6333"))
|
||||
url = f"http://{host}:{port}"
|
||||
api_key = os.getenv("QDRANT_API_KEY") or None
|
||||
prefix = os.getenv("COLLECTION_PREFIX", "mindnet")
|
||||
dim = int(os.getenv("VECTOR_DIM", "384"))
|
||||
return QdrantConfig(url=url, api_key=api_key, prefix=prefix, dim=dim)
|
||||
|
||||
|
||||
def get_client(cfg: QdrantConfig) -> QdrantClient:
|
||||
return QdrantClient(url=cfg.url, api_key=cfg.api_key)
|
||||
|
||||
|
||||
def _create_notes(client: QdrantClient, name: str, dim: int) -> None:
|
||||
if not client.collection_exists(name):
|
||||
client.create_collection(
|
||||
collection_name=name,
|
||||
vectors_config=rest.VectorParams(size=dim, distance=rest.Distance.COSINE),
|
||||
)
|
||||
|
||||
def _create_chunks(client: QdrantClient, name: str, dim: int) -> None:
|
||||
if not client.collection_exists(name):
|
||||
client.create_collection(
|
||||
collection_name=name,
|
||||
vectors_config=rest.VectorParams(size=dim, distance=rest.Distance.COSINE),
|
||||
)
|
||||
|
||||
def _create_edges(client: QdrantClient, name: str) -> None:
|
||||
if not client.collection_exists(name):
|
||||
client.create_collection(
|
||||
collection_name=name,
|
||||
vectors_config=rest.VectorParams(size=1, distance=rest.Distance.DOT), # 1D-Dummy
|
||||
)
|
||||
|
||||
|
||||
def ensure_collections(client: QdrantClient, prefix: str, dim: int, destructive: bool = False) -> None:
|
||||
notes = f"{prefix}_notes"
|
||||
chunks = f"{prefix}_chunks"
|
||||
edges = f"{prefix}_edges"
|
||||
|
||||
_create_notes(client, notes, dim)
|
||||
_create_chunks(client, chunks, dim)
|
||||
|
||||
if client.collection_exists(edges):
|
||||
try:
|
||||
info = client.get_collection(edges)
|
||||
vectors_cfg = getattr(getattr(info.result, "config", None), "params", None)
|
||||
has_vectors = getattr(vectors_cfg, "vectors", None) is not None
|
||||
except Exception:
|
||||
has_vectors = True
|
||||
if not has_vectors:
|
||||
if destructive:
|
||||
client.delete_collection(edges)
|
||||
_create_edges(client, edges)
|
||||
else:
|
||||
print(f"[ensure_collections] WARN: '{edges}' ohne VectorConfig; destructive=False.", flush=True)
|
||||
else:
|
||||
_create_edges(client, edges)
|
||||
|
||||
|
||||
def collection_names(prefix: str) -> Tuple[str, str, str]:
|
||||
return (f"{prefix}_notes", f"{prefix}_chunks", f"{prefix}_edges")
|
||||
|
||||
|
||||
# -------------------------------
|
||||
# NEW: Payload-Indexing
|
||||
# -------------------------------
|
||||
|
||||
def _safe_create_index(client: QdrantClient, col: str, field: str, schema: rest.PayloadSchemaType):
|
||||
try:
|
||||
client.create_payload_index(
|
||||
collection_name=col,
|
||||
field_name=field,
|
||||
field_schema=schema,
|
||||
)
|
||||
except Exception:
|
||||
# bereits vorhanden oder nicht unterstütztes Schema → ignorieren
|
||||
pass
|
||||
|
||||
def ensure_payload_indexes(client: QdrantClient, prefix: str) -> None:
|
||||
notes, chunks, edges = collection_names(prefix)
|
||||
# Notes
|
||||
_safe_create_index(client, notes, "note_id", rest.PayloadSchemaType.KEYWORD)
|
||||
# Chunks
|
||||
_safe_create_index(client, chunks, "note_id", rest.PayloadSchemaType.KEYWORD)
|
||||
_safe_create_index(client, chunks, "chunk_index", rest.PayloadSchemaType.INTEGER)
|
||||
# Edges
|
||||
for f in ("kind", "scope", "source_id", "target_id", "note_id"):
|
||||
_safe_create_index(client, edges, f, rest.PayloadSchemaType.KEYWORD)
|
||||
|
|
@ -1,136 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
app/core/qdrant_points.py
|
||||
|
||||
Zweck
|
||||
- Gemeinsame Helfer zum Erzeugen von Qdrant-Points für Notes, Chunks und Edges.
|
||||
- Abwärtskompatibel zu altem Edge-Payload-Schema aus edges.py:
|
||||
- alt: {'edge_type','src_id','dst_id', ...}
|
||||
- neu: {'kind','source_id','target_id', ...}
|
||||
|
||||
Version
|
||||
- 1.3 (2025-09-08)
|
||||
|
||||
Änderungen (ggü. 1.2)
|
||||
- points_for_edges() akzeptiert jetzt beide Edge-Schemata.
|
||||
- Normalisiert alte Felder auf 'kind' / 'source_id' / 'target_id' und schreibt eine
|
||||
stabile 'edge_id' zurück in die Payload.
|
||||
- Verhindert, dass mehrere Edges dieselbe Point-ID erhalten (Root Cause deiner 1-Edge-Sammlung).
|
||||
|
||||
Aufruf / Verwendung
|
||||
- Wird von Import-/Backfill-Skripten via:
|
||||
from app.core.qdrant_points import points_for_note, points_for_chunks, points_for_edges, upsert_batch
|
||||
eingebunden. Keine CLI.
|
||||
|
||||
Hinweise
|
||||
- Edges bekommen absichtlich einen 1D-Dummy-Vektor [0.0], damit Qdrant das Objekt akzeptiert.
|
||||
- Die Point-IDs werden deterministisch aus stabilen Strings (UUIDv5) abgeleitet.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
import uuid
|
||||
from typing import List, Tuple
|
||||
from qdrant_client.http import models as rest
|
||||
|
||||
|
||||
def _names(prefix: str) -> Tuple[str, str, str]:
|
||||
return f"{prefix}_notes", f"{prefix}_chunks", f"{prefix}_edges"
|
||||
|
||||
|
||||
def _to_uuid(stable_key: str) -> str:
|
||||
"""Stabile UUIDv5 aus einem String-Key (deterministisch)."""
|
||||
return str(uuid.uuid5(uuid.NAMESPACE_URL, stable_key))
|
||||
|
||||
|
||||
def points_for_note(
|
||||
prefix: str,
|
||||
note_payload: dict,
|
||||
note_vec: List[float] | None,
|
||||
dim: int,
|
||||
) -> Tuple[str, List[rest.PointStruct]]:
|
||||
"""Notes-Collection: falls kein Note-Embedding -> Nullvektor der Länge dim."""
|
||||
notes_col, _, _ = _names(prefix)
|
||||
vector = note_vec if note_vec is not None else [0.0] * int(dim)
|
||||
raw_note_id = note_payload.get("note_id") or note_payload.get("id") or "missing-note-id"
|
||||
point_id = _to_uuid(raw_note_id)
|
||||
pt = rest.PointStruct(id=point_id, vector=vector, payload=note_payload)
|
||||
return notes_col, [pt]
|
||||
|
||||
|
||||
def points_for_chunks(
|
||||
prefix: str,
|
||||
chunk_payloads: List[dict],
|
||||
vectors: List[List[float]],
|
||||
) -> Tuple[str, List[rest.PointStruct]]:
|
||||
"""
|
||||
Chunks-Collection: erwartet pro Chunk einen Vektor.
|
||||
Robustheit:
|
||||
- Fehlt 'chunk_id', nutze 'id', sonst baue '${note_id}#${i}' (1-basiert).
|
||||
- Schreibe die abgeleitete ID zurück in die Payload (pl['chunk_id']).
|
||||
"""
|
||||
_, chunks_col, _ = _names(prefix)
|
||||
points: List[rest.PointStruct] = []
|
||||
for i, (pl, vec) in enumerate(zip(chunk_payloads, vectors), start=1):
|
||||
chunk_id = pl.get("chunk_id") or pl.get("id")
|
||||
if not chunk_id:
|
||||
note_id = pl.get("note_id") or pl.get("parent_note_id") or "missing-note"
|
||||
chunk_id = f"{note_id}#{i}"
|
||||
pl["chunk_id"] = chunk_id
|
||||
point_id = _to_uuid(chunk_id)
|
||||
points.append(rest.PointStruct(id=point_id, vector=vec, payload=pl))
|
||||
return chunks_col, points
|
||||
|
||||
|
||||
def _normalize_edge_payload(pl: dict) -> dict:
|
||||
"""
|
||||
Sorgt für kompatible Feldnamen.
|
||||
akzeptiert:
|
||||
- neu: kind, source_id, target_id, seq?
|
||||
- alt: edge_type, src_id, dst_id, order?/index?
|
||||
schreibt zurück: kind, source_id, target_id, seq?
|
||||
"""
|
||||
# bereits neu?
|
||||
kind = pl.get("kind") or pl.get("edge_type") or "edge"
|
||||
source_id = pl.get("source_id") or pl.get("src_id") or "unknown-src"
|
||||
target_id = pl.get("target_id") or pl.get("dst_id") or "unknown-tgt"
|
||||
seq = pl.get("seq") or pl.get("order") or pl.get("index")
|
||||
|
||||
# in Payload zurückschreiben (ohne alte Felder zu entfernen → maximal kompatibel)
|
||||
pl.setdefault("kind", kind)
|
||||
pl.setdefault("source_id", source_id)
|
||||
pl.setdefault("target_id", target_id)
|
||||
if seq is not None and "seq" not in pl:
|
||||
pl["seq"] = seq
|
||||
return pl
|
||||
|
||||
|
||||
def points_for_edges(prefix: str, edge_payloads: List[dict]) -> Tuple[str, List[rest.PointStruct]]:
|
||||
"""
|
||||
Edges-Collection mit 1D-Dummy-Vektor.
|
||||
- Akzeptiert sowohl neues als auch altes Edge-Schema (siehe _normalize_edge_payload).
|
||||
- Fehlt 'edge_id', wird sie stabil aus (kind, source_id, target_id, seq) konstruiert.
|
||||
"""
|
||||
_, _, edges_col = _names(prefix)
|
||||
points: List[rest.PointStruct] = []
|
||||
for raw in edge_payloads:
|
||||
pl = _normalize_edge_payload(raw)
|
||||
|
||||
edge_id = pl.get("edge_id")
|
||||
if not edge_id:
|
||||
kind = pl.get("kind", "edge")
|
||||
s = pl.get("source_id", "unknown-src")
|
||||
t = pl.get("target_id", "unknown-tgt")
|
||||
seq = pl.get("seq") or ""
|
||||
edge_id = f"{kind}:{s}->{t}#{seq}"
|
||||
pl["edge_id"] = edge_id
|
||||
|
||||
point_id = _to_uuid(edge_id)
|
||||
points.append(rest.PointStruct(id=point_id, vector=[0.0], payload=pl))
|
||||
return edges_col, points
|
||||
|
||||
|
||||
def upsert_batch(client, collection: str, points: List[rest.PointStruct]) -> None:
|
||||
if not points:
|
||||
return
|
||||
client.upsert(collection_name=collection, points=points, wait=True)
|
||||
43
app/core/registry.py
Normal file
43
app/core/registry.py
Normal file
|
|
@ -0,0 +1,43 @@
|
|||
"""
|
||||
FILE: app/core/registry.py
|
||||
DESCRIPTION: Zentraler Base-Layer für Konfigurations-Loading und Text-Bereinigung.
|
||||
Bricht Zirkelbezüge zwischen Ingestion und LLMService auf.
|
||||
VERSION: 1.0.0
|
||||
"""
|
||||
import os
|
||||
import yaml
|
||||
from typing import Optional, List
|
||||
|
||||
def load_type_registry(custom_path: Optional[str] = None) -> dict:
|
||||
"""Lädt die types.yaml zur Steuerung der typ-spezifischen Logik."""
|
||||
# Wir nutzen hier einen direkten Import von Settings, um Zyklen zu vermeiden
|
||||
from app.config import get_settings
|
||||
settings = get_settings()
|
||||
path = custom_path or settings.MINDNET_TYPES_FILE
|
||||
if not os.path.exists(path):
|
||||
return {}
|
||||
try:
|
||||
with open(path, "r", encoding="utf-8") as f:
|
||||
return yaml.safe_load(f) or {}
|
||||
except Exception:
|
||||
return {}
|
||||
|
||||
def clean_llm_text(text: str, registry: Optional[dict] = None) -> str:
|
||||
"""
|
||||
Entfernt LLM-Steuerzeichen (<s>, [OUT] etc.) aus einem Text.
|
||||
Wird sowohl für JSON-Parsing als auch für Chat-Antworten genutzt.
|
||||
"""
|
||||
if not text or not isinstance(text, str):
|
||||
return ""
|
||||
|
||||
default_patterns = ["<s>", "</s>", "[OUT]", "[/OUT]"]
|
||||
reg = registry or load_type_registry()
|
||||
|
||||
# Lade Patterns aus llm_settings (WP-14)
|
||||
patterns: List[str] = reg.get("llm_settings", {}).get("cleanup_patterns", default_patterns)
|
||||
|
||||
clean = text
|
||||
for p in patterns:
|
||||
clean = clean.replace(p, "")
|
||||
|
||||
return clean.strip()
|
||||
25
app/core/retrieval/__init__.py
Normal file
25
app/core/retrieval/__init__.py
Normal file
|
|
@ -0,0 +1,25 @@
|
|||
"""
|
||||
PACKAGE: app.core.retrieval
|
||||
DESCRIPTION: Zentrale Schnittstelle für Retrieval-Operationen (Vektor- & Graph-Suche).
|
||||
Bündelt Suche und mathematische Scoring-Engine.
|
||||
"""
|
||||
from .retriever import (
|
||||
Retriever,
|
||||
hybrid_retrieve,
|
||||
semantic_retrieve
|
||||
)
|
||||
|
||||
from .retriever_scoring import (
|
||||
get_weights,
|
||||
compute_wp22_score,
|
||||
get_status_multiplier
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"Retriever",
|
||||
"hybrid_retrieve",
|
||||
"semantic_retrieve",
|
||||
"get_weights",
|
||||
"compute_wp22_score",
|
||||
"get_status_multiplier"
|
||||
]
|
||||
378
app/core/retrieval/decision_engine.py
Normal file
378
app/core/retrieval/decision_engine.py
Normal file
|
|
@ -0,0 +1,378 @@
|
|||
"""
|
||||
FILE: app/core/retrieval/decision_engine.py
|
||||
DESCRIPTION: Der Agentic Orchestrator für MindNet (WP-25b Edition).
|
||||
Realisiert Multi-Stream Retrieval, Intent-basiertes Routing
|
||||
und die neue Lazy-Prompt Orchestrierung (Module A & B).
|
||||
VERSION: 1.3.2 (WP-25b: Full Robustness Recovery & Regex Parsing)
|
||||
STATUS: Active
|
||||
FIX:
|
||||
- WP-25b: ULTRA-Robustes Intent-Parsing via Regex (Fix: 'CODING[/S]' -> 'CODING').
|
||||
- WP-25b: Wiederherstellung der prepend_instruction Logik via variables.
|
||||
- WP-25a: Voller Erhalt der Profil-Kaskade via LLMService v3.5.5.
|
||||
- WP-25: Beibehaltung von Stream-Tracing, Edge-Boosts und Pre-Initialization.
|
||||
- RECOVERY: Wiederherstellung der lokalen Sicherheits-Gates aus v1.2.1.
|
||||
"""
|
||||
import asyncio
|
||||
import logging
|
||||
import yaml
|
||||
import os
|
||||
import re # Neu für robustes Intent-Parsing
|
||||
from typing import List, Dict, Any, Optional
|
||||
|
||||
# Core & Service Imports
|
||||
from app.models.dto import QueryRequest, QueryResponse
|
||||
from app.core.retrieval.retriever import Retriever
|
||||
from app.services.llm_service import LLMService
|
||||
from app.config import get_settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class DecisionEngine:
|
||||
def __init__(self):
|
||||
"""Initialisiert die Engine und lädt die modularen Konfigurationen."""
|
||||
self.settings = get_settings()
|
||||
self.retriever = Retriever()
|
||||
self.llm_service = LLMService()
|
||||
self.config = self._load_engine_config()
|
||||
|
||||
def _load_engine_config(self) -> Dict[str, Any]:
|
||||
"""Lädt die Multi-Stream Konfiguration (WP-25/25a)."""
|
||||
path = os.getenv("MINDNET_DECISION_CONFIG", "config/decision_engine.yaml")
|
||||
if not os.path.exists(path):
|
||||
logger.error(f"❌ Decision Engine Config not found at {path}")
|
||||
return {"strategies": {}, "streams_library": {}}
|
||||
try:
|
||||
with open(path, "r", encoding="utf-8") as f:
|
||||
config = yaml.safe_load(f) or {}
|
||||
|
||||
# WP-25b FIX: Schema-Validierung
|
||||
required_keys = ["strategies", "streams_library"]
|
||||
missing = [k for k in required_keys if k not in config]
|
||||
if missing:
|
||||
logger.error(f"❌ Missing required keys in decision_engine.yaml: {missing}")
|
||||
return {"strategies": {}, "streams_library": {}}
|
||||
|
||||
# Warnung bei unbekannten Top-Level-Keys
|
||||
known_keys = {"version", "settings", "strategies", "streams_library"}
|
||||
unknown = set(config.keys()) - known_keys
|
||||
if unknown:
|
||||
logger.warning(f"⚠️ Unknown keys in decision_engine.yaml: {unknown}")
|
||||
|
||||
logger.info(f"⚙️ Decision Engine Config loaded (v{config.get('version', 'unknown')})")
|
||||
return config
|
||||
except yaml.YAMLError as e:
|
||||
logger.error(f"❌ YAML syntax error in decision_engine.yaml: {e}")
|
||||
return {"strategies": {}, "streams_library": {}}
|
||||
except Exception as e:
|
||||
logger.error(f"❌ Failed to load decision_engine.yaml: {e}")
|
||||
return {"strategies": {}, "streams_library": {}}
|
||||
|
||||
async def ask(self, query: str) -> str:
|
||||
"""
|
||||
Hauptmethode des MindNet Chats.
|
||||
Orchestriert den agentischen Prozess: Routing -> Retrieval -> Kompression -> Synthese.
|
||||
"""
|
||||
# 1. Intent Recognition (Strategy Routing)
|
||||
strategy_key = await self._determine_strategy(query)
|
||||
|
||||
strategies = self.config.get("strategies", {})
|
||||
strategy = strategies.get(strategy_key)
|
||||
|
||||
if not strategy:
|
||||
logger.warning(f"⚠️ Unknown strategy '{strategy_key}'. Fallback to FACT_WHAT.")
|
||||
strategy_key = "FACT_WHAT"
|
||||
strategy = strategies.get("FACT_WHAT")
|
||||
|
||||
if not strategy and strategies:
|
||||
strategy_key = next(iter(strategies))
|
||||
strategy = strategies[strategy_key]
|
||||
|
||||
if not strategy:
|
||||
return "Entschuldigung, meine Wissensbasis ist aktuell nicht konfiguriert."
|
||||
|
||||
# 2. Multi-Stream Retrieval & Pre-Synthesis (Parallel Tasks inkl. Kompression)
|
||||
stream_results = await self._execute_parallel_streams(strategy, query)
|
||||
|
||||
# 3. Finale Synthese
|
||||
return await self._generate_final_answer(strategy_key, strategy, query, stream_results)
|
||||
|
||||
async def _determine_strategy(self, query: str) -> str:
|
||||
"""WP-25b: Nutzt den LLM-Router via Lazy-Loading und bereinigt Modell-Artefakte via Regex."""
|
||||
settings_cfg = self.config.get("settings", {})
|
||||
prompt_key = settings_cfg.get("router_prompt_key", "intent_router_v1")
|
||||
router_profile = settings_cfg.get("router_profile")
|
||||
|
||||
try:
|
||||
# Delegation an LLMService ohne manuelle Vor-Formatierung
|
||||
response = await self.llm_service.generate_raw_response(
|
||||
prompt_key=prompt_key,
|
||||
variables={"query": query},
|
||||
max_retries=1,
|
||||
priority="realtime",
|
||||
profile_name=router_profile
|
||||
)
|
||||
|
||||
# --- ULTRA-ROBUST PARSING (Fix für 'CODING[/S]') ---
|
||||
# 1. Alles in Großbuchstaben umwandeln
|
||||
raw_text = str(response).upper()
|
||||
|
||||
# 2. Regex: Suche das erste Wort, das nur aus A-Z und Unterstrichen besteht
|
||||
# Dies ignoriert [/S], </s>, Newlines oder Plaudereien des Modells
|
||||
match = re.search(r'\b(FACT_WHEN|FACT_WHAT|DECISION|EMPATHY|CODING|INTERVIEW)\b', raw_text)
|
||||
|
||||
if match:
|
||||
intent = match.group(1)
|
||||
logger.info(f"🎯 [ROUTING] Parsed Intent: '{intent}' from raw response: '{response.strip()}'")
|
||||
return intent
|
||||
|
||||
# Fallback, falls Regex nicht greift
|
||||
logger.warning(f"⚠️ Unmapped intent '{response.strip()}' from router. Falling back to FACT_WHAT.")
|
||||
return "FACT_WHAT"
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Strategy Routing failed: {e}")
|
||||
return "FACT_WHAT"
|
||||
|
||||
async def _execute_parallel_streams(self, strategy: Dict, query: str) -> Dict[str, str]:
    """Run the configured retrieval streams concurrently and compress oversized results.

    Phase 1 starts one retrieval task per stream key listed in the strategy.
    Phase 2 formats each result and, when it exceeds the stream's
    ``compression_threshold``, queues an LLM compression pass (WP-25b).
    On any failure the already-formatted context is reused so that no
    re-retrieval is needed (WP-24c v4.5.5).

    Args:
        strategy: Strategy config; its ``use_streams`` entry lists stream keys.
        query: User query forwarded to every stream.

    Returns:
        Mapping of stream name -> final (possibly compressed) context string.
    """
    stream_keys = strategy.get("use_streams", [])
    library = self.config.get("streams_library", {})

    # Phase 1: start one retrieval task per stream key known to the library.
    retrieval_tasks = []
    active_streams = []
    for key in stream_keys:
        stream_cfg = library.get(key)
        if stream_cfg:
            active_streams.append(key)
            retrieval_tasks.append(self._run_single_stream(key, stream_cfg, query))

    # Collect results; with return_exceptions=True failures are returned, not raised.
    retrieval_results = await asyncio.gather(*retrieval_tasks, return_exceptions=True)

    # Phase 2: formatting and optional compression.
    # WP-24c v4.5.5: context reuse - keep formatted_context available even when
    # compression fails later.
    final_stream_tasks = []
    formatted_contexts = {}  # WP-24c v4.5.5: persisted for fallback access

    for name, res in zip(active_streams, retrieval_results):
        if isinstance(res, Exception):
            logger.error(f"Stream '{name}' failed during retrieval: {res}")
            error_msg = f"[Fehler im Wissens-Stream {name}]"
            formatted_contexts[name] = error_msg
            # Default argument binds the current error_msg (avoids the
            # late-binding-closure-in-a-loop pitfall).
            async def _err(msg=error_msg): return msg
            final_stream_tasks.append(_err())
            continue

        formatted_context = self._format_stream_context(res)
        formatted_contexts[name] = formatted_context  # WP-24c v4.5.5: persisted for fallback

        # WP-25a: compression check (content condensation).
        stream_cfg = library.get(name, {})
        threshold = stream_cfg.get("compression_threshold", 4000)

        if len(formatted_context) > threshold:
            logger.info(f"⚙️ [WP-25b] Triggering Lazy-Compression for stream '{name}'...")
            comp_profile = stream_cfg.get("compression_profile")
            # WP-24c v4.5.5: compression with context reuse - on error the
            # original formatted_context is returned (see _compress_stream_content).
            final_stream_tasks.append(
                self._compress_stream_content(name, formatted_context, query, comp_profile)
            )
        else:
            # Default argument freezes the current context for this coroutine.
            async def _direct(c=formatted_context): return c
            final_stream_tasks.append(_direct())

    # Finish all final contents in parallel.
    # WP-24c v4.5.5: compression errors yield the original content.
    final_contents = await asyncio.gather(*final_stream_tasks, return_exceptions=True)

    # WP-24c v4.5.5: exception handling for final contents - fall back to the
    # persisted original context instead of failing.
    final_results = {}
    for name, content in zip(active_streams, final_contents):
        if isinstance(content, Exception):
            logger.warning(f"⚠️ [CONTEXT-REUSE] Stream '{name}' Fehler in finaler Verarbeitung: {content}. Verwende Original-Context.")
            final_results[name] = formatted_contexts.get(name, f"[Fehler im Stream {name}]")
        else:
            final_results[name] = content

    logger.debug(f"📊 [STREAMS] Finale Stream-Ergebnisse: {[(k, len(v)) for k, v in final_results.items()]}")
    return final_results
|
||||
|
||||
async def _compress_stream_content(self, stream_name: str, content: str, query: str, profile: Optional[str]) -> str:
    """
    WP-25b: condense stream content via the lazily loaded 'compression_template'.

    WP-24c v4.5.5: context reuse - on any failure (LLM error or an unusable
    summary) the original content is returned unchanged so no re-retrieval
    is needed.

    Args:
        stream_name: Name of the stream (used for logging and the template).
        content: Formatted context string to condense.
        query: Original user query, passed to the template to keep focus.
        profile: Optional LLM profile name for the compression call.

    Returns:
        The stripped summary, or the original ``content`` on failure.
    """
    try:
        # WP-24c v4.5.5: trace logging for the compression path.
        logger.debug(f"🔧 [COMPRESSION] Starte Kompression für Stream '{stream_name}' (Content-Länge: {len(content)})")

        summary = await self.llm_service.generate_raw_response(
            prompt_key="compression_template",
            variables={
                "stream_name": stream_name,
                "content": content,
                "query": query
            },
            profile_name=profile,
            priority="background",
            max_retries=1
        )

        # WP-24c v4.5.5: validate the compression result - reject trivially
        # short output and fall back to the original.
        if summary and len(summary.strip()) > 10:
            logger.debug(f"✅ [COMPRESSION] Kompression erfolgreich für '{stream_name}' (Original: {len(content)}, Komprimiert: {len(summary)})")
            return summary.strip()
        else:
            logger.warning(f"⚠️ [COMPRESSION] Kompressions-Ergebnis zu kurz für '{stream_name}', verwende Original-Content")
            return content

    except Exception as e:
        # WP-24c v4.5.5: context reuse - return the original content (no re-retrieval).
        logger.error(f"❌ [COMPRESSION] Kompression von '{stream_name}' fehlgeschlagen: {e}")
        logger.info(f"🔄 [CONTEXT-REUSE] Verwende Original-Content für '{stream_name}' (Länge: {len(content)}) - KEIN Re-Retrieval")
        return content
|
||||
|
||||
async def _run_single_stream(self, name: str, cfg: Dict, query: str) -> QueryResponse:
    """Run one specialised graph search with stream tracing and edge boosts.

    Builds a QueryRequest from the stream config (query template, top_k,
    type filters, edge boosts), executes it via the retriever, and tags
    every hit with its stream of origin.

    Args:
        name: Stream name (used for tracing and ``stream_origin`` tagging).
        cfg: Stream config providing query_template, top_k, filter_types,
             and edge_boosts.
        query: User query substituted into the stream's query template.

    Returns:
        The QueryResponse, with ``stream_origin`` set on each hit.
    """
    transformed_query = cfg.get("query_template", "{query}").format(query=query)

    request = QueryRequest(
        query=transformed_query,
        top_k=cfg.get("top_k", 5),
        filters={"type": cfg.get("filter_types", [])},
        expand={"depth": 1},
        boost_edges=cfg.get("edge_boosts", {}),  # keep the configured edge weighting
        explain=True
    )

    # WP-24c v4.5.0-DEBUG: retrieval tracer - log before the search.
    logger.info(f"🔍 [RETRIEVAL] Starte Stream: '{name}'")
    logger.info(f"   -> Transformierte Query: '{transformed_query}'")
    logger.debug(f"   ⚙️ [FILTER] Angewandte Metadaten-Filter: {request.filters}")
    logger.debug(f"   ⚙️ [FILTER] Top-K: {request.top_k}, Expand-Depth: {request.expand.get('depth') if request.expand else None}")

    response = await self.retriever.search(request)

    # WP-24c v4.5.0-DEBUG: retrieval tracer - log after the search.
    if not response.results:
        logger.warning(f"⚠️ [EMPTY] Stream '{name}' lieferte 0 Ergebnisse.")
    else:
        logger.info(f"✨ [SUCCESS] Stream '{name}' lieferte {len(response.results)} Treffer.")
        # Log the top-3 hits at DEBUG level.
        # WP-24c v4.5.4: QueryHit has no chunk_id field - node_id carries the chunk ID.
        for i, hit in enumerate(response.results[:3]):
            chunk_id = hit.node_id  # node_id is the chunk ID (pid)
            score = hit.total_score  # QueryHit exposes total_score, not score
            logger.debug(f"   [{i+1}] Chunk: {chunk_id} | Score: {score:.4f} | Path: {hit.source.get('path', 'N/A') if hit.source else 'N/A'}")

    # Tag each hit with its stream of origin for downstream attribution.
    for hit in response.results:
        hit.stream_origin = name
    return response
|
||||
|
||||
def _format_stream_context(self, response: QueryResponse) -> str:
|
||||
"""Wandelt QueryHits in einen formatierten Kontext-String um."""
|
||||
if not response.results:
|
||||
return "Keine spezifischen Informationen gefunden."
|
||||
lines = []
|
||||
for i, hit in enumerate(response.results, 1):
|
||||
source = hit.source.get("path", "Unbekannt")
|
||||
content = hit.source.get("text", "").strip()
|
||||
lines.append(f"[{i}] QUELLE: {source}\nINHALT: {content}")
|
||||
return "\n\n".join(lines)
|
||||
|
||||
async def _generate_final_answer(
    self,
    strategy_key: str,
    strategy: Dict,
    query: str,
    stream_results: Dict[str, str]
) -> str:
    """WP-25b: final synthesis via lazy prompt, keeping the robustness of v1.2.1.

    On synthesis failure a three-stage fallback chain runs, each stage reusing
    the existing ``stream_results`` (no re-retrieval, WP-24c v4.5.5):
      1. 'fallback_synthesis' template,
      2. 'fallback_synthesis_generic' template,
      3. a directly constructed prompt.

    Args:
        strategy_key: Name of the selected strategy (currently only for callers' context).
        strategy: Strategy config (llm_profile, prompt_template, prepend_instruction).
        query: The user query.
        stream_results: Mapping of stream name -> context string from retrieval.

    Returns:
        The synthesized answer string, or a German apology message if even the
        fallback context is unusable.
    """
    profile = strategy.get("llm_profile")
    template_key = strategy.get("prompt_template", "fact_synthesis_v1")
    system_prompt = self.llm_service.get_prompt("system_prompt")

    # WP-25 ROBUSTNESS: pre-initialise every possible stream variable so the
    # template never sees a missing key.
    all_possible_streams = ["values_stream", "facts_stream", "biography_stream", "risk_stream", "tech_stream"]
    template_vars = {s: "" for s in all_possible_streams}
    template_vars.update(stream_results)
    template_vars["query"] = query

    # WP-25a: prepend instructions taken from the strategy_config.
    prepend = strategy.get("prepend_instruction", "")
    template_vars["prepend_instruction"] = prepend

    try:
        # WP-25b: delegate the synthesis to the LLMService.
        response = await self.llm_service.generate_raw_response(
            prompt_key=template_key,
            variables=template_vars,
            system=system_prompt,
            profile_name=profile,
            priority="realtime"
        )

        # WP-25a RECOVERY: if the prepend_instruction is not contained in the
        # template from prompts.yaml (WP-25b lazy loading), prepend it manually
        # here to preserve the v1.2.1 behaviour. The 50-char window allows for
        # minor leading text before the instruction.
        if prepend and prepend not in response[:len(prepend)+50]:
            logger.info("ℹ️ Adding prepend_instruction manually (not found in response).")
            response = f"{prepend}\n\n{response}"

        return response

    except Exception as e:
        logger.error(f"Final Synthesis failed: {e}")
        # WP-24c v4.5.5: ROBUST FALLBACK with context reuse -
        # the existing stream_results are reused; NO re-retrieval.
        logger.info(f"🔄 [FALLBACK] Verwende vorhandene stream_results (KEIN Re-Retrieval)")
        logger.debug(f"   -> Verfügbare Streams: {list(stream_results.keys())}")
        logger.debug(f"   -> Stream-Längen: {[(k, len(v)) for k, v in stream_results.items()]}")

        # WP-24c v4.5.5: context reuse - join the existing stream results,
        # skipping trivially short (likely error-placeholder) entries.
        fallback_context = "\n\n".join([v for v in stream_results.values() if len(v) > 20])

        if not fallback_context or len(fallback_context.strip()) < 20:
            logger.warning(f"⚠️ [FALLBACK] Fallback-Context zu kurz ({len(fallback_context)} Zeichen). Stream-Ergebnisse möglicherweise leer.")
            return f"Entschuldigung, ich konnte keine relevanten Informationen zu Ihrer Anfrage finden. (Fehler: {str(e)})"

        try:
            # WP-24c v4.5.5: fallback synthesis with LLM trace logging.
            logger.info(f"🔄 [FALLBACK] Starte Fallback-Synthese mit vorhandenem Context (Länge: {len(fallback_context)})")
            logger.debug(f"   -> Fallback-Profile: {profile}, Template: fallback_synthesis")

            result = await self.llm_service.generate_raw_response(
                prompt_key="fallback_synthesis",
                variables={"query": query, "context": fallback_context},
                system=system_prompt, priority="realtime", profile_name=profile
            )

            logger.info(f"✅ [FALLBACK] Fallback-Synthese erfolgreich (Antwort-Länge: {len(result) if result else 0})")
            return result

        except (ValueError, KeyError) as template_error:
            # WP-24c v4.5.9: fall back to a generic template with variables.
            # Uses WP-25b lazy loading for model-specific fallback prompts.
            logger.warning(f"⚠️ [FALLBACK] Template 'fallback_synthesis' nicht gefunden: {template_error}. Versuche generisches Template.")
            logger.debug(f"   -> Fallback-Profile: {profile}, Context-Länge: {len(fallback_context)}")

            try:
                # WP-24c v4.5.9: try the generic template with variables (lazy loading).
                result = await self.llm_service.generate_raw_response(
                    prompt_key="fallback_synthesis_generic",  # fallback template
                    variables={"query": query, "context": fallback_context},
                    system=system_prompt, priority="realtime", profile_name=profile
                )
                logger.info(f"✅ [FALLBACK] Generisches Template erfolgreich (Antwort-Länge: {len(result) if result else 0})")
                return result
            except (ValueError, KeyError) as fallback_error:
                # WP-24c v4.5.9: last resort - a directly constructed prompt
                # (only reached when both templates are missing).
                logger.error(f"❌ [FALLBACK] Auch generisches Template nicht gefunden: {fallback_error}. Verwende direkten Prompt als letzten Fallback.")
                result = await self.llm_service.generate_raw_response(
                    prompt=f"Beantworte: {query}\n\nKontext:\n{fallback_context}",
                    system=system_prompt, priority="realtime", profile_name=profile
                )
                logger.info(f"✅ [FALLBACK] Direkter Prompt erfolgreich (Antwort-Länge: {len(result) if result else 0})")
                return result
|
||||
587
app/core/retrieval/retriever.py
Normal file
587
app/core/retrieval/retriever.py
Normal file
|
|
@ -0,0 +1,587 @@
|
|||
"""
|
||||
FILE: app/core/retrieval/retriever.py
|
||||
DESCRIPTION: Haupt-Schnittstelle für die Suche. Orchestriert Vektorsuche und Graph-Expansion.
|
||||
WP-15c Update: Note-Level Diversity Pooling & Super-Edge Aggregation.
|
||||
WP-24c v4.1.0: Gold-Standard - Scope-Awareness, Section-Filtering, Authority-Priorisierung.
|
||||
VERSION: 0.8.0 (WP-24c: Gold-Standard v4.1.0)
|
||||
STATUS: Active
|
||||
DEPENDENCIES: app.config, app.models.dto, app.core.database*, app.core.graph_adapter
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import time
|
||||
import logging
|
||||
from typing import Any, Dict, List, Tuple, Iterable, Optional
|
||||
from collections import defaultdict
|
||||
|
||||
from app.config import get_settings
|
||||
from app.models.dto import (
|
||||
QueryRequest, QueryResponse, QueryHit,
|
||||
Explanation, ScoreBreakdown, Reason, EdgeDTO
|
||||
)
|
||||
|
||||
# MODULARISIERUNG: Neue Import-Pfade für die Datenbank-Ebene
|
||||
import app.core.database.qdrant as qdr
|
||||
import app.core.database.qdrant_points as qp
|
||||
|
||||
import app.services.embeddings_client as ec
|
||||
import app.core.graph.graph_subgraph as ga
|
||||
import app.core.graph.graph_db_adapter as gdb
|
||||
from app.core.graph.graph_utils import PROVENANCE_PRIORITY
|
||||
from qdrant_client.http import models as rest
|
||||
|
||||
# Mathematische Engine importieren
|
||||
from app.core.retrieval.retriever_scoring import get_weights, compute_wp22_score
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ==============================================================================
|
||||
# 1. CORE HELPERS & CONFIG LOADERS
|
||||
# ==============================================================================
|
||||
|
||||
def _get_client_and_prefix() -> Tuple[Any, str]:
    """Initialise the Qdrant client via the database package.

    Returns:
        A ``(client, prefix)`` pair where ``prefix`` is the configured
        collection prefix from the environment.
    """
    config = qdr.QdrantConfig.from_env()
    client = qdr.get_client(config)
    return client, config.prefix
|
||||
|
||||
|
||||
def _get_query_vector(req: QueryRequest) -> List[float]:
    """Resolve the embedding vector for a request.

    An explicitly supplied ``query_vector`` wins over the query text. Both
    signatures of ``ec.embed_text`` are supported (with and without the
    ``model_name`` keyword).

    Raises:
        ValueError: If neither text nor vector is present on the request.
    """
    if req.query_vector:
        return list(req.query_vector)
    if not req.query:
        raise ValueError("Kein Text oder Vektor für die Suche angegeben.")

    model = get_settings().MODEL_NAME
    try:
        # Modern interface first (WP-03 compatible).
        return ec.embed_text(req.query, model_name=model)
    except TypeError:
        # Legacy signature that rejects the 'model_name' keyword.
        logger.debug("ec.embed_text does not accept 'model_name' keyword. Falling back.")
        return ec.embed_text(req.query)
|
||||
|
||||
|
||||
def _get_chunk_ids_for_notes(
    client: Any,
    prefix: str,
    note_ids: List[str]
) -> List[str]:
    """
    WP-24c v4.1.0: collect all chunk IDs belonging to the given note IDs.

    Needed for scope-aware edge retrieval. Best-effort: scroll failures are
    logged and an empty/partial list is returned.
    """
    if not note_ids:
        return []

    _, chunks_col, _ = qp._names(prefix)
    collected: List[str] = []

    try:
        # OR-filter: note_id must match any of the requested note IDs.
        conditions = [
            rest.FieldCondition(key="note_id", match=rest.MatchValue(value=str(nid)))
            for nid in note_ids
        ]

        points, _ = client.scroll(
            collection_name=chunks_col,
            scroll_filter=rest.Filter(should=conditions),
            limit=2048,  # single scroll page; large enough for typical note sets
            with_payload=True,
            with_vectors=False
        )

        for point in points:
            chunk = (point.payload or {}).get("chunk_id")
            if chunk:
                collected.append(str(chunk))
    except Exception as e:
        logger.warning(f"Failed to load chunk IDs for notes: {e}")

    return collected
|
||||
|
||||
def _semantic_hits(
    client: Any,
    prefix: str,
    vector: List[float],
    top_k: int,
    filters: Optional[Dict] = None,
    target_section: Optional[str] = None
) -> List[Tuple[str, float, Dict[str, Any]]]:
    """
    Run the vector search through the database points module.

    WP-24c v4.1.0: supports optional section filtering.

    Args:
        client: Qdrant client instance.
        prefix: Collection prefix.
        vector: Query embedding.
        top_k: Maximum number of raw hits to request.
        filters: Optional payload filters passed through to Qdrant.
        target_section: Optional section name merged into the filters.

    Returns:
        List of ``(point_id, score, payload)`` tuples with strict types.
    """
    # WP-24c v4.1.0: section filtering for precise section links.
    if target_section and filters:
        filters = {**filters, "section": target_section}
    elif target_section:
        filters = {"section": target_section}

    raw_hits = qp.search_chunks_by_vector(client, prefix, vector, top=top_k, filters=filters)

    # WP-24c v4.5.0-DEBUG: retrieval tracer - log the raw Qdrant response.
    logger.debug(f"📊 [RAW-HITS] Qdrant lieferte {len(raw_hits)} Roh-Treffer (Top-K: {top_k})")
    if filters:
        logger.debug(f"   ⚙️ [FILTER] Angewandte Filter: {filters}")

    # Log the top-3 raw scores for diagnosis; guarded tuple access in case a
    # hit tuple is shorter than expected.
    for i, hit in enumerate(raw_hits[:3]):
        hit_id = str(hit[0]) if hit else "N/A"
        hit_score = float(hit[1]) if hit and len(hit) > 1 else 0.0
        hit_payload = dict(hit[2] or {}) if hit and len(hit) > 2 else {}
        hit_path = hit_payload.get('path', 'N/A')
        logger.debug(f"   [{i+1}] ID: {hit_id} | Raw-Score: {hit_score:.4f} | Path: {hit_path}")

    # Strict type conversion for stability.
    return [(str(hit[0]), float(hit[1]), dict(hit[2] or {})) for hit in raw_hits]
|
||||
|
||||
# ==============================================================================
|
||||
# 2. EXPLANATION LAYER (DEBUG & VERIFIABILITY)
|
||||
# ==============================================================================
|
||||
|
||||
def _build_explanation(
    semantic_score: float,
    payload: Dict[str, Any],
    scoring_debug: Dict[str, Any],
    subgraph: Optional[ga.Subgraph],
    target_note_id: Optional[str],
    applied_boosts: Optional[Dict[str, float]] = None
) -> Explanation:
    """
    Turn the mathematical scores and graph signals into a human-readable explanation.

    Args:
        semantic_score: Raw vector-similarity score of the hit.
        payload: Qdrant payload of the hit (supplies ``retriever_weight``).
        scoring_debug: Debug dict produced by ``compute_wp22_score``.
        subgraph: Expanded graph neighbourhood, if available.
        target_note_id: Note ID of the hit; anchor for edge direction.
        applied_boosts: Intent-specific edge-kind boost factors, if any.

    Returns:
        An Explanation with score breakdown, reasons, and related edges.
    """
    _, edge_w_cfg, _ = get_weights()
    base_val = scoring_debug["base_val"]

    # 1. Detailed mathematical breakdown.
    breakdown = ScoreBreakdown(
        semantic_contribution=base_val,
        edge_contribution=base_val * scoring_debug["edge_impact_final"],
        centrality_contribution=base_val * scoring_debug["cent_impact_final"],
        raw_semantic=semantic_score,
        raw_edge_bonus=scoring_debug["edge_bonus"],
        raw_centrality=scoring_debug["cent_bonus"],
        node_weight=float(payload.get("retriever_weight", 1.0)),
        status_multiplier=scoring_debug["status_multiplier"],
        graph_boost_factor=scoring_debug["graph_boost_factor"]
    )

    reasons: List[Reason] = []
    edges_dto: List[EdgeDTO] = []

    # 2. Reasons derived from semantic similarity (thresholds 0.85 / 0.70).
    if semantic_score > 0.85:
        reasons.append(Reason(kind="semantic", message="Sehr hohe textuelle Übereinstimmung.", score_impact=base_val))
    elif semantic_score > 0.70:
        reasons.append(Reason(kind="semantic", message="Inhaltliche Übereinstimmung.", score_impact=base_val))

    # 3. Reasons for type and lifecycle (WP-25 preparation).
    type_weight = float(payload.get("retriever_weight", 1.0))
    if type_weight != 1.0:
        msg = "Bevorzugt" if type_weight > 1.0 else "De-priorisiert"
        reasons.append(Reason(kind="type", message=f"{msg} durch Typ-Profil.", score_impact=base_val * (type_weight - 1.0)))

    # NEW: explicitly surface the content-lifecycle status (WP-22).
    status_mult = scoring_debug.get("status_multiplier", 1.0)
    if status_mult != 1.0:
        status_msg = "Belohnt (Stable)" if status_mult > 1.0 else "De-priorisiert (Draft)"
        reasons.append(Reason(
            kind="status",
            message=f"{status_msg} durch Content-Lifecycle.",
            score_impact=semantic_score * (status_mult - 1.0)
        ))

    # 4. Edge processing (graph intelligence).
    if subgraph and target_note_id and scoring_debug["edge_bonus"] > 0:
        raw_edges = []
        if hasattr(subgraph, "get_incoming_edges"):
            raw_edges.extend(subgraph.get_incoming_edges(target_note_id) or [])
        if hasattr(subgraph, "get_outgoing_edges"):
            raw_edges.extend(subgraph.get_outgoing_edges(target_note_id) or [])

        for edge in raw_edges:
            src = str(edge.get("source") or "note_root")
            tgt = str(edge.get("target") or target_note_id or "unknown_target")
            kind = str(edge.get("kind", "related_to"))
            prov = str(edge.get("provenance", "rule"))
            conf = float(edge.get("confidence", 1.0))

            direction = "in" if tgt == target_note_id else "out"

            # WP-24c v4.5.10: robust EdgeDTO creation with error handling.
            # If the provenance value is not supported, fall back below.
            try:
                edge_obj = EdgeDTO(
                    id=f"{src}->{tgt}:{kind}",
                    kind=kind,
                    source=src,
                    target=tgt,
                    weight=conf,
                    direction=direction,
                    provenance=prov,
                    confidence=conf
                )
                edges_dto.append(edge_obj)
            except Exception as e:
                # WP-24c v4.5.10: validation-error fallback (e.g. an older
                # EdgeDTO version in the cache).
                logger.warning(
                    f"⚠️ [EDGE-DTO] Provenance '{prov}' nicht unterstützt für Edge {src}->{tgt} ({kind}). "
                    f"Fehler: {e}. Verwende Fallback 'explicit'."
                )
                # Fallback: use 'explicit' as a safe default.
                try:
                    edge_obj = EdgeDTO(
                        id=f"{src}->{tgt}:{kind}",
                        kind=kind,
                        source=src,
                        target=tgt,
                        weight=conf,
                        direction=direction,
                        provenance="explicit",  # fallback
                        confidence=conf
                    )
                    edges_dto.append(edge_obj)
                except Exception as e2:
                    logger.error(f"❌ [EDGE-DTO] Auch Fallback fehlgeschlagen: {e2}. Überspringe Edge.")
                    # Skip this edge - better than failing the whole explanation.

        # Phrase the three strongest edges as reasons.
        top_edges = sorted(edges_dto, key=lambda e: e.confidence, reverse=True)
        for e in top_edges[:3]:
            peer = e.source if e.direction == "in" else e.target
            # WP-24c v4.5.3: support all explicit variants (explicit, explicit:callout, ...).
            prov_txt = "Bestätigte" if e.provenance and e.provenance.startswith("explicit") else "KI-basierte"
            boost_txt = f" [Boost x{applied_boosts.get(e.kind)}]" if applied_boosts and e.kind in applied_boosts else ""

            reasons.append(Reason(
                kind="edge",
                message=f"{prov_txt} Kante '{e.kind}'{boost_txt} von/zu '{peer}'.",
                score_impact=edge_w_cfg * e.confidence
            ))

    if scoring_debug["cent_bonus"] > 0.01:
        reasons.append(Reason(kind="centrality", message="Die Notiz ist ein zentraler Informations-Hub.", score_impact=breakdown.centrality_contribution))

    return Explanation(
        breakdown=breakdown,
        reasons=reasons,
        related_edges=edges_dto if edges_dto else None,
        applied_boosts=applied_boosts
    )
|
||||
|
||||
# ==============================================================================
|
||||
# 3. CORE RETRIEVAL PIPELINE
|
||||
# ==============================================================================
|
||||
|
||||
def _build_hits_from_semantic(
    hits: Iterable[Tuple[str, float, Dict[str, Any]]],
    top_k: int,
    used_mode: str,
    subgraph: ga.Subgraph | None = None,
    explain: bool = False,
    dynamic_edge_boosts: Dict[str, float] = None
) -> QueryResponse:
    """
    Convert raw semantic hits into scored QueryHits.

    WP-15c: implements note-level diversity pooling - only the best hit per
    note_id survives, then top_k is applied.

    Args:
        hits: Iterable of ``(point_id, semantic_score, payload)`` tuples.
        top_k: Maximum number of final hits.
        used_mode: Mode label reported on the response ("hybrid"/"semantic").
        subgraph: Optional expanded graph for edge/centrality bonuses.
        explain: Whether to attach an Explanation to each hit.
        dynamic_edge_boosts: Optional intent-specific edge-kind boosts.

    Returns:
        QueryResponse with scored, pooled hits and the measured latency.
    """
    t0 = time.time()
    enriched = []

    # First scoring pass over all candidates.
    for pid, semantic_score, payload in hits:
        edge_bonus, cent_bonus = 0.0, 0.0
        target_id = payload.get("note_id")

        if subgraph and target_id:
            try:
                edge_bonus = float(subgraph.edge_bonus(target_id))
                cent_bonus = float(subgraph.centrality_bonus(target_id))
            except Exception:
                # Node missing from the subgraph -> no graph bonus.
                pass

        debug_data = compute_wp22_score(
            semantic_score, payload, edge_bonus, cent_bonus, dynamic_edge_boosts
        )
        enriched.append((pid, semantic_score, payload, debug_data))

    # 1. Sort by the final mathematical score.
    enriched_sorted = sorted(enriched, key=lambda h: h[3]["total"], reverse=True)

    # 2. WP-15c: note-level diversity pooling.
    # Keep only the highest-scoring hit per note_id so that ten chunks of the
    # same note cannot crowd out other key notes.
    unique_note_hits = []
    seen_notes = set()

    for item in enriched_sorted:
        _, _, payload, _ = item
        note_id = str(payload.get("note_id", "unknown"))

        if note_id not in seen_notes:
            unique_note_hits.append(item)
            seen_notes.add(note_id)

    # 3. Apply top_k only after the diversity pooling (always keep >= 1).
    limited_hits = unique_note_hits[: max(1, top_k)]

    results: List[QueryHit] = []
    for pid, s_score, pl, dbg in limited_hits:
        explanation_obj = None
        if explain:
            explanation_obj = _build_explanation(
                semantic_score=float(s_score),
                payload=pl,
                scoring_debug=dbg,
                subgraph=subgraph,
                target_note_id=pl.get("note_id"),
                applied_boosts=dynamic_edge_boosts
            )

        # Payloads use different text keys depending on ingestion path.
        text_content = pl.get("page_content") or pl.get("text") or pl.get("content", "[Kein Text]")

        # WP-24c v4.1.0: RAG context - extract source_chunk_id from the edge payload.
        source_chunk_id = None
        if explanation_obj and explanation_obj.related_edges:
            # Find the first edge whose source looks like a chunk ID
            # (contains '#' or starts with 'chunk:').
            for edge in explanation_obj.related_edges:
                if edge.source and ("#" in edge.source or edge.source.startswith("chunk:")):
                    source_chunk_id = edge.source
                    break

        results.append(QueryHit(
            node_id=str(pid),
            note_id=str(pl.get("note_id", "unknown")),
            semantic_score=float(s_score),
            edge_bonus=dbg["edge_bonus"],
            centrality_bonus=dbg["cent_bonus"],
            total_score=dbg["total"],
            source={
                "path": pl.get("path"),
                "section": pl.get("section") or pl.get("section_title"),
                "text": text_content
            },
            payload=pl,
            explanation=explanation_obj,
            source_chunk_id=source_chunk_id  # WP-24c v4.1.0: RAG context
        ))

    # WP-24c v4.5.0-DEBUG: retrieval tracer - final results.
    latency_ms = int((time.time() - t0) * 1000)
    if not results:
        logger.warning(f"⚠️ [EMPTY] Hybride Suche lieferte 0 Ergebnisse (Latency: {latency_ms}ms)")
    else:
        logger.info(f"✨ [SUCCESS] Hybride Suche lieferte {len(results)} Treffer (Latency: {latency_ms}ms)")
        # Log the top-3 final scores at DEBUG level.
        # WP-24c v4.5.4: QueryHit has no chunk_id field - node_id carries the chunk ID.
        for i, hit in enumerate(results[:3]):
            chunk_id = hit.node_id  # node_id is the chunk ID (pid)
            logger.debug(f"   [{i+1}] Final: Chunk={chunk_id} | Total-Score={hit.total_score:.4f} | Semantic={hit.semantic_score:.4f} | Edge={hit.edge_bonus:.4f}")

    return QueryResponse(results=results, used_mode=used_mode, latency_ms=latency_ms)
|
||||
|
||||
|
||||
def hybrid_retrieve(req: QueryRequest) -> QueryResponse:
    """
    Main entry point for the hybrid search (vector + graph).

    WP-15c: implements edge aggregation (super edges).
    WP-24c v4.5.0-DEBUG: retrieval tracer for diagnosis.

    Pipeline:
      1. Semantic seed search, oversampled 3x for diversity pooling.
      2. Graph expansion around seed notes/chunks, followed by edge
         deduplication/aggregation and authority weighting.
      3. Final scoring and hit construction via _build_hits_from_semantic.

    Args:
        req: Query request carrying text or vector, filters, expand config,
             and optional edge boosts.

    Returns:
        QueryResponse with scored, diversity-pooled hits.
    """
    # WP-24c v4.5.0-DEBUG: retrieval tracer - start of the hybrid search.
    logger.info(f"🔍 [RETRIEVAL] Starte hybride Suche")
    logger.info(f"   -> Query: '{req.query[:100]}...' (Länge: {len(req.query)})")
    logger.debug(f"   ⚙️ [FILTER] Request-Filter: {req.filters}")
    logger.debug(f"   ⚙️ [FILTER] Top-K: {req.top_k}, Expand: {req.expand}, Target-Section: {req.target_section}")
    client, prefix = _get_client_and_prefix()
    vector = list(req.query_vector) if req.query_vector else _get_query_vector(req)
    top_k = req.top_k or 10

    # 1. Semantic seed search (load extra candidates for the pooling step).
    # WP-24c v4.1.0: support section filtering.
    target_section = getattr(req, "target_section", None)

    # WP-24c v4.5.0-DEBUG: retrieval tracer - before the semantic search.
    logger.debug(f"🔍 [RETRIEVAL] Starte semantische Seed-Suche (Top-K: {top_k * 3}, Target-Section: {target_section})")

    hits = _semantic_hits(client, prefix, vector, top_k=top_k * 3, filters=req.filters, target_section=target_section)

    # WP-24c v4.5.0-DEBUG: retrieval tracer - after the semantic search.
    logger.debug(f"📊 [SEED-HITS] Semantische Suche lieferte {len(hits)} Seed-Treffer")

    # 2. Graph expansion configuration.
    expand_cfg = req.expand if isinstance(req.expand, dict) else {}
    depth = int(expand_cfg.get("depth", 1))
    boost_edges = getattr(req, "boost_edges", {}) or {}

    subgraph: ga.Subgraph | None = None
    if depth > 0 and hits:
        # WP-24c v4.5.2: chunk-aware graph traversal.
        # Extract both note_id and chunk_id (pid) directly from the hits so
        # that chunk-scope edges are found as well.
        seed_note_ids = list({h[2].get("note_id") for h in hits if h[2].get("note_id")})
        seed_chunk_ids = list({h[0] for h in hits if h[0]})  # pid is the chunk ID

        # Combine both sets for full seed coverage; chunk IDs can also act
        # as note IDs (for note-scope edges).
        all_seed_ids = list(set(seed_note_ids + seed_chunk_ids))

        if all_seed_ids:
            try:
                # WP-24c v4.5.2: chunk IDs are already extracted from the hits;
                # additionally load chunk IDs for the seed notes, in case not
                # every chunk made it into the top-k hits.
                additional_chunk_ids = _get_chunk_ids_for_notes(client, prefix, seed_note_ids)
                # Merge direct chunk IDs from the hits with the extra ones.
                all_chunk_ids = list(set(seed_chunk_ids + additional_chunk_ids))

                # WP-24c v4.5.2: extended edge retrieval with chunk scope and
                # section filtering. all_seed_ids contains both note and chunk
                # IDs; all_chunk_ids drives explicit chunk-scope edge search.
                subgraph = ga.expand(
                    client, prefix, all_seed_ids,
                    depth=depth,
                    edge_types=expand_cfg.get("edge_types"),
                    chunk_ids=all_chunk_ids,
                    target_section=target_section
                )

                # WP-24c v4.5.2: debug logging for chunk awareness.
                logger.debug(f"🔍 [SEEDS] Note-IDs: {len(seed_note_ids)}, Chunk-IDs: {len(seed_chunk_ids)}, Total Seeds: {len(all_seed_ids)}")
                logger.debug(f"   -> Zusätzliche Chunk-IDs geladen: {len(additional_chunk_ids)}, Total Chunk-IDs: {len(all_chunk_ids)}")

                # --- WP-24c v4.1.0: chunk-level edge aggregation & deduplication ---
                # Prevents score explosion through multiple links to different
                # sections: the first edge counts fully, further ones are damped
                # by a factor of 0.1. Extended with chunk-level tracking for a
                # precise in-degree calculation.
                if subgraph and hasattr(subgraph, "adj"):
                    # WP-24c v4.1.0: chunk-level in-degree tracking
                    # (target -> count of chunk sources).
                    chunk_level_in_degree = defaultdict(int)

                    for src, edge_list in subgraph.adj.items():
                        # Group edges by target note (dedup ID_A -> ID_B).
                        by_target = defaultdict(list)
                        for e in edge_list:
                            by_target[e["target"]].append(e)

                            # WP-24c v4.1.0: when the source is a chunk ID,
                            # count towards the chunk-level in-degree.
                            # NOTE(review): indentation reconstructed as per-edge
                            # counting (matches the comment's intent); confirm
                            # against VCS history.
                            if e.get("chunk_id") or (src and ("#" in src or src.startswith("chunk:"))):
                                chunk_level_in_degree[e["target"]] += 1

                        aggregated_list = []
                        for tgt, edges in by_target.items():
                            if len(edges) > 1:
                                # Sort: strongest edge first (authority prioritisation).
                                sorted_edges = sorted(
                                    edges,
                                    key=lambda x: (
                                        x.get("weight", 0.0) *
                                        (1.0 if not x.get("virtual", False) else 0.5) *  # virtual penalty
                                        float(x.get("confidence", 1.0))  # confidence boost
                                    ),
                                    reverse=True
                                )
                                primary = sorted_edges[0]

                                # Aggregate weight with saturation: each
                                # secondary edge adds only 10% of its weight.
                                total_w = primary.get("weight", 0.0)
                                chunk_count = 0
                                for secondary in sorted_edges[1:]:
                                    total_w += secondary.get("weight", 0.0) * 0.1
                                    if secondary.get("chunk_id") or (secondary.get("source") and ("#" in secondary.get("source", "") or secondary.get("source", "").startswith("chunk:"))):
                                        chunk_count += 1

                                primary["weight"] = total_w
                                primary["is_super_edge"] = True  # flag for the explanation layer
                                primary["edge_count"] = len(edges)
                                primary["chunk_source_count"] = chunk_count + (1 if (primary.get("chunk_id") or (primary.get("source") and ("#" in primary.get("source", "") or primary.get("source", "").startswith("chunk:")))) else 0)
                                aggregated_list.append(primary)
                            else:
                                edge = edges[0]
                                # WP-24c v4.1.0: chunk count for single edges too.
                                if edge.get("chunk_id") or (edge.get("source") and ("#" in edge.get("source", "") or edge.get("source", "").startswith("chunk:"))):
                                    edge["chunk_source_count"] = 1
                                aggregated_list.append(edge)

                        # In-place update of the graph's adjacency list.
                        subgraph.adj[src] = aggregated_list

                    # Re-sync the in-degrees for the centrality bonus
                    # (keep the aggregation consistent).
                    subgraph.in_degree = defaultdict(int)
                    for src, edges in subgraph.adj.items():
                        for e in edges:
                            subgraph.in_degree[e["target"]] += 1

                    # WP-24c v4.1.0: store chunk-level in-degree as an attribute.
                    subgraph.chunk_level_in_degree = chunk_level_in_degree

                # --- WP-24c v4.1.0: authority prioritisation (provenance & confidence) ---
                if subgraph and hasattr(subgraph, "adj"):
                    for src, edges in subgraph.adj.items():
                        for e in edges:
                            # A. Provenance weighting (PROVENANCE_PRIORITY from graph_utils).
                            prov = e.get("provenance", "rule")
                            prov_key = f"{prov}:{e.get('kind', 'related_to')}" if ":" not in prov else prov
                            prov_w = PROVENANCE_PRIORITY.get(prov_key, PROVENANCE_PRIORITY.get(prov, 0.7))

                            # B. Confidence weighting (from the edge payload).
                            confidence = float(e.get("confidence", 1.0))

                            # C. Virtual-flag de-prioritisation.
                            is_virtual = e.get("virtual", False)
                            virtual_penalty = 0.5 if is_virtual else 1.0

                            # D. Intent boost multiplier.
                            kind = e.get("kind")
                            intent_multiplier = boost_edges.get(kind, 1.0)

                            # Adjust the weight (authority prioritisation).
                            e["weight"] = e.get("weight", 1.0) * prov_w * confidence * virtual_penalty * intent_multiplier

            except Exception as e:
                logger.error(f"Graph Expansion failed: {e}")
                subgraph = None

    # 3. Scoring & explanation generation; top_k is applied only here.
    # WP-24c v4.5.0-DEBUG: retrieval tracer - before final hit construction.
    if subgraph:
        # WP-24c v4.5.1: Subgraph has no .edges attribute, only .adj
        # (adjacency list); count all edges from it.
        edge_count = sum(len(edges) for edges in subgraph.adj.values()) if hasattr(subgraph, 'adj') else 0
        logger.debug(f"📊 [GRAPH] Subgraph enthält {edge_count} Kanten")
    else:
        logger.debug(f"📊 [GRAPH] Kein Subgraph (depth=0 oder keine Seed-IDs)")

    result = _build_hits_from_semantic(hits, top_k, "hybrid", subgraph, req.explain, boost_edges)

    # WP-24c v4.5.0-DEBUG: retrieval tracer - after final hit construction.
    if not result.results:
        logger.warning(f"⚠️ [EMPTY] Hybride Suche lieferte nach Scoring 0 finale Ergebnisse")
    else:
        logger.info(f"✨ [SUCCESS] Hybride Suche lieferte {len(result.results)} finale Treffer (Mode: {result.used_mode})")

    return result
|
||||
|
||||
|
||||
def semantic_retrieve(req: QueryRequest) -> QueryResponse:
    """Plain vector search without any graph influence."""
    requested_k = req.top_k or 10
    client, prefix = _get_client_and_prefix()
    query_vector = _get_query_vector(req)
    raw_hits = _semantic_hits(client, prefix, query_vector, requested_k, req.filters)
    return _build_hits_from_semantic(raw_hits, requested_k, "semantic", explain=req.explain)
|
||||
|
||||
|
||||
class Retriever:
    """Interface for asynchronous search (async facade over the sync pipeline)."""
    async def search(self, request: QueryRequest) -> QueryResponse:
        # Delegates to the synchronous hybrid_retrieve; note this blocks the
        # event loop for the duration of the retrieval call.
        return hybrid_retrieve(request)
|
||||
128
app/core/retrieval/retriever_scoring.py
Normal file
128
app/core/retrieval/retriever_scoring.py
Normal file
|
|
@ -0,0 +1,128 @@
|
|||
"""
|
||||
FILE: app/core/retrieval/retriever_scoring.py
|
||||
DESCRIPTION: Mathematische Kern-Logik für das WP-22/WP-15c Scoring.
|
||||
Berechnet Relevanz-Scores basierend auf Semantik, Graph-Intelligence und Content Lifecycle.
|
||||
FIX v1.0.3: Optimierte Interaktion zwischen Typ-Boost und Status-Dämpfung.
|
||||
VERSION: 1.0.3
|
||||
STATUS: Active
|
||||
"""
|
||||
import os
|
||||
import logging
|
||||
from functools import lru_cache
|
||||
from typing import Any, Dict, Tuple, Optional
|
||||
|
||||
try:
|
||||
import yaml
|
||||
except ImportError:
|
||||
yaml = None
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@lru_cache
def get_weights() -> Tuple[float, float, float]:
    """
    Return the base scoring weights (semantic, edge, centrality).

    Defaults come from application settings; an optional YAML file
    (env MINDNET_RETRIEVER_CONFIG, default 'config/retriever.yaml')
    may override individual values under its 'scoring' section.
    The result is cached for the lifetime of the process.
    """
    from app.config import get_settings
    settings = get_settings()

    # Defaults from settings, keyed by their YAML override names.
    weights = {
        "semantic_weight": float(getattr(settings, "RETRIEVER_W_SEM", 1.0)),
        "edge_weight": float(getattr(settings, "RETRIEVER_W_EDGE", 0.0)),
        "centrality_weight": float(getattr(settings, "RETRIEVER_W_CENT", 0.0)),
    }

    # Optional override via YAML config file.
    config_path = os.getenv("MINDNET_RETRIEVER_CONFIG", "config/retriever.yaml")
    if yaml and os.path.exists(config_path):
        try:
            with open(config_path, "r", encoding="utf-8") as handle:
                scoring = (yaml.safe_load(handle) or {}).get("scoring", {})
            for key in weights:
                weights[key] = float(scoring.get(key, weights[key]))
        except Exception as e:
            logger.warning(f"Retriever Configuration could not be fully loaded from {config_path}: {e}")

    return weights["semantic_weight"], weights["edge_weight"], weights["centrality_weight"]
|
||||
|
||||
def get_status_multiplier(payload: Dict[str, Any]) -> float:
    """
    WP-22 A: content-lifecycle multiplier controlling rank by maturity.

    - stable: 1.2 (reward for verified knowledge)
    - active: 1.0 (default weighting)
    - draft:  0.5 (damping for unfinished fragments)

    Unknown or missing statuses fall back to 1.0.
    """
    multipliers = {"stable": 1.2, "draft": 0.5}
    normalized = str(payload.get("status", "active")).lower().strip()
    return multipliers.get(normalized, 1.0)
|
||||
|
||||
def compute_wp22_score(
    semantic_score: float,
    payload: Dict[str, Any],
    edge_bonus_raw: float = 0.0,
    cent_bonus_raw: float = 0.0,
    dynamic_edge_boosts: Optional[Dict[str, float]] = None
) -> Dict[str, Any]:
    """
    Central mathematical scoring formula (WP-15c optimized).
    Implements the hybrid scoring (Semantic * Lifecycle * Graph).

    LOGIC:
    1. Base = Similarity * StatusMult (lifecycle filter).
    2. Boosts = (TypeBoost - 1) + (GraphBonuses * IntentFactor).
    3. Final = Base * (1 + Boosts).

    edge_bonus_raw already contains the super-edge aggregation (WP-15c).

    Args:
        semantic_score: Raw similarity score for the chunk.
        payload: Chunk payload; reads 'status', 'retriever_weight',
            and 'chunk_id'/'id' (for tracing only).
        edge_bonus_raw: Pre-aggregated edge bonus from the retriever.
        cent_bonus_raw: Pre-aggregated centrality bonus.
        dynamic_edge_boosts: Intent-specific boost table; its presence
            (with a positive graph bonus) activates the 1.5x graph factor.

    Returns:
        Dict with 'total' plus every intermediate component, for explainability.
    """
    sem_w, edge_w_cfg, cent_w_cfg = get_weights()
    # NOTE(review): sem_w is unpacked but never used below — the configured
    # semantic weight does not influence base_val here. Confirm this is intended.
    status_mult = get_status_multiplier(payload)

    # Retriever weight (type boost from types.yaml, e.g. 1.1 for decisions)
    node_weight = float(payload.get("retriever_weight", 1.0))

    # 1. Base score: semantics weighted by lifecycle status.
    # IMPORTANT: the status acts as a multiplier on the base relevance.
    base_val = float(semantic_score) * status_mult

    # 2. Graph boost factor (intent-specific amplification from decision_engine.yaml).
    # Raises the weight of the whole graph by 50% when a specific intent is present
    # AND at least one graph bonus is positive.
    graph_boost_factor = 1.5 if dynamic_edge_boosts and (edge_bonus_raw > 0 or cent_bonus_raw > 0) else 1.0

    # 3. Individual graph components.
    # WP-15c note: edge_bonus_raw is already damped/aggregated by retriever.py.
    edge_impact_final = (edge_w_cfg * edge_bonus_raw) * graph_boost_factor
    cent_impact_final = (cent_w_cfg * cent_bonus_raw) * graph_boost_factor

    # 4. Final merge.
    # (node_weight - 1.0) turns the weight into a relative bonus (e.g. 1.2 -> +0.2).
    # All bonuses are summed and then applied multiplicatively to base_val.
    type_impact = node_weight - 1.0
    total_boost = 1.0 + type_impact + edge_impact_final + cent_impact_final

    total = base_val * total_boost

    # Ensure the score is never zero or negative (floor).
    final_score = max(0.0001, float(total))

    # WP-24c v4.5.0-DEBUG: retrieval tracer — log the score computation.
    chunk_id = payload.get("chunk_id", payload.get("id", "unknown"))
    logger.debug(f"📈 [SCORE-TRACE] Chunk: {chunk_id} | Base: {base_val:.4f} | Multiplier: {total_boost:.2f} | Final: {final_score:.4f}")
    logger.debug(f"   -> Details: StatusMult={status_mult:.2f}, TypeImpact={type_impact:.2f}, EdgeImpact={edge_impact_final:.4f}, CentImpact={cent_impact_final:.4f}")

    return {
        "total": final_score,
        "edge_bonus": float(edge_bonus_raw),
        "cent_bonus": float(cent_bonus_raw),
        "status_multiplier": status_mult,
        "graph_boost_factor": graph_boost_factor,
        "type_impact": type_impact,
        "base_val": base_val,
        "edge_impact_final": edge_impact_final,
        "cent_impact_final": cent_impact_final
    }
|
||||
|
|
@ -1,22 +0,0 @@
|
|||
from __future__ import annotations
|
||||
import json
|
||||
import os
|
||||
from functools import lru_cache
|
||||
from jsonschema import Draft202012Validator, RefResolver
|
||||
|
||||
SCHEMAS_DIR = os.getenv("SCHEMAS_DIR", os.path.join(os.path.dirname(os.path.dirname(__file__)), "..", "schemas"))
|
||||
|
||||
@lru_cache(maxsize=16)
def load_schema(name: str) -> dict:
    """
    Load and cache a JSON schema from SCHEMAS_DIR by file name
    (e.g. 'note.schema.json', 'chunk.schema.json', 'edge.schema.json').

    Raises:
        FileNotFoundError: when no such schema file exists.
    """
    schema_path = os.path.join(SCHEMAS_DIR, name)
    if not os.path.isfile(schema_path):
        raise FileNotFoundError(f"Schema not found: {schema_path}")
    with open(schema_path, "r", encoding="utf-8") as handle:
        return json.load(handle)
|
||||
|
||||
@lru_cache(maxsize=16)
def get_validator(name: str) -> Draft202012Validator:
    # Build (and cache) a Draft 2020-12 validator for the named schema,
    # resolving intra-schema $refs relative to the schema itself.
    schema = load_schema(name)
    # NOTE(review): RefResolver is deprecated in recent jsonschema releases in
    # favour of the 'referencing' library — revisit when upgrading jsonschema.
    resolver = RefResolver.from_schema(schema)
    return Draft202012Validator(schema, resolver=resolver)
|
||||
103
app/core/type_registry.py
Normal file
103
app/core/type_registry.py
Normal file
|
|
@ -0,0 +1,103 @@
|
|||
"""
|
||||
FILE: app/core/type_registry.py
|
||||
DESCRIPTION: Loader für types.yaml.
|
||||
WP-24c: Robustheits-Fix für chunking_profile vs chunk_profile.
|
||||
WP-14: Support für zentrale Registry-Strukturen.
|
||||
VERSION: 1.1.0 (Audit-Fix: Profile Key Consistency)
|
||||
STATUS: Active (Support für Legacy-Loader)
|
||||
DEPENDENCIES: yaml, os, functools
|
||||
EXTERNAL_CONFIG: config/types.yaml
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from functools import lru_cache
|
||||
from typing import Dict, Any, Optional, Tuple
|
||||
import os
|
||||
|
||||
try:
|
||||
import yaml # PyYAML
|
||||
except Exception:
|
||||
yaml = None # wird erst benötigt, wenn eine Datei gelesen werden soll
|
||||
|
||||
# Konservativer Default – WP-24c: Nutzt nun konsistent 'chunking_profile'
|
||||
_DEFAULT_REGISTRY: Dict[str, Any] = {
|
||||
"version": "1.0",
|
||||
"types": {
|
||||
"concept": {
|
||||
"chunking_profile": "medium",
|
||||
"edge_defaults": ["references", "related_to"],
|
||||
"retriever_weight": 1.0,
|
||||
}
|
||||
},
|
||||
"_using_defaults": True,
|
||||
"_warning": "types.yaml missing or invalid – using built-in defaults (type=concept).",
|
||||
}
|
||||
|
||||
# Chunk-Profile → Overlap-Empfehlungen (nur für synthetische Fensterbildung)
|
||||
_PROFILE_TO_OVERLAP: Dict[str, Tuple[int, int]] = {
|
||||
"short": (20, 30),
|
||||
"medium": (40, 60),
|
||||
"long": (60, 80),
|
||||
}
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
def load_type_registry(path: str = "config/types.yaml") -> Dict[str, Any]:
    """
    Load the type registry from 'path'. Any failure (missing file, missing
    PyYAML, malformed content) yields a conservative built-in default.
    The return value is cached process-wide.
    """
    # Bail out early when there is nothing usable to read.
    if not path or yaml is None or not os.path.isfile(path):
        return dict(_DEFAULT_REGISTRY)

    try:
        with open(path, "r", encoding="utf-8") as handle:
            loaded = yaml.safe_load(handle) or {}
            # Minimal structural validation: a dict containing a 'types' mapping.
            if not (isinstance(loaded, dict) and isinstance(loaded.get("types"), dict)):
                return dict(_DEFAULT_REGISTRY)
            loaded.setdefault("version", "1.0")
            loaded.setdefault("_using_defaults", False)
            return loaded
    except Exception:
        return dict(_DEFAULT_REGISTRY)
|
||||
|
||||
|
||||
def get_type_config(note_type: Optional[str], reg: Dict[str, Any]) -> Dict[str, Any]:
    """Look up the configuration for a note type, falling back to 'concept'."""
    key = (note_type or "concept").strip().lower()
    known = (reg or {}).get("types", {}) if isinstance(reg, dict) else {}
    cfg = known.get(key) or known.get("concept")
    return cfg or _DEFAULT_REGISTRY["types"]["concept"]
|
||||
|
||||
|
||||
def resolve_note_type(fm_type: Optional[str], reg: Dict[str, Any]) -> str:
    """Normalize a frontmatter type; unknown types collapse to 'concept'."""
    candidate = (fm_type or "concept").strip().lower()
    registry_types = (reg or {}).get("types", {}) if isinstance(reg, dict) else {}
    return candidate if candidate in registry_types else "concept"
|
||||
|
||||
|
||||
def effective_chunk_profile(note_type: Optional[str], reg: Dict[str, Any]) -> Optional[str]:
    """
    Resolve the active chunking profile for a note type, or None when unset.
    Audit fix: accepts 'chunking_profile' (standard) with 'chunk_profile'
    (legacy) as fallback, for full backwards compatibility.
    """
    cfg = get_type_config(note_type, reg)
    raw = cfg.get("chunking_profile") or cfg.get("chunk_profile")
    return raw.strip().lower() if isinstance(raw, str) and raw.strip() else None
|
||||
|
||||
|
||||
def profile_overlap(profile: Optional[str]) -> Tuple[int, int]:
    """Return the recommended (low, high) overlap for a chunking profile."""
    fallback = _PROFILE_TO_OVERLAP["medium"]
    if not profile:
        return fallback
    return _PROFILE_TO_OVERLAP.get(profile.strip().lower(), fallback)
|
||||
|
|
@ -1,16 +0,0 @@
|
|||
from __future__ import annotations
|
||||
from typing import Dict
|
||||
from jsonschema import ValidationError
|
||||
from .schema_loader import get_validator
|
||||
|
||||
NOTE_SCHEMA_NAME = "note.schema.json"
|
||||
|
||||
def validate_note_payload(payload: Dict) -> None:
    """
    Validate a note payload against note.schema.json.

    Raises:
        ValidationError: single aggregated error whose message lists every
            failing location as '<path>: <message>' joined by ' | '.
    """
    validator = get_validator(NOTE_SCHEMA_NAME)
    # FIX: e.path is a deque mixing str keys and int array indices. Using it
    # directly as a sort key raises TypeError as soon as two errors with
    # incomparable path elements exist. Sort on the stringified elements.
    errors = sorted(validator.iter_errors(payload), key=lambda e: [str(x) for x in e.path])
    if errors:
        msgs = []
        for e in errors:
            loc = ".".join([str(x) for x in e.path]) or "<root>"
            msgs.append(f"{loc}: {e.message}")
        raise ValidationError(" | ".join(msgs))
|
||||
|
|
@ -1,40 +0,0 @@
|
|||
"""
|
||||
Version 1
|
||||
"""
|
||||
from __future__ import annotations
|
||||
from fastapi import FastAPI, HTTPException
|
||||
from pydantic import BaseModel
|
||||
from typing import List, Optional
|
||||
from sentence_transformers import SentenceTransformer
|
||||
|
||||
app = FastAPI(title="mindnet-embed", version="1.0")
|
||||
|
||||
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2" # 384-dim
|
||||
_model: SentenceTransformer | None = None
|
||||
|
||||
class EmbedIn(BaseModel):
|
||||
model: Optional[str] = None
|
||||
inputs: List[str]
|
||||
|
||||
class EmbedOut(BaseModel):
|
||||
embeddings: List[List[float]]
|
||||
|
||||
@app.on_event("startup")
|
||||
def _load_model():
|
||||
global _model
|
||||
_model = SentenceTransformer(MODEL_NAME)
|
||||
|
||||
@app.get("/health")
|
||||
def health():
|
||||
return {"ok": True, "model": MODEL_NAME, "dim": 384}
|
||||
|
||||
@app.post("/embed", response_model=EmbedOut)
|
||||
def embed(payload: EmbedIn) -> EmbedOut:
|
||||
if _model is None:
|
||||
raise HTTPException(status_code=503, detail="Model not loaded")
|
||||
if not payload.inputs:
|
||||
return EmbedOut(embeddings=[])
|
||||
vecs = _model.encode(payload.inputs, normalize_embeddings=False).tolist()
|
||||
if any(len(v) != 384 for v in vecs):
|
||||
raise HTTPException(status_code=500, detail="Embedding size mismatch (expected 384)")
|
||||
return EmbedOut(embeddings=vecs)
|
||||
|
|
@ -1,6 +1,10 @@
|
|||
"""
|
||||
Version 0.1
|
||||
|
||||
FILE: app/embeddings.py
|
||||
DESCRIPTION: Lokaler Wrapper für SentenceTransformer Embeddings.
|
||||
VERSION: 0.1.0
|
||||
STATUS: Active (Bestätigung durch Aufrufer erforderlich)
|
||||
DEPENDENCIES: app.config, sentence_transformers
|
||||
LAST_ANALYSIS: 2025-12-15
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
|
|
|||
59
app/frontend/ui.py
Normal file
59
app/frontend/ui.py
Normal file
|
|
@ -0,0 +1,59 @@
|
|||
"""
|
||||
FILE: app/frontend/ui.py
|
||||
DESCRIPTION: Main Entrypoint für Streamlit. Router, der basierend auf Sidebar-Auswahl die Module (Chat, Editor, Graph) lädt.
|
||||
VERSION: 2.6.0
|
||||
STATUS: Active
|
||||
DEPENDENCIES: streamlit, ui_config, ui_sidebar, ui_chat, ui_editor, ui_graph_service, ui_graph*, ui_graph_cytoscape
|
||||
LAST_ANALYSIS: 2025-12-15
|
||||
"""
|
||||
|
||||
import streamlit as st
|
||||
import uuid
|
||||
|
||||
# --- CONFIG & STYLING ---
|
||||
st.set_page_config(page_title="mindnet v2.6", page_icon="🧠", layout="wide")
|
||||
st.markdown("""
|
||||
<style>
|
||||
.block-container { padding-top: 2rem; max_width: 1200px; margin: auto; }
|
||||
.intent-badge { background-color: #e8f0fe; color: #1a73e8; padding: 4px 10px; border-radius: 12px; font-size: 0.8rem; font-weight: 600; border: 1px solid #d2e3fc; display: inline-block; margin-bottom: 0.5rem; }
|
||||
.draft-box { border: 1px solid #d0d7de; border-radius: 6px; padding: 16px; background-color: #f6f8fa; margin: 10px 0; }
|
||||
.preview-box { border: 1px solid #e0e0e0; border-radius: 6px; padding: 24px; background-color: white; }
|
||||
</style>
|
||||
""", unsafe_allow_html=True)
|
||||
|
||||
# --- MODULE IMPORTS ---
|
||||
try:
|
||||
from ui_config import QDRANT_URL, QDRANT_KEY, COLLECTION_PREFIX
|
||||
from ui_graph_service import GraphExplorerService
|
||||
|
||||
# Komponenten
|
||||
from ui_sidebar import render_sidebar
|
||||
from ui_chat import render_chat_interface
|
||||
from ui_editor import render_manual_editor
|
||||
|
||||
# Die beiden Graph-Engines
|
||||
from ui_graph import render_graph_explorer as render_graph_agraph
|
||||
from ui_graph_cytoscape import render_graph_explorer_cytoscape # <-- Import
|
||||
|
||||
except ImportError as e:
|
||||
st.error(f"Import Error: {e}. Bitte stelle sicher, dass alle UI-Dateien im Ordner liegen und 'streamlit-cytoscapejs' installiert ist.")
|
||||
st.stop()
|
||||
|
||||
# --- SESSION STATE ---
|
||||
if "messages" not in st.session_state: st.session_state.messages = []
|
||||
if "user_id" not in st.session_state: st.session_state.user_id = str(uuid.uuid4())
|
||||
|
||||
# --- SERVICE INIT ---
|
||||
graph_service = GraphExplorerService(QDRANT_URL, QDRANT_KEY, COLLECTION_PREFIX)
|
||||
|
||||
# --- MAIN ROUTING ---
|
||||
mode, top_k, explain = render_sidebar()
|
||||
|
||||
if mode == "💬 Chat":
|
||||
render_chat_interface(top_k, explain)
|
||||
elif mode == "📝 Manueller Editor":
|
||||
render_manual_editor()
|
||||
elif mode == "🕸️ Graph (Agraph)":
|
||||
render_graph_agraph(graph_service)
|
||||
elif mode == "🕸️ Graph (Cytoscape)":
|
||||
render_graph_explorer_cytoscape(graph_service)
|
||||
46
app/frontend/ui_api.py
Normal file
46
app/frontend/ui_api.py
Normal file
|
|
@ -0,0 +1,46 @@
|
|||
"""
|
||||
FILE: app/frontend/ui_api.py
|
||||
DESCRIPTION: Wrapper für Backend-Calls (Chat, Ingest, Feedback). Kapselt requests und Error-Handling.
|
||||
VERSION: 2.6.0
|
||||
STATUS: Active
|
||||
DEPENDENCIES: requests, streamlit, ui_config
|
||||
LAST_ANALYSIS: 2025-12-15
|
||||
"""
|
||||
|
||||
import requests
|
||||
import streamlit as st
|
||||
from ui_config import CHAT_ENDPOINT, INGEST_ANALYZE_ENDPOINT, INGEST_SAVE_ENDPOINT, FEEDBACK_ENDPOINT, API_TIMEOUT
|
||||
|
||||
def send_chat_message(message: str, top_k: int, explain: bool):
    """POST a user message to the chat endpoint; returns parsed JSON or {'error': ...}."""
    request_body = {"message": message, "top_k": top_k, "explain": explain}
    try:
        resp = requests.post(CHAT_ENDPOINT, json=request_body, timeout=API_TIMEOUT)
        resp.raise_for_status()
        return resp.json()
    except Exception as exc:
        return {"error": str(exc)}
|
||||
|
||||
def analyze_draft_text(text: str, n_type: str, timeout: float = 15):
    """
    Ask the backend to analyze a draft for linking suggestions.

    Args:
        text: Draft body to analyze.
        n_type: Note type hint sent with the text.
        timeout: Request timeout in seconds. Kept at the historical
            hard-coded 15s default for backward compatibility; callers
            may pass API_TIMEOUT for parity with the other endpoints.

    Returns:
        Parsed JSON response, or {'error': <message>} on any failure.
    """
    try:
        response = requests.post(INGEST_ANALYZE_ENDPOINT, json={"text": text, "type": n_type}, timeout=timeout)
        response.raise_for_status()
        return response.json()
    except Exception as e:
        return {"error": str(e)}
|
||||
|
||||
def save_draft_to_vault(markdown_content: str, filename: str = None):
    """Persist a markdown draft via the ingest/save endpoint; returns JSON or {'error': ...}."""
    body = {"markdown_content": markdown_content, "filename": filename}
    try:
        resp = requests.post(INGEST_SAVE_ENDPOINT, json=body, timeout=API_TIMEOUT)
        resp.raise_for_status()
        return resp.json()
    except Exception as exc:
        return {"error": str(exc)}
|
||||
|
||||
def submit_feedback(query_id, node_id, score, comment=None):
    """
    Fire-and-forget feedback call with a short (2s) timeout.
    Deliberately best-effort: the UI must never crash because the
    feedback endpoint is slow or down.
    """
    try:
        requests.post(FEEDBACK_ENDPOINT, json={"query_id": query_id, "node_id": node_id, "score": score, "comment": comment}, timeout=2)
        st.toast(f"Feedback ({score}) gesendet!")
    except Exception:
        # FIX: the previous bare 'except:' also swallowed SystemExit and
        # KeyboardInterrupt; narrow to Exception while keeping best-effort semantics.
        pass
|
||||
70
app/frontend/ui_callbacks.py
Normal file
70
app/frontend/ui_callbacks.py
Normal file
|
|
@ -0,0 +1,70 @@
|
|||
"""
|
||||
FILE: app/frontend/ui_callbacks.py
|
||||
DESCRIPTION: Event-Handler für UI-Interaktionen. Implementiert den Übergang vom Graphen zum Editor (State Transfer).
|
||||
VERSION: 2.6.0
|
||||
STATUS: Active
|
||||
DEPENDENCIES: streamlit, os, ui_utils
|
||||
LAST_ANALYSIS: 2025-12-15
|
||||
"""
|
||||
|
||||
import streamlit as st
|
||||
import os
|
||||
from ui_utils import build_markdown_doc
|
||||
|
||||
def switch_to_editor_callback(note_payload):
    """
    Callback for the 'Edit' button in the graph view.
    Tries to read the file directly from the vault (filesystem).
    This guarantees frontmatter and content are complete (single source of truth).
    """
    # 1. Determine the path (prefer 'path' from Qdrant)
    origin_fname = note_payload.get('path')

    # Fallback for legacy data fields
    if not origin_fname:
        origin_fname = note_payload.get('file_path') or note_payload.get('filename')

    content = ""
    file_loaded = False

    # 2. First attempt: read straight from disk.
    # If the path exists we read the current on-disk state of the file.
    if origin_fname and os.path.exists(origin_fname):
        try:
            with open(origin_fname, "r", encoding="utf-8") as f:
                content = f.read()
            file_loaded = True
        except Exception as e:
            # Log the error to the terminal but never crash the UI
            print(f"Fehler beim Lesen von {origin_fname}: {e}")

    # 3. Fallback: take the content from Qdrant (file not accessible)
    if not file_loaded:
        # Use 'fulltext' from the payload
        content = note_payload.get('fulltext', '')

        if not content:
            # Last resort: build dummy content from the metadata
            content = build_markdown_doc(note_payload, "Inhalt konnte nicht geladen werden (Datei nicht gefunden).")
        else:
            # Check: does the text have a frontmatter? If not, reconstruct it.
            if not content.strip().startswith("---"):
                content = build_markdown_doc(note_payload, content)

    # Emergency path construction (no path known anywhere in the system)
    if not origin_fname and 'note_id' in note_payload:
        origin_fname = f"{note_payload['note_id']}.md"

    # 4. Hand the data over to the editor.
    # The chat history acts as the transport vehicle for this state.
    st.session_state.messages.append({
        "role": "assistant",
        "intent": "INTERVIEW",
        "content": content,
        "query_id": f"edit_{note_payload.get('note_id', 'unknown')}",  # Trigger for the editor
        "origin_filename": origin_fname,
        "origin_note_id": note_payload.get('note_id')
    })

    # 5. Switch mode (changes the active tab on the next rerun)
    st.session_state["sidebar_mode_selection"] = "📝 Manueller Editor"
|
||||
87
app/frontend/ui_chat.py
Normal file
87
app/frontend/ui_chat.py
Normal file
|
|
@ -0,0 +1,87 @@
|
|||
"""
|
||||
FILE: app/frontend/ui_chat.py
|
||||
DESCRIPTION: Chat-UI. Rendert Nachrichtenverlauf, Quellen-Expanders mit Feedback-Buttons und delegiert bei Bedarf an den Editor.
|
||||
VERSION: 2.6.0
|
||||
STATUS: Active
|
||||
DEPENDENCIES: streamlit, ui_api, ui_editor
|
||||
LAST_ANALYSIS: 2025-12-15
|
||||
"""
|
||||
|
||||
import streamlit as st
|
||||
from ui_api import send_chat_message, submit_feedback
|
||||
from ui_editor import render_draft_editor
|
||||
|
||||
def render_chat_interface(top_k, explain):
    """
    Render the chat interface: message history, per-source feedback and the
    input field. Delegates INTERVIEW intents to the draft editor.

    Args:
        top_k: Number of sources requested from the backend.
        explain: Whether the backend should attach explanation payloads.
    """
    # 1. Render the conversation history
    for idx, msg in enumerate(st.session_state.messages):
        with st.chat_message(msg["role"]):
            if msg["role"] == "assistant":
                # Intent badge
                intent = msg.get("intent", "UNKNOWN")
                st.markdown(f'<div class="intent-badge">Intent: {intent}</div>', unsafe_allow_html=True)

                # Debugging aid (handy during development)
                with st.expander("🐞 Payload", expanded=False):
                    st.json(msg)

                # Either plain text or editor mode (interview)
                if intent == "INTERVIEW":
                    render_draft_editor(msg)
                else:
                    st.markdown(msg["content"])

                # Show the sources
                if "sources" in msg and msg["sources"]:
                    for hit in msg["sources"]:
                        score = hit.get('total_score', 0)
                        # Treat a None score as 0.0
                        if score is None: score = 0.0

                        with st.expander(f"📄 {hit.get('note_id', '?')} ({score:.2f})"):
                            st.markdown(f"_{hit.get('source', {}).get('text', '')[:300]}..._")

                            # Explanation layer
                            if hit.get('explanation'):
                                st.caption(f"Grund: {hit['explanation']['reasons'][0]['message']}")

                            # Feedback buttons per source.
                            # Default arguments bind qid/nid at definition time.
                            def _cb(qid=msg.get("query_id"), nid=hit.get('node_id')):
                                val = st.session_state.get(f"fb_src_{qid}_{nid}")
                                if val is not None: submit_feedback(qid, nid, val+1)

                            st.feedback("faces", key=f"fb_src_{msg.get('query_id')}_{hit.get('node_id')}", on_change=_cb)

                # Global feedback for the whole answer.
                # FIX: bind qid via a default argument. A plain closure captures
                # the loop variable, so every stars-callback created in this loop
                # would report the LAST message's query_id instead of its own
                # (same late-binding pitfall the per-source _cb already avoids).
                if "query_id" in msg:
                    qid = msg["query_id"]
                    st.feedback("stars", key=f"fb_glob_{qid}", on_change=lambda q=qid: submit_feedback(q, "generated_answer", st.session_state[f"fb_glob_{q}"]+1))
            else:
                # User message
                st.markdown(msg["content"])

    # 2. Input field
    if prompt := st.chat_input("Frage Mindnet..."):
        st.session_state.messages.append({"role": "user", "content": prompt})
        st.rerun()

    # 3. Generate an answer (when the last message came from the user)
    if len(st.session_state.messages) > 0 and st.session_state.messages[-1]["role"] == "user":
        with st.chat_message("assistant"):
            with st.spinner("Thinking..."):
                resp = send_chat_message(st.session_state.messages[-1]["content"], top_k, explain)

                if "error" in resp:
                    st.error(resp["error"])
                else:
                    st.session_state.messages.append({
                        "role": "assistant",
                        "content": resp.get("answer"),
                        "intent": resp.get("intent", "FACT"),
                        "sources": resp.get("sources", []),
                        "query_id": resp.get("query_id")
                    })
                    st.rerun()
|
||||
88
app/frontend/ui_config.py
Normal file
88
app/frontend/ui_config.py
Normal file
|
|
@ -0,0 +1,88 @@
|
|||
"""
|
||||
FILE: app/frontend/ui_config.py
|
||||
DESCRIPTION: Zentrale Konfiguration für das Frontend. Definiert API-Endpoints, Timeouts und Graph-Styles (Farben).
|
||||
VERSION: 2.6.0
|
||||
STATUS: Active
|
||||
DEPENDENCIES: os, hashlib, dotenv, pathlib
|
||||
LAST_ANALYSIS: 2025-12-15
|
||||
"""
|
||||
|
||||
import os
|
||||
import hashlib
|
||||
from dotenv import load_dotenv
|
||||
from pathlib import Path
|
||||
|
||||
load_dotenv()
|
||||
|
||||
# --- API & PORTS ---
|
||||
API_BASE_URL = os.getenv("MINDNET_API_URL", "http://localhost:8002")
|
||||
CHAT_ENDPOINT = f"{API_BASE_URL}/chat"
|
||||
FEEDBACK_ENDPOINT = f"{API_BASE_URL}/feedback"
|
||||
INGEST_ANALYZE_ENDPOINT = f"{API_BASE_URL}/ingest/analyze"
|
||||
INGEST_SAVE_ENDPOINT = f"{API_BASE_URL}/ingest/save"
|
||||
|
||||
# --- QDRANT ---
|
||||
QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
|
||||
QDRANT_KEY = os.getenv("QDRANT_API_KEY", None)
|
||||
if QDRANT_KEY == "": QDRANT_KEY = None
|
||||
COLLECTION_PREFIX = os.getenv("COLLECTION_PREFIX", "mindnet")
|
||||
|
||||
# --- FILES & TIMEOUTS ---
|
||||
HISTORY_FILE = Path("data/logs/search_history.jsonl")
|
||||
timeout_setting = os.getenv("MINDNET_API_TIMEOUT") or os.getenv("MINDNET_LLM_TIMEOUT")
|
||||
API_TIMEOUT = float(timeout_setting) if timeout_setting else 300.0
|
||||
|
||||
# --- STYLING CONSTANTS ---
|
||||
|
||||
# Basierend auf types.yaml
|
||||
GRAPH_COLORS = {
|
||||
# Kerntypen
|
||||
"experience": "#feca57", # Gelb/Orange
|
||||
"project": "#ff9f43", # Dunkleres Orange
|
||||
"decision": "#5f27cd", # Lila
|
||||
|
||||
# Persönlichkeit
|
||||
"value": "#00d2d3", # Cyan
|
||||
"principle": "#0abde3", # Dunkles Cyan
|
||||
"belief": "#48dbfb", # Helles Blau
|
||||
"profile": "#1dd1a1", # Grün
|
||||
|
||||
# Strategie & Risiko
|
||||
"goal": "#ff9ff3", # Pink
|
||||
"risk": "#ff6b6b", # Rot
|
||||
|
||||
# Basis
|
||||
"concept": "#54a0ff", # Blau
|
||||
"task": "#8395a7", # Grau-Blau
|
||||
"journal": "#c8d6e5", # Hellgrau
|
||||
"source": "#576574", # Dunkelgrau
|
||||
"glossary": "#222f3e", # Sehr dunkel
|
||||
|
||||
"default": "#8395a7" # Fallback
|
||||
}
|
||||
|
||||
# System-Kanten, die wir NICHT im Graphen sehen wollen, um Rauschen zu reduzieren
|
||||
SYSTEM_EDGES = ["prev", "next", "belongs_to"]
|
||||
|
||||
def get_edge_color(kind: str) -> str:
    """Return a deterministic display color for an edge kind."""
    if not kind: return "#bdc3c7"

    # Fixed palette for the important semantic edge types.
    fixed_colors = {
        "depends_on": "#ff6b6b",   # red (blocker/dependency)
        "blocks": "#ee5253",       # dark red
        "caused_by": "#ff9ff3",    # pink
        "related_to": "#c8d6e5",   # light grey (background)
        "references": "#bdc3c7",   # grey
        "derived_from": "#1dd1a1"  # green
    }
    try:
        return fixed_colors[kind]
    except KeyError:
        # Dynamic kinds get a stable hash-derived hue; moderate saturation
        # keeps the result from being too garish.
        hue = int(hashlib.md5(kind.encode()).hexdigest(), 16) % 360
        return f"hsl({hue}, 60%, 50%)"
|
||||
223
app/frontend/ui_editor.py
Normal file
223
app/frontend/ui_editor.py
Normal file
|
|
@ -0,0 +1,223 @@
|
|||
"""
|
||||
FILE: app/frontend/ui_editor.py
|
||||
DESCRIPTION: Markdown-Editor mit Live-Vorschau.
|
||||
Refactored für WP-14: Asynchrones Feedback-Handling (Queued State).
|
||||
VERSION: 2.7.0 (Fix: Async Save UI)
|
||||
STATUS: Active
|
||||
DEPENDENCIES: streamlit, uuid, re, datetime, ui_utils, ui_api
|
||||
"""
|
||||
import streamlit as st
|
||||
import uuid
|
||||
import re
|
||||
from datetime import datetime
|
||||
|
||||
from ui_utils import parse_markdown_draft, build_markdown_doc, slugify
|
||||
from ui_api import save_draft_to_vault, analyze_draft_text
|
||||
|
||||
def render_draft_editor(msg):
|
||||
"""
|
||||
Rendert den Markdown-Editor.
|
||||
Nutzt 'origin_filename' aus der Message, um zwischen Update und Neu zu unterscheiden.
|
||||
"""
|
||||
if "query_id" not in msg or not msg["query_id"]:
|
||||
msg["query_id"] = str(uuid.uuid4())
|
||||
|
||||
qid = msg["query_id"]
|
||||
key_base = f"draft_{qid}"
|
||||
|
||||
# State Keys
|
||||
data_meta_key = f"{key_base}_data_meta"
|
||||
data_sugg_key = f"{key_base}_data_suggestions"
|
||||
widget_body_key = f"{key_base}_widget_body"
|
||||
data_body_key = f"{key_base}_data_body"
|
||||
|
||||
# --- INIT STATE ---
|
||||
if f"{key_base}_init" not in st.session_state:
|
||||
meta, body = parse_markdown_draft(msg["content"])
|
||||
if "type" not in meta: meta["type"] = "default"
|
||||
if "title" not in meta: meta["title"] = ""
|
||||
tags = meta.get("tags", [])
|
||||
meta["tags_str"] = ", ".join(tags) if isinstance(tags, list) else str(tags)
|
||||
|
||||
st.session_state[data_meta_key] = meta
|
||||
st.session_state[data_sugg_key] = []
|
||||
st.session_state[data_body_key] = body.strip()
|
||||
|
||||
st.session_state[f"{key_base}_wdg_title"] = meta["title"]
|
||||
st.session_state[f"{key_base}_wdg_type"] = meta["type"]
|
||||
st.session_state[f"{key_base}_wdg_tags"] = meta["tags_str"]
|
||||
|
||||
# Pfad übernehmen (Source of Truth)
|
||||
st.session_state[f"{key_base}_origin_filename"] = msg.get("origin_filename")
|
||||
st.session_state[f"{key_base}_init"] = True
|
||||
|
||||
# --- RESURRECTION ---
|
||||
if widget_body_key not in st.session_state and data_body_key in st.session_state:
|
||||
st.session_state[widget_body_key] = st.session_state[data_body_key]
|
||||
|
||||
# --- SYNC HELPER ---
|
||||
def _sync_meta():
|
||||
meta = st.session_state[data_meta_key]
|
||||
meta["title"] = st.session_state.get(f"{key_base}_wdg_title", "")
|
||||
meta["type"] = st.session_state.get(f"{key_base}_wdg_type", "default")
|
||||
meta["tags_str"] = st.session_state.get(f"{key_base}_wdg_tags", "")
|
||||
st.session_state[data_meta_key] = meta
|
||||
|
||||
def _sync_body():
|
||||
st.session_state[data_body_key] = st.session_state[widget_body_key]
|
||||
|
||||
def _insert_text(t):
|
||||
st.session_state[widget_body_key] = f"{st.session_state.get(widget_body_key, '')}\n\n{t}"
|
||||
st.session_state[data_body_key] = st.session_state[widget_body_key]
|
||||
|
||||
def _remove_text(t):
    # Remove every occurrence of the snippet from the body, trim the
    # remainder, and re-sync the persistent copy with the widget state.
    st.session_state[widget_body_key] = st.session_state.get(widget_body_key, '').replace(t, "").strip()
    st.session_state[data_body_key] = st.session_state[widget_body_key]
|
||||
|
||||
# --- UI LAYOUT ---
|
||||
|
||||
origin_fname = st.session_state.get(f"{key_base}_origin_filename")
|
||||
|
||||
if origin_fname:
|
||||
display_name = str(origin_fname).split("/")[-1]
|
||||
st.success(f"📂 **Update-Modus**: `{display_name}`")
|
||||
with st.expander("Dateipfad Details", expanded=False):
|
||||
st.code(origin_fname)
|
||||
st.markdown(f'<div class="draft-box" style="border-left: 5px solid #ff9f43;">', unsafe_allow_html=True)
|
||||
else:
|
||||
st.info("✨ **Erstell-Modus**: Neue Datei wird angelegt.")
|
||||
st.markdown(f'<div class="draft-box">', unsafe_allow_html=True)
|
||||
|
||||
st.markdown("### Editor")
|
||||
|
||||
# Meta Felder
|
||||
meta_ref = st.session_state[data_meta_key]
|
||||
c1, c2 = st.columns([2, 1])
|
||||
with c1:
|
||||
st.text_input("Titel", key=f"{key_base}_wdg_title", on_change=_sync_meta)
|
||||
with c2:
|
||||
known_types = ["concept", "project", "decision", "experience", "journal", "value", "goal", "principle", "risk", "belief"]
|
||||
curr_type = st.session_state.get(f"{key_base}_wdg_type", meta_ref["type"])
|
||||
if curr_type not in known_types: known_types.append(curr_type)
|
||||
st.selectbox("Typ", known_types, key=f"{key_base}_wdg_type", on_change=_sync_meta)
|
||||
|
||||
st.text_input("Tags", key=f"{key_base}_wdg_tags", on_change=_sync_meta)
|
||||
|
||||
# Tabs
|
||||
tab_edit, tab_intel, tab_view = st.tabs(["✏️ Inhalt", "🧠 Intelligence", "👁️ Vorschau"])
|
||||
|
||||
with tab_edit:
|
||||
st.text_area("Body", key=widget_body_key, height=600, on_change=_sync_body, label_visibility="collapsed")
|
||||
|
||||
with tab_intel:
|
||||
st.info("Analysiert den Text auf Verknüpfungsmöglichkeiten.")
|
||||
if st.button("🔍 Analyse starten", key=f"{key_base}_analyze"):
|
||||
st.session_state[data_sugg_key] = []
|
||||
text_to_analyze = st.session_state.get(widget_body_key, st.session_state.get(data_body_key, ""))
|
||||
with st.spinner("Analysiere..."):
|
||||
analysis = analyze_draft_text(text_to_analyze, st.session_state.get(f"{key_base}_wdg_type", "concept"))
|
||||
if "error" in analysis:
|
||||
st.error(f"Fehler: {analysis['error']}")
|
||||
else:
|
||||
suggestions = analysis.get("suggestions", [])
|
||||
st.session_state[data_sugg_key] = suggestions
|
||||
if not suggestions: st.warning("Keine Vorschläge.")
|
||||
else: st.success(f"{len(suggestions)} Vorschläge gefunden.")
|
||||
|
||||
suggestions = st.session_state[data_sugg_key]
|
||||
if suggestions:
|
||||
current_text = st.session_state.get(widget_body_key, "")
|
||||
for idx, sugg in enumerate(suggestions):
|
||||
link_text = sugg.get('suggested_markdown', '')
|
||||
is_inserted = link_text in current_text
|
||||
bg_color = "#e6fffa" if is_inserted else "#ffffff"
|
||||
border = "3px solid #28a745" if is_inserted else "3px solid #1a73e8"
|
||||
st.markdown(f"<div style='border-left: {border}; background-color: {bg_color}; padding: 10px; margin-bottom: 8px;'><b>{sugg.get('target_title')}</b> <small>({sugg.get('type')})</small><br><i>{sugg.get('reason')}</i><br><code>{link_text}</code></div>", unsafe_allow_html=True)
|
||||
if is_inserted:
|
||||
st.button("❌ Entfernen", key=f"del_{idx}_{key_base}", on_click=_remove_text, args=(link_text,))
|
||||
else:
|
||||
st.button("➕ Einfügen", key=f"add_{idx}_{key_base}", on_click=_insert_text, args=(link_text,))
|
||||
|
||||
# Save Logic Preparation
|
||||
final_tags = [t.strip() for t in st.session_state.get(f"{key_base}_wdg_tags", "").split(",") if t.strip()]
|
||||
final_meta = {
|
||||
"id": "generated_on_save",
|
||||
"type": st.session_state.get(f"{key_base}_wdg_type", "default"),
|
||||
"title": st.session_state.get(f"{key_base}_wdg_title", "").strip(),
|
||||
"status": "draft",
|
||||
"tags": final_tags
|
||||
}
|
||||
if "origin_note_id" in msg:
|
||||
final_meta["id"] = msg["origin_note_id"]
|
||||
|
||||
final_body = st.session_state.get(widget_body_key, st.session_state[data_body_key])
|
||||
if not final_meta["title"]:
|
||||
h1_match = re.search(r"^#\s+(.*)$", final_body, re.MULTILINE)
|
||||
if h1_match: final_meta["title"] = h1_match.group(1).strip()
|
||||
|
||||
final_doc = build_markdown_doc(final_meta, final_body)
|
||||
|
||||
with tab_view:
|
||||
st.markdown('<div class="preview-box">', unsafe_allow_html=True)
|
||||
st.markdown(final_doc)
|
||||
st.markdown('</div>', unsafe_allow_html=True)
|
||||
|
||||
st.markdown("---")
|
||||
|
||||
# Save Actions
|
||||
b1, b2 = st.columns([1, 1])
|
||||
with b1:
|
||||
save_label = "💾 Update speichern" if origin_fname else "💾 Neu anlegen & Indizieren"
|
||||
|
||||
if st.button(save_label, type="primary", key=f"{key_base}_save"):
|
||||
with st.spinner("Sende an Backend..."):
|
||||
if origin_fname:
|
||||
target_file = origin_fname
|
||||
else:
|
||||
raw_title = final_meta.get("title", "draft")
|
||||
target_file = f"{datetime.now().strftime('%Y%m%d')}-{slugify(raw_title)[:60]}.md"
|
||||
|
||||
result = save_draft_to_vault(final_doc, filename=target_file)
|
||||
|
||||
# --- WP-14 CHANGE START: Handling Async Response ---
|
||||
if "error" in result:
|
||||
st.error(f"Fehler: {result['error']}")
|
||||
else:
|
||||
status = result.get("status", "success")
|
||||
file_path = result.get("file_path", "unbekannt")
|
||||
|
||||
if status == "queued":
|
||||
# Neuer Status für Async Processing
|
||||
st.info(f"✅ **Eingereiht:** Datei `{file_path}` wurde gespeichert.")
|
||||
st.caption("Die KI-Analyse und Indizierung läuft im Hintergrund. Du kannst weiterarbeiten.")
|
||||
else:
|
||||
# Legacy / Synchroner Fall
|
||||
st.success(f"Gespeichert: {file_path}")
|
||||
|
||||
st.balloons()
|
||||
# --- WP-14 CHANGE END ---
|
||||
|
||||
with b2:
|
||||
if st.button("📋 Code anzeigen", key=f"{key_base}_btn_copy"):
|
||||
st.code(final_doc, language="markdown")
|
||||
|
||||
st.markdown("</div>", unsafe_allow_html=True)
|
||||
|
||||
def render_manual_editor():
    """
    Render the manual draft editor.

    Reuses the most recent chat message when it is an edit request
    (its query_id starts with "edit_"); otherwise a blank draft
    template is fed into the draft editor.
    """
    msg_to_edit = None

    messages = st.session_state.messages
    if messages:
        candidate = messages[-1]
        if str(candidate.get("query_id", "")).startswith("edit_"):
            msg_to_edit = candidate

    if msg_to_edit is None:
        # Fresh template with minimal frontmatter and an H1 placeholder.
        msg_to_edit = {
            "content": "---\ntype: concept\ntitle: Neue Notiz\nstatus: draft\ntags: []\n---\n# Titel\n",
            "query_id": f"manual_{uuid.uuid4()}",
        }

    render_draft_editor(msg_to_edit)
|
||||
162
app/frontend/ui_graph.py
Normal file
162
app/frontend/ui_graph.py
Normal file
|
|
@ -0,0 +1,162 @@
|
|||
"""
|
||||
FILE: app/frontend/ui_graph.py
|
||||
DESCRIPTION: Legacy Graph-Explorer (Streamlit-Agraph). Implementiert Physik-Simulation (BarnesHut) und direkten Editor-Sprung.
|
||||
VERSION: 2.6.0
|
||||
STATUS: Maintenance (Active Fallback)
|
||||
DEPENDENCIES: streamlit, streamlit_agraph, qdrant_client, ui_config, ui_callbacks
|
||||
LAST_ANALYSIS: 2025-12-15
|
||||
"""
|
||||
|
||||
import streamlit as st
|
||||
from streamlit_agraph import agraph, Config
|
||||
from qdrant_client import models
|
||||
from ui_config import COLLECTION_PREFIX, GRAPH_COLORS
|
||||
from ui_callbacks import switch_to_editor_callback
|
||||
|
||||
def render_graph_explorer(graph_service):
    """
    Legacy agraph-based graph explorer.

    Renders a two-column layout: controls (search, depth, physics) on
    the left and a BarnesHut-simulated interactive graph on the right.
    Clicking a non-center node re-centers the graph on it.

    Args:
        graph_service: data layer exposing `client` (Qdrant),
            `get_ego_graph()` and `get_note_with_full_content()`.
    """
    st.header("🕸️ Graph Explorer")

    # Initialize session state.
    if "graph_center_id" not in st.session_state:
        st.session_state.graph_center_id = None

    # View & physics defaults; higher values spread the layout out more.
    st.session_state.setdefault("graph_depth", 2)
    st.session_state.setdefault("graph_show_labels", True)
    st.session_state.setdefault("graph_spacing", 250)
    st.session_state.setdefault("graph_gravity", -4000)

    col_ctrl, col_graph = st.columns([1, 4])

    # --- LEFT COLUMN: CONTROLS ---
    with col_ctrl:
        st.subheader("Fokus")

        search_term = st.text_input("Suche Notiz", placeholder="Titel eingeben...")

        # Title search against Qdrant.
        if search_term:
            hits, _ = graph_service.client.scroll(
                collection_name=f"{COLLECTION_PREFIX}_notes",
                scroll_filter=models.Filter(must=[models.FieldCondition(key="title", match=models.MatchText(text=search_term))]),
                limit=10
            )
            # FIX: guard against payloads missing 'title'/'note_id'
            # (previously a KeyError) and don't silently overwrite
            # duplicate titles — consistent with the Cytoscape view.
            options = {}
            for h in hits:
                payload = h.payload or {}
                title = payload.get("title")
                note_id = payload.get("note_id")
                if title and note_id and title not in options:
                    options[title] = note_id

            if options:
                selected_title = st.selectbox("Ergebnisse:", list(options.keys()))
                if st.button("Laden", use_container_width=True):
                    st.session_state.graph_center_id = options[selected_title]
                    st.rerun()

        st.divider()

        # Layout & physics settings.
        with st.expander("👁️ Ansicht & Layout", expanded=True):
            st.session_state.graph_depth = st.slider("Tiefe (Tier)", 1, 3, st.session_state.graph_depth)
            st.session_state.graph_show_labels = st.checkbox("Kanten-Beschriftung", st.session_state.graph_show_labels)

            st.markdown("**Physik (BarnesHut)**")
            st.session_state.graph_spacing = st.slider("Federlänge (Abstand)", 50, 800, st.session_state.graph_spacing)
            st.session_state.graph_gravity = st.slider("Abstoßung (Gravity)", -20000, -500, st.session_state.graph_gravity)

            if st.button("Reset Layout"):
                st.session_state.graph_spacing = 250
                st.session_state.graph_gravity = -4000
                st.rerun()

        st.divider()
        st.caption("Legende (Top Typen)")
        for k, v in list(GRAPH_COLORS.items())[:8]:
            st.markdown(f"<span style='color:{v}'>●</span> {k}", unsafe_allow_html=True)

    # --- RIGHT COLUMN: GRAPH & ACTION BAR ---
    with col_graph:
        center_id = st.session_state.graph_center_id

        if center_id:
            # Pin the action container above the graph (layout fix).
            action_container = st.container()

            # Load graph data and full note data (incl. path) for the editor.
            with st.spinner("Lade Graph..."):  # FIX: was a placeholder-less f-string
                nodes, edges = graph_service.get_ego_graph(
                    center_id,
                    depth=st.session_state.graph_depth,
                    show_labels=st.session_state.graph_show_labels
                )
                note_data = graph_service.get_note_with_full_content(center_id)

            # Render the action bar.
            with action_container:
                c1, c2 = st.columns([3, 1])
                with c1:
                    st.caption(f"Aktives Zentrum: **{center_id}**")
                with c2:
                    if note_data:
                        st.button("📝 Bearbeiten",
                                  use_container_width=True,
                                  on_click=switch_to_editor_callback,
                                  args=(note_data,))
                    else:
                        st.error("Datenfehler: Note nicht gefunden")

                # Debug inspector.
                with st.expander("🕵️ Data Inspector", expanded=False):
                    if note_data:
                        st.json(note_data)
                        if 'path' in note_data:
                            st.success(f"Pfad OK: {note_data['path']}")
                        else:
                            st.error("Pfad fehlt!")
                    else:
                        st.info("Leer.")

            if not nodes:
                st.warning("Keine Daten gefunden.")
            else:
                # --- CONFIGURATION (BarnesHut) ---
                # Height trick forces a re-render (the `key` parameter
                # sometimes crashes the component).
                dyn_height = 800 + (abs(st.session_state.graph_gravity) % 5)

                config = Config(
                    width=1000,
                    height=dyn_height,
                    directed=True,
                    physics={
                        "enabled": True,
                        "solver": "barnesHut",
                        "barnesHut": {
                            "gravitationalConstant": st.session_state.graph_gravity,
                            "centralGravity": 0.005,  # crucial for spreading the layout
                            "springLength": st.session_state.graph_spacing,
                            "springConstant": 0.04,
                            "damping": 0.09,
                            "avoidOverlap": 0.1
                        },
                        "stabilization": {"enabled": True, "iterations": 600}
                    },
                    hierarchical=False,
                    nodeHighlightBehavior=True,
                    highlightColor="#F7A7A6",
                    collapsible=False
                )

                return_value = agraph(nodes=nodes, edges=edges, config=config)

                # Interaction: click on a node.
                if return_value:
                    if return_value != center_id:
                        # Navigate: make the clicked node the new center.
                        st.session_state.graph_center_id = return_value
                        st.rerun()
                    else:
                        # Click on the center itself.
                        st.toast(f"Zentrum: {return_value}")

        else:
            st.info("👈 Bitte wähle links eine Notiz aus, um den Graphen zu starten.")
|
||||
397
app/frontend/ui_graph_cytoscape.py
Normal file
397
app/frontend/ui_graph_cytoscape.py
Normal file
|
|
@ -0,0 +1,397 @@
|
|||
"""
|
||||
FILE: app/frontend/ui_graph_cytoscape.py
|
||||
DESCRIPTION: Moderner Graph-Explorer (Cytoscape.js). Features: COSE-Layout, Deep-Linking (URL Params), Active Inspector Pattern (CSS-Styling ohne Re-Render).
|
||||
VERSION: 2.6.0
|
||||
STATUS: Active
|
||||
DEPENDENCIES: streamlit, st_cytoscape, qdrant_client, ui_config, ui_callbacks
|
||||
LAST_ANALYSIS: 2025-12-15
|
||||
"""
|
||||
|
||||
import streamlit as st
|
||||
from st_cytoscape import cytoscape
|
||||
from qdrant_client import models
|
||||
from ui_config import COLLECTION_PREFIX, GRAPH_COLORS
|
||||
from ui_callbacks import switch_to_editor_callback
|
||||
|
||||
def update_url_params():
    """Callback: mirror slider values into the logic vars and persist them in the URL."""
    # Slider widget key -> logic variable held in session state.
    slider_to_state = {
        "cy_depth_slider": "cy_depth",
        "cy_len_slider": "cy_ideal_edge_len",
        "cy_rep_slider": "cy_node_repulsion",
    }
    for widget_key, state_key in slider_to_state.items():
        if widget_key in st.session_state:
            st.session_state[state_key] = st.session_state[widget_key]

    # Write the current values into the URL query string (deep-linking).
    st.query_params["depth"] = st.session_state.cy_depth
    st.query_params["len"] = st.session_state.cy_ideal_edge_len
    st.query_params["rep"] = st.session_state.cy_node_repulsion
|
||||
|
||||
def render_graph_explorer_cytoscape(graph_service):
    """
    Modern Cytoscape.js-based graph explorer.

    Features: COSE layout, deep-linking of layout settings via URL query
    params, and an "active inspector" pattern — clicking a node only
    changes its CSS class and fills the inspector, without re-centering
    the graph.

    Args:
        graph_service: data layer exposing `client` (Qdrant),
            `get_ego_graph()` and `get_note_with_full_content()`.
    """
    st.header("🕸️ Graph Explorer (Cytoscape)")

    # ---------------------------------------------------------
    # 1. STATE & PERSISTENCE
    # ---------------------------------------------------------
    if "graph_center_id" not in st.session_state:
        st.session_state.graph_center_id = None

    if "graph_inspected_id" not in st.session_state:
        st.session_state.graph_inspected_id = None

    # Load settings from the URL when present, otherwise use defaults.
    params = st.query_params

    # Helper to parse ints safely from the query string.
    def get_param(key, default):
        try: return int(params.get(key, default))
        except: return default

    # Initialize session-state variables if not present yet.
    if "cy_depth" not in st.session_state:
        st.session_state.cy_depth = get_param("depth", 2)

    if "cy_ideal_edge_len" not in st.session_state:
        st.session_state.cy_ideal_edge_len = get_param("len", 150)

    if "cy_node_repulsion" not in st.session_state:
        st.session_state.cy_node_repulsion = get_param("rep", 1000000)

    col_ctrl, col_graph = st.columns([1, 4])

    # ---------------------------------------------------------
    # 2. LEFT PANEL (controls)
    # ---------------------------------------------------------
    with col_ctrl:
        st.subheader("Fokus")

        search_term = st.text_input("Suche Notiz", placeholder="Titel eingeben...", key="cy_search")

        if search_term:
            try:
                # Title search against the notes collection.
                hits, _ = graph_service.client.scroll(
                    collection_name=f"{COLLECTION_PREFIX}_notes",
                    limit=10,
                    scroll_filter=models.Filter(must=[models.FieldCondition(key="title", match=models.MatchText(text=search_term))])
                )
                options = {}
                for h in hits:
                    if h.payload and 'title' in h.payload and 'note_id' in h.payload:
                        title = h.payload['title']
                        note_id = h.payload['note_id']
                        # Avoid duplicates (several chunks/notes may share a title).
                        if title not in options:
                            options[title] = note_id

                if options:
                    selected_title = st.selectbox("Ergebnisse:", list(options.keys()), key="cy_select")
                    if st.button("Laden", use_container_width=True, key="cy_load"):
                        new_id = options[selected_title]
                        # Center and inspect the freshly loaded note.
                        st.session_state.graph_center_id = new_id
                        st.session_state.graph_inspected_id = new_id
                        st.rerun()
                else:
                    # Tell the user when the search returned nothing.
                    st.info(f"Keine Notizen mit '{search_term}' im Titel gefunden.")
            except Exception as e:
                st.error(f"Fehler bei der Suche: {e}")
                import traceback
                st.code(traceback.format_exc())

        st.divider()

        # LAYOUT SETTINGS (synced to the URL via update_url_params)
        with st.expander("👁️ Layout Einstellungen", expanded=True):
            st.slider("Tiefe (Tier)", 1, 3,
                      value=st.session_state.cy_depth,
                      key="cy_depth_slider",
                      on_change=update_url_params)

            st.markdown("**COSE Layout**")
            st.slider("Kantenlänge", 50, 600,
                      value=st.session_state.cy_ideal_edge_len,
                      key="cy_len_slider",
                      on_change=update_url_params)

            st.slider("Knoten-Abstoßung", 100000, 5000000, step=100000,
                      value=st.session_state.cy_node_repulsion,
                      key="cy_rep_slider",
                      on_change=update_url_params)

            if st.button("Neu berechnen", key="cy_rerun"):
                st.rerun()

        st.divider()
        st.caption("Legende")
        for k, v in list(GRAPH_COLORS.items())[:8]:
            st.markdown(f"<span style='color:{v}'>●</span> {k}", unsafe_allow_html=True)

    # ---------------------------------------------------------
    # 3. RIGHT PANEL (GRAPH & INSPECTOR)
    # ---------------------------------------------------------
    with col_graph:
        center_id = st.session_state.graph_center_id

        # Fallback init: promote the inspected node to center.
        if not center_id and st.session_state.graph_inspected_id:
            center_id = st.session_state.graph_inspected_id
            st.session_state.graph_center_id = center_id

        if center_id:
            # Keep inspection in sync with the center.
            if not st.session_state.graph_inspected_id:
                st.session_state.graph_inspected_id = center_id

            inspected_id = st.session_state.graph_inspected_id

            # --- LOAD DATA ---
            with st.spinner(f"Lade Graph (Tiefe {st.session_state.cy_depth})..."):
                # 1. Graph data
                nodes_data, edges_data = graph_service.get_ego_graph(
                    center_id,
                    depth=st.session_state.cy_depth
                )
                # 2. Detail data (inspector)
                inspected_data = graph_service.get_note_with_full_content(inspected_id)

            # DEBUG: show diagnostic information about the loaded graph.
            with st.expander("🔍 Debug-Informationen", expanded=False):
                st.write(f"**Gefundene Knoten:** {len(nodes_data) if nodes_data else 0}")
                st.write(f"**Gefundene Kanten:** {len(edges_data) if edges_data else 0}")
                if nodes_data:
                    st.write("**Knoten-IDs:**")
                    for n in nodes_data[:10]:
                        nid = getattr(n, 'id', 'N/A')
                        st.write(f" - {nid}")
                    if len(nodes_data) > 10:
                        st.write(f" ... und {len(nodes_data) - 10} weitere")
                if edges_data:
                    st.write("**Kanten:**")
                    for e in edges_data[:10]:
                        src = getattr(e, "source", "N/A")
                        tgt = getattr(e, "to", getattr(e, "target", "N/A"))
                        st.write(f" - {src} -> {tgt}")
                    if len(edges_data) > 10:
                        st.write(f" ... und {len(edges_data) - 10} weitere")

            # --- ACTION BAR ---
            action_container = st.container()
            with action_container:
                c1, c2, c3 = st.columns([2, 1, 1])

                with c1:
                    title_show = inspected_data.get('title', inspected_id) if inspected_data else inspected_id
                    st.info(f"**Ausgewählt:** {title_show}")

                with c2:
                    # NAVIGATION: make the inspected node the new center.
                    if inspected_id != center_id:
                        if st.button("🎯 Als Zentrum setzen", use_container_width=True, key="cy_nav_btn"):
                            st.session_state.graph_center_id = inspected_id
                            st.rerun()
                    else:
                        st.caption("_(Ist aktuelles Zentrum)_")

                with c3:
                    # EDIT: jump to the editor with the loaded note data.
                    if inspected_data:
                        st.button("📝 Bearbeiten",
                                  use_container_width=True,
                                  on_click=switch_to_editor_callback,
                                  args=(inspected_data,),
                                  key="cy_edit_btn")

            # --- DATA INSPECTOR ---
            with st.expander("🕵️ Data Inspector (Details)", expanded=False):
                if inspected_data:
                    col_i1, col_i2 = st.columns(2)
                    with col_i1:
                        st.markdown(f"**ID:** `{inspected_data.get('note_id')}`")
                        st.markdown(f"**Typ:** `{inspected_data.get('type')}`")
                    with col_i2:
                        # Tags may arrive as a list or a plain string.
                        tags = inspected_data.get('tags', [])
                        if isinstance(tags, list):
                            tags_str = ', '.join(tags) if tags else "Keine"
                        else:
                            tags_str = str(tags) if tags else "Keine"
                        st.markdown(f"**Tags:** {tags_str}")
                        path_check = "✅" if inspected_data.get('path') else "❌"
                        st.markdown(f"**Pfad:** {path_check}")

                    st.caption("Inhalt (Vorschau):")
                    st.text_area("Content Preview", inspected_data.get('fulltext', '')[:1000], height=200, disabled=True, label_visibility="collapsed")

                    with st.expander("📄 Raw JSON anzeigen"):
                        st.json(inspected_data)
                else:
                    st.warning("Keine Daten geladen.")

            # --- GRAPH ELEMENTS ---
            cy_elements = []

            # Validate: bail out early when no nodes were returned.
            if not nodes_data:
                st.warning("⚠️ Keine Knoten gefunden. Bitte wähle eine andere Notiz.")
                # Still surface the inspector data when available.
                if inspected_data:
                    st.info(f"**Hinweis:** Die Notiz '{inspected_data.get('title', inspected_id)}' wurde gefunden, hat aber keine Verbindungen im Graphen.")
                return

            # Set of all node ids for fast edge validation below.
            node_ids = {n.id for n in nodes_data if hasattr(n, 'id') and n.id}

            # Add nodes.
            for n in nodes_data:
                if not hasattr(n, 'id') or not n.id:
                    continue

                is_center = (n.id == center_id)
                is_inspected = (n.id == inspected_id)

                tooltip_text = getattr(n, 'title', None) or getattr(n, 'label', '')
                display_label = getattr(n, 'label', str(n.id))
                # Wrap long labels onto two lines at the first space.
                if len(display_label) > 15 and " " in display_label:
                    display_label = display_label.replace(" ", "\n", 1)

                cy_node = {
                    "data": {
                        "id": n.id,
                        "label": display_label,
                        "bg_color": getattr(n, 'color', '#8395a7'),
                        "tooltip": tooltip_text
                    },
                    # Appearance is driven purely by classes (.inspected / .center).
                    "classes": " ".join([c for c in ["center" if is_center else "", "inspected" if is_inspected else ""] if c]),
                    "selected": False
                }
                cy_elements.append(cy_node)

            # Add edges — only when both endpoints exist in the graph.
            if edges_data:
                for e in edges_data:
                    source_id = getattr(e, "source", None)
                    target_id = getattr(e, "to", getattr(e, "target", None))
                    # Only append when both ids are set AND both nodes are present.
                    if source_id and target_id and source_id in node_ids and target_id in node_ids:
                        cy_edge = {
                            "data": {
                                "source": source_id,
                                "target": target_id,
                                "label": getattr(e, "label", ""),
                                "line_color": getattr(e, "color", "#bdc3c7")
                            }
                        }
                        cy_elements.append(cy_edge)

            # --- STYLESHEET ---
            stylesheet = [
                {
                    "selector": "node",
                    "style": {
                        "label": "data(label)",
                        "width": "30px", "height": "30px",
                        "background-color": "data(bg_color)",
                        "color": "#333", "font-size": "12px",
                        "text-valign": "center", "text-halign": "center",
                        "text-wrap": "wrap", "text-max-width": "90px",
                        "border-width": 2, "border-color": "#fff",
                        "title": "data(tooltip)"
                    }
                },
                # Inspected (yellow border)
                {
                    "selector": ".inspected",
                    "style": {
                        "border-width": 6,
                        "border-color": "#FFC300",
                        "width": "50px", "height": "50px",
                        "font-weight": "bold",
                        "z-index": 999
                    }
                },
                # Center (red border)
                {
                    "selector": ".center",
                    "style": {
                        "border-width": 4,
                        "border-color": "#FF5733",
                        "width": "40px", "height": "40px"
                    }
                },
                # Both center and inspected
                {
                    "selector": ".center.inspected",
                    "style": {
                        "border-width": 6,
                        "border-color": "#FF5733",
                        "width": "55px", "height": "55px"
                    }
                },
                # Suppress the component's default selection overlay
                {
                    "selector": "node:selected",
                    "style": {
                        "border-width": 0,
                        "overlay-opacity": 0
                    }
                },
                {
                    "selector": "edge",
                    "style": {
                        "width": 2,
                        "line-color": "data(line_color)",
                        "target-arrow-color": "data(line_color)",
                        "target-arrow-shape": "triangle",
                        "curve-style": "bezier",
                        "label": "data(label)",
                        "font-size": "10px", "color": "#666",
                        "text-background-opacity": 0.8, "text-background-color": "#fff"
                    }
                }
            ]

            # --- RENDER ---
            # Only render when elements exist.
            if not cy_elements:
                st.warning("⚠️ Keine Graph-Elemente zum Anzeigen gefunden.")
            else:
                # Key encodes center + layout params so the widget resets on change.
                graph_key = f"cy_{center_id}_{st.session_state.cy_depth}_{st.session_state.cy_ideal_edge_len}"

                clicked_elements = cytoscape(
                    elements=cy_elements,
                    stylesheet=stylesheet,
                    layout={
                        "name": "cose",
                        "idealEdgeLength": st.session_state.cy_ideal_edge_len,
                        "nodeOverlap": 20,
                        "refresh": 20,
                        "fit": True,
                        "padding": 50,
                        "randomize": False,
                        "componentSpacing": 100,
                        "nodeRepulsion": st.session_state.cy_node_repulsion,
                        "edgeElasticity": 100,
                        "nestingFactor": 5,
                        "gravity": 80,
                        "numIter": 1000,
                        "initialTemp": 200,
                        "coolingFactor": 0.95,
                        "minTemp": 1.0,
                        "animate": False
                    },
                    key=graph_key,
                    height="700px"
                )

                # --- EVENT HANDLING ---
                if clicked_elements:
                    clicked_nodes = clicked_elements.get("nodes", [])
                    if clicked_nodes:
                        clicked_id = clicked_nodes[0]

                        # Only update the inspection target; re-centering
                        # happens via the explicit button above.
                        if clicked_id != st.session_state.graph_inspected_id:
                            st.session_state.graph_inspected_id = clicked_id
                            st.rerun()

        else:
            st.info("👈 Bitte wähle links eine Notiz aus.")
|
||||
448
app/frontend/ui_graph_service.py
Normal file
448
app/frontend/ui_graph_service.py
Normal file
|
|
@ -0,0 +1,448 @@
|
|||
"""
|
||||
FILE: app/frontend/ui_graph_service.py
|
||||
DESCRIPTION: Data Layer für den Graphen. Greift direkt auf Qdrant zu (Performance), um Knoten/Kanten zu laden und Texte zu rekonstruieren ("Stitching").
|
||||
VERSION: 2.6.0
|
||||
STATUS: Active
|
||||
DEPENDENCIES: qdrant_client, streamlit_agraph, ui_config, re
|
||||
LAST_ANALYSIS: 2025-12-15
|
||||
"""
|
||||
|
||||
import re
|
||||
from qdrant_client import QdrantClient, models
|
||||
from streamlit_agraph import Node, Edge
|
||||
from ui_config import COLLECTION_PREFIX, GRAPH_COLORS, get_edge_color, SYSTEM_EDGES
|
||||
|
||||
class GraphExplorerService:
|
||||
def __init__(self, url, api_key=None, prefix=None):
|
||||
"""
|
||||
Initialisiert den Service. Nutzt COLLECTION_PREFIX aus der Config,
|
||||
sofern kein spezifischer Prefix übergeben wurde.
|
||||
"""
|
||||
self.client = QdrantClient(url=url, api_key=api_key)
|
||||
self.prefix = prefix if prefix else COLLECTION_PREFIX
|
||||
self.notes_col = f"{self.prefix}_notes"
|
||||
self.chunks_col = f"{self.prefix}_chunks"
|
||||
self.edges_col = f"{self.prefix}_edges"
|
||||
self._note_cache = {}
|
||||
|
||||
def get_note_with_full_content(self, note_id):
    """
    Return the note's metadata with its full text reconstructed from
    the stored chunks ("stitching"). Needed by the editor fallback.

    Returns:
        A dict (copy, so the cache is never mutated) whose 'fulltext'
        field is replaced by the freshly stitched text when available,
        or None when the note cannot be found.
    """
    # Metadata first; bail out early on unknown ids.
    meta = self._fetch_note_cached(note_id)
    if not meta:
        return None

    # Work on a copy so the cache entry stays untouched.
    complete_note = dict(meta)

    # Rebuild the body from the chunk collection; only overwrite the
    # 'fulltext' field when stitching actually produced text.
    stitched = self._fetch_full_text_stitched(note_id)
    if stitched:
        complete_note['fulltext'] = stitched

    return complete_note
|
||||
|
||||
def get_ego_graph(self, center_note_id: str, depth=2, show_labels=True):
    """
    Build the ego graph around a central note.

    Loads the full text for the center node and short previews for its
    neighbors; both are injected into the node tooltips.

    Args:
        center_note_id: id of the note to center the graph on.
        depth: 1 = direct neighbors only; >1 adds a second hop
            (capped for performance, see below).
        show_labels: when False, edge labels are rendered as a blank.

    Returns:
        (nodes, edges): lists of streamlit_agraph Node/Edge objects;
        ([], []) when the center note cannot be loaded.
    """
    nodes_dict = {}
    unique_edges = {}

    # 1. Load the center note; without it there is no graph.
    center_note = self._fetch_note_cached(center_note_id)
    if not center_note: return [], []
    self._add_node_to_dict(nodes_dict, center_note, level=0)

    # Seed set for the neighbor search.
    level_1_ids = {center_note_id}

    # Find edges touching the center (level 1).
    l1_edges = self._find_connected_edges([center_note_id], center_note.get("title"))

    for edge_data in l1_edges:
        src_id, tgt_id = self._process_edge(edge_data, nodes_dict, unique_edges, current_depth=1)
        if src_id: level_1_ids.add(src_id)
        if tgt_id: level_1_ids.add(tgt_id)

    # Level-2 expansion, capped at 80 level-1 nodes for performance.
    if depth > 1 and len(level_1_ids) > 1 and len(level_1_ids) < 80:
        l1_subset = list(level_1_ids - {center_note_id})
        if l1_subset:
            l2_edges = self._find_connected_edges_batch(l1_subset)
            for edge_data in l2_edges:
                self._process_edge(edge_data, nodes_dict, unique_edges, current_depth=2)

    # --- SMART CONTENT LOADING ---

    # A. Stitch the center node's full text and pack it into the tooltip
    #    (agraph renders the Node.title attribute on hover).
    center_text = self._fetch_full_text_stitched(center_note_id)
    if center_note_id in nodes_dict:
        orig_title = nodes_dict[center_note_id].title
        clean_full = self._clean_markdown(center_text[:2000])
        nodes_dict[center_note_id].title = f"{orig_title}\n\n📄 INHALT:\n{clean_full}..."

    # B. Batch-fetch short previews for all neighbor nodes.
    all_ids = list(nodes_dict.keys())
    previews = self._fetch_previews_for_nodes(all_ids)

    for nid, node_obj in nodes_dict.items():
        if nid != center_note_id:
            prev_raw = previews.get(nid, "Kein Vorschau-Text.")
            clean_prev = self._clean_markdown(prev_raw[:600])
            node_obj.title = f"{node_obj.title}\n\n🔍 VORSCHAU:\n{clean_prev}..."

    # Finalize the edges: color by relation kind; dashed when the edge
    # was inferred rather than explicit/rule-based provenance.
    final_edges = []
    for (src, tgt), data in unique_edges.items():
        kind = data['kind']
        prov = data['provenance']
        color = get_edge_color(kind)
        is_smart = (prov != "explicit" and prov != "rule")

        # Edge label: relation kind, or a blank placeholder.
        label_text = kind if show_labels else " "

        final_edges.append(Edge(
            source=src, target=tgt, label=label_text, color=color, dashes=is_smart,
            title=f"Relation: {kind}\nProvenance: {prov}"
        ))

    return list(nodes_dict.values()), final_edges
|
||||
|
||||
def _clean_markdown(self, text):
|
||||
"""Entfernt Markdown-Sonderzeichen für saubere Tooltips im Browser."""
|
||||
if not text: return ""
|
||||
# Entferne Header Marker (## )
|
||||
text = re.sub(r'#+\s', '', text)
|
||||
# Entferne Bold/Italic (** oder *)
|
||||
text = re.sub(r'\*\*|__|\*|_', '', text)
|
||||
# Entferne Links [Text](Url) -> Text
|
||||
text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)
|
||||
# Entferne Wikilinks [[Link]] -> Link
|
||||
text = re.sub(r'\[\[([^\]]+)\]\]', r'\1', text)
|
||||
return text
|
||||
|
||||
def _fetch_full_text_stitched(self, note_id):
    """Load every chunk of a note and stitch them back into one text.

    Scrolls the chunks collection for points whose payload ``note_id``
    matches, restores document order via the ``ord`` payload field
    (chunks without ``ord`` sort last), and joins the ``text`` payloads
    with blank lines.

    :param note_id: id of the note whose chunks are fetched
    :return: the stitched text, or a German error string if the
             Qdrant roundtrip fails
    """
    try:
        scroll_filter = models.Filter(
            must=[models.FieldCondition(key="note_id", match=models.MatchValue(value=note_id))]
        )
        # Limit of 100 assumed high enough to cover all chunks of a single note.
        chunks, _ = self.client.scroll(self.chunks_col, scroll_filter=scroll_filter, limit=100, with_payload=True)
        # Restore document order; missing 'ord' pushes a chunk to the end.
        chunks.sort(key=lambda x: x.payload.get('ord', 999))

        full_text = []
        for c in chunks:
            # 'text' is the plain chunk content without overlap.
            txt = c.payload.get('text', '')
            if txt:
                full_text.append(txt)

        return "\n\n".join(full_text)
    except Exception:
        # Was a bare 'except:', which also swallows KeyboardInterrupt /
        # SystemExit; keep the best-effort contract but only for Exception.
        return "Fehler beim Laden des Volltexts."
|
||||
|
||||
def _fetch_previews_for_nodes(self, node_ids):
    """Batch-fetch one preview chunk per note id.

    :param node_ids: list of note ids; an empty list short-circuits to {}
    :return: dict note_id -> preview text (payload 'window', falling back
             to 'text', then ""); notes without any chunk are absent
    """
    if not node_ids:
        return {}
    previews = {}
    try:
        scroll_filter = models.Filter(must=[models.FieldCondition(key="note_id", match=models.MatchAny(any=node_ids))])
        # Limit = node count * 3 as a buffer so most notes yield at least one chunk.
        chunks, _ = self.client.scroll(self.chunks_col, scroll_filter=scroll_filter, limit=len(node_ids)*3, with_payload=True)

        for c in chunks:
            nid = c.payload.get("note_id")
            # Keep only the first chunk found per note.
            if nid and nid not in previews:
                previews[nid] = c.payload.get("window") or c.payload.get("text") or ""
    except Exception:
        # Best effort: a failed preview lookup must not break graph rendering.
        # Was a bare 'except:', which also hid KeyboardInterrupt/SystemExit.
        pass
    return previews
|
||||
|
||||
def _find_connected_edges(self, note_ids, note_title=None):
    """
    Find incoming and outgoing edges for a set of notes.

    IMPORTANT: target_id contains only the title (without #section).
    target_section is a separate field for section information.

    :param note_ids: list of note ids to search edges for
    :param note_title: optional title used for title-based incoming matches
    :return: list of raw Qdrant edge points (with payload); may be empty
    """
    results = []
    if not note_ids:
        return results

    # 1. OUTGOING EDGES (the "owner" fix)
    # We look for edges whose 'note_id' (owner) field is one of our notes.
    # This finds ALL outgoing edges, whether they hang on a chunk or the note.
    out_filter = models.Filter(must=[
        models.FieldCondition(key="note_id", match=models.MatchAny(any=note_ids)),
        # System edges are excluded from the graph view.
        models.FieldCondition(key="kind", match=models.MatchExcept(**{"except": SYSTEM_EDGES}))
    ])
    res_out, _ = self.client.scroll(self.edges_col, scroll_filter=out_filter, limit=2000, with_payload=True)
    results.extend(res_out)

    # 2. INCOMING EDGES (target = chunk id, note id or title)
    # IMPORTANT: target_id holds only the title; target_section is separate.

    # Fetch the chunk ids belonging to the current notes.
    c_filter = models.Filter(must=[models.FieldCondition(key="note_id", match=models.MatchAny(any=note_ids))])
    chunks, _ = self.client.scroll(self.chunks_col, scroll_filter=c_filter, limit=1000, with_payload=False)
    chunk_ids = [c.id for c in chunks]

    shoulds = []
    # Case A: edge points at one of our chunks.
    if chunk_ids:
        shoulds.append(models.FieldCondition(key="target_id", match=models.MatchAny(any=chunk_ids)))

    # Case B: edge points directly at our note id.
    shoulds.append(models.FieldCondition(key="target_id", match=models.MatchAny(any=note_ids)))

    # Case C: edge points at our title.
    # IMPORTANT: target_id contains only the title (e.g. "Meine Prinzipien 2025").
    # target_section carries the section info (e.g. "P3 – Disziplin") when set.

    # Collect all relevant titles (including aliases).
    titles_to_search = []
    if note_title:
        titles_to_search.append(note_title)

    # Also load titles from the notes themselves (in case note_title was not passed).
    for nid in note_ids:
        note = self._fetch_note_cached(nid)
        if note:
            note_title_from_db = note.get("title")
            if note_title_from_db and note_title_from_db not in titles_to_search:
                titles_to_search.append(note_title_from_db)
            # Add aliases; a plain string alias is wrapped into a list first.
            aliases = note.get("aliases", [])
            if isinstance(aliases, str):
                aliases = [aliases]
            for alias in aliases:
                if alias and alias not in titles_to_search:
                    titles_to_search.append(alias)

    # For each title: search for an exact match.
    # target_id holds only the title, so MatchValue is sufficient.
    for title in titles_to_search:
        shoulds.append(models.FieldCondition(key="target_id", match=models.MatchValue(value=title)))

    if shoulds:
        in_filter = models.Filter(
            must=[models.FieldCondition(key="kind", match=models.MatchExcept(**{"except": SYSTEM_EDGES}))],
            should=shoulds
        )
        res_in, _ = self.client.scroll(self.edges_col, scroll_filter=in_filter, limit=2000, with_payload=True)
        results.extend(res_in)

    return results
|
||||
|
||||
def _find_connected_edges_batch(self, note_ids):
    """
    Level-2 search wrapper around _find_connected_edges.

    Resolves the title of the first note so the edge search can also
    match title-based incoming links. Empty input yields [].
    """
    if not note_ids:
        return []
    title = None
    anchor = self._fetch_note_cached(note_ids[0])
    if anchor:
        title = anchor.get("title")
    return self._find_connected_edges(note_ids, note_title=title)
|
||||
|
||||
def _process_edge(self, record, nodes_dict, unique_edges, current_depth):
    """
    Process a raw edge record, resolve its endpoint ids and add the
    results to the shared dictionaries.

    IMPORTANT: both directions are supported:
    - outgoing edges: source_id belongs to our note (via note_id owner)
    - incoming edges: target_id points at our note (via target_id match)

    :param record: raw Qdrant point carrying the edge payload
    :param nodes_dict: dict note_id -> Node, mutated in place
    :param unique_edges: dict (src_id, tgt_id) -> edge data, mutated in place
    :param current_depth: graph level assigned to newly added nodes
    :return: (src_id, tgt_id) on success, (None, None) otherwise
    """
    if not record or not record.payload:
        return None, None

    payload = record.payload
    src_ref = payload.get("source_id")
    tgt_ref = payload.get("target_id")
    kind = payload.get("kind")
    provenance = payload.get("provenance", "explicit")

    # Both endpoint references must be present.
    if not src_ref or not tgt_ref:
        return None, None

    # Resolve references to notes.
    # IMPORTANT: source_id can be a chunk id (note_id#c01), note id or title.
    # IMPORTANT: target_id can be a chunk id, note id or title (without #section).
    src_note = self._resolve_note_from_ref(src_ref)
    tgt_note = self._resolve_note_from_ref(tgt_ref)

    if src_note and tgt_note:
        src_id = src_note.get('note_id')
        tgt_id = tgt_note.get('note_id')

        # Both resolved ids must be present.
        if not src_id or not tgt_id:
            return None, None

        if src_id != tgt_id:  # self-loops are dropped
            # Add both endpoint nodes.
            self._add_node_to_dict(nodes_dict, src_note, level=current_depth)
            self._add_node_to_dict(nodes_dict, tgt_note, level=current_depth)

            # Add the edge (with deduplication).
            key = (src_id, tgt_id)
            existing = unique_edges.get(key)

            should_update = True
            # Prefer explicit edges over smart edges.
            is_current_explicit = (provenance in ["explicit", "rule"])
            if existing:
                is_existing_explicit = (existing.get('provenance', '') in ["explicit", "rule"])
                # Never let a smart edge overwrite an explicit/rule edge.
                if is_existing_explicit and not is_current_explicit:
                    should_update = False

            if should_update:
                unique_edges[key] = {"source": src_id, "target": tgt_id, "kind": kind, "provenance": provenance}
            return src_id, tgt_id
    return None, None
|
||||
|
||||
def _fetch_note_cached(self, note_id):
    """Return a note payload by id, memoized in self._note_cache.

    On a cache miss, scroll the notes collection for an exact
    'note_id' match; cache and return the first hit, else None.
    """
    if note_id in self._note_cache:
        return self._note_cache[note_id]

    match_filter = models.Filter(
        must=[models.FieldCondition(key="note_id", match=models.MatchValue(value=note_id))]
    )
    hits, _ = self.client.scroll(
        collection_name=self.notes_col,
        scroll_filter=match_filter,
        limit=1, with_payload=True
    )
    if not hits:
        return None

    payload = hits[0].payload
    self._note_cache[note_id] = payload
    return payload
|
||||
|
||||
def _resolve_note_from_ref(self, ref_str):
    """
    Resolve a reference string to a note payload.

    IMPORTANT: if ref_str has a Title#Section format, only the title part is used.
    Supported forms:
    - note id: "20250101-meine-note"
    - chunk id: "20250101-meine-note#c01"
    - title: "Meine Prinzipien 2025"
    - title#section: "Meine Prinzipien 2025#P3 – Disziplin" (section is cut off, only the title is searched)

    :param ref_str: raw source_id/target_id value from an edge payload
    :return: the note's payload dict (also written to self._note_cache), or None
    """
    if not ref_str:
        return None

    # Case A: contains '#' (can be a chunk id or title#section).
    if "#" in ref_str:
        try:
            # Attempt 1: chunk id directly (format: note_id#c01).
            res = self.client.retrieve(self.chunks_col, ids=[ref_str], with_payload=True)
            if res and res[0].payload:
                note_id = res[0].payload.get("note_id")
                if note_id:
                    return self._fetch_note_cached(note_id)
        except:
            # NOTE(review): bare except also hides KeyboardInterrupt/SystemExit
            # — consider narrowing to 'except Exception'.
            pass

        # Attempt 2: NoteID#Section (cut the hash off and try as a note id),
        # e.g. "20250101-meine-note#Abschnitt" -> "20250101-meine-note".
        possible_note_id = ref_str.split("#")[0].strip()
        note = self._fetch_note_cached(possible_note_id)
        if note:
            return note

        # Attempt 3: Title#Section (cut the hash off and search by title),
        # e.g. "Meine Prinzipien 2025#P3 – Disziplin" -> "Meine Prinzipien 2025".
        # IMPORTANT: target_id contains only the title, so we search the title part only.
        possible_title = ref_str.split("#")[0].strip()
        if possible_title:
            # Exact title match first.
            res, _ = self.client.scroll(
                collection_name=self.notes_col,
                scroll_filter=models.Filter(must=[
                    models.FieldCondition(key="title", match=models.MatchValue(value=possible_title))
                ]),
                limit=1, with_payload=True
            )
            if res and res[0].payload:
                payload = res[0].payload
                self._note_cache[payload['note_id']] = payload
                return payload

            # Fallback: full-text search for fuzzy matching.
            res, _ = self.client.scroll(
                collection_name=self.notes_col,
                scroll_filter=models.Filter(must=[
                    models.FieldCondition(key="title", match=models.MatchText(text=possible_title))
                ]),
                limit=10, with_payload=True
            )
            if res:
                # Take the first result that matches possible_title exactly or as a prefix.
                for r in res:
                    if r.payload:
                        note_title = r.payload.get("title", "")
                        if note_title == possible_title or note_title.startswith(possible_title):
                            payload = r.payload
                            self._note_cache[payload['note_id']] = payload
                            return payload

    # Case B: note id directly.
    note = self._fetch_note_cached(ref_str)
    if note:
        return note

    # Case C: title (exact match).
    res, _ = self.client.scroll(
        collection_name=self.notes_col,
        scroll_filter=models.Filter(must=[
            models.FieldCondition(key="title", match=models.MatchValue(value=ref_str))
        ]),
        limit=1, with_payload=True
    )
    if res and res[0].payload:
        payload = res[0].payload
        self._note_cache[payload['note_id']] = payload
        return payload

    # Case D: title (full-text search for fuzzy matching).
    res, _ = self.client.scroll(
        collection_name=self.notes_col,
        scroll_filter=models.Filter(must=[
            models.FieldCondition(key="title", match=models.MatchText(text=ref_str))
        ]),
        limit=1, with_payload=True
    )
    if res and res[0].payload:
        payload = res[0].payload
        self._note_cache[payload['note_id']] = payload
        return payload

    return None
|
||||
|
||||
def _add_node_to_dict(self, node_dict, note_payload, level=1):
    """Insert an agraph Node for a note payload into node_dict (idempotent).

    Existing entries and payloads without a note_id are left untouched.
    Level 0 (the center node) is rendered bigger and as a diamond;
    deeper levels shrink and level >= 2 loses its font label.
    """
    nid = note_payload.get("note_id")
    if not nid or nid in node_dict:
        return

    ntype = note_payload.get("type", "default")
    node_color = GRAPH_COLORS.get(ntype, GRAPH_COLORS["default"])

    # Base tooltip; callers may append preview/full text later.
    tooltip = f"Titel: {note_payload.get('title')}\nTyp: {ntype}"

    # Node size shrinks with graph distance from the center.
    size_by_level = {0: 45, 1: 25}
    node_size = size_by_level.get(level, 15)

    node_dict[nid] = Node(
        id=nid,
        label=note_payload.get('title', nid),
        size=node_size,
        color=node_color,
        shape="diamond" if level <= 0 else "dot",
        title=tooltip,
        font={'color': 'black', 'face': 'arial', 'size': 14 if level < 2 else 0}
    )
|
||||
45
app/frontend/ui_sidebar.py
Normal file
45
app/frontend/ui_sidebar.py
Normal file
|
|
@ -0,0 +1,45 @@
|
|||
"""
|
||||
FILE: app/frontend/ui_sidebar.py
|
||||
DESCRIPTION: Rendert die Sidebar. Steuert den Modus-Wechsel (Chat/Editor/Graph) und globale Settings (Top-K).
|
||||
VERSION: 2.6.0
|
||||
STATUS: Active
|
||||
DEPENDENCIES: streamlit, ui_utils, ui_config
|
||||
LAST_ANALYSIS: 2025-12-15
|
||||
"""
|
||||
|
||||
import streamlit as st
|
||||
from ui_utils import load_history_from_logs
|
||||
from ui_config import HISTORY_FILE
|
||||
|
||||
def render_sidebar():
    """Render the sidebar and return the user's global choices.

    Returns a tuple (mode, top_k, explain):
    - mode: the selected view label from the radio ("💬 Chat", ...)
    - top_k: number of sources for retrieval (slider 1..10, default 5)
    - explain: whether the explanation layer is enabled (toggle, default True)
    """
    with st.sidebar:
        st.title("🧠 mindnet")
        st.caption("v2.6 | WP-19 Graph View")

        # Seed the radio's session state once so reruns keep the selection.
        if "sidebar_mode_selection" not in st.session_state:
            st.session_state["sidebar_mode_selection"] = "💬 Chat"

        mode = st.radio(
            "Modus",
            [
                "💬 Chat",
                "📝 Manueller Editor",
                "🕸️ Graph (Agraph)",
                "🕸️ Graph (Cytoscape)"  # <-- new entry
            ],
            key="sidebar_mode_selection"
        )

        st.divider()
        st.subheader("⚙️ Settings")
        top_k = st.slider("Quellen (Top-K)", 1, 10, 5)
        explain = st.toggle("Explanation Layer", True)

        st.divider()
        st.subheader("🕒 Verlauf")
        # One button per recent query; clicking replays it into the chat.
        for q in load_history_from_logs(HISTORY_FILE, 8):
            if st.button(f"🔎 {q[:25]}...", key=f"hist_{q}", use_container_width=True):
                st.session_state.messages.append({"role": "user", "content": q})
                st.rerun()

    return mode, top_k, explain
|
||||
146
app/frontend/ui_utils.py
Normal file
146
app/frontend/ui_utils.py
Normal file
|
|
@ -0,0 +1,146 @@
|
|||
"""
|
||||
FILE: app/frontend/ui_utils.py
|
||||
DESCRIPTION: String-Utilities. Parser für Markdown/YAML (LLM-Healing) und Helper für History-Loading.
|
||||
VERSION: 2.6.0
|
||||
STATUS: Active
|
||||
DEPENDENCIES: re, yaml, unicodedata, json, datetime
|
||||
LAST_ANALYSIS: 2025-12-15
|
||||
"""
|
||||
|
||||
import re
|
||||
import yaml
|
||||
import unicodedata
|
||||
import json
|
||||
from datetime import datetime
|
||||
|
||||
def slugify(value):
    """Turn an arbitrary title into a lowercase ASCII slug.

    German umlauts and sharp-s are transliterated (ä->ae, ö->oe, ü->ue,
    ß->ss), '&'/'+' become 'und', remaining non-ASCII is dropped via an
    NFKD fold, punctuation is removed, and whitespace/dash runs collapse
    to single dashes. Falsy input yields "".
    """
    if not value:
        return ""
    slug = str(value).lower()
    # Transliterate before the ASCII fold, which would otherwise drop these.
    for src, dst in (('ä', 'ae'), ('ö', 'oe'), ('ü', 'ue'), ('ß', 'ss'), ('&', 'und'), ('+', 'und')):
        slug = slug.replace(src, dst)

    slug = unicodedata.normalize('NFKD', slug).encode('ascii', 'ignore').decode('ascii')
    slug = re.sub(r'[^\w\s-]', '', slug).strip()
    return re.sub(r'[-\s]+', '-', slug)
|
||||
|
||||
def normalize_meta_and_body(meta, body):
    """Normalize LLM-produced metadata and fold unknown keys into the body.

    Healing steps:
    - The German key 'titel' is renamed to 'title' (mutates ``meta``).
    - Tag-like keys (tags, emotionale_keywords, keywords, schluesselwoerter)
      are merged; '#' is stripped from each tag and duplicates removed.
    - Unknown string-valued keys are preserved as '## Header' sections
      prepended to the body instead of being dropped silently.

    :param meta: parsed YAML frontmatter dict (may contain arbitrary keys)
    :param body: markdown body text
    :return: tuple (clean_meta, final_body)
    """
    ALLOWED_KEYS = {"title", "type", "status", "tags", "id", "created", "updated", "aliases", "lang"}
    clean_meta = {}
    extra_content = []

    # Heal the German key name coming from LLM output.
    if "titel" in meta and "title" not in meta:
        meta["title"] = meta.pop("titel")

    tag_candidates = ["tags", "emotionale_keywords", "keywords", "schluesselwoerter"]
    all_tags = []
    for key in tag_candidates:
        if key in meta:
            val = meta[key]
            if isinstance(val, list):
                all_tags.extend(val)
            elif isinstance(val, str):
                all_tags.extend([t.strip() for t in val.split(",")])

    for key, val in meta.items():
        if key in ALLOWED_KEYS:
            clean_meta[key] = val
        elif key in tag_candidates:
            pass  # already merged into all_tags above
        else:
            # Keep unknown scalar metadata visible as a body section.
            if val and isinstance(val, str):
                header = key.replace("_", " ").title()
                extra_content.append(f"## {header}\n{val}\n")

    if all_tags:
        clean_tags = []
        for t in all_tags:
            t_clean = str(t).replace("#", "").strip()
            if t_clean:
                clean_tags.append(t_clean)
        # dict.fromkeys dedupes while preserving first-seen order;
        # list(set(...)) made the tag order nondeterministic across runs.
        clean_meta["tags"] = list(dict.fromkeys(clean_tags))

    if extra_content:
        new_section = "\n".join(extra_content)
        final_body = f"{new_section}\n{body}"
    else:
        final_body = body

    return clean_meta, final_body
|
||||
|
||||
def parse_markdown_draft(full_text):
    """Parse an LLM-generated markdown draft into normalized (meta, body).

    Healing steps:
    1. Unwrap a ```markdown/md/yaml fenced block if present.
    2. Split YAML frontmatter delimited by '---' lines; if that fails but
       the text starts with '---', fall back to everything before the
       first heading line.
    3. Strip '#' from the YAML before parsing and tolerate parse failures.
    4. Derive a missing title from the first H1 and map type 'draft' to
       status 'draft' / type 'experience'.

    :param full_text: raw LLM output
    :return: result of normalize_meta_and_body(meta, body)
    """
    clean_text = full_text.strip()
    pattern_block = r"```(?:markdown|md|yaml)?\s*(.*?)\s*```"
    match_block = re.search(pattern_block, clean_text, re.DOTALL | re.IGNORECASE)
    if match_block:
        clean_text = match_block.group(1).strip()

    meta = {}
    body = clean_text
    yaml_str = ""

    # maxsplit=2 keeps any further '---' lines inside the body intact.
    parts = re.split(r"^---+\s*$", clean_text, maxsplit=2, flags=re.MULTILINE)

    if len(parts) >= 3:
        yaml_str = parts[1]
        body = parts[2]
    elif clean_text.startswith("---"):
        # Frontmatter without a closing '---': take everything up to the
        # first heading line as YAML.
        fallback_match = re.search(r"^---\s*(.*?)(?=\n#)", clean_text, re.DOTALL | re.MULTILINE)
        if fallback_match:
            yaml_str = fallback_match.group(1)
            body = clean_text.replace(f"---{yaml_str}", "", 1).strip()

    if yaml_str:
        # NOTE(review): stripping every '#' also mangles values that contain
        # '#', not just YAML comments — confirm this healing is intended.
        yaml_str_clean = yaml_str.replace("#", "")
        try:
            parsed = yaml.safe_load(yaml_str_clean)
            if isinstance(parsed, dict):
                meta = parsed
        except Exception as e:
            print(f"YAML Parsing Warning: {e}")

    # Derive a title from the first H1 if the YAML did not provide one.
    if not meta.get("title"):
        h1_match = re.search(r"^#\s+(.*)$", body, re.MULTILINE)
        if h1_match:
            meta["title"] = h1_match.group(1).strip()

    # 'draft' is treated as a status, not a content type.
    if meta.get("type") == "draft":
        meta["status"] = "draft"
        meta["type"] = "experience"

    return normalize_meta_and_body(meta, body)
|
||||
|
||||
def build_markdown_doc(meta, body):
    """Serialize (meta, body) into a markdown document with YAML frontmatter.

    Generates a date-prefixed slug id when missing (or still the
    'generated_on_save' placeholder), stamps 'updated' with today's date,
    and emits the frontmatter keys in the order id/type/title/status/tags
    followed by the rest.

    NOTE: mutates ``meta`` in place (id/updated are written back and the
    priority keys are popped) — existing callers may rely on this.

    :return: full document string "---\\n<yaml>\\n---\\n\\n<body>"
    """
    if "id" not in meta or meta["id"] == "generated_on_save":
        raw_title = meta.get('title', 'note')
        clean_slug = slugify(raw_title)[:50] or "note"
        meta["id"] = f"{datetime.now().strftime('%Y%m%d')}-{clean_slug}"

    meta["updated"] = datetime.now().strftime("%Y-%m-%d")

    # Stable, human-friendly key order for the frontmatter.
    ordered_meta = {}
    prio_keys = ["id", "type", "title", "status", "tags"]
    for k in prio_keys:
        if k in meta:
            ordered_meta[k] = meta.pop(k)
    ordered_meta.update(meta)

    try:
        yaml_str = yaml.dump(ordered_meta, default_flow_style=None, sort_keys=False, allow_unicode=True).strip()
    except Exception:
        # Best-effort: never crash the save path on YAML serialization.
        # Was a bare 'except:', which also hid KeyboardInterrupt/SystemExit.
        yaml_str = "error: generating_yaml"

    return f"---\n{yaml_str}\n---\n\n{body}"
|
||||
|
||||
def load_history_from_logs(filepath, limit=10):
    """Read the most recent unique query strings from a JSONL log file.

    :param filepath: pathlib.Path to the JSONL log (one JSON object per
                     line, the query stored under key 'query_text')
    :param limit: maximum number of distinct queries to return
    :return: newest-first list of unique query strings (possibly empty)
    """
    queries = []
    if not filepath.exists():
        return queries
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            lines = f.readlines()
        # Newest entries are appended to the file, so walk backwards.
        for line in reversed(lines):
            try:
                entry = json.loads(line)
            except ValueError:
                # Skip malformed lines (json.JSONDecodeError is a ValueError).
                continue
            q = entry.get("query_text") if isinstance(entry, dict) else None
            if q and q not in queries:
                queries.append(q)
                if len(queries) >= limit:
                    break
    except OSError:
        # An unreadable log is non-fatal: history is best-effort UI sugar.
        # Was a bare 'except:', which also hid KeyboardInterrupt/SystemExit.
        pass
    return queries
|
||||
140
app/main.py
140
app/main.py
|
|
@ -1,25 +1,145 @@
|
|||
"""
|
||||
Version 0.3
|
||||
FILE: app/main.py
|
||||
DESCRIPTION: Bootstrap der FastAPI Anwendung für WP-25a (Agentic MoE).
|
||||
Orchestriert Lifespan-Events, globale Fehlerbehandlung und Routing.
|
||||
Prüft beim Start die Integrität der Mixture of Experts Konfiguration.
|
||||
VERSION: 1.1.0 (WP-25a: MoE Integrity Check)
|
||||
STATUS: Active
|
||||
DEPENDENCIES: app.config, app.routers.*, app.services.llm_service
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
import logging
|
||||
import os
|
||||
from contextlib import asynccontextmanager
|
||||
from fastapi import FastAPI, Request
|
||||
from fastapi.responses import JSONResponse
|
||||
|
||||
from fastapi import FastAPI
|
||||
from .config import get_settings
|
||||
from .routers.embed_router import router as embed_router
|
||||
from .routers.qdrant_router import router as qdrant_router
|
||||
from .services.llm_service import LLMService
|
||||
|
||||
# Import der Router
|
||||
from .routers.query import router as query_router
|
||||
from .routers.graph import router as graph_router
|
||||
from .routers.tools import router as tools_router
|
||||
from .routers.feedback import router as feedback_router
|
||||
from .routers.chat import router as chat_router
|
||||
from .routers.ingest import router as ingest_router
|
||||
|
||||
try:
|
||||
from .routers.admin import router as admin_router
|
||||
except Exception:
|
||||
admin_router = None
|
||||
|
||||
from .core.logging_setup import setup_logging
|
||||
|
||||
# Initialisierung des Loggings noch VOR create_app()
|
||||
setup_logging()
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# --- WP-25a: Lifespan Management mit MoE Integritäts-Prüfung ---
|
||||
|
||||
@asynccontextmanager
|
||||
async def lifespan(app: FastAPI):
|
||||
"""
|
||||
Verwaltet den Lebenszyklus der Anwendung (Startup/Shutdown).
|
||||
Verifiziert die Verfügbarkeit der MoE-Experten-Profile und Strategien.
|
||||
"""
|
||||
settings = get_settings()
|
||||
logger.info("🚀 mindnet API: Starting up (WP-25a MoE Mode)...")
|
||||
|
||||
# 1. Startup: Integritäts-Check der MoE Konfiguration
|
||||
# Wir prüfen die drei Säulen der Agentic-RAG Architektur.
|
||||
decision_cfg = os.getenv("MINDNET_DECISION_CONFIG", "config/decision_engine.yaml")
|
||||
profiles_cfg = getattr(settings, "LLM_PROFILES_PATH", "config/llm_profiles.yaml")
|
||||
prompts_cfg = settings.PROMPTS_PATH
|
||||
|
||||
missing_files = []
|
||||
if not os.path.exists(decision_cfg): missing_files.append(decision_cfg)
|
||||
if not os.path.exists(profiles_cfg): missing_files.append(profiles_cfg)
|
||||
if not os.path.exists(prompts_cfg): missing_files.append(prompts_cfg)
|
||||
|
||||
if missing_files:
|
||||
logger.error(f"❌ CRITICAL: Missing MoE config files: {missing_files}")
|
||||
else:
|
||||
logger.info("✅ MoE Configuration files verified.")
|
||||
|
||||
yield
|
||||
|
||||
# 2. Shutdown: Ressourcen bereinigen
|
||||
logger.info("🛑 mindnet API: Shutting down...")
|
||||
try:
|
||||
llm = LLMService()
|
||||
await llm.close()
|
||||
logger.info("✨ LLM resources cleaned up.")
|
||||
except Exception as e:
|
||||
logger.warning(f"⚠️ Error during LLMService cleanup: {e}")
|
||||
|
||||
logger.info("Goodbye.")
|
||||
|
||||
# --- App Factory ---
|
||||
|
||||
def create_app() -> FastAPI:
|
||||
app = FastAPI(title="mindnet API", version="0.1.0")
|
||||
"""Initialisiert die FastAPI App mit WP-25a Erweiterungen."""
|
||||
app = FastAPI(
|
||||
title="mindnet API",
|
||||
version="1.1.0", # WP-25a Milestone
|
||||
lifespan=lifespan,
|
||||
description="Digital Twin Knowledge Engine mit Mixture of Experts Orchestration."
|
||||
)
|
||||
|
||||
s = get_settings()
|
||||
|
||||
# --- Globale Fehlerbehandlung (WP-25a Resilienz) ---
|
||||
|
||||
@app.exception_handler(Exception)
|
||||
async def global_exception_handler(request: Request, exc: Exception):
|
||||
"""Fängt unerwartete Fehler in der MoE-Prozesskette ab."""
|
||||
logger.error(f"❌ Unhandled Engine Error: {exc}", exc_info=True)
|
||||
return JSONResponse(
|
||||
status_code=500,
|
||||
content={
|
||||
"detail": "Ein interner Fehler ist in der MoE-Kette aufgetreten.",
|
||||
"error_type": type(exc).__name__
|
||||
}
|
||||
)
|
||||
|
||||
# Healthcheck
|
||||
@app.get("/healthz")
|
||||
def healthz():
|
||||
return {"status": "ok", "qdrant": s.QDRANT_URL, "prefix": s.COLLECTION_PREFIX}
|
||||
"""Bietet Statusinformationen über die Engine und Datenbank-Verbindung."""
|
||||
# WP-24c v4.5.10: Prüfe EdgeDTO-Version zur Laufzeit
|
||||
edge_dto_supports_callout = False
|
||||
try:
|
||||
from app.models.dto import EdgeDTO
|
||||
import inspect
|
||||
source = inspect.getsource(EdgeDTO)
|
||||
edge_dto_supports_callout = "explicit:callout" in source
|
||||
except Exception:
|
||||
pass # Fehler beim Prüfen ist nicht kritisch
|
||||
|
||||
return {
|
||||
"status": "ok",
|
||||
"version": "1.1.0",
|
||||
"qdrant": s.QDRANT_URL,
|
||||
"prefix": s.COLLECTION_PREFIX,
|
||||
"moe_enabled": True,
|
||||
"edge_dto_supports_callout": edge_dto_supports_callout # WP-24c v4.5.10: Diagnose-Hilfe
|
||||
}
|
||||
|
||||
# Inkludieren der Router (100% Kompatibilität erhalten)
|
||||
app.include_router(query_router, prefix="/query", tags=["query"])
|
||||
app.include_router(graph_router, prefix="/graph", tags=["graph"])
|
||||
app.include_router(tools_router, prefix="/tools", tags=["tools"])
|
||||
app.include_router(feedback_router, prefix="/feedback", tags=["feedback"])
|
||||
app.include_router(chat_router, prefix="/chat", tags=["chat"]) # WP-25a Agentic Chat
|
||||
app.include_router(ingest_router, prefix="/ingest", tags=["ingest"])
|
||||
|
||||
if admin_router:
|
||||
app.include_router(admin_router, prefix="/admin", tags=["admin"])
|
||||
|
||||
app.include_router(embed_router)
|
||||
app.include_router(qdrant_router)
|
||||
return app
|
||||
|
||||
|
||||
app = create_app()
|
||||
# Instanziierung der App
|
||||
app = create_app()
|
||||
24
app/models/__init__.py
Normal file
24
app/models/__init__.py
Normal file
|
|
@ -0,0 +1,24 @@
|
|||
"""
|
||||
app/models/__init__.py — Package-Init für mindnet DTOs (WP-04)
|
||||
Version: 0.1.0 • Stand: 2025-10-07
|
||||
"""
|
||||
|
||||
from .dto import (
|
||||
EdgeKind,
|
||||
NodeDTO,
|
||||
EdgeDTO,
|
||||
QueryRequest,
|
||||
QueryHit,
|
||||
QueryResponse,
|
||||
GraphResponse,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"EdgeKind",
|
||||
"NodeDTO",
|
||||
"EdgeDTO",
|
||||
"QueryRequest",
|
||||
"QueryHit",
|
||||
"QueryResponse",
|
||||
"GraphResponse",
|
||||
]
|
||||
181
app/models/dto.py
Normal file
181
app/models/dto.py
Normal file
|
|
@ -0,0 +1,181 @@
|
|||
"""
|
||||
FILE: app/models/dto.py
|
||||
DESCRIPTION: Pydantic-Modelle (DTOs) für Request/Response Bodies. Definiert das API-Schema.
|
||||
VERSION: 0.7.1 (WP-25: Stream-Tracing Support)
|
||||
STATUS: Active
|
||||
DEPENDENCIES: pydantic, typing, uuid
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
from pydantic import BaseModel, Field
|
||||
from typing import List, Literal, Optional, Dict, Any
|
||||
import uuid
|
||||
|
||||
# WP-25: Erweiterte Kanten-Typen gemäß neuer decision_engine.yaml
|
||||
EdgeKind = Literal[
|
||||
"references", "references_at", "backlink", "next", "prev",
|
||||
"belongs_to", "depends_on", "related_to", "similar_to",
|
||||
"caused_by", "derived_from", "based_on", "solves", "blocks",
|
||||
"uses", "guides", "enforced_by", "implemented_in", "part_of",
|
||||
"experienced_in", "impacts", "risk_of"
|
||||
]
|
||||
|
||||
|
||||
# --- Basis-DTOs ---
|
||||
|
||||
class NodeDTO(BaseModel):
    """Representation of a node (note or chunk) in the API graph."""
    id: str  # point id of the node
    type: Literal["note", "chunk"]
    title: Optional[str] = None
    note_id: Optional[str] = None  # presumably the owning note for chunk nodes — confirm
    tags: Optional[List[str]] = None
    in_degree: Optional[int] = None   # NOTE(review): likely incoming-edge count — confirm
    out_degree: Optional[int] = None  # NOTE(review): likely outgoing-edge count — confirm
    score: Optional[float] = None  # retrieval score, when the node came from a query
    section_title: Optional[str] = None
    section_path: Optional[str] = None
    path: Optional[str] = None  # presumably the source file path — confirm
|
||||
|
||||
|
||||
class EdgeDTO(BaseModel):
    """Representation of an edge in the API graph."""
    id: str
    kind: str  # relation type, e.g. one of the EdgeKind values
    source: str
    target: str
    weight: float
    direction: Literal["out", "in", "undirected"] = "out"
    # WP-24c v4.5.3: extended provenance values for chunk-aware edges.
    # Covers all provenance types actually used in the system.
    provenance: Optional[Literal[
        "explicit", "rule", "smart", "structure",
        "explicit:callout", "explicit:wikilink", "explicit:note_zone", "explicit:note_scope",
        "inline:rel", "callout:edge", "semantic_ai", "structure:belongs_to", "structure:order",
        "derived:backlink", "edge_defaults", "global_pool"
    ]] = "explicit"
    confidence: float = 1.0
    target_section: Optional[str] = None  # section info, kept separate from target
|
||||
|
||||
|
||||
# --- Request Models ---
|
||||
|
||||
class QueryRequest(BaseModel):
    """
    Request for /query. Supports multi-stream isolation via filters.
    WP-24c v4.1.0: extended with section filtering and scope awareness.
    """
    mode: Literal["semantic", "edge", "hybrid"] = "hybrid"
    query: Optional[str] = None  # raw query text (alternative to query_vector)
    query_vector: Optional[List[float]] = None  # precomputed embedding
    top_k: int = 10
    # Graph expansion defaults: depth 1 across the listed edge types.
    expand: Dict = {"depth": 1, "edge_types": ["references", "belongs_to", "prev", "next", "depends_on", "related_to"]}
    filters: Optional[Dict] = None
    ret: Dict = {"with_paths": True, "with_notes": True, "with_chunks": True}
    explain: bool = False

    # WP-22/25: dynamic weighting of the graph highways.
    boost_edges: Optional[Dict[str, float]] = None

    # WP-24c v4.1.0: section filtering for precise section links.
    target_section: Optional[str] = None
|
||||
|
||||
|
||||
class FeedbackRequest(BaseModel):
    """User feedback on a specific hit or the overall answer."""
    query_id: str = Field(..., description="ID der ursprünglichen Suche")
    node_id: str = Field(..., description="ID des bewerteten Treffers oder 'generated_answer'")
    score: int = Field(..., ge=1, le=5, description="1 (Irrelevant) bis 5 (Perfekt)")
    comment: Optional[str] = None  # optional free-text remark
|
||||
|
||||
|
||||
class ChatRequest(BaseModel):
    """Request for /chat (WP-25 entry point)."""
    message: str = Field(..., description="Die Nachricht des Users")
    conversation_id: Optional[str] = Field(None, description="ID für Chat-Verlauf")
    top_k: int = 5  # number of sources to retrieve
    explain: bool = False  # include the explanation layer in the response
|
||||
|
||||
|
||||
# --- Explanation Models ---
|
||||
|
||||
class ScoreBreakdown(BaseModel):
    """Breakdown of the score components per the WP-22 formula."""
    # Weighted contributions to the final score.
    semantic_contribution: float
    edge_contribution: float
    centrality_contribution: float
    # Raw (pre-weighting) component values.
    raw_semantic: float
    raw_edge_bonus: float
    raw_centrality: float
    node_weight: float
    status_multiplier: float = 1.0
    graph_boost_factor: float = 1.0
|
||||
|
||||
|
||||
class Reason(BaseModel):
    """A single semantic reason for the ranking."""
    # WP-25: 'status' added for synchronicity with retriever.py.
    kind: Literal["semantic", "edge", "type", "centrality", "lifecycle", "status"]
    message: str  # human-readable explanation
    score_impact: Optional[float] = None
    details: Optional[Dict[str, Any]] = None
|
||||
|
||||
|
||||
class Explanation(BaseModel):
    """Container for all explanation data of a single hit."""
    breakdown: ScoreBreakdown
    reasons: List[Reason]
    related_edges: Optional[List[EdgeDTO]] = None
    applied_intent: Optional[str] = None  # presumably the WP-25 strategy used — confirm
    applied_boosts: Optional[Dict[str, float]] = None
|
||||
|
||||
|
||||
# --- Response Models ---
|
||||
|
||||
class QueryHit(BaseModel):
    """
    A single hit object.

    WP-25: stream_origin added for tracing and feedback optimisation.
    WP-24c v4.1.0: source_chunk_id added for RAG context.
    """
    node_id: str
    note_id: str
    # Score components; total_score is the ranking key used by callers
    # (e.g. _collect_all_hits sorts on it).
    semantic_score: float
    edge_bonus: float
    centrality_bonus: float
    total_score: float
    # Graph paths that led to this hit, when path expansion ran.
    paths: Optional[List[List[Dict]]] = None
    # Source payload; discovery reads 'text' and 'title' keys from it.
    source: Optional[Dict] = None
    payload: Optional[Dict] = None
    explanation: Optional[Explanation] = None
    stream_origin: Optional[str] = Field(None, description="Name des Ursprungs-Streams")
    source_chunk_id: Optional[str] = Field(None, description="Chunk-ID der Quelle (für RAG-Kontext)")
|
||||
|
||||
|
||||
class QueryResponse(BaseModel):
    """Response structure for /query (also consumed by DecisionEngine streams)."""
    # A fresh UUID is generated when the caller does not supply one.
    query_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    results: List[QueryHit]
    # Retrieval mode that produced the results.
    used_mode: str
    latency_ms: int
|
||||
|
||||
|
||||
class GraphResponse(BaseModel):
    """Response structure for /graph/{note_id}."""
    center_note_id: str
    nodes: List[NodeDTO]
    edges: List[EdgeDTO]
    # Integer counters, e.g. node_count / edge_count.
    stats: Dict[str, int]
|
||||
|
||||
|
||||
class ChatResponse(BaseModel):
    """
    Response structure for /chat.

    WP-25: 'intent' now mirrors the chosen strategy.
    """
    query_id: str = Field(..., description="Traceability ID")
    answer: str = Field(..., description="Generierte Antwort vom LLM")
    sources: List[QueryHit] = Field(..., description="Die genutzten Quellen (alle Streams)")
    latency_ms: int
    intent: Optional[str] = Field("FACT", description="Die gewählte WP-25 Strategie")
    intent_source: Optional[str] = Field("LLM_Router", description="Quelle der Intent-Erkennung")
|
||||
38
app/models/init.py
Normal file
38
app/models/init.py
Normal file
|
|
@ -0,0 +1,38 @@
|
|||
"""
|
||||
app/models/__init__.py — Package-Init für mindnet DTOs
|
||||
|
||||
Zweck:
|
||||
Stellt zentrale API-/DTO-Modelle als Import-Fassade bereit.
|
||||
Kompatibilität:
|
||||
Python 3.12+, Pydantic 2.x
|
||||
Version:
|
||||
0.1.0 (Erstanlage für WP-04)
|
||||
Stand:
|
||||
2025-10-07
|
||||
Bezug:
|
||||
Handbuch / Knowledge Design (WP-03/04)
|
||||
Nutzung:
|
||||
from app.models import QueryRequest, QueryResponse, GraphResponse
|
||||
Änderungsverlauf:
|
||||
0.1.0 (2025-10-07) – Erstanlage.
|
||||
"""
|
||||
|
||||
from .dto import (
|
||||
EdgeKind,
|
||||
NodeDTO,
|
||||
EdgeDTO,
|
||||
QueryRequest,
|
||||
QueryHit,
|
||||
QueryResponse,
|
||||
GraphResponse,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"EdgeKind",
|
||||
"NodeDTO",
|
||||
"EdgeDTO",
|
||||
"QueryRequest",
|
||||
"QueryHit",
|
||||
"QueryResponse",
|
||||
"GraphResponse",
|
||||
]
|
||||
50
app/routers/admin.py
Normal file
50
app/routers/admin.py
Normal file
|
|
@ -0,0 +1,50 @@
|
|||
"""
|
||||
FILE: app/routers/admin.py
|
||||
DESCRIPTION: Monitoring-Endpunkt. Zeigt Qdrant-Collection-Counts und geladene Config.
|
||||
VERSION: 0.1.0
|
||||
STATUS: Active (Optional)
|
||||
DEPENDENCIES: qdrant_client, app.config
|
||||
LAST_ANALYSIS: 2025-12-15
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
from fastapi import APIRouter
|
||||
from qdrant_client import QdrantClient
|
||||
from app.config import get_settings
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
@router.get("/stats")
def stats():
    """Monitoring endpoint: Qdrant collection counts plus the loaded config.

    Returns a dict with exact point counts for the notes/chunks/edges
    collections and the active Qdrant/retriever configuration values.
    """
    s = get_settings()
    client = QdrantClient(url=s.QDRANT_URL, api_key=s.QDRANT_API_KEY)
    notes = f"{s.COLLECTION_PREFIX}_notes"
    chunks = f"{s.COLLECTION_PREFIX}_chunks"
    edges = f"{s.COLLECTION_PREFIX}_edges"

    def _count(col: str) -> int:
        # -1 signals "count unavailable" (e.g. missing collection or
        # connection failure) instead of failing the whole endpoint.
        try:
            return client.count(collection_name=col, exact=True).count
        except Exception:
            return -1

    return {
        "collections": {
            "notes": {"name": notes, "count": _count(notes)},
            "chunks": {"name": chunks, "count": _count(chunks)},
            "edges": {"name": edges, "count": _count(edges)},
        },
        "config": {
            "qdrant": s.QDRANT_URL,
            "prefix": s.COLLECTION_PREFIX,
            "vector_size": s.VECTOR_SIZE,
            "distance": s.DISTANCE,
            "retriever": {
                "w_sem": s.RETRIEVER_W_SEM,
                "w_edge": s.RETRIEVER_W_EDGE,
                "w_cent": s.RETRIEVER_W_CENT,
                "top_k": s.RETRIEVER_TOP_K,
                "expand_depth": s.RETRIEVER_EXPAND_DEPTH,
            },
        },
    }
|
||||
362
app/routers/chat.py
Normal file
362
app/routers/chat.py
Normal file
|
|
@ -0,0 +1,362 @@
|
|||
"""
|
||||
FILE: app/routers/chat.py
|
||||
DESCRIPTION: Haupt-Chat-Interface (WP-25b Edition).
|
||||
Kombiniert die spezialisierte Interview-Logik mit der neuen
|
||||
Lazy-Prompt-Orchestration und MoE-Synthese.
|
||||
WP-24c: Integration der Discovery API für proaktive Vernetzung.
|
||||
VERSION: 3.1.0 (WP-24c: Discovery API Integration)
|
||||
STATUS: Active
|
||||
FIX:
|
||||
- WP-24c: Neuer Endpunkt /query/discover für proaktive Kanten-Vorschläge.
|
||||
- WP-25b: Umstellung des Interview-Modus auf Lazy-Prompt (prompt_key + variables).
|
||||
- WP-25b: Delegation der RAG-Phase an die Engine v1.3.0 für konsistente MoE-Steuerung.
|
||||
- WP-25a: Voller Erhalt der v3.0.2 Logik (Interview, Schema-Resolution, FastPaths).
|
||||
"""
|
||||
|
||||
from fastapi import APIRouter, HTTPException, Depends
|
||||
from typing import List, Dict, Any, Optional
|
||||
from pydantic import BaseModel
|
||||
import time
|
||||
import uuid
|
||||
import logging
|
||||
import yaml
|
||||
import os
|
||||
import asyncio
|
||||
from pathlib import Path
|
||||
|
||||
from app.config import get_settings
|
||||
from app.models.dto import ChatRequest, ChatResponse, QueryHit, QueryRequest
|
||||
from app.services.llm_service import LLMService
|
||||
from app.services.feedback_service import log_search
|
||||
|
||||
router = APIRouter()
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# --- EBENE 0: DTOs FÜR DISCOVERY (WP-24c) ---
|
||||
|
||||
class DiscoveryRequest(BaseModel):
    # Raw text to analyse for potential edges to existing notes.
    content: str
    # Number of vector-search candidates to consider.
    top_k: int = 8
    # Minimum total_score a candidate hit must reach to be evaluated.
    min_confidence: float = 0.6
|
||||
|
||||
class DiscoveryHit(BaseModel):
    target_note: str          # note ID
    target_title: str         # human-readable title
    suggested_edge_type: str  # canonical type from edge_vocabulary
    confidence_score: float   # combined vector + AI score
    reasoning: str            # short AI justification
|
||||
|
||||
# --- EBENE 1: CONFIG LOADER & CACHING (WP-25 Standard) ---
|
||||
|
||||
# Process-wide config caches, populated lazily on first access
# (see get_full_config / get_types_config below).
_DECISION_CONFIG_CACHE = None
_TYPES_CONFIG_CACHE = None
|
||||
|
||||
def _load_decision_config() -> Dict[str, Any]:
    """Load the strategy configuration YAML referenced by DECISION_CONFIG_PATH.

    Returns the parsed mapping, or ``{"strategies": {}}`` when the file is
    missing, empty, or unreadable.
    """
    cfg_path = Path(get_settings().DECISION_CONFIG_PATH)
    try:
        if cfg_path.exists():
            with cfg_path.open("r", encoding="utf-8") as f:
                loaded = yaml.safe_load(f)
            return loaded if loaded else {}
    except Exception as e:
        logger.error(f"Failed to load decision config: {e}")
    return {"strategies": {}}
|
||||
|
||||
def _load_types_config() -> Dict[str, Any]:
    """Load types.yaml (path overridable via MINDNET_TYPES_FILE) for type detection.

    Returns an empty dict when the file is missing, empty, or unreadable.
    """
    cfg_file = os.getenv("MINDNET_TYPES_FILE", "config/types.yaml")
    try:
        if os.path.exists(cfg_file):
            with open(cfg_file, "r", encoding="utf-8") as handle:
                parsed = yaml.safe_load(handle)
            return parsed if parsed else {}
    except Exception as e:
        logger.error(f"Failed to load types config: {e}")
    return {}
|
||||
|
||||
def get_full_config() -> Dict[str, Any]:
    """Return the decision config, loading and caching it on first access."""
    global _DECISION_CONFIG_CACHE
    if _DECISION_CONFIG_CACHE is None:
        _DECISION_CONFIG_CACHE = _load_decision_config()
    return _DECISION_CONFIG_CACHE
|
||||
|
||||
def get_types_config() -> Dict[str, Any]:
    """Return the types.yaml config, loading and caching it on first access."""
    global _TYPES_CONFIG_CACHE
    if _TYPES_CONFIG_CACHE is None:
        _TYPES_CONFIG_CACHE = _load_types_config()
    return _TYPES_CONFIG_CACHE
|
||||
|
||||
def get_decision_strategy(intent: str) -> Dict[str, Any]:
    """Return the strategy block for an intent, falling back to FACT_WHAT."""
    strategies = get_full_config().get("strategies", {})
    default_strategy = strategies.get("FACT_WHAT", {})
    return strategies.get(intent, default_strategy)
|
||||
|
||||
# --- EBENE 2: SPEZIAL-LOGIK (INTERVIEW & DETECTION) ---
|
||||
|
||||
def _detect_target_type(message: str, configured_schemas: Dict[str, Any]) -> str:
    """WP-07: identify the requested note type from the message (keyword based).

    Resolution order: detection_keywords from types.yaml, then direct schema-key
    substring match, then a legacy German->English synonym map; 'default' last.
    """
    msg = message.lower()

    # 1. detection_keywords declared in types.yaml
    for type_name, type_data in get_types_config().get("types", {}).items():
        if any(kw.lower() in msg for kw in type_data.get("detection_keywords", [])):
            return type_name

    # 2. Direct substring match against configured schema keys
    for type_key in configured_schemas:
        if type_key != "default" and type_key in msg:
            return type_key

    # 3. Legacy synonym mapping
    synonyms = {
        "projekt": "project", "entscheidung": "decision", "ziel": "goal",
        "erfahrung": "experience", "wert": "value", "prinzip": "principle"
    }
    for term, schema_key in synonyms.items():
        if term in msg:
            return schema_key

    return "default"
|
||||
|
||||
def _is_question(query: str) -> bool:
|
||||
"""Prüft, ob der Input eine Frage ist."""
|
||||
q = query.strip().lower()
|
||||
if "?" in q: return True
|
||||
starters = ["wer", "wie", "was", "wo", "wann", "warum", "weshalb", "wozu", "welche", "bist du"]
|
||||
return any(q.startswith(s + " ") for s in starters)
|
||||
|
||||
async def _classify_intent(query: str, llm: LLMService) -> tuple[str, str]:
    """Hybrid router: keyword fast paths first, then the DecisionEngine LLM router.

    Returns a (intent, source-label) tuple.
    """
    strategies = get_full_config().get("strategies", {})
    q = query.lower()

    # 1. Fast path: per-strategy trigger keywords
    for intent_name, strategy in strategies.items():
        if any(kw.lower() in q for kw in strategy.get("trigger_keywords", [])):
            return intent_name, "Keyword (FastPath)"

    # 2. Fast path B: type keywords imply INTERVIEW — but only for statements,
    # never for questions.
    if not _is_question(q):
        for _type_name, type_data in get_types_config().get("types", {}).items():
            if any(kw.lower() in q for kw in type_data.get("detection_keywords", [])):
                return "INTERVIEW", "Keyword (Interview)"

    # 3. Slow path: DecisionEngine LLM router (MoE controlled)
    intent = await llm.decision_engine._determine_strategy(query)
    return intent, "DecisionEngine (LLM)"
|
||||
|
||||
# --- EBENE 3: RETRIEVAL AGGREGATION ---
|
||||
|
||||
def _collect_all_hits(stream_responses: Dict[str, Any]) -> List[QueryHit]:
    """Collect de-duplicated hits from all streams for tracing, best score first.

    The first occurrence of each node_id wins; results are sorted by
    total_score descending.
    """
    unique_hits: Dict[str, Any] = {}
    for response in stream_responses.values():
        # Only QueryResponse-like objects carry a `results` attribute.
        if not hasattr(response, 'results'):
            continue
        for hit in response.results:
            unique_hits.setdefault(hit.node_id, hit)
    return sorted(unique_hits.values(), key=lambda h: h.total_score, reverse=True)
|
||||
|
||||
# --- EBENE 4: ENDPUNKTE ---
|
||||
|
||||
def get_llm_service():
    # FastAPI dependency provider (used via Depends): returns a new
    # LLMService instance.
    return LLMService()
|
||||
|
||||
@router.post("/", response_model=ChatResponse)
|
||||
async def chat_endpoint(
|
||||
request: ChatRequest,
|
||||
llm: LLMService = Depends(get_llm_service)
|
||||
):
|
||||
start_time = time.time()
|
||||
query_id = str(uuid.uuid4())
|
||||
logger.info(f"🚀 [WP-25b] Chat request [{query_id}]: {request.message[:50]}...")
|
||||
|
||||
try:
|
||||
# 1. Intent Detection
|
||||
intent, intent_source = await _classify_intent(request.message, llm)
|
||||
logger.info(f"[{query_id}] Intent: {intent} via {intent_source}")
|
||||
|
||||
strategy = get_decision_strategy(intent)
|
||||
engine = llm.decision_engine
|
||||
|
||||
sources_hits = []
|
||||
answer_text = ""
|
||||
|
||||
# 2. INTERVIEW MODE (WP-25b Lazy-Prompt Logik)
|
||||
if intent == "INTERVIEW":
|
||||
target_type = _detect_target_type(request.message, strategy.get("schemas", {}))
|
||||
types_cfg = get_types_config()
|
||||
type_def = types_cfg.get("types", {}).get(target_type, {})
|
||||
fields_list = type_def.get("schema", [])
|
||||
|
||||
# WP-07: Restaurierte Fallback Logik
|
||||
if not fields_list:
|
||||
configured_schemas = strategy.get("schemas", {})
|
||||
fallback = configured_schemas.get(target_type, configured_schemas.get("default", {}))
|
||||
fields_list = fallback.get("fields", []) if isinstance(fallback, dict) else (fallback or [])
|
||||
|
||||
fields_str = "\n- " + "\n- ".join(fields_list)
|
||||
template_key = strategy.get("prompt_template", "interview_template")
|
||||
|
||||
# WP-25b: Lazy Loading Call
|
||||
answer_text = await llm.generate_raw_response(
|
||||
prompt_key=template_key,
|
||||
variables={
|
||||
"query": request.message,
|
||||
"target_type": target_type,
|
||||
"schema_fields": fields_str
|
||||
},
|
||||
system=llm.get_prompt("system_prompt"),
|
||||
priority="realtime",
|
||||
profile_name="compression_fast",
|
||||
max_retries=0
|
||||
)
|
||||
sources_hits = []
|
||||
|
||||
# 3. RAG MODE (WP-25b Delegation an Engine v1.3.0)
|
||||
else:
|
||||
# Phase A & B: Retrieval & Kompression (Delegiert an Engine v1.3.0)
|
||||
formatted_context_map = await engine._execute_parallel_streams(strategy, request.message)
|
||||
|
||||
# Erfassung der Quellen für das Tracing
|
||||
raw_stream_map = {}
|
||||
stream_keys = strategy.get("use_streams", [])
|
||||
library = engine.config.get("streams_library", {})
|
||||
|
||||
retrieval_tasks = []
|
||||
active_streams = []
|
||||
for key in stream_keys:
|
||||
if key in library:
|
||||
active_streams.append(key)
|
||||
retrieval_tasks.append(engine._run_single_stream(key, library[key], request.message))
|
||||
|
||||
responses = await asyncio.gather(*retrieval_tasks, return_exceptions=True)
|
||||
for name, res in zip(active_streams, responses):
|
||||
if not isinstance(res, Exception):
|
||||
raw_stream_map[name] = res
|
||||
|
||||
sources_hits = _collect_all_hits(raw_stream_map)
|
||||
|
||||
# Phase C: Finale MoE Synthese (Delegiert an Engine v1.3.0)
|
||||
answer_text = await engine._generate_final_answer(
|
||||
intent, strategy, request.message, formatted_context_map
|
||||
)
|
||||
|
||||
duration_ms = int((time.time() - start_time) * 1000)
|
||||
|
||||
# Logging (WP-15)
|
||||
try:
|
||||
log_search(
|
||||
query_id=query_id, query_text=request.message, results=sources_hits,
|
||||
mode=f"wp25b_{intent.lower()}", metadata={"strategy": intent, "source": intent_source}
|
||||
)
|
||||
except: pass
|
||||
|
||||
return ChatResponse(
|
||||
query_id=query_id, answer=answer_text, sources=sources_hits,
|
||||
latency_ms=duration_ms, intent=intent, intent_source=intent_source
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"❌ Chat Endpoint Failure: {e}", exc_info=True)
|
||||
raise HTTPException(status_code=500, detail="Fehler bei der Verarbeitung der Anfrage.")
|
||||
|
||||
@router.post("/query/discover", response_model=List[DiscoveryHit])
|
||||
async def discover_edges(
|
||||
request: DiscoveryRequest,
|
||||
llm: LLMService = Depends(get_llm_service)
|
||||
):
|
||||
"""
|
||||
WP-24c: Analysiert Text auf potenzielle Kanten zu bestehendem Wissen.
|
||||
Nutzt Vektor-Suche und DecisionEngine-Logik (WP-25b PROMPT-TRACE konform).
|
||||
"""
|
||||
start_time = time.time()
|
||||
logger.info(f"🔍 [WP-24c] Discovery triggered for content: {request.content[:50]}...")
|
||||
|
||||
try:
|
||||
# 1. Kandidaten-Suche via Retriever (Vektor-Match)
|
||||
search_req = QueryRequest(
|
||||
query=request.content,
|
||||
top_k=request.top_k,
|
||||
explain=True
|
||||
)
|
||||
candidates = await llm.decision_engine.retriever.search(search_req)
|
||||
|
||||
if not candidates.results:
|
||||
logger.info("ℹ️ No candidates found for discovery.")
|
||||
return []
|
||||
|
||||
# 2. KI-gestützte Beziehungs-Extraktion (WP-25b)
|
||||
discovery_results = []
|
||||
|
||||
# Zugriff auf gültige Kanten-Typen aus der Registry
|
||||
from app.services.edge_registry import registry as edge_reg
|
||||
valid_types_str = ", ".join(list(edge_reg.valid_types))
|
||||
|
||||
# Parallele Evaluierung der Kandidaten für maximale Performance
|
||||
async def evaluate_candidate(hit: QueryHit) -> Optional[DiscoveryHit]:
|
||||
if hit.total_score < request.min_confidence:
|
||||
return None
|
||||
|
||||
try:
|
||||
# Nutzt ingest_extractor Profil für präzise semantische Analyse
|
||||
# Wir verwenden das prompt_key Pattern (edge_extraction) gemäß WP-24c Vorgabe
|
||||
raw_suggestion = await llm.generate_raw_response(
|
||||
prompt_key="edge_extraction",
|
||||
variables={
|
||||
"note_id": "NEUER_INHALT",
|
||||
"text": f"PROXIMITY_TARGET: {hit.source.get('text', '')}\n\nNEW_CONTENT: {request.content}",
|
||||
"valid_types": valid_types_str
|
||||
},
|
||||
profile_name="ingest_extractor",
|
||||
priority="realtime"
|
||||
)
|
||||
|
||||
# Parsing der LLM Antwort (Erwartet JSON Liste)
|
||||
from app.core.ingestion.ingestion_utils import extract_json_from_response
|
||||
suggestions = extract_json_from_response(raw_suggestion)
|
||||
|
||||
if isinstance(suggestions, list) and len(suggestions) > 0:
|
||||
sugg = suggestions[0] # Wir nehmen den stärksten Vorschlag pro Hit
|
||||
return DiscoveryHit(
|
||||
target_note=hit.note_id,
|
||||
target_title=hit.source.get("title") or hit.note_id,
|
||||
suggested_edge_type=sugg.get("kind", "related_to"),
|
||||
confidence_score=hit.total_score,
|
||||
reasoning=f"Semantische Nähe ({int(hit.total_score*100)}%) entdeckt."
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(f"⚠️ Discovery evaluation failed for hit {hit.note_id}: {e}")
|
||||
return None
|
||||
|
||||
tasks = [evaluate_candidate(hit) for hit in candidates.results]
|
||||
results = await asyncio.gather(*tasks)
|
||||
|
||||
# Zusammenführung und Duplikat-Bereinigung
|
||||
seen_targets = set()
|
||||
for r in results:
|
||||
if r and r.target_note not in seen_targets:
|
||||
discovery_results.append(r)
|
||||
seen_targets.add(r.target_note)
|
||||
|
||||
duration = int((time.time() - start_time) * 1000)
|
||||
logger.info(f"✨ Discovery finished: found {len(discovery_results)} edges in {duration}ms")
|
||||
|
||||
return discovery_results
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"❌ Discovery API failure: {e}", exc_info=True)
|
||||
raise HTTPException(status_code=500, detail="Discovery-Prozess fehlgeschlagen.")
|
||||
|
|
@ -1,5 +1,10 @@
|
|||
"""
|
||||
Version 0.1
|
||||
FILE: app/routers/embed_router.py
|
||||
DESCRIPTION: Exponiert die lokale Embedding-Funktion als API-Endpunkt.
|
||||
VERSION: 0.1.0
|
||||
STATUS: Active
|
||||
DEPENDENCIES: app.embeddings, pydantic
|
||||
LAST_ANALYSIS: 2025-12-15
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
|
|
|||
24
app/routers/feedback.py
Normal file
24
app/routers/feedback.py
Normal file
|
|
@ -0,0 +1,24 @@
|
|||
"""
|
||||
FILE: app/routers/feedback.py
|
||||
DESCRIPTION: Endpunkt für explizites User-Feedback (WP-04c).
|
||||
VERSION: 0.1.0
|
||||
STATUS: Active
|
||||
DEPENDENCIES: app.models.dto, app.services.feedback_service
|
||||
LAST_ANALYSIS: 2025-12-15
|
||||
"""
|
||||
from fastapi import APIRouter, HTTPException
|
||||
from app.models.dto import FeedbackRequest
|
||||
from app.services.feedback_service import log_feedback
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
@router.post("", status_code=201)
|
||||
def post_feedback(fb: FeedbackRequest):
|
||||
"""
|
||||
Nimmt Feedback entgegen (z.B. Daumen hoch für einen Treffer).
|
||||
"""
|
||||
try:
|
||||
log_feedback(fb)
|
||||
return {"status": "recorded", "query_id": fb.query_id}
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
43
app/routers/graph.py
Normal file
43
app/routers/graph.py
Normal file
|
|
@ -0,0 +1,43 @@
|
|||
"""
|
||||
FILE: app/routers/graph.py
|
||||
DESCRIPTION: Liefert Graph-Daten (Knoten/Kanten) für UI-Visualisierungen basierend auf einer Seed-ID. (WP4)
|
||||
VERSION: 0.1.0
|
||||
STATUS: Active
|
||||
DEPENDENCIES: qdrant_client, app.models.dto, app.core.graph_adapter, app.config
|
||||
LAST_ANALYSIS: 2025-12-15
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import List, Optional
|
||||
from fastapi import APIRouter, Query
|
||||
from qdrant_client import QdrantClient
|
||||
from app.models.dto import GraphResponse, NodeDTO, EdgeDTO
|
||||
from app.core.graph.graph_subgraph import expand
|
||||
from app.config import get_settings
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
@router.get("/{note_id}", response_model=GraphResponse)
|
||||
def get_graph(note_id: str, depth: int = 1, edge_types: Optional[List[str]] = Query(None)) -> GraphResponse:
|
||||
s = get_settings()
|
||||
client = QdrantClient(url=s.QDRANT_URL, api_key=s.QDRANT_API_KEY)
|
||||
|
||||
sg = expand(client, s.COLLECTION_PREFIX, [note_id], depth=depth, edge_types=edge_types)
|
||||
|
||||
# Seed-Node (Payload optional später ergänzen über get_note_payload)
|
||||
nodes = [NodeDTO(id=note_id, type="note",
|
||||
title=None, tags=[],
|
||||
in_degree=sg.in_degree.get(note_id, 0),
|
||||
out_degree=sg.out_degree.get(note_id, 0))]
|
||||
|
||||
# direkte Out-Kanten des Seeds
|
||||
edges = []
|
||||
for e in sg.adj.get(note_id, []):
|
||||
edges.append(EdgeDTO(
|
||||
id=f"{note_id}->{e['target']}:{e['kind']}",
|
||||
kind=e["kind"], source=note_id, target=e["target"],
|
||||
weight=float(e["weight"]), direction="out"
|
||||
))
|
||||
|
||||
return GraphResponse(center_note_id=note_id, nodes=nodes, edges=edges,
|
||||
stats={"node_count": len(nodes), "edge_count": len(edges)})
|
||||
125
app/routers/ingest.py
Normal file
125
app/routers/ingest.py
Normal file
|
|
@ -0,0 +1,125 @@
|
|||
"""
|
||||
FILE: app/routers/ingest.py
|
||||
DESCRIPTION: Endpunkte für WP-11. Nimmt Markdown entgegen.
|
||||
Refactored für WP-14: Nutzt BackgroundTasks für non-blocking Save.
|
||||
Update WP-20: Unterstützung für Hybrid-Cloud-Analyse Feedback.
|
||||
VERSION: 0.8.0 (WP-20 Hybrid Ready)
|
||||
STATUS: Active
|
||||
DEPENDENCIES: app.core.ingestion, app.services.discovery, fastapi, pydantic
|
||||
"""
|
||||
|
||||
import os
|
||||
import time
|
||||
import logging
|
||||
import asyncio
|
||||
from fastapi import APIRouter, HTTPException, BackgroundTasks
|
||||
from pydantic import BaseModel
|
||||
from typing import Optional, Dict, Any
|
||||
|
||||
from app.core.ingestion import IngestionService
|
||||
from app.services.discovery import DiscoveryService
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
router = APIRouter()
|
||||
|
||||
# Services Init
|
||||
discovery_service = DiscoveryService()
|
||||
|
||||
class AnalyzeRequest(BaseModel):
    # Draft text to analyse for link suggestions.
    text: str
    # Note type hint forwarded to the discovery service.
    type: str = "concept"
|
||||
|
||||
class SaveRequest(BaseModel):
    markdown_content: str
    # Optional explicit filename; a draft_<timestamp>.md name is generated otherwise.
    filename: Optional[str] = None
    # Vault sub-folder for the new note.
    folder: str = "00_Inbox"
|
||||
|
||||
class SaveResponse(BaseModel):
    status: str     # e.g. "queued" when processing asynchronously
    file_path: str
    note_id: str    # "pending" until background ingestion resolves the real ID
    message: str    # new: UX feedback text
    stats: Dict[str, Any]  # may contain -1 sentinels during async processing
|
||||
|
||||
# --- Background Task Wrapper ---
|
||||
async def run_ingestion_task(markdown_content: str, filename: str, vault_root: str, folder: str):
    """
    Run the ingestion in the background so the HTTP request does not block.
    Integrates the WP-20 hybrid mode via the IngestionService.

    Args:
        markdown_content: Raw markdown to ingest.
        filename: Target file name inside the vault.
        vault_root: Absolute path of the vault root directory.
        folder: Vault sub-folder for the new note.
    """
    # FIX: the log messages previously printed the literal "(unknown)" instead
    # of interpolating the filename, making background failures untraceable.
    logger.info(f"🔄 Background Task started: Ingesting {filename}...")
    try:
        ingest_service = IngestionService()
        result = await ingest_service.create_from_text(
            markdown_content=markdown_content,
            filename=filename,
            vault_root=vault_root,
            folder=folder
        )
        # Later: trigger notification services (websockets) from here.
        if result.get("status") == "error":
            logger.error(f"❌ Background Ingestion Error for {filename}: {result.get('error')}")
        else:
            logger.info(f"✅ Background Task finished: {filename} ({result.get('chunks_count')} Chunks)")

    except Exception as e:
        logger.error(f"❌ Critical Background Task Failure: {e}", exc_info=True)
|
||||
|
||||
|
||||
@router.post("/analyze")
|
||||
async def analyze_draft(req: AnalyzeRequest):
|
||||
"""
|
||||
WP-11 Intelligence: Liefert Link-Vorschläge via DiscoveryService.
|
||||
"""
|
||||
try:
|
||||
result = await discovery_service.analyze_draft(req.text, req.type)
|
||||
return result
|
||||
except Exception as e:
|
||||
logger.error(f"Analyze failed: {e}", exc_info=True)
|
||||
return {"suggestions": [], "error": str(e)}
|
||||
|
||||
@router.post("/save", response_model=SaveResponse)
|
||||
async def save_note(req: SaveRequest, background_tasks: BackgroundTasks):
|
||||
"""
|
||||
WP-14 Fix: Startet Ingestion im Hintergrund (Fire & Forget).
|
||||
Verhindert Timeouts bei aktiver Smart-Edge-Allocation (WP-15) und Cloud-Hybrid-Modus (WP-20).
|
||||
"""
|
||||
try:
|
||||
vault_root = os.getenv("MINDNET_VAULT_ROOT", "./vault")
|
||||
abs_vault_root = os.path.abspath(vault_root)
|
||||
|
||||
if not os.path.exists(abs_vault_root):
|
||||
try:
|
||||
os.makedirs(abs_vault_root, exist_ok=True)
|
||||
except Exception as e:
|
||||
logger.warning(f"Could not create vault root: {e}")
|
||||
|
||||
final_filename = req.filename or f"draft_{int(time.time())}.md"
|
||||
|
||||
# Wir geben sofort eine ID zurück (optimistisch),
|
||||
# auch wenn die echte ID erst nach dem Parsing feststeht.
|
||||
# Für UI-Feedback nutzen wir den Filename.
|
||||
|
||||
# Task in die Queue schieben
|
||||
background_tasks.add_task(
|
||||
run_ingestion_task,
|
||||
markdown_content=req.markdown_content,
|
||||
filename=final_filename,
|
||||
vault_root=abs_vault_root,
|
||||
folder=req.folder
|
||||
)
|
||||
|
||||
return SaveResponse(
|
||||
status="queued",
|
||||
file_path=os.path.join(req.folder, final_filename),
|
||||
note_id="pending",
|
||||
message="Speicherung & Hybrid-KI-Analyse (WP-20) im Hintergrund gestartet.",
|
||||
stats={
|
||||
"chunks": -1, # Indikator für Async
|
||||
"edges": -1
|
||||
}
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Save dispatch failed: {e}", exc_info=True)
|
||||
raise HTTPException(status_code=500, detail=f"Save dispatch failed: {str(e)}")
|
||||
|
|
@ -1,160 +0,0 @@
|
|||
"""
|
||||
Version 0.1
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, Optional, List
|
||||
import uuid
|
||||
|
||||
from fastapi import APIRouter
|
||||
from pydantic import BaseModel, Field
|
||||
from qdrant_client import QdrantClient
|
||||
from qdrant_client.http.models import (
|
||||
Distance,
|
||||
VectorParams,
|
||||
PointStruct,
|
||||
Filter,
|
||||
FieldCondition,
|
||||
MatchValue,
|
||||
)
|
||||
|
||||
from ..config import get_settings
|
||||
from ..embeddings import embed_texts
|
||||
|
||||
router = APIRouter(prefix="/qdrant", tags=["qdrant"])
|
||||
|
||||
def _client() -> QdrantClient:
|
||||
s = get_settings()
|
||||
return QdrantClient(url=s.QDRANT_URL, api_key=s.QDRANT_API_KEY)
|
||||
|
||||
def _col(name: str) -> str:
|
||||
return f"{get_settings().COLLECTION_PREFIX}_{name}"
|
||||
|
||||
def _uuid5(s: str) -> str:
|
||||
"""Deterministic UUIDv5 from arbitrary string (server-side point id)."""
|
||||
return str(uuid.uuid5(uuid.NAMESPACE_URL, s))
|
||||
|
||||
# --- Models ---
|
||||
class BaseMeta(BaseModel):
|
||||
note_id: str = Field(..., description="Stable ID of the note (e.g., hash of vault-relative path)")
|
||||
title: Optional[str] = Field(None, description="Note or chunk title")
|
||||
path: Optional[str] = Field(None, description="Vault-relative path to the .md file")
|
||||
Typ: Optional[str] = None
|
||||
Status: Optional[str] = None
|
||||
tags: Optional[List[str]] = None
|
||||
Rolle: Optional[List[str]] = None # allow list
|
||||
|
||||
class UpsertChunkRequest(BaseMeta):
|
||||
chunk_id: str = Field(..., description="Stable ID of the chunk within the note")
|
||||
text: str = Field(..., description="Chunk text content")
|
||||
links: Optional[List[str]] = Field(default=None, description="Outbound links detected in the chunk")
|
||||
|
||||
class UpsertNoteRequest(BaseMeta):
|
||||
text: Optional[str] = Field(None, description="Full note text (optional)")
|
||||
|
||||
class UpsertEdgeRequest(BaseModel):
|
||||
src_note_id: str
|
||||
dst_note_id: Optional[str] = None
|
||||
src_chunk_id: Optional[str] = None
|
||||
dst_chunk_id: Optional[str] = None
|
||||
relation: str = Field(default="links_to")
|
||||
link_text: Optional[str] = None
|
||||
|
||||
class QueryRequest(BaseModel):
|
||||
query: str
|
||||
limit: int = 5
|
||||
note_id: Optional[str] = None
|
||||
path: Optional[str] = None
|
||||
tags: Optional[List[str]] = None
|
||||
|
||||
# --- Helpers ---
|
||||
def _ensure_collections():
|
||||
s = get_settings()
|
||||
cli = _client()
|
||||
# chunks
|
||||
try:
|
||||
cli.get_collection(_col("chunks"))
|
||||
except Exception:
|
||||
cli.recreate_collection(_col("chunks"), vectors_config=VectorParams(size=s.VECTOR_SIZE, distance=Distance.COSINE))
|
||||
# notes
|
||||
try:
|
||||
cli.get_collection(_col("notes"))
|
||||
except Exception:
|
||||
cli.recreate_collection(_col("notes"), vectors_config=VectorParams(size=s.VECTOR_SIZE, distance=Distance.COSINE))
|
||||
# edges (dummy vector of size 1)
|
||||
try:
|
||||
cli.get_collection(_col("edges"))
|
||||
except Exception:
|
||||
cli.recreate_collection(_col("edges"), vectors_config=VectorParams(size=1, distance=Distance.COSINE))
|
||||
|
||||
@router.post("/upsert_chunk", summary="Upsert a chunk into mindnet_chunks")
|
||||
def upsert_chunk(req: UpsertChunkRequest) -> dict:
|
||||
_ensure_collections()
|
||||
cli = _client()
|
||||
vec = embed_texts([req.text])[0]
|
||||
payload: dict[str, Any] = req.model_dump()
|
||||
payload.pop("text", None)
|
||||
payload["preview"] = (req.text[:240] + "…") if len(req.text) > 240 else req.text
|
||||
qdrant_id = _uuid5(f"chunk:{req.chunk_id}")
|
||||
pt = PointStruct(id=qdrant_id, vector=vec, payload=payload)
|
||||
cli.upsert(collection_name=_col("chunks"), points=[pt])
|
||||
return {"status": "ok", "id": qdrant_id}
|
||||
|
||||
@router.post("/upsert_note", summary="Upsert a note into mindnet_notes")
|
||||
def upsert_note(req: UpsertNoteRequest) -> dict:
|
||||
_ensure_collections()
|
||||
cli = _client()
|
||||
text_for_embedding = req.text if req.text else (req.title or req.note_id)
|
||||
vec = embed_texts([text_for_embedding])[0]
|
||||
payload: dict[str, Any] = req.model_dump()
|
||||
payload.pop("text", None)
|
||||
qdrant_id = _uuid5(f"note:{req.note_id}")
|
||||
pt = PointStruct(id=qdrant_id, vector=vec, payload=payload)
|
||||
cli.upsert(collection_name=_col("notes"), points=[pt])
|
||||
return {"status": "ok", "id": qdrant_id}
|
||||
|
||||
@router.post("/upsert_edge", summary="Upsert a graph edge into mindnet_edges")
|
||||
def upsert_edge(req: UpsertEdgeRequest) -> dict:
|
||||
_ensure_collections()
|
||||
cli = _client()
|
||||
payload = req.model_dump()
|
||||
vec = [0.0]
|
||||
raw_edge_id = f"{req.src_note_id}|{req.src_chunk_id or ''}->{req.dst_note_id or ''}|{req.dst_chunk_id or ''}|{req.relation}"
|
||||
qdrant_id = _uuid5(f"edge:{raw_edge_id}")
|
||||
pt = PointStruct(id=qdrant_id, vector=vec, payload=payload)
|
||||
cli.upsert(collection_name=_col("edges"), points=[pt])
|
||||
return {"status": "ok", "id": qdrant_id}
|
||||
|
||||
@router.post("/query", summary="Vector query over mindnet_chunks with optional filters")
|
||||
def query(req: QueryRequest) -> dict:
|
||||
_ensure_collections()
|
||||
cli = _client()
|
||||
vec = embed_texts([req.query])[0]
|
||||
|
||||
flt: Optional[Filter] = None
|
||||
conds = []
|
||||
if req.note_id:
|
||||
conds.append(FieldCondition(key="note_id", match=MatchValue(value=req.note_id)))
|
||||
if req.path:
|
||||
conds.append(FieldCondition(key="path", match=MatchValue(value=req.path)))
|
||||
if req.tags:
|
||||
for t in req.tags:
|
||||
conds.append(FieldCondition(key="tags", match=MatchValue(value=t)))
|
||||
if conds:
|
||||
flt = Filter(must=conds)
|
||||
|
||||
res = cli.search(collection_name=_col("chunks"), query_vector=vec, limit=req.limit, with_payload=True, with_vectors=False, query_filter=flt)
|
||||
hits = []
|
||||
for p in res:
|
||||
pl = p.payload or {}
|
||||
hits.append({
|
||||
"chunk_id": p.id,
|
||||
"score": p.score,
|
||||
"note_id": pl.get("note_id"),
|
||||
"title": pl.get("title"),
|
||||
"path": pl.get("path"),
|
||||
"preview": pl.get("preview"),
|
||||
"tags": pl.get("tags"),
|
||||
})
|
||||
return {"results": hits}
|
||||
34
app/routers/query.py
Normal file
34
app/routers/query.py
Normal file
|
|
@ -0,0 +1,34 @@
|
|||
"""
|
||||
FILE: app/routers/query.py
|
||||
DESCRIPTION: Klassische Such-Endpunkte (Semantic & Hybrid). Initiiert asynchrones Feedback-Logging und ruft den richtigen Retriever Modus auf
|
||||
VERSION: 0.2.0
|
||||
STATUS: Active
|
||||
DEPENDENCIES: app.models.dto, app.core.retriever, app.services.feedback_service
|
||||
LAST_ANALYSIS: 2025-12-15
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
from fastapi import APIRouter, HTTPException, BackgroundTasks
|
||||
from app.models.dto import QueryRequest, QueryResponse
|
||||
from app.core.retrieval.retriever import hybrid_retrieve, semantic_retrieve
|
||||
# NEU:
|
||||
from app.services.feedback_service import log_search
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
@router.post("", response_model=QueryResponse)
|
||||
def post_query(req: QueryRequest, background_tasks: BackgroundTasks) -> QueryResponse:
|
||||
try:
|
||||
if req.mode == "semantic":
|
||||
res = semantic_retrieve(req)
|
||||
else:
|
||||
res = hybrid_retrieve(req)
|
||||
|
||||
# WP-04c: Logging im Hintergrund (bremst Antwort nicht)
|
||||
background_tasks.add_task(log_search, req, res)
|
||||
|
||||
return res
|
||||
except ValueError as e:
|
||||
raise HTTPException(status_code=400, detail=str(e))
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=f"query failed: {e}")
|
||||
73
app/routers/tools.py
Normal file
73
app/routers/tools.py
Normal file
|
|
@ -0,0 +1,73 @@
|
|||
"""
|
||||
FILE: app/routers/tools.py
|
||||
DESCRIPTION: Liefert JSON-Schemas für die Integration als 'Tools' in Agents (Ollama/OpenAI). Read-Only.
|
||||
VERSION: 0.1.1
|
||||
STATUS: Active
|
||||
DEPENDENCIES: fastapi
|
||||
LAST_ANALYSIS: 2025-12-15
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
from fastapi import APIRouter
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
# Static tool manifest in OpenAI/Ollama function-calling format.
# Read-only: served verbatim by the /ollama endpoint below.
# `mindnet_query` accepts either a free-text `query` (embedded server-side)
# or a pre-computed 384-d `query_vector` (enforced via JSON-Schema `oneOf`).
TOOLS = {
    "tools": [
        {
            "type": "function",
            "function": {
                "name": "mindnet_query",
                "description": "Hybrid-Retrieval über mindnet (Semantik + Edges).",
                "parameters": {
                    "type": "object",
                    # Exactly one of the two query inputs must be supplied.
                    "oneOf": [
                        {"required": ["query"]},
                        {"required": ["query_vector"]}
                    ],
                    "properties": {
                        "query": {
                            "type": "string",
                            "description": "Freitext-Query; wird serverseitig in 384-d Embedding konvertiert."
                        },
                        "query_vector": {
                            "type": "array",
                            "items": {"type": "number"},
                            "description": "Direkter 384-d Query-Vektor (optional)."
                        },
                        "top_k": {"type":"integer","default":10,"minimum":1,"maximum":50},
                        "expand_depth": {"type":"integer","default":1,"minimum":0,"maximum":3},
                        "edge_types": {
                            "type":"array","items":{"type":"string"},
                            "default": ["references","belongs_to","prev","next"]
                        },
                        "filters": {"type":"object","description":"payload-Filter (tags etc.)"}
                    }
                }
            }
        },
        {
            "type": "function",
            "function": {
                "name": "mindnet_subgraph",
                "description": "Gibt die Nachbarschaft (Edges) einer Note/Seed-ID zurück.",
                "parameters": {
                    "type":"object",
                    "properties": {
                        "note_id":{"type":"string"},
                        "depth":{"type":"integer","default":1,"minimum":0,"maximum":3},
                        "edge_types":{
                            "type":"array","items":{"type":"string"},
                            "default":["references","belongs_to","prev","next","backlink"]
                        }
                    },
                    "required":["note_id"]
                }
            }
        }
    ]
}
|
||||
|
||||
@router.get("/ollama")
|
||||
def get_ollama_tools():
|
||||
return TOOLS
|
||||
242
app/services/discovery.py
Normal file
242
app/services/discovery.py
Normal file
|
|
@ -0,0 +1,242 @@
|
|||
"""
|
||||
FILE: app/services/discovery.py
|
||||
DESCRIPTION: Service für WP-11 (Discovery API). Analysiert Entwürfe, findet Entitäten
|
||||
und schlägt typisierte Verbindungen basierend auf der Topologie vor.
|
||||
WP-24c: Vollständige Umstellung auf EdgeRegistry für dynamische Vorschläge.
|
||||
WP-15b: Unterstützung für hybride Suche und Alias-Erkennung.
|
||||
VERSION: 1.1.0 (WP-24c: Full Registry Integration & Audit Fix)
|
||||
STATUS: Active
|
||||
COMPATIBILITY: 100% (Identische API-Signatur wie v0.6.0)
|
||||
"""
|
||||
import logging
|
||||
import asyncio
|
||||
import os
|
||||
from typing import List, Dict, Any, Optional, Set
|
||||
import yaml
|
||||
|
||||
from app.core.database.qdrant import QdrantConfig, get_client
|
||||
from app.models.dto import QueryRequest
|
||||
from app.core.retrieval.retriever import hybrid_retrieve
|
||||
# WP-24c: Zentrale Topologie-Quelle
|
||||
from app.services.edge_registry import registry as edge_registry
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class DiscoveryService:
    """
    Service for WP-11 (Discovery API): analyzes draft texts, finds known
    entities, and suggests typed connections based on the graph topology
    (WP-24c: backed entirely by the EdgeRegistry).
    """

    def __init__(self, collection_prefix: str = None):
        """Initialize the discovery service with its Qdrant connection."""
        self.cfg = QdrantConfig.from_env()
        self.prefix = collection_prefix or self.cfg.prefix or "mindnet"
        self.client = get_client(self.cfg)

        # The registry is loaded for type metadata (schema validation).
        self.registry = self._load_type_registry()

    async def analyze_draft(self, text: str, current_type: str) -> Dict[str, Any]:
        """
        Analyze a text draft for potential connections.
        1. Find exact hits (titles/aliases).
        2. Run semantic searches over several text windows.
        3. Suggest topologically valid edge types.
        """
        if not text or len(text.strip()) < 3:
            return {"suggestions": [], "status": "empty_input"}

        suggestions = []
        seen_target_ids = set()

        # --- PHASE 1: EXACT MATCHES (TITLES & ALIASES) ---
        # Load all known titles/aliases for a fast in-text scan.
        known_entities = self._fetch_all_titles_and_aliases()
        exact_matches = self._find_entities_in_text(text, known_entities)

        for entity in exact_matches:
            target_id = entity["id"]
            if target_id in seen_target_ids:
                continue

            seen_target_ids.add(target_id)
            target_type = entity.get("type", "concept")

            # WP-24c: dynamic edge lookup instead of a hardcoded matrix.
            suggested_kind = self._resolve_edge_type(current_type, target_type)

            suggestions.append({
                "type": "exact_match",
                "text_found": entity["match"],
                "target_title": entity["title"],
                "target_id": target_id,
                "suggested_edge_type": suggested_kind,
                # BUGFIX: was `suggest_kind` (undefined name) -> NameError
                # on every exact match; must be `suggested_kind`.
                "suggested_markdown": f"[[rel:{suggested_kind} {entity['title']}]]",
                "confidence": 1.0,
                "reason": f"Direkte Erwähnung von '{entity['match']}' ({target_type})"
            })

        # --- PHASE 2: SEMANTIC MATCHES (VECTOR SEARCH) ---
        # Build search queries for several windows of the text.
        search_queries = self._generate_search_queries(text)

        # Run the searches concurrently (cloud performance).
        tasks = [self._get_semantic_suggestions_async(q) for q in search_queries]
        results_list = await asyncio.gather(*tasks)

        for hits in results_list:
            for hit in hits:
                # NOTE(review): hits come from hybrid_retrieve; confirm the
                # hit objects expose `.payload` in addition to `.source`.
                payload = hit.payload or {}
                target_id = payload.get("note_id")

                if not target_id or target_id in seen_target_ids:
                    continue

                # Relevance threshold (model-specific for nomic).
                if hit.total_score > 0.55:
                    seen_target_ids.add(target_id)
                    target_type = payload.get("type", "concept")
                    target_title = payload.get("title") or "Unbenannt"

                    # WP-24c: use the topology engine.
                    suggested_kind = self._resolve_edge_type(current_type, target_type)

                    suggestions.append({
                        "type": "semantic_match",
                        "text_found": (hit.source.get("text") or "")[:80] + "...",
                        "target_title": target_title,
                        "target_id": target_id,
                        "suggested_edge_type": suggested_kind,
                        "suggested_markdown": f"[[rel:{suggested_kind} {target_title}]]",
                        "confidence": round(hit.total_score, 2),
                        "reason": f"Semantischer Bezug zu {target_type} ({int(hit.total_score*100)}%)"
                    })

        # Best suggestions first.
        suggestions.sort(key=lambda x: x["confidence"], reverse=True)

        return {
            "draft_length": len(text),
            "analyzed_windows": len(search_queries),
            "suggestions_count": len(suggestions),
            "suggestions": suggestions[:12]  # top 12 suggestions
        }

    # --- TOPOLOGY LOGIC (WP-24c) ---

    def _resolve_edge_type(self, source_type: str, target_type: str) -> str:
        """
        Determine the best edge type between two note types.
        Uses the EdgeRegistry (graph_schema.md) instead of a local matrix.
        """
        # 1. Specific check: is there a rule for source -> target?
        info = edge_registry.get_topology_info(source_type, target_type)
        typical = info.get("typical", [])
        if typical:
            return typical[0]  # first suggestion from the schema

        # 2. Fallback: what is generally typical for the source type? (source -> any)
        info_fallback = edge_registry.get_topology_info(source_type, "any")
        typical_fallback = info_fallback.get("typical", [])
        if typical_fallback:
            return typical_fallback[0]

        # 3. Global fallback (safety net).
        return "related_to"

    # --- HELPERS (FULLY PRESERVED) ---

    def _generate_search_queries(self, text: str) -> List[str]:
        """Build overlapping windows for the vector search (sliding window)."""
        text_len = len(text)
        queries = []

        # Focus A: document start (context).
        queries.append(text[:600])

        # Focus B: document end (current writing focus).
        if text_len > 250:
            footer = text[-350:]
            if footer not in queries:
                queries.append(footer)

        # Focus C: intermediate sections for long texts.
        if text_len > 1200:
            window_size = 500
            step = 1200
            for i in range(600, text_len - 400, step):
                chunk = text[i:i+window_size]
                if len(chunk) > 100:
                    queries.append(chunk)

        return queries

    async def _get_semantic_suggestions_async(self, text: str):
        """Run an asynchronous vector search through the retriever."""
        req = QueryRequest(query=text, top_k=6, explain=False)
        try:
            # Uses hybrid_retrieve (WP-15b standard).
            res = hybrid_retrieve(req)
            return res.results
        except Exception as e:
            logger.error(f"Discovery retrieval error: {e}")
            return []

    def _load_type_registry(self) -> dict:
        """Load types.yaml with the note-type definitions; {} on any failure."""
        path = os.getenv("MINDNET_TYPES_FILE", "config/types.yaml")
        if not os.path.exists(path):
            return {}
        try:
            with open(path, "r", encoding="utf-8") as f:
                return yaml.safe_load(f) or {}
        except Exception:
            return {}

    def _fetch_all_titles_and_aliases(self) -> List[Dict]:
        """Fetch all note ids, titles and aliases for the exact-match scan."""
        entities = []
        next_page = None
        col = f"{self.prefix}_notes"
        try:
            while True:
                res, next_page = self.client.scroll(
                    collection_name=col, limit=1000, offset=next_page,
                    with_payload=True, with_vectors=False
                )
                for point in res:
                    pl = point.payload or {}
                    aliases = pl.get("aliases") or []
                    # Normalize a single string alias to a one-element list.
                    if isinstance(aliases, str):
                        aliases = [aliases]

                    entities.append({
                        "id": pl.get("note_id"),
                        "title": pl.get("title"),
                        "aliases": aliases,
                        "type": pl.get("type", "concept")
                    })
                if next_page is None:
                    break
        except Exception as e:
            logger.warning(f"Error fetching entities for discovery: {e}")
        return entities

    def _find_entities_in_text(self, text: str, entities: List[Dict]) -> List[Dict]:
        """Scan the text for mentions of known entities (title first, then aliases)."""
        found = []
        text_lower = text.lower()
        for entity in entities:
            title = entity.get("title")
            # Title check (case-insensitive substring match).
            if title and title.lower() in text_lower:
                found.append({
                    "match": title, "title": title,
                    "id": entity["id"], "type": entity["type"]
                })
                continue
            # Alias check: first matching alias wins.
            for alias in entity.get("aliases", []):
                if str(alias).lower() in text_lower:
                    found.append({
                        "match": str(alias), "title": title,
                        "id": entity["id"], "type": entity["type"]
                    })
                    break
        return found
|
||||
227
app/services/edge_registry.py
Normal file
227
app/services/edge_registry.py
Normal file
|
|
@ -0,0 +1,227 @@
|
|||
"""
|
||||
FILE: app/services/edge_registry.py
|
||||
DESCRIPTION: Single Source of Truth für Kanten-Typen, Symmetrien und Graph-Topologie.
|
||||
WP-24c: Implementierung der dualen Registry (Vocabulary & Schema).
|
||||
Unterstützt dynamisches Laden von Inversen und kontextuellen Vorschlägen.
|
||||
VERSION: 1.0.1 (WP-24c: Verified Atomic Topology)
|
||||
STATUS: Active
|
||||
"""
|
||||
import re
|
||||
import os
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
from typing import Dict, Optional, Set, Tuple, List
|
||||
|
||||
from app.config import get_settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class EdgeRegistry:
    """
    Central manager for the edge vocabulary and the graph schema.
    Singleton pattern to guarantee consistent validation everywhere.

    Two markdown files drive the registry:
      - edge_vocabulary.md: canonical edge names, inverses, aliases
      - graph_schema.md: per source/target note-type topology rules
    Both are hot-reloaded on mtime change via ensure_latest().
    """
    _instance = None

    # SYSTEM PROTECTION: these edges are reserved for structural integrity (kept from v0.8.0)
    FORBIDDEN_SYSTEM_EDGES = {"next", "prev", "belongs_to"}

    def __new__(cls, *args, **kwargs):
        # Classic singleton: only the first construction allocates state.
        if cls._instance is None:
            cls._instance = super(EdgeRegistry, cls).__new__(cls)
            cls._instance.initialized = False
        return cls._instance

    def __init__(self):
        # __init__ runs on every EdgeRegistry() call; guard against re-init.
        if self.initialized:
            return

        settings = get_settings()

        # --- Path configuration (WP-24c: variable paths for vault mirroring) ---
        # The vocabulary (semantics)
        self.full_vocab_path = os.path.abspath(settings.MINDNET_VOCAB_PATH)

        # The schema (topology) — configurable via ENV: MINDNET_SCHEMA_PATH
        schema_env = getattr(settings, "MINDNET_SCHEMA_PATH", None)
        if schema_env:
            self.full_schema_path = os.path.abspath(schema_env)
        else:
            # Fallback: lives in the same directory as the vocabulary
            self.full_schema_path = os.path.join(os.path.dirname(self.full_vocab_path), "graph_schema.md")

        self.unknown_log_path = "data/logs/unknown_edges.jsonl"

        # --- Internal data stores ---
        self.canonical_map: Dict[str, str] = {}   # alias (and canonical) -> canonical name
        self.inverse_map: Dict[str, str] = {}     # canonical -> inverse edge name
        self.valid_types: Set[str] = set()        # all known canonical edge names

        # Topology: source_type -> { target_type -> {"typical": set, "prohibited": set} }
        self.topology: Dict[str, Dict[str, Dict[str, Set[str]]]] = {}

        # mtimes of the last successfully loaded files (0.0 = never loaded)
        self._last_vocab_mtime = 0.0
        self._last_schema_mtime = 0.0

        logger.info(f">>> [EDGE-REGISTRY] Initializing WP-24c Dual-Engine")
        logger.info(f"  - Vocab-Path: {self.full_vocab_path}")
        logger.info(f"  - Schema-Path: {self.full_schema_path}")

        self.ensure_latest()
        self.initialized = True

    def ensure_latest(self):
        """Check both files' timestamps and hot-reload whichever changed."""
        try:
            # Reload vocabulary on change
            if os.path.exists(self.full_vocab_path):
                v_mtime = os.path.getmtime(self.full_vocab_path)
                if v_mtime > self._last_vocab_mtime:
                    self._load_vocabulary()
                    self._last_vocab_mtime = v_mtime

            # Reload schema on change
            if os.path.exists(self.full_schema_path):
                s_mtime = os.path.getmtime(self.full_schema_path)
                if s_mtime > self._last_schema_mtime:
                    self._load_schema()
                    self._last_schema_mtime = s_mtime

        except Exception as e:
            logger.error(f"!!! [EDGE-REGISTRY] Sync failure: {e}")

    def _load_vocabulary(self):
        """Parse edge_vocabulary.md: | Canonical | Inverse | Aliases | Description |"""
        self.canonical_map.clear()
        self.inverse_map.clear()
        self.valid_types.clear()

        # Regex for the 4-column structure (WP-24c conform)
        # Expects: | **`type`** | `inverse` | alias1, alias2 | ... |
        pattern = re.compile(r"\|\s*\*\*`?([a-zA-Z0-9_-]+)`?\*\*\s*\|\s*`?([a-zA-Z0-9_-]+)`?\s*\|\s*([^|]+)\|")

        try:
            with open(self.full_vocab_path, "r", encoding="utf-8") as f:
                c_count = 0
                for line in f:
                    match = pattern.search(line)
                    if match:
                        canonical = match.group(1).strip().lower()
                        inverse = match.group(2).strip().lower()
                        aliases_raw = match.group(3).strip()

                        self.valid_types.add(canonical)
                        # A canonical name maps to itself so resolve() is uniform.
                        self.canonical_map[canonical] = canonical
                        if inverse:
                            self.inverse_map[canonical] = inverse

                        # Process aliases (normalized to snake_case); the literal
                        # "Kein Alias" marks an empty alias column in the file.
                        if aliases_raw and "Kein Alias" not in aliases_raw:
                            aliases = [a.strip() for a in aliases_raw.split(",") if a.strip()]
                            for alias in aliases:
                                clean_alias = alias.replace("`", "").lower().strip().replace(" ", "_")
                                if clean_alias:
                                    self.canonical_map[clean_alias] = canonical
                        c_count += 1

            logger.info(f"✅ [VOCAB] Loaded {c_count} edge definitions and their inverses.")
        except Exception as e:
            logger.error(f"❌ [VOCAB ERROR] {e}")

    def _load_schema(self):
        """Parse graph_schema.md: ## Source: `type` | Target | Typical | Prohibited |"""
        self.topology.clear()
        current_source = None

        try:
            with open(self.full_schema_path, "r", encoding="utf-8") as f:
                for line in f:
                    # Detect section headers (atomic sections)
                    src_match = re.search(r"## Source:\s*`?([a-zA-Z0-9_-]+)`?", line)
                    if src_match:
                        current_source = src_match.group(1).strip().lower()
                        if current_source not in self.topology:
                            self.topology[current_source] = {}
                        continue

                    # Parse table rows; skip the separator ("|-") and header ("Target") rows.
                    if current_source and "|" in line and not line.startswith("|-") and "Target" not in line:
                        cols = [c.strip().replace("`", "").lower() for c in line.split("|")]
                        if len(cols) >= 4:
                            target_type = cols[1]
                            # "-" marks an empty cell in the markdown table.
                            typical_edges = [e.strip() for e in cols[2].split(",") if e.strip() and e != "-"]
                            prohibited_edges = [e.strip() for e in cols[3].split(",") if e.strip() and e != "-"]

                            if target_type not in self.topology[current_source]:
                                self.topology[current_source][target_type] = {"typical": set(), "prohibited": set()}

                            self.topology[current_source][target_type]["typical"].update(typical_edges)
                            self.topology[current_source][target_type]["prohibited"].update(prohibited_edges)

            logger.info(f"✅ [SCHEMA] Topology matrix built for {len(self.topology)} source types.")
        except Exception as e:
            logger.error(f"❌ [SCHEMA ERROR] {e}")

    def resolve(self, edge_type: str, provenance: str = "explicit", context: dict = None) -> str:
        """
        Resolve aliases to canonical names and protect system edges.
        Preserves the v0.8.0 protection logic.

        Returns "related_to" for empty input or for a forbidden system edge
        requested by a non-structural provenance; otherwise the canonical
        name (or the cleaned input when unknown).
        """
        self.ensure_latest()
        if not edge_type:
            return "related_to"

        # Normalize to snake_case for map lookups.
        clean_type = edge_type.lower().strip().replace(" ", "_").replace("-", "_")
        ctx = context or {}

        # Security gate: block unauthorized use of system edges.
        restricted_provenance = ["explicit", "semantic_ai", "inherited", "global_pool", "rule"]
        if provenance in restricted_provenance and clean_type in self.FORBIDDEN_SYSTEM_EDGES:
            self._log_issue(clean_type, f"forbidden_system_edge_manipulation_by_{provenance}", ctx)
            return "related_to"

        # System edges are ONLY allowed with structural provenance (code-generated).
        if provenance == "structure" and clean_type in self.FORBIDDEN_SYSTEM_EDGES:
            return clean_type

        # Alias resolution (unknown types pass through unchanged).
        return self.canonical_map.get(clean_type, clean_type)

    def get_inverse(self, edge_type: str) -> str:
        """WP-24c: return the symmetric counterpart; "related_to" if none is known."""
        canonical = self.resolve(edge_type)
        return self.inverse_map.get(canonical, "related_to")

    def get_topology_info(self, source_type: str, target_type: str) -> Dict[str, List[str]]:
        """
        WP-24c: contextual edge recommendations for Obsidian and the backend.

        Returns {"typical": [...], "prohibited": [...]} (sorted lists).
        """
        self.ensure_latest()

        # Hierarchical lookup: specific -> 'any' -> empty.
        src_cfg = self.topology.get(source_type, self.topology.get("any", {}))
        tgt_cfg = src_cfg.get(target_type, src_cfg.get("any", {"typical": set(), "prohibited": set()}))

        return {
            "typical": sorted(list(tgt_cfg["typical"])),
            "prohibited": sorted(list(tgt_cfg["prohibited"]))
        }

    def _log_issue(self, edge_type: str, error_kind: str, ctx: dict):
        """JSONL logging for unknown/forbidden edges (kept from v0.8.0)."""
        try:
            os.makedirs(os.path.dirname(self.unknown_log_path), exist_ok=True)
            entry = {
                "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
                "edge_type": edge_type,
                "error": error_kind,
                "note_id": ctx.get("note_id", "unknown"),
                "provenance": ctx.get("provenance", "unknown")
            }
            with open(self.unknown_log_path, "a", encoding="utf-8") as f:
                f.write(json.dumps(entry) + "\n")
        except Exception: pass  # best-effort logging must never break edge resolution

# Singleton Export
registry = EdgeRegistry()
|
||||
138
app/services/embeddings_client.py
Normal file
138
app/services/embeddings_client.py
Normal file
|
|
@ -0,0 +1,138 @@
|
|||
"""
|
||||
FILE: app/services/embeddings_client.py
|
||||
DESCRIPTION: Unified Embedding Client. Nutzt MoE-Profile zur Modellsteuerung.
|
||||
WP-25a: Integration der llm_profiles.yaml für konsistente Vektoren.
|
||||
VERSION: 2.6.0 (WP-25a: MoE & Profile Support)
|
||||
STATUS: Active
|
||||
DEPENDENCIES: httpx, requests, app.config, yaml
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import os
|
||||
import logging
|
||||
import httpx
|
||||
import requests
|
||||
import yaml
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Any
|
||||
from app.config import get_settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class EmbeddingsClient:
    """
    Async client for embeddings.
    Controlled via the 'embedding_expert' profile in llm_profiles.yaml.
    """
    def __init__(self):
        self.settings = get_settings()

        # 1. Load MoE profile (WP-25a)
        self.profile = self._load_embedding_profile()

        # 2. Resolve model & URL
        # Priority: llm_profiles.yaml -> .env (legacy) -> fallback
        self.model = self.profile.get("model") or os.getenv("MINDNET_EMBEDDING_MODEL")

        provider = self.profile.get("provider", "ollama")
        if provider == "ollama":
            self.base_url = self.settings.OLLAMA_URL
        else:
            # Placeholder for future cloud embedding providers
            self.base_url = os.getenv("MINDNET_OLLAMA_URL", "http://127.0.0.1:11434")

        if not self.model:
            self.model = os.getenv("MINDNET_LLM_MODEL", "phi3:mini")
            logger.warning(f"⚠️ Kein Embedding-Modell in Profil oder .env gefunden. Fallback auf '{self.model}'.")
        else:
            logger.info(f"🧬 Embedding-Experte aktiv: Model='{self.model}' via {provider}")

    def _load_embedding_profile(self) -> Dict[str, Any]:
        """Load the configuration for the 'embedding_expert' profile; {} on any failure."""
        path_str = getattr(self.settings, "LLM_PROFILES_PATH", "config/llm_profiles.yaml")
        path = Path(path_str)
        if not path.exists():
            return {}
        try:
            with open(path, "r", encoding="utf-8") as f:
                data = yaml.safe_load(f) or {}
                profiles = data.get("profiles", {})
                return profiles.get("embedding_expert", {})
        except Exception as e:
            logger.error(f"❌ Failed to load embedding profile: {e}")
            return {}

    async def embed_query(self, text: str) -> List[float]:
        """Create a vector for a single search query."""
        return await self._request_embedding(text)

    async def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Create vectors for a batch of documents (one request per text)."""
        vectors = []
        # Longer timeout for batches (WP-20 resilience)
        async with httpx.AsyncClient(timeout=120.0) as client:
            for text in texts:
                vec = await self._request_embedding_with_client(client, text)
                vectors.append(vec)
        return vectors

    async def _request_embedding(self, text: str) -> List[float]:
        """Internal request handler for single queries (short-lived client)."""
        async with httpx.AsyncClient(timeout=30.0) as client:
            return await self._request_embedding_with_client(client, text)

    async def _request_embedding_with_client(self, client: httpx.AsyncClient, text: str) -> List[float]:
        """Perform the HTTP call against the embedding API; [] on blank input or error."""
        if not text or not text.strip():
            return []

        url = f"{self.base_url}/api/embeddings"
        try:
            # WP-25: currently optimized for the Ollama API structure
            response = await client.post(url, json={"model": self.model, "prompt": text})
            response.raise_for_status()
            return response.json().get("embedding", [])
        except Exception as e:
            logger.error(f"Async embedding failed (Model: {self.model}): {e}")
            return []
|
||||
|
||||
# ==============================================================================
|
||||
# TEIL 2: SYNCHRONER FALLBACK (Unified)
|
||||
# ==============================================================================
|
||||
|
||||
def embed_text(text: str) -> List[float]:
    """
    LEGACY/SYNC: uses the same profile logic as the async client for consistency.
    Replaces local sentence-transformers to avoid dimension conflicts.

    Returns [] for blank input or on any request failure.
    """
    if not text or not text.strip():
        return []

    settings = get_settings()

    # Quick profile lookup for sync mode
    path = Path(getattr(settings, "LLM_PROFILES_PATH", "config/llm_profiles.yaml"))
    model = os.getenv("MINDNET_EMBEDDING_MODEL")
    base_url = settings.OLLAMA_URL

    if path.exists():
        try:
            with open(path, "r", encoding="utf-8") as f:
                data = yaml.safe_load(f) or {}
                prof = data.get("profiles", {}).get("embedding_expert", {})
                if prof.get("model"):
                    model = prof["model"]
        except Exception:
            # BUGFIX: was a bare `except:` which also swallowed SystemExit and
            # KeyboardInterrupt. The profile lookup stays best-effort: on any
            # read/parse failure we fall back to the env/default model below.
            pass

    if not model:
        model = os.getenv("MINDNET_LLM_MODEL", "phi3:mini")

    url = f"{base_url}/api/embeddings"

    try:
        # Synchronous request via requests
        response = requests.post(url, json={"model": model, "prompt": text}, timeout=30)
        response.raise_for_status()
        return response.json().get("embedding", [])
    except Exception as e:
        logger.error(f"Sync embedding failed (Model: {model}): {e}")
        return []
|
||||
99
app/services/feedback_service.py
Normal file
99
app/services/feedback_service.py
Normal file
|
|
@ -0,0 +1,99 @@
|
|||
"""
|
||||
FILE: app/services/feedback_service.py
|
||||
DESCRIPTION: Schreibt Search- und Feedback-Logs in JSONL-Dateien.
|
||||
VERSION: 1.1
|
||||
STATUS: Active
|
||||
DEPENDENCIES: app.models.dto
|
||||
LAST_ANALYSIS: 2025-12-15
|
||||
"""
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any, List, Union
|
||||
from app.models.dto import QueryRequest, QueryResponse, FeedbackRequest, QueryHit
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Pfad für Logs (lokal auf dem Beelink/PC)
|
||||
LOG_DIR = Path("data/logs")
|
||||
SEARCH_LOG_FILE = LOG_DIR / "search_history.jsonl"
|
||||
FEEDBACK_LOG_FILE = LOG_DIR / "feedback.jsonl"
|
||||
|
||||
def _ensure_log_dir():
    """Create the log directory (and any missing parents) if it does not exist."""
    LOG_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
def _append_jsonl(file_path: Path, data: dict):
|
||||
try:
|
||||
with open(file_path, "a", encoding="utf-8") as f:
|
||||
f.write(json.dumps(data, ensure_ascii=False) + "\n")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to write log: {e}")
|
||||
|
||||
def log_search(
    query_id: str,
    query_text: str,
    results: List[QueryHit],
    mode: str = "unknown",
    metadata: Dict[str, Any] = None
):
    """
    Generic logging entry point for search AND chat interactions.

    Args:
        query_id: UUID of the request.
        query_text: Raw user input.
        results: List of hits (QueryHit objects).
        mode: e.g. "semantic", "hybrid", "chat_rag".
        metadata: Extra info (e.g. generated answer, intent).
    """
    _ensure_log_dir()

    hits_summary = []
    for hit in results:
        # Pydantic model dump for clean serialization of the score breakdown.
        explanation = hit.explanation
        if explanation and explanation.breakdown:
            breakdown = explanation.breakdown.model_dump()
        else:
            breakdown = None

        source = hit.source
        hits_summary.append({
            "node_id": hit.node_id,
            "note_id": hit.note_id,
            "total_score": hit.total_score,
            "breakdown": breakdown,
            "rank_semantic": hit.semantic_score,
            "rank_edge": hit.edge_bonus,
            "type": source.get("type") if source else "unknown",
        })

    record = {
        "timestamp": time.time(),
        "query_id": query_id,
        "query_text": query_text,
        "mode": mode,
        "hits_count": len(hits_summary),
        "hits": hits_summary,
        "metadata": metadata or {},
    }
    _append_jsonl(SEARCH_LOG_FILE, record)
    logger.info(f"Logged search/chat interaction {query_id}")
|
||||
|
||||
def log_feedback(fb: FeedbackRequest):
    """Persist a single user-feedback record to the feedback JSONL log."""
    _ensure_log_dir()

    entry = {"timestamp": time.time()}
    for field in ("query_id", "node_id", "score", "comment"):
        entry[field] = getattr(fb, field)

    _append_jsonl(FEEDBACK_LOG_FILE, entry)
    logger.info(f"Logged feedback for {fb.query_id}")
|
||||
337
app/services/llm_service.py
Normal file
337
app/services/llm_service.py
Normal file
|
|
@ -0,0 +1,337 @@
|
|||
"""
|
||||
FILE: app/services/llm_service.py
|
||||
DESCRIPTION: Hybrid-Client für Ollama, Google GenAI (Gemini) und OpenRouter.
|
||||
WP-25b: Implementierung der Lazy-Prompt-Orchestration (Modell-spezifisch).
|
||||
VERSION: 3.5.5 (WP-25b: Prompt Orchestration & Full Resilience)
|
||||
STATUS: Active
|
||||
FIX:
|
||||
- WP-25b: get_prompt() unterstützt Hierarchie: Model-ID -> Provider -> Default.
|
||||
- WP-25b: generate_raw_response() unterstützt prompt_key + variables für Lazy-Formatting.
|
||||
- WP-25a: Voller Erhalt der rekursiven Fallback-Kaskade und visited_profiles Schutz.
|
||||
- WP-20: Restaurierung des internen Ollama-Retry-Loops für Hardware-Stabilität.
|
||||
"""
|
||||
import httpx
|
||||
import yaml
|
||||
import logging
|
||||
import asyncio
|
||||
import json
|
||||
from google import genai
|
||||
from google.genai import types
|
||||
from openai import AsyncOpenAI
|
||||
from pathlib import Path
|
||||
from typing import Optional, Dict, Any, Literal
|
||||
from app.config import get_settings
|
||||
|
||||
# Import der neutralen Bereinigungs-Logik
|
||||
from app.core.registry import clean_llm_text
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class LLMService:
    """Hybrid LLM client for Ollama (local), Google GenAI (Gemini) and OpenRouter.

    Visible responsibilities:
    - loads prompt templates (prompts.yaml) and MoE profiles (llm_profiles.yaml),
    - resolves prompts lazily per model/provider (``get_prompt``),
    - dispatches requests with a recursive fallback cascade across profiles
      (``generate_raw_response``),
    - retries on provider rate limits (``_dispatch``) and on local Ollama
      failures (``_execute_ollama``).
    """

    # Class-level semaphore shared by ALL instances, capping concurrent
    # "background"-priority requests. Created lazily on first __init__.
    _background_semaphore = None

    def __init__(self):
        """Build provider clients from settings.

        Cloud clients (Google, OpenRouter) stay ``None`` when their API key
        is not configured; the Ollama HTTP client is always constructed.
        """
        self.settings = get_settings()
        self.prompts = self._load_prompts()
        self.profiles = self._load_llm_profiles()
        # Lazily created via the `decision_engine` property.
        self._decision_engine = None

        if LLMService._background_semaphore is None:
            # BACKGROUND_LIMIT defaults to 2 concurrent background requests.
            limit = getattr(self.settings, "BACKGROUND_LIMIT", 2)
            logger.info(f"🚦 LLMService: Initializing Background Semaphore with limit: {limit}")
            LLMService._background_semaphore = asyncio.Semaphore(limit)

        # 1. Local Ollama client (also serves as the final fallback anchor).
        self.ollama_client = httpx.AsyncClient(
            base_url=self.settings.OLLAMA_URL,
            timeout=httpx.Timeout(self.settings.LLM_TIMEOUT)
        )

        # 2. Google GenAI client — only if an API key is configured.
        self.google_client = None
        if self.settings.GOOGLE_API_KEY:
            self.google_client = genai.Client(
                api_key=self.settings.GOOGLE_API_KEY,
                http_options={'api_version': 'v1'}
            )
            logger.info("✨ LLMService: Google GenAI (Gemini) active.")

        # 3. OpenRouter client (OpenAI-compatible API) — only with an API key.
        self.openrouter_client = None
        if self.settings.OPENROUTER_API_KEY:
            self.openrouter_client = AsyncOpenAI(
                base_url="https://openrouter.ai/api/v1",
                api_key=self.settings.OPENROUTER_API_KEY,
                timeout=45.0
            )
            logger.info("🛰️ LLMService: OpenRouter Integration active.")

    @property
    def decision_engine(self):
        """Lazily constructed DecisionEngine singleton for this service.

        Imported inside the property — presumably to avoid a circular
        import at module load time (TODO confirm).
        """
        if self._decision_engine is None:
            from app.core.retrieval.decision_engine import DecisionEngine
            self._decision_engine = DecisionEngine()
        return self._decision_engine

    def _load_prompts(self) -> dict:
        """Load prompt templates from PROMPTS_PATH (YAML).

        Returns an empty dict when the file is missing or unreadable;
        errors are logged, never raised.
        """
        path = Path(self.settings.PROMPTS_PATH)
        if not path.exists():
            return {}
        try:
            with open(path, "r", encoding="utf-8") as f:
                return yaml.safe_load(f) or {}
        except Exception as e:
            logger.error(f"❌ Failed to load prompts: {e}")
            return {}

    def _load_llm_profiles(self) -> dict:
        """WP-25a: Load the central MoE profiles from llm_profiles.yaml.

        Returns the mapping under the top-level ``profiles`` key, or an
        empty dict when the file is missing or unreadable.
        """
        path_str = getattr(self.settings, "LLM_PROFILES_PATH", "config/llm_profiles.yaml")
        path = Path(path_str)
        if not path.exists():
            logger.warning(f"⚠️ LLM Profiles file not found at {path}.")
            return {}
        try:
            with open(path, "r", encoding="utf-8") as f:
                data = yaml.safe_load(f) or {}
            return data.get("profiles", {})
        except Exception as e:
            logger.error(f"❌ Failed to load llm_profiles.yaml: {e}")
            return {}

    def get_prompt(self, key: str, model_id: str = None, provider: str = None) -> str:
        """WP-25b: Hierarchical prompt lookup with detailed trace logging.

        Resolution order for a dict-valued prompt entry:
        1. exact model id, 2. provider name, 3. "default" key (then
        "gemini", then "ollama"). A non-dict entry is returned as-is
        (stringified). Returns "" when nothing matches.
        """
        data = self.prompts.get(key, "")
        if not isinstance(data, dict):
            # Flat template (plain string) — no per-model variants.
            return str(data)

        # 1. Most specific match: exact model id.
        if model_id and model_id in data:
            logger.info(f"🎯 [PROMPT-TRACE] Level 1 Match: Model-specific ('{model_id}') for key '{key}'")
            return str(data[model_id])

        # 2. Middle tier: provider-level fallback.
        if provider and provider in data:
            logger.info(f"📡 [PROMPT-TRACE] Level 2 Match: Provider-fallback ('{provider}') for key '{key}'")
            return str(data[provider])

        # 3. Global fallback chain: default -> gemini -> ollama -> "".
        default_val = data.get("default", data.get("gemini", data.get("ollama", "")))
        logger.info(f"⚓ [PROMPT-TRACE] Level 3 Match: Global Default for key '{key}'")
        return str(default_val)

    async def generate_raw_response(
        self,
        prompt: str = None,
        prompt_key: str = None,  # WP-25b: lazy-loading key into prompts.yaml
        variables: dict = None,  # WP-25b: values for template formatting
        system: str = None,
        force_json: bool = False,
        max_retries: int = 2,
        base_delay: float = 2.0,
        priority: Literal["realtime", "background"] = "realtime",
        provider: Optional[str] = None,
        model_override: Optional[str] = None,
        json_schema: Optional[Dict[str, Any]] = None,
        json_schema_name: str = "mindnet_json",
        strict_json_schema: bool = True,
        profile_name: Optional[str] = None,
        visited_profiles: Optional[list] = None
    ) -> str:
        """Main entry point for LLM requests with lazy prompt orchestration.

        Either ``prompt`` (ready-made text) or ``prompt_key`` (+ optional
        ``variables``) must be supplied. ``profile_name`` selects a MoE
        profile that can override provider/model/temperature and define a
        ``fallback_profile``. On any execution error the method recurses
        into the fallback profile (loop-protected via ``visited_profiles``)
        and, as a last resort, calls local Ollama when
        ``LLM_FALLBACK_ENABLED`` is set.

        Raises:
            ValueError: unknown/empty prompt_key, missing template variable,
                or empty cloud response (triggers the cascade).
        """
        visited_profiles = visited_profiles or []
        target_provider = provider
        target_model = model_override
        target_temp = None
        fallback_profile = None

        # 1. Profile resolution (Mixture of Experts): profile values override
        #    the explicit provider/model arguments when present.
        if profile_name and self.profiles:
            profile = self.profiles.get(profile_name)
            if profile:
                target_provider = profile.get("provider", target_provider)
                target_model = profile.get("model", target_model)
                target_temp = profile.get("temperature")
                fallback_profile = profile.get("fallback_profile")
                # Mark as visited to break fallback cycles (mutates the list
                # shared across recursive calls).
                visited_profiles.append(profile_name)
                logger.info(f"🎭 MoE Dispatch: Profil='{profile_name}' -> Provider='{target_provider}' | Model='{target_model}'")
            else:
                logger.warning(f"⚠️ Profil '{profile_name}' nicht in llm_profiles.yaml gefunden!")

        if not target_provider:
            target_provider = self.settings.MINDNET_LLM_PROVIDER

        # 2. WP-25b: lazy prompt resolving — the template is loaded only NOW,
        #    based on the model/provider that is actually active, so a
        #    fallback profile can re-resolve a model-specific variant.
        current_prompt = prompt
        if prompt_key:
            template = self.get_prompt(prompt_key, model_id=target_model, provider=target_provider)
            # WP-25b FIX: validate the loaded prompt before formatting.
            if not template or not template.strip():
                available_keys = list(self.prompts.keys())
                logger.error(f"❌ Prompt key '{prompt_key}' not found or empty. Available keys: {available_keys[:10]}...")
                raise ValueError(f"Invalid prompt_key: '{prompt_key}' (not found in prompts.yaml)")

            try:
                # Format the template with the provided variables.
                current_prompt = template.format(**(variables or {}))
            except KeyError as e:
                # A missing variable is a caller bug — surface it.
                logger.error(f"❌ Prompt formatting failed for key '{prompt_key}': Missing variable {e}")
                raise ValueError(f"Missing variable in prompt '{prompt_key}': {e}")
            except Exception as e:
                logger.error(f"❌ Prompt formatting failed for key '{prompt_key}': {e}")
                current_prompt = template  # safety fallback: use the raw template

        # 3. Execution, with error handling feeding the fallback cascade.
        try:
            # Both branches issue the identical _dispatch call; "background"
            # additionally throttles via the shared class-level semaphore.
            if priority == "background":
                async with LLMService._background_semaphore:
                    res = await self._dispatch(
                        target_provider, current_prompt, system, force_json,
                        max_retries, base_delay, target_model,
                        json_schema, json_schema_name, strict_json_schema, target_temp
                    )
            else:
                res = await self._dispatch(
                    target_provider, current_prompt, system, force_json,
                    max_retries, base_delay, target_model,
                    json_schema, json_schema_name, strict_json_schema, target_temp
                )

            # Empty cloud responses count as failures (WP-25 stability) so
            # that the cascade below gets a chance to recover.
            if not res and target_provider != "ollama":
                logger.warning(f"⚠️ Empty response from {target_provider}. Triggering fallback.")
                raise ValueError(f"Empty response from {target_provider}")

            # JSON responses are returned verbatim; free text is cleaned.
            return clean_llm_text(res) if not force_json else res

        except Exception as e:
            logger.error(f"❌ Error during execution of profile '{profile_name}' ({target_provider}): {e}")

            # 4. WP-25b cascade logic: recurse into the fallback profile
            #    (with model-specific prompt re-loading via prompt_key).
            if fallback_profile and fallback_profile not in visited_profiles:
                logger.info(f"🔄 Switching to fallback profile: '{fallback_profile}'")
                return await self.generate_raw_response(
                    prompt=prompt,
                    prompt_key=prompt_key,
                    variables=variables,  # allows re-formatting for the fallback model
                    system=system, force_json=force_json,
                    max_retries=max_retries, base_delay=base_delay,
                    priority=priority, provider=None, model_override=None,
                    json_schema=json_schema, json_schema_name=json_schema_name,
                    strict_json_schema=strict_json_schema,
                    profile_name=fallback_profile,
                    visited_profiles=visited_profiles
                )

            # 5. Ultimate emergency anchor: if everything failed, go straight
            #    to local Ollama (when enabled in settings).
            if target_provider != "ollama" and self.settings.LLM_FALLBACK_ENABLED:
                logger.warning(f"🚨 Kaskade erschöpft. Nutze finalen Ollama-Notanker.")
                res = await self._execute_ollama(current_prompt, system, force_json, max_retries, base_delay, target_temp, target_model)
                return clean_llm_text(res) if not force_json else res

            raise e

    async def _dispatch(
        self, provider, prompt, system, force_json, max_retries, base_delay,
        model_override, json_schema, json_schema_name, strict_json_schema, temperature
    ) -> str:
        """Route the request to the provider-specific executor.

        Wraps the executor call in a rate-limit retry loop: errors whose
        text contains "429", "RESOURCE_EXHAUSTED" or "rate_limited" trigger
        a fixed wait (LLM_RATE_LIMIT_WAIT, default 60s) and a retry, up to
        min(max_retries, LLM_RATE_LIMIT_RETRIES) times. Any other error is
        re-raised immediately. Providers without a configured client fall
        through to Ollama.

        NOTE(review): if the rate-limit budget is exhausted the while-loop
        exits without a return, so the coroutine implicitly returns None —
        the caller's empty-response check then treats that as a failure.
        Consider raising explicitly here instead; confirm intent.
        """
        rate_limit_attempts = 0
        max_rate_retries = min(max_retries, getattr(self.settings, "LLM_RATE_LIMIT_RETRIES", 3))
        wait_time = getattr(self.settings, "LLM_RATE_LIMIT_WAIT", 60.0)

        while rate_limit_attempts <= max_rate_retries:
            try:
                if provider == "openrouter" and self.openrouter_client:
                    return await self._execute_openrouter(
                        prompt=prompt, system=system, force_json=force_json,
                        model_override=model_override, json_schema=json_schema,
                        json_schema_name=json_schema_name, strict_json_schema=strict_json_schema,
                        temperature=temperature
                    )

                if provider == "gemini" and self.google_client:
                    return await self._execute_google(prompt, system, force_json, model_override, temperature)

                # Default / fallback provider: local Ollama.
                return await self._execute_ollama(prompt, system, force_json, max_retries, base_delay, temperature, model_override)

            except Exception as e:
                err_str = str(e)
                # String-based rate-limit detection across provider SDKs.
                if any(x in err_str for x in ["429", "RESOURCE_EXHAUSTED", "rate_limited"]):
                    rate_limit_attempts += 1
                    logger.warning(f"⏳ Rate Limit {provider}. Attempt {rate_limit_attempts}. Wait {wait_time}s.")
                    await asyncio.sleep(wait_time)
                    continue
                raise e

    async def _execute_google(self, prompt, system, force_json, model_override, temperature):
        """Execute one Gemini request via the (synchronous) google-genai SDK.

        The blocking SDK call is pushed to a worker thread and bounded by a
        45s timeout. ``force_json`` switches the response MIME type to
        application/json.
        """
        # Normalize the model id: str.replace strips every "models/"
        # occurrence, not just a leading prefix.
        model = (model_override or self.settings.GEMINI_MODEL).replace("models/", "")
        config_kwargs = {
            "system_instruction": system,
            "response_mime_type": "application/json" if force_json else "text/plain"
        }
        if temperature is not None:
            config_kwargs["temperature"] = temperature

        config = types.GenerateContentConfig(**config_kwargs)
        response = await asyncio.wait_for(
            asyncio.to_thread(self.google_client.models.generate_content, model=model, contents=prompt, config=config),
            timeout=45.0
        )
        # NOTE(review): response.text may be None on blocked/empty candidates,
        # which would raise AttributeError here — confirm SDK behavior.
        return response.text.strip()

    async def _execute_openrouter(self, prompt, system, force_json, model_override, json_schema, json_schema_name, strict_json_schema, temperature) -> str:
        """Execute one chat completion via OpenRouter (OpenAI-compatible API).

        ``force_json`` selects a structured response_format: a full JSON
        schema when ``json_schema`` is given, else generic json_object mode.
        Returns "" for empty choices or empty message content.
        """
        model = model_override or self.settings.OPENROUTER_MODEL
        logger.info(f"🛰️ OpenRouter Call: Model='{model}' | Temp={temperature}")
        messages = []
        if system: messages.append({"role": "system", "content": system})
        messages.append({"role": "user", "content": prompt})

        kwargs: Dict[str, Any] = {}
        if temperature is not None: kwargs["temperature"] = temperature

        if force_json:
            if json_schema:
                kwargs["response_format"] = {"type": "json_schema", "json_schema": {"name": json_schema_name, "strict": strict_json_schema, "schema": json_schema}}
            else:
                kwargs["response_format"] = {"type": "json_object"}

        response = await self.openrouter_client.chat.completions.create(model=model, messages=messages, **kwargs)
        if not response.choices: return ""
        return response.choices[0].message.content.strip() if response.choices[0].message.content else ""

    async def _execute_ollama(self, prompt, system, force_json, max_retries, base_delay, temperature=None, model_override=None):
        """Execute one local Ollama /api/generate request with retries.

        WP-20: restored retry loop for local hardware resilience — retries
        any failure up to ``max_retries`` times with exponential backoff
        (base_delay * 2^(attempt-1)). Temperature defaults to 0.1 for JSON
        output, 0.7 otherwise; context window fixed at num_ctx=8192.
        """
        effective_model = model_override or self.settings.LLM_MODEL
        effective_temp = temperature if temperature is not None else (0.1 if force_json else 0.7)

        payload = {
            "model": effective_model,
            "prompt": prompt, "stream": False,
            "options": {"temperature": effective_temp, "num_ctx": 8192}
        }
        if force_json: payload["format"] = "json"
        if system: payload["system"] = system

        attempt = 0
        while True:
            try:
                res = await self.ollama_client.post("/api/generate", json=payload)
                res.raise_for_status()
                return res.json().get("response", "").strip()
            except Exception as e:
                attempt += 1
                if attempt > max_retries:
                    logger.error(f"❌ Ollama failure after {attempt} attempts: {e}")
                    raise e
                await asyncio.sleep(base_delay * (2 ** (attempt - 1)))

    async def generate_rag_response(self, query: str, context_str: Optional[str] = None) -> str:
        """Delegate a RAG-style question to the DecisionEngine.

        NOTE(review): ``context_str`` is currently ignored — the engine is
        called with the query only. Confirm whether that is intentional.
        """
        return await self.decision_engine.ask(query)

    async def close(self):
        """Close the Ollama HTTP client.

        NOTE(review): the OpenRouter AsyncOpenAI client is not closed here
        (and the Google client has no close call) — confirm whether cleanup
        is handled elsewhere.
        """
        if self.ollama_client:
            await self.ollama_client.aclose()
|
||||
141
config/decision_engine.yaml
Normal file
141
config/decision_engine.yaml
Normal file
|
|
@ -0,0 +1,141 @@
|
|||
# config/decision_engine.yaml
|
||||
# VERSION: 3.2.2 (WP-25a: Decoupled MoE Logic)
|
||||
# STATUS: Active
|
||||
# DESCRIPTION: Zentrale Orchestrierung der Multi-Stream-Engine.
|
||||
# FIX:
|
||||
# - Auslagerung der LLM-Profile in llm_profiles.yaml zur zentralen Wartbarkeit.
|
||||
# - Integration von compression_thresholds zur Inhaltsverdichtung (WP-25a).
|
||||
# - 100% Erhalt aller WP-25 Edge-Boosts und Filter-Typen (v3.1.6).
|
||||
|
||||
version: 3.2
|
||||
|
||||
settings:
|
||||
llm_fallback_enabled: true
|
||||
# "auto" nutzt den globalen Default-Provider aus der .env
|
||||
router_provider: "auto"
|
||||
# Verweis auf den Intent-Klassifizierer in der prompts.yaml
|
||||
router_prompt_key: "intent_router_v1"
|
||||
# Pfad zur neuen Experten-Konfiguration (WP-25a Architektur-Cleanliness)
|
||||
profiles_config_path: "config/llm_profiles.yaml"
|
||||
router_profile: "compression_fast"
|
||||
|
||||
# --- EBENE 1: STREAM-LIBRARY (Bausteine basierend auf types.yaml v2.7.0) ---
|
||||
streams_library:
|
||||
values_stream:
|
||||
name: "Identität & Ethik"
|
||||
# Referenz auf Experten-Profil (z.B. lokal via Ollama für Privacy)
|
||||
llm_profile: "identity_safe"
|
||||
compression_profile: "identity_safe"
|
||||
compression_threshold: 2500
|
||||
query_template: "Welche meiner Werte und Prinzipien betreffen: {query}"
|
||||
filter_types: ["value", "principle", "belief", "trait", "boundary", "need", "motivation"]
|
||||
top_k: 5
|
||||
edge_boosts:
|
||||
guides: 3.0
|
||||
depends_on: 2.5
|
||||
based_on: 2.0
|
||||
upholds: 2.5
|
||||
violates: 2.5
|
||||
aligned_with: 2.0
|
||||
conflicts_with: 2.0
|
||||
supports: 1.5
|
||||
contradicts: 1.5
|
||||
facts_stream:
|
||||
name: "Operative Realität"
|
||||
llm_profile: "synthesis_pro"
|
||||
compression_profile: "compression_fast"
|
||||
compression_threshold: 3500
|
||||
query_template: "Status, Ressourcen und Fakten zu: {query}"
|
||||
filter_types: ["project", "decision", "task", "goal", "event", "state"]
|
||||
top_k: 5
|
||||
edge_boosts:
|
||||
part_of: 2.0
|
||||
depends_on: 1.5
|
||||
implemented_in: 1.5
|
||||
|
||||
biography_stream:
|
||||
name: "Persönliche Erfahrung"
|
||||
llm_profile: "synthesis_pro"
|
||||
compression_profile: "compression_fast"
|
||||
compression_threshold: 3000
|
||||
query_template: "Welche Erlebnisse habe ich im Kontext von {query} gemacht?"
|
||||
filter_types: ["experience", "journal", "profile", "person"]
|
||||
top_k: 3
|
||||
edge_boosts:
|
||||
related_to: 1.5
|
||||
experienced_in: 2.0
|
||||
expert_for: 2.5
|
||||
followed_by: 2.0
|
||||
preceded_by: 2.0
|
||||
|
||||
risk_stream:
|
||||
name: "Risiko-Radar"
|
||||
llm_profile: "synthesis_pro"
|
||||
compression_profile: "compression_fast"
|
||||
compression_threshold: 2500
|
||||
query_template: "Gefahren, Hindernisse oder Risiken bei: {query}"
|
||||
filter_types: ["risk", "obstacle", "bias"]
|
||||
top_k: 3
|
||||
edge_boosts:
|
||||
blocks: 2.5
|
||||
impacts: 2.0
|
||||
risk_of: 2.5
|
||||
|
||||
tech_stream:
|
||||
name: "Wissen & Technik"
|
||||
llm_profile: "tech_expert"
|
||||
compression_profile: "compression_fast"
|
||||
compression_threshold: 4500
|
||||
query_template: "Inhaltliche Details und Definitionen zu: {query}"
|
||||
filter_types: ["concept", "source", "glossary", "idea", "insight", "skill", "habit"]
|
||||
top_k: 5
|
||||
edge_boosts:
|
||||
uses: 2.5
|
||||
implemented_in: 3.0
|
||||
|
||||
# --- EBENE 2: STRATEGIEN (Finale Komposition via MoE-Profile) ---
|
||||
strategies:
|
||||
FACT_WHEN:
|
||||
description: "Abfrage von exakten Zeitpunkten und Terminen."
|
||||
llm_profile: "synthesis_pro"
|
||||
trigger_keywords: ["wann", "datum", "uhrzeit", "zeitpunkt"]
|
||||
use_streams: ["facts_stream", "biography_stream", "tech_stream"]
|
||||
prompt_template: "fact_synthesis_v1"
|
||||
|
||||
FACT_WHAT:
|
||||
description: "Abfrage von Definitionen, Listen und Inhalten."
|
||||
llm_profile: "synthesis_pro"
|
||||
trigger_keywords: ["was ist", "welche sind", "liste", "übersicht", "zusammenfassung"]
|
||||
use_streams: ["facts_stream", "tech_stream", "biography_stream"]
|
||||
prompt_template: "fact_synthesis_v1"
|
||||
|
||||
DECISION:
|
||||
description: "Der User sucht Rat, Strategie oder Abwägung."
|
||||
llm_profile: "synthesis_pro"
|
||||
trigger_keywords: ["soll ich", "sollte ich", "entscheidung", "abwägen", "priorität", "empfehlung"]
|
||||
use_streams: ["values_stream", "facts_stream", "risk_stream"]
|
||||
prompt_template: "decision_synthesis_v1"
|
||||
prepend_instruction: |
|
||||
!!! ENTSCHEIDUNGS-MODUS (AGENTIC MULTI-STREAM) !!!
|
||||
Analysiere die Fakten vor dem Hintergrund meiner Werte und evaluiere die Risiken.
|
||||
Wäge ab, ob das Vorhaben mit meiner langfristigen Identität kompatibel ist.
|
||||
|
||||
EMPATHY:
|
||||
description: "Reaktion auf emotionale Zustände."
|
||||
llm_profile: "synthesis_pro"
|
||||
trigger_keywords: ["fühle", "traurig", "glücklich", "stress", "angst"]
|
||||
use_streams: ["biography_stream", "values_stream"]
|
||||
prompt_template: "empathy_template"
|
||||
|
||||
CODING:
|
||||
description: "Technische Anfragen und Programmierung."
|
||||
llm_profile: "tech_expert"
|
||||
trigger_keywords: ["code", "python", "script", "bug", "syntax"]
|
||||
use_streams: ["tech_stream", "facts_stream"]
|
||||
prompt_template: "technical_template"
|
||||
|
||||
INTERVIEW:
|
||||
description: "Der User möchte Wissen erfassen (Eingabemodus)."
|
||||
llm_profile: "compression_fast"
|
||||
use_streams: []
|
||||
prompt_template: "interview_template"
|
||||
64
config/llm_profiles.yaml
Normal file
64
config/llm_profiles.yaml
Normal file
|
|
@ -0,0 +1,64 @@
|
|||
# config/llm_profiles.yaml
|
||||
# VERSION: 1.3.0 (WP-25a: Global MoE & Fallback Cascade)
|
||||
# STATUS: Active
|
||||
# DESCRIPTION: Zentrale Definition der LLM-Rollen inkl. Ausfall-Logik (Kaskade).
|
||||
|
||||
profiles:
|
||||
# --- CHAT & SYNTHESE ---
|
||||
# Der "Architekt": Hochwertige Synthese. Fällt bei Fehlern auf den Backup-Cloud-Experten zurück.
|
||||
synthesis_pro:
|
||||
provider: "openrouter"
|
||||
model: "google/gemini-2.0-flash-exp:free"
|
||||
temperature: 0.7
|
||||
fallback_profile: "synthesis_backup"
|
||||
|
||||
# Der "Vize": Leistungsstarkes Modell bei einem anderen Provider (Resilienz).
|
||||
synthesis_backup:
|
||||
provider: "openrouter"
|
||||
model: "meta-llama/llama-3.3-70b-instruct:free"
|
||||
temperature: 0.5
|
||||
fallback_profile: "identity_safe" # Letzte Instanz: Lokal
|
||||
|
||||
# Der "Ingenieur": Fachspezialist für Code. Nutzt bei Ausfall den Generalisten.
|
||||
tech_expert:
|
||||
provider: "openrouter"
|
||||
model: "qwen/qwen-2.5-vl-7b-instruct:free"
|
||||
temperature: 0.3
|
||||
fallback_profile: "synthesis_pro"
|
||||
|
||||
# Der "Dampfhammer": Schnell für Routing und Zusammenfassungen.
|
||||
compression_fast:
|
||||
provider: "openrouter"
|
||||
model: "mistralai/mistral-7b-instruct:free"
|
||||
temperature: 0.1
|
||||
fallback_profile: "identity_safe"
|
||||
|
||||
# --- INGESTION EXPERTEN ---
|
||||
# Spezialist für die Extraktion komplexer Datenstrukturen aus Dokumenten.
|
||||
ingest_extractor:
|
||||
provider: "openrouter"
|
||||
model: "mistralai/mistral-7b-instruct:free"
|
||||
temperature: 0.2
|
||||
fallback_profile: "synthesis_backup"
|
||||
|
||||
# Spezialist für binäre Prüfungen (YES/NO). Muss extrem deterministisch sein.
|
||||
ingest_validator:
|
||||
provider: "openrouter"
|
||||
model: "mistralai/mistral-7b-instruct:free"
|
||||
temperature: 0.0
|
||||
fallback_profile: "compression_fast"
|
||||
|
||||
# --- LOKALER ANKER & PRIVACY ---
|
||||
# Der "Wächter": Lokales Modell für maximale Privatsphäre. Ende der Kaskade.
|
||||
identity_safe:
|
||||
provider: "ollama"
|
||||
model: "phi3:mini"
|
||||
temperature: 0.2
|
||||
# Kein fallback_profile definiert = Terminaler Endpunkt
|
||||
|
||||
# --- EMBEDDING EXPERTE ---
|
||||
# Zentralisierung des Embedding-Modells zur Entfernung aus der .env.
|
||||
embedding_expert:
|
||||
provider: "ollama"
|
||||
model: "nomic-embed-text"
|
||||
dimensions: 768
|
||||
63
config/prod.env
Normal file
63
config/prod.env
Normal file
|
|
@ -0,0 +1,63 @@
|
|||
# --- FastAPI Server (Produktion) ---
|
||||
UVICORN_HOST=0.0.0.0
|
||||
UVICORN_PORT=8000
|
||||
DEBUG=false
|
||||
|
||||
# --- Qdrant Vektor-Datenbank ---
|
||||
# Trennung der Daten durch eigenes Prefix
|
||||
QDRANT_URL=http://127.0.0.1:6333
|
||||
QDRANT_API_KEY=
|
||||
COLLECTION_PREFIX=mindnet
|
||||
|
||||
# --- Vektoren-Konfiguration ---
|
||||
# Muss 768 für 'nomic-embed-text' sein
|
||||
VECTOR_DIM=768
|
||||
|
||||
# --- AI Modelle (Lokal/Fallback) ---
|
||||
MINDNET_LLM_MODEL=phi3:mini
|
||||
MINDNET_OLLAMA_URL=http://127.0.0.1:11434
|
||||
MINDNET_LLM_TIMEOUT=300.0
|
||||
MINDNET_LLM_BACKGROUND_LIMIT=2
|
||||
|
||||
# Vektor-Modell für semantische Suche
|
||||
MINDNET_EMBEDDING_MODEL=nomic-embed-text
|
||||
|
||||
# --- WP-20/WP-76: Hybrid-Cloud & Resilienz ---
|
||||
# Primärer Provider für höchste Qualität
|
||||
MINDNET_LLM_PROVIDER=openrouter
|
||||
MINDNET_LLM_FALLBACK=true
|
||||
|
||||
# Intelligente Rate-Limit Steuerung (Sekunden/Versuche)
|
||||
MINDNET_LLM_RATE_LIMIT_WAIT=60.0
|
||||
MINDNET_LLM_RATE_LIMIT_RETRIES=3
|
||||
|
||||
# --- Cloud Provider Keys (Hier Prod-Keys einsetzen) ---
|
||||
GOOGLE_API_KEY=AIzaSy... (Dein Prod-Key)
|
||||
MINDNET_GEMINI_MODEL=gemini-2.5-flash-lite
|
||||
|
||||
OPENROUTER_API_KEY=sk-or-v1-... (Dein Prod-Key)
|
||||
# Stabilstes Free-Modell für strukturierte Extraktion
|
||||
OPENROUTER_MODEL=mistralai/mistral-7b-instruct:free
|
||||
|
||||
# --- Pfade & System (Produktions-Vault) ---
|
||||
MINDNET_TYPES_FILE=./config/types.yaml
|
||||
MINDNET_VAULT_ROOT=./vault_prod
|
||||
MINDNET_VOCAB_PATH=/mindnet/vault/mindnet/_system/dictionary/edge_vocabulary.md
|
||||
|
||||
# Change Detection für effiziente Re-Imports
|
||||
MINDNET_CHANGE_DETECTION_MODE=full
|
||||
|
||||
# --- WP-24c v4.2.0: Konfigurierbare Markdown-Header für Edge-Zonen ---
|
||||
# Komma-separierte Liste von Headern für LLM-Validierung
|
||||
# Format: Header1,Header2,Header3
|
||||
MINDNET_LLM_VALIDATION_HEADERS=Unzugeordnete Kanten,Edge Pool,Candidates
|
||||
|
||||
# Header-Ebene für LLM-Validierung (1-6, Default: 3 für ###)
|
||||
MINDNET_LLM_VALIDATION_HEADER_LEVEL=3
|
||||
|
||||
# Komma-separierte Liste von Headern für Note-Scope Zonen
|
||||
# Format: Header1,Header2,Header3
|
||||
MINDNET_NOTE_SCOPE_ZONE_HEADERS=Smart Edges,Relationen,Global Links,Note-Level Relations,Globale Verbindungen
|
||||
|
||||
# Header-Ebene für Note-Scope Zonen (1-6, Default: 2 für ##)
|
||||
MINDNET_NOTE_SCOPE_HEADER_LEVEL=2
|
||||
337
config/prompts - Kopie.yaml
Normal file
337
config/prompts - Kopie.yaml
Normal file
|
|
@ -0,0 +1,337 @@
|
|||
# config/prompts.yaml — VERSION 3.1.2 (WP-25 Cleanup: Multi-Stream Sync)
|
||||
# STATUS: Active
|
||||
# FIX:
|
||||
# - 100% Wiederherstellung der Ingest- & Validierungslogik (Sektion 5-8).
|
||||
# - Überführung der Kategorien 1-4 in die Multi-Stream Struktur unter Beibehaltung des Inhalts.
|
||||
# - Konsolidierung: Sektion 9 (v3.0.0) wurde in Sektion 1 & 2 integriert (keine Redundanz).
|
||||
|
||||
system_prompt: |
|
||||
Du bist 'mindnet', mein digitaler Zwilling und strategischer Partner.
|
||||
|
||||
DEINE IDENTITÄT:
|
||||
- Du bist nicht nur eine Datenbank, sondern handelst nach MEINEN Werten und Zielen.
|
||||
- Du passt deinen Stil dynamisch an die Situation an (Analytisch, Empathisch oder Technisch).
|
||||
|
||||
DEINE REGELN:
|
||||
1. Deine Antwort muss zu 100% auf dem bereitgestellten KONTEXT basieren.
|
||||
2. Halluziniere keine Fakten, die nicht in den Quellen stehen.
|
||||
3. Antworte auf Deutsch (außer bei Code/Fachbegriffen).
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# 1. STANDARD: Fakten & Wissen (Intent: FACT_WHAT / FACT_WHEN)
|
||||
# ---------------------------------------------------------
|
||||
# Ersetzt das alte 'rag_template'. Nutzt jetzt parallele Streams.
|
||||
fact_synthesis_v1:
|
||||
ollama: |
|
||||
WISSENS-STREAMS:
|
||||
=========================================
|
||||
FAKTEN & STATUS:
|
||||
{facts_stream}
|
||||
|
||||
ERFAHRUNG & BIOGRAFIE:
|
||||
{biography_stream}
|
||||
|
||||
WISSEN & TECHNIK:
|
||||
{tech_stream}
|
||||
=========================================
|
||||
|
||||
FRAGE:
|
||||
{query}
|
||||
|
||||
ANWEISUNG:
|
||||
Beantworte die Frage präzise basierend auf den Quellen.
|
||||
Kombiniere harte Fakten mit persönlichen Erfahrungen, falls vorhanden.
|
||||
Fasse die Informationen zusammen. Sei objektiv und neutral.
|
||||
gemini: |
|
||||
Beantworte die Wissensabfrage "{query}" basierend auf diesen Streams:
|
||||
FAKTEN: {facts_stream}
|
||||
BIOGRAFIE/ERFAHRUNG: {biography_stream}
|
||||
TECHNIK: {tech_stream}
|
||||
Kombiniere harte Fakten mit persönlichen Erfahrungen, falls vorhanden. Antworte strukturiert und präzise.
|
||||
openrouter: |
|
||||
Synthese der Wissens-Streams für: {query}
|
||||
Inhalt: {facts_stream} | {biography_stream} | {tech_stream}
|
||||
Antworte basierend auf dem bereitgestellten Kontext.
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# 2. DECISION: Strategie & Abwägung (Intent: DECISION)
|
||||
# ---------------------------------------------------------
|
||||
# Ersetzt das alte 'decision_template'. Nutzt jetzt parallele Streams.
|
||||
decision_synthesis_v1:
|
||||
ollama: |
|
||||
ENTSCHEIDUNGS-STREAMS:
|
||||
=========================================
|
||||
WERTE & PRINZIPIEN (Identität):
|
||||
{values_stream}
|
||||
|
||||
OPERATIVE FAKTEN (Realität):
|
||||
{facts_stream}
|
||||
|
||||
RISIKO-RADAR (Konsequenzen):
|
||||
{risk_stream}
|
||||
=========================================
|
||||
|
||||
ENTSCHEIDUNGSFRAGE:
|
||||
{query}
|
||||
|
||||
ANWEISUNG:
|
||||
Du agierst als mein Entscheidungs-Partner.
|
||||
1. Analysiere die Faktenlage aus den Quellen.
|
||||
2. Prüfe dies hart gegen meine strategischen Notizen (Werte & Prinzipien).
|
||||
3. Wäge ab: Passt die technische/faktische Lösung zu meinen Werten?
|
||||
|
||||
FORMAT:
|
||||
- **Analyse:** (Kurze Zusammenfassung der Fakten)
|
||||
- **Abgleich:** (Gibt es Konflikte mit Werten/Zielen? Nenne die Quelle!)
|
||||
- **Empfehlung:** (Klare Meinung: Ja/Nein/Vielleicht mit Begründung)
|
||||
gemini: |
|
||||
Agiere als mein strategischer Partner. Analysiere die Frage: {query}
|
||||
Werte: {values_stream} | Fakten: {facts_stream} | Risiken: {risk_stream}.
|
||||
Wäge ab und gib eine klare strategische Empfehlung ab.
|
||||
openrouter: |
|
||||
Strategische Multi-Stream Analyse für: {query}
|
||||
Werte-Basis: {values_stream} | Fakten: {facts_stream} | Risiken: {risk_stream}
|
||||
Bitte wäge ab und gib eine Empfehlung.
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# 3. EMPATHY: Der Spiegel / "Ich"-Modus (Intent: EMPATHY)
|
||||
# ---------------------------------------------------------
|
||||
empathy_template:
|
||||
ollama: |
|
||||
KONTEXT (ERFAHRUNGEN & WERTE):
|
||||
=========================================
|
||||
ERLEBNISSE & BIOGRAFIE:
|
||||
{biography_stream}
|
||||
|
||||
WERTE & BEDÜRFNISSE:
|
||||
{values_stream}
|
||||
=========================================
|
||||
|
||||
SITUATION:
|
||||
{query}
|
||||
|
||||
ANWEISUNG:
|
||||
Du agierst jetzt als mein empathischer Spiegel.
|
||||
1. Versuche nicht sofort, das Problem technisch zu lösen.
|
||||
2. Zeige Verständnis für die Situation basierend auf meinen eigenen Erfahrungen ([EXPERIENCE]) oder Werten, falls im Kontext vorhanden.
|
||||
3. Antworte in der "Ich"-Form oder "Wir"-Form. Sei unterstützend.
|
||||
|
||||
TONFALL:
|
||||
Ruhig, verständnisvoll, reflektiert. Keine Aufzählungszeichen, sondern fließender Text.
|
||||
gemini: "Sei mein digitaler Spiegel für {query}. Kontext: {biography_stream}, {values_stream}"
|
||||
openrouter: "Empathische Reflexion der Situation {query}. Persönlicher Kontext: {biography_stream}, {values_stream}"
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# 4. TECHNICAL: Der Coder (Intent: CODING)
|
||||
# ---------------------------------------------------------
|
||||
technical_template:
|
||||
ollama: |
|
||||
KONTEXT (WISSEN & PROJEKTE):
|
||||
=========================================
|
||||
TECHNIK & SNIPPETS:
|
||||
{tech_stream}
|
||||
|
||||
PROJEKT-STATUS:
|
||||
{facts_stream}
|
||||
=========================================
|
||||
|
||||
TASK:
|
||||
{query}
|
||||
|
||||
ANWEISUNG:
|
||||
Du bist Senior Developer.
|
||||
1. Ignoriere Smalltalk. Komm sofort zum Punkt.
|
||||
2. Generiere validen, performanten Code basierend auf den Quellen.
|
||||
3. Wenn Quellen fehlen, nutze dein allgemeines Programmierwissen, aber weise darauf hin.
|
||||
|
||||
FORMAT:
|
||||
- Kurze Erklärung des Ansatzes.
|
||||
- Markdown Code-Block (Copy-Paste fertig).
|
||||
- Wichtige Edge-Cases.
|
||||
gemini: "Generiere Code für {query} unter Berücksichtigung von {tech_stream} und {facts_stream}."
|
||||
openrouter: "Technischer Support für {query}. Referenzen: {tech_stream}, Projekt-Kontext: {facts_stream}"
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# 5. INTERVIEW: Der "One-Shot Extractor" (WP-07)
|
||||
# ---------------------------------------------------------
|
||||
interview_template:
|
||||
ollama: |
|
||||
TASK:
|
||||
Du bist ein professioneller Ghostwriter. Verwandle den "USER INPUT" in eine strukturierte Notiz vom Typ '{target_type}'.
|
||||
|
||||
STRUKTUR (Nutze EXAKT diese Überschriften):
|
||||
{schema_fields}
|
||||
|
||||
USER INPUT:
|
||||
"{query}"
|
||||
|
||||
ANWEISUNG ZUM INHALT:
|
||||
1. Analysiere den Input genau.
|
||||
2. Schreibe die Inhalte unter die passenden Überschriften aus der STRUKTUR-Liste oben.
|
||||
3. STIL: Schreibe flüssig, professionell und in der Ich-Perspektive. Korrigiere Grammatikfehler, aber behalte den persönlichen Ton bei.
|
||||
4. Wenn Informationen für einen Abschnitt fehlen, schreibe nur: "[TODO: Ergänzen]". Erfinde nichts dazu.
|
||||
|
||||
OUTPUT FORMAT (YAML + MARKDOWN):
|
||||
---
|
||||
type: {target_type}
|
||||
status: draft
|
||||
title: (Erstelle einen treffenden, kurzen Titel für den Inhalt)
|
||||
tags: [Tag1, Tag2]
|
||||
---
|
||||
|
||||
# (Wiederhole den Titel hier)
|
||||
|
||||
## (Erster Begriff aus STRUKTUR)
|
||||
(Text...)
|
||||
|
||||
## (Zweiter Begriff aus STRUKTUR)
|
||||
(Text...)
|
||||
gemini: "Extrahiere Daten für {target_type} aus {query}."
|
||||
openrouter: "Strukturiere den Input {query} nach dem Schema {schema_fields} für Typ {target_type}."
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# 6. EDGE_ALLOCATION: Kantenfilter (Ingest)
|
||||
# ---------------------------------------------------------
|
||||
edge_allocation_template:
|
||||
ollama: |
|
||||
TASK:
|
||||
Du bist ein strikter Selektor. Du erhältst eine Liste von "Kandidaten-Kanten" (Strings).
|
||||
Wähle jene aus, die inhaltlich im "Textabschnitt" vorkommen oder relevant sind.
|
||||
|
||||
TEXTABSCHNITT:
|
||||
"""
|
||||
{chunk_text}
|
||||
"""
|
||||
|
||||
KANDIDATEN (Auswahl-Pool):
|
||||
{edge_list}
|
||||
|
||||
REGELN:
|
||||
1. Die Kanten haben das Format "typ:ziel". Der "typ" ist variabel und kann ALLES sein.
|
||||
2. Gib NUR die Strings aus der Kandidaten-Liste zurück, die zum Text passen.
|
||||
3. Erfinde KEINE neuen Kanten.
|
||||
4. Antworte als flache JSON-Liste.
|
||||
|
||||
DEIN OUTPUT (JSON):
|
||||
gemini: |
|
||||
TASK: Ordne Kanten einem Textabschnitt zu.
|
||||
ERLAUBTE TYPEN: {valid_types}
|
||||
TEXT: {chunk_text}
|
||||
KANDIDATEN: {edge_list}
|
||||
OUTPUT: STRIKT eine flache JSON-Liste ["typ:ziel"]. Kein Text davor/danach. Wenn nichts: []. Keine Objekte!
|
||||
openrouter: |
|
||||
TASK: Filtere relevante Kanten aus dem Pool.
|
||||
ERLAUBTE TYPEN: {valid_types}
|
||||
TEXT: {chunk_text}
|
||||
POOL: {edge_list}
|
||||
ANWEISUNG: Gib NUR eine flache JSON-Liste von Strings zurück.
|
||||
BEISPIEL: ["kind:target", "kind:target"]
|
||||
REGEL: Kein Text, keine Analyse, keine Kommentare. Wenn nichts passt, gib [] zurück.
|
||||
OUTPUT:
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# 7. SMART EDGE ALLOCATION: Extraktion (Ingest)
|
||||
# ---------------------------------------------------------
|
||||
edge_extraction:
|
||||
ollama: |
|
||||
TASK:
|
||||
Du bist ein Wissens-Ingenieur für den digitalen Zwilling 'mindnet'.
|
||||
Deine Aufgabe ist es, semantische Relationen (Kanten) aus dem Text zu extrahieren,
|
||||
die die Hauptnotiz '{note_id}' mit anderen Konzepten verbinden.
|
||||
|
||||
ANWEISUNGEN:
|
||||
1. Identifiziere wichtige Entitäten, Konzepte oder Ereignisse im Text.
|
||||
2. Bestimme die Art der Beziehung (z.B. part_of, uses, related_to, blocks, caused_by).
|
||||
3. Das Ziel (target) muss ein prägnanter Begriff sein.
|
||||
4. Antworte AUSSCHLIESSLICH in validem JSON als Liste von Objekten.
|
||||
|
||||
BEISPIEL:
|
||||
[ {{"to": "Ziel-Konzept", "kind": "beziehungs_typ"}} ]
|
||||
|
||||
TEXT:
|
||||
"""
|
||||
{text}
|
||||
"""
|
||||
|
||||
DEIN OUTPUT (JSON):
|
||||
gemini: |
|
||||
Analysiere '{note_id}'. Extrahiere semantische Beziehungen.
|
||||
ERLAUBTE TYPEN: {valid_types}
|
||||
TEXT: {text}
|
||||
OUTPUT: STRIKT JSON-Array von Objekten: [{{"to": "Ziel", "kind": "typ"}}]. Kein Text davor/danach. Wenn nichts: [].
|
||||
openrouter: |
|
||||
TASK: Extrahiere semantische Relationen für '{note_id}'.
|
||||
ERLAUBTE TYPEN: {valid_types}
|
||||
TEXT: {text}
|
||||
ANWEISUNG: Antworte AUSSCHLIESSLICH mit einem JSON-Array von Objekten.
|
||||
FORMAT: [{{"to": "Ziel-Begriff", "kind": "typ"}}]
|
||||
STRIKTES VERBOT: Schreibe keine Einleitung, keine Analyse und keine Erklärungen.
|
||||
Wenn keine Relationen existieren, antworte NUR mit: []
|
||||
OUTPUT:
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# 8. WP-15b: EDGE VALIDATION (Ingest/Validate)
|
||||
# ---------------------------------------------------------
|
||||
edge_validation:
|
||||
gemini: |
|
||||
Bewerte die semantische Validität dieser Verbindung im Wissensgraph.
|
||||
|
||||
KONTEXT DER QUELLE (Chunk):
|
||||
"{chunk_text}"
|
||||
|
||||
ZIEL-NOTIZ: "{target_title}"
|
||||
ZIEL-BESCHREIBUNG (Zusammenfassung):
|
||||
"{target_summary}"
|
||||
|
||||
GEPLANTE RELATION: "{edge_kind}"
|
||||
|
||||
FRAGE: Bestätigt der Kontext der Quelle die Beziehung '{edge_kind}' zum Ziel?
|
||||
REGEL: Antworte NUR mit 'YES' oder 'NO'. Keine Erklärungen oder Smalltalk.
|
||||
openrouter: |
|
||||
Verify semantic relation for graph construction.
|
||||
Source Context: {chunk_text}
|
||||
Target Note: {target_title}
|
||||
Target Summary: {target_summary}
|
||||
Proposed Relation: {edge_kind}
|
||||
Instruction: Does the source context support this relation to the target?
|
||||
Result: Respond ONLY with 'YES' or 'NO'.
|
||||
ollama: |
|
||||
Bewerte die semantische Korrektheit dieser Verbindung.
|
||||
QUELLE: {chunk_text}
|
||||
ZIEL: {target_title} ({target_summary})
|
||||
BEZIEHUNG: {edge_kind}
|
||||
Ist diese Verbindung valide? Antworte NUR mit YES oder NO.
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# 10. WP-25: INTENT ROUTING (Intent: CLASSIFY)
|
||||
# ---------------------------------------------------------
|
||||
intent_router_v1:
|
||||
ollama: |
|
||||
Analysiere die Nutzeranfrage und wähle die passende Strategie.
|
||||
Antworte NUR mit dem Namen der Strategie.
|
||||
|
||||
STRATEGIEN:
|
||||
- FACT_WHEN: Nur für explizite Fragen nach einem exakten Datum, Uhrzeit oder dem "Wann" eines Ereignisses.
|
||||
- FACT_WHAT: Fragen nach Inhalten, Listen von Objekten/Projekten, Definitionen oder "Was/Welche" Anfragen (auch bei Zeiträumen).
|
||||
- DECISION: Rat, Meinung, "Soll ich?", Abwägung gegen Werte.
|
||||
- EMPATHY: Emotionen, Reflexion, Befindlichkeit.
|
||||
- CODING: Programmierung, Skripte, technische Syntax.
|
||||
- INTERVIEW: Dokumentation neuer Informationen, Notizen anlegen.
|
||||
|
||||
NACHRICHT: "{query}"
|
||||
STRATEGIE:
|
||||
gemini: |
|
||||
Classify intent:
|
||||
- FACT_WHEN: Exact dates/times only.
|
||||
- FACT_WHAT: Content, lists of entities (projects, etc.), definitions, "What/Which" queries.
|
||||
- DECISION: Strategic advice/values.
|
||||
- EMPATHY: Emotions.
|
||||
- CODING: Tech/Code.
|
||||
- INTERVIEW: Data entry.
|
||||
Query: "{query}"
|
||||
Result (One word only):
|
||||
openrouter: |
|
||||
Select strategy for Mindnet:
|
||||
FACT_WHEN (timing/dates), FACT_WHAT (entities/lists/what/which), DECISION, EMPATHY, CODING, INTERVIEW.
|
||||
Query: "{query}"
|
||||
Response:
|
||||
453
config/prompts.yaml
Normal file
453
config/prompts.yaml
Normal file
|
|
@ -0,0 +1,453 @@
|
|||
# config/prompts.yaml — VERSION 3.2.2 (WP-25b: Hierarchical Model Sync)
|
||||
# STATUS: Active
|
||||
# FIX:
|
||||
# - 100% Erhalt der Original-Prompts aus v3.1.2 für die Provider-Ebene (ollama, gemini, openrouter).
|
||||
# - Integration der Modell-spezifischen Overrides für Gemini 2.0, Llama 3.3 und Qwen 2.5.
|
||||
# - Hinzufügen des notwendigen 'compression_template' für die DecisionEngine v1.3.0.
|
||||
|
||||
system_prompt: |
|
||||
Du bist 'mindnet', mein digitaler Zwilling und strategischer Partner.
|
||||
|
||||
DEINE IDENTITÄT:
|
||||
- Du bist nicht nur eine Datenbank, sondern handelst nach MEINEN Werten und Zielen.
|
||||
- Du passt deinen Stil dynamisch an die Situation an (Analytisch, Empathisch oder Technisch).
|
||||
|
||||
DEINE REGELN:
|
||||
1. Deine Antwort muss zu 100% auf dem bereitgestellten KONTEXT basieren.
|
||||
2. Halluziniere keine Fakten, die nicht in den Quellen stehen.
|
||||
3. Antworte auf Deutsch (außer bei Code/Fachbegriffen).
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# 1. STANDARD: Fakten & Wissen (Intent: FACT_WHAT / FACT_WHEN)
|
||||
# ---------------------------------------------------------
|
||||
fact_synthesis_v1:
|
||||
# --- Modell-spezifisch (WP-25b Optimierung) ---
|
||||
"google/gemini-2.0-flash-exp:free": |
|
||||
Analysiere die Wissens-Streams für: {query}
|
||||
FAKTEN: {facts_stream} | BIOGRAFIE: {biography_stream} | TECHNIK: {tech_stream}
|
||||
Nutze deine hohe Reasoning-Kapazität für eine tiefe Synthese. Antworte präzise auf Deutsch.
|
||||
|
||||
"meta-llama/llama-3.3-70b-instruct:free": |
|
||||
Erstelle eine fundierte Synthese für die Frage: "{query}"
|
||||
Nutze die Daten: {facts_stream}, {biography_stream} und {tech_stream}.
|
||||
Trenne klare Fakten von Erfahrungen. Bleibe strikt beim bereitgestellten Kontext.
|
||||
|
||||
# --- EXAKTE Provider-Fallbacks aus v3.1.2 ---
|
||||
ollama: |
|
||||
WISSENS-STREAMS:
|
||||
=========================================
|
||||
FAKTEN & STATUS:
|
||||
{facts_stream}
|
||||
|
||||
ERFAHRUNG & BIOGRAFIE:
|
||||
{biography_stream}
|
||||
|
||||
WISSEN & TECHNIK:
|
||||
{tech_stream}
|
||||
=========================================
|
||||
|
||||
FRAGE:
|
||||
{query}
|
||||
|
||||
ANWEISUNG:
|
||||
Beantworte die Frage präzise basierend auf den Quellen.
|
||||
Kombiniere harte Fakten mit persönlichen Erfahrungen, falls vorhanden.
|
||||
Fasse die Informationen zusammen. Sei objektiv und neutral.
|
||||
|
||||
gemini: |
|
||||
Beantworte die Wissensabfrage "{query}" basierend auf diesen Streams:
|
||||
FAKTEN: {facts_stream}
|
||||
BIOGRAFIE/ERFAHRUNG: {biography_stream}
|
||||
TECHNIK: {tech_stream}
|
||||
Kombiniere harte Fakten mit persönlichen Erfahrungen, falls vorhanden. Antworte strukturiert und präzise.
|
||||
|
||||
openrouter: |
|
||||
Synthese der Wissens-Streams für: {query}
|
||||
Inhalt: {facts_stream} | {biography_stream} | {tech_stream}
|
||||
Antworte basierend auf dem bereitgestellten Kontext.
|
||||
|
||||
default: "Beantworte {query} basierend auf dem Kontext: {facts_stream} {biography_stream} {tech_stream}."
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# 2. DECISION: Strategie & Abwägung (Intent: DECISION)
|
||||
# ---------------------------------------------------------
|
||||
decision_synthesis_v1:
|
||||
# --- Modell-spezifisch (WP-25b Optimierung) ---
|
||||
"google/gemini-2.0-flash-exp:free": |
|
||||
Agiere als strategischer Partner für: {query}
|
||||
WERTE: {values_stream} | FAKTEN: {facts_stream} | RISIKEN: {risk_stream}
|
||||
Prüfe die Fakten gegen meine Werte. Zeige Zielkonflikte auf. Gib eine klare Empfehlung.
|
||||
|
||||
# --- EXAKTE Provider-Fallbacks aus v3.1.2 ---
|
||||
ollama: |
|
||||
ENTSCHEIDUNGS-STREAMS:
|
||||
=========================================
|
||||
WERTE & PRINZIPIEN (Identität):
|
||||
{values_stream}
|
||||
|
||||
OPERATIVE FAKTEN (Realität):
|
||||
{facts_stream}
|
||||
|
||||
RISIKO-RADAR (Konsequenzen):
|
||||
{risk_stream}
|
||||
=========================================
|
||||
|
||||
ENTSCHEIDUNGSFRAGE:
|
||||
{query}
|
||||
|
||||
ANWEISUNG:
|
||||
Du agierst als mein Entscheidungs-Partner.
|
||||
1. Analysiere die Faktenlage aus den Quellen.
|
||||
2. Prüfe dies hart gegen meine strategischen Notizen (Werte & Prinzipien).
|
||||
3. Wäge ab: Passt die technische/faktische Lösung zu meinen Werten?
|
||||
|
||||
FORMAT:
|
||||
- **Analyse:** (Kurze Zusammenfassung der Fakten)
|
||||
- **Abgleich:** (Gibt es Konflikte mit Werten/Zielen? Nenne die Quelle!)
|
||||
- **Empfehlung:** (Klare Meinung: Ja/Nein/Vielleicht mit Begründung)
|
||||
|
||||
gemini: |
|
||||
Agiere als mein strategischer Partner. Analysiere die Frage: {query}
|
||||
Werte: {values_stream} | Fakten: {facts_stream} | Risiken: {risk_stream}.
|
||||
Wäge ab und gib eine klare strategische Empfehlung ab.
|
||||
|
||||
openrouter: |
|
||||
Strategische Multi-Stream Analyse für: {query}
|
||||
Werte-Basis: {values_stream} | Fakten: {facts_stream} | Risiken: {risk_stream}
|
||||
Bitte wäge ab und gib eine Empfehlung.
|
||||
|
||||
default: "Prüfe {query} gegen Werte {values_stream} und Fakten {facts_stream}."
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# 3. EMPATHY: Der Spiegel / "Ich"-Modus (Intent: EMPATHY)
|
||||
# ---------------------------------------------------------
|
||||
empathy_template:
|
||||
# --- EXAKTE Provider-Fallbacks aus v3.1.2 ---
|
||||
ollama: |
|
||||
KONTEXT (ERFAHRUNGEN & WERTE):
|
||||
=========================================
|
||||
ERLEBNISSE & BIOGRAFIE:
|
||||
{biography_stream}
|
||||
|
||||
WERTE & BEDÜRFNISSE:
|
||||
{values_stream}
|
||||
=========================================
|
||||
|
||||
SITUATION:
|
||||
{query}
|
||||
|
||||
ANWEISUNG:
|
||||
Du agierst jetzt als mein empathischer Spiegel.
|
||||
1. Versuche nicht sofort, das Problem technisch zu lösen.
|
||||
2. Zeige Verständnis für die Situation basierend auf meinen eigenen Erfahrungen ([EXPERIENCE]) oder Werten, falls im Kontext vorhanden.
|
||||
3. Antworte in der "Ich"-Form oder "Wir"-Form. Sei unterstützend.
|
||||
|
||||
TONFALL:
|
||||
Ruhig, verständnisvoll, reflektiert. Keine Aufzählungszeichen, sondern fließender Text.
|
||||
|
||||
gemini: "Sei mein digitaler Spiegel für {query}. Kontext: {biography_stream}, {values_stream}"
|
||||
openrouter: "Empathische Reflexion der Situation {query}. Persönlicher Kontext: {biography_stream}, {values_stream}"
|
||||
|
||||
default: "Reflektiere empathisch über {query} basierend auf {biography_stream}."
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# 4. TECHNICAL: Der Coder (Intent: CODING)
|
||||
# ---------------------------------------------------------
|
||||
technical_template:
|
||||
# --- Modell-spezifisch (WP-25b Optimierung) ---
|
||||
"qwen/qwen-2.5-vl-7b-instruct:free": |
|
||||
Du bist Senior Software Engineer. TASK: {query}
|
||||
REFERENZEN: {tech_stream} | KONTEXT: {facts_stream}
|
||||
Generiere validen, performanten Code. Nutze die Snippets aus dem Kontext.
|
||||
|
||||
# --- EXAKTE Provider-Fallbacks aus v3.1.2 ---
|
||||
ollama: |
|
||||
KONTEXT (WISSEN & PROJEKTE):
|
||||
=========================================
|
||||
TECHNIK & SNIPPETS:
|
||||
{tech_stream}
|
||||
|
||||
PROJEKT-STATUS:
|
||||
{facts_stream}
|
||||
=========================================
|
||||
|
||||
TASK:
|
||||
{query}
|
||||
|
||||
ANWEISUNG:
|
||||
Du bist Senior Developer.
|
||||
1. Ignoriere Smalltalk. Komm sofort zum Punkt.
|
||||
2. Generiere validen, performanten Code basierend auf den Quellen.
|
||||
3. Wenn Quellen fehlen, nutze dein allgemeines Programmierwissen, aber weise darauf hin.
|
||||
|
||||
FORMAT:
|
||||
- Kurze Erklärung des Ansatzes.
|
||||
- Markdown Code-Block (Copy-Paste fertig).
|
||||
- Wichtige Edge-Cases.
|
||||
|
||||
gemini: "Generiere Code für {query} unter Berücksichtigung von {tech_stream} und {facts_stream}."
|
||||
openrouter: "Technischer Support für {query}. Referenzen: {tech_stream}, Projekt-Kontext: {facts_stream}"
|
||||
|
||||
default: "Erstelle eine technische Lösung für {query}."
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# 5. INTERVIEW: Der "One-Shot Extractor" (WP-07)
|
||||
# ---------------------------------------------------------
|
||||
interview_template:
|
||||
# --- EXAKTE Provider-Fallbacks aus v3.1.2 ---
|
||||
ollama: |
|
||||
TASK:
|
||||
Du bist ein professioneller Ghostwriter. Verwandle den "USER INPUT" in eine strukturierte Notiz vom Typ '{target_type}'.
|
||||
|
||||
STRUKTUR (Nutze EXAKT diese Überschriften):
|
||||
{schema_fields}
|
||||
|
||||
USER INPUT:
|
||||
"{query}"
|
||||
|
||||
ANWEISUNG ZUM INHALT:
|
||||
1. Analysiere den Input genau.
|
||||
2. Schreibe die Inhalte unter die passenden Überschriften aus der STRUKTUR-Liste oben.
|
||||
3. STIL: Schreibe flüssig, professionell und in der Ich-Perspektive. Korrigiere Grammatikfehler, aber behalte den persönlichen Ton bei.
|
||||
4. Wenn Informationen für einen Abschnitt fehlen, schreibe nur: "[TODO: Ergänzen]". Erfinde nichts dazu.
|
||||
|
||||
OUTPUT FORMAT (YAML + MARKDOWN):
|
||||
---
|
||||
type: {target_type}
|
||||
status: draft
|
||||
title: (Erstelle einen treffenden, kurzen Titel für den Inhalt)
|
||||
tags: [Tag1, Tag2]
|
||||
---
|
||||
|
||||
# (Wiederhole den Titel hier)
|
||||
|
||||
## (Erster Begriff aus STRUKTUR)
|
||||
(Text...)
|
||||
|
||||
## (Zweiter Begriff aus STRUKTUR)
|
||||
(Text...)
|
||||
|
||||
gemini: "Extrahiere Daten für {target_type} aus {query}."
|
||||
openrouter: "Strukturiere den Input {query} nach dem Schema {schema_fields} für Typ {target_type}."
|
||||
|
||||
default: "Extrahiere Informationen für {target_type} aus dem Input: {query}"
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# 6. WP-25b: PRE-SYNTHESIS COMPRESSION (Neu!)
|
||||
# ---------------------------------------------------------
|
||||
compression_template:
|
||||
"mistralai/mistral-7b-instruct:free": |
|
||||
Reduziere den Stream '{stream_name}' auf die Informationen, die für die Beantwortung der Frage '{query}' absolut notwendig sind.
|
||||
BEHALTE: Harte Fakten, Projektnamen, konkrete Werte und Quellenangaben.
|
||||
ENTFERNE: Redundante Einleitungen, Füllwörter und irrelevante Details.
|
||||
|
||||
INHALT:
|
||||
{content}
|
||||
|
||||
KOMPRIMIERTE ANALYSE:
|
||||
|
||||
default: "Fasse das Wichtigste aus {stream_name} für die Frage {query} kurz zusammen: {content}"
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# 7. EDGE_ALLOCATION: Kantenfilter (Ingest)
|
||||
# ---------------------------------------------------------
|
||||
edge_allocation_template:
|
||||
ollama: |
|
||||
TASK:
|
||||
Du bist ein strikter Selektor. Du erhältst eine Liste von "Kandidaten-Kanten" (Strings).
|
||||
Wähle jene aus, die inhaltlich im "Textabschnitt" vorkommen oder relevant sind.
|
||||
|
||||
TEXTABSCHNITT:
|
||||
"""
|
||||
{chunk_text}
|
||||
"""
|
||||
|
||||
KANDIDATEN (Auswahl-Pool):
|
||||
{edge_list}
|
||||
|
||||
REGELN:
|
||||
1. Die Kanten haben das Format "typ:ziel". Der "typ" ist variabel und kann ALLES sein.
|
||||
2. Gib NUR die Strings aus der Kandidaten-Liste zurück, die zum Text passen.
|
||||
3. Erfinde KEINE neuen Kanten.
|
||||
4. Antworte als flache JSON-Liste.
|
||||
|
||||
DEIN OUTPUT (JSON):
|
||||
|
||||
gemini: |
|
||||
TASK: Ordne Kanten einem Textabschnitt zu.
|
||||
ERLAUBTE TYPEN: {valid_types}
|
||||
TEXT: {chunk_text}
|
||||
KANDIDATEN: {edge_list}
|
||||
OUTPUT: STRIKT eine flache JSON-Liste ["typ:ziel"]. Kein Text davor/danach. Wenn nichts: []. Keine Objekte!
|
||||
|
||||
openrouter: |
|
||||
TASK: Filtere relevante Kanten aus dem Pool.
|
||||
ERLAUBTE TYPEN: {valid_types}
|
||||
TEXT: {chunk_text}
|
||||
POOL: {edge_list}
|
||||
ANWEISUNG: Gib NUR eine flache JSON-Liste von Strings zurück.
|
||||
BEISPIEL: ["kind:target", "kind:target"]
|
||||
REGEL: Kein Text, keine Analyse, keine Kommentare. Wenn nichts passt, gib [] zurück.
|
||||
OUTPUT:
|
||||
|
||||
default: "[]"
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# 8. SMART EDGE ALLOCATION: Extraktion (Ingest)
|
||||
# ---------------------------------------------------------
|
||||
edge_extraction:
|
||||
ollama: |
|
||||
TASK:
|
||||
Du bist ein Wissens-Ingenieur für den digitalen Zwilling 'mindnet'.
|
||||
Deine Aufgabe ist es, semantische Relationen (Kanten) aus dem Text zu extrahieren,
|
||||
die die Hauptnotiz '{note_id}' mit anderen Konzepten verbinden.
|
||||
|
||||
ANWEISUNGEN:
|
||||
1. Identifiziere wichtige Entitäten, Konzepte oder Ereignisse im Text.
|
||||
2. Bestimme die Art der Beziehung (z.B. part_of, uses, related_to, blocks, caused_by).
|
||||
3. Das Ziel (target) muss ein prägnanter Begriff sein.
|
||||
4. Antworte AUSSCHLIESSLICH in validem JSON als Liste von Objekten.
|
||||
|
||||
BEISPIEL:
|
||||
[ {{"to": "Ziel-Konzept", "kind": "beziehungs_typ"}} ]
|
||||
|
||||
TEXT:
|
||||
"""
|
||||
{text}
|
||||
"""
|
||||
|
||||
DEIN OUTPUT (JSON):
|
||||
|
||||
gemini: |
|
||||
Analysiere '{note_id}'. Extrahiere semantische Beziehungen.
|
||||
ERLAUBTE TYPEN: {valid_types}
|
||||
TEXT: {text}
|
||||
OUTPUT: STRIKT JSON-Array von Objekten: [{{"to": "Ziel", "kind": "typ"}}]. Kein Text davor/danach. Wenn nichts: [].
|
||||
|
||||
openrouter: |
|
||||
TASK: Extrahiere semantische Relationen für '{note_id}'.
|
||||
ERLAUBTE TYPEN: {valid_types}
|
||||
TEXT: {text}
|
||||
ANWEISUNG: Antworte AUSSCHLIESSLICH mit einem JSON-Array von Objekten.
|
||||
FORMAT: [{{"to": "Ziel-Begriff", "kind": "typ"}}]
|
||||
STRIKTES VERBOT: Schreibe keine Einleitung, keine Analyse und keine Erklärungen.
|
||||
Wenn keine Relationen existieren, antworte NUR mit: []
|
||||
OUTPUT:
|
||||
|
||||
default: "[]"
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# 9. INGESTION: EDGE VALIDATION (Ingest/Validate)
|
||||
# ---------------------------------------------------------
|
||||
edge_validation:
|
||||
# --- Modell-spezifisch (WP-25b Optimierung) ---
|
||||
"mistralai/mistral-7b-instruct:free": |
|
||||
Verify relation '{edge_kind}' for graph integrity.
|
||||
Chunk: "{chunk_text}"
|
||||
Target: "{target_title}" ({target_summary})
|
||||
Respond ONLY with 'YES' or 'NO'.
|
||||
|
||||
# --- EXAKTE Provider-Fallbacks aus v3.1.2 ---
|
||||
gemini: |
|
||||
Bewerte die semantische Validität dieser Verbindung im Wissensgraph.
|
||||
|
||||
KONTEXT DER QUELLE (Chunk):
|
||||
"{chunk_text}"
|
||||
|
||||
ZIEL-NOTIZ: "{target_title}"
|
||||
ZIEL-BESCHREIBUNG (Zusammenfassung):
|
||||
"{target_summary}"
|
||||
|
||||
GEPLANTE RELATION: "{edge_kind}"
|
||||
|
||||
FRAGE: Bestätigt der Kontext der Quelle die Beziehung '{edge_kind}' zum Ziel?
|
||||
REGEL: Antworte NUR mit 'YES' oder 'NO'. Keine Erklärungen oder Smalltalk.
|
||||
|
||||
openrouter: |
|
||||
Verify semantic relation for graph construction.
|
||||
Source Context: {chunk_text}
|
||||
Target Note: {target_title}
|
||||
Target Summary: {target_summary}
|
||||
Proposed Relation: {edge_kind}
|
||||
Instruction: Does the source context support this relation to the target?
|
||||
Result: Respond ONLY with 'YES' or 'NO'.
|
||||
|
||||
ollama: |
|
||||
Bewerte die semantische Korrektheit dieser Verbindung.
|
||||
QUELLE: {chunk_text}
|
||||
ZIEL: {target_title} ({target_summary})
|
||||
BEZIEHUNG: {edge_kind}
|
||||
Ist diese Verbindung valide? Antworte NUR mit YES oder NO.
|
||||
|
||||
default: "YES"
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# 10. WP-25: INTENT ROUTING (Intent: CLASSIFY)
|
||||
# ---------------------------------------------------------
|
||||
intent_router_v1:
|
||||
# --- Modell-spezifisch (WP-25b Optimierung) ---
|
||||
"mistralai/mistral-7b-instruct:free": |
|
||||
Classify query "{query}" into exactly one of these categories:
|
||||
FACT_WHEN, FACT_WHAT, DECISION, EMPATHY, CODING, INTERVIEW.
|
||||
Respond with the category name only.
|
||||
|
||||
# --- EXAKTE Provider-Fallbacks aus v3.1.2 ---
|
||||
ollama: |
|
||||
Analysiere die Nutzeranfrage und wähle die passende Strategie.
|
||||
Antworte NUR mit dem Namen der Strategie.
|
||||
|
||||
STRATEGIEN:
|
||||
- FACT_WHEN: Nur für explizite Fragen nach einem exakten Datum, Uhrzeit oder dem "Wann" eines Ereignisses.
|
||||
- FACT_WHAT: Fragen nach Inhalten, Listen von Objekten/Projekten, Definitionen oder "Was/Welche" Anfragen (auch bei Zeiträumen).
|
||||
- DECISION: Rat, Meinung, "Soll ich?", Abwägung gegen Werte.
|
||||
- EMPATHY: Emotionen, Reflexion, Befindlichkeit.
|
||||
- CODING: Programmierung, Skripte, technische Syntax.
|
||||
- INTERVIEW: Dokumentation neuer Informationen, Notizen anlegen.
|
||||
|
||||
NACHRICHT: "{query}"
|
||||
STRATEGIE:
|
||||
|
||||
gemini: |
|
||||
Classify intent:
|
||||
- FACT_WHEN: Exact dates/times only.
|
||||
- FACT_WHAT: Content, lists of entities (projects, etc.), definitions, "What/Which" queries.
|
||||
- DECISION: Strategic advice/values.
|
||||
- EMPATHY: Emotions.
|
||||
- CODING: Tech/Code.
|
||||
- INTERVIEW: Data entry.
|
||||
Query: "{query}"
|
||||
Result (One word only):
|
||||
|
||||
openrouter: |
|
||||
Select strategy for Mindnet:
|
||||
FACT_WHEN (timing/dates), FACT_WHAT (entities/lists/what/which), DECISION, EMPATHY, CODING, INTERVIEW.
|
||||
Query: "{query}"
|
||||
Response:
|
||||
|
||||
default: "FACT_WHAT"
|
||||
|
||||
# ---------------------------------------------------------
|
||||
# 11. WP-25b: FALLBACK SYNTHESIS (Error Recovery)
|
||||
# ---------------------------------------------------------
|
||||
fallback_synthesis:
|
||||
ollama: |
|
||||
Beantworte die folgende Frage basierend auf dem bereitgestellten Kontext.
|
||||
|
||||
FRAGE:
|
||||
{query}
|
||||
|
||||
KONTEXT:
|
||||
{context}
|
||||
|
||||
ANWEISUNG:
|
||||
Nutze den Kontext, um eine präzise Antwort zu geben. Falls der Kontext unvollständig ist, weise darauf hin.
|
||||
|
||||
gemini: |
|
||||
Frage: {query}
|
||||
Kontext: {context}
|
||||
Antworte basierend auf dem Kontext.
|
||||
|
||||
openrouter: |
|
||||
Answer the question "{query}" using the provided context: {context}
|
||||
|
||||
default: "Answer: {query}\n\nContext: {context}"
|
||||
32
config/prompts.yaml_old
Normal file
32
config/prompts.yaml_old
Normal file
|
|
@ -0,0 +1,32 @@
|
|||
# config/prompts.yaml — Persönlichkeit & RAG-Strategie
|
||||
# Version: 2.0 (Audit Update)
|
||||
|
||||
system_prompt: |
|
||||
Du bist 'mindnet', ein persönliches KI-Gedächtnis und der Digitale Zwilling deines Erschaffers ("User").
|
||||
|
||||
DEINE PERSÖNLICHKEIT & WERTE:
|
||||
1. Pragmatismus: Du bevorzugst funktionierende Lösungen über theoretische Perfektion.
|
||||
2. Transparenz: Du erfindest keine Fakten. Wenn Informationen im Kontext fehlen, sagst du das klar.
|
||||
3. Vernetztes Denken: Du suchst aktiv nach Verbindungen zwischen den bereitgestellten Notizen.
|
||||
4. Erklärbarkeit: Wenn du eine Aussage machst, beziehst du dich implizit auf die Quelle (z.B. "Wie in Projekt X definiert...").
|
||||
|
||||
DEINE AUFGABE:
|
||||
Beantworte die Frage des Users ausschließlich basierend auf dem untenstehenden KONTEXT.
|
||||
Der Kontext besteht aus Auszügen verschiedener Notizen. Achte besonders auf den [TYPE] der Notiz:
|
||||
- [DECISION] erklärt das "Warum".
|
||||
- [PROJECT] erklärt das "Was" und "Wann".
|
||||
- [CONCEPT] liefert Definitionen.
|
||||
- [VALUE] definiert die moralische/strategische Ausrichtung.
|
||||
|
||||
rag_template: |
|
||||
HINTERGRUNDWISSEN (KONTEXT):
|
||||
=========================================
|
||||
{context_str}
|
||||
=========================================
|
||||
|
||||
FRAGE DES USERS:
|
||||
{query}
|
||||
|
||||
ANWEISUNG:
|
||||
Analysiere die Quellen oben. Synthetisiere eine Antwort, die die Frage präzise beantwortet.
|
||||
Nutze Markdown für Struktur (Fettgedrucktes für Wichtiges).
|
||||
24
config/prompts.yaml_old2
Normal file
24
config/prompts.yaml_old2
Normal file
|
|
@ -0,0 +1,24 @@
|
|||
system_prompt: |
|
||||
Du bist 'mindnet', ein persönliches KI-Gedächtnis.
|
||||
|
||||
DEINE REGELN:
|
||||
1. Antworte NUR basierend auf dem untenstehenden KONTEXT.
|
||||
2. Achte auf den TYP der Quelle:
|
||||
- [DECISION] enthält Begründungen (Warum?).
|
||||
- [PROJECT] enthält Ziele (Was?).
|
||||
- [CONCEPT] enthält Definitionen.
|
||||
3. Sei präzise. Nenne konkrete technische Gründe, wenn sie im Text stehen.
|
||||
4. Halluziniere nicht.
|
||||
|
||||
rag_template: |
|
||||
KONTEXT (WISSEN):
|
||||
=========================================
|
||||
{context_str}
|
||||
=========================================
|
||||
|
||||
FRAGE:
|
||||
{query}
|
||||
|
||||
ANWEISUNG:
|
||||
Analysiere den Kontext. Wenn eine [DECISION] Quelle dabei ist, nutze deren Inhalt für die Begründung.
|
||||
Antworte kurz und präzise auf Deutsch.
|
||||
59
config/retriever.yaml
Normal file
59
config/retriever.yaml
Normal file
|
|
@ -0,0 +1,59 @@
|
|||
version: 1.2
|
||||
|
||||
scoring:
|
||||
# W_sem: skaliert den Term (semantic_score * retriever_weight)
|
||||
# Empfehlung Startwert: 1.0 → Semantik bleibt Hauptsignal
|
||||
semantic_weight: 1.0
|
||||
|
||||
# W_edge: skaliert edge_bonus aus dem Subgraph
|
||||
# Empfehlung: 0.8 → Graph ist deutlich spürbar, aber überstimmt Semantik nicht komplett
|
||||
edge_weight: 0.8
|
||||
|
||||
# W_cent: skaliert centrality_bonus (Knoten-Zentralität im Subgraph)
|
||||
# Empfehlung: 0.5 → zentrale Knoten werden bevorzugt, aber moderat
|
||||
centrality_weight: 0.5
|
||||
|
||||
# WP-22 Stellschraube: Lifecycle (Status-basiertes Scoring)
|
||||
# Bonus für verifiziertes Wissen, Malus für Entwürfe
|
||||
lifecycle_weights:
|
||||
stable: 1.2 # +20% Bonus
|
||||
active: 1.0 # Standardwert
|
||||
draft: 0.5 # -50% Malus
|
||||
system: 0.0 # Hard Skip via Ingestion
|
||||
|
||||
# Die nachfolgenden Werte überschreiben die Defaults aus app/core/retriever_config.
|
||||
# Wenn neue Kantentypen, z.B. durch Referenzierung innerhalb einer md-Datei im vault anders gewichtet werden sollen, dann muss hier die Konfiguration erfolgen
|
||||
edge_types:
|
||||
# --- KATEGORIE 1: LOGIK-BOOSTS (Relevanz-Treiber) ---
|
||||
# Diese Kanten haben die Kraft, das semantische Ranking aktiv umzugestalten.
|
||||
blocks: 1.6 # Kritisch: Risiken/Blocker müssen sofort sichtbar sein.
|
||||
solves: 1.5 # Zielführend: Lösungen sind primäre Suchziele.
|
||||
depends_on: 1.4 # Logisch: Harte fachliche Abhängigkeit.
|
||||
resulted_in: 1.4 # Kausal: Ergebnisse und unmittelbare Konsequenzen.
|
||||
followed_by: 1.3 # Sequenziell (User): Bewusst gesteuerte Wissenspfade.
|
||||
caused_by: 1.2 # Kausal: Ursachen-Bezug (Basis für Intent-Boost).
|
||||
preceded_by: 1.1 # Sequenziell (User): Rückwärts-Bezug in Logik-Ketten.
|
||||
impacts: 1.2 # Langfristige Auswirkung/Einfluss
|
||||
|
||||
# --- KATEGORIE 2: QUALITATIVER KONTEXT (Stabilitäts-Stützen) ---
|
||||
# Diese Kanten liefern wichtigen Kontext, ohne das Ergebnis zu verfälschen.
|
||||
guides: 1.1 # Qualitativ: Prinzipien oder Werte leiten das Thema.
|
||||
part_of: 1.1 # Strukturell: Zieht übergeordnete Kontexte (Parents) mit hoch.
|
||||
based_on: 0.8 # Fundament: Bezug auf Basis-Werte (kalibriert auf Safe-Retrieval).
|
||||
derived_from: 0.6 # Historisch: Dokumentiert die Herkunft von Wissen.
|
||||
uses: 0.6 # Instrumentell: Genutzte Werkzeuge, Methoden oder Ressourcen.
|
||||
|
||||
# --- KATEGORIE 3: THEMATISCHE NÄHE (Ähnlichkeits-Signal) ---
|
||||
# Diese Werte verhindern den "Drift" in fachfremde Bereiche.
|
||||
similar_to: 0.4 # Analytisch: Thematische Nähe (oft KI-generiert).
|
||||
|
||||
# --- KATEGORIE 4: SYSTEM-NUDGES (Technische Struktur) ---
|
||||
# Reine Orientierungshilfen für das System; fast kein Einfluss auf das Ranking.
|
||||
belongs_to: 0.2 # System: Verknüpft Chunks mit der Note (Metadaten-Träger).
|
||||
next: 0.1 # System: Technische Lesereihenfolge der Absätze.
|
||||
prev: 0.1 # System: Technische Lesereihenfolge der Absätze.
|
||||
|
||||
# --- KATEGORIE 5: WEICHE ASSOZIATIONEN (Rausch-Unterdrückung) ---
|
||||
# Verhindert, dass lose Verknüpfungen das Ergebnis "verwässern".
|
||||
references: 0.1 # Assoziativ: Einfacher Querverweis oder Erwähnung.
|
||||
related_to: 0.05 # Minimal: Schwächste thematische Verbindung.
|
||||
310
config/types.yaml
Normal file
310
config/types.yaml
Normal file
|
|
@ -0,0 +1,310 @@
|
|||
version: 2.7.0 # WP-14 Update: Dynamisierung der Ingestion-Pipeline
|
||||
|
||||
# ==============================================================================
|
||||
# 1. CHUNKING PROFILES
|
||||
# ==============================================================================
|
||||
|
||||
chunking_profiles:
|
||||
|
||||
# A. SHORT & FAST
|
||||
sliding_short:
|
||||
strategy: sliding_window
|
||||
enable_smart_edge_allocation: false
|
||||
target: 200
|
||||
max: 350
|
||||
overlap: [30, 50]
|
||||
|
||||
# B. STANDARD & FAST
|
||||
sliding_standard:
|
||||
strategy: sliding_window
|
||||
enable_smart_edge_allocation: false
|
||||
target: 450
|
||||
max: 650
|
||||
overlap: [50, 100]
|
||||
|
||||
# C. SMART FLOW (Text-Fluss)
|
||||
sliding_smart_edges:
|
||||
strategy: sliding_window
|
||||
enable_smart_edge_allocation: true
|
||||
target: 400
|
||||
max: 600
|
||||
overlap: [50, 80]
|
||||
|
||||
# D. SMART STRUCTURE (Soft Split)
|
||||
structured_smart_edges:
|
||||
strategy: by_heading
|
||||
enable_smart_edge_allocation: true
|
||||
split_level: 2
|
||||
strict_heading_split: false
|
||||
max: 600
|
||||
target: 400
|
||||
overlap: [50, 80]
|
||||
|
||||
# E. SMART STRUCTURE STRICT (H2 Hard Split)
|
||||
structured_smart_edges_strict:
|
||||
strategy: by_heading
|
||||
enable_smart_edge_allocation: true
|
||||
split_level: 2
|
||||
strict_heading_split: true # Hard Mode
|
||||
max: 600
|
||||
target: 400
|
||||
overlap: [50, 80]
|
||||
|
||||
# F. SMART STRUCTURE DEEP (H3 Hard Split + Merge-Check)
|
||||
structured_smart_edges_strict_L3:
|
||||
strategy: by_heading
|
||||
enable_smart_edge_allocation: true
|
||||
split_level: 3
|
||||
strict_heading_split: true
|
||||
max: 600
|
||||
target: 400
|
||||
overlap: [50, 80]
|
||||
|
||||
# ==============================================================================
|
||||
# 2. DEFAULTS
|
||||
# ==============================================================================
|
||||
defaults:
|
||||
retriever_weight: 1.0
|
||||
chunking_profile: sliding_standard
|
||||
|
||||
# ==============================================================================
|
||||
# 3. INGESTION SETTINGS (WP-14 Dynamization)
|
||||
# ==============================================================================
|
||||
ingestion_settings:
|
||||
ignore_statuses: ["system", "template", "archive", "hidden"]
|
||||
default_note_type: "concept"
|
||||
|
||||
# ==============================================================================
|
||||
# 4. SUMMARY & SCAN SETTINGS
|
||||
# ==============================================================================
|
||||
summary_settings:
|
||||
max_summary_length: 500
|
||||
pre_scan_depth: 600
|
||||
|
||||
# ==============================================================================
|
||||
# 5. LLM SETTINGS
|
||||
# ==============================================================================
|
||||
llm_settings:
|
||||
cleanup_patterns: ["<s>", "</s>", "[OUT]", "[/OUT]", "```json", "```"]
|
||||
|
||||
# ==============================================================================
|
||||
# 6. TYPE DEFINITIONS
|
||||
# ==============================================================================
|
||||
|
||||
types:
|
||||
|
||||
experience:
|
||||
chunking_profile: structured_smart_edges
|
||||
retriever_weight: 1.10
|
||||
detection_keywords: ["erleben", "reagieren", "handeln", "prägen", "reflektieren"]
|
||||
schema:
|
||||
- "Situation (Was ist passiert?)"
|
||||
- "Meine Reaktion (Was habe ich getan?)"
|
||||
- "Ergebnis & Auswirkung"
|
||||
- "Reflexion & Learning (Was lerne ich daraus?)"
|
||||
|
||||
insight:
|
||||
chunking_profile: structured_smart_edges
|
||||
retriever_weight: 1.20
|
||||
detection_keywords: ["beobachten", "erkennen", "verstehen", "analysieren", "schlussfolgern"]
|
||||
schema:
|
||||
- "Beobachtung (Was sehe ich?)"
|
||||
- "Interpretation (Was bedeutet das?)"
|
||||
- "Bedürfnis (Was steckt dahinter?)"
|
||||
- "Handlungsempfehlung"
|
||||
|
||||
project:
|
||||
chunking_profile: structured_smart_edges
|
||||
retriever_weight: 0.97
|
||||
detection_keywords: ["umsetzen", "planen", "starten", "bauen", "abschließen"]
|
||||
schema:
|
||||
- "Mission & Zielsetzung"
|
||||
- "Aktueller Status & Blockaden"
|
||||
- "Nächste konkrete Schritte"
|
||||
|
||||
decision:
|
||||
chunking_profile: structured_smart_edges_strict
|
||||
retriever_weight: 1.00
|
||||
detection_keywords: ["entscheiden", "wählen", "abwägen", "priorisieren", "festlegen"]
|
||||
schema:
|
||||
- "Kontext & Problemstellung"
|
||||
- "Betrachtete Optionen"
|
||||
- "Die Entscheidung"
|
||||
- "Begründung"
|
||||
|
||||
value:
|
||||
chunking_profile: structured_smart_edges_strict
|
||||
retriever_weight: 1.00
|
||||
detection_keywords: ["werten", "achten", "verpflichten", "bedeuten"]
|
||||
schema:
|
||||
- "Definition"
|
||||
- "Warum mir das wichtig ist"
|
||||
- "Leitsätze"
|
||||
|
||||
principle:
|
||||
chunking_profile: structured_smart_edges_strict_L3
|
||||
retriever_weight: 0.95
|
||||
detection_keywords: ["leiten", "steuern", "ausrichten", "handhaben"]
|
||||
schema:
|
||||
- "Das Prinzip"
|
||||
- "Anwendung & Beispiele"
|
||||
|
||||
trait:
|
||||
chunking_profile: structured_smart_edges_strict
|
||||
retriever_weight: 1.10
|
||||
detection_keywords: ["begeistern", "können", "auszeichnen", "befähigen", "stärken"]
|
||||
schema:
|
||||
- "Eigenschaft / Talent"
|
||||
- "Beispiele aus der Praxis"
|
||||
- "Potenzial für die Zukunft"
|
||||
|
||||
obstacle:
|
||||
chunking_profile: structured_smart_edges_strict
|
||||
retriever_weight: 1.00
|
||||
detection_keywords: ["blockieren", "fürchten", "vermeiden", "hindern", "zweifeln"]
|
||||
schema:
|
||||
- "Beschreibung der Hürde"
|
||||
- "Ursprung / Auslöser"
|
||||
- "Auswirkung auf Ziele"
|
||||
- "Gegenstrategie"
|
||||
|
||||
belief:
|
||||
chunking_profile: sliding_short
|
||||
retriever_weight: 0.90
|
||||
detection_keywords: ["glauben", "meinen", "annehmen", "überzeugen"]
|
||||
schema:
|
||||
- "Der Glaubenssatz"
|
||||
- "Ursprung & Reflexion"
|
||||
|
||||
profile:
|
||||
chunking_profile: structured_smart_edges_strict
|
||||
retriever_weight: 0.70
|
||||
detection_keywords: ["verkörpern", "verantworten", "agieren", "repräsentieren"]
|
||||
schema:
|
||||
- "Rolle / Identität"
|
||||
- "Fakten & Daten"
|
||||
- "Historie"
|
||||
|
||||
idea:
|
||||
chunking_profile: sliding_short
|
||||
retriever_weight: 0.70
|
||||
detection_keywords: ["einfall", "gedanke", "potenzial", "möglichkeit"]
|
||||
schema:
|
||||
- "Der Kerngedanke"
|
||||
- "Potenzial & Auswirkung"
|
||||
- "Nächste Schritte"
|
||||
|
||||
skill:
|
||||
chunking_profile: sliding_smart_edges
|
||||
retriever_weight: 0.90
|
||||
detection_keywords: ["lernen", "beherrschen", "üben", "fertigkeit", "kompetenz"]
|
||||
schema:
|
||||
- "Definition der Fähigkeit"
|
||||
- "Aktueller Stand & Lernpfad"
|
||||
- "Evidenz (Proof of Work)"
|
||||
|
||||
habit:
|
||||
chunking_profile: sliding_short
|
||||
retriever_weight: 0.85
|
||||
detection_keywords: ["gewohnheit", "routine", "automatismus", "immer wenn"]
|
||||
schema:
|
||||
- "Auslöser (Trigger)"
|
||||
- "Routine (Handlung)"
|
||||
- "Belohnung (Reward)"
|
||||
- "Strategie"
|
||||
|
||||
need:
|
||||
chunking_profile: structured_smart_edges
|
||||
retriever_weight: 1.05
|
||||
detection_keywords: ["bedürfnis", "brauchen", "mangel", "erfüllung"]
|
||||
schema:
|
||||
- "Das Bedürfnis"
|
||||
- "Zustand (Mangel vs. Erfüllung)"
|
||||
- "Bezug zu Werten"
|
||||
|
||||
motivation:
|
||||
chunking_profile: structured_smart_edges
|
||||
retriever_weight: 0.95
|
||||
detection_keywords: ["motivation", "antrieb", "warum", "energie"]
|
||||
schema:
|
||||
- "Der Antrieb"
|
||||
- "Zielbezug"
|
||||
- "Energiequelle"
|
||||
|
||||
bias:
|
||||
chunking_profile: sliding_short
|
||||
retriever_weight: 0.80
|
||||
detection_keywords: ["denkfehler", "verzerrung", "vorurteil", "falle"]
|
||||
schema: ["Beschreibung der Verzerrung", "Typische Situationen", "Gegenstrategie"]
|
||||
|
||||
state:
|
||||
chunking_profile: sliding_short
|
||||
retriever_weight: 0.60
|
||||
detection_keywords: ["stimmung", "energie", "gefühl", "verfassung"]
|
||||
schema: ["Aktueller Zustand", "Auslöser", "Auswirkung auf den Tag"]
|
||||
|
||||
boundary:
|
||||
chunking_profile: structured_smart_edges
|
||||
retriever_weight: 0.90
|
||||
detection_keywords: ["grenze", "nein sagen", "limit", "schutz"]
|
||||
schema: ["Die Grenze", "Warum sie wichtig ist", "Konsequenz bei Verletzung"]
|
||||
|
||||
goal:
|
||||
chunking_profile: structured_smart_edges
|
||||
retriever_weight: 0.95
|
||||
detection_keywords: ["ziel", "zielzustand", "kpi", "zeitrahmen", "deadline", "meilenstein"]
|
||||
schema: ["Zielzustand", "Zeitrahmen & KPIs", "Motivation"]
|
||||
|
||||
risk:
|
||||
chunking_profile: sliding_short
|
||||
retriever_weight: 0.85
|
||||
detection_keywords: ["risiko", "gefahr", "bedrohung"]
|
||||
schema: ["Beschreibung des Risikos", "Auswirkungen", "Gegenmaßnahmen"]
|
||||
|
||||
concept:
|
||||
chunking_profile: structured_smart_edges
|
||||
retriever_weight: 0.6
|
||||
detection_keywords: ["definition", "konzept", "begriff", "modell", "rahmen", "theorie"]
|
||||
schema: ["Definition", "Kontext", "Verwandte Konzepte"]
|
||||
|
||||
task:
|
||||
chunking_profile: sliding_short
|
||||
retriever_weight: 0.8
|
||||
detection_keywords: ["aufgabe", "todo", "next_action", "erledigen", "definition_of_done", "checkliste"]
|
||||
schema: ["Aufgabe", "Kontext", "Definition of Done"]
|
||||
|
||||
journal:
|
||||
chunking_profile: sliding_standard
|
||||
retriever_weight: 0.8
|
||||
detection_keywords: ["journal", "tagebuch", "log", "eintrag", "reflexion", "heute"]
|
||||
schema: ["Log-Eintrag", "Gedanken"]
|
||||
|
||||
source:
|
||||
chunking_profile: sliding_standard
|
||||
retriever_weight: 0.5
|
||||
detection_keywords: ["quelle", "paper", "buch", "artikel", "link", "zitat", "studie"]
|
||||
schema: ["Metadaten", "Zusammenfassung", "Zitate"]
|
||||
|
||||
glossary:
|
||||
chunking_profile: sliding_short
|
||||
retriever_weight: 0.4
|
||||
detection_keywords: ["glossar", "begriff", "definition", "terminologie"]
|
||||
schema: ["Begriff", "Definition"]
|
||||
|
||||
person:
|
||||
chunking_profile: sliding_standard
|
||||
retriever_weight: 0.5
|
||||
detection_keywords: ["person", "mensch", "kontakt", "name", "beziehung", "stakeholder"]
|
||||
schema: ["Profile", "Beziehung", "Kontext"]
|
||||
|
||||
event:
|
||||
chunking_profile: sliding_standard
|
||||
retriever_weight: 0.6
|
||||
detection_keywords: ["ereignis", "termin", "datum", "ort", "teilnehmer", "meeting"]
|
||||
schema: ["Datum & Ort", "Teilnehmer", "Ergebnisse"]
|
||||
|
||||
default:
|
||||
chunking_profile: sliding_standard
|
||||
retriever_weight: 1.0
|
||||
detection_keywords: []
|
||||
schema: ["Inhalt"]
|
||||
1
debug.log
Normal file
1
debug.log
Normal file
|
|
@ -0,0 +1 @@
|
|||
[0114/152756.633:ERROR:third_party\crashpad\crashpad\util\win\registration_protocol_win.cc:108] CreateFile: Das System kann die angegebene Datei nicht finden. (0x2)
|
||||
69
docs/00_General/00_Marketing_V3.md
Normal file
69
docs/00_General/00_Marketing_V3.md
Normal file
|
|
@ -0,0 +1,69 @@
|
|||
# Mindnet V3.0: Der Aufstieg des Digitalen Zwillings
|
||||
## Von der Wissensdatenbank zum strategischen Partner – Ein Paradigmenwechsel
|
||||
|
||||
### Einleitung: Die Vision von Version 3.0
|
||||
Mit der Vollendung des Meilensteins WP25 (inklusive der Architektur-Erweiterungen 25a und 25b) transformiert sich Mindnet von einem reinen Retrieval-System (V2) zu einem autonomen, agentischen Ökosystem (V3.0). Mindnet V3.0 ist nicht länger nur ein Werkzeug zur Informationswiedergabe; es ist ein **Digitaler Zwilling**, der in der Lage ist, komplexe Realitäten durch Multi-Stream-Analysen zu erfassen, strategische Empfehlungen auf Basis individueller Werte zu geben und eine bisher unerreichte Ausfallsicherheit zu garantieren.
|
||||
|
||||
---
|
||||
|
||||
### Die 6 Säulen der Mindnet V3.0 Architektur
|
||||
|
||||
#### 1. Agentic Multi-Stream Retrieval (WP-25)
|
||||
Das Herzstück von V3.0 ist die neue `DecisionEngine`. Während herkömmliche Systeme lediglich eine einfache Vektorsuche durchführen, orchestriert die DecisionEngine parallele Wissens-Streams:
|
||||
* **Werte-Stream:** Abgleich von Anfragen mit Ihrer ethischen und strategischen Identität.
|
||||
* **Fakten-Stream:** Analyse der operativen Realität und aktueller Projektdaten.
|
||||
* **Biografie-Stream:** Integration persönlicher Erfahrungen und historischer Kontexte.
|
||||
* **Risiko-Radar:** Proaktive Identifikation von Hindernissen und Zielkonflikten.
|
||||
* **Technik-Wissen:** Tiefgreifende fachliche Expertise für spezialisierte Aufgaben.
|
||||
|
||||
Dieses System erlaubt es Mindnet, eine Anfrage aus fünf verschiedenen Perspektiven gleichzeitig zu beleuchten, bevor eine finale Synthese erfolgt.
|
||||
|
||||
#### 2. Mixture of Experts (MoE) & Dynamic Profiling (WP-25a)
|
||||
Mindnet V3.0 nutzt nicht mehr nur "ein" Modell. Über die zentrale Steuerung in der `llm_profiles.yaml` wird für jede Teilaufgabe der ideale "Experte" gerufen:
|
||||
* **Der Architekt (Gemini 2.0 Flash):** Für hochkomplexe reasoning-intensive Synthesen.
|
||||
* **Der Ingenieur (Qwen 2.5):** Spezialisiert auf präzise Code-Generierung und technische Problemlösung.
|
||||
* **Der Dampfhammer (Mistral 7B):** Optimiert für blitzschnelles Routing und asynchrone Inhaltskompression.
|
||||
* **Der Wächter (Phi-3 Mini):** Ein lokales Modell via Ollama, das maximale Privatsphäre für sensible Identitätsdaten garantiert.
|
||||
|
||||
#### 3. Hierarchische Lazy-Prompt-Orchestration (WP-25b)
|
||||
Ein technologisches Highlight ist die Einführung des **Lazy-Promptings**. Prompts werden nicht mehr statisch im Code verwaltet, sondern erst im Moment der Modellauswahl hierarchisch aufgelöst:
|
||||
1. **Modell-Ebene:** Spezifisch für die jeweilige Modell-ID optimierte Instruktionen.
|
||||
2. **Provider-Ebene:** Fallback-Anweisungen für OpenRouter oder Ollama.
|
||||
3. **Global-Ebene:** Sicherheits-Instruktionen als ultimativer Anker.
|
||||
|
||||
Dies garantiert, dass jedes Modell in seiner "Muttersprache" angesprochen wird, was die Antwortqualität drastisch erhöht.
|
||||
|
||||
#### 4. Die unzerstörbare Fallback-Kaskade
|
||||
Resilienz ist in V3.0 kein Schlagwort, sondern ein Algorithmus. Sollte ein Cloud-Anbieter (wie OpenRouter) ausfallen oder in ein Rate-Limit laufen, reagiert das System autonom:
|
||||
* Automatischer Wechsel auf das Backup-Profil (z.B. von Gemini auf Llama).
|
||||
* In letzter Instanz: Rückzug auf die lokale Hardware (Ollama/Phi-3), sodass Mindnet auch offline voll einsatzfähig bleibt.
|
||||
* **Lazy-Re-Formatting:** Beim Wechsel des Modells wird auch der Prompt sofort neu geladen und für das neue Modell optimiert.
|
||||
|
||||
#### 5. Hochpräzises Intent-Routing mit Regex-Cleaning
|
||||
Durch den neuen ultra-robusten Router in der `DecisionEngine` v1.3.2 erkennt Mindnet Nutzerintentionen mit chirurgischer Präzision. Modell-Artefakte (wie Stop-Marker oder überflüssige Tags freier Modelle) werden durch aggressive Regex-Filter eliminiert, bevor sie das System-Routing stören können. Dies stellt sicher, dass eine Coding-Frage niemals fälschlicherweise im Fakten-Modus landet.
|
||||
|
||||
#### 6. Semantische Ingestion-Validierung v2.14.0
|
||||
Die Qualität des Wissensgraphen wird durch eine neue Validierungsebene geschützt. Während des Imports prüft Mindnet semantisch, ob vorgeschlagene Verknüpfungen (Edges) zwischen Informationen wirklich sinnvoll sind. Dabei unterscheidet das System zwischen temporären Netzwerkfehlern und dauerhaften Logikfehlern, um die Integrität Ihres digitalen Gedächtnisses zu wahren.
|
||||
|
||||
---
|
||||
|
||||
### Technische Highlights für Power-User
|
||||
|
||||
| Feature | Technologie | Nutzen |
|
||||
| :--- | :--- | :--- |
|
||||
| **Orchestrator** | `DecisionEngine v1.3.2` | Agentische Steuerung & Multi-Stream Retrieval |
|
||||
| **Hybrid Cloud** | OpenRouter & Ollama | Maximale Flexibilität zwischen Leistung und Datenschutz |
|
||||
| **Traceability** | `[PROMPT-TRACE]` Logs | Volle Transparenz über die genutzten KI-Instruktionen |
|
||||
| **Context Guard** | Asynchrone Kompression | Optimierung der Kontextfenster für maximale Kosten-Effizienz |
|
||||
| **Resilienz** | Rekursive Fallback-Kaskade | 100% Verfügbarkeit durch Cloud-to-Local Automatisierung |
|
||||
|
||||
---
|
||||
|
||||
### Fazit: Ihr Gehirn, erweitert durch Mindnet V3.0
|
||||
Mindnet V3.0 ist das Ergebnis einer konsequenten Weiterentwicklung hin zu einer **Zero-Failure-Architektur**. Durch die Kombination aus agentischer Intelligenz, hybrider Modellnutzung und der neuen Lazy-Prompt-Infrastruktur bietet es eine Basis, die nicht nur mit Ihrem Wissen wächst, sondern aktiv dabei hilft, dieses Wissen in strategisches Handeln zu übersetzen.
|
||||
|
||||
**Willkommen in der Ära von Mindnet V3.0 – Ihr strategischer Partner ist bereit.**
|
||||
|
||||
---
|
||||
*Dokumentations-Identifikator: `mindnet_v3_core_release`*
|
||||
*Synchronisations-Stand: WP-25b Final*
|
||||
158
docs/00_General/00_documentation_map.md
Normal file
158
docs/00_General/00_documentation_map.md
Normal file
|
|
@ -0,0 +1,158 @@
|
|||
# Mindnet v2.6 – Documentation Map & Governance
|
||||
**Status:** Active
|
||||
**Context:** Central Navigation & Maintenance Guide
|
||||
|
||||
## 1. Zweck dieses Dokuments
|
||||
|
||||
Diese Karte dient Entwicklern, dem Mindmaster und KI-Agenten als **zentraler Einstiegspunkt**. Sie beantwortet zwei Fragen:
|
||||
1. **Navigation:** In welcher Datei finde ich Informationen zu Thema X?
|
||||
2. **Wartung:** Ich arbeite an Feature Y – welche Dateien muss ich aktualisieren?
|
||||
|
||||
---
|
||||
|
||||
## 2. Verzeichnisstruktur & Inhalte
|
||||
|
||||
Das Repository ist in **logische Domänen** unterteilt.
|
||||
|
||||
### 📂 00_General (Grundlagen)
|
||||
*Zielgruppe: Alle*
|
||||
| Datei | Inhalt & Zweck |
|
||||
| :--- | :--- |
|
||||
| `README.md` | **Einstiegspunkt.** Übersicht über die Dokumentationsstruktur, Schnellzugriff nach Rollen und Navigation. |
|
||||
| `00_quickstart.md` | **Schnellstart.** Installation und erste Schritte in 15 Minuten. Ideal für neue Benutzer. |
|
||||
| `00_vision_and_strategy.md` | **Strategie.** Warum bauen wir das? Prinzipien (Privacy, Local-First), High-Level Architektur. |
|
||||
| `00_glossary.md` | **Definitionen.** Was bedeutet "Smart Edge", "Traffic Control", "Chunk"? Verhindert Begriffsverwirrung. |
|
||||
| `00_documentation_map.md` | **Dieser Index.** Navigationshilfe. |
|
||||
| `00_quality_checklist.md` | **Qualitätsprüfung.** Systematische Checkliste zur Vollständigkeitsprüfung für alle Rollen. |
|
||||
|
||||
### 📂 01_User_Manual (Anwendung)
|
||||
*Zielgruppe: Mindmaster, Autoren, Power-User*
|
||||
| Datei | Inhalt & Zweck |
|
||||
| :--- | :--- |
|
||||
| `01_chat_usage_guide.md` | **Bedienung.** Wie steuere ich die Personas (Berater, Spiegel)? Wie nutze ich das Feedback? |
|
||||
| `01_knowledge_design.md` | **Content-Regeln.** Die "Bibel" für den Vault. Erklärt Note-Typen, Matrix-Logik und Markdown-Syntax. |
|
||||
| `01_authoring_guidelines.md` | **Content strukturieren.** Primäres Werkzeug, um Wissen so zu strukturieren, dass Mindnet die Persönlichkeit spiegelt, empathisch reagiert und strategisch berät. |
|
||||
| `01_obsidian_integration_guide.md` | **Obsidian Setup.** Technische Anleitung für die Integration von Mindnet mit Obsidian (Templater, Skripte, Workflows). |
|
||||
|
||||
### 📂 02_Concepts (Fachliche Logik)
|
||||
*Zielgruppe: Architekten, Product Owner*
|
||||
| Datei | Inhalt & Zweck |
|
||||
| :--- | :--- |
|
||||
| `02_concept_graph_logic.md` | **Graph-Theorie.** Abstrakte Erklärung von Knoten, Kanten, Provenance und Idempotenz. |
|
||||
| `02_concept_ai_personality.md`| **KI-Verhalten.** Konzepte hinter dem Hybrid Router, Empathie-Modell und "Teach-the-AI". |
|
||||
| `02_concept_architecture_patterns.md` | **Architektur-Patterns.** Design-Entscheidungen, modulare Struktur (WP-14), Resilienz-Patterns und Erweiterbarkeit. |
|
||||
|
||||
### 📂 03_Technical_Reference (Technik & Code)
|
||||
*Zielgruppe: Entwickler, DevOps. (Enthält JSON/YAML Beispiele)*
|
||||
| Datei | Inhalt & Zweck |
|
||||
| :--- | :--- |
|
||||
| `03_tech_data_model.md` | **Datenbank.** Exakte Qdrant-Schemas (Payloads) und Index-Anforderungen. |
|
||||
| `03_tech_ingestion_pipeline.md`| **Import.** Ablauflogik (13 Schritte), Chunker-Profile, Smart Edge Allocation. |
|
||||
| `03_tech_retrieval_scoring.md` | **Suche.** Die mathematischen Formeln für Scoring, Hybrid Search und Explanation Layer. |
|
||||
| `03_tech_chat_backend.md` | **API & LLM.** Implementation des Routers, Traffic Control (Semaphore) und Feedback-Traceability. |
|
||||
| `03_tech_frontend.md` | **UI & Graph.** Architektur des Streamlit-Frontends, State-Management, Cytoscape-Integration und Editor-Logik. |
|
||||
| `03_tech_configuration.md` | **Config.** Referenztabellen für `.env`, `types.yaml`, `decision_engine.yaml`, `llm_profiles.yaml`, `prompts.yaml`. **Neu:** Verbindungen zwischen Config-Dateien, Praxisbeispiel und Mermaid-Grafik. |
|
||||
| `03_tech_api_reference.md` | **API-Referenz.** Vollständige Dokumentation aller Endpunkte (`/query`, `/chat`, `/ingest`, `/graph`, etc.). |
|
||||
|
||||
### 📂 04_Operations (Betrieb)
|
||||
*Zielgruppe: Administratoren*
|
||||
| Datei | Inhalt & Zweck |
|
||||
| :--- | :--- |
|
||||
| `04_admin_operations.md` | **Runbook.** Installation, Docker-Setup, Backup/Restore, Troubleshooting Guide. |
|
||||
| `04_server_operation_manual.md` | **Server-Betrieb.** Detaillierte Dokumentation für den Betrieb auf llm-node (Systemd, Borgmatic, Disaster Recovery). |
|
||||
| `04_deployment_guide.md` | **Deployment.** CI/CD-Pipelines, Rollout-Strategien, Versionierung, Rollback und Pre/Post-Deployment-Checklisten. |
|
||||
|
||||
### 📂 05_Development (Code)
|
||||
*Zielgruppe: Entwickler*
|
||||
| Datei | Inhalt & Zweck |
|
||||
| :--- | :--- |
|
||||
| `05_developer_guide.md` | **Workflow.** Hardware-Setup (Win/Pi/Beelink), Git-Flow, Test-Befehle, Modul-Interna. |
|
||||
| `05_genai_best_practices.md` | **AI Workflow.** Prompt-Library, Templates und Best Practices für die Entwicklung mit LLMs. |
|
||||
| `05_testing_guide.md` | **Testing.** Umfassender Test-Guide: Strategien, Frameworks, Test-Daten, Best Practices. |
|
||||
|
||||
### 📂 06_Roadmap & 99_Archive
|
||||
*Zielgruppe: Projektleitung*
|
||||
| Datei | Inhalt & Zweck |
|
||||
| :--- | :--- |
|
||||
| `06_active_roadmap.md` | **Zukunft.** Aktive Workpackages (WP16+), Release-Planung und WP-Historie (Tabelle). |
|
||||
| `99_legacy_workpackages.md` | **Vergangenheit.** Detaillierte Archivdaten zu abgeschlossenen WPs (WP01–WP15). |
|
||||
|
||||
---
|
||||
|
||||
## 3. Maintenance Guide: "Welche Datei muss ich ändern?"
|
||||
|
||||
Nutze diese Matrix, wenn du ein Workpackage bearbeitest, um die Dokumentation konsistent zu halten.
|
||||
|
||||
| Wenn du arbeitest an... | ...aktualisiere diese Dateien: |
|
||||
| :--- | :--- |
|
||||
| **Neuen Note-Typen** | 1. `01_knowledge_design.md` (Für Autoren)<br>2. `03_tech_configuration.md` (Technische Referenz)<br>3. `05_developer_guide.md` (Erweiterungs-How-To) |
|
||||
| **Importer / Parsing** | `03_tech_ingestion_pipeline.md` |
|
||||
| **Datenbank-Schema** | `03_tech_data_model.md` (Payloads anpassen) |
|
||||
| **Retrieval / Scoring** | `03_tech_retrieval_scoring.md` (Formeln anpassen) |
|
||||
| **Frontend / Visualisierung** | 1. `03_tech_frontend.md` (Technische Details)<br>2. `01_chat_usage_guide.md` (Bedienung) |
|
||||
| **Chat-Logik / Prompts**| 1. `02_concept_ai_personality.md` (Konzept)<br>2. `03_tech_chat_backend.md` (Tech)<br>3. `01_chat_usage_guide.md` (User-Sicht) |
|
||||
| **Architektur / Design-Patterns** | 1. `02_concept_architecture_patterns.md` (Patterns & Entscheidungen)<br>2. `02_concept_graph_logic.md` (Graph-Theorie)<br>3. `05_developer_guide.md` (Modulare Struktur) |
|
||||
| **Deployment / Server** | 1. `04_deployment_guide.md` (CI/CD, Rollout)<br>2. `04_admin_operations.md` (Installation, Wartung)<br>3. `04_server_operation_manual.md` (Server-Betrieb) |
|
||||
| **Testing / QA** | 1. `05_testing_guide.md` (Test-Strategien & Frameworks)<br>2. `05_developer_guide.md` (Test-Befehle) |
|
||||
| **Neuen Features (Allg.)**| `06_active_roadmap.md` (Status Update) |
|
||||
|
||||
---
|
||||
|
||||
## 4. Prinzipien für die Dokumentation (Governance)
|
||||
|
||||
Damit dieses System wartbar bleibt (auch für KI-Agenten wie NotebookLM), gelten folgende Regeln:
|
||||
|
||||
1. **Single Source of Truth:**
|
||||
Kopiere keine Informationen. Referenziere sie.
|
||||
* *Ausnahme:* Konkrete JSON-Beispiele in `03_Technical_Reference`. Diese müssen dort stehen, damit Entwickler nicht suchen müssen.
|
||||
|
||||
2. **Konkrete Beispiele:**
|
||||
Technische Dokumente (`03_*`) müssen **immer** Code-Snippets (JSON, YAML, Shell) enthalten. Abstrakte Beschreibungen reichen nicht für die Implementierung.
|
||||
|
||||
3. **Human Readable:**
|
||||
User Manuals (`01_*`) müssen Narrative und Szenarien enthalten ("Stell dir vor..."), keine technischen Auflistungen.
|
||||
|
||||
4. **KI-Optimiert (Context Header):**
|
||||
Jede Datei muss mit einem YAML-Frontmatter beginnen, der `context`, `audience` und `scope` definiert. Dies hilft RAG-Systemen, den Inhalt korrekt einzuordnen.
|
||||
|
||||
```yaml
|
||||
---
|
||||
doc_type: technical_reference
|
||||
audience: developer
|
||||
context: "Beschreibung der Scoring-Formel."
|
||||
---
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 5. Schnellzugriff & Empfehlungen
|
||||
|
||||
### Für neue Benutzer
|
||||
1. Starte mit **[Schnellstart](00_quickstart.md)** für die Installation
|
||||
2. Lese **[Vision & Strategie](00_vision_and_strategy.md)** für das große Bild
|
||||
3. Nutze **[Chat Usage Guide](../01_User_Manual/01_chat_usage_guide.md)** für die ersten Schritte
|
||||
|
||||
### Für Entwickler
|
||||
1. **[Developer Guide](../05_Development/05_developer_guide.md)** - Umfassender technischer Guide
|
||||
2. **[Technical References](../03_Technical_References/)** - Detaillierte API- und Architektur-Dokumentation
|
||||
3. **[GenAI Best Practices](../05_Development/05_genai_best_practices.md)** - Workflow mit LLMs
|
||||
|
||||
### Für Administratoren
|
||||
1. **[Admin Operations](../04_Operations/04_admin_operations.md)** - Installation und Wartung
|
||||
2. **[Server Operations Manual](../04_Operations/04_server_operation_manual.md)** - Server-Betrieb und Disaster Recovery
|
||||
3. **[Troubleshooting Guide](../04_Operations/04_admin_operations.md#33-troubleshooting-guide)** - Häufige Probleme und Lösungen
|
||||
|
||||
### Für Autoren
|
||||
1. **[Knowledge Design](../01_User_Manual/01_knowledge_design.md)** - Content-Regeln und Best Practices
|
||||
2. **[Authoring Guidelines](../01_User_Manual/01_authoring_guidelines.md)** - Strukturierung für den Digitalen Zwilling
|
||||
3. **[Obsidian Integration](../01_User_Manual/01_obsidian_integration_guide.md)** - Workflow-Optimierung
|
||||
|
||||
---
|
||||
|
||||
## 6. Dokumentations-Status
|
||||
|
||||
**Aktuelle Version:** 3.1.1
|
||||
**Letzte Aktualisierung:** 2026-01-02
|
||||
**Status:** ✅ Vollständig und aktiv gepflegt
|
||||
|
||||
**Hinweis:** Diese Dokumentation wird kontinuierlich aktualisiert. Bei Fragen oder Verbesserungsvorschlägen bitte im Repository melden.
|
||||
74
docs/00_General/00_glossary.md
Normal file
74
docs/00_General/00_glossary.md
Normal file
|
|
@ -0,0 +1,74 @@
|
|||
---
|
||||
doc_type: glossary
|
||||
audience: all
|
||||
status: active
|
||||
version: 4.5.8
|
||||
context: "Zentrales Glossar für Mindnet v4.5.8. Enthält Definitionen zu Hybrid-Cloud Resilienz, WP-14 Modularisierung, WP-15b Two-Pass Ingestion, WP-15c Multigraph-Support, WP-25 Agentic Multi-Stream RAG, WP-25a Mixture of Experts (MoE), WP-25b Lazy-Prompt-Orchestration, WP-24c Phase 3 Agentic Edge Validation und Mistral-safe Parsing."
|
||||
---
|
||||
|
||||
# Mindnet Glossar
|
||||
|
||||
**Quellen:** `01_edge_vocabulary.md`, `llm_service.py`, `ingestion.py`, `edge_registry.py`, `registry.py`, `qdrant.py`
|
||||
|
||||
## Kern-Entitäten
|
||||
|
||||
* **Note:** Repräsentiert eine Markdown-Datei. Die fachliche Haupteinheit. Verfügt über einen **Status** (stable, draft, system), der das Scoring beeinflusst.
|
||||
* **Chunk:** Ein Textabschnitt einer Note. Die technische Sucheinheit (Vektor).
|
||||
* **Edge:** Eine gerichtete Verbindung zwischen zwei Knoten. Wird in WP-22 durch die Registry validiert. Seit v2.9.1 unterstützt Edges **Section-basierte Links** (`target_section`), sodass mehrere Kanten zwischen denselben Knoten existieren können, wenn sie auf verschiedene Abschnitte zeigen.
|
||||
* **Vault:** Der lokale Ordner mit den Markdown-Dateien (Source of Truth).
|
||||
* **Frontmatter:** Der YAML-Header am Anfang einer Notiz (enthält `id`, `type`, `title`, `status`).
|
||||
|
||||
## Komponenten
|
||||
|
||||
* **Edge Registry:** Der zentrale Dienst (SSOT), der Kanten-Typen validiert und Aliase in kanonische Typen auflöst. Nutzt `01_edge_vocabulary.md` als Basis.
|
||||
* **LLM Service:** Der Hybrid-Client (v3.3.6), der Anfragen zwischen OpenRouter, Google Gemini und lokalem Ollama routet. Verwaltet Cloud-Timeouts und Quoten-Management. Nutzt zur Text-Bereinigung nun die neutrale `registry.py`, um Circular Imports zu vermeiden.
|
||||
* **Retriever:** Besteht in v2.7+ aus der Orchestrierung (`retriever.py`) und der mathematischen Scoring-Engine (`retriever_scoring.py`). Seit WP-14 im Paket `app.core.retrieval` gekapselt.
|
||||
* **Decision Engine (WP-25):** Der zentrale **Agentic Orchestrator**, der Intents erkennt, parallele Wissens-Streams orchestriert und die Ergebnisse synthetisiert. Implementiert Multi-Stream Retrieval und Intent-basiertes Routing.
|
||||
* **Agentic Multi-Stream RAG (WP-25):** Architektur-Paradigma, bei dem Nutzeranfragen in parallele, spezialisierte Wissens-Streams aufgeteilt werden (Values, Facts, Biography, Risk, Tech), die gleichzeitig abgefragt und zu einer kontextreichen Antwort synthetisiert werden.
|
||||
* **Stream-Tracing (WP-25):** Kennzeichnung jedes Treffers mit seinem Ursprungs-Stream (`stream_origin`), um Feedback-Optimierung pro Wissensbereich zu ermöglichen.
|
||||
* **Intent-basiertes Routing (WP-25):** Hybrid-Modus zur Intent-Erkennung mit Keyword Fast-Path (sofortige Erkennung von Triggern) und LLM Slow-Path (semantische Analyse für unklare Anfragen).
|
||||
* **Wissens-Synthese (WP-25):** Template-basierte Zusammenführung der Ergebnisse aus parallelen Streams mit expliziten Stream-Variablen (z.B. `{values_stream}`, `{risk_stream}`), um dem LLM eine differenzierte Abwägung zu ermöglichen.
|
||||
* **Traffic Control:** Verwaltet Prioritäten und drosselt Hintergrund-Tasks (z.B. Smart Edges) mittels Semaphoren und Timeouts (45s) zur Vermeidung von System-Hangs.
|
||||
* **Unknown Edges Log:** Die Datei `unknown_edges.jsonl`, in der das System Kanten-Typen protokolliert, die nicht im Dictionary gefunden wurden.
|
||||
* **Database Package (WP-14):** Zentralisiertes Infrastruktur-Paket (`app.core.database`), das den Qdrant-Client (`qdrant.py`) und das Point-Mapping (`qdrant_points.py`) verwaltet.
|
||||
* **LocalBatchCache (WP-15b):** Ein globaler In-Memory-Index, der während des Pass 1 Scans aufgebaut wird und Metadaten (IDs, Titel, Summaries) aller Notizen für die Kantenvalidierung bereithält.
|
||||
|
||||
## Konzepte & Features
|
||||
|
||||
* **Hybrid Provider Cascade:** Die intelligente Reihenfolge der Modell-Ansprache. Schlägt die Cloud (OpenRouter/Gemini) fehl, erfolgt nach Retries ein Fallback auf den lokalen Ollama (Quoten-Schutz).
|
||||
* **Deep Fallback (v2.11.14, WP-20):** Ein inhaltsbasierter Rettungsmechanismus in der Ingestion. Im Gegensatz zum technischen Fallback (bei Verbindungsfehlern) wird der Deep Fallback ausgelöst, wenn ein Cloud-Modell zwar technisch erfolgreich antwortet, aber inhaltlich keine verwertbaren Daten liefert (z. B. bei "Data Policy Violations").
|
||||
* **Silent Refusal (WP-20):** Ein Zustand, in dem Cloud-Provider (wie OpenRouter) die Verarbeitung eines Dokuments aufgrund interner Filter ("No data training") verweigern, ohne einen HTTP-Fehler zu senden. Wird durch Deep Fallback abgefangen.
|
||||
* **Rate-Limit Resilience (WP-20):** Automatisierte Erkennung von HTTP 429 Fehlern. Das System pausiert (konfigurierbar via `LLM_RATE_LIMIT_WAIT`) und wiederholt den Cloud-Call, bevor der langsame Fallback ausgelöst wird.
|
||||
* **Mistral-safe Parsing:** Robuste Extraktions-Logik in Ingestion und Analyzer, die technische Steuerzeichen (`<s>`, `[OUT]`) und Framework-Tags erkennt und entfernt, um valides JSON aus Free-Modellen zu gewinnen.
|
||||
* **Lifecycle Scoring (WP-22):** Ein Mechanismus, der die Relevanz einer Notiz basierend auf ihrem Status gewichtet (z.B. Bonus für `stable`, Malus für `draft`).
|
||||
* **Intent Boosting:** Dynamische Erhöhung der Kanten-Gewichte basierend auf der Nutzerfrage (z.B. Fokus auf `caused_by` bei "Warum"-Fragen).
|
||||
* **Provenance Weighting:** Gewichtung einer Kante nach ihrer Herkunft:
|
||||
* `explicit`: Vom Menschen gesetzt (Prio 1).
|
||||
* `semantic_ai`: Von der KI im Turbo-Mode extrahiert und validiert (Prio 2).
|
||||
* `structure`: Durch System-Regeln/Matrix erzeugt (Prio 3).
|
||||
* **Smart Edge Allocation (WP-15b):** KI-Verfahren zur Relevanzprüfung von Links für spezifische Textabschnitte. Validiert Kandidaten semantisch gegen das Ziel im LocalBatchCache.
|
||||
* **Matrix Logic:** Bestimmung des Kanten-Typs basierend auf Quell- und Ziel-Entität (z.B. Erfahrung -> Wert = `based_on`).
|
||||
* **Two-Pass Workflow (WP-15b):** Optimiertes Ingestion-Verfahren:
|
||||
* **Pass 1 (Pre-Scan):** Schnelles Scannen aller Dateien zur Befüllung des LocalBatchCache.
|
||||
* **Pass 2 (Semantic Processing):** Tiefenverarbeitung (Chunking, Embedding, Validierung) nur für geänderte Dateien.
|
||||
* **Circular Import Registry (WP-14):** Entkopplung von Kern-Logik (wie Textbereinigung) in eine neutrale `registry.py`, um Abhängigkeitsschleifen zwischen Diensten und Ingestion-Utilities zu verhindern.
|
||||
* **Deep-Link / Section-basierter Link:** Ein Link wie `[[Note#Section]]`, der auf einen spezifischen Abschnitt innerhalb einer Note verweist. Seit v2.9.1 wird dieser in `target_id="Note"` und `target_section="Section"` aufgeteilt, um "Phantom-Knoten" zu vermeiden und Multigraph-Support zu ermöglichen.
|
||||
* **Atomic Section Logic (v3.9.9):** Chunking-Verfahren, das Sektions-Überschriften und deren Inhalte atomar in Chunks hält (Pack-and-Carry-Over). Verhindert, dass Überschriften über Chunk-Grenzen hinweg getrennt werden.
|
||||
* **Registry-First Profiling (v2.13.12):** Hierarchische Auflösung des Chunking-Profils: Frontmatter > types.yaml Typ-Config > Global Defaults. Stellt sicher, dass Note-Typen automatisch das korrekte Profil erhalten.
|
||||
* **Mixture of Experts (MoE) - WP-25a:** Profilbasierte Experten-Architektur, bei der jede Systemaufgabe (Synthese, Ingestion-Validierung, Routing, Kompression) einem dedizierten Profil zugewiesen wird, das Modell, Provider und Parameter unabhängig von der globalen Konfiguration definiert.
|
||||
* **LLM-Profil:** Zentrale Definition in `llm_profiles.yaml`, die Provider, Modell, Temperature und Fallback-Profil für eine spezifische Aufgabe festlegt (z.B. `synthesis_pro`, `tech_expert`, `ingest_validator`).
|
||||
* **Fallback-Kaskade (WP-25a):** Rekursive Fallback-Logik, bei der bei Fehlern automatisch auf das `fallback_profile` umgeschaltet wird, bis der terminale Endpunkt (`identity_safe`) erreicht wird. Schutz gegen Zirkel-Referenzen via `visited_profiles`-Tracking.
|
||||
* **Pre-Synthesis Kompression (WP-25a):** Asynchrone Verdichtung überlanger Wissens-Streams vor der Synthese, um Token-Verbrauch zu reduzieren und die Synthese zu beschleunigen. Nutzt `compression_profile` (z.B. `compression_fast`).
|
||||
* **Profilgesteuerte Validierung (WP-25a):** Semantische Kanten-Validierung in der Ingestion erfolgt zwingend über das MoE-Profil `ingest_validator` (Temperature 0.0 für Determinismus), unabhängig von der globalen Provider-Konfiguration.
|
||||
* **Lazy-Prompt-Orchestration (WP-25b):** Hierarchisches Prompt-Resolution-System, das Prompts erst im Moment des Modellaustauschs lädt, basierend auf dem exakt aktiven Modell. Ermöglicht modell-spezifisches Tuning und maximale Resilienz bei Modell-Fallbacks.
|
||||
* **Hierarchische Prompt-Resolution (WP-25b):** Dreistufige Auflösungs-Logik: Level 1 (Modell-ID) → Level 2 (Provider) → Level 3 (Default). Gewährleistet, dass jedes Modell das optimale Template erhält.
|
||||
* **PROMPT-TRACE (WP-25b):** Logging-Mechanismus, der die genutzte Prompt-Auflösungs-Ebene protokolliert (`🎯 Level 1`, `📡 Level 2`, `⚓ Level 3`). Bietet vollständige Transparenz über die genutzten Instruktionen.
|
||||
* **Ultra-robustes Intent-Parsing (WP-25b):** Regex-basierter Intent-Parser in der DecisionEngine, der Modell-Artefakte wie `[/S]`, `</s>` oder Newlines zuverlässig bereinigt, um präzises Strategie-Routing zu gewährleisten.
|
||||
* **Differenzierte Ingestion-Validierung (WP-25b):** Unterscheidung zwischen transienten Fehlern (Netzwerk, Timeout) und permanenten Fehlern (Config, Validation). Transiente Fehler erlauben die Kante (Datenverlust vermeiden), permanente Fehler lehnen sie ab (Graph-Qualität schützen).
|
||||
* **Phase 3 Agentic Edge Validation (WP-24c v4.5.8):** Finales Validierungs-Gate für alle Kanten mit `candidate:` Präfix. Nutzt LLM-basierte semantische Prüfung zur Verifizierung von Wissensverknüpfungen. Verhindert "Geister-Verknüpfungen" und sichert die Graph-Qualität gegen Fehlinterpretationen ab.
|
||||
* **candidate: Präfix (WP-24c v4.5.8):** Markierung für unbestätigte Kanten in `rule_id` oder `provenance`. Alle Kanten mit diesem Präfix werden in Phase 3 dem LLM-Validator vorgelegt. Nach erfolgreicher Validierung wird das Präfix entfernt.
|
||||
* **verified Status (WP-24c v4.5.8):** Impliziter Status für Kanten nach erfolgreicher Phase 3 Validierung. Kanten ohne `candidate:` Präfix gelten als verifiziert und werden in die Datenbank geschrieben.
|
||||
* **Note-Scope (WP-24c v4.2.0):** Globale Verbindungen, die der gesamten Note zugeordnet werden (nicht nur einem spezifischen Chunk). Wird durch spezielle Header-Zonen (z.B. `## Smart Edges`) definiert. In Phase 3 Validierung wird `note_summary` oder `note_text` als Kontext verwendet.
|
||||
* **Chunk-Scope (WP-24c v4.2.0):** Lokale Verbindungen, die einem spezifischen Textabschnitt (Chunk) zugeordnet werden. In Phase 3 Validierung wird der spezifische Chunk-Text als Kontext verwendet, falls verfügbar.
|
||||
* **Kontext-Optimierung (WP-24c v4.5.8):** Dynamische Kontext-Auswahl in Phase 3 Validierung basierend auf `scope`. Note-Scope nutzt aggregierten Note-Text, Chunk-Scope nutzt spezifischen Chunk-Text. Optimiert die Validierungs-Genauigkeit durch passenden Kontext.
|
||||
* **rejected_edges (WP-24c v4.5.8):** Liste von Kanten, die in Phase 3 Validierung abgelehnt wurden. Diese Kanten werden **nicht** in die Datenbank geschrieben und vollständig ignoriert. Verhindert persistente "Geister-Verknüpfungen" im Wissensgraphen.
|
||||
180
docs/00_General/00_quality_checklist.md
Normal file
180
docs/00_General/00_quality_checklist.md
Normal file
|
|
@ -0,0 +1,180 @@
|
|||
---
|
||||
doc_type: quality_assurance
|
||||
audience: all
|
||||
status: active
|
||||
version: 4.5.8
|
||||
context: "Qualitätsprüfung der Dokumentation für alle Rollen: Vollständigkeit, Korrektheit und Anwendbarkeit. Inkludiert WP-24c Phase 3 Agentic Edge Validation, automatische Spiegelkanten und Note-Scope Zonen."
|
||||
---
|
||||
|
||||
# Dokumentations-Qualitätsprüfung
|
||||
|
||||
Diese Checkliste dient zur systematischen Prüfung, ob die Dokumentation alle Fragen jeder Rolle vollständig beantwortet.
|
||||
|
||||
## ✅ Entwickler
|
||||
|
||||
### Setup & Installation
|
||||
- [x] **Lokales Setup:** [Developer Guide](../05_Development/05_developer_guide.md#6-lokales-setup-development)
|
||||
- [x] **Schnellstart:** [Quickstart](00_quickstart.md)
|
||||
- [x] **Hardware-Anforderungen:** [Admin Operations](../04_Operations/04_admin_operations.md#11-voraussetzungen)
|
||||
|
||||
### Architektur & Code
|
||||
- [x] **Modulare Struktur:** [Developer Guide - Architektur](../05_Development/05_developer_guide.md#4-projektstruktur--modul-referenz-deep-dive)
|
||||
- [x] **Design-Patterns:** [Architektur-Patterns](../02_concepts/02_concept_architecture_patterns.md)
|
||||
- [x] **API-Referenz:** [API Reference](../03_Technical_References/03_tech_api_reference.md)
|
||||
- [x] **Datenmodell:** [Data Model](../03_Technical_References/03_tech_data_model.md)
|
||||
|
||||
### Entwicklung & Erweiterung
|
||||
- [x] **Workflow:** [Developer Guide - Workflow](../05_Development/05_developer_guide.md#7-der-entwicklungs-zyklus-workflow)
|
||||
- [x] **Erweiterungs-Guide:** [Teach-the-AI](../05_Development/05_developer_guide.md#8-erweiterungs-guide-teach-the-ai)
|
||||
- [x] **GenAI Best Practices:** [GenAI Best Practices](../05_Development/05_genai_best_practices.md)
|
||||
|
||||
### Testing
|
||||
- [x] **Test-Strategien:** [Testing Guide](../05_Development/05_testing_guide.md)
|
||||
- [x] **Test-Frameworks:** [Testing Guide - Frameworks](../05_Development/05_testing_guide.md#3-test-frameworks--tools)
|
||||
- [x] **Test-Daten:** [Testing Guide - Test-Daten](../05_Development/05_testing_guide.md#2-test-daten--vaults)
|
||||
|
||||
### Debugging & Troubleshooting
|
||||
- [x] **Troubleshooting:** [Developer Guide - Troubleshooting](../05_Development/05_developer_guide.md#10-troubleshooting--one-liners)
|
||||
- [x] **Debug-Tools:** [Testing Guide - Debugging](../05_Development/05_testing_guide.md#7-debugging--diagnose)
|
||||
|
||||
---
|
||||
|
||||
## ✅ Administratoren
|
||||
|
||||
### Installation & Setup
|
||||
- [x] **Installation:** [Admin Operations](../04_Operations/04_admin_operations.md#1-installation--setup)
|
||||
- [x] **Docker Setup:** [Admin Operations - Qdrant](../04_Operations/04_admin_operations.md#12-qdrant-docker)
|
||||
- [x] **Systemd Services:** [Admin Operations - Deployment](../04_Operations/04_admin_operations.md#2-deployment-systemd-services)
|
||||
|
||||
### Betrieb & Wartung
|
||||
- [x] **Monitoring:** [Admin Operations - Wartung](../04_Operations/04_admin_operations.md#3-wartung--monitoring)
|
||||
- [x] **Backup & Restore:** [Admin Operations - Backup](../04_Operations/04_admin_operations.md#4-backup--restore)
|
||||
- [x] **Troubleshooting:** [Admin Operations - Troubleshooting](../04_Operations/04_admin_operations.md#33-troubleshooting-guide)
|
||||
|
||||
### Server-Betrieb
|
||||
- [x] **Server-Konfiguration:** [Server Operations Manual](../04_Operations/04_server_operation_manual.md)
|
||||
- [x] **Disaster Recovery:** [Server Operations - DR](../04_Operations/04_server_operation_manual.md#5-disaster-recovery-wiederherstellung-two-stage-dr)
|
||||
- [x] **Backup-Strategie:** [Server Operations - Backup](../04_Operations/04_server_operation_manual.md#4-backup-strategie-borgmatic)
|
||||
|
||||
### Konfiguration
|
||||
- [x] **ENV-Variablen:** [Configuration Reference](../03_Technical_References/03_tech_configuration.md#1-environment-variablen-env)
|
||||
- [x] **YAML-Configs:** [Configuration Reference - YAML](../03_Technical_References/03_tech_configuration.md#2-typ-registry-typesyaml)
|
||||
- [x] **Phase 3 Validierung:** [Configuration Reference - ENV](../03_Technical_References/03_tech_configuration.md#1-environment-variablen-env) (MINDNET_LLM_VALIDATION_HEADERS, MINDNET_NOTE_SCOPE_ZONE_HEADERS)
|
||||
- [x] **LLM-Profile:** [Configuration Reference - LLM Profiles](../03_Technical_References/03_tech_configuration.md#6-llm-profile-registry-llm_profilesyaml-v130)
|
||||
|
||||
---
|
||||
|
||||
## ✅ Anwender
|
||||
|
||||
### Erste Schritte
|
||||
- [x] **Schnellstart:** [Quickstart](00_quickstart.md)
|
||||
- [x] **Was ist Mindnet:** [Vision & Strategie](00_vision_and_strategy.md)
|
||||
- [x] **Grundlagen:** [Glossar](00_glossary.md)
|
||||
|
||||
### Nutzung
|
||||
- [x] **Chat-Bedienung:** [Chat Usage Guide](../01_User_Manual/01_chat_usage_guide.md)
|
||||
- [x] **Graph Explorer:** [Chat Usage Guide - Graph](../01_User_Manual/01_chat_usage_guide.md#22-modus--graph-explorer-cytoscape)
|
||||
- [x] **Editor:** [Chat Usage Guide - Editor](../01_User_Manual/01_chat_usage_guide.md#23-modus--manueller-editor)
|
||||
|
||||
### Content-Erstellung
|
||||
- [x] **Knowledge Design:** [Knowledge Design Manual](../01_User_Manual/01_knowledge_design.md)
|
||||
- [x] **Authoring Guidelines:** [Authoring Guidelines](../01_User_Manual/01_authoring_guidelines.md)
|
||||
- [x] **Obsidian-Integration:** [Obsidian Integration](../01_User_Manual/01_obsidian_integration_guide.md)
|
||||
- [x] **Note-Scope Zonen:** [Note-Scope Zonen](../01_User_Manual/NOTE_SCOPE_ZONEN.md) (WP-24c v4.2.0)
|
||||
- [x] **LLM-Validierung:** [LLM-Validierung von Links](../01_User_Manual/LLM_VALIDIERUNG_VON_LINKS.md) (WP-24c v4.5.8)
|
||||
|
||||
### Häufige Fragen
|
||||
- [x] **Wie strukturiere ich Notizen?** → [Knowledge Design](../01_User_Manual/01_knowledge_design.md)
|
||||
- [x] **Welche Note-Typen gibt es?** → [Knowledge Design - Typ-Referenz](../01_User_Manual/01_knowledge_design.md#31-typ-referenz--stream-logik)
|
||||
- [x] **Wie verknüpfe ich Notizen?** → [Knowledge Design - Edges](../01_User_Manual/01_knowledge_design.md#4-edges--verlinkung)
|
||||
- [x] **Wie nutze ich den Chat?** → [Chat Usage Guide](../01_User_Manual/01_chat_usage_guide.md)
|
||||
- [x] **Was sind automatische Spiegelkanten?** → [Knowledge Design - Spiegelkanten](../01_User_Manual/01_knowledge_design.md#43-automatische-spiegelkanten-invers-logik---wp-24c-v458)
|
||||
- [x] **Was ist Phase 3 Validierung?** → [Knowledge Design - Phase 3](../01_User_Manual/01_knowledge_design.md#44-explizite-vs-validierte-kanten-phase-3-validierung---wp-24c-v458)
|
||||
- [x] **Was sind Note-Scope Zonen?** → [Note-Scope Zonen](../01_User_Manual/NOTE_SCOPE_ZONEN.md)
|
||||
- [x] **Wann nutze ich explizite vs. validierte Links?** → [Knowledge Design - Explizite vs. Validierte](../01_User_Manual/01_knowledge_design.md#44-explizite-vs-validierte-kanten-phase-3-validierung---wp-24c-v458)
|
||||
|
||||
---
|
||||
|
||||
## ✅ Tester
|
||||
|
||||
### Test-Strategien
|
||||
- [x] **Test-Pyramide:** [Testing Guide - Strategien](../05_Development/05_testing_guide.md#1-test-strategie--ebenen)
|
||||
- [x] **Unit Tests:** [Testing Guide - Unit Tests](../05_Development/05_testing_guide.md#11-unit-tests-pytest)
|
||||
- [x] **Integration Tests:** [Testing Guide - Integration](../05_Development/05_testing_guide.md#12-integration-tests)
|
||||
- [x] **E2E Tests:** [Testing Guide - E2E](../05_Development/05_testing_guide.md#13-e2e--smoke-tests)
|
||||
|
||||
### Test-Frameworks
|
||||
- [x] **Pytest:** [Testing Guide - Frameworks](../05_Development/05_testing_guide.md#31-pytest-unit-tests)
|
||||
- [x] **Unittest:** [Testing Guide - Unittest](../05_Development/05_testing_guide.md#32-unittest-e2e-tests)
|
||||
- [x] **Shell-Skripte:** [Testing Guide - Shell](../05_Development/05_testing_guide.md#33-shell-skripte-e2e-roundtrip)
|
||||
|
||||
### Test-Daten & Tools
|
||||
- [x] **Test-Vault erstellen:** [Testing Guide - Test-Daten](../05_Development/05_testing_guide.md#21-test-vault-erstellen)
|
||||
- [x] **Test-Skripte:** [Developer Guide - Scripts](../05_Development/05_developer_guide.md#44-scripts--tooling-die-admin-toolbox)
|
||||
- [x] **Test-Checkliste:** [Testing Guide - Checkliste](../05_Development/05_testing_guide.md#8-test-checkliste-für-pull-requests)
|
||||
|
||||
---
|
||||
|
||||
## ✅ Deployment
|
||||
|
||||
### Deployment-Prozesse
|
||||
- [x] **Deployment-Guide:** [Deployment Guide](../04_Operations/04_deployment_guide.md)
|
||||
- [x] **CI/CD Pipeline:** [Deployment Guide - CI/CD](../04_Operations/04_deployment_guide.md#9-cicd-pipeline-details)
|
||||
- [x] **Rollout-Strategien:** [Deployment Guide - Rollout](../04_Operations/04_deployment_guide.md#4-rollout-strategien)
|
||||
|
||||
### Versionierung & Releases
|
||||
- [x] **Version-Schema:** [Deployment Guide - Versionierung](../04_Operations/04_deployment_guide.md#51-version-schema)
|
||||
- [x] **Release-Prozess:** [Deployment Guide - Release](../04_Operations/04_deployment_guide.md#52-release-prozess)
|
||||
|
||||
### Rollback & Recovery
|
||||
- [x] **Rollback-Strategien:** [Deployment Guide - Rollback](../04_Operations/04_deployment_guide.md#6-rollback-strategien)
|
||||
- [x] **Disaster Recovery:** [Server Operations - DR](../04_Operations/04_server_operation_manual.md#5-disaster-recovery-wiederherstellung-two-stage-dr)
|
||||
|
||||
### Pre/Post-Deployment
|
||||
- [x] **Pre-Deployment Checkliste:** [Deployment Guide - Checkliste](../04_Operations/04_deployment_guide.md#7-pre-deployment-checkliste)
|
||||
- [x] **Post-Deployment Validierung:** [Deployment Guide - Validierung](../04_Operations/04_deployment_guide.md#8-post-deployment-validierung)
|
||||
|
||||
---
|
||||
|
||||
## 📊 Zusammenfassung
|
||||
|
||||
### Vollständigkeit nach Rolle
|
||||
|
||||
| Rolle | Abgedeckte Themen | Status |
|
||||
| :--- | :--- | :--- |
|
||||
| **Entwickler** | Setup, Architektur, Code, Testing, Debugging | ✅ Vollständig |
|
||||
| **Administratoren** | Installation, Betrieb, Wartung, Backup, DR | ✅ Vollständig |
|
||||
| **Anwender** | Nutzung, Content-Erstellung, Workflows | ✅ Vollständig |
|
||||
| **Tester** | Test-Strategien, Frameworks, Tools | ✅ Vollständig |
|
||||
| **Deployment** | CI/CD, Rollout, Versionierung, Rollback | ✅ Vollständig |
|
||||
|
||||
### Neue Dokumente
|
||||
|
||||
1. ✅ `05_testing_guide.md` - Umfassender Test-Guide
|
||||
2. ✅ `04_deployment_guide.md` - Vollständiger Deployment-Guide
|
||||
3. ✅ `02_concept_architecture_patterns.md` - Architektur-Patterns
|
||||
4. ✅ `03_tech_api_reference.md` - API-Referenz
|
||||
5. ✅ `00_quickstart.md` - Schnellstart-Anleitung
|
||||
6. ✅ `README.md` - Dokumentations-Einstiegspunkt
|
||||
|
||||
### Aktualisierte Dokumente
|
||||
|
||||
1. ✅ `00_documentation_map.md` - Alle neuen Dokumente aufgenommen
|
||||
2. ✅ `04_admin_operations.md` - Troubleshooting erweitert, Phase 3 Validierung dokumentiert
|
||||
3. ✅ `05_developer_guide.md` - Modulare Struktur ergänzt, WP-24c Phase 3 dokumentiert
|
||||
4. ✅ `03_tech_ingestion_pipeline.md` - Background Tasks dokumentiert, Phase 3 Agentic Validation hinzugefügt
|
||||
5. ✅ `03_tech_configuration.md` - Fehlende ENV-Variablen ergänzt, WP-24c Konfiguration dokumentiert
|
||||
6. ✅ `00_vision_and_strategy.md` - Design-Entscheidungen ergänzt
|
||||
7. ✅ `01_knowledge_design.md` - Automatische Spiegelkanten, Phase 3 Validierung, Note-Scope Zonen dokumentiert
|
||||
8. ✅ `02_concept_graph_logic.md` - Phase 3 Validierung, automatische Spiegelkanten, Note-Scope vs. Chunk-Scope dokumentiert
|
||||
9. ✅ `03_tech_data_model.md` - candidate: Präfix, verified Status, virtual Flag dokumentiert
|
||||
10. ✅ `NOTE_SCOPE_ZONEN.md` - Phase 3 Validierung integriert
|
||||
11. ✅ `LLM_VALIDIERUNG_VON_LINKS.md` - Phase 3 statt global_pool, Kontext-Optimierung dokumentiert
|
||||
12. ✅ `05_testing_guide.md` - WP-24c Test-Szenarien hinzugefügt
|
||||
|
||||
---
|
||||
|
||||
**Status:** ✅ Alle Rollen vollständig abgedeckt
|
||||
**Letzte Prüfung:** 2026-01-02
|
||||
**Version:** 4.5.8
|
||||
|
||||
156
docs/00_General/00_quickstart.md
Normal file
156
docs/00_General/00_quickstart.md
Normal file
|
|
@ -0,0 +1,156 @@
|
|||
---
|
||||
doc_type: quickstart_guide
|
||||
audience: user, developer, admin
|
||||
status: active
|
||||
version: 2.9.1
|
||||
context: "Schnellstart-Anleitung für neue Benutzer von Mindnet"
|
||||
---
|
||||
|
||||
# Mindnet Schnellstart
|
||||
|
||||
Diese Anleitung hilft dir, in 15 Minuten mit Mindnet loszulegen.
|
||||
|
||||
## 🎯 Was ist Mindnet?
|
||||
|
||||
Mindnet ist ein **persönliches KI-Gedächtnis**, das:
|
||||
- Dein Wissen in Markdown-Notizen speichert
|
||||
- Semantisch verknüpft (Wissensgraph)
|
||||
- Als intelligenter Dialogpartner agiert (RAG-Chat)
|
||||
- **Lokal und privat** läuft (Privacy First)
|
||||
|
||||
## 📋 Voraussetzungen
|
||||
|
||||
- **Python 3.10+** installiert
|
||||
- **Docker** installiert (für Qdrant)
|
||||
- **Ollama** installiert (für lokale LLMs)
|
||||
- Optional: **Obsidian** (für komfortables Schreiben)
|
||||
|
||||
## ⚡ Installation (5 Minuten)
|
||||
|
||||
### Schritt 1: Repository klonen
|
||||
|
||||
```bash
|
||||
git clone <repository-url> mindnet
|
||||
cd mindnet
|
||||
```
|
||||
|
||||
### Schritt 2: Virtuelle Umgebung erstellen
|
||||
|
||||
```bash
|
||||
python3 -m venv .venv
|
||||
source .venv/bin/activate # Windows: .venv\Scripts\activate
|
||||
```
|
||||
|
||||
### Schritt 3: Abhängigkeiten installieren
|
||||
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
### Schritt 4: Qdrant starten (Docker)
|
||||
|
||||
```bash
|
||||
docker compose up -d qdrant
|
||||
```
|
||||
|
||||
### Schritt 5: Ollama-Modelle laden
|
||||
|
||||
```bash
|
||||
ollama pull phi3:mini
|
||||
ollama pull nomic-embed-text
|
||||
```
|
||||
|
||||
### Schritt 6: Konfiguration anpassen
|
||||
|
||||
Erstelle eine `.env`-Datei im Projektroot:
|
||||
|
||||
```ini
|
||||
QDRANT_URL=http://localhost:6333
|
||||
MINDNET_OLLAMA_URL=http://localhost:11434
|
||||
MINDNET_LLM_MODEL=phi3:mini
|
||||
MINDNET_EMBEDDING_MODEL=nomic-embed-text
|
||||
COLLECTION_PREFIX=mindnet
|
||||
MINDNET_VAULT_ROOT=./vault
|
||||
```
|
||||
|
||||
## 🚀 Erste Schritte (5 Minuten)
|
||||
|
||||
### Schritt 1: Backend starten
|
||||
|
||||
```bash
|
||||
uvicorn app.main:app --reload --port 8001
|
||||
```
|
||||
|
||||
### Schritt 2: Frontend starten (neues Terminal)
|
||||
|
||||
```bash
|
||||
streamlit run app/frontend/ui.py --server.port 8501
|
||||
```
|
||||
|
||||
### Schritt 3: Browser öffnen
|
||||
|
||||
Öffne `http://localhost:8501` im Browser.
|
||||
|
||||
### Schritt 4: Erste Notiz importieren
|
||||
|
||||
Erstelle eine Test-Notiz im `vault/` Ordner:
|
||||
|
||||
```markdown
|
||||
---
|
||||
id: 20250101-test
|
||||
title: Meine erste Notiz
|
||||
type: concept
|
||||
status: active
|
||||
---
|
||||
|
||||
# Meine erste Notiz
|
||||
|
||||
Dies ist eine Test-Notiz für Mindnet.
|
||||
|
||||
[[rel:related_to Mindnet]]
|
||||
```
|
||||
|
||||
Dann importiere sie:
|
||||
|
||||
```bash
|
||||
python3 -m scripts.import_markdown --vault ./vault --prefix mindnet --apply
|
||||
```
|
||||
|
||||
### Schritt 5: Erste Chat-Anfrage
|
||||
|
||||
Im Browser-Chat eingeben:
|
||||
```
|
||||
Was ist Mindnet?
|
||||
```
|
||||
|
||||
## 📚 Nächste Schritte
|
||||
|
||||
Nach dem Schnellstart empfehlen wir:
|
||||
|
||||
1. **[Chat Usage Guide](../01_User_Manual/01_chat_usage_guide.md)** - Lerne die Chat-Funktionen kennen
|
||||
2. **[Knowledge Design](../01_User_Manual/01_knowledge_design.md)** - Verstehe, wie du Notizen strukturierst
|
||||
3. **[Authoring Guidelines](../01_User_Manual/01_authoring_guidelines.md)** - Lerne Best Practices für das Schreiben
|
||||
|
||||
## 🆘 Hilfe & Troubleshooting
|
||||
|
||||
**Problem:** Qdrant startet nicht
|
||||
- **Lösung:** Prüfe, ob Docker läuft: `docker ps`
|
||||
|
||||
**Problem:** Ollama-Modell nicht gefunden
|
||||
- **Lösung:** Prüfe mit `ollama list`, ob die Modelle geladen sind
|
||||
|
||||
**Problem:** Import schlägt fehl
|
||||
- **Lösung:** Prüfe die Logs und stelle sicher, dass Qdrant läuft
|
||||
|
||||
Für detaillierte Troubleshooting-Informationen siehe [Admin Operations](../04_Operations/04_admin_operations.md#33-troubleshooting-guide).
|
||||
|
||||
## 🔗 Weitere Ressourcen
|
||||
|
||||
- **[Dokumentationskarte](00_documentation_map.md)** - Übersicht aller Dokumente
|
||||
- **[Glossar](00_glossary.md)** - Wichtige Begriffe erklärt
|
||||
- **[Vision & Strategie](00_vision_and_strategy.md)** - Die Philosophie hinter Mindnet
|
||||
|
||||
---
|
||||
|
||||
**Viel Erfolg mit Mindnet!** 🚀
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user