From b78f3dd0cd31d6a97de2f25a012938de28287ff5 Mon Sep 17 00:00:00 2001 From: ethanglaser Date: Wed, 13 May 2026 13:47:38 -0700 Subject: [PATCH 1/2] Revert "Fix duplicate-cluster trigger to require >=2 kept candidates (#327)" This reverts commit 9cee0f13003c9affda297a2cd6f3071c3a3c6144. --- include/svs/index/vamana/prune.h | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/include/svs/index/vamana/prune.h b/include/svs/index/vamana/prune.h index 07dc1997..f303a56f 100644 --- a/include/svs/index/vamana/prune.h +++ b/include/svs/index/vamana/prune.h @@ -185,12 +185,8 @@ void heuristic_prune_neighbors( current_alpha *= alpha; } - // Add a diversity edge if a duplicate cluster is detected. - // A "cluster" requires at least 2 kept candidates sharing the same - // distance; a single retained neighbor is not a cluster and must not - // be replaced (doing so would discard the only true nearest-neighbor - // edge for that node). - if (all_duplicates && anchor_set && result.size() >= 2) { + // Add a diversity edge if a duplicate cluster is detected + if (all_duplicates && anchor_set && !result.empty()) { auto result_id = [](const I& r) -> size_t { if constexpr (std::integral) { return static_cast(r); @@ -301,12 +297,8 @@ void heuristic_prune_neighbors( current_alpha *= alpha; } - // Add a diversity edge if a duplicate cluster is detected. - // A "cluster" requires at least 2 kept candidates sharing the same - // distance; a single retained neighbor is not a cluster and must not - // be replaced (doing so would discard the only true nearest-neighbor - // edge for that node). - if (all_duplicates && anchor_set && result.size() >= 2) { + // Add a diversity edge if a duplicate cluster is detected + if (all_duplicates && anchor_set && !result.empty()) { auto result_id = [](const I& r) -> size_t { if constexpr (std::integral) { return static_cast(r); From 93b6f5e46e0d1087268ea5b4b34316619f9ed81a Mon Sep 17 00:00:00 2001 From: ethanglaser Date: Wed, 13 May 2026 13:47:56 -0700 Subject: [PATCH 2/2] Revert "Enhance heuristic pruning to handle duplicate clusters (#282)" This reverts commit 629a79c518fc9b6bce567a23032c5cff4a623f7d. --- include/svs/index/vamana/prune.h | 94 -------------------------------- tests/svs/index/vamana/prune.cpp | 65 ---------------------- 2 files changed, 159 deletions(-) diff --git a/include/svs/index/vamana/prune.h b/include/svs/index/vamana/prune.h index f303a56f..aeab27ac 100644 --- a/include/svs/index/vamana/prune.h +++ b/include/svs/index/vamana/prune.h @@ -130,9 +130,6 @@ void heuristic_prune_neighbors( auto pruned = std::vector(poolsize, PruneState::Available); float current_alpha = 1.0f; - float anchor_dist = 0.0f; - bool anchor_set = false; - bool all_duplicates = true; while (result.size() < max_result_size && !cmp(alpha, current_alpha)) { size_t start = 0; while (result.size() < max_result_size && start < poolsize) { @@ -148,16 +145,6 @@ void heuristic_prune_neighbors( const auto& query = accessor(dataset, id); distance::maybe_fix_argument(distance_function, query); result.push_back(detail::construct_as(lib::Type(), pool[start])); - - if (all_duplicates) { - if (!anchor_set) { - anchor_dist = pool[start].distance(); - anchor_set = true; - } else if (pool[start].distance() != anchor_dist) { - all_duplicates = false; - } - } - for (size_t t = start + 1; t < poolsize; ++t) { if (excluded(pruned[t])) { continue; @@ -184,40 +171,6 @@ void heuristic_prune_neighbors( } current_alpha *= alpha; } - - // Add a diversity edge if a duplicate cluster is detected - if (all_duplicates && anchor_set && !result.empty()) { - auto result_id = [](const I& r) -> size_t { - if constexpr (std::integral) { - return static_cast(r); - } else { - return static_cast(r.id()); - } - }; - for (size_t t = 0; t < poolsize; ++t) { - const auto& candidate = pool[t]; - auto cid = candidate.id(); - if (cid == current_node_id || candidate.distance() == anchor_dist) { - continue; - } - bool in_result = false; - for (const auto& r : result) { - if (result_id(r) == static_cast(cid)) { - in_result = true; - break; - } - } - assert( - !in_result && - "Candidate with non-anchor distance should not already be in result" - ); - if (in_result) { - continue; - } - result.back() = detail::construct_as(lib::Type(), candidate); - break; - } - } } template < @@ -250,9 +203,6 @@ void heuristic_prune_neighbors( std::vector pruned(poolsize, type_traits::tombstone_v); float current_alpha = 1.0f; - float anchor_dist = 0.0f; - bool anchor_set = false; - bool all_duplicates = true; while (result.size() < max_result_size && !cmp(alpha, current_alpha)) { size_t start = 0; while (result.size() < max_result_size && start < poolsize) { @@ -268,16 +218,6 @@ void heuristic_prune_neighbors( const auto& query = accessor(dataset, id); distance::maybe_fix_argument(distance_function, query); result.push_back(detail::construct_as(lib::Type(), pool[start])); - - if (all_duplicates) { - if (!anchor_set) { - anchor_dist = pool[start].distance(); - anchor_set = true; - } else if (pool[start].distance() != anchor_dist) { - all_duplicates = false; - } - } - for (size_t t = start + 1; t < poolsize; ++t) { if (cmp(current_alpha, pruned[t])) { continue; @@ -296,40 +236,6 @@ void heuristic_prune_neighbors( } current_alpha *= alpha; } - - // Add a diversity edge if a duplicate cluster is detected - if (all_duplicates && anchor_set && !result.empty()) { - auto result_id = [](const I& r) -> size_t { - if constexpr (std::integral) { - return static_cast(r); - } else { - return static_cast(r.id()); - } - }; - for (size_t t = 0; t < poolsize; ++t) { - const auto& candidate = pool[t]; - auto cid = candidate.id(); - if (cid == current_node_id || candidate.distance() == anchor_dist) { - continue; - } - bool in_result = false; - for (const auto& r : result) { - if (result_id(r) == static_cast(cid)) { - in_result = true; - break; - } - } - assert( - !in_result && - "Candidate with non-anchor distance should not already be in result" - ); - if (in_result) { - continue; - } - result.back() = detail::construct_as(lib::Type(), candidate); - break; - } - } } /// diff --git a/tests/svs/index/vamana/prune.cpp b/tests/svs/index/vamana/prune.cpp index d50ffdfe..1c67a5af 100644 --- a/tests/svs/index/vamana/prune.cpp +++ b/tests/svs/index/vamana/prune.cpp @@ -17,10 +17,6 @@ // header under test #include "svs/index/vamana/prune.h" -// core -#include "svs/core/data/simple.h" -#include "svs/core/distance/euclidean.h" - // catch2 #include "catch2/catch_test_macros.hpp" @@ -50,65 +46,4 @@ CATCH_TEST_CASE("Pruning", "[index][vamana]") { CATCH_REQUIRE(v::excluded(v::PruneState::Pruned) == true); } } - - CATCH_SECTION("Duplicate Cluster Trap") { - auto data = svs::data::SimpleData(6, 4); - auto d0 = std::vector{1.0f, 1.0f, 1.0f, 1.0f}; - auto d4 = std::vector{2.0f, 1.0f, 1.0f, 1.0f}; - auto d5 = std::vector{1.5f, 1.0f, 1.0f, 1.0f}; - - for (size_t i = 0; i < 4; ++i) { - data.set_datum(i, d0); - } - data.set_datum(4, d4); - data.set_datum(5, d5); - - auto dist = svs::distance::DistanceL2(); - auto accessor = svs::data::GetDatumAccessor{}; - - std::vector> pool = { - {size_t{0}, 0.0f}, - {size_t{1}, 0.0f}, - {size_t{2}, 0.0f}, - {size_t{3}, 0.0f}, - {size_t{4}, 1.0f}}; - - CATCH_SECTION("Iterative Strategy Fix") { - std::vector> result; - v::heuristic_prune_neighbors( - v::IterativePruneStrategy{}, - 2, - 1.3f, - data, - accessor, - dist, - size_t{5}, - std::span>(pool), - result - ); - - CATCH_REQUIRE(result.size() == 2); - CATCH_REQUIRE(result[0].id() == 0); - CATCH_REQUIRE(result[1].id() == 4); - } - - CATCH_SECTION("Progressive Strategy Fix") { - std::vector> result; - v::heuristic_prune_neighbors( - v::ProgressivePruneStrategy{}, - 2, - 1.3f, - data, - accessor, - dist, - size_t{5}, - std::span>(pool), - result - ); - - CATCH_REQUIRE(result.size() == 2); - CATCH_REQUIRE(result[0].id() == 0); - CATCH_REQUIRE(result[1].id() == 4); - } - } }