-
Notifications
You must be signed in to change notification settings - Fork 323
Expand file tree
/
Copy pathtest.cpp
More file actions
1457 lines (1257 loc) · 60.9 KB
/
test.cpp
File metadata and controls
1457 lines (1257 loc) · 60.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/**
* @file test.cpp
* @author Ash Vardanian
* @brief Unit-testing vector-search functionality.
* @date June 10, 2023
*
*
* Key and slot types:
* - 64-bit `std::int64_t` keys and `std::uint32_t` slots are most popular.
* - 64-bit `std::uint64_t` keys and `std::uint40_t` are most space-efficient for
* point clouds 4B+ in size.
* - 128-bit `uuid_t` keys and `enum slot64_t : std::uint64_t` make most sense for
* for database users, implementing portable, concurrent systems.
*/
#include <cassert> // `assert`
#include <cmath> // `std::abs`
#include <csignal> // `std::signal`, `SIGSEGV`, ...
#include <cstdio> // `std::fprintf`
#include <cstdlib> // `std::_Exit`
#include <limits> // `std::numeric_limits`
#include <algorithm> // `std::shuffle`
#include <random> // `std::default_random_engine`
#include <stdexcept> // `std::terminate`
#include <unordered_map> // `std::unordered_map`
#include <vector> // `std::vector`
// Back-trace support. Prefer the C++23 `<stacktrace>` library when the
// toolchain + stdlib expose it (`__cpp_lib_stacktrace`); otherwise fall back
// to the OS-native facility so that unit-test crashes in CI log something
// useful beyond a bare exit code.
#if defined(__has_include)
#if __has_include(<stacktrace>)
#include <stacktrace>
#endif
#endif
#if defined(__cpp_lib_stacktrace) && __cpp_lib_stacktrace >= 202011L
#define USEARCH_HAS_STD_STACKTRACE 1
#else
#define USEARCH_HAS_STD_STACKTRACE 0
#if defined(_WIN32)
// `windows.h` must precede `dbghelp.h` — the latter uses `PSTR` and friends
// that are only defined after `windows.h`. The blank line keeps clang-format
// from re-sorting the two headers into a single alphabetized block.
#include <windows.h>
#include <dbghelp.h>
#pragma comment(lib, "Dbghelp.lib")
#else
#include <execinfo.h>
#include <unistd.h>
#endif
#endif
#define SZ_USE_X86_AVX512 0 // Sanitizers hate AVX512
#include <stringzilla/stringzilla.hpp> // Levenshtein distance implementation
#include <usearch/index.hpp>
#include <usearch/index_dense.hpp>
#include <usearch/index_plugins.hpp>
using namespace unum::usearch;
using namespace unum;
void __expect(bool must_be_true, char const* file, int line, char const* message = nullptr) {
if (must_be_true)
return;
message = message ? message : "C++ unit test failed";
char buffer[512];
std::snprintf(buffer, sizeof(buffer), "%s at %s:%d", message, file, line);
usearch_raise_runtime_error(buffer);
}
template <typename value_at>
void __expect_eq(value_at a, value_at b, char const* file, int line, char const* message = nullptr) {
__expect(a == b, file, line, message);
}
#define expect(cond) __expect((bool)(cond), __FILE__, __LINE__)
#define expect_eq(a, b) __expect_eq<decltype(a)>((a), (b), __FILE__, __LINE__)
/**
* Less error-prone type definition to differentiate `std::uint32_t` and other native
* types in logs and avoid implicit conversions.
*/
enum slot32_t : std::uint32_t {};
template <> struct unum::usearch::hash_gt<slot32_t> : public unum::usearch::hash_gt<std::uint32_t> {};
template <> struct unum::usearch::default_free_value_gt<slot32_t> {
static slot32_t value() noexcept { return static_cast<slot32_t>((std::numeric_limits<std::uint32_t>::max)()); }
};
/*
* Let's instantiate several templates to make all of their symbols available for testing.
* https://dhashe.com/how-to-build-highly-debuggable-c-binaries.html
*/
template class unum::usearch::index_gt<float, std::int64_t, slot32_t>;
template class unum::usearch::index_gt<float, std::int64_t, uint40_t>;
template class unum::usearch::index_dense_gt<std::int64_t, slot32_t>;
template class unum::usearch::index_dense_gt<std::int64_t, uint40_t>;
/**
* @brief Convenience wrapper combining combined allocation and construction of an index.
*/
template <typename index_at> struct aligned_wrapper_gt {
using index_t = index_at;
using alloc_t = aligned_allocator_gt<index_t, 64>;
alloc_t alloc;
index_t* index = nullptr;
template <typename... args_at> aligned_wrapper_gt(args_at&&... args) {
alloc_t index_alloc;
index_t* index_typed = index_alloc.allocate(1);
expect(index_typed != nullptr);
expect(((unsigned long long)(index_typed) % 64ull) == 0ull);
new (index_typed) index_t(std::forward<args_at>(args)...);
index = index_typed;
}
~aligned_wrapper_gt() {
if (index != nullptr) {
index->~index_t();
alloc.deallocate(index, 1);
}
}
};
/**
* Tests the functionality of the custom uint40_t type ensuring consistent
* behavior across various constructors from uint32_t, uint64_t, and size_t types,
* and validates the relational ordering between various values.
*/
void test_uint40() {
union uint64_octets_t {
std::uint64_t value;
std::uint8_t octets[8];
};
// Constants for tests
std::uint64_t max_uint40_k = (1ULL << 40) - 1;
// Set of test numbers
std::vector<std::uint64_t> test_numbers = {
42ull, // Typical small number
4242ull, // Larger number still within uint40 range
(1ull << 39), // A high number within range
(1ull << 40) - 1, // Maximum value representable in uint40
1ull << 40, // Exactly at the boundary of uint40
(1ull << 40) + 1, // Just beyond the boundary of uint40
1ull << 63 // Well beyond the uint40 boundary, tests masking
};
for (std::uint64_t input_u64 : test_numbers) {
std::uint32_t input_u32 = static_cast<std::uint32_t>(input_u64);
std::size_t input_size = static_cast<std::size_t>(input_u64);
// Create uint40_t instances from different types
uint40_t u40_from_u32(input_u32);
uint40_t u40_from_u64(input_u64);
uint40_t u40_from_size(input_size);
// Expected value after masking
uint64_octets_t input_clamped;
input_clamped.value = input_u64 & max_uint40_k;
// Check if all conversions are equal to the masked value
expect_eq(u40_from_u32, input_clamped.value & 0xFFFFFFFF);
expect_eq(u40_from_u64, input_clamped.value);
expect_eq(u40_from_size, input_clamped.value);
// Check relative ordering against all other test numbers
for (std::uint64_t other_u64 : test_numbers) {
uint64_octets_t other_clamped;
other_clamped.value = other_u64 & max_uint40_k;
uint40_t other_u40(other_clamped.value);
// Check < and >
expect_eq(input_clamped.value < other_clamped.value, u40_from_u64 < other_u40);
expect_eq(input_clamped.value > other_clamped.value, u40_from_u64 > other_u40);
// Check <= and >=
expect_eq(input_clamped.value <= other_clamped.value, u40_from_u64 <= other_u40);
expect_eq(input_clamped.value >= other_clamped.value, u40_from_u64 >= other_u40);
}
}
// Test equality and inequality operators
for (std::uint64_t input_u64 : test_numbers) {
uint64_octets_t input_clamped;
input_clamped.value = input_u64 & max_uint40_k;
uint40_t u40(input_clamped.value);
expect_eq(u40 == uint40_t(input_clamped.value), true);
expect_eq(u40 != uint40_t(input_clamped.value + 1), true);
}
// Test min and max functions
expect_eq((uint40_t::min)(), uint40_t(0u));
expect_eq((uint40_t::max)(), uint40_t(max_uint40_k));
// Test copy and move semantics
for (std::uint64_t input_u64 : test_numbers) {
uint40_t u40_orig(input_u64);
uint40_t u40_copy(u40_orig);
expect_eq(u40_orig, u40_copy);
uint40_t u40_move(std::move(u40_orig));
expect_eq(u40_copy, u40_move);
}
// Test default constructor (zero initialization)
uint40_t u40_default;
expect_eq(u40_default, uint40_t(0u));
}
void test_checked_size_arithmetic() {
std::printf("Testing checked size arithmetic\n");
std::size_t max = (std::numeric_limits<std::size_t>::max)();
checked_size_result_t cast = checked_size_from_u64(42);
expect(cast);
expect_eq(cast.value, static_cast<std::size_t>(42));
if (sizeof(std::size_t) < sizeof(std::uint64_t))
expect(!checked_size_from_u64((std::numeric_limits<std::uint64_t>::max)()));
checked_size_result_t sum = checked_add(max - 1, 1);
expect(sum);
expect_eq(sum.value, max);
expect(!checked_add(max, 1));
checked_size_result_t product = checked_mul(max / 2, 2);
expect(product);
expect_eq(product.value, max - 1);
expect(!checked_mul(max / 2 + 1, 2));
checked_size_result_t fused = checked_mul_add(10, 20, 30);
expect(fused);
expect_eq(fused.value, static_cast<std::size_t>(230));
expect(!checked_mul_add(max / 2 + 1, 2, 0));
expect(!checked_mul_add(max / 2, 2, 2));
checked_size_result_t rounded = checked_round_up(17, 8);
expect(rounded);
expect_eq(rounded.value, static_cast<std::size_t>(24));
expect(!checked_round_up(max, 8));
checked_size_result_t power = checked_ceil2(17);
expect(power);
expect_eq(power.value, static_cast<std::size_t>(32));
checked_size_result_t zero_power = checked_ceil2(0);
expect(zero_power);
expect_eq(zero_power.value, static_cast<std::size_t>(0));
expect(!checked_ceil2(max));
}
/**
* @brief Tests the functionality of the custom float16_t type ensuring consistent.
*/
void test_float16() {}
/**
* The goal of this test is to invoke as many different interfaces as possible, making sure that all code-paths compile.
* For that it only uses a tiny set of 3 predefined vectors.
*
* @param index Reference to the index where vectors will be stored and searched.
* @param vectors A collection of vectors to be tested.
* @param args Additional arguments for configuring search or index operations.
* @tparam punned_ak Template parameter that determines specific behaviors or checks in the test based on its value.
* @tparam index_at Type of the index being tested.
* @tparam scalar_at Data type of the elements in the vectors.
* @tparam extra_args_at Variadic template parameter types for additional configuration.
*/
template <bool punned_ak, typename index_at, typename scalar_at, typename... extra_args_at>
void test_minimal_three_vectors(index_at& index, //
typename index_at::vector_key_t key_first, std::vector<scalar_at> const& vector_first,
typename index_at::vector_key_t key_second, std::vector<scalar_at> const& vector_second,
typename index_at::vector_key_t key_third, std::vector<scalar_at> const& vector_third,
extra_args_at&&... args) {
using scalar_t = scalar_at;
using index_t = index_at;
using vector_key_t = typename index_t::vector_key_t;
using distance_t = typename index_t::distance_t;
// Try checking the empty state
if constexpr (punned_ak) {
if (index.config().enable_key_lookups) {
expect(!index.contains(key_first));
expect(!index.get(key_first, (f32_t*)nullptr, 1));
}
}
// Add data
expect(index.try_reserve(10));
expect(index.add(key_first, vector_first.data(), args...));
// Default approximate search
vector_key_t matched_keys[10] = {0};
distance_t matched_distances[10] = {0};
std::size_t matched_count = index.search(vector_first.data(), 5, args...).dump_to(matched_keys, matched_distances);
expect(matched_count == 1);
expect(matched_keys[0] == key_first);
expect(std::abs(matched_distances[0]) < 0.01);
// Add more entries
index.add(key_second, vector_second.data(), args...);
index.add(key_third, vector_third.data(), args...);
expect(index.size() == 3);
// Perform single entry search
{
auto search_result = index.search(vector_first.data(), 5, args...);
expect(search_result);
matched_count = search_result.dump_to(matched_keys, matched_distances);
expect(matched_count != 0);
}
// Perform filtered exact search, keeping only odd values
if constexpr (punned_ak) {
auto is_odd = [](vector_key_t key) -> bool { return (key & 1) != 0; };
auto search_result = index.filtered_search(vector_first.data(), 5, is_odd, args...);
expect(search_result);
matched_count = search_result.dump_to(matched_keys, matched_distances);
expect(matched_count != 0);
for (std::size_t i = 0; i < matched_count; i++)
expect(is_odd(matched_keys[i]));
}
// Validate scans
std::size_t count = 0;
for (auto member : index) {
vector_key_t id = member.key;
expect(id >= key_first && id <= key_third);
count++;
}
expect((count == 3));
expect((index.stats(0).nodes == 3));
// Check if clustering endpoint compiles
index.cluster(vector_first.data(), 0, args...);
// Try removals and replacements
if constexpr (punned_ak) {
if (index.config().enable_key_lookups) {
using labeling_result_t = typename index_t::labeling_result_t;
labeling_result_t result = index.remove(key_third);
expect(result);
expect(index.size() == 2);
index.add(key_third, vector_third.data(), args...);
expect(index.size() == 3);
}
}
expect(index.save("tmp.usearch"));
// Perform content and scan validations for a copy
{
auto copy_result = index.copy();
expect(copy_result);
index_at copied_index = std::move(copy_result.index);
// Perform single entry search
auto search_result = copied_index.search(vector_first.data(), 5, args...);
expect(search_result);
matched_count = search_result.dump_to(matched_keys, matched_distances);
expect(matched_count != 0);
// Validate scans
std::size_t count = 0;
for (auto member : copied_index) {
vector_key_t id = member.key;
expect(id >= key_first && id <= key_third);
count++;
}
expect_eq(count, 3);
expect_eq(copied_index.stats(0).nodes, 3);
}
// Perform content and scan validations for a moved
{
index_at moved_index(std::move(index));
// Perform single entry search
auto search_result = moved_index.search(vector_first.data(), 5, args...);
expect(search_result);
matched_count = search_result.dump_to(matched_keys, matched_distances);
expect(matched_count != 0);
// Validate scans
std::size_t count = 0;
for (auto member : moved_index) {
vector_key_t id = member.key;
expect(id >= key_first && id <= key_third);
count++;
}
expect_eq(count, 3);
expect_eq(moved_index.stats(0).nodes, 3);
}
// Check if metadata is retrieved correctly
if constexpr (punned_ak) {
auto head_result = index_dense_metadata_from_path("tmp.usearch");
expect(head_result);
expect_eq(3ull, head_result.head.count_present);
}
// Try loading and move assignment
{
index_at loaded_index;
auto load_result = loaded_index.load("tmp.usearch");
expect(load_result);
index = std::move(loaded_index);
}
// Check the copy of the restored index
{
auto copy_result = index.copy();
expect(copy_result);
index_at copied_index = std::move(copy_result.index);
expect_eq(copied_index.size(), 3);
}
// Search again over reconstructed index
{
matched_count = index.search(vector_first.data(), 5, args...).dump_to(matched_keys, matched_distances);
expect_eq(matched_count, 3);
expect_eq(matched_keys[0], key_first);
expect(std::abs(matched_distances[0]) < 0.01);
}
// Try retrieving a vector from a deserialized index
if constexpr (punned_ak) {
if (index.config().enable_key_lookups) {
std::size_t dimensions = vector_first.size();
std::vector<scalar_t> vector_reloaded(dimensions);
expect(index.get(key_second, vector_reloaded.data()));
expect(std::equal(vector_second.data(), vector_second.data() + dimensions, vector_reloaded.data()));
}
}
}
/**
* @brief Tests value removals, by repeatedly adding and removing a couple of vectors.
*
* @param index Reference to the index where vectors will be stored and searched.
* @param vector_first First vector to be tested.
* @param vector_second Second vector to be tested.
* @param args Additional arguments for configuring search or index operations.
*/
template <typename index_at, typename scalar_at>
void test_punned_add_remove_vector( //
index_at& index, //
std::vector<scalar_at> const& vector_first, //
std::vector<scalar_at> const& vector_second) {
using index_t = index_at;
using vector_key_t = typename index_t::vector_key_t;
// Creating the index
expect(index.try_reserve(10));
expect(index.capacity() >= 10);
// Max 64-bit unsigned key value is: 18446744073709551615
vector_key_t key_first = 483367403120493160;
vector_key_t key_second = 483367403120558696;
vector_key_t key_third = 483367403120624232;
vector_key_t key_fourth = 483367403120624233;
// Adding, getting, and removing vectors
expect(index.add(key_first, vector_first.data()));
std::vector<float> found_slice(vector_first.size(), 0.0f);
expect_eq(index.get(key_first, found_slice.data()), 1);
expect(index.remove(key_first));
expect(index.add(key_second, vector_second.data()));
expect_eq(index.get(key_second, found_slice.data()), 1);
expect(index.remove(key_second));
expect(index.add(key_third, vector_second.data()));
expect_eq(index.get(key_third, found_slice.data()), 1);
expect(index.remove(key_third));
expect(index.add(key_fourth, vector_second.data()));
expect_eq(index.get(key_fourth, found_slice.data()), 1);
expect(index.remove(key_fourth));
expect_eq(index.size(), 0);
}
/**
* Tests the normal operational mode of the library, dealing with a variable length collection
* of `vectors` with monotonically increasing keys starting from `start_key`.
*
* @param index Reference to the index where vectors will be stored and searched.
* @param start_key The key for the first `vector`, others are generated with increments.
* @param vectors A collection of vectors to be tested.
* @param args Additional arguments for configuring search or index operations.
* @tparam punned_ak Template parameter that determines specific behaviors or checks in the test based on its value.
* @tparam index_at Type of the index being tested.
* @tparam scalar_at Data type of the elements in the vectors.
* @tparam extra_args_at Variadic template parameter types for additional configuration.
*/
template <bool punned_ak, typename index_at, typename scalar_at, typename... extra_args_at>
void test_collection(index_at& index, typename index_at::vector_key_t const start_key,
std::vector<std::vector<scalar_at>> const& vectors, extra_args_at&&... args) {
using scalar_t = scalar_at;
using index_t = index_at;
using vector_key_t = typename index_t::vector_key_t;
using distance_t = typename index_t::distance_t;
using index_add_result_t = typename index_t::add_result_t;
using index_search_result_t = typename index_t::search_result_t;
// Generate some keys starting from end,
// for three vectors from the dataset
vector_key_t const key_first = start_key;
std::vector<scalar_at> const& vector_first = vectors[0];
std::size_t dimensions = vector_first.size();
// Try batch requests, heavily over-subscribing the CPU cores
std::size_t executor_threads = std::thread::hardware_concurrency();
executor_default_t executor(executor_threads);
expect(index.try_reserve({vectors.size(), executor.size()}));
executor.fixed(vectors.size(), [&](std::size_t thread, std::size_t task) {
if constexpr (punned_ak) {
index_add_result_t result = index.add(start_key + task, vectors[task].data(), args...);
expect(result);
} else {
index_update_config_t config;
config.thread = thread;
index_add_result_t result = index.add(start_key + task, vectors[task].data(), args..., config);
expect(result);
}
});
// Make sure we didn't lose parallelism settings after reload
expect(index.limits().threads_search >= executor.size());
if constexpr (punned_ak)
expect(index.currently_available_threads() >= executor.size());
// Parallel search over the same vectors
executor.fixed(vectors.size(), [&](std::size_t thread, std::size_t task) {
std::size_t max_possible_matches = vectors.size();
std::size_t count_requested = max_possible_matches;
std::vector<vector_key_t> matched_keys(count_requested);
std::vector<distance_t> matched_distances(count_requested);
std::size_t matched_count = 0;
// Invoke the search kernel
if constexpr (punned_ak) {
index_search_result_t result = index.search(vectors[task].data(), count_requested, args...);
expect(result);
matched_count = result.dump_to(matched_keys.data(), matched_distances.data());
} else {
index_search_config_t config;
config.thread = thread;
index_search_result_t result = index.search(vectors[task].data(), count_requested, args..., config);
expect(result);
matched_count = result.dump_to(matched_keys.data(), matched_distances.data());
}
// In approximate search we can't always expect the right answer to be found
// expect_eq(matched_count, max_possible_matches);
// expect_eq(matched_keys[0], start_key + task);
// expect(std::abs(matched_distances[0]) < 0.01);
expect(matched_count <= max_possible_matches);
// Check that all the distance are monotonically rising
for (std::size_t i = 1; i < matched_count; i++)
expect(matched_distances[i - 1] <= matched_distances[i]);
});
// Search again over mapped index
expect(index.save("tmp.usearch"));
{
auto copy_result = index.copy();
expect(copy_result);
index_at copied_index = std::move(copy_result.index);
expect_eq(copied_index.size(), vectors.size());
}
// Check for duplicates
if constexpr (punned_ak) {
if (index.config().enable_key_lookups) {
expect(index.try_reserve({vectors.size() + 1u, executor.size()}));
index_add_result_t result = index.add(key_first, vector_first.data(), args...);
expect_eq(!!result, index.multi());
result.error.release();
std::size_t first_key_count = index.count(key_first);
expect_eq(first_key_count, 1ul + index.multi());
}
}
// Recover the state before the duplicate insertion
expect(index.view("tmp.usearch"));
// Parallel search over the same vectors
executor.fixed(vectors.size(), [&](std::size_t thread, std::size_t task) {
// Check over-sampling beyond the size of the collection
std::size_t max_possible_matches = vectors.size();
std::size_t count_requested = max_possible_matches * 10;
std::vector<vector_key_t> matched_keys(count_requested);
std::vector<distance_t> matched_distances(count_requested);
std::size_t matched_count = 0;
// Invoke the search kernel
if constexpr (punned_ak) {
index_search_result_t result = index.search(vectors[task].data(), count_requested, args...);
expect(result);
matched_count = result.dump_to(matched_keys.data(), matched_distances.data());
} else {
index_search_config_t config;
config.thread = thread;
index_search_result_t result = index.search(vectors[task].data(), count_requested, args..., config);
expect(result);
matched_count = result.dump_to(matched_keys.data(), matched_distances.data());
}
// In approximate search we can't always expect the right answer to be found
// expect_eq(matched_count, max_possible_matches);
// expect_eq(matched_keys[0], start_key + task);
// expect(std::abs(matched_distances[0]) < 0.01);
expect(matched_count <= max_possible_matches);
// Check that all the distance are monotonically rising
for (std::size_t i = 1; i < matched_count; i++)
expect(matched_distances[i - 1] <= matched_distances[i]);
});
// Try retrieving a vector from a deserialized index
if constexpr (punned_ak) {
if (index.config().enable_key_lookups) {
expect(index.contains(key_first));
expect_eq(index.count(key_first), 1);
std::vector<scalar_t> vector_reloaded(dimensions);
index.get(key_first, vector_reloaded.data());
expect(std::equal(vector_first.data(), vector_first.data() + dimensions, vector_reloaded.data()));
}
auto compaction_result = index.compact();
expect(compaction_result);
}
expect(index.memory_usage() > 0);
expect(index.stats().max_edges > 0);
// Check metadata
if constexpr (punned_ak) {
index_dense_metadata_result_t meta = index_dense_metadata_from_path("tmp.usearch");
expect(meta);
}
}
/**
* Stress-tests the behavior of the type-punned higher-level index under heavy concurrent insertions,
* removals and updates.
*
* @param index Reference to the index where vectors will be stored and searched.
* @param start_key The key for the first `vector`, others are generated with increments.
* @param vectors A collection of vectors to be tested.
* @param executor_threads Number of threads to be used for concurrent operations.
* @tparam punned_ak Template parameter that determines specific behaviors or checks in the test based on its value.
* @tparam index_at Type of the index being tested.
* @tparam scalar_at Data type of the elements in the vectors.
* @tparam extra_args_at Variadic template parameter types for additional configuration.
*/
template <typename index_at, typename scalar_at, typename... extra_args_at>
void test_punned_concurrent_updates(index_at& index, typename index_at::vector_key_t const start_key,
std::vector<std::vector<scalar_at>> const& vectors, std::size_t executor_threads) {
using index_t = index_at;
// Try batch requests, heavily oversubscribing the CPU cores
executor_default_t executor(executor_threads);
expect(index.try_reserve({vectors.size(), executor.size()}));
executor.fixed(vectors.size(), [&](std::size_t, std::size_t task) {
using add_result_t = typename index_t::add_result_t;
add_result_t result = index.add(start_key + task, vectors[task].data());
expect(result);
});
expect_eq(index.size(), vectors.size());
// Without key lookups we can't do much more
if (!index.config().enable_key_lookups)
return;
// Remove all the keys
executor.fixed(vectors.size(), [&](std::size_t, std::size_t task) {
using labeling_result_t = typename index_t::labeling_result_t;
labeling_result_t result = index.remove(start_key + task);
expect(result);
});
expect_eq(index.size(), 0);
// Add them back, which under the hood will trigger the `update`
executor.fixed(vectors.size(), [&](std::size_t, std::size_t task) {
using add_result_t = typename index_t::add_result_t;
add_result_t result = index.add(start_key + task, vectors[task].data());
expect(result);
});
expect_eq(index.size(), vectors.size());
}
/**
* Overloaded function to test cosine similarity index functionality using specific scalar, key, and slot types.
*
* This function initializes vectors and an index instance to test cosine similarity calculations and index operations.
* It involves creating vectors with random values, constructing an index, and verifying that the index operations
* like search work correctly with respect to the cosine similarity metric.
*
* @param collection_size Number of vectors to be included in the test.
* @param dimensions Number of dimensions each vector should have.
* @tparam scalar_at Data type of the elements in the vectors.
* @tparam key_at Data type used for the keys in the index.
* @tparam slot_at Data type used for slots in the index.
*/
template <typename scalar_at, typename key_at, typename slot_at> //
void test_cosine(std::size_t collection_size, std::size_t dimensions) {
using scalar_t = scalar_at;
using vector_key_t = key_at;
using slot_t = slot_at;
using index_typed_t = index_gt<float, vector_key_t, slot_t>;
using member_cref_t = typename index_typed_t::member_cref_t;
using member_citerator_t = typename index_typed_t::member_citerator_t;
using vector_of_vectors_t = std::vector<std::vector<scalar_at>>;
vector_of_vectors_t vector_of_vectors(collection_size);
for (auto& vector : vector_of_vectors) {
vector.resize(dimensions);
std::generate(vector.begin(), vector.end(), [=] { return float(std::rand()) / float(RAND_MAX); });
}
struct metric_t {
vector_of_vectors_t const* vector_of_vectors_ptr = nullptr;
std::size_t dimensions = 0;
scalar_t const* row(std::size_t i) const noexcept { return (*vector_of_vectors_ptr)[i].data(); }
float operator()(member_cref_t const& a, member_cref_t const& b) const {
return metric_cos_gt<scalar_t, float>{}(row(get_slot(b)), row(get_slot(a)), dimensions);
}
float operator()(scalar_t const* some_vector, member_cref_t const& member) const {
return metric_cos_gt<scalar_t, float>{}(some_vector, row(get_slot(member)), dimensions);
}
float operator()(member_citerator_t const& a, member_citerator_t const& b) const {
return metric_cos_gt<scalar_t, float>{}(row(get_slot(b)), row(get_slot(a)), dimensions);
}
float operator()(scalar_t const* some_vector, member_citerator_t const& member) const {
return metric_cos_gt<scalar_t, float>{}(some_vector, row(get_slot(member)), dimensions);
}
};
// Template:
auto run_templated = [&](std::size_t connectivity) {
std::printf("-- templates with connectivity %zu \n", connectivity);
metric_t metric{&vector_of_vectors, dimensions};
index_config_t config(connectivity);
// Toy example
if (vector_of_vectors.size() >= 3) {
aligned_wrapper_gt<index_typed_t> aligned_index(config);
test_minimal_three_vectors<false, index_typed_t>( //
*aligned_index.index, //
42, vector_of_vectors[0], //
43, vector_of_vectors[1], //
44, vector_of_vectors[2], metric);
}
// Larger collection
{
aligned_wrapper_gt<index_typed_t> aligned_index(config);
test_collection<false, index_typed_t>(*aligned_index.index, 42, vector_of_vectors, metric);
}
};
for (std::size_t connectivity : {3, 13, 50})
run_templated(connectivity);
// Type-punned:
auto run_punned = [&](bool multi, bool enable_key_lookups, std::size_t connectivity) {
std::printf("-- punned with connectivity %zu, multi: %s, lookups: %s \n", connectivity, multi ? "yes" : "no",
enable_key_lookups ? "yes" : "no");
using index_t = index_dense_gt<vector_key_t, slot_t>;
using index_result_t = typename index_t::state_result_t;
metric_punned_t metric(dimensions, metric_kind_t::cos_k, scalar_kind<scalar_at>());
index_dense_config_t config(connectivity);
config.multi = multi;
config.enable_key_lookups = enable_key_lookups;
// Toy example
if (vector_of_vectors.size() >= 3) {
index_result_t index_result = index_t::make(metric, config);
test_minimal_three_vectors<true>( //
index_result.index, //
42, vector_of_vectors[0], //
43, vector_of_vectors[1], //
44, vector_of_vectors[2]);
}
if (vector_of_vectors.size() >= 3 && enable_key_lookups) {
index_result_t index_result = index_t::make(metric, config);
test_punned_add_remove_vector( //
index_result.index, //
vector_of_vectors[0], //
vector_of_vectors[1]);
}
// Larger collection
{
index_result_t index_result = index_t::make(metric, config);
test_collection<true>(index_result.index, 42, vector_of_vectors);
}
// Try running benchmarks with a different number of threads
for (std::size_t threads : {
static_cast<std::size_t>(1),
// TODO: Multithreaded updates should word differently and may involve a search first
// static_cast<std::size_t>(2),
// static_cast<std::size_t>(std::thread::hardware_concurrency()),
// static_cast<std::size_t>(std::thread::hardware_concurrency() * 4),
// static_cast<std::size_t>(vector_of_vectors.size()),
}) {
index_result_t index_result = index_t::make(metric, config);
index_t& index = index_result.index;
test_punned_concurrent_updates(index, 42, vector_of_vectors, threads);
}
};
for (bool multi : {false, true})
for (bool enable_key_lookups : {true, false})
for (std::size_t connectivity : {3, 13, 50})
run_punned(multi, enable_key_lookups, connectivity);
}
/**
* Tests the functionality of the Tanimoto coefficient calculation and indexing.
*
* Initializes a dense index configured for Tanimoto similarity and fills it with randomly generated binary vectors.
* It performs concurrent additions of these vectors to the index to ensure thread safety and correctness of concurrent
* operations.
*
* @param dimensions Number of dimensions for the binary vectors.
* @param connectivity The degree of connectivity for the index configuration.
* @tparam key_at Data type used for the keys in the index.
* @tparam slot_at Data type used for slots in the index.
*/
template <typename key_at, typename slot_at> void test_tanimoto(std::size_t dimensions, std::size_t connectivity) {
using vector_key_t = key_at;
using slot_t = slot_at;
using index_punned_t = index_dense_gt<vector_key_t, slot_t>;
std::size_t words = divide_round_up<CHAR_BIT>(dimensions);
metric_punned_t metric(words, metric_kind_t::tanimoto_k, scalar_kind_t::b1x8_k);
index_config_t config(connectivity);
auto index_result = index_punned_t::make(metric, config);
expect(index_result);
index_punned_t& index = index_result.index;
executor_default_t executor;
std::size_t batch_size = 1000;
std::vector<b1x8_t> scalars(batch_size * index.scalar_words());
std::generate(scalars.begin(), scalars.end(), [] { return static_cast<b1x8_t>(std::rand()); });
index.try_reserve({batch_size + index.size(), executor.size()});
executor.fixed(batch_size, [&](std::size_t thread, std::size_t task) {
index.add(task + 25000, scalars.data() + index.scalar_words() * task, thread);
});
}
/**
* Performs a unit test on the index with a ridiculous variety of configurations and parameters.
*
* This test aims to evaluate the index under extreme conditions, including small and potentially invalid parameters for
* connectivity, dimensions, and other configurations. It tests both the addition of vectors and their retrieval in
* these edge cases to ensure stability and error handling.
*
* @param dimensions Number of dimensions for the vectors.
* @param connectivity Index connectivity configuration.
* @param expansion_add Expansion factor during addition operations.
* @param expansion_search Expansion factor during search operations.
* @param count_vectors Number of vectors to add to the index.
* @param count_wanted Number of results wanted from search operations.
* @tparam key_at Data type used for the keys in the index.
* @tparam slot_at Data type used for slots in the index.
*/
template <typename key_at, typename slot_at>
void test_absurd(std::size_t dimensions, std::size_t connectivity, std::size_t expansion_add,
std::size_t expansion_search, std::size_t count_vectors, std::size_t count_wanted) {
using vector_key_t = key_at;
using slot_t = slot_at;
using index_punned_t = index_dense_gt<vector_key_t, slot_t>;
metric_punned_t metric(dimensions, metric_kind_t::cos_k, scalar_kind_t::f32_k);
index_dense_config_t config(connectivity, expansion_add, expansion_search);
auto index_result = index_punned_t::make(metric, config);
expect(index_result);
index_punned_t& index = index_result.index;
std::size_t count_max = (std::max)(count_vectors, count_wanted);
std::size_t needed_scalars = count_max * dimensions;
std::vector<f32_t> scalars(needed_scalars);
std::generate(scalars.begin(), scalars.end(), [] { return static_cast<f32_t>(std::rand()); });
expect(index.try_reserve({count_vectors, count_max}));
index.change_expansion_add(expansion_add);
index.change_expansion_search(expansion_search);
// Parallel construction
{
executor_default_t executor(count_vectors);
executor.fixed(count_vectors, [&](std::size_t thread, std::size_t task) {
expect(index.add(task + 25000, scalars.data() + index.scalar_words() * task, thread));
});
}
// Parallel search
{
executor_default_t executor(count_max);
executor.fixed(count_max, [&](std::size_t thread, std::size_t task) {
std::vector<vector_key_t> keys(count_wanted);
std::vector<f32_t> distances(count_wanted);
auto results = index.search(scalars.data() + index.scalar_words() * task, count_wanted, thread);
expect(results);
auto count_found = results.dump_to(keys.data(), distances.data());
expect(count_found <= count_wanted);
if (count_vectors && count_wanted)
expect(count_found > 0);
});
}
}
/**
* Tests the exact search functionality over a dataset of vectors, @b without constructing the index.
*
* Generates a dataset of vectors and performs exact search queries to verify that the search results are correct.
* This function mainly validates the basic functionality of exact searches using a given similarity metric.
*
* @param dataset_count Number of vectors in the dataset.
* @param queries_count Number of query vectors.
* @param wanted_count Number of top matches required from each query.
* @tparam scalar_at Data type of the elements in the vectors.
*/
template <typename scalar_at>
void test_exact_search(std::size_t dataset_count, std::size_t queries_count, std::size_t wanted_count) {
std::size_t dimensions = 32;
metric_punned_t metric(dimensions, metric_kind_t::cos_k, scalar_kind<scalar_at>());
std::random_device seed_source;
std::mt19937 generator(seed_source());
std::uniform_real_distribution<> distribution(0.0, 1.0); // ! We can't pass `scalar_at` to the distribution
std::vector<scalar_at> dataset(dataset_count * dimensions);
std::generate(dataset.begin(), dataset.end(), [&] { return static_cast<scalar_at>(distribution(generator)); });
exact_search_t search;
auto results = search( //
(byte_t const*)dataset.data(), dataset_count, dimensions * sizeof(scalar_at), //
(byte_t const*)dataset.data(), queries_count, dimensions * sizeof(scalar_at), //
wanted_count, metric);
for (std::size_t i = 0; i < results.size(); ++i)
assert(results.at(i)[0].offset == i); // Validate the top match
}
/**
* Tests handling of variable length sets (group of sorted unique integers), as opposed to @b equi-dimensional vectors.
*
* Adds a predefined number of vectors to an index and checks if the size of the index is updated correctly.
* It serves as a simple verification and showcase of how the same index can be used to handle strings and other types.
*
* @param index A reference to the index instance to be tested.
* @tparam index_at Type of the index being tested.
*/
template <typename key_at, typename slot_at>
void test_sets(std::size_t collection_size, std::size_t min_set_length, std::size_t max_set_length) {
/// Type of set elements, should support strong ordering
using set_member_t = std::uint32_t;
/// Jaccard is a fraction, so let's use a some float
using set_distance_t = double;
// Aliases for the index overload
using vector_key_t = key_at;
using slot_t = slot_at;
using index_t = index_gt<set_distance_t, vector_key_t, slot_t>;
// Let's allocate some data for indexing
using set_view_t = span_gt<set_member_t const>;
using sets_t = std::vector<std::vector<set_member_t>>;
sets_t sets(collection_size);
for (auto& set : sets) {
std::size_t set_size = min_set_length + std::rand() % (max_set_length - min_set_length);
set.resize(set_size);