queso-0.57.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Pages
ann_test.cpp
Go to the documentation of this file.
1 //----------------------------------------------------------------------
2 // File: ann_test.cpp
3 // Programmer: Sunil Arya and David Mount
4 // Description: test program for ANN (approximate nearest neighbors)
5 // Last modified: 01/27/10 (Version 1.1.2)
6 //----------------------------------------------------------------------
7 // Copyright (c) 1997-2010 University of Maryland and Sunil Arya and
8 // David Mount. All Rights Reserved.
9 //
10 // This software and related documentation is part of the Approximate
11 // Nearest Neighbor Library (ANN). This software is provided under
12 // the provisions of the Lesser GNU Public License (LGPL). See the
13 // file ../ReadMe.txt for further information.
14 //
15 // The University of Maryland (U.M.) and the authors make no
16 // representations about the suitability or fitness of this software for
17 // any purpose. It is provided "as is" without express or implied
18 // warranty.
19 //----------------------------------------------------------------------
20 // History:
21 // Revision 0.1 03/04/98
22 // Initial release
23 // Revision 0.2 06/26/98
24 // Added CLOCKS_PER_SEC definition if needed
25 // Revision 1.0 04/01/05
26 // Added comments (from "#" to eol)
27 // Added clus_orth_flats and clus_ellipsoids distributions
28 // Fixed order of fair and midpt in split_table
29 // Added dump/load operations
30 // Cleaned up C++ for modern compilers
31 // Revision 1.1 05/03/05
32 // Added fixed radius kNN search
33 // Revision 1.1.1 08/04/06
34 // Added planted distribution
35 // Revision 1.1.2 01/27/10
36 // Fixed minor compilation bugs for new versions of gcc
37 // Allow round-off error in validation test
38 //----------------------------------------------------------------------
39 
40 #include <ctime> // clock
41 #include <cmath> // math routines
42 #include <cstring> // C string ops
43 #include <fstream> // file I/O
44 
45 #include <ANN/ANN.h> // ANN declarations
46 #include <ANN/ANNx.h> // more ANN declarations
47 #include <ANN/ANNperf.h> // performance evaluation
48 
49 #include "rand.h" // random point generation
50 
51 #ifndef CLOCKS_PER_SEC // define clocks-per-second if needed
52  #define CLOCKS_PER_SEC 1000000
53 #endif
54 
55 using namespace std; // make std:: available
56 
57 //----------------------------------------------------------------------
58 // ann_test
59 //
60 // This program is a driver for testing and evaluating the ANN library
61 // for computing approximate nearest neighbors. It allows the user to
62 // generate data and query sets of various sizes, dimensions, and
63 // distributions, to build kd- and bbd-trees of various types, and then
64 // run queries and outputting various performance statistics.
65 //
66 // Overview:
67 // ---------
68 // The test program is run as follows:
69 //
70 // ann_test < test_input > test_output
71 //
72 // where the test_input file contains a list of directives as described
73 // below. Directives consist of a directive name, followed by list of
74 // arguments (depending on the directive). Arguments and directives are
75 // separated by white space (blank, tab, and newline). String arguments
76 // are not quoted, and consist of a string of nonwhite chacters. A
77 // character "#" denotes a comment. The following characters up to
78 // the end of line are ignored. Comments may only be inserted between
79 // directives (not within the argument list of a directive).
80 //
81 // Basic operations:
82 // -----------------
83 // The test program can perform the following operations. How these
84 // operations are performed depends on the options which are described
85 // later.
86 //
87 // Data Generation:
88 // ----------------
89 // read_data_pts <file> Create a set of data points whose
90 // coordinates are input from file <file>.
91 // gen_data_pts Create a set of data points whose
92 // coordinates are generated from the
93 // current point distribution.
94 //
95 // Building the tree:
96 // ------------------
97 // build_ann Generate an approximate nearest neighbor
98 // structure for the current data set, using
99 // the selected splitting rules. Any existing
100 // tree will be destroyed.
101 //
102 // Query Generation/Searching:
103 // ---------------------------
104 // read_query_pts <file> Create a set of query points whose
105 // coordinates are input from file <file>.
106 // gen_query_pts Create a set of query points whose
107 // coordinates are generated from the
108 // current point distribution.
109 // run_queries <string> Apply nearest neighbor searching to the
110 // query points using the approximate nearest
111 // neighbor structure and the given search
112 // strategy. Possible strategies are:
113 // standard = standard kd-tree search
114 // priority = priority search
115 //
116 // Miscellaneous:
117 // --------------
118 // output_label Output a label to the output file.
119 // dump <file> Dump the current structure to given file.
120 // (The dump format is explained further in
121 // the source file kd_tree.cc.)
122 // load <file> Load a tree from a data file which was
123 // created by the dump operation. Any
124 // existing tree will be destroyed.
125 //
126 // Options:
127 // --------
128 // How these operations are performed depends on a set of options.
129 // If an option is not specified, a default value is used. An option
130 // retains its value until it is set again. String inputs are not
131 // enclosed in quotes, and must contain no embedded white space (sorry,
132 // this is C++'s convention).
133 //
134 // Options affecting search tree structure:
135 // ----------------------------------------
136 // split_rule <type> Type of splitting rule to use in building
137 // the search tree. Choices are:
138 // kd = optimized kd-tree
139 // midpt = midpoint split
140 // fair = fair split
141 // sl_midpt = sliding midpt split
142 // sl_fair = sliding fair split
143 // suggest = authors' choice for best
144 // The default is "suggest". See the file
145 // kd_split.cc for more detailed information.
146 //
147 // shrink_rule <type> Type of shrinking rule to use in building
148 // a bd-tree data structure. If "none" is
149 // given, then no shrinking is performed and
150 // the result is a kd-tree. Choices are:
151 // none = perform no shrinking
152 // simple = simple shrinking
153 // centroid = centroid shrinking
154 // suggest = authors' choice for best
155 // The default is "none". See the file
156 // bd_tree.cc for more information.
157 // bucket_size <int> Bucket size, that is, the maximum number of
158 // points stored in each leaf node.
159 //
160 // Options affecting data and query point generation:
161 // --------------------------------------------------
162 // dim <int> Dimension of space.
163 // seed <int> Seed for random number generation.
164 // data_size <int> Number of data points. When reading data
165 // points from a file, this indicates the
166 // maximum number of points for storage
167 // allocation. Default = 100.
168 // query_size <int> Same as data_size for query points.
169 // std_dev <float> Standard deviation (used in gauss,
170 // planted, and clustered distributions).
171 // This is the "small" distribution for
172 // clus_ellipsoids. Default = 1.
173 // std_dev_lo <float> Low and high standard deviations (used in
174 // std_dev_hi <float> clus_ellipsoids). Default = 1.
175 // corr_coef <float> Correlation coefficient (used in co-gauss
176 // and co_lapace distributions). Default = 0.05.
177 // colors <int> Number of color classes (clusters) (used
178 // in the clustered distributions). Default = 5.
179 // new_clust Once generated, cluster centers are not
180 // normally regenerated. This is so that both
181 // query points and data points can be generated
182 // using the same set of clusters. This option
183 // forces new cluster centers to be generated
184 // with the next generation of either data or
185 // query points.
186 // max_clus_dim <int> Maximum dimension of clusters (used in
187 // clus_orth_flats and clus_ellipsoids).
188 // Default = 1.
189 // distribution <string> Type of input distribution
190 // uniform = uniform over cube [-1,1]^d.
191 // gauss = Gaussian with mean 0
192 // laplace = Laplacian, mean 0 and var 1
193 // co_gauss = correlated Gaussian
194 // co_laplace = correlated Laplacian
195 // clus_gauss = clustered Gaussian
196 // clus_orth_flats = clusters of orth flats
197 // clus_ellipsoids = clusters of ellipsoids
198 // planted = planted distribution
199 // See the file rand.cpp for further information.
200 //
201 // Options affecting nearest neighbor search:
202 // ------------------------------------------
203 // epsilon <float> Error bound for approx. near neigh. search.
204 // near_neigh <int> Number of nearest neighbors to compute.
205 // max_pts_visit <int> Maximum number of points to visit before
206 // terminating. (Used in applications where
207 // real-time performance is important.)
208 // (Default = 0, which means no limit.)
209 // radius_bound <float> Sets an upper bound on the nearest
210 // neighbor search radius. If the bound is
211 // positive, then fixed-radius nearest
212 // neighbor searching is performed, and the
213 // count of the number of points in the
214 // range is returned. If the bound is
215 // zero, then standard search is used.
216 // This can only be used with standard, not
217 // priority, search. (Default = 0, which
218 // means standard search.)
219 //
220 // Options affection general program behavior:
221 // -------------------------------------------
222 // stats <string> Level of statistics output
223 // silent = no output,
224 // exec_time += execution time only
225 // prep_stats += preprocessing statistics
226 // query_stats += query performance stats
227 // query_res += results of queries
228 // show_pts += show the data points
229 // show_struct += print search structure
230 // validate <string> Validate experiment and compute average
231 // error. Since validation causes exact
232 // nearest neighbors to be computed by the
233 // brute force method, this can take a long
234 // time. Valid arguments are:
235 // on = turn validation on
236 // off = turn validation off
237 // true_near_neigh <int> Number of true nearest neighbors to compute.
238 // When validating, we compute the difference
239 // in rank between each reported nearest neighbor
240 // and the true nearest neighbor of the same
241 // rank. Thus it is necessary to compute a
242 // few more true nearest neighbors. By default
243 // we compute 10 more than near_neigh. With
244 // this option the exact number can be set.
245 // (Used only when validating.)
246 //
247 // Example:
248 // --------
249 // output_label test_run_0 # output label for this run
250 // validate off # do not perform validation
251 // dim 16 # points in dimension 16
252 // stats query_stats # output performance statistics for queries
253 // seed 121212 # random number seed
254 // data_size 1000
255 // distribution uniform
256 // gen_data_pts # 1000 uniform data points in dim 16
257 // query_size 100
258 // std_dev 0.05
259 // distribution clus_gauss
260 // gen_query_pts # 100 points in 10 clusters with std_dev 0.05
261 // bucket_size 2
262 // split_rule kd
263 // shrink_rule none
264 // build_ann # kd-tree, bucket size 2
265 // epsilon 0.1
266 // near_neigh 5
267 // max_pts_visit 100 # stop search if more than 100 points seen
268 // run_queries standard # run queries; 5 nearest neighbors, 10% error
269 // data_size 500
270 // read_data_pts data.in # read up to 500 points from file data.in
271 // split_rule sl_midpt
272 // shrink_rule simple
273 // build_ann # bd-tree; simple shrink, sliding midpoint split
274 // epsilon 0
275 // run_queries priority # run same queries; 0 allowable error
276 //
277 //------------------------------------------------------------------------
278 
279 //------------------------------------------------------------------------
280 // Constants
281 //------------------------------------------------------------------------
282 
283 const int STRING_LEN = 500; // max string length
284 const double ERR = 0.00001; // epsilon (for float compares)
285 const double RND_OFF = 5E-16; // double round-off error
286 
287 //------------------------------------------------------------------------
288 // Enumerated values and conversions
289 //------------------------------------------------------------------------
290 
291 typedef enum {DATA, QUERY} PtType; // point types
292 
293 //------------------------------------------------------------------------
294 // Statistics output levels
295 //------------------------------------------------------------------------
296 
297 typedef enum { // stat levels
298  SILENT, // no output
299  EXEC_TIME, // just execution time
300  PREP_STATS, // preprocessing info
301  QUERY_STATS, // query performance
302  QUERY_RES, // query results
303  SHOW_PTS, // show data points
304  SHOW_STRUCT, // show tree structure
305  N_STAT_LEVELS} // number of levels
306  StatLev;
307 
309  "silent", // SILENT
310  "exec_time", // EXEC_TIME
311  "prep_stats", // PREP_STATS
312  "query_stats", // QUERY_STATS
313  "query_res", // QUERY_RES
314  "show_pts", // SHOW_PTS
315  "show_struct"}; // SHOW_STRUCT
316 
317 //------------------------------------------------------------------------
318 // Distributions
319 //------------------------------------------------------------------------
320 
321 typedef enum { // distributions
322  UNIFORM, // uniform over cube [-1,1]^d.
323  GAUSS, // Gaussian with mean 0
324  LAPLACE, // Laplacian, mean 0 and var 1
325  CO_GAUSS, // correlated Gaussian
326  CO_LAPLACE, // correlated Laplacian
327  CLUS_GAUSS, // clustered Gaussian
328  CLUS_ORTH_FLATS, // clustered on orthog flats
329  CLUS_ELLIPSOIDS, // clustered on ellipsoids
330  PLANTED, // planted distribution
332  Distrib;
333 
335  "uniform", // UNIFORM
336  "gauss", // GAUSS
337  "laplace", // LAPLACE
338  "co_gauss", // CO_GAUSS
339  "co_laplace", // CO_LAPLACE
340  "clus_gauss", // CLUS_GAUSS
341  "clus_orth_flats", // CLUS_ORTH_FLATS
342  "clus_ellipsoids", // CLUS_ELLIPSOIS
343  "planted"}; // PLANTED
344 
345 //------------------------------------------------------------------------
346 // Splitting rules for kd-trees (see ANN.h for types)
347 //------------------------------------------------------------------------
348 
349 const int N_SPLIT_RULES = 6;
351  "standard", // standard optimized kd-tree
352  "midpt", // midpoint split
353  "fair", // fair split
354  "sl_midpt", // sliding midpt split
355  "sl_fair", // sliding fair split
356  "suggest"}; // authors' choice for best
357 
358 //------------------------------------------------------------------------
359 // Shrinking rules for bd-trees (see ANN.h for types)
360 //------------------------------------------------------------------------
361 
362 const int N_SHRINK_RULES = 4;
364  "none", // perform no shrinking (kd-tree)
365  "simple", // simple shrinking
366  "centroid", // centroid shrinking
367  "suggest"}; // authors' choice for best
368 
369 //----------------------------------------------------------------------
370 // Short utility functions
371 // Error - general error routine
372 // printPoint - print a point to standard output
373 // lookUp - look up a name in table and return index
374 //----------------------------------------------------------------------
375 
376 void Error( // error routine
377  const char* msg, // error message
378  ANNerr level) // abort afterwards
379 {
380  if (level == ANNabort) {
381  cerr << "ann_test: ERROR------->" << msg << "<-------------ERROR\n";
382  exit(1);
383  }
384  else {
385  cerr << "ann_test: WARNING----->" << msg << "<-------------WARNING\n";
386  }
387 }
388 
389 void printPoint( // print point
390  ANNpoint p, // the point
391  int dim) // the dimension
392 {
393  cout << "[";
394  for (int i = 0; i < dim; i++) {
395  cout << p[i];
396  if (i < dim-1) cout << ",";
397  }
398  cout << "]";
399 }
400 
401 int lookUp( // look up name in table
402  const char* arg, // name to look up
403  const char (*table)[STRING_LEN], // name table
404  int size) // table size
405 {
406  int i;
407  for (i = 0; i < size; i++) {
408  if (!strcmp(arg, table[i])) return i;
409  }
410  return i;
411 }
412 
413 //------------------------------------------------------------------------
414 // Function declarations
415 //------------------------------------------------------------------------
416 
417 void generatePts( // generate data/query points
418  ANNpointArray &pa, // point array (returned)
419  int n, // number of points
420  PtType type, // point type
421  ANNbool new_clust, // new cluster centers desired?
422  ANNpointArray src = NULL, // source array (for PLANTED)
423  int n_src = 0); // source size (for PLANTED)
424 
425 void readPts( // read data/query points from file
426  ANNpointArray &pa, // point array (returned)
427  int &n, // number of points
428  char *file_nm, // file name
429  PtType type); // point type (DATA, QUERY)
430 
431 void doValidation(); // perform validation
432 void getTrueNN(); // compute true nearest neighbors
433 
434 void treeStats( // print statistics on kd- or bd-tree
435  ostream &out, // output stream
436  ANNbool verbose); // print stats
437 
438 //------------------------------------------------------------------------
439 // Default execution parameters
440 //------------------------------------------------------------------------
441 const int extra_nn = 10; // how many extra true nn's?
442 
443 const int def_dim = 2; // def dimension
444 const int def_data_size = 100; // def data size
445 const int def_query_size = 100; // def number of queries
446 const int def_n_color = 5; // def number of colors
447 const ANNbool def_new_clust = ANNfalse; // def new clusters flag
448 const int def_max_dim = 1; // def max flat dimension
449 const Distrib def_distr = UNIFORM; // def distribution
450 const double def_std_dev = 1.00; // def standard deviation
451 const double def_corr_coef = 0.05; // def correlation coef
452 const int def_bucket_size = 1; // def bucket size
453 const double def_epsilon = 0.0; // def error bound
454 const int def_near_neigh = 1; // def number of near neighbors
455 const int def_max_visit = 0; // def number of points visited
456 const int def_rad_bound = 0; // def radius bound
457  // def number of true nn's
459 const int def_seed = 0; // def seed for random numbers
460 const ANNbool def_validate = ANNfalse; // def validation flag
461  // def statistics output level
463 const ANNsplitRule // def splitting rule
465 const ANNshrinkRule // def shrinking rule
467 
468 //------------------------------------------------------------------------
469 // Global variables - Execution options
470 //------------------------------------------------------------------------
471 
472 int dim; // dimension
473 int data_size; // data size
474 int query_size; // number of queries
475 int n_color; // number of colors
476 ANNbool new_clust; // generate new clusters?
477 int max_dim; // maximum flat dimension
478 Distrib distr; // distribution
479 double corr_coef; // correlation coef
480 double std_dev; // standard deviation
481 double std_dev_lo; // low standard deviation
482 double std_dev_hi; // high standard deviation
483 int bucket_size; // bucket size
484 double epsilon; // error bound
485 int near_neigh; // number of near neighbors
486 int max_pts_visit; // max number of points to visit
487 double radius_bound; // maximum radius search bound
488 int true_nn; // number of true nn's
489 ANNbool validate; // validation flag
490 StatLev stats; // statistics output level
491 ANNsplitRule split; // splitting rule
492 ANNshrinkRule shrink; // shrinking rule
493 
494 //------------------------------------------------------------------------
495 // More globals - pointers to dynamically allocated arrays and structures
496 //
497 // It is assumed that all these values are set to NULL when nothing
498 // is allocated.
499 //
500 // data_pts, query_pts The data and query points
501 // the_tree Points to the kd- or bd-tree for
502 // nearest neighbor searching.
503 // apx_nn_idx, apx_dists Record approximate near neighbor
504 // indices and distances
505 // apx_pts_in_range Counts of the number of points in
506 // the in approx range, for fixed-
507 // radius NN searching.
508 // true_nn_idx, true_dists Record true near neighbor
509 // indices and distances
510 // min_pts_in_range, max_... Min and max counts of the number
511 // of points in the in approximate
512 // range.
513 // valid_dirty To avoid repeated validation,
514 // we only validate query results
515 // once. This validation becomes
516 // invalid, if a new tree, new data
517 // points or new query points have
518 // been generated.
519 // tree_data_size The number of points in the
520 // current tree. (This will be the
521 // same a data_size unless points have
522 // been added since the tree was
523 // built.)
524 //
525 // The approximate and true nearest neighbor results are stored
526 // in: apx_nn_idx, apx_dists, and true_nn_idx, true_dists.
527 // They are really flattened 2-dimensional arrays. Each of these
528 // arrays consists of query_size blocks, each of which contains
529 // near_neigh (or true_nn) entries, one for each of the nearest
530 // neighbors for a given query point.
531 //------------------------------------------------------------------------
532 
533 ANNpointArray data_pts; // data points
534 ANNpointArray query_pts; // query points
535 ANNbd_tree* the_tree; // kd- or bd-tree search structure
536 ANNidxArray apx_nn_idx; // storage for near neighbor indices
537 ANNdistArray apx_dists; // storage for near neighbor distances
538 int* apx_pts_in_range; // storage for no. of points in range
539 ANNidxArray true_nn_idx; // true near neighbor indices
540 ANNdistArray true_dists; // true near neighbor distances
541 int* min_pts_in_range; // min points in approx range
542 int* max_pts_in_range; // max points in approx range
543 
544 ANNbool valid_dirty; // validation is no longer valid
545 
546 //------------------------------------------------------------------------
547 // Initialize global parameters
548 //------------------------------------------------------------------------
549 
551 {
552  dim = def_dim; // init execution parameters
555  distr = def_distr;
570  stats = def_stats;
571  split = def_split;
572  shrink = def_shrink;
573  annIdum = -def_seed; // init. global seed for ran0()
574 
575  data_pts = NULL; // initialize storage pointers
576  query_pts = NULL;
577  the_tree = NULL;
578  apx_nn_idx = NULL;
579  apx_dists = NULL;
580  apx_pts_in_range = NULL;
581  true_nn_idx = NULL;
582  true_dists = NULL;
583  min_pts_in_range = NULL;
584  max_pts_in_range = NULL;
585 
586  valid_dirty = ANNtrue; // (validation must be done)
587 }
588 
589 //------------------------------------------------------------------------
590 // getDirective - skip comments and read next directive
591 // Returns ANNtrue if directive read, and ANNfalse if eof seen.
592 //------------------------------------------------------------------------
593 
594 ANNbool skipComment( // skip any comments
595  istream &in) // input stream
596 {
597  char ch = 0;
598  // skip whitespace
599  do { in.get(ch); } while (isspace(ch) && !in.eof());
600  while (ch == '#' && !in.eof()) { // comment?
601  // skip to end of line
602  do { in.get(ch); } while(ch != '\n' && !in.eof());
603  // skip whitespace
604  do { in.get(ch); } while(isspace(ch) && !in.eof());
605  }
606  if (in.eof()) return ANNfalse; // end of file
607  in.putback(ch); // put character back
608  return ANNtrue;
609 }
610 
612  istream &in, // input stream
613  char *directive) // directive storage
614 {
615  if (!skipComment(in)) // skip comments
616  return ANNfalse; // found eof along the way?
617  in >> directive; // read directive
618  return ANNtrue;
619 }
620 
621 
622 //------------------------------------------------------------------------
623 // main program - driver
624 // The main program reads input options, invokes the necessary
625 // routines to process them.
626 //------------------------------------------------------------------------
627 
628 int main(int argc, char** argv)
629 {
630  long clock0; // clock time
631  char directive[STRING_LEN]; // input directive
632  char arg[STRING_LEN]; // all-purpose argument
633 
634  cout << "------------------------------------------------------------\n"
635  << "ann_test: Version " << ANNversion << " " << ANNversionCmt << "\n"
636  << " Copyright: " << ANNcopyright << ".\n"
637  << " Latest Revision: " << ANNlatestRev << ".\n"
638  << "------------------------------------------------------------\n\n";
639 
640  initGlobals(); // initialize global values
641 
642  //--------------------------------------------------------------------
643  // Main input loop
644  //--------------------------------------------------------------------
645  // read input directive
646  while (getDirective(cin, directive)) {
647  //----------------------------------------------------------------
648  // Read options
649  //----------------------------------------------------------------
650  if (!strcmp(directive,"dim")) {
651  cin >> dim;
652  }
653  else if (!strcmp(directive,"colors")) {
654  cin >> n_color;
655  }
656  else if (!strcmp(directive,"new_clust")) {
657  new_clust = ANNtrue;
658  }
659  else if (!strcmp(directive,"max_clus_dim")) {
660  cin >> max_dim;
661  }
662  else if (!strcmp(directive,"std_dev")) {
663  cin >> std_dev;
664  }
665  else if (!strcmp(directive,"std_dev_lo")) {
666  cin >> std_dev_lo;
667  }
668  else if (!strcmp(directive,"std_dev_hi")) {
669  cin >> std_dev_hi;
670  }
671  else if (!strcmp(directive,"corr_coef")) {
672  cin >> corr_coef;
673  }
674  else if (!strcmp(directive, "data_size")) {
675  cin >> data_size;
676  }
677  else if (!strcmp(directive,"query_size")) {
678  cin >> query_size;
679  }
680  else if (!strcmp(directive,"bucket_size")) {
681  cin >> bucket_size;
682  }
683  else if (!strcmp(directive,"epsilon")) {
684  cin >> epsilon;
685  }
686  else if (!strcmp(directive,"max_pts_visit")) {
687  cin >> max_pts_visit;
688  valid_dirty = ANNtrue; // validation must be redone
689  }
690  else if (!strcmp(directive,"radius_bound")) {
691  cin >> radius_bound;
692  valid_dirty = ANNtrue; // validation must be redone
693  }
694  else if (!strcmp(directive,"near_neigh")) {
695  cin >> near_neigh;
696  true_nn = near_neigh + extra_nn; // also reset true near neighs
697  valid_dirty = ANNtrue; // validation must be redone
698  }
699  else if (!strcmp(directive,"true_near_neigh")) {
700  cin >> true_nn;
701  valid_dirty = ANNtrue; // validation must be redone
702  }
703  //----------------------------------------------------------------
704  // seed option
705  // The seed is reset by setting the global annIdum to the
706  // negation of the seed value. See rand.cpp.
707  //----------------------------------------------------------------
708  else if (!strcmp(directive,"seed")) {
709  cin >> annIdum;
710  annIdum = -annIdum;
711  }
712  //----------------------------------------------------------------
713  // validate option
714  //----------------------------------------------------------------
715  else if (!strcmp(directive,"validate")) {
716  cin >> arg; // input argument
717  if (!strcmp(arg, "on")) {
718  validate = ANNtrue;
719  cout << "validate = on "
720  << "(Warning: this may slow execution time.)\n";
721  }
722  else if (!strcmp(arg, "off")) {
723  validate = ANNfalse;
724  }
725  else {
726  cerr << "Argument: " << arg << "\n";
727  Error("validate argument must be \"on\" or \"off\"", ANNabort);
728  }
729  }
730  //----------------------------------------------------------------
731  // distribution option
732  //----------------------------------------------------------------
733  else if (!strcmp(directive,"distribution")) {
734  cin >> arg; // input name and translate
736  if (distr >= N_DISTRIBS) { // not something we recognize
737  cerr << "Distribution: " << arg << "\n";
738  Error("Unknown distribution", ANNabort);
739  }
740  }
741  //----------------------------------------------------------------
742  // stats option
743  //----------------------------------------------------------------
744  else if (!strcmp(directive,"stats")) {
745  cin >> arg; // input name and translate
747  if (stats >= N_STAT_LEVELS) { // not something we recognize
748  cerr << "Stats level: " << arg << "\n";
749  Error("Unknown statistics level", ANNabort);
750  }
751  if (stats > SILENT)
752  cout << "stats = " << arg << "\n";
753  }
754  //----------------------------------------------------------------
755  // split_rule option
756  //----------------------------------------------------------------
757  else if (!strcmp(directive,"split_rule")) {
758  cin >> arg; // input split_rule name
760  if (split >= N_SPLIT_RULES) { // not something we recognize
761  cerr << "Splitting rule: " << arg << "\n";
762  Error("Unknown splitting rule", ANNabort);
763  }
764  }
765  //----------------------------------------------------------------
766  // shrink_rule option
767  //----------------------------------------------------------------
768  else if (!strcmp(directive,"shrink_rule")) {
769  cin >> arg; // input split_rule name
771  if (shrink >= N_SHRINK_RULES) { // not something we recognize
772  cerr << "Shrinking rule: " << arg << "\n";
773  Error("Unknown shrinking rule", ANNabort);
774  }
775  }
776  //----------------------------------------------------------------
777  // label operation
778  //----------------------------------------------------------------
779  else if (!strcmp(directive,"output_label")) {
780  cin >> arg;
781  if (stats > SILENT)
782  cout << "<" << arg << ">\n";
783  }
784  //----------------------------------------------------------------
785  // gen_data_pts operation
786  //----------------------------------------------------------------
787  else if (!strcmp(directive,"gen_data_pts")) {
788  if (distr == PLANTED) { // planted distribution
789  Error("Cannot use planted distribution for data points", ANNabort);
790  }
791  generatePts( // generate data points
792  data_pts, // data points
793  data_size, // data size
794  DATA, // data points
795  new_clust); // new clusters flag
796  valid_dirty = ANNtrue; // validation must be redone
797  new_clust = ANNfalse; // reset flag
798  }
799  //----------------------------------------------------------------
800  // gen_query_pts operation
801  // If the distribution is PLANTED, then the query points
802  // are planted near the data points (which must already be
803  // generated).
804  //----------------------------------------------------------------
805  else if (!strcmp(directive,"gen_query_pts")) {
806  if (distr == PLANTED) { // planted distribution
807  if (data_pts == NULL) {
808  Error("Must generate data points before query points for planted distribution", ANNabort);
809  }
810  generatePts( // generate query points
811  query_pts, // point array
812  query_size, // number of query points
813  QUERY, // query points
814  new_clust, // new clusters flag
815  data_pts, // plant around data pts
816  data_size);
817  }
818  else { // all other distributions
819  generatePts( // generate query points
820  query_pts, // point array
821  query_size, // number of query points
822  QUERY, // query points
823  new_clust); // new clusters flag
824  }
825  valid_dirty = ANNtrue; // validation must be redone
826  new_clust = ANNfalse; // reset flag
827  }
828  //----------------------------------------------------------------
829  // read_data_pts operation
830  //----------------------------------------------------------------
831  else if (!strcmp(directive,"read_data_pts")) {
832  cin >> arg; // input file name
833  readPts(
834  data_pts, // point array
835  data_size, // number of points
836  arg, // file name
837  DATA); // data points
838  valid_dirty = ANNtrue; // validation must be redone
839  }
840  //----------------------------------------------------------------
841  // read_query_pts operation
842  //----------------------------------------------------------------
843  else if (!strcmp(directive,"read_query_pts")) {
844  cin >> arg; // input file name
845  readPts(
846  query_pts, // point array
847  query_size, // number of points
848  arg, // file name
849  QUERY); // query points
850  valid_dirty = ANNtrue; // validation must be redone
851  }
852  //----------------------------------------------------------------
853  // build_ann operation
854  // We always invoke the constructor for bd-trees. Note
855  // that when the shrinking rule is NONE (which is true by
856  // default), then this constructs a kd-tree.
857  //----------------------------------------------------------------
858  else if (!strcmp(directive,"build_ann")) {
859  //------------------------------------------------------------
860  // Build the tree
861  //------------------------------------------------------------
862  if (the_tree != NULL) { // tree exists already
863  delete the_tree; // get rid of it
864  }
865  clock0 = clock(); // start time
866 
867  the_tree = new ANNbd_tree( // build it
868  data_pts, // the data points
869  data_size, // number of points
870  dim, // dimension of space
871  bucket_size, // maximum bucket size
872  split, // splitting rule
873  shrink); // shrinking rule
874 
875  //------------------------------------------------------------
876  // Print summary
877  //------------------------------------------------------------
878  long prep_time = clock() - clock0; // end of prep time
879 
880  if (stats > SILENT) {
881  cout << "[Build ann-structure:\n";
882  cout << " split_rule = " << split_table[split] << "\n";
883  cout << " shrink_rule = " << shrink_table[shrink] << "\n";
884  cout << " data_size = " << data_size << "\n";
885  cout << " dim = " << dim << "\n";
886  cout << " bucket_size = " << bucket_size << "\n";
887 
888  if (stats >= EXEC_TIME) { // output processing time
889  cout << " process_time = "
890  << double(prep_time)/CLOCKS_PER_SEC << " sec\n";
891  }
892 
893  if (stats >= PREP_STATS) // output or check tree stats
894  treeStats(cout, ANNtrue); // print tree stats
895  else
896  treeStats(cout, ANNfalse); // check stats
897 
898  if (stats >= SHOW_STRUCT) { // print the whole tree
899  cout << " (Structure Contents:\n";
900  the_tree->Print(ANNfalse, cout);
901  cout << " )\n";
902  }
903  cout << "]\n";
904  }
905  }
906  //----------------------------------------------------------------
907  // dump operation
908  //----------------------------------------------------------------
909  else if (!strcmp(directive,"dump")) {
910  cin >> arg; // input file name
911  if (the_tree == NULL) { // no tree
912  Error("Cannot dump. No tree has been built yet", ANNwarn);
913  }
914  else { // there is a tree
915  // try to open file
916  ofstream out_dump_file(arg);
917  if (!out_dump_file) {
918  cerr << "File name: " << arg << "\n";
919  Error("Cannot open dump file", ANNabort);
920  }
921  // dump the tree and points
922  the_tree->Dump(ANNtrue, out_dump_file);
923  if (stats > SILENT) {
924  cout << "(Tree has been dumped to file " << arg << ")\n";
925  }
926  }
927  }
928  //----------------------------------------------------------------
929  // load operation
930  // Since this not only loads a tree, but loads a new set
931  // of data points.
932  //----------------------------------------------------------------
933  else if (!strcmp(directive,"load")) {
934  cin >> arg; // input file name
935  if (the_tree != NULL) { // tree exists already
936  delete the_tree; // get rid of it
937  }
938  if (data_pts != NULL) { // data points exist already
939  delete data_pts; // get rid of them
940  }
941 
942  ifstream in_dump_file(arg); // try to open file
943  if (!in_dump_file) {
944  cerr << "File name: " << arg << "\n";
945  Error("Cannot open file for loading", ANNabort);
946  }
947  // build tree by loading
948  the_tree = new ANNbd_tree(in_dump_file);
949 
950  dim = the_tree->theDim(); // new dimension
951  data_size = the_tree->nPoints(); // number of points
952  data_pts = the_tree->thePoints(); // new points
953 
954  valid_dirty = ANNtrue; // validation must be redone
955 
956  if (stats > SILENT) {
957  cout << "(Tree has been loaded from file " << arg << ")\n";
958  }
959  if (stats >= SHOW_STRUCT) { // print the tree
960  cout << " (Structure Contents:\n";
961  the_tree->Print(ANNfalse, cout);
962  cout << " )\n";
963  }
964  }
965  //----------------------------------------------------------------
966  // run_queries operation
967  // This section does all the query processing. It consists
968  // of the following subsections:
969  //
970  // ** input the argument (standard or priority) and output
971  // the header describing the essential information.
972  // ** allocate space for the results to be stored.
973  // ** run the queries by invoking the appropriate search
974  // procedure on the query points. Print nearest neighbor
975  // if requested.
976  // ** print final summaries
977  //
978  // The approach for processing multiple nearest neighbors is
979  // pretty crude. We allocate an array whose size is the
980  // product of the total number of queries times the number of
981  // nearest neighbors (k), and then use each k consecutive
982  // entries to store the results of each query.
983  //----------------------------------------------------------------
984  else if (!strcmp(directive,"run_queries")) {
985 
986  //------------------------------------------------------------
987  // Input arguments and print summary
988  //------------------------------------------------------------
989  enum {STANDARD, PRIORITY} method;
990 
991  cin >> arg; // input argument
992  if (!strcmp(arg, "standard")) {
993  method = STANDARD;
994  }
995  else if (!strcmp(arg, "priority")) {
996  method = PRIORITY;
997  }
998  else {
999  cerr << "Search type: " << arg << "\n";
1000  Error("Search type must be \"standard\" or \"priority\"",
1001  ANNabort);
1002  }
1003  if (data_pts == NULL || query_pts == NULL) {
1004  Error("Either data set and query set not constructed", ANNabort);
1005  }
1006  if (the_tree == NULL) {
1007  Error("No search tree built.", ANNabort);
1008  }
1009 
1010  //------------------------------------------------------------
1011  // Set up everything
1012  //------------------------------------------------------------
1013 
1014  #ifdef ANN_PERF // performance only
1015  annResetStats(data_size); // reset statistics
1016  #endif
1017 
1018  clock0 = clock(); // start time
1019  // deallocate existing storage
1020  if (apx_nn_idx != NULL) delete [] apx_nn_idx;
1021  if (apx_dists != NULL) delete [] apx_dists;
1022  if (apx_pts_in_range != NULL) delete [] apx_pts_in_range;
1023  // allocate apx answer storage
1026  apx_pts_in_range = new int[query_size];
1027 
1028  annMaxPtsVisit(max_pts_visit); // set max points to visit
1029 
1030  //------------------------------------------------------------
1031  // Run the queries
1032  //------------------------------------------------------------
1033  // pointers for current query
1034  ANNidxArray curr_nn_idx = apx_nn_idx;
1035  ANNdistArray curr_dists = apx_dists;
1036 
1037  for (int i = 0; i < query_size; i++) {
1038  #ifdef ANN_PERF
1039  annResetCounts(); // reset counters
1040  #endif
1041  apx_pts_in_range[i] = 0;
1042 
1043  if (radius_bound == 0) { // no radius bound
1044  if (method == STANDARD) {
1046  query_pts[i], // query point
1047  near_neigh, // number of near neighbors
1048  curr_nn_idx, // nearest neighbors (returned)
1049  curr_dists, // distance (returned)
1050  epsilon); // error bound
1051  }
1052  else if (method == PRIORITY) {
1054  query_pts[i], // query point
1055  near_neigh, // number of near neighbors
1056  curr_nn_idx, // nearest neighbors (returned)
1057  curr_dists, // distance (returned)
1058  epsilon); // error bound
1059  }
1060  else {
1061  Error("Internal error - invalid method", ANNabort);
1062  }
1063  }
1064  else { // use radius bound
1065  if (method != STANDARD) {
1066  Error("A nonzero radius bound assumes standard search",
1067  ANNwarn);
1068  }
1070  query_pts[i], // query point
1071  ANN_POW(radius_bound), // squared radius search bound
1072  near_neigh, // number of near neighbors
1073  curr_nn_idx, // nearest neighbors (returned)
1074  curr_dists, // distance (returned)
1075  epsilon); // error bound
1076  }
1077  curr_nn_idx += near_neigh; // increment current pointers
1078  curr_dists += near_neigh;
1079 
1080  #ifdef ANN_PERF
1081  annUpdateStats(); // update stats
1082  #endif
1083  }
1084 
1085  long query_time = clock() - clock0; // end of query time
1086 
1087  if (validate) { // validation requested
1088  if (valid_dirty) getTrueNN(); // get true near neighbors
1089  doValidation(); // validate
1090  }
1091 
1092  //------------------------------------------------------------
1093  // Print summaries
1094  //------------------------------------------------------------
1095 
1096  if (stats > SILENT) {
1097  cout << "[Run Queries:\n";
1098  cout << " query_size = " << query_size << "\n";
1099  cout << " dim = " << dim << "\n";
1100  cout << " search_method = " << arg << "\n";
1101  cout << " epsilon = " << epsilon << "\n";
1102  cout << " near_neigh = " << near_neigh << "\n";
1103  if (max_pts_visit != 0)
1104  cout << " max_pts_visit = " << max_pts_visit << "\n";
1105  if (radius_bound != 0)
1106  cout << " radius_bound = " << radius_bound << "\n";
1107  if (validate)
1108  cout << " true_nn = " << true_nn << "\n";
1109 
1110  if (stats >= EXEC_TIME) { // print exec time summary
1111  cout << " query_time = " <<
1112  double(query_time)/(query_size*CLOCKS_PER_SEC)
1113  << " sec/query";
1114  #ifdef ANN_PERF
1115  cout << " (biased by perf measurements)";
1116  #endif
1117  cout << "\n";
1118  }
1119 
1120  if (stats >= QUERY_STATS) { // output performance stats
1121  #ifdef ANN_PERF
1122  cout.flush();
1124  #else
1125  cout << " (Performance statistics unavailable.)\n";
1126  #endif
1127  }
1128 
1129  if (stats >= QUERY_RES) { // output results
1130  cout << " (Query Results:\n";
1131  cout << " Pt\tANN\tDist\n";
1132  curr_nn_idx = apx_nn_idx; // subarray pointers
1133  curr_dists = apx_dists;
1134  // output nearest neighbors
1135  for (int i = 0; i < query_size; i++) {
1136  cout << " " << setw(4) << i;
1137  for (int j = 0; j < near_neigh; j++) {
1138  // exit if no more neighbors
1139  if (curr_nn_idx[j] == ANN_NULL_IDX) {
1140  cout << "\t[no other pts in radius bound]\n";
1141  break;
1142  }
1143  else { // output point info
1144  cout << "\t" << curr_nn_idx[j]
1145  << "\t" << ANN_ROOT(curr_dists[j])
1146  << "\n";
1147  }
1148  }
1149  // output range count
1150  if (radius_bound != 0) {
1151  cout << " pts_in_radius_bound = "
1152  << apx_pts_in_range[i] << "\n";
1153  }
1154  // increment subarray pointers
1155  curr_nn_idx += near_neigh;
1156  curr_dists += near_neigh;
1157  }
1158  cout << " )\n";
1159  }
1160  cout << "]\n";
1161  }
1162  }
1163  //----------------------------------------------------------------
1164  // Unknown directive
1165  //----------------------------------------------------------------
1166  else {
1167  cerr << "Directive: " << directive << "\n";
1168  Error("Unknown directive", ANNabort);
1169  }
1170  }
1171  //--------------------------------------------------------------------
1172  // End of input loop (deallocate stuff that was allocated)
1173  //--------------------------------------------------------------------
1174  if (the_tree != NULL) delete the_tree;
1175  if (data_pts != NULL) annDeallocPts(data_pts);
1176  if (query_pts != NULL) annDeallocPts(query_pts);
1177  if (apx_nn_idx != NULL) delete [] apx_nn_idx;
1178  if (apx_dists != NULL) delete [] apx_dists;
1179  if (apx_pts_in_range != NULL) delete [] apx_pts_in_range;
1180 
1181  annClose(); // close ANN
1182 
1183  return EXIT_SUCCESS;
1184 }
1185 
1186 //------------------------------------------------------------------------
1187 // generatePts - call appropriate routine to generate points of a
1188 // given distribution.
1189 //------------------------------------------------------------------------
1190 
1192  ANNpointArray &pa, // point array (returned)
1193  int n, // number of points to generate
1194  PtType type, // point type
1195  ANNbool new_clust, // new cluster centers desired?
1196  ANNpointArray src, // source array (if distr=PLANTED)
1197  int n_src) // source size (if distr=PLANTED)
1198 {
1199  if (pa != NULL) annDeallocPts(pa); // get rid of any old points
1200  pa = annAllocPts(n, dim); // allocate point storage
1201 
1202  switch (distr) {
1203  case UNIFORM: // uniform over cube [-1,1]^d.
1204  annUniformPts(pa, n, dim);
1205  break;
1206  case GAUSS: // Gaussian with mean 0
1207  annGaussPts(pa, n, dim, std_dev);
1208  break;
1209  case LAPLACE: // Laplacian, mean 0 and var 1
1210  annLaplacePts(pa, n, dim);
1211  break;
1212  case CO_GAUSS: // correlated Gaussian
1213  annCoGaussPts(pa, n, dim, corr_coef);
1214  break;
1215  case CO_LAPLACE: // correlated Laplacian
1216  annCoLaplacePts(pa, n, dim, corr_coef);
1217  break;
1218  case CLUS_GAUSS: // clustered Gaussian
1219  annClusGaussPts(pa, n, dim, n_color, new_clust, std_dev);
1220  break;
1221  case CLUS_ORTH_FLATS: // clustered on orthog flats
1222  annClusOrthFlats(pa, n, dim, n_color, new_clust, std_dev, max_dim);
1223  break;
1224  case CLUS_ELLIPSOIDS: // clustered ellipsoids
1225  annClusEllipsoids(pa, n, dim, n_color, new_clust, std_dev,
1227  break;
1228  case PLANTED: // planted distribution
1229  annPlanted(pa, n, dim, src, n_src, std_dev);
1230  break;
1231  default:
1232  Error("INTERNAL ERROR: Unknown distribution", ANNabort);
1233  break;
1234  }
1235 
1236  if (stats > SILENT) {
1237  if(type == DATA) cout << "[Generating Data Points:\n";
1238  else cout << "[Generating Query Points:\n";
1239  cout << " number = " << n << "\n";
1240  cout << " dim = " << dim << "\n";
1241  cout << " distribution = " << distr_table[distr] << "\n";
1242  if (annIdum < 0)
1243  cout << " seed = " << annIdum << "\n";
1244  if (distr == GAUSS || distr == CLUS_GAUSS
1245  || distr == CLUS_ORTH_FLATS)
1246  cout << " std_dev = " << std_dev << "\n";
1247  if (distr == CLUS_ELLIPSOIDS) {
1248  cout << " std_dev = " << std_dev << " (small) \n";
1249  cout << " std_dev_lo = " << std_dev_lo << "\n";
1250  cout << " std_dev_hi = " << std_dev_hi << "\n";
1251  }
1252  if (distr == CO_GAUSS || distr == CO_LAPLACE)
1253  cout << " corr_coef = " << corr_coef << "\n";
1254  if (distr == CLUS_GAUSS || distr == CLUS_ORTH_FLATS
1255  || distr == CLUS_ELLIPSOIDS) {
1256  cout << " colors = " << n_color << "\n";
1257  if (new_clust)
1258  cout << " (cluster centers regenerated)\n";
1259  }
1260  if (distr == CLUS_ORTH_FLATS || distr == CLUS_ELLIPSOIDS) {
1261  cout << " max_dim = " << max_dim << "\n";
1262  }
1263  }
1264  // want to see points?
1265  if ((type == DATA && stats >= SHOW_PTS) ||
1266  (type == QUERY && stats >= QUERY_RES)) {
1267  if(type == DATA) cout << "(Data Points:\n";
1268  else cout << "(Query Points:\n";
1269  for (int i = 0; i < n; i++) {
1270  cout << " " << setw(4) << i << "\t";
1271  printPoint(pa[i], dim);
1272  cout << "\n";
1273  }
1274  cout << " )\n";
1275  }
1276  cout << "]\n";
1277 }
1278 
1279 //------------------------------------------------------------------------
1280 // readPts - read a collection of data or query points.
1281 //------------------------------------------------------------------------
1282 
1283 void readPts(
1284  ANNpointArray &pa, // point array (returned)
1285  int &n, // number of points
1286  char *file_nm, // file name
1287  PtType type) // point type (DATA, QUERY)
1288 {
1289  int i;
1290  //--------------------------------------------------------------------
1291  // Open input file and read points
1292  //--------------------------------------------------------------------
1293  ifstream in_file(file_nm); // try to open data file
1294  if (!in_file) {
1295  cerr << "File name: " << file_nm << "\n";
1296  Error("Cannot open input data/query file", ANNabort);
1297  }
1298  // allocate storage for points
1299  if (pa != NULL) annDeallocPts(pa); // get rid of old points
1300  pa = annAllocPts(n, dim);
1301 
1302  for (i = 0; i < n; i++) { // read the data
1303  if (!(in_file >> pa[i][0])) break;
1304  for (int d = 1; d < dim; d++) {
1305  in_file >> pa[i][d];
1306  }
1307  }
1308 
1309  char ignore_me; // character for EOF test
1310  in_file >> ignore_me; // try to get one more character
1311  if (!in_file.eof()) { // exhausted space before eof
1312  if (type == DATA)
1313  Error("`data_size' too small. Input file truncated.", ANNwarn);
1314  else
1315  Error("`query_size' too small. Input file truncated.", ANNwarn);
1316  }
1317  n = i; // number of points read
1318 
1319  //--------------------------------------------------------------------
1320  // Print summary
1321  //--------------------------------------------------------------------
1322  if (stats > SILENT) {
1323  if (type == DATA) {
1324  cout << "[Read Data Points:\n";
1325  cout << " data_size = " << n << "\n";
1326  }
1327  else {
1328  cout << "[Read Query Points:\n";
1329  cout << " query_size = " << n << "\n";
1330  }
1331  cout << " file_name = " << file_nm << "\n";
1332  cout << " dim = " << dim << "\n";
1333  // print if results requested
1334  if ((type == DATA && stats >= SHOW_PTS) ||
1335  (type == QUERY && stats >= QUERY_RES)) {
1336  cout << " (Points:\n";
1337  for (i = 0; i < n; i++) {
1338  cout << " " << i << "\t";
1339  printPoint(pa[i], dim);
1340  cout << "\n";
1341  }
1342  cout << " )\n";
1343  }
1344  cout << "]\n";
1345  }
1346 }
1347 
1348 //------------------------------------------------------------------------
1349 // getTrueNN
1350 // Computes the true nearest neighbors. For purposes of validation,
1351 // this intentionally done in a rather dumb (but safe way), by
1352 // invoking the brute-force search.
1353 //
1354 // The number of true nearest neighbors is somewhat larger than
1355 // the number of nearest neighbors. This is so that the validation
1356 // can determine the expected difference in element ranks.
1357 //
1358 // This procedure is invoked just prior to running queries. Since
1359 // the operation takes a long time, it is performed only if needed.
1360 // In particular, once generated, it will be regenerated only if
1361 // new query or data points are generated, or if the requested number
1362 // of true near neighbors or approximate near neighbors has changed.
1363 //
1364 // To validate fixed-radius searching, we compute two counts, one
1365 // with the original query radius (trueSqRadius) and the other with
1366 // a radius shrunken by the error factor (minSqradius). We then
1367 // check that the count of points inside the approximate range is
1368 // between these two bounds. Because fixed-radius search is
1369 // allowed to ignore points within the shrunken radius, we only
1370 // compute exact neighbors within this smaller distance (for we
1371 // cannot guarantee that we will even visit the other points).
1372 //------------------------------------------------------------------------
1373 
1374 void getTrueNN() // compute true nearest neighbors
1375 {
1376  if (stats > SILENT) {
1377  cout << "(Computing true nearest neighbors for validation. This may take time.)\n";
1378  }
1379  // deallocate existing storage
1380  if (true_nn_idx != NULL) delete [] true_nn_idx;
1381  if (true_dists != NULL) delete [] true_dists;
1382  if (min_pts_in_range != NULL) delete [] min_pts_in_range;
1383  if (max_pts_in_range != NULL) delete [] max_pts_in_range;
1384 
1385  if (true_nn > data_size) { // can't get more nn than points
1386  true_nn = data_size;
1387  }
1388 
1389  // allocate true answer storage
1392  min_pts_in_range = new int[query_size];
1393  max_pts_in_range = new int[query_size];
1394 
1395  ANNidxArray curr_nn_idx = true_nn_idx; // current locations in arrays
1396  ANNdistArray curr_dists = true_dists;
1397 
1398  // allocate search structure
1399  ANNbruteForce *the_brute = new ANNbruteForce(data_pts, data_size, dim);
1400  // compute nearest neighbors
1401  for (int i = 0; i < query_size; i++) {
1402  if (radius_bound == 0) { // standard kNN search
1403  the_brute->annkSearch( // compute true near neighbors
1404  query_pts[i], // query point
1405  true_nn, // number of nearest neighbors
1406  curr_nn_idx, // where to put indices
1407  curr_dists); // where to put distances
1408  }
1409  else { // fixed radius kNN search
1410  // search radii limits
1411  ANNdist trueSqRadius = ANN_POW(radius_bound);
1412  ANNdist minSqRadius = ANN_POW(radius_bound / (1+epsilon));
1413  min_pts_in_range[i] = the_brute->annkFRSearch(
1414  query_pts[i], // query point
1415  minSqRadius, // shrunken search radius
1416  true_nn, // number of near neighbors
1417  curr_nn_idx, // nearest neighbors (returned)
1418  curr_dists); // distance (returned)
1419  max_pts_in_range[i] = the_brute->annkFRSearch(
1420  query_pts[i], // query point
1421  trueSqRadius, // true search radius
1422  0, NULL, NULL); // (ignore kNN info)
1423  }
1424  curr_nn_idx += true_nn; // increment nn index pointer
1425  curr_dists += true_nn; // increment nn dist pointer
1426  }
1427  delete the_brute; // delete brute-force struct
1428  valid_dirty = ANNfalse; // validation good for now
1429 }
1430 
1431 //------------------------------------------------------------------------
1432 // doValidation
1433 // Compares the approximate answers to the k-nearest neighbors
1434 // against the true nearest neighbors (computed earlier). It is
1435 // assumed that the true nearest neighbors and indices have been
1436 // computed earlier.
1437 //
1438 // First, we check that all the results are within their allowed
1439 // limits, and generate an internal error, if not. For the sake of
1440 // performance evaluation, we also compute the following two
1441 // quantities for nearest neighbors:
1442 //
1443 // Average Error
1444 // -------------
1445 // The relative error between the distance to a reported nearest
1446 // neighbor and the true nearest neighbor (of the same rank),
1447 //
1448 // Rank Error
1449 // ----------
1450 // The difference in rank between the reported nearest neighbor and
1451 // its position (if any) among the true nearest neighbors. If we
1452 // cannot find this point among the true nearest neighbors, then
1453 // it assumed that the rank of the true nearest neighbor is true_nn+1.
1454 //
1455 // Because of the possibility of duplicate distances, this is computed
1456 // as follows. For the j-th reported nearest neighbor, we count the
1457 // number of true nearest neighbors that are at least this close. Let
1458 // this be rnk. Then the rank error is max(0, j-rnk). (In the code
1459 // below, j is an array index and so the first item is 0, not 1. Thus
1460 // we take max(0, j+1-rnk) instead.)
1461 //
1462 // For the results of fixed-radious range count, we verify that the
1463 // reported number of points in the range lies between the actual
1464 // number of points in the shrunken and the true search radius.
1465 //------------------------------------------------------------------------
1466 
1467 void doValidation() // perform validation
1468 {
1469  int* curr_apx_idx = apx_nn_idx; // approx index pointer
1470  ANNdistArray curr_apx_dst = apx_dists; // approx distance pointer
1471  int* curr_tru_idx = true_nn_idx; // true index pointer
1472  ANNdistArray curr_tru_dst = true_dists; // true distance pointer
1473  int i, j;
1474 
1475  if (true_nn < near_neigh) {
1476  Error("Cannot validate with fewer true near neighbors than actual", ANNabort);
1477  }
1478 
1479  for (i = 0; i < query_size; i++) { // validate each query
1480  //----------------------------------------------------------------
1481  // Compute result errors
1482  // In fixed radius search it is possible that not all k
1483  // nearest neighbors were computed. Because the true
1484  // results are computed over the shrunken radius, we should
1485  // have at least as many true nearest neighbors as
1486  // approximate nearest neighbors. (If not, an infinite
1487  // error will be generated, and so an internal error will
1488  // will be generated.
1489  //
1490  // Because nearest neighbors are sorted in increasing order
1491  // of distance, as soon as we see a null index, we can
1492  // terminate the distance checking. The error in the
1493  // result should not exceed epsilon. However, if
1494  // max_pts_visit is nonzero (meaning that the search is
1495  // terminated early) this might happen.
1496  //----------------------------------------------------------------
1497  for (j = 0; j < near_neigh; j++) {
1498  if (curr_tru_idx[j] == ANN_NULL_IDX)// no more true neighbors?
1499  break;
1500  // true i-th smallest distance
1501  double true_dist = ANN_ROOT(curr_tru_dst[j]);
1502  // reported i-th smallest
1503  double rept_dist = ANN_ROOT(curr_apx_dst[j]);
1504  // better than optimum?
1505  if (rept_dist < true_dist*(1-ERR)) {
1506  Error("INTERNAL ERROR: True nearest neighbor incorrect",
1507  ANNabort);
1508  }
1509 
1510  double resultErr; // result error
1511  if (true_dist == 0.0) { // let's not divide by zero
1512  if (rept_dist != 0.0) resultErr = ANN_DBL_MAX;
1513  else resultErr = 0.0;
1514  }
1515  else {
1516  resultErr = (rept_dist - true_dist) / ((double) true_dist);
1517  }
1518 
1519  if (resultErr > epsilon + RND_OFF && max_pts_visit == 0) {
1520  Error("INTERNAL ERROR: Actual error exceeds epsilon",
1521  ANNabort);
1522  }
1523  #ifdef ANN_PERF
1524  ann_average_err += resultErr; // update statistics error
1525  #endif
1526  }
1527  //--------------------------------------------------------------------
1528  // Compute rank errors (only needed for perf measurements)
1529  //--------------------------------------------------------------------
1530  #ifdef ANN_PERF
1531  for (j = 0; j < near_neigh; j++) {
1532  if (curr_tru_idx[i] == ANN_NULL_IDX) // no more true neighbors?
1533  break;
1534 
1535  double rnkErr = 0.0; // rank error
1536  // reported j-th distance
1537  ANNdist rept_dist = curr_apx_dst[j];
1538  int rnk = 0; // compute rank of this item
1539  while (rnk < true_nn && curr_tru_dst[rnk] <= rept_dist)
1540  rnk++;
1541  if (j+1-rnk > 0) rnkErr = (double) (j+1-rnk);
1542  ann_rank_err += rnkErr; // update average rank error
1543  }
1544  #endif
1545  //----------------------------------------------------------------
1546  // Check range counts from fixed-radius query
1547  //----------------------------------------------------------------
1548  if (radius_bound != 0) { // fixed-radius search
1549  if (apx_pts_in_range[i] < min_pts_in_range[i] ||
1551  Error("INTERNAL ERROR: Invalid fixed-radius range count",
1552  ANNabort);
1553  }
1554 
1555  curr_apx_idx += near_neigh;
1556  curr_apx_dst += near_neigh;
1557  curr_tru_idx += true_nn; // increment current pointers
1558  curr_tru_dst += true_nn;
1559  }
1560 }
1561 
1562 //----------------------------------------------------------------------
1563 // treeStats
1564 // Computes a number of statistics related to kd_trees and
1565 // bd_trees. These statistics are printed if in verbose mode,
1566 // and otherwise they are only printed if they are deemed to
1567 // be outside of reasonable operating bounds.
1568 //----------------------------------------------------------------------
1569 
1570 #define log2(x) (log(x)/log(2.0)) // log base 2
1571 
1573  ostream &out, // output stream
1574  ANNbool verbose) // print stats
1575 {
1576  const int MIN_PTS = 20; // min no. pts for checking
1577  const float MAX_FRAC_TL = 0.50; // max frac of triv leaves
1578  const float MAX_AVG_AR = 20; // max average aspect ratio
1579 
1580  ANNkdStats st; // statistics structure
1581 
1582  the_tree->getStats(st); // get statistics
1583  // total number of nodes
1584  int n_nodes = st.n_lf + st.n_spl + st.n_shr;
1585  // should be O(n/bs)
1586  int opt_n_nodes = (int) (2*(float(st.n_pts)/st.bkt_size));
1587  int too_many_nodes = 10*opt_n_nodes;
1588  if (st.n_pts >= MIN_PTS && n_nodes > too_many_nodes) {
1589  out << "-----------------------------------------------------------\n";
1590  out << "Warning: The tree has more than 10x as many nodes as points.\n";
1591  out << "You may want to consider a different split or shrink method.\n";
1592  out << "-----------------------------------------------------------\n";
1593  verbose = ANNtrue;
1594  }
1595  // fraction of trivial leaves
1596  float frac_tl = (st.n_lf == 0 ? 0 : ((float) st.n_tl)/ st.n_lf);
1597  if (st.n_pts >= MIN_PTS && frac_tl > MAX_FRAC_TL) {
1598  out << "-----------------------------------------------------------\n";
1599  out << "Warning: A significant fraction of leaves contain no points.\n";
1600  out << "You may want to consider a different split or shrink method.\n";
1601  out << "-----------------------------------------------------------\n";
1602  verbose = ANNtrue;
1603  }
1604  // depth should be O(dim*log n)
1605  int too_many_levels = (int) (2.0 * st.dim * log2((double) st.n_pts));
1606  int opt_levels = (int) log2(double(st.n_pts)/st.bkt_size);
1607  if (st.n_pts >= MIN_PTS && st.depth > too_many_levels) {
1608  out << "-----------------------------------------------------------\n";
1609  out << "Warning: The tree is more than 2x as deep as (dim*log n).\n";
1610  out << "You may want to consider a different split or shrink method.\n";
1611  out << "-----------------------------------------------------------\n";
1612  verbose = ANNtrue;
1613  }
1614  // average leaf aspect ratio
1615  if (st.n_pts >= MIN_PTS && st.avg_ar > MAX_AVG_AR) {
1616  out << "-----------------------------------------------------------\n";
1617  out << "Warning: Average aspect ratio of cells is quite large.\n";
1618  out << "This may slow queries depending on the point distribution.\n";
1619  out << "-----------------------------------------------------------\n";
1620  verbose = ANNtrue;
1621  }
1622 
1623  //------------------------------------------------------------------
1624  // Print summaries if requested
1625  //------------------------------------------------------------------
1626  if (verbose) { // output statistics
1627  out << " (Structure Statistics:\n";
1628  out << " n_nodes = " << n_nodes
1629  << " (opt = " << opt_n_nodes
1630  << ", best if < " << too_many_nodes << ")\n"
1631  << " n_leaves = " << st.n_lf
1632  << " (" << st.n_tl << " contain no points)\n"
1633  << " n_splits = " << st.n_spl << "\n"
1634  << " n_shrinks = " << st.n_shr << "\n";
1635  out << " empty_leaves = " << frac_tl*100
1636  << " percent (best if < " << MAX_FRAC_TL*100 << " percent)\n";
1637  out << " depth = " << st.depth
1638  << " (opt = " << opt_levels
1639  << ", best if < " << too_many_levels << ")\n";
1640  out << " avg_aspect_ratio = " << st.avg_ar
1641  << " (best if < " << MAX_AVG_AR << ")\n";
1642  out << " )\n";
1643  }
1644 }
int * apx_pts_in_range
Definition: ann_test.cpp:538
ANNpointArray data_pts
Definition: ann_test.cpp:533
DLL_API void annClose()
Definition: kd_tree.cpp:221
int query_size
Definition: ann_test.cpp:474
const char shrink_table[N_SHRINK_RULES][STRING_LEN]
Definition: ann_test.cpp:363
int * max_pts_in_range
Definition: ann_test.cpp:542
int nPoints()
Definition: ANN.h:766
int ANNidx
Definition: ANN.h:175
ANNpoint * ANNpointArray
Definition: ANN.h:376
const int def_max_visit
Definition: ann_test.cpp:455
DLL_API ANNsampStat ann_rank_err
Definition: perf.cpp:65
const int def_seed
Definition: ann_test.cpp:459
void annClusGaussPts(ANNpointArray pa, int n, int dim, int n_clus, ANNbool new_clust, double std_dev)
Definition: rand.cpp:314
int n_pts
Definition: ANNperf.h:48
DLL_API void annResetStats(int data_size)
Definition: perf.cpp:71
const int STRING_LEN
Definition: kd_dump.cpp:43
const double def_epsilon
Definition: ann_test.cpp:453
virtual void Print(ANNbool with_pts, std::ostream &out)
Definition: kd_tree.cpp:104
ANNdistArray apx_dists
Definition: ann_test.cpp:537
const double RND_OFF
Definition: ann_test.cpp:285
void printPoint(ANNpoint p, int dim)
Definition: ann_test.cpp:389
virtual void getStats(ANNkdStats &st)
Definition: kd_tree.cpp:191
void doValidation()
Definition: ann_test.cpp:1467
int depth
Definition: ANNperf.h:54
double corr_coef
Definition: ann_test.cpp:479
int bucket_size
Definition: ann_test.cpp:483
double epsilon
Definition: ann_test.cpp:484
int annkFRSearch(ANNpoint q, ANNdist sqRad, int k, ANNidxArray nn_idx=NULL, ANNdistArray dd=NULL, double eps=0.0)
const ANNshrinkRule def_shrink
Definition: ann_test.cpp:466
const int def_true_nn
Definition: ann_test.cpp:458
double ANNdist
Definition: ANN.h:159
const double def_corr_coef
Definition: ann_test.cpp:451
ANNerr
Definition: ANNx.h:48
StatLev
Definition: ann_test.cpp:297
void annkSearch(ANNpoint q, int k, ANNidxArray nn_idx, ANNdistArray dd, double eps=0.0)
Definition: kd_search.cpp:89
ANNbool validate
Definition: ann_test.cpp:489
const ANNbool def_validate
Definition: ann_test.cpp:460
DLL_API void annUpdateStats()
Definition: perf.cpp:95
int data_size
Definition: ann_test.cpp:473
int n_tl
Definition: ANNperf.h:51
const ANNsplitRule def_split
Definition: ann_test.cpp:464
ANNdistArray true_dists
Definition: ann_test.cpp:540
ANNcoord * ANNpoint
Definition: ANN.h:375
int n_color
Definition: ann_test.cpp:475
ANNbool valid_dirty
Definition: ann_test.cpp:544
ANNpointArray query_pts
Definition: ann_test.cpp:534
ANNbd_tree * the_tree
Definition: ann_test.cpp:535
const StatLev def_stats
Definition: ann_test.cpp:462
const ANNidx ANN_NULL_IDX
Definition: ANN.h:176
DLL_API ANNsampStat ann_average_err
Definition: perf.cpp:64
const double ERR
Definition: kd_split.cpp:34
const int def_bucket_size
Definition: ann_test.cpp:452
double std_dev_lo
Definition: ann_test.cpp:481
void generatePts(ANNpointArray &pa, int n, PtType type, ANNbool new_clust, ANNpointArray src=NULL, int n_src=0)
Definition: ann_test.cpp:1191
Distrib distr
Definition: ann_test.cpp:478
void annkPriSearch(ANNpoint q, int k, ANNidxArray nn_idx, ANNdistArray dd, double eps=0.0)
double radius_bound
Definition: ann_test.cpp:487
DLL_API void annDeallocPts(ANNpointArray &pa)
Definition: ANN.cpp:133
const int def_rad_bound
Definition: ann_test.cpp:456
StatLev stats
Definition: ann_test.cpp:490
int annkFRSearch(ANNpoint q, ANNdist sqRad, int k=0, ANNidxArray nn_idx=NULL, ANNdistArray dd=NULL, double eps=0.0)
Definition: brute.cpp:80
Definition: ANNx.h:48
ANNbool getDirective(istream &in, char *directive)
Definition: ann_test.cpp:611
void annPlanted(ANNpointArray pa, int n, int dim, ANNpointArray src, int n_src, double std_dev)
Definition: rand.cpp:580
const double ANN_DBL_MAX
Definition: ANN.h:114
Distrib
Definition: ann_test.cpp:321
int dim
Definition: ANNperf.h:47
const int def_data_size
Definition: ann_test.cpp:444
DLL_API void annPrintStats(ANNbool validate)
Definition: perf.cpp:116
const int def_near_neigh
Definition: ann_test.cpp:454
int n_spl
Definition: ANNperf.h:52
void treeStats(ostream &out, ANNbool verbose)
Definition: ann_test.cpp:1572
ANNidxArray apx_nn_idx
Definition: ann_test.cpp:536
ANNsplitRule split
Definition: ann_test.cpp:491
const int def_query_size
Definition: ann_test.cpp:445
void readPts(ANNpointArray &pa, int &n, char *file_nm, PtType type)
Definition: ann_test.cpp:1283
const int extra_nn
Definition: ann_test.cpp:441
void annClusEllipsoids(ANNpointArray pa, int n, int dim, int n_clus, ANNbool new_clust, double std_dev_small, double std_dev_lo, double std_dev_hi, int max_dim)
Definition: rand.cpp:503
void annLaplacePts(ANNpointArray pa, int n, int dim)
Definition: rand.cpp:232
ANNdist * ANNdistArray
Definition: ANN.h:377
ANNshrinkRule shrink
Definition: ann_test.cpp:492
int n_shr
Definition: ANNperf.h:53
void getTrueNN()
Definition: ann_test.cpp:1374
int true_nn
Definition: ann_test.cpp:488
void annGaussPts(ANNpointArray pa, int n, int dim, double std_dev)
Definition: rand.cpp:214
const char split_table[N_SPLIT_RULES][STRING_LEN]
Definition: ann_test.cpp:350
const double def_std_dev
Definition: ann_test.cpp:450
ANNidxArray true_nn_idx
Definition: ann_test.cpp:539
const char distr_table[N_DISTRIBS][STRING_LEN]
Definition: ann_test.cpp:334
int bkt_size
Definition: ANNperf.h:49
int annIdum
Definition: rand.cpp:42
void annCoGaussPts(ANNpointArray pa, int n, int dim, double correlation)
Definition: rand.cpp:250
const int def_n_color
Definition: ann_test.cpp:446
int dim
Definition: ann_test.cpp:472
Definition: ANN.h:132
int theDim()
Definition: ANN.h:763
PtType
Definition: ann_test.cpp:291
void annkSearch(ANNpoint q, int k, ANNidxArray nn_idx, ANNdistArray dd, double eps=0.0)
Definition: brute.cpp:54
float avg_ar
Definition: ANNperf.h:56
const Distrib def_distr
Definition: ann_test.cpp:449
const int N_SPLIT_RULES
Definition: ann_test.cpp:349
DLL_API void annMaxPtsVisit(int maxPts)
Definition: ANN.cpp:197
const int def_dim
Definition: ann_test.cpp:443
ANNsplitRule
Definition: ANN.h:596
int near_neigh
Definition: ann_test.cpp:485
const ANNbool def_new_clust
Definition: ann_test.cpp:447
ANNpointArray thePoints()
Definition: ANN.h:769
ANNbool skipComment(istream &in)
Definition: ann_test.cpp:594
int max_dim
Definition: ann_test.cpp:477
ANNidx * ANNidxArray
Definition: ANN.h:378
void Error(const char *msg, ANNerr level)
Definition: ann_test.cpp:376
ANNbool new_clust
Definition: ann_test.cpp:476
DLL_API ANNpointArray annAllocPts(int n, int dim)
Definition: ANN.cpp:117
const char stat_table[N_STAT_LEVELS][STRING_LEN]
Definition: ann_test.cpp:308
int lookUp(const char *arg, const char(*table)[STRING_LEN], int size)
Definition: ann_test.cpp:401
Definition: ANN.h:132
ANNbool
Definition: ANN.h:132
DLL_API void annResetCounts()
Definition: perf.cpp:85
double std_dev
Definition: ann_test.cpp:480
int n_lf
Definition: ANNperf.h:50
void annUniformPts(ANNpointArray pa, int n, int dim)
Definition: rand.cpp:196
void initGlobals()
Definition: ann_test.cpp:550
const int def_max_dim
Definition: ann_test.cpp:448
Definition: ANNx.h:48
const int N_SHRINK_RULES
Definition: ann_test.cpp:362
virtual void Dump(ANNbool with_pts, std::ostream &out)
Definition: kd_dump.cpp:102
int max_pts_visit
Definition: ann_test.cpp:486
ANNshrinkRule
Definition: ANN.h:605
double std_dev_hi
Definition: ann_test.cpp:482
int main(int argc, char **argv)
Definition: ann_test.cpp:628
void annCoLaplacePts(ANNpointArray pa, int n, int dim, double correlation)
Definition: rand.cpp:273
void annClusOrthFlats(ANNpointArray pa, int n, int dim, int n_clus, ANNbool new_clust, double std_dev, int max_dim)
Definition: rand.cpp:408
int * min_pts_in_range
Definition: ann_test.cpp:541

Generated on Tue Jun 5 2018 19:48:54 for queso-0.57.1 by  doxygen 1.8.5