queso-0.53.0
ann_test.cpp
Go to the documentation of this file.
1 //----------------------------------------------------------------------
2 // File: ann_test.cpp
3 // Programmer: Sunil Arya and David Mount
4 // Description: test program for ANN (approximate nearest neighbors)
5 // Last modified: 01/27/10 (Version 1.1.2)
6 //----------------------------------------------------------------------
7 // Copyright (c) 1997-2010 University of Maryland and Sunil Arya and
8 // David Mount. All Rights Reserved.
9 //
10 // This software and related documentation is part of the Approximate
11 // Nearest Neighbor Library (ANN). This software is provided under
12 // the provisions of the Lesser GNU Public License (LGPL). See the
13 // file ../ReadMe.txt for further information.
14 //
15 // The University of Maryland (U.M.) and the authors make no
16 // representations about the suitability or fitness of this software for
17 // any purpose. It is provided "as is" without express or implied
18 // warranty.
19 //----------------------------------------------------------------------
20 // History:
21 // Revision 0.1 03/04/98
22 // Initial release
23 // Revision 0.2 06/26/98
24 // Added CLOCKS_PER_SEC definition if needed
25 // Revision 1.0 04/01/05
26 // Added comments (from "#" to eol)
27 // Added clus_orth_flats and clus_ellipsoids distributions
28 // Fixed order of fair and midpt in split_table
29 // Added dump/load operations
30 // Cleaned up C++ for modern compilers
31 // Revision 1.1 05/03/05
32 // Added fixed radius kNN search
33 // Revision 1.1.1 08/04/06
34 // Added planted distribution
35 // Revision 1.1.2 01/27/10
36 // Fixed minor compilation bugs for new versions of gcc
37 // Allow round-off error in validation test
38 //----------------------------------------------------------------------
39 
40 #include <ctime> // clock
41 #include <cmath> // math routines
42 #include <cstring> // C string ops
43 #include <fstream> // file I/O
44 
45 #include <ANN/ANN.h> // ANN declarations
46 #include <ANN/ANNx.h> // more ANN declarations
47 #include <ANN/ANNperf.h> // performance evaluation
48 
49 #include "rand.h" // random point generation
50 
51 #ifndef CLOCKS_PER_SEC // define clocks-per-second if needed
52  #define CLOCKS_PER_SEC 1000000
53 #endif
54 
55 using namespace std; // make std:: available
56 
57 //----------------------------------------------------------------------
58 // ann_test
59 //
60 // This program is a driver for testing and evaluating the ANN library
61 // for computing approximate nearest neighbors. It allows the user to
62 // generate data and query sets of various sizes, dimensions, and
63 // distributions, to build kd- and bbd-trees of various types, and then
64 // run queries and outputting various performance statistics.
65 //
66 // Overview:
67 // ---------
68 // The test program is run as follows:
69 //
70 // ann_test < test_input > test_output
71 //
72 // where the test_input file contains a list of directives as described
73 // below. Directives consist of a directive name, followed by list of
74 // arguments (depending on the directive). Arguments and directives are
75 // separated by white space (blank, tab, and newline). String arguments
76 // are not quoted, and consist of a string of nonwhite chacters. A
77 // character "#" denotes a comment. The following characters up to
78 // the end of line are ignored. Comments may only be inserted between
79 // directives (not within the argument list of a directive).
80 //
81 // Basic operations:
82 // -----------------
83 // The test program can perform the following operations. How these
84 // operations are performed depends on the options which are described
85 // later.
86 //
87 // Data Generation:
88 // ----------------
89 // read_data_pts <file> Create a set of data points whose
90 // coordinates are input from file <file>.
91 // gen_data_pts Create a set of data points whose
92 // coordinates are generated from the
93 // current point distribution.
94 //
95 // Building the tree:
96 // ------------------
97 // build_ann Generate an approximate nearest neighbor
98 // structure for the current data set, using
99 // the selected splitting rules. Any existing
100 // tree will be destroyed.
101 //
102 // Query Generation/Searching:
103 // ---------------------------
104 // read_query_pts <file> Create a set of query points whose
105 // coordinates are input from file <file>.
106 // gen_query_pts Create a set of query points whose
107 // coordinates are generated from the
108 // current point distribution.
109 // run_queries <string> Apply nearest neighbor searching to the
110 // query points using the approximate nearest
111 // neighbor structure and the given search
112 // strategy. Possible strategies are:
113 // standard = standard kd-tree search
114 // priority = priority search
115 //
116 // Miscellaneous:
117 // --------------
118 // output_label Output a label to the output file.
119 // dump <file> Dump the current structure to given file.
120 // (The dump format is explained further in
121 // the source file kd_tree.cc.)
122 // load <file> Load a tree from a data file which was
123 // created by the dump operation. Any
124 // existing tree will be destroyed.
125 //
126 // Options:
127 // --------
128 // How these operations are performed depends on a set of options.
129 // If an option is not specified, a default value is used. An option
130 // retains its value until it is set again. String inputs are not
131 // enclosed in quotes, and must contain no embedded white space (sorry,
132 // this is C++'s convention).
133 //
134 // Options affecting search tree structure:
135 // ----------------------------------------
136 // split_rule <type> Type of splitting rule to use in building
137 // the search tree. Choices are:
138 // kd = optimized kd-tree
139 // midpt = midpoint split
140 // fair = fair split
141 // sl_midpt = sliding midpt split
142 // sl_fair = sliding fair split
143 // suggest = authors' choice for best
144 // The default is "suggest". See the file
145 // kd_split.cc for more detailed information.
146 //
147 // shrink_rule <type> Type of shrinking rule to use in building
148 // a bd-tree data structure. If "none" is
149 // given, then no shrinking is performed and
150 // the result is a kd-tree. Choices are:
151 // none = perform no shrinking
152 // simple = simple shrinking
153 // centroid = centroid shrinking
154 // suggest = authors' choice for best
155 // The default is "none". See the file
156 // bd_tree.cc for more information.
157 // bucket_size <int> Bucket size, that is, the maximum number of
158 // points stored in each leaf node.
159 //
160 // Options affecting data and query point generation:
161 // --------------------------------------------------
162 // dim <int> Dimension of space.
163 // seed <int> Seed for random number generation.
164 // data_size <int> Number of data points. When reading data
165 // points from a file, this indicates the
166 // maximum number of points for storage
167 // allocation. Default = 100.
168 // query_size <int> Same as data_size for query points.
169 // std_dev <float> Standard deviation (used in gauss,
170 // planted, and clustered distributions).
171 // This is the "small" distribution for
172 // clus_ellipsoids. Default = 1.
173 // std_dev_lo <float> Low and high standard deviations (used in
174 // std_dev_hi <float> clus_ellipsoids). Default = 1.
175 // corr_coef <float> Correlation coefficient (used in co-gauss
176 // and co_lapace distributions). Default = 0.05.
177 // colors <int> Number of color classes (clusters) (used
178 // in the clustered distributions). Default = 5.
179 // new_clust Once generated, cluster centers are not
180 // normally regenerated. This is so that both
181 // query points and data points can be generated
182 // using the same set of clusters. This option
183 // forces new cluster centers to be generated
184 // with the next generation of either data or
185 // query points.
186 // max_clus_dim <int> Maximum dimension of clusters (used in
187 // clus_orth_flats and clus_ellipsoids).
188 // Default = 1.
189 // distribution <string> Type of input distribution
190 // uniform = uniform over cube [-1,1]^d.
191 // gauss = Gaussian with mean 0
192 // laplace = Laplacian, mean 0 and var 1
193 // co_gauss = correlated Gaussian
194 // co_laplace = correlated Laplacian
195 // clus_gauss = clustered Gaussian
196 // clus_orth_flats = clusters of orth flats
197 // clus_ellipsoids = clusters of ellipsoids
198 // planted = planted distribution
199 // See the file rand.cpp for further information.
200 //
201 // Options affecting nearest neighbor search:
202 // ------------------------------------------
203 // epsilon <float> Error bound for approx. near neigh. search.
204 // near_neigh <int> Number of nearest neighbors to compute.
205 // max_pts_visit <int> Maximum number of points to visit before
206 // terminating. (Used in applications where
207 // real-time performance is important.)
208 // (Default = 0, which means no limit.)
209 // radius_bound <float> Sets an upper bound on the nearest
210 // neighbor search radius. If the bound is
211 // positive, then fixed-radius nearest
212 // neighbor searching is performed, and the
213 // count of the number of points in the
214 // range is returned. If the bound is
215 // zero, then standard search is used.
216 // This can only be used with standard, not
217 // priority, search. (Default = 0, which
218 // means standard search.)
219 //
220 // Options affection general program behavior:
221 // -------------------------------------------
222 // stats <string> Level of statistics output
223 // silent = no output,
224 // exec_time += execution time only
225 // prep_stats += preprocessing statistics
226 // query_stats += query performance stats
227 // query_res += results of queries
228 // show_pts += show the data points
229 // show_struct += print search structure
230 // validate <string> Validate experiment and compute average
231 // error. Since validation causes exact
232 // nearest neighbors to be computed by the
233 // brute force method, this can take a long
234 // time. Valid arguments are:
235 // on = turn validation on
236 // off = turn validation off
237 // true_near_neigh <int> Number of true nearest neighbors to compute.
238 // When validating, we compute the difference
239 // in rank between each reported nearest neighbor
240 // and the true nearest neighbor of the same
241 // rank. Thus it is necessary to compute a
242 // few more true nearest neighbors. By default
243 // we compute 10 more than near_neigh. With
244 // this option the exact number can be set.
245 // (Used only when validating.)
246 //
247 // Example:
248 // --------
249 // output_label test_run_0 # output label for this run
250 // validate off # do not perform validation
251 // dim 16 # points in dimension 16
252 // stats query_stats # output performance statistics for queries
253 // seed 121212 # random number seed
254 // data_size 1000
255 // distribution uniform
256 // gen_data_pts # 1000 uniform data points in dim 16
257 // query_size 100
258 // std_dev 0.05
259 // distribution clus_gauss
260 // gen_query_pts # 100 points in 10 clusters with std_dev 0.05
261 // bucket_size 2
262 // split_rule kd
263 // shrink_rule none
264 // build_ann # kd-tree, bucket size 2
265 // epsilon 0.1
266 // near_neigh 5
267 // max_pts_visit 100 # stop search if more than 100 points seen
268 // run_queries standard # run queries; 5 nearest neighbors, 10% error
269 // data_size 500
270 // read_data_pts data.in # read up to 500 points from file data.in
271 // split_rule sl_midpt
272 // shrink_rule simple
273 // build_ann # bd-tree; simple shrink, sliding midpoint split
274 // epsilon 0
275 // run_queries priority # run same queries; 0 allowable error
276 //
277 //------------------------------------------------------------------------
278 
279 //------------------------------------------------------------------------
280 // Constants
281 //------------------------------------------------------------------------
282 
283 const int STRING_LEN = 500; // max string length
284 const double ERR = 0.00001; // epsilon (for float compares)
285 const double RND_OFF = 5E-16; // double round-off error
286 
287 //------------------------------------------------------------------------
288 // Enumerated values and conversions
289 //------------------------------------------------------------------------
290 
291 typedef enum {DATA, QUERY} PtType; // point types
292 
293 //------------------------------------------------------------------------
294 // Statistics output levels
295 //------------------------------------------------------------------------
296 
297 typedef enum { // stat levels
298  SILENT, // no output
299  EXEC_TIME, // just execution time
300  PREP_STATS, // preprocessing info
301  QUERY_STATS, // query performance
302  QUERY_RES, // query results
303  SHOW_PTS, // show data points
304  SHOW_STRUCT, // show tree structure
305  N_STAT_LEVELS} // number of levels
306  StatLev;
307 
309  "silent", // SILENT
310  "exec_time", // EXEC_TIME
311  "prep_stats", // PREP_STATS
312  "query_stats", // QUERY_STATS
313  "query_res", // QUERY_RES
314  "show_pts", // SHOW_PTS
315  "show_struct"}; // SHOW_STRUCT
316 
317 //------------------------------------------------------------------------
318 // Distributions
319 //------------------------------------------------------------------------
320 
321 typedef enum { // distributions
322  UNIFORM, // uniform over cube [-1,1]^d.
323  GAUSS, // Gaussian with mean 0
324  LAPLACE, // Laplacian, mean 0 and var 1
325  CO_GAUSS, // correlated Gaussian
326  CO_LAPLACE, // correlated Laplacian
327  CLUS_GAUSS, // clustered Gaussian
328  CLUS_ORTH_FLATS, // clustered on orthog flats
329  CLUS_ELLIPSOIDS, // clustered on ellipsoids
330  PLANTED, // planted distribution
332  Distrib;
333 
335  "uniform", // UNIFORM
336  "gauss", // GAUSS
337  "laplace", // LAPLACE
338  "co_gauss", // CO_GAUSS
339  "co_laplace", // CO_LAPLACE
340  "clus_gauss", // CLUS_GAUSS
341  "clus_orth_flats", // CLUS_ORTH_FLATS
342  "clus_ellipsoids", // CLUS_ELLIPSOIS
343  "planted"}; // PLANTED
344 
345 //------------------------------------------------------------------------
346 // Splitting rules for kd-trees (see ANN.h for types)
347 //------------------------------------------------------------------------
348 
349 const int N_SPLIT_RULES = 6;
351  "standard", // standard optimized kd-tree
352  "midpt", // midpoint split
353  "fair", // fair split
354  "sl_midpt", // sliding midpt split
355  "sl_fair", // sliding fair split
356  "suggest"}; // authors' choice for best
357 
358 //------------------------------------------------------------------------
359 // Shrinking rules for bd-trees (see ANN.h for types)
360 //------------------------------------------------------------------------
361 
362 const int N_SHRINK_RULES = 4;
364  "none", // perform no shrinking (kd-tree)
365  "simple", // simple shrinking
366  "centroid", // centroid shrinking
367  "suggest"}; // authors' choice for best
368 
369 //----------------------------------------------------------------------
370 // Short utility functions
371 // Error - general error routine
372 // printPoint - print a point to standard output
373 // lookUp - look up a name in table and return index
374 //----------------------------------------------------------------------
375 
376 void Error( // error routine
377  const char* msg, // error message
378  ANNerr level) // abort afterwards
379 {
380  if (level == ANNabort) {
381  cerr << "ann_test: ERROR------->" << msg << "<-------------ERROR\n";
382  exit(1);
383  }
384  else {
385  cerr << "ann_test: WARNING----->" << msg << "<-------------WARNING\n";
386  }
387 }
388 
389 void printPoint( // print point
390  ANNpoint p, // the point
391  int dim) // the dimension
392 {
393  cout << "[";
394  for (int i = 0; i < dim; i++) {
395  cout << p[i];
396  if (i < dim-1) cout << ",";
397  }
398  cout << "]";
399 }
400 
401 int lookUp( // look up name in table
402  const char* arg, // name to look up
403  const char (*table)[STRING_LEN], // name table
404  int size) // table size
405 {
406  int i;
407  for (i = 0; i < size; i++) {
408  if (!strcmp(arg, table[i])) return i;
409  }
410  return i;
411 }
412 
413 //------------------------------------------------------------------------
414 // Function declarations
415 //------------------------------------------------------------------------
416 
417 void generatePts( // generate data/query points
418  ANNpointArray &pa, // point array (returned)
419  int n, // number of points
420  PtType type, // point type
421  ANNbool new_clust, // new cluster centers desired?
422  ANNpointArray src = NULL, // source array (for PLANTED)
423  int n_src = 0); // source size (for PLANTED)
424 
425 void readPts( // read data/query points from file
426  ANNpointArray &pa, // point array (returned)
427  int &n, // number of points
428  char *file_nm, // file name
429  PtType type); // point type (DATA, QUERY)
430 
431 void doValidation(); // perform validation
432 void getTrueNN(); // compute true nearest neighbors
433 
434 void treeStats( // print statistics on kd- or bd-tree
435  ostream &out, // output stream
436  ANNbool verbose); // print stats
437 
438 //------------------------------------------------------------------------
439 // Default execution parameters
440 //------------------------------------------------------------------------
441 const int extra_nn = 10; // how many extra true nn's?
442 
443 const int def_dim = 2; // def dimension
444 const int def_data_size = 100; // def data size
445 const int def_query_size = 100; // def number of queries
446 const int def_n_color = 5; // def number of colors
447 const ANNbool def_new_clust = ANNfalse; // def new clusters flag
448 const int def_max_dim = 1; // def max flat dimension
449 const Distrib def_distr = UNIFORM; // def distribution
450 const double def_std_dev = 1.00; // def standard deviation
451 const double def_corr_coef = 0.05; // def correlation coef
452 const int def_bucket_size = 1; // def bucket size
453 const double def_epsilon = 0.0; // def error bound
454 const int def_near_neigh = 1; // def number of near neighbors
455 const int def_max_visit = 0; // def number of points visited
456 const int def_rad_bound = 0; // def radius bound
457  // def number of true nn's
459 const int def_seed = 0; // def seed for random numbers
460 const ANNbool def_validate = ANNfalse; // def validation flag
461  // def statistics output level
463 const ANNsplitRule // def splitting rule
465 const ANNshrinkRule // def shrinking rule
467 
468 //------------------------------------------------------------------------
469 // Global variables - Execution options
470 //------------------------------------------------------------------------
471 
472 int dim; // dimension
473 int data_size; // data size
474 int query_size; // number of queries
475 int n_color; // number of colors
476 ANNbool new_clust; // generate new clusters?
477 int max_dim; // maximum flat dimension
478 Distrib distr; // distribution
479 double corr_coef; // correlation coef
480 double std_dev; // standard deviation
481 double std_dev_lo; // low standard deviation
482 double std_dev_hi; // high standard deviation
483 int bucket_size; // bucket size
484 double epsilon; // error bound
485 int near_neigh; // number of near neighbors
486 int max_pts_visit; // max number of points to visit
487 double radius_bound; // maximum radius search bound
488 int true_nn; // number of true nn's
489 ANNbool validate; // validation flag
490 StatLev stats; // statistics output level
491 ANNsplitRule split; // splitting rule
492 ANNshrinkRule shrink; // shrinking rule
493 
494 //------------------------------------------------------------------------
495 // More globals - pointers to dynamically allocated arrays and structures
496 //
497 // It is assumed that all these values are set to NULL when nothing
498 // is allocated.
499 //
500 // data_pts, query_pts The data and query points
501 // the_tree Points to the kd- or bd-tree for
502 // nearest neighbor searching.
503 // apx_nn_idx, apx_dists Record approximate near neighbor
504 // indices and distances
505 // apx_pts_in_range Counts of the number of points in
506 // the in approx range, for fixed-
507 // radius NN searching.
508 // true_nn_idx, true_dists Record true near neighbor
509 // indices and distances
510 // min_pts_in_range, max_... Min and max counts of the number
511 // of points in the in approximate
512 // range.
513 // valid_dirty To avoid repeated validation,
514 // we only validate query results
515 // once. This validation becomes
516 // invalid, if a new tree, new data
517 // points or new query points have
518 // been generated.
519 // tree_data_size The number of points in the
520 // current tree. (This will be the
521 // same a data_size unless points have
522 // been added since the tree was
523 // built.)
524 //
525 // The approximate and true nearest neighbor results are stored
526 // in: apx_nn_idx, apx_dists, and true_nn_idx, true_dists.
527 // They are really flattened 2-dimensional arrays. Each of these
528 // arrays consists of query_size blocks, each of which contains
529 // near_neigh (or true_nn) entries, one for each of the nearest
530 // neighbors for a given query point.
531 //------------------------------------------------------------------------
532 
533 ANNpointArray data_pts; // data points
534 ANNpointArray query_pts; // query points
535 ANNbd_tree* the_tree; // kd- or bd-tree search structure
536 ANNidxArray apx_nn_idx; // storage for near neighbor indices
537 ANNdistArray apx_dists; // storage for near neighbor distances
538 int* apx_pts_in_range; // storage for no. of points in range
539 ANNidxArray true_nn_idx; // true near neighbor indices
540 ANNdistArray true_dists; // true near neighbor distances
541 int* min_pts_in_range; // min points in approx range
542 int* max_pts_in_range; // max points in approx range
543 
544 ANNbool valid_dirty; // validation is no longer valid
545 
546 //------------------------------------------------------------------------
547 // Initialize global parameters
548 //------------------------------------------------------------------------
549 
551 {
552  dim = def_dim; // init execution parameters
555  distr = def_distr;
570  stats = def_stats;
571  split = def_split;
572  shrink = def_shrink;
573  annIdum = -def_seed; // init. global seed for ran0()
574 
575  data_pts = NULL; // initialize storage pointers
576  query_pts = NULL;
577  the_tree = NULL;
578  apx_nn_idx = NULL;
579  apx_dists = NULL;
580  apx_pts_in_range = NULL;
581  true_nn_idx = NULL;
582  true_dists = NULL;
583  min_pts_in_range = NULL;
584  max_pts_in_range = NULL;
585 
586  valid_dirty = ANNtrue; // (validation must be done)
587 }
588 
589 //------------------------------------------------------------------------
590 // getDirective - skip comments and read next directive
591 // Returns ANNtrue if directive read, and ANNfalse if eof seen.
592 //------------------------------------------------------------------------
593 
594 ANNbool skipComment( // skip any comments
595  istream &in) // input stream
596 {
597  char ch = 0;
598  // skip whitespace
599  do { in.get(ch); } while (isspace(ch) && !in.eof());
600  while (ch == '#' && !in.eof()) { // comment?
601  // skip to end of line
602  do { in.get(ch); } while(ch != '\n' && !in.eof());
603  // skip whitespace
604  do { in.get(ch); } while(isspace(ch) && !in.eof());
605  }
606  if (in.eof()) return ANNfalse; // end of file
607  in.putback(ch); // put character back
608  return ANNtrue;
609 }
610 
612  istream &in, // input stream
613  char *directive) // directive storage
614 {
615  if (!skipComment(in)) // skip comments
616  return ANNfalse; // found eof along the way?
617  in >> directive; // read directive
618  return ANNtrue;
619 }
620 
621 
622 //------------------------------------------------------------------------
623 // main program - driver
624 // The main program reads input options, invokes the necessary
625 // routines to process them.
626 //------------------------------------------------------------------------
627 
628 int main(int argc, char** argv)
629 {
630  long clock0; // clock time
631  char directive[STRING_LEN]; // input directive
632  char arg[STRING_LEN]; // all-purpose argument
633 
634  cout << "------------------------------------------------------------\n"
635  << "ann_test: Version " << ANNversion << " " << ANNversionCmt << "\n"
636  << " Copyright: " << ANNcopyright << ".\n"
637  << " Latest Revision: " << ANNlatestRev << ".\n"
638  << "------------------------------------------------------------\n\n";
639 
640  initGlobals(); // initialize global values
641 
642  //--------------------------------------------------------------------
643  // Main input loop
644  //--------------------------------------------------------------------
645  // read input directive
646  while (getDirective(cin, directive)) {
647  //----------------------------------------------------------------
648  // Read options
649  //----------------------------------------------------------------
650  if (!strcmp(directive,"dim")) {
651  cin >> dim;
652  }
653  else if (!strcmp(directive,"colors")) {
654  cin >> n_color;
655  }
656  else if (!strcmp(directive,"new_clust")) {
657  new_clust = ANNtrue;
658  }
659  else if (!strcmp(directive,"max_clus_dim")) {
660  cin >> max_dim;
661  }
662  else if (!strcmp(directive,"std_dev")) {
663  cin >> std_dev;
664  }
665  else if (!strcmp(directive,"std_dev_lo")) {
666  cin >> std_dev_lo;
667  }
668  else if (!strcmp(directive,"std_dev_hi")) {
669  cin >> std_dev_hi;
670  }
671  else if (!strcmp(directive,"corr_coef")) {
672  cin >> corr_coef;
673  }
674  else if (!strcmp(directive, "data_size")) {
675  cin >> data_size;
676  }
677  else if (!strcmp(directive,"query_size")) {
678  cin >> query_size;
679  }
680  else if (!strcmp(directive,"bucket_size")) {
681  cin >> bucket_size;
682  }
683  else if (!strcmp(directive,"epsilon")) {
684  cin >> epsilon;
685  }
686  else if (!strcmp(directive,"max_pts_visit")) {
687  cin >> max_pts_visit;
688  valid_dirty = ANNtrue; // validation must be redone
689  }
690  else if (!strcmp(directive,"radius_bound")) {
691  cin >> radius_bound;
692  valid_dirty = ANNtrue; // validation must be redone
693  }
694  else if (!strcmp(directive,"near_neigh")) {
695  cin >> near_neigh;
696  true_nn = near_neigh + extra_nn; // also reset true near neighs
697  valid_dirty = ANNtrue; // validation must be redone
698  }
699  else if (!strcmp(directive,"true_near_neigh")) {
700  cin >> true_nn;
701  valid_dirty = ANNtrue; // validation must be redone
702  }
703  //----------------------------------------------------------------
704  // seed option
705  // The seed is reset by setting the global annIdum to the
706  // negation of the seed value. See rand.cpp.
707  //----------------------------------------------------------------
708  else if (!strcmp(directive,"seed")) {
709  cin >> annIdum;
710  annIdum = -annIdum;
711  }
712  //----------------------------------------------------------------
713  // validate option
714  //----------------------------------------------------------------
715  else if (!strcmp(directive,"validate")) {
716  cin >> arg; // input argument
717  if (!strcmp(arg, "on")) {
718  validate = ANNtrue;
719  cout << "validate = on "
720  << "(Warning: this may slow execution time.)\n";
721  }
722  else if (!strcmp(arg, "off")) {
723  validate = ANNfalse;
724  }
725  else {
726  cerr << "Argument: " << arg << "\n";
727  Error("validate argument must be \"on\" or \"off\"", ANNabort);
728  }
729  }
730  //----------------------------------------------------------------
731  // distribution option
732  //----------------------------------------------------------------
733  else if (!strcmp(directive,"distribution")) {
734  cin >> arg; // input name and translate
736  if (distr >= N_DISTRIBS) { // not something we recognize
737  cerr << "Distribution: " << arg << "\n";
738  Error("Unknown distribution", ANNabort);
739  }
740  }
741  //----------------------------------------------------------------
742  // stats option
743  //----------------------------------------------------------------
744  else if (!strcmp(directive,"stats")) {
745  cin >> arg; // input name and translate
747  if (stats >= N_STAT_LEVELS) { // not something we recognize
748  cerr << "Stats level: " << arg << "\n";
749  Error("Unknown statistics level", ANNabort);
750  }
751  if (stats > SILENT)
752  cout << "stats = " << arg << "\n";
753  }
754  //----------------------------------------------------------------
755  // split_rule option
756  //----------------------------------------------------------------
757  else if (!strcmp(directive,"split_rule")) {
758  cin >> arg; // input split_rule name
760  if (split >= N_SPLIT_RULES) { // not something we recognize
761  cerr << "Splitting rule: " << arg << "\n";
762  Error("Unknown splitting rule", ANNabort);
763  }
764  }
765  //----------------------------------------------------------------
766  // shrink_rule option
767  //----------------------------------------------------------------
768  else if (!strcmp(directive,"shrink_rule")) {
769  cin >> arg; // input split_rule name
771  if (shrink >= N_SHRINK_RULES) { // not something we recognize
772  cerr << "Shrinking rule: " << arg << "\n";
773  Error("Unknown shrinking rule", ANNabort);
774  }
775  }
776  //----------------------------------------------------------------
777  // label operation
778  //----------------------------------------------------------------
779  else if (!strcmp(directive,"output_label")) {
780  cin >> arg;
781  if (stats > SILENT)
782  cout << "<" << arg << ">\n";
783  }
784  //----------------------------------------------------------------
785  // gen_data_pts operation
786  //----------------------------------------------------------------
787  else if (!strcmp(directive,"gen_data_pts")) {
788  if (distr == PLANTED) { // planted distribution
789  Error("Cannot use planted distribution for data points", ANNabort);
790  }
791  generatePts( // generate data points
792  data_pts, // data points
793  data_size, // data size
794  DATA, // data points
795  new_clust); // new clusters flag
796  valid_dirty = ANNtrue; // validation must be redone
797  new_clust = ANNfalse; // reset flag
798  }
799  //----------------------------------------------------------------
800  // gen_query_pts operation
801  // If the distribution is PLANTED, then the query points
802  // are planted near the data points (which must already be
803  // generated).
804  //----------------------------------------------------------------
805  else if (!strcmp(directive,"gen_query_pts")) {
806  if (distr == PLANTED) { // planted distribution
807  if (data_pts == NULL) {
808  Error("Must generate data points before query points for planted distribution", ANNabort);
809  }
810  generatePts( // generate query points
811  query_pts, // point array
812  query_size, // number of query points
813  QUERY, // query points
814  new_clust, // new clusters flag
815  data_pts, // plant around data pts
816  data_size);
817  }
818  else { // all other distributions
819  generatePts( // generate query points
820  query_pts, // point array
821  query_size, // number of query points
822  QUERY, // query points
823  new_clust); // new clusters flag
824  }
825  valid_dirty = ANNtrue; // validation must be redone
826  new_clust = ANNfalse; // reset flag
827  }
828  //----------------------------------------------------------------
829  // read_data_pts operation
830  //----------------------------------------------------------------
831  else if (!strcmp(directive,"read_data_pts")) {
832  cin >> arg; // input file name
833  readPts(
834  data_pts, // point array
835  data_size, // number of points
836  arg, // file name
837  DATA); // data points
838  valid_dirty = ANNtrue; // validation must be redone
839  }
840  //----------------------------------------------------------------
841  // read_query_pts operation
842  //----------------------------------------------------------------
843  else if (!strcmp(directive,"read_query_pts")) {
844  cin >> arg; // input file name
845  readPts(
846  query_pts, // point array
847  query_size, // number of points
848  arg, // file name
849  QUERY); // query points
850  valid_dirty = ANNtrue; // validation must be redone
851  }
852  //----------------------------------------------------------------
853  // build_ann operation
854  // We always invoke the constructor for bd-trees. Note
855  // that when the shrinking rule is NONE (which is true by
856  // default), then this constructs a kd-tree.
857  //----------------------------------------------------------------
858  else if (!strcmp(directive,"build_ann")) {
859  //------------------------------------------------------------
860  // Build the tree
861  //------------------------------------------------------------
862  if (the_tree != NULL) { // tree exists already
863  delete the_tree; // get rid of it
864  }
865  clock0 = clock(); // start time
866 
867  the_tree = new ANNbd_tree( // build it
868  data_pts, // the data points
869  data_size, // number of points
870  dim, // dimension of space
871  bucket_size, // maximum bucket size
872  split, // splitting rule
873  shrink); // shrinking rule
874 
875  //------------------------------------------------------------
876  // Print summary
877  //------------------------------------------------------------
878  long prep_time = clock() - clock0; // end of prep time
879 
880  if (stats > SILENT) {
881  cout << "[Build ann-structure:\n";
882  cout << " split_rule = " << split_table[split] << "\n";
883  cout << " shrink_rule = " << shrink_table[shrink] << "\n";
884  cout << " data_size = " << data_size << "\n";
885  cout << " dim = " << dim << "\n";
886  cout << " bucket_size = " << bucket_size << "\n";
887 
888  if (stats >= EXEC_TIME) { // output processing time
889  cout << " process_time = "
890  << double(prep_time)/CLOCKS_PER_SEC << " sec\n";
891  }
892 
893  if (stats >= PREP_STATS) // output or check tree stats
894  treeStats(cout, ANNtrue); // print tree stats
895  else
896  treeStats(cout, ANNfalse); // check stats
897 
898  if (stats >= SHOW_STRUCT) { // print the whole tree
899  cout << " (Structure Contents:\n";
900  the_tree->Print(ANNfalse, cout);
901  cout << " )\n";
902  }
903  cout << "]\n";
904  }
905  }
906  //----------------------------------------------------------------
907  // dump operation
908  //----------------------------------------------------------------
909  else if (!strcmp(directive,"dump")) {
910  cin >> arg; // input file name
911  if (the_tree == NULL) { // no tree
912  Error("Cannot dump. No tree has been built yet", ANNwarn);
913  }
914  else { // there is a tree
915  // try to open file
916  ofstream out_dump_file(arg);
917  if (!out_dump_file) {
918  cerr << "File name: " << arg << "\n";
919  Error("Cannot open dump file", ANNabort);
920  }
921  // dump the tree and points
922  the_tree->Dump(ANNtrue, out_dump_file);
923  if (stats > SILENT) {
924  cout << "(Tree has been dumped to file " << arg << ")\n";
925  }
926  }
927  }
928  //----------------------------------------------------------------
929  // load operation
930  // Since this not only loads a tree, but loads a new set
931  // of data points.
932  //----------------------------------------------------------------
933  else if (!strcmp(directive,"load")) {
934  cin >> arg; // input file name
935  if (the_tree != NULL) { // tree exists already
936  delete the_tree; // get rid of it
937  }
938  if (data_pts != NULL) { // data points exist already
939  delete data_pts; // get rid of them
940  }
941 
942  ifstream in_dump_file(arg); // try to open file
943  if (!in_dump_file) {
944  cerr << "File name: " << arg << "\n";
945  Error("Cannot open file for loading", ANNabort);
946  }
947  // build tree by loading
948  the_tree = new ANNbd_tree(in_dump_file);
949 
950  dim = the_tree->theDim(); // new dimension
951  data_size = the_tree->nPoints(); // number of points
952  data_pts = the_tree->thePoints(); // new points
953 
954  valid_dirty = ANNtrue; // validation must be redone
955 
956  if (stats > SILENT) {
957  cout << "(Tree has been loaded from file " << arg << ")\n";
958  }
959  if (stats >= SHOW_STRUCT) { // print the tree
960  cout << " (Structure Contents:\n";
961  the_tree->Print(ANNfalse, cout);
962  cout << " )\n";
963  }
964  }
965  //----------------------------------------------------------------
966  // run_queries operation
967  // This section does all the query processing. It consists
968  // of the following subsections:
969  //
970  // ** input the argument (standard or priority) and output
971  // the header describing the essential information.
972  // ** allocate space for the results to be stored.
973  // ** run the queries by invoking the appropriate search
974  // procedure on the query points. Print nearest neighbor
975  // if requested.
976  // ** print final summaries
977  //
978  // The approach for processing multiple nearest neighbors is
979  // pretty crude. We allocate an array whose size is the
980  // product of the total number of queries times the number of
981  // nearest neighbors (k), and then use each k consecutive
982  // entries to store the results of each query.
983  //----------------------------------------------------------------
984  else if (!strcmp(directive,"run_queries")) {
985 
986  //------------------------------------------------------------
987  // Input arguments and print summary
988  //------------------------------------------------------------
989  enum {STANDARD, PRIORITY} method;
990 
991  cin >> arg; // input argument
992  if (!strcmp(arg, "standard")) {
993  method = STANDARD;
994  }
995  else if (!strcmp(arg, "priority")) {
996  method = PRIORITY;
997  }
998  else {
999  cerr << "Search type: " << arg << "\n";
1000  Error("Search type must be \"standard\" or \"priority\"",
1001  ANNabort);
1002  }
1003  if (data_pts == NULL || query_pts == NULL) {
1004  Error("Either data set and query set not constructed", ANNabort);
1005  }
1006  if (the_tree == NULL) {
1007  Error("No search tree built.", ANNabort);
1008  }
1009 
1010  //------------------------------------------------------------
1011  // Set up everything
1012  //------------------------------------------------------------
1013 
1014  #ifdef ANN_PERF // performance only
1015  annResetStats(data_size); // reset statistics
1016  #endif
1017 
1018  clock0 = clock(); // start time
1019  // deallocate existing storage
1020  if (apx_nn_idx != NULL) delete [] apx_nn_idx;
1021  if (apx_dists != NULL) delete [] apx_dists;
1022  if (apx_pts_in_range != NULL) delete [] apx_pts_in_range;
1023  // allocate apx answer storage
1026  apx_pts_in_range = new int[query_size];
1027 
1028  annMaxPtsVisit(max_pts_visit); // set max points to visit
1029 
1030  //------------------------------------------------------------
1031  // Run the queries
1032  //------------------------------------------------------------
1033  // pointers for current query
1034  ANNidxArray curr_nn_idx = apx_nn_idx;
1035  ANNdistArray curr_dists = apx_dists;
1036 
1037  for (int i = 0; i < query_size; i++) {
1038  #ifdef ANN_PERF
1039  annResetCounts(); // reset counters
1040  #endif
1041  apx_pts_in_range[i] = 0;
1042 
1043  if (radius_bound == 0) { // no radius bound
1044  if (method == STANDARD) {
1046  query_pts[i], // query point
1047  near_neigh, // number of near neighbors
1048  curr_nn_idx, // nearest neighbors (returned)
1049  curr_dists, // distance (returned)
1050  epsilon); // error bound
1051  }
1052  else if (method == PRIORITY) {
1054  query_pts[i], // query point
1055  near_neigh, // number of near neighbors
1056  curr_nn_idx, // nearest neighbors (returned)
1057  curr_dists, // distance (returned)
1058  epsilon); // error bound
1059  }
1060  else {
1061  Error("Internal error - invalid method", ANNabort);
1062  }
1063  }
1064  else { // use radius bound
1065  if (method != STANDARD) {
1066  Error("A nonzero radius bound assumes standard search",
1067  ANNwarn);
1068  }
1070  query_pts[i], // query point
1071  ANN_POW(radius_bound), // squared radius search bound
1072  near_neigh, // number of near neighbors
1073  curr_nn_idx, // nearest neighbors (returned)
1074  curr_dists, // distance (returned)
1075  epsilon); // error bound
1076  }
1077  curr_nn_idx += near_neigh; // increment current pointers
1078  curr_dists += near_neigh;
1079 
1080  #ifdef ANN_PERF
1081  annUpdateStats(); // update stats
1082  #endif
1083  }
1084 
1085  long query_time = clock() - clock0; // end of query time
1086 
1087  if (validate) { // validation requested
1088  if (valid_dirty) getTrueNN(); // get true near neighbors
1089  doValidation(); // validate
1090  }
1091 
1092  //------------------------------------------------------------
1093  // Print summaries
1094  //------------------------------------------------------------
1095 
1096  if (stats > SILENT) {
1097  cout << "[Run Queries:\n";
1098  cout << " query_size = " << query_size << "\n";
1099  cout << " dim = " << dim << "\n";
1100  cout << " search_method = " << arg << "\n";
1101  cout << " epsilon = " << epsilon << "\n";
1102  cout << " near_neigh = " << near_neigh << "\n";
1103  if (max_pts_visit != 0)
1104  cout << " max_pts_visit = " << max_pts_visit << "\n";
1105  if (radius_bound != 0)
1106  cout << " radius_bound = " << radius_bound << "\n";
1107  if (validate)
1108  cout << " true_nn = " << true_nn << "\n";
1109 
1110  if (stats >= EXEC_TIME) { // print exec time summary
1111  cout << " query_time = " <<
1112  double(query_time)/(query_size*CLOCKS_PER_SEC)
1113  << " sec/query";
1114  #ifdef ANN_PERF
1115  cout << " (biased by perf measurements)";
1116  #endif
1117  cout << "\n";
1118  }
1119 
1120  if (stats >= QUERY_STATS) { // output performance stats
1121  #ifdef ANN_PERF
1122  cout.flush();
1124  #else
1125  cout << " (Performance statistics unavailable.)\n";
1126  #endif
1127  }
1128 
1129  if (stats >= QUERY_RES) { // output results
1130  cout << " (Query Results:\n";
1131  cout << " Pt\tANN\tDist\n";
1132  curr_nn_idx = apx_nn_idx; // subarray pointers
1133  curr_dists = apx_dists;
1134  // output nearest neighbors
1135  for (int i = 0; i < query_size; i++) {
1136  cout << " " << setw(4) << i;
1137  for (int j = 0; j < near_neigh; j++) {
1138  // exit if no more neighbors
1139  if (curr_nn_idx[j] == ANN_NULL_IDX) {
1140  cout << "\t[no other pts in radius bound]\n";
1141  break;
1142  }
1143  else { // output point info
1144  cout << "\t" << curr_nn_idx[j]
1145  << "\t" << ANN_ROOT(curr_dists[j])
1146  << "\n";
1147  }
1148  }
1149  // output range count
1150  if (radius_bound != 0) {
1151  cout << " pts_in_radius_bound = "
1152  << apx_pts_in_range[i] << "\n";
1153  }
1154  // increment subarray pointers
1155  curr_nn_idx += near_neigh;
1156  curr_dists += near_neigh;
1157  }
1158  cout << " )\n";
1159  }
1160  cout << "]\n";
1161  }
1162  }
1163  //----------------------------------------------------------------
1164  // Unknown directive
1165  //----------------------------------------------------------------
1166  else {
1167  cerr << "Directive: " << directive << "\n";
1168  Error("Unknown directive", ANNabort);
1169  }
1170  }
1171  //--------------------------------------------------------------------
1172  // End of input loop (deallocate stuff that was allocated)
1173  //--------------------------------------------------------------------
1174  if (the_tree != NULL) delete the_tree;
1175  if (data_pts != NULL) annDeallocPts(data_pts);
1176  if (query_pts != NULL) annDeallocPts(query_pts);
1177  if (apx_nn_idx != NULL) delete [] apx_nn_idx;
1178  if (apx_dists != NULL) delete [] apx_dists;
1179  if (apx_pts_in_range != NULL) delete [] apx_pts_in_range;
1180 
1181  annClose(); // close ANN
1182 
1183  return EXIT_SUCCESS;
1184 }
1185 
1186 //------------------------------------------------------------------------
1187 // generatePts - call appropriate routine to generate points of a
1188 // given distribution.
1189 //------------------------------------------------------------------------
1190 
1192  ANNpointArray &pa, // point array (returned)
1193  int n, // number of points to generate
1194  PtType type, // point type
1195  ANNbool new_clust, // new cluster centers desired?
1196  ANNpointArray src, // source array (if distr=PLANTED)
1197  int n_src) // source size (if distr=PLANTED)
1198 {
1199  if (pa != NULL) annDeallocPts(pa); // get rid of any old points
1200  pa = annAllocPts(n, dim); // allocate point storage
1201 
1202  switch (distr) {
1203  case UNIFORM: // uniform over cube [-1,1]^d.
1204  annUniformPts(pa, n, dim);
1205  break;
1206  case GAUSS: // Gaussian with mean 0
1207  annGaussPts(pa, n, dim, std_dev);
1208  break;
1209  case LAPLACE: // Laplacian, mean 0 and var 1
1210  annLaplacePts(pa, n, dim);
1211  break;
1212  case CO_GAUSS: // correlated Gaussian
1213  annCoGaussPts(pa, n, dim, corr_coef);
1214  break;
1215  case CO_LAPLACE: // correlated Laplacian
1216  annCoLaplacePts(pa, n, dim, corr_coef);
1217  break;
1218  case CLUS_GAUSS: // clustered Gaussian
1219  annClusGaussPts(pa, n, dim, n_color, new_clust, std_dev);
1220  break;
1221  case CLUS_ORTH_FLATS: // clustered on orthog flats
1222  annClusOrthFlats(pa, n, dim, n_color, new_clust, std_dev, max_dim);
1223  break;
1224  case CLUS_ELLIPSOIDS: // clustered ellipsoids
1225  annClusEllipsoids(pa, n, dim, n_color, new_clust, std_dev,
1227  break;
1228  case PLANTED: // planted distribution
1229  annPlanted(pa, n, dim, src, n_src, std_dev);
1230  break;
1231  default:
1232  Error("INTERNAL ERROR: Unknown distribution", ANNabort);
1233  break;
1234  }
1235 
1236  if (stats > SILENT) {
1237  if(type == DATA) cout << "[Generating Data Points:\n";
1238  else cout << "[Generating Query Points:\n";
1239  cout << " number = " << n << "\n";
1240  cout << " dim = " << dim << "\n";
1241  cout << " distribution = " << distr_table[distr] << "\n";
1242  if (annIdum < 0)
1243  cout << " seed = " << annIdum << "\n";
1244  if (distr == GAUSS || distr == CLUS_GAUSS
1245  || distr == CLUS_ORTH_FLATS)
1246  cout << " std_dev = " << std_dev << "\n";
1247  if (distr == CLUS_ELLIPSOIDS) {
1248  cout << " std_dev = " << std_dev << " (small) \n";
1249  cout << " std_dev_lo = " << std_dev_lo << "\n";
1250  cout << " std_dev_hi = " << std_dev_hi << "\n";
1251  }
1252  if (distr == CO_GAUSS || distr == CO_LAPLACE)
1253  cout << " corr_coef = " << corr_coef << "\n";
1254  if (distr == CLUS_GAUSS || distr == CLUS_ORTH_FLATS
1255  || distr == CLUS_ELLIPSOIDS) {
1256  cout << " colors = " << n_color << "\n";
1257  if (new_clust)
1258  cout << " (cluster centers regenerated)\n";
1259  }
1260  if (distr == CLUS_ORTH_FLATS || distr == CLUS_ELLIPSOIDS) {
1261  cout << " max_dim = " << max_dim << "\n";
1262  }
1263  }
1264  // want to see points?
1265  if ((type == DATA && stats >= SHOW_PTS) ||
1266  (type == QUERY && stats >= QUERY_RES)) {
1267  if(type == DATA) cout << "(Data Points:\n";
1268  else cout << "(Query Points:\n";
1269  for (int i = 0; i < n; i++) {
1270  cout << " " << setw(4) << i << "\t";
1271  printPoint(pa[i], dim);
1272  cout << "\n";
1273  }
1274  cout << " )\n";
1275  }
1276  cout << "]\n";
1277 }
1278 
1279 //------------------------------------------------------------------------
1280 // readPts - read a collection of data or query points.
1281 //------------------------------------------------------------------------
1282 
1283 void readPts(
1284  ANNpointArray &pa, // point array (returned)
1285  int &n, // number of points
1286  char *file_nm, // file name
1287  PtType type) // point type (DATA, QUERY)
1288 {
1289  int i;
1290  //--------------------------------------------------------------------
1291  // Open input file and read points
1292  //--------------------------------------------------------------------
1293  ifstream in_file(file_nm); // try to open data file
1294  if (!in_file) {
1295  cerr << "File name: " << file_nm << "\n";
1296  Error("Cannot open input data/query file", ANNabort);
1297  }
1298  // allocate storage for points
1299  if (pa != NULL) annDeallocPts(pa); // get rid of old points
1300  pa = annAllocPts(n, dim);
1301 
1302  for (i = 0; i < n; i++) { // read the data
1303  if (!(in_file >> pa[i][0])) break;
1304  for (int d = 1; d < dim; d++) {
1305  in_file >> pa[i][d];
1306  }
1307  }
1308 
1309  char ignore_me; // character for EOF test
1310  in_file >> ignore_me; // try to get one more character
1311  if (!in_file.eof()) { // exhausted space before eof
1312  if (type == DATA)
1313  Error("`data_size' too small. Input file truncated.", ANNwarn);
1314  else
1315  Error("`query_size' too small. Input file truncated.", ANNwarn);
1316  }
1317  n = i; // number of points read
1318 
1319  //--------------------------------------------------------------------
1320  // Print summary
1321  //--------------------------------------------------------------------
1322  if (stats > SILENT) {
1323  if (type == DATA) {
1324  cout << "[Read Data Points:\n";
1325  cout << " data_size = " << n << "\n";
1326  }
1327  else {
1328  cout << "[Read Query Points:\n";
1329  cout << " query_size = " << n << "\n";
1330  }
1331  cout << " file_name = " << file_nm << "\n";
1332  cout << " dim = " << dim << "\n";
1333  // print if results requested
1334  if ((type == DATA && stats >= SHOW_PTS) ||
1335  (type == QUERY && stats >= QUERY_RES)) {
1336  cout << " (Points:\n";
1337  for (i = 0; i < n; i++) {
1338  cout << " " << i << "\t";
1339  printPoint(pa[i], dim);
1340  cout << "\n";
1341  }
1342  cout << " )\n";
1343  }
1344  cout << "]\n";
1345  }
1346 }
1347 
1348 //------------------------------------------------------------------------
1349 // getTrueNN
1350 // Computes the true nearest neighbors. For purposes of validation,
1351 // this intentionally done in a rather dumb (but safe way), by
1352 // invoking the brute-force search.
1353 //
1354 // The number of true nearest neighbors is somewhat larger than
1355 // the number of nearest neighbors. This is so that the validation
1356 // can determine the expected difference in element ranks.
1357 //
1358 // This procedure is invoked just prior to running queries. Since
1359 // the operation takes a long time, it is performed only if needed.
1360 // In particular, once generated, it will be regenerated only if
1361 // new query or data points are generated, or if the requested number
1362 // of true near neighbors or approximate near neighbors has changed.
1363 //
1364 // To validate fixed-radius searching, we compute two counts, one
1365 // with the original query radius (trueSqRadius) and the other with
1366 // a radius shrunken by the error factor (minSqradius). We then
1367 // check that the count of points inside the approximate range is
1368 // between these two bounds. Because fixed-radius search is
1369 // allowed to ignore points within the shrunken radius, we only
1370 // compute exact neighbors within this smaller distance (for we
1371 // cannot guarantee that we will even visit the other points).
1372 //------------------------------------------------------------------------
1373 
1374 void getTrueNN() // compute true nearest neighbors
1375 {
1376  if (stats > SILENT) {
1377  cout << "(Computing true nearest neighbors for validation. This may take time.)\n";
1378  }
1379  // deallocate existing storage
1380  if (true_nn_idx != NULL) delete [] true_nn_idx;
1381  if (true_dists != NULL) delete [] true_dists;
1382  if (min_pts_in_range != NULL) delete [] min_pts_in_range;
1383  if (max_pts_in_range != NULL) delete [] max_pts_in_range;
1384 
1385  if (true_nn > data_size) { // can't get more nn than points
1386  true_nn = data_size;
1387  }
1388 
1389  // allocate true answer storage
1392  min_pts_in_range = new int[query_size];
1393  max_pts_in_range = new int[query_size];
1394 
1395  ANNidxArray curr_nn_idx = true_nn_idx; // current locations in arrays
1396  ANNdistArray curr_dists = true_dists;
1397 
1398  // allocate search structure
1399  ANNbruteForce *the_brute = new ANNbruteForce(data_pts, data_size, dim);
1400  // compute nearest neighbors
1401  for (int i = 0; i < query_size; i++) {
1402  if (radius_bound == 0) { // standard kNN search
1403  the_brute->annkSearch( // compute true near neighbors
1404  query_pts[i], // query point
1405  true_nn, // number of nearest neighbors
1406  curr_nn_idx, // where to put indices
1407  curr_dists); // where to put distances
1408  }
1409  else { // fixed radius kNN search
1410  // search radii limits
1411  ANNdist trueSqRadius = ANN_POW(radius_bound);
1412  ANNdist minSqRadius = ANN_POW(radius_bound / (1+epsilon));
1413  min_pts_in_range[i] = the_brute->annkFRSearch(
1414  query_pts[i], // query point
1415  minSqRadius, // shrunken search radius
1416  true_nn, // number of near neighbors
1417  curr_nn_idx, // nearest neighbors (returned)
1418  curr_dists); // distance (returned)
1419  max_pts_in_range[i] = the_brute->annkFRSearch(
1420  query_pts[i], // query point
1421  trueSqRadius, // true search radius
1422  0, NULL, NULL); // (ignore kNN info)
1423  }
1424  curr_nn_idx += true_nn; // increment nn index pointer
1425  curr_dists += true_nn; // increment nn dist pointer
1426  }
1427  delete the_brute; // delete brute-force struct
1428  valid_dirty = ANNfalse; // validation good for now
1429 }
1430 
1431 //------------------------------------------------------------------------
1432 // doValidation
1433 // Compares the approximate answers to the k-nearest neighbors
1434 // against the true nearest neighbors (computed earlier). It is
1435 // assumed that the true nearest neighbors and indices have been
1436 // computed earlier.
1437 //
1438 // First, we check that all the results are within their allowed
1439 // limits, and generate an internal error, if not. For the sake of
1440 // performance evaluation, we also compute the following two
1441 // quantities for nearest neighbors:
1442 //
1443 // Average Error
1444 // -------------
1445 // The relative error between the distance to a reported nearest
1446 // neighbor and the true nearest neighbor (of the same rank),
1447 //
1448 // Rank Error
1449 // ----------
1450 // The difference in rank between the reported nearest neighbor and
1451 // its position (if any) among the true nearest neighbors. If we
1452 // cannot find this point among the true nearest neighbors, then
1453 // it assumed that the rank of the true nearest neighbor is true_nn+1.
1454 //
1455 // Because of the possibility of duplicate distances, this is computed
1456 // as follows. For the j-th reported nearest neighbor, we count the
1457 // number of true nearest neighbors that are at least this close. Let
1458 // this be rnk. Then the rank error is max(0, j-rnk). (In the code
1459 // below, j is an array index and so the first item is 0, not 1. Thus
1460 // we take max(0, j+1-rnk) instead.)
1461 //
1462 // For the results of fixed-radious range count, we verify that the
1463 // reported number of points in the range lies between the actual
1464 // number of points in the shrunken and the true search radius.
1465 //------------------------------------------------------------------------
1466 
1467 void doValidation() // perform validation
1468 {
1469  int* curr_apx_idx = apx_nn_idx; // approx index pointer
1470  ANNdistArray curr_apx_dst = apx_dists; // approx distance pointer
1471  int* curr_tru_idx = true_nn_idx; // true index pointer
1472  ANNdistArray curr_tru_dst = true_dists; // true distance pointer
1473  int i, j;
1474 
1475  if (true_nn < near_neigh) {
1476  Error("Cannot validate with fewer true near neighbors than actual", ANNabort);
1477  }
1478 
1479  for (i = 0; i < query_size; i++) { // validate each query
1480  //----------------------------------------------------------------
1481  // Compute result errors
1482  // In fixed radius search it is possible that not all k
1483  // nearest neighbors were computed. Because the true
1484  // results are computed over the shrunken radius, we should
1485  // have at least as many true nearest neighbors as
1486  // approximate nearest neighbors. (If not, an infinite
1487  // error will be generated, and so an internal error will
1488  // will be generated.
1489  //
1490  // Because nearest neighbors are sorted in increasing order
1491  // of distance, as soon as we see a null index, we can
1492  // terminate the distance checking. The error in the
1493  // result should not exceed epsilon. However, if
1494  // max_pts_visit is nonzero (meaning that the search is
1495  // terminated early) this might happen.
1496  //----------------------------------------------------------------
1497  for (j = 0; j < near_neigh; j++) {
1498  if (curr_tru_idx[j] == ANN_NULL_IDX)// no more true neighbors?
1499  break;
1500  // true i-th smallest distance
1501  double true_dist = ANN_ROOT(curr_tru_dst[j]);
1502  // reported i-th smallest
1503  double rept_dist = ANN_ROOT(curr_apx_dst[j]);
1504  // better than optimum?
1505  if (rept_dist < true_dist*(1-ERR)) {
1506  Error("INTERNAL ERROR: True nearest neighbor incorrect",
1507  ANNabort);
1508  }
1509 
1510  double resultErr; // result error
1511  if (true_dist == 0.0) { // let's not divide by zero
1512  if (rept_dist != 0.0) resultErr = ANN_DBL_MAX;
1513  else resultErr = 0.0;
1514  }
1515  else {
1516  resultErr = (rept_dist - true_dist) / ((double) true_dist);
1517  }
1518 
1519  if (resultErr > epsilon + RND_OFF && max_pts_visit == 0) {
1520  Error("INTERNAL ERROR: Actual error exceeds epsilon",
1521  ANNabort);
1522  }
1523  #ifdef ANN_PERF
1524  ann_average_err += resultErr; // update statistics error
1525  #endif
1526  }
1527  //--------------------------------------------------------------------
1528  // Compute rank errors (only needed for perf measurements)
1529  //--------------------------------------------------------------------
1530  #ifdef ANN_PERF
1531  for (j = 0; j < near_neigh; j++) {
1532  if (curr_tru_idx[i] == ANN_NULL_IDX) // no more true neighbors?
1533  break;
1534 
1535  double rnkErr = 0.0; // rank error
1536  // reported j-th distance
1537  ANNdist rept_dist = curr_apx_dst[j];
1538  int rnk = 0; // compute rank of this item
1539  while (rnk < true_nn && curr_tru_dst[rnk] <= rept_dist)
1540  rnk++;
1541  if (j+1-rnk > 0) rnkErr = (double) (j+1-rnk);
1542  ann_rank_err += rnkErr; // update average rank error
1543  }
1544  #endif
1545  //----------------------------------------------------------------
1546  // Check range counts from fixed-radius query
1547  //----------------------------------------------------------------
1548  if (radius_bound != 0) { // fixed-radius search
1549  if (apx_pts_in_range[i] < min_pts_in_range[i] ||
1551  Error("INTERNAL ERROR: Invalid fixed-radius range count",
1552  ANNabort);
1553  }
1554 
1555  curr_apx_idx += near_neigh;
1556  curr_apx_dst += near_neigh;
1557  curr_tru_idx += true_nn; // increment current pointers
1558  curr_tru_dst += true_nn;
1559  }
1560 }
1561 
1562 //----------------------------------------------------------------------
1563 // treeStats
1564 // Computes a number of statistics related to kd_trees and
1565 // bd_trees. These statistics are printed if in verbose mode,
1566 // and otherwise they are only printed if they are deemed to
1567 // be outside of reasonable operating bounds.
1568 //----------------------------------------------------------------------
1569 
1570 #define log2(x) (log(x)/log(2.0)) // log base 2
1571 
1573  ostream &out, // output stream
1574  ANNbool verbose) // print stats
1575 {
1576  const int MIN_PTS = 20; // min no. pts for checking
1577  const float MAX_FRAC_TL = 0.50; // max frac of triv leaves
1578  const float MAX_AVG_AR = 20; // max average aspect ratio
1579 
1580  ANNkdStats st; // statistics structure
1581 
1582  the_tree->getStats(st); // get statistics
1583  // total number of nodes
1584  int n_nodes = st.n_lf + st.n_spl + st.n_shr;
1585  // should be O(n/bs)
1586  int opt_n_nodes = (int) (2*(float(st.n_pts)/st.bkt_size));
1587  int too_many_nodes = 10*opt_n_nodes;
1588  if (st.n_pts >= MIN_PTS && n_nodes > too_many_nodes) {
1589  out << "-----------------------------------------------------------\n";
1590  out << "Warning: The tree has more than 10x as many nodes as points.\n";
1591  out << "You may want to consider a different split or shrink method.\n";
1592  out << "-----------------------------------------------------------\n";
1593  verbose = ANNtrue;
1594  }
1595  // fraction of trivial leaves
1596  float frac_tl = (st.n_lf == 0 ? 0 : ((float) st.n_tl)/ st.n_lf);
1597  if (st.n_pts >= MIN_PTS && frac_tl > MAX_FRAC_TL) {
1598  out << "-----------------------------------------------------------\n";
1599  out << "Warning: A significant fraction of leaves contain no points.\n";
1600  out << "You may want to consider a different split or shrink method.\n";
1601  out << "-----------------------------------------------------------\n";
1602  verbose = ANNtrue;
1603  }
1604  // depth should be O(dim*log n)
1605  int too_many_levels = (int) (2.0 * st.dim * log2((double) st.n_pts));
1606  int opt_levels = (int) log2(double(st.n_pts)/st.bkt_size);
1607  if (st.n_pts >= MIN_PTS && st.depth > too_many_levels) {
1608  out << "-----------------------------------------------------------\n";
1609  out << "Warning: The tree is more than 2x as deep as (dim*log n).\n";
1610  out << "You may want to consider a different split or shrink method.\n";
1611  out << "-----------------------------------------------------------\n";
1612  verbose = ANNtrue;
1613  }
1614  // average leaf aspect ratio
1615  if (st.n_pts >= MIN_PTS && st.avg_ar > MAX_AVG_AR) {
1616  out << "-----------------------------------------------------------\n";
1617  out << "Warning: Average aspect ratio of cells is quite large.\n";
1618  out << "This may slow queries depending on the point distribution.\n";
1619  out << "-----------------------------------------------------------\n";
1620  verbose = ANNtrue;
1621  }
1622 
1623  //------------------------------------------------------------------
1624  // Print summaries if requested
1625  //------------------------------------------------------------------
1626  if (verbose) { // output statistics
1627  out << " (Structure Statistics:\n";
1628  out << " n_nodes = " << n_nodes
1629  << " (opt = " << opt_n_nodes
1630  << ", best if < " << too_many_nodes << ")\n"
1631  << " n_leaves = " << st.n_lf
1632  << " (" << st.n_tl << " contain no points)\n"
1633  << " n_splits = " << st.n_spl << "\n"
1634  << " n_shrinks = " << st.n_shr << "\n";
1635  out << " empty_leaves = " << frac_tl*100
1636  << " percent (best if < " << MAX_FRAC_TL*100 << " percent)\n";
1637  out << " depth = " << st.depth
1638  << " (opt = " << opt_levels
1639  << ", best if < " << too_many_levels << ")\n";
1640  out << " avg_aspect_ratio = " << st.avg_ar
1641  << " (best if < " << MAX_AVG_AR << ")\n";
1642  out << " )\n";
1643  }
1644 }
int * apx_pts_in_range
Definition: ann_test.cpp:538
ANNpointArray data_pts
Definition: ann_test.cpp:533
int query_size
Definition: ann_test.cpp:474
const char shrink_table[N_SHRINK_RULES][STRING_LEN]
Definition: ann_test.cpp:363
int * max_pts_in_range
Definition: ann_test.cpp:542
int nPoints()
Definition: ANN.h:766
DLL_API void annPrintStats(ANNbool validate)
Definition: perf.cpp:116
const int def_max_visit
Definition: ann_test.cpp:455
ANNbool
Definition: ANN.h:132
const int def_seed
Definition: ann_test.cpp:459
void annClusGaussPts(ANNpointArray pa, int n, int dim, int n_clus, ANNbool new_clust, double std_dev)
Definition: rand.cpp:314
int n_pts
Definition: ANNperf.h:48
and that you are informed that you can do these things To protect your we need to make restrictions that forbid distributors to deny you these rights or to ask you to surrender these rights These restrictions translate to certain responsibilities for you if you distribute copies of the library or if you modify it For if you distribute copies of the whether gratis or for a you must give the recipients all the rights that we gave you You must make sure that receive or can get the source code If you link other code with the you must provide complete object files to the so that they can relink them with the library after making changes to the library and recompiling it And you must show them these terms so they know their rights We protect your rights with a two step method
Definition: License.txt:45
const int STRING_LEN
Definition: ann2fig.cpp:56
const double def_epsilon
Definition: ann_test.cpp:453
virtual void Print(ANNbool with_pts, std::ostream &out)
Definition: kd_tree.cpp:104
#define log2(x)
Definition: ann_test.cpp:1570
ANNdistArray apx_dists
Definition: ann_test.cpp:537
const double RND_OFF
Definition: ann_test.cpp:285
Definition: ANNx.h:48
void printPoint(ANNpoint p, int dim)
Definition: ann_test.cpp:389
virtual void getStats(ANNkdStats &st)
Definition: kd_tree.cpp:191
void doValidation()
Definition: ann_test.cpp:1467
int depth
Definition: ANNperf.h:54
double corr_coef
Definition: ann_test.cpp:479
int bucket_size
Definition: ann_test.cpp:483
DLL_API void annResetStats(int data_size)
Definition: perf.cpp:71
double epsilon
Definition: ann_test.cpp:484
int annkFRSearch(ANNpoint q, ANNdist sqRad, int k, ANNidxArray nn_idx=NULL, ANNdistArray dd=NULL, double eps=0.0)
const ANNshrinkRule def_shrink
Definition: ann_test.cpp:466
const int def_true_nn
Definition: ann_test.cpp:458
DLL_API ANNsampStat ann_average_err
Definition: perf.cpp:64
DLL_API void annMaxPtsVisit(int maxPts)
Definition: ANN.cpp:197
const double def_corr_coef
Definition: ann_test.cpp:451
StatLev
Definition: ann_test.cpp:297
void annkSearch(ANNpoint q, int k, ANNidxArray nn_idx, ANNdistArray dd, double eps=0.0)
Definition: kd_search.cpp:89
ANNbool validate
Definition: ann_test.cpp:489
const ANNbool def_validate
Definition: ann_test.cpp:460
int data_size
Definition: ann_test.cpp:473
int n_tl
Definition: ANNperf.h:51
const ANNsplitRule def_split
Definition: ann_test.cpp:464
ANNdistArray true_dists
Definition: ann_test.cpp:540
const ANNidx ANN_NULL_IDX
Definition: ANN.h:176
int n_color
Definition: ann_test.cpp:475
ANNbool valid_dirty
Definition: ann_test.cpp:544
ANNerr
Definition: ANNx.h:48
ANNpointArray query_pts
Definition: ann_test.cpp:534
Definition: ANNx.h:48
DLL_API void annUpdateStats()
Definition: perf.cpp:95
ANNbd_tree * the_tree
Definition: ann_test.cpp:535
const StatLev def_stats
Definition: ann_test.cpp:462
#define ANNlatestRev
Definition: ANN.h:124
int annkFRSearch(ANNpoint q, ANNdist sqRad, int k=0, ANNidxArray nn_idx=NULL, ANNdistArray dd=NULL, double eps=0.0)
Definition: brute.cpp:80
const int def_bucket_size
Definition: ann_test.cpp:452
double std_dev_lo
Definition: ann_test.cpp:481
DLL_API void annDeallocPts(ANNpointArray &pa)
Definition: ANN.cpp:133
void generatePts(ANNpointArray &pa, int n, PtType type, ANNbool new_clust, ANNpointArray src=NULL, int n_src=0)
Definition: ann_test.cpp:1191
Distrib distr
Definition: ann_test.cpp:478
#define ANN_ROOT(x)
Definition: ANN.h:361
ANNdist * ANNdistArray
Definition: ANN.h:377
DLL_API void annClose()
Definition: kd_tree.cpp:221
void annkPriSearch(ANNpoint q, int k, ANNidxArray nn_idx, ANNdistArray dd, double eps=0.0)
#define ANNcopyright
Definition: ANN.h:123
double radius_bound
Definition: ann_test.cpp:487
main(int argc, char **argv)
Definition: ann2fig.cpp:583
const int def_rad_bound
Definition: ann_test.cpp:456
ANNshrinkRule
Definition: ANN.h:605
StatLev stats
Definition: ann_test.cpp:490
#define ANNversion
Definition: ANN.h:121
ANNbool getDirective(istream &in, char *directive)
Definition: ann_test.cpp:611
void annPlanted(ANNpointArray pa, int n, int dim, ANNpointArray src, int n_src, double std_dev)
Definition: rand.cpp:580
Distrib
Definition: ann_test.cpp:321
ANNpoint * ANNpointArray
Definition: ANN.h:376
int dim
Definition: ANNperf.h:47
const int def_data_size
Definition: ann_test.cpp:444
const int def_near_neigh
Definition: ann_test.cpp:454
int n_spl
Definition: ANNperf.h:52
void treeStats(ostream &out, ANNbool verbose)
Definition: ann_test.cpp:1572
ANNidxArray apx_nn_idx
Definition: ann_test.cpp:536
ANNsplitRule split
Definition: ann_test.cpp:491
const int def_query_size
Definition: ann_test.cpp:445
int ANNidx
Definition: ANN.h:175
void readPts(ANNpointArray &pa, int &n, char *file_nm, PtType type)
Definition: ann_test.cpp:1283
const int extra_nn
Definition: ann_test.cpp:441
void annClusEllipsoids(ANNpointArray pa, int n, int dim, int n_clus, ANNbool new_clust, double std_dev_small, double std_dev_lo, double std_dev_hi, int max_dim)
Definition: rand.cpp:503
void annLaplacePts(ANNpointArray pa, int n, int dim)
Definition: rand.cpp:232
ANNshrinkRule shrink
Definition: ann_test.cpp:492
int n_shr
Definition: ANNperf.h:53
void getTrueNN()
Definition: ann_test.cpp:1374
int true_nn
Definition: ann_test.cpp:488
void annGaussPts(ANNpointArray pa, int n, int dim, double std_dev)
Definition: rand.cpp:214
const char split_table[N_SPLIT_RULES][STRING_LEN]
Definition: ann_test.cpp:350
void Error(const char *msg, ANNerr level)
Definition: ann2fig.cpp:97
const double def_std_dev
Definition: ann_test.cpp:450
ANNidxArray true_nn_idx
Definition: ann_test.cpp:539
const char distr_table[N_DISTRIBS][STRING_LEN]
Definition: ann_test.cpp:334
#define CLOCKS_PER_SEC
Definition: ann_test.cpp:52
ANNsplitRule
Definition: ANN.h:596
int bkt_size
Definition: ANNperf.h:49
int annIdum
Definition: rand.cpp:42
void annCoGaussPts(ANNpointArray pa, int n, int dim, double correlation)
Definition: rand.cpp:250
const int def_n_color
Definition: ann_test.cpp:446
int theDim()
Definition: ANN.h:763
PtType
Definition: ann_test.cpp:291
float avg_ar
Definition: ANNperf.h:56
const Distrib def_distr
Definition: ann_test.cpp:449
const int N_SPLIT_RULES
Definition: ann_test.cpp:349
const int def_dim
Definition: ann_test.cpp:443
int near_neigh
Definition: ann_test.cpp:485
const ANNbool def_new_clust
Definition: ann_test.cpp:447
ANNcoord * ANNpoint
Definition: ANN.h:375
ANNpointArray thePoints()
Definition: ANN.h:769
double ANNdist
Definition: ANN.h:159
ANNbool skipComment(istream &in)
Definition: ann_test.cpp:594
int max_dim
Definition: ann_test.cpp:477
int dim
Definition: ann2fig.cpp:81
Definition: ANN.h:132
ANNbool new_clust
Definition: ann_test.cpp:476
const char stat_table[N_STAT_LEVELS][STRING_LEN]
Definition: ann_test.cpp:308
int lookUp(const char *arg, const char(*table)[STRING_LEN], int size)
Definition: ann_test.cpp:401
DLL_API ANNsampStat ann_rank_err
Definition: perf.cpp:65
double std_dev
Definition: ann_test.cpp:480
int n_lf
Definition: ANNperf.h:50
void annUniformPts(ANNpointArray pa, int n, int dim)
Definition: rand.cpp:196
const double ERR
Definition: kd_split.cpp:34
DLL_API ANNpointArray annAllocPts(int n, int dim)
Definition: ANN.cpp:117
#define ANNversionCmt
Definition: ANN.h:122
void initGlobals()
Definition: ann_test.cpp:550
const int def_max_dim
Definition: ann_test.cpp:448
const int N_SHRINK_RULES
Definition: ann_test.cpp:362
virtual void Dump(ANNbool with_pts, std::ostream &out)
Definition: kd_dump.cpp:102
int max_pts_visit
Definition: ann_test.cpp:486
#define ANN_POW(v)
Definition: ANN.h:360
DLL_API void annResetCounts()
Definition: perf.cpp:85
and distribute a copy of this License along with the Library You may charge a fee for the physical act of transferring a and you may at your option offer warranty protection in exchange for a fee You may modify your copy or copies of the Library or any portion of thus forming a work based on the and copy and distribute such modifications or work under the terms of Section provided that you also meet all of these other than as an argument passed when the facility is then you must make a good faith effort to ensure in the event an application does not supply such function or table
Definition: License.txt:165
double std_dev_hi
Definition: ann_test.cpp:482
void annkSearch(ANNpoint q, int k, ANNidxArray nn_idx, ANNdistArray dd, double eps=0.0)
Definition: brute.cpp:54
Definition: ANN.h:132
void annCoLaplacePts(ANNpointArray pa, int n, int dim, double correlation)
Definition: rand.cpp:273
ANNidx * ANNidxArray
Definition: ANN.h:378
void annClusOrthFlats(ANNpointArray pa, int n, int dim, int n_clus, ANNbool new_clust, double std_dev, int max_dim)
Definition: rand.cpp:408
const double ANN_DBL_MAX
Definition: ANN.h:118
int * min_pts_in_range
Definition: ann_test.cpp:541

Generated on Thu Jun 11 2015 13:52:31 for queso-0.53.0 by  doxygen 1.8.5