queso-0.53.0
kd_split.cpp
Go to the documentation of this file.
1 //----------------------------------------------------------------------
2 // File: kd_split.cpp
3 // Programmer: Sunil Arya and David Mount
4 // Description: Methods for splitting kd-trees
5 // Last modified: 01/04/05 (Version 1.0)
6 //----------------------------------------------------------------------
7 // Copyright (c) 1997-2005 University of Maryland and Sunil Arya and
8 // David Mount. All Rights Reserved.
9 //
10 // This software and related documentation is part of the Approximate
11 // Nearest Neighbor Library (ANN). This software is provided under
12 // the provisions of the Lesser GNU Public License (LGPL). See the
13 // file ../ReadMe.txt for further information.
14 //
15 // The University of Maryland (U.M.) and the authors make no
16 // representations about the suitability or fitness of this software for
17 // any purpose. It is provided "as is" without express or implied
18 // warranty.
19 //----------------------------------------------------------------------
20 // History:
21 // Revision 0.1 03/04/98
22 // Initial release
23 // Revision 1.0 04/01/05
24 //----------------------------------------------------------------------
25 
26 #include "kd_tree.h" // kd-tree definitions
27 #include "kd_util.h" // kd-tree utilities
28 #include "kd_split.h" // splitting functions
29 
30 //----------------------------------------------------------------------
31 // Constants
32 //----------------------------------------------------------------------
33 
34 const double ERR = 0.001; // relative slack: a side counts as "longest" if >= (1-ERR)*max_length
35 const double FS_ASPECT_RATIO = 3.0; // maximum allowed aspect ratio (longest/shortest side)
36  // in fair split and sliding fair split. Must be >= 2.
37 
38 //----------------------------------------------------------------------
39 // kd_split - Bentley's standard splitting routine for kd-trees
40 // Find the dimension of the greatest spread, and split
41 // just before the median point along this dimension.
42 //----------------------------------------------------------------------
43 
44 void kd_split(
45  ANNpointArray pa, // point array (permuted on return)
46  ANNidxArray pidx, // point indices
47  const ANNorthRect &bnds, // bounding rectangle for cell
48  int n, // number of points
49  int dim, // dimension of space
50  int &cut_dim, // cutting dimension (returned)
51  ANNcoord &cut_val, // cutting value (returned)
52  int &n_lo) // num of points on low side (returned)
53 {
54  // find dimension of maximum spread
55  cut_dim = annMaxSpread(pa, pidx, n, dim);
56  n_lo = n/2; // median rank
57  // split about median
58  annMedianSplit(pa, pidx, n, cut_dim, cut_val, n_lo);
59 }
60 
61 //----------------------------------------------------------------------
62 // midpt_split - midpoint splitting rule for box-decomposition trees
63 //
64 // This is the simplest splitting rule that guarantees boxes
65 // of bounded aspect ratio. It simply cuts the box with the
66 // longest side through its midpoint. If there are ties, it
67 // selects the dimension with the maximum point spread.
68 //
69 // WARNING: This routine (while simple) doesn't seem to work
70 // well in practice in high dimensions, because it tends to
71 // generate a large number of trivial and/or unbalanced splits.
72 // Either kd_split(), sl_midpt_split(), or fair_split() are
73 // recommended, instead.
74 //----------------------------------------------------------------------
75 
77  ANNpointArray pa, // point array
78  ANNidxArray pidx, // point indices (permuted on return)
79  const ANNorthRect &bnds, // bounding rectangle for cell
80  int n, // number of points
81  int dim, // dimension of space
82  int &cut_dim, // cutting dimension (returned)
83  ANNcoord &cut_val, // cutting value (returned)
84  int &n_lo) // num of points on low side (returned)
85 {
86  int d;
87 
88  ANNcoord max_length = bnds.hi[0] - bnds.lo[0];
89  for (d = 1; d < dim; d++) { // find length of longest box side
90  ANNcoord length = bnds.hi[d] - bnds.lo[d];
91  if (length > max_length) {
92  max_length = length;
93  }
94  }
95  ANNcoord max_spread = -1; // find long side with most spread
96  for (d = 0; d < dim; d++) {
97  // is it among longest?
98  if (double(bnds.hi[d] - bnds.lo[d]) >= (1-ERR)*max_length) {
99  // compute its spread
100  ANNcoord spr = annSpread(pa, pidx, n, d);
101  if (spr > max_spread) { // is it max so far?
102  max_spread = spr;
103  cut_dim = d;
104  }
105  }
106  }
107  // split along cut_dim at midpoint
108  cut_val = (bnds.lo[cut_dim] + bnds.hi[cut_dim]) / 2;
109  // permute points accordingly
110  int br1, br2;
111  annPlaneSplit(pa, pidx, n, cut_dim, cut_val, br1, br2);
112  //------------------------------------------------------------------
113  // On return: pa[0..br1-1] < cut_val
114  // pa[br1..br2-1] == cut_val
115  // pa[br2..n-1] > cut_val
116  //
117  // We can set n_lo to any value in the range [br1..br2].
118  // We choose split so that points are most evenly divided.
119  //------------------------------------------------------------------
120  if (br1 > n/2) n_lo = br1;
121  else if (br2 < n/2) n_lo = br2;
122  else n_lo = n/2;
123 }
124 
125 //----------------------------------------------------------------------
126 // sl_midpt_split - sliding midpoint splitting rule
127 //
128 // This is a modification of midpt_split, which has the nonsensical
129 // name "sliding midpoint". The idea is that we try to use the
130 // midpoint rule, by bisecting the longest side. If there are
131 // ties, the dimension with the maximum spread is selected. If,
132 // however, the midpoint split produces a trivial split (no points
133 // on one side of the splitting plane) then we slide the splitting
134 // (maintaining its orientation) until it produces a nontrivial
135 // split. For example, if the splitting plane is along the x-axis,
136 // and all the data points have x-coordinate less than the x-bisector,
137 // then the split is taken along the maximum x-coordinate of the
138 // data points.
139 //
140 // Intuitively, this rule cannot generate trivial splits, and
141 // hence avoids midpt_split's tendency to produce trees with
142 // a very large number of nodes.
143 //
144 //----------------------------------------------------------------------
145 
147  ANNpointArray pa, // point array
148  ANNidxArray pidx, // point indices (permuted on return)
149  const ANNorthRect &bnds, // bounding rectangle for cell
150  int n, // number of points
151  int dim, // dimension of space
152  int &cut_dim, // cutting dimension (returned)
153  ANNcoord &cut_val, // cutting value (returned)
154  int &n_lo) // num of points on low side (returned)
155 {
156  int d;
157 
158  ANNcoord max_length = bnds.hi[0] - bnds.lo[0];
159  for (d = 1; d < dim; d++) { // find length of longest box side
160  ANNcoord length = bnds.hi[d] - bnds.lo[d];
161  if (length > max_length) {
162  max_length = length;
163  }
164  }
165  ANNcoord max_spread = -1; // find long side with most spread
166  for (d = 0; d < dim; d++) {
167  // is it among longest?
168  if ((bnds.hi[d] - bnds.lo[d]) >= (1-ERR)*max_length) {
169  // compute its spread
170  ANNcoord spr = annSpread(pa, pidx, n, d);
171  if (spr > max_spread) { // is it max so far?
172  max_spread = spr;
173  cut_dim = d;
174  }
175  }
176  }
177  // ideal split at midpoint
178  ANNcoord ideal_cut_val = (bnds.lo[cut_dim] + bnds.hi[cut_dim])/2;
179 
180  ANNcoord min, max;
181  annMinMax(pa, pidx, n, cut_dim, min, max); // find min/max coordinates
182 
183  if (ideal_cut_val < min) // slide to min or max as needed
184  cut_val = min;
185  else if (ideal_cut_val > max)
186  cut_val = max;
187  else
188  cut_val = ideal_cut_val;
189 
190  // permute points accordingly
191  int br1, br2;
192  annPlaneSplit(pa, pidx, n, cut_dim, cut_val, br1, br2);
193  //------------------------------------------------------------------
194  // On return: pa[0..br1-1] < cut_val
195  // pa[br1..br2-1] == cut_val
196  // pa[br2..n-1] > cut_val
197  //
198  // We can set n_lo to any value in the range [br1..br2] to satisfy
199  // the exit conditions of the procedure.
200  //
201  // if ideal_cut_val < min (implying br2 >= 1),
202  // then we select n_lo = 1 (so there is one point on left) and
203  // if ideal_cut_val > max (implying br1 <= n-1),
204  // then we select n_lo = n-1 (so there is one point on right).
205  // Otherwise, we select n_lo as close to n/2 as possible within
206  // [br1..br2].
207  //------------------------------------------------------------------
208  if (ideal_cut_val < min) n_lo = 1;
209  else if (ideal_cut_val > max) n_lo = n-1;
210  else if (br1 > n/2) n_lo = br1;
211  else if (br2 < n/2) n_lo = br2;
212  else n_lo = n/2;
213 }
214 
215 //----------------------------------------------------------------------
216 // fair_split - fair-split splitting rule
217 //
218 // This is a compromise between the kd-tree splitting rule (which
219 // always splits data points at their median) and the midpoint
220 // splitting rule (which always splits a box through its center).
221 // The goal of this procedure is to achieve both nicely balanced
222 // splits, and boxes of bounded aspect ratio.
223 //
224 // A constant FS_ASPECT_RATIO is defined. Given a box, those sides
225 // which can be split so that the ratio of the longest to shortest
226 // side does not exceed FS_ASPECT_RATIO are identified. Among these
227 // sides, we select the one in which the points have the largest
228 // spread. We then split the points in a manner which most evenly
229 // distributes the points on either side of the splitting plane,
230 // subject to maintaining the bound on the ratio of long to short
231 // sides. To determine that the aspect ratio will be preserved,
232 // we determine the longest side (other than this side), and
233 // determine how narrowly we can cut this side, without causing the
234 // aspect ratio bound to be exceeded (small_piece).
235 //
236 // This procedure is more robust than either kd_split or midpt_split,
237 // but is more complicated as well. When point distribution is
238 // extremely skewed, this degenerates to midpt_split (actually
239 // 1/3 point split), and when the points are most evenly distributed,
240 // this degenerates to kd-split.
241 //----------------------------------------------------------------------
242 
244  ANNpointArray pa, // point array
245  ANNidxArray pidx, // point indices (permuted on return)
246  const ANNorthRect &bnds, // bounding rectangle for cell
247  int n, // number of points
248  int dim, // dimension of space
249  int &cut_dim, // cutting dimension (returned)
250  ANNcoord &cut_val, // cutting value (returned)
251  int &n_lo) // num of points on low side (returned)
252 {
253  int d;
254  ANNcoord max_length = bnds.hi[0] - bnds.lo[0];
255  cut_dim = 0;
256  for (d = 1; d < dim; d++) { // find length of longest box side
257  ANNcoord length = bnds.hi[d] - bnds.lo[d];
258  if (length > max_length) {
259  max_length = length;
260  cut_dim = d;
261  }
262  }
263 
264  ANNcoord max_spread = 0; // find legal cut with max spread
265  cut_dim = 0;
266  for (d = 0; d < dim; d++) {
267  ANNcoord length = bnds.hi[d] - bnds.lo[d];
268  // is this side midpoint splitable
269  // without violating aspect ratio?
270  if (((double) max_length)*2.0/((double) length) <= FS_ASPECT_RATIO) {
271  // compute spread along this dim
272  ANNcoord spr = annSpread(pa, pidx, n, d);
273  if (spr > max_spread) { // best spread so far
274  max_spread = spr;
275  cut_dim = d; // this is dimension to cut
276  }
277  }
278  }
279 
280  max_length = 0; // find longest side other than cut_dim
281  for (d = 0; d < dim; d++) {
282  ANNcoord length = bnds.hi[d] - bnds.lo[d];
283  if (d != cut_dim && length > max_length)
284  max_length = length;
285  }
286  // consider most extreme splits
287  ANNcoord small_piece = max_length / FS_ASPECT_RATIO;
288  ANNcoord lo_cut = bnds.lo[cut_dim] + small_piece;// lowest legal cut
289  ANNcoord hi_cut = bnds.hi[cut_dim] - small_piece;// highest legal cut
290 
291  int br1, br2;
292  // is median below lo_cut ?
293  if (annSplitBalance(pa, pidx, n, cut_dim, lo_cut) >= 0) {
294  cut_val = lo_cut; // cut at lo_cut
295  annPlaneSplit(pa, pidx, n, cut_dim, cut_val, br1, br2);
296  n_lo = br1;
297  }
298  // is median above hi_cut?
299  else if (annSplitBalance(pa, pidx, n, cut_dim, hi_cut) <= 0) {
300  cut_val = hi_cut; // cut at hi_cut
301  annPlaneSplit(pa, pidx, n, cut_dim, cut_val, br1, br2);
302  n_lo = br2;
303  }
304  else { // median cut preserves asp ratio
305  n_lo = n/2; // split about median
306  annMedianSplit(pa, pidx, n, cut_dim, cut_val, n_lo);
307  }
308 }
309 
310 //----------------------------------------------------------------------
311 // sl_fair_split - sliding fair split splitting rule
312 //
313 // Sliding fair split is a splitting rule that combines the
314 // strengths of both fair split with sliding midpoint split.
315 // Fair split tends to produce balanced splits when the points
316 // are roughly uniformly distributed, but it can produce many
317 // trivial splits when points are highly clustered. Sliding
318 // midpoint never produces trivial splits, and shrinks boxes
319 // nicely if points are highly clustered, but it may produce
320 // rather unbalanced splits when points are unclustered but not
321 // quite uniform.
322 //
323 // Sliding fair split is based on the theory that there are two
324 // types of splits that are "good": balanced splits that produce
325 // fat boxes, and unbalanced splits provided the cell with fewer
326 // points is fat.
327 //
328 // This splitting rule operates by first computing the longest
329 // side of the current bounding box. Then it asks which sides
330 // could be split (at the midpoint) and still satisfy the aspect
331 // ratio bound with respect to this side. Among these, it selects
332 // the side with the largest spread (as fair split would). It
333 // then considers the most extreme cuts that would be allowed by
334 // the aspect ratio bound. This is done by dividing the longest
335 // side of the box by the aspect ratio bound. If the median cut
336 // lies between these extreme cuts, then we use the median cut.
337 // If not, then consider the extreme cut that is closer to the
338 // median. If all the points lie to one side of this cut, then
339 // we slide the cut until it hits the first point. This may
340 // violate the aspect ratio bound, but will never generate empty
341 // cells. However the sibling of every such skinny cell is fat,
342 // and hence packing arguments still apply.
343 //
344 //----------------------------------------------------------------------
345 
347  ANNpointArray pa, // point array
348  ANNidxArray pidx, // point indices (permuted on return)
349  const ANNorthRect &bnds, // bounding rectangle for cell
350  int n, // number of points
351  int dim, // dimension of space
352  int &cut_dim, // cutting dimension (returned)
353  ANNcoord &cut_val, // cutting value (returned)
354  int &n_lo) // num of points on low side (returned)
355 {
356  int d;
357  ANNcoord min, max; // min/max coordinates
358  int br1, br2; // split break points
359 
360  ANNcoord max_length = bnds.hi[0] - bnds.lo[0];
361  cut_dim = 0;
362  for (d = 1; d < dim; d++) { // find length of longest box side
363  ANNcoord length = bnds.hi[d] - bnds.lo[d];
364  if (length > max_length) {
365  max_length = length;
366  cut_dim = d;
367  }
368  }
369 
370  ANNcoord max_spread = 0; // find legal cut with max spread
371  cut_dim = 0;
372  for (d = 0; d < dim; d++) {
373  ANNcoord length = bnds.hi[d] - bnds.lo[d];
374  // is this side midpoint splitable
375  // without violating aspect ratio?
376  if (((double) max_length)*2.0/((double) length) <= FS_ASPECT_RATIO) {
377  // compute spread along this dim
378  ANNcoord spr = annSpread(pa, pidx, n, d);
379  if (spr > max_spread) { // best spread so far
380  max_spread = spr;
381  cut_dim = d; // this is dimension to cut
382  }
383  }
384  }
385 
386  max_length = 0; // find longest side other than cut_dim
387  for (d = 0; d < dim; d++) {
388  ANNcoord length = bnds.hi[d] - bnds.lo[d];
389  if (d != cut_dim && length > max_length)
390  max_length = length;
391  }
392  // consider most extreme splits
393  ANNcoord small_piece = max_length / FS_ASPECT_RATIO;
394  ANNcoord lo_cut = bnds.lo[cut_dim] + small_piece;// lowest legal cut
395  ANNcoord hi_cut = bnds.hi[cut_dim] - small_piece;// highest legal cut
396  // find min and max along cut_dim
397  annMinMax(pa, pidx, n, cut_dim, min, max);
398  // is median below lo_cut?
399  if (annSplitBalance(pa, pidx, n, cut_dim, lo_cut) >= 0) {
400  if (max > lo_cut) { // are any points above lo_cut?
401  cut_val = lo_cut; // cut at lo_cut
402  annPlaneSplit(pa, pidx, n, cut_dim, cut_val, br1, br2);
403  n_lo = br1; // balance if there are ties
404  }
405  else { // all points below lo_cut
406  cut_val = max; // cut at max value
407  annPlaneSplit(pa, pidx, n, cut_dim, cut_val, br1, br2);
408  n_lo = n-1;
409  }
410  }
411  // is median above hi_cut?
412  else if (annSplitBalance(pa, pidx, n, cut_dim, hi_cut) <= 0) {
413  if (min < hi_cut) { // are any points below hi_cut?
414  cut_val = hi_cut; // cut at hi_cut
415  annPlaneSplit(pa, pidx, n, cut_dim, cut_val, br1, br2);
416  n_lo = br2; // balance if there are ties
417  }
418  else { // all points above hi_cut
419  cut_val = min; // cut at min value
420  annPlaneSplit(pa, pidx, n, cut_dim, cut_val, br1, br2);
421  n_lo = 1;
422  }
423  }
424  else { // median cut is good enough
425  n_lo = n/2; // split about median
426  annMedianSplit(pa, pidx, n, cut_dim, cut_val, n_lo);
427  }
428 }
void annMinMax(ANNpointArray pa, ANNidxArray pidx, int n, int d, ANNcoord &min, ANNcoord &max)
Definition: kd_util.cpp:170
int annSplitBalance(ANNpointArray pa, ANNidxArray pidx, int n, int d, ANNcoord cv)
Definition: kd_util.cpp:360
void sl_fair_split(ANNpointArray pa, ANNidxArray pidx, const ANNorthRect &bnds, int n, int dim, int &cut_dim, ANNcoord &cut_val, int &n_lo)
Definition: kd_split.cpp:346
double ANNcoord
Definition: ANN.h:158
ANNcoord annSpread(ANNpointArray pa, ANNidxArray pidx, int n, int d)
Definition: kd_util.cpp:154
void annMedianSplit(ANNpointArray pa, ANNidxArray pidx, int n, int d, ANNcoord &cv, int n_lo)
Definition: kd_util.cpp:230
ANNpoint * ANNpointArray
Definition: ANN.h:376
void sl_midpt_split(ANNpointArray pa, ANNidxArray pidx, const ANNorthRect &bnds, int n, int dim, int &cut_dim, ANNcoord &cut_val, int &n_lo)
Definition: kd_split.cpp:146
ANNpoint lo
Definition: ANNx.h:93
int annMaxSpread(ANNpointArray pa, ANNidxArray pidx, int n, int dim)
Definition: kd_util.cpp:187
void annPlaneSplit(ANNpointArray pa, ANNidxArray pidx, int n, int d, ANNcoord cv, int &br1, int &br2)
Definition: kd_util.cpp:291
int dim
Definition: ann2fig.cpp:81
ANNpoint hi
Definition: ANNx.h:94
void midpt_split(ANNpointArray pa, ANNidxArray pidx, const ANNorthRect &bnds, int n, int dim, int &cut_dim, ANNcoord &cut_val, int &n_lo)
Definition: kd_split.cpp:76
void kd_split(ANNpointArray pa, ANNidxArray pidx, const ANNorthRect &bnds, int n, int dim, int &cut_dim, ANNcoord &cut_val, int &n_lo)
Definition: kd_split.cpp:44
const double ERR
Definition: kd_split.cpp:34
const double FS_ASPECT_RATIO
Definition: kd_split.cpp:35
ANNidx * ANNidxArray
Definition: ANN.h:378
void fair_split(ANNpointArray pa, ANNidxArray pidx, const ANNorthRect &bnds, int n, int dim, int &cut_dim, ANNcoord &cut_val, int &n_lo)
Definition: kd_split.cpp:243

Generated on Thu Jun 11 2015 13:52:31 for queso-0.53.0 by  doxygen 1.8.5