From e1479281811e339a9499f77556122abed9d791c8 Mon Sep 17 00:00:00 2001 From: Chip Lynch Date: Tue, 26 Jan 2021 15:38:47 -0800 Subject: [PATCH 01/19] Needless commit for git testing fork --- Makefile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Makefile b/Makefile index 112b1e68188a0..5ea35053a3b21 100644 --- a/Makefile +++ b/Makefile @@ -66,3 +66,8 @@ code-analysis: flake8-diff: git diff upstream/main -u -- "*.py" | flake8 --diff + +install: + . ./venv38/bin/activate ; pip install --verbose --no-build-isolation --editable . +install-test: + python -c "import sklearn; sklearn.show_versions()" From cfae65e75dfa75d22938057b3d8a85cb5ea82a8d Mon Sep 17 00:00:00 2001 From: Chip Lynch Date: Mon, 8 Feb 2021 15:56:56 -0800 Subject: [PATCH 02/19] Basic cython structure and some dummy code compiling --- doc/conf.py | 1 + sklearn/neighbors/__init__.py | 2 + sklearn/neighbors/_helpers.pxi | 81 ++++++++++++++++++++++ sklearn/neighbors/_trilateration.pyx | 100 +++++++++++++++++++++++++++ sklearn/neighbors/_unsupervised.py | 1 + sklearn/neighbors/setup.py | 6 ++ 6 files changed, 191 insertions(+) create mode 100644 sklearn/neighbors/_helpers.pxi create mode 100644 sklearn/neighbors/_trilateration.pyx diff --git a/doc/conf.py b/doc/conf.py index adf12d9e88e82..62013d8891073 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -25,6 +25,7 @@ # is relative to the documentation root, use os.path.abspath to make it # absolute, like shown here. sys.path.insert(0, os.path.abspath('sphinxext')) +sys.path.insert(0, os.path.abspath('../venv38/lib/python3.8/site-packages/')) from github_link import make_linkcode_resolve import sphinx_gallery diff --git a/sklearn/neighbors/__init__.py b/sklearn/neighbors/__init__.py index 82f9993bec50c..cb9d55c22498c 100644 --- a/sklearn/neighbors/__init__.py +++ b/sklearn/neighbors/__init__.py @@ -5,6 +5,7 @@ from ._ball_tree import BallTree from ._kd_tree import KDTree +from ._trilateration import TrilaterationIndex from ._dist_metrics import DistanceMetric from ._graph import kneighbors_graph, radius_neighbors_graph from ._graph import KNeighborsTransformer, RadiusNeighborsTransformer @@ -20,6 +21,7 @@ __all__ = ['BallTree', 'DistanceMetric', 'KDTree', + 'TrilaterationIndex', 'KNeighborsClassifier', 'KNeighborsRegressor', 'KNeighborsTransformer', diff --git a/sklearn/neighbors/_helpers.pxi b/sklearn/neighbors/_helpers.pxi new file mode 100644 index 0000000000000..130383aa1e81d --- /dev/null +++ b/sklearn/neighbors/_helpers.pxi @@ -0,0 +1,81 @@ +# File is temporary, for moving around some dependencies + +cdef DTYPE_t[:, ::1] get_memview_DTYPE_2D( + np.ndarray[DTYPE_t, ndim=2, mode='c'] X): + return ( X.data) + +cdef ITYPE_t[::1] get_memview_ITYPE_1D( + np.ndarray[ITYPE_t, ndim=1, mode='c'] X): + return ( X.data) + +cdef inline void dual_swap(DTYPE_t* darr, ITYPE_t* iarr, + ITYPE_t i1, ITYPE_t i2) nogil: + """swap the values at inex i1 and i2 of both darr and iarr""" + cdef DTYPE_t dtmp = darr[i1] + darr[i1] = darr[i2] + darr[i2] = dtmp + + cdef ITYPE_t itmp = iarr[i1] + iarr[i1] = iarr[i2] + iarr[i2] = itmp + +cdef int _simultaneous_sort(DTYPE_t* dist, ITYPE_t* idx, + ITYPE_t size) nogil except -1: + """ + Perform a recursive quicksort on the dist array, simultaneously + performing the same swaps on the idx array. The equivalent in + numpy (though quite a bit slower) is + + def simultaneous_sort(dist, idx): + i = np.argsort(dist) + return dist[i], idx[i] + """ + cdef ITYPE_t pivot_idx, i, store_idx + cdef DTYPE_t pivot_val + + # in the small-array case, do things efficiently + if size <= 1: + pass + elif size == 2: + if dist[0] > dist[1]: + dual_swap(dist, idx, 0, 1) + elif size == 3: + if dist[0] > dist[1]: + dual_swap(dist, idx, 0, 1) + if dist[1] > dist[2]: + dual_swap(dist, idx, 1, 2) + if dist[0] > dist[1]: + dual_swap(dist, idx, 0, 1) + else: + # Determine the pivot using the median-of-three rule. + # The smallest of the three is moved to the beginning of the array, + # the middle (the pivot value) is moved to the end, and the largest + # is moved to the pivot index. + pivot_idx = size / 2 + if dist[0] > dist[size - 1]: + dual_swap(dist, idx, 0, size - 1) + if dist[size - 1] > dist[pivot_idx]: + dual_swap(dist, idx, size - 1, pivot_idx) + if dist[0] > dist[size - 1]: + dual_swap(dist, idx, 0, size - 1) + pivot_val = dist[size - 1] + + # partition indices about pivot. At the end of this operation, + # pivot_idx will contain the pivot value, everything to the left + # will be smaller, and everything to the right will be larger. + store_idx = 0 + for i in range(size - 1): + if dist[i] < pivot_val: + dual_swap(dist, idx, i, store_idx) + store_idx += 1 + dual_swap(dist, idx, store_idx, size - 1) + pivot_idx = store_idx + + # recursively sort each side of the pivot + if pivot_idx > 1: + _simultaneous_sort(dist, idx, pivot_idx) + if pivot_idx + 2 < size: + _simultaneous_sort(dist + pivot_idx + 1, + idx + pivot_idx + 1, + size - pivot_idx - 1) + return 0 diff --git a/sklearn/neighbors/_trilateration.pyx b/sklearn/neighbors/_trilateration.pyx new file mode 100644 index 0000000000000..c3271ef657edb --- /dev/null +++ b/sklearn/neighbors/_trilateration.pyx @@ -0,0 +1,100 @@ +#!python +#cython: boundscheck=False +#cython: wraparound=False +#cython: cdivision=True + +# Trilateration (or n-fold lateration for arbitrary dimensions) +# ===================== +# +# Author: Chip Lynch +# License: BSD something something +# +# The Trilateration Index itself is a only a data structure designed +# to store distances from n (default 3 thus "tri") fixed points in a +# sorted manner for quick approximate distance comparisons. This is +# most beneficial when the distance metric is complex (i.e. accurate +# geographic distances on an oblate spheroid earth). +# +# Trilateration optimized nearest-neighbor and related algorithms will +# create this structure during the "fit" phase, and leverage the anchored +# distance data within neighbor searches. + +cimport numpy as np +import numpy as np + +from ..utils import ( + check_array +) + +from ._typedefs cimport DTYPE_t, ITYPE_t, DITYPE_t +from ._typedefs import DTYPE, ITYPE + +from ._dist_metrics cimport (DistanceMetric, euclidean_dist, euclidean_rdist, + euclidean_dist_to_rdist, euclidean_rdist_to_dist) + + +__all__ = ['TrilaterationIndex'] + +DOC_DICT = {'TrilaterationIndex': 'TrilaterationIndex'} + +# Start simple: +VALID_METRICS = ['EuclideanDistance'] + +include "_helpers.pxi" + +cdef class TrilaterationIndex: + # __doc__ = CLASS_DOC.format(**DOC_DICT) + + cdef np.ndarray data_arr + cdef np.ndarray ref_points_arr + cdef np.ndarray idx_array_arr + + def __cinit__(self): + self.data_arr = np.empty((1, 1), dtype=DTYPE, order='C') + + def __init__(self, data, metric='euclidean', **kwargs): + self.data_arr = check_array(data, dtype=DTYPE, order='C') + n_samples = self.data_arr.shape[0] + n_features = self.data_arr.shape[1] + + self.dist_metric = DistanceMetric.get_metric(metric, **kwargs) + self.euclidean = (self.dist_metric.__class__.__name__ + == 'EuclideanDistance') + metric = self.dist_metric.__class__.__name__ + if metric not in VALID_METRICS: + raise ValueError('metric {metric} is not valid for ' + '{TrilaterationIndex}'.format(metric=metric, + **DOC_DICT)) + self.dist_metric._validate_data(self.data_arr) + + # allocate arrays for storage + self.idx_array_arr = np.arange(n_samples, dtype=ITYPE) + + self._update_memviews() + + print(self.data_arr) + + def _update_memviews(self): + self.data = get_memview_DTYPE_2D(self.data_arr) + self.idx_array = get_memview_ITYPE_1D(self.idx_array_arr) + self.ref_points = get_memview_DTYPE_2D(self.ref_points_arr) + + def query(self, X, k=5, + return_distance=True, + sort_results=True): + """ + query the index for k nearest neighbors + """ + + if X.shape[X.ndim - 1] != self.data.shape[1]: + raise ValueError("query data dimension must " + "match training data dimension") + + if self.data.shape[0] < k: + raise ValueError("k must be less than or equal " + "to the number of training points") + + + + + pass diff --git a/sklearn/neighbors/_unsupervised.py b/sklearn/neighbors/_unsupervised.py index 822a30f503bd2..1b57da81dacb2 100644 --- a/sklearn/neighbors/_unsupervised.py +++ b/sklearn/neighbors/_unsupervised.py @@ -28,6 +28,7 @@ class NearestNeighbors(KNeighborsMixin, - 'ball_tree' will use :class:`BallTree` - 'kd_tree' will use :class:`KDTree` + - 'trilateration' will use :class:`TrilaterationIndex` - 'brute' will use a brute-force search. - 'auto' will attempt to decide the most appropriate algorithm based on the values passed to :meth:`fit` method. diff --git a/sklearn/neighbors/setup.py b/sklearn/neighbors/setup.py index 9264044678193..a8b6d98cf1ec4 100644 --- a/sklearn/neighbors/setup.py +++ b/sklearn/neighbors/setup.py @@ -20,6 +20,11 @@ def configuration(parent_package='', top_path=None): include_dirs=[numpy.get_include()], libraries=libraries) + config.add_extension('_trilateration', + sources=['_trilateration.pyx'], + include_dirs=[numpy.get_include()], + libraries=libraries) + config.add_extension('_dist_metrics', sources=['_dist_metrics.pyx'], include_dirs=[numpy.get_include(), @@ -31,6 +36,7 @@ def configuration(parent_package='', top_path=None): sources=['_typedefs.pyx'], include_dirs=[numpy.get_include()], libraries=libraries) + config.add_extension("_quad_tree", sources=["_quad_tree.pyx"], include_dirs=[numpy.get_include()], From 44053ae426480c2c18c9de2aa00c98db9bfe153e Mon Sep 17 00:00:00 2001 From: Chip Lynch Date: Wed, 10 Feb 2021 16:57:38 -0800 Subject: [PATCH 03/19] TrilaterationIndex.query is working --- sklearn/neighbors/_helpers.pxi | 13 +++ sklearn/neighbors/_trilateration.pyx | 140 ++++++++++++++++++++++++++- 2 files changed, 149 insertions(+), 4 deletions(-) diff --git a/sklearn/neighbors/_helpers.pxi b/sklearn/neighbors/_helpers.pxi index 130383aa1e81d..0e4f2efd632a9 100644 --- a/sklearn/neighbors/_helpers.pxi +++ b/sklearn/neighbors/_helpers.pxi @@ -1,5 +1,7 @@ # File is temporary, for moving around some dependencies +from libc.math cimport fabs + cdef DTYPE_t[:, ::1] get_memview_DTYPE_2D( np.ndarray[DTYPE_t, ndim=2, mode='c'] X): return ( X.data) @@ -19,6 +21,17 @@ cdef inline void dual_swap(DTYPE_t* darr, ITYPE_t* iarr, iarr[i1] = iarr[i2] iarr[i2] = itmp +cdef int _find_nearest_sorted_2D(DTYPE_t[:,:] rdists, DTYPE_t target): + """ rdists must be sorted by its first column. + Can we do this faster with Memoryviews? See _simultaneous_sort + """ + cdef int idx + idx = np.searchsorted(rdists[:,0], target, side="left") + if idx > 0 and (idx == len(rdists) or fabs(target - rdists[idx-1,0]) < fabs(target - rdists[idx,0])): + idx = idx - 1 + return idx + + cdef int _simultaneous_sort(DTYPE_t* dist, ITYPE_t* idx, ITYPE_t size) nogil except -1: """ diff --git a/sklearn/neighbors/_trilateration.pyx b/sklearn/neighbors/_trilateration.pyx index c3271ef657edb..745955b117263 100644 --- a/sklearn/neighbors/_trilateration.pyx +++ b/sklearn/neighbors/_trilateration.pyx @@ -21,6 +21,9 @@ cimport numpy as np import numpy as np +from libc.math cimport fabs + +from scipy.spatial.distance import cdist from ..utils import ( check_array @@ -48,18 +51,32 @@ cdef class TrilaterationIndex: cdef np.ndarray data_arr cdef np.ndarray ref_points_arr cdef np.ndarray idx_array_arr + cdef np.ndarray r0sortorder + cdef np.ndarray ref_dists + + cdef readonly DTYPE_t[:, ::1] data + cdef readonly DTYPE_t[:, ::1] ref_points + cdef public ITYPE_t[::1] idx_array + cdef readonly DTYPE_t[:, ::1] distances + + cdef DistanceMetric dist_metric def __cinit__(self): self.data_arr = np.empty((1, 1), dtype=DTYPE, order='C') - def __init__(self, data, metric='euclidean', **kwargs): + def __init__(self, data, ref, metric='euclidean', **kwargs): self.data_arr = check_array(data, dtype=DTYPE, order='C') + self.ref_points_arr = check_array(ref, dtype=DTYPE, order='C') + + if self.data_arr.shape[1] != self.ref_points_arr.shape[1]: + raise ValueError("Data and reference points must share same dimension") + n_samples = self.data_arr.shape[0] n_features = self.data_arr.shape[1] self.dist_metric = DistanceMetric.get_metric(metric, **kwargs) - self.euclidean = (self.dist_metric.__class__.__name__ - == 'EuclideanDistance') + # self.euclidean = (self.dist_metric.__class__.__name__ + # == 'EuclideanDistance') metric = self.dist_metric.__class__.__name__ if metric not in VALID_METRICS: raise ValueError('metric {metric} is not valid for ' @@ -67,17 +84,34 @@ cdef class TrilaterationIndex: **DOC_DICT)) self.dist_metric._validate_data(self.data_arr) + self.ref_dists = self.dist_metric.pairwise(self.data_arr, self.ref_points_arr) + # allocate arrays for storage self.idx_array_arr = np.arange(n_samples, dtype=ITYPE) + + # Sort data by distance to refpoint[0] + # TODO: See _simultaneous sort to see if we can speed this up: + # (Right now since self.distances is 2D, we can't use this) + # _simultaneous_sort(&self.distances, + # &self.idx_array, + # self.distances.shape[0]) + + r0sortorder = np.argsort(self.ref_dists[:,0]) + self.ref_dists = self.ref_dists[r0sortorder] + self.idx_array_arr = self.idx_array_arr[r0sortorder] + self._update_memviews() + _find_nearest_sorted_2D(self.distances, -2) + _find_nearest_sorted_2D(self.distances, 62) + _find_nearest_sorted_2D(self.distances, 100) - print(self.data_arr) def _update_memviews(self): self.data = get_memview_DTYPE_2D(self.data_arr) self.idx_array = get_memview_ITYPE_1D(self.idx_array_arr) self.ref_points = get_memview_DTYPE_2D(self.ref_points_arr) + self.distances = get_memview_DTYPE_2D(self.ref_dists) def query(self, X, k=5, return_distance=True, @@ -94,6 +128,104 @@ cdef class TrilaterationIndex: raise ValueError("k must be less than or equal " "to the number of training points") + # Establish the distances from the query point to the reference points + cdef np.ndarray q_dists + q_dists = self.dist_metric.pairwise(X, self.ref_points_arr) + + # cdef DTYPE_t[:, ::1] Xarr + # Xarr = get_memview_DTYPE_2D(np.asarray(X)) + # Xarr = np.asarray(X, dtype=DTYPE, order='C') + cdef int best_idx, low_idx, low_idx_possible, high_idx, high_idx_possible + cdef int test_idx + cdef DTYPE_t best_dist, test_dist + + best_idx = _find_nearest_sorted_2D(self.distances, q_dists[0,0]) + # best_dist = self.dist_metric.dist(self.data[best_idx,:], &Xarr, X.shape[1]) + # print(self.data_arr[best_idx,:].shape) + best_dist = cdist([self.data_arr[best_idx,:]], X) + + print("first best guess:") + print(best_idx) + print(np.asarray(self.data[best_idx,:])) + + print("starting best distance") + print(best_dist) + + # Establish bounds between which to search + low_idx_possible = _find_nearest_sorted_2D(self.distances, q_dists[0, 0] - best_dist) + high_idx_possible = _find_nearest_sorted_2D(self.distances, q_dists[0, 0] + best_dist) + low_idx = max(best_idx - 1, 0) + high_idx = min(best_idx + 1, self.distances.shape[0]) + print(f"low_idx_possible: {low_idx_possible}; high_idx_possible: {high_idx_possible}") + + # Consider adding chunking here + # So we test more than one new point at a time + test_idx = best_idx + + while True: + if low_idx <= low_idx_possible and high_idx >= high_idx_possible: + break + + # Determine whether the next high or low point is a better test: + lowdelta = fabs(self.distances[low_idx, 0] - q_dists[0, 0]) + highdelta = fabs(self.distances[high_idx, 0] - q_dists[0, 0]) + print(f"comparing: {low_idx}, {high_idx}") + print(f"lowdelta: {lowdelta}, highdelta: {highdelta}") + if lowdelta <= highdelta and low_idx >= low_idx_possible: + test_idx = low_idx + low_idx = low_idx - 1 + elif high_idx <= high_idx_possible: + test_idx = high_idx + high_idx = high_idx + 1 + elif low_idx >= low_idx_possible: + test_idx = low_idx + low_idx = low_idx - 1 + else: + print("why?") + break + + print(f"testing point at: {self.idx_array[test_idx]} {np.asarray(self.data[self.idx_array[test_idx],:])}") + # Check that all pre-calculated distances are better than best + sufficient = True + for d, q in zip(self.distances[test_idx,1:], q_dists[0,1:]): + print(f"testing: ({abs(d-q)}) vs {best_dist}") + if abs(d-q) > best_dist: + sufficient = False + break + + if sufficient: + test_dist = cdist([self.data_arr[self.idx_array[test_idx],:]], X) + print(f"{test_idx} is sufficient... test_dist is: {test_dist}") + if test_dist < best_dist: + print("replacing best with test...") + print(f"new best idx: {best_idx} ({self.idx_array[best_idx]}) with dist: {test_dist}") + best_idx = test_idx + best_dist = test_dist + low_idx_possible = _find_nearest_sorted_2D(self.distances, q_dists[0, 0] - best_dist) + high_idx_possible = _find_nearest_sorted_2D(self.distances, q_dists[0, 0] + best_dist) + + continue + + return (self.idx_array[best_idx], best_dist) + + + + def query_radius(self, X, r, + int return_distance=False, + int count_only=False, + int sort_results=False): + """ + query the index for neighbors within a radius r + """ + if count_only and return_distance: + raise ValueError("count_only and return_distance " + "cannot both be true") + + if sort_results and not return_distance: + raise ValueError("return_distance must be True " + "if sort_results is True") + + From 4fc8bf6388ced4b73f67b94c4f650004aace823a Mon Sep 17 00:00:00 2001 From: Chip Lynch Date: Thu, 11 Feb 2021 13:03:10 -0800 Subject: [PATCH 04/19] Fixed big bug --- sklearn/neighbors/_helpers.pxi | 116 +++++++++++++++++++++++++++ sklearn/neighbors/_trilateration.pyx | 71 +++++++++++++--- trilat_test_10.py | 83 +++++++++++++++++++ 3 files changed, 257 insertions(+), 13 deletions(-) create mode 100644 trilat_test_10.py diff --git a/sklearn/neighbors/_helpers.pxi b/sklearn/neighbors/_helpers.pxi index 0e4f2efd632a9..2306f573a9afd 100644 --- a/sklearn/neighbors/_helpers.pxi +++ b/sklearn/neighbors/_helpers.pxi @@ -10,6 +10,10 @@ cdef ITYPE_t[::1] get_memview_ITYPE_1D( np.ndarray[ITYPE_t, ndim=1, mode='c'] X): return ( X.data) +cdef ITYPE_t[:, ::1] get_memview_ITYPE_2D( + np.ndarray[ITYPE_t, ndim=2, mode='c'] X): + return ( X.data) + cdef inline void dual_swap(DTYPE_t* darr, ITYPE_t* iarr, ITYPE_t i1, ITYPE_t i2) nogil: """swap the values at inex i1 and i2 of both darr and iarr""" @@ -32,6 +36,118 @@ cdef int _find_nearest_sorted_2D(DTYPE_t[:,:] rdists, DTYPE_t target): return idx +cdef class NeighborsHeap: + """A max-heap structure to keep track of distances/indices of neighbors + + This implements an efficient pre-allocated set of fixed-size heaps + for chasing neighbors, holding both an index and a distance. + When any row of the heap is full, adding an additional point will push + the furthest point off the heap. + + Parameters + ---------- + n_pts : int + the number of heaps to use + n_nbrs : int + the size of each heap. + """ + cdef np.ndarray distances_arr + cdef np.ndarray indices_arr + + cdef DTYPE_t[:, ::1] distances + cdef ITYPE_t[:, ::1] indices + + def __cinit__(self): + self.distances_arr = np.zeros((1, 1), dtype=DTYPE, order='C') + self.indices_arr = np.zeros((1, 1), dtype=ITYPE, order='C') + self.distances = get_memview_DTYPE_2D(self.distances_arr) + self.indices = get_memview_ITYPE_2D(self.indices_arr) + + def __init__(self, n_pts, n_nbrs): + self.distances_arr = np.full((n_pts, n_nbrs), np.inf, dtype=DTYPE, + order='C') + self.indices_arr = np.zeros((n_pts, n_nbrs), dtype=ITYPE, order='C') + self.distances = get_memview_DTYPE_2D(self.distances_arr) + self.indices = get_memview_ITYPE_2D(self.indices_arr) + + def get_arrays(self, sort=True): + """Get the arrays of distances and indices within the heap. + + If sort=True, then simultaneously sort the indices and distances, + so the closer points are listed first. + """ + if sort: + self._sort() + return self.distances_arr, self.indices_arr + + cdef inline DTYPE_t largest(self, ITYPE_t row) nogil except -1: + """Return the largest distance in the given row""" + return self.distances[row, 0] + + def push(self, ITYPE_t row, DTYPE_t val, ITYPE_t i_val): + return self._push(row, val, i_val) + + cdef int _push(self, ITYPE_t row, DTYPE_t val, + ITYPE_t i_val) nogil except -1: + """push (val, i_val) into the given row""" + cdef ITYPE_t i, ic1, ic2, i_swap + cdef ITYPE_t size = self.distances.shape[1] + cdef DTYPE_t* dist_arr = &self.distances[row, 0] + cdef ITYPE_t* ind_arr = &self.indices[row, 0] + + # check if val should be in heap + if val > dist_arr[0]: + return 0 + + # insert val at position zero + dist_arr[0] = val + ind_arr[0] = i_val + + # descend the heap, swapping values until the max heap criterion is met + i = 0 + while True: + ic1 = 2 * i + 1 + ic2 = ic1 + 1 + + if ic1 >= size: + break + elif ic2 >= size: + if dist_arr[ic1] > val: + i_swap = ic1 + else: + break + elif dist_arr[ic1] >= dist_arr[ic2]: + if val < dist_arr[ic1]: + i_swap = ic1 + else: + break + else: + if val < dist_arr[ic2]: + i_swap = ic2 + else: + break + + dist_arr[i] = dist_arr[i_swap] + ind_arr[i] = ind_arr[i_swap] + + i = i_swap + + dist_arr[i] = val + ind_arr[i] = i_val + + return 0 + + cdef int _sort(self) except -1: + """simultaneously sort the distances and indices""" + cdef DTYPE_t[:, ::1] distances = self.distances + cdef ITYPE_t[:, ::1] indices = self.indices + cdef ITYPE_t row + for row in range(distances.shape[0]): + _simultaneous_sort(&distances[row, 0], + &indices[row, 0], + distances.shape[1]) + return 0 + cdef int _simultaneous_sort(DTYPE_t* dist, ITYPE_t* idx, ITYPE_t size) nogil except -1: """ diff --git a/sklearn/neighbors/_trilateration.pyx b/sklearn/neighbors/_trilateration.pyx index 745955b117263..849c3e1f7bf52 100644 --- a/sklearn/neighbors/_trilateration.pyx +++ b/sklearn/neighbors/_trilateration.pyx @@ -64,8 +64,16 @@ cdef class TrilaterationIndex: def __cinit__(self): self.data_arr = np.empty((1, 1), dtype=DTYPE, order='C') - def __init__(self, data, ref, metric='euclidean', **kwargs): + def __init__(self, data, ref=None, metric='euclidean', **kwargs): self.data_arr = check_array(data, dtype=DTYPE, order='C') + + # Ref points can be passed in + # Otherwise they are generated based on input data + if ref is None: + ref = self._create_ref_points() + print("generated synthetic reference points:") + print(ref) + self.ref_points_arr = check_array(ref, dtype=DTYPE, order='C') if self.data_arr.shape[1] != self.ref_points_arr.shape[1]: @@ -102,10 +110,23 @@ cdef class TrilaterationIndex: self.idx_array_arr = self.idx_array_arr[r0sortorder] self._update_memviews() - _find_nearest_sorted_2D(self.distances, -2) - _find_nearest_sorted_2D(self.distances, 62) - _find_nearest_sorted_2D(self.distances, 100) + # _find_nearest_sorted_2D(self.distances, -2) + # _find_nearest_sorted_2D(self.distances, 62) + # _find_nearest_sorted_2D(self.distances, 100) + + def _create_ref_points(self): + """ + Create a set of well distributed reference points + in the same number of dimensions as the input data + + For the love of all that is holy, make this better. + """ + cdef DTYPE_t MAX_REF = 9999.0 + + ndims = self.data_arr.shape[1] + refs = [[0]*i+[MAX_REF]*(ndims-i) for i in range(ndims)] + return np.asarray(refs) def _update_memviews(self): self.data = get_memview_DTYPE_2D(self.data_arr) @@ -120,6 +141,31 @@ cdef class TrilaterationIndex: query the index for k nearest neighbors """ + if k != 1: + raise ValueError("Right now this only works for k=1") + + print(f"X.shape: {X.shape}") + + if X.shape[0] == 1: + results=self._query_one(X, k, return_distance, sort_results) + else: + results = [self._query_one(x, k=k, + return_distance=return_distance, + sort_results=sort_results) \ + for x in X] + + return results + + def _query_one(self, X, k=5, + return_distance=True, + sort_results=True): + """ + query the index for k nearest neighbors + """ + if X.shape[0] != 1: + raise ValueError("_query takes only a single X point" + "use query for multiple records") + if X.shape[X.ndim - 1] != self.data.shape[1]: raise ValueError("query data dimension must " "match training data dimension") @@ -127,7 +173,9 @@ cdef class TrilaterationIndex: if self.data.shape[0] < k: raise ValueError("k must be less than or equal " "to the number of training points") - + + cdef NeighborsHeap heap = NeighborsHeap(X.shape[0], k) + # Establish the distances from the query point to the reference points cdef np.ndarray q_dists q_dists = self.dist_metric.pairwise(X, self.ref_points_arr) @@ -142,9 +190,9 @@ cdef class TrilaterationIndex: best_idx = _find_nearest_sorted_2D(self.distances, q_dists[0,0]) # best_dist = self.dist_metric.dist(self.data[best_idx,:], &Xarr, X.shape[1]) # print(self.data_arr[best_idx,:].shape) - best_dist = cdist([self.data_arr[best_idx,:]], X) + best_dist = cdist([self.data_arr[self.idx_array_arr[best_idx],:]], X) - print("first best guess:") + print(f"first best guess: {best_idx} data[{self.idx_array_arr[best_idx]}]") print(best_idx) print(np.asarray(self.data[best_idx,:])) @@ -164,6 +212,7 @@ cdef class TrilaterationIndex: while True: if low_idx <= low_idx_possible and high_idx >= high_idx_possible: + print(f"breaking because {low_idx} <= {low_idx_possible} and {high_idx} >= {high_idx_possible}") break # Determine whether the next high or low point is a better test: @@ -184,11 +233,11 @@ cdef class TrilaterationIndex: print("why?") break - print(f"testing point at: {self.idx_array[test_idx]} {np.asarray(self.data[self.idx_array[test_idx],:])}") + print(f"testing point at index {test_idx}: data[{self.idx_array[test_idx]}] {np.asarray(self.data[self.idx_array[test_idx],:])}") # Check that all pre-calculated distances are better than best sufficient = True for d, q in zip(self.distances[test_idx,1:], q_dists[0,1:]): - print(f"testing: ({abs(d-q)}) vs {best_dist}") + print(f"testing that: {abs(d-q)} < {best_dist}") if abs(d-q) > best_dist: sufficient = False break @@ -224,9 +273,5 @@ cdef class TrilaterationIndex: if sort_results and not return_distance: raise ValueError("return_distance must be True " "if sort_results is True") - - - - pass diff --git a/trilat_test_10.py b/trilat_test_10.py new file mode 100644 index 0000000000000..422ecb1220f82 --- /dev/null +++ b/trilat_test_10.py @@ -0,0 +1,83 @@ +# File to debug +# This file should never be published... +# Purge it from git before merging + +import csv +import sklearn +import numpy as np + +from scipy.spatial.distance import cdist + +# samples = [[0., 0., 0.], [0., .5, 0.], [1., 1., .5]] +querypoint = [[50, 65]] + +samples = [] +refpoints = [] + +with open('/home/chipmonkey/repos/TrilaterationIndex/data/point_sample_10.csv') as csvfile: + reader = csv.DictReader(csvfile, quoting=csv.QUOTE_NONNUMERIC) + for row in reader: + print(row) + samples.append([row['x'], row['y']]) + +with open('/home/chipmonkey/repos/TrilaterationIndex/data/sample_ref_points.csv') as csvfile: + reader = csv.DictReader(csvfile, quoting=csv.QUOTE_NONNUMERIC) + for row in reader: + print(row) + refpoints.append([row['x'], row['y']]) + +print(refpoints) + +print(samples) +print("Actual distances:") +print(cdist(samples, querypoint)) + +from sklearn.neighbors import NearestNeighbors, KDTree +from sklearn.neighbors import TrilaterationIndex + +neigh = NearestNeighbors(radius=10) +tree = KDTree(samples) +tq = tree.query(querypoint, k=2) +print("tree query results:") +print(tq) +neigh.fit(samples) + +print("finding neighbors within radius:") +rng = neigh.radius_neighbors([[44., 44.]]) +print("distances:") +print(np.asarray(rng[0][0])) +print("indexes:") +print(list(np.asarray(rng[1][0]))) +print("points:") +things = [samples[x] for x in list(np.asarray(rng[1][0]))] +print(things) +# [samples[x] for x in list(np.asarray(rng[1][0]))] +# print(samples[list(np.asarray(rng[1][0]))]) + +print(type(tree)) +print(type(tree.data)) +# dist, ind = tree.query +print(np.asarray(tree.node_data)) # Ohhh https://stackoverflow.com/questions/20978377/cython-convert-memory-view-to-numpy-array +print(tree.node_data[0]) + +ti = TrilaterationIndex(samples, refpoints) + + + +print(np.asarray(ti.ref_points)) +print(np.asarray(ti.idx_array)) +print("distances:") +print(np.asarray(ti.distances)) + +print(ti.ref_points.shape[0]) +print(ti.ref_points.shape[1]) + +result1 = ti.query(X=np.asarray(querypoint), k=1) +print(result1) + + + +t2 = TrilaterationIndex(samples) +result2 = t2.query(X=np.asarray(querypoint), k=1) +print(result2) +print("This should be 14.30117 something") From e9f18bde21fdfb07b17e77a83f58d5c5f8001163 Mon Sep 17 00:00:00 2001 From: Chip Lynch Date: Thu, 11 Feb 2021 14:59:44 -0800 Subject: [PATCH 05/19] Added working heap and k-nn (beyond 1-nn) support --- sklearn/neighbors/_helpers.pxi | 9 +++++ sklearn/neighbors/_trilateration.pyx | 56 +++++++++++++++++----------- trilat_test_10.py | 9 ++++- 3 files changed, 52 insertions(+), 22 deletions(-) diff --git a/sklearn/neighbors/_helpers.pxi b/sklearn/neighbors/_helpers.pxi index 2306f573a9afd..07d187eaf4e1f 100644 --- a/sklearn/neighbors/_helpers.pxi +++ b/sklearn/neighbors/_helpers.pxi @@ -70,6 +70,11 @@ cdef class NeighborsHeap: self.distances = get_memview_DTYPE_2D(self.distances_arr) self.indices = get_memview_ITYPE_2D(self.indices_arr) + def get_max(self, row): + """Get the max distance and index from the heap for a given row + """ + return self.largest(row), self.largest_idx(row) + def get_arrays(self, sort=True): """Get the arrays of distances and indices within the heap. @@ -83,6 +88,10 @@ cdef class NeighborsHeap: cdef inline DTYPE_t largest(self, ITYPE_t row) nogil except -1: """Return the largest distance in the given row""" return self.distances[row, 0] + + cdef inline ITYPE_t largest_idx(self, ITYPE_t row) nogil except -1: + """Return the index for the largest distance in the given row""" + return self.indices[row, 0] def push(self, ITYPE_t row, DTYPE_t val, ITYPE_t i_val): return self._push(row, val, i_val) diff --git a/sklearn/neighbors/_trilateration.pyx b/sklearn/neighbors/_trilateration.pyx index 849c3e1f7bf52..0bc83ef131eb2 100644 --- a/sklearn/neighbors/_trilateration.pyx +++ b/sklearn/neighbors/_trilateration.pyx @@ -141,11 +141,6 @@ cdef class TrilaterationIndex: query the index for k nearest neighbors """ - if k != 1: - raise ValueError("Right now this only works for k=1") - - print(f"X.shape: {X.shape}") - if X.shape[0] == 1: results=self._query_one(X, k, return_distance, sort_results) else: @@ -174,7 +169,10 @@ cdef class TrilaterationIndex: raise ValueError("k must be less than or equal " "to the number of training points") + # Can probably improve this - using a heap that allows more than 1 value + # but we're always only using one here (X.shape[0] must be 1 from above) cdef NeighborsHeap heap = NeighborsHeap(X.shape[0], k) + print(f"initialized heap: {heap.get_arrays(sort=False)}") # Establish the distances from the query point to the reference points cdef np.ndarray q_dists @@ -191,20 +189,33 @@ cdef class TrilaterationIndex: # best_dist = self.dist_metric.dist(self.data[best_idx,:], &Xarr, X.shape[1]) # print(self.data_arr[best_idx,:].shape) best_dist = cdist([self.data_arr[self.idx_array_arr[best_idx],:]], X) + # heap.push(0, best_dist, best_idx) + # print(f"max heap: {heap.get_max(0)}") + # print(f"largest: {heap.largest(0)}") print(f"first best guess: {best_idx} data[{self.idx_array_arr[best_idx]}]") print(best_idx) print(np.asarray(self.data[best_idx,:])) - print("starting best distance") - print(best_dist) + # Populate the heap using 2k elements; k on each side of our first guess: + low_idx = max(best_idx - k, 0) + high_idx = min(best_idx + k, self.distances.shape[0]) + print(f"low_idx_possible: {low_idx_possible}; high_idx_possible: {high_idx_possible} with low_idx: {low_idx}, high_idx: {high_idx}") + for i in range(low_idx, high_idx + 1): + if i < self.distances.shape[0]: + test_dist = cdist([self.data_arr[self.idx_array_arr[i],:]], X) + heap.push(0, test_dist, self.idx_array_arr[i]) + + print("starting best distance heap") + print(f"{heap.get_arrays(sort=False)}") # Establish bounds between which to search - low_idx_possible = _find_nearest_sorted_2D(self.distances, q_dists[0, 0] - best_dist) - high_idx_possible = _find_nearest_sorted_2D(self.distances, q_dists[0, 0] + best_dist) - low_idx = max(best_idx - 1, 0) - high_idx = min(best_idx + 1, self.distances.shape[0]) - print(f"low_idx_possible: {low_idx_possible}; high_idx_possible: {high_idx_possible}") + if heap.largest(0) != np.inf: + low_idx_possible = 0 + high_idx_possible = self.data_arr.shape[0] + else: + low_idx_possible = _find_nearest_sorted_2D(self.distances, q_dists[0, 0] - heap.largest(0)) + high_idx_possible = _find_nearest_sorted_2D(self.distances, q_dists[0, 0] + heap.largest(0)) # Consider adding chunking here # So we test more than one new point at a time @@ -215,6 +226,8 @@ cdef class TrilaterationIndex: print(f"breaking because {low_idx} <= {low_idx_possible} and {high_idx} >= {high_idx_possible}") break + print(f"heap.largest(0) = {heap.largest(0)}") + # Determine whether the next high or low point is a better test: lowdelta = fabs(self.distances[low_idx, 0] - q_dists[0, 0]) highdelta = fabs(self.distances[high_idx, 0] - q_dists[0, 0]) @@ -236,26 +249,27 @@ cdef class TrilaterationIndex: print(f"testing point at index {test_idx}: data[{self.idx_array[test_idx]}] {np.asarray(self.data[self.idx_array[test_idx],:])}") # Check that all pre-calculated distances are better than best sufficient = True + for d, q in zip(self.distances[test_idx,1:], q_dists[0,1:]): - print(f"testing that: {abs(d-q)} < {best_dist}") - if abs(d-q) > best_dist: + print(f"testing that: {abs(d-q)} < {heap.largest(0)}") + if abs(d-q) > heap.largest(0): sufficient = False break if sufficient: test_dist = cdist([self.data_arr[self.idx_array[test_idx],:]], X) print(f"{test_idx} is sufficient... test_dist is: {test_dist}") - if test_dist < best_dist: - print("replacing best with test...") - print(f"new best idx: {best_idx} ({self.idx_array[best_idx]}) with dist: {test_dist}") + if test_dist < heap.largest(0): + heap.push(0, test_dist, self.idx_array[test_idx]) + print(f"pushing idx: {best_idx} ({self.idx_array[best_idx]}) with dist: {test_dist}") best_idx = test_idx - best_dist = test_dist - low_idx_possible = _find_nearest_sorted_2D(self.distances, q_dists[0, 0] - best_dist) - high_idx_possible = _find_nearest_sorted_2D(self.distances, q_dists[0, 0] + best_dist) + low_idx_possible = _find_nearest_sorted_2D(self.distances, q_dists[0, 0] - heap.largest(0)) + high_idx_possible = _find_nearest_sorted_2D(self.distances, q_dists[0, 0] + heap.largest(0)) continue - return (self.idx_array[best_idx], best_dist) + # return (self.idx_array[best_idx], best_dist) + return heap.get_arrays() diff --git a/trilat_test_10.py b/trilat_test_10.py index 422ecb1220f82..c69c7740f90f5 100644 --- a/trilat_test_10.py +++ b/trilat_test_10.py @@ -35,11 +35,12 @@ from sklearn.neighbors import NearestNeighbors, KDTree from sklearn.neighbors import TrilaterationIndex -neigh = NearestNeighbors(radius=10) tree = KDTree(samples) tq = tree.query(querypoint, k=2) print("tree query results:") print(tq) + +neigh = NearestNeighbors(radius=10) neigh.fit(samples) print("finding neighbors within radius:") @@ -81,3 +82,9 @@ result2 = t2.query(X=np.asarray(querypoint), k=1) print(result2) print("This should be 14.30117 something") + + +t2 = TrilaterationIndex(samples) +result2 = t2.query(X=np.asarray(querypoint), k=3) +print(result2) +print("This should be 14.30117 something") \ No newline at end of file From 605e52b0a6f08e6f6de1863ab8937184218bc76d Mon Sep 17 00:00:00 2001 From: Chip Lynch Date: Thu, 11 Feb 2021 16:02:28 -0800 Subject: [PATCH 06/19] Remove evil print statements --- sklearn/neighbors/_trilateration.pyx | 40 +++++++++++++++------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/sklearn/neighbors/_trilateration.pyx b/sklearn/neighbors/_trilateration.pyx index 0bc83ef131eb2..428a69258603a 100644 --- a/sklearn/neighbors/_trilateration.pyx +++ b/sklearn/neighbors/_trilateration.pyx @@ -71,8 +71,8 @@ cdef class TrilaterationIndex: # Otherwise they are generated based on input data if ref is None: ref = self._create_ref_points() - print("generated synthetic reference points:") - print(ref) + # print("generated synthetic reference points:") + # print(ref) self.ref_points_arr = check_array(ref, dtype=DTYPE, order='C') @@ -141,7 +141,9 @@ cdef class TrilaterationIndex: query the index for k nearest neighbors """ - if X.shape[0] == 1: + if isinstance(X, list): + results = self._query_one(np.asarray(X), k, return_distance, sort_results) + elif X.shape[0] == 1: results=self._query_one(X, k, return_distance, sort_results) else: results = [self._query_one(x, k=k, @@ -151,7 +153,7 @@ cdef class TrilaterationIndex: return results - def _query_one(self, X, k=5, + cdef _query_one(self, X, k=5, return_distance=True, sort_results=True): """ @@ -172,7 +174,7 @@ cdef class TrilaterationIndex: # Can probably improve this - using a heap that allows more than 1 value # but we're always only using one here (X.shape[0] must be 1 from above) cdef NeighborsHeap heap = NeighborsHeap(X.shape[0], k) - print(f"initialized heap: {heap.get_arrays(sort=False)}") + # print(f"initialized heap: {heap.get_arrays(sort=False)}") # Establish the distances from the query point to the reference points cdef np.ndarray q_dists @@ -193,21 +195,21 @@ cdef class TrilaterationIndex: # print(f"max heap: {heap.get_max(0)}") # print(f"largest: {heap.largest(0)}") - print(f"first best guess: {best_idx} data[{self.idx_array_arr[best_idx]}]") - print(best_idx) - print(np.asarray(self.data[best_idx,:])) + # print(f"first best guess: {best_idx} data[{self.idx_array_arr[best_idx]}]") + # print(best_idx) + # print(np.asarray(self.data[best_idx,:])) # Populate the heap using 2k elements; k on each side of our first guess: low_idx = max(best_idx - k, 0) high_idx = min(best_idx + k, self.distances.shape[0]) - print(f"low_idx_possible: {low_idx_possible}; high_idx_possible: {high_idx_possible} with low_idx: {low_idx}, high_idx: {high_idx}") + # print(f"low_idx_possible: {low_idx_possible}; high_idx_possible: {high_idx_possible} with low_idx: {low_idx}, high_idx: {high_idx}") for i in range(low_idx, high_idx + 1): if i < self.distances.shape[0]: test_dist = cdist([self.data_arr[self.idx_array_arr[i],:]], X) heap.push(0, test_dist, self.idx_array_arr[i]) - print("starting best distance heap") - print(f"{heap.get_arrays(sort=False)}") + # print("starting best distance heap") + # print(f"{heap.get_arrays(sort=False)}") # Establish bounds between which to search if heap.largest(0) != np.inf: @@ -223,16 +225,16 @@ cdef class TrilaterationIndex: while True: if low_idx <= low_idx_possible and high_idx >= high_idx_possible: - print(f"breaking because {low_idx} <= {low_idx_possible} and {high_idx} >= {high_idx_possible}") + # print(f"breaking because {low_idx} <= {low_idx_possible} and {high_idx} >= {high_idx_possible}") break - print(f"heap.largest(0) = {heap.largest(0)}") + # print(f"heap.largest(0) = {heap.largest(0)}") # Determine whether the next high or low point is a better test: lowdelta = fabs(self.distances[low_idx, 0] - q_dists[0, 0]) highdelta = fabs(self.distances[high_idx, 0] - q_dists[0, 0]) - print(f"comparing: {low_idx}, {high_idx}") - print(f"lowdelta: {lowdelta}, highdelta: {highdelta}") + # print(f"comparing: {low_idx}, {high_idx}") + # print(f"lowdelta: {lowdelta}, highdelta: {highdelta}") if lowdelta <= highdelta and low_idx >= low_idx_possible: test_idx = low_idx low_idx = low_idx - 1 @@ -246,22 +248,22 @@ cdef class TrilaterationIndex: print("why?") break - print(f"testing point at index {test_idx}: data[{self.idx_array[test_idx]}] {np.asarray(self.data[self.idx_array[test_idx],:])}") + # print(f"testing point at index {test_idx}: data[{self.idx_array[test_idx]}] {np.asarray(self.data[self.idx_array[test_idx],:])}") # Check that all pre-calculated distances are better than best sufficient = True for d, q in zip(self.distances[test_idx,1:], q_dists[0,1:]): - print(f"testing that: {abs(d-q)} < {heap.largest(0)}") + # print(f"testing that: {abs(d-q)} < {heap.largest(0)}") if abs(d-q) > heap.largest(0): sufficient = False break if sufficient: test_dist = cdist([self.data_arr[self.idx_array[test_idx],:]], X) - print(f"{test_idx} is sufficient... test_dist is: {test_dist}") + # print(f"{test_idx} is sufficient... test_dist is: {test_dist}") if test_dist < heap.largest(0): heap.push(0, test_dist, self.idx_array[test_idx]) - print(f"pushing idx: {best_idx} ({self.idx_array[best_idx]}) with dist: {test_dist}") + # print(f"pushing idx: {best_idx} ({self.idx_array[best_idx]}) with dist: {test_dist}") best_idx = test_idx low_idx_possible = _find_nearest_sorted_2D(self.distances, q_dists[0, 0] - heap.largest(0)) high_idx_possible = _find_nearest_sorted_2D(self.distances, q_dists[0, 0] + heap.largest(0)) From 7a1973233bd8fc1c791e9aeb2b3f89c00b9725d9 Mon Sep 17 00:00:00 2001 From: Chip Lynch Date: Fri, 12 Feb 2021 14:39:07 -0800 Subject: [PATCH 07/19] WIP commit - adding query_radius --- sklearn/neighbors/_helpers.pxi | 9 ++ sklearn/neighbors/_trilateration.pyx | 118 +++++++++++++++++++++++++-- 2 files changed, 118 insertions(+), 9 deletions(-) diff --git a/sklearn/neighbors/_helpers.pxi b/sklearn/neighbors/_helpers.pxi index 07d187eaf4e1f..4ecc6569845ba 100644 --- a/sklearn/neighbors/_helpers.pxi +++ b/sklearn/neighbors/_helpers.pxi @@ -35,6 +35,15 @@ cdef int _find_nearest_sorted_2D(DTYPE_t[:,:] rdists, DTYPE_t target): idx = idx - 1 return idx +cdef int _find_nearest_sorted_1D(DTYPE_t[:] rdists, DTYPE_t target): + """ rdists must be sorted by its first column. + Can we do this faster with Memoryviews? See _simultaneous_sort + """ + cdef int idx + idx = np.searchsorted(rdists[:], target, side="left") + if idx > 0 and (idx == len(rdists) or fabs(target - rdists[idx-1]) < fabs(target - rdists[idx])): + idx = idx - 1 + return idx cdef class NeighborsHeap: """A max-heap structure to keep track of distances/indices of neighbors diff --git a/sklearn/neighbors/_trilateration.pyx b/sklearn/neighbors/_trilateration.pyx index 428a69258603a..b73d31e4553d8 100644 --- a/sklearn/neighbors/_trilateration.pyx +++ b/sklearn/neighbors/_trilateration.pyx @@ -53,6 +53,11 @@ cdef class TrilaterationIndex: cdef np.ndarray idx_array_arr cdef np.ndarray r0sortorder cdef np.ndarray ref_dists + cdef np.ndarray r_indexes + cdef np.ndarray r_distances + + cdef readonly ITYPE_t[:, ::1] r_indexes_mv + cdef readonly DTYPE_t[:, ::1] r_distances_mv cdef readonly DTYPE_t[:, ::1] data cdef readonly DTYPE_t[:, ::1] ref_points @@ -109,10 +114,17 @@ cdef class TrilaterationIndex: self.ref_dists = self.ref_dists[r0sortorder] self.idx_array_arr = self.idx_array_arr[r0sortorder] - self._update_memviews() - # _find_nearest_sorted_2D(self.distances, -2) - # _find_nearest_sorted_2D(self.distances, 62) - # _find_nearest_sorted_2D(self.distances, 100) + # Build ADDITIONAL indexes each sorted by the specific ref dists + # Note that self.distances/self.ref_dists are ONLY sorted by r0 dist + self.distances = get_memview_DTYPE_2D(self.ref_dists) + self.data = get_memview_DTYPE_2D(self.data_arr) + self.idx_array = get_memview_ITYPE_1D(self.idx_array_arr) + self.ref_points = get_memview_DTYPE_2D(self.ref_points_arr) + + self._build_r_indexes() + self.r_indexes_mv = get_memview_ITYPE_2D(self.r_indexes) + self.r_distances_mv = get_memview_DTYPE_2D(self.r_distances) + def _create_ref_points(self): """ @@ -128,11 +140,28 @@ cdef class TrilaterationIndex: refs = [[0]*i+[MAX_REF]*(ndims-i) for i in range(ndims)] return np.asarray(refs) - def _update_memviews(self): - self.data = get_memview_DTYPE_2D(self.data_arr) - self.idx_array = get_memview_ITYPE_1D(self.idx_array_arr) - self.ref_points = get_memview_DTYPE_2D(self.ref_points_arr) - self.distances = get_memview_DTYPE_2D(self.ref_dists) + + cdef _build_r_indexes(self): + """ + Build a 2D array + """ + self.r_indexes = np.zeros((self.distances.shape[1], self.distances.shape[0]), dtype=ITYPE, order='C') + self.r_distances = np.zeros((self.distances.shape[1], self.distances.shape[0]), dtype=DTYPE, order='C') + + for i in range(self.distances.shape[1]): + print(type(self.idx_array), self.idx_array.shape) + print(self.idx_array) + print(type(self.distances), self.distances.shape) + self.r_indexes[i,] = self.idx_array + self.r_distances[i,] = self.distances[:,i] + if i > 0: # At this moment, the first column is already sorted + sortorder = np.argsort(self.r_distances[i,]) + self.r_indexes[i,] = self.r_indexes[i,sortorder] + self.r_distances[i,] = self.r_distances[i, sortorder] + + print(self.r_indexes) + print(self.r_distances) + return 0 def query(self, X, k=5, return_distance=True, @@ -273,6 +302,49 @@ cdef class TrilaterationIndex: # return (self.idx_array[best_idx], best_dist) return heap.get_arrays() + cdef _query_expand(self, X, k=5, + start_dist = 0.01, + return_distance=True, + sort_results=True): + """ + return k-nn by the expanding method... + starting with start_dist, iterate over query_radius + while expanding and contracting the radius to + arrive at a good result + """ + + cdef DTYPE_t radius, too_high, too_low, new_radius, STD_SCALE, MAX_FUDGE + cdef ITYPE_t approx_count, i, MAX_ITER + + MAX_ITER = 20 + MAX_FUDGE = 5.0 + STD_SCALE = 2.0 + + radius = start_dist + too_low = 0 + too_high = np.inf + approx_count = self.query_radius_approx(X, radius) + + for i in range(MAX_ITER): + if approx_count == k: + break + elif approx_count < k: + too_low = radius + new_radius = start_dist * STD_SCALE + if new_radius > too_high: + new_radius = (too_high + radius) / STD_SCALE + radius = new_radius + approx_count = self.query_radius_approx(X, radius) + continue + elif approx_count > k * MAX_FUDGE: + radius = (radius + too_low) / STD_SCALE + approx_count = self.query_radius_approx(X, radius) + else: + continue + + return too_low, too_high, approx_count + + def query_radius(self, X, r, @@ -291,3 +363,31 @@ cdef class TrilaterationIndex: "if sort_results is True") pass + + def query_radius_approx(self, X, r): + """ + query the index for neighbors within a radius r + return approximate results only (by relying on index distances) + """ + result = self._query_radius_approx(X, r) + return result + + cdef _query_radius_approx(self, X, r): + + # The main self.idx_array and self.distances sort ONLY by the first + # reference distance. Here we sort by all three. + # Tune this some day... + + cdef np.ndarray points_in_range = np.asarray([0]) + cdef np.ndarray q_dists + q_dists = self.dist_metric.pairwise(X, self.ref_points_arr) + + for i in range(self.r_indexes_mv.shape[1]): + low_idx = _find_nearest_sorted_1D(self.r_distances_mv[i, :], q_dists[0, 0] - r) + high_idx = _find_nearest_sorted_1D(self.r_distances_mv[i, :], q_dists[0, 0] + r) + + print(i) + + + return points_in_range + From 6daca29e7f44042e25889d5c2fa5f15aa3a9ec5d Mon Sep 17 00:00:00 2001 From: Chip Lynch Date: Fri, 12 Feb 2021 16:46:29 -0800 Subject: [PATCH 08/19] query_radius_approx is working, woot --- sklearn/neighbors/_trilateration.pyx | 31 ++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/sklearn/neighbors/_trilateration.pyx b/sklearn/neighbors/_trilateration.pyx index b73d31e4553d8..f8affa813137f 100644 --- a/sklearn/neighbors/_trilateration.pyx +++ b/sklearn/neighbors/_trilateration.pyx @@ -21,6 +21,8 @@ cimport numpy as np import numpy as np + +from functools import reduce from libc.math cimport fabs from scipy.spatial.distance import cdist @@ -378,16 +380,27 @@ cdef class TrilaterationIndex: # reference distance. Here we sort by all three. # Tune this some day... - cdef np.ndarray points_in_range = np.asarray([0]) + # cdef np.ndarray points_in_range = np.arange(self.r_indexes_mv.shape[0]).reshape(self.r_indexes_mv.shape[0]) cdef np.ndarray q_dists - q_dists = self.dist_metric.pairwise(X, self.ref_points_arr) - - for i in range(self.r_indexes_mv.shape[1]): - low_idx = _find_nearest_sorted_1D(self.r_distances_mv[i, :], q_dists[0, 0] - r) - high_idx = _find_nearest_sorted_1D(self.r_distances_mv[i, :], q_dists[0, 0] + r) - - print(i) + points_in_range = [] - return points_in_range + q_dists = self.dist_metric.pairwise(X, self.ref_points_arr) + print(f"q_dists: {q_dists[0]} - {q_dists[0, 1]}") + print(f"r.indexes.shape: {self.r_indexes_mv.shape}") + + for i in range(self.r_indexes_mv.shape[0]): + # low_idx = _find_nearest_sorted_1D(self.r_distances_mv[i, :], q_dists[0, i] - r) + low_idx = np.searchsorted(self.r_distances_mv[i, :], q_dists[0, i] - r, side="left") + # high_idx = _find_nearest_sorted_1D(self.r_distances_mv[i, :], q_dists[0, i] + r) + high_idx = np.searchsorted(self.r_distances_mv[i, :], q_dists[0, i] + r, side="right") + + print(f"low_idx: {low_idx}, high_idx: {high_idx}") + print(f"valid IDS: {self.r_indexes[i, low_idx:high_idx]}") + points_in_range.append(self.r_indexes[i, low_idx:high_idx]) + print(i) + + common_idx = reduce(np.intersect1d, points_in_range) + print(f"commonIDs: {common_idx}") + return common_idx From c6dd6c63dbe8919e8ff9d5e423ca1af238d429f9 Mon Sep 17 00:00:00 2001 From: Chip Lynch Date: Fri, 12 Feb 2021 17:58:07 -0800 Subject: [PATCH 09/19] Query expand is working! --- debug_query_radius.py | 86 ++++++++++++++++++++++++++++ sklearn/neighbors/_trilateration.pyx | 76 ++++++++++++++++++------ 2 files changed, 146 insertions(+), 16 deletions(-) create mode 100644 debug_query_radius.py diff --git a/debug_query_radius.py b/debug_query_radius.py new file mode 100644 index 0000000000000..c17953f73a615 --- /dev/null +++ b/debug_query_radius.py @@ -0,0 +1,86 @@ +# File to debug +# This file should never be published... +# Purge it from git before merging + +import csv +import sklearn +import numpy as np + +from scipy.spatial.distance import cdist + +# samples = [[0., 0., 0.], [0., .5, 0.], [1., 1., .5]] +querypoint = [[50, 65]] + +samples = [] +refpoints = [] + +with open('/home/chipmonkey/repos/TrilaterationIndex/data/point_sample_10.csv') as csvfile: + reader = csv.DictReader(csvfile, quoting=csv.QUOTE_NONNUMERIC) + for row in reader: + print(row) + samples.append([row['x'], row['y']]) + +with open('/home/chipmonkey/repos/TrilaterationIndex/data/sample_ref_points.csv') as csvfile: + reader = csv.DictReader(csvfile, quoting=csv.QUOTE_NONNUMERIC) + for row in reader: + print(row) + refpoints.append([row['x'], row['y']]) + +print(refpoints) + +print(samples) +print("Actual distances:") +print(cdist(samples, querypoint)) + +from sklearn.neighbors import NearestNeighbors, KDTree +from sklearn.neighbors import TrilaterationIndex + +tree = KDTree(samples) +tq = tree.query_radius(querypoint, r=20) +print("tree query results:") +print(tq) + +neigh = NearestNeighbors(radius=10) +neigh.fit(samples) + +print("finding neighbors within radius:") +rng = neigh.radius_neighbors([[44., 44.]]) +print("distances:") +print(np.asarray(rng[0][0])) +print("indexes:") +print(list(np.asarray(rng[1][0]))) +print("points:") +things = [samples[x] for x in list(np.asarray(rng[1][0]))] +print(things) +# [samples[x] for x in list(np.asarray(rng[1][0]))] +# print(samples[list(np.asarray(rng[1][0]))]) + +print(type(tree)) +print(type(tree.data)) +# dist, ind = tree.query +print(np.asarray(tree.node_data)) # Ohhh https://stackoverflow.com/questions/20978377/cython-convert-memory-view-to-numpy-array +print(tree.node_data[0]) + +ti = TrilaterationIndex(samples, refpoints) + + + +print(np.asarray(ti.ref_points)) +print(np.asarray(ti.idx_array)) +print("distances:") +print(np.asarray(ti.distances)) + +print(ti.ref_points.shape[0]) +print(ti.ref_points.shape[1]) + +result1 = ti.query_radius_approx(np.asarray(querypoint), 20) +print(f"query_radius_approx: {result1}") + + +result2 = ti.query_radius_approx(np.asarray(querypoint), 20) +print(f"query_radius: {result2}") + +print("------------") + +r3 = ti.query_expand(np.asarray(querypoint), 3) +print(f"query_expand: {r3}") \ No newline at end of file diff --git a/sklearn/neighbors/_trilateration.pyx b/sklearn/neighbors/_trilateration.pyx index f8affa813137f..524293aa51399 100644 --- a/sklearn/neighbors/_trilateration.pyx +++ b/sklearn/neighbors/_trilateration.pyx @@ -23,7 +23,7 @@ cimport numpy as np import numpy as np from functools import reduce -from libc.math cimport fabs +from libc.math cimport fabs, ceil from scipy.spatial.distance import cdist @@ -304,49 +304,79 @@ cdef class TrilaterationIndex: # return (self.idx_array[best_idx], best_dist) return heap.get_arrays() - cdef _query_expand(self, X, k=5, - start_dist = 0.01, - return_distance=True, - sort_results=True): + cdef _get_start_dist(self, X, k): + """ + For the expanding radius approach, we must make an initial guess + for a radius that may contain exactly k members. + We do this by selecting the R0 reference distances, and simply + expanding to k items, and returning the distance which separates + the k points along that sole axis + """ + cdef np.ndarray q_dists + cdef ITYPE_t low_idx, high_idx + q_dists = self.dist_metric.pairwise(X, self.ref_points_arr) + print(f"q_dists: {q_dists}") + + ground_idx = _find_nearest_sorted_2D(self.distances, q_dists[0, 0]) + # ground_idx = np.searchsorted(self.distances, q_dists[0, 0], side="left") + low_idx = ground_idx - (k//2) + high_idx = ground_idx + ceil(k/2) + + print(f"get_start_dist is {self.distances[high_idx, 0] - self.distances[low_idx, 0]}") + return(self.distances[high_idx, 0] - self.distances[low_idx, 0]) + + def query_expand(self, X, k, return_distance=True, sort_results=True): + return np.asarray(self._query_expand(X, k, return_distance, sort_results)) + + cdef _query_expand(self, X, k, + return_distance, + sort_results): """ return k-nn by the expanding method... - starting with start_dist, iterate over query_radius + select a starting radius, iterate over query_radius while expanding and contracting the radius to arrive at a good result """ cdef DTYPE_t radius, too_high, too_low, new_radius, STD_SCALE, MAX_FUDGE cdef ITYPE_t approx_count, i, MAX_ITER + cdef ITYPE_t[::1] approx_array + MAX_ITER = 20 - MAX_FUDGE = 5.0 + MAX_FUDGE = 1.0 # MAX_ITER should handle things... STD_SCALE = 2.0 - radius = start_dist + radius = self._get_start_dist(X, k) + too_low = 0 too_high = np.inf - approx_count = self.query_radius_approx(X, radius) + print(f"testing query_radius_approx with: {X}, {radius}") + approx_array = self._query_radius_approx(X, radius) + approx_count = approx_array.shape[0] for i in range(MAX_ITER): if approx_count == k: break elif approx_count < k: + print(f"{approx_count} is too few at radius {radius}") too_low = radius - new_radius = start_dist * STD_SCALE + new_radius = radius * STD_SCALE if new_radius > too_high: new_radius = (too_high + radius) / STD_SCALE radius = new_radius - approx_count = self.query_radius_approx(X, radius) + approx_array = self._query_radius_approx(X, radius) + approx_count = approx_array.shape[0] continue elif approx_count > k * MAX_FUDGE: + print(f"{approx_count} is too many at radius {radius}") radius = (radius + too_low) / STD_SCALE - approx_count = self.query_radius_approx(X, radius) + approx_array = self._query_radius_approx(X, radius) + approx_count = approx_array.shape[0] else: continue - return too_low, too_high, approx_count - - + return approx_array def query_radius(self, X, r, @@ -364,7 +394,21 @@ cdef class TrilaterationIndex: raise ValueError("return_distance must be True " "if sort_results is True") - pass + cdef ITYPE_t[::1] idx_within_r + + if isinstance(X, list): + idx_witin_r = self._query_radius_approx([X], r) + elif X.shape[0] == 1: + idx_witin_r = self._query_radius_approx(X, r) + else: + idx_witin_r = [self._query_radius_approx(x, r) for x in X] + + if return_distance: + dists_within_r = self.dist_metric.pairwise(self.data_arr[idx_within_r], X) + return idx_witin_r, dists_within_r + + return idx_witin_r + def query_radius_approx(self, X, r): """ From 3ec876ac7c80bcb676f8ad9eda8db60d3a637ff6 Mon Sep 17 00:00:00 2001 From: Chip Lynch Date: Sat, 13 Feb 2021 16:02:23 -0800 Subject: [PATCH 10/19] query and query_expand work with ann-benchmarks --- debug_query_radius.py | 41 ++++++++- sklearn/neighbors/_trilateration.pyx | 133 +++++++++++++++++---------- 2 files changed, 122 insertions(+), 52 deletions(-) diff --git a/debug_query_radius.py b/debug_query_radius.py index c17953f73a615..5b9ea31be65c4 100644 --- a/debug_query_radius.py +++ b/debug_query_radius.py @@ -80,7 +80,44 @@ result2 = ti.query_radius_approx(np.asarray(querypoint), 20) print(f"query_radius: {result2}") +rq = ti.query_radius(np.asarray(querypoint), 20) +print(f"rq: {rq}") + +print("------------") +print("single point tests") +r4 = ti.query(querypoint, 3) +ids, dists = r4 +print(f"query: {r4}") +print(f"indexes: {ids}, dists: {dists}") + +r3 = ti.query_expand(querypoint, 3) +ids, dists = r3 +print(f"query_expand: {r3}") +print(f"indexes: {np.asarray(ids)}", dists) + +print("Passed Single Point tests") print("------------") +print("Starting multi point tests") + +querypoints = np.asarray([np.asarray([50, 65]), + np.asarray([55, 70]), + np.asarray([45, 80])]) + +r4 = ti.query(querypoints, 3) +print(f"query results r4: {r4}") +print(f"r4.shape: {np.asarray(r4).shape}") + +ids, dists = r4 +print(f"indexes: {ids}, dists: {dists}") + +r3 = ti.query_expand(querypoints, 3) +ids, dists = r3 +print(f"query_expand: {r3}") +print(f"indexes: {np.asarray(ids)}", dists) + +# [(array([[14.30117536, 15.82017495, 16.12663017]]), array([[0, 5, 3]])), +# (array([[16.08773716, 16.85576594, 16.85576594]]), array([[5, 0, 0]])), +# (array([[ 4.37084771, 23.74124599, 23.74124599]]), array([[5, 3, 3]]))] -r3 = ti.query_expand(np.asarray(querypoint), 3) -print(f"query_expand: {r3}") \ No newline at end of file +# cython sklearn/neighbors/_trilateration.pyx -a +# cython sklearn/neighbors/_kd_tree.pyx -a diff --git a/sklearn/neighbors/_trilateration.pyx b/sklearn/neighbors/_trilateration.pyx index 524293aa51399..17aa3ddae60a8 100644 --- a/sklearn/neighbors/_trilateration.pyx +++ b/sklearn/neighbors/_trilateration.pyx @@ -151,9 +151,6 @@ cdef class TrilaterationIndex: self.r_distances = np.zeros((self.distances.shape[1], self.distances.shape[0]), dtype=DTYPE, order='C') for i in range(self.distances.shape[1]): - print(type(self.idx_array), self.idx_array.shape) - print(self.idx_array) - print(type(self.distances), self.distances.shape) self.r_indexes[i,] = self.idx_array self.r_distances[i,] = self.distances[:,i] if i > 0: # At this moment, the first column is already sorted @@ -161,8 +158,6 @@ cdef class TrilaterationIndex: self.r_indexes[i,] = self.r_indexes[i,sortorder] self.r_distances[i,] = self.r_distances[i, sortorder] - print(self.r_indexes) - print(self.r_distances) return 0 def query(self, X, k=5, @@ -177,10 +172,12 @@ cdef class TrilaterationIndex: elif X.shape[0] == 1: results=self._query_one(X, k, return_distance, sort_results) else: - results = [self._query_one(x, k=k, - return_distance=return_distance, - sort_results=sort_results) \ - for x in X] + raise NotImplementedError("Not yet") + # [print(x) for x in X] + # results = [self._query_one(np.asarray([x]), k=k, + # return_distance=return_distance, + # sort_results=sort_results) \ + # for x in X] return results @@ -205,7 +202,6 @@ cdef class TrilaterationIndex: # Can probably improve this - using a heap that allows more than 1 value # but we're always only using one here (X.shape[0] must be 1 from above) cdef NeighborsHeap heap = NeighborsHeap(X.shape[0], k) - # print(f"initialized heap: {heap.get_arrays(sort=False)}") # Establish the distances from the query point to the reference points cdef np.ndarray q_dists @@ -220,27 +216,17 @@ cdef class TrilaterationIndex: best_idx = _find_nearest_sorted_2D(self.distances, q_dists[0,0]) # best_dist = self.dist_metric.dist(self.data[best_idx,:], &Xarr, X.shape[1]) - # print(self.data_arr[best_idx,:].shape) best_dist = cdist([self.data_arr[self.idx_array_arr[best_idx],:]], X) # heap.push(0, best_dist, best_idx) - # print(f"max heap: {heap.get_max(0)}") - # print(f"largest: {heap.largest(0)}") - - # print(f"first best guess: {best_idx} data[{self.idx_array_arr[best_idx]}]") - # print(best_idx) - # print(np.asarray(self.data[best_idx,:])) # Populate the heap using 2k elements; k on each side of our first guess: low_idx = max(best_idx - k, 0) high_idx = min(best_idx + k, self.distances.shape[0]) - # print(f"low_idx_possible: {low_idx_possible}; high_idx_possible: {high_idx_possible} with low_idx: {low_idx}, high_idx: {high_idx}") for i in range(low_idx, high_idx + 1): if i < self.distances.shape[0]: test_dist = cdist([self.data_arr[self.idx_array_arr[i],:]], X) heap.push(0, test_dist, self.idx_array_arr[i]) - # print("starting best distance heap") - # print(f"{heap.get_arrays(sort=False)}") # Establish bounds between which to search if heap.largest(0) != np.inf: @@ -256,16 +242,11 @@ cdef class TrilaterationIndex: while True: if low_idx <= low_idx_possible and high_idx >= high_idx_possible: - # print(f"breaking because {low_idx} <= {low_idx_possible} and {high_idx} >= {high_idx_possible}") break - # print(f"heap.largest(0) = {heap.largest(0)}") - # Determine whether the next high or low point is a better test: lowdelta = fabs(self.distances[low_idx, 0] - q_dists[0, 0]) highdelta = fabs(self.distances[high_idx, 0] - q_dists[0, 0]) - # print(f"comparing: {low_idx}, {high_idx}") - # print(f"lowdelta: {lowdelta}, highdelta: {highdelta}") if lowdelta <= highdelta and low_idx >= low_idx_possible: test_idx = low_idx low_idx = low_idx - 1 @@ -314,23 +295,55 @@ cdef class TrilaterationIndex: """ cdef np.ndarray q_dists cdef ITYPE_t low_idx, high_idx + q_dists = self.dist_metric.pairwise(X, self.ref_points_arr) - print(f"q_dists: {q_dists}") ground_idx = _find_nearest_sorted_2D(self.distances, q_dists[0, 0]) # ground_idx = np.searchsorted(self.distances, q_dists[0, 0], side="left") low_idx = ground_idx - (k//2) high_idx = ground_idx + ceil(k/2) - print(f"get_start_dist is {self.distances[high_idx, 0] - self.distances[low_idx, 0]}") return(self.distances[high_idx, 0] - self.distances[low_idx, 0]) def query_expand(self, X, k, return_distance=True, sort_results=True): - return np.asarray(self._query_expand(X, k, return_distance, sort_results)) + """ + X can be what...? + A singlie list of one coordinate: [50, 25, 100] + """ + cdef np.ndarray idx_results + # cdef DTYPE_t[::1] dist_results + cdef np.ndarray dist_results + + + if isinstance(X, list): + X = np.asarray(X) + + cdef NeighborsHeap heap = NeighborsHeap(X.shape[0], k) + + if X.shape[0] == 1: + idx_results = np.asarray(self._query_expand(X, k)) + + else: + raise NotImplementedError("You can't do that yet") + # [print(x) for x in X] + # all_results = [self._query_expand(x, k) for x in X] + # print(f"all_results: {all_results}") + + dist_results = self.dist_metric.pairwise(self.data_arr[idx_results], X) + # best = np.argsort(dist_results) + # dist_results = dist_results[best] + # idx_results = idx_results[best] + + for idx, dist in zip(idx_results, dist_results): + heap.push(0, dist, idx) + + if return_distance: + return heap.get_arrays() - cdef _query_expand(self, X, k, - return_distance, - sort_results): + return idx_results + + + cdef _query_expand(self, X, k): """ return k-nn by the expanding method... select a starting radius, iterate over query_radius @@ -338,20 +351,27 @@ cdef class TrilaterationIndex: arrive at a good result """ + if X.shape[0] != 1: + raise ValueError("_query takes only a single X point" + "use query for multiple records") + + if X.shape[X.ndim - 1] != self.data.shape[1]: + raise ValueError("query data dimension must " + "match training data dimension") + cdef DTYPE_t radius, too_high, too_low, new_radius, STD_SCALE, MAX_FUDGE cdef ITYPE_t approx_count, i, MAX_ITER cdef ITYPE_t[::1] approx_array MAX_ITER = 20 - MAX_FUDGE = 1.0 # MAX_ITER should handle things... + MAX_FUDGE = 5.0 STD_SCALE = 2.0 radius = self._get_start_dist(X, k) too_low = 0 too_high = np.inf - print(f"testing query_radius_approx with: {X}, {radius}") approx_array = self._query_radius_approx(X, radius) approx_count = approx_array.shape[0] @@ -359,7 +379,6 @@ cdef class TrilaterationIndex: if approx_count == k: break elif approx_count < k: - print(f"{approx_count} is too few at radius {radius}") too_low = radius new_radius = radius * STD_SCALE if new_radius > too_high: @@ -369,7 +388,6 @@ cdef class TrilaterationIndex: approx_count = approx_array.shape[0] continue elif approx_count > k * MAX_FUDGE: - print(f"{approx_count} is too many at radius {radius}") radius = (radius + too_low) / STD_SCALE approx_array = self._query_radius_approx(X, radius) approx_count = approx_array.shape[0] @@ -394,20 +412,32 @@ cdef class TrilaterationIndex: raise ValueError("return_distance must be True " "if sort_results is True") - cdef ITYPE_t[::1] idx_within_r + # cdef ITYPE_t[::1] idx_within_r = np.asarray([1]) + cdef np.ndarray idx_within_r = np.asarray([1]) + cdef np.ndarray dists_within_r if isinstance(X, list): - idx_witin_r = self._query_radius_approx([X], r) - elif X.shape[0] == 1: - idx_witin_r = self._query_radius_approx(X, r) + X = np.asarray(X) + + if X.shape[0] == 1: + idx_within_r = self._query_radius_approx(X, r) else: - idx_witin_r = [self._query_radius_approx(x, r) for x in X] + idx_within_r = [self._query_radius_approx(x, r) for x in X] + + dists_within_r = self.dist_metric.pairwise(self.data_arr[idx_within_r], X).reshape(idx_within_r.shape[0]) + valid_idx = [x <= r for x in dists_within_r] + dists_within_r = dists_within_r[valid_idx] + idx_within_r = idx_within_r[valid_idx] + + if sort_results: + sort_order = np.argsort(dists_within_r) + dists_within_r = dists_within_r[sort_order] + idx_within_r = idx_within_r[sort_order] if return_distance: - dists_within_r = self.dist_metric.pairwise(self.data_arr[idx_within_r], X) - return idx_witin_r, dists_within_r + return idx_within_r, dists_within_r - return idx_witin_r + return idx_within_r def query_radius_approx(self, X, r): @@ -426,12 +456,16 @@ cdef class TrilaterationIndex: # cdef np.ndarray points_in_range = np.arange(self.r_indexes_mv.shape[0]).reshape(self.r_indexes_mv.shape[0]) cdef np.ndarray q_dists + # cdef ITYPE_t[::1] common_idx + cdef np.ndarray common_idx - points_in_range = [] + if isinstance(X, list) or X.ndim == 1: + X = np.asarray([X]) + points_in_range = [] q_dists = self.dist_metric.pairwise(X, self.ref_points_arr) - print(f"q_dists: {q_dists[0]} - {q_dists[0, 1]}") - print(f"r.indexes.shape: {self.r_indexes_mv.shape}") + # print(f"q_dists: {q_dists[0]} - {q_dists[0, 1]}") + # print(f"r.indexes.shape: {self.r_indexes_mv.shape}") for i in range(self.r_indexes_mv.shape[0]): # low_idx = _find_nearest_sorted_1D(self.r_distances_mv[i, :], q_dists[0, i] - r) @@ -439,12 +473,11 @@ cdef class TrilaterationIndex: # high_idx = _find_nearest_sorted_1D(self.r_distances_mv[i, :], q_dists[0, i] + r) high_idx = np.searchsorted(self.r_distances_mv[i, :], q_dists[0, i] + r, side="right") - print(f"low_idx: {low_idx}, high_idx: {high_idx}") - print(f"valid IDS: {self.r_indexes[i, low_idx:high_idx]}") + # print(f"low_idx: {low_idx}, high_idx: {high_idx}") + # print(f"valid IDS: {self.r_indexes[i, low_idx:high_idx]}") points_in_range.append(self.r_indexes[i, low_idx:high_idx]) - print(i) common_idx = reduce(np.intersect1d, points_in_range) - print(f"commonIDs: {common_idx}") + # print(f"commonIDs: {common_idx}") return common_idx From b76156187aaa45b1e4553894c696d4a15702dea8 Mon Sep 17 00:00:00 2001 From: Chip Lynch Date: Sat, 20 Feb 2021 22:40:00 -0800 Subject: [PATCH 11/19] query_radius_t3 is relatively performant now --- perftest.py | 84 ++++++++++++ sklearn/neighbors/_trilateration.pyx | 194 ++++++++++++++++++++++++--- 2 files changed, 263 insertions(+), 15 deletions(-) create mode 100644 perftest.py diff --git a/perftest.py b/perftest.py new file mode 100644 index 0000000000000..1775ab08b1718 --- /dev/null +++ b/perftest.py @@ -0,0 +1,84 @@ +import csv +import sklearn +import numpy as np +import timeit + +from scipy.spatial.distance import cdist + +querypoint = np.asarray([[38.25, -85.50]]) + +samples = [] +refpoints = [] + +with open('/home/chipmonkey/repos/TrilaterationIndex/data/lat_long_synthetic.csv') as csvfile: + reader = csv.DictReader(csvfile, quoting=csv.QUOTE_NONNUMERIC) + for row in reader: + # print(row) + samples.append([row['Latitude'], row['Longitude']]) + +with open('/home/chipmonkey/repos/TrilaterationIndex/data/ref_points.csv') as csvfile: + reader = csv.DictReader(csvfile, quoting=csv.QUOTE_NONNUMERIC) + for row in reader: + print(row) + refpoints.append([row['Latitude'], row['Longitude']]) + +from sklearn.neighbors import NearestNeighbors, KDTree +from sklearn.neighbors import TrilaterationIndex + +brute = NearestNeighbors(n_neighbors=1000, radius=0.07, algorithm='brute') +brute.fit(samples) +BF = brute.radius_neighbors(querypoint, 0.1) +print(f"brute force results: {BF[1][0]} ({len(BF[1][0])})") + + +tree = KDTree(samples) +# t = timeit.timeit(lambda: tree.query_radius(querypoint, r=0.07), number=500) +# print(f"KDTree 500x single point time: {t}") +tq = tree.query_radius(querypoint, r=0.1) +print(f"tree query results: {tq} ({len(tq[0])})") + + +trilat = TrilaterationIndex(samples) +# t = timeit.timeit(lambda: trilat.query_radius(querypoint, r=0.07), number=500) +# print(f"Trilat 500x single point time: {t}") +tr = trilat.query_radius(querypoint, r=0.1) +print(f"trilat query_radius: {tr} ({len(tr)})") + + +t3 = trilat.query_radius_t3(querypoint, r=0.1) +print(f"trilat query_radius_t3: {t3} ({len(t3)})") + +print(f"length match 1: {len(np.intersect1d(tq[0], tr))}") +print(f"length match 2: {len(np.intersect1d(tq[0], t3))}") + +print(f"setdiff: {np.setdiff1d(tq[0], t3)}") +if 18 in tq[0]: + print("18 in tq") +if 18 in t3: + print("18 in t3") + + +import pstats, cProfile +import pyximport +pyximport.install + +import sklearn +# print(type(querypoint)) +# cProfile.runctx('trilat.query_radius(querypoint, r=0.07)', globals(), locals(), "Profile.prof") + +cProfile.runctx('timeit.timeit(lambda: brute.radius_neighbors(querypoint, 0.07), number=500)', globals(), locals(), "BFProfile.prof") +cProfile.runctx('timeit.timeit(lambda: tree.query_radius(querypoint, r=0.07), number=500)', globals(), locals(), "KDProfile.prof") +cProfile.runctx('timeit.timeit(lambda: trilat.query_radius(querypoint, r=0.07), number=500)', globals(), locals(), "TRProfile.prof") +cProfile.runctx('timeit.timeit(lambda: trilat.query_radius_t3(querypoint, r=0.07), number=500)', globals(), locals(), "T3Profile.prof") + +s = pstats.Stats("BFProfile.prof") +s.strip_dirs().sort_stats("cumtime").print_stats() + +s = pstats.Stats("KDProfile.prof") +s.strip_dirs().sort_stats("cumtime").print_stats() + +s = pstats.Stats("TRProfile.prof") +s.strip_dirs().sort_stats("cumtime").print_stats() + +s = pstats.Stats("T3Profile.prof") +s.strip_dirs().sort_stats("cumtime").print_stats() \ No newline at end of file diff --git a/sklearn/neighbors/_trilateration.pyx b/sklearn/neighbors/_trilateration.pyx index 17aa3ddae60a8..8880e12692cf2 100644 --- a/sklearn/neighbors/_trilateration.pyx +++ b/sklearn/neighbors/_trilateration.pyx @@ -2,6 +2,7 @@ #cython: boundscheck=False #cython: wraparound=False #cython: cdivision=True +#cython: profile=True # Trilateration (or n-fold lateration for arbitrary dimensions) # ===================== @@ -22,6 +23,7 @@ cimport numpy as np import numpy as np +from collections import Counter from functools import reduce from libc.math cimport fabs, ceil @@ -66,8 +68,11 @@ cdef class TrilaterationIndex: cdef public ITYPE_t[::1] idx_array cdef readonly DTYPE_t[:, ::1] distances + cdef int ndims + cdef DistanceMetric dist_metric + def __cinit__(self): self.data_arr = np.empty((1, 1), dtype=DTYPE, order='C') @@ -138,8 +143,8 @@ cdef class TrilaterationIndex: cdef DTYPE_t MAX_REF = 9999.0 - ndims = self.data_arr.shape[1] - refs = [[0]*i+[MAX_REF]*(ndims-i) for i in range(ndims)] + self.ndims = self.data_arr.shape[1] + refs = [[0]*i+[MAX_REF]*(self.ndims-i) for i in range(self.ndims)] return np.asarray(refs) @@ -168,9 +173,9 @@ cdef class TrilaterationIndex: """ if isinstance(X, list): - results = self._query_one(np.asarray(X), k, return_distance, sort_results) + results = self._query_one(np.asarray(X), k, return_distance) elif X.shape[0] == 1: - results=self._query_one(X, k, return_distance, sort_results) + results=self._query_one(X, k, return_distance) else: raise NotImplementedError("Not yet") # [print(x) for x in X] @@ -182,8 +187,7 @@ cdef class TrilaterationIndex: return results cdef _query_one(self, X, k=5, - return_distance=True, - sort_results=True): + return_distance=True): """ query the index for k nearest neighbors """ @@ -372,7 +376,9 @@ cdef class TrilaterationIndex: too_low = 0 too_high = np.inf - approx_array = self._query_radius_approx(X, radius) + # approx_array = self._query_radius_at_least_k(X, radius, k) + approx_array = self._query_radius_r0_only(X, radius) + approx_count = approx_array.shape[0] for i in range(MAX_ITER): @@ -384,12 +390,12 @@ cdef class TrilaterationIndex: if new_radius > too_high: new_radius = (too_high + radius) / STD_SCALE radius = new_radius - approx_array = self._query_radius_approx(X, radius) + approx_array = self._query_radius_at_least_k(X, radius, k) approx_count = approx_array.shape[0] continue elif approx_count > k * MAX_FUDGE: radius = (radius + too_low) / STD_SCALE - approx_array = self._query_radius_approx(X, radius) + approx_array = self._query_radius_at_least_k(X, radius, k) approx_count = approx_array.shape[0] else: continue @@ -449,15 +455,17 @@ cdef class TrilaterationIndex: return result cdef _query_radius_approx(self, X, r): + """ - # The main self.idx_array and self.distances sort ONLY by the first - # reference distance. Here we sort by all three. - # Tune this some day... + """ # cdef np.ndarray points_in_range = np.arange(self.r_indexes_mv.shape[0]).reshape(self.r_indexes_mv.shape[0]) cdef np.ndarray q_dists # cdef ITYPE_t[::1] common_idx cdef np.ndarray common_idx + cdef np.ndarray new_points + + point_counts = Counter() if isinstance(X, list) or X.ndim == 1: X = np.asarray([X]) @@ -467,7 +475,7 @@ cdef class TrilaterationIndex: # print(f"q_dists: {q_dists[0]} - {q_dists[0, 1]}") # print(f"r.indexes.shape: {self.r_indexes_mv.shape}") - for i in range(self.r_indexes_mv.shape[0]): + for i in range(self.ndims): # low_idx = _find_nearest_sorted_1D(self.r_distances_mv[i, :], q_dists[0, i] - r) low_idx = np.searchsorted(self.r_distances_mv[i, :], q_dists[0, i] - r, side="left") # high_idx = _find_nearest_sorted_1D(self.r_distances_mv[i, :], q_dists[0, i] + r) @@ -475,9 +483,165 @@ cdef class TrilaterationIndex: # print(f"low_idx: {low_idx}, high_idx: {high_idx}") # print(f"valid IDS: {self.r_indexes[i, low_idx:high_idx]}") - points_in_range.append(self.r_indexes[i, low_idx:high_idx]) + new_points = self.r_indexes[i, low_idx:high_idx] + # print(f"new_points: {new_points} len: {len(new_points)}") + # point_counts.update(new_points) # Failed performance improvement test; leaving for posterity + points_in_range.append(new_points) - common_idx = reduce(np.intersect1d, points_in_range) + # THIS IS THE PERFORMANCE PROBLEM: + common_idx = reduce(np.intersect1d, points_in_range) # This works but seems slow (fastest so far tho) + # common_idx = np.asarray([k for (k, v) in point_counts.items() if v == self.r_indexes_mv.shape[0]]) + # print(f"commonIDs: {common_idx}") return common_idx + cdef _query_radius_at_least_k(self, X, r, k): + """ for use with expanding estimates... + break early if we've eliminated too many points + """ + + cdef np.ndarray q_dists + cdef np.ndarray common_idx + + if isinstance(X, list) or X.ndim == 1: + X = np.asarray([X]) + + points_in_range = [] + new_points = [] + + q_dists = self.dist_metric.pairwise(X, self.ref_points_arr) + + for i in range(self.r_indexes_mv.shape[0]): + low_idx = np.searchsorted(self.r_distances_mv[i, :], q_dists[0, i] - r, side="left") + high_idx = np.searchsorted(self.r_distances_mv[i, :], q_dists[0, i] + r, side="right") + + new_points = self.r_indexes[i, low_idx:high_idx] + + if points_in_range == []: + points_in_range = new_points + else: + # THIS IS THE PERFORMANCE PROBLEM: + points_in_range = np.intersect1d(points_in_range, new_points) # This is the line that works, but slowly + # At Least k voodoo: + if len(points_in_range) < k: + break + + common_idx = np.asarray(points_in_range) + return common_idx + + + def query_radius_t3(self, X, r): + """ + query the index for neighbors within a radius r + return approximate results only (by relying on index distances) + """ + # result = np.asarray(self._query_radius_r0_only(X, r)) + result = self._query_radius_r0_only(X, r) + + return result + + cdef _query_radius_r0_only(self, X, r): + """ + This approach uses only the original sort order from Ref Point 0 + and then brute forces whatever that index indicates MAY be in range + + NOT FINISHED + """ + + cdef DTYPE_t[:, ::1] q_dists # , dist_results + cdef np.ndarray[DTYPE_t, ndim=1] dist_results + # cdef DTYPE_t[::1] dist_results + cdef ITYPE_t[::1] approx_idxs, results + # cdef np.ndarray[DTYPE_t, ndim=2] q_dists, dist_results + # cdef np.ndarray[ITYPE_t] approx_idxs #, points_in_range + cdef ITYPE_t i, low_idx, high_idx, count + + if isinstance(X, list) or X.ndim == 1: + X = np.asarray([X]) + + # points_in_range = [] + + q_dists = self.dist_metric.pairwise(X, self.ref_points_arr) + # ground_idx = _find_nearest_sorted_2D(self.distances, q_dists[0, 0]) + + # Which of the following two pairs are reliably faster? Probably neither: + # low_idx = np.searchsorted(self.r_distances_mv[0,:], q_dists[0, 0] - r, side="left") + # high_idx = np.searchsorted(self.r_distances_mv[0,:], q_dists[0, 0] + r, side="right") + + low_idx = np.searchsorted(self.distances[:,0], q_dists[0, 0] - r, side="left") + high_idx = np.searchsorted(self.distances[:,0], q_dists[0, 0] + r, side="right") + + # print(f"self.distances.shape: {self.distances.shape}, low_idx: {low_idx}, high_idx: {high_idx}") + # print(f"self.distances[low_idx, :]: {np.asarray(self.distances[low_idx,:])}") + # print(f"self.distances[low_idx+1, :]: {np.asarray(self.distances[low_idx+1,:])}") + # print(f"self.distances[low_idx+2, :]: {np.asarray(self.distances[low_idx+2,:])}") + + # print(f"q_dists[0, :]: {q_dists[0, :]}") + # print(f"self.distances - qd: {abs(np.asarray(self.distances[low_idx,:] - q_dists[0, :]))} vs r: {r}") + + # The next two lines work but are 10 times slower than just brute alone (as-is) + # approx_idxs = np.asarray([i for i in range(low_idx, high_idx+1) \ + # if np.all(abs(np.asarray(self.distances[i,:]) - q_dists[0, :]) <= r)]) + # approx_idxs = [self.idx_array[i] for i in approx_idxs] + approx_idxs = self.idx_array[low_idx:high_idx+1] + # print(f"approx_idxs (len {len(approx_idxs)}): {np.array(approx_idxs)}") + + dist_results = self.dist_metric.pairwise(self.data_arr[approx_idxs], X).reshape(approx_idxs.shape[0]) + # print(f"len(dist_results): {len(dist_results)} or {dist_results.shape[0]} low_idx:high_idx {low_idx}:{high_idx}") + dist_results = dist_results.reshape(dist_results.shape[0]) + # print(f"dist_results: {np.asarray(dist_results)}") + + # cdef ITYPE_t result_count + # result_count = np.count_nonzero(dist_results <= r) + # print(f"dist_results <= r: {dist_results <= r}") + # print(f"result_count: {result_count}") + + # result_arr = np.zeros(result_count, dtype=ITYPE) + result_arr = np.zeros(len(dist_results), dtype=ITYPE) + # results = get_memview_ITYPE_1D(result_arr) # If you replace result_arr with results from here on you get nonsense. I should understand, but do not. + + count = 0 + for i in range(len(dist_results)): + # print(f"{i} (count: {count}) testing point {approx_idxs[i]} with dist: {dist_results[i]}") + if dist_results[i] <= r: + # if approx_idxs[i] == 18: + # print(f"adding {approx_idxs[i]}") + result_arr[count] = approx_idxs[i] + count += 1 + + + # NOW WHAT? + # results_arr = approx_idxs[dist_results <= r] + + # Why is this slow: ? + # results = approx_idxs[[range(8000)]] + + # This alone is slow... why? + # cdef np.array(dtype=bool) things # Not sure how to declare this + # cdef ITYPE_t[::1] things + # things = [x <= r for x in dist_results] + + # for i in range(len(dist_results)): + # if things[i]: + # results.append(approx_idxs[i]) + + # This is slow but works: + # results = approx_idxs[[x[0] <= r for x in dist_results]] + + # This is even slower but works: + # results = np.asarray([approx_idxs[i] for i in range(len(approx_idxs)) if dist_results[i] <= r]) + + # print(f"results: {results}") + # return results + # return np.sort(np.asarray(results[0:count])) + return np.asarray(result_arr[0:count]) + + + cdef _query_radius_best_r_only(self, X, r): + """ + This approach counts the candidates using each reference point by themselves + then brute forces the indexees from the best possible result + + NOT FINISHED + """ + pass \ No newline at end of file From a2cb2f3cf0eb1b9f7952daa1c4f64cd105de8943 Mon Sep 17 00:00:00 2001 From: Chip Lynch Date: Tue, 23 Feb 2021 10:00:50 -0800 Subject: [PATCH 12/19] WIP for _query_expand_2. Slight perf improvement for query_radius --- perftest.py | 84 ------------------- sklearn/neighbors/_trilateration.pyx | 121 ++++++++++++++++++++++----- 2 files changed, 99 insertions(+), 106 deletions(-) delete mode 100644 perftest.py diff --git a/perftest.py b/perftest.py deleted file mode 100644 index 1775ab08b1718..0000000000000 --- a/perftest.py +++ /dev/null @@ -1,84 +0,0 @@ -import csv -import sklearn -import numpy as np -import timeit - -from scipy.spatial.distance import cdist - -querypoint = np.asarray([[38.25, -85.50]]) - -samples = [] -refpoints = [] - -with open('/home/chipmonkey/repos/TrilaterationIndex/data/lat_long_synthetic.csv') as csvfile: - reader = csv.DictReader(csvfile, quoting=csv.QUOTE_NONNUMERIC) - for row in reader: - # print(row) - samples.append([row['Latitude'], row['Longitude']]) - -with open('/home/chipmonkey/repos/TrilaterationIndex/data/ref_points.csv') as csvfile: - reader = csv.DictReader(csvfile, quoting=csv.QUOTE_NONNUMERIC) - for row in reader: - print(row) - refpoints.append([row['Latitude'], row['Longitude']]) - -from sklearn.neighbors import NearestNeighbors, KDTree -from sklearn.neighbors import TrilaterationIndex - -brute = NearestNeighbors(n_neighbors=1000, radius=0.07, algorithm='brute') -brute.fit(samples) -BF = brute.radius_neighbors(querypoint, 0.1) -print(f"brute force results: {BF[1][0]} ({len(BF[1][0])})") - - -tree = KDTree(samples) -# t = timeit.timeit(lambda: tree.query_radius(querypoint, r=0.07), number=500) -# print(f"KDTree 500x single point time: {t}") -tq = tree.query_radius(querypoint, r=0.1) -print(f"tree query results: {tq} ({len(tq[0])})") - - -trilat = TrilaterationIndex(samples) -# t = timeit.timeit(lambda: trilat.query_radius(querypoint, r=0.07), number=500) -# print(f"Trilat 500x single point time: {t}") -tr = trilat.query_radius(querypoint, r=0.1) -print(f"trilat query_radius: {tr} ({len(tr)})") - - -t3 = trilat.query_radius_t3(querypoint, r=0.1) -print(f"trilat query_radius_t3: {t3} ({len(t3)})") - -print(f"length match 1: {len(np.intersect1d(tq[0], tr))}") -print(f"length match 2: {len(np.intersect1d(tq[0], t3))}") - -print(f"setdiff: {np.setdiff1d(tq[0], t3)}") -if 18 in tq[0]: - print("18 in tq") -if 18 in t3: - print("18 in t3") - - -import pstats, cProfile -import pyximport -pyximport.install - -import sklearn -# print(type(querypoint)) -# cProfile.runctx('trilat.query_radius(querypoint, r=0.07)', globals(), locals(), "Profile.prof") - -cProfile.runctx('timeit.timeit(lambda: brute.radius_neighbors(querypoint, 0.07), number=500)', globals(), locals(), "BFProfile.prof") -cProfile.runctx('timeit.timeit(lambda: tree.query_radius(querypoint, r=0.07), number=500)', globals(), locals(), "KDProfile.prof") -cProfile.runctx('timeit.timeit(lambda: trilat.query_radius(querypoint, r=0.07), number=500)', globals(), locals(), "TRProfile.prof") -cProfile.runctx('timeit.timeit(lambda: trilat.query_radius_t3(querypoint, r=0.07), number=500)', globals(), locals(), "T3Profile.prof") - -s = pstats.Stats("BFProfile.prof") -s.strip_dirs().sort_stats("cumtime").print_stats() - -s = pstats.Stats("KDProfile.prof") -s.strip_dirs().sort_stats("cumtime").print_stats() - -s = pstats.Stats("TRProfile.prof") -s.strip_dirs().sort_stats("cumtime").print_stats() - -s = pstats.Stats("T3Profile.prof") -s.strip_dirs().sort_stats("cumtime").print_stats() \ No newline at end of file diff --git a/sklearn/neighbors/_trilateration.pyx b/sklearn/neighbors/_trilateration.pyx index 8880e12692cf2..848af20a3f0c5 100644 --- a/sklearn/neighbors/_trilateration.pyx +++ b/sklearn/neighbors/_trilateration.pyx @@ -189,7 +189,7 @@ cdef class TrilaterationIndex: cdef _query_one(self, X, k=5, return_distance=True): """ - query the index for k nearest neighbors + query the index for k nearest neighbors for a single point X """ if X.shape[0] != 1: raise ValueError("_query takes only a single X point" @@ -309,7 +309,8 @@ cdef class TrilaterationIndex: return(self.distances[high_idx, 0] - self.distances[low_idx, 0]) - def query_expand(self, X, k, return_distance=True, sort_results=True): + def query_expand(self, X, k, return_distance=True, sort_results=True, + mfudge=5.0, miter=20, sscale=2.0): """ X can be what...? A singlie list of one coordinate: [50, 25, 100] @@ -325,7 +326,7 @@ cdef class TrilaterationIndex: cdef NeighborsHeap heap = NeighborsHeap(X.shape[0], k) if X.shape[0] == 1: - idx_results = np.asarray(self._query_expand(X, k)) + idx_results = np.asarray(self._query_expand(X, k, mfudge, miter, sscale)) else: raise NotImplementedError("You can't do that yet") @@ -333,21 +334,28 @@ cdef class TrilaterationIndex: # all_results = [self._query_expand(x, k) for x in X] # print(f"all_results: {all_results}") - dist_results = self.dist_metric.pairwise(self.data_arr[idx_results], X) - # best = np.argsort(dist_results) - # dist_results = dist_results[best] - # idx_results = idx_results[best] - - for idx, dist in zip(idx_results, dist_results): - heap.push(0, dist, idx) - if return_distance: + dist_results = self.dist_metric.pairwise(self.data_arr[idx_results], X) + # best = np.argsort(dist_results) + # dist_results = dist_results[best] + # idx_results = idx_results[best] + for idx, dist in zip(idx_results, dist_results): + heap.push(0, dist, idx) + return heap.get_arrays() return idx_results + cdef ITYPE_t _count_in_range(self, DTYPE_t* dists, r, size): + cdef ITYPE_t count = 0, i + + for i in range(size): + if dists[i] <= r: + count += 1 + + return count - cdef _query_expand(self, X, k): + cdef _query_expand(self, X, k, mfudge, miter, sscale): """ return k-nn by the expanding method... select a starting radius, iterate over query_radius @@ -363,14 +371,14 @@ cdef class TrilaterationIndex: raise ValueError("query data dimension must " "match training data dimension") - cdef DTYPE_t radius, too_high, too_low, new_radius, STD_SCALE, MAX_FUDGE + cdef DTYPE_t radius, too_high, too_low, new_radius, STD_SCALE, MAX_FUDGE, ratio cdef ITYPE_t approx_count, i, MAX_ITER cdef ITYPE_t[::1] approx_array - MAX_ITER = 20 - MAX_FUDGE = 5.0 - STD_SCALE = 2.0 + MAX_ITER = miter + MAX_FUDGE = mfudge + STD_SCALE = sscale radius = self._get_start_dist(X, k) @@ -387,15 +395,18 @@ cdef class TrilaterationIndex: elif approx_count < k: too_low = radius new_radius = radius * STD_SCALE + # new_radius = radius * k / approx_count if new_radius > too_high: - new_radius = (too_high + radius) / STD_SCALE + new_radius = (too_high + too_low) / 2 radius = new_radius - approx_array = self._query_radius_at_least_k(X, radius, k) + # approx_array = self._query_radius_at_least_k(X, radius, k) + approx_array = self._query_radius_r0_only(X, radius) approx_count = approx_array.shape[0] continue elif approx_count > k * MAX_FUDGE: radius = (radius + too_low) / STD_SCALE - approx_array = self._query_radius_at_least_k(X, radius, k) + # approx_array = self._query_radius_at_least_k(X, radius, k) + approx_array = self._query_radius_r0_only(X, radius) approx_count = approx_array.shape[0] else: continue @@ -403,6 +414,74 @@ cdef class TrilaterationIndex: return approx_array + cdef _query_expand_2(self, X, k, mfudge, miter, sscale): + """ + return k-nn by the expanding method... + select a starting radius + In this case we expand the radius for the sorted self.distances + NOT necessarily just precise distances + """ + + if X.shape[0] != 1: + raise ValueError("_query takes only a single X point" + "use query for multiple records") + + if X.shape[X.ndim - 1] != self.data.shape[1]: + raise ValueError("query data dimension must " + "match training data dimension") + + cdef DTYPE_t STD_SCALE, MAX_FUDGE, ratio, radius, dist + cdef ITYPE_t approx_count, i, MAX_ITER, low_count, low_idx, high_idx, ground_idx, CHUNK_SIZE + cdef ITYPE_t new_high, new_low, idx + cdef ITYPE_t[::1] approx_array + cdef NeighborsHeap heap = NeighborsHeap(X.shape[0], k) + + cdef np.ndarray q_dists + cdef DTYPE_t[::1] new_dists, approx_dists, low_dists, high_dists + + MAX_ITER = miter + MAX_FUDGE = mfudge + STD_SCALE = sscale + CHUNK_SIZE = max(k, 500) # Tune? + + q_dists = self.dist_metric.pairwise(X, self.ref_points_arr) + ground_idx = _find_nearest_sorted_2D(self.distances, q_dists[0, 0]) + low_idx = min(ground_idx - k, 0) + high_idx = max(ground_idx + k, self.data_arr.shape[1]) + + approx_array = self.idx_array[low_idx:high_idx+1] + + approx_dists = self.dist_metric.pairwise(self.data_arr[approx_array], X).reshape(approx_array.shape[0]) + + for idx, dist in zip(approx_array, approx_dists): + if dist < heap.largest(0): + heap.push(0, dist, idx) + + radius = heap.largest(0) + + approx_count = self._count_in_range(&approx_dists[0], radius, approx_dists.shape[0]) + + while approx_count < k: + new_low = min(low_idx - CHUNK_SIZE, 0) + new_high = max(high_idx + CHUNK_SIZE, self.data_arr.shape[1]) + if new_low < low_idx: + low_dists = self.dist_metric.pairwise(self.idx_array[new_low:low_idx], X).reshape(low_idx - new_low) + low_count = self._count_in_range(&low_dists[0], radius, low_dists.shape[0]) + approx_count = approx_count + low_count + low_idx = new_low + + if new_high > high_idx: + high_dists = self.dist_metric.pairwise(self.idx_array[high_idx:new_high], X).reshape(new_high - high_idx) + high_count = self._count_in_range(&high_dists[0], radius, high_dists.shape[0]) + approx_count = approx_count + high_count + high_idx = new_high + + approx_array = self.idx_array[low_idx:high_idx+1] + radius = heap.largest(0) + + return approx_array + + def query_radius(self, X, r, int return_distance=False, int count_only=False, @@ -544,8 +623,6 @@ cdef class TrilaterationIndex: """ This approach uses only the original sort order from Ref Point 0 and then brute forces whatever that index indicates MAY be in range - - NOT FINISHED """ cdef DTYPE_t[:, ::1] q_dists # , dist_results @@ -588,7 +665,7 @@ cdef class TrilaterationIndex: dist_results = self.dist_metric.pairwise(self.data_arr[approx_idxs], X).reshape(approx_idxs.shape[0]) # print(f"len(dist_results): {len(dist_results)} or {dist_results.shape[0]} low_idx:high_idx {low_idx}:{high_idx}") - dist_results = dist_results.reshape(dist_results.shape[0]) + # dist_results = dist_results.reshape(dist_results.shape[0]) # print(f"dist_results: {np.asarray(dist_results)}") # cdef ITYPE_t result_count From 52a5e6bd42d2b330222a51c2e7693d138858c9f2 Mon Sep 17 00:00:00 2001 From: Chip Lynch Date: Tue, 23 Feb 2021 13:44:04 -0800 Subject: [PATCH 13/19] Query expand 2 performed well in ann! Yay! --- sklearn/neighbors/_trilateration.pyx | 95 ++++++++++++++++++++-------- 1 file changed, 68 insertions(+), 27 deletions(-) diff --git a/sklearn/neighbors/_trilateration.pyx b/sklearn/neighbors/_trilateration.pyx index 848af20a3f0c5..bc1dc1aa53de3 100644 --- a/sklearn/neighbors/_trilateration.pyx +++ b/sklearn/neighbors/_trilateration.pyx @@ -414,10 +414,37 @@ cdef class TrilaterationIndex: return approx_array + def query_expand_2(self, X, k, return_distance=True, sort_results=True, + mfudge=5.0, miter=20, sscale=2.0): + """ + X can be what...? + A singlie list of one coordinate: [50, 25, 100] + """ + cdef np.ndarray idx_results + # cdef DTYPE_t[::1] dist_results + cdef np.ndarray dist_results + + # print(f"taa daa: {X}") + + if isinstance(X, list): + X = np.asarray(X) + + cdef NeighborsHeap heap = NeighborsHeap(X.shape[0], k) + + if X.shape[0] == 1: + heap = self._query_expand_2(X, k, mfudge, miter, sscale) + + else: + raise NotImplementedError("You can't do that yet") + + if return_distance: + return heap.get_arrays() + + return heap.get_arrays() # Fix this + cdef _query_expand_2(self, X, k, mfudge, miter, sscale): """ return k-nn by the expanding method... - select a starting radius In this case we expand the radius for the sorted self.distances NOT necessarily just precise distances """ @@ -436,7 +463,7 @@ cdef class TrilaterationIndex: cdef ITYPE_t[::1] approx_array cdef NeighborsHeap heap = NeighborsHeap(X.shape[0], k) - cdef np.ndarray q_dists + cdef np.ndarray q_dists, temp_data cdef DTYPE_t[::1] new_dists, approx_dists, low_dists, high_dists MAX_ITER = miter @@ -446,40 +473,54 @@ cdef class TrilaterationIndex: q_dists = self.dist_metric.pairwise(X, self.ref_points_arr) ground_idx = _find_nearest_sorted_2D(self.distances, q_dists[0, 0]) - low_idx = min(ground_idx - k, 0) - high_idx = max(ground_idx + k, self.data_arr.shape[1]) + low_idx = max(ground_idx - k, 0) + high_idx = min(ground_idx + k, self.data_arr.shape[0]) approx_array = self.idx_array[low_idx:high_idx+1] - + # print(f"k: {k}, ground_idx: {ground_idx}, low_idx: {low_idx}, high_idx: {high_idx}") + # print(f"getting dists for approx_array: {approx_array} with shape: {approx_array.shape}") approx_dists = self.dist_metric.pairwise(self.data_arr[approx_array], X).reshape(approx_array.shape[0]) for idx, dist in zip(approx_array, approx_dists): if dist < heap.largest(0): heap.push(0, dist, idx) - - radius = heap.largest(0) - approx_count = self._count_in_range(&approx_dists[0], radius, approx_dists.shape[0]) - - while approx_count < k: - new_low = min(low_idx - CHUNK_SIZE, 0) - new_high = max(high_idx + CHUNK_SIZE, self.data_arr.shape[1]) - if new_low < low_idx: - low_dists = self.dist_metric.pairwise(self.idx_array[new_low:low_idx], X).reshape(low_idx - new_low) - low_count = self._count_in_range(&low_dists[0], radius, low_dists.shape[0]) - approx_count = approx_count + low_count - low_idx = new_low - - if new_high > high_idx: - high_dists = self.dist_metric.pairwise(self.idx_array[high_idx:new_high], X).reshape(new_high - high_idx) - high_count = self._count_in_range(&high_dists[0], radius, high_dists.shape[0]) - approx_count = approx_count + high_count - high_idx = new_high - - approx_array = self.idx_array[low_idx:high_idx+1] - radius = heap.largest(0) + lowdelta = fabs(self.distances[low_idx, 0] - q_dists[0, 0]) + highdelta = fabs(self.distances[high_idx, 0] - q_dists[0, 0]) + + # While the "not closer than" numbers are still closer than the current solution... + count = 0 + while (lowdelta < heap.largest(0) and low_idx > 0) \ + or (highdelta < heap.largest(0) and high_idx < self.data_arr.shape[0]): + count += 1 + # print(count, lowdelta, highdelta, heap.largest(0), low_idx, high_idx) + if lowdelta < heap.largest(0): + new_low = max(low_idx - CHUNK_SIZE, 0) + if new_low < low_idx: + low_dists = self.dist_metric.pairwise(self.data_arr[self.idx_array[new_low:low_idx]], X)[0] #.reshape(low_idx - new_low) + for idx in range(low_idx - new_low): + if low_dists[idx] < heap.largest(0): + heap.push(0, low_dists[idx], self.idx_array[idx + new_low]) + low_idx = new_low + + if highdelta < heap.largest(0): + new_high = min(high_idx + CHUNK_SIZE, self.data_arr.shape[0]) + if new_high > high_idx: + # print(f"new_high: {new_high}, high_idx: {high_idx}, self.data_arr.shape[1]") + high_dists = self.dist_metric.pairwise(self.data_arr[self.idx_array[high_idx:new_high]], X)[0] #.reshape(new_high - high_idx) + # testing approaches for performance: + for idx in range(new_high - high_idx): + if high_dists[idx] < heap.largest(0): + heap.push(0, high_dists[idx], self.idx_array[idx + high_idx]) + # for idx, dist in zip(self.idx_array[high_idx:new_high], high_dists): + # if dist < heap.largest(0): + # heap.push(0, dist, idx) + high_idx = new_high + + lowdelta = fabs(self.distances[low_idx, 0] - q_dists[0, 0]) + highdelta = fabs(self.distances[high_idx, 0] - q_dists[0, 0]) - return approx_array + return heap def query_radius(self, X, r, From e4fc3e8e49ebe89f14596b61c2fe61b6f9446a0b Mon Sep 17 00:00:00 2001 From: Chip Lynch Date: Thu, 11 Mar 2021 19:27:06 -0800 Subject: [PATCH 14/19] Working (very slow) basic geodesic distance from geopy --- perftest_geodesic.py | 114 ++++++++++++++++++++++++++ perftest_query.py | 115 +++++++++++++++++++++++++++ perftest_radius.py | 84 +++++++++++++++++++ sklearn/metrics/pairwise.py | 54 ++++++++++++- sklearn/neighbors/_ball_tree.pyx | 3 +- sklearn/neighbors/_dist_metrics.pyx | 31 +++++++- sklearn/neighbors/_kd_tree.pyx | 3 +- sklearn/neighbors/_trilateration.pyx | 8 +- 8 files changed, 404 insertions(+), 8 deletions(-) create mode 100644 perftest_geodesic.py create mode 100644 perftest_query.py create mode 100644 perftest_radius.py diff --git a/perftest_geodesic.py b/perftest_geodesic.py new file mode 100644 index 0000000000000..6ecc75d1f3ea8 --- /dev/null +++ b/perftest_geodesic.py @@ -0,0 +1,114 @@ +import csv +import sklearn +import numpy as np +import timeit +import time + +from scipy.spatial.distance import cdist +from geopy.distance import geodesic + + +import pstats, cProfile +import pyximport +pyximport.install + +import sklearn + +start = time.time() + +querypoint = np.asarray([[38.25, -85.50]]) + +samples = [] +refpoints = [] + +with open('/home/chipmonkey/repos/TrilaterationIndex/data/lat_long_synthetic.csv') as csvfile: + reader = csv.DictReader(csvfile, quoting=csv.QUOTE_NONNUMERIC) + for row in reader: + # print(row) + samples.append([row['Latitude'], row['Longitude']]) + +with open('/home/chipmonkey/repos/TrilaterationIndex/data/ref_points.csv') as csvfile: + reader = csv.DictReader(csvfile, quoting=csv.QUOTE_NONNUMERIC) + for row in reader: + print(row) + refpoints.append([row['Latitude'], row['Longitude']]) + +from sklearn.neighbors import NearestNeighbors, KDTree +from sklearn.neighbors import TrilaterationIndex + +print(f"{time.time() - start} seconds since start") + + +brute = NearestNeighbors(n_neighbors=1000, radius=0.07, algorithm='brute', metric='geodesic') +brute.fit(samples) +BF = brute.kneighbors(querypoint, 100) +print(f"brute force results: {BF[1][0]} ({len(BF[1][0])}) - at time {time.time() - start} seconds") + + +tree = KDTree(samples, metric='geodesic') +# t = timeit.timeit(lambda: tree.query_radius(querypoint, r=0.07), number=500) +# print(f"KDTree 500x single point time: {t}") +tq = tree.query(querypoint, k=100) +print(f"tree query results: {tq} ({len(tq[0])}) - at time {time.time() - start} seconds") + + +trilat = TrilaterationIndex(samples, metric='geodesic') +# t = timeit.timeit(lambda: trilat.query_radius(querypoint, r=0.07), number=500) +# print(f"Trilat 500x single point time: {t}") +tr = trilat.query(querypoint, k=100) +print(f"trilat query_radius: {tr} ({len(tr)}) - at time {time.time() - start} seconds") + + +t3 = trilat.query_expand(querypoint, k=100) +print(f"trilat query_radius_t3: {t3} ({len(t3)}) - at time {time.time() - start} seconds") + +t4 = trilat.query_expand_2(querypoint, k=100) +print(f"trilat query_expand_2: {t4} ({len(t4)}) - at time {time.time() - start} seconds") + +print(f"length match 1: {len(np.intersect1d(tq[1], tr[1]))}") +print(f"length match 2: {len(np.intersect1d(tq[1], t3[1]))}") +print(f"length match 4: {len(np.intersect1d(tq[1], t4[1]))}") + +# exit() + +print(f"setdiff: {np.setdiff1d(tq[0], t3)}") +if 18 in tq[0]: + print("18 in tq") +if 18 in t3[1]: + print("18 in t3") + + + +# print(type(querypoint)) +# cProfile.runctx('trilat.query_radius(querypoint, r=0.07)', globals(), locals(), "Profile.prof") + +# print(f"timing brute force:") +# cProfile.runctx('timeit.timeit(lambda: brute.kneighbors(querypoint, 100), number=20)', globals(), locals(), "QBFProfile.prof") + +print("timing kd tree") +cProfile.runctx('timeit.timeit(lambda: tree.query(querypoint, k=100), number=20)', globals(), locals(), "QKDProfile_geo.prof") + + +print("timing trilat.query") +cProfile.runctx('timeit.timeit(lambda: trilat.query(querypoint, k=100), number=20)', globals(), locals(), "QTRProfile_geo.prof") + +print("timing query_expand") +cProfile.runctx('timeit.timeit(lambda: trilat.query_expand(querypoint, k=100, mfudge=5, miter=20, sscale=2), number=20)', globals(), locals(), "QT3Profile_geo.prof") + +print("timing query_expand_2") +cProfile.runctx('timeit.timeit(lambda: trilat.query_expand_2(querypoint, k=100, mfudge=5, miter=20, sscale=2), number=20)', globals(), locals(), "QT4Profile_geo.prof") + +s = pstats.Stats("QBFProfile.prof") +s.strip_dirs().sort_stats("cumtime").print_stats() + +s = pstats.Stats("QKDProfile.prof") +s.strip_dirs().sort_stats("cumtime").print_stats() + +s = pstats.Stats("QTRProfile.prof") +s.strip_dirs().sort_stats("cumtime").print_stats() + +s = pstats.Stats("QT3Profile.prof") +s.strip_dirs().sort_stats("cumtime").print_stats() + +s = pstats.Stats("QT4Profile.prof") +s.strip_dirs().sort_stats("cumtime").print_stats() \ No newline at end of file diff --git a/perftest_query.py b/perftest_query.py new file mode 100644 index 0000000000000..ca4cefbeb0434 --- /dev/null +++ b/perftest_query.py @@ -0,0 +1,115 @@ +import csv +import sklearn +import numpy as np +import timeit +import time + +from scipy.spatial.distance import cdist + +import pstats, cProfile +import pyximport +pyximport.install + +import sklearn + +start = time.time() + +querypoint = np.asarray([[38.25, -85.50]]) + +samples = [] +refpoints = [] + +with open('/home/chipmonkey/repos/TrilaterationIndex/data/lat_long_synthetic.csv') as csvfile: + reader = csv.DictReader(csvfile, quoting=csv.QUOTE_NONNUMERIC) + for row in reader: + # print(row) + samples.append([row['Latitude'], row['Longitude']]) + +with open('/home/chipmonkey/repos/TrilaterationIndex/data/ref_points.csv') as csvfile: + reader = csv.DictReader(csvfile, quoting=csv.QUOTE_NONNUMERIC) + for row in reader: + print(row) + refpoints.append([row['Latitude'], row['Longitude']]) + +from sklearn.neighbors import NearestNeighbors, KDTree +from sklearn.neighbors import TrilaterationIndex + +print(f"{time.time() - start} seconds since start") + + +brute = NearestNeighbors(n_neighbors=1000, radius=0.07, algorithm='brute', metric='euclidean') +brute.fit(samples) +BF = brute.kneighbors(querypoint, 100) +print(f"brute force results: {BF[1][0]} ({len(BF[1][0])}) - at time {time.time() - start} seconds") + + +tree = KDTree(samples, metric='euclidean') +# t = timeit.timeit(lambda: tree.query_radius(querypoint, r=0.07), number=500) +# print(f"KDTree 500x single point time: {t}") +tq = tree.query(querypoint, k=100) +print(f"tree query results: {tq} ({len(tq[0])}) - at time {time.time() - start} seconds") + + +trilat = TrilaterationIndex(samples, metric='euclidean') +# t = timeit.timeit(lambda: trilat.query_radius(querypoint, r=0.07), number=500) +# print(f"Trilat 500x single point time: {t}") +tr = trilat.query(querypoint, k=100) +print(f"trilat query_radius: {tr} ({len(tr)}) - at time {time.time() - start} seconds") + + +t3 = trilat.query_expand(querypoint, k=100) +print(f"trilat query_radius_t3: {t3} ({len(t3)}) - at time {time.time() - start} seconds") + +t4 = trilat.query_expand_2(querypoint, k=100) +print(f"trilat query_expand_2: {t4} ({len(t4)}) - at time {time.time() - start} seconds") + +print(f"length match 1: {len(np.intersect1d(tq[1], tr[1]))}") +print(f"length match 2: {len(np.intersect1d(tq[1], t3[1]))}") +print(f"length match 4: {len(np.intersect1d(tq[1], t4[1]))}") + +# exit() + +print(f"setdiff: {np.setdiff1d(tq[0], t3)}") +if 18 in tq[0]: + print("18 in tq") +if 18 in t3[1]: + print("18 in t3") + + + +# print(type(querypoint)) +# cProfile.runctx('trilat.query_radius(querypoint, r=0.07)', globals(), locals(), "Profile.prof") + +# print(f"timing brute force:") +# cProfile.runctx('timeit.timeit(lambda: brute.kneighbors(querypoint, 100), number=500)', globals(), locals(), "QBFProfile.prof") +print("running one kd tree") +tree.query(querypoint, k=100) + +print("timing kd tree") +# cProfile.runctx('timeit.timeit(lambda: tree.query(querypoint, k=100), number=500)', globals(), locals(), "QKDProfile.prof") +cProfile.runctx('timeit.timeit(lambda: tree.query(querypoint, k=100), number=20)', globals(), locals(), "QKDProfile.prof") + + +# print("timing trilat.query") +# cProfile.runctx('timeit.timeit(lambda: trilat.query(querypoint, k=100), number=500)', globals(), locals(), "QTRProfile.prof") + +print("timing query_expand") +cProfile.runctx('timeit.timeit(lambda: trilat.query_expand(querypoint, k=100, mfudge=5, miter=20, sscale=2), number=500)', globals(), locals(), "QT3Profile.prof") + +print("timing query_expand_2") +cProfile.runctx('timeit.timeit(lambda: trilat.query_expand_2(querypoint, k=100, mfudge=5, miter=20, sscale=2), number=500)', globals(), locals(), "QT4Profile.prof") + +s = pstats.Stats("QBFProfile.prof") +s.strip_dirs().sort_stats("cumtime").print_stats() + +s = pstats.Stats("QKDProfile.prof") +s.strip_dirs().sort_stats("cumtime").print_stats() + +s = pstats.Stats("QTRProfile.prof") +s.strip_dirs().sort_stats("cumtime").print_stats() + +s = pstats.Stats("QT3Profile.prof") +s.strip_dirs().sort_stats("cumtime").print_stats() + +s = pstats.Stats("QT4Profile.prof") +s.strip_dirs().sort_stats("cumtime").print_stats() \ No newline at end of file diff --git a/perftest_radius.py b/perftest_radius.py new file mode 100644 index 0000000000000..1775ab08b1718 --- /dev/null +++ b/perftest_radius.py @@ -0,0 +1,84 @@ +import csv +import sklearn +import numpy as np +import timeit + +from scipy.spatial.distance import cdist + +querypoint = np.asarray([[38.25, -85.50]]) + +samples = [] +refpoints = [] + +with open('/home/chipmonkey/repos/TrilaterationIndex/data/lat_long_synthetic.csv') as csvfile: + reader = csv.DictReader(csvfile, quoting=csv.QUOTE_NONNUMERIC) + for row in reader: + # print(row) + samples.append([row['Latitude'], row['Longitude']]) + +with open('/home/chipmonkey/repos/TrilaterationIndex/data/ref_points.csv') as csvfile: + reader = csv.DictReader(csvfile, quoting=csv.QUOTE_NONNUMERIC) + for row in reader: + print(row) + refpoints.append([row['Latitude'], row['Longitude']]) + +from sklearn.neighbors import NearestNeighbors, KDTree +from sklearn.neighbors import TrilaterationIndex + +brute = NearestNeighbors(n_neighbors=1000, radius=0.07, algorithm='brute') +brute.fit(samples) +BF = brute.radius_neighbors(querypoint, 0.1) +print(f"brute force results: {BF[1][0]} ({len(BF[1][0])})") + + +tree = KDTree(samples) +# t = timeit.timeit(lambda: tree.query_radius(querypoint, r=0.07), number=500) +# print(f"KDTree 500x single point time: {t}") +tq = tree.query_radius(querypoint, r=0.1) +print(f"tree query results: {tq} ({len(tq[0])})") + + +trilat = TrilaterationIndex(samples) +# t = timeit.timeit(lambda: trilat.query_radius(querypoint, r=0.07), number=500) +# print(f"Trilat 500x single point time: {t}") +tr = trilat.query_radius(querypoint, r=0.1) +print(f"trilat query_radius: {tr} ({len(tr)})") + + +t3 = trilat.query_radius_t3(querypoint, r=0.1) +print(f"trilat query_radius_t3: {t3} ({len(t3)})") + +print(f"length match 1: {len(np.intersect1d(tq[0], tr))}") +print(f"length match 2: {len(np.intersect1d(tq[0], t3))}") + +print(f"setdiff: {np.setdiff1d(tq[0], t3)}") +if 18 in tq[0]: + print("18 in tq") +if 18 in t3: + print("18 in t3") + + +import pstats, cProfile +import pyximport +pyximport.install + +import sklearn +# print(type(querypoint)) +# cProfile.runctx('trilat.query_radius(querypoint, r=0.07)', globals(), locals(), "Profile.prof") + +cProfile.runctx('timeit.timeit(lambda: brute.radius_neighbors(querypoint, 0.07), number=500)', globals(), locals(), "BFProfile.prof") +cProfile.runctx('timeit.timeit(lambda: tree.query_radius(querypoint, r=0.07), number=500)', globals(), locals(), "KDProfile.prof") +cProfile.runctx('timeit.timeit(lambda: trilat.query_radius(querypoint, r=0.07), number=500)', globals(), locals(), "TRProfile.prof") +cProfile.runctx('timeit.timeit(lambda: trilat.query_radius_t3(querypoint, r=0.07), number=500)', globals(), locals(), "T3Profile.prof") + +s = pstats.Stats("BFProfile.prof") +s.strip_dirs().sort_stats("cumtime").print_stats() + +s = pstats.Stats("KDProfile.prof") +s.strip_dirs().sort_stats("cumtime").print_stats() + +s = pstats.Stats("TRProfile.prof") +s.strip_dirs().sort_stats("cumtime").print_stats() + +s = pstats.Stats("T3Profile.prof") +s.strip_dirs().sort_stats("cumtime").print_stats() \ No newline at end of file diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index a3cf7f4bf1d72..0aad03de94f9d 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -751,6 +751,53 @@ def haversine_distances(X, Y=None): from sklearn.neighbors import DistanceMetric return DistanceMetric.get_metric('haversine').pairwise(X, Y) +def geodesic_distances(X, Y=None): + """Compute the Geodesic distance between samples in X and Y. + + The geodesic distance is an accurate measure of the distance between + two points on the earth, using an ellipsoidal model of the earth + which is more accurate than the spherical model used in Haversine. + + The trade-off is that no closed form calculation exists, so the + process is iterative, using numeric techniques until a specific + accuracy is reached. + + See the geopy.distance.geodesic code for implementation details. + + Parameters + ---------- + X : array-like of shape (n_samples_X, 2) + + Y : array-like of shape (n_samples_Y, 2), default=None + + Returns + ------- + distance : ndarray of shape (n_samples_X, n_samples_Y) + + Notes + ----- + This will be significantly slower than the Haversine distance and + should only be used when more accuracy is needed. + + Unlike Haversine, which requires input in radians, the geodesic + implementation works on (latitude, longitude) pairs. + + Examples + -------- + We want to calculate the distance between the Ezeiza Airport + (Buenos Aires, Argentina) and the Charles de Gaulle Airport (Paris, + France). + + >>> from sklearn.metrics.pairwise import geodesic_distances + >>> bsas = [-34.83333, -58.5166646] + >>> paris = [49.0083899664, 2.53844117956] + >>> result = geodesic_distances([bsas, paris]) + >>> result + array([[ 0. , 11099.54035582], + [11099.54035582, 0. ]]) + """ + from sklearn.neighbors import DistanceMetric + return DistanceMetric.get_metric('geodesic').pairwise(X, Y) @_deprecate_positional_args def manhattan_distances(X, Y=None, *, sum_over_features=True): @@ -1340,6 +1387,7 @@ def chi2_kernel(X, Y=None, gamma=1.): 'manhattan': manhattan_distances, 'precomputed': None, # HACK: precomputed is always allowed, never called 'nan_euclidean': nan_euclidean_distances, + 'geodesic': geodesic_distances, } @@ -1359,6 +1407,7 @@ def distance_metrics(): 'cosine' metrics.pairwise.cosine_distances 'euclidean' metrics.pairwise.euclidean_distances 'haversine' metrics.pairwise.haversine_distances + 'geodesic' metrics.pairwise.geodesic_distances 'l1' metrics.pairwise.manhattan_distances 'l2' metrics.pairwise.euclidean_distances 'manhattan' metrics.pairwise.manhattan_distances @@ -1440,7 +1489,7 @@ def _pairwise_callable(X, Y, metric, force_all_finite=True, **kwds): 'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule', "wminkowski", - 'nan_euclidean', 'haversine'] + 'nan_euclidean', 'haversine', 'geodesic'] _NAN_METRICS = ['nan_euclidean'] @@ -1694,6 +1743,9 @@ def pairwise_distances(X, Y=None, metric="euclidean", *, n_jobs=None, See the documentation for scipy.spatial.distance for details on these metrics. These metrics do not support sparse matrix inputs. + - From geopy.distance: ['geodesic'] + See the documentation for geopy for details on this metric. + Note that in the case of 'cityblock', 'cosine' and 'euclidean' (which are valid scipy.spatial.distance metrics), the scikit-learn implementation will be used, which is faster and has support for sparse matrices (except diff --git a/sklearn/neighbors/_ball_tree.pyx b/sklearn/neighbors/_ball_tree.pyx index 81ce9606f7b80..f9ee57aa43d7b 100644 --- a/sklearn/neighbors/_ball_tree.pyx +++ b/sklearn/neighbors/_ball_tree.pyx @@ -19,7 +19,8 @@ VALID_METRICS = ['EuclideanDistance', 'SEuclideanDistance', 'DiceDistance', 'KulsinskiDistance', 'RogersTanimotoDistance', 'RussellRaoDistance', 'SokalMichenerDistance', 'SokalSneathDistance', - 'PyFuncDistance', 'HaversineDistance'] + 'PyFuncDistance', 'HaversineDistance', + 'GeodesicDistance'] include "_binary_tree.pxi" diff --git a/sklearn/neighbors/_dist_metrics.pyx b/sklearn/neighbors/_dist_metrics.pyx index 8bee948eeaeba..de9a378cf384a 100755 --- a/sklearn/neighbors/_dist_metrics.pyx +++ b/sklearn/neighbors/_dist_metrics.pyx @@ -4,6 +4,7 @@ #cython: cdivision=True # By Jake Vanderplas (2013) +# Geodesic hack by Chip Lynch (2021) #needswork # written for the scikit-learn project # License: BSD @@ -11,6 +12,7 @@ import numpy as np cimport numpy as np np.import_array() # required in order to use C-API +from geopy.distance import geodesic ###################################################################### # Numpy 1.3-1.4 compatibility utilities @@ -86,7 +88,9 @@ METRIC_MAPPING = {'euclidean': EuclideanDistance, 'sokalmichener': SokalMichenerDistance, 'sokalsneath': SokalSneathDistance, 'haversine': HaversineDistance, - 'pyfunc': PyFuncDistance} + 'pyfunc': PyFuncDistance, + 'geodesic': GeodesicDistance, + 'vincenty': GeodesicDistance} def get_valid_metric_ids(L): @@ -410,6 +414,31 @@ cdef class DistanceMetric: get_memview_DTYPE_2D(Darr)) return Darr +#------------------------------------------------------------ +# Geodesic Distance +# Accurate distance between two points on earth +# modeled as an Ellipsoid +# see HaversineDistance for a Spheroid earth distance +cdef class GeodesicDistance(DistanceMetric): + r""" Geodesic Distance metric + + Imported from geopy, see: + https://geopy.readthedocs.io/en/stable/#module-geopy.distance + """ + + # Late import to only happen if needed + from geopy.distance import geodesic + + def __init__(self): + self.p = 2 + + cdef DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, + ITYPE_t size) nogil except -1: + if size != 2: + with gil: + raise ValueError("Geodesic Distance takes only 2-dimensions (Latitude, Longitude)") + with gil: + return geodesic((x1[0], x1[1]),(x2[0], x2[1])).meters #------------------------------------------------------------ # Euclidean Distance diff --git a/sklearn/neighbors/_kd_tree.pyx b/sklearn/neighbors/_kd_tree.pyx index bc1ab764a6fcf..d6c0c94a2d57a 100644 --- a/sklearn/neighbors/_kd_tree.pyx +++ b/sklearn/neighbors/_kd_tree.pyx @@ -12,7 +12,8 @@ __all__ = ['KDTree'] DOC_DICT = {'BinaryTree': 'KDTree', 'binary_tree': 'kd_tree'} VALID_METRICS = ['EuclideanDistance', 'ManhattanDistance', - 'ChebyshevDistance', 'MinkowskiDistance'] + 'ChebyshevDistance', 'MinkowskiDistance', + 'GeodesicDistance'] include "_binary_tree.pxi" diff --git a/sklearn/neighbors/_trilateration.pyx b/sklearn/neighbors/_trilateration.pyx index bc1dc1aa53de3..73f2efec7c5ea 100644 --- a/sklearn/neighbors/_trilateration.pyx +++ b/sklearn/neighbors/_trilateration.pyx @@ -45,7 +45,8 @@ __all__ = ['TrilaterationIndex'] DOC_DICT = {'TrilaterationIndex': 'TrilaterationIndex'} # Start simple: -VALID_METRICS = ['EuclideanDistance'] +VALID_METRICS = ['EuclideanDistance', 'HaversineDistance', + 'GeodesicDistance'] include "_helpers.pxi" @@ -95,8 +96,7 @@ cdef class TrilaterationIndex: n_features = self.data_arr.shape[1] self.dist_metric = DistanceMetric.get_metric(metric, **kwargs) - # self.euclidean = (self.dist_metric.__class__.__name__ - # == 'EuclideanDistance') + metric = self.dist_metric.__class__.__name__ if metric not in VALID_METRICS: raise ValueError('metric {metric} is not valid for ' @@ -208,7 +208,7 @@ cdef class TrilaterationIndex: cdef NeighborsHeap heap = NeighborsHeap(X.shape[0], k) # Establish the distances from the query point to the reference points - cdef np.ndarray q_dists + cdef np.ndarray[DTYPE_t, ndim=2] q_dists q_dists = self.dist_metric.pairwise(X, self.ref_points_arr) # cdef DTYPE_t[:, ::1] Xarr From 1f5257d802903113dfd05d2b38709e298c212b03 Mon Sep 17 00:00:00 2001 From: Chip Lynch Date: Thu, 11 Mar 2021 23:30:54 -0800 Subject: [PATCH 15/19] Geodesic benchmarks look goood! --- perftest_geodesic.py | 35 ++++++++++++++++----------- sklearn/neighbors/_dist_metrics.pyx | 2 +- sklearn/neighbors/_trilateration.pyx | 36 +++++++++++++++++----------- 3 files changed, 44 insertions(+), 29 deletions(-) diff --git a/perftest_geodesic.py b/perftest_geodesic.py index 6ecc75d1f3ea8..0080f00aa6a23 100644 --- a/perftest_geodesic.py +++ b/perftest_geodesic.py @@ -27,6 +27,9 @@ # print(row) samples.append([row['Latitude'], row['Longitude']]) +print(min([x[0] for x in samples]), max([x[0] for x in samples])) +print(min([x[1] for x in samples]), max([x[1] for x in samples])) + with open('/home/chipmonkey/repos/TrilaterationIndex/data/ref_points.csv') as csvfile: reader = csv.DictReader(csvfile, quoting=csv.QUOTE_NONNUMERIC) for row in reader: @@ -38,6 +41,11 @@ print(f"{time.time() - start} seconds since start") +from sklearn.metrics.pairwise import geodesic_distances +bsas = [-34.83333, -58.5166646] +paris = [49.0083899664, 2.53844117956] +result = geodesic_distances([bsas, paris]) +print(f"{result} meters between bsas and paris") brute = NearestNeighbors(n_neighbors=1000, radius=0.07, algorithm='brute', metric='geodesic') brute.fit(samples) @@ -85,30 +93,29 @@ # print(f"timing brute force:") # cProfile.runctx('timeit.timeit(lambda: brute.kneighbors(querypoint, 100), number=20)', globals(), locals(), "QBFProfile.prof") -print("timing kd tree") -cProfile.runctx('timeit.timeit(lambda: tree.query(querypoint, k=100), number=20)', globals(), locals(), "QKDProfile_geo.prof") - +# print("timing kd tree") +# cProfile.runctx('timeit.timeit(lambda: tree.query(querypoint, k=100), number=20)', globals(), locals(), "QKDProfile_geo.prof") -print("timing trilat.query") -cProfile.runctx('timeit.timeit(lambda: trilat.query(querypoint, k=100), number=20)', globals(), locals(), "QTRProfile_geo.prof") +# print("timing trilat.query") +# cProfile.runctx('timeit.timeit(lambda: trilat.query(querypoint, k=100), number=20)', globals(), locals(), "QTRProfile_geo.prof") -print("timing query_expand") -cProfile.runctx('timeit.timeit(lambda: trilat.query_expand(querypoint, k=100, mfudge=5, miter=20, sscale=2), number=20)', globals(), locals(), "QT3Profile_geo.prof") +# print("timing query_expand") +# cProfile.runctx('timeit.timeit(lambda: trilat.query_expand(querypoint, k=100, mfudge=5, miter=20, sscale=2), number=20)', globals(), locals(), "QT3Profile_geo.prof") -print("timing query_expand_2") -cProfile.runctx('timeit.timeit(lambda: trilat.query_expand_2(querypoint, k=100, mfudge=5, miter=20, sscale=2), number=20)', globals(), locals(), "QT4Profile_geo.prof") +# print("timing query_expand_2") +# cProfile.runctx('timeit.timeit(lambda: trilat.query_expand_2(querypoint, k=100, mfudge=5, miter=20, sscale=2), number=20)', globals(), locals(), "QT4Profile_geo.prof") -s = pstats.Stats("QBFProfile.prof") +s = pstats.Stats("QBFProfile_geo.prof") s.strip_dirs().sort_stats("cumtime").print_stats() -s = pstats.Stats("QKDProfile.prof") +s = pstats.Stats("QKDProfile_geo.prof") s.strip_dirs().sort_stats("cumtime").print_stats() -s = pstats.Stats("QTRProfile.prof") +s = pstats.Stats("QTRProfile_geo.prof") s.strip_dirs().sort_stats("cumtime").print_stats() -s = pstats.Stats("QT3Profile.prof") +s = pstats.Stats("QT3Profile_geo.prof") s.strip_dirs().sort_stats("cumtime").print_stats() -s = pstats.Stats("QT4Profile.prof") +s = pstats.Stats("QT4Profile_geo.prof") s.strip_dirs().sort_stats("cumtime").print_stats() \ No newline at end of file diff --git a/sklearn/neighbors/_dist_metrics.pyx b/sklearn/neighbors/_dist_metrics.pyx index de9a378cf384a..e466e435b8910 100755 --- a/sklearn/neighbors/_dist_metrics.pyx +++ b/sklearn/neighbors/_dist_metrics.pyx @@ -438,7 +438,7 @@ cdef class GeodesicDistance(DistanceMetric): with gil: raise ValueError("Geodesic Distance takes only 2-dimensions (Latitude, Longitude)") with gil: - return geodesic((x1[0], x1[1]),(x2[0], x2[1])).meters + return geodesic((x1[0], x1[1]),(x2[0], x2[1])).kilometers #------------------------------------------------------------ # Euclidean Distance diff --git a/sklearn/neighbors/_trilateration.pyx b/sklearn/neighbors/_trilateration.pyx index 73f2efec7c5ea..9f889ea029ffa 100644 --- a/sklearn/neighbors/_trilateration.pyx +++ b/sklearn/neighbors/_trilateration.pyx @@ -61,6 +61,8 @@ cdef class TrilaterationIndex: cdef np.ndarray r_indexes cdef np.ndarray r_distances + cdef char* metric + cdef readonly ITYPE_t[:, ::1] r_indexes_mv cdef readonly DTYPE_t[:, ::1] r_distances_mv @@ -80,18 +82,6 @@ cdef class TrilaterationIndex: def __init__(self, data, ref=None, metric='euclidean', **kwargs): self.data_arr = check_array(data, dtype=DTYPE, order='C') - # Ref points can be passed in - # Otherwise they are generated based on input data - if ref is None: - ref = self._create_ref_points() - # print("generated synthetic reference points:") - # print(ref) - - self.ref_points_arr = check_array(ref, dtype=DTYPE, order='C') - - if self.data_arr.shape[1] != self.ref_points_arr.shape[1]: - raise ValueError("Data and reference points must share same dimension") - n_samples = self.data_arr.shape[0] n_features = self.data_arr.shape[1] @@ -104,8 +94,20 @@ cdef class TrilaterationIndex: **DOC_DICT)) self.dist_metric._validate_data(self.data_arr) + # Ref points can be passed in + # Otherwise they are generated based on input data + if ref is None: + ref = self._create_ref_points() + # print("generated synthetic reference points:") + # print(ref) + + self.ref_points_arr = check_array(ref, dtype=DTYPE, order='C') + self.ref_dists = self.dist_metric.pairwise(self.data_arr, self.ref_points_arr) + if self.data_arr.shape[1] != self.ref_points_arr.shape[1]: + raise ValueError("Data and reference points must share same dimension") + # allocate arrays for storage self.idx_array_arr = np.arange(n_samples, dtype=ITYPE) @@ -140,11 +142,17 @@ cdef class TrilaterationIndex: For the love of all that is holy, make this better. """ - cdef DTYPE_t MAX_REF = 9999.0 self.ndims = self.data_arr.shape[1] - refs = [[0]*i+[MAX_REF]*(self.ndims-i) for i in range(self.ndims)] + + print(f"self.dist_metric.__class__.__name__: {self.dist_metric.__class__.__name__}") + + if self.dist_metric.__class__.__name__ == 'GeodesicDistance' or \ + self.dist_metric.__class__.__name__ == 'HaversineDistance': + refs = [[90, 0], [38.26, -85.76], [-19.22, 159.93]] + else: + refs = [[0]*i+[MAX_REF]*(self.ndims-i) for i in range(self.ndims)] return np.asarray(refs) From 6e12412731dc61648af5ffa9c6af4f399e83c418 Mon Sep 17 00:00:00 2001 From: Chip Lynch Date: Sat, 13 Mar 2021 12:23:49 -0800 Subject: [PATCH 16/19] Performance tests working for geodesic --- perftest_geodesic.py | 16 ++++++---------- perftest_query.py | 10 ++++------ sklearn/neighbors/_trilateration.pyx | 9 +++------ 3 files changed, 13 insertions(+), 22 deletions(-) diff --git a/perftest_geodesic.py b/perftest_geodesic.py index 0080f00aa6a23..0cdc1909a8b2b 100644 --- a/perftest_geodesic.py +++ b/perftest_geodesic.py @@ -64,34 +64,30 @@ # t = timeit.timeit(lambda: trilat.query_radius(querypoint, r=0.07), number=500) # print(f"Trilat 500x single point time: {t}") tr = trilat.query(querypoint, k=100) -print(f"trilat query_radius: {tr} ({len(tr)}) - at time {time.time() - start} seconds") +print(f"trilat.query: {tr} ({len(tr)}) - at time {time.time() - start} seconds") t3 = trilat.query_expand(querypoint, k=100) -print(f"trilat query_radius_t3: {t3} ({len(t3)}) - at time {time.time() - start} seconds") +print(f"trilat.query_expand (t3): {t3} ({len(t3)}) - at time {time.time() - start} seconds") t4 = trilat.query_expand_2(querypoint, k=100) -print(f"trilat query_expand_2: {t4} ({len(t4)}) - at time {time.time() - start} seconds") +print(f"trilat.query_expand_2 (t4): {t4} ({len(t4)}) - at time {time.time() - start} seconds") print(f"length match 1: {len(np.intersect1d(tq[1], tr[1]))}") print(f"length match 2: {len(np.intersect1d(tq[1], t3[1]))}") print(f"length match 4: {len(np.intersect1d(tq[1], t4[1]))}") -# exit() - print(f"setdiff: {np.setdiff1d(tq[0], t3)}") if 18 in tq[0]: print("18 in tq") if 18 in t3[1]: print("18 in t3") +# exit() - -# print(type(querypoint)) -# cProfile.runctx('trilat.query_radius(querypoint, r=0.07)', globals(), locals(), "Profile.prof") - +# uncomment to rerun, otherwise just print results: # print(f"timing brute force:") -# cProfile.runctx('timeit.timeit(lambda: brute.kneighbors(querypoint, 100), number=20)', globals(), locals(), "QBFProfile.prof") +# cProfile.runctx('timeit.timeit(lambda: brute.kneighbors(querypoint, 100), number=20)', globals(), locals(), "QBFProfile_geo.prof") # print("timing kd tree") # cProfile.runctx('timeit.timeit(lambda: tree.query(querypoint, k=100), number=20)', globals(), locals(), "QKDProfile_geo.prof") diff --git a/perftest_query.py b/perftest_query.py index ca4cefbeb0434..53d951ece0ab5 100644 --- a/perftest_query.py +++ b/perftest_query.py @@ -80,18 +80,16 @@ # print(type(querypoint)) # cProfile.runctx('trilat.query_radius(querypoint, r=0.07)', globals(), locals(), "Profile.prof") -# print(f"timing brute force:") -# cProfile.runctx('timeit.timeit(lambda: brute.kneighbors(querypoint, 100), number=500)', globals(), locals(), "QBFProfile.prof") -print("running one kd tree") -tree.query(querypoint, k=100) +print(f"timing brute force:") +cProfile.runctx('timeit.timeit(lambda: brute.kneighbors(querypoint, 100), number=500)', globals(), locals(), "QBFProfile.prof") print("timing kd tree") # cProfile.runctx('timeit.timeit(lambda: tree.query(querypoint, k=100), number=500)', globals(), locals(), "QKDProfile.prof") cProfile.runctx('timeit.timeit(lambda: tree.query(querypoint, k=100), number=20)', globals(), locals(), "QKDProfile.prof") -# print("timing trilat.query") -# cProfile.runctx('timeit.timeit(lambda: trilat.query(querypoint, k=100), number=500)', globals(), locals(), "QTRProfile.prof") +print("timing trilat.query") +cProfile.runctx('timeit.timeit(lambda: trilat.query(querypoint, k=100), number=500)', globals(), locals(), "QTRProfile.prof") print("timing query_expand") cProfile.runctx('timeit.timeit(lambda: trilat.query_expand(querypoint, k=100, mfudge=5, miter=20, sscale=2), number=500)', globals(), locals(), "QT3Profile.prof") diff --git a/sklearn/neighbors/_trilateration.pyx b/sklearn/neighbors/_trilateration.pyx index 9f889ea029ffa..a844d269ce991 100644 --- a/sklearn/neighbors/_trilateration.pyx +++ b/sklearn/neighbors/_trilateration.pyx @@ -27,8 +27,6 @@ from collections import Counter from functools import reduce from libc.math cimport fabs, ceil -from scipy.spatial.distance import cdist - from ..utils import ( check_array ) @@ -227,8 +225,7 @@ cdef class TrilaterationIndex: cdef DTYPE_t best_dist, test_dist best_idx = _find_nearest_sorted_2D(self.distances, q_dists[0,0]) - # best_dist = self.dist_metric.dist(self.data[best_idx,:], &Xarr, X.shape[1]) - best_dist = cdist([self.data_arr[self.idx_array_arr[best_idx],:]], X) + best_dist = self.dist_metric.pairwise([self.data_arr[self.idx_array_arr[best_idx],:]], X) # heap.push(0, best_dist, best_idx) # Populate the heap using 2k elements; k on each side of our first guess: @@ -236,7 +233,7 @@ cdef class TrilaterationIndex: high_idx = min(best_idx + k, self.distances.shape[0]) for i in range(low_idx, high_idx + 1): if i < self.distances.shape[0]: - test_dist = cdist([self.data_arr[self.idx_array_arr[i],:]], X) + test_dist = self.dist_metric.pairwise([self.data_arr[self.idx_array_arr[i],:]], X) heap.push(0, test_dist, self.idx_array_arr[i]) @@ -283,7 +280,7 @@ cdef class TrilaterationIndex: break if sufficient: - test_dist = cdist([self.data_arr[self.idx_array[test_idx],:]], X) + test_dist = self.dist_metric.pairwise([self.data_arr[self.idx_array[test_idx],:]], X) # print(f"{test_idx} is sufficient... test_dist is: {test_dist}") if test_dist < heap.largest(0): heap.push(0, test_dist, self.idx_array[test_idx]) From 6d8cc746f10a34ccf465831379ff0050ffc09f75 Mon Sep 17 00:00:00 2001 From: Chip Lynch Date: Sat, 13 Mar 2021 12:27:33 -0800 Subject: [PATCH 17/19] Moved perftest scripts to Trilateration project --- perftest_geodesic.py | 117 ------------------------------------------- perftest_query.py | 113 ----------------------------------------- perftest_radius.py | 84 ------------------------------- 3 files changed, 314 deletions(-) delete mode 100644 perftest_geodesic.py delete mode 100644 perftest_query.py delete mode 100644 perftest_radius.py diff --git a/perftest_geodesic.py b/perftest_geodesic.py deleted file mode 100644 index 0cdc1909a8b2b..0000000000000 --- a/perftest_geodesic.py +++ /dev/null @@ -1,117 +0,0 @@ -import csv -import sklearn -import numpy as np -import timeit -import time - -from scipy.spatial.distance import cdist -from geopy.distance import geodesic - - -import pstats, cProfile -import pyximport -pyximport.install - -import sklearn - -start = time.time() - -querypoint = np.asarray([[38.25, -85.50]]) - -samples = [] -refpoints = [] - -with open('/home/chipmonkey/repos/TrilaterationIndex/data/lat_long_synthetic.csv') as csvfile: - reader = csv.DictReader(csvfile, quoting=csv.QUOTE_NONNUMERIC) - for row in reader: - # print(row) - samples.append([row['Latitude'], row['Longitude']]) - -print(min([x[0] for x in samples]), max([x[0] for x in samples])) -print(min([x[1] for x in samples]), max([x[1] for x in samples])) - -with open('/home/chipmonkey/repos/TrilaterationIndex/data/ref_points.csv') as csvfile: - reader = csv.DictReader(csvfile, quoting=csv.QUOTE_NONNUMERIC) - for row in reader: - print(row) - refpoints.append([row['Latitude'], row['Longitude']]) - -from sklearn.neighbors import NearestNeighbors, KDTree -from sklearn.neighbors import TrilaterationIndex - -print(f"{time.time() - start} seconds since start") - -from sklearn.metrics.pairwise import geodesic_distances -bsas = [-34.83333, -58.5166646] -paris = [49.0083899664, 2.53844117956] -result = geodesic_distances([bsas, paris]) -print(f"{result} meters between bsas and paris") - -brute = NearestNeighbors(n_neighbors=1000, radius=0.07, algorithm='brute', metric='geodesic') -brute.fit(samples) -BF = brute.kneighbors(querypoint, 100) -print(f"brute force results: {BF[1][0]} ({len(BF[1][0])}) - at time {time.time() - start} seconds") - - -tree = KDTree(samples, metric='geodesic') -# t = timeit.timeit(lambda: tree.query_radius(querypoint, r=0.07), number=500) -# print(f"KDTree 500x single point time: {t}") -tq = tree.query(querypoint, k=100) -print(f"tree query results: {tq} ({len(tq[0])}) - at time {time.time() - start} seconds") - - -trilat = TrilaterationIndex(samples, metric='geodesic') -# t = timeit.timeit(lambda: trilat.query_radius(querypoint, r=0.07), number=500) -# print(f"Trilat 500x single point time: {t}") -tr = trilat.query(querypoint, k=100) -print(f"trilat.query: {tr} ({len(tr)}) - at time {time.time() - start} seconds") - - -t3 = trilat.query_expand(querypoint, k=100) -print(f"trilat.query_expand (t3): {t3} ({len(t3)}) - at time {time.time() - start} seconds") - -t4 = trilat.query_expand_2(querypoint, k=100) -print(f"trilat.query_expand_2 (t4): {t4} ({len(t4)}) - at time {time.time() - start} seconds") - -print(f"length match 1: {len(np.intersect1d(tq[1], tr[1]))}") -print(f"length match 2: {len(np.intersect1d(tq[1], t3[1]))}") -print(f"length match 4: {len(np.intersect1d(tq[1], t4[1]))}") - -print(f"setdiff: {np.setdiff1d(tq[0], t3)}") -if 18 in tq[0]: - print("18 in tq") -if 18 in t3[1]: - print("18 in t3") - -# exit() - -# uncomment to rerun, otherwise just print results: -# print(f"timing brute force:") -# cProfile.runctx('timeit.timeit(lambda: brute.kneighbors(querypoint, 100), number=20)', globals(), locals(), "QBFProfile_geo.prof") - -# print("timing kd tree") -# cProfile.runctx('timeit.timeit(lambda: tree.query(querypoint, k=100), number=20)', globals(), locals(), "QKDProfile_geo.prof") - -# print("timing trilat.query") -# cProfile.runctx('timeit.timeit(lambda: trilat.query(querypoint, k=100), number=20)', globals(), locals(), "QTRProfile_geo.prof") - -# print("timing query_expand") -# cProfile.runctx('timeit.timeit(lambda: trilat.query_expand(querypoint, k=100, mfudge=5, miter=20, sscale=2), number=20)', globals(), locals(), "QT3Profile_geo.prof") - -# print("timing query_expand_2") -# cProfile.runctx('timeit.timeit(lambda: trilat.query_expand_2(querypoint, k=100, mfudge=5, miter=20, sscale=2), number=20)', globals(), locals(), "QT4Profile_geo.prof") - -s = pstats.Stats("QBFProfile_geo.prof") -s.strip_dirs().sort_stats("cumtime").print_stats() - -s = pstats.Stats("QKDProfile_geo.prof") -s.strip_dirs().sort_stats("cumtime").print_stats() - -s = pstats.Stats("QTRProfile_geo.prof") -s.strip_dirs().sort_stats("cumtime").print_stats() - -s = pstats.Stats("QT3Profile_geo.prof") -s.strip_dirs().sort_stats("cumtime").print_stats() - -s = pstats.Stats("QT4Profile_geo.prof") -s.strip_dirs().sort_stats("cumtime").print_stats() \ No newline at end of file diff --git a/perftest_query.py b/perftest_query.py deleted file mode 100644 index 53d951ece0ab5..0000000000000 --- a/perftest_query.py +++ /dev/null @@ -1,113 +0,0 @@ -import csv -import sklearn -import numpy as np -import timeit -import time - -from scipy.spatial.distance import cdist - -import pstats, cProfile -import pyximport -pyximport.install - -import sklearn - -start = time.time() - -querypoint = np.asarray([[38.25, -85.50]]) - -samples = [] -refpoints = [] - -with open('/home/chipmonkey/repos/TrilaterationIndex/data/lat_long_synthetic.csv') as csvfile: - reader = csv.DictReader(csvfile, quoting=csv.QUOTE_NONNUMERIC) - for row in reader: - # print(row) - samples.append([row['Latitude'], row['Longitude']]) - -with open('/home/chipmonkey/repos/TrilaterationIndex/data/ref_points.csv') as csvfile: - reader = csv.DictReader(csvfile, quoting=csv.QUOTE_NONNUMERIC) - for row in reader: - print(row) - refpoints.append([row['Latitude'], row['Longitude']]) - -from sklearn.neighbors import NearestNeighbors, KDTree -from sklearn.neighbors import TrilaterationIndex - -print(f"{time.time() - start} seconds since start") - - -brute = NearestNeighbors(n_neighbors=1000, radius=0.07, algorithm='brute', metric='euclidean') -brute.fit(samples) -BF = brute.kneighbors(querypoint, 100) -print(f"brute force results: {BF[1][0]} ({len(BF[1][0])}) - at time {time.time() - start} seconds") - - -tree = KDTree(samples, metric='euclidean') -# t = timeit.timeit(lambda: tree.query_radius(querypoint, r=0.07), number=500) -# print(f"KDTree 500x single point time: {t}") -tq = tree.query(querypoint, k=100) -print(f"tree query results: {tq} ({len(tq[0])}) - at time {time.time() - start} seconds") - - -trilat = TrilaterationIndex(samples, metric='euclidean') -# t = timeit.timeit(lambda: trilat.query_radius(querypoint, r=0.07), number=500) -# print(f"Trilat 500x single point time: {t}") -tr = trilat.query(querypoint, k=100) -print(f"trilat query_radius: {tr} ({len(tr)}) - at time {time.time() - start} seconds") - - -t3 = trilat.query_expand(querypoint, k=100) -print(f"trilat query_radius_t3: {t3} ({len(t3)}) - at time {time.time() - start} seconds") - -t4 = trilat.query_expand_2(querypoint, k=100) -print(f"trilat query_expand_2: {t4} ({len(t4)}) - at time {time.time() - start} seconds") - -print(f"length match 1: {len(np.intersect1d(tq[1], tr[1]))}") -print(f"length match 2: {len(np.intersect1d(tq[1], t3[1]))}") -print(f"length match 4: {len(np.intersect1d(tq[1], t4[1]))}") - -# exit() - -print(f"setdiff: {np.setdiff1d(tq[0], t3)}") -if 18 in tq[0]: - print("18 in tq") -if 18 in t3[1]: - print("18 in t3") - - - -# print(type(querypoint)) -# cProfile.runctx('trilat.query_radius(querypoint, r=0.07)', globals(), locals(), "Profile.prof") - -print(f"timing brute force:") -cProfile.runctx('timeit.timeit(lambda: brute.kneighbors(querypoint, 100), number=500)', globals(), locals(), "QBFProfile.prof") - -print("timing kd tree") -# cProfile.runctx('timeit.timeit(lambda: tree.query(querypoint, k=100), number=500)', globals(), locals(), "QKDProfile.prof") -cProfile.runctx('timeit.timeit(lambda: tree.query(querypoint, k=100), number=20)', globals(), locals(), "QKDProfile.prof") - - -print("timing trilat.query") -cProfile.runctx('timeit.timeit(lambda: trilat.query(querypoint, k=100), number=500)', globals(), locals(), "QTRProfile.prof") - -print("timing query_expand") -cProfile.runctx('timeit.timeit(lambda: trilat.query_expand(querypoint, k=100, mfudge=5, miter=20, sscale=2), number=500)', globals(), locals(), "QT3Profile.prof") - -print("timing query_expand_2") -cProfile.runctx('timeit.timeit(lambda: trilat.query_expand_2(querypoint, k=100, mfudge=5, miter=20, sscale=2), number=500)', globals(), locals(), "QT4Profile.prof") - -s = pstats.Stats("QBFProfile.prof") -s.strip_dirs().sort_stats("cumtime").print_stats() - -s = pstats.Stats("QKDProfile.prof") -s.strip_dirs().sort_stats("cumtime").print_stats() - -s = pstats.Stats("QTRProfile.prof") -s.strip_dirs().sort_stats("cumtime").print_stats() - -s = pstats.Stats("QT3Profile.prof") -s.strip_dirs().sort_stats("cumtime").print_stats() - -s = pstats.Stats("QT4Profile.prof") -s.strip_dirs().sort_stats("cumtime").print_stats() \ No newline at end of file diff --git a/perftest_radius.py b/perftest_radius.py deleted file mode 100644 index 1775ab08b1718..0000000000000 --- a/perftest_radius.py +++ /dev/null @@ -1,84 +0,0 @@ -import csv -import sklearn -import numpy as np -import timeit - -from scipy.spatial.distance import cdist - -querypoint = np.asarray([[38.25, -85.50]]) - -samples = [] -refpoints = [] - -with open('/home/chipmonkey/repos/TrilaterationIndex/data/lat_long_synthetic.csv') as csvfile: - reader = csv.DictReader(csvfile, quoting=csv.QUOTE_NONNUMERIC) - for row in reader: - # print(row) - samples.append([row['Latitude'], row['Longitude']]) - -with open('/home/chipmonkey/repos/TrilaterationIndex/data/ref_points.csv') as csvfile: - reader = csv.DictReader(csvfile, quoting=csv.QUOTE_NONNUMERIC) - for row in reader: - print(row) - refpoints.append([row['Latitude'], row['Longitude']]) - -from sklearn.neighbors import NearestNeighbors, KDTree -from sklearn.neighbors import TrilaterationIndex - -brute = NearestNeighbors(n_neighbors=1000, radius=0.07, algorithm='brute') -brute.fit(samples) -BF = brute.radius_neighbors(querypoint, 0.1) -print(f"brute force results: {BF[1][0]} ({len(BF[1][0])})") - - -tree = KDTree(samples) -# t = timeit.timeit(lambda: tree.query_radius(querypoint, r=0.07), number=500) -# print(f"KDTree 500x single point time: {t}") -tq = tree.query_radius(querypoint, r=0.1) -print(f"tree query results: {tq} ({len(tq[0])})") - - -trilat = TrilaterationIndex(samples) -# t = timeit.timeit(lambda: trilat.query_radius(querypoint, r=0.07), number=500) -# print(f"Trilat 500x single point time: {t}") -tr = trilat.query_radius(querypoint, r=0.1) -print(f"trilat query_radius: {tr} ({len(tr)})") - - -t3 = trilat.query_radius_t3(querypoint, r=0.1) -print(f"trilat query_radius_t3: {t3} ({len(t3)})") - -print(f"length match 1: {len(np.intersect1d(tq[0], tr))}") -print(f"length match 2: {len(np.intersect1d(tq[0], t3))}") - -print(f"setdiff: {np.setdiff1d(tq[0], t3)}") -if 18 in tq[0]: - print("18 in tq") -if 18 in t3: - print("18 in t3") - - -import pstats, cProfile -import pyximport -pyximport.install - -import sklearn -# print(type(querypoint)) -# cProfile.runctx('trilat.query_radius(querypoint, r=0.07)', globals(), locals(), "Profile.prof") - -cProfile.runctx('timeit.timeit(lambda: brute.radius_neighbors(querypoint, 0.07), number=500)', globals(), locals(), "BFProfile.prof") -cProfile.runctx('timeit.timeit(lambda: tree.query_radius(querypoint, r=0.07), number=500)', globals(), locals(), "KDProfile.prof") -cProfile.runctx('timeit.timeit(lambda: trilat.query_radius(querypoint, r=0.07), number=500)', globals(), locals(), "TRProfile.prof") -cProfile.runctx('timeit.timeit(lambda: trilat.query_radius_t3(querypoint, r=0.07), number=500)', globals(), locals(), "T3Profile.prof") - -s = pstats.Stats("BFProfile.prof") -s.strip_dirs().sort_stats("cumtime").print_stats() - -s = pstats.Stats("KDProfile.prof") -s.strip_dirs().sort_stats("cumtime").print_stats() - -s = pstats.Stats("TRProfile.prof") -s.strip_dirs().sort_stats("cumtime").print_stats() - -s = pstats.Stats("T3Profile.prof") -s.strip_dirs().sort_stats("cumtime").print_stats() \ No newline at end of file From 32c42e7f120c7715d76bac6f16222b06e3a7d5f4 Mon Sep 17 00:00:00 2001 From: Chip Lynch Date: Mon, 15 Mar 2021 19:40:16 -0700 Subject: [PATCH 18/19] Fix bug that caused segfault --- sklearn/neighbors/_trilateration.pyx | 35 ++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/sklearn/neighbors/_trilateration.pyx b/sklearn/neighbors/_trilateration.pyx index a844d269ce991..2c0538d7f6ee2 100644 --- a/sklearn/neighbors/_trilateration.pyx +++ b/sklearn/neighbors/_trilateration.pyx @@ -144,7 +144,7 @@ cdef class TrilaterationIndex: self.ndims = self.data_arr.shape[1] - print(f"self.dist_metric.__class__.__name__: {self.dist_metric.__class__.__name__}") + # print(f"self.dist_metric.__class__.__name__: {self.dist_metric.__class__.__name__}") if self.dist_metric.__class__.__name__ == 'GeodesicDistance' or \ self.dist_metric.__class__.__name__ == 'HaversineDistance': @@ -179,8 +179,10 @@ cdef class TrilaterationIndex: """ if isinstance(X, list): + # print("check 1") results = self._query_one(np.asarray(X), k, return_distance) elif X.shape[0] == 1: + # print("check 2") results=self._query_one(X, k, return_distance) else: raise NotImplementedError("Not yet") @@ -189,7 +191,6 @@ cdef class TrilaterationIndex: # return_distance=return_distance, # sort_results=sort_results) \ # for x in X] - return results cdef _query_one(self, X, k=5, @@ -202,6 +203,7 @@ cdef class TrilaterationIndex: "use query for multiple records") if X.shape[X.ndim - 1] != self.data.shape[1]: + # print(f"X is : {X}") raise ValueError("query data dimension must " "match training data dimension") @@ -209,14 +211,19 @@ cdef class TrilaterationIndex: raise ValueError("k must be less than or equal " "to the number of training points") + # print(f"Check: X is {X}") # Can probably improve this - using a heap that allows more than 1 value # but we're always only using one here (X.shape[0] must be 1 from above) cdef NeighborsHeap heap = NeighborsHeap(X.shape[0], k) + # print(f"Check - Built NeighborsHeap") + # Establish the distances from the query point to the reference points cdef np.ndarray[DTYPE_t, ndim=2] q_dists q_dists = self.dist_metric.pairwise(X, self.ref_points_arr) + # print(f"Check - built q_dists") + # cdef DTYPE_t[:, ::1] Xarr # Xarr = get_memview_DTYPE_2D(np.asarray(X)) # Xarr = np.asarray(X, dtype=DTYPE, order='C') @@ -228,17 +235,20 @@ cdef class TrilaterationIndex: best_dist = self.dist_metric.pairwise([self.data_arr[self.idx_array_arr[best_idx],:]], X) # heap.push(0, best_dist, best_idx) + # Populate the heap using 2k elements; k on each side of our first guess: low_idx = max(best_idx - k, 0) high_idx = min(best_idx + k, self.distances.shape[0]) + # print("Check - ready for loop") for i in range(low_idx, high_idx + 1): + # print(f"Check - Loop {i}") if i < self.distances.shape[0]: test_dist = self.dist_metric.pairwise([self.data_arr[self.idx_array_arr[i],:]], X) heap.push(0, test_dist, self.idx_array_arr[i]) - + # print("Check - out of loop") # Establish bounds between which to search - if heap.largest(0) != np.inf: + if heap.largest(0) == np.inf: # This was != before... bug? low_idx_possible = 0 high_idx_possible = self.data_arr.shape[0] else: @@ -249,20 +259,26 @@ cdef class TrilaterationIndex: # So we test more than one new point at a time test_idx = best_idx + # print(f"Check - low_idx_possible: {low_idx_possible}, high_idx_possible: {high_idx_possible}") + while True: + # print(f"Check - Entering while loop with low_idx: {low_idx}, high_idx: {high_idx}") if low_idx <= low_idx_possible and high_idx >= high_idx_possible: break # Determine whether the next high or low point is a better test: lowdelta = fabs(self.distances[low_idx, 0] - q_dists[0, 0]) highdelta = fabs(self.distances[high_idx, 0] - q_dists[0, 0]) - if lowdelta <= highdelta and low_idx >= low_idx_possible: + if lowdelta <= highdelta and low_idx > low_idx_possible: + # print("check a") test_idx = low_idx low_idx = low_idx - 1 - elif high_idx <= high_idx_possible: + elif high_idx < high_idx_possible: + # print("check b") test_idx = high_idx high_idx = high_idx + 1 - elif low_idx >= low_idx_possible: + elif low_idx > low_idx_possible: + # print("check c") test_idx = low_idx low_idx = low_idx - 1 else: @@ -273,6 +289,7 @@ cdef class TrilaterationIndex: # Check that all pre-calculated distances are better than best sufficient = True + # print("Check - checking sufficient") for d, q in zip(self.distances[test_idx,1:], q_dists[0,1:]): # print(f"testing that: {abs(d-q)} < {heap.largest(0)}") if abs(d-q) > heap.largest(0): @@ -290,7 +307,7 @@ cdef class TrilaterationIndex: high_idx_possible = _find_nearest_sorted_2D(self.distances, q_dists[0, 0] + heap.largest(0)) continue - + # print("Check - out of while") # return (self.idx_array[best_idx], best_dist) return heap.get_arrays() @@ -727,8 +744,6 @@ cdef class TrilaterationIndex: for i in range(len(dist_results)): # print(f"{i} (count: {count}) testing point {approx_idxs[i]} with dist: {dist_results[i]}") if dist_results[i] <= r: - # if approx_idxs[i] == 18: - # print(f"adding {approx_idxs[i]}") result_arr[count] = approx_idxs[i] count += 1 From e3f65f52f42c0bb16e580827f546868b9de0f942 Mon Sep 17 00:00:00 2001 From: Chip Lynch Date: Mon, 22 Mar 2021 14:31:30 -0700 Subject: [PATCH 19/19] Mostly fixed bug causing query_expand to return no results It's possible that if the initial guess is too low, the _query_expand function will iterate beyond its MAX_ITER limit before finding a single approximate match. This is patched by forcing the initial guess function to report a minimum radius of 0.5, but this could be problematic in certain coordinate systems. --- sklearn/neighbors/_trilateration.pyx | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/sklearn/neighbors/_trilateration.pyx b/sklearn/neighbors/_trilateration.pyx index 2c0538d7f6ee2..0cd70497dadff 100644 --- a/sklearn/neighbors/_trilateration.pyx +++ b/sklearn/neighbors/_trilateration.pyx @@ -326,10 +326,14 @@ cdef class TrilaterationIndex: ground_idx = _find_nearest_sorted_2D(self.distances, q_dists[0, 0]) # ground_idx = np.searchsorted(self.distances, q_dists[0, 0], side="left") - low_idx = ground_idx - (k//2) - high_idx = ground_idx + ceil(k/2) + low_idx = max(ground_idx - k, 0) + high_idx = min(ground_idx + k, self.data_arr.shape[0]) + + # Values were insanely low - like 1e-07. For now, forcing min of 0.5. Consider improvements here. + # print(f"""get_start_dist: ({low_idx}, {high_idx}), {self.distances[high_idx, 0]}, {self.distances[low_idx, 0]} + # {self.distances[high_idx, 0] - self.distances[low_idx, 0]}""") - return(self.distances[high_idx, 0] - self.distances[low_idx, 0]) + return(min(0.5, self.distances[high_idx, 0] - self.distances[low_idx, 0])) def query_expand(self, X, k, return_distance=True, sort_results=True, mfudge=5.0, miter=20, sscale=2.0): @@ -421,12 +425,16 @@ cdef class TrilaterationIndex: if new_radius > too_high: new_radius = (too_high + too_low) / 2 radius = new_radius + # print(f"too small... new radius: {radius}") # approx_array = self._query_radius_at_least_k(X, radius, k) approx_array = self._query_radius_r0_only(X, radius) approx_count = approx_array.shape[0] + # print(f"approx_count: {approx_count}") continue elif approx_count > k * MAX_FUDGE: + too_high = radius radius = (radius + too_low) / STD_SCALE + # print(f"too big... new radius: {radius}") # approx_array = self._query_radius_at_least_k(X, radius, k) approx_array = self._query_radius_r0_only(X, radius) approx_count = approx_array.shape[0]