diff --git a/Makefile b/Makefile index 112b1e68188a0..5ea35053a3b21 100644 --- a/Makefile +++ b/Makefile @@ -66,3 +66,8 @@ code-analysis: flake8-diff: git diff upstream/main -u -- "*.py" | flake8 --diff + +install: + . ./venv38/bin/activate ; pip install --verbose --no-build-isolation --editable . +install-test: + python -c "import sklearn; sklearn.show_versions()" diff --git a/debug_query_radius.py b/debug_query_radius.py new file mode 100644 index 0000000000000..5b9ea31be65c4 --- /dev/null +++ b/debug_query_radius.py @@ -0,0 +1,123 @@ +# File to debug +# This file should never be published... +# Purge it from git before merging + +import csv +import sklearn +import numpy as np + +from scipy.spatial.distance import cdist + +# samples = [[0., 0., 0.], [0., .5, 0.], [1., 1., .5]] +querypoint = [[50, 65]] + +samples = [] +refpoints = [] + +with open('/home/chipmonkey/repos/TrilaterationIndex/data/point_sample_10.csv') as csvfile: + reader = csv.DictReader(csvfile, quoting=csv.QUOTE_NONNUMERIC) + for row in reader: + print(row) + samples.append([row['x'], row['y']]) + +with open('/home/chipmonkey/repos/TrilaterationIndex/data/sample_ref_points.csv') as csvfile: + reader = csv.DictReader(csvfile, quoting=csv.QUOTE_NONNUMERIC) + for row in reader: + print(row) + refpoints.append([row['x'], row['y']]) + +print(refpoints) + +print(samples) +print("Actual distances:") +print(cdist(samples, querypoint)) + +from sklearn.neighbors import NearestNeighbors, KDTree +from sklearn.neighbors import TrilaterationIndex + +tree = KDTree(samples) +tq = tree.query_radius(querypoint, r=20) +print("tree query results:") +print(tq) + +neigh = NearestNeighbors(radius=10) +neigh.fit(samples) + +print("finding neighbors within radius:") +rng = neigh.radius_neighbors([[44., 44.]]) +print("distances:") +print(np.asarray(rng[0][0])) +print("indexes:") +print(list(np.asarray(rng[1][0]))) +print("points:") +things = [samples[x] for x in list(np.asarray(rng[1][0]))] +print(things) +# [samples[x] for x in list(np.asarray(rng[1][0]))] +# print(samples[list(np.asarray(rng[1][0]))]) + +print(type(tree)) +print(type(tree.data)) +# dist, ind = tree.query +print(np.asarray(tree.node_data)) # Ohhh https://stackoverflow.com/questions/20978377/cython-convert-memory-view-to-numpy-array +print(tree.node_data[0]) + +ti = TrilaterationIndex(samples, refpoints) + + + +print(np.asarray(ti.ref_points)) +print(np.asarray(ti.idx_array)) +print("distances:") +print(np.asarray(ti.distances)) + +print(ti.ref_points.shape[0]) +print(ti.ref_points.shape[1]) + +result1 = ti.query_radius_approx(np.asarray(querypoint), 20) +print(f"query_radius_approx: {result1}") + + +result2 = ti.query_radius_approx(np.asarray(querypoint), 20) +print(f"query_radius: {result2}") + +rq = ti.query_radius(np.asarray(querypoint), 20) +print(f"rq: {rq}") + +print("------------") +print("single point tests") +r4 = ti.query(querypoint, 3) +ids, dists = r4 +print(f"query: {r4}") +print(f"indexes: {ids}, dists: {dists}") + +r3 = ti.query_expand(querypoint, 3) +ids, dists = r3 +print(f"query_expand: {r3}") +print(f"indexes: {np.asarray(ids)}", dists) + +print("Passed Single Point tests") +print("------------") +print("Starting multi point tests") + +querypoints = np.asarray([np.asarray([50, 65]), + np.asarray([55, 70]), + np.asarray([45, 80])]) + +r4 = ti.query(querypoints, 3) +print(f"query results r4: {r4}") +print(f"r4.shape: {np.asarray(r4).shape}") + +ids, dists = r4 +print(f"indexes: {ids}, dists: {dists}") + +r3 = ti.query_expand(querypoints, 3) +ids, dists = r3 
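+# NB: query() and query_expand() hand back NeighborsHeap.get_arrays(), a
+# (distances, indices) pair, so in the unpacking above `ids` actually holds
+# the distances and `dists` the indices.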
+print(f"query_expand: {r3}") +print(f"indexes: {np.asarray(ids)}", dists) + +# [(array([[14.30117536, 15.82017495, 16.12663017]]), array([[0, 5, 3]])), +# (array([[16.08773716, 16.85576594, 16.85576594]]), array([[5, 0, 0]])), +# (array([[ 4.37084771, 23.74124599, 23.74124599]]), array([[5, 3, 3]]))] + +# cython sklearn/neighbors/_trilateration.pyx -a +# cython sklearn/neighbors/_kd_tree.pyx -a diff --git a/doc/conf.py b/doc/conf.py index adf12d9e88e82..62013d8891073 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -25,6 +25,7 @@ # is relative to the documentation root, use os.path.abspath to make it # absolute, like shown here. sys.path.insert(0, os.path.abspath('sphinxext')) +sys.path.insert(0, os.path.abspath('../venv38/lib/python3.8/site-packages/')) from github_link import make_linkcode_resolve import sphinx_gallery diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index a3cf7f4bf1d72..0aad03de94f9d 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -751,6 +751,53 @@ def haversine_distances(X, Y=None): from sklearn.neighbors import DistanceMetric return DistanceMetric.get_metric('haversine').pairwise(X, Y) +def geodesic_distances(X, Y=None): + """Compute the Geodesic distance between samples in X and Y. + + The geodesic distance is an accurate measure of the distance between + two points on the earth, using an ellipsoidal model of the earth + which is more accurate than the spherical model used in Haversine. + + The trade-off is that no closed form calculation exists, so the + process is iterative, using numeric techniques until a specific + accuracy is reached. + + See the geopy.distance.geodesic code for implementation details. + + Parameters + ---------- + X : array-like of shape (n_samples_X, 2) + + Y : array-like of shape (n_samples_Y, 2), default=None + + Returns + ------- + distance : ndarray of shape (n_samples_X, n_samples_Y) + + Notes + ----- + This will be significantly slower than the Haversine distance and + should only be used when more accuracy is needed. + + Unlike Haversine, which requires input in radians, the geodesic + implementation works on (latitude, longitude) pairs. + + Examples + -------- + We want to calculate the distance between the Ezeiza Airport + (Buenos Aires, Argentina) and the Charles de Gaulle Airport (Paris, + France). + + >>> from sklearn.metrics.pairwise import geodesic_distances + >>> bsas = [-34.83333, -58.5166646] + >>> paris = [49.0083899664, 2.53844117956] + >>> result = geodesic_distances([bsas, paris]) + >>> result + array([[ 0. , 11099.54035582], + [11099.54035582, 0. 
]]) + """ + from sklearn.neighbors import DistanceMetric + return DistanceMetric.get_metric('geodesic').pairwise(X, Y) @_deprecate_positional_args def manhattan_distances(X, Y=None, *, sum_over_features=True): @@ -1340,6 +1387,7 @@ def chi2_kernel(X, Y=None, gamma=1.): 'manhattan': manhattan_distances, 'precomputed': None, # HACK: precomputed is always allowed, never called 'nan_euclidean': nan_euclidean_distances, + 'geodesic': geodesic_distances, } @@ -1359,6 +1407,7 @@ def distance_metrics(): 'cosine' metrics.pairwise.cosine_distances 'euclidean' metrics.pairwise.euclidean_distances 'haversine' metrics.pairwise.haversine_distances + 'geodesic' metrics.pairwise.geodesic_distances 'l1' metrics.pairwise.manhattan_distances 'l2' metrics.pairwise.euclidean_distances 'manhattan' metrics.pairwise.manhattan_distances @@ -1440,7 +1489,7 @@ def _pairwise_callable(X, Y, metric, force_all_finite=True, **kwds): 'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule', "wminkowski", - 'nan_euclidean', 'haversine'] + 'nan_euclidean', 'haversine', 'geodesic'] _NAN_METRICS = ['nan_euclidean'] @@ -1694,6 +1743,9 @@ def pairwise_distances(X, Y=None, metric="euclidean", *, n_jobs=None, See the documentation for scipy.spatial.distance for details on these metrics. These metrics do not support sparse matrix inputs. + - From geopy.distance: ['geodesic'] + See the documentation for geopy for details on this metric. + Note that in the case of 'cityblock', 'cosine' and 'euclidean' (which are valid scipy.spatial.distance metrics), the scikit-learn implementation will be used, which is faster and has support for sparse matrices (except diff --git a/sklearn/neighbors/__init__.py b/sklearn/neighbors/__init__.py index 82f9993bec50c..cb9d55c22498c 100644 --- a/sklearn/neighbors/__init__.py +++ b/sklearn/neighbors/__init__.py @@ -5,6 +5,7 @@ from ._ball_tree import BallTree from ._kd_tree import KDTree +from ._trilateration import TrilaterationIndex from ._dist_metrics import DistanceMetric from ._graph import kneighbors_graph, radius_neighbors_graph from ._graph import KNeighborsTransformer, RadiusNeighborsTransformer @@ -20,6 +21,7 @@ __all__ = ['BallTree', 'DistanceMetric', 'KDTree', + 'TrilaterationIndex', 'KNeighborsClassifier', 'KNeighborsRegressor', 'KNeighborsTransformer', diff --git a/sklearn/neighbors/_ball_tree.pyx b/sklearn/neighbors/_ball_tree.pyx index 81ce9606f7b80..f9ee57aa43d7b 100644 --- a/sklearn/neighbors/_ball_tree.pyx +++ b/sklearn/neighbors/_ball_tree.pyx @@ -19,7 +19,8 @@ VALID_METRICS = ['EuclideanDistance', 'SEuclideanDistance', 'DiceDistance', 'KulsinskiDistance', 'RogersTanimotoDistance', 'RussellRaoDistance', 'SokalMichenerDistance', 'SokalSneathDistance', - 'PyFuncDistance', 'HaversineDistance'] + 'PyFuncDistance', 'HaversineDistance', + 'GeodesicDistance'] include "_binary_tree.pxi" diff --git a/sklearn/neighbors/_dist_metrics.pyx b/sklearn/neighbors/_dist_metrics.pyx index 8bee948eeaeba..e466e435b8910 100755 --- a/sklearn/neighbors/_dist_metrics.pyx +++ b/sklearn/neighbors/_dist_metrics.pyx @@ -4,6 +4,7 @@ #cython: cdivision=True # By Jake Vanderplas (2013) +# Geodesic hack by Chip Lynch (2021) #needswork # written for the scikit-learn project # License: BSD @@ -11,6 +12,7 @@ import numpy as np cimport numpy as np np.import_array() # required in order to use C-API +from geopy.distance import geodesic ###################################################################### # Numpy 1.3-1.4 compatibility 
utilities
@@ -86,7 +88,9 @@ METRIC_MAPPING = {'euclidean': EuclideanDistance,
                   'sokalmichener': SokalMichenerDistance,
                   'sokalsneath': SokalSneathDistance,
                   'haversine': HaversineDistance,
-                  'pyfunc': PyFuncDistance}
+                  'pyfunc': PyFuncDistance,
+                  'geodesic': GeodesicDistance,
+                  'vincenty': GeodesicDistance}
 
 
 def get_valid_metric_ids(L):
@@ -410,6 +414,31 @@ cdef class DistanceMetric:
                                get_memview_DTYPE_2D(Darr))
         return Darr
 
+#------------------------------------------------------------
+# Geodesic Distance
+#  Accurate distance between two points on earth,
+#  modeled as an ellipsoid.
+#  See HaversineDistance for a spherical-earth distance.
+cdef class GeodesicDistance(DistanceMetric):
+    r"""Geodesic Distance metric
+
+    Imported from geopy, see:
+    https://geopy.readthedocs.io/en/stable/#module-geopy.distance
+    """
+
+    # NOTE: this import runs when the class is created at module load time,
+    # so geopy is a hard dependency of this module (not a lazy import).
+    from geopy.distance import geodesic
+
+    def __init__(self):
+        self.p = 2
+
+    cdef DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2,
+                      ITYPE_t size) nogil except -1:
+        if size != 2:
+            with gil:
+                raise ValueError("GeodesicDistance requires exactly 2 "
+                                 "dimensions (latitude, longitude)")
+        with gil:
+            return geodesic((x1[0], x1[1]), (x2[0], x2[1])).kilometers
+
 
 #------------------------------------------------------------
 # Euclidean Distance
diff --git a/sklearn/neighbors/_helpers.pxi b/sklearn/neighbors/_helpers.pxi
new file mode 100644
index 0000000000000..4ecc6569845ba
--- /dev/null
+++ b/sklearn/neighbors/_helpers.pxi
@@ -0,0 +1,228 @@
+# File is temporary, for moving around some dependencies
+
+from libc.math cimport fabs
+
+cdef DTYPE_t[:, ::1] get_memview_DTYPE_2D(
+    np.ndarray[DTYPE_t, ndim=2, mode='c'] X):
+    return <DTYPE_t[:X.shape[0], :X.shape[1]:1]> (<DTYPE_t*> X.data)
+
+cdef ITYPE_t[::1] get_memview_ITYPE_1D(
+    np.ndarray[ITYPE_t, ndim=1, mode='c'] X):
+    return <ITYPE_t[:X.shape[0]:1]> (<ITYPE_t*> X.data)
+
+cdef ITYPE_t[:, ::1] get_memview_ITYPE_2D(
+    np.ndarray[ITYPE_t, ndim=2, mode='c'] X):
+    return <ITYPE_t[:X.shape[0], :X.shape[1]:1]> (<ITYPE_t*> X.data)
+
+cdef inline void dual_swap(DTYPE_t* darr, ITYPE_t* iarr,
+                           ITYPE_t i1, ITYPE_t i2) nogil:
+    """swap the values at index i1 and i2 of both darr and iarr"""
+    cdef DTYPE_t dtmp = darr[i1]
+    darr[i1] = darr[i2]
+    darr[i2] = dtmp
+
+    cdef ITYPE_t itmp = iarr[i1]
+    iarr[i1] = iarr[i2]
+    iarr[i2] = itmp
+
+cdef int _find_nearest_sorted_2D(DTYPE_t[:,:] rdists, DTYPE_t target):
+    """rdists must be sorted by its first column.
+    Can we do this faster with Memoryviews?  See _simultaneous_sort
+    """
+    cdef int idx
+    idx = np.searchsorted(rdists[:,0], target, side="left")
+    if idx > 0 and (idx == len(rdists) or
+                    fabs(target - rdists[idx-1,0]) < fabs(target - rdists[idx,0])):
+        idx = idx - 1
+    return idx
+
+cdef int _find_nearest_sorted_1D(DTYPE_t[:] rdists, DTYPE_t target):
+    """rdists must be sorted.
+    Can we do this faster with Memoryviews?  See _simultaneous_sort
+    """
+    cdef int idx
+    idx = np.searchsorted(rdists[:], target, side="left")
+    if idx > 0 and (idx == len(rdists) or
+                    fabs(target - rdists[idx-1]) < fabs(target - rdists[idx])):
+        idx = idx - 1
+    return idx
+
+cdef class NeighborsHeap:
+    """A max-heap structure to keep track of distances/indices of neighbors
+
+    This implements an efficient pre-allocated set of fixed-size heaps
+    for chasing neighbors, holding both an index and a distance.
+    When any row of the heap is full, adding an additional point will push
+    the furthest point off the heap.
+
+    Parameters
+    ----------
+    n_pts : int
+        the number of heaps to use
+    n_nbrs : int
+        the size of each heap.
+ """ + cdef np.ndarray distances_arr + cdef np.ndarray indices_arr + + cdef DTYPE_t[:, ::1] distances + cdef ITYPE_t[:, ::1] indices + + def __cinit__(self): + self.distances_arr = np.zeros((1, 1), dtype=DTYPE, order='C') + self.indices_arr = np.zeros((1, 1), dtype=ITYPE, order='C') + self.distances = get_memview_DTYPE_2D(self.distances_arr) + self.indices = get_memview_ITYPE_2D(self.indices_arr) + + def __init__(self, n_pts, n_nbrs): + self.distances_arr = np.full((n_pts, n_nbrs), np.inf, dtype=DTYPE, + order='C') + self.indices_arr = np.zeros((n_pts, n_nbrs), dtype=ITYPE, order='C') + self.distances = get_memview_DTYPE_2D(self.distances_arr) + self.indices = get_memview_ITYPE_2D(self.indices_arr) + + def get_max(self, row): + """Get the max distance and index from the heap for a given row + """ + return self.largest(row), self.largest_idx(row) + + def get_arrays(self, sort=True): + """Get the arrays of distances and indices within the heap. + + If sort=True, then simultaneously sort the indices and distances, + so the closer points are listed first. + """ + if sort: + self._sort() + return self.distances_arr, self.indices_arr + + cdef inline DTYPE_t largest(self, ITYPE_t row) nogil except -1: + """Return the largest distance in the given row""" + return self.distances[row, 0] + + cdef inline ITYPE_t largest_idx(self, ITYPE_t row) nogil except -1: + """Return the index for the largest distance in the given row""" + return self.indices[row, 0] + + def push(self, ITYPE_t row, DTYPE_t val, ITYPE_t i_val): + return self._push(row, val, i_val) + + cdef int _push(self, ITYPE_t row, DTYPE_t val, + ITYPE_t i_val) nogil except -1: + """push (val, i_val) into the given row""" + cdef ITYPE_t i, ic1, ic2, i_swap + cdef ITYPE_t size = self.distances.shape[1] + cdef DTYPE_t* dist_arr = &self.distances[row, 0] + cdef ITYPE_t* ind_arr = &self.indices[row, 0] + + # check if val should be in heap + if val > dist_arr[0]: + return 0 + + # insert val at position zero + dist_arr[0] = val + ind_arr[0] = i_val + + # descend the heap, swapping values until the max heap criterion is met + i = 0 + while True: + ic1 = 2 * i + 1 + ic2 = ic1 + 1 + + if ic1 >= size: + break + elif ic2 >= size: + if dist_arr[ic1] > val: + i_swap = ic1 + else: + break + elif dist_arr[ic1] >= dist_arr[ic2]: + if val < dist_arr[ic1]: + i_swap = ic1 + else: + break + else: + if val < dist_arr[ic2]: + i_swap = ic2 + else: + break + + dist_arr[i] = dist_arr[i_swap] + ind_arr[i] = ind_arr[i_swap] + + i = i_swap + + dist_arr[i] = val + ind_arr[i] = i_val + + return 0 + + cdef int _sort(self) except -1: + """simultaneously sort the distances and indices""" + cdef DTYPE_t[:, ::1] distances = self.distances + cdef ITYPE_t[:, ::1] indices = self.indices + cdef ITYPE_t row + for row in range(distances.shape[0]): + _simultaneous_sort(&distances[row, 0], + &indices[row, 0], + distances.shape[1]) + return 0 + +cdef int _simultaneous_sort(DTYPE_t* dist, ITYPE_t* idx, + ITYPE_t size) nogil except -1: + """ + Perform a recursive quicksort on the dist array, simultaneously + performing the same swaps on the idx array. 
The equivalent in
+    numpy (though quite a bit slower) is
+
+    def simultaneous_sort(dist, idx):
+        i = np.argsort(dist)
+        return dist[i], idx[i]
+    """
+    cdef ITYPE_t pivot_idx, i, store_idx
+    cdef DTYPE_t pivot_val
+
+    # in the small-array case, do things efficiently
+    if size <= 1:
+        pass
+    elif size == 2:
+        if dist[0] > dist[1]:
+            dual_swap(dist, idx, 0, 1)
+    elif size == 3:
+        if dist[0] > dist[1]:
+            dual_swap(dist, idx, 0, 1)
+        if dist[1] > dist[2]:
+            dual_swap(dist, idx, 1, 2)
+            if dist[0] > dist[1]:
+                dual_swap(dist, idx, 0, 1)
+    else:
+        # Determine the pivot using the median-of-three rule.
+        # The smallest of the three is moved to the beginning of the array,
+        # the middle (the pivot value) is moved to the end, and the largest
+        # is moved to the pivot index.
+        pivot_idx = size / 2
+        if dist[0] > dist[size - 1]:
+            dual_swap(dist, idx, 0, size - 1)
+        if dist[size - 1] > dist[pivot_idx]:
+            dual_swap(dist, idx, size - 1, pivot_idx)
+            if dist[0] > dist[size - 1]:
+                dual_swap(dist, idx, 0, size - 1)
+        pivot_val = dist[size - 1]
+
+        # partition indices about pivot.  At the end of this operation,
+        # pivot_idx will contain the pivot value, everything to the left
+        # will be smaller, and everything to the right will be larger.
+        store_idx = 0
+        for i in range(size - 1):
+            if dist[i] < pivot_val:
+                dual_swap(dist, idx, i, store_idx)
+                store_idx += 1
+        dual_swap(dist, idx, store_idx, size - 1)
+        pivot_idx = store_idx
+
+        # recursively sort each side of the pivot
+        if pivot_idx > 1:
+            _simultaneous_sort(dist, idx, pivot_idx)
+        if pivot_idx + 2 < size:
+            _simultaneous_sort(dist + pivot_idx + 1,
+                               idx + pivot_idx + 1,
+                               size - pivot_idx - 1)
+    return 0
diff --git a/sklearn/neighbors/_kd_tree.pyx b/sklearn/neighbors/_kd_tree.pyx
index bc1ab764a6fcf..d6c0c94a2d57a 100644
--- a/sklearn/neighbors/_kd_tree.pyx
+++ b/sklearn/neighbors/_kd_tree.pyx
@@ -12,7 +12,8 @@ __all__ = ['KDTree']
 DOC_DICT = {'BinaryTree': 'KDTree', 'binary_tree': 'kd_tree'}
 
 VALID_METRICS = ['EuclideanDistance', 'ManhattanDistance',
-                 'ChebyshevDistance', 'MinkowskiDistance']
+                 'ChebyshevDistance', 'MinkowskiDistance',
+                 'GeodesicDistance']
 
 include "_binary_tree.pxi"
diff --git a/sklearn/neighbors/_trilateration.pyx b/sklearn/neighbors/_trilateration.pyx
new file mode 100644
index 0000000000000..0cd70497dadff
--- /dev/null
+++ b/sklearn/neighbors/_trilateration.pyx
@@ -0,0 +1,793 @@
+#!python
+#cython: boundscheck=False
+#cython: wraparound=False
+#cython: cdivision=True
+#cython: profile=True
+
+# Trilateration (or n-fold lateration for arbitrary dimensions)
+# =====================
+#
+#  Author: Chip Lynch
+#  License: BSD something something
+#
+# The Trilateration Index itself is only a data structure designed
+# to store distances from n (default 3, thus "tri") fixed points in a
+# sorted manner for quick approximate distance comparisons.  This is
+# most beneficial when the distance metric is expensive to compute
+# (e.g. accurate geographic distances on an oblate-spheroid earth).
+#
+# Trilateration-optimized nearest-neighbor and related algorithms will
+# create this structure during the "fit" phase, and leverage the anchored
+# distance data within neighbor searches.
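+#
+# A rough sketch of the idea in plain NumPy (illustrative only; `data`
+# (n, d) and `refs` (m, d) are hypothetical names, and the module below
+# implements this on sorted memoryviews instead):
+#
+#     import numpy as np
+#     from scipy.spatial.distance import cdist
+#
+#     ref_dists = cdist(data, refs)        # anchor distances, shape (n, m)
+#     order = np.argsort(ref_dists[:, 0])  # sort everything by d(., ref 0)
+#     ref_dists, idx = ref_dists[order], order
+#
+#     # For a query q with qd = cdist([q], refs)[0], the triangle inequality
+#     # gives |qd[j] - ref_dists[i, j]| <= d(q, data[i]) for every anchor j,
+#     # so a candidate i can be discarded whenever that gap already exceeds
+#     # the search radius, without computing the (expensive) exact distance.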
+ +cimport numpy as np +import numpy as np + +from collections import Counter +from functools import reduce +from libc.math cimport fabs, ceil + +from ..utils import ( + check_array +) + +from ._typedefs cimport DTYPE_t, ITYPE_t, DITYPE_t +from ._typedefs import DTYPE, ITYPE + +from ._dist_metrics cimport (DistanceMetric, euclidean_dist, euclidean_rdist, + euclidean_dist_to_rdist, euclidean_rdist_to_dist) + + +__all__ = ['TrilaterationIndex'] + +DOC_DICT = {'TrilaterationIndex': 'TrilaterationIndex'} + +# Start simple: +VALID_METRICS = ['EuclideanDistance', 'HaversineDistance', + 'GeodesicDistance'] + +include "_helpers.pxi" + +cdef class TrilaterationIndex: + # __doc__ = CLASS_DOC.format(**DOC_DICT) + + cdef np.ndarray data_arr + cdef np.ndarray ref_points_arr + cdef np.ndarray idx_array_arr + cdef np.ndarray r0sortorder + cdef np.ndarray ref_dists + cdef np.ndarray r_indexes + cdef np.ndarray r_distances + + cdef char* metric + + cdef readonly ITYPE_t[:, ::1] r_indexes_mv + cdef readonly DTYPE_t[:, ::1] r_distances_mv + + cdef readonly DTYPE_t[:, ::1] data + cdef readonly DTYPE_t[:, ::1] ref_points + cdef public ITYPE_t[::1] idx_array + cdef readonly DTYPE_t[:, ::1] distances + + cdef int ndims + + cdef DistanceMetric dist_metric + + + def __cinit__(self): + self.data_arr = np.empty((1, 1), dtype=DTYPE, order='C') + + def __init__(self, data, ref=None, metric='euclidean', **kwargs): + self.data_arr = check_array(data, dtype=DTYPE, order='C') + + n_samples = self.data_arr.shape[0] + n_features = self.data_arr.shape[1] + + self.dist_metric = DistanceMetric.get_metric(metric, **kwargs) + + metric = self.dist_metric.__class__.__name__ + if metric not in VALID_METRICS: + raise ValueError('metric {metric} is not valid for ' + '{TrilaterationIndex}'.format(metric=metric, + **DOC_DICT)) + self.dist_metric._validate_data(self.data_arr) + + # Ref points can be passed in + # Otherwise they are generated based on input data + if ref is None: + ref = self._create_ref_points() + # print("generated synthetic reference points:") + # print(ref) + + self.ref_points_arr = check_array(ref, dtype=DTYPE, order='C') + + self.ref_dists = self.dist_metric.pairwise(self.data_arr, self.ref_points_arr) + + if self.data_arr.shape[1] != self.ref_points_arr.shape[1]: + raise ValueError("Data and reference points must share same dimension") + + # allocate arrays for storage + self.idx_array_arr = np.arange(n_samples, dtype=ITYPE) + + + # Sort data by distance to refpoint[0] + # TODO: See _simultaneous sort to see if we can speed this up: + # (Right now since self.distances is 2D, we can't use this) + # _simultaneous_sort(&self.distances, + # &self.idx_array, + # self.distances.shape[0]) + + r0sortorder = np.argsort(self.ref_dists[:,0]) + self.ref_dists = self.ref_dists[r0sortorder] + self.idx_array_arr = self.idx_array_arr[r0sortorder] + + # Build ADDITIONAL indexes each sorted by the specific ref dists + # Note that self.distances/self.ref_dists are ONLY sorted by r0 dist + self.distances = get_memview_DTYPE_2D(self.ref_dists) + self.data = get_memview_DTYPE_2D(self.data_arr) + self.idx_array = get_memview_ITYPE_1D(self.idx_array_arr) + self.ref_points = get_memview_DTYPE_2D(self.ref_points_arr) + + self._build_r_indexes() + self.r_indexes_mv = get_memview_ITYPE_2D(self.r_indexes) + self.r_distances_mv = get_memview_DTYPE_2D(self.r_distances) + + + def _create_ref_points(self): + """ + Create a set of well distributed reference points + in the same number of dimensions as the input data + + For the love of all that is 
holy, make this better. + """ + cdef DTYPE_t MAX_REF = 9999.0 + + self.ndims = self.data_arr.shape[1] + + # print(f"self.dist_metric.__class__.__name__: {self.dist_metric.__class__.__name__}") + + if self.dist_metric.__class__.__name__ == 'GeodesicDistance' or \ + self.dist_metric.__class__.__name__ == 'HaversineDistance': + refs = [[90, 0], [38.26, -85.76], [-19.22, 159.93]] + else: + refs = [[0]*i+[MAX_REF]*(self.ndims-i) for i in range(self.ndims)] + return np.asarray(refs) + + + cdef _build_r_indexes(self): + """ + Build a 2D array + """ + self.r_indexes = np.zeros((self.distances.shape[1], self.distances.shape[0]), dtype=ITYPE, order='C') + self.r_distances = np.zeros((self.distances.shape[1], self.distances.shape[0]), dtype=DTYPE, order='C') + + for i in range(self.distances.shape[1]): + self.r_indexes[i,] = self.idx_array + self.r_distances[i,] = self.distances[:,i] + if i > 0: # At this moment, the first column is already sorted + sortorder = np.argsort(self.r_distances[i,]) + self.r_indexes[i,] = self.r_indexes[i,sortorder] + self.r_distances[i,] = self.r_distances[i, sortorder] + + return 0 + + def query(self, X, k=5, + return_distance=True, + sort_results=True): + """ + query the index for k nearest neighbors + """ + + if isinstance(X, list): + # print("check 1") + results = self._query_one(np.asarray(X), k, return_distance) + elif X.shape[0] == 1: + # print("check 2") + results=self._query_one(X, k, return_distance) + else: + raise NotImplementedError("Not yet") + # [print(x) for x in X] + # results = [self._query_one(np.asarray([x]), k=k, + # return_distance=return_distance, + # sort_results=sort_results) \ + # for x in X] + return results + + cdef _query_one(self, X, k=5, + return_distance=True): + """ + query the index for k nearest neighbors for a single point X + """ + if X.shape[0] != 1: + raise ValueError("_query takes only a single X point" + "use query for multiple records") + + if X.shape[X.ndim - 1] != self.data.shape[1]: + # print(f"X is : {X}") + raise ValueError("query data dimension must " + "match training data dimension") + + if self.data.shape[0] < k: + raise ValueError("k must be less than or equal " + "to the number of training points") + + # print(f"Check: X is {X}") + # Can probably improve this - using a heap that allows more than 1 value + # but we're always only using one here (X.shape[0] must be 1 from above) + cdef NeighborsHeap heap = NeighborsHeap(X.shape[0], k) + + # print(f"Check - Built NeighborsHeap") + + # Establish the distances from the query point to the reference points + cdef np.ndarray[DTYPE_t, ndim=2] q_dists + q_dists = self.dist_metric.pairwise(X, self.ref_points_arr) + + # print(f"Check - built q_dists") + + # cdef DTYPE_t[:, ::1] Xarr + # Xarr = get_memview_DTYPE_2D(np.asarray(X)) + # Xarr = np.asarray(X, dtype=DTYPE, order='C') + cdef int best_idx, low_idx, low_idx_possible, high_idx, high_idx_possible + cdef int test_idx + cdef DTYPE_t best_dist, test_dist + + best_idx = _find_nearest_sorted_2D(self.distances, q_dists[0,0]) + best_dist = self.dist_metric.pairwise([self.data_arr[self.idx_array_arr[best_idx],:]], X) + # heap.push(0, best_dist, best_idx) + + + # Populate the heap using 2k elements; k on each side of our first guess: + low_idx = max(best_idx - k, 0) + high_idx = min(best_idx + k, self.distances.shape[0]) + # print("Check - ready for loop") + for i in range(low_idx, high_idx + 1): + # print(f"Check - Loop {i}") + if i < self.distances.shape[0]: + test_dist = 
self.dist_metric.pairwise([self.data_arr[self.idx_array_arr[i],:]], X) + heap.push(0, test_dist, self.idx_array_arr[i]) + + # print("Check - out of loop") + # Establish bounds between which to search + if heap.largest(0) == np.inf: # This was != before... bug? + low_idx_possible = 0 + high_idx_possible = self.data_arr.shape[0] + else: + low_idx_possible = _find_nearest_sorted_2D(self.distances, q_dists[0, 0] - heap.largest(0)) + high_idx_possible = _find_nearest_sorted_2D(self.distances, q_dists[0, 0] + heap.largest(0)) + + # Consider adding chunking here + # So we test more than one new point at a time + test_idx = best_idx + + # print(f"Check - low_idx_possible: {low_idx_possible}, high_idx_possible: {high_idx_possible}") + + while True: + # print(f"Check - Entering while loop with low_idx: {low_idx}, high_idx: {high_idx}") + if low_idx <= low_idx_possible and high_idx >= high_idx_possible: + break + + # Determine whether the next high or low point is a better test: + lowdelta = fabs(self.distances[low_idx, 0] - q_dists[0, 0]) + highdelta = fabs(self.distances[high_idx, 0] - q_dists[0, 0]) + if lowdelta <= highdelta and low_idx > low_idx_possible: + # print("check a") + test_idx = low_idx + low_idx = low_idx - 1 + elif high_idx < high_idx_possible: + # print("check b") + test_idx = high_idx + high_idx = high_idx + 1 + elif low_idx > low_idx_possible: + # print("check c") + test_idx = low_idx + low_idx = low_idx - 1 + else: + print("why?") + break + + # print(f"testing point at index {test_idx}: data[{self.idx_array[test_idx]}] {np.asarray(self.data[self.idx_array[test_idx],:])}") + # Check that all pre-calculated distances are better than best + sufficient = True + + # print("Check - checking sufficient") + for d, q in zip(self.distances[test_idx,1:], q_dists[0,1:]): + # print(f"testing that: {abs(d-q)} < {heap.largest(0)}") + if abs(d-q) > heap.largest(0): + sufficient = False + break + + if sufficient: + test_dist = self.dist_metric.pairwise([self.data_arr[self.idx_array[test_idx],:]], X) + # print(f"{test_idx} is sufficient... test_dist is: {test_dist}") + if test_dist < heap.largest(0): + heap.push(0, test_dist, self.idx_array[test_idx]) + # print(f"pushing idx: {best_idx} ({self.idx_array[best_idx]}) with dist: {test_dist}") + best_idx = test_idx + low_idx_possible = _find_nearest_sorted_2D(self.distances, q_dists[0, 0] - heap.largest(0)) + high_idx_possible = _find_nearest_sorted_2D(self.distances, q_dists[0, 0] + heap.largest(0)) + + continue + # print("Check - out of while") + # return (self.idx_array[best_idx], best_dist) + return heap.get_arrays() + + cdef _get_start_dist(self, X, k): + """ + For the expanding radius approach, we must make an initial guess + for a radius that may contain exactly k members. + We do this by selecting the R0 reference distances, and simply + expanding to k items, and returning the distance which separates + the k points along that sole axis + """ + cdef np.ndarray q_dists + cdef ITYPE_t low_idx, high_idx + + q_dists = self.dist_metric.pairwise(X, self.ref_points_arr) + + ground_idx = _find_nearest_sorted_2D(self.distances, q_dists[0, 0]) + # ground_idx = np.searchsorted(self.distances, q_dists[0, 0], side="left") + low_idx = max(ground_idx - k, 0) + high_idx = min(ground_idx + k, self.data_arr.shape[0]) + + # Values were insanely low - like 1e-07. For now, forcing min of 0.5. Consider improvements here. 
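+        # Worked example (illustrative numbers): with k = 5 and sorted R0
+        # distances [..., 9.8, 9.9, 10.0, 10.1, ...] around the query's own
+        # R0 distance, the span distances[high_idx, 0] - distances[low_idx, 0]
+        # covers roughly 2k points along that one axis, so a ball of that
+        # radius plausibly holds about k points; the 0.5 floor below guards
+        # against the degenerate near-zero spans noted above.  (Also note
+        # high_idx can equal shape[0] here, one past the last valid row.)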
+        # print(f"""get_start_dist: ({low_idx}, {high_idx}), {self.distances[high_idx, 0]}, {self.distances[low_idx, 0]}
+        #       {self.distances[high_idx, 0] - self.distances[low_idx, 0]}""")
+
+        # floor the starting radius at 0.5, per the note above
+        return max(0.5, self.distances[high_idx, 0] - self.distances[low_idx, 0])
+
+    def query_expand(self, X, k, return_distance=True, sort_results=True,
+                     mfudge=5.0, miter=20, sscale=2.0):
+        """
+        Query the index for the k nearest neighbors of a single query
+        point X, e.g. one coordinate list such as [50, 25, 100].
+        """
+        cdef np.ndarray idx_results
+        # cdef DTYPE_t[::1] dist_results
+        cdef np.ndarray dist_results
+
+        if isinstance(X, list):
+            X = np.asarray(X)
+
+        cdef NeighborsHeap heap = NeighborsHeap(X.shape[0], k)
+
+        if X.shape[0] == 1:
+            idx_results = np.asarray(self._query_expand(X, k, mfudge, miter, sscale))
+        else:
+            raise NotImplementedError("Multi-point queries are not implemented yet")
+            # [print(x) for x in X]
+            # all_results = [self._query_expand(x, k) for x in X]
+            # print(f"all_results: {all_results}")
+
+        if return_distance:
+            dist_results = self.dist_metric.pairwise(self.data_arr[idx_results], X)
+            # best = np.argsort(dist_results)
+            # dist_results = dist_results[best]
+            # idx_results = idx_results[best]
+            for idx, dist in zip(idx_results, dist_results):
+                heap.push(0, dist, idx)
+
+            return heap.get_arrays()
+
+        return idx_results
+
+    cdef ITYPE_t _count_in_range(self, DTYPE_t* dists, r, size):
+        cdef ITYPE_t count = 0, i
+
+        for i in range(size):
+            if dists[i] <= r:
+                count += 1
+
+        return count
+
+    cdef _query_expand(self, X, k, mfudge, miter, sscale):
+        """
+        Return the k-nn by the expanding-radius method: pick a starting
+        radius, then iterate, expanding and contracting the radius until
+        the candidate count is acceptably close to k.
+        """
+
+        if X.shape[0] != 1:
+            raise ValueError("_query takes only a single X point; "
+                             "use query for multiple records")
+
+        if X.shape[X.ndim - 1] != self.data.shape[1]:
+            raise ValueError("query data dimension must "
+                             "match training data dimension")
+
+        cdef DTYPE_t radius, too_high, too_low, new_radius, STD_SCALE, MAX_FUDGE, ratio
+        cdef ITYPE_t approx_count, i, MAX_ITER
+        cdef ITYPE_t[::1] approx_array
+
+        MAX_ITER = miter
+        MAX_FUDGE = mfudge
+        STD_SCALE = sscale
+
+        radius = self._get_start_dist(X, k)
+
+        too_low = 0
+        too_high = np.inf
+        # approx_array = self._query_radius_at_least_k(X, radius, k)
+        approx_array = self._query_radius_r0_only(X, radius)
+
+        approx_count = approx_array.shape[0]
+
+        for i in range(MAX_ITER):
+            if approx_count == k:
+                break
+            elif approx_count < k:
+                too_low = radius
+                new_radius = radius * STD_SCALE
+                # new_radius = radius * k / approx_count
+                if new_radius > too_high:
+                    new_radius = (too_high + too_low) / 2
+                radius = new_radius
+                # print(f"too small... new radius: {radius}")
+                # approx_array = self._query_radius_at_least_k(X, radius, k)
+                approx_array = self._query_radius_r0_only(X, radius)
+                approx_count = approx_array.shape[0]
+                # print(f"approx_count: {approx_count}")
+                continue
+            elif approx_count > k * MAX_FUDGE:
+                too_high = radius
+                radius = (radius + too_low) / STD_SCALE
+                # print(f"too big... new radius: {radius}")
+                # approx_array = self._query_radius_at_least_k(X, radius, k)
+                approx_array = self._query_radius_r0_only(X, radius)
+                approx_count = approx_array.shape[0]
+            else:
+                # k < approx_count <= k * MAX_FUDGE: within the fudge
+                # factor, good enough
+                break
+
+        return approx_array
+
+
+    def query_expand_2(self, X, k, return_distance=True, sort_results=True,
+                       mfudge=5.0, miter=20, sscale=2.0):
+        """
+        Query the index for the k nearest neighbors of a single query
+        point X, e.g. one coordinate list such as [50, 25, 100].
+        """
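+        # Strategy note: query_expand re-runs an R0-axis radius query with a
+        # grown/shrunk radius on every iteration, whereas this variant walks
+        # outward along the R0-sorted order in CHUNK_SIZE blocks, pushing
+        # exact distances into a k-item max-heap and stopping once the R0
+        # gap alone proves that no closer point can remain.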
+        cdef np.ndarray idx_results
+        # cdef DTYPE_t[::1] dist_results
+        cdef np.ndarray dist_results
+
+        # print(f"taa daa: {X}")
+
+        if isinstance(X, list):
+            X = np.asarray(X)
+
+        cdef NeighborsHeap heap = NeighborsHeap(X.shape[0], k)
+
+        if X.shape[0] == 1:
+            heap = self._query_expand_2(X, k, mfudge, miter, sscale)
+        else:
+            raise NotImplementedError("Multi-point queries are not implemented yet")
+
+        if return_distance:
+            return heap.get_arrays()
+
+        return heap.get_arrays()  # Fix this
+
+    cdef _query_expand_2(self, X, k, mfudge, miter, sscale):
+        """
+        Return the k-nn by the expanding method; in this variant we expand
+        the window over the sorted reference distances (self.distances)
+        rather than over exact distances.
+        """
+
+        if X.shape[0] != 1:
+            raise ValueError("_query takes only a single X point; "
+                             "use query for multiple records")
+
+        if X.shape[X.ndim - 1] != self.data.shape[1]:
+            raise ValueError("query data dimension must "
+                             "match training data dimension")
+
+        cdef DTYPE_t STD_SCALE, MAX_FUDGE, ratio, radius, dist
+        cdef ITYPE_t approx_count, i, MAX_ITER, low_count, low_idx, high_idx, ground_idx, CHUNK_SIZE
+        cdef ITYPE_t new_high, new_low, idx
+        cdef ITYPE_t[::1] approx_array
+        cdef NeighborsHeap heap = NeighborsHeap(X.shape[0], k)
+
+        cdef np.ndarray q_dists, temp_data
+        cdef DTYPE_t[::1] new_dists, approx_dists, low_dists, high_dists
+
+        MAX_ITER = miter
+        MAX_FUDGE = mfudge
+        STD_SCALE = sscale
+        CHUNK_SIZE = max(k, 500)  # Tune?
+
+        q_dists = self.dist_metric.pairwise(X, self.ref_points_arr)
+        ground_idx = _find_nearest_sorted_2D(self.distances, q_dists[0, 0])
+        low_idx = max(ground_idx - k, 0)
+        high_idx = min(ground_idx + k, self.data_arr.shape[0])
+
+        approx_array = self.idx_array[low_idx:high_idx+1]
+        # print(f"k: {k}, ground_idx: {ground_idx}, low_idx: {low_idx}, high_idx: {high_idx}")
+        # print(f"getting dists for approx_array: {approx_array} with shape: {approx_array.shape}")
+        approx_dists = self.dist_metric.pairwise(self.data_arr[approx_array], X).reshape(approx_array.shape[0])
+
+        for idx, dist in zip(approx_array, approx_dists):
+            if dist < heap.largest(0):
+                heap.push(0, dist, idx)
+
+        lowdelta = fabs(self.distances[low_idx, 0] - q_dists[0, 0])
+        highdelta = fabs(self.distances[high_idx, 0] - q_dists[0, 0])
+
+        # While the "not closer than" bounds are still closer than the
+        # current k-th best solution, keep expanding the search window...
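+        # Concretely (illustrative numbers): if the query's R0 distance is
+        # 40.0 and the current k-th best exact distance is 6.0, no point
+        # whose R0 distance falls outside [34.0, 46.0] can beat the heap,
+        # because |d(p, r0) - d(q, r0)| <= d(p, q); lowdelta and highdelta
+        # track exactly those edge gaps.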
+ count = 0 + while (lowdelta < heap.largest(0) and low_idx > 0) \ + or (highdelta < heap.largest(0) and high_idx < self.data_arr.shape[0]): + count += 1 + # print(count, lowdelta, highdelta, heap.largest(0), low_idx, high_idx) + if lowdelta < heap.largest(0): + new_low = max(low_idx - CHUNK_SIZE, 0) + if new_low < low_idx: + low_dists = self.dist_metric.pairwise(self.data_arr[self.idx_array[new_low:low_idx]], X)[0] #.reshape(low_idx - new_low) + for idx in range(low_idx - new_low): + if low_dists[idx] < heap.largest(0): + heap.push(0, low_dists[idx], self.idx_array[idx + new_low]) + low_idx = new_low + + if highdelta < heap.largest(0): + new_high = min(high_idx + CHUNK_SIZE, self.data_arr.shape[0]) + if new_high > high_idx: + # print(f"new_high: {new_high}, high_idx: {high_idx}, self.data_arr.shape[1]") + high_dists = self.dist_metric.pairwise(self.data_arr[self.idx_array[high_idx:new_high]], X)[0] #.reshape(new_high - high_idx) + # testing approaches for performance: + for idx in range(new_high - high_idx): + if high_dists[idx] < heap.largest(0): + heap.push(0, high_dists[idx], self.idx_array[idx + high_idx]) + # for idx, dist in zip(self.idx_array[high_idx:new_high], high_dists): + # if dist < heap.largest(0): + # heap.push(0, dist, idx) + high_idx = new_high + + lowdelta = fabs(self.distances[low_idx, 0] - q_dists[0, 0]) + highdelta = fabs(self.distances[high_idx, 0] - q_dists[0, 0]) + + return heap + + + def query_radius(self, X, r, + int return_distance=False, + int count_only=False, + int sort_results=False): + """ + query the index for neighbors within a radius r + """ + if count_only and return_distance: + raise ValueError("count_only and return_distance " + "cannot both be true") + + if sort_results and not return_distance: + raise ValueError("return_distance must be True " + "if sort_results is True") + + # cdef ITYPE_t[::1] idx_within_r = np.asarray([1]) + cdef np.ndarray idx_within_r = np.asarray([1]) + cdef np.ndarray dists_within_r + + if isinstance(X, list): + X = np.asarray(X) + + if X.shape[0] == 1: + idx_within_r = self._query_radius_approx(X, r) + else: + idx_within_r = [self._query_radius_approx(x, r) for x in X] + + dists_within_r = self.dist_metric.pairwise(self.data_arr[idx_within_r], X).reshape(idx_within_r.shape[0]) + valid_idx = [x <= r for x in dists_within_r] + dists_within_r = dists_within_r[valid_idx] + idx_within_r = idx_within_r[valid_idx] + + if sort_results: + sort_order = np.argsort(dists_within_r) + dists_within_r = dists_within_r[sort_order] + idx_within_r = idx_within_r[sort_order] + + if return_distance: + return idx_within_r, dists_within_r + + return idx_within_r + + + def query_radius_approx(self, X, r): + """ + query the index for neighbors within a radius r + return approximate results only (by relying on index distances) + """ + result = self._query_radius_approx(X, r) + return result + + cdef _query_radius_approx(self, X, r): + """ + + """ + + # cdef np.ndarray points_in_range = np.arange(self.r_indexes_mv.shape[0]).reshape(self.r_indexes_mv.shape[0]) + cdef np.ndarray q_dists + # cdef ITYPE_t[::1] common_idx + cdef np.ndarray common_idx + cdef np.ndarray new_points + + point_counts = Counter() + + if isinstance(X, list) or X.ndim == 1: + X = np.asarray([X]) + + points_in_range = [] + q_dists = self.dist_metric.pairwise(X, self.ref_points_arr) + # print(f"q_dists: {q_dists[0]} - {q_dists[0, 1]}") + # print(f"r.indexes.shape: {self.r_indexes_mv.shape}") + + for i in range(self.ndims): + # low_idx = _find_nearest_sorted_1D(self.r_distances_mv[i, 
:], q_dists[0, i] - r) + low_idx = np.searchsorted(self.r_distances_mv[i, :], q_dists[0, i] - r, side="left") + # high_idx = _find_nearest_sorted_1D(self.r_distances_mv[i, :], q_dists[0, i] + r) + high_idx = np.searchsorted(self.r_distances_mv[i, :], q_dists[0, i] + r, side="right") + + # print(f"low_idx: {low_idx}, high_idx: {high_idx}") + # print(f"valid IDS: {self.r_indexes[i, low_idx:high_idx]}") + new_points = self.r_indexes[i, low_idx:high_idx] + # print(f"new_points: {new_points} len: {len(new_points)}") + # point_counts.update(new_points) # Failed performance improvement test; leaving for posterity + points_in_range.append(new_points) + + # THIS IS THE PERFORMANCE PROBLEM: + common_idx = reduce(np.intersect1d, points_in_range) # This works but seems slow (fastest so far tho) + # common_idx = np.asarray([k for (k, v) in point_counts.items() if v == self.r_indexes_mv.shape[0]]) + + # print(f"commonIDs: {common_idx}") + return common_idx + + cdef _query_radius_at_least_k(self, X, r, k): + """ for use with expanding estimates... + break early if we've eliminated too many points + """ + + cdef np.ndarray q_dists + cdef np.ndarray common_idx + + if isinstance(X, list) or X.ndim == 1: + X = np.asarray([X]) + + points_in_range = [] + new_points = [] + + q_dists = self.dist_metric.pairwise(X, self.ref_points_arr) + + for i in range(self.r_indexes_mv.shape[0]): + low_idx = np.searchsorted(self.r_distances_mv[i, :], q_dists[0, i] - r, side="left") + high_idx = np.searchsorted(self.r_distances_mv[i, :], q_dists[0, i] + r, side="right") + + new_points = self.r_indexes[i, low_idx:high_idx] + + if points_in_range == []: + points_in_range = new_points + else: + # THIS IS THE PERFORMANCE PROBLEM: + points_in_range = np.intersect1d(points_in_range, new_points) # This is the line that works, but slowly + # At Least k voodoo: + if len(points_in_range) < k: + break + + common_idx = np.asarray(points_in_range) + return common_idx + + + def query_radius_t3(self, X, r): + """ + query the index for neighbors within a radius r + return approximate results only (by relying on index distances) + """ + # result = np.asarray(self._query_radius_r0_only(X, r)) + result = self._query_radius_r0_only(X, r) + + return result + + cdef _query_radius_r0_only(self, X, r): + """ + This approach uses only the original sort order from Ref Point 0 + and then brute forces whatever that index indicates MAY be in range + """ + + cdef DTYPE_t[:, ::1] q_dists # , dist_results + cdef np.ndarray[DTYPE_t, ndim=1] dist_results + # cdef DTYPE_t[::1] dist_results + cdef ITYPE_t[::1] approx_idxs, results + # cdef np.ndarray[DTYPE_t, ndim=2] q_dists, dist_results + # cdef np.ndarray[ITYPE_t] approx_idxs #, points_in_range + cdef ITYPE_t i, low_idx, high_idx, count + + if isinstance(X, list) or X.ndim == 1: + X = np.asarray([X]) + + # points_in_range = [] + + q_dists = self.dist_metric.pairwise(X, self.ref_points_arr) + # ground_idx = _find_nearest_sorted_2D(self.distances, q_dists[0, 0]) + + # Which of the following two pairs are reliably faster? 
Probably neither: + # low_idx = np.searchsorted(self.r_distances_mv[0,:], q_dists[0, 0] - r, side="left") + # high_idx = np.searchsorted(self.r_distances_mv[0,:], q_dists[0, 0] + r, side="right") + + low_idx = np.searchsorted(self.distances[:,0], q_dists[0, 0] - r, side="left") + high_idx = np.searchsorted(self.distances[:,0], q_dists[0, 0] + r, side="right") + + # print(f"self.distances.shape: {self.distances.shape}, low_idx: {low_idx}, high_idx: {high_idx}") + # print(f"self.distances[low_idx, :]: {np.asarray(self.distances[low_idx,:])}") + # print(f"self.distances[low_idx+1, :]: {np.asarray(self.distances[low_idx+1,:])}") + # print(f"self.distances[low_idx+2, :]: {np.asarray(self.distances[low_idx+2,:])}") + + # print(f"q_dists[0, :]: {q_dists[0, :]}") + # print(f"self.distances - qd: {abs(np.asarray(self.distances[low_idx,:] - q_dists[0, :]))} vs r: {r}") + + # The next two lines work but are 10 times slower than just brute alone (as-is) + # approx_idxs = np.asarray([i for i in range(low_idx, high_idx+1) \ + # if np.all(abs(np.asarray(self.distances[i,:]) - q_dists[0, :]) <= r)]) + # approx_idxs = [self.idx_array[i] for i in approx_idxs] + approx_idxs = self.idx_array[low_idx:high_idx+1] + # print(f"approx_idxs (len {len(approx_idxs)}): {np.array(approx_idxs)}") + + dist_results = self.dist_metric.pairwise(self.data_arr[approx_idxs], X).reshape(approx_idxs.shape[0]) + # print(f"len(dist_results): {len(dist_results)} or {dist_results.shape[0]} low_idx:high_idx {low_idx}:{high_idx}") + # dist_results = dist_results.reshape(dist_results.shape[0]) + # print(f"dist_results: {np.asarray(dist_results)}") + + # cdef ITYPE_t result_count + # result_count = np.count_nonzero(dist_results <= r) + # print(f"dist_results <= r: {dist_results <= r}") + # print(f"result_count: {result_count}") + + # result_arr = np.zeros(result_count, dtype=ITYPE) + result_arr = np.zeros(len(dist_results), dtype=ITYPE) + # results = get_memview_ITYPE_1D(result_arr) # If you replace result_arr with results from here on you get nonsense. I should understand, but do not. + + count = 0 + for i in range(len(dist_results)): + # print(f"{i} (count: {count}) testing point {approx_idxs[i]} with dist: {dist_results[i]}") + if dist_results[i] <= r: + result_arr[count] = approx_idxs[i] + count += 1 + + + # NOW WHAT? + # results_arr = approx_idxs[dist_results <= r] + + # Why is this slow: ? + # results = approx_idxs[[range(8000)]] + + # This alone is slow... why? 
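+        # A vectorized mask is probably the idiomatic fix (untested sketch):
+        #     mask = np.asarray(dist_results) <= r
+        #     return np.asarray(approx_idxs)[mask]
+        # which would replace the explicit counting loop above entirely.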
+        # cdef np.array(dtype=bool) things  # Not sure how to declare this
+        # cdef ITYPE_t[::1] things
+        # things = [x <= r for x in dist_results]
+
+        # for i in range(len(dist_results)):
+        #     if things[i]:
+        #         results.append(approx_idxs[i])
+
+        # This is slow but works:
+        # results = approx_idxs[[x[0] <= r for x in dist_results]]
+
+        # This is even slower but works:
+        # results = np.asarray([approx_idxs[i] for i in range(len(approx_idxs)) if dist_results[i] <= r])
+
+        # print(f"results: {results}")
+        # return results
+        # return np.sort(np.asarray(results[0:count]))
+        return np.asarray(result_arr[0:count])
+
+
+    cdef _query_radius_best_r_only(self, X, r):
+        """
+        This approach counts the candidates using each reference point by
+        itself, then brute-forces the indexes from the best possible result.
+
+        NOT FINISHED
+        """
+        pass
\ No newline at end of file
diff --git a/sklearn/neighbors/_unsupervised.py b/sklearn/neighbors/_unsupervised.py
index 822a30f503bd2..1b57da81dacb2 100644
--- a/sklearn/neighbors/_unsupervised.py
+++ b/sklearn/neighbors/_unsupervised.py
@@ -28,6 +28,7 @@ class NearestNeighbors(KNeighborsMixin,
 
         - 'ball_tree' will use :class:`BallTree`
         - 'kd_tree' will use :class:`KDTree`
+        - 'trilateration' will use :class:`TrilaterationIndex`
         - 'brute' will use a brute-force search.
         - 'auto' will attempt to decide the most appropriate algorithm
           based on the values passed to :meth:`fit` method.
diff --git a/sklearn/neighbors/setup.py b/sklearn/neighbors/setup.py
index 9264044678193..a8b6d98cf1ec4 100644
--- a/sklearn/neighbors/setup.py
+++ b/sklearn/neighbors/setup.py
@@ -20,6 +20,11 @@ def configuration(parent_package='', top_path=None):
                          include_dirs=[numpy.get_include()],
                          libraries=libraries)
 
+    config.add_extension('_trilateration',
+                         sources=['_trilateration.pyx'],
+                         include_dirs=[numpy.get_include()],
+                         libraries=libraries)
+
     config.add_extension('_dist_metrics',
                          sources=['_dist_metrics.pyx'],
                          include_dirs=[numpy.get_include(),
@@ -31,6 +36,7 @@ def configuration(parent_package='', top_path=None):
                          sources=['_typedefs.pyx'],
                          include_dirs=[numpy.get_include()],
                          libraries=libraries)
+
     config.add_extension("_quad_tree",
                          sources=["_quad_tree.pyx"],
                          include_dirs=[numpy.get_include()],
diff --git a/trilat_test_10.py b/trilat_test_10.py
new file mode 100644
index 0000000000000..c69c7740f90f5
--- /dev/null
+++ b/trilat_test_10.py
@@ -0,0 +1,90 @@
+# File to debug
+# This file should never be published...
+# Purge it from git before merging + +import csv +import sklearn +import numpy as np + +from scipy.spatial.distance import cdist + +# samples = [[0., 0., 0.], [0., .5, 0.], [1., 1., .5]] +querypoint = [[50, 65]] + +samples = [] +refpoints = [] + +with open('/home/chipmonkey/repos/TrilaterationIndex/data/point_sample_10.csv') as csvfile: + reader = csv.DictReader(csvfile, quoting=csv.QUOTE_NONNUMERIC) + for row in reader: + print(row) + samples.append([row['x'], row['y']]) + +with open('/home/chipmonkey/repos/TrilaterationIndex/data/sample_ref_points.csv') as csvfile: + reader = csv.DictReader(csvfile, quoting=csv.QUOTE_NONNUMERIC) + for row in reader: + print(row) + refpoints.append([row['x'], row['y']]) + +print(refpoints) + +print(samples) +print("Actual distances:") +print(cdist(samples, querypoint)) + +from sklearn.neighbors import NearestNeighbors, KDTree +from sklearn.neighbors import TrilaterationIndex + +tree = KDTree(samples) +tq = tree.query(querypoint, k=2) +print("tree query results:") +print(tq) + +neigh = NearestNeighbors(radius=10) +neigh.fit(samples) + +print("finding neighbors within radius:") +rng = neigh.radius_neighbors([[44., 44.]]) +print("distances:") +print(np.asarray(rng[0][0])) +print("indexes:") +print(list(np.asarray(rng[1][0]))) +print("points:") +things = [samples[x] for x in list(np.asarray(rng[1][0]))] +print(things) +# [samples[x] for x in list(np.asarray(rng[1][0]))] +# print(samples[list(np.asarray(rng[1][0]))]) + +print(type(tree)) +print(type(tree.data)) +# dist, ind = tree.query +print(np.asarray(tree.node_data)) # Ohhh https://stackoverflow.com/questions/20978377/cython-convert-memory-view-to-numpy-array +print(tree.node_data[0]) + +ti = TrilaterationIndex(samples, refpoints) + + + +print(np.asarray(ti.ref_points)) +print(np.asarray(ti.idx_array)) +print("distances:") +print(np.asarray(ti.distances)) + +print(ti.ref_points.shape[0]) +print(ti.ref_points.shape[1]) + +result1 = ti.query(X=np.asarray(querypoint), k=1) +print(result1) + + + +t2 = TrilaterationIndex(samples) +result2 = t2.query(X=np.asarray(querypoint), k=1) +print(result2) +print("This should be 14.30117 something") + + +t2 = TrilaterationIndex(samples) +result2 = t2.query(X=np.asarray(querypoint), k=3) +print(result2) +print("This should be 14.30117 something") \ No newline at end of file
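A minimal end-to-end sketch of the new surface area (not part of the patch itself; assumes the extensions build, e.g. via the new `install` target, and that geopy is available for the geodesic metric):

    import numpy as np
    from sklearn.metrics.pairwise import geodesic_distances
    from sklearn.neighbors import TrilaterationIndex

    # geodesic distances take (latitude, longitude) pairs in degrees
    bsas = [-34.83333, -58.5166646]
    paris = [49.0083899664, 2.53844117956]
    print(geodesic_distances([bsas, paris]))  # ~11099.54 km off-diagonal

    # reference anchors are generated automatically when not supplied
    rng = np.random.default_rng(0)
    samples = rng.uniform(0, 100, size=(1000, 2))
    ti = TrilaterationIndex(samples)

    query = np.asarray([[50., 65.]])
    dists, idxs = ti.query(query, k=3)    # heap arrays: distances first
    idx_in_r = ti.query_radius(query, 20) # indexes within radius 20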