From f0fac2aa80da7c739b88043571e5d49ba40f9413 Mon Sep 17 00:00:00 2001 From: MechCoder <manojkumarsivaraj334@gmail.com> Date: Fri, 3 Jul 2015 15:49:32 -0700 Subject: [PATCH] [SPARK-7401] [MLLIB] [PYSPARK] Vectorize dot product and sq_dist between SparseVector and DenseVector Currently we iterate over indices which can be vectorized. Author: MechCoder <manojkumarsivaraj334@gmail.com> Closes #5946 from MechCoder/spark-7203 and squashes the following commits: 034d086 [MechCoder] Vectorize dot calculation for numpy arrays for ndim=2 bce2b07 [MechCoder] fix doctest fcad0a3 [MechCoder] Remove type checks for list, pyarray etc 0ee5dd4 [MechCoder] Add tests and other isinstance changes e5f1de0 [MechCoder] [SPARK-7401] Vectorize dot product and sq_dist --- python/pyspark/mllib/linalg.py | 44 ++++++++++++++++------------------ python/pyspark/mllib/tests.py | 8 +++++++ 2 files changed, 29 insertions(+), 23 deletions(-) diff --git a/python/pyspark/mllib/linalg.py b/python/pyspark/mllib/linalg.py index e96c5ef87d..9959a01cce 100644 --- a/python/pyspark/mllib/linalg.py +++ b/python/pyspark/mllib/linalg.py @@ -577,22 +577,19 @@ class SparseVector(Vector): ... AssertionError: dimension mismatch """ - if type(other) == np.ndarray: - if other.ndim == 2: - results = [self.dot(other[:, i]) for i in xrange(other.shape[1])] - return np.array(results) - elif other.ndim > 2: + + if isinstance(other, np.ndarray): + if other.ndim not in [2, 1]: raise ValueError("Cannot call dot with %d-dimensional array" % other.ndim) + assert len(self) == other.shape[0], "dimension mismatch" + return np.dot(self.values, other[self.indices]) assert len(self) == _vector_size(other), "dimension mismatch" - if type(other) in (np.ndarray, array.array, DenseVector): - result = 0.0 - for i in xrange(len(self.indices)): - result += self.values[i] * other[self.indices[i]] - return result + if isinstance(other, DenseVector): + return np.dot(other.array[self.indices], self.values) - elif type(other) is SparseVector: + elif isinstance(other, SparseVector): result = 0.0 i, j = 0, 0 while i < len(self.indices) and j < len(other.indices): @@ -635,22 +632,23 @@ class SparseVector(Vector): AssertionError: dimension mismatch """ assert len(self) == _vector_size(other), "dimension mismatch" - if type(other) in (list, array.array, DenseVector, np.array, np.ndarray): - if type(other) is np.array and other.ndim != 1: + + if isinstance(other, np.ndarray) or isinstance(other, DenseVector): + if isinstance(other, np.ndarray) and other.ndim != 1: raise Exception("Cannot call squared_distance with %d-dimensional array" % other.ndim) - result = 0.0 - j = 0 # index into our own array - for i in xrange(len(other)): - if j < len(self.indices) and self.indices[j] == i: - diff = self.values[j] - other[i] - result += diff * diff - j += 1 - else: - result += other[i] * other[i] + if isinstance(other, DenseVector): + other = other.array + sparse_ind = np.zeros(other.size, dtype=bool) + sparse_ind[self.indices] = True + dist = other[sparse_ind] - self.values + result = np.dot(dist, dist) + + other_ind = other[~sparse_ind] + result += np.dot(other_ind, other_ind) return result - elif type(other) is SparseVector: + elif isinstance(other, SparseVector): result = 0.0 i, j = 0, 0 while i < len(self.indices) and j < len(other.indices): diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py index 49ce125de7..d9f9874d50 100644 --- a/python/pyspark/mllib/tests.py +++ b/python/pyspark/mllib/tests.py @@ -129,17 +129,22 @@ class VectorTests(MLlibTestCase): [1., 2., 3., 4.], [1., 2., 3., 4.], [1., 2., 3., 4.]]) + arr = pyarray.array('d', [0, 1, 2, 3]) self.assertEquals(10.0, sv.dot(dv)) self.assertTrue(array_equal(array([3., 6., 9., 12.]), sv.dot(mat))) self.assertEquals(30.0, dv.dot(dv)) self.assertTrue(array_equal(array([10., 20., 30., 40.]), dv.dot(mat))) self.assertEquals(30.0, lst.dot(dv)) self.assertTrue(array_equal(array([10., 20., 30., 40.]), lst.dot(mat))) + self.assertEquals(7.0, sv.dot(arr)) def test_squared_distance(self): sv = SparseVector(4, {1: 1, 3: 2}) dv = DenseVector(array([1., 2., 3., 4.])) lst = DenseVector([4, 3, 2, 1]) + lst1 = [4, 3, 2, 1] + arr = pyarray.array('d', [0, 2, 1, 3]) + narr = array([0, 2, 1, 3]) self.assertEquals(15.0, _squared_distance(sv, dv)) self.assertEquals(25.0, _squared_distance(sv, lst)) self.assertEquals(20.0, _squared_distance(dv, lst)) @@ -149,6 +154,9 @@ class VectorTests(MLlibTestCase): self.assertEquals(0.0, _squared_distance(sv, sv)) self.assertEquals(0.0, _squared_distance(dv, dv)) self.assertEquals(0.0, _squared_distance(lst, lst)) + self.assertEquals(25.0, _squared_distance(sv, lst1)) + self.assertEquals(3.0, _squared_distance(sv, arr)) + self.assertEquals(3.0, _squared_distance(sv, narr)) def test_conversion(self): # numpy arrays should be automatically upcast to float64 -- GitLab