Pickling error - Cython with PySpark: scikit-learn KNN with a user-defined metric on a large dataset

Date: 2018-04-10 23:15:52

Tags: apache-spark scikit-learn pyspark cython knn

I want to use Cython with PySpark to speed up scikit-learn's KNN with a user-defined metric, on a large dataset of 400,000 rows and 65 columns. I have followed the instructions here and here. I am using Spark 1.6.0 and Python 2.7.13.

I wrote the following code for a small sample dataset, but I get the pickling error below:

Traceback (most recent call last):
File "/farzanadata/main.py", line 26, in <module>
bc_nbrs = sc.broadcast(nbrs)
File "/opt/cloudera/parcels/CDH-5.7.0-1.cdh5.7.0.p0.45/lib/spark/python/lib/pyspark.zip/pyspark/context.py", line 741, in broadcast
File "/opt/cloudera/parcels/CDH-5.7.0-1.cdh5.7.0.p0.45/lib/spark/python/lib/pyspark.zip/pyspark/broadcast.py", line 70, in __init__
File "/opt/cloudera/parcels/CDH-5.7.0-1.cdh5.7.0.p0.45/lib/spark/python/lib/pyspark.zip/pyspark/broadcast.py", line 78, in dump
cPickle.PicklingError: Can't pickle <type 'function'>: attribute lookup __builtin__.function failed
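
The failure happens when sc.broadcast(nbrs) pickles the NearestNeighbors object: its metric is the wrapped closure returned by spark_cython, and cPickle can only serialize a function by looking it up as a module-level attribute. A minimal sketch reproducing the same failure mode under Python 2 (hypothetical names, not part of the original code):

import pickle

def make_metric():
    def wrapped(x, y):  # nested closure: not importable as a module attribute
        return abs(x - y)
    return wrapped

# raises PicklingError: Can't pickle <type 'function'>:
# attribute lookup __builtin__.function failed
pickle.dumps(make_metric())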

spark_tools.py

def spark_cython(module, method):
    def wrapped(*args, **kwargs):
        global cython_function_
        try:
            # reuse the Cython function if it has already been imported
            return cython_function_(*args, **kwargs)
        except:
            # on first call (e.g. on an executor) compile and import the .pyx module
            import pyximport
            pyximport.install()
            cython_function_ = getattr(__import__(module), method)
        return cython_function_(*args, **kwargs)
    return wrapped

clinical_kernel.pyx

cimport cython
from libc cimport math
cimport numpy as cnp

cnp.import_array()

def mydist(cnp.npy_double[:] x, cnp.npy_double[:] y):
    # features 0-2: (range - |x[k] - y[k]|) / range; features 3-4: 1 if equal, else 0
    cdef double ranges[3]
    cdef int k
    cdef double out = 0, out2 = 0
    ranges[:] = [0.04028, 0.0983, 0.06602]
    for k in range(3):
        out += (ranges[k] - math.fabs(x[k] - y[k])) / ranges[k]
    for k in range(3, 5):
        out2 += x[k] == y[k]
    return (out + out2) / 5
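
Because the .pyx file cimports numpy, pyximport generally needs the NumPy headers on its include path when it compiles the module. A small sketch for compiling and sanity-checking the metric locally (an assumption about how the module is built, not part of the original post):

import numpy as np
import pyximport

# pass the NumPy include directory so `cimport numpy` compiles
pyximport.install(setup_args={"include_dirs": [np.get_include()]})

import clinical_kernel

x = np.array([0.72694, 1.4742, 0.32396, 1.0, 1.0])
y = np.array([0.74173, 1.5257, 0.36116, 0.0, 0.0])
print(clinical_kernel.mydist(x, y))  # distance between the two sample rows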

main.py

from __future__ import print_function
from pyspark import SparkConf, SparkContext
from sklearn.neighbors import NearestNeighbors
import numpy as np
from spark_tools import spark_cython

import pyximport

conf = SparkConf().setAppName('Fibo')
sc = SparkContext(conf=conf)
sc.addFile('file:///farzanadata/clinical_kernel.pyx')
sc.addFile('file:///farzanadata/spark_tools.py')
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
pyximport.install()
import clinical_kernel
df = sc.parallelize([[0.72694, 1.4742, 0.32396, 1, 1],
                     [0.74173, 1.5257, 0.36116, 0, 0],
                     [0.76722, 1.5725, 0.38998, 1, 0],
                     [0.76722, 1.5725, 0.38998, 0, 1]])
X = np.array(df.collect())
mapper = spark_cython('clinical_kernel', 'mydist')
nbrs = NearestNeighbors(n_neighbors=4, metric=mapper)
nbrs.fit(X)
bc_nbrs = sc.broadcast(nbrs)
neighbors = df.map(lambda x: bc_nbrs.value.kneighbors(x, n_neighbors=4, return_distance=False))
neigh_df = neighbors.map(lambda x: x.tolist()).toDF(["neighbors"])
neigh_df.show()

The following code works perfectly instead of broadcasting the KNN tree, but of course that is not ideal for a large dataset.

neighbors=nbrs.kneighbors(X,n_neighbors=4,return_distance=False)

Using import dill as pickle did not help either.

1 Answer:

Answer 0: (score: 0)

Changing spark_tools.py did the trick.

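One way to make such a change (a minimal sketch, assuming the root cause is that the nested wrapped closure returned by spark_cython cannot be pickled when nbrs is broadcast; this is not necessarily the answer's exact code): expose a plain module-level wrapper in spark_tools.py, which pickle can serialize by reference, and import the compiled Cython module lazily inside it so each executor builds clinical_kernel.pyx on first use.

# spark_tools.py -- sketch of a picklable wrapper around the Cython metric
import pyximport
pyximport.install()

def mydist(*args, **kwargs):
    # Module-level functions are pickled by reference (module name + attribute),
    # so a NearestNeighbors object using this as its metric can be broadcast.
    # The Cython module is imported lazily; this assumes clinical_kernel.pyx is
    # reachable on the executors' Python path (e.g. shipped via sc.addFile).
    import clinical_kernel
    return clinical_kernel.mydist(*args, **kwargs)

main.py would then ship spark_tools.py with sc.addPyFile, so executors can import it when unpickling, and pass the wrapper directly as the metric:

sc.addPyFile('file:///farzanadata/spark_tools.py')
from spark_tools import mydist
nbrs = NearestNeighbors(n_neighbors=4, metric=mydist)
bc_nbrs = sc.broadcast(nbrs)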