I want to use Cython and PySpark to speed up sklearn kNN with a user-defined metric for a large dataset of 400,000 rows and 65 columns. I have followed the instructions here and here. I am using Spark version 1.6.0 and Python 2.7.13.
I wrote the following code for a small sample dataset, but I am getting the pickling error below:
Traceback (most recent call last):
  File "/farzanadata/main.py", line 26, in <module>
    bc_nbrs = sc.broadcast(nbrs)
  File "/opt/cloudera/parcels/CDH-5.7.0-1.cdh5.7.0.p0.45/lib/spark/python/lib/pyspark.zip/pyspark/context.py", line 741, in broadcast
  File "/opt/cloudera/parcels/CDH-5.7.0-1.cdh5.7.0.p0.45/lib/spark/python/lib/pyspark.zip/pyspark/broadcast.py", line 70, in __init__
  File "/opt/cloudera/parcels/CDH-5.7.0-1.cdh5.7.0.p0.45/lib/spark/python/lib/pyspark.zip/pyspark/broadcast.py", line 78, in dump
cPickle.PicklingError: Can't pickle <type 'function'>: attribute lookup __builtin__.function failed
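The error seems to come from cPickle only being able to serialize functions that are importable by name at module level; a closure like the wrapped function returned by spark_cython below is not. A minimal reproduction outside Spark (a sketch with illustrative names):

import cPickle

def make_wrapped():
    def wrapped():
        return 42
    return wrapped

# Raises PicklingError: Can't pickle <type 'function'>: attribute lookup
# __builtin__.function failed, because 'wrapped' is defined inside another
# function and cannot be looked up as module.name.
cPickle.dumps(make_wrapped())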
spark_tools.py
def spark_cython(module, method):
    def wrapped(*args, **kwargs):
        global cython_function_
        try:
            return cython_function_(*args, **kwargs)
        except:
            # First call on this worker: compile the Cython module with
            # pyximport and cache the requested function in a global.
            import pyximport
            pyximport.install()
            cython_function_ = getattr(__import__(module), method)
        return cython_function_(*args, **kwargs)
    return wrapped
clinical_kernel.pyx
cimport cython
from libc cimport math
cimport numpy as cnp

cnp.import_array()

def mydist(cnp.npy_double[:] x, cnp.npy_double[:] y):
    cdef double ranges[3]
    cdef int k
    cdef double out = 0, out2 = 0
    ranges[:] = [0.04028, 0.0983, 0.06602]
    for k in range(3):
        out += (ranges[k] - math.fabs(x[k] - y[k])) / ranges[k]
    for k in range(3, 5):
        out2 += x[k] == y[k]
    return (out + out2) / 5
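For reference, the kernel can be smoke-tested locally with pyximport before wiring it into Spark; a sketch (the include_dirs setup argument is an assumption, needed because the module cimports numpy and so requires its headers):

import numpy as np
import pyximport

# Compile the .pyx on the fly; numpy headers are required for 'cimport numpy'.
pyximport.install(setup_args={"include_dirs": np.get_include()})
import clinical_kernel

x = np.array([0.72694, 1.4742, 0.32396, 1.0, 1.0])
y = np.array([0.74173, 1.5257, 0.36116, 0.0, 0.0])
print(clinical_kernel.mydist(x, y))  # kernel value for the first two sample rows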
main.py
from __future__ import print_function
from pyspark import SparkConf, SparkContext
from sklearn.neighbors import NearestNeighbors
import numpy as np
from spark_tools import spark_cython
import pyximport
conf = SparkConf().setAppName('Fibo')
sc = SparkContext(conf=conf)
sc.addFile('file:///farzanadata/clinical_kernel.pyx')
sc.addFile('file:///farzanadata/spark_tools.py')
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
pyximport.install()
import clinical_kernel
df=sc.parallelize([[0.72694,1.4742,0.32396,1,1],[0.74173,1.5257,0.36116,0,0],[0.76722,1.5725,0.38998,1,0],[0.76722, 1.5725, 0.38998,0,1]])
X=np.array(df.collect())
mapper = spark_cython('clinical_kernel', 'mydist')
nbrs=NearestNeighbors(n_neighbors=4,metric=mapper)
nbrs.fit(X)
bc_nbrs = sc.broadcast(nbrs)
neighbors=df.map(lambda x: bc_nbrs.value.kneighbors(x,n_neighbors=4,return_distance=False))
neigh_df = neighbors.map(lambda x: x.tolist()).toDF(["neighbors"])
neigh_df.show()
The following works perfectly if I do not broadcast the kNN tree, which of course is not ideal for a large dataset:
neighbors=nbrs.kneighbors(X,n_neighbors=4,return_distance=False)
Using import dill as pickle does not help either.
Answer 0 (score: 0)
Changing spark_tools.py in the following way did the trick.
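For illustration, a minimal sketch of one such change (one picklable design, not necessarily the exact code used): replace the nested wrapped closure with a module-level callable class, since cPickle can serialize instances of importable classes but not functions defined inside other functions.

# spark_tools.py (sketch): picklable replacement for spark_cython
class CythonMetric(object):
    def __init__(self, module, method):
        self.module = module
        self.method = method
        self._func = None          # compiled Cython function, loaded lazily

    def __call__(self, *args, **kwargs):
        if self._func is None:
            import pyximport
            import numpy as np
            # numpy headers are needed because clinical_kernel cimports numpy
            pyximport.install(setup_args={"include_dirs": np.get_include()})
            self._func = getattr(__import__(self.module), self.method)
        return self._func(*args, **kwargs)

    def __getstate__(self):
        # Exclude the unpicklable compiled function from the pickled state.
        return (self.module, self.method)

    def __setstate__(self, state):
        self.module, self.method = state
        self._func = None

In main.py the only change would then be mapper = CythonMetric('clinical_kernel', 'mydist'); the fitted NearestNeighbors object carrying this metric can be pickled, so sc.broadcast(nbrs) no longer fails.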