# Print the first element of `embed`; take(1) returns it wrapped in a one-element list.
print (embed.take(1))
返回以下内容:
[(u'text', array([-0.31921682, -0.20686883, 0.01824462, 0.13350081, 0.01187224,
0.22116834, -0.287487 , -0.11509234, 0.01763669, 0.06377559,
0.2989474 , 0.41020724, -0.22037283, 0.45994595, -0.12425458,
-0.20881261, 0.06872807, 0.53261876, 0.45528108, 0.3245842 ,
0.09092806, 0.17639753, -0.47674257, -0.00098801, -0.12842408,
-0.2413709 , 0.38194713, -0.11250313, -0.15904745, 0.16058864,
-0.33080024, 0.37156585, 0.01329294, 0.36711624, 0.1973844 ,
-0.18771271, 0.08853641, 0.23573542, 0.09280699, -0.07244137,
0.09726012, -0.28807876, 0.01709639, 0.375758 , 0.4611828 ,
0.02307661, 0.3119973 , -0.5212216 , 0.230173 , -0.09128311,
0.2713826 , -0.2568359 , -0.11232495, 0.00200466, 0.09583885,
-0.25420523, -0.10021619, -0.19341935, -0.22922793, 0.0212901 ,
0.20808727, 0.32417038, 0.03864996, 0.2969149 , -0.03171158,
0.45413095, 0.6309765 , 0.28096622, -0.45515797, 0.28787974,
-0.5809179 , -0.09877653, 0.24814974, -0.35588014, 0.42792156,
0.21451631, -0.1350529 , 0.37952444, -0.00165558, 0.384076 ,
0.1413526 , 0.08866125, -0.10355992, 0.25019792, -0.00393839,
0.5695221 , -0.12449711, 0.00210058, 0.07765691, 0.3281926 ,
-0.08014766, 0.20689923, -0.22970992, 0.07591247, -0.15799475,
-0.22765721, -0.1927638 , -0.15355097, 0.09073654, -0.04624737,
-0.01643844, -0.8464762 , 0.08931787, 0.5332598 , 0.2911471 ,
0.3791839 , -0.303577 , -0.22905344, -0.21888404, -0.1810556 ,
-0.019493 , 0.1367392 , 0.2187451 , 0.04935849, 0.14806354,
-0.00551599, -0.05861316, 0.26915333, -0.3377117 , 0.00114926,
-0.08245742, 0.4929164 , 0.06329145, 0.88905925, 0.25238925,
-0.33230686, 0.19560733, 0.4172665 , 0.02552557, 0.26976195,
0.26783204, -0.25621846, 0.1972084 , -0.3187281 , 0.04309576,
0.5668932 , -0.20068711, -0.55052537, 0.38765076, 0.4864744 ,
0.1130944 , -0.01685749, 0.2522309 , 0.35446006, -0.09084648,
0.24245648, 0.06625048, 0.32369784, -0.06834482, -0.43762162,
0.5748935 , -0.3647702 , 0.35806394, -0.1582715 , 0.0772159 ,
-0.16100545, 0.4267 , -0.1307025 , -0.03227446, 0.10494301,
-0.05289922, 0.7097728 , -0.17166416, -0.054304 , 0.12740278,
-0.14317441, -0.26385677, -0.22849232, 0.10305541, -0.04086439,
-0.46178675, -0.09216189, -0.76668286, -0.09225449, -0.2168125 ,
0.12703866, 0.05073327, 0.04017496, -0.24126993, 0.06369572,
-0.09168304, -0.40669888, 0.28603286, -0.04988515, 0.05513516,
0.7806739 , -0.02534869, 0.00973589, 0.33947662, 0.24699458,
0.42974108, -0.19158548, 0.07973159, 0.30443648, 0.01040802,
0.1698588 , 0.06705329, 0.03138978, -0.5304623 , 0.1360791 ,
0.24522378, 0.12584817, -0.14219321, 0.19166584, 0.2226152 ,
0.07703363, 0.23977087, 0.11122001, 0.1861035 , 0.37293455,
0.33047304, -0.00121733, 0.5897423 , -0.06080131, -0.05202375,
0.10317306, -0.04354465, -0.31621787, 0.35597408, 0.3476911 ,
0.19293919, -0.02774811, -0.2739977 , -0.08927495, 0.04805851,
-0.17200205, -0.7588404 , -0.0615377 , 0.22575249, -0.09135661,
0.05711236, 0.04755763, -0.22369057, -0.17867683, -0.5001432 ,
-0.50440204, 0.09518195, 0.21083611, 0.1761034 , -0.12429572,
-0.39988747, 0.61002195, 0.4314368 , 0.02579845, -0.28676844,
-0.03552085, 0.21395397, 0.075849 , -0.21982886, -0.39733076,
0.17018917, 0.0251913 , 0.14543247, 0.2262631 , -0.05759874,
-0.18585657, 0.3211592 , -0.272339 , -0.22289205, -0.31557533,
0.27803165, 0.22905917, -0.15953094, -0.10018265, -0.32501385,
0.3319722 , -0.5288052 , -0.18201298, 0.08485821, -0.3703766 ,
0.21344219, -0.13149662, 0.21560058, 0.08420809, 0.15680231,
0.22140822, 0.24218608, 0.40488818, -0.00483301, -0.34549713,
0.4376315 , 0.53999686, 0.5157788 , -0.14618067, 0.15518756,
-0.29715803, 0.3193897 , 0.08780982, -0.08137056, -0.01085411,
0.17661236, 0.03506121, -0.030442 , -0.04898388, 0.36918342,
-0.07420906, 0.24415983, -0.02784878, 0.11730439, 0.24948044],
dtype=float32))]
而 print(type(embed)) 打印:<class 'pyspark.rdd.PipelinedRDD'>。
当我执行 print(embed1.distinct().count()) 时,会抛出如下异常:
Traceback (most recent call last):
  File "/project/6008168/tamouze/testSparkCedar.py", line 390, in <module>
    print(embed1.distinct().count())
  File "/cvmfs/soft.computecanada.ca/easybuild/software/2017/Core/spark/2.3.0/python/lib/pyspark.zip/pyspark/rdd.py", line 1056, in count
  File "/cvmfs/soft.computecanada.ca/easybuild/software/2017/Core/spark/2.3.0/python/lib/pyspark.zip/pyspark/rdd.py", line 1047, in sum
  File "/cvmfs/soft.computecanada.ca/easybuild/software/2017/Core/spark/2.3.0/python/lib/pyspark.zip/pyspark/rdd.py", line 921, in fold
  File "/cvmfs/soft.computecanada.ca/easybuild/software/2017/Core/spark/2.3.0/python/lib/pyspark.zip/pyspark/rdd.py", line 824, in collect
  File "/cvmfs/soft.computecanada.ca/easybuild/software/2017/Core/spark/2.3.0/python/lib/py4j-0.10.6-src.zip/py4j/java_gateway.py", line 1160, in __call__
  File "/cvmfs/soft.computecanada.ca/easybuild/software/2017/Core/spark/2.3.0/python/lib/py4j-0.10.6-src.zip/py4j/protocol.py", line 320, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 3 in stage 1.0 failed 1 times, most recent failure: Lost task 3.0 in stage 1.0 (TID 4, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/cvmfs/soft.computecanada.ca/easybuild/software/2017/Core/spark/2.3.0/python/lib/pyspark.zip/pyspark/worker.py", line 229, in main
    process()
  File "/cvmfs/soft.computecanada.ca/easybuild/software/2017/Core/spark/2.3.0/python/lib/pyspark.zip/pyspark/worker.py", line 224, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/cvmfs/soft.computecanada.ca/easybuild/software/2017/Core/spark/2.3.0/python/lib/pyspark.zip/pyspark/rdd.py", line 2438, in pipeline_func
  File "/cvmfs/soft.computecanada.ca/easybuild/software/2017/Core/spark/2.3.0/python/lib/pyspark.zip/pyspark/rdd.py", line 2438, in pipeline_func
  File "/cvmfs/soft.computecanada.ca/easybuild/software/2017/Core/spark/2.3.0/python/lib/pyspark.zip/pyspark/rdd.py", line 362, in func
  File "/cvmfs/soft.computecanada.ca/easybuild/software/2017/Core/spark/2.3.0/python/lib/pyspark.zip/pyspark/rdd.py", line 1857, in combineLocally
  File "/cvmfs/soft.computecanada.ca/easybuild/software/2017/Core/spark/2.3.0/python/lib/pyspark.zip/pyspark/shuffle.py", line 238, in mergeValues
    d[k] = comb(d[k], v) if k in d else creator(v)
TypeError: unhashable type: 'numpy.ndarray'
我得到如下嵌入:
# Build (word, embedding) pairs: every (url, document) record is lower-cased and
# split on whitespace, then each word is looked up via Loader(modelpath).map(word).
# NOTE(review): the embedding values appear to be numpy arrays, which are
# unhashable -- presumably why distinct() raises the TypeError; confirm.
# NOTE(review): tuple-parameter lambdas are Python 2-only syntax.
inputInvertedIndexEmbedding = inputRawEmbed.flatMap(lambda (url, document): [(url, word) for word in document.lower().split()]).map(lambda (url, word): (word, Loader(modelpath).map(word)))
其中 Loader 的定义如下:
import gensim
from gensim.models.fasttext import FastText as FT_gensim
import numpy as np
class Loader(object):
    """Lazily load a FastText model and map words to embedding vectors.

    All state lives on the class, so every ``Loader`` instance created in the
    same Python process shares the loaded models and the fallback vectors.
    """

    # Loaded models, keyed by model file name.
    cache = {}
    # Random fallback vectors for words the model does not contain, keyed by word.
    emb_dic = {}
    # Number of model loads performed so far in this process.
    count = 0

    def __init__(self, filename):
        """Remember the model path; nothing is loaded until first use."""
        self.fn = filename

    @property
    def fasttext(self):
        """Return the model for ``self.fn``, loading it on first access."""
        if self.fn not in Loader.cache:
            Loader.cache[self.fn] = FT_gensim.load_fasttext_format(self.fn)
            print ("==================================================================")
            Loader.count = Loader.count + 1
            print("**********************The class Loader is loded for the {} times ******************".format(Loader.count))
            print ("==================================================================")
        return Loader.cache[self.fn]

    def map(self, word):
        """Return the embedding vector for ``word``.

        Words known to the model get the model's vector. Unknown words get a
        random vector drawn uniformly from [0, 1), generated once and then
        reused so that repeated lookups of the same word agree.

        NOTE(review): the returned numpy arrays are unhashable; callers that
        need hashable values (e.g. for RDD.distinct()) must convert them.
        """
        model = self.fasttext
        if word in model:
            return model[word]
        if word not in Loader.emb_dic:
            # Size 300 is hard-coded; assumes the model's vectors are
            # 300-dimensional -- TODO confirm against the loaded model.
            Loader.emb_dic[word] = np.random.uniform(low=0.0, high=1.0, size=300)
        return Loader.emb_dic[word]
答案 0 :(得分:1)
我的最佳猜测是,PySpark 不知道如何处理 numpy 数组,因为 numpy 有自己的类型,而不是 Python 原生的 float / double——我通常是在对数据框(DataFrame)执行类似操作时看到此错误。如果您返回 self.fasttext[word].tolist(),并在返回之前把其他所有 numpy 类型都转换为基本的 Python 类型,应该就能解决该错误。
答案 1 :(得分:0)
我通过以下方式解决了这个问题:
# Build (word, embedding) pairs with the embedding converted to a tuple, so the
# values are hashable and distinct() can shuffle them.
# Records are indexed instead of unpacked in the lambda signature because
# tuple parameters are a SyntaxError on Python 3; indexing works on 2 and 3.
inputInvertedIndexEmbedding = (
    inputRawEmbed
    # record is (url, document): one (url, word) pair per lower-cased token.
    .flatMap(lambda record: [(record[0], word) for word in record[1].lower().split()])
    # pair is (url, word): keep the word and its embedding as a tuple.
    .map(lambda pair: (pair[1], tuple(Loader(modelpath).map(pair[1]))))
)