我想在Python中广播一个我希望用于工作节点上查找的散列图。
class datatransform:
# Constructor
def __init__(self, lookupFileName, dataFileName):
self.lookupFileName = lookupFileName
self.dataFileName = dataFileName
self.hamp = {}
self.broadcastVar = None;
# Read lookup file from the filesystem and create a local hashmap
# first and then create a broadcast variable.
def create_dictionary(self):
lookup_read = sc.textFile(self.lookupFileName)
self.lookup_parsed = (lookup_read
.map(lambda line: [line.split('\t')[0], line.split('\t')[1]]))
self.broadcastVar = sc.broadcast(self.lookup_parsed)
# This function will map the given id to a new index using the broadcasted hashmap.
def featurize(self) :
data_projected = sqlContext.sql("SELECT uid, prod_id FROM userprods ")
data = data_projected.map(lambda p: [p.uid, p.prod_id])
bcastmap = self.broadcastVar
data_featurized = (data_projected
.map(lambda p: [p.uid, bcastmap.value[p.prod_id]]))
datatransform = datatransform ('/path/to/lookupfile', '/path/to/datafile')
datatransform.create_dictionary()
datatransform.read_data()
我收到以下错误消息:
错误讯息: 您似乎正在尝试广播RDD或从"中引用RDD。 例外:您似乎正在尝试广播RDD或从动作或转换中引用RDD。 RDD转换和操作只能由驱动程序调用,而不是
答案 0 :(得分:1)
如果您想播放字典,请先收集。这意味着create_dictionary
应该看起来或多或少像这样
def create_dictionary(self):
lookup_read = sc.textFile(self.lookupFileName)
lookup_parsed = (lookup_read
.map(lambda line: [line.split('\t')[0], line.split('\t')[1]]))
self.broadcastVar = sc.broadcast(lookup_parsed.collectAsMap())