Saving a PySpark RDD to HBase raises an AttributeError

Date: 2016-02-05 16:30:27

Tags: python apache-spark hbase pyspark rdd

I am trying to write a Spark RDD to an HBase table using PySpark. Printed with print rdd.take(rdd.count()), the RDD looks like this:

[Decimal('0.39326837'), Decimal('0.03643601'), Decimal('0.06031798'), Decimal('0.08885452')]

When I try to write the RDD to the HBase table with the function SaveRecord:

def SaveRecord(tx_fee_rdd):  
    host = 'localhost'  #sys.argv[1]  
    table = 'tx_fee_table'  #needs to be created before hand in hbase shell  
    conf = {"hbase.zookeeper.quorum": host,
            "hbase.mapred.outputtable": table,
            "mapreduce.outputformat.class": "org.apache.hadoop.hbase.mapreduce.TableOutputFormat",
            "mapreduce.job.output.key.class": "org.apache.hadoop.hbase.io.ImmutableBytesWritable",
            "mapreduce.job.output.value.class": "org.apache.hadoop.io.Writable"}
    keyConv = "org.apache.spark.examples.pythonconverters.StringToImmutableBytesWritableConverter"
    valueConv = "org.apache.spark.examples.pythonconverters.StringListToPutConverter"
    #row key id,id, cfamily=tx_fee_col,column_name = tx_fee, column_value=x 
    datamap = tx_fee_rdd.map(lambda x: ("tx_fee_col","tx_fee",x ) )  
    datamap.saveAsNewAPIHadoopDataset(conf=conf,keyConverter=keyConv,valueConverter=valueConv)  


tx_fee_rdd.foreach(SaveRecord)

I get the following error:

AttributeError: 'Decimal' object has no attribute 'map'

How should I deal with this?

Following @zeros323's suggestion, I get the following error:

Traceback (most recent call last):
  File "/home/ubuntu/unix_practice/bcrpc/bitcoin-inspector-webserver/bitcoin/bctxfee_text3.py", line 66, in <module>
    SaveRecord(tx_fee_rdd)
  File "/home/ubuntu/unix_practice/bcrpc/bitcoin-inspector-webserver/bitcoin/bctxfee_text3.py", line 29, in SaveRecord
    datamap.saveAsNewAPIHadoopDataset(conf=conf,keyConverter=keyConv,valueConverter=valueConv)  
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/rdd.py", line 1348, in saveAsNewAPIHadoopDataset
  File "/usr/local/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py", line 538, in __call__
  File "/usr/local/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/protocol.py", line 300, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.saveAsHadoopDataset.
: org.apache.spark.SparkException: RDD element of type [Ljava.lang.Object; cannot be used
    at org.apache.spark.api.python.SerDeUtil$.pythonToPairRDD(SerDeUtil.scala:237)
    at org.apache.spark.api.python.PythonRDD$.saveAsHadoopDataset(PythonRDD.scala:801)
    at org.apache.spark.api.python.PythonRDD.saveAsHadoopDataset(PythonRDD.scala)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:498)
    at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
    at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:379)
    at py4j.Gateway.invoke(Gateway.java:259)
    at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
    at py4j.commands.CallCommand.execute(CallCommand.java:79)
    at py4j.GatewayConnection.run(GatewayConnection.java:207)
    at java.lang.Thread.run(Thread.java:745)

2 Answers:

Answer 0 (score: 2):

foreach operates on individual records, so it receives Decimal objects, not RDDs. You cannot map over these, let alone call the saveAsNewAPIHadoopDataset method on them.

If you want to use saveAsNewAPIHadoopDataset, your function should operate directly on the RDD:

SaveRecord(tx_fee_rdd)
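
To illustrate the difference, a minimal sketch (the SparkContext setup and the sample values are assumptions for the sketch, not part of the original code):

from pyspark import SparkContext
from decimal import Decimal

sc = SparkContext("local[*]", "foreach-demo")   # assumed local context for the sketch
rdd = sc.parallelize([Decimal('0.39326837'), Decimal('0.03643601')])

# rdd.foreach(SaveRecord)   # hands each element (a Decimal) to the function,
#                           # so the .map call inside SaveRecord raises the AttributeError above
# SaveRecord(rdd)           # the function receives the RDD itself, which does have .map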

Another possible problem is this part:

datamap = tx_fee_rdd.map(lambda x: ("tx_fee_col","tx_fee",x ) )  

saveAsNewAPIHadoopDataset expects pairs, not triples. It probably also won't work with Decimal objects. See the hbase_outputformat.py example for details.
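
Roughly, the converters want (row key, [row key, column family, qualifier, value]) with string components. A minimal sketch of a conforming map inside SaveRecord, assuming the row key is simply the element's position from zipWithIndex (an assumption, not something specified in the question):

    datamap = tx_fee_rdd.zipWithIndex().map(
        lambda x: (str(x[1]), [str(x[1]), "tx_fee_col", "tx_fee", str(x[0])]))
    datamap.saveAsNewAPIHadoopDataset(conf=conf, keyConverter=keyConv, valueConverter=valueConv)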

Answer 1 (score: 0):

@zeros, the statement about the key-value format is correct, but not the one about foreach: this is a streaming job, so each micro-batch is handed to the function as an RDD via foreachRDD. You can see this blog.
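
A minimal sketch of that distinction (tx_fee_stream is an assumed DStream name, not taken from the original code):

# RDD.foreach(f)        -> f receives each individual element
# DStream.foreachRDD(f) -> f receives the whole RDD of each micro-batch
tx_fee_stream.foreachRDD(SaveRecord)   # SaveRecord gets an RDD and can call .map on it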

Here is working code that writes to HBase:

from pyspark import SparkContext
from jsonrpc.authproxy import AuthServiceProxy
from pyspark.streaming import StreamingContext
import json


# Create a local StreamingContext with as many worker threads as cores and a 0.5-second batch interval
sc = SparkContext("local[*]", "txcount")
ssc = StreamingContext(sc, 0.5) #0.001 did 9710 blocks in 12 minutes

#function SaveRecord: saves tx_fee for a block to hbase database
def SaveRecord(tx_fee_rdd):  
    host = 'localhost'  #sys.argv[1]  
    table = 'transaction_fee_table' #needs to be created before hand in hbase shell  
    conf = {"hbase.zookeeper.quorum": host,
            "hbase.mapred.outputtable": table,
            "mapreduce.outputformat.class": "org.apache.hadoop.hbase.mapreduce.TableOutputFormat",
            "mapreduce.job.output.key.class": "org.apache.hadoop.hbase.io.ImmutableBytesWritable",
            "mapreduce.job.output.value.class": "org.apache.hadoop.io.Writable"}
    keyConv = "org.apache.spark.examples.pythonconverters.StringToImmutableBytesWritableConverter"
    valueConv = "org.apache.spark.examples.pythonconverters.StringListToPutConverter"
    #row key id,id, cfamily=tx_fee_col,column_name = tx_fee, column_value=x 
    #datamap = tx_fee_rdd.map(lambda x: ("tx_fee",x) )  
    #( rowkey , [ row key , column family , column name , value ] )
    datamap = tx_fee_rdd.map(lambda x: (str(x[0]),
                            [str(x[0]),"tx_fee_col","tx_fee",str(x[1])])
                            )

    datamap.saveAsNewAPIHadoopDataset(conf=conf,
                                      keyConverter=keyConv,
                                      valueConverter=valueConv)

lines = ssc.socketTextStream("localhost", 8888)
dump_rdd = lines.map(lambda x: json.dumps(x))
#print dump_rdd.take(2)
load_rdd = dump_rdd.map(lambda x: json.loads(x)).map(lambda x : x.decode('unicode_escape').encode('ascii','ignore'))
#load_rdd.pprint(2)

#load_rdd.pprint(100)
#tx = load_rdd.flatMap(lambda x: x.split(":")) #this works
split_blk_rdd = load_rdd.map(lambda x: x.split(":"))
#split_blk_rdd.pprint()

tx_fee_rdd = split_blk_rdd.map(lambda x : (x[14][1:7],x[15][1:-15])) #this gets transaction fee
#tx_fee_rdd.pprint(200)     #works
tx_fee_rdd.foreachRDD(SaveRecord)       #function call
#tx_fee_rdd.saveAsTextFiles("hdfs://ec2-52-21-47-235.compute-1.amazonaws.com:9000/bitcoin/","txt")

ssc.start()                           # Start the computation
#ssc.awaitTermination()               # Wait for the computation to terminate
ssc.awaitTerminationOrTimeout(15000)  # or time out after 15000 seconds
#ssc.stop()                           # Stop the computation
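
As a usage note (the paths below are assumptions, adjust them to your installation): the org.apache.spark.examples.pythonconverters classes ship in the Spark examples jar, so the job typically needs that jar, plus the HBase client jars, on the classpath when it is submitted, and the written rows can then be checked from the HBase shell:

spark-submit --jars /usr/local/spark/lib/spark-examples.jar bctxfee_text3.py

scan 'transaction_fee_table'   # in the HBase shell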