from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import gc
import pandas as pd
import datetime
import numpy as np
import sys
# Application name under which this job is registered with Spark.
APP_NAME = "DataFrameToCSV"

# Build (or reuse) the SparkSession for this script, with cross joins enabled
# through the "spark.sql.crossJoin.enabled" setting.
spark = (
    SparkSession.builder
    .appName(APP_NAME)
    .config("spark.sql.crossJoin.enabled", "true")
    .getOrCreate()
)
# Sample data: the three lists are aligned by position, one entry per row.
group_ids = [1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2]
dates = [
    "2016-04-01", "2016-04-01", "2016-04-01", "2016-04-20", "2016-04-20",
    "2016-04-28", "2016-04-28", "2016-04-05", "2016-04-05", "2016-04-05",
    "2016-04-05", "2016-04-20", "2016-04-20", "2016-04-29",
]
# Alternative event sequence, kept for reference:
# event = [0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0]
event = [0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0]

# Build the frame from a column mapping so each column keeps its own dtype.
# Stacking the lists with np.column_stack first would coerce the integer
# columns to strings, because a NumPy array holds a single dtype.
df = pd.DataFrame(
    {"group_ids": group_ids, "dates": dates, "event": event},
    columns=["group_ids", "dates", "event"],
)
上面的Python代码将在gcloud Dataproc上的Spark集群上运行。我想把这个pandas数据帧以csv文件的形式保存到gcloud存储桶的gs://mybucket/csv_data/路径下,我该怎么做?
答案 0 :(得分:3)
所以,我想出了如何做到这一点。继续上面的代码,这是解决方案:
# Handle to the SparkContext that backs the existing `spark` session, kept
# for any later code that expects `sc`.
sc = spark.sparkContext

# Convert the pandas DataFrame into a Spark DataFrame through the existing
# SparkSession. A separate SQLContext is not needed: it is deprecated since
# Spark 3.0 and SparkSession exposes the same createDataFrame entry point.
sparkDf = spark.createDataFrame(df)

# coalesce(1) collapses the data into a single partition so that one CSV part
# file is written; the "header" option emits the column names as first row.
# NOTE: the path names an output directory, not a single file.
sparkDf.coalesce(1).write.option("header", "true").csv('gs://mybucket/csv_data')
答案 1 :(得分:3)
您也可以使用Dask来实现这一点。您可以将pandas DataFrame转换为Dask DataFrame,然后将其写入Cloud Storage上的csv文件:
import dask.dataframe as dd
import pandas
# `df` is the pandas DataFrame to export.
# Wrap it in a Dask DataFrame with a single partition.
ddf = dd.from_pandas(df, npartitions=1, sort=True)

# Write the partition(s) to Cloud Storage; Dask replaces the '*' in the file
# name with the partition number. No column header row and no index column
# are written.
# Credentials are resolved by the storage backend's defaults. The earlier
# `storage_options={'token': gcs.session.credentials}` argument referenced an
# undefined name `gcs` and raised NameError; pass `storage_options` with a
# valid token only if explicit credentials are required.
ddf.to_csv('gs://YOUR_BUCKET/ddf-*.csv', index=False, sep=',', header=False)
storage_options参数是可选的。