下面的代码将spark数据帧转换为Pandas,以CSV文件格式写入本地。
import pandas as pd
import numpy as np
from math import sqrt
def lag1_correlations(flows):
    """Return the correlation of every row with the row before it.

    Element ``i`` is the Pearson correlation between row ``i`` and row
    ``i - 1`` of *flows*, rounded to 4 decimals.  Row 0 is paired with the
    last row, so the sequence is treated as cyclic.
    """
    # iloc[-1] is the last row, which gives the cyclic pairing for i == 0.
    return pd.Series(
        [round(flows.iloc[i].corr(flows.iloc[i - 1]), 4)
         for i in range(len(flows))],
        dtype=float,
    )


def regression_coefficients(corr, std_dev):
    """Return the regression slopes ``corr[i] * std_dev[i] / std_dev[i - 1]``.

    Values are rounded to 4 decimals; index 0 uses the last element as its
    predecessor, matching :func:`lag1_correlations`.
    """
    # Positional access keeps this correct for any index labels.
    return pd.Series(
        [round(corr.iloc[i] * (std_dev.iloc[i] / std_dev.iloc[i - 1]), 4)
         for i in range(len(corr))],
        dtype=float,
    )


def generate_flows(mean_flow, std_dev, corr, reg, deviates, initial_flow):
    """Generate one synthetic value per row with the lag-1 recurrence.

    For each position ``t``::

        q_t = mean[t] + reg[t] * (q_prev - mean[t-1])
              + z[t] * std[t] * sqrt(1 - corr[t] ** 2)

    where ``q_prev`` starts at *initial_flow* and ``t - 1`` wraps to the
    last position for ``t == 0``.

    Raises:
        ValueError: if fewer deviates than rows are supplied.
    """
    count = len(mean_flow)
    z = np.asarray(deviates, dtype=float)
    if len(z) < count:
        raise ValueError(
            f"need at least {count} random deviates, got {len(z)}"
        )

    generated = []
    previous = initial_flow
    for t in range(count):
        # The original indexed with the label t-1, which raises KeyError for
        # t == 0 on a default RangeIndex; positional -1 wraps to the last row.
        # The random term uses corr[t], the same correlation that defines
        # reg[t] (the original used corr[t-1]).
        previous = (
            mean_flow.iloc[t]
            + reg.iloc[t] * (previous - mean_flow.iloc[t - 1])
            + z[t] * std_dev.iloc[t] * sqrt(1 - corr.iloc[t] ** 2)
        )
        generated.append(previous)
    return pd.Series(generated, dtype=float)


if __name__ == "__main__":
    df1 = pd.read_excel('/home/badri/DataAnalytics_CCE/Data_Generation/Khanapur_Flow.xlsx')

    # Per-row statistics; the original comments describe each row as a month.
    mean_flow = df1.mean(axis=1)
    std_dev = df1.std(axis=1)

    # Correlation coefficients 'rt' between consecutive rows.
    corr = lag1_correlations(df1)
    print(corr)

    # Regression coefficients 'bt' (slope of regression).
    reg = regression_coefficients(corr, std_dev)
    print(reg)

    # Random normal deviates 'Zt' with zero mean and unit standard deviation
    # (np.random.normal takes the standard deviation, not the variance).
    z_t = np.random.normal(0.0, 1.0, 48)
    rand = pd.Series(z_t)
    print(rand)

    # Synthetic monthly volume of discharge 'qt', seeded with the last
    # observed value.  The original wrapped this in an outer loop that reset
    # the state and recomputed the same values on every pass; a single pass
    # produces the same series.
    # NOTE(review): only the first len(df1) of the 48 deviates are consumed -
    # confirm whether a longer synthetic record was intended.
    volume_series = generate_flows(
        mean_flow, std_dev, corr, reg, rand, df1.iloc[-1, -1]
    )
    print('The generated data is:')
    print(volume_series)
我使用的命令如下:
# Append the pandas copy of `myschema` to final_op.txt as pipe-separated rows
# without header or index. The keyword is `escapechar`; the misspelled
# `excapechar` is rejected by to_csv with a TypeError.
myschema.toPandas().to_csv("final_op.txt",header=False,sep='|',index=False,mode='a',doublequote=False,escapechar='"',quoting=None)
请注意,在我的“myschema”数据框中没有双引号,但在写入CSV时出现了双引号。实际输出如下所示:
"COLUMN DEFINITION|id"|int
"COLUMN DEFINITION|name"|string
所需的输出没有双引号,如下所示:
COLUMN DEFINITION|id|int
COLUMN DEFINITION|name|string
我尝试通过设置 quoting=None 来解决问题,但是没有成功。有人可以在这里帮我吗?谢谢。
答案 0 :(得分:0)
将 quoting=csv.QUOTE_NONE
传递给to_csv
命令:
import csv  # provides csv.QUOTE_NONE

# Same call as in the question, with quoting disabled via csv.QUOTE_NONE.
# The keyword is `escapechar`; the misspelled `excapechar` is rejected by
# to_csv with a TypeError.
myschema.toPandas().to_csv("final_op.txt",header=False,sep='|',index=False,mode='a',doublequote=False,escapechar='"',quoting=csv.QUOTE_NONE)