I am getting "ImportError: No module named openpyxl".

Command used for execution:

spark-submit --driver-memory 5g --jars /tmp/spark-csv_2.10-1.5.0.jar,/tmp/commons-csv-1.6.jar /app/T6M0_app/T6M0_app_Copy/QA_Automation/myspark.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys, datetime
from openpyxl import load_workbook
from openpyxl.compat import range
from openpyxl.utils import get_column_letter
from openpyxl import Workbook
from pyspark.sql import *
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, HiveContext
from pyspark.sql.types import *
import os
import subprocess
import re
def compare_source_target_values(b_date, revid):
    print("Hello spark")
    # Load the workbook that holds the export SQL (one query per row)
    wb = load_workbook('/app/T6M0_app/T6M0_app_Copy/Data_Count_SQL.xlsx', data_only=True)
    ws = wb.worksheets[0]
    maxrow = ws.max_row
    maxcol = ws.max_column
    print('Max Row: ' + str(maxrow))
    print('Max Col: ' + str(maxcol))
    conf1 = SparkConf().setAppName("ExportFromHDFS")
    sc = SparkContext(conf=conf1)
    sqlContext = HiveContext(sc)
    for curr_row in range(2, maxrow + 1):
        source_sql = str(ws.cell(row=curr_row, column=2).value)
        dest_path = str(ws.cell(row=curr_row, column=3).value)
        result_fname = str(ws.cell(row=curr_row, column=4).value)
        print('Executing*************' + source_sql)
        print('Destination Path: ' + dest_path)
        print('Result File Name: ' + result_fname)
        # Query and output path are hard-coded for now; source_sql and dest_path are not used yet
        df = sqlContext.sql("select * from qa_lbn0_cz.vw_yqi0_ebr_core where BUSINESSEFFECTIVEDATE = '2018-01-31' and rev=2 limit 5")
        df.coalesce(1).write.format("com.databricks.spark.csv").mode("overwrite").option("header", "true").save("/tmp/sample3")
    wb.close()
if __name__ == "__main__":
    # record start time
    startTime = datetime.datetime.now()
    result = False
    # Check input parameters
    if len(sys.argv) != 3:
        print("Error: Need 2 input parameters: business effective date and rev id.")
        sys.exit(1)
    else:
        f_businessdate = sys.argv[1]
        rev_id = sys.argv[2]
        # get_master_control_table_data_hive(f_businessdate, rev_id)
        # format_master_control_data()
        compare_source_target_values(f_businessdate, rev_id)
The script exports data from HDFS and saves it to CSV files. The export queries are read from the Excel workbook. The resulting .deflate files that land in the HDFS location should then be converted to .csv and stored on the local system.
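For the ".deflate to local .csv" step described above, here is a minimal sketch of what I have in mind (not part of the script yet; the function name and paths are hypothetical, and it assumes the hdfs client is on the PATH, since "hdfs dfs -text" decompresses .deflate part files as it prints them):

    import subprocess

    def copy_hdfs_result_to_local_csv(hdfs_dir, local_csv):
        # "hdfs dfs -text" expands the glob on HDFS and decompresses each .deflate part file
        with open(local_csv, 'w') as out:
            subprocess.check_call(['hdfs', 'dfs', '-text', hdfs_dir + '/part-*'], stdout=out)

    # e.g. copy_hdfs_result_to_local_csv('/tmp/sample3', '/tmp/sample3.csv')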
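Regarding the ImportError itself: spark-submit runs the script with the cluster's Python interpreter, and the error suggests openpyxl is not installed for that interpreter. One possible workaround (a sketch, assuming openpyxl and its pure-Python dependencies jdcal and et_xmlfile are zipped into /tmp/openpyxl_deps.zip, which is a hypothetical path) would be to ship the packages with --py-files so they are placed on the Python path of the driver and executors:

    spark-submit --driver-memory 5g --py-files /tmp/openpyxl_deps.zip --jars /tmp/spark-csv_2.10-1.5.0.jar,/tmp/commons-csv-1.6.jar /app/T6M0_app/T6M0_app_Copy/QA_Automation/myspark.py

The simpler alternative, if it is allowed on the cluster, would be to pip install openpyxl for the Python used by spark-submit on the driver node.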