I'm currently working on Google Cloud. These are the commands I run:
gcloud beta dataproc clusters create my-test \
    --project my-project \
    --subnet prod-sub-1 \
    --zone southamerica-east1-a \
    --region=southamerica-east1 \
    --master-machine-type n1-standard-4 \
    --master-boot-disk-size 40 \
    --num-workers 5 \
    --worker-machine-type n1-standard-4 \
    --worker-boot-disk-size 20 \
    --image-version 1.2 \
    --tags internal,ssh,http-server,https-server \
    --properties dataproc:dataproc.conscrypt.provider.enable=false \
    --format=json \
    --max-idle=10m
gcloud dataproc jobs submit pyspark gs://path-to-script/spark_full_job.py \
    --cluster=my-test \
    --project=my-project \
    --region=southamerica-east1 \
    --jars=gs://path-to-driver/mssql-jdbc-6.4.0.jre8.jar \
    --format=json -- [JOB_ARGS]
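[JOB_ARGS] is a single JSON string with the job configuration. A hypothetical example, with every value made up (the keys are exactly the ones the script reads from conf_dict):

'{"src_table": "dbo.orders",
  "candidate_key": "id",
  "dataproc_partition_number": 6,
  "db_int_type": "BIGINT",
  "db_url": "jdbc:sqlserver://10.0.0.5:1433;databaseName=mydb",
  "db_driver": "com.microsoft.sqlserver.jdbc.SQLServerDriver",
  "db_user": "etl_user",
  "db_password": "secret",
  "gcs_lake_bucket": "my-lake-bucket",
  "gcs_lake_temp_prefix": "temp",
  "bq_dataset": "raw"}'

This is the submitted script (spark_full_job.py):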
import sys
import json

from pyspark.sql import SparkSession

# The whole job configuration arrives as a single JSON string argument.
json_args = sys.argv[1]
conf_dict = json.loads(json_args)

spark = SparkSession.builder \
    .appName("Db to data lake - Table: {}".format(conf_dict['src_table'])) \
    .getOrCreate()
# Subquery that runs on both MySQL and SQL Server: it derives a synthetic
# row_number in [1, dataproc_partition_number] for JDBC range partitioning.
table_query = """
(
SELECT
    CAST(COALESCE(ABS({candidate_key}), FLOOR(RAND() * {dataproc_partition_number})) AS {db_int_type}) % {dataproc_partition_number} + 1 row_number,
    a.*,
    CAST(CURRENT_TIMESTAMP AS DATE) AS DATE_LOAD
FROM {src_table} a
) as lake_query
""".format(src_table=conf_dict['src_table'],
           dataproc_partition_number=conf_dict['dataproc_partition_number'],
           db_int_type=conf_dict['db_int_type'],
           candidate_key=conf_dict['candidate_key'])
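# To make that concrete: with the hypothetical values candidate_key=id,
# dataproc_partition_number=6, db_int_type=BIGINT and src_table=dbo.orders,
# table_query renders to:
#
#   (
#   SELECT
#       CAST(COALESCE(ABS(id), FLOOR(RAND() * 6)) AS BIGINT) % 6 + 1 row_number,
#       a.*,
#       CAST(CURRENT_TIMESTAMP AS DATE) AS DATE_LOAD
#   FROM dbo.orders a
#   ) as lake_query
#
# i.e. every row gets a synthetic row_number between 1 and 6 that Spark can
# use as the JDBC partition column.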
# Read the source table in parallel, sliced on the synthetic row_number
# column (see the sketch after this script for how Spark splits the range).
df_source = spark.read.format("jdbc") \
    .option("url", conf_dict['db_url']) \
    .option("driver", conf_dict['db_driver']) \
    .option("user", conf_dict['db_user']) \
    .option("password", conf_dict['db_password']) \
    .option("dbtable", table_query) \
    .option("partitionColumn", "row_number") \
    .option("lowerBound", 1) \
    .option("upperBound", int(conf_dict['dataproc_partition_number']) * 2) \
    .option("numPartitions", int(conf_dict['dataproc_partition_number'])) \
    .load()
destination = "hdfs:///{gcs_lake_bucket}/{gcs_lake_temp_prefix}/{bq_dataset}/{src_table}" \
.format(gcs_lake_bucket=conf_dict['gcs_lake_bucket'],
gcs_lake_temp_prefix=conf_dict['gcs_lake_temp_prefix'],
bq_dataset=conf_dict['bq_dataset'],
src_table=conf_dict['src_table'])
# Write one Parquet directory per DATE_LOAD value, replacing any previous run.
df_source.write \
    .mode("overwrite") \
    .partitionBy("DATE_LOAD") \
    .parquet(destination)
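If it helps, this is my understanding of how Spark 2.x (Dataproc image 1.2) turns partitionColumn/lowerBound/upperBound/numPartitions into per-partition queries. A rough Python sketch of the splitting logic, not Spark's actual code:

def partition_predicates(column, lower, upper, num_partitions):
    # Mirrors the stride-based range split Spark applies to the partition column.
    stride = upper // num_partitions - lower // num_partitions
    predicates, current = [], lower
    for i in range(num_partitions):
        l_bound = "{} >= {}".format(column, current) if i != 0 else None
        current += stride
        u_bound = "{} < {}".format(column, current) if i != num_partitions - 1 else None
        if u_bound is None:
            predicates.append(l_bound)
        elif l_bound is None:
            predicates.append("{} or {} is null".format(u_bound, column))
        else:
            predicates.append("{} AND {}".format(l_bound, u_bound))
    return predicates

# With lowerBound=1, upperBound=12 (2 * 6) and numPartitions=6 this yields:
#   row_number < 3 or row_number is null
#   row_number >= 3 AND row_number < 5
#   row_number >= 5 AND row_number < 7
#   row_number >= 7 AND row_number < 9
#   row_number >= 9 AND row_number < 11
#   row_number >= 11

Since row_number only takes values 1..6 in that case, the upper partitions scan ranges that contain no rows, so a few partitions do most of the reading (which may be why the progress bar in the error output below shows 3 of 6 tasks finishing almost immediately).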
This Python code performs the full-table read. It is triggered shortly after midnight, extracts the data from SQL Server, and loads it into GCS. The next step pulls the data from GCS into BigQuery for transformation and analysis.
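For context, that GCS-to-BigQuery step is a plain load job. A minimal sketch with hypothetical dataset, table and path names (assuming the Parquet output has been copied from HDFS into the lake bucket):

bq load --source_format=PARQUET \
    raw.orders \
    "gs://my-lake-bucket/temp/raw/dbo.orders/DATE_LOAD=2019-01-22/*.parquet"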
I've noticed that the pipeline gets stuck most of the time. This is the error it shows:
[Stage 0:=============================> (3 + 3) / 6]19/01/22 18:47:53 WARN org.apache.spark.scheduler.TaskSetManager: Lost task 0.0 in stage 0.0 (TID 0, my-test-w-0.c.eva-bigdata.internal, executor 1): com.microsoft.sqlserver.jdbc.SQLServerException: Connection reset
at com.microsoft.sqlserver.jdbc.SQLServerConnection.terminate(SQLServerConnection.java:2670)
at com.microsoft.sqlserver.jdbc.TDSChannel.read(IOBuffer.java:1993)
at com.microsoft.sqlserver.jdbc.TDSReader.readPacket(IOBuffer.java:6494)
at com.microsoft.sqlserver.jdbc.TDSReader.nextPacket(IOBuffer.java:6453)
at com.microsoft.sqlserver.jdbc.TDSReader.ensurePayload(IOBuffer.java:6431)
at com.microsoft.sqlserver.jdbc.TDSReader.readBytes(IOBuffer.java:6715)
at com.microsoft.sqlserver.jdbc.TDSReader.readWrappedBytes(IOBuffer.java:6736)
at com.microsoft.sqlserver.jdbc.TDSReader.readInt(IOBuffer.java:6683)
at com.microsoft.sqlserver.jdbc.ServerDTVImpl.getValue(dtv.java:3934)
at com.microsoft.sqlserver.jdbc.DTV.getValue(dtv.java:289)
at com.microsoft.sqlserver.jdbc.Column.getValue(Column.java:198)
at com.microsoft.sqlserver.jdbc.SQLServerResultSet.getValue(SQLServerResultSet.java:1919)
at com.microsoft.sqlserver.jdbc.SQLServerResultSet.getValue(SQLServerResultSet.java:1900)
at com.microsoft.sqlserver.jdbc.SQLServerResultSet.getInt(SQLServerResultSet.java:2127)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$org$apache$spark$sql$execution$datasources$jdbc$JdbcUtils$$makeGetter$6.apply(JdbcUtils.scala:394)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$org$apache$spark$sql$execution$datasources$jdbc$JdbcUtils$$makeGetter$6.apply(JdbcUtils.scala:393)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anon$1.getNext(JdbcUtils.scala:330)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anon$1.getNext(JdbcUtils.scala:312)
at org.apache.spark.util.NextIterator.hasNext(NextIterator.scala:73)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at org.apache.spark.util.CompletionIterator.hasNext(CompletionIterator.scala:32)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:395)
at org.apache.spark.sql.execution.UnsafeExternalRowSorter.sort(UnsafeExternalRowSorter.java:190)
at org.apache.spark.sql.execution.SortExec$$anonfun$1.apply(SortExec.scala:108)
at org.apache.spark.sql.execution.SortExec$$anonfun$1.apply(SortExec.scala:101)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:827)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:827)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:108)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:338)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.net.SocketException: Connection reset
at java.net.SocketInputStream.read(SocketInputStream.java:210)
at java.net.SocketInputStream.read(SocketInputStream.java:141)
at com.microsoft.sqlserver.jdbc.TDSChannel.read(IOBuffer.java:1983)
... 36 more
I can't find any cause for this connection reset. Has anyone run into this problem? Any guesses?
Thanks!