I'm attempting to read in some JSON, infer a schema, and then write it back out to S3 (s3a). For some reason, about a third of the way through the write portion of the run, Spark always errors out with the error included below. I can't find any obvious cause for the problem: it isn't out of memory; there are no long GC pauses. There don't seem to be any other error messages in the logs of the individual executors.
The script runs fine on another set of data that I have, which is very similar in structure but several orders of magnitude smaller.
I'm running Spark 2.0.1 built against Hadoop 2.7 and am using the FileOutputCommitter. The algorithm version doesn't seem to matter.
Edit: This does not appear to be a problem with malformed JSON or corrupted files. I have unzipped and read in each file individually with no errors.
Here is a simplified version of my setup:
spark.executor.extraJavaOptions -XX:+UseG1GC -XX:MaxPermSize=1G -XX:+HeapDumpOnOutOfMemoryError
spark.executor.memory 16G
spark.executor.uri https://s3.amazonaws.com/foo/spark-2.0.1-bin-hadoop2.7.tgz
spark.hadoop.fs.s3a.impl org.apache.hadoop.fs.s3a.S3AFileSystem
spark.hadoop.fs.s3a.buffer.dir /raid0/spark
spark.hadoop.fs.s3n.buffer.dir /raid0/spark
spark.hadoop.fs.s3a.connection.timeout 500000
spark.hadoop.fs.s3n.multipart.uploads.enabled true
spark.hadoop.parquet.block.size 2147483648
spark.hadoop.parquet.enable.summary-metadata false
spark.jars.packages com.databricks:spark-avro_2.11:3.0.1
spark.local.dir /raid0/spark
spark.mesos.coarse false
spark.mesos.constraints priority:1
spark.network.timeout 600
spark.rpc.message.maxSize 500
spark.speculation false
spark.sql.parquet.mergeSchema false
spark.sql.planner.externalSort true
spark.submit.deployMode client
spark.task.cpus 1
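A few of these values can equivalently be set programmatically when the session is built. A minimal sketch using the standard SparkSession builder (illustrative only, not the actual launch invocation):

from pyspark.sql import SparkSession

# Sketch: the same settings as the conf file above, applied at session build time
spark = (SparkSession.builder
         .appName("json-to-parquet")
         .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
         .config("spark.hadoop.fs.s3a.buffer.dir", "/raid0/spark")
         .config("spark.sql.parquet.mergeSchema", "false")
         .getOrCreate())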
It makes it through the early schema-inference step successfully. The error itself occurs on the last line of the script, but I'd imagine that could implicate at least the preceding statement, if not something earlier.
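In outline, the job looks something like the following sketch (bucket names and paths here are placeholders, and the Parquet write is an assumption based on the conf above, not the exact code):

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("json-to-parquet").getOrCreate()

# Read the gzipped JSON and let Spark infer the schema
df = spark.read.json("s3a://source-bucket/input/*.json.gz")

# The error consistently surfaces about a third of the way through this write
df.write.parquet("s3a://dest-bucket/output/")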
Here is the error:
{{1}}
Answer 0 (score: 2)
I can think of three possible causes for this problem.
Answer 1 (score: 1)
A SAXParseException usually indicates a malformed XML file. Since the job fails consistently about a third of the way through, it is probably failing in the same place every time: on a file whose partition sits about a third of the way through the partition list.

Can you paste your script? It might be possible to wrap the Spark step in a try/catch loop that prints out the file when this error occurs, which would make it easy to zero in on the problem.
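A rough sketch of that idea, assuming the input can be enumerated as a list of per-file paths (file_paths here is hypothetical, not part of the original job):

# Sketch: read each file individually inside a try/except so the offending
# file gets printed instead of killing the whole job
for path in file_paths:
    try:
        spark.read.json(path).count()  # count() forces a full read of the file
    except Exception as e:
        print("Error while reading %s: %s" % (path, e))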
Answer 2 (score: 1)
From the logs:

Caused by: org.xml.sax.SAXParseException; lineNumber: 1; columnNumber: 2; XML document structures must start and end within the same entity.

and

Caused by: com.amazonaws.AmazonClientException: Failed to parse XML document with handler class com.amazonaws.services.s3.model.transform.XmlResponsesSaxParser$ListBucketHandler
it looks like you have corrupted / malformed files, and the error is actually occurring during the read portion of the task. You could confirm this by trying another operation that forces the read, such as count().
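For example, something along these lines (the path is hypothetical); if the same SAXParseException shows up here, the read is at fault:

# Force the full read without writing anything back to S3
df = spark.read.json("s3a://source-bucket/input/*.json.gz")
df.count()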
If this is confirmed, then the goal is to find the corrupted files. You can do this by listing the s3 files, creating an RDD from that list with sc.parallelize(), and then attempting to read each file in a custom function with map():
import gzip
import io
import json

import boto3
from pyspark.sql import Row

def scanKeys(startKey, endKey):
    # Yield every key in the bucket between startKey and endKey
    bucket = boto3.resource('s3').Bucket('bucketName')
    for obj in bucket.objects.filter(Prefix='prefix', Marker=startKey):
        if obj.key < endKey:
            yield obj.key
        else:
            return

def testFile(s3Path):
    s3obj = boto3.resource('s3').Object(bucket_name='bucketName', key=s3Path)
    body = s3obj.get()['Body']
    # Logic to test the file format: try to parse every line as JSON
    # (Spark's JSON source expects one object per line), decompressing
    # first if the object is gzipped
    try:
        raw = body.read()
        if s3Path.endswith('.gz'):
            raw = gzip.GzipFile(fileobj=io.BytesIO(raw)).read()
        for line in raw.splitlines():
            if line.strip():
                json.loads(line)
        return Row(status='Good', key=s3Path)
    except Exception:
        return Row(status='Fail', key=s3Path)

keys = list(scanKeys(startKey, endKey))  # startKey / endKey: the key range to scan
keyListRdd = sc.parallelize(keys, 1000)  # 1000 partitions spread the checks across executors
keyListRdd.map(testFile).filter(lambda x: x.status == 'Fail').collect()
This will return the s3 paths of the files that are formatted incorrectly.