我有一个使用Python的爬虫,我的函数使用dataLayer获取脚本并将字符串转换为格式并转换为json。
这是dataLayer:
dataLayer = [{
"site": {
"isMobile": false,
"source": (function() {
var userAgent = navigator.userAgent.toLocaleLowerCase();
var source = "web";
resultMatch = userAgent.match(/\[olx-source\/(\w+);/);
if (resultMatch) {
[, source] = resultMatch;
}
return source;
})()
},
"page": {
"pageType": "ad_detail",
"detail": {
"parent_category_id": "2000",
"category_id": "2020",
"state_id": "2",
"region_id": "31",
"ad_id": "354269527",
"list_id": "295567499",
"city_id": "9190",
"zipcode":"32146045",
},
"adDetail": {
"adID": "354269527",
"listID": "295567499",
"sellerName": "Glauber Marlon",
"adDate": "2017-01-23 18:35:26",
},
},
"session": {
"user": {
"userID": "",
"loginType": ""
}
},
"pageType": "Ad_detail",
"abtestingEnable" : "1",
// Listing information
"listingCategory": "2020",
// Ad information
"adId": "354269527",
"state": "2",
"region": "31",
"category": "2020",
"pictures": "14",
"listId": "295567499",
//Account Information
"loggedUser":"0",
"referrer": "",
//User Information
}];
这是我在Json格式化和转换的函数:
s = page_ad.findAll('script')[25].text.replace('\'', '"')
// if print s this line and put in JsonLint show error in function.
s = re.search(r'\{.+\}', s, re.DOTALL).group() # get json data
s = re.sub(r'//.+\n', '', s) # replace comment
s = re.sub(r'\s+', '', s) # strip whitspace
s = re.sub(r',}', '}', s) # get rid of last , in the dict
dataLayer = json.loads(s)
这是转换前的json:
{
"site":{
"isMobile":false,
"source":(function() {
varuserAgent=navigator.userAgent.toLocaleLowerCase();varsource="web";resultMatch=userAgent.match(/\ [
olx-source\/(\w+);/);if(resultMatch) {
[
,
source
] =resultMatch;
} returnsource;
} )()
},
"page":{
"pageType":"ad_detail",
"detail":{
"parent_category_id":"2000",
"category_id":"2020",
"state_id":"2",
"region_id":"31",
"ad_id":"354269527",
"list_id":"295567499",
"city_id":"9190",
"zipcode":"32146045"
},
"adDetail":{
"adID":"354269527",
"listID":"295567499",
"sellerName":"GlauberMarlon",
"adDate":"2017-01-2318:35:26"
}
},
"session":{
"user":{
"userID":"",
"loginType":""
}
},
"pageType":"Ad_detail",
"abtestingEnable":"1",
"listingCategory":"2020",
"adId":"354269527",
"state":"2",
"region":"31",
"category":"2020",
"pictures":"14",
"listId":"295567499",
"loggedUser":"0",
"referrer":""
}
我想删除索引“site”错误在哪里。
Traceback (most recent call last):
File "crawler_olx_0.1.py", line 182, in <module>
run(link_base)
File "crawler_olx_0.1.py", line 52, in run
vehicleInformation = getVehicleInformation(page_ad)
File "crawler_olx_0.1.py", line 81, in getVehicleInformation
dataLayer = json.loads(s)
File "/usr/lib/python2.7/json/__init__.py", line 339, in loads
return _default_decoder.decode(s)
File "/usr/lib/python2.7/json/decoder.py", line 364, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
File "/usr/lib/python2.7/json/decoder.py", line 382, in raw_decode
raise ValueError("No JSON object could be decoded")
ValueError: No JSON object could be decoded
答案 0 :(得分:0)
在抓取时执行javascript会更好(Selenium,...) 以下可能是愚蠢的方法(对于2.7,我没有安装py3 + pyv8)。
var PrintThis = function (val) {
printWindow.focus();
printWindow.print();
}
printWindow.addEventListener("message", PrintThis);
printWindow.postMessage("Print", "*");
//in this case iam able to get the data in printhis()
//but its also not working
返回:
import PyV8
jsCtx = PyV8.JSContext()
jsCtx.enter()
jsCtx.eval('var navigator = { "userAgent": "Mozilla/5.0 (Macintosh; U; Intel Mac OS X; en-US; rv:1.8.1.3) Gecko/20070309 Firefox/2.0.0.3" }')
JsFilledJSON = """dataLayer = [
{
"site": {
"isMobile": false,
"source": (function() {
var userAgent = navigator.userAgent.toLocaleLowerCase();
var source = "web";
resultMatch = userAgent.match(/\[olx-source\/(\w+);/);
if (resultMatch) {
[, source] = resultMatch;
}
return source;
})()
},
"page": {
"pageType": "ad_detail",
"detail": {
"parent_category_id": "2000",
"category_id": "2020",
"state_id": "2",
"region_id": "31",
"ad_id": "354269527",
"list_id": "295567499",
"city_id": "9190",
"zipcode":"32146045",
}
}
}
]
"""
x = jsCtx.eval("JSON.stringify(%s)" % JsFilledJSON.decode('utf-8'))
print json.dumps(json.loads(x), indent=4, sort_keys=True)
但是您可以使用PyV8或任何其他python执行JS-JSON到avascript包装器。二进制文件位于github。