我正在做网页抓取,想从下面的 script 标签中提取一些信息,例如 jobTitle、size(公司规模)、industry(行业)。有没有简单且可复用的方法来做到这一点?因为我需要抓取很多页面。
我尝试把它转换为字符串,去掉首尾多余的部分,然后使用 eval,但它报了语法错误。
<script>
window.gdGlobals = window.gdGlobals ||
[{
'analyticsId': "UA-2595786-1",
'analyticsUrl': "/jobview/jobs/joblisting/Teradata Corporation",
'deferredScriptType': "text/x-deferred-js",
'accessDeniedRedirectUrl': '',
'locale': 'en-US',
'env': 'prod',
'device': {
'handheld': false,
'tablet': false,
'deviceTypeId': 1,
'platformTypeId': 3,
'viewTypeId': 4
},
'page': {
'domain': "www.glassdoor.com",
'domainId': 1,
'domainSuffix': "",
'group': "JobListing",
'guid': '000001675b5fa6a78fa0ac61fe36e9b6',
'flex': true,
'section': "job-listing",
'type': "job-listing:job-listing",
'id': "job-listing:job-listing",
'med': '',
'src' : '',
'content' : '',
'campaign' : '',
'term' : '',
'state' : 'locked',
'untranslatedUrl' : 'https://www.glassdoor.com/job-listing/service-management-business-analyst-teradata-JV_IC1147311_KO0,35_KE36,44.htm?jl=3032410565'
},
'user': {
'guid': '2917b9da-9897-4a3b-90c6-da49238a5924',
'ipAddr': "70.95.16.113",
'locale': "en_US",
'country': "US",
'ipLocationId': "1147311",
'ipLocationType': "C",
'elligibleForAppBoy': false
},
'vendor': {
'fbReqPerms': ""
},
'search': {
"rawKeyword":""
},
'employer' : {
'size' : "10000--1",
'sector' : "Information Technology",
'sectorId' : "10013",
'industry' : "Computer Hardware & Software",
'industryId' : "200060",
'name':"Teradata",
'id' : "14638"
,'location': "San Diego"
,'locationId': "1147311"
,'locationType': "C"
},
'job' : {
'jobTitle' : "Service Management Business Analyst",
'city' : "",
'state' : "",
'country' : "",
'id': "3032410565",
'jobSource': "6938",
'hasPostalAddress': 0,
'hasOccupationalCategory': 1,
'hasSalaryCurrency': 1,
'hasGeoCoordinates': 1,
'category' : "10014",
'expired' : 'false'
},
'test' : {
'planoutIdList': [
, "jobViewDomain.exp_jobViewDomain_catchall"
, "savedJobsDomain.non_user_saved_jobs_catchall"
, "urgency.2018_10_15_badgeDiversity"
, "easyApplyDomain.exp_easyApplyDomain_catchall"
, "jobDetailsDomain.exp_jobDetailsDomain_catchall"
, "serpDomain.exp_serpDomain_catchall"
, "serpDomain.reviseFacetCounts_2018_11_06"
, "jxGlobalDomain.2018_11_20_exp_userReg"
, "myJobsDomain.2018_09_06_myJobsJAFilters"
, "urgency.urgency_catchall"
, "jobs-view.extractedFields"
, "jxGlobalDomain.exp_jxGlobalDomain_catchall"
, "jx_global.2018_06_25_xToSerpUrgencyBadge"
, "jobs-ux-dk-2.redirectToHome"
, "jobAlertDomain.exp_jobAlertDomain_catchAll"
],
'planoutTreatmentList': [
, "jobViewDomainDefaultTreatment"
, "savedJobsDefaultTreatment"
, "badges_sevenByThree"
, "easyApplyDefaultTreatment"
, "jobDetailsDefaultTreatment"
, "serpDefaultTreatment"
, "reviseFacetCounts_on"
, "userReg_control"
, "myJobsJAFilters_on"
, "urgency_default"
, "false"
, "jxGlobalDefaultTreatment"
, "xToSerpUrgencyBadge_on"
, "savedJobsRedirect-false"
, "jobAlertDefaultTreatment"
]
},
'staticList' : {
}
}];
window.getGdGlobals = window.getGdGlobals ||
function() {
return gdGlobals[0];
};
GD = window.GD || {};
GD.pageInfo = GD.pageInfo || {};
GD.pageInfo.pageGroup = getGdGlobals().page.group;
GD.domain = getGdGlobals().page.domain;
</script>
答案 0 :(得分:1)
假设您的脚本内容保存在名为 script 的 Python 变量中。
import json
import re


def extract_gd_globals(script):
    """Parse the ``window.gdGlobals`` literal out of a ``<script>`` tag's text.

    Args:
        script: The text of the script tag, containing an assignment of the
            form ``window.gdGlobals = window.gdGlobals || [{ ... }];``.

    Returns:
        dict: The single object inside the ``gdGlobals`` array.

    Raises:
        ValueError: If the assignment cannot be found, or (as
            ``json.JSONDecodeError``, a ``ValueError`` subclass) if the
            normalized literal is still not valid JSON.
    """
    # Flatten line breaks so the pattern can span the whole literal, and turn
    # the single-quoted JavaScript strings into JSON's double-quoted form.
    # NOTE: every apostrophe is rewritten, so a value that itself contains
    # one (e.g. "O'Reilly") will break the parse.
    flat = script.replace('\n', ' ').replace('\t', ' ').replace("'", '"')

    match = re.search(r"window\.gdGlobals\s*\|\|\s*\[(\{.*\})\];", flat)
    if match is None:
        raise ValueError("window.gdGlobals assignment not found in script")

    # The page writes its arrays with a leading comma ("[ , "a", "b" ]"),
    # which JavaScript tolerates but JSON rejects.
    literal = re.sub(r'\[\s*,', '[', match.group(1))
    return json.loads(literal)


# Trimmed sample of the script tag; in real use, pass the full tag text
# taken from the scraped page.
script = '''
<script>
window.gdGlobals = window.gdGlobals ||
[{
'analyticsId': "UA-2595786-1",
'employer' : {
'size' : "10000--1",
'sector' : "Information Technology",
'sectorId' : "10013",
'industry' : "Computer Hardware & Software",
'industryId' : "200060",
'name':"Teradata",
'id' : "14638"
,'location': "San Diego"
,'locationId': "1147311"
,'locationType': "C"
},
'job' : {
'jobTitle' : "Service Management Business Analyst",
'id': "3032410565"
},
'test' : {
'planoutIdList': [
, "jobViewDomain.exp_jobViewDomain_catchall"
, "urgency.urgency_catchall"
]
}
}];
</script>
'''

myvars = extract_gd_globals(script)
print(myvars['employer'])
输出:
{'size': '10000--1',
'sector': 'Information Technology',
'sectorId': '10013',
'industry': 'Computer Hardware & Software',
'industryId': '200060',
'name': 'Teradata',
'id': '14638',
'location': 'San Diego',
'locationId': '1147311',
'locationType': 'C'}
这种基于正则表达式的做法非常脆弱,因此在实际使用时,您可能需要比本例更健壮的处理方式。