Python 网页抓取:有没有解析 script 标签的简单方法?

时间:2018-11-28 18:12:24

标签: python

我正在做网页抓取,想从下面的 script 标签中提取一些信息,例如 jobTitle、size、industry。有没有简单且可复用的方法来做到这一点?因为我需要抓取很多页面。

我尝试过把它转换成字符串,去掉首尾多余的部分,然后用 eval 求值,但它报语法错误。

<script>
	window.gdGlobals = window.gdGlobals ||
		[{
			'analyticsId':                      "UA-2595786-1",

			
			'analyticsUrl':                     "/jobview/jobs/joblisting/Teradata Corporation",

			'deferredScriptType':               "text/x-deferred-js",
			'accessDeniedRedirectUrl':          '',
			'locale': 							'en-US',
			'env': 'prod',


			'device':       {

								
								'handheld':     false,

								
								'tablet':       	false,
								'deviceTypeId': 	1,
								'platformTypeId': 	3,
								'viewTypeId': 		4
							},
			'page':         {
								
								'domain':       "www.glassdoor.com",
								'domainId':		1,

								
								'domainSuffix': "",

								
								'group':        "JobListing",

								
								'guid':         '000001675b5fa6a78fa0ac61fe36e9b6',

								
								'flex':         true,

								
								'section':      "job-listing",

								
								'type':         "job-listing:job-listing",

								
								'id':           "job-listing:job-listing",

								'med': '',

								'src' : '',
								'content' : '',
								'campaign' : '',
								'term' : '',
								'state' : 'locked',
								'untranslatedUrl' : 'https://www.glassdoor.com/job-listing/service-management-business-analyst-teradata-JV_IC1147311_KO0,35_KE36,44.htm?jl=3032410565'
                                
							},
			'user':         {
								
								'guid':         '2917b9da-9897-4a3b-90c6-da49238a5924',

								
								'ipAddr':       "70.95.16.113",

								
								'locale':       "en_US",

								
								'country':      "US",
								'ipLocationId': "1147311",
								'ipLocationType': "C",
								
								'elligibleForAppBoy': false
							},
			'vendor':       {
								'fbReqPerms':   ""
							},
			
			'search':       {
				"rawKeyword":""
				
			},
			'employer' : {
				
						'size' : "10000--1",
					
						'sector' : "Information Technology",
						'sectorId' : "10013",
					
						'industry' : "Computer Hardware &amp; Software",
						'industryId' : "200060",
					
					'name':"Teradata",
					'id' : "14638"
                    
								,'location': "San Diego"
								,'locationId': "1147311"
								,'locationType': "C"
							
			},
			'job' : {
				
						'jobTitle' : "Service Management Business Analyst",
						'city' : "",
						'state' : "",
						'country' : "",
						'id': "3032410565",
                		'jobSource': "6938",
						'hasPostalAddress': 0,

						'hasOccupationalCategory': 1,
						'hasSalaryCurrency': 1,

                        'hasGeoCoordinates': 1,
					
					'category' : "10014",
					'expired' : 'false'
				
			},
			'test' : {
				
				'planoutIdList': [
					
						, "jobViewDomain.exp_jobViewDomain_catchall"
					
						, "savedJobsDomain.non_user_saved_jobs_catchall"
					
						, "urgency.2018_10_15_badgeDiversity"
					
						, "easyApplyDomain.exp_easyApplyDomain_catchall"
					
						, "jobDetailsDomain.exp_jobDetailsDomain_catchall"
					
						, "serpDomain.exp_serpDomain_catchall"
					
						, "serpDomain.reviseFacetCounts_2018_11_06"
					
						, "jxGlobalDomain.2018_11_20_exp_userReg"
					
						, "myJobsDomain.2018_09_06_myJobsJAFilters"
					
						, "urgency.urgency_catchall"
					
						, "jobs-view.extractedFields"
					
						, "jxGlobalDomain.exp_jxGlobalDomain_catchall"
					
						, "jx_global.2018_06_25_xToSerpUrgencyBadge"
					
						, "jobs-ux-dk-2.redirectToHome"
					
						, "jobAlertDomain.exp_jobAlertDomain_catchAll"
					
				],
				'planoutTreatmentList': [
					
						, "jobViewDomainDefaultTreatment"
					
						, "savedJobsDefaultTreatment"
					
						, "badges_sevenByThree"
					
						, "easyApplyDefaultTreatment"
					
						, "jobDetailsDefaultTreatment"
					
						, "serpDefaultTreatment"
					
						, "reviseFacetCounts_on"
					
						, "userReg_control"
					
						, "myJobsJAFilters_on"
					
						, "urgency_default"
					
						, "false"
					
						, "jxGlobalDefaultTreatment"
					
						, "xToSerpUrgencyBadge_on"
					
						, "savedJobsRedirect-false"
					
						, "jobAlertDefaultTreatment"
					
				]
			},
			'staticList' : {
				
			}
            
		}];

	
	// Accessor for the page-globals record: keep any existing definition,
	// otherwise define a function that returns the first element of gdGlobals.
	window.getGdGlobals = window.getGdGlobals ||
			function() {
				return gdGlobals[0];
			};

	// Reuse the GD namespace and GD.pageInfo when already present, else start empty.
	GD = window.GD || {};
	GD.pageInfo = GD.pageInfo || {};
	// Copy the page group and domain out of the globals record.
	GD.pageInfo.pageGroup = getGdGlobals().page.group;
	GD.domain = getGdGlobals().page.domain;

	</script>

1 个答案:

答案 0 :(得分:1)

假设 script 标签的内容保存在名为 script 的 Python 变量中。

import json
import re

# Locates the object literal assigned as the fallback value of
# ``window.gdGlobals`` (``window.gdGlobals = window.gdGlobals || [{...}];``).
# The greedy ``.*`` runs to the LAST ``}];`` so nested objects are included.
_GD_GLOBALS = re.compile(r"window\.gdGlobals\s*\|\|\s*\[\s*(\{.*\})\s*\]\s*;")

# One pass over the object literal fixes every JavaScript-ism that JSON
# rejects.  The string alternatives come first so that commas, brackets and
# quotes *inside* string values are consumed as part of the string and are
# never rewritten.
_JS_TOKEN = re.compile(
    r'"(?:[^"\\]|\\.)*"'     # double-quoted string: already valid JSON
    r"|'(?:[^'\\]|\\.)*'"    # single-quoted string: must be re-quoted
    r"|\[\s*,"               # list that opens with a stray leading comma
    r"|,(?=\s*[}\]])"        # trailing comma before a closing bracket
)

# Inside a single-quoted string: either an escape sequence or a bare ``"``.
_ESCAPE_OR_DQUOTE = re.compile(r'\\.|"')


def _requote_piece(match):
    """Translate one escape/quote from single-quoted JS to double-quoted JSON."""
    piece = match.group(0)
    if piece == "\\'":
        return "'"       # JSON has no \' escape; the quote needs no escaping
    if piece == '"':
        return '\\"'     # a bare double quote must be escaped in JSON
    return piece         # any other escape sequence is passed through


def _to_json_token(match):
    """Return the JSON-compatible replacement for one ``_JS_TOKEN`` match."""
    token = match.group(0)
    if token[0] == '"':
        return token
    if token[0] == "'":
        return '"' + _ESCAPE_OR_DQUOTE.sub(_requote_piece, token[1:-1]) + '"'
    if token[0] == "[":
        return "["       # drop the leading comma, keep the bracket
    return ""            # drop the trailing comma


def extract_gd_globals(script):
    """Parse the ``window.gdGlobals`` object out of a ``<script>`` tag's text.

    Args:
        script: Text of the script tag (with or without the surrounding
            ``<script>`` markup).

    Returns:
        A dict built from the first (and only) object of the ``gdGlobals``
        array.  Values are returned as written in the page; HTML entities
        such as ``&amp;`` are not decoded.

    Raises:
        ValueError: If the ``window.gdGlobals`` assignment cannot be found,
            or if the literal still is not valid JSON after clean-up
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    # Flatten line breaks and tabs so the pattern's ``.`` can span the whole
    # literal and no raw control characters reach the JSON parser.
    flat = re.sub(r"[\r\n\t]", " ", script)
    found = _GD_GLOBALS.search(flat)
    if found is None:
        raise ValueError("window.gdGlobals assignment not found in script")
    return json.loads(_JS_TOKEN.sub(_to_json_token, found.group(1)))


# Shortened but complete sample of the tag from the question; in real use,
# pass the full text of the <script> tag taken from the downloaded page.
script = '''
<script>
    window.gdGlobals = window.gdGlobals ||
        [{
            'analyticsId':                      "UA-2595786-1",
            'device':       {
                                'handheld':     false,
                                'deviceTypeId': 1
                            },
            'employer' : {
                        'size' : "10000--1",
                        'sector' : "Information Technology",
                        'sectorId' : "10013",
                        'industry' : "Computer Hardware &amp; Software",
                        'industryId' : "200060",
                    'name':"Teradata",
                    'id' : "14638"
                                ,'location': "San Diego"
                                ,'locationId': "1147311"
                                ,'locationType': "C"
            },
            'job' : {
                        'jobTitle' : "Service Management Business Analyst",
                        'expired' : 'false'
            },
            'test' : {
                'planoutIdList': [
                        , "jobViewDomain.exp_jobViewDomain_catchall"
                        , "urgency.urgency_catchall"
                ]
            },
            'staticList' : {
            }
        }];
</script>
'''
myvars = extract_gd_globals(script)
print(myvars['employer'])

输出:

{'size': '10000--1',
 'sector': 'Information Technology',
 'sectorId': '10013',
 'industry': 'Computer Hardware &amp; Software',
 'industryId': '200060',
 'name': 'Teradata',
 'id': '14638',
 'location': 'San Diego',
 'locationId': '1147311',
 'locationType': 'C'}

这种基于正则表达式的做法非常脆弱,页面结构稍有变化就可能失效,因此在实际使用中您可能需要比本例更稳健的写法。