我有一个包含多个 JavaScript 脚本元素的网页。我只想访问其中名为 SOURCE.pdp.propertyJSON 的那个对象,
并以 Pythonic 的方式读取它的属性。
下面先给出(为便于阅读而整理过的)HTML 源代码,随后是我的 Python 代码。
非常感谢任何指点!
<script type="text/javascript">
SOURCE = SOURCE || {};
SOURCE.pdp = SOURCE.pdp || {};
SOURCE.pdp.propertyJSON = {
"neighborhood": "Westwood",
"neighborhoodId": 7187,
"zipCode": "90024",
"city": "Los Angeles",
"county": "Los Angeles",
"countyFIPS": "06037",
"stateCode": "CA",
"stateName": "California",
"type": "CONDO",
"typeDisplay": "Condo",
"numBedrooms": "2",
"numBathrooms": 2,
"numFullBathrooms": 2,
"numBeds": 2,
"indexSource": "Assessor",
"isForeclosure": false,
"isOpaqueBAL": false,
"foreclosureStatus": "",
"isSrpFeatured": false,
"price": null,
"sqft": 1321,
"formattedBedAndBath": "2bd, 2 full ba",
"formattedSqft": "1,321 sqft"
}
pdp_location_data = {
"neighborhood": {
"locationId": "87308",
"name": "Westwood",
"locationType": "neighborhood",
"altId": "7187"
},
"state": {
"locationId": "5",
"name": "California",
"locationType": "state",
"altId": "CA"
},
"county": {
"locationId": "57",
"name": "Los Angeles County",
"locationType": "county",
"altId": "06037"
},
"city": {
"locationId": "22637",
"name": "Los Angeles",
"locationType": "city",
"altId": "4396"
},
"zipCode": {
"locationId": "76090",
"name": "90024",
"locationType": "zipCode",
"altName": "90024",
"altId": "90024"
}
};
SOURCE.pdp.isCountySupportsValuation = true;
SOURCE.pdp.isInHighDemandRegion = false;
var _SPANLONG = pdp_location_data.longitude;
var _SPANLAT = pdp_location_data.latitude;
var _CENLONG = pdp_location_data.longitude;
var _CENLAT = pdp_location_data.latitude;
</script>
注意:下面是一段丑陋的 Python 代码!
from bs4 import BeautifulSoup as bsoup
import requests as rq
# Download the page and parse it so its <script> elements can be inspected.
url = 'https://www.SOURCE.com'
# A timeout keeps the script from hanging forever on an unresponsive server.
source_code = rq.get(url, timeout=10).text
soupcon = bsoup(source_code, "html.parser")
souper = soupcon.find_all('script', {'type': 'text/javascript'})

# Start from None so a missing key is detectable instead of raising NameError
# when the variables are read later.
zipCode = None
numBathrooms = None

for line in souper:
    script_text = str(line)
    # Only the script that assigns SOURCE.pdp.propertyJSON is of interest.
    if 'SOURCE.pdp.propertyJSON' not in script_text:
        continue
    # Crude parse: split the whole script on commas and look for
    # '"key": value' fragments that mention the wanted keys.
    for var in script_text.split(','):
        if ':' not in var:
            continue
        # Remove surrounding whitespace first; otherwise the leading space
        # left by the split prevents the opening quote from being stripped.
        value = var.split(':')[1].strip().strip('"')
        # Keep only the first hit for each key: the same script also holds
        # pdp_location_data, whose nested "zipCode" entry would otherwise
        # overwrite the real value with a fragment of that object.
        if zipCode is None and 'zipCode' in var:
            zipCode = value
        elif numBathrooms is None and 'numBathrooms' in var:
            numBathrooms = value
如你所见,我先查找所有 type 为 text/javascript 的 script 元素,逐个遍历以找到包含目标对象的那段脚本,然后用分隔符 ','
把整段脚本拆开,再通过搜索关键字来识别 JS 对象中的各个属性。这并不是一个理想的解决方案。
答案 0 :(得分:0)
您可以使用json.loads将数据作为dict加载:
from bs4 import BeautifulSoup as bsoup
import re
from json import loads
source = """<script type="text/javascript"> SOURCE = SOURCE || {};
SOURCE.pdp = SOURCE.pdp || {};
SOURCE.pdp.propertyJSON = { "neighborhood": "Westwood", "neighborhoodId": 7187, "zipCode": "90024", "city": "Los Angeles", "county": "Los Angeles", "countyFIPS": "06037", "stateCode": "CA", "stateName": "California", "type": "CONDO", "typeDisplay": "Condo", "numBedrooms": "2", "numBathrooms": 2, "numFullBathrooms": 2, "numBeds": 2, "indexSource": "Assessor", "isForeclosure": false, "isOpaqueBAL": false, "foreclosureStatus": "", "isSrpFeatured": false, "price": null, "sqft": 1321, "formattedBedAndBath": "2bd, 2 full ba", "formattedSqft": "1,321 sqft" } pdp_location_data = { "neighborhood": { "locationId": "87308", "name": "Westwood", "locationType": "neighborhood", "altId": "7187" }, "state": { "locationId": "5", "name": "California", "locationType": "state", "altId": "CA" }, "county": { "locationId": "57", "name": "Los Angeles County", "locationType": "county", "altId": "06037" }, "city": { "locationId": "22637", "name": "Los Angeles", "locationType": "city", "altId": "4396" }, "zipCode": { "locationId": "76090", "name": "90024", "locationType": "zipCode", "altName": "90024", "altId": "90024" } };
SOURCE.pdp.isCountySupportsValuation = true;
SOURCE.pdp.isInHighDemandRegion = false;
var _SPANLONG = pdp_location_data.longitude;
var _SPANLAT = pdp_location_data.latitude;
var _CENLONG = pdp_location_data.longitude;
var _CENLAT = pdp_location_data.latitude; </script>"""
# Parse the page and pull the SOURCE.pdp.propertyJSON object literal out of
# the <script> element that defines it, then decode it as JSON into a dict.
soup = bsoup(source, "html.parser")
# Raw string so the regex escapes reach the re module unchanged.
# DOTALL lets "." cross newlines, because the object literal may span many
# lines on a real page; the lazy ".*?" stops at the first "}" that is
# followed by the pdp_location_data assignment.
json_re = re.compile(
    r"SOURCE\.pdp\.propertyJSON\s*=\s*(\{.*?\})\s*pdp_location_data",
    re.DOTALL,
)
# `string=` is the current Beautiful Soup name for the older `text=` filter;
# it selects the <script> whose text matches the pattern.
script_tag = soup.find(
    "script", string=re.compile(r"SOURCE\.pdp\.propertyJSON")
)
if script_tag is None:
    raise ValueError("no <script> element mentions SOURCE.pdp.propertyJSON")
scr = script_tag.string
found = json_re.search(scr)
if found is None:
    raise ValueError("SOURCE.pdp.propertyJSON object literal not found")
js_raw = found.group(1)
# The captured literal is valid JSON (double-quoted keys, true/false/null),
# so it can be decoded directly.
json_dict = loads(js_raw)
这会得到:
{u'numBeds': 2, u'neighborhood': u'Westwood', u'stateName': u'California', u'numFullBathrooms': 2, u'indexSource': u'Assessor', u'countyFIPS': u'06037', u'city': u'Los Angeles', u'isSrpFeatured': False, u'type': u'CONDO', u'formattedSqft': u'1,321 sqft', u'isOpaqueBAL': False, u'price': None, u'zipCode': u'90024', u'numBedrooms': u'2', u'neighborhoodId': 7187, u'county': u'Los Angeles', u'formattedBedAndBath': u'2bd, 2 full ba', u'sqft': 1321, u'numBathrooms': 2, u'stateCode': u'CA', u'isForeclosure': False, u'typeDisplay': u'Condo', u'foreclosureStatus': u''}
如果你还需要 pdp_location_data 的 JSON,只需套用完全相同的逻辑即可。