我需要解析script标记内的一些数据。第一个挑战是页面上有几个没有id或class的脚本标签。我需要的是这样的:
<script>
window.runParams = {
data: {
"priceModule":{
"maxActivityAmount":{
"currency":"USD",
"formatedAmount":"US $28.71",
"value":28.71 ***VALUE TO IGNORE***
},
"maxAmount":{
"currency":"USD",
"formatedAmount":"US $52.20",
"value":52.2 ***VALUE TO IGNORE***
},
"minActivityAmount":{
"currency":"USD",
"formatedAmount":"US $6.83",
"value":6.83 ***THIS IS THE VALUE I NEED***
},
"minAmount":{
"currency":"USD",
"formatedAmount":"US $12.42",
"value":12.42 ***THIS IS THE VALUE I NEED***
},
},
"freightItemModule":{
"commitDay":"60",
"company":"Standard Shipping",
"currency":"USD",
"discount":100,
"displayType":"deliveryTime",
"features":{
},
"freightAmount":{
"currency":"USD",
"formatedAmount":"US $0.00",
"value":0.0 ***VALUE TO IGNORE***
},
"fullMailLine":false,
"hbaService":false,
"i18nMap":{
},
"id":0,
"name":"FreightItemModule",
"notification":"",
"sendGoodsCountry":"CN",
"sendGoodsCountryFullName":"China",
"serviceName":"CAINIAO_STANDARD",
"standardFreightAmount":{
"currency":"USD",
"formatedAmount":"US $13.12",
"value":13.12 ***VALUE TO IGNORE***
},
"time":"17-25",
"tracking":true
},
"skuModule":{
"categoryId":200001392,
"features":{
},"
forcePromiseWarrantyJson":"{
}",
"hasSizeInfo":false,
"hasSkuProperty":true,
"id":0,
"name":"SKUModule",
"productSKUPropertyList":[{
"isShowTypeColor":false,
"order":1,
"showType":"none",
"showTypeColor":false,
"skuPropertyId":14,
"skuPropertyName":"????",
"skuActivityAmount":{
"currency":"USD",
"formatedAmount":"US $12.38",
"value":12.38 ***VALUE TO IGNORE***
},
"skuAmount":{
"currency":"USD",
"formatedAmount":"US $22.51",
"value":22.51 ***VALUE TO IGNORE***
},
"skuCalPrice":"22.51",
"skuMultiCurrencyCalPrice":"22.51",
"skuMultiCurrencyDisplayPrice":"22.51"
}
},
},
};
var GaData = {
pageType: "product",
productIds: "32955439786",
totalValue: "US $6.83"
};
var PAGE_TIMING = {
pageType: 'gloDetail'
};
</script>
我需要解析 [data] -> [priceModule] 下 [minActivityAmount] 和 [minAmount] 中的 [value],并将它们保存在两个单独的变量中:amount = 12.42 和 activity_amount = 6.83。如您所见,多个“模块”中都有多个 "value",因此直接用正则表达式解析它们似乎并不理想。也许有更好的方法可以从这个脚本中提取这些值?先谢谢了。
答案 0 :(得分:2)
很遗憾,BeautifulSoup 本身并不提供解析 JS 内容的工具。解决此问题的一种方法是先用它定位 script 标签,再用正则表达式提取数据:
import re
from bs4 import BeautifulSoup
data = """
<script>
window.runParams = {
data: {
"priceModule":{
"maxActivityAmount":{
"currency":"USD",
"formatedAmount":"US $28.71",
"value":28.71 ***VALUE TO IGNORE***
},
"maxAmount":{
"currency":"USD",
"formatedAmount":"US $52.20",
"value":52.2 ***VALUE TO IGNORE***
},
"minActivityAmount":{
"currency":"USD",
"formatedAmount":"US $6.83",
"value":6.83 ***THIS IS THE VALUE I NEED***
},
"minAmount":{
"currency":"USD",
"formatedAmount":"US $12.42",
"value":12.42 ***THIS IS THE VALUE I NEED***
},
},
"freightItemModule":{
"commitDay":"60",
"company":"Standard Shipping",
"currency":"USD",
"discount":100,
"displayType":"deliveryTime",
"features":{
},
"freightAmount":{
"currency":"USD",
"formatedAmount":"US $0.00",
"value":0.0 ***VALUE TO IGNORE***
},
"fullMailLine":false,
"hbaService":false,
"i18nMap":{
},
"id":0,
"name":"FreightItemModule",
"notification":"",
"sendGoodsCountry":"CN",
"sendGoodsCountryFullName":"China",
"serviceName":"CAINIAO_STANDARD",
"standardFreightAmount":{
"currency":"USD",
"formatedAmount":"US $13.12",
"value":13.12 ***VALUE TO IGNORE***
},
"time":"17-25",
"tracking":true
},
"skuModule":{
"categoryId":200001392,
"features":{
},"
forcePromiseWarrantyJson":"{
}",
"hasSizeInfo":false,
"hasSkuProperty":true,
"id":0,
"name":"SKUModule",
"productSKUPropertyList":[{
"isShowTypeColor":false,
"order":1,
"showType":"none",
"showTypeColor":false,
"skuPropertyId":14,
"skuPropertyName":"????",
"skuActivityAmount":{
"currency":"USD",
"formatedAmount":"US $12.38",
"value":12.38 ***VALUE TO IGNORE***
},
"skuAmount":{
"currency":"USD",
"formatedAmount":"US $22.51",
"value":22.51 ***VALUE TO IGNORE***
},
"skuCalPrice":"22.51",
"skuMultiCurrencyCalPrice":"22.51",
"skuMultiCurrencyDisplayPrice":"22.51"
}
},
},
};
var GaData = {
pageType: "product",
productIds: "32955439786",
totalValue: "US $6.83"
};
var PAGE_TIMING = {
pageType: 'gloDetail'
};
</script>
"""
# Parse the page and pick the <script> that actually defines window.runParams.
# The page may contain several anonymous <script> tags, so taking the first one
# blindly is not reliable.
soup = BeautifulSoup(data, features='html.parser')
script = next(
    (tag for tag in soup.find_all('script') if 'window.runParams' in tag.text),
    None,
)
if script is None:
    raise ValueError('no <script> containing window.runParams was found')

# For each wanted key, match `"<key>":{ ... "value":<number>` inside that key's
# own (un-nested) object and capture only the number.  The values are kept as
# strings; wrap them in float() if numeric values are needed.
values = []
keys = ['minActivityAmount', 'minAmount']
for key in keys:
    # re.escape keeps the key literal; [^{}]*? stays inside the key's object;
    # the number pattern deliberately excludes a trailing comma.
    pattern = r'"%s"\s*:\s*\{[^{}]*?"value"\s*:\s*(-?\d+(?:\.\d+)?)' % re.escape(key)
    match = re.search(pattern, script.text)
    if match is None:
        raise ValueError('no "value" found for key %r' % key)
    values.append(match.group(1))
print(values)
输出:
['6.83', '12.42']
答案 1 :(得分:1)
有一种方法可以不使用正则表达式(就此而言,也可以不使用BeautifulSoup);它有些令人费解,但应该可以。基本上,它将脚本分成较小的块,直到每个目标数据都隔离在自己的块中,然后从该块中提取目标。
data = [your script above]
# Collapse every run of whitespace to a single space, then cut the script at
# each '}, "' boundary so that every `"key":{...}` object lands in its own chunk.
items = ' '.join(data.split()).split('}, "')
for item in items:
    if "minAmount" in item or "minActivityAmount" in item:
        parts = item.split('"value":')
        if len(parts) < 2:
            # The chunk mentions the key but carries no "value" entry; skip it
            # instead of failing with an IndexError.
            continue
        # Drop any closing '},' left over from the split and trim the spaces
        # it leaves behind.
        print(parts[1].replace('},', '').strip())
输出:
6.83 ***THIS IS THE VALUE I NEED***
12.42 ***THIS IS THE VALUE I NEED***
答案 2 :(得分:1)
另一个正则表达式
import re

# Sample page fragment.  The "*** ... ***" markers are annotations from the
# question and are not part of the real page.
html = '''
<script>
window.runParams = {
data: {
"priceModule":{
"maxActivityAmount":{
"currency":"USD",
"formatedAmount":"US $28.71",
"value":28.71 ***VALUE TO IGNORE***
},
"maxAmount":{
"currency":"USD",
"formatedAmount":"US $52.20",
"value":52.2 ***VALUE TO IGNORE***
},
"minActivityAmount":{
"currency":"USD",
"formatedAmount":"US $6.83",
"value":6.83 ***THIS IS THE VALUE I NEED***
},
"minAmount":{
"currency":"USD",
"formatedAmount":"US $12.42",
"value":12.42 ***THIS IS THE VALUE I NEED***
},
},
"freightItemModule":{
"commitDay":"60",
"company":"Standard Shipping",
"currency":"USD",
"discount":100,
"displayType":"deliveryTime",
"features":{
},
"freightAmount":{
"currency":"USD",
"formatedAmount":"US $0.00",
"value":0.0 ***VALUE TO IGNORE***
},
"fullMailLine":false,
"hbaService":false,
"i18nMap":{
},
"id":0,
"name":"FreightItemModule",
"notification":"",
"sendGoodsCountry":"CN",
"sendGoodsCountryFullName":"China",
"serviceName":"CAINIAO_STANDARD",
"standardFreightAmount":{
"currency":"USD",
"formatedAmount":"US $13.12",
"value":13.12 ***VALUE TO IGNORE***
},
"time":"17-25",
"tracking":true
},
"skuModule":{
"categoryId":200001392,
"features":{
},"
forcePromiseWarrantyJson":"{
}",
"hasSizeInfo":false,
"hasSkuProperty":true,
"id":0,
"name":"SKUModule",
"productSKUPropertyList":[{
"isShowTypeColor":false,
"order":1,
"showType":"none",
"showTypeColor":false,
"skuPropertyId":14,
"skuPropertyName":"????",
"skuActivityAmount":{
"currency":"USD",
"formatedAmount":"US $12.38",
"value":12.38 ***VALUE TO IGNORE***
},
"skuAmount":{
"currency":"USD",
"formatedAmount":"US $22.51",
"value":22.51 ***VALUE TO IGNORE***
},
"skuCalPrice":"22.51",
"skuMultiCurrencyCalPrice":"22.51",
"skuMultiCurrencyDisplayPrice":"22.51"
}
},
},
};
var GaData = {
pageType: "product",
productIds: "32955439786",
totalValue: "US $6.83"
};
var PAGE_TIMING = {
pageType: 'gloDetail'
};
</script>
'''

# Step 1: isolate the text that runs from "minActivityAmount" up to the comma
# right before "freightItemModule".  This relies on "minActivityAmount" and
# "minAmount" being the last two entries of "priceModule" and on
# "freightItemModule" following it, as in the sample above.
price_block_re = re.compile(
    r'"minActivityAmount":(.*?),\s+"freightItemModule"', re.DOTALL
)
# Step 2: capture only the number after each "value": key.  Integers (even a
# single digit) and decimals are both accepted.
value_re = re.compile(r'"value":(\d+(?:\.\d+)?)')

section = price_block_re.search(html)
if section is None:
    raise ValueError('could not locate the minActivityAmount/minAmount section')

# In document order: [minActivityAmount value, minAmount value], as strings.
results = value_re.findall(section.group(1))
print(results)