如何用 BeautifulSoup 解析 script 标签?

时间:2019-05-22 16:42:28

标签: django python-3.x beautifulsoup

我需要解析script标记内的一些数据。第一个挑战是页面上有几个没有id或class的脚本标签。我需要的是这样的:

<script>
    window.runParams = {
        data: {
            "priceModule":{
                "maxActivityAmount":{
                    "currency":"USD",
                    "formatedAmount":"US $28.71",
                    "value":28.71 ***VALUE TO IGNORE***
                },
                "maxAmount":{
                    "currency":"USD",
                    "formatedAmount":"US $52.20",
                    "value":52.2 ***VALUE TO IGNORE***
                },
                "minActivityAmount":{
                    "currency":"USD",
                    "formatedAmount":"US $6.83",
                    "value":6.83 ***THIS IS THE VALUE I NEED***
                },
                "minAmount":{
                    "currency":"USD",
                    "formatedAmount":"US $12.42",
                    "value":12.42 ***THIS IS THE VALUE I NEED***
                },
            },
            "freightItemModule":{
                "commitDay":"60",
                "company":"Standard Shipping",
                "currency":"USD",
                "discount":100,
                "displayType":"deliveryTime",
                "features":{
                },
                "freightAmount":{
                    "currency":"USD",
                    "formatedAmount":"US $0.00",
                    "value":0.0 ***VALUE TO IGNORE***
                },
                "fullMailLine":false,
                "hbaService":false,
                "i18nMap":{
                },
                "id":0,
                "name":"FreightItemModule",
                "notification":"",
                "sendGoodsCountry":"CN",
                "sendGoodsCountryFullName":"China",
                "serviceName":"CAINIAO_STANDARD",
                "standardFreightAmount":{
                    "currency":"USD",
                    "formatedAmount":"US $13.12",
                    "value":13.12 ***VALUE TO IGNORE***
                },
                "time":"17-25",
                "tracking":true
            },
            "skuModule":{
                "categoryId":200001392,
                "features":{
                },"
                forcePromiseWarrantyJson":"{
                }",
                "hasSizeInfo":false,
                "hasSkuProperty":true,
                "id":0,
                "name":"SKUModule",
                "productSKUPropertyList":[{
                    "isShowTypeColor":false,
                    "order":1,
                    "showType":"none",
                    "showTypeColor":false,
                    "skuPropertyId":14,
                    "skuPropertyName":"????",
                    "skuActivityAmount":{
                        "currency":"USD",
                        "formatedAmount":"US $12.38",
                        "value":12.38 ***VALUE TO IGNORE***
                    },
                    "skuAmount":{
                        "currency":"USD",
                        "formatedAmount":"US $22.51",
                        "value":22.51 ***VALUE TO IGNORE***
                    },
                    "skuCalPrice":"22.51",
                    "skuMultiCurrencyCalPrice":"22.51",
                    "skuMultiCurrencyDisplayPrice":"22.51"
                }
            },
        },
    };

    var GaData = {
        pageType: "product",
        productIds: "32955439786",
        totalValue: "US $6.83"
    };

    var PAGE_TIMING = {
        pageType: 'gloDetail'
    };
</script>

我需要解析 [data] -> [priceModule] 下 [minActivityAmount] 和 [minAmount] 中的 [value],并将它们保存在两个单独的变量中:amount = 12.42 和 activity_amount = 6.83。如您所见,在多个“模块”中有多个“value”。因此,使用正则表达式解析它们似乎并不理想。也许有更好的方法来从此脚本中提取这些值?预先谢谢你。

3 个答案:

答案 0 :(得分:2)

很遗憾,BeautifulSoup没有提供提取JS内容的工具。

解决此问题的一种方法是使用正则表达式

import re

from bs4 import BeautifulSoup

data = """
<script>
    window.runParams = {
        data: {
            "priceModule":{
                "maxActivityAmount":{
                    "currency":"USD",
                    "formatedAmount":"US $28.71",
                    "value":28.71 ***VALUE TO IGNORE***
                },
                "maxAmount":{
                    "currency":"USD",
                    "formatedAmount":"US $52.20",
                    "value":52.2 ***VALUE TO IGNORE***
                },
                "minActivityAmount":{
                    "currency":"USD",
                    "formatedAmount":"US $6.83",
                    "value":6.83 ***THIS IS THE VALUE I NEED***
                },
                "minAmount":{
                    "currency":"USD",
                    "formatedAmount":"US $12.42",
                    "value":12.42 ***THIS IS THE VALUE I NEED***
                },
            },
            "freightItemModule":{
                "commitDay":"60",
                "company":"Standard Shipping",
                "currency":"USD",
                "discount":100,
                "displayType":"deliveryTime",
                "features":{
                },
                "freightAmount":{
                    "currency":"USD",
                    "formatedAmount":"US $0.00",
                    "value":0.0 ***VALUE TO IGNORE***
                },
                "fullMailLine":false,
                "hbaService":false,
                "i18nMap":{
                },
                "id":0,
                "name":"FreightItemModule",
                "notification":"",
                "sendGoodsCountry":"CN",
                "sendGoodsCountryFullName":"China",
                "serviceName":"CAINIAO_STANDARD",
                "standardFreightAmount":{
                    "currency":"USD",
                    "formatedAmount":"US $13.12",
                    "value":13.12 ***VALUE TO IGNORE***
                },
                "time":"17-25",
                "tracking":true
            },
            "skuModule":{
                "categoryId":200001392,
                "features":{
                },"
                forcePromiseWarrantyJson":"{
                }",
                "hasSizeInfo":false,
                "hasSkuProperty":true,
                "id":0,
                "name":"SKUModule",
                "productSKUPropertyList":[{
                    "isShowTypeColor":false,
                    "order":1,
                    "showType":"none",
                    "showTypeColor":false,
                    "skuPropertyId":14,
                    "skuPropertyName":"????",
                    "skuActivityAmount":{
                        "currency":"USD",
                        "formatedAmount":"US $12.38",
                        "value":12.38 ***VALUE TO IGNORE***
                    },
                    "skuAmount":{
                        "currency":"USD",
                        "formatedAmount":"US $22.51",
                        "value":22.51 ***VALUE TO IGNORE***
                    },
                    "skuCalPrice":"22.51",
                    "skuMultiCurrencyCalPrice":"22.51",
                    "skuMultiCurrencyDisplayPrice":"22.51"
                }
            },
        },
    };

    var GaData = {
        pageType: "product",
        productIds: "32955439786",
        totalValue: "US $6.83"
    };

    var PAGE_TIMING = {
        pageType: 'gloDetail'
    };
</script>
"""

# Parse the document and take the first <script> tag; the sample contains
# exactly one, so no id/class selector is needed to locate it.
soup = BeautifulSoup(data, features='html.parser')
script = soup.find('script')
if script is None:
    raise ValueError('no <script> tag found in the document')
script_text = script.text

# Matches the number following `"value":`. Whitespace after the colon is
# tolerated and only digits (with an optional fraction) are captured, so a
# trailing comma can never end up in the result.
value_pattern = re.compile(r'"value":\s*([0-9]+(?:\.[0-9]+)?)')

values = []
keys = ['minActivityAmount', 'minAmount']
for key in keys:
    # First isolate the body of `"<key>":{ ... }` (up to its first closing
    # brace) so that the "value" entries of the other objects are ignored.
    section = re.search(r'"%s":\s*\{([^}]*)\}' % re.escape(key), script_text)
    if section is None:
        raise ValueError('key %r not found in the script' % key)
    value = value_pattern.search(section.group(1))
    if value is None:
        raise ValueError('no numeric "value" found under %r' % key)
    # Values are kept as strings, in the same order as `keys`.
    values.append(value.group(1))

print(values)

输出:

['6.83', '12.42']

答案 1 :(得分:1)

有一种方法可以不使用正则表达式(就此而言,也可以不使用BeautifulSoup);它有些令人费解,但应该可以。基本上,它将脚本分成较小的块,直到每个目标数据都隔离在自己的块中,然后从该块中提取目标。

data = [your script above]

# Collapse every run of whitespace to a single space, then cut the script at
# each `}, "` boundary so the targeted price objects end up in separate chunks.
items = ' '.join(data.split()).split('}, "')

for item in items:
    if "minAmount" in item or "minActivityAmount" in item:
        # Keep what follows `"value":` and drop any leftover `},` closers.
        print(item.split('"value":')[1].replace('},', ''))

输出:

 6.83 ***THIS IS THE VALUE I NEED*** 
12.42 ***THIS IS THE VALUE I NEED***  

答案 2 :(得分:1)

另一个正则表达式

import re

html = '''
<script>
    window.runParams = {
        data: {
            "priceModule":{
                "maxActivityAmount":{
                    "currency":"USD",
                    "formatedAmount":"US $28.71",
                    "value":28.71 ***VALUE TO IGNORE***
                },
                "maxAmount":{
                    "currency":"USD",
                    "formatedAmount":"US $52.20",
                    "value":52.2 ***VALUE TO IGNORE***
                },
                "minActivityAmount":{
                    "currency":"USD",
                    "formatedAmount":"US $6.83",
                    "value":6.83 ***THIS IS THE VALUE I NEED***
                },
                "minAmount":{
                    "currency":"USD",
                    "formatedAmount":"US $12.42",
                    "value":12.42 ***THIS IS THE VALUE I NEED***
                },
            },
            "freightItemModule":{
                "commitDay":"60",
                "company":"Standard Shipping",
                "currency":"USD",
                "discount":100,
                "displayType":"deliveryTime",
                "features":{
                },
                "freightAmount":{
                    "currency":"USD",
                    "formatedAmount":"US $0.00",
                    "value":0.0 ***VALUE TO IGNORE***
                },
                "fullMailLine":false,
                "hbaService":false,
                "i18nMap":{
                },
                "id":0,
                "name":"FreightItemModule",
                "notification":"",
                "sendGoodsCountry":"CN",
                "sendGoodsCountryFullName":"China",
                "serviceName":"CAINIAO_STANDARD",
                "standardFreightAmount":{
                    "currency":"USD",
                    "formatedAmount":"US $13.12",
                    "value":13.12 ***VALUE TO IGNORE***
                },
                "time":"17-25",
                "tracking":true
            },
            "skuModule":{
                "categoryId":200001392,
                "features":{
                },"
                forcePromiseWarrantyJson":"{
                }",
                "hasSizeInfo":false,
                "hasSkuProperty":true,
                "id":0,
                "name":"SKUModule",
                "productSKUPropertyList":[{
                    "isShowTypeColor":false,
                    "order":1,
                    "showType":"none",
                    "showTypeColor":false,
                    "skuPropertyId":14,
                    "skuPropertyName":"????",
                    "skuActivityAmount":{
                        "currency":"USD",
                        "formatedAmount":"US $12.38",
                        "value":12.38 ***VALUE TO IGNORE***
                    },
                    "skuAmount":{
                        "currency":"USD",
                        "formatedAmount":"US $22.51",
                        "value":22.51 ***VALUE TO IGNORE***
                    },
                    "skuCalPrice":"22.51",
                    "skuMultiCurrencyCalPrice":"22.51",
                    "skuMultiCurrencyDisplayPrice":"22.51"
                }
            },
        },
    };

    var GaData = {
        pageType: "product",
        productIds: "32955439786",
        totalValue: "US $6.83"
    };

    var PAGE_TIMING = {
        pageType: 'gloDetail'
    };
</script>
'''
# Capture everything between `"minActivityAmount":` and the following
# `"freightItemModule"` key; in the sample this span holds the
# minActivityAmount and minAmount objects. DOTALL lets `.` cross newlines.
p1 = re.compile(r'"minActivityAmount":(.*?),\s+"freightItemModule"', re.DOTALL)
section = p1.search(html)
if section is None:
    raise ValueError('"minActivityAmount" section not found in the script')
text = section.group(1)

# The fractional part is optional as a whole, so single-digit and integer
# values (e.g. `"value":5`) match as well as decimals such as 6.83.
p2 = re.compile(r'value":\d+(?:\.\d+)?')
results = p2.findall(text)
print(results)