Question

我在以下链接中有一个本地HTML文件：https://pastebin.com/L3iFQgQH

    <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:og="http://opengraphprotocol.org/schema/" xmlns:fb="http://www.facebook.com/2008/fbml">
<head><title>
    335i | autoTRADER.ca
</title><link id="ctl00_ctl00_canonical" rel="canonical" href="http://www.autotrader.ca/cars/bmw/3%20series/2013/" /><meta name="viewport" content="width=device-width, height=device-height, user-scalable=0, minimum-scale=0.75, maximum-scale=1.0" /><meta name="SKYPE_TOOLBAR" content="SKYPE_TOOLBAR_PARSER_COMPATIBLE" /><script>
var dataLayer = [
{
  'search': {
    'pageNumber': '1',
    'searchType': 'unique',
    'filterFieldsUsed': '10',
    'category': 'Cars, Trucks & SUVs',
   'minPrice': 'not used',
   'maxPrice': 'not used',
   'make': 'BMW',
   'model': '3 Series',
   'new': 'yes',
   'used': 'yes',
   'CPO': 'yes',
   'distance': 'national',
   'location': 'canada',
   'searchLocation': 'advancedSearch',
   'minYear': '2013',
   'maxYear': '2013',
   'transmission': 'Automatic',
   'fuelType': 'not used',
   'exteriorColor': 'not used',
   'refinedKeywords': '335i',
   'bodyType': 'not used',
   'minKms': 'not used',
   'maxKms': 'not used',
   'damaged': 'yes',
   'dealer': 'yes',
   'privateSeller': 'yes',
   'withPrice': 'yes',
   'withPhotos': 'yes',
   'withFreeCarProof': 'not used',
   'sortOrder': 'Price: High to Low'
 },
 'lists': [
   {
     'key': 'advancedSearch',
     'vehicles': [
       {
         'make': 'BMW',
         'model': '3 Series',
         'year': '2013',
         'category': 'PassengerVehicles',
         'price': '37800',
         'condition': 'used',
         'adType': 'dealer',
         'adID': '5-33635639',
         'dealerID': '5-BS2004915125635',
         'listingPosition': 'ppl',
         'upgradeExecUpgrade': 'no',
         'upgradePL': 'no',
         'upgradeHL': 'no',
         'upgradePPL': 'no',
         'mobialsParticipation': 'no',
         'strikethrough': 'no',
         'vehicleSpecialist': 'no',
         'priceHistory': '1',
         'priceAnalysis': 'above average',
         'transparency': 'yes',
         'car360enabled': 'no',
         'province': 'BC',
         'financingPrice': 'no',
         'merchandising': 'gold'
       },
       {
         'make': 'BMW',
         'model': '3 Series',
         'year': '2013',
         'category': 'PassengerVehicles',
         'price': '33995',
         'condition': 'used',
         'adType': 'dealer',
         'ad
       }
     ]
   }
 ],
 'pageType': 'search-results',
 'mvt': null
}
];
dataLayer.push({'ShowNewCoPath': 'True'});

</script>
<!--Google Tag Manager -->
<script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':
new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],
j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src=
'https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f);
})(window,document,'script','dataLayer','GTM-K7JHZJ');</script>
<!-- End Google Tag Manager -->

在最顶层，有一个变量dataLayer，它是一个字典，后面跟着很多html和其他东西。我想提取这个变量并使用python将它存储在json字典中。现在，我使用拆分（split）功能，但这种做法只适用于这一个特定文件。是否有任何方法可以为更广泛的html文件执行此操作？

Answer 1

一种选择是首先使用例如BeautifulSoup HTML解析器提取脚本内容，然后使用slimit或pyjsparser之类的JavaScript解析器来提取dataLayer变量值，然后稍微后处理它以使JSON可加载。然后，通过json.loads()加载到Python列表中：

使用slimit的工作示例：

from ast import literal_eval
import json

from bs4 import BeautifulSoup

from slimit import ast
from slimit.parser import Parser
from slimit.visitors import nodevisitor


data = """
    <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:og="http://opengraphprotocol.org/schema/" xmlns:fb="http://www.facebook.com/2008/fbml">
<head><title>
    335i | autoTRADER.ca
</title><link id="ctl00_ctl00_canonical" rel="canonical" href="http://www.autotrader.ca/cars/bmw/3%20series/2013/" /><meta name="viewport" content="width=device-width, height=device-height, user-scalable=0, minimum-scale=0.75, maximum-scale=1.0" /><meta name="SKYPE_TOOLBAR" content="SKYPE_TOOLBAR_PARSER_COMPATIBLE" /><script>
var dataLayer = [
{
  'search': {
    'pageNumber': '1',
    'searchType': 'unique',
    'filterFieldsUsed': '10',
    'category': 'Cars, Trucks & SUVs',
   'minPrice': 'not used',
   'maxPrice': 'not used',
   'make': 'BMW',
   'model': '3 Series',
   'new': 'yes',
   'used': 'yes',
   'CPO': 'yes',
   'distance': 'national',
   'location': 'canada',
   'searchLocation': 'advancedSearch',
   'minYear': '2013',
   'maxYear': '2013',
   'transmission': 'Automatic',
   'fuelType': 'not used',
   'exteriorColor': 'not used',
   'refinedKeywords': '335i',
   'bodyType': 'not used',
   'minKms': 'not used',
   'maxKms': 'not used',
   'damaged': 'yes',
   'dealer': 'yes',
   'privateSeller': 'yes',
   'withPrice': 'yes',
   'withPhotos': 'yes',
   'withFreeCarProof': 'not used',
   'sortOrder': 'Price: High to Low'
 },
 'lists': [
   {
     'key': 'advancedSearch',
     'vehicles': [
       {
         'make': 'BMW',
         'model': '3 Series',
         'year': '2013',
         'category': 'PassengerVehicles',
         'price': '37800',
         'condition': 'used',
         'adType': 'dealer',
         'adID': '5-33635639',
         'dealerID': '5-BS2004915125635',
         'listingPosition': 'ppl',
         'upgradeExecUpgrade': 'no',
         'upgradePL': 'no',
         'upgradeHL': 'no',
         'upgradePPL': 'no',
         'mobialsParticipation': 'no',
         'strikethrough': 'no',
         'vehicleSpecialist': 'no',
         'priceHistory': '1',
         'priceAnalysis': 'above average',
         'transparency': 'yes',
         'car360enabled': 'no',
         'province': 'BC',
         'financingPrice': 'no',
         'merchandising': 'gold'
       },
       {
         'make': 'BMW',
         'model': '3 Series',
         'year': '2013',
         'category': 'PassengerVehicles',
         'price': '33995',
         'condition': 'used',
         'adType': 'dealer'
       }
     ]
   }
 ],
 'pageType': 'search-results',
 'mvt': null
}
];
dataLayer.push({'ShowNewCoPath': 'True'});

</script>
<!--Google Tag Manager -->
<script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':
new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],
j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src=
'https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f);
})(window,document,'script','dataLayer','GTM-K7JHZJ');</script>
<!-- End Google Tag Manager -->"""


soup = BeautifulSoup(data, "html.parser")

# Locate the first <script> element whose text mentions the variable.
# ``string=`` is the current name of the filter argument that older
# BeautifulSoup releases called ``text=``.
script_tag = soup.find("script", string=lambda text: text and "dataLayer" in text)
if script_tag is None:
    raise ValueError("no <script> element mentioning 'dataLayer' was found")
script = script_tag.get_text()

parser = Parser()
tree = parser.parse(script)

# Walk the JavaScript AST for the ``var dataLayer = [...]`` declaration and
# serialise the first item of its array initializer back to source text.
# NOTE: swapping every single quote for a double quote only produces valid
# JSON while no string in the object contains an apostrophe or a double
# quote; that holds for the sample page above but is not a general
# JavaScript-to-JSON conversion.
data_layer = next(
    (
        node.initializer.items[0].to_ecma().replace("'", '"')
        for node in nodevisitor.visit(tree)
        if isinstance(node, ast.VarDecl) and node.identifier.value == 'dataLayer'
    ),
    None,
)
if data_layer is None:
    raise ValueError("no 'var dataLayer = ...' declaration was found in the script")

print(json.loads(data_layer))

另一个选项，可能更实用但总体上不太可靠，是使用正则表达式 - 匹配所需对象，从HTML字符串中提取，后处理并使用json模块加载，将其转换为Python对象。工作片段：

json

Answer 2

您可以通过以下方式将BeautifulSoup与pyjsparser结合：

from bs4 import BeautifulSoup
from pyjsparser import parse as js_parse


def _property_key(key):
    """Return the name of an object property from its key node."""
    # Quoted keys carry their text under "value" (as the original code read
    # it). Unquoted keys presumably parse as identifier nodes exposing
    # "name" -- confirm against the parser's actual output.
    return key["value"] if "value" in key else key["name"]


def recompose_var(var):
    """Rebuild a plain Python value from a parsed JavaScript literal node.

    Args:
        var: A node dict from the parse tree, or ``None``. A node with an
            ``"elements"`` list is rebuilt as a list, a node with a
            ``"properties"`` list as a dict, and any other node is read
            through its ``"value"`` entry.

    Returns:
        The corresponding list, dict or scalar. ``None`` is returned for a
        ``None`` node and for nodes that have no ``"value"`` entry.
    """
    if var is None:
        return None
    # Compare against None rather than relying on truthiness, so that empty
    # arrays/objects stay [] / {} and falsy literals (0, "", False) are kept
    # instead of collapsing to None.
    elements = var.get("elements")
    if elements is not None:
        return [recompose_var(item) for item in elements]
    properties = var.get("properties")
    if properties is not None:
        return {
            _property_key(prop["key"]): recompose_var(prop["value"])
            for prop in properties
        }
    return var.get("value")


def extract_js_var(script, var_name: str):
    """Return the value of a top-level ``var`` declaration in a script.

    Args:
        script: Either the JavaScript source as a string, or an object
            exposing ``contents`` (such as a parsed ``<script>`` tag) whose
            first child holds the source.
        var_name: Name of the variable to look up.

    Returns:
        The initializer rebuilt by ``recompose_var``. ``None`` is returned
        when the script is empty, the variable is not declared at the top
        level, or it is declared without an initializer.
    """
    # Accept the source text directly as well as a tag, because callers in
    # this file pass the string returned by ``get_text()``.
    if isinstance(script, str):
        source = script
    else:
        contents = script.contents
        if not contents:
            return None
        source = contents[0]

    parsed = js_parse(source)
    for statement in parsed["body"]:
        if statement["type"] != "VariableDeclaration":
            continue
        # Check every declarator, so ``var a = 1, dataLayer = [...]`` is
        # found too, not only the first name in each statement.
        for declaration in statement["declarations"]:
            if declaration["id"].get("name") != var_name:
                continue
            init = declaration.get("init")
            return recompose_var(init) if init is not None else None
    # An explicit miss, instead of the StopIteration a bare next() would raise.
    return None


data = """
....
"""

soup = BeautifulSoup(data, "html.parser")
# Keep the <script> tag itself rather than its text: extract_js_var() reads
# ``script.contents[0]``, which a plain string does not have.
# ``string=`` is the current name of the filter argument that older
# BeautifulSoup releases called ``text=``.
script = soup.find("script", string=lambda text: text and "dataLayer" in text)
if script is None:
    raise ValueError("no <script> element mentioning 'dataLayer' was found")

# Bind and print the result so that running the snippet shows the value.
data_layer = extract_js_var(script, "dataLayer")
print(data_layer)

从本地HTML文件中提取字典

2 个答案: