在脚本标记内提取JS变量信息

时间:2016-09-10 09:52:39

标签: javascript python beautifulsoup html-parsing

我正在从网址检索HTML网页,并希望从该HTML中的script标记中提取信息。我特意寻找这个特殊的script标签:

<script type="text/javascript">
    var zomato = zomato || {};
    zomato.menuPages = [{"url":"https:\/\/b.zmtcdn.com\/data\/menus\/705\/51705\/4bab50546bf3314e25dea4310ddf524e.jpg","href":"https:\/\/b.zmtcdn.com\/data\/menus\/705\/51705\/4bab50546bf3314e25dea4310ddf524e.jpg","filename":"4bab50546bf3314e25dea4310ddf524e.jpg","url_master":"menus_original\/705\/51705\/4bab50546bf3314e25dea4310ddf524e.jpg","path_master":"\/home\/foodie\/zomato_data\/menus_original\/705\/51705\/4bab50546bf3314e25dea4310ddf524e.jpg","data_center":"sng","menu_type":"FOOD","title":"FOOD","menu_type_class":"FOOD","real_menu_type":"FOOD","is_salt_special_menu":0,"start_date":"","consumer_upload":0,"start_date_formatted":"","end_date":"","end_date_formatted":"","id":129344370},{"url":"https:\/\/b.zmtcdn.com\/data\/menus\/705\/51705\/0a284792e41edbb5ba5bbc7b0cde26db.jpg","href":"https:\/\/b.zmtcdn.com\/data\/menus\/705\/51705\/0a284792e41edbb5ba5bbc7b0cde26db.jpg","filename":"0a284792e41edbb5ba5bbc7b0cde26db.jpg","url_master":"menus_original\/705\/51705\/0a284792e41edbb5ba5bbc7b0cde26db.jpg","path_master":"\/home\/foodie\/zomato_data\/menus_original\/705\/51705\/0a284792e41edbb5ba5bbc7b0cde26db.jpg","data_center":"sng","menu_type":"FOOD","title":"FOOD","menu_type_class":"FOOD","real_menu_type":"FOOD","is_salt_special_menu":0,"start_date":"","consumer_upload":0,"start_date_formatted":"","end_date":"","end_date_formatted":"","id":129344371},{"url":"https:\/\/b.zmtcdn.com\/data\/menus\/705\/51705\/6ff338c3891bca1cc61574e9864b15ae.jpg","href":"https:\/\/b.zmtcdn.com\/data\/menus\/705\/51705\/6ff338c3891bca1cc61574e9864b15ae.jpg","filename":"6ff338c3891bca1cc61574e9864b15ae.jpg","url_master":"menus_original\/705\/51705\/6ff338c3891bca1cc61574e9864b15ae.jpg","path_master":"\/home\/foodie\/zomato_data\/menus_original\/705\/51705\/6ff338c3891bca1cc61574e9864b15ae.jpg","data_center":"sng","menu_type":"FOOD","title":"FOOD","menu_type_class":"FOOD","real_menu_type":"FOOD","is_salt_special_menu":0,"start_date":"","consumer_upload":0,"start_date_formatted":"","end_dat
e":"","end_date_formatted":"","id":129344365},{"url":"https:\/\/b.zmtcdn.com\/data\/menus\/705\/51705\/ff5a5ea0945782ad1d82102461a39b52.jpg","href":"https:\/\/b.zmtcdn.com\/data\/menus\/705\/51705\/ff5a5ea0945782ad1d82102461a39b52.jpg","filename":"ff5a5ea0945782ad1d82102461a39b52.jpg","url_master":"menus_original\/705\/51705\/ff5a5ea0945782ad1d82102461a39b52.jpg","path_master":"\/home\/foodie\/zomato_data\/menus_original\/705\/51705\/ff5a5ea0945782ad1d82102461a39b52.jpg","data_center":"sng","menu_type":"FOOD","title":"FOOD","menu_type_class":"FOOD","real_menu_type":"FOOD","is_salt_special_menu":0,"start_date":"","consumer_upload":0,"start_date_formatted":"","end_date":"","end_date_formatted":"","id":129344366},{"url":"https:\/\/b.zmtcdn.com\/data\/menus\/705\/51705\/3cb04e221c4db345ceb41b638d9faa6a.jpg","href":"https:\/\/b.zmtcdn.com\/data\/menus\/705\/51705\/3cb04e221c4db345ceb41b638d9faa6a.jpg","filename":"3cb04e221c4db345ceb41b638d9faa6a.jpg","url_master":"menus_original\/705\/51705\/3cb04e221c4db345ceb41b638d9faa6a.jpg","path_master":"\/home\/foodie\/zomato_data\/menus_original\/705\/51705\/3cb04e221c4db345ceb41b638d9faa6a.jpg","data_center":"sng","menu_type":"FOOD","title":"FOOD","menu_type_class":"FOOD","real_menu_type":"FOOD","is_salt_special_menu":0,"start_date":"","consumer_upload":0,"start_date_formatted":"","end_date":"","end_date_formatted":"","id":129344367},{"url":"https:\/\/b.zmtcdn.com\/data\/menus\/705\/51705\/733759862d474dfd8e710fa08e78849b.jpg","href":"https:\/\/b.zmtcdn.com\/data\/menus\/705\/51705\/733759862d474dfd8e710fa08e78849b.jpg","filename":"733759862d474dfd8e710fa08e78849b.jpg","url_master":"menus_original\/705\/51705\/733759862d474dfd8e710fa08e78849b.jpg","path_master":"\/home\/foodie\/zomato_data\/menus_original\/705\/51705\/733759862d474dfd8e710fa08e78849b.jpg","data_center":"sng","menu_type":"FOOD","title":"FOOD","menu_type_class":"FOOD","real_menu_type":"FOOD","is_salt_special_menu":0,"start_date":"","consumer_upload":0,"start_date_
formatted":"","end_date":"","end_date_formatted":"","id":129344368},{"url":"https:\/\/b.zmtcdn.com\/data\/menus\/705\/51705\/69144be9b82cbba9adcc9de35003522d.jpg","href":"https:\/\/b.zmtcdn.com\/data\/menus\/705\/51705\/69144be9b82cbba9adcc9de35003522d.jpg","filename":"69144be9b82cbba9adcc9de35003522d.jpg","url_master":"menus_original\/705\/51705\/69144be9b82cbba9adcc9de35003522d.jpg","path_master":"\/home\/foodie\/zomato_data\/menus_original\/705\/51705\/69144be9b82cbba9adcc9de35003522d.jpg","data_center":"sng","menu_type":"FOOD","title":"FOOD","menu_type_class":"FOOD","real_menu_type":"FOOD","is_salt_special_menu":0,"start_date":"","consumer_upload":0,"start_date_formatted":"","end_date":"","end_date_formatted":"","id":129344369},{"url":"https:\/\/b.zmtcdn.com\/data\/menus\/705\/51705\/9dfd7dcc0e45639acbde792781012e0d.jpg","href":"https:\/\/b.zmtcdn.com\/data\/menus\/705\/51705\/9dfd7dcc0e45639acbde792781012e0d.jpg","filename":"9dfd7dcc0e45639acbde792781012e0d.jpg","url_master":"menus_original\/705\/51705\/9dfd7dcc0e45639acbde792781012e0d.jpg","path_master":"\/home\/foodie\/zomato_data\/menus_original\/705\/51705\/9dfd7dcc0e45639acbde792781012e0d.jpg","data_center":"sng","menu_type":"BAR","title":"BAR","menu_type_class":"BAR","real_menu_type":"BAR","is_salt_special_menu":0,"start_date":"","consumer_upload":0,"start_date_formatted":"","end_date":"","end_date_formatted":"","id":129344483},{"url":"https:\/\/b.zmtcdn.com\/data\/menus\/705\/51705\/b89c707ff99087cd8098ddaf3b5f1346.jpg","href":"https:\/\/b.zmtcdn.com\/data\/menus\/705\/51705\/b89c707ff99087cd8098ddaf3b5f1346.jpg","filename":"b89c707ff99087cd8098ddaf3b5f1346.jpg","url_master":"menus_original\/705\/51705\/b89c707ff99087cd8098ddaf3b5f1346.jpg","path_master":"\/home\/foodie\/zomato_data\/menus_original\/705\/51705\/b89c707ff99087cd8098ddaf3b5f1346.jpg","data_center":"sng","menu_type":"BAR","title":"BAR","menu_type_class":"BAR","real_menu_type":"BAR","is_salt_special_menu":0,"start_date":"","consumer_upload":
0,"start_date_formatted":"","end_date":"","end_date_formatted":"","id":129344484},{"url":"https:\/\/b.zmtcdn.com\/data\/menus\/705\/51705\/439bf88da8bfce35ba44c6f206360a90.jpg","href":"https:\/\/b.zmtcdn.com\/data\/menus\/705\/51705\/439bf88da8bfce35ba44c6f206360a90.jpg","filename":"439bf88da8bfce35ba44c6f206360a90.jpg","url_master":"menus_original\/705\/51705\/439bf88da8bfce35ba44c6f206360a90.jpg","path_master":"\/home\/foodie\/zomato_data\/menus_original\/705\/51705\/439bf88da8bfce35ba44c6f206360a90.jpg","data_center":"sng","menu_type":"BAR","title":"BAR","menu_type_class":"BAR","real_menu_type":"BAR","is_salt_special_menu":0,"start_date":"","consumer_upload":0,"start_date_formatted":"","end_date":"","end_date_formatted":"","id":129344485},{"url":"https:\/\/b.zmtcdn.com\/data\/menus\/705\/51705\/854abd602c815f84dcaa2fdea1c22f81.jpg","href":"https:\/\/b.zmtcdn.com\/data\/menus\/705\/51705\/854abd602c815f84dcaa2fdea1c22f81.jpg","filename":"854abd602c815f84dcaa2fdea1c22f81.jpg","url_master":"menus_original\/705\/51705\/854abd602c815f84dcaa2fdea1c22f81.jpg","path_master":"\/home\/foodie\/zomato_data\/menus_original\/705\/51705\/854abd602c815f84dcaa2fdea1c22f81.jpg","data_center":"sng","menu_type":"BAR","title":"BAR","menu_type_class":"BAR","real_menu_type":"BAR","is_salt_special_menu":0,"start_date":"","consumer_upload":0,"start_date_formatted":"","end_date":"","end_date_formatted":"","id":129344486},{"url":"https:\/\/b.zmtcdn.com\/data\/menus\/705\/51705\/7670f299fd8f065252b94665df390790.jpg","href":"https:\/\/b.zmtcdn.com\/data\/menus\/705\/51705\/7670f299fd8f065252b94665df390790.jpg","filename":"7670f299fd8f065252b94665df390790.jpg","url_master":"menus_original\/705\/51705\/7670f299fd8f065252b94665df390790.jpg","path_master":"\/home\/foodie\/zomato_data\/menus_original\/705\/51705\/7670f299fd8f065252b94665df390790.jpg","data_center":"sng","menu_type":"BAR","title":"BAR","menu_type_class":"BAR","real_menu_type":"BAR","is_salt_special_menu":0,"start_date":"","consume
r_upload":0,"start_date_formatted":"","end_date":"","end_date_formatted":"","id":129344487},{"url":"https:\/\/b.zmtcdn.com\/data\/menus\/705\/51705\/f308c376afe08aed9b4ccf38eb0d6652.jpg","href":"https:\/\/b.zmtcdn.com\/data\/menus\/705\/51705\/f308c376afe08aed9b4ccf38eb0d6652.jpg","filename":"f308c376afe08aed9b4ccf38eb0d6652.jpg","url_master":"menus_original\/705\/51705\/f308c376afe08aed9b4ccf38eb0d6652.jpg","path_master":"\/home\/foodie\/zomato_data\/menus_original\/705\/51705\/f308c376afe08aed9b4ccf38eb0d6652.jpg","data_center":"sng","menu_type":"BAR","title":"BAR","menu_type_class":"BAR","real_menu_type":"BAR","is_salt_special_menu":0,"start_date":"","consumer_upload":0,"start_date_formatted":"","end_date":"","end_date_formatted":"","id":129344488},{"url":"https:\/\/b.zmtcdn.com\/data\/menus\/705\/51705\/c734ed73e8e5f15f2e3ef9e287bf86f7.jpg","href":"https:\/\/b.zmtcdn.com\/data\/menus\/705\/51705\/c734ed73e8e5f15f2e3ef9e287bf86f7.jpg","filename":"c734ed73e8e5f15f2e3ef9e287bf86f7.jpg","url_master":"menus_original\/705\/51705\/c734ed73e8e5f15f2e3ef9e287bf86f7.jpg","path_master":"\/home\/foodie\/zomato_data\/menus_original\/705\/51705\/c734ed73e8e5f15f2e3ef9e287bf86f7.jpg","data_center":"sng","menu_type":"BAR","title":"BAR","menu_type_class":"BAR","real_menu_type":"BAR","is_salt_special_menu":0,"start_date":"","consumer_upload":0,"start_date_formatted":"","end_date":"","end_date_formatted":"","id":129344489},{"url":"https:\/\/b.zmtcdn.com\/data\/menus\/705\/51705\/0cbe5c590f3d5312238de6b00cc9b0a9.jpg","href":"https:\/\/b.zmtcdn.com\/data\/menus\/705\/51705\/0cbe5c590f3d5312238de6b00cc9b0a9.jpg","filename":"0cbe5c590f3d5312238de6b00cc9b0a9.jpg","url_master":"menus_original\/705\/51705\/0cbe5c590f3d5312238de6b00cc9b0a9.jpg","path_master":"\/home\/foodie\/zomato_data\/menus_original\/705\/51705\/0cbe5c590f3d5312238de6b00cc9b0a9.jpg","data_center":"sng","menu_type":"BAR","title":"BAR","menu_type_class":"BAR","real_menu_type":"BAR","is_salt_special_menu":0,"start_date":"
","consumer_upload":0,"start_date_formatted":"","end_date":"","end_date_formatted":"","id":129344490},{"url":"https:\/\/b.zmtcdn.com\/data\/menus\/705\/51705\/3db4f82866e075bb1852990a0cdbe30a.jpg","href":"https:\/\/b.zmtcdn.com\/data\/menus\/705\/51705\/3db4f82866e075bb1852990a0cdbe30a.jpg","filename":"3db4f82866e075bb1852990a0cdbe30a.jpg","url_master":"menus_original\/705\/51705\/3db4f82866e075bb1852990a0cdbe30a.jpg","path_master":"\/home\/foodie\/zomato_data\/menus_original\/705\/51705\/3db4f82866e075bb1852990a0cdbe30a.jpg","data_center":"sng","menu_type":"BAR","title":"BAR","menu_type_class":"BAR","real_menu_type":"BAR","is_salt_special_menu":0,"start_date":"","consumer_upload":0,"start_date_formatted":"","end_date":"","end_date_formatted":"","id":129344477},{"url":"https:\/\/b.zmtcdn.com\/data\/menus\/705\/51705\/1e0df9160c02273466e96239eae1a555.jpg","href":"https:\/\/b.zmtcdn.com\/data\/menus\/705\/51705\/1e0df9160c02273466e96239eae1a555.jpg","filename":"1e0df9160c02273466e96239eae1a555.jpg","url_master":"menus_original\/705\/51705\/1e0df9160c02273466e96239eae1a555.jpg","path_master":"\/home\/foodie\/zomato_data\/menus_original\/705\/51705\/1e0df9160c02273466e96239eae1a555.jpg","data_center":"sng","menu_type":"BAR","title":"BAR","menu_type_class":"BAR","real_menu_type":"BAR","is_salt_special_menu":0,"start_date":"","consumer_upload":0,"start_date_formatted":"","end_date":"","end_date_formatted":"","id":129344478},{"url":"https:\/\/b.zmtcdn.com\/data\/menus\/705\/51705\/0d7fd654ec9f090883fa428df0f1ebb2.jpg","href":"https:\/\/b.zmtcdn.com\/data\/menus\/705\/51705\/0d7fd654ec9f090883fa428df0f1ebb2.jpg","filename":"0d7fd654ec9f090883fa428df0f1ebb2.jpg","url_master":"menus_original\/705\/51705\/0d7fd654ec9f090883fa428df0f1ebb2.jpg","path_master":"\/home\/foodie\/zomato_data\/menus_original\/705\/51705\/0d7fd654ec9f090883fa428df0f1ebb2.jpg","data_center":"sng","menu_type":"BAR","title":"BAR","menu_type_class":"BAR","real_menu_type":"BAR","is_salt_special_menu":0,"sta
rt_date":"","consumer_upload":0,"start_date_formatted":"","end_date":"","end_date_formatted":"","id":129344479},{"url":"https:\/\/b.zmtcdn.com\/data\/menus\/705\/51705\/da16fc1d8d9641581fca258cbcb99f80.jpg","href":"https:\/\/b.zmtcdn.com\/data\/menus\/705\/51705\/da16fc1d8d9641581fca258cbcb99f80.jpg","filename":"da16fc1d8d9641581fca258cbcb99f80.jpg","url_master":"menus_original\/705\/51705\/da16fc1d8d9641581fca258cbcb99f80.jpg","path_master":"\/home\/foodie\/zomato_data\/menus_original\/705\/51705\/da16fc1d8d9641581fca258cbcb99f80.jpg","data_center":"sng","menu_type":"BAR","title":"BAR","menu_type_class":"BAR","real_menu_type":"BAR","is_salt_special_menu":0,"start_date":"","consumer_upload":0,"start_date_formatted":"","end_date":"","end_date_formatted":"","id":129344480},{"url":"https:\/\/b.zmtcdn.com\/data\/menus\/705\/51705\/5336b40a11d4486db5e3a4bcfb0e9ae8.jpg","href":"https:\/\/b.zmtcdn.com\/data\/menus\/705\/51705\/5336b40a11d4486db5e3a4bcfb0e9ae8.jpg","filename":"5336b40a11d4486db5e3a4bcfb0e9ae8.jpg","url_master":"menus_original\/705\/51705\/5336b40a11d4486db5e3a4bcfb0e9ae8.jpg","path_master":"\/home\/foodie\/zomato_data\/menus_original\/705\/51705\/5336b40a11d4486db5e3a4bcfb0e9ae8.jpg","data_center":"sng","menu_type":"BAR","title":"BAR","menu_type_class":"BAR","real_menu_type":"BAR","is_salt_special_menu":0,"start_date":"","consumer_upload":0,"start_date_formatted":"","end_date":"","end_date_formatted":"","id":129344481},{"url":"https:\/\/b.zmtcdn.com\/data\/menus\/705\/51705\/c96e340be834ecf086536234a56e7626.jpg","href":"https:\/\/b.zmtcdn.com\/data\/menus\/705\/51705\/c96e340be834ecf086536234a56e7626.jpg","filename":"c96e340be834ecf086536234a56e7626.jpg","url_master":"menus_original\/705\/51705\/c96e340be834ecf086536234a56e7626.jpg","path_master":"\/home\/foodie\/zomato_data\/menus_original\/705\/51705\/c96e340be834ecf086536234a56e7626.jpg","data_center":"sng","menu_type":"BAR","title":"BAR","menu_type_class":"BAR","real_menu_type":"BAR","is_salt_special_me
nu":0,"start_date":"","consumer_upload":0,"start_date_formatted":"","end_date":"","end_date_formatted":"","id":129344482}];
    zomato.menuTypes = ["DEFAULT","FOOD","BAR","DELIVERY","SPECIAL","TAKEAWAY","INTERNAL"];
    zomato.currentMenuPage = 1;
</script>

这个列表还很长。我正在使用 BeautifulSoup,像这样:

soup.find_all('script')[14] 

给了我想要的确切script标签。但是,一旦我这样做了,我不确定如何进一步解析。

有没有办法可以访问zomato.menuPages作为python列表然后访问其元素?如果没有合适的Python解决方案,可能是JS中的东西?

2 个答案:

答案 0 :(得分:2)

我发现 js2xml 非常有效,它可以把 JavaScript 的属性/函数解析成 XML 树:

import js2xml
import re

# Parse the page. `the_html` is the HTML text of the page and is expected to
# be defined by the caller; `BeautifulSoup` is assumed to be imported already
# (presumably from bs4, given the explicit "html.parser" argument).
soup = BeautifulSoup(the_html, "html.parser")

# Locate the <script> by its content instead of by position in the page.
# The pattern is a raw string so that "\s" reaches the regex engine unchanged,
# and the dot is escaped so it only matches a literal "." in "zomato.menuPages".
script_tag = soup.find("script", text=re.compile(r"zomato\.menuPages\s+="))
if script_tag is None:
    # Fail with a clear message instead of an AttributeError on None.text.
    raise ValueError("no <script> assigning zomato.menuPages was found")

# js2xml turns the JavaScript source into an XML tree that can be queried.
tree = js2xml.parse(script_tag.text)
print(js2xml.pretty_print(tree))

在树的中间你会看到:

  <assign operator="=">
    <left>
      <dotaccessor>
        <object>
          <identifier name="zomato"/>
        </object>
        <property>
          <identifier name="menuTypes"/>
        </property>
      </dotaccessor>
    </left>
    <right>
      <array>
        <string>DEFAULT</string>
        <string>FOOD</string>
        <string>BAR</string>
        <string>DELIVERY</string>
        <string>SPECIAL</string>
        <string>TAKEAWAY</string>
        <string>INTERNAL</string>
      </array>
    </right>

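可以看到一个 assign 节点,其属性为 operator="=";它的 left 节点内是一个 dotaccessor 节点,其中包含 object 和 property 两个子节点。所以基本上,我们只需要根据 dotaccessor 节点内的 object/property 找到正确的赋值(assign)节点,再取其 right 节点下的数组即可。下面是一个使用该树获取数组及其内容的简单示例:

# Illustrative reconstruction: the original example was lost when this page was extracted.
array = tree.xpath("//assign[left/dotaccessor/property/identifier/@name='menuPages']/right/array")[0]
print(js2xml.pretty_print(array))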

输出被截断,因为太多了,您可以使用 xpaths 找到特定的属性/值,就像使用任何树一样。

答案 1 :(得分:1)

嗯,一种方法是使用正则表达式,这当然不是最容易做到的事情,但有时会派上用场。所以我尝试的是以下内容:

#!/usr/bin/env python

from BeautifulSoup import BeautifulSoup
import requests  
import re 

# Placeholder URL: the real page address is intentionally not published here.
url = "the_url"

# Download the page and parse the returned HTML text.
r = requests.get(url)
response = r.text
soup = BeautifulSoup(response)
# Pick the script tag by position: index 14 is the tag the question identified
# as the one holding the zomato.* assignments. This is specific to that page.
x = soup.findAll(name = 'script')[14] 


# Collect the right-hand side of every "zomato… = …;" statement in the script:
#   zomato.   "zomato" followed by any single character (" " or ".")
#   .*?=      the shortest run of characters up to the next "="
#   \s*       optional whitespace after "="
#   (.*?);    capture the shortest run of characters up to the next ";"
# re.DOTALL lets "." match newlines, so a value that spans several lines is
# still captured. re.MULTILINE only affects "^" and "$", which are not used here.
values = re.findall(r'zomato..*?=\s*(.*?);', str(x), re.DOTALL | re.MULTILINE)

在这种情况下,这个正则表达式会返回一个由 4 个元素组成的列表,其中第 2 个元素就是你要的 zomato.menuPages。然后,您可以进一步处理 zomato.menuPages,例如:

# values[1] is the text captured for zomato.menuPages, i.e. "[{...},{...},...]".
# It is already a single string, so no join is needed.
menu_pages_text = values[1]
# Drop the enclosing "[" and "]" so only the comma-separated objects remain.
inner = menu_pages_text[1:-1]
# Split at every "{"; chunk 0 is the empty text before the first object.
# No fixed split limit is used, so this works for any number of menu pages
# (it still assumes the objects contain no nested "{").
# The result is not named `list`, to avoid shadowing the builtin.
chunks = inner.split("{")
# Parenthesised print works on both Python 2 and Python 3.
print(chunks[1])

然后您可以尝试把 list 中的各项从字符串转换为字典(字符串 -> 字典),以便更容易地解析它们(使用 json 或 ast)。

您还可以使用另一个正则表达式来解析每个值(我指的是 values 列表)。同样,利用 re 模块中匹配对象的 groupdict 方法,您可以根据某些正则表达式规则,把列表的每个元素转换成一个字典。希望这能有所帮助!