从html主体获取文本

时间:2017-06-18 17:22:48

标签: html css beautifulsoup html-parsing

我有以下HTML代码:

<body class="frontend page-object" data-tealium='{"tmsData":{"ad_type":"Marktplatz","page_type":"Ad_View","vertical_id":"5","vertical":"Marktplatz","ad_title":"LEGO+Technic+8045+-+Mini-Teleskoplader+-+2+in+1","num_pictures":"4","category_level_1":"Spielen+%2F+Spielzeug","region_level_id_2":"9","category_level_3":"Lego","region_level_id_3":"117244","category_level_2":"Lego+%2F+Playmobil","region_level_id_1":"-141","price":"6","product_id":"67","category_level_max":"4","region_level_2":"Wien","region_level_3":"Wien%2C+22.+Bezirk%2C+Donaustadt","category_level_4":"Technic","seller_id":"19284847","region_level_1":"%C3%96sterreich","ad_type_id":"67","category_level_id_3":"5191","category_level_id_2":"5182","category_level_id_1":"5136","category_level_id_4":"5199","environment":"web","ad_id":"208824705","post_code":"1220","event_name":"adview","publish_date":"Sun+Jun+18+18%3A51%3A00+CEST+2017"}}' data-adid="208824705">

在这里,我尝试使用beautifulsoup获得此类别级别:"category_level_1":"Spielen+%2F+Spielzeug"。但是,我无法得到它。

如果我这样做:CatId = soup2.select("html body.frontend.page-object")[0].get_text().strip(),我会得到整个HTML的文本。

而 CatId = soup2.find("html body.frontend.page-object", {category_level_1})[0].get_text().strip() 则什么也得不到。我只需要获取 Spielen+%2F+Spielzeug。有什么办法可以解决这个问题吗?

提前非常感谢。

2 个答案:

答案 0 :(得分:0)

使用JavaScript获取它的一种方法是:

// Read the JSON text stored in the body's data-tealium attribute, parse it,
// and pull the first-level category out of its tmsData object.
const rawTealium = document.body.getAttribute('data-tealium');
const { category_level_1: category1 } = JSON.parse(rawTealium).tmsData;

console.log(category1);

如果不能保证 data-tealium 属性始终存在且其内容可以作为 JSON 解析,可以这样写:

// getAttribute returns null when the attribute is absent; JSON.parse(null)
// yields null, so a missing attribute falls through to the null default below.
const tealium = document.body.getAttribute('data-tealium');

// JSON.parse throws a SyntaxError on malformed input. Guard it so a broken
// attribute value results in `null` instead of aborting the script.
let parsedData = null;
try {
  parsedData = JSON.parse(tealium);
} catch (err) {
  if (!(err instanceof SyntaxError)) {
    throw err;
  }
  console.warn('data-tealium is not valid JSON:', err.message);
}

// Walk the nested structure defensively; any missing level (or an empty
// value) collapses to null.
const category1 =
  (parsedData &&
    parsedData.tmsData &&
    parsedData.tmsData.category_level_1) ||
  null;

console.log(category1);

答案 1 :(得分:0)

我不知道结果对你有多大意义,但你可以用Python看到该字段的内容。

>>> import requests
>>> page = requests.get('https://www.willhaben.at/iad/kaufen-und-verkaufen/d/lego-technic-8045-mini-teleskoplader-2-in-1-208824705/').content
>>> import bs4
>>> soup = bs4.BeautifulSoup(page, 'lxml')
>>> data_tealium = soup.find('body').attrs['data-tealium']
>>> info = eval(data_tealium)['tmsData']
>>> for i, item in enumerate(info):
...     '--->', i
...     item, info[item]
... 
('--->', 0)
('category_level_max', '4')
('--->', 1)
('region_level_id_1', '-141')
('--->', 2)
('ad_type', 'Marktplatz')
('--->', 3)
('seller_id', '19284847')
('--->', 4)
('product_id', '67')
('--->', 5)
('category_level_id_3', '5191')
('--->', 6)
('vertical_id', '5')
('--->', 7)
('ad_type_id', '67')
('--->', 8)
('region_level_id_3', '117244')
('--->', 9)
('category_level_4', 'Technic')
('--->', 10)
('region_level_3', 'Wien%2C+22.+Bezirk%2C+Donaustadt')
('--->', 11)
('vertical', 'Marktplatz')
('--->', 12)
('region_level_id_2', '9')
('--->', 13)
('region_level_1', '%C3%96sterreich')
('--->', 14)
('post_code', '1220')
('--->', 15)
('event_name', 'adview')
('--->', 16)
('page_type', 'Ad_View')
('--->', 17)
('category_level_1', 'Spielen+%2F+Spielzeug')
('--->', 18)
('num_pictures', '4')
('--->', 19)
('price', '6')
('--->', 20)
('category_level_3', 'Lego')
('--->', 21)
('category_level_id_2', '5182')
('--->', 22)
('ad_title', 'LEGO+Technic+8045+-+Mini-Teleskoplader+-+2+in+1')
('--->', 23)
('publish_date', 'Sun+Jun+18+18%3A51%3A00+CEST+2017')
('--->', 24)
('category_level_2', 'Lego+%2F+Playmobil')
('--->', 25)
('category_level_id_1', '5136')
('--->', 26)
('ad_id', '208824705')
('--->', 27)
('region_level_2', 'Wien')
('--->', 28)
('category_level_id_4', '5199')
('--->', 29)
('environment', 'web')