我想抓取酒店的名称。通常我做这类抓取不会遇到问题,但这次却不起作用,我不明白原因。
这是我的脚本:
import numpy as np
import time
from random import randint
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import re
import random
#headers= {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'}
# Scrape the hotel name from a booking.com hotel page.
# Request headers that imitate a desktop Firefox browser.
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'fr,fr-FR;q=0.8,en-US;q=0.5,en;q=0.3',
# NOTE(review): this Referer points at espncricinfo.com, not booking.com.
'Referer': 'https://www.espncricinfo.com/',
'Upgrade-Insecure-Requests': '1',
'Connection': 'keep-alive',
'Pragma': 'no-cache',
'Cache-Control': 'no-cache',
}
# Hotel page to fetch (French-language version, with search/session query parameters).
url = 'https://www.booking.com/hotel/fr/hyatt-regency-paris-etoile.fr.html?label=gen173nr-1DCA0oTUIMZWx5c2Vlc3VuaW9uSA1YBGhNiAEBmAENuAEXyAEM2AED6AEB-AECiAIBqAIDuAL_5ZqEBsACAdICJDcxYjgyZmI2LTFlYWQtNGZjOS04Y2U2LTkwNTQyZjI5OWY1YtgCBOACAQ;sid=303509179a2849df63e4d1e5bc1ab1e3;dest_id=-1456928;dest_type=city;dist=0;group_adults=2;group_children=0;hapos=1;hpos=1;no_rooms=1;room1=A%2CA;sb_price_type=total;sr_order=popularity;srepoch=1619708145;srpvid=6f6268f8305e011d;type=total;ucfs=1&#hotelTmpl'
# Download the page with the custom headers and parse the returned HTML.
results = requests.get(url, headers = headers)
soup = BeautifulSoup(results.text, "html.parser")
# Look up the <h2> tag by its CSS class and read its text.
# NOTE(review): soup.find() returns None when no tag matches, so chaining
# .text here raises AttributeError if the element is absent from the response.
hotel = soup.find('h2', class_ = 'hp__hotel-name').text
print(hotel)
这里是错误:
Traceback (most recent call last):
File "test_booking_info_supp.py", line 75, in <module>
hotel = soup.find('h2', class_ = 'hp__hotel-name').text
AttributeError: 'NoneType' object has no attribute 'text'
我不明白为什么得到的是 None。
这是相关的 HTML:
网站链接在图片中
答案 0 :(得分:0)
您可以尝试改用标签的 id 属性来定位该元素,并获取其文本值:
# Fetch the hotel page and print the hotel name taken from the
# <h2 id="hp_hotel_name"> tag.
url = 'https://www.booking.com/hotel/fr/hyatt-regency-paris-etoile.fr.html?label=gen173nr-1DCA0oTUIMZWx5c2Vlc3VuaW9uSA1YBGhNiAEBmAENuAEXyAEM2AED6AEB-AECiAIBqAIDuAL_5ZqEBsACAdICJDcxYjgyZmI2LTFlYWQtNGZjOS04Y2U2LTkwNTQyZjI5OWY1YtgCBOACAQ;sid=303509179a2849df63e4d1e5bc1ab1e3;dest_id=-1456928;dest_type=city;dist=0;group_adults=2;group_children=0;hapos=1;hpos=1;no_rooms=1;room1=A%2CA;sb_price_type=total;sr_order=popularity;srepoch=1619708145;srpvid=6f6268f8305e011d;type=total;ucfs=1&#hotelTmpl'
# A timeout keeps the script from blocking forever if the server never answers.
results = requests.get(url, timeout=30)
# Stop on HTTP 4xx/5xx responses instead of parsing an error page.
results.raise_for_status()
soup = BeautifulSoup(results.text, "html.parser")
# Locate the heading by its id attribute rather than by CSS class.
hotel = soup.find("h2", attrs={"id": "hp_hotel_name"})
if hotel is None:
    # find() returns None when nothing matches; fail with a clear message
    # rather than an AttributeError on the next line.
    raise RuntimeError("No <h2 id='hp_hotel_name'> element found in the response")
# The tag's text spans several lines; the name is read from the second line.
# Fall back to the only line when the text has no line break, which would
# otherwise raise IndexError.
lines = hotel.text.strip("\n").split("\n")
print(lines[1] if len(lines) > 1 else lines[0])
输出:
'Hyatt Regency Paris Etoile'