Question

我想抓取酒店的名称。通常我做这类抓取没有问题，但这次不起作用，我不明白原因。

这是我的脚本：

import numpy as np


import time
from random import randint
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import re
import random

#headers= {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'}

# Browser-like request headers sent along with the page request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'fr,fr-FR;q=0.8,en-US;q=0.5,en;q=0.3',
    # NOTE(review): this Referer names a different site than the URL requested
    # below -- presumably left over from another script; confirm it is intended.
    'Referer': 'https://www.espncricinfo.com/',
    'Upgrade-Insecure-Requests': '1',
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
}

# Hotel detail page to scrape.
url = 'https://www.booking.com/hotel/fr/hyatt-regency-paris-etoile.fr.html?label=gen173nr-1DCA0oTUIMZWx5c2Vlc3VuaW9uSA1YBGhNiAEBmAENuAEXyAEM2AED6AEB-AECiAIBqAIDuAL_5ZqEBsACAdICJDcxYjgyZmI2LTFlYWQtNGZjOS04Y2U2LTkwNTQyZjI5OWY1YtgCBOACAQ;sid=303509179a2849df63e4d1e5bc1ab1e3;dest_id=-1456928;dest_type=city;dist=0;group_adults=2;group_children=0;hapos=1;hpos=1;no_rooms=1;room1=A%2CA;sb_price_type=total;sr_order=popularity;srepoch=1619708145;srpvid=6f6268f8305e011d;type=total;ucfs=1&#hotelTmpl'

# Download the page with the custom headers defined above.
results = requests.get(url, headers = headers)

# Parse the response body with Python's built-in HTML parser.
soup = BeautifulSoup(results.text, "html.parser")

# Look up the <h2> carrying the CSS class 'hp__hotel-name' and read its text.
# find() yields None when no such element exists in the downloaded HTML, and
# accessing .text on None is what raises the AttributeError reported for this line.
hotel = soup.find('h2', class_ = 'hp__hotel-name').text

print(hotel)

这里是错误：

Traceback (most recent call last):
  File "test_booking_info_supp.py", line 75, in <module>
    hotel = soup.find('h2', class_ = 'hp__hotel-name').text
AttributeError: 'NoneType' object has no attribute 'text'

我不明白为什么得到的是 None，这是该页面的 HTML：

（网站链接见图片）

Answer 1

您可以尝试通过标签的 id 属性来定位该元素，并获取其文本值：

# Fetch the hotel page and read its name from the <h2> element selected by
# its id attribute instead of by CSS class.
url = 'https://www.booking.com/hotel/fr/hyatt-regency-paris-etoile.fr.html?label=gen173nr-1DCA0oTUIMZWx5c2Vlc3VuaW9uSA1YBGhNiAEBmAENuAEXyAEM2AED6AEB-AECiAIBqAIDuAL_5ZqEBsACAdICJDcxYjgyZmI2LTFlYWQtNGZjOS04Y2U2LTkwNTQyZjI5OWY1YtgCBOACAQ;sid=303509179a2849df63e4d1e5bc1ab1e3;dest_id=-1456928;dest_type=city;dist=0;group_adults=2;group_children=0;hapos=1;hpos=1;no_rooms=1;room1=A%2CA;sb_price_type=total;sr_order=popularity;srepoch=1619708145;srpvid=6f6268f8305e011d;type=total;ucfs=1&#hotelTmpl'

# Without a timeout, requests waits indefinitely if the server never answers.
results = requests.get(url, timeout=30)
# Stop on an HTTP error status rather than parsing an error page.
results.raise_for_status()

soup = BeautifulSoup(results.text, "html.parser")

hotel = soup.find("h2", attrs={"id": "hp_hotel_name"})
if hotel is None:
    # find() returns None when the element is absent; exit with a clear
    # message instead of failing later with an AttributeError on None.
    raise SystemExit("No <h2 id='hp_hotel_name'> element found in the downloaded page")

# The heading text spans several lines; the hotel name is taken from the
# second one (presumably the first line is a label inside the heading --
# verify against the live markup). Fall back to the only line available
# rather than raising IndexError when the heading is a single line.
lines = hotel.text.strip("\n").split("\n")
print(lines[1] if len(lines) > 1 else lines[0])

输出：

'Hyatt Regency Paris Etoile'

网页抓取问题

1 个答案: