我是 Python 新手。我想根据名为 hotel_FortWorth.csv 的 CSV 文件中给出的链接,
提取每家酒店的所有评论详情。该文件共有 3 列:序号、名称、链接。
hotel_FortWorth.csv示例:
name link
1 Crockett Hotel https://www.tripadvisor.com.au/Hotel_Review-g60956-d553469-Reviews-Crockett_Hotel-San_Antonio_Texas.html
2 La Cantera Resort & Spa https://www.tripadvisor.com.au/Hotel_Review-g60956-d108571-Reviews-La_Cantera_Resort_Spa-San_Antonio_Texas.html
3 .....
4....
我在 `thepage = urllib.request.urlopen(url)` 这一行附近遇到了错误。
请问有人能帮我解决这个问题吗?非常感谢。
# Scrape the overall rating from each hotel page linked in hotel_FortWorth.csv.
# NOTE(review): leading indentation was lost in this paste, so the nesting of
# the loops and conditionals below cannot be read from the text - confirm it
# against the original script before running.
# header=None makes pandas treat the first CSV row as data and assign integer
# column labels.
data = pd.read_csv('hotel_FortWorth.csv', header = None)
# Label 2 is the third column; presumably the link column - verify against the CSV.
df = data[2]
for url in df:
print(url)
# Fetch the page and parse it with the "html.parser" backend.
thepage = urllib.request.urlopen(url)
soup = BeautifulSoup(thepage, "html.parser")
# NOTE(review): no break or other exit from this loop is visible in this excerpt.
while True:
# a and b are not used anywhere in the visible excerpt.
a = b = 0
# All four names start out bound to the empty string; overallRatingarray is
# replaced by a list on the first match further down.
overallRatingarray = seeAllReviewsarray = rankarray = hotelarray = ""
for profile in soup.findAll(attrs={"class": "overview_card"}):
# Every newline becomes "|||||" so the text can be split on "|" afterwards.
image = profile.text.replace("\n", "|||||").strip()
# str.find returns -1 when "rating" is absent and 0 when the text starts with
# it, so this test only passes when "rating" occurs after position 0.
if image.find("rating") > 0:
# Takes the text before the first "rating", splits it once on "|" and uses the
# part after that "|". When the text before "rating" contains no "|", the split
# yields a one-element list and [1] raises "IndexError: list index out of range".
# NOTE(review): [-4] selects a single character, not a slice - confirm whether
# [-4:] was intended.
counter = image.split("rating", 1)[0].split("|", 1)[1][-4].replace("|", "").strip()
# First match: switch from the "" placeholder to a list; later matches append.
if len(overallRatingarray) == 0:
overallRatingarray = [counter]
else:
overallRatingarray.append(counter)
错误是:
Traceback (most recent call last):
File "E:/LA TROBE SUBJECTS/Python/testing.py", line 33, in <module>
counter = image.split("rating", 1)[0].split("|", 1)[1][-4].replace("|", "").strip()
IndexError: list index out of range
Process finished with exit code 1
答案 0 :(得分:0)
可以改用 requests 库来获取页面。
requests 的示例,请参见 http://docs.python-requests.org/en/master/。
import requests
import pandas as pd
from bs4 import BeautifulSoup
def main():
    """Download and parse every hotel page linked from hotel_FortWorth.csv.

    The CSV is read without a header row, and the column labelled 2 is
    iterated as the list of page URLs. Each URL is printed, fetched with
    ``requests`` and parsed with BeautifulSoup; the parsed document is
    printed so the markup can be inspected before extraction logic is
    written.

    Raises:
        requests.exceptions.RequestException: if a page cannot be fetched,
            including when the server does not answer within the timeout.
    """
    data = pd.read_csv("hotel_FortWorth.csv", header=None)
    df = data[2]
    for url in df:
        print(url)
        # requests applies no timeout by default, so a stalled server would
        # block the whole run; bound the wait explicitly.
        thepage = requests.get(url, timeout=30).text
        soup = BeautifulSoup(thepage, "html.parser")
        print(soup)
        # Placeholder from the original snippet: per-page extraction goes here.
        ...


if __name__ == '__main__':
    main()