我正在尝试使用两种选项(在不同的站点上已经对我有用)打开一个URL,但是它们都不起作用。
from urllib.request import urlopen as ureq
import requests
from bs4 import BeautifulSoup
def main():
url = "https://www.amazon.com/s?k=black+watch&s=review-rank&qid=1568506943&ref=sr_st_review-rank"
client = ureq(url)
page = client.read()
client.close()
amazon_soup = BeautifulSoup(page, "html.parser")
# amazon_soup = get_page(url)
print(amazon_soup)
def get_page(url):
try:
response = requests.get(url)
if not response.ok:
print(('server responded', response.status_code))
else:
soup = BeautifulSoup(response.text, 'html.parser')
except:
print("oops")
requests.exceptions.ConnectionError
return
return soup
if __name__ == "__main__":
main()
它应该打印此搜索的html代码,但我不断收到“ urllib.error.HTTPError:HTTP错误503:服务不可用” 即使我尝试其他方式(目前标记为评论),我也会收到503错误
答案 0 :(得分:1)
您需要在请求中附加适当的用户代理标头,请查看此代码段 像Amazon和reddit这样的网站希望您拥有适当的用户代理,否则它们将以503响应。
from urllib.request import urlopen, Request
import requests
from bs4 import BeautifulSoup
def main():
url = "https://www.amazon.com/s?k=black+watch&s=review-rank&qid=1568506943&ref=sr_st_review-rank"
req = Request(url)
req.add_header('user-agent', 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.20 (KHTML, like Gecko) Chrome/11.0.672.2 Safari/534.20')
client = urlopen(req)
page = client.read()
client.close()
amazon_soup = BeautifulSoup(page, "html.parser")
# amazon_soup = get_page(url)
print(amazon_soup)
def get_page(url):
try:
response = requests.get(url)
if not response.ok:
print(('server responded', response.status_code))
else:
soup = BeautifulSoup(response.text, 'html.parser')
except:
print("oops")
requests.exceptions.ConnectionError
return
return soup
if __name__ == "__main__":
main()