我正在抓取亚马逊网站。当我运行程序抓取一个共 75 页(约 1200 个产品)的图书类目时,它总是在某个随机页面停止(大约第 10 到 20 页),只抓到 200–400 个产品。以下是我的代码:
import time

import requests
from bs4 import BeautifulSoup
def amazon_cat(max_pages, output_file='valuess.txt'):
    """Scrape book titles from Amazon.in category listing pages.

    Walks pages 1..max_pages of a fixed stripbooks category listing,
    extracts each result's title from the ``data-attribute`` of its
    ``<h2>`` element, appends the titles to *output_file*, and echoes
    them to stdout.

    Args:
        max_pages: Number of listing pages to fetch (inclusive).
        output_file: Path of the text file titles are appended to.
            Defaults to the original hard-coded ``'valuess.txt'``.

    Returns:
        None.
    """
    # Amazon serves a CAPTCHA/"robot check" page to clients without a
    # browser-like User-Agent — that is the usual reason a scrape dies at
    # a seemingly random page with only partial results.
    session = requests.Session()
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/91.0.4472.124 Safari/537.36'
    })
    # Open the output file once, not once per scraped element.
    with open(output_file, 'a') as out:
        for page in range(1, max_pages + 1):
            url = 'https://www.amazon.in/s/ref=sr_pg_2?fst=as%3Aoff&rh=n%3A976389031%2Cp_n_feature_three' \
                '_browse-bin%3A9141482031&page=' + str(page) + '&bbn=976389031&ie=UTF8&qid=1468676695&lo=stripbooks'
            response = session.get(url)
            if response.status_code != 200:
                # Blocked or throttled — report and skip rather than
                # silently parsing an error page.
                print('page %d: HTTP %d, skipping' % (page, response.status_code))
                continue
            soup = BeautifulSoup(response.text, 'html.parser')
            for link in soup.find_all(
                    'h2',
                    {'class': 'a-size-medium a-color-null s-inline s-access-title a-text-normal'}):
                value = link.get('data-attribute')
                # get() returns None when the attribute is absent; the
                # original ``value + '\n'`` then raised TypeError and
                # killed the run mid-scrape.
                if value:
                    out.write(value + '\n')
                    print(value)
            # Pace requests: rapid-fire fetches trigger Amazon's
            # anti-bot throttling.
            time.sleep(1)
if __name__ == '__main__':
    # Run the scraper only when executed as a script, not when imported.
    amazon_cat(75)