我是网络抓取领域的新手。因此希望这个问题是明确的。
我在互联网上找到了一个教程,该教程基于给定的ASIN(唯一的Amazon编号)来刮取Amazon数据。参见:https://www.scrapehero.com/tutorial-how-to-scrape-amazon-product-details-using-python/
运行此代码时(我对代码进行了一些调整),我遇到一个问题:每次运行得到的结果都不同(即使两次运行只间隔5秒)。在我的示例中,第一次能找到标题,但5秒钟后再运行,结果却为NULL。
我认为原因是因为我通过Google Chrome搜索了XPATH,并且在代码的开头,有
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'}
我的问题:如何稳定地抓取内容? (例如:通过使用ASIN号获取网页的真实结果)
下面的代码进行再现。您可以通过命令行运行脚本:
python script_name.py
非常感谢您的帮助!
脚本:
from lxml import html
import csv,os,json
import requests
#from exceptions import ValueError
from time import sleep
def AmzonParser(url):
    """Fetch an Amazon product page and extract its title.

    Parameters
    ----------
    url : str
        Full product URL, e.g. "http://www.amazon.com/dp/<ASIN>".

    Returns
    -------
    dict | None
        {'TITLE': <title string or None>} on success, or None when every
        attempt failed (e.g. Amazon kept serving a captcha/block page).
    """
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'}
    # Bounded retry loop.  The original fetched the page ONCE and then
    # re-parsed the very same (possibly captcha) response inside
    # `while True` forever — the request must be repeated to ever get a
    # different result.
    for attempt in range(5):
        try:
            # Re-fetch on every attempt so a throttled/captcha response
            # can be replaced by a real product page.
            page = requests.get(url, headers=headers)
            # A non-200 status usually means Amazon blocked the request;
            # check BEFORE parsing, not after.
            if page.status_code != 200:
                raise ValueError('captha')
            doc = html.fromstring(page.content)
            # Title
            raw_title = doc.xpath('//*[@id="productTitle"]/text()')
            # Collapse runs of whitespace into single spaces.
            title = ' '.join(''.join(raw_title).split()) if raw_title else None
            return {'TITLE': title}
        except Exception as e:
            # Log and back off, then retry with a fresh request.
            print(e)
            sleep(5)
    return None  # all attempts failed
def ReadAsin():
    """Scrape each ASIN in a fixed list and dump the results to JSON.

    Writes 'data_scraped_data.json' in the current working directory.
    """
    # AsinList = csv.DictReader(open(os.path.join(os.path.dirname(__file__),"Asinfeed.csv")))
    AsinList = [
        'B00AEINQ9K',
        'B00JWP8F3I']
    extracted_data = []
    for asin in AsinList:
        url = "http://www.amazon.com/dp/" + asin
        print("Processing: " + url)
        extracted_data.append(AmzonParser(url))
        sleep(5)  # be polite: pause between requests
    # 'with' guarantees the file is closed and flushed even on error
    # (the original opened the file and never closed it).
    with open('data_scraped_data.json', 'w') as f:
        json.dump(extracted_data, f, indent=4)


if __name__ == "__main__":
    ReadAsin()