This is my Scrapy code.
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from urlparse import urljoin
import pymongo
import time


# Item holding one scraped review.
class CompItem(scrapy.Item):
    text = scrapy.Field()
    name = scrapy.Field()
    date = scrapy.Field()
    url = scrapy.Field()
    rating = scrapy.Field()
    title = scrapy.Field()
    category = scrapy.Field()
    source = scrapy.Field()
    user_info = scrapy.Field()
    email = scrapy.Field()
    mobile_no = scrapy.Field()
    url_1 = scrapy.Field()
    model_name = scrapy.Field()


class criticspider(CrawlSpider):
    name = "flipkart_reviews"
    allowed_domains = ["flipkart.com"]
    urls = []
    # MongoDB handles (not read in the code shown here).
    connection = pymongo.MongoClient("mongodb://localhost")
    db = connection.electronics
    db_coll = db.flipkart_url
    d = []
    # Hard-coded start URL (the 'urls' list above is unused here).
    start_urls = ['http://www.flipkart.com/samsung-galaxy-note-4/product-reviews/ITMEYFHGFDB75R73?pid=MOBEYAW2RFHQG83F&type=top']

    def parse_start_url(self, response):
        # Every review block carries a review-id attribute.
        sites = response.css('div.review-list div[review-id]')
        model_name = response.xpath('//h1[@class="title"]/text()').re(r'Reviews of (.*?)$')[0].strip().encode('ascii', 'ignore')
        for site in sites:
            item = CompItem()
            item['email'] = None
            item['mobile_no'] = 0
            item['category'] = None
            item['title'] = site.xpath('.//div[contains(@class,"line fk-font-normal bmargin5 dark-gray")]/strong/text()').extract()[0].encode('ascii', 'ignore')
            item['date'] = site.xpath('.//div[contains(@class, "date")]/text()').extract()[0].strip()
            item['model_name'] = model_name
            item['text'] = site.xpath('.//span[contains(@class,"review-text")]/text()').extract()[0]
            # e.g. a title of "4 stars" becomes the float 4.0.
            item['rating'] = float(site.xpath('.//div[contains(@class,"fk-stars")]/@title').extract()[0].split("stars")[0])
            # The reviewer name is the element immediately before the date div.
            item['name'] = ''.join(site.xpath('.//div[contains(@class, "date")]/preceding-sibling::*[1]//text()').extract()).strip()
            item['url'] = response.url
            item['source'] = 3
            yield item
This runs fine on my local machine without any errors, but when I deploy it to AWS it starts failing:
2015-10-05 12:08:26 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2015-10-05 12:09:26 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2015-10-05 12:09:34 [scrapy] DEBUG: Retrying <GET http://www.flipkart.com/samsung-galaxy-note-4/product-reviews/ITMEYFHGFDB75R73?pid=MOBEYAW2RFHQG83F&type=top> (failed 1 times): TCP connection timed out: 110: Connection timed out.
2015-10-05 12:10:26 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2015-10-05 12:11:26 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2015-10-05 12:11:41 [scrapy] DEBUG: Retrying <GET http://www.flipkart.com/samsung-galaxy-note-4/product-reviews/ITMEYFHGFDB75R73?pid=MOBEYAW2RFHQG83F&type=top> (failed 2 times): TCP connection timed out: 110: Connection timed out.
2015-10-05 12:12:26 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2015-10-05 12:13:26 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2015-10-05 12:13:48 [scrapy] DEBUG: Gave up retrying <GET http://www.flipkart.com/samsung-galaxy-note-4/product-reviews/ITMEYFHGFDB75R73?pid=MOBEYAW2RFHQG83F&type=top> (failed 3 times): TCP connection timed out: 110: Connection timed out.
2015-10-05 12:13:48 [scrapy] ERROR: Error downloading <GET http://www.flipkart.com/samsung-galaxy-note-4/product-reviews/ITMEYFHGFDB75R73?pid=MOBEYAW2RFHQG83F&type=top>: TCP connection timed out: 110: Connection timed out.
2015-10-05 12:13:48 [scrapy] INFO: Closing spider (finished)
2015-10-05 12:13:48 [scrapy] INFO: Dumping Scrapy stats:
{'downloader/exception_count': 3,
'downloader/exception_type_count/twisted.internet.error.TCPTimedOutError': 3,
'downloader/request_bytes': 1119,
'downloader/request_count': 3,
'downloader/request_method_count/GET': 3,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2015, 10, 5, 6, 43, 48, 727700),
'log_count/DEBUG': 3,
'log_count/ERROR': 1,
'log_count/INFO': 13,
'scheduler/dequeued': 3,
'scheduler/dequeued/memory': 3,
'scheduler/enqueued': 3,
'scheduler/enqueued/memory': 3,
'start_time': datetime.datetime(2015, 10, 5, 6, 37, 26, 877249)}
2015-10-05 12:13:48 [scrapy] INFO: Spider closed (finished)
The script used to run fine. I have tried various things:
1. Raising DOWNLOAD_DELAY (roughly the settings sketch shown below).
2. Googling the same problem and reading up on web-scraping etiquette.
But all of it was in vain.
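For reference, raising the delay amounted to a change like the following in settings.py; the values here are illustrative rather than the exact ones from my project:

# settings.py -- illustrative values, not a known-good fix
DOWNLOAD_DELAY = 5             # seconds between requests to the same site
RANDOMIZE_DOWNLOAD_DELAY = True
RETRY_TIMES = 5                # default is 2; gives a slow link more chances
DOWNLOAD_TIMEOUT = 180         # the default; raise it if the server is merely slow
USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36'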
Answer 0 (score: 4)
There are a few checks you can run, the most basic being whether the AWS machine can reach flipkart.com at all.
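For instance, fetch the page directly from the EC2 instance to see whether any response comes back. A minimal Python 2 sketch using only the standard library (the URL is the one from your log):

# connectivity_check.py -- run this on the AWS box itself
import urllib2

url = ('http://www.flipkart.com/samsung-galaxy-note-4/product-reviews/'
       'ITMEYFHGFDB75R73?pid=MOBEYAW2RFHQG83F&type=top')
req = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
try:
    resp = urllib2.urlopen(req, timeout=30)
    print resp.getcode(), len(resp.read())  # expect a 200 and a non-trivial body
except Exception as e:
    print 'request failed:', e              # the same timeout points to a network/IP issue

Running scrapy shell with the same URL from that machine is an equivalent check, and it also tells you whether Scrapy's own downloader gets through.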
If the above checks produce a response, it means the server is reachable and the problem lies in how our spider makes its requests. If even these time out, there is very little we can do from the spider's side.
Answer 1 (score: 0)
Maybe your IP is being blocked by the website.
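One way to test that is to send the same request from a different IP. Scrapy's built-in HttpProxyMiddleware honours a 'proxy' key in request.meta, so a minimal sketch looks like this (the proxy address is a placeholder you would replace with a real one):

# proxy_check.py -- illustrative; 'http://1.2.3.4:8080' is a placeholder proxy
import scrapy

class ProxyCheckSpider(scrapy.Spider):
    name = 'proxy_check'

    def start_requests(self):
        url = ('http://www.flipkart.com/samsung-galaxy-note-4/product-reviews/'
               'ITMEYFHGFDB75R73?pid=MOBEYAW2RFHQG83F&type=top')
        # HttpProxyMiddleware picks up the 'proxy' meta key automatically.
        yield scrapy.Request(url, meta={'proxy': 'http://1.2.3.4:8080'},
                             callback=self.parse)

    def parse(self, response):
        self.log('got %d bytes via proxy' % len(response.body))

If the request succeeds through the proxy but still times out when sent directly, the instance's IP (or possibly the whole EC2 address range) is most likely being blocked.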