我正在使用Scrapy和Python抓取一个网站。该代码产生七个意外错误。这是我的代码:
from scrapy import Spider
from scrapy.http import Request
import re
import pymysql
import sys
class EventSpider(Spider):
name = 'event' #name of the spider
allowed_domains = ['....com']
start_urls = ['http://....com/...',
'http://....com/....',
'http://....com/.....',
'http://.....com/.....',
'http://www.....com/....',
'http://www.....com/....',
'http://www....com/.....',
'http://www.....com/....',
'http://www......com/....',
'http://www......com/....',
'http://www......com/....',
'http://www......com/...',
'http://www......com/....',
'http://www......com/....',
'http://www......com/...',
'http://www.....com/.....',
'http://www......com/.....']
def parse(self, response):
events = response.xpath('//h2/a/@href').extract()
#events = response.xpath('//a[@class = "event-overly"]').extract()
for event in events:
absolute_url = response.urljoin(event)
yield Request(absolute_url, callback = self.parse_event)
def parse_event(self, response):
title = response.xpath('//title/text()').extract()
''' date = response.xpath('//div/p/text()')[0]. extract()
start_date = re.search("^[0-9]{1,2}\s[A-Z][a-z]{2}(,)\s[0-9]{4}",date)
if(start_date==None):
start_date2 =''
else:
start_date2 = start_date.group(0)
#end_date = response.xpath('//div/p/text()')[0]. extract()
end_date = re.search("\s[0-9]{1,2}\s[A-Z][a-z]{2}(,)\s[0-9]{4}", date)
if(end_date==None):
end_date2=''
else:
end_date2=end_date.group(0)'''
#email = response.xpath('//*[@id="more-email-with-dots"]/@value').extract_first()
#email_final = re.findall("[a-zA-Z0-9_.+-]+@(?!....)[\.[a-zA-Z0-9-.]+",email)
description = response.xpath('//*[@class = "events-discription-block"]//*/text()').extract()
description1 = [w.replace('\r\n', '') for w in description]
description2 = ",".join(description1)
''' time = response.xpath('//div/p/text()')[1]. extract()
end_time = re.search("\s[0-9]{1,2}(:)[0-9]{1,2}(:)[0-9]{1,2}", time)
if(end_time==None):
end_time2=''
else:
end_time2=end_time.group(0)
start_time = re.search("^[0-9]{1,2}(:)[0-9]{1,2}(:)[0-9]{1,2}", time)
if(start_time==None):
start_time2=''
else:
start_time2=start_time.group(0) '''
venue = response.xpath('//*[@id ="more-text-with-dots"]/@value').extract_first()
pin = re.search("\s[0-9]{6}", venue)
if(pin==None):
pin2 = ''
else:
pin2 = pin.group(0)
connection = pymysql.connect (host = "localhost", user = "root", passwd = "Iam90#honest", db = "city_details")
cursor = connection.cursor ()
cursor.execute ("select city, state, country from cities_list")
data = cursor.fetchall ()
for row in data :
found = re.search(row[0], venue)
if(found!=None):
city = row[0]
state = row[1]
country = row[2]
break
else:
city = ''
state = ''
country = ''
creative = response.xpath('//img/@src')[1].extract()
yield{
'title': title,
#'start_date': start_date2,
#'end_date': end_date2,
#'start_time': start_time2,
#'end_time': end_time2,
'venue': venue,
'city': city,
'state': state,
'country': country,
'pin': pin2,
#'email': email_final,
'description': description2,
'creative': creative
}
这是统计数据:
2018-03-23 19:18:30 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 45819,
'downloader/request_count': 109,
'downloader/request_method_count/GET': 109,
'downloader/response_bytes': 1024848,
'downloader/response_count': 109,
'downloader/response_status_count/200': 90,
'downloader/response_status_count/301': 19,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2018, 3, 23, 10, 18, 30, 182504),
'item_scraped_count': 64,
'log_count/DEBUG': 174,
'log_count/ERROR': 7,
'log_count/INFO': 8,
'memusage/max': 54501376,
'memusage/startup': 54501376,
'request_depth_max': 1,
'response_received_count': 90,
'scheduler/dequeued': 105,
'scheduler/dequeued/memory': 105,
'scheduler/enqueued': 105,
'scheduler/enqueued/memory': 105,
'spider_exceptions/TypeError': 7,
'start_time': datetime.datetime(2018, 3, 23, 10, 18, 13, 744056)}
2018-03-23 19:18:30 [scrapy.core.engine] INFO: Spider closed (finished)
更具体地说,显示的错误是: TypeError:期望的字符串或类字节对象。我无法弄清楚错误。我无法理解它为什么以及在哪里产生预期的字符串或类似字节的对象错误。
答案 0 :(得分:1)
正如@FrankMartin所说,venue
的值在该页面中为None
并导致错误。您可以按照以下步骤轻松查看:
https://www.eventsnow.com/events/9238-ipl-2018-srh-vs-royal-challengers-bangalore
scrapy shell https://www.eventsnow.com/events/9238-ipl-2018-srh-vs-royal-challengers-bangalore
第3点的代码。
In [2]: response.xpath('//*[@id ="more-text-with-dots"]/@value').extract_first()
In [3]: