My Scrapy parse_item callback does not seem to be getting called.
I am scraping a website much like the Yellow Pages (I can't name it here). This is my code, with some details removed that I considered unnecessary:
    import re
    import unicodedata
    import urlparse  # Python 2 stdlib; the original code is Python 2

    from scrapy.spiders import CrawlSpider, Rule
    from scrapy.linkextractors import LinkExtractor

    from myproject.items import TheItem  # the author's item class; import path assumed

    # convert unicode from html to string
    fn = lambda t: unicodedata.normalize('NFKD', t).encode('ascii', 'ignore')

    # invalid_regex was among the details removed from the snippet; it must be
    # a compiled pattern with two capture groups, used by clean_uri() below
    invalid_regex = re.compile(r'...')  # placeholder, original pattern not shown


    class NowfloatsSpiderSpider(CrawlSpider):
        name = '...'               # redacted by the author
        allowed_domains = ['...']  # redacted by the author
        start_url = '...'          # redacted by the author
        start_urls = ['...']       # redacted by the author

        rules = [
            Rule(LinkExtractor(allow=[r'[0-9a-zA-Z]+\/[-0-9a-zA-Z]+\-Stores\-in\-[a-z]+.*?\/',
                                      r'[0-9a-zA-Z]+\/[0-9a-zA-Z]+\-Stores\-in\-[a-z]+.*?\/',
                                      r'[0-9a-zA-Z]+\/Stores\-in\-[a-z]+.*?\/',
                                      r'[0-9a-zA-Z]+\/Stores\-in\-[a-z]+.*?\/\?page=\d+',
                                      r'[0-9a-zA-Z]+\/[0-9a-zA-Z]+\-Stores\-in\-[a-z]+.*?\/\?page=\d+'],
                               allow_domains=allowed_domains),
                 callback='parse_item', follow=True, process_links='clean_uri'),
            Rule(LinkExtractor(allow=[r'\/[0-9a-zA-Z]+\/[0-9a-zA-Z]+\/'],
                               allow_domains=allowed_domains),
                 follow=True, process_links='clean_uri'),
        ]

        def clean_uri(self, links):
            """
            Cleans invalid URL links generated by the LinkExtractor.
            """
            for link in links:
                search = invalid_regex.match(link.url)
                if search:
                    subroute1, subroute2 = search.groups()
                    link.url = urlparse.urljoin(self.start_url, '/'.join([subroute1, subroute2]))
            return links

        def parse_item(self, response):
            """
            Parses the contents of a listing page.
            """
            self.logger.info('===================== INSIDE PARSING FUNCTION ===================== %s', response.url)
            item = TheItem()  # defines the fields I want
            for banner in response.css('.store-box'):
                print "======= banners ==========="
                # print banner
                company_website_name = banner.css('.store-banner h3::text').extract_first().strip()
                company_website_url = fn(banner.css('.store-banner > a').xpath('./@href').extract_first())
                category = fn(banner.css('.store-banner a').xpath('./span/@title')[0].extract())
                city = fn(banner.css('.store-banner a').xpath('./span/@title')[1].extract())
                company_num = fn(banner.css('.store-banner .telephone::text').extract_first().strip()).replace(' ', '')
                tags = map(lambda x: fn(x).strip(), banner.css('.posted-item ul li a::text').extract())
                print {
                    'company_website_name': company_website_name,
                    'company_website_url': company_website_url,
                    'category': category,
                    'city': city,
                    'company_num': company_num,
                    'tags': tags
                }
                item['company_website_name'] = company_website_name
                item['company_website_url'] = company_website_url
                item['category'] = category
                item['city'] = city
                item['company_num'] = company_num
                item['tags'] = tags
            # note: the loop reuses the same item, so only the last banner's data
            # is returned; yielding one item per banner is the more usual pattern
            return item
First of all, is this the right way to hook up the callback? If so, why is it never invoked — nothing gets printed when the spider runs.
Maybe someone can point out what I am doing wrong here?
There are similar questions on here, but I can't tell where the problem originates in the first place.
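One way to sanity-check whether the rule patterns match anything at all is to test the LinkExtractor against a real page in scrapy shell. A sketch, assuming the pattern is one of those from the spider above and the page is one of the site's listing pages:

    # run `scrapy shell <a listing page URL>` first, then:
    from scrapy.linkextractors import LinkExtractor

    le = LinkExtractor(allow=[r'[0-9a-zA-Z]+\/Stores\-in\-[a-z]+.*?\/'])
    links = le.extract_links(response)
    print links  # an empty list means the rule never fires, so parse_item is never called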
Answer 0 (score: 0)

The problems were:

1. allow in LinkExtractor(allow=[]) contained no absolute-URL regexes. It should have, and I had only given partial ones. So something like: LinkExtractor(allow=['full_site_address_to_scrape/<regex>'])
2. With follow=True I was being redirected, so make sure handle_httpstatus_list = [301, 302] is defined in the class.
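Putting both fixes together, a minimal sketch of what this answer describes — example.com stands in for the redacted site, and the spider name and regex are illustrative:

    from scrapy.spiders import CrawlSpider, Rule
    from scrapy.linkextractors import LinkExtractor

    class FixedSpider(CrawlSpider):
        name = 'fixed_spider'              # hypothetical name
        allowed_domains = ['example.com']  # stand-in for the redacted domain
        start_urls = ['https://example.com/']

        # pass 301/302 responses through to the spider callbacks,
        # per the answer's suggestion
        handle_httpstatus_list = [301, 302]

        rules = [
            # the allow regex is matched against the absolute URL, so anchor
            # it to the full site address rather than a partial path
            Rule(LinkExtractor(allow=[r'https://example\.com/[0-9a-zA-Z]+/Stores-in-[a-z]+.*?/']),
                 callback='parse_item', follow=True),
        ]

        def parse_item(self, response):
            self.logger.info('parse_item fired for %s', response.url)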