大家好,我正在制作Scrapy蜘蛛只会刮掉今天发布的广告。但我认为我的代码有问题,请帮助我。它也应该从广告中抓取电子邮件地址,但它不会刮掉电子邮件。标题,网址和广告位置都很好。
# -*- coding: utf-8 -*-
import scrapy
import re
from datetime import datetime
from datetime import date,timedelta
options = {
'CONCURRENT_REQUESTS': 60,
'CONCURRENT_ITEMS': 10000,
'USER_AGENT': "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24
(KHTML, like Gecko) "
"Chrome/19.0.1055.1 Safari/535.24",
'SW_SAVE_BUFFER': 30,
'DOWNLOAD_DELAY': 5,
'COOKIES_ENABLED': False,
}
current_date = "datetime.today().strftime('%b %d')"
class adscrapperSpider(scrapy.Spider):
name = 'ad-scrapper'
allowed_domains = ['www.craigslist.org']
start_urls = ['https://phoenix.craigslist.org/']
def parse(self, response):
ad = response.xpath('//p[@class="result-info"]')
for ad in ad-scrapper:
title = ad.xpath('a/text()').extract_first()
address = ad.xpath('span[@class="result-meta"]/span[@class="result-
hood"]/text()').extract_first("")[2:-1]
relative_url = ad.xpath('a/@href').extract_first()
absolute_url = response.urljoin(relative_url)
if current_date == datetime.strptime(date, '%b %d').strftime('%b
%d'):
yield{'URL':absolute_url, 'Title':title, 'Address':address,
'Email':email}
else:
print('there was no ad posted yesterday!')