就像一个例子,我正在使用Yelp。 Yelp不会列出电子邮件,因此如果您想获取Yelp电子邮件,则需要抓取一个列表,然后向该列表网站发出请求并抓取该电子邮件。目前,我正在抓取列表网站的主页,如果该页面上没有列出电子邮件,电话号码等,那么我加载联系页面并检查。我遇到的问题是我正在寻找的信息并不总是在这些页面上。最好是加载包含某些关键字的网站上的所有链接,然后创建一个方法,查看所有这些页面中的电子邮件,电话号码等,并在找到时返回它们。这样做的好方法是什么?以下是我目前正在浏览网站页面的方式:
# Crawl rules for the Yelp listing spider:
#  - follow business-detail links (URLs containing "biz") inside search-result
#    entries and hand each business page to parse_item;
#  - follow pagination links (URLs containing "start") to walk result pages.
rules = (
Rule(LinkExtractor(allow=r'biz', restrict_xpaths='//*[contains(@class, "natural-search-result")]//a[@class="biz-name"]'), callback='parse_item', follow=True),
Rule(LinkExtractor(allow=r'start', restrict_xpaths='//a[contains(@class, "prev-next")]'), follow=True)
)
def parse_item(self, response):
    """Scrape phone/state/company/website from a Yelp business page.

    If the business lists an external website, yield a Request to that
    site (carrying the partially-filled item in request.meta) so later
    callbacks can hunt for an email address; otherwise yield the item
    as-is.
    """
    item = YelpscraperItem()
    item['phone'] = self.beautify(response.xpath('//*[@class="biz-phone"]/text()').extract())
    item['state'] = self.beautify(response.xpath('//span[@itemprop="addressRegion"]/text()').extract())
    item['company'] = self.beautify(response.xpath('//h1[contains(@class, "biz-page-title")]/text()').extract())
    item['website'] = self.beautify(response.xpath('//div[@class="biz-website"]/a/text()').extract())
    site = item['website']
    if not (type(site) is list and site):
        # No external website listed -- the item is as complete as it gets.
        yield item
        return
    req = Request(self.checkScheme(site[0]), callback=self.parse_home_page, dont_filter=True)
    req.meta['item'] = item
    yield req
def parse_home_page(self, response):
    """Look for a ``mailto:`` link on the business's own home page.

    If an email is found it is formatted into the item and the item is
    yielded. Otherwise, follow the first link whose href contains
    'contact' (if any) to keep searching; as a last resort yield the
    item without an email.
    """
    try:
        i = response.meta['item']
        sel = Selector(response)
        # substring-after() strips the leading 'mailto:' from the href.
        rawEmail = sel.xpath("substring-after(//a[starts-with(@href, 'mailto:')]/@href, 'mailto:')").extract()
        # BUGFIX: guard on the list being non-empty before indexing --
        # an empty extract() result used to raise an uncaught IndexError
        # (only TypeError is handled below).
        if type(rawEmail) is list and rawEmail and '@' in rawEmail[0]:
            i = self.format_email(rawEmail, i, "Home Page (Link)")
            yield i
        else:
            rawContactPage = response.xpath("//a[contains(@href, 'contact')]/@href").extract()
            if type(rawContactPage) is list and rawContactPage:
                # Contact links are often relative; resolve against the page URL.
                contactPage = urlparse.urljoin(response.url, rawContactPage[0].strip())
                request = Request(contactPage, callback=self.parse_contact_page, dont_filter=True)
                request.meta['item'] = i
                # Keep the home-page response so the contact-page callback
                # can fall back to scanning its body text for an address.
                request.meta['home-page-response'] = response
                yield request
            else:
                yield i
    except TypeError as er:
        print(er)
def parse_contact_page(self, response):
    """Extract an email from the contact page, with layered fallbacks.

    Search order: mailto link on the contact page -> email-looking text
    on the contact page -> email-looking text on the home page (kept in
    request.meta by parse_home_page) -> the domain's WHOIS record.
    Always yields the item, formatted with whichever source hit first.
    """
    try:
        i = response.meta['item']
        homePageResponse = response.meta['home-page-response']
        rawEmail = response.xpath("substring-after(//a[starts-with(@href, 'mailto:')]/@href, 'mailto:')").extract()
        # BUGFIX: both branches below used to index rawEmail[0] without
        # checking the list was non-empty, raising an uncaught IndexError.
        # An empty list now takes the same fallback path as an empty string.
        if type(rawEmail) is list and rawEmail and '@' in rawEmail[0]:
            i = self.format_email(rawEmail, i, "Contact Page (Link)")
        elif type(rawEmail) is list and (not rawEmail or rawEmail[0] == ''):
            # No mailto link: scan the contact page's text for an address.
            rawEmail = response.xpath('//body').re(r'[a-zA-Z0-9\.\-+_]+@[a-zA-Z0-9\.\-+_]+\.[A-Za-z]{2,3}')
            if type(rawEmail) is list and rawEmail:
                i = self.format_email(rawEmail, i, "Contact Page (Text)")
            else:
                # Then the home page's text.
                rawEmail = homePageResponse.xpath('//body').re(r'[a-zA-Z0-9\.\-+_]+@[a-zA-Z0-9\.\-+_]+\.[A-Za-z]{2,3}')
                if type(rawEmail) is list and rawEmail:
                    i = self.format_email(rawEmail, i, "Home Page (Text)")
                else:
                    # Last resort: the WHOIS record for the domain.
                    rawEmail = [self.get_whois_email(i)]
                    i = self.format_email(rawEmail, i, "Whois Page")
        yield i
    except TypeError as er:
        print(er)
def get_whois_email(self, i):
    """Fall back to the domain's WHOIS record to find a contact email.

    Prefers an address containing the business's own domain; otherwise
    accepts any address matching a domain in ``self.whiteListed``.
    Returns ``""`` when nothing suitable is found.
    """
    email = ""
    try:
        if 'website' in i.keys():
            website = i['website']
            if type(website) is list:
                website = i['website'][0].lower()
            w = whois.whois(website)
            # BUGFIX: w.emails can be None when the registrar hides
            # contact data; iterating None raised an uncaught TypeError.
            for whoisEmail in (w.emails or []):
                whoisEmail = whoisEmail.lower()
                if website in whoisEmail:
                    email = whoisEmail
                    # Own-domain address is the best match; stop so a later
                    # white-listed address cannot overwrite it.
                    break
                for domain in self.whiteListed:
                    if domain in whoisEmail:
                        email = whoisEmail
    except IndexError as er:
        log.msg("Whois Email IndexError:")
    return email
答案 0（得分：0）
这就是Scrapy的工作方式,因为它基于Twisted,一个异步框架。每个已爬网的页面都由一个回调处理。
我认为,要将信息从一个回调传递到另一个回调,唯一的方法就是像你现在这样使用请求的元属性(Request.meta)。