scrapy解析第一页

时间:2015-01-05 02:32:15

标签: python web-scraping scrapy

我正在使用 scrapy 0.24.4,尝试从 ThreatExpert 网站抓取一些信息,并且几乎成功了:我可以获取所有页面上的所有信息,唯独第一页(即 start_url)除外。我已经尝试过 parse_start_url,也尝试过添加规则,但都没能奏效。我确定只是忽略了某个细节,但整个周末都在看它,需要休息一下了。如果有人有任何建议,我将不胜感激。我确实通过在 start_url 中使用一个范围让它跑起来了,但那看起来不太优雅,而且我正在努力学习正确的方法。非常感谢!

import scrapy
import re
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from threatexpert.items import ThreatExpert
import urlparse

class ThreatExpertSpider(scrapy.Spider):
name = 'threatexpert'
start_urls = ["http://www.threatexpert.com/reports.aspx?tf=2&sl=1"]

def parse(self, response):
    print '++++++++++++++++++++++++pull all page links+++++++++++++++++++++++'
    urls = response.xpath('//a[contains(@href, "page")]/@href').extract()
    for url in urls:
        url = urlparse.urljoin(response.url, url)
        self.log('Found follow url: %s' % url)
        yield scrapy.Request(url, callback = self.parse_links)


def parse_links(self, response):
    print '++++++++++++++++++++++++pull item urls++++++++++++++++++++++++++'
    urls = response.xpath('//a[contains(@href, "md5")]/@href').extract()
    for url in urls:
        url = urlparse.urljoin(response.url, url)
        self.log('Found follow url: %s' % url)
        yield scrapy.Request(url, callback = self.parse_items)


def parse_items(self, response):
    self.log("Hi, this is an item page! %s" % response.url)
    item = ThreatExpert()
    item['md5'] = response.xpath('//html/body/ul[1]/ul/ul/li[1]/text()').re(r"File MD5: ([\w, ]+)")
    yield item

2 个答案:

答案 0 :(得分:1)

非常感谢那条引导我完成这项工作的回复!原来只是用错了基类:不应使用 class ThreatExpertSpider(scrapy.Spider),而应使用 class ThreatExpertSpider(CrawlSpider)。我仍然不完全确定它的原理,但它确实有效。我知道该 RTFM,哈哈,不过我正在学习中。如果其他人也在找这个问题的答案,下面的代码对我有用。

import scrapy
import re
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from threatexpert.items import ThreatExpert
import urlparse

class ThreatExpertSpider(CrawlSpider):
    name = 'threatexpert'
    start_urls = ["http://www.threatexpert.com/reports.aspx?tf=3&sl=1"]

    rules = (
        Rule(SgmlLinkExtractor(allow=r'page=\d'), callback='parse_links', follow=True),
        )

    def parse_start_url(self, response):
        print '++++++++++++++++++++++++parse_start_url+++++++++++++++++++++++'
        return self.parse_items(response)
        # urls = response.xpath('//a[contains(@href, "page")]/@href').extract()
        # for url in urls:
        #     url = urlparse.urljoin(response.url, url)
        #     self.log('Found follow url: %s' % url)
        #     yield scrapy.Request(url, callback = self.parse_links)


    def parse_links(self, response):
        print '++++++++++++++++++++++++pull item urls++++++++++++++++++++++++++'
        urls = response.xpath('//a[contains(@href, "md5")]/@href').extract()
        for url in urls:
            url = urlparse.urljoin(response.url, url)
            self.log('Found follow url: %s' % url)
            yield scrapy.Request(url, callback = self.parse_items)


    def parse_items(self, response):
        self.log("Hi, this is an item page! %s" % response.url)
        item = ThreatExpert()
        item['md5'] = response.xpath('//html/body/ul[1]/ul/ul/li[1]/text()').re(r"File MD5: ([\w, ]+)")

        # item['callback'] = response.xpath('//*[contains(text(), "The following Host Names were requested from a host database:")]/following-sibling::ul/li/text()').extract()
        # if item['callback']:
        #     item['callback'] = response.xpath('//*[contains(text(), "The following Host Names were requested from a host database:")]/following-sibling::ul/li/text()').extract()
        # else:
        #     del item['callback']
        yield item

答案 1 :(得分:0)

请参考以下代码,它对我有用。如果您有任何疑问,请在评论中告知。

from scrapy.spider import BaseSpider
from scrapy.http import Request
import re
from urlparse import urljoin
from scrapy.selector import HtmlXPathSelector
from threatexpert.items import ThreatExpert
import inspect
class usmallspider(BaseSpider):
    name = 'threatexpert'
    start_urls = ["http://www.threatexpert.com/reports.aspx?tf=2&sl=1"]

def parse(self, response):
    hxs = HtmlXPathSelector(response)
    urls = response.xpath('//a[contains(@href, "page")]/@href').extract()
    for url in urls:
        url = urljoin(response.url, url)
        print url
        if url:
            yield Request(url, callback=self.parse_links)

def parse_links(self, response):
    hxs = HtmlXPathSelector(response)
    urls = response.xpath('//a[contains(@href, "md5")]/@href').extract()
    for url in urls:
        url = urljoin(response.url, url)
        if url:
            yield Request(url, callback = self.parse_items)


def parse_items(self, response):
    itm=[]
    item = MallUk1Item()
    hxs = HtmlXPathSelector(response)
    item['md5'] = response.xpath('//html/body/ul[1]/ul/ul/li[1]/text()').re(r"File MD5: ([\w, ]+)")
    itm.append(item)
    return itm