I've been struggling with the following problem for a few hours now. I'm trying to scrape https://www.upwork.com/jobs/_~0180b9eef40aafe057/ (and similar postings).
My XPath expression works both in the Scrapy shell and in an XPath validator, but not in my code.
When I write the response out to a text file with:

    with open('response.html', 'w+') as f:
        f.write(response.body)

and then test the XPath against that HTML with http://videlibri.sourceforge.net/cgi-bin/xidelcgi, it works fine.
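The same check can also be run through Scrapy's own Selector, so the test goes through the same lxml parser the spider uses rather than an online validator. A minimal sketch:

    # Re-run the XPath over the saved HTML with Scrapy's Selector.
    from scrapy.selector import Selector

    with open('response.html') as f:
        sel = Selector(text=f.read().decode('utf-8'))

    for item in sel.xpath("//p[strong = 'About the Client']/following-sibling::p"):
        print " ".join(map(unicode.strip, item.xpath(".//text()").extract()))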
This works in the shell:

    for item in response.xpath("//p[strong = 'About the Client']/following-sibling::p"):
        print " ".join(map(unicode.strip, item.xpath(".//text()").extract()))
        print 'Succes!'
But when I use it in my Scrapy spider, it returns nothing. I've tried many different solutions, but nothing seems to work.
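One way to see exactly what the spider receives (for instance, whether the logged-in page differs from what the shell fetched) is Scrapy's open_in_browser helper. A minimal debugging sketch; the spider name here is just illustrative:

    # Open the response the spider actually downloaded in a browser,
    # to compare it against what the Scrapy shell or xidelcgi was given.
    import scrapy
    from scrapy.utils.response import open_in_browser

    class DebugSpider(scrapy.Spider):
        name = 'debug'
        start_urls = ['https://www.upwork.com/jobs/_~0180b9eef40aafe057/']

        def parse(self, response):
            open_in_browser(response)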
EDIT: added the full code:
    from scrapy.contrib.spiders.init import InitSpider
    from scrapy.http import Request, FormRequest
    import scrapy
    from scrapy.spiders import CrawlSpider, Rule
    from scrapy.linkextractors import LinkExtractor
    from ypscrape.items import item1
    from scrapy.loader.processors import Join, MapCompose, TakeFirst
    from scrapy.loader import ItemLoader
    import arrow
    import logging
    import re


    class MySpider(CrawlSpider):
        # Login credentials for the account, so more details are available.
        # RSS token for the RSS feed which pulls the new links.
        rsstoken = 'REDACTED'
        user = 'REDACTED'
        password = 'REDACTED'

        name = 'dataup'
        allowed_domains = ['upwork.com']
        login_page = 'http://www.upwork.com/login'
        rssurl = 'https://www.upwork.com/ab/feed/jobs/rss?api_params=1&q=&securityToken=' + rsstoken

        # Can probably be removed.
        rules = (
            Rule(LinkExtractor(allow=r'upwork\.com\/jobs\/.*?_%'), callback='parse_item', follow=False),
        )

        # Called when the spider is started; initiates the login request.
        def start_requests(self):
            self.log("start request started")
            yield Request(
                url=self.login_page,
                callback=self.login,
                dont_filter=True
            )

        # Use the RSS feed to gather the newest links.
        def get_urls_from_rss(self, response):
            urllist = []
            content = response
            self.log("Get rss from url")
            #print str(content.body)
            gathered_urls = re.findall('(https\:\/\/.*?\/jobs\/.*?)source=rss', str(content.body))
            # Request the URLs and send them to parse_item.
            for url in gathered_urls:
                if url not in urllist:
                    # Check if URL has not been visited before. ADD THIS
                    urllist.append(url)
                    yield scrapy.Request(url, callback=self.parse_item)

        def login(self, response):
            """Generate a login request."""
            self.log("login request started")
            return FormRequest.from_response(
                response, formname='login',
                formdata={'login[username]': self.user, 'login[password]': self.password},
                callback=self.check_login_response, method="POST")

        def check_login_response(self, response):
            """Check the response returned by a login request to see if we are
            successfully logged in.
            """
            self.log("check request started")
            if "<title>My Job Feed</title>" in response.body:
                self.log("Successfully logged in. Let's start crawling!")
                # Now the crawling can begin..
                yield scrapy.Request(self.rssurl, callback=self.get_urls_from_rss)
            else:
                self.log("Bad times :( Logging in failed")
                # Something went wrong, we couldn't log in, so nothing happens.
                #return self.initialized()

        def parse_item(self, response):
            # Scrape data from the page.
            self.logger.info('Crawling item page! %s', response.url)
            with open('response.html', 'w+') as f:
                f.write(response.body)
            for item in response.xpath("//p[strong = 'About the Client']/following-sibling::p"):
                print " ".join(map(unicode.strip, item.xpath(".//text()").extract()))
                print 'Bingo'
            l = ItemLoader(item1(), response)
            l.add_value('timestamp', arrow.utcnow().format('YYYY-MM-DD HH:mm'))
            l.add_xpath('category1', '//*[@id="layout"]/div[2]/div[3]/div[1]/a/text()')
            return l.load_item()
EDIT 2: I think I found the solution. Replacing the XPath with

    //p[strong]/strong

seems to solve the problem. So what was the issue? I think it is encoding-related: the expression couldn't find 'About the Client' because in the response object the spider received, the heading looked like 'About the Client' but contained extra spaces or other encoding-related characters. Thanks for the help.
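If the culprit really is a non-breaking space (U+00A0) hiding in the heading, the comparison can also be made tolerant of it instead of dropped. A minimal sketch, assuming the visible text is otherwise exactly 'About the Client':

    # Map non-breaking spaces to ordinary spaces and collapse whitespace
    # before comparing, so either variant of the heading matches.
    query = (u"//p[strong[normalize-space(translate(., '\u00a0', ' ')) "
             u"= 'About the Client']]/following-sibling::p")
    for item in response.xpath(query):
        print " ".join(map(unicode.strip, item.xpath(".//text()").extract()))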
Answer 0 (score: 0)
I quickly turned this into a spider:
    import scrapy


    class UpworkSpider(scrapy.Spider):
        name = "upwork"
        allowed_domains = ["upwork.com"]
        start_urls = [
            "https://www.upwork.com/jobs/_~0180b9eef40aafe057/",
        ]

        def parse(self, response):
            for item in response.xpath("//p[strong = 'About the Client']/following-sibling::p"):
                print " ".join(map(unicode.strip, item.xpath(".//text()").extract()))
                print 'Succes!'
Then I ran it with:

    $ scrapy runspider spider.py

and got:
    Croatia Kastel Sucurac
    04:56 PM
    Succes!
    4
    Jobs Posted 0% Hire Rate,
    4 Open Jobs
    Succes!
in the output.