How to tell whether a Django Celery task is correctly running a Scrapy spider

Time: 2015-07-29 00:56:13

Tags: python django web-scraping scrapy celery

I have written a Scrapy spider that I run inside a Django Celery task. When I start the worker with the command python manage.py celery worker --loglevel=info from this tutorial, the task runs in the terminal and the Scrapy log seems to start up, but shortly after the log lines begin to appear on screen the Celery output seems to take over the terminal window. I am still new to Celery, so I cannot tell how far the task actually got. Here is the code for the tasks.py script and for the spider file (which contains code I got from this SO post).

tasks.py

from celery.registry import tasks
from celery.task import Task


from django.template.loader import render_to_string
from django.utils.html import strip_tags

from django.core.mail import EmailMultiAlternatives


from ticket_city_scraper.ticket_city_scraper.spiders.tc_spider import spiderCrawl
from celery import shared_task

@shared_task
def crawl():
    return spiderCrawl()
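
For reference, the worker command only starts a process that waits for jobs, so the task still has to be enqueued from somewhere (a view, the Django shell, etc.). A minimal sketch of triggering and checking the task from python manage.py shell while the worker is running, assuming the module path comparison.tasks shown in the worker output below:

# run inside `python manage.py shell` while the worker is running
from comparison.tasks import crawl

result = crawl.delay()           # enqueue the task; returns an AsyncResult right away
print result.state              # changes from PENDING to SUCCESS/FAILURE once the worker has run it
print result.get(timeout=300)   # blocks until the task finishes (or re-raises its exception)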

The spider file (the relevant code is at the bottom)

import scrapy
import re
import json
from scrapy.crawler import CrawlerProcess, Crawler  # Crawler is used by UrlCrawlerScript below
from scrapy import Request, signals  # signals.spider_closed is used by UrlCrawlerScript below
from scrapy.contrib.spiders import CrawlSpider , Rule
from scrapy.selector import HtmlXPathSelector
from scrapy.selector import Selector
from scrapy.contrib.loader import ItemLoader
from scrapy.contrib.loader import XPathItemLoader
from scrapy.contrib.loader.processor import Join, MapCompose
from comparison.ticket_city_scraper.ticket_city_scraper.items import ComparatorItem
from urlparse import urljoin

from scrapy.utils.project import get_project_settings
from scrapy.crawler import CrawlerRunner
from twisted.internet import reactor, defer
from scrapy.utils.log import configure_logging


from billiard import Process




bandname = raw_input("Enter bandname\n")
tc_url = "https://www.ticketcity.com/concerts/" + bandname + "-tickets.html" 

class MySpider3(CrawlSpider):
    handle_httpstatus_list = [416]
    name = 'comparator'
    allowed_domains = ["www.ticketcity.com"]

    start_urls = [tc_url]
    tickets_list_xpath = './/div[@class = "vevent"]'
    def create_link(self, bandname):
        tc_url = "https://www.ticketcity.com/concerts/" + bandname + "-tickets.html"  
        self.start_urls = [tc_url]
        #return tc_url      

    tickets_list_xpath = './/div[@class = "vevent"]'

    def parse_json(self, response):
        loader = response.meta['loader']
        jsonresponse = json.loads(response.body_as_unicode())
        ticket_info = jsonresponse.get('B')
        price_list = [i.get('P') for i in ticket_info]
        if len(price_list) > 0:
            str_Price = str(price_list[0])
            ticketPrice = unicode(str_Price, "utf-8")
            loader.add_value('ticketPrice', ticketPrice)
        else:
            ticketPrice = unicode("sold out", "utf-8")
            loader.add_value('ticketPrice', ticketPrice)
        return loader.load_item()

    def parse_price(self, response):
        print "parse price function entered \n"
        loader = response.meta['loader']
        event_City = response.xpath('.//span[@itemprop="addressLocality"]/text()').extract() 
        eventCity = ''.join(event_City) 
        loader.add_value('eventCity' , eventCity)
        event_State = response.xpath('.//span[@itemprop="addressRegion"]/text()').extract() 
        eventState = ''.join(event_State) 
        loader.add_value('eventState' , eventState) 
        event_Date = response.xpath('.//span[@class="event_datetime"]/text()').extract() 
        eventDate = ''.join(event_Date)  
        loader.add_value('eventDate' , eventDate)    
        ticketsLink = loader.get_output_value("ticketsLink")
        json_id_list= re.findall(r"(\d+)[^-]*$", ticketsLink)
        json_id=  "".join(json_id_list)
        json_url = "https://www.ticketcity.com/Catalog/public/v1/events/" + json_id + "/ticketblocks?P=0,99999999&q=0&per_page=250&page=1&sort=p.asc&f.t=s&_=1436642392938"
        yield scrapy.Request(json_url, meta={'loader': loader}, callback = self.parse_json, dont_filter = True) 

    def parse(self, response):
        """
        # """
        selector = HtmlXPathSelector(response)
        # iterate over tickets
        for ticket in selector.select(self.tickets_list_xpath):
            loader = XPathItemLoader(ComparatorItem(), selector=ticket)
            # define loader
            loader.default_input_processor = MapCompose(unicode.strip)
            loader.default_output_processor = Join()
            # iterate over fields and add xpaths to the loader
            loader.add_xpath('eventName' , './/span[@class="summary listingEventName"]/text()')
            loader.add_xpath('eventLocation' , './/div[@class="divVenue location"]/text()')
            loader.add_xpath('ticketsLink' , './/a[@class="divEventDetails url"]/@href')
            #loader.add_xpath('eventDateTime' , '//div[@id="divEventDate"]/@title') #datetime type
            #loader.add_xpath('eventTime' , './/*[@class = "productionsTime"]/text()')

            print "Here is ticket link \n" + loader.get_output_value("ticketsLink")
            #sel.xpath("//span[@id='PractitionerDetails1_Label4']/text()").extract()
            ticketsURL = "https://www.ticketcity.com/" + loader.get_output_value("ticketsLink")
            ticketsURL = urljoin(response.url, ticketsURL)
            yield scrapy.Request(ticketsURL, meta={'loader': loader}, callback = self.parse_price, dont_filter = True)

#Code to run spider from celery task script
class UrlCrawlerScript(Process):
    def __init__(self, spider):
        Process.__init__(self)
        settings = get_project_settings()
        self.crawler = Crawler(settings)
        self.crawler.configure()
        self.crawler.signals.connect(reactor.stop, signal = signals.spider_closed)
        self.spider = spider

    def run(self):
        self.crawler.crawl(self.spider)
        self.crawler.start()
        reactor.run()

def spiderCrawl():
   # settings = get_project_settings()
   # settings.set('USER_AGENT','Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)')
   # process = CrawlerProcess(settings)
   # process.crawl(MySpider3)
   # process.start()
   spider = MySpider3()
   crawler = UrlCrawlerScript(spider)
   crawler.start()
   crawler.join()
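
For comparison, the commented-out lines above hint at the simpler CrawlerProcess route. Here is a rough sketch of what that would look like when wrapped in a billiard Process (so the Twisted reactor runs outside the Celery worker process); this is only a sketch against the Scrapy 1.0 API, not the code I am currently running, and the helper name _run_crawl is just illustrative:

# Sketch only: same spiderCrawl entry point, but using CrawlerProcess in a
# child process instead of driving Crawler/reactor by hand.
from billiard import Process
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

def _run_crawl():
    settings = get_project_settings()
    process = CrawlerProcess(settings)
    process.crawl(MySpider3)
    process.start()           # blocks until the crawl finishes

def spiderCrawl():
    p = Process(target=_run_crawl)
    p.start()
    p.join()                  # the Celery task returns once the spider is done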

I tried to write the code so that the user could type the text into a form and it would be joined onto the URL, but for now I am using raw_input to get the user's input. Does something need to be added to the code for the task to run to completion? Any help/code would be appreciated, thanks.
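
Since the goal is to take the value from a form, one pattern I am considering is to pass the band name to the task as an argument and on to the spider as a spider argument, instead of calling raw_input at import time (the worker output below shows the "Enter bandname" prompt firing while the spider module is imported). A rough sketch showing only the changed pieces, with illustrative names:

# tasks.py -- whatever handles the form would enqueue crawl.delay(bandname)
from celery import shared_task

@shared_task
def crawl(bandname):
    return spiderCrawl(bandname)

# tc_spider.py -- build the start URL from a spider argument instead of raw_input
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

class MySpider3(CrawlSpider):
    name = 'comparator'
    allowed_domains = ["www.ticketcity.com"]

    def __init__(self, bandname=None, *args, **kwargs):
        super(MySpider3, self).__init__(*args, **kwargs)
        self.start_urls = [
            "https://www.ticketcity.com/concerts/" + bandname + "-tickets.html"
        ]

def spiderCrawl(bandname):
    settings = get_project_settings()
    process = CrawlerProcess(settings)
    process.crawl(MySpider3, bandname=bandname)   # extra kwargs are passed to the spider's __init__
    process.start()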

EDIT

Terminal window after running the command

(trydjango18)elijah@elijah-VirtualBox:~/Desktop/trydjango18/src2/trydjango18$ python manage.py celery worker --loglevel=info
/home/elijah/Desktop/trydjango18/trydjango18/local/lib/python2.7/site-packages/django/core/management/base.py:259: RemovedInDjango19Warning: "requires_model_validation" is deprecated in favor of "requires_system_checks".
  RemovedInDjango19Warning)

/home/elijah/Desktop/trydjango18/trydjango18/local/lib/python2.7/site-packages/celery/app/defaults.py:251: CPendingDeprecationWarning: 
    The 'BROKER_VHOST' setting is scheduled for deprecation in     version 2.5 and removal in version v4.0.     Use the BROKER_URL setting instead

  alternative='Use the {0.alt} instead'.format(opt))

/home/elijah/Desktop/trydjango18/trydjango18/local/lib/python2.7/site-packages/celery/app/defaults.py:251: CPendingDeprecationWarning: 
    The 'BROKER_HOST' setting is scheduled for deprecation in     version 2.5 and removal in version v4.0.     Use the BROKER_URL setting instead

  alternative='Use the {0.alt} instead'.format(opt))

/home/elijah/Desktop/trydjango18/trydjango18/local/lib/python2.7/site-packages/celery/app/defaults.py:251: CPendingDeprecationWarning: 
    The 'BROKER_USER' setting is scheduled for deprecation in     version 2.5 and removal in version v4.0.     Use the BROKER_URL setting instead

  alternative='Use the {0.alt} instead'.format(opt))

/home/elijah/Desktop/trydjango18/trydjango18/local/lib/python2.7/site-packages/celery/app/defaults.py:251: CPendingDeprecationWarning: 
    The 'BROKER_PASSWORD' setting is scheduled for deprecation in     version 2.5 and removal in version v4.0.     Use the BROKER_URL setting instead

  alternative='Use the {0.alt} instead'.format(opt))

/home/elijah/Desktop/trydjango18/trydjango18/local/lib/python2.7/site-packages/celery/app/defaults.py:251: CPendingDeprecationWarning: 
    The 'BROKER_PORT' setting is scheduled for deprecation in     version 2.5 and removal in version v4.0.     Use the BROKER_URL setting instead

  alternative='Use the {0.alt} instead'.format(opt))

/home/elijah/Desktop/trydjango18/src2/trydjango18/comparison/ticket_city_scraper/ticket_city_scraper/spiders/tc_spider.py:6: ScrapyDeprecationWarning: Module `scrapy.contrib.spiders` is deprecated, use `scrapy.spiders` instead
  from scrapy.contrib.spiders import CrawlSpider , Rule

/home/elijah/Desktop/trydjango18/src2/trydjango18/comparison/ticket_city_scraper/ticket_city_scraper/spiders/tc_spider.py:9: ScrapyDeprecationWarning: Module `scrapy.contrib.loader` is deprecated, use `scrapy.loader` instead
  from scrapy.contrib.loader import ItemLoader

/home/elijah/Desktop/trydjango18/src2/trydjango18/comparison/ticket_city_scraper/ticket_city_scraper/spiders/tc_spider.py:11: ScrapyDeprecationWarning: Module `scrapy.contrib.loader.processor` is deprecated, use `scrapy.loader.processors` instead
  from scrapy.contrib.loader.processor import Join, MapCompose

Enter bandname
awolnation
/home/elijah/Desktop/trydjango18/trydjango18/local/lib/python2.7/site-packages/celery/apps/worker.py:161: CDeprecationWarning: 
Starting from version 3.2 Celery will refuse to accept pickle by default.

The pickle serializer is a security concern as it may give attackers
the ability to execute any command.  It's important to secure
your broker from unauthorized access when using pickle, so we think
that enabling pickle should require a deliberate action and not be
the default choice.

If you depend on pickle then you should set a setting to disable this
warning and to be sure that everything will continue working
when you upgrade to Celery 3.2::

    CELERY_ACCEPT_CONTENT = ['pickle', 'json', 'msgpack', 'yaml']

You must only enable the serializers that you will actually use.


  warnings.warn(CDeprecationWarning(W_PICKLE_DEPRECATED))

[2015-08-05 18:15:22,915: WARNING/MainProcess] /home/elijah/Desktop/trydjango18/trydjango18/local/lib/python2.7/site-packages/celery/apps/worker.py:161: CDeprecationWarning: 
Starting from version 3.2 Celery will refuse to accept pickle by default.

The pickle serializer is a security concern as it may give attackers
the ability to execute any command.  It's important to secure
your broker from unauthorized access when using pickle, so we think
that enabling pickle should require a deliberate action and not be
the default choice.

If you depend on pickle then you should set a setting to disable this
warning and to be sure that everything will continue working
when you upgrade to Celery 3.2::

    CELERY_ACCEPT_CONTENT = ['pickle', 'json', 'msgpack', 'yaml']

You must only enable the serializers that you will actually use.


  warnings.warn(CDeprecationWarning(W_PICKLE_DEPRECATED))


 -------------- celery@elijah-VirtualBox v3.1.18 (Cipater)
---- **** ----- 
--- * ***  * -- Linux-3.13.0-54-generic-x86_64-with-Ubuntu-14.04-trusty
-- * - **** --- 
- ** ---------- [config]
- ** ---------- .> app:         default:0x7f6ce3b3e410 (djcelery.loaders.DjangoLoader)
- ** ---------- .> transport:   amqp://guest:**@localhost:5672//
- ** ---------- .> results:     database
- *** --- * --- .> concurrency: 2 (prefork)
-- ******* ---- 
--- ***** ----- [queues]
 -------------- .> celery           exchange=celery(direct) key=celery


[tasks]
  . comparison.tasks.crawl

[2015-08-05 18:15:23,178: INFO/MainProcess] Connected to amqp://guest:**@127.0.0.1:5672//
[2015-08-05 18:15:23,276: INFO/MainProcess] mingle: searching for neighbors
[2015-08-05 18:15:24,322: INFO/MainProcess] mingle: all alone
/home/elijah/Desktop/trydjango18/trydjango18/local/lib/python2.7/site-packages/djcelery/loaders.py:136: UserWarning: Using settings.DEBUG leads to a memory leak, never use this setting in production environments!
  warn('Using settings.DEBUG leads to a memory leak, never '

[2015-08-05 18:15:24,403: WARNING/MainProcess] /home/elijah/Desktop/trydjango18/trydjango18/local/lib/python2.7/site-packages/djcelery/loaders.py:136: UserWarning: Using settings.DEBUG leads to a memory leak, never use this setting in production environments!
  warn('Using settings.DEBUG leads to a memory leak, never '

[2015-08-05 18:15:24,404: WARNING/MainProcess] celery@elijah-VirtualBox ready.

0 Answers:

No answers yet