I integrated Scrapy into my Django project following this guide. Unfortunately, no matter what I try, the spider jobs never start, even though schedule.json returns a jobid.
My views:
    from uuid import uuid4
    from urllib.parse import urlparse

    from django.http import JsonResponse
    from django.views.decorators.csrf import csrf_exempt
    from rest_framework.decorators import api_view


    @csrf_exempt
    @api_view(['POST'])
    def crawl_url(request):
        url = request.POST.get('url', None)  # take the URL from the request
        if not url:
            return JsonResponse({'error': 'Missing args'})
        if not is_valid_url(url):
            return JsonResponse({'error': 'URL is invalid'})

        domain = urlparse(url).netloc  # parse the URL and extract the domain
        unique_id = str(uuid4())  # create a unique ID

        # Custom settings for the Scrapy spider.
        # We can send anything we want to use inside spiders and pipelines.
        settings = {
            'unique_id': unique_id,  # unique ID for each DB record
            'USER_AGENT': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
        }

        # Schedule a new crawling task via scrapyd.
        # `settings` is a special argument name.
        # This returns an ID for the task, used later to check its status.
        task = scrapyd.schedule('default', 'kw_spider', settings=settings, url=url, domain=domain)

        return JsonResponse({'task_id': task, 'unique_id': unique_id, 'status': 'started'})
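(The view assumes a module-level scrapyd client and an is_valid_url helper that the snippet doesn't show. A minimal sketch of both, assuming the python-scrapyd-api package used by the guide; the validator itself is a hypothetical helper:

    from django.core.exceptions import ValidationError
    from django.core.validators import URLValidator
    from scrapyd_api import ScrapydAPI

    # Client for the scrapyd daemon, which listens on port 6800 by default.
    scrapyd = ScrapydAPI('http://localhost:6800')

    def is_valid_url(url):
        # Hypothetical helper: accept only URLs Django itself considers valid.
        try:
            URLValidator()(url)
        except ValidationError:
            return False
        return True
)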
    @csrf_exempt
    @api_view(['GET'])
    def get_crawl_data(request):
        task_id = request.GET.get('task_id', None)
        unique_id = request.GET.get('unique_id', None)
        if not task_id or not unique_id:
            return JsonResponse({'error': 'Missing args'})

        # Check the status of the crawl.
        # If it is finished, query the database and return the results.
        # If not, return the current status.
        # Possible statuses are: pending, running, finished.
        status = scrapyd.job_status('default', task_id)
        if not status:  # job_status returns an empty value for unknown jobs
            return JsonResponse({
                'status': 'error',
                'data': 'Task not found'
            })
        elif status == 'finished':
            try:
                item = ScrapyItem.objects.get(unique_id=unique_id)
                return JsonResponse({
                    'status': status,
                    'data': item.to_dict['data']
                })
            except Exception as e:
                return JsonResponse({
                    'status': 'error',
                    'data': str(e)
                })
        else:
            return JsonResponse({
                'status': status,
                'data': {}
            })
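(For context, item.to_dict['data'] implies a ScrapyItem model exposing a to_dict property. A minimal sketch of such a model, assuming the crawl results are stored as a JSON string; the field names are assumptions, not part of the question:

    import json

    from django.db import models

    class ScrapyItem(models.Model):
        unique_id = models.CharField(max_length=100, null=True)
        data = models.TextField()  # crawl results, stored as a JSON string
        date = models.DateTimeField(auto_now_add=True)

        @property
        def to_dict(self):
            # Expose the stored JSON under a 'data' key, as the view expects.
            return {'data': json.loads(self.data), 'date': str(self.date)}
)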
My spider:
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule


    class KwSpiderSpider(CrawlSpider):
        name = 'kw_spider'

        def __init__(self, *args, **kwargs):
            # __init__ is overridden to make the spider dynamic:
            # url and domain are passed in from the Django view.
            self.url = kwargs.get('url')
            self.domain = kwargs.get('domain')
            self.start_urls = [self.url]
            self.allowed_domains = [self.domain]

            # Rules must be in place before the parent __init__ runs,
            # because CrawlSpider compiles them there.
            KwSpiderSpider.rules = [
                Rule(LinkExtractor(unique=True), callback='parse_item'),
            ]
            super(KwSpiderSpider, self).__init__(*args, **kwargs)

        def parse_item(self, response):
            resp_dict = {
                'url': response.url
            }
            # resp_dict['domain_id'] = response.xpath('//input[@id="sid"]/@value').extract()
            # resp_dict['name'] = response.xpath('//div[@id="name"]').extract()
            # resp_dict['description'] = response.xpath('//div[@id="description"]').extract()
            return resp_dict
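(The unique_id setting scheduled from the Django view is meant to be picked up on the Scrapy side so the results can be stored under that ID. A hedged sketch of a pipeline that would close that loop; the class name, app path, and storage format are assumptions, not part of the question:

    import json

    class SaveToDjangoPipeline:
        def __init__(self, unique_id):
            self.unique_id = unique_id
            self.items = []

        @classmethod
        def from_crawler(cls, crawler):
            # Pick up the custom 'unique_id' setting sent by the Django view.
            return cls(unique_id=crawler.settings.get('unique_id'))

        def process_item(self, item, spider):
            self.items.append(item)
            return item

        def close_spider(self, spider):
            # Persist all scraped items under the ID the view generated,
            # so that get_crawl_data can find them later.
            from main.models import ScrapyItem  # hypothetical app path
            ScrapyItem.objects.create(unique_id=self.unique_id,
                                      data=json.dumps(self.items))
)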
I also tried curl:

    curl http://localhost:6800/schedule.json -d project=default -d spider=kw_spider

which gave me the following response:

    {"node_name": "9jvtf82", "status": "ok", "jobid": "0ca057026e5611e8898f64006a668b22"}

But nothing happens and the job never starts.
Answer (score 0):
I solved this by noticing an error in the scrapyd console log: the pywin32 library was missing, though I don't understand why it isn't listed as a requirement. A simple

    pip install pywin32

fixed it.
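More generally, when schedule.json returns a jobid but nothing seems to run, scrapyd's own HTTP API is the quickest way to see what happened. For example (the jobid below is the one from the question, and the log path assumes scrapyd's default layout):

    # List pending/running/finished jobs for the project:
    curl "http://localhost:6800/listjobs.json?project=default"

    # Fetch the log of a specific job, where the actual error shows up:
    curl http://localhost:6800/logs/default/kw_spider/0ca057026e5611e8898f64006a668b22.log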