I integrated Scrapy into my Django project following this guide. Unfortunately, no matter what I try, the spider jobs never start, even though schedule.json returns a jobid.
My views:
    from uuid import uuid4
    from urllib.parse import urlparse

    from django.http import JsonResponse
    from django.views.decorators.csrf import csrf_exempt
    from rest_framework.decorators import api_view


    @csrf_exempt
    @api_view(['POST'])
    def crawl_url(request):
        url = request.POST.get('url', None)  # take the URL from the request
        if not url:
            return JsonResponse({'error': 'Missing args'})
        if not is_valid_url(url):
            return JsonResponse({'error': 'URL is invalid'})

        domain = urlparse(url).netloc  # parse the URL and extract the domain
        unique_id = str(uuid4())  # create a unique ID

        # Custom settings for the Scrapy spider.
        # We can send anything we want to use inside spiders and pipelines.
        settings = {
            'unique_id': unique_id,  # unique ID for each DB record
            'USER_AGENT': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
        }

        # Schedule a new crawling task via scrapyd.
        # `settings` is a special argument name.
        # This returns an ID for the task, used later to check its status.
        task = scrapyd.schedule('default', 'kw_spider', settings=settings, url=url, domain=domain)

        return JsonResponse({'task_id': task, 'unique_id': unique_id, 'status': 'started'})
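(The view assumes a module-level scrapyd client and an is_valid_url helper that the snippet doesn't show. A minimal sketch of both, assuming the python-scrapyd-api package used by the guide; the validator itself is a hypothetical helper:

    from django.core.exceptions import ValidationError
    from django.core.validators import URLValidator
    from scrapyd_api import ScrapydAPI

    # Client for the scrapyd daemon, which listens on port 6800 by default.
    scrapyd = ScrapydAPI('http://localhost:6800')

    def is_valid_url(url):
        # Hypothetical helper: accept only URLs Django itself considers valid.
        try:
            URLValidator()(url)
        except ValidationError:
            return False
        return True
)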
    @csrf_exempt
    @api_view(['GET'])
    def get_crawl_data(request):
        task_id = request.GET.get('task_id', None)
        unique_id = request.GET.get('unique_id', None)
        if not task_id or not unique_id:
            return JsonResponse({'error': 'Missing args'})

        # Check the status of the crawl.
        # If it is finished, query the database and return the results.
        # If not, return the current status.
        # Possible statuses are: pending, running, finished.
        status = scrapyd.job_status('default', task_id)
        if not status:  # job_status returns an empty value for unknown jobs
            return JsonResponse({
                'status': 'error',
                'data': 'Task not found'
            })
        elif status == 'finished':
            try:
                item = ScrapyItem.objects.get(unique_id=unique_id)
                return JsonResponse({
                    'status': status,
                    'data': item.to_dict['data']
                })
            except Exception as e:
                return JsonResponse({
                    'status': 'error',
                    'data': str(e)
                })
        else:
            return JsonResponse({
                'status': status,
                'data': {}
            })
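(For context, item.to_dict['data'] implies a ScrapyItem model exposing a to_dict property. A minimal sketch of such a model, assuming the crawl results are stored as a JSON string; the field names are assumptions, not part of the question:

    import json

    from django.db import models

    class ScrapyItem(models.Model):
        unique_id = models.CharField(max_length=100, null=True)
        data = models.TextField()  # crawl results, stored as a JSON string
        date = models.DateTimeField(auto_now_add=True)

        @property
        def to_dict(self):
            # Expose the stored JSON under a 'data' key, as the view expects.
            return {'data': json.loads(self.data), 'date': str(self.date)}
)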
My spider:
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule


    class KwSpiderSpider(CrawlSpider):
        name = 'kw_spider'

        def __init__(self, *args, **kwargs):
            # __init__ is overridden to make the spider dynamic:
            # url and domain are passed in from the Django view.
            self.url = kwargs.get('url')
            self.domain = kwargs.get('domain')
            self.start_urls = [self.url]
            self.allowed_domains = [self.domain]

            # Rules must be in place before the parent __init__ runs,
            # because CrawlSpider compiles them there.
            KwSpiderSpider.rules = [
                Rule(LinkExtractor(unique=True), callback='parse_item'),
            ]
            super(KwSpiderSpider, self).__init__(*args, **kwargs)

        def parse_item(self, response):
            resp_dict = {
                'url': response.url
            }
            # resp_dict['domain_id'] = response.xpath('//input[@id="sid"]/@value').extract()
            # resp_dict['name'] = response.xpath('//div[@id="name"]').extract()
            # resp_dict['description'] = response.xpath('//div[@id="description"]').extract()
            return resp_dict
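(The unique_id setting scheduled from the Django view is meant to be picked up on the Scrapy side so the results can be stored under that ID. A hedged sketch of a pipeline that would close that loop; the class name, app path, and storage format are assumptions, not part of the question:

    import json

    class SaveToDjangoPipeline:
        def __init__(self, unique_id):
            self.unique_id = unique_id
            self.items = []

        @classmethod
        def from_crawler(cls, crawler):
            # Pick up the custom 'unique_id' setting sent by the Django view.
            return cls(unique_id=crawler.settings.get('unique_id'))

        def process_item(self, item, spider):
            self.items.append(item)
            return item

        def close_spider(self, spider):
            # Persist all scraped items under the ID the view generated,
            # so that get_crawl_data can find them later.
            from main.models import ScrapyItem  # hypothetical app path
            ScrapyItem.objects.create(unique_id=self.unique_id,
                                      data=json.dumps(self.items))
)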
I also tried curl:

    curl http://localhost:6800/schedule.json -d project=default -d spider=kw_spider

which gave me the following response:

    {"node_name": "9jvtf82", "status": "ok", "jobid": "0ca057026e5611e8898f64006a668b22"}

But nothing happens and the job never starts.
Answer (score 0):
I solved this by noticing an error in the scrapyd console log: the pywin32 library was missing, though I don't understand why it isn't listed as a requirement. A simple

    pip install pywin32

fixed it.
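More generally, when schedule.json returns a jobid but nothing seems to run, scrapyd's own HTTP API is the quickest way to see what happened. For example (the jobid below is the one from the question, and the log path assumes scrapyd's default layout):

    # List pending/running/finished jobs for the project:
    curl "http://localhost:6800/listjobs.json?project=default"

    # Fetch the log of a specific job, where the actual error shows up:
    curl http://localhost:6800/logs/default/kw_spider/0ca057026e5611e8898f64006a668b22.log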