我对 Python 和编程都很新。我试图制作一个可以从 Overwatch 玩家生涯页面抓取数据的网络爬虫(例如:https://playoverwatch.com/en-gb/career/pc/eu/Taimou-2526)。我尝试使用 Portia,它在云端可以工作,但当我将其导出为 Scrapy 代码后,就无法使其正常运行了。下面是我的 Portia spider 的截图。
这是我的蜘蛛的代码(从 Portia 导出为 Scrapy),文件名为 owData.py:
#!/usr/bin/python
# -*- coding: utf-8 -*-
# NOTE: the shebang and encoding cookie must be the first lines of the file;
# the coding declaration is ignored by the interpreter anywhere else.
from __future__ import absolute_import

from scrapy import Request
from scrapy.linkextractors import LinkExtractor
from scrapy.loader import ItemLoader
from scrapy.loader.processors import Identity
from scrapy.spiders import Rule

from utils.spiders import BasePortiaSpider
from utils.starturls import FeedGenerator, FragmentGenerator
# Fixed: the original split this import so that "(Image, Regex)" became a
# bare tuple expression, raising NameError at import time. Parenthesized
# continuation keeps all processors in one import statement.
from utils.processors import (Item, Field, Text, Number, Price, Date, Url,
                              Image, Regex)
from items import PortiaItem
class Owdata(BasePortiaSpider):
    # Portia-exported Scrapy spider for a single Overwatch career page.
    name = 'owData'
    allowed_domains = [u'playoverwatch.com']
    start_urls = \
        [u'https://playoverwatch.com/en-gb/career/pc/eu/Taimou-2526']
    # NOTE(review): deny='.*' rejects every link the extractor finds, so only
    # start_urls are ever fetched and follow=True is effectively a no-op —
    # confirm this is intended.
    rules = [Rule(LinkExtractor(allow=(), deny='.*'),
             callback='parse_item', follow=True)]
    # NOTE(review): items holds a single EMPTY sample, i.e. no selectors for
    # BasePortiaSpider to apply — this is the most likely reason
    # `scrapy crawl owData -o data.csv` produces an empty file. The Portia
    # annotations (field name + CSS/XPath selector) need to be re-exported
    # or filled in here.
    items = [[]]
这是我的items.py代码:
from __future__ import absolute_import
import scrapy
from collections import defaultdict
from scrapy.loader.processors import Join, MapCompose, Identity
from w3lib.html import remove_tags
from .utils.processors import Text, Number, Price, Date, Url, Image
class PortiaItem(scrapy.Item):
fields = defaultdict(
lambda: scrapy.Field(
input_processor=Identity(),
output_processor=Identity()
)
)
def __setitem__(self, key, value):
self._values[key] = value
def __repr__(self):
data = str(self)
if not data:
return '%s' % self.__class__.__name__
return '%s(%s)' % (self.__class__.__name__, data)
def __str__(self):
if not self._values:
return ''
string = super(PortiaItem, self).__repr__()
return string
class CareerOverviewOverwatch1Item(PortiaItem):
field1 = scrapy.Field(
input_processor=Text(),
output_processor=Join(),
)
melee_final_blows = scrapy.Field(
input_processor=Text(),
output_processor=Join(),
)
table = scrapy.Field(
input_processor=Text(),
output_processor=Join(),
)
tr = scrapy.Field(
input_processor=Text(),
output_processor=Join(),
当我使用 `scrapy crawl owData -o data.csv` 运行蜘蛛时,
我只得到一个空的 data.csv 文件。我猜是我的 items 定义有问题?我认为 XPath 选择器应该就是 `//tbody`,但话说回来,我对 Python、XPath 和 Scrapy 都知之甚少……