I am trying to store strings in a Scrapy Item class, but whenever a string is longer than 67 characters it seems to be automatically split into two strings.
import scrapy
from scrapy import Item, Field


class InfoItem(Item):
    info = Field()


class TopicItem(Item):
    topic = Field()
    categories = Field()


class CategoryItem(Item):
    category = Field()
    fields = Field()


class FieldItem(Item):
    field = Field()
    value = Field()


class QuotesSpider(scrapy.Spider):
    name = "categories"

    def start_requests(self):
        urls = ['http://www.eustat.euskadi.eus/t35-t64cont/eu/t64amVisorWar/t64aIndicadores.jsp?language=0']
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        info_item = InfoItem()
        info_item['info'] = []
        for i, eje in enumerate(response.css('h1.eje')):
            text = response.css('h1.eje::text')[i].extract()
            topic_item = TopicItem()
            topic_item['topic'] = text
            topic_item['categories'] = []
            info_item['info'].append(topic_item)
            siblings = eje.xpath('following-sibling::div')
            print(siblings)
            div_sibling = siblings[0]
            print(div_sibling)
            for j, h2 in enumerate(div_sibling.css('h2')):
                text = div_sibling.css('h2::text')[j].extract()
                category_item = CategoryItem()
                category_item['category'] = text
                category_item['fields'] = []
                div_siblings_h2 = h2.xpath('following-sibling::div')
                div_sibling_h2 = div_siblings_h2[0]
                for h3 in div_sibling_h2.css('h3'):
                    the_input = h3.xpath('input//following-sibling::text()')[0].extract()[:-1]
                    input_value = h3.xpath('input/@value').extract().pop()
                    field_item = FieldItem()
                    field_item['field'] = the_input[:66]  # (*)
                    field_item['value'] = input_value
                    category_item['fields'].append(field_item)
                topic_item['categories'].append(category_item)
        print(info_item)
(*) If I don't truncate this string, the final print statement shows how the longest values of 'the_input' end up split into two strings. For example:
'fields': [
{'field': 'Superficie ocupada por parques, jardines y zonas verdes urbanas (%',
'value': '142'},
Without the truncation:
'fields': [
{'field': 'Superficie ocupada por parques, jardines y zonas verdes urbanas ''(%/suelo urbano)',
'value': '142'},
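For reference, here is a minimal standalone sketch (assuming Scrapy items are pretty-printed through pprint.pformat when you print them) that reproduces the same wrapped output using only the FieldItem class and the example value from the output above; the length and repr checks show what is actually stored:

from scrapy import Item, Field


class FieldItem(Item):
    field = Field()
    value = Field()


# Hypothetical stand-in for the_input, taken from the example output above.
long_text = 'Superficie ocupada por parques, jardines y zonas verdes urbanas (%/suelo urbano)'

item = FieldItem()
item['field'] = long_text
item['value'] = '142'

print(item)                 # pretty-printed: the long value wraps across lines here
print(len(item['field']))   # length of the value actually stored
print(repr(item['field']))  # the stored value shown unwrapped, on one line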
Environment:
OS: Ubuntu 16.04.2
python: 3.5.2
Scrapy == 1.3.3
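And, independent of Scrapy, a tiny sketch of the plain-Python behaviour that produces output shaped like the example above: pprint wraps strings that do not fit its line width, and adjacent string literals are joined into a single string when the source is parsed.

from pprint import pprint

# pprint wraps a long string into adjacent literals spread over several lines.
pprint({'field': 'Superficie ocupada por parques, jardines y zonas verdes urbanas (%/suelo urbano)',
        'value': '142'})

# Adjacent string literals denote one string, not two.
s = ('Superficie ocupada por parques, jardines y zonas verdes urbanas '
     '(%/suelo urbano)')
print(len(s))  # a single string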