I am trying to figure out how to extract only part of the text in a list.
Here is the script I am currently using:
import scrapy.selector
import urlparse
from scrapy.spiders import Spider
from scrapy.http import Request
from MediaMarkt.items import MediamarktItem


def complete_url(string):
    return "http://www.mediamarkt.be" + string


def encode(str):
    return str.encode('utf8', 'ignore')


class MshbeSpider(Spider):
    name = "mshbetv"
    start_urls = ['http://www.mediamarkt.be/mcs/productlist/_TV,98952,452540.html?langId=-17']

    def parse(self, response):
        items = response.xpath('//*[@id="filters"]/form/fieldset[2]/div[2]/ul[2]/li/a/@href')
        for item in items:
            link = item.extract()
            yield Request(link, callback=self.parse_category)

    def parse_category(self, response):
        items = response.xpath('//ul[@class="products-list"]/li/div')
        for item in items:
            mshtv = MediamarktItem()
            mshtv['item_3_price'] = encode(item.xpath('normalize-space(.//aside/div/div/div/text())').extract()[0]).replace("-", "")
            mshtv['item_2_name'] = encode(item.xpath('normalize-space(.//div/h2/a/text())').extract()[0])
            mshtv['item_a_link'] = complete_url(item.select('.//div/h2/a/@href').extract()[0])
            mshtv['item_4_avai'] = encode(item.xpath('normalize-space(.//aside/div/div/ul/span/text())').extract()[0])
            mshtv['item_1_cat'] = encode(item.xpath('normalize-space(//*[@id="category"]/hgroup/h1/text())').extract()[0])
            yield mshtv

        new_link = response.xpath('//li[@class="pagination-next"]/a/@href').extract()[0]
        yield Request(complete_url(new_link), callback=self.parse_category)
The field mshtv['item_2_name'] is the one where I want to extract only part of the text. I have tried everything I could find, without success.
With my current script, the values I get for mshtv['item_2_name'] look like this:
TV SAMSUNG UE55J6200AWXXN 55" LCD FULL LED Smart
TV SONY KDL55W755CBAEP 55" LCD EDGE LED Smart
TV SONY KD55-X9405CBAEP 55" 4K
I have a whole list of the correct manufacturer model numbers. In this example they are UE55J6200AWXXN, KDL55W755CBAEP and KD55-X9405CBAEP.
What I want to achieve is that, when running my script, only the manufacturer's model number is returned.
Or is this perhaps not possible?
Answer 0 (score: 0)
Put the list of correct model numbers into a dict, then split the string containing the model number into words on spaces and look each word up in the dict.
d = {}
for model in models:
    d[model] = True

for word in mshtv['item_2_name'].split(" "):
    if word in d:
        print word
Answer 1 (score: 0)
After I tried the solution "ehm" suggested, it works, but now there is another problem: where I used to get over 200 results, I now only get 13.
import scrapy.selector
import urlparse
from scrapy.spiders import Spider
from scrapy.http import Request
from MediaMarkt.items import MediamarktItem

models = ["ue78js9500lxxn", "UE60J6200AWXXN", "kdl40w705cbaep", "KDL55W755CBAEP", "KDL40W705CBAEP"]

d = {}
for model in models:
    d[model] = True


def complete_url(string):
    return "http://www.mediamarkt.be" + string


def encode(str):
    return str.encode('utf8', 'ignore')


class MshbeSpider(Spider):
    name = "mshbetv"
    start_urls = ['http://www.mediamarkt.be/mcs/productlist/_TV,98952,452540.html?langId=-17']

    def parse(self, response):
        items = response.xpath('//*[@id="filters"]/form/fieldset[2]/div[2]/ul[2]/li/a/@href')
        for item in items:
            link = item.extract()
            yield Request(link, callback=self.parse_category)

    def parse_category(self, response):
        items = response.xpath('//ul[@class="products-list"]/li/div')
        for item in items:
            mshtv = MediamarktItem()
            mshtv['item_3_price'] = encode(item.xpath('normalize-space(.//aside/div/div/div/text())').extract()[0]).replace("-", "")
            mshtv['item_2_name'] = encode(item.xpath('normalize-space(.//div/h2/a/text())').extract()[0])
            mshtv['item_a_link'] = complete_url(item.select('.//div/h2/a/@href').extract()[0])
            mshtv['item_4_avai'] = encode(item.xpath('normalize-space(.//aside/div/div/ul/span/text())').extract()[0])
            mshtv['item_1_cat'] = encode(item.xpath('normalize-space(//*[@id="category"]/hgroup/h1/text())').extract()[0])
            for word in mshtv['item_2_name'].split(" "):
                if word in d:
                    mshtv['item_model'] = word
                    yield mshtv

        new_link = response.xpath('//li[@class="pagination-next"]/a/@href').extract()[0]
        yield Request(complete_url(new_link), callback=self.parse_category)
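One possible explanation for the drop from 200+ results to 13 (an assumption, since the indentation of the posted code is ambiguous): yield mshtv is only reached when a word matches the dict, so every product whose model is not in models is silently dropped. A minimal sketch of the inner loop that yields every item and only fills item_model when a match is found:
            for word in mshtv['item_2_name'].split(" "):
                if word in d:
                    mshtv['item_model'] = word
                    break  # keep the first matching model
            yield mshtv  # yield the item whether or not a model was matched
Note also that the lowercase entries in models (e.g. kdl40w705cbaep) can never match the uppercase words in the extracted names unless both sides are normalized, for example with .lower().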