Stuck with Scrapy data crawling

Date: 2015-05-25 04:37:11

Tags: python scrapy

A friend of mine is developing a Scrapy script to scrape data from a page. After a while I needed to add another field, and I added the field successfully. But the problem is that the field is not getting the data of the link inside the td. The field name is "Last Batsman".

Data URL:

http://digicricket.marssil.com/match/MatchData.aspx?op=1&match=1385

XPath for the data:

//*[@id="ctl00_ContentPlaceHolder1_divData"]/table[6]/tr/td
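
One quick way to see what this selector actually matches is Scrapy's interactive shell against the URL above, e.g.:

scrapy shell "http://digicricket.marssil.com/match/MatchData.aspx?op=1&match=1385"
>>> response.xpath('//*[@id="ctl00_ContentPlaceHolder1_divData"]/table[6]/tr/td').extract()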

import scrapy
from bs4 import BeautifulSoup
from scrapy.exceptions import CloseSpider
from scrapy.selector import Selector

from digicricket.items import ODIorTestItem


class DigicricketMarsilOp1Spider(scrapy.Spider):
    name = "digicricket.marssil.op1"
    allowed_domains = ["digicricket.marssil.com"]

    def __init__(self, match_id=None):
        if match_id:
            match_id_list = match_id.split(',')
            for i in match_id_list:
                if not i.isdigit():
                    raise CloseSpider('Match ID = {0} is not a number'.format(i))
            else:
                self.start_urls = ['http://digicricket.marssil.com/match/MatchData.aspx?op=1&match={0}'.format(i)
                                   for i in match_id_list]
        else:
            raise CloseSpider('You forgot to input Match ID/IDs')

    def parse(self, response):
        item = ODIorTestItem()
        item['Batsman_op1'] = []
        item['Bowler_op1'] = []
        item['other_op1'] = []
        sel = Selector(response)
        tables = sel.xpath('//div[@id="ctl00_ContentPlaceHolder1_divData"]/table').extract()
        row_for_other = dict()
        for i in xrange(len(tables)):
            html_text = BeautifulSoup(tables[i])
            if i == 1:
                sl = 0
                for tr in html_text.find_all('tr'):
                    td = tr.find_all('td')
                    if td:
                        sl += 1
                        row = dict()
                        row['sl'] = sl
                        row['match_id'] = response.url[response.url.rfind('=')+1:]
                        row["Batsman"] = td[0].get_text()
                        row["R"] = td[1].get_text()
                        row["B"] = td[2].get_text()
                        row["4s"] = td[3].get_text()
                        row["6s"] = td[4].get_text()
                        row["SR"] = td[5].get_text()
                        item['Batsman_op1'].append(row)
            elif i == 2:
                sl = 0
                for tr in html_text.find_all('tr'):
                    td = tr.find_all('td')
                    if td:
                        sl += 1
                        row = dict()
                        row['sl'] = sl
                        row['match_id'] = response.url[response.url.rfind('=')+1:]
                        row["Bowler"] = td[0].get_text()
                        row["O"] = td[1].get_text()
                        row["M"] = td[2].get_text()
                        row["R"] = td[3].get_text()
                        row["W"] = td[4].get_text()
                        row["Econ"] = td[5].get_text()
                        item['Bowler_op1'].append(row)
            else:
                for tr in html_text.find_all('tr'):
                    td = tr.find_all('td')

                if i == 0:
                    try:
                        row_for_other["InningsMatchDetails"] = sel.xpath('//*[@id="ctl00_ContentPlaceHolder1_divData"]/'
                                                                         'table[1]/tr/td/b/text()[1]').extract()[0]
                    except:
                        row_for_other["InningsMatchDetails"] = None
                    try:
                        row_for_other["CurrentScore"] = sel.xpath('//*[@id="ctl00_ContentPlaceHolder1_divData"]/'
                                                                  'table[1]/tr/td/b/span/text()').extract()[0]
                    except:
                        row_for_other["CurrentScore"] = None
                    try:
                        row_for_other["OversRunRate"] = sel.xpath('//*[@id="ctl00_ContentPlaceHolder1_divData"]/'
                                                                  'table[1]/tr/td/b/text()[2]').extract()[0]
                    except:
                        row_for_other["OversRunRate"] = None
                    try:
                        row_for_other["Extras"] = sel.xpath('//*[@id="ctl00_ContentPlaceHolder1_divData"]/table[1]/'
                                                            'tr/td/b/text()[3]').extract()[0]
                    except:
                        row_for_other["Extras"] = None
                    try:
                        row_for_other["MatchResult"] = sel.xpath('//*[@id="ctl00_ContentPlaceHolder1_divData"]/'
                                                                 'table[1]/tr/td/b/text()[4]').extract()[0]
                    except:
                        row_for_other["MatchResult"] = None
                    try:
                        row_for_other["RecentOvers"] = sel.xpath('//*[@id="ctl00_ContentPlaceHolder1_divData"]/'
                                                                 'table[4]/tr/td[2]/text()').extract()[0]
                    except:
                        row_for_other["RecentOvers"] = None
                    try:
                        row_for_other["LastBatsman"] = sel.xpath('//*[@id="ctl00_ContentPlaceHolder1_divData"]/'
                                                                 'table[6]/tr/td/text()').extract()[0]
                    except:
                        row_for_other["LastBatsman"] = None

        row_for_other['match_id'] = response.url[response.url.rfind('=')+1:]
        item['other_op1'].append(row_for_other)
        return item

1 Answer:

Answer 0 (score: 0)

Your XPath seems to miss some tags. On the web page, there are two table levels before the second div. Replacing / with // takes care of these. (And because my browser adds some <tbody> tags, there is also a double slash in front of tr.)


.//*[@id="ctl00_ContentPlaceHolder1_divData"]//table[6]//tr/td/a[1]/text()
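
Applied to the spider in the question, the LastBatsman block would then look roughly like this (a sketch that only swaps in the corrected XPath and narrows the bare except to the IndexError raised when extract() finds nothing):

try:
    # a[1]/text() reads the text of the link inside the td; the double
    # slashes skip the intermediate table/tbody levels the original missed
    row_for_other["LastBatsman"] = sel.xpath('.//*[@id="ctl00_ContentPlaceHolder1_divData"]//'
                                             'table[6]//tr/td/a[1]/text()').extract()[0]
except IndexError:
    row_for_other["LastBatsman"] = None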