Scrapy MySQL pipeline - all database entries are identical

Date: 2013-08-19 04:16:16

Tags: mysql scrapy mysql-python

I'm running Scrapy on Mac OS X Lion 10.7.5 (in case that matters).

Here is my spider:

 from scrapy.spider import BaseSpider
 from scrapy.selector import HtmlXPathSelector
 from BoxOfficeMojo.items import BoxofficemojoItem
 from BoxOfficeMojo.items import ActorItem

 class MojoSpider(BaseSpider):
     name = 'MojoSpider'
     allowed_domains = ['boxofficemojo.com']
     start_urls = ['http://www.boxofficemojo.com/movies/alphabetical.htm?letter=A&p=.htm']

     def parse(self, response):
          items = []
          movie = BoxofficemojoItem()
          hxs = HtmlXPathSelector(response)
          print ('hxs:', hxs)
          links = hxs.select('//div[@id="body"]/div/table/tr/td/table/tr[2]/td/table[2]/tr/td[1]/font/a/@href').extract() #was previously
          titles = hxs.select('//div[@id="body"]/div/table/tr/td/table/tr[2]/td/table[2]/tr/td[1]/font/a/b/text()').extract()
          gross = hxs.select('//div[@id="body"]/div/table/tr/td/table/tr[2]/td/table[2]/tr/td[3]/font/text()').extract()
          opening = hxs.select('//div[@id="body"]/div/table/tr/td/table/tr[2]/td/table[2]/tr/td[7]/font//text()').extract()
          for item in gross:
              if 'Total' in item:
                  gross.remove(item)


          items = []
          for i in range(len(links)):
              movie['title'] = titles[i]
              movie['link'] = 'http://www.boxofficemojo.com' + links[i]
              movie['gross'] = gross[i]
              movie['release_date'] = opening[i]
              items.append(movie)
          return items

Here is my MySQL pipeline:

 import sys; sys.path.append("/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages")
 import MySQLdb
 import hashlib
 from scrapy.exceptions import DropItem
 from scrapy.http import Request

 class BoxofficemojoPipeline(object):

     def __init__(self):
         self.conn = MySQLdb.connect(user='testuser', passwd='test', db='testdb', host='localhost', charset='utf8', use_unicode=True)
         self.cursor = self.conn.cursor()

     def process_item(self, item, spider):
         try:
             self.cursor.execute("""INSERT INTO example_movie (title, link, gross, release_date) VALUES (%s, %s, %s, %s)""", (item['title'], item['link'], item['gross'], item['release_date']))
             self.conn.commit()
         except MySQLdb.Error, e:
             print "Error %d: %s" % (e.args[0], e.args[1])

         return item
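
(A side note on the pipeline itself, not part of the original question: rather than opening the MySQL connection in __init__, an item pipeline can use Scrapy's open_spider/close_spider hooks so the connection is opened and closed with the spider. A minimal sketch, reusing the same testdb credentials as above:)

    import MySQLdb

    class BoxofficemojoPipeline(object):

        def open_spider(self, spider):
            # Called once when the spider starts; same credentials as the original pipeline.
            self.conn = MySQLdb.connect(user='testuser', passwd='test', db='testdb',
                                        host='localhost', charset='utf8', use_unicode=True)
            self.cursor = self.conn.cursor()

        def close_spider(self, spider):
            # Called once when the spider finishes; release the connection cleanly.
            self.conn.close()

        def process_item(self, item, spider):
            # Same parameterized INSERT as in the original pipeline.
            try:
                self.cursor.execute(
                    """INSERT INTO example_movie (title, link, gross, release_date)
                       VALUES (%s, %s, %s, %s)""",
                    (item['title'], item['link'], item['gross'], item['release_date']))
                self.conn.commit()
            except MySQLdb.Error, e:
                print "Error %d: %s" % (e.args[0], e.args[1])
            return item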

When I look at the entries in the MySQL database, there are as many rows as there are movies on the page, but every one of them is the same movie, Act of Worship, which is the last movie on the page. Any and all suggestions are welcome! Thanks for looking!

1 Answer:

Answer 0 (score: 2):

Try moving movie = BoxofficemojoItem() inside the for i in range(len(links)): loop. Right now a single item is created before the loop, so each iteration overwrites its fields and appends that same instance again, leaving every entry pointing at the last movie:

    def parse(self, response):
        items = []

        hxs = HtmlXPathSelector(response)
        print ('hxs:', hxs)
        links = hxs.select('//div[@id="body"]/div/table/tr/td/table/tr[2]/td/table[2]/tr/td[1]/font/a/@href').extract() #was previously
        titles = hxs.select('//div[@id="body"]/div/table/tr/td/table/tr[2]/td/table[2]/tr/td[1]/font/a/b/text()').extract()
        gross = hxs.select('//div[@id="body"]/div/table/tr/td/table/tr[2]/td/table[2]/tr/td[3]/font/text()').extract()
        opening = hxs.select('//div[@id="body"]/div/table/tr/td/table/tr[2]/td/table[2]/tr/td[7]/font//text()').extract()
        for item in gross:
            if 'Total' in item:
                gross.remove(item)

        items = []
        for i in range(len(links)):
            movie = BoxofficemojoItem()
            movie['title'] = titles[i]
            movie['link'] = 'http://www.boxofficemojo.com' + links[i]
            movie['gross'] = gross[i]
            movie['release_date'] = opening[i]
            items.append(movie)
        return items
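
The reason this matters: a Scrapy Item behaves like a dict, so items.append(movie) stores a reference to the one shared object rather than a copy, and each loop iteration overwrites its fields. The same effect with a plain dict (illustration only, not from the original post):

    row = {}
    rows = []
    for title in ['Abduction', 'Ace Ventura', 'Act of Worship']:
        row['title'] = title   # keeps mutating the single shared dict
        rows.append(row)       # appends another reference to that same dict

    print rows
    # [{'title': 'Act of Worship'}, {'title': 'Act of Worship'}, {'title': 'Act of Worship'}]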

A couple of suggestions to make your code simpler:

  • Use a common ancestor for all of the movie item fields: //div[@id="body"]/div/table/tr/td/table/tr[2]/td/table[2]/tr
  • Use urlparse.urljoin() to build the "full" URLs

    import urlparse
    ...

    def parse(self, response):
        items = []
    
        hxs = HtmlXPathSelector(response)
        print ('hxs:', hxs)
    
        movie_rows = hxs.select('//div[@id="body"]/div/table/tr/td/table/tr[2]/td/table[2]/tr')
        for m in movie_rows:
            movie = BoxofficemojoItem()
    
            movie['title'] = m.select('td[1]/font/a/b/text()').extract()[0]
            movie['link'] = urlparse.urljoin(
                response.url, m.select('td[1]/font/a/@href').extract()[0])
            movie['gross'] = m.select('td[3]/font/text()').extract()[0]
            movie['release_date'] = m.select('td[7]/font//text()').extract()[0]
    
            items.append(movie)
        return items
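
One caveat with the row-based version (my addition, not part of the original answer): the //...table[2]/tr selector also matches the header row and the summary row whose gross column contains 'Total', which the original spider filtered out, and extract()[0] raises IndexError on rows that have no title link. A small guard inside the loop, assuming the same XPaths as above, handles both cases:

        for m in movie_rows:
            title = m.select('td[1]/font/a/b/text()').extract()
            if not title:
                # header/summary row with no title link - skip it
                continue

            movie = BoxofficemojoItem()
            movie['title'] = title[0]
            movie['link'] = urlparse.urljoin(
                response.url, m.select('td[1]/font/a/@href').extract()[0])
            movie['gross'] = m.select('td[3]/font/text()').extract()[0]
            movie['release_date'] = m.select('td[7]/font//text()').extract()[0]
            items.append(movie)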