我在Scrapy
上运行Mac OSX Lion 10.7.5
(以防万一)
以下是我的抓取工具:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from BoxOfficeMojo.items import BoxofficemojoItem
from BoxOfficeMojo.items import ActorItem
class MojoSpider(BaseSpider):
name = 'MojoSpider'
allowed_domains = ['boxofficemojo.com']
start_urls = ['http://www.boxofficemojo.com/movies/alphabetical.htm?letter=A&p=.htm']
def parse(self, response):
items = []
movie = BoxofficemojoItem()
hxs = HtmlXPathSelector(response)
print ('hxs:', hxs)
links = hxs.select('//div[@id="body"]/div/table/tr/td/table/tr[2]/td/table[2]/tr/td[1]/font/a/@href').extract() #was previously
titles = hxs.select('//div[@id="body"]/div/table/tr/td/table/tr[2]/td/table[2]/tr/td[1]/font/a/b/text()').extract()
gross = hxs.select('//div[@id="body"]/div/table/tr/td/table/tr[2]/td/table[2]/tr/td[3]/font/text()').extract()
opening = hxs.select('//div[@id="body"]/div/table/tr/td/table/tr[2]/td/table[2]/tr/td[7]/font//text()').extract()
for item in gross:
if 'Total' in item:
gross.remove(item)
items = []
for i in range(len(links)):
movie['title'] = titles[i]
movie['link'] = 'http://www.boxofficemojo.com' + links[i]
movie['gross'] = gross[i]
movie['release_date'] = opening[i]
items.append(movie)
return items
这是我的MySQL
管道:
import sys; sys.path.append("/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages")
import MySQLdb
import hashlib
from scrapy.exceptions import DropItem
from scrapy.http import Request
class BoxofficemojoPipeline(object):
def __init__(self):
self.conn = MySQLdb.connect(user='testuser', passwd='test', db='testdb', host='localhost', charset='utf8', use_unicode=True)
self.cursor = self.conn.cursor()
def process_item(self, item, spider):
try:
self.cursor.execute("""INSERT INTO example_movie (title, link, gross, release_date) VALUES (%s, %s, %s, %s)""", (item['title'], item['link'], item['gross'], item['release_date']))
self.conn.commit()
except MySQLdb.Error, e:
print "Error %d: %s" % (e.args[0], e.args[1])
return item
当我查看MySQL Database
中的条目时,页面中应该有多少部电影,但它们都是同一部电影Act of Worship
,这是最后一部电影这页纸。欢迎任何和所有建议!谢谢你的期待!
答案 0 :(得分:2)
尝试移动movie = BoxofficemojoItem()
循环中的for i in range(len(links)):
行
def parse(self, response):
items = []
hxs = HtmlXPathSelector(response)
print ('hxs:', hxs)
links = hxs.select('//div[@id="body"]/div/table/tr/td/table/tr[2]/td/table[2]/tr/td[1]/font/a/@href').extract() #was previously
titles = hxs.select('//div[@id="body"]/div/table/tr/td/table/tr[2]/td/table[2]/tr/td[1]/font/a/b/text()').extract()
gross = hxs.select('//div[@id="body"]/div/table/tr/td/table/tr[2]/td/table[2]/tr/td[3]/font/text()').extract()
opening = hxs.select('//div[@id="body"]/div/table/tr/td/table/tr[2]/td/table[2]/tr/td[7]/font//text()').extract()
for item in gross:
if 'Total' in item:
gross.remove(item)
items = []
for i in range(len(links)):
movie = BoxofficemojoItem()
movie['title'] = titles[i]
movie['link'] = 'http://www.boxofficemojo.com' + links[i]
movie['gross'] = gross[i]
movie['release_date'] = opening[i]
items.append(movie)
return items
以下建议使您的代码更简单:
//div[@id="body"]/div/table/tr/td/table/tr[2]/td/table[2]/tr
)使用urlparse.urljoin()
创建“完整”网址
导入urlparse ...
def parse(self, response):
items = []
hxs = HtmlXPathSelector(response)
print ('hxs:', hxs)
movie_rows = hxs.select('//div[@id="body"]/div/table/tr/td/table/tr[2]/td/table[2]/tr')
for m in movie_rows:
movie = BoxofficemojoItem()
movie['title'] = m.select('td[1]/font/a/@href').extract()[0]
movie['link'] = urlparse.urljoin(
response.url, m.select('td[1]/font/a/b/text()').extract()[0])
movie['gross'] = m.select('td[3]/font/text()').extract()[0]
movie['release_date'] = m.select('td[7]/font//text()').extract()[0]
items.append(movie)
return items