OK, the main problem I'm running into is that I have a table that should not contain any duplicate entries, because I want it to have a primary key that a separate table will reference.
From my understanding of normalization, that is the better way to design the database. Right now, though, the table has a bunch of duplicate entries: there should only be 6 unique entries, but there are 30, with each unique entry repeated 5 times.
How do I fix this? Should I work around it when importing the data, or should I use the UNIQUE keyword? Note that when I tried the UNIQUE feature, it gave me an error, because the data I'm importing really does contain duplicate entries.
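For illustration, this is the kind of failure I mean (a minimal sketch against a throwaway in-memory table shaped like my Divs table below, not my actual code):

import sqlite3

con = sqlite3.connect(':memory:')
cur = con.cursor()
cur.execute("CREATE TABLE Divs(Did INTEGER PRIMARY KEY AUTOINCREMENT, division TEXT UNIQUE)")
cur.execute("INSERT INTO Divs(division) VALUES(?)", ("Atlantic",))
# inserting the same value again raises sqlite3.IntegrityError
cur.execute("INSERT INTO Divs(division) VALUES(?)", ("Atlantic",))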
Edit:
Here are my items:
from scrapy.item import Item, Field


class TeamStats(Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    team = Field()
    division = Field()
    rosterurl = Field()
    player_desc = Field()
    playerurl = Field()


class Player(Item):
    exp = Field()
Here is my spider:
import scrapy
import string
import re
from scrapy.selector import HtmlXPathSelector  ## needed for the xpath commands
from scrapy.shell import inspect_response  ## needed for the Response object
from nbastats.items import TeamStats, Player  ## needed to import the item classes


class NbastatsSpider(scrapy.Spider):
    name = "nbaStats"

    start_urls = [
        "http://espn.go.com/nba/teams"  ## only a start URL; had some issues when navigating to team roster pages
    ]

    def parse(self, response):
        items = []  ## list that stores each TeamStats item
        i = 0  ## counter needed for older code
        for division in response.xpath('//div[@id="content"]//div[contains(@class, "mod-teams-list-medium")]'):
            for team in division.xpath('.//div[contains(@class, "mod-content")]//li'):
                item = TeamStats()
                item['division'] = division.xpath('.//div[contains(@class, "mod-header")]/h4/text()').extract()[0]
                item['team'] = team.xpath('.//h5/a/text()').extract()[0]
                item['rosterurl'] = "http://espn.go.com" + team.xpath('.//div/span[2]/a[3]/@href').extract()[0]
                items.append(item)
                print(item['rosterurl'])
                request = scrapy.Request(item['rosterurl'], callback=self.parseWPNow)
                request.meta['play'] = item
                yield request

    def parseWPNow(self, response):
        item = response.meta['play']
        item = self.parseRoster(item, response)
        return item

    def parseRoster(self, item, response):
        players1 = []
        for players in response.xpath("//td[@class='sortcell']"):
            play = {}
            play['name'] = players.xpath("a/text()").extract()[0]
            play['position'] = players.xpath("following-sibling::td[1]").extract()[0]
            play['age'] = players.xpath("following-sibling::td[2]").extract()[0]
            play['height'] = players.xpath("following-sibling::td[3]").extract()[0]
            play['weight'] = players.xpath("following-sibling::td[4]").extract()[0]
            play['college'] = players.xpath("following-sibling::td[5]").extract()[0]
            play['salary'] = players.xpath("following-sibling::td[6]").extract()[0]
            players1.append(play)
        item['playerurl'] = response.xpath("//td[@class='sortcell']/a").extract()
        item['player_desc'] = players1
        return item
And here is my pipeline:
import sqlite3 as lite  ## needed for lite.connect below


class NbastatsPipeline(object):
    def __init__(self):
        self.setupDBCon()
        self.createTables()

    def setupDBCon(self):
        self.con = lite.connect('test.db')
        self.cur = self.con.cursor()

    def createTables(self):
        self.dropTeamsTable()
        self.dropPlayersTable()
        self.dropDivsTable()
        self.createTeamsTable()
        self.createPlayersTable()
        self.createDivsTable()

    def createTeamsTable(self):
        self.cur.execute("CREATE TABLE IF NOT EXISTS Teams(P_id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, \
            team TEXT, \
            DivId INTEGER, \
            FOREIGN KEY (DivId) REFERENCES Divs(Did) \
            )")

    def createDivsTable(self):
        self.cur.execute("CREATE TABLE IF NOT EXISTS Divs(Did INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, \
            division TEXT)")

    def createPlayersTable(self):
        self.cur.execute("CREATE TABLE IF NOT EXISTS Players(player_name TEXT, \
            salary TEXT, \
            weight INTEGER, \
            age INTEGER, \
            college TEXT)")

    def dropTeamsTable(self):
        self.cur.execute("DROP TABLE IF EXISTS Teams")

    def dropPlayersTable(self):
        self.cur.execute("DROP TABLE IF EXISTS Players")

    def dropDivsTable(self):
        self.cur.execute("DROP TABLE IF EXISTS Divs")

    def closeDB(self):
        self.con.close()

    def __del__(self):
        self.closeDB()

    def process_item(self, item, spider):
        for key, value in item.items():
            if key == "division":
                print(item.get('division', ""))
                self.cur.execute("INSERT INTO Divs( division ) VALUES(?)", (item.get('division', ""),))
                self.con.commit()
        # self.storeInDb(item)  # use this line when ready to pass the whole item through to storeInDb
        return item
Answer:
The usual approach is to enforce uniqueness at the database level and handle violations in your application.
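For example, applied to the Divs table from the question, uniqueness can be declared when the table is created. A minimal sketch (the rest of the schema stays as in the question):

def createDivsTable(self):
    # UNIQUE makes sqlite reject a second row with the same division value
    self.cur.execute("CREATE TABLE IF NOT EXISTS Divs(Did INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, \
        division TEXT UNIQUE)")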
In Scrapy, if you insert records in a pipeline, the code usually has a structure like this:
import sqlite3
from scrapy.exceptions import DropItem

try:
    cursor.execute("""
        INSERT INTO
            table
            (field1, field2)
        VALUES (?, ?)""", (field1, field2))
except sqlite3.IntegrityError:
    raise DropItem('Duplicate entry')
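If you would rather skip duplicates silently instead of treating them as errors, sqlite also supports INSERT OR IGNORE, which discards any row that would violate a UNIQUE constraint. A sketch, assuming division was declared UNIQUE as above:

# a duplicate division is silently skipped instead of raising IntegrityError
self.cur.execute("INSERT OR IGNORE INTO Divs(division) VALUES(?)", (item.get('division', ""),))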
See also this sqlite pipeline example:
There is also a project called scrapy-dblite that provides a nice abstraction: you don't have to drop down to raw SQL queries, since it comes with a simple built-in ORM. For example:
from scrapy.exceptions import DropItem
from myproject.items import Product
import dblite


class StoreItemsPipeline(object):
    def __init__(self):
        self.ds = None

    def open_spider(self, spider):
        self.ds = dblite.open(Product, 'sqlite://db/products.sqlite:items', autocommit=True)

    def close_spider(self, spider):
        self.ds.commit()
        self.ds.close()

    def process_item(self, item, spider):
        if isinstance(item, Product):
            try:
                self.ds.put(item)
            except dblite.DuplicateItem:
                raise DropItem("Duplicate item found: %s" % item)
        else:
            raise DropItem("Unknown item type, %s" % type(item))
        return item
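Note that raising DropItem in either variant also stops Scrapy from passing the duplicate item to any remaining pipeline components, and the drop shows up in the crawl log, so duplicates are both kept out of the database and visible in the stats.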