作为练习,我正在建立一个数据库,用来保存从音乐评分网站抓取的专辑、艺术家和评分信息。
当我多次运行脚本时,如何防止相同的数据被重复插入到表中?
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
import urllib.error
import sqlite3
# Scrape album titles from the first Pitchfork reviews page and store them in
# a local SQLite database, without creating duplicate rows on repeated runs.

# Open (or create) the local database file.
conn = sqlite3.connect('pitchscraper.sqlite')
cur = conn.cursor()

# Create the table on first run; on later runs this is a no-op.
cur.execute('''
CREATE TABLE IF NOT EXISTS Albums (id INTEGER, rating INTEGER, name TEXT, url TEXT, artist TEXT)''')

# INSERT OR IGNORE only skips a row when inserting it would violate a
# constraint. The table itself declares none, so a UNIQUE index on `name` is
# what actually makes repeated runs idempotent. A database filled by earlier
# runs may already hold duplicate names, which would make the index creation
# fail, so keep only the oldest row for each name first.
cur.execute('''
DELETE FROM Albums
WHERE name IS NOT NULL
  AND rowid NOT IN (SELECT MIN(rowid) FROM Albums GROUP BY name)''')
cur.execute(
    'CREATE UNIQUE INDEX IF NOT EXISTS idx_albums_name ON Albums (name)')

# Fetch the page. A browser-like User-Agent is sent explicitly; the `with`
# block closes the HTTP response as soon as the body has been read.
req = Request('http://pitchfork.com/reviews/albums/?page=1', headers={'User-Agent': 'Mozilla/5.0'})
with urlopen(req) as response:
    pitchpage = response.read()

# Parse with Beautiful Soup.
soup = BeautifulSoup(pitchpage, "lxml")
albums = soup.find_all('h2')
artists = soup.find_all(attrs={"class": "artist-list"})

print("ALBUMS")
for tag in albums:
    # get_text() yields a plain string even when the <h2> contains nested
    # markup; a Tag object could not be bound as an SQL parameter.
    album = tag.get_text(strip=True)
    if not album:
        continue  # skip headings with no visible text
    print(album)
    # The UNIQUE index on `name` turns this into a no-op for known albums.
    cur.execute('INSERT OR IGNORE INTO Albums (name) VALUES (?)', (album,))

# Persist the inserts; without a commit they are rolled back when the
# connection is closed.
conn.commit()