I am scraping a website with BeautifulSoup, requests and sqlite3, and saving the scraped data to a database. Because the scrape runs repeatedly over a large amount of incoming data, it now inserts many duplicate rows into the database. I tried using INSERT OR REPLACE to keep only unique data, but it did not work. How can I save only unique rows to the database?
Here is my code:
import csv
from bs4 import BeautifulSoup
import requests
import time
import pdb
import sqlite3
url = "http://up-rera.in/projects"
url1 = "http://up-rera.in"
final_data = []
dct = {}
def writefiles(alldata, filename):
    with open("./" + filename, "w") as csvfile:
        csvfile = csv.writer(csvfile, delimiter=",")
        csvfile.writerow("")
        for i in range(0, len(alldata)):
            csvfile.writerow(alldata[i])
def getbyGet(url, values):
    res = requests.get(url, data=values)
    text = res.text
    return text
def readHeaders():
    global url, url1
    html = getbyGet(url, {})
    soup = BeautifulSoup(html, "html.parser")
    EVENTTARGET = soup.select("#__VIEWSTATE")[0]['value']
    EVENTVALIDATION = soup.select("#__EVENTVALIDATION")[0]['value']
    VIEWSTATE = soup.select("#__VIEWSTATE")[0]['value']
    #VIEWSTATEGENERATOR = soup.select("#__VIEWSTATEGENERATOR")[0]["value"]
    headers = {'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
               'Content-Type':'application/x-www-form-urlencoded',
               'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0'}
    formfields = {'__EVENTARGUMENT':'',
                  '__EVENTVALIDATION':EVENTVALIDATION,
                  '__EVENTTARGET':EVENTTARGET,
                  '__VIEWSTATE':VIEWSTATE,
                  "__VIEWSTATEGENERATOR": "4F1A7E70",
                  'ctl00$ContentPlaceHolder1$btnSearch':'Search',
                  'ctl00$ContentPlaceHolder1$DdlprojectDistrict':0, #this is where the city name changes in each iteration
                  'ctl00$ContentPlaceHolder1$txt_regid':'',
                  'ctl00$ContentPlaceHolder1$txtProject':''}
    s = requests.session()
    conn = sqlite3.connect("99_data_increment.db")
    #cur = conn.cursor()
    conn.execute("CREATE TABLE IF NOT EXISTS crawled (id INTEGER PRIMARY KEY, Rera_Number text, Project_Name text, Promoter_Name text, City text, ResComm text, Links text) ")
    cur = conn.cursor()
    res = s.post(url, data=formfields, headers=headers).text
    soup = BeautifulSoup(res, "html.parser")
    get_details = soup.find_all(id="ctl00_ContentPlaceHolder1_GridView1")
    for details in get_details:
        gettr = details.find_all("tr")[1:]
        for tds in gettr:
            td = tds.find_all("td")[1]
            rera = td.find_all("span")
            rnumber = ""
            for num in rera:
                rnumber = num.text
            sublist = []
            sublist.append(rnumber)
            name = tds.find_all("td")[2]
            prj_name = name.find_all("span")
            prj = ""
            for prjname in prj_name:
                prj = prjname.text
            sublist.append(prj)
            promoter_name = tds.find_all("td")[3]
            promoter = promoter_name.find_all("span")
            prom = ""
            for promname in promoter:
                prom = promname.text
            sublist.append(prom)
            district = tds.find_all("td")[4]
            dist = district.find_all("span")
            district_name = ""
            for districtname in dist:
                district_name = districtname.text
            sublist.append(district_name)
            protype = tds.find_all("td")[5]
            project = protype.find_all("span")
            projectype = ""
            for prjtype in project:
                projectype = prjtype.text
            sublist.append(projectype)
            final_data.append(sublist)
            cur.execute("INSERT OR REPLACE INTO crawled VALUES (NULL,?,?,?,?,?,?)",
                        (rnumber, prj, prom, district_name, projectype, projectype))
            conn.commit()
    #print(final_data)
    return final_data
def main():
    datas = readHeaders()
    writefiles(datas, "Up-new.csv")
main()
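
For reference, here is my understanding of why INSERT OR REPLACE might be doing nothing in my case: the only key on crawled is the autoincrement id, so no conflict is ever triggered. Below is a minimal standalone sketch (hypothetical table name and sample data, not my real schema) of what I think is needed, namely a UNIQUE constraint on the natural key plus INSERT OR IGNORE (or OR REPLACE). Please correct me if this is the wrong way to keep only unique rows:

    # Minimal sketch, not my real scraper: assumes Rera_Number is meant to be the unique key.
    import sqlite3

    conn = sqlite3.connect(":memory:")
    conn.execute("""CREATE TABLE IF NOT EXISTS crawled_demo (
                        id INTEGER PRIMARY KEY,
                        Rera_Number TEXT UNIQUE,  -- the uniqueness constraint my real table lacks
                        Project_Name TEXT)""")

    rows = [("UPRERAPRJ1", "Project A"), ("UPRERAPRJ1", "Project A"), ("UPRERAPRJ2", "Project B")]
    for rera, prj in rows:
        # With the UNIQUE column, OR IGNORE skips duplicates; OR REPLACE would overwrite them instead.
        conn.execute("INSERT OR IGNORE INTO crawled_demo VALUES (NULL, ?, ?)", (rera, prj))
    conn.commit()

    print(conn.execute("SELECT COUNT(*) FROM crawled_demo").fetchone()[0])  # prints 2, not 3

If this is right, I am still not sure how best to apply it to my existing crawled table and the insert inside readHeaders().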