How to scrape only unique values and save them to the database

Date: 2018-06-08 09:27:22

Tags: python python-3.x beautifulsoup

I am scraping a website with BeautifulSoup, requests, and sqlite3 and saving the scraped data into a database. As the volume of incoming data has grown, the scrape now adds far too many duplicate rows to the database. I tried using the INSERT OR REPLACE command to keep only unique data, but it did not work. How can I save only unique data to the database?
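
For reference, SQLite's INSERT OR REPLACE (and INSERT OR IGNORE) only takes effect when the new row violates a UNIQUE or PRIMARY KEY constraint; a table whose only key is an autoincrement id never reports a conflict, so every insert is accepted. A minimal, self-contained sketch of that behaviour (the table and column names here are illustrative, not taken from the code below):

import sqlite3

conn = sqlite3.connect(":memory:")
# UNIQUE on the registration number is what makes deduplication possible
conn.execute("CREATE TABLE demo (id INTEGER PRIMARY KEY, rera TEXT UNIQUE)")
conn.execute("INSERT OR IGNORE INTO demo VALUES (NULL, 'UPRERAPRJ0001')")
conn.execute("INSERT OR IGNORE INTO demo VALUES (NULL, 'UPRERAPRJ0001')")  # ignored
print(conn.execute("SELECT COUNT(*) FROM demo").fetchone()[0])  # -> 1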

Here is my code:

import csv
import sqlite3

import requests
from bs4 import BeautifulSoup

url = "http://up-rera.in/projects"
url1 = "http://up-rera.in"
final_data = []
dct = {}

def writefiles(alldata, filename):
    # newline="" keeps the csv module from inserting blank rows on Windows
    with open("./" + filename, "w", newline="") as f:
        writer = csv.writer(f, delimiter=",")
        for row in alldata:
            writer.writerow(row)

def getbyGet(url, values):
    # query-string values belong in params= for a GET request
    res = requests.get(url, params=values)
    return res.text

def readHeaders():
    html = getbyGet(url, {})
    soup  = BeautifulSoup(html, "html.parser")
    # __EVENTTARGET is a separate hidden field and is usually empty;
    # fall back to "" if the page omits it
    target = soup.select("#__EVENTTARGET")
    EVENTTARGET = target[0].get("value", "") if target else ""
    EVENTVALIDATION = soup.select("#__EVENTVALIDATION")[0]['value']
    VIEWSTATE = soup.select("#__VIEWSTATE")[0]['value']
    #VIEWSTATEGENERATOR = soup.select("#__VIEWSTATEGENERATOR")[0]["value"]
    headers= {'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
              'Content-Type':'application/x-www-form-urlencoded',
              'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0'}

    formfields =  {'__EVENTARGUMENT':'',
                  '__EVENTVALIDATION':EVENTVALIDATION,
                  '__EVENTTARGET':EVENTTARGET,
                  '__VIEWSTATE':VIEWSTATE,
                  "__VIEWSTATEGENERATOR": "4F1A7E70",
                  'ctl00$ContentPlaceHolder1$btnSearch':'Search',
                  'ctl00$ContentPlaceHolder1$DdlprojectDistrict':0, #this is where your city name changes in each iteration
                  'ctl00$ContentPlaceHolder1$txt_regid':'',
                  'ctl00$ContentPlaceHolder1$txtProject':''}
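    # The hidden WebForms fields above (__VIEWSTATE, __EVENTVALIDATION, ...)
    # must be echoed back in the POST or ASP.NET rejects the postback.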
    s = requests.Session()
    conn = sqlite3.connect("99_data_increment.db")
    #cur = conn.cursor()
    conn.execute("CREATE TABLE IF NOT EXISTS crawled (id INTEGER PRIMARY KEY, Rera_Number text, Project_Name text, Promoter_Name text, City text, ResComm text, Links text) ")
    cur = conn.cursor()
    res = s.post(url, data=formfields, headers=headers).text
    soup = BeautifulSoup(res, "html.parser")
    get_details = soup.find_all(id="ctl00_ContentPlaceHolder1_GridView1")
    for details in get_details:
        gettr = details.find_all("tr")[1:]
        for tds in gettr:
            sublist = []  # one output row per table row; create it before
                          # the span loop so an empty cell cannot leave it undefined
            td = tds.find_all("td")[1]
            rera = td.find_all("span")
            rnumber = ""
            for num in rera:
                rnumber = num.text
            sublist.append(rnumber)
            name = tds.find_all("td")[2]
            prj_name = name.find_all("span")
            prj = ""
            for prjname in prj_name:
                prj = prjname.text
                sublist.append(prj)
            promoter_name = tds.find_all("td")[3]
            promoter = promoter_name.find_all("span")
            prom = ""
            for promname in promoter:
                prom = promname.text
                sublist.append(prom)
            district = tds.find_all("td")[4]
            dist = district.find_all("span")
            district_name = ""
            for districtname in dist:
                district_name = districtname.text
                sublist.append(district_name)
            protype = tds.find_all("td")[5]
            project = protype.find_all("span")
            projectype = ""
            for prjtype in project:
                projectype = prjtype.text
                sublist.append(projectype)
            final_data.append(sublist)
            cur.execute("INSERT OR REPLACE INTO crawled VALUES (NULL,?,?,?,?,?,?)",(rnumber, prj, prom , district_name , projectype, projectype))
            conn.commit()
        #print(final_data)
    return final_data


def main():
    datas = readHeaders()
    writefiles(datas, "Up-new.csv")
if __name__ == "__main__":
    main()
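
If changing the table definition is not desirable, duplicates can also be filtered before inserting by checking whether the RERA number is already stored. A sketch of such a helper (insert_unique is a hypothetical name, not part of the code above):

def insert_unique(cur, row):
    # row = (rnumber, prj, prom, district_name, projectype, links)
    # skip the insert when this Rera_Number is already in the table
    cur.execute("SELECT 1 FROM crawled WHERE Rera_Number = ?", (row[0],))
    if cur.fetchone() is None:
        cur.execute("INSERT INTO crawled VALUES (NULL,?,?,?,?,?,?)", row)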

0 Answers:

No answers yet.