How to merge two Python web scrapers

Time: 2018-07-17 10:25:25

Tags: pdf web-scraping python-3.6

I wrote a web scraper that downloads content (text) and PDFs and saves them in sqlite3. I also wrote a web scraper that scrapes the PDFs.

I am trying to create a combined web scraper that downloads the content and the PDFs, scrapes the PDFs, and saves everything together in sqlite3.

Below is my code:

import urllib.request
import urllib, json, csv
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
import base64, requests, random
import sqlite3
from datetime import date
import pdfquery
import glob
link = 'https://maharerait.mahaonline.gov.in/searchlist/searchlist'
newlink = 'https://maharerait.mahaonline.gov.in/SearchList/Search'
talukaLink = "https://maharerait.mahaonline.gov.in/SearchList/GetTaluka"
distlink = "https://maharerait.mahaonline.gov.in/SearchList/GetDistrict"
divLink = "https://maharerait.mahaonline.gov.in/SearchList/GetDivision"
prjLink = "https://maharerait.mahaonline.gov.in/SearchList/GetProjectName"

alldata = []

links = {}

divisionList = {}
state_ids = [33, 10, 27]
max_entries = 500

user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edge/15.15063',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36'
]

ses = requests.session()
ses.headers['User-Agent'] = random.choice( user_agents )
ses.headers['Host'] = 'maharerait.mahaonline.gov.in'
ses.headers['Referer'] = newlink

def getDataByGet(url, values):
    print("url >> "+url)
    req = ses.get(url, timeout=10)
    text = req.text
    req.close()
    #print("hello world"+str(values))
    return text

def getDivision(sid):
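    # Fetch the divisions for a state and return them as {division_id: division_name}.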
    ## for now we are taking 6 districts; this needs to be updated when the data gets updated
    global divisionList, divLink
    data = ses.post(divLink, data={'StateID': str(sid) } )
    print(data.status_code)
    jsn = json.loads( data.text )
    #getDivisionList()
    print(" >>>>> Data fetched from divisionList is ", sid)
    lst = {}
    for dct in jsn:
        lst[ str( dct['ID'] ) ] = dct['Text']
    print( lst )
    return lst

def getDistrict(divId):
    global distlink
    values = {'DivID': divId}
    data = getData(distlink, values)
    return data

def parseJson(data):
    parsed = json.loads(data)
    return parsed

def getTaluka(disId):
    global talukaLink
    values= {'DisID': disId}
    data = getData(talukaLink, values)
    return data

def getProjects(divId, disId):
    global prjLink
    values= {'DisID': disId, 'DivID': divId}
    data = getData( prjLink, values)
    if len(data)<10:
        return "{}"
    return data

def getToken():
    data = getDataByGet( newlink, values={})
    soup = BeautifulSoup(data, "html.parser")
    token = soup.find('input', {'name':'__RequestVerificationToken'} )
    token = token.attrs['value']
    return token

def getProjectsList():
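    # Main crawl loop: fetch a verification token, then for each state and division page through the search results and hand every page to parseXMLData.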
    global state_ids
    token = getToken()
    ses.cookies['__RequestVerificationToken']=token
    for sid in state_ids:
        divList = getDivision(sid)
        #if len(alldata)>max_entries:
        #    break
        for divid in divList.keys():
            #if len(alldata)>max_entries:
            #    break
            values = {'__RequestVerificationToken': token, 'Type':'Promoter', 'ID': '0', 'pageTraverse':'1','Project':'','hdnProject':'','Promoter':'','hdnPromoter':'','CertiNo':'','hdnCertiNo':'','State':str( sid ),'Division':divid,'hdnDivision':'','hdnDistrict':'','hdnDTaluka':'','hdnVillage':'','hdnState': str(sid),'District':'','Taluka':'','Village':'','CompletionDate_From':'','hdnfromdate':'','CompletionDate_To':'','hdntodate':'','PType':'','hdnPType':'','btnSearch':'Search', 'TotalRecords':'50', 'CurrentPage':'1', 'TotalPages':'1'}
            finalPrjData = getData(newlink, values)
            divName = divList[divid]
            print(divName)
            print(" page No : 1")
            tp, tr, cp = getTotalPages(finalPrjData)
            print( tp, tr, cp )
            pt = 1
            for pageIndex in range(0, tp+1):
                #if len(alldata)>max_entries:
                #    break
                pg = pageIndex-1
                if pg<1:
                    pg = 3
                values = {'__RequestVerificationToken': token, 'Type':'Promoter', 'ID': '0', 'pageTraverse':pt,'Project':'','hdnProject':'','Promoter':'','hdnPromoter':'','CertiNo':'','hdnCertiNo':'','State':str( sid ),'Division':divid,'hdnDivision':'','hdnDistrict':'','hdnDTaluka':'','hdnVillage':'','hdnState': str(sid),'District':'','Taluka':'','Village':'','CompletionDate_From':'','hdnfromdate':'','CompletionDate_To':'','hdntodate':'','PType':'','hdnPType':'','btnSearch':'Search', 'TotalRecords':tr, 'CurrentPage':pg, 'TotalPages':tp}
                finalPrjData = getData(newlink, values)
                print(" page No : ",pageIndex)
                parseXMLData(finalPrjData, divName, distName="")
                pt += 1
                if pt>1:
                    pt = 3

def getTotalPages(data):
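    # Read the hidden paging inputs (TotalPages, TotalRecords, CurrentPage) from a results page.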
    soup = BeautifulSoup(data, "html.parser")
    tp = soup.find('input', {'name':'TotalPages'})
    tr = soup.find('input', {'name':'TotalRecords'})
    cp = soup.find('input', {'name':'CurrentPage'})
    # Fall back to '0' when a hidden field is missing; item access works for both a Tag and the dict fallback.
    if tr is None:
        tr = {'value': '0'}
    if cp is None:
        cp = {'value': '0'}
    if tp is None:
        tp = {'value': '0'}
    return int(tp['value']), int(tr['value']), int(cp['value'])

def parseXMLData(htmldata, divName, distName=""):
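    # Parse one page of results: collect project and promoter details from the table, download each certificate PDF and insert a row per project into sqlite3.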
    global alldata, links
    conn = sqlite3.connect("99_data_increment.db", timeout=10)
    fdate = date.today()
    #cur = conn.cursor()
    conn.execute("""CREATE TABLE IF NOT EXISTS crawled
                      (id INTEGER PRIMARY KEY, State text, XID text, Project_Name text, City text, Main_City text, Registration_Number text, Promoter_Name text, Rera_URL text, PDF_text, Crawled_Date text, Status text, Names text, Transaction_Date text, Comments text, Call_Contact_Number text, Creation_Type text, Builder_Website text,
                      CONSTRAINT number_unique UNIQUE (Project_Name, Promoter_Name))
                      """)
    cur = conn.cursor()
    soup = BeautifulSoup(htmldata, "html.parser")
    tables = soup.find_all("table")
    for table in tables:
        #if len(alldata)>max_entries:
        #    break
        print(len(alldata))
        attr = table.attrs
        if "table" in attr['class']:
            tbody = table.find_all("tbody")
            if len(tbody)>0:
                tbody = tbody[0]
                tr_lst = tbody.find_all("tr")
                for tr in tr_lst:
                    #if len(alldata)>max_entries:
                    #    break
                    sublist = []
                    blank = ""
                    td_lst = tr.find_all("td")
                    if len(td_lst)>6:
                        prjname = td_lst[1].text
                        proname = td_lst[2].text
                        td = td_lst[4]
                        a_lst = td.find_all("a")
                        td5 = td_lst[5]
                        a_lst5 = td5.find_all("a")[0]
                        data =a_lst5.attrs["data-qstr"]
                        data_dec = base64.b64decode(data)
                        data_dec = data_dec.decode("utf-8")
                        data_dec = data_dec.split("&")
                        data1 = ''
                        for each in data_dec:
                            if each.find('AppID')!=-1 and each.find('xt')==-1:
                                certNo = each.split("=")[1]
                        #print( certNo )
                        sublist.append(prjname)
                        sublist.append(proname)
                        #sublist.append(last_modified)
                        sublist.append(divName)
                        #print(sublist)
                        if len(a_lst)>0:
                            a = a_lst[0]
                            href = a.attrs['href']
                            link = "https://maharerait.mahaonline.gov.in/"+href
                            links[certNo] = link
                            sublist.append(link)
                        td_cert = td_lst[6]
                        a_cert = td_cert.find("a")
                        a_atr = a_cert.attrs
                        status = True
                        if "data-qstr" in a_atr.keys():
                            val =a_atr["data-qstr"]
                            fname = certNo+".pdf"
                            status = downloadPdf( fname, val)
                            if status==True:
                                sublist.append( fname )
                            else:
                                sublist.append( " ")
                    if len(sublist)>0:
                        alldata.append(sublist)
                    if status==False:
                        print("Project: " + prjname + " could not be downloaded")
                    data = scrapepdf()# here we need to call function
                    cur.execute("INSERT OR IGNORE INTO crawled VALUES (NULL,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)",("Maharashtra",blank,prjname,divName,blank,data, proname , link, blank,fdate, blank, blank, blank, blank, blank, blank, blank ))
    conn.commit()
    return alldata

def downloadPdf(fname, _id):
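    # Request the certificate by ID, base64-decode the response and save it as a local PDF file.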
    try:
        values = {'ID': _id}
        lnk =  "https://maharerait.mahaonline.gov.in/SearchList/ShowCertificate"
        data = getData(lnk, values)
        decode_val = base64.b64decode(data)
        f= open("./"+fname, "wb")
        f.write(decode_val)
        f.close()
        return True
    except Exception:
        print("Could not download the file", fname)
        return False

def writedata(alldata1, filename):
    print(" >>>> FINAL PRINTING DATA >>>> ")
    with open("./"+filename,'w') as csvfile:
        csvfile = csv.writer(csvfile, delimiter=',')
        csvfile.writerow("")
        for i in range(0, len( alldata1 )):
            try:
                csvfile.writerow( alldata1[i]  )
            except Exception:
                print("Could not save the data for the project", alldata1[i])


def formattext(text):
    while text.find("\r\n")>=0:
        text = text.replace("\r\n","")

    while text.find("   ")>=0:
        text = text.replace("   ","")
    return text

def main():
    global alldata
    getProjectsList()
    print("Before write the projects data to the file. Count >> "+str(len(alldata)))
    writedata(alldata, "data.csv")
    savejsonformat()

def savejsonformat():
    global alldata
    jsondata = []
    for sublist in alldata:
        jsonlist = [sublist[4].replace(".pdf",''), sublist[3]]
        jsondata.append(jsonlist)
    f = open("./json_data.txt", "w")
    f.write( json.dumps(jsondata) )
    f.close()

def scrapepdf():
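    # Second scraper: use pdfquery to read the text just below the "registration number" label; note it scans a fixed folder and returns after the first PDF.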
    mahadata = []
    d = glob.glob(r"C:\Users\prince.bhatia\Desktop\incremenaal\*.pdf")

    for i in d:
        sublist = []
        pdf = pdfquery.PDFQuery(i)
        pdf.load()
        label = pdf.pq('LTTextLineHorizontal:contains("registration number")')
        left_corner = float(label.attr('x0'))
        bottom_corner = float(label.attr('y0'))
        name = pdf.pq('LTTextLineHorizontal:in_bbox("%s, %s, %s, %s")' % (left_corner, bottom_corner-30, left_corner+150, bottom_corner)).text()
        return name

def getData(url, values):
    #print("url >> "+url)
    req = requests.post(url, data=values)#, timeout=10)
    text = req.text
    req.close()
    return text

main()

I wrote a function, scrapepdf, and the PDFs do get downloaded, but my problem is that this code never reads them. When I run the PDF scraper on its own it reads the PDFs, but it does not read them alongside the other web scraper.
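What I am trying to get to is roughly the following (a minimal sketch, assuming scrapepdf is changed to take the path of a single PDF instead of globbing a fixed folder; pdf_text is just an illustrative name):

def scrapepdf(pdf_path):
    # Read one PDF and return the text just below the "registration number" label.
    pdf = pdfquery.PDFQuery(pdf_path)
    pdf.load()
    label = pdf.pq('LTTextLineHorizontal:contains("registration number")')
    if not label:
        return ""
    left_corner = float(label.attr('x0'))
    bottom_corner = float(label.attr('y0'))
    return pdf.pq('LTTextLineHorizontal:in_bbox("%s, %s, %s, %s")'
                  % (left_corner, bottom_corner-30, left_corner+150, bottom_corner)).text()

# ...and inside parseXMLData, right after the certificate is saved:
status = downloadPdf(fname, val)
if status == True:
    sublist.append(fname)
    pdf_text = scrapepdf(fname)    # read the PDF that was just downloaded
else:
    sublist.append(" ")
    pdf_text = ""

pdf_text would then take the place of the current data = scrapepdf() call, so the registration number read from the PDF ends up in the row that is inserted into sqlite3.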

0 Answers:

No answers yet.