I wrote a web scraper that downloads page content (text) and PDFs and saves them in sqlite3. I also wrote a separate scraper that extracts text from the downloaded PDFs.
I am now trying to build a single, combined scraper that downloads the page content and the PDFs, scrapes the PDFs, and saves everything together in sqlite3.
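For reference, the kind of row I want to end up with is roughly this (a simplified sketch only; crawled_demo, demo.db and the placeholder values are not my real names, the actual table is created in parseXMLData() below):

import sqlite3

conn = sqlite3.connect("demo.db")
conn.execute("""CREATE TABLE IF NOT EXISTS crawled_demo
                (id INTEGER PRIMARY KEY, Project_Name text, Promoter_Name text, PDF_text text,
                 CONSTRAINT number_unique UNIQUE (Project_Name, Promoter_Name))""")
# one row per project: fields scraped from the search results page plus the text scraped from its PDF
conn.execute("INSERT OR IGNORE INTO crawled_demo (Project_Name, Promoter_Name, PDF_text) VALUES (?, ?, ?)",
             ("some project", "some promoter", "text extracted from that project's PDF"))
conn.commit()
conn.close()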
Below is my code:
import json, csv, base64, random
import requests
import sqlite3
from datetime import date
from bs4 import BeautifulSoup
import pdfquery
import glob
link = 'https://maharerait.mahaonline.gov.in/searchlist/searchlist'
newlink = 'https://maharerait.mahaonline.gov.in/SearchList/Search'
talukaLink = "https://maharerait.mahaonline.gov.in/SearchList/GetTaluka"
distlink = "https://maharerait.mahaonline.gov.in/SearchList/GetDistrict"
divLink = "https://maharerait.mahaonline.gov.in/SearchList/GetDivision"
prjLink = "https://maharerait.mahaonline.gov.in/SearchList/GetProjectName"
alldata = []
links = {}
divisionList = {}
state_ids = [33, 10, 27]
max_entries = 500
user_agents = ['Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edge/15.15063', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36']
ses = requests.session()
ses.headers['User-Agent'] = random.choice( user_agents )
ses.headers['Host'] = 'maharerait.mahaonline.gov.in'
ses.headers['Referer'] = newlink
def getDataByGet(url, values):
print("url >> "+url)
req = ses.get(url, timeout=10)
text = req.text
req.close()
#print("hello world"+str(values))
return text
def getDivision(sid):
    ## for now we are taking 6 districts; this needs to be updated when the data gets updated
global divisionList, divLink
data = ses.post(divLink, data={'StateID': str(sid) } )
print(data.status_code)
jsn = json.loads( data.text )
#getDivisionList()
print(" >>>>> Data fetched from divisionList is ", sid)
lst = {}
for dct in jsn:
lst[ str( dct['ID'] ) ] = dct['Text']
print( lst )
return lst
def getDistrict(divId):
global distlink
values = {'DivID': divId}
data = getData(distlink, values)
return data
def parseJson(data):
parsed = json.loads(data)
return parsed
def getTaluka(disId):
global talukaLink
values= {'DisID': disId}
data = getData(talukaLink, values)
return data
def getProjects(divId, disId):
global prjLink
values= {'DisID': disId, 'DivID': divId}
data = getData( prjLink, values)
if len(data)<10:
return "{}"
return data
def getToken():
data = getDataByGet( newlink, values={})
soup = BeautifulSoup(data, "html.parser")
token = soup.find('input', {'name':'__RequestVerificationToken'} )
token = token.attrs['value']
return token
def getProjectsList():
global state_ids
token = getToken()
ses.cookies['__RequestVerificationToken']=token
for sid in state_ids:
divList = getDivision(sid)
#if len(alldata)>max_entries:
# break
for divid in divList.keys():
#if len(alldata)>max_entries:
# break
values = {'__RequestVerificationToken': token, 'Type':'Promoter', 'ID': '0', 'pageTraverse':'1','Project':'','hdnProject':'','Promoter':'','hdnPromoter':'','CertiNo':'','hdnCertiNo':'','State':str( sid ),'Division':divid,'hdnDivision':'','hdnDistrict':'','hdnDTaluka':'','hdnVillage':'','hdnState': str(sid),'District':'','Taluka':'','Village':'','CompletionDate_From':'','hdnfromdate':'','CompletionDate_To':'','hdntodate':'','PType':'','hdnPType':'','btnSearch':'Search', 'TotalRecords':'50', 'CurrentPage':'1', 'TotalPages':'1'}
finalPrjData = getData(newlink, values)
divName = divList[divid]
print(divName)
print(" page No : 1")
tp, tr, cp = getTotalPages(finalPrjData)
print( tp, tr, cp )
pt = 1
for pageIndex in range(0, tp+1):
#if len(alldata)>max_entries:
# break
pg = pageIndex-1
if pg<1:
pg = 3
values = {'__RequestVerificationToken': token, 'Type':'Promoter', 'ID': '0', 'pageTraverse':pt,'Project':'','hdnProject':'','Promoter':'','hdnPromoter':'','CertiNo':'','hdnCertiNo':'','State':str( sid ),'Division':divid,'hdnDivision':'','hdnDistrict':'','hdnDTaluka':'','hdnVillage':'','hdnState': str(sid),'District':'','Taluka':'','Village':'','CompletionDate_From':'','hdnfromdate':'','CompletionDate_To':'','hdntodate':'','PType':'','hdnPType':'','btnSearch':'Search', 'TotalRecords':tr, 'CurrentPage':pg, 'TotalPages':tp}
finalPrjData = getData(newlink, values)
print(" page No : ",pageIndex)
parseXMLData(finalPrjData, divName, distName="")
pt += 1
if pt>1:
pt = 3
def getTotalPages(data):
    soup = BeautifulSoup(data, "html.parser")
    tp = soup.find('input', {'name': 'TotalPages'})
    tr = soup.find('input', {'name': 'TotalRecords'})
    cp = soup.find('input', {'name': 'CurrentPage'})
    # fall back to 0 when a hidden field is missing from the response
    tp_val = int(tp.attrs['value']) if tp is not None else 0
    tr_val = int(tr.attrs['value']) if tr is not None else 0
    cp_val = int(cp.attrs['value']) if cp is not None else 0
    return tp_val, tr_val, cp_val
def parseXMLData(htmldata, divName, distName=""):
global alldata, links
conn = sqlite3.connect("99_data_increment.db", timeout=10)
fdate = date.today()
#cur = conn.cursor()
conn.execute("""CREATE TABLE IF NOT EXISTS crawled
(id INTEGER PRIMARY KEY, State text, XID text, Project_Name text, City text, Main_City text, Registration_Number text, Promoter_Name text, Rera_URL text, PDF_text, Crawled_Date text, Status text, Names text, Transaction_Date text, Comments text, Call_Contact_Number text, Creation_Type text, Builder_Website text,
CONSTRAINT number_unique UNIQUE (Project_Name, Promoter_Name))
""")
cur = conn.cursor()
soup = BeautifulSoup(htmldata, "html.parser")
tables = soup.find_all("table")
for table in tables:
#if len(alldata)>max_entries:
# break
print(len(alldata))
attr = table.attrs
if "table" in attr['class']:
tbody = table.find_all("tbody")
if len(tbody)>0:
tbody = tbody[0]
tr_lst = tbody.find_all("tr")
for tr in tr_lst:
#if len(alldata)>max_entries:
# break
sublist = []
blank = ""
td_lst = tr.find_all("td")
if len(td_lst)>6:
prjname = td_lst[1].text
proname = td_lst[2].text
td = td_lst[4]
a_lst = td.find_all("a")
td5 = td_lst[5]
a_lst5 = td5.find_all("a")[0]
data =a_lst5.attrs["data-qstr"]
data_dec = base64.b64decode(data)
data_dec = data_dec.decode("utf-8")
data_dec = data_dec.split("&")
data1 = ''
for each in data_dec:
if each.find('AppID')!=-1 and each.find('xt')==-1:
certNo = each.split("=")[1]
#print( certNo )
sublist.append(prjname)
sublist.append(proname)
#sublist.append(last_modified)
sublist.append(divName)
#print(sublist)
if len(a_lst)>0:
a = a_lst[0]
href = a.attrs['href']
link = "https://maharerait.mahaonline.gov.in/"+href
links[certNo] = link
sublist.append(link)
td_cert = td_lst[6]
a_cert = td_cert.find("a")
a_atr = a_cert.attrs
status = True
if "data-qstr" in a_atr.keys():
val =a_atr["data-qstr"]
fname = certNo+".pdf"
status = downloadPdf( fname, val)
if status==True:
sublist.append( fname )
else:
sublist.append( " ")
if len(sublist)>0:
alldata.append(sublist)
if status==False:
print("Project :"+prjname+", couldnt be downloaded")
data = scrapepdf()# here we need to call function
cur.execute("INSERT OR IGNORE INTO crawled VALUES (NULL,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)",("Maharashtra",blank,prjname,divName,blank,data, proname , link, blank,fdate, blank, blank, blank, blank, blank, blank, blank ))
conn.commit()
return alldata
def downloadPdf(fname, _id):
    try:
        values = {'ID': _id}
        lnk = "https://maharerait.mahaonline.gov.in/SearchList/ShowCertificate"
        data = getData(lnk, values)
        decode_val = base64.b64decode(data)
        # the certificate PDF is saved into the working directory
        with open("./"+fname, "wb") as f:
            f.write(decode_val)
        return True
    except Exception as e:
        print("Couldn't download the file ", fname, e)
        return False
def writedata(alldata1, filename):
    print(" >>>> FINAL PRINTING DATA >>>> ")
    with open("./"+filename, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        writer.writerow("")
        for row in alldata1:
            try:
                writer.writerow(row)
            except Exception:
                print("Couldn't save the data for the project ", row)
def formattext(text):
while text.find("\r\n")>=0:
text = text.replace("\r\n","")
while text.find(" ")>=0:
text = text.replace(" ","")
return text
def main():
global alldata
getProjectsList()
print("Before write the projects data to the file. Count >> "+str(len(alldata)))
writedata(alldata, "data.csv")
savejsonformat()
def savejsonformat():
    global alldata
    jsondata = []
    for sublist in alldata:
        jsonlist = [sublist[4].replace(".pdf", ''), sublist[3]]
        jsondata.append(jsonlist)
    with open("./json_data.txt", "w") as f:
        f.write(json.dumps(jsondata))
def scrapepdf():
import pdfquery
import glob
import csv
mahadata = []
d = glob.glob(r"C:\Users\prince.bhatia\Desktop\incremenaal\*.pdf")
for i in d:
sublist = []
pdf = pdfquery.PDFQuery(i)
pdf.load()
label = pdf.pq('LTTextLineHorizontal:contains("registration number")')
left_corner = float(label.attr('x0'))
bottom_corner = float(label.attr('y0'))
name = pdf.pq('LTTextLineHorizontal:in_bbox("%s, %s, %s, %s")' % (left_corner, bottom_corner-30, left_corner+150, bottom_corner)).text()
return name
def getData(url, values):
    #print("url >> "+url)
    req = requests.post(url, data=values)#, timeout=10)
    text = req.text
    req.close()
    return text
if __name__ == "__main__":
    main()
I wrote the scrapepdf() function, and the PDFs do get downloaded, but my problem is that with this code the PDFs are never read. When I run the PDF scraper as a separate file it reads the PDFs fine, but it does not read them when it runs together with the other web scraper.
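My guess is that this happens because scrapepdf() always globs a fixed folder (C:\Users\prince.bhatia\Desktop\incremenaal) and is called with no argument, while downloadPdf() saves each certificate into the working directory. Something like the following is what I am trying to get working (a minimal sketch based on my existing scrapepdf(), just taking the downloaded filename as a parameter):

import pdfquery

def scrapepdf(fname):
    # parse the single PDF that downloadPdf() just saved, instead of globbing a fixed folder
    pdf = pdfquery.PDFQuery(fname)
    pdf.load()
    label = pdf.pq('LTTextLineHorizontal:contains("registration number")')
    left_corner = float(label.attr('x0'))
    bottom_corner = float(label.attr('y0'))
    # read the text sitting just below the "registration number" label
    name = pdf.pq('LTTextLineHorizontal:in_bbox("%s, %s, %s, %s")' % (
        left_corner, bottom_corner - 30, left_corner + 150, bottom_corner)).text()
    return name

# inside parseXMLData(), right after a successful downloadPdf(fname, val):
#     data = scrapepdf(fname)

Is that the right way to wire the two scrapers together, or is something else preventing the PDFs from being read in the combined script?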