I have a scraper here. It runs for a few hours and then suddenly the inserts stop going into the table. The program keeps running, but the table stays unchanged... The only errors I see are primary key errors, because some rows are duplicates and I don't want to insert those.
from bs4 import BeautifulSoup
from datetime import datetime
import mechanize,cookielib,pyodbc,socket,sys
import httplib
url = 'www'
base= 'www'
proxies = {'http': 'proxy'}
username='u'
pw = 'p'
cnxnstring = 'DRIVER={SQL Server};SERVER=s;DATABASE=DB;UID=u;PWD=p'
insert="""INSERT INTO TxProductionPreStaging(LeaseName,LeaseNo,DistrictNo,WellNo,ProdMonth,ProdYear,ProdDate,OilBBL,CasingHeadMCF,GWGasMCF,CondensateBBL,LastScraped)
VALUES(?,?,?,?,?,?,?,?,?,?,?,?)"""
def initReq():
    br = mechanize.Browser()
    br.set_proxies(proxies)
    br.add_proxy_password(username, pw)
    br.set_handle_robots(False)
    br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
    cj = cookielib.LWPCookieJar()
    br.set_cookiejar(cj)
    br.set_handle_redirect(True)
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
    while True:
        try:
            soup = BeautifulSoup(br.open(url, timeout=20).read())
            if soup is not None:
                if soup.head.title.text == 'Texas RRC - Railroad Commission of Texas Site Policies':
                    print 'REDIRECT PAGE'
                else:
                    break
        except (mechanize.URLError, mechanize.HTTPError, httplib.IncompleteRead) as exc:
            if isinstance(exc.reason, socket.timeout):
                print exc
        except Exception as error:
            print error
    return br
def initForm(br, prodMonth):
    br.select_form('SearchCriteriaForm')
    br.form.set_all_readonly(False)
    br.form.find_control(name='viewType').value = ['Lease']
    br["startMonth"] = [prodMonth]
    br["startYear"] = [prodYear]
    br["endMonth"] = [prodMonth]
    br["endYear"] = [prodYear]
    br["district"] = ['Statewide']
    r = br.submit(nr=2)
    return r
def bs(r):
    soup = BeautifulSoup(r.read())
    return soup
def getTags(soup):
    bigL = []
    mini = []
    for node in soup.findAll(attrs={'class': 'DataGrid'}):
        for i in node.findAll('tr'):
            if i.find('td'):
                for j in i.findAll('td'):
                    s = str(j.text)
                    s = s.replace('\r\n', '')
                    s = s.replace(' ', '').strip('-').strip('\n')
                    mini.append(s)
                bigL.append(mini[:])
                del mini[:]
    return bigL
def insertTable(bigL, cnxn, cursor, prodMonth, prodDate):
    print 'INSERT TABLE'
    global c
    for i, item in enumerate(bigL):
        leaseName = bigL[i][0]
        leaseNo = bigL[i][1]
        districtNo = bigL[i][2]
        wellNo = bigL[i][3]
        oil = int(bigL[i][4].replace(',', ''))
        casingHead = int(bigL[i][5].replace(',', ''))
        gas = int(bigL[i][6].replace(',', ''))
        condensate = int(bigL[i][7].replace(',', ''))
        dt = datetime.now()
        try:
            cursor.execute(insert, leaseName, leaseNo, districtNo, wellNo, prodMonth, prodYear, prodDate, oil, casingHead, gas, condensate, dt)
            cnxn.commit()
        except pyodbc.Error as e:
            # any pyodbc error is printed, a fresh connection is opened,
            # and the row that failed is skipped
            print e
            cnxn = pyodbc.connect('DRIVER={SQL Server};SERVER=u;DATABASE=DB;UID=us;PWD=p')
            cursor = cnxn.cursor()
    return
def baseUrl(prodYear):
    months = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
    for item in months:
        prodMonth = str(item)
        prodDate = str(prodMonth) + '/01/' + str(prodYear)
        prodDate = datetime.strptime(prodDate, '%m/%d/%Y')
        br = initReq()
        r = initForm(br, prodMonth)
        soup = bs(r)
        L = getTags(soup)
        cnxn, cursor = getcnxn()
        insertTable(L, cnxn, cursor, prodMonth, prodDate)
        count = 20
        while True:
            cs = str(count)
            count = count + 20
            print count, cs
            while True:
                try:
                    soup = BeautifulSoup(br.open(base + cs, timeout=20).read())
                    if soup is not None:
                        if soup.head.title.text == 'Texas RRC - Railroad Commission of Texas Site Policies':
                            print 'REDIRECT PAGE'
                        else:
                            break
                except (mechanize.URLError, mechanize.HTTPError, httplib.IncompleteRead) as exc:
                    print exc
                except Exception as e:
                    print e
            var = soup.prettify(formatter=None)
            if 'No Matches Found' in var:
                break
            else:
                L = getTags(soup)
                insertTable(L, cnxn, cursor, prodMonth, prodDate)
    return
def getcnxn():
    while True:
        try:
            cnxn = pyodbc.connect(cnxnstring)
            cursor = cnxn.cursor()
            break
        except:
            print sys.exc_info()[1]
    return cnxn, cursor
if __name__ == '__main__':
    prodYear = str(sys.argv[1])
    baseUrl(prodYear)
    cnxn.close()
Answer 0 (score: 1)
One thing that helps with this is to try to get a cursor periodically; that tests the connection. I do this for every new page I scrape:
try:
    cursor = cnxn.cursor()
except Exception:
    cnxn = reinit()   # reinit() re-opens the connection, much like getcnxn() in the question
    cursor = cnxn.cursor()
Edit: pyodbc also wasn't raising the error in a way my handler caught... which is why I think it was failing silently. It turned out I just had to catch all errors to see where it was actually failing.
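Not from the original post, but a minimal sketch of that idea: it reuses the insert statement and getcnxn() from the question, assumes pyodbc's standard DB-API exception classes, and the insertRow helper and its parameters are made-up names for illustration.

# Sketch only: separate expected duplicate-key failures from everything else,
# log loudly, and rebuild the connection instead of dropping rows silently.
def insertRow(cnxn, cursor, params, retries=3):
    for attempt in range(retries):
        try:
            cursor.execute(insert, *params)
            cnxn.commit()
            break
        except pyodbc.IntegrityError:
            # duplicate primary key: expected for repeated rows, just skip it
            break
        except Exception as e:
            # anything else (including a dead connection that did not surface
            # as pyodbc.Error): log it and reconnect, then retry the same row
            print 'insert failed (attempt %d): %s' % (attempt + 1, e)
            cnxn, cursor = getcnxn()
    return cnxn, cursor

The insert loop in insertTable could then call cnxn, cursor = insertRow(cnxn, cursor, params) in place of the bare execute/commit pair, so a lost connection is rebuilt and retried rather than failing quietly.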