I have a scraper here. It runs for a few hours and then suddenly the inserts stop going into the table. The program keeps running, but the table stays unchanged... The only errors I see are primary key errors, because some rows are duplicates and I don't want to insert those.
from bs4 import BeautifulSoup
from datetime import datetime
import mechanize,cookielib,pyodbc,socket,sys
import httplib
url = 'www'
base= 'www'
proxies = {'http': 'proxy'}
username='u'
pw = 'p'
cnxnstring = 'DRIVER={SQL Server};SERVER=s;DATABASE=DB;UID=u;PWD=p'
insert="""INSERT INTO TxProductionPreStaging(LeaseName,LeaseNo,DistrictNo,WellNo,ProdMonth,ProdYear,ProdDate,OilBBL,CasingHeadMCF,GWGasMCF,CondensateBBL,LastScraped)
VALUES(?,?,?,?,?,?,?,?,?,?,?,?)"""
def initReq():
    br = mechanize.Browser()
    br.set_proxies(proxies)
    br.add_proxy_password(username, pw)
    br.set_handle_robots(False)
    br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
    cj = cookielib.LWPCookieJar()
    br.set_cookiejar(cj)
    br.set_handle_redirect(True)
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
    while True:
        try:
            soup = BeautifulSoup(br.open(url, timeout=20).read())
            if soup is not None:
                if soup.head.title.text == 'Texas RRC - Railroad Commission of Texas Site Policies':
                    print 'REDIRECT PAGE'
                else:
                    break
        except (mechanize.URLError, mechanize.HTTPError, httplib.IncompleteRead) as exc:
            if isinstance(exc.reason, socket.timeout):
                print exc
        except Exception as error:
            print error
    return br
def initForm(br, prodMonth):
    br.select_form('SearchCriteriaForm')
    br.form.set_all_readonly(False)
    br.form.find_control(name='viewType').value = ['Lease']
    br["startMonth"] = [prodMonth]
    br["startYear"] = [prodYear]
    br["endMonth"] = [prodMonth]
    br["endYear"] = [prodYear]
    br["district"] = ['Statewide']
    r = br.submit(nr=2)
    return r
def bs(r):
    soup = BeautifulSoup(r.read())
    return soup
def getTags(soup):
    bigL = []
    mini = []
    for node in soup.findAll(attrs={'class': 'DataGrid'}):
        for i in node.findAll('tr'):
            if i.find('td'):
                for j in i.findAll('td'):
                    s = str(j.text)
                    s = s.replace('\r\n', '')
                    s = s.replace(' ', '').strip('-').strip('\n')
                    mini.append(s)
                bigL.append(mini[:])
                del mini[:]
    return bigL
def insertTable(bigL, cnxn, cursor, prodMonth, prodDate):
    print 'INSERT TABLE'
    global c
    for i, item in enumerate(bigL):
        leaseName = bigL[i][0]
        leaseNo = bigL[i][1]
        districtNo = bigL[i][2]
        wellNo = bigL[i][3]
        oil = int(bigL[i][4].replace(',', ''))
        casingHead = int(bigL[i][5].replace(',', ''))
        gas = int(bigL[i][6].replace(',', ''))
        condensate = int(bigL[i][7].replace(',', ''))
        dt = datetime.now()
        try:
            cursor.execute(insert, leaseName, leaseNo, districtNo, wellNo, prodMonth, prodYear, prodDate, oil, casingHead, gas, condensate, dt)
            cnxn.commit()
        except pyodbc.Error as e:
            # any pyodbc error is printed, a fresh connection is opened,
            # and the row that failed is skipped
            print e
            cnxn = pyodbc.connect('DRIVER={SQL Server};SERVER=u;DATABASE=DB;UID=us;PWD=p')
            cursor = cnxn.cursor()
    return
def baseUrl(prodYear):
    months = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
    for item in months:
        prodMonth = str(item)
        prodDate = str(prodMonth) + '/01/' + str(prodYear)
        prodDate = datetime.strptime(prodDate, '%m/%d/%Y')
        br = initReq()
        r = initForm(br, prodMonth)
        soup = bs(r)
        L = getTags(soup)
        cnxn, cursor = getcnxn()
        insertTable(L, cnxn, cursor, prodMonth, prodDate)
        count = 20
        while True:
            cs = str(count)
            count = count + 20
            print count, cs
            while True:
                try:
                    soup = BeautifulSoup(br.open(base + cs, timeout=20).read())
                    if soup is not None:
                        if soup.head.title.text == 'Texas RRC - Railroad Commission of Texas Site Policies':
                            print 'REDIRECT PAGE'
                        else:
                            break
                except (mechanize.URLError, mechanize.HTTPError, httplib.IncompleteRead) as exc:
                    print exc
                except Exception as e:
                    print e
            var = soup.prettify(formatter=None)
            if 'No Matches Found' in var:
                break
            else:
                L = getTags(soup)
                insertTable(L, cnxn, cursor, prodMonth, prodDate)
    return
def getcnxn():
    while True:
        try:
            cnxn = pyodbc.connect(cnxnstring)
            cursor = cnxn.cursor()
            break
        except:
            print sys.exc_info()[1]
    return cnxn, cursor
if __name__ == '__main__':
    prodYear = str(sys.argv[1])
    baseUrl(prodYear)
    cnxn.close()
Answer 0 (score: 1)
One thing that helps with this is to try to get a cursor periodically; that tests the connection. I do this for every new page I scrape:
try:
    cursor = cnxn.cursor()
except Exception:
    cnxn = reinit()   # reinit() re-opens the connection, much like getcnxn() in the question
    cursor = cnxn.cursor()
Edit: pyodbc also wasn't raising the error in a way my handler caught... which is why I think it was failing silently. It turned out I just had to catch all errors to see where it was actually failing.
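Not from the original post, but a minimal sketch of that idea: it reuses the insert statement and getcnxn() from the question, assumes pyodbc's standard DB-API exception classes, and the insertRow helper and its parameters are made-up names for illustration.

# Sketch only: separate expected duplicate-key failures from everything else,
# log loudly, and rebuild the connection instead of dropping rows silently.
def insertRow(cnxn, cursor, params, retries=3):
    for attempt in range(retries):
        try:
            cursor.execute(insert, *params)
            cnxn.commit()
            break
        except pyodbc.IntegrityError:
            # duplicate primary key: expected for repeated rows, just skip it
            break
        except Exception as e:
            # anything else (including a dead connection that did not surface
            # as pyodbc.Error): log it and reconnect, then retry the same row
            print 'insert failed (attempt %d): %s' % (attempt + 1, e)
            cnxn, cursor = getcnxn()
    return cnxn, cursor

The insert loop in insertTable could then call cnxn, cursor = insertRow(cnxn, cursor, params) in place of the bare execute/commit pair, so a lost connection is rebuilt and retried rather than failing quietly.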