TyperError:期待字符串或字节对象

时间:2017-02-10 18:15:39

标签: python html sql oracle

我的代码从HTML表中解析数据,然后将其导出到我的Oracle数据库。出于某种原因,在桌面上运行代码时,我有时会收到错误:

Traceback (most recent call last):
  File "Z:\Code\successfullest_html_code.py", line 122, in <module>
    cursor.executemany(sql_query, exported_data)
TypeError: expecting string or bytes object

在大多数桌面上,我的代码运行完美,而对于产生此错误的代码我只是手动输入..但现在这些错误更频繁地发生。我只想知道为什么这种情况只发生在一些表上,而不是在其他表看起来完全相同时。

当您尝试通过命令运行字符串(或字节对象)以外的其他操作时,我已经读取了此错误。但是这些表几乎完全相同,所以它让我困惑为什么有时会产生这个错误。

这是我的代码;虽然您可以忽略其中的大部分内容,但错误发生在cursor.executemany(sql_query, exported_data)行上:

from bs4 import BeautifulSoup, NavigableString, Tag
import pandas as pd
import numpy as np
import os
import re
import email
import cx_Oracle

dsnStr = cx_Oracle.makedsn("sole.nefsc.noaa.gov", "1526", "sole")
con = cx_Oracle.connect(user="username", password="password$", dsn=dsnStr)

def celltext(cell):
    '''    
        textlist=[]
        for br in cell.findAll('br'):
            next = br.nextSibling
            if not (next and isinstance(next,NavigableString)):
                continue
            next2 = next.nextSibling
            if next2 and isinstance(next2,Tag) and next2.name == 'br':
                text = str(next).strip()
                if text:
                    textlist.append(next)
        return (textlist)
    '''
    textlist=[]
    y = cell.find('span')
    for a in y.childGenerator(): 
        if isinstance(a, NavigableString):
            textlist.append(str(a))
    return (textlist)

path = 'Z:\\bins_html_yes'

for filename in os.listdir(path):
    file_path = os.path.join(path, filename)
    if os.path.isfile(file_path):
        with open(file_path,'r') as w:
            html=w.read()
        #html=open(file_path,'r').read()
            soup = BeautifulSoup(html, 'lxml') # Parse the HTML as a string
            table = soup.find_all('table')[1] # Grab the second table

df_Quota = pd.DataFrame()

for filename in os.listdir(path):
    file_path = os.path.join(path, filename)
    if os.path.isfile(file_path):
        with open(file_path, 'r') as f:
            pattern = re.compile(r'Sent:.*?\b(\d{4})\b')
            email = f.read()
            dates = pattern.findall(email)
            if dates:
                export_year = (''.join(dates))
                print("export_year:", export_year)

for row in table.find_all('tr'):    
    columns = row.find_all('td')
    try:
        if columns[0].get_text().strip()!='ID':# skip header
            #print("First Column:", columns[0].get_text().strip())
            Quota = celltext(columns[1]) 
            Weight =  celltext(columns[2])
            price =  celltext(columns[3])

            Nrows= max([len(Quota),len(Weight),len(price)]) #get the max number of rows

            IDList = [columns[0].get_text()] * Nrows
            DateList = [columns[4].get_text()] * Nrows

            if price[0].strip()=='Package':
                 price = [columns[3].get_text()] * Nrows

            if len(Quota)<len(Weight):#if Quota has less itmes extend with NaN
               lstnans= [np.nan]*(len(Weight)-len(Quota))
               Quota.extend(lstnans)

            if len(price) < len(Quota): #if price column has less items than quota column,
                val = [columns[3].get_text()] * (len(Quota)-len(price)) #extend with 
                price.extend(val)                                       #whatever is in
                                                                        #price column

            #if len(DateList) > len(Quota): #if DateList is longer than Quota, 
                #print("it's longer than")
                #value = [columns[4].get_text()] * (len(DateList)-len(Quota))
                #DateList = value * Nrows

            if len(Quota) < len(DateList): #if Quota is less than DateList (due to gap),
                stu = [columns[1].get_text()] * (len(DateList)-len(Quota)) #extend with what exists
                #stu = [np.nan]*(len(DateList)-len(Quota)) #extend with NaN
                Quota.extend(stu)

            if len(Weight) < len(DateList):
                dru = [columns[2].get_text()] * (len(DateList)-len(Weight)) #extend with what exists
                #dru = [np.nan]*(len(DateList)-len(Weight)) #extend with Nan
                Weight.extend(dru)

            FinalDataframe = pd.DataFrame(
            {
            'ID':IDList,    
            'AvailableQuota': Quota,
            'LiveWeightPounds': Weight,
            'price':price,
            'DatePosted':DateList
            })
            #print("ID:", IDList)
            #print("Price:", price)

            df_Quota = df_Quota.append(FinalDataframe, ignore_index=True)
            #df_Q = df_Quota['DatePosted'].iloc[0] #capture only most recent
            #df_Quota = df_Quota[df_Quota['DatePosted'] == df_Q] #date's data 
    except IndexError:
        continue

df_Quota['year'] = export_year

print ("Dataframe is:", df_Quota)

cursor = con.cursor()
exported_data = [tuple(x) for x in df_Quota.values]
sql_query = ("INSERT INTO FISHTABLE(species, date_posted, stock_id, pounds, advertised_price, year_posted, sector_name, ask)" "VALUES(:1, :2, :3, :4, :5, :6, 'NEFS 2', '1')")
cursor.executemany(sql_query, exported_data)
con.commit() #commit to database

cursor.close()
con.close()

这是一张成功导出的表格:

enter image description here

这是一张失败的桌子:

enter image description here

以下是DataFrame的打印输出(\n实际上根本不会导致导出:

Dataframe is:       AvailableQuota DatePosted        ID LiveWeightPounds    price  year
0         White Hake   \n4/15\n   \n002\n           50,000    $0.10  2015
1            GOM COD   \n3/23\n  \n1493\n            3,600    $0.60  2015
2         \nGreysole   \n3/23\n  \n1493\n            \n350  \n$1.25  2015
3            GBE COD   \n3/20\n  \n1878\n            1,113    $0.60  2015
4               Dabs   \n3/18\n  \n1043\n            3,000    $0.50  2015
5         \nGreysole   \n3/18\n  \n1043\n            \n700   \n$.85  2015
6           GOM HADD   \n3/13\n   \n011\n              790    $0.50  2015
7               Dabs   \n3/13\n   \n370\n            2,100     $.60  2015
8         \nGreySole   \n3/13\n   \n370\n          \n4,700   \n$.85  2015
9            GOM COD   \n3/13\n  \n1734\n            1,900    $0.90  2015
10        \nGOM HADD   \n3/13\n  \n1734\n          \n1,000  \n$1.00  2015
11        \nGreysole   \n3/13\n  \n1734\n          \n3,000  \n$1.50  2015
12            \nDabs   \n3/13\n  \n1734\n          \n2,700  \n$1.00  2015
13           GBW Cod   \n3/13\n   \n816\n           12,000    $0.40  2015
14            \nDabs   \n3/13\n   \n816\n          \n2,000  \n$0.60  2015
15        \nGreysole   \n3/13\n   \n816\n          \n2,000  \n$0.90  2015
16           GOM COD   \n3/13\n   \n373\n              300    $0.90  2015
17  \nGOM YellowTail   \n3/13\n   \n373\n          \n3,300  \n$0.20  2015
18        \nGOM Hadd   \n3/13\n   \n373\n          \n1,000  \n$0.50  2015
19          GOM Hadd   \n3/11\n   \n001\n             2500    $0.40  2015
20          GOM HADD    \n3/9\n   \n187\n            1,100    $0.50  2015
21       \nGreysole     \n3/9\n   \n187\n            \n900  \n$0.85  2015
22            \nDabs    \n3/9\n   \n187\n            \n450  \n$0.50  2015
23           GOM COD    \n3/5\n   \n255\n              500    $0.40  2015
24        \nGOM Hadd    \n3/5\n   \n255\n          \n1,000  \n$0.40  2015
25  \nGOM Yellowtail    \n3/5\n   \n255\n          \n3,000  \n$0.20  2015
26          Gom Hadd   \n2/12\n   \n485\n            5,800    $0.40  2015
27  \nGom Yellowtail   \n2/12\n   \n485\n           \n1100  \n$0.20  2015
28          GOM HADD   \n1/26\n   \n314\n              439    $1.50  2015
29  \nGOM Yellowtail   \n1/26\n   \n314\n          \n2,274  \n$0.25  2015
30          GOM HADD   \n1/26\n  \n1610\n            2,950    $0.70  2015
31               NaN   \n1/26\n  \n1610\n            \n500       \n  2015
32               NaN   \n1/26\n  \n1610\n          \n2,550  \n$0.25  2015
33    GOM Yellowtail   \n1/23\n   \n347\n            4,780    $0.25  2015
34    GOM Yellowtail   \n1/23\n   \n802\n            2,141    $0.25  2015
35              POLL   \n12/8\n  \n310B\n            65234    $0.01  2015
36             \nRED   \n12/8\n  \n310B\n          \n76610  \n$0.01  2015
37          \nSNE BB   \n12/8\n  \n310B\n           \n2121  \n$0.30  2015
38          \nGOM BB   \n12/8\n  \n310B\n           \n7285  \n$0.05  2015
39            GOM BB   \n5/29\n   \n588\n             9989    $0.10  2015
40          \nGOM YT   \n5/29\n   \n588\n           \n6172  \n$0.25  2015
41            \nPOLL   \n5/29\n   \n588\n          \n10314  \n$0.01  2015
42         \nREDFISH   \n5/29\n   \n588\n           \n2705  \n$0.01  2015

这是(exported_data)的打印输出:

[('White Hake', '\n4/15\n', '\n002\n', '50,000', '$0.10', '2015'), ('GOM COD', '\n3/23\n', '\n1493\n', '3,600', '$0.60', '2015'), ('\nGreysole', '\n3/23\n', '\n1493\n', '\n350', '\n$1.25', '2015'), ('GBE COD', '\n3/20\n', '\n1878\n', '1,113', '$0.60', '2015'), ('Dabs', '\n3/18\n', '\n1043\n', '3,000', '$0.50', '2015'), ('\nGreysole', '\n3/18\n', '\n1043\n', '\n700', '\n$.85', '2015'), ('GOM HADD', '\n3/13\n', '\n011\n', '790', '$0.50', '2015'), ('Dabs', '\n3/13\n', '\n370\n', '2,100', '$.60', '2015'), ('\nGreySole', '\n3/13\n', '\n370\n', '\n4,700', '\n$.85', '2015'), ('GOM COD', '\n3/13\n', '\n1734\n', '1,900', '$0.90', '2015'), ('\nGOM HADD', '\n3/13\n', '\n1734\n', '\n1,000', '\n$1.00', '2015'), ('\nGreysole', '\n3/13\n', '\n1734\n', '\n3,000', '\n$1.50', '2015'), ('\nDabs', '\n3/13\n', '\n1734\n', '\n2,700', '\n$1.00', '2015'), ('GBW Cod', '\n3/13\n', '\n816\n', '12,000', '$0.40', '2015'), ('\nDabs', '\n3/13\n', '\n816\n', '\n2,000', '\n$0.60', '2015'), ('\nGreysole', '\n3/13\n', '\n816\n', '\n2,000', '\n$0.90', '2015'), ('GOM COD', '\n3/13\n', '\n373\n', '300', '$0.90', '2015'), ('\nGOM YellowTail', '\n3/13\n', '\n373\n', '\n3,300', '\n$0.20', '2015'), ('\nGOM Hadd', '\n3/13\n', '\n373\n', '\n1,000', '\n$0.50', '2015'), ('GOM Hadd', '\n3/11\n', '\n001\n', '2500', '$0.40', '2015'), ('GOM HADD', '\n3/9\n', '\n187\n', '1,100', '$0.50', '2015'), ('\nGreysole ', '\n3/9\n', '\n187\n', '\n900', '\n$0.85', '2015'), ('\nDabs', '\n3/9\n', '\n187\n', '\n450', '\n$0.50', '2015'), ('GOM COD', '\n3/5\n', '\n255\n', '500', '$0.40', '2015'), ('\nGOM Hadd', '\n3/5\n', '\n255\n', '\n1,000', '\n$0.40', '2015'), ('\nGOM Yellowtail', '\n3/5\n', '\n255\n', '\n3,000', '\n$0.20', '2015'), ('Gom Hadd', '\n2/12\n', '\n485\n', '5,800', '$0.40', '2015'), ('\nGom Yellowtail', '\n2/12\n', '\n485\n', '\n1100', '\n$0.20', '2015'), ('GOM HADD', '\n1/26\n', '\n314\n', '439', '$1.50', '2015'), ('\nGOM Yellowtail', '\n1/26\n', '\n314\n', '\n2,274', '\n$0.25', '2015'), ('GOM HADD', '\n1/26\n', '\n1610\n', '2,950', '$0.70', '2015'), (nan, '\n1/26\n', '\n1610\n', '\n500', '\n', '2015'), (nan, '\n1/26\n', '\n1610\n', '\n2,550', '\n$0.25', '2015'), ('GOM Yellowtail', '\n1/23\n', '\n347\n', '4,780', '$0.25', '2015'), ('GOM Yellowtail', '\n1/23\n', '\n802\n', '2,141', '$0.25', '2015'), ('POLL', '\n12/8\n', '\n310B\n', '65234', '$0.01', '2015'), ('\nRED', '\n12/8\n', '\n310B\n', '\n76610', '\n$0.01', '2015'), ('\nSNE BB', '\n12/8\n', '\n310B\n', '\n2121', '\n$0.30', '2015'), ('\nGOM BB', '\n12/8\n', '\n310B\n', '\n7285', '\n$0.05', '2015'), ('GOM BB', '\n5/29\n', '\n588\n', '9989', '$0.10', '2015'), ('\nGOM YT', '\n5/29\n', '\n588\n', '\n6172', '\n$0.25', '2015'), ('\nPOLL', '\n5/29\n', '\n588\n', '\n10314', '\n$0.01', '2015'), ('\nREDFISH', '\n5/29\n', '\n588\n', '\n2705', '\n$0.01', '2015')]

除此之外,它真的让我感到困惑的是为什么错误发生在所有地方的那一行...... cursor.executemany()只是从上面的行执行SQL查询,对吧?它适用于某些表,但对其他表失败,我真的不知道为什么。感谢您解释和解决此问题的任何帮助。

1 个答案:

答案 0 :(得分:1)

if len(Quota)<len(Weight):  #if Quota has less itmes extend with NaN
    lstnans= [np.nan]*(len(Weight)-len(Quota))
    Quota.extend(lstnans)

您有意将nan添加到列表中以掩盖一些解析错误。根本原因在于构建Quota

回答你的问题:

  

为什么NaN会导致导出失败?我的Oracle设置为在单元格中允许Null,所以它不应该接受NaN结果吗?

>>> float('NaN') == None
False

nan不是None / Null