Python:TypeError:必须是str,而不是字节

时间:2017-06-02 15:51:27

标签: python excel web-scraping file-writing

我正在尝试编写用于处理html网页的代码并从中创建excel表。有错误抛出。如果有人可以提供帮助,请告诉我。我运行的很多代码都有评论。我也想把它包括在内。

book = xlwt.Workbook(encoding="utf-8")
sheet1 = book.add_sheet("Product List")

i=0
for row in soup.findAll('li', { "class" : "product-link" }):
    link = row.a['href']
    #print(link)
    #print(link[23:])

    conn = http.client.HTTPSConnection("www.sanjamar.com")
    conn.request("GET", link[23:])
    req = conn.getresponse()
    #print(req.status, req.reason) 
    data2 = req.read()
    soup2=BeautifulSoup(data2)

    Name=soup2.find('title')
    Name_text = Name.text
    words = Name_text.split('|')
    Name = words[0]
    print(Name)

    if len(soup2.find_all("div", {"id":"productDescription"})) > 0:
        Desc = soup2.find('div', {"id":"productDescription"})
        ProdDesc = Desc.p
        #print(ProdDesc)

        if ProdDesc != None:
            Desc == ProdDesc.text

    if len(soup2.find_all("div", {"id":"productBenefits"})) > 0:
        Feat = soup2.find('div',{"id":"productBenefits"})
        ProdFeat = Feat.ul
        #print(ProdFeat)

        if ProdFeat != None:
            Feat == ProdFeat.text

    if len(soup2.find_all("table", {"class":"mceItemTable"})) > 0:
        Spec = soup2.find('table',{"class":"mceItemTable"})
        #print(Spec)
        if Spec != None:
            specrow = ''
            for row in Spec.findAll('tr'):
                specrow = specrow + ',' + row.text

        if specrow != None:
            Spec = specrow[1:]

        words = Spec.partition(",")[2]        


    record = (Name,Desc.text,Feat.text[20:],words)
    print(record)

    for col_index, item in enumerate(record):
        sheet1.write(i, col_index, item)
    i += 1

book.save("Sanjamar1.xls")

'''
    if len(soup2.find_all("table", {"class":"variations"})) > 0:
    options = soup2.find('table',{"class":"variations"})
    Prodoptions = options.select
    print('options')
    print(Prodoptions)

if len(soup2.find_all("div", {"id":"availableColorsWrapper"})) > 0:
    options = soup2.find('div',{"id":"availableColorsWrapper"})
    ProdColors = options.ul
    print('Colors')
    print(ProdColors)

if len(soup2.find_all("a", {"class":"fancybox-media"})) > 0:
    options = soup2.find('a',{"class":"fancybox-media"})
    ProdVideos = options['href']
    print('Videos')
    print(ProdVideos)

'''
'''
j = 0
if len(soup2.find_all("a", {"class":"lit-link"})) > 0:
    for row1 in soup2.findAll('a', {"class":"lit-link"}):
        file_download1 = row1['href'] 
        words = file_download1.split('/')
        print (words[-1])
        if words[-1][-3:]=="pdf":
            print (file_download1)
            if file_download1 != None:
                if file_download1 != '': 
                    try:
                        resource = urllib.request.urlopen(file_download1)
                        file_name1 = words[-1]
                        output = open(file_name1,"wb")
                        output.write(resource.read())
                        output.close()
                    except urllib.request.HTTPError:
                        print('Nthn')


j += 1
   '''

#i += 1

出现的错误是:

TypeError                                 Traceback (most recent call last)
<ipython-input-17-468fc8825863> in <module>()
     63     i += 1
     64 
---> 65 book.save('Sanjamar.xls')
     66 
     67 '''

C:\Users\Santosh\Anaconda3\lib\site-packages\xlwt\Workbook.py in save(self, 
filename_or_stream)
    708 
    709         doc = CompoundDoc.XlsDoc()
--> 710         doc.save(filename_or_stream, self.get_biff_data())
    711 
    712 

C:\Users\Santosh\Anaconda3\lib\site-packages\xlwt\Workbook.py in 
get_biff_data(self)
    672         all_links          = self.__all_links_rec()
    673 
--> 674         shared_str_table   = self.__sst_rec()
    675         after = country + all_links + shared_str_table
    676 

C:\Users\Santosh\Anaconda3\lib\site-packages\xlwt\Workbook.py in _ 
_sst_rec(self)
    634 
    635     def __sst_rec(self):
--> 636         return self.__sst.get_biff_record()
    637 
    638     def __ext_sst_rec(self, abs_stream_pos):

C:\Users\Santosh\Anaconda3\lib\site-packages\xlwt\BIFFRecords.py in 
get_biff_record(self)
     77                 self._add_to_sst(s)
     78             else:
---> 79                 self._add_rt_to_sst(s)
     80         del data
     81         self._new_piece()

C:\Users\Santosh\Anaconda3\lib\site-packages\xlwt\BIFFRecords.py in 
_add_rt_to_sst(self, rt)
    106 
    107     def _add_rt_to_sst(self, rt):
--> 108         rt_str, rt_fr = upack2rt(rt, self.encoding)
    109         is_unicode_str = rt_str[2] == b'\x09'[0]
    110         if is_unicode_str:

C:\Users\Santosh\Anaconda3\lib\site-packages\xlwt\UnicodeUtils.py in 
upack2rt(rt, encoding)
     84             # code in Rows.py ensures that
     85             # fontx can be None only for the first piece
---> 86             fr += pack('<HH', offset, fontx)
     87         # offset is the number of MS C wchar characters.
     88         # That is 1 if c <= u'\uFFFF' else 2

TypeError: must be str, not bytes

0 个答案:

没有答案