我正在尝试编写用于处理html网页的代码并从中创建excel表。有错误抛出。如果有人可以提供帮助,请告诉我。我运行的很多代码都有评论。我也想把它包括在内。
book = xlwt.Workbook(encoding="utf-8")
sheet1 = book.add_sheet("Product List")
i=0
for row in soup.findAll('li', { "class" : "product-link" }):
link = row.a['href']
#print(link)
#print(link[23:])
conn = http.client.HTTPSConnection("www.sanjamar.com")
conn.request("GET", link[23:])
req = conn.getresponse()
#print(req.status, req.reason)
data2 = req.read()
soup2=BeautifulSoup(data2)
Name=soup2.find('title')
Name_text = Name.text
words = Name_text.split('|')
Name = words[0]
print(Name)
if len(soup2.find_all("div", {"id":"productDescription"})) > 0:
Desc = soup2.find('div', {"id":"productDescription"})
ProdDesc = Desc.p
#print(ProdDesc)
if ProdDesc != None:
Desc == ProdDesc.text
if len(soup2.find_all("div", {"id":"productBenefits"})) > 0:
Feat = soup2.find('div',{"id":"productBenefits"})
ProdFeat = Feat.ul
#print(ProdFeat)
if ProdFeat != None:
Feat == ProdFeat.text
if len(soup2.find_all("table", {"class":"mceItemTable"})) > 0:
Spec = soup2.find('table',{"class":"mceItemTable"})
#print(Spec)
if Spec != None:
specrow = ''
for row in Spec.findAll('tr'):
specrow = specrow + ',' + row.text
if specrow != None:
Spec = specrow[1:]
words = Spec.partition(",")[2]
record = (Name,Desc.text,Feat.text[20:],words)
print(record)
for col_index, item in enumerate(record):
sheet1.write(i, col_index, item)
i += 1
book.save("Sanjamar1.xls")
'''
if len(soup2.find_all("table", {"class":"variations"})) > 0:
options = soup2.find('table',{"class":"variations"})
Prodoptions = options.select
print('options')
print(Prodoptions)
if len(soup2.find_all("div", {"id":"availableColorsWrapper"})) > 0:
options = soup2.find('div',{"id":"availableColorsWrapper"})
ProdColors = options.ul
print('Colors')
print(ProdColors)
if len(soup2.find_all("a", {"class":"fancybox-media"})) > 0:
options = soup2.find('a',{"class":"fancybox-media"})
ProdVideos = options['href']
print('Videos')
print(ProdVideos)
'''
'''
j = 0
if len(soup2.find_all("a", {"class":"lit-link"})) > 0:
for row1 in soup2.findAll('a', {"class":"lit-link"}):
file_download1 = row1['href']
words = file_download1.split('/')
print (words[-1])
if words[-1][-3:]=="pdf":
print (file_download1)
if file_download1 != None:
if file_download1 != '':
try:
resource = urllib.request.urlopen(file_download1)
file_name1 = words[-1]
output = open(file_name1,"wb")
output.write(resource.read())
output.close()
except urllib.request.HTTPError:
print('Nthn')
j += 1
'''
#i += 1
出现的错误是:
TypeError Traceback (most recent call last)
<ipython-input-17-468fc8825863> in <module>()
63 i += 1
64
---> 65 book.save('Sanjamar.xls')
66
67 '''
C:\Users\Santosh\Anaconda3\lib\site-packages\xlwt\Workbook.py in save(self,
filename_or_stream)
708
709 doc = CompoundDoc.XlsDoc()
--> 710 doc.save(filename_or_stream, self.get_biff_data())
711
712
C:\Users\Santosh\Anaconda3\lib\site-packages\xlwt\Workbook.py in
get_biff_data(self)
672 all_links = self.__all_links_rec()
673
--> 674 shared_str_table = self.__sst_rec()
675 after = country + all_links + shared_str_table
676
C:\Users\Santosh\Anaconda3\lib\site-packages\xlwt\Workbook.py in _
_sst_rec(self)
634
635 def __sst_rec(self):
--> 636 return self.__sst.get_biff_record()
637
638 def __ext_sst_rec(self, abs_stream_pos):
C:\Users\Santosh\Anaconda3\lib\site-packages\xlwt\BIFFRecords.py in
get_biff_record(self)
77 self._add_to_sst(s)
78 else:
---> 79 self._add_rt_to_sst(s)
80 del data
81 self._new_piece()
C:\Users\Santosh\Anaconda3\lib\site-packages\xlwt\BIFFRecords.py in
_add_rt_to_sst(self, rt)
106
107 def _add_rt_to_sst(self, rt):
--> 108 rt_str, rt_fr = upack2rt(rt, self.encoding)
109 is_unicode_str = rt_str[2] == b'\x09'[0]
110 if is_unicode_str:
C:\Users\Santosh\Anaconda3\lib\site-packages\xlwt\UnicodeUtils.py in
upack2rt(rt, encoding)
84 # code in Rows.py ensures that
85 # fontx can be None only for the first piece
---> 86 fr += pack('<HH', offset, fontx)
87 # offset is the number of MS C wchar characters.
88 # That is 1 if c <= u'\uFFFF' else 2
TypeError: must be str, not bytes