我有一些用于导出到Excel的Web scraper的代码。帮助我的人使用2.7但我只学习了几个月并使用3.5。有人能帮我修改这段代码吗?
#import urllib2 as urllib
#import urllib.request
import requests
from bs4 import BeautifulSoup
import datetime
import xlsxwriter
import sys
# Web scraping
def make_soup(url):
#the_page = urllib.request.urlopen(url)
res = requests.get(url)
the_page = res.text
soup_data = BeautifulSoup(the_page, "html.parser")
return soup_data
soup = make_soup('http://www.url.co.uk/')
def getNames():
ret_names_list = ['Names']
for record in soup.findAll('tr'):
for data in record.findAll('td'):
for td_in_data in data.findAll('td', {"class": "propname"}):
#import unicodedata
td_in_data = td_in_data.text
td_in_data = td_in_data.encode(sys.stdout.encoding, errors='replace')
#unicodedata.normalize('NFKD', td_in_data).encode('ascii','ignore')
print(td_in_data)
ret_names_list.append(td_in_data)
return ret_names_list
def getRooms():
ret_rooms_list = ['Rooms']
for record in soup.findAll('tr'):
for data in record.findAll('td'):
for td_in_data in data.findAll('span', {"class": "beds"}):
print(td_in_data.text)
td_in_data = td_in_data.text
td_in_data = td_in_data.encode(sys.stdout.encoding, errors='replace')
ret_rooms_list.append(td_in_data)
return ret_rooms_list
def getRents():
ret_rents_list = ['Rents']
for record in soup.findAll('tr'):
for data in record.findAll('td'):
for td_in_data in data.findAll('td', {"class": "rentprice"}):
print(td_in_data.text)
td_in_data = td_in_data.text
td_in_data = td_in_data.encode(sys.stdout.encoding, errors='replace')
ret_rents_list.append(td_in_data)
return ret_rents_list
''' To do: get the scraped data to an Excel doc.'''
# Create a workbook and add a worksheet.
if __name__ == '__main__':
todays_date = str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M") )+ '.xlsx'
todays_date = todays_date.replace(" ", "_").replace(":", "_")
workbook = xlsxwriter.Workbook(todays_date)
worksheet = workbook.add_worksheet()
# Data to Excel.
excel_dump = zip(getNames(), getRents(), getRooms())
#Excel_dump = (
# ['Name', getNames()],
# ['Rent', getRents()],
# ['Rooms', getRooms()]
#)
# Start from the first cell. Rows and columns are zero indexed.
row = 0
col = 0
# Iterate over the data and write it out row by row.
for name, rent, room in excel_dump:
try:
reload(sys)
sys.setdefaultencoding('Cp1252')
worksheet.write(col, row, str(name))
worksheet.write(col+1, row, rent)
worksheet.write(col+2, row, room)
except Exception as e:
raise e
#col += 1
row += 1
workbook.close()
我得到的错误信息是:
^内存中的Scraped数据^ Traceback(最近一次调用最后一次):文件 “C:/Users/joseph.devlin/PycharmProjects/Web_Scraping/scraper.py” 第91行,在 提出e文件“C:/Users/joseph.devlin/PycharmProjects/Web_Scraping/scraper.py”, 第85行,在 reload(sys)NameError:未定义名称'reload'在:>中忽略了异常追溯(大多数 最近的呼叫最后):文件 “C:\用户\ joseph.devlin \应用程序数据\漫游\ Python的\ Python35 \站点包\ xlsxwriter \ workbook.py” 第149行,在 del 异常:工作簿中捕获异常 析构函数。工作簿可能需要显式close()。
使用退出代码1完成处理
我目前正在研究如何解决这个问题,但任何帮助都会受到赞赏!