将代码从Python 2.7改为3.5

时间:2017-01-20 14:55:32

标签: python excel python-2.7 python-3.x

我有一些用于导出到Excel的Web scraper的代码。帮助我的人使用2.7但我只学习了几个月并使用3.5。有人能帮我修改这段代码吗?

#import urllib2 as urllib
#import urllib.request
import requests
from bs4 import BeautifulSoup
import datetime
import xlsxwriter
import sys

# Web scraping
def make_soup(url):
    #the_page = urllib.request.urlopen(url)
    res = requests.get(url)
    the_page = res.text
    soup_data = BeautifulSoup(the_page, "html.parser")
    return soup_data


soup = make_soup('http://www.url.co.uk/')


def getNames():
    ret_names_list = ['Names']
    for record in soup.findAll('tr'):
        for data in record.findAll('td'):
            for td_in_data in data.findAll('td', {"class": "propname"}):
                #import unicodedata
                td_in_data = td_in_data.text
                td_in_data = td_in_data.encode(sys.stdout.encoding, errors='replace')
                #unicodedata.normalize('NFKD', td_in_data).encode('ascii','ignore')

                print(td_in_data)
                ret_names_list.append(td_in_data)
    return ret_names_list


def getRooms():
    ret_rooms_list = ['Rooms']
    for record in soup.findAll('tr'):
        for data in record.findAll('td'):
            for td_in_data in data.findAll('span', {"class": "beds"}):
                print(td_in_data.text)
                td_in_data = td_in_data.text
                td_in_data = td_in_data.encode(sys.stdout.encoding, errors='replace')
                ret_rooms_list.append(td_in_data)
    return ret_rooms_list


def getRents():
    ret_rents_list = ['Rents']
    for record in soup.findAll('tr'):
        for data in record.findAll('td'):
            for td_in_data in data.findAll('td', {"class": "rentprice"}):
                print(td_in_data.text)
                td_in_data = td_in_data.text
                td_in_data = td_in_data.encode(sys.stdout.encoding, errors='replace')
                ret_rents_list.append(td_in_data)
    return ret_rents_list

''' To do: get the scraped data to an Excel doc.'''

# Create a workbook and add a worksheet.
if __name__ == '__main__':
    todays_date = str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M") )+ '.xlsx'
    todays_date = todays_date.replace(" ", "_").replace(":", "_")

    workbook = xlsxwriter.Workbook(todays_date)
    worksheet = workbook.add_worksheet()

    # Data to Excel.
    excel_dump = zip(getNames(), getRents(), getRooms())

    #Excel_dump = (
    #    ['Name', getNames()],
    #    ['Rent',   getRents()],
    #    ['Rooms',  getRooms()]
    #)

    # Start from the first cell. Rows and columns are zero indexed.
    row = 0
    col = 0


    # Iterate over the data and write it out row by row.
    for name, rent, room in excel_dump:
        try:
            reload(sys)
            sys.setdefaultencoding('Cp1252')
            worksheet.write(col, row, str(name))
            worksheet.write(col+1, row, rent)
            worksheet.write(col+2, row, room)
        except Exception as e:
            raise e
        #col += 1
        row += 1

    workbook.close()

我得到的错误信息是:

  

^内存中的Scraped数据^   Traceback(最近一次调用最后一次):文件   “C:/Users/joseph.devlin/PycharmProjects/Web_Scraping/scraper.py”   第91行,在       提出e文件“C:/Users/joseph.devlin/PycharmProjects/Web_Scraping/scraper.py”,   第85行,在       reload(sys)NameError:未定义名称'reload'在:>中忽略了异常追溯(大多数   最近的呼叫最后):文件   “C:\用户\ joseph.devlin \应用程序数据\漫游\ Python的\ Python35 \站点包\ xlsxwriter \ workbook.py”   第149行,在 del 异常:工作簿中捕获异常   析构函数。工作簿可能需要显式close()。

     

使用退出代码1完成处理

我目前正在研究如何解决这个问题,但任何帮助都会受到赞赏!

0 个答案:

没有答案