Unable to concatenate 2 lists (int and text lists) in Python and split the output data into separate files

Date: 2018-10-31 14:55:16

Tags: python web-scraping beautifulsoup concatenation

I am facing an issue concatenating the data of 2 lists. The lists hold both a page number (an integer value) and table data (text values), and while merging them to print the output I get this error:

TypeError: object of type 'int' has no len()
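This can be reproduced in isolation: write_string() calls len() on its argument, which fails for an int (a minimal sketch with a throwaway file name):

import xlsxwriter

workbook = xlsxwriter.Workbook('demo.xlsx')  # throwaway example file
ws = workbook.add_worksheet()
ws.write_string(0, 0, 42)  # raises TypeError: object of type 'int' has no len()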

My aim is to print the output below. I also need help splitting the data from Table 3 into 2 separate Excel file tabs: rows under the "Aircraft Operator ID" header should be saved to a worksheet named "Table 3 A", and rows under the header value "Installation ID" to a worksheet named "Table 3 I". My code is below:

import time
import requests
import random
from lxml import html  # kept for XPath use; not actually used below
from bs4 import BeautifulSoup
import xlsxwriter

def append_row(ws, row):
    for col, value in enumerate(row):
        ws.write_string(ws.cur_row, col, value)
    ws.cur_row += 1

workbook = xlsxwriter.Workbook('Output.xlsx')
ws_3_A = workbook.add_worksheet("Table 3 A")
ws_3_I = workbook.add_worksheet("Table 3 I")

# Keep a track of the row to use in each worksheet
ws_3_A.cur_row = 0  
ws_3_I.cur_row = 0   

# Code starts from here:
start = 1 
end = 3 
link = "http://ec.europa.eu/environment/ets/ohaDetails.do?returnURL=&languageCode=en&accountID=&registryCode=&buttonAction=all&action=&account.registryCode=&accountType=&identifierInReg=&accountHolder=&primaryAuthRep=&installationIdentifier=&installationName=&accountStatus=&permitIdentifier=&complianceStatus=&mainActivityType=-1&searchType=oha&resultList.currentPageNumber={}&nextList=Next%C2%A0%3E&selectedPeriods="

for page_number in range(start, end):
    print("Page {}".format(page_number))
    url = link.format(page_number)
    r = requests.get(url)
    print(url)

    serial_no = [int(x) for x in str(page_number)]  # a list of ints; these later reach write_string()
    print(serial_no)

    time.sleep(random.randint(2, 5))  # random polite delay between requests
    soup = BeautifulSoup(r.content, "lxml")

    # Table 3 Aircraft Operator ID data:
    for items in soup.find(id="tblChildDetails").find("table").find_all("tr")[1:]:
        dataset = [item.get_text(strip=True) for item in items.find_all("td")[:]]
        print(dataset)
        append_row(ws_3_A, serial_no + [url] + dataset)

    # Table 3 Installation ID data:
    for items in soup.find(id="tblChildDetails").find("table").find_all("tr")[1:]:
        dataset = [item.get_text(strip=True) for item in items.find_all("td")[:]]
        print(dataset)
        append_row(ws_3_I, serial_no + [url] + dataset)
workbook.close()  

The current and expected output are attached as screenshots. Expected output: [screenshot]

Traceback:

Traceback (most recent call last):
  File "D:\QRS\Script.py", line 42, in <module>
    append_row(ws_3_A, serial_no + [url] + dataset)
  File "D:\QRS\Script.py", line 10, in append_row
    ws.write_string(ws.cur_row, col, value)
  File "C:\Users\varun\AppData\Roaming\Python\Python36\site-packages\xlsxwriter\worksheet.py", line 67, in cell_wrapper
    return method(self, *args, **kwargs)
  File "C:\Users\varun\AppData\Roaming\Python\Python36\site-packages\xlsxwriter\worksheet.py", line 500, in write_string
    return self._write_string(row, col, string, cell_format)
  File "C:\Users\varun\AppData\Roaming\Python\Python36\site-packages\xlsxwriter\worksheet.py", line 512, in _write_string
    if len(string) > self.xls_strmax:
TypeError: object of type 'int' has no len()

1 Answer:

Answer 0 (score: 2):

Each element in [int(x) for x in str(page_number)] should be a string, because each element is passed to the value variable in the append_row() function.
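For example, append_row() could coerce every value to a string before writing (a minimal defensive sketch; write_string() accepts only strings):

def append_row(ws, row):
    for col, value in enumerate(row):
        # str() makes ints such as the page number safe for write_string()
        ws.write_string(ws.cur_row, col, str(value))
    ws.cur_row += 1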

Then, to get the expected output, the first tr (the header row) has to be skipped except when the worksheet for that header is first created.

Use try...finally so that the workbook is closed even if the script raises an error:

import time
import requests
import random
from lxml import html
from bs4 import BeautifulSoup
import xlsxwriter

def append_row(ws, row):
    for col, value in enumerate(row):
        ws.write_string(ws.cur_row, col, value)
    ws.cur_row += 1

workbook = xlsxwriter.Workbook('Output.xlsx')


def ws_3(name):
    return workbook.add_worksheet("Table 3 {}".format(name))


# Code starts from here:
link = "http://ec.europa.eu/environment/ets/ohaDetails.do?returnURL=&languageCode=en&accountID=&registryCode=&buttonAction=all&action=&account.registryCode=&accountType=&identifierInReg=&accountHolder=&primaryAuthRep=&installationIdentifier=&installationName=&accountStatus=&permitIdentifier=&complianceStatus=&mainActivityType=-1&searchType=oha&resultList.currentPageNumber={}&nextList=Next%C2%A0%3E&selectedPeriods="
coldict = {}  # maps a table's header text to the worksheet collecting its rows
try:
    for page_number in [1, 2, 3, 342, 343]:  # explicit sample pages to fetch
        print("Page {}".format(page_number))
        url = link.format(page_number)
        r = requests.get(url)

        serial_no = [str(page_number)]

        time.sleep(random.randint(2, 5))
        soup = BeautifulSoup(r.content, "lxml")

        # Table 3 child data: the second tr holds the header cells, e.g.
        # "Aircraft Operator ID" or "Installation ID", which pick the worksheet
        tr = soup.find(id="tblChildDetails").find("table").find_all("tr")
        dataset = [item.get_text(strip=True) for item in tr[1].find_all("td")]

        # select the worksheet for this header, or create it and write the header row once
        if not coldict.get(dataset[0]):
            ws = ws_3(dataset[0])
            ws.cur_row = 0
            coldict[dataset[0]] = ws
            append_row(ws, ["Page no","Url"] + dataset)
        else:
            ws = coldict.get(dataset[0])

        for items in tr[2:]:  # data rows start after the header row
            dataset = [item.get_text(strip=True) for item in items.find_all("td")]
            print(url)
            print(dataset)
            append_row(ws, serial_no + [url] + dataset)

finally:
    workbook.close()
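
As a side note, xlsxwriter's Workbook also works as a context manager, so an equivalent pattern (a sketch, not part of the original answer) closes the file automatically:

import xlsxwriter

# Workbook implements the context-manager protocol, so close() runs
# on leaving the block, even if an exception was raised inside it
with xlsxwriter.Workbook('Output.xlsx') as workbook:
    ws = workbook.add_worksheet("Table 3 A")
    ws.write_string(0, 0, "Page no")
    ws.write_string(0, 1, "Url")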