I'm trying to scrape a page. I can get it to pull all the data and save it to array objects, but I can't get my for loop to iterate over each index of the array and write them out to a CSV. It only writes the headers and the first object. I'm new to writing code, so any help is appreciated.
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
my_url = 'https://www.sports-reference.com/cfb/schools/air-force/'
# Open Connection & Grabbing the Page
uClient = uReq(my_url)
#Creating variable to Save the Page
page_html = uClient.read()
#Closing the connection
uClient.close()
#Parse the data to HTML
page_soup = soup(page_html, "html.parser")
#Grab container info from the DOM
containers = page_soup.findAll("div",{"class":"overthrow table_container"})
filename = "airforce.csv"
f = open(filename, "w")
headers = "year, wins, losses, ties, wl, sos\n"
f.write(headers)
for container in containers:
    # Find all years
    year_container = container.findAll("td", {"data-stat": "year_id"})
    year = year_container[0].text
    # Find number of wins
    wins_container = container.findAll("td", {"data-stat": "wins"})
    wins = wins_container[0].text
    # Find number of losses
    losses_container = container.findAll("td", {"data-stat": "losses"})
    losses = losses_container[0].text
    # Number of ties, if any
    ties_container = container.findAll("td", {"data-stat": "ties"})
    ties = ties_container[0].text
    # Win-loss as a percentage
    wl_container = container.findAll("td", {"data-stat": "win_loss_pct"})
    wl = wl_container[0].text
    # Strength of schedule; can be +/- with 0 being average
    sos_container = container.findAll("td", {"data-stat": "sos"})
    sos = sos_container[0].text
    f.write(year + "," + wins + "," + losses + "," + ties + "," + wl + "," + sos + "\n")
f.close()
Answer 0 (score: 0)
You want to find the table (body), then iterate over the table rows that are not header rows, i.e. all rows that don't have a class. For writing (and reading) CSV files, the standard library has a csv module.
import csv
from urllib.request import urlopen

import bs4


def iter_rows(html):
    headers = ['year_id', 'wins', 'losses', 'ties', 'win_loss_pct', 'sos']
    yield headers
    soup = bs4.BeautifulSoup(html, 'html.parser')
    table_body_node = soup.find('table', 'stats_table').tbody
    for row_node in table_body_node('tr'):
        if not row_node.get('class'):
            yield [
                row_node.find('td', {'data-stat': header}).text
                for header in headers
            ]


def main():
    url = 'https://www.sports-reference.com/cfb/schools/air-force/'
    with urlopen(url) as response:
        html = response.read()
    with open('airforce.csv', 'w', newline='') as csv_file:
        csv.writer(csv_file).writerows(iter_rows(html))


if __name__ == '__main__':
    main()
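As a side note on the csv module used above, here is a minimal self-contained sketch of csv.writer with hypothetical sample data, writing to an in-memory buffer instead of a file so nothing is scraped or saved:

```python
import csv
import io

# Write sample rows through csv.writer into an in-memory buffer.
buffer = io.StringIO()
writer = csv.writer(buffer)
writer.writerows([
    ['year', 'wins', 'losses'],
    ['2019', '11', '2'],
])

# csv.writer quotes fields and terminates lines ("\r\n" by default) for you,
# so there is no manual string concatenation as in the original code.
print(buffer.getvalue())
```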
Answer 1 (score: -1)
Looking at the extracted HTML source, only one container ends up in the containers list, which means your for loop is iterating over the wrong thing. You should instead use a range() over the td elements found inside that single item of containers to access each record.
Try this:
# containers is a list with a single item, so index into it once
table = containers[0]
# number of records to iterate over
num = len(table.findAll("td", {"data-stat": "year_id"}))
for i in range(num):
    # Find all years
    year_container = table.findAll("td", {"data-stat": "year_id"})
    year = year_container[i].text
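To make that range-based approach concrete, here is a self-contained sketch run against a small inline HTML snippet (hypothetical data shaped like the sports-reference table) rather than the live page, collecting one tuple per row index:

```python
from bs4 import BeautifulSoup

# Hypothetical sample HTML standing in for the scraped page.
sample_html = """
<div class="overthrow table_container"><table><tbody>
<tr><td data-stat="year_id">2019</td><td data-stat="wins">11</td></tr>
<tr><td data-stat="year_id">2018</td><td data-stat="wins">5</td></tr>
</tbody></table></div>
"""

page_soup = BeautifulSoup(sample_html, "html.parser")
containers = page_soup.findAll("div", {"class": "overthrow table_container"})

# Only one container, so index into it once, then loop by row index.
table = containers[0]
years = table.findAll("td", {"data-stat": "year_id"})
wins = table.findAll("td", {"data-stat": "wins"})

rows = []
for i in range(len(years)):
    rows.append((years[i].text, wins[i].text))

print(rows)  # [('2019', '11'), ('2018', '5')]
```

The same pattern extends to the other data-stat columns (losses, ties, win_loss_pct, sos): fetch each column's td list once, then index all of them with the same i inside the loop.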