I want to use the following code to get all the tables from a web page at a URL.
import csv
import requests
from bs4 import BeautifulSoup

urls = [
    'https://g10oal.com/match/c81e21f3-7804-4961-ac74-4e2804a19784/odds'
]

all_data = []
for url in urls:
    page = requests.get(url)
    soup = BeautifulSoup(page.content, "html.parser")
    table = soup.findAll("class", {"class": "table table-sm odds-compare-table"})[0]
    # here I store all rows to list `all_data`
    for row in table.findAll('tr'):
        tds = [cell.get_text(strip=True, separator=' ') for cell in row.findAll(["td", "th"])]
        all_data.append(tds)
        print(*tds)

# write list `all_data` to CSV
with open("c:/logs/test.csv", "wt+", newline="") as f:
    writer = csv.writer(f)
    for row in all_data:
        writer.writerow(row)
After running the code, it shows "IndexError: list index out of range".
Answer 0 (score: 0):
The first argument of findAll is the tag/element name, not an attribute.
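To see why this raises your IndexError: no HTML tag is literally named <class>, so the call matches nothing and returns an empty list, and indexing an empty list with [0] fails. A quick sketch, reusing the soup object from your code:

# findAll looks for a tag named <class>, which doesn't exist
tables = soup.findAll("class", {"class": "table table-sm odds-compare-table"})
print(tables)   # -> []
tables[0]       # -> IndexError: list index out of range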
You should do:
# make sure to use a single space only between table-sm and odds-compare-table
tables = soup.findAll("table", {"class": "table table-sm odds-compare-table"})
# or, pass the classes as a list
tables = soup.findAll("table", {"class": ["table", "table-sm", "odds-compare-table"]})
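As a side note (not needed for the fix), BeautifulSoup also supports CSS selectors via select, which matches all three classes regardless of their order in the class attribute:

# equivalent CSS-selector form: <table> elements carrying all three classes
tables = soup.select("table.table.table-sm.odds-compare-table")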
Then you loop over tables:
for table in tables:
    # here I store all rows to list `all_data`
    for row in table.findAll('tr'):
        tds = [cell.get_text(strip=True, separator=' ') for cell in row.findAll(["td", "th"])]
        all_data.append(tds)
        print(*tds)
If you only want the first table, you can use find instead of findAll:
table = soup.find("table", {"class": ["table", "table-sm", "odds-compare-table"]})
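For completeness, here is a minimal corrected version of the whole script, as a sketch that keeps your original URL and CSV path. Note that find returns None when nothing matches, so it is guarded before reading rows:

import csv
import requests
from bs4 import BeautifulSoup

urls = [
    'https://g10oal.com/match/c81e21f3-7804-4961-ac74-4e2804a19784/odds'
]

all_data = []
for url in urls:
    page = requests.get(url)
    soup = BeautifulSoup(page.content, "html.parser")
    # first argument is the tag name ("table"), not "class"
    table = soup.find("table", {"class": ["table", "table-sm", "odds-compare-table"]})
    if table is None:
        # find returns None when no matching table exists; skip this URL
        continue
    for row in table.findAll('tr'):
        tds = [cell.get_text(strip=True, separator=' ') for cell in row.findAll(["td", "th"])]
        all_data.append(tds)

# write all collected rows to CSV in one call
with open("c:/logs/test.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerows(all_data)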