我继承了以下工作。
import os
import pandas as pd
from selenium import webdriver
from tabulate import tabulate
from datetime import datetime
import time
from bs4 import BeautifulSoup as bs
start = datetime.now()
pd.set_option('display.max_rows', 5000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
browser = webdriver.Chrome()
class GameData:
def __init__(self):
self.date = []
self.time = []
self.game = []
self.score = []
self.home_odds = []
self.draw_odds = []
self.away_odds = []
self.country = []
self.league = []
def parse_data(url):
browser.get(url)
df = pd.read_html(browser.page_source, header=0)[0]
html = browser.page_source
soup = bs(html, "lxml")
cont = soup.find('div', {'id': 'wrap'})
content = cont.find('div', {'id': 'col-content'})
content = content.find('table', {'class': 'table-main'}, {'id': 'tournamentTable'})
main = content.find('th', {'class': 'first2 tl'})
if main is None:
return None
count = main.findAll('a')
country = count[1].text
league = count[2].text
game_data = GameData()
game_date = None
for row in df.itertuples():
if not isinstance(row[1], str):
continue
elif ':' not in row[1]:
game_date = row[1].split('-')[0]
continue
game_data.date.append(game_date)
game_data.time.append(row[1])
game_data.game.append(row[2])
game_data.score.append(row[3])
game_data.home_odds.append(row[4])
game_data.draw_odds.append(row[5])
game_data.away_odds.append(row[6])
game_data.country.append(country)
game_data.league.append(league)
return game_data
urls = {
"https://www.oddsportal.com/soccer/england/premier-league/results/"
}
if __name__ == '__main__':
results = None
for url in urls:
try:
game_data = parse_data(url)
if game_data is None:
continue
result = pd.DataFrame(game_data.__dict__)
if results is None:
results = result
else:
results = results.append(result, ignore_index=True)
except ValueError:
game_data = parse_data(url)
if game_data is None:
continue
result = pd.DataFrame(game_data.__dict__)
if results is None:
results = result
except AttributeError:
game_data = parse_data(url)
if game_data is None:
continue
result = pd.DataFrame(game_data.__dict__)
if results is None:
results = result
else:
results = results.append(result, ignore_index=True)
print(tabulate(results, headers='keys', tablefmt="github"))
end = datetime.now()
time_taken = end - start
print('Time taken to complete: ', time_taken)
| | date | time | game | score | home_odds | draw_odds | away_odds | country | league |
|----|-------------------|--------|----------------------------------|---------|-------------|-------------|-------------|-----------|----------------|
| 0 | Yesterday, 11 May | 19:15 | Southampton - Crystal Palace | 3:1 | 1.89 | 3.8 | 4.11 | England | Premier League |
| 1 | Yesterday, 11 May | 17:00 | Manchester Utd - Leicester | 1:2 | 3.72 | 3.58 | 2.07 | England | Premier League |
| 2 | 10 May 2021 | 19:00 | Fulham - Burnley | 0:2 | 2.24 | 3.44 | 3.38 | England | Premier League |
| 3 | 09 May 2021 | 18:00 | Arsenal - West Brom | 3:1 | 1.5 | 4.53 | 6.76 | England | Premier League |
| 4 | 09 May 2021 | 15:30 | West Ham - Everton | 0:1 | 2.15 | 3.56 | 3.48 | England | Premier League |
我现在需要为 https://www.oddsportal.com/matches/soccer/ 获得类似的数据输出
当我将网址更改为:
urls = {
"https://www.oddsportal.com/matches/soccer/"
}
我收到此错误
Traceback (most recent call last):
File "C:/Users/Harshad/AppData/Roaming/JetBrains/PyCharmCE2020.3/scratches/scratch_36.py", line 79, in <module>
game_data = parse_data(url)
File "C:/Users/Harshad/AppData/Roaming/JetBrains/PyCharmCE2020.3/scratches/scratch_36.py", line 48, in parse_data
league = count[2].text
IndexError: list index out of range
我对从网络上抓取数据完全陌生,因此这个新手问题是关于如何修改此代码以使其正常工作。
请帮忙
答案 0 :(得分:-1)
运行 len(count)
以检查 count 中的元素数。它可能只找到了 1 个元素。因此,当您尝试 count[2].text 时,它会给出 IndexError。