此代码适用于单个网址,但使用多个网址时,出现错误
import os
import pandas as pd
from selenium import webdriver
from tabulate import tabulate
from datetime import datetime
import time
from bs4 import BeautifulSoup as bs
# Record the start time so total runtime can be reported at the end.
start = datetime.now()
# One shared Chrome instance, reused by parse_data for every URL.
browser = webdriver.Chrome()
class GameData:
    """Column-oriented accumulator: one parallel list per scraped column."""

    def __init__(self):
        # Initialise every output column to an independent empty list.
        for column in ("date", "time", "game", "score",
                       "home_odds", "draw_odds", "away_odds"):
            setattr(self, column, [])
def parse_data(url):
    """Scrape one oddsportal results page.

    Loads *url* in the shared module-level ``browser``, reads the results
    table with pandas, and extracts country/league from the breadcrumb
    links in the tournament-table header.

    Returns:
        (game_data, country, league): a GameData whose parallel lists hold
        one entry per match row, plus the two breadcrumb strings.

    BUG FIX: the original called ``browser.quit()`` here, which destroys the
    shared WebDriver session -- every later call then fails with
    ``urllib3.exceptions.MaxRetryError`` because the driver's local server is
    gone.  The caller must quit the browser once, after ALL URLs are done.
    """
    browser.get(url)
    html = browser.page_source
    df = pd.read_html(html, header=0)[0]
    soup = bs(html, "lxml")
    # Walk down to the table header; its <a> links are [flag, country, league].
    cont = soup.find('div', {'id': 'wrap'})
    conti = cont.find('div', {'id': 'col-content'})
    content = conti.find('table', {'class': 'table-main'}, {'id': 'tournamentTable'})
    main = content.find('th', {'class': 'first2 tl'})
    count = main.findAll('a')
    country = count[1].text
    league = count[2].text

    game_data = GameData()
    game_date = None
    for row in df.itertuples():
        # Column 1 is either a date header (no ':') or a kick-off time.
        if not isinstance(row[1], str):
            continue
        elif ':' not in row[1]:
            game_date = row[1].split('-')[0]
            continue
        game_data.date.append(game_date)
        game_data.time.append(row[1])
        game_data.game.append(row[2])
        game_data.score.append(row[3])
        game_data.home_odds.append(row[4])
        game_data.draw_odds.append(row[5])
        game_data.away_odds.append(row[6])
    return game_data, country, league
# You can input as many URLs as you want.
urls = {
    "https://www.oddsportal.com/soccer/europe/champions-league/results/",
    "https://www.oddsportal.com/soccer/australia/a-league/results/#/page/1/",
    "https://www.oddsportal.com/soccer/belgium/jupiler-league/results/#/page/1/",
    "https://www.oddsportal.com/soccer/czech-republic/1-liga/results/#/page/1/",
}

if __name__ == '__main__':
    # Collect one DataFrame per URL, then concatenate once at the end:
    # DataFrame.append was removed in pandas 2.0 and was O(n^2) anyway.
    frames = []
    for url in urls:
        game_data, country, competition = parse_data(url)
        result = pd.DataFrame(game_data.__dict__)
        result['country'] = country
        result['competition'] = competition
        frames.append(result)
    results = pd.concat(frames, ignore_index=True) if frames else None
    # Quit the shared driver exactly once, AFTER every URL has been scraped.
    browser.quit()
    print(tabulate(results.head(), headers='keys', tablefmt="github"))
    end = datetime.now()
    time_taken = end - start
    print('Time taken to complete: ', time_taken)
浏览器没有循环到下一个网址,而是直接关闭,并抛出以下错误:
raise MaxRetryError(_pool, url, error or ResponseError(cause))
urllib3.exceptions.MaxRetryError: HTTPConnectionPool(host='127.0.0.1', port=54008): Max retries exceeded with url: /session/6c9f4ce81beb95e93f2cc32858dfb114/url (Caused by NewConnectionError(...))
如何为多个网址循环它?下面这段没有 return game_data, country, league 的代码可以完美运行:
from selenium import webdriver
import pandas as pd
browser = webdriver.Chrome()
class GameData:
    """Holds the scraped table as parallel lists, one per column."""

    def __init__(self):
        # Each instance gets its own fresh, empty column lists.
        for column in ("dates", "games", "scores",
                       "home_odds", "draw_odds", "away_odds"):
            setattr(self, column, [])
def parse_data(url):
    """Load *url* in the shared browser and collect its rows into a GameData."""
    browser.get(url)
    table = pd.read_html(browser.page_source, header=0)[0]
    data = GameData()
    current_date = None
    for row in table.itertuples():
        first = row[1]
        if not isinstance(first, str):
            continue
        if ':' not in first:
            # A string cell with no kick-off time is a date header row.
            current_date = first.split('-')[0]
            continue
        data.dates.append(current_date)
        data.games.append(row[2])
        data.scores.append(row[3])
        data.home_odds.append(row[4])
        data.draw_odds.append(row[5])
        data.away_odds.append(row[6])
    return data
urls = {
    "https://www.oddsportal.com/soccer/australia/a-league/results/",
    "https://www.oddsportal.com/soccer/europe/champions-league/results/",
    "https://www.oddsportal.com/soccer/europe/europa-league/results/",
}

if __name__ == '__main__':
    # Build one frame per URL, then concatenate once:
    # DataFrame.append was removed in pandas 2.0.
    frames = [pd.DataFrame(parse_data(url).__dict__) for url in urls]
    results = pd.concat(frames, ignore_index=True) if frames else None
答案 0(得分:0)
问题在于我没有正确地定义并构建数据框;
另外,browser.quit() 被放在循环内部、在循环完成之前就会执行,
因此浏览器在处理下一个 url 之前就已经关闭了。
import os
import pandas as pd
from selenium import webdriver
from tabulate import tabulate
from datetime import datetime
import time
from bs4 import BeautifulSoup as bs
browser = webdriver.Chrome()
class GameData:
    """Accumulator for scraped match rows: one parallel list per column."""

    def __init__(self):
        # country/league are repeated per row so every column has equal length.
        for column in ("date", "time", "game", "score", "home_odds",
                       "draw_odds", "away_odds", "country", "league"):
            setattr(self, column, [])
def parse_data(url):
    """Scrape one results page into a GameData, one list entry per match row.

    Does NOT quit the browser -- the shared driver must survive across calls
    so the caller can loop over many URLs.
    """
    browser.get(url)
    page = browser.page_source
    df = pd.read_html(page, header=0)[0]
    soup = bs(page, "lxml")
    # Descend to the tournament-table header; its links give country + league.
    wrap = soup.find('div', {'id': 'wrap'})
    col_content = wrap.find('div', {'id': 'col-content'})
    table = col_content.find('table', {'class': 'table-main'}, {'id': 'tournamentTable'})
    header = table.find('th', {'class': 'first2 tl'})
    links = header.findAll('a')
    country, league = links[1].text, links[2].text

    game_data = GameData()
    game_date = None
    for row in df.itertuples():
        cell = row[1]
        if not isinstance(cell, str):
            continue
        if ':' not in cell:
            # Date header row -- remember it for the match rows that follow.
            game_date = cell.split('-')[0]
            continue
        game_data.date.append(game_date)
        game_data.time.append(cell)
        game_data.game.append(row[2])
        game_data.score.append(row[3])
        game_data.home_odds.append(row[4])
        game_data.draw_odds.append(row[5])
        game_data.away_odds.append(row[6])
        game_data.country.append(country)
        game_data.league.append(league)
    return game_data
urls = {
    "https://www.oddsportal.com/soccer/europe/champions-league/results/",
    "https://www.oddsportal.com/soccer/australia/a-league/results/#/page/1/",
    "https://www.oddsportal.com/soccer/belgium/jupiler-league/results/#/page/1/",
    "https://www.oddsportal.com/soccer/czech-republic/1-liga/results/#/page/1/",
}

if __name__ == '__main__':
    # One frame per URL, concatenated once at the end:
    # DataFrame.append was removed in pandas 2.0 and grew quadratically.
    frames = []
    for url in urls:
        game_data = parse_data(url)
        frames.append(pd.DataFrame(game_data.__dict__))
    results = pd.concat(frames, ignore_index=True) if frames else None
    # Close the shared driver only after every URL has been processed.
    browser.quit()
    print(tabulate(results, headers='keys', tablefmt="github"))