我有以下代码:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import requests
from requests import get
# Scrape the 2018 match table from one tennisexplorer.com player page and
# assemble it into the DataFrame ``initial_df_2018``.
from itertools import chain, groupby, repeat

date = []
tourney_round = []
result = []
winner_odds = []
loser_odds = []
surface = []
players_and_tourney = []  # must be initialised; a bare name raises NameError

response = get('http://www.tennisexplorer.com/player/humbert-e2553/?annual=all')
page_html = BeautifulSoup(response.text, 'html.parser')
results2018_containers = page_html.find_all('div', id='matches-2018-1-data')

string_2018 = '2018'
for container in results2018_containers:
    # Read every column from the container being iterated, so each matching
    # container contributes its rows exactly once.
    date.extend(
        cell.text + string_2018
        for cell in container.find_all('td', class_='first time')
    )
    tourney_round.extend(
        cell.text for cell in container.find_all('td', class_='round')
    )
    result.extend(cell.text for cell in container.find_all('td', class_='tl'))
    surface.extend(
        cell.find('span')['title']
        for cell in container.find_all('td', class_='s-color')
    )

    # The 'course' cells alternate: even positions are stored as winner odds,
    # odd positions as loser odds.
    odds_2018 = container.find_all('td', class_='course')
    winner_odds.extend(cell.text for cell in odds_2018[0::2])
    loser_odds.extend(cell.text for cell in odds_2018[1::2])

    players_and_tourney.extend(
        cell.text for cell in container.find_all('td', class_='t-name')
    )

chainer = chain.from_iterable


def condition(x):
    """Return True when *x* starts with a non-breaking space (U+00A0)."""
    return x.startswith('\xa0')


# Runs of consecutive cells that do not satisfy ``condition`` are kept as the
# "Players" entries, one list per run.
elements = [list(j) for i, j in groupby(players_and_tourney, key=condition) if not i]
# Create list of headers: the first cell of every run that does satisfy it.
headers = [next(j) for i, j in groupby(players_and_tourney, key=condition) if i]

# Chain the list of lists, and repeat each header once per row of its run so
# that the 'Tournament' column lines up with the 'Players' column.
initial_df_2018 = pd.DataFrame({
    'Date': date,
    'Surface': surface,
    'Players': list(chainer(elements)),
    'Tournament': list(chainer(
        repeat(i, j) for i, j in zip(headers, map(len, elements))
    )),
    'Round': tourney_round,
    'Result': result,
    'Winner Odds': winner_odds,
    'Loser Odds': loser_odds,
})

# Split "A - B" on the first ' - ' only. ``expand=True`` returns the parts as
# columns; ``reindex`` guarantees both columns exist even when nothing was
# split (e.g. an empty table).
players_split = (
    initial_df_2018['Players']
    .str.split(' - ', n=1, expand=True)
    .reindex(columns=[0, 1])
)
initial_df_2018['Winner'] = players_split[0]
initial_df_2018['Loser'] = players_split[1]
del initial_df_2018['Players']

# NOTE(review): 'Round' is not part of this selection, so it is dropped from
# the final frame -- confirm whether that is intended.
initial_df_2018 = initial_df_2018[[
    'Date', 'Surface', 'Tournament', 'Winner', 'Loser',
    'Result', 'Winner Odds', 'Loser Odds',
]]
我想创建一个循环来运行从2005年开始的每年的代码。所以基本上,在2005年到2018年之间每年在代码中替换2018来运行循环。如果可能,代码将首先运行2018年,然后是2017年,依此类推,直到2005年。
编辑:我添加了用于提取2018年数据的代码,但我希望有一个循环,可以在页面上找到所有年份的数据。
答案 0 :(得分:2)
如果我理解正确,您希望把针对2018年所做的处理,对2005年至2018年间的每一年都执行一遍。
我的做法是在这个年份范围内循环运行你的代码,每次替换id中的年份,并把该年的所有数据收集到一个列表中(按年份存入字典)。
# Collect the text of every 'plays' cell per season into ``date_dict``,
# keyed by year.
response = get('http://www.example.com')
page_html = BeautifulSoup(response.text, 'html.parser')
date_dict = {}
# Newest season first: 2018 down to 2005 (the stop value 2004 is exclusive).
for year in range(2018, 2004, -1):
    date = []
    string_id = "played-{}-data".format(year)
    results_containers = page_html.find_all('div', id=string_id)
    # find_all returns an empty result set, never None, when nothing matches,
    # so test for emptiness to skip seasons that are absent from the page.
    if not results_containers:
        continue
    for container in results_containers:
        # Read from the container being iterated so that each one is
        # processed exactly once.
        for i in container.find_all('td', class_='plays'):
            date.append(i.text)
    # Each year is visited once and date_dict starts empty, so a plain
    # assignment is enough.
    date_dict[year] = date
答案 1 :(得分:2)
您可以将年份存储为整数但仍以字符串形式使用。
# Count down from 2018 to 2005; the integer year is formatted straight into
# the string.
for year in reversed(range(2005, 2019)):
    print(f"Happy New Year {year}")
在字符串中包含数字的其他方式还有 "Happy New Year {}".format(year),或者 "it is now " + str(year) + " more text"。
另外,我认为这并不是你想要的,但如果有人搜到这个问题并且确实想要"迭代字符串",凯撒密码(Caesar cipher)是一个不错的入手点。
答案 2 :(得分:1)
循环没问题,但你需要定义你想要的结果。我在这里使用了一个字典,我把你的代码变成了一个我可以用变量调用的函数:
def get_data(year):
    """Return the text of every 'plays' cell for *year* from the fetched page.

    Args:
        year: Season to extract; it is substituted into the div id
            ``played-<year>-data``.

    Returns:
        A list of strings, one per matching ``td``; empty when the page
        contains no div with that id.
    """
    date = []
    # NOTE: the page is downloaded again on every call.
    response = get('http://www.example.com')
    page_html = BeautifulSoup(response.text, 'html.parser')
    # The placeholder is named, so the value has to be passed by keyword;
    # a positional ``.format(year)`` raises KeyError: 'year'.
    results_containers = page_html.find_all(
        'div', id='played-{year}-data'.format(year=year)
    )
    for container in results_containers:
        # Read from the container being iterated so that each one is
        # processed exactly once.
        date.extend(i.text for i in container.find_all('td', class_='plays'))
    return date
现在只需创建一个包含所有可能年份的 range,并对其中的每一年调用该函数,这可以很简单地完成:
# Map each year (2018 down to 2005) to the list returned by get_data(year).
all_data = dict(
    (year, get_data(year))
    for year in range(2018, 2004, -1)
)
答案 3 :(得分:1)
只需用 for 循环遍历一个 range 即可。类似这样:
date = []
response = get('http://www.example.com')
page_html = BeautifulSoup(response.text, 'html.parser')
# Walk the years newest-first (2018 down to 2005) and look up the div whose
# id embeds the current year.
for year in range(2018, 2004, -1):
    year_id = f'played-{year}-data'
    results_containers = page_html.find_all('div', id=year_id)
    ...