I want to extract free football betting tips from https://oddslot.com/tips/ all the way to the last page. Below is my code:
import requests
from bs4 import BeautifulSoup
from pandas.io.html import read_html

bet = None
while True:
    if bet:
        page = bet
    else:
        page = 'https://oddslot.com/tips/'

    infoboxes = read_html(page, index_col=0, attrs={"class": "table table-hover team-schedule team-schedule--full"})
    file_name = './my_file.csv'
    infoboxes[0].to_csv(file_name, sep=',')

    page = requests.get('https://oddslot.com/tips/')
    soup = BeautifulSoup(page.text, 'html.parser')
    bet_link = soup.find('a', class_='post-pagination text-center')
    if bet_link:
        bet = bet_link.get('href')
    else:
        break
I only ever get the first page. How can I get through to the last page?
Answer (score: 0)
The key is to grab all the data from one page, then iterate over all the pages. One way to do it:
import csv
import requests
from lxml import html


def extractor(p, x):
    return html.fromstring(p).xpath(x)


def parse_rows(rows):
    d = {}
    for row in rows:
        d["time"] = row.xpath('//*[@class="team-schedule__date"]/strong/text()')
        teams = row.xpath('//*[@class="team-meta__name"]/strong/text()')
        d["home_team"] = teams[1::2]
        d["away_team"] = teams[::2]
        other_info = row.xpath('//*[@class="team-schedule__time"]/strong/text()')
        d["league"] = other_info[::3]   # league name
        d["chance"] = other_info[1::3]  # chance % value
        d["odds"] = row.xpath('//*[@class="team-schedule__time"]/a/strong/text()')
        d["pick"] = other_info[2::3]    # pick
    return d


def page_scraper(page_range):
    for page_number in range(1, page_range + 1):
        url = f"https://oddslot.com/tips/?page={page_number}"
        print(f"Fetching data from page: {page_number} / {page_range}")
        response = requests.get(url)
        yield parse_rows(extractor(response.content, "//tbody/tr"))


for page_data in page_scraper(page_range=100):
    with open('odd_slots_tips.csv', 'a') as f:
        w = csv.writer(f)
        for i in zip(*page_data.values()):
            w.writerow(list(i))
This scrapes all 100 pages and produces a .csv file with 1,000 rows of data (10 records * 100 pages). If you would rather detect the last page instead of hardcoding 100, see the sketch after the sample output.
Sample rows from the .csv file:
23:00 GMT,Sao Paulo,Atletico-MG,Brazil: Serie A,67%,1.95,HOME DC
22:00 GMT,Sport Recife,Gremio,Brazil: Serie A,88%,1.32,HOME WIN
18:45 GMT,Hungary,Turkey,Europe: Uefa Nations League - League B,81%,1.52,HOME WIN
...
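If you do not want to hardcode page_range=100, one option is to read the highest page number out of the pagination links on the first page and pass that into page_scraper. The sketch below is only a guess at the markup: it assumes the pagination block uses the post-pagination class the question's code refers to, and that each page link carries a ?page=N query parameter like the URLs built in page_scraper; adjust the XPath if the site's actual HTML differs.

import re
import requests
from lxml import html

def last_page_number(start_url="https://oddslot.com/tips/"):
    # Fetch the first page and collect every link inside the pagination block.
    # ASSUMPTION: the pagination container has a 'post-pagination' class and
    # its links use '?page=N' hrefs; both are guesses about the site's markup.
    response = requests.get(start_url)
    tree = html.fromstring(response.content)
    hrefs = tree.xpath('//*[contains(@class, "post-pagination")]//a/@href')
    # Pull the numeric page value out of each href and keep the largest one.
    pages = [int(m.group(1)) for m in (re.search(r"page=(\d+)", h) for h in hrefs) if m]
    return max(pages) if pages else 1

# Then drive the existing generator with the detected count instead of 100:
# for page_data in page_scraper(page_range=last_page_number()):
#     ...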