I am scraping the Excel links from this page. Is it also possible to extract the date and the page link for each file?
import requests
from bs4 import BeautifulSoup
from pprint import pprint

base_url = 'https://usda.library.cornell.edu'
url = 'https://usda.library.cornell.edu/concern/publications/3t945q76s?locale=en#release-items'
soup = BeautifulSoup(requests.get(url).text, 'html.parser')

b = []
page = 1
while True:
    pdf_urls = [a["href"] for a in soup.select('#release-items a[href$=".xls"]')]
    pprint(pdf_urls)
    b.append(pdf_urls)
    m = soup.select_one('a[rel="next"][href]')
    if m and m['href'] != '#':
        soup = BeautifulSoup(requests.get(base_url + m['href']).text, 'html.parser')
    else:
        break
Answer 0 (score: 2)
You can use zip():
data = {}
for url, date in zip(soup.find_all("a", attrs={"data-label": "latest.xls"}),
                     soup.find_all("td", class_="date_uploaded")):
    data[url['href']] = date.text

print(data)

# Output:
{
'https://downloads.usda.library.cornell.edu/usda-esmis/files/3t945q76s/gm80jj54x/rx914c582/latest.xls': 'Aug 12, 2020',
'https://downloads.usda.library.cornell.edu/usda-esmis/files/3t945q76s/sb397x16q/wm118b04x/latest.xls': 'Jul 10, 2020',
'https://downloads.usda.library.cornell.edu/usda-esmis/files/3t945q76s/g158c396h/5138k2221/latest.xls': 'Jun 11, 2020',
'https://downloads.usda.library.cornell.edu/usda-esmis/files/3t945q76s/w6634p60m/fx71b7035/latest.xls': 'May 12, 2020',
'https://downloads.usda.library.cornell.edu/usda-esmis/files/3t945q76s/q237jb60d/47429t84v/latest.xls': 'Apr 9, 2020',
'https://downloads.usda.library.cornell.edu/usda-esmis/files/3t945q76s/02871d57q/vx021z530/latest.xls': 'Mar 10, 2020',
'https://downloads.usda.library.cornell.edu/usda-esmis/files/3t945q76s/pz50hc74s/xw42ns32q/latest.xls': 'Feb 11, 2020',
'https://downloads.usda.library.cornell.edu/usda-esmis/files/3t945q76s/79408c82d/zg64v261j/latest.xls': 'Jan 10, 2020',
'https://downloads.usda.library.cornell.edu/usda-esmis/files/3t945q76s/1544c4419/xk81k173q/latest.xls': 'Dec 10, 2019',
'https://downloads.usda.library.cornell.edu/usda-esmis/files/3t945q76s/st74d522v/d217r356w/latest.xls': 'Nov 8, 2019'
}
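
Note that zip() silently stops at the shorter of its two inputs, so any extra dates or links are dropped without warning. On Python 3.10 or newer you can pass strict=True to make a length mismatch raise an error instead (a small variation on the code above, assuming the same soup):

# Python 3.10+: raise ValueError on a length mismatch instead of truncating
data = {}
for url, date in zip(soup.find_all("a", attrs={"data-label": "latest.xls"}),
                     soup.find_all("td", class_="date_uploaded"),
                     strict=True):
    data[url['href']] = date.text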
Answer 1 (score: 1)
In some cases the page has rows without an XLS link, so it is safer to fetch the whole rows first and then use a for loop to search each row separately for its link and date. That way you also get the dates that have no xls. With zip() you could end up pairing a date with the link from the following row.
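A toy illustration of that pitfall, with made-up values:

# three rows of dates, but the second row has no XLS link
dates = ['Aug 12, 2020', 'Jul 10, 2020', 'Jun 11, 2020']
links = ['first.xls', 'third.xls']

# zip() pairs 'Jul 10, 2020' with 'third.xls' - the link from the next row
print(dict(zip(dates, links)))
# {'Aug 12, 2020': 'first.xls', 'Jul 10, 2020': 'third.xls'}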
The next pages use URLs with &page=2, &page=3, etc., so you could use &page={} to generate the URL for every next page, but then you cannot recognize the last page and stop the loop. Perhaps by checking status_code on the request you could detect that you did not get a next page and stop the loop.
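A minimal sketch of that page-number approach (the &page={} pattern and the stop conditions are assumptions, as described above):

import requests
from bs4 import BeautifulSoup

base = 'https://usda.library.cornell.edu/concern/publications/3t945q76s?locale=en&page={}'

page = 1
while True:
    r = requests.get(base.format(page))
    if r.status_code != 200:  # request failed - assume the page does not exist
        break
    soup = BeautifulSoup(r.text, 'html.parser')
    rows = soup.select('#release-items tr')
    if not rows:              # empty listing - treat it as the last page
        break
    # ... parse rows here, e.g. with get_data(soup) from the code below ...
    page += 1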
But there is a better/simpler way: the Next button on the page has a link to the next page, and I use it to get the next page. And because sometimes there are no xls files on a page, I also use that to stop the loop earlier.
import requests
from bs4 import BeautifulSoup

# --- functions ---

def get_data(soup):
    """Get links and dates for one page"""
    results = []
    all_rows = soup.select('#release-items tr')
    for row in all_rows:
        date = row.select_one('.date_uploaded').text
        pdf_url = row.select_one('a[href$=".xls"]')
        if pdf_url:
            pdf_url = pdf_url['href']
            results.append([date, pdf_url])
            print(date, pdf_url)
        else:
            print(date, "Can't find XLS")
    return results

# --- main ---

url = 'https://usda.library.cornell.edu/concern/publications/3t945q76s?locale=en#release-items'
all_results = []

# - loop -

while True:
    print('url:', url)

    # get current page
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')

    # parse current page - get all needed data
    results = get_data(soup)
    if not results:  # there are no more XLS files
        break
    all_results += results

    # get link to next page
    url = soup.find('a', {'rel': 'next'})
    if not url or url['href'] == '#':  # there are no more pages
        break
    url = 'https://usda.library.cornell.edu' + url['href']

# - after loop -

print('--- results ---')
print('len:', len(all_results))
print('first:', all_results[0])
print('last :', all_results[-1])
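
As a possible follow-up (not part of the original answer), the collected [date, url] pairs can be written to a CSV file with the standard library:

import csv

# save the scraped [date, url] pairs from all_results for later use
with open('releases.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['date', 'xls_url'])
    writer.writerows(all_results)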