I managed to write code that scrapes the data from the first page, and now I'm stuck on adding a loop to this code to scrape the next "n" pages. The code is below.
I would appreciate it if someone could guide/help me write code that scrapes the data from the remaining pages.
Thanks!
from bs4 import BeautifulSoup
import requests
import csv
url = requests.get('https://wsc.nmbe.ch/search?sFamily=Salticidae&fMt=begin&sGenus=&gMt=begin&sSpecies=&sMt=begin&multiPurpose=slsid&sMulti=&mMt=contain&searchSpec=s').text
soup = BeautifulSoup(url, 'lxml')
elements = soup.find_all('div', style="border-bottom: 1px solid #C0C0C0; padding: 10px 0;")
#print(elements)
csv_file = open('wsc_scrape.csv', 'w')
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['sp_name', 'species_author', 'status', 'family'])
for element in elements:
    sp_name = element.i.text.strip()
    print(sp_name)
    status = element.find('span', class_=['success label', 'error label']).text.strip()
    print(status)
    author_family = element.i.next_sibling.strip().split('|')
    species_author = author_family[0].strip()
    family = author_family[1].strip()
    print(species_author)
    print(family)
    print()
    csv_writer.writerow([sp_name, species_author, status, family])
csv_file.close()
Answer 0 (score: 2)
You have to pass the page= parameter in the URL and iterate over all the pages:
from bs4 import BeautifulSoup
import requests
import csv
csv_file = open('wsc_scrape.csv', 'w', encoding='utf-8', newline='')  # newline='' avoids blank rows in the CSV on Windows
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['sp_name', 'species_author', 'status', 'family'])

# The results span 151 pages; the page numbers in the URL start at 1.
for i in range(151):
    url = requests.get('https://wsc.nmbe.ch/search?page={}&sFamily=Salticidae&fMt=begin&sGenus=&gMt=begin&sSpecies=&sMt=begin&multiPurpose=slsid&sMulti=&mMt=contain&searchSpec=s'.format(i + 1)).text
    soup = BeautifulSoup(url, 'lxml')
    elements = soup.find_all('div', style="border-bottom: 1px solid #C0C0C0; padding: 10px 0;")
    for element in elements:
        sp_name = element.i.text.strip()
        print(sp_name)
        # The status badge is rendered as either a "success" or an "error" label.
        status = element.find('span', class_=['success label', 'error label']).text.strip()
        print(status)
        # The text node right after the <i> tag holds "author | family".
        author_family = element.i.next_sibling.strip().split('|')
        species_author = author_family[0].strip()
        family = author_family[1].strip()
        print(species_author)
        print(family)
        print()
        csv_writer.writerow([sp_name, species_author, status, family])
csv_file.close()
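Note that range(151) hard-codes the number of result pages, so the loop will quietly miss data if the catalogue grows (or fetch empty pages if it shrinks); the page count can also be read from the pagination links, as the snippet at the end of the next answer shows.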
Answer 1 (score: 0)
I'm not sure how your description maps onto the page, but the following shows the principle of looping over pages and how to extract the information.
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
n = 4  # number of result pages to scrape
headers = ['Success/Failure', 'Names', 'AuthorInfo', 'Family']
df = pd.DataFrame(columns=headers)

with requests.Session() as s:
    for page in range(1, n + 1):
        r = s.get('https://wsc.nmbe.ch/search?sFamily=Salticidae&fMt=begin&sGenus=&gMt=begin&sSpecies=&sMt=begin&multiPurpose=slsid&sMulti=&mMt=contain&searchSpec=s&page={}'.format(page))
        soup = bs(r.content, 'lxml')
        # The status badge carries either the "success" or the "error" class.
        failSucceed = [item.text for item in soup.select('.success, .error')]
        names = [item.text for item in soup.select('.ym-gbox div > i')]
        # The text node after each <i> holds "author | family".
        authorInfo = [item.next_sibling for item in soup.select('.ym-gbox div > i')]
        family = [item.split('|')[1] for item in authorInfo]
        # Passing the headers here as well keeps the columns aligned in the concat below.
        dfCurrent = pd.DataFrame(list(zip(failSucceed, names, authorInfo, family)), columns=headers)
        df = pd.concat([df, dfCurrent])

df = df.reset_index(drop=True)
df.to_csv(r"C:\Users\User\Desktop\test.csv", encoding='utf-8')
print(df)
You can get the number of result pages with:

numPages = int(soup.select('[href*="search?page"]')[-2].text)
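A minimal sketch of how this could plug into the loop above, assuming the pagination links on page 1 are representative of the whole result set (the selector and the [-2] index come from the snippet above):

import requests
from bs4 import BeautifulSoup as bs

base = ('https://wsc.nmbe.ch/search?sFamily=Salticidae&fMt=begin&sGenus=&gMt=begin'
        '&sSpecies=&sMt=begin&multiPurpose=slsid&sMulti=&mMt=contain&searchSpec=s&page={}')

with requests.Session() as s:
    # Fetch page 1 first and read the total page count from the pagination links.
    soup = bs(s.get(base.format(1)).content, 'lxml')
    numPages = int(soup.select('[href*="search?page"]')[-2].text)
    # ... extract the data from page 1 here, then walk the remaining pages ...
    for page in range(2, numPages + 1):
        soup = bs(s.get(base.format(page)).content, 'lxml')
        # ... same extraction as in the loop body above ...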