import requests, bs4
import numpy as np
import requests
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pandas import DataFrame
# Scrape book titles and authors from Project Gutenberg search-result pages
# and collect them into a single DataFrame.

# Result pages to fetch; the second URL resumes the listing at start_index=26.
urls = [
    'http://www.gutenberg.org/ebooks/search/?sort_order=title',
    'http://www.gutenberg.org/ebooks/search/?sort_order=title&start_index=26',
]

# Accumulate across every page. These lists must be created before the loop:
# re-creating them per page would keep only the last page's results.
article_title = []
article_author = []

for url in urls:
    # A timeout keeps the script from hanging forever on a stalled connection,
    # and raise_for_status() surfaces HTTP errors instead of silently parsing
    # an error page into an empty result.
    page = requests.get(url, timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    soup_title = soup.find_all("span", {"class": "title"})  # books
    soup_para = soup.find_all("span", {"class": "subtitle"})  # authors

    # NOTE(review): titles and authors are paired purely by position. If the
    # page contains a "title" span without a matching "subtitle" span, the
    # pairs after it will be misaligned -- confirm against the page markup.
    # zip() stops at the shorter list, so a count mismatch cannot raise
    # IndexError the way index-based access would.
    for title_tag, author_tag in zip(soup_title, soup_para):
        article_title.append(title_tag.text.strip())
        article_author.append(author_tag.text)

data = {'Article_Author': article_author, 'Article_Title': article_title}
df = DataFrame(data, columns=['Article_Title', 'Article_Author'])
print(df)
# Bare expression: only displays the row count in an interactive session
# (REPL / notebook); it has no effect when run as a script.
len(df)
我需要从网站 'http://www.gutenberg.org/ebooks/search/?sort_order=title' 抓取数据,一直抓到最后一页。我该如何遍历各个页面,以获取该部分中所有作品的作者和标题?
答案 0 :(得分:0)
您是说在前25个结果之后,要导航到下一页并获取下一页的结果吗?您可以使用 BeautifulSoup 来获取页面右下方“下一页”(Next)按钮的 URL:
# soup.find() returns the first matching <a> Tag (or None when nothing
# matches), not a URL string; the link target is the tag's 'href' attribute.
next_url = soup.find('a', {'title': 'Go to the next page results.'})
,然后使用新的URL再次运行代码。