我想爬取 Stack Overflow 上带有 android-studio 标签的问题/答案的 URL,用于研究。该标签下应该大约有 55628 个问题: https://stackoverflow.com/questions/tagged/android-studio?sort=newest&page=1&pagesize=15
但是,到目前为止,我只能抓取到 50 个问题,爬取在第四页的中间就停止了。
我觉得应该写一个 for 循环来遍历各个分页的 URL,但不知道应该从哪里开始修改。应该如何修改我的程序?
import requests
from bs4 import BeautifulSoup
import re
import json
class Stack(object):
    """Crawler for Stack Overflow questions tagged ``android-studio``.

    Downloads one or more listing pages, extracts every question's title and
    absolute URL, and writes the collected records to a JSON file.
    """

    def __init__(self):
        self.baseurl = "https://stackoverflow.com"
        self.starturl = "https://stackoverflow.com/questions/tagged/android-studio"
        # Listing pages are selected through the query string, e.g.
        #   <starturl>?sort=newest&page=2&pagesize=15
        #   <starturl>?sort=newest&page=3&pagesize=15

    def page_params(self, page, pagesize=None):
        """Return the query parameters that select listing page *page*.

        Args:
            page: 1-based page number.
            pagesize: Questions per page; when ``None`` the parameter is
                omitted so the site's own default applies.

        Returns:
            A dict suitable for the ``params`` argument of ``requests.get``.
        """
        params = {"sort": "newest", "page": page}
        if pagesize is not None:
            params["pagesize"] = pagesize
        return params

    def start_requests(self, url, params=None, timeout=30):
        """Fetch *url* and return the raw response body as bytes.

        Args:
            url: Address to download.
            params: Optional query-string parameters.
            timeout: Seconds to wait before giving up on the server.

        Raises:
            requests.RequestException: On connection problems, timeouts, or
                an HTTP error status (so an error page is never parsed as if
                it were a listing).
        """
        r = requests.get(url, params=params, timeout=timeout)
        r.raise_for_status()
        return r.content

    def parse(self, text):
        """Yield ``{'title', 'url'}`` dicts for each question in a listing page.

        Args:
            text: HTML of a listing page (str or bytes).

        Yields:
            One dict per question summary that carries a heading link.
        """
        soup = BeautifulSoup(text, 'html.parser')
        # NOTE(review): the 'question-summary' selector depends on the site's
        # current markup — confirm it still matches before a long crawl.
        for div in soup.find_all('div', class_='question-summary'):
            link = div.h3.a if div.h3 is not None else None
            href = link.get('href') if link is not None else None
            if not href:
                # Skip malformed summaries instead of aborting the whole page.
                continue
            yield {
                'title': link.text,
                'url': self.baseurl + href
            }

    def _write(self, items, output):
        """Serialize *items* as indented UTF-8 JSON into the file *output*."""
        s = json.dumps(items, indent=4, ensure_ascii=False)
        with open(output, 'w', encoding='utf-8') as f:
            f.write(s)

    def start(self, pages=1, pagesize=None, delay=1.0,
              output='stackoverflow.json'):
        """Crawl listing pages 1..*pages* and save the results as JSON.

        With the defaults a single page is fetched, as before. Pass a larger
        *pages* to walk through the paginated listing; the crawl stops early
        at the first page that yields no questions.

        Args:
            pages: Maximum number of listing pages to fetch (>= 1).
            pagesize: Questions per page, or ``None`` for the site default.
            delay: Seconds to pause between consecutive page requests.
            output: Path of the JSON file to write.

        Raises:
            ValueError: If *pages* is smaller than 1.
            requests.RequestException: If a page cannot be downloaded; any
                questions collected before the failure are written first.
        """
        import time  # local import: only needed for the inter-page pause

        if pages < 1:
            raise ValueError("pages must be >= 1")

        items = []
        try:
            for page in range(1, pages + 1):
                if page > 1 and delay > 0:
                    # Courtesy pause between requests.
                    # NOTE(review): the site may throttle rapid requests —
                    # tune the delay if the crawl stops early.
                    time.sleep(delay)
                text = self.start_requests(
                    self.starturl, params=self.page_params(page, pagesize))
                found = list(self.parse(text))
                if not found:
                    # An empty page means the listing has been exhausted.
                    break
                items.extend(found)
        except requests.RequestException:
            # Keep whatever was gathered so a late failure does not discard
            # the earlier pages, then let the caller see the error.
            if items:
                self._write(items, output)
            raise
        self._write(items, output)
# Script entry: build the crawler and run it once. This executes whenever the
# module is loaded; start() writes its results to 'stackoverflow.json'.
stack = Stack()
stack.start()