我的程序每次都返回不同的数字。如果我单独运行每个页面,它将给出正确的结果。我想获得所有具有3个或更多投票的链接。
from bs4 import BeautifulSoup as bs
import requests
import pandas
# Collect the links of Stack Overflow search results that have 3 or more
# votes, walking the result pages one at a time.

# URL of one page of search results; only the page number varies.
SEARCH_URL = "https://stackoverflow.com/search?page={}&tab=Relevance&q=scrappy%20python"

# Fetch the first page once, only to inspect its pagination links.
pg = 1
url = SEARCH_URL.format(pg)
src = requests.get(url).text
soup = bs(src, 'html.parser')
pages = soup.findAll('a', {'class': 's-pagination--item js-pagination-item'})
# NOTE(review): this is the number of pagination anchors found on page 1 and
# is used as the number of pages to visit -- confirm that it really equals
# the number of result pages.
number_of_pages = len(pages)
print(number_of_pages)

# Absolute URLs of every qualifying question, accumulated across all pages.
qualified = []
while pg <= number_of_pages:
    print(f"In Page :{pg}")
    url = SEARCH_URL.format(pg)
    src = requests.get(url).text
    soup = bs(src, 'html.parser')
    a_links = soup.findAll('a', {'class': 'question-hyperlink'})
    span_links = soup.findAll('span', {'class': 'vote-count-post'})
    hrefs = [a_link.get('href') for a_link in a_links]
    # Pair each vote counter with the link at the same position. zip() stops
    # at the shorter list, so a page where the two lists differ in length
    # cannot raise IndexError.
    # NOTE(review): assumes both lists are in the same result order -- confirm.
    for span_link, href in zip(span_links, hrefs):
        n = int(span_link.strong.text)
        if n > 2:
            qualified.append('https://stackoverflow.com' + href)
    # This is the running total over all pages visited so far, not the number
    # of matches on the current page.
    print(len(qualified))
    pg += 1
答案 0 :(得分:0)
`print(len(qualified))` 显示的是整个列表的累计长度,这就是问题所在。
在 `while pg<=number_of_pages:` 之后添加 `i = 0`,在 `if n>2:` 之后添加 `i += 1`,然后在 `pg +=1` 之前或之后添加 `print(i)`,就可以得到每个页面中符合条件的链接数量。
修改后的代码如下所示(原回答中的输出示例已缺失):
from bs4 import BeautifulSoup as bs
import requests
import pandas
# Collect the links of Stack Overflow search results that have 3 or more
# votes, and report how many were found on each individual page.

# URL of one page of search results; only the page number varies.
SEARCH_URL = "https://stackoverflow.com/search?page={}&tab=Relevance&q=scrappy%20python"

# Fetch the first page once, only to inspect its pagination links.
pg = 1
url = SEARCH_URL.format(pg)
src = requests.get(url).text
soup = bs(src, 'html.parser')
pages = soup.findAll('a', {'class': 's-pagination--item js-pagination-item'})
# NOTE(review): this is the number of pagination anchors found on page 1 and
# is used as the number of pages to visit -- confirm that it really equals
# the number of result pages.
number_of_pages = len(pages)
print(number_of_pages)

# Absolute URLs of every qualifying question, accumulated across all pages.
qualified = []
while pg <= number_of_pages:
    # Per-page counter, reset for every page so the printed value is not
    # cumulative.
    i = 0
    print(f"In Page :{pg}")
    url = SEARCH_URL.format(pg)
    src = requests.get(url).text
    soup = bs(src, 'html.parser')
    a_links = soup.findAll('a', {'class': 'question-hyperlink'})
    span_links = soup.findAll('span', {'class': 'vote-count-post'})
    hrefs = [a_link.get('href') for a_link in a_links]
    # Pair each vote counter with the link at the same position. zip() stops
    # at the shorter list, so a page where the two lists differ in length
    # cannot raise IndexError.
    # NOTE(review): assumes both lists are in the same result order -- confirm.
    for span_link, href in zip(span_links, hrefs):
        n = int(span_link.strong.text)
        if n > 2:
            i += 1
            qualified.append('https://stackoverflow.com' + href)
    # Number of qualifying links found on this page only.
    print(i)
    pg += 1
# print(qualified)