Problem with web scraping using bs4 in Python

Time: 2020-05-29 15:10:46

Tags: web-scraping beautifulsoup

My program returns a different number every time. If I run each page separately, it gives the correct result. I want to collect all links that have 3 or more votes.

from bs4 import BeautifulSoup as bs
import requests
import pandas
pg = 1
url ="https://stackoverflow.com/search?page="+str(pg)+"&tab=Relevance&q=scrappy%20python"
src = requests.get(url).text
soup = bs(src,'html.parser')
pages = soup.findAll('a',{'class' : 's-pagination--item js-pagination-item'})
number_of_pages = len(pages)
print(number_of_pages)
qualified=[]


while pg<=number_of_pages:
    print("In Page :"+str(pg))
    url = "https://stackoverflow.com/search?page=" + str(pg) + "&tab=Relevance&q=scrappy%20python"
    src = requests.get(url).text
    soup = bs(src, 'html.parser')
    a_links = soup.findAll('a',{'class':'question-hyperlink'})
    span_links = soup.findAll('span',{'class':'vote-count-post'})
    hrefs = []
    for a_link in a_links:
        hrefs.append(a_link.get('href'))
    for link in range(len(span_links)):
        vote = span_links[link].strong.text
        n = int(vote)
        if n>2:
            the_link = 'https://stackoverflow.com' + hrefs[link]
            qualified.append(the_link)
            print(len(qualified))

    pg +=1

1 Answer:

Answer 0 (score: 0):

print(len(qualified)) shows the length of the whole cumulative list, and that is your mistake. You can get how many qualifying links each page has by adding i = 0 right after while pg <= number_of_pages:, i += 1 after if n > 2:, and print(i) before pg += 1. The code then looks like this:

from bs4 import BeautifulSoup as bs
import requests
import pandas
pg = 1
url = "https://stackoverflow.com/search?page=" + str(pg) + "&tab=Relevance&q=scrappy%20python"
src = requests.get(url).text
soup = bs(src, 'html.parser')
pages = soup.findAll('a', {'class': 's-pagination--item js-pagination-item'})
number_of_pages = len(pages)
print(number_of_pages)
qualified = []


while pg <= number_of_pages:
    i = 0  # per-page counter, reset at the start of every page
    print("In Page :" + str(pg))
    url = "https://stackoverflow.com/search?page=" + str(pg) + "&tab=Relevance&q=scrappy%20python"
    src = requests.get(url).text
    soup = bs(src, 'html.parser')
    a_links = soup.findAll('a', {'class': 'question-hyperlink'})
    span_links = soup.findAll('span', {'class': 'vote-count-post'})
    hrefs = []
    for a_link in a_links:
        hrefs.append(a_link.get('href'))
    for link in range(len(span_links)):
        vote = span_links[link].strong.text
        n = int(vote)
        if n > 2:
            i += 1  # count this qualifying link for the current page
            the_link = 'https://stackoverflow.com' + hrefs[link]
            qualified.append(the_link)
    print(i)  # number of links with 3+ votes found on this page

    pg += 1
#print(qualified)
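
As a side note that goes beyond the original answer, a more defensive way to keep each link paired with its vote count is to zip() the two element lists instead of indexing them in parallel. The sketch below assumes the same search-page markup used above (question-hyperlink anchors and vote-count-post spans returned in the same order); the helper name count_page is made up for illustration.

from bs4 import BeautifulSoup as bs
import requests

def count_page(pg, min_votes=3):
    # fetch one search results page and return the qualifying links on it
    url = "https://stackoverflow.com/search?page=" + str(pg) + "&tab=Relevance&q=scrappy%20python"
    soup = bs(requests.get(url).text, 'html.parser')
    a_links = soup.findAll('a', {'class': 'question-hyperlink'})
    span_links = soup.findAll('span', {'class': 'vote-count-post'})
    links = []
    # zip() keeps each anchor paired with its own vote count
    for a_link, span in zip(a_links, span_links):
        if int(span.strong.text) >= min_votes:
            links.append('https://stackoverflow.com' + a_link.get('href'))
    return links

print(len(count_page(1))) then shows the per-page count directly, and the overall qualified list is just the results of count_page(pg) for every page combined.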