Python web scraping: how to loop through all pages / the next page

Date: 2019-05-07 09:41:34

Tags: python html web-scraping

Can someone help me loop through to the next page? I have tried all the solutions posted here, but none of them seem to work.

import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'  # to suppress "false positive" warnings
import datetime as dt
import requests
from bs4 import BeautifulSoup
import time
import sys  # needed for sys.exit() below



def getPage(url):
    attempt = 1
    while True:
        response = requests.get(url)
        if response.status_code == requests.codes.ok:
            return response.content
        else:
            time.sleep(0.5)
            attempt += 1
            if attempt > 3:
                print("Data could not be requested for url:  ", url, "  after  ", attempt, "  attempts")
                return None



if __name__ == '__main__':


    url = "https://www.opic.com/upphandlingar/"

    data_df = pd.DataFrame()  # all data from the websites is saved to this data frame
    # get data
    try:
        markup = getPage(url).decode('utf-8')
    except:
        markup = getPage(url)


    if markup is None:
        print("Nothing was found. Value of 'markup' is 'None'.")
        sys.exit()

    soup = BeautifulSoup(markup, 'lxml')

    containers = soup.findAll("a", {"class": "ListItem"})
    for container in containers:
        upplagtdatum = container.div.p.text.strip()
        titel = container.h3.text.strip()
        stad_kommun = container.span.text.strip()



        # ----------------------------------------------------------------------------------------------------------
        # Save data to data frame
        df = pd.DataFrame(data={'Upplagtdatum': [upplagtdatum], 'Titel': [titel], 'Stad Kommun': [stad_kommun]})
        data_df = pd.concat([data_df, df], sort=False)


    #   SAVE DATA

    # Save data frame to csv-file

    filePathName = "data_" + dt.datetime.now().strftime('%Y-%m-%d') + ".csv"
    data_df.to_csv(filePathName, sep=';', index=False, encoding='utf-8')


    print(data_df)

2 Answers:

Answer 0 (score: 1)

I have made a few changes to your code. Pagination can be handled with this structure.

import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'  # to suppress "false positive" warnings
import datetime as dt
import requests
from bs4 import BeautifulSoup
import time
import sys

def getPage(url):
    attempt = 1
    while True:
        response = requests.get(url)
        if response.status_code == requests.codes.ok:
            return response.content
        else:
            time.sleep(0.5)
            attempt += 1
            if attempt > 3:
                print("Data could not be requested for url:  ", url, "  after  ", attempt, "  attempts")
                return None

def getData(markup):
    data_df = pd.DataFrame()  # all data from the websites is saved to this data frame
    soup = BeautifulSoup(markup, 'lxml')
    containers = soup.findAll("a", {"class": "ListItem"})
    for container in containers:
        upplagtdatum = container.div.p.text.strip()
        titel = container.h3.text.strip()
        stad_kommun = container.span.text.strip()

        # ----------------------------------------------------------------------------------------------------------
        # Save data to data frame
        df = pd.DataFrame(data={'Upplagtdatum': [upplagtdatum], 'Titel': [titel], 'Stad Kommun': [stad_kommun]})
        data_df = pd.concat([data_df, df], sort=False)

    #   SAVE DATA
    # Save data frame to csv-file
    filePathName = "data_" + dt.datetime.now().strftime('%Y-%m-%d') + ".csv"
    data_df.to_csv(filePathName, sep=';', index=False, encoding='utf-8')
    print(data_df)

if __name__ == '__main__':
    results = 2871
    per_page = 20
    url = "https://www.opic.com/upphandlingar/?p={}"
    no_of_pages = int(results/per_page)
    for page_no in range(1,no_of_pages + 1):
        try:
            markup = getPage(url.format(page_no)).decode('utf-8')
        except:
            markup = getPage(url.format(page_no))

        if markup is None:
            print("Nothing was found. Value of 'markup' is 'None'.")
            sys.exit()
        else:
            getData(markup)

Explanation

  • Every page has the same template / page structure, so you need a function that extracts the content you want from it.
  • Pagination: if you look at the URL, the next page is formed by adding a p= query parameter, as done here.
  • How many pages? That depends on the total number of results and how many results are shown per page. Once you work that out, you just loop over that range.

Take a look at the code and update it if needed; the total result count is hard-coded, and one way to avoid that is sketched below.
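
A minimal sketch of deriving the page range at run time instead of hard-coding results = 2871, assuming the site keeps the ?p= parameter and the "ListItem" markup (the helper name iter_list_items and the max_pages safety cap are illustrative, not part of the original answer):

    import requests
    from bs4 import BeautifulSoup

    def iter_list_items(base_url="https://www.opic.com/upphandlingar/?p={}", max_pages=500):
        # Yield every ListItem anchor, page by page, until a page comes back empty.
        # max_pages is only a safety cap so the loop cannot run forever.
        for page_no in range(1, max_pages + 1):
            response = requests.get(base_url.format(page_no))
            if response.status_code != requests.codes.ok:
                break
            soup = BeautifulSoup(response.content, 'lxml')
            containers = soup.findAll("a", {"class": "ListItem"})
            if not containers:  # ran past the last page
                break
            for container in containers:
                yield container

    # usage: feed the anchors into the same parsing loop as above
    # for container in iter_list_items():
    #     titel = container.h3.text.strip()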

Answer 1 (score: 0)

Looking at the website and the code you provided, I assume you want to extract the href attribute from every ListItem (container) you get. You can grab the href simply, like this (assuming you have BeautifulSoup 4):

    for container in containers:
        upplagtdatum = container.div.p.text.strip()
        titel = container.h3.text.strip()
        stad_kommun = container.span.text.strip()
        href = container.get('href')

You can then either use that href immediately, or save it in the DataFrame and loop over it later; a sketch of the latter follows.
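
A minimal sketch of the second option, assuming the href values are paths relative to opic.com (the 'Href' column name and the urljoin base are illustrative, not from the original answer):

    from urllib.parse import urljoin
    import pandas as pd

    # 'containers' is assumed to hold the ListItem anchors from the loop above
    rows = []
    for container in containers:
        rows.append({
            'Titel': container.h3.text.strip(),
            'Href': container.get('href'),  # keep the link so it can be visited later
        })
    data_df = pd.DataFrame(rows)

    # later: visit each saved link
    for href in data_df['Href']:
        detail_url = urljoin("https://www.opic.com/", href)
        # detail_markup = getPage(detail_url)  # reuse the helper from the question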