Question

我是 Python 和 BeautifulSoup 的新手，我想抓取多个页面并把结果保存到 CSV 文件中，但运行代码后，CSV 中只保存了 3 个链接中最后一个链接的数据。

如何解决我的问题？

## importing bs4, requests, fake_useragent and csv modules
from bs4 import BeautifulSoup
import requests
from fake_useragent import UserAgent
import csv

## create an array with URLs
## (the three URLs differ only in the value of the `finess` query parameter)
urls = [
'https://www.scansante.fr/applications/casemix_ghm_cmd/submit?snatnav=&typrgp=etab&annee=2019&type=ghm&base=0&typreg=noreg2016&noreg=99&finess=750300360&editable_length=10',
'https://www.scansante.fr/applications/casemix_ghm_cmd/submit?snatnav=&typrgp=etab&annee=2019&type=ghm&base=0&typreg=noreg2016&noreg=99&finess=030780118&editable_length=10',
'https://www.scansante.fr/applications/casemix_ghm_cmd/submit?snatnav=&typrgp=etab&annee=2019&type=ghm&base=0&typreg=noreg2016&noreg=99&finess=620103432&editable_length=10'
]

## initializing the UserAgent object
user_agent = UserAgent()

## starting the loop
for url in urls:
    ## getting the response from the page using get method of requests module
    page = requests.get(url, headers={"user-agent": user_agent.chrome})

    ## storing the content of the page in a variable
    html = page.content

    ## creating BeautifulSoup object
    soup = BeautifulSoup(html, "html.parser")
    ## take the first <table class="table"> on the page and collect its <tr> rows
    table = soup.findAll("table", {"class":"table"})[0]
    ## NOTE: `rows` is reassigned on every iteration, so once the loop ends it
    ## holds only the rows of the last URL
    rows = table.findAll("tr")

## this block is outside the loop and runs once after it has finished, so only
## the rows of the last page are written to test.csv
with open("test.csv", "wt+", newline="") as f:
    writer = csv.writer(f)
    for row in rows:
        ## one CSV line per table row, built from the text of its td/th cells
        csv_row = []
        for cell in row.findAll(["td", "th"]):
            csv_row.append(cell.get_text())
        writer.writerow(csv_row)

非常感谢！

Answer 1

为简化行的读取过程，您还可以试试使用 pandas：

import csv
from io import StringIO

import requests
from bs4 import BeautifulSoup
import pandas as pd


# Pages to scrape; the URLs differ only in the value of the `finess` parameter.
urls = [
'https://www.scansante.fr/applications/casemix_ghm_cmd/submit?snatnav=&typrgp=etab&annee=2019&type=ghm&base=0&typreg=noreg2016&noreg=99&finess=750300360&editable_length=10',
'https://www.scansante.fr/applications/casemix_ghm_cmd/submit?snatnav=&typrgp=etab&annee=2019&type=ghm&base=0&typreg=noreg2016&noreg=99&finess=030780118&editable_length=10',
'https://www.scansante.fr/applications/casemix_ghm_cmd/submit?snatnav=&typrgp=etab&annee=2019&type=ghm&base=0&typreg=noreg2016&noreg=99&finess=620103432&editable_length=10'
]

headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:77.0) Gecko/20100101 Firefox/77.0'}

# One DataFrame per page is collected here and concatenated at the end.
all_data = []
for url in urls:
    # A timeout keeps the script from hanging forever on an unresponsive server.
    page = requests.get(url, headers=headers, timeout=30)
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    page.raise_for_status()

    soup = BeautifulSoup(page.content, "html.parser")
    # Only the first <table class="table"> of the page is used.
    table = soup.find_all("table", {"class": "table"})[0]

    # read_html returns a list of DataFrames; the markup holds a single table.
    # Literal HTML strings are deprecated as input since pandas 2.1, so the
    # markup is passed as a file-like object.
    df_table = pd.read_html(StringIO(str(table)))[0]

    # add a column holding the stripped text of the page's first <span class="c">
    df_table['hit'] = soup.find("span", {"class": "c"}).text.strip()

    # store the table in a list of tables
    all_data.append(df_table)

# concat the tables and export them to csv (without the DataFrame index)
pd.concat(all_data).to_csv('test.csv', index=False)

Answer 2

在您的代码中，您没有将rows变量存储到任何地方，因此您只将最后一个URL中的值写入CSV文件。此示例将从所有三个URL中写入值：

import csv
import requests
from bs4 import BeautifulSoup


# Pages to scrape; the URLs differ only in the value of the `finess` parameter.
urls = [
'https://www.scansante.fr/applications/casemix_ghm_cmd/submit?snatnav=&typrgp=etab&annee=2019&type=ghm&base=0&typreg=noreg2016&noreg=99&finess=750300360&editable_length=10',
'https://www.scansante.fr/applications/casemix_ghm_cmd/submit?snatnav=&typrgp=etab&annee=2019&type=ghm&base=0&typreg=noreg2016&noreg=99&finess=030780118&editable_length=10',
'https://www.scansante.fr/applications/casemix_ghm_cmd/submit?snatnav=&typrgp=etab&annee=2019&type=ghm&base=0&typreg=noreg2016&noreg=99&finess=620103432&editable_length=10'
]

headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:77.0) Gecko/20100101 Firefox/77.0'}

# Rows from every page accumulate here, in fetch order, so nothing is lost
# when the next page is processed.
all_data = []
for url in urls:
    response = requests.get(url, headers=headers)
    document = BeautifulSoup(response.content, "html.parser")

    # Only the first <table class="table"> of the page is used.
    first_table = document.findAll("table", {"class":"table"})[0]

    for tr in first_table.findAll('tr'):
        cells = tr.findAll(["td", "th"])
        # Whitespace-stripped cell text, nested text fragments joined by a space.
        tds = [cell.get_text(strip=True, separator=' ') for cell in cells]
        all_data.append(tds)
        # Echo the row to stdout as it is collected.
        print(*tds)

# Dump everything that was collected into test.csv, one CSV line per table row.
with open("test.csv", "wt+", newline="") as f:
    csv.writer(f).writerows(all_data)

这会把所有三个 URL 的数据写入 test.csv（LibreOffice 中的截图）：

BeautifulSoup-在多页上抓取HTML表

2 个答案: