Question

我正在尝试抓取这个网站：voxnews.info

import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
import pandas as pd

web = "https://voxnews.info"


def main(req, num, web):
    """Fetch listing page ``num`` of ``web`` through ``req`` and extract rows.

    Returns a list with one ``(time text, h1 link text, "span.cat-links"
    text, first <p> text)`` tuple for every element matching the CSS
    selector ``div.site-content``.
    """
    page_url = web + f"/page/{num}/"
    response = req.get(page_url)
    parsed = BeautifulSoup(response.content, "html.parser")

    goal = []
    for block in parsed.select("div.site-content"):
        published = block.time.text
        title = block.h1.a.get_text(strip=True)
        category = block.select_one("span.cat-links").get_text(strip=True)
        summary = block.p.get_text(strip=True)
        goal.append((published, title, category, summary))
    return goal


# Fan the page downloads out over a thread pool, sharing one Session.
with ThreadPoolExecutor(max_workers=30) as executor:
    with requests.Session() as req:
        # main() takes (req, num, web); leaving `web` out makes every future
        # raise TypeError when its result is collected below.
        # NOTE: range(1, 2) yields only page 1.
        fs = [executor.submit(main, req, num, web) for num in range(1, 2)] # need to scrape all the webpages in the website
        allin = []
        for f in fs:
            # result() blocks until the page is done and re-raises any worker error.
            allin.extend(f.result())
        df = pd.DataFrame.from_records(
            allin, columns=["Date", "Title", "Category", "Content"])
        print(df)

但是代码有两个问题：

第一个是我没有抓取所有页面（我目前将 1 和 2 放在范围内，但我需要所有页面）；
第二个是日期没有被正确保存。

如果可以看看代码并告诉我如何改进它以解决这两个问题，那就太棒了。

Answer 1

一些细微的变化。

首先，没有必要对单个请求使用 requests.Session() - 您不会试图在请求之间保存数据。

对 with 语句的处理方式略有改动：我不确定这样是否更正确，或者只是我的个人习惯，但您并不需要让所有代码都在执行器仍然打开的情况下运行。

我为您提供了两种解析日期的选项：可以是网站上显示的形式（意大利语字符串），也可以是 datetime 对象。

我在文章中没有看到任何“p”标签，所以我删除了那部分。似乎为了获得文章的“内容”，您必须实际导航并单独抓取它们。我从代码中删除了该行。

在您的原始代码中，您并没有获得页面上的每篇文章，而只是每个页面上的第一篇。每个页面只有一个“div.site-content”标签，但有多个“article”标签。这就是变化。

最后，我更喜欢 find 而不是 select，但这只是我的风格选择。这在前三页对我有用，我没有尝试整个网站。运行此程序时要小心，78 批、每批 30 个请求，可能会导致您被网站屏蔽……

import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
import datetime


def main(num, web, as_datetime=True):
    """Scrape one listing page and return a row per article.

    Args:
        num: Page number substituted into the ``/page/<num>/`` URL.
        web: Base URL of the site, without a trailing slash.
        as_datetime: When true (the default), parse the ``datetime``
            attribute of each article's ``<time>`` tag into a
            ``datetime.datetime``. When false, return the tag's visible
            text instead (the date string as printed on the site, in
            Italian).

    Returns:
        A list of ``(date, title, category)`` tuples, one per ``<article>``
        inside ``div.site-content``; an empty list if that container is
        missing from the page.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
    """
    # A timeout keeps a stalled connection from blocking a worker forever.
    r = requests.get(web + "/page/{}/".format(num), timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.content, 'html.parser')
    html = soup.find("div", class_="site-content")
    if html is None:
        return []

    goal = []
    for x in html.find_all("article"):
        if as_datetime:
            date = datetime.datetime.strptime(
                x.time["datetime"], "%Y-%m-%dT%H:%M:%S%z")
        else:
            date = x.time.get_text()
        title = x.h1.a.get_text(strip=True)
        category = x.find("span", class_="cat-links").get_text(strip=True)
        goal.append((date, title, category))

    return goal


web = 'https://voxnews.info'

# Read the total page count from the pagination links on the front page.
r = requests.get(web, timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.text, "html.parser")
# NOTE(review): assumes the second "page-numbers" link holds the last page
# number — confirm against the site's current markup.
last_page = soup.find_all("a", class_="page-numbers")[1].get_text()
# Presumably the site prints thousands with a dot (e.g. "2.320"); drop it
# before converting to int.
last_int = int(last_page.replace(".", ""))

### BE CAREFUL HERE WITH TESTING, DON'T USE ALL 2,320 PAGES ###
with ThreadPoolExecutor(max_workers=30) as executor:
    # range() excludes its stop value, so add 1 to include the last page.
    fs = [executor.submit(main, num, web) for num in range(1, last_int + 1)]

# Collect in submission order so rows stay sorted by page number.
allin = []
for f in fs:
    allin.extend(f.result())
df = pd.DataFrame.from_records(
    allin, columns=["Date", "Title", "Category"])
print(df)

抓取所有页面

1 个答案: