Selenium: scraping based on the number of pages in each category of a website

Date: 2020-09-28 16:09:25

Tags: python selenium

I have been scraping the following website: http://www.legorafi.fr/ It works for every category (politique, etc.), but for each category I loop through the same number of pages.

I would like to scrape all the pages of each category, based on how many pages each category actually has on this site.

This is how I currently loop through the pages:

import time
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains

import newspaper
import requests
from newspaper.utils import BeautifulSoup
from newspaper import Article

categories = ['france/politique','france/societe', 'monde-libre', 'france/economie/', 'culture', 'people', 'sports', 'hi-tech', 'sciences']
papers = []
urls_set = set()


driver = webdriver.Chrome(executable_path="/Users/name/Downloads/chromedriver 4")
#driver.get('http://www.legorafi.fr/')


for category in categories:
    url = 'http://www.legorafi.fr/category/' + category
    #WebDriverWait(self.driver, 10)
    driver.get(url)
    time.sleep(2)

# The page loop below runs only once, against the last category loaded above,
# and always assumes the same fixed number of pages per category.
pagesToGet = 120

title = []
content = []
for page in range(1, pagesToGet+1):
    print('Processing page :', page)
    #url = 'http://www.legorafi.fr/category/france/politique/page/'+str(page)
    print(driver.current_url)
    #print(url)
    
    raw_html = requests.get(driver.current_url)  # fetch the page the browser is currently on
    soup = BeautifulSoup(raw_html.text, 'html.parser')
    for articles_tags in soup.findAll('div', {'class': 'articles'}):
        for article_href in articles_tags.find_all('a', href=True):
            if not str(article_href['href']).endswith('#commentaires'):
                urls_set.add(article_href['href'])
                papers.append(article_href['href'])

                
    for url in papers:
        article = Article(url)
        article.download()
        article.parse()
        if article.title not in title:
            title.append(article.title)
        if article.text not in content:
            content.append(article.text)
        print(article.title,article.text)

    time.sleep(3)
    driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
    driver.find_element_by_xpath("//a[contains(text(),'Suivant')]").click()

I would like to loop through all of these categories, scraping as many pages as each category actually has.

categories = ['france/politique','france/societe', 'monde-libre', 'france/economie/', 'culture', 'people', 'sports', 'hi-tech', 'sciences']
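
One way to replace the fixed pagesToGet = 120 is to probe how many pages each category actually has before scraping it, reusing the /category/<name>/page/<n> URL pattern that appears commented out in the code above. The helper below is only a minimal sketch: the name count_pages is mine, and it assumes that requesting a page number past the last one returns a non-200 status, which should be verified against the site.

import requests

def count_pages(category, max_pages=200):
    # Walk /category/<name>/page/<n> upward until the site stops serving pages.
    # Assumption: a page number past the end answers with a non-200 status code.
    last_page = 1
    for page in range(2, max_pages + 1):
        response = requests.get('http://www.legorafi.fr/category/' + category + '/page/' + str(page))
        if response.status_code != 200:
            break
        last_page = page
    return last_page

# For example:
# pages_per_category = {category: count_pages(category) for category in categories}

With that dictionary in hand, pagesToGet could be set per category inside the loop instead of being hard-coded to 120.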

How can I do this?

1 answer:

Answer 0 (score: 1)

The code below is able to loop through all of the categories and extract the data. The code definitely needs more testing and some enhanced error handling.

P.S. Good luck with this coding project.

import requests

import time
from random import randint
from datetime import datetime

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException

from newspaper.utils import BeautifulSoup
from newspaper import Article

chrome_options = Options()
chrome_options.add_argument("--test-type")
chrome_options.add_argument('--ignore-certificate-errors')
chrome_options.add_argument('--disable-extensions')
chrome_options.add_argument('disable-infobars')
chrome_options.add_argument("--incognito")
# chrome_options.add_argument('--headless')

# window size as an argument is required in headless mode
# chrome_options.add_argument('window-size=1920x1080')
driver = webdriver.Chrome('/usr/local/bin/chromedriver', options=chrome_options)

papers = []
urls_set = set()


def get_articles(link):
    # link is the category URL; the driver has already navigated to it
    while True:
        try:
            next_link = driver.find_element_by_link_text("Suivant")
            if next_link:
                # parse the page the browser is currently on,
                # not just the first page of the category
                raw_html = requests.get(driver.current_url)
                soup = BeautifulSoup(raw_html.text, 'html.parser')
                for articles_tags in soup.findAll('div', {'class': 'articles'}):
                    for article_href in articles_tags.find_all('a', href=True):
                        if not str(article_href['href']).endswith('#commentaires'):
                            article = Article(article_href['href'])
                            article.download()
                            article.parse()
                            if article.url is not None:
                                article_url = article_href['href']
                                title = article.title
                                publish_date = datetime.strptime(str(article.publish_date),
                                                                 '%Y-%m-%d %H:%M:%S').strftime('%Y-%m-%d')
                                text_of_article = article.text.replace('\n', '')

                driver.execute_script("arguments[0].scrollIntoView(true);", next_link)
                next_link.click()

                # Random wait to prevent the harvesting operation from
                # resuming before the next page has completely loaded
                time.sleep(randint(2, 4))

        except NoSuchElementException:
            # no "Suivant" link: this was the last page of the category
            return



legorafi_urls = {'monde-libre': 'http://www.legorafi.fr/category/monde-libre',
                 'politique': 'http://www.legorafi.fr/category/france/politique',
                 'societe': 'http://www.legorafi.fr/category/france/societe',
                 'economie': 'http://www.legorafi.fr/category/france/economie',
                 'culture': 'http://www.legorafi.fr/category/culture',
                 'people': 'http://www.legorafi.fr/category/people',
                 'sports': 'http://www.legorafi.fr/category/sports',
                 'hi-tech': 'http://www.legorafi.fr/category/hi-tech',
                 'sciences': 'http://www.legorafi.fr/category/sciences',
                 'ledito': 'http://www.legorafi.fr/category/ledito/'
                 }


for category, url in legorafi_urls.items():
    if url:
        driver.get(url)
        driver.implicitly_wait(30)
        get_articles(url)
    else:
        driver.quit()
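
The extracted fields (article_url, title, publish_date, text_of_article) are computed inside get_articles but never stored anywhere. Below is a minimal sketch of one way to persist them, assuming a plain CSV file is acceptable; the rows list, the save_rows name and the articles.csv filename are illustrative additions, not part of the original answer.

import csv

rows = []  # append (article_url, title, publish_date, text_of_article) tuples inside get_articles

def save_rows(rows, path='articles.csv'):
    # Write the harvested fields to a CSV file with a header row.
    with open(path, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['url', 'title', 'publish_date', 'text'])
        writer.writerows(rows)

Appending to rows in the innermost loop of get_articles and calling save_rows(rows) after the category loop finishes keeps the scraping logic itself unchanged.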