Slow website scraping in Python

Date: 2018-12-03 06:03:38

Tags: python-3.x selenium parsing web-scraping beautifulsoup

I have implemented a news-site scraper that uses the Selenium web driver to access dynamic pages and BeautifulSoup to retrieve their content. While parsing a site I also write the scraped data to MongoDB and download the images. I want to implement a full news search by a given category or by text that appears in the news content. What would you suggest for parallelizing this or adding asynchronous code to improve performance?

# -*- coding: utf-8 -*-

import os
import json
import requests
from bs4 import BeautifulSoup
from mongo_setup import Database
import gridfs
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
import time
import logging
import re
import pymongo


PROJECT_ROOT = os.path.abspath(os.path.dirname(__file__))
DRIVER_BIN = os.path.join(PROJECT_ROOT, "bin/chromedriver")


class Scraper:

    tsn_resource = 'https://tsn.ua/'
    ukrnet_resource = 'https://www.ukr.net/'

    db_name = 'scraper_db'
    category_coll = 'categories'
    articles_coll = 'articles'


    def __init__(self, limit=10):
        self.limit = limit  # max number of articles per category
        self.db = Database(self.db_name).connect_db()
        self.category_coll = self.init_collection(self.category_coll)
        self.articles_coll = self.init_collection(self.articles_coll)
        self.logger = self.init_logger()
        self.driver = webdriver.Chrome(executable_path=DRIVER_BIN)
        self.image_storage = os.path.join(PROJECT_ROOT, "image_storage/")


    def init_logger(self):
        '''
        Initialize log file.
        '''
        logger = logging.getLogger('scraper_app')
        logger.setLevel(logging.INFO)

        # create a file handler
        handler = logging.FileHandler('scraper_logfile.log')
        handler.setLevel(logging.INFO)

        # create a logging format
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        handler.setFormatter(formatter)

        # add the handlers to the logger
        logger.addHandler(handler)
        return logger


    def init_collection(self, name):
        if name in self.db.collection_names():
            self.db[name].drop()
        return self.db[name]


    def insert_one_to_collection(self, data, collection):
        try:
            collection.insert_one(data)
        except pymongo.errors.DuplicateKeyError:
            pass

    def insert_many_to_collection(self, data, collection):
        try:
            collection.insert_many(data)
        except pymongo.errors.DuplicateKeyError:
            pass


    def download_image(self, image_url):
        '''
        download images from news articles
        to local storage
        '''
        if not image_url.startswith(("data:image", "javascript")):
            local_filename = image_url.split('/')[-1].split("?")[0]

            r = requests.get(image_url, stream=True, verify=False)
            with open(self.image_storage + local_filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    f.write(chunk)


    def upload_image_to_mongo(self, image_url):
        response = requests.get(image_url, stream=True)
        fs = gridfs.GridFS(self.db)
        img = response.raw.read()
        local_filename = image_url.split('/')[-1].split("?")[0]  # filename taken from the URL
        fs.put(img, filename=local_filename)


    def get_page_content(self, url):
        try:
            self.driver.get(url)
        except WebDriverException:
            # the driver can die mid-run; restart it and retry the request
            self.driver = webdriver.Chrome(executable_path=DRIVER_BIN)
            self.driver.get(url)
        page = self.driver.page_source
        return page


    def parse_page_content(self, url, parser_lib):
        page_obj = self.get_page_content(url)
        soup = BeautifulSoup(page_obj, parser_lib)
        return soup


    def tsn_categories(self):
        categories = self.gather_categories(self.tsn_resource, 'ul.c-app-nav-more-list li a')
        return categories

    def ukrnet_categories(self):
        categories = self.gather_categories(self.ukrnet_resource, 'h2.feed__section--title a')
        return categories


    def gather_categories(self, url, selector):
        categories = []
        soup = self.parse_page_content(url, "html.parser")
        all_categories = soup.select(selector)

        for item in all_categories:
            category = {}
            link = str(item.attrs.get('href'))
            if link.startswith('javascript'):
                continue
            if not link.startswith('https:'):
                link = 'https:' + link
            category['link'] = link
            category['name'] = item.get_text().strip()
            categories.append(category)

        self.insert_many_to_collection(categories, self.category_coll)
        return categories


    def search_by_category(self, category_name):
        if isinstance(category_name, bytes):
            category_name = category_name.decode('utf-8')
        category_list = []
        category_list += self.tsn_categories()
        category_list += self.ukrnet_categories()
        category_obj = next((item for item in category_list if item['name'] == category_name), None)
        if category_obj is None:
            self.logger.info('category "%s" not found', category_name)
            return []
        link = category_obj['link']
        if 'ukr.net' in link:
            articles = self.get_ukrnet_articles(category_name, link)
        else:
            articles = self.get_tsn_articles(category_name, link)
        return articles


    def get_ukrnet_articles(self, category_name, url):
        '''
        retrieve all articles from ukr.net by given category link
        '''
        count = 0
        result = []
        soup = self.parse_page_content(url, "html.parser")
        all_articles = soup.select('div.im-tl a')
        for item in all_articles:
            if count < self.limit:
                article = {}
                link = item.attrs.get('href')
                article['link'] = link
                article['category'] = category_name
                article['content'] = item.contents[0].encode('utf-8')
                result.append(article)
                self.insert_one_to_collection(article, self.articles_coll)
            else:
                break
            count += 1

        return result


    def get_tsn_articles(self, category_name, url):
        '''
        retrieve all articles from tsn.ua by given category link
        '''
        count = 0
        result = []

        data = []  # temporary storage

        # first parse through the list of articles
        soup = self.parse_page_content(url, "html.parser")
        all_articles = soup.select('div.c-entry-embed a.c-post-img-wrap')
        for item in all_articles:

            # iterate limit amount of articles
            if count < self.limit:
                article = {}
                link = item.attrs.get('href')
                img_src = item.find('img').get('src')
                if link.endswith(".html"):
                    article['link'] = link
                    if img_src is not None:
                        article['img_src'] = img_src
                        self.download_image(img_src)

                    article['category'] = category_name
                    data.append(article)
                count += 1
            else:
                break

        # then iterate over each article
        for article in data:
            new_soup = self.parse_page_content(article['link'], "html5lib")
            news_content = new_soup.select('div.e-content p')

            text_content = [] # article content
            for chunk in news_content:
                text_content.append(chunk.get_text().strip())
            article_text = ' '.join(text_content)

            news_header = new_soup.select('div.c-post-meta h1') # article title
            header_text = ''  # fall back to an empty title if the selector finds nothing
            if news_header:
                header_text = "".join(news_header[0].contents)

            article_image = new_soup.find('figure', class_='js-lightgallery')
            if article_image:
                img_src = article_image.find('img').get('src') # articles image
                self.download_image(img_src)

            news_chunk = {}
            news_chunk['category'] = article['category']
            news_chunk['link'] = article['link']
            news_chunk['title'] = header_text
            # news_chunk['title'] = ''
            news_chunk['content'] = article_text
            news_chunk['images'] = []
            if 'img_src' in article:
                news_chunk['images'].append(article['img_src']) # caption image
            if article_image:
                news_chunk['images'].append(img_src) # article image

            result.append(news_chunk)
            self.insert_one_to_collection(news_chunk, self.articles_coll)

        return result

    def search_by_text(self, text):
        category_links = []
        category_links += self.ukrnet_categories()
        category_links += self.tsn_categories()
        result = self.website_search_by_text(text, category_links)
        return result


    def website_search_by_text(self, text_searched, category_links):
        result = []

        if isinstance(text_searched, bytes):
            text_searched = text_searched.decode('utf-8')
        for link in category_links:
            soup = self.parse_page_content(link['link'], "html.parser")
            all_articles = soup.find_all('a', text=re.compile(text_searched))
            for item in all_articles:
                # build a fresh dict per match; reusing one dict would make
                # every result entry reference the same object
                article = {}
                article['link'] = item.attrs.get('href')
                article['category'] = link['name']
                article['content'] = (item.contents[0].strip()).encode('utf-8')
                self.insert_one_to_collection(article, self.articles_coll)
                result.append(article)
        return result


    def collect_ukrnet_articles(self):
        '''
        outdated
        '''
        categories = self.ukrnet_categories()

        for category in categories:
            count = 0
            soup = self.parse_page_content(category['link'], "html.parser")

            all_articles = soup.select('div.im-tl a')
            for item in all_articles:
                # only 10 first articles
                if count < self.limit:
                    article = {}
                    link = item.attrs.get('href')
                    article['link'] = link
                    article['category'] = category['name']
                    article['content'] = item.contents[0].encode('utf-8')
                    self.insert_one_to_collection(article, self.articles_coll)
                else:
                    break
                count += 1


    def run(self):
        self.search_by_category('Economics')
        self.search_by_text('Economics')
        self.driver.quit()


if __name__ == '__main__':
    scraper = Scraper()
    scraper.run()

1 answer:

Answer 0 (score: 1):

scrapy is a solid Python framework that takes care of the asynchronous/parallel work for you automatically.
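For illustration, here is a minimal sketch of the TSN part of the scraper as a Scrapy spider. The spider name and file layout are made up, and the CSS selectors are simply the ones from the question, so treat it as a starting point rather than a drop-in replacement; the point is that Scrapy schedules the category and article requests concurrently without any extra code.

import scrapy


class TsnSpider(scrapy.Spider):
    name = "tsn"
    start_urls = ["https://tsn.ua/"]

    def parse(self, response):
        # follow every category link from the navigation menu
        for href in response.css("ul.c-app-nav-more-list li a::attr(href)").getall():
            yield response.follow(href, callback=self.parse_category)

    def parse_category(self, response):
        # follow every article teaser on the category page
        for href in response.css("div.c-entry-embed a.c-post-img-wrap::attr(href)").getall():
            if href.endswith(".html"):
                yield response.follow(href, callback=self.parse_article)

    def parse_article(self, response):
        # one item per article; an item pipeline could insert it into MongoDB
        yield {
            "link": response.url,
            "title": " ".join(response.css("div.c-post-meta h1::text").getall()).strip(),
            "content": " ".join(response.css("div.e-content p::text").getall()),
        }

Running it with scrapy runspider tsn_spider.py -o articles.json dumps the items to JSON; a small item pipeline (or the existing Database class) could push them into MongoDB instead.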

There is also multiprocessing, conveniently bundled into a single package.

Then there is multithreading, also conveniently bundled into a single package.

With the multithreading library, there is a way to call the function you are trying to thread with map() and pass it the list/variable to work on: map(your_func, your_list). I do not remember the exact link or structure, but it is a quick Google search away. It really is easy.
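As a sketch of that map() idiom applied to this scraper: multiprocessing.dummy exposes a thread pool behind the same Pool API (swap the import for multiprocessing.Pool to get processes instead). fetch_article and the example URLs below are invented for illustration, and plain requests stands in for Selenium here because a single webdriver instance cannot safely be shared between threads.

from multiprocessing.dummy import Pool  # thread pool behind the multiprocessing API

import requests
from bs4 import BeautifulSoup


def fetch_article(url):
    # hypothetical helper mirroring the per-article parsing in get_tsn_articles();
    # scraping is I/O-bound, so threads give a good speedup here
    html = requests.get(url, timeout=10).text
    soup = BeautifulSoup(html, "html.parser")
    paragraphs = soup.select("div.e-content p")
    return {
        "link": url,
        "content": " ".join(p.get_text().strip() for p in paragraphs),
    }


article_links = [
    "https://tsn.ua/example-article-1.html",  # placeholder URLs
    "https://tsn.ua/example-article-2.html",
]

with Pool(8) as pool:  # 8 worker threads
    articles = pool.map(fetch_article, article_links)

Pages that genuinely need JavaScript rendering would still have to go through Selenium, with each worker using its own driver instance.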