How to rename a file using text from a specific HTML element when downloading files with urllib.request and BeautifulSoup

Time: 2019-07-10 08:26:25

Tags: html python-3.x beautifulsoup urllib

I have an algorithm that downloads PDF articles using urllib.request and BeautifulSoup (Python 3.6):

import requests as r
from bs4 import BeautifulSoup as soup
import os
import urllib.request

#make a list of all web pages' urls
webpages=[]
for i in range(9):
    root_url = 'xxx.com/articles/page'+ str(i)
    webpages.append(root_url)

#make a list of PDF links
pdf_links = []
for item in webpages:
    headers = {'User-Agent': 'Mozilla/5.0'}
    data = r.get(item, headers=headers)
    page_soup = soup(data.text, 'html.parser')
    links = [a.attrs['href'] for a in page_soup.find_all('a', href=True)]
    for link in links:
        link_string = str(link)
        if link_string.endswith('pdf'):
            pdf_links.append(link_string)

#download the files (after collecting links from every page)
for pdf_link in pdf_links:
    save_to = os.path.basename(pdf_link.strip())
    urllib.request.urlretrieve(pdf_link.strip(), save_to)

I need to rename each downloaded PDF article using the article title, which is stored in a particular div class:

<div class="article article title">
<h2>The Disaster of Deforestation</h2>
</div>

There is also a larger div that holds both the article title and the corresponding PDF link:

<div article-id="1741" class="online article_row_view">
<div class="article article title">
<h2>The Disaster of Deforestation</h2>
</div>
<span class="file-pdf"> <a href="xsdf.pdf" title="BowenA.pdf">PDF</a></span>
</div>

I don't know how to rename the files automatically, let alone do it using text from a specific HTML element. Any help would be greatly appreciated!
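
Roughly, what I am trying to achieve for each such block is something like this (only a sketch of the intent, assuming the markup above and the page_soup object from my code):

# sketch of the goal: inside each "article_row_view" div, pair the
# article title with its PDF link and save the file under the title
for row in page_soup.find_all('div', class_='article_row_view'):
    title = row.find('h2').get_text(strip=True)                # "The Disaster of Deforestation"
    pdf_href = row.find('span', class_='file-pdf').a['href']   # "xsdf.pdf"
    # the href may need urljoin with the page URL if it is relative,
    # and the title may still need sanitising for the filesystem
    urllib.request.urlretrieve(pdf_href, title + '.pdf')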

1 Answer:

Answer 0 (score: 1)

Here is a complete solution that walks through every page of the navigation and downloads all the PDF files for you:

import requests
from bs4 import BeautifulSoup
from pathlib import Path
from urllib.parse import urljoin

BASE_URL = 'https://cross-currents.berkeley.edu/archives'

def make_soup(url: str) -> BeautifulSoup:
    res = requests.get(url, headers={'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:69.0) Gecko/20100101 Firefox/69.0'})
    res.raise_for_status()
    html = res.text
    soup = BeautifulSoup(html, 'html.parser')
    return soup

def extract_articles(soup: BeautifulSoup) -> list:
    articles = []
    for result in soup.select('.node--view-mode-search-result'):
        author = result.select_one('.field--name-field-citation-authors').text.strip()
        date = result.select_one('.field--name-field-issue-date').text.strip()
        title = result.select_one('.field-name-node-title').text.strip()
        journal = result.find('em', recursive=False).text.strip()
        pdf_url = result.select_one('a[href*=".pdf"]')['href']
        articles.append({
            'author': author,
            'date': date,
            'title': title,
            'journal': journal,
            'pdf_url': pdf_url,
        })
    return articles

def make_safe_filename(text: str) -> str:
    """convert forbidden chars to underscores"""
    return ''.join(c if (c.isalnum() or c.isspace()) else '_' for c in text).strip('_ ')


def get_next_page_url(soup: BeautifulSoup) -> str:
    try:
        path = soup.select_one('.pager a[rel="next"]')['href']
        return urljoin(BASE_URL, path)
    except (TypeError, KeyError):
        return None

def download_file(url: str, filename: str) -> str:
    with requests.get(url, stream=True) as res, open(filename, 'wb') as f:
        res.raise_for_status()
        for chunk in res.iter_content(chunk_size=8192): 
            if chunk:
                f.write(chunk)
    return filename

def scrape_archive():
    save_dir = Path(r'd:\downloads')
    save_dir.mkdir(exist_ok=True, parents=True)

    url = 'https://cross-currents.berkeley.edu/archives?author=&title=&type=onlinearticle&issue=All&region=All&page=0'
    while True:
        soup = make_soup(url)
        articles = extract_articles(soup)

        for a in articles:
            pdf_url = a['pdf_url']
            filename = make_safe_filename(a['title'])
            save_path = str(save_dir / (filename + '.pdf'))
            print('Downloading:', a['title'])
            download_file(pdf_url, save_path)
            print('Finished')

        # go to next page if exists
        next_url = get_next_page_url(soup)
        if not next_url:
            break
        url = next_url
        print('Moving to next page', url)

scrape_archive()

Here I only used the title to build the PDF filename, but you can combine journal, date, author, etc. to produce more descriptive filenames, as sketched below.
Also remember to change save_dir to whatever directory suits you.
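
For example, a filename that mixes several of those fields could be built by swapping the filename line inside scrape_archive for something like this (just a sketch, reusing the dict keys returned by extract_articles above):

# hypothetical alternative filename: date + author + title
# (a['date'], a['author'], a['title'] are the keys built in extract_articles)
filename = make_safe_filename(f"{a['date']} {a['author']} {a['title']}")
save_path = str(save_dir / (filename + '.pdf'))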