The script should find the addresses of the subpages that contain the articles and collect the required data from them. The data should end up in a database, but I don't know how to make the script extract the content of every article from each page of the blog.
import requests
from bs4 import BeautifulSoup
from nltk.tokenize import RegexpTokenizer
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
url = 'https://xxx/'
r = requests.get(url)
# Extract HTML
html = r.text
# Create a BeautifulSoup object from the HTML
soup = BeautifulSoup(html, "html5lib")
# Get the text
text = soup.get_text()
# Create tokenizer
tokenizer = RegexpTokenizer(r'\w+')  # raw string avoids the invalid-escape warning
# Create tokens
tokens = tokenizer.tokenize(text)
# Initialize new list
words = []
# Loop through list
for word in tokens:
    words.append(word.lower())
# Get English stopwords (requires the NLTK stopwords corpus)
sw = nltk.corpus.stopwords.words('english')
# Initialize new list
words_ns = []
for word in words:
    if word not in sw:
        words_ns.append(word)
# plotting
freqdist1 = nltk.FreqDist(words_ns)
freqdist1.plot(25)
print(soup.get_text())
Answer (score: 2)
You can do it all with BeautifulSoup, as requested. The text-extraction code is courtesy of @nmgeek; that same question offers other methods to choose from. I guess you can then process the text with nltk. The approach is nice because you can play with which selectors you add to the list: pass a list of selectors to select, i.e. [item.text for item in soup.select('selector list goes here')].
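For illustration, here is a minimal sketch of that selector-list pattern; the HTML snippet and the class names in it are made up for the example, not taken from the blog:
from bs4 import BeautifulSoup

html = '<article><h1 class="post-title">Title</h1><div class="post-content"><p>Body text.</p></div></article>'
soup = BeautifulSoup(html, 'html.parser')

# grouped CSS selectors: the text of every matched element lands in one list
texts = [item.text for item in soup.select('.post-title, .post-content')]
print(texts)  # ['Title', 'Body text.']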
EDIT: The code below gets you all the links, but after a while the site seems to block you. Have a look at rotating IPs and these User-Agents for the loop over all_links.
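As a rough sketch of the User-Agent side of that idea (the UA strings below are illustrative placeholders, and all_links is the list assembled by the code further down):
import random
import time
import requests

# placeholder User-Agent strings; swap in real, current ones
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'Mozilla/5.0 (X11; Linux x86_64)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)',
]

all_links = []  # populated as in the code below

with requests.Session() as s:
    for link in all_links:
        # pick a different User-Agent per request
        r = s.get(link, headers={'User-Agent': random.choice(user_agents)})
        time.sleep(1)  # stay polite between requests
        # ... parse r.content as in the main loop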
If you do end up having to fall back on Selenium, you at least have the list of all the article links, which you can then loop over with .get and Selenium.
import requests
from bs4 import BeautifulSoup as bs
url = 'https://teonite.com/blog/page/{}/index.html'
all_links = []
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'User-Agent': 'Mozilla/5.0'
}

with requests.Session() as s:
    r = s.get('https://teonite.com/blog/')
    soup = bs(r.content, 'lxml')
    article_links = ['https://teonite.com' + item['href'][2:] for item in soup.select('.post-content a')]
    all_links.append(article_links)
    num_pages = int(soup.select_one('.page-number').text.split('/')[1])

    for page in range(2, num_pages + 1):
        r = s.get(url.format(page))
        soup = bs(r.content, 'lxml')
        article_links = ['https://teonite.com' + item['href'][2:] for item in soup.select('.post-content a')]
        all_links.append(article_links)

    all_links = [item for i in all_links for item in i]

    for article in all_links:
        #print(article)
        r = s.get(article, headers=headers)
        soup = bs(r.content, 'lxml')
        [t.extract() for t in soup(['style', 'script', '[document]', 'head', 'title'])]
        visible_text = soup.getText()  # taken from https://stackoverflow.com/a/19760007/6241235 @nmgeek
        # here I think you need to consider IP rotation/User-Agent changing
        try:
            print(soup.select_one('.post-title').text)
        except:
            print(article)
            print(soup.select_one('h1').text)
            break
        # do something with text
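Since the question mentions pushing the data into a database, one minimal way to fill in that "# do something with text" step is the standard-library sqlite3 module. This is only a sketch: the database file name, table name, and columns below are assumptions, not part of the original answer.
import sqlite3

def save_article(conn, url, body):
    # assumed schema: one row per article, keyed by its URL
    conn.execute('CREATE TABLE IF NOT EXISTS articles (url TEXT PRIMARY KEY, body TEXT)')
    conn.execute('INSERT OR REPLACE INTO articles (url, body) VALUES (?, ?)', (url, body))
    conn.commit()

conn = sqlite3.connect('articles.db')  # assumed file name
# in the loop above, replace "# do something with text" with:
#     save_article(conn, article, visible_text)
save_article(conn, 'https://example.com/post', 'example body text')  # demo call
conn.close()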
Adding Selenium does seem to solve the problem of requests getting blocked:
import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver
url = 'https://teonite.com/blog/page/{}/index.html'
all_links = []
with requests.Session() as s:
    r = s.get('https://teonite.com/blog/')
    soup = bs(r.content, 'lxml')
    article_links = ['https://teonite.com' + item['href'][2:] for item in soup.select('.post-content a')]
    all_links.append(article_links)
    num_pages = int(soup.select_one('.page-number').text.split('/')[1])

    for page in range(2, num_pages + 1):
        r = s.get(url.format(page))
        soup = bs(r.content, 'lxml')
        article_links = ['https://teonite.com' + item['href'][2:] for item in soup.select('.post-content a')]
        all_links.append(article_links)

all_links = [item for i in all_links for item in i]

d = webdriver.Chrome()

for article in all_links:
    d.get(article)
    soup = bs(d.page_source, 'lxml')
    [t.extract() for t in soup(['style', 'script', '[document]', 'head', 'title'])]
    visible_text = soup.getText()  # taken from https://stackoverflow.com/a/19760007/6241235 @nmgeek
    try:
        print(soup.select_one('.post-title').text)
    except:
        print(article)
        print(soup.select_one('h1').text)
        break  # for debugging
    # do something with text

d.quit()
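To tie this back to the word-frequency code in the question, the per-article visible_text can feed the same tokenizer/stopword pipeline. A sketch of accumulating words across all articles, reusing the question's setup (assumes the NLTK stopwords corpus has been downloaded):
import nltk
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')
sw = set(nltk.corpus.stopwords.words('english'))  # needs nltk.download('stopwords') once
words_ns = []

def add_words(text):
    # lowercase, tokenize, and drop stopwords, as in the question's code
    words_ns.extend(w.lower() for w in tokenizer.tokenize(text) if w.lower() not in sw)

# call add_words(visible_text) inside the article loop, then plot once at the end:
add_words('Example article text about scraping blog posts with Python')  # demo call
freqdist = nltk.FreqDist(words_ns)
freqdist.plot(25)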