The script should find the addresses of the subpages that contain the articles and collect the required data from them. The data should end up in a database, but I don't know how to make the script extract the content of every article from each page of the blog.
import requests
from bs4 import BeautifulSoup
from nltk.tokenize import RegexpTokenizer
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
url = 'https://xxx/'
r = requests.get(url)
# Extract HTML
html = r.text
# Create a BeautifulSoup object from the HTML
soup = BeautifulSoup(html, "html5lib")
# Get the text
text = soup.get_text()
# Create tokenizer
tokenizer = RegexpTokenizer(r'\w+')  # raw string avoids the invalid-escape warning
# Create tokens
tokens = tokenizer.tokenize(text)
# Initialize new list
words = []
# Loop through list
for word in tokens:
    words.append(word.lower())
# Get English stopwords (requires the NLTK stopwords corpus)
sw = nltk.corpus.stopwords.words('english')
# Initialize new list
words_ns = []
for word in words:
    if word not in sw:
        words_ns.append(word)
# plotting
freqdist1 = nltk.FreqDist(words_ns)
freqdist1.plot(25)
print(soup.get_text())
Answer (score: 2)
You can do it all with BeautifulSoup, as requested. The text-extraction code is courtesy of @nmgeek; that same question offers other methods to choose from. I guess you can then process the text with nltk. The approach is nice because you can play with which selectors you add to the list: pass a list of selectors to select, i.e. [item.text for item in soup.select('selector list goes here')].
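For illustration, here is a minimal sketch of that selector-list pattern; the HTML snippet and the class names in it are made up for the example, not taken from the blog:
from bs4 import BeautifulSoup

html = '<article><h1 class="post-title">Title</h1><div class="post-content"><p>Body text.</p></div></article>'
soup = BeautifulSoup(html, 'html.parser')

# grouped CSS selectors: the text of every matched element lands in one list
texts = [item.text for item in soup.select('.post-title, .post-content')]
print(texts)  # ['Title', 'Body text.']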
EDIT: The code below gets you all the links, but after a while the site seems to block you. Have a look at rotating IPs and these User-Agents for the loop over all_links.
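As a rough sketch of the User-Agent side of that idea (the UA strings below are illustrative placeholders, and all_links is the list assembled by the code further down):
import random
import time
import requests

# placeholder User-Agent strings; swap in real, current ones
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'Mozilla/5.0 (X11; Linux x86_64)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)',
]

all_links = []  # populated as in the code below

with requests.Session() as s:
    for link in all_links:
        # pick a different User-Agent per request
        r = s.get(link, headers={'User-Agent': random.choice(user_agents)})
        time.sleep(1)  # stay polite between requests
        # ... parse r.content as in the main loop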
If you do end up having to fall back on Selenium, you at least have the list of all the article links, which you can then loop over with .get and Selenium.
import requests
from bs4 import BeautifulSoup as bs
url = 'https://teonite.com/blog/page/{}/index.html'
all_links = []
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'User-Agent': 'Mozilla/5.0'
}

with requests.Session() as s:
    r = s.get('https://teonite.com/blog/')
    soup = bs(r.content, 'lxml')
    article_links = ['https://teonite.com' + item['href'][2:] for item in soup.select('.post-content a')]
    all_links.append(article_links)
    num_pages = int(soup.select_one('.page-number').text.split('/')[1])

    for page in range(2, num_pages + 1):
        r = s.get(url.format(page))
        soup = bs(r.content, 'lxml')
        article_links = ['https://teonite.com' + item['href'][2:] for item in soup.select('.post-content a')]
        all_links.append(article_links)

    all_links = [item for i in all_links for item in i]

    for article in all_links:
        #print(article)
        r = s.get(article, headers=headers)
        soup = bs(r.content, 'lxml')
        [t.extract() for t in soup(['style', 'script', '[document]', 'head', 'title'])]
        visible_text = soup.getText()  # taken from https://stackoverflow.com/a/19760007/6241235 @nmgeek
        # here I think you need to consider IP rotation/User-Agent changing
        try:
            print(soup.select_one('.post-title').text)
        except:
            print(article)
            print(soup.select_one('h1').text)
            break
        # do something with text
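Since the question mentions pushing the data into a database, one minimal way to fill in that "# do something with text" step is the standard-library sqlite3 module. This is only a sketch: the database file name, table name, and columns below are assumptions, not part of the original answer.
import sqlite3

def save_article(conn, url, body):
    # assumed schema: one row per article, keyed by its URL
    conn.execute('CREATE TABLE IF NOT EXISTS articles (url TEXT PRIMARY KEY, body TEXT)')
    conn.execute('INSERT OR REPLACE INTO articles (url, body) VALUES (?, ?)', (url, body))
    conn.commit()

conn = sqlite3.connect('articles.db')  # assumed file name
# in the loop above, replace "# do something with text" with:
#     save_article(conn, article, visible_text)
save_article(conn, 'https://example.com/post', 'example body text')  # demo call
conn.close()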
Adding Selenium does seem to solve the problem of requests getting blocked:
import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver
url = 'https://teonite.com/blog/page/{}/index.html'
all_links = []
with requests.Session() as s:
    r = s.get('https://teonite.com/blog/')
    soup = bs(r.content, 'lxml')
    article_links = ['https://teonite.com' + item['href'][2:] for item in soup.select('.post-content a')]
    all_links.append(article_links)
    num_pages = int(soup.select_one('.page-number').text.split('/')[1])

    for page in range(2, num_pages + 1):
        r = s.get(url.format(page))
        soup = bs(r.content, 'lxml')
        article_links = ['https://teonite.com' + item['href'][2:] for item in soup.select('.post-content a')]
        all_links.append(article_links)

all_links = [item for i in all_links for item in i]

d = webdriver.Chrome()

for article in all_links:
    d.get(article)
    soup = bs(d.page_source, 'lxml')
    [t.extract() for t in soup(['style', 'script', '[document]', 'head', 'title'])]
    visible_text = soup.getText()  # taken from https://stackoverflow.com/a/19760007/6241235 @nmgeek
    try:
        print(soup.select_one('.post-title').text)
    except:
        print(article)
        print(soup.select_one('h1').text)
        break  # for debugging
    # do something with text

d.quit()
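To tie this back to the word-frequency code in the question, the per-article visible_text can feed the same tokenizer/stopword pipeline. A sketch of accumulating words across all articles, reusing the question's setup (assumes the NLTK stopwords corpus has been downloaded):
import nltk
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')
sw = set(nltk.corpus.stopwords.words('english'))  # needs nltk.download('stopwords') once
words_ns = []

def add_words(text):
    # lowercase, tokenize, and drop stopwords, as in the question's code
    words_ns.extend(w.lower() for w in tokenizer.tokenize(text) if w.lower() not in sw)

# call add_words(visible_text) inside the article loop, then plot once at the end:
add_words('Example article text about scraping blog posts with Python')  # demo call
freqdist = nltk.FreqDist(words_ns)
freqdist.plot(25)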