I'm new to Python and I'm having trouble scraping a specific site. I want to scrape articles from https://www.cnn.com/business, but what's actually happening is that I end up crawling all of cnn.com and pulling in every article on the site. Is there a way to scrape only the business section? If my approach is entirely wrong, I'd love to hear a better one. Thanks.

I have a JSON file with a link to cnn.com/business, and I'm using the newspaper library in Python.
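The exact contents of newspapers.json aren't shown here; the code below assumes it maps a source name to an object with a 'link' key, so a minimal file might look like this (the "cnn" key is just an example name):

{
    "cnn": {
        "link": "https://www.cnn.com/business"
    }
}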
#!pip install feedparser
#!pip install newspaper3k
import feedparser as fp
import numpy as np
import json
import newspaper
from newspaper import Article
from time import mktime
from datetime import datetime
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import csv
# Set the limit for number of articles to download
LIMIT = 10
articles_array = []
data = {}
data['newspapers'] = {}
# Load the JSON file with news sites
with open('newspapers.json') as data_file:
    companies = json.load(data_file)

for company, value in companies.items():
    # Build a newspaper source from the section link; memoize_articles=False
    # forces a fresh crawl each run instead of skipping previously seen URLs
    paper = newspaper.build(value['link'], memoize_articles=False)
    newsPaper = {
        "link": value['link'],
        "articles": [],
    }
    count = 1
    for content in paper.articles:
        if count > LIMIT:
            break
        try:
            content.download()
            content.parse()
        except Exception as e:
            print(e)
            print("continuing...")
            continue
        article = {}
        article['title'] = content.title
        article['authors'] = content.authors
        article['text'] = content.text
        article['top_image'] = content.top_image
        article['link'] = content.url
        article['published'] = content.publish_date
        newsPaper['articles'].append(article)
        articles_array.append(article)
        print(count, "articles downloaded from", company, "using newspaper, url:", content.url)
        count = count + 1
    data['newspapers'][company] = newsPaper
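One idea I've considered is filtering the built source's articles by URL before downloading anything. This is just a rough sketch (it assumes CNN business article URLs contain "/business" in their path, which I haven't verified for every article), and I'm not sure it's the right approach:

import newspaper

# Build from the section page, then keep only URLs under the section path
paper = newspaper.build("https://www.cnn.com/business", memoize_articles=False)
business_only = [a for a in paper.articles if "/business" in a.url]
print(len(business_only), "candidate business articles")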
Answer 0 (score: 0)

This is probably more of a comment than a full answer, but here is a quick way to pull article links directly from the business page:
import urllib.request
import re

NUM_LINKS_YOU_WANT = 10

# Download the business landing page to a temporary file
urllib.request.urlretrieve("https://edition.cnn.com/business", ".temp_file")

# Collect every line of the page source that mentions an article page
occurrences = []
for line in open(".temp_file"):
    if "index.html" in line:
        occurrences.append(line)

# CNN serves most of the markup on one long line; work on the last match
line = occurrences[-1]
positions = [m.start() for m in re.finditer(re.escape("index.html"), line)]

links = []
for p in positions:
    # Walk back from each match to the nearest href attribute and slice out
    # the path, including the trailing index.html
    href = line[0:p].rfind("href")
    links.append("https://edition.cnn.com" + line[href + 6:p + len("index.html")])

print(links[0:NUM_LINKS_YOU_WANT])
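A note on the design choice: matching raw lines of HTML with string offsets is fragile if CNN changes its markup. A sturdier sketch using only the standard library's HTML parser (assuming the same page, and that article links still contain "index.html") might look like this:

from html.parser import HTMLParser
from urllib.request import urlopen

NUM_LINKS_YOU_WANT = 10

class LinkCollector(HTMLParser):
    """Collect hrefs of anchor tags that point at article pages."""
    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            href = dict(attrs).get("href", "")
            if "index.html" in href:
                # CNN hrefs are usually site-relative; prefix the host
                # (absolute hrefs would need a check before prefixing)
                self.links.append("https://edition.cnn.com" + href)

html = urlopen("https://edition.cnn.com/business").read().decode("utf-8")
collector = LinkCollector()
collector.feed(html)
print(collector.links[:NUM_LINKS_YOU_WANT])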