I'm scraping news sites with the Python libraries newspaper and feedparser, following a tutorial I found (link).
It reads the links to process from a JSON file and then pulls the articles from them. The problem is that it only gets articles from a site's front page and never moves on to the second, third page and so on. So I wrote a script that fills the JSON file with the first 50 pages of a site, e.g. www.site.com/page/x (a rough sketch of that generator follows the sample below):
{
    "site0": { "link": "https://sitedotcom/page/0/" },
    "site1": { "link": "https://sitedotcom/page/1/" },
    "site2": { "link": "https://sitedotcom/page/2/" },
    etc
}
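The generator itself is roughly this (a minimal sketch; sitedotcom is a placeholder, and the output file name matches the thingie2.json that the scraper reads):

import json

# Build one entry per paginated URL of a site (pages 0-49)...
pages = {
    "site{}".format(i): {"link": "https://sitedotcom/page/{}/".format(i)}
    for i in range(50)
}

# ...and write them to the JSON file the scraper below reads.
with open('thingie2.json', 'w') as f:
    json.dump(pages, f, indent=4)

The scraper itself (mostly taken from the tutorial) then reads that file: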
import feedparser as fp
import json
import csv
import newspaper
from newspaper import Article
from time import mktime
from datetime import datetime

# Set the limit for the number of articles to download
LIMIT = 1000000000

articles_array = []
data = {}
data['newspapers'] = {}

# Load the JSON file with the news sites
with open('thingie2.json') as data_file:
    companies = json.load(data_file)

count = 1
# Iterate through each news company
for company, value in companies.items():
    # If an RSS link is provided in the JSON file, it is used first, since RSS
    # feeds usually give more consistent and correct data. (RSS, originally RDF
    # Site Summary, often called Really Simple Syndication, is a web feed format
    # that exposes content updates in a standardized, computer-readable form.)
    # If you do not want to scrape from the RSS feed, just leave the rss
    # attribute out of the JSON file.
    if 'rss' in value:
        d = fp.parse(value['rss'])
        print("Downloading articles from", company)
        newsPaper = {
            "rss": value['rss'],
            "link": value['link'],
            "articles": []
        }
        for entry in d.entries:
            # Skip entries without a publish date, to keep the data consistent
            # and to keep the script from crashing.
            if hasattr(entry, 'published'):
                if count > LIMIT:
                    break
                article = {}
                article['link'] = entry.link
                date = entry.published_parsed
                article['published'] = datetime.fromtimestamp(mktime(date)).isoformat()
                try:
                    content = Article(entry.link)
                    content.download()
                    content.parse()
                except Exception as e:
                    # If the download fails for some reason (e.g. a 404),
                    # continue with the next article.
                    print(e)
                    print("continuing...")
                    continue
                article['title'] = content.title
                article['text'] = content.text
                article['authors'] = content.authors
                article['top_image'] = content.top_image
                article['movies'] = content.movies
                newsPaper['articles'].append(article)
                articles_array.append(article)
                print(count, "articles downloaded from", company, ", url:", entry.link)
                count = count + 1
    else:
        # Fallback if no RSS feed link is provided: build the site with the
        # newspaper library and extract the articles it finds.
        print("Building site for", company)
        paper = newspaper.build(value['link'], memoize_articles=False)
        newsPaper = {
            "link": value['link'],
            "articles": []
        }
        noneTypeCount = 0  # leftover from the tutorial's publish-date check, unused here
        for content in paper.articles:
            if count > LIMIT:
                break
            try:
                content.download()
                content.parse()
            except Exception as e:
                print(e)
                print("continuing...")
                continue
            # The tutorial skipped articles without a publish date here (and gave
            # up on a site after 10 such articles); that check has been removed
            # in this version, so every parsed article is kept.
            article = {}
            article['title'] = content.title
            article['authors'] = content.authors
            article['text'] = content.text
            article['top_image'] = content.top_image
            article['movies'] = content.movies
            article['link'] = content.url
            article['published'] = content.publish_date
            newsPaper['articles'].append(article)
            articles_array.append(article)
            print(count, "articles downloaded from", company, "using newspaper, url:", content.url)
            count = count + 1
    count = 1
    data['newspapers'][company] = newsPaper
# Finally, save the articles to a CSV file.
try:
    f = csv.writer(open('Scraped_data_news_output2.csv', 'w', encoding='utf-8'))
    f.writerow(['Title', 'Authors', 'Text', 'Image', 'Videos', 'Link', 'Published_Date'])
    # Write one row per downloaded article
    for article in articles_array:
        title = article['title']
        authors = article['authors']
        text = article['text']
        image = article['top_image']
        video = article['movies']
        link = article['link']
        publish_date = article['published']
        f.writerow([title, authors, text, image, video, link, publish_date])
except Exception as e:
    print(e)
Navigating to these pages in my browser shows older, distinct articles, as expected. But when I run the script over them it returns the same articles no matter which page number is in the URL. Am I doing something wrong, or is there something I haven't considered?
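To illustrate, this is roughly how I checked (a minimal sketch; sitedotcom again stands in for the real site). The two page URLs end up with the same set of article links:

import newspaper

# Build two different paginated URLs and compare the article URLs
# that newspaper discovers for each one.
page1 = newspaper.build("https://sitedotcom/page/1/", memoize_articles=False)
page2 = newspaper.build("https://sitedotcom/page/2/", memoize_articles=False)

urls1 = {a.url for a in page1.articles}
urls2 = {a.url for a in page2.articles}

print(len(urls1), "article URLs found on page 1,", len(urls2), "on page 2")
print("same article set on both pages:", urls1 == urls2)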