I'm trying to crawl a number of articles on Reddit, visit each one, extract the most relevant entity (by filtering for the highest relevance score), and then append that entity to a master_locations list:
from __future__ import print_function
from alchemyapi import AlchemyAPI
import json
import urllib2
from bs4 import BeautifulSoup
alchemyapi = AlchemyAPI()
reddit_url = 'http://www.reddit.com/r/worldnews'
urls = []
locations = []
relevance = []
master_locations = []
def get_all_links(page):
    html = urllib2.urlopen(page).read()
    soup = BeautifulSoup(html)
    for a in soup.find_all('a', 'title may-blank ', href=True):
        urls.append(a['href'])
        run_alchemy_entity_per_link(a['href'])
def run_alchemy_entity_per_link(articleurl):
    response = alchemyapi.entities('url', articleurl)
    if response['status'] == 'OK':
        for entity in response['entities']:
            if entity['type'] == 'Country' or entity['type'] == 'Region' or entity['type'] == 'City' or entity['type'] == 'StateOrCountry' or entity['type'] == 'Continent':
                if entity.get('disambiguated'):
                    locations.append(entity['disambiguated']['name'])
                    relevance.append(entity['relevance'])
                else:
                    locations.append(entity['text'])
                    relevance.append(entity['relevance'])
            else:
                locations.append('No Location')
                relevance.append('0')
        max_pos = relevance.index(max(relevance))  # get the position of the highest relevance score
        master_locations.append(locations[max_pos])  # store the location at that position in master_locations
        del locations[0]  # RESET LIST
        del relevance[0]  # RESET LIST
    else:
        print('Error in entity extraction call: ', response['statusInfo'])
get_all_links('http://www.reddit.com/r/worldnews') # Gets all URLs per article, then analyzes entity
for item in master_locations:
print(item)
But I think that for some reason the lists locations and relevance are not being reset. Am I doing something wrong?
The printed output looks like this:
Holland
Holland
Beirut
Beirut
Beirut
Beirut
Beirut
Beirut
Beirut
Beirut
Beirut
Beirut
Beirut
Beirut
Mogadishu
Mogadishu
Mogadishu
Mogadishu
Mogadishu
Mogadishu
Mogadishu
Mogadishu
Johor Bahru
(presumably left over because the lists are never cleared)
Answer 0 (score: 0)
del list[0]
only removes the first item from the list.
If you want to remove all items, use:
del list[:]
or
list[:] = []
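To see the difference concretely, here is a quick sketch (the sample values are made up for illustration):

relevance = ['0.9', '0.5', '0.7']
del relevance[0]     # only removes '0.9'; '0.5' and '0.7' leak into the next article
print(relevance)     # ['0.5', '0.7']

relevance = ['0.9', '0.5', '0.7']
del relevance[:]     # clears the list in place
print(relevance)     # []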
Answer 1 (score: 0)
In your case, don't reuse the lists at all; just create new ones:
from __future__ import print_function
from alchemyapi import AlchemyAPI
import json
import urllib2
from bs4 import BeautifulSoup
alchemyapi = AlchemyAPI()
reddit_url = 'http://www.reddit.com/r/worldnews'
def get_all_links(page):
    html = urllib2.urlopen(page).read()
    soup = BeautifulSoup(html)
    urls = []
    master_locations = []
    for a in soup.find_all('a', 'title may-blank ', href=True):
        urls.append(a['href'])
        master_locations.append(run_alchemy_entity_per_link(a['href']))
    return urls, master_locations
def run_alchemy_entity_per_link(articleurl):
    response = alchemyapi.entities('url', articleurl)
    if response['status'] != 'OK':
        print('Error in entity extraction call: ', response['statusInfo'])
        return
    locations_with_relevance = []
    for entity in response['entities']:
        if entity['type'] in ('Country', 'Region', 'City', 'StateOrCountry', 'Continent'):
            if entity.get('disambiguated'):
                location = entity['disambiguated']['name']
            else:
                location = entity['text']
            # relevance comes back as a decimal string such as '0.87', so use float, not int
            locations_with_relevance.append((float(entity['relevance']), location))
        else:
            locations_with_relevance.append((0.0, 'No Location'))
    return max(locations_with_relevance)[1]
def main():
    _urls, master_locations = get_all_links(reddit_url)  # gets all article URLs, then analyzes each one's entities
    for item in master_locations:
        print(item)

if __name__ == '__main__':
    main()
If you want to store more than one piece of data per item, put the pieces in a tuple and keep a single list of tuples instead of two or more parallel lists.
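As a small illustration (with made-up values): max() compares tuples element by element, so the tuple with the highest relevance wins, and its second element is the matching location:

locations_with_relevance = [(0.33, 'Holland'), (0.91, 'Beirut'), (0.47, 'Mogadishu')]
best_relevance, best_location = max(locations_with_relevance)  # tuples compare by first element
print(best_location)  # Beirut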