这是一篇很长的文章,因为我所面临的问题已嵌入到一个较大的项目中-感谢任何花时间阅读本文的人。
基本上,我正在抓取Wikipedia:精选文章页面。此页面上有数百个文章链接,并且我已经成功地从该页面上汇编了传记的文章列表。使用了以下代码:
def __api_GET_latest_page(title):
parameters = {
"action": "parse",
"page": title,
"format": "json"
}
response_json = __get("revisions", title, parameters)
if("parse" in response_json.keys()
and "text" in response_json["parse"].keys()
and "*" in response_json["parse"]["text"].keys()):
return response_json["parse"]["text"]["*"]
return None
def __get(function_key, key, parameters, check_cache=True, write_cache=True):
target = "https://en.wikipedia.org/w/api.php"
cache_path = "cached_api"
params_unicode = str(parameters).encode('utf-8')
md5 = hashlib.md5(params_unicode).hexdigest()
return_json = None
cache_file = os.path.join(cache_path, function_key, str(key), md5)
cache_exists = os.path.isfile(cache_file)
if cache_exists:
try:
json_in = open(cache_file, "r")
json_str = json_in.read()
return_json = json.loads(json_str)
if "error" in return_json.keys() and "code" in return_json["error"].keys() and return_json["error"]["code"]=="maxlag":
cache_exists = False
except:
cache_exists = False
if not cache_exists:
cache_dir = os.path.dirname(cache_file)
if not os.path.isdir(cache_dir):
os.makedirs(cache_dir)
r = requests.get(target, params=parameters)
request_json = r.json()
json_out = open(cache_file, "w")
print(json.dumps(request_json), file=json_out)
return_json = request_json
return return_json
def __remove_tables_and_scripts(tree):
tags_to_remove = ["tbody", "td", "script"]
for tag in tags_to_remove:
elements = tree.find(f".//{tag}")
if elements is not None:
for e in elements:
e.getparent().remove(e)
return tree
def page_text(name, format, include_tables = False):
try:
result = __api_GET_latest_page(name)
except:
print("API request failed.")
if result:
e = etree.fromstring(result)
if not include_tables:
e = __remove_tables_and_scripts(e)
if format == "html":
return str(etree.tostring(e))
elif format == "text":
return ''.join(e.itertext())
elif format == "list":
return ''.join(e.itertext()).split('\n')
else:
print("Failed to retrieve a page.")
return None
上面的代码,特别是page_text()函数,获取任何Wikipedia页面的纯文本并将结果缓存在本地。无论如何,通过以下代码,我在Wikipedia特色文章页面上获得了所有传记的标题列表:
def get_featured_biographies(t):
titles = page_text("Wikipedia:Featured articles", "list")
titles = titles[40: ]
titles = titles[:-7]
titles = list(filter(lambda x: x != '', titles))
list_featured_biographies = []
boolean = False
for elem in t:
if ('[edit]' in elem) and ('biographies' in elem) | ('Biographies' in elem):
boolean = True
continue
elif ('[edit]' in elem) and ('biographies' not in elem):
boolean = False
if boolean:
list_featured_biographies = list_featured_biographies + [elem]
else:
continue
return list_featured_biographies
list_featured_biographies = get_featured_biographies(titles)
这是输出示例:
这是我遇到问题的地方。我需要编写一个函数,以在我创建的列表中抓取特色文章传记标题的所有各个页面。具体来说,我需要编写一个提取每个传记的第一段的函数。我使用以下代码成功完成了此任务:
for title in list_featured_biographies:
page_content = page_text(title, "list")
list_of_values_with_keywords = []
for value in page_content:
if ('was a' in value) | ('was an ' in value) | ('is a ' in value) | ('is an ' in value):
list_of_values_with_keywords.append(value)
first_paragraph = list_of_values_with_keywords[0]
print(first_paragraph)
因此Bronwyn Bancroft,Felice Beato和Jean Bellette是前三个名字。以下屏幕快照显示了前三个名称的输出。
如您所见,我的输出实质上是第一段的列表。 我想将此信息组织成两列的数据框,第一列是文章标题的名称,第二列是文章的第一段。以下代码在尝试实现这个:
title2_list = []
list_of_first_para = []
for title in list_featured_biographies:
page_content = page_text(title, "list")
title2_list.append(title)
list_of_values_with_keywords = []
for value in page_content:
if ('was a' in value) | ('was an ' in value) | ('is a ' in value) | ('is an ' in value):
list_of_values_with_keywords.append(value)
first_paragraph = list_of_values_with_keywords[0]
list_of_first_para.append(first_paragraph)
data2_for_df = {'Article_Title':title2_list, 'First_Paragraph':list_of_first_para}
wiki1para_df = pd.DataFrame(data2_for_df)
print(wiki1para_df)
这是我遇到的错误:
IndexError Traceback (most recent call last)
<ipython-input-317-f36585876409> in <module>
13 return first_paragraph
14
16 print(first_paragraph)
<ipython-input-317-f36585876409> in get_first_paragraph(list)
9 list_of_values_with_keywords.append(value)
10
---> 11 first_paragraph = list_of_values_with_keywords[0]
12
13 return first_paragraph
IndexError: list index out of range