import urllib2
from datetime import datetime
from bs4 import BeautifulSoup

page1 = urllib2.urlopen("http://en.wikipedia.org/wiki/List_of_human_stampedes")
soup = BeautifulSoup(page1)

events = soup.find('span', id='20th_century').parent.find_next_sibling('ul')
for event in events.find_all('li'):
    try:
        date_string, rest = event.text.split(':', 1)
        print datetime.strptime(date_string, '%B %d, %Y').strftime('%d/%m/%Y')
    except ValueError:
        print event.text
Using the method above, I am able to extract the dates from the <li> tags. I also want to extract the citation links. The problem is that each <li> tag contains many links; although the citations are defined with the class "reference", I am still unable to get the complete links. I would like to store the result as a table, with each row containing the date and the citation link (in .csv format).

Question - Web crawler to extract from list elements
Answer 0 (score: 1)
Here is something you can use as a starting point. It creates a csv file with rows in the following format:

date,link

If there is an error while extracting the date components, the row is skipped. For now, as an example, it works on the "20th century" section:
import csv
import urllib2
from datetime import datetime
from urlparse import urljoin
from bs4 import BeautifulSoup

base_url = 'http://en.wikipedia.org'
page = urllib2.urlopen("http://en.wikipedia.org/wiki/List_of_human_stampedes")
soup = BeautifulSoup(page)

# build a mapping of reference ids to the external links in each reference
references = {}
for item in soup.select('ol.references li[id]'):
    links = [a['href'] if a['href'].startswith('http') else urljoin(base_url, a['href'])
             for a in item.select('span.reference-text a[href]')]
    references[item['id']] = links

events = soup.find('span', id='20th_century').parent.find_next_siblings()
with open('output.csv', 'wb') as f:
    writer = csv.writer(f)
    for tag in events:
        # stop at the next section heading
        if tag.name == 'h2':
            break
        for event in tag.find_all('li'):
            # extract the date
            try:
                date_string, _ = event.text.split(':', 1)
                date = datetime.strptime(date_string, '%B %d, %Y').strftime('%d/%m/%Y')
            except ValueError:
                continue

            # extract citation links and write data
            # (the "x and" guard skips anchors that have no href at all)
            links = event.find_all('a', href=lambda x: x and x.startswith('#cite_note-'))
            if links:
                for link in links:
                    for ref in references[link['href'][1:]]:
                        writer.writerow([date, ref])
            else:
                writer.writerow([date, ''])
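The idea behind the references dict: Wikipedia renders every citation twice, once as a [n] superscript anchor inside the list item, whose href is a page-internal fragment like #cite_note-..., and once as the full reference at the bottom of the page inside ol.references. The first loop maps each reference id to the external links found in its reference text, so the event loop can resolve the in-text fragments into real URLs. Illustratively (the id here is hypothetical; actual ids depend on the live page):

references['cite_note-5']
# -> ['http://www.ferrovieinrete.com/doc_storici/GalleriaGrazie.pdf']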
After running the script, output.csv contains:
19/09/1902,
30/12/1903,
11/01/1908,
24/12/1913,
23/10/1942,http://www.ferrovieinrete.com/doc_storici/GalleriaGrazie.pdf
09/03/1946,
01/01/1956,
02/01/1971,
...
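Side note: the script is Python 2 (urllib2, urlparse, print statements). On Python 3 the same approach works with the renamed stdlib modules; here is a minimal sketch of the setup under that assumption, with the event loop itself unchanged:

import csv
from urllib.request import urlopen   # replaces urllib2.urlopen
from urllib.parse import urljoin     # replaces urlparse.urljoin
from bs4 import BeautifulSoup

base_url = 'http://en.wikipedia.org'
page = urlopen("http://en.wikipedia.org/wiki/List_of_human_stampedes")
soup = BeautifulSoup(page, 'html.parser')  # newer bs4 wants an explicit parser

references = {}
for item in soup.select('ol.references li[id]'):
    references[item['id']] = [
        a['href'] if a['href'].startswith('http') else urljoin(base_url, a['href'])
        for a in item.select('span.reference-text a[href]')]

# open the csv in text mode with newline='' instead of 'wb'
with open('output.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    # ... the rest of the event loop is the same as above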