
时间:2015-09-30 18:31:29

标签: python loops for-loop web-scraping beautifulsoup

我目前正在从一个网站抓取并保存 911 个文本文件。我的代码中 HTML 处理部分整体上可以正常工作;问题出现在最后那个 for 循环,也就是遍历 every_link 列表中的 URL 的时候。processURL() 函数大约处理到一半的链接就不再工作了,所以我只得到了所需演讲的大约一半。for 循环似乎是随机地遍历 every_link 的:乔治·W·布什和哈里·杜鲁门等总统的演讲被抓取到了,而格罗弗·克利夫兰等人的演讲却没有。我的 for 循环出了什么问题?

import urllib2,sys,os
from bs4 import BeautifulSoup,NavigableString
from string import punctuation as p
from multiprocessing import Pool
import re, nltk
import requests

# Work from the course folder; files opened later with a relative name
# are resolved against this directory.
os.chdir('U:/Fall 2015/ENGL 3XX')

# Scrape and clean one speech from Obama to show the method works.

# NOTE(review): the URL is empty in this copy of the script; the address of
# the speech page has to be filled in before urlopen() can fetch anything.
obama_4427_url = ''
obama_4427 = urllib2.urlopen(obama_4427_url).read()
obama_4427 = BeautifulSoup(obama_4427)

# Find the speech itself within the HTML.  Only one attribute dict may be
# passed to find(): its third positional parameter is `recursive`, so a
# second dict placed there is never used as a filter.  Matching on the id
# alone keeps the same element the original call selected.
obama_4427 = obama_4427.find('div', {'id': 'transcript'})

# .text drops the markup (e.g. '<br/>'); lower-case for later comparison.
obama_4427 = obama_4427.text.lower()

# For further text analysis, remove punctuation.  The compiled pattern is
# kept at module level because processURL() reuses it.
punctuation = re.compile('[{}]+'.format(re.escape(p)))

obama_4427 = punctuation.sub('', obama_4427)
obama_4427 = obama_4427.replace('—',' ')
obama_4427 = obama_4427.replace('transcript','')

# Divide the cleaned speech into individual words.
words = obama_4427.split(' ')

# Cleaning links begins below, so that  we can process all 911 speeches through processURL()

# NOTE(review): both URLs are empty in this copy of the script.  `url` is the
# index page that is downloaded below; `url2` is the prefix put in front of
# every speech href.  Fill both in before running.
url = ''
url2 = ''

conn = urllib2.urlopen(url)
html = conn.read()

miller_center_soup = BeautifulSoup(html)
links = miller_center_soup.find_all('a')

# Collect the href of every anchor that has one.
linklist = [tag.get('href') for tag in links if tag.get('href') is not None]

# Remove all items in the list that don't contain 'speeches'.
linkslist = [_ for _ in linklist if re.search('speeches', _)]
# Drop the first two matches (presumably non-speech links -- TODO confirm).
del linkslist[0:2]

# Prepend url2 to each speech link.
every_link_dups = [url2 + end_link for end_link in linkslist]

# Remove duplicates while keeping the first-seen order.
seen = set()
every_link = [] # no duplicates array
for l in every_link_dups:
    if l not in seen:
        every_link.append(l)
        seen.add(l)

# List of presidents (print(len(set(presidents))) = 43 total): take the text
# that follows 'president/' in each link, up to the next '/'.
presidents_dups = [l[l.find('president/')+len('president/'):] for l in every_link if 'president' in l]
presidents_dups = [l[0:l.find('/')] for l in presidents_dups]
set2 = set()
presidents = []
for l in presidents_dups:
    if l not in set2:
        presidents.append(l)
        set2.add(l)

presidents = sorted(presidents)

# the following two lines - now commented out - were used to identify duplicates in the original every_link array
# import collections
# print [l for l, count in collections.Counter(every_link).items() if count > 1]

# define a function to clean & store speeches from 'every_link' repository

def processURL(l):
    """Download one speech page and return a file name plus its cleaned text.

    The page is fetched, the ``<div id="transcript">`` element is extracted,
    its text is lower-cased, punctuation is stripped with the module-level
    ``punctuation`` pattern, and em dashes are replaced by spaces.

    Args:
        l: URL of a speech page.  The file name is built from the fifth
            '/'-separated piece of the URL and from the part after the first
            '-' in the last piece (assumes links shaped like
            ``scheme://host/president/<name>/.../<word>-<number>`` -- TODO
            confirm against the real links).

    Returns:
        A ``(filename, text)`` tuple, where ``filename`` is
        ``"<president>_<speech number>"``.

    Raises:
        ValueError: if the page contains no ``<div id="transcript">``.
        IndexError: if the URL does not have the pieces described above.
    """
    open_url = urllib2.urlopen(l).read()
    item_soup = BeautifulSoup(open_url)

    # find() accepts a single attribute dict; its third positional parameter
    # is `recursive`, so a second dict passed there is never used as a filter.
    item_div = item_soup.find('div', {'id': 'transcript'})
    if item_div is None:
        # Name the failing page instead of crashing on `None.text`.
        raise ValueError("no transcript <div> found at {0}".format(l))

    item_str = item_div.text.lower()
    item_str_processed = punctuation.sub('', item_str)
    item_str_processed_final = item_str_processed.replace('—',' ')

    splitlink = l.split("/")
    president = splitlink[4]
    speech_num = splitlink[-1].split("-")[1]
    filename = "{0}_{1}".format(president, speech_num)

    return filename, item_str_processed_final  # returning a tuple

# right now, this loop only works for 423 speeches - where are the remaining ones?
# Download every speech and save it under the file name derived from its URL.
for l in every_link:
    filename, content = processURL(l)  # tuple unpacking
    with open(filename, 'w') as f:
        f.write(content)

0 个答案:
