import urllib2,sys
from bs4 import BeautifulSoup,NavigableString
from string import punctuation as p
# URL for Obama's presidential acceptance speech in 2008
obama_4427_url = 'http://www.millercenter.org/president/obama/speeches/speech-4427'
# read in URL
obama_4427_html = urllib2.urlopen(obama_4427_url).read()
# BS magic
obama_4427_soup = BeautifulSoup(obama_4427_html)
# find the speech itself within the HTML
obama_4427_div = obama_4427_soup.find('div',{'id': 'transcript'},{'class': 'displaytext'})
# obama_4427_div.text.lower() removes extraneous characters (e.g. '<br/>')
# and places all letters in lowercase
obama_4427_str = obama_4427_div.text.lower()
# for further text analysis, remove punctuation
for punct in list(p):
obama_4427_str_processed = obama_4427_str.replace(p,'')
obama_4427_str_processed_2 = obama_4427_str_processed.replace(p,'')
print(obama_4427_str_processed_2)
# store individual words
words = obama_4427_str_processed.split(' ')
print(words)
长话短说,我有奥巴马总统的演讲,我正在寻找删除所有标点符号,以便我只留下这些字眼。我已导入punctuation
模块,运行for
循环,但未删除所有标点符号。我在这里做错了什么?
答案 0 :(得分:2)
str.replace()
搜索第一个参数的整个值。它不是一个模式,所以只有当整个`string.punctuation *值存在时才会被替换为空字符串。
改为使用正则表达式:
import re
from string import punctuation as p
punctuation = re.compile('[{}]+'.format(re.escape(p)))
obama_4427_str_processed = punctuation.sub('', obama_4427_str)
words = obama_4427_str_processed.split()
请注意,您可以使用不带参数的str.split()
来分割任意宽度的空格,包括换行符。
答案 1 :(得分:1)
如果您想删除标点符号,可以rstrip
关闭它:
obama_4427_str = obama_4427_div.text.lower()
# for further text analysis, remove punctuation
from string import punctuation
print([w.rstrip(punctuation) for w in obama_4427_str.split()])
输出:
['transcript', 'to', 'chairman', 'dean', 'and', 'my', 'great',
'friend', 'dick', 'durbin', 'and', 'to', 'all', 'my', 'fellow',
'citizens', 'of', 'this', 'great', 'nation', 'with', 'profound',
'gratitude', 'and', 'great', 'humility', 'i', 'accept', 'your',
'nomination', 'for', 'the', 'presidency', 'of', 'the', 'united',
................................................................
使用python3从任何地方删除使用str.translate:
from string import punctuation
tbl = str.maketrans({ord(ch):"" for ch in punctuation})
obama_4427_str = obama_4427_div.text.lower().translate(tbl)
print(obama_4427_str.split())
对于python2:
from string import punctuation
obama_4427_str = obama_4427_div.text.lower().encode("utf-8").translate(None,punctuation)
print( obama_4427_str.split())
输出:
['transcript', 'to', 'chairman', 'dean', 'and', 'my', 'great',
'friend', 'dick', 'durbin', 'and', 'to', 'all', 'my', 'fellow',
'citizens', 'of', 'this', 'great', 'nation', 'with', 'profound',
'gratitude', 'and', 'great', 'humility', 'i', 'accept', 'your',
'nomination', 'for', 'the', 'presidency', 'of', 'the', 'united',
............................................................
另一方面,你可以迭代一个字符串,这样list(p)
在你自己的代码中是多余的。