我正在尝试使用tweepy处理作为JSON文件收集的推文。现在,我尝试使用以下代码处理文本。 [The Chicago.json包含带有关键字Chicago的推文]
# import json library to analysis tweet text
import json
# import re library to use regular expression
import nltk
nltk.download('punkt')
import re
from nltk.tokenize import word_tokenize
# Parse the JSON file: each line of the streamed file holds one tweet object.
# NOTE: the body of the `with` block must be indented — the pasted version
# lost its indentation, which raises IndentationError.
with open('stream_Chicago.json', 'r') as f:
    # Read only the first tweet (one JSON document per line).
    line = f.readline()
    # Load it as a Python dictionary.
    tweet = json.loads(line)
    # Uncomment to inspect the tweet's data structure:
    # print(json.dumps(tweet, indent = 4))

# Demo: NLTK's generic tokenizer splits @-mentions, emoticons and URLs into
# pieces, which motivates the custom regex tokenizer defined below.
tweet = 'RT @marcobonzanini: just an example! :D http://example.com #NLP'
print(word_tokenize(tweet))
emoticons_str = r"""
(?:
[:=;] # Eyes
[oO\-]? # Nose (optional)
[D\)\]\(\]/\\OpP] # Mouth
)"""
regex_str = [
emoticons_str,
r'<[^>]+>', # HTML tags
r'(?:@[\w_]+)', # @-mentions
r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', #
URLs
r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
r'(?:[\w_]+)', # other words
r'(?:\S)' # anything else
]
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE |
re.IGNORECASE)
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)
def tokenize(s):
return tokens_re.findall(s)
def preprocess(s, lowercase=False):
tokens = tokenize(s)
if lowercase:
tokens = [token if emoticon_re.search(token) else token.lower() for
token in tokens]
return tokens
# The same example tweet, now tokenized by the custom regex tokenizer:
# emoticon, URL, @-mention and hashtag each come out as a single token.
tweet = "RT @marcobonzanini: just an example! :D http://example.com #NLP"
print(preprocess(tweet))
# Print the text of every tweet in the streamed file (one JSON doc per line).
# NOTE: the loop body must be indented under `with`/`for` — the pasted
# version lost its indentation, which raises IndentationError.
with open('stream_Chicago.json', 'r') as f:
    for line in f:
        # The Twitter streaming API emits blank keep-alive lines; skip them
        # so json.loads does not raise on empty input.
        if not line.strip():
            continue
        tweet = json.loads(line)
        # Delete notices and some status messages carry no 'text' field;
        # guard against KeyError.
        if 'text' in tweet:
            print(tweet['text'])
但是我遇到了一个错误(错误信息未粘贴在此处)。最可能的原因有两个:regex_str 列表中 L30-31 处的 “# URLs” 注释被拆成了两行,使 `URLs` 成为裸表达式而导致 SyntaxError;或者粘贴时丢失了缩进,导致 `with`/`for` 代码块引发 IndentationError。
请帮助我解决此错误。