import re
from nltk.tokenize import word_tokenize
from string import punctuation
from nltk.corpus import stopwords
# NOTE: nltk.download('punkt') and nltk.download('stopwords') must have been run once
class pre:
    def __init__(self):
        # stop words, punctuation and the placeholder tokens are all filtered out later
        self._stopwords = set(stopwords.words('english') + list(punctuation) + ['AT_USER', 'URL'])

    def processtweets(self, list1):
        processedtweets = []
        for tweet in list1:
            processedtweets.append((self._processtweet(tweet["text"]), tweet["label"]))
        return processedtweets

    def _processtweet(self, tweet):
        tweet = tweet.lower()
        tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet)  # replace links with URL
        tweet = re.sub(r'@[^\s]+', 'AT_USER', tweet)                        # replace @mentions with AT_USER
        tweet = re.sub(r'#([^\s]+)', r'\1', tweet)                          # drop the '#' but keep the hashtag word
        # r'' (raw string) keeps the backslashes from being treated as escape characters
        tweet = word_tokenize(tweet)  # tokenize the tweet into a list of words
        return [word for word in tweet if word not in self._stopwords]
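
For reference, a minimal sketch of how I expect processtweets to be called, assuming each tweet is a plain dict with a "text" string and a "label" (the sample tweet and label below are made up):

sample = [{"text": "Loving the new #Python update! https://docs.python.org @py_dev",
           "label": "positive"}]
p = pre()
print(p.processtweets(sample))
# expected output, roughly: [(['loving', 'new', 'python', 'update'], 'positive')]
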
Running this preprocessing gives the error AttributeError: 'Status' object has no attribute 'lower' at the line tweet = tweet.lower().
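
The traceback seems to say that a tweepy Status object, not a plain string, is reaching _processtweet. Assuming the tweets were collected with tweepy (a guess based on 'Status' in the error) and that status.text holds the tweet text, a sketch of mapping them to the dict shape processtweets expects would be:

# Assumption: the raw tweets are tweepy Status objects; convert them to the
# plain-dict shape processtweets expects before preprocessing.
def statuses_to_dicts(statuses, label):
    return [{"text": status.text, "label": label} for status in statuses]

# hypothetical usage, where raw_statuses is whatever the tweepy API call returned:
# tweets = statuses_to_dicts(raw_statuses, "positive")
# processed = pre().processtweets(tweets)
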