我是正则表达式的新手。在google colaboratory中,在基于Twitter数据集对一袋单词进行赋值时,我给出了这样的代码来修改给定数据集中的Twitter Tweet。
#Replacing 2 or more repetitions of character with the character itself
def replaceTwoOrMore(s):
pattern = re.compile(r"(.)\1{1,}", re.DOTALL)
return pattern.sub(r"\1\1", s)
def tweets_reconstruction(tweet):
#Removing numbers
tweet = re.sub('[0-9]', '', tweet)
#Convert to lower case
tweet = tweet.lower()
tweet = tweet.translate(str.maketrans('', '', string.punctuation))
#Replacing short URLs with ""
tweet = re.sub(r"[\b(http)]+", "", tweet)
#Replacing "@username" with "AT_USER"
tweet = re.sub(r"[\b(@)]+", "AT_USER", tweet)
#Replacing "#word" with "word"
tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
#Replacing multiple whitespaces with single whitespace
tweet = re.sub(r"[\s]+", " ", tweet)
tweet = replaceTwoOrMore(tweet)
return tweet
processedTweets = []
for tweet in tweets:
processedTweets.append(tweets_reconstruction(tweet))
vectorizer = CountVectorizer()
featurevector = vectorizer.fit_transform(processedTweets)
featurevector.todense()
在运行时,它显示“使用所有可用RAM后会话崩溃” 请帮帮我。