I have 50,000 rows of data and have to apply a function to every row. I am using a for loop, but it takes far too long, about 1.5 hours. Could anyone please help me make it faster?
test_feat_custom = []
for x in test_data['text']:
    row = getFeatures(x)
    test_feat_custom.append(row)
This code takes a very long time to run, so please help me speed it up.
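Since every row is independent, one thing I have considered is spreading the rows across CPU cores. This is only a minimal sketch, assuming getFeatures is defined at module level (so it can be pickled) and test_data is an ordinary pandas DataFrame:

from multiprocessing import Pool

# Untested sketch: each worker process applies getFeatures to a chunk of rows.
if __name__ == '__main__':
    with Pool() as pool:
        test_feat_custom = pool.map(getFeatures, test_data['text'])

The main cost is inside getFeatures itself, though.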
Here is the code for my getFeatures function:
import pandas as pd
import nltk
nltk.download('maxent_ne_chunker')
nltk.download('words')
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag
from nltk import pos_tag_sents
from nltk.chunk import ne_chunk
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import numpy as np
from collections import Counter
import en_core_web_sm

nlp = en_core_web_sm.load()
# VADER lexicon: column 0 holds the word, column 1 its sentiment score.
vad = pd.read_csv('/home/riteshjain/anaconda3/Api/vader_lexicon.txt', error_bad_lines=False, header=None, sep=' ')

# Split the lexicon into a positive-score table (post) and a negative-score table (vad).
post = vad.copy()
post[1] = post[post[1] > 0][1]
post = post.dropna()
post = post.reset_index(drop=True)
vad[1] = vad[vad[1] < 0][1]
vad = vad.dropna()
vad = vad.reset_index(drop=True)
def Que_Counter(x):
    que_c = x.count('?')
    return que_c

def Exc_Counter(x):
    exc_c = x.count('!')
    return exc_c

def Sent_Counter(x):
    tokenized_text = sent_tokenize(x)
    return len(tokenized_text)

def Word_Counter(x):
    tokenized_word = word_tokenize(x)
    return len(tokenized_word)
def Noun_Counter(x):
    tokenized_word = word_tokenize(x)
    tagged = pos_tag(tokenized_word)
    nouns = []
    for (word, tag) in tagged:
        if tag.startswith("NN"):
            nouns.append(word)
    return len(nouns)

def Pronoun_Counter(x):
    tokenized_word = word_tokenize(x)
    tagged = pos_tag(tokenized_word)
    pronouns = []
    for (word, tag) in tagged:
        if tag.startswith("PRP"):
            pronouns.append(word)
    return len(pronouns)

def Adjective_Counter(x):
    tokenized_word = word_tokenize(x)
    tagged = pos_tag(tokenized_word)
    adject = []
    for (word, tag) in tagged:
        if tag.startswith("JJ"):
            adject.append(word)
    return len(adject)

def Adverb_Counter(x):
    tokenized_word = word_tokenize(x)
    tagged = pos_tag(tokenized_word)
    adverb = []
    for (word, tag) in tagged:
        if tag.startswith("RB"):
            adverb.append(word)
    return len(adverb)
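One thing I noticed: the four counters above each call word_tokenize and pos_tag on the same sentence, so every row gets tokenized and tagged four times. Here is a sketch (untested; the name count_pos_tags is mine) that tags once and returns all four counts:

def count_pos_tags(x):
    # One tokenize + tag pass instead of four separate ones.
    tagged = pos_tag(word_tokenize(x))
    nouns = sum(1 for _, tag in tagged if tag.startswith("NN"))
    pronouns = sum(1 for _, tag in tagged if tag.startswith("PRP"))
    adjectives = sum(1 for _, tag in tagged if tag.startswith("JJ"))
    adverbs = sum(1 for _, tag in tagged if tag.startswith("RB"))
    return nouns, pronouns, adjectives, adverbs

getFeatures could then unpack these four numbers from a single call instead of making four.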
def Name_Entity(x):
    doc = nlp(x)
    ca = [(X.text, X.label_) for X in doc.ents]
    return len(ca)
def negation(x):
    negat = ["aint", "arent", "cannot", "cant", "couldnt", "darent", "didnt", "doesnt",
             "ain't", "aren't", "can't", "couldn't", "daren't", "didn't", "doesn't",
             "dont", "hadnt", "hasnt", "havent", "isnt", "mightnt", "mustnt", "neither",
             "don't", "hadn't", "hasn't", "haven't", "isn't", "mightn't", "mustn't",
             "neednt", "needn't", "never", "none", "nope", "nor", "not", "nothing", "nowhere",
             "oughtnt", "shant", "shouldnt", "uhuh", "wasnt", "werent",
             "oughtn't", "shan't", "shouldn't", "uh-uh", "wasn't", "weren't",
             "without", "wont", "wouldnt", "won't", "wouldn't", "rarely", "seldom", "despite"]
    count = 0
    tokenized_word = x.split()
    for a in negat:
        for b in tokenized_word:
            if a == b:
                count += 1
    return count
def neg(x, y):
    # y is the negative-lexicon DataFrame; column 0 holds the words
    words = x.split()
    n_count = 0
    for c in y[0]:
        for j in words:
            if c == j:
                n_count += 1
    return n_count

def pos(x, y):
    # y is the positive-lexicon DataFrame; column 0 holds the words
    words = x.split()
    p_count = 0
    for c in y[0]:
        for j in words:
            if c == j:
                p_count += 1
    return p_count
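These two helpers compare every lexicon word against every token of the sentence, which is O(lexicon size × sentence length) per row, and they iterate over a DataFrame column every call. A sketch (untested; pos_words, neg_words and count_in_lexicon are my names) that builds plain sets once and then does constant-time membership tests:

# Build the lookup sets once, at module level.
pos_words = set(post[0])
neg_words = set(vad[0])

def count_in_lexicon(x, lexicon):
    # Count the tokens of x that appear in the given lexicon set.
    return sum(1 for word in x.split() if word in lexicon)

Since the lexicon entries are unique, count_in_lexicon(sentence, pos_words) should return the same number as pos(sentence, post).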
def getFeatures(sentence):
    # extract all features from one row of text
    numQuestion = Que_Counter(sentence)
    numExclamation = Exc_Counter(sentence)
    numSent = Sent_Counter(sentence)
    numWord = Word_Counter(sentence)
    numNoun = Noun_Counter(sentence)
    numPronoun = Pronoun_Counter(sentence)
    numAdject = Adjective_Counter(sentence)
    numAdverb = Adverb_Counter(sentence)
    numEntity = Name_Entity(sentence)
    numNegation = negation(sentence)
    numPosOpinion = pos(sentence, post)
    numNegOpinion = neg(sentence, vad)
    features = [numQuestion, numExclamation, numSent, numWord, numNoun, numPronoun,
                numAdject, numAdverb, numEntity, numNegation, numPosOpinion, numNegOpinion]
    return features
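Finally, the spaCy call in Name_Entity runs the full pipeline one row at a time. Processing the whole column with nlp.pipe is usually much faster; a sketch, assuming all the texts fit in memory:

# Batch the spaCy work over all rows instead of one nlp() call per row.
texts = list(test_data['text'])
entity_counts = [len(doc.ents) for doc in nlp.pipe(texts, batch_size=256)]

Combined with the single-pass tagger and the set lookups sketched above, each row would then need a single tagging pass and a few set lookups, instead of the five tokenizations, four taggings and three full list sweeps the current version does.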