I have 50,000 rows of data and have to apply a function to every row. I am using a for loop, but it takes far too long, about 1.5 hours. Could anyone please help me make it faster?
test_feat_custom = []
for x in test_data['text']:
    row = getFeatures(x)
    test_feat_custom.append(row)
This code takes a very long time to run, so please help me speed it up.
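Since every row is independent, one thing I have considered is spreading the rows across CPU cores. This is only a minimal sketch, assuming getFeatures is defined at module level (so it can be pickled) and test_data is an ordinary pandas DataFrame:

from multiprocessing import Pool

# Untested sketch: each worker process applies getFeatures to a chunk of rows.
if __name__ == '__main__':
    with Pool() as pool:
        test_feat_custom = pool.map(getFeatures, test_data['text'])

The main cost is inside getFeatures itself, though.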
Here is the code for my getFeatures function:
import pandas as pd
import nltk
nltk.download('maxent_ne_chunker')
nltk.download('words')
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag
from nltk import pos_tag_sents
from nltk.chunk import ne_chunk
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import numpy as np
from collections import Counter
import en_core_web_sm

nlp = en_core_web_sm.load()
# VADER lexicon: column 0 holds the word, column 1 its sentiment score.
vad = pd.read_csv('/home/riteshjain/anaconda3/Api/vader_lexicon.txt', error_bad_lines=False, header=None, sep=' ')

# Split the lexicon into a positive-score table (post) and a negative-score table (vad).
post = vad.copy()
post[1] = post[post[1] > 0][1]
post = post.dropna()
post = post.reset_index(drop=True)
vad[1] = vad[vad[1] < 0][1]
vad = vad.dropna()
vad = vad.reset_index(drop=True)
def Que_Counter(x):
    que_c = x.count('?')
    return que_c

def Exc_Counter(x):
    exc_c = x.count('!')
    return exc_c

def Sent_Counter(x):
    tokenized_text = sent_tokenize(x)
    return len(tokenized_text)

def Word_Counter(x):
    tokenized_word = word_tokenize(x)
    return len(tokenized_word)
def Noun_Counter(x):
    tokenized_word = word_tokenize(x)
    tagged = pos_tag(tokenized_word)
    nouns = []
    for (word, tag) in tagged:
        if tag.startswith("NN"):
            nouns.append(word)
    return len(nouns)

def Pronoun_Counter(x):
    tokenized_word = word_tokenize(x)
    tagged = pos_tag(tokenized_word)
    pronouns = []
    for (word, tag) in tagged:
        if tag.startswith("PRP"):
            pronouns.append(word)
    return len(pronouns)

def Adjective_Counter(x):
    tokenized_word = word_tokenize(x)
    tagged = pos_tag(tokenized_word)
    adject = []
    for (word, tag) in tagged:
        if tag.startswith("JJ"):
            adject.append(word)
    return len(adject)

def Adverb_Counter(x):
    tokenized_word = word_tokenize(x)
    tagged = pos_tag(tokenized_word)
    adverb = []
    for (word, tag) in tagged:
        if tag.startswith("RB"):
            adverb.append(word)
    return len(adverb)
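One thing I noticed: the four counters above each call word_tokenize and pos_tag on the same sentence, so every row gets tokenized and tagged four times. Here is a sketch (untested; the name count_pos_tags is mine) that tags once and returns all four counts:

def count_pos_tags(x):
    # One tokenize + tag pass instead of four separate ones.
    tagged = pos_tag(word_tokenize(x))
    nouns = sum(1 for _, tag in tagged if tag.startswith("NN"))
    pronouns = sum(1 for _, tag in tagged if tag.startswith("PRP"))
    adjectives = sum(1 for _, tag in tagged if tag.startswith("JJ"))
    adverbs = sum(1 for _, tag in tagged if tag.startswith("RB"))
    return nouns, pronouns, adjectives, adverbs

getFeatures could then unpack these four numbers from a single call instead of making four.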
def Name_Entity(x):
    doc = nlp(x)
    ca = [(X.text, X.label_) for X in doc.ents]
    return len(ca)
def negation(x):
    negat = ["aint", "arent", "cannot", "cant", "couldnt", "darent", "didnt", "doesnt",
             "ain't", "aren't", "can't", "couldn't", "daren't", "didn't", "doesn't",
             "dont", "hadnt", "hasnt", "havent", "isnt", "mightnt", "mustnt", "neither",
             "don't", "hadn't", "hasn't", "haven't", "isn't", "mightn't", "mustn't",
             "neednt", "needn't", "never", "none", "nope", "nor", "not", "nothing", "nowhere",
             "oughtnt", "shant", "shouldnt", "uhuh", "wasnt", "werent",
             "oughtn't", "shan't", "shouldn't", "uh-uh", "wasn't", "weren't",
             "without", "wont", "wouldnt", "won't", "wouldn't", "rarely", "seldom", "despite"]
    count = 0
    tokenized_word = x.split()
    for a in negat:
        for b in tokenized_word:
            if a == b:
                count += 1
    return count
def neg(x, y):
    # y is the negative-lexicon DataFrame; column 0 holds the words
    words = x.split()
    n_count = 0
    for c in y[0]:
        for j in words:
            if c == j:
                n_count += 1
    return n_count

def pos(x, y):
    # y is the positive-lexicon DataFrame; column 0 holds the words
    words = x.split()
    p_count = 0
    for c in y[0]:
        for j in words:
            if c == j:
                p_count += 1
    return p_count
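These two helpers compare every lexicon word against every token of the sentence, which is O(lexicon size × sentence length) per row, and they iterate over a DataFrame column every call. A sketch (untested; pos_words, neg_words and count_in_lexicon are my names) that builds plain sets once and then does constant-time membership tests:

# Build the lookup sets once, at module level.
pos_words = set(post[0])
neg_words = set(vad[0])

def count_in_lexicon(x, lexicon):
    # Count the tokens of x that appear in the given lexicon set.
    return sum(1 for word in x.split() if word in lexicon)

Since the lexicon entries are unique, count_in_lexicon(sentence, pos_words) should return the same number as pos(sentence, post).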
def getFeatures(sentence):
    # extract all features from one row of text
    numQuestion = Que_Counter(sentence)
    numExclamation = Exc_Counter(sentence)
    numSent = Sent_Counter(sentence)
    numWord = Word_Counter(sentence)
    numNoun = Noun_Counter(sentence)
    numPronoun = Pronoun_Counter(sentence)
    numAdject = Adjective_Counter(sentence)
    numAdverb = Adverb_Counter(sentence)
    numEntity = Name_Entity(sentence)
    numNegation = negation(sentence)
    numPosOpinion = pos(sentence, post)
    numNegOpinion = neg(sentence, vad)
    features = [numQuestion, numExclamation, numSent, numWord, numNoun, numPronoun,
                numAdject, numAdverb, numEntity, numNegation, numPosOpinion, numNegOpinion]
    return features
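Finally, the spaCy call in Name_Entity runs the full pipeline one row at a time. Processing the whole column with nlp.pipe is usually much faster; a sketch, assuming all the texts fit in memory:

# Batch the spaCy work over all rows instead of one nlp() call per row.
texts = list(test_data['text'])
entity_counts = [len(doc.ents) for doc in nlp.pipe(texts, batch_size=256)]

Combined with the single-pass tagger and the set lookups sketched above, each row would then need a single tagging pass and a few set lookups, instead of the five tokenizations, four taggings and three full list sweeps the current version does.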