Using text in the Quora Pairs Kaggle Challenge

Date: 2017-09-30 18:01:31

Tags: python-3.x machine-learning nlp

I recently started working with the dataset from the Quora Question Pairs Challenge.

Quora Question Pairs Challenge Dataset

So far I have done some basic things such as visualizing the data and cleaning it (lemmatization, stop word removal, removing punctuation, etc.). I have also used the word2vec model from the gensim package to generate embeddings from the question pairs. I am a bit confused about where to go from here.

How do I fit a model on this and get it to make predictions? Any help with this would be welcome.

Here is the code I have written for this.

# Imports needed by the code below
import os
import re
import string

import pandas as pd
import nltk
import gensim
from nltk.corpus import stopwords, wordnet

'''Opening directory path'''

# Raw string so the backslashes are not treated as escape sequences
path = os.path.normpath(r'M:\PycharmProjects\AI+DL+CP\QQP')

train_df = None
test_dataset = None

# Walk the directory and load the train/test CSVs when found
for subdir, dirs, files in os.walk(path):
    for file in files:
        #print(file)
        if file == 'QQPT.csv':
            train_df = pd.read_csv(os.path.join(subdir, file), encoding='utf-8')

        elif file == 'QQPTest.csv':
            test_dataset = pd.read_csv(os.path.join(subdir, file), encoding='utf-8')

eng_stopwords = set(stopwords.words('english'))

def remove_special_characters_after_tokenization(tokens):
    pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
    filtered_tokens = filter(None, [pattern.sub('', token) for token in tokens])
    #print('Without special')
    return list(filtered_tokens)


def remove_stop_words(tokens):
    filtered_tokens=[word for word in tokens if word not in eng_stopwords]
    return list(filtered_tokens)

def remove_repeated_characters(tokens):
    '''Collapse repeated letters (e.g. "finalllly" -> "finally") until WordNet recognizes the word.'''
    repeat_pattern = re.compile(r'(\w*)(\w)\2(\w*)')
    match_substitution = r'\1\2\3'

    def replace(old_word):
        if wordnet.synsets(old_word):
            return old_word
        new_word = repeat_pattern.sub(match_substitution, old_word)
        return replace(new_word) if new_word != old_word else new_word
    correct_tokens = [replace(word) for word in tokens]
    return list(correct_tokens)

'''Stopword removal and extra character removal'''
tokenizer=nltk.word_tokenize

print(train_df['question2'])
#print(train_df.dtypes.index)
lemmatizer=nltk.stem.WordNetLemmatizer()

print('Processing question2')
# Cast to str first to guard against missing values, then lowercase and tokenize
train_df['question2'] = train_df['question2'].apply(lambda x: str(x))
train_df['question2'] = train_df['question2'].str.lower()
train_df['question2'] = train_df['question2'].apply(tokenizer)

print('Processing question1')
train_df['question1'] = train_df['question1'].apply(lambda x: str(x))
train_df['question1'] = train_df['question1'].str.lower()
train_df['question1'] = train_df['question1'].apply(tokenizer)

# Lemmatize each token; simply reassigning a loop variable would not update the DataFrame
train_df['question1'] = train_df['question1'].apply(
    lambda tokens: [lemmatizer.lemmatize(token) for token in tokens])

train_df['question2'] = train_df['question2'].apply(
    lambda tokens: [lemmatizer.lemmatize(token) for token in tokens])

train_df['question2']=train_df['question2'].apply(lambda x:remove_repeated_characters(x))
train_df['question2']=train_df['question2'].apply(lambda z:remove_special_characters_after_tokenization(z))
train_df['question2']=train_df['question2'].apply(lambda x:remove_stop_words(x))

train_df['question1']=train_df['question1'].apply(lambda x:remove_repeated_characters(x))
train_df['question1']=train_df['question1'].apply(lambda z:remove_special_characters_after_tokenization(z))
train_df['question1']=train_df['question1'].apply(lambda x:remove_stop_words(x))



'''Creating Embeddings [for both questions together]'''

print('Creating Embeddings')
# Adding the two Series joins the token lists row by row, so each question pair
# is fed to word2vec as a single sentence
embeddings = gensim.models.Word2Vec(train_df['question1'] + train_df['question2'])

print(embeddings)
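As a minimal sketch of what one possible next step could look like (not a definitive recipe): average the word2vec vectors of each question's tokens to get a fixed-length vector per question, build simple pair features from the two vectors, and fit an off-the-shelf classifier such as scikit-learn's LogisticRegression. The helper question_vector is made up for this sketch, and it assumes the training CSV contains the Kaggle is_duplicate label column; the same preprocessing would also have to be applied to the test questions before predicting.

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

def question_vector(tokens, model):
    '''Average the word2vec vectors of the tokens; zero vector if none are in the vocabulary.'''
    vecs = [model.wv[t] for t in tokens if t in model.wv]
    return np.mean(vecs, axis=0) if vecs else np.zeros(model.wv.vector_size)

# One fixed-length vector per question
q1_vecs = np.vstack([question_vector(t, embeddings) for t in train_df['question1']])
q2_vecs = np.vstack([question_vector(t, embeddings) for t in train_df['question2']])

# Simple pair features: absolute difference and element-wise product of the two vectors
X = np.hstack([np.abs(q1_vecs - q2_vecs), q1_vecs * q2_vecs])
y = train_df['is_duplicate'].values  # assumes the Kaggle label column is present

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
print('Validation accuracy:', clf.score(X_val, y_val))
# Duplicate probabilities for unseen pairs would come from clf.predict_proba(...)[:, 1]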

0 Answers:

No answers yet