I recently started working with the dataset from the Quora Question Pairs Challenge.
Quora Question Pairs Challenge Dataset
So far I've done some basic things like visualizing the data and cleaning it (lemmatization, stop word removal, punctuation removal, etc.). I've also used the word2vec model from the gensim package to generate embeddings from the question pairs. This is where I'm a bit stuck.
How do I fit a model on top of these embeddings and get it to make predictions? (I've sketched the kind of thing I'm picturing after the code below, but I'm not sure it's right.) Any help with this would be appreciated.
Here is the code I've written for this so far.
import os
import re
import string

import pandas as pd
import nltk
import gensim
from nltk.corpus import stopwords, wordnet

'''Opening directory path'''
path = os.path.normpath(r'M:\PycharmProjects\AI+DL+CP\QQP')
train_df = None
test_dataset = None
for subdir, dirs, files in os.walk(path):
    for file in files:
        if file == 'QQPT.csv':
            train_df = pd.read_csv(os.path.join(subdir, file), encoding='utf-8')
        elif file == 'QQPTest.csv':
            test_dataset = pd.read_csv(os.path.join(subdir, file), encoding='utf-8')
eng_stopwords = set(stopwords.words('english'))

def remove_special_characters_after_tokenization(tokens):
    # strip punctuation from each token and drop tokens that become empty
    pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
    filtered_tokens = filter(None, [pattern.sub('', token) for token in tokens])
    return list(filtered_tokens)

def remove_stop_words(tokens):
    # drop English stopwords
    return [word for word in tokens if word not in eng_stopwords]
def remove_repeated_characters(tokens):
    # collapse repeated characters (e.g. 'amazzzing' -> 'amazing') until the word
    # is found in WordNet or no further reduction is possible
    repeat_pattern = re.compile(r'(\w*)(\w)\2(\w*)')
    match_substitution = r'\1\2\3'
    def replace(old_word):
        if wordnet.synsets(old_word):
            return old_word
        new_word = repeat_pattern.sub(match_substitution, old_word)
        return replace(new_word) if new_word != old_word else new_word
    return [replace(word) for word in tokens]
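# quick sanity check of the helpers on made-up tokens (illustrative only; expected output in comments)
print(remove_special_characters_after_tokenization(['hello', '?', 'world!']))  # ['hello', 'world']
print(remove_stop_words(['what', 'is', 'the', 'best', 'way']))                 # ['best', 'way']
print(remove_repeated_characters(['finallly', 'amazzzing']))                   # ['finally', 'amazing']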
'''Stopword removal and extra character removal'''
tokenizer = nltk.word_tokenize
lemmatizer = nltk.stem.WordNetLemmatizer()

print('Processing question2')
train_df['question2'] = train_df['question2'].apply(lambda x: str(x))  # guard against NaN entries
train_df['question2'] = train_df['question2'].str.lower()
train_df['question2'] = train_df['question2'].apply(tokenizer)

print('Processing question1')
train_df['question1'] = train_df['question1'].apply(lambda x: str(x))
train_df['question1'] = train_df['question1'].str.lower()
train_df['question1'] = train_df['question1'].apply(tokenizer)
# lemmatize every token in both question columns
train_df['question1'] = train_df['question1'].apply(lambda tokens: [lemmatizer.lemmatize(token) for token in tokens])
train_df['question2'] = train_df['question2'].apply(lambda tokens: [lemmatizer.lemmatize(token) for token in tokens])
# remove repeated characters, punctuation and stopwords from both columns
for col in ('question1', 'question2'):
    train_df[col] = train_df[col].apply(remove_repeated_characters)
    train_df[col] = train_df[col].apply(remove_special_characters_after_tokenization)
    train_df[col] = train_df[col].apply(remove_stop_words)
'''Creating Embeddings [for both questions together]'''
print('Creating Embeddings')
# element-wise + concatenates the question1 and question2 token lists row by row
embeddings = gensim.models.Word2Vec(train_df['question1'] + train_df['question2'])
print(embeddings)
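For reference, the next step I'm picturing is to collapse each question into a single vector (e.g. the mean of its word vectors) and fit a simple classifier on the pair, but I'm not sure this is the right way to use the embeddings. A rough sketch of that idea, assuming the training CSV has the usual is_duplicate label column and using scikit-learn's LogisticRegression purely as a placeholder:

import numpy as np
from sklearn.linear_model import LogisticRegression

def question_vector(tokens, model):
    # average the vectors of the tokens the Word2Vec model has actually seen
    vectors = [model.wv[token] for token in tokens if token in model.wv]
    if not vectors:
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)

q1_vecs = np.array([question_vector(tokens, embeddings) for tokens in train_df['question1']])
q2_vecs = np.array([question_vector(tokens, embeddings) for tokens in train_df['question2']])

X = np.abs(q1_vecs - q2_vecs)      # one possible pair feature: element-wise distance
y = train_df['is_duplicate']       # assumes the label column from the Quora training data

clf = LogisticRegression(max_iter=1000)
clf.fit(X, y)
print(clf.predict(X[:5]))          # predictions for the first few training pairs

Is averaging word vectors like this a reasonable way to turn the question pairs into model inputs, or is there a better way to go from the word2vec embeddings to predictions?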