Python Gensim FastText保存和加载模型

时间:2020-05-12 07:56:42

标签: python gensim fasttext

我正在使用Gensim FASTText建模,并且有以下问题。

  • “ ft_model.save(BASE_PATH + MODEL_PATH + fname)”的输出保存以下3个文件。这个对吗?有没有办法合并所有三个文件?
ft_gensim-v3
ft_gensim-v3.trainables.vectors_ngrams_lockf.npy
ft_gensim-v3.wv.vectors_ngrams.npy

当我尝试加载训练文件然后使用它时,我从if model.wv.similarity(real_data, labelled['QueryText'][i]) > maxSimilaity:收到以下错误

“功能”对象没有属性“ wv”

最后,这两个模型都有一种方法不必存储def read_train(path,label_path)def lemmetize(df_col)的输出,因此我不必每次都想训练代码时就运行这部分代码模型还是比较?

感谢您的协助。

这是我的FastText训练模型

import os
import logging
from config import BASE_PATH, DATA_PATH, MODEL_PATH
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from pprint import pprint as print
from gensim.models.fasttext import FastText as FT_gensim
from gensim.test.utils import datapath

#Read Training data
import pandas as pd
def read_train(path,label_path):
    d = []
    #e = []
    df = pd.read_excel(path)
    labelled = pd.read_csv(label_path)
    updated_col1 = lemmetize(df['query_text'])
    updated_col2 = lemmetize(labelled['QueryText'])
    for i in range(len(updated_col1)):
        d.append(updated_col1[i])
        #print(d)
    for i in range(len(updated_col2)):
        d.append(updated_col2[i])
    return d


from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import string
from nltk.stem import PorterStemmer

def lemmetize(df_col):
    df_updated_col = pd.Series(0, index = df_col.index)
    stop_words = set(stopwords.words('english'))
    lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
    ps = PorterStemmer()
    for i, j in zip(df_col, range(len(df_col))):
        lem = []
        t = str(i).lower()
        t = t.replace("'s","")
        t = t.replace("'","")
        translator = str.maketrans(string.punctuation, ' '*len(string.punctuation))
        t = t.translate(translator)
        word_tokens = word_tokenize(t)
        for i in range(len(word_tokens)):
            l1 = lemmatizer.lemmatize(word_tokens[i])
            s1 = ps.stem(word_tokens[i])
            if list(l1) != [''] and list(l1) != [' '] and l1 != '' and l1 != ' ':
                lem.append(l1)
        filtered_sentence = [w for w in lem if not w in stop_words]
        df_updated_col[j] = filtered_sentence
    return df_updated_col

#read test data
def read_test(path):
    return pd.read_excel(path)


#Read labelled data
def read_labelled(path):
    return pd.read_csv(path)


word_tokenized_corpus = read_train('Train Data.xlsx','SMEQueryText.csv')


#Train fasttext model
import tempfile
import os

from gensim.models import FastText
from gensim.test.utils import get_tmpfile
fname = get_tmpfile("ft_gensime-v3")

def train_fastText(data, embedding_size = 60, window_size = 40, min_word = 5, down_sampling = 1e-2, iter=100):
    ft_model = FastText(word_tokenized_corpus,
                      size=embedding_size,
                      window=window_size,
                      min_count=min_word,
                      sample=down_sampling,
                      sg=1,
                      iter=100)

    #with tempfile.NamedTemporaryFile(prefix=BASE_PATH + MODEL_PATH + 'ft_gensim_v2-', delete=False) as tmp:
    #    ft_model.save(tmp.name, separately=[])
    ft_model.save(BASE_PATH + MODEL_PATH + fname)
    return ft_model


# main function to output
def main(test_path, train_path, labelled):
    test_data = read_test(test_path)
    train_data = read_train(train_path,labelled)
    labelled = read_labelled(labelled)
    output_df = pd.DataFrame(index = range(len(test_data)))
    output_df['test_query'] = str()
    output_df['Similar word'] = str()
    output_df['category'] = str()
    output_df['similarity'] = float()
    model = train_fastText(train_data)

# run main
if __name__ == "__main__":
    output = main('Test Data.xlsx','Train Data.xlsx','QueryText.csv')

这是我的使用模式

import pandas as pd
from gensim.models import FastText
import gensim
from config import BASE_PATH, DATA_PATH, MODEL_PATH

#Read Training data
def read_train(path,label_path):
    d = []
    #e = []
    df = pd.read_excel(path)
    labelled = pd.read_csv(label_path)
    updated_col1 = lemmetize(df['query_text'])
    updated_col2 = lemmetize(labelled['QueryText'])
    for i in range(len(updated_col1)):
        d.append(updated_col1[i])
    for i in range(len(updated_col2)):
        d.append(updated_col2[i])
    return d

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import string
from nltk.stem import PorterStemmer

def lemmetize(df_col):
    df_updated_col = pd.Series(0, index = df_col.index)
    stop_words = set(stopwords.words('english'))
    lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
    ps = PorterStemmer()
    for i, j in zip(df_col, range(len(df_col))):
        lem = []
        t = str(i).lower()
        t = t.replace("'s","")
        t = t.replace("'","")
        translator = str.maketrans(string.punctuation, ' '*len(string.punctuation))
        t = t.translate(translator)
        word_tokens = word_tokenize(t)
        for i in range(len(word_tokens)):
            l1 = lemmatizer.lemmatize(word_tokens[i])
            s1 = ps.stem(word_tokens[i])
            if list(l1) != [''] and list(l1) != [' '] and l1 != '' and l1 != ' ':
                lem.append(l1)
        filtered_sentence = [w for w in lem if not w in stop_words]
        df_updated_col[j] = filtered_sentence
    return df_updated_col

#read test data
def read_test(path):
    return pd.read_excel(path)

#Read labelled data
def read_labelled(path):
    return pd.read_csv(path)

def load_training():
    return FT_gensim.load(BASE_PATH + MODEL_PATH +'ft_gensim-v3')

#compare similarity
def compare_similarity(model, real_data, labelled):
    maxWord = ''
    category = ''
    maxSimilaity = 0
    #print("train data",labelled[1])
    for i in range(len(labelled)):
        if model.similarity(real_data, labelled['QueryText'][i]) > maxSimilaity:
            #print('labelled',labelled['QueryText'][i], 'i', i)
            maxWord = labelled['QueryText'][i]
            category = labelled['Subjectmatter'][i]
            maxSimilaity = model.similarity(real_data, labelled['QueryText'][i])

    return maxWord, category, maxSimilaity

# Output from Main to excel
from pandas import ExcelWriter
def export_Excel(data, aFile = 'FASTTEXTOutput.xlsx'):
    df = pd.DataFrame(data)
    writer = ExcelWriter(aFile)
    df.to_excel(writer,'Sheet1')
    writer.save()

# main function to output
def main(test_path, train_path, labelled):
    test_data = read_test(test_path)
    train_data = read_train(train_path,labelled)
    labelled = read_labelled(labelled)
    output_df = pd.DataFrame(index = range(len(test_data)))
    output_df['test_query'] = str()
    output_df['Similar word'] = str()
    output_df['category'] = str()
    output_df['similarity'] = float()
    model = load_training
    for i in range(len(test_data)):
        output_df['test_query'][i] = test_data['query_text'][i]
        #<first change>
        maxWord, category, maxSimilaity = compare_similarity(model, str(test_data['query_text'][i]), labelled)
        output_df['Similar word'][i] = maxWord
        output_df['category'][i] = category
        output_df['similarity'][i] = maxSimilaity
    #<second change>    
    return output_df

# run main
if __name__ == "__main__":
    output = main('Test Data.xlsx','Train Data.xlsx','SMEQueryText.csv')
    export_Excel(output)

这是完整的可跟踪错误消息

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-22-57803b59c0b9> in <module>
      1 # run main
      2 if __name__ == "__main__":
----> 3     output = main('Test Data.xlsx','Train Data.xlsx','SMEQueryText.csv')
      4     export_Excel(output)

<ipython-input-21-17cb88ee0f79> in main(test_path, train_path, labelled)
     13         output_df['test_query'][i] = test_data['query_text'][i]
     14         #<first change>
---> 15         maxWord, category, maxSimilaity = compare_similarity(model, str(test_data['query_text'][i]), labelled)
     16         output_df['Similar word'][i] = maxWord
     17         output_df['category'][i] = category

<ipython-input-19-84d7f268d669> in compare_similarity(model, real_data, labelled)
      6     #print("train data",labelled[1])
      7     for i in range(len(labelled)):
----> 8         if model.wv.similarity(real_data, labelled['QueryText'][i]) > maxSimilaity:
      9             #print('labelled',labelled['QueryText'][i], 'i', i)
     10             maxWord = labelled['QueryText'][i]

AttributeError: 'function' object has no attribute 'wv'

1 个答案:

答案 0 :(得分:0)

您在这里有三个单独的,仅含糊的相关问题。按顺序排列每个

  • 为什么有3个文件,并且可以将它们合并?

将大型原始阵列与主要的“腌制”模型分开存储是更有效的方法,对于超过几千兆字节的模型,解决“棘手的”实现限制是必需的。因此,我建议您仅保留默认行为,并养成一起管理/移动/复制文件集的习惯。

如果您的模型足够小,则可以尝试一些方法。 .save()方法具有一个可选参数sep_limit,该参数控制阈值数组大小,在该阈值数组上将数组存储为单独的文件。通过设置较大的值,例如sep_limit=2*1024*1024*1024(2GiB),较小的模型应保存一个文件。 (但是,加载速度会变慢,您将不会使用有时有用的内存映射加载选项,并且在超大型模型上保存可能会中断。)

  • 为什么会出现AttributeError: 'function' object has no attribute 'wv'错误?

您的代码行model = load_training将实际函数分配给model变量,而不是您可能想要的,而是使用一些参数调用该函数的返回值。该函数没有.wv属性,因此出现错误。如果modelFastText的实际实例,则不会收到该错误。

  • 可以存储语料库文本,以避免重复预处理和从熊猫格式转换吗?

当然,您可以将文本写入文件中。大概是:

with open('mycorpus.txt', mode='w') as corpusfile:
    for text in word_tokenized_corpus:
        corpusfile.write(' '.join(text))
        corpusfile.write('\n')

尽管实际上,gensim提供了一个实用程序功能utils.save_as_line_sentence(),它可以做到这一点(&明确地处理了一些额外的编码问题)。参见:

https://radimrehurek.com/gensim/utils.html#gensim.utils.save_as_line_sentence

LineSentence中的gensim.models.word2vec实用工具类可以将此类文件中的文本流回以供将来重用:

https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.LineSentence