Loading files with sklearn to perform KMeans

Asked: 2015-10-20 11:53:48

Tags: python-2.7 machine-learning scikit-learn

I have 100 files, each containing a system call trace. Each file looks like this:

setpgrp ioctl setpgrp ioctl ioctl ....

I am trying to load these files and run KMeans on them so that they get clustered by similarity. Following the tutorial on the sklearn website, I wrote the following:

from __future__ import print_function

import sys
from optparse import OptionParser

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics
from sklearn.datasets import load_files
from sklearn.cluster import KMeans, MiniBatchKMeans
import numpy as np

# parse commandline arguments
op = OptionParser()
op.add_option("--lsa",
              dest="n_components", type="int",
              help="Preprocess documents with latent semantic analysis.")
op.add_option("--no-minibatch",
              action="store_false", dest="minibatch", default=True,
              help="Use ordinary k-means algorithm (in batch mode).")
op.add_option("--use-idf",
              action="store_false", dest="use_idf", default=True,
              help="Disable Inverse Document Frequency feature weighting.")
op.add_option("--n-features", type=int, default=10000,
              help="Maximum number of features (dimensions)"
                   " to extract from text.")
op.add_option("--verbose",
              action="store_true", dest="verbose", default=False,
              help="Print progress reports inside k-means algorithm.")
print(__doc__)
op.print_help()

(opts, args) = op.parse_args()
if len(args) > 0:
    op.error("this script takes no arguments.")
    sys.exit(1)

print("Loading training data:")
trainingdata = load_files(r'C:\data\Training data')  # raw string so the backslashes are not treated as escapes
print("%d documents" % len(trainingdata.data))
print() 

print("Extracting features from the training trainingdata using a sparse vectorizer")

if opts.use_idf:
    vectorizer = TfidfVectorizer(input="file", min_df=1)
else:
    vectorizer = TfidfVectorizer(input="file", min_df=1, use_idf=False)
X = vectorizer.fit_transform(trainingdata.data)


print("n_samples: %d, n_features: %d" % X.shape)
print()

if opts.n_components:
    print("Performing dimensionality reduction using LSA")
    # Vectorizer results are normalized, which makes KMeans behave as
    # spherical k-means for better results. Since LSA/SVD results are
    # not normalized, we have to redo the normalization.
    svd = TruncatedSVD(opts.n_components)
    lsa = make_pipeline(svd, Normalizer(copy=False))

    X = lsa.fit_transform(X)


    explained_variance = svd.explained_variance_ratio_.sum()
    print("Explained variance of the SVD step: {}%".format(
        int(explained_variance * 100)))

    print()

However, it seems that the files in the dataset directory never get loaded into memory, even though they are all there. Running the program produces the following error:

raise ValueError("empty vocabulary; perhaps the documents only"

ValueError: empty vocabulary; perhaps the documents only contain stop words

Can anyone tell me why the dataset is not being loaded? What am I doing wrong?

1 Answer:

Answer 0 (score: 1):

I finally managed to load the files. The way to use KMeans in sklearn is to vectorize the training data (with a tf-idf or count vectorizer), then transform the test data with the vectorizer that was fitted on the training data. Once that is done, you can initialize the KMeans parameters, use the training-data vectors to create the k-means clusters, and finally relate the test data to the training centroids. The following code performs the steps explained above.

import os

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Read every file in a directory into a list of documents (one string per file):
def readfile(dataDir):
    data_set = []
    for filename in os.listdir(dataDir):
        trainingfile = os.path.join(dataDir, filename)
        if os.path.isfile(trainingfile):
            with open(trainingfile, 'r') as data:
                data_set.append(data.read().decode('utf-8'))
    return data_set

# Fit the tf-idf vectorizer on the training data and transform it
tfidf_vectorizer = TfidfVectorizer(min_df=1)
tfidf_vectorizer_trainingset = tfidf_vectorizer.fit_transform(readfile(trainingdataDir)).toarray()

# Transform the test set with the vectorizer fitted on the training set
tfidf_vectorizer_testset = tfidf_vectorizer.transform(readfile(testingdataDir)).toarray()

# KMeans clustering parameters (number_of_clusters has to be chosen in advance)
kmean_parameters = KMeans(n_clusters=number_of_clusters, init='k-means++',
                          max_iter=100, n_init=1)

# Cluster the training data with those parameters
KmeanAnalysis_training = kmean_parameters.fit(tfidf_vectorizer_trainingset)

# Transform the test data into distances to the training clusters' centroids
KmeanAnalysis_test = kmean_parameters.transform(tfidf_vectorizer_testset)
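
Note that KMeans.transform returns, for each test sample, its distance to every training centroid, not a cluster label. If you want a hard assignment per test file, KMeans.predict returns the index of the nearest centroid. A minimal sketch reusing the variables above (the print calls are just illustrative):

# Hard cluster assignments instead of centroid distances
training_labels = KmeanAnalysis_training.labels_                   # cluster index of each training file
test_labels = kmean_parameters.predict(tfidf_vectorizer_testset)   # nearest training centroid for each test file
print(training_labels)
print(test_labels)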