不过,我使用的是 GoogleNews word2vec 300 维模型中预先计算好的词嵌入。






我尝试过大小为 1、2、3 的单个过滤器,以及过滤器大小的组合 [3,4,5] 和 [1,3,4]。






将权重的 stddev 设置在 0.01 到 0.4 之间;将偏置的初始值设置在 0 到 0.1 之间;尝试使用 Xavier 初始化器来初始化 conv2d 的权重。


我只尝试了两个部分数据集:一个包含 15,000 条训练数据,另一个包含 5,000 条测试数据。我总共有 263,000 条数据需要训练。无论是在 15,000 条训练数据上进行训练和评估,还是把 5,000 条测试数据当作训练数据使用(以节省测试时间),准确率都没有差异。

我已经在 15,000/5,000 的划分上成功完成了分类:使用 BoW 输入的前馈网络(准确率 93%)、TF-IDF 加 SVM(92%)以及 TF-IDF 加朴素贝叶斯(91.5%)。所以我认为问题不在数据上。



        def do_eval(data_set, label_set, batch_size):
            """Run one evaluation pass and print accuracy and summed loss.

            Batches are drawn with ``nextBatch``, which samples rows at random,
            so the reported numbers are an estimate over
            ``steps_per_epoch * batch_size`` sampled examples rather than an
            exact sweep over every row.

            :param data_set: the set of embeddings to evaluate
            :param label_set: the set of labels to evaluate
            :param batch_size: number of examples per evaluation batch
                (presumably must equal the batch size the graph was built
                with -- confirm against the placeholder shapes)
            """
            true_count = 0  # Number of correct predictions seen so far.
            steps_per_epoch = len(label_set) // batch_size
            num_examples = steps_per_epoch * batch_size
            totalLoss = 0

            for _ in range(steps_per_epoch):
                # Use the arguments consistently rather than the module-level
                # labels_set / batchSize the original body fell back on.
                input_batch, label_batch = nextBatch(data_set, label_set, batch_size)
                evalAcc, evalLoss = eval_step(input_batch, label_batch)
                # evalAcc is a float fraction; round so that float error does
                # not make the "%d" below under-count correct predictions.
                true_count += int(round(evalAcc * batch_size))
                totalLoss += evalLoss

            precision = float(true_count) / num_examples
            print('  Num examples: %d  Num correct: %d  Precision @ 1: %0.04f' % (num_examples, true_count, precision))
            print("Eval Loss: " + str(totalLoss))


class TextCNN(object):
    """A CNN for text classification.

    Uses a convolutional, max-pooling and softmax layer. One
    convolution + max-pool branch is built per entry in ``filter_sizes``;
    the pooled features are concatenated, passed through dropout and a
    fully connected layer, and scored with softmax cross-entropy.

    Graph nodes exposed as attributes: ``input_placeholder``, ``labels``,
    ``pKeep``, ``h_pool``, ``h_pool_flat``, ``h_drop``, ``logits``,
    ``predictions``, ``loss`` and ``accuracy``.
    """

    def __init__(
            self, batchSize, numWords, num_classes,
            embedding_size, filter_sizes, num_filters):
        """Build the inference, loss and accuracy sub-graphs.

        :param batchSize: fixed first dimension of the input/label placeholders
        :param numWords: number of word positions per example
        :param num_classes: number of output classes
        :param embedding_size: length of each word embedding vector
        :param filter_sizes: iterable of filter heights (words spanned)
        :param num_filters: number of filters created per filter size
        """
        # Set place holders
        self.input_placeholder = tf.placeholder(
            tf.float32, [batchSize, numWords, embedding_size, 1])
        self.labels = tf.placeholder(tf.int32, [batchSize, num_classes])
        self.pKeep = tf.placeholder(tf.float32)

        # Inference
        # Ready to build conv layers followed by max pooling layers.
        # Each conv layer produces a different shaped output so need to loop
        # over them and create a layer for each and then merge the results.
        pooled_outputs = []
        for i, filter_size in enumerate(filter_sizes):
            with tf.name_scope("conv-maxpool-%s" % filter_size):
                # Convolution Layer
                filter_shape = [filter_size, embedding_size, 1, num_filters]

                # W: Filter matrix
                W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.01), name='W')
                b = tf.Variable(tf.constant(0.0, shape=[num_filters]), name="b")

                # Valid padding: Narrow convolution (no edge padded so filter slides over everything)
                # Output size = (input_size (numWords in this case) + 2 * padding (0 in this case) - filter_size) + 1
                conv = tf.nn.conv2d(
                    self.input_placeholder,
                    W,
                    strides=[1, 1, 1, 1],
                    padding="VALID",
                    name="conv")

                # Apply nonlinearity i.e add the bias to Wx + b
                # Where Wx is the conv layer above
                # Then run it through the activation function
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name='relu')

                # Max-pooling over the outputs
                # Max-pool to control the output size
                # By taking only the best features determined by the filter
                # Ksize is the size of the window of the input tensor
                pooled = tf.nn.max_pool(
                    h,
                    ksize=[1, numWords - filter_size + 1, 1, 1],
                    strides=[1, 1, 1, 1],
                    padding="VALID",
                    name="pool")

                # Each pooled outputs a tensor of size
                # [batchSize, 1, 1, num_filters] where num_filters represents the
                # Number of features we wanted pooled
                pooled_outputs.append(pooled)

        # Combine all pooled features
        num_filters_total = num_filters * len(filter_sizes)
        # Concat the pool output along the 3rd (num_filters / feature size) dimension
        self.h_pool = tf.concat(pooled_outputs, 3)
        # Flatten
        self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])

        # Add drop out to regularize the learning curve / accuracy
        with tf.name_scope("dropout"):
            self.h_drop = tf.nn.dropout(self.h_pool_flat, self.pKeep)

        # Fully connected output layer
        with tf.name_scope("output"):
            W = tf.Variable(tf.truncated_normal([num_filters_total, num_classes], stddev=0.01), name="W")
            b = tf.Variable(tf.constant(0.0, shape=[num_classes]), name='b')
            self.logits = tf.nn.xw_plus_b(self.h_drop, W, b, name='logits')
            self.predictions = tf.argmax(self.logits, 1, name='predictions')

        # Loss
        with tf.name_scope("loss"):
            losses = tf.nn.softmax_cross_entropy_with_logits(labels=self.labels, logits=self.logits, name="xentropy")
            self.loss = tf.reduce_mean(losses)

        # Accuracy
        with tf.name_scope("accuracy"):
            correct_predictions = tf.equal(self.predictions, tf.argmax(self.labels, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")

# Running the training
# Define various parameters for network

batchSize = 100
numWords = 120
embedding_size = 300
num_classes = 4
filter_sizes = [3, 4, 5]  # Slide over this many words at a time, i.e. 3 words, 4 words etc...
num_filters = 126
maxSteps = 5000
initial_learning_rate = 0.001
# NOTE: despite the name, this value is fed to cnn.pKeep in train_step, i.e.
# it is a keep probability -- 1 means dropout is effectively disabled.
dropoutRate = 1

data_set = np.load("/home/kevin/Documents/NSERC_2017/articles/classifyDataSet/TestSmaller_CNN_inputMat_0.npy")
labels_set = np.load("Test_NN_target_smaller.npy")

with tf.Graph().as_default():

    sess = tf.Session()

    with sess.as_default():
        cnn = TextCNN(batchSize=batchSize,
                      numWords=numWords,
                      num_classes=num_classes,
                      embedding_size=embedding_size,
                      filter_sizes=filter_sizes,
                      num_filters=num_filters)

        # Define training operation
        # Pick an optimizer, set its learning rate, and tell it what to minimize

        global_step = tf.Variable(0, name='global_step', trainable=False)
        optimizer = tf.train.AdamOptimizer(initial_learning_rate)
        grads_and_vars = optimizer.compute_gradients(cnn.loss)
        train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

        # Summaries to save for tensor board

        # Set directory
        out_dir = "/home/kevin/Documents/NSERC_2017/articles/classifyDataSet/tf_logs/CNN_Embedding/"

        # Loss and accuracy summaries
        loss_summary = tf.summary.scalar("loss", cnn.loss)
        acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)

        # Train summaries
        train_summary_op = tf.summary.merge([loss_summary, acc_summary])
        train_summary_dir = out_dir + "train/"
        train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

        # Test summaries
        test_summary_op = tf.summary.merge([loss_summary, acc_summary])
        test_summary_dir = out_dir + "test/"
        test_summary_write = tf.summary.FileWriter(test_summary_dir, sess.graph)

        # Init all variables. Creating the init op is not enough: it has to be
        # run before any step reads the variables (model weights, global_step,
        # and the Adam slots created by apply_gradients above).
        init = tf.global_variables_initializer()
        sess.run(init)

        def train_step(input_data, labels_data):
            """Run a single training step and log its summaries.

            Feeds one batch through ``train_op`` and writes the merged
            loss/accuracy summary for the resulting global step to
            ``train_summary_writer``.

            :param input_data: input batch fed to ``cnn.input_placeholder``
            :param labels_data: labels to train to, fed to ``cnn.labels``
            """
            feed_dict = {
                cnn.input_placeholder: input_data,
                cnn.labels: labels_data,
                cnn.pKeep: dropoutRate
            }
            _, step, summaries, loss, accuracy = sess.run(
                [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy],
                feed_dict)
            train_summary_writer.add_summary(summaries, step)


        def eval_step(input_data, labels_data, writer=None):
            """Evaluate the model on a single batch.

            Runs with ``cnn.pKeep`` fixed at 1.0 and does not run ``train_op``,
            so no weights are updated.

            :param input_data: input batch fed to ``cnn.input_placeholder``
            :param labels_data: labels fed to ``cnn.labels``
            :param writer: optional summary writer; when given, the merged
                test summary is recorded against the current global step
            :return: tuple ``(accuracy, loss)`` for this batch
            """
            feed_dict = {
                cnn.input_placeholder: input_data,
                cnn.labels: labels_data,
                cnn.pKeep: 1.0
            }

            step, summaries, loss, accuracy = sess.run(
                [global_step, test_summary_op, cnn.loss, cnn.accuracy],
                feed_dict)
            if writer:
                writer.add_summary(summaries, step)
            return accuracy, loss


        def nextBatch(data_set, labels_set, batchSize):
            """Get the next batch of data.

            Rows are sampled uniformly at random with ``np.random.choice``
            (default settings, so the same row may appear more than once in a
            batch); the same indices are used for data and labels so the
            pairs stay aligned.

            :param data_set: entire training or test data set, indexed as
                ``[row, :, :]``
            :param labels_set: entire training or test label set, indexed as
                ``[row, :]``
            :param batchSize: batch size
            :return: a batch of the data, with a trailing axis of length 1
                added, and its corresponding labels
            """
            # Generate random row indices for the documents
            rand_index = np.random.choice(data_set.shape[0], size=batchSize)

            # Grab the data to give to the feed dicts
            data_batch = data_set[rand_index, :, :]
            labels_batch = labels_set[rand_index, :]

            # Add the trailing channel axis expected by the conv input
            data_batch = data_batch.reshape(data_batch.shape + (1,))
            return data_batch, labels_batch

        def do_eval(data_set, label_set, batch_size):
            """Run one evaluation pass and print accuracy and summed loss.

            Batches are drawn with ``nextBatch``, which samples rows at random,
            so the reported numbers are an estimate over
            ``steps_per_epoch * batch_size`` sampled examples rather than an
            exact sweep over every row.

            :param data_set: the set of embeddings to evaluate
            :param label_set: the set of labels to evaluate
            :param batch_size: number of examples per evaluation batch
                (presumably must equal the batch size the graph was built
                with -- confirm against the placeholder shapes)
            """
            true_count = 0  # Number of correct predictions seen so far.
            steps_per_epoch = len(label_set) // batch_size
            num_examples = steps_per_epoch * batch_size
            totalLoss = 0

            for _ in range(steps_per_epoch):
                # Use the arguments consistently rather than the module-level
                # labels_set / batchSize the original body fell back on.
                input_batch, label_batch = nextBatch(data_set, label_set, batch_size)
                evalAcc, evalLoss = eval_step(input_batch, label_batch)
                # evalAcc is a float fraction; round so that float error does
                # not make the "%d" below under-count correct predictions.
                true_count += int(round(evalAcc * batch_size))
                totalLoss += evalLoss

            precision = float(true_count) / num_examples
            print('  Num examples: %d  Num correct: %d  Precision @ 1: %0.04f' % (num_examples, true_count, precision))
            print("Eval Loss: " + str(totalLoss))

        # Training Loop

        for step in range(maxSteps):
            input_batch, label_batch = nextBatch(data_set, labels_set, batchSize)
            # Without this call the loop only samples batches and evaluates;
            # the weights are never updated.
            train_step(input_batch, label_batch)

            # Evaluate on the data set every 100 steps
            if step % 100 == 0:
                print("On Step : " + str(step) + " of " + str(maxSteps))
                do_eval(data_set, labels_set, batchSize)


def createInputEmbeddedMatrix(corpusPath, maxWords, svName):
    """Build and save batched ``[doc, word, embedding]`` input matrices.

    Documents are processed in batches of up to 5000 to bound memory use.
    For each document the words selected by ``tf_idfTopWords`` are looked up
    in the id-to-embedding table and written into that document's row; rows
    with fewer than ``maxWords`` words stay zero-padded. Each batch is saved
    to ``svName + "_<batch index>.npy"``.

    :param corpusPath: path to the Matrix Market corpus; only its length is
        used, to determine the number of documents
    :param maxWords: number of word slots per document (second matrix axis)
    :param svName: filename prefix for the saved batch files
    """
    genDocsPath = "gen_docs_classifyData_smallerTest_TFIDF.npy"
    dictPath = 'news_word2vec_smallerDict.dict'
    tf_idf_path = "news_tfIdf_word2vec_All.tfidf_model"

    gen_docs = np.load(genDocsPath)
    dictionary = gensim.corpora.dictionary.Dictionary.load(dictPath)
    tf_idf = gensim.models.tfidfmodel.TfidfModel.load(tf_idf_path)

    corpus = corpora.MmCorpus(corpusPath)
    numOfDocs = len(corpus)
    embedding_size = 300

    id2embedding = np.load("smallerID2embedding.npy").item()

    # Need to process in batches as takes up a ton of memory
    step = 5000
    # Integer ceiling division. np.ceil(numOfDocs / step) floors first under
    # Python 2 integer division and silently drops the last partial batch.
    totalSteps = (numOfDocs + step - 1) // step

    for i in range(totalSteps):
        start = i * step
        # Clamp so the final batch does not index past the last document.
        end = min(start + step, numOfDocs)
        inputMatrix = np.zeros([end - start, maxWords, embedding_size])
        for docNum in range(start, end):
            print("On docNum " + str(docNum) + " of " + str(numOfDocs))
            # Extract the top N words
            _, wordVal = tf_idfTopWords(docNum, gen_docs, dictionary, tf_idf, maxWords)
            # Row inside this batch. The original reset this index to 0 for
            # every document, so each document overwrote row 0.
            docDex = docNum - start
            for wordDex, wordID in enumerate(wordVal):
                inputMatrix[docDex, wordDex, :] = id2embedding[wordID]

        # Save the batch of input data
        np.save(svName + "_%d.npy" % i, inputMatrix)


