Converting an array of text into vectors

Asked: 2017-08-01 04:12:37

Tags: python-3.x pandas tensorflow

I'm currently working on a sentiment analysis project with TensorFlow and TFLearn. I have a dataset of social media posts that was given to me in a CSV file, and I'm trying to turn the posts into vectors for training.

This is my first time trying to do this kind of preprocessing by hand; I usually import datasets that have already been preprocessed.

Here is the code I have tried so far, based on resources I found while searching on Google:

import pandas as pd
import numpy 
import tensorflow as tf
import tflearn
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb
import keras
from sklearn.feature_extraction.text import CountVectorizer
import pickle
import csv

highschool_posts = pd.read_csv('Depression-Posts-Sentiment.xlsx-Sheet1.csv', names=["Post"], header=None, skiprows=1)
highschool_posts
# with open()
# pickle.dump(highschool_posts,'highschool_posts.pkl')
posts = pd.DataFrame.as_matrix(highschool_posts)
print(posts.shape)
print(posts[0:2])
# print(posts)

vectorizer = CountVectorizer(input=highschool_posts,
                             analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             stop_words=None,
                             max_features=5000)
print(vectorizer)

When I print the results, this is what I get:

(14, 1)
[[ "You'd think i'd feel the worst when im alone but it gets worst when im around you"]
 [ 'I took too many drugs and I passed out and started foaming at the mouth. I wish that it would have killed me but it did not. Ugh ']]
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8',
        input=                                                                                                                                                                                                                                                                                                      ...hill day at school today! Wonderful day... one...  WOW WE GRADUATED TODAY FINALLY!! This isn't th...,
        lowercase=True, max_df=1.0, max_features=5000, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

Does anyone have a better idea of what I could try in order to get output that looks like this?:

([[17, 25, 10, 406, 26, 14, 56, 61, 62, 323, 4],
  [17,
   10,
   2,
   6409,
   25,
   325,
   7,
   3161,
   4,
   2588,
   1977,
   176,
   3,
   26,
   50,
   35,
   70,
   1050,
   395,
   4],
  [9846,
   8681,
   56,
   33,
   80,
   113,
   125,
   62,
   116,
   4397,
   5,
   17,
   10,
   70,
   284,
   4,
   277,
   2480,
   23,
   92,
   46,
   49,
   4358,
   1,
   31,
   78,
   116,
   4],
  [824,
   69,
   704,
   47,
   389,
   8,
   1373,
   4,
   100,
   365,
   1268,
   8,
   17,
   3524,
   27,
   4,
   573,
   2433,
   299,
   192,
   418,
   194,
   174,
   536,
   120,
   8,
   364,
   4],
  [39,
   7,
   69,
   519,
   154,
   10,
   42,
   2,
   468,
   65,
   5518,
   31,
   6,
   1924,
   1,
   1181,
   8,
   210,
   45,
   1,
   4942,
   87,
   15,
   78,
   6544,
   40,
   259,
   1,
   388,
   23,
   9420,
   40],
  [16,
   117,
   14,
   20,
   39,
   7,
   2,
   829,
   116,
   74,
   35,
   140,
   107,
   3,
   5,
   16,
   155,
   126,
   125,
   86,
   2,
   286,
   10,
   142,
   3,
   26,
   14,
   20,
   145,
   6,
   67,
   62,
   25,
   40],
  [16,
   21,
   585,
   1656,
   44,
   2,
   4640,
   3111,
   5,
   5022,
   7,
   2996,
   1,
   3,
   15,
   17,
   1291,
   1025,
   3,
   8772,
   5,
   2002,
   1433,
   27,
   4,
   1292,
   30,
   48,
   88,
   3,
   1,
   3268,
   1,
   7315,
   4],
  [9435,
   2605,
   20,
   251,
   10,
   59,
   1388,
   58,
   15,
   69,
   358,
   164,
   17,
   27,
   4,
   66,
   1,
   58,
   2,
   7886,
   1168,
   7,
   2,
   27,
   24,
   45,
   1,
   1697,
   4,
   16,
   71,
   49,
   8,
   79,
   64,
   7,
   52,
   4],
  [16,
   133,
   41,
   36,
   179,
   26,
   17,
   282,
   49,
   176,
   403,
   801,
   403,
   17,
   282,
   64,
   49,
   6,
   19,
   604,
   1880,
   1,
   19,
   4,
   16,
   117,
   92,
   49,
   14,
   75,
   99,
   19,
   14,
   20,
   47,
   604,
   1880,
   19,
   4],
  [17,
   25,
   935,
   1270,
   1,
   1120,
   15,
   6,
   3538,
   1328,
   4,
   723,
   18,
   3,
   14,
   935,
   6,
   378,
   96,
   242,
   24,
   760,
   3,
   2231,
   431,
   4,
   18,
   14,
   204,
   22,
   125,
   1115,
   6119,
   3151,
   61,
   92,
   213,
   38,
   7467,
   4],
  [17,
   56,
   45,
   236,
   3,
   3016,
   651,
   1,
   1038,
   131,
   3,
   26,
   14,
   20,
   1050,
   24,
   351,
   474,
   5,
   1,
   330,
   323,
   4,
   1,
   3374,
   6889,
   10,
   2282,
   3,
   480,
   2,
   588,
   7,
   6,
   865,
   76,
   5,
   474,
   4419,
   24,
   52,
   687,
   4],
  [6,
   1525,
   7,
   5172,
   544,
   1901,
   24,
   6,
   1,
   1611,
   771,
   4,
   2,
   952,
   10,
   960,
   397,
   5,
   371,
   3,
   127,
   54,
   2,
   889,
   23,
   2146,
   1112,
   3304,
   4,
   16,
   63,
   32,
   278,
   17,
   21,
   644,
   15,
   2208,
   4,
   1,
   17,
   25,
   4],
  [5,
   16,
   77,
   399,
   18,
   4,
   16,
   1052,
   14,
   247,
   328,
   31,
   1,
   3,
   5,
   16,
   21,
   33,
   1006,
   14,
   8,
   38,
   48,
   62,
   4,
   17,
   10,
   167,
   39,
   7,
   69,
   2628,
   4,
   16,
   237,
   761,
   18,
   14,
   56,
   6,
   460,
   810,
   4],
  [17,
   27,
   1,
   5,
   1,
   42,
   6,
   414,
   25,
   749,
   15,
   522,
   3,
   5,
   2,
   1,
   6344,
   204,
   83,
   8,
   79,
   14,
   4,
   6,
   230,
   55,
   6418,
   92,
   87,
   2,
   869,
   571,
   933,
   772,
   7,
   92,
   260,
   47,
   6,
   180,
   5,
   9536,
   27,
   4],
  [101,
   39,
   21,
   88,
   142,
   3,
   16,
   80,
   515,
   14,
   6,
   189,
   4,
   17,
   39,
   56,
   33,
   68,
   1136,
   6,
   1801,
   4,
   2,
   336,
   21,
   791,
   102,
   966,
   206,
   63,
   425,
   257,
   804,
   3,
   26,
   15,
   2933,
   8,
   2,
   118,
   33,
   2,
   8998,
   4],
  [1,
   1921,
   4,
   17,
   56,
   8,
   38,
   39,
   7,
   2,
   262,
   116,
   16,
   37,
   140,
   126,
   4,
   2,
   431,
   10,
   7991,
   3,
   2,
   209,
   323,
   35,
   1307,
   4,
   2,
   75,
   169,
   18,
   824,
   83,
   164,
   21,
   2,
   1,
   5,
   494,
   2159,
   489,
   1,
   4],
  [49,
   174,
   2115,
   315,
   3,
   17,
   25,
   10,
   33,
   6,
   1410,
   2602,
   3,
   26,
   14,
   2009,
   24,
   2460,
   6057,
   4,
   6,
   67,
   67,
   8404,
   25,
   18,
   139,
   33,
   1,
   23,
   6,
   564,
   4,
   61,
   67,
   62,
   129,
   5,
   61,
   398,
   5874,
   22,
   89,
   4],
  [1212,
   89,
   107,
   130,
   25,
   4,
   370,
   15,
   1965,
   42,
   55,
   1147,
   246,
   4,
   23,
   6,
   503,
   446,
   25,
   3,
   67,
   89,
   107,
   4,
   131,
   10,
   615,
   8,
   8263,
   4,
   195,
   10,
   333,
   277,
   2730,
   1165,
   1,
   22,
   2,
   1000,
   24,
   1,
   20,
   6906,
   4],
  [6,
   180,
   230,
   47,
   468,
   8,
   147,
   40,
   50,
   35,
   464,
   654,
   154,
   26,
   14,
   20,
   95,
   4849,
   24,
   125,
   1298,
   1700,
   395,
   74,
   10,
   59,
   107,
   2,
   25,
   23,
   83,
   4,
   1521,
   76,
   381,
   24,
   6,
   67,
   1037,
   195,
   4,
   16,
   1539,
   515,
   14,
   40],
  [17,
   25,
   10,
   48,
   96,
   3,
   14,
   63,
   75,
   38,
   1080,
   8,
   2,
   3932,
   262,
   19,
   230,
   19,
   84,
   572,
   1779,
   833,
   4,
   70,
   928,
   482,
   2,
   25,
   4,
   98,
   159,
   2618,
   3,
   245,
   77,
   4,
   54,
   104,
   32,
   456,
   143,
   73,
   31,
   17,
   1246,
   4],
  [17,
   10,
   39,
   7,
   2,
   262,
   116,
   16,
   37,
   140,
   126,
   40,
   16,
   232,
   14,
   42,
   2,
   5829,
   27,
   1392,
   5,
   494,
   2562,
   3091,
   69,
   73,
   4,
   349,
   4014,
   24,
   70,
   1640,
   2397,
   1678,
   4,
   12,
   13,
   9,
   11,
   12,
   13,
   9,
   11,
   6964,
   793,
   4],
  [489,
   10,
   625,
   4,
   17,
   27,
   1343,
   61,
   7,
   2,
   134,
   544,
   1848,
   56,
   140,
   243,
   4,
   3685,
   1646,
   10,
   1048,
   1256,
   4,
   2,
   3177,
   35,
   398,
   3,
   26,
   22,
   223,
   3,
   2,
   306,
   1486,
   10,
   9125,
   324,
   4,
   34,
   10,
   2,
   134,
   3,
   964,
   206,
   4],
  [436,
   2826,
   193,
   1,
   67,
   88,
   4,
   129,
   21,
   67,
   62,
   4,
   76,
   80,
   6,
   957,
   5,
   236,
   1,
   4,
   2,
   3784,
   7,
   578,
   5,
   4526,
   410,
   21,
   2382,
   4,
   118,
   82,
   67,
   1088,
   5,
   430,
   49,
   30,
   109,
   401,
   78,
   1403,
   4,
   67,
   746,
   25,
   4],
  [6,
   1048,
   653,
   1,
   27,
   4,
   6449,
   3,
   322,
   5,
   492,
   101,
   27,
   7,
   2,
   172,
   4,
   4357,
   4921,
   31,
   2,
   1,
   3480,
   4,
   61,
   92,
   56,
   1080,
   2,
   1,
   7,
   2,
   27,
   24,
   1,
   20,
   122,
   86,
   89,
   3,
   18,
   1588,
   32,
   38,
   6,
   459,
   86],
  [1204,
   23,
   127,
   366,
   15,
   2,
   1244,
   7,
   106,
   956,
   428,
   5,
   406,
   129,
   3,
   17,
   27,
   158,
   128,
   38,
   309,
   896,
   30,
   35,
   307,
   23,
   6,
   62,
   4301,
   23,
   143,
   6790,
   4,
   16,
   63,
   32,
   838,
   266,
   178,
   546,
   17,
   21,
   6,
   19,
   62,
   25,
   4,
   19]

Any help is greatly appreciated. Thanks!

1 Answer:

Answer 0 (score: 0)

Where did you get this code? Using the input parameter that way is unusual: in CountVectorizer, input only describes the kind of input to expect ('content', 'file', or 'filename'); it does not hold the data itself. Also, vectorizer here is the object that performs the vectorization, not the vectorized text itself. The usual scikit-learn idiom is to call the fit and transform (or fit_transform) methods:

vectorizer = CountVectorizer()
# Learn the vocabulary from the column of posts (pass the strings themselves, not the whole DataFrame).
vectorizer.fit(highschool_posts["Post"])
# Map each post to a bag-of-words count vector.
vectorized_text = vectorizer.transform(highschool_posts["Post"])
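
For context, a minimal end-to-end sketch (reusing the file name and the "Post" column from your question; the variable names here are only illustrative) might look like this:

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# File name and column name taken from the question above.
highschool_posts = pd.read_csv('Depression-Posts-Sentiment.xlsx-Sheet1.csv',
                               names=["Post"], header=None, skiprows=1)

vectorizer = CountVectorizer(max_features=5000)
# fit_transform learns the vocabulary and maps each post to a sparse
# bag-of-words count vector in one step.
counts = vectorizer.fit_transform(highschool_posts["Post"])

print(counts.shape)            # (number of posts, vocabulary size)
print(vectorizer.vocabulary_)  # token -> column index mapping
print(counts.toarray()[:2])    # dense count vectors for the first two posts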

See the official tutorial at http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html for a more complete introduction.
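
Also note that the target output you pasted (nested lists of integers, like the IMDB dataset bundled with tflearn) is not what CountVectorizer produces: those are sequences of word indices per post, not bag-of-words count vectors. If that format is what you actually need, one possible sketch, using keras.preprocessing.text.Tokenizer since you already import keras (the "Post" column and the 5000-word limit are taken from your code), would be:

from keras.preprocessing.text import Tokenizer
from tflearn.data_utils import pad_sequences

tokenizer = Tokenizer(num_words=5000)             # keep only the 5000 most frequent words
tokenizer.fit_on_texts(highschool_posts["Post"])

# Each post becomes a list of integer word indices, as in the IMDB-style output above.
sequences = tokenizer.texts_to_sequences(highschool_posts["Post"])
print(sequences[:2])

# Pad/truncate to a fixed length so the sequences can be fed to a tflearn model.
padded = pad_sequences(sequences, maxlen=100, value=0.)

The padded integer sequences can then go into an embedding layer, with the labels converted via to_categorical, which matches the usual tflearn IMDB workflow.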