I am currently working on a sentiment analysis project using tensorflow and tflearn. I have a dataset of social media posts that was given to me as a CSV file, and I am trying to turn the posts into vectors for training.
This is my first time attempting this kind of preprocessing by hand; I usually import datasets that have already been preprocessed.
So far, here is the code I have tried, pieced together from resources I found while searching Google:
import pandas as pd
import numpy
import tensorflow as tf
import tflearn
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb
import keras
from sklearn.feature_extraction.text import CountVectorizer
import pickle
import csv
highschool_posts = pd.read_csv('Depression-Posts-Sentiment.xlsx-Sheet1.csv',names=["Post"],header=None,skiprows = 1)
highschool_posts
# with open()
# pickle.dump(highschool_posts,'highschool_posts.pkl')
posts = pd.DataFrame.as_matrix(highschool_posts)
print(posts.shape)
print(posts[0:2])
# print(posts)
vectorizer = CountVectorizer(input=highschool_posts,
                             analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             stop_words=None,
                             max_features=5000)
print(vectorizer)
When I print the results, this is what I get:
(14, 1)
[[ "You'd think i'd feel the worst when im alone but it gets worst when im around you"]
[ 'I took too many drugs and I passed out and started foaming at the mouth. I wish that it would have killed me but it did not. Ugh ']]
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
dtype=<class 'numpy.int64'>, encoding='utf-8',
input= ...hill day at school today! Wonderful day... one... WOW WE GRADUATED TODAY FINALLY!! This isn't th...,
lowercase=True, max_df=1.0, max_features=5000, min_df=1,
ngram_range=(1, 1), preprocessor=None, stop_words=None,
strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
tokenizer=None, vocabulary=None)
Does anyone have a better idea of what I could try in order to get output that looks like this?:
([[17, 25, 10, 406, 26, 14, 56, 61, 62, 323, 4],
[17, 10, 2, 6409, 25, 325, 7, 3161, 4, 2588, 1977, 176, 3, 26, 50, 35, 70, 1050, 395, 4],
[9846, 8681, 56, 33, 80, 113, 125, 62, 116, 4397, 5, 17, 10, 70, 284, 4, 277, 2480, 23, 92, 46, 49, 4358, 1, 31, 78, 116, 4],
[824, 69, 704, 47, 389, 8, 1373, 4, 100, 365, 1268, 8, 17, 3524, 27, 4, 573, 2433, 299, 192, 418, 194, 174, 536, 120, 8, 364, 4],
[39, 7, 69, 519, 154, 10, 42, 2, 468, 65, 5518, 31, 6, 1924, 1, 1181, 8, 210, 45, 1, 4942, 87, 15, 78, 6544, 40, 259, 1, 388, 23, 9420, 40],
[16, 117, 14, 20, 39, 7, 2, 829, 116, 74, 35, 140, 107, 3, 5, 16, 155, 126, 125, 86, 2, 286, 10, 142, 3, 26, 14, 20, 145, 6, 67, 62, 25, 40],
[16, 21, 585, 1656, 44, 2, 4640, 3111, 5, 5022, 7, 2996, 1, 3, 15, 17, 1291, 1025, 3, 8772, 5, 2002, 1433, 27, 4, 1292, 30, 48, 88, 3, 1, 3268, 1, 7315, 4],
[9435, 2605, 20, 251, 10, 59, 1388, 58, 15, 69, 358, 164, 17, 27, 4, 66, 1, 58, 2, 7886, 1168, 7, 2, 27, 24, 45, 1, 1697, 4, 16, 71, 49, 8, 79, 64, 7, 52, 4],
[16, 133, 41, 36, 179, 26, 17, 282, 49, 176, 403, 801, 403, 17, 282, 64, 49, 6, 19, 604, 1880, 1, 19, 4, 16, 117, 92, 49, 14, 75, 99, 19, 14, 20, 47, 604, 1880, 19, 4],
[17, 25, 935, 1270, 1, 1120, 15, 6, 3538, 1328, 4, 723, 18, 3, 14, 935, 6, 378, 96, 242, 24, 760, 3, 2231, 431, 4, 18, 14, 204, 22, 125, 1115, 6119, 3151, 61, 92, 213, 38, 7467, 4],
[17, 56, 45, 236, 3, 3016, 651, 1, 1038, 131, 3, 26, 14, 20, 1050, 24, 351, 474, 5, 1, 330, 323, 4, 1, 3374, 6889, 10, 2282, 3, 480, 2, 588, 7, 6, 865, 76, 5, 474, 4419, 24, 52, 687, 4],
[6, 1525, 7, 5172, 544, 1901, 24, 6, 1, 1611, 771, 4, 2, 952, 10, 960, 397, 5, 371, 3, 127, 54, 2, 889, 23, 2146, 1112, 3304, 4, 16, 63, 32, 278, 17, 21, 644, 15, 2208, 4, 1, 17, 25, 4],
[5, 16, 77, 399, 18, 4, 16, 1052, 14, 247, 328, 31, 1, 3, 5, 16, 21, 33, 1006, 14, 8, 38, 48, 62, 4, 17, 10, 167, 39, 7, 69, 2628, 4, 16, 237, 761, 18, 14, 56, 6, 460, 810, 4],
[17, 27, 1, 5, 1, 42, 6, 414, 25, 749, 15, 522, 3, 5, 2, 1, 6344, 204, 83, 8, 79, 14, 4, 6, 230, 55, 6418, 92, 87, 2, 869, 571, 933, 772, 7, 92, 260, 47, 6, 180, 5, 9536, 27, 4],
[101, 39, 21, 88, 142, 3, 16, 80, 515, 14, 6, 189, 4, 17, 39, 56, 33, 68, 1136, 6, 1801, 4, 2, 336, 21, 791, 102, 966, 206, 63, 425, 257, 804, 3, 26, 15, 2933, 8, 2, 118, 33, 2, 8998, 4],
[1, 1921, 4, 17, 56, 8, 38, 39, 7, 2, 262, 116, 16, 37, 140, 126, 4, 2, 431, 10, 7991, 3, 2, 209, 323, 35, 1307, 4, 2, 75, 169, 18, 824, 83, 164, 21, 2, 1, 5, 494, 2159, 489, 1, 4],
[49, 174, 2115, 315, 3, 17, 25, 10, 33, 6, 1410, 2602, 3, 26, 14, 2009, 24, 2460, 6057, 4, 6, 67, 67, 8404, 25, 18, 139, 33, 1, 23, 6, 564, 4, 61, 67, 62, 129, 5, 61, 398, 5874, 22, 89, 4],
[1212, 89, 107, 130, 25, 4, 370, 15, 1965, 42, 55, 1147, 246, 4, 23, 6, 503, 446, 25, 3, 67, 89, 107, 4, 131, 10, 615, 8, 8263, 4, 195, 10, 333, 277, 2730, 1165, 1, 22, 2, 1000, 24, 1, 20, 6906, 4],
[6, 180, 230, 47, 468, 8, 147, 40, 50, 35, 464, 654, 154, 26, 14, 20, 95, 4849, 24, 125, 1298, 1700, 395, 74, 10, 59, 107, 2, 25, 23, 83, 4, 1521, 76, 381, 24, 6, 67, 1037, 195, 4, 16, 1539, 515, 14, 40],
[17, 25, 10, 48, 96, 3, 14, 63, 75, 38, 1080, 8, 2, 3932, 262, 19, 230, 19, 84, 572, 1779, 833, 4, 70, 928, 482, 2, 25, 4, 98, 159, 2618, 3, 245, 77, 4, 54, 104, 32, 456, 143, 73, 31, 17, 1246, 4],
[17, 10, 39, 7, 2, 262, 116, 16, 37, 140, 126, 40, 16, 232, 14, 42, 2, 5829, 27, 1392, 5, 494, 2562, 3091, 69, 73, 4, 349, 4014, 24, 70, 1640, 2397, 1678, 4, 12, 13, 9, 11, 12, 13, 9, 11, 6964, 793, 4],
[489, 10, 625, 4, 17, 27, 1343, 61, 7, 2, 134, 544, 1848, 56, 140, 243, 4, 3685, 1646, 10, 1048, 1256, 4, 2, 3177, 35, 398, 3, 26, 22, 223, 3, 2, 306, 1486, 10, 9125, 324, 4, 34, 10, 2, 134, 3, 964, 206, 4],
[436, 2826, 193, 1, 67, 88, 4, 129, 21, 67, 62, 4, 76, 80, 6, 957, 5, 236, 1, 4, 2, 3784, 7, 578, 5, 4526, 410, 21, 2382, 4, 118, 82, 67, 1088, 5, 430, 49, 30, 109, 401, 78, 1403, 4, 67, 746, 25, 4],
[6, 1048, 653, 1, 27, 4, 6449, 3, 322, 5, 492, 101, 27, 7, 2, 172, 4, 4357, 4921, 31, 2, 1, 3480, 4, 61, 92, 56, 1080, 2, 1, 7, 2, 27, 24, 1, 20, 122, 86, 89, 3, 18, 1588, 32, 38, 6, 459, 86],
[1204, 23, 127, 366, 15, 2, 1244, 7, 106, 956, 428, 5, 406, 129, 3, 17, 27, 158, 128, 38, 309, 896, 30, 35, 307, 23, 6, 62, 4301, 23, 143, 6790, 4, 16, 63, 32, 838, 266, 178, 546, 17, 21, 6, 19, 62, 25, 4, 19]
Any help is greatly appreciated. Thanks!
Answer 0 (score: 0)
Where did you get this code? Passing your data through the input parameter like that is unusual. The vectorizer here is the object that performs the vectorization, not the vectorized text itself. The usual scikit-learn idiom is to use the fit and transform (or fit_transform) methods:
vectorizer = CountVectorizer()
# Fit on the text column itself; iterating over the DataFrame would yield column names, not posts
vectorizer.fit(highschool_posts["Post"])
# Returns a sparse document-term matrix of token counts
vectorized_text = vectorizer.transform(highschool_posts["Post"])
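If what you actually want is the IMDB-style output shown in the question (each post as a list of integer word indices), one possible route is to reuse the fitted vectorizer's vocabulary. The sketch below is only a suggestion, not the only way to do it: it relies on CountVectorizer's vocabulary_ mapping and its build_analyzer() tokenizer, and it assumes your text sits in the "Post" column, so the exact index values will differ from the IMDB numbers above.

# Hedged sketch: map each post to a sequence of vocabulary indices
analyzer = vectorizer.build_analyzer()   # same tokenization/preprocessing the vectorizer uses
vocab = vectorizer.vocabulary_           # dict of {token: integer index}
sequences = [[vocab[tok] for tok in analyzer(post) if tok in vocab]
             for post in highschool_posts["Post"]]
print(sequences[0:2])                    # two lists of word indices, one per post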
See the official tutorial at http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html for a more thorough introduction.
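One last note: since the question already imports pad_sequences from tflearn.data_utils, here is a hedged follow-up sketch of how such index sequences could be padded to a fixed length before training; the maxlen value of 100 is an arbitrary placeholder, not something taken from the original post.

from tflearn.data_utils import pad_sequences

# Pad/truncate every sequence to the same length so it can be fed to a network
padded = pad_sequences(sequences, maxlen=100, value=0.)
print(padded.shape)   # (number_of_posts, 100)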