我正在进行文本分类任务(共7000个文本,均匀分布在10个标签上)。我将字符(character)、单词(word)和词性(POS)设置为列(column)名称,并将每个文本的所有字符、单词和POS标签存储到Sqlite3
数据库中。我想要做的是将字符2-gram、单词2-gram和POS 2-gram组合成SVM
的训练向量。我的代码如下:
def dbConnect(db_name, sql):
    """Run one SQL statement against a SQLite database and return all rows.

    Args:
        db_name: Database path handed to ``sqlite3.connect``.
        sql: SQL statement to execute.

    Returns:
        The list of row tuples returned by ``cursor.fetchall()``.
    """
    conn = sqlite3.connect(db_name)
    try:
        cursor = conn.cursor()
        cursor.execute(sql)
        return cursor.fetchall()
    finally:
        # Always release the connection, even when the query raises;
        # the original never closed it.
        conn.close()
def createtrainingVector(item,n,count1,res1):
    """Build the n-gram feature list used as an SVM training vector.

    As described by the author: every item (character/word/POS n-gram)
    that occurs at least ``count1`` times in the text is appended to the
    training vector.

    NOTE(review): the body of this function is not shown in this excerpt.
    ``featureComb`` concatenates two return values with ``+`` and sorts the
    result, so the return value is presumably a list of sortable n-gram
    keys -- confirm against the real implementation.
    """
def featureComb(item, count1, res1):
    """Combine the 1-gram and 2-gram features of one feature type.

    Args:
        item: Feature type; one of ``"word"``, ``"character"`` or ``"pos"``.
        count1: Minimum occurrence count, forwarded to
            ``createtrainingVector``.
        res1: Database rows, forwarded to ``createtrainingVector``.

    Returns:
        The sorted concatenation of the 1-gram and 2-gram training vectors.

    Raises:
        ValueError: If ``item`` is not a supported feature type. The
            original fell through to an unbound local at the ``return``.
    """
    if item not in ("word", "character", "pos"):
        raise ValueError("unsupported feature type: %r" % (item,))
    # All three feature types are built by the same two calls, so the
    # per-type branches of the original are collapsed.
    train_vector1 = createtrainingVector(item, 1, count1, res1)
    train_vector2 = createtrainingVector(item, 2, count1, res1)
    return sorted(train_vector1 + train_vector2)
def svmParaComb(item, train_vector, label_vector, X, y, res1):
    """Fill ``X`` and ``y`` with SVM inputs built from database rows.

    For every row in ``res1`` the label ``label_vector[row[0]]`` is appended
    to ``y`` and a binary presence vector over ``train_vector`` is appended
    to ``X``. An entry is 1 when that n-gram occurs among the row's 1-grams
    or 2-grams.

    Args:
        item: Feature type; one of ``"word"``, ``"pos"`` or ``"character"``.
        train_vector: Ordered n-gram vocabulary defining the columns of ``X``.
        label_vector: Mapping from the label stored in a row to the target
            value appended to ``y``.
        X: List extended in place with one feature row per database row.
        y: List extended in place with one target per database row.
        res1: Iterable of ``(label, data)`` rows.

    Returns:
        None; ``X`` and ``y`` are mutated in place.

    Raises:
        ValueError: If ``item`` is not a supported feature type.
    """
    if item not in ("word", "pos", "character"):
        raise ValueError("unsupported feature type: %r" % (item,))

    for res in res1:
        y.append(label_vector[res[0]])
        # The original tested an undefined name `lexical` in the last two
        # branches; the selector is the `item` parameter.
        if item == "word":
            grams1 = wordNgrams(res[1], 1)
            grams2 = wordNgrams(res[1], 2)
        elif item == "pos":
            # POS data is stored as JSON text, so decode it once per row.
            # NOTE(review): the POS branch calls lemmaNgrams, as in the
            # original -- confirm that is the intended extractor.
            tags = json.loads(res[1])
            grams1 = lemmaNgrams(tags, 1)
            grams2 = lemmaNgrams(tags, 2)
        else:
            grams1 = characterNgrams(res[1], 1)
            grams2 = characterNgrams(res[1], 2)
        # Test both collections explicitly: `gram in (a or b)` only looks at
        # `b` when `a` is empty, so 2-grams were effectively never matched.
        X.append([1 if (gram in grams1 or gram in grams2) else 0
                  for gram in train_vector])
db = 'text.db'

# Queries that read (label, feature column) pairs from the training table,
# keyed by feature type.
sql_dict1 = {
    "word": "select label,word from main.training_data",
    "pos": "select label,pos from main.training_data",
    "character": "select label,character from main.training_data",
}
# The same queries against the test table.
sql_dict2 = {
    "word": "select label,word from main.test_data",
    "pos": "select label,pos from main.test_data",
    "character": "select label,character from main.test_data",
}

# NOTE(review): `i`, `item` and `label_vector` are not defined in this
# excerpt; they are assumed to be set earlier in the file.
# `i` selects the feature column: 1 = word, 2 = character, 3 = pos.
if i == 1:
    sql1 = sql_dict1["word"]
    sql2 = sql_dict2["word"]
elif i == 2:
    sql1 = sql_dict1["character"]
    sql2 = sql_dict2["character"]
elif i == 3:
    sql1 = sql_dict1["pos"]
    sql2 = sql_dict2["pos"]

# Build the training matrix and fit the classifier.
res1 = dbConnect(db, sql1)
X = []
y = []
train_vector_combine = featureComb(item, 2, res1)
svmParaComb(item, train_vector_combine, label_vector, X, y, res1)
clf1 = svm.LinearSVC()
clf1.fit(X, y)

# Parameters used for predicting.
res2 = dbConnect(db, sql2)
X_test = []
y_true = []
# The test rows must be encoded against the vocabulary the classifier was
# trained on; the original passed an undefined name `train_vector` here.
# NOTE(review): svmCombinePredict is not defined in this excerpt -- confirm
# it exists, or whether svmParaComb was meant.
svmCombinePredict(item, train_vector_combine, label_vector, X_test, y_true, res2)
# NOTE(review): the predictions are discarded; score() below predicts again.
clf1.predict(X_test)
# Single-argument print(...) prints the same on Python 2 and Python 3.
print(clf1.score(X_test, y_true))
但是,它只能组合相同类型的特征。我不知道如何处理不同类型特征的组合,因为res1只代表一列(word / character / POS 之一)。因此,如果我想将字符2-gram、单词2-gram和POS 2-gram组合成训练向量,我该怎么办?有人能帮我修改一下代码吗?