我正在尝试使用Twitter的数据集创建文本分类器,以将文本分类为两类:激进或正常。
我的分类器目前的准确率为66%,但由于数据集非常庞大,运行起来确实很慢。
任何建议如何使它更快?
数据集可以在这里看到:
https://dataturks.com/projects/abhishek.narayanan/Dataset%20for%20Detection%20of%20Cyber-Trolls
我的主要代码:
'''
The dataset basically contains vectors with two components.
The first component is basically the String to analyse.
The Second component is the integer representing the sort of text
1 ----> Aggressive Text
0 ----> Normal (non-aggressive) Text
'''
import random
from collections import Counter

import Preprocessdata as Dataset
def Dataset_by_class(Dataset):
    """Group the texts in *Dataset* by their class label.

    Each instance is a sequence whose first element is the text and whose
    last element is the class label (0 or 1).  Returns a dict mapping each
    label to the list of texts carrying that label, preserving input order.
    """
    grouped = {}
    for record in Dataset:
        label = record[-1]
        # setdefault creates the bucket on first sight of a label.
        grouped.setdefault(label, []).append(record[0])
    return grouped
def String_by_class(Dataset):
    """Flatten each class's list of strings into one list of words.

    *Dataset* maps a class label to a list of strings (as produced by
    Dataset_by_class).  Returns a dict mapping each label to the
    whitespace-split words of all its strings, concatenated in order.
    """
    return {
        label: [word for text in texts for word in text.split()]
        for label, texts in Dataset.items()
    }
def PreProcess_TestSet(Dataset):
    """Split (text, label) pairs into a parallel [texts, labels] pair of lists.

    Returns a two-element list: [list of texts, list of labels], each in the
    same order as *Dataset*.  An empty dataset yields [[], []].
    """
    texts = [pair[0] for pair in Dataset]
    labels = [pair[1] for pair in Dataset]
    return [texts, labels]
def Laplace_Smoothening(Item, List, Alpha, Unique_Count, index):
    """Naive-Bayes likelihood of the words in *List* under class *index*.

    Item         -- list of all training words for this class
    List         -- words of the test instance being scored
    Alpha        -- additive (Laplace) smoothing constant
    Unique_Count -- dict: class label -> number of distinct training words
    index        -- class label being scored

    Returns the product over List of
        (count of word in Item + Alpha) / (len(Item) + Alpha * Unique_Count[index]).

    FIX (performance): the original called list.count() for every test word,
    an O(len(Item)) scan per word — quadratic over a large corpus and the
    main reason the classifier was slow.  Counting Item once with
    collections.Counter makes each lookup O(1); the denominator is
    loop-invariant and hoisted.  The numeric result is unchanged.
    """
    word_freq = Counter(Item)  # one pass over the class corpus
    denominator = len(Item) + Alpha * Unique_Count[index]
    Prob = 1
    for String in List:
        Prob = Prob * ((word_freq[String] + Alpha) / denominator)
    return Prob
def Predict(Dataset, TestSet):
    """Predict a 0/1 label for every string in TestSet with naive Bayes.

    Dataset -- dict: class label (0 or 1) -> list of training words
               (as produced by String_by_class)
    TestSet -- list of raw test strings

    Uses Laplace smoothing with Alpha = 1.  Returns the list of predicted
    labels, one per test instance, in input order.
    """
    # Distinct-word count per class: loop-invariant, computed once up front.
    Unique_Count = {label: len(set(words)) for label, words in Dataset.items()}
    Alpha = 1
    Predicted_Label = []
    for Test_Instance in TestSet:
        words = Test_Instance.split()
        # FIX: build a fresh score dict per instance.  The original reused one
        # dict across all iterations, which only worked because both keys were
        # always overwritten — a latent bug if a class were ever missing.
        Prob_by_class = {
            label: Laplace_Smoothening(train_words, words, Alpha, Unique_Count, label)
            for label, train_words in Dataset.items()
        }
        # Ties go to class 1, matching the original strict '>' comparison.
        Predicted_Label.append(0 if Prob_by_class[0] > Prob_by_class[1] else 1)
    return Predicted_Label
def Calculate_Accuracy(Prediction, Labels):
    """Return the percentage of positions where Prediction matches Labels."""
    matches = sum(1 for guessed, actual in zip(Prediction, Labels) if guessed == actual)
    return (matches / len(Labels)) * 100
# ---- Driver script ----
# This function automatically creates the dataset from JSON and is imported.
Data_Set = Dataset.PreProcess_Dataset('Dataset for Detection of Cyber-Trolls.json')
SplitRatio = 0.70
# Split the dataset into train/test according to the split ratio.
# FIX: Train_Test_Dataset was called unqualified but is not defined anywhere
# in this file, which raises NameError at runtime; it presumably lives in the
# imported Preprocessdata module -- TODO confirm against that module.
TrainSet, TestSet = Dataset.Train_Test_Dataset(Data_Set, SplitRatio)
DataSet_By_Class = Dataset_by_class(TrainSet)
String_By_Class = String_by_class(DataSet_By_Class)
TestSet, Labels = PreProcess_TestSet(TestSet)
Prediction = Predict(String_By_Class, TestSet)
Accuracy = Calculate_Accuracy(Prediction, Labels)
print("The Accuracy is {:.2f}".format(Accuracy))
结果很好,但是非常慢。