我遇到错误,完全卡住了。我正在使用的数据集可以在此处下载:https://github.com/faizann24/Using-machine-learning-to-detect-malicious-URLs/blob/master/data/data.csv
此代码的目标是预测URL是否是网络钓鱼尝试或合法的。我想生成一个ROC曲线,将不同模型的有效性相互比较。我认为我的向量有问题,但无法找出解决方法。任何帮助或投入都会很棒。谢谢
from __future__ import print_function
import matplotlib
matplotlib.use("TkAgg")
from matplotlib import pyplot as plt
import pandas as pd
from sklearn import preprocessing
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import warnings
warnings.filterwarnings("ignore", category=(FutureWarning))
import numpy as np
from sklearn.model_selection import train_test_split
from scipy import interp
import matplotlib.pyplot as plt
from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
"""Tokenizer"""
def create_tokens(f):
slashTokens = str(f.encode('utf-8')).split('/') # splits URL by slash and gets tokens
totalTokens = []
for i in slashTokens:
tokens = str(i).split('-') # splits URL by dashes and gets tokens
dotTokens = []
for j in range(0, len(tokens)):
temp = str(tokens[j]).split('.') # splits url by dots and gets tokens
dotTokens = dotTokens + temp
totalTokens = totalTokens + tokens + dotTokens
totalTokens = list(set(totalTokens)) # remove duplicates
if 'com' in totalTokens:
totalTokens.remove('com') # pretty standard in a URL, will not be included in feature set
return totalTokens
def runClassifiers(X_train, X_test, y_test, y_train):
y_train = np.array(y_train)
y_test = np.array(y_test)
X_train = np.array(X_train)
X_test = np.array(X_test)
rbf_svc = SVC(kernel='rbf', gamma=0.00001, C=1000, probability=True).fit(X_train, y_train)
predictedprobSVC = rbf_svc.predict_proba(X_test)
fpr, tpr, thresholds = roc_curve(y_test, predictedprobSVC[:, 1], pos_label=2)
roc_auc = auc(fpr, tpr)
plt.figure()
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.title('SVM Classifier ROC')
plt.plot(fpr, tpr, color='blue', lw=2, label='SVM ROC area = %0.2f)' % roc_auc)
plt.legend(loc="lower right")
plt.show()
def main():
load_data = '/Users/nickalonso/Documents/urldata.csv'
csv = pd.read_csv(load_data)
csv = pd.DataFrame(csv)
csv = np.array(csv)
y = [d[1] for d in csv]
corpus = [d[0] for d in csv]
vectorizer = TfidfVectorizer(tokenizer=create_tokens)
X = vectorizer.fit_transform(corpus) # get X vector
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
runClassifiers(X_train, X_test, y_test, y_train)
if __name__ == '__main__':
main()
跟踪是:
/Users/nickalonso/.conda/envs/390Project/bin/python "/Users/nickalonso/PycharmProjects/390Project/Phishing Net.py"
/Users/nickalonso/.conda/envs/390Project/lib/python3.6/site-packages/sklearn/externals/joblib/externals/cloudpickle/cloudpickle.py:47: DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses
import imp
Traceback (most recent call last):
File "/Users/nickalonso/PycharmProjects/390Project/Phishing Net.py", line 78, in <module>
main()
File "/Users/nickalonso/PycharmProjects/390Project/Phishing Net.py", line 75, in main
runClassifiers(X_train, X_test, y_test, y_train)
File "/Users/nickalonso/PycharmProjects/390Project/Phishing Net.py", line 47, in runClassifiers
rbf_svc = SVC(kernel='rbf', gamma=0.00001, C=1000, probability=True).fit(X_train, y_train)
File "/Users/nickalonso/.conda/envs/390Project/lib/python3.6/site-packages/sklearn/svm/base.py", line 149, in fit
accept_large_sparse=False)
File "/Users/nickalonso/.conda/envs/390Project/lib/python3.6/site-packages/sklearn/utils/validation.py", line 747, in check_X_y
estimator=estimator)
File "/Users/nickalonso/.conda/envs/390Project/lib/python3.6/site-packages/sklearn/utils/validation.py", line 522, in check_array
array = np.asarray(array, dtype=dtype, order=order)
File "/Users/nickalonso/.conda/envs/390Project/lib/python3.6/site-packages/numpy/core/numeric.py", line 501, in asarray
return array(a, dtype, copy=False, order=order)
ValueError: setting an array element with a sequence.
Process finished with exit code 1