给我的向量:ValueError:使用序列设置数组元素

时间:2018-11-28 19:41:39

标签: python

我遇到错误,完全卡住了。我正在使用的数据集可以在此处下载:https://github.com/faizann24/Using-machine-learning-to-detect-malicious-URLs/blob/master/data/data.csv

此代码的目标是预测URL是否是网络钓鱼尝试或合法的。我想生成一个ROC曲线,将不同模型的有效性相互比较。我认为我的向量有问题,但无法找出解决方法。任何帮助或投入都会很棒。谢谢

from __future__ import print_function
import matplotlib
matplotlib.use("TkAgg")
from matplotlib import pyplot as plt
import pandas as pd
from sklearn import preprocessing
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import warnings
warnings.filterwarnings("ignore", category=(FutureWarning))
import numpy as np
from sklearn.model_selection import train_test_split
from scipy import interp
import matplotlib.pyplot as plt
from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC

"""Tokenizer"""
def create_tokens(f):
    slashTokens = str(f.encode('utf-8')).split('/')  # splits URL by slash and gets tokens
    totalTokens = []
    for i in slashTokens:
        tokens = str(i).split('-')  # splits URL by dashes and gets tokens
        dotTokens = []
        for j in range(0, len(tokens)):
            temp = str(tokens[j]).split('.')  # splits url by dots and gets tokens
            dotTokens = dotTokens + temp
        totalTokens = totalTokens + tokens + dotTokens
    totalTokens = list(set(totalTokens))  # remove duplicates
    if 'com' in totalTokens:
        totalTokens.remove('com')   # pretty standard in a URL, will not be included in feature set
    return totalTokens

def runClassifiers(X_train, X_test, y_test, y_train):

    y_train = np.array(y_train)
    y_test = np.array(y_test)
    X_train = np.array(X_train)
    X_test = np.array(X_test)

    rbf_svc = SVC(kernel='rbf', gamma=0.00001, C=1000, probability=True).fit(X_train, y_train)
    predictedprobSVC = rbf_svc.predict_proba(X_test)
    fpr, tpr, thresholds = roc_curve(y_test, predictedprobSVC[:, 1], pos_label=2)
    roc_auc = auc(fpr, tpr)
    plt.figure()
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.title('SVM Classifier ROC')
    plt.plot(fpr, tpr, color='blue', lw=2, label='SVM ROC area = %0.2f)' % roc_auc)
    plt.legend(loc="lower right")
    plt.show()


def main():
    load_data = '/Users/nickalonso/Documents/urldata.csv'
    csv = pd.read_csv(load_data)
    csv = pd.DataFrame(csv)
    csv = np.array(csv)
    y = [d[1] for d in csv]
    corpus = [d[0] for d in csv]

    vectorizer = TfidfVectorizer(tokenizer=create_tokens)
    X = vectorizer.fit_transform(corpus) # get X vector

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    runClassifiers(X_train, X_test, y_test, y_train)

if __name__ == '__main__':
    main()

跟踪是:

/Users/nickalonso/.conda/envs/390Project/bin/python "/Users/nickalonso/PycharmProjects/390Project/Phishing Net.py"
/Users/nickalonso/.conda/envs/390Project/lib/python3.6/site-packages/sklearn/externals/joblib/externals/cloudpickle/cloudpickle.py:47: DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses
  import imp
Traceback (most recent call last):
  File "/Users/nickalonso/PycharmProjects/390Project/Phishing Net.py", line 78, in <module>
    main()
  File "/Users/nickalonso/PycharmProjects/390Project/Phishing Net.py", line 75, in main
    runClassifiers(X_train, X_test, y_test, y_train)
  File "/Users/nickalonso/PycharmProjects/390Project/Phishing Net.py", line 47, in runClassifiers
    rbf_svc = SVC(kernel='rbf', gamma=0.00001, C=1000, probability=True).fit(X_train, y_train)
  File "/Users/nickalonso/.conda/envs/390Project/lib/python3.6/site-packages/sklearn/svm/base.py", line 149, in fit
    accept_large_sparse=False)
  File "/Users/nickalonso/.conda/envs/390Project/lib/python3.6/site-packages/sklearn/utils/validation.py", line 747, in check_X_y
    estimator=estimator)
  File "/Users/nickalonso/.conda/envs/390Project/lib/python3.6/site-packages/sklearn/utils/validation.py", line 522, in check_array
    array = np.asarray(array, dtype=dtype, order=order)
  File "/Users/nickalonso/.conda/envs/390Project/lib/python3.6/site-packages/numpy/core/numeric.py", line 501, in asarray
    return array(a, dtype, copy=False, order=order)
ValueError: setting an array element with a sequence.

Process finished with exit code 1

0 个答案:

没有答案