Question

我一直在与sklearn合作，尝试将文档（以下称为“文档”）分类到它们所属的行业。这是代码（问题的描述在代码下方）：

数据集：file

import pandas as pd
import csv
import urllib2

# I think this is how you open a csv dataset file from a url?
url = 'https://files.fm/down.php?cf&i=qcnmbqxv&n=test_data.csv'
response = urllib2.urlopen(url)

df = pd.read_csv(response)


# this is the "function" f
category_id_df = df[['industry', 'category_id']].drop_duplicates().sort_values('category_id')

category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['category_id', 'industry']].values)

# function f
print(category_id_df)

Output:
                                     industry  category_id
17439                               education            0
11120                             real_estate            1
8838                     social_organizations            2
6722                                      law            3
2782   film_animation_design_and_graphic_arts            4
15541                   international_affairs            5
4540                                     arts            6
12252                                 medical            7
1156                               technology            8
14375                 politics_and_government            9
13405   architecture_and_physical_engineering           10
15                         financial_services           11
15917             health_wellness_and_fitness           12
10108                                    food           13
4376           construction_and_manufacturing           14
5625                  media_and_entertainment           15
7861                              environment           16

import nltk

def tokenize(document):
    tokens = nltk.word_tokenize(document)
    tokens = [word for word in tokens if word.isalpha()]
    return tokens

# rank/vectorize/numerically normalize words
from sklearn.feature_extraction.text import TfidfVectorizer
# parameter to tweak
tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english', ngram_range = (1,2), max_df = 1 - (10 ** (-16)))
# tfidf = TfidfVectorizer(sublinear_tf=True, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')
features = tfidf.fit_transform(df.doc).toarray()
labels = df.category_id

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
#parameter to tweak
from sklearn.linear_model import LogisticRegression
import numpy as np

# make the training set the entire linkedin list
# (perhaps find other sources of data to add to the training set so it doesn't absorb the quirks of linkedin data?)
# make the dev/test set the user input data, use entropy as a measure for the dev set

# this is the dataframe for the test set
df_test = pd.DataFrame(columns=['doc','industry'])

# read in the typeform assessment input
df_assessment_raw = pd.read_csv('typeform_responses.csv')
df_assessment = df_assessment_raw.iloc[:,np.r_[21:24,26:29,31:33]]
df_assessment['industry'] = df_assessment_raw.iloc[:,-1:]

j = 0

total_number_industries = 0

# fill in the rows of the test set dataframe
for index, row in df_assessment.iterrows():
    industry_instance = str(row['industry'])
    if industry_instance in industries:
        total_number_industries += 1
        row = pd.DataFrame(row).transpose()
        row.drop(['industry'], axis=1)
        doc_instance = process_dataframe(row)[0]
        new_row = [doc_instance, industry_instance]
        df_test.loc[j] = new_row
        j += 1

X_train, y_train = df['doc'], df['industry']
X_test, y_test = df_test['doc'], df_test['industry']

count_vect = CountVectorizer()
tfidf_transformer = TfidfTransformer()


def clf(model, X_train, y_train):
    # count_vect could be function g
    X_train_counts = count_vect.fit_transform(X_train)
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
    clf = model.fit(X_train_tfidf, y_train)
    return clf

# create the multiclass logistic regression model clf from the training set
clf_log_reg = clf(LogisticRegression(solver = 'lbfgs', C = 0.5, class_weight='balanced', random_state = 0), X_train, y_train)

b = 0

for index, value in y_test.iteritems():
    if str(value) == 'financial_services':
        b += 1

print(b)

Output: 8

def conf_mat(clf, X_test, y_test):
    # predict the outputs of the test set
    # function g
    y_pred = clf.predict(count_vect.transform(X_test))

    # use a confusion matrix to compare the actual and predicted outputs
    conf_mat = confusion_matrix(y_test, y_pred)
    fig, ax = plt.subplots(figsize=(14,14))
    sns.heatmap(conf_mat, annot=True, fmt='d',
                xticklabels=category_id_df.industry.values, yticklabels=category_id_df.industry.values)
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.show()

conf_mat(clf_log_reg, X_test, y_test)

运行conf_mat返回：

Confusion Matrix

如您所见，虽然“ financial_services”的实际文档数为8，但是混淆矩阵显示“ financial_services”的实际文档数为0。有人知道为什么会这样吗？让我知道是否需要提供更多信息。谢谢！

编辑：所以我认为问题是混淆矩阵x轴和y轴被错误标记，因为准确性，类总数以及其他不正确。由于每个分类类别都被分配了一个类别ID（因此映射f：{industries}-> {1，...，行业数}），因此标签名称错误。但是，我必须在数据上使用CountVectorizer（）才能将其传递到sklearn的predict（）和fit（）函数中，只是以不同的方式将类别分配给数字（因此使用不同的映射g：{industries} > {1，...，行业数量}）。由于f！= g，所以我们最终会出现标签错误的问题，因为我们应用g∘f^ -1而不是应用f∘f^ -1来返回原始标签。但这只是一个假设，因此请带一点盐。

一旦确定了方法，我将上传数据集。

sklearn混淆矩阵显示的实际类别数量与实际数量有所不同

0 个答案: