python sgdclassifier sklearn

时间:2013-06-12 04:04:26

标签: python sql machine-learning

我正在使用Python 2.7并且我安装了sklearn。

data_io代码

import csv
import json
import os
import pickle
import psycopg2

def paper_ids_to_string(ids):
    """Join paper ids into a single space-separated string.

    Args:
        ids: Iterable of paper ids; each one is converted with ``str``.

    Returns:
        The ids separated by single spaces ("" for an empty iterable).
    """
    return " ".join(str(x) for x in ids)

# Cached PostgreSQL connection string; filled in lazily by get_db_conn().
conn_string = None


def get_db_conn():
    """Open a new PostgreSQL connection using the configured connection string.

    The connection string is read from the "postgres_conn_string" setting on
    first use and cached in the module-level ``conn_string``. If it contains
    the ``##AskForPassword##`` placeholder, the user is prompted once and the
    placeholder is replaced in the cached string, so later calls do not
    prompt again.

    Returns:
        A new connection object from ``psycopg2.connect``. The caller is
        responsible for closing it.
    """
    global conn_string
    if conn_string is None:
        conn_string = get_paths()["postgres_conn_string"]
    if "##AskForPassword##" in conn_string:
        # getpass does not echo the password to the terminal and, unlike
        # raw_input, exists on both Python 2 and Python 3.
        import getpass
        password = getpass.getpass("PostgreSQL Password: ")
        conn_string = conn_string.replace("##AskForPassword##", password)
    return psycopg2.connect(conn_string)

def get_paths():
    """Load SETTINGS.json and expand environment variables in every value.

    The file is read from the current working directory.

    Returns:
        dict mapping each setting name to its value after
        ``os.path.expandvars`` has been applied.
    """
    # The context manager closes the settings file even if parsing fails.
    with open("SETTINGS.json") as settings_file:
        paths = json.load(settings_file)
    return {key: os.path.expandvars(value) for key, value in paths.items()}

def save_model(model):
    """Pickle *model* to the file configured as "model_path".

    Args:
        model: Any picklable object (the trained classifier).
    """
    out_path = get_paths()["model_path"]
    # Pickle data is binary: "wb" avoids newline translation on Windows and
    # is required on Python 3. The with-block guarantees the file is flushed
    # and closed before the function returns.
    with open(out_path, "wb") as model_file:
        pickle.dump(model, model_file)

def load_model():
    """Unpickle and return the object stored at the configured "model_path".

    Returns:
        Whatever object was previously written to that file.
    """
    in_path = get_paths()["model_path"]
    # NOTE(security): unpickling executes arbitrary code embedded in the
    # file; only load model files produced by a trusted run.
    # Binary mode mirrors how pickle data must be read (required on Python 3).
    with open(in_path, "rb") as model_file:
        return pickle.load(model_file)

def write_submission(predictions):
    """Write the predictions as a CSV file at the configured "submission_path".

    The file gets an ``AuthorId,PaperIds`` header followed by one row per
    author, where PaperIds is the author's paper ids joined by spaces.

    Args:
        predictions: Mapping of author id -> iterable of paper ids, in the
            order they should appear in the submission.
    """
    submission_path = get_paths()["submission_path"]
    rows = [(author_id, paper_ids_to_string(predictions[author_id]))
            for author_id in predictions]
    # Close the file explicitly so every row is flushed to disk.
    with open(submission_path, "w") as submission_file:
        writer = csv.writer(submission_file, lineterminator="\n")
        writer.writerow(("AuthorId", "PaperIds"))
        writer.writerows(rows)

def get_features_db(table_name):
    """Run the feature query against *table_name* and return all rows.

    Args:
        table_name: Name of the table substituted into the feature query.

    Returns:
        The list of rows returned by ``cursor.fetchall()``.
    """
    conn = get_db_conn()
    # Each call opens its own connection, so release the cursor and the
    # connection even when the query fails; otherwise every call leaks one.
    try:
        query = get_features_query(table_name)
        cursor = conn.cursor()
        try:
            cursor.execute(query)
            return cursor.fetchall()
        finally:
            cursor.close()
    finally:
        conn.close()

def get_features_query(table_name):
    """Return the SQL from feature_query.sql with the table name filled in.

    The file is read from the current working directory, surrounding
    whitespace is stripped, and every ``##DataTable##`` placeholder is
    replaced by *table_name*.

    Args:
        table_name: Table name inserted verbatim into the SQL text. It is not
            escaped, so it must come from trusted code, never from user input.

    Returns:
        The query string ready to be executed.
    """
    with open("feature_query.sql") as query_file:
        query = query_file.read().strip()
    return query.replace("##DataTable##", table_name)

这是训练(train)代码

import data_io
from sklearn.linear_model import SGDClassifier

def main():
    """Train an SGD classifier on deleted vs. confirmed papers and save it.

    Rows from "TrainDeleted" are labelled 0 and rows from "TrainConfirmed"
    are labelled 1. The fitted classifier is stored via data_io.save_model.
    """
    print("Getting features for deleted papers from the database")
    deleted_rows = data_io.get_features_db("TrainDeleted")

    print("Getting features for confirmed papers from the database")
    confirmed_rows = data_io.get_features_db("TrainConfirmed")

    # The first two columns of every row are dropped; only the remaining
    # columns are used as features.
    all_rows = deleted_rows + confirmed_rows
    features = [row[2:] for row in all_rows]
    target = [0] * len(deleted_rows) + [1] * len(confirmed_rows)

    print("Training the Classifier")
    # NOTE(review): the rows are ordered all-deleted then all-confirmed and
    # shuffle=False is passed, so the classifier presumably sees the classes
    # in that fixed order -- confirm this is intended.
    sgd_params = dict(
        alpha=0.0001,
        class_weight='auto',
        epsilon=0.1,
        eta0=0.0,
        fit_intercept=True,
        l1_ratio=0.15,
        learning_rate='optimal',
        loss='log',
        n_iter=5,
        n_jobs=1,
        penalty='l2',
        power_t=0.5,
        random_state=None,
        rho=None,
        shuffle=False,
        verbose=0,
        warm_start=False,
    )
    classifier = SGDClassifier(**sgd_params)
    classifier.fit(features, target)

    print("Saving the classifier")
    data_io.save_model(classifier)


if __name__ == "__main__":
    main()

这是预测代码

from collections import defaultdict
import data_io

def main():
    """Rank each author's papers by predicted probability and write them out.

    Loads the feature rows for "ValidPaper", scores them with the saved
    classifier, sorts every author's papers from highest to lowest score and
    passes the result to data_io.write_submission.
    """
    print("Getting features for valid papers from the database")
    data = data_io.get_features_db("ValidPaper")
    # The first two columns of each row are (author id, paper id); the rest
    # are the feature values.
    author_paper_ids = [x[:2] for x in data]
    features = [x[2:] for x in data]

    print("Loading the classifier")
    classifier = data_io.load_model()

    print("Making predictions")
    # predict_proba is a method, so it has to be called with the feature
    # matrix before the result can be sliced. Column 1 of the result is taken
    # as the score for label 1 (the training script labels confirmed papers 1).
    predictions = list(classifier.predict_proba(features)[:, 1])

    author_predictions = defaultdict(list)
    paper_predictions = {}

    for (a_id, p_id), pred in zip(author_paper_ids, predictions):
        author_predictions[a_id].append((pred, p_id))

    for author_id in sorted(author_predictions):
        # Tuples sort by score first, so reverse=True puts the most likely
        # papers at the front.
        paper_ids_sorted = sorted(author_predictions[author_id], reverse=True)
        paper_predictions[author_id] = [x[1] for x in paper_ids_sorted]

    print("Writing predictions to file")
    data_io.write_submission(paper_predictions)


if __name__ == "__main__":
    main()

当我尝试运行预测代码时,`predictions = classifier.predict_proba[:,1]` 这一行抛出了以下类型错误:`TypeError: 'instancemethod' object is not subscriptable`('instancemethod' 对象不支持下标访问)。

我该怎么办?

1 个答案:

答案 0 :(得分:0)

predict_proba 是一个方法,不能直接对它做下标访问。您需要先用特征矩阵(或特征向量)调用它,再对返回的数组取列,例如:`predictions = classifier.predict_proba(features)[:, 1]`。