
时间:2017-08-03 19:49:56

标签: python machine-learning scikit-learn




在训练我的 ML 模型时(我使用了 pickle,因此部分变量会在下一个文件中加载),特征是 ml_list,标签是 labels。我将多个 CSV 文件加载到 Pandas DataFrame 中,遍历其中的每一行,提取 'slash_count'、'dash_count'、'colon_count'、'dot_count'、'digits' 和 'letters'。下面给出 ML 模型中用于匹配整数、日期和浮点数的模式示例(实际有多个数据帧,这里是通用模式):

import pandas as pd
import numpy as np
import sklearn
from sklearn import tree
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
import re
import pickle

# Scalar scratch counters used while extracting features; all start at zero.
letters = digit = space = digits = 0
slash_count = dash_count = 0
date_model_count = int_model_count = 0
float_model_count = boolean_model_count = 0

# Accumulators. Each name gets its own list object (deliberately not
# chain-assigned, which would make them aliases of one shared list).
boolean_count = list()
labels = list()
ml_list = list()
ml_list_integer, labels_integer = list(), list()
ml_list_float, labels_float = list(), list()
ml_list_boolean, labels_boolean = list(), list()

def analyze(s):
    """Count the digit and alphabetic characters in ``s``.

    Args:
        s: An iterable of single-character strings (normally a ``str``).

    Returns:
        A two-element list ``[digit_count, letter_count]``. An empty input
        yields ``[0, 0]`` so that callers can always unpack two values.
    """
    digit_total = 0
    letter_total = 0
    for c in s:
        # bool is an int subclass, so True adds 1 and False adds 0.
        digit_total += c.isdigit()
        letter_total += c.isalpha()
    return [digit_total, letter_total]
# Feature vectors that are labelled as dates. Each row uses the same column
# order as the vectors appended to ml_list below:
# [slash_count, dash_count, colon_count, letters, dot_count, digits].
# Built once, outside the loop, because it never changes.
date_pattern_models = [[2, 0, 0, 3, 0, 4],
                       [2, 0, 0, 3, 0, 6],
                       [0, 2, 0, 3, 0, 4],
                       [0, 2, 0, 3, 0, 6],
                       [0, 2, 0, 3, 0, 3],
                       [2, 0, 0, 0, 0, 5],
                       [2, 0, 0, 0, 0, 6],
                       [2, 0, 0, 0, 0, 7],
                       [2, 0, 0, 0, 0, 8],
                       [0, 2, 0, 0, 0, 5],
                       [0, 2, 0, 0, 0, 6],
                       [0, 2, 0, 0, 0, 7],
                       [0, 2, 0, 0, 0, 8],
                       [2, 0, 2, 3, 0, 12],
                       [2, 0, 2, 3, 0, 11],
                       [2, 0, 2, 3, 0, 10],
                       [2, 0, 2, 3, 0, 9],
                       [0, 2, 2, 3, 0, 12],
                       [0, 2, 2, 3, 0, 11],
                       [0, 2, 2, 3, 0, 10],
                       [0, 2, 2, 3, 0, 9],
                       [0, 2, 2, 0, 0, 14],
                       [0, 2, 2, 0, 0, 13],
                       [0, 2, 2, 0, 0, 12],
                       [0, 2, 2, 0, 0, 11],
                       [0, 2, 2, 0, 0, 10],
                       [2, 0, 2, 0, 0, 14],
                       [2, 0, 2, 0, 0, 13],
                       [2, 0, 2, 0, 0, 12],
                       [2, 0, 2, 0, 0, 11],
                       [2, 0, 2, 0, 0, 10]]

# Build one feature vector and exactly one label per value of the column.
# NOTE(review): assumes every value is a str (it is used with .count) --
# confirm how df was loaded.
for x in df['Order Date']:  # Date ML Model
    slash_count = x.count("/")
    dash_count = x.count("-")
    colon_count = x.count(":")
    dot_count = x.count(".")
    digits, letters = analyze(x)
    ml_list.append([slash_count, dash_count, colon_count, letters, dot_count, digits])
    # One label per sample: 1 when the vector is a known date pattern,
    # otherwise 0. ml_list and labels must stay the same length for fitting.
    if ml_list[-1] in date_pattern_models:
        labels.append(1)
    else:
        labels.append(0)

# Leave the scratch counters zeroed once the column has been processed.
slash_count = 0
dash_count = 0
colon_count = 0
digits = 0
letters = 0

# Build one feature vector and exactly one label per value of the column.
# NOTE(review): assumes every value is a str (it is used with .count) --
# confirm how df9 was loaded.
for x in df9["Row ID"]:  # Int
    dot_count = x.count(".")
    slash_count = x.count("/")
    colon_count = x.count(":")
    dash_count = x.count("-")
    digits, letters = analyze(x)
    ml_list_integer.append([slash_count, dash_count, colon_count, letters, dot_count, digits])
    # Use `and`, not `&`: `&` binds tighter than `==`, which turns the test
    # into a chained comparison that never inspects most of the counters.
    looks_like_integer = (
        slash_count == 0
        and colon_count == 0
        and dash_count == 0
        and letters == 0
        and dot_count == 0
    )
    # One label per sample so features and labels stay the same length.
    labels_integer.append(1 if looks_like_integer else 0)

# Leave the scratch counters zeroed once the column has been processed.
dot_count = 0
slash_count = 0
colon_count = 0
dash_count = 0
digits = 0
letters = 0

# Build one feature vector and exactly one label per value of the column.
# NOTE(review): assumes every value is a str (it is used with .count) --
# confirm how df9 was loaded.
for x in df9["Sales"]:  # Float
    dot_count = x.count(".")
    slash_count = x.count("/")
    colon_count = x.count(":")
    dash_count = x.count("-")
    digits, letters = analyze(x)
    ml_list_float.append([slash_count, dash_count, colon_count, letters, dot_count, digits])
    # Use `and`, not `&`: `&` binds tighter than `==`, which turns the test
    # into a chained comparison that can never be true when it ends in `== 1`.
    looks_like_float = (
        slash_count == 0
        and colon_count == 0
        and dash_count == 0
        and letters == 0
        and dot_count == 1
    )
    # One label per sample so features and labels stay the same length.
    labels_float.append(1 if looks_like_float else 0)

# Leave the scratch counters zeroed once the column has been processed.
dot_count = 0
slash_count = 0
colon_count = 0
dash_count = 0
digits = 0
letters = 0

# Named view of the training data. It is not what gets written to disk: the
# dump below stores a positional list.
p_data = {
    'ml_list': ml_list,
    "labels": labels,
    "ml_list_float": ml_list_float,
    "labels_float": labels_float,
    "ml_list_integer": ml_list_integer,
    "labels_integer": labels_integer,
}

# Persist the six lists in a fixed order; whoever loads the file must unpack
# them in exactly this order.
with open('p_data.pickle', 'wb') as f:
    pickle.dump([ml_list, labels, ml_list_integer, labels_integer, ml_list_float, labels_float], f)


import pickle
import pandas as pd
import numpy as np
import sklearn
from sklearn import tree
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
import re

# Restore the six training lists, unpacked in the order they were dumped.
# NOTE: pickle.load can execute arbitrary code; only load a file you produced.
with open('p_data.pickle', 'rb') as f:  # Python 3: open(..., 'rb')
    # The parentheses keep the unpacking and the load in a single statement
    # inside the `with`, while the file is still open.
    (ml_list, labels, ml_list_integer, labels_integer,
     ml_list_float, labels_float) = pickle.load(f)

# Fit one decision tree per target type. `fit` returns the estimator itself,
# so construction and fitting are chained into a single expression.

# Date model
date_clf = tree.DecisionTreeClassifier().fit(ml_list, labels)

# Integer model
integer_clf = tree.DecisionTreeClassifier().fit(ml_list_integer, labels_integer)

# Float model
float_clf = tree.DecisionTreeClassifier().fit(ml_list_float, labels_float)
def date_model(slash_count, dash_count, colon_count, letters,
               dot_count, digits):
    """Predict the date label for one value's character counts.

    The arguments are the six per-value counts, passed to the classifier in
    this order: slash, dash, colon, letters, dot, digits.

    Returns:
        The label predicted by ``date_clf`` for this single sample, so the
        caller can compare it against ``1``.
    """
    # predict expects a 2-D array of samples; wrap the one sample in a list.
    features = [[slash_count, dash_count, colon_count, letters, dot_count, digits]]
    return date_clf.predict(features)[0]
def int_model(slash_count, dash_count, colon_count, letters, dot_count, digits):
    """Predict the integer label for one value's character counts.

    The arguments are the six per-value counts, passed to the classifier in
    this order: slash, dash, colon, letters, dot, digits.

    Returns:
        The label predicted by ``integer_clf`` for this single sample, so the
        caller can compare it against ``1``.
    """
    # predict expects a 2-D array of samples; wrap the one sample in a list.
    features = [[slash_count, dash_count, colon_count, letters, dot_count, digits]]
    return integer_clf.predict(features)[0]
def float_model(slash_count, dash_count, colon_count, letters, dot_count, digits):
    """Predict the float label for one value's character counts.

    The arguments are the six per-value counts, passed to the classifier in
    this order: slash, dash, colon, letters, dot, digits.

    Returns:
        The label predicted by ``float_clf`` for this single sample, so the
        caller can compare it against ``1``.
    """
    # predict expects a 2-D array of samples; wrap the one sample in a list.
    features = [[slash_count, dash_count, colon_count, letters, dot_count, digits]]
    return float_clf.predict(features)[0]

# Load every column as text so each cell can be inspected character by
# character before a type is chosen for its column.
df_fin = pd.read_csv("/Users/rohinmahesh/Downloads/TechCrunchcontinentalUSA.csv", dtype=str)
# drop=True discards the old index instead of inserting it as an "index"
# column and deleting it again, which would remove a genuine CSV column that
# happens to be called "index".
df_fin = df_fin.reset_index(drop=True)
lst = list(df_fin.columns.values)
numrows, numcols = df_fin.shape

# Cursor positions and per-type accumulators used by the scan that follows.
col = 0
row = 0
date_count = []
int_count = []
str_count = []
boolean_count = []
float_count = []
time_count = []
dict = {}  # NOTE(review): shadows the builtin `dict`; name kept for compatibility
keys = []
vals = []
variable_1 = 0
date_model_count = 0
int_model_count = 0
float_model_count = 0

def analyze(s):
    """Return ``[digit_count, letter_count]`` for the characters of ``s``."""
    digit_total = 0
    letter_total = 0
    for c in s:
        digit_total += c.isdigit()
        letter_total += c.isalpha()
    return [digit_total, letter_total]


# Compiled once instead of on every cell.
_WORD_RE = re.compile(r'\b\w+\b')
_BOOLEAN_RE = re.compile(r'TRUE|FALSE|True|False|true|false|t|f|T|F')
# Meaning of each accepted boolean literal, used for the actual conversion.
_BOOLEAN_VALUES = {
    'TRUE': True, 'True': True, 'true': True, 't': True, 'T': True,
    'FALSE': False, 'False': False, 'false': False, 'f': False, 'F': False,
}

# Scan each column, count how many cells each detector accepts, and convert
# the column when every cell that contains a word token was accepted.
for col in range(numcols):
    int_model_count = 0
    float_model_count = 0
    boolean_model_count = 0
    date_model_count = 0
    str_count = []
    boolean_count = []
    # Positional access replaces the removed `.ix` indexer.
    column = df_fin.iloc[:, col]
    for cell in column:
        var2 = str(cell)
        slash_count = var2.count("/")
        dash_count = var2.count("-")
        colon_count = var2.count(":")
        dot_count = var2.count(".")
        digits, letters = analyze(var2)
        date_answer = date_model(slash_count=slash_count, dash_count=dash_count, colon_count=colon_count, letters=letters, dot_count=dot_count, digits=digits)
        if date_answer == 1:
            date_model_count += 1
        int_answer = int_model(slash_count=slash_count, dash_count=dash_count, colon_count=colon_count, letters=letters, dot_count=dot_count, digits=digits)
        if int_answer == 1:
            int_model_count += 1
        float_answer = float_model(slash_count=slash_count, dash_count=dash_count, colon_count=colon_count, letters=letters, dot_count=dot_count, digits=digits)
        if float_answer == 1:
            float_model_count += 1
        # Keep only non-empty results, appended in place rather than
        # rebuilding and re-filtering both lists on every row.
        str_pattern = _WORD_RE.findall(var2)
        if str_pattern:
            str_count.append(str_pattern)
        # The whole cell must be a boolean literal; a substring search would
        # accept any text that merely contains the letter "t" or "f".
        stripped = var2.strip()
        if _BOOLEAN_RE.fullmatch(stripped):
            boolean_count.append([stripped])

    # With no word tokens at all, every count trivially equals zero and every
    # conversion would fire; leave such a column untouched.
    if not str_count:
        continue

    # Changing the column data types
    if int_model_count == len(str_count):
        df_fin[lst[col]] = pd.to_numeric(df_fin[lst[col]], errors='coerce', downcast='integer')
    if float_model_count == len(str_count):
        df_fin[lst[col]] = pd.to_numeric(df_fin[lst[col]], errors='coerce', downcast='float')
    if len(boolean_count) == len(str_count):
        # Map literals to their meaning; astype('bool') on raw text would turn
        # every non-empty string, including "false", into True.
        df_fin[lst[col]] = column.astype(str).str.strip().map(_BOOLEAN_VALUES.get).astype('bool')
    if date_model_count == len(str_count):
        df_fin[lst[col]] = pd.to_datetime(df_fin[lst[col]], errors='coerce')
    # Converting any column that has type object into a string
    # (no code for that step appears in this part of the script)

# Column labels of the converted frame, to be used as dictionary keys.
keys = [*df_fin.columns.values]

0 个答案:
