这个转换错误是什么问题?

时间:2017-08-03 19:49:56

标签: python machine-learning scikit-learn

编辑:我已经实施了其他用户推荐的所有要求和建议。

我的程序的目标是在加载CSV文件后识别各列的数据类型(datetime、string、boolean、int 和 float)。最初我用正则表达式匹配不同的行模式,以同样的逻辑成功实现了这一点。这一次,我的主管希望我尝试训练一个ML模型(Scikit-learn)来识别列数据类型(特别是日期),而不是完全依赖正则表达式。

当使用ML模型(Scikit-learn)时,除了日期之外,似乎所有内容都被正确识别。它不断将日期标识为浮点数。我无法理解的是我使用了与正则表达式相同的逻辑,它适用于除日期之外的所有数据类型。

关于ML模型的训练(我使用了pickle,因此一些变量会在下一个文件中被加载):特征是ml_list,标签是labels。我将多个CSV文件加载到Pandas DataFrame中,遍历其中的每一行,并提取'slash_count'、'dash_count'、'colon_count'、'dot_count'、'digits'和'letters'。下面的代码中包含了用于匹配整数、日期和浮点数的模式样本(有多个数据帧,这里给出的是一般模式):

import pandas as pd
import numpy as np
import sklearn
from sklearn import tree
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
import re
import pickle

# Scalar scratch counters; every one of them starts at zero.
letters = digit = space = 0
slash_count = dash_count = 0
digits = 0
date_model_count = int_model_count = 0
float_model_count = boolean_model_count = 0

boolean_count = []

# Feature rows and their labels: one independent pair of lists per
# target type (date, integer, float, boolean).
ml_list, labels = [], []
ml_list_integer, labels_integer = [], []
ml_list_float, labels_float = [], []
ml_list_boolean, labels_boolean = [], []

def analyze(s):
    """Count the digit and alphabetic characters in ``s``.

    Args:
        s: The string to inspect.

    Returns:
        A two-item list ``[digits, letters]``. An empty string yields
        ``[0, 0]``, so callers can always unpack exactly two values.
    """
    # Two independent sums instead of a zip-transpose: transposing an
    # empty sequence produces no columns at all, which would return [].
    digit_total = sum(ch.isdigit() for ch in s)
    letter_total = sum(ch.isalpha() for ch in s)
    return [digit_total, letter_total]
# Feature rows that count as a date. Column order matches the rows
# appended to ml_list below:
# [slash_count, dash_count, colon_count, letters, dot_count, digits].
# Defined once, ahead of the loop, because the table never changes.
date_pattern_models = [[2, 0, 0, 3, 0, 4],
                       [2, 0, 0, 3, 0, 6],
                       [0, 2, 0, 3, 0, 4],
                       [0, 2, 0, 3, 0, 6],
                       [0, 2, 0, 3, 0, 3],
                       [2, 0, 0, 0, 0, 5],
                       [2, 0, 0, 0, 0, 6],
                       [2, 0, 0, 0, 0, 7],
                       [2, 0, 0, 0, 0, 8],
                       [0, 2, 0, 0, 0, 5],
                       [0, 2, 0, 0, 0, 6],
                       [0, 2, 0, 0, 0, 7],
                       [0, 2, 0, 0, 0, 8],
                       [2, 0, 2, 3, 0, 12],
                       [2, 0, 2, 3, 0, 11],
                       [2, 0, 2, 3, 0, 10],
                       [2, 0, 2, 3, 0, 9],
                       [0, 2, 2, 3, 0, 12],
                       [0, 2, 2, 3, 0, 11],
                       [0, 2, 2, 3, 0, 10],
                       [0, 2, 2, 3, 0, 9],
                       [0, 2, 2, 0, 0, 14],
                       [0, 2, 2, 0, 0, 13],
                       [0, 2, 2, 0, 0, 12],
                       [0, 2, 2, 0, 0, 11],
                       [0, 2, 2, 0, 0, 10],
                       [2, 0, 2, 0, 0, 14],
                       [2, 0, 2, 0, 0, 13],
                       [2, 0, 2, 0, 0, 12],
                       [2, 0, 2, 0, 0, 11],
                       [2, 0, 2, 0, 0, 10]]

for x in df['Order Date']:  # Date ML Model
    slash_count = x.count("/")
    dash_count = x.count("-")
    colon_count = x.count(":")
    dot_count = x.count(".")
    digits, letters = analyze(x)
    ml_list.append([slash_count, dash_count, colon_count, letters, dot_count, digits])
    # Every feature row gets exactly one label: 1 when its counts match
    # a known date signature, 0 otherwise.
    if ml_list[-1] in date_pattern_models:
        labels.append(1)
    else:
        labels.append(0)

# Leave the shared scratch counters zeroed once the column is processed.
slash_count = 0
dash_count = 0
colon_count = 0
digits = 0
letters = 0

for x in df9["Row ID"]:  # Int
    dot_count = x.count(".")
    slash_count = x.count("/")
    colon_count = x.count(":")
    dash_count = x.count("-")
    digits, letters = analyze(x)
    ml_list_integer.append([slash_count, dash_count, colon_count, letters, dot_count, digits])
    # Boolean `and` is required here. `&` binds tighter than `==`, so
    # `a == 0 & b == 0 ...` is parsed as the comparison chain
    # `a == (0 & b) == ...`, which only ever tests `slash_count == 0`.
    if (slash_count == 0 and colon_count == 0 and dash_count == 0
            and letters == 0 and dot_count == 0):
        labels_integer.append(1)
    else:
        labels_integer.append(0)

# Leave the shared scratch counters zeroed once the column is processed.
dot_count = 0
slash_count = 0
colon_count = 0
dash_count = 0
digits = 0
letters = 0

for x in df9["Sales"]:  # Float
    dot_count = x.count(".")
    slash_count = x.count("/")
    colon_count = x.count(":")
    dash_count = x.count("-")
    digits, letters = analyze(x)
    ml_list_float.append([slash_count, dash_count, colon_count, letters, dot_count, digits])
    # Boolean `and` is required here. `&` binds tighter than `==`, so
    # `... & dot_count == 1` is parsed as a comparison chain ending in
    # `(0 & dot_count) == 1`, which is never true: no row would ever be
    # labelled as a float.
    if (slash_count == 0 and colon_count == 0 and dash_count == 0
            and letters == 0 and dot_count == 1):
        labels_float.append(1)
    else:
        labels_float.append(0)

# Leave the shared scratch counters zeroed once the column is processed.
dot_count = 0
slash_count = 0
colon_count = 0
dash_count = 0
digits = 0
letters = 0

# Name-keyed view of the training data. The dump below does not write
# this dict; it writes a positional list instead.
p_data = dict(
    ml_list=ml_list,
    labels=labels,
    ml_list_float=ml_list_float,
    labels_float=labels_float,
    ml_list_integer=ml_list_integer,
    labels_integer=labels_integer,
)
# Persist the six lists in a fixed order: date, integer, float, each as
# (features, labels). A reader must unpack them in this same order.
with open('p_data.pickle', 'wb') as f:
    pickle.dump(
        [ml_list, labels, ml_list_integer, labels_integer, ml_list_float, labels_float],
        f,
    )

加载pickle变量后,我定义了几个函数,用pickle数据训练模型并返回预测结果。然后我把一个示例CSV加载到新的数据帧中来测试模型,算法实现如下:

import pickle
import pandas as pd
import numpy as np
import sklearn
from sklearn import tree
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
import re

# NOTE: pickle.load can execute arbitrary code while deserialising; only
# load a p_data.pickle file that you generated yourself.
with open('p_data.pickle', 'rb') as f:  # Python 3: open(..., 'rb')
    # All six names are unpacked in one parenthesised statement so the
    # load happens while the file is still open. The order is
    # (features, labels) for date, then integer, then float.
    (ml_list, labels, ml_list_integer, labels_integer,
     ml_list_float, labels_float) = pickle.load(f)

# One decision tree per target type, each fitted on its own feature rows
# and labels. fit() returns the fitted estimator, so construction and
# training are chained.
date_clf = tree.DecisionTreeClassifier().fit(ml_list, labels)
integer_clf = tree.DecisionTreeClassifier().fit(ml_list_integer, labels_integer)
float_clf = tree.DecisionTreeClassifier().fit(ml_list_float, labels_float)
def date_model(slash_count, dash_count, colon_count, letters,
               dot_count, digits):
    """Ask the date classifier about one cell's character counts.

    Args:
        slash_count: Number of "/" characters in the cell.
        dash_count: Number of "-" characters in the cell.
        colon_count: Number of ":" characters in the cell.
        letters: Number of alphabetic characters in the cell.
        dot_count: Number of "." characters in the cell.
        digits: Number of digit characters in the cell.

    Returns:
        The array produced by ``date_clf.predict`` for this single
        sample (one element).
    """
    # predict() takes a 2-D array of samples, so the single feature row
    # is wrapped in an outer list.
    sample = [[slash_count, dash_count, colon_count, letters, dot_count, digits]]
    return date_clf.predict(sample)
def int_model(slash_count, dash_count, colon_count, letters, dot_count, digits):
    """Ask the integer classifier about one cell's character counts.

    Args:
        slash_count: Number of "/" characters in the cell.
        dash_count: Number of "-" characters in the cell.
        colon_count: Number of ":" characters in the cell.
        letters: Number of alphabetic characters in the cell.
        dot_count: Number of "." characters in the cell.
        digits: Number of digit characters in the cell.

    Returns:
        The array produced by ``integer_clf.predict`` for this single
        sample (one element).
    """
    # predict() takes a 2-D array of samples, so the single feature row
    # is wrapped in an outer list.
    sample = [[slash_count, dash_count, colon_count, letters, dot_count, digits]]
    return integer_clf.predict(sample)
def float_model(slash_count, dash_count, colon_count, letters, dot_count, digits):
    """Ask the float classifier about one cell's character counts.

    Args:
        slash_count: Number of "/" characters in the cell.
        dash_count: Number of "-" characters in the cell.
        colon_count: Number of ":" characters in the cell.
        letters: Number of alphabetic characters in the cell.
        dot_count: Number of "." characters in the cell.
        digits: Number of digit characters in the cell.

    Returns:
        The array produced by ``float_clf.predict`` for this single
        sample (one element).
    """
    # predict() takes a 2-D array of samples, so the single feature row
    # is wrapped in an outer list.
    sample = [[slash_count, dash_count, colon_count, letters, dot_count, digits]]
    return float_clf.predict(sample)

# Read every column as text so the raw characters of each cell can be
# counted before any type conversion happens.
df_fin = pd.read_csv("/Users/rohinmahesh/Downloads/TechCrunchcontinentalUSA.csv", dtype=str)
# drop=True renumbers the rows without turning the old index into a
# column. Resetting and then deleting 'index' would instead remove a
# genuine CSV column that happens to be called "index".
df_fin = df_fin.reset_index(drop=True)
lst = list(df_fin.columns.values)
numrows, numcols = df_fin.shape
col = 0
row = 0

# Per-column collections and tallies used by the inference pass.
date_count = []
int_count = []
str_count = []
boolean_count = []
float_count = []
time_count = []
# NOTE(review): this name shadows the built-in `dict`; it is kept so that
# any later code referring to it keeps working. Consider renaming.
dict = {}
keys = []
vals = []
variable_1 = 0
date_model_count = 0
int_model_count = 0
float_model_count = 0

def analyze(s):
    """Return ``[digits, letters]``: digit and alphabetic character counts of ``s``."""
    return [sum(c.isdigit() for c in s), sum(c.isalpha() for c in s)]


# For each column: classify every cell with the three models, then
# convert the column when every non-empty cell agreed on a type.
for col, column_name in enumerate(lst):
    int_model_count = 0
    float_model_count = 0
    boolean_model_count = 0
    date_model_count = 0
    del str_count[:]
    del boolean_count[:]
    # Positional access via .iloc; the old .ix indexer no longer exists
    # in current pandas.
    for cell in df_fin.iloc[:, col]:
        var2 = str(cell)
        slash_count = var2.count("/")
        dash_count = var2.count("-")
        colon_count = var2.count(":")
        dot_count = var2.count(".")
        digits, letters = analyze(var2)
        date_answer = date_model(slash_count=slash_count, dash_count=dash_count, colon_count=colon_count, letters=letters, dot_count=dot_count, digits=digits)
        if date_answer == 1:
            date_model_count += 1
        int_answer = int_model(slash_count=slash_count, dash_count=dash_count, colon_count=colon_count, letters=letters, dot_count=dot_count, digits=digits)
        if int_answer == 1:
            int_model_count += 1
        float_answer = float_model(slash_count=slash_count, dash_count=dash_count, colon_count=colon_count, letters=letters, dot_count=dot_count, digits=digits)
        if float_answer == 1:
            float_model_count += 1
        str_pattern = re.findall(r'\b\w+\b', var2)
        # NOTE(review): the bare t|f|T|F alternatives match any cell that
        # merely contains one of those letters. Confirm this is intended.
        boolean_pattern = re.findall(r'TRUE|FALSE|True|False|true|false|t|f|T|F', var2)
        # Only non-empty match lists are kept, so len(str_count) is the
        # number of cells that contained at least one word token.
        if str_pattern:
            str_count.append(str_pattern)
        if boolean_pattern:
            boolean_count.append(boolean_pattern)
    # Changing the column data types. The checks are independent, so a
    # later matching conversion overrides an earlier one.
    if int_model_count == len(str_count):
        df_fin[column_name] = pd.to_numeric(df_fin[column_name], errors='coerce', downcast='integer')
    if float_model_count == len(str_count):
        df_fin[column_name] = pd.to_numeric(df_fin[column_name], errors='coerce', downcast='float')
    if len(boolean_count) == len(str_count):
        df_fin[column_name] = df_fin[column_name].astype('bool')
    if date_model_count == len(str_count):
        df_fin[column_name] = pd.to_datetime(df_fin[column_name], errors='coerce')
    # Converting any column that has type object into a string. The
    # built-in `object` is used because the np.object alias has been
    # removed from NumPy.
    df_fin.update(df_fin.select_dtypes(include=[object]).astype(str))

# Collect the column names (intended as dictionary keys) and show the
# dtype of each column.
keys = [*df_fin.columns.values]
print(df_fin.dtypes)

0 个答案:

没有答案