I was doing some researches in text analyzing with scikit-learn when I was faced with a problem. I created a new estimator for grid search:
class DataJoiner(BaseEstimator):
def __init__(self, type_of_joining="average", necessary_count_of_words=20, count_of_joined_messages=3):
self.count_of_joined_messages = count_of_joined_messages
self.type_of_joining = type_of_joining
self.necessary_count_of_words = necessary_count_of_words
def fit(self, df, y):
if self.type_of_joining == "average":
authors = y
self.count_of_words_and_num = {author: [] for author in authors}
all_count_of_words = 0
for i in list(df.axes[0]):
count_of_words = len(list(filter(lambda x: x != "" and x != "\n",
re.sub('[^а-яА-Яa-zA-Z0-9)(-]', ' ', str(df["text"][i])).split(" "))))
self.count_of_words_and_num[df["id_user"][i]].append((i, count_of_words))
all_count_of_words += count_of_words
self.critical_count_of_words = int(all_count_of_words / len(df["text"]))
return self
elif self.type_of_joining == "necessary_count_of_words":
self.critical_count_of_words = self.necessary_count_of_words
return self
else:
raise ValueError("Not available value of parameter")
def transform(self, df, y):
temp_df = df
rows_to_drop = []
authors = y
for author in authors:
need_to_join = []
for num, count_of_words in self.count_of_words_and_num[author]:
if count_of_words < self.critical_count_of_words:
need_to_join.append(num)
for i in range(0, len(need_to_join) - (self.count_of_joined_messages - 1), self.count_of_joined_messages):
temp_df["text"][need_to_join[i]] = str(temp_df["text"][need_to_join[i]])
for j in range(1, self.count_of_joined_messages):
temp_df["text"][need_to_join[i]] += " " + str(temp_df["text"][need_to_join[i + j]])
for key in ["doc", "fwd_messages", "photo", "video", "bad_smile", "good_smile", "is_upper_words", "repeated_chars"]:
if functools.reduce(lambda res, x: (res == 1) or (x == 1), [temp_df[key][need_to_join[i + j]] for j in range(0, self.count_of_joined_messages)]):
temp_df[key][need_to_join[i]] = 1
rows_to_drop.append(need_to_join[i + 1])
rows_to_drop.append(need_to_join[i + 2])
temp_df.drop(temp_df["Unnamed: 0"][rows_to_drop], inplace=True)
temp_df.drop("id_user", axis=1, inplace=True)
return temp_df
Moved it into the pipeline with other estimators
text_pipe = make_pipeline(DataJoiner(), SelectKBest(), SVC(kernel="rbf", random_state=241))
param = {"datajoiner__type_of_joining": ["average", "necessary_count_of_words"],
"datajoiner__necessary_count_of_words": [10, 20, 50],
"selectkbest__k": [100, 500, 1000, 3000, 5000],
"svc__C": numpy.power(10.0, numpy.arange(-5, 6)),
"svc__gamma": [random.random() for i in range(0, 10)]}
gs = GridSearchCV(self.text_pipe, param_grid=param)
gs.fit(df, y)
And got this error:
TypeError: transform() missing 1 required positional argument: 'y'
I have read about contributions for new estimators here http://scikit-learn.org/stable/developers/contributing.html#rolling-your-own-estimator并且不明白我的估算工具有什么问题。