Question

I was doing some researches in text analyzing with scikit-learn when I was faced with a problem. I created a new estimator for grid search:

class DataJoiner(BaseEstimator):
    def __init__(self, type_of_joining="average", necessary_count_of_words=20, count_of_joined_messages=3):
        self.count_of_joined_messages = count_of_joined_messages
        self.type_of_joining = type_of_joining
        self.necessary_count_of_words = necessary_count_of_words

    def fit(self, df, y):
        if self.type_of_joining == "average":
            authors = y
            self.count_of_words_and_num = {author: [] for author in authors}
            all_count_of_words = 0
            for i in list(df.axes[0]):
                count_of_words = len(list(filter(lambda x: x != "" and x != "\n",
                                         re.sub('[^а-яА-Яa-zA-Z0-9)(-]', ' ', str(df["text"][i])).split(" "))))
                self.count_of_words_and_num[df["id_user"][i]].append((i, count_of_words))
                all_count_of_words += count_of_words
                self.critical_count_of_words = int(all_count_of_words / len(df["text"]))
            return self
        elif self.type_of_joining == "necessary_count_of_words":
            self.critical_count_of_words = self.necessary_count_of_words
            return self
        else:
            raise ValueError("Not available value of parameter")

    def transform(self, df, y):
            temp_df = df
            rows_to_drop = []
            authors = y
            for author in authors:
                need_to_join = []
                for num, count_of_words in self.count_of_words_and_num[author]:
                    if count_of_words < self.critical_count_of_words:
                        need_to_join.append(num)
                for i in range(0, len(need_to_join) - (self.count_of_joined_messages - 1), self.count_of_joined_messages):
                    temp_df["text"][need_to_join[i]] = str(temp_df["text"][need_to_join[i]])
                    for j in range(1, self.count_of_joined_messages):
                        temp_df["text"][need_to_join[i]] += " " + str(temp_df["text"][need_to_join[i + j]])
                    for key in ["doc", "fwd_messages", "photo", "video", "bad_smile", "good_smile", "is_upper_words", "repeated_chars"]:
                        if functools.reduce(lambda res, x: (res == 1) or (x == 1), [temp_df[key][need_to_join[i + j]] for j in range(0, self.count_of_joined_messages)]):
                            temp_df[key][need_to_join[i]] = 1
                    rows_to_drop.append(need_to_join[i + 1])
                    rows_to_drop.append(need_to_join[i + 2])
            temp_df.drop(temp_df["Unnamed: 0"][rows_to_drop], inplace=True)
            temp_df.drop("id_user", axis=1, inplace=True)
            return temp_df

Moved it into the pipeline with other estimators

text_pipe = make_pipeline(DataJoiner(), SelectKBest(), SVC(kernel="rbf", random_state=241))

param = {"datajoiner__type_of_joining": ["average", "necessary_count_of_words"],
                  "datajoiner__necessary_count_of_words": [10, 20, 50],
                  "selectkbest__k": [100, 500, 1000, 3000, 5000],
                  "svc__C": numpy.power(10.0, numpy.arange(-5, 6)),
                  "svc__gamma": [random.random() for i in range(0, 10)]}
gs = GridSearchCV(self.text_pipe, param_grid=param)
gs.fit(df, y)

And got this error:

TypeError: transform() missing 1 required positional argument: 'y'

I have read about contributions for new estimators here http://scikit-learn.org/stable/developers/contributing.html#rolling-your-own-estimator并且不明白我的估算工具有什么问题。

Can not make implementation of new estimator for grid search in scikit-learn

0 个答案: