这可能会在非常愚蠢的问题上结束,但作为python中的新手,我无法找到解决问题的好方法。
class Preprocessor:
mPath = None;
df = None;
def __init__(self, path):
self.mPath = path;
def read(self):
self.df = pd.read_csv(self.mPath);
return self.df;
def __findUniqueGenres(self):
setOfGenres = set();
for index, genre in self.df['genres'].iteritems():
listOfGenreInMovie = genre.lower().split("|");
for i, _genre in np.ndenumerate(listOfGenreInMovie):
setOfGenres.add(_genre)
return setOfGenres;
def __prepareDataframe(self, genres):
all_columns = set(["title", "movieId"]).union(genres)
_df = pd.DataFrame(columns=all_columns)
return _df;
def __getRowTemplate(self, listOfColumns):
_rowTemplate = {}
for col in listOfColumns:
_rowTemplate[col] = 0
return _rowTemplate;
def __createRow(self, rowTemplate, row):
rowTemplate['title'] = row.title;
rowTemplate['movieId'] = row.movieId;
movieGenres = row.genres.lower().split("|");
for movieGenre in movieGenres:
rowTemplate[movieGenre] = 1;
return rowTemplate;
def tranformDataFrame(self):
genres = self.__findUniqueGenres();
print('### List of genres...', genres);
__df = self.__prepareDataframe(genres); # Data frame with all required columns.
rowTemplate = self.__getRowTemplate(__df.columns)
print('### Row template looks like -->', rowTemplate)
collection = []
for index, row in self.df.iterrows():
_rowToAdd=self.__createRow(rowTemplate, row);
print('### Row looks like', _rowToAdd)
collection.append(_rowToAdd)
print('### Collection looks like', collection)
return __df.append(collection)
当我尝试将_rowToAdd
附加到集合时,它会结束最后一项(self.df
的最后一行)的集合。
下面是相同的日志(self.df
这里有3行),
### List of genres... {'mystery', 'horror', 'comedy', 'drama', 'thriller', 'children', 'adventure'}
### Row template looks like --> {'title': 0, 'horror': 0, 'comedy': 0, 'drama': 0, 'children': 0, 'mystery': 0, 'movieId': 0, 'thriller': 0, 'adventure': 0}
### Row looks like {'title': 'Big Night (1996)', 'horror': 0, 'comedy': 1, 'drama': 1, 'children': 0, 'mystery': 0, 'movieId': 994, 'thriller': 0, 'adventure': 0}
### Row looks like {'title': 'Grudge, The (2004)', 'horror': 1, 'comedy': 1, 'drama': 1, 'children': 0, 'mystery': 1, 'movieId': 8947, 'thriller': 1, 'adventure': 0}
### Row looks like {'title': 'Cheetah (1989)', 'horror': 1, 'comedy': 1, 'drama': 1, 'children': 1, 'mystery': 1, 'movieId': 2039, 'thriller': 1, 'adventure': 1}
### Collection looks like [{'title': 'Cheetah (1989)', 'horror': 1, 'comedy': 1, 'drama': 1, 'children': 1, 'mystery': 1, 'movieId': 2039, 'thriller': 1, 'adventure': 1}, {'title': 'Cheetah (1989)', 'horror': 1, 'comedy': 1, 'drama': 1, 'children': 1, 'mystery': 1, 'movieId': 2039, 'thriller': 1, 'adventure': 1}, {'title': 'Cheetah (1989)', 'horror': 1, 'comedy': 1, 'drama': 1, 'children': 1, 'mystery': 1, 'movieId': 2039, 'thriller': 1, 'adventure': 1}]
我想要我的收藏品
### [
{'title': 'Big Night (1996)', 'horror': 0, 'comedy': 1, 'drama': 1, 'children': 0, 'mystery': 0, 'movieId': 994, 'thriller': 0, 'adventure': 0},
{'title': 'Grudge, The (2004)', 'horror': 1, 'comedy': 0, 'drama': 0, 'children': 0, 'mystery': 1, 'movieId': 8947, 'thriller': 1, 'adventure': 0},
{'title': 'Cheetah (1989)', 'horror': 0, 'comedy': 0, 'drama': 0, 'children': 1, 'mystery': 0, 'movieId': 2039, 'thriller': 0, 'adventure': 1}
]
数据集 - https://grouplens.org/datasets/movielens/
答案 0 :(得分:0)
我现在理解了这个问题,我试图改变字典对象。
def tranformDataFrame(self):
genres = self.__findUniqueGenres();
print('### List of genres...', genres);
__df = self.__prepareDataframe(genres); # Data frame with all required columns.
rowTemplate = self.__getRowTemplate(__df.columns)
print('### Row template looks like -->', rowTemplate)
collection = []
for index, row in self.df.iterrows():
# Creating the fresh copy of row template every time prevent mutation.
_rowToAdd = self.__createRow(self.__getRowTemplate(__df.columns), row);
print('### Row looks like', _rowToAdd)
collection.append(_rowToAdd)
print('### Collection looks like', collection)
return __df.append(collection)
虽然必须有某种方法来缓存副本并每次克隆它(而不是处理某些逻辑,并创建字典)。但是,此解决方案至少可以解决这一特定问题。