我正在尝试读取一个大小超过 5 GB 的数据集,该数据集是我从 Kaggle 下载的。
# Load the first 10**5 rows of train.csv with explicit per-column dtypes and
# ask pandas to convert the "timestamp" column to datetimes while parsing.
# NOTE(review): this is the call that raises the traceback shown below.
train_df = pd.read_csv(f"{PATH}/train.csv",
low_memory=False,
nrows=10**5,
# NOTE(review): "timestamp" is declared as int64 here and is also listed in
# parse_dates below, so two conversions are requested for the same column.
dtype={'row_id': 'int64', 'timestamp': 'int64', 'user_id': 'int32', 'content_id': 'int16',
'content_type_id': 'int8','task_container_id': 'int16', 'user_answer': 'int8',
'answered_correctly': 'int8', 'prior_question_elapsed_time': 'float32',
'prior_question_had_explanation': 'boolean'},
parse_dates=["timestamp"],
# NOTE(review): per the traceback, pandas first calls date_parser with the
# whole column at once, so int(x) raises TypeError; it then retries one value
# at a time, and fromtimestamp() raises OSError [Errno 22] for some value.
# Presumably the values are not seconds-since-epoch in a range the platform
# accepts -- confirm the column's unit. A vectorized conversion after loading,
# e.g. pd.to_datetime(train_df["timestamp"], unit=...), would avoid the
# per-row lambda -- TODO confirm the unit before applying.
date_parser=lambda x: datetime.datetime.fromtimestamp(int(x)))
我尝试了上面的代码:先为各列指定 dtype,再把所有时间戳转换为日期时间,但出现了如下错误:
TypeError Traceback (most recent call last)
c:\users\public\pycharmprojects\codility_test\venv\lib\site-packages\pandas\io\parsers.py in converter(*date_cols)
3358 result = tools.to_datetime(
-> 3359 date_parser(*date_cols), errors="ignore", cache=cache_dates
3360 )
<ipython-input-82-2cac40069ffd> in <lambda>(x)
8 parse_dates=["timestamp"],
----> 9 date_parser=lambda x: datetime.datetime.fromtimestamp(int(x)))
10 questions_df = pd.read_csv(f"{PATH}/questions.csv")
TypeError: only size-1 arrays can be converted to Python scalars
During handling of the above exception, another exception occurred:
OverflowError Traceback (most recent call last)
c:\users\public\pycharmprojects\codility_test\venv\lib\site-packages\pandas\io\parsers.py in converter(*date_cols)
3367 parsing.try_parse_dates(
-> 3368 parsing.concat_date_cols(date_cols),
3369 parser=date_parser,
pandas\_libs\tslibs\parsing.pyx in pandas._libs.tslibs.parsing.concat_date_cols()
pandas\_libs\tslibs\parsing.pyx in pandas._libs.tslibs.parsing.convert_to_unicode()
OverflowError: Python int too large to convert to C long
During handling of the above exception, another exception occurred:
OSError Traceback (most recent call last)
<ipython-input-82-2cac40069ffd> in <module>
7 'prior_question_had_explanation': 'boolean'},
8 parse_dates=["timestamp"],
----> 9 date_parser=lambda x: datetime.datetime.fromtimestamp(int(x)))
10 questions_df = pd.read_csv(f"{PATH}/questions.csv")
11 lectures_df = pd.read_csv(f"{PATH}/lectures.csv")
c:\users\public\pycharmprojects\codility_test\venv\lib\site-packages\pandas\io\parsers.py in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, error_bad_lines, warn_bad_lines, delim_whitespace, low_memory, memory_map, float_precision)
684 )
685
--> 686 return _read(filepath_or_buffer, kwds)
687
688
c:\users\public\pycharmprojects\codility_test\venv\lib\site-packages\pandas\io\parsers.py in _read(filepath_or_buffer, kwds)
456
457 try:
--> 458 data = parser.read(nrows)
459 finally:
460 parser.close()
c:\users\public\pycharmprojects\codility_test\venv\lib\site-packages\pandas\io\parsers.py in read(self, nrows)
1194 def read(self, nrows=None):
1195 nrows = _validate_integer("nrows", nrows)
-> 1196 ret = self._engine.read(nrows)
1197
1198 # May alter columns / col_dict
c:\users\public\pycharmprojects\codility_test\venv\lib\site-packages\pandas\io\parsers.py in read(self, nrows)
2228 data = {k: v for k, (i, v) in zip(names, data)}
2229
-> 2230 names, data = self._do_date_conversions(names, data)
2231 index, names = self._make_index(data, alldata, names)
2232
c:\users\public\pycharmprojects\codility_test\venv\lib\site-packages\pandas\io\parsers.py in _do_date_conversions(self, names, data)
1968 self.index_names,
1969 names,
-> 1970 keep_date_col=self.keep_date_col,
1971 )
1972
c:\users\public\pycharmprojects\codility_test\venv\lib\site-packages\pandas\io\parsers.py in _process_date_conversion(data_dict, converter, parse_spec, index_col, index_names, columns, keep_date_col)
3411 if _isindex(colspec):
3412 continue
-> 3413 data_dict[colspec] = converter(data_dict[colspec])
3414 else:
3415 new_name, col, old_names = _try_convert_dates(
c:\users\public\pycharmprojects\codility_test\venv\lib\site-packages\pandas\io\parsers.py in converter(*date_cols)
3373 )
3374 except Exception:
-> 3375 return generic_parser(date_parser, *date_cols)
3376
3377 return converter
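c:\users\public\pycharmprojects\codility_test\venv\lib\site-packages\pandas\io\date_converters.py in generic_parser(parse_func, *cols)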
36 for i in range(N):
37 args = [c[i] for c in cols]
---> 38 results[i] = parse_func(*args)
39
40 return results
<ipython-input-82-2cac40069ffd> in <lambda>(x)
7 'prior_question_had_explanation': 'boolean'},
8 parse_dates=["timestamp"],
----> 9 date_parser=lambda x: datetime.datetime.fromtimestamp(int(x)))
10 questions_df = pd.read_csv(f"{PATH}/questions.csv")
11 lectures_df = pd.read_csv(f"{PATH}/lectures.csv")
OSError: [Errno 22] Invalid argument
.shape
train_df.shape  # (number of rows, number of columns) of the loaded frame
我的理解是,出现该错误是因为需要把太多行的时间戳转换为日期时间。我记得曾经见过解决这个问题的方法,但现在想不起来了。有人可以帮忙吗?谢谢。