我正在尝试从 football-data.co.uk 下载数据:
import os
import pandas as pd
# Codes iterated by download_statistics(); each one is interpolated into the
# download URL and also names the local "<code>.csv" output file.
GAMES = ['E0', 'E1', 'E2', 'E3']
def download_statistics():
    """Download every season's CSV and append it to one local file per code.

    For each season starting in 2003 through 2019 and each code in ``GAMES``,
    reads ``http://football-data.co.uk/mmz4281/<season>/<code>.csv`` and
    appends its rows to ``<code>.csv`` in the current working directory,
    creating that file when it does not exist yet.

    Raises:
        pandas.errors.ParserError: If a downloaded file contains a data row
            with more fields than its header row; ``pd.read_csv`` is called
            with default options and does not tolerate such rows.
    """
    for year in range(2003, 2020):
        # Last two digits of the start and end year, e.g. 2003 -> "0304".
        year_format = str(year)[-2:] + str(year + 1)[-2:]
        for game in GAMES:
            file_name = f'{game}.csv'
            previous_data = None
            if os.path.isfile(file_name):
                previous_data = pd.read_csv(file_name)
            url_data = pd.read_csv(
                f'http://football-data.co.uk/mmz4281/{year_format}/{game}.csv'
            )
            if previous_data is not None:
                url_data = pd.concat([previous_data, url_data])
            # index=False: writing the row index would add an unnamed first
            # column that comes back as an extra "Unnamed: 0" column the next
            # time this file is read, growing by one column per append.
            url_data.to_csv(file_name, index=False)
if __name__ == '__main__':
    # Run the download only when executed as a script, not on import.
    download_statistics()
我知道有些单元格缺少数据,但 pandas 不知为何无法处理它们,并且会报错。我尝试过使用其他分隔符,但没有用。
Traceback (most recent call last):
File "F:\Programowanie\GitHub Repositories\football_predict\venv\lib\site-packages\pandas\io\parsers.py", line 454, in _read
data = parser.read(nrows)
File "F:\Programowanie\GitHub Repositories\football_predict\venv\lib\site-packages\pandas\io\parsers.py", line 1133, in read
ret = self._engine.read(nrows)
File "F:\Programowanie\GitHub Repositories\football_predict\venv\lib\site-packages\pandas\io\parsers.py", line 2037, in read
data = self._reader.read(nrows)
File "pandas\_libs\parsers.pyx", line 860, in pandas._libs.parsers.TextReader.read
File "pandas\_libs\parsers.pyx", line 875, in pandas._libs.parsers.TextReader._read_low_memory
File "pandas\_libs\parsers.pyx", line 929, in pandas._libs.parsers.TextReader._read_rows
File "pandas\_libs\parsers.pyx", line 916, in pandas._libs.parsers.TextReader._tokenize_rows
File "pandas\_libs\parsers.pyx", line 2071, in pandas._libs.parsers.raise_parser_error
pandas.errors.ParserError: Error tokenizing data. C error: Expected 57 fields in line 305, saw 72
下面的代码在读取时不会报错,但得到的是形状为 (380, 1) 的 DataFrame;随后我尝试按逗号拆分数据:
import os
import io
import pandas as pd
import requests
# Codes iterated by download_statistics(); each one is interpolated into the
# download URL and also names the local "<code>.csv" output file.
GAMES = ['E0', 'E1', 'E2', 'E3']
def download_statistics():
    """Download every season's CSV and append it to one local file per code.

    For each season starting in 2003 through 2019 and each code in ``GAMES``,
    fetches ``http://football-data.co.uk/mmz4281/<season>/<code>.csv``, splits
    every line on commas (so rows of differing width are accepted) and appends
    the result to ``<code>.csv`` in the current working directory.

    Fields located past the last named header column are dropped. Quoted
    fields containing commas are not supported, because lines are split on
    every comma.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    for year in range(2003, 2020):
        # Last two digits of the start and end year, e.g. 2003 -> "0304".
        year_format = str(year)[-2:] + str(year + 1)[-2:]
        for game in GAMES:
            file_name = f'{game}.csv'
            previous_data = None
            if os.path.isfile(file_name):
                previous_data = pd.read_csv(file_name)

            response = requests.get(
                f'http://football-data.co.uk/mmz4281/{year_format}/{game}.csv'
            )
            response.raise_for_status()

            # Read each line as one single field (assumes the literal text
            # 'delimiter' never occurs in the data). header=None keeps the
            # first line as data and labels the only column 0; without it the
            # column is labelled with the header text and `[0]` raises
            # KeyError. Multi-character separators are treated as regex, which
            # only the python engine supports.
            raw_lines = pd.read_csv(
                io.StringIO(response.text),
                sep='delimiter',
                header=None,
                engine='python',
            )
            fields = raw_lines[0].str.split(',', expand=True)

            # The first row holds the column names. Positions without a name
            # (padding added by expand=True, or empty trailing header cells)
            # are discarded together with their data.
            header = fields.iloc[0]
            named = (header.notna() & (header != '')).to_numpy()
            url_data = fields.iloc[1:, named]
            url_data.columns = header[named].to_numpy()

            if previous_data is not None:
                url_data = pd.concat([previous_data, url_data])
            # index=False: writing the row index would add an unnamed first
            # column that comes back as "Unnamed: 0" on the next read.
            url_data.to_csv(file_name, index=False)
if __name__ == '__main__':
    # Run the download only when executed as a script, not on import.
    download_statistics()
但是我收到另一个错误:
F:/Programowanie/GitHub Repositories/football_predict/data.py:23: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
url_data = pd.read_csv(io.StringIO(response.text), sep='delimiter')
Traceback (most recent call last):
File "F:\Programowanie\GitHub Repositories\football_predict\venv\lib\site-packages\pandas\core\indexes\base.py", line 2646, in get_loc
return self._engine.get_loc(key)
File "pandas\_libs\index.pyx", line 111, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\index.pyx", line 138, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\hashtable_class_helper.pxi", line 1619, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas\_libs\hashtable_class_helper.pxi", line 1627, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 0
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "D:\Program Files\PyCharm 2019.3.4\plugins\python\helpers\pydev\pydevd.py", line 1434, in _exec
pydev_imports.execfile(file, globals, locals) # execute the script
File "D:\Program Files\PyCharm 2019.3.4\plugins\python\helpers\pydev\_pydev_imps\_pydev_execfile.py", line 18, in execfile
exec(compile(contents+"\n", file, 'exec'), glob, loc)
File "F:/Programowanie/GitHub Repositories/football_predict/data.py", line 34, in <module>
download_statistics()
File "F:/Programowanie/GitHub Repositories/football_predict/data.py", line 24, in download_statistics
url_data = url_data[0].str.split(',', expand=True)
File "F:\Programowanie\GitHub Repositories\football_predict\venv\lib\site-packages\pandas\core\frame.py", line 2800, in __getitem__
indexer = self.columns.get_loc(key)
File "F:\Programowanie\GitHub Repositories\football_predict\venv\lib\site-packages\pandas\core\indexes\base.py", line 2648, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas\_libs\index.pyx", line 111, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\index.pyx", line 138, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\hashtable_class_helper.pxi", line 1619, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas\_libs\hashtable_class_helper.pxi", line 1627, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 0
我是不是遗漏了什么?
答案 0 :(得分:0)
这解决了我的问题:
Python Pandas Error tokenizing data
将文件的第一行按逗号拆分,并把拆分结果作为 names 参数传给 read_csv。
import os
import pandas as pd
import requests
# Codes iterated by download_statistics(); each one is interpolated into the
# download URL and also names the local "<code>.csv" output file.
GAMES = ['E0', 'E1', 'E2', 'E3']
def download_statistics():
    """Download every season's CSV and append it to one local file per code.

    For each season starting in 2003 through 2019 and each code in ``GAMES``,
    fetches ``http://football-data.co.uk/mmz4281/<season>/<code>.csv`` once,
    takes the column names from its first line, parses the remaining lines
    with those names and appends them to ``<code>.csv`` in the current
    working directory.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        IndexError: If a downloaded file is empty (no first line to read the
            column names from).
    """
    # Local import: the import block of this snippet does not provide io.
    import io

    for year in range(2003, 2020):
        # Last two digits of the start and end year, e.g. 2003 -> "0304".
        year_format = str(year)[-2:] + str(year + 1)[-2:]
        for game in GAMES:
            file_name = f'{game}.csv'
            previous_data = None
            if os.path.isfile(file_name):
                previous_data = pd.read_csv(file_name)

            url = f'http://football-data.co.uk/mmz4281/{year_format}/{game}.csv'
            response = requests.get(url)
            response.raise_for_status()

            column_names = response.text.splitlines()[0].split(',')
            # Parse the bytes already downloaded instead of fetching the URL a
            # second time. Passing `names` makes read_csv treat every line as
            # data, so the header line is skipped explicitly; otherwise it
            # would be stored as a data row for every season. skiprows=1 is
            # used rather than header=0 so the parser stays in the same
            # names-only mode as before.
            url_data = pd.read_csv(
                io.BytesIO(response.content),
                names=column_names,
                skiprows=1,
            )

            if previous_data is not None:
                url_data = pd.concat([previous_data, url_data])
            # index=False: writing the row index would add an unnamed first
            # column that comes back as "Unnamed: 0" on the next read.
            url_data.to_csv(file_name, index=False)
if __name__ == '__main__':
    # Run the download only when executed as a script, not on import.
    download_statistics()