我有一个 14MB、包含五个工作表的 Excel 文件,正在将其读入 Pandas 数据框。虽然下面的代码有效,但需要 9 分钟!
有没有人有加快速度的建议?
import pandas as pd
def OTT_read(xl, site_name):
    """Read one station worksheet from an already-open ExcelFile.

    Parameters
    ----------
    xl : pandas.ExcelFile
        The open workbook (opened once by the caller).
    site_name : str
        Worksheet name; also used to label the depth/temp columns.

    Returns
    -------
    pandas.DataFrame
        Indexed by the parsed first (date_time) column, with columns
        ``<site>_depth`` and ``<site>_temp``.
    """
    # Use xl.parse() so the workbook that was already opened and parsed by
    # pd.ExcelFile is reused; the original pd.read_excel(xl.io, ...) forced
    # pandas to re-open and re-read the file from disk for every sheet.
    df = xl.parse(
        site_name,
        skiprows=2,
        parse_dates=[0],  # parse_dates expects a list of column positions
        index_col=0,
        usecols=[0, 1, 2],
        header=None,
        names=['date_time', '%s_depth' % site_name, '%s_temp' % site_name],
    )
    return df
def make_OTT_df(FILEDIR, OTT_FILE):
    """Open the workbook and read every worksheet into its own DataFrame.

    Returns a tuple ``(sheet_names, dataframes)`` where the i-th DataFrame
    corresponds to the i-th sheet name.
    """
    workbook = pd.ExcelFile(FILEDIR + OTT_FILE)
    frames = []
    for name in workbook.sheet_names:
        frames.append(OTT_read(workbook, name))
    return workbook.sheet_names, frames
# Hard-coded location of the source workbook. Reading all five worksheets
# through make_OTT_df this way is what takes ~9 minutes — the problem the
# question is asking about.
FILEDIR='c:/downloads/'
OTT_FILE='OTT_Data_All_stations.xlsx'
site_names_OTT,df_list_OTT = make_OTT_df(FILEDIR,OTT_FILE)
答案 0 :(得分:18)
正如其他人所说,csv阅读速度更快。因此,如果您在Windows上并拥有Excel,则可以调用vbscript将Excel转换为csv然后读取csv。我尝试了下面的脚本,花了大约30秒。
# Convert each worksheet to csv with a helper VBScript and read the csv:
# much faster than pandas' xlsx parsing (~30 s vs ~9 min for this workbook).
from subprocess import call

# Sheet numbers to process, as strings for the command line ("1".."5").
# An explicit list (rather than a lazy map object) so it can be inspected
# and reused; the original map() iterator would be exhausted after one pass
# on Python 3.
sheets = [str(n) for n in range(1, 6)]

excel = 'C:\\Users\\rsignell\\OTT_Data_All_stations.xlsx'
df = {}
for sheet in sheets:
    # Intermediate csv named after the sheet number it came from.
    # Renamed from `csv` so the stdlib csv module name is not shadowed.
    csv_path = 'C:\\Users\\rsignell\\test' + sheet + '.csv'
    # cscript runs ExcelToCsv.vbs (created below) to export one sheet.
    call(['cscript.exe', 'C:\\Users\\rsignell\\ExcelToCsv.vbs', excel, csv_path, sheet])
    df[sheet] = pd.read_csv(csv_path)
这里有一小段python用于创建ExcelToCsv.vbs脚本:
# Write the ExcelToCsv.vbs helper used by the conversion loop above.
# The VBScript takes <source xls/xlsx> <destination csv> <worksheet number>
# and saves the selected worksheet as csv (Excel SaveAs format code 6).
vbscript="""if WScript.Arguments.Count < 3 Then
WScript.Echo "Please specify the source and the destination files. Usage: ExcelToCsv <xls/xlsx source file> <csv destination file> <worksheet number (starts at 1)>"
Wscript.Quit
End If
csv_format = 6
Set objFSO = CreateObject("Scripting.FileSystemObject")
src_file = objFSO.GetAbsolutePathName(Wscript.Arguments.Item(0))
dest_file = objFSO.GetAbsolutePathName(WScript.Arguments.Item(1))
worksheet_number = CInt(WScript.Arguments.Item(2))
Dim oExcel
Set oExcel = CreateObject("Excel.Application")
Dim oBook
Set oBook = oExcel.Workbooks.Open(src_file)
oBook.Worksheets(worksheet_number).Activate
oBook.SaveAs dest_file, csv_format
oBook.Close False
oExcel.Quit
"""

# Write the script as text with an explicit encoding. The original
# f.write(vbscript.encode('utf-8')) passed bytes to a text-mode file,
# which raises TypeError on Python 3; the context manager also guarantees
# the handle is closed.
with open('ExcelToCsv.vbs', 'w', encoding='utf-8') as f:
    f.write(vbscript)
此答案受益于Convert XLS to CSV on command line和csv & xlsx files import to pandas data frame: speed issue
答案 1 :(得分:5)
如果您的行数少于 65536(在每张表格中),您可以尝试 `xls`(而不是 `xlsx`)。根据我的经验,`xls` 比 `xlsx` 更快。很难与 `csv` 进行比较,因为这取决于工作表的数量。
虽然这不是一个理想的解决方案(`xls` 是一种陈旧的二进制私有格式),但我发现当你有很多工作表、包含经常更新数值的内部公式,或出于任何原因希望保留 Excel 的多表格功能时,这种方式是有用的。
答案 2 :(得分:2)
I know this is old but in case anyone else is looking for an answer that doesn't involve VB. Pandas read_csv()
is faster but you don't need a VB script to get a csv file.
Open your Excel file and save as *.csv (comma separated value) format.
Under tools you can select Web Options and under the Encoding tab you can change the encoding to whatever works for your data. I ended up using Windows, Western European because Windows UTF encoding is "special" but there's lots of ways to accomplish the same thing. Then use the encoding argument in pd.read_csv()
to specify your encoding.
Encoding options are listed here
答案 3 :(得分:1)
我写了一个小巧的脚本来从.xlsx文件中读取值。这不会更新值(如日期),仅适用于我使用的文件。可能仍然存在一些错误,因为我只是在没有仔细研究xlsx定义的情况下就写下了它:-)。但这比默认pd.read_excel快五到十倍。
# -*- coding: utf-8 -*-
"""
Created on Fri Jan 3 16:42:08 2020
@author: FelixKling
"""
import re
import zipfile
import pandas as pd
import datetime as dt
import numpy as np
import html
def is_number(s):
    """Return True if *s* can be parsed as a float, else False."""
    try:
        float(s)
    except ValueError:
        # Not numeric text (e.g. an Excel error string like "#DIV/0!").
        return False
    return True
def read(path, sheet_name=None, header=True, index_col=False, skiprows=[], skipcolumns=[]):
    """
    Reads an .xlsx or .xlsm file and returns a Pandas DataFrame. Is much faster than pandas.read_excel().

    Parameters
    ----------
    path : str
        The path to the .xlsx or .xlsm file.
    sheet_name : str, optional
        Name of the sheet to read. If none, the first (not the active!) sheet is read. The default is None.
    header : bool, optional
        Whether to use the first line as column headers. The default is True.
    index_col : bool, optional
        Whether to use the first column as index. The default is False.
    skiprows : list of int, optional.
        The row numbers to skip ([0, 1] skips the first two rows). The default is [].
    skipcolumns : list of int, optional.
        The column numbers to skip ([0, 1] skips the first two columns). The default is [].

    Raises
    ------
    TypeError
        If the file is no .xlsx or .xlsm file.
    FileNotFoundError
        If the sheet name is not found.

    Returns
    -------
    Pandas DataFrame
        The input file as DataFrame.
    """
    # NOTE(review): skiprows/skipcolumns use mutable list defaults; they are
    # only read (membership tests below), never mutated, so this is safe here.

    # check extension
    if "." not in path:
        raise TypeError("This is no .xlsx or .xlsm file!")
    if path.rsplit(".", 1)[1] not in ["xlsx", "xlsm"]:
        raise TypeError("This is no .xlsx or .xlsm file!")
    path = path.replace("\\","/")

    # An .xlsx/.xlsm file is a zip archive of XML parts; pull only the parts
    # this parser needs (worksheets, relationships, styles, workbook index,
    # shared strings) into memory as text.
    tempfiles = dict()
    with zipfile.ZipFile(path, 'r') as zipObj:
        for name in zipObj.namelist():
            if name.startswith("xl/worksheets/") or name in [
                "xl/_rels/workbook.xml.rels",
                "xl/styles.xml",
                "xl/workbook.xml",
                "xl/sharedStrings.xml",
            ]:
                try:
                    tempfiles[name] = zipObj.read(name).decode("utf-8")
                except UnicodeDecodeError:
                    # Fallback for parts not encoded as utf-8.
                    tempfiles[name] = zipObj.read(name).decode("utf-16")

    # read rels (paths to sheets): map relationship ids -> target file paths.
    text = tempfiles["xl/_rels/workbook.xml.rels"]
    rels = {}
    relids = re.findall(r'<Relationship Id="([^"]+)"', text)
    relpaths = re.findall(r'<Relationship .*?Target="([^"]+)"', text)
    rels = dict(zip(relids, relpaths))

    # read sheet names and relation ids, then resolve the requested sheet
    # name to its worksheet XML file via the relationship table.
    if sheet_name:
        text = tempfiles["xl/workbook.xml"]
        workbooks = {}
        workbookids = re.findall(r'<sheet.*? r:id="([^"]+)"', text)
        workbooknames = re.findall(r'<sheet.*? name="([^"]+)"', text)
        workbooks = dict(zip(workbooknames, workbookids))
        if sheet_name in workbooks:
            sheet = rels[workbooks[sheet_name]].rsplit("/", 1)[1]
        else:
            raise FileNotFoundError("Sheet " + str(sheet_name) + " not found in Excel file! Available sheets: " + "; ".join(workbooks.keys()))
    else:
        # No sheet requested: read the first sheet file (not the active one).
        sheet="sheet1.xml"

    # read strings, they are numbered: cells of type t="s" store an index
    # into this shared-string table instead of the text itself.
    string_items = []
    if "xl/sharedStrings.xml" in tempfiles:
        text = tempfiles["xl/sharedStrings.xml"]
        string_items = re.split(r"<si.*?><t.*?>", text.replace("<t/>", "<t></t>").replace("</t></si>","").replace("</sst>",""))[1:]
        # Unescape XML entities; empty shared strings become NaN.
        string_items = [html.unescape(str(i).split("</t>")[0]) if i != "" else np.nan for i in string_items]

    # read styles, they are numbered: each cell's s="..." attribute indexes
    # into the cellXfs list, whose numFmtId decides date/time/number rendering.
    text = tempfiles["xl/styles.xml"]
    styles = re.split(r"<[/]?cellXfs.*?>", text)[1]
    styles = styles.split('numFmtId="')[1:]
    styles = [int(s.split('"', 1)[0]) for s in styles]
    numfmts = text.split("<numFmt ")[1:]
    numfmts = [n.split("/>", 1)[0] for n in numfmts]
    for i, n in enumerate(numfmts):
        # Strip bracketed options and quoted literals from the format code
        # before looking for date/time letters, so e.g. "yyyy" in a quoted
        # literal does not misclassify the format.
        n = re.sub(r"\[[^\]]*\]", "", n)
        n = re.sub(r'"[^"]*"', "", n)
        if any([x in n for x in ["y", "d", "w", "q"]]):
            numfmts[i] = "date"
        elif any([x in n for x in ["h", "s", "A", "P"]]):
            numfmts[i] = "time"
        else:
            numfmts[i] = "number"

    def style_type(x):
        # Map a numFmtId to "date"/"time"/"number". Ids 14-22 and 45-47 are
        # treated as built-in date/time formats, custom formats start at 165
        # — NOTE(review): these ranges follow OOXML convention; confirm
        # against the spec for exotic workbooks.
        if 14 <= x <= 22:
            return "date"
        if 45 <= x <= 47:
            return "time"
        if x >= 165:
            return numfmts[x - 165]
        else:
            return "number"

    styles = list(map(style_type, styles))

    text = tempfiles["xl/worksheets/" + sheet]

    def code2nr(x):
        # Convert an Excel column code like "AT" to a zero-based column
        # number (base-26 with A=1).
        nr = 0
        d = 1
        for c in x[::-1]:
            nr += (ord(c)-64) * d
            d *= 26
        return nr - 1

    table = []
    max_row_len = 0

    # Split the sheet XML into rows; each row then contains <c>...</c> cells.
    rows = [r.replace("</row>", "") for r in re.split(r"<row .*?>", text)[1:]]
    for r in rows:
        # Example raw cell XML handled below:
        # <c r="AT2" s="1" t="n"><v></v></c><c r="AU2" s="115" t="inlineStr"><is><t>some text</t></is></c>
        # Normalize rich-text/inline-string wrappers so every cell value sits
        # between <v> and </v>.
        r = re.sub(r"</?r.*?>","", r)
        r = re.sub(r"<(is|si).*?><t.*?>", "<v>", r)
        r = re.sub(r"</t></(is|si)>", "</v>", r)
        r = re.sub(r"</t><t.*?>","", r)
        values = r.split("</v>")[:-1]
        add = []
        colnr = 0
        for v in values:
            value = re.split("<v.*?>", v)[1]
            v = v.rsplit("<c", 1)[1]
            # get column number of the field
            nr = v.split(' r="')[1].split('"')[0]
            nr = code2nr("".join([n for n in nr if n.isalpha()]))
            # Pad skipped (empty, unserialized) columns with NaN.
            if nr > colnr:
                for i in range(nr - colnr):
                    add.append(np.nan)
            colnr = nr + 1
            sty = "number"
            if ' s="' in v:
                sty = int(v.split(' s="', 1)[1].split('"', 1)[0])
                sty = styles[sty]
            # inline strings
            if 't="inlineStr"' in v:
                add.append(html.unescape(value) if value != "" else np.nan)
            # string from list (shared-string table index)
            elif 't="s"' in v:
                add.append(string_items[int(value)])
            # boolean
            elif 't="b"' in v:
                add.append(bool(int(value)))
            # date
            elif sty == "date":
                if len(value) == 0:
                    add.append(pd.NaT)
                # Texts like errors
                elif not is_number(value):
                    add.append(html.unescape(value))
                else:
                    # Excel serial day number -> datetime. The "- 2" offset
                    # presumably compensates for Excel's 1-based epoch and
                    # its phantom 1900-02-29 — TODO confirm for dates before
                    # March 1900.
                    add.append(dt.datetime(1900,1,1) + dt.timedelta(days=float(value) - 2))
            # time
            elif sty == "time":
                if len(value) == 0:
                    add.append(pd.NaT)
                # Texts like errors
                elif not is_number(value):
                    add.append(html.unescape(value))
                else:
                    add.append((dt.datetime(1900,1,1) + dt.timedelta(days=float(value) - 2)).time())
            # Null
            elif len(value) == 0:
                add.append(np.nan)
            # Texts like errors
            elif not is_number(value):
                add.append(html.unescape(value))
            # numbers
            else:
                add.append(round(float(value), 16))
        table.append(add)
        if len(add) > max_row_len:
            max_row_len = len(add)

    df = pd.DataFrame(table)

    # skip rows or columns by positional index, then optionally promote the
    # first column to the index and the first remaining row to headers.
    df = df.iloc[[i for i in range(len(df)) if i not in skiprows], [i for i in range(len(df.columns)) if i not in skipcolumns]]
    if index_col:
        df = df.set_index(df.columns[0])
    if header:
        df.columns = df.iloc[0].values
        df = df.iloc[1:]
    return df
答案 4 :(得分:0)
如果您愿意接受一次性的缓慢转换,就没有理由每次都打开 Excel:
用 `pd.read_excel()`
将数据读入数据框,再用 `pd.to_csv()` 立即将其转储为 csv,之后就不必再同时调用 Excel 和 Windows。就我而言,这个一次性的开销是值得的。
答案 5 :(得分:0)
以我的经验,Pandas read_excel()
可以很好地处理多张Excel文件。如Using Pandas to read multiple worksheets中的建议,如果将sheet_name
分配给None
,它将自动将每张工作表放入一个数据框中,并输出带有工作表名称键的数据框字典。
但是花费时间的真正原因在于代码中解析文本的环节。包含 5 个工作表的 14MB excel 并不算大。我有一个 20.1MB 的 excel 文件,包含 46 张工作表,每张有 6000 余行和 17 列,使用 read_excel
的过程如下:
# Start the wall-clock timer for the read benchmark below.
t0 = time.time()
def parse(datestr):
    """Convert a 'YYYY/MM/DD' string into a datetime.date."""
    year, month, day = (int(part) for part in datestr.split("/"))
    return dt.date(year, month, day)
# Read all 46 sheets in one call: sheet_name=None makes read_excel return a
# dict of DataFrames keyed by sheet name.
# NOTE(review): the `encoding` argument was removed from read_excel in
# pandas 1.2 — this call only works on older pandas versions.
data = pd.read_excel("DATA (1).xlsx", sheet_name=None, encoding="utf-8", skiprows=1, header=0, parse_dates=[1], date_parser=parse)
t1 = time.time()
print(t1 - t0)
## result: 37.54169297218323 seconds
在data
以上的代码中,包含46个数据帧的字典。
正如其他人所建议的那样,使用read_csv()
会有所帮助,因为读取.csv
文件更快。但是考虑到.xlsx
文件使用压缩这一事实,.csv
文件可能更大,因此读取速度更慢。但是,如果您想将文件转换为使用python逗号分隔(VBcode由Rich Signel提供),则可以使用:Convert xlsx to csv
答案 6 :(得分:0)
我使用 xlsx2csv 在内存中将 excel 文件虚拟转换为 csv,这有助于将读取时间减少一半左右。
from xlsx2csv import Xlsx2csv
from io import StringIO
import pandas as pd
def read_excel(path: str, sheet_name: str) -> pd.DataFrame:
    """Read one worksheet by converting it to csv in memory with xlsx2csv,
    then parsing the csv with pandas (roughly halves the read time)."""
    csv_buffer = StringIO()
    converter = Xlsx2csv(path, outputencoding="utf-8", sheet_name=sheet_name)
    converter.convert(csv_buffer)
    # Rewind the buffer so read_csv starts from the first byte.
    csv_buffer.seek(0)
    return pd.read_csv(csv_buffer)