我需要在pandas中读取一个非常大的文件,并添加一个新列。由于文件对我的系统而言太大(7500万行,超过3GB),我决定分块读取它。我知道skiprows和skipfooter这两个参数,虽然skiprows似乎可以正常工作,但我在使用skipfooter时遇到了问题。例如,当我尝试:
data = pandas.read_table("path", skipfooter=75000000);
我的系统内存全部耗尽,看起来它正在读取整个文件,这正是我想要避免的。我做错了吗?
答案 0 :(得分:1)
根据我的研究,性能更差的原因有两个
原因1
更快的C引擎支持skiprows参数,而只有Python引擎支持skipfooter。
来自http://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html ..
skiprows:list-like或integer,默认为None。要跳过的行号(从0开始索引),或要在文件开头跳过的行数(int)。
skipfooter:int,默认为0。要跳过的文件底部的行数(engine='c'时不支持)。
在代码中,请参阅第781-786行。因为你指定了skipfooter,这几行代码会让解析器从C引擎回退到Python引擎,这可能导致性能不佳。
https://github.com/pandas-dev/pandas/blob/v0.19.2/pandas/io/parsers.py#L781
# C engine not supported yet
if engine == 'c':
if options['skipfooter'] > 0:
fallback_reason = "the 'c' engine does not support"\
" skipfooter"
engine = 'python'
原因2
此外,从代码来看,您怀疑skipfooter会先读取整个文件、然后再删除末尾的行,这一怀疑似乎也是正确的。见第2631-2632行。
https://github.com/pandas-dev/pandas/blob/v0.19.2/pandas/io/parsers.py#L2631
if self.skipfooter:
lines = lines[:-self.skipfooter]
答案 1 :(得分:1)
#include <inttypes.h>
#include <algorithm>
#include <chrono>
#include <cstddef>
#include <cstring>
#include <iostream>
#include <limits>
#include <random>
#include <vector>
/// @brief Counts how many Fletcher-style accumulation rounds run before the
///        next round would reach @p check.
///
/// Two running sums both begin at @p start. Each round adds @p add to the
/// first sum and then adds the updated first sum to the second. At least one
/// round is always performed. Counting stops as soon as performing one more
/// round would make either sum reach or exceed @p check.
///
/// @param start Initial value of both running sums.
/// @param add   Value added to the first sum in every round.
/// @param check Exclusive upper bound the sums are compared against.
/// @return Number of rounds that were performed.
uint64_t test_overflow(uint64_t start, uint64_t add, uint64_t check)
{
    uint64_t lowSum = start;
    uint64_t highSum = start;
    uint64_t rounds = 0;
    for (;;)
    {
        lowSum += add;
        highSum += lowSum;
        ++rounds;

        // Look one round ahead: what would the sums become next time?
        const uint64_t nextLow = lowSum + add;
        if (nextLow >= check)
        {
            break;
        }
        if (highSum + nextLow >= check)
        {
            break;
        }
    }
    return rounds;
}
/// @brief Straightforward Fletcher checksum that reduces both running sums
///        modulo std::numeric_limits<R>::max() after every input word.
///
/// The input is consumed as words of type R. Trailing bytes that do not fill
/// a whole word are copied into the leading bytes of a zero-initialised R and
/// processed as one final word. Words are loaded with std::memcpy, so @p data
/// does not have to be aligned for R and no uint8_t storage is accessed
/// through an R pointer.
///
/// @tparam T Checksum type; the result holds one sum in each half.
/// @tparam R Word type the input is read as.
/// @param data  Pointer to the first input byte.
/// @param count Number of input bytes.
/// @param start Previous checksum to continue from (0 for a fresh checksum);
///              its low half seeds the first sum, its high half the second.
/// @return Second sum shifted into the upper sizeof(R)*8 bits, OR-ed with the
///         first sum in the lower bits.
template <class T, class R>
T fletcherTA(const uint8_t * data, const T & count, const T & start)
{
    const T modulus = std::numeric_limits<R>::max();
    const std::size_t wordBits = sizeof(R) * 8;

    // split the input into full R-words and the bytes left over after them
    T fullWords = count / sizeof(R);
    const T tailBytes = count - fullWords * sizeof(R);

    // unpack the two running sums from the start value
    T sum1 = start & modulus;
    T sum2 = start >> wordBits;

    const uint8_t * cursor = data;
    for (; fullWords != 0; --fullWords)
    {
        // memcpy instead of dereferencing a casted pointer: well-defined for
        // any alignment of data
        R word;
        std::memcpy(&word, cursor, sizeof(R));
        cursor += sizeof(R);
        sum1 = (sum1 + word) % modulus;
        sum2 = (sum2 + sum1) % modulus;
    }
    if (tailBytes > 0)
    {
        // zero-pad the incomplete last word
        R word = 0;
        std::memcpy(&word, cursor, static_cast<std::size_t>(tailBytes));
        sum1 = (sum1 + word) % modulus;
        sum2 = (sum2 + sum1) % modulus;
    }
    // build final checksum
    return (sum2 << wordBits) | sum1;
}
/// @brief Batched Fletcher checksum: accumulates up to @p overflowAfter words
///        without reduction, then reduces both sums modulo
///        std::numeric_limits<R>::max().
///
/// The reduction is a true modulo. A single fold of the form
/// (sum & max) + (sum >> bits) can leave a value larger than R's maximum,
/// which then spills into the other half of the packed result (for example
/// Fletcher-16 of "abcdefgh" came out as 0x0527 instead of 0x0627). Because
/// the modulo runs only once per batch, the per-word loop stays free of
/// divisions.
///
/// The input is consumed as words of type R. Trailing bytes that do not fill
/// a whole word are copied into the leading bytes of a zero-initialised R and
/// processed as one final word. Words are loaded with std::memcpy, so @p data
/// does not have to be aligned for R.
///
/// @tparam T Checksum type; the result holds one sum in each half.
/// @tparam R Word type the input is read as.
/// @tparam overflowAfter Maximum number of words accumulated between two
///         reductions; must be at least 1 and small enough that the sums
///         cannot overflow T within one batch.
/// @param data  Pointer to the first input byte.
/// @param count Number of input bytes.
/// @param start Previous checksum to continue from (0 for a fresh checksum);
///              its low half seeds the first sum, its high half the second.
/// @return Second sum shifted into the upper sizeof(R)*8 bits, OR-ed with the
///         first sum in the lower bits.
template <class T, class R, T overflowAfter>
T fletcherTB(const uint8_t * data, const T & count, const T & start)
{
    // a batch length of 0 would make the do/while counter below wrap around
    static_assert(overflowAfter > 0, "overflowAfter must be at least 1");

    const T modulus = std::numeric_limits<R>::max();
    const std::size_t wordBits = sizeof(R) * 8;

    // split the input into full R-words and the bytes left over after them
    T fullWords = count / sizeof(R);
    const T tailBytes = count - fullWords * sizeof(R);

    // unpack the two running sums from the start value
    T sum1 = start & modulus;
    T sum2 = start >> wordBits;

    const uint8_t * cursor = data;
    while (fullWords != 0)
    {
        T batch = (fullWords >= overflowAfter) ? overflowAfter : fullWords;
        fullWords -= batch;
        do
        {
            // memcpy instead of dereferencing a casted pointer: well-defined
            // for any alignment of data
            R word;
            std::memcpy(&word, cursor, sizeof(R));
            cursor += sizeof(R);
            sum1 += word;
            sum2 += sum1;
        } while (--batch != 0);
        // full reduction once per batch keeps both sums below the modulus
        sum1 %= modulus;
        sum2 %= modulus;
    }
    if (tailBytes > 0)
    {
        // zero-pad the incomplete last word
        R word = 0;
        std::memcpy(&word, cursor, static_cast<std::size_t>(tailBytes));
        sum1 += word;
        sum2 += sum1;
        sum1 %= modulus;
        sum2 %= modulus;
    }
    // build final checksum
    return (sum2 << wordBits) | sum1;
}
/// @brief Runs the overflow probe, a comparison pass and a timing pass for
///        one checksum width and prints the results to std::cout.
///
/// The comparison pass feeds the same random blocks to fletcherTA and
/// fletcherTB and prints both resulting checksums. The timing pass then runs
/// each implementation on its own over the last generated block and prints
/// the elapsed milliseconds.
///
/// Intervals are measured with std::chrono::steady_clock, which is monotonic;
/// std::chrono::high_resolution_clock may be an alias of the adjustable
/// system clock and is therefore only used to seed the random generator.
///
/// @tparam T Checksum type.
/// @tparam R Word type the input is read as.
/// @tparam overflowAfter Batch length forwarded to fletcherTB.
template <class T, class R, T overflowAfter>
void test_implementations()
{
    using Clock = std::chrono::steady_clock;
    using Millis = std::chrono::milliseconds;

    // std::endl is kept on purpose: each status line is flushed before the
    // long-running work that follows it
    std::cout << "Testing " << sizeof(T)*8 << " bit implementations:" << std::endl;
    // test fletcher overflow
    std::cout << "Overflow after: " << test_overflow(0, std::numeric_limits<R>::max(), std::numeric_limits<T>::max() - std::numeric_limits<R>::max()) << " rounds (start value 0)." << std::endl;

    // test fletcher checksum in both implementations with the same data
    const uint64_t dataSize = static_cast<uint64_t>(1) * 1024 * 1024 * 1024; // 1 GB of test data
    const uint64_t maxBlockSize = static_cast<uint64_t>(10) * 1024 * 1024;  // 10 MB
    // the byte count is passed as a T, so a block must fit into T; rounded
    // down to a multiple of 4 because the buffer is made of uint32_t elements
    const uint64_t largestCountInT = static_cast<uint64_t>(std::numeric_limits<T>::max() - std::numeric_limits<T>::max() % 4);
    const uint64_t blockSize = std::min(std::min(dataSize, maxBlockSize), largestCountInT);
    // one byte short of the block so the trailing-bytes path is exercised
    const T oddBlockSize = static_cast<T>(blockSize - 1);
    const uint64_t nrOfBlocks = dataSize / blockSize;
    std::vector<uint32_t> data(blockSize / sizeof(uint32_t));
    const uint8_t * const bytes = reinterpret_cast<const uint8_t*>(data.data());

    // initialize random number generator using current time
    std::minstd_rand prng(static_cast<std::minstd_rand::result_type>(std::chrono::high_resolution_clock::now().time_since_epoch().count()));
    std::cout << "Testing checksums with " << std::dec << dataSize / (1024 * 1024) << " MB of data in " << blockSize / 1024 << " kB blocks..." << std::endl;
    T ca = 0;
    T cb = 0;
    for (uint64_t block = 0; block < nrOfBlocks; block++)
    {
        // generate random numbers
        std::generate(data.begin(), data.end(), [&prng](){ return prng(); });
        // apply both checksum functions to the same block, chaining the result
        ca = fletcherTA<T, R>(bytes, oddBlockSize, ca);
        cb = fletcherTB<T, R, overflowAfter>(bytes, oddBlockSize, cb);
    }
    std::cout << "Checksum A: 0x" << std::hex << ca << std::endl;
    std::cout << "Checksum B: 0x" << std::hex << cb << std::endl;

    // test speed
    const uint64_t runs = nrOfBlocks;
    std::cout << "Testing speed with " << std::dec << dataSize / (1024 * 1024) << " MB of data..." << std::endl;
    const auto startA = Clock::now();
    for (uint64_t run = 0; run < runs; run++)
    {
        ca = fletcherTA<T, R>(bytes, oddBlockSize, ca);
    }
    const auto endA = Clock::now();
    const auto startB = Clock::now();
    for (uint64_t run = 0; run < runs; run++)
    {
        cb = fletcherTB<T, R, overflowAfter>(bytes, oddBlockSize, cb);
    }
    const auto endB = Clock::now();
    // printing the checksums also keeps the timed loops from being optimized away
    std::cout << "Checksum A: 0x" << std::hex << ca << ", took " << std::dec << std::chrono::duration_cast<Millis>(endA - startA).count() << " ms" << std::endl;
    std::cout << "Checksum B: 0x" << std::hex << cb << ", took " << std::dec << std::chrono::duration_cast<Millis>(endB - startB).count() << " ms" << std::endl;
    std::cout << std::endl;
}
/// Program entry point: runs the checksum comparison and benchmark for the
/// 16-, 32- and 64-bit checksum widths, in that order.
int main()
{
    // number of words fletcherTB may accumulate between two reductions,
    // one constant per checksum width
    constexpr uint16_t batch16 = 20;
    constexpr uint32_t batch32 = 359;
    constexpr uint64_t batch64 = 683442530;

    test_implementations<uint16_t, uint8_t, batch16>();
    test_implementations<uint32_t, uint16_t, batch32>();
    test_implementations<uint64_t, uint32_t, batch64>();
    return 0;
}
pandas的read_table并不真正需要一个文件,它只需要一个带有read方法的对象。您可以编写自己的、带有read方法的分块对象,并将其交给read_table。