如何在pandas中读取非常大的制表符分隔文件

时间:2017-01-06 18:40:22

标签: python pandas

我需要在pandas中读取一个非常大的文件,并添加一个新列。由于文件对我的系统而言太大(7500万行,超过3GB),我决定分块读取它。我知道skiprows和skipfooter这两个参数,虽然skiprows似乎有效,但我在使用skipfooter时遇到了问题。例如,当我尝试:

data = pandas.read_table("path", skipfooter=75000000);

我的系统内存全部耗尽,看起来它正在读取整个文件,这正是我想要避免的。我做错了吗?

2 个答案:

答案 0 :(得分:1)

根据我的研究,性能较差的原因有两个:

原因1

更快的C引擎支持skiprows参数,而只有Python引擎支持skipfooter。

来自http://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html ..

skiprows:list-like或integer,默认为None。要跳过的行号(从0开始索引),或者要在文件开头跳过的行数(int)。

skipfooter:int,默认为0。要跳过的文件底部的行数(engine='c'时不支持)。

在代码中,请参阅第781-786行。因为你指定了skipfooter,这些行会使解析器回退到Python引擎,这可能导致性能不佳。

https://github.com/pandas-dev/pandas/blob/v0.19.2/pandas/io/parsers.py#L781

    # C engine not supported yet
    if engine == 'c':
        if options['skipfooter'] > 0:
            fallback_reason = "the 'c' engine does not support"\
                              " skipfooter"
            engine = 'python'

原因2

此外,通过查看代码,您怀疑skipfooter会先读入整个文件、然后再删除底部的行,这一怀疑似乎也是正确的。见第2631-2632行。

https://github.com/pandas-dev/pandas/blob/v0.19.2/pandas/io/parsers.py#L2631

    if self.skipfooter:
        lines = lines[:-self.skipfooter]

答案 1 :(得分:1)

#include <iostream> #include <inttypes.h> #include <random> #include <algorithm> #include <chrono> #include <limits> uint64_t test_overflow(uint64_t start, uint64_t add, uint64_t check) { uint64_t count = 0; uint64_t sum1 = start; uint64_t sum2 = start; do { sum2 += sum1 += add; count++; } while (sum1 + add < check && sum2 + (sum1 + add) < check); return count; } template <class T, class R> T fletcherTA(const uint8_t * data, const T & count, const T & start) { // calculate how many full R-words the input has T rwords = count / sizeof(R); // calculate how many extra bytes, that do not fit into an R-word, the input has T remainingBytes = count - rwords * sizeof(R); // now calculate the flechter-T checksum from R-words T sum1 = start & std::numeric_limits<R>::max(); T sum2 = start >> (sizeof(R)*8); const R * dataR = reinterpret_cast<const R*>(data); while (rwords) { rwords--; sum1 = (sum1 + *dataR++) % std::numeric_limits<R>::max(); sum2 = (sum2 + sum1) % std::numeric_limits<R>::max(); } if (remainingBytes > 0) { // copy the excess bytes to our dummy variable. you could use memcpy here... 
R dummy = 0; const uint8_t * data8 = reinterpret_cast<const uint8_t*>(dataR); for (uint64_t index = 0; index < remainingBytes; ++index) { reinterpret_cast<uint8_t*>(&dummy)[index] = data8[index]; } // now add the dummy on top sum1 = (sum1 + dummy) % std::numeric_limits<R>::max(); sum2 = (sum2 + sum1) % std::numeric_limits<R>::max(); } // build final checksum return (sum2 << sizeof(R)*8) | sum1; } template <class T, class R, T overflowAfter> T fletcherTB(const uint8_t * data, const T & count, const T & start) { // calculate how many full R-words the input has T rwords = count / sizeof(R); // calculate how many extra bytes, that do not fit into an R-word, the input has T remainingBytes = count - rwords * sizeof(R); // now calculate the flechter-T checksum from R-words T sum1 = start & std::numeric_limits<R>::max(); T sum2 = start >> (sizeof(R)*8); const R * dataR = reinterpret_cast<const R*>(data); while (rwords) { T tlen = ((rwords >= overflowAfter) ? overflowAfter : rwords); rwords -= tlen; do { sum2 += sum1 += *dataR++; tlen--; } while (tlen); sum1 = (sum1 & std::numeric_limits<R>::max()) + (sum1 >> (sizeof(R)*8)); sum2 = (sum2 & std::numeric_limits<R>::max()) + (sum2 >> (sizeof(R)*8)); } if (remainingBytes > 0) { // copy the excess bytes to our dummy variable. you could use memcpy here... 
R dummy = 0; const uint8_t * data8 = reinterpret_cast<const uint8_t*>(dataR); for (uint64_t index = 0; index < remainingBytes; ++index) { reinterpret_cast<uint8_t*>(&dummy)[index] = data8[index]; } // now add the dummy on top sum2 += sum1 += dummy; sum1 = (sum1 & std::numeric_limits<R>::max()) + (sum1 >> (sizeof(R)*8)); sum2 = (sum2 & std::numeric_limits<R>::max()) + (sum2 >> (sizeof(R)*8)); } // build final checksum return (sum2 << (sizeof(R)*8)) | sum1; } template <class T, class R, T overflowAfter> void test_implementations() { std::cout << "Testing " << sizeof(T)*8 << " bit implementations:" << std::endl; // test flechter overflow std::cout << "Overflow after: " << test_overflow(0, std::numeric_limits<R>::max(), std::numeric_limits<T>::max() - std::numeric_limits<R>::max()) << " rounds (start value 0)." << std::endl; // test fletcher checksum in both implementations with the same data const uint64_t dataSize = 1 * 1024 * 1024 * 1024; // 1 * 1024 * 1 MB = 1 GB of test data const uint64_t blockSize = std::min(std::min(dataSize, (uint64_t)10 * 1024 * 1024), (uint64_t)(std::numeric_limits<T>::max() - std::numeric_limits<T>::max() % 4)); const T oddBlockSize = static_cast<T>(blockSize - 1); const uint64_t nrOfBlocks = dataSize / blockSize; std::vector<uint32_t> data(blockSize / sizeof(uint32_t)); // initialize random number generator using current time std::minstd_rand prng(std::chrono::high_resolution_clock::now().time_since_epoch().count()); std::cout << "Testing checksums with " << std::dec << dataSize / (1024 * 1024) << " MB of data in " << blockSize / 1024 << " kB blocks..." << std::endl; T ca = 0; T cb = 0; for (uint64_t block = 0; block < nrOfBlocks; block++) { // generate random numbers std::generate(data.begin(), data.end(), [&prng](){ return prng(); }); // apply checksum function. 
make sure to use an odd value to test remaining bytes being captured ca = fletcherTA<T, R>(reinterpret_cast<const uint8_t*>(data.data()), oddBlockSize, ca); cb = fletcherTB<T, R, overflowAfter>(reinterpret_cast<const uint8_t*>(data.data()), oddBlockSize, cb); } std::cout << "Checksum A: 0x" << std::hex << ca << std::endl; std::cout << "Checksum B: 0x" << std::hex << cb << std::endl; // test speed const uint64_t runs = nrOfBlocks; std::cout << "Testing speed with " << std::dec << dataSize / (1024 * 1024) << " MB of data..." << std::endl; auto startA = std::chrono::high_resolution_clock::now(); for (uint64_t run = 0; run < runs; run++) { ca = fletcherTA<T, R>(reinterpret_cast<const uint8_t*>(data.data()), oddBlockSize, ca); } auto endA = std::chrono::high_resolution_clock::now(); auto startB = std::chrono::high_resolution_clock::now(); for (uint64_t run = 0; run < runs; run++) { cb = fletcherTB<T, R, overflowAfter>(reinterpret_cast<const uint8_t*>(data.data()), oddBlockSize, cb); } auto endB = std::chrono::high_resolution_clock::now(); std::cout << "Checksum A: 0x" << std::hex << ca << ", took " << std::dec << std::chrono::duration_cast<std::chrono::milliseconds>(endA-startA).count() << " ms" << std::endl; std::cout << "Checksum B: 0x" << std::hex << cb << ", took " << std::dec << std::chrono::duration_cast<std::chrono::milliseconds>(endB-startB).count() << " ms" << std::endl; std::cout << std::endl; } int main() { test_implementations<uint16_t, uint8_t, 20>(); test_implementations<uint32_t, uint16_t, 359>(); test_implementations<uint64_t, uint32_t, 683442530>(); return 0; } read_table并不真正需要一个文件,它只需要一个带有read方法的对象。您可以编写自己的带有read方法的分块对象,并将其交给read_table。

pandas