Question

假设我是一名基因组科学家，需要存储极长的字符串，其中每个字符代表两位信息（即每个元素是G、A、T或C）。因为字符串非常长，所以我需要能够以恰好2N位（或者更确切地说，N / 4字节）存储长度为N的字符串。

考虑到这一动机，我正在寻找std::bitset（或boost::dynamic_bitset<>）的一种泛化，使其适用于两位值而不是单个位。我想存储N个这样的两位值，每个值可以是0、1、2或3。我需要在内存中尽可能紧密地打包数据，因此vector<char>行不通（因为它会浪费4倍的内存）。

实现目标的最佳方法是什么？一种选择是使用自定义的operator[]，迭代器等包装现有的bitset模板，但是如果可能的话，我更愿意使用现有的库。

Answer 1

std::bitset<>是固定长度的，您可能不希望这样。

我认为你应该继续包裹std::vector<bool>。

请注意，std::vector<bool> 已针对空间进行了优化，但其优势在于它具有动态性。据推测，你需要从某个地方读取任意长度的基因组。

考虑是否需要大量API才能访问它;你可能只需要几种方法。

@Jefffrey 的答案已经给出了使用bitset<>时的相关代码。

[我不熟悉boost::dynamic_bitset<>及其对vector的影响。]

还有一个想法是，你是否可以方便地以四个字母为一组来处理——四个字母（共8位）恰好能紧凑地装进一个char。

/// Packed DNA sequence. Each letter (A, C, G or T) occupies two consecutive
/// bits of a std::vector<bool>: the low bit of the letter's numeric value
/// first, the high bit second.
class Genome
{
public:
    enum class Letter {A,C,G,T};

    /// Builds a genome from text, one letter per character.
    /// Characters are translated with textToLetter().
    Genome(const std::string& source)
    {
        code_.reserve(source.size() * 2);
        for (char text : source)
        {
            appendBits(textToLetter(text));
        }
    }

    /// Translates 'A', 'C', 'G' or 'T' into the matching Letter.
    /// Any other character is mapped to Letter::A; callers that need
    /// validation must check their input before calling.
    static Letter textToLetter(char text)
    {
        switch (text)
        {
        case 'C':
            return Letter::C;
        case 'G':
            return Letter::G;
        case 'T':
            return Letter::T;
        case 'A':
        default:
            // Invalid input falls back to 'A' (no error channel in this API).
            return Letter::A;
        }
    }

    /// Translates a Letter back into its text character.
    static char letterToText(Letter l)
    {
        return letterText[static_cast<unsigned>(l)];
    }

    /// Returns the letter stored at position `index`.
    /// Throws std::out_of_range (from std::vector::at) if index >= size().
    Letter get(unsigned index) const
    {
        const BitIndex low = bitOffset(index);
        const bool highBit = code_.at(low + 1);
        const bool lowBit = code_.at(low);
        return static_cast<Letter>((highBit ? 2u : 0u) | (lowBit ? 1u : 0u));
    }

    /// Overwrites the letter stored at position `index`.
    /// Throws std::out_of_range (from std::vector::at) if index >= size();
    /// nothing is modified in that case.
    void set(unsigned index, Letter value)
    {
        const BitIndex low = bitOffset(index);
        const unsigned bits = static_cast<unsigned>(value);
        // The high bit is written first: if it is in range so is the low bit,
        // so a failed range check can never leave a half-written letter.
        code_.at(low + 1) = (bits & 2u) != 0;
        code_.at(low) = (bits & 1u) != 0;
    }

    /// Number of letters currently stored.
    unsigned size() const
    {
        return static_cast<unsigned>(code_.size() / 2);
    }

    /// Extend by numLetters, initially set to 'A'
    void extend(unsigned numLetters)
    {
        code_.resize(code_.size() + static_cast<BitIndex>(numLetters) * 2);
    }

private:
    typedef std::vector<bool>::size_type BitIndex;

    /// Position of the low bit of letter `index`; computed in the vector's
    /// own size type so the doubling is not truncated to `unsigned`.
    static BitIndex bitOffset(unsigned index)
    {
        return static_cast<BitIndex>(index) * 2;
    }

    /// Appends one letter (low bit, then high bit) at the end of the storage.
    void appendBits(Letter value)
    {
        const unsigned bits = static_cast<unsigned>(value);
        code_.push_back((bits & 1u) != 0);
        code_.push_back((bits & 2u) != 0);
    }

    static const char letterText[4];
    std::vector<bool> code_;
};

// Text for each Letter, indexed by the Letter's numeric value.
const char Genome::letterText[4] = { 'A', 'C', 'G', 'T' };

int main()
{
    Genome g("GATT");
    g.extend(3);
    g.set(5, Genome::Letter::C);
    for (unsigned i = 0; i != g.size(); ++i)
        std::cout << Genome::letterToText(g.get(i));
    std::cout << std::endl;
    return 0;
}

Answer 2

你有两个选择。

假设：

// The four bases, numbered 0..3 in declaration order.
enum class nucleobase {
    a = 0,
    c = 1,
    g = 2,
    t = 3
};

你有两个选择。你可以：

使用单个std::bitset并使用索引编制
将std::bitset与其他容器结合使用

对于第一种方式，您只需定义几个函数，在每次set/get时操作正确数量的位：

// Stores base `x` in the two bits at positions 2*i and 2*i + 1 of `bits`.
// Pattern written as (bit 2*i, bit 2*i + 1):
//   a = (0,0)   c = (0,1)   g = (1,0)   t = (1,1)
// A value of `x` that is none of the four enumerators leaves `bits` untouched.
template<std::size_t N>
void set(std::bitset<N>& bits, std::size_t i, nucleobase x) {
    bool first = false;
    bool second = false;
    switch (x) {
        case nucleobase::a:                              break;
        case nucleobase::c:               second = true; break;
        case nucleobase::g: first = true;                break;
        case nucleobase::t: first = true; second = true; break;
        default: return;
    }
    bits.set(i * 2, first);
    bits.set(i * 2 + 1, second);
}

// Reads the base stored in the two bits at positions 2*i and 2*i + 1 of `bits`.
// Pattern read as (bit 2*i, bit 2*i + 1):
//   (0,0) = a   (0,1) = c   (1,0) = g   (1,1) = t
template<std::size_t N>
nucleobase get(const std::bitset<N>& bits, std::size_t i) {
    const bool first = bits[i * 2];
    const bool second = bits[i * 2 + 1];
    if (first) {
        return second ? nucleobase::t : nucleobase::g;
    }
    return second ? nucleobase::c : nucleobase::a;
}

Live demo

以上只是一个例子而且是一个可怕的例子（这里差不多凌晨4点，我真的需要睡觉）。

对于第二个，您只需要映射等位基因和位：

// Maps a base to its two-character bit pattern:
//   a -> "00"   c -> "10"   g -> "01"   t -> "11"
// NOTE(review): bit_pair is declared outside this snippet; presumably it is
// std::bitset<2>, whose string constructor treats the leftmost character as
// the highest bit - confirm against the declaration.
bit_pair bits_for(nucleobase x) {
    switch (x) {
        case nucleobase::a: return bit_pair("00");
        case nucleobase::c: return bit_pair("10");
        case nucleobase::g: return bit_pair("01");
        case nucleobase::t: return bit_pair("11");
    }
    // Not reached for the four enumerators. Present so that a value outside
    // the enumeration cannot make control flow off the end of the function.
    return bit_pair("00");
}

// Maps a two-bit pattern back to its base; the inverse of bits_for().
// bits_for() encodes c as "10" and g as "01".
// NOTE(review): this assumes bit_pair is std::bitset<2> (declared outside this
// snippet), where the leftmost character of the string is the highest bit, so
// "10" has the numeric value 2 and "01" the value 1 - confirm.
nucleobase nucleobase_for(bit_pair x) {
    switch (x.to_ulong()) {
        case 0: return nucleobase::a;
        case 1: return nucleobase::g; // "01"
        case 2: return nucleobase::c; // "10"
        case 3: return nucleobase::t;
        default: return nucleobase::a; // just for the warning
    }
}

Live demo

当然，如果您需要运行时长度，可以使用boost::dynamic_bitset和std::vector。

Answer 3

这是我用于固定长度k-mers的内容。

#include <cstdint>
#include <cstdlib>
#include <ostream>

// The four nucleotides, numbered 0..3 in declaration order.
enum class nucleotide {
    A = 0,
    C = 1,
    G = 2,
    T = 3
};

// Streams a nucleotide as its single-letter name ('A', 'C', 'G' or 'T').
// A value that is none of the four enumerators writes nothing.
inline std::ostream&
operator<<(std::ostream& pOut, nucleotide pNt)
{
    char symbol;
    switch (pNt) {
        case nucleotide::A: symbol = 'A'; break;
        case nucleotide::C: symbol = 'C'; break;
        case nucleotide::G: symbol = 'G'; break;
        case nucleotide::T: symbol = 'T'; break;
        default: return pOut;
    }
    pOut << symbol;
    return pOut;
}

class kmer_base;

class nucleotide_proxy {
public:
    operator nucleotide() const {
        return nucleotide((*mWord >> (mPosition * 2)) & 3);
    };

    nucleotide_proxy& operator=(nucleotide pNt) {
        uint64_t word = *mWord;
        word &= ~(uint64_t(3) << (mPosition*2));
        word |= uint64_t(pNt) << (mPosition*2);
        *mWord = word;

        return *this;
    };

private:
    friend class kmer_base;

    nucleotide_proxy(uint64_t* pWord, uint8_t pPosition)
        : mWord(pWord), mPosition(pPosition)
    {
    }

    uint64_t* mWord;
    uint8_t mPosition;
};


// Non-template base of kmer. It is the only class allowed to construct
// nucleotide_proxy objects, and maps a nucleotide index onto the word array:
// 32 nucleotides per 64-bit word, so index / 32 selects the word and
// index & 31 the slot inside it.
class kmer_base {
protected:
    // Writable handle to nucleotide pPosition of the array starting at pWord.
    nucleotide_proxy access(uint64_t* pWord, size_t pPosition)
    {
        return locate(pWord, pPosition);
    }

    // Read-only handle. Takes a pointer to const so it can be called with the
    // storage of a const object; the const-qualified return value is what
    // prevents assignment through the handle.
    const nucleotide_proxy access(const uint64_t* pWord, size_t pPosition) const
    {
        return locate(const_cast<uint64_t*>(pWord), pPosition);
    }

private:
    // Builds the proxy for nucleotide pPosition of the array starting at pWord.
    static nucleotide_proxy locate(uint64_t* pWord, size_t pPosition)
    {
        return nucleotide_proxy(pWord + (pPosition / 32),
                                static_cast<uint8_t>(pPosition & 31));
    }
};


// Fixed-length sequence of K nucleotides, packed 32 per 64-bit word.
// Elements are reached through operator[], which returns a nucleotide_proxy.
// Indices are not range-checked.
template<int K>
class kmer : public kmer_base
{
    static_assert(K > 0, "kmer length K must be positive");

    // Number of 64-bit words needed to hold K two-bit nucleotides.
    enum { Words = (K + 31) / 32 };
public:
    // Zero-fills the storage, so every position starts as nucleotide::A.
    // The proxy's assignment reads a word before updating it, so the words
    // must not be left uninitialized.
    kmer() : mWords() {}

    // Writable handle to the nucleotide at pOutdex.
    nucleotide_proxy operator[](size_t pOutdex) {
        return access(mWords, pOutdex);
    }

    // Read-only handle to the nucleotide at pOutdex.
    const nucleotide_proxy operator[](size_t pOutdex) const {
        return access(mWords, pOutdex);
    }

private:
    uint64_t mWords[Words];
};

将其扩展为动态长度的k-mer留作练习；一旦有了nucleotide_proxy，这很容易。高效地实现反向互补（reverse complement）运算同样留作练习。

对于两位值是否存在std :: bitset的推广？

3 个答案: