Question

我有一大块数据（hexdump），其中包含数千个小图像，数据结构就像这样。

20 00 20 00 00 10 00 00 <data> 20 00 20 00 00 10 00 00 <data> ...

其中（20 00 20 00 00 10 00 00）是每个数据部分（图像）之间的分隔。

包含整个hexdump的文件myfile看起来像这样

3C 63 9E FF 38 5F 9E FF
31 59 91 FF 20 00 20 00
00 10 00 00 55 73 A2 FF
38 5D 9C FF 3A 5E 95 FF

我想要做的基本上是将它分开。我想取20 00 20 00 00 10 00 00分隔的部分，并将每个部分放在一个txt文件中，如1.txt，2.txt ... n.txt

我尝试逐行读取，但这样会有问题：分隔符 20 00 ... 有时会像上面的例子那样跨两行出现，因此逐行比较无法找到每一处分隔符。

// Reads the dump one text line at a time and compares each whole line with
// the delimiter text. A delimiter that is split across two lines never equals
// a single line, so this comparison cannot detect it.
while (getline(myfile,line,'\n')){
    if (line == "20 00 20 00 00 10 00 00")
        ...
}

Answer 1

我的建议是以二进制方式读取文件。如果文件足够小，可以一次性把它全部读入内存；否则我建议使用操作系统提供的功能将文件映射到内存（memory-mapped file），或者至少映射它的一个“窗口”。

然后很容易找到分隔记录的8字节序列。首先只需搜索0x20，无论何时找到它，您都会看到它是否是整个分隔符序列的开头。

当您找到分隔符序列时，您将获取前一个分隔符的保存位置，以及新找到的分隔符的位置，以及它们之间的数据是您想要的数据。将新找到的分隔符的位置保存为旧位置，然后继续搜索下一个分隔符。

Answer 2

绝对将文件保存为二进制文件并转储实际的十六进制字节，而不是文本格式。您将节省3倍的空间，并且读取文件的实现更容易编写。

话虽这么说，如果您的文件是二进制文件，这就是解决方案：

#include <cstdio>
#include <cstring>
#include <fstream>
#include <string>

using std::ifstream;
using std::ofstream;
using std::string;

// Replaces the name "<n>.dat" stored in `filename` with "<n+1>.dat".
//
// `filename` must point to a writable, NUL-terminated buffer that is large
// enough for the new name (16 bytes are enough for any int value).
// If the current contents do not start with a number, counting restarts and
// the buffer becomes "1.dat" instead of using an uninitialised value.
void incrementFilename(char* filename) {
  int iFile = 0;
  if (std::sscanf(filename, "%d.dat", &iFile) != 1) {
    iFile = 0;  // nothing parsed: fall back to a defined starting point
  }
  std::sprintf(filename, "%d.dat", iFile + 1);
}

// Splits the binary file "myfile.dat" into "1.dat", "2.dat", ... at every
// occurrence of the 8-byte delimiter 20 00 20 00 00 10 00 00.
// The delimiter itself is not copied to any output file.
//
// Returns 0 on success and 1 when the input file cannot be opened.
int main() {
  // memcmp is used for the comparison because the delimiter contains NUL
  // bytes; a string comparison would stop at the first of them.
  static const char delimiter[] = { 0x20, 0x00, 0x20, 0x00, 0x00, 0x10, 0x00, 0x00 };
  const unsigned delimiterSize = sizeof(delimiter);

  std::ifstream input("myfile.dat", std::ifstream::binary);
  if (!input.is_open()) {
    return 1;
  }

  char outputFilename[16] = "1.dat";
  std::ofstream output(outputFilename, std::ofstream::binary);

  // The most recent bytes that have been read but not yet written. They are
  // held back until it is known whether they belong to a delimiter.
  char window[sizeof(delimiter)];
  unsigned held = 0;

  char readbyte;
  while (input.get(readbyte)) {
    window[held++] = readbyte;
    if (held < delimiterSize) {
      continue;  // not enough bytes yet to decide
    }

    if (std::memcmp(window, delimiter, delimiterSize) == 0) {
      // Full delimiter: drop it and continue in the next output file.
      held = 0;
      incrementFilename(outputFilename);
      output.close();
      output.open(outputFilename, std::ofstream::binary);
    } else {
      // The oldest byte cannot start a delimiter: write it and slide the
      // window by one so that overlapping candidates are still examined.
      output.write(window, 1);
      std::memmove(window, window + 1, delimiterSize - 1);
      --held;
    }
  }

  // Whatever is still held back at end of input is ordinary data.
  output.write(window, held);

  return 0;
}

Answer 3

鉴于您要查找的数据序列可能被换行分开，您需要以最小的单位（“一口”）读取数据——即由两个字符组成的十六进制对——并忽略空白（空格或换行符）。

执行此操作后，您可以在将其写入子文件时跟踪已阅读的内容。获得“神奇序列”后，启动一个新的子文件。

您未涉及的两个复杂问题：

“魔术序列”是否可以作为普通数据的一部分存在于文件中？如果是这样，你将拆分一个单独的文件。
我假设你不想要每个子文件末尾的“魔术序列”。这会给你的比较增加一些复杂性：
- 如果您开始匹配，则需要暂停写入子文件。
- 如果你到达中途并突然停止匹配，那么在写出新的不匹配条目之前，你将不得不写出部分匹配。

这样做的一个好处是：如果某个子文件在主文件中是从一行的末尾附近开始的，那么输出时它会从新的一行开始，并且每16个双字符换行一次，而不是照搬它在主文件中的位置。或者，您是否希望子文件以真实字节输出，不带空白分隔符？

我要离开并编写这个程序：听起来很有趣！

好的，我写了下面的程序。希望其中的用法说明（Usage）已经讲清楚了它的功能。我本来并不想使用流（streams）——我觉得它们效率很低——但既然你是用流开头的……

//
// SubFile.cpp
//

#include <string>
#include <fstream>
#include <iostream>
#include <iomanip>

using namespace std;

// Number of hex pairs written on one line of a text-mode sub-file.
const unsigned MaxBytesPerLine = 16;

// Byte sequence that marks the start of a new sub-file.
const unsigned char magic[] = { '\x20','\x00','\x20','\x00','\x00','\x10','\x00','\x00' };

// One output sub-file.
//
// In text mode every byte is written as a two-digit upper-case hex pair,
// pairs are separated by a single space and a line break follows every
// MaxBytesPerLine pairs. In binary mode every byte is written raw.
class OutFile : private std::ofstream {
public: // Methods
    using std::ofstream::is_open; // Let others see whether I'm open
    OutFile(const std::string &fileName, bool bin);
    bool Write(unsigned b);
    ~OutFile();
private: // Variables
    unsigned num; // Number of pairs already written on the current line
    bool bin;     // Whether to output binary
}; // OutFile

// Opens `filename` for writing. When `bin` is true the file is opened in
// binary mode so that platforms which translate line endings in text mode
// cannot alter the raw bytes.
OutFile::OutFile(const std::string &filename, bool bin) :
         std::ofstream(filename.c_str(),
                       bin ? (std::ios_base::out | std::ios_base::binary)
                           : std::ios_base::out),
         num(0),
         bin(bin) {
    if (!bin) {
        setf(std::ios_base::uppercase);
    } // if
} // OutFile::OutFile(name, bin)

// Writes one byte value (0..0xFF) in the format selected at construction.
// Returns whether the stream is still in a good state afterwards.
bool OutFile::Write(unsigned b) {
    if (bin) {
        // Convert to a char first so that the low byte is written whatever
        // the byte order of `unsigned` is.
        const char c = static_cast<char>(b);
        return write(&c, 1).good();
    } // if

    std::ostream &out = *this;
    if (num > 0) {
        out << ' ';
    } // if
    out << std::hex << std::setw(2) << std::setfill('0') << b;
    if (++num == MaxBytesPerLine) {
        out << '\n';
        num = 0;
    } // if
    return good();
} // OutFile::Write(b)

// Terminates an unfinished text line; the base class then closes the file.
OutFile::~OutFile() {
    if (!bin && num != 0 && good()) {
        std::ostream &out = *this;
        out << '\n';
    } // if
} // OutFile::~OutFile

// Prints the command-line help, including the delimiter bytes taken from
// `magic`, to standard output. `argv0` is shown as the program name.
void Usage(char *argv0) {
    std::cout << "Usage:" << std::endl
              << "     " << argv0 << " <filename.txt> [bin]" << std::endl
              << "  Read <filename.txt> in hex char pairs, ignoring whitespace." << std::endl
              << "  Write pairs out to multiple sub-files, called \"1.txt\", \"2.txt\" etc." << std::endl
              << "  New files are started when the following sequence is detected: " << std::endl
              << " ";
    for (const unsigned char byte : magic) {
        // Cast to int so the value is printed as a number, not as a character.
        std::cout << ' ' << std::hex << std::setw(2) << std::setfill('0')
                  << static_cast<int>(byte);
    } // for
    std::cout << std::endl
              << "  If bin is specified: write out in binary, and files have a '.bin' extension" << std::endl;
} // Usage(argv0)

int main(int argc, char *argv[]) {
    if (argc < 2) {
        Usage(argv[0]);
        return 1;
    } // if
    ifstream inFile(argv[1]);
    if (!inFile.is_open()) {
        cerr << "Could not open '" << argv[1] << "'!" << endl;
        Usage(argv[0]);
        return 2;
    } // if

    bool bin = (argc >= 3) &&
               (argv[2][0] == 'b'); // Close enough!
    unsigned fileNum = 0; // Current output file number

    inFile >> setbase(16); // All inFile accesses will be like this
    while (inFile.good()) { // Let's get started!
        string outFileName = to_string(++fileNum) + (bin ? ".bin" : ".txt");
        OutFile outFile(outFileName, bin);
        if (!outFile.is_open()) {
            cerr << "Could not create " << outFileName << "!" << endl;
            return (int)(fileNum + 2);
        } // if

        unsigned b; // byte read in
        unsigned pos = 0; // Position in 'magic'
        while (inFile >> b) {
            if (b > 0xFF) {
                cerr << argv[1] << " contains illegal value: "
                     << hex << uppercase << showbase << b << endl;
                return -1;
            } // if
            if (b == magic[pos]) {            // Found some magic!
                if (++pos == sizeof(magic)) { // ALL the magic?
                    break;                    // Leave!
                } // if
                continue;                     // Otherwise go back for more
            } // if
            if (pos > 0) {                   // Uh oh. No more magic!
                for (unsigned i = 0; i < pos; ++i) {
                    outFile.Write(magic[i]); // So write out what we got
                } // for
                pos = 0;
            } // if
            outFile.Write(b);
        } // while
    } // for
    if (inFile.eof()) {
        return 0; // Success!
    } // if

    string s;
    inFile.clear();
    getline(inFile, s);
    cerr << argv[1] << " contains invalid data: " << s << endl;
    return -2;
} // main(argc,argv)

每当有人发布代码时，总会发布评论：
“你为什么不这样做？” “你为什么那样做？” 让闸门打开！

Answer 4

这是我的解决方案。它的效率有点低，等我期末考试结束后可能会重写。我假设数据字节之间以空格分隔。问题很简单 -> 这只是一个模式匹配问题。我本可以使用一些更复杂的技术来处理，但我们的模式长度固定而且非常短，即使是蛮力方法也只需要线性时间。

代码不言自明。我逐字节地读取文件并把它添加到缓冲区（这样效率不高；也可以只保留一个用索引标出边界的数据窗口 -> 这样在创建新文件时 I/O 操作会更高效）。一旦找到终止序列，就把它从缓冲区中弹出，并把缓冲区保存到文件中（我假设我们不想要空文件）。

// Writes `bytes` to the text file `filename` as two-digit upper-case hex
// values, `sequenceLength` values per line.
//
// Every value is followed by a space, except the last value of a full line,
// which is followed by a line break. Nothing is written (and no file is
// created) when `bytes` is empty. A non-positive `sequenceLength` is treated
// as 1 so that the modulo below is always well defined.
void save(const std::vector<short>& bytes, std::string filename, int sequenceLength)
{
    if (bytes.empty()) return; // Don't want empty files

    const int valuesPerLine = sequenceLength > 0 ? sequenceLength : 1;

    std::ofstream outputFile(filename);
    outputFile << std::uppercase << std::hex;
    outputFile.fill('0');

    int column = 0;
    for (const short byte : bytes)
    {
        // The width applies to the next insertion only, so set it each time;
        // together with the '0' fill it keeps values such as 0x00 and 0x0F
        // two characters wide.
        outputFile.width(2);
        outputFile << byte;

        column = (column + 1) % valuesPerLine;
        if (column) outputFile << ' ';
        else        outputFile << '\n';
    }
}

// Builds the name of output file `number`: the decimal number followed by
// the ".txt" extension.
std::string getFilename(int number)
{
    std::string name = std::to_string(number);
    name += ".txt";
    return name;
}

// Parses the hexadecimal number at the start of the C string `buffer`.
//
// Returns the parsed value, or 0 when `buffer` is null, empty or does not
// start with a hex digit, so the result is never an indeterminate value.
short getIntFromHex(const char* buffer)
{
    short result = 0;
    if (buffer == nullptr) return result;

    std::stringstream ss;
    ss << buffer;
    ss >> std::hex >> result;
    return result;
}

// Reports whether the last `sequenceLength` elements of `bytes` are equal to
// the first `sequenceLength` elements of `terminatingSequence`.
//
// Returns false when `bytes` holds fewer than `sequenceLength` elements or
// when `sequenceLength` is negative; a length of 0 always matches.
bool findTerminatingSequence(const std::vector<short>& bytes, short terminatingSequence[], int sequenceLength)
{
    typedef std::vector<short>::size_type Index;

    if (sequenceLength < 0) return false;
    const Index length = static_cast<Index>(sequenceLength);

    // Guard first: without it the start index below would wrap around.
    if (bytes.size() < length) return false;

    const Index start = bytes.size() - length;
    for (Index i = 0; i < length; ++i)
        if (terminatingSequence[i] != bytes[start + i])
            return false;
    return true;
}

// Removes the last `sequenceLength` elements from `bytes`.
// The caller must make sure that `bytes` holds at least that many elements.
void popSequence(std::vector<short>& bytes, int sequenceLength)
{
    if (sequenceLength <= 0) return; // nothing to remove

    bytes.erase(bytes.end() - sequenceLength, bytes.end());
}

// Splits the hex dump "input.txt" into numbered text files at every
// occurrence of the terminating sequence 20 00 20 00 00 10 00 00.
//
// Returns 0 on success and 1 when the input contains a token that is longer
// than one hex pair.
int main()
{
    std::vector<short> bytes;
    std::ifstream inputFile("input.txt");
    int outputFileIndex = 1;
    const int sequenceLength = 8;
    short terminatingSequence[] = { 0x20, 0x00, 0x20, 0x00, 0x00, 0x10, 0x00, 0x00 };

    // A std::string grows as needed, so an unexpectedly long token cannot
    // overrun a fixed-size buffer.
    std::string token;

    while (inputFile >> token)
    {
        if (token.size() > 2)
            return 1; // not a hex pair: the input is malformed

        bytes.push_back(getIntFromHex(token.c_str()));

        const bool longEnough =
            bytes.size() >= static_cast<std::vector<short>::size_type>(sequenceLength);
        if (!longEnough ||
            !findTerminatingSequence(bytes, terminatingSequence, sequenceLength))
            continue;

        // The buffer ends with the terminating sequence: drop the sequence
        // and store what precedes it as the next file.
        popSequence(bytes, sequenceLength);
        save(bytes, getFilename(outputFileIndex++), sequenceLength);
        bytes.clear();
    }

    // Data after the last terminating sequence forms the final file.
    save(bytes, getFilename(outputFileIndex), sequenceLength);

    return 0;
}

Answer 5

我会按照以下方式使用Perl：

#!/usr/bin/perl
use warnings;
use strict;

# Slurp entire file from stdin into variable $data.
# The input record separator is undefined locally so that a single read
# returns the whole input instead of only its first "line"; binmode keeps the
# bytes unmodified on platforms that translate line endings.
binmode(STDIN);
my $data = do { local $/; <> };
$data = '' unless defined $data;

# Find offsets of all occurrences of marker in file
my @matches;
my $marker='\x20\x00\x20\x00\x00\x10\x00\x00';
while ($data =~ /($marker)/g){
    # Save offset of this match - you may want to add length($marker) here to avoid including marker in output file
    push @matches, $-[0];
}

# Extract data between pairs of markers and write to file
for my $i (0 .. $#matches - 1){
   my $image=substr $data, $matches[$i], $matches[$i+1] - $matches[$i];
   my $filename=sprintf("file-%05d",$i);
   printf("Saving match at offset %d to file %s\n",$matches[$i],$filename);
   # Three-argument open with a lexical handle; stop with a message instead of
   # silently printing to an unopened handle.
   open(my $out, '>', $filename) or die "Cannot open $filename: $!";
   binmode($out);
   print {$out} $image;
   close($out) or die "Cannot close $filename: $!";
}

输出

Saving match at offset 12 to file file-00000
Saving match at offset 44 to file file-00001

像这样跑：

./perlscript < binaryData

我或多或少地使用这种技术从相机中恢复受损的闪存卡。您只需在整个闪存卡中搜索一些看起来像JPEG /原始文件开头的字节，然后获取以下10-12MB并将其另存为文件。

Answer 6

您的问题可以通过实现一个简单的有限状态机（finite state machine）来解决，因为需要匹配的条件并不长。程序读取以空格分隔的十六进制值，并逐个检查它们是否符合分隔符序列。如果完全匹配，就创建一个新文件并继续处理；如果不匹配，就把已经读到的值写入当前文件。下面是解决方案，读取部分还可以通过修改循环来优化。

（假设输入文件名为input.txt）

#include <fstream>
#include <sstream>

using namespace std;

// Writes `value` to `output` as a two-digit hex number followed by a space.
//
// Values below 0x10 are padded with a leading zero so that every chunk has
// the same width as in the input dump. The stream's format flags and fill
// character are restored before returning.
void writeChunk(std::ostream& output, int value) {
    const std::ios_base::fmtflags savedFlags = output.flags();
    const char savedFill = output.fill('0');

    output << std::hex;
    output.width(2); // applies to the next insertion only
    output << value << " ";

    output.flags(savedFlags);
    output.fill(savedFill);
}

// Reads the next whitespace-separated hex number from `input` into `value`.
//
// Returns true only when a number was actually read. On failure (end of
// input or a token that is not a hex number) `value` is left unchanged and
// nothing is written to `keep`. When `keep` is not NULL, every value that was
// read successfully is also appended to it with writeChunk.
bool readNext(std::fstream& input, int& value, std::stringstream* keep = NULL) {
    int parsed = 0;
    if (!(input >> std::hex >> parsed)) {
        return false;
    }

    value = parsed;
    if (keep != NULL)
        writeChunk(*keep, value);
    return true;
}

// Returns the name of output file number `count`, i.e. "<count>.txt".
std::string getFileName(int count) {
    return std::to_string(count) + ".txt";
}

int main() {
    int fileCount = 1;
    stringstream fileName;
    fstream inputFile, outputFile;

    inputFile.open("input.txt");
    outputFile.open(getFileName(fileCount), ios::out);

    int hexValue;
    while (readNext(inputFile, hexValue)) {
        // It won't understand eof until an unsuccessful read, so double checking 
        if (inputFile.eof())
            break;

        if (hexValue == 0x20) {
            stringstream ifFails;
            ifFails << "20 ";
            if (readNext(inputFile, hexValue, &ifFails) && hexValue == 0x00 &&
                    readNext(inputFile, hexValue, &ifFails) && hexValue == 0x20 &&
                    readNext(inputFile, hexValue, &ifFails) && hexValue == 0x00 &&
                    readNext(inputFile, hexValue, &ifFails) && hexValue == 0x00 &&
                    readNext(inputFile, hexValue, &ifFails) && hexValue == 0x10 &&
                    readNext(inputFile, hexValue, &ifFails) && hexValue == 0x00 &&
                    readNext(inputFile, hexValue, &ifFails) && hexValue == 0x00) {
                outputFile.close();
                outputFile.open(getFileName(++fileCount), ios::out);
                continue;
            }
            outputFile << ifFails.str();
        } else {
            writeChunk(outputFile, hexValue);
        }
    }

    return 1;
}

Answer 7

你也可以使用 tokenizer：首先把 "myfile" 读入一个字符串。这是必需的，因为在文件上只能得到前向迭代器，而正则表达式需要双向迭代器：

// Copy the contents of the file "myfile" into a string: the file's stream
// buffer is inserted into a temporary ostringstream, and the text collected
// by that stream is bound to `str`.
auto const& str(dynamic_cast<ostringstream&> (ostringstream().operator<<(ifstream("myfile").rdbuf())).str());

然后你需要一个用于分割的模式；在 extended 语法下，'.' 也能匹配换行符：

// Delimiter pattern in POSIX extended syntax: the eight hex pairs, each
// separated from the next by exactly one arbitrary character ('.'), with at
// most one more arbitrary character ('.?') on either side.
auto const& re(regex(".?20.00.20.00.00.10.00.00.?", regex_constants::extended));

最后迭代标记化的字符串并将其写入文件0.txt，依此类推。

// Walk over the parts of `str` that lie between matches of `re` (submatch
// index -1 selects the text that did not match) and write each part to its
// own file: 0.txt, 1.txt, ...
auto i(0u);
for_each(sregex_token_iterator(str.cbegin(), str.cend(), re, -1),
         sregex_token_iterator(),
         [&i] (string const& s) {ofstream(to_string(i++) + ".txt") << s; });

请注意，输出文件没有完全格式化，它们看起来像1.txt：

55 73 A2 FF
38 5D 9C FF 3A 5E 95 FF

这只是没有分隔符的内容。

在文本文件中分隔数据

7 个答案: