Question

背景：我对 C++ 还不太熟练。

输入文件：1万行

FCC5G2YACXX:5:1101:1224:2059#NNNNNNNN   97  genome  96003934    24  118M4D11M   =   96004135    0   GCA....ACG  P\..GW^EO   AS:i:-28    XN:i:0  XM:i:2  XO:i:1  XG:i:4  NM:i:6  MD:Z:54G53T9^TACA11 YT:Z:UP

预期输出

96003934 98.31

解释输出

第4栏：96003934

第18栏：MD:Z:54G53T9^TACA11

匹配= 54 + 53 + 9 = 116

mismatch = count_letter（54G53T9）= 2

id = 116 * 100 /（116 + 2）= 98.30508474576272

awk脚本

awk '{
    # Cut column 18 (for example MD:Z:54G53T9^TACA11) on ":" and "^";
    # piece 3 is then the text between the second ":" and the first "^".
    split($18, md_parts, /[\^:]/)

    # Splitting that piece on every non-digit character leaves the numeric runs.
    run_count = split(md_parts[3], runs, /[^0-9]/)

    matched = 0
    for (k = 1; k <= run_count; k++)
        matched += runs[k]

    # run_count - 1 separators sit between the runs; each one is counted
    # as a mismatch in the percentage below.
    identity = matched * 100 / (matched + run_count - 1)
    printf("%s"OFS"%.2f\n", $4, identity)
}' file.sam

C++ 版本（我原以为它会更快）

#include <iostream>
#include <string>
#include <vector>
#include <sstream>
#include <algorithm>
#include <iterator>
#include <iomanip>

using namespace std;

// Reads whitespace-separated records from stdin, one per line. For every line
// that has at least 18 columns it prints column 4, a space, and the percentage
//   100 * sum / (sum + count - 1)
// with two decimals, where the numbers come from the part of column 18 that
// lies between its last ':' and its first '^' (e.g. "54G53T9" out of
// "MD:Z:54G53T9^TACA11"): sum is their total and count is how many there are.
int main(){
  string line;
  while(getline(cin, line)){
    istringstream iss(line);
    vector<string> columns;
    copy(istream_iterator<string>(iss),    // Split line on whitespace
         istream_iterator<string>(),
         back_inserter(columns));

    // Lines with fewer than 18 columns (blank lines, for instance) have no
    // columns[17]; indexing it would be undefined behavior, so skip them.
    if(columns.size() < 18) continue;

    // Work with size_type and test npos explicitly rather than squeezing the
    // search results into int and relying on wrap-around arithmetic.
    const string& md = columns[17];
    const string::size_type colon = md.find_last_of(':');
    const string::size_type start = (colon == string::npos) ? 0 : colon + 1;
    const string::size_type caret = md.find_first_of('^', start);
    const string::size_type length =
        (caret == string::npos) ? string::npos : caret - start;
    string smatch = md.substr(start, length);
    // smatch is now for example "54G53T9"

    replace( smatch.begin(), smatch.end(), 'A', ' ');
    replace( smatch.begin(), smatch.end(), 'C', ' ');
    replace( smatch.begin(), smatch.end(), 'G', ' ');
    replace( smatch.begin(), smatch.end(), 'T', ' ');
    // smatch is now for example "54 53 9"

    istringstream iss_sum(smatch);
    int n=0, sum=0, count=0;
    while(iss_sum >> n){
      sum += n;
      count++;
    }

    // '\n' instead of endl: endl would flush the stream after every record.
    cout << columns[3] << ' ' << fixed << setprecision(2)
         << static_cast<float>(sum)*100 / (sum+count-1) << '\n';
  }
}

基准

输入为 1M（100 万）行时……

awk：0m6.102s
C ++：0m15.814s

问题

我做错了什么，导致 C++ 程序运行得这么慢？……我能改进这个 C++ 程序吗？如果能，该怎么做？……我是否应该改用 C 来写？……

提前谢谢

Answer 1

C++ iostream 并没有真正提供一种好方法，既能确认输入中存在某一列，又能忽略该列的内容。C++ iostream 确实有一个 ignore 函数，但它并不太适合这种特定情况，所以可能帮不上忙。

在这种情况下，我至少会考虑改用 scanf，大致是下面这个思路：

#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <string>
#include <iostream>
#include <numeric>

// Reads 19-column whitespace-separated records from stdin with scanf and, for
// each one, prints column 4 followed by
//   100 * sum / (sum + count - 1)
// with two decimals, where sum and count describe the (at most five) numbers
// found in column 18 between its last ':' and its first '^'.
int main() {
    char column4[256];
    char column17[256];

    // The format skips columns 1-3, captures column 4, skips columns 5-17,
    // captures column 18 and skips column 19. Field widths of 255 keep both
    // captures inside their 256-byte buffers.
    while (2 == scanf("%*s %*s %*s %255s %*s %*s %*s %*s %*s %*s %*s %*s %*s %*s %*s %*s %*s %255s %*s", column4, column17)) {
        // The numbers start after the last ':'. If there is no ':', use the
        // whole field instead of doing pointer arithmetic on NULL.
        char *colon = strrchr(column17, ':');
        char *beg = colon ? colon + 1 : column17;

        // Truncate at the first '^' when there is one. strchr returns NULL for
        // a field without '^', and writing through that pointer would crash.
        char *end = strchr(beg, '^');
        if (end)
            *end = '\0';

        int nums[5];

        // Numbers separated by runs of capital letters; sscanf reports how
        // many of the five slots it filled, or EOF if the text is empty.
        int count = sscanf(beg, "%d%*[A-Z]%d%*[A-Z]%d%*[A-Z]%d%*[A-Z]%d", nums, nums + 1, nums + 2, nums + 3, nums + 4);

        // With nothing converted there is no valid range to accumulate
        // (EOF would even put the end iterator before the start).
        if (count < 1)
            continue;

        int sum = std::accumulate(nums, nums + count, 0);

        double result = (sum*100.0) / (sum + count-1);
        printf("%s %2.2f\n", column4, result);
    }
}

目前，这段代码假设（可能不正确，但我总得猜点什么）第 17 列（或者按我的数法是第 18 列，无论如何）中从开头到最后一个冒号（`:`）之间的内容可以忽略。之后是任意次数的重复：一个数字，然后是字母，再一个数字，再一个字母，依此类推（目前假定以数字开头并以数字结尾）。目前我最多允许 5 个数字，但要允许更多数字是轻而易举的。要支持模式上的更多变化，可能需要更多的工作（取决于可能出现的变化类型）。

为了提高速度，可以使用更大的输入缓冲区，如下所示：

setvbuf(stdin, NULL, _IOFBF, 65536);

你需要在读取任何内容之前执行此调用，因此应把它放在 while 循环之前。这样做能带来多少好处（如果有的话）因情况而异，但它实现起来非常容易，至少值得一试，看看是否有效果。

Answer 2

这是我改进后的代码，用时 8 秒：

#include <iostream>
#include <string>
#include <iomanip>
#include <stdio.h>

using namespace std;

// Streams whitespace-separated tokens from stdin, treating every 19 consecutive
// tokens as one record. For each record it prints column 4 followed by
//   100 * sum / (sum + count - 1)
// with two decimals, where sum and count describe the numbers found in
// column 18 between its last ':' and its first '^' (or the end of the token
// when it contains no '^').
int main(){
  const int kColumnsPerRecord = 19;
  const int kCol4Index = 3;   // 0-based position of column 4
  const int kMdIndex = 17;    // 0-based position of column 18 ("MD:Z:...")

  string record, col4;
  // Position of the current token inside its record. The previous counter was
  // read before ever being assigned, which is undefined behavior; wrapping it
  // here also keeps it from growing without bound on long inputs.
  int column = 0;
  while(cin >> record){
    if(column == kCol4Index) col4 = record;
    else if(column == kMdIndex){
      // Parse the token in memory. Pulling it apart with further reads from cin
      // and waiting for a '^' never terminates when the token has no '^' or the
      // stream ends, because a failed extraction leaves the variables untouched.
      int sum = 0;
      int count = 0;
      const string::size_type colon = record.rfind(':');
      string::size_type pos = (colon == string::npos) ? 0 : colon + 1;
      while(pos < record.size() && record[pos] != '^'){
        if(record[pos] >= '0' && record[pos] <= '9'){
          // Consume one complete run of digits as a single number.
          int n = 0;
          while(pos < record.size() && record[pos] >= '0' && record[pos] <= '9'){
            n = n * 10 + (record[pos] - '0');
            ++pos;
          }
          sum += n;
          count++;
        }
        else ++pos;   // separator between two numbers
      }
      // A token without any number yields nothing meaningful to report.
      if(count > 0)
        printf("%s %2.2f\n", col4.c_str(), static_cast<float>(sum)*100 / (sum+count-1));
    }
    column = (column + 1) % kColumnsPerRecord;
  }
}

Answer 3

我总是建议在尝试回答性能问题时使用分析器。

快速查看代码......使用C ++代码中的字符串处理可能会导致性能下降。

你正在进行大量的字符串复制......

getline()
iss()
copy()
substr()

然后你在smatch上做了四次替换。

为什么awk脚本比C ++程序更快？

3 个答案: