('�' 实际上只是经过转换的字符。令人惊讶的是,在某些行上,这些字符可以完美显示)


#include <algorithm>
#include <cassert>
#include <cctype>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <pthread.h>
#include <sstream>
#include <string>
#include <tuple>
#include <unordered_map>
#include <unordered_set>

using namespace std;

int lengthLowerBound = -1;
int requirePunct = 0;

unordered_map <string, string> specials;

// weird characters
#pragma warning( push)
#pragma warning( disable : 4101 )
/// Reports whether a single byte is acceptable.
///
/// The previous body compared `ch` against the Unicode replacement character
/// (U+FFFD) written as a character literal. In a UTF-8 source file that
/// literal is a multi-character constant spanning three bytes, so it can never
/// equal one `char` and the check always passed. A lone byte cannot identify
/// U+FFFD, so this overload now states that outcome explicitly; use the
/// `std::string` overload to detect the replacement character in a line.
///
/// @param ch the byte to inspect (unused).
/// @return always `true`.
bool isOK(char ch) {
  (void)ch;  // a single byte cannot be matched against a 3-byte sequence
  return true;
}
#pragma warning( pop )

/// Returns true when `line` contains no Unicode replacement character (U+FFFD).
///
/// The previous loop compared each byte against a multi-character literal,
/// which can never match a single `char`, so every line was accepted.
///
/// NOTE(review): this assumes the input is UTF-8, where U+FFFD is the byte
/// sequence EF BF BD. If the input is in a legacy 8-bit encoding and the
/// replacement glyph only shows up in the viewer, the file holds no U+FFFD at
/// all and a non-ASCII byte test is needed instead — confirm against the data.
///
/// @param line the text line to inspect.
/// @return false if the UTF-8 encoding of U+FFFD occurs anywhere in `line`.
bool isOK(const std::string &line) {
  // Spelled with escapes so the check does not depend on the source encoding.
  static const char kReplacementChar[] = "\xEF\xBF\xBD";
  return line.find(kReplacementChar) == std::string::npos;
}

// has punctuations
/// Returns true when `line` contains at least one punctuation character as
/// classified by `std::ispunct` in the current C locale.
///
/// @param line the text line to inspect.
/// @return true if any byte of `line` is punctuation, false otherwise
///         (including for an empty line).
bool hasPunctuations(const std::string &line) {
  // The lambda takes `unsigned char` because passing a negative `char`
  // (any non-ASCII byte on platforms where char is signed) to std::ispunct
  // is undefined behavior.
  return std::any_of(line.begin(), line.end(), [](unsigned char ch) {
    return std::ispunct(ch) != 0;
  });
}

/// Filters two line-aligned text files in lock step.
///
/// Reads one line from each input per iteration and writes the pair to the
/// two outputs only if both lines are non-empty, pass isOK(), satisfy the
/// optional minimum length, and (optionally) contain punctuation.
///
/// Arguments: input1 input2 output1 output2 [hasPunctuation | noShorterThan x]
///
/// @return 0 on success, 1 on a usage error or when a file cannot be opened.
int main(int argc, char *argv[]) {
  if (argc < 5) {
    std::cout << "Usage: ./Filter input1.txt input2.txt output1.txt output2.txt [hasPunctuation | noShorterThan x]" << std::endl;
    return 1;  // without this the code below would read argv[] out of range
  }

  const std::string inpFile1 = argv[1];
  const std::string inpFile2 = argv[2];
  const std::string outFile1 = argv[3];
  const std::string outFile2 = argv[4];

  for (int i = 5; i < argc; ++i) {
    if (std::strcmp(argv[i], "hasPunctuation") == 0) {
      requirePunct = 1;
    } else if (std::strcmp(argv[i], "noShorterThan") == 0) {
      // The option needs a value; never read past the end of argv.
      if (i + 1 >= argc) {
        std::cout << "  Option noShorterThan requires a value" << std::endl;
        return 1;
      }
      lengthLowerBound = std::atoi(argv[++i]);  // consume the value as well
    }
  }

  // filter
  std::ifstream finp1(inpFile1, std::ifstream::in);
  if (finp1.fail()) {
    std::cout << "  Can't open file " << inpFile1 << std::endl;
    return 1;
  }

  std::ifstream finp2(inpFile2, std::ifstream::in);
  if (finp2.fail()) {
    std::cout << "  Can't open file " << inpFile2 << std::endl;
    return 1;
  }

  std::ofstream fout1(outFile1, std::ofstream::out);
  if (fout1.fail()) {
    std::cout << "  Can't open file " << outFile1 << std::endl;
    return 1;
  }

  std::ofstream fout2(outFile2, std::ofstream::out);
  if (fout2.fail()) {
    std::cout << "  Can't open file " << outFile2 << std::endl;
    return 1;
  }

  std::string line1, line2;
  int numLines = 0;
  std::cout << "# Start tokenizing" << std::endl;
  // Stop as soon as either input runs out so the outputs stay line-aligned.
  while (std::getline(finp1, line1) && std::getline(finp2, line2)) {
    // Count every pair read; the counter was previously never incremented,
    // so the progress message always reported "0k".
    ++numLines;
    if (numLines % 1000 == 0) {
      std::cout << "\r  Read " << numLines / 1000 << "k lines." << std::flush;
    }

    if (line1.empty() || line2.empty()) continue;

    if (!isOK(line1) || !isOK(line2)) continue;

    if (lengthLowerBound > 0) {
      // Safe cast: the bound is known to be positive here.
      const std::string::size_type minLength =
          static_cast<std::string::size_type>(lengthLowerBound);
      if (line1.length() < minLength || line2.length() < minLength) continue;
    }

    if (requirePunct) {
      if (!hasPunctuations(line1) || !hasPunctuations(line2)) continue;
    }

    // '\n' instead of endl: no need to flush the files after every line.
    fout1 << line1 << '\n';
    fout2 << line2 << '\n';
  }
  std::cout << std::endl;
  std::cout << "# Done" << std::endl;

  return 0;
}

如果您只是复制上面的代码并按照代码本身的指导用适当的命令运行它,在给定的句子上,您将看到代码没有做任何事情。我怀疑这是因为比较ch == '�'总是假的。实际上有一个警告。

所以,我的问题是如何做我想做的事情?它不需要在C ++中。 Python,Perl或sed命令,非常感谢。感谢。

TL;DR:我想处理一个文本文件,去除上面引用文本中所有的“�”字符。

It looks like less does not understand the encoding of your German file: the '�' characters are probably being displayed in place of the actual characters for that reason. The easiest way to filter out these lines is probably to check for non-ASCII characters by seeing if any bytes have the highest bit set:

/// Returns true when every byte of `line` is 7-bit ASCII.
///
/// Any byte with the highest bit set (0x80) makes the line fail the check.
///
/// @param line the text line to inspect.
/// @return true if no byte has its highest bit set (also for an empty line).
bool isOk(const std::string& line) {
    // `unsigned char` keeps the mask test well-defined for bytes >= 0x80.
    return std::none_of(std::begin(line), std::end(line),
                        [](unsigned char c) { return (c & 0x80) != 0; });
}

This works for UTF-8 encoded text because any non-ASCII code point is encoded as multiple bytes which each have the highest bit set. It also works for more limited 8-bit codes like ISO-8859-1 because the non-ASCII characters are represented by bytes outside of the ASCII range, which must also have the highest bit set.

/// Returns true when every character of `str` is alphanumeric, punctuation,
/// or whitespace according to the current C locale's <cctype> classification.
///
/// The parameter is taken by value to keep the original signature.
///
/// @param str the text to inspect.
/// @return false as soon as a character outside those three classes is found;
///         true otherwise (including for an empty string).
bool isOk(std::string str) {
    for (std::string::size_type i = 0; i < str.length(); i++) {
        // Convert to unsigned char first: passing a negative char to the
        // <cctype> classifiers is undefined behavior. No tolower() is needed,
        // since case does not affect the alphanumeric test.
        const unsigned char c = static_cast<unsigned char>(str[i]);
        if (!std::isalnum(c) && !std::ispunct(c) && !std::isspace(c)) // not alphanumeric, punctuation, nor white space
            return false;
    }
    return true;
}
编辑:也许并不真的需要 'tolower'。

'�' 是 Unicode 的“替换字符”(replacement character,U+FFFD);Unicode 输出设备在遇到无效编码时,通常会显示该替换字符。



  • 将您的语料库转换为UTF-8或

  • 告诉您的终端期望语料库中使用的任何编码。


对于第一种可能性,请查看 iconv 实用程序(还有一个同名的 POSIX 标准库函数)。对于第二种可能性,您需要更改您的语言环境(locale),可能还需要更改终端模拟器的配置。有关一些思路,请参阅姊妹 Stack Exchange 网站上的这个问题。