C++ 逐词读取文件

时间:2017-06-25 21:31:28

标签: c++ string parsing ifstream lexical-analysis

我想从文本文件中逐个单词地读取内容。这是我的 C++ 代码:

int main(int argc, const char * argv[]) {
    // insert code here...

    ifstream file("./wordCount.txt");
    string word;
    while(file >> word){
        cout<<word<<endl;
    }

    return 0;
}

文本文件包含以下句子:

I don't have power, but he has power.

这是我得到的结果:

I
don\241\257t
have
power,
but
he
has
power.

您能告诉我如何获得如下格式的结果:

I
don't
have
power
but
he
has
power

感谢。

4 个答案:

答案 0 :(得分:4)

据我理解,您是想去掉标点符号。

不幸的是,从流中提取字符串时只会把空白字符当作分隔符。因此 "don't" 或 "Hello,world" 会被当作一个单词,而 "don' t" 或 "Hello, world" 会被当作两个单词。

另一种方法是逐行阅读文本,并使用string::find_first_of()从分隔符跳转到分隔符:

// Characters that terminate a word: whitespace plus common punctuation.
// The apostrophe is deliberately absent so that "don't" stays one word.
const string separator{" \t\r\n,.!?;:"};
string line;
while (getline(cin, line)) {                         // read line by line
    // s = offset of the current word; npos once the line holds no more words
    for (size_t s = line.find_first_not_of(separator); s != string::npos; ) {
        const size_t e = line.find_first_of(separator, s);   // end of the word (npos = end of line)
        cout << line.substr(s, e - s) << endl;               // substr clamps the count at end of line
        // Skip the separators after the word. When e is npos the search
        // starts past the end and returns npos, which ends the loop.
        s = line.find_first_not_of(separator, e);
    }
}

Online demo

答案 1 :(得分:1)

下面的代码会去掉除撇号之外的所有标点符号:

#include <iostream>
#include <fstream>
#include <string>
#include <algorithm>

using namespace std;

int main(int argc, const char * argv[]) {
    ifstream file("wordCount.txt");
    string word;
    while(file >> word) {
        for (auto c : word)
            if (ispunct(c) && c != '`')
                word.erase(word.find_first_of(c));
        cout << word << endl;
    }

    return 0;
}

应该产生所需的输出:

Georgioss-MacBook-Pro:~ gsamaras$ g++ -Wall -std=c++0x main.cpp 
Georgioss-MacBook-Pro:~ gsamaras$ ./a.out 
I
don`t
have
power
but
he
has
power

至于某些字符显示异常的问题,我建议您检查文件的编码,可以试试下面的命令(参见原文链接中的说明):

file -I wordCount.txt 
wordCount.txt: text/plain; charset=us-ascii

这对我有用。或者只需打开文本编辑器并确保字符有效。

答案 2 :(得分:0)

为了简化调试,我用 std::istringstream 代替了文件。

  • 轻松添加其他测试输入
  • 这样输入就被记录在代码中,并且结果可以重现。

我还添加了一个bool(类数据属性)来简化其他诊断信息的启用/禁用。 (m_dbg)

#include <algorithm>
#include <chrono>
// Short aliases for the <chrono> types used in this file.
using HRClk_t = std::chrono::high_resolution_clock;  // high-resolution clock
using Time_t  = HRClk_t::time_point;                 // time point of that clock
using MS_t    = std::chrono::milliseconds;           // milliseconds
using US_t    = std::chrono::microseconds;           // microseconds
using NS_t    = std::chrono::nanoseconds;            // nanoseconds
using   namespace std::chrono_literals;          // support suffixes like 100ms, 2s, 30us

#include <array>
#include <cctype>
#include <iomanip>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>


// Splits text into words, one line at a time.
//
// ',' and '.' (and NUL bytes) are removed from each line before splitting;
// words are then separated by whitespace only, so an apostrophe stays part
// of its word ("don't"). Every word is printed on its own line and appended
// to an internal list that words() exposes.
class T496_t
{
   std::array<bool, 256>     m_keep;          // m_keep[b] is true when byte b survives filter()
   std::vector<std::string>  m_wordVec;       // every word extracted so far, in input order
   bool                      m_dbg = false;   // set to true for extra diagnostic output

public:

   T496_t()
   {
      m_keep.fill(true);
      m_keep[0] = false;                                  // NUL bytes are dropped
      m_keep[static_cast<unsigned char>(',')] = false;
      m_keep[static_cast<unsigned char>('.')] = false;
   }

   // Runs the extraction on the built-in sample text. Always returns 0.
   int exec()
   {
      std::istringstream file(
         "Hello\n"
         "I don't have power, but he has power.\n"
         "I don't  have power , but he has power.\n"
         ); //ifstream file("./wordCount.txt");

      return exec(file);
   }

   // Runs the extraction on every line of `in`. Always returns 0.
   //
   // A final line that is not terminated by '\n' is processed like any
   // other line instead of being discarded.
   int exec(std::istream& in)
   {
      unsigned lineCount = 1;
      std::string line;
      while (std::getline(in, line))
      {
         if (m_dbg) std::cout << "\n  line " << lineCount++ << " :  '"
                              << line << "'\n  " << std::setfill('-')
                              << std::setw(static_cast<int>(line.size())+12)
                              << "-" << std::setfill(' ');
         std::cout << '\n';      // one separator line per input line, empty ones included

         if (line.empty())
            continue;            // ignore empty lines

         extractWordsFrom(line);
      }
      return 0;
   }

   // All words extracted by the exec() calls made on this object so far.
   const std::vector<std::string>& words() const { return m_wordVec; }

private: // methods

   // std::isspace() has undefined behaviour for negative arguments, which a
   // plain char yields for non-ASCII bytes, so go through unsigned char.
   static bool isSpace(char c)
   {
      return std::isspace(static_cast<unsigned char>(c)) != 0;
   }

   // Filters `unfiltered`, then prints and records each whitespace-delimited
   // word. Runs of whitespace, including trailing whitespace, never produce
   // an empty word.
   void extractWordsFrom(std::string& unfiltered)
   {
      std::string line; // filtered
      filter(unfiltered, line);

      if (line.empty()) {
         if (m_dbg) std::cout << "  empty line" << std::endl;
         return;
      }

      const auto notSpace = [](char c) { return !isSpace(c); };

      auto wordBegin = std::find_if(line.cbegin(), line.cend(), notSpace);
      while (wordBegin != line.cend())
      {
         const auto wordEnd = std::find_if(wordBegin, line.cend(), isSpace);
         m_wordVec.emplace_back(wordBegin, wordEnd);

         if (m_dbg) {
            // Offsets refer to the filtered line; "eoln" marks a word that
            // runs to the end of the line.
            std::cout << "  word(" << std::setw(3) << (wordBegin - line.cbegin()) << ",";
            if (wordEnd == line.cend())
               std::cout << " eoln): ";
            else
               std::cout << std::setw(3) << (wordEnd - line.cbegin()) << "): ";
         }
         std::cout << "  " << m_wordVec.back() << std::endl;

         wordBegin = std::find_if(wordEnd, line.cend(), notSpace);
      }
   }

   // Left-trims `unfiltered` in place, then appends to `line` every byte of
   // it that m_keep marks as kept.
   void filter(std::string& unfiltered, std::string& line)
   {
      ltrim(unfiltered); // remove leading blanks

      for (const char c : unfiltered)
         if (m_keep[static_cast<unsigned char>(c)])  // index by byte value, never negative
            line.push_back(c);
   }


   // trim from start
   static void ltrim(std::string &s) {
      s.erase(s.begin(),
              std::find_if(s.begin(), s.end(),
                           [](char c) { return !isSpace(c); }));
   }
   // trim from end
   static void rtrim(std::string &s) {
      s.erase(std::find_if(s.rbegin(), s.rend(),
                           [](char c) { return !isSpace(c); }).base(), s.end());
   }
   // trim from both ends
   static void lrtrim(std::string &s) { rtrim(s); ltrim(s); }

}; // class T496_t


int main(int /*argc*/, char** /*argv[]*/)
{
  setlocale(LC_ALL, "");
  std::ios::sync_with_stdio(false);

  Time_t start_us = HRClk_t::now();

  int retVal = -1;
  {
     T496_t   t496;
     retVal = t496.exec();
  }

  auto  duration_us = std::chrono::duration_cast<US_t>(HRClk_t::now() - start_us);
  std::cout << "\n\n  FINI   " << duration_us.count() << " us" << std::endl;
  return(retVal);

}

   // desired output:
   // I
   // don't
   // have
   // power
   // but
   // he
   // has
   // power

此代码的输出:

  Hello

  I
  don't
  have
  power
  but
  he
  has
  power

  I
  don't
  have
  power
  but
  he
  has
  power

m_dbg = true 时的输出:

  line 1 :  'Hello'
  -----------------
  word(  0, eoln):   Hello

  line 2 :  'I don't have power, but he has power.'
  -------------------------------------------------
  word(  0,  1):   I
  word(  2,  7):   don't
  word(  8, 12):   have
  word( 13, 18):   power
  word( 19, 22):   but
  word( 23, 25):   he
  word( 26, 29):   has
  word( 30, eoln):   power

  line 3 :  'I don't  have power , but he has power.'
  ---------------------------------------------------
  word(  0,  1):   I
  word(  2,  7):   don't
  word(  9, 13):   have
  word( 14, 19):   power
  word( 21, 24):   but
  word( 25, 27):   he
  word( 28, 31):   has
  word( 32, eoln):   power


  FINI   215 us

答案 3 :(得分:0)

一种简单的方法是先对字符串进行过滤:把除撇号(即 ')之外的所有标点符号替换为空格,然后再做后续处理(即利用一些内置函数来拆分单词)。

#include <iostream>
#include <fstream>
#include <string>
#include <algorithm>
#include <sstream>
#include <iterator>

using namespace std;

// Predicate for std::replace_if: returns true when `c` is a punctuation
// character that should be replaced, i.e. any punctuation except the
// apostrophe. Letters, digits, whitespace and non-ASCII bytes return false
// in the default "C" locale.
bool isOk(char c)
{
    if ( c == '\'' )
        return false;

    // The <cctype> classifiers are undefined for negative arguments, which a
    // plain char yields for non-ASCII bytes, so convert to unsigned char.
    return std::ispunct(static_cast<unsigned char>(c)) != 0;
}

int main()
{
    ifstream file("data.txt");
    string word;

    while(file >> word){
        std::replace_if(word.begin(), word.end(), isOk, ' ');
        istringstream ss(word);
        copy(istream_iterator<string>(ss), istream_iterator<string>(), ostream_iterator<string>(cout, "\n"));
    }

    return 0;
}

输出

I
don't
have
power
but
he
has
power