Question

这可能是一个简单的问题，但我一直没能弄清楚。我有一个包含大量文本的文件，其中的文本按照固定的格式书写。格式如下：
<![LOG["The text to display in the log."]LOG]!><time="12:48:39.0+120" date="9-14-2015" component="mycomponent" context="" type="0" thread="0" file="myfile.cpp">

从这一行我想得到的结果：
- The text to display in the log.
- 12:48:39.0+120
- 9-14-2015
- mycomponent
- "" (empty)
- 0
- 0
- myfile.cpp

所需的任何一个值都可能为空。有没有简单的方法可以把它们提取出来？

提前致谢！

Answer 1

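Search for the " characters in pairs: find the opening ", then the matching closing ", take the text between them, and repeat until no more quotes are found.
For example: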

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Extracts every double-quoted value from a sample log line and prints each
 * one on its own line. An empty value ("") is printed as a literal "".
 *
 * Returns 0 on success, EXIT_FAILURE if a quote is left unterminated or an
 * allocation fails.
 */
int main(void){
    const char *text = "<![LOG[\"The text to display in the log.\"]LOG]!><time=\"12:48:39.0+120\" date=\"9-14-2015\" component=\"mycomponent\" context=\"\" type=\"0\" thread=\"0\" file=\"myfile.cpp\">";

    const char *start = text;
    /* Each iteration consumes one opening/closing quote pair. */
    while((start = strchr(start, '"')) != NULL){
        ++start; /* step past the opening quote */

        const char *end = strchr(start, '"');
        if(end == NULL){
            /* Opening quote without a closing one: the input is malformed. */
            fputs("bad format: unterminated quote\n", stderr);
            return EXIT_FAILURE;
        }

        size_t len = (size_t)(end - start);
        char *pickup = malloc(len + 1);
        if(pickup == NULL){
            perror("malloc");
            return EXIT_FAILURE;
        }
        memcpy(pickup, start, len);
        pickup[len] = '\0';

        if(len)
            puts(pickup);
        else
            puts("\"\"");/* empty value */

        free(pickup);
        start = end + 1; /* resume the search after the closing quote */
    }

    return 0;
}

Answer 2

这很容易......

编辑：

#include <stdio.h>
#include <string.h>


/*
 * Reads "myfile.cpp" character by character and prints the text found between
 * each pair of double quotes, one value per line. A pair of quotes with
 * nothing between them is reported as: "" (empty)
 *
 * Returns 0 on success, 1 if the input file cannot be opened.
 */
int main(void)
{
    /* Standard fopen is used so the program builds on any C implementation. */
    FILE *input = fopen("myfile.cpp", "r");
    if (input == NULL)
    {
        perror("myfile.cpp");
        return 1;
    }

    /* getc returns an int so that EOF can be told apart from every valid
     * character value; storing it in a char makes the EOF test unreliable. */
    int c;

    int inValue = 0; /* non-zero while between an opening and closing quote */
    int empty = 1;   /* non-zero until a character of the current value is printed */

    while ((c = getc(input)) != EOF)
    {
        if (c == '"')
        {
            if (inValue)
            {
                /* Closing quote: terminate the value that was just printed. */
                if (empty)
                    printf("\"\" (empty)\n");
                else
                    printf("\n");

                inValue = 0;
                empty = 1;
            }
            else
            {
                /* Opening quote: start echoing from the next character on. */
                inValue = 1;
            }
            continue; /* the quote characters themselves are never printed */
        }

        if (inValue)
        {
            putchar(c);
            empty = 0;
        }
    }

    fclose(input);

    return 0;
}

Answer 3

假设您有POSIX正则表达式库，这里有一种方法：

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

#include <regex.h>

/**
 * Matches a sample log line against a POSIX extended regular expression and
 * prints the whole match plus each of the parenthesized subexpressions.
 *
 * Exits with EXIT_FAILURE if the pattern cannot be compiled; otherwise
 * returns 0, whether or not the text matched.
 */
int main( int argc, char **argv )
{
  /* Command-line arguments are not used. */
  (void) argc;
  (void) argv;

  const char *text = "<![LOG[\"The text to display in the log.\"]LOG]!>"
                     "<time=\"12:48:39.0+120\" date=\"9-14-2015\" "
                     "component=\"mycomponent\" context=\"\" type=\"0\" "
                     "thread=\"0\" file=\"myfile.cpp\">";

  /**
   * [ and ] are special characters in regular expressions and must be
   * escaped, but we must also escape the \ and " characters in string
   * literals.
   *
   * The *actual* regular expression would be
   *
   *    <!\[LOG\["(.*)"\]LOG\]!><time="(.*)" *date="(.*)" etc.
   *
   */
  const char *ptn="<!\\[LOG\\[\"(.*)\"\\]LOG\\]!>"
                  "<time=\"(.*)\" *date=\"(.*)\" *component=\"(.*)\" *"
                  "context=\"(.*)\" *type=\"(.*)\" *thread=\"(.*)\" *"
                  "file=\"(.*)\">";

  /**
   * "Compile" the regular expression; REG_EXTENDED means we're using
   * extended regular expression syntax (so we don't need to escape
   * opening and closing parens, among other things).  By default,
   * we try to match subexpressions specified in parentheses.
   */
  regex_t regex;
  if ( regcomp( &regex, ptn, REG_EXTENDED ) != 0 )
  {
    fprintf( stderr, "regcomp failed on %s\n", ptn );
    /* A pattern that does not compile is an error, so report failure. */
    exit( EXIT_FAILURE );
  }

  fprintf( stdout, "number of subexpressions: %zu\n", regex.re_nsub );
  /* One slot for the overall match plus one per subexpression. */
  size_t matchCount=regex.re_nsub + 1;
  regmatch_t pmatch[matchCount];

  int ret = 0;
  /**
   * Try to match the input string.  matchCount specifies the *maximum*
   * number of expressions we expect to match (main string plus any
   * subexpressions).  pmatch will contain the metadata for each expression
   * match (start and end indices in the overall string, basically).
   */
  if ( (ret = regexec( &regex, text, matchCount, pmatch, 0 )) != 0 )
  {
    /**
     * regexec call failed.  There are a number of error codes that can be
     * returned (text didn't match, pattern is invalid, etc.).  Check
     * the documentation for regexec.
     */
    fprintf( stdout, "%s does not match %s, return code %d\n", text, ptn, ret );
  }
  else
  {
    /**
     * regexec call succeeded - print out all the matching expressions.
     */
    fprintf( stdout, "%s matches %s\n", text, ptn );
    for ( size_t i = 0; i < matchCount; i++ )
    {
      /* Only entries with a non-negative start offset are printed. */
      if ( pmatch[i].rm_so >= 0 )
      {
        /* %*.*s prints exactly the matched span: pmatch holds offsets into
         * text, not separate NUL-terminated strings. */
        fprintf( stdout, "match %zu (start: %3lu; end: %3lu): %*.*s\n", i,
          (unsigned long) pmatch[i].rm_so,
          (unsigned long) pmatch[i].rm_eo,
          (int) ( pmatch[i].rm_eo - pmatch[i].rm_so ),
          (int) ( pmatch[i].rm_eo - pmatch[i].rm_so ),
          text + pmatch[i].rm_so );
      }
    }
  }

  /* Release the memory regcomp allocated for the compiled pattern. */
  regfree( &regex );

  return 0;
}

下面是一次示例运行：

[fbgo448@n9dvap997]~/prototypes/regex: ./myregex
number of subexpressions: 8
<![LOG["The text to display in the log."]LOG]!><time="12:48:39.0+120" date="9-14-2015" component="mycomponent" context="" type="0" thread="0" file="myfile.cpp"> matches <!\[LOG\["(.*)"\]LOG\]!><time="(.*)" *date="(.*)" *component="(.*)" *context="(.*)" *type="(.*)" *thread="(.*)" *file="(.*)">
match 0 (start:   0; end: 160): <![LOG["The text to display in the log."]LOG]!><time="12:48:39.0+120" date="9-14-2015" component="mycomponent" context="" type="0" thread="0" file="myfile.cpp">
match 1 (start:   8; end:  39): The text to display in the log.
match 2 (start:  54; end:  68): 12:48:39.0+120
match 3 (start:  76; end:  85): 9-14-2015
match 4 (start:  98; end: 109): mycomponent
match 5 (start: 120; end: 120):
match 6 (start: 128; end: 129): 0
match 7 (start: 139; end: 140): 0
match 8 (start: 148; end: 158): myfile.cpp

请记住，每个 pmatch 条目只记录匹配内容在源字符串中的起止偏移量；它们本身并不是独立的字符串。

这正是正则表达式库的理想用途，无论是 POSIX 正则表达式还是其他库。手写自己的解析例程当然是一次有价值的学习经历，但如果没有必要这样做，那就利用别人已经完成的工作。

如何阅读格式化文本的部分内容？

3 个答案: