这可能是一个简单的问题,但我无法弄清楚。
我有一个包含大量文本的文件。这些文本都遵循固定的格式,每一行的格式如下:
<![LOG["The text to display in the log."]LOG]!><time="12:48:39.0+120" date="9-14-2015" component="mycomponent" context="" type="0" thread="0" file="myfile.cpp">
从这一行我想得到的结果:
- The text to display in the log.
- 12:48:39.0+120
- 9-14-2015
- mycomponent
- "" (empty)
- 0
- 0
- myfile.cpp
其中任何一个值都可能为空。有没有简单的方法可以把这些值提取出来?
提前致谢!
答案 0 :(得分:2)
Search for the " characters in pairs: find the opening ", then the matching closing ", and repeat until no more quotes are found.
For example:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Print every double-quoted field of a sample log line, one field per line.
 *
 * The line is scanned for '"' characters in pairs: the text between an
 * opening quote and the next quote is one field.  An empty field is printed
 * as "" so that it remains visible in the output.
 *
 * Returns 0 on success, EXIT_FAILURE if an opening quote has no partner.
 */
int main(void){
    const char *text = "<![LOG[\"The text to display in the log.\"]LOG]!><time=\"12:48:39.0+120\" date=\"9-14-2015\" component=\"mycomponent\" context=\"\" type=\"0\" thread=\"0\" file=\"myfile.cpp\">";
    const char *start = text;

    while ((start = strchr(start, '"')) != NULL) {
        ++start;                                /* step past the opening quote */

        const char *end = strchr(start, '"');
        if (end == NULL) {
            /* Unterminated field: without this check end - start would be
             * computed from a null pointer, which is undefined behaviour. */
            fprintf(stderr, "bad format: unterminated quote\n");
            return EXIT_FAILURE;
        }

        size_t len = (size_t)(end - start);
        if (len) {
            /* Write the slice straight from the source string; no temporary
             * heap copy (and no unchecked malloc) is needed. */
            fwrite(start, 1, len, stdout);
            putchar('\n');
        } else {
            puts("\"\"");                       /* empty field */
        }

        start = end + 1;                        /* resume after the closing quote */
    }
    return 0;
}
答案 1 :(得分:0)
这很容易......
编辑:
#include <stdio.h>
#include <string.h>
/*
 * Read the file "myfile.cpp" one character at a time and print the contents
 * of every double-quoted field on its own line.  A field with nothing between
 * its quotes is printed as `"" (empty)`.
 *
 * Returns 0 on success, 1 if the file cannot be opened.
 */
int main(void)
{
    /* Standard fopen() is used instead of the non-portable fopen_s(), and the
     * result is checked so getc() is never handed a null stream. */
    FILE *input = fopen("myfile.cpp", "r");
    if (input == NULL)
    {
        perror("myfile.cpp");
        return 1;
    }

    /* getc() returns int; storing it in a char makes the EOF comparison
     * unreliable (never true, or true for byte 0xFF, depending on signedness). */
    int c;
    int printState = 0;     /* number of quotes seen for the current field (0..2) */
    int startPrinting = 0;  /* non-zero while inside a quoted field */
    int empty = 1;          /* non-zero until a character of the field is printed */

    while ((c = getc(input)) != EOF)
    {
        if (c == '"')
            printState++;

        if (printState == 2)
        {
            /* Closing quote reached: finish the line and reset for the next field. */
            if (empty)
                printf("\"\" (empty)\n");
            else
                printf("\n");
            printState = 0;
            startPrinting = 0;
            empty = 1;
        }

        if (startPrinting == 1)
        {
            putchar(c);
            empty = 0;
        }

        /* Enable printing only after the opening quote has been processed,
         * so the quote character itself is not echoed. */
        if (printState == 1)
            startPrinting = 1;
    }

    fclose(input);
    return 0;
}
答案 2 :(得分:0)
假设您有POSIX正则表达式库,这里有一种方法:
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <regex.h>
/**
 * Match a sample log line against a POSIX extended regular expression and
 * print the whole match plus each captured field with its start/end offsets.
 *
 * Returns 0 after a match attempt (whether or not the text matched) and
 * EXIT_FAILURE if the pattern itself cannot be compiled.
 */
int main( int argc, char **argv )
{
  const char *text = "<![LOG[\"The text to display in the log.\"]LOG]!>"
                     "<time=\"12:48:39.0+120\" date=\"9-14-2015\" "
                     "component=\"mycomponent\" context=\"\" type=\"0\" "
                     "thread=\"0\" file=\"myfile.cpp\">";

  /**
   * [ and ] are special characters in regular expressions and must be
   * escaped, but we must also escape the \ and " characters in string
   * literals.
   *
   * The *actual* regular expression would be
   *
   *   <!\[LOG\["(.*)"\]LOG\]!><time="(.*)" *date="(.*)" etc.
   *
   */
  const char *ptn="<!\\[LOG\\[\"(.*)\"\\]LOG\\]!>"
                  "<time=\"(.*)\" *date=\"(.*)\" *component=\"(.*)\" *"
                  "context=\"(.*)\" *type=\"(.*)\" *thread=\"(.*)\" *"
                  "file=\"(.*)\">";

  /**
   * "Compile" the regular expression; REG_EXTENDED means we're using
   * extended regular expression syntax (so we don't need to escape
   * opening and closing parens, among other things).  By default,
   * we try to match subexpressions specified in parentheses.
   */
  regex_t regex;
  if ( regcomp( &regex, ptn, REG_EXTENDED ) != 0 )
  {
    fprintf( stderr, "regcomp failed on %s\n", ptn );
    /* Report the failure to the caller; exiting with 0 would signal success. */
    return EXIT_FAILURE;
  }

  fprintf( stdout, "number of subexpressions: %zu\n", regex.re_nsub );

  /* One slot for the whole match plus one per parenthesised subexpression. */
  size_t matchCount=regex.re_nsub + 1;
  regmatch_t pmatch[matchCount];
  int ret = 0;

  /**
   * Try to match the input string.  matchCount specifies the *maximum*
   * number of expressions we expect to match (main string plus any
   * subexpressions).  pmatch will contain the metadata for each expression
   * match (start and end indices in the overall string, basically).
   */
  if ( (ret = regexec( &regex, text, matchCount, pmatch, 0 )) != 0 )
  {
    /**
     * regexec call failed.  There are a number of error codes that can be
     * returned (text didn't match, pattern is invalid, etc.).  Check
     * the documentation for regexec.
     */
    fprintf( stdout, "%s does not match %s, return code %d\n", text, ptn, ret );
  }
  else
  {
    /**
     * regexec call succeeded - print out all the matching expressions.
     */
    fprintf( stdout, "%s matches %s\n", text, ptn );
    for ( size_t i = 0; i < matchCount; i++ )
    {
      /* rm_so is negative for a subexpression that did not take part in the match. */
      if ( pmatch[i].rm_so >= 0 )
      {
        /* pmatch holds offsets into text, not separate strings, so the
         * field width/precision limit the output to the matched slice. */
        fprintf( stdout, "match %zu (start: %3lu; end: %3lu): %*.*s\n", i,
                 (unsigned long) pmatch[i].rm_so,
                 (unsigned long) pmatch[i].rm_eo,
                 (int) ( pmatch[i].rm_eo - pmatch[i].rm_so ),
                 (int) ( pmatch[i].rm_eo - pmatch[i].rm_so ),
                 text + pmatch[i].rm_so );
      }
    }
  }

  /* Release the storage regcomp() allocated for the compiled pattern. */
  regfree( &regex );
  return 0;
}
并且,示例运行:
[fbgo448@n9dvap997]~/prototypes/regex: ./myregex
number of subexpressions: 8
<![LOG["The text to display in the log."]LOG]!><time="12:48:39.0+120" date="9-14-2015" component="mycomponent" context="" type="0" thread="0" file="myfile.cpp"> matches <!\[LOG\["(.*)"\]LOG\]!><time="(.*)" *date="(.*)" *component="(.*)" *context="(.*)" *type="(.*)" *thread="(.*)" *file="(.*)">
match 0 (start: 0; end: 160): <![LOG["The text to display in the log."]LOG]!><time="12:48:39.0+120" date="9-14-2015" component="mycomponent" context="" type="0" thread="0" file="myfile.cpp">
match 1 (start: 8; end: 39): The text to display in the log.
match 2 (start: 54; end: 68): 12:48:39.0+120
match 3 (start: 76; end: 85): 9-14-2015
match 4 (start: 98; end: 109): mycomponent
match 5 (start: 120; end: 120):
match 6 (start: 128; end: 129): 0
match 7 (start: 139; end: 140): 0
match 8 (start: 148; end: 158): myfile.cpp
请记住,每个 pmatch 条目只记录了匹配内容在源字符串中的起始和结束偏移量;它们本身并不是独立的字符串。
这正是正则表达式库的理想用途,无论是 POSIX 正则表达式还是其他库。亲手编写自己的解析例程当然是很好的学习经历,但如果没有必要这样做,那就直接利用别人已经完成的工作。