我需要对 C++ 代码进行转换,但转换不应作用于注释或预处理器语句。让我遇到麻烦的正是预处理器语句。基本上,我想要类似下面这样的东西:
#!/usr/bin/perl
# Reads the file named by the first command-line argument and repeatedly
# splits the buffered text into three parts with one regex:
#   $code          - text the transformation would be applied to
#   $stuffToIgnore - comments / preprocessor statements to pass through
#   $rest          - whatever follows, carried over into the next pass
# A debug trace goes to STDOUT and "$code$stuffToIgnore" is written to
# "<file>.out".
my $file = $ARGV[0];
# Regex source text (interpolated into the pattern below) for a double-quoted
# and a single-quoted literal, allowing backslash escapes inside.
my $doubleQuotedString = q{"(?>[^"\\\\]++|\\\\{2}|\\\\(?s).)*+"};
my $singleQuotedString = q{'(?>[^'\\\\]++|\\\\{2}|\\\\(?s).)*+'};
# Only $rest is initialised (to ""), so the first "$rest$_" below is defined.
my ($rest, $code, $stuffToIgnore) = ("");
open(my $inputFH, "<:raw:crlf", $file) or die "can't open $file for reading. $!";
open(my $outputFH, ">:raw:crlf", "$file.out") or die "can't open $file.out for writing. $!";
# Incremented once per match attempt; not read anywhere in this script.
my $counter = 0;
while (<$inputFH>)
{
# Prepend the unconsumed tail of the previous pass to the line just read.
$_ = "$rest$_";
# Keep appending input lines to $_ and re-matching until something to ignore
# has been found or the input is exhausted.
do
{
# Capture 1 = code, capture 2 = stuff to ignore, capture 3 = rest.
# NOTE: the pattern is compiled with /x, where an unescaped '#' starts a
# comment that runs to the end of the line. On the line labelled
# "preprocessor statement" the first '#' is therefore not matched literally:
# everything from it to the end of that line, including the '|', is dropped
# from the pattern.
($code, $stuffToIgnore, $rest) = m(
((?:
$doubleQuotedString # found a string
|$singleQuotedString # found a string
|(?:[^/]++|/[^*/]) # found something not a string, comment or preprocessor statement
)*+
)
((?:
^\s*+#.*$ | # preprocessor statement
\s*+//.*$ | # line comment
\s*+/\*(?:[^*]++|\*(?!/))*+\*/ # block comment
)*+
)
((?s).*) # rest
)xm;
++$counter;
# 'goto' is used to leave the do-while from the middle of its body.
goto BLOCK_READ if $stuffToIgnore ne "" or eof($inputFH);
} while ($_ .= <$inputFH>);
BLOCK_READ:
# $code is undefined only if the match above did not succeed.
defined $code or die "Unterminated block.";
# do transformation on $code
print "CODE: >>$code<<\nIGNORE: >>$stuffToIgnore<<\n";
# The code section is written first, then the ignored section that followed it.
print $outputFH "$code$stuffToIgnore";
}
我的问题出在注释为 `# found something not a string, comment or preprocessor statement` 的那一行。
把注释为 `# preprocessor statement` 的那一行注释掉可以让脚本正常运行,但这样预处理器语句就会被当作需要应用转换的代码。
如何修改正则表达式的第一部分,使其在遇到预处理器语句时匹配失败?或者你有别的办法可以做到这一点?
修改
在 @sln 的帮助下,我完成了最终的解决方案。我把它贴在这里作为参考,展示我是如何实现目标的,同时保留了一些调试代码,供感兴趣的人查看。
#!/usr/bin/perl
# Splits a C++ source file into alternating "ignore" sections (preprocessor
# statements, line comments, block comments) and "code" sections, so that a
# transformation can be applied to the code only.
#
# Usage: script.pl FILE
#   Reads FILE, writes "$ignore$code" for every parsed block to FILE.out and,
#   while $debug is true, prints a trace of the parse to STDOUT.
#
# Dies if the file cannot be opened, if the output cannot be closed, or if a
# string, comment or line continuation is still open when parsing stops.
use strict;
use warnings;

my $file = $ARGV[0];
my $debug = 1;

# $rest holds the unparsed tail of the previous block; it starts out empty so
# that the first concatenation with the input line is well defined.
my ($rest, $code, $ignore) = ("");
my $lineNumber = 1;
my $topLineOfBlock;

open(my $inputFH, "<:raw:crlf", $file) or die "can't open $file for reading. $!";
open(my $outputFH, ">:raw:crlf", "$file.out") or die "can't open $file.out for writing. $!";

my $complete = 1;    # NOTE: These 2 must be declared in the base scope of the package.
my $lineOffset = 0;  # See https://rt.perl.org/Ticket/Display.html?id=120554 for details.
                     # Initialised to 0 so it is never undef when it is used.

# Keep going while a line could be read OR text is still pending in $rest.
# (Testing eof() right after the read, as was done before, skipped the body
# for the final line of the file whenever $rest was empty, dropping that line.)
while (defined($_ = <$inputFH>) or length($rest) != 0)
{
    # NOTE(review): $lineNumber is only incremented for lines read by the
    # inner loop below, not for the read in the loop condition above, so the
    # reported line numbers may lag behind - confirm before relying on them.
    $topLineOfBlock = $lineNumber;
    print "Read line $lineNumber\n" if $debug;

    # Prepend whatever the previous pass left unparsed.
    if (defined $_)
    {
        $_ = "$rest$_";
    }
    else
    {
        $_ = $rest;
    }

    my $loopAgain;
    do
    {
        if (/\\$/) # if line ends with '\' read and append in next line
        {
            $complete = 0;
        }
        elsif (eof($inputFH) or /;\s*+$/) # if eof or line does end in a ';', break it up.
                                          # otherwise read and append in next line.
        {
            print "INPUT: '$_'\n" if $debug;
            use re 'eval';
            # The first alternative only DEFINES subpatterns: it starts with
            # (?!) and so can never match. The second alternative performs
            # the actual search and calls those subpatterns by name.
            m%
                (?{print "STRING: '${^POSTMATCH}'\n" if $debug})
                (?{$lineOffset = 0})
                # ROUTINES
                (?!) # Ignore this section for matching
                # DEBUG ROUTINES
                # Call them using (?N) where N is the corresponding number.
                ((?{print "1]'${^MATCH}'\n" if $debug}))
                ((?{print "2]'${^MATCH}'\n" if $debug}))
                ((?{print "3]'${^MATCH}'\n" if $debug}))
                ((?{print "4]'${^MATCH}'\n" if $debug}))
                ((?{print "5]'${^MATCH}'\n" if $debug}))
                ((?{print "6]'${^MATCH}'\n" if $debug}))
                ((?{print "7]'${^MATCH}'\n" if $debug}))
                ((?{print "8]'${^MATCH}'\n" if $debug}))
                ((?{print "9]'${^MATCH}'\n" if $debug}))
                ((?{print "10]'${^MATCH}'\n" if $debug}))
                ((?{print "11]'${^MATCH}'\n" if $debug}))
                ((?{print "12]'${^MATCH}'\n" if $debug}))
                ((?{print "13]'${^MATCH}'\n" if $debug}))
                ((?{print "14]'${^MATCH}'\n" if $debug}))
                ((?{print "15]'${^MATCH}'\n" if $debug}))
                ((?{print "16]'${^MATCH}'\n" if $debug}))
                ((?{print "17]'${^MATCH}'\n" if $debug}))
                ((?{print "18]'${^MATCH}'\n" if $debug}))
                ((?{print "19]'${^MATCH}'\n" if $debug}))
                # SUBROUTINES
                # States that code read in is in an incomplete state.
                (?<INCOMPLETE>(?{print "INCOMPLETE: '${^MATCH}'\n" if $debug; $complete = 0;}))
                # States that code read in is in a completed state.
                (?<COMPLETE> (?{print "COMPLETE: '${^MATCH}'\n" if $debug; $complete = 1;}))
                # Matches against one character that has been escaped including EOL.
                # If a quoted EOL found, mark match as incomplete.
                (?<ESCAPED_CHAR>
                    \\
                    (?:
                        (?&EOL) (?&INCOMPLETE)
                      | (?s).
                    )
                )
                # Matches against a single quoted string excluding EOL.
                (?<SINGLE_QUOTED_STRING>
                    \'(?&INCOMPLETE) # Escaped quotes due to a syntax highlighting bug in SO
                    (?:
                        [^\'\\\n]++ # Anything but a quote, a backslash or a newline
                      | (?&ESCAPED_CHAR)
                    )*+
                    \'(?&COMPLETE) # Escaped quotes due to a syntax highlighting bug in SO
                )
                # Matches against a double quoted string excluding EOL.
                (?<DOUBLE_QUOTED_STRING>
                    \"(?&INCOMPLETE) # Escaped quotes due to a syntax highlighting bug in SO
                    (?:
                        [^\"\\\n]++ # Escaped quotes due to a syntax highlighting bug in SO
                      | (?&ESCAPED_CHAR)
                    )*+
                    \"(?&COMPLETE) # Escaped quotes due to a syntax highlighting bug in SO
                )
                # matches strings intermingled with other chars excluding EOL.
                (?<STRINGS_WITH_CHARS>
                    (?:
                        # (?&NON_ESCAPED_CHARS)
                        [^\\\n\"\']++ # Escaped quotes due to a syntax highlighting bug in SO
                      | (?&DOUBLE_QUOTED_STRING)
                      | (?&SINGLE_QUOTED_STRING)
                      | (?&ESCAPED_CHAR)
                    )*+
                )
                # Matches against non escaped characters excluding EOL.
                (?<NON_ESCAPED_CHARS> [^\\\n]++)
                # Matches all non escaped chars and escaped chars.
                # upto but not including the EOL unless it's escaped.
                (?<CHARS> (?:(?&NON_ESCAPED_CHARS)|(?&ESCAPED_CHAR))*+)
                # Matches EOL (end of line) or EOS (end of string) and states it is in a complete state.
                (?<EOL_OR_EOS> (?&EOL) | $ (?&COMPLETE))
                # Matches on EOL and increments $lineOffset if matched.
                # When using this, make sure you don't allow backtracking over this call.
                (?<EOL> \n(?{++$lineOffset}))
              | # ACTUAL SEARCH
                (?<ignore>
                    (?:
                        (?&EOL)? ^ [^\S\n]*+ \# (?&STRINGS_WITH_CHARS) (?&EOL_OR_EOS) # preprocessor statement
                        # Escaping slashes due to a syntax highlighting bug in SO
                      | \s*+ \/\/ (?&CHARS) (?&EOL_OR_EOS) # line comment
                      | \s*+ \/\* (?&INCOMPLETE) # block comment
                        (?:
                            [^*]++
                          | \* (?!\/)
                        )*+
                        \*\/ (?&COMPLETE) # block comment completed
                    )*+
                )
                (?(?{$complete}) # completed parse of all ignored stuff? Then read code stuff.
                    (?<code>
                        (?:
                            (?!^[^\S\n]*+\#) # do not match on a preprocessor statement
                            (?:
                                (?&DOUBLE_QUOTED_STRING) # found a string
                              | (?&SINGLE_QUOTED_STRING) # found a string
                              | (?: [^\'\"/\n]++ | /[^*/\n]) # found something not a string or comment or newline
                                # Escaped quotes due to a syntax highlighting bug in SO
                              | (?&EOL) # newline
                            )
                        )*+
                    )
                    (?<rest>
                        (?s).* # match to the end of the string
                    )
                ) # if not completed, read in more stuff and do parse over again.
            %xmp;
            ($code, $ignore, $rest) = ($+{'code'}, $+{'ignore'}, $+{'rest'});
            print "**COMPLETE = $complete\n" if $debug;
            # 'last' cannot be used to leave a do-while block, hence the goto.
            goto BLOCK_READ_COMPLETE if $complete or eof($inputFH);
        }
        # read in more data to allow for a complete parse
        ++$lineNumber;
        print "Reading line $lineNumber\n" if $debug;
        my $newStuff = <$inputFH>;
        if (defined $newStuff)
        {
            $_ .= $newStuff;
            $loopAgain = 1;
        }
        else
        {
            $loopAgain = 0;
        }
    } while ($loopAgain);
    BLOCK_READ_COMPLETE:
    $complete or die "Something wasn't terminated at line ". ($topLineOfBlock+$lineOffset) ." of file '$file'.\n";
    # do transformation on $code
    print "CODE: >>$code<<\nIGNORE: >>$ignore<<\nREST: >>$rest<<\n" if $debug;
    print $outputFH "$ignore$code";
}

close($inputFH);
# Buffered write errors only surface when the handle is closed, so check it.
close($outputFH) or die "can't close $file.out after writing. $!";
要查看有效的演示,请参阅here。
答案 0 :(得分:2)
你可以试试这个。
有几个问题。你是否一次累积并处理多于一行的内容?
如果不是,我没有看到任何针对跨越多行的块注释的处理。此外,块注释会把行注释一并吞掉,而且我也没有看到任何针对行注释续行的处理。
修改 - 5
@Adrian - 我整理出了一个在我这里可以正常工作的版本。Perl 代码和输出如下。
如果它对你有效,请告诉我。
请注意,我为 `#` 预处理器语句以及 `//` 行注释都添加了续行(行尾反斜杠)处理代码。
测试结果非常好。
use strict;
use warnings;

# Demonstration: splits the C++ text in the DATA section into "code" sections
# and "ignore" sections (preprocessor statements, block comments and line
# comments) and prints each pair to STDOUT.

# Regex source text for a double-quoted and a single-quoted literal, allowing
# backslash escapes inside. Both are interpolated into the pattern below.
my $doubleQuotedString = q{"(?>[^"\\\\]++|\\\\{2}|\\\\(?s).)*+"};
my $singleQuotedString = q{'(?>[^'\\\\]++|\\\\{2}|\\\\(?s).)*+'};

# $rest carries the text that followed the ignored section into the next
# pass; it starts out empty so the first concatenation is well defined.
my ($rest, $code, $stuffToIgnore) = ("");

while (defined(my $line = <DATA>)) {
    my $buffer = $rest . $line;

    # Keep appending input lines and re-matching until an ignored section has
    # been found or the input is exhausted.
    while (1) {
        ($code, $stuffToIgnore, $rest) = $buffer =~ m~
            (                                   # (1), Code
                (?:
                    (?! ^ [^\S\n]* \# )         # Not a preprocessor statement
                    (?:
                        $doubleQuotedString     # found a string
                      | $singleQuotedString     # found a string
                      | (?: [^'"/\n]++ | / [^*/\n] )   # not a quote, comment start or newline
                      | \n                      # newline
                    )
                )*+
            )
            (                                   # (2), Ignore
                (?:
                    \s*+ ^ [^\S\n]* \#          # '#' Preprocessor statement with Continuation logic
                    (?:
                        [^\\]                   # any character except a backslash
                      | \\ \n?                  # Or, a backslash followed by optional newline
                    )*?
                    (?<! \\ )
                    \n
                  |
                    \s*+ /\*                    # '/**_**/' Block comment
                    (?:
                        [^*]++
                      | \*
                        (?! / )
                    )*+
                    \*/
                  |
                    \s*+ //                     # '//' Line comment, with Continuation logic
                    (?:
                        [^\\]
                      | \\ \n?
                    )*?
                    (?<! \\ )
                    \n
                )*+
            )
            (                                   # (3), Rest
                (?s)
                .*
            )
        ~xm;

        last if $stuffToIgnore ne "" or eof(DATA);
        $buffer .= <DATA>;
    }

    # $code is undefined only if the match above did not succeed.
    defined $code or die "Unterminated block.";

    # do transformation on $code
    print "CODE: >>$code<<\nIGNORE: >>$stuffToIgnore<<\n---------------------------------------\n";
}

# Anything still pending here was never emitted as code or ignored text
# (for example text following a block comment on the last line); say so
# instead of discarding it silently.
warn "Unparsed trailing input: >>$rest<<\n" if length $rest;
__DATA__
# hello \
there
# how
# are
#you
#pragma once
#include "EditState.h"
#include "MyDoc.h"
// InputEdit
class CInputEdit : public CEdit
{
DECLARE_DYNAMIC(CInputEdit)
public:
CInputEdit();
virtual ~CInputEdit();
// Attributes
protected:
DECLARE_MESSAGE_MAP()
BOOL m_bTrackingMenu;
};
输出
CODE: >>
<<
IGNORE: >># hello \
there
<<
---------------------------------------
CODE: >><<
IGNORE: >># how
<<
---------------------------------------
CODE: >><<
IGNORE: >># are
<<
---------------------------------------
CODE: >><<
IGNORE: >>#you
<<
---------------------------------------
CODE: >>
<<
IGNORE: >>#pragma once
<<
---------------------------------------
CODE: >>
<<
IGNORE: >>#include "EditState.h"
<<
---------------------------------------
CODE: >><<
IGNORE: >>#include "MyDoc.h"
<<
---------------------------------------
CODE: >>
<<
IGNORE: >>// InputEdit
<<
---------------------------------------
CODE: >>
class CInputEdit : public CEdit
{
DECLARE_DYNAMIC(CInputEdit)
public:
CInputEdit();
virtual ~CInputEdit();
<<
IGNORE: >>// Attributes
<<
---------------------------------------
CODE: >>protected:
DECLARE_MESSAGE_MAP()
BOOL m_bTrackingMenu;
};
<<
IGNORE: >><<
---------------------------------------
答案 1 :(得分:0)
我认为,如果先把这个复杂的正则表达式拆分成若干独立的部分,你会更容易成功。仅仅把各种可能的匹配分别放到单独的 `if` 块中,就能大大提高可读性。
您还应该能够利用有少量预处理程序指令的事实。