// We do not use /*-style comments
It doesn't even handle the case:
// We don't use old C-style comments
because it tries to find the closing single quote to match the apostrophe in "don't". You simply have to parse //-style comments for such a tool.
/"([^"]|\\")*"/
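This doesn't handle "\\". Also note that it will fail for strings of 32K or more characters, because each character costs one iteration of the group and the regex engine caps the number of iterations at roughly 32K; that is why I prefer to add the + in "([^"\\]+|\\.)*".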
Why don't you factor out m{((?! \*/ | /\* ).)+}sx into its own rule so you don't have to repeat that regex three times and so you can assign a descriptive name to it to aid understanding?
m{((?!/\*|"|').)+}s could be replaced by [^/"'] and /(?![*]), which is more to my taste but YMMV.
And I'd probably do this all with simpler regexes and a simple state machine instead of resorting to Parse::RecDescent (not that my result will be simpler code in total). Note that I even avoid having to slurp the entire input into a single string.
#!/usr/bin/perl
# Strip C-style (/* ... */) and C++-style (// ...) comments from the text in
# the DATA section and print what is left.  Input is processed one line at a
# time, so the whole input never has to be held in memory; the only state
# carried from line to line is how many /*-comments are still open.
#
# Single- and double-quoted literals are copied through untouched, so comment
# markers inside them are not treated as comments.  A literal must open and
# close on the same line to be recognised.
use strict;
use warnings;

$| = 1;    # Useful for ad-hoc testing

my $canNest = 1;     # Whether /*-style comments can be nested
my $depth   = 0;     # Number of /*-comments currently open
my $output  = "";    # Text kept from the line being processed

# What to look for while inside a /*-comment.  When comments cannot nest,
# an inner "/*" is just comment text: matching it would swallow the "*" of
# a following "*/" (as in "/*/") and the comment would never be closed.
my $delimiter = $canNest ? qr{/[*]|[*]/} : qr{[*]/};

# A run of text that contains no comment opener.  The atomic groups stop the
# engine from retrying shorter runs of ordinary characters inside a literal
# when the closing quote is missing; a shorter run can never lead to a match
# because the next character would have to be a quote or a backslash.
my $code = qr{
    (?:
        [^'"/]+                             # ordinary text
      | ' (?: (?> [^'\\]+ ) | \\. )* '      # single-quoted literal
      | " (?: (?> [^"\\]+ ) | \\. )* "      # double-quoted literal
      | / (?! [/*] )                        # a slash that opens no comment
    )+
}x;

while ( defined( my $line = <DATA> ) ) {

    # Walk the line left to right; pos($line) marks how far we have got.
    SEGMENT:
    until ( $line =~ m{\G\z}gc ) {

        # Inside a comment: hop from delimiter to delimiter until the
        # comment is closed or the line has no more delimiters.
        while ( $depth && $line =~ m{($delimiter)}gc ) {
            if ( $1 eq '/*' ) {
                $depth++;
            }
            else {
                $depth--;
            }
        }

        # Still inside a comment, so the rest of this line is comment text.
        last SEGMENT if $depth;

        if ( $line =~ m{\G($code)}gc ) {
            $output .= $1;
        }
        elsif ( $line =~ m{\G//.*}gc ) {
            # skip C++ comments (the newline is not matched, so it is kept)
        }
        elsif ( $line =~ m{\G/[*]}gc ) {
            $depth++;
        }
        elsif ( $line =~ m{\G(['"])}gc ) {
            # A quote with no partner on this line is ordinary text: keep it
            # in the output so that only comments are ever removed.
            $output .= $1;
            warn "Ignoring unclosed quote: $line";
        }
        else {
            # Defensive: the branches above cover every possible character.
            die $line, ' ' x ( pos($line) || 0 ), "^\nCouldn't be parsed";
        }
    }

    print $output;
    $output = "";
}

warn "$depth unclosed /*-comments\n" if $depth;
__END__
#include "StdAfx.h" // Tail comment
#include "Utility\perftime.h"
#pragma hdrstop
/* Comment before MACRO */
/* Comment /* and nested comment */ lines */
#define MACRO 10\
+ 3 // Multi line macro with comment
#define __DEBUG /* comment */ 1
#define STRING 'This is a string' /* comment */
#define BACKSLASH '\\'
#define COMMENT "/* comment in \"a\" string */"
// c++ comment line
/* Comment at start for a number of lines */
/* multi-line comment
/* nested */
block */
// cpp block
char PerfTimer::Buf[64];
// Don't use contractions
// /*-style comment below over multiple lines:
test/*ing how newlines work
when a comment spans lines, does it st*/ing?
total/*divide*//count//*comment
Produces
#include "StdAfx.h"
#include "Utility\perftime.h"
#pragma hdrstop
#define MACRO 10\
+ 3
#define __DEBUG 1
#define STRING 'This is a string'
#define BACKSLASH '\\'
#define COMMENT "/* comment in \"a\" string */"
char PerfTimer::Buf[64];
testing?
total/count
(Very minor updates applied.)
|