use strict; use warnings; sub parse { my ( $txt_ref, $part_aref, $spec_aref ) = @_; if ( not defined $part_aref ) { # use the whole $$txt_ref $part_aref = [ [ '??', 0, length($$txt_ref) ] ]; } my @part; # result from the parsing for my $part (@$part_aref) { my ( $type, $first, $length ) = @$part; if ( $type ne '??' ) { push @part, $part; next; } my $sub_txt_ref = \substr( $$txt_ref, $first, $length ); my $end_last_match = 0; MATCH: { for my $spec (@$spec_aref) { my ( $reg_exp, $type ) = @$spec; if ( $$sub_txt_ref =~ m{$reg_exp}gcm ) { push @part, [ $type, $-[0] + $first, $+[0] - $-[0] ]; $end_last_match = $+[0]; redo MATCH; } } } if ( $end_last_match < $length ) { warn 'ERROR: Stopped before string end at pos: ', "$first + $end_last_match\n<", substr( $$sub_txt_ref, $end_last_match ), '>'; } } return \@part; } my @spec_1 = ( [ qr{\G'[^'\\]*(?:\\.[^'\\]*)*'}, 'sq' ], [ qr{\G"[^"\\]*(?:\\.[^"\\]*)*"}, 'dq' ], [ qr{\G//[^\n]*[\n]?}, 'cn' ], [ qr{\G/[*](?:[^*]*|[*]+[^/*]*)*[*]/}, 'cb' ], [ qr{\G(?:[^'"/]+|[/][^'"/*])+}, '??' ], ); my $dig1 = qr{-?\d+\.\d*}; my $dig2 = qr{-?\d*\.\d+}; my $dig3 = qr{-?\d+}; my $dig4 = qr{E-?\d+}; my $digit = qr{(?:$dig1|$dig2|$dig3)$dig4?}; my @spec_2 = ( [ qr{\G(?:var|alert)}, 'rw' ], [ qr{\G$digit}, 'di' ], [ qr{\G[_a-zA-Z0-9.\$]+}, 'na' ], [ qr{\G(?:[^_a-zA-Z0-9.\$\d]+|[\n\s]+)+}, '??' ], ); sub to_string_part_aref { my ( $txt_ref, $part_aref ) = @_; return join '', map { my ( $type, $first, $length ) = @$_; if ( $type eq '??' ) { substr( $$txt_ref, $first, $length ); } else { "<$type>" . substr( $$txt_ref, $first, $length ) . ""; } } @{$part_aref}; } my $text = do { local $/; }; my $text_ref = \$text; my $part_ref_1 = parse( $text_ref, undef, \@spec_1 ); my $part_ref_2 = parse( $text_ref, $part_ref_1, \@spec_2 ); warn to_string_part_aref( $text_ref, $part_ref_2 ); __DATA__ // This is a single-line comment var x = 4; // Single /* Multiple-line comment that can span any number of lines */ /* This is a multi-line comment // Still a multi-line comment */ /* Stop code var x = 4; var y = 5; /* Bug? * x = "cool"; End Stop code */ // This is a single-line comment /* ...still a single-line comment 'string\' // still a string'; // comment /* not-a-nested-comment var = 0.5; // comment */* still-a-comment ' /**/ string ' /* "comment..." // still-a-comment */ alert('This isn\'t a comment!'); /\/* this isn't a comment! */; //* comment /* //a comment... // still-a-comment 12345 "Foo /bar/ "" */ /*//Boo*/ /*/**/