#!/usr/bin/perl -w use strict; use warnings; sub parse { my ( $txt_ref, $part_aref, $spec_aref ) = @_; my ( $type, $first, $last ); # for the current part # used in the current matching my $cur_txt_ref; my $cur_sub_str; my $cur_pos0; # start of $cur_sub_str in $$txt_ref my @part; # result from the parsing my $do_part = sub { my $end_last_match = 0; MATCH: { for my $spec (@$spec_aref) { my ( $reg_exp, $type ) = @$spec; if ( $$cur_txt_ref =~ m{$reg_exp}gcm ) { push @part, [ $type, $-[0] + $cur_pos0, $+[0] + $cur_pos0 ]; $end_last_match = $+[0]; redo MATCH; } } last MATCH; } if ( $end_last_match < length $$cur_txt_ref ) { warn 'ERROR: Stopped before string end at pos: ', "$cur_pos0 + $end_last_match\n<", substr( $$cur_txt_ref, $end_last_match ), '>'; } }; if ( not defined $part_aref ) { # use the whole $$txt_ref $cur_txt_ref = $txt_ref; $cur_pos0 = 0; $do_part->(); return \@part; } $cur_txt_ref = \$cur_sub_str; # use a substring from $$txt_ref map { ( $type, $first, $last ) = @$_; if ( $type eq '??' ) { $cur_sub_str = substr( $$txt_ref, $first, $last - $first ); $cur_pos0 = $first; $do_part->(); } else { push @part, $_; } } @$part_aref; return \@part; } my @spec_1 = ( [ qr{\G'[^'\\]*(?:\\.[^'\\]*)*'}, 'sq' ], [ qr{\G"[^"\\]*(?:\\.[^"\\]*)*"}, 'dq' ], [ qr{\G//[^\n]*[\n]?}, 'cn' ], [ qr{\G/[*](?:[^*]*|[*]+[^/*]*)*[*]/}, 'cb' ], [ qr{\G(?:[^'"/]+|[/][^'"/*])+}, '??' ], ); my $dig1 = qr{-?\d+\.\d*}; my $dig2 = qr{-?\d*\.\d+}; my $dig3 = qr{-?\d+}; my $dig4 = qr{E-?\d+}; my $digit = qr{(?:$dig1|$dig2|$dig3)$dig4?}; my @spec_2 = ( [ qr{\G(?:var|alert)}, 'rw' ], [ qr{\G$digit}, 'di' ], [ qr{\G[_a-zA-Z0-9.\$]+}, 'na' ], [ qr{\G(?:[^_a-zA-Z0-9.\$\d]+|[\n\s]+)+}, '??' ], ); sub to_string_part_aref { my ( $txt_ref, $part_aref ) = @_; return join '', map { my ( $type, $first, $last ) = @$_; if ( $type eq '??' ) { substr( $$txt_ref, $first, $last - $first ); } else { "<$type>" . substr( $$txt_ref, $first, $last - $first ) . ""; } } @{$part_aref}; } my $text = do { local $/; }; my $text_ref = \$text; my $part_ref_1 = parse( $text_ref, undef, \@spec_1 ); my $part_ref_2 = parse( $text_ref, $part_ref_1, \@spec_2 ); warn to_string_part_aref( $text_ref, $part_ref_2 ); exit 0; __DATA__ // This is a single-line comment var x = 4; // Single /* Multiple-line comment that can span any number of lines */ /* This is a multi-line comment // Still a multi-line comment */ /* Stop code var x = 4; var y = 5; /* Bug? * x = "cool"; End Stop code */ // This is a single-line comment /* ...still a single-line comment 'string\' // still a string'; // comment /* not-a-nested-comment var = 0.5; // comment */* still-a-comment ' /**/ string ' /* "comment..." // still-a-comment */ alert('This isn\'t a comment!'); /\/* this isn't a comment! */; //* comment /* //a comment... // still-a-comment 12345 "Foo /bar/ "" */ /*//Boo*/ /*/**/