term -> atom operator term term -> '(' term ')' term -> atom operator -> '*' | '/'| '+' | '-' atom -> \d+ #### term -> atom operator term '2' operator term '2' '*' term '2' '*' (atom operator term) '2' '*' ('3' operator term) '2' '*' ('3' '+' term) '2' '*' ('3' '+' (atom)) '2' '*' ('3' '+' ('4')) #### expression -> (term add_op)* term term -> (factor mul_op)* factor factor -> atom | '(' expression ')' add_op -> '+' | '-' mul_op -> '*' | '/' atom -> \d+ #### expression term add_op term term '+' (factor) term '+' ('4') (factor mulop factor) '+' ('4') ((atom) mulop (atom)) '+' ('4') (('2') '*' ('3') ) '+' ('4') #### #!/usr/bin/perl use strict; use warnings; use Data::Dumper; my @token_def = ( [Whitespace => qr{\s+}, 1], [Comment => qr{#.*\n?$}m, 1], [AddOp => qr{[+-]} ], [MulOp => qr{[*/]} ], [Number => qr{\d+} ], [OpenParen => qr{\(} ], [CloseParen => qr{\)} ], ); my $input = $ARGV[0] || "2 * 3 + 4 # and a comment\n"; my @tokens; pos($input) = 0; while(pos($input) < length $input){ my $matched = 0; for my $t (@token_def){ my ($name, $re, $ignore_flag) = @$t; if ($input =~ m/\G($re)/gc){ $matched = 1; next if $ignore_flag; push @tokens, [$name, $1]; next; } } die "Syntax error at postion " . pos($input) unless $matched } print Dumper \@tokens; __END__ # output reformatted for brevity: $VAR1 = [ [ 'Number', '2' ], [ 'MulOp', '*' ], [ 'Number', '3' ], [ 'AddOp', '+' ], [ 'Number', '4' ] ]; #### sub match { my $expected_token = shift; if ($tokens[0][0] eq $expected_token){ my $current = shift @tokens; return $current->[1]; } else { die "Syntax error: expected $expected_token, got $tokens[0][0]\n"; } } sub lookahead { my @expected = @_; no warnings 'uninitialized'; for (0 .. $#expected){ return 0 if $tokens[$_][0] ne $expected[$_]; } return 1; } #### sub expression { my @res = (term()); while (lookahead('AddOp')){ push @res, match('AddOp'); push @res, term(); } return \@res; } sub term { my @res = (factor()); while (lookahead('MulOp')){ push @res, match('MulOp'); push @res, factor(); } return \@res; } sub factor { if (lookahead('OpenParen')){ match('OpenParen'); my $res = expression(); match('CloseParen'); return $res; } else { atom(); } } sub atom { match('Number'); } #### my $parse_tree = expression(); print Dumper $parse_tree; __END__ # again output reformatted $VAR1 = [ [ '2', '*', '3' ], '+', [ '4' ] ]; #### # parse tree for '2 * (3+4)': [ '2', '*', [ '3', '+', '4' ], ]; #### my $parse_tree = expression(); match('EOF'); #### sub execute { my $tree = shift; return $tree unless ref $tree; my %ops = ( '*' => sub { $_[0] * $_[1] }, '/' => sub { $_[0] / $_[1] }, '+' => sub { $_[0] + $_[1] }, '-' => sub { $_[0] - $_[1] }, ); my ($val, @rest) = @$tree; $val = execute($val); while (@rest){ my $op = shift @rest; my $next = execute(shift @rest); $val = $ops{$op}->($val, $next); } return $val; } print execute($parse_tree), "\n"; #### sub make_rule { my ($operator, $next) = @_; return sub { my @res = ($next->()); while (lookahead($operator)){ push @res, match($operator); push @res, $next->(); } return @res > 1 ? \@res : $res[0]; }; } *term = make_rule('MulOp', \&factor); *expression = make_rule('AddOp', \&term); #### grammar Arithmetic { # expression -> (term add_op)* term rule expression { [ ]* } # term -> (factor mul_op)* factor rule term { [ ]* } # factor -> atom | '(' expression ')' rule factor { | | '(' ')' } # add_op -> '+' | '-' token add_op { <[+-]> # that's a char class } # mul_op -> '*' | '/' token mul_op { <[*/]> } token atom { \d+ } } # match it: $input ~~ Grammar.expression; #### rule factor { | | '(' [ ')' || ] } #### # this example mixes official Perl 6 syntax an PGE syntax, # I don't really know if they are 100% compatible # see https://svn.perl.org/parrot/trunk/docs/pct/pct_optable_guide.pod # and http://perlcabal.org/syn/S05.html rule factor { | | '(' ')' } token infix:<*> { }; # stands for a literal * here token infix: is equiv('infix:*') { }; token infix:<+> is looser('infix:*') { }; token infix:<-> is equiv('inifx:+') { }; # put parser in bottom-up mode: rule expression is optable { ... }; # yes, '...' is valid Perl 6 ;-) proto 'term:' is tighter('infix:*') is parsed(&factor) { ... }; #### ModernArithmetic is Arithmetc { token infix: { '÷' } # different symbol, same rule name }