sub Lex { my $input = shift; # Get file handle to a C++ file my @tokens; # This will contain the tokenised file ready for our parser my @longPatterns = ( ['Comment' => qr|//.*| ], ['Directive' => qr|^\s*#define.*| ], ['Directive' => qr|^\s*#elif.*| ], ['Directive' => qr|^\s*#else.*| ], ['Directive' => qr|^\s*#error.*| ], ['Directive' => qr|^\s*#endif.*| ], ['Directive' => qr|^\s*#if.*| ], ['Directive' => qr|^\s*#ifdef.*| ], ['Directive' => qr|^\s*#ifndef.*| ], ['Directive' => qr|^\s*#include.*| ], ['Directive' => qr|^\s*#line.*| ], ['Directive' => qr|^\s*#undef.*| ], ['Directive' => qr|^\s*#pragma.*| ], ); my @reserved = qw( alignas alignof and and_eq asm atomic_cancel atomic_commit atomic_noexcept auto bitand bitor bool break case catch char char16_t char32_t class compl concept const constexpr const_cast continue co_await co_return co_yield decltype default delete do double dynamic_cast else enum explicit export extern false float for friend goto if import inline int long module mutable namespace new noexcept not not_eq nullptr operator or or_eq private protected public register reinterpret_cast requires return short signed sizeof static static_assert static_cast struct switch synchronized template this thread_local throw true try typedef typeid typename union unsigned using virtual void volatile wchar_t while xor xor_eq ); my @patterns = ( # Multi character patterns to lex out ['Number' => qr/^\d[\.\d]*$/ ], ['Identifier' => qr/\w+/ ], ['dblColon' => qr/(? 'LeftParen', ')' => 'RightParen', '[' => 'LeftSquare', ']' => 'RightSquare', '{' => 'LeftCurly', '}' => 'RightCurly', '<' => 'LessThan', '>' => 'GreaterThan', '=' => 'Equal', '+' => 'Plus', '-' => 'Minus', '*' => 'Asterisk', '/' => 'Slash', '#' => 'Hash', '.' => 'Dot', ',' => 'Comma', ':' => 'Colon', ';' => 'Semicolon', "'" => 'SingleQuote', '"' => 'DoubleQuote', '|' => 'Pipe', ); while (my $line = <$input>) { chomp $line; my $matched; for my $patt (@longPatterns) { # some to evaluate on the entire line if ($line =~ s|($patt->[1])|| ) { my $token = $1; print "$patt->[0]\t$token\n" if $debug; push @tokens, [$patt->[0], $token]; } } print "got> $line\n" if $debug and $line =~/\S/; LABEL: for my $token (split /\b/, $line) { # now handle token at a time $token =~ s/^\s+|\s+$//g; # Strip whitespace next unless $token; # anything left? print "Lexing $token\n" if $debug; for my $word (@reserved) { # look for reserve words if ($word eq $token) { # A C++ reserve word, simples print "reserved\t$token\n" if $debug; push @tokens, ['reserved', $token]; next LABEL; } } for my $pat (@patterns) { # Try multi character patterns next if ($token =~ /$pat->[1]/) { print "$pat->[0]\t$token\n" if $debug; push @tokens, [$pat->[0], $token]; next LABEL } } unless ($matched) { # Didn't match multichar pattern, so handle character at a time for my $char (split //, $token) { print "Lexing by character $char\n" if $debug; if (exists $Character{$char}) { print "$Character{$char}\t$char\n" if $debug; push @tokens, [$Character{$char}, $char]; } else { print "Failed to match $char\n"; } } } } Parser(\@tokens) } }