I like doing this kind of thing all in one regex. This is incomplete, but should be enough to give you the basic idea.
#!/usr/bin/perl
# https://perlmonks.org/?node_id=11105353
# following spirit of my http://www.rosettacode.org/wiki/Compiler/lexical_analyzer#Alternate_Perl_Solution
use strict;
use warnings;
# Accumulates the lexer output; each entry is a [ type, text ] pair.
my @tokens;

# Set of reserved words, used to tell keywords apart from identifiers.
# Every listed word maps to a true value (1); anything else is absent.
my %reserved;
$reserved{$_} = 1 for qw(
    alignas alignof and and_eq asm atomic_cancel atomic_commit
    atomic_noexcept auto
    bitand bitor bool break
    case catch char char16_t char32_t class compl concept const
    constexpr const_cast continue co_await co_return co_yield
    decltype default delete do double dynamic_cast
    else enum explicit export extern
    false float for friend
    goto if import inline int
    long module mutable namespace new noexcept not not_eq nullptr
    operator or or_eq
    private protected public
    register reinterpret_cast requires return
    short signed sizeof static static_assert static_cast struct
    switch synchronized
    template this thread_local throw true try typedef typeid typename
    union unsigned using
    virtual void volatile wchar_t while xor xor_eq
);
# Maps a single punctuation character to the token type name reported
# for it. Characters not listed here have no entry in this table.
my %Character = (

    # Bracketing pairs
    '(' => 'LeftParen',  ')' => 'RightParen',
    '[' => 'LeftSquare', ']' => 'RightSquare',
    '{' => 'LeftCurly',  '}' => 'RightCurly',
    '<' => 'LessThan',   '>' => 'GreaterThan',

    # Assignment and arithmetic
    '=' => 'Equal',
    '+' => 'Plus',     '-' => 'Minus',
    '*' => 'Asterisk', '/' => 'Slash',

    # Separators and other punctuation
    '#' => 'Hash',
    '.' => 'Dot',   ',' => 'Comma',
    ':' => 'Colon', ';' => 'Semicolon',
    '|' => 'Pipe',

    # Quote marks
    "'" => 'SingleQuote',
    '"' => 'DoubleQuote',
);
# Token type names for the two-character operators, filled in with a
# hash slice: the n-th operator gets the n-th name.
my %MultiOps;
@MultiOps{ '>>', '<<', '<=' } = qw( RightShift LeftShift LessThanOrEqual );
# The whole lexer as one pattern. Each successful match consumes exactly one
# lexeme, and the (?{ ... }) code block at the end of the alternative that
# matched leaves that lexeme's classification in $^R:
#   - undef for whitespace and for `//` comments (up to, not including, the
#     newline, since `.` is used without /s);
#   - a type-name string for numbers, words, `::`, the two-character
#     operators and single characters;
#   - an array ref [ 'string', $1 ] for a double-quoted string, where $1 is
#     the text between the quotes (the string ends at the first following `"`).
# \G anchors each match at the current pos() of the target string, so when
# the pattern is applied repeatedly with /g no input can be skipped.
# (?| ... ) is a branch-reset group: capture numbering restarts in every
# alternative (only the string alternative captures anything).
# Alternatives are tried in the order written, so numbers are recognised
# before \w+, and the two-character operators before the catch-all `.`.
# Inside the code blocks $& is the text matched so far by this attempt; it is
# looked up in %reserved, %MultiOps and %Character. A single character with
# no %Character entry is reported as 'character'.
my $regex = qr/ \G (?|
\s+ (?{ undef })
| \/\/.* (?{ undef })
| \d+(?:\.\d*)? (?{ 'Number' })
| \.\d+ (?{ 'Number' })
| \w+ (?{ $reserved{$&} ? 'reserved' : 'Identifier' })
| "([^"]*)" (?{ [ 'string', $1 ] })
| (?<!:)::(?!:) (?{ 'dblColon' })
| (?:<<|>>|<=) (?{ $MultiOps{$&} })
| . (?{ $Character{$&} or 'character' })
) /x;
# Read everything after __DATA__ into $_ as one string; the lexer pattern is
# applied to $_ implicitly below.
$_ = join '', <DATA>;

# Apply the lexer pattern repeatedly. /g resumes each match where the previous
# one ended; /c keeps pos() in place if a match fails. After every match $^R
# holds the value produced by the code block of the alternative that matched.
while (/$regex/gc) {
    my $result = $^R;

    # undef marks input that produces no token (whitespace, comments).
    next unless defined $result;

    # An array ref is already a complete [ type, text ] token; a plain
    # string is only the type name, so pair it with the matched text.
    push @tokens, ref $result ? $result : [ $result, $& ];
}

# Dump the collected token list.
use Data::Dump 'dd';
dd @tokens;
__DATA__
// comment should not appear
main(void)
{
int foo = 1 << 5;
puts("testing");
exit(0); // done
}
Outputs:
(
["Identifier", "main"],
["LeftParen", "("],
["reserved", "void"],
["RightParen", ")"],
["LeftCurly", "{"],
["reserved", "int"],
["Identifier", "foo"],
["Equal", "="],
["Number", 1],
["LeftShift", "<<"],
["Number", 5],
["Semicolon", ";"],
["Identifier", "puts"],
["LeftParen", "("],
["string", "testing"],
["RightParen", ")"],
["Semicolon", ";"],
["Identifier", "exit"],
["LeftParen", "("],
["Number", 0],
["RightParen", ")"],
["Semicolon", ";"],
["RightCurly", "}"],
)
At least it doesn't fail any of your provided test cases :)