Another way. This approach uses highly factored and specific regexes to achieve a high degree of discrimination — if that's what you want! It's easy to add further, highly specialized regexes. ($section is returned as '' (empty string) if no section is present rather than as undef.) Optional whitespace may exist between page and section sub-fields. Note that with the right pattern anchors, multiple page/section fields can be extracted from a single string/line.
c:\@Work\Perl\monks>perl -wMstrict -le
"use Data::Dump qw(dd);
 ;;
 my $rx_simple = qr{
     (?# plain name: one letter, then alphanumerics, then optional hyphen-joined alphanumeric groups )
     [[:alpha:]] [[:alnum:]]* (?: - [[:alnum:]]+)*
     }xms;
 my $rx_module = qr{
     (?# module name: capitalised alphabetic words joined by :: )
     [[:upper:]] [[:alpha:]]* (?: :: [[:upper:]] [[:alpha:]]*)*
     }xms;
 my $rx_page = qr{ $rx_simple | $rx_module }xms;
 ;;
 my $rx_section = qr{ (?# parenthesised section; the digits may be absent ) [(] \d* [)] }xms;
 ;;
 for my $line (qw(
     ftpd(8) ftpd dhcp-config(5) dhcp-config foo2 foo2(2) foo-2
     Cache::Cache(3) Cache::Cache Foo::Bar::Baz(42) Foo::Bar::Baz
     ),
     'ftpd (8)', 'dhcp-config (5)', 'Cache::Cache (3)',
     qw(-foo foo- %^&*@! 123 1foo foo--bar),
     ) {
     my $got_page_section =
         my ($page, $section) =
         $line =~ m{
             (?# the whole string must be: page, optional whitespace, optional section )
             \A ($rx_page) \s* ($rx_section?) \z
             }xms;
     ;;
     $page = $section = '???' unless $got_page_section;
     ;;
     print qq{'$line' -> '$page' '$section'};
     }
 ;;
 my $line = 'ftpd(8) -no dhcp-config no- dhcp-config (5) -- Foo::Bar::Baz(42) (999)';
 my @pages;
 push @pages, [ $1, $2 ] while $line =~ m{
     (?# each field must be bounded by whitespace or a string edge on both sides )
     (?<! \S) ($rx_page) \s* ($rx_section?) (?! \S)
     }xmsg;
 dd \@pages;
"
'ftpd(8)' -> 'ftpd' '(8)'
'ftpd' -> 'ftpd' ''
'dhcp-config(5)' -> 'dhcp-config' '(5)'
'dhcp-config' -> 'dhcp-config' ''
'foo2' -> 'foo2' ''
'foo2(2)' -> 'foo2' '(2)'
'foo-2' -> 'foo-2' ''
'Cache::Cache(3)' -> 'Cache::Cache' '(3)'
'Cache::Cache' -> 'Cache::Cache' ''
'Foo::Bar::Baz(42)' -> 'Foo::Bar::Baz' '(42)'
'Foo::Bar::Baz' -> 'Foo::Bar::Baz' ''
'ftpd (8)' -> 'ftpd' '(8)'
'dhcp-config (5)' -> 'dhcp-config' '(5)'
'Cache::Cache (3)' -> 'Cache::Cache' '(3)'
'-foo' -> '???' '???'
'foo-' -> '???' '???'
'%^&*@!' -> '???' '???'
'123' -> '???' '???'
'1foo' -> '???' '???'
'foo--bar' -> '???' '???'
[
["ftpd", "(8)"],
["dhcp-config", ""],
["dhcp-config", "(5)"],
["Foo::Bar::Baz", "(42)"],
]
(Update: A thorough test plan (see Test::More and friends) will give you confidence that whatever solution you choose actually will match what you want and reject what you don't want.)
Give a man a fish: <%-{-{-{-<
|