# Walk the token stream of $html and build a two-level lookup:
#   $href{ <h2 heading text> }{ <link href> } = <text of the next span>
my $p = HTML::TokeParser::Simple->new(\$html);
my (%href, $this_href, $number, $letter);
while (my $t = $p->get_token) {
    if ($t->is_start_tag('h2')) {
        # The trimmed heading text becomes the outer key for what follows.
        $letter = $p->get_trimmed_text('/h2');
        next;
    }
    if ($t->is_start_tag('a')) {
        # skip bookmarks
        next if $t->get_attr('name');
        # Remember the link target; the next span is filed under it.
        $this_href = $t->get_attr('href');
        next;
    }
    if ($t->is_start_tag('span')) {
        # Always consume the span text so the parser position advances
        # the same way whether or not the value is stored.
        $number = $p->get_trimmed_text('/span');
        # Without both a heading and a link href there is no valid key
        # pair; skip rather than store under an undefined key.
        next unless defined $letter && defined $this_href;
        $href{$letter}{$this_href} = $number;
        next;
    }
}
Output:
---------- Capture Output ----------
> "C:\Perl\bin\perl.exe" _new.pl
A
pdf\8a956f66-1c60-48fc-905c-b49d617aa6c5.pdf -> 110377660
pdf\c76b834e-36e1-497b-b13e-eba2348dc044.pdf -> 110136892
pdf\ae8d51e0-005b-44be-84cb-3c9b57335755.pdf -> 108318866
pdf\37d3e78b-1adb-458b-9e89-0df780909f08.pdf -> 108116112
pdf\e646f948-f78d-4463-a01d-0261aebf70dc.pdf -> 113069066
pdf\6c0a5bb4-143d-4305-957b-796c8193d07a.pdf -> 116815754
B
pdf\8a956f66-1c60-48fc-905c-b49d617aa6c5.pdf -> 110377660
pdf\c76b834e-36e1-497b-b13e-eba2348dc044.pdf -> 110136892
pdf\ae8d51e0-005b-44be-84cb-3c9b57335755.pdf -> 108318866
pdf\37d3e78b-1adb-458b-9e89-0df780909f08.pdf -> 108116112
pdf\e646f948-f78d-4463-a01d-0261aebf70dc.pdf -> 113069066
pdf\6c0a5bb4-143d-4305-957b-796c8193d07a.pdf -> 116815754
C
pdf\8a956f66-1c60-48fc-905c-b49d617aa6c5.pdf -> 110377660
pdf\c76b834e-36e1-497b-b13e-eba2348dc044.pdf -> 110136892
pdf\ae8d51e0-005b-44be-84cb-3c9b57335755.pdf -> 108318866
pdf\37d3e78b-1adb-458b-9e89-0df780909f08.pdf -> 108116112
pdf\e646f948-f78d-4463-a01d-0261aebf70dc.pdf -> 113069066
pdf\6c0a5bb4-143d-4305-957b-796c8193d07a.pdf -> 116815754
> Terminated with exit code 0..
|