This gives you an array of hashes. It uses the second blockquote to trigger the start of the next record.
#!/usr/bin/perl
use strict;
use warnings;
use HTML::TokeParser::Simple;
use Data::Dumper;
# Walk the HTML in the DATA section and build one hash per job posting.
# A record may hold: jobname, jobserial, em (array ref of member names),
# offices and job_desc. The finished list is dumped to STDOUT.
my $p = HTML::TokeParser::Simple->new(*DATA)
    or die "couldn't parse DATA: $!\n";

my @records;    # finished records, one hash ref per job
my %record;     # fields collected so far for the job being parsed

while (my $t = $p->get_token) {
    if ($t->is_start_tag('span')) {
        my $class = $t->get_attr('class') || '';
        my $name  = $t->get_attr('name')  || '';

        # Read only up to the closing </span>. Passing 'span' would read on
        # to the next opening <span> and drag in whatever text follows the
        # element (e.g. "Current members:" after the serial number).
        if ($class eq 'jobname') {
            $record{jobname} = $p->get_trimmed_text('/span');
        }
        elsif ($class eq 'jobserial') {
            $record{jobserial} = $p->get_trimmed_text('/span');
        }
        elsif ($name eq 'em') {
            # A job can list several members, so collect them in an array.
            push @{ $record{em} }, $p->get_trimmed_text('/span');
        }
        elsif ($name eq 'offices') {
            $record{offices} = $p->get_trimmed_text('/span');
        }
    }
    elsif ($t->is_start_tag('blockquote')) {
        # The first blockquote after a job's spans is its description and
        # closes the record. Later blockquotes (navigation links) arrive
        # while %record is empty and are skipped.
        next unless %record;

        $record{job_desc} = $p->get_trimmed_text('/blockquote');

        # Store a copy: pushing \%record and then clearing %record would
        # leave every element of @records pointing at one empty hash.
        push @records, { %record };
        %record = ();
    }
}

print Dumper \@records;
__DATA__
<p><b><span class="jobname">
Accounting Assistant, Level 2
</span>
<span class="jobserial">(19203)</span>
<br />
Current members:
<br />
<span name="em">Plow, Elliot</span>
<span name="em">Wang, Susan</span>
<br />
<span name="offices">Huston</span>
</p>
<blockquote>
Job descriptions here.
This block quoted text contains a job description and it what I am really looking to recover.
</blockquote>
<blockquote><a href="#top">Go to the top of this page</a>.</blockquote>
<blockquote><a href="companyHR.html">Check for open positions now!</a></blockquote>
output:
$VAR1 = {
'job_desc' => 'Job descriptions here. This block quoted text contains a job description and it what I am really looking to recover.',
'em' => [
'Plow, Elliot',
'Wang, Susan'
],
'jobserial' => '(19203) Current members:',
'jobname' => 'Accounting Assistant, Level 2'
};
update: see my reply below.