#!/usr/bin/perl
#
# Benchmark of several HTML-extraction modules.  Every m_* sub below pulls the
# contents of the "myBody" div out of the same small document ($html); the
# subs are first run once through debug() so their results can be compared by
# eye, then timed against each other with Benchmark.
#
# A sample run is recorded after __END__.

use strict;
use warnings;

use Benchmark qw(cmpthese timethese);

use App::scrape qw(scrape);
use HTML::Query qw(Query);
use HTML::Selector::XPath qw(selector_to_xpath);
use HTML::TreeBuilder qw();
use HTML::TreeBuilder::XPath;
use Mojo::DOM;
use SGMLExtract qw(sgml_find sgml_extract);
use Web::Query qw(wq);
use Web::Scraper qw(process scraper);

use Debug;

# Shared input document.
#
# NOTE(review): the copy of this file under review had lost all markup in the
# fixture (only the text survived).  The tags below are a reconstruction from
# what the code requires -- the wanted <div class="myBody"> must be the SECOND
# div, because m_appse and m_sgmle index it with [1] -- and from the recorded
# m_mojod output, which lacks the "I" and so suggests it sat in a child
# element.  Confirm against the original fixture.
my $html = q{
--stuff--
--more stuff--
--still more stuff--
<div class="myHeader">
Stuff I do not want
</div>
<div class="myBody">
--all the stuff <b>I</b> want, which might include div tags, too--
</div>
--yet more stuff--
};

# appse and sgmle cheat because they go off relative position of the div -
# not the class name

# App::scrape: takes element [1] of the scrape() result, then its first field.
sub m_appse {
    (scrape($html, ['div'], {class => 'myBody'}))[1]->[0];
}

# HTML::TreeBuilder::XPath: the CSS selector is translated to XPath first;
# the first matching node is serialised with as_HTML.
sub m_hselx {
    (HTML::TreeBuilder::XPath->new_from_content($html)
        ->findnodes(selector_to_xpath('div.myBody')))[0]->as_HTML;
}

# HTML::Query: CSS query straight over the raw text.
sub m_htmlq {
    Query(text => $html)->query('div.myBody')->as_HTML;
}

# Mojo::DOM: first element matching the class selector; returns ->text rather
# than markup, unlike the as_HTML-based subs.
sub m_mojod {
    Mojo::DOM->new->parse($html)->at('.myBody')->text;
}

# SGMLExtract, positional: content of entry [1] among all extracted divs.
sub m_sgmle {
    sgml_extract(\$html, 'div', {all => 1, content => 1})->[1]->{'content'};
}

# SGMLExtract, by attribute: content of the first div with class "myBody".
sub m_sgmlf {
    sgml_find(\$html, 'div', {class => 'myBody'})->[0]->{'content'};
}

# HTML::TreeBuilder: look_down by tag and class; as_HTML is given an empty
# entity list (q{}).
sub m_treeb {
    HTML::TreeBuilder->new_from_content($html)
        ->look_down(_tag => 'div', class => 'myBody')->as_HTML(q{});
}

# Web::Query: jQuery-style find, returning the inner HTML.
sub m_webqy {
    wq($html)->find('div.myBody')->html;
}

# Web::Scraper: one-rule scraper storing the match's TEXT under 'key'.
sub m_websc {
    (scraper { process "div.myBody", key => 'TEXT' }->scrape($html))[0]->{'key'};
}

# Show what each approach actually returns before timing it.
# NOTE(review): the recorded output labels every value with its source
# expression, so Debug presumably reads this statement back from the file;
# it is kept on a single line for that reason -- verify before reflowing.
debug m_appse(), m_hselx(), m_htmlq(), m_mojod(), m_treeb(), m_sgmle(), m_sgmlf(), m_webqy(), m_websc();

# A negative count makes timethese run each sub for at least that many CPU
# seconds (here: 1) instead of a fixed number of iterations; cmpthese then
# prints the comparison chart from the collected timings.
cmpthese(
    timethese(
        -1,
        {
            appse => \&m_appse,
            hselx => \&m_hselx,
            htmlq => \&m_htmlq,
            mojod => \&m_mojod,
            sgmle => \&m_sgmle,
            sgmlf => \&m_sgmlf,
            treeb => \&m_treeb,
            webqy => \&m_webqy,
            websc => \&m_websc,
        }
    )
);

__END__
debug: paul/bench.pl line 45
m_appse() = "--all the stuff I want, which might include div tags, too--";
m_hselx() = "
--all the stuff I want, which might include div tags, too--
";
m_htmlq() = "
--all the stuff I want, which might include div tags, too--
";
m_mojod() = "\n--all the stuff want, which might include div tags, too--\n";
m_treeb() = "
--all the stuff I want, which might include div tags, too--
";
m_sgmle() = "\n--all the stuff I want, which might include div tags, too--\n";
m_sgmlf() = "\n--all the stuff I want, which might include div tags, too--\n";
m_webqy() = "
--all the stuff I want, which might include div tags, too--
";
m_websc() = " --all the stuff I want, which might include div tags, too-- ";

         Rate webqy hselx websc appse htmlq treeb mojod sgmlf sgmle
webqy   697/s    --   -4%   -5%  -37%  -47%  -54%  -72%  -97%  -98%
hselx   724/s    4%    --   -1%  -35%  -44%  -52%  -71%  -97%  -97%
websc   731/s    5%    1%    --  -34%  -44%  -51%  -70%  -97%  -97%
appse  1110/s   59%   53%   52%    --  -15%  -26%  -55%  -95%  -96%
htmlq  1305/s   87%   80%   78%   18%    --  -13%  -47%  -94%  -95%
treeb  1506/s  116%  108%  106%   36%   15%    --  -39%  -93%  -95%
mojod  2465/s  254%  240%  237%  122%   89%   64%    --  -89%  -91%
sgmlf 22330/s 3103% 2983% 2953% 1912% 1611% 1383%  806%    --  -22%
sgmle 28709/s 4018% 3864% 3825% 2486% 2100% 1807% 1065%   29%    --