#!/usr/bin/perl
#
# Fetch a Costa Crociere cruise-list page (or reuse a locally cached copy)
# and print the text of every itinerary paragraph found in it.
#
# Usage:
#   script        download the live page into the cache file, then parse it
#   script -c     skip the download and parse the existing cache file

use strict;
use warnings;

use LWP::Simple;               # getstore(), plus is_success()/status_message()
use Perl6::Slurp;              # to load the page from the cache
use HTML::TreeBuilder::XPath;  # easier to use than bare HTML::TreeBuilder

# During development we don't want to hit the real page,
# so we have a -c switch to use the cache instead.
use Getopt::Std;

my %opt;
getopts( 'c', \%opt );    # if called with -c then $opt{c} is true

my $base  = 'http://www.costacrociere.it';
my $url   = '/it/lista_crociere/capitali_nord_europa-201206.html';
my $cache = 'capitali_nord_europa-201206.html';

# The scraped text contains non-ASCII characters; without an output layer
# they show up as garbage (and trigger "Wide character" warnings).
binmode( STDOUT, ':utf8' );

# Only get the live page without -c.
if ( !$opt{c} ) {
    # getstore() returns the HTTP status code. Stop on failure rather than
    # silently parsing a stale (or missing) cache file.
    my $status = getstore( $base . $url, $cache );
    is_success($status)
        or die "Cannot fetch $base$url: $status "
        . status_message($status) . "\n";
}

# Read the cached page, decoding it as UTF-8.
my $page = slurp '<:utf8', $cache;

my $p     = HTML::TreeBuilder::XPath->new_from_content($page);
my @trips = $p->findnodes('//p[@class="itinerari-info"]');

for my $trip (@trips) {
    # you may want to do something more complex here, but for now it will do
    print "crociera: ", $trip->as_text, "\n";
}

# Release the parse tree explicitly, as the HTML::TreeBuilder docs recommend.
$p->delete;