Beefy Boxes and Bandwidth Generously Provided by pair Networks
more useful options
 
PerlMonks  

Why a regex *really* isn't good enough for HTML and XML, even for "simple" tasks

by haukex (Archbishop)
on May 05, 2020 at 11:41 UTC ( [id://11116478]=perlmeditation: print w/replies, xml ) Need Help??

Help for this page

Select Code to Download


  1. or download this
    <a
    href
    ...
    <a href=http://www.example.com/7>Se<span
    >v&#101;</span>n</a>
    <script>/* --> */</script>
    
  2. or download this
    <script type="text/javascript">/*<![CDATA[
    </script>
    ...
    <![CDATA[
    <a href="http://www.example.com/8">Eight</a>
    ]]>
    
  3. or download this
    use Mojo::DOM;
    my $links = Mojo::DOM->new($html)->find('a[href]');
    ...
        ( my $txt_trim = $link->all_text ) =~ s/^\s+|\s+$//g;
        print $link->{href}, "\t", $txt_trim, "\n";
    }
    
  4. or download this
    use HTML::TreeBuilder::XPath;
    my $p = HTML::TreeBuilder::XPath->new;
    ...
    for my $link (@links) {
        print $link->attr('href'), "\t", $link->as_text_trimmed, "\n";
    }
    
  5. or download this
    use HTML::LinkExtor;
    my $p = HTML::LinkExtor->new;
    ...
        my ($tag, %attrs) = @$link;
        print $attrs{href}, "\n";
    }
    

Log In?
Username:
Password:

What's my password?
Create A New User
Domain Nodelet?
Node Status?
node history
Node Type: perlmeditation [id://11116478]
Approved by marto
Front-paged by marto
help
Chatterbox?
and the web crawler heard nothing...

How do I use this? | Last hour | Other CB clients
Other Users?
Others taking refuge in the Monastery: (1)
As of 2024-04-19 00:24 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    No recent polls found