#!/usr/bin/perl -w use strict; use XML::Twig; $/="\n\n"; # tag => attribute we are interested in # you could avoid having this global by putting it in the twig my %att= ( elt => 'elt_class', subelt => 'subelt_class', ); my $doc = ; # the original data set my $expected_sorted_doc = ; # sorted result my $expected_merged_doc = ; # merged result my $sorted_doc= sort_doc( $doc); if( compact( $sorted_doc) eq compact( $expected_sorted_doc)) { print "sorted doc generation OK\n"; } else { print "sorted doc generation NOK: \n", "expected:\n$expected_sorted_doc\n", "found:\n$sorted_doc\n"; } my $merged_doc= merge_doc( $doc); if( compact( $merged_doc) eq compact( $expected_merged_doc)) { print "merged doc generation OK\n"; } else { print "merged doc generation NOK: \n", "expected:\n$expected_merged_doc\n", "found:\n$merged_doc\n"; } # sort: for each relevant node (elt or subelt in this case) generate a location # key and move the content if the location already exists sub sort_doc { my( $doc)= @_; my $location={}; # location key => existing subelt element with this location key my $t= XML::Twig->new( twig_handlers => { elt => sub { sort_node( $location, @_); }, subelt => sub { sort_node( $location, @_); }, }, pretty_print => 'indented', # makes debugging easier ); $t->parse( $doc); return $t->sprint; } sub sort_node { my( $location, $t, $node)= @_; # compute the location key, which must describe uniquely the node category my $location_key= location( $node); # now see if we need to move the content if( my $new_parent= $location->{$location_key}) { # there is already an element with this location key # move all content's there foreach my $content ($node->children) { $content->move( last_child => $new_parent); } # no need to keep the empty shell $node->delete unless( $node->has_child); } else { # first time we see the location key, store the element in $location $location->{$location_key}= $node; } } # the location describes a node category, nodes with the same location should have the same parent sub location { my( $node)= @_; # a compact way to just join the values of the proper attributes of the ancestors of the node my $location= join( '-', grep {$_} map { $_->att( $att{$_->tag}) || '' } (@{[$node->ancestors]}, $node)); #warn "location: $location\n"; return $location; } # merge doc sub merge_doc { my( $doc)= @_; my $t= XML::Twig->new( twig_handlers => { elt => \&merge_node, subelt => \&merge_node, }, pretty_print => 'indented', ); $t->parse( $doc); return $t->sprint; } sub merge_node { my( $t, $node)= @_; my $potential_merger= $node->prev_elt( $node->tag) or return; # return if this is the first node of this type if( location( $node) eq location( $potential_merger)) { # bingo! we can merge the contents foreach my $content ($node->children) { $content->move( last_child => $potential_merger); } } else { # this branch is not used for this test as we are working in memory # but this is where you could free the memory by dumping the part of # the tree that will no longer need to be updated # $t->flush_up_to( $potential_merger); } $node->delete unless( $node->has_child); } sub compact { my( $doc)= @_; $doc=~ s{^\s+}{}; # trim at the begining $doc=~ s{\s+$}{}; # trim the end $doc=~ s{>\s*<}{><}g; # trim spaces between tags return $doc; } __DATA__