http://qs321.pair.com?node_id=217196


in reply to Removing duplicate subtrees from XML

Your requirements are a bit sketchy and I can see 2 things you might want to do in this case (once you get proper XML of course ;--):

Here is a piece of code using the usual suspect that does both (on a made up XML data set, but adapting it to your data should be really easy):

#!/usr/bin/perl -w use strict; use XML::Twig; $/="\n\n"; # tag => attribute we are interested in # you could avoid having this global by putting it in the twig my %att= ( elt => 'elt_class', subelt => 'subelt_class', ); my $doc = <DATA>; # the original data set my $expected_sorted_doc = <DATA>; # sorted result my $expected_merged_doc = <DATA>; # merged result my $sorted_doc= sort_doc( $doc); if( compact( $sorted_doc) eq compact( $expected_sorted_doc)) { print "sorted doc generation OK\n"; } else { print "sorted doc generation NOK: \n", "expected:\n$expected_sorted_doc\n", "found:\n$sorted_doc\n"; } my $merged_doc= merge_doc( $doc); if( compact( $merged_doc) eq compact( $expected_merged_doc)) { print "merged doc generation OK\n"; } else { print "merged doc generation NOK: \n", "expected:\n$expected_merged_doc\n", "found:\n$merged_doc\n"; } # sort: for each relevant node (elt or subelt in this case) generate a + location # key and move the content if the location already exists sub sort_doc { my( $doc)= @_; my $location={}; # location key => existing subelt element with t +his location key my $t= XML::Twig->new( twig_handlers => { elt => sub { sort_nod +e( $location, @_); }, subelt => sub { sort_nod +e( $location, @_); }, }, pretty_print => 'indented', # makes debuggi +ng easier ); $t->parse( $doc); return $t->sprint; } sub sort_node { my( $location, $t, $node)= @_; # compute the location key, which must describe uniquely the node +category my $location_key= location( $node); # now see if we need to move the content if( my $new_parent= $location->{$location_key}) { # there is already an element with this location key # move all content's there foreach my $content ($node->children) { $content->move( last_child => $new_parent); } # no need to keep the empty shell $node->delete unless( $node->has_child); } else { # first time we see the location key, store the element in $lo +cation $location->{$location_key}= $node; } } # the location describes a node category, nodes with the same location + should have the same parent sub location { my( $node)= @_; # a compact way to just join the values of the proper attributes o +f the ancestors of the node my $location= join( '-', grep {$_} map { $_->att( $att{$_->tag}) | +| '' } (@{[$node->ancestors]}, $node)); #warn "location: $location\n"; return $location; } # merge doc sub merge_doc { my( $doc)= @_; my $t= XML::Twig->new( twig_handlers => { elt => \&merge_node, subelt => \&merge_node, }, pretty_print => 'indented', ); $t->parse( $doc); return $t->sprint; } sub merge_node { my( $t, $node)= @_; my $potential_merger= $node->prev_elt( $node->tag) or return; # re +turn if this is the first node of this type if( location( $node) eq location( $potential_merger)) { # bingo! we can merge the contents foreach my $content ($node->children) { $content->move( last_child => $potential_merger); } } else { # this branch is not used for this test as we are working in m +emory # but this is where you could free the memory by dumping the p +art of # the tree that will no longer need to be updated # $t->flush_up_to( $potential_merger); } $node->delete unless( $node->has_child); } sub compact { my( $doc)= @_; $doc=~ s{^\s+}{}; # trim at the begining $doc=~ s{\s+$}{}; # trim the end $doc=~ s{>\s*<}{><}g; # trim spaces between tags return $doc; } __DATA__ <doc> <elt elt_class="class1"> <subelt subelt_class="sclass1"><content id="content1"/></subelt> </elt> <elt elt_class="class1"> <subelt subelt_class="sclass1"><content id="content2"/></subelt> </elt> <elt elt_class="class1"> <subelt subelt_class="sclass2"><content id="content3"/></subelt> </elt> <elt elt_class="class2"> <subelt subelt_class="sclass3"><content id="content4"/></subelt> </elt> <elt elt_class="class1"> <subelt subelt_class="sclass1"><content id="content5"/></subelt> </elt> </doc> <doc> <elt elt_class="class1"> <subelt subelt_class="sclass1"> <content id="content1"/> <content id="content2"/> <content id="content5"/> </subelt> <subelt subelt_class="sclass2"> <content id="content3"/> </subelt> </elt> <elt elt_class="class2"> <subelt subelt_class="sclass3"> <content id="content4"/> </subelt> </elt> </doc> <doc> <elt elt_class="class1"> <subelt subelt_class="sclass1"> <content id="content1"/> <content id="content2"/> </subelt> <subelt subelt_class="sclass2"> <content id="content3"/> </subelt> </elt> <elt elt_class="class2"> <subelt subelt_class="sclass3"> <content id="content4"/> </subelt> </elt> <elt elt_class="class1"> <subelt subelt_class="sclass1"> <content id="content5"/> </subelt> </elt> </doc>