http://qs321.pair.com?node_id=217196


in reply to Removing duplicate subtrees from XML

Your requirements are a bit sketchy and I can see 2 things you might want to do in this case (once you get proper XML of course ;--):

Here is a piece of code using the usual suspect that does both (on a made up XML data set, but adapting it to your data should be really easy):

#!/usr/bin/perl -w use strict; use XML::Twig; $/="\n\n"; # tag => attribute we are interested in # you could avoid having this global by putting it in the twig my %att= ( elt => 'elt_class', subelt => 'subelt_class', ); my $doc = <DATA>; # the original data set my $expected_sorted_doc = <DATA>; # sorted result my $expected_merged_doc = <DATA>; # merged result my $sorted_doc= sort_doc( $doc); if( compact( $sorted_doc) eq compact( $expected_sorted_doc)) { print "sorted doc generation OK\n"; } else { print "sorted doc generation NOK: \n", "expected:\n$expected_sorted_doc\n", "found:\n$sorted_doc\n"; } my $merged_doc= merge_doc( $doc); if( compact( $merged_doc) eq compact( $expected_merged_doc)) { print "merged doc generation OK\n"; } else { print "merged doc generation NOK: \n", "expected:\n$expected_merged_doc\n", "found:\n$merged_doc\n"; } # sort: for each relevant node (elt or subelt in this case) generate a + location # key and move the content if the location already exists sub sort_doc { my( $doc)= @_; my $location={}; # location key => existing subelt element with t +his location key my $t= XML::Twig->new( twig_handlers => { elt => sub { sort_nod +e( $location, @_); }, subelt => sub { sort_nod +e( $location, @_); }, }, pretty_print => 'indented', # makes debuggi +ng easier ); $t->parse( $doc); return $t->sprint; } sub sort_node { my( $location, $t, $node)= @_; # compute the location key, which must describe uniquely the node +category my $location_key= location( $node); # now see if we need to move the content if( my $new_parent= $location->{$location_key}) { # there is already an element with this location key # move all content's there foreach my $content ($node->children) { $content->move( last_child => $new_parent); } # no need to keep the empty shell $node->delete unless( $node->has_child); } else { # first time we see the location key, store the element in $lo +cation $location->{$location_key}= $node; } } # the location describes a node category, nodes with the same location + should have the same parent sub location { my( $node)= @_; # a compact way to just join the values of the proper attributes o +f the ancestors of the node my $location= join( '-', grep {$_} map { $_->att( $att{$_->tag}) | +| '' } (@{[$node->ancestors]}, $node)); #warn "location: $location\n"; return $location; } # merge doc sub merge_doc { my( $doc)= @_; my $t= XML::Twig->new( twig_handlers => { elt => \&merge_node, subelt => \&merge_node, }, pretty_print => 'indented', ); $t->parse( $doc); return $t->sprint; } sub merge_node { my( $t, $node)= @_; my $potential_merger= $node->prev_elt( $node->tag) or return; # re +turn if this is the first node of this type if( location( $node) eq location( $potential_merger)) { # bingo! we can merge the contents foreach my $content ($node->children) { $content->move( last_child => $potential_merger); } } else { # this branch is not used for this test as we are working in m +emory # but this is where you could free the memory by dumping the p +art of # the tree that will no longer need to be updated # $t->flush_up_to( $potential_merger); } $node->delete unless( $node->has_child); } sub compact { my( $doc)= @_; $doc=~ s{^\s+}{}; # trim at the begining $doc=~ s{\s+$}{}; # trim the end $doc=~ s{>\s*<}{><}g; # trim spaces between tags return $doc; } __DATA__ <doc> <elt elt_class="class1"> <subelt subelt_class="sclass1"><content id="content1"/></subelt> </elt> <elt elt_class="class1"> <subelt subelt_class="sclass1"><content id="content2"/></subelt> </elt> <elt elt_class="class1"> <subelt subelt_class="sclass2"><content id="content3"/></subelt> </elt> <elt elt_class="class2"> <subelt subelt_class="sclass3"><content id="content4"/></subelt> </elt> <elt elt_class="class1"> <subelt subelt_class="sclass1"><content id="content5"/></subelt> </elt> </doc> <doc> <elt elt_class="class1"> <subelt subelt_class="sclass1"> <content id="content1"/> <content id="content2"/> <content id="content5"/> </subelt> <subelt subelt_class="sclass2"> <content id="content3"/> </subelt> </elt> <elt elt_class="class2"> <subelt subelt_class="sclass3"> <content id="content4"/> </subelt> </elt> </doc> <doc> <elt elt_class="class1"> <subelt subelt_class="sclass1"> <content id="content1"/> <content id="content2"/> </subelt> <subelt subelt_class="sclass2"> <content id="content3"/> </subelt> </elt> <elt elt_class="class2"> <subelt subelt_class="sclass3"> <content id="content4"/> </subelt> </elt> <elt elt_class="class1"> <subelt subelt_class="sclass1"> <content id="content5"/> </subelt> </elt> </doc>

Replies are listed 'Best First'.
Re: Re: Removing duplicate subtrees from XML
by matth (Monk) on Dec 09, 2002 at 16:16 UTC
    Thanks for this code. It works very well with your data. Unfortunately it does not seem to work when attribute data is missing. Is there an easy way to overcome this?