Re: Removing duplicate subtrees from XML

Your requirements are a bit sketchy and I can see 2 things you might want to do in this case (once you get proper XML of course ;--):

get the minimun document with properly nested container elements, each "low-level" element being in the proper hierarchy. In general, unless the low-level elements are already ordered (in which case see below), you will need to load the entire data set in memory, which might be a problem if you are dealing with the Human Genome,
just try to merge adjacent nodes, that is if a node can be merged with the previous one, then be it, otherwise don't try to merge it with nodes that are further apart. This can be performed by keeping just parts of the data in memory, so it can be performed on much bigger data sets. This should give the same result as the previous option if the nodes are sorted.

Here is a piece of code using the usual suspect that does both (on a made up XML data set, but adapting it to your data should be really easy):

#!/usr/bin/perl -w
use strict;

use XML::Twig;

$/="\n\n";

# tag => attribute we are interested in
# you could avoid having this global by putting it in the twig
my %att= ( elt    => 'elt_class',
           subelt => 'subelt_class',
         );

my $doc                 = <DATA>;  # the original data set
my $expected_sorted_doc = <DATA>;  # sorted result  
my $expected_merged_doc = <DATA>;  # merged result

my $sorted_doc= sort_doc( $doc);
if( compact( $sorted_doc) eq compact( $expected_sorted_doc))
  { print "sorted doc generation OK\n"; }
else
  { print "sorted doc generation NOK: \n",
          "expected:\n$expected_sorted_doc\n",
          "found:\n$sorted_doc\n";
  }

my $merged_doc= merge_doc( $doc);
if( compact( $merged_doc) eq compact( $expected_merged_doc))
  { print "merged doc generation OK\n"; }
else
  { print "merged doc generation NOK: \n",
          "expected:\n$expected_merged_doc\n",
          "found:\n$merged_doc\n";
  }


# sort: for each relevant node (elt or subelt in this case) generate a
+ location
# key and move the content if the location already exists

sub sort_doc
  { my( $doc)= @_;
    my $location={};  # location key => existing subelt element with t
+his location key
    my $t= XML::Twig->new( twig_handlers => { elt    => sub { sort_nod
+e( $location, @_); },
                                              subelt => sub { sort_nod
+e( $location, @_); },
                                            },
                           pretty_print => 'indented', # makes debuggi
+ng easier
                         );
    $t->parse( $doc);
    return $t->sprint;
  } 

sub sort_node
  { my( $location, $t, $node)= @_;

    # compute the location key, which must describe uniquely the node 
+category
    my $location_key= location( $node);

    # now see if we need to move the content
    if( my $new_parent= $location->{$location_key})
      { # there is already an element with this location key
        # move all content's there
        foreach my $content ($node->children)
          { $content->move( last_child => $new_parent); }
        # no need to keep the empty shell
        $node->delete unless( $node->has_child);
      }
    else
      { # first time we see the location key, store the element in $lo
+cation
        $location->{$location_key}= $node;
      }
  }

# the location describes a node category, nodes with the same location
+ should have the same parent
sub location
  { my( $node)= @_;
    # a compact way to just join the values of the proper attributes o
+f the ancestors of the node
    my $location= join( '-', grep {$_} map { $_->att( $att{$_->tag}) |
+| '' } (@{[$node->ancestors]}, $node));
    #warn "location: $location\n";
    return $location;
  }


# merge doc 

sub merge_doc
  { my( $doc)= @_;
    my $t= XML::Twig->new( twig_handlers => { elt    => \&merge_node,
                                              subelt => \&merge_node,
                                            },
                           pretty_print => 'indented',
                         );
    $t->parse( $doc);
    return $t->sprint;
  } 

sub merge_node
  { my( $t, $node)= @_;
    my $potential_merger= $node->prev_elt( $node->tag) or return; # re
+turn if this is the first node of this type 
    if( location( $node) eq location( $potential_merger))
      { # bingo! we can merge the contents
        foreach my $content ($node->children)
          { $content->move( last_child => $potential_merger); }
      }
    else
      { # this branch is not used for this test as we are working in m
+emory
        # but this is where you could free the memory by dumping the p
+art of
        # the tree that will no longer need to be updated
        # $t->flush_up_to( $potential_merger);
      }

    $node->delete unless( $node->has_child);
  }

sub compact
  { my( $doc)= @_;
    $doc=~ s{^\s+}{};     # trim at the begining
    $doc=~ s{\s+$}{};     # trim the end
    $doc=~ s{>\s*<}{><}g; # trim spaces between tags
    return $doc;
  }

__DATA__
<doc>
  <elt elt_class="class1">
    <subelt subelt_class="sclass1"><content id="content1"/></subelt>
  </elt>
  <elt elt_class="class1">
    <subelt subelt_class="sclass1"><content id="content2"/></subelt>
  </elt>
  <elt elt_class="class1">
    <subelt subelt_class="sclass2"><content id="content3"/></subelt>
  </elt>
  <elt elt_class="class2">
    <subelt subelt_class="sclass3"><content id="content4"/></subelt>
  </elt>
  <elt elt_class="class1">
    <subelt subelt_class="sclass1"><content id="content5"/></subelt>
  </elt>
</doc>

<doc>
  <elt elt_class="class1">
    <subelt subelt_class="sclass1">
      <content id="content1"/>
      <content id="content2"/>
      <content id="content5"/>
    </subelt>
    <subelt subelt_class="sclass2">
      <content id="content3"/>
    </subelt>
  </elt>
  <elt elt_class="class2">
    <subelt subelt_class="sclass3">
      <content id="content4"/>
    </subelt>
  </elt>
</doc>

<doc>
  <elt elt_class="class1">
    <subelt subelt_class="sclass1">
      <content id="content1"/>
      <content id="content2"/>
    </subelt>
    <subelt subelt_class="sclass2">
      <content id="content3"/>
    </subelt>
  </elt>
  <elt elt_class="class2">
    <subelt subelt_class="sclass3">
      <content id="content4"/>
    </subelt>
  </elt>
  <elt elt_class="class1">
    <subelt subelt_class="sclass1">
      <content id="content5"/>
    </subelt>
  </elt>
</doc>
[download]

Comment on Re: Removing duplicate subtrees from XML Download Code


"be consistent"
	PerlMonks