Your requirements are a bit sketchy and I can see 2 things you might want to do in this case (once you get proper XML of course ;--):
#!/usr/bin/perl -w
use strict;
use XML::Twig;
$/="\n\n";
# tag => attribute we are interested in
# you could avoid having this global by putting it in the twig
my %att= ( elt => 'elt_class',
subelt => 'subelt_class',
);
my $doc = <DATA>; # the original data set
my $expected_sorted_doc = <DATA>; # sorted result
my $expected_merged_doc = <DATA>; # merged result
my $sorted_doc= sort_doc( $doc);
if( compact( $sorted_doc) eq compact( $expected_sorted_doc))
{ print "sorted doc generation OK\n"; }
else
{ print "sorted doc generation NOK: \n",
"expected:\n$expected_sorted_doc\n",
"found:\n$sorted_doc\n";
}
my $merged_doc= merge_doc( $doc);
if( compact( $merged_doc) eq compact( $expected_merged_doc))
{ print "merged doc generation OK\n"; }
else
{ print "merged doc generation NOK: \n",
"expected:\n$expected_merged_doc\n",
"found:\n$merged_doc\n";
}
# sort: for each relevant node (elt or subelt in this case) generate a
+ location
# key and move the content if the location already exists
sub sort_doc
{ my( $doc)= @_;
my $location={}; # location key => existing subelt element with t
+his location key
my $t= XML::Twig->new( twig_handlers => { elt => sub { sort_nod
+e( $location, @_); },
subelt => sub { sort_nod
+e( $location, @_); },
},
pretty_print => 'indented', # makes debuggi
+ng easier
);
$t->parse( $doc);
return $t->sprint;
}
sub sort_node
{ my( $location, $t, $node)= @_;
# compute the location key, which must describe uniquely the node
+category
my $location_key= location( $node);
# now see if we need to move the content
if( my $new_parent= $location->{$location_key})
{ # there is already an element with this location key
# move all content's there
foreach my $content ($node->children)
{ $content->move( last_child => $new_parent); }
# no need to keep the empty shell
$node->delete unless( $node->has_child);
}
else
{ # first time we see the location key, store the element in $lo
+cation
$location->{$location_key}= $node;
}
}
# the location describes a node category, nodes with the same location
+ should have the same parent
sub location
{ my( $node)= @_;
# a compact way to just join the values of the proper attributes o
+f the ancestors of the node
my $location= join( '-', grep {$_} map { $_->att( $att{$_->tag}) |
+| '' } (@{[$node->ancestors]}, $node));
#warn "location: $location\n";
return $location;
}
# merge doc
sub merge_doc
{ my( $doc)= @_;
my $t= XML::Twig->new( twig_handlers => { elt => \&merge_node,
subelt => \&merge_node,
},
pretty_print => 'indented',
);
$t->parse( $doc);
return $t->sprint;
}
sub merge_node
{ my( $t, $node)= @_;
my $potential_merger= $node->prev_elt( $node->tag) or return; # re
+turn if this is the first node of this type
if( location( $node) eq location( $potential_merger))
{ # bingo! we can merge the contents
foreach my $content ($node->children)
{ $content->move( last_child => $potential_merger); }
}
else
{ # this branch is not used for this test as we are working in m
+emory
# but this is where you could free the memory by dumping the p
+art of
# the tree that will no longer need to be updated
# $t->flush_up_to( $potential_merger);
}
$node->delete unless( $node->has_child);
}
sub compact
{ my( $doc)= @_;
$doc=~ s{^\s+}{}; # trim at the begining
$doc=~ s{\s+$}{}; # trim the end
$doc=~ s{>\s*<}{><}g; # trim spaces between tags
return $doc;
}
__DATA__
<doc>
<elt elt_class="class1">
<subelt subelt_class="sclass1"><content id="content1"/></subelt>
</elt>
<elt elt_class="class1">
<subelt subelt_class="sclass1"><content id="content2"/></subelt>
</elt>
<elt elt_class="class1">
<subelt subelt_class="sclass2"><content id="content3"/></subelt>
</elt>
<elt elt_class="class2">
<subelt subelt_class="sclass3"><content id="content4"/></subelt>
</elt>
<elt elt_class="class1">
<subelt subelt_class="sclass1"><content id="content5"/></subelt>
</elt>
</doc>
<doc>
<elt elt_class="class1">
<subelt subelt_class="sclass1">
<content id="content1"/>
<content id="content2"/>
<content id="content5"/>
</subelt>
<subelt subelt_class="sclass2">
<content id="content3"/>
</subelt>
</elt>
<elt elt_class="class2">
<subelt subelt_class="sclass3">
<content id="content4"/>
</subelt>
</elt>
</doc>
<doc>
<elt elt_class="class1">
<subelt subelt_class="sclass1">
<content id="content1"/>
<content id="content2"/>
</subelt>
<subelt subelt_class="sclass2">
<content id="content3"/>
</subelt>
</elt>
<elt elt_class="class2">
<subelt subelt_class="sclass3">
<content id="content4"/>
</subelt>
</elt>
<elt elt_class="class1">
<subelt subelt_class="sclass1">
<content id="content5"/>
</subelt>
</elt>
</doc>