1: #!/usr/bin/perl -w
2: #
3: # The challenge - to write a function which given a list of non
4: # negative integers returns a regexp which will match those and only
5: # those numbers.
6: #
7: # Try the test program like this :-
8: #
9: #-- Simple numeric list
10: # ./numeric-list-to-regexp.pl 0..255
11: # \d|(?:[1-9]|1\d|2[0-4])\d|25[0-5]
12: #-- Sparse numeric list
13: # ./numeric-list-to-regexp.pl 0..11,15,21..33
14: # \d|1[015]|2[1-9]|3[0-3]
15: #-- Numbers divisble by 3
16: # ./numeric-list-to-regexp.pl 'map {$_ * 3} 0..33'
17: # [0369]|1[258]|2[147]|3[0369]|4[258]|5[147]|6[0369]|7[258]|8[147]|9[0369]
18: #-- All prime numbers < 100
19: # ./numeric-list-to-regexp.pl 'use Quantum::Superpositions; grep { $_ % all(2..sqrt($_)+1) != 0 } (1..100)'
20: # [1357]|1[1379]|2[39]|3[17]|4[137]|5[39]|6[17]|7[139]|8[39]|97
21:
22: use strict;
23:
24: die "Pass some perl in please, eg 1..20 or 1, 4, 5"
25: unless @ARGV;
26: my @list = eval "@ARGV";
27: my $re = numeric_list_to_regexp( @list );
28: check_numeric_list_to_regexp($re, \@list);
29: exit;
30:
31: ############################################################
32: # Converts a list of numbers into a regexp which will
33: # match those numbers and those numbers only.
34: #
35: # It does this by constructing a regexp and then progressively
36: # simplifying it - recursively if necessary. It uses regexp's to
37: # transform the regexp of course! This is almost a general purpose
38: # regexp optimiser.
39: #
40: # We assume that the caller will bound the regexp with ^( and )$ or
41: # \D(?: and )\D or whatever takes their fancy
42: #
43: # Set $DEBUG to 1 if you want to print lots of info and check the
44: # regexp works after each transformation.
45: #
46: # Warning: code contains heavy regexps - lift with care ;-)
47: # Caution: Code may use exponential time and space ;-(
48: ############################################################
49:
50: sub numeric_list_to_regexp
51: {
52: my (@list) = @_;
53: my $DEBUG = 0;
54:
55: # The basic regexp with |'s on the start and end to make our life
56: # easier
57: # Should uniq here too...
58: my $re = "|" . join("|", sort { $a <=> $b } @list) . "|";
59:
60: # Transform the regexp in stages, making sure at all time the
61: # regexp is correct if $DEBUG is set
62:
63: check_numeric_list_to_regexp($re, \@list) if $DEBUG;
64:
65: # 1) Concatenate all the single characters a|b|c into [abc]'s
66: $re =~ s{ \| ( \d (?: \| \d )+ ) (?= \| ) }
67: {
68: my ( $string ) = ( $1 );
69: print "string = '$string'\n" if $DEBUG;
70: "|[" . join("", split m{\|}, $string) . "]"
71: }gex;
72:
73: check_numeric_list_to_regexp($re, \@list) if $DEBUG;
74:
75: # 2) Find all the Xa|Xb|Xc and change to X(?:a|b|c)]
76: $re =~ s{ \| ( (\d+)(\d+) (?: \| \2\d+ )+ ) (?= \| ) }
77: {
78: my ( $string, $prefix ) = ( $1, $2 );
79: print "prefix = '$prefix', string = '$string'\n" if $DEBUG;
80: "|$prefix\(?:" . join("|", map { substr($_, length $prefix) } split m{\|}, $string) . ")"
81: }gex;
82:
83: check_numeric_list_to_regexp($re, \@list) if $DEBUG;
84:
85: # 3) Find all the aX|bX|cX and change to (a|b|c)X]
86: $re =~ s{ \| ( (\d+?)(.+) (?: \| \d+\3 )+ ) (?= \| ) }
87: {
88: my ( $string, $postfix ) = ( $1, $3 );
89: print "postfix = '$postfix', string = '$string'\n" if $DEBUG;
90: $string =~ s{ \Q$postfix\E (?= \| | $ ) }{}gx;
91: print "...string = '$string'\n" if $DEBUG;
92: "|(?:$string)$postfix"
93: }gex;
94:
95: check_numeric_list_to_regexp($re, \@list) if $DEBUG;
96:
97: # 4) Change (?:a|b|c) into [abc]
98: $re =~ s{ \(\?\: ( \d (?: \| \d )+ ) \) }
99: {
100: my ( $string ) = ( $1 );
101: print "string = '$string'\n" if $DEBUG;
102: "[" . join("", split m{\|}, $string) . "]"
103: }gex;
104:
105: check_numeric_list_to_regexp($re, \@list) if $DEBUG;
106:
107: # 5) Optimise [abc] into [a-c] or \d
108: # This doesn't optimise all the cases only the complete continuous
109: # range in the [ ... ]
110: $re =~ s{ \[ ( \d{3,} ) \] }
111: {
112: my ( $string, $start, $end ) = ( $1, substr($1, 0, 1), substr($1, -1, 1) );
113: print "match ['$string']...range [$start-$end]\n" if $DEBUG;
114: if ($end - $start + 1 == length $string)
115: {
116: $start == 0 && $end == 9 ? '\d' : "[$start-$end]";
117: }
118: else
119: {
120: "[$string]";
121: }
122: }gex;
123:
124: check_numeric_list_to_regexp($re, \@list) if $DEBUG;
125:
126: # 6) recurse on any digit sequences left (?:ab|cd|ef)
127: $re =~ s{ \(\?\: ( \d+ (?: \| \d+ )+ ) \) }
128: {
129: my ( $string ) = ( $1 );
130: print "**** Recursing on '$string'\n" if $DEBUG;
131: "(?:" . numeric_list_to_regexp(split m{\|}, $string) . ")";
132: }gex;
133:
134: check_numeric_list_to_regexp($re, \@list) if $DEBUG;
135:
136: # 7) fix the | on each end
137: $re =~ s{^\|}{};
138: $re =~ s{\|$}{};
139:
140: print "**** Returning '$re'\n" if $DEBUG;
141:
142: return $re;
143: }
144:
145: ############################################################
146: # Test subroutine to check the regexp performs as advertised
147: #
148: # Call with a regexp and a reference to a list of numbers
149: # it will check that the regexp matches all the list and
150: # doesn't match some others (obviously it can't check them
151: # all can it!) die-ing on any failures.
152: ############################################################
153:
154: sub check_numeric_list_to_regexp
155: {
156: my ($re, $list) = @_;
157: my %list = map { $_ => 1 } @$list;
158: print "Re: $re\n";
159:
160: # Put some other test cases in
161: $list{$_} += 0 for (0..999);
162: $list{int(rand()*1000)} += 0 for (0..99);
163: $list{int(rand()*10000)} += 0 for (0..99);
164: $list{int(rand()*100000)} += 0 for (0..99);
165:
166: # print join(", ", map {"$_ => $list{$_}"} keys %list), "\n";
167: $re =~ s{^\|}{}; # fix | on start and end
168: $re =~ s{\|$}{};
169: $re = "^(?:$re)\$"; # put in ^(?: ... )$
170: $re = qr{$re}; # compile the regexp for speed
171:
172: # Check all the keys in list against the regexp - some should pass
173: # and some should fail
174: for my $item (keys %list)
175: {
176: if ($list{$item} xor ($item =~ /$re/))
177: {
178: die "*** FAILED '$re' for '$item' ShouldMatch: $list{$item}\n";
179: }
180: else
181: {
182: # print "OK '$re' for '$item'\n";
183: }
184: }
185: }