# there's some looseness here to account for people's mangling # of the English language entire_input ::= numbers 'End_Of_Input' numbers ::= [ NEGATIVE ] millions | thousands | hundreds | tens | teens | ones [ 'POINT' teens | ones { ones } ] millions ::= [ hundreds | teens | tens | ones ] 'MILLION' [ thousands | hundreds | teens | tens | ones ] thousands ::= [ hundreds | teens | tens | ones ] 'THOUSAND' [ hundreds | teens | tens | ones ] hundreds ::= [ teens | tens | ones ] 'HUNDRED' [ teens | tens | ones ] tens ::= twenty | thirty | forty | fifty | sixty | seventy | eighty | ninety teens ::= ten | eleven | twelve | thirteen | fourteen | fifteen | sixteen | seventeen | eighteen | nineteen ones ::= zero | one | two | three | four | five | six | seven | eight | nine #### package Number::FromWord; use warnings; use strict; use HOP::Stream qw/iterator_to_stream/; use HOP::Lexer qw/make_lexer/; use HOP::Parser qw/:all/; use Regexp::Assemble; use Scalar::Util qw/looks_like_number/; use base 'Exporter'; our @EXPORT_OK = 'word_to_num'; my %one_num_for = ( zero => 0, one => 1, two => 2, three => 3, four => 4, five => 5, six => 6, seven => 7, eight => 8, nine => 9, ); my %teen_num_for = ( ten => 10, eleven => 11, twelve => 12, thirteen => 13, fourteen => 14, fifteen => 15, sixteen => 16, seventeen => 17, eighteen => 18, nineteen => 19, ); my %ten_num_for = ( twenty => 20, thirty => 30, forty => 40, fifty => 50, sixty => 60, seventy => 70, eighty => 80, ninety => 90, ); # # set up the lexer # my $ones_re = Regexp::Assemble->new; foreach my $number ( keys %one_num_for ) { $ones_re->add($number); } my $teens_re = Regexp::Assemble->new; foreach my $number ( keys %teen_num_for ) { $teens_re->add($number); } my $tens_re = Regexp::Assemble->new; foreach my $number ( keys %ten_num_for ) { $tens_re->add($number); } my @tokens = ( [ 'AND', qr/\band\b/i, sub { } ], [ 'MILLION', qr/million/i, \&number ], [ 'THOUSAND', qr/thousand/i, \&number ], [ 'HUNDRED', qr/hundred/i, \&number ], [ 'TENS', qr/$tens_re+/i, \&number ], [ 'TEENS', qr/$teens_re+/i, \&number ], [ 'ONES', qr/$ones_re+/i, \&number ], [ 'NEGATIVE', qr/negative/i ], [ 'POINT', qr/point/i ], [ 'SPACE', qr/[ -]/i, sub { } ], ); sub number { my ( $label, $number_as_word ) = @_; return [ $label, lc $number_as_word ]; } sub word_to_num { my @word = shift; my $lexer = make_lexer( sub { shift @word }, @tokens ); return parse($lexer); } # # set up the parser # my ( $numbers, $negative, $point, $ones, $teens, $tens, $hundreds, $thousands, $millions ); my $Point = parser { $point->(@_) }; my $Ones = parser { $ones->(@_) }; my $Teens = parser { $teens->(@_) }; my $Tens = parser { $tens->(@_) }; my $Hundreds = parser { $hundreds->(@_) }; my $Thousands = parser { $thousands->(@_) }; my $Millions = parser { $millions->(@_) }; my $Negative = parser { $negative->(@_) }; my $Numbers = parser { $numbers->(@_) }; # entire_input ::= numbers 'End_Of_Input' my $entire_input = T( concatenate( $Numbers, \&End_of_Input ), sub { shift } ); # numbers ::= [ NEGATIVE ] millions | thousands | hundreds | tens | teens | ones # [ 'POINT' teens | ones { ones } ] $numbers = T( concatenate( optional( lookfor('NEGATIVE') ), alternate( $Millions, $Thousands, $Hundreds, $Tens, $Teens, $Ones ), optional( concatenate( absorb( lookfor('POINT') ), alternate( $Teens, concatenate( $Ones, star($Ones) ) ) ) ), ), sub { my ( $neg, $num, $point ) = @_; $point = $point->[0]; if ( looks_like_number($point) ) { # we have teens $num .= ".$point"; } elsif ( defined $point->[0] ) { # we have ones my $decimal = $point->[0]; if ( my @points = @{ $point->[1] } ) { $decimal .= join '', @points; } $num .= ".$decimal"; } if (@$neg) { $num *= -1; } return $num; } ); # millions ::= [ hundreds | teens | tens | ones ] # 'MILLION' # [ thousands | hundreds | teens | tens | ones ] $millions = T( concatenate( optional( alternate( $Hundreds, $Teens, $Tens, $Ones ) ), absorb( lookfor('MILLION') ), optional( alternate( $Thousands, $Hundreds, $Teens, $Tens, $Ones ) ) ), sub { my ( $million, $thousands ) = @_; $million = $million->[0] || 1; $thousands = $thousands->[0] || 0; return $million . sprintf "%06d", $thousands; } ); # thousands ::= [ hundreds | teens | tens | ones ] # 'THOUSAND' # [ hundreds | teens | tens | ones ] $thousands = T( concatenate( optional( alternate( $Hundreds, $Teens, $Tens, $Ones ) ), absorb( lookfor('THOUSAND') ), optional( alternate( $Hundreds, $Teens, $Tens, $Ones ) ) ), sub { my ( $thousand, $hundreds ) = @_; $thousand = $thousand->[0] || 1; $hundreds = $hundreds->[0] || 0; return $thousand . sprintf "%03d", $hundreds; } ); # hundreds ::= [ teens | tens | ones ] 'HUNDRED' [ teens | tens | ones ] $hundreds = T( concatenate( optional( alternate( $Teens, $Tens, $Ones ) ), absorb( lookfor('HUNDRED') ), optional( alternate( $Teens, $Tens, $Ones ) ) ), sub { my ( $hundred, $tens ) = @_; $hundred = $hundred->[0] || 1; $tens = $tens->[0] || 0; return $hundred . sprintf "%02d", $tens; } ); # tens ::= 'TENS' [ ones ] $tens = T( concatenate( lookfor('TENS'), optional($Ones) ), sub { my ( $ten_word, $one_num ) = @_; my $ten_num = $ten_num_for{$ten_word}; $ten_num += $one_num->[0] || 0; return $ten_num; } ); # teens ::= 'TEENS' $teens = T( lookfor('TEENS'), sub { $teen_num_for{ $_[0] } } ); # ones ::= 'ONES' $ones = T( lookfor('ONES'), sub { $one_num_for{ $_[0] } } ); sub parse { my $stream = iterator_to_stream(shift); my ( $results, $remainder ) = eval { $entire_input->($stream) }; return $results; } 1; #### #!perl #use Test::More tests => 1; use Test::More qw/no_plan/; BEGIN { chdir 't' if -d 't'; unshift @INC, '../lib'; use_ok( 'Number::FromWord', 'word_to_num' ); } can_ok __PACKAGE__, 'word_to_num'; my @words = ( 'three' => 3, 'fourteen' => 14, 'forty' => 40, 'forty-two' => 42, 'ninety nine' => 99, 'hundred' => 100, 'hundred and twenty' => 120, 'hundred and two' => 102, 'negative hundred and two' => -102, 'hundred ninety-nine' => 199, 'hundred nineteen' => 119, 'two hundred and three' => 203, 'twelve hundred and seventy three', => 1273, 'thousand' => 1000, 'seven hundred thirty-three thousand five hundred and twenty-nine', 733529, 'seventy three point two four five' => 73.245, 'three point seventeen' => 3.17, 'seven million' => 7000000, 'two hundred and ninety six million four hundred and twenty-two thousand five hundred and seventy eight', 296_422_578, 'two hundred and ninety six million four hundred and twenty-two thousand five hundred and seventy eight point three four seven', 296422578.347, ); while (@words) { my ( $word, $num ) = splice @words, 0, 2; is word_to_num($word), $num, "... $word should be $num"; }