Chinese number to words script for invoices

By Avadhut Phatarpekar

If you are an organization with presence in China, you’d know that invoicing norms there are pretty strict. Invoices generated, whether online or manual, need to follow a set standard.

One of the problems I faced with developing a part of our application that generates invoices for our Chinese clients is the number to words conversion. In China, the invoices, in addition to the actual figure on the invoice, should also have it mentioned in words. For example, USD 123 would become One hundred and twenty-three. Except, the text has to be Chinese.

I hunted a lot for ready-made classes/scripts that would do this in a Chinese script or a variant thereof, but I did not find anything written in PHP. We did, though, find something in the CPAN Perl libraries that seemed pretty workable, and we decided to run with it. So, below, I’ve outlined the process, the script, and the function that will allow you to achieve this (I will be assuming that the target system is Ubuntu Server 10.04 LTS).

Step I: Install the lib lingua package

sudo apt-get install liblingua-*

Step II: Install CPAN

sudo perl -MCPAN -e shell

Step III: Inside the CPAN shell, install YAML

install YAML

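Step IV: Still inside the CPAN shell, tell CPAN to follow prerequisites automatically, save that setting, and upgrade the installed modules once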

o conf prerequisites_policy follow
o conf commit
upgrade

Step V: Install lingua libraries for numbers

install Lingua::Any::Numbers

Step VI: Now create a Perl module file. For the purpose of this example, let’s call it ChineseNumbersU8.pm. The code has been written by Erik Peterson and you can download it from http://www.mandarintools.com/numbers.html. In case you don’t find it there, though, you can paste the following code in the file

# -*- coding: utf-8; -*-


package ChineseNumbers;

require Exporter;
use strict;
use Lingua::EN::Numbers qw(num2en num2en_ordinal);

use subs qw{EnglishToChineseNumber ChineseToEnglishNumber chinese_output english_output};

# Usage:

#

# use ChineseNumbers;

#

# ChineseNumbers->EnglishToChineseNumber(enumber, [output_type])

#   enumber is an integer

#   output_type (which is optional) can be

#     trad       : Output with traditional Chinese characters

#     formaltrad : Output as formal numbers with traditional characters

#     simp       : Output using simplified Chinese characters

#     formalsimp : Output as formal numbers in simplified characters

#     unicodehex : Output as 4-digit Unicode hex blocks (not implemented in this version; produces the same output as trad)

#     pinyin     : Output as Hanyu Pinyin

#     jyutpin    : Output as Cantonese jyutpin romanization

#     yalecant   : Output as Cantonese Yale romanization

#    The default is trad

#

# ChineseNumbers->ChineseToEnglishNumber(cnumber, [english_type])

#   cnumber is a string in UTF-8

#   english_type is 

#      arabic    : plain Arabic numerals

#      comma     : plain Arabic numbers with commas

#      words     : written out using English words

#

# ChineseNumbers->chinese_output([option])

#   Set the default output type used by EnglishToChineseNumber

#    option can be any of the output options for EnglishToChineseNumber

#    If no arguments, returns the current default

#

# ChineseNumbers->english_output([option])

#   Set the default output type used by ChineseToEnglishNumber

#    option can be any of the output options for ChineseToEnglishNumber

#    If no arguments, returns the current default

#




BEGIN { }

my $default_outputtype = "trad";

my $default_englishtype = "arabic";

my $MINUS = "負";
my $DECIMAL = "點";

my @digits = ("零", "一", "二", "三", "四", "五", "六", "七", "八", "九"); 

my %digits = ("0", 0, "0", 0, "零", 0, "〇", 0,	
	      "1", 1, "1", 1, "一", 1, "壹", 1,
	      "2", 2, "2", 2, "二", 2, "貳", 2, "贰", 2, "兩", 2, "两", 2, 
	      "3", 3, "3", 3, "三", 3, "參", 3, "叄", 3, "叁", 3, 
	      "4", 4, "4", 4, "四", 4, "肆", 4,
	      "5", 5, "5", 5, "五", 5, "伍", 5,
	      "6", 6, "6", 6, "六", 6, "陸", 6, "陆", 6,
	      "7", 7, "7", 7, "七", 7, "柒", 7,
	      "8", 8, "8", 8, "八", 8, "捌", 8,
	      "9", 9, "9", 9, "九", 9, "玖", 9); 

my @beforeWan = ("十", "百", "千"); 

my %beforeWan = ("十", 10, "拾", 10,
		 "百", 100, "佰", 100,
		 "千", 1000, "仟", 1000); 


my @afterWan = ("", "萬", "億", "兆", "京"); 

my %afterWan = ("萬", 10000, "万", 10000,
		"億", 100000000, 	"亿", 100000000,
		"兆", 1000000000000,
		"京", 10000000000000000); 


my $ALTTWO = "兩";
my $TEN = 10;

my %trad2simp = ("負" => "负", 
		 "點" => "点",
		 "零" => "零", 
		 "一" => "一",
		 "二" => "二",
		 "三" => "三", 
		 "四" => "四",
		 "五" => "五",
		 "六" => "六",
		 "七" => "七",
		 "八" => "八",
		 "九" => "九",
		 "十" => "十",
		 "百" => "百",
		 "千" => "千", 
		 "萬" => "万",
		 "億" => "亿",
		 "兆" => "兆", 
		 "兩" => "两",
		 "點" => "点");


my %trad2formal = ("負" => "負", 
		   "點" => "點",
		   "零" => "零", 
		   "一" => "壹",
		   "二" => "貳",
		   "三" => "參", 
		   "四" => "肆",
		   "五" => "伍",
		   "六" => "陸",
		   "七" => "柒",
		   "八" => "捌",
		   "九" => "玖",
		   "十" => "拾",
		   "百" => "佰",
		   "千" => "仟", 
		   "萬" => "萬",
		   "億" => "億",
		   "兆" => "兆", 
		   "兩" => "兩",
		   "點" => "點");


my %trad2formalsimp = ("負" => "负", 
		       "點" => "点",
		       "零" => "零", 
		       "一" => "壹",
		       "二" => "贰",
		       "三" => "叁", 
		       "四" => "肆",
		       "五" => "伍",
		       "六" => "陆",
		       "七" => "柒",
		       "八" => "捌",
		       "九" => "玖",
		       "十" => "拾",
		       "百" => "佰",
		       "千" => "仟", 
		       "萬" => "万",
		       "億" => "亿",
		       "兆" => "兆", 
		       "兩" => "两");



my %trad2pinyin = ("負" => "fu4", 
		   "點" => "dian3",
		   "零" => "ling2", 
		   "一" => "yi1",
		   "二" => "er4",
		   "三" => "san1", 
		   "四" => "si4",
		   "五" => "wu3",
		   "六" => "liu4",
		   "七" => "qi1",
		   "八" => "ba1",
		   "九" => "jiu3",
		   "十" => "shi2",
		   "百" => "bai3",
		   "千" => "qian1", 
		   "萬" => "wan4",
		   "億" => "yi4",
		   "兆" => "zhao4", 
		   "兩" => "liang3");

my %trad2yalecant = ("負" => "fu", 
		     "點" => "dim2",
		     "零" => "ling2", 
		     "一" => "yat",
		     "二" => "yih7",
		     "三" => "saam1", 
		     "四" => "sei5",
		     "五" => "ng4",
		     "六" => "luhk",
		     "七" => "chat1",
		     "八" => "baat1",
		     "九" => "gao3",
		     "十" => "sap7",
		     "百" => "baak5",
		     "千" => "chin1", 
		     "萬" => "maahn",
		     "億" => "yik1",
		     "兆" => "siu", 
		     "兩" => "leung4");


my %trad2jyutpin = ("負" => "fu6", 
		    "點" => "dim4",
		    "零" => "ling4", 
		    "一" => "jat1",
		    "二" => "ji6",
		    "三" => "saam1", 
		    "四" => "sei3",
		    "五" => "ng5",
		    "六" => "luk6",
		    "七" => "cat1",
		    "八" => "baat3",
		    "九" => "gau2",
		    "十" => "sap6",
		    "百" => "baak3",
		    "千" => "cin1", 
		    "萬" => "maan6",
		    "億" => "jik1",
		    "兆" => "siu6", 
		    "兩" => "loeng5");


sub new {
    return bless {};
}


# The heart of the program.  Does the actual conversion


sub EnglishToChineseNumber {
    my($self) = shift;
    my($enumber) = shift;
    my($outputtype) = shift;

    if ($outputtype eq "") {
	$outputtype = $default_outputtype;
    }

    $outputtype = lc($outputtype);

#    print "Output type : $outputtype\n";


    my(@powers) = ();
    my($power) = 0;
    my($value) = 0;
    my($negative) = 0;     # is it a negative integer?

    my($inzero) = 0;       # are we in a stretch or 1 or more zeros (only add one zero for the stretch)

    my($canaddzero) = 0;   # only add a zero if there's something non-zero on both sides of it

    my($cnumber) = "";     # the final result

    my($remainder) = "";

    # Remove all non-digits


    $enumber =~ s/[^0-9\.-]//g;

    # If zero, just return zero

    if ($enumber == 0) {
	return $digits[0];
    }

    # Check if it's negative, set the negative flag and make it positive

    if ($enumber < 0) {
	$negative = 1;
	$enumber = -$enumber;
    }

    if ($enumber =~ m/([0-9]*)\.([0-9]+)/) {
	$remainder = $2;
	$enumber = $1;
    }

    # Get the value of the coefficient for each power of ten

    while ($TEN ** $power <= $enumber) {
	$value = ($enumber % ($TEN** ($power+1)))/($TEN**$power);
	$powers[$power] = $value;

	# Subtract out the current power's coefficient and increase the power

	$enumber -= $enumber % ($TEN**($power+1));
	$power++;
    }

    my($i);

    # Take the decomposition of the number for above and generate the Chinese equivalent

    for ($i = 0; $i < $power; $i++) {
	#System.out.println("10^" + i + ":\t" + powers[i]);

	if (($i % 4) == 0) {  # Reached the next four powers up level

	    if ($powers[$i] != 0) {
		$inzero = 0;
		$canaddzero = 1;
		$cnumber =  $digits[$powers[$i]] . $afterWan[$i/4] . $cnumber;
	    } else {
		# Check that something in the next three powers is non-zero before adding 

		if ((($i+3 < $power) && $powers[$i+3] != 0) ||
		    (($i+2 < $power) && $powers[$i+2] != 0) ||
		    (($i+1 < $power) && $powers[$i+1] != 0)) 
		{
		    $cnumber = $afterWan[$i/4] . $cnumber;
		    $canaddzero = 0; # added

		}
	    }
	} else {  # Add one, tens, hundreds, or thousands place for each level

	    if ($powers[$i] != 0) {
		$inzero = 0;
		$canaddzero = 1;
		if ($power == 2 && $i == 1 && $powers[$i] == 1) {  # No 一 with 10 through 19

		    $cnumber = $beforeWan[($i % 4)-1] . $cnumber;
		    #} else if ((i%4 = 3) && powers[i] == 2) {  # when to use liang3 vs. er4

		    #cnumber.insert(0, ALTTWO + beforeWan[(i%4)-1]);

		} else {
		    $cnumber = $digits[$powers[$i]] . $beforeWan[($i%4)-1] . $cnumber;
		}
	    } else {
		if ($canaddzero == 1 && $inzero == 0) { # Only insert one 零 for all consecutive zeroes

		    $inzero = 1;
		    $cnumber = $digits[$powers[$i]] . $cnumber;
		}
	    }
	}
    }

    if ($remainder ne "") {
	$cnumber .= $DECIMAL;
	for ($i = 0; $i < length($remainder); $i++) {
	    $cnumber .= $digits[substr($remainder, $i, 1)];
	}
    }

    # Add the negative character

    if ($negative == 1) {
	$cnumber = $MINUS . $cnumber;
    }
 
    my($result, $j);
    if ($outputtype eq "trad") {
	$result = $cnumber;
    } elsif ($outputtype eq "simp") {
	for ($j = 0; $j < lengthu8($cnumber); $j++) {
	    $result .= $trad2simp{substru8($cnumber, $j, 1)};
	}
    } elsif ($outputtype eq "formaltrad") {
	for ($j = 0; $j < lengthu8($cnumber); $j++) {
	    $result .= $trad2formal{substru8($cnumber, $j, 1)};
	}

    } elsif ($outputtype eq "formalsimp") {
	for ($j = 0; $j < lengthu8($cnumber); $j++) {
	    $result .= $trad2formalsimp{substru8($cnumber, $j, 1)};
	}

    } elsif ($outputtype eq "pinyin") {
	for ($j = 0; $j < lengthu8($cnumber); $j++) {
	    $result .= $trad2pinyin{substru8($cnumber, $j, 1)} . " ";
	}

    } elsif ($outputtype eq "jyutpin") {
	for ($j = 0; $j < lengthu8($cnumber); $j++) {
	    $result .= $trad2jyutpin{substru8($cnumber, $j, 1)} . " ";
	}

    } elsif ($outputtype eq "yalecant") {
	for ($j = 0; $j < lengthu8($cnumber); $j++) {
	    $result .= $trad2yalecant{substru8($cnumber, $j, 1)} . " ";
	}

    } else {
	$result = $cnumber;
    }

    return $result;
}


sub ChineseToEnglishNumber {
    my($self) = shift;
    my($cnumber) = shift;
    my($outputtype) = shift;

    if ($outputtype eq "") {
	$outputtype = $default_englishtype;
    }

    $outputtype = lc($outputtype);

    my($i, $j, $result);

    my($alldigits) = 1;

    my($ordinal) = 0;

    if ($cnumber =~ m/^第/) {
	$ordinal = 1;
    }

    if ($cnumber =~ m/分之/) {
	my($denom) = ($cnumber =~ m/^(.+?)分之/);
	my($numer) = ($cnumber =~ m/分之(.+)$/);
	$result = &ChineseToEnglishFull($numer)/&ChineseToEnglishFull($denom);

    } elsif (lengthu8($cnumber) > 1) {
	for ($i = 0; $i < lengthu8($cnumber); $i++) {
	    if (!defined($digits{substru8($cnumber, $i, 1)})) {
		$alldigits = 0;
	    }
	}

	if ($alldigits == 1) {
	    $result = &ChineseToEnglishBrief($cnumber);
	} else {
	    $result = &ChineseToEnglishFull($cnumber);
	}

    } else {
	$result = &ChineseToEnglishFull($cnumber);
    }

    if ($outputtype eq "arabic") {
	if ($ordinal) {
	    my($lastdigit) = substru8($result, lengthu8($result)-1, 1);
	    if ($lastdigit eq "1") {
		$result .= "st";
	    } elsif ($lastdigit eq "2") {
		$result .= "nd";
	    } elsif ($lastdigit eq "3") {
		$result .= "rd";
	    } else {
		$result .= "th";
	    }
	}

	return $result;

    } elsif ($outputtype eq "comma") {
	my $withcomma = "" . $result;
	my $start; 
	if ($withcomma =~ m/\./) {
	    
	} else {
	    $start = (lengthu8($withcomma) % 3);
	    for ($i = $start; lengthu8($withcomma) > 3 and $i < lengthu8($withcomma); $i+=3) {
		if ($i != 0) {
		    substr($withcomma, $i, 0, ",");
		    $i++;
		}
	    }
	}

	if ($ordinal) {
	    my($lastdigit) = substru8($withcomma, lengthu8($withcomma)-1, 1);
	    if ($lastdigit eq "1") {
		$withcomma .= "st";
	    } elsif ($lastdigit eq "2") {
		$withcomma .= "nd";
	    } elsif ($lastdigit eq "3") {
		$withcomma .= "rd";
	    } else {
		$withcomma .= "th";
	    }
	}

	return $withcomma;


    } elsif ($outputtype eq "words") {
	if ($ordinal) {
	    return num2en_ordinal($result);
	} else {
	    return num2en($result);
	}
    }

    
}


sub ChineseToEnglishBrief {
    my($cnumber) = shift;
    my($nextcchar);

    my($place, $digitval, $total) = (0,0,0);

    for ($place = 0; $place < lengthu8($cnumber); $place++) {
	$total *= 10;
	$digitval = $digits{substru8($cnumber, $place, 1)};
	$total += $digitval;
    }

    return $total;
}



sub ChineseToEnglishFull {
    my($cnumber) = shift;
    my($negative) = 0;
    my($cnumlength);
    my($i);
    my($j, $digitval, $cchar, $afterdecimal);
    my($power) = 0;
    my($leveltotal) = 0;
    my($total) = 0;
    my($nextcchar);

    $afterdecimal = 0;

    $cnumber =~ s/万亿/兆/;
    $cnumber =~ s/萬億/兆/;
    $cnumber =~ s/亿万/兆/;
    $cnumber =~ s/億萬/兆/;
    $cnumber =~ s/個//;
    $cnumber =~ s/个//;
    $cnumber =~ s/廿/二十/;
    $cnumber =~ s/卄/二十/;
    $cnumber =~ s/卅/三十/;
    $cnumber =~ s/卌/四十/;

    $cnumlength = lengthu8($cnumber);

    #print "In Chinese to English Full<BR>";


    for ($i = 0; $i < $cnumlength; $i++) {
	#print "i $i ";

	$cchar = substru8($cnumber, $i, 1);
	#print "$cchar $leveltotal $power";

	if ($i == 0 && ($cchar eq "负" or $cchar eq '負' or $cchar eq '-')) {
	    $negative = 1;
	} elsif ($i == 0 && $cchar eq '第') { # ordinal

	    # Do nothing, handled elsewhere


	} elsif ($cchar eq '點' or $cchar eq '点' or $cchar eq '.' or
	         $cchar eq '.') {
	    $afterdecimal = 1;
	    $power = -1;

	} elsif ($cchar eq '兆') {
	    $power = 12;
	    $leveltotal = 1 if $leveltotal == 0;
	    $total += $leveltotal * (10 ** $power);
	    $leveltotal = 0;
	    $power -= 4;

	} elsif ($cchar eq '億' or $cchar eq '亿') {
	    $power = 8;
	    $leveltotal = 1 if $leveltotal == 0;
	    $total += $leveltotal * (10** $power);
	    $leveltotal = 0;
	    $power -= 4;

	} elsif ($cchar eq '萬' or $cchar eq '万') {
	    $power = 4;
	    $leveltotal = 1 if $leveltotal == 0;
	    $total += $leveltotal * (10**$power);
	    $leveltotal = 0;
	    $power -= 4;

	} elsif ($cchar eq '千' or $cchar eq '仟') {
	    $leveltotal += 1000;

	} elsif ($cchar eq "百" or $cchar eq '佰') {
	    $leveltotal += 100;

	} elsif ($cchar eq "十" or $cchar eq '拾') {
	    $leveltotal += 10;

	} elsif ($cchar eq "零" or $cchar eq "〇" or
	         $cchar eq "0" or $cchar eq "0") {
	    $power = 0;

	} elsif (defined($digits{$cchar})) {

	    $digitval = $digits{$cchar};
	    #print "Digit val is $digitval, $i, $cnumlength\n";

	    if ($afterdecimal) {
		$leveltotal += $digitval * (10**$power);
		$power--;

		while ($i+1 < $cnumlength and defined($digits{substru8($cnumber, $i+1, 1)})) {
		    $leveltotal += $digits{substru8($cnumber, $i+1, 1)} * (10**$power);
		    $power--;
		    $i++;
		}

	    
	    } elsif ($i+1 < $cnumlength) {
		$nextcchar = substru8($cnumber, $i+1, 1);

		if ($nextcchar eq "十" or $nextcchar eq "拾") {
		    $leveltotal += $digitval * 10;
		    $i++;

		} elsif ($nextcchar eq "百" or $nextcchar eq "佰") {
		    $leveltotal += $digitval * 100;
		    $i++;

		} elsif ($nextcchar eq "千" or $nextcchar eq "仟") {
		    $leveltotal += $digitval * 1000;
		    $i++;

		} elsif (defined($digits{$nextcchar})) {
		    $leveltotal *= 10;
		    $leveltotal += $digitval;

		    while ($i+1 < $cnumlength and defined($digits{substru8($cnumber, $i+1, 1)})) {
			$leveltotal *= 10;
			$leveltotal += $digits{substru8($cnumber, $i+1, 1)};
			$i++;
		    }

		} else {
		    $leveltotal += $digitval;
		}

	    } else {
		if ($i+1 == $cnumlength and $i > 0) {
		    my $prevchar = substru8($cnumber, $i-1, 1);
		    if ($prevchar eq '兆') {
			$leveltotal += $digitval * (10**11);

		    } elsif ($prevchar eq '億' or $prevchar eq '亿') {
			$leveltotal += $digitval * (10**7);

		    } elsif ($prevchar eq '萬' or $prevchar eq '万') {
			$leveltotal += $digitval * 1000;
			
		    } elsif ($prevchar eq '千' or $prevchar eq '仟') {
			$leveltotal += $digitval * 100;
			
		    } elsif ($prevchar eq "百" or $prevchar eq '佰') {
			$leveltotal += $digitval * 10;

		    } else {
			$leveltotal += $digitval;
		    }

		} else {
		    $leveltotal += $digitval;
		}
		#print "digit $digitval\n";

	    }

	} else {
	    print STDERR "Seems to be an error in the number. $cnumber\n";
	    return "";

	    # return negative infinity;

	}
    }


    # Catch remaining leveltotal

    #print("Level total " + $leveltotal + " power " + $power + " ten to power " + (10**$power)/10);


    $total += $leveltotal; # * 10** $power;


    #if ($cchar eq '點' or $cchar eq '点' or $cchar eq '.') {

    #$power = -1;

    #for ($j = $i+1; $j < $cnumlength; $j++, $power--) {

    #$digitval = $digits{substru8($cnumber, $j, 1)};

    #$total += $digitval * (10 ** $power);

    # }

    #}

  

    if ($negative == 1) { $total = -$total; }

    return $total;
}



sub chinese_output {
    my($self) = shift;
    if (@_) { $default_outputtype = shift }
    return $default_outputtype;
}

sub english_output {
    my($self) = shift;
    if (@_) { $default_englishtype = shift }
    return $default_englishtype;
}


sub lengthu8 {
    my($utfstring) = shift;
    my($i, $charcount, $byte1);

    $i = 0; $charcount = 0;
    while ($i < length($utfstring)) {
	#print "i $i $utfstring\n";

	$byte1 = substr($utfstring, $i, 1);
	if (unpack("C", $byte1) <= 0x7F) { # 1 byte long (ASCII)

	    $i++;
	    $charcount++;
	} elsif ((unpack("C", $byte1) & 0xE0) == 0xC0) { # 2 bytes long

	    $i += 2;
	    $charcount++;
	} else {  # 3 bytes long

	    $i += 3;
	    $charcount++;
	}
    }
    return $charcount;
}

sub substru8 {
    my($utfstring, $start, $span) = @_;
    my($i, $charcount, $bytestart, $bytespan, $byte1);
    #print "$utfstring START $start SPAN $span\n";


    $i = 0; $charcount = 0;
    while ($i < length($utfstring)) {
	if ($charcount == $start) { $bytestart = $i; }
	if ($charcount == ($start+$span)) { $bytespan = $i - $bytestart; }
	$byte1 = substr($utfstring, $i, 1);
	if (unpack("C", $byte1) <= 0x7F) { # 1 byte long (ASCII)

	    $i++;
	    $charcount++;
	} elsif ((unpack("C", $byte1) & 0xE0) == 0xC0) { # 2 bytes long

	    $i += 2;
	    $charcount++;
	} else {  # 3 bytes long

	    $i += 3;
	    $charcount++;
	}
    }

    if ($charcount == ($start+$span)) { $bytespan = $i - $bytestart; }
    #print "bytestart $bytestart bytespan $bytespan\n";

    return substr($utfstring, $bytestart, $bytespan);

}


END { }

1;

Step VII: Copy this file to /usr/lib/perl5

sudo cp ChineseNumbersU8.pm /usr/lib/perl5

Step VIII: Create another perl file, wherever your PHP script is going to reside, called chineseNumber.pl. Write the following code in it

#!/usr/bin/env perl


use ChineseNumbersU8;
use Getopt::Long;
GetOptions( 'number=s' => \$number ,'format=s' => \$format);


print ChineseNumbers->EnglishToChineseNumber($number, $format);

Step IX: In your PHP script, you can use a function as follows:

/**
 * Returns the chinese string of the number passed
 * @param type $number
 * @param type $format (See chineseNumberU8.pm for more options)
 */
function getChineseNumber($number, $format = 'simp') {
    if (file_exists("chineseNumber.pl")) { {
            $return = exec("chineseNumber.pl  --number=$number --format=$format");
            if ($number < 1.0 && $number > 0.00) {
                $return = exec("chineseNumber.pl  --number=0 --format=$format") . $return;
            }
            return $return;
        }
    }
    return false;
}