#!/usr/bin/perl -w
use English; # things like MATCH

# --------------------------------------------------------------

# txt2phoNL - usage: perl txt2phoNL <dutch-text.txt >phonemes.pho
# the generated phoneme file is suitable for use with MBROLA,
# but you have to use the -e option in MBROLA to skip over
# spurious unpronounceable phoneme pairs (e.g. caused by English
# words in your Dutch text file!).
# Hint: Use pipes, e.g. "ls | txt2phoNL | mbrola -e - - | play"

# This is GPLed software (open source freeware) by
# Eric Auer <eric@coli.uni-sb.REMOVEthisIFyouAREnoSPAMMER.de>, the license is the GNU GPL
# version 2 or later, also available as copying.txt in this
# directory, http://www.coli.uni-saarland.de/~eric/stuff/soft (3/2002)

# Please give me some feedback: As I am no native speaker
# of Dutch, this txt2phoNL definitely need some improvement!

# --------------------------------------------------------------

# new version 14 feb 2002:
# - sanitize away illegal phone pairs in a last step,
#   includes devoicing of consonants before a break.
# - intermediate repn uses one char per phoneme.
# - simpler rewrite mechanism eats all matched chars
#   and produces only phones - so the text string is constant.
#   BUT: restart from " " if the rule input ended in " " !
# - steps: 1. digit/... names
#          2. sound pattern rules (preferring long matches,
#             walking the string and trying all rules per char)
#          3. sanitize and get final repn from intermediate one

# new version 2003-04-05 by Marc Spoorendonk marc@spoorendonk.com (native Dutch speaker)
# - changed to much to mention. Very acceptable translation now.

my $XLATEDEBUG = 4; # show all translation rule applications
                    # of at least this size

# special one char repn:
# _ is " ", Ei is 1, 9y is 3, Au is 4, ai is 5,
# oi is  6, ui is 7, Ai is 8, Oi is 9, . is EOF, ? is question
# , is comma
 
open(STRING,">/dev/stderr") || die "cannot open debug log\n";
# open(STRING,">nl2pho.log") || die "cannot open debug log\n";
my $foo;

$OUTPUT_AUTOFLUSH = 1; # (also known as $|): flush after every
                       # write/print, do not buffer output
$/ = undef;       # do not split on line breaks
                  # $/ is $RS, record separator in use English
my $text0 = <STDIN>;   # read stdin
my $text = " ";  # other stage (start with a space)
my $phones = " "; # phoneme one-char-per-phoneme repn

# by the way: a "^>*" remover would be nice for mails...


# g vs G vs x: regen [reG@n] goal [goL]  gage [xaZe]
# where the G (voiced "ch") is a dialect alternative to x ("ch")
# and the g only occurs in foreign words.

# e vs E vs @: gemak [x@mAk] gage [xaZe] veer [ver]  pet [pEt]
# this is the len: e is long, is ee or e-at-end-of-syll.
# E is short, is default, kind of.


# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# first step: reduce the alphabet by spelling out specials
# result: a plain [a-z.? ]* string

my %special = ("0","null",   "=","is",
               "1","een",    "!","!",
               "2","twee",   '"',"aanhaalingsteken",
               "3","drie",
               "4","vier",   "\$","dollar",
               "5","vijf",   "%","procent",
               "6","zes",    "&","en",
               "7","zeven",  "/","slesh", #phonetically
               "8","acht",   "(","haakje openen",
               "9","negen",  ")","haakje sluiten,",
               "*","ster",   "\\","beckslesh", #phonetically
               "+","plus",   "?","?",
               "#","hekje",  "|","paip", #phonetically
               ".",".",      "_","underscoor", #phonetically
               ",",",",      "-","",
               ">","groter", ";",";",
               "<","kleiner",":",":",
               "^","dakje",  "@","aapestaartje",
               "°","grad",   "{","accolade openen",
               "[","hoekje", "]","hoekje sluiten,",
               "~","tilde",  "}","accolade sluiten,"
              );

# use this: punt. koma, vraagteken?
#  or that: .     ,     ?
# the latter has the problem that a . or , or ?
# surrounded by spaces just sounds like a space...

# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#Marc> Prefix with space for easyer matching.
$text0 =~ s/^/ /g; 
$text0 =~ s/$/ /g; 

$text0 =~ s/^[>]*//g; # un-mailify the text :-)
$text0 =~ s{://}{dubbele punt slesh slesh}g; # http:// and similar stuff
$text0 =~ s{:-[)]}{,lachend gezicht}g;    # smiley
$text0 =~ s{:[)]}{,lachend gezicht}g;    # smiley
$text0 =~ s{:-[(]}{,treurig gezicht}g;    # smiley
$text0 =~ s{:[(]}{,treurig gezicht}g;    # smiley
$text0 =~ s{;-[)]}{,knipoogend gezicht}g; # smiley
$text0 =~ s{;[)]}{,knipoogend gezicht}g; # smiley

$text0 =~ s/cie/sie/g; # precies -> presies, provincie -> provinsie

#Marc>  betaal -> betaal
#Marc>  betalen -> betaalen
#Marc> It keeps metten, marren, matten as they are.
#Marc> betaling -> betaaling 
#                b      e           t             a          l             i ng bet a a li ng
$text0 =~ s/([^eaiou][eaou][rtpsdfgklzcvbnm])([eaiou])([rtpsdfgklzcvbnm][eaiou])/$1$2$2$3/g;

#Marc>  meten -> meeten    maren -> maaren
#                m       e             t           e        n     m e e ten
$text0 =~ s/([^eaiou])([eaou])([rtpsdfgklzcvbnm][eaiou][^eaiou])/$1$2$2$3/g;

#Marc> k.n.m.i. -> k n m i
$text0 =~ s/([^a-z])([a-z])\./$1$2 /g;
$text0 =~ s/([^a-z])([a-z])\./$1$2 /g;

#Marc> remove lines from input: "-----------------------------" -> "" 
$text0 =~ s/[-_=+]{3,}//g;

#Marc> www.bla.com -> www punt bla punt com
$text0 =~ s/\.([^ \n\t])/punt $1/g;

#Marc> translate some numbers. (write a function for this once)
$text0 =~ s/([^0-9])10([^0-9])/$1tien$2/g;
$text0 =~ s/([^0-9])11([^0-9])/$1elf$2/g;
$text0 =~ s/([^0-9])12([^0-9])/$1twaalf$2/g;
$text0 =~ s/([^0-9])13([^0-9])/$1dertien$2/g;
$text0 =~ s/([^0-9])14([^0-9])/$1veertien$2/g;
$text0 =~ s/([^0-9])15([^0-9])/$1vijftien$2/g;
$text0 =~ s/([^0-9])16([^0-9])/$1zestien$2/g;
$text0 =~ s/([^0-9])17([^0-9])/$1zeventien$2/g;
$text0 =~ s/([^0-9])18([^0-9])/$1achttien$2/g;
$text0 =~ s/([^0-9])19([^0-9])/$1negentien$2/g;
$text0 =~ s/([^0-9])20([^0-9])/$1twintig$2/g;
$text0 =~ s/([^0-9])21([^0-9])/$1eenentwintig$2/g;
$text0 =~ s/([^0-9])22([^0-9])/$1tweeentwintig$2/g;
$text0 =~ s/([^0-9])23([^0-9])/$1drieentwintig$2/g;
$text0 =~ s/([^0-9])24([^0-9])/$1vierentwintig$2/g;
$text0 =~ s/([^0-9])25([^0-9])/$1vijfentwintig$2/g;
$text0 =~ s/([^0-9])26([^0-9])/$1zesentwintig$2/g;
$text0 =~ s/([^0-9])27([^0-9])/$1zevenentwintig$2/g;
$text0 =~ s/([^0-9])28([^0-9])/$1achtentwintig$2/g;
$text0 =~ s/([^0-9])29([^0-9])/$1negenentwintig$2/g;
$text0 =~ s/([^0-9])30([^0-9])/$1dertig$2/g;
$text0 =~ s/([^0-9])31([^0-9])/$1eenendertig$2/g;
$text0 =~ s/([^0-9])32([^0-9])/$1tweeendertig$2/g;
$text0 =~ s/([^0-9])33([^0-9])/$1drieendertig$2/g;
$text0 =~ s/([^0-9])34([^0-9])/$1vierendertig$2/g;
$text0 =~ s/([^0-9])35([^0-9])/$1vijfendertig$2/g;
$text0 =~ s/([^0-9])36([^0-9])/$1zesendertig$2/g;
$text0 =~ s/([^0-9])37([^0-9])/$1zevenendertig$2/g;
$text0 =~ s/([^0-9])38([^0-9])/$1achtendertig$2/g;
$text0 =~ s/([^0-9])39([^0-9])/$1negenendertig$2/g;

print STDERR "Text: $text0";


for my $char (split(//,$text0)) {
  $char = lc($char);
  $char = "eu" if ($char =~ /öÖ/); # approximately :-)
  $char = "ae" if ($char =~ /äÄ/); # could be better
  $char = "uu" if ($char =~ /Üü/); # should also be for &euml;
  if (defined $special{$char}) {
    $text .= " " unless ($text =~ / $/);
    $text .= $special{$char} . " ";
  } elsif ($char =~ /[a-z]/) {
    $text .= $char;
  } else {
    $text .= " " unless ($text =~ / $/);
  }   # simplify all whitespace/linebreak stretches
      # and other special chars to single spaces
}
$text .= " " x 5; # end with spaces!

# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# second step: apply phoneme pattern rules (prefer longest
# match, eat up all left side apart from trailing space,
# produce pure phoneme list)

my %five  = (	" lijk"," l1k", 
		"lijk ",'l@k ',
             	"elijk","El1k",
		" bent", " bEnt",
		"atie ","atsi",		#informatie
             	"+size+",5
            );

my %four  = (	"http","ha te te pe ",
             	"html","ha te Em El ",
		"agen","axEn",
             	"ooie","oi@",	# mooie
		"ooit","oIt",
		" er "," Er ",
		" en "," En ",
		" nl "," EnEl ",
		" he "," hE ",
		" ok "," oke ",
             	"hou ","h4w ",
		"ouch","uS",	# douche
		"oush","uS",	# kianoush
		" pc "," pe se ",
             	"even",'ev@n',
		"tie ","tsi",	#vakantie
		" chi"," Si",	# china
             	"+size+",4
            );

my %three = (	

		"aai","5" ,
		"ooi","oi" ,
		"oei","7",
		"cee","se",
             	"ai ","8" ,
		"oi ","9",
		"age","aZe",
             	"ch ","x" ,
		"ftp","ef te pe ",
             	"www","we we we ",
             	"htm","ha te em ",
             	"tp:", "te pe ",
		"mp ", "Em pe ", # mp3
		"mb ", "Embe ", # mp3
             	"eeu", "e2",
		"en ",'@n',
		"he ","he",
		"eij","1",
#pronounciation of E before double dissonant
		"ett","Et", #letter
		"epp","Ep",
		"ett","Et",
		"err","Er",
		"ekk","Ek", #lekker
		"emm","Em",
		"ess","Es",
		"eff","Ef",
		"ell","El",
		"ebb","Eb",
		"enn","En",
#Marc> distinct letters:  k.n.m.i   a.u.b.
		" a ", "a",
		" b ", "be",
		" c ", "se",
		" d ", "de",
		" e ", "e",
		" f ", "Ef",
		" g ", "xe",
		" h ", "ha",
		" i ", "i",
		" j ", "ie",
		" k ", "ka",
		" l ", "El",
		" m ", "Em",
		" n ", "En",
		" o ", "o",
		" p ", "pe",
		" q ", "ky",
		" r ", "Er",
		" s ", "Es",
		" t ", "te",
		" u ", "y",
		" v ", "ve",
		" w ", "we",
		" x ", "Iks",
		" y ", "1",
		" z ", "zEt",
             	"+size+",3
            ); 

my %two   = ("ie","i" , "oe","u" , "uu","y",
             "aa","a" , "ee","e" , "oo","o",
             "eu","2" ,            "ei","1",
             "ui","3" , "ou","4" , "ij","1",
             "sj","S" , "g ","x" , "nj","J",
	     "ce","sE",
             "l ","l" , "ng","N" ,
             "dt","t" , "ch","x" , "iu","ju",
             "dl",'d@l',           "lf",'l@f',
             "bb","b" , "dd","d" , "e ",'@',
             "d ","t" , "hr","r" , "hl","l",
             "o ","o" , "a ", "a",
	     "yl","1l",
             "zl","z l",
	     "mm", "m", # Marc> m-m is not a sound. Same for p-p and n-n.
	     "pp", "p",
	     "nn", "n",
	     "rr", "r",
	     "kk", "k",
	     "tt", "t",
             "+size+",2
            ); # hr/hl/yl/zl: sane processing
               # of foreign words

my %one   = ("a","A", "b","b", "c","k",
             "d","d", "e",'E', "f","f",
             "g","x", "h","h", "i","I",
             "j","j", "k","k", "l","l",
             "m","m", "n","n", "o","O",
             "p","p", "q","k", "r","r",
             "s","s", "t","t", "u","Y",
             "v","v", "w","w", "x","ks",
             "y","j", "z","z", " "," ",
             ".",".", "?","?", ",",",",
             "+size+",1
            ); # prosody with [ ?.,] is a later step

my @todo = (\%five, \%four, \%three, \%two, \%one);

# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

$phones = " ";
my $x = 0;     # string index
while ($x < (length($text)-5)) {
  my $y = 0;
  for my $hashref (@todo) {    # do l-longest rules first...
    if ($y != 0) { next; }
    my $check = substr($text,$x,$hashref->{"+size+"});
    if (defined $hashref->{$check}) {
      $phones .= $hashref->{$check};
      $x += $hashref->{"+size+"};
      $x-- if (($check =~ / $/) && ($check ne " "));
      # skip over matched part, but rewind on " " suffix
      $y++;
      print STDERR "Translate: <$check> to /"
        .   $hashref->{$check} . "/\n"
        if  (length($check) >= $XLATEDEBUG);
    }
  }
  if ($y == 0) {
    print STDERR "Had to translate first char to NIL:\n";
    print STDERR "<" . substr($text,$x,10) . "...>\n";
    $phones .= " ";
    $x++;
  }
}

# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# third step: convert to SAMPA alphabet and apply constraints
# on phoneme pairings (input: $phones string)

my %xlate = ("1","Ei", "3","9y", "4","Au", "5","ai",
             "6","oi", "7","ui", "8","Ai", "9","Oi",
             " ","_",  "?","_",  ".","_",  ",","_"
            );

my $Pvowel  = "aeiouAEIOy2Y13456789";
my $Pdipht  = "56789";
my $Pvoiced = "bdcvzZGhJjg"; # adding g for convenience
my $Pconson = "ptkbdgcfvszSZxGhmn";
my $Pvoice2 = "czZGhJj";
my $Psemi   = "GNJL";
my $Pspace  = ".?,_ ";

# rules:

# handled above: no "EY" or "IY" (replace by eY and iY)
# handled above: no d before l (add schwa)
# handled above: common case of bb and dd (replace by b and d)
# handled above: commod case of d_ (devoice to t_)

# no voiced/semi doubled (replace by single occurance)
# no schwa   before OR AFTER dipht (remove schwa)
# no voice2  before l, r or j (add schwa)
# no voiced  before conson (add schwa ; duplication rule first)
# no conson  before semi (add schwa)
# special case of next rule: j-E (replace by j-@)
# no dipht before or after vowel/j (insert " ", see above)
# no voiced  at the end of a word (devoice)

# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

$text = "";
my $adder = ""; # buffer before we really add the phone!
my $freq = 200; # freq in Hz, only used at " " for now
my $dur = 100;  # duration in msec
my $ph;         # current phoneme
my $ph0 = " ";  # previous phoneme
# the prosody and rhythm are still extremely simple

foreach $ph (split(//,$phones)) {

  if (($ph =~ /[${Psemi}${Pvoiced}]/) && ($ph0 eq $ph)) {
    print STDERR "${ph}-$ph: removeone $ph\n";
    $adder = ""; # ignore first copy of double phoneme
  } elsif (($ph0 eq "@") && ($ph =~ /[${Pdipht}]/)) {
    # remove the previous schwa
    # (or just insert a short "h")
    $adder = "";
    print STDERR "\@-$ph: remove \@\n";
  } elsif (($ph eq "@") && ($ph0 =~ /[${Pdipht}]/)) {
    # remove the current schwa
    $ph = "";
    print STDERR "${ph0}-\@: remove \@\n";
  } elsif (  (($ph0 =~ /[${Pvoice2}]/) && ($ph =~ /lrj/))
          || (($ph0 =~ /[${Pvoiced}]/) && 
              ($ph  =~ /[${Pconson}]/))
          || (($ph0 =~ /[${Pconson}]/) &&
              ($ph  =~ /[${Psemi}]/))
          ) {
    $adder .= "\@ 50\n";
    print STDERR "${ph0}-$ph: insert schwa\n";
  } elsif (($ph0 eq "j") && ($ph eq "E")) {
    print STDERR "j-E: changeto j-\@\n";
    $ph = "@"; # modify this part this time...
  } elsif (  (($ph0 =~ /[${Pdipht}j]/) && 
              ($ph  =~ /[${Pvowel}j]/))
          || (($ph0 =~ /[${Pvowel}j]/) && 
              ($ph  =~ /[${Pdipht}j]/))
          ) {
    if ($ph0 eq "j") {
      $adder = "i 100\n";
      print STDERR "${ph0}-$ph: changeto i-$ph\n";
    }
    if ($ph eq "j") {
      print STDERR "${ph0}-$ph: changeto ${ph0}-i\n";
      $ph = "i";
    }
    if (($ph0 ne "j") && ($ph ne "j")) {
      $adder .= "_ 50\n";
      print STDERR "${ph0}-$ph: insert break\n";
    }
  } elsif (($ph0 =~ /[${Pvoiced}]/) && ($ph =~ /[${Pspace}]/)) {
    my $de = $ph0;
    $de =~ tr/bdcvzZGhJjg/ptxfsSx_IIk/;
          #  ptxfsSx IIk
    print STDERR "${ph0}-_: changeto ${de}-_\n";
    $adder = "$de 100\n";
  }

  if (($ph0 eq "j") && ($ph =~ /[${Pspace}]/)) {
    print STDERR "j-_: insert \@\n";
    $adder .= "\@ 100\n";
  }

  if ($adder) {
    $text .= $adder; # add possibly corrected recent phoneme
    $adder =~ s{^([^ ]*).*$}
               {$1}gm;   # reduce to phonemes, multiline
    die "<$adder> ?\n" if ($adder =~ / /);
    $adder = join("-",split(/\n/,$adder)); # a\nb\n -> a-b-
    print STRING "${adder}-";
  }

  if ($ph) {
    $dur = ($ph =~ /[iuyaeo213456789rmnNJ]/) ? 200 : 100;
                                  # longer for long vowels/rmnNJ
    $freq = 200 if ($ph eq " ");  # default freq
    $freq = 252 if ($ph eq "?");  # go up for questions
    $freq = 159 if ($ph eq ".");  # go down for boundaries
    $freq = 178 if ($ph eq ",");  # go down a bit for commas
    if ($ph =~ /[${Pspace}]/) {   # various breaks
      $adder = "_ 100 (50 , $freq)\n";
    } else {
      $adder = ( (defined $xlate{$ph}) ? $xlate{$ph} : $ph );
                                  # use 1..2 char phone names
      $adder .= " $dur\n";
    }
  } else {
    print STDERR "Skip\n";
    $adder = "";
  }

  $ph0 = $ph;

}

print "$text\n";

print STRING "\n";
close STRING;

