/*
 *  utext.c -- Uyghur Text Processing Library, C version, C source file
 *  $Author:   Muhammad Abdulla
 *  $version:  1.1
 *  License:   GPL
 */

#include <assert.h>
#include <ctype.h>
#include <errno.h>
#include <locale.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <wchar.h>

#include "utext.h"

// Short aliases for the two handle types used throughout this file.
#define U UText
#define S Syntax

// Number of entries in each lookup table (cmap, cmapinv, pform).
#define BASELEN 256 

// Code points emitted by getUwstr() for the two-letter UKY sequences
// "ch", "gh", "ng", "sh" and "sz" respectively.
#define CHEE  (0x0686)
#define GHEE  (0x063A)
#define NGEE  (0x06AD)
#define SHEE  (0x0634)
#define SZEE  (0x0698)

// Code points substituted by getUwPFstr() when the letter mapped from 'a'
// follows the isolated (LA) or end (_LA) form of the letter mapped from 'l'.
#define LA    (0xFEFB)
#define _LA   (0xFEFC)

// Inserted by getUwstr() in front of vowels in word-beginning position.
#define HAMZA (0x0626)

// Bounds of the basic range handled by the tables. BPAD is subtracted from
// a code point to obtain an index into cmapinv and pform.
#define BPAD  (0x0600)
#define BMAX  (0x06FF)

// Element count of a true array (not valid for pointers).
#define NELEMS(x) ((sizeof(x))/(sizeof((x)[0])))
// Index into pform for the UKY character x; requires cmap to be filled.
#define CM(x) (cmap[(x)]-BPAD)

// Single-byte codes (above the 7-bit ASCII range) that UKY input uses for the
// vowels written 'e, :o and :u, in lower and upper case. They are used as
// indexes into cmap and are recognized by isvowel(). The values coincide with
// the Latin-1 code points of e-acute, o-umlaut and u-umlaut.
enum {
 PRIMe = 233, // 'e
 PRIME = 201, // 'E
 COLo  = 246, // :o
 COLO  = 214, // :O
 COLu  = 252, // :u
 COLU  = 220  // :U
} ;

// A text object holding up to four cached representations of the same text.
// Each string has a companion length; a length of -1 marks that
// representation as not created or out of date (see zinit and the setters).
struct U {
   char *ustr ;      // UKY (ascii) string, set by setUstr / rebuilt by getUstr
   char *u8str ;     // UTF-8 string, set by setU8str / rebuilt by getU8str
   char *u8pfstr ;   // UTF-8 string in presentation form, built by getU8PFstr
   wchar_t *uwstr ;  // wide-character string, set by setUwstr / built by getUwstr
   int ulen ;        // length of ustr in bytes, or -1
   int u8len ;       // length of u8str in bytes, or -1
   int u8pflen ;     // length of u8pfstr, or -1
   int uwlen ;       // length of uwstr in wide characters, or -1
} ;

// S is a pointer to struct Syntax; entries are allocated by new_syn().
typedef struct S *S ;

// Tells getUwPFstr() how the letter FOLLOWING this one is to be shaped:
//   WDBEG - the next letter starts in isolated form and this letter is
//           left untouched
//   INBEG - the next letter starts in isolated form
//   NOBEG - the next letter starts in end form
// For INBEG and NOBEG, getUwPFstr() additionally re-examines this letter
// when the next one arrives (isolated -> beginning, end -> medial).
typedef enum { WDBEG, INBEG, NOBEG } begtype ;

// One row of the presentation-form table (pform): the four glyph variants
// of a Uyghur letter in the presentation form (extended) region, plus the
// joining behaviour it imposes on the next letter.
struct S {
   wchar_t iform ; // isolated  form
   wchar_t bform ; // beginning form
   wchar_t mform ; // medial    form
   wchar_t eform ; // end       form
   begtype btype ; // specifies in which form the next letter should be
} ; 

// cmap:    UKY byte -> code point in the basic range (0 = no mapping);
//          filled by init().
// cmapinv: (code point - BPAD) -> UKY byte (0 = no mapping); filled by init().
// pform:   (code point - BPAD) -> presentation-form entry, or NULL;
//          filled by pfinit().
wchar_t  cmap[BASELEN] ; 
unsigned char cmapinv[BASELEN] ;
S pform[BASELEN] ;

/*
 * Forward declarations of helpers that are defined near the end of this
 * file but used earlier.
 */
void *xmalloc ( size_t n ) ;
S new_syn ( wchar_t i, wchar_t b, wchar_t m, wchar_t e, begtype bt ) ;
bool isvowel ( int ch ) ;

int inited = 0 ;    // non-zero once init() has run
int pfinited = 0 ;  // non-zero once pfinit() has run

// init -- one-time setup of the UKY -> code point table (cmap) and of its
// inverse (cmapinv). Also switches the process locale to en_US.UTF-8, which
// the multibyte/wide conversions elsewhere in this file rely on.
void init ( )
{
   // Code points for the letters a..z, in alphabetical order. Upper and
   // lower case share one code point; c/k and v/w share one as well.
   static const wchar_t latin[26] = {
      0x0627, /* a */  0x0628, /* b */  0x0643, /* c */  0x062F, /* d */
      0x06D5, /* e */  0x0641, /* f */  0x06AF, /* g */  0x06BE, /* h */
      0x0649, /* i */  0x062C, /* j */  0x0643, /* k */  0x0644, /* l */
      0x0645, /* m */  0x0646, /* n */  0x0648, /* o */  0x067E, /* p */
      0x0642, /* q */  0x0631, /* r */  0x0633, /* s */  0x062A, /* t */
      0x06C7, /* u */  0x06CB, /* v */  0x06CB, /* w */  0x062E, /* x */
      0x064A, /* y */  0x0632  /* z */
   } ;
   int k ;
   wchar_t code ;

   // set the flag first, so that init() is not entered again
   inited = true ;

   setlocale ( LC_ALL, "en_US.UTF-8" ) ;

   memset ( cmap, 0, sizeof(cmap) ) ;
   memset ( cmapinv, 0, sizeof(cmapinv) ) ;

   for ( k = 0 ; k < 26 ; k++ ) {
      cmap['A' + k] = latin[k] ;
      cmap['a' + k] = latin[k] ;
   }

   // vowels entered as 'e, :o and :u (both cases)
   cmap[PRIMe] = cmap[PRIME] = 0x06D0 ;
   cmap[COLo]  = cmap[COLO]  = 0x06C6 ;
   cmap[COLu]  = cmap[COLU]  = 0x06C8 ;

   // Uyghur punctuation marks
   cmap[';'] = 0x061B ;
   cmap['?'] = 0x061F ;
   cmap[','] = 0x060C ;

   // Build the inverse table. Indexes are offset by BPAD so that the table
   // stays small. Entries are visited in ascending order, so when several
   // UKY bytes share a code point the highest byte value is the one kept.
   for ( k = 0 ; k < (int) NELEMS(cmapinv) ; k++ ) {
      code = cmap[k] ;
      if ( code == 0 ) {
         continue ;
      }
      cmapinv [ code - BPAD ] = k ;
   }
}

// pfinit() -- initialize the presentation form table.
// We kept this separate from init() to keep init() lightweight,
// as not every utext type is used to generate texts in presentation form.
void pfinit ( )
{
   // S new_syn ( wchar_t i, wchar_t b, wchar_t m, wchar_t e, begtype bt ) ;
   int i ;
   wchar_t wc ;

   pfinited = true ;

   for ( i = 0 ; i < NELEMS(pform) ; i++ ) {
      pform[i] = NULL ;
   }

   pform[ CM('a') ]    = new_syn ( 0xFE8D, 0xFE8D, 0xFE8D, 0xFE8E, WDBEG ) ;
   pform[ CM('e') ]    = new_syn ( 0xFEE9, 0xFEE9, 0xFEE9, 0xFEEA, WDBEG ) ;
   pform[ CM('b') ]    = new_syn ( 0xFE8F, 0xFE91, 0xFE92, 0xFE90, NOBEG ) ;
   pform[ CM('p') ]    = new_syn ( 0xFB56, 0xFB58, 0xFB59, 0xFB57, NOBEG ) ;
   pform[ CM('t') ]    = new_syn ( 0xFE95, 0xFE97, 0xFE98, 0xFE96, NOBEG ) ;
   pform[ CM('j') ]    = new_syn ( 0xFE9D, 0xFE9F, 0xFEA0, 0xFE9E, NOBEG ) ;
   pform[ CHEE-BPAD ]  = new_syn ( 0xFB7A, 0xFB7C, 0xFB7D, 0xFB7B, NOBEG ) ;
   pform[ CM('x') ]    = new_syn ( 0xFEA5, 0xFEA7, 0xFEA8, 0xFEA6, NOBEG ) ;
   pform[ CM('d') ]    = new_syn ( 0xFEA9, 0xFEA9, 0xFEAA, 0xFEAA, INBEG ) ;
   pform[ CM('r') ]    = new_syn ( 0xFEAD, 0xFEAD, 0xFEAE, 0xFEAE, INBEG ) ;
   pform[ CM('z') ]    = new_syn ( 0xFEAF, 0xFEAF, 0xFEB0, 0xFEB0, INBEG ) ;
   pform[ SZEE-BPAD ]  = new_syn ( 0xFB8A, 0xFB8A, 0xFB8B, 0xFB8B, INBEG ) ;
   pform[ CM('s') ]    = new_syn ( 0xFEB1, 0xFEB3, 0xFEB4, 0xFEB2, NOBEG ) ;
   pform[ SHEE-BPAD ]  = new_syn ( 0xFEB5, 0xFEB7, 0xFEB8, 0xFEB6, NOBEG ) ;
   pform[ GHEE-BPAD ]  = new_syn ( 0xFECD, 0xFECF, 0xFED0, 0xFECE, NOBEG ) ;
   pform[ CM('f') ]    = new_syn ( 0xFED1, 0xFED3, 0xFED4, 0xFED2, NOBEG ) ;
   pform[ CM('q') ]    = new_syn ( 0xFED5, 0xFED7, 0xFED8, 0xFED6, NOBEG ) ;
   pform[ CM('k') ]    = new_syn ( 0xFED9, 0xFEDB, 0xFEDC, 0xFEDA, NOBEG ) ;
   pform[ CM('g') ]    = new_syn ( 0xFB92, 0xFB94, 0xFB95, 0xFB93, NOBEG ) ;
   pform[ NGEE-BPAD ]  = new_syn ( 0xFBD3, 0xFBD5, 0xFBD6, 0xFBD4, NOBEG ) ;
   pform[ CM('l') ]    = new_syn ( 0xFEDD, 0xFEDF, 0xFEE0, 0xFEDE, NOBEG ) ;
   pform[ CM('m') ]    = new_syn ( 0xFEE1, 0xFEE3, 0xFEE4, 0xFEE2, NOBEG ) ;
   pform[ CM('n') ]    = new_syn ( 0xFEE5, 0xFEE7, 0xFEE8, 0xFEE6, NOBEG ) ;
   pform[ CM('h') ]    = new_syn ( 0xFEEB, 0xFEEB, 0xFEEC, 0xFEEC, NOBEG ) ;
   //pform[ CM('h') ]    = new_syn ( 0xFBAA, 0xFBAA, 0xFBAD, 0xFBAD, NOBEG ) ;
   pform[ CM('o') ]    = new_syn ( 0xFEED, 0xFEED, 0xFEEE, 0xFEEE, INBEG ) ;
   pform[ CM('u') ]    = new_syn ( 0xFBD7, 0xFBD7, 0xFBD8, 0xFBD8, INBEG ) ;
   pform[ CM(COLo) ]   = new_syn ( 0xFBD9, 0xFBD9, 0xFBDA, 0xFBDA, INBEG ) ;
   pform[ CM(COLu) ]   = new_syn ( 0xFBDB, 0xFBDB, 0xFBDC, 0xFBDC, INBEG ) ;
   pform[ CM('w') ]    = new_syn ( 0xFBDE, 0xFBDE, 0xFBDF, 0xFBDF, INBEG ) ;
   pform[ CM(PRIMe) ]  = new_syn ( 0xFBE4, 0xFBE6, 0xFBE7, 0xFBE5, NOBEG ) ;
   pform[ CM('i') ]    = new_syn ( 0xFEEF, 0xFBE8, 0xFBE9, 0xFEF0, NOBEG ) ;
   pform[ CM('y') ]    = new_syn ( 0xFEF1, 0xFEF3, 0xFEF4, 0xFEF2, NOBEG ) ;
   pform[ HAMZA-BPAD ] = new_syn ( 0xFE8B, 0xFE8B, 0xFE8C, 0xFB8C, NOBEG ) ;
}

/*
 * zinit( U u ) -- put u into its "nothing cached" state: every string
 * pointer becomes NULL and every length becomes -1 (meaning "not built").
 * Does not free anything; intended for freshly allocated objects.
 */
void zinit ( U u )
{
   assert(u);

   // no representation is valid yet
   u->ulen = u->u8len = u->u8pflen = u->uwlen = -1;

   // and no buffers are owned
   u->ustr = u->u8str = u->u8pfstr = NULL;
   u->uwstr = NULL;
}

/*
 * new_utext ( const char *ukystr ) -- creates a new UText from UKY string ukystr
 * ulen will be updated, but u8len and uwlen are set to -1 to indicate that utf8str
 * and uwstr are not created/updated since ustr is updated, respectively. Similar
 * methods are used throughout the implementation.
 */
U new_utext ( const char *ukystr )
{
   U utext ;

   if ( !inited ) {
      init() ;
   }

   utext = ( U ) xmalloc ( sizeof ( *utext ) ) ; 
   zinit(utext);

   setUstr ( utext, ukystr ) ;

   return utext ;
}

// new_u8text -- returns a new utext type u, whose u8str member
// initialized by u8str which is supposed to be in UTF-8 format.
U new_u8text ( const char *u8str )
{
   U utext ;

   if ( !inited ) {
      init() ;
   }

   utext = ( U ) xmalloc ( sizeof ( *utext ) ) ; 
   zinit(utext);

   setU8str ( utext, u8str ) ;

   return utext ;
}

// new_uwtext -- returns a new utext type u, whose uwstr member
// initialized by wcstr which is supposed to be in UTF-32 format.
U new_uwtext ( const wchar_t *wcstr, int len )
{
   U utext ;

   if ( !inited ) {
      init() ;
   }

   utext = ( U ) xmalloc ( sizeof ( *utext ) ) ; 
   zinit(utext);

   utext->uwstr = (wchar_t *) xmalloc ( sizeof(wchar_t) * (len + 1) ) ;

   memcpy ( (void *)utext->uwstr, (void *)wcstr, len * sizeof(wchar_t) ) ;
   utext->uwstr[len] = 0 ;
   utext->uwlen = len ;

   // minus one indicates corresponding string not initialized/updated
   utext->ulen = -1 ; 
   utext->u8len = -1 ;
   utext->u8pflen = -1 ;

   return utext ;
}

// getUstr -- returns an up-to-date UKY (ascii) string for u.
//
// If the cached UKY string is valid it is simply duplicated. Otherwise it
// is rebuilt from the wide string: every HAMZA is dropped, code points that
// have an entry in cmapinv are mapped back to their UKY byte, and anything
// else is stored truncated to one byte.
//
// The result is a fresh copy (made with strdup) that the caller releases
// with free(); the object's own buffers are never handed out.
//
// NOTE(review): the joint letters (CHEE, GHEE, NGEE, SHEE, SZEE) have no
// entry in cmapinv, so they are not turned back into "ch", "gh", ... here.
char *getUstr ( U u ) 
{
   wchar_t wc ;
   wchar_t *wp ;
   int i, j, len ;
   char *p ;

   assert ( u ) ;

   if ( u->ulen != -1 ) {
      return strdup(u->ustr) ;
   } 

   wp = getUwstr ( u, &len ) ;

   // at most one byte per wide character, plus the terminator
   p = xmalloc ( len + 1 ) ;

   j = 0 ;

   for ( i = 0 ; i < len ; i++ ) {
      wc = wp[i] ;

      if ( wc == HAMZA ) {
         continue ; // hamza has no UKY counterpart
      }

      if ( BPAD <= wc && wc < BMAX && cmapinv[wc-BPAD] != 0 ) {
         p[j] = cmapinv[wc-BPAD] ;
      } else {
         p[j] = (unsigned char) wc ;
      }

      j++ ;
   }

   // The buffer comes from malloc and was previously left unterminated,
   // which made the strdup() below read past its end.
   p[j] = '\0' ;

   free ( wp ) ;
   free ( u->ustr ) ;

   u->ustr = p ;
   u->ulen = j ;

   return strdup ( u->ustr ) ;
}

// getU8str -- returns an up-to-date UTF-8 string for u.
//
// When the cached UTF-8 string is out of date (u8len == -1) it is rebuilt
// from the wide string with wcstombs(), which follows the current locale,
// and both u8str and u8len are refreshed.
//
// If a character cannot be converted, the cached string becomes empty and
// u8len stays -1, so that u8length() keeps reporting the failure.
//
// The result is a fresh copy (made with strdup) that the caller releases
// with free().
char *getU8str ( U u ) 
{
   int len ;
   size_t cap, n ;
   wchar_t *wp ;
   char *p ;

   assert ( u ) ;

   if ( u->u8len == -1 ) {
      wp = getUwstr ( u, &len ) ;

      // MB_CUR_MAX bytes per character is always enough. The former
      // "two bytes per character" only holds for the basic range and left
      // the buffer unterminated for anything longer.
      cap = (size_t) len * MB_CUR_MAX + 1 ;
      p = xmalloc ( cap ) ;

      n = wcstombs ( p, wp, cap ) ; 
      free ( wp ) ;

      free ( u->u8str ) ;
      u->u8str = p ;

      if ( n == (size_t) -1 ) {
         p[0] = '\0' ;   // conversion failed; length stays -1
      } else {
         if ( n >= cap ) {
            n = cap - 1 ; // cannot happen with the size above; be safe
         }
         p[n] = '\0' ;
         u->u8len = (int) n ; // this was never recorded before
      }
   }

   return strdup(u->u8str) ;
}

/* getU8PFstr -- returns an up-to-date UTF-8 string for u in the
 * presentation form range. The method getU8str returns Uyghur characters in
 * the basic range and leaves the layout to the user agent or the operating
 * system. This is recommended by the Unicode Consortium to avoid problems
 * in searching, sorting, etc. However, in some special cases a processed
 * version that renders the letters in the presentation form region of the
 * Unicode table is necessary, e.g. for environments that do not support
 * ligatures for Uyghur, or for generating images on the fly.
 *
 * When the cached string is out of date (u8pflen == -1) it is rebuilt from
 * getUwPFstr() with wcstombs(), and u8pfstr and u8pflen are refreshed. If a
 * character cannot be converted, the cached string becomes empty and
 * u8pflen stays -1.
 *
 * The result is a fresh copy (made with strdup) that the caller releases
 * with free().
 */
char *getU8PFstr ( U u ) 
{
   int len ;
   size_t cap, n ;
   wchar_t *wp ;
   char *p ;

   assert ( u ) ;

   if ( u->u8pflen == -1 ) {
      wp = getUwPFstr ( u, &len ) ;

      // MB_CUR_MAX bytes per character is always enough. The former
      // "three bytes per character" only holds for the presentation form
      // range, not for arbitrary characters that pass through unchanged.
      cap = (size_t) len * MB_CUR_MAX + 1 ;
      p = xmalloc ( cap ) ;

      n = wcstombs ( p, wp, cap ) ; 
      free ( wp ) ;

      free ( u->u8pfstr ) ;
      u->u8pfstr = p ;

      if ( n == (size_t) -1 ) {
         p[0] = '\0' ;   // conversion failed; length stays -1
      } else {
         if ( n >= cap ) {
            n = cap - 1 ; // cannot happen with the size above; be safe
         }
         p[n] = '\0' ;
         u->u8pflen = (int) n ; // this was never recorded before
      }
   }

   return strdup(u->u8pfstr) ;
}

// getUwstr -- returns an up-to-date wide-character (UTF-32) string for u
// and stores its length, in wide characters, in *len.
//
// If the cached wide string is out of date it is rebuilt, preferably from
// the UKY string (applying the UKY rules below), otherwise from the UTF-8
// string with mbstowcs(). If the object holds no valid representation at
// all, it is treated as empty text.
//
// The result is a fresh, terminated copy that the caller releases with
// free(); the object's own buffer is never handed out.
wchar_t *getUwstr ( U u, int *len )
{
   int i, j ;
   size_t n ;
   wchar_t *wp ;
   wchar_t wch ;
   char *p ;
   unsigned char prev, cur, next ;

   // flag if we're in word-beginning position, in which case 
   // we should put hamza in front of vowels
   bool  wdbeg = true ; 

   assert ( u ) ;
   assert ( len ) ;

   p = u->ustr ;

   if ( u->uwlen == -1 && u->ulen != -1 ) {
      // In the worst case, the letter hamza (0x0626) can be added in front
      // of each letter, and we need an extra wchar_t to mark end of string
      wp = xmalloc ( (2*u->ulen + 1) * sizeof(wchar_t)) ;

      j = 0 ;
      prev = 0 ;
      for ( i = 0 ; i < u->ulen ; i++ ) {
         cur = p[i] ;
         next = p[i+1] ; // at worst this is the terminating NUL
         wch = 0 ;

         /* In some words that come from foreign languages, such as zhungxua, jiayuguan, etc.,
          * we use medial forms of AA or AE. Compare this to Uyghur word sual, for example.
          * By default, we use beginning forms of AA and AE in such cases, as in normal Uyghur.
          * To force medial forms, put a '|' between vowels, e.g., "shinxu|a".
          */
         if ( cur == '|' && ( prev == 'u' || prev == 'U' ) && 
               ( next == 'a' || next == 'A' || next == 'e' || next == 'E' ) ) {
            wdbeg = false ;
            continue ;
         }

         // add hamza in front of vowels in word-beginning positions
         if ( wdbeg == true ) {
            if ( isvowel(cur) ) { 
               wp[j++] = HAMZA ;
            }
         } else {
            if ( cur == '\'' ) { // wdbeg == false means prev is a non-vowel letter
               /* we try to force a hamza in certain occasions, e.g., compare 
                * suret (picture) and sur'et (velocity). To minimize the effects
                * of this substitution, we only do this if "'" is followed by a
                * vowel and it is not in the word-beginning position. 
                */
               if ( isvowel(next) ) { 
                  wdbeg = false ; // don't add another hamza in next round
                  wp[j++] = HAMZA ;
                  continue ;
               } else if ( isalpha(next) ) {
                  /* Besides, we also want to separate two letters that form
                   * joint letter using "'". For example, to avoid the "ng" 
                   * from being treated as a joint letter NGEE in words
                   * yemenge, yigenge,...,  a "'" can be placed between them.
                   * For example, yemen'ge, yigen'ge,... .
                   */
                  continue ;
               }
            } 
         }

         // a vowel or a non-alphabetic character makes the next position
         // a word beginning
         if ( isvowel(cur) || !isalpha(cur) ) {
            wdbeg = true ;
         } else {
            wdbeg = false ;
         }

         switch ( cur ) {
            case 'c':
            case 'C':
               if ( next == 'h' || next == 'H' ) {
                  wch = CHEE ;
               }
               break ;
            case 'g':
            case 'G':
               if ( next == 'h' || next == 'H' ) {
                  wch = GHEE ;
               }
               break ;
            case 'n':
            case 'N':
               if ( next == 'g' || next == 'G' ) { 
                  /* for cases where we have a sequence of ngh, it could be
                   * translated as either NGEE + EHE or NEE + GHEE. However, the
                   * latter is much more common than the former in Uyghur language
                   * and we opt to translate it as NEE + GHEE. If there is a
                   * need to have NGEE + EHE, a single quote ("'") can be used.
                   */
                  // the cast keeps a negative char away from tolower()
                  if ( tolower ( (unsigned char) p[i+2] ) != 'h' ) {
                     wch = NGEE ;
                  }
               }
               break ;
            case 's':
            case 'S':
               if ( next == 'h' || next == 'H' ) { 
                  wch = SHEE ;
               } else if ( next == 'z' || next == 'Z' ) {
                  // UKY does not provide a unique SZEE, we take joint 
                  // letters "sz" for SZEE, as in purszin [spring (coil)]
                  wch = SZEE ;
               }
               break ;
            default:
               break ;
         }

         if ( wch != 0 ) {
            i++ ; // there is a joint letter, advance index
            wp[j] = wch ;
         } else if ( cmap[cur] ) {
            wp[j] = cmap[cur] ; // no joint letter, but valid UKY
         } else {
            // non-UKY, return whatever is entered. Use the unsigned byte:
            // copying the plain char turned bytes above 0x7F into negative
            // wide characters where char is signed.
            wp[j] = cur ;
         }

         prev = cur ;
         j++ ;
      }

      free ( u->uwstr ) ;

      wp[j] = 0 ;
      u->uwstr = wp ;
      u->uwlen = j ;
   } else if ( u->uwlen == -1 && u->u8len != -1 ) {
      wp = xmalloc ( ((size_t) u->u8len + 1) * sizeof(wchar_t)) ;

      n = mbstowcs ( wp, u->u8str, (size_t) u->u8len ) ;
      if ( n == (size_t) -1 ) {
         // invalid input for the current locale: fall back to empty text
         // instead of writing the terminator at index -1
         n = 0 ;
      }

      free ( u->uwstr ) ;

      wp[n] = 0 ;
      u->uwstr = wp ;
      u->uwlen = (int) n ;
   } else if ( u->uwlen == -1 ) {
      // no representation is valid at all: treat the object as empty text
      // rather than copying a negative number of characters below
      free ( u->uwstr ) ;
      u->uwstr = xmalloc ( sizeof(wchar_t) ) ;
      u->uwstr[0] = 0 ;
      u->uwlen = 0 ;
   }

   *len = u->uwlen ;

   // we are duplicating to protect data type members from being altered 
   // This means they should (or can) be released using free system call. 
   wp = xmalloc ( sizeof(wchar_t) * (u->uwlen + 1) ) ;
   memcpy ( wp, u->uwstr, u->uwlen * sizeof(wchar_t) ) ;

   wp[u->uwlen] = 0 ;

   return wp ;
}

/* getUwPFstr -- returns the wide-character representation of u with the
 * Uyghur letters replaced by their presentation forms, and fills len with
 * the length of the returned string. The result is freshly allocated and
 * is released by the caller with free(). Runs pfinit() on first use.
 *
 * The loop walks the basic-range string once. Each letter is first written
 * in the form chosen by the state bt left behind by the previous letter
 * (isolated or end form). If the previous letter joins forward, the glyph
 * already written for it is then upgraded in place: isolated -> beginning,
 * end -> medial.
 */
wchar_t *getUwPFstr ( U u, int *len )
{
   wchar_t wc, pfwc, prevwc, ppfwc ;
   wchar_t *wp, *pfwp ;
   int i, j, n ;
   char *p ;
   begtype bt ;
   S syn, tsyn, lsyn ;

   if ( !pfinited ) {
      pfinit() ;
   }

   // private copy of the basic-range string; freed before returning
   wp = getUwstr ( u, &n ) ;

   // the output is never longer than the input (ligatures only shrink it)
   pfwp = xmalloc ( (n + 1) * sizeof(wchar_t) ) ;

   // entry of the letter mapped from 'l', needed for the LA ligatures
   lsyn = pform[ CM('l') ] ;   

   // prevwc and ppfwc are only read when bt != WDBEG, which cannot be the
   // case before one letter has been processed and has assigned them
   bt = WDBEG ;
   j = 0 ;
   for ( i = 0 ; i < n ; i++ ) {
      wc  = wp[i] ;
      if ( BPAD <= wc && wc < BMAX ) {
         syn = pform [ wc - BPAD ] ;

         if ( syn != NULL ) {
            // provisional form of the current letter, dictated by the
            // previous letter's btype
            switch ( bt ) {
               case WDBEG:
                  pfwc = syn->iform ;
                  break ;
               case INBEG:
                  pfwc = syn->iform ;
                  break ;
               case NOBEG:
                  pfwc = syn->eform ;
                  break ;
               default:
                  break ;
            }

            /* previous letter does not ask for word-beginning form,
             * and we have to change it to either medial or beginning form,
             * depending on the previous letter's current form.
             */
            //this means the previous letter was a joinable Uyghur letter
            if ( bt != WDBEG ) { 
               tsyn = pform [ prevwc - BPAD ] ;

               // special cases for LA and _LA: the previous glyph is
               // replaced by the ligature and the current letter emits
               // nothing (note that ppfwc/prevwc are not updated here)
               if ( ppfwc == lsyn->iform && wc == cmap['a'] ) {
                  pfwp[j-1] = LA ;
                  bt = WDBEG ;
                  continue ;
               } else if ( ppfwc == lsyn->eform && wc == cmap['a'] ) {
                  pfwp[j-1] = _LA ;
                  bt = WDBEG ;
                  continue ;
               }

               // update previous character
               if ( ppfwc == tsyn->iform ) {
                  pfwp[j-1] = tsyn->bform ;
               } else if ( ppfwc == tsyn->eform ) {
                  pfwp[j-1] = tsyn->mform ;
               }
            }
            bt = syn->btype ; // we will need this in next round
         } else { // a non-Uyghur char in basic range
            pfwc = wc ;
            bt = WDBEG ;
         }
      } else { // not in basic Arabic range ( 0x0600-0x06FF )
         pfwc = wc ;
         bt = WDBEG ;
      }

      pfwp[j] = pfwc ;
      ppfwc   = pfwc ; // previous presentation form wide character
      prevwc  = wc ;
      j++ ;
   }

   pfwp[j] = 0 ;

   free(wp);

   *len = j ;
   return pfwp ;
}

/* getInAscii -- returns an ascii representation of u in which every
 * character above 127 is written as a decimal character reference of the
 * form "&#dddd;" and every other character is copied as a single byte.
 * If useBasic is true, then letters will be encoded in basic Arabic Unicode
 * range, and in presentation form range otherwise.
 *
 * The result is freshly allocated, always terminated (also for empty
 * text), and is released by the caller with free().
 */
char *getInAscii ( U u, bool useBasic ) 
{
   int len, i, n ;
   size_t cap ;
   wchar_t *wp ;
   char *p, *tp ;
   char c ;

   assert ( u ) ;

   if ( useBasic ) {
      wp = getUwstr ( u, &len ) ;
   } else {
      wp = getUwPFstr ( u, &len ) ;
   }

   // Worst case per character: "&#", the decimal digits of a wchar_t (at
   // most three per byte) and ";". The former factors of 7 and 8 assumed
   // four or five digits and overflowed the buffer for larger code points.
   cap = (size_t) len * (3 + 3 * sizeof(wchar_t)) + 1 ;
   p = xmalloc ( cap ) ;

   tp = p ;
   for ( i = 0 ; i < len ; i++ ) {
      if ( wp[i] > 127 ) {
         n = snprintf ( tp, cap - (size_t) (tp - p), "&#%ld;", (long) wp[i] ) ;
         if ( n > 0 ) {
            tp += n ;
         }
      } else {
         // Copy the byte directly. It used to be passed to sprintf() as
         // the format string, which is undefined for a '%' character.
         c = (char) wp[i] ;
         if ( c != '\0' ) {
            *tp++ = c ;
         }
      }
   }
   *tp = '\0' ; // also covers len == 0, where nothing was written

   free(wp) ;

   return p ;
}

// setUstr -- stores a copy of the UKY (ascii) string ukystr in u.
// This method renders the other string members out of date.
// ukystr must not be NULL. It may point into u's current UKY string,
// because the copy is taken before the old buffer is released.
void setUstr  ( U u, const char *ukystr )
{
   size_t n ;
   char *copy ;

   assert ( u ) ;
   assert ( ukystr ) ;

   // Copy through xmalloc so that running out of memory is handled the
   // same way as everywhere else in this file (an unchecked strdup would
   // leave a NULL string behind a valid length).
   n = strlen ( ukystr ) ;
   copy = xmalloc ( n + 1 ) ;
   memcpy ( copy, ukystr, n + 1 ) ;

   free ( u->ustr ) ;
   u->ustr = copy ;
   u->ulen = (int) n ;

   // minus one indicates corresponding string not initialized/updated
   u->u8len = -1 ; 
   u->u8pflen = -1 ; 
   u->uwlen = -1 ;
}

// setU8str -- stores a copy of the UTF-8 string u8str in u.
// The string is validated by converting it with mbstowcs() (which follows
// the current locale); the wide string obtained that way is stored too, so
// both the UTF-8 and the wide representation are valid afterwards, while
// the UKY and presentation form strings are rendered out of date.
// If u8str is not valid it is treated as a UKY string instead (setUstr).
// u8str must not be NULL.
void setU8str ( U u, const char *u8str )
{
   size_t len, n ;
   wchar_t *wp ;
   char *copy ;

   assert ( u ) ;
   assert ( u8str ) ;

   len = strlen ( u8str ) ;

   // A string of len bytes holds at most len characters. Passing len + 1
   // leaves room for the terminator, which was missing whenever every byte
   // was a character of its own (plain ascii input).
   wp = xmalloc ( (len + 1) * sizeof(wchar_t)) ;
   n = mbstowcs ( wp, u8str, len + 1 ) ;

   if ( n == (size_t) -1 ) { // invalid UTF-8 string
      free(wp) ;
      setUstr ( u, u8str ) ;
      return ;
   }
   wp[n] = 0 ; // n <= len, so this is inside the buffer

   // copy before releasing, in case u8str points into the old buffer
   copy = xmalloc ( len + 1 ) ;
   memcpy ( copy, u8str, len + 1 ) ;

   free ( u->u8str ) ;
   u->u8str = copy ;
   u->u8len = (int) len ; 

   // while checking the validity of UTF-8 string, we also got UTF-32
   free ( u->uwstr ) ;
   u->uwstr = wp ;
   u->uwlen = (int) n ;

   // minus one indicates corresponding string not initialized/updated
   u->ulen = -1 ; 
   u->u8pflen = -1 ; 
}

// setUwstr -- replaces the wide (UTF-32) string of u with a terminated copy
// of the first len wide characters of wcstr. len must not be negative.
// This method renders the other string members out of date.
void setUwstr ( U u, const wchar_t *wcstr, int len )
{
   assert ( u ) ;
   assert ( len > -1 ) ;

   wchar_t *copy = xmalloc ( (len + 1) * sizeof(wchar_t) ) ;

   memcpy ( copy, wcstr, len * sizeof(wchar_t) ) ;
   copy[len] = 0 ;

   // drop the previous wide string, if any, and install the copy
   if ( u->uwstr != NULL ) {
      free ( u->uwstr ) ;
   }
   u->uwstr = copy ;
   u->uwlen = len ;

   // every other representation now has to be rebuilt on demand
   u->ulen = u->u8len = u->u8pflen = -1 ;
}

// ulength (U u) -- returns the length of the UKY (ascii) string of u,
// building that string first when it is out of date.
int ulength ( U u ) 
{
   assert ( u ) ;

   if ( u->ulen == -1 ) {
      // getUstr refreshes ustr and ulen as a side effect; the copy it
      // returns is not needed (free accepts NULL)
      free ( getUstr ( u ) ) ;
   }

   return u->ulen ;
}

// u8length (U u) -- returns the value of u8len after giving getU8str a
// chance to rebuild the UTF-8 string when it is marked out of date.
int u8length ( U u )
{
   assert ( u ) ;

   if ( u->u8len == -1 ) {
      // only the side effects of getU8str are wanted; the copy it returns
      // is released right away (free accepts NULL)
      free ( getU8str ( u ) ) ;
   }

   return u->u8len ;
}

// uwlength (U u) -- returns the length, in wide characters, of the UTF-32
// string of u, building that string first when it is out of date.
int uwlength ( U u )
{
   int ignored ;

   assert ( u ) ;

   if ( u->uwlen == -1 ) {
      // getUwstr refreshes uwstr and uwlen as a side effect; the copy it
      // returns is not needed (free accepts NULL)
      free ( getUwstr ( u, &ignored ) ) ;
   }

   return u->uwlen ;
}

// utext_destroy(U u) -- releases every string owned by u and then u itself.
// A NULL u is accepted and ignored on purpose (no abort).
void utext_destroy ( U u )
{
   if ( u == NULL ) {
      return ;
   }

   // free() ignores NULL, so members that were never built need no check
   free ( u->ustr ) ;
   free ( u->u8str ) ;
   free ( u->uwstr ) ;
   free ( u->u8pfstr ) ;

   free ( u ) ;
}

// new_syn -- allocates one presentation-form entry and fills it with the
// isolated (i), beginning (b), medial (m) and end (e) forms and with the
// joining type bt. The entry is obtained from xmalloc.
S new_syn ( wchar_t i, wchar_t b, wchar_t m, wchar_t e, begtype bt )
{
   S entry = xmalloc ( sizeof *entry ) ;

   entry->btype = bt ;
   entry->eform = e ;
   entry->mform = m ;
   entry->bform = b ;
   entry->iform = i ;

   return entry ;
}

/*
 * utility functions below ... 
 */
// xmalloc -- malloc that never returns NULL. When the allocation fails, a
// message is written to stderr and the process exits with status 1, so
// callers do not check the result. Release the memory with free().
void *xmalloc ( size_t n )
{
   void *p ;

   // malloc(0) is allowed to return NULL without being out of memory;
   // ask for one byte so that a NULL result always means failure
   p = malloc ( n ? n : 1 ) ;

   if ( !p ) { // we should not see many of these
      // %zu is the conversion for size_t; %d was a type mismatch
      fprintf ( stderr, "malloc failed for %zu bytes\n", n ) ;
      exit ( 1 ) ;
   }

   return p ;
}

// isvowel -- returns true if ch is the UKY code of a Uyghur vowel
// (a, e, 'e, i, o, :o, u, :u in either case), and false otherwise.
bool isvowel ( int ch )
{
   switch ( ch ) {
      case 'a':   case 'A':
      case 'e':   case 'E':
      case PRIMe: case PRIME:
      case 'i':   case 'I':
      case 'o':   case 'O':
      case COLo:  case COLO:
      case 'u':   case 'U':
      case COLu:  case COLU:
         return true ;
      default:
         return false ;
   }
}
