/*
* utext.c -- Uyghur Text Processing Library, C version, C source file
* $Author: Muhammad Abdulla
* $version: 1.1
* License: GPL
*/
#include <assert.h>
#include <ctype.h>
#include <errno.h>
#include <locale.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <wchar.h>
#include "utext.h"
// Short aliases used throughout this file for the two handle types.
#define U UText
#define S Syntax
// Number of entries in each lookup table (cmap, cmapinv, pform).
#define BASELEN 256
// Code points of the letters that UKY writes as two-letter combinations
// ("ch", "gh", "ng", "sh", "sz"); see the switch in getUwstr().
#define CHEE (0x0686)
#define GHEE (0x063A)
#define NGEE (0x06AD)
#define SHEE (0x0634)
#define SZEE (0x0698)
// Code points substituted by getUwPFstr() for the letter mapped from 'l'
// followed by the letter mapped from 'a': LA when the first letter is in its
// isolated form, _LA when it is in its end form.
#define LA (0xFEFB)
#define _LA (0xFEFC)
// The letter hamza, inserted by getUwstr() in front of word-beginning vowels.
#define HAMZA (0x0626)
// Bounds of the basic Arabic range (0x0600-0x06FF). BPAD is subtracted from a
// code point to obtain an index into cmapinv[] and pform[].
#define BPAD (0x0600)
#define BMAX (0x06FF)
// Number of elements of a true array (not valid for pointers).
#define NELEMS(x) ((sizeof(x))/(sizeof((x)[0])))
// Index into pform[] for the letter that UKY byte x maps to. Only meaningful
// after init() has filled cmap[] and only for bytes that have a mapping.
#define CM(x) (cmap[(x)]-BPAD)
// Byte values above the ASCII range that UKY input uses for the remaining
// vowels; init() maps each lower/upper pair to a single Uyghur letter.
// NOTE(review): the values look like the Latin-1 codes of e-acute, o-umlaut
// and u-umlaut -- confirm against the UKY specification.
enum {
PRIMe = 233, // 'e
PRIME = 201, // 'E
COLo = 246, // :o
COLO = 214, // :O
COLu = 252, // :u
COLU = 220 // :U
} ;
// One piece of text held in up to four cached encodings. Each string has a
// matching length field; a length of -1 marks that encoding as not yet
// created or out of date (see zinit() and the set*() functions).
struct U {
char *ustr ; // UKY (ASCII) string
char *u8str ; // UTF-8 string, letters in the basic range
char *u8pfstr ; // UTF-8 string, letters in presentation form
wchar_t *uwstr ; // wide-character string
int ulen ; // length of ustr, or -1
int u8len ; // length of u8str, or -1
int u8pflen ; // length of u8pfstr, or -1
int uwlen ; // length of uwstr in wide characters, or -1
} ;
// Handle type: a pointer to one presentation-form table entry.
typedef struct S *S ;
// Join behaviour a letter imposes on the letter that follows it, as used by
// getUwPFstr():
//   WDBEG -- the next letter starts in its isolated form and this letter is
//            left untouched;
//   INBEG -- the next letter starts in its isolated form, and this letter is
//            switched to its beginning/medial form;
//   NOBEG -- the next letter starts in its end form, and this letter is
//            switched to its beginning/medial form.
typedef enum { WDBEG, INBEG, NOBEG } begtype ;
// this is for rendering Uyghur letters in presentation form
// (also called extended region)
struct S {
wchar_t iform ; // isolated form
wchar_t bform ; // beginning form
wchar_t mform ; // medial form
wchar_t eform ; // end form
begtype btype ; // specifies in which form the next letter should be
} ;
// cmap[b]: code point for UKY byte b, or 0 when b has no mapping.
wchar_t cmap[BASELEN] ;
// cmapinv[cp - BPAD]: UKY byte for code point cp, or 0 when there is none.
unsigned char cmapinv[BASELEN] ;
// pform[cp - BPAD]: presentation forms of code point cp, or NULL.
S pform[BASELEN] ;
/*
* forward function declarations
*/
void *xmalloc ( size_t n ) ;
S new_syn ( wchar_t i, wchar_t b, wchar_t m, wchar_t e, begtype bt ) ;
bool isvowel ( int ch ) ;
int inited = 0 ; // set by init(); guards against repeated initialization
int pfinited = 0 ; // set by pfinit(); guards against repeated initialization
// init() -- build the UKY-to-Unicode table (cmap) and its inverse (cmapinv),
// and select a UTF-8 locale so that wcstombs()/mbstowcs() convert UTF-8.
//
// Side effects: sets the `inited` flag and changes the process locale
// (LC_ALL). Callers check `inited` first, so this runs once per process.
void init ( )
{
    // Every letter is reachable through an upper-case and a lower-case key.
    static const struct {
        unsigned char upper ;
        unsigned char lower ;
        wchar_t code ;
    } letters[] = {
        { 'A', 'a', 0x0627 }, { 'B', 'b', 0x0628 }, { 'C', 'c', 0x0643 },
        { 'D', 'd', 0x062F }, { 'E', 'e', 0x06D5 }, { 'F', 'f', 0x0641 },
        { 'G', 'g', 0x06AF }, { 'H', 'h', 0x06BE }, { 'I', 'i', 0x0649 },
        { 'J', 'j', 0x062C }, { 'K', 'k', 0x0643 }, { 'L', 'l', 0x0644 },
        { 'M', 'm', 0x0645 }, { 'N', 'n', 0x0646 }, { 'O', 'o', 0x0648 },
        { 'P', 'p', 0x067E }, { 'Q', 'q', 0x0642 }, { 'R', 'r', 0x0631 },
        { 'S', 's', 0x0633 }, { 'T', 't', 0x062A }, { 'U', 'u', 0x06C7 },
        { 'V', 'v', 0x06CB }, { 'W', 'w', 0x06CB }, { 'X', 'x', 0x062E },
        { 'Y', 'y', 0x064A }, { 'Z', 'z', 0x0632 },
        { PRIME, PRIMe, 0x06D0 },   // 'E  'e
        { COLO,  COLo,  0x06C6 },   // :O  :o
        { COLU,  COLu,  0x06C8 }    // :U  :u
    } ;
    size_t i ;
    wchar_t wc ;

    // mark initialized, so that it won't be called again
    inited = true ;

    // The conversions in this file only work in a UTF-8 locale. Keep the
    // historical first choice, fall back to the generic UTF-8 locale, and
    // report the problem instead of failing silently later on.
    if ( setlocale ( LC_ALL, "en_US.UTF-8" ) == NULL &&
         setlocale ( LC_ALL, "C.UTF-8" ) == NULL ) {
        fprintf ( stderr,
                  "utext: no UTF-8 locale available, UTF-8 conversions may fail\n" ) ;
    }

    memset ( cmap, 0, sizeof(cmap) ) ;
    memset ( cmapinv, 0, sizeof(cmapinv) ) ;

    for ( i = 0 ; i < NELEMS(letters) ; i++ ) {
        cmap[ letters[i].upper ] = letters[i].code ;
        cmap[ letters[i].lower ] = letters[i].code ;
    }

    // Uyghur punctuation marks
    cmap[ ';' ] = 0x061B ;
    cmap[ '?' ] = 0x061F ;
    cmap[ ',' ] = 0x060C ;

    // The inverse of the cmap table, to speed up lookups. It is indexed by
    // (code point - BPAD) so that it needs only BASELEN bytes. When several
    // bytes map to the same letter, the highest byte value wins because the
    // scan runs in ascending order (so 'a' beats 'A', 'k' beats 'c').
    for ( i = 0 ; i < NELEMS(cmap) ; i++ ) {
        wc = cmap[i] ;
        if ( wc >= BPAD && (size_t) ( wc - BPAD ) < NELEMS(cmapinv) ) {
            cmapinv[ wc - BPAD ] = (unsigned char) i ;
        }
    }
}
// pfinit() -- initialize the presentation form table.
// We kept this separate from init() to keep init() lightweight,
// as not every utext type is used to generate texts in presentation form.
void pfinit ( )
{
// S new_syn ( wchar_t i, wchar_t b, wchar_t m, wchar_t e, begtype bt ) ;
int i ;
wchar_t wc ;
pfinited = true ;
for ( i = 0 ; i < NELEMS(pform) ; i++ ) {
pform[i] = NULL ;
}
pform[ CM('a') ] = new_syn ( 0xFE8D, 0xFE8D, 0xFE8D, 0xFE8E, WDBEG ) ;
pform[ CM('e') ] = new_syn ( 0xFEE9, 0xFEE9, 0xFEE9, 0xFEEA, WDBEG ) ;
pform[ CM('b') ] = new_syn ( 0xFE8F, 0xFE91, 0xFE92, 0xFE90, NOBEG ) ;
pform[ CM('p') ] = new_syn ( 0xFB56, 0xFB58, 0xFB59, 0xFB57, NOBEG ) ;
pform[ CM('t') ] = new_syn ( 0xFE95, 0xFE97, 0xFE98, 0xFE96, NOBEG ) ;
pform[ CM('j') ] = new_syn ( 0xFE9D, 0xFE9F, 0xFEA0, 0xFE9E, NOBEG ) ;
pform[ CHEE-BPAD ] = new_syn ( 0xFB7A, 0xFB7C, 0xFB7D, 0xFB7B, NOBEG ) ;
pform[ CM('x') ] = new_syn ( 0xFEA5, 0xFEA7, 0xFEA8, 0xFEA6, NOBEG ) ;
pform[ CM('d') ] = new_syn ( 0xFEA9, 0xFEA9, 0xFEAA, 0xFEAA, INBEG ) ;
pform[ CM('r') ] = new_syn ( 0xFEAD, 0xFEAD, 0xFEAE, 0xFEAE, INBEG ) ;
pform[ CM('z') ] = new_syn ( 0xFEAF, 0xFEAF, 0xFEB0, 0xFEB0, INBEG ) ;
pform[ SZEE-BPAD ] = new_syn ( 0xFB8A, 0xFB8A, 0xFB8B, 0xFB8B, INBEG ) ;
pform[ CM('s') ] = new_syn ( 0xFEB1, 0xFEB3, 0xFEB4, 0xFEB2, NOBEG ) ;
pform[ SHEE-BPAD ] = new_syn ( 0xFEB5, 0xFEB7, 0xFEB8, 0xFEB6, NOBEG ) ;
pform[ GHEE-BPAD ] = new_syn ( 0xFECD, 0xFECF, 0xFED0, 0xFECE, NOBEG ) ;
pform[ CM('f') ] = new_syn ( 0xFED1, 0xFED3, 0xFED4, 0xFED2, NOBEG ) ;
pform[ CM('q') ] = new_syn ( 0xFED5, 0xFED7, 0xFED8, 0xFED6, NOBEG ) ;
pform[ CM('k') ] = new_syn ( 0xFED9, 0xFEDB, 0xFEDC, 0xFEDA, NOBEG ) ;
pform[ CM('g') ] = new_syn ( 0xFB92, 0xFB94, 0xFB95, 0xFB93, NOBEG ) ;
pform[ NGEE-BPAD ] = new_syn ( 0xFBD3, 0xFBD5, 0xFBD6, 0xFBD4, NOBEG ) ;
pform[ CM('l') ] = new_syn ( 0xFEDD, 0xFEDF, 0xFEE0, 0xFEDE, NOBEG ) ;
pform[ CM('m') ] = new_syn ( 0xFEE1, 0xFEE3, 0xFEE4, 0xFEE2, NOBEG ) ;
pform[ CM('n') ] = new_syn ( 0xFEE5, 0xFEE7, 0xFEE8, 0xFEE6, NOBEG ) ;
pform[ CM('h') ] = new_syn ( 0xFEEB, 0xFEEB, 0xFEEC, 0xFEEC, NOBEG ) ;
//pform[ CM('h') ] = new_syn ( 0xFBAA, 0xFBAA, 0xFBAD, 0xFBAD, NOBEG ) ;
pform[ CM('o') ] = new_syn ( 0xFEED, 0xFEED, 0xFEEE, 0xFEEE, INBEG ) ;
pform[ CM('u') ] = new_syn ( 0xFBD7, 0xFBD7, 0xFBD8, 0xFBD8, INBEG ) ;
pform[ CM(COLo) ] = new_syn ( 0xFBD9, 0xFBD9, 0xFBDA, 0xFBDA, INBEG ) ;
pform[ CM(COLu) ] = new_syn ( 0xFBDB, 0xFBDB, 0xFBDC, 0xFBDC, INBEG ) ;
pform[ CM('w') ] = new_syn ( 0xFBDE, 0xFBDE, 0xFBDF, 0xFBDF, INBEG ) ;
pform[ CM(PRIMe) ] = new_syn ( 0xFBE4, 0xFBE6, 0xFBE7, 0xFBE5, NOBEG ) ;
pform[ CM('i') ] = new_syn ( 0xFEEF, 0xFBE8, 0xFBE9, 0xFEF0, NOBEG ) ;
pform[ CM('y') ] = new_syn ( 0xFEF1, 0xFEF3, 0xFEF4, 0xFEF2, NOBEG ) ;
pform[ HAMZA-BPAD ] = new_syn ( 0xFE8B, 0xFE8B, 0xFE8C, 0xFB8C, NOBEG ) ;
}
/*
 * zinit ( U u ) -- put u into its "nothing cached" state: every string
 * pointer becomes NULL and every length becomes -1. Nothing is freed, so this
 * is meant for freshly allocated objects only.
 */
void zinit ( U u )
{
    assert ( u != NULL ) ;

    u->ustr = u->u8str = u->u8pfstr = NULL ;
    u->uwstr = NULL ;

    u->ulen = u->u8len = u->u8pflen = u->uwlen = -1 ;
}
/*
* new_utext ( const char *ukystr ) -- creates a new UText from UKY string ukystr
* ulen will be updated, but u8len and uwlen are set to -1 to indicate that utf8str
* and uwstr are not created/updated since ustr is updated, respectively. Similar
* methods are used throughout the implementation.
*/
U new_utext ( const char *ukystr )
{
U utext ;
if ( !inited ) {
init() ;
}
utext = ( U ) xmalloc ( sizeof ( *utext ) ) ;
zinit(utext);
setUstr ( utext, ukystr ) ;
return utext ;
}
// new_u8text -- returns a new utext type u, whose u8str member
// initialized by u8str which is supposed to be in UTF-8 format.
U new_u8text ( const char *u8str )
{
U utext ;
if ( !inited ) {
init() ;
}
utext = ( U ) xmalloc ( sizeof ( *utext ) ) ;
zinit(utext);
setU8str ( utext, u8str ) ;
return utext ;
}
// new_uwtext -- returns a new utext type u, whose uwstr member
// initialized by wcstr which is supposed to be in UTF-32 format.
U new_uwtext ( const wchar_t *wcstr, int len )
{
U utext ;
if ( !inited ) {
init() ;
}
utext = ( U ) xmalloc ( sizeof ( *utext ) ) ;
zinit(utext);
utext->uwstr = (wchar_t *) xmalloc ( sizeof(wchar_t) * (len + 1) ) ;
memcpy ( (void *)utext->uwstr, (void *)wcstr, len * sizeof(wchar_t) ) ;
utext->uwstr[len] = 0 ;
utext->uwlen = len ;
// minus one indicates corresponding string not initialized/updated
utext->ulen = -1 ;
utext->u8len = -1 ;
utext->u8pflen = -1 ;
return utext ;
}
// getUstr -- returns up-to-date ascii (UKY) string of u
//
// When the cached UKY string is stale it is rebuilt from the wide string:
// hamza is dropped, letters found in cmapinv[] are mapped back to their UKY
// byte, and everything else is truncated to a single byte.
// NOTE(review): the joint letters (CHEE, GHEE, NGEE, SHEE, SZEE) have no entry
// in cmap[] and therefore none in cmapinv[], so they take the truncating
// branch instead of becoming "ch", "gh", ... -- confirm whether that is wanted.
//
// Returns a malloc'ed copy; the caller releases it with free().
char *getUstr ( U u )
{
    wchar_t wc ;
    wchar_t *wp ;
    int i, j, len ;
    char *p ;

    assert ( u ) ;
    // We are duplicating to protect data type members from being altered.
    // This means they should (or can) be released using free system call.
    if ( u->ulen != -1 ) {
        return strdup ( u->ustr ) ;
    }
    wp = getUwstr ( u, &len ) ;
    p = xmalloc ( (size_t) len + 1 ) ;
    j = 0 ;
    for ( i = 0 ; i < len ; i++ ) {
        wc = wp[i] ;
        if ( wc == HAMZA ) {
            continue ;
        }
        if ( BPAD <= wc && wc < BMAX && cmapinv[wc-BPAD] != 0 ) {
            p[j] = (char) cmapinv[wc-BPAD] ;
        } else {
            p[j] = (char) (unsigned char) wc ;
        }
        j++ ;
    }
    // The buffer comes from malloc() and is handed to strdup() below, so it
    // has to be terminated explicitly (j <= len, so the slot exists).
    p[j] = '\0' ;
    free ( wp ) ;
    free ( u->ustr ) ;
    u->ustr = p ;
    u->ulen = j ;
    return strdup ( u->ustr ) ;
}
// getU8str -- returns up-to-date UTF-8 string of u
//
// The UTF-8 string is derived from the wide string on demand and cached in
// u->u8str together with its byte length in u->u8len. If the wide string
// cannot be converted in the current locale, the cached result is the empty
// string.
//
// Returns a malloc'ed copy; the caller releases it with free().
char *getU8str ( U u )
{
    int len ;
    size_t cap, n ;
    wchar_t *wp ;
    char *p ;

    assert ( u ) ;
    if ( u->u8len == -1 ) {
        wp = getUwstr ( u, &len ) ;
        // Size for the locale's worst case rather than assuming two bytes
        // per character: the text may contain characters outside the basic
        // range. With this capacity wcstombs() can never fill the buffer
        // completely, so n is always a valid index below.
        cap = (size_t) len * MB_CUR_MAX + 1 ;
        p = xmalloc ( cap ) ;
        n = wcstombs ( p, wp, cap ) ;
        if ( n == (size_t) -1 ) {
            n = 0 ;     // unconvertible character: buffer contents are undefined
        }
        p[n] = '\0' ;
        free ( u->u8str ) ;
        u->u8str = p ;
        u->u8len = (int) n ;    // marks the cache as valid
        free ( wp ) ;
    }
    // we are duplicating to protect data type members from being altered
    // This means they should (or can) be released using free system call.
    return strdup ( u->u8str ) ;
}
/* getU8PFstr -- returns up-to-date UTF-8 string of u in presentation form
 * range. The method getU8str returns Uyghur characters in the basic range and
 * delegates the layout to the user agent or the operating system. This is
 * recommended by the Unicode Consortium to avoid problems in searching and
 * sorting, etc. However, in some special cases, a processed version that
 * renders the letters in the presentation form region of the Unicode table is
 * necessary, e.g. for environments that do not support ligatures for Uyghur
 * or for generating images on the fly.
 *
 * The result is cached in u->u8pfstr with its byte length in u->u8pflen. If
 * the text cannot be converted in the current locale, the cached result is
 * the empty string.
 *
 * Returns a malloc'ed copy; the caller releases it with free().
 */
char *getU8PFstr ( U u )
{
    int len ;
    size_t cap, n ;
    wchar_t *wp ;
    char *p ;

    assert ( u ) ;
    if ( u->u8pflen == -1 ) {
        wp = getUwPFstr ( u, &len ) ;
        // Size for the locale's worst case rather than assuming three bytes
        // per character. With this capacity wcstombs() can never fill the
        // buffer completely, so n is always a valid index below.
        cap = (size_t) len * MB_CUR_MAX + 1 ;
        p = xmalloc ( cap ) ;
        n = wcstombs ( p, wp, cap ) ;
        if ( n == (size_t) -1 ) {
            n = 0 ;     // unconvertible character: buffer contents are undefined
        }
        p[n] = '\0' ;
        free ( u->u8pfstr ) ;
        u->u8pfstr = p ;
        u->u8pflen = (int) n ;  // marks the cache as valid
        free ( wp ) ;
    }
    // we are duplicating to protect data type members from being altered
    // This means they should (or can) be released using free system call.
    return strdup ( u->u8pfstr ) ;
}
// getUwstr -- returns up-to-date UTF-32 string of u
//
// If the cached wide string is stale it is rebuilt, from the UKY string when
// that one is valid and otherwise from the UTF-8 string. *len receives the
// number of wide characters (terminator not counted).
//
// Returns a malloc'ed copy; the caller releases it with free().
wchar_t *getUwstr ( U u, int *len )
{
    int i, j ;
    size_t n ;
    wchar_t *wp ;
    wchar_t wch ;
    char *p ;
    unsigned char prev, cur, next ;
    // flag if we're in word-beginning position, in which case
    // we should put hamza in front of vowels
    bool wdbeg = true ;

    assert ( u ) ;
    assert ( len ) ;
    p = u->ustr ;
    if ( u->uwlen == -1 && u->ulen != -1 ) {
        // In the worst case, the letter hamza (0x0626) can be added in front
        // of each letter, and we need an extra wchar_t to mark end of string
        wp = xmalloc ( ( 2 * (size_t) u->ulen + 1 ) * sizeof(wchar_t) ) ;
        j = 0 ;
        prev = 0 ;
        for ( i = 0 ; i < u->ulen ; i++ ) {
            cur = (unsigned char) p[i] ;
            next = (unsigned char) p[i+1] ;     // at worst the terminating NUL
            wch = 0 ;
            /* In some words that come from foreign languages, such as zhungxua, jiayuguan, etc.,
             * we use medial forms of AA or AE. Compare this to Uyghur word sual, for example.
             * By default, we use beginning forms of AA and AE in such cases, as in normal Uyghur.
             * To force medial forms, put a '|' between vowels, e.g., "shinxu|a".
             */
            if ( cur == '|' && ( prev == 'u' || prev == 'U' ) &&
                 ( next == 'a' || next == 'A' || next == 'e' || next == 'E' ) ) {
                wdbeg = false ;
                continue ;
            }
            // add hamza in front of vowels in word-beginning positions
            if ( wdbeg == true ) {
                if ( isvowel(cur) ) {
                    wp[j++] = HAMZA ;
                }
            } else {
                if ( cur == '\'' ) { // wdbeg == false means prev is a non-vowel letter
                    /* we try to force a hamza in certain occasions, e.g., compare
                     * suret (picture) and sur'et (velocity). To minimize the effects
                     * of this substitution, we only do this if "'" is followed by a
                     * vowel and it is not in the word-beginning position.
                     */
                    if ( isvowel(next) ) {
                        wdbeg = false ; // don't add another hamza in next round
                        wp[j++] = HAMZA ;
                        continue ;
                    } else if ( isalpha(next) ) {
                        /* Besides, we also want to separate two letters that form
                         * joint letter using "'". For example, to avoid the "ng"
                         * from being treated as a joint letter NGEE in words
                         * yemenge, yigenge,..., a "'" can be placed between them.
                         * For example, yemen'ge, yigen'ge,... .
                         */
                        continue ;
                    }
                }
            }
            // AA, AE, and non-alpha-numeric letters makes word beginning
            if ( isvowel(cur) || !isalpha(cur) ) {
                wdbeg = true ;
            } else {
                wdbeg = false ;
            }
            switch ( cur ) {
            case 'c':
            case 'C':
                if ( next == 'h' || next == 'H' ) {
                    wch = CHEE ;
                }
                break ;
            case 'g':
            case 'G':
                if ( next == 'h' || next == 'H' ) {
                    wch = GHEE ;
                }
                break ;
            case 'n':
            case 'N':
                if ( next == 'g' || next == 'G' ) {
                    /* for cases where we have a sequence of ngh, it could be
                     * translated as either NGEE + EHE or NEE + GHEE. However, the
                     * latter is much more common than the former in Uyghur language
                     * and we opt to translate it as NEE + GHEE. If there is a
                     * need to have NGEE + EHE, a single quote ("'") can be used.
                     */
                    // next is a letter here, so p[i+2] is at worst the NUL.
                    // <ctype.h> functions need an unsigned char value.
                    if ( tolower ( (unsigned char) p[i+2] ) != 'h' ) {
                        wch = NGEE ;
                    }
                }
                break ;
            case 's':
            case 'S':
                if ( next == 'h' || next == 'H' ) {
                    wch = SHEE ;
                } else if ( next == 'z' || next == 'Z' ) {
                    // UKY does not provide a unique SZEE, we take joint
                    // letters "sz" for SZEE, as in purszin [spring (coil)]
                    wch = SZEE ;
                }
                break ;
            default:
                break ;
            }
            if ( wch != 0 ) {
                i++ ;                   // there is a joint letter, advance index
                wp[j] = wch ;
            } else if ( cmap[cur] ) {
                wp[j] = cmap[cur] ;     // no joint letter, but valid UKY
            } else {
                // non-UKY, return whatever is entered. Use the unsigned byte:
                // reading it through plain char would sign-extend bytes above
                // 127 into negative wide characters.
                wp[j] = cur ;
            }
            prev = cur ;
            j++ ;
        }
        free ( u->uwstr ) ;
        wp[j] = 0 ;
        u->uwstr = wp ;
        u->uwlen = j ;
    } else if ( u->uwlen == -1 && u->u8len != -1 ) {
        wp = xmalloc ( ( (size_t) u->u8len + 1 ) * sizeof(wchar_t) ) ;
        n = mbstowcs ( wp, u->u8str, (size_t) u->u8len ) ;
        if ( n == (size_t) -1 ) {
            n = 0 ;     // invalid multibyte input: fall back to an empty string
        }
        wp[n] = 0 ;     // n <= u8len, so this stays inside the buffer
        free ( u->uwstr ) ;
        u->uwstr = wp ;
        u->uwlen = (int) n ;
    }
    *len = u->uwlen ;
    // we are duplicating to protect data type members from being altered
    // This means they should (or can) be released using free system call.
    wp = (wchar_t *) xmalloc ( sizeof(wchar_t) * (u->uwlen + 1) ) ;
    memcpy ( (void *)wp, (void *)u->uwstr, u->uwlen * sizeof(wchar_t) ) ;
    wp[u->uwlen] = 0 ;
    return wp ;
}
/* getUwPFstr -- returns a newly allocated wide string in which every letter
 * of u that has an entry in pform[] is replaced by one of its presentation
 * forms, and fills len with the number of wide characters written (which can
 * be smaller than the length of the basic string, see LA/_LA below).
 * The result is not cached; the caller releases it with free().
 * Runs pfinit() on first use.
 * NOTE(review): u is not asserted here; it is passed straight to getUwstr(),
 * which asserts it.
 */
wchar_t *getUwPFstr ( U u, int *len )
{
wchar_t wc, pfwc, prevwc, ppfwc ;
wchar_t *wp, *pfwp ;
int i, j, n ;
char *p ;
begtype bt ;
S syn, tsyn, lsyn ;
if ( !pfinited ) {
pfinit() ;
}
wp = getUwstr ( u, &n ) ;
// output never has more characters than the input
pfwp = xmalloc ( (n + 1) * sizeof(wchar_t) ) ;
lsyn = pform[ CM('l') ] ;
// bt is the join behaviour requested by the previous letter; WDBEG also
// serves as the "nothing to join with" state at the start and after any
// character that has no presentation forms.
bt = WDBEG ;
j = 0 ;
for ( i = 0 ; i < n ; i++ ) {
wc = wp[i] ;
if ( BPAD <= wc && wc < BMAX ) {
syn = pform [ wc - BPAD ] ;
if ( syn != NULL ) {
// Provisional form of the current letter: isolated unless the previous
// letter asked for an end form. It may still be changed when the next
// letter is processed.
switch ( bt ) {
case WDBEG:
pfwc = syn->iform ;
break ;
case INBEG:
pfwc = syn->iform ;
break ;
case NOBEG:
pfwc = syn->eform ;
break ;
default:
break ;
}
/* previous letter does not ask for word-beginning form,
* and we have to change it to either medial or beginning form,
* depending on the previous letter's current form.
*/
// this means the previous letter was a joinable Uyghur letter; prevwc and
// ppfwc were assigned at the end of the iteration that set bt, so they are
// never read uninitialized
if ( bt != WDBEG ) {
tsyn = pform [ prevwc - BPAD ] ;
// special cases for LA and _LA: the previously emitted letter is overwritten
// with the combined code point and the current letter is not emitted
// (the continue skips the tail of the loop, so j does not advance)
if ( ppfwc == lsyn->iform && wc == cmap['a'] ) {
pfwp[j-1] = LA ;
bt = WDBEG ;
continue ;
} else if ( ppfwc == lsyn->eform && wc == cmap['a'] ) {
pfwp[j-1] = _LA ;
bt = WDBEG ;
continue ;
}
// update previous character: isolated becomes beginning, end becomes medial
if ( ppfwc == tsyn->iform ) {
pfwp[j-1] = tsyn->bform ;
} else if ( ppfwc == tsyn->eform ) {
pfwp[j-1] = tsyn->mform ;
}
}
bt = syn->btype ; // we will need this in next round
} else { // a non-Uyghur char in basic range
pfwc = wc ;
bt = WDBEG ;
}
} else { // not in basic Arabic range ( 0x0600-0x06FF )
pfwc = wc ;
bt = WDBEG ;
}
pfwp[j] = pfwc ;
ppfwc = pfwc ; // previous presentation form wide character
prevwc = wc ;
j++ ;
}
pfwp[j] = 0 ;
free(wp);
*len = j ;
return pfwp ;
}
/* getInAscii -- returns a pure-ASCII representation of u in which every
 * character above 127 is written as a decimal reference of the form
 * "&#dddd;" and every other character is copied as a single byte.
 * If useBasic is true, then letters will be encoded in basic Arabic Unicode
 * range, and in presentation form range otherwise.
 *
 * Returns a malloc'ed, NUL-terminated string (empty for empty input); the
 * caller releases it with free().
 */
char *getInAscii ( U u, bool useBasic )
{
    // Longest possible reference: "&#" + 10 decimal digits + ";".
    enum { MAXREF = 13 } ;
    int len, i, n ;
    size_t cap, used ;
    wchar_t *wp ;
    char *p ;

    assert ( u ) ;
    if ( useBasic ) {
        wp = getUwstr ( u, &len ) ;
    } else {
        wp = getUwPFstr ( u, &len ) ;
    }
    // Size for the worst case instead of assuming 4- or 5-digit code points;
    // the text may contain characters from any range.
    cap = (size_t) len * MAXREF + 1 ;
    p = xmalloc ( cap ) ;
    used = 0 ;
    for ( i = 0 ; i < len ; i++ ) {
        if ( wp[i] > 127 ) {
            n = snprintf ( p + used, cap - used, "&#%ld;", (long) wp[i] ) ;
            if ( n < 0 || (size_t) n >= cap - used ) {
                break ;     // cannot happen with the capacity above; stay safe
            }
            used += (size_t) n ;
        } else {
            // Copy the byte directly. It must never be used as a format
            // string: a literal '%' in the text would be undefined behaviour.
            unsigned char ch = (unsigned char) wp[i] ;
            if ( ch != 0 ) {
                p[used++] = (char) ch ;
            }
        }
    }
    p[used] = '\0' ;
    free ( wp ) ;
    return p ;
}
// setUstr -- set ascii string of u (accepts ascii string)
// u keeps its own copy of ukystr, which must not be NULL.
// This method renders other string members out of date.
void setUstr ( U u, const char *ukystr )
{
    size_t n ;
    char *copy ;

    assert ( u ) ;
    assert ( ukystr ) ;
    // Copy before releasing the old buffer so that passing u's own string
    // back in is safe, and allocate through xmalloc() so that running out of
    // memory is handled the same way as everywhere else in this file.
    n = strlen ( ukystr ) ;
    copy = xmalloc ( n + 1 ) ;
    memcpy ( copy, ukystr, n + 1 ) ;
    free ( u->ustr ) ;
    u->ustr = copy ;
    u->ulen = (int) n ;
    // minus one indicates corresponding string not initialized/updated
    u->u8len = -1 ;
    u->u8pflen = -1 ;
    u->uwlen = -1 ;
}
// setU8str -- sets UTF-8 string of u (accepts UTF-8 string)
// u keeps its own copy of u8str, which must not be NULL. The string is
// validated by converting it to wide characters; the conversion result is
// kept as the wide string of u. A string that is not valid in the current
// locale is stored through setUstr() instead.
// This method renders other string members out of date.
void setU8str ( U u, const char *u8str )
{
    size_t n, len ;
    wchar_t *wp ;
    char *copy ;

    assert ( u ) ;
    assert ( u8str ) ;
    len = strlen ( u8str ) ;
    wp = xmalloc ( ( len + 1 ) * sizeof(wchar_t) ) ;
    // A string of len bytes yields at most len wide characters, so with room
    // for len + 1 the terminator is always written as well.
    n = mbstowcs ( wp, u8str, len + 1 ) ;
    if ( n == (size_t) -1 ) {   // invalid UTF-8 string
        free ( wp ) ;
        setUstr ( u, u8str ) ;
        return ;
    }
    wp[n] = 0 ;
    // Copy before releasing the old buffer so that passing u's own string
    // back in is safe.
    copy = xmalloc ( len + 1 ) ;
    memcpy ( copy, u8str, len + 1 ) ;
    free ( u->u8str ) ;
    u->u8str = copy ;
    u->u8len = (int) len ;
    // while checking the validity of UTF-8 string, we also got UTF-32
    free ( u->uwstr ) ;
    u->uwstr = wp ;
    u->uwlen = (int) n ;
    // minus one indicates corresponding string not initialized/updated
    u->ulen = -1 ;
    u->u8pflen = -1 ;
}
// setUwstr -- replace the wide (UTF-32) string of u with a copy of the first
// len characters of wcstr; len must not be negative.
// All other cached strings are marked out of date.
void setUwstr ( U u, const wchar_t *wcstr, int len )
{
    wchar_t *copy ;

    assert ( u ) ;
    assert ( len > -1 ) ;

    // Build the new buffer first, then drop the old one.
    copy = (wchar_t *) xmalloc ( sizeof(wchar_t) * (len + 1) ) ;
    memcpy ( copy, wcstr, len * sizeof(wchar_t) ) ;
    copy[len] = 0 ;

    free ( u->uwstr ) ;     // free(NULL) is a no-op
    u->uwstr = copy ;
    u->uwlen = len ;

    // minus one indicates corresponding string not initialized/updated
    u->ulen = -1 ;
    u->u8len = -1 ;
    u->u8pflen = -1 ;
}
// ulength (U u) -- length of the ascii (UKY) string of u, rebuilding that
// string first when it is marked out of date.
int ulength ( U u )
{
    assert ( u ) ;
    if ( u->ulen != -1 ) {
        return u->ulen ;
    }
    // getUstr() refreshes ustr and ulen as a side effect; the copy it
    // returns is not needed here.
    free ( getUstr ( u ) ) ;
    return u->ulen ;
}
// u8length (U u) -- returns up-to-date length (in bytes) of UTF-8 string
int u8length ( U u )
{
    char *p ;

    assert ( u ) ;
    if ( u->u8len == -1 ) {
        // getU8str() refreshes u8str. Take the length from the copy it
        // returns if it did not record one itself, so that this function
        // never reports -1 for a string that has just been built.
        if ( (p = getU8str ( u )) != NULL ) {
            if ( u->u8len == -1 ) {
                u->u8len = (int) strlen ( p ) ;
            }
            free ( p ) ;
        }
    }
    return u->u8len ;
}
// uwlength (U u) -- length of the wide (UTF-32) string of u in characters,
// rebuilding that string first when it is marked out of date.
int uwlength ( U u )
{
    int ignored ;

    assert ( u ) ;
    if ( u->uwlen != -1 ) {
        return u->uwlen ;
    }
    // getUwstr() refreshes uwstr and uwlen as a side effect; the copy it
    // returns is not needed here.
    free ( getUwstr ( u, &ignored ) ) ;
    return u->uwlen ;
}
// utext_destroy(U u) -- release u together with every string it owns.
// A NULL u is accepted and ignored rather than treated as an error.
void utext_destroy ( U u )
{
    if ( u == NULL ) {
        return ;
    }
    // free() ignores NULL, so members that were never created need no check.
    free ( u->ustr ) ;
    free ( u->u8str ) ;
    free ( u->uwstr ) ;
    free ( u->u8pfstr ) ;
    free ( u ) ;
}
// new_syn -- allocate a struct Syntax holding the four given forms
// (i = isolated, b = beginning, m = medial, e = end) and the join type bt,
// and return a pointer to it.
S new_syn ( wchar_t i, wchar_t b, wchar_t m, wchar_t e, begtype bt )
{
    S entry = xmalloc ( sizeof *entry ) ;

    *entry = (struct S) {
        .iform = i,
        .bform = b,
        .mform = m,
        .eform = e,
        .btype = bt
    } ;
    return entry ;
}
/*
* utility functions below ...
*/
// xmalloc -- malloc() that never returns NULL: when the allocation fails a
// message is written to stderr and the process exits with status 1.
// A request for zero bytes yields a valid pointer that can be passed to free().
void *xmalloc ( size_t n )
{
    void *p ;

    // malloc(0) is allowed to return NULL without being out of memory;
    // ask for one byte so that case is not mistaken for a failure.
    p = malloc ( n != 0 ? n : 1 ) ;
    if ( !p ) { // we should not see many of these
        // %zu is the conversion for size_t; %d here is undefined behaviour
        fprintf ( stderr, "malloc failed for %zu bytes\n", n ) ;
        exit ( 1 ) ;
    }
    return p ;
}
// isvowel -- tells whether the UKY character ch, in either case, stands for
// one of the Uyghur vowels.
bool isvowel ( int ch )
{
    switch ( ch ) {
    case 'a':   case 'A':
    case 'e':   case 'E':
    case PRIMe: case PRIME:
    case 'i':   case 'I':
    case 'o':   case 'O':
    case COLo:  case COLO:
    case 'u':   case 'U':
    case COLu:  case COLU:
        return true ;
    default:
        return false ;
    }
}