"SfR Fresh" - the SfR Freeware/Shareware Archive

Member "less-424/charset.c" of archive less-424.tar.gz:


As a special service, "SfR Fresh" has tried to format the requested source page as HTML, using (guessed) C and C++ source code syntax highlighting with prefixed line numbers. Alternatively, you can view or download the uninterpreted source code file here. The same can be done for any archive member file by clicking, within an archive contents listing, on the first character of the file (path) or on the corresponding byte size field.
    1 /*
    2  * Copyright (C) 1984-2008  Mark Nudelman
    3  *
    4  * You may distribute under the terms of either the GNU General Public
    5  * License or the Less License, as specified in the README file.
    6  *
    7  * For more information about less, or for information on how to
    8  * contact the author, see the README file.
    9  */
   10 
   11 
   12 /*
   13  * Functions to define the character set
   14  * and do things specific to the character set.
   15  */
   16 
   17 #include "less.h"
   18 #if HAVE_LOCALE
   19 #include <locale.h>
   20 #include <ctype.h>
   21 #include <langinfo.h>
   22 #endif
   23 
   24 #include "charset.h"
   25 
   26 public int utf_mode = 0;	/* set to 1 by icharset() when the "utf-8" charset entry is selected (via charsets[].p_flag) */
   27 
   28 /*
   29  * Predefined character sets,
   30  * selected by the LESSCHARSET environment variable.
   31  */
   32 struct charset {
   33 	char *name;	/* charset name, compared with strcmp() in icharset() */
   34 	int *p_flag;	/* if non-NULL, *p_flag is set to 1 when this charset is selected */
   35 	char *desc;	/* chardef string in the format parsed by ichardef() */
   36 } charsets[] = {
   37 	{ "ascii",		NULL,       "8bcccbcc18b95.b" },
   38 	{ "utf-8",		&utf_mode,  "8bcccbcc18b95.b126.bb" },
   39 	{ "iso8859",		NULL,       "8bcccbcc18b95.33b." },
   40 	{ "latin3",		NULL,       "8bcccbcc18b95.33b5.b8.b15.b4.b12.b18.b12.b." },
   41 	{ "arabic",		NULL,       "8bcccbcc18b95.33b.3b.7b2.13b.3b.b26.5b19.b" },
   42 	{ "greek",		NULL,       "8bcccbcc18b95.33b4.2b4.b3.b35.b44.b" },
   43 	{ "greek2005",		NULL,       "8bcccbcc18b95.33b14.b35.b44.b" },
   44 	{ "hebrew",		NULL,       "8bcccbcc18b95.33b.b29.32b28.2b2.b" },
   45 	{ "koi8-r",		NULL,       "8bcccbcc18b95.b." },
   46 	{ "KOI8-T",		NULL,       "8bcccbcc18b95.b8.b6.b8.b.b.5b7.3b4.b4.b3.b.b.3b." },
   47 	{ "georgianps",		NULL,       "8bcccbcc18b95.3b11.4b12.2b." },
   48 	{ "tcvn",		NULL,       "b..b...bcccbccbbb7.8b95.b48.5b." },
   49 	{ "TIS-620",		NULL,       "8bcccbcc18b95.b.4b.11b7.8b." },
   50 	{ "next",		NULL,       "8bcccbcc18b95.bb125.bb" },
   51 	{ "dos",		NULL,       "8bcccbcc12bc5b95.b." },
   52 	{ "windows-1251",	NULL,       "8bcccbcc12bc5b95.b24.b." },
   53 	{ "windows-1252",	NULL,       "8bcccbcc12bc5b95.b.b11.b.2b12.b." },
   54 	{ "windows-1255",	NULL,       "8bcccbcc12bc5b95.b.b8.b.5b9.b.4b." },
   55 	{ "ebcdic",		NULL,       "5bc6bcc7bcc41b.9b7.9b5.b..8b6.10b6.b9.7b9.8b8.17b3.3b9.7b9.8b8.6b10.b.b.b." },
   56 	{ "IBM-1047",		NULL,       "4cbcbc3b9cbccbccbb4c6bcc5b3cbbc4bc4bccbc191.b" },
   57 	{ NULL, NULL, NULL }	/* end-of-table sentinel: icharset() stops at name == NULL */
   58 };
   59 
   60 /*
   61  * Support "locale charmap"/nl_langinfo(CODESET) values, as well as others.
   62  */
   63 struct cs_alias {
   64 	char *name;	/* alias, matched exactly (case-sensitive strcmp) in icharset() */
   65 	char *oname;	/* name that icharset() then looks up in charsets[] */
   66 } cs_aliases[] = {
   67 	{ "UTF-8",		"utf-8" },
   68 	{ "ANSI_X3.4-1968",	"ascii" },
   69 	{ "US-ASCII",		"ascii" },
   70 	{ "latin1",		"iso8859" },
   71 	{ "ISO-8859-1",		"iso8859" },
   72 	{ "latin9",		"iso8859" },
   73 	{ "ISO-8859-15",	"iso8859" },
   74 	{ "latin2",		"iso8859" },
   75 	{ "ISO-8859-2",		"iso8859" },
   76 	{ "ISO-8859-3",		"latin3" },
   77 	{ "latin4",		"iso8859" },
   78 	{ "ISO-8859-4",		"iso8859" },
   79 	{ "cyrillic",		"iso8859" },
   80 	{ "ISO-8859-5",		"iso8859" },
   81 	{ "ISO-8859-6",		"arabic" },
   82 	{ "ISO-8859-7",		"greek" },
   83 	{ "IBM9005",		"greek2005" },
   84 	{ "ISO-8859-8",		"hebrew" },
   85 	{ "latin5",		"iso8859" },
   86 	{ "ISO-8859-9",		"iso8859" },
   87 	{ "latin6",		"iso8859" },
   88 	{ "ISO-8859-10",	"iso8859" },
   89 	{ "latin7",		"iso8859" },
   90 	{ "ISO-8859-13",	"iso8859" },
   91 	{ "latin8",		"iso8859" },
   92 	{ "ISO-8859-14",	"iso8859" },
   93 	{ "latin10",		"iso8859" },
   94 	{ "ISO-8859-16",	"iso8859" },
   95 	{ "IBM437",		"dos" },
   96 	{ "EBCDIC-US",		"ebcdic" },
   97 	{ "IBM1047",		"IBM-1047" },
   98 	{ "KOI8-R",		"koi8-r" },
   99 	{ "KOI8-U",		"koi8-r" },
  100 	{ "GEORGIAN-PS",	"georgianps" },
  101 	{ "TCVN5712-1", 	"tcvn" },
  102 	{ "NEXTSTEP",		"next" },
  103 	{ "windows",		"windows-1252" }, /* backward compatibility */
  104 	{ "CP1251",		"windows-1251" },
  105 	{ "CP1252",		"windows-1252" },
  106 	{ "CP1255",		"windows-1255" },
  107 	{ NULL, NULL }	/* end-of-table sentinel: icharset() stops at name == NULL */
  108 };
  109 
  110 #define	IS_BINARY_CHAR	01	/* chardef[] bit tested by binary_char() */
  111 #define	IS_CONTROL_CHAR	02	/* chardef[] bit tested by control_char(); also set for every binary char */
  112 
  113 static char chardef[256];	/* per-byte flag table, indexed by character value; filled by ichardef() or ilocale() */
  114 static char *binfmt = NULL;	/* printf-style format for binary/control bytes; set by setbinfmt() from LESSBINFMT */
  115 static char *utfbinfmt = NULL;	/* printf-style format for binary UTF-8 chars; set by setbinfmt() from LESSUTFBINFMT */
  116 public int binattr = AT_STANDOUT;	/* attribute selected by a leading "*x" in a binary format; see setbinfmt() */
  117 
  118 
  119 /*
  120  * Define a charset, given a description string.
  121  * The string consists of 256 letters,
  122  * one for each character in the charset.
  123  * If the string is shorter than 256 letters, missing letters
  124  * are taken to be identical to the last one.
  125  * A decimal number followed by a letter is taken to be a
  126  * repetition of the letter.
  127  *
  128  * Each letter is one of:
  129  *	. normal character
  130  *	b binary character
  131  *	c control character
  132  */
  133 	static void
  134 ichardef(s)
  135 	char *s;
  136 {
  137 	register char *cp;
  138 	register int n;
  139 	register char v;
  140 
  141 	n = 0;
  142 	v = 0;
  143 	cp = chardef;
  144 	while (*s != '\0')
  145 	{
  146 		switch (*s++)
  147 		{
  148 		case '.':
  149 			v = 0;
  150 			break;
  151 		case 'c':
  152 			v = IS_CONTROL_CHAR;
  153 			break;
  154 		case 'b':
  155 			v = IS_BINARY_CHAR|IS_CONTROL_CHAR;
  156 			break;
  157 
  158 		case '0': case '1': case '2': case '3': case '4':
  159 		case '5': case '6': case '7': case '8': case '9':
  160 			n = (10 * n) + (s[-1] - '0');
  161 			continue;
  162 
  163 		default:
  164 			error("invalid chardef", NULL_PARG);
  165 			quit(QUIT_ERROR);
  166 			/*NOTREACHED*/
  167 		}
  168 
  169 		do
  170 		{
  171 			if (cp >= chardef + sizeof(chardef))
  172 			{
  173 				error("chardef longer than 256", NULL_PARG);
  174 				quit(QUIT_ERROR);
  175 				/*NOTREACHED*/
  176 			}
  177 			*cp++ = v;
  178 		} while (--n > 0);
  179 		n = 0;
  180 	}
  181 
  182 	while (cp < chardef + sizeof(chardef))
  183 		*cp++ = v;
  184 }
  185 
  186 /*
  187  * Define a charset, given a charset name.
  188  * The valid charset names are listed in the "charsets" array.
  189  */
  190 	static int
  191 icharset(name, no_error)
  192 	register char *name;
  193 	int no_error;
  194 {
  195 	register struct charset *p;
  196 	register struct cs_alias *a;
  197 
  198 	if (name == NULL || *name == '\0')
  199 		return (0);
  200 
  201 	/* First see if the name is an alias. */
  202 	for (a = cs_aliases;  a->name != NULL;  a++)
  203 	{
  204 		if (strcmp(name, a->name) == 0)
  205 		{
  206 			name = a->oname;
  207 			break;
  208 		}
  209 	}
  210 
  211 	for (p = charsets;  p->name != NULL;  p++)
  212 	{
  213 		if (strcmp(name, p->name) == 0)
  214 		{
  215 			ichardef(p->desc);
  216 			if (p->p_flag != NULL)
  217 				*(p->p_flag) = 1;
  218 			return (1);
  219 		}
  220 	}
  221 
  222 	if (!no_error) {
  223 		error("invalid charset name", NULL_PARG);
  224 		quit(QUIT_ERROR);
  225 	}
  226 	return (0);
  227 }
  228 
  229 #if HAVE_LOCALE
  230 /*
  231  * Define a charset, given a locale name.
  232  */
  233 	static void
  234 ilocale()
  235 {
  236 	register int c;
  237 
  238 	for (c = 0;  c < (int) sizeof(chardef);  c++)
  239 	{
  240 		if (isprint(c))
  241 			chardef[c] = 0;
  242 		else if (iscntrl(c))
  243 			chardef[c] = IS_CONTROL_CHAR;
  244 		else
  245 			chardef[c] = IS_BINARY_CHAR|IS_CONTROL_CHAR;
  246 	}
  247 }
  248 #endif
  249 
  250 /*
  251  * Define the printing format for control (or binary utf) chars.
  252  */
  253    	static void
  254 setbinfmt(s, fmtvarptr, default_fmt)
  255 	char *s;
  256 	char **fmtvarptr;
  257 	char *default_fmt;
  258 {
  259 	if (s && utf_mode)
  260 	{
  261 		/* It would be too hard to account for width otherwise.  */
  262 		char *t = s;
  263 		while (*t)
  264 		{
  265 			if (*t < ' ' || *t > '~')
  266 			{
  267 				s = default_fmt;
  268 				goto attr;
  269 			}
  270 			t++;
  271 		}
  272 	}
  273 
  274 	/* %n is evil */
  275 	if (s == NULL || *s == '\0' ||
  276 	    (*s == '*' && (s[1] == '\0' || s[2] == '\0' || strchr(s + 2, 'n'))) ||
  277 	    (*s != '*' && strchr(s, 'n')))
  278 		s = default_fmt;
  279 
  280 	/*
  281 	 * Select the attributes if it starts with "*".
  282 	 */
  283  attr:
  284 	if (*s == '*')
  285 	{
  286 		switch (s[1])
  287 		{
  288 		case 'd':  binattr = AT_BOLD;      break;
  289 		case 'k':  binattr = AT_BLINK;     break;
  290 		case 's':  binattr = AT_STANDOUT;  break;
  291 		case 'u':  binattr = AT_UNDERLINE; break;
  292 		default:   binattr = AT_NORMAL;    break;
  293 		}
  294 		s += 2;
  295 	}
  296 	*fmtvarptr = s;
  297 }
  298 
  299 /*
  300  *
  301  */
  302 	static void
  303 set_charset()
  304 {
  305 	char *s;
  306 
  307 	/*
  308 	 * See if environment variable LESSCHARSET is defined.
  309 	 */
  310 	s = lgetenv("LESSCHARSET");
  311 	if (icharset(s, 0))
  312 		return;
  313 
  314 	/*
  315 	 * LESSCHARSET is not defined: try LESSCHARDEF.
  316 	 */
  317 	s = lgetenv("LESSCHARDEF");
  318 	if (s != NULL && *s != '\0')
  319 	{
  320 		ichardef(s);
  321 		return;
  322 	}
  323 
  324 #if HAVE_LOCALE
  325 #ifdef CODESET
  326 	/*
  327 	 * Try using the codeset name as the charset name.
  328 	 */
  329 	s = nl_langinfo(CODESET);
  330 	if (icharset(s, 1))
  331 		return;
  332 #endif
  333 #endif
  334 
  335 #if HAVE_STRSTR
  336 	/*
  337 	 * Check whether LC_ALL, LC_CTYPE or LANG look like UTF-8 is used.
  338 	 */
  339 	if ((s = lgetenv("LC_ALL")) != NULL ||
  340 	    (s = lgetenv("LC_CTYPE")) != NULL ||
  341 	    (s = lgetenv("LANG")) != NULL)
  342 	{
  343 		if (   strstr(s, "UTF-8") != NULL || strstr(s, "utf-8") != NULL
  344 		    || strstr(s, "UTF8")  != NULL || strstr(s, "utf8")  != NULL)
  345 			if (icharset("utf-8", 1))
  346 				return;
  347 	}
  348 #endif
  349 
  350 #if HAVE_LOCALE
  351 	/*
  352 	 * Get character definitions from locale functions,
  353 	 * rather than from predefined charset entry.
  354 	 */
  355 	ilocale();
  356 #if MSDOS_COMPILER
  357 	/*
  358 	 * Default to "dos".
  359 	 */
  360 	(void) icharset("dos", 1);
  361 #else
  362 	/*
  363 	 * Default to "latin1".
  364 	 */
  365 	(void) icharset("latin1", 1);
  366 #endif
  367 #endif
  368 }
  369 
  370 /*
  371  * Initialize charset data structures.
  372  */
  373 	public void
  374 init_charset()
  375 {
  376 	char *s;
  377 
  378 #if HAVE_LOCALE
  379 	setlocale(LC_ALL, "");
  380 #endif
  381 
  382 	set_charset();
  383 
  384 	s = lgetenv("LESSBINFMT");
  385 	setbinfmt(s, &binfmt, "*s<%02X>");
  386 
  387 	s = lgetenv("LESSUTFBINFMT");
  388 	setbinfmt(s, &utfbinfmt, "<U+%04lX>");
  389 }
  390 
  391 /*
  392  * Is a given character a "binary" character?
  393  */
  394 	public int
  395 binary_char(c)
  396 	unsigned char c;
  397 {
  398 	c &= 0377;
  399 	return (chardef[c] & IS_BINARY_CHAR);
  400 }
  401 
  402 /*
  403  * Is a given character a "control" character?
  404  */
  405 	public int
  406 control_char(c)
  407 	int c;
  408 {
  409 	c &= 0377;
  410 	return (chardef[c] & IS_CONTROL_CHAR);
  411 }
  412 
  413 /*
  414  * Return the printable form of a character.
  415  * For example, in the "ascii" charset '\3' is printed as "^C".
  416  */
  417 	public char *
  418 prchar(c)
  419 	int c;
  420 {
  421 	/* {{ This buffer can be overrun if LESSBINFMT is a long string. }} */
  422 	static char buf[32];
  423 
  424 	c &= 0377;
  425 	if ((c < 128 || !utf_mode) && !control_char(c))
  426 		SNPRINTF1(buf, sizeof(buf), "%c", c);
  427 	else if (c == ESC)
  428 		strcpy(buf, "ESC");
  429 #if IS_EBCDIC_HOST
  430 	else if (!binary_char(c) && c < 64)
  431 		SNPRINTF1(buf, sizeof(buf), "^%c",
  432 		/*
  433 		 * This array roughly inverts CONTROL() #defined in less.h,
  434 	 	 * and should be kept in sync with CONTROL() and IBM-1047.
  435  	 	 */
  436 		"@ABC.I.?...KLMNO"
  437 		"PQRS.JH.XY.."
  438 		"\\]^_"
  439 		"......W[.....EFG"
  440 		"..V....D....TU.Z"[c]);
  441 #else
  442   	else if (c < 128 && !control_char(c ^ 0100))
  443   		SNPRINTF1(buf, sizeof(buf), "^%c", c ^ 0100);
  444 #endif
  445 	else
  446 		SNPRINTF1(buf, sizeof(buf), binfmt, c);
  447 	return (buf);
  448 }
  449 
  450 /*
  451  * Return the printable form of a UTF-8 character.
  452  */
  453 	public char *
  454 prutfchar(ch)
  455 	LWCHAR ch;
  456 {
  457 	static char buf[32];
  458 
  459 	if (ch == ESC)
  460 		strcpy(buf, "ESC");
  461   	else if (ch < 128 && control_char(ch))
  462 	{
  463 		if (!control_char(ch ^ 0100))
  464 			SNPRINTF1(buf, sizeof(buf), "^%c", ((char) ch) ^ 0100);
  465 		else
  466 			SNPRINTF1(buf, sizeof(buf), binfmt, (char) ch);
  467 	} else if (is_ubin_char(ch))
  468 		SNPRINTF1(buf, sizeof(buf), utfbinfmt, ch);
  469 	else
  470 	{
  471 		int len;
  472 		if (ch >= 0x80000000)
  473 		{
  474 			len = 3;
  475 			ch = 0xFFFD;
  476 		} else
  477 		{
  478 			len =   (ch < 0x80) ? 1
  479 			      : (ch < 0x800) ? 2
  480 			      : (ch < 0x10000) ? 3
  481 			      : (ch < 0x200000) ? 4
  482 			      : (ch < 0x4000000) ? 5
  483 			      : 6;
  484 		}
  485 		buf[len] = '\0';
  486 		if (len == 1)
  487 			*buf = (char) ch;
  488 		else
  489 		{
  490 			*buf = ((1 << len) - 1) << (8 - len);
  491 			while (--len > 0)
  492 			{
  493 				buf[len] = (char) (0x80 | (ch & 0x3F));
  494 				ch >>= 6;
  495 			}
  496 			*buf |= ch;
  497 		}
  498 	}
  499 	return (buf);
  500 }
  501 
  502 /*
  503  * Get the length of a UTF-8 character in bytes.
  504  */
  505 	public int
  506 utf_len(ch)
  507 	char ch;
  508 {
  509 	if ((ch & 0x80) == 0)
  510 		return 1;
  511 	if ((ch & 0xE0) == 0xC0)
  512 		return 2;
  513 	if ((ch & 0xF0) == 0xE0)
  514 		return 3;
  515 	if ((ch & 0xF8) == 0xF0)
  516 		return 4;
  517 	if ((ch & 0xFC) == 0xF8)
  518 		return 5;
  519 	if ((ch & 0xFE) == 0xFC)
  520 		return 6;
  521 	/* Invalid UTF-8 encoding. */
  522 	return 1;
  523 }
  524 
  525 /*
  526  * Is a UTF-8 character well-formed?
  527  */
  528 	public int
  529 is_utf8_well_formed(s)
  530 	unsigned char *s;
  531 {
  532 	int i;
  533 	int len;
  534 
  535 	if (IS_UTF8_INVALID(s[0]))
  536 		return (0);
  537 
  538 	len = utf_len((char) s[0]);
  539 	if (len == 1)
  540 		return (1);
  541 	if (len == 2)
  542 	{
  543 		if (s[0] < 0xC2)
  544 		    return (0);
  545 	} else
  546 	{
  547 		unsigned char mask;
  548 		mask = (~((1 << (8-len)) - 1)) & 0xFF;
  549 		if (s[0] == mask && (s[1] & mask) == 0x80)
  550 			return (0);
  551 	}
  552 
  553 	for (i = 1;  i < len;  i++)
  554 		if (!IS_UTF8_TRAIL(s[i]))
  555 			return (0);
  556 	return (1);
  557 }
  558 
  559 /*
  560  * Get the value of a UTF-8 character.
  561  */
  562 	public LWCHAR
  563 get_wchar(p)
  564 	char *p;
  565 {
  566 	switch (utf_len(p[0]))
  567 	{
  568 	case 1:
  569 	default:
  570 		/* 0xxxxxxx */
  571 		return (LWCHAR)
  572 			(p[0] & 0xFF);
  573 	case 2:
  574 		/* 110xxxxx 10xxxxxx */
  575 		return (LWCHAR) (
  576 			((p[0] & 0x1F) << 6) |
  577 			(p[1] & 0x3F));
  578 	case 3:
  579 		/* 1110xxxx 10xxxxxx 10xxxxxx */
  580 		return (LWCHAR) (
  581 			((p[0] & 0x0F) << 12) |
  582 			((p[1] & 0x3F) << 6) |
  583 			(p[2] & 0x3F));
  584 	case 4:
  585 		/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
  586 		return (LWCHAR) (
  587 			((p[0] & 0x07) << 18) |
  588 			((p[1] & 0x3F) << 12) |
  589 			((p[2] & 0x3F) << 6) |
  590 			(p[3] & 0x3F));
  591 	case 5:
  592 		/* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
  593 		return (LWCHAR) (
  594 			((p[0] & 0x03) << 24) |
  595 			((p[1] & 0x3F) << 18) |
  596 			((p[2] & 0x3F) << 12) |
  597 			((p[3] & 0x3F) << 6) |
  598 			(p[4] & 0x3F));
  599 	case 6:
  600 		/* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
  601 		return (LWCHAR) (
  602 			((p[0] & 0x01) << 30) |
  603 			((p[1] & 0x3F) << 24) |
  604 			((p[2] & 0x3F) << 18) |
  605 			((p[3] & 0x3F) << 12) |
  606 			((p[4] & 0x3F) << 6) |
  607 			(p[5] & 0x3F));
  608 	}
  609 }
  610 
  611 /*
  612  * Store a character into a UTF-8 string.
  613  */
  614 	public void
  615 put_wchar(pp, ch)
  616 	char **pp;
  617 	LWCHAR ch;
  618 {
  619 	if (!utf_mode || ch < 0x80)
  620 	{
  621 		/* 0xxxxxxx */
  622 		*(*pp)++ = (char) ch;
  623 	} else if (ch < 0x800)
  624 	{
  625 		/* 110xxxxx 10xxxxxx */
  626 		*(*pp)++ = (char) (0xC0 | ((ch >> 6) & 0x1F));
  627 		*(*pp)++ = (char) (0x80 | (ch & 0x3F));
  628 	} else if (ch < 0x10000)
  629 	{
  630 		/* 1110xxxx 10xxxxxx 10xxxxxx */
  631 		*(*pp)++ = (char) (0xE0 | ((ch >> 12) & 0x0F));
  632 		*(*pp)++ = (char) (0x80 | ((ch >> 6) & 0x3F));
  633 		*(*pp)++ = (char) (0x80 | (ch & 0x3F));
  634 	} else if (ch < 0x200000)
  635 	{
  636 		/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
  637 		*(*pp)++ = (char) (0xF0 | ((ch >> 18) & 0x07));
  638 		*(*pp)++ = (char) (0x80 | ((ch >> 12) & 0x3F));
  639 		*(*pp)++ = (char) (0x80 | ((ch >> 6) & 0x3F));
  640 		*(*pp)++ = (char) (0x80 | (ch & 0x3F));
  641 	} else if (ch < 0x4000000)
  642 	{
  643 		/* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
  644 		*(*pp)++ = (char) (0xF0 | ((ch >> 24) & 0x03));
  645 		*(*pp)++ = (char) (0x80 | ((ch >> 18) & 0x3F));
  646 		*(*pp)++ = (char) (0x80 | ((ch >> 12) & 0x3F));
  647 		*(*pp)++ = (char) (0x80 | ((ch >> 6) & 0x3F));
  648 		*(*pp)++ = (char) (0x80 | (ch & 0x3F));
  649 	} else
  650 	{
  651 		/* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
  652 		*(*pp)++ = (char) (0xF0 | ((ch >> 30) & 0x01));
  653 		*(*pp)++ = (char) (0x80 | ((ch >> 24) & 0x3F));
  654 		*(*pp)++ = (char) (0x80 | ((ch >> 18) & 0x3F));
  655 		*(*pp)++ = (char) (0x80 | ((ch >> 12) & 0x3F));
  656 		*(*pp)++ = (char) (0x80 | ((ch >> 6) & 0x3F));
  657 		*(*pp)++ = (char) (0x80 | (ch & 0x3F));
  658 	}
  659 }
  660 
  661 /*
  662  * Step forward or backward one character in a string.
  663  */
  664 	public LWCHAR
  665 step_char(pp, dir, limit)
  666 	char **pp;
  667 	signed int dir;
  668 	char *limit;
  669 {
  670 	LWCHAR ch;
  671 	int len;
  672 	char *p = *pp;
  673 
  674 	if (!utf_mode)
  675 	{
  676 		/* It's easy if chars are one byte. */
  677 		if (dir > 0)
  678 			ch = (LWCHAR) ((p < limit) ? *p++ : 0);
  679 		else
  680 			ch = (LWCHAR) ((p > limit) ? *--p : 0);
  681 	} else if (dir > 0)
  682 	{
  683 		len = utf_len(*p);
  684 		if (p + len > limit)
  685 		{
  686 			ch = 0;
  687 			p = limit;
  688 		} else
  689 		{
  690 			ch = get_wchar(p);
  691 			p += len;
  692 		}
  693 	} else
  694 	{
  695 		while (p > limit && IS_UTF8_TRAIL(p[-1]))
  696 			p--;
  697 		if (p > limit)
  698 			ch = get_wchar(--p);
  699 		else
  700 			ch = 0;
  701 	}
  702 	*pp = p;
  703 	return ch;
  704 }
  705 
  706 /*
  707  * Unicode characters data
  708  */
  709 struct wchar_range { LWCHAR first, last; };	/* one range of code points; presumably inclusive at both ends (tables contain first == last entries) -- confirm against the lookup code */
  710 
  711 /*
  712  * Characters with general category values
  713  *	Mn: Mark, Nonspacing
  714  *	Me: Mark, Enclosing
  715  * Last synched with
  716  *	<http://www.unicode.org/Public/5.0.0/ucd/UnicodeData-5.0.0d7.txt>
  717  *	dated 2005-11-30T00:58:48Z
  718  */
  719 static struct wchar_range comp_table[] = {
  720 	{  0x0300,  0x036F} /* Mn */, {  0x0483,  0x0486} /* Mn */,
  721 	{  0x0488,  0x0489} /* Me */,
  722 	{  0x0591,  0x05BD} /* Mn */, {  0x05BF,  0x05BF} /* Mn */,
  723 	{  0x05C1,  0x05C2} /* Mn */, {  0x05C4,  0x05C5} /* Mn */,
  724 	{  0x05C7,  0x05C7} /* Mn */, {  0x0610,  0x0615} /* Mn */,
  725 	{  0x064B,  0x065E} /* Mn */, {  0x0670,  0x0670} /* Mn */,
  726 	{  0x06D6,  0x06DC} /* Mn */,
  727 	{  0x06DE,  0x06DE} /* Me */,
  728 	{  0x06DF,  0x06E4} /* Mn */, {  0x06E7,  0x06E8} /* Mn */,
  729 	{  0x06EA,  0x06ED} /* Mn */, {  0x0711,  0x0711} /* Mn */,
  730 	{  0x0730,  0x074A} /* Mn */, {  0x07A6,  0x07B0} /* Mn */,
  731 	{  0x07EB,  0x07F3} /* Mn */, {  0x0901,  0x0902} /* Mn */,
  732 	{  0x093C,  0x093C} /* Mn */, {  0x0941,  0x0948} /* Mn */,
  733 	{  0x094D,  0x094D} /* Mn */, {  0x0951,  0x0954} /* Mn */,
  734 	{  0x0962,  0x0963} /* Mn */, {  0x0981,  0x0981} /* Mn */,
  735 	{  0x09BC,  0x09BC} /* Mn */, {  0x09C1,  0x09C4} /* Mn */,
  736 	{  0x09CD,  0x09CD} /* Mn */, {  0x09E2,  0x09E3} /* Mn */,
  737 	{  0x0A01,  0x0A02} /* Mn */, {  0x0A3C,  0x0A3C} /* Mn */,
  738 	{  0x0A41,  0x0A42} /* Mn */, {  0x0A47,  0x0A48} /* Mn */,
  739 	{  0x0A4B,  0x0A4D} /* Mn */, {  0x0A70,  0x0A71} /* Mn */,
  740 	{  0x0A81,  0x0A82} /* Mn */, {  0x0ABC,  0x0ABC} /* Mn */,
  741 	{  0x0AC1,  0x0AC5} /* Mn */, {  0x0AC7,  0x0AC8} /* Mn */,
  742 	{  0x0ACD,  0x0ACD} /* Mn */, {  0x0AE2,  0x0AE3} /* Mn */,
  743 	{  0x0B01,  0x0B01} /* Mn */, {  0x0B3C,  0x0B3C} /* Mn */,
  744 	{  0x0B3F,  0x0B3F} /* Mn */, {  0x0B41,  0x0B43} /* Mn */,
  745 	{  0x0B4D,  0x0B4D} /* Mn */, {  0x0B56,  0x0B56} /* Mn */,
  746 	{  0x0B82,  0x0B82} /* Mn */, {  0x0BC0,  0x0BC0} /* Mn */,
  747 	{  0x0BCD,  0x0BCD} /* Mn */, {  0x0C3E,  0x0C40} /* Mn */,
  748 	{  0x0C46,  0x0C48} /* Mn */, {  0x0C4A,  0x0C4D} /* Mn */,
  749 	{  0x0C55,  0x0C56} /* Mn */, {  0x0CBC,  0x0CBC} /* Mn */,
  750 	{  0x0CBF,  0x0CBF} /* Mn */, {  0x0CC6,  0x0CC6} /* Mn */,
  751 	{  0x0CCC,  0x0CCD} /* Mn */, {  0x0CE2,  0x0CE3} /* Mn */,
  752 	{  0x0D41,  0x0D43} /* Mn */, {  0x0D4D,  0x0D4D} /* Mn */,
  753 	{  0x0DCA,  0x0DCA} /* Mn */, {  0x0DD2,  0x0DD4} /* Mn */,
  754 	{  0x0DD6,  0x0DD6} /* Mn */, {  0x0E31,  0x0E31} /* Mn */,
  755 	{  0x0E34,  0x0E3A} /* Mn */, {  0x0E47,  0x0E4E} /* Mn */,
  756 	{  0x0EB1,  0x0EB1} /* Mn */, {  0x0EB4,  0x0EB9} /* Mn */,
  757 	{  0x0EBB,  0x0EBC} /* Mn */, {  0x0EC8,  0x0ECD} /* Mn */,
  758 	{  0x0F18,  0x0F19} /* Mn */, {  0x0F35,  0x0F35} /* Mn */,
  759 	{  0x0F37,  0x0F37} /* Mn */, {  0x0F39,  0x0F39} /* Mn */,
  760 	{  0x0F71,  0x0F7E} /* Mn */, {  0x0F80,  0x0F84} /* Mn */,
  761 	{  0x0F86,  0x0F87} /* Mn */, {  0x0F90,  0x0F97} /* Mn */,
  762 	{  0x0F99,  0x0FBC} /* Mn */, {  0x0FC6,  0x0FC6} /* Mn */,
  763 	{  0x102D,  0x1030} /* Mn */, {  0x1032,  0x1032} /* Mn */,
  764 	{  0x1036,  0x1037} /* Mn */, {  0x1039,  0x1039} /* Mn */,
  765 	{  0x1058,  0x1059} /* Mn */, {  0x135F,  0x135F} /* Mn */,
  766 	{  0x1712,  0x1714} /* Mn */, {  0x1732,  0x1734} /* Mn */,
  767 	{  0x1752,  0x1753} /* Mn */, {  0x1772,  0x1773} /* Mn */,
  768 	{  0x17B7,  0x17BD} /* Mn */, {  0x17C6,  0x17C6} /* Mn */,
  769 	{  0x17C9,  0x17D3} /* Mn */, {  0x17DD,  0x17DD} /* Mn */,
  770 	{  0x180B,  0x180D} /* Mn */, {  0x18A9,  0x18A9} /* Mn */,
  771 	{  0x1920,  0x1922} /* Mn */, {  0x1927,  0x1928} /* Mn */,
  772 	{  0x1932,  0x1932} /* Mn */, {  0x1939,  0x193B} /* Mn */,
  773 	{  0x1A17,  0x1A18} /* Mn */, {  0x1B00,  0x1B03} /* Mn */