"SfR Fresh" - the SfR Freeware/Shareware Archive 
Member "less-424/charset.c" of archive less-424.tar.gz:
As a special service "SfR Fresh" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting with prefixed line numbers.
Alternatively, you can view or download the uninterpreted source code file here.
The same can be done for any archive member file by clicking, within an archive contents listing, on the first character of the file path or on the corresponding byte-size field.
1 /*
2 * Copyright (C) 1984-2008 Mark Nudelman
3 *
4 * You may distribute under the terms of either the GNU General Public
5 * License or the Less License, as specified in the README file.
6 *
7 * For more information about less, or for information on how to
8 * contact the author, see the README file.
9 */
10
11
12 /*
13 * Functions to define the character set
14 * and do things specific to the character set.
15 */
16
17 #include "less.h"
18 #if HAVE_LOCALE
19 #include <locale.h>
20 #include <ctype.h>
21 #include <langinfo.h>
22 #endif
23
24 #include "charset.h"
25
/*
 * Nonzero when the selected charset is UTF-8.
 * Starts at 0; icharset() stores 1 through the p_flag pointer of the
 * chosen charsets[] entry, and only the "utf-8" entry points here.
 */
26 public int utf_mode = 0;
27
28 /*
29 * Predefined character sets,
30 * selected by the LESSCHARSET environment variable.
31 */
32 struct charset {
33 char *name;
34 int *p_flag;
35 char *desc;
36 } charsets[] = {
37 { "ascii", NULL, "8bcccbcc18b95.b" },
38 { "utf-8", &utf_mode, "8bcccbcc18b95.b126.bb" },
39 { "iso8859", NULL, "8bcccbcc18b95.33b." },
40 { "latin3", NULL, "8bcccbcc18b95.33b5.b8.b15.b4.b12.b18.b12.b." },
41 { "arabic", NULL, "8bcccbcc18b95.33b.3b.7b2.13b.3b.b26.5b19.b" },
42 { "greek", NULL, "8bcccbcc18b95.33b4.2b4.b3.b35.b44.b" },
43 { "greek2005", NULL, "8bcccbcc18b95.33b14.b35.b44.b" },
44 { "hebrew", NULL, "8bcccbcc18b95.33b.b29.32b28.2b2.b" },
45 { "koi8-r", NULL, "8bcccbcc18b95.b." },
46 { "KOI8-T", NULL, "8bcccbcc18b95.b8.b6.b8.b.b.5b7.3b4.b4.b3.b.b.3b." },
47 { "georgianps", NULL, "8bcccbcc18b95.3b11.4b12.2b." },
48 { "tcvn", NULL, "b..b...bcccbccbbb7.8b95.b48.5b." },
49 { "TIS-620", NULL, "8bcccbcc18b95.b.4b.11b7.8b." },
50 { "next", NULL, "8bcccbcc18b95.bb125.bb" },
51 { "dos", NULL, "8bcccbcc12bc5b95.b." },
52 { "windows-1251", NULL, "8bcccbcc12bc5b95.b24.b." },
53 { "windows-1252", NULL, "8bcccbcc12bc5b95.b.b11.b.2b12.b." },
54 { "windows-1255", NULL, "8bcccbcc12bc5b95.b.b8.b.5b9.b.4b." },
55 { "ebcdic", NULL, "5bc6bcc7bcc41b.9b7.9b5.b..8b6.10b6.b9.7b9.8b8.17b3.3b9.7b9.8b8.6b10.b.b.b." },
56 { "IBM-1047", NULL, "4cbcbc3b9cbccbccbb4c6bcc5b3cbbc4bc4bccbc191.b" },
57 { NULL, NULL, NULL }
58 };
59
/*
 * Alternate spellings of charset names, e.g. the values reported by
 * "locale charmap" / nl_langinfo(CODESET).
 *
 *   name   the alias as the user or the locale spells it
 *   oname  the name of the charsets[] entry it stands for
 *
 * The table ends with an all-NULL entry.
 */
struct cs_alias
{
	char *name;
	char *oname;
};

struct cs_alias cs_aliases[] =
{
	{ "UTF-8",		"utf-8" },
	{ "ANSI_X3.4-1968",	"ascii" },
	{ "US-ASCII",		"ascii" },
	{ "latin1",		"iso8859" },
	{ "ISO-8859-1",		"iso8859" },
	{ "latin9",		"iso8859" },
	{ "ISO-8859-15",	"iso8859" },
	{ "latin2",		"iso8859" },
	{ "ISO-8859-2",		"iso8859" },
	{ "ISO-8859-3",		"latin3" },
	{ "latin4",		"iso8859" },
	{ "ISO-8859-4",		"iso8859" },
	{ "cyrillic",		"iso8859" },
	{ "ISO-8859-5",		"iso8859" },
	{ "ISO-8859-6",		"arabic" },
	{ "ISO-8859-7",		"greek" },
	{ "IBM9005",		"greek2005" },
	{ "ISO-8859-8",		"hebrew" },
	{ "latin5",		"iso8859" },
	{ "ISO-8859-9",		"iso8859" },
	{ "latin6",		"iso8859" },
	{ "ISO-8859-10",	"iso8859" },
	{ "latin7",		"iso8859" },
	{ "ISO-8859-13",	"iso8859" },
	{ "latin8",		"iso8859" },
	{ "ISO-8859-14",	"iso8859" },
	{ "latin10",		"iso8859" },
	{ "ISO-8859-16",	"iso8859" },
	{ "IBM437",		"dos" },
	{ "EBCDIC-US",		"ebcdic" },
	{ "IBM1047",		"IBM-1047" },
	{ "KOI8-R",		"koi8-r" },
	{ "KOI8-U",		"koi8-r" },
	{ "GEORGIAN-PS",	"georgianps" },
	{ "TCVN5712-1",		"tcvn" },
	{ "NEXTSTEP",		"next" },
	{ "windows",		"windows-1252" }, /* backward compatibility */
	{ "CP1251",		"windows-1251" },
	{ "CP1252",		"windows-1252" },
	{ "CP1255",		"windows-1255" },
	{ NULL,			NULL }
};
109
/*
 * Classification bits kept per byte value in chardef[].
 * ichardef() and ilocale() give binary characters both bits,
 * control characters only IS_CONTROL_CHAR, and normal characters 0.
 */
110 #define IS_BINARY_CHAR 01
111 #define IS_CONTROL_CHAR 02
112
/* Classification of each of the 256 byte values; filled in by ichardef() or ilocale(). */
113 static char chardef[256];
/* printf-style format used by prchar()/prutfchar() for bytes shown as binary; set by setbinfmt(). */
114 static char *binfmt = NULL;
/* printf-style format used by prutfchar() for binary UTF-8 characters; set by setbinfmt(). */
115 static char *utfbinfmt = NULL;
/* Display attribute; setbinfmt() replaces it when the chosen format starts with "*". */
116 public int binattr = AT_STANDOUT;
117
118
119 /*
120 * Define a charset, given a description string.
121 * The string consists of 256 letters,
122 * one for each character in the charset.
123 * If the string is shorter than 256 letters, missing letters
124 * are taken to be identical to the last one.
125 * A decimal number followed by a letter is taken to be a
126 * repetition of the letter.
127 *
128 * Each letter is one of:
129 * . normal character
130 * b binary character
131 * c control character
132 */
133 static void
134 ichardef(s)
135 char *s;
136 {
137 register char *cp;
138 register int n;
139 register char v;
140
141 n = 0;
142 v = 0;
143 cp = chardef;
144 while (*s != '\0')
145 {
146 switch (*s++)
147 {
148 case '.':
149 v = 0;
150 break;
151 case 'c':
152 v = IS_CONTROL_CHAR;
153 break;
154 case 'b':
155 v = IS_BINARY_CHAR|IS_CONTROL_CHAR;
156 break;
157
158 case '0': case '1': case '2': case '3': case '4':
159 case '5': case '6': case '7': case '8': case '9':
160 n = (10 * n) + (s[-1] - '0');
161 continue;
162
163 default:
164 error("invalid chardef", NULL_PARG);
165 quit(QUIT_ERROR);
166 /*NOTREACHED*/
167 }
168
169 do
170 {
171 if (cp >= chardef + sizeof(chardef))
172 {
173 error("chardef longer than 256", NULL_PARG);
174 quit(QUIT_ERROR);
175 /*NOTREACHED*/
176 }
177 *cp++ = v;
178 } while (--n > 0);
179 n = 0;
180 }
181
182 while (cp < chardef + sizeof(chardef))
183 *cp++ = v;
184 }
185
186 /*
187 * Define a charset, given a charset name.
188 * The valid charset names are listed in the "charsets" array.
189 */
190 static int
191 icharset(name, no_error)
192 register char *name;
193 int no_error;
194 {
195 register struct charset *p;
196 register struct cs_alias *a;
197
198 if (name == NULL || *name == '\0')
199 return (0);
200
201 /* First see if the name is an alias. */
202 for (a = cs_aliases; a->name != NULL; a++)
203 {
204 if (strcmp(name, a->name) == 0)
205 {
206 name = a->oname;
207 break;
208 }
209 }
210
211 for (p = charsets; p->name != NULL; p++)
212 {
213 if (strcmp(name, p->name) == 0)
214 {
215 ichardef(p->desc);
216 if (p->p_flag != NULL)
217 *(p->p_flag) = 1;
218 return (1);
219 }
220 }
221
222 if (!no_error) {
223 error("invalid charset name", NULL_PARG);
224 quit(QUIT_ERROR);
225 }
226 return (0);
227 }
228
229 #if HAVE_LOCALE
230 /*
231 * Define a charset, given a locale name.
232 */
233 static void
234 ilocale()
235 {
236 register int c;
237
238 for (c = 0; c < (int) sizeof(chardef); c++)
239 {
240 if (isprint(c))
241 chardef[c] = 0;
242 else if (iscntrl(c))
243 chardef[c] = IS_CONTROL_CHAR;
244 else
245 chardef[c] = IS_BINARY_CHAR|IS_CONTROL_CHAR;
246 }
247 }
248 #endif
249
250 /*
251 * Define the printing format for control (or binary utf) chars.
252 */
253 static void
254 setbinfmt(s, fmtvarptr, default_fmt)
255 char *s;
256 char **fmtvarptr;
257 char *default_fmt;
258 {
259 if (s && utf_mode)
260 {
261 /* It would be too hard to account for width otherwise. */
262 char *t = s;
263 while (*t)
264 {
265 if (*t < ' ' || *t > '~')
266 {
267 s = default_fmt;
268 goto attr;
269 }
270 t++;
271 }
272 }
273
274 /* %n is evil */
275 if (s == NULL || *s == '\0' ||
276 (*s == '*' && (s[1] == '\0' || s[2] == '\0' || strchr(s + 2, 'n'))) ||
277 (*s != '*' && strchr(s, 'n')))
278 s = default_fmt;
279
280 /*
281 * Select the attributes if it starts with "*".
282 */
283 attr:
284 if (*s == '*')
285 {
286 switch (s[1])
287 {
288 case 'd': binattr = AT_BOLD; break;
289 case 'k': binattr = AT_BLINK; break;
290 case 's': binattr = AT_STANDOUT; break;
291 case 'u': binattr = AT_UNDERLINE; break;
292 default: binattr = AT_NORMAL; break;
293 }
294 s += 2;
295 }
296 *fmtvarptr = s;
297 }
298
/*
 * Choose the character set.  The first source that yields one wins:
 *   1. LESSCHARSET  (a charset name; an unknown name is a fatal error)
 *   2. LESSCHARDEF  (a description string for ichardef())
 *   3. nl_langinfo(CODESET), when locale support and CODESET exist
 *   4. "utf-8", when LC_ALL, LC_CTYPE or LANG (the first one that is
 *      set) mentions UTF-8
 *   5. the locale's <ctype.h> classification when locale support is
 *      compiled in; otherwise the predefined "dos" (MSDOS_COMPILER)
 *      or "latin1" charset.
 */
static void
set_charset()
{
	char *s;

	/*
	 * See if environment variable LESSCHARSET is defined.
	 */
	s = lgetenv("LESSCHARSET");
	if (icharset(s, 0))
		return;

	/*
	 * LESSCHARSET is not defined: try LESSCHARDEF.
	 */
	s = lgetenv("LESSCHARDEF");
	if (s != NULL && *s != '\0')
	{
		ichardef(s);
		return;
	}

#if HAVE_LOCALE
#ifdef CODESET
	/*
	 * Try using the codeset name as the charset name.
	 */
	s = nl_langinfo(CODESET);
	if (icharset(s, 1))
		return;
#endif
#endif

#if HAVE_STRSTR
	/*
	 * Check whether LC_ALL, LC_CTYPE or LANG look like UTF-8 is used.
	 */
	if ((s = lgetenv("LC_ALL")) != NULL ||
	    (s = lgetenv("LC_CTYPE")) != NULL ||
	    (s = lgetenv("LANG")) != NULL)
	{
		if (   strstr(s, "UTF-8") != NULL || strstr(s, "utf-8") != NULL
		    || strstr(s, "UTF8")  != NULL || strstr(s, "utf8")  != NULL)
		{
			if (icharset("utf-8", 1))
				return;
		}
	}
#endif

#if HAVE_LOCALE
	/*
	 * Get character definitions from locale functions,
	 * rather than from predefined charset entry.
	 */
	ilocale();
#else
	/*
	 * No locale support: fall back to a predefined charset.
	 * These defaults must not also run after ilocale(), because
	 * icharset() would overwrite the table ilocale() just built.
	 */
#if MSDOS_COMPILER
	/*
	 * Default to "dos".
	 */
	(void) icharset("dos", 1);
#else
	/*
	 * Default to "latin1".
	 */
	(void) icharset("latin1", 1);
#endif
#endif
}
369
370 /*
371 * Initialize charset data structures.
372 */
373 public void
374 init_charset()
375 {
376 char *s;
377
378 #if HAVE_LOCALE
379 setlocale(LC_ALL, "");
380 #endif
381
382 set_charset();
383
384 s = lgetenv("LESSBINFMT");
385 setbinfmt(s, &binfmt, "*s<%02X>");
386
387 s = lgetenv("LESSUTFBINFMT");
388 setbinfmt(s, &utfbinfmt, "<U+%04lX>");
389 }
390
391 /*
392 * Is a given character a "binary" character?
393 */
394 public int
395 binary_char(c)
396 unsigned char c;
397 {
398 c &= 0377;
399 return (chardef[c] & IS_BINARY_CHAR);
400 }
401
402 /*
403 * Is a given character a "control" character?
404 */
405 public int
406 control_char(c)
407 int c;
408 {
409 c &= 0377;
410 return (chardef[c] & IS_CONTROL_CHAR);
411 }
412
413 /*
414 * Return the printable form of a character.
415 * For example, in the "ascii" charset '\3' is printed as "^C".
416 */
417 public char *
418 prchar(c)
419 int c;
420 {
421 /* {{ This buffer can be overrun if LESSBINFMT is a long string. }} */
422 static char buf[32];
423
424 c &= 0377;
425 if ((c < 128 || !utf_mode) && !control_char(c))
426 SNPRINTF1(buf, sizeof(buf), "%c", c);
427 else if (c == ESC)
428 strcpy(buf, "ESC");
429 #if IS_EBCDIC_HOST
430 else if (!binary_char(c) && c < 64)
431 SNPRINTF1(buf, sizeof(buf), "^%c",
432 /*
433 * This array roughly inverts CONTROL() #defined in less.h,
434 * and should be kept in sync with CONTROL() and IBM-1047.
435 */
436 "@ABC.I.?...KLMNO"
437 "PQRS.JH.XY.."
438 "\\]^_"
439 "......W[.....EFG"
440 "..V....D....TU.Z"[c]);
441 #else
442 else if (c < 128 && !control_char(c ^ 0100))
443 SNPRINTF1(buf, sizeof(buf), "^%c", c ^ 0100);
444 #endif
445 else
446 SNPRINTF1(buf, sizeof(buf), binfmt, c);
447 return (buf);
448 }
449
450 /*
451 * Return the printable form of a UTF-8 character.
452 */
453 public char *
454 prutfchar(ch)
455 LWCHAR ch;
456 {
457 static char buf[32];
458
459 if (ch == ESC)
460 strcpy(buf, "ESC");
461 else if (ch < 128 && control_char(ch))
462 {
463 if (!control_char(ch ^ 0100))
464 SNPRINTF1(buf, sizeof(buf), "^%c", ((char) ch) ^ 0100);
465 else
466 SNPRINTF1(buf, sizeof(buf), binfmt, (char) ch);
467 } else if (is_ubin_char(ch))
468 SNPRINTF1(buf, sizeof(buf), utfbinfmt, ch);
469 else
470 {
471 int len;
472 if (ch >= 0x80000000)
473 {
474 len = 3;
475 ch = 0xFFFD;
476 } else
477 {
478 len = (ch < 0x80) ? 1
479 : (ch < 0x800) ? 2
480 : (ch < 0x10000) ? 3
481 : (ch < 0x200000) ? 4
482 : (ch < 0x4000000) ? 5
483 : 6;
484 }
485 buf[len] = '\0';
486 if (len == 1)
487 *buf = (char) ch;
488 else
489 {
490 *buf = ((1 << len) - 1) << (8 - len);
491 while (--len > 0)
492 {
493 buf[len] = (char) (0x80 | (ch & 0x3F));
494 ch >>= 6;
495 }
496 *buf |= ch;
497 }
498 }
499 return (buf);
500 }
501
/*
 * Get the length of a UTF-8 character in bytes,
 * judging only by its first byte.
 * Returns 1 to 6.  A byte that cannot start a multi-byte character
 * (10xxxxxx, 0xFE, 0xFF) counts as a character of length 1.
 */
public int
utf_len(ch)
	char ch;
{
	unsigned int lead = (unsigned char) ch;

	if (lead < 0x80)	/* 0xxxxxxx */
		return 1;
	if (lead < 0xC0)	/* 10xxxxxx: not a lead byte */
		return 1;
	if (lead < 0xE0)	/* 110xxxxx */
		return 2;
	if (lead < 0xF0)	/* 1110xxxx */
		return 3;
	if (lead < 0xF8)	/* 11110xxx */
		return 4;
	if (lead < 0xFC)	/* 111110xx */
		return 5;
	if (lead < 0xFE)	/* 1111110x */
		return 6;
	/* Invalid UTF-8 encoding. */
	return 1;
}
524
525 /*
526 * Is a UTF-8 character well-formed?
527 */
528 public int
529 is_utf8_well_formed(s)
530 unsigned char *s;
531 {
532 int i;
533 int len;
534
535 if (IS_UTF8_INVALID(s[0]))
536 return (0);
537
538 len = utf_len((char) s[0]);
539 if (len == 1)
540 return (1);
541 if (len == 2)
542 {
543 if (s[0] < 0xC2)
544 return (0);
545 } else
546 {
547 unsigned char mask;
548 mask = (~((1 << (8-len)) - 1)) & 0xFF;
549 if (s[0] == mask && (s[1] & mask) == 0x80)
550 return (0);
551 }
552
553 for (i = 1; i < len; i++)
554 if (!IS_UTF8_TRAIL(s[i]))
555 return (0);
556 return (1);
557 }
558
559 /*
560 * Get the value of a UTF-8 character.
561 */
562 public LWCHAR
563 get_wchar(p)
564 char *p;
565 {
566 switch (utf_len(p[0]))
567 {
568 case 1:
569 default:
570 /* 0xxxxxxx */
571 return (LWCHAR)
572 (p[0] & 0xFF);
573 case 2:
574 /* 110xxxxx 10xxxxxx */
575 return (LWCHAR) (
576 ((p[0] & 0x1F) << 6) |
577 (p[1] & 0x3F));
578 case 3:
579 /* 1110xxxx 10xxxxxx 10xxxxxx */
580 return (LWCHAR) (
581 ((p[0] & 0x0F) << 12) |
582 ((p[1] & 0x3F) << 6) |
583 (p[2] & 0x3F));
584 case 4:
585 /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
586 return (LWCHAR) (
587 ((p[0] & 0x07) << 18) |
588 ((p[1] & 0x3F) << 12) |
589 ((p[2] & 0x3F) << 6) |
590 (p[3] & 0x3F));
591 case 5:
592 /* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
593 return (LWCHAR) (
594 ((p[0] & 0x03) << 24) |
595 ((p[1] & 0x3F) << 18) |
596 ((p[2] & 0x3F) << 12) |
597 ((p[3] & 0x3F) << 6) |
598 (p[4] & 0x3F));
599 case 6:
600 /* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
601 return (LWCHAR) (
602 ((p[0] & 0x01) << 30) |
603 ((p[1] & 0x3F) << 24) |
604 ((p[2] & 0x3F) << 18) |
605 ((p[3] & 0x3F) << 12) |
606 ((p[4] & 0x3F) << 6) |
607 (p[5] & 0x3F));
608 }
609 }
610
/*
 * Store a character into a UTF-8 string.
 *
 *   pp  address of the write pointer; it is advanced past the bytes
 *       that were stored
 *   ch  the character to store
 *
 * When utf_mode is off, or ch is below 0x80, a single byte is stored.
 * Otherwise ch is encoded in 2 to 6 bytes.  No terminating NUL is
 * written and no bounds are checked: the caller must provide room
 * for up to 6 bytes.
 */
public void
put_wchar(pp, ch)
	char **pp;
	LWCHAR ch;
{
	if (!utf_mode || ch < 0x80)
	{
		/* 0xxxxxxx */
		*(*pp)++ = (char) ch;
	} else if (ch < 0x800)
	{
		/* 110xxxxx 10xxxxxx */
		*(*pp)++ = (char) (0xC0 | ((ch >> 6) & 0x1F));
		*(*pp)++ = (char) (0x80 | (ch & 0x3F));
	} else if (ch < 0x10000)
	{
		/* 1110xxxx 10xxxxxx 10xxxxxx */
		*(*pp)++ = (char) (0xE0 | ((ch >> 12) & 0x0F));
		*(*pp)++ = (char) (0x80 | ((ch >> 6) & 0x3F));
		*(*pp)++ = (char) (0x80 | (ch & 0x3F));
	} else if (ch < 0x200000)
	{
		/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
		*(*pp)++ = (char) (0xF0 | ((ch >> 18) & 0x07));
		*(*pp)++ = (char) (0x80 | ((ch >> 12) & 0x3F));
		*(*pp)++ = (char) (0x80 | ((ch >> 6) & 0x3F));
		*(*pp)++ = (char) (0x80 | (ch & 0x3F));
	} else if (ch < 0x4000000)
	{
		/*
		 * 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
		 * The lead byte is 0xF8.  (0xF0 is the four-byte lead and
		 * would make the sequence read back as a different,
		 * shorter character.)
		 */
		*(*pp)++ = (char) (0xF8 | ((ch >> 24) & 0x03));
		*(*pp)++ = (char) (0x80 | ((ch >> 18) & 0x3F));
		*(*pp)++ = (char) (0x80 | ((ch >> 12) & 0x3F));
		*(*pp)++ = (char) (0x80 | ((ch >> 6) & 0x3F));
		*(*pp)++ = (char) (0x80 | (ch & 0x3F));
	} else
	{
		/*
		 * 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
		 * The lead byte is 0xFC, for the same reason as above.
		 */
		*(*pp)++ = (char) (0xFC | ((ch >> 30) & 0x01));
		*(*pp)++ = (char) (0x80 | ((ch >> 24) & 0x3F));
		*(*pp)++ = (char) (0x80 | ((ch >> 18) & 0x3F));
		*(*pp)++ = (char) (0x80 | ((ch >> 12) & 0x3F));
		*(*pp)++ = (char) (0x80 | ((ch >> 6) & 0x3F));
		*(*pp)++ = (char) (0x80 | (ch & 0x3F));
	}
}
660
661 /*
662 * Step forward or backward one character in a string.
663 */
664 public LWCHAR
665 step_char(pp, dir, limit)
666 char **pp;
667 signed int dir;
668 char *limit;
669 {
670 LWCHAR ch;
671 int len;
672 char *p = *pp;
673
674 if (!utf_mode)
675 {
676 /* It's easy if chars are one byte. */
677 if (dir > 0)
678 ch = (LWCHAR) ((p < limit) ? *p++ : 0);
679 else
680 ch = (LWCHAR) ((p > limit) ? *--p : 0);
681 } else if (dir > 0)
682 {
683 len = utf_len(*p);
684 if (p + len > limit)
685 {
686 ch = 0;
687 p = limit;
688 } else
689 {
690 ch = get_wchar(p);
691 p += len;
692 }
693 } else
694 {
695 while (p > limit && IS_UTF8_TRAIL(p[-1]))
696 p--;
697 if (p > limit)
698 ch = get_wchar(--p);
699 else
700 ch = 0;
701 }
702 *pp = p;
703 return ch;
704 }
705
706 /*
707 * Unicode characters data
708 */
709 struct wchar_range { LWCHAR first, last; };
710
711 /*
712 * Characters with general category values
713 * Mn: Mark, Nonspacing
714 * Me: Mark, Enclosing
715 * Last synched with
716 * <http://www.unicode.org/Public/5.0.0/ucd/UnicodeData-5.0.0d7.txt>
717 * dated 2005-11-30T00:58:48Z
718 */
719 static struct wchar_range comp_table[] = {
720 { 0x0300, 0x036F} /* Mn */, { 0x0483, 0x0486} /* Mn */,
721 { 0x0488, 0x0489} /* Me */,
722 { 0x0591, 0x05BD} /* Mn */, { 0x05BF, 0x05BF} /* Mn */,
723 { 0x05C1, 0x05C2} /* Mn */, { 0x05C4, 0x05C5} /* Mn */,
724 { 0x05C7, 0x05C7} /* Mn */, { 0x0610, 0x0615} /* Mn */,
725 { 0x064B, 0x065E} /* Mn */, { 0x0670, 0x0670} /* Mn */,
726 { 0x06D6, 0x06DC} /* Mn */,
727 { 0x06DE, 0x06DE} /* Me */,
728 { 0x06DF, 0x06E4} /* Mn */, { 0x06E7, 0x06E8} /* Mn */,
729 { 0x06EA, 0x06ED} /* Mn */, { 0x0711, 0x0711} /* Mn */,
730 { 0x0730, 0x074A} /* Mn */, { 0x07A6, 0x07B0} /* Mn */,
731 { 0x07EB, 0x07F3} /* Mn */, { 0x0901, 0x0902} /* Mn */,
732 { 0x093C, 0x093C} /* Mn */, { 0x0941, 0x0948} /* Mn */,
733 { 0x094D, 0x094D} /* Mn */, { 0x0951, 0x0954} /* Mn */,
734 { 0x0962, 0x0963} /* Mn */, { 0x0981, 0x0981} /* Mn */,
735 { 0x09BC, 0x09BC} /* Mn */, { 0x09C1, 0x09C4} /* Mn */,
736 { 0x09CD, 0x09CD} /* Mn */, { 0x09E2, 0x09E3} /* Mn */,
737 { 0x0A01, 0x0A02} /* Mn */, { 0x0A3C, 0x0A3C} /* Mn */,
738 { 0x0A41, 0x0A42} /* Mn */, { 0x0A47, 0x0A48} /* Mn */,
739 { 0x0A4B, 0x0A4D} /* Mn */, { 0x0A70, 0x0A71} /* Mn */,
740 { 0x0A81, 0x0A82} /* Mn */, { 0x0ABC, 0x0ABC} /* Mn */,
741 { 0x0AC1, 0x0AC5} /* Mn */, { 0x0AC7, 0x0AC8} /* Mn */,
742 { 0x0ACD, 0x0ACD} /* Mn */, { 0x0AE2, 0x0AE3} /* Mn */,
743 { 0x0B01, 0x0B01} /* Mn */, { 0x0B3C, 0x0B3C} /* Mn */,
744 { 0x0B3F, 0x0B3F} /* Mn */, { 0x0B41, 0x0B43} /* Mn */,
745 { 0x0B4D, 0x0B4D} /* Mn */, { 0x0B56, 0x0B56} /* Mn */,
746 { 0x0B82, 0x0B82} /* Mn */, { 0x0BC0, 0x0BC0} /* Mn */,
747 { 0x0BCD, 0x0BCD} /* Mn */, { 0x0C3E, 0x0C40} /* Mn */,
748 { 0x0C46, 0x0C48} /* Mn */, { 0x0C4A, 0x0C4D} /* Mn */,
749 { 0x0C55, 0x0C56} /* Mn */, { 0x0CBC, 0x0CBC} /* Mn */,
750 { 0x0CBF, 0x0CBF} /* Mn */, { 0x0CC6, 0x0CC6} /* Mn */,
751 { 0x0CCC, 0x0CCD} /* Mn */, { 0x0CE2, 0x0CE3} /* Mn */,
752 { 0x0D41, 0x0D43} /* Mn */, { 0x0D4D, 0x0D4D} /* Mn */,
753 { 0x0DCA, 0x0DCA} /* Mn */, { 0x0DD2, 0x0DD4} /* Mn */,
754 { 0x0DD6, 0x0DD6} /* Mn */, { 0x0E31, 0x0E31} /* Mn */,
755 { 0x0E34, 0x0E3A} /* Mn */, { 0x0E47, 0x0E4E} /* Mn */,
756 { 0x0EB1, 0x0EB1} /* Mn */, { 0x0EB4, 0x0EB9} /* Mn */,
757 { 0x0EBB, 0x0EBC} /* Mn */, { 0x0EC8, 0x0ECD} /* Mn */,
758 { 0x0F18, 0x0F19} /* Mn */, { 0x0F35, 0x0F35} /* Mn */,
759 { 0x0F37, 0x0F37} /* Mn */, { 0x0F39, 0x0F39} /* Mn */,
760 { 0x0F71, 0x0F7E} /* Mn */, { 0x0F80, 0x0F84} /* Mn */,
761 { 0x0F86, 0x0F87} /* Mn */, { 0x0F90, 0x0F97} /* Mn */,
762 { 0x0F99, 0x0FBC} /* Mn */, { 0x0FC6, 0x0FC6} /* Mn */,
763 { 0x102D, 0x1030} /* Mn */, { 0x1032, 0x1032} /* Mn */,
764 { 0x1036, 0x1037} /* Mn */, { 0x1039, 0x1039} /* Mn */,
765 { 0x1058, 0x1059} /* Mn */, { 0x135F, 0x135F} /* Mn */,
766 { 0x1712, 0x1714} /* Mn */, { 0x1732, 0x1734} /* Mn */,
767 { 0x1752, 0x1753} /* Mn */, { 0x1772, 0x1773} /* Mn */,
768 { 0x17B7, 0x17BD} /* Mn */, { 0x17C6, 0x17C6} /* Mn */,
769 { 0x17C9, 0x17D3} /* Mn */, { 0x17DD, 0x17DD} /* Mn */,
770 { 0x180B, 0x180D} /* Mn */, { 0x18A9, 0x18A9} /* Mn */,
771 { 0x1920, 0x1922} /* Mn */, { 0x1927, 0x1928} /* Mn */,
772 { 0x1932, 0x1932} /* Mn */, { 0x1939, 0x193B} /* Mn */,
773 { 0x1A17, 0x1A18} /* Mn */, { 0x1B00, 0x1B03} /* Mn */