summary refs log tree commit diff stats
path: root/tools/encoding/txt2enc.c
diff options
context:
space:
mode:
Diffstat (limited to 'tools/encoding/txt2enc.c')
-rw-r--r-- tools/encoding/txt2enc.c 244
1 files changed, 244 insertions, 0 deletions
diff --git a/tools/encoding/txt2enc.c b/tools/encoding/txt2enc.c
new file mode 100644
index 0000000..73870bf
--- /dev/null
+++ b/tools/encoding/txt2enc.c
@@ -0,0 +1,244 @@
+/*
+ * txt2enc.c --
+ *
+ * Simple program to compile up the encodings tables from the CD that
+ * came with "The Unicode Standard, Version 2.0" into a form that can
+ * be quickly loaded into Tcl.
+ *
+ * Copyright (c) 1997 Sun Microsystems, Inc.
+ *
+ * See the file "license.terms" for information on usage and redistribution
+ * of this file, and for a DISCLAIMER OF ALL WARRANTIES.
+ *
+ * SCCS: @(#) txt2enc.c 1.1 98/01/28 11:42:09
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <string.h>
+#include <unistd.h>
+
+typedef unsigned short Rune;
+
+/*
+ * Usage --
+ *
+ *	Write the command-line synopsis to stderr.
+ *
+ * Results:
+ *	Always 1, so callers can simply "return Usage();" from main().
+ */
+
+static int
+Usage(void)
+{
+    fputs("usage: mkencoding [-e column] [-u column] [-f fallback] [-t type] [-s] [-m] file\n", stderr);
+    fputs(" -e\tcolumn containing characters in encoding (default: 0)\n", stderr);
+    fputs(" -u\tcolumn containing characters in Unicode (default: 1)\n", stderr);
+    fputs(" -f\tfallback character (default: QUESTION MARK)\n", stderr);
+    fputs(" -t\toverride implicit type with single, double, or multi\n", stderr);
+    fputs(" -s\tsymbol+ascii encoding\n", stderr);
+    fputs(" -m\tdon't implicitly include range 0080 to 00FF\n", stderr);
+    return 1;
+}
+
+/*
+ * NewPage --
+ *
+ *	Allocate one zero-filled 256-entry page of the toUnicode table.
+ *
+ * Results:
+ *	Pointer to the new page.  Does not return if memory is exhausted;
+ *	a message is written to stderr and the program exits with status 1.
+ */
+
+static Rune *
+NewPage(void)
+{
+    Rune *page = calloc(256, sizeof *page);
+
+    if (page == NULL) {
+        fputs("out of memory\n", stderr);
+        exit(1);
+    }
+    return page;
+}
+
+/*
+ * main --
+ *
+ *	Read a text mapping table (one mapping per line, hexadecimal columns,
+ *	'#' starts a comment) and print it to stdout as an encoding file:
+ *	a header comment, a type letter (S, D or M), the fallback character,
+ *	the symbol flag, the number of pages used, and then each used page as
+ *	256 four-digit hexadecimal values, 16 per line.
+ *
+ * Results:
+ *	0 on success; 1 on a usage error or if the input file cannot be
+ *	opened.
+ */
+
+int
+main(int argc, char **argv)
+{
+    FILE *fp;
+    Rune *toUnicode[256];
+    int i, multiByte, enc, uni, hi, lo, fixmissing, used, maxEnc;
+    int ch, encColumn, uniColumn, fallbackKnown, width;
+    int type, symbol;
+    const char *fallbackString;
+    char *str, *rest, *dot;
+    unsigned int fallbackChar;
+    Rune rune;
+    char buf[256];
+    extern char *optarg;
+    extern int optind, opterr;
+    static const char *typeString[] = {"single", "double", "multi"};
+
+    encColumn = 0;
+    uniColumn = 1;
+    fallbackString = "QUESTION MARK";
+    fallbackChar = '\0';
+    fallbackKnown = 0;
+    type = -1;			/* -1: infer the type from the data. */
+    symbol = 0;
+    fixmissing = 1;
+
+    opterr = 0;
+    while ((ch = getopt(argc, argv, "e:u:f:t:sm")) != -1) {
+        switch (ch) {
+        case 'e':
+            encColumn = atoi(optarg);
+            break;
+
+        case 'u':
+            uniColumn = atoi(optarg);
+            break;
+
+        case 'f':
+            /*
+             * The argument is a single literal character, a hexadecimal
+             * code, or (if it does not parse as hex) a name to look for
+             * in the table's comments.
+             */
+            fallbackKnown = 1;
+            if (optarg[0] != '\0' && optarg[1] == '\0') {
+                fallbackChar = (unsigned char) optarg[0];
+            } else {
+                fallbackChar = (unsigned int) strtol(optarg, &rest, 16);
+                if (*rest != '\0') {
+                    fallbackChar = '\0';
+                    fallbackKnown = 0;
+                    fallbackString = optarg;
+                }
+            }
+            /* Must not fall through: 't' would reject this optarg. */
+            break;
+
+        case 't':
+            if (strcmp(optarg, "single") == 0) {
+                type = 0;
+            } else if (strcmp(optarg, "double") == 0) {
+                type = 1;
+            } else if (strcmp(optarg, "multi") == 0) {
+                type = 2;
+            } else {
+                return Usage();
+            }
+            break;
+
+        case 's':
+            symbol = 1;
+            break;
+
+        case 'm':
+            fixmissing = 0;
+            break;
+
+        default:
+            return Usage();
+        }
+    }
+
+    /* Exactly one non-option argument (the input file) is required. */
+    if (optind != argc - 1) {
+        return Usage();
+    }
+
+    fp = fopen(argv[argc - 1], "r");
+    if (fp == NULL) {
+        perror(argv[argc - 1]);
+        return 1;
+    }
+
+    for (i = 0; i < 256; i++) {
+        toUnicode[i] = NULL;
+    }
+
+    maxEnc = 0;
+    width = 0;
+    multiByte = 0;
+    while (fgets(buf, sizeof(buf), fp) != NULL) {
+        str = buf;
+        enc = -1;
+        uni = -1;
+        while (isspace((unsigned char) *str)) {
+            str++;
+        }
+        if (str[0] == '#') {
+            continue;
+        }
+
+        /* Walk the columns of this line; i is the column index. */
+        for (i = 0; *str != '\0'; i++) {
+            if (*str == '#') {
+                /*
+                 * Trailing comment.  While the fallback is still unknown,
+                 * compare the comment text with the fallback name.  Only
+                 * do so once an encoding value has been read on this
+                 * line, so that -1 is never stored as the fallback.
+                 */
+                if (fallbackKnown == 0 && enc >= 0) {
+                    str++;
+                    while (isspace((unsigned char) *str)) {
+                        str++;
+                    }
+                    /* Drop the line terminator, if any (LF or CRLF). */
+                    str[strcspn(str, "\r\n")] = '\0';
+                    if (strcmp(str, fallbackString) == 0) {
+                        /* Exact match: stop looking. */
+                        fallbackChar = enc;
+                        fallbackKnown = 1;
+                    } else if (strstr(str, fallbackString) != NULL) {
+                        /* Partial match: keep it, but keep looking. */
+                        fallbackChar = enc;
+                    }
+                }
+                break;
+            } else {
+                rune = (Rune) strtol(str, &rest, 16);
+                if (rest == str) {
+                    /* Not a hex number: step over one character. */
+                    rest++;
+                } else if (i == uniColumn) {
+                    uni = rune;
+                } else if (i == encColumn) {
+                    enc = rune;
+                    /*
+                     * Encoding values written with differing numbers of
+                     * characters mark the table as multi-byte.
+                     */
+                    if ((width != 0) && (width != (int) (rest - str))) {
+                        multiByte = 1;
+                    }
+                    width = (int) (rest - str);
+                    if (enc > maxEnc) {
+                        maxEnc = enc;
+                    }
+                }
+            }
+            while (isspace((unsigned char) *rest)) {
+                rest++;
+            }
+            str = rest;
+        }
+        if (enc < 32 || uni < 32) {
+            continue;
+        }
+
+        hi = enc >> 8;
+        lo = enc & 0xff;
+        if (hi > 255) {
+            /*
+             * Cannot happen where Rune (unsigned short) is 16 bits wide;
+             * guards the toUnicode index where it is wider.
+             */
+            continue;
+        }
+        if (toUnicode[hi] == NULL) {
+            toUnicode[hi] = NewPage();
+        }
+        toUnicode[hi][lo] = uni;
+    }
+
+    fclose(fp);
+
+    /* Strip the file extension; the name is only used in the header. */
+    dot = strrchr(argv[argc - 1], '.');
+    if (dot != NULL) {
+        *dot = '\0';
+    }
+    if (type == -1) {
+        if (multiByte) {
+            type = 2;
+        } else if (maxEnc > 255) {
+            type = 1;
+        } else {
+            type = 0;
+        }
+    }
+    if (type != 1) {
+        if (toUnicode[0] == NULL) {
+            toUnicode[0] = NewPage();
+        }
+        /* Control characters 00-1F always map to themselves. */
+        for (i = 0; i < 0x20; i++) {
+            toUnicode[0][i] = i;
+        }
+        if (fixmissing) {
+            /*
+             * Fill unmapped entries in 7F-9F with the identity mapping,
+             * skipping any byte that has a page of its own.
+             * NOTE(review): a non-NULL toUnicode[i] presumably marks i as
+             * a lead byte of a multi-byte sequence -- confirm.
+             */
+            for (i = 0x7F; i < 0xa0; i++) {
+                if (toUnicode[i] == NULL && toUnicode[0][i] == 0) {
+                    toUnicode[0][i] = i;
+                }
+            }
+        }
+    }
+
+    printf("# Encoding file: %s, %s-byte\n", argv[argc - 1], typeString[type]);
+
+    if (fallbackChar == '\0') {
+        fallbackChar = '?';
+    }
+    used = 0;
+    for (hi = 0; hi < 256; hi++) {
+        if (toUnicode[hi] != NULL) {
+            used++;
+        }
+    }
+    printf("%c\n%04X %d %d\n", "SDM"[type], fallbackChar, symbol, used);
+
+    for (hi = 0; hi < 256; hi++) {
+        if (toUnicode[hi] == NULL) {
+            continue;
+        }
+        printf("%02X\n", (unsigned int) hi);
+        for (lo = 0; lo < 256; lo++) {
+            printf("%04X", (unsigned int) toUnicode[hi][lo]);
+            if ((lo & 0x0f) == 0x0f) {
+                putchar('\n');
+            }
+        }
+        free(toUnicode[hi]);
+        toUnicode[hi] = NULL;
+    }
+    return 0;
+}