/*
 * txt2enc.c --
 *
 *	Simple program to compile up the encodings tables from the CD that
 *	came with "The Unicode Standard, Version 2.0" into a form that can
 *	be quickly loaded into Tcl.
 *
 * Copyright (c) 1997 Sun Microsystems, Inc.
 *
 * See the file "license.terms" for information on usage and redistribution
 * of this file, and for a DISCLAIMER OF ALL WARRANTIES.
 *
 * SCCS: @(#) txt2enc.c 1.1 98/01/28 11:42:09
 */

#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
#include <unistd.h>

typedef unsigned short Rune;

int
main(int argc, char **argv)
{
    FILE *fp;
    Rune *toUnicode[256];
    int i, multiByte, enc, uni, hi, lo, fixmissing, used, maxEnc;
    int ch, encColumn, uniColumn, fallbackKnown, width;
    char *fallbackString, *str, *rest, *dot;
    unsigned int magic, type, symbol, fallbackChar;
    Rune rune;
    char buf[256];
    extern char *optarg;
    extern int optind, opterr;
    static char *typeString[] = {"single", "double", "multi"};

    encColumn = 0;
    uniColumn = 1;
    fallbackString = "QUESTION MARK";
    fallbackChar = '\0';
    fallbackKnown = 0;
    type = -1;
    symbol = 0;
    fixmissing = 1;

    opterr = 0;
    while (1) {
	ch = getopt(argc, argv, "e:u:f:t:sm");
	if (ch == -1) {
	    break;
	}
	switch (ch) {
	case 'e':
	    encColumn = atoi(optarg);
	    break;

	case 'u':
	    uniColumn = atoi(optarg);
	    break;

	case 'f':
	    fallbackKnown = 1;
	    if (optarg[1] == '\0') {
		fallbackChar = optarg[0];
	    } else  {
		fallbackChar = strtol(optarg, &rest, 16);
		if (*rest != '\0') {
		    fallbackChar = '\0';
		    fallbackKnown = 0;
		    fallbackString = optarg;
		}
	    }

	case 't':
	    if (strcmp(optarg, "single") == 0) {
		type = 0;
	    } else if (strcmp(optarg, "double") == 0) {
		type = 1;
	    } else if (strcmp(optarg, "multi") == 0) {
		type = 2;
	    } else {
		goto usage;
	    }
	    break;

	case 's':
	    symbol = 1;
	    break;

	case 'm':
	    fixmissing = 0;
	    break;
	    
	default:
	    goto usage;
	}
    }

    if ((optind < argc - 1) || (optind >= argc)) {
	usage:
	fputs("usage: mkencoding [-e column] [-u column] [-f fallback] [-t type] [-s] [-m] file\n", stderr);
	fputs("    -e\tcolumn containing characters in encoding (default: 0)\n", stderr);
	fputs("    -u\tcolumn containing characters in Unicode (default: 1)\n", stderr);
	fputs("    -f\tfallback character (default: QUESTION MARK)\n", stderr);
	fputs("    -t\toverride implicit type with single, double, or multi\n", stderr);
	fputs("    -s\tsymbol+ascii encoding\n", stderr);
	fputs("    -m\tdon't implicitly include range 0080 to 00FF\n", stderr);
	return 1;
    }

    fp = fopen(argv[argc - 1], "r");
    if (fp == NULL) {
        perror(argv[argc - 1]);
	return 1;
    }

    for (i = 0; i < 256; i++) {
        toUnicode[i] = NULL;
    }

    maxEnc = 0;
    width = 0;
    multiByte = 0;
    while (fgets(buf, sizeof(buf), fp) != NULL) {
	str = buf;
	enc = -1;
	uni = -1;
	while (isspace(*str)) {
	    str++;
	}
	if (str[0] == '#') {
	    continue;
	}
	for (i = 0; *str != '\0'; i++) {
	    if (*str == '#') {
		if (fallbackKnown == 0) {
		    str++;
		    while (isspace(*str)) {
			str++;
		    }
		    str[strlen(str) - 1] = '\0';
		    if (strcmp(str, fallbackString) == 0) {
			fallbackChar = enc;
			fallbackKnown = 1;
		    } else if (strstr(str, fallbackString) != NULL) {
			fallbackChar = enc;
		    }
		}
		break;
	    } else {
		rune = strtol(str, &rest, 16);
		if (rest == str) {
		    rest++;
		} else if (i == uniColumn) {
		    uni = rune;
		} else if (i == encColumn) {
		    enc = rune;
		    if ((width != 0) && (width != rest - str)) {
			multiByte = 1;
		    }
		    width = rest - str;
		    if (enc > maxEnc) {
			maxEnc = enc;
		    }
		}
	    }
	    while (isspace(*rest)) {
		rest++;
	    }
	    str = rest;
	}
	if (enc < 32 || uni < 32) {
	    continue;
	}
	
	hi = enc >> 8;
	lo = enc & 0xff;
	if (toUnicode[hi] == NULL) {
	    toUnicode[hi] = (Rune *) malloc(256 * sizeof(Rune));
	    memset(toUnicode[hi], 0, 256 * sizeof(Rune));
	}
	toUnicode[hi][lo] = uni;
    }
	
    fclose(fp);

    dot = strrchr(argv[argc - 1], '.');
    if (dot != NULL) {
	*dot = '\0';
    }
    if (type == -1) {
	if (multiByte) {
	    type = 2;
	} else if (maxEnc > 255) {
	    type = 1;
	} else {
	    type = 0;
	}
    }
    if (type != 1) {
	if (toUnicode[0] == NULL) {
	    toUnicode[0] = (Rune *) malloc(256 * sizeof(Rune));
	    memset(toUnicode[0], 0, 256 * sizeof(Rune));
	}
	for (i = 0; i < 0x20; i++) {
	    toUnicode[0][i] = i;
	}
	if (fixmissing) {
	    for (i = 0x7F; i < 0xa0; i++) {
		if (toUnicode[i] == NULL && toUnicode[0][i] == 0) {
		    toUnicode[0][i] = i;
		}
	    }
	}
    }

    printf("# Encoding file: %s, %s-byte\n", argv[argc - 1], typeString[type]);

    if (fallbackChar == '\0') {
	fallbackChar = '?';
    }
    used = 0;
    for (hi = 0; hi < 256; hi++) {
	if (toUnicode[hi] != NULL) {
	    used++;
	}
    }
    printf("%c\n%04X %d %d\n", "SDM"[type], fallbackChar, symbol, used);
    
    for (hi = 0; hi < 256; hi++) {
	if (toUnicode[hi] != NULL) {
	    printf("%02X\n", hi);
	    for (lo = 0; lo < 256; lo++) {
		printf("%04X", toUnicode[hi][lo]);
		if ((lo & 0x0f) == 0x0f) {
		    putchar('\n');
		}
	    }
	}
    }
    return 0;
}