/* join - join lines of two files on a common field
   Copyright (C) 1991, 1995 Free Software Foundation, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

   Written by Mike Haertel, mike@gnu.ai.mit.edu. */

#include <config.h>

/* Get isblank from GNU libc.  */
#define _GNU_SOURCE

#include <xos.h>
#include <xfile.h>
#include <sys/types.h>
#include "system.h"
#include "version.h"

#include <table.h>

int	table = 1; 
File ofile;

char *xmalloc ();
char *xrealloc ();
static void usage ();

#define min(A, B) ((A) < (B) ? (A) : (B))
#define max(A, B) ((A) > (B) ? (A) : (B))

/* An element of the list describing the format of each
   output line. */
struct outlist
  {
    int file;			/* File to take field from (1 or 2). */
    int field;			/* Field number to print. */
    struct outlist *next;
  };

/* A field of a line. */
struct field
  {
    char *beg;			/* First character in field. */
    size_t len;			/* The length of the field. */
  };

/* A line read from an input file.  Newlines are not stored. */
struct line
  {
    char *beg;			/* First character in line. */
    char *lim;			/* Character after last character in line. */
    int nfields;		/* Number of elements in `fields'. */
    int nfields_allocated;	/* Number of elements in `fields'. */
    struct field *fields;
  };

/* One or more consecutive lines read from a file that all have the
   same join field value. */
struct seq
  {
    int count;			/* Elements used in `lines'. */
    int alloc;			/* Elements allocated in `lines'. */
    struct line *lines;
  };

/* The name this program was run with. */
char *program_name;

/* If nonzero, print unpairable lines in file 1 or 2. */
static int print_unpairables_1, print_unpairables_2;

/* If nonzero, print pairable lines. */
static int print_pairables;

/* Empty output field filler. */
static char *empty_filler = NULL;

/* Field to join on. */
static int join_field_1, join_field_2;

/* List of fields to print. */
static struct outlist *outlist;

/* Last element in `outlist', where a new element can be added. */
static struct outlist *outlist_end;

/* Tab character separating fields; if this is NUL fields are separated
   by any nonempty string of white space, otherwise by exactly one
   tab character. */
static char tab = '\t';

/* Used to print non-joining lines */
static struct line uni_blank;
static struct line blank1;
static struct line blank2;

static void
ADD_FIELD (line, field, len)
     struct line *line;
     char *field;
     size_t len;
{
  if (line->nfields >= line->nfields_allocated)
    {
      line->nfields_allocated = (3 * line->nfields_allocated) / 2 + 1;
      line->fields = (struct field *) xrealloc ((char *) line->fields,
						(line->nfields_allocated
						 * sizeof (struct field)));
    }
  line->fields[line->nfields].beg = field;
  line->fields[line->nfields].len = len;
  ++(line->nfields);
}

/* Fill in the `fields' structure in LINE. */

static void
xfields (line)
     struct line *line;
{
  int i;
  register char *ptr, *lim;

  ptr = line->beg;
  lim = line->lim;

  for (i = 0; ptr < lim; ++i)
    {
      if (tab)
	{
	  char *beg, *end;

	  if ( table ) while ( ptr < lim && *ptr == ' ' ) ptr++;
	  beg = ptr;
	  while (ptr < lim && *ptr != tab)
	    ++ptr;
	  if ( table ) {
		end = ptr;
		while (ptr > beg && *(ptr - 1) == ' ' ) ptr--;
		ADD_FIELD (line, beg, ptr - beg);
		ptr = end;
	  } else
		ADD_FIELD (line, beg, ptr - beg);

          if (ptr < lim)
            ++ptr;
	}
      else
	{
	  char *beg;

	  beg = ptr;
	  while (ptr < lim && !ISSPACE (*ptr))
	    ++ptr;
	  ADD_FIELD (line, beg, ptr - beg);
	  while (ptr < lim && ISSPACE (*ptr))
	    ++ptr;
	}
    }

  if ( !table ) {
      if (ptr > line->beg && ((tab && ISSPACE (ptr[-1])) || ptr[-1] == tab))
	{
	  /* Add one more (empty) field because the last character of the
	     line was a delimiter.  */
	  ADD_FIELD (line, NULL, 0);
	}
  } else {
      if ( ptr > line->beg && ptr[-1] == tab ) {
	  ADD_FIELD (line, NULL, 0);
      }
  }
}

/* Read a line from FP into LINE and split it into fields.
   Return 0 if EOF, 1 otherwise. */

int GetCh(fp) 
	FILE	*fp;
{
	int	 c;

	while ( (c = getc(fp)) == '\r' ) ;
	return c;
}

static int
get_line (fp, line)
     FILE *fp;
     struct line *line;
{
  static int linesize = 80;
  int c, i;
  char *ptr;

  if (feof (fp))
    return 0;

  ptr = xmalloc (linesize);

  for (i = 0; (c = GetCh (fp)) != EOF && c != '\n'; ++i)
    {
      if (i == linesize)
	{
	  linesize *= 2;
	  ptr = xrealloc (ptr, linesize);
	}
      ptr[i] = c;
    }

  if (c == EOF && i == 0)
    {
      free (ptr);
      return 0;
    }

  line->beg = ptr;
  line->lim = line->beg + i;
  line->nfields_allocated = 0;
  line->nfields = 0;
  line->fields = NULL;
  xfields (line);
  return 1;
}

static void
freeline (line)
     struct line *line;
{
  free ((char *) line->fields);
  free (line->beg);
  line->beg = NULL;
}

static void
initseq (seq)
     struct seq *seq;
{
  seq->count = 0;
  seq->alloc = 1;
  seq->lines = (struct line *) xmalloc (seq->alloc * sizeof (struct line));
}

/* Read a line from FP and add it to SEQ.  Return 0 if EOF, 1 otherwise. */

static int
getseq (fp, seq)
     FILE *fp;
     struct seq *seq;
{
  if (seq->count == seq->alloc)
    {
      seq->alloc *= 2;
      seq->lines = (struct line *)
	xrealloc ((char *) seq->lines, seq->alloc * sizeof (struct line));
    }

  if (get_line (fp, &seq->lines[seq->count]))
    {
      ++seq->count;
      return 1;
    }
  return 0;
}

static void
delseq (seq)
     struct seq *seq;
{
  int i;
  for (i = 0; i < seq->count; i++)
    if (seq->lines[i].beg)
      freeline (&seq->lines[i]);
  free ((char *) seq->lines);
}

/* Return <0 if the join field in LINE1 compares less than the one in LINE2;
   >0 if it compares greater; 0 if it compares equal. */

static   int Numeric = 0;
static   int Reverse = 0;

static int
keycmp (line1, line2)
     struct line *line1;
     struct line *line2;
{
  char *beg1, *beg2;		/* Start of field to compare in each file. */
  int len1, len2;		/* Length of fields to compare. */
  int diff;

  if (join_field_1 < line1->nfields)
    {
      beg1 = line1->fields[join_field_1].beg;
      len1 = line1->fields[join_field_1].len;
    }
  else
    {
      beg1 = NULL;
      len1 = 0;
    }

  if (join_field_2 < line2->nfields)
    {
      beg2 = line2->fields[join_field_2].beg;
      len2 = line2->fields[join_field_2].len;
    }
  else
    {
      beg2 = NULL;
      len2 = 0;
    }

  if (len1 == 0)
    return len2 == 0 ? 0 : -1;
  if (len2 == 0)
    return 1;

#ifdef SAOMOD
 if ( Numeric ) {
	char chr1 = *(beg1 + len1);
	char chr2 = *(beg2 + len2);

	*(beg1+len1) = '\0';
	*(beg2+len2) = '\0';

	  if ( *beg1 != '+' && *beg1 != '-' && !ISDIGIT(*beg1) )

	    if ( *beg2 != '+' && *beg2 != '-' && !ISDIGIT(*beg2) )
		diff = memcmp (beg1, beg2, min(len1, len2));
	    else
		diff =  1;
	  else
	  if ( *beg2 != '+' && *beg2 != '-' && !ISDIGIT(*beg2) )

	    diff = -1;
	  else
 	  if ( Numeric == 1 ) {
		double	f1 = strtod(beg1, 0);
		double	f2 = strtod(beg2, 0);


             diff = f1 < f2 ? -1 : (f1 == f2 ? 0 : 1);
	  } else {
		double	f1 = SAOstrtod(beg1, 0);
		double	f2 = SAOstrtod(beg2, 0);

     	    diff = f1 < f2 ? -1 : (f1 == f2 ? 0 : 1);
	  }

	*(beg1+len1) = chr1;
	*(beg2+len2) = chr2;

 } else
#endif
  diff = memcmp (beg1, beg2, min (len1, len2));

  if (diff)
#ifndef SAOMOD
	return diff;
#else
        return Reverse ? -diff : diff;

  if ( Numeric ) return diff;
#endif

  return len1 - len2;
}

/* Print field N of LINE if it exists and is nonempty, otherwise
   `empty_filler' if it is nonempty. */

static void
prfield (n, line)
     int n;
     struct line *line;
{
  int len;

  if (n < line->nfields)
    {
      len = line->fields[n].len;
      if (len)
	fwrite (line->fields[n].beg, 1, len, ofile);
      else if (empty_filler)
	fputs (empty_filler, ofile);
    }
  else if (empty_filler)
    fputs (empty_filler, ofile);
}

/* Print the join of LINE1 and LINE2. */

static void
prjoin (line1, line2)
     struct line *line1;
     struct line *line2;
{
  if (outlist)
    {
      struct outlist *o;

      prfield (outlist->field - 1, outlist->file == 1 ? line1 : line2);
      for (o = outlist->next; o; o = o->next)
	{
	  putchar (tab ? tab : ' ');
	  prfield (o->field - 1, o->file == 1 ? line1 : line2);
	}
      putchar ('\n');
    }
  else
    {
      int i;

      if ( line1 == &blank1 )
		prfield (join_field_2, line2);
      else
		prfield (join_field_1, line1);

      for (i = 0; i < join_field_1 && i < line1->nfields; ++i)
	{
	  putchar (tab ? tab : ' ');
	  prfield (i, line1);
	}
      for (i = join_field_1 + 1; i < line1->nfields; ++i)
	{
	  putchar (tab ? tab : ' ');
	  prfield (i, line1);
	}

      for (i = 0; i < join_field_2 && i < line2->nfields; ++i)
	{
	  putchar (tab ? tab : ' ');
	  prfield (i, line2);
	}
      for (i = join_field_2 + 1; i < line2->nfields; ++i)
	{
	  putchar (tab ? tab : ' ');
	  prfield (i, line2);
	}
      putchar ('\n');
    }
}

/* Print the join of the files in FP1 and FP2. */

static void
join (fp1, fp2)
     FILE *fp1;
     FILE *fp2;
{
  struct seq seq1, seq2;
  struct line line;
  int diff, i, j, eof1, eof2;

  /* Read the first line of each file. */
  initseq (&seq1);
  getseq (fp1, &seq1);
  initseq (&seq2);
  getseq (fp2, &seq2);

  while (seq1.count && seq2.count)
    {
      diff = keycmp (&seq1.lines[0], &seq2.lines[0]);
      if (diff < 0)
	{
	  if (print_unpairables_1)
	    prjoin (&seq1.lines[0], &blank2);
	  freeline (&seq1.lines[0]);
	  seq1.count = 0;
	  getseq (fp1, &seq1);
	  continue;
	}
      if (diff > 0)
	{
	  if (print_unpairables_2)
	    prjoin (&blank1, &seq2.lines[0]);
	  freeline (&seq2.lines[0]);
	  seq2.count = 0;
	  getseq (fp2, &seq2);
	  continue;
	}

      /* Keep reading lines from file1 as long as they continue to
         match the current line from file2. */
      eof1 = 0;
      do
	if (!getseq (fp1, &seq1))
	  {
	    eof1 = 1;
	    ++seq1.count;
	    break;
	  }
      while (!keycmp (&seq1.lines[seq1.count - 1], &seq2.lines[0]));

      /* Keep reading lines from file2 as long as they continue to
         match the current line from file1. */
      eof2 = 0;
      do
	if (!getseq (fp2, &seq2))
	  {
	    eof2 = 1;
	    ++seq2.count;
	    break;
	  }
      while (!keycmp (&seq1.lines[0], &seq2.lines[seq2.count - 1]));

      if (print_pairables)
	{
	  for (i = 0; i < seq1.count - 1; ++i)
	    for (j = 0; j < seq2.count - 1; ++j)
	      prjoin (&seq1.lines[i], &seq2.lines[j]);
	}

      for (i = 0; i < seq1.count - 1; ++i)
	freeline (&seq1.lines[i]);
      if (!eof1)
	{
	  seq1.lines[0] = seq1.lines[seq1.count - 1];
	  seq1.count = 1;
	}
      else
	seq1.count = 0;

      for (i = 0; i < seq2.count - 1; ++i)
	freeline (&seq2.lines[i]);
      if (!eof2)
	{
	  seq2.lines[0] = seq2.lines[seq2.count - 1];
	  seq2.count = 1;
	}
      else
	seq2.count = 0;
    }

  if (print_unpairables_1 && seq1.count)
    {
      prjoin (&seq1.lines[0], &blank2);
      freeline (&seq1.lines[0]);
      while (get_line (fp1, &line))
	{
	  prjoin (&line, &blank2);
	  freeline (&line);
	}
    }

  if (print_unpairables_2 && seq2.count)
    {
      prjoin (&blank1, &seq2.lines[0]);
      freeline (&seq2.lines[0]);
      while (get_line (fp2, &line))
	{
	  prjoin (&blank1, &line);
	  freeline (&line);
	}
    }

  delseq (&seq1);
  delseq (&seq2);
}

/* Add a field spec for field FIELD of file FILE to `outlist' and return 1,
   unless either argument is invalid; then just return 0. */

static int
add_field (file, field)
     int file;
     int field;
{
  struct outlist *o;

  if (file < 1 || file > 2 || field < 1)
    return 0;
  o = (struct outlist *) xmalloc (sizeof (struct outlist));
  o->file = file;
  o->field = field;
  o->next = NULL;

  /* Add to the end of the list so the fields are in the right order. */
  if (outlist == NULL)
    outlist = o;
  else
    outlist_end->next = o;
  outlist_end = o;

  return 1;
}

/* Add the comma or blank separated field spec(s) in STR to `outlist'.
   Return the number of fields added. */

static int
add_field_list (str)
     char *str;
{
  int added = 0;
  int file = -1, field = -1;
  int dot_found = 0;

  for (; *str; str++)
    {
      if (*str == ',' || ISBLANK (*str))
	{
	  added += add_field (file, field);
	  uni_blank.nfields = max (uni_blank.nfields, field);
	  file = field = -1;
	  dot_found = 0;
	}
      else if (*str == '.')
	dot_found = 1;
      else if (ISDIGIT (*str))
	{
	  if (!dot_found)
	    {
	      if (file == -1)
		file = 0;
	      file = file * 10 + *str - '0';
	    }
	  else
	    {
	      if (field == -1)
		field = 0;
	      field = field * 10 + *str - '0';
	    }
	}
      else
	return 0;
    }

  uni_blank.nfields = max (uni_blank.nfields, field);
  added += add_field (file, field);
  return added;
}

/* Create a blank line with COUNT fields separated by tabs. */

void
make_blank (blank, count)
     struct line *blank;
     int count;
{
  int i;
  blank->nfields = count;
  blank->beg = xmalloc (blank->nfields + 1);
  blank->fields = (struct field *) xmalloc (sizeof (struct field) * count);
  for (i = 0; i < blank->nfields; i++)
    {
      blank->beg[i] = '\t';
      blank->fields[i].beg = &blank->beg[i];
      blank->fields[i].len = 0;
    }
  blank->beg[i] = '\0';
  blank->lim = &blank->beg[i];
}

char *trim();

main (argc, argv)
     int argc;
     char *argv[];
{
  char *names[2];
  FILE *fp1, *fp2;
  int i, nfiles, val;

  TableHead	T1, T2;

  char	*join_column1 = NULL, *join_column2 = NULL;

  program_name = argv[0];
  ofile = Stdout;

  /* Initialize this before parsing options.  In parsing options,
     it may be increased.  */
  uni_blank.nfields = 1;

  nfiles = 0;
  print_pairables = 1;

for ( i = 1; i < argc; i++ ) {
    if ( argv[i][0] == '-' ) {
      switch (argv[i][1]) {
	case 'a':
	  val = atoi(&argv[i][2]);
	  if (val == 1)
	    print_unpairables_1 = 1;
	  else if (val == 2)
	    print_unpairables_2 = 1;
	  else {
	    fprintf(stderr, "invalid file number for `-a\n");
	    exit(2);
	  }
	  break;

	case 'e': empty_filler = argv[++i]; break;

	case '1': i++; join_column1 = argv[i]; 	break;
	case '2': i++; join_column2 = argv[i]; 	break;

	case 'j':
	    switch ( argv[i][2] ) {
	     case '1': i++; join_column1 = argv[i]; 	break;
	     case '2': i++; join_column2 = argv[i]; 	break;
	     default:
		i++; join_column1 = argv[i];
		     join_column2 = argv[i]; 	break;
	    }; 
	    break;

	case 't': tab = argv[i][2];		break;

	case 'v': val = atoi(&argv[i][2]);

	  if (val == 1)
	    print_unpairables_1 = 1;
	  else if (val == 2)
	    print_unpairables_2 = 1;
	  else {
	    fprintf(stderr, "invalid file number for `-v\n");
	    exit(2);
	  }
	  print_pairables = 0;
	  break;

	case 'A': Numeric = 0;	break;
	case 'n': Numeric = 1;	break;
	case 'h': Numeric = 2;	break;
	case 'r': Reverse = 1;	break;

	case 'o' : i++;
	    if ( (ofile = (!strcmp("-", argv[i]))
		    ? Stdout : Open(argv[i], "w")) == NULL ) {
		FPrint(Stderr, "jointable: can't open output file: %s\n"
			, argv[i]);
		perror("jointable");
		exit(1);
	    }

	case '?':
	  usage (0);

	case 0:
	  if (nfiles > 1)
	    usage (1);

	  names[nfiles++] = argv[i];
	  break;

	}
	continue;
    }
    if (nfiles > 1) usage (1);

    names[nfiles++] = argv[i];
}

  /* Now that we've seen the options, we can construct the blank line
     structure.  */
  make_blank (&uni_blank, uni_blank.nfields);

  if (nfiles != 2)
    usage (1);

  fp1 = Open(names[0], "r");
  if (!fp1) {
      perror(names[0]);
      exit(2);
  }
  fp2 = Open(names[1], "r");
  if (!fp2) {
      perror(names[1]);
      exit(2);
  }
  if (fp1 == fp2) {
      fprintf(stderr, "both files cannot be standard input\n");
      exit(2);
  }

  if ( (T1 = table_header(fp1, 0)) == NULL ) {
        FPrint(Stderr, "jointable: can't read table header from: %s\n", names[0]);
        exit(1);
  }
  if ( (T2 = table_header(fp2, 0)) == NULL ) {
        FPrint(stderr, "jointable: can't read table header from: %s\n", names[1]);
        exit(1);
  }

  if ( join_column1 == NULL ) {
        FPrint(stderr, "jointable: no join column for table 1\n");
        exit(1);
  }

  if ( join_column2 == NULL ) {
        FPrint(stderr, "jointable: no join column for table 2\n");
        exit(1);
  }
	
  if ( !(join_field_1 = table_colnum(T1, join_column1)) ) {
        FPrint(Stderr, "jointable: can't find column: %s in file: %s\n", join_column1, names[0]);
        exit(1);
  }
  if ( !(join_field_2 = table_colnum(T2, join_column2)) ) {
        FPrint(Stderr, "jointable: can't find column: %s in file: %s\n", join_column2, names[1]);
        exit(1);
  }

        FPrint(ofile, "%s%c", T1->header->column[join_field_1]
			, T1->header->ncol == 1 && T2->header->ncol == 1 ? '\n' : '\t');

        for ( i = 1; i <= T1->header->ncol; i++ )
            /* look up column name in the other table.
             */
          if ( i != join_field_1 ) {
		int c2 = table_colnum(T2, trim(table_colval(T1->header, i)));

            hdrfix(ofile, table_colval(T1->header, i)
                 , c2 != 0 && c2 != join_field_2 ? 1 : 0
                 , i == table_ncol(T1) && T2->header->ncol == 1 ? '\n' : '\t');
	  }

        { int lastcol = T2->header->ncol == join_field_2 ? join_field_2 - 1 : T2->header->ncol;

	  if ( lastcol != 0 ) {
            for ( i = 1; i <= lastcol; i++ )
              if ( i != join_field_2 ) {
		    int c1 = table_colnum(T1, trim(table_colval(T2->header, i)));

                hdrfix(ofile, table_colval(T2->header, i)
                     , c1 != 0 && c1 != join_field_1 ? 2 : 0
                     , i == lastcol ? '\n' : '\t');
	      }
	  }
        }

        FPrint(ofile, "%s%c", T1->dashes->column[join_field_1]
			, T1->header->ncol == 1 && T2->header->ncol == 1 ? '\n' : '\t');
        for ( i = 1; i <= table_ncol(T1); i++ )
            if ( i != join_field_1 ) FPrint(ofile, "%s%c"
                                , T1->dashes->column[i]
                                , i == table_ncol(T1) && T2->header->ncol == 1 ? '\n' : '\t');
        { int lastcol = T2->header->ncol == join_field_2 ? join_field_2 - 1 : T2->header->ncol;

	  if ( lastcol != 0 ) {
            for ( i = 1; i <= lastcol; i++ )
                if ( i != join_field_2 ) FPrint(ofile, "%s%c"
                                    , T2->dashes->column[i]
                                    , i == lastcol ? '\n' : '\t');
	  }
        }

    blank1.nfields = T1->header->ncol;
    blank2.nfields = T2->header->ncol;

    table_hdrfree(T1);
    table_hdrfree(T2);

  /* These line are moved from above so that the number of fields in a
     blank line may be set by the table headers.
   */
  /* Now that we've seen the options, we can construct the blank line
     structures.  */
  make_blank(&blank1, blank1.nfields);
  make_blank(&blank2, blank2.nfields);

  join_field_1--;
  join_field_2--;

  join (fp1, fp2);

  if ((fp1 == stdin || fp2 == stdin) && fclose (stdin) == EOF) {
      perror("-");
      exit(2);
  }
  if (ferror (ofile) || fclose (ofile) == EOF) {
      perror("write error");
      exit(2);
  }

  exit (0);
}

static void
usage (status)
     int status;
{
  if (status != 0)
    fprintf (stderr, "jointable -? for more information.\n",
	     program_name);
  else
    {
      printf ("\
Usage: %s [OPTION]... FILE1 FILE2\n\
",
	      program_name);
      printf ("\
For each pair of input lines with identical join fields, write a line to\n\
standard output.  The default join field is the first, delimited\n\
by whitespace.  When FILE1 or FILE2 (not both) is -, read standard input.\n\
\n\
  -a SIDE          print unpairable lines coming from file SIDE\n\
  -e EMPTY         replace missing input fields with EMPTY\n\
  -j FIELD         join on this FIELD for both files\n\
  -[j]SIDE FIELD   join on this FIELD for file SIDE\n\
  -o FORMAT        obey FORMAT while constructing output line\n\
  -t CHAR          use CHAR as input and output field separator\n\
  -v SIDE          like -a SIDE, but suppress joined output lines\n\
  -A               ascii sort order (default)\n\
  -n               numeric sort order\n\
  -h               numeric sort order w/ sexagesmal format accepted\n\
\n\
SIDE is 1 for FILE1 or 2 for FILE2.  Unless -t CHAR is given, leading blanks\n\
separate fields and are ignored, else fields are separated by CHAR.\n\
Any FIELD is a field number counted from 1.  FORMAT is one or more\n\
comma or blank separated specifications, each being `SIDE.FIELD'.\n\
Default FORMAT outputs the join field, the remaining fields from\n\
FILE1, the remaining fields from FILE2, all separated by CHAR.\n\
");
    }
  exit (status);
}