/* * Copyright (c) 2005 Smithsonian Astrophysical Observatory */ /* * funjoin -j key t1.fits t2.fits t3.fits foo.fits */ /* NOTE(review): this copy of the file appears to have lost every span that began with a less-than sign followed by a letter and ran to the next greater-than sign: the header names of the three include lines below, several loop conditions, and parts of function bodies. It also stops part-way through main. Damaged spans are left as found and flagged where they occur; restore them from a pristine copy before building. */ #include #include #include /* #define JOIN_DEBUG 1 */ #if JOIN_DEBUG #define IPRINTF(x) fprintf x #else #define IPRINTF(x) #endif #ifndef ftol #define ftol(x,y,t) (fabs((double)x-(double)y)<=((double)t+(double)1.0E-15)) #endif #ifndef feq #define feq(x,y) (fabs((double)x-(double)y)<=(double)1.0E-15) #endif #define MAXIFILE 32 #define MAXOFILE 1 #define MAXROW 8192 #define KEY_STRING 1 #define KEY_NUMERIC 2 #define JFILES_COL "JFILES" typedef unsigned int JBITMASK; #define JBITSIZE (int)sizeof(JBITMASK) static int maxrow=MAXROW; typedef struct _colrec{ char *name, *oname; int type, mode, offset, n, width; int coffset; } *XCol, XColRec; typedef struct _filerec{ char *fname; char *jname; char *bstr; char *actstr; Fun fun; int eof; int rowsize; int rowoffset; char *rowbuf; char *blank; int counter; int tcol; int maxcol; int ncol; XCol cols; int dtype; int jtype, jmode, joffset, jn, jwidth; int idx; double dval; double mval; char *sval; int ibase; GIO igio; int irow; int maxindex; int nindex; int *indexes; } *XFile, XFileRec; extern char *optarg; extern int optind; /* usage: print the command synopsis and the list of optional switches for program name fname on stderr, then terminate the process with exit status 1 */ #ifdef ANSI_FUNC static void usage (char *fname) #else static void usage(fname) char *fname; #endif { fprintf(stderr, "usage: %s ifile1 ifile2 ... ifilen ofile\n", fname); fprintf(stderr, "optional switches:\n"); fprintf(stderr, " -a cols # columns to activate in all files\n"); fprintf(stderr, " -a1 cols ... an cols # columns to activate in each file\n"); fprintf(stderr, " -b 'c1:bvl,c2:bv2' # blank values for common columns in all files\n"); fprintf(stderr, " -bn 'c1:bv1,c2:bv2' # blank values for columns in specific files\n"); fprintf(stderr, " -j col # column to join in all files\n"); fprintf(stderr, " -j1 col ... 
jn col # column to join in each file\n"); fprintf(stderr, " -m min # min matches to output a row\n"); fprintf(stderr, " -M max # max matches to output a row\n"); fprintf(stderr, " -s # add 'jfiles' status column\n"); fprintf(stderr, " -S col # add col as status column\n"); fprintf(stderr, " -t tol # tolerance for joining numeric cols [2 files only]\n"); fprintf(stderr, "Between 2 and %d input files are allowed.\n", MAXIFILE); fprintf(stderr, "\n(version: %s)\n", FUN_VERSION); exit(1); } /* JoinMakeBlank: allocate file->blank as rowsize zeroed bytes and store the requested blank value for each named column at that column coffset. defblank (shared by all files) is applied first and file->bstr second, so file-specific values overwrite the defaults. Each specification has the form name:value, specifications are separated by comma or semicolon, and the value is converted according to the column type code. A specification without a colon is reported through gerror. */ #ifdef ANSI_FUNC static void JoinMakeBlank(XFile file, char *defblank) #else static void JoinMakeBlank(file, defblank) XFile file; char *defblank; #endif { int i, j; int ip=0; char tbuf[SZ_LINE]; char *b; char *blanks[2]; char *t, *v; unsigned char bval; short sval; unsigned short usval; int ival; longlong lval; unsigned int uival; float fval; double dval; /* start with a blank line */ file->blank = xcalloc(file->rowsize, sizeof(char)); /* blank specifications are separated by comma or semi */ newdtable(",;"); /* there are 2 places where we can get blank values: default and file */ if( defblank ) blanks[0] = xstrdup(defblank); else blanks[0] = NULL; if( file->bstr ) blanks[1] = xstrdup(file->bstr); else blanks[1] = NULL; /* process defaults first, then overwrite with specifics */ for(j=0; j<2; j++){ if( !blanks[j] || !*blanks[j] ) continue; ip = 0; while( word(blanks[j], tbuf, &ip) ){ if( (t=strchr(tbuf, ':')) ){ /* split in place: tbuf becomes the column name, v the value text */ v=t+1; *t = '\0'; for(i=0; i < file->ncol; i++){ if( !strcasecmp(tbuf, file->cols[i].oname) ){ b = file->blank+file->cols[i].coffset; switch(file->cols[i].type){ case 'L': bval = atoi(v); memcpy(b, &bval, sizeof(unsigned char)); break; case 'X': switch(file->cols[i].width/file->cols[i].n){ case 0: case 1: bval = atoi(v); memcpy(b, &bval, sizeof(unsigned char)); break; case 2: sval = atoi(v); memcpy(b, &sval, sizeof(short)); break; case 4: ival = atoi(v); memcpy(b, &ival, sizeof(int)); break; default: gerror(stderr, "only set blank value on X of size 1,2,4\n"); break; } break; case 
'B': bval = atoi(v); memcpy(b, &bval, sizeof(unsigned char)); break; case 'I': sval = atoi(v); memcpy(b, &sval, sizeof(short)); break; case 'J': ival = atoi(v); memcpy(b, &ival, sizeof(int)); break; case 'K': #if HAVE_LONG_LONG == 0 gerror(stderr, "64-bit data support not built (long long not available)\n"); #endif lval = atoll(v); memcpy(b, &lval, sizeof(longlong)); break; case 'U': usval = atoi(v); memcpy(b, &usval, sizeof(unsigned short)); break; case 'V': uival = atoi(v); memcpy(b, &uival, sizeof(unsigned int)); break; case 'E': /* test the value text v: t points at the NUL that replaced the colon, so comparing t could never match */ if( !strcasecmp(v, "nan") ){ fval = getnanf(); } else{ fval = atof(v); } memcpy(b, &fval, sizeof(float)); break; case 'D': /* as for E: the value text is v, not t */ if( !strcasecmp(v, "nan") ){ dval = getnand(); } else{ dval = atof(v); } memcpy(b, &dval, sizeof(double)); break; case 'A': /* copies at most width bytes; no terminator is added when v is that long or longer */ strncpy(b, v, file->cols[i].width); break; } } } } else{ gerror(stderr, "invalid blank specification: %s\n", tbuf); } } } /* free up delim table */ freedtable(); /* free blank strings */ for(i=0; i<2; i++){ if( blanks[i] ) xfree(blanks[i]); } } /* JoinAddCol: append one column description to file->cols. The first call allocates a single slot and the capacity doubles whenever the array is full. name and oname are duplicated with xstrdup. Returns the column count after the append. */ #ifdef ANSI_FUNC static int JoinAddCol(XFile file, char *name, char *oname, int type, int mode, int offset, int n, int width, int coffset) #else static int JoinAddCol(file, name, oname, type, mode, offset, n, width, coffset) XFile file; char *name; char *oname; int type; int mode; int offset; int n; int width; int coffset; #endif { if( !file->maxcol ){ file->maxcol = 1; file->cols = xcalloc(file->maxcol, sizeof(XColRec)); } else if( file->ncol >= file->maxcol ){ file->maxcol *= 2; file->cols = xrealloc(file->cols, file->maxcol*sizeof(XColRec)); } file->cols[file->ncol].name = xstrdup(name); file->cols[file->ncol].oname = xstrdup(oname); file->cols[file->ncol].type = type; file->cols[file->ncol].mode = mode; file->cols[file->ncol].offset = offset; file->cols[file->ncol].n = n; file->cols[file->ncol].width = width; file->cols[file->ncol].coffset = coffset; file->ncol++; return file->ncol; } /* JoinFilesLeft: NOTE(review): the text from the loop condition below up to the nfile test is missing in this copy, so the rest of this function and the head of the routine that follows it cannot be seen. The visible tail reads one row with FunTableRowGet for every file from lo to hi that is not at eof, marks a file eof when no row comes back, and returns the number of rows read; presumably that tail belongs to JoinReadNext, which is called elsewhere in this file - confirm against a pristine copy. */ #ifdef ANSI_FUNC static int JoinFilesLeft(XFile ifiles, int 
nfile) #else static int JoinFilesLeft(ifiles, nfile) XFile ifiles; int nfile; #endif { int i; int left=0; for(i=0; i nfile ) return 0; for(i=lo; i<=hi; i++){ if( ifiles[i].eof ) continue; FunTableRowGet(ifiles[i].fun, (void *)&(ifiles[i]), 1, NULL, &got); if( !got ){ ifiles[i].eof = 1; } else{ nrec++; } } return nrec; } /* JoinGetMatches: zero the nfile bytes of matches, then store in matches[0..m-1] the indexes of the files whose current key matches and return m. NOTE(review): the code that clears index positions and chooses the base value is missing in this copy; only the end of the numeric comparison is visible (ftol when tol is positive, otherwise feq). */ #ifdef ANSI_FUNC static int JoinGetMatches(XFile ifiles, int nfile, int ktype, double tol, char *matches) #else static int JoinGetMatches(ifiles, nfile, ktype, tol, matches) XFile ifiles; int nfile; int ktype; double tol; char *matches; #endif { int i; int m=0; int ibase=-1; char *sval=NULL; double dval=0.0; /* no matches yet */ memset(matches, 0, nfile); /* clear index position information */ for(i=0; i 0.0) && ftol(ifiles[i].dval,dval,tol) ){ matches[m++] = i; ifiles[i].mval = dval; FunInfoGet(ifiles[i].fun, FUN_ROW, &ifiles[i].irow, 0); } /* "exact" match */ else if( feq(ifiles[i].dval,dval) ){ matches[m++] = i; ifiles[i].irow = -1; } } break; } if( ibase >=0 ) ifiles[ibase].ibase = 1; return m; } /* JoinAddIndex: append idx to ifile->indexes. The first call allocates a single slot and the capacity doubles whenever the array is full. Returns the number of stored indexes after the append. */ #ifdef ANSI_FUNC static int JoinAddIndex(XFile ifile, int idx) #else static int JoinAddIndex(ifile, idx) XFile ifile; int idx; #endif { if( !ifile->maxindex ){ ifile->maxindex = 1; ifile->indexes = xcalloc(ifile->maxindex, sizeof(int)); } else if( ifile->nindex >= ifile->maxindex ){ ifile->maxindex *= 2; ifile->indexes = xrealloc(ifile->indexes, ifile->maxindex*sizeof(int)); } ifile->indexes[ifile->nindex++] = idx; return ifile->nindex; } /* JoinGatherRows: set *resetflag to -1 and return at once when nmatch is 0; otherwise record, through JoinAddIndex, the idx of each row that matches, and set *resetflag to the base file index when a new base value lies within twice the tolerance of the previous one. NOTE(review): the loop heads and the base-value lookup are missing in this copy. The old-style parameter list previously lacked the semicolon after the resetflag declaration. */ #ifdef ANSI_FUNC static void JoinGatherRows(XFile ifiles, int nfile, int ktype, double tol, char *matches, int nmatch, int *resetflag) #else static void JoinGatherRows(ifiles, nfile, ktype, tol, matches, nmatch, resetflag) XFile ifiles; int nfile; int ktype; double tol; char *matches; int nmatch; int *resetflag; #endif { int i, j; int ibase=-1; char *sval=NULL; double dval; double mval; /* no need to reset rows yet */ *resetflag = -1; /* make sure we have matches to process */ if( !nmatch ) return; /* find base value */ 
for(j=0; j 0.0) && (i!=ibase) && ftol(ifiles[i].dval,mval,tol) ){ JoinAddIndex(&ifiles[i], ifiles[i].idx); } else if( feq(ifiles[i].dval,dval) ){ JoinAddIndex(&ifiles[i], ifiles[i].idx); } else{ /* If this is the base file and we are using a tolerance, and the difference between the last base and this base value is less than twice the tolerance, we have to reset all other file positions and re-check those values against this new base value. */ if( (tol > 0.0) && (i==ibase) && ftol(ifiles[i].dval,dval,(2*tol)) ){ *resetflag = ibase; } break; } } } } } /* JoinWriteRows: allocate the per-file flags array and, when jbits is non-zero, the jfiles bitmask; return 0 if either allocation fails and 1 otherwise. The visible tail steps ifiles[ii].counter for the matched files, wrapping a counter to 0 once it reaches nindex, and frees both buffers. NOTE(review): the code that assembles and writes the output rows is missing in this copy. */ #ifdef ANSI_FUNC static int JoinWriteRows(XFile files, XFile ifiles, int nfile, char *matches, int nmatch, int jbits, XFile ofiles) #else static int JoinWriteRows(files, ifiles, nfile, matches, nmatch, jbits, ofiles) XFile files; XFile ifiles; int nfile; char *matches; int nmatch; int jbits; XFile ofiles; #endif { int i, j, k; int ii; int got; int nrow=1; char *buf; char *rowptr; char *flags=NULL; JBITMASK *jfiles=NULL; /* allocate flags to tell which files we process */ if( !(flags=xcalloc(nfile, sizeof(char))) ){ return 0; } /* allocate space for joinfiles bitflag */ if( jbits ){ if( !(jfiles=xcalloc(jbits/JBITSIZE, JBITSIZE)) ){ /* do not leak flags on this early exit */ xfree(flags); return 0; } } /* set initial values for files which have joins */ for(i=0; i=0; j--){ ii = matches[j]; ifiles[ii].counter++; if( ifiles[ii].counter >= ifiles[ii].nindex ){ ifiles[ii].counter = 0; } else{ break; } } } /* free up space */ if( flags ) xfree(flags); if( jfiles) xfree(jfiles); return 1; } /* JoinResetRows: when resetflag is non-negative, clear eof for the affected files, seek each back to its saved row irow with FunTableRowSeek and read that row again through JoinReadNext. NOTE(review): the loop head and part of the selecting condition are missing in this copy. */ #ifdef ANSI_FUNC static void JoinResetRows(XFile ifiles, char *matches, int nmatch, int resetflag) #else static void JoinResetRows(ifiles, matches, nmatch, resetflag) XFile ifiles; char *matches; int nmatch; int resetflag; #endif { int i, j; /* reset index positions so that we re-check tolerances */ if( resetflag >= 0 ){ for(j=0; j=0) ){ ifiles[i].eof = 0; FunTableRowSeek(ifiles[i].fun, ifiles[i].irow, NULL); JoinReadNext(ifiles, i, i); } } } } /* main: parse the switches and file names (the last file name is the output), open the index file of the join column for each input, build the output column list, then loop over the index rows, writing rows whose match count lies between minmatch and maxmatch. NOTE(review): several spans are missing and the function is cut off at the end of this copy. */ #ifdef ANSI_FUNC int main (int argc, char **argv) 
#else int main(argc, argv) int argc; char **argv; #endif { int i, j, k; int type, mode, offset, n, width; int namei; int coffset; int resetflag=-1; int jbits=0; int nmatch=0; int minmatch=1; int maxmatch=MAXIFILE; int nfile=0; int tcol=0; int osize=0; int oncol=0; int ktype=0; int *ooffsets=NULL; char *matches=NULL; char *name; char *basename; char *defact=NULL; char *defcol=NULL; char *defblank=NULL; char *s; char *filtstr; char *jfiles=NULL; char **onames=NULL; char **otypes=NULL; char **omodes=NULL; char tbuf[SZ_LINE]; char tbuf2[SZ_LINE]; char namebuf[SZ_LINE]; double tlmin, tlmax, binsiz, tscale, tzero; /* 0.0 means no tolerance was requested; tol is tested below even when -t is absent, so it must start defined */ double tol=0.0; XFile files=NULL, ifiles=NULL, ofiles=NULL; /* exit on gio errors */ if( !getenv("GERROR") ) setgerror(2); /* get maxrow,if user-specified */ if( (s=getenv("FUN_MAXROW")) != NULL ) maxrow = atoi(s); /* we are using indexes specially and don't want to use them normally */ putenv("FILTER_IDX_ACTIVATE=false"); /* allocate input and output file arrays (we'll do the index array later) */ if( !(files=xcalloc(MAXIFILE, sizeof(XFileRec))) ){ gerror(stderr, "can't allocate primary record structure for join\n"); } if( !(ofiles=xcalloc(MAXOFILE, sizeof(XFileRec))) ){ gerror(stderr, "can't allocate output record structure for join\n"); } /* process arguments */ /* NOTE(review): the loop condition and the start of the switch parsing are missing in this copy */ for(i=1; i= 0) && (j < MAXIFILE) && (i < argc-1)){ files[j].actstr = argv[++i]; } else{ gerror(stderr, "invalid index for column activate: %d\n", j+1); } } else{ defact = argv[++i]; } break; case 'b': if( argv[i][2] ){ j = atoi(&argv[i][2])-1; if( (j >= 0) && (j < MAXIFILE) && (i < argc-1)){ files[j].bstr = argv[++i]; } else{ gerror(stderr, "invalid index for blank values: %d\n", j+1); } } else{ defblank = argv[++i]; } break; case 'j': if( argv[i][2] ){ j = atoi(&argv[i][2])-1; if( (j >= 0) && (j < MAXIFILE) && (i < argc-1)){ files[j].jname = argv[++i]; } else{ gerror(stderr, "invalid index for join column: %d\n", j+1); } } else{ if (i < argc-1) { defcol = argv[++i]; } } break; case 'm': if (i < argc-1) { 
minmatch = atoi(argv[++i])+1; } if( minmatch < 1 ) minmatch = 1; break; case 'M': if (i < argc-1) { maxmatch = atoi(argv[++i])+1; } if( maxmatch < 1 ) maxmatch = 1; break; case 's': jfiles = JFILES_COL; break; case 'S': if (i < argc-1) { jfiles = argv[++i]; } break; case 't': if (i < argc-1) { tol = atof(argv[++i]); } if( tol <= 0 ){ gerror(stderr, "tolerance value must be positive\n"); } break; } continue; } /* no switch -- must be a file name */ if( nfile < MAXIFILE ){ files[nfile].fname = argv[i]; nfile++; } else{ gerror(stderr, "too many files (%d > %d)\n", nfile, MAXIFILE); } } /* make sure we have at least 2 input + 1 output file args */ if( nfile < 3 ) usage(argv[0]); /* for now, tolerance only works with 2 files */ if( (tol > 0.0) && (nfile > 3) ){ gerror(stderr, "for now, -t [tol] can only join 2 files\n"); } /* move last input to output and decrement number of input files */ ofiles[0].fname = files[nfile-1].fname; nfile--; /* reallocate input files */ if( !(files=xrealloc(files, nfile*sizeof(XFileRec))) ){ gerror(stderr, "can't re-allocate primary record structure for join\n"); } /* make sure we have a join column name for each file */ /* NOTE(review): the text from this loop condition up to the index file name lookup is missing in this copy */ for(i=0; iheader->filename, NULL); s = idxindexfilename(files[i].jname, NULL); idxfreefilenames(); if( !s ){ gerror(stderr, "can't find index file for column '%s' in file: %s\n", files[i].jname, files[i].fname); } if( !(ifiles[i].fun = FunOpen(s, "r", NULL)) ){ gerror(stderr, "can't FunOpen index file (or find extension): %s\n", s); } /* get gio handle for seeking and saving */ FunInfoGet(ifiles[i].fun, FUN_GIO, &ifiles[i].igio, 0); /* make sure the join column is in the index file */ ifiles[i].fname = xstrdup(s); ifiles[i].jname = files[i].jname; if( !FunColumnLookup(ifiles[i].fun, ifiles[i].jname, 0, NULL, &ifiles[i].jtype, &ifiles[i].jmode, &ifiles[i].joffset, &ifiles[i].jn, &ifiles[i].jwidth) ){ gerror(stderr, "can't find column %s in index file: %s\n", ifiles[i].jname, ifiles[i].fname); } /* define how we will read 
index file, based on data type of join column */ switch(ifiles[i].jtype){ case 'B': case 'I': case 'J': case 'K': case 'U': case 'V': case 'L': case 'X': FunColumnSelect(ifiles[i].fun, sizeof(XFileRec), NULL, "n", "J", "r", FUN_OFFSET(XFile, idx), ifiles[i].jname, "D", "r", FUN_OFFSET(XFile, dval), NULL); ifiles[i].dtype = 'D'; ktype |= KEY_NUMERIC; break; case 'D': case 'E': FunColumnSelect(ifiles[i].fun, sizeof(XFileRec), NULL, "n", "J", "r", FUN_OFFSET(XFile, idx), ifiles[i].jname, "D", "r", FUN_OFFSET(XFile, dval), NULL); ifiles[i].dtype = 'D'; ktype |= KEY_NUMERIC; break; case 'A': snprintf(tbuf, SZ_LINE-1, "@%dA", ifiles[i].jn); FunColumnSelect(ifiles[i].fun, sizeof(XFileRec), NULL, "n", "J", "r", FUN_OFFSET(XFile, idx), ifiles[i].jname, tbuf, "r", FUN_OFFSET(XFile, sval), NULL); ifiles[i].dtype = 'A'; ifiles[i].sval = xcalloc(ifiles[i].jn+1, sizeof(char)); ktype |= KEY_STRING; break; default: gerror(stderr, "bad datatype for join column: %c\n", ifiles[i].jtype); } /* free up temp space */ if( s ) xfree(s); } /* we don't allow mixing of string and numeric values */ if( ktype == (KEY_STRING|KEY_NUMERIC) ){ gerror(stderr, "can't mix string and numeric join columns\n"); } /* open output file */ if( !(ofiles[0].fun = FunOpen(ofiles[0].fname, "w", NULL)) ){ gerror(stderr, "can't FunOpen output file: %s\n", ofiles[0].fname); } /* allocate space for the max number of columns we can have (incl jfiles) */ onames = (char **)xcalloc(tcol+1, sizeof(char *)); otypes = (char **)xcalloc(tcol+1, sizeof(char *)); omodes = (char **)xcalloc(tcol+1, sizeof(char *)); ooffsets = (int *)xcalloc(tcol+1, sizeof(int)); /* construct list of output columns */ /* NOTE(review): the text from this loop condition up to the debug print is missing in this copy */ for(i=0; i %s\n", i, name, type, mode, offset, n, tbuf)); /* save column info */ onames[oncol] = xstrdup(name); otypes[oncol] = xstrdup(tbuf); omodes[oncol] = "w"; ooffsets[oncol] = osize; /* total size for all files */ osize += width; oncol++; } } /* create blank line for this file */ JoinMakeBlank(&files[i], defblank); /* add 
filename to header */ FunParamPuts(ofiles[0].fun, "JFILE", i+1, files[i].fname, "join file", 1); } /* and one more for joinfiles, if needed */ if( jfiles ){ /* strncpy adds no terminator when the name fills the count, so terminate explicitly */ strncpy(tbuf, jfiles, SZ_LINE-1); tbuf[SZ_LINE-1] = '\0'; cluc(tbuf); onames[oncol] = xstrdup(tbuf); jbits = ((MAXIFILE+7)/8)*8; snprintf(tbuf, SZ_LINE-1, "%dX", jbits); otypes[oncol] = xstrdup(tbuf); omodes[oncol] = "w"; ooffsets[oncol] = osize; /* save for later use */ ofiles[0].rowoffset = osize; /* total size for all files */ osize += (jbits/8); oncol++; } /* reallocate output column array to correct size */ onames = (char **)xrealloc(onames, oncol*sizeof(char *)); otypes = (char **)xrealloc(otypes, oncol*sizeof(char *)); omodes = (char **)xrealloc(omodes, oncol*sizeof(char *)); ooffsets = (int *)xrealloc(ooffsets, oncol*sizeof(int)); /* set up the output columns */ FunColumnSelectArr(ofiles[0].fun, osize, NULL, onames, otypes, omodes, ooffsets, oncol); /* this tells us the size of the output buffer */ FunInfoGet(ofiles[0].fun, FUN_ROWSIZE, &(ofiles[0].rowsize), 0); ofiles[0].rowbuf = xcalloc(ofiles[0].rowsize, sizeof(char)); /* read a record from each index */ JoinReadNext(ifiles, nfile, -nfile); /* check for joins, all rows in all files */ while( JoinFilesLeft(ifiles, nfile) > 0 ){ nmatch = JoinGetMatches(ifiles, nfile, ktype, tol, matches); /* yikes ... when we reset rows to check against the next base, we don't want to write out anything if there is no match, since this was already done with the last base ... 
its confusing */ if( (tol > 0.0) && (nmatch == 1) && (resetflag >=0) && (resetflag != matches[0]) ){ for(i=0; i= minmatch) && (nmatch <= maxmatch) ){ JoinGatherRows(ifiles, nfile, ktype, tol, matches, nmatch, &resetflag); /* write all matched rows */ if(!JoinWriteRows(files, ifiles, nfile, matches, nmatch, jbits, ofiles)){ gerror(stderr, "can't write rows for join\n"); } /* might have to reset the rows when using tolerance values */ JoinResetRows(ifiles, matches, nmatch, resetflag); } /* no output, so we advance the rows that matched */ else{ for(i=0; i