/****************************************************************************** * * $Id$ * * Copyright (C) 1997-1999 by Dimitri van Heesch. * * Permission to use, copy, modify, and distribute this software and its * documentation under the terms of the GNU General Public License is hereby * granted. No representations are made about the suitability of this software * for any purpose. It is provided "as is" without express or implied warranty. * See the GNU General Public License for more details. * * All output generated with Doxygen is not covered by this license. * */ #include #include "suffixtree.h" #define MAXWORDLEN 1024 //---------------------------------------------------------------------------- bool writeString(QFile &f,const char *s) { int len=strlen(s)+1; return (f.writeBlock(s,len)!=len); } bool writeNumber(QFile &f,int num) { return (f.putch((num>>24)&0xff)==-1) || (f.putch((num>>16)&0xff)==-1) || (f.putch((num>>8)&0xff)==-1) || (f.putch(num&0xff)==-1); } bool writeEncodedNumber(QFile &f,uint number) { bool error=FALSE; uint n=number; while (n>=128) { int frac=n&0x7f; error = error || (f.putch(frac|0x80)==-1); n>>=7; } error = error || (f.putch(n)==-1); return error; } int encodedNumberSize(uint number) { uint n=number; int size=1; while (n>=128) { size++; n>>=7; } return size; } //---------------------------------------------------------------------------- int SuffixNodeList::compareItems(GCI item1,GCI item2) { SuffixNode *n1=(SuffixNode *)item1; SuffixNode *n2=(SuffixNode *)item2; return strcmp(n1->label,n2->label); } SuffixNode::SuffixNode(const char *lab) : references(0) { children = new SuffixNodeList; children->setAutoDelete(TRUE); label=lab; totalFreq=0; branchOffset=0; } SuffixNode::~SuffixNode() { delete children; } void SuffixNode::addReference(int refId,int inName,int fullWord) { totalFreq++; uint s=references.size(); if (s>0 && references.at(s-1).id==refId) // word occured in the same document { references.at(s-1).freq++; // increase word's frequency references.at(s-1).flags=((references.at(s-1).flags & INNAME_MASK) | (inName<first(); while (sn) { char *lab=sn->label.data(); char w=word[0],l=lab[0],i=0; while (w!=0 && l!=0 && w==l) { i++; w=word[i]; l=lab[i]; } if (w==0 && l==0) // match found { sn->addReference(refId,inName,fullWord); return numNewNodes; } if (i>0) // w and l contain a common prefix of length i { if (l==0) // w!=0 => follow this branch { sn->addReference(refId,inName,FALSE); numNewNodes+=sn->insert(&word[i],refId,inName,fullWord); } else // l!=0 => split branch { char leftlab[MAXWORDLEN]; memcpy(leftlab,lab,i); leftlab[i]='\0'; SuffixNode *r = new SuffixNode(leftlab); numNewNodes++; SuffixNode *n2 = children->take(); // copy reference info r->references = n2->references.copy(); int j,refSize = r->references.size(); for (j=0;jreferences[j].fullWord=FALSE; //r->references[j].wordInName=FALSE; r->references[j].flags &= ~(FULLWORD_MASK|WORDINNAME_MASK); } r->totalFreq = n2->totalFreq; //printf("root branch `%s'\n",leftlab); if (w!=0) // two sub branches { SuffixNode *n1 = new SuffixNode(&word[i]); numNewNodes++; n1->addReference(refId,inName,fullWord); r->addReference(refId,inName,FALSE); r->children->append(n1); //printf("Right branch `%s'\n",&word[i]); } else // one sub branch { r->addReference(refId,inName,fullWord); } //printf("Left branch `%s'\n",&lab[i]); n2->label=&lab[i]; r->children->append(n2); children->append(r); } return numNewNodes; } sn=children->next(); } //printf("new branch `%s'\n",word); SuffixNode *n=new SuffixNode(word); numNewNodes++; n->addReference(refId,inName,fullWord); children->append(n); return numNewNodes; } void SuffixNode::dump(int level,const char *prefix) { uint i; if (references.size()>0) { printf("%s (level=%d offset=%d freq=%d) ", prefix,level,branchOffset,totalFreq); for (i=0;i%d ",references.at(i).id,references.at(i).freq); printf("\n"); } SuffixNode *sn=children->first(); while (sn) { sn->dump(level+1,prefix+("-"+sn->label)); sn=children->next(); } } void SuffixNode::resolveForwardReferences(int &offset) { if (children->count()>0) { if (label.length()>0) offset++; // terminator for the previous level branchOffset=offset; } else branchOffset=0; SuffixNode *sn=children->first(); while (sn) { offset+=sn->label.length()+5; uint i,refs=sn->references.size(); if (refs>0) { offset+=encodedNumberSize(sn->totalFreq); offset+=encodedNumberSize((sn->references[0].id<<3)+ sn->references[0].flags); offset+=encodedNumberSize(sn->references[0].freq); for (i=1;ireferences.at(i).id - sn->references.at(i-1).id)<<3)+ sn->references.at(i).flags); offset+=encodedNumberSize(sn->references.at(i).freq); } offset+=encodedNumberSize(0); } //printf("Lab=%s offset=%d\n",sn->lab.data(),offset); sn=children->next(); } sn=children->first(); while (sn) { //printf("Lab=%s offset=%d\n",sn->lab.data(),offset); sn->resolveForwardReferences(offset); sn=children->next(); } } int SuffixNode::size() { int s=0; if (label.length()>0 && children->count()>0) s++; // for the terminator SuffixNode *sn=children->first(); while (sn) { uint i,refs=sn->references.size(); s+=sn->size()+sn->label.length()+5; if (refs>0) { s+=encodedNumberSize(sn->totalFreq); s+=encodedNumberSize( (sn->references[0].id<<3)+ sn->references[0].flags); s+=encodedNumberSize(sn->references[0].freq); for (i=1;ireferences.at(i).id - sn->references.at(i-1).id)<<3)+ sn->references.at(i).flags); s+=encodedNumberSize(sn->references.at(i).freq); } s+=encodedNumberSize(0); } sn=children->next(); } return s; } bool SuffixNode::write(QFile &f) { bool error=FALSE; if (children->count()>0 && label.length()>0) error=error || (f.putch(0)==-1); SuffixNode *sn=children->first(); while (sn) { //offset+=sn->lab.length()+1+2*sizeof(int); int i,refs=sn->references.size(); error=error || writeString(f,sn->label); error=error || writeNumber(f,sn->branchOffset|((refs==0)?0x80000000:0)); if (refs>0) { error=error || writeEncodedNumber(f,sn->totalFreq); error=error || writeEncodedNumber(f, (sn->references[0].id<<3)+ sn->references[0].flags); error=error || writeEncodedNumber(f,sn->references[0].freq); for (i=1;ireferences[i].id - sn->references[i-1].id)<<3)+ sn->references[i].flags); error=error || writeEncodedNumber(f,sn->references[i].freq); } error=error || writeEncodedNumber(f,0); } //printf("Lab=%s offset=%d\n",sn->lab.data(),offset); sn=children->next(); } sn=children->first(); while (sn) { error=error || sn->write(f); sn=children->next(); } return error; } //---------------------------------------------------------------------------- SuffixTree::SuffixTree() { root=new SuffixNode(""); nodes=1; } SuffixTree::~SuffixTree() { delete root; } void SuffixTree::insertWord(const char *word,int index,bool inName) { QString suffix=word; uint i; for (i=2;iinsert(suffix.right(i),index,inName?1:0,FALSE); } nodes+=root->insert(word,index,inName?1:0,TRUE); } void SuffixTree::dump() { root->dump(0,""); } void SuffixTree::resolveForwardReferences() { int offset=8; root->resolveForwardReferences(offset); } int SuffixTree::size() { return root->size(); } bool SuffixTree::write(QFile &f) { if (!f.isOpen()) { printf("File not open\n"); return FALSE; } bool error=FALSE; error = error || root->write(f); return !error; }