diff options
Diffstat (limited to 'src/suffixtree.cpp')
-rw-r--r-- | src/suffixtree.cpp | 358 |
1 files changed, 0 insertions, 358 deletions
diff --git a/src/suffixtree.cpp b/src/suffixtree.cpp deleted file mode 100644 index 641f502..0000000 --- a/src/suffixtree.cpp +++ /dev/null @@ -1,358 +0,0 @@ -/****************************************************************************** - * - * - * - * Copyright (C) 1997-2003 by Dimitri van Heesch. - * - * Permission to use, copy, modify, and distribute this software and its - * documentation under the terms of the GNU General Public License is hereby - * granted. No representations are made about the suitability of this software - * for any purpose. It is provided "as is" without express or implied warranty. - * See the GNU General Public License for more details. - * - * Documents produced by Doxygen are derivative works derived from the - * input used in their production; they are not affected by this license. - * - */ - -#include <stdio.h> - -#include "qtbc.h" -#include "suffixtree.h" - -#define MAXWORDLEN 1024 - -//---------------------------------------------------------------------------- - -bool writeString(QFile &f,const char *s) -{ - int len=strlen(s)+1; - return (f.writeBlock(s,len)!=len); -} - -bool writeNumber(QFile &f,int num) -{ - return (f.putch((num>>24)&0xff)==-1) || - (f.putch((num>>16)&0xff)==-1) || - (f.putch((num>>8)&0xff)==-1) || - (f.putch(num&0xff)==-1); -} - -static bool writeEncodedNumber(QFile &f,uint number) -{ - bool error=FALSE; - uint n=number; - while (n>=128) - { - int frac=n&0x7f; - error = error || (f.putch(frac|0x80)==-1); - n>>=7; - } - error = error || (f.putch(n)==-1); - return error; -} - -static int encodedNumberSize(uint number) -{ - uint n=number; - int size=1; - while (n>=128) { size++; n>>=7; } - return size; -} - -//---------------------------------------------------------------------------- - -int SuffixNodeList::compareItems(GCI item1,GCI item2) -{ - SuffixNode *n1=(SuffixNode *)item1; - SuffixNode *n2=(SuffixNode *)item2; - return strcmp(n1->label,n2->label); -} - -SuffixNode::SuffixNode(const char *lab) : references(0) -{ - children = new SuffixNodeList; - children->setAutoDelete(TRUE); - label=lab; - totalFreq=0; - branchOffset=0; -} - -SuffixNode::~SuffixNode() -{ - delete children; -} - -void SuffixNode::addReference(int refId,int inName,int fullWord) -{ - totalFreq++; - uint s=references.size(); - if (s>0 && references.at(s-1).id==refId) // word occured in the same document - { - references.at(s-1).freq++; // increase word's frequency - references.at(s-1).flags=((references.at(s-1).flags & INNAME_MASK) - | (inName<<INNAME_BIT)) - +((references.at(s-1).flags & FULLWORD_MASK) - | (fullWord<<FULLWORD_BIT)) - +((references.at(s-1).flags & WORDINNAME_MASK) - | ((inName & fullWord)<<WORDINNAME_BIT)); - } - else - { - references.resize(s+1); - references.at(s).id=refId; - references.at(s).freq=1; - references.at(s).flags=(inName<<INNAME_BIT) - +(fullWord<<FULLWORD_BIT) - +((inName && fullWord)<<WORDINNAME_BIT); - } -} - -int SuffixNode::insert(const char *word,int refId,int inName,int fullWord) -{ - int numNewNodes=0; - //printf("SuffixNode::insert(%s,%d)\n",word,refId); - SuffixNode *sn=children->first(); - while (sn) - { - const char *lab=sn->label.data(); - char w=word[0],l=lab[0],i=0; - while (w!=0 && l!=0 && w==l) { i++; w=word[i]; l=lab[i]; } - if (w==0 && l==0) // match found - { - sn->addReference(refId,inName,fullWord); - return numNewNodes; - } - if (i>0) // w and l contain a common prefix of length i - { - if (l==0) // w!=0 => follow this branch - { - sn->addReference(refId,inName,FALSE); - numNewNodes+=sn->insert(&word[i],refId,inName,fullWord); - } - else // l!=0 => split branch - { - char leftlab[MAXWORDLEN]; - memcpy(leftlab,lab,i); - leftlab[i]='\0'; - SuffixNode *r = new SuffixNode(leftlab); - numNewNodes++; - SuffixNode *n2 = children->take(); - // copy reference info - r->references = n2->references.copy(); - int j,refSize = r->references.size(); - for (j=0;j<refSize;j++) - { - //r->references[j].fullWord=FALSE; - //r->references[j].wordInName=FALSE; - r->references[j].flags &= ~(FULLWORD_MASK|WORDINNAME_MASK); - } - r->totalFreq = n2->totalFreq; - //printf("root branch `%s'\n",leftlab); - if (w!=0) // two sub branches - { - SuffixNode *n1 = new SuffixNode(&word[i]); - numNewNodes++; - n1->addReference(refId,inName,fullWord); - r->addReference(refId,inName,FALSE); - r->children->append(n1); - //printf("Right branch `%s'\n",&word[i]); - } - else // one sub branch - { - r->addReference(refId,inName,fullWord); - } - //printf("Left branch `%s'\n",&lab[i]); - n2->label=&lab[i]; - r->children->append(n2); - children->append(r); - } - return numNewNodes; - } - sn=children->next(); - } - //printf("new branch `%s'\n",word); - SuffixNode *n=new SuffixNode(word); - numNewNodes++; - n->addReference(refId,inName,fullWord); - children->append(n); - return numNewNodes; -} - -void SuffixNode::dump(int level,const char *prefix) -{ - uint i; - if (references.size()>0) - { - printf("%s (level=%d offset=%d freq=%d) ", - prefix,level,branchOffset,totalFreq); - for (i=0;i<references.size();i++) - printf("%d->%d ",references.at(i).id,references.at(i).freq); - printf("\n"); - } - SuffixNode *sn=children->first(); - while (sn) - { - sn->dump(level+1,prefix+("-"+sn->label)); - sn=children->next(); - } -} - -void SuffixNode::resolveForwardReferences(int &offset) -{ - if (children->count()>0) - { - if (!label.isEmpty()) offset++; // terminator for the previous level - branchOffset=offset; - } - else - branchOffset=0; - SuffixNode *sn=children->first(); - while (sn) - { - offset+=sn->label.length()+5; - uint i,refs=sn->references.size(); - if (refs>0) - { - offset+=encodedNumberSize(sn->totalFreq); - offset+=encodedNumberSize((sn->references[0].id<<3)+ - sn->references[0].flags); - offset+=encodedNumberSize(sn->references[0].freq); - for (i=1;i<refs;i++) - { - offset+=encodedNumberSize( - ((sn->references.at(i).id - sn->references.at(i-1).id)<<3)+ - sn->references.at(i).flags); - offset+=encodedNumberSize(sn->references.at(i).freq); - } - offset+=encodedNumberSize(0); - } - //printf("Lab=%s offset=%d\n",sn->lab.data(),offset); - sn=children->next(); - } - sn=children->first(); - while (sn) - { - //printf("Lab=%s offset=%d\n",sn->lab.data(),offset); - sn->resolveForwardReferences(offset); - sn=children->next(); - } -} - -int SuffixNode::size() -{ - int s=0; - if (!label.isEmpty() && children->count()>0) s++; // for the terminator - SuffixNode *sn=children->first(); - while (sn) - { - uint i,refs=sn->references.size(); - s+=sn->size()+sn->label.length()+5; - if (refs>0) - { - s+=encodedNumberSize(sn->totalFreq); - s+=encodedNumberSize( - (sn->references[0].id<<3)+ - sn->references[0].flags); - s+=encodedNumberSize(sn->references[0].freq); - for (i=1;i<refs;i++) - { - s+=encodedNumberSize( - ((sn->references.at(i).id - sn->references.at(i-1).id)<<3)+ - sn->references.at(i).flags); - s+=encodedNumberSize(sn->references.at(i).freq); - } - s+=encodedNumberSize(0); - } - sn=children->next(); - } - return s; -} - -bool SuffixNode::write(QFile &f) -{ - bool error=FALSE; - if (children->count()>0 && !label.isEmpty()) error=error || (f.putch(0)==-1); - SuffixNode *sn=children->first(); - while (sn) - { - //offset+=sn->lab.length()+1+2*sizeof(int); - int i,refs=sn->references.size(); - error=error || writeString(f,sn->label); - error=error || writeNumber(f,sn->branchOffset|((refs==0)?0x80000000:0)); - if (refs>0) - { - error=error || writeEncodedNumber(f,sn->totalFreq); - error=error || writeEncodedNumber(f, - (sn->references[0].id<<3)+ - sn->references[0].flags); - error=error || writeEncodedNumber(f,sn->references[0].freq); - for (i=1;i<refs;i++) - { - error=error || writeEncodedNumber(f, - ((sn->references[i].id - sn->references[i-1].id)<<3)+ - sn->references[i].flags); - error=error || writeEncodedNumber(f,sn->references[i].freq); - } - error=error || writeEncodedNumber(f,0); - } - //printf("Lab=%s offset=%d\n",sn->lab.data(),offset); - sn=children->next(); - } - sn=children->first(); - while (sn) - { - error=error || sn->write(f); - sn=children->next(); - } - return error; -} - -//---------------------------------------------------------------------------- - -SuffixTree::SuffixTree() -{ - root=new SuffixNode(""); - nodes=1; -} - -SuffixTree::~SuffixTree() -{ - delete root; -} - -void SuffixTree::insertWord(const char *word,int index,bool inName) -{ - QCString suffix=word; - uint i; - for (i=2;i<suffix.length();i++) - { - //printf("Inserting suffix %s\n",suffix.right(i).data()); - nodes+=root->insert(suffix.right(i),index,inName?1:0,FALSE); - } - nodes+=root->insert(word,index,inName?1:0,TRUE); -} - -void SuffixTree::dump() -{ - root->dump(0,""); -} - -void SuffixTree::resolveForwardReferences() -{ - int offset=8; - root->resolveForwardReferences(offset); -} - -int SuffixTree::size() -{ - return root->size(); -} - -bool SuffixTree::write(QFile &f) -{ - if (!f.isOpen()) { printf("File not open\n"); return FALSE; } - bool error=FALSE; - error = error || root->write(f); - return !error; -} |