summaryrefslogtreecommitdiffstats
path: root/src/suffixtree.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/suffixtree.cpp')
-rw-r--r--src/suffixtree.cpp355
1 files changed, 355 insertions, 0 deletions
diff --git a/src/suffixtree.cpp b/src/suffixtree.cpp
new file mode 100644
index 0000000..604531f
--- /dev/null
+++ b/src/suffixtree.cpp
@@ -0,0 +1,355 @@
+/******************************************************************************
+ *
+ * $Id$
+ *
+ * Copyright (C) 1997-1999 by Dimitri van Heesch.
+ *
+ * Permission to use, copy, modify, and distribute this software and its
+ * documentation under the terms of the GNU General Public License is hereby
+ * granted. No representations are made about the suitability of this software
+ * for any purpose. It is provided "as is" without express or implied warranty.
+ * See the GNU General Public License for more details.
+ *
+ * All output generated with Doxygen is not covered by this license.
+ *
+ */
+
+#include <stdio.h>
+#include "suffixtree.h"
+
+#define MAXWORDLEN 1024
+
+//----------------------------------------------------------------------------
+
+bool writeString(QFile &f,const char *s)
+{
+ int len=strlen(s)+1;
+ return (f.writeBlock(s,len)!=len);
+}
+
+bool writeNumber(QFile &f,int num)
+{
+ return (f.putch((num>>24)&0xff)==-1) ||
+ (f.putch((num>>16)&0xff)==-1) ||
+ (f.putch((num>>8)&0xff)==-1) ||
+ (f.putch(num&0xff)==-1);
+}
+
+bool writeEncodedNumber(QFile &f,uint number)
+{
+ bool error=FALSE;
+ uint n=number;
+ while (n>=128)
+ {
+ int frac=n&0x7f;
+ error = error || (f.putch(frac|0x80)==-1);
+ n>>=7;
+ }
+ error = error || (f.putch(n)==-1);
+ return error;
+}
+
+int encodedNumberSize(uint number)
+{
+ uint n=number;
+ int size=1;
+ while (n>=128) { size++; n>>=7; }
+ return size;
+}
+
+//----------------------------------------------------------------------------
+
+int SuffixNodeList::compareItems(GCI item1,GCI item2)
+{
+ SuffixNode *n1=(SuffixNode *)item1;
+ SuffixNode *n2=(SuffixNode *)item2;
+ return strcmp(n1->label,n2->label);
+}
+
+SuffixNode::SuffixNode(const char *lab) : references(0)
+{
+ children = new SuffixNodeList;
+ children->setAutoDelete(TRUE);
+ label=lab;
+ totalFreq=0;
+ branchOffset=0;
+}
+
+SuffixNode::~SuffixNode()
+{
+ delete children;
+}
+
+void SuffixNode::addReference(int refId,int inName,int fullWord)
+{
+ totalFreq++;
+ uint s=references.size();
+ if (s>0 && references.at(s-1).id==refId) // word occured in the same document
+ {
+ references.at(s-1).freq++; // increase word's frequency
+ references.at(s-1).flags=((references.at(s-1).flags & INNAME_MASK)
+ | (inName<<INNAME_BIT))
+ +((references.at(s-1).flags & FULLWORD_MASK)
+ | (fullWord<<FULLWORD_BIT))
+ +((references.at(s-1).flags & WORDINNAME_MASK)
+ | ((inName & fullWord)<<WORDINNAME_BIT));
+ }
+ else
+ {
+ references.resize(s+1);
+ references.at(s).id=refId;
+ references.at(s).freq=1;
+ references.at(s).flags=(inName<<INNAME_BIT)
+ +(fullWord<<FULLWORD_BIT)
+ +((inName && fullWord)<<WORDINNAME_BIT);
+ }
+}
+
+int SuffixNode::insert(const char *word,int refId,int inName,int fullWord)
+{
+ int numNewNodes=0;
+ //printf("SuffixNode::insert(%s,%d)\n",word,refId);
+ SuffixNode *sn=children->first();
+ while (sn)
+ {
+ char *lab=sn->label.data();
+ char w=word[0],l=lab[0],i=0;
+ while (w!=0 && l!=0 && w==l) { i++; w=word[i]; l=lab[i]; }
+ if (w==0 && l==0) // match found
+ {
+ sn->addReference(refId,inName,fullWord);
+ return numNewNodes;
+ }
+ if (i>0) // w and l contain a common prefix of length i
+ {
+ if (l==0) // w!=0 => follow this branch
+ {
+ sn->addReference(refId,inName,FALSE);
+ numNewNodes+=sn->insert(&word[i],refId,inName,fullWord);
+ }
+ else // l!=0 => split branch
+ {
+ char leftlab[MAXWORDLEN];
+ memcpy(leftlab,lab,i);
+ leftlab[i]='\0';
+ SuffixNode *r = new SuffixNode(leftlab);
+ numNewNodes++;
+ SuffixNode *n2 = children->take();
+ // copy reference info
+ r->references = n2->references.copy();
+ int j,refSize = r->references.size();
+ for (j=0;j<refSize;j++)
+ {
+ //r->references[j].fullWord=FALSE;
+ //r->references[j].wordInName=FALSE;
+ r->references[j].flags &= ~(FULLWORD_MASK|WORDINNAME_MASK);
+ }
+ r->totalFreq = n2->totalFreq;
+ //printf("root branch `%s'\n",leftlab);
+ if (w!=0) // two sub branches
+ {
+ SuffixNode *n1 = new SuffixNode(&word[i]);
+ numNewNodes++;
+ n1->addReference(refId,inName,fullWord);
+ r->addReference(refId,inName,FALSE);
+ r->children->append(n1);
+ //printf("Right branch `%s'\n",&word[i]);
+ }
+ else // one sub branch
+ {
+ r->addReference(refId,inName,fullWord);
+ }
+ //printf("Left branch `%s'\n",&lab[i]);
+ n2->label=&lab[i];
+ r->children->append(n2);
+ children->append(r);
+ }
+ return numNewNodes;
+ }
+ sn=children->next();
+ }
+ //printf("new branch `%s'\n",word);
+ SuffixNode *n=new SuffixNode(word);
+ numNewNodes++;
+ n->addReference(refId,inName,fullWord);
+ children->append(n);
+ return numNewNodes;
+}
+
+void SuffixNode::dump(int level,const char *prefix)
+{
+ uint i;
+ if (references.size()>0)
+ {
+ printf("%s (level=%d offset=%d freq=%d) ",
+ prefix,level,branchOffset,totalFreq);
+ for (i=0;i<references.size();i++)
+ printf("%d->%d ",references.at(i).id,references.at(i).freq);
+ printf("\n");
+ }
+ SuffixNode *sn=children->first();
+ while (sn)
+ {
+ sn->dump(level+1,prefix+("-"+sn->label));
+ sn=children->next();
+ }
+}
+
+void SuffixNode::resolveForwardReferences(int &offset)
+{
+ if (children->count()>0)
+ {
+ if (label.length()>0) offset++; // terminator for the previous level
+ branchOffset=offset;
+ }
+ else
+ branchOffset=0;
+ SuffixNode *sn=children->first();
+ while (sn)
+ {
+ offset+=sn->label.length()+5;
+ uint i,refs=sn->references.size();
+ if (refs>0)
+ {
+ offset+=encodedNumberSize(sn->totalFreq);
+ offset+=encodedNumberSize((sn->references[0].id<<3)+
+ sn->references[0].flags);
+ offset+=encodedNumberSize(sn->references[0].freq);
+ for (i=1;i<refs;i++)
+ {
+ offset+=encodedNumberSize(
+ ((sn->references.at(i).id - sn->references.at(i-1).id)<<3)+
+ sn->references.at(i).flags);
+ offset+=encodedNumberSize(sn->references.at(i).freq);
+ }
+ offset+=encodedNumberSize(0);
+ }
+ //printf("Lab=%s offset=%d\n",sn->lab.data(),offset);
+ sn=children->next();
+ }
+ sn=children->first();
+ while (sn)
+ {
+ //printf("Lab=%s offset=%d\n",sn->lab.data(),offset);
+ sn->resolveForwardReferences(offset);
+ sn=children->next();
+ }
+}
+
+int SuffixNode::size()
+{
+ int s=0;
+ if (label.length()>0 && children->count()>0) s++; // for the terminator
+ SuffixNode *sn=children->first();
+ while (sn)
+ {
+ uint i,refs=sn->references.size();
+ s+=sn->size()+sn->label.length()+5;
+ if (refs>0)
+ {
+ s+=encodedNumberSize(sn->totalFreq);
+ s+=encodedNumberSize(
+ (sn->references[0].id<<3)+
+ sn->references[0].flags);
+ s+=encodedNumberSize(sn->references[0].freq);
+ for (i=1;i<refs;i++)
+ {
+ s+=encodedNumberSize(
+ ((sn->references.at(i).id - sn->references.at(i-1).id)<<3)+
+ sn->references.at(i).flags);
+ s+=encodedNumberSize(sn->references.at(i).freq);
+ }
+ s+=encodedNumberSize(0);
+ }
+ sn=children->next();
+ }
+ return s;
+}
+
+bool SuffixNode::write(QFile &f)
+{
+ bool error=FALSE;
+ if (children->count()>0 && label.length()>0) error=error || (f.putch(0)==-1);
+ SuffixNode *sn=children->first();
+ while (sn)
+ {
+ //offset+=sn->lab.length()+1+2*sizeof(int);
+ int i,refs=sn->references.size();
+ error=error || writeString(f,sn->label);
+ error=error || writeNumber(f,sn->branchOffset|((refs==0)?0x80000000:0));
+ if (refs>0)
+ {
+ error=error || writeEncodedNumber(f,sn->totalFreq);
+ error=error || writeEncodedNumber(f,
+ (sn->references[0].id<<3)+
+ sn->references[0].flags);
+ error=error || writeEncodedNumber(f,sn->references[0].freq);
+ for (i=1;i<refs;i++)
+ {
+ error=error || writeEncodedNumber(f,
+ ((sn->references[i].id - sn->references[i-1].id)<<3)+
+ sn->references[i].flags);
+ error=error || writeEncodedNumber(f,sn->references[i].freq);
+ }
+ error=error || writeEncodedNumber(f,0);
+ }
+ //printf("Lab=%s offset=%d\n",sn->lab.data(),offset);
+ sn=children->next();
+ }
+ sn=children->first();
+ while (sn)
+ {
+ error=error || sn->write(f);
+ sn=children->next();
+ }
+ return error;
+}
+
+//----------------------------------------------------------------------------
+
+SuffixTree::SuffixTree()
+{
+ root=new SuffixNode("");
+ nodes=1;
+}
+
+SuffixTree::~SuffixTree()
+{
+ delete root;
+}
+
+void SuffixTree::insertWord(const char *word,int index,bool inName)
+{
+ QString suffix=word;
+ uint i;
+ for (i=2;i<suffix.length()-1;i++)
+ {
+ //printf("Inserting suffix %s\n",suffix.right(i).data());
+ nodes+=root->insert(suffix.right(i),index,inName?1:0,FALSE);
+ }
+ nodes+=root->insert(word,index,inName?1:0,TRUE);
+}
+
+void SuffixTree::dump()
+{
+ root->dump(0,"");
+}
+
+void SuffixTree::resolveForwardReferences()
+{
+ int offset=8;
+ root->resolveForwardReferences(offset);
+}
+
+int SuffixTree::size()
+{
+ return root->size();
+}
+
+bool SuffixTree::write(QFile &f)
+{
+ if (!f.isOpen()) { printf("File not open\n"); return FALSE; }
+ bool error=FALSE;
+ error = error || root->write(f);
+ return !error;
+}