diff options
Diffstat (limited to 'src/searchindex.cpp')
-rw-r--r-- | src/searchindex.cpp | 257 |
1 files changed, 257 insertions, 0 deletions
diff --git a/src/searchindex.cpp b/src/searchindex.cpp index 48047fb..36b416c 100644 --- a/src/searchindex.cpp +++ b/src/searchindex.cpp @@ -17,6 +17,9 @@ #include "qtbc.h" #include "searchindex.h" +#include <qfile.h> + +#if 0 #include "suffixtree.h" SearchIndex::SearchIndex() : refDict(10007), nameIndex(10007) @@ -124,3 +127,257 @@ bool SearchIndex::saveIndex(const char *fileName) //suffixTree->dump(); return TRUE; } +#endif + + +// file format: +// 4 byte header +// 256*256*4 byte index +// for each index entry: a zero terminated list of words +// for each word: a 0 terminated string + 4 bytes stats index +// padding bytes to align at 4 byte boundary +// for each word: a counter + for each url containing the word 8 bytes statistics +// for each url: a 0 terminated string + +const int numIndexEntries = 256*256; + +//-------------------------------------------------------------------- + +IndexWord::IndexWord(const char *word) : m_word(word), m_urls(17) +{ + m_urls.setAutoDelete(TRUE); +} + +void IndexWord::addUrlIndex(int idx) +{ + URLInfo *ui = m_urls.find(idx); + if (ui==0) + { + ui=new URLInfo(idx,0); + m_urls.insert(idx,ui); + } + ui->freq++; +} + +//-------------------------------------------------------------------- + +SearchIndex::SearchIndex() : m_words(328829), m_index(numIndexEntries), m_urlIndex(-1) +{ + int i; + m_words.setAutoDelete(TRUE); + m_urls.setAutoDelete(TRUE); + for (i=0;i<numIndexEntries;i++) m_index.insert(i,new QList<IndexWord>); +} + +void SearchIndex::setCurrentDoc(const char *name,const char *url) +{ + m_urlIndex++; + m_urls.insert(m_urlIndex,new URL(name,url)); +} + + +static int charsToIndex(const char *word) +{ + if (word==0) return -1; + int c1=word[0]; + if (c1==0) return -1; + int c2=word[1]; + if (c2==0) return -1; + return c1*256+c2; +} + +void SearchIndex::addWord(const char *word) +{ + IndexWord *w = m_words[word]; + if (w==0) + { + int idx=charsToIndex(word); + if (idx==-1) return; + w = new IndexWord(word); + //printf("addWord(%s) at index %d\n",word,idx); + m_index[idx]->append(w); + m_words.insert(word,w); + } + w->addUrlIndex(m_urlIndex); +} + + +static void writeInt(QFile &f,int index) +{ + f.putch(((uint)index)>>24); + f.putch((((uint)index)>>16)&0xff); + f.putch((((uint)index)>>8)&0xff); + f.putch(((uint)index)&0xff); +} + +static void writeString(QFile &f,const char *s) +{ + const char *p = s; + while (*p) f.putch(*p++); + f.putch(0); +} + +void SearchIndex::write(const char *fileName) +{ + int i; + int size=4; // for the header + size+=4*numIndexEntries; // for the index + int wordsOffset = size; + // first pass: compute the size of the wordlist + for (i=0;i<numIndexEntries;i++) + { + QList<IndexWord> *wlist = m_index[i]; + if (!wlist->isEmpty()) + { + QListIterator<IndexWord> iwi(*wlist); + IndexWord *iw; + for (iwi.toFirst();(iw=iwi.current());++iwi) + { + int ws = iw->word().length()+1; + size+=ws+4; // word + url info list offset + } + size+=1; // zero list terminator + } + } + + // second pass: compute the offsets in the index + int indexOffsets[numIndexEntries]; + int offset=wordsOffset; + for (i=0;i<numIndexEntries;i++) + { + QList<IndexWord> *wlist = m_index[i]; + if (!wlist->isEmpty()) + { + indexOffsets[i]=offset; + QListIterator<IndexWord> iwi(*wlist); + IndexWord *iw; + for (iwi.toFirst();(iw=iwi.current());++iwi) + { + offset+= iw->word().length()+1; + offset+=4; // word + offset to url info array + } + offset+=1; // zero list terminator + } + else + { + indexOffsets[i]=0; + } + } + int padding = size; + size = (size+3)&~3; // round up to 4 byte boundary + padding = size - padding; + + //int statsOffset = size; + QDictIterator<IndexWord> wdi(m_words); + //IndexWord *iw; + int *wordStatOffsets = new int[m_words.count()]; + + int count=0; + + // third pass: compute offset to stats info for each word + for (i=0;i<numIndexEntries;i++) + { + QList<IndexWord> *wlist = m_index[i]; + if (!wlist->isEmpty()) + { + QListIterator<IndexWord> iwi(*wlist); + IndexWord *iw; + for (iwi.toFirst();(iw=iwi.current());++iwi) + { + //printf("wordStatOffsets[%d]=%d\n",count,size); + wordStatOffsets[count++] = size; + size+=4+iw->urls().count()*8; // count + (url_index,freq) per url + } + } + } + int *urlOffsets = new int[m_urls.count()]; + //int urlsOffset = size; + QIntDictIterator<URL> udi(m_urls); + URL *url; + for (udi.toFirst();(url=udi.current());++udi) + { + urlOffsets[udi.currentKey()]=size; + size+=url->name.length()+1+ + url->url.length()+1; + } + //printf("Total size %x bytes (word=%x stats=%x urls=%x)\n",size,wordsOffset,statsOffset,urlsOffset); + QFile f(fileName); + if (f.open(IO_WriteOnly)) + { + // write header + f.putch('D'); f.putch('O'); f.putch('X'); f.putch('S'); + // write index + for (i=0;i<numIndexEntries;i++) + { + writeInt(f,indexOffsets[i]); + } + // write word lists + count=0; + for (i=0;i<numIndexEntries;i++) + { + QList<IndexWord> *wlist = m_index[i]; + if (!wlist->isEmpty()) + { + QListIterator<IndexWord> iwi(*wlist); + IndexWord *iw; + for (iwi.toFirst();(iw=iwi.current());++iwi) + { + writeString(f,iw->word()); + writeInt(f,wordStatOffsets[count++]); + } + f.putch(0); + } + } + // write extra padding bytes + for (i=0;i<padding;i++) f.putch(0); + // write word statistics + for (i=0;i<numIndexEntries;i++) + { + QList<IndexWord> *wlist = m_index[i]; + if (!wlist->isEmpty()) + { + QListIterator<IndexWord> iwi(*wlist); + IndexWord *iw; + for (iwi.toFirst();(iw=iwi.current());++iwi) + { + int numUrls = iw->urls().count(); + writeInt(f,numUrls); + QIntDictIterator<URLInfo> uli(iw->urls()); + URLInfo *ui; + for (uli.toFirst();(ui=uli.current());++uli) + { + writeInt(f,urlOffsets[ui->urlIdx]); + writeInt(f,ui->freq); + } + } + } + } + // write urls + QIntDictIterator<URL> udi(m_urls); + URL *url; + for (udi.toFirst();(url=udi.current());++udi) + { + writeString(f,url->name); + writeString(f,url->url); + } + } + + //for (wdi.toFirst();(iw=wdi.current());++wdi) + //{ + // printf("Word %s:\n",wdi.currentKey().data()); + // QIntDictIterator<URLInfo> udi(iw->urls()); + // URLInfo *ui; + // for (udi.toFirst();(ui=udi.current());++udi) + // { + // printf(" url[%d]=(name=%s,url=%s),freq=%d\n", + // ui->urlIdx, + // m_urls[ui->urlIdx]->name.data(), + // m_urls[ui->urlIdx]->url.data(), + // ui->freq); + // } + //} + + delete urlOffsets; + delete wordStatOffsets; +} + |