diff options
Diffstat (limited to 'src/searchindex.cpp')
-rw-r--r-- | src/searchindex.cpp | 285 |
1 files changed, 0 insertions, 285 deletions
diff --git a/src/searchindex.cpp b/src/searchindex.cpp deleted file mode 100644 index 400702b..0000000 --- a/src/searchindex.cpp +++ /dev/null @@ -1,285 +0,0 @@ -/****************************************************************************** - * - * - * - * Copyright (C) 1997-2005 by Dimitri van Heesch. - * - * Permission to use, copy, modify, and distribute this software and its - * documentation under the terms of the GNU General Public License is hereby - * granted. No representations are made about the suitability of this software - * for any purpose. It is provided "as is" without express or implied warranty. - * See the GNU General Public License for more details. - * - * Documents produced by Doxygen are derivative works derived from the - * input used in their production; they are not affected by this license. - * - */ - -#include "qtbc.h" -#include "searchindex.h" -#include "config.h" -#include <qfile.h> -#include <ctype.h> - - -// file format: (all multi-byte values are stored in big endian format) -// 4 byte header -// 256*256*4 byte index (4 bytes) -// for each index entry: a zero terminated list of words -// for each word: a \0 terminated string + 4 byte offset to the stats info -// padding bytes to align at 4 byte boundary -// for each word: the number of urls (4 bytes) -// + for each url containing the word 8 bytes statistics -// (4 bytes index to url string + 4 bytes frequency counter) -// for each url: a \0 terminated string - -const int numIndexEntries = 256*256; - -//-------------------------------------------------------------------- - -IndexWord::IndexWord(const char *word) : m_word(word), m_urls(17) -{ - m_urls.setAutoDelete(TRUE); - //printf("IndexWord::IndexWord(%s)\n",word); -} - -void IndexWord::addUrlIndex(int idx,bool hiPriority) -{ - //printf("IndexWord::addUrlIndex(%d,%d)\n",idx,hiPriority); - URLInfo *ui = m_urls.find(idx); - if (ui==0) - { - //printf("URLInfo::URLInfo(%d)\n",idx); - ui=new URLInfo(idx,0); - m_urls.insert(idx,ui); - } - ui->freq+=2; - if (hiPriority) ui->freq|=1; // mark as high priority document -} - -//-------------------------------------------------------------------- - -SearchIndex::SearchIndex() : m_words(328829), m_index(numIndexEntries), m_urlIndex(-1) -{ - int i; - m_words.setAutoDelete(TRUE); - m_urls.setAutoDelete(TRUE); - for (i=0;i<numIndexEntries;i++) m_index.insert(i,new QList<IndexWord>); -} - -void SearchIndex::setCurrentDoc(const char *name,const char *baseName,const char *anchor) -{ - //printf("SearchIndex::setCurrentDoc(%s,%s,%s)\n",name,baseName,anchor); - QCString url=baseName+Config_getString("HTML_FILE_EXTENSION"); - if (anchor) url+=(QCString)"#"+anchor; - m_urlIndex++; - m_urls.insert(m_urlIndex,new URL(name,url)); -} - -static int charsToIndex(const char *word) -{ - if (word==0) return -1; - - // Fast string hashing algorithm - //register ushort h=0; - //const char *k = word; - //ushort mask=0xfc00; - //while ( *k ) - //{ - // h = (h&mask)^(h<<6)^(*k++); - //} - //return h; - - // Simple hashing that allows for substring searching - uint c1=word[0]; - if (c1==0) return -1; - uint c2=word[1]; - if (c2==0) return -1; - return c1*256+c2; -} - -void SearchIndex::addWord(const char *word,bool hiPriority) -{ - //printf("SearchIndex::addWord(%s,%d)\n",word,hiPriority); - //QString wStr=QString(word).lower(); - QString wStr(word); - wStr=wStr.lower(); - if (wStr.isEmpty()) return; - IndexWord *w = m_words[wStr]; - if (w==0) - { - int idx=charsToIndex(wStr); - if (idx<0) return; - w = new IndexWord(wStr); - //fprintf(stderr,"addWord(%s) at index %d\n",word,idx); - m_index[idx]->append(w); - m_words.insert(wStr,w); - } - w->addUrlIndex(m_urlIndex,hiPriority); -} - - -static void writeInt(QFile &f,int index) -{ - f.putch(((uint)index)>>24); - f.putch((((uint)index)>>16)&0xff); - f.putch((((uint)index)>>8)&0xff); - f.putch(((uint)index)&0xff); -} - -static void writeString(QFile &f,const char *s) -{ - const char *p = s; - while (*p) f.putch(*p++); - f.putch(0); -} - -void SearchIndex::write(const char *fileName) -{ - int i; - int size=4; // for the header - size+=4*numIndexEntries; // for the index - int wordsOffset = size; - // first pass: compute the size of the wordlist - for (i=0;i<numIndexEntries;i++) - { - QList<IndexWord> *wlist = m_index[i]; - if (!wlist->isEmpty()) - { - QListIterator<IndexWord> iwi(*wlist); - IndexWord *iw; - for (iwi.toFirst();(iw=iwi.current());++iwi) - { - int ws = iw->word().length()+1; - size+=ws+4; // word + url info list offset - } - size+=1; // zero list terminator - } - } - - // second pass: compute the offsets in the index - int indexOffsets[numIndexEntries]; - int offset=wordsOffset; - for (i=0;i<numIndexEntries;i++) - { - QList<IndexWord> *wlist = m_index[i]; - if (!wlist->isEmpty()) - { - indexOffsets[i]=offset; - QListIterator<IndexWord> iwi(*wlist); - IndexWord *iw; - for (iwi.toFirst();(iw=iwi.current());++iwi) - { - offset+= iw->word().length()+1; - offset+=4; // word + offset to url info array - } - offset+=1; // zero list terminator - } - else - { - indexOffsets[i]=0; - } - } - int padding = size; - size = (size+3)&~3; // round up to 4 byte boundary - padding = size - padding; - - //int statsOffset = size; - QDictIterator<IndexWord> wdi(m_words); - //IndexWord *iw; - int *wordStatOffsets = new int[m_words.count()]; - - int count=0; - - // third pass: compute offset to stats info for each word - for (i=0;i<numIndexEntries;i++) - { - QList<IndexWord> *wlist = m_index[i]; - if (!wlist->isEmpty()) - { - QListIterator<IndexWord> iwi(*wlist); - IndexWord *iw; - for (iwi.toFirst();(iw=iwi.current());++iwi) - { - //printf("wordStatOffsets[%d]=%d\n",count,size); - wordStatOffsets[count++] = size; - size+=4+iw->urls().count()*8; // count + (url_index,freq) per url - } - } - } - int *urlOffsets = new int[m_urls.count()]; - //int urlsOffset = size; - QIntDictIterator<URL> udi(m_urls); - URL *url; - for (udi.toFirst();(url=udi.current());++udi) - { - urlOffsets[udi.currentKey()]=size; - size+=url->name.length()+1+ - url->url.length()+1; - } - //printf("Total size %x bytes (word=%x stats=%x urls=%x)\n",size,wordsOffset,statsOffset,urlsOffset); - QFile f(fileName); - if (f.open(IO_WriteOnly)) - { - // write header - f.putch('D'); f.putch('O'); f.putch('X'); f.putch('S'); - // write index - for (i=0;i<numIndexEntries;i++) - { - writeInt(f,indexOffsets[i]); - } - // write word lists - count=0; - for (i=0;i<numIndexEntries;i++) - { - QList<IndexWord> *wlist = m_index[i]; - if (!wlist->isEmpty()) - { - QListIterator<IndexWord> iwi(*wlist); - IndexWord *iw; - for (iwi.toFirst();(iw=iwi.current());++iwi) - { - writeString(f,iw->word()); - writeInt(f,wordStatOffsets[count++]); - } - f.putch(0); - } - } - // write extra padding bytes - for (i=0;i<padding;i++) f.putch(0); - // write word statistics - for (i=0;i<numIndexEntries;i++) - { - QList<IndexWord> *wlist = m_index[i]; - if (!wlist->isEmpty()) - { - QListIterator<IndexWord> iwi(*wlist); - IndexWord *iw; - for (iwi.toFirst();(iw=iwi.current());++iwi) - { - int numUrls = iw->urls().count(); - writeInt(f,numUrls); - QIntDictIterator<URLInfo> uli(iw->urls()); - URLInfo *ui; - for (uli.toFirst();(ui=uli.current());++uli) - { - writeInt(f,urlOffsets[ui->urlIdx]); - writeInt(f,ui->freq); - } - } - } - } - // write urls - QIntDictIterator<URL> udi(m_urls); - URL *url; - for (udi.toFirst();(url=udi.current());++udi) - { - writeString(f,url->name); - writeString(f,url->url); - } - } - - delete urlOffsets; - delete wordStatOffsets; -} - |