diff options
Diffstat (limited to 'src/3rdparty/clucene/src/CLucene/search')
73 files changed, 10893 insertions, 0 deletions
diff --git a/src/3rdparty/clucene/src/CLucene/search/BooleanClause.h b/src/3rdparty/clucene/src/CLucene/search/BooleanClause.h new file mode 100644 index 0000000..b89cb31 --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/BooleanClause.h @@ -0,0 +1,90 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. +------------------------------------------------------------------------------*/ +#ifndef _lucene_search_BooleanClause_ +#define _lucene_search_BooleanClause_ + +#if defined(_LUCENE_PRAGMA_ONCE) +# pragma once +#endif +#include "SearchHeader.h" + +CL_NS_DEF(search) + // A clause in a BooleanQuery. + class BooleanClause:LUCENE_BASE { + public: + class Compare:public CL_NS_STD(binary_function)<const BooleanClause*,const BooleanClause*,bool> + { + public: + bool operator()( const BooleanClause* val1, const BooleanClause* val2 ) const{ + return val1->equals(val2); + } + }; + + // The query whose matching documents are combined by the boolean query. + Query* query; + + int32_t getClauseCount(); + + // If true, documents documents which <i>do not</i> + // match this sub-query will <i>not</i> match the boolean query. + bool required; + + // If true, documents documents which <i>do</i> + // match this sub-query will <i>not</i> match the boolean query. + bool prohibited; + + bool deleteQuery; + + // Constructs a BooleanClause with query <code>q</code>, required + // <code>r</code> and prohibited <code>p</code>. 
+ BooleanClause(Query* q, const bool DeleteQuery,const bool req, const bool p): + query(q), + required(req), + prohibited(p), + deleteQuery(DeleteQuery) + { + } + + BooleanClause(const BooleanClause& clone): +#if defined(LUCENE_ENABLE_MEMLEAKTRACKING) +#elif defined(LUCENE_ENABLE_REFCOUNT) +#else + LuceneVoidBase(), +#endif + query(clone.query->clone()), + required(clone.required), + prohibited(clone.prohibited), + deleteQuery(true) + { + } + + BooleanClause* clone() const{ + BooleanClause* ret = _CLNEW BooleanClause(*this); + return ret; + } + + ~BooleanClause(){ + if ( deleteQuery ) + _CLDELETE( query ); + } + + /** Returns true iff <code>o</code> is equal to this. */ + bool equals(const BooleanClause* other) const { + return this->query->equals(other->query) + && (this->required == other->required) + && (this->prohibited == other->prohibited); + } + + size_t hashCode() const{ + return query->hashCode() ^ (this->required?1:0) ^ (this->prohibited?2:0); + } + }; + + +CL_NS_END +#endif + diff --git a/src/3rdparty/clucene/src/CLucene/search/BooleanQuery.cpp b/src/3rdparty/clucene/src/CLucene/search/BooleanQuery.cpp new file mode 100644 index 0000000..3fd36d8 --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/BooleanQuery.cpp @@ -0,0 +1,363 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. 
+------------------------------------------------------------------------------*/ +#include "CLucene/StdHeader.h" +#include "BooleanQuery.h" + +#include "BooleanClause.h" +#include "CLucene/index/IndexReader.h" +#include "CLucene/util/StringBuffer.h" +#include "CLucene/util/Arrays.h" +#include "SearchHeader.h" +#include "BooleanScorer.h" +#include "Scorer.h" + +CL_NS_USE(index) +CL_NS_USE(util) +CL_NS_DEF(search) + + BooleanQuery::BooleanQuery(): + clauses(true) + { + } + + BooleanQuery::BooleanQuery(const BooleanQuery& clone): + Query(clone) + { + for ( uint32_t i=0;i<clone.clauses.size();i++ ){ + BooleanClause* clause = clone.clauses[i]->clone(); + clause->deleteQuery=true; + add(clause); + } + } + + BooleanQuery::~BooleanQuery(){ + clauses.clear(); + } + + size_t BooleanQuery::hashCode() const { + //todo: do cachedHashCode, and invalidate on add/remove clause + size_t ret = 0; + for (uint32_t i = 0 ; i < clauses.size(); i++) { + BooleanClause* c = clauses[i]; + ret = 31 * ret + c->hashCode(); + } + ret = ret ^ Similarity::floatToByte(getBoost()); + return ret; + } + + const TCHAR* BooleanQuery::getQueryName() const{ + return getClassName(); + } + const TCHAR* BooleanQuery::getClassName(){ + return _T("BooleanQuery"); + } + + /** + * Default value is 1024. Use <code>org.apache.lucene.maxClauseCount</code> + * system property to override. 
+     */
+    size_t BooleanQuery::maxClauseCount = LUCENE_BOOLEANQUERY_MAXCLAUSECOUNT;
+
+    /** Returns the clause limit shared by all BooleanQuery instances. */
+    size_t BooleanQuery::getMaxClauseCount(){
+        return maxClauseCount;
+    }
+
+    /** Replaces the clause limit; the limit is a static member, so every BooleanQuery is affected. */
+    void BooleanQuery::setMaxClauseCount(size_t maxClauseCount){
+        BooleanQuery::maxClauseCount = maxClauseCount;
+    }
+
+    /**
+     * Wraps <code>query</code> in a new BooleanClause and appends it.
+     * If appending throws (see add(BooleanClause*)), the freshly created
+     * clause is deleted before the exception is rethrown.
+     */
+    void BooleanQuery::add(Query* query, const bool deleteQuery, const bool required, const bool prohibited) {
+        BooleanClause* bc = _CLNEW BooleanClause(query,deleteQuery,required, prohibited);
+        try{
+            add(bc);
+        }catch(...){
+            _CLDELETE(bc);
+            throw;
+        }
+    }
+
+    /**
+     * Appends <code>clause</code> to this query.
+     * Throws CL_ERR_TooManyClauses when the query already holds
+     * getMaxClauseCount() clauses.
+     */
+    void BooleanQuery::add(BooleanClause* clause) {
+        if (clauses.size() >= getMaxClauseCount())
+            _CLTHROWA(CL_ERR_TooManyClauses,"Too Many Clauses");
+
+        clauses.push_back(clause);
+    }
+
+
+    /** Returns the number of clauses currently held. */
+    size_t BooleanQuery::getClauseCount() const {
+        // The function returns size_t, so no narrowing cast through int32_t.
+        return clauses.size();
+    }
+
+    /**
+     * Builds a printable form of the query. Each clause is prefixed with
+     * "-" (prohibited) or "+" (required), nested BooleanQuery clauses are
+     * wrapped in parentheses, and clauses are separated by a single space.
+     * When the boost is not 1.0 the whole clause list is wrapped once as
+     * "(...)^boost".
+     * The result is a newly built buffer from StringBuffer::toString();
+     * this function releases its sub-queries' strings with _CLDELETE_CARRAY.
+     */
+    TCHAR* BooleanQuery::toString(const TCHAR* field) const{
+        StringBuffer buffer;
+        const bool boosted = (getBoost() != 1.0);
+        if (boosted) {
+            buffer.append(_T("("));
+        }
+
+        const size_t count = clauses.size();
+        for (uint32_t i = 0 ; i < count; i++) {
+            BooleanClause* c = clauses[i];
+            if (c->prohibited)
+                buffer.append(_T("-"));
+            else if (c->required)
+                buffer.append(_T("+"));
+
+            // wrap sub-bools in parens
+            const bool nested = c->query->instanceOf(BooleanQuery::getClassName());
+            if (nested)
+                buffer.append(_T("("));
+
+            TCHAR* buf = c->query->toString(field);
+            buffer.append(buf);
+            _CLDELETE_CARRAY( buf );
+
+            if (nested)
+                buffer.append(_T(")"));
+
+            if (i != count-1)
+                buffer.append(_T(" "));
+        }
+
+        // Close the boost wrapper exactly once, after all clauses. This used
+        // to sit inside the loop, which repeated ")^boost" after every clause
+        // and omitted it entirely for an empty query.
+        if (boosted) {
+            buffer.append(_T(")^"));
+            buffer.appendFloat(getBoost(),1);
+        }
+        return buffer.toString();
+    }
+
+
+
+
+    /**
+     * Deprecated: returns a newly allocated, NULL-terminated array holding
+     * pointers to this query's clauses. The array has getClauseCount()+1
+     * slots; the clauses themselves are not copied.
+     */
+    BooleanClause** BooleanQuery::getClauses() const
+    {
+        CND_MESSAGE(false, "Warning: BooleanQuery::getClauses() is deprecated")
+        BooleanClause** ret = _CL_NEWARRAY(BooleanClause*, clauses.size()+1);
+        getClauses(ret);
+        // The extra slot was allocated for a terminator but never written.
+        ret[clauses.size()] = NULL;
+        return ret;
+    }
+
+    void BooleanQuery::getClauses(BooleanClause** ret) const
+    {
+        size_t 
size=clauses.size(); + for ( uint32_t i=0;i<size;i++ ) + ret[i] = clauses[i]; + } + Query* BooleanQuery::rewrite(IndexReader* reader) { + if (clauses.size() == 1) { // optimize 1-clause queries + BooleanClause* c = clauses[0]; + if (!c->prohibited) { // just return clause + Query* query = c->query->rewrite(reader); // rewrite first + + //if the query doesn't actually get re-written, + //then return a clone (because the BooleanQuery + //will register different to the returned query. + if ( query == c->query ) + query = query->clone(); + + if (getBoost() != 1.0f) { // incorporate boost + query->setBoost(getBoost() * query->getBoost()); + } + + return query; + } + } + + BooleanQuery* clone = NULL; // recursively rewrite + for (uint32_t i = 0 ; i < clauses.size(); i++) { + BooleanClause* c = clauses[i]; + Query* query = c->query->rewrite(reader); + if (query != c->query) { // clause rewrote: must clone + if (clone == NULL) + clone = (BooleanQuery*)this->clone(); + //todo: check if delete query should be on... + //in fact we should try and get rid of these + //for compatibility sake + clone->clauses.set (i, _CLNEW BooleanClause(query, true, c->required, c->prohibited)); + } + } + if (clone != NULL) { + return clone; // some clauses rewrote + } else + return this; // no clauses rewrote + } + + + Query* BooleanQuery::clone() const{ + BooleanQuery* clone = _CLNEW BooleanQuery(*this); + return clone; + } + + /** Returns true iff <code>o</code> is equal to this. 
*/
+    bool BooleanQuery::equals(Query* o)const {
+        if (!(o->instanceOf(BooleanQuery::getClassName())))
+            return false;
+        const BooleanQuery* other = (BooleanQuery*)o;
+
+        // Boosts must match before the clause lists are compared pairwise
+        // through BooleanClause::Compare.
+        bool ret = (this->getBoost() == other->getBoost());
+        if ( ret ){
+            CLListEquals<BooleanClause,BooleanClause::Compare, const ClausesType, const ClausesType> comp;
+            ret = comp.equals(&this->clauses,&other->clauses);
+        }
+        return ret;
+    }
+
+    /** Returns the boost of the query this weight was created for. */
+    qreal BooleanQuery::BooleanWeight::getValue() { return parentQuery->getBoost(); }
+    /** Returns the query this weight was created for. */
+    Query* BooleanQuery::BooleanWeight::getQuery() { return (Query*)parentQuery; }
+
+
+
+
+
+    /**
+     * Stores the searcher, the parent query and a pointer to the parent's
+     * clause list (not a copy), then creates one Weight per clause, in
+     * clause order, so weights[i] corresponds to (*clauses)[i].
+     */
+    BooleanQuery::BooleanWeight::BooleanWeight(Searcher* searcher,
+        CLVector<BooleanClause*,Deletor::Object<BooleanClause> >* clauses, BooleanQuery* parentQuery)
+    {
+        this->searcher = searcher;
+        this->parentQuery = parentQuery;
+        this->clauses = clauses;
+        for (uint32_t i = 0 ; i < clauses->size(); i++) {
+            weights.push_back((*clauses)[i]->query->_createWeight(searcher));
+        }
+    }
+    BooleanQuery::BooleanWeight::~BooleanWeight(){
+        this->weights.clear();
+    }
+
+    /**
+     * Sums sumOfSquaredWeights() over all non-prohibited clauses and
+     * scales the total by the square of the parent query's boost.
+     */
+    qreal BooleanQuery::BooleanWeight::sumOfSquaredWeights() {
+        qreal sum = 0.0f;
+        for (uint32_t i = 0 ; i < weights.size(); i++) {
+            BooleanClause* c = (*clauses)[i];
+            Weight* w = weights[i];
+            if (!c->prohibited)
+                sum += w->sumOfSquaredWeights();         // sum sub weights
+        }
+        sum *= parentQuery->getBoost() * parentQuery->getBoost();   // boost each sub-weight
+        return sum ;
+    }
+
+    /**
+     * Multiplies <code>norm</code> by the parent query's boost and passes
+     * the result to every non-prohibited clause weight.
+     */
+    void BooleanQuery::BooleanWeight::normalize(qreal norm) {
+        norm *= parentQuery->getBoost();         // incorporate boost
+        for (uint32_t i = 0 ; i < weights.size(); i++) {
+            BooleanClause* c = (*clauses)[i];
+            Weight* w = weights[i];
+            if (!c->prohibited)
+                w->normalize(norm);
+        }
+    }
+
+    /**
+     * Builds the scorer for this weight against <code>reader</code>.
+     * Returns a ConjunctionScorer when every clause is required and none is
+     * itself a BooleanQuery, otherwise a BooleanScorer. Returns NULL when a
+     * required clause produces no scorer; in that case the partially built
+     * scorer is released before returning.
+     */
+    Scorer* BooleanQuery::BooleanWeight::scorer(IndexReader* reader){
+        // First see if the (faster) ConjunctionScorer will work.  This can be
+        // used when all clauses are required.  Also, at this point a
+        // BooleanScorer cannot be embedded in a ConjunctionScorer, as the hits
+        // from a BooleanScorer are not always sorted by document number (sigh)
+        // and hence BooleanScorer cannot implement skipTo() correctly, which is
+        // required by ConjunctionScorer.
+        bool allRequired = true;
+        bool noneBoolean = true;
+        { //msvc6 scope fix
+            for (uint32_t i = 0 ; i < weights.size(); i++) {
+                BooleanClause* c = (*clauses)[i];
+                if (!c->required)
+                    allRequired = false;
+                if (c->query->instanceOf(BooleanQuery::getClassName()))
+                    noneBoolean = false;
+            }
+        }
+
+        if (allRequired && noneBoolean) {         // ConjunctionScorer is okay
+            ConjunctionScorer* result =
+                _CLNEW ConjunctionScorer(parentQuery->getSimilarity(searcher));
+            for (uint32_t i = 0 ; i < weights.size(); i++) {
+                Weight* w = weights[i];
+                Scorer* subScorer = w->scorer(reader);
+                if (subScorer == NULL) {
+                    // Every clause is required here, so nothing can match.
+                    // Free the half-built scorer instead of leaking it.
+                    _CLDELETE(result);
+                    return NULL;
+                }
+                result->add(subScorer);
+            }
+            return result;
+        }
+
+        // Use good-old BooleanScorer instead.
+        BooleanScorer* result = _CLNEW BooleanScorer(parentQuery->getSimilarity(searcher));
+
+        { //msvc6 scope fix
+            for (uint32_t i = 0 ; i < weights.size(); i++) {
+                BooleanClause* c = (*clauses)[i];
+                Weight* w = weights[i];
+                Scorer* subScorer = w->scorer(reader);
+                if (subScorer != NULL)
+                    result->add(subScorer, c->required, c->prohibited);
+                else if (c->required) {
+                    // A required clause cannot match: free the scorer built
+                    // so far instead of leaking it.
+                    _CLDELETE(result);
+                    return NULL;
+                }
+            }
+        }
+
+        return result;
+    }
+
+    /**
+     * Fills <code>result</code> with an explanation of how document
+     * <code>doc</code> scores against this weight.
+     * - A matching prohibited clause yields value 0 / "match prohibited".
+     * - A non-matching required clause yields value 0 / "match required".
+     * - Otherwise the matching clauses are summed; when the coord factor is
+     *   not 1 the sum is wrapped in a "product of:" node together with a
+     *   "coord(matched/max)" detail.
+     */
+    void BooleanQuery::BooleanWeight::explain(IndexReader* reader, int32_t doc, Explanation* result){
+        int32_t coord = 0;
+        int32_t maxCoord = 0;
+        qreal sum = 0.0f;
+        Explanation* sumExpl = _CLNEW Explanation;
+        // Label the wrapper now: if it is later replaced by its single
+        // detail, that detail must keep its own description rather than be
+        // relabelled "sum of:".
+        sumExpl->setDescription(_T("sum of:"));
+        for (uint32_t i = 0 ; i < weights.size(); i++) {
+            BooleanClause* c = (*clauses)[i];
+            Weight* w = weights[i];
+            Explanation* e = _CLNEW Explanation;
+            w->explain(reader, doc, e);
+            if (!c->prohibited)
+                maxCoord++;
+            if (e->getValue() > 0) {
+                if (!c->prohibited) {
+                    sumExpl->addDetail(e);
+                    sum += e->getValue();
+                    coord++;
+                    e = NULL; //prevent e from being deleted
+                } else {
+                    //we want to return something else...
+                    _CLDELETE(e);         // not attached to sumExpl, so free it here
+                    _CLDELETE(sumExpl);
+                    result->setValue(0.0f);
+                    result->setDescription(_T("match prohibited"));
+                    return;
+                }
+            } else if (c->required) {
+                _CLDELETE(e);             // not attached to sumExpl, so free it here
+                _CLDELETE(sumExpl);
+                result->setValue(0.0f);
+                // A required clause failed to match (this previously reported
+                // "match prohibited", copied from the branch above).
+                result->setDescription(_T("match required"));
+                return;
+            }
+
+            _CLDELETE(e);
+        }
+        sumExpl->setValue(sum);
+
+        if (coord == 1){                               // only one clause matched
+            Explanation* tmp = sumExpl;
+            sumExpl = sumExpl->getDetail(0)->clone();  // eliminate wrapper
+            _CLDELETE(tmp);
+        }
+
+        qreal coordFactor = parentQuery->getSimilarity(searcher)->coord(coord, maxCoord);
+        if (coordFactor == 1.0f){                      // coord is no-op
+            result->set(*sumExpl);                     // eliminate wrapper
+            _CLDELETE(sumExpl);
+        } else {
+            result->setDescription( _T("product of:"));
+            result->addDetail(sumExpl);
+
+            StringBuffer explbuf;
+            explbuf.append(_T("coord("));
+            explbuf.appendInt(coord);
+            explbuf.append(_T("/"));
+            explbuf.appendInt(maxCoord);
+            explbuf.append(_T(")"));
+            result->addDetail(_CLNEW Explanation(coordFactor, explbuf.getBuffer()));
+            result->setValue(sum*coordFactor);
+        }
+    }
+
+
+CL_NS_END
diff --git a/src/3rdparty/clucene/src/CLucene/search/BooleanQuery.h b/src/3rdparty/clucene/src/CLucene/search/BooleanQuery.h
new file mode 100644
index 0000000..27b67d1
--- /dev/null
+++ b/src/3rdparty/clucene/src/CLucene/search/BooleanQuery.h
@@ -0,0 +1,126 @@
+/*------------------------------------------------------------------------------
+* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
+*
+* Distributable under the terms of either the Apache License (Version 2.0) or
+* the GNU Lesser General Public License, as specified in the COPYING file.
+------------------------------------------------------------------------------*/ +#ifndef _lucene_search_BooleanQuery_ +#define _lucene_search_BooleanQuery_ + +#if defined(_LUCENE_PRAGMA_ONCE) +# pragma once +#endif + +#include "ConjunctionScorer.h" +#include "CLucene/index/IndexReader.h" +#include "CLucene/util/StringBuffer.h" +#include "SearchHeader.h" +#include "BooleanClause.h" +#include "BooleanScorer.h" +#include "Scorer.h" + +CL_NS_DEF(search) + + + // A Query that matches documents matching boolean combinations of other + // queries, typically {@link TermQuery}s or {@link PhraseQuery}s. + class BooleanQuery:public Query { + public: + typedef CL_NS(util)::CLVector<BooleanClause*,CL_NS(util)::Deletor::Object<BooleanClause> > ClausesType; + private: + BooleanQuery::ClausesType clauses; + static size_t maxClauseCount; + + class BooleanWeight: public Weight { + private: + Searcher* searcher; + CL_NS(util)::CLVector<Weight*,CL_NS(util)::Deletor::Object<Weight> > weights; + ClausesType* clauses; + BooleanQuery* parentQuery; + public: + BooleanWeight(Searcher* searcher, + CL_NS(util)::CLVector<BooleanClause*,CL_NS(util)::Deletor::Object<BooleanClause> >* clauses, + BooleanQuery* parentQuery); + ~BooleanWeight(); + Query* getQuery(); + qreal getValue(); + qreal sumOfSquaredWeights(); + void normalize(qreal norm); + Scorer* scorer(CL_NS(index)::IndexReader* reader); + void explain(CL_NS(index)::IndexReader* reader, int32_t doc, Explanation* ret); + };//booleanweight + + protected: + Weight* _createWeight(Searcher* searcher) { + return _CLNEW BooleanWeight(searcher,&clauses,this); + } + BooleanQuery(const BooleanQuery& clone); + public: + /** Constructs an empty boolean query. */ + BooleanQuery(); + + ~BooleanQuery(); + + const TCHAR* getQueryName() const; + static const TCHAR* getClassName(); + + /** Return the maximum number of clauses permitted, 1024 by default. 
+ * Attempts to add more than the permitted number of clauses cause {@link + * TooManyClauses} to be thrown.*/ + static size_t getMaxClauseCount(); + + /** Set the maximum number of clauses permitted. */ + static void setMaxClauseCount(size_t maxClauseCount); + + /** Adds a clause to a boolean query. Clauses may be: + * <ul> + * <li><code>required</code> which means that documents which <i>do not</i> + * match this sub-query will <i>not</i> match the boolean query; + * <li><code>prohibited</code> which means that documents which <i>do</i> + * match this sub-query will <i>not</i> match the boolean query; or + * <li>neither, in which case matched documents are neither prohibited from + * nor required to match the sub-query. However, a document must match at + * least 1 sub-query to match the boolean query. + * </ul> + * It is an error to specify a clause as both <code>required</code> and + * <code>prohibited</code>. + * + * @see #getMaxClauseCount() + */ + void add(Query* query, const bool required, const bool prohibited){ + add(query,false,required,prohibited); + } + void add(Query* query, const bool deleteQuery, const bool required, const bool prohibited); + + /** Copies the clauses of this query into the array. + * The array must be at least as long as getClauseCount() + * If you want to use the clauses, make sure you null terminate it. + */ + void getClauses(BooleanClause** clauses) const; + + ///@deprecated + _CL_DEPRECATED( getClauses(clauses) ) BooleanClause** getClauses() const; + + /** + * Give client code access to clauses.size() so we know how + * large the array returned by getClauses is. + */ + size_t getClauseCount() const; + + /** Adds a clause to a boolean query. + * @see #getMaxClauseCount() + */ + void add(BooleanClause* clause); + + Query* rewrite(CL_NS(index)::IndexReader* reader); + Query* clone() const; + bool equals(Query* o) const; + + /** Prints a user-readable version of this query. 
*/ + TCHAR* toString(const TCHAR* field) const; + /** Returns a hash code value for this object.*/ + size_t hashCode() const; + }; + +CL_NS_END +#endif diff --git a/src/3rdparty/clucene/src/CLucene/search/BooleanScorer.cpp b/src/3rdparty/clucene/src/CLucene/search/BooleanScorer.cpp new file mode 100644 index 0000000..ae7ee40 --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/BooleanScorer.cpp @@ -0,0 +1,248 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. +------------------------------------------------------------------------------*/ +#include "CLucene/StdHeader.h" +#include "BooleanScorer.h" + +#include "Scorer.h" +#include "Similarity.h" + +CL_NS_USE(util) +CL_NS_DEF(search) + + BooleanScorer::BooleanScorer(Similarity* similarity): + Scorer(similarity), + scorers(NULL), + maxCoord (1), + nextMask (1), + end(0), + current(NULL), + requiredMask (0), + prohibitedMask (0), + coordFactors (NULL) + { + bucketTable = _CLNEW BucketTable(this); + } + + BooleanScorer::~BooleanScorer(){ + //Func - Destructor + //Pre - true + //Post - The instance has been destroyed + + _CLDELETE(bucketTable); + _CLDELETE_ARRAY(coordFactors); + _CLDELETE(scorers); + } + + + bool BooleanScorer::next() { + bool more; + do { + while (bucketTable->first != NULL) { // more queued + current = bucketTable->first; + bucketTable->first = current->next; // pop the queue + + // check prohibited & required + if ((current->bits & prohibitedMask) == 0 && + (current->bits & requiredMask) == requiredMask) { + return true; + } + } + + // refill the queue + more = false; + end += BooleanScorer::BucketTable_SIZE; + for (SubScorer* sub = scorers; sub != NULL; sub = sub->next) { + Scorer* scorer = sub->scorer; + int32_t doc; + while (!sub->done && 
(doc=scorer->doc()) < end) {
+                    sub->collector->collect(doc, scorer->score());
+                    sub->done = !scorer->next();
+                }
+                if (!sub->done) {
+                    more = true;
+                }
+            }
+        } while (bucketTable->first != NULL || more);
+
+        return false;
+    }
+
+    /**
+     * Returns the current bucket's accumulated score multiplied by the
+     * coord factor for the number of clauses that contributed to it.
+     * The coord factor table is built on first use.
+     */
+    qreal BooleanScorer::score(){
+        if (coordFactors == NULL)
+            computeCoordFactors();
+        return current->score * coordFactors[current->coord];
+    }
+
+    /** Not supported: always throws CL_ERR_UnsupportedOperation. */
+    bool BooleanScorer::skipTo(int32_t target) {
+        _CLTHROWA(CL_ERR_UnsupportedOperation,"UnsupportedOperationException: BooleanScorer::skipTo");
+    }
+
+    /** Not supported: always throws CL_ERR_UnsupportedOperation. */
+    void BooleanScorer::explain(int32_t doc, Explanation* ret) {
+        _CLTHROWA(CL_ERR_UnsupportedOperation,"UnsupportedOperationException: BooleanScorer::explain");
+    }
+
+    /**
+     * Returns "boolean(" followed by each sub-scorer's toString() and a
+     * space, then ")". The result is a newly built buffer from
+     * StringBuffer::toString().
+     */
+    TCHAR* BooleanScorer::toString() {
+        CL_NS(util)::StringBuffer buffer;
+        buffer.append(_T("boolean("));
+        for (SubScorer* sub = scorers; sub != NULL; sub = sub->next) {
+            // Scorer::toString() hands back a heap-allocated string (as this
+            // function itself does); append() copies it, so it must be freed
+            // here or it leaks once per sub-scorer.
+            TCHAR* subText = sub->scorer->toString();
+            buffer.append(subText);
+            _CLDELETE_CARRAY(subText);
+            buffer.append(_T(" "));
+        }
+        buffer.appendChar(')');
+        return buffer.toString();
+    }
+
+    /**
+     * Registers a sub-scorer. Required and prohibited scorers each receive
+     * their own bit from nextMask, recorded in requiredMask or
+     * prohibitedMask; optional scorers use mask 0. Every non-prohibited
+     * scorer increases maxCoord. Throws CL_ERR_IndexOutOfBounds once
+     * nextMask has been shifted down to 0, i.e. no bits are left.
+     */
+    void BooleanScorer::add(Scorer* scorer, const bool required, const bool prohibited) {
+        int32_t mask = 0;
+        if (required || prohibited) {
+            if (nextMask == 0)
+                _CLTHROWA(CL_ERR_IndexOutOfBounds, "More than 32 required/prohibited clauses in query.");
+            mask = nextMask;
+            nextMask = ( nextMask << 1 );
+        } else
+            mask = 0;
+
+        if (!prohibited)
+            maxCoord++;
+
+        if (prohibited)
+            prohibitedMask |= mask;      // update prohibited mask
+        else if (required)
+            requiredMask |= mask;        // update required mask
+
+        //scorer, HitCollector, and scorers is delete in the SubScorer
+        scorers = _CLNEW SubScorer(scorer, required, prohibited,
+            bucketTable->newCollector(mask), scorers);
+    }
+
+    /**
+     * Allocates coordFactors with maxCoord entries and fills entry i with
+     * the similarity's coord(i, maxCoord-1).
+     */
+    void BooleanScorer::computeCoordFactors(){
+        coordFactors = _CL_NEWARRAY(qreal,maxCoord);
+        for (int32_t i = 0; i < maxCoord; i++)
+            coordFactors[i] = getSimilarity()->coord(i, maxCoord-1);
+    }
+
+    /*void BooleanScorer::score(HitCollector* results, const int32_t maxDoc) {
+        if (coordFactors == NULL)
+            
computeCoordFactors(); + + while (currentDoc < maxDoc) { + currentDoc = (currentDoc+BucketTable_SIZE<maxDoc?currentDoc+BucketTable_SIZE:maxDoc); + for (SubScorer* t = scorers; t != NULL; t = t->next) + t->scorer->score((t->collector), currentDoc); + bucketTable->collectHits(results); + } + }*/ + + + + + BooleanScorer::SubScorer::SubScorer(Scorer* scr, const bool r, const bool p, HitCollector* c, SubScorer* nxt): + scorer(scr), + required(r), + prohibited(p), + collector(c), + next(nxt) + { + //Func - Constructor + //Pre - scr != NULL, + // c != NULL + // nxt may or may not be NULL + //Post - The instance has been created + + CND_PRECONDITION(scr != NULL,"scr is NULL"); + CND_PRECONDITION(c != NULL,"c is NULL"); + + done = !scorer->next(); + } + + BooleanScorer::SubScorer::~SubScorer(){ + //Func - Destructor + //Pre - true + //Post - The instance has been destroyed + + for (SubScorer * ptr = next; ptr; ){ + SubScorer* next = ptr->next; + ptr->next = NULL; + _CLDELETE(ptr); + ptr = next; + } + _CLDELETE(scorer); + _CLDELETE(collector); + } + + BooleanScorer::Bucket::Bucket(): + doc(-1), + score(0.0), + bits(0), + coord(0), + next(NULL) + { + } + BooleanScorer::Bucket::~Bucket(){ + } + + + + + BooleanScorer::BucketTable::BucketTable(BooleanScorer* scr): + scorer(scr), + first(NULL) + { + buckets = _CL_NEWARRAY(Bucket,BucketTable_SIZE); + } + BooleanScorer::BucketTable::~BucketTable(){ + clear(); + _CLDELETE_ARRAY(buckets); + } + + void BooleanScorer::BucketTable::clear(){ + //delete first; + first = NULL; + } + int32_t BooleanScorer::BucketTable::size() const { return BooleanScorer::BucketTable_SIZE; } + + HitCollector* BooleanScorer::BucketTable::newCollector(const int32_t mask) { + return _CLNEW Collector(mask, this); + } + + + + + + + + + + BooleanScorer::Collector::Collector(const int32_t msk, BucketTable* bucketTbl): + bucketTable(bucketTbl), + mask(msk) + { + } + + void BooleanScorer::Collector::collect(const int32_t doc, const qreal score){ + BucketTable* table 
= bucketTable; + int32_t i = doc & (BooleanScorer::BucketTable_SIZE-1); + Bucket* bucket = &table->buckets[i]; + + if (bucket->doc != doc) { // invalid bucket + bucket->doc = doc; // set doc + bucket->score = score; // initialize score + bucket->bits = mask; // initialize mask + bucket->coord = 1; // initialize coord + + bucket->next = table->first; // push onto valid list + table->first = bucket; + } else { // valid bucket + bucket->score += score; // increment score + bucket->bits |= mask; // add bits in mask + bucket->coord++; // increment coord + } + } + + + +CL_NS_END diff --git a/src/3rdparty/clucene/src/CLucene/search/BooleanScorer.h b/src/3rdparty/clucene/src/CLucene/search/BooleanScorer.h new file mode 100644 index 0000000..2147bc5 --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/BooleanScorer.h @@ -0,0 +1,99 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. +* +* Changes are Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies). 
+------------------------------------------------------------------------------*/ +#ifndef _lucene_search_BooleanScorer_ +#define _lucene_search_BooleanScorer_ + +#if defined(_LUCENE_PRAGMA_ONCE) +# pragma once +#endif + +#include "Scorer.h" + +CL_NS_DEF(search) + +class BooleanScorer : public Scorer { +public: + class Bucket : LUCENE_BASE { + public: + int32_t doc; // tells if bucket is valid + qreal score; // incremental score + int32_t bits; // used for bool constraints + int32_t coord; // count of terms in score + Bucket* next; // next valid bucket + + Bucket(); + ~Bucket(); + }; + + class SubScorer: LUCENE_BASE { + public: + bool done; + Scorer* scorer; + bool required; + bool prohibited; + HitCollector* collector; + SubScorer* next; + SubScorer(Scorer* scr, const bool r, const bool p, HitCollector* c, SubScorer* nxt); + ~SubScorer(); + }; + + class BucketTable:LUCENE_BASE { + private: + BooleanScorer* scorer; + public: + Bucket* buckets; + Bucket* first; // head of valid list + + BucketTable(BooleanScorer* scr); + int32_t size() const; + HitCollector* newCollector(const int32_t mask); + void clear(); + ~BucketTable(); + + }; + + class Collector: public HitCollector { + private: + BucketTable* bucketTable; + int32_t mask; + public: + Collector(const int32_t mask, BucketTable* bucketTable); + + void collect(const int32_t doc, const qreal score); + }; + + SubScorer* scorers; + BucketTable* bucketTable; + + int32_t maxCoord; + int32_t nextMask; + + int32_t end; + Bucket* current; + +public: + LUCENE_STATIC_CONSTANT(int32_t,BucketTable_SIZE=1024); + int32_t requiredMask; + int32_t prohibitedMask; + qreal* coordFactors; + + BooleanScorer(Similarity* similarity); + ~BooleanScorer(); + void add(Scorer* scorer, const bool required, const bool prohibited); + int32_t doc() const { return current->doc; } + bool next(); + qreal score(); + bool skipTo(int32_t target); + void explain(int32_t doc, Explanation* ret); + TCHAR* toString(); + void computeCoordFactors(); +}; + 
+CL_NS_END +#endif diff --git a/src/3rdparty/clucene/src/CLucene/search/CachingWrapperFilter.cpp b/src/3rdparty/clucene/src/CLucene/search/CachingWrapperFilter.cpp new file mode 100644 index 0000000..694556c --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/CachingWrapperFilter.cpp @@ -0,0 +1,86 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. +------------------------------------------------------------------------------*/ +#include "CLucene/StdHeader.h" +#include "CachingWrapperFilter.h" + +CL_NS_DEF(search) +CL_NS_USE(index) +CL_NS_USE(util) + +AbstractCachingFilter::AbstractCachingFilter(): + cache(false,true) +{ +} +AbstractCachingFilter::AbstractCachingFilter(const AbstractCachingFilter& copy): + cache(false,true) +{ +} +AbstractCachingFilter::~AbstractCachingFilter(){ +} +AbstractCachingFilter::BitSetHolder::BitSetHolder(CL_NS(util)::BitSet* bits, bool deleteBs){ + this->bits = bits; + this->deleteBs = deleteBs; +} +AbstractCachingFilter::BitSetHolder::~BitSetHolder(){ + if ( deleteBs ) + _CLDELETE(bits); +} + + +BitSet* AbstractCachingFilter::bits(IndexReader* reader){ + SCOPED_LOCK_MUTEX(cache.THIS_LOCK) + BitSetHolder* cached = cache.get(reader); + if ( cached != NULL ) + return cached->bits; + BitSet* bs = doBits(reader); + BitSetHolder* bsh = _CLNEW BitSetHolder(bs, doShouldDeleteBitSet(bs)); + cache.put(reader,bsh); + return bs; +} +void AbstractCachingFilter::closeCallback(CL_NS(index)::IndexReader* reader, void*){ + SCOPED_LOCK_MUTEX(cache.THIS_LOCK) + cache.remove(reader); +} + + + + +CachingWrapperFilter::CachingWrapperFilter(Filter* filter, bool deleteFilter){ + this->filter = filter; + this->deleteFilter = deleteFilter; +} +CachingWrapperFilter::CachingWrapperFilter(const 
CachingWrapperFilter& copy): + AbstractCachingFilter(copy) +{ + this->filter = copy.filter->clone(); + this->deleteFilter = true; +} +Filter* CachingWrapperFilter::clone() const{ + return _CLNEW CachingWrapperFilter(*this); +} +TCHAR* CachingWrapperFilter::toString(){ + TCHAR* fs = filter->toString(); + int len = _tcslen(fs)+23; + TCHAR* ret = _CL_NEWARRAY(TCHAR,len); + _sntprintf(ret,len,_T("CachingWrapperFilter(%s)"),fs); + _CLDELETE_CARRAY(fs); + return ret; +} +BitSet* CachingWrapperFilter::doBits(IndexReader* reader){ + return filter->bits(reader); +} +bool CachingWrapperFilter::doShouldDeleteBitSet( CL_NS(util)::BitSet* bits ){ + return filter->shouldDeleteBitSet(bits); +} +CachingWrapperFilter::~CachingWrapperFilter(){ + if ( deleteFilter ){ + _CLDELETE(filter); + }else + filter=NULL; +} + +CL_NS_END diff --git a/src/3rdparty/clucene/src/CLucene/search/CachingWrapperFilter.h b/src/3rdparty/clucene/src/CLucene/search/CachingWrapperFilter.h new file mode 100644 index 0000000..e48a182 --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/CachingWrapperFilter.h @@ -0,0 +1,80 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. +------------------------------------------------------------------------------*/ +#ifndef _lucene_search_CachingWrapperFilter_ +#define _lucene_search_CachingWrapperFilter_ + +#include "CLucene/util/BitSet.h" +#include "CLucene/index/IndexReader.h" +#include "Filter.h" + +CL_NS_DEF(search) +/** + * Wraps another filter's result and caches it. The purpose is to allow + * filters to implement this and allow itself to be cached. Alternatively, + * use the CachingWrapperFilter to cache the filter. 
+ */ +class AbstractCachingFilter: public Filter +{ + class BitSetHolder: LUCENE_BASE{ + bool deleteBs; + public: + BitSetHolder(CL_NS(util)::BitSet* bits, bool deleteBs); + ~BitSetHolder(); + CL_NS(util)::BitSet* bits; + }; + void closeCallback(CL_NS(index)::IndexReader* reader, void* param); + typedef CL_NS(util)::CLHashMap<CL_NS(index)::IndexReader*, + BitSetHolder*, + CL_NS(util)::Compare::Void<CL_NS(index)::IndexReader>, + CL_NS(util)::Equals::Void<CL_NS(index)::IndexReader>, + CL_NS(util)::Deletor::Object<CL_NS(index)::IndexReader>, + CL_NS(util)::Deletor::Object<BitSetHolder> > CacheType; + + CacheType cache; + +protected: + AbstractCachingFilter( const AbstractCachingFilter& copy ); + virtual CL_NS(util)::BitSet* doBits( CL_NS(index)::IndexReader* reader ) = 0; + virtual bool doShouldDeleteBitSet( CL_NS(util)::BitSet* bits ){ return false; } + AbstractCachingFilter(); +public: + virtual ~AbstractCachingFilter(); + + /** Returns a BitSet with true for documents which should be permitted in + search results, and false for those that should not. */ + CL_NS(util)::BitSet* bits( CL_NS(index)::IndexReader* reader ); + + virtual Filter *clone() const = 0; + virtual TCHAR *toString() = 0; + + bool shouldDeleteBitSet( const CL_NS(util)::BitSet* bits ) const{ return false; } +}; + +/** + * Wraps another filter's result and caches it. The purpose is to allow + * filters to simply filter, and then wrap with this class to add + * caching, keeping the two concerns decoupled yet composable. 
+ */ +class CachingWrapperFilter: public AbstractCachingFilter +{ +private: + Filter* filter; + bool deleteFilter; +protected: + CachingWrapperFilter( const CachingWrapperFilter& copy ); + CL_NS(util)::BitSet* doBits( CL_NS(index)::IndexReader* reader ); + bool doShouldDeleteBitSet( CL_NS(util)::BitSet* bits ); +public: + CachingWrapperFilter( Filter* filter, bool deleteFilter=true ); + ~CachingWrapperFilter(); + + Filter *clone() const; + TCHAR *toString(); +}; + +CL_NS_END +#endif diff --git a/src/3rdparty/clucene/src/CLucene/search/ChainedFilter.cpp b/src/3rdparty/clucene/src/CLucene/search/ChainedFilter.cpp new file mode 100644 index 0000000..4b6389c --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/ChainedFilter.cpp @@ -0,0 +1,213 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. 
+------------------------------------------------------------------------------*/ + +#include <CLucene/StdHeader.h> +#include <CLucene/util/Misc.h> +#include "ChainedFilter.h" + +CL_NS_DEF(search) +CL_NS_USE(index) +CL_NS_USE(util) +CL_NS_USE(document) + + +ChainedFilter::ChainedFilter( Filter ** _filters, int _op ): + filters(_filters), + logicArray(NULL), + logic(_op) +{ +} +ChainedFilter::ChainedFilter( Filter** _filters, int* _array ): + filters(_filters), + logicArray(_array), + logic(-1) +{ +} +ChainedFilter::ChainedFilter( const ChainedFilter& copy ) : + logicArray( copy.logicArray ), + logic( copy.logic ) +{ + filters = copy.filters; +} +ChainedFilter::~ChainedFilter(void) +{ + +} + +Filter* ChainedFilter::clone() const { + return _CLNEW ChainedFilter(*this ); +} + +const TCHAR* ChainedFilter::getLogicString(int logic){ + if ( logic == ChainedFilter::OR ) + return _T("OR"); + else if ( logic == ChainedFilter::AND ) + return _T("AND"); + else if ( logic == ChainedFilter::ANDNOT ) + return _T("ANDNOT"); + else if ( logic == ChainedFilter::XOR ) + return _T("XOR"); + else if ( logic >= ChainedFilter::USER ){ + return _T("USER"); + } + return _T(""); +} + +TCHAR* ChainedFilter::toString() +{ + + Filter** filter = filters; + + StringBuffer buf(_T("ChainedFilter: [")); + int* la = logicArray; + while(*filter ) + { + if ( filter != filters ) + buf.appendChar(' '); + buf.append(getLogicString(logic==-1?*la:logic)); + buf.appendChar(' '); + + TCHAR* filterstr = (*filter)->toString(); + buf.append(filterstr); + _CLDELETE_ARRAY( filterstr ); + + filter++; + if ( logic == -1 ) + la++; + } + + buf.appendChar(']'); + + return buf.toString(); +} + + +/** Returns a BitSet with true for documents which should be permitted in +search results, and false for those that should not. 
*/ +BitSet* ChainedFilter::bits( IndexReader* reader ) +{ + if( logic != -1 ) + return bits( reader, logic ); + else if( logicArray != NULL ) + return bits( reader, logicArray ); + else + return bits( reader, DEFAULT ); +} + + +BitSet* ChainedFilter::bits( IndexReader* reader, int logic ) +{ + BitSet* bts = NULL; + + Filter** filter = filters; + + // see discussion at top of file + if( *filter ) { + BitSet* tmp = (*filter)->bits( reader ); + if ( (*filter)->shouldDeleteBitSet(tmp) ) //if we are supposed to delete this BitSet, then + bts = tmp; //we can safely call it our own + else if ( tmp == NULL ){ + int32_t len = reader->maxDoc(); + bts = _CLNEW BitSet( len ); //bitset returned null, which means match _all_ + for (int32_t i=0;i<len;i++ ) + bts->set(i); + }else{ + bts = tmp->clone(); //else it is probably cached, so we need to copy it before using it. + } + filter++; + } + else + bts = _CLNEW BitSet( reader->maxDoc() ); + + while( *filter ) { + doChain( bts, reader, logic, *filter ); + filter++; + } + + return bts; +} + + +BitSet* ChainedFilter::bits( IndexReader* reader, int* _logicArray ) +{ + BitSet* bts = NULL; + + Filter** filter = filters; + int* logic = _logicArray; + + // see discussion at top of file + if( *filter ) { + BitSet* tmp = (*filter)->bits( reader ); + if ( (*filter)->shouldDeleteBitSet(tmp) ) //if we are supposed to delete this BitSet, then + bts = tmp; //we can safely call it our own + else if ( tmp == NULL ){ + int32_t len = reader->maxDoc(); + bts = _CLNEW BitSet( len ); //bitset returned null, which means match _all_ + for (int32_t i=0;i<len;i++ ) + bts->set(i); //todo: this could mean that we can skip certain types of filters + } + else + { + bts = tmp->clone(); //else it is probably cached, so we need to copy it before using it. 
+ } + filter++; + logic++; + } + else + bts = _CLNEW BitSet( reader->maxDoc() ); + + while( *filter ) { + doChain( bts, reader, *logic, *filter ); + filter++; + logic++; + } + + return bts; +} + +void ChainedFilter::doUserChain( CL_NS(util)::BitSet* chain, CL_NS(util)::BitSet* filter, int logic ){ + _CLTHROWA(CL_ERR_Runtime,"User chain logic not implemented by superclass"); +} + +BitSet* ChainedFilter::doChain( BitSet* resultset, IndexReader* reader, int logic, Filter* filter ) +{ + BitSet* filterbits = filter->bits( reader ); + int32_t maxDoc = reader->maxDoc(); + int32_t i=0; + if ( logic >= ChainedFilter::USER ){ + doUserChain(resultset,filterbits,logic); + }else{ + switch( logic ) + { + case OR: + for( i=0; i < maxDoc; i++ ) + resultset->set( i, (resultset->get(i) || (filterbits==NULL || filterbits->get(i) ))?1:0 ); + break; + case AND: + for( i=0; i < maxDoc; i++ ) + resultset->set( i, (resultset->get(i) && (filterbits==NULL || filterbits->get(i) ))?1:0 ); + break; + case ANDNOT: + for( i=0; i < maxDoc; i++ ) + resultset->set( i, (resultset->get(i) && (filterbits==NULL || filterbits->get(i)))?0:1 ); + break; + case XOR: + for( i=0; i < maxDoc; i++ ) + resultset->set( i, resultset->get(i) ^ ((filterbits==NULL || filterbits->get(i) )?1:0) ); + break; + default: + doChain( resultset, reader, DEFAULT, filter ); + } + } + + if ( filter->shouldDeleteBitSet(filterbits) ) + _CLDELETE( filterbits ); + + return resultset; +} + +CL_NS_END diff --git a/src/3rdparty/clucene/src/CLucene/search/ChainedFilter.h b/src/3rdparty/clucene/src/CLucene/search/ChainedFilter.h new file mode 100644 index 0000000..f4d9d00 --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/ChainedFilter.h @@ -0,0 +1,86 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as 
specified in the COPYING file. +------------------------------------------------------------------------------*/ +#ifndef _lucene_search_ChainedFilter_ +#define _lucene_search_ChainedFilter_ + +#include "CLucene/index/IndexReader.h" +#include "CLucene/util/BitSet.h" +#include "CLucene/search/Filter.h" + +CL_NS_DEF(search) + +/* +Discussion - brian@unixpoet.com + +From ChainedFilter.java: + +... + +// First AND operation takes place against a completely false +// bitset and will always return zero results. Thanks to +// Daniel Armbrust for pointing this out and suggesting workaround. + +if (logic[0] == AND) +{ + result = (BitSet) chain[i].bits(reader).clone(); + ++i; +} + +... + +The observation is correct and it was buggy. The problem is that the same +issue remains for the ANDNOT logic op but with the inverse result: all bits +set to 1. The result of the other ops, i.e. OR, AND, XOR for the first filter +ends up just copying the bitset of the first filter (explicitly in the case of the AND). + +Why not do the same for the NAND? This will have the side effect of rendering the first op +in the logic array superfluous - not a big problem. + +The only "problem" is that we will return different results than the Java +Lucene code - though I prefer CLucene to be a correct implementation and only maintain +API compat rather than full 100% compat with Lucene. 
+*/ +class ChainedFilter: public Filter +{ +public: + LUCENE_STATIC_CONSTANT(int, OR = 0); //set current bit if the chain is set OR if the filter bit is set + LUCENE_STATIC_CONSTANT(int, AND = 1); //set current bit if the chain is set AND the filter bit is set + LUCENE_STATIC_CONSTANT(int, ANDNOT = 2); //set current bit if the chain is not set AND the filter bit is not set + LUCENE_STATIC_CONSTANT(int, XOR = 3); //set current bit if the chain is set OR the filter bit is set BUT not both is set + + LUCENE_STATIC_CONSTANT(int, USER = 5); //add this value to user defined value, then override doUserChain + + LUCENE_STATIC_CONSTANT(int, DEFAULT = OR); + +protected: + Filter **filters; + int *logicArray; + int logic; + + ChainedFilter( const ChainedFilter& copy ); + CL_NS(util)::BitSet* bits( CL_NS(index)::IndexReader* reader, int logic ); + CL_NS(util)::BitSet* bits( CL_NS(index)::IndexReader* reader, int* logicArray ); + CL_NS(util)::BitSet* doChain( CL_NS(util)::BitSet* result, CL_NS(index)::IndexReader* reader, int logic, Filter* filter ); + + virtual void doUserChain( CL_NS(util)::BitSet* chain, CL_NS(util)::BitSet* filter, int logic ); + virtual const TCHAR* getLogicString(int logic); +public: + ChainedFilter( Filter** filters, int op = DEFAULT ); + ChainedFilter( Filter** filters, int* _array ); + virtual ~ChainedFilter(); + + /** Returns a BitSet with true for documents which should be permitted in + search results, and false for those that should not. 
*/ + CL_NS(util)::BitSet* bits( CL_NS(index)::IndexReader* reader ); + + virtual Filter* clone() const; + + TCHAR* toString(); +}; + +CL_NS_END +#endif diff --git a/src/3rdparty/clucene/src/CLucene/search/Compare.h b/src/3rdparty/clucene/src/CLucene/search/Compare.h new file mode 100644 index 0000000..ab38b17 --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/Compare.h @@ -0,0 +1,161 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. +------------------------------------------------------------------------------*/ +#ifndef _lucene_search_Compare_ +#define _lucene_search_Compare_ + +#if defined(_LUCENE_PRAGMA_ONCE) +# pragma once +#endif + +#include "FieldSortedHitQueue.h" + +CL_NS_DEF(search) + + +class ScoreDocComparators:LUCENE_BASE { +protected: + ScoreDocComparators(){} +public: + ~ScoreDocComparators(){ + } + + class Relevance:public ScoreDocComparator { + public: + int32_t compare (struct ScoreDoc* i, struct ScoreDoc* j) { + if (i->score > j->score) return -1; + if (i->score < j->score) return 1; + return 0; + } + CL_NS(util)::Comparable* sortValue (struct ScoreDoc* i) { + return _CLNEW CL_NS(util)::Compare::Float (i->score); + } + int32_t sortType() { + return SortField::DOCSCORE; + } + }; + + class IndexOrder:public ScoreDocComparator{ + public: + IndexOrder(): + ScoreDocComparator() + { + + } + int32_t compare (struct ScoreDoc* i, struct ScoreDoc* j) { + if (i->doc < j->doc) return -1; + if (i->doc > j->doc) return 1; + return 0; + } + CL_NS(util)::Comparable* sortValue (struct ScoreDoc* i) { + return _CLNEW CL_NS(util)::Compare::Int32(i->doc); + } + int32_t sortType() { + return SortField::DOC; + } + }; + + + class String: public ScoreDocComparator { + FieldCache::StringIndex* index; +#ifdef 
_CL__CND_DEBUG + int32_t length; +#endif + public: + String(FieldCache::StringIndex* index, int32_t len) + { +#ifdef _CL__CND_DEBUG + this->length = len; +#endif + this->index = index; + } + + int32_t compare (struct ScoreDoc* i, struct ScoreDoc* j) { + CND_PRECONDITION(i->doc<length, "i->doc>=length") + CND_PRECONDITION(j->doc<length, "j->doc>=length") + if (index->order[i->doc] < index->order[j->doc]) return -1; + if (index->order[i->doc] > index->order[j->doc]) return 1; + return 0; + } + + CL_NS(util)::Comparable* sortValue (struct ScoreDoc* i) { + return _CLNEW CL_NS(util)::Compare::TChar(index->lookup[index->order[i->doc]]); + } + + int32_t sortType() { + return SortField::STRING; + } + }; + + class Int32:public ScoreDocComparator{ + int32_t* fieldOrder; +#ifdef _CL__CND_DEBUG + int32_t length; +#endif + public: + Int32(int32_t* fieldOrder, int32_t len) + { + this->fieldOrder = fieldOrder; +#ifdef _CL__CND_DEBUG + this->length = len; +#endif + } + + + int32_t compare (struct ScoreDoc* i, struct ScoreDoc* j) { + CND_PRECONDITION(i->doc<length, "i->doc>=length") + CND_PRECONDITION(j->doc<length, "j->doc>=length") + if (fieldOrder[i->doc] < fieldOrder[j->doc]) return -1; + if (fieldOrder[i->doc] > fieldOrder[j->doc]) return 1; + return 0; + } + + CL_NS(util)::Comparable* sortValue (struct ScoreDoc* i) { + CND_PRECONDITION(i->doc<length, "i->doc>=length") + return _CLNEW CL_NS(util)::Compare::Int32(fieldOrder[i->doc]); + } + + int32_t sortType() { + return SortField::INT; + } + }; + + class Float:public ScoreDocComparator { + qreal* fieldOrder; +#ifdef _CL__CND_DEBUG + int32_t length; +#endif + public: + Float(qreal* fieldOrder, int32_t len) + { + this->fieldOrder = fieldOrder; +#ifdef _CL__CND_DEBUG + this->length = len; +#endif + } + + int32_t compare (struct ScoreDoc* i, struct ScoreDoc* j) { + CND_PRECONDITION(i->doc<length, "i->doc>=length") + CND_PRECONDITION(j->doc<length, "j->doc>=length") + if (fieldOrder[i->doc] < fieldOrder[j->doc]) return -1; + if 
(fieldOrder[i->doc] > fieldOrder[j->doc]) return 1; + return 0; + } + + CL_NS(util)::Comparable* sortValue (struct ScoreDoc* i) { + CND_PRECONDITION(i->doc<length, "i->doc>=length") + return _CLNEW CL_NS(util)::Compare::Float(fieldOrder[i->doc]); + } + + int32_t sortType() { + return SortField::FLOAT; + } + }; +}; + + +CL_NS_END +#endif diff --git a/src/3rdparty/clucene/src/CLucene/search/ConjunctionScorer.cpp b/src/3rdparty/clucene/src/CLucene/search/ConjunctionScorer.cpp new file mode 100644 index 0000000..9b7846f --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/ConjunctionScorer.cpp @@ -0,0 +1,144 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. +------------------------------------------------------------------------------*/ +#include "CLucene/StdHeader.h" +#include "ConjunctionScorer.h" +#include "CLucene/util/Arrays.h" + +CL_NS_USE(index) +CL_NS_USE(util) +CL_NS_DEF(search) + + Scorer* ConjunctionScorer::first() const{ + if ( scorers.end() == scorers.begin() ) + return NULL; + + return *scorers.begin(); + } //get First + Scorer* ConjunctionScorer::last() { + if ( scorers.end() == scorers.begin() ) + return NULL; + + CL_NS_STD(list)<Scorer*>::iterator i = scorers.end(); + --i; + return *i; + } //get Last + + class _ScorerSorter:public CL_NS(util)::Arrays::_Arrays<Scorer*>{ + public: + bool equals(Scorer* o1,Scorer* o2) const{ + return o1->doc() == o2->doc(); + } + int32_t compare(Scorer* o1,Scorer* o2) const{ + return o1->doc() - o2->doc(); + } + }; + _ScorerSorter __ScorerSorter; + + void ConjunctionScorer::sortScorers() { + // move scorers to an array + int32_t size = scorers.size(); + Scorer** array = _CL_NEWARRAY(Scorer*,size+1); + scorers.toArray(array); + scorers.clear(); // empty the list 
+ + // note that this comparator is not consistent with equals! + __ScorerSorter.sort(array,size,0,size); + + for (int32_t i = 0; i<size; i++) { + scorers.push_back(array[i]); // re-build list, now sorted + } + _CLDELETE_ARRAY(array); + } + + bool ConjunctionScorer::doNext() { + while (more && first()->doc() < last()->doc()) { // find doc w/ all clauses + more = first()->skipTo(last()->doc()); // skip first upto last + Scorer* scorer = *scorers.begin(); + scorers.delete_front(); + scorers.push_back(scorer); // move first to last + } + return more; // found a doc with all clauses + } + + + void ConjunctionScorer::init() { + more = scorers.size() > 0; + + // compute coord factor + coord = getSimilarity()->coord(scorers.size(), scorers.size()); + + // move each scorer to its first entry + CL_NS_STD(list)<Scorer*>::iterator i = scorers.begin(); + while (more && i!=scorers.end()) { + more = ((Scorer*)*i)->next(); + ++i; + } + + if (more) + sortScorers(); // initial sort of list + + firstTime = false; + } + + ConjunctionScorer::ConjunctionScorer(Similarity* similarity): + Scorer(similarity), + scorers(false), + firstTime(true), + more(true), + coord(0.0) + { + } + ConjunctionScorer::~ConjunctionScorer(){ + scorers.setDoDelete(true); + } + + TCHAR *CL_NS(search)::Scorer::toString(void){ + return STRDUP_TtoT(_T("ConjunctionScorer")); + } + + + void ConjunctionScorer::add(Scorer* scorer){ + scorers.push_back(scorer); + } + + + int32_t ConjunctionScorer::doc() const{ return first()->doc(); } + + bool ConjunctionScorer::next() { + if (firstTime) { + init(); + } else if (more) { + more = last()->next(); // trigger further scanning + } + return doNext(); + } + + bool ConjunctionScorer::skipTo(int32_t target) { + CL_NS_STD(list)<Scorer*>::iterator i = scorers.begin(); + while (more && i!=scorers.end()) { + more = ((Scorer*)*i)->skipTo(target); + ++i; + } + if (more) + sortScorers(); // re-sort scorers + return doNext(); + } + + qreal ConjunctionScorer::score(){ + qreal score = 
0.0f; // sum scores + CL_NS_STD(list)<Scorer*>::const_iterator i = scorers.begin(); + while (i!=scorers.end()){ + score += (*i)->score(); + ++i; + } + score *= coord; + return score; + } + + + +CL_NS_END diff --git a/src/3rdparty/clucene/src/CLucene/search/ConjunctionScorer.h b/src/3rdparty/clucene/src/CLucene/search/ConjunctionScorer.h new file mode 100644 index 0000000..4b68072 --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/ConjunctionScorer.h @@ -0,0 +1,50 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. +------------------------------------------------------------------------------*/ +#ifndef _lucene_search_ConjunctionScorer_ +#define _lucene_search_ConjunctionScorer_ + +#if defined(_LUCENE_PRAGMA_ONCE) +# pragma once +#endif +#include "Scorer.h" +#include "Similarity.h" + +CL_NS_DEF(search) + +/** Scorer for conjunctions, sets of queries, all of which are required. 
*/ +class ConjunctionScorer: public Scorer { +private: + CL_NS(util)::CLLinkedList<Scorer*,CL_NS(util)::Deletor::Object<Scorer> > scorers; + bool firstTime; + bool more; + qreal coord; + + Scorer* first() const; + Scorer* last(); + void sortScorers(); + bool doNext(); + void init(); +public: + ConjunctionScorer(Similarity* similarity); + virtual ~ConjunctionScorer(); + TCHAR* toString(void){ + return STRDUP_TtoT(_T("ConjunctionScorer")); + } + void add(Scorer* scorer); + int32_t doc() const; + bool next(); + bool skipTo(int32_t target); + qreal score(); + virtual void explain(int32_t doc, Explanation* ret) { + _CLTHROWA(CL_ERR_UnsupportedOperation,"UnsupportedOperationException: ConjunctionScorer::explain"); + } + + +}; + +CL_NS_END +#endif diff --git a/src/3rdparty/clucene/src/CLucene/search/DateFilter.cpp b/src/3rdparty/clucene/src/CLucene/search/DateFilter.cpp new file mode 100644 index 0000000..9258582 --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/DateFilter.cpp @@ -0,0 +1,93 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. +------------------------------------------------------------------------------*/ +#include "CLucene/StdHeader.h" +#include "DateFilter.h" + +CL_NS_USE(index) +CL_NS_USE(util) +CL_NS_USE(document) +CL_NS_DEF(search) + + DateFilter::~DateFilter(){ + _CLDECDELETE( start ); + _CLDECDELETE( end ); + } + + DateFilter::DateFilter(const DateFilter& copy): + start( _CL_POINTER(copy.start) ), + end ( _CL_POINTER(copy.end) ) + { + } + + /** Constructs a filter for field <code>f</code> matching times between + <code>from</code> and <code>to</code>. 
*/ + DateFilter::DateFilter(const TCHAR* f, int64_t from, int64_t to) + { + TCHAR* tmp = DateField::timeToString(from); + start = _CLNEW Term(f, tmp); + _CLDELETE_CARRAY(tmp); + + tmp = DateField::timeToString(to); + end = _CLNEW Term(start, tmp); + _CLDELETE_CARRAY(tmp); + } + + /** Constructs a filter for field <code>f</code> matching times before + <code>time</code>. */ + DateFilter* DateFilter::Before(const TCHAR* field, int64_t time) { + return _CLNEW DateFilter(field, 0,time); + } + + /** Constructs a filter for field <code>f</code> matching times after + <code>time</code>. */ + DateFilter* DateFilter::After(const TCHAR* field, int64_t time) { + return _CLNEW DateFilter(field,time, DATEFIELD_DATE_MAX ); + } + + /** Returns a BitSet with true for documents which should be permitted in + search results, and false for those that should not. */ + BitSet* DateFilter::bits(IndexReader* reader) { + BitSet* bts = _CLNEW BitSet(reader->maxDoc()); + + TermEnum* enumerator = reader->terms(start); + if (enumerator->term(false) == NULL){ + _CLDELETE(enumerator); + return bts; + } + TermDocs* termDocs = reader->termDocs(); + + try { + while (enumerator->term(false)->compareTo(end) <= 0) { + termDocs->seek(enumerator->term(false)); + while (termDocs->next()) { + bts->set(termDocs->doc()); + } + if (!enumerator->next()) { + break; + } + } + } _CLFINALLY ( + termDocs->close(); + _CLDELETE(termDocs); + enumerator->close(); + _CLDELETE(enumerator); + ); + return bts; + } + + Filter* DateFilter::clone() const{ + return _CLNEW DateFilter(*this); + } + + TCHAR* DateFilter::toString(){ + size_t len = _tcslen(start->field()) + start->textLength() + end->textLength() + 8; + TCHAR* ret = _CL_NEWARRAY(TCHAR,len); + ret[0]=0; + _sntprintf(ret,len,_T("%s: [%s-%s]"), start->field(),start->text(),end->text()); + return ret; + } +CL_NS_END diff --git a/src/3rdparty/clucene/src/CLucene/search/DateFilter.h b/src/3rdparty/clucene/src/CLucene/search/DateFilter.h new file mode 100644 index 
0000000..b37272b --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/DateFilter.h @@ -0,0 +1,59 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. +------------------------------------------------------------------------------*/ +#ifndef _lucene_search_DateFilter_ +#define _lucene_search_DateFilter_ + +#if defined(_LUCENE_PRAGMA_ONCE) +# pragma once +#endif + +#include "CLucene/document/DateField.h" +#include "CLucene/index/Term.h" +#include "CLucene/index/Terms.h" +#include "CLucene/index/IndexReader.h" +#include "CLucene/util/BitSet.h" +#include "Filter.h" + +CL_NS_DEF(search) + /** + * A Filter that restricts search results to a range of time. + * + * <p>For this to work, documents must have been indexed with a + * {@link DateField}. + */ + class DateFilter: public Filter { + private: + CL_NS(index)::Term* start; + CL_NS(index)::Term* end; + + protected: + DateFilter(const DateFilter& copy); + public: + ~DateFilter(); + + /** Constructs a filter for field <code>f</code> matching times between + <code>from</code> and <code>to</code>. */ + DateFilter(const TCHAR* f, int64_t from, int64_t to); + + /** Constructs a filter for field <code>f</code> matching times before + <code>time</code>. */ + static DateFilter* Before(const TCHAR* field, int64_t time) ; + + /** Constructs a filter for field <code>f</code> matching times after + <code>time</code>. */ + static DateFilter* After(const TCHAR* field, int64_t time) ; + + /** Returns a BitSet with true for documents which should be permitted in + search results, and false for those that should not. 
*/ + CL_NS(util)::BitSet* bits(CL_NS(index)::IndexReader* reader) ; + + Filter* clone() const; + + TCHAR* toString(); + }; +CL_NS_END +#endif diff --git a/src/3rdparty/clucene/src/CLucene/search/ExactPhraseScorer.cpp b/src/3rdparty/clucene/src/CLucene/search/ExactPhraseScorer.cpp new file mode 100644 index 0000000..1fbf2e9 --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/ExactPhraseScorer.cpp @@ -0,0 +1,85 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. +------------------------------------------------------------------------------*/ +#include "CLucene/StdHeader.h" +#include "ExactPhraseScorer.h" + +#include "PhraseScorer.h" +#include "CLucene/index/Terms.h" + +CL_NS_USE(index) +CL_NS_DEF(search) + + ExactPhraseScorer::ExactPhraseScorer(Weight* weight, TermPositions** tps, + int32_t* positions, Similarity* similarity, uint8_t* norms): + PhraseScorer(weight, tps, positions, similarity, norms){ + //Func - Constructor + //Pre - tps != NULL + // tpsLength >= 0 + // n != NULL + //Post - Instance has been created + + CND_PRECONDITION(tps != NULL,"tps is NULL"); + CND_PRECONDITION(tps[0] != NULL,"tps is NULL"); + //CND_PRECONDITION(n != NULL,"n is NULL") =this is checked already in PhraseScorer + + } + + qreal ExactPhraseScorer::phraseFreq(){ + //Func - Returns the freqency of the phrase + //Pre - first != NULL + // last != NULL + // pq != NULL + // size of the PhraseQueue pq is 0 + //Post - The frequency of the phrase has been returned + + CND_PRECONDITION(first != NULL,"first is NULL"); + CND_PRECONDITION(last != NULL,"last is NULL"); + CND_PRECONDITION(pq != NULL,"pq is NULL"); + CND_PRECONDITION(pq->size()==0,"pq is not empty"); + + //build pq from list + + //Add the nodes of the list of PhrasePositions 
and store them + //into the PhraseQueue pq so it can used to build + //a list of sorted nodes + for (PhrasePositions* pp = first; pp != NULL; pp = pp->_next) { + //Read the first TermPosition of the current PhrasePositions pp + pp->firstPosition(); + //Store the current PhrasePositions pp into the PhraseQueue pq + pq->put(pp); + } + //pqToList requires that first and last be NULL when it's called. + //This is done at the beginning of pqToList() + //In this case, the nodes of the linked list are referenced by pq (see + //above loop), so we can clear our pointers to the head and tail of the + //linked list without fear of leaking the nodes. + + //rebuild list from pq + pqToList(); + + //Initialize freq at 0 + int32_t freq = 0; + + //find position with all terms + do { + //scan forward in first + while (first->position < last->position){ + do{ + if (!first->nextPosition()){ + return (qreal)freq; + } + } while (first->position < last->position); + //Make the current first node the last node in the list + firstToLast(); + } + //all equal: a match has been found + freq++; + } while (last->nextPosition()); + + return (qreal)freq; + } +CL_NS_END diff --git a/src/3rdparty/clucene/src/CLucene/search/ExactPhraseScorer.h b/src/3rdparty/clucene/src/CLucene/search/ExactPhraseScorer.h new file mode 100644 index 0000000..d82aa9e --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/ExactPhraseScorer.h @@ -0,0 +1,31 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. 
+------------------------------------------------------------------------------*/ +#ifndef _lucene_search_ExactPhraseScorer_ +#define _lucene_search_ExactPhraseScorer_ + +#if defined(_LUCENE_PRAGMA_ONCE) +# pragma once +#endif + +#include "PhraseScorer.h" +#include "CLucene/index/Terms.h" + +CL_NS_DEF(search) + + class ExactPhraseScorer: public PhraseScorer { + public: + ExactPhraseScorer(Weight* weight, CL_NS(index)::TermPositions** tps, int32_t* positions, + Similarity* similarity, uint8_t* norms ); + + ~ExactPhraseScorer(){ + } + protected: + //Returns the exact freqency of the phrase + qreal phraseFreq(); + }; +CL_NS_END +#endif diff --git a/src/3rdparty/clucene/src/CLucene/search/Explanation.cpp b/src/3rdparty/clucene/src/CLucene/search/Explanation.cpp new file mode 100644 index 0000000..87189b7 --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/Explanation.cpp @@ -0,0 +1,133 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. 
+------------------------------------------------------------------------------*/ +#include "CLucene/StdHeader.h" +#include "Explanation.h" +#include "CLucene/util/StringBuffer.h" + +CL_NS_USE(util) +CL_NS_DEF(search) + + +Explanation::Explanation(qreal value, const TCHAR* description) { + this->value = value; + _tcsncpy(this->description,description,LUCENE_SEARCH_EXPLANATION_DESC_LEN); +} + +Explanation::Explanation() { + this->value = 0; + this->description[0]=0; +} + +Explanation::Explanation(const Explanation& copy){ + set(copy); +} +void Explanation::set(const Explanation& copy){ + this->value = copy.value; + STRCPY_TtoT(description,copy.description,LUCENE_SEARCH_EXPLANATION_DESC_LEN); + + details.clear(); + typedef CL_NS(util)::Deletor::Object<Explanation> Deletor; + CL_NS(util)::CLArrayList<Explanation*, Deletor>::const_iterator itr; + itr = copy.details.begin(); + while ( itr != copy.details.end() ){ + details.push_back( (*itr)->clone() ); + ++itr; + } +} + +Explanation::~Explanation(){ +} + +void Explanation::setDescription(const TCHAR* description) { + _tcsncpy(this->description,description,LUCENE_SEARCH_EXPLANATION_DESC_LEN); +} + + +Explanation* Explanation::clone() const{ + return _CLNEW Explanation(*this); +} + +qreal Explanation::getValue() const{ + return value; +} + +void Explanation::setValue(qreal value) { + this->value = value; +} + +const TCHAR* Explanation::getDescription() const { + return description; +} + +///todo: mem leaks +TCHAR* Explanation::toString(int32_t depth) { + StringBuffer buffer; + for (int32_t i = 0; i < depth; i++) { + buffer.append(_T(" ")); + } + buffer.appendFloat(getValue(),2); + buffer.append(_T(" = ")); + buffer.append(getDescription()); + buffer.append(_T("\n")); + + for ( uint32_t j=0;j<details.size();j++ ){ + TCHAR* tmp = details[j]->toString(depth+1); + buffer.append(tmp); + _CLDELETE_CARRAY(tmp); + } + return buffer.toString(); +} + +int Explanation::getDetailsLength(){ + return details.size(); +} +Explanation* 
Explanation::getDetail(int i){ + return details[i]; +} +/** The sub-nodes of this explanation node. */ +void Explanation::getDetails(Explanation** ret) { + uint32_t size = details.size(); + for ( uint32_t i=0;i<size;i++ ){ + ret[i] = details[i]->clone(); + } + ret[size] = NULL; +} + +/** Adds a sub-node to this explanation node. */ +void Explanation::addDetail(Explanation* detail) { + details.push_back(detail); +} + +/** Render an explanation as text. */ +TCHAR* Explanation::toString() { + return toString(0); +} + +/** Render an explanation as HTML. */ +///todo: mem leaks +TCHAR* Explanation::toHtml() { + StringBuffer buffer; + TCHAR* tmp; + buffer.append(_T("<ul>\n")); + + buffer.append(_T("<li>")); + buffer.appendFloat(getValue(),2); + buffer.append(_T(" = ")); + + buffer.append(getDescription()); + buffer.append(_T("</li>\n")); + + for ( uint32_t i=0;i<details.size();i++ ){ + tmp = details[i]->toHtml(); + buffer.append(tmp); + _CLDELETE_CARRAY(tmp); + } + buffer.append(_T("</ul>\n")); + + return buffer.toString(); +} +CL_NS_END diff --git a/src/3rdparty/clucene/src/CLucene/search/Explanation.h b/src/3rdparty/clucene/src/CLucene/search/Explanation.h new file mode 100644 index 0000000..7c95822 --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/Explanation.h @@ -0,0 +1,66 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. 
+------------------------------------------------------------------------------*/ +#ifndef _lucene_search_Explanation +#define _lucene_search_Explanation + +#if defined(_LUCENE_PRAGMA_ONCE) +# pragma once +#endif + +CL_NS_DEF(search) + + #define LUCENE_SEARCH_EXPLANATION_DESC_LEN 200 + class Explanation :LUCENE_BASE { + private: + qreal value; // the value of this node + TCHAR description[LUCENE_SEARCH_EXPLANATION_DESC_LEN]; // what it represents + CL_NS(util)::CLArrayList<Explanation*,CL_NS(util)::Deletor::Object<Explanation> > details; // sub-explanations + + TCHAR* toString(int32_t depth); + protected: + Explanation(const Explanation& copy); + public: + Explanation(); + ~Explanation(); + + Explanation(qreal value, const TCHAR* description); + void set(const Explanation& other); + + Explanation* clone() const; + + /** The value assigned to this explanation node. */ + qreal getValue() const; + + /** Sets the value assigned to this explanation node. */ + void setValue(qreal value); + + /** A description of this explanation node. */ + const TCHAR* getDescription() const; ///<returns reference + + /** Sets the description of this explanation node. */ + void setDescription(const TCHAR* description); + + /** The sub-nodes of this explanation node. + * @param ret this array of Explanations should be getDetailsLength()+1 in size. + The array will be null terminated. + */ + void getDetails(Explanation** ret); + int getDetailsLength(); + Explanation* getDetail(int i); + + /** Adds a sub-node to this explanation node. */ + void addDetail(Explanation* detail); + + /** Render an explanation as text. */ + TCHAR* toString(); + + /** Render an explanation as HTML. 
*/ + TCHAR* toHtml(); + }; + +CL_NS_END +#endif diff --git a/src/3rdparty/clucene/src/CLucene/search/FieldCache.cpp b/src/3rdparty/clucene/src/CLucene/search/FieldCache.cpp new file mode 100644 index 0000000..fae6720 --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/FieldCache.cpp @@ -0,0 +1,55 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. +------------------------------------------------------------------------------*/ +#include "CLucene/StdHeader.h" +#include "FieldCache.h" +#include "FieldCacheImpl.h" + +CL_NS_DEF(search) + +FieldCache* FieldCache::DEFAULT = _CLNEW FieldCacheImpl(); +int32_t FieldCache::STRING_INDEX = -1; + +FieldCacheAuto::FieldCacheAuto(int32_t len, int32_t type){ + contentType = type; + contentLen = len; + ownContents = false; + + intArray=NULL; + floatArray=NULL; + stringIndex=NULL; + stringArray=NULL; + comparableArray=NULL; + sortComparator=NULL; + scoreDocComparator=NULL; +} +FieldCacheAuto::~FieldCacheAuto(){ + if ( contentType == FieldCacheAuto::INT_ARRAY ){ + _CLDELETE_ARRAY(intArray); + }else if ( contentType == FieldCacheAuto::FLOAT_ARRAY ){ + _CLDELETE_ARRAY(floatArray); + }else if ( contentType == FieldCacheAuto::STRING_INDEX ){ + _CLDELETE(stringIndex); + }else if ( contentType == FieldCacheAuto::STRING_ARRAY ){ + if ( ownContents ){ + for ( int32_t i=0;i<contentLen;i++ ) + _CLDELETE_CARRAY(stringArray[i]); + } + _CLDELETE_ARRAY(stringArray); + }else if ( contentType == FieldCacheAuto::COMPARABLE_ARRAY ){ + if ( ownContents ){ + for ( int32_t i=0;i<contentLen;i++ ) + _CLDELETE(comparableArray[i]); + } + _CLDELETE_ARRAY(comparableArray); + }else if ( contentType == FieldCacheAuto::SORT_COMPARATOR ){ + _CLDELETE(sortComparator); + }else if ( contentType == 
FieldCacheAuto::SCOREDOC_COMPARATOR ){ + _CLDELETE(scoreDocComparator); + } +} + +CL_NS_END diff --git a/src/3rdparty/clucene/src/CLucene/search/FieldCache.h b/src/3rdparty/clucene/src/CLucene/search/FieldCache.h new file mode 100644 index 0000000..eeec26f --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/FieldCache.h @@ -0,0 +1,182 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. +------------------------------------------------------------------------------*/ +#ifndef _lucene_search_FieldCache_ +#define _lucene_search_FieldCache_ + +#if defined(_LUCENE_PRAGMA_ONCE) +# pragma once +#endif + +#include "CLucene/index/IndexReader.h" +#include "Sort.h" + + +CL_NS_DEF(search) + +class FieldCacheAuto; //predefine + +/** + * Expert: Maintains caches of term values. + * + */ +class FieldCache :LUCENE_BASE { +public: + virtual ~FieldCache(){ + } + + /** Expert: Stores term text values and document ordering data. */ + class StringIndex:LUCENE_BASE { + public: + /** All the term values, in natural order. */ + TCHAR** lookup; + + /** For each document, an index into the lookup array. */ + int32_t* order; + + int count; + + /** Creates one of these objects + Consumes all memory given. + */ + StringIndex (int32_t* values, TCHAR** lookup, int count) { + this->count = count; + this->order = values; + this->lookup = lookup; + } + + ~StringIndex(){ + _CLDELETE_ARRAY(order); + + for ( int i=0;i<count;i++ ) + _CLDELETE_CARRAY(lookup[i]); + _CLDELETE_ARRAY(lookup); + } + }; + + + /** Indicator for FieldCache::StringIndex values in the cache. + NOTE: the value assigned to this constant must not be + the same as any of those in SortField!! 
+ */ + static int32_t STRING_INDEX; + + /** Expert: The cache used internally by sorting and range query classes. */ + static FieldCache* DEFAULT; + + /** Checks the internal cache for an appropriate entry, and if none is + * found, reads the terms in <code>field</code> as integers and returns an array + * of size <code>reader.maxDoc()</code> of the value each document + * has in the given field. + * @param reader Used to get field values. + * @param field Which field contains the integers. + * @return The values in the given field for each document. + * @throws IOException If any error occurs. + */ + virtual FieldCacheAuto* getInts (CL_NS(index)::IndexReader* reader, const TCHAR* field) = 0; + + /** Checks the internal cache for an appropriate entry, and if + * none is found, reads the terms in <code>field</code> as floats and returns an array + * of size <code>reader.maxDoc()</code> of the value each document + * has in the given field. + * @param reader Used to get field values. + * @param field Which field contains the floats. + * @return The values in the given field for each document. + * @throws IOException If any error occurs. + */ + virtual FieldCacheAuto* getFloats (CL_NS(index)::IndexReader* reader, const TCHAR* field) = 0; + + /** Checks the internal cache for an appropriate entry, and if none + * is found, reads the term values in <code>field</code> and returns an array + * of size <code>reader.maxDoc()</code> containing the value each document + * has in the given field. + * @param reader Used to get field values. + * @param field Which field contains the strings. + * @return The values in the given field for each document. + * @throws IOException If any error occurs. 
+ */ + virtual FieldCacheAuto* getStrings (CL_NS(index)::IndexReader* reader, const TCHAR* field) = 0; + + /** Checks the internal cache for an appropriate entry, and if none + * is found reads the term values in <code>field</code> and returns + * an array of them in natural order, along with an array telling + * which element in the term array each document uses. + * @param reader Used to get field values. + * @param field Which field contains the strings. + * @return Array of terms and index into the array for each document. + * @throws IOException If any error occurs. + */ + virtual FieldCacheAuto* getStringIndex (CL_NS(index)::IndexReader* reader, const TCHAR* field) = 0; + + /** Checks the internal cache for an appropriate entry, and if + * none is found reads <code>field</code> to see if it contains integers, floats + * or strings, and then calls one of the other methods in this class to get the + * values. For string values, a FieldCache::StringIndex is returned. After + * calling this method, there is an entry in the cache for both + * type <code>AUTO</code> and the actual found type. + * @param reader Used to get field values. + * @param field Which field contains the values. + * @return int32_t[], qreal[] or FieldCache::StringIndex. + * @throws IOException If any error occurs. + */ + virtual FieldCacheAuto* getAuto (CL_NS(index)::IndexReader* reader, const TCHAR* field) = 0; + + /** Checks the internal cache for an appropriate entry, and if none + * is found reads the terms out of <code>field</code> and calls the given SortComparator + * to get the sort values. A hit in the cache will happen if <code>reader</code>, + * <code>field</code>, and <code>comparator</code> are the same (using <code>equals()</code>) + * as a previous call to this method. + * @param reader Used to get field values. + * @param field Which field contains the values. + * @param comparator Used to convert terms into something to sort by. 
+ * @return Array of sort objects, one for each document. + * @throws IOException If any error occurs. + */ + virtual FieldCacheAuto* getCustom (CL_NS(index)::IndexReader* reader, const TCHAR* field, SortComparator* comparator) = 0; +}; + +/** A class holding an AUTO field. In java lucene an Object + is used, but we use this. + contentType: + 1 - integer array + 2 - float array + 3 - FieldCache::StringIndex object + This class is also used when returning getInt, getFloat, etc + because we have no way of returning the size of the array and this + class can be used to determine the array size +*/ +class FieldCacheAuto:LUCENE_BASE{ +public: + enum{ + INT_ARRAY=1, + FLOAT_ARRAY=2, + STRING_INDEX=3, + STRING_ARRAY=4, + COMPARABLE_ARRAY=5, + SORT_COMPARATOR=6, + SCOREDOC_COMPARATOR=7 + }; + + FieldCacheAuto(int32_t len, int32_t type); + ~FieldCacheAuto(); + ///if contents should be deleted too, depending on type + bool ownContents; + int32_t contentLen; //number of items in the list + uint8_t contentType; + int32_t* intArray; //item 1 + qreal* floatArray; //item 2 + FieldCache::StringIndex* stringIndex; //item 3 + TCHAR** stringArray; //item 4 + CL_NS(util)::Comparable** comparableArray; //item 5 + SortComparator* sortComparator; //item 6 + ScoreDocComparator* scoreDocComparator; //item 7 + +}; + + +CL_NS_END + +#endif diff --git a/src/3rdparty/clucene/src/CLucene/search/FieldCacheImpl.cpp b/src/3rdparty/clucene/src/CLucene/search/FieldCacheImpl.cpp new file mode 100644 index 0000000..6205209 --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/FieldCacheImpl.cpp @@ -0,0 +1,529 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. 
+------------------------------------------------------------------------------*/ +#include "CLucene/StdHeader.h" +#include "FieldCacheImpl.h" + +CL_NS_USE(util) +CL_NS_USE(index) +CL_NS_DEF(search) + +FieldCacheImpl::FieldCacheImpl(): + cache(false,true){ +} +FieldCacheImpl::~FieldCacheImpl(){ + cache.clear(); +} + +FieldCacheImpl::FileEntry::FileEntry (const TCHAR* field, int32_t type) { + this->field = CLStringIntern::intern(field CL_FILELINE); + this->type = type; + this->custom = NULL; + this->_hashCode = 0; + } + + /** Creates one of these objects for a custom comparator. */ + FieldCacheImpl::FileEntry::FileEntry (const TCHAR* field, SortComparatorSource* custom) { + this->field = CLStringIntern::intern(field CL_FILELINE); + this->type = SortField::CUSTOM; + this->custom = custom; + this->_hashCode = 0; + } + FieldCacheImpl::FileEntry::~FileEntry(){ + CLStringIntern::unintern(field); + } + + size_t FieldCacheImpl::FileEntry::hashCode(){ + if ( _hashCode == 0 ){ + //todo: cache hashcode? + size_t ret = Misc::thashCode(field); + if ( custom != NULL ) + ret = ret ^ custom->hashCode(); + ret = ret ^ (type*7); //type with a seed + _hashCode = ret; + } + return _hashCode; + } + int32_t FieldCacheImpl::FileEntry::compareTo(const FieldCacheImpl::FileEntry* other) const{ + if ( other->field == this->field ){ + if ( other->type == this->type ){ + if ( other->custom == NULL ){ + if ( this->custom == NULL ) + return 0; //both null + else + return 1; + }else if ( this->custom == NULL ) + return -1; + else if ( other->custom < this->custom ) + return -1; + else if ( other->custom > this->custom ) + return 1; + else + return 0; + }else if ( other->type > this->type ) + return 1; + else + return -1; + + }else + return _tcscmp(other->field,this->field); + } + + /** Two of these are equal iff they reference the same field and type. 
*/ + /*bool FieldCacheImpl::FileEntry::equals (FileEntry* other) { + if (other->field == field && other->type == type) { + if (other->custom == NULL) { + if (custom == NULL) + return true; + } else if (other->custom->equals (custom)) { + return true; + } + } + }*/ + + /** Composes a hashcode based on the field and type. */ + /*size_t FieldCacheImpl::FileEntry::hashCode() { + return field->hashCode() ^ type ^ (custom==NULL ? 0 : custom->hashCode()); + }*/ + + + + + + /** See if an object is in the cache. */ + FieldCacheAuto* FieldCacheImpl::lookup (IndexReader* reader, const TCHAR* field, int32_t type) { + FieldCacheAuto* ret = NULL; + FileEntry* entry = _CLNEW FileEntry (field, type); + { + SCOPED_LOCK_MUTEX(THIS_LOCK) + fieldcacheCacheReaderType* readerCache = cache.get(reader); + if (readerCache != NULL) + ret = readerCache->get (entry); + _CLDELETE(entry); + } + return ret; + } + + + /** See if a custom object is in the cache. */ + FieldCacheAuto* FieldCacheImpl::lookup (IndexReader* reader, const TCHAR* field, SortComparatorSource* comparer) { + FieldCacheAuto* ret = NULL; + FileEntry* entry = _CLNEW FileEntry (field, comparer); + { + SCOPED_LOCK_MUTEX(THIS_LOCK) + fieldcacheCacheReaderType* readerCache = cache.get(reader); + if (readerCache != NULL) + ret = readerCache->get (entry); + _CLDELETE(entry); +} + return ret; + } + + void FieldCacheImpl::closeCallback(CL_NS(index)::IndexReader* reader, void* fieldCacheImpl){ + FieldCacheImpl* fci = (FieldCacheImpl*)fieldCacheImpl; + SCOPED_LOCK_MUTEX(fci->THIS_LOCK) + fci->cache.remove(reader); + } + + /** Put an object into the cache. 
*/ + void FieldCacheImpl::store (IndexReader* reader, const TCHAR* field, int32_t type, FieldCacheAuto* value) { + FileEntry* entry = _CLNEW FileEntry (field, type); + { + SCOPED_LOCK_MUTEX(THIS_LOCK) + fieldcacheCacheReaderType* readerCache = cache.get(reader); + if (readerCache == NULL) { + readerCache = _CLNEW fieldcacheCacheReaderType; + cache.put(reader,readerCache); + reader->addCloseCallback(closeCallback, this); + } + readerCache->put (entry, value); + //this is supposed to return the previous value, but it needs to be deleted!!! + } + } + + /** Put a custom object into the cache. */ + void FieldCacheImpl::store (IndexReader* reader, const TCHAR* field, SortComparatorSource* comparer, FieldCacheAuto* value) { + FileEntry* entry = _CLNEW FileEntry (field, comparer); + { + SCOPED_LOCK_MUTEX(THIS_LOCK) + fieldcacheCacheReaderType* readerCache = cache.get(reader); + if (readerCache == NULL) { + readerCache = _CLNEW fieldcacheCacheReaderType; + cache.put(reader, readerCache); + reader->addCloseCallback(FieldCacheImpl::closeCallback, this); + } + readerCache->put(entry, value); + //this is supposed to return the previous value, but it needs to be deleted!!! 
+ } + } + + + + + + // inherit javadocs + FieldCacheAuto* FieldCacheImpl::getInts (IndexReader* reader, const TCHAR* field) { + field = CLStringIntern::intern(field CL_FILELINE); + FieldCacheAuto* ret = lookup (reader, field, SortField::INT); + if (ret == NULL) { + int32_t retLen = reader->maxDoc(); + int32_t* retArray = _CL_NEWARRAY(int32_t,retLen); + memset(retArray,0,sizeof(int32_t)*retLen); + if (retLen > 0) { + TermDocs* termDocs = reader->termDocs(); + + Term* term = _CLNEW Term (field, LUCENE_BLANK_STRING, false); + TermEnum* termEnum = reader->terms (term); + _CLDECDELETE(term); + try { + if (termEnum->term(false) == NULL) { + _CLTHROWA(CL_ERR_Runtime,"no terms in field"); //todo: add detailed error: + field); + } + do { + Term* term = termEnum->term(false); + if (term->field() != field) + break; + + TCHAR* end; + int32_t termval = (int32_t)_tcstoi64(term->text(), &end, 10); + termDocs->seek (termEnum); + while (termDocs->next()) { + retArray[termDocs->doc()] = termval; + } + } while (termEnum->next()); + } _CLFINALLY( + termDocs->close(); + _CLDELETE(termDocs); + termEnum->close(); + _CLDELETE(termEnum); + ) + } + + FieldCacheAuto* fa = _CLNEW FieldCacheAuto(retLen,FieldCacheAuto::INT_ARRAY); + fa->intArray = retArray; + + store (reader, field, SortField::INT, fa); + CLStringIntern::unintern(field); + return fa; + } + CLStringIntern::unintern(field); + return ret; + } + + // inherit javadocs + FieldCacheAuto* FieldCacheImpl::getFloats (IndexReader* reader, const TCHAR* field){ + field = CLStringIntern::intern(field CL_FILELINE); + FieldCacheAuto* ret = lookup (reader, field, SortField::FLOAT); + if (ret == NULL) { + int32_t retLen = reader->maxDoc(); + qreal* retArray = _CL_NEWARRAY(qreal,retLen); + memset(retArray,0,sizeof(qreal)*retLen); + if (retLen > 0) { + TermDocs* termDocs = reader->termDocs(); + + Term* term = _CLNEW Term (field, LUCENE_BLANK_STRING, false); + TermEnum* termEnum = reader->terms (term); + _CLDECDELETE(term); + + try { + if 
(termEnum->term(false) == NULL) { + _CLTHROWA(CL_ERR_Runtime,"no terms in field "); //todo: make richer error + field); + } + do { + Term* term = termEnum->term(false); + if (term->field() != field) + break; + + TCHAR* tmp; + qreal termval = _tcstod(term->text(),&tmp); + termDocs->seek (termEnum); + while (termDocs->next()) { + retArray[termDocs->doc()] = termval; + } + } while (termEnum->next()); + } _CLFINALLY( + termDocs->close(); + _CLDELETE(termDocs); + termEnum->close(); + _CLDELETE(termEnum); + ) + } + + FieldCacheAuto* fa = _CLNEW FieldCacheAuto(retLen,FieldCacheAuto::FLOAT_ARRAY); + fa->floatArray = retArray; + + store (reader, field, SortField::FLOAT, fa); + CLStringIntern::unintern(field); + return fa; + } + CLStringIntern::unintern(field); + return ret; + } + + + // inherit javadocs + FieldCacheAuto* FieldCacheImpl::getStrings (IndexReader* reader, const TCHAR* field){ + //todo: this is not really used, i think? + field = CLStringIntern::intern(field CL_FILELINE); + FieldCacheAuto* ret = lookup (reader, field, SortField::STRING); + if (ret == NULL) { + int32_t retLen = reader->maxDoc(); + TCHAR** retArray = _CL_NEWARRAY(TCHAR*,retLen+1); + memset(retArray,0,sizeof(TCHAR*)*(retLen+1)); + if (retLen > 0) { + TermDocs* termDocs = reader->termDocs(); + + Term* term = _CLNEW Term (field, LUCENE_BLANK_STRING, false); + TermEnum* termEnum = reader->terms (term); + _CLDECDELETE(term); + + try { + if (termEnum->term(false) == NULL) { + _CLTHROWA(CL_ERR_Runtime,"no terms in field "); //todo: extend to + field); + } + do { + Term* term = termEnum->term(false); + if (term->field() != field) + break; + const TCHAR* termval = term->text(); + termDocs->seek (termEnum); + while (termDocs->next()) { + retArray[termDocs->doc()] = STRDUP_TtoT(termval); //todo: any better way of doing this??? 
+ } + } while (termEnum->next()); + } _CLFINALLY( + retArray[retLen]=NULL; + termDocs->close(); + _CLDELETE(termDocs); + termEnum->close(); + _CLDELETE(termEnum); + ) + } + + + FieldCacheAuto* fa = _CLNEW FieldCacheAuto(retLen,FieldCacheAuto::STRING_ARRAY); + fa->stringArray = retArray; + fa->ownContents=true; + store (reader, field, SortField::STRING, fa); + CLStringIntern::unintern(field); + return fa; + } + CLStringIntern::unintern(field); + return ret; + } + + // inherit javadocs + FieldCacheAuto* FieldCacheImpl::getStringIndex (IndexReader* reader, const TCHAR* field){ + field = CLStringIntern::intern(field CL_FILELINE); + FieldCacheAuto* ret = lookup (reader, field, STRING_INDEX); + int32_t t = 0; // current term number + if (ret == NULL) { + int32_t retLen = reader->maxDoc(); + int32_t* retArray = _CL_NEWARRAY(int32_t,retLen); + memset(retArray,0,sizeof(int32_t)*retLen); + + TCHAR** mterms = _CL_NEWARRAY(TCHAR*,retLen+2); + mterms[0]=NULL; + if ( retLen > 0 ) { + TermDocs* termDocs = reader->termDocs(); + + Term* term = _CLNEW Term (field, LUCENE_BLANK_STRING, false); + TermEnum* termEnum = reader->terms (term); + _CLDECDELETE(term); + + + CND_PRECONDITION(t+1 <= retLen, "t out of bounds"); + + // an entry for documents that have no terms in this field + // should a document with no terms be at top or bottom? + // this puts them at the top - if it is changed, FieldDocSortedHitQueue + // needs to change as well. 
+ mterms[t++] = NULL; + + try { + if (termEnum->term(false) == NULL) { + _CLTHROWA(CL_ERR_Runtime,"no terms in field"); //todo: make rich message " + field); + } + do { + Term* term = termEnum->term(false); + if (term->field() != field) + break; + + // store term text + // we expect that there is at most one term per document + if (t >= retLen+1) + _CLTHROWA(CL_ERR_Runtime,"there are more terms than documents in field"); //todo: rich error \"" + field + "\""); + mterms[t] = STRDUP_TtoT(term->text()); + + termDocs->seek (termEnum); + while (termDocs->next()) { + retArray[termDocs->doc()] = t; + } + + t++; + } while (termEnum->next()); + CND_PRECONDITION(t<retLen+2,"t out of bounds"); + mterms[t] = NULL; + } _CLFINALLY( + termDocs->close(); + _CLDELETE(termDocs); + termEnum->close(); + _CLDELETE(termEnum); + ); + + if (t == 0) { + // if there are no terms, make the term array + // have a single NULL entry + _CLDELETE_ARRAY(mterms); + mterms = _CL_NEWARRAY(TCHAR*,1); //todo: delete old mterms? + mterms[0]=NULL; + } else if (t < retLen) { //todo: check, was mterms.length + // if there are less terms than documents, + // trim off the dead array space + //const TCHAR** terms = _CL_NEWARRAY(TCHAR,t); + //System.arraycopy (mterms, 0, terms, 0, t); + //mterms = terms; + + //we simply shorten the length of the array... 
+ + } + } + FieldCache::StringIndex* value = _CLNEW FieldCache::StringIndex (retArray, mterms,t); + + FieldCacheAuto* fa = _CLNEW FieldCacheAuto(retLen,FieldCacheAuto::STRING_INDEX); + fa->stringIndex = value; + fa->ownContents=true; + store (reader, field, STRING_INDEX, fa); + CLStringIntern::unintern(field); + return fa; + } + CLStringIntern::unintern(field); + return ret; + } + + // inherit javadocs + FieldCacheAuto* FieldCacheImpl::getAuto (IndexReader* reader, const TCHAR* field) { + field = CLStringIntern::intern(field CL_FILELINE); + FieldCacheAuto* ret = lookup (reader, field, SortField::AUTO); + if (ret == NULL) { + Term* term = _CLNEW Term (field, LUCENE_BLANK_STRING, false); + TermEnum* enumerator = reader->terms (term); + _CLDECDELETE(term); + + try { + Term* term = enumerator->term(false); + if (term == NULL) { + _CLTHROWA(CL_ERR_Runtime,"no terms in field - cannot determine sort type"); //todo: make rich error: " + field + " + } + if (term->field() == field) { + const TCHAR* termtext = term->text(); + size_t termTextLen = term->textLength(); + + bool isint=true; + for ( size_t i=0;i<termTextLen;i++ ){ + if ( _tcschr(_T("0123456789 +-"),termtext[i]) == NULL ){ + isint = false; + break; + } + } + if ( isint ) + ret = getInts (reader, field); + else{ + bool isfloat=true; + + int32_t searchLen = termTextLen; + if ( termtext[termTextLen-1] == 'f' ) + searchLen--; + for ( int32_t i=0;i<searchLen;i++ ){ + if ( _tcschr(_T("0123456789 Ee.+-"),termtext[i]) == NULL ){ + isfloat = false; + break; + } + } + if ( isfloat ) + ret = getFloats (reader, field); + else{ + ret = getStringIndex (reader, field); + } + } + + if (ret != NULL) { + store (reader, field, SortField::AUTO, ret); + } + } else { + _CLTHROWA (CL_ERR_Runtime,"field does not appear to be indexed"); //todo: make rich error: \"" + field + "\" + } + } _CLFINALLY( enumerator->close(); _CLDELETE(enumerator) ); + + } + CLStringIntern::unintern(field); + return ret; + } + + + // inherit javadocs + 
FieldCacheAuto* FieldCacheImpl::getCustom (IndexReader* reader, const TCHAR* field, SortComparator* comparator){ + field = CLStringIntern::intern(field CL_FILELINE); + + FieldCacheAuto* ret = lookup (reader, field, comparator); + if (ret == NULL) { + int32_t retLen = reader->maxDoc(); + Comparable** retArray = _CL_NEWARRAY(Comparable*,retLen); + memset(retArray,0,sizeof(Comparable*)*retLen); + if (retLen > 0) { + TermDocs* termDocs = reader->termDocs(); + TermEnum* termEnum = reader->terms (); + + try { + if (termEnum->term(false) == NULL) { + _CLTHROWA(CL_ERR_Runtime,"no terms in field "); //todo: make rich error + field); + } + do { + Term* term = termEnum->term(false); + if (term->field() != field) + break; + Comparable* termval = comparator->getComparable (term->text()); + termDocs->seek (termEnum); + while (termDocs->next()) { + retArray[termDocs->doc()] = termval; + } + } while (termEnum->next()); + } _CLFINALLY ( + termDocs->close(); + _CLDELETE(termDocs); + termEnum->close(); + _CLDELETE(termEnum); + ); + } + + FieldCacheAuto* fa = _CLNEW FieldCacheAuto(retLen,FieldCacheAuto::COMPARABLE_ARRAY); + fa->comparableArray = retArray; + fa->ownContents=true; + store (reader, field, SortField::CUSTOM, fa); + CLStringIntern::unintern(field); + return fa; + } + CLStringIntern::unintern(field); + return ret; + } + + + FieldCacheImpl::fieldcacheCacheReaderType::fieldcacheCacheReaderType(){ + setDeleteKey(false); + setDeleteValue(false); + } + FieldCacheImpl::fieldcacheCacheReaderType::~fieldcacheCacheReaderType(){ + iterator itr = begin(); + while ( itr != end() ){ + FileEntry* f = itr->first; + if ( f->getType() != SortField::AUTO ) + _CLDELETE( itr->second ); + _CLDELETE( f ); + ++itr; + } + clear(); + } +CL_NS_END diff --git a/src/3rdparty/clucene/src/CLucene/search/FieldCacheImpl.h b/src/3rdparty/clucene/src/CLucene/search/FieldCacheImpl.h new file mode 100644 index 0000000..ac3c4ca --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/FieldCacheImpl.h @@ 
-0,0 +1,144 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. +------------------------------------------------------------------------------*/ +#ifndef _lucene_search_FieldCacheImpl_ +#define _lucene_search_FieldCacheImpl_ + +#if defined(_LUCENE_PRAGMA_ONCE) +# pragma once +#endif + +#include "CLucene/index/IndexReader.h" +#include "FieldCache.h" +#include "Sort.h" + + +CL_NS_DEF(search) + + +/** + * Expert: The default cache implementation, storing all values in memory. + * + */ +class FieldCacheImpl: public FieldCache { +public: + DEFINE_MUTEX(THIS_LOCK) + + /** Expert: Every key in the internal cache is of this type. */ + class FileEntry:LUCENE_BASE { + const TCHAR* field; // which Field + int32_t type; // which SortField type + SortComparatorSource* custom; // which custom comparator + size_t _hashCode; + public: + /** Creates one of these objects. */ + FileEntry (const TCHAR* field, int32_t type); + + /** Creates one of these objects for a custom comparator. */ + FileEntry (const TCHAR* field, SortComparatorSource* custom); + ~FileEntry(); + + int32_t getType() const{ return type; } + + /** Two of these are equal iff they reference the same field and type. */ + bool equals (FileEntry* other) const; + + /** Composes a hashcode based on the field and type. 
*/ + size_t hashCode(); + + int32_t compareTo(const FileEntry* other) const; + + class Compare:LUCENE_BASE, public CL_NS(util)::Compare::_base //<Term*> + { + public: + bool operator()( FileEntry* f1, FileEntry* f2 ) const{ + return ( f1->compareTo(f2) < 0 ); + } + size_t operator()( FileEntry* t ) const{ + return t->hashCode(); + } + }; + class Equals:LUCENE_BASE, public CL_NS(util)::Compare::_base //<Term*> + { + public: + bool operator()( FileEntry* f1, FileEntry* f2 ) const{ + return ( f1->compareTo(f2) == 0 ); + } + }; + }; + + FieldCacheImpl(); + ~FieldCacheImpl(); +private: + + ///the type that is stored in the field cache. can't use a typedef because + ///the decorated name would become too long + class fieldcacheCacheReaderType: public CL_NS(util)::CLHashMap<FileEntry*, + FieldCacheAuto*, + FileEntry::Compare, + FileEntry::Equals, + CL_NS(util)::Deletor::Object<FileEntry>, + CL_NS(util)::Deletor::Object<FieldCacheAuto> >{ + public: + fieldcacheCacheReaderType(); + ~fieldcacheCacheReaderType(); + }; + + //note: typename gets too long if using cacheReaderType as a typename + typedef CL_NS(util)::CLHashMap<CL_NS(index)::IndexReader*, + fieldcacheCacheReaderType*, + CL_NS(util)::Compare::Void<CL_NS(index)::IndexReader>, + CL_NS(util)::Equals::Void<CL_NS(index)::IndexReader>, + CL_NS(util)::Deletor::Object<CL_NS(index)::IndexReader>, + CL_NS(util)::Deletor::Object<fieldcacheCacheReaderType> > fieldcacheCacheType; + + /** The internal cache. Maps FileEntry to array of interpreted term values. **/ + //todo: make indexreader remove itself from here when the reader is shut + fieldcacheCacheType cache; + + /** See if an object is in the cache. */ + FieldCacheAuto* lookup (CL_NS(index)::IndexReader* reader, const TCHAR* field, int32_t type) ; + + /** See if a custom object is in the cache. */ + FieldCacheAuto* lookup (CL_NS(index)::IndexReader* reader, const TCHAR* field, SortComparatorSource* comparer); + + /** Put an object into the cache. 
*/ + void store (CL_NS(index)::IndexReader* reader, const TCHAR* field, int32_t type, FieldCacheAuto* value); + + /** Put a custom object into the cache. */ + void store (CL_NS(index)::IndexReader* reader, const TCHAR* field, SortComparatorSource* comparer, FieldCacheAuto* value); + +public: + + // inherit javadocs + FieldCacheAuto* getInts (CL_NS(index)::IndexReader* reader, const TCHAR* field); + + // inherit javadocs + FieldCacheAuto* getFloats (CL_NS(index)::IndexReader* reader, const TCHAR* field); + + // inherit javadocs + FieldCacheAuto* getStrings (CL_NS(index)::IndexReader* reader, const TCHAR* field); + + // inherit javadocs + FieldCacheAuto* getStringIndex (CL_NS(index)::IndexReader* reader, const TCHAR* field); + + // inherit javadocs + FieldCacheAuto* getAuto (CL_NS(index)::IndexReader* reader, const TCHAR* field); + + // inherit javadocs + FieldCacheAuto* getCustom (CL_NS(index)::IndexReader* reader, const TCHAR* field, SortComparator* comparator); + + + /** + * Callback for when IndexReader closes. This causes + * any cache to be removed for the specified reader. + */ + static void closeCallback(CL_NS(index)::IndexReader* reader, void* fieldCacheImpl); +}; + + +CL_NS_END +#endif diff --git a/src/3rdparty/clucene/src/CLucene/search/FieldDoc.h b/src/3rdparty/clucene/src/CLucene/search/FieldDoc.h new file mode 100644 index 0000000..6ce915a --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/FieldDoc.h @@ -0,0 +1,70 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. 
+------------------------------------------------------------------------------*/ +#ifndef _lucene_search_FieldDoc_ +#define _lucene_search_FieldDoc_ + +#if defined(_LUCENE_PRAGMA_ONCE) +# pragma once +#endif + +#include "ScoreDoc.h" + +CL_NS_DEF(search) + +/** + * Expert: A ScoreDoc which also contains information about + * how to sort the referenced document. In addition to the + * document number and score, this object contains an array + * of values for the document from the field(s) used to sort. + * For example, if the sort criteria was to sort by fields + * "a", "b" then "c", the <code>fields</code> object array + * will have three elements, corresponding respectively to + * the term values for the document in fields "a", "b" and "c". + * The class of each element in the array will be either + * Integer, Float or String depending on the type of values + * in the terms of each field. + * + * @see ScoreDoc + * @see TopFieldDocs + */ +class FieldDoc: public ScoreDoc { +public: + + /** Expert: The values which are used to sort the referenced document. + * The order of these will match the original sort criteria given by a + * Sort object. Each Object will be either an Integer, Float or String, + * depending on the type of values in the terms of the original field. + * @see Sort + * @see Searchable#search(Query,Filter,int32_t,Sort) + */ + CL_NS(util)::Comparable** fields; + + /** Expert: Creates one of these objects with empty sort information. */ + FieldDoc (int32_t doc, qreal score): + ScoreDoc(doc,score) { + fields=NULL; + } + + /** Expert: Creates one of these objects with the given sort information. 
*/ + FieldDoc (int32_t doc, qreal score, CL_NS(util)::Comparable** fields): + ScoreDoc(doc,score) + { + this->fields = fields; + } + + ~FieldDoc(){ + if ( fields != NULL ){ + for ( int i=0;fields[i]!=NULL;i++ ) + _CLDELETE(fields[i]); + _CLDELETE_ARRAY(fields); + } + } +}; + +CL_NS_END +#endif + diff --git a/src/3rdparty/clucene/src/CLucene/search/FieldDocSortedHitQueue.cpp b/src/3rdparty/clucene/src/CLucene/search/FieldDocSortedHitQueue.cpp new file mode 100644 index 0000000..0a52109 --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/FieldDocSortedHitQueue.cpp @@ -0,0 +1,171 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. +------------------------------------------------------------------------------*/ +#include "CLucene/StdHeader.h" +#include "FieldDocSortedHitQueue.h" + + +CL_NS_USE(util) +CL_NS_DEF(search) + + +FieldDoc::FieldDoc (int32_t doc, qreal score) +{ + this->scoreDoc.doc = doc; + this->scoreDoc.score = score; + fields=NULL; +} + +FieldDoc::FieldDoc (int32_t doc, qreal score, CL_NS(util)::Comparable** fields) +{ + this->scoreDoc.doc = doc; + this->scoreDoc.score = score; + this->fields = fields; +} + +FieldDoc::~FieldDoc(){ + if ( fields != NULL ){ + for ( int i=0;fields[i]!=NULL;i++ ) + _CLDELETE(fields[i]); + _CLDELETE_ARRAY(fields); + } +} + + + +FieldDocSortedHitQueue::FieldDocSortedHitQueue (SortField** fields, int32_t size) { + this->fields = fields; + _countsize(); + //this->collators = hasCollators (fields); + initialize (size,true); +} + +bool FieldDocSortedHitQueue::lessThan (FieldDoc* docA, FieldDoc* docB) { + int32_t n = fieldsLen; + int32_t c = 0; + qreal f1,f2,r1,r2; + int32_t i1,i2; + const TCHAR *s1, *s2; + + for (int32_t i=0; i<n && c==0; ++i) { + int32_t type = 
fields[i]->getType(); + if (fields[i]->getReverse()) { + switch (type) { + case SortField::DOCSCORE: + r1 = __REINTERPRET_CAST(Compare::Float*, docA->fields[i])->getValue(); + r2 = __REINTERPRET_CAST(Compare::Float*, docB->fields[i])->getValue(); + if (r1 < r2) c = -1; + if (r1 > r2) c = 1; + break; + case SortField::DOC: + case SortField::INT: + i1 = __REINTERPRET_CAST(Compare::Int32*, docA->fields[i])->getValue(); + i2 = __REINTERPRET_CAST(Compare::Int32*, docB->fields[i])->getValue(); + if (i1 > i2) c = -1; + if (i1 < i2) c = 1; + break; + case SortField::STRING: + s1 = __REINTERPRET_CAST(Compare::TChar*, docA->fields[i])->getValue(); + s2 = __REINTERPRET_CAST(Compare::TChar*, docB->fields[i])->getValue(); + if (s2 == NULL) c = -1; // could be NULL if there are + else if (s1 == NULL) c = 1; // no terms in the given field + else c = _tcscmp(s2,s1); //else if (fields[i].getLocale() == NULL) { + + /*todo: collators not impl + } else { + c = collators[i].compare (s2, s1); + }*/ + break; + case SortField::FLOAT: + f1 = __REINTERPRET_CAST(Compare::Float*, docA->fields[i])->getValue(); + f2 = __REINTERPRET_CAST(Compare::Float*, docB->fields[i])->getValue(); + if (f1 > f2) c = -1; + if (f1 < f2) c = 1; + break; + case SortField::CUSTOM: + c = docB->fields[i]->compareTo (docA->fields[i]); + break; + case SortField::AUTO: + // we cannot handle this - even if we determine the type of object (qreal or + // Integer), we don't necessarily know how to compare them (both SCORE and + // qreal both contain floats, but are sorted opposite of each other). Before + // we get here, each AUTO should have been replaced with its actual value. + _CLTHROWA (CL_ERR_Runtime,"FieldDocSortedHitQueue cannot use an AUTO SortField"); + default: + _CLTHROWA (CL_ERR_Runtime, "invalid SortField type"); //todo: rich error... 
: "+type); + } + } else { + switch (type) { + case SortField::DOCSCORE: + r1 = __REINTERPRET_CAST(Compare::Float*, docA->fields[i])->getValue(); + r2 = __REINTERPRET_CAST(Compare::Float*, docB->fields[i])->getValue(); + if (r1 > r2) c = -1; + if (r1 < r2) c = 1; + break; + case SortField::DOC: + case SortField::INT: + i1 = __REINTERPRET_CAST(Compare::Int32*, docA->fields[i])->getValue(); + i2 = __REINTERPRET_CAST(Compare::Int32*, docB->fields[i])->getValue(); + if (i1 < i2) c = -1; + if (i1 > i2) c = 1; + break; + case SortField::STRING: + s1 = __REINTERPRET_CAST(Compare::TChar*, docA->fields[i])->getValue(); + s2 = __REINTERPRET_CAST(Compare::TChar*, docB->fields[i])->getValue(); + // NULL values need to be sorted first, because of how FieldCache.getStringIndex() + // works - in that routine, any documents without a value in the given field are + // put first. + if (s1 == NULL) c = -1; // could be NULL if there are + else if (s2 == NULL) c = 1; // no terms in the given field + else c = _tcscmp(s1,s2); //else if (fields[i].getLocale() == NULL) { + + /* todo: collators not implemented } else { + c = collators[i].compare (s1, s2); + }*/ + break; + case SortField::FLOAT: + f1 = __REINTERPRET_CAST(Compare::Float*, docA->fields[i])->getValue(); + f2 = __REINTERPRET_CAST(Compare::Float*, docB->fields[i])->getValue(); + if (f1 < f2) c = -1; + if (f1 > f2) c = 1; + break; + case SortField::CUSTOM: + c = docA->fields[i]->compareTo (docB->fields[i]); + break; + case SortField::AUTO: + // we cannot handle this - even if we determine the type of object (qreal or + // Integer), we don't necessarily know how to compare them (both SCORE and + // qreal both contain floats, but are sorted opposite of each other). Before + // we get here, each AUTO should have been replaced with its actual value. + _CLTHROWA (CL_ERR_Runtime,"FieldDocSortedHitQueue cannot use an AUTO SortField"); + default: + _CLTHROWA (CL_ERR_Runtime,"invalid SortField type"); //todo: rich error... 
: "+type); + } + } + } + return c > 0; +} + +void FieldDocSortedHitQueue::setFields (SortField** fields) { + SCOPED_LOCK_MUTEX(THIS_LOCK) + if (this->fields == NULL) { + this->fields = fields; + _countsize(); + //this->collators = hasCollators (fields); + }else if ( fields == NULL ) + this->fields = NULL; +} + +FieldDocSortedHitQueue::~FieldDocSortedHitQueue(){ + if ( fields != NULL ){ + for ( int i=0;fields[i]!=NULL;i++ ) + _CLDELETE(fields[i]); + _CLDELETE_ARRAY(fields); + } +} + +CL_NS_END + diff --git a/src/3rdparty/clucene/src/CLucene/search/FieldDocSortedHitQueue.h b/src/3rdparty/clucene/src/CLucene/search/FieldDocSortedHitQueue.h new file mode 100644 index 0000000..5a46b3b --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/FieldDocSortedHitQueue.h @@ -0,0 +1,159 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. +------------------------------------------------------------------------------*/ +#ifndef _lucene_search_FieldDocSortedHitQueue_ +#define _lucene_search_FieldDocSortedHitQueue_ + +#if defined(_LUCENE_PRAGMA_ONCE) +# pragma once +#endif + +#include "Sort.h" +#include "CLucene/util/PriorityQueue.h" + +CL_NS_DEF(search) + +/** + * Expert: A ScoreDoc which also contains information about + * how to sort the referenced document. In addition to the + * document number and score, this object contains an array + * of values for the document from the field(s) used to sort. + * For example, if the sort criteria was to sort by fields + * "a", "b" then "c", the <code>fields</code> object array + * will have three elements, corresponding respectively to + * the term values for the document in fields "a", "b" and "c". 
+ * The class of each element in the array will be either + * Integer, Float or String depending on the type of values + * in the terms of each field. + * + * @see ScoreDoc + * @see TopFieldDocs + */ +class FieldDoc: LUCENE_BASE { +public: + //FieldDoc did inherit from ScoreDoc, but now we make the scoredoc a member + struct ScoreDoc scoreDoc; + + /** Expert: The values which are used to sort the referenced document. + * The order of these will match the original sort criteria given by a + * Sort object. Each Object will be either an Integer, Float or String, + * depending on the type of values in the terms of the original field. + * @see Sort + * @see Searchable#search(Query,Filter,int32_t,Sort) + */ + CL_NS(util)::Comparable** fields; + + /** Expert: Creates one of these objects with empty sort information. */ + FieldDoc (int32_t doc, qreal score); + /** Expert: Creates one of these objects with the given sort information. */ + FieldDoc (int32_t doc, qreal score, CL_NS(util)::Comparable** fields); + ~FieldDoc(); +}; + +/** + * Expert: Collects sorted results from Searchable's and collates them. + * The elements put into this queue must be of type FieldDoc. + */ +class FieldDocSortedHitQueue: + public CL_NS(util)::PriorityQueue<FieldDoc*,CL_NS(util)::Deletor::Object<FieldDoc> > +{ +private: + DEFINE_MUTEX(THIS_LOCK) + + // this cannot contain AUTO fields - any AUTO fields should + // have been resolved by the time this class is used. + SortField** fields; + int32_t fieldsLen; + + void _countsize(){ // recomputes fieldsLen by scanning fields up to its NULL terminator + fieldsLen=0; + while(fields!=NULL && fields[fieldsLen]!=NULL) // a NULL array counts as zero fields + fieldsLen++; + } + + // used in the case where the fields are sorted by locale + // based strings + //todo: not implemented in clucene because locales has not been implemented + //Collator[] collators; //volatile + +public: + /** + * Creates a hit queue sorted by the given list of fields. + * @param fields Field names, in priority order (highest priority first). + * @param size The number of hits to retain.
Must be greater than zero. + */ + FieldDocSortedHitQueue (SortField** fields, int32_t size); + ~FieldDocSortedHitQueue(); + + + /** + * Allows redefinition of sort fields if they are <code>NULL</code>. + * This is to handle the case using ParallelMultiSearcher where the + * original list contains AUTO and we don't know the actual sort + * type until the values come back. The fields can only be set once. + * This method is thread safe. + * @param fields NULL-terminated array of sort fields + */ + void setFields (SortField** fields); + + /** Returns the fields being used to sort. */ + SortField** getFields() { + return fields; + } + + /** Returns an array of collators, possibly <code>NULL</code>. The collators + * correspond to any SortFields which were given a specific locale. + * @param fields Array of sort fields. + * @return Array, possibly <code>NULL</code>. + + private Collator[] hasCollators (SortField[] fields) { + if (fields == NULL) return NULL; + Collator[] ret = new Collator[fields.length]; + for (int32_t i=0; i<fields.length; ++i) { + Locale locale = fields[i].getLocale(); + if (locale != NULL) + ret[i] = Collator.getInstance (locale); + } + return ret; + }*/ + +protected: + /** + * Returns whether <code>docA</code> is less relevant than <code>docB</code>. + * @param docA FieldDoc + * @param docB FieldDoc + * @return <code>true</code> if document <code>docA</code> should be sorted after document <code>docB</code>. + */ + bool lessThan (FieldDoc* docA, FieldDoc* docB); +}; + + +/** +* Expert: Returned by low-level sorted search implementations. +* +* @see Searchable#search(Query,Filter,int32_t,Sort) +*/ +class TopFieldDocs: public TopDocs { +public: + /// The fields which were used to sort results by. + SortField** fields; + + FieldDoc** fieldDocs; + + /** Creates one of these objects. + * @param totalHits Total number of hits for the query. + * @param fieldDocs The top hits for the query. + * @param scoreDocs The top hits for the query.
+ * @param scoreDocsLen Length of fieldDocs and scoreDocs + * @param fields The sort criteria used to find the top hits. + */ + TopFieldDocs (int32_t totalHits, FieldDoc** fieldDocs, int32_t scoreDocsLen, SortField** fields); + ~TopFieldDocs(); +}; + +CL_NS_END +#endif + diff --git a/src/3rdparty/clucene/src/CLucene/search/FieldSortedHitQueue.cpp b/src/3rdparty/clucene/src/CLucene/search/FieldSortedHitQueue.cpp new file mode 100644 index 0000000..04f45e9 --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/FieldSortedHitQueue.cpp @@ -0,0 +1,212 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. +------------------------------------------------------------------------------*/ +#include "CLucene/StdHeader.h" +#include "FieldSortedHitQueue.h" +#include "FieldDocSortedHitQueue.h" +#include "Compare.h" + +CL_NS_USE(util) +CL_NS_USE(index) +CL_NS_DEF(search) + +FieldSortedHitQueue::hitqueueCacheType FieldSortedHitQueue::Comparators(false,true); + +FieldSortedHitQueue::FieldSortedHitQueue (IndexReader* reader, SortField** _fields, int32_t size): + fieldsLen(0), + maxscore(1.0f) +{ + while ( _fields[fieldsLen] != 0 ) + fieldsLen++; + + comparators = _CL_NEWARRAY(ScoreDocComparator*,fieldsLen+1); + SortField** tmp = _CL_NEWARRAY(SortField*,fieldsLen+1); + for (int32_t i=0; i<fieldsLen; ++i) { + const TCHAR* fieldname = _fields[i]->getField(); + //todo: fields[i].getLocale(), not implemented + comparators[i] = getCachedComparator (reader, fieldname, _fields[i]->getType(), _fields[i]->getFactory()); + tmp[i] = _CLNEW SortField (fieldname, comparators[i]->sortType(), _fields[i]->getReverse()); + } + comparatorsLen = fieldsLen; + comparators[fieldsLen]=NULL; + tmp[fieldsLen] = NULL; + this->fields = tmp; + + 
initialize(size,true); +} + + +bool FieldSortedHitQueue::lessThan (FieldDoc* docA, FieldDoc* docB) { + // keep track of maximum score + if (docA->scoreDoc.score > maxscore) maxscore = docA->scoreDoc.score; + if (docB->scoreDoc.score > maxscore) maxscore = docB->scoreDoc.score; + + // run comparators + int32_t c = 0; + for ( int32_t i=0; c==0 && i<comparatorsLen; ++i ) { + c = (fields[i]->getReverse()) ? comparators[i]->compare (&docB->scoreDoc, &docA->scoreDoc) : + comparators[i]->compare (&docA->scoreDoc, &docB->scoreDoc); + } + // avoid random sort order that could lead to duplicates (bug #31241): + if (c == 0) + return docA->scoreDoc.doc > docB->scoreDoc.doc; + return c > 0; +} + + +//static +ScoreDocComparator* FieldSortedHitQueue::comparatorString (IndexReader* reader, const TCHAR* field) { + //const TCHAR* field = CLStringIntern::intern(fieldname CL_FILELINE); + FieldCacheAuto* fa = FieldCache::DEFAULT->getStringIndex (reader, field); + //CLStringIntern::unintern(field); + + CND_PRECONDITION(fa->contentType==FieldCacheAuto::STRING_INDEX,"Content type is incorrect"); + fa->ownContents = false; + return _CLNEW ScoreDocComparators::String(fa->stringIndex, fa->contentLen); +} + +//static +ScoreDocComparator* FieldSortedHitQueue::comparatorInt (IndexReader* reader, const TCHAR* field){ + //const TCHAR* field = CLStringIntern::intern(fieldname CL_FILELINE); + FieldCacheAuto* fa = FieldCache::DEFAULT->getInts (reader, field); + //CLStringIntern::unintern(field); + + CND_PRECONDITION(fa->contentType==FieldCacheAuto::INT_ARRAY,"Content type is incorrect"); + return _CLNEW ScoreDocComparators::Int32(fa->intArray, fa->contentLen); + } + +//static + ScoreDocComparator* FieldSortedHitQueue::comparatorFloat (IndexReader* reader, const TCHAR* field) { + //const TCHAR* field = CLStringIntern::intern(fieldname CL_FILELINE); + FieldCacheAuto* fa = FieldCache::DEFAULT->getFloats (reader, field); + //CLStringIntern::unintern(field); + + 
CND_PRECONDITION(fa->contentType==FieldCacheAuto::FLOAT_ARRAY,"Content type is incorrect"); + return _CLNEW ScoreDocComparators::Float (fa->floatArray, fa->contentLen); + } +//static + ScoreDocComparator* FieldSortedHitQueue::comparatorAuto (IndexReader* reader, const TCHAR* field){ + //const TCHAR* field = CLStringIntern::intern(fieldname CL_FILELINE); + FieldCacheAuto* fa = FieldCache::DEFAULT->getAuto (reader, field); + //CLStringIntern::unintern(field); + + if (fa->contentType == FieldCacheAuto::STRING_INDEX ) { + return comparatorString (reader, field); + } else if (fa->contentType == FieldCacheAuto::INT_ARRAY) { + return comparatorInt (reader, field); + } else if (fa->contentType == FieldCacheAuto::FLOAT_ARRAY) { + return comparatorFloat (reader, field); + } else if (fa->contentType == FieldCacheAuto::STRING_ARRAY) { + return comparatorString (reader, field); + } else { + _CLTHROWA(CL_ERR_Runtime, "unknown data type in field"); //todo: rich error information: '"+field+"'"); + } + } + + + //todo: Locale locale, not implemented yet + ScoreDocComparator* FieldSortedHitQueue::getCachedComparator (IndexReader* reader, const TCHAR* fieldname, int32_t type, SortComparatorSource* factory){ + if (type == SortField::DOC) + return ScoreDocComparator::INDEXORDER; + if (type == SortField::DOCSCORE) + return ScoreDocComparator::RELEVANCE; + ScoreDocComparator* comparator = lookup (reader, fieldname, type, factory); + if (comparator == NULL) { + switch (type) { + case SortField::AUTO: + comparator = comparatorAuto (reader, fieldname); + break; + case SortField::INT: + comparator = comparatorInt (reader, fieldname); + break; + case SortField::FLOAT: + comparator = comparatorFloat (reader, fieldname); + break; + case SortField::STRING: + //if (locale != NULL) + // comparator = comparatorStringLocale (reader, fieldname, locale); + //else + comparator = comparatorString (reader, fieldname); + break; + case SortField::CUSTOM: + comparator = factory->newComparator (reader, 
fieldname); + break; + default: + _CLTHROWA(CL_ERR_Runtime,"unknown field type"); + //todo: extend error + //throw _CLNEW RuntimeException ("unknown field type: "+type); + } + store (reader, fieldname, type, factory, comparator); + } + return comparator; + } + + + FieldDoc* FieldSortedHitQueue::fillFields (FieldDoc* doc) const{ + int32_t n = comparatorsLen; + Comparable** fields = _CL_NEWARRAY(Comparable*,n+1); + for (int32_t i=0; i<n; ++i) + fields[i] = comparators[i]->sortValue(&doc->scoreDoc); + fields[n]=NULL; + doc->fields = fields; + if (maxscore > 1.0f) + doc->scoreDoc.score /= maxscore; // normalize scores + return doc; + } + + ScoreDocComparator* FieldSortedHitQueue::lookup (IndexReader* reader, const TCHAR* field, int32_t type, SortComparatorSource* factory) { + ScoreDocComparator* sdc = NULL; + FieldCacheImpl::FileEntry* entry = (factory != NULL) + ? _CLNEW FieldCacheImpl::FileEntry (field, factory) + : _CLNEW FieldCacheImpl::FileEntry (field, type); + + { + SCOPED_LOCK_MUTEX(Comparators.THIS_LOCK) + hitqueueCacheReaderType* readerCache = Comparators.get(reader); + if (readerCache == NULL){ + _CLDELETE(entry); + return NULL; + } + + sdc = readerCache->get (entry); + _CLDELETE(entry); + } + return sdc; + } + + void FieldSortedHitQueue::closeCallback(CL_NS(index)::IndexReader* reader, void*){ + SCOPED_LOCK_MUTEX(Comparators.THIS_LOCK) + Comparators.remove(reader); + } + + //static + void FieldSortedHitQueue::store (IndexReader* reader, const TCHAR* field, int32_t type, SortComparatorSource* factory, ScoreDocComparator* value) { + FieldCacheImpl::FileEntry* entry = (factory != NULL) + ? 
_CLNEW FieldCacheImpl::FileEntry (field, factory) + : _CLNEW FieldCacheImpl::FileEntry (field, type); + + { + SCOPED_LOCK_MUTEX(Comparators.THIS_LOCK) + hitqueueCacheReaderType* readerCache = Comparators.get(reader); + if (readerCache == NULL) { + readerCache = _CLNEW hitqueueCacheReaderType(true); + Comparators.put(reader,readerCache); + reader->addCloseCallback(FieldSortedHitQueue::closeCallback,NULL); + } + readerCache->put (entry, value); + //return NULL; //supposed to return previous value... + } + } + +FieldSortedHitQueue::~FieldSortedHitQueue(){ + _CLDELETE_ARRAY(comparators); + if ( fields != NULL ){ + for ( int i=0;fields[i]!=NULL;i++ ) + _CLDELETE(fields[i]); + _CLDELETE_ARRAY(fields); + } +} +CL_NS_END diff --git a/src/3rdparty/clucene/src/CLucene/search/FieldSortedHitQueue.h b/src/3rdparty/clucene/src/CLucene/search/FieldSortedHitQueue.h new file mode 100644 index 0000000..d7b16ce --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/FieldSortedHitQueue.h @@ -0,0 +1,216 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. +------------------------------------------------------------------------------*/ +#ifndef _lucene_search_FieldSortedHitQueue_ +#define _lucene_search_FieldSortedHitQueue_ + +#if defined(_LUCENE_PRAGMA_ONCE) +# pragma once +#endif + +#include "FieldCache.h" +#include "Sort.h" +#include "FieldDocSortedHitQueue.h" +#include "SearchHeader.h" +#include "FieldCacheImpl.h" +#include "CLucene/util/PriorityQueue.h" + +CL_NS_DEF(search) + + +/** + * Expert: A hit queue for sorting by hits by terms in more than one field. + * Uses <code>FieldCache.DEFAULT</code> for maintaining internal term lookup tables. 
+ * + * @see Searchable#search(Query,Filter,int32_t,Sort) + * @see FieldCache + */ +class FieldSortedHitQueue: public CL_NS(util)::PriorityQueue<FieldDoc*, + CL_NS(util)::Deletor::Object<FieldDoc> > { + + ///the type that is stored in the field cache. can't use a typedef because + ///the decorated name would become too long + class hitqueueCacheReaderType: public CL_NS(util)::CLHashMap<FieldCacheImpl::FileEntry*, + ScoreDocComparator*, + FieldCacheImpl::FileEntry::Compare, + FieldCacheImpl::FileEntry::Equals, + CL_NS(util)::Deletor::Object<FieldCacheImpl::FileEntry>, + CL_NS(util)::Deletor::Object<ScoreDocComparator> >{ + + public: + hitqueueCacheReaderType(bool deleteValue){ + setDeleteKey(true); + setDeleteValue(deleteValue); + } + ~hitqueueCacheReaderType(){ + clear(); + } + + }; + +public: //todo: remove this and below after close callback is implemented + //note: typename gets too long if using cacheReaderType as a typename + typedef CL_NS(util)::CLHashMap<CL_NS(index)::IndexReader*, + hitqueueCacheReaderType*, + CL_NS(util)::Compare::Void<CL_NS(index)::IndexReader>, + CL_NS(util)::Equals::Void<CL_NS(index)::IndexReader>, + CL_NS(util)::Deletor::Object<CL_NS(index)::IndexReader>, + CL_NS(util)::Deletor::Object<hitqueueCacheReaderType> > hitqueueCacheType; + + /** Internal cache of comparators. Similar to FieldCache, only + * caches comparators instead of term values. + */ + static hitqueueCacheType Comparators; +private: + + /** Returns a comparator if it is in the cache.*/ + static ScoreDocComparator* lookup (CL_NS(index)::IndexReader* reader, const TCHAR* field, int32_t type, SortComparatorSource* factory); + + /** Stores a comparator into the cache. + Nothing is returned (the function is declared void).
+ */ + static void store (CL_NS(index)::IndexReader* reader, const TCHAR* field, int32_t type, SortComparatorSource* factory, ScoreDocComparator* value); + + + //todo: Locale locale, not implemented yet + static ScoreDocComparator* getCachedComparator (CL_NS(index)::IndexReader* reader, + const TCHAR* fieldname, int32_t type, SortComparatorSource* factory); + + + /** + * Returns a comparator for sorting hits according to a field containing integers. + * @param reader Index to use. + * @param fieldname Field containing integer values. + * @return Comparator for sorting hits. + * @throws IOException If an error occurs reading the index. + */ + static ScoreDocComparator* comparatorInt (CL_NS(index)::IndexReader* reader, const TCHAR* fieldname); + + /** + * Returns a comparator for sorting hits according to a field containing floats. + * @param reader Index to use. + * @param fieldname Field containing float values. + * @return Comparator for sorting hits. + * @throws IOException If an error occurs reading the index. + */ + static ScoreDocComparator* comparatorFloat (CL_NS(index)::IndexReader* reader, const TCHAR* fieldname); + + /** + * Returns a comparator for sorting hits according to a field containing strings. + * @param reader Index to use. + * @param fieldname Field containing string values. + * @return Comparator for sorting hits. + * @throws IOException If an error occurs reading the index. + */ + static ScoreDocComparator* comparatorString (CL_NS(index)::IndexReader* reader, const TCHAR* fieldname); + + + //todo: + /** + * Returns a comparator for sorting hits according to a field containing strings. + * @param reader Index to use. + * @param fieldname Field containing string values. + * @return Comparator for sorting hits. + * @throws IOException If an error occurs reading the index.
+ + static ScoreDocComparator* comparatorStringLocale (IndexReader* reader, TCHAR* fieldname, Locale locale){ + Collator collator = Collator.getInstance (locale); + TCHAR* field = fieldname.intern(); + TCHAR** index = FieldCache.DEFAULT.getStrings (reader, field); + return _CLNEW ScoreDocComparator() { + + public int32_t compare (ScoreDoc i, ScoreDoc j) { + return collator.compare (index[i.doc], index[j.doc]); + } + + public Comparable sortValue (ScoreDoc i) { + return index[i.doc]; + } + + public int32_t sortType() { + return SortField.STRING; + } + }; + }*/ + + /** + * Returns a comparator for sorting hits according to values in the given field. + * The terms in the field are looked at to determine whether they contain integers, + * floats or strings. Once the type is determined, one of the other static methods + * in this class is called to get the comparator. + * @param reader Index to use. + * @param fieldname Field containing values. + * @return Comparator for sorting hits. + * @throws IOException If an error occurs reading the index. + */ + static ScoreDocComparator* comparatorAuto (CL_NS(index)::IndexReader* reader, const TCHAR* fieldname); + + +protected: + /** Stores a comparator corresponding to each field being sorted by */ + ScoreDocComparator** comparators; + int32_t comparatorsLen; + + /** Stores the sort criteria being used. */ + SortField** fields; + int32_t fieldsLen; + + /** Stores the maximum score value encountered, for normalizing. + * we only care about scores greater than 1.0 - if all the scores + * are less than 1.0, we don't have to normalize. */ + qreal maxscore; + + /** + * Returns whether <code>docA</code> is less relevant than <code>docB</code>. + * @param docA FieldDoc + * @param docB FieldDoc + * @return <code>true</code> if document <code>docA</code> should be sorted after document <code>docB</code>. + */ + bool lessThan (FieldDoc* docA, FieldDoc* docB); +public: + + /** + * Creates a hit queue sorted by the given list of fields.
+ * @param reader Index to use. + * @param fields Sort fields as a NULL-terminated array, in priority order (highest priority first). Cannot be <code>null</code> or empty. + * @param size The number of hits to retain. Must be greater than zero. + * @throws IOException + */ + FieldSortedHitQueue (CL_NS(index)::IndexReader* reader, SortField** fields, int32_t size); + + ~FieldSortedHitQueue(); + + /** + * Callback for when IndexReader closes. This causes + * any Comparators to be removed for the specified reader. + */ + static void closeCallback(CL_NS(index)::IndexReader* reader, void* param); + + /** + * Given a FieldDoc object, stores the values used + * to sort the given document. These values are not the raw + * values out of the index, but the internal representation + * of them. This is so the given search hit can be collated + * by a MultiSearcher with other search hits. + * @param doc The FieldDoc to store sort values into. + * @return The same FieldDoc passed in. + * @see Searchable#search(Query,Filter,int32_t,Sort) + */ + FieldDoc* fillFields (FieldDoc* doc) const; + + void setFields (SortField** fields){ + this->fields = fields; // NOTE(review): only the pointer is replaced; the previous array is not released and fieldsLen is not refreshed here - confirm callers rely on that + } + + /** Returns the SortFields being used by this hit queue. */ + SortField** getFields() { + return fields; + } +}; + + +CL_NS_END +#endif diff --git a/src/3rdparty/clucene/src/CLucene/search/Filter.h b/src/3rdparty/clucene/src/CLucene/search/Filter.h new file mode 100644 index 0000000..309c5a9 --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/Filter.h @@ -0,0 +1,46 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file.
+------------------------------------------------------------------------------*/ +#ifndef _lucene_search_Filter_ +#define _lucene_search_Filter_ + +#if defined(_LUCENE_PRAGMA_ONCE) +# pragma once +#endif + +#include "CLucene/index/IndexReader.h" +#include "CLucene/util/BitSet.h" + +CL_NS_DEF(search) + // Abstract base class providing a mechanism to restrict searches to a subset + // of an index. + class Filter: LUCENE_BASE { + public: + virtual ~Filter(){ + } + + virtual Filter* clone() const = 0; + + /** + * Returns a BitSet with true for documents which should be permitted in + * search results, and false for those that should not. + * MEMORY: read shouldDeleteBitSet + */ + virtual CL_NS(util)::BitSet* bits(CL_NS(index)::IndexReader* reader)=0; + + /** + * Because of the problem of cached bitsets with the CachingWrapperFilter, + * CLucene has no way of knowing whether to delete the bitset returned from bits(). + * To properly clean memory from bits(), pass the bitset to this function. The + * bitset should be deleted if this function returns true. + */ + virtual bool shouldDeleteBitSet(const CL_NS(util)::BitSet* bs) const{ return true; } + + //Creates a user-readable version of this filter and returns it as a string + virtual TCHAR* toString()=0; + }; +CL_NS_END +#endif diff --git a/src/3rdparty/clucene/src/CLucene/search/FilteredTermEnum.cpp b/src/3rdparty/clucene/src/CLucene/search/FilteredTermEnum.cpp new file mode 100644 index 0000000..f90ceea --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/FilteredTermEnum.cpp @@ -0,0 +1,136 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file.
+------------------------------------------------------------------------------*/ +#include "CLucene/StdHeader.h" + +#include "FilteredTermEnum.h" + +CL_NS_USE(index) +CL_NS_DEF(search) + + + FilteredTermEnum::FilteredTermEnum(){ + //Func - Constructor + //Pre - true + //Post - Instance has been created + + currentTerm = NULL; + actualEnum = NULL; + } + + FilteredTermEnum::~FilteredTermEnum() { + //Func - Destructor + //Pre - true + //Post - The instance has been destroyed + + close(); + } + + int32_t FilteredTermEnum::docFreq() const { + //Func - Returns the docFreq of the current Term in the enumeration. + //Pre - next() must have been called at least once + //Post - if actualEnum is NULL result is -1 otherwise the frequency is returned + + if (actualEnum == NULL){ + return -1; + } + return actualEnum->docFreq(); + } + + bool FilteredTermEnum::next() { + //Func - Increments the enumeration to the next element. + //Pre - true + //Post - Returns True if the enumeration has been moved to the next matching element otherwise false + + //The actual enumerator is not initialized! + if (actualEnum == NULL){ + return false; + } + + //Finalize the currentTerm and reset it to NULL + _CLDECDELETE( currentTerm ); + + //Iterate through the enumeration + while (currentTerm == NULL) { + if (endEnum()) + return false; + if (actualEnum->next()) { + //Ask term() not to transfer reference ownership here. + Term* term = actualEnum->term(false); + //Compare the retrieved term + if (termCompare(term)){ + //Matched so finalize the current + _CLDECDELETE(currentTerm); + //Get a reference to the matched term + currentTerm = _CL_POINTER(term); + return true; + } + }else + return false; + } + _CLDECDELETE(currentTerm); + currentTerm = NULL; + + return false; + } + + Term* FilteredTermEnum::term() { + //Func - Returns the current Term in the enumeration.
+ //Pre - next() must have been called at least once + // pointer is true or false + //Post - if pre(pointer) is true the reference counter of currentTerm is increased + // and current Term is returned otherwise currentTerm is only returned + + return _CL_POINTER(currentTerm); + } + Term* FilteredTermEnum::term(bool pointer) { + if ( pointer ) + return _CL_POINTER(currentTerm); + else + return currentTerm; + } + + void FilteredTermEnum::close(){ + //Func - Closes the enumeration to further activity, freeing resources. + //Pre - true + //Post - The Enumeration has been closed + + //Check if actualEnum is valid + if (actualEnum){ + //Close the enumeration + actualEnum->close(); + } + + //Destroy the enumeration + _CLDELETE(actualEnum); + + //Destroy currentTerm + _CLDECDELETE(currentTerm); + } + + void FilteredTermEnum::setEnum(TermEnum* actualEnum) { + //Func - Sets the actual Enumeration + //Pre - actualEnum != NULL + //Post - The instance has been created + + CND_PRECONDITION(actualEnum != NULL,"actualEnum is NULL"); + + _CLDELETE(this->actualEnum); + + this->actualEnum = actualEnum; + + // Find the first term that matches + //Ordered term not to return reference ownership here. + Term* term = actualEnum->term(false); + if (term != NULL && termCompare(term)){ + _CLDECDELETE(currentTerm); + currentTerm = _CL_POINTER(term); + }else{ + next(); + } + } + +CL_NS_END diff --git a/src/3rdparty/clucene/src/CLucene/search/FilteredTermEnum.h b/src/3rdparty/clucene/src/CLucene/search/FilteredTermEnum.h new file mode 100644 index 0000000..035ae38 --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/FilteredTermEnum.h @@ -0,0 +1,61 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. 
+------------------------------------------------------------------------------*/ +#ifndef _lucene_search_FilteredTermEnum_ +#define _lucene_search_FilteredTermEnum_ + +#if defined(_LUCENE_PRAGMA_ONCE) +# pragma once +#endif + +#include "CLucene/index/Term.h" +#include "CLucene/index/Terms.h" + +CL_NS_DEF(search) + //FilteredTermEnum is an abstract class for enumerating a subset of all terms. + // + //Term enumerations are always ordered by term->compareTo(). Each term in + //the enumeration is greater than all that precede it. + + class FilteredTermEnum: public CL_NS(index)::TermEnum { + public: + //Constructor + FilteredTermEnum(); + //Destructor + virtual ~FilteredTermEnum(); + + //Equality measure on the term + virtual qreal difference() = 0; + + //Returns the docFreq of the current Term in the enumeration. + int32_t docFreq() const ; + + //Increments the enumeration to the next element + bool next() ; + + //Returns a pointer to the current Term in the enumeration. + CL_NS(index)::Term* term(); + CL_NS(index)::Term* term(bool pointer); + + //Closes the enumeration to further activity, freeing resources. 
+ void close(); + + protected: + //Equality compare on the term + virtual bool termCompare(CL_NS(index)::Term* term) = 0; + + //Indicates that the end of the enumeration has been reached + virtual bool endEnum() = 0; + + void setEnum(CL_NS(index)::TermEnum* actualEnum) ; + + private: + CL_NS(index)::Term* currentTerm; + CL_NS(index)::TermEnum* actualEnum; + + }; +CL_NS_END +#endif diff --git a/src/3rdparty/clucene/src/CLucene/search/FuzzyQuery.cpp b/src/3rdparty/clucene/src/CLucene/search/FuzzyQuery.cpp new file mode 100644 index 0000000..e95d48d --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/FuzzyQuery.cpp @@ -0,0 +1,357 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. +------------------------------------------------------------------------------*/ +#include "CLucene/StdHeader.h" +#include "FuzzyQuery.h" + +CL_NS_USE(index) +CL_NS_USE(util) +CL_NS_DEF(search) + + /** + * Constructor for enumeration of all terms from specified <code>reader</code> which share a prefix of + * length <code>prefixLength</code> with <code>term</code> and which have a fuzzy similarity > + * <code>minSimilarity</code>. + * + * @param reader Delivers terms. + * @param term Pattern term. + * @param minSimilarity Minimum required similarity for terms from the reader. Default value is 0.5f. + * @param prefixLength Length of required common prefix. Default value is 0. 
+ * @throws IOException + */ + FuzzyTermEnum::FuzzyTermEnum(const IndexReader* reader, Term* term, qreal minSimilarity, size_t prefixLength): + distance(0), + _endEnum(false), + prefix(LUCENE_BLANK_STRING), + prefixLength(0), + minimumSimilarity(minSimilarity) + { + //Func - Constructor + //Pre - reader contains a valid reference to an IndexReader + // term != NULL + //Post - The instance has been created + + CND_PRECONDITION(term != NULL,"term is NULL"); + + scale_factor = 1.0f / (1.0f - minimumSimilarity); + searchTerm = _CL_POINTER(term); + + text = STRDUP_TtoT(term->text()); + textLen = term->textLength(); + + + //Initialize e to NULL + e = NULL; + eWidth = 0; + eHeight = 0; + + if(prefixLength > 0 && prefixLength < textLen){ + this->prefixLength = prefixLength; + + prefix = _CL_NEWARRAY(TCHAR,prefixLength+1); + _tcsncpy(prefix,text,prefixLength); + prefix[prefixLength]='\0'; + + //Keep only the pattern text that follows the prefix: termCompare() matches it against the suffix of each candidate + textLen -= prefixLength; + for (size_t k = 0; k <= textLen; k++) text[k] = text[k + prefixLength]; //shifts the terminator too + } + + + //Set the enumeration + Term* trm = _CLNEW Term(term, prefix); + setEnum(reader->terms(trm)); + _CLDECDELETE(trm); + } + + FuzzyTermEnum::~FuzzyTermEnum(){ + //Func - Destructor + //Pre - true + //Post - FuzzyTermEnum has been destroyed + + //Close the enumeration + close(); + } + + bool FuzzyTermEnum::endEnum() { + //Func - Returns the fact if the current term in the enumeration has reached the end + //Pre - true + //Post - The boolean value of endEnum has been returned + + return _endEnum; + } + + void FuzzyTermEnum::close(){ + //Func - Close the enumeration + //Pre - true + //Post - The enumeration has been closed + + FilteredTermEnum::close(); + + //Finalize the searchTerm + _CLDECDELETE(searchTerm); + //Destroy e + _CLDELETE_ARRAY(e); + + _CLDELETE_CARRAY(text); + + if ( prefix != LUCENE_BLANK_STRING ) + _CLDELETE_CARRAY(prefix); + } + + bool FuzzyTermEnum::termCompare(Term* term) { + //Func - Compares term with the searchTerm using the Levenshtein distance. 
+ //Pre - term is NULL or term points to a Term + //Post - if pre(term) is NULL then false is returned otherwise + // if the field or the prefix of term differs, _endEnum is set and false is returned, otherwise + // true is returned when the similarity of term is bigger than minimumSimilarity + + if (term == NULL){ + return false; //Note that endEnum is not set to true! + } + + const TCHAR* termText = term->text(); + size_t termTextLen = term->textLength(); + + //Check if the field name of searchTerm of term match + //(we can use == because fields are interned) + if ( searchTerm->field() == term->field() && + (prefixLength==0 || _tcsncmp(termText,prefix,prefixLength)==0 )) { + + const TCHAR* target = termText+prefixLength; + size_t targetLen = termTextLen-prefixLength; + + //Calculate the Levenshtein distance + int32_t dist = editDistance(text, target, textLen, targetLen); + distance = 1 - ((qreal)dist / (qreal)min(textLen, targetLen)); + return (distance > minimumSimilarity); + } + _endEnum = true; + return false; + } + + qreal FuzzyTermEnum::difference() { + //Func - Returns the difference between the distance and the fuzzy threshold + // multiplied by the scale factor + //Pre - true + //Post - The difference is returned + + return (qreal)((distance - minimumSimilarity) * scale_factor ); + } + + + /** Finds and returns the smallest of three integers + precondition: Must define int32_t __t for temporary storage and result + */ + #define min3(a, b, c) __t = ((a) < (b)) ? (a) : (b); __t = (__t < (c)) ? __t : (c); + + int32_t FuzzyTermEnum::editDistance(const TCHAR* s, const TCHAR* t, const int32_t n, const int32_t m) { + //Func - Calculates the Levenshtein distance also known as edit distance is a measure of similarity + // between two strings where the distance is measured as the number of character + // deletions, insertions or substitutions required to transform one string to + // the other string. 
+ //Pre - s != NULL and contains the source string + // t != NULL and contains the target string + // n >= 0 and contains the length of the source string + // m >= 0 and contains the length of the target string + //Post - The distance has been returned + + CND_PRECONDITION(s != NULL, "s is NULL"); + CND_PRECONDITION(t != NULL, "t is NULL"); + CND_PRECONDITION(n >= 0," n is a negative number"); + CND_PRECONDITION(m >= 0," m is a negative number"); + + int32_t i; // iterates through s + int32_t j; // iterates through t + TCHAR s_i; // ith character of s + + if (n == 0) + return m; + if (m == 0) + return n; + + //Check if the array must be reallocated because it is too small or does not exist + if (e == NULL || eWidth <= n || eHeight <= m) { + //Delete e if possible + _CLDELETE_ARRAY(e); + //resize e + eWidth = max(eWidth, n+1); + eHeight = max(eHeight, m+1); + e = _CL_NEWARRAY(int32_t,eWidth*eHeight); + } + + CND_CONDITION(e != NULL,"e is NULL"); + + // init matrix e + for (i = 0; i <= n; i++){ + e[i + (0*eWidth)] = i; + } + for (j = 0; j <= m; j++){ + e[0 + (j*eWidth)] = j; + } + + int32_t __t; //temporary variable for min3 + + // start computing edit distance + for (i = 1; i <= n; i++) { + s_i = s[i - 1]; + for (j = 1; j <= m; j++) { + if (s_i != t[j-1]){ + min3(e[i + (j*eWidth) - 1], e[i + ((j-1)*eWidth)], e[i + ((j-1)*eWidth)-1]); + e[i + (j*eWidth)] = __t+1; + }else{ + min3(e[i + (j*eWidth) -1]+1, e[i + ((j-1)*eWidth)]+1, e[i + ((j-1)*eWidth)-1]); + e[i + (j*eWidth)] = __t; + } + } + } + + // we got the result! + return e[n + ((m)*eWidth)]; + } + + + /** + * Create a new FuzzyQuery that will match terms with a similarity + * of at least <code>minimumSimilarity</code> to <code>term</code>. + * If a <code>prefixLength</code> > 0 is specified, a common prefix + * of that length is also required. 
+ * + * @param term the term to search for + * @param minimumSimilarity a value between 0 and 1 to set the required similarity + * between the query term and the matching terms. For example, for a + * <code>minimumSimilarity</code> of <code>0.5</code> a term of the same length + * as the query term is considered similar to the query term if the edit distance + * between both terms is less than <code>length(term)*0.5</code> + * @param prefixLength length of common (non-fuzzy) prefix + * @throws IllegalArgumentException if minimumSimilarity is > 1 or < 0 + * or if prefixLength >= <code>term.textLength()</code>. + */ + FuzzyQuery::FuzzyQuery(Term* term, qreal minimumSimilarity, size_t prefixLength): + MultiTermQuery(term) + { + //Func - Constructor + //Pre - term != NULL + //Post - The instance has been created + + CND_PRECONDITION(term != NULL,"term is NULL"); + + if (minimumSimilarity > 1.0f) + _CLTHROWA(CL_ERR_IllegalArgument,"minimumSimilarity > 1"); + else if (minimumSimilarity < 0.0f) + _CLTHROWA(CL_ERR_IllegalArgument,"minimumSimilarity < 0"); + + this->minimumSimilarity = minimumSimilarity; + + if(prefixLength >= term->textLength()) + _CLTHROWA(CL_ERR_IllegalArgument,"prefixLength >= term.textLength()"); + this->prefixLength = prefixLength; + + } + + + qreal FuzzyQuery::defaultMinSimilarity = 0.5f; + + FuzzyQuery::~FuzzyQuery(){ + //Func - Destructor + //Pre - true + //Post - Instance has been destroyed + } + + TCHAR* FuzzyQuery::toString(const TCHAR* field) const{ + //Func - Returns the query string + //Pre - field != NULL + //Post - The query string has been returned + + CND_PRECONDITION(field != NULL,"field is NULL"); + + StringBuffer buffer; + const TCHAR* b = MultiTermQuery::toString(field); + + buffer.append ( b ); + _CLDELETE_CARRAY(b); + buffer.append( _T("~") ); + + buffer.appendFloat(minimumSimilarity,1); + + return buffer.toString(); + } + + const TCHAR* FuzzyQuery::getQueryName() const{ + //Func - Returns the name of the query + //Pre - 
true + //post - The string FuzzyQuery has been returned + + return getClassName(); + } + const TCHAR* FuzzyQuery::getClassName(){ + //Func - Returns the name of the query + //Pre - true + //post - The string FuzzyQuery has been returned + + return _T("FuzzyQuery"); + } + + + /** + * Returns the minimum similarity that is required for this query to match. + * @return float value between 0.0 and 1.0 + */ + qreal FuzzyQuery::getMinSimilarity() const { + return minimumSimilarity; + } + + FuzzyQuery::FuzzyQuery(const FuzzyQuery& clone): + MultiTermQuery(clone) + { + this->minimumSimilarity = clone.getMinSimilarity(); + this->prefixLength = clone.getPrefixLength(); + + //if(prefixLength < 0) + // _CLTHROWA(CL_ERR_IllegalArgument,"prefixLength < 0"); + //else + if(prefixLength >= clone.getTerm()->textLength()) + _CLTHROWA(CL_ERR_IllegalArgument,"prefixLength >= term.textLength()"); + + } + + Query* FuzzyQuery::clone() const{ + return _CLNEW FuzzyQuery(*this); + } + size_t FuzzyQuery::hashCode() const{ + //todo: we should give the query a seeding value... but + //need to do it for all hashCode functions + size_t val = Similarity::floatToByte(getBoost()) ^ getTerm()->hashCode(); + val ^= Similarity::floatToByte(this->getMinSimilarity()); + val ^= this->getPrefixLength(); + return val; + } + bool FuzzyQuery::equals(Query* other) const{ + if (!(other->instanceOf(FuzzyQuery::getClassName()))) + return false; + + FuzzyQuery* fq = (FuzzyQuery*)other; + return (this->getBoost() == fq->getBoost()) + && this->getMinSimilarity() == fq->getMinSimilarity() + && this->getPrefixLength() == fq->getPrefixLength() + && getTerm()->equals(fq->getTerm()); + } + + /** + * Returns the prefix length, i.e. the number of characters at the start + * of a term that must be identical (not fuzzy) to the query term if the query + * is to match that term. 
+ */ + size_t FuzzyQuery::getPrefixLength() const { + return prefixLength; + } + + FilteredTermEnum* FuzzyQuery::getEnum(IndexReader* reader){ + Term* term = getTerm(false); + FuzzyTermEnum* ret = _CLNEW FuzzyTermEnum(reader, term, minimumSimilarity, prefixLength); + return ret; + } + +CL_NS_END diff --git a/src/3rdparty/clucene/src/CLucene/search/FuzzyQuery.h b/src/3rdparty/clucene/src/CLucene/search/FuzzyQuery.h new file mode 100644 index 0000000..e58637b --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/FuzzyQuery.h @@ -0,0 +1,156 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. +------------------------------------------------------------------------------*/ +#ifndef _lucene_search_FuzzyQuery_ +#define _lucene_search_FuzzyQuery_ + +#if defined(_LUCENE_PRAGMA_ONCE) +# pragma once +#endif + +#include "CLucene/index/IndexReader.h" +#include "CLucene/index/Term.h" +#include "MultiTermQuery.h" + + +CL_NS_DEF(search) + + // class FuzzyQuery implements the fuzzy search query + class FuzzyQuery: public MultiTermQuery { + private: + qreal minimumSimilarity; + size_t prefixLength; + protected: + FuzzyQuery(const FuzzyQuery& clone); + public: + static qreal defaultMinSimilarity; + + /** + * Create a new FuzzyQuery that will match terms with a similarity + * of at least <code>minimumSimilarity</code> to <code>term</code>. + * If a <code>prefixLength</code> > 0 is specified, a common prefix + * of that length is also required. + * + * @param term the term to search for + * @param minimumSimilarity a value between 0 and 1 to set the required similarity + * between the query term and the matching terms. 
For example, for a + * <code>minimumSimilarity</code> of <code>0.5</code> a term of the same length + * as the query term is considered similar to the query term if the edit distance + * between both terms is less than <code>length(term)*0.5</code> + * @param prefixLength length of common (non-fuzzy) prefix + * @throws IllegalArgumentException if minimumSimilarity is > 1 or < 0 + * or if prefixLength >= <code>term.textLength()</code>. + */ + FuzzyQuery(CL_NS(index)::Term* term, qreal minimumSimilarity=defaultMinSimilarity, size_t prefixLength=0); + //Destructor + ~FuzzyQuery(); + + TCHAR* toString(const TCHAR* field) const; + + //Returns the name "FuzzyQuery" + static const TCHAR* getClassName(); + const TCHAR* getQueryName() const; + + Query* clone() const; + bool equals(Query * other) const; + size_t hashCode() const; + + /** + * Returns the minimum similarity that is required for this query to match. + * @return float value between 0.0 and 1.0 + */ + qreal getMinSimilarity() const; + + /** + * Returns the prefix length, i.e. the number of characters at the start + * of a term that must be identical (not fuzzy) to the query term if the query + * is to match that term. + */ + size_t getPrefixLength() const; + + protected: + FilteredTermEnum* getEnum(CL_NS(index)::IndexReader* reader); + }; + + /** FuzzyTermEnum is a subclass of FilteredTermEnum for enumerating all + * terms that are similar to the specified filter term. + * + * Term enumerations are always ordered by Term.compareTo(). Each term in + * the enumeration is greater than all that precede it. + */ + class FuzzyTermEnum: public FilteredTermEnum { + private: + qreal distance; + bool _endEnum; + + CL_NS(index)::Term* searchTerm; + TCHAR* text; + size_t textLen; + TCHAR* prefix; + size_t prefixLength; + qreal minimumSimilarity; + double scale_factor; + + + /** + * This cached array saves us from the time required to create a new array + * every time editDistance is called. 
+ */ + int32_t* e; + int32_t eWidth; + int32_t eHeight; + + /****************************** + * Compute Levenshtein distance + ******************************/ + + /** + Levenshtein distance also known as edit distance is a measure of similarity + between two strings where the distance is measured as the number of character + deletions, insertions or substitutions required to transform one string to + the other string. + <p>This method takes in four parameters; two strings and their respective + lengths to compute the Levenshtein distance between the two strings. + The result is returned as an integer. + */ + int32_t editDistance(const TCHAR* s, const TCHAR* t, const int32_t n, const int32_t m) ; + + protected: + /** + The termCompare method in FuzzyTermEnum uses Levenshtein distance to + calculate the distance between the given term and the comparing term. + */ + bool termCompare(CL_NS(index)::Term* term) ; + + ///Returns the fact if the current term in the enumeration has reached the end + bool endEnum(); + public: + + /** + * By default an empty prefix and FuzzyQuery::defaultMinSimilarity are used. 
+ * + * @param reader + * @param term + * @throws IOException + * @see FuzzyQuery#getEnum(IndexReader) + */ + FuzzyTermEnum(const CL_NS(index)::IndexReader* reader, CL_NS(index)::Term* term, qreal minSimilarity=FuzzyQuery::defaultMinSimilarity, size_t prefixLength=0); + /** Destructor */ + ~FuzzyTermEnum(); + /** Close the enumeration */ + void close(); + + /** Returns the difference between the distance and the fuzzy threshold + * multiplied by the scale factor + */ + qreal difference(); + + + const char* getObjectName(){ return FuzzyTermEnum::getClassName(); } + static const char* getClassName(){ return "FuzzyTermEnum"; } + }; +CL_NS_END +#endif diff --git a/src/3rdparty/clucene/src/CLucene/search/HitQueue.cpp b/src/3rdparty/clucene/src/CLucene/search/HitQueue.cpp new file mode 100644 index 0000000..c9aecc6 --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/HitQueue.cpp @@ -0,0 +1,107 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. 
+------------------------------------------------------------------------------*/ +#include "CLucene/StdHeader.h" +#include "HitQueue.h" + +CL_NS_DEF(search) + +void HitQueue::upHeap(){ + size_t i = _size; + ScoreDoc node = heap[i]; // save bottom node (WAS object) + int32_t j = ((uint32_t)i) >> 1; + while (j > 0 && lessThan(node,heap[j])) { + heap[i] = heap[j]; // shift parents down + i = j; + j = ((uint32_t)j) >> 1; + } + heap[i] = node; // install saved node +} +void HitQueue::downHeap(){ + size_t i = 1; + ScoreDoc node = heap[i]; // save top node + size_t j = i << 1; // find smaller child + size_t k = j + 1; + if (k <= _size && lessThan(heap[k], heap[j])) { + j = k; + } + while (j <= _size && lessThan(heap[j],node)) { + heap[i] = heap[j]; // shift up child + i = j; + j = i << 1; + k = j + 1; + if (k <= _size && lessThan(heap[k], heap[j])) { + j = k; + } + } + heap[i] = node; // install saved node +} + +void HitQueue::adjustTop(){ + downHeap(); +} +size_t HitQueue::size(){ + return _size; +} + +struct ScoreDoc& HitQueue::top(){ + if ( _size == 0 ) + _CLTHROWA(CL_ERR_IndexOutOfBounds, "Attempted to access empty hitqueue::top"); + return heap[1]; +} + +void HitQueue::put(struct ScoreDoc& element){ + if ( _size>=maxSize ) + _CLTHROWA(CL_ERR_IndexOutOfBounds,"add is out of bounds"); + + _size++; + heap[_size] = element; + upHeap(); +} + +ScoreDoc HitQueue::pop(){ + if (_size > 0) { + ScoreDoc result = heap[1]; // save first value + heap[1] = heap[_size]; // move last to first + + _size--; + downHeap(); // adjust heap + return result; + } else + _CLTHROWA(CL_ERR_IndexOutOfBounds, "Attempted to access empty hitqueue::pop"); +} + +bool HitQueue::insert(struct ScoreDoc& element){ + if(_size < maxSize){ + put(element); + return true; + }else if(_size > 0 && !lessThan(element, heap[1])){ + heap[1] = element; + adjustTop(); + return true; + }else + return false; +} + +HitQueue::HitQueue(const int32_t maxSize){ + _size = 0; + this->maxSize = maxSize; + int32_t heapSize = 
maxSize + 1; + heap = _CL_NEWARRAY(ScoreDoc,heapSize); +} +HitQueue::~HitQueue(){ + _CLDELETE_ARRAY(heap); +} + +bool HitQueue::lessThan(struct ScoreDoc& hitA, struct ScoreDoc& hitB){ + if (hitA.score == hitB.score) + return hitA.doc > hitB.doc; + else + return hitA.score < hitB.score; +} + + +CL_NS_END diff --git a/src/3rdparty/clucene/src/CLucene/search/HitQueue.h b/src/3rdparty/clucene/src/CLucene/search/HitQueue.h new file mode 100644 index 0000000..0bd196a --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/HitQueue.h @@ -0,0 +1,55 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. +------------------------------------------------------------------------------*/ +#ifndef _lucene_search_HitQueue_ +#define _lucene_search_HitQueue_ + +#if defined(_LUCENE_PRAGMA_ONCE) +# pragma once +#endif + +#include "SearchHeader.h" + +CL_NS_DEF(search) + +/** +* An optimised PriorityQueue which takes ScoreDoc structs. Some by-ref passing +* and memory related optimisations have been done. +*/ +class HitQueue: LUCENE_BASE { +private: + ScoreDoc* heap; + size_t _size; + size_t maxSize; + + void upHeap(); + void downHeap(); + +protected: + bool lessThan(struct ScoreDoc& hitA, struct ScoreDoc& hitB); + +public: + void adjustTop(); + struct ScoreDoc& top(); + void put(struct ScoreDoc& element); + ScoreDoc pop(); + /** + * Adds element to the PriorityQueue in log(size) time if either + * the PriorityQueue is not full, or not lessThan(element, top()). + * @param element + * @return true if element is added, false otherwise. + */ + bool insert(struct ScoreDoc& element); + /** + * Returns the number of elements currently stored in the PriorityQueue. 
+ */ + size_t size(); + HitQueue(const int32_t maxSize); + ~HitQueue(); + +}; +CL_NS_END +#endif diff --git a/src/3rdparty/clucene/src/CLucene/search/Hits.cpp b/src/3rdparty/clucene/src/CLucene/search/Hits.cpp new file mode 100644 index 0000000..38c489f --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/Hits.cpp @@ -0,0 +1,174 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. +------------------------------------------------------------------------------*/ +#include "CLucene/StdHeader.h" + +#include "SearchHeader.h" +#include "CLucene/document/Document.h" +#include "CLucene/index/IndexReader.h" +#include "Filter.h" +#include "CLucene/search/SearchHeader.h" + +CL_NS_USE(document) +CL_NS_USE(util) +CL_NS_USE(index) + +CL_NS_DEF(search) + + HitDoc::HitDoc(const qreal s, const int32_t i) + { + //Func - Constructor + //Pre - true + //Post - The instance has been created + + next = NULL; + prev = NULL; + doc = NULL; + score = s; + id = i; + } + + HitDoc::~HitDoc(){ + //Func - Destructor + //Pre - true + //Post - The instance has been destroyed + + _CLDELETE(doc); + } + + + Hits::Hits(Searcher* s, Query* q, Filter* f, const Sort* _sort): + query(q), searcher(s), filter(f), sort(_sort) + { + //Func - Constructor + //Pre - s contains a valid reference to a searcher s + // q contains a valid reference to a Query + // f is NULL or contains a pointer to a filter + //Post - The instance has been created + + _length = 0; + first = NULL; + last = NULL; + numDocs = 0; + maxDocs = 200; + + //retrieve up to 100 hits initially (getMoreDocs doubles the requested minimum) + getMoreDocs(50); + } + + Hits::~Hits(){ + + } + int32_t Hits::length() const { + return _length; + } + + Document& Hits::doc(const int32_t n){ + HitDoc* 
hitDoc = getHitDoc(n); + + // Update LRU cache of 
documents + remove(hitDoc); // remove from list, if there + addToFront(hitDoc); // add to front of list + if (numDocs > maxDocs) { // if cache is full + HitDoc* oldLast = last; + remove(last); // flush last + + _CLDELETE( oldLast->doc ); + oldLast->doc = NULL; + } + + if (hitDoc->doc == NULL){ + hitDoc->doc = _CLNEW Document; + searcher->doc(hitDoc->id, hitDoc->doc); // cache miss: read document + } + + return *hitDoc->doc; + } + + int32_t Hits::id (const int32_t n){ + return getHitDoc(n)->id; + } + + qreal Hits::score(const int32_t n){ + return getHitDoc(n)->score; + } + + void Hits::getMoreDocs(const size_t m){ + size_t _min = m; + { + size_t nHits = hitDocs.size(); + if ( nHits > _min) + _min = nHits; + } + + size_t n = _min * 2; // double # retrieved + TopDocs* topDocs = NULL; + if ( sort==NULL ) + topDocs = (TopDocs*)((Searchable*)searcher)->_search(query, filter, n); + else + topDocs = (TopDocs*)((Searchable*)searcher)->_search(query, filter, n, sort); + _length = topDocs->totalHits; + ScoreDoc* scoreDocs = topDocs->scoreDocs; + int32_t scoreDocsLength = topDocs->scoreDocsLength; + + qreal scoreNorm = 1.0f; + //Check that scoreDocs is a valid pointer before using it + if (scoreDocs != NULL){ + if (_length > 0 && scoreDocs[0].score > 1.0f){ + scoreNorm = 1.0f / scoreDocs[0].score; + } + + int32_t end = scoreDocsLength < _length ? 
scoreDocsLength : _length; + for (int32_t i = hitDocs.size(); i < end; i++) { + hitDocs.push_back(_CLNEW HitDoc(scoreDocs[i].score*scoreNorm, scoreDocs[i].doc)); + } + } + + _CLDELETE(topDocs); + } + + HitDoc* Hits::getHitDoc(const size_t n){ + if (n >= _length){ + TCHAR buf[100]; + _sntprintf(buf, 100,_T("Not a valid hit number: %d"),(int32_t)n); //n is a size_t: cast it to match %d + _CLTHROWT(CL_ERR_IndexOutOfBounds, buf ); + } + if (n >= hitDocs.size()) + getMoreDocs(n); + + return hitDocs[n]; + } + + void Hits::addToFront(HitDoc* hitDoc) { // insert at front of cache + if (first == NULL) + last = hitDoc; + else + first->prev = hitDoc; + + hitDoc->next = first; + first = hitDoc; + hitDoc->prev = NULL; + + numDocs++; + } + + void Hits::remove(const HitDoc* hitDoc) { // remove from cache + if (hitDoc->doc == NULL) // it's not in the list + return; // abort + + if (hitDoc->next == NULL) + last = hitDoc->prev; + else + hitDoc->next->prev = hitDoc->prev; + + if (hitDoc->prev == NULL) + first = hitDoc->next; + else + hitDoc->prev->next = hitDoc->next; + + numDocs--; + } +CL_NS_END diff --git a/src/3rdparty/clucene/src/CLucene/search/IndexSearcher.cpp b/src/3rdparty/clucene/src/CLucene/search/IndexSearcher.cpp new file mode 100644 index 0000000..c948cfa --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/IndexSearcher.cpp @@ -0,0 +1,362 @@ +/* + * Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team + * + * Distributable under the terms of either the Apache License (Version 2.0) or + * the GNU Lesser General Public License, as specified in the COPYING file. + * + * Changes are Copyright(C) 2007, 2008 by Nokia Corporation and/or its subsidiary(-ies), all rights reserved. 
+*/ +#include "CLucene/StdHeader.h" +#include "IndexSearcher.h" + +#include "SearchHeader.h" +#include "Scorer.h" +#include "FieldDocSortedHitQueue.h" +#include "CLucene/store/Directory.h" +#include "CLucene/document/Document.h" +#include "CLucene/index/IndexReader.h" +#include "CLucene/index/Term.h" +#include "CLucene/util/BitSet.h" +#include "FieldSortedHitQueue.h" + +CL_NS_USE(index) +CL_NS_USE(util) +CL_NS_USE(document) + +CL_NS_DEF(search) + +class SimpleTopDocsCollector : public HitCollector +{ +private: + qreal minScore; + const CL_NS(util)::BitSet* bits; + HitQueue* hq; + size_t nDocs; + int32_t* totalHits; + +public: + SimpleTopDocsCollector(const CL_NS(util)::BitSet* bs, HitQueue* hitQueue, + int32_t* totalhits, size_t ndocs, const qreal ms=-1.0f) + : minScore(ms), + bits(bs), + hq(hitQueue), + nDocs(ndocs), + totalHits(totalhits) {} + ~SimpleTopDocsCollector() {} + + void collect(const int32_t doc, const qreal score) + { + if (score > 0.0f // ignore zeroed buckets + && (bits == NULL || bits->get(doc))) { // skip docs not in bits + ++totalHits[0]; + if (hq->size() < nDocs || (minScore==-1.0f || score >= minScore)) { + ScoreDoc sd = {doc, score}; + hq->insert(sd); // update hit queue + if ( minScore != -1.0f ) + minScore = hq->top().score; // maintain minScore + } + } + } +}; + +class SortedTopDocsCollector : public HitCollector +{ +private: + const CL_NS(util)::BitSet* bits; + FieldSortedHitQueue* hq; + size_t nDocs; + int32_t* totalHits; +public: + SortedTopDocsCollector(const CL_NS(util)::BitSet* bs, + FieldSortedHitQueue* hitQueue, int32_t* totalhits, size_t _nDocs) + : bits(bs), + hq(hitQueue), + nDocs(_nDocs), + totalHits(totalhits) + { + } + ~SortedTopDocsCollector() {} + + void collect(const int32_t doc, const qreal score) + { + if (score > 0.0f && // ignore zeroed buckets + (bits==NULL || bits->get(doc))) { // skip docs not in bits + ++totalHits[0]; + // TODO: see jlucene way... with fields def??? 
+ FieldDoc* fd = _CLNEW FieldDoc(doc, score); + if ( !hq->insert(fd) ) // update hit queue + _CLDELETE(fd); + } + } +}; + +class SimpleFilteredCollector : public HitCollector +{ +private: + CL_NS(util)::BitSet* bits; + HitCollector* results; +public: + SimpleFilteredCollector(CL_NS(util)::BitSet* bs, HitCollector* collector) + : bits(bs), + results(collector) {} + ~SimpleFilteredCollector() {} + +protected: + void collect(const int32_t doc, const qreal score) + { + // skip docs not in bits + if (bits->get(doc)) + results->collect(doc, score); + } +}; + + +IndexSearcher::IndexSearcher(const QString& path) +{ + //Func - Constructor + // Creates a searcher searching the index in the named directory. + //Pre - path is not empty + //Post - The instance has been created + + CND_PRECONDITION(!path.isEmpty(), "path is NULL"); + + reader = IndexReader::open(path); + readerOwner = true; +} + +IndexSearcher::IndexSearcher(CL_NS(store)::Directory* directory) +{ + //Func - Constructor + // Creates a searcher searching the index in the specified directory. + //Pre - directory != NULL + //Post - The instance has been created + + CND_PRECONDITION(directory != NULL, "directory is NULL"); + + reader = IndexReader::open(directory); + readerOwner = true; +} + +IndexSearcher::IndexSearcher(IndexReader* r) +{ + //Func - Constructor + // Creates a searcher searching the index with the provided IndexReader + //Pre - r is the reader to use (not checked here); it is not owned by this searcher + //Post - The instance has been created + + reader = r; + readerOwner = false; +} + +IndexSearcher::~IndexSearcher() +{ + //Func - Destructor + //Pre - true + //Post - The instance has been destroyed + + close(); +} + +void IndexSearcher::close() +{ + //Func - Frees resources associated with this Searcher. 
+ //Pre - true + //Post - The resources associated have been freed + if (readerOwner && reader){ + reader->close(); + _CLDELETE(reader); + } +} + +// inherit javadoc +int32_t IndexSearcher::docFreq(const Term* term) const +{ + //Func - + //Pre - reader != NULL + //Post - + + CND_PRECONDITION(reader != NULL, "reader is NULL"); + return reader->docFreq(term); +} + +// inherit javadoc +bool IndexSearcher::doc(int32_t i, CL_NS(document)::Document* d) +{ + //Func - Retrieves i-th document found + // For use by HitCollector implementations. + //Pre - reader != NULL + //Post - The i-th document has been returned + + CND_PRECONDITION(reader != NULL, "reader is NULL"); + return reader->document(i,d); +} + +// inherit javadoc +int32_t IndexSearcher::maxDoc() const +{ + //Func - Return total number of documents including the ones marked deleted + //Pre - reader != NULL + //Post - The total number of documents including the ones marked deleted + // has been returned + + CND_PRECONDITION(reader != NULL, "reader is NULL"); + return reader->maxDoc(); +} + +TopDocs* IndexSearcher::_search(Query* query, Filter* filter, const int32_t nDocs) +{ + //Func - + //Pre - reader != NULL + //Post - + + CND_PRECONDITION(reader != NULL, "reader is NULL"); + CND_PRECONDITION(query != NULL, "query is NULL"); + + Weight* weight = query->weight(this); + Scorer* scorer = weight->scorer(reader); + if (scorer == NULL){ + //No scorer: release the weight (and a re-written query) exactly as the normal path below does + Query* wq = weight->getQuery(); + if ( query != wq ) //query was re-written + _CLLDELETE(wq); + _CLDELETE(weight); + return _CLNEW TopDocs(0, NULL, 0); + } + + BitSet* bits = filter != NULL ? 
filter->bits(reader) : NULL; + HitQueue* hq = _CLNEW HitQueue(nDocs); + + //Check hq has been allocated properly + CND_CONDITION(hq != NULL, "Could not allocate memory for HitQueue hq"); + + int32_t* totalHits = _CL_NEWARRAY(int32_t,1); + totalHits[0] = 0; + + SimpleTopDocsCollector hitCol(bits,hq,totalHits,nDocs,0.0f); + scorer->score( &hitCol ); + _CLDELETE(scorer); + + int32_t scoreDocsLength = hq->size(); + + ScoreDoc* scoreDocs = _CL_NEWARRAY(ScoreDoc,scoreDocsLength); + + for (int32_t i = scoreDocsLength-1; i >= 0; --i) // put docs in array + scoreDocs[i] = hq->pop(); + + int32_t totalHitsInt = totalHits[0]; + + _CLDELETE(hq); + if ( bits != NULL && filter->shouldDeleteBitSet(bits) ) + _CLDELETE(bits); + _CLDELETE_ARRAY(totalHits); + Query* wq = weight->getQuery(); + if ( query != wq ) //query was re-written + _CLLDELETE(wq); + _CLDELETE(weight); + + return _CLNEW TopDocs(totalHitsInt, scoreDocs, scoreDocsLength); +} + +// inherit javadoc +TopFieldDocs* IndexSearcher::_search(Query* query, Filter* filter, + const int32_t nDocs, const Sort* sort) +{ + CND_PRECONDITION(reader != NULL, "reader is NULL"); + CND_PRECONDITION(query != NULL, "query is NULL"); + + Weight* weight = query->weight(this); + Scorer* scorer = weight->scorer(reader); + if (scorer == NULL) { + return _CLNEW TopFieldDocs(0, NULL, 0, NULL ); + } + + BitSet* bits = filter != NULL ? 
filter->bits(reader) : NULL; + FieldSortedHitQueue hq(reader, sort->getSort(), nDocs); + int32_t* totalHits = _CL_NEWARRAY(int32_t,1); + totalHits[0]=0; + + SortedTopDocsCollector hitCol(bits,&hq,totalHits,nDocs); + scorer->score(&hitCol); + _CLDELETE(scorer); + + int32_t hqLen = hq.size(); + FieldDoc** fieldDocs = _CL_NEWARRAY(FieldDoc*,hqLen); + for (int32_t i = hqLen-1; i >= 0; --i){ // put docs in array + fieldDocs[i] = hq.fillFields (hq.pop()); + } + + Query* wq = weight->getQuery(); + if ( query != wq ) //query was re-written + _CLLDELETE(wq); + _CLDELETE(weight); + + SortField** hqFields = hq.getFields(); + hq.setFields(NULL); //move ownership of memory over to TopFieldDocs + int32_t totalHits0 = totalHits[0]; + if ( bits != NULL && filter->shouldDeleteBitSet(bits) ) + _CLDELETE(bits); + _CLDELETE_ARRAY(totalHits); + return _CLNEW TopFieldDocs(totalHits0, fieldDocs, hqLen, hqFields ); +} + +void IndexSearcher::_search(Query* query, Filter* filter, HitCollector* results) +{ + //Func - _search an index and fetch the results + // Applications should only use this if they need all of the + // matching documents. The high-level search API (search(Query)) + // is usually more efficient, as it skips non-high-scoring hits. 
+ //Pre - query is a valid reference to a query filter may or may not be NULL + // results is a valid reference to a HitCollector and used to store the results + //Post - filter if non-NULL, a bitset used to eliminate some documents + + CND_PRECONDITION(reader != NULL, "reader is NULL"); + CND_PRECONDITION(query != NULL, "query is NULL"); + + BitSet* bits = NULL; + SimpleFilteredCollector* fc = NULL; + + if (filter != NULL){ + bits = filter->bits(reader); + fc = _CLNEW SimpleFilteredCollector(bits, results); + } + + Weight* weight = query->weight(this); + Scorer* scorer = weight->scorer(reader); + if (scorer != NULL) { + if (fc == NULL){ + scorer->score(results); + }else{ + scorer->score((HitCollector*)fc); + } + _CLDELETE(scorer); + } + + _CLDELETE(fc); + _CLDELETE(weight); + if ( bits != NULL && filter->shouldDeleteBitSet(bits) ) + _CLDELETE(bits); +} + +Query* IndexSearcher::rewrite(Query* original) +{ + Query* query = original; + Query* last = original; + for (Query* rewrittenQuery = query->rewrite(reader); + rewrittenQuery != query; + rewrittenQuery = query->rewrite(reader)) { + query = rewrittenQuery; + if ( query != last && last != original) { + _CLDELETE(last); + } + last = query; + } + return query; +} + +void IndexSearcher::explain(Query* query, int32_t doc, Explanation* ret) +{ + Weight* weight = query->weight(this); + weight->explain(reader, doc, ret); + + Query* wq = weight->getQuery(); + if ( query != wq ) //query was re-written + _CLLDELETE(wq); + _CLDELETE(weight); +} + +CL_NS_END diff --git a/src/3rdparty/clucene/src/CLucene/search/IndexSearcher.h b/src/3rdparty/clucene/src/CLucene/search/IndexSearcher.h new file mode 100644 index 0000000..307e026 --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/IndexSearcher.h @@ -0,0 +1,73 @@ +/* + * Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team + * + * Distributable under the terms of either the Apache License (Version 2.0) or + * the GNU Lesser General Public License, as specified in 
the COPYING file.
 *
 * Changes are Copyright(C) 2007, 2008 by Nokia Corporation and/or its subsidiary(-ies), all rights reserved.
*/
#ifndef _lucene_search_IndexSearcher_
#define _lucene_search_IndexSearcher_

#if defined(_LUCENE_PRAGMA_ONCE)
# pragma once
#endif

#include <QtCore/QString>

#include "SearchHeader.h"
#include "CLucene/store/Directory.h"
#include "CLucene/document/Document.h"
#include "CLucene/index/IndexReader.h"
#include "CLucene/index/Term.h"
#include "CLucene/util/BitSet.h"
#include "HitQueue.h"
#include "FieldSortedHitQueue.h"

CL_NS_DEF(search)
/** Implements search over a single IndexReader.
*
* <p>Applications usually need only call the inherited {@link search(Query*)}
* or {@link search(Query*,Filter*)} methods.
*/
class IndexSearcher:public Searcher{
    /// Reader every operation is forwarded to.
    CL_NS(index)::IndexReader* reader;
    /// True when the reader was opened by this searcher (path / Directory
    /// constructors); only then does close() close and delete it.
    bool readerOwner;

public:
    /// Creates a searcher searching the index in the named directory.
    /// The path must not be empty; the reader opened here is owned.
    IndexSearcher(const QString& path);

    /// Creates a searcher searching the index in the specified directory.
    /// The directory must not be NULL; the reader opened here is owned.
    IndexSearcher(CL_NS(store)::Directory* directory);

    /// Creates a searcher searching the provided index.
    /// The reader stays owned by the caller.
    IndexSearcher(CL_NS(index)::IndexReader* r);

    /// Calls close().
    ~IndexSearcher();

    /// Frees resources associated with this Searcher.
    /// A caller-supplied reader is left open.
    void close();

    /// Returns the reader's document frequency for the term.
    int32_t docFreq(const CL_NS(index)::Term* term) const;

    /// Loads document i from the reader into the supplied Document and
    /// returns the reader's result.
    bool doc(int32_t i, CL_NS(document)::Document* document);

    /// Returns the reader's maxDoc().
    int32_t maxDoc() const;

    /// Returns up to nDocs hits for the query; filter may be NULL.
    /// The returned object is allocated with _CLNEW.
    TopDocs* _search(Query* query, Filter* filter, const int32_t nDocs);
    /// Same as above, with hits ordered by a queue built from sort->getSort().
    TopFieldDocs* _search(Query* query, Filter* filter, const int32_t nDocs,
        const Sort* sort);

    /// Passes every scored document (restricted by the filter's bitset when
    /// filter is non-NULL) to results.
    void _search(Query* query, Filter* filter, HitCollector* results);

    /// Returns the underlying reader without transferring ownership.
    CL_NS(index)::IndexReader* getReader() {
        return reader; }

    /// Rewrites the query against the reader until it no longer changes.
    Query* rewrite(Query* original);
    /// Fills ret with the weight's explanation of doc for the query.
    void explain(Query* query, int32_t doc, Explanation* ret);
};

CL_NS_END

#endif
diff --git a/src/3rdparty/clucene/src/CLucene/search/MultiSearcher.cpp b/src/3rdparty/clucene/src/CLucene/search/MultiSearcher.cpp new file mode 100644 index 0000000..bed7f0d --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/MultiSearcher.cpp @@ -0,0 +1,227 @@
/*------------------------------------------------------------------------------
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
*
* Distributable under the terms of either the Apache License (Version 2.0) or
* the GNU Lesser General Public License, as specified in the COPYING file.
------------------------------------------------------------------------------*/
#include "CLucene/StdHeader.h"
#include "MultiSearcher.h"

#include "SearchHeader.h"
#include "HitQueue.h"
#include "CLucene/document/Document.h"
#include "CLucene/index/Term.h"
#include "FieldDocSortedHitQueue.h"

CL_NS_USE(index)
CL_NS_USE(util)
CL_NS_USE(document)

CL_NS_DEF(search)

    /** Creates a searcher which searches <i>searchers</i>.
*/ + MultiSearcher::MultiSearcher(Searchable** _searchables): + _maxDoc(0) { + searchablesLen = 0; + while ( _searchables[searchablesLen] != NULL ) + ++searchablesLen; + + searchables=_CL_NEWARRAY(Searchable*,searchablesLen+1); + starts = _CL_NEWARRAY(int32_t,searchablesLen + 1); // build starts array + for (int32_t i = 0; i < searchablesLen; ++i) { + searchables[i]=_searchables[i]; + starts[i] = _maxDoc; + _maxDoc += searchables[i]->maxDoc(); // compute maxDocs + } + starts[searchablesLen] = _maxDoc; + } + + MultiSearcher::~MultiSearcher() { + _CLDELETE_ARRAY(searchables); + _CLDELETE_ARRAY(starts); + } + + + // inherit javadoc + void MultiSearcher::close() { + for (int32_t i = 0; i < searchablesLen; ++i){ + searchables[i]->close(); + searchables[i]=NULL; + } + } + + int32_t MultiSearcher::docFreq(const Term* term) const { + int32_t docFreq = 0; + for (int32_t i = 0; i < searchablesLen; ++i) + docFreq += searchables[i]->docFreq(term); + return docFreq; + } + + /** For use by {@link HitCollector} implementations. */ + bool MultiSearcher::doc(int32_t n, Document* d) { + int32_t i = subSearcher(n); // find searcher index + return searchables[i]->doc(n - starts[i], d); // dispatch to searcher + } + + int32_t MultiSearcher::searcherIndex(int32_t n) const{ + return subSearcher(n); + } + + /** Returns index of the searcher for document <code>n</code> in the array + * used to construct this searcher. 
*/ + int32_t MultiSearcher::subSearcher(int32_t n) const{ + // replace w/ call to Arrays.binarySearch in Java 1.2 + int32_t lo = 0; // search starts array + int32_t hi = searchablesLen - 1; // for first element less + // than n, return its index + int32_t mid,midValue; + while (hi >= lo) { + mid = (lo + hi) >> 1; + midValue = starts[mid]; + if (n < midValue) + hi = mid - 1; + else if (n > midValue) + lo = mid + 1; + else{ // found a match + while (mid+1 < searchablesLen && starts[mid+1] == midValue) { + ++mid; // scan to last match + } + return mid; + } + } + return hi; + } + + /** Returns the document number of document <code>n</code> within its + * sub-index. */ + int32_t MultiSearcher::subDoc(int32_t n) const{ + return n - starts[subSearcher(n)]; + } + + int32_t MultiSearcher::maxDoc() const{ + return _maxDoc; + } + + TopDocs* MultiSearcher::_search(Query* query, Filter* filter, const int32_t nDocs) { + HitQueue* hq = _CLNEW HitQueue(nDocs); + int32_t totalHits = 0; + TopDocs* docs; + int32_t j; + ScoreDoc* scoreDocs; + for (int32_t i = 0; i < searchablesLen; i++) { // search each searcher + docs = searchables[i]->_search(query, filter, nDocs); + totalHits += docs->totalHits; // update totalHits + scoreDocs = docs->scoreDocs; + for ( j = 0; j <docs->scoreDocsLength; ++j) { // merge scoreDocs int_to hq + scoreDocs[j].doc += starts[i]; // convert doc + if ( !hq->insert(scoreDocs[j])) + break; // no more scores > minScore + } + + _CLDELETE(docs); + } + + int32_t scoreDocsLen = hq->size(); + scoreDocs = _CL_NEWARRAY(ScoreDoc, scoreDocsLen); + {//MSVC 6 scope fix + for (int32_t i = scoreDocsLen-1; i >= 0; --i) // put docs in array + scoreDocs[i] = hq->pop(); + } + + //cleanup + _CLDELETE(hq); + + return _CLNEW TopDocs(totalHits, scoreDocs, scoreDocsLen); + } + + /** Lower-level search API. + * + * <p>{@link HitCollector#collect(int32_t,qreal)} is called for every non-zero + * scoring document. 
+ * + * <p>Applications should only use this if they need <i>all</i> of the + * matching documents. The high-level search API ({@link + * Searcher#search(Query)}) is usually more efficient, as it skips + * non-high-scoring hits. + * + * @param query to match documents + * @param filter if non-null, a bitset used to eliminate some documents + * @param results to receive hits + */ + void MultiSearcher::_search(Query* query, Filter* filter, HitCollector* results){ + for (int32_t i = 0; i < searchablesLen; ++i) { + /* DSR:CL_BUG: Old implementation leaked and was misconceived. We need + ** to have the original HitCollector ($results) collect *all* hits; + ** the MultiHitCollector instantiated below serves only to adjust + ** (forward by starts[i]) the docNo passed to $results. + ** Old implementation instead created a sort of linked list of + ** MultiHitCollectors that applied the adjustments in $starts + ** cumulatively (and was never deleted). */ + HitCollector *docNoAdjuster = _CLNEW MultiHitCollector(results, starts[i]); + searchables[i]->_search(query, filter, docNoAdjuster); + _CLDELETE(docNoAdjuster); + } + } + + TopFieldDocs* MultiSearcher::_search (Query* query, Filter* filter, const int32_t n, const Sort* sort){ + FieldDocSortedHitQueue* hq = NULL; + int32_t totalHits = 0; + TopFieldDocs* docs; + int32_t j; + FieldDoc** fieldDocs; + + for (int32_t i = 0; i < searchablesLen; ++i) { // search each searcher + docs = searchables[i]->_search (query, filter, n, sort); + if (hq == NULL){ + hq = _CLNEW FieldDocSortedHitQueue (docs->fields, n); + docs->fields = NULL; //hit queue takes fields memory + } + + totalHits += docs->totalHits; // update totalHits + fieldDocs = docs->fieldDocs; + for(j = 0;j<docs->scoreDocsLength;++j){ // merge scoreDocs into hq + fieldDocs[j]->scoreDoc.doc += starts[i]; // convert doc + if (!hq->insert (fieldDocs[j]) ) + break; // no more scores > minScore + } + for ( int32_t x=0;x<j;++x ) + fieldDocs[x]=NULL; //move ownership of FieldDoc to 
the hitqueue + + _CLDELETE(docs); + } + + int32_t hqlen = hq->size(); + fieldDocs = _CL_NEWARRAY(FieldDoc*,hqlen); + for (j = hqlen - 1; j >= 0; j--) // put docs in array + fieldDocs[j] = hq->pop(); + + SortField** hqFields = hq->getFields(); + hq->setFields(NULL); //move ownership of memory over to TopFieldDocs + _CLDELETE(hq); + + return _CLNEW TopFieldDocs (totalHits, fieldDocs, hqlen, hqFields); + } + + Query* MultiSearcher::rewrite(Query* original) { + Query** queries = _CL_NEWARRAY(Query*,searchablesLen+1); + for (int32_t i = 0; i < searchablesLen; ++i) + queries[i] = searchables[i]->rewrite(original); + queries[searchablesLen]=NULL; + return original->combine(queries); + } + + void MultiSearcher::explain(Query* query, int32_t doc, Explanation* ret) { + int32_t i = subSearcher(doc); // find searcher index + searchables[i]->explain(query,doc-starts[i], ret); // dispatch to searcher + } + + MultiHitCollector::MultiHitCollector(HitCollector* _results, int32_t _start): + results(_results), + start(_start) { + } + + void MultiHitCollector::collect(const int32_t doc, const qreal score) { + results->collect(doc + start, score); + } + +CL_NS_END diff --git a/src/3rdparty/clucene/src/CLucene/search/MultiSearcher.h b/src/3rdparty/clucene/src/CLucene/search/MultiSearcher.h new file mode 100644 index 0000000..1021fbb --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/MultiSearcher.h @@ -0,0 +1,95 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. 
------------------------------------------------------------------------------*/
#ifndef _lucene_search_multisearcher
#define _lucene_search_multisearcher

#if defined(_LUCENE_PRAGMA_ONCE)
# pragma once
#endif

#include "SearchHeader.h"
#include "CLucene/document/Document.h"
#include "CLucene/index/Term.h"

CL_NS_DEF(search)

    /** HitCollector adapter that forwards each hit to another collector
     * with the document number shifted forward by a fixed offset. */
    class MultiHitCollector: public HitCollector{
    private:
        HitCollector* results;  // receiving collector (not owned)
        int32_t start;          // offset added to every doc number
    public:
        MultiHitCollector(HitCollector* _results, int32_t _start);
        /** Calls results->collect(doc + start, score). */
        void collect(const int32_t doc, const qreal score) ;
    };


    /** Implements search over a set of <code>Searchables</code>.
     *
     * <p>Applications usually need only call the inherited {@link #search(Query)}
     * or {@link #search(Query,Filter)} methods.
     */
    class MultiSearcher: public Searcher {
    private:
        Searchable** searchables;   // copy of the caller's pointers; elements not owned
        int32_t searchablesLen;     // number of searchables
        int32_t* starts;            // starts[i] = first doc number of searchable i;
                                    // starts[searchablesLen] = total maxDoc
        int32_t _maxDoc;            // sum of maxDoc() over all searchables
    protected:
        /** Returns the internal starts array (searchablesLen + 1 entries). */
        int32_t* getStarts() {
            return starts;
        }

    public:
        /** Creates a searcher which searches <i>Searchables</i>.
         * The array must be NULL-terminated. */
        MultiSearcher(Searchable** searchables);

        /** Releases the internal arrays; the searchables are not deleted. */
        ~MultiSearcher();

        /** Frees resources associated with this <code>Searcher</code>. */
        void close() ;

        /** Returns the sum of docFreq(term) over all searchables. */
        int32_t docFreq(const CL_NS(index)::Term* term) const ;

        /** For use by {@link HitCollector} implementations. */
        bool doc(int32_t n, CL_NS(document)::Document* document);

        /** For use by {@link HitCollector} implementations to identify the
         * index of the sub-searcher that a particular hit came from. */
        int32_t searcherIndex(int32_t n) const;

        /** Returns the index of the searchable that holds document n. */
        int32_t subSearcher(int32_t n) const;

        /** Returns document n's number inside its own searchable. */
        int32_t subDoc(int32_t n) const;

        /** Returns the combined maxDoc of all searchables. */
        int32_t maxDoc() const;

        /** Merges the top nDocs hits of every searchable. */
        TopDocs* _search(Query* query, Filter* filter, const int32_t nDocs) ;

        /** Merges the top n hits of every searchable under the given sort. */
        TopFieldDocs* _search (Query* query, Filter* filter, const int32_t n, const Sort* sort);

        /** Lower-level search API.
         *
         * <p>{@link HitCollector#collect(int32_t,qreal)} is called for every non-zero
         * scoring document.
         *
         * <p>Applications should only use this if they need <i>all</i> of the
         * matching documents. The high-level search API ({@link
         * Searcher#search(Query)}) is usually more efficient, as it skips
         * non-high-scoring hits.
         *
         * @param query to match documents
         * @param filter if non-null, a bitset used to eliminate some documents
         * @param results to receive hits
         */
        void _search(Query* query, Filter* filter, HitCollector* results);

        /** Rewrites the query on every searchable and combines the results. */
        Query* rewrite(Query* original);
        /** Delegates to the searchable that holds doc. */
        void explain(Query* query, int32_t doc, Explanation* ret);
    };

CL_NS_END
#endif
diff --git a/src/3rdparty/clucene/src/CLucene/search/MultiTermQuery.cpp b/src/3rdparty/clucene/src/CLucene/search/MultiTermQuery.cpp new file mode 100644 index 0000000..3bf8d7a --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/MultiTermQuery.cpp @@ -0,0 +1,98 @@
/*------------------------------------------------------------------------------
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
*
* Distributable under the terms of either the Apache License (Version 2.0) or
* the GNU Lesser General Public License, as specified in the COPYING file.
------------------------------------------------------------------------------*/
#include "CLucene/StdHeader.h"
#include "MultiTermQuery.h"

CL_NS_USE(index)
CL_NS_USE(util)
CL_NS_DEF(search)

/** Constructs a query for terms matching <code>term</code>.
*/ + + MultiTermQuery::MultiTermQuery(Term* t){ + //Func - Constructor + //Pre - t != NULL + //Post - The instance has been created + + CND_PRECONDITION(t != NULL, "t is NULL"); + + term = _CL_POINTER(t); + + } + MultiTermQuery::MultiTermQuery(const MultiTermQuery& clone): + Query(clone) + { + term = _CLNEW Term(clone.getTerm(false),clone.getTerm(false)->text()); + } + + MultiTermQuery::~MultiTermQuery(){ + //Func - Destructor + //Pre - true + //Post - The instance has been destroyed + + _CLDECDELETE(term); + } + + Term* MultiTermQuery::getTerm(bool pointer) const{ + if ( pointer ) + return _CL_POINTER(term); + else + return term; + } + + Query* MultiTermQuery::rewrite(IndexReader* reader) { + FilteredTermEnum* enumerator = getEnum(reader); + BooleanQuery* query = _CLNEW BooleanQuery(); + try { + do { + Term* t = enumerator->term(false); + if (t != NULL) { + TermQuery* tq = _CLNEW TermQuery(t); // found a match + tq->setBoost(getBoost() * enumerator->difference()); // set the boost + query->add(tq,true, false, false); // add to q + } + } while (enumerator->next()); + } _CLFINALLY ( enumerator->close(); _CLDELETE(enumerator) ); + + //if we only added one clause and the clause is not prohibited then + //we can just return the query + if (query->getClauseCount() == 1) { // optimize 1-clause queries + BooleanClause* c=0; + query->getClauses(&c); + + if (!c->prohibited) { // just return clause + c->deleteQuery=false; + Query* ret = c->query; + + _CLDELETE(query); + return ret; + } + } + return query; + } + + Query* MultiTermQuery::combine(Query** queries) { + return Query::mergeBooleanQueries(queries); + } + + /** Prints a user-readable version of this query. 
*/ + TCHAR* MultiTermQuery::toString(const TCHAR* field) const{ + StringBuffer buffer; + + if ( field==NULL || _tcscmp(term->field(),field)!=0 ) { + buffer.append(term->field()); + buffer.append( _T(":")); + } + buffer.append(term->text()); + if (getBoost() != 1.0f) { + buffer.appendChar ( '^' ); + buffer.appendFloat( getBoost(),1); + } + return buffer.toString(); + } + +CL_NS_END diff --git a/src/3rdparty/clucene/src/CLucene/search/MultiTermQuery.h b/src/3rdparty/clucene/src/CLucene/search/MultiTermQuery.h new file mode 100644 index 0000000..d376453 --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/MultiTermQuery.h @@ -0,0 +1,62 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. +------------------------------------------------------------------------------*/ +#ifndef _lucene_search_MultiTermQuery_ +#define _lucene_search_MultiTermQuery_ + +#if defined(_LUCENE_PRAGMA_ONCE) +# pragma once +#endif + +#include "CLucene/util/StringBuffer.h" +#include "CLucene/index/IndexReader.h" +#include "CLucene/index/Term.h" +#include "CLucene/index/Terms.h" +#include "FilteredTermEnum.h" +#include "SearchHeader.h" +#include "BooleanQuery.h" +#include "TermQuery.h" + +CL_NS_DEF(search) + /** + * A {@link Query} that matches documents containing a subset of terms provided + * by a {@link FilteredTermEnum} enumeration. + * <P> + * <code>MultiTermQuery</code> is not designed to be used by itself. + * <BR> + * The reason being that it is not intialized with a {@link FilteredTermEnum} + * enumeration. A {@link FilteredTermEnum} enumeration needs to be provided. 
     * <P>
     * For example, {@link WildcardQuery} and {@link FuzzyQuery} extend
     * <code>MultiTermQuery</code> to provide {@link WildcardTermEnum} and
     * {@link FuzzyTermEnum}, respectively.
     */
    class MultiTermQuery: public Query {
    private:
        // Pattern term; a counted reference is taken in the constructor and
        // dropped in the destructor.
        CL_NS(index)::Term* term;
    protected:
        /** Copy constructor: builds a new Term from the source's term and text. */
        MultiTermQuery(const MultiTermQuery& clone);

        /** Construct the enumeration to be used, expanding the pattern term. */
        virtual FilteredTermEnum* getEnum(CL_NS(index)::IndexReader* reader) = 0;
    public:
        /** Constructs a query for terms matching <code>term</code>. */
        MultiTermQuery(CL_NS(index)::Term* t);

        virtual ~MultiTermQuery();

        /** Returns the pattern term. With <code>pointer</code> true the term
         * is passed through _CL_POINTER before being returned; with false
         * the stored pointer is returned unchanged. */
        CL_NS(index)::Term* getTerm(bool pointer=true) const;

        /** Delegates to Query::mergeBooleanQueries. */
        Query* combine(Query** queries);

        /** Prints a user-readable version of this query. */
        TCHAR* toString(const TCHAR* field) const;

        /** Expands the pattern into a BooleanQuery of TermQuery clauses, or
         * returns the single clause's query when only one non-prohibited
         * clause results. */
        Query* rewrite(CL_NS(index)::IndexReader* reader);
    };
CL_NS_END
#endif
diff --git a/src/3rdparty/clucene/src/CLucene/search/PhrasePositions.cpp b/src/3rdparty/clucene/src/CLucene/search/PhrasePositions.cpp new file mode 100644 index 0000000..7611056 --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/PhrasePositions.cpp @@ -0,0 +1,116 @@
/*------------------------------------------------------------------------------
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
*
* Distributable under the terms of either the Apache License (Version 2.0) or
* the GNU Lesser General Public License, as specified in the COPYING file.
+------------------------------------------------------------------------------*/ +#include "CLucene/StdHeader.h" +#include "PhrasePositions.h" + +#include "CLucene/index/Terms.h" + +CL_NS_USE(index) +CL_NS_DEF(search) + + PhrasePositions::PhrasePositions(TermPositions* Tp, const int32_t OffSet){ + //Func - Constructor + //Pre - t != NULL + // OffSet != NULL + //Post - The instance has been created + + CND_PRECONDITION(Tp != NULL,"Tp is NULL"); + CND_PRECONDITION(OffSet >= 0 ,"OffSet is a negative number"); + + tp = Tp; + offset = OffSet; + position = 0; + count = 0; + doc = 0; + + _next = NULL; + } + + PhrasePositions::~PhrasePositions(){ + //Func - Destructor + //Pre - true + //Post - The instance has been deleted + + //delete next Phrase position and by doing that + //all PhrasePositions in the list + _CLDELETE(_next); + + //Check if tp is valid + if ( tp != NULL ){ + //Close TermPositions tp + tp->close(); + _CLDELETE(tp); + } + } + + bool PhrasePositions::next(){ + //Func - Increments to next doc + //Pre - tp != NULL + //Post - if there was no next then doc = INT_MAX otherwise + // doc contains the current document number + + CND_PRECONDITION(tp != NULL,"tp is NULL"); + + //Move to the next in TermPositions tp + if (!tp->next()) { + //There is no next so close the stream + tp->close(); + //delete tp and reset tp to NULL + _CLVDELETE(tp); //todo: not a clucene object... 
should be + //Assign Doc sentinel value + doc = INT_MAX; + return false; + }else{ + doc = tp->doc(); + position = 0; + return true; + } + } + bool PhrasePositions::skipTo(int32_t target){ + if (!tp->skipTo(target)) { + tp->close(); // close stream + doc = LUCENE_INT32_MAX_SHOULDBE; // sentinel value + return false; + } + doc = tp->doc(); + position = 0; + return true; + } + void PhrasePositions::firstPosition(){ + //Func - Read the first TermPosition + //Pre - tp != NULL + //Post - + + CND_PRECONDITION(tp != NULL,"tp is NULL"); + + //read first pos + count = tp->freq(); + //Move to the next TermPosition + nextPosition(); + } + + bool PhrasePositions::nextPosition(){ + //Func - Move to the next position + //Pre - tp != NULL + //Post - + + CND_PRECONDITION(tp != NULL,"tp is NULL"); + + if (count-- > 0) { + //read subsequent pos's + position = tp->nextPosition() - offset; + + //Check position always bigger than or equal to 0 + //bvk: todo, bug??? position < 0 occurs, cant figure out why, + //old version does it too and will fail the "SearchTest" test + //CND_CONDITION(position >= 0, "position has become a negative number"); + return true; + }else{ + return false; + } + } +CL_NS_END diff --git a/src/3rdparty/clucene/src/CLucene/search/PhrasePositions.h b/src/3rdparty/clucene/src/CLucene/search/PhrasePositions.h new file mode 100644 index 0000000..b6c8437 --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/PhrasePositions.h @@ -0,0 +1,41 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. 
------------------------------------------------------------------------------*/
#ifndef _lucene_search_PhrasePositions_
#define _lucene_search_PhrasePositions_

#if defined(_LUCENE_PRAGMA_ONCE)
# pragma once
#endif

#include "CLucene/index/Terms.h"

CL_NS_DEF(search)

    /** Cursor over the positions of one phrase term; instances can be
     * chained through <code>_next</code>, and deleting one deletes the
     * rest of its chain. */
    class PhrasePositions:LUCENE_BASE {
    public:
        int32_t doc; // current doc (a max-int sentinel once the stream is exhausted)
        int32_t position; // position in doc
        int32_t count; // remaining pos in this doc
        int32_t offset; // position in phrase
        CL_NS(index)::TermPositions* tp; // stream of positions (closed and deleted by this object)
        PhrasePositions* _next; // used to make lists


        //Constructor: Tp must not be NULL, o must not be negative
        PhrasePositions(CL_NS(index)::TermPositions* Tp, const int32_t o);
        //Destructor: deletes _next, then closes and deletes tp
        ~PhrasePositions();

        // Advances to the next document; returns false when there is none
        bool next();
        // Skips towards target; returns false when the stream cannot skip
        bool skipTo(int32_t target);

        // Loads the term frequency of the current doc and reads the first position
        void firstPosition();

        // Reads the next position; returns false when none remain in this doc
        bool nextPosition();
    };
CL_NS_END
#endif
diff --git a/src/3rdparty/clucene/src/CLucene/search/PhraseQuery.cpp b/src/3rdparty/clucene/src/CLucene/search/PhraseQuery.cpp new file mode 100644 index 0000000..899cb3c --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/PhraseQuery.cpp @@ -0,0 +1,463 @@
/*------------------------------------------------------------------------------
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
*
* Distributable under the terms of either the Apache License (Version 2.0) or
* the GNU Lesser General Public License, as specified in the COPYING file.
+------------------------------------------------------------------------------*/
+#include "CLucene/StdHeader.h"
+#include "PhraseQuery.h"
+
+#include "SearchHeader.h"
+#include "Scorer.h"
+#include "BooleanQuery.h"
+#include "TermQuery.h"
+
+#include "CLucene/index/Term.h"
+#include "CLucene/index/Terms.h"
+#include "CLucene/index/IndexReader.h"
+
+#include "CLucene/util/StringBuffer.h"
+#include "CLucene/util/VoidList.h"
+#include "CLucene/util/Arrays.h"
+
+#include "ExactPhraseScorer.h"
+#include "SloppyPhraseScorer.h"
+
+CL_NS_USE(index)
+CL_NS_USE(util)
+CL_NS_DEF(search)
+
+    PhraseQuery::PhraseQuery():
+        terms(false)
+    {
+        //Func - Constructor
+        //Pre  - true
+        //Post - An empty PhraseQuery has been created (no terms, slop 0, no field)
+
+        slop = 0;
+
+        field = NULL;
+    }
+
+    // Copy constructor: duplicates slop, field, every position and every term
+    // pointer (each term goes through _CL_POINTER, matching the release done
+    // in the destructor).
+    PhraseQuery::PhraseQuery(const PhraseQuery& clone):
+        Query(clone), terms(false)
+    {
+        slop = clone.slop;
+        field = clone.field;
+        int32_t size=clone.positions.size();
+        { //msvc6 scope fix
+            for ( int32_t i=0;i<size;i++ ){
+                int32_t n = clone.positions[i];
+                this->positions.push_back( n );
+            }
+        }
+        size=clone.terms.size();
+        { //msvc6 scope fix
+            for ( int32_t i=0;i<size;i++ ){
+                this->terms.push_back( _CL_POINTER(clone.terms[i]));
+            }
+        }
+    }
+
+    // Returns a newly allocated copy of this query.
+    Query* PhraseQuery::clone() const{
+        return _CLNEW PhraseQuery(*this);
+    }
+
+    // Two phrase queries are equal when boost, slop, the term list and the
+    // position list all match.
+    bool PhraseQuery::equals(CL_NS(search)::Query *other) const{
+        if (!(other->instanceOf(PhraseQuery::getClassName())))
+            return false;
+
+        PhraseQuery* pq = (PhraseQuery*)other;
+        bool ret = (this->getBoost() == pq->getBoost())
+            && (this->slop == pq->slop);
+
+        if ( ret ){
+            CLListEquals<CL_NS(index)::Term,Term::Equals,
+                const CL_NS(util)::CLVector<CL_NS(index)::Term*>,
+                const CL_NS(util)::CLVector<CL_NS(index)::Term*> > comp;
+            ret = comp.equals(&this->terms,&pq->terms);
+        }
+
+        if ( ret ){
+            CLListEquals<int32_t,Equals::Int32,
+                const CL_NS(util)::CLVector<int32_t,CL_NS(util)::Deletor::DummyInt32>,
+                const CL_NS(util)::CLVector<int32_t,CL_NS(util)::Deletor::DummyInt32> > comp;
+            ret = comp.equals(&this->positions,&pq->positions);
+        }
+        return ret;
+    }
+
+
+    PhraseQuery::~PhraseQuery(){
+        //Func - Destructor
+        //Pre  - true
+        //Post - The instance has been destroyed
+
+        //Release every term reference taken in add() / the copy constructor
+        for (uint32_t i = 0; i < terms.size(); i++){
+            _CLLDECDELETE(terms[i]);
+        }
+        positions.clear();
+    }
+
+    // Combines boost, slop, every term hash and every position into one hash.
+    size_t PhraseQuery::hashCode() const {
+        //todo: do cachedHashCode, and invalidate on add/remove clause
+        size_t ret = Similarity::floatToByte(getBoost()) ^ Similarity::floatToByte(slop);
+
+        // The loop conditions used to be just "terms.size()" / "positions.size()",
+        // which never terminates for a non-empty query and indexes past the end
+        // of the vectors. They must compare the index against the size.
+        { //msvc6 scope fix
+            for ( size_t i=0;i<terms.size();i++ )
+                ret = 31 * ret + terms[i]->hashCode();
+        }
+        { //msvc6 scope fix
+            for ( size_t i=0;i<positions.size();i++ )
+                ret = 31 * ret + positions[i];
+        }
+        return ret;
+    }
+
+    const TCHAR* PhraseQuery::getClassName(){
+        return _T("PhraseQuery");
+    }
+    const TCHAR* PhraseQuery::getQueryName() const{
+        //Func - Returns the string "PhraseQuery"
+        //Pre  - true
+        //Post - The string "PhraseQuery" has been returned
+        return getClassName();
+    }
+
+
+    /**
+    * Adds a term to the end of the query phrase.
+    * The relative position of the term is the one immediately after the last term added.
+    */
+    void PhraseQuery::add(Term* term) {
+        CND_PRECONDITION(term != NULL,"term is NULL");
+
+        int32_t position = 0;
+
+        if(positions.size() > 0)
+            position = (positions[positions.size()-1]) + 1;
+
+        add(term, position);
+    }
+
+    void PhraseQuery::add(Term* term, int32_t position) {
+        //Func - Adds a term to the query phrase at the given relative position.
+        //Pre  - term != NULL
+        //Post - The term and its position have been stored. The first term added
+        //       fixes the field of the query; a later term from a different field
+        //       raises CL_ERR_IllegalArgument.
+        CND_PRECONDITION(term != NULL,"term is NULL");
+
+        if (terms.size() == 0)
+            field = term->field();
+        else{
+            //Check if the field of the new term matches the field of the PhraseQuery
+            //can use != because fields are interned
+            if ( term->field() != field){
+                TCHAR buf[200];
+                _sntprintf(buf,200,_T("All phrase terms must be in the same field: %s"),term->field());
+                //_sntprintf does not guarantee termination when the output is truncated
+                buf[199] = 0;
+                _CLTHROWT(CL_ERR_IllegalArgument,buf);
+            }
+        }
+        //Store the new term
+        terms.push_back(_CL_POINTER(term));
+
+        positions.push_back(position);
+    }
+
+    // Fills result with a newly allocated copy of the term positions
+    // (result.values holds result.length entries).
+    void PhraseQuery::getPositions(Array<int32_t>& result) const{
+        result.length = positions.size();
+        result.values = _CL_NEWARRAY(int32_t,result.length);
+        for(int32_t i = 0; i < result.length; i++){
+            result.values[i] = positions[i];
+        }
+    }
+
+    // Deprecated variant: returns only the raw array produced by
+    // getPositions(Array<int32_t>&); the length is not reported.
+    int32_t* PhraseQuery::getPositions() const{
+        CND_WARNING(false,"getPositions() is deprecated")
+
+        Array<int32_t> arr;
+        getPositions(arr);
+        return arr.values;
+    }
+
+    // Creates the Weight for this query. A one-term phrase is delegated to an
+    // equivalent temporary TermQuery carrying the same boost.
+    Weight* PhraseQuery::_createWeight(Searcher* searcher) {
+        if (terms.size() == 1) { // optimize one-term case
+            Term* term = terms[0];
+            Query* termQuery = _CLNEW TermQuery(term);
+            termQuery->setBoost(getBoost());
+            Weight* ret = termQuery->_createWeight(searcher);
+            _CLDELETE(termQuery);
+            return ret;
+        }
+        return _CLNEW PhraseWeight(searcher,this);
+    }
+
+
+    Term** PhraseQuery::getTerms() const{
+        //Func - added by search highlighter
+        //Pre  - true
+        //Post - A newly allocated, NULL-terminated array holding the term
+        //       pointers of this query has been returned. The pointers are
+        //       copied as-is (no _CL_POINTER is applied here).
+
+        //Let size contain the number of terms
+        int32_t size = terms.size();
+        Term** ret = _CL_NEWARRAY(Term*,size+1);
+
+        CND_CONDITION(ret != NULL,"Could not allocated memory for ret");
+
+        //Iterate through terms and copy each pointer to ret
+        for ( int32_t i=0;i<size;i++ ){
+            ret[i] = terms[i];
+        }
+        ret[size] = NULL;
+        return ret;
+    }
+
+    TCHAR* PhraseQuery::toString(const TCHAR* f) const{
+        //Func - Prints a user-readable version of this query.
+        //Pre  - true (f may be NULL)
+        //Post - The query string has been returned, or NULL when the query has
+        //       no terms. The field prefix is omitted when f equals the field.
+
+        if ( terms.size()== 0 )
+            return NULL;
+
+        StringBuffer buffer;
+        if ( f==NULL || _tcscmp(field,f)!=0) {
+            buffer.append(field);
+            buffer.append( _T(":"));
+        }
+
+        buffer.append( _T("\"") );
+
+        Term *T = NULL;
+
+        //iterate through all terms
+        for (uint32_t i = 0; i < terms.size(); i++) {
+            //Get the i-th term
+            T = terms[i];
+
+            //Ensure T is a valid Term
+            CND_CONDITION(T !=NULL,"T is NULL");
+
+            buffer.append( T->text() );
+            //Separate terms with a space, except after the last one
+            if (i != terms.size()-1){
+                buffer.append(_T(" "));
+            }
+        }
+
+        buffer.append( _T("\"") );
+
+        if (slop != 0) {
+            buffer.append(_T("~"));
+            buffer.appendFloat(slop,0);
+        }
+
+        //Check if there is an other boost factor than 1.0
+        if (getBoost() != 1.0f) {
+            buffer.append(_T("^"));
+            buffer.appendFloat( getBoost(),1 );
+        }
+
+        //return the query string
+        return buffer.toString();
+    }
+
+
+
+
+
+
+
+
+    PhraseQuery::PhraseWeight::PhraseWeight(Searcher* searcher, PhraseQuery* _this) {
+        this->_this=_this;
+        this->value = 0;
+        this->idf = 0;
+        this->queryNorm = 0;
+        this->queryWeight = 0;
+        this->searcher = searcher;
+    }
+
+    TCHAR* PhraseQuery::PhraseWeight::toString() {
+        return STRDUP_TtoT(_T("weight(PhraseQuery)"));
+    }
+    PhraseQuery::PhraseWeight::~PhraseWeight(){
+    }
+
+
+    Query* PhraseQuery::PhraseWeight::getQuery() { return _this; }
+    qreal PhraseQuery::PhraseWeight::getValue() { return value; }
+
+    qreal PhraseQuery::PhraseWeight::sumOfSquaredWeights(){
+        idf = _this->getSimilarity(searcher)->idf(&_this->terms, searcher);
+        queryWeight = idf * _this->getBoost();  // compute query weight
+        return queryWeight * queryWeight;       // square it
+    }
+
+    void PhraseQuery::PhraseWeight::normalize(qreal queryNorm) {
+        this->queryNorm = queryNorm;
+        queryWeight *= queryNorm;               // normalize query weight
+        value = queryWeight * idf;              // idf for document
+    }
+
+    Scorer* PhraseQuery::PhraseWeight::scorer(IndexReader* reader) {
+        //Func - Builds the scorer for this phrase on the given reader
+        //Pre  - true
+        //Post - An ExactPhraseScorer (slop == 0) or SloppyPhraseScorer
+        //       (slop != 0) has been returned, or NULL when the query has no
+        //       terms or the reader returned no TermPositions for a term
+
+        //Get the length of terms
+        int32_t tpsLength = _this->terms.size();
+
+        //optimize zero-term case
+        if (tpsLength == 0)
+            return NULL;
+
+        TermPositions** tps = _CL_NEWARRAY(TermPositions*,tpsLength+1);
+
+        //Check if tps has been allocated properly
+        CND_CONDITION(tps != NULL,"Could not allocate memory for tps");
+
+        TermPositions* p = NULL;
+
+        //Iterate through all terms
+        for (int32_t i = 0; i < tpsLength; i++) {
+            //Get the termPostitions for the i-th term
+            p = reader->termPositions(_this->terms[i]);
+
+            //Check if p is valid
+            if (p == NULL) {
+                //Delete previous retrieved termPositions
+                while (--i >= 0){
+                    _CLVDELETE(tps[i]); //todo: not a clucene object... should be
+                }
+                _CLDELETE_ARRAY(tps);
+                return NULL;
+            }
+
+            //Store p at i in tps
+            tps[i] = p;
+        }
+        //The scorers walk tps until they reach this NULL terminator
+        tps[tpsLength] = NULL;
+
+        Scorer* ret = NULL;
+
+        Array<int32_t> positions;
+        _this->getPositions(positions);
+        int32_t slop = _this->getSlop();
+        if ( slop != 0)
+            //todo: need to pass these: this, tps,
+            ret = _CLNEW SloppyPhraseScorer(this,tps,positions.values,
+                                            _this->getSimilarity(searcher),
+                                            slop, reader->norms(_this->field));
+        else
+            // optimize exact case
+            ret = _CLNEW ExactPhraseScorer(this, tps, positions.values,
+                                           _this->getSimilarity(searcher),
+                                           reader->norms(_this->field));
+        positions.deleteArray();
+
+        CND_CONDITION(ret != NULL,"Could not allocate memory for ret");
+
+        //tps can be deleted safely. SloppyPhraseScorer or ExactPhraseScorer will take care
+        //of its values
+
+        _CLDELETE_ARRAY(tps);
+        return ret;
+    }
+
+    // Fills result with an explanation of how doc is scored by this weight:
+    // a query-weight part (boost, idf, queryNorm) and a field-weight part
+    // (tf, idf, fieldNorm).
+    void PhraseQuery::PhraseWeight::explain(IndexReader* reader, int32_t doc, Explanation* result){
+        TCHAR descbuf[LUCENE_SEARCH_EXPLANATION_DESC_LEN+1];
+        TCHAR* tmp;
+
+        //Every _sntprintf below writes at most LUCENE_SEARCH_EXPLANATION_DESC_LEN
+        //characters and may leave the text unterminated on truncation; the extra
+        //last slot is never written, so terminating it once is enough.
+        descbuf[LUCENE_SEARCH_EXPLANATION_DESC_LEN] = 0;
+
+        tmp = getQuery()->toString();
+        _sntprintf(descbuf,LUCENE_SEARCH_EXPLANATION_DESC_LEN,_T("weight(%s in %d), product of:"),
+            tmp,doc);
+        _CLDELETE_CARRAY(tmp);
+        result->setDescription(descbuf);
+
+        StringBuffer docFreqs;
+        StringBuffer query;
+        query.appendChar('\"');
+        for (uint32_t i = 0; i < _this->terms.size(); i++) {
+            if (i != 0) {
+                docFreqs.appendChar(' ');
+                query.appendChar(' ');
+            }
+
+            Term* term = _this->terms[i];
+
+            docFreqs.append(term->text());
+            docFreqs.appendChar('=');
+            docFreqs.appendInt(searcher->docFreq(term));
+
+            query.append(term->text());
+        }
+        query.appendChar('\"');
+
+        _sntprintf(descbuf,LUCENE_SEARCH_EXPLANATION_DESC_LEN,
+            _T("idf(%s: %s)"),_this->field,docFreqs.getBuffer());
+        Explanation* idfExpl = _CLNEW Explanation(idf, descbuf);
+
+        // explain query weight
+        Explanation* queryExpl = _CLNEW Explanation;
+        tmp = getQuery()->toString();
+        _sntprintf(descbuf,LUCENE_SEARCH_EXPLANATION_DESC_LEN,
+            _T("queryWeight(%s), product of:"),tmp);
+        _CLDELETE_CARRAY(tmp);
+        queryExpl->setDescription(descbuf);
+
+        Explanation* boostExpl = _CLNEW Explanation(_this->getBoost(), _T("boost"));
+        //The boost detail is only attached when it differs from 1.0; otherwise
+        //nothing else references it and it has to be released here.
+        const bool boostAttached = (_this->getBoost() != 1.0f);
+        if (boostAttached)
+            queryExpl->addDetail(boostExpl);
+        queryExpl->addDetail(idfExpl);
+
+        Explanation* queryNormExpl = _CLNEW Explanation(queryNorm,_T("queryNorm"));
+        queryExpl->addDetail(queryNormExpl);
+
+        queryExpl->setValue(boostExpl->getValue() *
+            idfExpl->getValue() *
+            queryNormExpl->getValue());
+
+        if (!boostAttached)
+            _CLDELETE(boostExpl);
+
+        result->addDetail(queryExpl);
+
+        // explain field weight
+        Explanation* fieldExpl = _CLNEW Explanation;
+        _sntprintf(descbuf,LUCENE_SEARCH_EXPLANATION_DESC_LEN,
+            _T("fieldWeight(%s:%s in %d), product of:"),
+            _this->field,query.getBuffer(),doc);
+        fieldExpl->setDescription(descbuf);
+
+
+        Explanation* tfExpl = _CLNEW Explanation;
+        //scorer() allocates a new scorer and may return NULL (no terms, or no
+        //TermPositions for a term); it must be checked and released after use.
+        Scorer* sc = scorer(reader);
+        if (sc != NULL) {
+            sc->explain(doc, tfExpl);
+            _CLDELETE(sc);
+        }
+        fieldExpl->addDetail(tfExpl);
+        // NOTE(review): idfExpl is attached to both queryExpl and fieldExpl; if
+        // Explanation deletes its details this is a double delete - confirm
+        // against Explanation's ownership rules.
+        fieldExpl->addDetail(idfExpl);
+
+        Explanation* fieldNormExpl = _CLNEW Explanation();
+        uint8_t* fieldNorms = reader->norms(_this->field);
+        qreal fieldNorm =
+            fieldNorms!=NULL ? Similarity::decodeNorm(fieldNorms[doc]) : 0.0f;
+        fieldNormExpl->setValue(fieldNorm);
+
+
+        _sntprintf(descbuf,LUCENE_SEARCH_EXPLANATION_DESC_LEN,
+            _T("fieldNorm(field=%s, doc=%d)"),_this->field,doc);
+        fieldNormExpl->setDescription(descbuf);
+        fieldExpl->addDetail(fieldNormExpl);
+
+        fieldExpl->setValue(tfExpl->getValue() *
+            idfExpl->getValue() *
+            fieldNormExpl->getValue());
+
+        result->addDetail(fieldExpl);
+
+        // combine them
+        result->setValue(queryExpl->getValue() * fieldExpl->getValue());
+
+        // NOTE(review): fieldExpl was already handed to result via addDetail();
+        // copying from it and then deleting it looks unsafe - confirm against
+        // Explanation::set before changing.
+        if (queryExpl->getValue() == 1.0f){
+            result->set(*fieldExpl);
+            _CLDELETE(fieldExpl);
+        }
+    }
+
+
+CL_NS_END
diff --git a/src/3rdparty/clucene/src/CLucene/search/PhraseQuery.h b/src/3rdparty/clucene/src/CLucene/search/PhraseQuery.h
new file mode 100644
index 0000000..6b32558
--- /dev/null
+++ b/src/3rdparty/clucene/src/CLucene/search/PhraseQuery.h
@@ -0,0 +1,127 @@
+/*------------------------------------------------------------------------------
+* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
+*
+* Distributable under the terms of either the Apache License (Version 2.0) or
+* the GNU Lesser General Public License, as specified in the COPYING file.
+------------------------------------------------------------------------------*/
+#ifndef _lucene_search_PhraseQuery_
+#define _lucene_search_PhraseQuery_
+
+#if defined(_LUCENE_PRAGMA_ONCE)
+# pragma once
+#endif
+
+#include "SearchHeader.h"
+#include "Scorer.h"
+#include "BooleanQuery.h"
+#include "TermQuery.h"
+
+#include "CLucene/index/Term.h"
+#include "CLucene/index/Terms.h"
+#include "CLucene/index/IndexReader.h"
+
+#include "CLucene/util/StringBuffer.h"
+#include "CLucene/util/VoidList.h"
+
+#include "ExactPhraseScorer.h"
+#include "SloppyPhraseScorer.h"
+
+CL_NS_DEF(search)
+    // A Query that matches documents containing a particular sequence of terms.
+    // This may be combined with other terms with a {@link BooleanQuery}.
+    class PhraseQuery: public Query {
+    private:
+        // Relative position of each term; positions[i] belongs to terms[i].
+        CL_NS(util)::CLVector<int32_t,CL_NS(util)::Deletor::DummyInt32> positions;
+        int32_t slop;
+
+        // Field shared by all terms; set by the first add().
+        const TCHAR* field;
+        CL_NS(util)::CLVector<CL_NS(index)::Term*> terms;
+
+
+        // Weight implementation used for phrases with more than one term.
+        class PhraseWeight: public Weight {
+        private:
+            Searcher* searcher;
+            qreal value;
+            qreal idf;
+            qreal queryNorm;
+            qreal queryWeight;
+            PhraseQuery* _this;
+        public:
+            PhraseWeight(Searcher* searcher, PhraseQuery* _this);
+            ~PhraseWeight();
+            TCHAR* toString();
+
+            Query* getQuery();
+            qreal getValue();
+
+            qreal sumOfSquaredWeights();
+            void normalize(qreal queryNorm);
+            Scorer* scorer(CL_NS(index)::IndexReader* reader);
+            void explain(CL_NS(index)::IndexReader* reader, int32_t doc, Explanation* ret);
+            TCHAR* toString(TCHAR* f);
+            bool equals(PhraseWeight* o);
+        };
+        friend class PhraseWeight;
+    protected:
+        Weight* _createWeight(Searcher* searcher);
+        PhraseQuery(const PhraseQuery& clone);
+    public:
+        //Constructor
+        PhraseQuery();
+
+        //Destructor
+        ~PhraseQuery();
+
+        //Returns the string "PhraseQuery"
+        const TCHAR* getQueryName() const;
+        static const TCHAR* getClassName();
+
+        //Sets the number of other words permitted between words in query phrase.
+        //If zero, then this is an exact phrase search. For larger values this works
+        //like a WITHIN or NEAR operator.
+        //
+        //The slop is in fact an edit-distance, where the units correspond to
+        //moves of terms in the query phrase out of position. For example, to switch
+        //the order of two words requires two moves (the first move places the words
+        //atop one another), so to permit re-orderings of phrases, the slop must be
+        //at least two.
+        //
+        //More exact matches are scored higher than sloppier matches, thus search
+        //results are sorted by exactness.
+        //
+        //The slop is zero by default, requiring exact matches.
+        void setSlop(const int32_t s) { slop = s; }
+
+        //Returns the slop. See setSlop().
+        int32_t getSlop() const { return slop; }
+
+        //Adds a term to the end of the query phrase.
+        void add(CL_NS(index)::Term* term);
+        //Adds a term at an explicit relative position.
+        void add(CL_NS(index)::Term* term, int32_t position);
+
+
+
+        //Returns the sum of squared weights
+        qreal sumOfSquaredWeights(Searcher* searcher);
+
+        //Normalizes the Weight
+        void normalize(const qreal norm);
+
+        Scorer* scorer(CL_NS(index)::IndexReader* reader);
+
+        //added by search highlighter
+        CL_NS(index)::Term** getTerms() const;
+        //The replacement named in the deprecation marker used to be the unrelated
+        //"deleteDocuments"; the real replacement is the Array-based overload below.
+        _CL_DEPRECATED( getPositions ) int32_t* getPositions() const; ///@deprecated. use getPositions(Array<int32_t>& result)
+        void getPositions(Array<int32_t>& result) const;
+        const TCHAR* getFieldName() const{ return field; }
+
+        //Prints a user-readable version of this query.
+        TCHAR* toString(const TCHAR* f) const;
+
+        Query* clone() const;
+        bool equals(CL_NS(search)::Query *) const;
+
+        size_t hashCode() const;
+    };
+CL_NS_END
+#endif
diff --git a/src/3rdparty/clucene/src/CLucene/search/PhraseQueue.h b/src/3rdparty/clucene/src/CLucene/search/PhraseQueue.h
new file mode 100644
index 0000000..c0682fc
--- /dev/null
+++ b/src/3rdparty/clucene/src/CLucene/search/PhraseQueue.h
@@ -0,0 +1,36 @@
+/*------------------------------------------------------------------------------
+* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
+*
+* Distributable under the terms of either the Apache License (Version 2.0) or
+* the GNU Lesser General Public License, as specified in the COPYING file.
+------------------------------------------------------------------------------*/
+// The guard used to be named _lucene_search_PriorityQueue_, which does not match
+// this file and would silently drop PhraseQueue if another header used that name.
+#ifndef _lucene_search_PhraseQueue_
+#define _lucene_search_PhraseQueue_
+
+#if defined(_LUCENE_PRAGMA_ONCE)
+# pragma once
+#endif
+
+#include "CLucene/util/PriorityQueue.h"
+#include "PhrasePositions.h"
+
+CL_NS_DEF(search)
+    // Priority queue of PhrasePositions ordered by document, then by position.
+    class PhraseQueue: public CL_NS(util)::PriorityQueue<PhrasePositions*,
+        CL_NS(util)::Deletor::Object<PhrasePositions> > {
+    public:
+        PhraseQueue(const int32_t size) {
+            initialize(size,false);
+        }
+        ~PhraseQueue(){
+        }
+
+    protected:
+        // pp1 sorts before pp2 when its doc is smaller, or on the same doc when
+        // its position is smaller.
+        bool lessThan(PhrasePositions* pp1, PhrasePositions* pp2) {
+            if (pp1->doc == pp2->doc)
+                return pp1->position < pp2->position;
+            else
+                return pp1->doc < pp2->doc;
+        }
+    };
+CL_NS_END
+#endif
diff --git a/src/3rdparty/clucene/src/CLucene/search/PhraseScorer.cpp b/src/3rdparty/clucene/src/CLucene/search/PhraseScorer.cpp
new file mode 100644
index 0000000..b2da231
--- /dev/null
+++ b/src/3rdparty/clucene/src/CLucene/search/PhraseScorer.cpp
@@ -0,0 +1,225 @@
+/*------------------------------------------------------------------------------
+* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
+*
+* Distributable under the terms of either the Apache License (Version 2.0) or
+* the GNU Lesser General
Public License, as specified in the COPYING file.
+------------------------------------------------------------------------------*/
+#include "CLucene/StdHeader.h"
+#include "PhraseScorer.h"
+
+#include "PhraseQueue.h"
+#include "PhrasePositions.h"
+#include "Scorer.h"
+#include "Similarity.h"
+
+CL_NS_USE(index)
+CL_NS_USE(util)
+CL_NS_DEF(search)
+
+
+    PhraseScorer::PhraseScorer(Weight* weight, TermPositions** tps,
+        int32_t* positions, Similarity* similarity, uint8_t* norms):
+        Scorer(similarity)
+    {
+        //Func - Constructor
+        //Pre  - tps != NULL and is a NULL-terminated array of TermPositions
+        //       positions holds one entry for every entry of tps
+        //Post - The instance has been created; one PhrasePositions per entry
+        //       of tps has been linked into the list first..last
+
+        CND_PRECONDITION(tps != NULL,"tps is NULL");
+
+        //norms are only used if phraseFreq returns more than 0.0
+        //phraseFreq should only return more than 0.0 if norms != NULL
+        //CND_PRECONDITION(n != NULL,"n is NULL");
+
+        firstTime = true;
+        more = true;
+        //freq is only assigned in doNext(); give it a defined value so that
+        //explain()/score() never read an uninitialised member
+        freq = 0.0f;
+        this->norms = norms;
+        this->weight = weight;
+        this->value = weight->getValue();
+
+        //reset internal pointers
+        first = NULL;
+        last = NULL;
+
+        //build the linked list of PhrasePositions, in tps order
+        int32_t i = 0;
+        while(tps[i] != NULL){
+            PhrasePositions *pp = _CLNEW PhrasePositions(tps[i], positions[i]);
+            CND_CONDITION(pp != NULL,"Could not allocate memory for pp");
+
+            //Append pp to the list
+            if (last != NULL) { // add next to end of list
+                last->_next = pp;
+            } else
+                first = pp;
+            last = pp;
+
+            i++;
+        }
+
+        pq = _CLNEW PhraseQueue(i); //i==tps.length
+        CND_CONDITION(pq != NULL,"Could not allocate memory for pq");
+    }
+
+    PhraseScorer::~PhraseScorer() {
+        //Func - Destructor
+        //Pre  - true
+        //Post - The instance has been destroyed
+
+        //The PhraseQueue pq (which is a PriorityQueue) pq is actually empty at present, the elements
+        //having been transferred by pqToList() to the linked list starting with
+        //first. The nodes of that linked list are deleted by the destructor of
+        //first, rather than the destructor of pq.
+        _CLDELETE(first);
+        _CLDELETE(pq);
+    }
+
+    // Advances to the next matching document; returns false when exhausted.
+    bool PhraseScorer::next(){
+        if (firstTime) {
+            init();
+            firstTime = false;
+        } else if (more) {
+            more = last->next(); // trigger further scanning
+        }
+        return doNext();
+    }
+
+    // next without initial increment
+    bool PhraseScorer::doNext() {
+        while (more) {
+            while (more && first->doc < last->doc) { // find doc w/ all the terms
+                more = first->skipTo(last->doc);     // skip first upto last
+                firstToLast();                       // and move it to the end
+            }
+
+            if (more) {
+                // found a doc with all of the terms
+                freq = phraseFreq();                 // check for phrase
+                if (freq == 0.0f)                    // no match
+                    more = last->next();             // trigger further scanning
+                else
+                    return true;                     // found a match
+            }
+        }
+        return false;                                // no more matches
+    }
+
+    // Score of the current document: tf(freq) * weight value, normalised by
+    // the decoded field norm of that document.
+    qreal PhraseScorer::score(){
+        //System.out.println("scoring " + first.doc);
+        qreal raw = getSimilarity()->tf(freq) * value;          // raw score
+        return raw * Similarity::decodeNorm(norms[first->doc]); // normalize
+    }
+
+    // Skips every term to target, then continues the search from there.
+    bool PhraseScorer::skipTo(int32_t target) {
+        for (PhrasePositions* pp = first; more && pp != NULL; pp = pp->_next) {
+            more = pp->skipTo(target);
+        }
+        if (more)
+            sort(); // re-sort
+        return doNext();
+    }
+
+    // Positions every term on its first document and orders the list.
+    void PhraseScorer::init() {
+        for (PhrasePositions* pp = first; more && pp != NULL; pp = pp->_next)
+            more = pp->next();
+        if(more)
+            sort();
+    }
+
+    // Re-orders the list first..last by pushing it through the priority queue.
+    void PhraseScorer::sort() {
+        pq->clear();
+        for (PhrasePositions* pp = first; pp != NULL; pp = pp->_next)
+            pq->put(pp);
+        pqToList();
+    }
+
+
+
+    void PhraseScorer::pqToList(){
+        //Func - Transfers the PhrasePositions from the PhraseQueue pq to
+        //       the PhrasePositions list with first as its first element
+        //Pre  - pq != NULL
+        //Post - All PhrasePositions have been transfered to the list
+        //       of PhrasePositions of which the first element is pointed to by first
+        //       and the last element is pointed to by last
+
+        CND_PRECONDITION(pq != NULL,"pq is NULL");
+
+        last = first = NULL;
+
+        PhrasePositions* PhrasePos = NULL;
+
+        //As long pq is not empty
+        while (pq->top() != NULL){
+            //Pop a PhrasePositions instance
+            PhrasePos = pq->pop();
+
+            // add next to end of list
+            if (last != NULL) {
+                last->_next = PhrasePos;
+            } else {
+                first = PhrasePos;
+            }
+
+            //Let last point to the new last PhrasePositions instance just added
+            last = PhrasePos;
+            //Reset the next of last to NULL
+            last->_next = NULL;
+        }
+
+        //Check to see that pq is empty now
+        CND_CONDITION(pq->size()==0, "pq is not empty while it should be");
+    }
+
+    void PhraseScorer::firstToLast(){
+        //Func - Moves first to the end of the list
+        //Pre  - first is NULL or points to an PhrasePositions Instance
+        //       last  is NULL or points to an PhrasePositions Instance
+        //       first and last both are NULL or both are not NULL
+        //Post - The first element has become the last element in the list
+
+        CND_PRECONDITION(((first==NULL && last==NULL) ||(first !=NULL && last != NULL)),
+            "Either first or last is NULL but not both");
+
+        //Check if first and last are valid pointers
+        if(first && last){
+            last->_next = first;
+            last = first;
+            first = first->_next;
+            last->_next = NULL;
+        }
+    }
+
+
+    // Advances the scorer up to _doc and stores tf(phraseFreq) for that
+    // document in tfExplanation; the frequency is 0 when _doc does not match.
+    void PhraseScorer::explain(int32_t _doc, Explanation* tfExplanation) {
+        while (next() && doc() < _doc){
+        }
+
+        qreal phraseFreq = (doc() == _doc) ? freq : 0.0f;
+        tfExplanation->setValue(getSimilarity()->tf(phraseFreq));
+
+        StringBuffer buf;
+        buf.append(_T("tf(phraseFreq="));
+        buf.appendFloat(phraseFreq,2);
+        buf.append(_T(")"));
+        tfExplanation->setDescription(buf.getBuffer());
+    }
+
+    // Returns "scorer(<weight>)" as a newly allocated string.
+    TCHAR* PhraseScorer::toString() {
+        StringBuffer buf;
+        buf.append(_T("scorer("));
+
+        TCHAR* tmp = weight->toString();
+        buf.append(tmp);
+        _CLDELETE_CARRAY(tmp);
+
+        buf.append(_T(")"));
+
+        return buf.toString();
+    }
+
+CL_NS_END
diff --git a/src/3rdparty/clucene/src/CLucene/search/PhraseScorer.h b/src/3rdparty/clucene/src/CLucene/search/PhraseScorer.h
new file mode 100644
index 0000000..89f7a1f
--- /dev/null
+++ b/src/3rdparty/clucene/src/CLucene/search/PhraseScorer.h
@@ -0,0 +1,65 @@
+/*------------------------------------------------------------------------------
+* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
+*
+* Distributable under the terms of either the Apache License (Version 2.0) or
+* the GNU Lesser General Public License, as specified in the COPYING file.
+------------------------------------------------------------------------------*/
+#ifndef _lucene_search_PhraseScorer_
+#define _lucene_search_PhraseScorer_
+
+#if defined(_LUCENE_PRAGMA_ONCE)
+# pragma once
+#endif
+
+#include "PhraseQueue.h"
+#include "PhrasePositions.h"
+#include "Scorer.h"
+#include "Similarity.h"
+
+CL_NS_DEF(search)
+
+    // Common base of the phrase scorers. It keeps one PhrasePositions per
+    // phrase term in a linked list (first..last) and leaves the actual phrase
+    // frequency computation to phraseFreq().
+    class PhraseScorer: public Scorer {
+    private:
+        Weight* weight;
+        qreal freq;      // result of the last phraseFreq() call
+        bool firstTime;  // true until next() has run once
+        bool more;       // false once any term has run out of documents
+
+    protected:
+        uint8_t* norms;
+        qreal value;
+
+        PhraseQueue* pq;         //is used to order the list point to by first and last
+        PhrasePositions* first;  //Points to the first in the list of PhrasePositions
+        PhrasePositions* last;   //Points to the last in the list of PhrasePositions
+
+    public:
+        //Constructor
+        PhraseScorer(Weight* weight, CL_NS(index)::TermPositions** tps,
+            int32_t* positions, Similarity* similarity, uint8_t* norms);
+        virtual ~PhraseScorer();
+
+        // Document the scorer currently stands on (document of first).
+        int32_t doc() const { return first->doc; }
+        bool next();
+        qreal score();
+        bool skipTo(int32_t target);
+
+
+        void explain(int32_t doc, Explanation* ret);
+        TCHAR* toString();
+    protected:
+        // Frequency of the phrase in the current document; 0 means no match.
+        virtual qreal phraseFreq() =0;
+
+        //Transfers the PhrasePositions from the PhraseQueue pq to
+        //the PhrasePositions list with first as its first element
+        void pqToList();
+
+        //Moves first to the end of the list
+        void firstToLast();
+    private:
+        bool doNext();
+        void init();
+        void sort();
+    };
+CL_NS_END
+#endif
diff --git a/src/3rdparty/clucene/src/CLucene/search/PrefixQuery.cpp b/src/3rdparty/clucene/src/CLucene/search/PrefixQuery.cpp
new file mode 100644
index 0000000..6bb27d1
--- /dev/null
+++ b/src/3rdparty/clucene/src/CLucene/search/PrefixQuery.cpp
@@ -0,0 +1,273 @@
+/*------------------------------------------------------------------------------
+* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
+*
+* Distributable under the terms of either the Apache License (Version 2.0) or
+* the GNU Lesser General Public License, as specified in the COPYING
file.
+------------------------------------------------------------------------------*/
+#include "CLucene/StdHeader.h"
+#include "PrefixQuery.h"
+#include "CLucene/util/BitSet.h"
+
+CL_NS_USE(util)
+CL_NS_USE(index)
+CL_NS_DEF(search)
+
+    PrefixQuery::PrefixQuery(Term* Prefix){
+        //Func - Constructor.
+        //       Constructs a query for terms starting with prefix
+        //Pre  - Prefix != NULL
+        //Post - The instance has been created
+
+        //Get a pointer to Prefix
+        prefix = _CL_POINTER(Prefix);
+    }
+
+    PrefixQuery::PrefixQuery(const PrefixQuery& clone):Query(clone){
+        prefix = _CL_POINTER(clone.prefix);
+    }
+    Query* PrefixQuery::clone() const{
+        return _CLNEW PrefixQuery(*this);
+    }
+
+    // Returns the prefix term; with pointer == true it is passed through
+    // _CL_POINTER first, otherwise the stored pointer is returned as-is.
+    Term* PrefixQuery::getPrefix(bool pointer){
+        if ( pointer )
+            return _CL_POINTER(prefix);
+        else
+            return prefix;
+    }
+
+    PrefixQuery::~PrefixQuery(){
+        //Func - Destructor
+        //Pre  - true
+        //Post - The instance has been destroyed.
+
+        //Delete prefix by finalizing it
+        _CLDECDELETE(prefix);
+    }
+
+
+    /** Returns a hash code value for this object.*/
+    size_t PrefixQuery::hashCode() const {
+        return Similarity::floatToByte(getBoost()) ^ prefix->hashCode();
+    }
+
+    const TCHAR* PrefixQuery::getQueryName()const{
+        //Func - Returns the name "PrefixQuery"
+        //Pre  - true
+        //Post - The string "PrefixQuery" has been returned
+
+        return getClassName();
+    }
+    const TCHAR* PrefixQuery::getClassName(){
+        //Func - Returns the name "PrefixQuery"
+        //Pre  - true
+        //Post - The string "PrefixQuery" has been returned
+
+        return _T("PrefixQuery");
+    }
+
+    // Equal when other is a PrefixQuery with the same boost and prefix term.
+    bool PrefixQuery::equals(Query * other) const{
+        if (!(other->instanceOf(PrefixQuery::getClassName())))
+            return false;
+
+        PrefixQuery* rq = (PrefixQuery*)other;
+        bool ret = (this->getBoost() == rq->getBoost())
+            && (this->prefix->equals(rq->prefix));
+
+        return ret;
+    }
+
+    // Expands the prefix into a BooleanQuery with one TermQuery per matching
+    // term of the reader. When exactly one non-prohibited clause results, that
+    // clause's query is returned directly instead of the BooleanQuery.
+    Query* PrefixQuery::rewrite(IndexReader* reader){
+        BooleanQuery* query = _CLNEW BooleanQuery();
+        TermEnum* enumerator = reader->terms(prefix);
+        Term* lastTerm = NULL;
+        try {
+            const TCHAR* prefixText = prefix->text();
+            const TCHAR* prefixField = prefix->field();
+            const TCHAR* tmp;
+            int32_t prefixLen = prefix->textLength();
+            do {
+                lastTerm = enumerator->term();
+                if (lastTerm != NULL && lastTerm->field() == prefixField ){
+
+                    //now see if term->text() starts with prefixText
+                    int32_t termLen = lastTerm->textLength();
+                    if ( prefixLen>termLen )
+                        break; //the prefix is longer than the term, can't be matched
+
+                    tmp = lastTerm->text();
+
+                    //check for prefix match in reverse, since most change will be at the end.
+                    //A signed index is used so the loop ends on a plain ">= 0" test rather
+                    //than on an unsigned counter wrapping round to (size_t)-1.
+                    for ( int32_t i=prefixLen-1;i>=0;--i ){
+                        if ( tmp[i] != prefixText[i] ){
+                            tmp=NULL;//signals inequality
+                            break;
+                        }
+                    }
+                    if ( tmp == NULL )
+                        break;
+
+                    TermQuery* tq = _CLNEW TermQuery(lastTerm); // found a match
+                    tq->setBoost(getBoost());                   // set the boost
+                    query->add(tq,true,false, false);           // add to query
+                } else
+                    break;
+                _CLDECDELETE(lastTerm);
+            } while (enumerator->next());
+        }_CLFINALLY(
+            enumerator->close();
+            _CLDELETE(enumerator);
+            _CLDECDELETE(lastTerm);
+        );
+        _CLDECDELETE(lastTerm);
+
+
+        //if we only added one clause and the clause is not prohibited then
+        //we can just return the query
+        if (query->getClauseCount() == 1) { // optimize 1-clause queries
+            BooleanClause* c=0;
+            query->getClauses(&c);
+
+            if (!c->prohibited) { // just return clause
+                //keep the clause's query alive when the BooleanQuery is deleted
+                c->deleteQuery=false;
+                Query* ret = c->query;
+
+                _CLDELETE(query);
+                return ret;
+            }
+        }
+
+        return query;
+    }
+
+    Query* PrefixQuery::combine(Query** queries) {
+        return Query::mergeBooleanQueries(queries);
+    }
+
+    TCHAR* PrefixQuery::toString(const TCHAR* field) const{
+        //Func - Creates a user-readable version of this query and returns it as as string
+        //Pre  - true (field may be NULL)
+        //Post - a user-readable version of this query has been returned as as string
+
+        //Instantiate a stringbuffer buffer to store the readable version temporarily
+        CL_NS(util)::StringBuffer buffer;
+        //Prepend the field unless it equals the given default field
+        if( field==NULL || _tcscmp(prefix->field(),field) != 0 ) {
+            //Append the field of prefix to the buffer
+            buffer.append(prefix->field());
+            //Append a colon
+            buffer.append(_T(":") );
+        }
+        //Append the text of the prefix
+        buffer.append(prefix->text());
+        //Append a wildchar character
+        buffer.append(_T("*"));
+        //if the boost factor is not equal to 1
+        if (getBoost() != 1.0f) {
+            //Append ^
+            buffer.append(_T("^"));
+            //Append the boost factor
+            buffer.appendFloat( getBoost(),1);
+        }
+        //Convert StringBuffer buffer to TCHAR block and return it
+        return buffer.toString();
+    }
+
+
+
+
+
+
+
+
+PrefixFilter::PrefixFilter( Term* prefix )
+{
+    this->prefix = _CL_POINTER(prefix);
+}
+
+PrefixFilter::~PrefixFilter()
+{
+    _CLDECDELETE(prefix);
+}
+
+PrefixFilter::PrefixFilter( const PrefixFilter& copy ) :
+    prefix( _CL_POINTER(copy.prefix) )
+{
+}
+
+Filter* PrefixFilter::clone() const {
+    return _CLNEW PrefixFilter(*this );
+}
+
+TCHAR* PrefixFilter::toString()
+{
+    //Instantiate a stringbuffer buffer to store the readable version temporarily
+    CL_NS(util)::StringBuffer buffer;
+    //Prepend the field when the prefix has one
+    if( prefix->field() != NULL ) {
+        //Append the field of prefix to the buffer
+        buffer.append(prefix->field());
+        //Append a colon
+        buffer.append(_T(":") );
+    }
+    //Append the text of the prefix
+    buffer.append(prefix->text());
+    buffer.append(_T("*"));
+
+    //Convert StringBuffer buffer to TCHAR block and return it
+    return buffer.toString();
+}
+
+/** Returns a BitSet with true for documents which should be permitted in
+search results, and false for those that should not. */
+BitSet* PrefixFilter::bits( IndexReader* reader )
+{
+    BitSet* bts = _CLNEW BitSet( reader->maxDoc() );
+    TermEnum* enumerator = reader->terms(prefix);
+    TermDocs* docs = reader->termDocs();
+    const TCHAR* prefixText = prefix->text();
+    const TCHAR* prefixField = prefix->field();
+    const TCHAR* tmp;
+    int32_t prefixLen = prefix->textLength();
+    Term* lastTerm = NULL;
+
+    try{
+        do{
+            lastTerm = enumerator->term(false);
+            if (lastTerm != NULL && lastTerm->field() == prefixField ){
+                //now see if term->text() starts with prefixText
+                int32_t termLen = lastTerm->textLength();
+                if ( prefixLen>termLen )
+                    break; //the prefix is longer than the term, can't be matched
+
+                tmp = lastTerm->text();
+
+                //check for prefix match in reverse, since most change will be at the end
+                //(signed index, see PrefixQuery::rewrite)
+                for ( int32_t i=prefixLen-1;i>=0;--i ){
+                    if ( tmp[i] != prefixText[i] ){
+                        tmp=NULL;//signals inequality
+                        break;
+                    }
+                }
+                if ( tmp == NULL )
+                    break;
+
+                docs->seek(enumerator);
+                while (docs->next()) {
+                    bts->set(docs->doc());
+                }
+            } else
+                break; //left the prefix's field (or ran out of terms): stop, exactly as
+                       //PrefixQuery::rewrite does, instead of scanning every remaining term
+        }while(enumerator->next());
+    } _CLFINALLY(
+        docs->close();
+        _CLDELETE(docs);
+        enumerator->close();
+        _CLDELETE(enumerator);
+    )
+
+    return bts;
+}
+
+CL_NS_END
diff --git a/src/3rdparty/clucene/src/CLucene/search/PrefixQuery.h b/src/3rdparty/clucene/src/CLucene/search/PrefixQuery.h
new file mode 100644
index 0000000..8e3f413
--- /dev/null
+++ b/src/3rdparty/clucene/src/CLucene/search/PrefixQuery.h
@@ -0,0 +1,75 @@
+/*------------------------------------------------------------------------------
+* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
+*
+* Distributable under the terms of either the Apache License (Version 2.0) or
+* the GNU Lesser General Public License, as specified in the COPYING file.
+------------------------------------------------------------------------------*/
+#ifndef _lucene_search_PrefixQuery
+#define _lucene_search_PrefixQuery
+#if defined(_LUCENE_PRAGMA_ONCE)
+# pragma once
+#endif
+
+#include "CLucene/index/Term.h"
+#include "CLucene/index/Terms.h"
+#include "CLucene/index/IndexReader.h"
+#include "SearchHeader.h"
+#include "BooleanQuery.h"
+#include "TermQuery.h"
+#include "CLucene/util/StringBuffer.h"
+
+CL_NS_DEF(search)
+    //PrefixQuery is a Query that matches documents containing terms with a specified prefix.
+
+    class PrefixQuery: public Query {
+    private:
+        CL_NS(index)::Term* prefix;
+    protected:
+        PrefixQuery(const PrefixQuery& clone);
+    public:
+
+        //Constructor. Constructs a query for terms starting with prefix
+        PrefixQuery(CL_NS(index)::Term* Prefix);
+
+        //Destructor
+        ~PrefixQuery();
+
+        //Returns the name "PrefixQuery"
+        const TCHAR* getQueryName() const;
+        static const TCHAR* getClassName();
+
+        /** Returns the prefix of this query. */
+        CL_NS(index)::Term* getPrefix(bool pointer=true);
+
+        Query* combine(Query** queries);
+        Query* rewrite(CL_NS(index)::IndexReader* reader);
+        Query* clone() const;
+        bool equals(Query * other) const;
+
+        //Creates a user-readable version of this query and returns it as as string
+        TCHAR* toString(const TCHAR* field) const;
+
+        size_t hashCode() const;
+    };
+
+
+    //PrefixFilter is a Filter that permits the documents containing terms with
+    //a specified prefix.
+    class PrefixFilter: public Filter
+    {
+    private:
+        CL_NS(index)::Term* prefix;
+    protected:
+        PrefixFilter( const PrefixFilter& copy );
+
+    public:
+        PrefixFilter(CL_NS(index)::Term* prefix);
+        ~PrefixFilter();
+
+        /** Returns a BitSet with true for documents which should be permitted in
+        search results, and false for those that should not. */
+        CL_NS(util)::BitSet* bits( CL_NS(index)::IndexReader* reader );
+
+        Filter* clone() const;
+        TCHAR* toString();
+    };
+CL_NS_END
+#endif
diff --git a/src/3rdparty/clucene/src/CLucene/search/QueryFilter.cpp b/src/3rdparty/clucene/src/CLucene/search/QueryFilter.cpp
new file mode 100644
index 0000000..2dbe2d7
--- /dev/null
+++ b/src/3rdparty/clucene/src/CLucene/search/QueryFilter.cpp
@@ -0,0 +1,73 @@
+/*------------------------------------------------------------------------------
+* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
+*
+* Distributable under the terms of either the Apache License (Version 2.0) or
+* the GNU Lesser General Public License, as specified in the COPYING file.
+------------------------------------------------------------------------------*/
+#include "CLucene/StdHeader.h"
+#include "QueryFilter.h"
+#include "IndexSearcher.h"
+
+CL_NS_DEF(search)
+CL_NS_USE(util)
+CL_NS_USE(index)
+
+
+// The filter keeps its own clone of the query.
+QueryFilter::QueryFilter( const Query* query )
+{
+    this->query = query->clone();
+}
+
+
+QueryFilter::~QueryFilter()
+{
+    _CLDELETE( query );
+}
+
+
+QueryFilter::QueryFilter( const QueryFilter& copy )
+{
+    this->query = copy.query->clone();
+}
+
+
+Filter* QueryFilter::clone() const {
+    return _CLNEW QueryFilter(*this );
+}
+
+
+// Returns "QueryFilter(<query>)" as a newly allocated string.
+TCHAR* QueryFilter::toString()
+{
+    TCHAR* qt = query->toString();
+    //A query may have no printable form (PhraseQuery::toString returns NULL
+    //for an empty phrase); treat that as an empty string instead of handing
+    //NULL to _tcslen.
+    const TCHAR* text = (qt != NULL) ? qt : _T("");
+    //14 = "QueryFilter(" (12) + ")" (1) + terminator (1)
+    size_t len = _tcslen(text) + 14;
+    TCHAR* ret = _CL_NEWARRAY( TCHAR, len );
+    ret[0] = 0;
+    _sntprintf( ret, len, _T("QueryFilter(%s)"), text );
+    //_sntprintf does not guarantee termination if the output were ever truncated
+    ret[len-1] = 0;
+    if ( qt != NULL ) {
+        _CLDELETE_CARRAY(qt);
+    }
+    return ret;
+}
+
+
+/** Returns a BitSet with true for documents which should be permitted in
+search results, and false for those that should not. */
+BitSet* QueryFilter::bits( IndexReader* reader )
+{
+    BitSet* bits = _CLNEW BitSet(reader->maxDoc());
+
+    //Run the query on a temporary searcher; the collector sets one bit per hit
+    IndexSearcher s(reader);
+    QFHitCollector hc(bits);
+    s._search(query, NULL, &hc);
+    return bits;
+}
+
+
+QueryFilter::QFHitCollector::QFHitCollector(CL_NS(util)::BitSet* bits){
+    this->bits = bits;
+}
+
+void QueryFilter::QFHitCollector::collect(const int32_t doc, const qreal score) {
+    bits->set(doc); // set bit for hit
+}
+
+
+CL_NS_END
diff --git a/src/3rdparty/clucene/src/CLucene/search/QueryFilter.h b/src/3rdparty/clucene/src/CLucene/search/QueryFilter.h
new file mode 100644
index 0000000..8d423b2
--- /dev/null
+++ b/src/3rdparty/clucene/src/CLucene/search/QueryFilter.h
@@ -0,0 +1,44 @@
+/*------------------------------------------------------------------------------
+* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
+*
+* Distributable under the terms of either the Apache License (Version 2.0) or
+* the GNU Lesser General Public License, as specified in the COPYING file.
+------------------------------------------------------------------------------*/ +#ifndef _lucene_search_QueryFilter_ +#define _lucene_search_QueryFilter_ + +#include "CLucene/util/BitSet.h" +#include "CLucene/index/IndexReader.h" +#include "SearchHeader.h" +#include "CachingWrapperFilter.h" + +CL_NS_DEF(search) + +class QueryFilter: public Filter +{ +private: + Query* query; + + class QFHitCollector: public HitCollector{ + CL_NS(util)::BitSet* bits; + public: + QFHitCollector(CL_NS(util)::BitSet* bits); + void collect(const int32_t doc, const qreal score); + }; + +protected: + QueryFilter( const QueryFilter& copy ); +public: + QueryFilter( const Query* query ); + + ~QueryFilter(); + + CL_NS(util)::BitSet* bits( CL_NS(index)::IndexReader* reader ); + + Filter *clone() const; + + TCHAR *toString(); +}; + +CL_NS_END +#endif diff --git a/src/3rdparty/clucene/src/CLucene/search/RangeFilter.cpp b/src/3rdparty/clucene/src/CLucene/search/RangeFilter.cpp new file mode 100644 index 0000000..66ee5ce --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/RangeFilter.cpp @@ -0,0 +1,150 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. 
+------------------------------------------------------------------------------*/ +#include "CLucene/StdHeader.h" +#include "RangeFilter.h" + +CL_NS_DEF(search) +CL_NS_USE(index) +CL_NS_USE(util) +CL_NS_USE(document) + + +RangeFilter::RangeFilter( const TCHAR* fieldName, const TCHAR* lowerTerm, const TCHAR* upperTerm, bool includeLower, bool includeUpper ) +{ + this->field = STRDUP_TtoT(fieldName); + if ( lowerTerm != NULL ) + this->lowerValue = STRDUP_TtoT(lowerTerm); + else + this->lowerValue = NULL; + if ( upperTerm != NULL ) + this->upperValue = STRDUP_TtoT(upperTerm); + else + this->upperValue = NULL; + this->includeLower = includeLower; + this->includeUpper = includeUpper; +} + + +/** + * Constructs a filter for field <code>fieldName</code> matching + * less than or equal to <code>upperTerm</code>. + */ +RangeFilter* RangeFilter::Less( TCHAR* fieldName, TCHAR* upperTerm ) { + return _CLNEW RangeFilter( fieldName, NULL, upperTerm, false, true ); +} + + +/** +* Constructs a filter for field <code>fieldName</code> matching +* more than or equal to <code>lowerTerm</code>. +*/ +RangeFilter* RangeFilter::More( TCHAR* fieldName, TCHAR* lowerTerm ) { + return _CLNEW RangeFilter( fieldName, lowerTerm, NULL, true, false ); +} + + +RangeFilter::~RangeFilter() +{ + _CLDELETE_CARRAY( lowerValue ); + _CLDELETE_CARRAY( field ); + _CLDELETE_CARRAY( upperValue ); +} + +// Copy constructor: duplicates the strings of <code>copy</code>; a NULL bound (open-ended range, as stored by the main constructor) stays NULL instead of being handed to STRDUP_TtoT. +RangeFilter::RangeFilter( const RangeFilter& copy ) : + field( STRDUP_TtoT(copy.field) ), + lowerValue( copy.lowerValue != NULL ? STRDUP_TtoT(copy.lowerValue) : NULL ), + upperValue( copy.upperValue != NULL ? STRDUP_TtoT(copy.upperValue) : NULL ), + includeLower( copy.includeLower ), + includeUpper( copy.includeUpper ) +{ +} + + +Filter* RangeFilter::clone() const { + return _CLNEW RangeFilter(*this ); +} + + +TCHAR* RangeFilter::toString() +{ + size_t len = (field ? _tcslen(field) : 0) + (lowerValue ? _tcslen(lowerValue) : 0) + (upperValue ?
_tcslen(upperValue) : 0) + 8; + TCHAR* ret = _CL_NEWARRAY( TCHAR, len ); + ret[0] = 0; + _sntprintf( ret, len, _T("%s: [%s-%s]"), field, (lowerValue?lowerValue:_T("")), (upperValue?upperValue:_T("")) ); + + return ret; +} + + +/** Returns a BitSet with true for documents which should be permitted in +search results, and false for those that should not. */ +BitSet* RangeFilter::bits( IndexReader* reader ) +{ + BitSet* bts = _CLNEW BitSet( reader->maxDoc() ); + Term* term = NULL; + + Term* t = _CLNEW Term( field, (lowerValue ? lowerValue : _T("")), false ); + TermEnum* enumerator = reader->terms( t ); // get enumeration of all terms after lowerValue + _CLDECDELETE( t ); + + if( enumerator->term(false) == NULL ) { + _CLDELETE( enumerator ); + return bts; + } + + bool checkLower = false; + if( !includeLower ) // make adjustments to set to exclusive + checkLower = true; + + TermDocs* termDocs = reader->termDocs(); + + try + { + do + { + term = enumerator->term(); + + if( term == NULL || _tcscmp(term->field(), field) ) + break; + + if( !checkLower || lowerValue == NULL || _tcscmp(term->text(), lowerValue) > 0 ) + { + checkLower = false; + if( upperValue != NULL ) + { + int compare = _tcscmp( upperValue, term->text() ); + + /* if beyond the upper term, or is exclusive and + * this is equal to the upper term, break out */ + if( (compare < 0) || (!includeUpper && compare == 0) ) + break; + } + + termDocs->seek( enumerator->term(false) ); + while( termDocs->next() ) { + bts->set( termDocs->doc() ); + } + } + + _CLDECDELETE( term ); + } + while( enumerator->next() ); + } + _CLFINALLY + ( + _CLDECDELETE( term ); + termDocs->close(); + _CLVDELETE( termDocs ); + enumerator->close(); + _CLDELETE( enumerator ); + ); + + return bts; +} + +CL_NS_END diff --git a/src/3rdparty/clucene/src/CLucene/search/RangeFilter.h b/src/3rdparty/clucene/src/CLucene/search/RangeFilter.h new file mode 100644 index 0000000..0865e35 --- /dev/null +++ 
b/src/3rdparty/clucene/src/CLucene/search/RangeFilter.h @@ -0,0 +1,51 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. +------------------------------------------------------------------------------*/ + +#ifndef _lucene_search_RangeFilter_ +#define _lucene_search_RangeFilter_ + +#include "CLucene/document/DateField.h" +#include "CLucene/index/Term.h" +#include "CLucene/index/Terms.h" +#include "CLucene/index/IndexReader.h" +#include "CLucene/util/BitSet.h" +#include "CLucene/search/Filter.h" + +CL_NS_DEF(search) + +class RangeFilter: public Filter +{ +private: + const TCHAR* field; + TCHAR* lowerValue; + TCHAR* upperValue; + bool includeLower; + bool includeUpper; + +protected: + RangeFilter( const RangeFilter& copy ); + +public: + RangeFilter( const TCHAR* fieldName, const TCHAR* lowerValue, const TCHAR* upperValue, bool includeLower, bool includeUpper ); + + static RangeFilter* Less( TCHAR* fieldName, TCHAR* upperTerm ); + + static RangeFilter* More( TCHAR* fieldName, TCHAR* lowerTerm ); + + ~RangeFilter(); + + /** Returns a BitSet with true for documents which should be permitted in + search results, and false for those that should not. 
*/ + CL_NS(util)::BitSet* bits( CL_NS(index)::IndexReader* reader ); + + Filter* clone() const; + + TCHAR* toString(); +}; + +CL_NS_END +#endif diff --git a/src/3rdparty/clucene/src/CLucene/search/RangeQuery.cpp b/src/3rdparty/clucene/src/CLucene/search/RangeQuery.cpp new file mode 100644 index 0000000..4fc2420 --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/RangeQuery.cpp @@ -0,0 +1,204 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. +------------------------------------------------------------------------------*/ +#include "CLucene/StdHeader.h" +#include "RangeQuery.h" + +#include "SearchHeader.h" +#include "Scorer.h" +#include "BooleanQuery.h" +#include "TermQuery.h" + +#include "CLucene/index/Term.h" +#include "CLucene/index/Terms.h" +#include "CLucene/index/IndexReader.h" +#include "CLucene/util/StringBuffer.h" + + +CL_NS_USE(index) +CL_NS_USE(util) +CL_NS_DEF(search) + + RangeQuery::RangeQuery(Term* lowerTerm, Term* upperTerm, const bool Inclusive){ + //Func - Constructor + //Pre - (LowerTerm != NULL OR UpperTerm != NULL) AND + // if LowerTerm and UpperTerm are valid pointer then the fieldnames must be the same + //Post - The instance has been created + + if (lowerTerm == NULL && upperTerm == NULL) + { + _CLTHROWA(CL_ERR_IllegalArgument,"At least one term must be non-null"); + } + if (lowerTerm != NULL && upperTerm != NULL && lowerTerm->field() != upperTerm->field()) + { + _CLTHROWA(CL_ERR_IllegalArgument,"Both terms must be for the same field"); + } + + // if we have a lowerTerm, start there. 
otherwise, start at beginning + if (lowerTerm != NULL) { + this->lowerTerm = _CL_POINTER(lowerTerm); + } + else { + this->lowerTerm = _CLNEW Term(upperTerm, LUCENE_BLANK_STRING); + } + this->upperTerm = (upperTerm != NULL ? _CL_POINTER(upperTerm) : NULL); + this->inclusive = Inclusive; + } + RangeQuery::RangeQuery(const RangeQuery& clone): + Query(clone){ + this->inclusive = clone.inclusive; + this->upperTerm = (clone.upperTerm != NULL ? _CL_POINTER(clone.upperTerm) : NULL ); + this->lowerTerm = (clone.lowerTerm != NULL ? _CL_POINTER(clone.lowerTerm) : NULL ); + } + Query* RangeQuery::clone() const{ + return _CLNEW RangeQuery(*this); + } + + RangeQuery::~RangeQuery() { + //Func - Destructor + //Pre - true + //Post - The instance has been destroyed + + _CLDECDELETE(lowerTerm); + _CLDECDELETE(upperTerm); + } + + /** Returns a hash code value for this object.*/ + size_t RangeQuery::hashCode() const { + return Similarity::floatToByte(getBoost()) ^ + (lowerTerm != NULL ? lowerTerm->hashCode() : 0) ^ + (upperTerm != NULL ? upperTerm->hashCode() : 0) ^ + (this->inclusive ? 1 : 0); + } + + const TCHAR* RangeQuery::getQueryName() const{ + return getClassName(); + } + const TCHAR* RangeQuery::getClassName(){ + return _T("RangeQuery"); + } + + Query* RangeQuery::combine(Query** queries) { + return Query::mergeBooleanQueries(queries); + } + + bool RangeQuery::equals(Query * other) const{ + if (!(other->instanceOf(RangeQuery::getClassName()))) + return false; + // Terms are compared through the members: getLowerTerm()/getUpperTerm() default to pointer=true and go through _CL_POINTER, which would hand out references nobody releases; a term may also be NULL (open bound), which must only equal another NULL. + RangeQuery* rq = (RangeQuery*)other; + bool ret = (this->getBoost() == rq->getBoost()) + && (this->isInclusive() == rq->isInclusive()) + && (lowerTerm == NULL ? rq->lowerTerm == NULL : (rq->lowerTerm != NULL && lowerTerm->equals(rq->lowerTerm))) + && (upperTerm == NULL ? rq->upperTerm == NULL : (rq->upperTerm != NULL && upperTerm->equals(rq->upperTerm))); + + return ret; + } + + + /** + * FIXME: Describe <code>rewrite</code> method here.
+ * + * @param reader an <code>IndexReader</code> value + * @return a <code>Query</code> value + * @exception IOException if an error occurs + */ + Query* RangeQuery::rewrite(IndexReader* reader){ + + BooleanQuery* query = _CLNEW BooleanQuery; + TermEnum* enumerator = reader->terms(lowerTerm); + Term* lastTerm = NULL; + try { + bool checkLower = false; + if (!inclusive) // make adjustments to set to exclusive + checkLower = true; + + const TCHAR* testField = getField(); + do { + lastTerm = enumerator->term(); + if (lastTerm != NULL && lastTerm->field() == testField ) { + if (!checkLower || _tcscmp(lastTerm->text(),lowerTerm->text()) > 0) { + checkLower = false; + if (upperTerm != NULL) { + int compare = _tcscmp(upperTerm->text(),lastTerm->text()); + /* if beyond the upper term, or is exclusive and + * this is equal to the upper term, break out */ + if ((compare < 0) || (!inclusive && compare == 0)) + break; + } + TermQuery* tq = _CLNEW TermQuery(lastTerm); // found a match + tq->setBoost(getBoost()); // set the boost + query->add(tq, true, false, false); // add to query + } + }else { + break; + } + _CLDECDELETE(lastTerm); + } + while (enumerator->next()); + }catch(...){ + _CLDECDELETE(lastTerm); //always need to delete this + _CLDELETE(query); //in case of error, delete the query + enumerator->close(); + _CLDELETE(enumerator); + throw; //rethrow + } + _CLDECDELETE(lastTerm); //always need to delete this + enumerator->close(); + _CLDELETE(enumerator); + + return query; + } + + /** Prints a user-readable version of this query. */ + TCHAR* RangeQuery::toString(const TCHAR* field) const + { + StringBuffer buffer; + if ( field==NULL || _tcscmp(getField(),field)!=0 ) + { + buffer.append( getField() ); + buffer.append( _T(":")); + } + buffer.append(inclusive ? _T("[") : _T("{")); + buffer.append(lowerTerm != NULL ? lowerTerm->text() : _T("NULL")); + buffer.append(_T(" TO ")); + buffer.append(upperTerm != NULL ? upperTerm->text() : _T("NULL")); + buffer.append(inclusive ? 
_T("]") : _T("}")); + if (getBoost() != 1.0f) + { + buffer.append( _T("^")); + buffer.appendFloat( getBoost(),1 ); + } + return buffer.toString(); + } + + + const TCHAR* RangeQuery::getField() const + { + return (lowerTerm != NULL ? lowerTerm->field() : upperTerm->field()); + } + + /** Returns the lower term of this range query */ + Term* RangeQuery::getLowerTerm(bool pointer) const { + if ( pointer ) + return _CL_POINTER(lowerTerm); + else + return lowerTerm; + } + + /** Returns the upper term of this range query */ + Term* RangeQuery::getUpperTerm(bool pointer) const { + if ( pointer ) + return _CL_POINTER(upperTerm); + else + return upperTerm; + } + + /** Returns <code>true</code> if the range query is inclusive */ + bool RangeQuery::isInclusive() const { return inclusive; } + + +CL_NS_END diff --git a/src/3rdparty/clucene/src/CLucene/search/RangeQuery.h b/src/3rdparty/clucene/src/CLucene/search/RangeQuery.h new file mode 100644 index 0000000..9a7733c --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/RangeQuery.h @@ -0,0 +1,71 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. +------------------------------------------------------------------------------*/ +#ifndef _lucene_search_RangeQuery_ +#define _lucene_search_RangeQuery_ +#if defined(_LUCENE_PRAGMA_ONCE) +# pragma once +#endif + +#include "SearchHeader.h" +#include "Scorer.h" +#include "TermQuery.h" + +#include "CLucene/index/Term.h" +#include "CLucene/index/Terms.h" + +#include "CLucene/util/StringBuffer.h" + + +CL_NS_DEF(search) + /** Constructs a query selecting all terms greater than + * <code>lowerTerm</code> but less than <code>upperTerm</code>. 
+ * There must be at least one term and either term may be null, + * in which case there is no bound on that side, but if there are + * two terms, both terms <b>must</b> be for the same field. + */ + class RangeQuery: public Query + { + private: + CL_NS(index)::Term* lowerTerm; + CL_NS(index)::Term* upperTerm; + bool inclusive; + protected: + RangeQuery(const RangeQuery& clone); + + public: + // Constructs a query selecting all terms greater than + // <code>lowerTerm</code> but less than <code>upperTerm</code>. + // There must be at least one term and either term may be NULL-- + // in which case there is no bound on that side, but if there are + // two term, both terms <b>must</b> be for the same field. + RangeQuery(CL_NS(index)::Term* LowerTerm, CL_NS(index)::Term* UpperTerm, const bool Inclusive); + ~RangeQuery(); + + const TCHAR* getQueryName() const; + static const TCHAR* getClassName(); + + Query* rewrite(CL_NS(index)::IndexReader* reader); + + Query* combine(Query** queries); + + // Prints a user-readable version of this query. + TCHAR* toString(const TCHAR* field) const; + + Query* clone() const; + + bool equals(Query * other) const; + + CL_NS(index)::Term* getLowerTerm(bool pointer=true) const; + CL_NS(index)::Term* getUpperTerm(bool pointer=true) const; + bool isInclusive() const; + const TCHAR* getField() const; + + size_t hashCode() const; + }; + +CL_NS_END +#endif diff --git a/src/3rdparty/clucene/src/CLucene/search/Scorer.h b/src/3rdparty/clucene/src/CLucene/search/Scorer.h new file mode 100644 index 0000000..0d1d435 --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/Scorer.h @@ -0,0 +1,80 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. 
+------------------------------------------------------------------------------*/ +#ifndef _lucene_search_Scorer_ +#define _lucene_search_Scorer_ + +#if defined(_LUCENE_PRAGMA_ONCE) +# pragma once +#endif + +#include "Similarity.h" +#include "SearchHeader.h" +#include "Explanation.h" + +CL_NS_DEF(search) + /** Expert: Implements scoring for a class of queries. */ +class Scorer: LUCENE_BASE { + private: + Similarity* similarity; + protected: + /** Constructs a Scorer. */ + Scorer(Similarity* similarity) { + this->similarity = similarity; + } + public: + virtual ~Scorer(){ + } + + /** Returns the Similarity implementation used by this scorer. */ + Similarity* getSimilarity() const{ + return this->similarity; + } + + /** Scores all documents and passes them to a collector. */ + void score(HitCollector* hc) { + while (next()) { + hc->collect(doc(), score()); + } + } + + /** Advance to the next document matching the query. Returns true iff there + * is another match. */ + virtual bool next() = 0; + + /** Returns the current document number. Initially invalid, until {@link + * #next()} is called the first time. */ + virtual int32_t doc() const = 0; + + /** Returns the score of the current document. Initially invalid, until + * {@link #next()} is called the first time. */ + virtual qreal score() = 0; + + /** Skips to the first match beyond the current whose document number is + * greater than or equal to <i>target</i>. <p>Returns true iff there is such + * a match. <p>Behaves as if written: <pre> + * boolean skipTo(int32_t target) { + * do { + * if (!next()) + * return false; + * } while (target > doc()); + * return true; + * } + * </pre> + * Most implementations are considerably more efficient than that. + */ + virtual bool skipTo(int32_t target) = 0; + + /** Returns an explanation of the score for <code>doc</code>. 
*/ + virtual void explain(int32_t doc, Explanation* ret) = 0; + + + /** Returns a string which explains the object */ + virtual TCHAR* toString() = 0; + + }; +CL_NS_END +#endif diff --git a/src/3rdparty/clucene/src/CLucene/search/SearchHeader.cpp b/src/3rdparty/clucene/src/CLucene/search/SearchHeader.cpp new file mode 100644 index 0000000..56e4ad5 --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/SearchHeader.cpp @@ -0,0 +1,141 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. +------------------------------------------------------------------------------*/ +#include "CLucene/StdHeader.h" +#include "SearchHeader.h" +#include "BooleanQuery.h" +#include "FieldDocSortedHitQueue.h" + +CL_NS_USE(index) +CL_NS_DEF(search) + +CL_NS(document)::Document* Searchable::doc(const int32_t i){ + CL_NS(document)::Document* ret = _CLNEW CL_NS(document)::Document; + if (!doc(i,ret) ) + _CLDELETE(ret); + return ret; +} + +//static - collects the clauses of every entry of the NULL-terminated array <code>queries</code> (each entry is cast to BooleanQuery* without a type check) and returns them in one newly allocated BooleanQuery +Query* Query::mergeBooleanQueries(Query** queries) { + CL_NS(util)::CLVector<BooleanClause*> allClauses; + int32_t i = 0; + while ( queries[i] != NULL ){ + BooleanQuery* bq = (BooleanQuery*)queries[i]; + + int32_t size = bq->getClauseCount(); + BooleanClause** clauses = _CL_NEWARRAY(BooleanClause*, size); + bq->getClauses(clauses); + + for (int32_t j = 0;j<size;++j ){ + // every clause is collected; j is advanced by the loop header only + allClauses.push_back(clauses[j]); + } + _CLDELETE_ARRAY(clauses); + i++; + } + + BooleanQuery* result = _CLNEW BooleanQuery(); + // NOTE(review): the clause pointers are handed over as-is, so they stay shared with the source queries - confirm ownership in BooleanQuery::add + for (CL_NS(util)::CLVector<BooleanClause*>::iterator itr = allClauses.begin(); itr != allClauses.end(); ++itr ) { + result->add(*itr); + } + return result; +} + +Query::Query(const Query& clone):boost(clone.boost){ + //constructor +} +Weight* Query::_createWeight(Searcher* searcher){ +
_CLTHROWA(CL_ERR_UnsupportedOperation,"UnsupportedOperationException: Query::_createWeight"); +} + +Query::Query(): + boost(1.0f) +{ + //constructor +} +Query::~Query(){ +} + +/** Expert: called to re-write queries into primitive queries. */ +Query* Query::rewrite(CL_NS(index)::IndexReader* reader){ + return this; +} + +Query* Query::combine(Query** queries){ + _CLTHROWA(CL_ERR_UnsupportedOperation,"UnsupportedOperationException: Query::combine"); +} +Similarity* Query::getSimilarity(Searcher* searcher) { + return searcher->getSimilarity(); +} +bool Query::instanceOf(const TCHAR* other) const{ + const TCHAR* t = getQueryName(); + if ( t==other || _tcscmp( t, other )==0 ) + return true; + else + return false; +} +TCHAR* Query::toString() const{ + return toString(LUCENE_BLANK_STRING); +} + +void Query::setBoost(qreal b) { boost = b; } + +qreal Query::getBoost() const { return boost; } + +Weight* Query::weight(Searcher* searcher){ + Query* query = searcher->rewrite(this); + Weight* weight = query->_createWeight(searcher); + qreal sum = weight->sumOfSquaredWeights(); + qreal norm = getSimilarity(searcher)->queryNorm(sum); + weight->normalize(norm); + return weight; +} + +TopFieldDocs::TopFieldDocs (int32_t totalHits, FieldDoc** fieldDocs, int32_t scoreDocsLen, SortField** fields): + TopDocs (totalHits, NULL, scoreDocsLen) +{ + this->fields = fields; + this->fieldDocs = fieldDocs; + this->scoreDocs = _CL_NEWARRAY(ScoreDoc,scoreDocsLen); + for (int32_t i=0;i<scoreDocsLen;i++ ) + this->scoreDocs[i] = this->fieldDocs[i]->scoreDoc; +} +TopFieldDocs::~TopFieldDocs(){ + if ( fieldDocs ){ + for (int32_t i=0;i<scoreDocsLength;i++) + _CLDELETE(fieldDocs[i]); + _CLDELETE_ARRAY(fieldDocs); + } + if ( fields != NULL ){ + for ( int i=0;fields[i]!=NULL;i++ ) + _CLDELETE(fields[i]); + _CLDELETE_ARRAY(fields); + } +} + +TopDocs::TopDocs(const int32_t th, ScoreDoc*sds, int32_t scoreDocsLen): + totalHits(th), + scoreDocsLength(scoreDocsLen), + scoreDocs(sds) +{ +//Func - Constructor 
+//Pre - sds may or may not be NULL +// sdLength >= 0 +//Post - The instance has been created + +} + +TopDocs::~TopDocs(){ +//Func - Destructor +//Pre - true +//Post - The instance has been destroyed + + _CLDELETE_ARRAY(scoreDocs); +} + +CL_NS_END diff --git a/src/3rdparty/clucene/src/CLucene/search/SearchHeader.h b/src/3rdparty/clucene/src/CLucene/search/SearchHeader.h new file mode 100644 index 0000000..4a896a5 --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/SearchHeader.h @@ -0,0 +1,456 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. +------------------------------------------------------------------------------*/ +#ifndef _lucene_search_SearchHeader_ +#define _lucene_search_SearchHeader_ + +#if defined(_LUCENE_PRAGMA_ONCE) +# pragma once +#endif + +#include "CLucene/index/IndexReader.h" +#include "CLucene/index/Term.h" +#include "Filter.h" +#include "CLucene/document/Document.h" +#include "Sort.h" +#include "CLucene/util/VoidList.h" +#include "Explanation.h" +#include "Similarity.h" + +CL_NS_DEF(search) + + //predefine classes + class Scorer; + class Query; + class Hits; + class Sort; + class FieldDoc; + class TopFieldDocs; + + /** Expert: Returned by low-level search implementations. + * @see TopDocs */ + struct ScoreDoc { + /** Expert: A hit document's number. + * @see Searcher#doc(int32_t) + */ + int32_t doc; + + /** Expert: The score of this document for the query. */ + qreal score; + }; + + /** Expert: Returned by low-level search implementations. + * @see Searcher#search(Query,Filter,int32_t) */ + class TopDocs:LUCENE_BASE { + public: + /** Expert: The total number of hits for the query. + * @see Hits#length() + */ + int32_t totalHits; + + /** Expert: The top hits for the query. 
*/ + ScoreDoc* scoreDocs; + int32_t scoreDocsLength; + + /** Expert: Constructs a TopDocs. TopDocs takes ownership of the ScoreDoc array*/ + TopDocs(const int32_t th, ScoreDoc* sds, int32_t scoreDocsLength); + ~TopDocs(); + }; + + // Lower-level search API. + // @see Searcher#search(Query,HitCollector) + class HitCollector: LUCENE_BASE { + public: + /** Called once for every non-zero scoring document, with the document number + * and its score. + * + * <P>If, for example, an application wished to collect all of the hits for a + * query in a BitSet, then it might:<pre> + * Searcher searcher = new IndexSearcher(indexReader); + * final BitSet bits = new BitSet(indexReader.maxDoc()); + * searcher.search(query, new HitCollector() { + * public void collect(int32_t doc, float score) { + * bits.set(doc); + * } + * }); + * </pre> + * + * <p>Note: This is called in an inner search loop. For good search + * performance, implementations of this method should not call + * {@link Searcher#doc(int32_t)} or + * {@link IndexReader#document(int32_t)} on every + * document number encountered. Doing so can slow searches by an order + * of magnitude or more. + * <p>Note: The <code>score</code> passed to this method is a raw score. + * In other words, the score will not necessarily be a float whose value is + * between 0 and 1. + */ + virtual void collect(const int32_t doc, const qreal score) = 0; + virtual ~HitCollector(){} + }; + + /** Expert: Calculate query weights and build query scorers. + * + * <p>A Weight is constructed by a query, given a Searcher ({@link + * Query#_createWeight(Searcher)}). The {@link #sumOfSquaredWeights()} method + * is then called on the top-level query to compute the query normalization + * factor (@link Similarity#queryNorm(qreal)}). This factor is then passed to + * {@link #normalize(qreal)}. At this point the weighting is complete and a + * scorer may be constructed by calling {@link #scorer(IndexReader)}. 
+ */ + class Weight: LUCENE_BASE { + public: + virtual ~Weight(){ + }; + + /** The query that this concerns. */ + virtual Query* getQuery() = 0; + + /** The weight for this query. */ + virtual qreal getValue() = 0; + + /** The sum of squared weights of contained query clauses. */ + virtual qreal sumOfSquaredWeights() = 0; + + /** Assigns the query normalization factor to this. */ + virtual void normalize(qreal norm) = 0; + + /** Constructs a scorer for this. */ + virtual Scorer* scorer(CL_NS(index)::IndexReader* reader) = 0; + + /** An explanation of the score computation for the named document. */ + virtual void explain(CL_NS(index)::IndexReader* reader, int32_t doc, Explanation* ret) = 0; + + virtual TCHAR* toString(){ + return STRDUP_TtoT(_T("Weight")); + } + }; + + class HitDoc:LUCENE_BASE { + public: + qreal score; + int32_t id; + CL_NS(document)::Document* doc; + + HitDoc* next; // in doubly-linked cache + HitDoc* prev; // in doubly-linked cache + + HitDoc(const qreal s, const int32_t i); + ~HitDoc(); + }; + + + + // A ranked list of documents, used to hold search results. + class Hits:LUCENE_BASE { + private: + Query* query; + Searcher* searcher; + Filter* filter; + const Sort* sort; + + size_t _length; // the total number of hits + CL_NS(util)::CLVector<HitDoc*, CL_NS(util)::Deletor::Object<HitDoc> > hitDocs; // cache of hits retrieved + + HitDoc* first; // head of LRU cache + HitDoc* last; // tail of LRU cache + int32_t numDocs; // number cached + int32_t maxDocs; // max to cache + + public: + Hits(Searcher* s, Query* q, Filter* f, const Sort* sort=NULL); + ~Hits(); + + /** Returns the total number of hits available in this set. */ + int32_t length() const; + + /** Returns the stored fields of the n<sup>th</sup> document in this set. + <p>Documents are cached, so that repeated requests for the same element may + return the same Document object. + * + * @memory Memory belongs to the hits object. Don't delete the return value. 
+ */ + CL_NS(document)::Document& doc(const int32_t n); + + /** Returns the id for the nth document in this set. */ + int32_t id (const int32_t n); + + /** Returns the score for the nth document in this set. */ + qreal score(const int32_t n); + + private: + // Tries to add new documents to hitDocs. + // Ensures that the hit numbered <code>_min</code> has been retrieved. + void getMoreDocs(const size_t _min); + + HitDoc* getHitDoc(const size_t n); + + void addToFront(HitDoc* hitDoc); + + void remove(const HitDoc* hitDoc); + + }; + + /** The interface for search implementations. + * + * <p>Implementations provide search over a single index, over multiple + * indices, and over indices on remote servers. + */ + class Searchable: LUCENE_BASE { + public: + virtual ~Searchable(){ + } + + /** Lower-level search API. + * + * <p>{@link HitCollector#collect(int32_t,qreal)} is called for every non-zero + * scoring document. + * + * <p>Applications should only use this if they need <i>all</i> of the + * matching documents. The high-level search API ({@link + * Searcher#search(Query*)}) is usually more efficient, as it skips + * non-high-scoring hits. + * + * @param query to match documents + * @param filter if non-null, a bitset used to eliminate some documents + * @param results to receive hits + */ + virtual void _search(Query* query, Filter* filter, HitCollector* results) = 0; + + /** Frees resources associated with this Searcher. + * Be careful not to call this method while you are still using objects + * like {@link Hits}. + */ + virtual void close() = 0; + + /** Expert: Returns the number of documents containing <code>term</code>. + * Called by search code to compute term weights. + * @see IndexReader#docFreq(Term). + */ + virtual int32_t docFreq(const CL_NS(index)::Term* term) const = 0; + + /** Expert: Returns one greater than the largest possible document number. + * Called by search code to compute term weights. + * @see IndexReader#maxDoc(). 
+ */ + virtual int32_t maxDoc() const = 0; + + /** Expert: Low-level search implementation. Finds the top <code>n</code> + * hits for <code>query</code>, applying <code>filter</code> if non-null. + * + * <p>Called by {@link Hits}. + * + * <p>Applications should usually call {@link Searcher#search(Query*)} or + * {@link Searcher#search(Query*,Filter*)} instead. + */ + virtual TopDocs* _search(Query* query, Filter* filter, const int32_t n) = 0; + + /** Expert: Returns the stored fields of document <code>i</code>. + * Called by {@link HitCollector} implementations. + * @see IndexReader#document(int32_t). + */ + virtual bool doc(int32_t i, CL_NS(document)::Document* d) = 0; + _CL_DEPRECATED( doc(i, document) ) CL_NS(document)::Document* doc(const int32_t i); + + /** Expert: called to re-write queries into primitive queries. */ + virtual Query* rewrite(Query* query) = 0; + + /** Returns an Explanation that describes how <code>doc</code> scored against + * <code>query</code>. + * + * <p>This is intended to be used in developing Similarity implementations, + * and, for good performance, should not be displayed with every hit. + * Computing an explanation is as expensive as executing the query over the + * entire index. + */ + virtual void explain(Query* query, int32_t doc, Explanation* ret) = 0; + + /** Expert: Low-level search implementation with arbitrary sorting. Finds + * the top <code>n</code> hits for <code>query</code>, applying + * <code>filter</code> if non-null, and sorting the hits by the criteria in + * <code>sort</code>. + * + * <p>Applications should usually call {@link + * Searcher#search(Query,Filter,Sort)} instead. + */ + virtual TopFieldDocs* _search(Query* query, Filter* filter, const int32_t n, const Sort* sort) = 0; + }; + + + + /** An abstract base class for search implementations. + * Implements some common utility methods. + */ + class Searcher:public Searchable { + private: + /** The Similarity implementation used by this searcher. 
*/ + Similarity* similarity; + + public: + Searcher(){ + similarity = Similarity::getDefault(); + } + virtual ~Searcher(){ + } + + // Returns the documents matching <code>query</code>. + Hits* search(Query* query) { + return search(query, (Filter*)NULL ); + } + + // Returns the documents matching <code>query</code> and + // <code>filter</code>. + Hits* search(Query* query, Filter* filter) { + return _CLNEW Hits(this, query, filter); + } + + /** Returns documents matching <code>query</code> sorted by + * <code>sort</code>. + */ + Hits* search(Query* query, const Sort* sort){ + return _CLNEW Hits(this, query, NULL, sort); + } + + /** Returns documents matching <code>query</code> and <code>filter</code>, + * sorted by <code>sort</code>. + */ + Hits* search(Query* query, Filter* filter, const Sort* sort){ + return _CLNEW Hits(this, query, filter, sort); + } + + /** Lower-level search API. + * + * <p>{@link HitCollector#collect(int32_t ,qreal)} is called for every non-zero + * scoring document. + * + * <p>Applications should only use this if they need <i>all</i> of the + * matching documents. The high-level search API ({@link + * Searcher#search(Query*)}) is usually more efficient, as it skips + * non-high-scoring hits. + * <p>Note: The <code>score</code> passed to this method is a raw score. + * In other words, the score will not necessarily be a float whose value is + * between 0 and 1. Forwards to the (query, filter, results) overload with a NULL filter. + */ + void _search(Query* query, HitCollector* results) { + static_cast<Searchable*>(this)->_search(query, NULL, results); // go through the base pointer: this overload hides the base ones, and a qualified Searchable::_search(...) call would bypass virtual dispatch and target the pure virtual itself + } + + /** Expert: Set the Similarity implementation used by this Searcher. + * + * @see Similarity#setDefault(Similarity) + */ + void setSimilarity(Similarity* similarity) { + this->similarity = similarity; + } + + /** Expert: Return the Similarity implementation used by this Searcher. + * + * <p>This defaults to the current value of {@link Similarity#getDefault()}. + */ + Similarity* getSimilarity(){ + return this->similarity; + } + }; + + /** The abstract base class for queries. 
+ <p>Instantiable subclasses are: + <ul> + <li> {@link TermQuery} + <li> {@link MultiTermQuery} + <li> {@link BooleanQuery} + <li> {@link WildcardQuery} + <li> {@link PhraseQuery} + <li> {@link PrefixQuery} + <li> {@link PhrasePrefixQuery} + <li> {@link FuzzyQuery} + <li> {@link RangeQuery} + <li> {@link spans.SpanQuery} + </ul> + <p>A parser for queries is contained in: + <ul> + <li>{@link queryParser.QueryParser QueryParser} + </ul> + */ + class Query :LUCENE_BASE { + private: + // query boost factor + qreal boost; + protected: + Query(const Query& clone); + public: + Query(); + virtual ~Query(); + + /** Sets the boost for this query clause to <code>b</code>. Documents + * matching this clause will (in addition to the normal weightings) have + * their score multiplied by <code>b</code>. + */ + void setBoost(qreal b); + + /** Gets the boost for this clause. Documents matching + * this clause will (in addition to the normal weightings) have their score + * multiplied by the boost. The boost is 1.0 by default. + */ + qreal getBoost() const; + + /** Expert: Constructs and initializes a Weight for a top-level query. */ + Weight* weight(Searcher* searcher); + + /** Expert: called to re-write queries into primitive queries. */ + virtual Query* rewrite(CL_NS(index)::IndexReader* reader); + + /** Expert: called when re-writing queries under MultiSearcher. + * + * <p>Only implemented by derived queries, with no + * {@link #_createWeight(Searcher)} implementation. + */ + virtual Query* combine(Query** queries); + + /** Expert: merges the clauses of a set of BooleanQuery's into a single + * BooleanQuery. + * + *<p>A utility for use by {@link #combine(Query[])} implementations. + */ + static Query* mergeBooleanQueries(Query** queries); + + /** Expert: Returns the Similarity implementation to be used for this query. + * Subclasses may override this method to specify their own Similarity + * implementation, perhaps one that delegates through that of the Searcher. 
+ * By default the Searcher's Similarity implementation is returned.*/ + Similarity* getSimilarity(Searcher* searcher); + + /** Returns a clone of this query. */ + virtual Query* clone() const = 0; + virtual const TCHAR* getQueryName() const = 0; + bool instanceOf(const TCHAR* other) const; + + /** Prints a query to a string, with <code>field</code> as the default field + * for terms. <p>The representation used is one that is readable by + * {@link queryParser.QueryParser QueryParser} + * (although, if the query was created by the parser, the printed + * representation may not be exactly what was parsed). + */ + virtual TCHAR* toString(const TCHAR* field) const = 0; + + virtual bool equals(Query* other) const = 0; + virtual size_t hashCode() const = 0; + + /** Prints a query to a string. */ + TCHAR* toString() const; + + + /** Expert: Constructs an appropriate Weight implementation for this query. + * + * <p>Only implemented by primitive queries, which re-write to themselves. + * <i>This is an Internal function</i> + */ + virtual Weight* _createWeight(Searcher* searcher); + + }; + + +CL_NS_END +#endif diff --git a/src/3rdparty/clucene/src/CLucene/search/Similarity.cpp b/src/3rdparty/clucene/src/CLucene/search/Similarity.cpp new file mode 100644 index 0000000..d33a036 --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/Similarity.cpp @@ -0,0 +1,233 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. 
+------------------------------------------------------------------------------*/ +#include "CLucene/StdHeader.h" +#include "Similarity.h" + +#include "CLucene/index/Term.h" +#include "SearchHeader.h" + +CL_NS_USE(index) +CL_NS_DEF(search) + +#ifdef _CL_HAVE_NO_FLOAT_BYTE + #if defined(_LUCENE_PRAGMA_WARNINGS) + #pragma message ("==================Using fallback float<->byte encodings!!!==================") + #else + #warning "==================Using fallback float<->byte encodings!!!==================" + #endif + + //if the autoconf figured out that we can't do the conversions properly, then + //we fall back on the old, inaccurate way of doing things. + qreal NORM_TABLE[] = { + 0.0,5.820766E-10,6.9849193E-10,8.1490725E-10,9.313226E-10,1.1641532E-9,1.3969839E-9, + 1.6298145E-9,1.8626451E-9,2.3283064E-9,2.7939677E-9,3.259629E-9,3.7252903E-9, + 4.656613E-9,5.5879354E-9,6.519258E-9,7.4505806E-9,9.313226E-9,1.1175871E-8,1.3038516E-8, + 1.4901161E-8,1.8626451E-8,2.2351742E-8,2.6077032E-8,2.9802322E-8,3.7252903E-8,4.4703484E-8, + 5.2154064E-8,5.9604645E-8,7.4505806E-8,8.940697E-8,1.0430813E-7,1.1920929E-7,1.4901161E-7, + 1.7881393E-7,2.0861626E-7,2.3841858E-7,2.9802322E-7,3.5762787E-7,4.172325E-7,4.7683716E-7, + 5.9604645E-7,7.1525574E-7,8.34465E-7,9.536743E-7,1.1920929E-6,1.4305115E-6,1.66893E-6, + 1.9073486E-6,2.3841858E-6,2.861023E-6,3.33786E-6,3.8146973E-6,4.7683716E-6,5.722046E-6, + 6.67572E-6,7.6293945E-6,9.536743E-6,1.1444092E-5,1.335144E-5,1.5258789E-5,1.9073486E-5, + 2.2888184E-5,2.670288E-5,3.0517578E-5,3.8146973E-5,4.5776367E-5,5.340576E-5,6.1035156E-5, + 7.6293945E-5,9.1552734E-5,1.0681152E-4,1.2207031E-4,1.5258789E-4,1.8310547E-4,2.1362305E-4, + 2.4414062E-4,3.0517578E-4,3.6621094E-4,4.272461E-4,4.8828125E-4,6.1035156E-4,7.324219E-4, + 8.544922E-4,9.765625E-4,0.0012207031,0.0014648438,0.0017089844,0.001953125,0.0024414062, + 0.0029296875,0.0034179688,0.00390625,0.0048828125,0.005859375,0.0068359375, + 
0.0078125,0.009765625,0.01171875,0.013671875,0.015625,0.01953125,0.0234375, + 0.02734375,0.03125,0.0390625,0.046875,0.0546875,0.0625,0.078125,0.09375,0.109375, + 0.125,0.15625,0.1875,0.21875,0.25,0.3125,0.375,0.4375,0.5,0.625,0.75, + 0.875,1.0,1.25,1.5,1.75,2,2.5,3,3.5,4.0,5.0,6.0,7.0,8.0,10.0,12.0,14.0,16.0,20.0,24.0,28.0,32.0,40.0,48.0,56.0, + 64.0,80.0,96.0,112.0,128.0,160.0,192.0,224.0,256.0,320.0,384.0,448.0,512.0,640.0,768.0,896.0,1024.0,1280.0,1536.0,1792.0, + 2048.0,2560.0,3072.0,3584.0,4096.0,5120.0,6144.0,7168.0,8192.0,10240.0,12288.0,14336.0,16384.0,20480.0,24576.0, + 28672.0,32768.0,40960.0,49152.0,57344.0,65536.0,81920.0,98304.0,114688.0,131072.0,163840.0,196608.0, + 229376.0,262144.0,327680.0,393216.0,458752.0,524288.0,655360.0,786432.0,917504.0,1048576.0,1310720.0, + 1572864.0,1835008.0,2097152.0,2621440.0,3145728.0,3670016.0,4194304.0,5242880.0,6291456.0,7340032.0, + 8388608.0,10485760.0,12582912.0,14680064.0,16777216.0,20971520.0,25165824.0,29360128.0,33554432.0, + 41943040.0,50331648.0,58720256.0,67108864.0,83886080.0,100663296.0,117440512.0,134217728.0, + 167772160.0,201326592.0,234881024.0,268435456.0,335544320.0,402653184.0,469762048.0,536870912.0, + 671088640.0,805306368.0,939524096.0,1073741824.0,1342177280.0,1610612736.0,1879048192.0, + 2147483648.0,2684354560.0,3221225472.0,3758096384.0,4294967296.0,5368709120.0,6442450944.0,7516192768.0 + }; + + qreal Similarity::byteToFloat(uint8_t b) { + return NORM_TABLE[b]; + } + + uint8_t Similarity::floatToByte(qreal f) { + return Similarity::encodeNorm(f); + } + +#else + + /** Cache of decoded bytes. */ + qreal NORM_TABLE[256]; + bool NORM_TABLE_initd=false; + + //float to bits conversion utilities... 
+ union clvalue { + int32_t i; + float f; //must use a float type, else types dont match up + }; + + int32_t floatToIntBits(qreal value) + { + clvalue u; + int32_t e, f; + u.f = (float)value; + e = u.i & 0x7f800000; + f = u.i & 0x007fffff; + + if (e == 0x7f800000 && f != 0) + u.i = 0x7fc00000; + + return u.i; + } + + qreal intBitsToFloat(int32_t bits) + { + clvalue u; + u.i = bits; + return u.f; + } + + + qreal Similarity::byteToFloat(uint8_t b) { + if (b == 0) // zero is a special case + return 0.0f; + int32_t mantissa = b & 7; + int32_t exponent = (b >> 3) & 31; + int32_t bits = ((exponent+(63-15)) << 24) | (mantissa << 21); + return intBitsToFloat(bits); + } + + uint8_t Similarity::floatToByte(qreal f) { + if (f < 0.0f) // round negatives up to zero + f = 0.0f; + + if (f == 0.0f) // zero is a special case + return 0; + + int32_t bits = floatToIntBits(f); // parse qreal into parts + int32_t mantissa = (bits & 0xffffff) >> 21; + int32_t exponent = (((bits >> 24) & 0x7f) - 63) + 15; + + if (exponent > 31) { // overflow: use max value + exponent = 31; + mantissa = 7; + } + + if (exponent < 0) { // underflow: use min value + exponent = 0; + mantissa = 1; + } + + return (uint8_t)((exponent << 3) | mantissa); // pack into a uint8_t + } +#endif + + /** The Similarity implementation used by default. 
*/ + Similarity* _defaultImpl=NULL; + + void Similarity::setDefault(Similarity* similarity) { + _defaultImpl = similarity; + } + + Similarity* Similarity::getDefault() { + if ( _defaultImpl == NULL ){ + _defaultImpl = _CLNEW DefaultSimilarity(); + } + return _defaultImpl; + } + + qreal Similarity::decodeNorm(uint8_t b) { +#ifndef _CL_HAVE_NO_FLOAT_BYTE + if ( !NORM_TABLE_initd ){ + for (int i = 0; i < 256; i++) + NORM_TABLE[i] = byteToFloat(i); + NORM_TABLE_initd=true; + } +#endif + return NORM_TABLE[b]; + } + + uint8_t Similarity::encodeNorm(qreal f) { +#ifdef _CL_HAVE_NO_FLOAT_BYTE + int32_t i=0; + if ( f <= 0 ) + return 0; + + while ( i<256 && f > NORM_TABLE[i] ){ + i++; + } + if ( i == 0 ) + return 0; + else if ( i == 255 && f>NORM_TABLE[255] ) + return 255; + else + return i; +#else + return floatToByte(f); +#endif + } + + + qreal Similarity::idf(Term* term, Searcher* searcher) { + return idf(searcher->docFreq(term), searcher->maxDoc()); + } + + + qreal Similarity::idf(CL_NS(util)::CLVector<Term*>* terms, Searcher* searcher) { + qreal _idf = 0.0f; + for (CL_NS(util)::CLVector<Term*>::iterator i = terms->begin(); i != terms->end(); i++ ) { + _idf += idf((Term*)*i, searcher); + } + return _idf; + } + + Similarity::~Similarity(){ + } + + + + + DefaultSimilarity::DefaultSimilarity(){ + } + DefaultSimilarity::~DefaultSimilarity(){ + } + + qreal DefaultSimilarity::lengthNorm(const TCHAR* fieldName, int32_t numTerms) { + if ( numTerms == 0 ) //prevent div by zero + return 0; + qreal ret = (qreal)(1.0 / sqrt((qreal)numTerms)); + return ret; + } + + qreal DefaultSimilarity::queryNorm(qreal sumOfSquaredWeights) { + if ( sumOfSquaredWeights == 0 ) //prevent div by zero + return 0.0f; + qreal ret = (qreal)(1.0 / sqrt(sumOfSquaredWeights)); + return ret; + } + + qreal DefaultSimilarity::tf(qreal freq) { + return sqrt(freq); + } + + qreal DefaultSimilarity::sloppyFreq(int32_t distance) { + return 1.0f / (distance + 1); + } + + qreal DefaultSimilarity::idf(int32_t docFreq, 
int32_t numDocs) { + return (qreal)(log(numDocs/(qreal)(docFreq+1)) + 1.0); + } + + qreal DefaultSimilarity::coord(int32_t overlap, int32_t maxOverlap) { + if ( maxOverlap == 0 ) + return 0.0f; + return overlap / (qreal)maxOverlap; + } +CL_NS_END diff --git a/src/3rdparty/clucene/src/CLucene/search/Similarity.h b/src/3rdparty/clucene/src/CLucene/search/Similarity.h new file mode 100644 index 0000000..426af69 --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/Similarity.h @@ -0,0 +1,268 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. +------------------------------------------------------------------------------*/ +#ifndef _lucene_search_Similarity_ +#define _lucene_search_Similarity_ + +#if defined(_LUCENE_PRAGMA_ONCE) +# pragma once +#endif + +#include "CLucene/index/Term.h" + +CL_NS_DEF(search) + +class Searcher;//save including the searchheader.h +class DefaultSimilarity; + +/** Expert: Scoring API. +* <p>Subclasses implement search scoring. 
+* +* <p>The score of query <code>q</code> for document <code>d</code> is defined +* in terms of these methods as follows: +* +* <table cellpadding="0" cellspacing="0" border="0"> +* <tr> +* <td valign="middle" align="right" rowspan="2">score(q,d) =<br></td> +* <td valign="middle" align="center"> +* <big><big><big><big><big>Σ</big></big></big></big></big></td> +* <td valign="middle"><small> +* {@link #tf(int32_t) tf}(t in d) * +* {@link #idf(Term,Searcher) idf}(t) * +* {@link Field#getBoost getBoost}(t.field in d) * +* {@link #lengthNorm(TCHAR*,int32_t) lengthNorm}(t.field in d) +* </small></td> +* <td valign="middle" rowspan="2"> * +* {@link #coord(int32_t,int32_t) coord}(q,d) * +* {@link #queryNorm(qreal) queryNorm}(q) +* </td> +* </tr> +* <tr> +* <td valign="top" align="right"> +* <small>t in q</small> +* </td> +* </tr> +* </table> +* +* @see #setDefault(Similarity) +* @see IndexWriter#setSimilarity(Similarity) +* @see Searcher#setSimilarity(Similarity) +*/ +class Similarity:LUCENE_BASE { +public: + virtual ~Similarity(); + + /** Set the default Similarity implementation used by indexing and search + * code. + * + * @see Searcher#setSimilarity(Similarity) + * @see IndexWriter#setSimilarity(Similarity) + */ + static void setDefault(Similarity* similarity); + + /** Return the default Similarity implementation used by indexing and search + * code. + * + * <p>This is initially an instance of {@link DefaultSimilarity}. + * + * @see Searcher#setSimilarity(Similarity) + * @see IndexWriter#setSimilarity(Similarity) + */ + static Similarity* getDefault(); + + /** Encodes a normalization factor for storage in an index. + * + * <p>The encoding uses a five-bit exponent and three-bit mantissa, thus + * representing values from around 7x10^9 to 2x10^-9 with about one + * significant decimal digit of accuracy. Zero is also represented. + * Negative numbers are rounded up to zero. Values too large to represent + * are rounded down to the largest representable value. 
Positive values too + * small to represent are rounded up to the smallest positive representable + * value. + * + * @see Field#setBoost(qreal) + */ + static uint8_t encodeNorm(qreal f); + + /** Decodes a normalization factor stored in an index. + * @see #encodeNorm(qreal) + */ + static qreal decodeNorm(uint8_t b); + + static uint8_t floatToByte(qreal f); + static qreal byteToFloat(uint8_t b); + + /** Computes a score factor for a phrase. + * + * <p>The default implementation sums the {@link #idf(Term,Searcher)} factor + * for each term in the phrase. + * + * @param terms the terms in the phrase + * @param searcher the document collection being searched + * @return a score factor for the phrase + */ + qreal idf(CL_NS(util)::CLVector<CL_NS(index)::Term*>* terms, Searcher* searcher); + //qreal idf(Term** terms, Searcher* searcher); + + + /** Computes a score factor for a simple term. + * + * <p>The default implementation is:<pre> + * return idf(searcher.docFreq(term), searcher.maxDoc()); + * </pre> + * + * Note that {@link Searcher#maxDoc()} is used instead of + * {@link IndexReader#numDocs()} because it is proportional to + * {@link Searcher#docFreq(Term)} , i.e., when one is inaccurate, + * so is the other, and in the same direction. + * + * @param term the term in question + * @param searcher the document collection being searched + * @return a score factor for the term + */ + qreal idf(CL_NS(index)::Term* term, Searcher* searcher); + + + /** Computes a score factor based on a term or phrase's frequency in a + * document. This value is multiplied by the {@link #idf(Term, Searcher)} + * factor for each term in the query and these products are then summed to + * form the initial score for a document. + * + * <p>Terms and phrases repeated in a document indicate the topic of the + * document, so implementations of this method usually return larger values + * when <code>freq</code> is large, and smaller values when <code>freq</code> + * is small. 
+ * + * <p>The default implementation calls {@link #tf(qreal)}. + * + * @param freq the frequency of a term within a document + * @return a score factor based on a term's within-document frequency + */ + inline qreal tf(int32_t freq){ return tf((qreal)freq); } + + /** Computes the normalization value for a field given the total number of + * terms contained in a field. These values, together with field boosts, are + * stored in an index and multiplied into scores for hits on each field by the + * search code. + * + * <p>Matches in longer fields are less precise, so implementations of this + * method usually return smaller values when <code>numTokens</code> is large, + * and larger values when <code>numTokens</code> is small. + * + * <p>Note that these values are computed under {@link + * IndexWriter#addDocument(Document)} and then stored using + * {@link #encodeNorm(qreal)}. Thus they have limited precision, and documents + * must be re-indexed if this method is altered. + * + * @param fieldName the name of the field + * @param numTokens the total number of tokens contained in fields named + * <i>fieldName</i> of <i>doc</i>. + * @return a normalization factor for hits on this field of this document + * + * @see Field#setBoost(qreal) + */ + virtual qreal lengthNorm(const TCHAR* fieldName, int32_t numTokens) = 0; + + /** Computes the normalization value for a query given the sum of the squared + * weights of each of the query terms. This value is then multiplied into the + * weight of each query term. + * + * <p>This does not affect ranking, but rather just attempts to make scores + * from different queries comparable. + * + * @param sumOfSquaredWeights the sum of the squares of query term weights + * @return a normalization factor for query weights + */ + virtual qreal queryNorm(qreal sumOfSquaredWeights) = 0; + + /** Computes the amount of a sloppy phrase match, based on an edit distance. 
+ * This value is summed for each sloppy phrase match in a document to form + * the frequency that is passed to {@link #tf(qreal)}. + * + * <p>A phrase match with a small edit distance to a document passage more + * closely matches the document, so implementations of this method usually + * return larger values when the edit distance is small and smaller values + * when it is large. + * + * @see PhraseQuery#setSlop(int32_t) + * @param distance the edit distance of this sloppy phrase match + * @return the frequency increment for this match + */ + virtual qreal sloppyFreq(int32_t distance) = 0; + + /** Computes a score factor based on a term or phrase's frequency in a + * document. This value is multiplied by the {@link #idf(Term, Searcher)} + * factor for each term in the query and these products are then summed to + * form the initial score for a document. + * + * <p>Terms and phrases repeated in a document indicate the topic of the + * document, so implementations of this method usually return larger values + * when <code>freq</code> is large, and smaller values when <code>freq</code> + * is small. + * + * @param freq the frequency of a term within a document + * @return a score factor based on a term's within-document frequency + */ + virtual qreal tf(qreal freq) = 0; + + /** Computes a score factor based on a term's document frequency (the number + * of documents which contain the term). This value is multiplied by the + * {@link #tf(int32_t)} factor for each term in the query and these products are + * then summed to form the initial score for a document. + * + * <p>Terms that occur in fewer documents are better indicators of topic, so + * implementations of this method usually return larger values for rare terms, + * and smaller values for common terms. 
+ * + * @param docFreq the number of documents which contain the term + * @param numDocs the total number of documents in the collection + * @return a score factor based on the term's document frequency + */ + virtual qreal idf(int32_t docFreq, int32_t numDocs) = 0; + + /** Computes a score factor based on the fraction of all query terms that a + * document contains. This value is multiplied into scores. + * + * <p>The presence of a large portion of the query terms indicates a better + * match with the query, so implemenations of this method usually return + * larger values when the ratio between these parameters is large and smaller + * values when the ratio between them is small. + * + * @param overlap the number of query terms matched in the document + * @param maxOverlap the total number of terms in the query + * @return a score factor based on term overlap with the query + */ + virtual qreal coord(int32_t overlap, int32_t maxOverlap) = 0; +}; + + +/** Expert: Default scoring implementation. */ +class DefaultSimilarity: public Similarity { +public: + DefaultSimilarity(); + ~DefaultSimilarity(); + + /** Implemented as <code>1/sqrt(numTerms)</code>. */ + qreal lengthNorm(const TCHAR* fieldName, int32_t numTerms); + + /** Implemented as <code>1/sqrt(sumOfSquaredWeights)</code>. */ + qreal queryNorm(qreal sumOfSquaredWeights); + + /** Implemented as <code>sqrt(freq)</code>. */ + inline qreal tf(qreal freq); + + /** Implemented as <code>1 / (distance + 1)</code>. */ + qreal sloppyFreq(int32_t distance); + + /** Implemented as <code>log(numDocs/(docFreq+1)) + 1</code>. */ + qreal idf(int32_t docFreq, int32_t numDocs); + + /** Implemented as <code>overlap / maxOverlap</code>. 
*/ + qreal coord(int32_t overlap, int32_t maxOverlap); +}; + +CL_NS_END +#endif diff --git a/src/3rdparty/clucene/src/CLucene/search/SloppyPhraseScorer.cpp b/src/3rdparty/clucene/src/CLucene/search/SloppyPhraseScorer.cpp new file mode 100644 index 0000000..b7683b0 --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/SloppyPhraseScorer.cpp @@ -0,0 +1,106 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. +------------------------------------------------------------------------------*/ +#include "CLucene/StdHeader.h" +#include "SloppyPhraseScorer.h" + +#include "PhraseScorer.h" +#include "CLucene/index/Terms.h" + +CL_NS_USE(index) +CL_NS_DEF(search) + + SloppyPhraseScorer::SloppyPhraseScorer(Weight* weight, CL_NS(index)::TermPositions** tps, + int32_t* positions, Similarity* similarity, + int32_t slop, uint8_t* norms): + PhraseScorer(weight,tps,positions,similarity,norms){ + //Func - Constructor + //Pre - tps != NULL + // tpsLength >= 0 + // n != NULL + //Post - Instance has been created + + CND_PRECONDITION(tps != NULL, "tps is NULL"); + //CND_PRECONDITION(n != NULL, _T("n is NULL")) = checked in PhraseScorer; + + this->slop = slop; + } + + qreal SloppyPhraseScorer::phraseFreq() { + //Func - Returns the frequency of the phrase + //Pre - first != NULL + // last != NULL + // pq != NULL + //Post - The frequency of the phrase has been returned + + CND_PRECONDITION(first != NULL,"first is NULL"); + CND_PRECONDITION(last != NULL,"last is NULL"); + CND_PRECONDITION(pq != NULL,"pq is NULL"); + + //Clear the PhraseQueue pq; + pq->clear(); + + int32_t end = 0; + + //declare iterator + PhrasePositions* pp = NULL; + + // build pq from list + + //Sort the list of PhrasePositions using pq + for (pp = first; pp != NULL; pp 
= pp->_next) { + //Read the first TermPosition of the current PhrasePositions pp + pp->firstPosition(); + //Track the largest position seen so far in end + if (pp->position > end){ + end = pp->position; + } + //Store the current PhrasePositions pp into the PhraseQueue pq + pq->put(pp); + } + + qreal freq = 0.0f; + + bool done = false; + + do { + //Pop a PhrasePositions pp from the PhraseQueue pq + pp = pq->pop(); + //Get start position + int32_t start = pp->position; + //Get the position of the entry now at the top of pq + int32_t next = pq->top()->position; + + for (int32_t pos = start; pos <= next; pos = pp->position) { + //advance pp to min window + start = pos; + + if (!pp->nextPosition()) { + //ran out of a term -- done + done = true; + break; + } + } + + //Calculate matchLength + int32_t matchLength = end - start; + //Count the match only if matchLength is within slop (<=) + if (matchLength <= slop){ + // penalize longer matches: each match adds 1/(matchLength+1) + freq += 1.0 / (matchLength + 1); + } + + if (pp->position > end){ + end = pp->position; + } + + //restore pq + pq->put(pp); + }while (!done); + + return freq; + } +CL_NS_END diff --git a/src/3rdparty/clucene/src/CLucene/search/SloppyPhraseScorer.h b/src/3rdparty/clucene/src/CLucene/search/SloppyPhraseScorer.h new file mode 100644 index 0000000..31516e3 --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/SloppyPhraseScorer.h @@ -0,0 +1,34 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. 
+------------------------------------------------------------------------------*/ +#ifndef _lucene_search_SloppyPhraseScorer_ +#define _lucene_search_SloppyPhraseScorer_ +#if defined(_LUCENE_PRAGMA_ONCE) +# pragma once +#endif + +#include "PhraseScorer.h" +#include "CLucene/index/Terms.h" + + +CL_NS_DEF(search) + class SloppyPhraseScorer: public PhraseScorer { + private: + int32_t slop; + + public: + SloppyPhraseScorer(Weight* weight, CL_NS(index)::TermPositions** tps, + int32_t* positions, Similarity* similarity, + int32_t slop, uint8_t* norms); + ~SloppyPhraseScorer(){ + } + + protected: + qreal phraseFreq(); + }; +CL_NS_END +#endif + diff --git a/src/3rdparty/clucene/src/CLucene/search/Sort.cpp b/src/3rdparty/clucene/src/CLucene/search/Sort.cpp new file mode 100644 index 0000000..5a17a78 --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/Sort.cpp @@ -0,0 +1,345 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. +------------------------------------------------------------------------------*/ +#include "CLucene/StdHeader.h" +#include "Sort.h" +#include "Compare.h" + +CL_NS_USE(util) +CL_NS_DEF(search) + + + + /** Represents sorting by document score (relevancy). */ + SortField* SortField::FIELD_SCORE = _CLNEW SortField (NULL, DOCSCORE,false); + + /** Represents sorting by document number (index order). */ + SortField* SortField::FIELD_DOC = _CLNEW SortField (NULL, DOC,false); + + + /** Represents sorting by computed relevance. Using this sort criteria + * returns the same results as calling {@link Searcher#search(Query) Searcher#search()} + * without a sort criteria, only with slightly more overhead. */ + Sort* Sort::RELEVANCE = _CLNEW Sort(); + + /** Represents sorting by index order. 
*/ + Sort* Sort::INDEXORDER = _CLNEW Sort (SortField::FIELD_DOC); + + + + + /** Creates a sort by terms in the given field where the type of term value + * is determined dynamically ({@link #AUTO AUTO}). + * @param field Name of field to sort by, cannot be <code>null</code>. + */ + SortField::SortField (const TCHAR* field) { + this->type = AUTO; + this->reverse = false; + this->field = CLStringIntern::intern(field CL_FILELINE); + this->factory = NULL; + } + + /** Creates a sort, possibly in reverse, by terms in the given field where + * the type of term value is determined dynamically ({@link #AUTO AUTO}). + * @param field Name of field to sort by, cannot be <code>null</code>. + * @param reverse True if natural order should be reversed. + + SortField::SortField (const TCHAR* field, bool reverse) { + this->field = CLStringIntern::intern(field CL_FILELINE); + this->reverse = reverse; + this->type = AUTO; + this->factory = NULL; + }*/ + + + /** Creates a sort, possibly in reverse, by terms in the given field with the + * type of term values explicitly given. + * @param field Name of field to sort by. Can be <code>null</code> if + * <code>type</code> is SCORE or DOC. + * @param type Type of values in the terms. + * @param reverse True if natural order should be reversed (default=false). + */ + SortField::SortField (const TCHAR* field, int32_t type, bool reverse) { + this->field = (field != NULL) ? CLStringIntern::intern(field CL_FILELINE) : field; + this->type = type; + this->reverse = reverse; + this->factory = NULL; + } + + SortField::SortField(const SortField& clone){ + this->field = (clone.field != NULL) ? CLStringIntern::intern(clone.field CL_FILELINE) : clone.field; + this->type = clone.type; + this->reverse = clone.reverse; + this->factory = clone.factory; + } + SortField* SortField::clone() const{ + return _CLNEW SortField(*this); + } + + /** Creates a sort by terms in the given field sorted + * according to the given locale. 
+ * @param field Name of field to sort by, cannot be <code>null</code>. + * @param locale Locale of values in the field. + */ + /*SortField::SortField (TCHAR* field, Locale* locale) { + this->field = (field != NULL) ? CLStringIntern::intern(field): field; + this->type = STRING; + this->locale = locale; + }*/ + + /** Creates a sort, possibly in reverse, by terms in the given field sorted + * according to the given locale. + * @param field Name of field to sort by, cannot be <code>null</code>. + * @param locale Locale of values in the field. + */ + /*SortField::SortField (TCHAR* field, Locale* locale, bool reverse) { + this->field = (field != NULL) ? CLStringIntern::intern(field): field; + this->type = STRING; + this->locale = locale; + this->reverse = reverse; + }*/ + + + /** Creates a sort, possibly in reverse, with a custom comparison function. + * @param field Name of field to sort by; cannot be <code>null</code>. + * @param comparator Returns a comparator for sorting hits. + * @param reverse True if natural order should be reversed (default=false). + */ + SortField::SortField (const TCHAR* field, SortComparatorSource* comparator, bool reverse) { + this->field = (field != NULL) ? 
CLStringIntern::intern(field CL_FILELINE): field; + this->type = CUSTOM; + this->reverse = reverse; + this->factory = comparator; + } + + SortField::~SortField(){ + CLStringIntern::unintern(field); + } + + TCHAR* SortField::toString() const { + CL_NS(util)::StringBuffer buffer; + switch (type) { + case DOCSCORE: buffer.append(_T("<score>")); + break; + + case DOC: buffer.append(_T("<doc>")); + break; + + case CUSTOM: buffer.append (_T("<custom:\"")); + buffer.append( field ); + buffer.append( _T("\": ")); + buffer.append(factory->getName()); + buffer.append(_T(">")); + break; + + default: buffer.append( _T("\"")); + buffer.append( field ); + buffer.append( _T("\"") ); + break; + } + + //if (locale != null) buffer.append ("("+locale+")"); todo: + if (reverse) buffer.appendChar('!'); + + return buffer.toString(); + } + + + + + + + + + + + + + + /** Sorts by computed relevance. This is the same sort criteria as + * calling {@link Searcher#search(Query) Searcher#search()} without a sort criteria, only with + * slightly more overhead. */ + Sort::Sort() { + fields=NULL; + SortField** fields=_CL_NEWARRAY(SortField*,3); + fields[0]=SortField::FIELD_SCORE; + fields[1]=SortField::FIELD_DOC; + fields[2]=NULL; + setSort (fields); + _CLDELETE_ARRAY(fields); + } + + Sort::~Sort(){ + clear(); + } + void Sort::clear(){ + if ( fields != NULL ){ + int32_t i=0; + while ( fields[i] != NULL ){ + if ( fields[i] != SortField::FIELD_SCORE && + fields[i] != SortField::FIELD_DOC ){ + _CLDELETE(fields[i]); + } + i++; + } + _CLDELETE_ARRAY(fields); + } + } + + /** Sorts possibly in reverse by the terms in <code>field</code> then by + * index order (document number). The type of value in <code>field</code> is determined + * automatically. + * @see SortField#AUTO + */ + Sort::Sort (const TCHAR* field, bool reverse) { + this->fields=NULL; + setSort (field, reverse); + } + + + /** Sorts in succession by the terms in each field. 
+ * The type of value in <code>field</code> is determined + * automatically. + * @see SortField#AUTO + */ + Sort::Sort (const TCHAR** fields) { + this->fields=NULL; + setSort (fields); + } + + + /** Sorts by the criteria in the given SortField. */ + Sort::Sort (SortField* field) { + this->fields=NULL; + setSort (field); + } + + + /** Sorts in succession by the criteria in each SortField. */ + Sort::Sort (SortField** fields) { + this->fields=NULL; + setSort (fields); + } + + + /** Sets the sort to the terms in <code>field</code> possibly in reverse, + * then by index order (document number). */ + void Sort::setSort (const TCHAR* field, bool reverse) { + clear(); + fields = _CL_NEWARRAY(SortField*,3); + fields[0] = _CLNEW SortField (field, SortField::AUTO, reverse); + fields[1] = SortField::FIELD_DOC; + fields[2] = NULL; + } + + + /** Sets the sort to the terms in each field in succession. */ + void Sort::setSort (const TCHAR** fieldnames) { + clear(); + + int32_t n = 0; + while ( fieldnames[n] != NULL ) + n++; + + fields = _CL_NEWARRAY(SortField*,n+1); + for (int32_t i = 0; i < n; ++i) { + fields[i] = _CLNEW SortField (fieldnames[i], SortField::AUTO,false); + } + fields[n]=NULL; + } + + + /** Sets the sort to the given criteria. */ + void Sort::setSort (SortField* field) { + clear(); + + this->fields = _CL_NEWARRAY(SortField*,2); + this->fields[0] = field; + this->fields[1] = NULL; + } + + + /** Sets the sort to the given criteria in succession. 
*/ + void Sort::setSort (SortField** fields) { + clear(); + + int n=0; + while ( fields[n] != NULL ) + n++; + this->fields = _CL_NEWARRAY(SortField*,n+1); + for (int i=0;i<n+1;i++) + this->fields[i]=fields[i]; + } + + TCHAR* Sort::toString() const { + CL_NS(util)::StringBuffer buffer; + + int32_t i = 0; + while ( fields[i] != NULL ){ + if (i>0) + buffer.appendChar(','); + + const TCHAR* p = fields[i]->toString(); + buffer.append(p); + _CLDELETE_CARRAY(p); + + i++; + } + + return buffer.toString(); + } + + + + + + ScoreDocComparator* ScoreDocComparator::INDEXORDER = _CLNEW ScoreDocComparators::IndexOrder; + ScoreDocComparator* ScoreDocComparator::RELEVANCE = _CLNEW ScoreDocComparators::Relevance; + + ScoreDocComparator::~ScoreDocComparator(){ + } + + +class ScoreDocComparatorImpl: public ScoreDocComparator{ + Comparable** cachedValues; + FieldCacheAuto* fca; + int32_t cachedValuesLen; +public: + ScoreDocComparatorImpl(FieldCacheAuto* fca){ + this->fca = fca; + if ( fca->contentType != FieldCacheAuto::COMPARABLE_ARRAY ) + _CLTHROWA(CL_ERR_InvalidCast,"Invalid field cache auto type"); + this->cachedValues = fca->comparableArray; + this->cachedValuesLen = fca->contentLen; + } + ~ScoreDocComparatorImpl(){ + } + int32_t compare (struct ScoreDoc* i, struct ScoreDoc* j){ + CND_PRECONDITION(i->doc >= 0 && i->doc < cachedValuesLen, "i->doc out of range") + CND_PRECONDITION(j->doc >= 0 && j->doc < cachedValuesLen, "j->doc out of range") + return cachedValues[i->doc]->compareTo (cachedValues[j->doc]); + } + + CL_NS(util)::Comparable* sortValue (struct ScoreDoc* i){ + CND_PRECONDITION(i->doc >= 0 && i->doc < cachedValuesLen, "i->doc out of range") + return cachedValues[i->doc]; + } + + int32_t sortType(){ + return SortField::CUSTOM; + } +}; + +ScoreDocComparator* SortComparator::newComparator (CL_NS(index)::IndexReader* reader, const TCHAR* fieldname){ + return _CLNEW ScoreDocComparatorImpl(FieldCache::DEFAULT->getCustom (reader, fieldname, this)); +} 
+SortComparator::SortComparator(){ +} +SortComparator::~SortComparator(){ +} + + +CL_NS_END diff --git a/src/3rdparty/clucene/src/CLucene/search/Sort.h b/src/3rdparty/clucene/src/CLucene/search/Sort.h new file mode 100644 index 0000000..cfe96d5 --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/Sort.h @@ -0,0 +1,356 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. +------------------------------------------------------------------------------*/ +#ifndef _lucene_search_Sort_ +#define _lucene_search_Sort_ + +#if defined(_LUCENE_PRAGMA_ONCE) +# pragma once +#endif + +#include "CLucene/index/IndexReader.h" +#include "SearchHeader.h" + +CL_NS_DEF(search) + + class SortField; //predefine + class Sort; + +/** + * Expert: Compares two ScoreDoc objects for sorting. + * + */ + class ScoreDocComparator:LUCENE_BASE { + protected: + ScoreDocComparator(){} + public: + virtual ~ScoreDocComparator(); +// CL_NS(util)::Comparable** cachedValues; +// ScoreDocComparator(CL_NS(util)::Comparable** cachedValues); + + /** + * Compares two ScoreDoc objects and returns a result indicating their + * sort order. + * @param i First ScoreDoc + * @param j Second ScoreDoc + * @return <code>-1</code> if <code>i</code> should come before <code>j</code><br><code>1</code> if <code>i</code> should come after <code>j</code><br><code>0</code> if they are equal + * @see java.util.Comparator + */ + virtual int32_t compare (struct ScoreDoc* i, struct ScoreDoc* j) = 0; + + /** + * Returns the value used to sort the given document. The + * object returned must implement the java.io.Serializable + * interface. This is used by multisearchers to determine how to collate results from their searchers. 
+ * @see FieldDoc + * @param i Document + * @return Serializable object + */ + virtual CL_NS(util)::Comparable* sortValue (struct ScoreDoc* i) = 0; + + + /** + * Returns the type of sort. Should return <code>SortField.SCORE</code>, <code>SortField.DOC</code>, <code>SortField.STRING</code>, <code>SortField.INTEGER</code>, + * <code>SortField::FLOAT</code> or <code>SortField.CUSTOM</code>. It is not valid to return <code>SortField.AUTO</code>. + * This is used by multisearchers to determine how to collate results from their searchers. + * @return One of the constants in SortField. + * @see SortField + */ + virtual int32_t sortType() = 0; + + /** Special comparator for sorting hits according to computed relevance (document score). */ + static ScoreDocComparator* RELEVANCE; + + /** Special comparator for sorting hits according to index order (document number). */ + static ScoreDocComparator* INDEXORDER; + }; + +/** + * Expert: returns a comparator for sorting ScoreDocs. + * + */ +class SortComparatorSource:LUCENE_BASE { +public: + virtual ~SortComparatorSource(){ + } + + /** + * return a reference to a string describing the name of the comparator + * this is used in the explanation + */ + virtual TCHAR* getName() = 0; + + virtual size_t hashCode() = 0; + + /** + * Creates a comparator for the field in the given index. + * @param reader Index to create comparator for. + * @param fieldname Field to create comparator for. + * @return Comparator of ScoreDoc objects. + * @throws IOException If an error occurs reading the index. + */ + virtual ScoreDocComparator* newComparator (CL_NS(index)::IndexReader* reader, const TCHAR* fieldname) = 0; +}; + + +/** + * Abstract base class for sorting hits returned by a Query. + * + * <p>This class should only be used if the other SortField + * types (SCORE, DOC, STRING, INT, FLOAT) do not provide an + * adequate sorting. It maintains an internal cache of values which + * could be quite large. 
The cache is an array of Comparable, + * one for each document in the index. There is a distinct + * Comparable for each unique term in the field - if + * some documents have the same term in the field, the cache + * array will have entries which reference the same Comparable. + * + */ +class SortComparator: public SortComparatorSource { +public: + virtual ScoreDocComparator* newComparator (CL_NS(index)::IndexReader* reader, const TCHAR* fieldname); + + SortComparator(); + virtual ~SortComparator(); + + /** + * Returns an object which, when sorted according to natural order, + * will order the Term values in the correct order. + * <p>For example, if the Terms contained integer values, this method + * would return <code>new Integer(termtext)</code>. Note that this + * might not always be the most efficient implementation - for this + * particular example, a better implementation might be to make a + * ScoreDocLookupComparator that uses an internal lookup table of int. + * @param termtext The textual value of the term. + * @return An object representing <code>termtext</code> that sorts + * according to the natural order of <code>termtext</code>. + * @see Comparable + * @see ScoreDocComparator + */ + virtual CL_NS(util)::Comparable* getComparable (const TCHAR* termtext) = 0; + +}; + + +/** + * Stores information about how to sort documents by terms in an individual + * field. Fields must be indexed in order to sort by them. + * + */ +class SortField:LUCENE_BASE { +private: + const TCHAR* field; + int32_t type; // defaults to determining type dynamically + //Locale* locale; // defaults to "natural order" (no Locale) + bool reverse; // defaults to natural order + SortComparatorSource* factory; + +protected: + SortField (const SortField& clone); +public: + virtual ~SortField(); + + /** Sort by document score (relevancy). Sort values are Float and higher + * values are at the front. 
+ * PORTING: this is the same as SCORE in java, it had to be renamed because + * SCORE is a system macro on some platforms (AIX). + */ + LUCENE_STATIC_CONSTANT(int32_t, DOCSCORE=0); + + /** Sort by document number (index order). Sort values are Integer and lower + * values are at the front. */ + LUCENE_STATIC_CONSTANT(int32_t, DOC=1); + + /** Guess type of sort based on field contents. A regular expression is used + * to look at the first term indexed for the field and determine if it + * represents an integer number, a floating point number, or just arbitrary + * string characters. */ + LUCENE_STATIC_CONSTANT(int32_t, AUTO=2); + + /** Sort using term values as Strings. Sort values are String and lower + * values are at the front. */ + LUCENE_STATIC_CONSTANT(int32_t, STRING=3); + + /** Sort using term values as encoded Integers. Sort values are Integer and + * lower values are at the front. */ + LUCENE_STATIC_CONSTANT(int32_t, INT=4); + + /** Sort using term values as encoded Floats. Sort values are Float and + * lower values are at the front. */ + LUCENE_STATIC_CONSTANT(int32_t, FLOAT=5); + + /** Sort using a custom Comparator. Sort values are any Comparable and + * sorting is done according to natural order. */ + LUCENE_STATIC_CONSTANT(int32_t, CUSTOM=9); + + // IMPLEMENTATION NOTE: the FieldCache.STRING_INDEX is in the same "namespace" + // as the above static int values. Any new values must not have the same value + // as FieldCache.STRING_INDEX. + + /** Represents sorting by document score (relevancy). */ + static SortField* FIELD_SCORE; + + /** Represents sorting by document number (index order). */ + static SortField* FIELD_DOC; + + SortField (const TCHAR* field); + //SortField (const TCHAR* field, bool reverse); + //todo: we cannot make reverse use default field of =false. 
+ //because bool and int are the same type in c, overloading is not possible + SortField (const TCHAR* field, int32_t type, bool reverse); + + /* + SortField (TCHAR* field, Locale* locale) { + SortField (TCHAR* field, Locale* locale, bool reverse);*/ + + SortField (const TCHAR* field, SortComparatorSource* comparator, bool reverse=false); + + /** Returns the name of the field. Could return <code>null</code> + * if the sort is by SCORE or DOC. + * @return Name of field, possibly <code>null</code>. + */ + const TCHAR* getField() const { return field; } + + SortField* clone() const; + + /** Returns the type of contents in the field. + * @return One of the constants SCORE, DOC, AUTO, STRING, INT or FLOAT. + */ + int32_t getType() const { return type; } + + /** Returns the Locale by which term values are interpreted. + * May return <code>null</code> if no Locale was specified. + * @return Locale, or <code>null</code>. + */ + /*Locale getLocale() { + return locale; + }*/ + + /** Returns whether the sort should be reversed. + * @return True if natural order should be reversed. + */ + bool getReverse() const { return reverse; } + + SortComparatorSource* getFactory() const { return factory; } + + TCHAR* toString() const; +}; + + + +/** + * Encapsulates sort criteria for returned hits. + * + * <p>The fields used to determine sort order must be carefully chosen. + * Documents must contain a single term in such a field, + * and the value of the term should indicate the document's relative position in + * a given sort order. The field must be indexed, but should not be tokenized, + * and does not need to be stored (unless you happen to want it back with the + * rest of your document data). 
In other words: + * + * <dl><dd><code>document.add (new Field ("byNumber", Integer.toString(x), false, true, false));</code> + * </dd></dl> + * + * <p><h3>Valid Types of Values</h3> + * + * <p>There are three possible kinds of term values which may be put into + * sorting fields: Integers, Floats, or Strings. Unless + * {@link SortField SortField} objects are specified, the type of value + * in the field is determined by parsing the first term in the field. + * + * <p>Integer term values should contain only digits and an optional + * preceding negative sign. Values must be base 10 and in the range + * <code>Integer.MIN_VALUE</code> and <code>Integer.MAX_VALUE</code> inclusive. + * Documents which should appear first in the sort + * should have low value integers, later documents high values + * (i.e. the documents should be numbered <code>1..n</code> where + * <code>1</code> is the first and <code>n</code> the last). + * + * <p>Float term values should conform to values accepted by + * {@link Float Float.valueOf(String)} (except that <code>NaN</code> + * and <code>Infinity</code> are not supported). + * Documents which should appear first in the sort + * should have low values, later documents high values. + * + * <p>String term values can contain any valid String, but should + * not be tokenized. The values are sorted according to their + * {@link Comparable natural order}. Note that using this type + * of term value has higher memory requirements than the other + * two types. + * + * <p><h3>Object Reuse</h3> + * + * <p>One of these objects can be + * used multiple times and the sort order changed between usages. + * + * <p>This class is thread safe. + * + * <p><h3>Memory Usage</h3> + * + * <p>Sorting uses caches of term values maintained by the + * internal HitQueue(s). The cache is static and contains an integer + * or float array of length <code>IndexReader.maxDoc()</code> for each field + * name for which a sort is performed.
In other words, the size of the + * cache in bytes is: + * + * <p><code>4 * IndexReader.maxDoc() * (# of different fields actually used to sort)</code> + * + * <p>For String fields, the cache is larger: in addition to the + * above array, the value of every term in the field is kept in memory. + * If there are many unique terms in the field, this could + * be quite large. + * + * <p>Note that the size of the cache is not affected by how many + * fields are in the index and <i>might</i> be used to sort - only by + * the ones actually used to sort a result set. + * + * <p>The cache is cleared each time a new <code>IndexReader</code> is + * passed in, or if the value returned by <code>maxDoc()</code> + * changes for the current IndexReader. This class is not set up to + * be able to efficiently sort hits from more than one index + * simultaneously. + * + */ +class Sort:LUCENE_BASE { + // internal representation of the sort criteria + SortField** fields; + void clear(); +public: + ~Sort(); + + /** Represents sorting by computed relevance. Using this sort criteria + * returns the same results as calling {@link Searcher#search(Query) Searcher#search()} + * without a sort criteria, only with slightly more overhead. */ + static Sort* RELEVANCE; + + /** Represents sorting by index order. */ + static Sort* INDEXORDER; + + Sort(); + Sort (const TCHAR* field, bool reverse=false); + Sort (const TCHAR** fields); + Sort (SortField* field); + Sort (SortField** fields); + void setSort (const TCHAR* field, bool reverse=false); + void setSort (const TCHAR** fieldnames); + void setSort (SortField* field); + void setSort (SortField** fields); + + TCHAR* toString() const; + + /** + * Representation of the sort criteria. 
+ * @return a pointer to the of SortField array used in this sort criteria + */ + SortField** getSort() const{ return fields; } +}; + + + + + +CL_NS_END +#endif diff --git a/src/3rdparty/clucene/src/CLucene/search/TermQuery.cpp b/src/3rdparty/clucene/src/CLucene/search/TermQuery.cpp new file mode 100644 index 0000000..a04c20f --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/TermQuery.cpp @@ -0,0 +1,213 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. +------------------------------------------------------------------------------*/ +#include "CLucene/StdHeader.h" +#include "TermQuery.h" + +#include "SearchHeader.h" +#include "Scorer.h" +#include "CLucene/index/Term.h" +#include "TermScorer.h" +#include "CLucene/index/IndexReader.h" +#include "CLucene/util/StringBuffer.h" +#include "CLucene/index/Terms.h" + +CL_NS_USE(index) +CL_NS_DEF(search) + + + /** Constructs a query for the term <code>t</code>. */ + TermQuery::TermQuery(Term* t): + term( _CL_POINTER(t) ) + { + } + TermQuery::TermQuery(const TermQuery& clone): + Query(clone){ + this->term=_CL_POINTER(clone.term); + } + TermQuery::~TermQuery(){ + _CLDECDELETE(term); + } + + Query* TermQuery::clone() const{ + return _CLNEW TermQuery(*this); + } + + const TCHAR* TermQuery::getClassName(){ + return _T("TermQuery"); + } + const TCHAR* TermQuery::getQueryName() const{ + return getClassName(); + } + size_t TermQuery::hashCode() const { + return Similarity::floatToByte(getBoost()) ^ term->hashCode(); + } + + + //added by search highlighter + Term* TermQuery::getTerm(bool pointer) const + { + if ( pointer ) + return _CL_POINTER(term); + else + return term; + } + + + /** Prints a user-readable version of this query. 
*/ + TCHAR* TermQuery::toString(const TCHAR* field) const{ + CL_NS(util)::StringBuffer buffer; + if ( field==NULL || _tcscmp(term->field(),field)!= 0 ) { + buffer.append(term->field()); + buffer.append(_T(":")); + } + buffer.append(term->text()); + if (getBoost() != 1.0f) { + buffer.append(_T("^")); + buffer.appendFloat( getBoost(),1 ); + } + return buffer.toString(); + } + + /** Returns true iff <code>o</code> is equal to this. */ + bool TermQuery::equals(Query* other) const { + if (!(other->instanceOf(TermQuery::getClassName()))) + return false; + + TermQuery* tq = (TermQuery*)other; + return (this->getBoost() == tq->getBoost()) + && this->term->equals(tq->term); + } + + + TermQuery::TermWeight::TermWeight(Searcher* searcher, TermQuery* _this, Term* _term) { + this->_this = _this; + this->_term = _term; + this->searcher = searcher; + value=0; + idf=0; + queryNorm=0; + queryWeight=0; + } + TermQuery::TermWeight::~TermWeight(){ + } + + //return a *new* string describing this object + TCHAR* TermQuery::TermWeight::toString() { + int32_t size=_tcslen(_this->getQueryName()) + 10; + TCHAR* tmp = _CL_NEWARRAY(TCHAR, size);//_tcslen(weight()) + _sntprintf(tmp,size,_T("weight(%s)"),_this->getQueryName()); + return tmp; + } + + qreal TermQuery::TermWeight::sumOfSquaredWeights() { + idf = _this->getSimilarity(searcher)->idf(_term, searcher); // compute idf + queryWeight = idf * _this->getBoost(); // compute query weight + return queryWeight * queryWeight; // square it + } + + void TermQuery::TermWeight::normalize(qreal queryNorm) { + this->queryNorm = queryNorm; + queryWeight *= queryNorm; // normalize query weight + value = queryWeight * idf; // idf for document + } + + Scorer* TermQuery::TermWeight::scorer(IndexReader* reader) { + TermDocs* termDocs = reader->termDocs(_term); + + if (termDocs == NULL) + return NULL; + + return _CLNEW TermScorer(this, termDocs, _this->getSimilarity(searcher), + reader->norms(_term->field())); + } + + void 
TermQuery::TermWeight::explain(IndexReader* reader, int32_t doc, Explanation* result){ + TCHAR buf[LUCENE_SEARCH_EXPLANATION_DESC_LEN]; + TCHAR* tmp; + + tmp = getQuery()->toString(); + _sntprintf(buf,LUCENE_SEARCH_EXPLANATION_DESC_LEN, + _T("weight(%s in %d), product of:"),tmp,doc); + _CLDELETE_CARRAY(tmp); + result->setDescription(buf); + + _sntprintf(buf,LUCENE_SEARCH_EXPLANATION_DESC_LEN, + _T("idf(docFreq=%d)"), searcher->docFreq(_term) ); + Explanation* idfExpl = _CLNEW Explanation(idf, buf); + + // explain query weight + Explanation* queryExpl = _CLNEW Explanation(); + tmp = getQuery()->toString(); + _sntprintf(buf,LUCENE_SEARCH_EXPLANATION_DESC_LEN, + _T("queryWeight(%s), product of:"), tmp); + _CLDELETE_CARRAY(tmp); + queryExpl->setDescription(buf); + + Explanation* boostExpl = _CLNEW Explanation(_this->getBoost(), _T("boost")); + if (_this->getBoost() != 1.0f) + queryExpl->addDetail(boostExpl); + else + _CLDELETE(boostExpl); + + queryExpl->addDetail(idfExpl->clone()); + + Explanation* queryNormExpl = _CLNEW Explanation(queryNorm,_T("queryNorm")); + queryExpl->addDetail(queryNormExpl); + + queryExpl->setValue(_this->getBoost()* // always 1.0 + idfExpl->getValue() * + queryNormExpl->getValue()); + + // explain field weight + const TCHAR* field = _term->field(); + Explanation* fieldExpl = _CLNEW Explanation(); + + tmp = _term->toString(); + _sntprintf(buf,LUCENE_SEARCH_EXPLANATION_DESC_LEN, + _T("fieldWeight(%s in %d), product of:"),tmp,doc); + _CLDELETE_CARRAY(tmp); + fieldExpl->setDescription(buf); + + Scorer* sc = scorer(reader); + Explanation* tfExpl = _CLNEW Explanation; + sc->explain(doc, tfExpl); + _CLDELETE(sc); + fieldExpl->addDetail(tfExpl); + fieldExpl->addDetail(idfExpl); + + Explanation* fieldNormExpl = _CLNEW Explanation(); + uint8_t* fieldNorms = reader->norms(field); + qreal fieldNorm = + fieldNorms!=NULL ? 
Similarity::decodeNorm(fieldNorms[doc]) : 0.0f; + fieldNormExpl->setValue(fieldNorm); + + _sntprintf(buf,LUCENE_SEARCH_EXPLANATION_DESC_LEN, + _T("fieldNorm(field=%s, doc=%d)"),field,doc); + fieldNormExpl->setDescription(buf); + fieldExpl->addDetail(fieldNormExpl); + + fieldExpl->setValue(tfExpl->getValue() * + idfExpl->getValue() * + fieldNormExpl->getValue()); + + /*if (queryExpl->getValue() == 1.0f){ + _CLDELETE(result); + return fieldExpl; + }else{*/ + result->addDetail(queryExpl); + result->addDetail(fieldExpl); + + // combine them + result->setValue(queryExpl->getValue() * fieldExpl->getValue()); + //} + } + + Weight* TermQuery::_createWeight(Searcher* searcher) { + return _CLNEW TermWeight(searcher,this,term); + } +CL_NS_END + diff --git a/src/3rdparty/clucene/src/CLucene/search/TermQuery.h b/src/3rdparty/clucene/src/CLucene/search/TermQuery.h new file mode 100644 index 0000000..a7dd803 --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/TermQuery.h @@ -0,0 +1,81 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. +------------------------------------------------------------------------------*/ +#ifndef _lucene_search_TermQuery_ +#define _lucene_search_TermQuery_ + +#if defined(_LUCENE_PRAGMA_ONCE) +# pragma once +#endif + +#include "SearchHeader.h" +#include "Scorer.h" +#include "CLucene/index/Term.h" +#include "TermScorer.h" +#include "CLucene/index/IndexReader.h" +#include "CLucene/util/StringBuffer.h" +#include "CLucene/index/Terms.h" + +CL_NS_DEF(search) + + + /** A Query that matches documents containing a term. + This may be combined with other terms with a {@link BooleanQuery}. 
+ */ + class TermQuery: public Query { + private: + CL_NS(index)::Term* term; + + + class TermWeight: public Weight { + private: + Searcher* searcher; + qreal value; + qreal idf; + qreal queryNorm; + qreal queryWeight; + TermQuery* _this; + CL_NS(index)::Term* _term; + + public: + TermWeight(Searcher* searcher, TermQuery* _this, CL_NS(index)::Term* _term); + ~TermWeight(); + TCHAR* toString(); + Query* getQuery() { return (Query*)_this; } + qreal getValue() { return value; } + + qreal sumOfSquaredWeights(); + void normalize(qreal queryNorm); + Scorer* scorer(CL_NS(index)::IndexReader* reader); + void explain(CL_NS(index)::IndexReader* reader, int32_t doc, Explanation* ret); + }; + + protected: + Weight* _createWeight(Searcher* searcher); + TermQuery(const TermQuery& clone); + public: + // Constructs a query for the term <code>t</code>. + TermQuery(CL_NS(index)::Term* t); + ~TermQuery(); + + static const TCHAR* getClassName(); + const TCHAR* getQueryName() const; + + //added by search highlighter + CL_NS(index)::Term* getTerm(bool pointer=true) const; + + // Prints a user-readable version of this query. + TCHAR* toString(const TCHAR* field) const; + + bool equals(Query* other) const; + Query* clone() const; + + /** Returns a hash code value for this object.*/ + size_t hashCode() const; + }; +CL_NS_END +#endif + diff --git a/src/3rdparty/clucene/src/CLucene/search/TermScorer.cpp b/src/3rdparty/clucene/src/CLucene/search/TermScorer.cpp new file mode 100644 index 0000000..ddd7f74 --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/TermScorer.cpp @@ -0,0 +1,120 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. 
+------------------------------------------------------------------------------*/ +#include "CLucene/StdHeader.h" +#include "TermScorer.h" + +#include "CLucene/index/Terms.h" +#include "TermQuery.h" + +CL_NS_USE(index) +CL_NS_DEF(search) + + //TermScorer takes TermDocs and delets it when TermScorer is cleaned up + TermScorer::TermScorer(Weight* w, CL_NS(index)::TermDocs* td, + Similarity* similarity,uint8_t* _norms): + Scorer(similarity), + termDocs(td), + norms(_norms), + weight(w), + weightValue(w->getValue()), + _doc(0), + pointer(0), + pointerMax(0) + { + memset(docs,0,32*sizeof(int32_t)); + memset(freqs,0,32*sizeof(int32_t)); + + for (int32_t i = 0; i < LUCENE_SCORE_CACHE_SIZE; i++) + scoreCache[i] = getSimilarity()->tf(i) * weightValue; + } + + TermScorer::~TermScorer(){ + _CLDELETE(termDocs); + } + bool TermScorer::next(){ + pointer++; + if (pointer >= pointerMax) { + pointerMax = termDocs->read(docs, freqs, 32); // refill buffer + if (pointerMax != 0) { + pointer = 0; + } else { + termDocs->close(); // close stream + _doc = LUCENE_INT32_MAX_SHOULDBE; // set to sentinel value + return false; + } + } + _doc = docs[pointer]; + return true; + } + + bool TermScorer::skipTo(int32_t target) { + // first scan in cache + for (pointer++; pointer < pointerMax; pointer++) { + if (docs[pointer] >= target) { + _doc = docs[pointer]; + return true; + } + } + + // not found in cache, seek underlying stream + bool result = termDocs->skipTo(target); + if (result) { + pointerMax = 1; + pointer = 0; + docs[pointer] = _doc = termDocs->doc(); + freqs[pointer] = termDocs->freq(); + } else { + _doc = LUCENE_INT32_MAX_SHOULDBE; + } + return result; + } + + void TermScorer::explain(int32_t doc, Explanation* tfExplanation) { + TermQuery* query = (TermQuery*)weight->getQuery(); + int32_t tf = 0; + while (pointer < pointerMax) { + if (docs[pointer] == doc) + tf = freqs[pointer]; + pointer++; + } + if (tf == 0) { + while (termDocs->next()) { + if (termDocs->doc() == doc) { + tf = 
termDocs->freq(); + } + } + } + termDocs->close(); + tfExplanation->setValue(getSimilarity()->tf(tf)); + + TCHAR buf[LUCENE_SEARCH_EXPLANATION_DESC_LEN+1]; + TCHAR* termToString = query->getTerm(false)->toString(); + _sntprintf(buf,LUCENE_SEARCH_EXPLANATION_DESC_LEN,_T("tf(termFreq(%s)=%d)"), termToString, tf); + _CLDELETE_CARRAY(termToString); + tfExplanation->setDescription(buf); + } + + TCHAR* TermScorer::toString() { + TCHAR* wb = weight->toString(); + int32_t rl = _tcslen(wb) + 9; //9=_tcslen("scorer(" ")") + 1 + TCHAR* ret = _CL_NEWARRAY(TCHAR,rl); + _sntprintf(ret,rl,_T("scorer(%s)"), wb); + _CLDELETE_ARRAY(wb); + return ret; + } + + qreal TermScorer::score(){ + int32_t f = freqs[pointer]; + qreal raw = // compute tf(f)*weight + f < LUCENE_SCORE_CACHE_SIZE // check cache + ? scoreCache[f] // cache hit + : getSimilarity()->tf(f) * weightValue; // cache miss + + return raw * Similarity::decodeNorm(norms[_doc]); // normalize for field + } + +CL_NS_END diff --git a/src/3rdparty/clucene/src/CLucene/search/TermScorer.h b/src/3rdparty/clucene/src/CLucene/search/TermScorer.h new file mode 100644 index 0000000..ccbf5f7 --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/TermScorer.h @@ -0,0 +1,53 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. 
+------------------------------------------------------------------------------*/ +#ifndef _lucene_search_TermScorer_ +#define _lucene_search_TermScorer_ + +#if defined(_LUCENE_PRAGMA_ONCE) +# pragma once +#endif + +#include "Scorer.h" +#include "CLucene/index/Terms.h" +#include "CLucene/search/Similarity.h" +#include "SearchHeader.h" + +CL_NS_DEF(search) + + class TermScorer: public Scorer { + private: + CL_NS(index)::TermDocs* termDocs; + uint8_t* norms; + Weight* weight; + const qreal weightValue; + int32_t _doc; + + int32_t docs[32]; // buffered doc numbers + int32_t freqs[32]; // buffered term freqs + int32_t pointer; + int32_t pointerMax; + + qreal scoreCache[LUCENE_SCORE_CACHE_SIZE]; + public: + + //TermScorer takes TermDocs and delets it when TermScorer is cleaned up + TermScorer(Weight* weight, CL_NS(index)::TermDocs* td, + Similarity* similarity, uint8_t* _norms); + + ~TermScorer(); + + int32_t doc() const { return _doc; } + + bool next(); + bool skipTo(int32_t target); + void explain(int32_t doc, Explanation* ret); + TCHAR* toString(); + + qreal score(); + }; +CL_NS_END +#endif diff --git a/src/3rdparty/clucene/src/CLucene/search/WildcardQuery.cpp b/src/3rdparty/clucene/src/CLucene/search/WildcardQuery.cpp new file mode 100644 index 0000000..9373cef --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/WildcardQuery.cpp @@ -0,0 +1,147 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. 
+------------------------------------------------------------------------------*/ +#include "CLucene/StdHeader.h" +#include "WildcardQuery.h" +#include "CLucene/util/BitSet.h" + +CL_NS_USE(index) +CL_NS_USE(util) +CL_NS_DEF(search) + + + WildcardQuery::WildcardQuery(Term* term): + MultiTermQuery( term ){ + //Func - Constructor + //Pre - term != NULL + //Post - Instance has been created + + } + + WildcardQuery::~WildcardQuery(){ + //Func - Destructor + //Pre - true + //Post - true + + } + + const TCHAR* WildcardQuery::getQueryName() const{ + //Func - Returns the string "WildcardQuery" + //Pre - true + //Post - The string "WildcardQuery" has been returned + return getClassName(); + } + + const TCHAR* WildcardQuery::getClassName(){ + return _T("WildcardQuery"); + } + + + FilteredTermEnum* WildcardQuery::getEnum(IndexReader* reader) { + return _CLNEW WildcardTermEnum(reader, getTerm(false)); + } + + WildcardQuery::WildcardQuery(const WildcardQuery& clone): + MultiTermQuery(clone) + { + } + + Query* WildcardQuery::clone() const{ + return _CLNEW WildcardQuery(*this); + } + size_t WildcardQuery::hashCode() const{ + //todo: we should give the query a seeding value... 
but + //need to do it for all hascode functions + return Similarity::floatToByte(getBoost()) ^ getTerm()->hashCode(); + } + bool WildcardQuery::equals(Query* other) const{ + if (!(other->instanceOf(WildcardQuery::getClassName()))) + return false; + + WildcardQuery* tq = (WildcardQuery*)other; + return (this->getBoost() == tq->getBoost()) + && getTerm()->equals(tq->getTerm()); + } + + + + + + + + + + + + +WildcardFilter::WildcardFilter( Term* term ) +{ + this->term = _CL_POINTER(term); +} + +WildcardFilter::~WildcardFilter() +{ + _CLDECDELETE(term); +} + +WildcardFilter::WildcardFilter( const WildcardFilter& copy ) : + term( _CL_POINTER(copy.term) ) +{ +} + +Filter* WildcardFilter::clone() const { + return _CLNEW WildcardFilter(*this ); +} + + +TCHAR* WildcardFilter::toString() +{ + //Instantiate a stringbuffer buffer to store the readable version temporarily + CL_NS(util)::StringBuffer buffer; + //check if field equal to the field of prefix + if( term->field() != NULL ) { + //Append the field of prefix to the buffer + buffer.append(term->field()); + //Append a colon + buffer.append(_T(":") ); + } + //Append the text of the prefix + buffer.append(term->text()); + + //Convert StringBuffer buffer to TCHAR block and return it + return buffer.toString(); +} + + +/** Returns a BitSet with true for documents which should be permitted in +search results, and false for those that should not. 
*/ +BitSet* WildcardFilter::bits( IndexReader* reader ) +{ + BitSet* bts = _CLNEW BitSet( reader->maxDoc() ); + + WildcardTermEnum termEnum (reader, term); + if (termEnum.term(false) == NULL) + return bts; + + TermDocs* termDocs = reader->termDocs(); + try{ + do{ + termDocs->seek(&termEnum); + + while (termDocs->next()) { + bts->set(termDocs->doc()); + } + }while(termEnum.next()); + } _CLFINALLY( + termDocs->close(); + _CLDELETE(termDocs); + termEnum.close(); + ) + + return bts; +} + +CL_NS_END diff --git a/src/3rdparty/clucene/src/CLucene/search/WildcardQuery.h b/src/3rdparty/clucene/src/CLucene/search/WildcardQuery.h new file mode 100644 index 0000000..cfc38f6 --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/WildcardQuery.h @@ -0,0 +1,69 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. +------------------------------------------------------------------------------*/ +#ifndef _lucene_search_WildcardQuery_ +#define _lucene_search_WildcardQuery_ +#if defined(_LUCENE_PRAGMA_ONCE) +# pragma once +#endif + +#include "CLucene/index/IndexReader.h" +#include "CLucene/index/Term.h" +#include "MultiTermQuery.h" +#include "WildcardTermEnum.h" + +CL_NS_DEF(search) + + /** Implements the wildcard search query. Supported wildcards are <code>*</code>, which + * matches any character sequence (including the empty one), and <code>?</code>, + * which matches any single character. Note this query can be slow, as it + * needs to iterate over all terms. In order to prevent extremely slow WildcardQueries, + * a Wildcard term must not start with one of the wildcards <code>*</code> or + * <code>?</code>. 
+ * + * @see WildcardTermEnum + */ + class WildcardQuery: public MultiTermQuery { + protected: + FilteredTermEnum* getEnum(CL_NS(index)::IndexReader* reader); + WildcardQuery(const WildcardQuery& clone); + public: + WildcardQuery(CL_NS(index)::Term* term); + ~WildcardQuery(); + + //Returns the string "WildcardQuery" + const TCHAR* getQueryName() const; + static const TCHAR* getClassName(); + + size_t hashCode() const; + bool equals(Query* other) const; + Query* clone() const; + }; + + + +class WildcardFilter: public Filter +{ +private: + CL_NS(index)::Term* term; +protected: + WildcardFilter( const WildcardFilter& copy ); + +public: + WildcardFilter(CL_NS(index)::Term* term); + ~WildcardFilter(); + + /** Returns a BitSet with true for documents which should be permitted in + search results, and false for those that should not. */ + CL_NS(util)::BitSet* bits( CL_NS(index)::IndexReader* reader ); + + Filter* clone() const; + TCHAR* toString(); +}; + + +CL_NS_END +#endif diff --git a/src/3rdparty/clucene/src/CLucene/search/WildcardTermEnum.cpp b/src/3rdparty/clucene/src/CLucene/search/WildcardTermEnum.cpp new file mode 100644 index 0000000..bed9e6e --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/WildcardTermEnum.cpp @@ -0,0 +1,150 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. 
+------------------------------------------------------------------------------*/ +#include "CLucene/StdHeader.h" +#include "WildcardTermEnum.h" + +CL_NS_USE(index) +CL_NS_DEF(search) + + bool WildcardTermEnum::termCompare(Term* term) { + if ( term!=NULL && __term->field() == term->field() ) { + const TCHAR* searchText = term->text(); + const TCHAR* patternText = __term->text(); + if ( _tcsncmp( searchText, pre, preLen ) == 0 ){ + return wildcardEquals(patternText+preLen, __term->textLength()-preLen, 0, searchText, term->textLength(), preLen); + } + } + _endEnum = true; + return false; + } + + /** Creates new WildcardTermEnum */ + WildcardTermEnum::WildcardTermEnum(IndexReader* reader, Term* term): + FilteredTermEnum(), + __term(_CL_POINTER(term)), + fieldMatch(false), + _endEnum(false) + { + + pre = stringDuplicate(term->text()); + + const TCHAR* sidx = _tcschr( pre, LUCENE_WILDCARDTERMENUM_WILDCARD_STRING ); + const TCHAR* cidx = _tcschr( pre, LUCENE_WILDCARDTERMENUM_WILDCARD_CHAR ); + const TCHAR* tidx = sidx; + if (tidx == NULL) + tidx = cidx; + else if ( cidx && cidx > pre) + tidx = min(sidx, cidx); + CND_PRECONDITION(tidx != NULL, "tidx==NULL"); + int32_t idx = (int32_t)(tidx - pre); + preLen = idx; + CND_PRECONDITION(preLen<term->textLength(), "preLen >= term->textLength()"); + pre[preLen]=0; //trim end + + Term* t = _CLNEW Term(__term, pre); + setEnum( reader->terms(t) ); + _CLDECDELETE(t); + } + + void WildcardTermEnum::close() + { + if ( __term != NULL ){ + FilteredTermEnum::close(); + + _CLDECDELETE(__term); + __term = NULL; + + _CLDELETE_CARRAY( pre ); + } + } + WildcardTermEnum::~WildcardTermEnum() { + close(); + } + + qreal WildcardTermEnum::difference() { + return 1.0f; + } + + bool WildcardTermEnum::endEnum() { + return _endEnum; + } + + bool WildcardTermEnum::wildcardEquals(const TCHAR* pattern, int32_t patternLen, int32_t patternIdx, const TCHAR* str, int32_t strLen, int32_t stringIdx) + { + for (int32_t p = patternIdx; ; ++p) + { + for (int32_t s 
= stringIdx; ; ++p, ++s) + { + // End of str yet? + bool sEnd = (s >= strLen); + // End of pattern yet? + bool pEnd = (p >= patternLen); + + // If we're looking at the end of the str... + if (sEnd) + { + // Assume the only thing left on the pattern is/are wildcards + bool justWildcardsLeft = true; + + // Current wildcard position + int32_t wildcardSearchPos = p; + // While we haven't found the end of the pattern, + // and haven't encountered any non-wildcard characters + while (wildcardSearchPos < patternLen && justWildcardsLeft) + { + // Check the character at the current position + TCHAR wildchar = pattern[wildcardSearchPos]; + // If it's not a wildcard character, then there is more + // pattern information after this/these wildcards. + + if (wildchar != LUCENE_WILDCARDTERMENUM_WILDCARD_CHAR && + wildchar != LUCENE_WILDCARDTERMENUM_WILDCARD_STRING){ + justWildcardsLeft = false; + }else{ + // to prevent "cat" matches "ca??" + if (wildchar == LUCENE_WILDCARDTERMENUM_WILDCARD_CHAR) + return false; + + wildcardSearchPos++; // Look at the next character + } + } + + // This was a prefix wildcard search, and we've matched, so + // return true. + if (justWildcardsLeft) + return true; + } + + // If we've gone past the end of the str, or the pattern, + // return false. + if (sEnd || pEnd) + break; + + // Match a single character, so continue. + if (pattern[p] == LUCENE_WILDCARDTERMENUM_WILDCARD_CHAR) + continue; + + if (pattern[p] == LUCENE_WILDCARDTERMENUM_WILDCARD_STRING) + { + // Look at the character beyond the '*'. + ++p; + // Examine the str, starting at the last character. 
+ for (int32_t i = strLen; i >= s; --i) + { + if (wildcardEquals(pattern, patternLen, p, str, strLen, i)) + return true; + } + break; + } + if (pattern[p] != str[s]) + break; + } + return false; + } + } + +CL_NS_END diff --git a/src/3rdparty/clucene/src/CLucene/search/WildcardTermEnum.h b/src/3rdparty/clucene/src/CLucene/search/WildcardTermEnum.h new file mode 100644 index 0000000..2a03735 --- /dev/null +++ b/src/3rdparty/clucene/src/CLucene/search/WildcardTermEnum.h @@ -0,0 +1,67 @@ +/*------------------------------------------------------------------------------ +* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team +* +* Distributable under the terms of either the Apache License (Version 2.0) or +* the GNU Lesser General Public License, as specified in the COPYING file. +------------------------------------------------------------------------------*/ +#ifndef _lucene_search_WildcardTermEnum_ +#define _lucene_search_WildcardTermEnum_ +#if defined(_LUCENE_PRAGMA_ONCE) +# pragma once +#endif + +#include "CLucene/index/IndexReader.h" +#include "CLucene/index/Term.h" +#include "CLucene/index/Terms.h" +#include "FilteredTermEnum.h" + +CL_NS_DEF(search) + /** + * Subclass of FilteredTermEnum for enumerating all terms that match the + * specified wildcard filter term-> + * <p> + * Term enumerations are always ordered by term->compareTo(). Each term in + * the enumeration is greater than all that precede it. + */ + class WildcardTermEnum: public FilteredTermEnum { + private: + CL_NS(index)::Term* __term; + TCHAR* pre; + int32_t preLen; + bool fieldMatch; + bool _endEnum; + + /******************************************** + * const TCHAR* equality with support for wildcards + ********************************************/ + + protected: + bool termCompare(CL_NS(index)::Term* term) ; + + public: + + /** + * Creates a new <code>WildcardTermEnum</code>. 
Passing in a + * {@link Term Term} that does not contain a + * <code>LUCENE_WILDCARDTERMENUM_WILDCARD_STRING</code> or + * <code>LUCENE_WILDCARDTERMENUM_WILDCARD_CHAR</code> will cause an exception to be thrown. + */ + WildcardTermEnum(CL_NS(index)::IndexReader* reader, CL_NS(index)::Term* term); + ~WildcardTermEnum(); + + qreal difference() ; + + bool endEnum() ; + + /** + * Determines if a word matches a wildcard pattern. + */ + static bool wildcardEquals(const TCHAR* pattern, int32_t patternLen, int32_t patternIdx, const TCHAR* str, int32_t strLen, int32_t stringIdx); + + void close(); + + const char* getObjectName(){ return WildcardTermEnum::getClassName(); } + static const char* getClassName(){ return "WildcardTermEnum"; } + }; +CL_NS_END +#endif |