summary refs log tree commit diff stats
path: root/src/search_php.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/search_php.h')
-rw-r--r-- src/search_php.h 68
1 files changed, 38 insertions, 30 deletions
diff --git a/src/search_php.h b/src/search_php.h
index fb6668a..337662c 100644
--- a/src/search_php.h
+++ b/src/search_php.h
@@ -1,3 +1,4 @@
+"<?\n"
"function readInt($file)\n"
"{\n"
" $b1 = ord(fgetc($file)); $b2 = ord(fgetc($file));\n"
@@ -60,10 +61,15 @@
" }\n"
" $w = readString($file);\n"
" }\n"
-" $totalFreq=0;\n"
+" $totalHi=0;\n"
+" $totalFreqHi=0;\n"
+" $totalFreqLo=0;\n"
" for ($count=$start;$count<sizeof($statsList);$count++)\n"
" {\n"
" $statInfo = &$statsList[$count];\n"
+" $multiplier = 1;\n"
+" // whole word matches have a double weight\n"
+" if ($statInfo[\"full\"]) $multiplier=2;\n"
" fseek($file,$statInfo[\"index\"]); \n"
" $numDocs = readInt($file);\n"
" $docInfo = array();\n"
@@ -72,11 +78,22 @@
" {\n"
" $idx=readInt($file); \n"
" $freq=readInt($file); \n"
-" $docInfo[$i]=array(\"idx\"=>$idx,\"freq\"=>$freq,\"rank\"=>0.0);\n"
-" $totalFreq+=$freq;\n"
-" if ($statInfo[\"full\"]) $totalFreq+=$freq;\n"
+" $docInfo[$i]=array(\"idx\" => $idx,\n"
+" \"freq\" => $freq>>1,\n"
+" \"rank\" => 0.0,\n"
+" \"hi\" => $freq&1\n"
+" );\n"
+" if ($freq&1) // word occurs in high priority doc\n"
+" {\n"
+" $totalHi++;\n"
+" $totalFreqHi+=$freq*$multiplier;\n"
+" }\n"
+" else // word occurs in low priority doc\n"
+" {\n"
+" $totalFreqLo+=$freq*$multiplier;\n"
+" }\n"
" }\n"
-" // read name an url info for the doc\n"
+" // read name and url info for the doc\n"
" for ($i=0;$i<$numDocs;$i++)\n"
" {\n"
" fseek($file,$docInfo[$i][\"idx\"]);\n"
@@ -85,15 +102,28 @@
" }\n"
" $statInfo[\"docs\"]=$docInfo;\n"
" }\n"
+" $totalFreq=($totalHi+1)*$totalFreqLo + $totalFreqHi;\n"
" for ($count=$start;$count<sizeof($statsList);$count++)\n"
" {\n"
" $statInfo = &$statsList[$count];\n"
+" $multiplier = 1;\n"
+" // whole word matches have a double weight\n"
+" if ($statInfo[\"full\"]) $multiplier=2;\n"
" for ($i=0;$i<sizeof($statInfo[\"docs\"]);$i++)\n"
" {\n"
" $docInfo = &$statInfo[\"docs\"];\n"
" // compute frequency rank of the word in each doc\n"
-" $statInfo[\"docs\"][$i][\"rank\"]=\n"
-" (float)$docInfo[$i][\"freq\"]/$totalFreq;\n"
+" $freq=$docInfo[$i][\"freq\"];\n"
+" if ($docInfo[$i][\"hi\"])\n"
+" {\n"
+" $statInfo[\"docs\"][$i][\"rank\"]=\n"
+" (float)($freq*$multiplier+$totalFreqLo)/$totalFreq;\n"
+" }\n"
+" else\n"
+" {\n"
+" $statInfo[\"docs\"][$i][\"rank\"]=\n"
+" (float)($freq*$multiplier)/$totalFreq;\n"
+" }\n"
" }\n"
" }\n"
" }\n"
@@ -113,7 +143,6 @@
" if (in_array($key, array_keys($docs)))\n"
" {\n"
" $docs[$key][\"rank\"]+=$rank;\n"
-" $docs[$key][\"rank\"]*=2; // multiple matches increases rank \n"
" }\n"
" else\n"
" {\n"
@@ -132,25 +161,6 @@
" return $docs;\n"
"}\n"
"\n"
-"function normalize_ranking(&$docs)\n"
-"{\n"
-" $maxRank = 0.0000001;\n"
-" // compute maximal rank\n"
-" foreach ($docs as $doc) \n"
-" {\n"
-" if ($doc[\"rank\"]>$maxRank)\n"
-" {\n"
-" $maxRank=$doc[\"rank\"];\n"
-" }\n"
-" }\n"
-" reset($docs);\n"
-" // normalize rankings\n"
-" while (list ($key, $val) = each ($docs)) \n"
-" {\n"
-" $docs[$key][\"rank\"]*=100/$maxRank;\n"
-" }\n"
-"}\n"
-"\n"
"function filter_results($docs,&$requiredWords,&$forbiddenWords)\n"
"{\n"
" $filteredDocs=array();\n"
@@ -284,7 +294,7 @@
" if (!in_array($word,$foundWords))\n"
" {\n"
" $foundWords[]=$word;\n"
-" search($file,strtolower($word),$results);\n"
+" search($file,$word,$results);\n"
" }\n"
" $word=strtok(\" \");\n"
" }\n"
@@ -293,8 +303,6 @@
" // filter out documents with forbidden word or that do not contain\n"
" // required words\n"
" $filteredDocs = filter_results($docs,$requiredWords,$forbiddenWords);\n"
-" // normalize rankings so they are in the range [0-100]\n"
-" normalize_ranking($filteredDocs);\n"
" // sort the results based on rank\n"
" $sorted = array();\n"
" sort_results($filteredDocs,$sorted);\n"