diff options
Diffstat (limited to 'src/search_php.h')
-rw-r--r-- | src/search_php.h | 68 |
1 files changed, 38 insertions, 30 deletions
diff --git a/src/search_php.h b/src/search_php.h index fb6668a..337662c 100644 --- a/src/search_php.h +++ b/src/search_php.h @@ -1,3 +1,4 @@ +"<?\n" "function readInt($file)\n" "{\n" " $b1 = ord(fgetc($file)); $b2 = ord(fgetc($file));\n" @@ -60,10 +61,15 @@ " }\n" " $w = readString($file);\n" " }\n" -" $totalFreq=0;\n" +" $totalHi=0;\n" +" $totalFreqHi=0;\n" +" $totalFreqLo=0;\n" " for ($count=$start;$count<sizeof($statsList);$count++)\n" " {\n" " $statInfo = &$statsList[$count];\n" +" $multiplier = 1;\n" +" // whole word matches have a double weight\n" +" if ($statInfo[\"full\"]) $multiplier=2;\n" " fseek($file,$statInfo[\"index\"]); \n" " $numDocs = readInt($file);\n" " $docInfo = array();\n" @@ -72,11 +78,22 @@ " {\n" " $idx=readInt($file); \n" " $freq=readInt($file); \n" -" $docInfo[$i]=array(\"idx\"=>$idx,\"freq\"=>$freq,\"rank\"=>0.0);\n" -" $totalFreq+=$freq;\n" -" if ($statInfo[\"full\"]) $totalFreq+=$freq;\n" +" $docInfo[$i]=array(\"idx\" => $idx,\n" +" \"freq\" => $freq>>1,\n" +" \"rank\" => 0.0,\n" +" \"hi\" => $freq&1\n" +" );\n" +" if ($freq&1) // word occurs in high priority doc\n" +" {\n" +" $totalHi++;\n" +" $totalFreqHi+=$freq*$multiplier;\n" +" }\n" +" else // word occurs in low priority doc\n" +" {\n" +" $totalFreqLo+=$freq*$multiplier;\n" +" }\n" " }\n" -" // read name an url info for the doc\n" +" // read name and url info for the doc\n" " for ($i=0;$i<$numDocs;$i++)\n" " {\n" " fseek($file,$docInfo[$i][\"idx\"]);\n" @@ -85,15 +102,28 @@ " }\n" " $statInfo[\"docs\"]=$docInfo;\n" " }\n" +" $totalFreq=($totalHi+1)*$totalFreqLo + $totalFreqHi;\n" " for ($count=$start;$count<sizeof($statsList);$count++)\n" " {\n" " $statInfo = &$statsList[$count];\n" +" $multiplier = 1;\n" +" // whole word matches have a double weight\n" +" if ($statInfo[\"full\"]) $multiplier=2;\n" " for ($i=0;$i<sizeof($statInfo[\"docs\"]);$i++)\n" " {\n" " $docInfo = &$statInfo[\"docs\"];\n" " // compute frequency rank of the word in each doc\n" -" $statInfo[\"docs\"][$i][\"rank\"]=\n" -" (float)$docInfo[$i][\"freq\"]/$totalFreq;\n" +" $freq=$docInfo[$i][\"freq\"];\n" +" if ($docInfo[$i][\"hi\"])\n" +" {\n" +" $statInfo[\"docs\"][$i][\"rank\"]=\n" +" (float)($freq*$multiplier+$totalFreqLo)/$totalFreq;\n" +" }\n" +" else\n" +" {\n" +" $statInfo[\"docs\"][$i][\"rank\"]=\n" +" (float)($freq*$multiplier)/$totalFreq;\n" +" }\n" " }\n" " }\n" " }\n" @@ -113,7 +143,6 @@ " if (in_array($key, array_keys($docs)))\n" " {\n" " $docs[$key][\"rank\"]+=$rank;\n" -" $docs[$key][\"rank\"]*=2; // multiple matches increases rank \n" " }\n" " else\n" " {\n" @@ -132,25 +161,6 @@ " return $docs;\n" "}\n" "\n" -"function normalize_ranking(&$docs)\n" -"{\n" -" $maxRank = 0.0000001;\n" -" // compute maximal rank\n" -" foreach ($docs as $doc) \n" -" {\n" -" if ($doc[\"rank\"]>$maxRank)\n" -" {\n" -" $maxRank=$doc[\"rank\"];\n" -" }\n" -" }\n" -" reset($docs);\n" -" // normalize rankings\n" -" while (list ($key, $val) = each ($docs)) \n" -" {\n" -" $docs[$key][\"rank\"]*=100/$maxRank;\n" -" }\n" -"}\n" -"\n" "function filter_results($docs,&$requiredWords,&$forbiddenWords)\n" "{\n" " $filteredDocs=array();\n" @@ -284,7 +294,7 @@ " if (!in_array($word,$foundWords))\n" " {\n" " $foundWords[]=$word;\n" -" search($file,strtolower($word),$results);\n" +" search($file,$word,$results);\n" " }\n" " $word=strtok(\" \");\n" " }\n" @@ -293,8 +303,6 @@ " // filter out documents with forbidden word or that do not contain\n" " // required words\n" " $filteredDocs = filter_results($docs,$requiredWords,$forbiddenWords);\n" -" // normalize rankings so they are in the range [0-100]\n" -" normalize_ranking($filteredDocs);\n" " // sort the results based on rank\n" " $sorted = array();\n" " sort_results($filteredDocs,$sorted);\n" |