Skip to content

Commit c8b68b0

Browse files
Added fuzzy approximation feature, stemming support, and improved faceting
1 parent 11d48d6 commit c8b68b0

15 files changed

+279
-8
lines changed

Engine.php

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,8 @@ private function getDefaultConfig(){
115115
"var_dir" => $_SERVER['DOCUMENT_ROOT'].DIRECTORY_SEPARATOR."var",
116116
"index_dir" => DIRECTORY_SEPARATOR."engine".DIRECTORY_SEPARATOR."index",
117117
"documents_dir" => DIRECTORY_SEPARATOR."engine".DIRECTORY_SEPARATOR."documents",
118-
"cache_dir" => DIRECTORY_SEPARATOR."engine".DIRECTORY_SEPARATOR."cache"
118+
"cache_dir" => DIRECTORY_SEPARATOR."engine".DIRECTORY_SEPARATOR."cache",
119+
"fuzzy_cost" => 1
119120
],
120121
"schemas" => [
121122
"example-post" => [

Services/Index.php

Lines changed: 71 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,11 @@
1212

1313
class Index
1414
{
15+
/**
16+
* @var array $config
17+
*/
18+
private $config;
19+
1520
/**
1621
* @var Directory $index
1722
*/
@@ -52,6 +57,7 @@ class Index
5257
*/
5358
public function __construct($config, $schemas, $types)
5459
{
60+
$this->config = $config;
5561
$this->schemas = $schemas;
5662
$this->types = $types;
5763
try {
@@ -223,6 +229,18 @@ public function search($query, $filters = [])
223229
arsort($results);
224230
$documents = [];
225231

232+
$facets = [];
233+
if(isset($filters['facets']) && !empty($filters['facets'])) {
234+
foreach ($filters['facets'] as $facet) {
235+
if ($this->index->open("facet_" . $facet, false) !== null) {
236+
$array = $this->index->open("facet_" . $facet, false)->getContent();
237+
foreach ($array as $token => $ids) {
238+
$facets[$facet][$token] = count(array_intersect_key(array_flip($ids), $results));
239+
}
240+
}
241+
}
242+
}
243+
226244
$i = 0;
227245
foreach($results as $doc => $score){
228246
if($i < $filters['offset']){
@@ -235,8 +253,7 @@ public function search($query, $filters = [])
235253
$i++;
236254
}
237255

238-
$facets = [];
239-
if(isset($filters['facets'])){
256+
if(empty($query) && isset($filters['facets'])){
240257
foreach($filters['facets'] as $facet){
241258
if($this->index->open("facet_".$facet,false) !== null){
242259
$array = $this->index->open("facet_".$facet,false)->getContent();
@@ -323,8 +340,9 @@ public function suggest($token, $providePonderations = false){
323340
$tokens = array_keys($all->getContent());
324341
$matching = [];
325342
foreach($tokens as $indexToken){
326-
if(strpos($indexToken, $token) !== false){
327-
$matching[$indexToken] = strpos($indexToken, $token);
343+
$strPos = strpos($indexToken, $token);
344+
if($strPos !== false){
345+
$matching[$indexToken] = $strPos;
328346
}
329347
}
330348
asort($matching);
@@ -339,9 +357,11 @@ public function suggest($token, $providePonderations = false){
339357
* @return array
340358
* @throws Exception
341359
*/
342-
private function fuzzyFind($token){
343-
if(empty($token)) return [];
360+
private function fuzzyFind($token){if(empty($token)) return [];
344361
$matching = $this->suggest($token, true);
362+
if(empty($matching)){
363+
$matching = $this->approximate($token, $this->config['fuzzy_cost']);
364+
}
345365
$found = [];
346366
if(!empty($matching)){
347367
reset($matching);
@@ -358,6 +378,51 @@ private function fuzzyFind($token){
358378
return $found;
359379
}
360380

381+
/**
382+
*
383+
* @param $term
384+
* @param $cost
385+
* @param array $positions
386+
* @return array|mixed
387+
* @throws Exception
388+
*/
389+
private function approximate($term, $cost, $positions = []){
390+
$cached = $this->getCache("approx_".$term);
391+
if(!empty($cached)){
392+
return $cached;
393+
}
394+
$termL = strlen($term);
395+
if($termL <= 1) return []; // we shouldn't approximate one character
396+
$cost = max($cost, $termL-1); // The cost can't be more than the term's length itself
397+
$tokens = array_keys($this->index->open("all")->getContent());
398+
$matching = [];
399+
for($i=0;$i<$termL;$i++){
400+
$termToFind = substr_replace($term, '', $i,1);
401+
foreach($tokens as $token){
402+
$originalToken = $token;
403+
if(!empty($positions)){
404+
foreach($positions as $position){
405+
$token = substr_replace($token, '', $position,1);
406+
}
407+
}
408+
if(strlen($token) >= $termL){
409+
$tokenToLink = substr_replace($token, '', $i,1);
410+
$strPos = strpos($tokenToLink,$termToFind);
411+
if($strPos !== false){
412+
$matching[$originalToken] = $strPos;
413+
}
414+
}
415+
}
416+
if($cost > 1){
417+
$positions[$cost] = $i;
418+
$matching = array_replace($matching, $this->approximate($termToFind,$cost-1, $positions));
419+
}
420+
}
421+
asort($matching);
422+
$this->setCache("approx_".$term, $matching);
423+
return $matching;
424+
}
425+
361426
/**
362427
* @param $data
363428
* @param $schema
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
<?php
2+
3+
namespace VFou\Search\Tokenizers;
4+
5+
use Wamania\Snowball\Danish;
6+
7+
class DanishStemmingTokenizer implements TokenizerInterface
8+
{
9+
10+
public static function tokenize($data)
11+
{
12+
return array_map(function($value){
13+
$stemmer = new Danish();
14+
return $stemmer->stem($value);
15+
}, $data);
16+
}
17+
}

Tokenizers/DutchStemmingTokenizer.php

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
<?php
2+
3+
namespace VFou\Search\Tokenizers;
4+
5+
use Wamania\Snowball\Dutch;
6+
7+
class DutchStemmingTokenizer implements TokenizerInterface
8+
{
9+
10+
public static function tokenize($data)
11+
{
12+
return array_map(function($value){
13+
$stemmer = new Dutch();
14+
return $stemmer->stem($value);
15+
}, $data);
16+
}
17+
}
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
<?php
2+
3+
namespace VFou\Search\Tokenizers;
4+
5+
use Wamania\Snowball\English;
6+
7+
class EnglishStemmingTokenizer implements TokenizerInterface
8+
{
9+
10+
public static function tokenize($data)
11+
{
12+
return array_map(function($value){
13+
$stemmer = new English();
14+
return $stemmer->stem($value);
15+
}, $data);
16+
}
17+
}
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
<?php
2+
3+
namespace VFou\Search\Tokenizers;
4+
5+
use Wamania\Snowball\French;
6+
7+
class FrenchStemmingTokenizer implements TokenizerInterface
8+
{
9+
10+
public static function tokenize($data)
11+
{
12+
return array_map(function($value){
13+
$stemmer = new French();
14+
return $stemmer->stem($value);
15+
}, $data);
16+
}
17+
}
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
<?php
2+
3+
namespace VFou\Search\Tokenizers;
4+
5+
use Wamania\Snowball\German;
6+
7+
class GermanStemmingTokenizer implements TokenizerInterface
8+
{
9+
10+
public static function tokenize($data)
11+
{
12+
return array_map(function($value){
13+
$stemmer = new German();
14+
return $stemmer->stem($value);
15+
}, $data);
16+
}
17+
}
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
<?php
2+
3+
namespace VFou\Search\Tokenizers;
4+
5+
use Wamania\Snowball\Italian;
6+
7+
class ItalianStemmingTokenizer implements TokenizerInterface
8+
{
9+
10+
public static function tokenize($data)
11+
{
12+
return array_map(function($value){
13+
$stemmer = new Italian();
14+
return $stemmer->stem($value);
15+
}, $data);
16+
}
17+
}
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
<?php
2+
3+
namespace VFou\Search\Tokenizers;
4+
5+
use Wamania\Snowball\Norwegian;
6+
7+
class NorwegianStemmingTokenizer implements TokenizerInterface
8+
{
9+
10+
public static function tokenize($data)
11+
{
12+
return array_map(function($value){
13+
$stemmer = new Norwegian();
14+
return $stemmer->stem($value);
15+
}, $data);
16+
}
17+
}
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
<?php
2+
3+
namespace VFou\Search\Tokenizers;
4+
5+
use Wamania\Snowball\Portuguese;
6+
7+
class PortugueseStemmingTokenizer implements TokenizerInterface
8+
{
9+
10+
public static function tokenize($data)
11+
{
12+
return array_map(function($value){
13+
$stemmer = new Portuguese();
14+
return $stemmer->stem($value);
15+
}, $data);
16+
}
17+
}

0 commit comments

Comments
 (0)