ez-pro/core/bitrix/modules/search/tools/stemming.php
2025-11-13 19:04:05 +03:00

360 lines
7.1 KiB
PHP

<?php
function stemming_init($sLang='ru')
{
static $arStemFunc = false;
//Init all languages
if ($arStemFunc === false)
{
$arStemFunc = [];
$rsLanguages = CLanguage::GetList();
while ($arLanguage = $rsLanguages->Fetch())
{
stemming_init($arLanguage['LID']);
}
}
//Check if language was not used
if ($sLang !== false && !isset($arStemFunc[$sLang]))
{
$stemming_function_suf = $sLang;
if (!function_exists('stemming_' . $sLang))
{
$strFileName = $_SERVER['DOCUMENT_ROOT'] . BX_PERSONAL_ROOT . '/php_interface/' . $sLang . '/search/stemming.php';
if (file_exists($strFileName))
{
@include $strFileName;
}
if (!function_exists('stemming_' . $sLang))
{
$strFileName = $_SERVER['DOCUMENT_ROOT'] . '/bitrix/modules/search/tools/' . $sLang . '/stemming.php';
if (file_exists($strFileName))
{
if (\Bitrix\Main\Localization\Translation::allowConvertEncoding())
{
\Bitrix\Main\Localization\StreamConverter::include($strFileName, $sLang);
}
else
{
@include $strFileName;
}
}
if (!function_exists('stemming_' . $sLang))
{
$stemming_function_suf = 'default';
}
}
}
$stemming_stop_function = 'stemming_stop_' . $sLang;
if (!function_exists($stemming_stop_function))
{
$stemming_stop_function = 'stemming_stop_default';
}
$stemming_upper_function = 'stemming_upper_' . $sLang;
if (!function_exists($stemming_upper_function))
{
$stemming_upper_function = 'stemming_upper_default';
}
$letters = stemming_letter_default();
$stemming_letter_function = 'stemming_letter_' . $sLang;
if (function_exists($stemming_letter_function))
{
$letters .= $stemming_letter_function();
}
// Do not use CPageOption feature in the real project. This is for unit tests only.
$letters .= CPageOption::GetOptionString('search', 'letters') ?: COption::GetOptionString('search', 'letters');
if (function_exists($stemming_letter_function))
{
$abc = $stemming_letter_function();
}
else
{
$abc = '';
}
if ($abc == '')
{
$abc = stemming_letter_default();
}
$arStemFunc[$sLang] = [
'stem' => 'stemming_' . $stemming_function_suf,
'stop' => $stemming_stop_function,
'upper' => $stemming_upper_function,
'letters' => $letters,
'pcre_letters' => '\\w\\d' . str_replace(
['\\' , '-' , '^' , ']' , '/'],
['\\\\', '\\-', '\\^', '\\]', '\\/'],
$letters
),
'abc' => $abc,
'pcre_abc' => '\\w\\d' . str_replace(
['\\' , '-' , '^' , ']' , '/'],
['\\\\', '\\-', '\\^', '\\]', '\\/'],
$abc
),
];
}
if ($sLang === false)
{
return $arStemFunc;
}
else
{
return $arStemFunc[$sLang];
}
}
function stemming_upper($sText, $sLang='ru')
{
$arStemFunc = stemming_init($sLang);
$upper_function = $arStemFunc['upper'];
return $upper_function($sText);
}
function stemming_split($sText, $sLang='ru')
{
$arStemFunc = stemming_init($sLang);
$words = [];
$tok = ' ';
$sText = stemming_upper($sText, $sLang);
$sText = preg_replace('/[^' . $arStemFunc['pcre_letters'] . ']/u', $tok, $sText);
$word = strtok($sText, $tok);
while ($word !== false)
{
$word = mb_substr($word, 0, 100);
if (!isset($words[$word]))
{
$words[$word] = mb_strpos($sText, $word);
}
$word = strtok($tok);
}
return $words;
}
function stemming($sText, $sLang='ru', $bIgnoreStopWords = false, $bReturnPositions = false)
{
static $STOP_CACHE = [];
if (!isset($STOP_CACHE[$sLang]))
{
$STOP_CACHE[$sLang] = [];
}
$stop_cache = &$STOP_CACHE[$sLang];
//Result
$stems = [];
//Get info about all languages
$arStemInfo = stemming_init(false);
//Add default functions if language was not defined
if (!isset($arStemInfo[$sLang]))
{
$arStemInfo[$sLang] = stemming_init($sLang);
}
$stem_func = $arStemInfo[$sLang]['stem'];
$pcre_abc = '/[^' . $arStemInfo[$sLang]['pcre_abc'] . ']+/u';
//Delimiter of the words
$tok = ' ';
$sText = stemming_upper($sText, $sLang);
if ($bReturnPositions)
{
$sText = preg_replace('/[^' . $arStemInfo[$sLang]['pcre_letters'] . '.!?]+/u', $tok, $sText);
$sText = preg_replace('/[!?]+/u', '.', $sText);
}
else
{
$sText = preg_replace('/[^' . $arStemInfo[$sLang]['pcre_letters'] . ']+/u', $tok, $sText);
}
//Parse text
$words = strtok($sText, $tok);
$pos = 1;
while ($words !== false)
{
if ($bReturnPositions)
{
$words = explode('.', $words);
}
else
{
$words = [$words];
}
foreach ($words as $i => $word)
{
$word = mb_substr($word, 0, 50);
if ($bReturnPositions)
{
if ($i > 0)
{
$pos += 5; //Sentence distance
}
if ($word == '')
{
continue;
}
}
//Try to stem starting with desired language
//1 - stemming may return more than one word
$stem = $stem_func($word, 1);
$stop_lang = $sLang;
//If word equals it's stemming
//and has letters not from ABC
if (
!is_array($stem)
&& $stem === $word
&& preg_match($pcre_abc, $word)
)
{
//Do the best to detect correct one
$guess = stemming_detect($word, $arStemInfo, $sLang);
if ($guess[0] <> '')
{
$stem = $guess[0];
$stop_lang = $guess[1];
}
}
if ($bIgnoreStopWords)
{
if (is_array($stem))
{
foreach ($stem as $st)
{
$stems[$st] = isset($stems[$st]) ? $stems[$st] + $pos : $pos;
}
}
else
{
$stems[$stem] = isset($stems[$stem]) ? $stems[$stem] + $pos : $pos;
}
}
else
{
$stop_func = $arStemInfo[$stop_lang]['stop'];
if (is_array($stem))
{
foreach ($stem as $st)
{
if (!isset($stop_cache[$st]))
{
$stop_cache[$st] = $stop_func($st);
}
if ($stop_cache[$st])
{
$stems[$st] = isset($stems[$st]) ? $stems[$st] + $pos : $pos;
}
}
}
else
{
if (!isset($stop_cache[$stem]))
{
$stop_cache[$stem] = $stop_func($stem);
}
if ($stop_cache[$stem])
{
$stems[$stem] = isset($stems[$stem]) ? $stems[$stem] + $pos : $pos;
}
}
}
if ($bReturnPositions)
{
$pos++;
}
}
//Next word
$words = strtok($tok);
}
return $stems;
}
function stemming_detect($word, $arStemInfo, $skipLang)
{
$stem = '';
$lang = '';
foreach ($arStemInfo as $sGuessLang => $arInfo)
{
if ($sGuessLang === $skipLang)
{
continue;
}
//Word has letters not from ABC, so skip to next language
if (preg_match('/[^' . $arInfo['pcre_abc'] . ']+/u', $word))
{
continue;
}
$stem = $arInfo['stem']($word);
$lang = $sGuessLang;
//It looks like stemming succseeded
if ($stem !== $word)
{
break;
}
//Check if stop function flag word as stop
$stop_func = $arInfo['stop'];
if (!$stop_func($stem))
{
break;
}
}
//It' s the best we can do
//return word and lang to use as stop
return [$stem, $lang];
}
function stemming_upper_default($sText)
{
return mb_strtoupper($sText);
}
function stemming_default($sText)
{
return $sText;
}
function stemming_stop_default($sWord)
{
if (mb_strlen($sWord) < 2)
{
return false;
}
else
{
return true;
}
}
function stemming_letter_default()
{
return 'qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM0123456789';
}