ez-pro/core/bitrix/modules/search/tools/language.php
2025-11-13 19:04:05 +03:00

630 lines
13 KiB
PHP

<?php
/**
 * Keyboard-layout and language-guessing helper for search phrases.
 *
 * Text is represented as "scan codes": zero-based positions of characters in
 * the strings returned by GetKeyboardLayout(). A scan code of `false` marks a
 * word boundary or a character that is not part of the layout.
 *
 * Language specific subclasses are looked up by GetLanguage() under the name
 * `CSearchLanguage<lang>`; they may override GetKeyboardLayout() and
 * PreGuessLanguage() and may provide GetBigrammLetterFreq().
 */
class CSearchLanguage
{
	/** @var array Alphabet letters as keys; filled by GetLanguage() from stemming_init()['abc']. */
	public $_abc = [];
	/** @var mixed Language identifier passed to the constructor. */
	public $_lang_id;
	/** @var array|null Memoized result of GetBigrammScancodeFreq(). */
	public $_lang_bigramm_cache;
	/** @var array Binary scan-code triples (3 bytes, each chr(code + 1)) => true. */
	public $_trigrams = [];
	/** @var bool|null Whether GetBigrammLetterFreq() is callable; null until resolved. */
	public $_has_bigramm_info = null;
	/** @var mixed Not used by this class. */
	public $_bigrams = null;

	/**
	 * @param mixed $lang_id Language identifier, used as the key of the per-language scan-code cache.
	 */
	public function __construct($lang_id)
	{
		$this->_lang_id = $lang_id;
	}

	/**
	 * Returns the (per-request cached) language object for $sLang.
	 *
	 * Lookup order when class `csearchlanguage<lang>` is not declared yet:
	 * 1. BX_PERSONAL_ROOT/php_interface/<lang>/search/language.php (may return an object);
	 * 2. /bitrix/modules/search/tools/<lang>/language.php;
	 * 3. the generic CSearchLanguage.
	 * The directory of the last attempted file is passed to LoadTrigrams().
	 *
	 * NOTE(review): $sLang is concatenated into include paths as is; callers
	 * are presumably expected to pass trusted language ids only - confirm.
	 *
	 * @param string $sLang Language identifier.
	 * @return CSearchLanguage
	 */
	public static function GetLanguage($sLang)
	{
		static $arLanguages = [];

		if (!isset($arLanguages[$sLang]))
		{
			$obLanguage = null;
			//Stays empty when the language class was declared before this call,
			//in which case there is no known directory to read trigrams from
			$strDirName = '';
			$class_name = mb_strtolower('CSearchLanguage' . $sLang);

			if (!class_exists($class_name))
			{
				//First try to load customized class
				$strDirName = $_SERVER['DOCUMENT_ROOT'] . BX_PERSONAL_ROOT . '/php_interface/' . $sLang . '/search';
				$strFileName = $strDirName . '/language.php';
				if (file_exists($strFileName))
				{
					$obLanguage = @include $strFileName;
				}

				if (!is_object($obLanguage) && !class_exists($class_name))
				{
					//Then module class
					$strDirName = $_SERVER['DOCUMENT_ROOT'] . '/bitrix/modules/search/tools/' . $sLang;
					$strFileName = $strDirName . '/language.php';
					if (file_exists($strFileName))
					{
						if (\Bitrix\Main\Localization\Translation::allowConvertEncoding())
						{
							\Bitrix\Main\Localization\StreamConverter::include($strFileName, $sLang);
						}
						else
						{
							@include $strFileName;
						}
					}

					//Nothing declared the language class: use the generic one
					if (!class_exists($class_name))
					{
						$class_name = 'CSearchLanguage';
					}
				}
			}

			if (!is_object($obLanguage))
			{
				$obLanguage = new $class_name($sLang);
			}

			$obLanguage->LoadTrigrams($strDirName);

			$arStemInfo = stemming_init($sLang);
			if (is_array($arStemInfo))
			{
				$obLanguage->_abc = array_flip($obLanguage->StrToArray($arStemInfo['abc']));
			}

			$obLanguage->_has_bigramm_info = is_callable([$obLanguage, 'getbigrammletterfreq']);

			$arLanguages[$sLang] = $obLanguage;
		}

		return $arLanguages[$sLang];
	}

	/**
	 * Reads file with trigrams (combinations not allowed in the words).
	 *
	 * The file `<dir_name>/trigram` holds one three-character combination per
	 * line. The parsed result is stored through CPHPCache, keyed by the file
	 * modification time and path. Does nothing when trigrams are already
	 * loaded, when $dir_name is empty or when the file does not exist.
	 *
	 * @param string $dir_name Directory that contains the `trigram` file.
	 * @return void
	 */
	public function LoadTrigrams($dir_name)
	{
		if (!empty($this->_trigrams) || (string)$dir_name === '')
		{
			return;
		}

		$file_name = $dir_name . '/trigram';
		if (!file_exists($file_name) || !is_file($file_name))
		{
			return;
		}

		$cache_id = filemtime($file_name) . ',v1,' . $file_name;
		$obCache = new CPHPCache;
		if ($obCache->StartDataCache(360000, $cache_id, 'search'))
		{
			$text = file_get_contents($file_name);
			if ($text === false)
			{
				//Unreadable file is treated as an empty one
				$text = '';
			}

			$keyboard = $this->GetKeyboardLayout();
			if (isset($keyboard['trigram_charset']))
			{
				$text = \Bitrix\Main\Text\Encoding::convertEncoding($text, $keyboard['trigram_charset'], 'utf8');
			}

			foreach (explode("\n", $text) as $trigramm)
			{
				//Tolerate CRLF line endings
				$trigramm = rtrim($trigramm, "\r");
				if (mb_strlen($trigramm) == 3)
				{
					$strScanCodesTmp = $this->ConvertToScancode($trigramm, false, true);
					//Binary form is one byte per recognized character
					if (strlen($strScanCodesTmp) == 3)
					{
						$this->_trigrams[$strScanCodesTmp] = true;
					}
				}
			}

			$obCache->EndDataCache($this->_trigrams);
		}
		else
		{
			$this->_trigrams = $obCache->GetVars();
		}
	}

	/**
	 * @return bool True when at least one trigram is loaded.
	 */
	public function HasTrigrams()
	{
		return !empty($this->_trigrams);
	}

	/**
	 * Check phrase against trigrams.
	 *
	 * Slides a window of three scan codes over every word and counts the
	 * windows that are present in the trigram table.
	 *
	 * @param array $arScanCodes Scan codes as returned by ConvertToScancode(); `false` separates words.
	 * @return int Number of matched windows.
	 */
	public function CheckTrigrams($arScanCodes)
	{
		$result = 0;
		$check = '';

		foreach ($arScanCodes as $code)
		{
			if ($code === false) //new word starts here
			{
				$check = '';
				continue;
			}

			//running window of 3 bytes, same encoding as the table keys
			$check = substr($check . chr($code + 1), -3);
			if (strlen($check) === 3 && isset($this->_trigrams[$check]))
			{
				$result++;
			}
		}

		return $result;
	}

	/**
	 * This function returns positions of the letters
	 * on the keyboard. This one is default English layout.
	 *
	 * 'lo' and 'hi' list the unshifted and shifted characters key by key and
	 * must have the same length, so that both characters of one key get the
	 * same scan code. Spaces are placeholders for keys that are not mapped.
	 * The number row is assumed to have 13 keys (standard US layout).
	 *
	 * @return array Keys 'lo' and 'hi'; subclasses may add 'trigram_charset'.
	 */
	public function GetKeyboardLayout()
	{
		return [
			'lo' => '`' . str_repeat(' ', 10) . '- '
				. 'qwertyuiop[]'
				. "asdfghjkl;'"
				. 'zxcvbnm,. ',
			'hi' => '~' . str_repeat(' ', 12)
				. 'QWERTYUIOP{}'
				. 'ASDFGHJKL:"'
				. 'ZXCVBNM<> ',
		];
	}

	/**
	 * Converts scan codes back to unshifted characters of this layout.
	 *
	 * @param array $arScancode Scan codes; `false` entries become a space.
	 * @return string
	 */
	public function ConvertFromScancode($arScancode)
	{
		$result = '';
		$keyboard = $this->GetKeyboardLayout();

		foreach ($arScancode as $code)
		{
			//false would otherwise be coerced to position 0
			$result .= ($code === false) ? ' ' : mb_substr($keyboard['lo'], $code, 1);
		}

		return $result;
	}

	/**
	 * Splits a string into an array of its (multibyte) characters.
	 *
	 * @param string $str
	 * @return array List of one-character strings; empty for an empty string.
	 */
	public static function StrToArray($str)
	{
		$str = (string)$str;
		if ($str === '')
		{
			return [];
		}

		if (function_exists('mb_str_split'))
		{
			return mb_str_split($str);
		}

		$result = [];
		$len = mb_strlen($str);
		for ($i = 0; $i < $len; $i++)
		{
			$result[] = mb_substr($str, $i, 1);
		}

		return $result;
	}

	/**
	 * This function converts text between layouts.
	 *
	 * Every character of $text that exists in the $from layout is replaced by
	 * the character at the same key position of the $to layout. Placeholder
	 * spaces are never remapped. Text is returned unchanged when one of the
	 * layouts is not available.
	 *
	 * @param string $text
	 * @param string $from Language id of the layout the text was typed in.
	 * @param string $to Language id of the wanted layout.
	 * @return string
	 */
	public static function ConvertKeyboardLayout($text, $from, $to)
	{
		static $keyboards = [];

		$combo = $from . '|' . $to;
		if (!isset($keyboards[$combo]))
		{
			//Fill local cache
			foreach ([$from, $to] as $lang_id)
			{
				if (!array_key_exists($lang_id, $keyboards))
				{
					$ob = CSearchLanguage::GetLanguage($lang_id);
					$keyboard = $ob->GetKeyboardLayout();
					if (is_array($keyboard))
					{
						$keyboards[$lang_id] = array_merge($ob->StrToArray($keyboard['lo']), $ob->StrToArray($keyboard['hi']));
					}
					else
					{
						$keyboards[$lang_id] = null;
					}
				}
			}

			//when both layouts defined
			if (isset($keyboards[$from]) && isset($keyboards[$to]))
			{
				$map = [];
				foreach ($keyboards[$from] as $i => $ch)
				{
					//space is a placeholder for an unmapped key;
					//target layout may be shorter than the source one
					if ($ch !== ' ' && isset($keyboards[$to][$i]))
					{
						$map[$ch] = $keyboards[$to][$i];
					}
				}
				$keyboards[$combo] = $map;
			}
		}

		if (!isset($keyboards[$combo]))
		{
			return $text;
		}

		$chars = static::StrToArray($text);
		foreach ($chars as $pos => $char)
		{
			if (isset($keyboards[$combo][$char]))
			{
				$chars[$pos] = $keyboards[$combo][$char];
			}
		}

		return implode('', $chars);
	}

	/**
	 * This function converts text into array of character positions
	 * on the keyboard. Not defined chars turns into "false" value.
	 *
	 * @param string $text
	 * @param bool $strict When true, characters missing from $this->_abc are treated as not defined.
	 * @param bool $binary When true, returns a string with one byte chr(position + 1)
	 *                     per defined character and skips everything else.
	 * @return array|string List of int|false, or a binary string when $binary is true.
	 */
	public function ConvertToScancode($text, $strict = false, $binary = false)
	{
		//character => position, per language id
		static $cache = [];

		if (!isset($cache[$this->_lang_id]))
		{
			$keyboard = $this->GetKeyboardLayout();
			$map = [];
			//'hi' goes last so it wins when a character is present in both rows
			foreach (['lo', 'hi'] as $row)
			{
				foreach ($this->StrToArray($keyboard[$row]) as $pos => $ch)
				{
					$map[$ch] = $pos;
				}
			}
			$cache[$this->_lang_id] = $map;
		}

		$scancodes = $cache[$this->_lang_id];
		$chars = $this->StrToArray($text);

		if ($binary)
		{
			$result = '';
			foreach ($chars as $ch)
			{
				if (
					$ch !== ' '
					&& isset($scancodes[$ch])
					&& (!$strict || isset($this->_abc[$ch]))
				)
				{
					$result .= chr($scancodes[$ch] + 1);
				}
			}

			return $result;
		}

		$result = [];
		foreach ($chars as $ch)
		{
			$is_defined = $ch !== ' '
				&& (!$strict || isset($this->_abc[$ch]))
				&& isset($scancodes[$ch]);

			$result[] = $is_defined ? $scancodes[$ch] : false;
		}

		return $result;
	}

	/**
	 * Hook that lets a language class make its own guess before the generic
	 * detection runs. The base implementation has no own guess.
	 *
	 * In subclasses you should return array("from" => lang, "to" => lang) to
	 * translate, or return true when no translation is needed, or false for
	 * further processing by GuessLanguage().
	 *
	 * @param string $text
	 * @param string|bool $lang Language id the hook is called for.
	 * @return array|bool
	 */
	public function PreGuessLanguage($text, $lang = false)
	{
		//Indicates that there is no own guess
		return false;
	}

	/**
	 * Tries to detect that $text was typed in a wrong keyboard layout.
	 *
	 * @param string $text
	 * @param array|bool $lang List of language ids to consider; when it is not
	 *                         an array all languages from CLanguage::GetList()
	 *                         are used, English first.
	 * @return array|bool array('from' => lang, 'to' => lang) when the text should
	 *                    be converted between layouts, false otherwise.
	 */
	public static function GuessLanguage($text, $lang = false)
	{
		//Loose comparison is intentional: null and false count as empty text
		if ($text == '')
		{
			return false;
		}

		static $cache = [];

		if (is_array($lang))
		{
			$arLanguages = $lang;
		}
		else
		{
			if (empty($cache))
			{
				$cache[] = 'en';//English is always in mind and on the first place
				$rsLanguages = CLanguage::GetList();
				while ($arLanguage = $rsLanguages->Fetch())
				{
					if ($arLanguage['LID'] != 'en')
					{
						$cache[] = $arLanguage['LID'];
					}
				}
			}
			$arLanguages = $cache;
		}

		if (count($arLanguages) < 2)
		{
			return false;
		}

		//Give customized languages a chance to guess
		foreach ($arLanguages as $lang_id)
		{
			$res = CSearchLanguage::GetLanguage($lang_id)->PreGuessLanguage($text, $lang_id);
			if (is_array($res))
			{
				return $res;
			}
			elseif ($res === true)
			{
				return false;
			}
		}

		//First try to detect language which
		//was used to type the phrase
		$max_len = 0;
		$languages_from = [];
		$recognized = [];
		foreach ($arLanguages as $lang_id)
		{
			$arScanCodes = CSearchLanguage::GetLanguage($lang_id)->ConvertToScancode($text, true);
			$recognized[$lang_id] = self::countScanCodes($arScanCodes);
			if ($recognized[$lang_id] > $max_len)
			{
				$max_len = $recognized[$lang_id];
			}
			$languages_from[$lang_id] = $arScanCodes;
		}

		if ($max_len < 2)
		{
			return false;
		}

		//Keep only the languages that recognized the most characters.
		//All of them have the same count, so no further ordering is needed.
		foreach ($recognized as $lang_id => $count)
		{
			if ($count < $max_len)
			{
				unset($languages_from[$lang_id]);
			}
		}

		//If more than one language is detected as input
		//try to get one with best trigram info
		$arDetectionFrom = [];
		$i = 0;
		foreach ($languages_from as $lang_id => $arScanCodes)
		{
			$ob = CSearchLanguage::GetLanguage($lang_id);
			//Calculate how far sequence of scan codes
			//is from language model
			$deviation = $ob->GetDeviation($arScanCodes);

			$arDetectionFrom[$lang_id] = [
				$ob->HasTrigrams(),
				$ob->CheckTrigrams($arScanCodes),
				$deviation[1],
				intval($deviation[0] * 100),
				$i,
			];
			$i++;
		}
		uasort($arDetectionFrom, ['CSearchLanguage', 'cmp']);

		//Now try the best to detect the language
		$arDetection = [];
		$i = 0;
		foreach ($arDetectionFrom as $lang_from => $unused)
		{
			foreach ($arLanguages as $lang_to)
			{
				$ob = CSearchLanguage::GetLanguage($lang_to);
				$alt_text = CSearchLanguage::ConvertKeyboardLayout($text, $lang_from, $lang_to);
				$arScanCodes = $ob->ConvertToScancode($alt_text, true);
				//Calculate how far sequence of scan codes
				//is from language model
				$deviation = $ob->GetDeviation($arScanCodes);

				$lang_from_to = $lang_from . '=>' . $lang_to;
				//Compared element by element, the smallest wins
				$arDetection[$lang_from_to] = [
					$ob->HasBigrammInfo() ? 0 : 1,
					$ob->CheckTrigrams($arScanCodes),
					-self::countScanCodes($arScanCodes),
					$deviation[1],
					$deviation[0],
					$i,
					$lang_from_to,
				];
				$i++;
			}
		}
		uasort($arDetection, ['CSearchLanguage', 'cmp']);

		reset($arDetection);
		$language_from_to = key($arDetection);
		list($language_from, $language_to) = explode('=>', $language_from_to);

		$alt_text = CSearchLanguage::ConvertKeyboardLayout($text, $language_from, $language_to);
		if ($alt_text === $text)
		{
			return false;
		}

		return ['from' => $language_from, 'to' => $language_to];
	}

	/**
	 * Counts recognized characters in a list of scan codes.
	 * Scan code 0 is a valid key position and must be counted,
	 * only `false` marks an unrecognized character.
	 *
	 * @param array $arScanCodes
	 * @return int
	 */
	private static function countScanCodes($arScanCodes)
	{
		$count = 0;
		foreach ($arScanCodes as $code)
		{
			if ($code !== false)
			{
				$count++;
			}
		}

		return $count;
	}

	/**
	 * Compare two results of text analysis.
	 * Lists are compared element by element, the first difference decides.
	 *
	 * @param array $a
	 * @param array $b List with at least as many elements as $a.
	 * @return int -1, 0 or 1.
	 */
	public static function cmp($a, $b)
	{
		$c = count($a);
		for ($i = 0; $i < $c; $i++)
		{
			if ($a[$i] < $b[$i])
			{
				return -1;
			}
			elseif ($a[$i] > $b[$i])
			{
				return 1;
			}
		}

		//Callers put a unique value into every list, so this is not expected
		return 0;
	}

	/**
	 * Function returns distance of the text (sequence of scan codes)
	 * from language model.
	 *
	 * @param array $arScanCodes
	 * @return array [deviation, zeroes]: accumulated difference between text and
	 *               language bigram frequencies, and the number of text bigrams
	 *               that are absent from the language model.
	 */
	public function GetDeviation($arScanCodes)
	{
		//This is language model
		$lang_bigrams = $this->GetBigrammScancodeFreq();
		$lang_count = $lang_bigrams['count'];
		unset($lang_bigrams['count']);

		//This is text model
		$text_bigrams = $this->ConvertToBigramms($arScanCodes);
		$count = $text_bigrams['count'];
		unset($text_bigrams['count']);

		$deviation = 0;
		$zeroes = 0;
		foreach ($text_bigrams as $key => $value)
		{
			if (isset($lang_bigrams[$key]))
			{
				$term = abs(1 / $count - $lang_bigrams[$key] / $lang_count);
			}
			else
			{
				$term = 1 / $count;
				$zeroes += $value;
			}

			//One term per occurrence; summed (not multiplied) on purpose
			//to keep floating point results stable
			for ($i = 0; $i < $value; $i++)
			{
				$deviation += $term;
			}
		}

		return [$deviation, $zeroes];
	}

	/**
	 * Function returns bigramms of the text (array of scancodes).
	 * For example "FAT RAT" gives pairs FA, AT, RA, AT.
	 * This is model of the text.
	 *
	 * @param array $arScancodes
	 * @return array Key 'count' holds the total number of pairs, keys
	 *               "<code1> <code2>" hold the number of occurrences of a pair.
	 */
	public function ConvertToBigramms($arScancodes)
	{
		$result = ['count' => 0];

		$len = count($arScancodes) - 1;
		for ($i = 0; $i < $len; $i++)
		{
			$code1 = $arScancodes[$i];
			$code2 = $arScancodes[$i + 1];
			//Pairs never span a word boundary
			if ($code1 === false || $code2 === false)
			{
				continue;
			}

			$key = $code1 . ' ' . $code2;
			if (!isset($result[$key]))
			{
				$result[$key] = 0;
			}
			$result[$key]++;
			$result['count']++;
		}

		return $result;
	}

	/**
	 * @return bool True when this object provides GetBigrammLetterFreq().
	 */
	public function HasBigrammInfo()
	{
		//Objects created without GetLanguage() have not been probed yet
		if ($this->_has_bigramm_info === null)
		{
			$this->_has_bigramm_info = is_callable([$this, 'getbigrammletterfreq']);
		}

		return $this->_has_bigramm_info;
	}

	/**
	 * Function returns model of the language.
	 *
	 * Converts the letter based table returned by GetBigrammLetterFreq()
	 * (letter => list of weights, in the order of the table keys) into a table
	 * keyed by scan code pairs. The result is memoized in the object.
	 *
	 * @return array Key 'count' holds the sum of all weights, keys
	 *               "<code1> <code2>" hold the weight of a pair. Only
	 *               array('count' => 1) when there is no bigram info.
	 */
	public function GetBigrammScancodeFreq()
	{
		if (!$this->HasBigrammInfo())
		{
			return ['count' => 1];
		}

		if (!isset($this->_lang_bigramm_cache))
		{
			$bigramms = $this->GetBigrammLetterFreq();
			$keyboard = $this->GetKeyboardLayout();
			$keyboard_lo = $keyboard['lo'];
			$keyboard_hi = $keyboard['hi'];

			//Key position of every letter, in the order of the table
			$positions = [];
			foreach ($bigramms as $letter => $unused)
			{
				$pos = mb_strpos($keyboard_lo, $letter);
				if ($pos === false)
				{
					$pos = mb_strpos($keyboard_hi, $letter);
				}
				$positions[] = $pos;
			}

			$result = ['count' => 0];
			$row_index = 0;
			foreach ($bigramms as $row)
			{
				$p1 = $positions[$row_index];
				$row_index++;
				foreach ($positions as $i => $p2)
				{
					$weight = $row[$i];
					$result['count'] += $weight;
					$result[$p1 . ' ' . $p2] = $weight;
				}
			}

			$this->_lang_bigramm_cache = $result;
		}

		return $this->_lang_bigramm_cache;
	}
}