full-text-rss/libraries/language-detect/Parser.php

355 lines
9.4 KiB
PHP
Raw Normal View History

2013-04-18 16:11:06 +02:00
<?php
/**
* This class represents a text sample to be parsed.
*
* @category Text
* @package Text_LanguageDetect
* @author Nicholas Pisarro
* @copyright 2006
* @license BSD
* @version CVS: $Id: Parser.php,v 1.5 2006/03/11 05:45:05 taak Exp $
* @link http://pear.php.net/package/Text_LanguageDetect/
* @link http://langdetect.blogspot.com/
*/
/**
* This class represents a text sample to be parsed.
*
* This separates the analysis of a text sample from the primary LanguageDetect
* class. After a new profile has been built, the data can be retrieved using
* the accessor functions.
*
* This class is intended to be used by the Text_LanguageDetect class, not
* end-users.
*
* @category Text
* @package Text_LanguageDetect
* @author Nicholas Pisarro
* @copyright 2006
* @license BSD
* @version release: 0.2.3
*/
class Text_LanguageDetect_Parser extends Text_LanguageDetect
{
    /**
     * the piece of text being parsed
     *
     * @access private
     * @var string
     */
    public $_string;

    /**
     * stores the trigram frequencies of the sample
     *
     * Keys are trigrams (three concatenated characters), values are the
     * number of times each one was seen by analyze().
     *
     * @access private
     * @var array
     */
    public $_trigram = array();

    /**
     * stores the trigram ranks of the sample
     *
     * @access private
     * @var array
     */
    public $_trigram_ranks = array();

    /**
     * stores the unicode blocks of the sample
     *
     * Keys are block names, values are character counts.
     *
     * @access private
     * @var array
     */
    public $_unicode_blocks = array();

    /**
     * Whether the parser should compile the unicode ranges
     *
     * @access private
     * @var bool
     */
    public $_compile_unicode = false;

    /**
     * Whether the parser should compile trigrams
     *
     * @access private
     * @var bool
     */
    public $_compile_trigram = false;

    /**
     * Whether the trigram parser should pad the beginning of the string
     *
     * @access private
     * @var bool
     */
    public $_trigram_pad_start = false;

    /**
     * Whether the unicode parser should skip non-alphabetical ascii chars
     *
     * @access private
     * @var bool
     */
    public $_unicode_skip_symbols = true;

    /**
     * Constructor
     *
     * Stores the sample and, when given, overrides the database filenames.
     * The $_db_filename and $_unicode_db_filename properties are not declared
     * in this class; presumably they are inherited from Text_LanguageDetect.
     *
     * @access private
     * @param string      $string     string to be parsed
     * @param string|null $db         optional language database filename
     * @param string|null $unicode_db optional unicode block database filename
     */
    public function __construct($string, $db = null, $unicode_db = null)
    {
        if (isset($db)) {
            $this->_db_filename = $db;
        }
        if (isset($unicode_db)) {
            $this->_unicode_db_filename = $unicode_db;
        }
        $this->_string = $string;
    }

    /**
     * Legacy PHP 4-style constructor name
     *
     * Since PHP 8.0 a method named after its class is no longer used as the
     * constructor, so the real work lives in __construct(). This method is
     * kept only so code that calls it by name keeps working. It is declared
     * after __construct() on purpose, so that PHP 5 does not report a
     * redefined constructor.
     *
     * @deprecated use __construct()
     * @access private
     * @param string      $string     string to be parsed
     * @param string|null $db         optional language database filename
     * @param string|null $unicode_db optional unicode block database filename
     */
    public function Text_LanguageDetect_Parser($string, $db = null, $unicode_db = null)
    {
        $this->__construct($string, $db, $unicode_db);
    }

    /**
     * Returns true if a string is suitable for parsing
     *
     * A string is acceptable when it is longer than 3 bytes and contains at
     * least one non-whitespace character.
     *
     * Declared static so that it can be called both as
     * Text_LanguageDetect_Parser::validateString() and on an instance.
     *
     * @static
     * @access public
     * @param string $str input string to test
     * @return bool true if acceptable, false if not
     */
    public static function validateString($str)
    {
        return !empty($str)
            && strlen($str) > 3
            && preg_match('/\S/', $str) === 1;
    }

    /**
     * turn on/off trigram counting
     *
     * @access public
     * @param bool $bool true for on, false for off
     */
    public function prepareTrigram($bool = true)
    {
        $this->_compile_trigram = $bool;
    }

    /**
     * turn on/off unicode block counting
     *
     * @access public
     * @param bool $bool true for on, false for off
     */
    public function prepareUnicode($bool = true)
    {
        $this->_compile_unicode = $bool;
    }

    /**
     * turn on/off padding the beginning of the sample string
     *
     * @access public
     * @param bool $bool true for on, false for off
     */
    public function setPadStart($bool = true)
    {
        $this->_trigram_pad_start = $bool;
    }

    /**
     * Should the unicode block counter skip non-alphabetical ascii chars?
     *
     * @access public
     * @param bool $bool true for on, false for off
     */
    public function setUnicodeSkipSymbols($bool = true)
    {
        $this->_unicode_skip_symbols = $bool;
    }

    /**
     * Returns the trigram ranks for the text sample
     *
     * @access public
     * @return array trigram ranks in the text sample
     */
    public function &getTrigramRanks()
    {
        return $this->_trigram_ranks;
    }

    /**
     * Return the trigram frequency table
     *
     * only used in testing to make sure the parser is working
     *
     * @access public
     * @return array trigram frequencies in the text sample
     */
    public function &getTrigramFreqs()
    {
        return $this->_trigram;
    }

    /**
     * returns the array of unicode blocks
     *
     * @access public
     * @return array unicode blocks in the text sample
     */
    public function &getUnicodeBlocks()
    {
        return $this->_unicode_blocks;
    }

    /**
     * Executes the parsing operation
     *
     * Be sure to call the set*() functions to set options and the
     * prepare*() functions first to tell it what kind of data to compute
     *
     * Afterwards the get*() functions can be used to access the compiled
     * information.
     *
     * Counts are added to whatever is already stored, so calling this more
     * than once on the same object accumulates.
     *
     * Relies on _next_char(), _read_unicode_block_db(), _unicode_block_name(),
     * _utf8char2unicode() and _arr_rank(), none of which are defined in this
     * class (presumably inherited from Text_LanguageDetect).
     *
     * @access public
     */
    public function analyze()
    {
        $len = strlen($this->_string);
        $byte_counter = 0;

        // unicode startup
        if ($this->_compile_unicode) {
            // plain assignment: the table is only read here, so no reference
            // binding is needed
            $blocks = $this->_read_unicode_block_db();
            $block_count = count($blocks);
            $unicode_chars = array();
        }

        // trigram startup
        if ($this->_compile_trigram) {
            // initialize them as blank so the parser will skip the first two
            // (since it skips trigrams with more than 2 contiguous spaces)
            $a = ' ';
            $b = ' ';

            // kludge
            // if it finds a valid trigram to start and the start pad option is
            // off, then set a variable that will be used to reduce this
            // trigram after parsing has finished
            if (!$this->_trigram_pad_start && $len > 0) {
                // _next_char() appears to advance $byte_counter itself (the
                // main loop below depends on that), so peek and then rewind
                $first = $this->_next_char($this->_string, $byte_counter, true);

                // only peek at a second character when one actually exists;
                // a sample made of a single multi-byte character would
                // otherwise read past the end of the string
                if ($first != ' ' && $byte_counter < $len) {
                    $second = $this->_next_char($this->_string, $byte_counter, true);
                    $dropone = " $first$second";
                }

                $byte_counter = 0;
            }
        }

        while ($byte_counter < $len) {
            $char = $this->_next_char($this->_string, $byte_counter, true);

            // language trigram detection
            if ($this->_compile_trigram) {
                // skip windows that hold two or more contiguous spaces
                if (!($b == ' ' && ($a == ' ' || $char == ' '))) {
                    $trigram = $a . $b . $char;
                    if (!isset($this->_trigram[$trigram])) {
                        $this->_trigram[$trigram] = 1;
                    } else {
                        $this->_trigram[$trigram]++;
                    }
                }

                // slide the window forward by one character
                $a = $b;
                $b = $char;
            }

            // unicode block detection
            if ($this->_compile_unicode) {
                if ($this->_unicode_skip_symbols
                    && strlen($char) == 1
                    && ($char < 'A' || $char > 'z'
                    || ($char > 'Z' && $char < 'a'))
                    && $char != "'") {  // does not skip the apostrophe
                                        // since it's included in the language
                                        // models
                    continue;
                }

                // build an array of all the characters
                if (isset($unicode_chars[$char])) {
                    $unicode_chars[$char]++;
                } else {
                    $unicode_chars[$char] = 1;
                }
            }

            // todo: add byte detection here
        }

        // unicode cleanup
        if ($this->_compile_unicode) {
            foreach ($unicode_chars as $utf8_char => $count) {
                $search_result = $this->_unicode_block_name(
                    $this->_utf8char2unicode($utf8_char),
                    $blocks,
                    $block_count
                );

                // -1 signals that no block was found; otherwise the block
                // name is read from index 2 of the result
                if ($search_result != -1) {
                    $block_name = $search_result[2];
                } else {
                    $block_name = '[Malformatted]';
                }

                if (isset($this->_unicode_blocks[$block_name])) {
                    $this->_unicode_blocks[$block_name] += $count;
                } else {
                    $this->_unicode_blocks[$block_name] = $count;
                }
            }
        }

        // trigram cleanup
        if ($this->_compile_trigram) {
            // pad the end
            if ($b != ' ') {
                if (!isset($this->_trigram["$a$b "])) {
                    $this->_trigram["$a$b "] = 1;
                } else {
                    $this->_trigram["$a$b "]++;
                }
            }

            // perl compatibility; Language::Guess does not pad the beginning
            // kludge
            // only touch the entry when it was really counted, so a missing
            // key is never created with a bogus value
            if (isset($dropone) && isset($this->_trigram[$dropone])) {
                if ($this->_trigram[$dropone] == 1) {
                    unset($this->_trigram[$dropone]);
                } else {
                    $this->_trigram[$dropone]--;
                }
            }

            if (!empty($this->_trigram)) {
                $this->_trigram_ranks = $this->_arr_rank($this->_trigram);
            } else {
                $this->_trigram_ranks = array();
            }
        }
    }
}
/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */
?>