Full-Text RSS 3.8
This commit is contained in:
parent
954e765b5a
commit
1ec2f36b3e
@ -2,6 +2,22 @@ FiveFilters.org: Full-Text RSS
|
||||
http://fivefilters.org/content-only/
|
||||
CHANGELOG
|
||||
------------------------------------
|
||||
3.8 (2017-09-25)
|
||||
- New site config directive: strip_attr: XPath attribute selector (e.g. //img/@srcset) - remove attribute from element
|
||||
- New site config directive: insert_detected_image: yes/no (default yes) - places the og:image image at the beginning of the body if no other images were extracted
|
||||
- Bug fix: Better handling of Internationalized Domain Names (IDNs)
|
||||
- Bug fix: Relative base URLs (<base>) now resolved against page URL
|
||||
- Bug fix: Wrong site config file chosen in certain cases (when wildcard and exact subdomain files available and cached in APCu)
|
||||
- Bug fix: &apos; HTML entities not converted correctly when parsing with Gumbo PHP
|
||||
- Remove srcset (+ sizes) attributes on img elements if it looks like they only contain relative URLs (browser will use src attribute value instead)
|
||||
- https:// URLs now re-written to sec:// before being submitted to avoid overzealous security software blocking request on some servers - no redirect, only affects newly submitted URLs on index.php
|
||||
- HTML5-PHP library updated
|
||||
- Language Detect library updated
|
||||
- Site config files updated for better extraction
|
||||
- Minimum PHP version is now 5.4. If you must use PHP 5.3, please stick with Full-Text RSS 3.7
|
||||
- Tested with PHP 7.2
|
||||
- Other fixes/improvements
|
||||
|
||||
3.7 (2017-02-12)
|
||||
- Request HTML5 output using HTML5-PHP - new config option $options->html5_output and new request parameter &content=html5
|
||||
- Improve support for lazy-loading images
|
||||
|
13
config.php
13
config.php
@ -61,16 +61,15 @@ $options->content = 'user';
|
||||
|
||||
// HTML5 output
|
||||
// ----------------------
|
||||
// By default, Full-Text RSS uses libxml to convert the parsed DOM tree back into HTML.
|
||||
// If this is enabled, we'll use HTML5-PHP to produce the HTML. This will be a little
|
||||
// slower, but might produce better results, adhering to the HTML5 spec.
|
||||
//
|
||||
// Note: in a future release we might make HTML5 output the default.
|
||||
// Full-Text RSS used to rely on libxml to output HTML extracted from
|
||||
// a web page. Since version 3.8 we use HTML5-PHP by default.
|
||||
// If you prefer the old output, either set this to false or pass &content=1
|
||||
// in the querystring.
|
||||
//
|
||||
// Possible values...
|
||||
// HTML5 (slower): true
|
||||
// libxml (faster): false
|
||||
// libxml unless user overrides (&content=html5): 'user' (default)
|
||||
// HTML5 unless user overrides (&content=1): 'user' (default)
|
||||
$options->html5_output = 'user';
|
||||
|
||||
// Excerpts
|
||||
@ -524,7 +523,7 @@ $options->cache_cleanup = 100;
|
||||
/// DO NOT CHANGE ANYTHING BELOW THIS ///////////
|
||||
/////////////////////////////////////////////////
|
||||
|
||||
if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '3.7');
|
||||
if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '3.8');
|
||||
|
||||
if (basename(__FILE__) == 'config.php') {
|
||||
if (file_exists(dirname(__FILE__).'/custom_config.php')) {
|
||||
|
@ -16,12 +16,12 @@ SimplePie.org. We have kept most of their checks intact as we use SimplePie in o
|
||||
http://github.com/simplepie/simplepie/tree/master/compatibility_test/
|
||||
*/
|
||||
|
||||
$app_name = 'Full-Text RSS 3.7';
|
||||
$app_name = 'Full-Text RSS 3.8';
|
||||
|
||||
// Full-Text RSS is not yet compatible with HHVM, that's why we check for it with HHVM_VERSION.
|
||||
//$php_ok = (function_exists('version_compare') && version_compare(phpversion(), '5.2.0', '>=') && !defined('HHVM_VERSION'));
|
||||
// HHVM works okay, but no Tidy and autoupdate of site config files not working (tested 3.7.1)
|
||||
$php_ok = (function_exists('version_compare') && version_compare(phpversion(), '5.3.0', '>='));
|
||||
$php_ok = (function_exists('version_compare') && version_compare(phpversion(), '5.4.0', '>='));
|
||||
$pcre_ok = extension_loaded('pcre');
|
||||
$zlib_ok = extension_loaded('zlib');
|
||||
$mbstring_ok = extension_loaded('mbstring');
|
||||
@ -32,6 +32,7 @@ $parallel_ok = ((extension_loaded('http') && class_exists('http\Client\Request')
|
||||
$allow_url_fopen_ok = (bool)ini_get('allow_url_fopen');
|
||||
$filter_ok = extension_loaded('filter');
|
||||
$gumbo_ok = class_exists('Layershifter\Gumbo\Parser');
|
||||
$idn_ok = function_exists('idn_to_ascii');
|
||||
|
||||
if (extension_loaded('xmlreader')) {
|
||||
$xml_ok = true;
|
||||
@ -204,7 +205,7 @@ div.chunk {
|
||||
<tbody>
|
||||
<tr class="<?php echo ($php_ok) ? 'enabled' : 'disabled'; ?>">
|
||||
<td>PHP</td>
|
||||
<td>5.3 or higher</td>
|
||||
<td>5.4 or higher</td>
|
||||
<td><?php echo phpversion(); ?></td>
|
||||
</tr>
|
||||
<tr class="<?php echo ($xml_ok) ? 'enabled, and sane' : 'disabled, or broken'; ?>">
|
||||
@ -354,6 +355,11 @@ div.chunk {
|
||||
|
||||
<div class="chunk">
|
||||
<h3>Further info</h3>
|
||||
|
||||
<h4>IDN support</h4>
|
||||
<p>When treating an <a href="https://en.wikipedia.org/wiki/Internationalized_domain_name">internationalized domain name (IDN)</a> Full-Text RSS will try to make use of PHP's <code>idn_to_ascii</code> function to convert the domain to ASCII. If this function does not exist, you might have trouble retrieving article content from internationalized domains.</p>
|
||||
<p class="highlight"><strong>idn_to_ascii</strong> is <?php if (!$idn_ok) echo '<strong>not</strong>'; ?> available on this server.</p>
|
||||
|
||||
<h4>HTTP module</h4>
|
||||
<p>Full-Text RSS can make use of PHP's HTTP extension or <code>curl_multi</code> to make parallel HTTP requests when processing feeds. If neither are available, it will make sequential requests using <code>file_get_contents</code>.</p>
|
||||
<?php
|
||||
|
@ -25,6 +25,7 @@ if (!defined('_FF_FTR_INDEX')) {
|
||||
// remove http scheme from urls before submitting
|
||||
$('#form').submit(function() {
|
||||
$('#url').val($('#url').val().replace(/^http:\/\//i, ''));
|
||||
$('#url').val($('#url').val().replace(/^https:\/\//i, 'sec://'));
|
||||
return true;
|
||||
});
|
||||
// popovers
|
||||
@ -271,8 +272,8 @@ if (!defined('_FF_FTR_INDEX')) {
|
||||
|
||||
<tr>
|
||||
<td>content</td>
|
||||
<td><tt>0</tt>, <tt>1</tt> (default), <tt>html5</tt></td>
|
||||
<td>If set to 0, the extracted content will not be included in the output. If set to html5, we'll output HTML5.</td>
|
||||
<td><tt>0</tt>, <tt>1</tt>, <tt>html5</tt> (default)</td>
|
||||
<td>If set to 0, the extracted content will not be included in the output. If set to 1, we'll use regular libxml output - might not be HTML5 compliant.</td>
|
||||
</tr>
|
||||
|
||||
<tr>
|
||||
|
@ -5,8 +5,8 @@
|
||||
* Uses patterns specified in site config files and auto detection (hNews/PHP Readability)
|
||||
* to extract content from HTML files.
|
||||
*
|
||||
* @version 1.3
|
||||
* @date 2017-02-12
|
||||
* @version 1.4
|
||||
* @date 2017-09-25
|
||||
* @author Keyvan Minoukadeh
|
||||
* @copyright 2017 Keyvan Minoukadeh
|
||||
* @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
|
||||
@ -107,24 +107,13 @@ class ContentExtractor
|
||||
}
|
||||
|
||||
// returns SiteConfig instance (joined in order: exact match, wildcard, fingerprint, global, default)
|
||||
public function buildSiteConfig($url, $html='', $add_to_cache=true) {
|
||||
public function buildSiteConfig($url, $html='') {
|
||||
// extract host name
|
||||
$host = @parse_url($url, PHP_URL_HOST);
|
||||
$host = strtolower($host);
|
||||
if (substr($host, 0, 4) == 'www.') $host = substr($host, 4);
|
||||
// is merged version already cached?
|
||||
if (SiteConfig::is_cached("$host.merged")) {
|
||||
$config = SiteConfig::build("$host.merged");
|
||||
if ($config) {
|
||||
$this->debug("Returning cached and merged site config for $host");
|
||||
return $config;
|
||||
}
|
||||
}
|
||||
// let's build from site_config/custom/ and standard/
|
||||
$config = SiteConfig::build($host);
|
||||
if ($add_to_cache && $config && !SiteConfig::is_cached("$host")) {
|
||||
SiteConfig::add_to_cache($host, $config);
|
||||
}
|
||||
// if no match, use defaults
|
||||
if (!$config) $config = new SiteConfig();
|
||||
// load fingerprint config?
|
||||
@ -134,10 +123,6 @@ class ContentExtractor
|
||||
if ($config_fingerprint = SiteConfig::build($_fphost)) {
|
||||
$this->debug("Appending site config settings from $_fphost (fingerprint match)");
|
||||
$config->append($config_fingerprint);
|
||||
if ($add_to_cache && !SiteConfig::is_cached($_fphost)) {
|
||||
//$config_fingerprint->cache_in_apc = true;
|
||||
SiteConfig::add_to_cache($_fphost, $config_fingerprint);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -146,19 +131,8 @@ class ContentExtractor
|
||||
if ($config_global = SiteConfig::build('global', true)) {
|
||||
$this->debug('Appending site config settings from global.txt');
|
||||
$config->append($config_global);
|
||||
if ($add_to_cache && !SiteConfig::is_cached('global')) {
|
||||
//$config_global->cache_in_apc = true;
|
||||
SiteConfig::add_to_cache('global', $config_global);
|
||||
}
|
||||
}
|
||||
}
|
||||
// store copy of merged config
|
||||
if ($add_to_cache) {
|
||||
// do not store in APC if wildcard match
|
||||
$use_apc = ($host == $config->cache_key);
|
||||
$config->cache_key = null;
|
||||
SiteConfig::add_to_cache("$host.merged", $config, $use_apc);
|
||||
}
|
||||
return $config;
|
||||
}
|
||||
|
||||
@ -398,14 +372,18 @@ class ContentExtractor
|
||||
$elems = @$xpath->query($pattern, $this->readability->dom);
|
||||
// check for matches
|
||||
if ($elems && $elems->length > 0) {
|
||||
$this->debug('Stripping '.$elems->length.' elements (strip)');
|
||||
$this->debug('Stripping '.$elems->length.' elements (strip: '.$pattern.')');
|
||||
for ($i=$elems->length-1; $i >= 0; $i--) {
|
||||
if ($elems->item($i)->parentNode) {
|
||||
if ($elems->item($i) instanceof DOMAttr) {
|
||||
$elems->item($i)->parentNode->removeAttributeNode($elems->item($i));
|
||||
} else {
|
||||
$elems->item($i)->parentNode->removeChild($elems->item($i));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// strip elements (using id and class attribute values)
|
||||
foreach ($this->config->strip_id_or_class as $string) {
|
||||
@ -413,7 +391,7 @@ class ContentExtractor
|
||||
$elems = @$xpath->query("//*[contains(@class, '$string') or contains(@id, '$string')]", $this->readability->dom);
|
||||
// check for matches
|
||||
if ($elems && $elems->length > 0) {
|
||||
$this->debug('Stripping '.$elems->length.' elements (strip_id_or_class)');
|
||||
$this->debug('Stripping '.$elems->length.' elements (strip_id_or_class: '.$string.')');
|
||||
for ($i=$elems->length-1; $i >= 0; $i--) {
|
||||
$elems->item($i)->parentNode->removeChild($elems->item($i));
|
||||
}
|
||||
@ -426,12 +404,13 @@ class ContentExtractor
|
||||
$elems = @$xpath->query("//img[contains(@src, '$string')]", $this->readability->dom);
|
||||
// check for matches
|
||||
if ($elems && $elems->length > 0) {
|
||||
$this->debug('Stripping '.$elems->length.' image elements');
|
||||
$this->debug('Stripping '.$elems->length.' elements (strip_image_src: '.$string.')');
|
||||
for ($i=$elems->length-1; $i >= 0; $i--) {
|
||||
$elems->item($i)->parentNode->removeChild($elems->item($i));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// strip elements using Readability.com and Instapaper.com ignore class names
|
||||
// .entry-unrelated and .instapaper_ignore
|
||||
// See https://www.readability.com/publishers/guidelines/#view-plainGuidelines
|
||||
@ -465,6 +444,21 @@ class ContentExtractor
|
||||
}
|
||||
}
|
||||
|
||||
// strip img srcset/sizes attributes with relative URIs (src should be present and will be absolutised)
|
||||
// TODO: absolutize srcset values rather than removing them
|
||||
// To remove srcset from all image elements, site config files can contain: strip: //img/@srcset
|
||||
$elems = $xpath->query("//img[@srcset and not(contains(@srcset, '//'))]", $this->readability->dom);
|
||||
// check for matches
|
||||
if ($elems && $elems->length > 0) {
|
||||
$this->debug('Stripping '.$elems->length.' srcset attributes');
|
||||
foreach ($elems as $elem) {
|
||||
$elem->removeAttribute('srcset');
|
||||
if ($elem->hasAttribute('sizes')) {
|
||||
$elem->removeAttribute('sizes');
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// try to get body
|
||||
foreach ($this->config->body as $pattern) {
|
||||
$elems = @$xpath->query($pattern, $this->readability->dom);
|
||||
@ -880,7 +874,7 @@ class ContentExtractor
|
||||
}
|
||||
} else {
|
||||
// If there's an og:image, but we have no images in the article, let's place it at the beginning of the article.
|
||||
if ($this->body->hasChildNodes() && isset($this->opengraph['og:image']) && substr($this->opengraph['og:image'], 0, 4) === 'http') {
|
||||
if ($this->config->insert_detected_image() && $this->body->hasChildNodes() && isset($this->opengraph['og:image']) && substr($this->opengraph['og:image'], 0, 4) === 'http') {
|
||||
$elems = @$xpath->query(".//img", $this->body);
|
||||
if ($elems->length === 0) {
|
||||
$_new_elem = $this->body->ownerDocument->createDocumentFragment();
|
||||
|
@ -5,10 +5,10 @@
|
||||
* Each instance of this class should hold extraction patterns and other directives
|
||||
* for a website. See ContentExtractor class to see how it's used.
|
||||
*
|
||||
* @version 1.0
|
||||
* @date 2015-06-09
|
||||
* @version 1.1
|
||||
* @date 2017-09-25
|
||||
* @author Keyvan Minoukadeh
|
||||
* @copyright 2015 Keyvan Minoukadeh
|
||||
* @copyright 2017 Keyvan Minoukadeh
|
||||
* @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
|
||||
*/
|
||||
|
||||
@ -43,7 +43,6 @@ class SiteConfig
|
||||
|
||||
// Process HTML with tidy before creating DOM (bool or null if undeclared)
|
||||
public $tidy = null;
|
||||
|
||||
protected $default_tidy = true; // used if undeclared
|
||||
|
||||
// Autodetect title/body if xpath expressions fail to produce results.
|
||||
@ -93,6 +92,12 @@ class SiteConfig
|
||||
public $parser = null;
|
||||
protected $default_parser = 'libxml'; // used if undeclared
|
||||
|
||||
// Insert detected image (currently only og:image) into beginning of extracted article
|
||||
// Only does this if extracted article contains no images
|
||||
// bool or null if undeclared
|
||||
public $insert_detected_image = null;
|
||||
protected $default_insert_detected_image = true; // used if undeclared
|
||||
|
||||
// Strings to search for in HTML before processing begins (used with $replace_string)
|
||||
public $find_string = array();
|
||||
// Strings to replace those found in $find_string before HTML processing begins
|
||||
@ -101,10 +106,9 @@ class SiteConfig
|
||||
// the options below cannot be set in the config files which this class represents
|
||||
|
||||
//public $cache_in_apc = false; // used to decide if we should cache in apc or not
|
||||
public $cache_key = null;
|
||||
public static $debug = false;
|
||||
protected static $apc = false;
|
||||
protected static $config_path;
|
||||
protected static $config_path_custom;
|
||||
protected static $config_path_fallback;
|
||||
protected static $config_cache = array();
|
||||
const HOSTNAME_REGEX = '/^(([a-zA-Z0-9-]*[a-zA-Z0-9])\.)*([A-Za-z0-9-]*[A-Za-z0-9])$/';
|
||||
@ -137,6 +141,12 @@ class SiteConfig
|
||||
return $apc;
|
||||
}
|
||||
|
||||
// return bool or null
|
||||
public function insert_detected_image($use_default=true) {
|
||||
if ($use_default) return (isset($this->insert_detected_image)) ? $this->insert_detected_image : $this->default_insert_detected_image;
|
||||
return $this->insert_detected_image;
|
||||
}
|
||||
|
||||
// return bool or null
|
||||
public function tidy($use_default=true) {
|
||||
if ($use_default) return (isset($this->tidy)) ? $this->tidy : $this->default_tidy;
|
||||
@ -162,15 +172,32 @@ class SiteConfig
|
||||
}
|
||||
|
||||
public static function set_config_path($path, $fallback=null) {
|
||||
self::$config_path = $path;
|
||||
self::$config_path_custom = $path;
|
||||
self::$config_path_fallback = $fallback;
|
||||
}
|
||||
|
||||
protected static function load_cached_merged($host, $exact_host_match) {
|
||||
if ($exact_host_match) {
|
||||
$key = $host.'.merged.ex';
|
||||
} else {
|
||||
$key = $host.'.merged';
|
||||
}
|
||||
return self::load_cached($key);
|
||||
}
|
||||
|
||||
protected static function add_to_cache_merged($host, $exact_host_match, SiteConfig $config=null) {
|
||||
if ($exact_host_match) {
|
||||
$key = $host.'.merged.ex';
|
||||
} else {
|
||||
$key = $host.'.merged';
|
||||
}
|
||||
if (!isset($config)) $config = new SiteConfig();
|
||||
self::add_to_cache($key, $config);
|
||||
}
|
||||
|
||||
public static function add_to_cache($key, SiteConfig $config, $use_apc=true) {
|
||||
$key = strtolower($key);
|
||||
if (substr($key, 0, 4) == 'www.') $key = substr($key, 4);
|
||||
if ($config->cache_key) $key = $config->cache_key;
|
||||
$key .= '.'.self::get_key_suffix();
|
||||
self::$config_cache[$key] = $config;
|
||||
if (self::$apc && $use_apc) {
|
||||
self::debug("Adding site config to APC cache with key sc.$key");
|
||||
@ -179,9 +206,22 @@ class SiteConfig
|
||||
self::debug("Cached site config with key $key");
|
||||
}
|
||||
|
||||
public static function load_cached($key) {
|
||||
$key = strtolower($key);
|
||||
if (substr($key, 0, 4) == 'www.') $key = substr($key, 4);
|
||||
//var_dump('in cache?', $key, self::$config_cache);
|
||||
if (array_key_exists($key, self::$config_cache)) {
|
||||
self::debug("... site config for $key already loaded in this request");
|
||||
return self::$config_cache[$key];
|
||||
} elseif (self::$apc && ($sconfig = apc_fetch("sc.$key"))) {
|
||||
self::debug("... site config for $key found in APCu");
|
||||
return $sconfig;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public static function is_cached($key) {
|
||||
$key = strtolower($key);
|
||||
$key .= '.'.self::get_key_suffix();
|
||||
if (substr($key, 0, 4) == 'www.') $key = substr($key, 4);
|
||||
if (array_key_exists($key, self::$config_cache)) {
|
||||
return true;
|
||||
@ -212,7 +252,7 @@ class SiteConfig
|
||||
}
|
||||
// check for single statement commands
|
||||
// we do not overwrite existing non null values
|
||||
foreach (array('tidy', 'prune', 'parser', 'autodetect_on_failure') as $var) {
|
||||
foreach (array('tidy', 'prune', 'parser', 'autodetect_on_failure', 'insert_detected_image') as $var) {
|
||||
if ($this->$var === null) $this->$var = $newconfig->$var;
|
||||
}
|
||||
// treat find_string and replace_string separately (don't apply array_unique) (thanks fabrizio!)
|
||||
@ -223,16 +263,6 @@ class SiteConfig
|
||||
}
|
||||
}
|
||||
|
||||
// This is used to make sure that when a different primary folder is chosen
|
||||
// The key for the cached result includes that folder choice.
|
||||
// Otherwise, a subsequent request choosing a different folder
|
||||
// could return the wrong cached config.
|
||||
public static function get_key_suffix() {
|
||||
$key_suffix = basename(self::$config_path);
|
||||
if ($key_suffix === 'custom') $key_suffix = '';
|
||||
return $key_suffix;
|
||||
}
|
||||
|
||||
// Add test_contains to last test_url
|
||||
public function add_test_contains($test_contains) {
|
||||
if (!empty($this->test_url)) {
|
||||
@ -274,6 +304,12 @@ class SiteConfig
|
||||
$host = strtolower($host);
|
||||
if (substr($host, 0, 4) == 'www.') $host = substr($host, 4);
|
||||
if (!$host || (strlen($host) > 200) || !preg_match(self::HOSTNAME_REGEX, ltrim($host, '.'))) return false;
|
||||
// got a merged one?
|
||||
$config = self::load_cached_merged($host, $exact_host_match);
|
||||
if ($config) {
|
||||
//self::debug('. returned merged config from a previous request');
|
||||
return $config;
|
||||
}
|
||||
// check for site configuration
|
||||
$try = array($host);
|
||||
// should we look for wildcard matches
|
||||
@ -285,99 +321,84 @@ class SiteConfig
|
||||
}
|
||||
}
|
||||
|
||||
// Which primary folder should we look inside?
|
||||
// If it's not the default ('custom'), we need
|
||||
// a key suffix to distinguish site config fules
|
||||
// held in this folder from those in other folders.
|
||||
$key_suffix = self::get_key_suffix();
|
||||
|
||||
// look for site config file in primary folder
|
||||
self::debug(". looking for site config for $host in primary folder");
|
||||
// look for site config file in custom folder
|
||||
self::debug(". looking for site config for $host in custom folder");
|
||||
//var_dump($try);
|
||||
$config = null;
|
||||
$config_std = null;
|
||||
foreach ($try as $h) {
|
||||
$h_key = "$h.$key_suffix";
|
||||
if (array_key_exists($h_key, self::$config_cache)) {
|
||||
self::debug("... site config for $h already loaded in this request");
|
||||
return self::$config_cache[$h_key];
|
||||
} elseif (self::$apc && ($sconfig = apc_fetch("sc.$h_key"))) {
|
||||
self::debug("... site config for $h in APC cache");
|
||||
return $sconfig;
|
||||
} elseif (file_exists(self::$config_path."/$h.txt")) {
|
||||
//$h_key = $h.'.'.$key_suffix;
|
||||
$h_key = $h.'.custom';
|
||||
//var_dump($h_key, $h);
|
||||
if ($config = self::load_cached($h_key)) {
|
||||
break;
|
||||
} elseif (file_exists(self::$config_path_custom."/$h.txt")) {
|
||||
self::debug("... found site config ($h.txt)");
|
||||
$file_primary = self::$config_path."/$h.txt";
|
||||
$matched_name = $h;
|
||||
$file_custom = self::$config_path_custom."/$h.txt";
|
||||
$config = self::build_from_file($file_custom);
|
||||
//$matched_name = $h;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// if we found site config, process it
|
||||
if (isset($file_primary)) {
|
||||
$config_lines = file($file_primary, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
|
||||
if (!$config_lines || !is_array($config_lines)) return false;
|
||||
$config = self::build_from_array($config_lines);
|
||||
// if APC caching is available and enabled, mark this for cache
|
||||
//$config->cache_in_apc = true;
|
||||
$config->cache_key = $matched_name;
|
||||
|
||||
// if autodetect on failure is off (on by default) we do not need to look
|
||||
// in secondary folder
|
||||
if (!$config->autodetect_on_failure()) {
|
||||
if ($config && !$config->autodetect_on_failure()) {
|
||||
self::debug('... autodetect on failure is disabled (no other site config files will be loaded)');
|
||||
self::add_to_cache_merged($host, $exact_host_match, $config);
|
||||
return $config;
|
||||
}
|
||||
}
|
||||
|
||||
// look for site config file in secondary folder
|
||||
if (isset(self::$config_path_fallback)) {
|
||||
self::debug(". looking for site config for $host in secondary folder");
|
||||
self::debug(". looking for site config for $host in standard folder");
|
||||
foreach ($try as $h) {
|
||||
if (file_exists(self::$config_path_fallback."/$h.txt")) {
|
||||
self::debug("... found site config in secondary folder ($h.txt)");
|
||||
if ($config_std = self::load_cached($h)) {
|
||||
break;
|
||||
} elseif (file_exists(self::$config_path_fallback."/$h.txt")) {
|
||||
self::debug("... found site config in standard folder ($h.txt)");
|
||||
$file_secondary = self::$config_path_fallback."/$h.txt";
|
||||
$matched_name = $h;
|
||||
$config_std = self::build_from_file($file_secondary);
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!isset($file_secondary)) {
|
||||
self::debug("... no site config match in secondary folder");
|
||||
}
|
||||
}
|
||||
|
||||
// return false if no config file found
|
||||
if (!isset($file_primary) && !isset($file_secondary)) {
|
||||
if (!$config && !$config_std) {
|
||||
self::debug("... no site config match for $host");
|
||||
self::add_to_cache_merged($host, $exact_host_match);
|
||||
return false;
|
||||
}
|
||||
|
||||
// return primary config if secondary not found
|
||||
if (!isset($file_secondary) && isset($config)) {
|
||||
return $config;
|
||||
}
|
||||
|
||||
// process secondary config file
|
||||
$config_lines = file($file_secondary, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
|
||||
if (!$config_lines || !is_array($config_lines)) {
|
||||
// failed to process secondary
|
||||
if (isset($config)) {
|
||||
// return primary config
|
||||
return $config;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// merge with primary and return
|
||||
if (isset($config)) {
|
||||
// final config handling
|
||||
$config_final = null;
|
||||
if (!$config_std && $config) {
|
||||
$config_final = $config;
|
||||
// merge with primary
|
||||
} elseif ($config_std && $config) {
|
||||
self::debug('. merging config files');
|
||||
$config->append(self::build_from_array($config_lines));
|
||||
return $config;
|
||||
$config->append($config_std);
|
||||
$config_final = $config;
|
||||
} else {
|
||||
// return just secondary
|
||||
$config = self::build_from_array($config_lines);
|
||||
//$config = self::build_from_array($config_lines);
|
||||
// if APC caching is available and enabled, mark this for cache
|
||||
//$config->cache_in_apc = true;
|
||||
$config->cache_key = $matched_name;
|
||||
return $config;
|
||||
$config_final = $config_std;
|
||||
}
|
||||
self::add_to_cache_merged($host, $exact_host_match, $config_final);
|
||||
return $config_final;
|
||||
}
|
||||
|
||||
public static function build_from_file($path, $cache=true) {
|
||||
$key = basename($path, '.txt');
|
||||
$config_lines = file($path, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
|
||||
if (!$config_lines || !is_array($config_lines)) return false;
|
||||
$config = self::build_from_array($config_lines);
|
||||
if ($cache) self::add_to_cache($key, $config);
|
||||
return $config;
|
||||
}
|
||||
|
||||
public static function build_from_string($string) {
|
||||
@ -399,13 +420,23 @@ class SiteConfig
|
||||
if (count($command) != 2) continue;
|
||||
$val = trim($command[1]);
|
||||
$command = trim($command[0]);
|
||||
if ($command == '' || $val == '') continue;
|
||||
//if ($command == '' || $val == '') continue;
|
||||
// $val can be empty, e.g. replace_string:
|
||||
if ($command == '') continue;
|
||||
|
||||
// strip_attr is now an alias for strip.
|
||||
// In FTR 3.8 we can strip attributes from elements, not only the elements themselves
|
||||
// e.g. strip: //img/@srcset (removes srcset attribute from all img elements)
|
||||
// but for backward compatibility (to avoid errors with new config files + old version of FTR)
|
||||
// we've introduced strip_attr and we'll recommend using that in our public site config repository.
|
||||
// strip_attr: //img/@srcset
|
||||
if ($command == 'strip_attr') $command = 'strip';
|
||||
|
||||
// check for commands where we accept multiple statements
|
||||
if (in_array($command, array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'native_ad_clue', 'http_header', 'test_url', 'find_string', 'replace_string'))) {
|
||||
array_push($config->$command, $val);
|
||||
// check for single statement commands that evaluate to true or false
|
||||
} elseif (in_array($command, array('tidy', 'prune', 'autodetect_on_failure'))) {
|
||||
} elseif (in_array($command, array('tidy', 'prune', 'autodetect_on_failure', 'insert_detected_image'))) {
|
||||
$config->$command = ($val == 'yes');
|
||||
// check for single statement commands stored as strings
|
||||
} elseif (in_array($command, array('parser'))) {
|
||||
|
@ -186,5 +186,4 @@
|
||||
$this->setElement('enclosure','',$attributes);
|
||||
}
|
||||
|
||||
} // end of class FeedItem
|
||||
?>
|
||||
}
|
@ -1,4 +1,6 @@
|
||||
<?php
|
||||
define('ATOM', -1); // unused
|
||||
define('RSS1', 0); // unused
|
||||
define('RSS2', 1);
|
||||
define('JSON', 2);
|
||||
define('JSONP', 3);
|
||||
|
File diff suppressed because one or more lines are too long
@ -274,7 +274,8 @@ class DOMTreeBuilder implements EventHandler
|
||||
|
||||
// SPECIAL TAG HANDLING:
|
||||
// Spec says do this, and "don't ask."
|
||||
if ($name == 'image') {
|
||||
// find the spec where this is defined... looks problematic
|
||||
if ($name == 'image' && !($this->insertMode === static::IM_IN_SVG || $this->insertMode === static::IM_IN_MATHML)) {
|
||||
$name = 'img';
|
||||
}
|
||||
|
||||
|
@ -83,11 +83,8 @@ class Tokenizer
|
||||
*/
|
||||
public function parse()
|
||||
{
|
||||
$p = 0;
|
||||
do {
|
||||
$p = $this->scanner->position();
|
||||
$this->consumeData();
|
||||
|
||||
// FIXME: Add infinite loop protection.
|
||||
} while ($this->carryOn);
|
||||
}
|
||||
@ -145,7 +142,8 @@ class Tokenizer
|
||||
*/
|
||||
protected function characterData()
|
||||
{
|
||||
if ($this->scanner->current() === false) {
|
||||
$tok = $this->scanner->current();
|
||||
if ($tok === false) {
|
||||
return false;
|
||||
}
|
||||
switch ($this->textMode) {
|
||||
@ -154,7 +152,6 @@ class Tokenizer
|
||||
case Elements::TEXT_RCDATA:
|
||||
return $this->rcdata();
|
||||
default:
|
||||
$tok = $this->scanner->current();
|
||||
if (strspn($tok, "<&")) {
|
||||
return false;
|
||||
}
|
||||
@ -408,24 +405,26 @@ class Tokenizer
|
||||
if ($tok == '/') {
|
||||
$this->scanner->next();
|
||||
$this->scanner->whitespace();
|
||||
if ($this->scanner->current() == '>') {
|
||||
$tok = $this->scanner->current();
|
||||
|
||||
if ($tok == '>') {
|
||||
$selfClose = true;
|
||||
return true;
|
||||
}
|
||||
if ($this->scanner->current() === false) {
|
||||
if ($tok === false) {
|
||||
$this->parseError("Unexpected EOF inside of tag.");
|
||||
return true;
|
||||
}
|
||||
// Basically, we skip the / token and go on.
|
||||
// See 8.2.4.43.
|
||||
$this->parseError("Unexpected '%s' inside of a tag.", $this->scanner->current());
|
||||
$this->parseError("Unexpected '%s' inside of a tag.", $tok);
|
||||
return false;
|
||||
}
|
||||
|
||||
if ($this->scanner->current() == '>') {
|
||||
if ($tok == '>') {
|
||||
return true;
|
||||
}
|
||||
if ($this->scanner->current() === false) {
|
||||
if ($tok === false) {
|
||||
$this->parseError("Unexpected EOF inside of tag.");
|
||||
return true;
|
||||
}
|
||||
@ -541,15 +540,21 @@ class Tokenizer
|
||||
{
|
||||
$stoplist = "\f" . $quote;
|
||||
$val = '';
|
||||
$tok = $this->scanner->current();
|
||||
while (strspn($tok, $stoplist) == 0 && $tok !== false) {
|
||||
if ($tok == '&') {
|
||||
$val .= $this->decodeCharacterReference(true);
|
||||
$tok = $this->scanner->current();
|
||||
|
||||
while (true) {
|
||||
$tokens = $this->scanner->charsUntil($stoplist.'&');
|
||||
if ($tokens !== false) {
|
||||
$val .= $tokens;
|
||||
} else {
|
||||
$val .= $tok;
|
||||
$tok = $this->scanner->next();
|
||||
break;
|
||||
}
|
||||
|
||||
$tok = $this->scanner->current();
|
||||
if ($tok == '&') {
|
||||
$val .= $this->decodeCharacterReference(true, $tok);
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
$this->scanner->next();
|
||||
return $val;
|
||||
@ -591,18 +596,18 @@ class Tokenizer
|
||||
*/
|
||||
protected function bogusComment($leading = '')
|
||||
{
|
||||
|
||||
// TODO: This can be done more efficiently when the
|
||||
// scanner exposes a readUntil() method.
|
||||
$comment = $leading;
|
||||
$tokens = $this->scanner->charsUntil('>');
|
||||
if ($tokens !== false) {
|
||||
$comment .= $tokens;
|
||||
}
|
||||
$tok = $this->scanner->current();
|
||||
do {
|
||||
if ($tok !== false) {
|
||||
$comment .= $tok;
|
||||
$tok = $this->scanner->next();
|
||||
} while ($tok !== false && $tok != '>');
|
||||
}
|
||||
|
||||
$this->flushBuffer();
|
||||
$this->events->comment($comment . $tok);
|
||||
$this->events->comment($comment);
|
||||
$this->scanner->next();
|
||||
|
||||
return true;
|
||||
@ -646,15 +651,17 @@ class Tokenizer
|
||||
*/
|
||||
protected function isCommentEnd()
|
||||
{
|
||||
$tok = $this->scanner->current();
|
||||
|
||||
// EOF
|
||||
if ($this->scanner->current() === false) {
|
||||
if ($tok === false) {
|
||||
// Hit the end.
|
||||
$this->parseError("Unexpected EOF in a comment.");
|
||||
return true;
|
||||
}
|
||||
|
||||
// If it doesn't start with -, not the end.
|
||||
if ($this->scanner->current() != '-') {
|
||||
if ($tok != '-') {
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -737,7 +744,6 @@ class Tokenizer
|
||||
|
||||
$pub = strtoupper($this->scanner->getAsciiAlpha());
|
||||
$white = strlen($this->scanner->whitespace());
|
||||
$tok = $this->scanner->current();
|
||||
|
||||
// Get ID, and flag it as pub or system.
|
||||
if (($pub == 'PUBLIC' || $pub == 'SYSTEM') && $white > 0) {
|
||||
@ -938,10 +944,11 @@ class Tokenizer
|
||||
$len = strlen($sequence);
|
||||
$buffer = '';
|
||||
for ($i = 0; $i < $len; ++ $i) {
|
||||
$buffer .= $this->scanner->current();
|
||||
$tok = $this->scanner->current();
|
||||
$buffer .= $tok;
|
||||
|
||||
// EOF. Rewind and let the caller handle it.
|
||||
if ($this->scanner->current() === false) {
|
||||
if ($tok === false) {
|
||||
$this->scanner->unconsume($i);
|
||||
return false;
|
||||
}
|
||||
@ -1067,18 +1074,22 @@ class Tokenizer
|
||||
}
|
||||
$entity = CharacterReference::lookupDecimal($numeric);
|
||||
}
|
||||
} // String entity.
|
||||
else {
|
||||
} elseif ($tok === '=' && $inAttribute) {
|
||||
return '&';
|
||||
} else { // String entity.
|
||||
|
||||
// Attempt to consume a string up to a ';'.
|
||||
// [a-zA-Z0-9]+;
|
||||
$cname = $this->scanner->getAsciiAlpha();
|
||||
$cname = $this->scanner->getAsciiAlphaNum();
|
||||
$entity = CharacterReference::lookupName($cname);
|
||||
|
||||
// When no entity is found provide the name of the unmatched string
|
||||
// and continue on as the & is not part of an entity. The & will
|
||||
// be converted to &amp; elsewhere.
|
||||
if ($entity == null) {
|
||||
if (!$inAttribute || strlen($cname) === 0) {
|
||||
$this->parseError("No match in entity table for '%s'", $cname);
|
||||
}
|
||||
$this->scanner->unconsume($this->scanner->position() - $start);
|
||||
return '&';
|
||||
}
|
||||
|
@ -1,14 +1,16 @@
|
||||
# HTML5-PHP
|
||||
|
||||
The need for an HTML5 parser in PHP is clear. This project initially
|
||||
began with the seemingly abandoned `html5lib` project [original source](https://code.google.com/p/html5lib/source/checkout).
|
||||
But after some initial refactoring work, we began a new parser.
|
||||
HTML5 is a standards-compliant HTML5 parser and writer written entirely in PHP.
|
||||
It is stable and used in many production websites, and has
|
||||
well over [one million downloads](https://packagist.org/packages/masterminds/html5).
|
||||
|
||||
HTML5 provides the following features.
|
||||
|
||||
- An HTML5 serializer
|
||||
- Support for PHP namespaces
|
||||
- Composer support
|
||||
- Event-based (SAX-like) parser
|
||||
- DOM tree builder
|
||||
- A DOM tree builder
|
||||
- Interoperability with [QueryPath](https://github.com/technosophos/querypath)
|
||||
- Runs on **PHP** 5.3.0 or newer and **HHVM** 3.2 or newer
|
||||
|
||||
@ -16,6 +18,7 @@ But after some initial refactoring work, we began a new parser.
|
||||
[![Latest Stable Version](https://poser.pugx.org/masterminds/html5/v/stable.png)](https://packagist.org/packages/masterminds/html5)
|
||||
[![Code Coverage](https://scrutinizer-ci.com/g/Masterminds/html5-php/badges/coverage.png?b=master)](https://scrutinizer-ci.com/g/Masterminds/html5-php/?branch=master)
|
||||
[![Scrutinizer Code Quality](https://scrutinizer-ci.com/g/Masterminds/html5-php/badges/quality-score.png?b=master)](https://scrutinizer-ci.com/g/Masterminds/html5-php/?branch=master)
|
||||
[![Stability: Sustained](https://masterminds.github.io/stability/sustained.svg)](https://masterminds.github.io/stability/sustained.html)
|
||||
|
||||
## Installation
|
||||
|
||||
@ -23,7 +26,7 @@ Install HTML5-PHP using [composer](http://getcomposer.org/).
|
||||
|
||||
To install, add `masterminds/html5` to your `composer.json` file:
|
||||
|
||||
```
|
||||
```json
|
||||
{
|
||||
"require" : {
|
||||
"masterminds/html5": "2.*"
|
||||
|
@ -1,6 +1,13 @@
|
||||
# Release Notes
|
||||
|
||||
2.2.2 (2016-10-22)
|
||||
2.3.0 (2017-09-04)
|
||||
|
||||
- #129: image within inline svg breaks system (fixed by #133)
|
||||
- #131: `&sup2;` does not work (fixed by #132)
|
||||
- #134: Improve tokenizer performance by 20% (alternative version of #130 thanks to @MichaelHeerklotz)
|
||||
- #135: Raw & in attributes
|
||||
|
||||
2.2.2 (2016-09-22)
|
||||
|
||||
- #116: In XML mode, tags are case sensitive
|
||||
- #115: Fix PHP Notice in OutputRules
|
||||
@ -14,8 +21,7 @@
|
||||
2.2.0 (2016-04-11)
|
||||
|
||||
- #105: Enable composer cache (for CI/CD)
|
||||
- #100: Use mb_substitute_character inset of ini_set for environments where
|
||||
ini_set is disable (e.g., shared hosting)
|
||||
- #100: Use mb_substitute_character instead of ini_set for environments where ini_set is disabled (e.g., shared hosting)
|
||||
- #98: Allow link, meta, style tags in noscript tags
|
||||
- #96: Fixed xml:href on svgs that use the "use" breaking
|
||||
- #94: Counting UTF8 characters performance improvement
|
||||
|
@ -7,8 +7,8 @@
|
||||
* For environments which do not have these options, it reverts to standard sequential
|
||||
* requests (using file_get_contents())
|
||||
*
|
||||
* @version 1.7
|
||||
* @date 2016-11-28
|
||||
* @version 1.8
|
||||
* @date 2017-09-25
|
||||
* @see http://devel-m6w6.rhcloud.com/mdref/http
|
||||
* @author Keyvan Minoukadeh
|
||||
* @copyright 2011-2016 Keyvan Minoukadeh
|
||||
@ -21,8 +21,9 @@ class HumbleHttpAgent
|
||||
const METHOD_CURL_MULTI = 2;
|
||||
const METHOD_FILE_GET_CONTENTS = 4;
|
||||
//const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1';
|
||||
const UA_BROWSER = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36';
|
||||
const UA_PHP = 'PHP/5.6';
|
||||
// popular user agents from https://techblog.willshouse.com/2012/01/03/most-common-user-agents/
|
||||
const UA_BROWSER = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36';
|
||||
const UA_PHP = 'PHP/7.1';
|
||||
const REF_GOOGLE = 'http://www.google.co.uk/url?sa=t&source=web&cd=1';
|
||||
|
||||
protected $requests = array();
|
||||
@ -194,6 +195,24 @@ class HumbleHttpAgent
|
||||
|
||||
public function getMetaRefreshURL($url, $html) {
|
||||
if ($html == '') return false;
|
||||
|
||||
// TODO: parse HTML properly
|
||||
// For now, to deal with cases where meta refresh matches but shouldn't, e.g. CNN's
|
||||
// <!--[if lte IE 9]><meta http-equiv="refresh" content="1;url=/2.37.2/static/unsupp.html" /><![endif]-->
|
||||
// we do the string replacements in the site config file before looking for the meta refresh
|
||||
if (isset($this->siteConfigBuilder)) {
|
||||
$sconfig = $this->siteConfigBuilder->buildSiteConfig($url);
|
||||
// do string replacements
|
||||
if (!empty($sconfig->find_string)) {
|
||||
if (count($sconfig->find_string) == count($sconfig->replace_string)) {
|
||||
$html = str_replace($sconfig->find_string, $sconfig->replace_string, $html, $_count);
|
||||
//$this->debug("Strings replaced: $_count (find_string and/or replace_string)");
|
||||
} else {
|
||||
//$this->debug('Skipped string replacement - incorrect number of find-replace strings in site config');
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// <meta HTTP-EQUIV="REFRESH" content="0; url=http://www.bernama.com/bernama/v6/newsindex.php?id=943513">
|
||||
if (!preg_match('!<meta http-equiv=["\']?refresh["\']? content=["\']?[0-9];\s*url=["\']?([^"\'>]+)["\']?!i', $html, $match)) {
|
||||
return false;
|
||||
@ -211,7 +230,7 @@ class HumbleHttpAgent
|
||||
if (isset($base->path)) $base->path = preg_replace('!//+!', '/', $base->path);
|
||||
if ($absolute = SimplePie_IRI::absolutize($base, $redirect_url)) {
|
||||
$this->debug('Meta refresh redirect found (http-equiv="refresh"), new URL: '.$absolute);
|
||||
return $absolute->get_iri();
|
||||
return $absolute->get_uri();
|
||||
}
|
||||
return false;
|
||||
}
|
||||
@ -248,6 +267,21 @@ class HumbleHttpAgent
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Convert an internationalized (Unicode) host name in a URL to its ASCII
 * (Punycode) form.
 *
 * Only the first occurrence of the parsed host inside the URL is rewritten;
 * the rest of the URL is left untouched. The URL is returned unchanged when
 * idn_to_ascii() is unavailable, when no host can be parsed from the URL,
 * when the host is already ASCII, or when the conversion fails.
 *
 * @param string $url URL whose host may contain non-ASCII characters
 * @return string URL with the host converted, or the original URL
 */
public function convertIdn($url) {
	if (!function_exists('idn_to_ascii')) {
		return $url;
	}

	$host = @parse_url($url, PHP_URL_HOST);
	if (!$host) {
		return $url;
	}

	// The UTS #46 variant constant is not defined on every intl build,
	// so fall back to the extension's default variant when it is missing.
	if (defined('INTL_IDNA_VARIANT_UTS46')) {
		$puny = idn_to_ascii($host, 0, INTL_IDNA_VARIANT_UTS46);
	} else {
		$puny = idn_to_ascii($host);
	}

	// idn_to_ascii() returns false on failure. Without this guard the
	// replacement below would strip the host from the URL entirely.
	if (!is_string($puny) || $puny === '' || $puny === $host) {
		return $url;
	}

	$pos = strpos($url, $host);
	if ($pos !== false) {
		$url = substr_replace($url, $puny, $pos, strlen($host));
	}
	return $url;
}
|
||||
|
||||
public function rewriteUrls($url) {
|
||||
foreach ($this->rewriteUrls as $find => $action) {
|
||||
if (strpos($url, $find) !== false) {
|
||||
@ -327,6 +361,7 @@ class HumbleHttpAgent
|
||||
} else {
|
||||
$this->debug("......adding to pool");
|
||||
$req_url = $this->rewriteUrls($url);
|
||||
$req_url = $this->convertIdn($req_url);
|
||||
$req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url;
|
||||
$req_url = $this->removeFragment($req_url);
|
||||
if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) {
|
||||
@ -507,6 +542,7 @@ class HumbleHttpAgent
|
||||
} else {
|
||||
$this->debug("......adding to pool");
|
||||
$req_url = $this->rewriteUrls($url);
|
||||
$req_url = $this->convertIdn($req_url);
|
||||
$req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url;
|
||||
$req_url = $this->removeFragment($req_url);
|
||||
if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) {
|
||||
@ -649,6 +685,7 @@ class HumbleHttpAgent
|
||||
$this->debug("Sending request for $url");
|
||||
$this->requests[$orig]['original_url'] = $orig;
|
||||
$req_url = $this->rewriteUrls($url);
|
||||
$req_url = $this->convertIdn($req_url);
|
||||
$req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url;
|
||||
$req_url = $this->removeFragment($req_url);
|
||||
$httpContext = $this->httpContext;
|
||||
|
@ -1,5 +1,4 @@
|
||||
<?php
|
||||
|
||||
/**
|
||||
* Detects the language of a given piece of text.
|
||||
*
|
||||
@ -74,9 +73,8 @@ class Text_LanguageDetect
|
||||
* $this->_data_dir will be ignored
|
||||
*
|
||||
* @var string
|
||||
* @access private
|
||||
*/
|
||||
var $_db_filename = 'lang.dat';
|
||||
protected $_db_filename = 'lang.dat';
|
||||
|
||||
/**
|
||||
* The filename that stores the unicode block definitions
|
||||
@ -85,9 +83,8 @@ class Text_LanguageDetect
|
||||
* $this->_data_dir will be ignored
|
||||
*
|
||||
* @var string
|
||||
* @access private
|
||||
*/
|
||||
var $_unicode_db_filename = 'unicode_blocks.dat';
|
||||
protected $_unicode_db_filename = 'unicode_blocks.dat';
|
||||
|
||||
/**
|
||||
* The data directory
|
||||
@ -95,9 +92,8 @@ class Text_LanguageDetect
|
||||
* Should be set by PEAR installer
|
||||
*
|
||||
* @var string
|
||||
* @access private
|
||||
*/
|
||||
var $_data_dir = '@data_dir@';
|
||||
protected $_data_dir = '@data_dir@';
|
||||
|
||||
/**
|
||||
* The trigram data for comparison
|
||||
@ -105,63 +101,56 @@ class Text_LanguageDetect
|
||||
* Will be loaded on start from $this->_db_filename
|
||||
*
|
||||
* @var array
|
||||
* @access private
|
||||
*/
|
||||
var $_lang_db = array();
|
||||
protected $_lang_db = array();
|
||||
|
||||
/**
|
||||
* stores the map of the trigram data to unicode characters
|
||||
* Stores the map of the trigram data to unicode characters
|
||||
*
|
||||
* @access private
|
||||
* @var array
|
||||
*/
|
||||
var $_unicode_map;
|
||||
protected $_unicode_map;
|
||||
|
||||
/**
|
||||
* The size of the trigram data arrays
|
||||
*
|
||||
* @var int
|
||||
* @access private
|
||||
*/
|
||||
var $_threshold = 300;
|
||||
protected $_threshold = 300;
|
||||
|
||||
/**
|
||||
* the maximum possible score.
|
||||
* The maximum possible score.
|
||||
*
|
||||
* needed for score normalization. Different depending on the
|
||||
* Needed for score normalization. Different depending on the
|
||||
* perl compatibility setting
|
||||
*
|
||||
* @access private
|
||||
* @var int
|
||||
* @see setPerlCompatible()
|
||||
*/
|
||||
var $_max_score = 0;
|
||||
protected $_max_score = 0;
|
||||
|
||||
/**
|
||||
* Whether or not to simulate perl's Language::Guess exactly
|
||||
*
|
||||
* @access private
|
||||
* @var bool
|
||||
* @see setPerlCompatible()
|
||||
*/
|
||||
var $_perl_compatible = false;
|
||||
protected $_perl_compatible = false;
|
||||
|
||||
/**
|
||||
* Whether to use the unicode block detection to speed up processing
|
||||
*
|
||||
* @access private
|
||||
* @var bool
|
||||
*/
|
||||
var $_use_unicode_narrowing = true;
|
||||
protected $_use_unicode_narrowing = true;
|
||||
|
||||
/**
|
||||
* stores the result of the clustering operation
|
||||
* Stores the result of the clustering operation
|
||||
*
|
||||
* @access private
|
||||
* @var array
|
||||
* @see clusterLanguages()
|
||||
*/
|
||||
var $_clusters;
|
||||
protected $_clusters;
|
||||
|
||||
/**
|
||||
* Which type of "language names" are accepted and returned:
|
||||
@ -170,7 +159,7 @@ class Text_LanguageDetect
|
||||
* 2 - 2-letter ISO 639-1 code ("en")
|
||||
* 3 - 3-letter ISO 639-2 code ("eng")
|
||||
*/
|
||||
var $_name_mode = 0;
|
||||
protected $_name_mode = 0;
|
||||
|
||||
/**
|
||||
* Constructor
|
||||
@ -178,7 +167,7 @@ class Text_LanguageDetect
|
||||
* Will attempt to load the language database. If it fails, you will get
|
||||
* an exception.
|
||||
*/
|
||||
function __construct()
|
||||
public function __construct()
|
||||
{
|
||||
$data = $this->_readdb($this->_db_filename);
|
||||
$this->_checkTrigram($data['trigram']);
|
||||
@ -200,9 +189,8 @@ class Text_LanguageDetect
|
||||
* @param string $fname File name to load
|
||||
*
|
||||
* @return string expected path to the language model database
|
||||
* @access private
|
||||
*/
|
||||
function _get_data_loc($fname)
|
||||
protected function _get_data_loc($fname)
|
||||
{
|
||||
return dirname(__FILE__).'/'.$fname;
|
||||
}
|
||||
@ -216,9 +204,8 @@ class Text_LanguageDetect
|
||||
*
|
||||
* @return array the language model data
|
||||
* @throws Text_LanguageDetect_Exception
|
||||
* @access private
|
||||
*/
|
||||
function _readdb($fname)
|
||||
protected function _readdb($fname)
|
||||
{
|
||||
// finds the correct data dir
|
||||
$fname = $this->_get_data_loc($fname);
|
||||
@ -246,9 +233,8 @@ class Text_LanguageDetect
|
||||
* @param array $trigram Trigram data from database
|
||||
*
|
||||
* @return void
|
||||
* @access private
|
||||
*/
|
||||
function _checkTrigram($trigram)
|
||||
protected function _checkTrigram($trigram)
|
||||
{
|
||||
if (!is_array($trigram)) {
|
||||
if (ini_get('magic_quotes_runtime')) {
|
||||
@ -340,11 +326,10 @@ class Text_LanguageDetect
|
||||
/**
|
||||
* Returns the number of languages that this object can detect
|
||||
*
|
||||
* @access public
|
||||
* @return int the number of languages
|
||||
* @throws Text_LanguageDetect_Exception
|
||||
*/
|
||||
function getLanguageCount()
|
||||
public function getLanguageCount()
|
||||
{
|
||||
return count($this->_lang_db);
|
||||
}
|
||||
@ -382,11 +367,10 @@ class Text_LanguageDetect
|
||||
/**
|
||||
* Returns the list of detectable languages
|
||||
*
|
||||
* @access public
|
||||
* @return array the names of the languages known to this object
|
||||
* @throws Text_LanguageDetect_Exception
|
||||
*/
|
||||
function getLanguages()
|
||||
public function getLanguages()
|
||||
{
|
||||
return $this->_convertToNameMode(
|
||||
array_keys($this->_lang_db)
|
||||
@ -424,7 +408,7 @@ class Text_LanguageDetect
|
||||
*
|
||||
* @return void
|
||||
*/
|
||||
function setNameMode($name_mode)
|
||||
public function setNameMode($name_mode)
|
||||
{
|
||||
$this->_name_mode = $name_mode;
|
||||
}
|
||||
@ -454,10 +438,9 @@ class Text_LanguageDetect
|
||||
* @param string $text text to convert
|
||||
*
|
||||
* @return array array of trigram frequencies
|
||||
* @access private
|
||||
* @deprecated Superseded by the Text_LanguageDetect_Parser class
|
||||
*/
|
||||
function _trigram($text)
|
||||
protected function _trigram($text)
|
||||
{
|
||||
$s = new Text_LanguageDetect_Parser($text);
|
||||
$s->prepareTrigram();
|
||||
@ -475,9 +458,8 @@ class Text_LanguageDetect
|
||||
* @param array $arr array of trigram
|
||||
*
|
||||
* @return array ranks of trigrams
|
||||
* @access protected
|
||||
*/
|
||||
function _arr_rank($arr)
|
||||
protected function _arr_rank($arr)
|
||||
{
|
||||
|
||||
// sorts alphabetically first as a standard way of breaking rank ties
|
||||
@ -505,12 +487,11 @@ class Text_LanguageDetect
|
||||
/**
|
||||
* Sorts an array by value breaking ties alphabetically
|
||||
*
|
||||
* @param array &$arr the array to sort
|
||||
* @param array $arr the array to sort
|
||||
*
|
||||
* @return void
|
||||
* @access private
|
||||
*/
|
||||
function _bub_sort(&$arr)
|
||||
protected function _bub_sort(&$arr)
|
||||
{
|
||||
// should do the same as this perl statement:
|
||||
// sort { $trigrams{$b} == $trigrams{$a}
|
||||
@ -548,9 +529,8 @@ class Text_LanguageDetect
|
||||
*
|
||||
* @return int 1 if $a is greater, -1 if not
|
||||
* @see _bub_sort()
|
||||
* @access private
|
||||
*/
|
||||
function _sort_func($a, $b)
|
||||
protected function _sort_func($a, $b)
|
||||
{
|
||||
// each is actually a key/value pair, so that it can compare using both
|
||||
list($a_key, $a_value) = $a;
|
||||
@ -588,9 +568,8 @@ class Text_LanguageDetect
|
||||
*
|
||||
* @return int the sum of the differences between the ranks of
|
||||
* the two trigram sets
|
||||
* @access private
|
||||
*/
|
||||
function _distance($arr1, $arr2)
|
||||
protected function _distance($arr1, $arr2)
|
||||
{
|
||||
$sumdist = 0;
|
||||
|
||||
@ -621,9 +600,8 @@ class Text_LanguageDetect
|
||||
*
|
||||
* @return float the normalized score
|
||||
* @see _distance()
|
||||
* @access private
|
||||
*/
|
||||
function _normalize_score($score, $base_count = null)
|
||||
protected function _normalize_score($score, $base_count = null)
|
||||
{
|
||||
if ($base_count === null) {
|
||||
$base_count = $this->_threshold;
|
||||
@ -699,7 +677,7 @@ class Text_LanguageDetect
|
||||
$sample_obj->setPadStart(!$this->_perl_compatible);
|
||||
$sample_obj->analyze();
|
||||
|
||||
$trigram_freqs =& $sample_obj->getTrigramRanks();
|
||||
$trigram_freqs = $sample_obj->getTrigramRanks();
|
||||
$trigram_count = count($trigram_freqs);
|
||||
|
||||
if ($trigram_count == 0) {
|
||||
@ -710,7 +688,7 @@ class Text_LanguageDetect
|
||||
|
||||
// use unicode block detection to narrow down the possibilities
|
||||
if ($this->_use_unicode_narrowing) {
|
||||
$blocks =& $sample_obj->getUnicodeBlocks();
|
||||
$blocks = $sample_obj->getUnicodeBlocks();
|
||||
|
||||
if (is_array($blocks)) {
|
||||
$present_blocks = array_keys($blocks);
|
||||
@ -962,16 +940,15 @@ class Text_LanguageDetect
|
||||
*
|
||||
* @return mixed Block name, -1 if it failed
|
||||
* @see unicodeBlockName()
|
||||
* @access protected
|
||||
*/
|
||||
function _unicode_block_name($unicode, $blocks, $block_count = -1)
|
||||
protected function _unicode_block_name($unicode, $blocks, $block_count = -1)
|
||||
{
|
||||
// for a reference, see
|
||||
// http://www.unicode.org/Public/UNIDATA/Blocks.txt
|
||||
|
||||
// assume that ascii characters are the most common
|
||||
// so try it first for efficiency
|
||||
if ($unicode <= hexdec($blocks[0][1])) {
|
||||
if ($unicode <= $blocks[0][1]) {
|
||||
return $blocks[0];
|
||||
}
|
||||
|
||||
@ -989,11 +966,11 @@ class Text_LanguageDetect
|
||||
while ($low <= $high) {
|
||||
$mid = floor(($low + $high) / 2);
|
||||
|
||||
if ($unicode < hexdec($blocks[$mid][0])) {
|
||||
if ($unicode < $blocks[$mid][0]) {
|
||||
// if it's lower than the lower bound
|
||||
$high = $mid - 1;
|
||||
|
||||
} elseif ($unicode > hexdec($blocks[$mid][1])) {
|
||||
} elseif ($unicode > $blocks[$mid][1]) {
|
||||
// if it's higher than the upper bound
|
||||
$low = $mid + 1;
|
||||
|
||||
@ -1015,9 +992,8 @@ class Text_LanguageDetect
|
||||
*
|
||||
* @return array the database of unicode block definitions
|
||||
* @throws Text_LanguageDetect_Exception
|
||||
* @access protected
|
||||
*/
|
||||
function _read_unicode_block_db()
|
||||
protected function _read_unicode_block_db()
|
||||
{
|
||||
// since the unicode definitions are always going to be the same,
|
||||
// might as well share the memory for the db with all other instances
|
||||
@ -1136,14 +1112,13 @@ class Text_LanguageDetect
|
||||
* Uses a nearest neighbor technique to generate the maximum possible
|
||||
* number of dendrograms from the similarity data.
|
||||
*
|
||||
* @access public
|
||||
* @return array language cluster data
|
||||
* @throws Text_LanguageDetect_Exception
|
||||
* @see languageSimilarity()
|
||||
* @deprecated this function will eventually be removed and placed into
|
||||
* the model generation class
|
||||
*/
|
||||
function clusterLanguages()
|
||||
public function clusterLanguages()
|
||||
{
|
||||
// todo: set the maximum number of clusters
|
||||
// return cached result, if any
|
||||
@ -1452,7 +1427,7 @@ class Text_LanguageDetect
|
||||
}
|
||||
|
||||
/**
|
||||
* ut8-safe strlen()
|
||||
* UTF8-safe strlen()
|
||||
*
|
||||
* Returns the number of characters (not bytes) in a utf8 string
|
||||
*
|
||||
@ -1476,10 +1451,9 @@ class Text_LanguageDetect
|
||||
* @param string $char a utf8 (possibly multi-byte) char
|
||||
*
|
||||
* @return int unicode value
|
||||
* @access protected
|
||||
* @link http://en.wikipedia.org/wiki/UTF-8
|
||||
*/
|
||||
function _utf8char2unicode($char)
|
||||
protected function _utf8char2unicode($char)
|
||||
{
|
||||
// strlen() here will actually get the binary length of a single char
|
||||
switch (strlen($char)) {
|
||||
@ -1516,20 +1490,19 @@ class Text_LanguageDetect
|
||||
}
|
||||
|
||||
/**
|
||||
* utf8-safe fast character iterator
|
||||
* UTF8-safe fast character iterator
|
||||
*
|
||||
* Will get the next character starting from $counter, which will then be
|
||||
* incremented. If a multi-byte char the bytes will be concatenated and
|
||||
* $counter will be incremented by the number of bytes in the char.
|
||||
*
|
||||
* @param string $str the string being iterated over
|
||||
* @param int &$counter the iterator, will increment by reference
|
||||
* @param int $counter the iterator, will increment by reference
|
||||
* @param bool $special_convert whether to do special conversions
|
||||
*
|
||||
* @return char the next (possibly multi-byte) char from $counter
|
||||
* @access private
|
||||
*/
|
||||
static function _next_char($str, &$counter, $special_convert = false)
|
||||
protected static function _next_char($str, &$counter, $special_convert = false)
|
||||
{
|
||||
$char = $str{$counter++};
|
||||
$ord = ord($char);
|
||||
@ -1621,7 +1594,7 @@ class Text_LanguageDetect
|
||||
*
|
||||
* @return string|array Language name
|
||||
*/
|
||||
function _convertFromNameMode($lang, $convertKey = false)
|
||||
protected function _convertFromNameMode($lang, $convertKey = false)
|
||||
{
|
||||
if ($this->_name_mode == 0) {
|
||||
return $lang;
|
||||
@ -1661,7 +1634,7 @@ class Text_LanguageDetect
|
||||
*
|
||||
* @return string|array Language name
|
||||
*/
|
||||
function _convertToNameMode($lang, $convertKey = false)
|
||||
protected function _convertToNameMode($lang, $convertKey = false)
|
||||
{
|
||||
if ($this->_name_mode == 0) {
|
||||
return $lang;
|
||||
@ -1689,5 +1662,3 @@ class Text_LanguageDetect
|
||||
return $newlang;
|
||||
}
|
||||
}
|
||||
|
||||
/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */
|
@ -1,4 +1,16 @@
|
||||
<?php
|
||||
/**
|
||||
* Part of Text_LanguageDetect
|
||||
*
|
||||
* PHP version 5
|
||||
*
|
||||
* @category Text
|
||||
* @package Text_LanguageDetect
|
||||
* @author Nicholas Pisarro <infinityminusnine+pear@gmail.com>
|
||||
* @license BSD http://www.opensource.org/licenses/bsd-license.php
|
||||
* @link http://pear.php.net/package/Text_LanguageDetect/
|
||||
*/
|
||||
|
||||
class Text_LanguageDetect_Exception extends Exception
|
||||
{
|
||||
/**
|
||||
|
@ -1,18 +1,4 @@
|
||||
<?php
|
||||
/**
|
||||
* Part of Text_LanguageDetect
|
||||
*
|
||||
* PHP version 5
|
||||
*
|
||||
* @category Text
|
||||
* @package Text_LanguageDetect
|
||||
* @author Christian Weiske <cweiske@php.net>
|
||||
* @copyright 2011 Christian Weiske <cweiske@php.net>
|
||||
* @license http://www.debian.org/misc/bsd.license BSD
|
||||
* @version SVN: $Id$
|
||||
* @link http://pear.php.net/package/Text_LanguageDetect/
|
||||
*/
|
||||
|
||||
/**
|
||||
* Provides a mapping between the languages from lang.dat and the
|
||||
* ISO 639-1 and ISO-639-2 codes.
|
||||
@ -23,7 +9,7 @@
|
||||
* @package Text_LanguageDetect
|
||||
* @author Christian Weiske <cweiske@php.net>
|
||||
* @copyright 2011 Christian Weiske <cweiske@php.net>
|
||||
* @license http://www.debian.org/misc/bsd.license BSD
|
||||
* @license BSD http://www.opensource.org/licenses/bsd-license.php
|
||||
* @link http://www.loc.gov/standards/iso639-2/php/code_list.php
|
||||
*/
|
||||
class Text_LanguageDetect_ISO639
|
||||
|
@ -1,18 +1,4 @@
|
||||
<?php
|
||||
|
||||
/**
|
||||
* This class represents a text sample to be parsed.
|
||||
*
|
||||
* @category Text
|
||||
* @package Text_LanguageDetect
|
||||
* @author Nicholas Pisarro
|
||||
* @copyright 2006
|
||||
* @license BSD
|
||||
* @version CVS: $Id: Parser.php 322327 2012-01-15 17:55:59Z cweiske $
|
||||
* @link http://pear.php.net/package/Text_LanguageDetect/
|
||||
* @link http://langdetect.blogspot.com/
|
||||
*/
|
||||
|
||||
/**
|
||||
* This class represents a text sample to be parsed.
|
||||
*
|
||||
@ -25,94 +11,101 @@
|
||||
*
|
||||
* @category Text
|
||||
* @package Text_LanguageDetect
|
||||
* @author Nicholas Pisarro
|
||||
* @copyright 2006
|
||||
* @license BSD
|
||||
* @version release: 0.3.0
|
||||
* @author Nicholas Pisarro <infinityminusnine+pear@gmail.com>
|
||||
* @copyright 2006 Nicholas Pisarro
|
||||
* @license BSD http://www.opensource.org/licenses/bsd-license.php
|
||||
* @version Release: 1.0.0
|
||||
* @link http://pear.php.net/package/Text_LanguageDetect/
|
||||
*/
|
||||
class Text_LanguageDetect_Parser extends Text_LanguageDetect
|
||||
{
|
||||
/**
|
||||
* the piece of text being parsed
|
||||
* The piece of text being parsed
|
||||
*
|
||||
* @access private
|
||||
* @var string
|
||||
*/
|
||||
var $_string;
|
||||
protected $_string;
|
||||
|
||||
/**
|
||||
* stores the trigram frequencies of the sample
|
||||
* Stores the trigram frequencies of the sample
|
||||
*
|
||||
* @access private
|
||||
* @var array
|
||||
*/
|
||||
var $_trigrams = array();
|
||||
protected $_trigrams = array();
|
||||
|
||||
/**
|
||||
* stores the trigram ranks of the sample
|
||||
* Stores the trigram ranks of the sample
|
||||
*
|
||||
* @access private
|
||||
* @var array
|
||||
*/
|
||||
var $_trigram_ranks = array();
|
||||
protected $_trigram_ranks = array();
|
||||
|
||||
/**
|
||||
* stores the unicode blocks of the sample
|
||||
* Stores the unicode blocks of the sample
|
||||
*
|
||||
* @access private
|
||||
* @var array
|
||||
*/
|
||||
var $_unicode_blocks = array();
|
||||
protected $_unicode_blocks = array();
|
||||
|
||||
/**
|
||||
* Whether the parser should compile the unicode ranges
|
||||
*
|
||||
* @access private
|
||||
* @var bool
|
||||
*/
|
||||
var $_compile_unicode = false;
|
||||
protected $_compile_unicode = false;
|
||||
|
||||
/**
|
||||
* Whether the parser should compile trigrams
|
||||
*
|
||||
* @access private
|
||||
* @var bool
|
||||
*/
|
||||
var $_compile_trigram = false;
|
||||
protected $_compile_trigram = false;
|
||||
|
||||
/**
|
||||
* Whether the trigram parser should pad the beginning of the string
|
||||
*
|
||||
* @access private
|
||||
* @var bool
|
||||
*/
|
||||
var $_trigram_pad_start = false;
|
||||
protected $_trigram_pad_start = false;
|
||||
|
||||
/**
|
||||
* Whether the unicode parser should skip non-alphabetical ascii chars
|
||||
*
|
||||
* @access private
|
||||
* @var bool
|
||||
*/
|
||||
var $_unicode_skip_symbols = true;
|
||||
protected $_unicode_skip_symbols = true;
|
||||
|
||||
/**
|
||||
* Constructor
|
||||
*
|
||||
* @access private
|
||||
* @param string $string string to be parsed
|
||||
*/
|
||||
function __construct($string) {
|
||||
public function __construct($string)
|
||||
{
|
||||
$this->_string = $string;
|
||||
}
|
||||
|
||||
/**
|
||||
* PHP 4 constructor for backwards compatibility.
|
||||
*
|
||||
* @param string $string string to be parsed
|
||||
*
|
||||
* @return void
|
||||
*/
|
||||
public function Text_LanguageDetect_Parser($string)
|
||||
{
|
||||
self::__construct($string);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if a string is suitable for parsing
|
||||
*
|
||||
* @param string $str input string to test
|
||||
*
|
||||
* @return bool true if acceptable, false if not
|
||||
*/
|
||||
public static function validateString($str) {
|
||||
public static function validateString($str)
|
||||
{
|
||||
if (!empty($str) && strlen($str) > 3 && preg_match('/\S/', $str)) {
|
||||
return true;
|
||||
} else {
|
||||
@ -121,34 +114,37 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect
|
||||
}
|
||||
|
||||
/**
|
||||
* turn on/off trigram counting
|
||||
* Turn on/off trigram counting
|
||||
*
|
||||
* @access public
|
||||
* @param bool $bool true for on, false for off
|
||||
*
|
||||
* @return void
|
||||
*/
|
||||
function prepareTrigram($bool = true)
|
||||
public function prepareTrigram($bool = true)
|
||||
{
|
||||
$this->_compile_trigram = $bool;
|
||||
}
|
||||
|
||||
/**
|
||||
* turn on/off unicode block counting
|
||||
* Turn on/off unicode block counting
|
||||
*
|
||||
* @access public
|
||||
* @param bool $bool true for on, false for off
|
||||
*
|
||||
* @return void
|
||||
*/
|
||||
function prepareUnicode($bool = true)
|
||||
public function prepareUnicode($bool = true)
|
||||
{
|
||||
$this->_compile_unicode = $bool;
|
||||
}
|
||||
|
||||
/**
|
||||
* turn on/off padding the beginning of the sample string
|
||||
* Turn on/off padding the beginning of the sample string
|
||||
*
|
||||
* @access public
|
||||
* @param bool $bool true for on, false for off
|
||||
*
|
||||
* @return void
|
||||
*/
|
||||
function setPadStart($bool = true)
|
||||
public function setPadStart($bool = true)
|
||||
{
|
||||
$this->_trigram_pad_start = $bool;
|
||||
}
|
||||
@ -156,10 +152,11 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect
|
||||
/**
|
||||
* Should the unicode block counter skip non-alphabetical ascii chars?
|
||||
*
|
||||
* @access public
|
||||
* @param bool $bool true for on, false for off
|
||||
*
|
||||
* @return void
|
||||
*/
|
||||
function setUnicodeSkipSymbols($bool = true)
|
||||
public function setUnicodeSkipSymbols($bool = true)
|
||||
{
|
||||
$this->_unicode_skip_symbols = $bool;
|
||||
}
|
||||
@ -167,10 +164,9 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect
|
||||
/**
|
||||
* Returns the trigram ranks for the text sample
|
||||
*
|
||||
* @access public
|
||||
* @return array trigram ranks in the text sample
|
||||
* @return array Trigram ranks in the text sample
|
||||
*/
|
||||
function &getTrigramRanks()
|
||||
public function getTrigramRanks()
|
||||
{
|
||||
return $this->_trigram_ranks;
|
||||
}
|
||||
@ -178,23 +174,21 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect
|
||||
/**
|
||||
* Return the trigram frequency table
|
||||
*
|
||||
* only used in testing to make sure the parser is working
|
||||
* Only used in testing to make sure the parser is working
|
||||
*
|
||||
* @access public
|
||||
* @return array trigram freqencies in the text sample
|
||||
* @return array Trigram frequencies in the text sample
|
||||
*/
|
||||
function &getTrigramFreqs()
|
||||
public function getTrigramFreqs()
|
||||
{
|
||||
return $this->_trigram;
|
||||
}
|
||||
|
||||
/**
|
||||
* returns the array of unicode blocks
|
||||
* Returns the array of unicode blocks
|
||||
*
|
||||
* @access public
|
||||
* @return array unicode blocks in the text sample
|
||||
* @return array Unicode blocks in the text sample
|
||||
*/
|
||||
function &getUnicodeBlocks()
|
||||
public function getUnicodeBlocks()
|
||||
{
|
||||
return $this->_unicode_blocks;
|
||||
}
|
||||
@ -208,9 +202,9 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect
|
||||
* Afterwards the get*() functions can be used to access the compiled
|
||||
* information.
|
||||
*
|
||||
* @access public
|
||||
* @return void
|
||||
*/
|
||||
function analyze()
|
||||
public function analyze()
|
||||
{
|
||||
$len = strlen($this->_string);
|
||||
$byte_counter = 0;
|
||||
@ -274,7 +268,8 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect
|
||||
&& strlen($char) == 1
|
||||
&& ($char < 'A' || $char > 'z'
|
||||
|| ($char > 'Z' && $char < 'a'))
|
||||
&& $char != "'") { // does not skip the apostrophe
|
||||
&& $char != "'"
|
||||
) { // does not skip the apostrophe
|
||||
// since it's included in the language
|
||||
// models
|
||||
|
||||
@ -297,7 +292,8 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect
|
||||
if ($this->_compile_unicode) {
|
||||
foreach ($unicode_chars as $utf8_char => $count) {
|
||||
$search_result = $this->_unicode_block_name(
|
||||
$this->_utf8char2unicode($utf8_char), $blocks, $block_count);
|
||||
$this->_utf8char2unicode($utf8_char), $blocks, $block_count
|
||||
);
|
||||
|
||||
if ($search_result != -1) {
|
||||
$block_name = $search_result[2];
|
||||
@ -343,5 +339,3 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */
|
File diff suppressed because one or more lines are too long
@ -122,6 +122,7 @@ class Readability
|
||||
if ($parser=='gumbo') {
|
||||
// Can we avoid this encoding/decoding step? Test on:
|
||||
// http://www.medialens.org/index.php/alerts/alert-archive/2017/837-undermining-democracy-corporate-media-bias-on-jeremy-corbyn-boris-johnson-and-syria.html
|
||||
$html = str_replace('&apos;', "'", $html); // other named entities handled okay
|
||||
$html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
|
||||
$html = mb_convert_encoding($html, "UTF-8", 'HTML-ENTITIES');
|
||||
$this->dom = @Layershifter\Gumbo\Parser::load($html);
|
||||
|
@ -3,8 +3,8 @@
|
||||
// Author: Keyvan Minoukadeh
|
||||
// Copyright (c) 2017 Keyvan Minoukadeh
|
||||
// License: AGPLv3
|
||||
// Version: 3.7
|
||||
// Date: 2017-02-12
|
||||
// Version: 3.8
|
||||
// Date: 2017-09-25
|
||||
// More info: http://fivefilters.org/content-only/
|
||||
// Help: http://help.fivefilters.org
|
||||
|
||||
@ -183,7 +183,9 @@ if (!isset($_REQUEST['url'])) {
|
||||
die('No URL supplied');
|
||||
}
|
||||
$url = trim($_REQUEST['url']);
|
||||
if (strtolower(substr($url, 0, 7)) == 'feed://') {
|
||||
if (strtolower(substr($url, 0, 6)) == 'sec://') {
|
||||
$url = 'https://'.substr($url, 6);
|
||||
} elseif (strtolower(substr($url, 0, 7)) == 'feed://') {
|
||||
$url = 'http://'.substr($url, 7);
|
||||
}
|
||||
if (!preg_match('!^https?://.+!i', $url)) {
|
||||
@ -345,10 +347,10 @@ if ($options->content === 'user') {
|
||||
// HTML5 output?
|
||||
///////////////////////////////////////////////
|
||||
if ($options->html5_output === 'user') {
|
||||
if (isset($_REQUEST['content']) && $_REQUEST['content'] === 'html5') {
|
||||
$options->html5_output = true;
|
||||
} else {
|
||||
if (isset($_REQUEST['content']) && $_REQUEST['content'] === '1') {
|
||||
$options->html5_output = false;
|
||||
} else {
|
||||
$options->html5_output = true;
|
||||
}
|
||||
}
|
||||
|
||||
@ -820,7 +822,7 @@ foreach ($items as $key => $item) {
|
||||
continue; // skip this feed item entry
|
||||
}
|
||||
}
|
||||
$base_url = get_base_url($readability->dom);
|
||||
$base_url = get_base_url($readability->dom, $effective_url);
|
||||
if (!$base_url) $base_url = $effective_url;
|
||||
$content_block = ($extract_result) ? $extractor->getContent() : null;
|
||||
$extracted_title = ($extract_result) ? $extractor->getTitle() : '';
|
||||
@ -945,6 +947,7 @@ foreach ($items as $key => $item) {
|
||||
//unset($content_block);
|
||||
// post-processing cleanup
|
||||
$html = preg_replace('!<p>[\s\h\v]*</p>!u', '', $html);
|
||||
$html = str_replace('<p>&nbsp;</p>', '', $html);
|
||||
if ($links == 'remove') {
|
||||
$html = preg_replace('!<a\s+[^>]*>!', '', $html);
|
||||
$html = preg_replace('!</a>!', '', $html);
|
||||
@ -1080,6 +1083,7 @@ foreach ($items as $key => $item) {
|
||||
$l_result = $l->detect($text_sample, 1);
|
||||
if (count($l_result) > 0) {
|
||||
$language = key($l_result);
|
||||
debug('Language detected: '.$language);
|
||||
}
|
||||
}
|
||||
} catch (Exception $e) {
|
||||
@ -1248,6 +1252,17 @@ function get_self_url() {
|
||||
}
|
||||
|
||||
function validate_url($url) {
|
||||
if (function_exists('idn_to_ascii')) {
|
||||
if ($host = @parse_url($url, PHP_URL_HOST)) {
|
||||
$puny = idn_to_ascii($host, 0, INTL_IDNA_VARIANT_UTS46);
|
||||
if ($host != $puny) {
|
||||
$pos = strpos($url, $host);
|
||||
if ($pos !== false) {
|
||||
$url = substr_replace($url, $puny, $pos, strlen($host));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
$url = filter_var($url, FILTER_SANITIZE_URL);
|
||||
$test = filter_var($url, FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED);
|
||||
// deal with bug http://bugs.php.net/51192 (present in PHP 5.2.13 and PHP 5.3.2)
|
||||
@ -1261,9 +1276,14 @@ function validate_url($url) {
|
||||
}
|
||||
}
|
||||
|
||||
function get_base_url($dom) {
|
||||
function get_base_url($dom, $url=null) {
|
||||
$xpath = new DOMXPath($dom);
|
||||
return @$xpath->evaluate('string(//head/base/@href)', $dom);
|
||||
$base = @$xpath->evaluate('string(//head/base/@href)', $dom);
|
||||
if (!$base) return false;
|
||||
if (isset($url) && !preg_match('!^https?://!i', $base)) {
|
||||
$base = make_absolute_str($url, $base);
|
||||
}
|
||||
return $base;
|
||||
}
|
||||
|
||||
function is_ssl() {
|
||||
@ -1436,7 +1456,7 @@ function make_absolute_attr($base, $e, $attr) {
|
||||
$url = str_replace(' ', '%20', $url);
|
||||
if (!preg_match('!https?://!i', $url)) {
|
||||
if ($absolute = SimplePie_IRI::absolutize($base, $url)) {
|
||||
$e->setAttribute($attr, $absolute);
|
||||
$e->setAttribute($attr, $absolute->get_uri());
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1450,7 +1470,7 @@ function make_absolute_str($base, $url) {
|
||||
return $url;
|
||||
} else {
|
||||
if ($absolute = SimplePie_IRI::absolutize($base, $url)) {
|
||||
return $absolute;
|
||||
return $absolute->get_uri();
|
||||
}
|
||||
return false;
|
||||
}
|
||||
@ -1529,7 +1549,7 @@ function get_single_page($item, $html, $url) {
|
||||
}
|
||||
}
|
||||
}
|
||||
$base_url = get_base_url($readability->dom);
|
||||
$base_url = get_base_url($readability->dom, $url);
|
||||
if (!$base_url) $base_url = $url;
|
||||
// If we've got URL, resolve against $base_url
|
||||
if (isset($single_page_url) && ($single_page_url = make_absolute_str($base_url, $single_page_url))) {
|
||||
|
Loading…
Reference in New Issue
Block a user