Full-Text RSS 3.8

This commit is contained in:
FiveFilters.org 2019-04-04 23:46:36 +02:00
parent 954e765b5a
commit 1ec2f36b3e
21 changed files with 635 additions and 545 deletions

View File

@ -2,6 +2,22 @@ FiveFilters.org: Full-Text RSS
http://fivefilters.org/content-only/
CHANGELOG
------------------------------------
3.8 (2017-09-25)
- New site config directive: strip_attr: XPath attribute selector (e.g. //img/@srcset) - remove attribute from element
- New site config directive: insert_detected_image: yes/no (default yes) - places image in og:image in the body if no other images extracted
- Bug fix: Better handling of Internationalized Domain Names (IDNs)
- Bug fix: Relative base URLs (<base>) now resolved against page URL
- Bug fix: Wrong site config file chosen in certain cases (when wildcard and exact subdomain files available and cached in APCu)
- Bug fix: &apos; HTML entities not converted correctly when parsing with Gumbo PHP
- Remove srcset (+ sizes) attributes on img elements if it looks like they only contain relative URLs (browser will use src attribute value instead)
- https:// URLs now re-written to sec:// before being submitted to avoid overzealous security software blocking request on some servers - no redirect, only affects newly submitted URLs on index.php
- HTML5-PHP library updated
- Language Detect library updated
- Site config files updated for better extraction
- Minimum PHP version is now 5.4. If you must use PHP 5.3, please stick with Full-Text RSS 3.7
- Tested with PHP 7.2
- Other fixes/improvements
3.7 (2017-02-12)
- Request HTML5 output using HTML5-PHP - new config option $options->html5_output and new request parameter &content=html5
- Improve support for lazy-loading images
@ -23,31 +39,31 @@ CHANGELOG
- Other fixes/improvements
3.6 (2016-02-21)
- Insert og:image (if we find one) at the top of the article when no images have been extracted
- Additional lazy image load handling - helps preserve more images designed for JS-enabled browsers
- Original GUID values from feed items now preserved
- New config option favour_effective_url determines if item's effective URL (after redirects) should replace original item URL in feed output
- Adding &use_effective_url to querystring will replace original feed item URL with effective URL (unless disabled with config option above)
- APCu stats view in admin panel fixed to work with recent versions of APCu
- HTML5-PHP library updated
- Tested for PHP 7 compatibility
- VPS Puppet script (ubuntu-15.10.pp) updated - fixes issue with IDN encodings, among other things. (This is intended for setting up a new Ubuntu 15.10 instance for running Full-Text RSS.)
- Site config files updated for better extraction
- Other minor fixes/improvements
- Insert og:image (if we find one) at the top of the article when no images have been extracted
- Additional lazy image load handling - helps preserve more images designed for JS-enabled browsers
- Original GUID values from feed items now preserved
- New config option favour_effective_url determines if item's effective URL (after redirects) should replace original item URL in feed output
- Adding &use_effective_url to querystring will replace original feed item URL with effective URL (unless disabled with config option above)
- APCu stats view in admin panel fixed to work with recent versions of APCu
- HTML5-PHP library updated
- Tested for PHP 7 compatibility
- VPS Puppet script (ubuntu-15.10.pp) updated - fixes issue with IDN encodings, among other things. (This is intended for setting up a new Ubuntu 15.10 instance for running Full-Text RSS.)
- Site config files updated for better extraction
- Other minor fixes/improvements
3.5 (2015-06-13)
- Open Graph properties og:title, og:type, og:url, og:image, and og:description now returned if found in the page being processed
- Bug fix: certain XPath expressions weren't being evaluated correctly when HTML5 parsing was enabled
- Cookie handling now only on redirects - fixes issue with certain sites (thanks to Dave Vasilevsky)
- Compatibility test will no longer show HHVM as incompatible - Full-Text RSS worked with HHVM 3.7.1 in our tests (but without Tidy support and no automatic site config updates)
- Humble HTTP Agent updated to support version 2 of PHP's HTTP extension
- HTML5-PHP library updated
- Site config files can now include HTTP headers (user-agent, cookie, referer), e.g. http_header(user-agent): PHP/5.6
- Config option removed: $options->user_agents - use site config files.
- Site config files which use single_page_link can now follow it with if_page_contains: XPath to make it conditional.
- Minimum supported PHP version is now 5.3. If you must use PHP 5.2, please download Full-Text RSS 3.4
- Site config files updated for better extraction
- Other minor fixes/improvements
- Open Graph properties og:title, og:type, og:url, og:image, and og:description now returned if found in the page being processed
- Bug fix: certain XPath expressions weren't being evaluated correctly when HTML5 parsing was enabled
- Cookie handling now only on redirects - fixes issue with certain sites (thanks to Dave Vasilevsky)
- Compatibility test will no longer show HHVM as incompatible - Full-Text RSS worked with HHVM 3.7.1 in our tests (but without Tidy support and no automatic site config updates)
- Humble HTTP Agent updated to support version 2 of PHP's HTTP extension
- HTML5-PHP library updated
- Site config files can now include HTTP headers (user-agent, cookie, referer), e.g. http_header(user-agent): PHP/5.6
- Config option removed: $options->user_agents - use site config files.
- Site config files which use single_page_link can now follow it with if_page_contains: XPath to make it conditional.
- Minimum supported PHP version is now 5.3. If you must use PHP 5.2, please download Full-Text RSS 3.4
- Site config files updated for better extraction
- Other minor fixes/improvements
3.4 (2014-09-08)
- New request parameter: siteconfig lets you submit extraction rules directly in request

View File

@ -61,16 +61,15 @@ $options->content = 'user';
// HTML5 output
// ----------------------
// By default, Full-Text RSS uses libxml to convert the parsed DOM tree back into HTML.
// If this is enabled, we'll use HTML5-PHP to produce the HTML. This will be a little
// slower, but might produce better results, adhering to the HTML5 spec.
//
// Note: in a future release we might make HTML5 output the default.
// Full-Text RSS used to rely on libxml to output HTML extracted from
// a web page. Since version 3.8 we use HTML5-PHP by default.
// If you prefer the old output, either set this to false or pass &content=1
// in the querystring.
//
// Possible values...
// HTML5 (slower): true
// libxml (faster): false
// libxml unless user overrides (&content=html5): 'user' (default)
// HTML5 unless user overrides (&content=1): 'user' (default)
$options->html5_output = 'user';
// Excerpts
@ -524,7 +523,7 @@ $options->cache_cleanup = 100;
/// DO NOT CHANGE ANYTHING BELOW THIS ///////////
/////////////////////////////////////////////////
if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '3.7');
if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '3.8');
if (basename(__FILE__) == 'config.php') {
if (file_exists(dirname(__FILE__).'/custom_config.php')) {

View File

@ -16,12 +16,12 @@ SimplePie.org. We have kept most of their checks intact as we use SimplePie in o
http://github.com/simplepie/simplepie/tree/master/compatibility_test/
*/
$app_name = 'Full-Text RSS 3.7';
$app_name = 'Full-Text RSS 3.8';
// Full-Text RSS is not yet compatible with HHVM, that's why we check for it with HHVM_VERSION.
//$php_ok = (function_exists('version_compare') && version_compare(phpversion(), '5.2.0', '>=') && !defined('HHVM_VERSION'));
// HHVM works okay, but no Tidy and autoupdate of site config files not working (tested 3.7.1)
$php_ok = (function_exists('version_compare') && version_compare(phpversion(), '5.3.0', '>='));
$php_ok = (function_exists('version_compare') && version_compare(phpversion(), '5.4.0', '>='));
$pcre_ok = extension_loaded('pcre');
$zlib_ok = extension_loaded('zlib');
$mbstring_ok = extension_loaded('mbstring');
@ -32,6 +32,7 @@ $parallel_ok = ((extension_loaded('http') && class_exists('http\Client\Request')
$allow_url_fopen_ok = (bool)ini_get('allow_url_fopen');
$filter_ok = extension_loaded('filter');
$gumbo_ok = class_exists('Layershifter\Gumbo\Parser');
$idn_ok = function_exists('idn_to_ascii');
if (extension_loaded('xmlreader')) {
$xml_ok = true;
@ -204,7 +205,7 @@ div.chunk {
<tbody>
<tr class="<?php echo ($php_ok) ? 'enabled' : 'disabled'; ?>">
<td>PHP</td>
<td>5.3 or higher</td>
<td>5.4 or higher</td>
<td><?php echo phpversion(); ?></td>
</tr>
<tr class="<?php echo ($xml_ok) ? 'enabled, and sane' : 'disabled, or broken'; ?>">
@ -354,6 +355,11 @@ div.chunk {
<div class="chunk">
<h3>Further info</h3>
<h4>IDN support</h4>
<p>When treating an <a href="https://en.wikipedia.org/wiki/Internationalized_domain_name">internationalized domain name (IDN)</a> Full-Text RSS will try to make use of PHP's <code>idn_to_ascii</code> function to convert the domain to ASCII. If this function does not exist, you might have trouble retrieving article content from internationalized domains.</p>
<p class="highlight"><strong>idn_to_ascii</strong> is <?php if (!$idn_ok) echo '<strong>not</strong>'; ?> available on this server.</p>
<h4>HTTP module</h4>
<p>Full-Text RSS can make use of PHP's HTTP extension or <code>curl_multi</code> to make parallel HTTP requests when processing feeds. If neither are available, it will make sequential requests using <code>file_get_contents</code>.</p>
<?php

View File

@ -25,6 +25,7 @@ if (!defined('_FF_FTR_INDEX')) {
// remove http scheme from urls before submitting
$('#form').submit(function() {
$('#url').val($('#url').val().replace(/^http:\/\//i, ''));
$('#url').val($('#url').val().replace(/^https:\/\//i, 'sec://'));
return true;
});
// popovers
@ -271,8 +272,8 @@ if (!defined('_FF_FTR_INDEX')) {
<tr>
<td>content</td>
<td><tt>0</tt>, <tt>1</tt> (default), <tt>html5</tt></td>
<td>If set to 0, the extracted content will not be included in the output. If set to html5, we'll output HTML5.</td>
<td><tt>0</tt>, <tt>1</tt>, <tt>html5</tt> (default)</td>
<td>If set to 0, the extracted content will not be included in the output. If set to 1, we'll use regular libxml output - might not be HTML5 compliant.</td>
</tr>
<tr>

View File

@ -5,8 +5,8 @@
* Uses patterns specified in site config files and auto detection (hNews/PHP Readability)
* to extract content from HTML files.
*
* @version 1.3
* @date 2017-02-12
* @version 1.4
* @date 2017-09-25
* @author Keyvan Minoukadeh
* @copyright 2017 Keyvan Minoukadeh
* @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
@ -107,24 +107,13 @@ class ContentExtractor
}
// returns SiteConfig instance (joined in order: exact match, wildcard, fingerprint, global, default)
public function buildSiteConfig($url, $html='', $add_to_cache=true) {
public function buildSiteConfig($url, $html='') {
// extract host name
$host = @parse_url($url, PHP_URL_HOST);
$host = strtolower($host);
if (substr($host, 0, 4) == 'www.') $host = substr($host, 4);
// is merged version already cached?
if (SiteConfig::is_cached("$host.merged")) {
$config = SiteConfig::build("$host.merged");
if ($config) {
$this->debug("Returning cached and merged site config for $host");
return $config;
}
}
// let's build from site_config/custom/ and standard/
$config = SiteConfig::build($host);
if ($add_to_cache && $config && !SiteConfig::is_cached("$host")) {
SiteConfig::add_to_cache($host, $config);
}
// if no match, use defaults
if (!$config) $config = new SiteConfig();
// load fingerprint config?
@ -134,10 +123,6 @@ class ContentExtractor
if ($config_fingerprint = SiteConfig::build($_fphost)) {
$this->debug("Appending site config settings from $_fphost (fingerprint match)");
$config->append($config_fingerprint);
if ($add_to_cache && !SiteConfig::is_cached($_fphost)) {
//$config_fingerprint->cache_in_apc = true;
SiteConfig::add_to_cache($_fphost, $config_fingerprint);
}
}
}
}
@ -146,19 +131,8 @@ class ContentExtractor
if ($config_global = SiteConfig::build('global', true)) {
$this->debug('Appending site config settings from global.txt');
$config->append($config_global);
if ($add_to_cache && !SiteConfig::is_cached('global')) {
//$config_global->cache_in_apc = true;
SiteConfig::add_to_cache('global', $config_global);
}
}
}
// store copy of merged config
if ($add_to_cache) {
// do not store in APC if wildcard match
$use_apc = ($host == $config->cache_key);
$config->cache_key = null;
SiteConfig::add_to_cache("$host.merged", $config, $use_apc);
}
return $config;
}
@ -398,10 +372,14 @@ class ContentExtractor
$elems = @$xpath->query($pattern, $this->readability->dom);
// check for matches
if ($elems && $elems->length > 0) {
$this->debug('Stripping '.$elems->length.' elements (strip)');
$this->debug('Stripping '.$elems->length.' elements (strip: '.$pattern.')');
for ($i=$elems->length-1; $i >= 0; $i--) {
if ($elems->item($i)->parentNode) {
$elems->item($i)->parentNode->removeChild($elems->item($i));
if ($elems->item($i) instanceof DOMAttr) {
$elems->item($i)->parentNode->removeAttributeNode($elems->item($i));
} else {
$elems->item($i)->parentNode->removeChild($elems->item($i));
}
}
}
}
@ -413,7 +391,7 @@ class ContentExtractor
$elems = @$xpath->query("//*[contains(@class, '$string') or contains(@id, '$string')]", $this->readability->dom);
// check for matches
if ($elems && $elems->length > 0) {
$this->debug('Stripping '.$elems->length.' elements (strip_id_or_class)');
$this->debug('Stripping '.$elems->length.' elements (strip_id_or_class: '.$string.')');
for ($i=$elems->length-1; $i >= 0; $i--) {
$elems->item($i)->parentNode->removeChild($elems->item($i));
}
@ -426,12 +404,13 @@ class ContentExtractor
$elems = @$xpath->query("//img[contains(@src, '$string')]", $this->readability->dom);
// check for matches
if ($elems && $elems->length > 0) {
$this->debug('Stripping '.$elems->length.' image elements');
$this->debug('Stripping '.$elems->length.' elements (strip_image_src: '.$string.')');
for ($i=$elems->length-1; $i >= 0; $i--) {
$elems->item($i)->parentNode->removeChild($elems->item($i));
}
}
}
// strip elements using Readability.com and Instapaper.com ignore class names
// .entry-unrelated and .instapaper_ignore
// See https://www.readability.com/publishers/guidelines/#view-plainGuidelines
@ -464,7 +443,22 @@ class ContentExtractor
$elems->item($i)->parentNode->removeChild($elems->item($i));
}
}
// strip img srcset/sizes attributes with relative URIs (src should be present and will be absolutised)
// TODO: absolutize srcset values rather than removing them
// To remove srcset from all image elements, site config files can contain: strip: //img/@srcset
$elems = $xpath->query("//img[@srcset and not(contains(@srcset, '//'))]", $this->readability->dom);
// check for matches
if ($elems && $elems->length > 0) {
$this->debug('Stripping '.$elems->length.' srcset attributes');
foreach ($elems as $elem) {
$elem->removeAttribute('srcset');
if ($elem->hasAttribute('sizes')) {
$elem->removeAttribute('sizes');
}
}
}
// try to get body
foreach ($this->config->body as $pattern) {
$elems = @$xpath->query($pattern, $this->readability->dom);
@ -880,7 +874,7 @@ class ContentExtractor
}
} else {
// If there's an og:image, but we have no images in the article, let's place it at the beginning of the article.
if ($this->body->hasChildNodes() && isset($this->opengraph['og:image']) && substr($this->opengraph['og:image'], 0, 4) === 'http') {
if ($this->config->insert_detected_image() && $this->body->hasChildNodes() && isset($this->opengraph['og:image']) && substr($this->opengraph['og:image'], 0, 4) === 'http') {
$elems = @$xpath->query(".//img", $this->body);
if ($elems->length === 0) {
$_new_elem = $this->body->ownerDocument->createDocumentFragment();
@ -902,7 +896,7 @@ class ContentExtractor
return $this->success;
}
private function isDescendant(DOMElement $parent, DOMElement $child) {
$node = $child->parentNode;
while ($node != null) {

View File

@ -5,10 +5,10 @@
* Each instance of this class should hold extraction patterns and other directives
* for a website. See ContentExtractor class to see how it's used.
*
* @version 1.0
* @date 2015-06-09
* @version 1.1
* @date 2017-09-25
* @author Keyvan Minoukadeh
* @copyright 2015 Keyvan Minoukadeh
* @copyright 2017 Keyvan Minoukadeh
* @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
*/
@ -43,7 +43,6 @@ class SiteConfig
// Process HTML with tidy before creating DOM (bool or null if undeclared)
public $tidy = null;
protected $default_tidy = true; // used if undeclared
// Autodetect title/body if xpath expressions fail to produce results.
@ -93,6 +92,12 @@ class SiteConfig
public $parser = null;
protected $default_parser = 'libxml'; // used if undeclared
// Insert detected image (currently only og:image) into beginning of extracted article
// Only does this if extracted article contains no images
// bool or null if undeclared
public $insert_detected_image = null;
protected $default_insert_detected_image = true; // used if undeclared
// Strings to search for in HTML before processing begins (used with $replace_string)
public $find_string = array();
// Strings to replace those found in $find_string before HTML processing begins
@ -101,10 +106,9 @@ class SiteConfig
// the options below cannot be set in the config files which this class represents
//public $cache_in_apc = false; // used to decide if we should cache in apc or not
public $cache_key = null;
public static $debug = false;
protected static $apc = false;
protected static $config_path;
protected static $config_path_custom;
protected static $config_path_fallback;
protected static $config_cache = array();
const HOSTNAME_REGEX = '/^(([a-zA-Z0-9-]*[a-zA-Z0-9])\.)*([A-Za-z0-9-]*[A-Za-z0-9])$/';
@ -136,7 +140,13 @@ class SiteConfig
self::$apc = $apc;
return $apc;
}
// return bool or null
public function insert_detected_image($use_default=true) {
if ($use_default) return (isset($this->insert_detected_image)) ? $this->insert_detected_image : $this->default_insert_detected_image;
return $this->insert_detected_image;
}
// return bool or null
public function tidy($use_default=true) {
if ($use_default) return (isset($this->tidy)) ? $this->tidy : $this->default_tidy;
@ -162,15 +172,32 @@ class SiteConfig
}
public static function set_config_path($path, $fallback=null) {
self::$config_path = $path;
self::$config_path_custom = $path;
self::$config_path_fallback = $fallback;
}
protected static function load_cached_merged($host, $exact_host_match) {
if ($exact_host_match) {
$key = $host.'.merged.ex';
} else {
$key = $host.'.merged';
}
return self::load_cached($key);
}
protected static function add_to_cache_merged($host, $exact_host_match, SiteConfig $config=null) {
if ($exact_host_match) {
$key = $host.'.merged.ex';
} else {
$key = $host.'.merged';
}
if (!isset($config)) $config = new SiteConfig();
self::add_to_cache($key, $config);
}
public static function add_to_cache($key, SiteConfig $config, $use_apc=true) {
$key = strtolower($key);
if (substr($key, 0, 4) == 'www.') $key = substr($key, 4);
if ($config->cache_key) $key = $config->cache_key;
$key .= '.'.self::get_key_suffix();
self::$config_cache[$key] = $config;
if (self::$apc && $use_apc) {
self::debug("Adding site config to APC cache with key sc.$key");
@ -178,10 +205,23 @@ class SiteConfig
}
self::debug("Cached site config with key $key");
}
public static function load_cached($key) {
$key = strtolower($key);
if (substr($key, 0, 4) == 'www.') $key = substr($key, 4);
//var_dump('in cache?', $key, self::$config_cache);
if (array_key_exists($key, self::$config_cache)) {
self::debug("... site config for $key already loaded in this request");
return self::$config_cache[$key];
} elseif (self::$apc && ($sconfig = apc_fetch("sc.$key"))) {
self::debug("... site config for $key found in APCu");
return $sconfig;
}
return false;
}
public static function is_cached($key) {
$key = strtolower($key);
$key .= '.'.self::get_key_suffix();
if (substr($key, 0, 4) == 'www.') $key = substr($key, 4);
if (array_key_exists($key, self::$config_cache)) {
return true;
@ -212,7 +252,7 @@ class SiteConfig
}
// check for single statement commands
// we do not overwrite existing non null values
foreach (array('tidy', 'prune', 'parser', 'autodetect_on_failure') as $var) {
foreach (array('tidy', 'prune', 'parser', 'autodetect_on_failure', 'insert_detected_image') as $var) {
if ($this->$var === null) $this->$var = $newconfig->$var;
}
// treat find_string and replace_string separately (don't apply array_unique) (thanks fabrizio!)
@ -222,16 +262,6 @@ class SiteConfig
$this->$var = array_merge($this->$var, $newconfig->$var);
}
}
// This is used to make sure that when a different primary folder is chosen
// The key for the cached result includes that folder choice.
// Otherwise, a subsequent request choosing a different folder
// could return the wrong cached config.
public static function get_key_suffix() {
$key_suffix = basename(self::$config_path);
if ($key_suffix === 'custom') $key_suffix = '';
return $key_suffix;
}
// Add test_contains to last test_url
public function add_test_contains($test_contains) {
@ -274,6 +304,12 @@ class SiteConfig
$host = strtolower($host);
if (substr($host, 0, 4) == 'www.') $host = substr($host, 4);
if (!$host || (strlen($host) > 200) || !preg_match(self::HOSTNAME_REGEX, ltrim($host, '.'))) return false;
// got a merged one?
$config = self::load_cached_merged($host, $exact_host_match);
if ($config) {
//self::debug('. returned merged config from a previous request');
return $config;
}
// check for site configuration
$try = array($host);
// should we look for wildcard matches
@ -284,102 +320,87 @@ class SiteConfig
$try[] = '.'.implode('.', $split);
}
}
// Which primary folder should we look inside?
// If it's not the default ('custom'), we need
// a key suffix to distinguish site config files
// held in this folder from those in other folders.
$key_suffix = self::get_key_suffix();
// look for site config file in primary folder
self::debug(". looking for site config for $host in primary folder");
// look for site config file in custom folder
self::debug(". looking for site config for $host in custom folder");
//var_dump($try);
$config = null;
$config_std = null;
foreach ($try as $h) {
$h_key = "$h.$key_suffix";
if (array_key_exists($h_key, self::$config_cache)) {
self::debug("... site config for $h already loaded in this request");
return self::$config_cache[$h_key];
} elseif (self::$apc && ($sconfig = apc_fetch("sc.$h_key"))) {
self::debug("... site config for $h in APC cache");
return $sconfig;
} elseif (file_exists(self::$config_path."/$h.txt")) {
//$h_key = $h.'.'.$key_suffix;
$h_key = $h.'.custom';
//var_dump($h_key, $h);
if ($config = self::load_cached($h_key)) {
break;
} elseif (file_exists(self::$config_path_custom."/$h.txt")) {
self::debug("... found site config ($h.txt)");
$file_primary = self::$config_path."/$h.txt";
$matched_name = $h;
$file_custom = self::$config_path_custom."/$h.txt";
$config = self::build_from_file($file_custom);
//$matched_name = $h;
break;
}
}
// if we found site config, process it
if (isset($file_primary)) {
$config_lines = file($file_primary, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
if (!$config_lines || !is_array($config_lines)) return false;
$config = self::build_from_array($config_lines);
// if APC caching is available and enabled, mark this for cache
//$config->cache_in_apc = true;
$config->cache_key = $matched_name;
// if autodetect on failure is off (on by default) we do not need to look
// in secondary folder
if (!$config->autodetect_on_failure()) {
self::debug('... autodetect on failure is disabled (no other site config files will be loaded)');
return $config;
}
// if autodetect on failure is off (on by default) we do not need to look
// in secondary folder
if ($config && !$config->autodetect_on_failure()) {
self::debug('... autodetect on failure is disabled (no other site config files will be loaded)');
self::add_to_cache_merged($host, $exact_host_match, $config);
return $config;
}
// look for site config file in secondary folder
if (isset(self::$config_path_fallback)) {
self::debug(". looking for site config for $host in secondary folder");
self::debug(". looking for site config for $host in standard folder");
foreach ($try as $h) {
if (file_exists(self::$config_path_fallback."/$h.txt")) {
self::debug("... found site config in secondary folder ($h.txt)");
if ($config_std = self::load_cached($h)) {
break;
} elseif (file_exists(self::$config_path_fallback."/$h.txt")) {
self::debug("... found site config in standard folder ($h.txt)");
$file_secondary = self::$config_path_fallback."/$h.txt";
$matched_name = $h;
$config_std = self::build_from_file($file_secondary);
break;
}
}
if (!isset($file_secondary)) {
self::debug("... no site config match in secondary folder");
}
}
// return false if no config file found
if (!isset($file_primary) && !isset($file_secondary)) {
if (!$config && !$config_std) {
self::debug("... no site config match for $host");
self::add_to_cache_merged($host, $exact_host_match);
return false;
}
// return primary config if secondary not found
if (!isset($file_secondary) && isset($config)) {
return $config;
}
// process secondary config file
$config_lines = file($file_secondary, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
if (!$config_lines || !is_array($config_lines)) {
// failed to process secondary
if (isset($config)) {
// return primary config
return $config;
} else {
return false;
}
}
// merge with primary and return
if (isset($config)) {
// final config handling
$config_final = null;
if (!$config_std && $config) {
$config_final = $config;
// merge with primary
} elseif ($config_std && $config) {
self::debug('. merging config files');
$config->append(self::build_from_array($config_lines));
return $config;
$config->append($config_std);
$config_final = $config;
} else {
// return just secondary
$config = self::build_from_array($config_lines);
//$config = self::build_from_array($config_lines);
// if APC caching is available and enabled, mark this for cache
//$config->cache_in_apc = true;
$config->cache_key = $matched_name;
return $config;
$config_final = $config_std;
}
self::add_to_cache_merged($host, $exact_host_match, $config_final);
return $config_final;
}
public static function build_from_file($path, $cache=true) {
$key = basename($path, '.txt');
$config_lines = file($path, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
if (!$config_lines || !is_array($config_lines)) return false;
$config = self::build_from_array($config_lines);
if ($cache) self::add_to_cache($key, $config);
return $config;
}
public static function build_from_string($string) {
$config_lines = explode("\n", $string);
return self::build_from_array($config_lines);
@ -399,13 +420,23 @@ class SiteConfig
if (count($command) != 2) continue;
$val = trim($command[1]);
$command = trim($command[0]);
if ($command == '' || $val == '') continue;
//if ($command == '' || $val == '') continue;
// $val can be empty, e.g. replace_string:
if ($command == '') continue;
// strip_attr is now an alias for strip.
// In FTR 3.8 we can strip attributes from elements, not only the elements themselves
// e.g. strip: //img/@srcset (removes srcset attribute from all img elements)
// but for backward compatibility (to avoid errors with new config files + old version of FTR)
// we've introduced strip_attr and we'll recommend using that in our public site config rep.
// strip_attr: //img/@srcset
if ($command == 'strip_attr') $command = 'strip';
// check for commands where we accept multiple statements
if (in_array($command, array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'native_ad_clue', 'http_header', 'test_url', 'find_string', 'replace_string'))) {
array_push($config->$command, $val);
// check for single statement commands that evaluate to true or false
} elseif (in_array($command, array('tidy', 'prune', 'autodetect_on_failure'))) {
} elseif (in_array($command, array('tidy', 'prune', 'autodetect_on_failure', 'insert_detected_image'))) {
$config->$command = ($val == 'yes');
// check for single statement commands stored as strings
} elseif (in_array($command, array('parser'))) {

View File

@ -186,5 +186,4 @@
$this->setElement('enclosure','',$attributes);
}
} // end of class FeedItem
?>
}

View File

@ -1,4 +1,6 @@
<?php
define('ATOM', -1); // unused
define('RSS1', 0); // unused
define('RSS2', 1);
define('JSON', 2);
define('JSONP', 3);

File diff suppressed because one or more lines are too long

View File

@ -274,7 +274,8 @@ class DOMTreeBuilder implements EventHandler
// SPECIAL TAG HANDLING:
// Spec says do this, and "don't ask."
if ($name == 'image') {
// find the spec where this is defined... looks problematic
if ($name == 'image' && !($this->insertMode === static::IM_IN_SVG || $this->insertMode === static::IM_IN_MATHML)) {
$name = 'img';
}
@ -681,4 +682,4 @@ class DOMTreeBuilder implements EventHandler
{
return $this->current->tagName == $tagname;
}
}
}

View File

@ -83,11 +83,8 @@ class Tokenizer
*/
public function parse()
{
$p = 0;
do {
$p = $this->scanner->position();
$this->consumeData();
// FIXME: Add infinite loop protection.
} while ($this->carryOn);
}
@ -145,7 +142,8 @@ class Tokenizer
*/
protected function characterData()
{
if ($this->scanner->current() === false) {
$tok = $this->scanner->current();
if ($tok === false) {
return false;
}
switch ($this->textMode) {
@ -154,7 +152,6 @@ class Tokenizer
case Elements::TEXT_RCDATA:
return $this->rcdata();
default:
$tok = $this->scanner->current();
if (strspn($tok, "<&")) {
return false;
}
@ -408,24 +405,26 @@ class Tokenizer
if ($tok == '/') {
$this->scanner->next();
$this->scanner->whitespace();
if ($this->scanner->current() == '>') {
$tok = $this->scanner->current();
if ($tok == '>') {
$selfClose = true;
return true;
}
if ($this->scanner->current() === false) {
if ($tok === false) {
$this->parseError("Unexpected EOF inside of tag.");
return true;
}
// Basically, we skip the / token and go on.
// See 8.2.4.43.
$this->parseError("Unexpected '%s' inside of a tag.", $this->scanner->current());
$this->parseError("Unexpected '%s' inside of a tag.", $tok);
return false;
}
if ($this->scanner->current() == '>') {
if ($tok == '>') {
return true;
}
if ($this->scanner->current() === false) {
if ($tok === false) {
$this->parseError("Unexpected EOF inside of tag.");
return true;
}
@ -541,15 +540,21 @@ class Tokenizer
{
$stoplist = "\f" . $quote;
$val = '';
$tok = $this->scanner->current();
while (strspn($tok, $stoplist) == 0 && $tok !== false) {
if ($tok == '&') {
$val .= $this->decodeCharacterReference(true);
$tok = $this->scanner->current();
while (true) {
$tokens = $this->scanner->charsUntil($stoplist.'&');
if ($tokens !== false) {
$val .= $tokens;
} else {
$val .= $tok;
$tok = $this->scanner->next();
break;
}
$tok = $this->scanner->current();
if ($tok == '&') {
$val .= $this->decodeCharacterReference(true, $tok);
continue;
}
break;
}
$this->scanner->next();
return $val;
@ -591,18 +596,18 @@ class Tokenizer
*/
protected function bogusComment($leading = '')
{
// TODO: This can be done more efficiently when the
// scanner exposes a readUntil() method.
$comment = $leading;
$tokens = $this->scanner->charsUntil('>');
if ($tokens !== false) {
$comment .= $tokens;
}
$tok = $this->scanner->current();
do {
if ($tok !== false) {
$comment .= $tok;
$tok = $this->scanner->next();
} while ($tok !== false && $tok != '>');
}
$this->flushBuffer();
$this->events->comment($comment . $tok);
$this->events->comment($comment);
$this->scanner->next();
return true;
@ -646,15 +651,17 @@ class Tokenizer
*/
protected function isCommentEnd()
{
$tok = $this->scanner->current();
// EOF
if ($this->scanner->current() === false) {
if ($tok === false) {
// Hit the end.
$this->parseError("Unexpected EOF in a comment.");
return true;
}
// If it doesn't start with -, not the end.
if ($this->scanner->current() != '-') {
if ($tok != '-') {
return false;
}
@ -737,7 +744,6 @@ class Tokenizer
$pub = strtoupper($this->scanner->getAsciiAlpha());
$white = strlen($this->scanner->whitespace());
$tok = $this->scanner->current();
// Get ID, and flag it as pub or system.
if (($pub == 'PUBLIC' || $pub == 'SYSTEM') && $white > 0) {
@ -938,10 +944,11 @@ class Tokenizer
$len = strlen($sequence);
$buffer = '';
for ($i = 0; $i < $len; ++ $i) {
$buffer .= $this->scanner->current();
$tok = $this->scanner->current();
$buffer .= $tok;
// EOF. Rewind and let the caller handle it.
if ($this->scanner->current() === false) {
if ($tok === false) {
$this->scanner->unconsume($i);
return false;
}
@ -1067,18 +1074,22 @@ class Tokenizer
}
$entity = CharacterReference::lookupDecimal($numeric);
}
} // String entity.
else {
} elseif ($tok === '=' && $inAttribute) {
return '&';
} else { // String entity.
// Attempt to consume a string up to a ';'.
// [a-zA-Z0-9]+;
$cname = $this->scanner->getAsciiAlpha();
$cname = $this->scanner->getAsciiAlphaNum();
$entity = CharacterReference::lookupName($cname);
// When no entity is found provide the name of the unmatched string
// and continue on as the & is not part of an entity. The & will
// be converted to &amp; elsewhere.
if ($entity == null) {
$this->parseError("No match in entity table for '%s'", $cname);
if (!$inAttribute || strlen($cname) === 0) {
$this->parseError("No match in entity table for '%s'", $cname);
}
$this->scanner->unconsume($this->scanner->position() - $start);
return '&';
}

View File

@ -1,14 +1,16 @@
# HTML5-PHP
The need for an HTML5 parser in PHP is clear. This project initially
began with the seemingly abandoned `html5lib` project [original source](https://code.google.com/p/html5lib/source/checkout).
But after some initial refactoring work, we began a new parser.
HTML5 is a standards-compliant HTML5 parser and writer written entirely in PHP.
It is stable and used in many production websites, and has
well over [one million downloads](https://packagist.org/packages/masterminds/html5).
HTML5 provides the following features.
- An HTML5 serializer
- Support for PHP namespaces
- Composer support
- Event-based (SAX-like) parser
- DOM tree builder
- A DOM tree builder
- Interoperability with [QueryPath](https://github.com/technosophos/querypath)
- Runs on **PHP** 5.3.0 or newer and **HHVM** 3.2 or newer
@ -16,6 +18,7 @@ But after some initial refactoring work, we began a new parser.
[![Latest Stable Version](https://poser.pugx.org/masterminds/html5/v/stable.png)](https://packagist.org/packages/masterminds/html5)
[![Code Coverage](https://scrutinizer-ci.com/g/Masterminds/html5-php/badges/coverage.png?b=master)](https://scrutinizer-ci.com/g/Masterminds/html5-php/?branch=master)
[![Scrutinizer Code Quality](https://scrutinizer-ci.com/g/Masterminds/html5-php/badges/quality-score.png?b=master)](https://scrutinizer-ci.com/g/Masterminds/html5-php/?branch=master)
[![Stability: Sustained](https://masterminds.github.io/stability/sustained.svg)](https://masterminds.github.io/stability/sustained.html)
## Installation
@ -23,7 +26,7 @@ Install HTML5-PHP using [composer](http://getcomposer.org/).
To install, add `masterminds/html5` to your `composer.json` file:
```
```json
{
"require" : {
"masterminds/html5": "2.*"

View File

@ -1,6 +1,13 @@
# Release Notes
2.2.2 (2016-10-22)
2.3.0 (2017-09-04)
- #129: image within inline svg breaks system (fixed by #133)
- #131: &sup2; does not work (fixed by #132)
- #134: Improve tokenizer performance by 20% (alternative version of #130 thanks to @MichaelHeerklotz)
- #135: Raw & in attributes
2.2.2 (2016-09-22)
- #116: In XML mode, tags are case sensitive
- #115: Fix PHP Notice in OutputRules
@ -14,8 +21,7 @@
2.2.0 (2016-04-11)
- #105: Enable composer cache (for CI/CD)
- #100: Use mb_substitute_character instead of ini_set for environments where
ini_set is disabled (e.g., shared hosting)
- #100: Use mb_substitute_character instead of ini_set for environments where ini_set is disabled (e.g., shared hosting)
- #98: Allow link, meta, style tags in noscript tags
- #96: Fixed xml:href on svgs that use the "use" breaking
- #94: Counting UTF8 characters performance improvement

View File

@ -7,8 +7,8 @@
* For environments which do not have these options, it reverts to standard sequential
* requests (using file_get_contents())
*
* @version 1.7
* @date 2016-11-28
* @version 1.8
* @date 2017-09-25
* @see http://devel-m6w6.rhcloud.com/mdref/http
* @author Keyvan Minoukadeh
* @copyright 2011-2016 Keyvan Minoukadeh
@ -21,8 +21,9 @@ class HumbleHttpAgent
const METHOD_CURL_MULTI = 2;
const METHOD_FILE_GET_CONTENTS = 4;
//const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1';
const UA_BROWSER = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36';
const UA_PHP = 'PHP/5.6';
// popular user agents from https://techblog.willshouse.com/2012/01/03/most-common-user-agents/
const UA_BROWSER = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36';
const UA_PHP = 'PHP/7.1';
const REF_GOOGLE = 'http://www.google.co.uk/url?sa=t&source=web&cd=1';
protected $requests = array();
@ -194,6 +195,24 @@ class HumbleHttpAgent
public function getMetaRefreshURL($url, $html) {
if ($html == '') return false;
// TODO: parse HTML properly
// For now, to deal with cases where meta refresh matches but shouldn't, e.g. CNN's
// <!--[if lte IE 9]><meta http-equiv="refresh" content="1;url=/2.37.2/static/unsupp.html" /><![endif]-->
// we do the string replacements in the site config file before looking for the meta refresh
if (isset($this->siteConfigBuilder)) {
$sconfig = $this->siteConfigBuilder->buildSiteConfig($url);
// do string replacements
if (!empty($sconfig->find_string)) {
if (count($sconfig->find_string) == count($sconfig->replace_string)) {
$html = str_replace($sconfig->find_string, $sconfig->replace_string, $html, $_count);
//$this->debug("Strings replaced: $_count (find_string and/or replace_string)");
} else {
//$this->debug('Skipped string replacement - incorrect number of find-replace strings in site config');
}
}
}
// <meta HTTP-EQUIV="REFRESH" content="0; url=http://www.bernama.com/bernama/v6/newsindex.php?id=943513">
if (!preg_match('!<meta http-equiv=["\']?refresh["\']? content=["\']?[0-9];\s*url=["\']?([^"\'>]+)["\']?!i', $html, $match)) {
return false;
@ -211,7 +230,7 @@ class HumbleHttpAgent
if (isset($base->path)) $base->path = preg_replace('!//+!', '/', $base->path);
if ($absolute = SimplePie_IRI::absolutize($base, $redirect_url)) {
$this->debug('Meta refresh redirect found (http-equiv="refresh"), new URL: '.$absolute);
return $absolute->get_iri();
return $absolute->get_uri();
}
return false;
}
@ -248,6 +267,21 @@ class HumbleHttpAgent
}
}
/**
 * Convert an internationalized (non-ASCII) host name in a URL to its
 * ASCII (Punycode) form.
 *
 * Only the first occurrence of the host string in the URL is replaced. The
 * URL is returned unchanged when idn_to_ascii() is not available, when no
 * host can be parsed from the URL, when the host is already ASCII, or when
 * the host cannot be converted.
 *
 * @param string $url URL whose host may contain non-ASCII characters
 *
 * @return string URL with the host in ASCII form, or the URL as given
 */
public function convertIdn($url) {
    if (!function_exists('idn_to_ascii')) {
        return $url;
    }
    $host = @parse_url($url, PHP_URL_HOST);
    if (!$host) {
        return $url;
    }
    $puny = idn_to_ascii($host, 0, INTL_IDNA_VARIANT_UTS46);
    // idn_to_ascii() returns false when the host cannot be converted (e.g. an
    // over-long label). Splicing that into the URL would erase the host, so
    // leave the URL untouched in that case.
    if (!is_string($puny) || $puny === '' || $host == $puny) {
        return $url;
    }
    $pos = strpos($url, $host);
    if ($pos !== false) {
        $url = substr_replace($url, $puny, $pos, strlen($host));
    }
    return $url;
}
public function rewriteUrls($url) {
foreach ($this->rewriteUrls as $find => $action) {
if (strpos($url, $find) !== false) {
@ -327,6 +361,7 @@ class HumbleHttpAgent
} else {
$this->debug("......adding to pool");
$req_url = $this->rewriteUrls($url);
$req_url = $this->convertIdn($req_url);
$req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url;
$req_url = $this->removeFragment($req_url);
if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) {
@ -507,6 +542,7 @@ class HumbleHttpAgent
} else {
$this->debug("......adding to pool");
$req_url = $this->rewriteUrls($url);
$req_url = $this->convertIdn($req_url);
$req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url;
$req_url = $this->removeFragment($req_url);
if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) {
@ -649,6 +685,7 @@ class HumbleHttpAgent
$this->debug("Sending request for $url");
$this->requests[$orig]['original_url'] = $orig;
$req_url = $this->rewriteUrls($url);
$req_url = $this->convertIdn($req_url);
$req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url;
$req_url = $this->removeFragment($req_url);
$httpContext = $this->httpContext;

View File

@ -1,5 +1,4 @@
<?php
/**
* Detects the language of a given piece of text.
*
@ -73,10 +72,9 @@ class Text_LanguageDetect
* If this value starts with a slash (/) or a dot (.) the value of
* $this->_data_dir will be ignored
*
* @var string
* @access private
* @var string
*/
var $_db_filename = 'lang.dat';
protected $_db_filename = 'lang.dat';
/**
* The filename that stores the unicode block definitions
@ -85,83 +83,74 @@ class Text_LanguageDetect
* $this->_data_dir will be ignored
*
* @var string
* @access private
*/
var $_unicode_db_filename = 'unicode_blocks.dat';
protected $_unicode_db_filename = 'unicode_blocks.dat';
/**
* The data directory
*
* Should be set by PEAR installer
*
* @var string
* @access private
* @var string
*/
var $_data_dir = '@data_dir@';
protected $_data_dir = '@data_dir@';
/**
* The trigram data for comparison
*
* Will be loaded on start from $this->_db_filename
*
* @var array
* @access private
*/
var $_lang_db = array();
/**
* stores the map of the trigram data to unicode characters
*
* @access private
* @var array
*/
var $_unicode_map;
protected $_lang_db = array();
/**
* Stores the map of the trigram data to unicode characters
*
* @var array
*/
protected $_unicode_map;
/**
* The size of the trigram data arrays
*
* @var int
* @access private
* @var int
*/
var $_threshold = 300;
protected $_threshold = 300;
/**
* the maximum possible score.
* The maximum possible score.
*
* needed for score normalization. Different depending on the
* Needed for score normalization. Different depending on the
* perl compatibility setting
*
* @access private
* @var int
* @see setPerlCompatible()
* @var int
* @see setPerlCompatible()
*/
var $_max_score = 0;
protected $_max_score = 0;
/**
* Whether or not to simulate perl's Language::Guess exactly
*
* @access private
* @var bool
* @see setPerlCompatible()
* @var bool
* @see setPerlCompatible()
*/
var $_perl_compatible = false;
protected $_perl_compatible = false;
/**
* Whether to use the unicode block detection to speed up processing
*
* @access private
* @var bool
*/
var $_use_unicode_narrowing = true;
protected $_use_unicode_narrowing = true;
/**
* stores the result of the clustering operation
* Stores the result of the clustering operation
*
* @access private
* @var array
* @see clusterLanguages()
* @var array
* @see clusterLanguages()
*/
var $_clusters;
protected $_clusters;
/**
* Which type of "language names" are accepted and returned:
@ -170,7 +159,7 @@ class Text_LanguageDetect
* 2 - 2-letter ISO 639-1 code ("en")
* 3 - 3-letter ISO 639-2 code ("eng")
*/
var $_name_mode = 0;
protected $_name_mode = 0;
/**
* Constructor
@ -178,7 +167,7 @@ class Text_LanguageDetect
* Will attempt to load the language database. If it fails, you will get
* an exception.
*/
function __construct()
public function __construct()
{
$data = $this->_readdb($this->_db_filename);
$this->_checkTrigram($data['trigram']);
@ -200,9 +189,8 @@ class Text_LanguageDetect
* @param string $fname File name to load
*
* @return string expected path to the language model database
* @access private
*/
function _get_data_loc($fname)
protected function _get_data_loc($fname)
{
return dirname(__FILE__).'/'.$fname;
}
@ -216,9 +204,8 @@ class Text_LanguageDetect
*
* @return array the language model data
* @throws Text_LanguageDetect_Exception
* @access private
*/
function _readdb($fname)
protected function _readdb($fname)
{
// finds the correct data dir
$fname = $this->_get_data_loc($fname);
@ -246,9 +233,8 @@ class Text_LanguageDetect
* @param array $trigram Trigram data from database
*
* @return void
* @access private
*/
function _checkTrigram($trigram)
protected function _checkTrigram($trigram)
{
if (!is_array($trigram)) {
if (ini_get('magic_quotes_runtime')) {
@ -340,11 +326,10 @@ class Text_LanguageDetect
/**
* Returns the number of languages that this object can detect
*
* @access public
* @return int the number of languages
* @throws Text_LanguageDetect_Exception
* @throws Text_LanguageDetect_Exception
*/
function getLanguageCount()
public function getLanguageCount()
{
return count($this->_lang_db);
}
@ -382,11 +367,10 @@ class Text_LanguageDetect
/**
* Returns the list of detectable languages
*
* @access public
* @return array the names of the languages known to this object
* @throws Text_LanguageDetect_Exception
* @throws Text_LanguageDetect_Exception
*/
function getLanguages()
public function getLanguages()
{
return $this->_convertToNameMode(
array_keys($this->_lang_db)
@ -424,7 +408,7 @@ class Text_LanguageDetect
*
* @return void
*/
function setNameMode($name_mode)
public function setNameMode($name_mode)
{
$this->_name_mode = $name_mode;
}
@ -454,10 +438,9 @@ class Text_LanguageDetect
* @param string $text text to convert
*
* @return array array of trigram frequencies
* @access private
* @deprecated Superseded by the Text_LanguageDetect_Parser class
*/
function _trigram($text)
protected function _trigram($text)
{
$s = new Text_LanguageDetect_Parser($text);
$s->prepareTrigram();
@ -475,9 +458,8 @@ class Text_LanguageDetect
* @param array $arr array of trigram
*
* @return array ranks of trigrams
* @access protected
*/
function _arr_rank($arr)
protected function _arr_rank($arr)
{
// sorts alphabetically first as a standard way of breaking rank ties
@ -505,12 +487,11 @@ class Text_LanguageDetect
/**
* Sorts an array by value breaking ties alphabetically
*
* @param array &$arr the array to sort
* @param array $arr the array to sort
*
* @return void
* @access private
*/
function _bub_sort(&$arr)
protected function _bub_sort(&$arr)
{
// should do the same as this perl statement:
// sort { $trigrams{$b} == $trigrams{$a}
@ -548,9 +529,8 @@ class Text_LanguageDetect
*
* @return int 1 if $a is greater, -1 if not
* @see _bub_sort()
* @access private
*/
function _sort_func($a, $b)
protected function _sort_func($a, $b)
{
// each is actually a key/value pair, so that it can compare using both
list($a_key, $a_value) = $a;
@ -588,9 +568,8 @@ class Text_LanguageDetect
*
* @return int the sum of the differences between the ranks of
* the two trigram sets
* @access private
*/
function _distance($arr1, $arr2)
protected function _distance($arr1, $arr2)
{
$sumdist = 0;
@ -621,9 +600,8 @@ class Text_LanguageDetect
*
* @return float the normalized score
* @see _distance()
* @access private
*/
function _normalize_score($score, $base_count = null)
protected function _normalize_score($score, $base_count = null)
{
if ($base_count === null) {
$base_count = $this->_threshold;
@ -699,7 +677,7 @@ class Text_LanguageDetect
$sample_obj->setPadStart(!$this->_perl_compatible);
$sample_obj->analyze();
$trigram_freqs =& $sample_obj->getTrigramRanks();
$trigram_freqs = $sample_obj->getTrigramRanks();
$trigram_count = count($trigram_freqs);
if ($trigram_count == 0) {
@ -710,7 +688,7 @@ class Text_LanguageDetect
// use unicode block detection to narrow down the possibilities
if ($this->_use_unicode_narrowing) {
$blocks =& $sample_obj->getUnicodeBlocks();
$blocks = $sample_obj->getUnicodeBlocks();
if (is_array($blocks)) {
$present_blocks = array_keys($blocks);
@ -962,16 +940,15 @@ class Text_LanguageDetect
*
* @return mixed Block name, -1 if it failed
* @see unicodeBlockName()
* @access protected
*/
function _unicode_block_name($unicode, $blocks, $block_count = -1)
protected function _unicode_block_name($unicode, $blocks, $block_count = -1)
{
// for a reference, see
// http://www.unicode.org/Public/UNIDATA/Blocks.txt
// assume that ascii characters are the most common
// so try it first for efficiency
if ($unicode <= hexdec($blocks[0][1])) {
if ($unicode <= $blocks[0][1]) {
return $blocks[0];
}
@ -989,11 +966,11 @@ class Text_LanguageDetect
while ($low <= $high) {
$mid = floor(($low + $high) / 2);
if ($unicode < hexdec($blocks[$mid][0])) {
if ($unicode < $blocks[$mid][0]) {
// if it's lower than the lower bound
$high = $mid - 1;
} elseif ($unicode > hexdec($blocks[$mid][1])) {
} elseif ($unicode > $blocks[$mid][1]) {
// if it's higher than the upper bound
$low = $mid + 1;
@ -1015,9 +992,8 @@ class Text_LanguageDetect
*
* @return array the database of unicode block definitions
* @throws Text_LanguageDetect_Exception
* @access protected
*/
function _read_unicode_block_db()
protected function _read_unicode_block_db()
{
// since the unicode definitions are always going to be the same,
// might as well share the memory for the db with all other instances
@ -1136,14 +1112,13 @@ class Text_LanguageDetect
* Uses a nearest neighbor technique to generate the maximum possible
* number of dendrograms from the similarity data.
*
* @access public
* @return array language cluster data
* @throws Text_LanguageDetect_Exception
* @see languageSimilarity()
* @deprecated this function will eventually be removed and placed into
* @return array language cluster data
* @throws Text_LanguageDetect_Exception
* @see languageSimilarity()
* @deprecated this function will eventually be removed and placed into
* the model generation class
*/
function clusterLanguages()
public function clusterLanguages()
{
// todo: set the maximum number of clusters
// return cached result, if any
@ -1452,7 +1427,7 @@ class Text_LanguageDetect
}
/**
* ut8-safe strlen()
* UTF8-safe strlen()
*
* Returns the numbers of characters (not bytes) in a utf8 string
*
@ -1476,10 +1451,9 @@ class Text_LanguageDetect
* @param string $char a utf8 (possibly multi-byte) char
*
* @return int unicode value
* @access protected
* @link http://en.wikipedia.org/wiki/UTF-8
*/
function _utf8char2unicode($char)
protected function _utf8char2unicode($char)
{
// strlen() here will actually get the binary length of a single char
switch (strlen($char)) {
@ -1516,20 +1490,19 @@ class Text_LanguageDetect
}
/**
* utf8-safe fast character iterator
* UTF8-safe fast character iterator
*
* Will get the next character starting from $counter, which will then be
* incremented. If a multi-byte char the bytes will be concatenated and
* $counter will be incremented by the number of bytes in the char.
*
* @param string $str the string being iterated over
* @param int &$counter the iterator, will increment by reference
* @param int $counter the iterator, will increment by reference
* @param bool $special_convert whether to do special conversions
*
* @return char the next (possibly multi-byte) char from $counter
* @access private
*/
static function _next_char($str, &$counter, $special_convert = false)
protected static function _next_char($str, &$counter, $special_convert = false)
{
$char = $str{$counter++};
$ord = ord($char);
@ -1621,7 +1594,7 @@ class Text_LanguageDetect
*
* @return string|array Language name
*/
function _convertFromNameMode($lang, $convertKey = false)
protected function _convertFromNameMode($lang, $convertKey = false)
{
if ($this->_name_mode == 0) {
return $lang;
@ -1661,7 +1634,7 @@ class Text_LanguageDetect
*
* @return string|array Language name
*/
function _convertToNameMode($lang, $convertKey = false)
protected function _convertToNameMode($lang, $convertKey = false)
{
if ($this->_name_mode == 0) {
return $lang;
@ -1688,6 +1661,4 @@ class Text_LanguageDetect
}
return $newlang;
}
}
/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */
}

View File

@ -1,4 +1,16 @@
<?php
/**
* Part of Text_LanguageDetect
*
* PHP version 5
*
* @category Text
* @package Text_LanguageDetect
* @author Nicholas Pisarro <infinityminusnine+pear@gmail.com>
* @license BSD http://www.opensource.org/licenses/bsd-license.php
* @link http://pear.php.net/package/Text_LanguageDetect/
*/
class Text_LanguageDetect_Exception extends Exception
{
/**

View File

@ -1,18 +1,4 @@
<?php
/**
* Part of Text_LanguageDetect
*
* PHP version 5
*
* @category Text
* @package Text_LanguageDetect
* @author Christian Weiske <cweiske@php.net>
* @copyright 2011 Christian Weiske <cweiske@php.net>
* @license http://www.debian.org/misc/bsd.license BSD
* @version SVN: $Id$
* @link http://pear.php.net/package/Text_LanguageDetect/
*/
/**
* Provides a mapping between the languages from lang.dat and the
* ISO 639-1 and ISO-639-2 codes.
@ -23,7 +9,7 @@
* @package Text_LanguageDetect
* @author Christian Weiske <cweiske@php.net>
* @copyright 2011 Christian Weiske <cweiske@php.net>
* @license http://www.debian.org/misc/bsd.license BSD
* @license BSD http://www.opensource.org/licenses/bsd-license.php
* @link http://www.loc.gov/standards/iso639-2/php/code_list.php
*/
class Text_LanguageDetect_ISO639

View File

@ -1,18 +1,4 @@
<?php
/**
* This class represents a text sample to be parsed.
*
* @category Text
* @package Text_LanguageDetect
* @author Nicholas Pisarro
* @copyright 2006
* @license BSD
* @version CVS: $Id: Parser.php 322327 2012-01-15 17:55:59Z cweiske $
* @link http://pear.php.net/package/Text_LanguageDetect/
* @link http://langdetect.blogspot.com/
*/
/**
* This class represents a text sample to be parsed.
*
@ -20,99 +6,106 @@
* class. After a new profile has been built, the data can be retrieved using
* the accessor functions.
*
* This class is intended to be used by the Text_LanguageDetect class, not
* This class is intended to be used by the Text_LanguageDetect class, not
* end-users.
*
* @category Text
* @package Text_LanguageDetect
* @author Nicholas Pisarro
* @copyright 2006
* @license BSD
* @version release: 0.3.0
* @category Text
* @package Text_LanguageDetect
* @author Nicholas Pisarro <infinityminusnine+pear@gmail.com>
* @copyright 2006 Nicholas Pisarro
* @license BSD http://www.opensource.org/licenses/bsd-license.php
* @version Release: 1.0.0
* @link http://pear.php.net/package/Text_LanguageDetect/
*/
class Text_LanguageDetect_Parser extends Text_LanguageDetect
{
/**
* the piece of text being parsed
* The piece of text being parsed
*
* @access private
* @var string
* @var string
*/
var $_string;
protected $_string;
/**
* stores the trigram frequencies of the sample
* Stores the trigram frequencies of the sample
*
* @access private
* @var string
* @var string
*/
var $_trigrams = array();
protected $_trigrams = array();
/**
* stores the trigram ranks of the sample
* Stores the trigram ranks of the sample
*
* @access private
* @var array
* @var array
*/
var $_trigram_ranks = array();
protected $_trigram_ranks = array();
/**
* stores the unicode blocks of the sample
* Stores the unicode blocks of the sample
*
* @access private
* @var array
* @var array
*/
var $_unicode_blocks = array();
protected $_unicode_blocks = array();
/**
* Whether the parser should compile the unicode ranges
*
* @access private
* @var bool
*
* @var bool
*/
var $_compile_unicode = false;
protected $_compile_unicode = false;
/**
* Whether the parser should compile trigrams
*
* @access private
* @var bool
* @var bool
*/
var $_compile_trigram = false;
protected $_compile_trigram = false;
/**
* Whether the trigram parser should pad the beginning of the string
*
* @access private
* @var bool
* @var bool
*/
var $_trigram_pad_start = false;
protected $_trigram_pad_start = false;
/**
* Whether the unicode parser should skip non-alphabetical ascii chars
*
* @access private
* @var bool
* @var bool
*/
var $_unicode_skip_symbols = true;
protected $_unicode_skip_symbols = true;
/**
* Constructor
*
* @access private
* @param string $string string to be parsed
* @param string $string string to be parsed
*/
function __construct($string) {
public function __construct($string)
{
$this->_string = $string;
}
/**
* PHP 4 constructor for backwards compatibility.
*
* @param string $string string to be parsed
*
* @return void
*/
public function Text_LanguageDetect_Parser($string)
{
// Delegate to __construct() so both constructor styles initialise the object identically.
self::__construct($string);
}
/**
* Returns true if a string is suitable for parsing
*
* @param string $str input string to test
* @return bool true if acceptable, false if not
* @param string $str input string to test
*
* @return bool true if acceptable, false if not
*/
public static function validateString($str) {
public static function validateString($str)
{
if (!empty($str) && strlen($str) > 3 && preg_match('/\S/', $str)) {
return true;
} else {
@ -121,34 +114,37 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect
}
/**
* turn on/off trigram counting
* Turn on/off trigram counting
*
* @access public
* @param bool $bool true for on, false for off
* @param bool $bool true for on, false for off
*
* @return void
*/
function prepareTrigram($bool = true)
public function prepareTrigram($bool = true)
{
$this->_compile_trigram = $bool;
}
/**
* turn on/off unicode block counting
* Turn on/off unicode block counting
*
* @access public
* @param bool $bool true for on, false for off
* @param bool $bool true for on, false for off
*
* @return void
*/
function prepareUnicode($bool = true)
public function prepareUnicode($bool = true)
{
$this->_compile_unicode = $bool;
}
/**
* turn on/off padding the beginning of the sample string
* Turn on/off padding the beginning of the sample string
*
* @access public
* @param bool $bool true for on, false for off
* @param bool $bool true for on, false for off
*
* @return void
*/
function setPadStart($bool = true)
public function setPadStart($bool = true)
{
$this->_trigram_pad_start = $bool;
}
@ -156,10 +152,11 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect
/**
* Should the unicode block counter skip non-alphabetical ascii chars?
*
* @access public
* @param bool $bool true for on, false for off
* @param bool $bool true for on, false for off
*
* @return void
*/
function setUnicodeSkipSymbols($bool = true)
public function setUnicodeSkipSymbols($bool = true)
{
$this->_unicode_skip_symbols = $bool;
}
@ -167,10 +164,9 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect
/**
* Returns the trigram ranks for the text sample
*
* @access public
* @return array trigram ranks in the text sample
* @return array Trigram ranks in the text sample
*/
function &getTrigramRanks()
public function getTrigramRanks()
{
return $this->_trigram_ranks;
}
@ -178,39 +174,37 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect
/**
* Return the trigram frequency table
*
* only used in testing to make sure the parser is working
* Only used in testing to make sure the parser is working
*
* @access public
* @return array trigram frequencies in the text sample
* @return array Trigram frequencies in the text sample
*/
function &getTrigramFreqs()
public function getTrigramFreqs()
{
return $this->_trigram;
}
/**
* returns the array of unicode blocks
* Returns the array of unicode blocks
*
* @access public
* @return array unicode blocks in the text sample
* @return array Unicode blocks in the text sample
*/
function &getUnicodeBlocks()
public function getUnicodeBlocks()
{
return $this->_unicode_blocks;
}
/**
* Executes the parsing operation
*
* Be sure to call the set*() functions to set options and the
*
* Be sure to call the set*() functions to set options and the
* prepare*() functions first to tell it what kind of data to compute
*
* Afterwards the get*() functions can be used to access the compiled
* information.
*
* @access public
* @return void
*/
function analyze()
public function analyze()
{
$len = strlen($this->_string);
$byte_counter = 0;
@ -258,9 +252,9 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect
if ($this->_compile_trigram) {
if (!($b == ' ' && ($a == ' ' || $char == ' '))) {
if (!isset($this->_trigram[$a . $b . $char])) {
$this->_trigram[$a . $b . $char] = 1;
$this->_trigram[$a . $b . $char] = 1;
} else {
$this->_trigram[$a . $b . $char]++;
$this->_trigram[$a . $b . $char]++;
}
}
@ -271,10 +265,11 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect
// unicode block detection
if ($this->_compile_unicode) {
if ($this->_unicode_skip_symbols
&& strlen($char) == 1
&& ($char < 'A' || $char > 'z'
|| ($char > 'Z' && $char < 'a'))
&& $char != "'") { // does not skip the apostrophe
&& strlen($char) == 1
&& ($char < 'A' || $char > 'z'
|| ($char > 'Z' && $char < 'a'))
&& $char != "'"
) { // does not skip the apostrophe
// since it's included in the language
// models
@ -297,7 +292,8 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect
if ($this->_compile_unicode) {
foreach ($unicode_chars as $utf8_char => $count) {
$search_result = $this->_unicode_block_name(
$this->_utf8char2unicode($utf8_char), $blocks, $block_count);
$this->_utf8char2unicode($utf8_char), $blocks, $block_count
);
if ($search_result != -1) {
$block_name = $search_result[2];
@ -342,6 +338,4 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect
}
}
}
}
/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */
}

File diff suppressed because one or more lines are too long

View File

@ -122,6 +122,7 @@ class Readability
if ($parser=='gumbo') {
// Can we avoid this encoding/decoding step? Test on:
// http://www.medialens.org/index.php/alerts/alert-archive/2017/837-undermining-democracy-corporate-media-bias-on-jeremy-corbyn-boris-johnson-and-syria.html
$html = str_replace('&apos;', "'", $html); // other named entities handled okay
$html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
$html = mb_convert_encoding($html, "UTF-8", 'HTML-ENTITIES');
$this->dom = @Layershifter\Gumbo\Parser::load($html);

View File

@ -3,8 +3,8 @@
// Author: Keyvan Minoukadeh
// Copyright (c) 2017 Keyvan Minoukadeh
// License: AGPLv3
// Version: 3.7
// Date: 2017-02-12
// Version: 3.8
// Date: 2017-09-25
// More info: http://fivefilters.org/content-only/
// Help: http://help.fivefilters.org
@ -183,7 +183,9 @@ if (!isset($_REQUEST['url'])) {
die('No URL supplied');
}
$url = trim($_REQUEST['url']);
if (strtolower(substr($url, 0, 7)) == 'feed://') {
if (strtolower(substr($url, 0, 6)) == 'sec://') {
$url = 'https://'.substr($url, 6);
} elseif (strtolower(substr($url, 0, 7)) == 'feed://') {
$url = 'http://'.substr($url, 7);
}
if (!preg_match('!^https?://.+!i', $url)) {
@ -345,10 +347,10 @@ if ($options->content === 'user') {
// HTML5 output?
///////////////////////////////////////////////
if ($options->html5_output === 'user') {
if (isset($_REQUEST['content']) && $_REQUEST['content'] === 'html5') {
$options->html5_output = true;
} else {
if (isset($_REQUEST['content']) && $_REQUEST['content'] === '1') {
$options->html5_output = false;
} else {
$options->html5_output = true;
}
}
@ -820,7 +822,7 @@ foreach ($items as $key => $item) {
continue; // skip this feed item entry
}
}
$base_url = get_base_url($readability->dom);
$base_url = get_base_url($readability->dom, $effective_url);
if (!$base_url) $base_url = $effective_url;
$content_block = ($extract_result) ? $extractor->getContent() : null;
$extracted_title = ($extract_result) ? $extractor->getTitle() : '';
@ -945,6 +947,7 @@ foreach ($items as $key => $item) {
//unset($content_block);
// post-processing cleanup
$html = preg_replace('!<p>[\s\h\v]*</p>!u', '', $html);
$html = str_replace('<p>&nbsp;</p>', '', $html);
if ($links == 'remove') {
$html = preg_replace('!<a\s+[^>]*>!', '', $html);
$html = preg_replace('!</a>!', '', $html);
@ -1080,6 +1083,7 @@ foreach ($items as $key => $item) {
$l_result = $l->detect($text_sample, 1);
if (count($l_result) > 0) {
$language = key($l_result);
debug('Language detected: '.$language);
}
}
} catch (Exception $e) {
@ -1248,6 +1252,17 @@ function get_self_url() {
}
function validate_url($url) {
if (function_exists('idn_to_ascii')) {
if ($host = @parse_url($url, PHP_URL_HOST)) {
$puny = idn_to_ascii($host, 0, INTL_IDNA_VARIANT_UTS46);
if ($host != $puny) {
$pos = strpos($url, $host);
if ($pos !== false) {
$url = substr_replace($url, $puny, $pos, strlen($host));
}
}
}
}
$url = filter_var($url, FILTER_SANITIZE_URL);
$test = filter_var($url, FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED);
// deal with bug http://bugs.php.net/51192 (present in PHP 5.2.13 and PHP 5.3.2)
@ -1261,9 +1276,14 @@ function validate_url($url) {
}
}
function get_base_url($dom) {
function get_base_url($dom, $url=null) {
$xpath = new DOMXPath($dom);
return @$xpath->evaluate('string(//head/base/@href)', $dom);
$base = @$xpath->evaluate('string(//head/base/@href)', $dom);
if (!$base) return false;
if (isset($url) && !preg_match('!^https?://!i', $base)) {
$base = make_absolute_str($url, $base);
}
return $base;
}
function is_ssl() {
@ -1436,7 +1456,7 @@ function make_absolute_attr($base, $e, $attr) {
$url = str_replace(' ', '%20', $url);
if (!preg_match('!https?://!i', $url)) {
if ($absolute = SimplePie_IRI::absolutize($base, $url)) {
$e->setAttribute($attr, $absolute);
$e->setAttribute($attr, $absolute->get_uri());
}
}
}
@ -1450,7 +1470,7 @@ function make_absolute_str($base, $url) {
return $url;
} else {
if ($absolute = SimplePie_IRI::absolutize($base, $url)) {
return $absolute;
return $absolute->get_uri();
}
return false;
}
@ -1529,7 +1549,7 @@ function get_single_page($item, $html, $url) {
}
}
}
$base_url = get_base_url($readability->dom);
$base_url = get_base_url($readability->dom, $url);
if (!$base_url) $base_url = $url;
// If we've got URL, resolve against $base_url
if (isset($single_page_url) && ($single_page_url = make_absolute_str($base_url, $single_page_url))) {