diff --git a/changelog.txt b/changelog.txt
index 9918041..c66978a 100644
--- a/changelog.txt
+++ b/changelog.txt
@@ -2,6 +2,22 @@ FiveFilters.org: Full-Text RSS
http://fivefilters.org/content-only/
CHANGELOG
------------------------------------
+3.8 (2017-09-25)
+ - New site config directive: strip_attr: XPath attribute selector (e.g. //img/@srcset) - remove attribute from element
+ - New site config directive: insert_detected_image: yes/no (default yes) - inserts the image found in og:image into the body if no other images were extracted
+ - Bug fix: Better handling of Internationalized Domain Names (IDNs)
+ - Bug fix: Relative base URLs (<base href="...">) now resolved against page URL
+ - Bug fix: Wrong site config file chosen in certain cases (when wildcard and exact subdomain files available and cached in APCu)
+ - Bug fix: &apos; HTML entities not converted correctly when parsing with Gumbo PHP
+ - Remove srcset (+ sizes) attributes on img elements if it looks like they only contain relative URLs (browser will use src attribute value instead)
+ - https:// URLs now re-written to sec:// before being submitted to avoid overzealous security software blocking request on some servers - no redirect, only affects newly submitted URLs on index.php
+ - HTML5-PHP library updated
+ - Language Detect library updated
+ - Site config files updated for better extraction
+ - Minimum PHP version is now 5.4. If you must use PHP 5.3, please stick with Full-Text RSS 3.7
+ - Tested with PHP 7.2
+ - Other fixes/improvements
+
3.7 (2017-02-12)
- Request HTML5 output using HTML5-PHP - new config option $options->html5_output and new request parameter &content=html5
- Improve support for lazy-loading images
@@ -23,31 +39,31 @@ CHANGELOG
- Other fixes/improvements
3.6 (2016-02-21)
-- Insert og:image (if we find one) at the top of the article when no images have been extracted
-- Additional lazy image load handling - helps preserve more images designed for JS-enabled browsers
-- Original GUID values from feed items now preserved
-- New config option favour_effective_url determines if item's effective URL (after redirects) should replace original item URL in feed output
-- Adding &use_effective_url to querystring will replace original feed item URL with effective URL (unless disabled with config option above)
-- APCu stats view in admin panel fixed to work with recent versions of APCu
-- HTML5-PHP library updated
-- Tested for PHP 7 compatibility
-- VPS Puppet script (ubuntu-15.10.pp) updated - fixes issue with IDN encodings, among other things. (This is intended for setting up a new Ubuntu 15.10 instance for running Full-Text RSS.)
-- Site config files updated for better extraction
-- Other minor fixes/improvements
+ - Insert og:image (if we find one) at the top of the article when no images have been extracted
+ - Additional lazy image load handling - helps preserve more images designed for JS-enabled browsers
+ - Original GUID values from feed items now preserved
+ - New config option favour_effective_url determines if item's effective URL (after redirects) should replace original item URL in feed output
+ - Adding &use_effective_url to querystring will replace original feed item URL with effective URL (unless disabled with config option above)
+ - APCu stats view in admin panel fixed to work with recent versions of APCu
+ - HTML5-PHP library updated
+ - Tested for PHP 7 compatibility
+ - VPS Puppet script (ubuntu-15.10.pp) updated - fixes issue with IDN encodings, among other things. (This is intended for setting up a new Ubuntu 15.10 instance for running Full-Text RSS.)
+ - Site config files updated for better extraction
+ - Other minor fixes/improvements
3.5 (2015-06-13)
-- Open Graph properties og:title, og:type, og:url, og:image, and og:description now returned if found in the page being processed
-- Bug fix: certain XPath expressions weren't being evaluated correctly when HTML5 parsing was enabled
-- Cookie handling now only on redirects - fixes issue with certain sites (thanks to Dave Vasilevsky)
-- Compatibility test will no longer show HHVM as incompatible - Full-Text RSS worked with HHVM 3.7.1 in our tests (but without Tidy support and no automatic site config updates)
-- Humble HTTP Agent updated to support version 2 of PHP's HTTP extension
-- HTML5-PHP library updated
-- Site config files can now include HTTP headers (user-agent, cookie, referer), e.g. http_header(user-agent): PHP/5.6
-- Config option removed: $options->user_agents - use site config files.
-- Site config files which use single_page_link can now follow it with if_page_contains: XPath to make it conditional.
-- Minimum supported PHP version is now 5.3. If you must use PHP 5.2, please download Full-Text RSS 3.4
-- Site config files updated for better extraction
-- Other minor fixes/improvements
+ - Open Graph properties og:title, og:type, og:url, og:image, and og:description now returned if found in the page being processed
+ - Bug fix: certain XPath expressions weren't being evaluated correctly when HTML5 parsing was enabled
+ - Cookie handling now only on redirects - fixes issue with certain sites (thanks to Dave Vasilevsky)
+ - Compatibility test will no longer show HHVM as incompatible - Full-Text RSS worked with HHVM 3.7.1 in our tests (but without Tidy support and no automatic site config updates)
+ - Humble HTTP Agent updated to support version 2 of PHP's HTTP extension
+ - HTML5-PHP library updated
+ - Site config files can now include HTTP headers (user-agent, cookie, referer), e.g. http_header(user-agent): PHP/5.6
+ - Config option removed: $options->user_agents - use site config files.
+ - Site config files which use single_page_link can now follow it with if_page_contains: XPath to make it conditional.
+ - Minimum supported PHP version is now 5.3. If you must use PHP 5.2, please download Full-Text RSS 3.4
+ - Site config files updated for better extraction
+ - Other minor fixes/improvements
3.4 (2014-09-08)
- New request parameter: siteconfig lets you submit extraction rules directly in request
diff --git a/config.php b/config.php
index 06f6ec2..6860d36 100644
--- a/config.php
+++ b/config.php
@@ -61,16 +61,15 @@ $options->content = 'user';
// HTML5 output
// ----------------------
-// By default, Full-Text RSS uses libxml to convert the parsed DOM tree back into HTML.
-// If this is enabled, we'll use HTML5-PHP to produce the HTML. This will be a little
-// slower, but might produce better results, adhering to the HTML5 spec.
-//
-// Note: in a future release we might make HTML5 output the default.
+// Full-Text RSS used to rely on libxml to output HTML extracted from
+// a web page. Since version 3.8 we use HTML5-PHP by default.
+// If you prefer the old output, either set this to false or pass &content=1
+// in the querystring.
//
// Possible values...
// HTML5 (slower): true
// libxml (faster): false
-// libxml unless user overrides (&content=html5): 'user' (default)
+// HTML5 unless user overrides (&content=1): 'user' (default)
$options->html5_output = 'user';
// Excerpts
@@ -524,7 +523,7 @@ $options->cache_cleanup = 100;
/// DO NOT CHANGE ANYTHING BELOW THIS ///////////
/////////////////////////////////////////////////
-if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '3.7');
+if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '3.8');
if (basename(__FILE__) == 'config.php') {
if (file_exists(dirname(__FILE__).'/custom_config.php')) {
diff --git a/ftr_compatibility_test.php b/ftr_compatibility_test.php
index 98ac060..4ce9161 100644
--- a/ftr_compatibility_test.php
+++ b/ftr_compatibility_test.php
@@ -16,12 +16,12 @@ SimplePie.org. We have kept most of their checks intact as we use SimplePie in o
http://github.com/simplepie/simplepie/tree/master/compatibility_test/
*/
-$app_name = 'Full-Text RSS 3.7';
+$app_name = 'Full-Text RSS 3.8';
// Full-Text RSS is not yet compatible with HHVM, that's why we check for it with HHVM_VERSION.
//$php_ok = (function_exists('version_compare') && version_compare(phpversion(), '5.2.0', '>=') && !defined('HHVM_VERSION'));
// HHVM works okay, but no Tidy and autoupdate of site config files not working (tested 3.7.1)
-$php_ok = (function_exists('version_compare') && version_compare(phpversion(), '5.3.0', '>='));
+$php_ok = (function_exists('version_compare') && version_compare(phpversion(), '5.4.0', '>='));
$pcre_ok = extension_loaded('pcre');
$zlib_ok = extension_loaded('zlib');
$mbstring_ok = extension_loaded('mbstring');
@@ -32,6 +32,7 @@ $parallel_ok = ((extension_loaded('http') && class_exists('http\Client\Request')
$allow_url_fopen_ok = (bool)ini_get('allow_url_fopen');
$filter_ok = extension_loaded('filter');
$gumbo_ok = class_exists('Layershifter\Gumbo\Parser');
+$idn_ok = function_exists('idn_to_ascii');
if (extension_loaded('xmlreader')) {
$xml_ok = true;
@@ -204,7 +205,7 @@ div.chunk {
PHP |
- 5.3 or higher |
+ 5.4 or higher |
|
@@ -354,6 +355,11 @@ div.chunk {
Further info
+
+
IDN support
+
When treating an internationalized domain name (IDN) Full-Text RSS will try to make use of PHP's idn_to_ascii
function to convert the domain to ASCII. If this function does not exist, you might have trouble retrieving article content from internationalized domains.
+
idn_to_ascii is not'; ?> available on this server.
+
HTTP module
Full-Text RSS can make use of PHP's HTTP extension or curl_multi
to make parallel HTTP requests when processing feeds. If neither are available, it will make sequential requests using file_get_contents
.
content |
-
0, 1 (default), html5 |
-
If set to 0, the extracted content will not be included in the output. If set to html5, we'll output HTML5. |
+
0, 1, html5 (default) |
+
If set to 0, the extracted content will not be included in the output. If set to 1, we'll use regular libxml output - might not be HTML5 compliant. |
diff --git a/libraries/content-extractor/ContentExtractor.php b/libraries/content-extractor/ContentExtractor.php
index 2519f2e..7f8c652 100644
--- a/libraries/content-extractor/ContentExtractor.php
+++ b/libraries/content-extractor/ContentExtractor.php
@@ -5,8 +5,8 @@
* Uses patterns specified in site config files and auto detection (hNews/PHP Readability)
* to extract content from HTML files.
*
- * @version 1.3
- * @date 2017-02-12
+ * @version 1.4
+ * @date 2017-09-25
* @author Keyvan Minoukadeh
* @copyright 2017 Keyvan Minoukadeh
* @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
@@ -107,24 +107,13 @@ class ContentExtractor
}
// returns SiteConfig instance (joined in order: exact match, wildcard, fingerprint, global, default)
- public function buildSiteConfig($url, $html='', $add_to_cache=true) {
+ public function buildSiteConfig($url, $html='') {
// extract host name
$host = @parse_url($url, PHP_URL_HOST);
$host = strtolower($host);
if (substr($host, 0, 4) == 'www.') $host = substr($host, 4);
- // is merged version already cached?
- if (SiteConfig::is_cached("$host.merged")) {
- $config = SiteConfig::build("$host.merged");
- if ($config) {
- $this->debug("Returning cached and merged site config for $host");
- return $config;
- }
- }
// let's build from site_config/custom/ and standard/
$config = SiteConfig::build($host);
- if ($add_to_cache && $config && !SiteConfig::is_cached("$host")) {
- SiteConfig::add_to_cache($host, $config);
- }
// if no match, use defaults
if (!$config) $config = new SiteConfig();
// load fingerprint config?
@@ -134,10 +123,6 @@ class ContentExtractor
if ($config_fingerprint = SiteConfig::build($_fphost)) {
$this->debug("Appending site config settings from $_fphost (fingerprint match)");
$config->append($config_fingerprint);
- if ($add_to_cache && !SiteConfig::is_cached($_fphost)) {
- //$config_fingerprint->cache_in_apc = true;
- SiteConfig::add_to_cache($_fphost, $config_fingerprint);
- }
}
}
}
@@ -146,19 +131,8 @@ class ContentExtractor
if ($config_global = SiteConfig::build('global', true)) {
$this->debug('Appending site config settings from global.txt');
$config->append($config_global);
- if ($add_to_cache && !SiteConfig::is_cached('global')) {
- //$config_global->cache_in_apc = true;
- SiteConfig::add_to_cache('global', $config_global);
- }
}
}
- // store copy of merged config
- if ($add_to_cache) {
- // do not store in APC if wildcard match
- $use_apc = ($host == $config->cache_key);
- $config->cache_key = null;
- SiteConfig::add_to_cache("$host.merged", $config, $use_apc);
- }
return $config;
}
@@ -398,10 +372,14 @@ class ContentExtractor
$elems = @$xpath->query($pattern, $this->readability->dom);
// check for matches
if ($elems && $elems->length > 0) {
- $this->debug('Stripping '.$elems->length.' elements (strip)');
+ $this->debug('Stripping '.$elems->length.' elements (strip: '.$pattern.')');
for ($i=$elems->length-1; $i >= 0; $i--) {
if ($elems->item($i)->parentNode) {
- $elems->item($i)->parentNode->removeChild($elems->item($i));
+ if ($elems->item($i) instanceof DOMAttr) {
+ $elems->item($i)->parentNode->removeAttributeNode($elems->item($i));
+ } else {
+ $elems->item($i)->parentNode->removeChild($elems->item($i));
+ }
}
}
}
@@ -413,7 +391,7 @@ class ContentExtractor
$elems = @$xpath->query("//*[contains(@class, '$string') or contains(@id, '$string')]", $this->readability->dom);
// check for matches
if ($elems && $elems->length > 0) {
- $this->debug('Stripping '.$elems->length.' elements (strip_id_or_class)');
+ $this->debug('Stripping '.$elems->length.' elements (strip_id_or_class: '.$string.')');
for ($i=$elems->length-1; $i >= 0; $i--) {
$elems->item($i)->parentNode->removeChild($elems->item($i));
}
@@ -426,12 +404,13 @@ class ContentExtractor
$elems = @$xpath->query("//img[contains(@src, '$string')]", $this->readability->dom);
// check for matches
if ($elems && $elems->length > 0) {
- $this->debug('Stripping '.$elems->length.' image elements');
+ $this->debug('Stripping '.$elems->length.' elements (strip_image_src: '.$string.')');
for ($i=$elems->length-1; $i >= 0; $i--) {
$elems->item($i)->parentNode->removeChild($elems->item($i));
}
}
}
+
// strip elements using Readability.com and Instapaper.com ignore class names
// .entry-unrelated and .instapaper_ignore
// See https://www.readability.com/publishers/guidelines/#view-plainGuidelines
@@ -464,7 +443,22 @@ class ContentExtractor
$elems->item($i)->parentNode->removeChild($elems->item($i));
}
}
-
+
+ // strip img srcset/sizes attributes with relative URIs (src should be present and will be absolutised)
+ // TODO: absolutize srcset values rather than removing them
+ // To remove srcset from all image elements, site config files can contain: strip: //img/@srcset
+ $elems = $xpath->query("//img[@srcset and not(contains(@srcset, '//'))]", $this->readability->dom);
+ // check for matches
+ if ($elems && $elems->length > 0) {
+ $this->debug('Stripping '.$elems->length.' srcset attributes');
+ foreach ($elems as $elem) {
+ $elem->removeAttribute('srcset');
+ if ($elem->hasAttribute('sizes')) {
+ $elem->removeAttribute('sizes');
+ }
+ }
+ }
+
// try to get body
foreach ($this->config->body as $pattern) {
$elems = @$xpath->query($pattern, $this->readability->dom);
@@ -880,7 +874,7 @@ class ContentExtractor
}
} else {
// If there's an og:image, but we have no images in the article, let's place it at the beginning of the article.
- if ($this->body->hasChildNodes() && isset($this->opengraph['og:image']) && substr($this->opengraph['og:image'], 0, 4) === 'http') {
+ if ($this->config->insert_detected_image() && $this->body->hasChildNodes() && isset($this->opengraph['og:image']) && substr($this->opengraph['og:image'], 0, 4) === 'http') {
$elems = @$xpath->query(".//img", $this->body);
if ($elems->length === 0) {
$_new_elem = $this->body->ownerDocument->createDocumentFragment();
@@ -902,7 +896,7 @@ class ContentExtractor
return $this->success;
}
-
+
private function isDescendant(DOMElement $parent, DOMElement $child) {
$node = $child->parentNode;
while ($node != null) {
diff --git a/libraries/content-extractor/SiteConfig.php b/libraries/content-extractor/SiteConfig.php
index 3b90a75..fce2b04 100644
--- a/libraries/content-extractor/SiteConfig.php
+++ b/libraries/content-extractor/SiteConfig.php
@@ -5,10 +5,10 @@
* Each instance of this class should hold extraction patterns and other directives
* for a website. See ContentExtractor class to see how it's used.
*
- * @version 1.0
- * @date 2015-06-09
+ * @version 1.1
+ * @date 2017-09-25
* @author Keyvan Minoukadeh
- * @copyright 2015 Keyvan Minoukadeh
+ * @copyright 2017 Keyvan Minoukadeh
* @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
*/
@@ -43,7 +43,6 @@ class SiteConfig
// Process HTML with tidy before creating DOM (bool or null if undeclared)
public $tidy = null;
-
protected $default_tidy = true; // used if undeclared
// Autodetect title/body if xpath expressions fail to produce results.
@@ -93,6 +92,12 @@ class SiteConfig
public $parser = null;
protected $default_parser = 'libxml'; // used if undeclared
+ // Insert detected image (currently only og:image) into beginning of extracted article
+ // Only does this if extracted article contains no images
+ // bool or null if undeclared
+ public $insert_detected_image = null;
+ protected $default_insert_detected_image = true; // used if undeclared
+
// Strings to search for in HTML before processing begins (used with $replace_string)
public $find_string = array();
// Strings to replace those found in $find_string before HTML processing begins
@@ -101,10 +106,9 @@ class SiteConfig
// the options below cannot be set in the config files which this class represents
//public $cache_in_apc = false; // used to decide if we should cache in apc or not
- public $cache_key = null;
public static $debug = false;
protected static $apc = false;
- protected static $config_path;
+ protected static $config_path_custom;
protected static $config_path_fallback;
protected static $config_cache = array();
const HOSTNAME_REGEX = '/^(([a-zA-Z0-9-]*[a-zA-Z0-9])\.)*([A-Za-z0-9-]*[A-Za-z0-9])$/';
@@ -136,7 +140,13 @@ class SiteConfig
self::$apc = $apc;
return $apc;
}
-
+
+ // return bool or null
+ public function insert_detected_image($use_default=true) {
+ if ($use_default) return (isset($this->insert_detected_image)) ? $this->insert_detected_image : $this->default_insert_detected_image;
+ return $this->insert_detected_image;
+ }
+
// return bool or null
public function tidy($use_default=true) {
if ($use_default) return (isset($this->tidy)) ? $this->tidy : $this->default_tidy;
@@ -162,15 +172,32 @@ class SiteConfig
}
public static function set_config_path($path, $fallback=null) {
- self::$config_path = $path;
+ self::$config_path_custom = $path;
self::$config_path_fallback = $fallback;
}
-
+
+ protected static function load_cached_merged($host, $exact_host_match) {
+ if ($exact_host_match) {
+ $key = $host.'.merged.ex';
+ } else {
+ $key = $host.'.merged';
+ }
+ return self::load_cached($key);
+ }
+
+ protected static function add_to_cache_merged($host, $exact_host_match, SiteConfig $config=null) {
+ if ($exact_host_match) {
+ $key = $host.'.merged.ex';
+ } else {
+ $key = $host.'.merged';
+ }
+ if (!isset($config)) $config = new SiteConfig();
+ self::add_to_cache($key, $config);
+ }
+
public static function add_to_cache($key, SiteConfig $config, $use_apc=true) {
$key = strtolower($key);
if (substr($key, 0, 4) == 'www.') $key = substr($key, 4);
- if ($config->cache_key) $key = $config->cache_key;
- $key .= '.'.self::get_key_suffix();
self::$config_cache[$key] = $config;
if (self::$apc && $use_apc) {
self::debug("Adding site config to APC cache with key sc.$key");
@@ -178,10 +205,23 @@ class SiteConfig
}
self::debug("Cached site config with key $key");
}
-
+
+ public static function load_cached($key) {
+ $key = strtolower($key);
+ if (substr($key, 0, 4) == 'www.') $key = substr($key, 4);
+ //var_dump('in cache?', $key, self::$config_cache);
+ if (array_key_exists($key, self::$config_cache)) {
+ self::debug("... site config for $key already loaded in this request");
+ return self::$config_cache[$key];
+ } elseif (self::$apc && ($sconfig = apc_fetch("sc.$key"))) {
+ self::debug("... site config for $key found in APCu");
+ return $sconfig;
+ }
+ return false;
+ }
+
public static function is_cached($key) {
$key = strtolower($key);
- $key .= '.'.self::get_key_suffix();
if (substr($key, 0, 4) == 'www.') $key = substr($key, 4);
if (array_key_exists($key, self::$config_cache)) {
return true;
@@ -212,7 +252,7 @@ class SiteConfig
}
// check for single statement commands
// we do not overwrite existing non null values
- foreach (array('tidy', 'prune', 'parser', 'autodetect_on_failure') as $var) {
+ foreach (array('tidy', 'prune', 'parser', 'autodetect_on_failure', 'insert_detected_image') as $var) {
if ($this->$var === null) $this->$var = $newconfig->$var;
}
// treat find_string and replace_string separately (don't apply array_unique) (thanks fabrizio!)
@@ -222,16 +262,6 @@ class SiteConfig
$this->$var = array_merge($this->$var, $newconfig->$var);
}
}
-
- // This is used to make sure that when a different primary folder is chosen
- // The key for the cached result includes that folder choice.
- // Otherwise, a subsequent request choosing a different folder
- // could return the wrong cached config.
- public static function get_key_suffix() {
- $key_suffix = basename(self::$config_path);
- if ($key_suffix === 'custom') $key_suffix = '';
- return $key_suffix;
- }
// Add test_contains to last test_url
public function add_test_contains($test_contains) {
@@ -274,6 +304,12 @@ class SiteConfig
$host = strtolower($host);
if (substr($host, 0, 4) == 'www.') $host = substr($host, 4);
if (!$host || (strlen($host) > 200) || !preg_match(self::HOSTNAME_REGEX, ltrim($host, '.'))) return false;
+ // got a merged one?
+ $config = self::load_cached_merged($host, $exact_host_match);
+ if ($config) {
+ //self::debug('. returned merged config from a previous request');
+ return $config;
+ }
// check for site configuration
$try = array($host);
// should we look for wildcard matches
@@ -284,102 +320,87 @@ class SiteConfig
$try[] = '.'.implode('.', $split);
}
}
-
- // Which primary folder should we look inside?
- // If it's not the default ('custom'), we need
- // a key suffix to distinguish site config fules
- // held in this folder from those in other folders.
- $key_suffix = self::get_key_suffix();
- // look for site config file in primary folder
- self::debug(". looking for site config for $host in primary folder");
+ // look for site config file in custom folder
+ self::debug(". looking for site config for $host in custom folder");
+ //var_dump($try);
+ $config = null;
+ $config_std = null;
foreach ($try as $h) {
- $h_key = "$h.$key_suffix";
- if (array_key_exists($h_key, self::$config_cache)) {
- self::debug("... site config for $h already loaded in this request");
- return self::$config_cache[$h_key];
- } elseif (self::$apc && ($sconfig = apc_fetch("sc.$h_key"))) {
- self::debug("... site config for $h in APC cache");
- return $sconfig;
- } elseif (file_exists(self::$config_path."/$h.txt")) {
+ //$h_key = $h.'.'.$key_suffix;
+ $h_key = $h.'.custom';
+ //var_dump($h_key, $h);
+ if ($config = self::load_cached($h_key)) {
+ break;
+ } elseif (file_exists(self::$config_path_custom."/$h.txt")) {
self::debug("... found site config ($h.txt)");
- $file_primary = self::$config_path."/$h.txt";
- $matched_name = $h;
+ $file_custom = self::$config_path_custom."/$h.txt";
+ $config = self::build_from_file($file_custom);
+ //$matched_name = $h;
break;
}
}
// if we found site config, process it
- if (isset($file_primary)) {
- $config_lines = file($file_primary, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
- if (!$config_lines || !is_array($config_lines)) return false;
- $config = self::build_from_array($config_lines);
- // if APC caching is available and enabled, mark this for cache
- //$config->cache_in_apc = true;
- $config->cache_key = $matched_name;
-
- // if autodetec on failure is off (on by default) we do not need to look
- // in secondary folder
- if (!$config->autodetect_on_failure()) {
- self::debug('... autodetect on failure is disabled (no other site config files will be loaded)');
- return $config;
- }
+ // if autodetect on failure is off (on by default) we do not need to look
+ // in secondary folder
+ if ($config && !$config->autodetect_on_failure()) {
+ self::debug('... autodetect on failure is disabled (no other site config files will be loaded)');
+ self::add_to_cache_merged($host, $exact_host_match, $config);
+ return $config;
}
// look for site config file in secondary folder
if (isset(self::$config_path_fallback)) {
- self::debug(". looking for site config for $host in secondary folder");
+ self::debug(". looking for site config for $host in standard folder");
foreach ($try as $h) {
- if (file_exists(self::$config_path_fallback."/$h.txt")) {
- self::debug("... found site config in secondary folder ($h.txt)");
+ if ($config_std = self::load_cached($h)) {
+ break;
+ } elseif (file_exists(self::$config_path_fallback."/$h.txt")) {
+ self::debug("... found site config in standard folder ($h.txt)");
$file_secondary = self::$config_path_fallback."/$h.txt";
- $matched_name = $h;
+ $config_std = self::build_from_file($file_secondary);
break;
}
}
- if (!isset($file_secondary)) {
- self::debug("... no site config match in secondary folder");
- }
}
// return false if no config file found
- if (!isset($file_primary) && !isset($file_secondary)) {
+ if (!$config && !$config_std) {
self::debug("... no site config match for $host");
+ self::add_to_cache_merged($host, $exact_host_match);
return false;
}
- // return primary config if secondary not found
- if (!isset($file_secondary) && isset($config)) {
- return $config;
- }
-
- // process secondary config file
- $config_lines = file($file_secondary, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
- if (!$config_lines || !is_array($config_lines)) {
- // failed to process secondary
- if (isset($config)) {
- // return primary config
- return $config;
- } else {
- return false;
- }
- }
-
- // merge with primary and return
- if (isset($config)) {
+ // final config handling
+ $config_final = null;
+ if (!$config_std && $config) {
+ $config_final = $config;
+ // merge with primary
+ } elseif ($config_std && $config) {
self::debug('. merging config files');
- $config->append(self::build_from_array($config_lines));
- return $config;
+ $config->append($config_std);
+ $config_final = $config;
} else {
// return just secondary
- $config = self::build_from_array($config_lines);
+ //$config = self::build_from_array($config_lines);
// if APC caching is available and enabled, mark this for cache
//$config->cache_in_apc = true;
- $config->cache_key = $matched_name;
- return $config;
+ $config_final = $config_std;
}
+ self::add_to_cache_merged($host, $exact_host_match, $config_final);
+ return $config_final;
}
+ public static function build_from_file($path, $cache=true) {
+ $key = basename($path, '.txt');
+ $config_lines = file($path, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
+ if (!$config_lines || !is_array($config_lines)) return false;
+ $config = self::build_from_array($config_lines);
+ if ($cache) self::add_to_cache($key, $config);
+ return $config;
+ }
+
public static function build_from_string($string) {
$config_lines = explode("\n", $string);
return self::build_from_array($config_lines);
@@ -399,13 +420,23 @@ class SiteConfig
if (count($command) != 2) continue;
$val = trim($command[1]);
$command = trim($command[0]);
- if ($command == '' || $val == '') continue;
-
+ //if ($command == '' || $val == '') continue;
+ // $val can be empty, e.g. replace_string:
+ if ($command == '') continue;
+
+ // strip_attr is now an alias for strip.
+ // In FTR 3.8 we can strip attributes from elements, not only the elements themselves
+ // e.g. strip: //img/@srcset (removes srcset attribute from all img elements)
+ // but for backward compatibility (to avoid errors with new config files + old version of FTR)
+ // we've introduced strip_attr and we'll recommend using that in our public site config repository.
+ // strip_attr: //img/@srcset
+ if ($command == 'strip_attr') $command = 'strip';
+
// check for commands where we accept multiple statements
if (in_array($command, array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'native_ad_clue', 'http_header', 'test_url', 'find_string', 'replace_string'))) {
array_push($config->$command, $val);
// check for single statement commands that evaluate to true or false
- } elseif (in_array($command, array('tidy', 'prune', 'autodetect_on_failure'))) {
+ } elseif (in_array($command, array('tidy', 'prune', 'autodetect_on_failure', 'insert_detected_image'))) {
$config->$command = ($val == 'yes');
// check for single statement commands stored as strings
} elseif (in_array($command, array('parser'))) {
diff --git a/libraries/feedwriter/FeedItem.php b/libraries/feedwriter/FeedItem.php
index ed7a8cf..e44268d 100644
--- a/libraries/feedwriter/FeedItem.php
+++ b/libraries/feedwriter/FeedItem.php
@@ -186,5 +186,4 @@
$this->setElement('enclosure','',$attributes);
}
- } // end of class FeedItem
-?>
+ }
\ No newline at end of file
diff --git a/libraries/feedwriter/FeedWriter.php b/libraries/feedwriter/FeedWriter.php
index 7061b02..42c7cd8 100644
--- a/libraries/feedwriter/FeedWriter.php
+++ b/libraries/feedwriter/FeedWriter.php
@@ -1,4 +1,6 @@
1, 'abbr'=>1, 'acronym'=>1, 'address'=>1, 'applet'=>1, 'area'=>1, 'b'=>1, 'bdo'=>1, 'big'=>1, 'blockquote'=>1, 'br'=>1, 'button'=>1, 'caption'=>1, 'center'=>1, 'cite'=>1, 'code'=>1, 'col'=>1, 'colgroup'=>1, 'dd'=>1, 'del'=>1, 'dfn'=>1, 'dir'=>1, 'div'=>1, 'dl'=>1, 'dt'=>1, 'em'=>1, 'embed'=>1, 'fieldset'=>1, 'font'=>1, 'form'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'hr'=>1, 'i'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'ins'=>1, 'isindex'=>1, 'kbd'=>1, 'label'=>1, 'legend'=>1, 'li'=>1, 'map'=>1, 'menu'=>1, 'noscript'=>1, 'object'=>1, 'ol'=>1, 'optgroup'=>1, 'option'=>1, 'p'=>1, 'param'=>1, 'pre'=>1, 'q'=>1, 'rb'=>1, 'rbc'=>1, 'rp'=>1, 'rt'=>1, 'rtc'=>1, 'ruby'=>1, 's'=>1, 'samp'=>1, 'script'=>1, 'select'=>1, 'small'=>1, 'span'=>1, 'strike'=>1, 'strong'=>1, 'sub'=>1, 'sup'=>1, 'table'=>1, 'tbody'=>1, 'td'=>1, 'textarea'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1, 'tt'=>1, 'u'=>1, 'ul'=>1, 'var'=>1); // 86/deprecated+embed+ruby
+$e = array('a'=>1, 'abbr'=>1, 'acronym'=>1, 'address'=>1, 'applet'=>1, 'area'=>1, 'article'=>1, 'aside'=>1, 'audio'=>1, 'b'=>1, 'bdi'=>1, 'bdo'=>1, 'big'=>1, 'blockquote'=>1, 'br'=>1, 'button'=>1, 'canvas'=>1, 'caption'=>1, 'center'=>1, 'cite'=>1, 'code'=>1, 'col'=>1, 'colgroup'=>1, 'command'=>1, 'data'=>1, 'datalist'=>1, 'dd'=>1, 'del'=>1, 'details'=>1, 'dfn'=>1, 'dir'=>1, 'div'=>1, 'dl'=>1, 'dt'=>1, 'em'=>1, 'embed'=>1, 'fieldset'=>1, 'figcaption'=>1, 'figure'=>1, 'font'=>1, 'footer'=>1, 'form'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'header'=>1, 'hgroup'=>1, 'hr'=>1, 'i'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'ins'=>1, 'isindex'=>1, 'kbd'=>1, 'keygen'=>1, 'label'=>1, 'legend'=>1, 'li'=>1, 'link'=>1, 'main'=>1, 'map'=>1, 'mark'=>1, 'menu'=>1, 'meta'=>1, 'meter'=>1, 'nav'=>1, 'noscript'=>1, 'object'=>1, 'ol'=>1, 'optgroup'=>1, 'option'=>1, 'output'=>1, 'p'=>1, 'param'=>1, 'pre'=>1, 'progress'=>1, 'q'=>1, 'rb'=>1, 'rbc'=>1, 'rp'=>1, 'rt'=>1, 'rtc'=>1, 'ruby'=>1, 's'=>1, 'samp'=>1, 'script'=>1, 'section'=>1, 'select'=>1, 'small'=>1, 'source'=>1, 'span'=>1, 'strike'=>1, 'strong'=>1, 'style'=>1, 'sub'=>1, 'summary'=>1, 'sup'=>1, 'table'=>1, 'tbody'=>1, 'td'=>1, 'textarea'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'time'=>1, 'tr'=>1, 'track'=>1, 'tt'=>1, 'u'=>1, 'ul'=>1, 'var'=>1, 'video'=>1, 'wbr'=>1); // 118 incl. deprecated & some Ruby
+
if(!empty($C['safe'])){
- unset($e['applet'], $e['embed'], $e['iframe'], $e['object'], $e['script']);
+ unset($e['applet'], $e['audio'], $e['canvas'], $e['embed'], $e['iframe'], $e['object'], $e['script'], $e['video']);
}
$x = !empty($C['elements']) ? str_replace(array("\n", "\r", "\t", ' '), '', $C['elements']) : '*';
if($x == '-*'){$e = array();}
@@ -39,21 +40,20 @@ else{
}
$C['elements'] =& $e;
// config attrs
-$x = !empty($C['deny_attribute']) ? str_replace(array("\n", "\r", "\t", ' '), '', $C['deny_attribute']) : '';
-$x = array_flip((isset($x[0]) && $x[0] == '*') ? explode('-', $x) : explode(',', $x. (!empty($C['safe']) ? ',on*' : '')));
-if(isset($x['on*'])){
- unset($x['on*']);
- $x += array('onblur'=>1, 'onchange'=>1, 'onclick'=>1, 'ondblclick'=>1, 'onfocus'=>1, 'onkeydown'=>1, 'onkeypress'=>1, 'onkeyup'=>1, 'onmousedown'=>1, 'onmousemove'=>1, 'onmouseout'=>1, 'onmouseover'=>1, 'onmouseup'=>1, 'onreset'=>1, 'onselect'=>1, 'onsubmit'=>1);
-}
+$x = !empty($C['deny_attribute']) ? strtolower(str_replace(array("\n", "\r", "\t", ' '), '', $C['deny_attribute'])) : '';
+$x = array_flip((isset($x[0]) && $x[0] == '*') ? str_replace('/', 'data-', explode('-', str_replace('data-', '/', $x))) : explode(',', $x. (!empty($C['safe']) ? ',on*' : '')));
$C['deny_attribute'] = $x;
-// config URL
-$x = (isset($C['schemes'][2]) && strpos($C['schemes'], ':')) ? strtolower($C['schemes']) : 'href: aim, feed, file, ftp, gopher, http, https, irc, mailto, news, nntp, sftp, ssh, telnet; *:file, http, https';
+// config URLs
+$x = (isset($C['schemes'][2]) && strpos($C['schemes'], ':')) ? strtolower($C['schemes']) : 'href: aim, feed, file, ftp, gopher, http, https, irc, mailto, news, nntp, sftp, ssh, tel, telnet'. (empty($C['safe']) ? ', app, javascript; *: data, javascript, ' : '; *:'). 'file, http, https';
$C['schemes'] = array();
foreach(explode(';', str_replace(array(' ', "\t", "\r", "\n"), '', $x)) as $v){
$x = $x2 = null; list($x, $x2) = explode(':', $v, 2);
if($x2){$C['schemes'][$x] = array_flip(explode(',', $x2));}
}
-if(!isset($C['schemes']['*'])){$C['schemes']['*'] = array('file'=>1, 'http'=>1, 'https'=>1,);}
+if(!isset($C['schemes']['*'])){
+ $C['schemes']['*'] = array('file'=>1, 'http'=>1, 'https'=>1);
+ if(empty($C['safe'])){$C['schemes']['*'] += array('data'=>1, 'javascript'=>1);}
+}
if(!empty($C['safe']) && empty($C['schemes']['style'])){$C['schemes']['style'] = array('!'=>1);}
$C['abs_url'] = isset($C['abs_url']) ? $C['abs_url'] : 0;
if(!isset($C['base_url']) or !preg_match('`^[a-zA-Z\d.+\-]+://[^/]+/(.+?/)?$`', $C['base_url'])){
@@ -81,7 +81,7 @@ $C['parent'] = isset($C['parent'][0]) ? strtolower($C['parent']) : 'body';
$C['show_setting'] = !empty($C['show_setting']) ? $C['show_setting'] : 0;
$C['style_pass'] = empty($C['style_pass']) ? 0 : 1;
$C['tidy'] = empty($C['tidy']) ? 0 : $C['tidy'];
-$C['unique_ids'] = isset($C['unique_ids']) ? $C['unique_ids'] : 1;
+$C['unique_ids'] = isset($C['unique_ids']) && (!preg_match('`\W`', $C['unique_ids'])) ? $C['unique_ids'] : 1;
$C['xml:lang'] = isset($C['xml:lang']) ? $C['xml:lang'] : 0;
if(isset($GLOBALS['C'])){$reC = $GLOBALS['C'];}
@@ -97,7 +97,7 @@ if($C['clean_ms_char']){
$t = strtr($t, $x);
}
if($C['cdata'] or $C['comment']){$t = preg_replace_callback('`<!(?:(?:--.*?--)|(?:\[CDATA\[.*?\]\]))>`sm', 'htmLawed::hl_cmtcd', $t);}
-$t = preg_replace_callback('`&amp;([A-Za-z][A-Za-z0-9]{1,30}|#(?:[0-9]{1,8}|[Xx][0-9A-Fa-f]{1,7}));`', 'htmLawed::hl_ent', str_replace('&', '&amp;', $t));
+$t = preg_replace_callback('`&amp;([a-zA-Z][a-zA-Z0-9]{1,30}|#(?:[0-9]{1,8}|[Xx][0-9A-Fa-f]{1,7}));`', 'htmLawed::hl_ent', str_replace('&', '&amp;', $t));
if($C['unique_ids'] && !isset($GLOBALS['hl_Ids'])){$GLOBALS['hl_Ids'] = array();}
if($C['hook']){$t = $C['hook']($t, $C, $S);}
if($C['show_setting'] && preg_match('`^[a-z][a-z0-9_]*$`i', $C['show_setting'])){
@@ -112,18 +112,18 @@ unset($C, $e);
if(isset($reC)){$GLOBALS['C'] = $reC;}
if(isset($reS)){$GLOBALS['S'] = $reS;}
return $t;
-// eof
}
public static function hl_attrval($a, $t, $p){
// check attr val against $S
-static $ma = array('accesskey', 'class', 'rel');
-$s = in_array($a, $ma) ? ' ' : '';
+static $ma = array('accesskey', 'class', 'itemtype', 'rel');
+$s = in_array($a, $ma) ? ' ' : ($a == 'srcset' ? ',': '');
$r = array();
$t = !empty($s) ? explode($s, $t) : array($t);
foreach($t as $tk=>$tv){
- $o = 1; $l = strlen($tv);
+ $o = 1; $tv = trim($tv); $l = strlen($tv);
foreach($p as $k=>$v){
+ if(!$l){continue;}
switch($k){
case 'maxlen': if($l > $v){$o = 0;}
break; case 'minlen': if($l < $v){$o = 0;}
@@ -146,30 +146,29 @@ foreach($t as $tk=>$tv){
}
if($o){$r[] = $tv;}
}
+if($s == ','){$s = ', ';}
$r = implode($s, $r);
return (isset($r[0]) ? $r : (isset($p['default']) ? $p['default'] : 0));
-// eof
}
public static function hl_bal($t, $do=1, $in='div'){
// balance tags
// by content
$cB = array('blockquote'=>1, 'form'=>1, 'map'=>1, 'noscript'=>1); // Block
-$cE = array('area'=>1, 'br'=>1, 'col'=>1, 'embed'=>1, 'hr'=>1, 'img'=>1, 'input'=>1, 'isindex'=>1, 'param'=>1); // Empty
-$cF = array('button'=>1, 'del'=>1, 'div'=>1, 'dd'=>1, 'fieldset'=>1, 'iframe'=>1, 'ins'=>1, 'li'=>1, 'noscript'=>1, 'object'=>1, 'td'=>1, 'th'=>1); // Flow; later context-wise dynamic move of ins & del to $cI
-$cI = array('a'=>1, 'abbr'=>1, 'acronym'=>1, 'address'=>1, 'b'=>1, 'bdo'=>1, 'big'=>1, 'caption'=>1, 'cite'=>1, 'code'=>1, 'dfn'=>1, 'dt'=>1, 'em'=>1, 'font'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'i'=>1, 'kbd'=>1, 'label'=>1, 'legend'=>1, 'p'=>1, 'pre'=>1, 'q'=>1, 'rb'=>1, 'rt'=>1, 's'=>1, 'samp'=>1, 'small'=>1, 'span'=>1, 'strike'=>1, 'strong'=>1, 'sub'=>1, 'sup'=>1, 'tt'=>1, 'u'=>1, 'var'=>1); // Inline
-$cN = array('a'=>array('a'=>1), 'button'=>array('a'=>1, 'button'=>1, 'fieldset'=>1, 'form'=>1, 'iframe'=>1, 'input'=>1, 'label'=>1, 'select'=>1, 'textarea'=>1), 'fieldset'=>array('fieldset'=>1), 'form'=>array('form'=>1), 'label'=>array('label'=>1), 'noscript'=>array('script'=>1), 'pre'=>array('big'=>1, 'font'=>1, 'img'=>1, 'object'=>1, 'script'=>1, 'small'=>1, 'sub'=>1, 'sup'=>1), 'rb'=>array('ruby'=>1), 'rt'=>array('ruby'=>1)); // Illegal
+$cE = array('area'=>1, 'br'=>1, 'col'=>1, 'command'=>1, 'embed'=>1, 'hr'=>1, 'img'=>1, 'input'=>1, 'isindex'=>1, 'keygen'=>1, 'link'=>1, 'meta'=>1, 'param'=>1, 'source'=>1, 'track'=>1, 'wbr'=>1); // Empty
+$cF = array('a'=>1, 'article'=>1, 'aside'=>1, 'audio'=>1, 'button'=>1, 'canvas'=>1, 'del'=>1, 'details'=>1, 'div'=>1, 'dd'=>1, 'fieldset'=>1, 'figure'=>1, 'footer'=>1, 'header'=>1, 'iframe'=>1, 'ins'=>1, 'li'=>1, 'main'=>1, 'menu'=>1, 'nav'=>1, 'noscript'=>1, 'object'=>1, 'section'=>1, 'style'=>1, 'td'=>1, 'th'=>1, 'video'=>1); // Flow; later context-wise dynamic move of ins & del to $cI
+$cI = array('abbr'=>1, 'acronym'=>1, 'address'=>1, 'b'=>1, 'bdi'=>1, 'bdo'=>1, 'big'=>1, 'caption'=>1, 'cite'=>1, 'code'=>1, 'data'=>1, 'datalist'=>1, 'dfn'=>1, 'dt'=>1, 'em'=>1, 'figcaption'=>1, 'font'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'hgroup'=>1, 'i'=>1, 'kbd'=>1, 'label'=>1, 'legend'=>1, 'mark'=>1, 'meter'=>1, 'output'=>1, 'p'=>1, 'pre'=>1, 'progress'=>1, 'q'=>1, 'rb'=>1, 'rt'=>1, 's'=>1, 'samp'=>1, 'small'=>1, 'span'=>1, 'strike'=>1, 'strong'=>1, 'sub'=>1, 'summary'=>1, 'sup'=>1, 'time'=>1, 'tt'=>1, 'u'=>1, 'var'=>1); // Inline
+$cN = array('a'=>array('a'=>1, 'address'=>1, 'button'=>1, 'details'=>1, 'embed'=>1, 'keygen'=>1, 'label'=>1, 'select'=>1, 'textarea'=>1), 'address'=>array('address'=>1, 'article'=>1, 'aside'=>1, 'header'=>1, 'keygen'=>1, 'footer'=>1, 'nav'=>1, 'section'=>1), 'button'=>array('a'=>1, 'address'=>1, 'button'=>1, 'details'=>1, 'embed'=>1, 'fieldset'=>1, 'form'=>1, 'iframe'=>1, 'input'=>1, 'keygen'=>1, 'label'=>1, 'select'=>1, 'textarea'=>1), 'fieldset'=>array('fieldset'=>1), 'footer'=>array('header'=>1, 'footer'=>1), 'form'=>array('form'=>1), 'header'=>array('header'=>1, 'footer'=>1), 'label'=>array('label'=>1), 'main'=>array('main'=>1), 'meter'=>array('meter'=>1), 'noscript'=>array('script'=>1), 'pre'=>array('big'=>1, 'font'=>1, 'img'=>1, 'object'=>1, 'script'=>1, 'small'=>1, 'sub'=>1, 'sup'=>1), 'progress'=>array('progress'=>1), 'rb'=>array('ruby'=>1), 'rt'=>array('ruby'=>1), 'time'=>array('time'=>1), ); // Illegal
$cN2 = array_keys($cN);
-$cR = array('blockquote'=>1, 'dir'=>1, 'dl'=>1, 'form'=>1, 'map'=>1, 'menu'=>1, 'noscript'=>1, 'ol'=>1, 'optgroup'=>1, 'rbc'=>1, 'rtc'=>1, 'ruby'=>1, 'select'=>1, 'table'=>1, 'tbody'=>1, 'tfoot'=>1, 'thead'=>1, 'tr'=>1, 'ul'=>1);
-$cS = array('colgroup'=>array('col'=>1), 'dir'=>array('li'=>1), 'dl'=>array('dd'=>1, 'dt'=>1), 'menu'=>array('li'=>1), 'ol'=>array('li'=>1), 'optgroup'=>array('option'=>1), 'option'=>array('#pcdata'=>1), 'rbc'=>array('rb'=>1), 'rp'=>array('#pcdata'=>1), 'rtc'=>array('rt'=>1), 'ruby'=>array('rb'=>1, 'rbc'=>1, 'rp'=>1, 'rt'=>1, 'rtc'=>1), 'select'=>array('optgroup'=>1, 'option'=>1), 'script'=>array('#pcdata'=>1), 'table'=>array('caption'=>1, 'col'=>1, 'colgroup'=>1, 'tfoot'=>1, 'tbody'=>1, 'tr'=>1, 'thead'=>1), 'tbody'=>array('tr'=>1), 'tfoot'=>array('tr'=>1), 'textarea'=>array('#pcdata'=>1), 'thead'=>array('tr'=>1), 'tr'=>array('td'=>1, 'th'=>1), 'ul'=>array('li'=>1)); // Specific - immediate parent-child
-if($GLOBALS['C']['direct_list_nest']){$cS['ol'] = $cS['ul'] += array('ol'=>1, 'ul'=>1);}
-$cO = array('address'=>array('p'=>1), 'applet'=>array('param'=>1), 'blockquote'=>array('script'=>1), 'fieldset'=>array('legend'=>1, '#pcdata'=>1), 'form'=>array('script'=>1), 'map'=>array('area'=>1), 'object'=>array('param'=>1, 'embed'=>1)); // Other
+$cS = array('colgroup'=>array('col'=>1), 'datalist'=>array('option'=>1), 'dir'=>array('li'=>1), 'dl'=>array('dd'=>1, 'dt'=>1), 'hgroup'=>array('h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1), 'menu'=>array('li'=>1), 'ol'=>array('li'=>1), 'optgroup'=>array('option'=>1), 'option'=>array('#pcdata'=>1), 'rbc'=>array('rb'=>1), 'rp'=>array('#pcdata'=>1), 'rtc'=>array('rt'=>1), 'ruby'=>array('rb'=>1, 'rbc'=>1, 'rp'=>1, 'rt'=>1, 'rtc'=>1), 'select'=>array('optgroup'=>1, 'option'=>1), 'script'=>array('#pcdata'=>1), 'table'=>array('caption'=>1, 'col'=>1, 'colgroup'=>1, 'tfoot'=>1, 'tbody'=>1, 'tr'=>1, 'thead'=>1), 'tbody'=>array('tr'=>1), 'tfoot'=>array('tr'=>1), 'textarea'=>array('#pcdata'=>1), 'thead'=>array('tr'=>1), 'tr'=>array('td'=>1, 'th'=>1), 'ul'=>array('li'=>1)); // Specific - immediate parent-child
+if($GLOBALS['C']['direct_list_nest']){$cS['ol'] = $cS['ul'] = $cS['menu'] += array('menu'=>1, 'ol'=>1, 'ul'=>1);}
+$cO = array('address'=>array('p'=>1), 'applet'=>array('param'=>1), 'audio'=>array('source'=>1, 'track'=>1), 'blockquote'=>array('script'=>1), 'details'=>array('summary'=>1), 'fieldset'=>array('legend'=>1, '#pcdata'=>1), 'figure'=>array('figcaption'=>1),'form'=>array('script'=>1), 'map'=>array('area'=>1), 'object'=>array('param'=>1, 'embed'=>1), 'video'=>array('source'=>1, 'track'=>1)); // Other
$cT = array('colgroup'=>1, 'dd'=>1, 'dt'=>1, 'li'=>1, 'option'=>1, 'p'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1); // Omitable closing
-// block/inline type; ins & del both type; #pcdata: text
-$eB = array('address'=>1, 'blockquote'=>1, 'center'=>1, 'del'=>1, 'dir'=>1, 'dl'=>1, 'div'=>1, 'fieldset'=>1, 'form'=>1, 'ins'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'hr'=>1, 'isindex'=>1, 'menu'=>1, 'noscript'=>1, 'ol'=>1, 'p'=>1, 'pre'=>1, 'table'=>1, 'ul'=>1);
-$eI = array('#pcdata'=>1, 'a'=>1, 'abbr'=>1, 'acronym'=>1, 'applet'=>1, 'b'=>1, 'bdo'=>1, 'big'=>1, 'br'=>1, 'button'=>1, 'cite'=>1, 'code'=>1, 'del'=>1, 'dfn'=>1, 'em'=>1, 'embed'=>1, 'font'=>1, 'i'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'ins'=>1, 'kbd'=>1, 'label'=>1, 'map'=>1, 'object'=>1, 'q'=>1, 'ruby'=>1, 's'=>1, 'samp'=>1, 'select'=>1, 'script'=>1, 'small'=>1, 'span'=>1, 'strike'=>1, 'strong'=>1, 'sub'=>1, 'sup'=>1, 'textarea'=>1, 'tt'=>1, 'u'=>1, 'var'=>1);
-$eN = array('a'=>1, 'big'=>1, 'button'=>1, 'fieldset'=>1, 'font'=>1, 'form'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'label'=>1, 'object'=>1, 'ruby'=>1, 'script'=>1, 'select'=>1, 'small'=>1, 'sub'=>1, 'sup'=>1, 'textarea'=>1); // Exclude from specific ele; $cN values
-$eO = array('area'=>1, 'caption'=>1, 'col'=>1, 'colgroup'=>1, 'dd'=>1, 'dt'=>1, 'legend'=>1, 'li'=>1, 'optgroup'=>1, 'option'=>1, 'param'=>1, 'rb'=>1, 'rbc'=>1, 'rp'=>1, 'rt'=>1, 'rtc'=>1, 'script'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'thead'=>1, 'th'=>1, 'tr'=>1); // Missing in $eB & $eI
+// block/inline type; a/ins/del both type; #pcdata: text
+$eB = array('a'=>1, 'address'=>1, 'article'=>1, 'aside'=>1, 'blockquote'=>1, 'center'=>1, 'del'=>1, 'details'=>1, 'dir'=>1, 'dl'=>1, 'div'=>1, 'fieldset'=>1, 'figure'=>1, 'footer'=>1, 'form'=>1, 'ins'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'header'=>1, 'hr'=>1, 'isindex'=>1, 'main'=>1, 'menu'=>1, 'nav'=>1, 'noscript'=>1, 'ol'=>1, 'p'=>1, 'pre'=>1, 'section'=>1, 'style'=>1, 'table'=>1, 'ul'=>1);
+$eI = array('#pcdata'=>1, 'a'=>1, 'abbr'=>1, 'acronym'=>1, 'applet'=>1, 'audio'=>1, 'b'=>1, 'bdi'=>1, 'bdo'=>1, 'big'=>1, 'br'=>1, 'button'=>1, 'canvas'=>1, 'cite'=>1, 'code'=>1, 'command'=>1, 'data'=>1, 'datalist'=>1, 'del'=>1, 'dfn'=>1, 'em'=>1, 'embed'=>1, 'figcaption'=>1, 'font'=>1, 'i'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'ins'=>1, 'kbd'=>1, 'label'=>1, 'link'=>1, 'map'=>1, 'mark'=>1, 'meta'=>1, 'meter'=>1, 'object'=>1, 'output'=>1, 'progress'=>1, 'q'=>1, 'ruby'=>1, 's'=>1, 'samp'=>1, 'select'=>1, 'script'=>1, 'small'=>1, 'span'=>1, 'strike'=>1, 'strong'=>1, 'sub'=>1, 'summary'=>1, 'sup'=>1, 'textarea'=>1, 'time'=>1, 'tt'=>1, 'u'=>1, 'var'=>1, 'video'=>1, 'wbr'=>1);
+$eN = array('a'=>1, 'address'=>1, 'article'=>1, 'aside'=>1, 'big'=>1, 'button'=>1, 'details'=>1, 'embed'=>1, 'fieldset'=>1, 'font'=>1, 'footer'=>1, 'form'=>1, 'header'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'keygen'=>1, 'label'=>1, 'meter'=>1, 'nav'=>1, 'object'=>1, 'progress'=>1, 'ruby'=>1, 'script'=>1, 'select'=>1, 'small'=>1, 'sub'=>1, 'sup'=>1, 'textarea'=>1, 'time'=>1); // Exclude from specific ele; $cN values
+$eO = array('area'=>1, 'caption'=>1, 'col'=>1, 'colgroup'=>1, 'command'=>1, 'dd'=>1, 'dt'=>1, 'hgroup'=>1, 'keygen'=>1, 'legend'=>1, 'li'=>1, 'optgroup'=>1, 'option'=>1, 'param'=>1, 'rb'=>1, 'rbc'=>1, 'rp'=>1, 'rt'=>1, 'rtc'=>1, 'script'=>1, 'source'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'thead'=>1, 'th'=>1, 'tr'=>1, 'track'=>1); // Missing in $eB & $eI
$eF = $eB + $eI;
// $in sets allowed child
@@ -223,7 +222,7 @@ for($i=-1, $ci=count($t); ++$i<$ci;){
if(isset($cE[$e]) or !in_array($e, $q)){continue;} // Empty/unopen
if($p == $e){array_pop($q); echo '</', $e, '>'; unset($e); continue;} // Last open
$add = ''; // Nesting - close open tags that need to be
- for($j=-1, $cj=count($q); ++$j<$cj;){
+ for($j=-1, $cj=count($q); ++$j<$cj;){
if(($d = array_pop($q)) == $e){break;}
else{$add .= "</{$d}>";}
}
@@ -304,7 +303,6 @@ while(!empty($q) && ($e = array_pop($q))){echo '</', $e, '>';}
$o = ob_get_contents();
ob_end_clean();
return $o;
-// eof
}
public static function hl_cmtcd($t){
@@ -313,13 +311,12 @@ $t = $t[0];
global $C;
if(!($v = $C[$n = $t[3] == '-' ? 'comment' : 'cdata'])){return $t;}
if($v == 1){return '';}
-if($n == 'comment'){
+if($n == 'comment' && $v < 4){
if(substr(($t = preg_replace('`--+`', '-', substr($t, 4, -3))), -1) != ' '){$t .= ' ';}
}
else{$t = substr($t, 1, -1);}
$t = $v == 2 ? str_replace(array('&', '<', '>'), array('&amp;', '&lt;', '&gt;'), $t) : $t;
return str_replace(array('&', '<', '>'), array("\x03", "\x04", "\x05"), ($n == 'comment' ? "\x01\x02\x04!--$t--\x05\x02\x01" : "\x01\x01\x04$t\x05\x01\x01"));
-// eof
}
public static function hl_ent($t){
@@ -335,7 +332,6 @@ if(($n = ctype_digit($t = substr($t, 1)) ? intval($t) : hexdec(substr($t, 1))) <
return ($C['and_mark'] ? "\x06" : '&'). "amp;#{$t};";
}
return ($C['and_mark'] ? "\x06" : '&'). '#'. (((ctype_digit($t) && $C['hexdec_entity'] < 2) or !$C['hexdec_entity']) ? $n : 'x'. dechex($n)). ';';
-// eof
}
public static function hl_prot($p, $c=null){
@@ -368,29 +364,36 @@ if($C['abs_url']){
}
}
return "{$b}{$p}{$a}";
-// eof
}
public static function hl_regex($p){
-// ?regex
+// check regex
if(empty($p)){return 0;}
-if($t = ini_get('track_errors')){$o = isset($php_errormsg) ? $php_errormsg : null;}
-else{ini_set('track_errors', 1);}
-unset($php_errormsg);
+if($v = function_exists('error_clear_last') && function_exists('error_get_last')){error_clear_last();}
+else{
+ if($t = ini_get('track_errors')){$o = isset($php_errormsg) ? $php_errormsg : null;}
+ else{ini_set('track_errors', 1);}
+ unset($php_errormsg);
+}
if(($d = ini_get('display_errors'))){ini_set('display_errors', 0);}
preg_match($p, '');
+if($v){$r = error_get_last() == null ? 1 : 0; }
+else{
+ $r = isset($php_errormsg) ? 0 : 1;
+ if($t){$php_errormsg = isset($o) ? $o : null;}
+ else{ini_set('track_errors', 0);}
+}
if($d){ini_set('display_errors', 1);}
-$r = isset($php_errormsg) ? 0 : 1;
-if($t){$php_errormsg = isset($o) ? $o : null;}
-else{ini_set('track_errors', 0);}
return $r;
-// eof
}
public static function hl_spec($t){
// final $spec
$s = array();
-$t = str_replace(array("\t", "\r", "\n", ' '), '', preg_replace_callback('/"(?>(`.|[^"])*)"/sm', create_function('$m', 'return substr(str_replace(array(";", "|", "~", " ", ",", "/", "(", ")", \'`"\'), array("\x01", "\x02", "\x03", "\x04", "\x05", "\x06", "\x07", "\x08", "\""), $m[0]), 1, -1);'), trim($t)));
+if(!function_exists('hl_aux1')){function hl_aux1($m){
+ return substr(str_replace(array(";", "|", "~", " ", ",", "/", "(", ")", '`"'), array("\x01", "\x02", "\x03", "\x04", "\x05", "\x06", "\x07", "\x08", '"'), $m[0]), 1, -1);
+}}
+$t = str_replace(array("\t", "\r", "\n", ' '), '', preg_replace_callback('/"(?>(`.|[^"])*)"/sm', 'hl_aux1', trim($t)));
for($i = count(($t = explode(';', $t))); --$i>=0;){
$w = $t[$i];
if(empty($w) or ($e = strpos($w, '=')) === false or !strlen(($a = substr($w, $e+1)))){continue;}
@@ -410,12 +413,11 @@ for($i = count(($t = explode(';', $t))); --$i>=0;){
if(!count($y) && !count($n)){continue;}
foreach(explode(',', substr($w, 0, $e)) as $v){
if(!strlen(($v = strtolower($v)))){continue;}
- if(count($y)){$s[$v] = $y;}
- if(count($n)){$s[$v]['n'] = $n;}
+ if(count($y)){if(!isset($s[$v])){$s[$v] = $y;} else{$s[$v] = array_merge($s[$v], $y);}}
+ if(count($n)){if(!isset($s[$v]['n'])){$s[$v]['n'] = $n;} else{$s[$v]['n'] = array_merge($s[$v]['n'], $n);}}
}
}
return $s;
-// eof
}
public static function hl_tag($t){
@@ -433,35 +435,37 @@ if(!preg_match('`^<(/?)([a-zA-Z][a-zA-Z1-6]*)([^>]*?)\s?>$`m', $t, $m)){
// attr string
$a = str_replace(array("\n", "\r", "\t"), ' ', trim($m[3]));
// tag transform
-static $eD = array('applet'=>1, 'center'=>1, 'dir'=>1, 'embed'=>1, 'font'=>1, 'isindex'=>1, 'menu'=>1, 's'=>1, 'strike'=>1, 'u'=>1); // Deprecated
+static $eD = array('acronym'=>1, 'applet'=>1, 'big'=>1, 'center'=>1, 'dir'=>1, 'font'=>1, 'isindex'=>1, 's'=>1, 'strike'=>1, 'tt'=>1); // Deprecated
if($C['make_tag_strict'] && isset($eD[$e])){
$trt = htmLawed::hl_tag2($e, $a, $C['make_tag_strict']);
if(!$e){return (($C['keep_bad']%2) ? str_replace(array('<', '>'), array('&lt;', '&gt;'), $t) : '');}
}
// close tag
-static $eE = array('area'=>1, 'br'=>1, 'col'=>1, 'embed'=>1, 'hr'=>1, 'img'=>1, 'input'=>1, 'isindex'=>1, 'param'=>1); // Empty ele
+static $eE = array('area'=>1, 'br'=>1, 'col'=>1, 'command'=>1, 'embed'=>1, 'hr'=>1, 'img'=>1, 'input'=>1, 'isindex'=>1, 'keygen'=>1, 'link'=>1, 'meta'=>1, 'param'=>1, 'source'=>1, 'track'=>1, 'wbr'=>1); // Empty ele
if(!empty($m[1])){
return (!isset($eE[$e]) ? (empty($C['hook_tag']) ? "</$e>" : $C['hook_tag']($e)) : (($C['keep_bad'])%2 ? str_replace(array('<', '>'), array('&lt;', '&gt;'), $t) : ''));
}
// open tag & attr
-static $aN = array('abbr'=>array('td'=>1, 'th'=>1), 'accept-charset'=>array('form'=>1), 'accept'=>array('form'=>1, 'input'=>1), 'accesskey'=>array('a'=>1, 'area'=>1, 'button'=>1, 'input'=>1, 'label'=>1, 'legend'=>1, 'textarea'=>1), 'action'=>array('form'=>1), 'align'=>array('caption'=>1, 'embed'=>1, 'applet'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'object'=>1, 'legend'=>1, 'table'=>1, 'hr'=>1, 'div'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'p'=>1, 'col'=>1, 'colgroup'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'allowfullscreen'=>array('iframe'=>1), 'alt'=>array('applet'=>1, 'area'=>1, 'img'=>1, 'input'=>1), 'archive'=>array('applet'=>1, 'object'=>1), 'axis'=>array('td'=>1, 'th'=>1), 'bgcolor'=>array('embed'=>1, 'table'=>1, 'tr'=>1, 'td'=>1, 'th'=>1), 'border'=>array('table'=>1, 'img'=>1, 'object'=>1), 'bordercolor'=>array('table'=>1, 'td'=>1, 'tr'=>1), 'cellpadding'=>array('table'=>1), 'cellspacing'=>array('table'=>1), 'char'=>array('col'=>1, 'colgroup'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'charoff'=>array('col'=>1, 'colgroup'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'charset'=>array('a'=>1, 'script'=>1), 'checked'=>array('input'=>1), 'cite'=>array('blockquote'=>1, 'q'=>1, 'del'=>1, 'ins'=>1), 'classid'=>array('object'=>1), 'clear'=>array('br'=>1), 'code'=>array('applet'=>1), 'codebase'=>array('object'=>1, 'applet'=>1), 'codetype'=>array('object'=>1), 'color'=>array('font'=>1), 'cols'=>array('textarea'=>1), 'colspan'=>array('td'=>1, 'th'=>1), 'compact'=>array('dir'=>1, 'dl'=>1, 'menu'=>1, 'ol'=>1, 'ul'=>1), 'coords'=>array('area'=>1, 'a'=>1), 'data'=>array('object'=>1), 'datetime'=>array('del'=>1, 'ins'=>1), 'declare'=>array('object'=>1), 'defer'=>array('script'=>1), 'dir'=>array('bdo'=>1), 'disabled'=>array('button'=>1, 'input'=>1, 'optgroup'=>1, 'option'=>1, 'select'=>1, 'textarea'=>1), 'enctype'=>array('form'=>1), 'face'=>array('font'=>1), 
'for'=>array('label'=>1), 'frame'=>array('table'=>1), 'frameborder'=>array('iframe'=>1), 'headers'=>array('td'=>1, 'th'=>1), 'height'=>array('embed'=>1, 'iframe'=>1, 'td'=>1, 'th'=>1, 'img'=>1, 'object'=>1, 'applet'=>1), 'href'=>array('a'=>1, 'area'=>1), 'hreflang'=>array('a'=>1), 'hspace'=>array('applet'=>1, 'img'=>1, 'object'=>1), 'ismap'=>array('img'=>1, 'input'=>1), 'label'=>array('option'=>1, 'optgroup'=>1), 'language'=>array('script'=>1), 'longdesc'=>array('img'=>1, 'iframe'=>1), 'marginheight'=>array('iframe'=>1), 'marginwidth'=>array('iframe'=>1), 'maxlength'=>array('input'=>1), 'method'=>array('form'=>1), 'model'=>array('embed'=>1), 'multiple'=>array('select'=>1), 'name'=>array('button'=>1, 'embed'=>1, 'textarea'=>1, 'applet'=>1, 'select'=>1, 'form'=>1, 'iframe'=>1, 'img'=>1, 'a'=>1, 'input'=>1, 'object'=>1, 'map'=>1, 'param'=>1), 'nohref'=>array('area'=>1), 'noshade'=>array('hr'=>1), 'nowrap'=>array('td'=>1, 'th'=>1), 'object'=>array('applet'=>1), 'onblur'=>array('a'=>1, 'area'=>1, 'button'=>1, 'input'=>1, 'label'=>1, 'select'=>1, 'textarea'=>1), 'onchange'=>array('input'=>1, 'select'=>1, 'textarea'=>1), 'onfocus'=>array('a'=>1, 'area'=>1, 'button'=>1, 'input'=>1, 'label'=>1, 'select'=>1, 'textarea'=>1), 'onreset'=>array('form'=>1), 'onselect'=>array('input'=>1, 'textarea'=>1), 'onsubmit'=>array('form'=>1), 'pluginspage'=>array('embed'=>1), 'pluginurl'=>array('embed'=>1), 'prompt'=>array('isindex'=>1), 'readonly'=>array('textarea'=>1, 'input'=>1), 'rel'=>array('a'=>1), 'rev'=>array('a'=>1), 'rows'=>array('textarea'=>1), 'rowspan'=>array('td'=>1, 'th'=>1), 'rules'=>array('table'=>1), 'scope'=>array('td'=>1, 'th'=>1), 'scrolling'=>array('iframe'=>1), 'selected'=>array('option'=>1), 'shape'=>array('area'=>1, 'a'=>1), 'size'=>array('hr'=>1, 'font'=>1, 'input'=>1, 'select'=>1), 'span'=>array('col'=>1, 'colgroup'=>1), 'src'=>array('embed'=>1, 'script'=>1, 'input'=>1, 'iframe'=>1, 'img'=>1), 'standby'=>array('object'=>1), 'start'=>array('ol'=>1), 
'summary'=>array('table'=>1), 'tabindex'=>array('a'=>1, 'area'=>1, 'button'=>1, 'input'=>1, 'object'=>1, 'select'=>1, 'textarea'=>1), 'target'=>array('a'=>1, 'area'=>1, 'form'=>1), 'type'=>array('a'=>1, 'embed'=>1, 'object'=>1, 'param'=>1, 'script'=>1, 'input'=>1, 'li'=>1, 'ol'=>1, 'ul'=>1, 'button'=>1), 'usemap'=>array('img'=>1, 'input'=>1, 'object'=>1), 'valign'=>array('col'=>1, 'colgroup'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'value'=>array('input'=>1, 'option'=>1, 'param'=>1, 'button'=>1, 'li'=>1), 'valuetype'=>array('param'=>1), 'vspace'=>array('applet'=>1, 'img'=>1, 'object'=>1), 'width'=>array('embed'=>1, 'hr'=>1, 'iframe'=>1, 'img'=>1, 'object'=>1, 'table'=>1, 'td'=>1, 'th'=>1, 'applet'=>1, 'col'=>1, 'colgroup'=>1, 'pre'=>1), 'wmode'=>array('embed'=>1), 'xml:space'=>array('pre'=>1, 'script'=>1, 'style'=>1)); // Ele-specific
-static $aNE = array('checked'=>1, 'compact'=>1, 'declare'=>1, 'defer'=>1, 'disabled'=>1, 'ismap'=>1, 'multiple'=>1, 'nohref'=>1, 'noresize'=>1, 'noshade'=>1, 'nowrap'=>1, 'readonly'=>1, 'selected'=>1); // Empty
-static $aNP = array('action'=>1, 'cite'=>1, 'classid'=>1, 'codebase'=>1, 'data'=>1, 'href'=>1, 'longdesc'=>1, 'model'=>1, 'pluginspage'=>1, 'pluginurl'=>1, 'usemap'=>1); // Need scheme check; excludes style, on* & src
-static $aNU = array('class'=>array('param'=>1, 'script'=>1), 'dir'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'iframe'=>1, 'param'=>1, 'script'=>1), 'id'=>array('script'=>1), 'lang'=>array('applet'=>1, 'br'=>1, 'iframe'=>1, 'param'=>1, 'script'=>1), 'xml:lang'=>array('applet'=>1, 'br'=>1, 'iframe'=>1, 'param'=>1, 'script'=>1), 'onclick'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'ondblclick'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onkeydown'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onkeypress'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onkeyup'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onmousedown'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onmousemove'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onmouseout'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onmouseover'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onmouseup'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'style'=>array('param'=>1, 'script'=>1), 'title'=>array('param'=>1, 'script'=>1)); // Univ & exceptions
+static $aN = array('abbr'=>array('td'=>1, 'th'=>1), 'accept'=>array('form'=>1, 'input'=>1), 'accept-charset'=>array('form'=>1), 'action'=>array('form'=>1), 'align'=>array('applet'=>1, 'caption'=>1, 'col'=>1, 'colgroup'=>1, 'div'=>1, 'embed'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'hr'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'legend'=>1, 'object'=>1, 'p'=>1, 'table'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'allowfullscreen'=>array('iframe'=>1), 'alt'=>array('applet'=>1, 'area'=>1, 'img'=>1, 'input'=>1), 'archive'=>array('applet'=>1, 'object'=>1), 'async'=>array('script'=>1), 'autocomplete'=>array('form'=>1, 'input'=>1), 'autofocus'=>array('button'=>1, 'input'=>1, 'keygen'=>1, 'select'=>1, 'textarea'=>1), 'autoplay'=>array('audio'=>1, 'video'=>1), 'axis'=>array('td'=>1, 'th'=>1), 'bgcolor'=>array('embed'=>1, 'table'=>1, 'td'=>1, 'th'=>1, 'tr'=>1), 'border'=>array('img'=>1, 'object'=>1, 'table'=>1), 'bordercolor'=>array('table'=>1, 'td'=>1, 'tr'=>1), 'cellpadding'=>array('table'=>1), 'cellspacing'=>array('table'=>1), 'challenge'=>array('keygen'=>1), 'char'=>array('col'=>1, 'colgroup'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'charoff'=>array('col'=>1, 'colgroup'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'charset'=>array('a'=>1, 'script'=>1), 'checked'=>array('command'=>1, 'input'=>1), 'cite'=>array('blockquote'=>1, 'del'=>1, 'ins'=>1, 'q'=>1), 'classid'=>array('object'=>1), 'clear'=>array('br'=>1), 'code'=>array('applet'=>1), 'codebase'=>array('applet'=>1, 'object'=>1), 'codetype'=>array('object'=>1), 'color'=>array('font'=>1), 'cols'=>array('textarea'=>1), 'colspan'=>array('td'=>1, 'th'=>1), 'compact'=>array('dir'=>1, 'dl'=>1, 'menu'=>1, 'ol'=>1, 'ul'=>1), 'content'=>array('meta'=>1), 'controls'=>array('audio'=>1, 'video'=>1), 'coords'=>array('a'=>1, 'area'=>1), 'crossorigin'=>array('img'=>1), 'data'=>array('object'=>1), 'datetime'=>array('del'=>1, 'ins'=>1, 'time'=>1), 
'declare'=>array('object'=>1), 'default'=>array('track'=>1), 'defer'=>array('script'=>1), 'dirname'=>array('input'=>1, 'textarea'=>1), 'disabled'=>array('button'=>1, 'command'=>1, 'fieldset'=>1, 'input'=>1, 'keygen'=>1, 'optgroup'=>1, 'option'=>1, 'select'=>1, 'textarea'=>1), 'download'=>array('a'=>1), 'enctype'=>array('form'=>1), 'face'=>array('font'=>1), 'flashvars'=>array('embed'=>1), 'for'=>array('label'=>1, 'output'=>1), 'form'=>array('button'=>1, 'fieldset'=>1, 'input'=>1, 'keygen'=>1, 'label'=>1, 'object'=>1, 'output'=>1, 'select'=>1, 'textarea'=>1), 'formaction'=>array('button'=>1, 'input'=>1), 'formenctype'=>array('button'=>1, 'input'=>1), 'formmethod'=>array('button'=>1, 'input'=>1), 'formnovalidate'=>array('button'=>1, 'input'=>1), 'formtarget'=>array('button'=>1, 'input'=>1), 'frame'=>array('table'=>1), 'frameborder'=>array('iframe'=>1), 'headers'=>array('td'=>1, 'th'=>1), 'height'=>array('applet'=>1, 'canvas'=>1, 'embed'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'object'=>1, 'td'=>1, 'th'=>1, 'video'=>1), 'high'=>array('meter'=>1), 'href'=>array('a'=>1, 'area'=>1, 'link'=>1), 'hreflang'=>array('a'=>1, 'area'=>1, 'link'=>1), 'hspace'=>array('applet'=>1, 'embed'=>1, 'img'=>1, 'object'=>1), 'icon'=>array('command'=>1), 'ismap'=>array('img'=>1, 'input'=>1), 'keyparams'=>array('keygen'=>1), 'keytype'=>array('keygen'=>1), 'kind'=>array('track'=>1), 'label'=>array('command'=>1, 'menu'=>1, 'option'=>1, 'optgroup'=>1, 'track'=>1), 'language'=>array('script'=>1), 'list'=>array('input'=>1), 'longdesc'=>array('img'=>1, 'iframe'=>1), 'loop'=>array('audio'=>1, 'video'=>1), 'low'=>array('meter'=>1), 'marginheight'=>array('iframe'=>1), 'marginwidth'=>array('iframe'=>1), 'max'=>array('input'=>1, 'meter'=>1, 'progress'=>1), 'maxlength'=>array('input'=>1, 'textarea'=>1), 'media'=>array('a'=>1, 'area'=>1, 'link'=>1, 'source'=>1, 'style'=>1), 'mediagroup'=>array('audio'=>1, 'video'=>1), 'method'=>array('form'=>1), 'min'=>array('input'=>1, 'meter'=>1), 
'model'=>array('embed'=>1), 'multiple'=>array('input'=>1, 'select'=>1), 'muted'=>array('audio'=>1, 'video'=>1), 'name'=>array('a'=>1, 'applet'=>1, 'button'=>1, 'embed'=>1, 'fieldset'=>1, 'form'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'keygen'=>1, 'map'=>1, 'object'=>1, 'output'=>1, 'param'=>1, 'select'=>1, 'textarea'=>1), 'nohref'=>array('area'=>1), 'noshade'=>array('hr'=>1), 'novalidate'=>array('form'=>1), 'nowrap'=>array('td'=>1, 'th'=>1), 'object'=>array('applet'=>1), 'open'=>array('details'=>1), 'optimum'=>array('meter'=>1), 'pattern'=>array('input'=>1), 'ping'=>array('a'=>1, 'area'=>1), 'placeholder'=>array('input'=>1, 'textarea'=>1), 'pluginspage'=>array('embed'=>1), 'pluginurl'=>array('embed'=>1), 'poster'=>array('video'=>1), 'pqg'=>array('keygen'=>1), 'preload'=>array('audio'=>1, 'video'=>1), 'prompt'=>array('isindex'=>1), 'pubdate'=>array('time'=>1), 'radiogroup'=>array('command'=>1), 'readonly'=>array('input'=>1, 'textarea'=>1), 'rel'=>array('a'=>1, 'area'=>1, 'link'=>1), 'required'=>array('input'=>1, 'select'=>1, 'textarea'=>1), 'rev'=>array('a'=>1), 'reversed'=>array('ol'=>1), 'rows'=>array('textarea'=>1), 'rowspan'=>array('td'=>1, 'th'=>1), 'rules'=>array('table'=>1), 'sandbox'=>array('iframe'=>1), 'scope'=>array('td'=>1, 'th'=>1), 'scoped'=>array('style'=>1), 'scrolling'=>array('iframe'=>1), 'seamless'=>array('iframe'=>1), 'selected'=>array('option'=>1), 'shape'=>array('a'=>1, 'area'=>1), 'size'=>array('font'=>1, 'hr'=>1, 'input'=>1, 'select'=>1), 'sizes'=>array('link'=>1), 'span'=>array('col'=>1, 'colgroup'=>1), 'src'=>array('audio'=>1, 'embed'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'script'=>1, 'source'=>1, 'track'=>1, 'video'=>1), 'srcdoc'=>array('iframe'=>1), 'srclang'=>array('track'=>1), 'srcset'=>array('img'=>1), 'standby'=>array('object'=>1), 'start'=>array('ol'=>1), 'step'=>array('input'=>1), 'summary'=>array('table'=>1), 'target'=>array('a'=>1, 'area'=>1, 'form'=>1), 'type'=>array('a'=>1, 'area'=>1, 'button'=>1, 'command'=>1, 'embed'=>1, 
'input'=>1, 'li'=>1, 'link'=>1, 'menu'=>1, 'object'=>1, 'ol'=>1, 'param'=>1, 'script'=>1, 'source'=>1, 'style'=>1, 'ul'=>1), 'typemustmatch'=>array('object'=>1), 'usemap'=>array('img'=>1, 'input'=>1, 'object'=>1), 'valign'=>array('col'=>1, 'colgroup'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'value'=>array('button'=>1, 'data'=>1, 'input'=>1, 'li'=>1, 'meter'=>1, 'option'=>1, 'param'=>1, 'progress'=>1), 'valuetype'=>array('param'=>1), 'vspace'=>array('applet'=>1, 'embed'=>1, 'img'=>1, 'object'=>1), 'width'=>array('applet'=>1, 'canvas'=>1, 'col'=>1, 'colgroup'=>1, 'embed'=>1, 'hr'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'object'=>1, 'pre'=>1, 'table'=>1, 'td'=>1, 'th'=>1, 'video'=>1), 'wmode'=>array('embed'=>1), 'wrap'=>array('textarea'=>1)); // Ele-specific
+static $aNA = array('aria-activedescendant'=>1, 'aria-atomic'=>1, 'aria-autocomplete'=>1, 'aria-busy'=>1, 'aria-checked'=>1, 'aria-controls'=>1, 'aria-describedby'=>1, 'aria-disabled'=>1, 'aria-dropeffect'=>1, 'aria-expanded'=>1, 'aria-flowto'=>1, 'aria-grabbed'=>1, 'aria-haspopup'=>1, 'aria-hidden'=>1, 'aria-invalid'=>1, 'aria-label'=>1, 'aria-labelledby'=>1, 'aria-level'=>1, 'aria-live'=>1, 'aria-multiline'=>1, 'aria-multiselectable'=>1, 'aria-orientation'=>1, 'aria-owns'=>1, 'aria-posinset'=>1, 'aria-pressed'=>1, 'aria-readonly'=>1, 'aria-relevant'=>1, 'aria-required'=>1, 'aria-selected'=>1, 'aria-setsize'=>1, 'aria-sort'=>1, 'aria-valuemax'=>1, 'aria-valuemin'=>1, 'aria-valuenow'=>1, 'aria-valuetext'=>1); // ARIA
+static $aNE = array('allowfullscreen'=>1, 'checkbox'=>1, 'checked'=>1, 'command'=>1, 'compact'=>1, 'declare'=>1, 'defer'=>1, 'default'=>1, 'disabled'=>1, 'hidden'=>1, 'inert'=>1, 'ismap'=>1, 'itemscope'=>1, 'multiple'=>1, 'nohref'=>1, 'noresize'=>1, 'noshade'=>1, 'nowrap'=>1, 'open'=>1, 'radio'=>1, 'readonly'=>1, 'required'=>1, 'reversed'=>1, 'selected'=>1); // Empty
+static $aNO = array('onabort'=>1, 'onblur'=>1, 'oncanplay'=>1, 'oncanplaythrough'=>1, 'onchange'=>1, 'onclick'=>1, 'oncontextmenu'=>1, 'oncopy'=>1, 'oncuechange'=>1, 'oncut'=>1, 'ondblclick'=>1, 'ondrag'=>1, 'ondragend'=>1, 'ondragenter'=>1, 'ondragleave'=>1, 'ondragover'=>1, 'ondragstart'=>1, 'ondrop'=>1, 'ondurationchange'=>1, 'onemptied'=>1, 'onended'=>1, 'onerror'=>1, 'onfocus'=>1, 'onformchange'=>1, 'onforminput'=>1, 'oninput'=>1, 'oninvalid'=>1, 'onkeydown'=>1, 'onkeypress'=>1, 'onkeyup'=>1, 'onload'=>1, 'onloadeddata'=>1, 'onloadedmetadata'=>1, 'onloadstart'=>1, 'onlostpointercapture'=>1, 'onmousedown'=>1, 'onmousemove'=>1, 'onmouseout'=>1, 'onmouseover'=>1, 'onmouseup'=>1, 'onmousewheel'=>1, 'onpaste'=>1, 'onpause'=>1, 'onplay'=>1, 'onplaying'=>1, 'onpointercancel'=>1, 'ongotpointercapture'=>1, 'onpointerdown'=>1, 'onpointerenter'=>1, 'onpointerleave'=>1, 'onpointermove'=>1, 'onpointerout'=>1, 'onpointerover'=>1, 'onpointerup'=>1, 'onprogress'=>1, 'onratechange'=>1, 'onreadystatechange'=>1, 'onreset'=>1, 'onsearch'=>1, 'onscroll'=>1, 'onseeked'=>1, 'onseeking'=>1, 'onselect'=>1, 'onshow'=>1, 'onstalled'=>1, 'onsubmit'=>1, 'onsuspend'=>1, 'ontimeupdate'=>1, 'ontoggle'=>1, 'ontouchcancel'=>1, 'ontouchend'=>1, 'ontouchmove'=>1, 'ontouchstart'=>1, 'onvolumechange'=>1, 'onwaiting'=>1, 'onwheel'=>1); // Event
+static $aNP = array('action'=>1, 'cite'=>1, 'classid'=>1, 'codebase'=>1, 'data'=>1, 'href'=>1, 'itemtype'=>1, 'longdesc'=>1, 'model'=>1, 'pluginspage'=>1, 'pluginurl'=>1, 'src'=>1, 'srcset'=>1, 'usemap'=>1); // Need scheme check; excludes style, on*
+static $aNU = array('accesskey'=>1, 'class'=>1, 'contenteditable'=>1, 'contextmenu'=>1, 'dir'=>1, 'draggable'=>1, 'dropzone'=>1, 'hidden'=>1, 'id'=>1, 'inert'=>1, 'itemid'=>1, 'itemprop'=>1, 'itemref'=>1, 'itemscope'=>1, 'itemtype'=>1, 'lang'=>1, 'role'=>1, 'spellcheck'=>1, 'style'=>1, 'tabindex'=>1, 'title'=>1, 'translate'=>1, 'xmlns'=>1, 'xml:base'=>1, 'xml:lang'=>1, 'xml:space'=>1); // Univ; excludes on*, aria*
if($C['lc_std_val']){
// predef attr vals for $eAL & $aNE ele
- static $aNL = array('all'=>1, 'baseline'=>1, 'bottom'=>1, 'button'=>1, 'center'=>1, 'char'=>1, 'checkbox'=>1, 'circle'=>1, 'col'=>1, 'colgroup'=>1, 'cols'=>1, 'data'=>1, 'default'=>1, 'file'=>1, 'get'=>1, 'groups'=>1, 'hidden'=>1, 'image'=>1, 'justify'=>1, 'left'=>1, 'ltr'=>1, 'middle'=>1, 'none'=>1, 'object'=>1, 'password'=>1, 'poly'=>1, 'post'=>1, 'preserve'=>1, 'radio'=>1, 'rect'=>1, 'ref'=>1, 'reset'=>1, 'right'=>1, 'row'=>1, 'rowgroup'=>1, 'rows'=>1, 'rtl'=>1, 'submit'=>1, 'text'=>1, 'top'=>1);
- static $eAL = array('a'=>1, 'area'=>1, 'bdo'=>1, 'button'=>1, 'col'=>1, 'form'=>1, 'img'=>1, 'input'=>1, 'object'=>1, 'optgroup'=>1, 'option'=>1, 'param'=>1, 'script'=>1, 'select'=>1, 'table'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1, 'xml:space'=>1);
+ static $aNL = array('all'=>1, 'auto'=>1, 'baseline'=>1, 'bottom'=>1, 'button'=>1, 'captions'=>1, 'center'=>1, 'chapters'=>1, 'char'=>1, 'checkbox'=>1, 'circle'=>1, 'col'=>1, 'colgroup'=>1, 'color'=>1, 'cols'=>1, 'data'=>1, 'date'=>1, 'datetime'=>1, 'datetime-local'=>1, 'default'=>1, 'descriptions'=>1, 'email'=>1, 'file'=>1, 'get'=>1, 'groups'=>1, 'hidden'=>1, 'image'=>1, 'justify'=>1, 'left'=>1, 'ltr'=>1, 'metadata'=>1, 'middle'=>1, 'month'=>1, 'none'=>1, 'number'=>1, 'object'=>1, 'password'=>1, 'poly'=>1, 'post'=>1, 'preserve'=>1, 'radio'=>1, 'range'=>1, 'rect'=>1, 'ref'=>1, 'reset'=>1, 'right'=>1, 'row'=>1, 'rowgroup'=>1, 'rows'=>1, 'rtl'=>1, 'search'=>1, 'submit'=>1, 'subtitles'=>1, 'tel'=>1, 'text'=>1, 'time'=>1, 'top'=>1, 'url'=>1, 'week'=>1);
+ static $eAL = array('a'=>1, 'area'=>1, 'bdo'=>1, 'button'=>1, 'col'=>1, 'fieldset'=>1, 'form'=>1, 'img'=>1, 'input'=>1, 'object'=>1, 'ol'=>1, 'optgroup'=>1, 'option'=>1, 'param'=>1, 'script'=>1, 'select'=>1, 'table'=>1, 'td'=>1, 'textarea'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1, 'track'=>1, 'xml:space'=>1);
$lcase = isset($eAL[$e]) ? 1 : 0;
}
$depTr = 0;
if($C['no_deprecated_attr']){
- // dep attr:applicable ele
- static $aND = array('align'=>array('caption'=>1, 'div'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'hr'=>1, 'img'=>1, 'input'=>1, 'legend'=>1, 'object'=>1, 'p'=>1, 'table'=>1), 'bgcolor'=>array('table'=>1, 'td'=>1, 'th'=>1, 'tr'=>1), 'border'=>array('img'=>1, 'object'=>1), 'bordercolor'=>array('table'=>1, 'td'=>1, 'tr'=>1), 'clear'=>array('br'=>1), 'compact'=>array('dl'=>1, 'ol'=>1, 'ul'=>1), 'height'=>array('td'=>1, 'th'=>1), 'hspace'=>array('img'=>1, 'object'=>1), 'language'=>array('script'=>1), 'name'=>array('a'=>1, 'form'=>1, 'iframe'=>1, 'img'=>1, 'map'=>1), 'noshade'=>array('hr'=>1), 'nowrap'=>array('td'=>1, 'th'=>1), 'size'=>array('hr'=>1), 'start'=>array('ol'=>1), 'type'=>array('li'=>1, 'ol'=>1, 'ul'=>1), 'value'=>array('li'=>1), 'vspace'=>array('img'=>1, 'object'=>1), 'width'=>array('hr'=>1, 'pre'=>1, 'td'=>1, 'th'=>1));
- static $eAD = array('a'=>1, 'br'=>1, 'caption'=>1, 'div'=>1, 'dl'=>1, 'form'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'hr'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'legend'=>1, 'li'=>1, 'map'=>1, 'object'=>1, 'ol'=>1, 'p'=>1, 'pre'=>1, 'script'=>1, 'table'=>1, 'td'=>1, 'th'=>1, 'tr'=>1, 'ul'=>1);
+ // depr attr:applicable ele
+ static $aND = array('align'=>array('caption'=>1, 'div'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'hr'=>1, 'img'=>1, 'input'=>1, 'legend'=>1, 'object'=>1, 'p'=>1, 'table'=>1), 'bgcolor'=>array('table'=>1, 'td'=>1, 'th'=>1, 'tr'=>1), 'border'=>array('object'=>1), 'bordercolor'=>array('table'=>1, 'td'=>1, 'tr'=>1), 'cellspacing'=>array('table'=>1), 'clear'=>array('br'=>1), 'compact'=>array('dl'=>1, 'ol'=>1, 'ul'=>1), 'height'=>array('td'=>1, 'th'=>1), 'hspace'=>array('img'=>1, 'object'=>1), 'language'=>array('script'=>1), 'name'=>array('a'=>1, 'form'=>1, 'iframe'=>1, 'img'=>1, 'map'=>1), 'noshade'=>array('hr'=>1), 'nowrap'=>array('td'=>1, 'th'=>1), 'size'=>array('hr'=>1), 'vspace'=>array('img'=>1, 'object'=>1), 'width'=>array('hr'=>1, 'pre'=>1, 'table'=>1, 'td'=>1, 'th'=>1));
+ static $eAD = array('a'=>1, 'br'=>1, 'caption'=>1, 'div'=>1, 'dl'=>1, 'form'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'hr'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'legend'=>1, 'map'=>1, 'object'=>1, 'ol'=>1, 'p'=>1, 'pre'=>1, 'script'=>1, 'table'=>1, 'td'=>1, 'th'=>1, 'tr'=>1, 'ul'=>1);
$depTr = isset($eAD[$e]) ? 1 : 0;
}
@@ -472,7 +476,7 @@ while(strlen($a)){
$w = 0;
switch($mode){
case 0: // Name
- if(preg_match('`^[a-zA-Z][\-a-zA-Z:]+`', $a, $m)){
+ if(preg_match('`^[a-zA-Z][^\s=/]+`', $a, $m)){
$nm = strtolower($m[0]);
$w = $mode = 1; $a = ltrim(substr_replace($a, '', 0, strlen($m[0])));
}
@@ -500,9 +504,9 @@ if($mode == 1){$aA[$nm] = '';}
// clean attrs
global $S;
$rl = isset($S[$e]) ? $S[$e] : array();
-$a = array(); $nfr = 0;
+$a = array(); $nfr = 0; $d = $C['deny_attribute'];
foreach($aA as $k=>$v){
- if(((isset($C['deny_attribute']['*']) ? isset($C['deny_attribute'][$k]) : !isset($C['deny_attribute'][$k])) && (isset($aN[$k][$e]) or (isset($aNU[$k]) && !isset($aNU[$k][$e]))) && !isset($rl['n'][$k]) && !isset($rl['n']['*'])) or isset($rl[$k])){
+ if(((isset($d['*']) ? isset($d[$k]) : !isset($d[$k])) && (isset($aN[$k][$e]) or isset($aNU[$k]) or (isset($aNO[$k]) && !isset($d['on*'])) or (isset($aNA[$k]) && !isset($d['aria*'])) or (!isset($d['data*']) && preg_match('`data-((?!xml)[^:]+$)`', $k))) && !isset($rl['n'][$k]) && !isset($rl['n']['*'])) or isset($rl[$k])){
if(isset($aNE[$k])){$v = $k;}
elseif(!empty($lcase) && (($e != 'button' or $e != 'input') or $k == 'type')){ // Rather loose but ?not cause issues
$v = (isset($aNL[($v2 = strtolower($v))])) ? $v2 : $v;
@@ -514,9 +518,26 @@ foreach($aA as $k=>$v){
}
$v = preg_replace_callback('`(url(?:\()(?: )*(?:\'|"|&(?:quot|apos);)?)(.+?)((?:\'|"|&(?:quot|apos);)?(?: )*(?:\)))`iS', 'htmLawed::hl_prot', $v);
$v = !$C['css_expression'] ? preg_replace('`expression`i', ' ', preg_replace('`\\\\\S|(/|(%2f))(\*|(%2a))`i', ' ', $v)) : $v;
- }elseif(isset($aNP[$k]) or strpos($k, 'src') !== false or $k[0] == 'o'){
- $v = str_replace("Â", ' ', (strpos($v, '&') !== false ? str_replace(array('', '', ''), ' ', $v) : $v)); # double-quoted char is soft-hyphen; appears here as "Â" or hyphen or something else depending on viewing software
- $v = htmLawed::hl_prot($v, $k);
+ }elseif(isset($aNP[$k]) or isset($aNO[$k])){
+ $v = str_replace("Â", ' ', (strpos($v, '&') !== false ? str_replace(array('', '', ''), ' ', $v) : $v)); # double-quoted char: soft-hyphen; appears here as "Â" or hyphen or something else depending on viewing software
+ if($k == 'srcset'){
+ $v2 = '';
+ foreach(explode(',', $v) as $k1=>$v1){
+ $v1 = explode(' ', ltrim($v1), 2);
+ $k1 = isset($v1[1]) ? trim($v1[1]) : '';
+ $v1 = trim($v1[0]);
+ if(isset($v1[0])){$v2 .= htmLawed::hl_prot($v1, $k). (empty($k1) ? '' : ' '. $k1). ', ';}
+ }
+ $v = trim($v2, ', ');
+ }
+ if($k == 'itemtype'){
+ $v2 = '';
+ foreach(explode(' ', $v) as $v1){
+ if(isset($v1[0])){$v2 .= htmLawed::hl_prot($v1, $k). ' ';}
+ }
+ $v = trim($v2, ' ');
+ }
+ else{$v = htmLawed::hl_prot($v, $k);}
if($k == 'href'){ // X-spam
if($C['anti_mail_spam'] && strpos($v, 'mailto:') === 0){
$v = str_replace('@', htmlspecialchars($C['anti_mail_spam']), $v);
@@ -541,18 +562,19 @@ foreach($aA as $k=>$v){
if($nfr){$a['rel'] = isset($a['rel']) ? $a['rel']. ' nofollow' : 'nofollow';}
// rqd attr
-static $eAR = array('area'=>array('alt'=>'area'), 'bdo'=>array('dir'=>'ltr'), 'form'=>array('action'=>''), 'img'=>array('src'=>'', 'alt'=>'image'), 'map'=>array('name'=>''), 'optgroup'=>array('label'=>''), 'param'=>array('name'=>''), 'script'=>array('type'=>'text/javascript'), 'textarea'=>array('rows'=>'10', 'cols'=>'50'));
+static $eAR = array('area'=>array('alt'=>'area'), 'bdo'=>array('dir'=>'ltr'), 'command'=>array('label'=>''), 'form'=>array('action'=>''), 'img'=>array('src'=>'', 'alt'=>'image'), 'map'=>array('name'=>''), 'optgroup'=>array('label'=>''), 'param'=>array('name'=>''), 'style'=>array('scoped'=>''), 'textarea'=>array('rows'=>'10', 'cols'=>'50'));
if(isset($eAR[$e])){
foreach($eAR[$e] as $k=>$v){
if(!isset($a[$k])){$a[$k] = isset($v[0]) ? $v : $k;}
}
}
-// depr attrs
+// depr attr
if($depTr){
$c = array();
foreach($a as $k=>$v){
if($k == 'style' or !isset($aND[$k][$e])){continue;}
+ $v = str_replace(array('\\', ':', ';', ''), '', $v);
if($k == 'align'){
unset($a['align']);
if($e == 'img' && ($v == 'left' or $v == 'right')){$c[] = 'float: '. $v;}
@@ -565,6 +587,8 @@ if($depTr){
unset($a['border']); $c[] = "border: {$v}px";
}elseif($k == 'bordercolor'){
unset($a['bordercolor']); $c[] = 'border-color: '. $v;
+ }elseif($k == 'cellspacing'){
+ unset($a['cellspacing']); $c[] = "border-spacing: {$v}px";
}elseif($k == 'clear'){
unset($a['clear']); $c[] = 'clear: '. ($v != 'all' ? $v : 'both');
}elseif($k == 'compact'){
@@ -578,19 +602,13 @@ if($depTr){
$a['type'] = 'text/'. strtolower($v);
}elseif($k == 'name'){
if($C['no_deprecated_attr'] == 2 or ($e != 'a' && $e != 'map')){unset($a['name']);}
- if(!isset($a['id']) && preg_match('`[a-zA-Z][a-zA-Z\d.:_\-]*`', $v)){$a['id'] = $v;}
+ if(!isset($a['id']) && !preg_match('`\W`', $v)){$a['id'] = $v;}
}elseif($k == 'noshade'){
unset($a['noshade']); $c[] = 'border-style: none; border: 0; background-color: gray; color: gray';
}elseif($k == 'nowrap'){
unset($a['nowrap']); $c[] = 'white-space: nowrap';
}elseif($k == 'size'){
unset($a['size']); $c[] = 'size: '. $v. 'px';
- }elseif($k == 'start' or $k == 'value'){
- unset($a[$k]);
- }elseif($k == 'type'){
- unset($a['type']);
- static $ol_type = array('i'=>'lower-roman', 'I'=>'upper-roman', 'a'=>'lower-latin', 'A'=>'upper-latin', '1'=>'decimal');
- $c[] = 'list-style-type: '. (isset($ol_type[$v]) ? $ol_type[$v] : 'decimal');
}elseif($k == 'vspace'){
unset($a['vspace']); $c[] = "margin-top: {$v}px; margin-bottom: {$v}px";
}
@@ -602,7 +620,7 @@ if($depTr){
}
// unique ID
if($C['unique_ids'] && isset($a['id'])){
- if(!preg_match('`^[A-Za-z][A-Za-z0-9_\-.:]*$`', ($id = $a['id'])) or (isset($GLOBALS['hl_Ids'][$id]) && $C['unique_ids'] == 1)){unset($a['id']);
+ if(preg_match('`\s`', ($id = $a['id'])) or (isset($GLOBALS['hl_Ids'][$id]) && $C['unique_ids'] == 1)){unset($a['id']);
}else{
while(isset($GLOBALS['hl_Ids'][$id])){$id = $C['unique_ids']. $id;}
$GLOBALS['hl_Ids'][($a['id'] = $id)] = 1;
@@ -624,15 +642,14 @@ if(empty($C['hook_tag'])){
return "<{$e}{$aA}". (isset($eE[$e]) ? ' /' : ''). '>';
}
else{return $C['hook_tag']($e, $a);}
-// eof
}
public static function hl_tag2(&$e, &$a, $t=1){
// transform tag
-if($e == 'center'){$e = 'div'; return 'text-align: center;';}
-if($e == 'dir' or $e == 'menu'){$e = 'ul'; return '';}
+if($e == 'big'){$e = 'span'; return 'font-size: larger;';}
if($e == 's' or $e == 'strike'){$e = 'span'; return 'text-decoration: line-through;';}
-if($e == 'u'){$e = 'span'; return 'text-decoration: underline;';}
+if($e == 'tt'){$e = 'code'; return '';}
+if($e == 'center'){$e = 'div'; return 'text-align: center;';}
static $fs = array('0'=>'xx-small', '1'=>'xx-small', '2'=>'small', '3'=>'medium', '4'=>'large', '5'=>'x-large', '6'=>'xx-large', '7'=>'300%', '-1'=>'smaller', '-2'=>'60%', '+1'=>'larger', '+2'=>'150%', '+3'=>'200%', '+4'=>'300%');
if($e == 'font'){
$a2 = '';
@@ -646,15 +663,19 @@ if($e == 'font'){
}
$e = 'span'; return ltrim(str_replace('<', '', $a2));
}
+if($e == 'acronym'){$e = 'abbr'; return '';}
+if($e == 'dir'){$e = 'ul'; return '';}
if($t == 2){$e = 0; return 0;}
return '';
-// eof
}
public static function hl_tidy($t, $w, $p){
-// Tidy/compact HTM
+// tidy/compact HTM
if(strpos(' pre,script,textarea', "$p,")){return $t;}
-$t = preg_replace('`\s+`', ' ', preg_replace_callback(array('`(<(!\[CDATA\[))(.+?)(\]\]>)`sm', '`(<(!--))(.+?)(-->)`sm', '`(<(pre|script|textarea)[^>]*?>)(.+?)(\2>)`sm'), create_function('$m', 'return $m[1]. str_replace(array("<", ">", "\n", "\r", "\t", " "), array("\x01", "\x02", "\x03", "\x04", "\x05", "\x07"), $m[3]). $m[4];'), $t));
+if(!function_exists('hl_aux2')){function hl_aux2($m){
+ return $m[1]. str_replace(array("<", ">", "\n", "\r", "\t", ' '), array("\x01", "\x02", "\x03", "\x04", "\x05", "\x07"), $m[3]). $m[4];
+}}
+$t = preg_replace(array('`(<\w[^>]*(?)\s+`', '`\s+`', '`(<\w[^>]*(?) `'), array(' $1', ' ', '$1'), preg_replace_callback(array('`(<(!\[CDATA\[))(.+?)(\]\]>)`sm', '`(<(!--))(.+?)(-->)`sm', '`(<(pre|script|textarea)[^>]*?>)(.+?)(\2>)`sm'), 'hl_aux2', $t));
if(($w = strtolower($w)) == -1){
return str_replace(array("\x01", "\x02", "\x03", "\x04", "\x05", "\x07"), array('<', '>', "\n", "\r", "\t", ' '), $t);
}
@@ -662,9 +683,9 @@ $s = strpos(" $w", 't') ? "\t" : ' ';
$s = preg_match('`\d`', $w, $m) ? str_repeat($s, $m[0]) : str_repeat($s, ($s == "\t" ? 1 : 2));
$N = preg_match('`[ts]([1-9])`', $w, $m) ? $m[1] : 0;
$a = array('br'=>1);
-$b = array('button'=>1, 'input'=>1, 'option'=>1, 'param'=>1);
-$c = array('caption'=>1, 'dd'=>1, 'dt'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'isindex'=>1, 'label'=>1, 'legend'=>1, 'li'=>1, 'object'=>1, 'p'=>1, 'pre'=>1, 'td'=>1, 'textarea'=>1, 'th'=>1);
-$d = array('address'=>1, 'blockquote'=>1, 'center'=>1, 'colgroup'=>1, 'dir'=>1, 'div'=>1, 'dl'=>1, 'fieldset'=>1, 'form'=>1, 'hr'=>1, 'iframe'=>1, 'map'=>1, 'menu'=>1, 'noscript'=>1, 'ol'=>1, 'optgroup'=>1, 'rbc'=>1, 'rtc'=>1, 'ruby'=>1, 'script'=>1, 'select'=>1, 'table'=>1, 'tbody'=>1, 'tfoot'=>1, 'thead'=>1, 'tr'=>1, 'ul'=>1);
+$b = array('button'=>1, 'command'=>1, 'input'=>1, 'option'=>1, 'param'=>1, 'track'=>1);
+$c = array('audio'=>1, 'canvas'=>1, 'caption'=>1, 'dd'=>1, 'dt'=>1, 'figcaption'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'isindex'=>1, 'label'=>1, 'legend'=>1, 'li'=>1, 'object'=>1, 'p'=>1, 'pre'=>1, 'style'=>1, 'summary'=>1, 'td'=>1, 'textarea'=>1, 'th'=>1, 'video'=>1);
+$d = array('address'=>1, 'article'=>1, 'aside'=>1, 'blockquote'=>1, 'center'=>1, 'colgroup'=>1, 'datalist'=>1, 'details'=>1, 'dir'=>1, 'div'=>1, 'dl'=>1, 'fieldset'=>1, 'figure'=>1, 'footer'=>1, 'form'=>1, 'header'=>1, 'hgroup'=>1, 'hr'=>1, 'iframe'=>1, 'main'=>1, 'map'=>1, 'menu'=>1, 'nav'=>1, 'noscript'=>1, 'ol'=>1, 'optgroup'=>1, 'rbc'=>1, 'rtc'=>1, 'ruby'=>1, 'script'=>1, 'section'=>1, 'select'=>1, 'table'=>1, 'tbody'=>1, 'tfoot'=>1, 'thead'=>1, 'tr'=>1, 'ul'=>1);
$T = explode('<', $t);
$X = 1;
while($X){
@@ -703,33 +724,12 @@ if(($l = strpos(" $w", 'r') ? (strpos(" $w", 'n') ? "\r\n" : "\r") : 0)){
$t = str_replace("\n", $l, $t);
}
return str_replace(array("\x01", "\x02", "\x03", "\x04", "\x05", "\x07"), array('<', '>', "\n", "\r", "\t", ' '), $t);
-// eof
}
public static function hl_version(){
-// rel
-return '1.1.22';
-// eof
+// returns the htmLawed release number as a string
+return '1.2.4.1';
}
-public static function kses($t, $h, $p=array('http', 'https', 'ftp', 'news', 'nntp', 'telnet', 'gopher', 'mailto')){
-// kses compat
-foreach($h as $k=>$v){
- $h[$k]['n']['*'] = 1;
-}
-$C['cdata'] = $C['comment'] = $C['make_tag_strict'] = $C['no_deprecated_attr'] = $C['unique_ids'] = 0;
-$C['keep_bad'] = 1;
-$C['elements'] = count($h) ? strtolower(implode(',', array_keys($h))) : '-*';
-$C['hook'] = 'htmLawed::kses_hook';
-$C['schemes'] = '*:'. implode(',', $p);
-return htmLawed::hl($t, $C, $h);
-// eof
-}
-
-public static function kses_hook($t, &$C, &$S){
-// kses compat
-return $t;
-// eof
-}
// end class
}
diff --git a/libraries/html5php/HTML5/Parser/DOMTreeBuilder.php b/libraries/html5php/HTML5/Parser/DOMTreeBuilder.php
index ccad229..b26860d 100644
--- a/libraries/html5php/HTML5/Parser/DOMTreeBuilder.php
+++ b/libraries/html5php/HTML5/Parser/DOMTreeBuilder.php
@@ -274,7 +274,8 @@ class DOMTreeBuilder implements EventHandler
// SPECIAL TAG HANDLING:
// Spec says do this, and "don't ask."
- if ($name == 'image') {
+ // find the spec where this is defined... looks problematic
+ if ($name == 'image' && !($this->insertMode === static::IM_IN_SVG || $this->insertMode === static::IM_IN_MATHML)) {
$name = 'img';
}
@@ -681,4 +682,4 @@ class DOMTreeBuilder implements EventHandler
{
return $this->current->tagName == $tagname;
}
-}
\ No newline at end of file
+}
diff --git a/libraries/html5php/HTML5/Parser/Tokenizer.php b/libraries/html5php/HTML5/Parser/Tokenizer.php
index 02b2aff..c42bc3d 100644
--- a/libraries/html5php/HTML5/Parser/Tokenizer.php
+++ b/libraries/html5php/HTML5/Parser/Tokenizer.php
@@ -83,11 +83,8 @@ class Tokenizer
*/
public function parse()
{
- $p = 0;
do {
- $p = $this->scanner->position();
$this->consumeData();
-
// FIXME: Add infinite loop protection.
} while ($this->carryOn);
}
@@ -145,7 +142,8 @@ class Tokenizer
*/
protected function characterData()
{
- if ($this->scanner->current() === false) {
+ $tok = $this->scanner->current();
+ if ($tok === false) {
return false;
}
switch ($this->textMode) {
@@ -154,7 +152,6 @@ class Tokenizer
case Elements::TEXT_RCDATA:
return $this->rcdata();
default:
- $tok = $this->scanner->current();
if (strspn($tok, "<&")) {
return false;
}
@@ -408,24 +405,26 @@ class Tokenizer
if ($tok == '/') {
$this->scanner->next();
$this->scanner->whitespace();
- if ($this->scanner->current() == '>') {
+ $tok = $this->scanner->current();
+
+ if ($tok == '>') {
$selfClose = true;
return true;
}
- if ($this->scanner->current() === false) {
+ if ($tok === false) {
$this->parseError("Unexpected EOF inside of tag.");
return true;
}
// Basically, we skip the / token and go on.
// See 8.2.4.43.
- $this->parseError("Unexpected '%s' inside of a tag.", $this->scanner->current());
+ $this->parseError("Unexpected '%s' inside of a tag.", $tok);
return false;
}
- if ($this->scanner->current() == '>') {
+ if ($tok == '>') {
return true;
}
- if ($this->scanner->current() === false) {
+ if ($tok === false) {
$this->parseError("Unexpected EOF inside of tag.");
return true;
}
@@ -541,15 +540,21 @@ class Tokenizer
{
$stoplist = "\f" . $quote;
$val = '';
- $tok = $this->scanner->current();
- while (strspn($tok, $stoplist) == 0 && $tok !== false) {
- if ($tok == '&') {
- $val .= $this->decodeCharacterReference(true);
- $tok = $this->scanner->current();
+
+ while (true) {
+ $tokens = $this->scanner->charsUntil($stoplist.'&');
+ if ($tokens !== false) {
+ $val .= $tokens;
} else {
- $val .= $tok;
- $tok = $this->scanner->next();
+ break;
}
+
+ $tok = $this->scanner->current();
+ if ($tok == '&') {
+ $val .= $this->decodeCharacterReference(true, $tok);
+ continue;
+ }
+ break;
}
$this->scanner->next();
return $val;
@@ -591,18 +596,18 @@ class Tokenizer
*/
protected function bogusComment($leading = '')
{
-
- // TODO: This can be done more efficiently when the
- // scanner exposes a readUntil() method.
$comment = $leading;
+ $tokens = $this->scanner->charsUntil('>');
+ if ($tokens !== false) {
+ $comment .= $tokens;
+ }
$tok = $this->scanner->current();
- do {
+ if ($tok !== false) {
$comment .= $tok;
- $tok = $this->scanner->next();
- } while ($tok !== false && $tok != '>');
+ }
$this->flushBuffer();
- $this->events->comment($comment . $tok);
+ $this->events->comment($comment);
$this->scanner->next();
return true;
@@ -646,15 +651,17 @@ class Tokenizer
*/
protected function isCommentEnd()
{
+ $tok = $this->scanner->current();
+
// EOF
- if ($this->scanner->current() === false) {
+ if ($tok === false) {
// Hit the end.
$this->parseError("Unexpected EOF in a comment.");
return true;
}
// If it doesn't start with -, not the end.
- if ($this->scanner->current() != '-') {
+ if ($tok != '-') {
return false;
}
@@ -737,7 +744,6 @@ class Tokenizer
$pub = strtoupper($this->scanner->getAsciiAlpha());
$white = strlen($this->scanner->whitespace());
- $tok = $this->scanner->current();
// Get ID, and flag it as pub or system.
if (($pub == 'PUBLIC' || $pub == 'SYSTEM') && $white > 0) {
@@ -938,10 +944,11 @@ class Tokenizer
$len = strlen($sequence);
$buffer = '';
for ($i = 0; $i < $len; ++ $i) {
- $buffer .= $this->scanner->current();
+ $tok = $this->scanner->current();
+ $buffer .= $tok;
// EOF. Rewind and let the caller handle it.
- if ($this->scanner->current() === false) {
+ if ($tok === false) {
$this->scanner->unconsume($i);
return false;
}
@@ -1067,18 +1074,22 @@ class Tokenizer
}
$entity = CharacterReference::lookupDecimal($numeric);
}
- } // String entity.
- else {
+ } elseif ($tok === '=' && $inAttribute) {
+ return '&';
+ } else { // String entity.
+
// Attempt to consume a string up to a ';'.
// [a-zA-Z0-9]+;
- $cname = $this->scanner->getAsciiAlpha();
+ $cname = $this->scanner->getAsciiAlphaNum();
$entity = CharacterReference::lookupName($cname);
// When no entity is found provide the name of the unmatched string
// and continue on as the & is not part of an entity. The & will
// be converted to & elsewhere.
if ($entity == null) {
- $this->parseError("No match in entity table for '%s'", $cname);
+ if (!$inAttribute || strlen($cname) === 0) {
+ $this->parseError("No match in entity table for '%s'", $cname);
+ }
$this->scanner->unconsume($this->scanner->position() - $start);
return '&';
}
diff --git a/libraries/html5php/README.md b/libraries/html5php/README.md
index 505a85f..e2cfdf9 100644
--- a/libraries/html5php/README.md
+++ b/libraries/html5php/README.md
@@ -1,14 +1,16 @@
# HTML5-PHP
-The need for an HTML5 parser in PHP is clear. This project initially
-began with the seemingly abandoned `html5lib` project [original source](https://code.google.com/p/html5lib/source/checkout).
-But after some initial refactoring work, we began a new parser.
+HTML5 is a standards-compliant HTML5 parser and writer written entirely in PHP.
+It is stable and used in many production websites, and has
+well over [one million downloads](https://packagist.org/packages/masterminds/html5).
+
+HTML5 provides the following features.
- An HTML5 serializer
- Support for PHP namespaces
- Composer support
- Event-based (SAX-like) parser
-- DOM tree builder
+- A DOM tree builder
- Interoperability with [QueryPath](https://github.com/technosophos/querypath)
- Runs on **PHP** 5.3.0 or newer and **HHVM** 3.2 or newer
@@ -16,6 +18,7 @@ But after some initial refactoring work, we began a new parser.
[![Latest Stable Version](https://poser.pugx.org/masterminds/html5/v/stable.png)](https://packagist.org/packages/masterminds/html5)
[![Code Coverage](https://scrutinizer-ci.com/g/Masterminds/html5-php/badges/coverage.png?b=master)](https://scrutinizer-ci.com/g/Masterminds/html5-php/?branch=master)
[![Scrutinizer Code Quality](https://scrutinizer-ci.com/g/Masterminds/html5-php/badges/quality-score.png?b=master)](https://scrutinizer-ci.com/g/Masterminds/html5-php/?branch=master)
+[![Stability: Sustained](https://masterminds.github.io/stability/sustained.svg)](https://masterminds.github.io/stability/sustained.html)
## Installation
@@ -23,7 +26,7 @@ Install HTML5-PHP using [composer](http://getcomposer.org/).
To install, add `masterminds/html5` to your `composer.json` file:
-```
+```json
{
"require" : {
"masterminds/html5": "2.*"
diff --git a/libraries/html5php/RELEASE.md b/libraries/html5php/RELEASE.md
index 56d5fa1..b4ddf82 100644
--- a/libraries/html5php/RELEASE.md
+++ b/libraries/html5php/RELEASE.md
@@ -1,6 +1,13 @@
# Release Notes
-2.2.2 (2016-10-22)
+2.3.0 (2017-09-04)
+
+- #129: image within inline svg breaks system (fixed by #133)
+- #131: `&sup2;` does not work (fixed by #132)
+- #134: Improve tokenizer performance by 20% (alternative version of #130 thanks to @MichaelHeerklotz)
+- #135: Raw & in attributes
+
+2.2.2 (2016-09-22)
- #116: In XML mode, tags are case sensitive
- #115: Fix PHP Notice in OutputRules
@@ -14,8 +21,7 @@
2.2.0 (2016-04-11)
- #105: Enable composer cache (for CI/CD)
-- #100: Use mb_substitute_character inset of ini_set for environments where
- ini_set is disable (e.g., shared hosting)
+- #100: Use mb_substitute_character instead of ini_set for environments where ini_set is disabled (e.g., shared hosting)
- #98: Allow link, meta, style tags in noscript tags
- #96: Fixed xml:href on svgs that use the "use" breaking
- #94: Counting UTF8 characters performance improvement
diff --git a/libraries/humble-http-agent/HumbleHttpAgent.php b/libraries/humble-http-agent/HumbleHttpAgent.php
index 605a6ad..db0040d 100644
--- a/libraries/humble-http-agent/HumbleHttpAgent.php
+++ b/libraries/humble-http-agent/HumbleHttpAgent.php
@@ -7,8 +7,8 @@
* For environments which do not have these options, it reverts to standard sequential
* requests (using file_get_contents())
*
- * @version 1.7
- * @date 2016-11-28
+ * @version 1.8
+ * @date 2017-09-25
* @see http://devel-m6w6.rhcloud.com/mdref/http
* @author Keyvan Minoukadeh
* @copyright 2011-2016 Keyvan Minoukadeh
@@ -21,8 +21,9 @@ class HumbleHttpAgent
const METHOD_CURL_MULTI = 2;
const METHOD_FILE_GET_CONTENTS = 4;
//const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1';
- const UA_BROWSER = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36';
- const UA_PHP = 'PHP/5.6';
+ // popular user agents from https://techblog.willshouse.com/2012/01/03/most-common-user-agents/
+ const UA_BROWSER = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36';
+ const UA_PHP = 'PHP/7.1';
const REF_GOOGLE = 'http://www.google.co.uk/url?sa=t&source=web&cd=1';
protected $requests = array();
@@ -194,6 +195,24 @@ class HumbleHttpAgent
public function getMetaRefreshURL($url, $html) {
if ($html == '') return false;
+
+ // TODO: parse HTML properly
+ // For now, to deal with cases where meta refresh matches but shouldn't, e.g. CNN's
+ //
+ // we do the string replacements in the site config file before looking for the meta refresh
+ if (isset($this->siteConfigBuilder)) {
+ $sconfig = $this->siteConfigBuilder->buildSiteConfig($url);
+ // do string replacements
+ if (!empty($sconfig->find_string)) {
+ if (count($sconfig->find_string) == count($sconfig->replace_string)) {
+ $html = str_replace($sconfig->find_string, $sconfig->replace_string, $html, $_count);
+ //$this->debug("Strings replaced: $_count (find_string and/or replace_string)");
+ } else {
+ //$this->debug('Skipped string replacement - incorrect number of find-replace strings in site config');
+ }
+ }
+ }
+
//
if (!preg_match('!]+)["\']?!i', $html, $match)) {
return false;
@@ -211,7 +230,7 @@ class HumbleHttpAgent
if (isset($base->path)) $base->path = preg_replace('!//+!', '/', $base->path);
if ($absolute = SimplePie_IRI::absolutize($base, $redirect_url)) {
$this->debug('Meta refresh redirect found (http-equiv="refresh"), new URL: '.$absolute);
- return $absolute->get_iri();
+ return $absolute->get_uri();
}
return false;
}
@@ -248,6 +267,21 @@ class HumbleHttpAgent
}
}
+ public function convertIdn($url) { // Convert an internationalized (IDN) host in $url to its ASCII/punycode form; returns the URL string, unchanged when no conversion is possible
+ if (function_exists('idn_to_ascii')) { // ext/intl may not be installed
+ if ($host = @parse_url($url, PHP_URL_HOST)) {
+ $puny = idn_to_ascii($host, 0, INTL_IDNA_VARIANT_UTS46);
+ if (is_string($puny) && $puny !== '' && $host != $puny) { // idn_to_ascii() returns false on failure: never splice that into the URL
+ $pos = strpos($url, $host);
+ if ($pos !== false) {
+ $url = substr_replace($url, $puny, $pos, strlen($host)); // swap only the first occurrence of the host
+ }
+ }
+ }
+ }
+ return $url;
+ }
+
public function rewriteUrls($url) {
foreach ($this->rewriteUrls as $find => $action) {
if (strpos($url, $find) !== false) {
@@ -327,6 +361,7 @@ class HumbleHttpAgent
} else {
$this->debug("......adding to pool");
$req_url = $this->rewriteUrls($url);
+ $req_url = $this->convertIdn($req_url);
$req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url;
$req_url = $this->removeFragment($req_url);
if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) {
@@ -507,6 +542,7 @@ class HumbleHttpAgent
} else {
$this->debug("......adding to pool");
$req_url = $this->rewriteUrls($url);
+ $req_url = $this->convertIdn($req_url);
$req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url;
$req_url = $this->removeFragment($req_url);
if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) {
@@ -649,6 +685,7 @@ class HumbleHttpAgent
$this->debug("Sending request for $url");
$this->requests[$orig]['original_url'] = $orig;
$req_url = $this->rewriteUrls($url);
+ $req_url = $this->convertIdn($req_url);
$req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url;
$req_url = $this->removeFragment($req_url);
$httpContext = $this->httpContext;
diff --git a/libraries/language-detect/LanguageDetect.php b/libraries/language-detect/LanguageDetect.php
index a6922fa..52a4ee0 100644
--- a/libraries/language-detect/LanguageDetect.php
+++ b/libraries/language-detect/LanguageDetect.php
@@ -1,5 +1,4 @@
_data_dir will be ignored
*
- * @var string
- * @access private
+ * @var string
*/
- var $_db_filename = 'lang.dat';
+ protected $_db_filename = 'lang.dat';
/**
* The filename that stores the unicode block definitions
@@ -85,83 +83,74 @@ class Text_LanguageDetect
* $this->_data_dir will be ignored
*
* @var string
- * @access private
*/
- var $_unicode_db_filename = 'unicode_blocks.dat';
+ protected $_unicode_db_filename = 'unicode_blocks.dat';
/**
* The data directory
*
* Should be set by PEAR installer
*
- * @var string
- * @access private
+ * @var string
*/
- var $_data_dir = '@data_dir@';
+ protected $_data_dir = '@data_dir@';
/**
* The trigram data for comparison
*
* Will be loaded on start from $this->_db_filename
*
- * @var array
- * @access private
- */
- var $_lang_db = array();
-
- /**
- * stores the map of the trigram data to unicode characters
- *
- * @access private
* @var array
*/
- var $_unicode_map;
+ protected $_lang_db = array();
+
+ /**
+ * Stores the map of the trigram data to unicode characters
+ *
+ * @var array
+ */
+ protected $_unicode_map;
/**
* The size of the trigram data arrays
*
- * @var int
- * @access private
+ * @var int
*/
- var $_threshold = 300;
+ protected $_threshold = 300;
/**
- * the maximum possible score.
+ * The maximum possible score.
*
- * needed for score normalization. Different depending on the
+ * Needed for score normalization. Different depending on the
* perl compatibility setting
*
- * @access private
- * @var int
- * @see setPerlCompatible()
+ * @var int
+ * @see setPerlCompatible()
*/
- var $_max_score = 0;
+ protected $_max_score = 0;
/**
* Whether or not to simulate perl's Language::Guess exactly
*
- * @access private
- * @var bool
- * @see setPerlCompatible()
+ * @var bool
+ * @see setPerlCompatible()
*/
- var $_perl_compatible = false;
+ protected $_perl_compatible = false;
/**
* Whether to use the unicode block detection to speed up processing
*
- * @access private
* @var bool
*/
- var $_use_unicode_narrowing = true;
+ protected $_use_unicode_narrowing = true;
/**
- * stores the result of the clustering operation
+ * Stores the result of the clustering operation
*
- * @access private
- * @var array
- * @see clusterLanguages()
+ * @var array
+ * @see clusterLanguages()
*/
- var $_clusters;
+ protected $_clusters;
/**
* Which type of "language names" are accepted and returned:
@@ -170,7 +159,7 @@ class Text_LanguageDetect
* 2 - 2-letter ISO 639-1 code ("en")
* 3 - 3-letter ISO 639-2 code ("eng")
*/
- var $_name_mode = 0;
+ protected $_name_mode = 0;
/**
* Constructor
@@ -178,7 +167,7 @@ class Text_LanguageDetect
* Will attempt to load the language database. If it fails, you will get
* an exception.
*/
- function __construct()
+ public function __construct()
{
$data = $this->_readdb($this->_db_filename);
$this->_checkTrigram($data['trigram']);
@@ -200,9 +189,8 @@ class Text_LanguageDetect
* @param string $fname File name to load
*
* @return string expected path to the language model database
- * @access private
*/
- function _get_data_loc($fname)
+ protected function _get_data_loc($fname)
{
return dirname(__FILE__).'/'.$fname;
}
@@ -216,9 +204,8 @@ class Text_LanguageDetect
*
* @return array the language model data
* @throws Text_LanguageDetect_Exception
- * @access private
*/
- function _readdb($fname)
+ protected function _readdb($fname)
{
// finds the correct data dir
$fname = $this->_get_data_loc($fname);
@@ -246,9 +233,8 @@ class Text_LanguageDetect
* @param array $trigram Trigram data from database
*
* @return void
- * @access private
*/
- function _checkTrigram($trigram)
+ protected function _checkTrigram($trigram)
{
if (!is_array($trigram)) {
if (ini_get('magic_quotes_runtime')) {
@@ -340,11 +326,10 @@ class Text_LanguageDetect
/**
* Returns the number of languages that this object can detect
*
- * @access public
* @return int the number of languages
- * @throws Text_LanguageDetect_Exception
+ * @throws Text_LanguageDetect_Exception
*/
- function getLanguageCount()
+ public function getLanguageCount()
{
return count($this->_lang_db);
}
@@ -382,11 +367,10 @@ class Text_LanguageDetect
/**
* Returns the list of detectable languages
*
- * @access public
* @return array the names of the languages known to this object<<<<<<<
- * @throws Text_LanguageDetect_Exception
+ * @throws Text_LanguageDetect_Exception
*/
- function getLanguages()
+ public function getLanguages()
{
return $this->_convertToNameMode(
array_keys($this->_lang_db)
@@ -424,7 +408,7 @@ class Text_LanguageDetect
*
* @return void
*/
- function setNameMode($name_mode)
+ public function setNameMode($name_mode)
{
$this->_name_mode = $name_mode;
}
@@ -454,10 +438,9 @@ class Text_LanguageDetect
* @param string $text text to convert
*
* @return array array of trigram frequencies
- * @access private
* @deprecated Superceded by the Text_LanguageDetect_Parser class
*/
- function _trigram($text)
+ protected function _trigram($text)
{
$s = new Text_LanguageDetect_Parser($text);
$s->prepareTrigram();
@@ -475,9 +458,8 @@ class Text_LanguageDetect
* @param array $arr array of trigram
*
* @return array ranks of trigrams
- * @access protected
*/
- function _arr_rank($arr)
+ protected function _arr_rank($arr)
{
// sorts alphabetically first as a standard way of breaking rank ties
@@ -505,12 +487,11 @@ class Text_LanguageDetect
/**
* Sorts an array by value breaking ties alphabetically
*
- * @param array &$arr the array to sort
+ * @param array $arr the array to sort
*
* @return void
- * @access private
*/
- function _bub_sort(&$arr)
+ protected function _bub_sort(&$arr)
{
// should do the same as this perl statement:
// sort { $trigrams{$b} == $trigrams{$a}
@@ -548,9 +529,8 @@ class Text_LanguageDetect
*
* @return int 1 if $a is greater, -1 if not
* @see _bub_sort()
- * @access private
*/
- function _sort_func($a, $b)
+ protected function _sort_func($a, $b)
{
// each is actually a key/value pair, so that it can compare using both
list($a_key, $a_value) = $a;
@@ -588,9 +568,8 @@ class Text_LanguageDetect
*
* @return int the sum of the differences between the ranks of
* the two trigram sets
- * @access private
*/
- function _distance($arr1, $arr2)
+ protected function _distance($arr1, $arr2)
{
$sumdist = 0;
@@ -621,9 +600,8 @@ class Text_LanguageDetect
*
* @return float the normalized score
* @see _distance()
- * @access private
*/
- function _normalize_score($score, $base_count = null)
+ protected function _normalize_score($score, $base_count = null)
{
if ($base_count === null) {
$base_count = $this->_threshold;
@@ -699,7 +677,7 @@ class Text_LanguageDetect
$sample_obj->setPadStart(!$this->_perl_compatible);
$sample_obj->analyze();
- $trigram_freqs =& $sample_obj->getTrigramRanks();
+ $trigram_freqs = $sample_obj->getTrigramRanks();
$trigram_count = count($trigram_freqs);
if ($trigram_count == 0) {
@@ -710,7 +688,7 @@ class Text_LanguageDetect
// use unicode block detection to narrow down the possibilities
if ($this->_use_unicode_narrowing) {
- $blocks =& $sample_obj->getUnicodeBlocks();
+ $blocks = $sample_obj->getUnicodeBlocks();
if (is_array($blocks)) {
$present_blocks = array_keys($blocks);
@@ -962,16 +940,15 @@ class Text_LanguageDetect
*
* @return mixed Block name, -1 if it failed
* @see unicodeBlockName()
- * @access protected
*/
- function _unicode_block_name($unicode, $blocks, $block_count = -1)
+ protected function _unicode_block_name($unicode, $blocks, $block_count = -1)
{
// for a reference, see
// http://www.unicode.org/Public/UNIDATA/Blocks.txt
// assume that ascii characters are the most common
// so try it first for efficiency
- if ($unicode <= hexdec($blocks[0][1])) {
+ if ($unicode <= $blocks[0][1]) {
return $blocks[0];
}
@@ -989,11 +966,11 @@ class Text_LanguageDetect
while ($low <= $high) {
$mid = floor(($low + $high) / 2);
- if ($unicode < hexdec($blocks[$mid][0])) {
+ if ($unicode < $blocks[$mid][0]) {
// if it's lower than the lower bound
$high = $mid - 1;
- } elseif ($unicode > hexdec($blocks[$mid][1])) {
+ } elseif ($unicode > $blocks[$mid][1]) {
// if it's higher than the upper bound
$low = $mid + 1;
@@ -1015,9 +992,8 @@ class Text_LanguageDetect
*
* @return array the database of unicode block definitions
* @throws Text_LanguageDetect_Exception
- * @access protected
*/
- function _read_unicode_block_db()
+ protected function _read_unicode_block_db()
{
// since the unicode definitions are always going to be the same,
// might as well share the memory for the db with all other instances
@@ -1136,14 +1112,13 @@ class Text_LanguageDetect
* Uses a nearest neighbor technique to generate the maximum possible
* number of dendograms from the similarity data.
*
- * @access public
- * @return array language cluster data
- * @throws Text_LanguageDetect_Exception
- * @see languageSimilarity()
- * @deprecated this function will eventually be removed and placed into
+ * @return array language cluster data
+ * @throws Text_LanguageDetect_Exception
+ * @see languageSimilarity()
+ * @deprecated this function will eventually be removed and placed into
* the model generation class
*/
- function clusterLanguages()
+ public function clusterLanguages()
{
// todo: set the maximum number of clusters
// return cached result, if any
@@ -1452,7 +1427,7 @@ class Text_LanguageDetect
}
/**
- * ut8-safe strlen()
+ * UTF8-safe strlen()
*
* Returns the numbers of characters (not bytes) in a utf8 string
*
@@ -1476,10 +1451,9 @@ class Text_LanguageDetect
* @param string $char a utf8 (possibly multi-byte) char
*
* @return int unicode value
- * @access protected
* @link http://en.wikipedia.org/wiki/UTF-8
*/
- function _utf8char2unicode($char)
+ protected function _utf8char2unicode($char)
{
// strlen() here will actually get the binary length of a single char
switch (strlen($char)) {
@@ -1516,20 +1490,19 @@ class Text_LanguageDetect
}
/**
- * utf8-safe fast character iterator
+ * UTF8-safe fast character iterator
*
* Will get the next character starting from $counter, which will then be
* incremented. If a multi-byte char the bytes will be concatenated and
* $counter will be incremeted by the number of bytes in the char.
*
* @param string $str the string being iterated over
- * @param int &$counter the iterator, will increment by reference
+ * @param int $counter the iterator, will increment by reference
* @param bool $special_convert whether to do special conversions
*
* @return char the next (possibly multi-byte) char from $counter
- * @access private
*/
- static function _next_char($str, &$counter, $special_convert = false)
+ protected static function _next_char($str, &$counter, $special_convert = false)
{
$char = $str{$counter++};
$ord = ord($char);
@@ -1621,7 +1594,7 @@ class Text_LanguageDetect
*
* @return string|array Language name
*/
- function _convertFromNameMode($lang, $convertKey = false)
+ protected function _convertFromNameMode($lang, $convertKey = false)
{
if ($this->_name_mode == 0) {
return $lang;
@@ -1661,7 +1634,7 @@ class Text_LanguageDetect
*
* @return string|array Language name
*/
- function _convertToNameMode($lang, $convertKey = false)
+ protected function _convertToNameMode($lang, $convertKey = false)
{
if ($this->_name_mode == 0) {
return $lang;
@@ -1688,6 +1661,4 @@ class Text_LanguageDetect
}
return $newlang;
}
-}
-
-/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */
\ No newline at end of file
+}
\ No newline at end of file
diff --git a/libraries/language-detect/LanguageDetect/Exception.php b/libraries/language-detect/LanguageDetect/Exception.php
index 196d994..6566e6b 100644
--- a/libraries/language-detect/LanguageDetect/Exception.php
+++ b/libraries/language-detect/LanguageDetect/Exception.php
@@ -1,4 +1,16 @@
+ * @license BSD http://www.opensource.org/licenses/bsd-license.php
+ * @link http://pear.php.net/package/Text_LanguageDetect/
+ */
+
class Text_LanguageDetect_Exception extends Exception
{
/**
diff --git a/libraries/language-detect/LanguageDetect/ISO639.php b/libraries/language-detect/LanguageDetect/ISO639.php
index 05b0590..e76dd2d 100644
--- a/libraries/language-detect/LanguageDetect/ISO639.php
+++ b/libraries/language-detect/LanguageDetect/ISO639.php
@@ -1,18 +1,4 @@
- * @copyright 2011 Christian Weiske
- * @license http://www.debian.org/misc/bsd.license BSD
- * @version SVN: $Id$
- * @link http://pear.php.net/package/Text_LanguageDetect/
- */
-
/**
* Provides a mapping between the languages from lang.dat and the
* ISO 639-1 and ISO-639-2 codes.
@@ -23,7 +9,7 @@
* @package Text_LanguageDetect
* @author Christian Weiske
* @copyright 2011 Christian Weiske
- * @license http://www.debian.org/misc/bsd.license BSD
+ * @license BSD http://www.opensource.org/licenses/bsd-license.php
* @link http://www.loc.gov/standards/iso639-2/php/code_list.php
*/
class Text_LanguageDetect_ISO639
diff --git a/libraries/language-detect/LanguageDetect/Parser.php b/libraries/language-detect/LanguageDetect/Parser.php
index e859218..3558b81 100644
--- a/libraries/language-detect/LanguageDetect/Parser.php
+++ b/libraries/language-detect/LanguageDetect/Parser.php
@@ -1,18 +1,4 @@
+ * @copyright 2006 Nicholas Pisarro
+ * @license BSD http://www.opensource.org/licenses/bsd-license.php
+ * @version Release: 1.0.0
+ * @link http://pear.php.net/package/Text_LanguageDetect/
*/
class Text_LanguageDetect_Parser extends Text_LanguageDetect
{
/**
- * the piece of text being parsed
+ * The piece of text being parsed
*
- * @access private
- * @var string
+ * @var string
*/
- var $_string;
+ protected $_string;
/**
- * stores the trigram frequencies of the sample
+ * Stores the trigram frequencies of the sample
*
- * @access private
- * @var string
+ * @var string
*/
- var $_trigrams = array();
+ protected $_trigrams = array();
/**
- * stores the trigram ranks of the sample
+ * Stores the trigram ranks of the sample
*
- * @access private
- * @var array
+ * @var array
*/
- var $_trigram_ranks = array();
+ protected $_trigram_ranks = array();
/**
- * stores the unicode blocks of the sample
+ * Stores the unicode blocks of the sample
*
- * @access private
- * @var array
+ * @var array
*/
- var $_unicode_blocks = array();
-
+ protected $_unicode_blocks = array();
+
/**
* Whether the parser should compile the unicode ranges
- *
- * @access private
- * @var bool
+ *
+ * @var bool
*/
- var $_compile_unicode = false;
+ protected $_compile_unicode = false;
/**
* Whether the parser should compile trigrams
*
- * @access private
- * @var bool
+ * @var bool
*/
- var $_compile_trigram = false;
+ protected $_compile_trigram = false;
/**
* Whether the trigram parser should pad the beginning of the string
*
- * @access private
- * @var bool
+ * @var bool
*/
- var $_trigram_pad_start = false;
+ protected $_trigram_pad_start = false;
/**
* Whether the unicode parser should skip non-alphabetical ascii chars
*
- * @access private
- * @var bool
+ * @var bool
*/
- var $_unicode_skip_symbols = true;
+ protected $_unicode_skip_symbols = true;
/**
* Constructor
*
- * @access private
- * @param string $string string to be parsed
+ * @param string $string string to be parsed
*/
- function __construct($string) {
+ public function __construct($string)
+ {
$this->_string = $string;
}
+ /**
+ * PHP 4 constructor for backwards compatibility.
+ *
+ * @param string $string string to be parsed
+ *
+ * @return void
+ */
+ public function Text_LanguageDetect_Parser($string)
+ {
+ self::__construct($string);
+ }
+
/**
* Returns true if a string is suitable for parsing
*
- * @param string $str input string to test
- * @return bool true if acceptable, false if not
+ * @param string $str input string to test
+ *
+ * @return bool true if acceptable, false if not
*/
- public static function validateString($str) {
+ public static function validateString($str)
+ {
if (!empty($str) && strlen($str) > 3 && preg_match('/\S/', $str)) {
return true;
} else {
@@ -121,34 +114,37 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect
}
/**
- * turn on/off trigram counting
+ * Turn on/off trigram counting
*
- * @access public
- * @param bool $bool true for on, false for off
+ * @param bool $bool true for on, false for off
+ *
+ * @return void
*/
- function prepareTrigram($bool = true)
+ public function prepareTrigram($bool = true)
{
$this->_compile_trigram = $bool;
}
/**
- * turn on/off unicode block counting
+ * Turn on/off unicode block counting
*
- * @access public
- * @param bool $bool true for on, false for off
+ * @param bool $bool true for on, false for off
+ *
+ * @return void
*/
- function prepareUnicode($bool = true)
+ public function prepareUnicode($bool = true)
{
$this->_compile_unicode = $bool;
}
/**
- * turn on/off padding the beginning of the sample string
+ * Turn on/off padding the beginning of the sample string
*
- * @access public
- * @param bool $bool true for on, false for off
+ * @param bool $bool true for on, false for off
+ *
+ * @return void
*/
- function setPadStart($bool = true)
+ public function setPadStart($bool = true)
{
$this->_trigram_pad_start = $bool;
}
@@ -156,10 +152,11 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect
/**
* Should the unicode block counter skip non-alphabetical ascii chars?
*
- * @access public
- * @param bool $bool true for on, false for off
+ * @param bool $bool true for on, false for off
+ *
+ * @return void
*/
- function setUnicodeSkipSymbols($bool = true)
+ public function setUnicodeSkipSymbols($bool = true)
{
$this->_unicode_skip_symbols = $bool;
}
@@ -167,10 +164,9 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect
/**
* Returns the trigram ranks for the text sample
*
- * @access public
- * @return array trigram ranks in the text sample
+ * @return array Trigram ranks in the text sample
*/
- function &getTrigramRanks()
+ public function getTrigramRanks()
{
return $this->_trigram_ranks;
}
@@ -178,39 +174,37 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect
/**
* Return the trigram freqency table
*
- * only used in testing to make sure the parser is working
+ * Only used in testing to make sure the parser is working
*
- * @access public
- * @return array trigram freqencies in the text sample
+ * @return array Trigram freqencies in the text sample
*/
- function &getTrigramFreqs()
+ public function getTrigramFreqs()
{
return $this->_trigram;
}
/**
- * returns the array of unicode blocks
+ * Returns the array of unicode blocks
*
- * @access public
- * @return array unicode blocks in the text sample
+ * @return array Unicode blocks in the text sample
*/
- function &getUnicodeBlocks()
+ public function getUnicodeBlocks()
{
return $this->_unicode_blocks;
}
/**
* Executes the parsing operation
- *
- * Be sure to call the set*() functions to set options and the
+ *
+ * Be sure to call the set*() functions to set options and the
* prepare*() functions first to tell it what kind of data to compute
*
* Afterwards the get*() functions can be used to access the compiled
* information.
*
- * @access public
+ * @return void
*/
- function analyze()
+ public function analyze()
{
$len = strlen($this->_string);
$byte_counter = 0;
@@ -258,9 +252,9 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect
if ($this->_compile_trigram) {
if (!($b == ' ' && ($a == ' ' || $char == ' '))) {
if (!isset($this->_trigram[$a . $b . $char])) {
- $this->_trigram[$a . $b . $char] = 1;
+ $this->_trigram[$a . $b . $char] = 1;
} else {
- $this->_trigram[$a . $b . $char]++;
+ $this->_trigram[$a . $b . $char]++;
}
}
@@ -271,10 +265,11 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect
// unicode block detection
if ($this->_compile_unicode) {
if ($this->_unicode_skip_symbols
- && strlen($char) == 1
- && ($char < 'A' || $char > 'z'
- || ($char > 'Z' && $char < 'a'))
- && $char != "'") { // does not skip the apostrophe
+ && strlen($char) == 1
+ && ($char < 'A' || $char > 'z'
+ || ($char > 'Z' && $char < 'a'))
+ && $char != "'"
+ ) { // does not skip the apostrophe
// since it's included in the language
// models
@@ -297,7 +292,8 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect
if ($this->_compile_unicode) {
foreach ($unicode_chars as $utf8_char => $count) {
$search_result = $this->_unicode_block_name(
- $this->_utf8char2unicode($utf8_char), $blocks, $block_count);
+ $this->_utf8char2unicode($utf8_char), $blocks, $block_count
+ );
if ($search_result != -1) {
$block_name = $search_result[2];
@@ -342,6 +338,4 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect
}
}
}
-}
-
-/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */
\ No newline at end of file
+}
\ No newline at end of file
diff --git a/libraries/language-detect/unicode_blocks.dat b/libraries/language-detect/unicode_blocks.dat
index 3b24cd2..1f66cac 100644
--- a/libraries/language-detect/unicode_blocks.dat
+++ b/libraries/language-detect/unicode_blocks.dat
@@ -1 +1 @@
-a:145:{i:0;a:3:{i:0;s:6:"0x0000";i:1;s:6:"0x007F";i:2;s:11:"Basic Latin";}i:1;a:3:{i:0;s:6:"0x0080";i:1;s:6:"0x00FF";i:2;s:18:"Latin-1 Supplement";}i:2;a:3:{i:0;s:6:"0x0100";i:1;s:6:"0x017F";i:2;s:16:"Latin Extended-A";}i:3;a:3:{i:0;s:6:"0x0180";i:1;s:6:"0x024F";i:2;s:16:"Latin Extended-B";}i:4;a:3:{i:0;s:6:"0x0250";i:1;s:6:"0x02AF";i:2;s:14:"IPA Extensions";}i:5;a:3:{i:0;s:6:"0x02B0";i:1;s:6:"0x02FF";i:2;s:24:"Spacing Modifier Letters";}i:6;a:3:{i:0;s:6:"0x0300";i:1;s:6:"0x036F";i:2;s:27:"Combining Diacritical Marks";}i:7;a:3:{i:0;s:6:"0x0370";i:1;s:6:"0x03FF";i:2;s:16:"Greek and Coptic";}i:8;a:3:{i:0;s:6:"0x0400";i:1;s:6:"0x04FF";i:2;s:8:"Cyrillic";}i:9;a:3:{i:0;s:6:"0x0500";i:1;s:6:"0x052F";i:2;s:19:"Cyrillic Supplement";}i:10;a:3:{i:0;s:6:"0x0530";i:1;s:6:"0x058F";i:2;s:8:"Armenian";}i:11;a:3:{i:0;s:6:"0x0590";i:1;s:6:"0x05FF";i:2;s:6:"Hebrew";}i:12;a:3:{i:0;s:6:"0x0600";i:1;s:6:"0x06FF";i:2;s:6:"Arabic";}i:13;a:3:{i:0;s:6:"0x0700";i:1;s:6:"0x074F";i:2;s:6:"Syriac";}i:14;a:3:{i:0;s:6:"0x0750";i:1;s:6:"0x077F";i:2;s:17:"Arabic 
Supplement";}i:15;a:3:{i:0;s:6:"0x0780";i:1;s:6:"0x07BF";i:2;s:6:"Thaana";}i:16;a:3:{i:0;s:6:"0x0900";i:1;s:6:"0x097F";i:2;s:10:"Devanagari";}i:17;a:3:{i:0;s:6:"0x0980";i:1;s:6:"0x09FF";i:2;s:7:"Bengali";}i:18;a:3:{i:0;s:6:"0x0A00";i:1;s:6:"0x0A7F";i:2;s:8:"Gurmukhi";}i:19;a:3:{i:0;s:6:"0x0A80";i:1;s:6:"0x0AFF";i:2;s:8:"Gujarati";}i:20;a:3:{i:0;s:6:"0x0B00";i:1;s:6:"0x0B7F";i:2;s:5:"Oriya";}i:21;a:3:{i:0;s:6:"0x0B80";i:1;s:6:"0x0BFF";i:2;s:5:"Tamil";}i:22;a:3:{i:0;s:6:"0x0C00";i:1;s:6:"0x0C7F";i:2;s:6:"Telugu";}i:23;a:3:{i:0;s:6:"0x0C80";i:1;s:6:"0x0CFF";i:2;s:7:"Kannada";}i:24;a:3:{i:0;s:6:"0x0D00";i:1;s:6:"0x0D7F";i:2;s:9:"Malayalam";}i:25;a:3:{i:0;s:6:"0x0D80";i:1;s:6:"0x0DFF";i:2;s:7:"Sinhala";}i:26;a:3:{i:0;s:6:"0x0E00";i:1;s:6:"0x0E7F";i:2;s:4:"Thai";}i:27;a:3:{i:0;s:6:"0x0E80";i:1;s:6:"0x0EFF";i:2;s:3:"Lao";}i:28;a:3:{i:0;s:6:"0x0F00";i:1;s:6:"0x0FFF";i:2;s:7:"Tibetan";}i:29;a:3:{i:0;s:6:"0x1000";i:1;s:6:"0x109F";i:2;s:7:"Myanmar";}i:30;a:3:{i:0;s:6:"0x10A0";i:1;s:6:"0x10FF";i:2;s:8:"Georgian";}i:31;a:3:{i:0;s:6:"0x1100";i:1;s:6:"0x11FF";i:2;s:11:"Hangul Jamo";}i:32;a:3:{i:0;s:6:"0x1200";i:1;s:6:"0x137F";i:2;s:8:"Ethiopic";}i:33;a:3:{i:0;s:6:"0x1380";i:1;s:6:"0x139F";i:2;s:19:"Ethiopic Supplement";}i:34;a:3:{i:0;s:6:"0x13A0";i:1;s:6:"0x13FF";i:2;s:8:"Cherokee";}i:35;a:3:{i:0;s:6:"0x1400";i:1;s:6:"0x167F";i:2;s:37:"Unified Canadian Aboriginal Syllabics";}i:36;a:3:{i:0;s:6:"0x1680";i:1;s:6:"0x169F";i:2;s:5:"Ogham";}i:37;a:3:{i:0;s:6:"0x16A0";i:1;s:6:"0x16FF";i:2;s:5:"Runic";}i:38;a:3:{i:0;s:6:"0x1700";i:1;s:6:"0x171F";i:2;s:7:"Tagalog";}i:39;a:3:{i:0;s:6:"0x1720";i:1;s:6:"0x173F";i:2;s:7:"Hanunoo";}i:40;a:3:{i:0;s:6:"0x1740";i:1;s:6:"0x175F";i:2;s:5:"Buhid";}i:41;a:3:{i:0;s:6:"0x1760";i:1;s:6:"0x177F";i:2;s:8:"Tagbanwa";}i:42;a:3:{i:0;s:6:"0x1780";i:1;s:6:"0x17FF";i:2;s:5:"Khmer";}i:43;a:3:{i:0;s:6:"0x1800";i:1;s:6:"0x18AF";i:2;s:9:"Mongolian";}i:44;a:3:{i:0;s:6:"0x1900";i:1;s:6:"0x194F";i:2;s:5:"Limbu";}i:45;a:3:{i:0;s:6:"0x1950";i:1;s:6:"0x197F";i:2;s:6:"Tai 
Le";}i:46;a:3:{i:0;s:6:"0x1980";i:1;s:6:"0x19DF";i:2;s:11:"New Tai Lue";}i:47;a:3:{i:0;s:6:"0x19E0";i:1;s:6:"0x19FF";i:2;s:13:"Khmer Symbols";}i:48;a:3:{i:0;s:6:"0x1A00";i:1;s:6:"0x1A1F";i:2;s:8:"Buginese";}i:49;a:3:{i:0;s:6:"0x1D00";i:1;s:6:"0x1D7F";i:2;s:19:"Phonetic Extensions";}i:50;a:3:{i:0;s:6:"0x1D80";i:1;s:6:"0x1DBF";i:2;s:30:"Phonetic Extensions Supplement";}i:51;a:3:{i:0;s:6:"0x1DC0";i:1;s:6:"0x1DFF";i:2;s:38:"Combining Diacritical Marks Supplement";}i:52;a:3:{i:0;s:6:"0x1E00";i:1;s:6:"0x1EFF";i:2;s:25:"Latin Extended Additional";}i:53;a:3:{i:0;s:6:"0x1F00";i:1;s:6:"0x1FFF";i:2;s:14:"Greek Extended";}i:54;a:3:{i:0;s:6:"0x2000";i:1;s:6:"0x206F";i:2;s:19:"General Punctuation";}i:55;a:3:{i:0;s:6:"0x2070";i:1;s:6:"0x209F";i:2;s:27:"Superscripts and Subscripts";}i:56;a:3:{i:0;s:6:"0x20A0";i:1;s:6:"0x20CF";i:2;s:16:"Currency Symbols";}i:57;a:3:{i:0;s:6:"0x20D0";i:1;s:6:"0x20FF";i:2;s:39:"Combining Diacritical Marks for Symbols";}i:58;a:3:{i:0;s:6:"0x2100";i:1;s:6:"0x214F";i:2;s:18:"Letterlike Symbols";}i:59;a:3:{i:0;s:6:"0x2150";i:1;s:6:"0x218F";i:2;s:12:"Number Forms";}i:60;a:3:{i:0;s:6:"0x2190";i:1;s:6:"0x21FF";i:2;s:6:"Arrows";}i:61;a:3:{i:0;s:6:"0x2200";i:1;s:6:"0x22FF";i:2;s:22:"Mathematical Operators";}i:62;a:3:{i:0;s:6:"0x2300";i:1;s:6:"0x23FF";i:2;s:23:"Miscellaneous Technical";}i:63;a:3:{i:0;s:6:"0x2400";i:1;s:6:"0x243F";i:2;s:16:"Control Pictures";}i:64;a:3:{i:0;s:6:"0x2440";i:1;s:6:"0x245F";i:2;s:29:"Optical Character Recognition";}i:65;a:3:{i:0;s:6:"0x2460";i:1;s:6:"0x24FF";i:2;s:22:"Enclosed Alphanumerics";}i:66;a:3:{i:0;s:6:"0x2500";i:1;s:6:"0x257F";i:2;s:11:"Box Drawing";}i:67;a:3:{i:0;s:6:"0x2580";i:1;s:6:"0x259F";i:2;s:14:"Block Elements";}i:68;a:3:{i:0;s:6:"0x25A0";i:1;s:6:"0x25FF";i:2;s:16:"Geometric Shapes";}i:69;a:3:{i:0;s:6:"0x2600";i:1;s:6:"0x26FF";i:2;s:21:"Miscellaneous Symbols";}i:70;a:3:{i:0;s:6:"0x2700";i:1;s:6:"0x27BF";i:2;s:8:"Dingbats";}i:71;a:3:{i:0;s:6:"0x27C0";i:1;s:6:"0x27EF";i:2;s:36:"Miscellaneous Mathematical 
Symbols-A";}i:72;a:3:{i:0;s:6:"0x27F0";i:1;s:6:"0x27FF";i:2;s:21:"Supplemental Arrows-A";}i:73;a:3:{i:0;s:6:"0x2800";i:1;s:6:"0x28FF";i:2;s:16:"Braille Patterns";}i:74;a:3:{i:0;s:6:"0x2900";i:1;s:6:"0x297F";i:2;s:21:"Supplemental Arrows-B";}i:75;a:3:{i:0;s:6:"0x2980";i:1;s:6:"0x29FF";i:2;s:36:"Miscellaneous Mathematical Symbols-B";}i:76;a:3:{i:0;s:6:"0x2A00";i:1;s:6:"0x2AFF";i:2;s:35:"Supplemental Mathematical Operators";}i:77;a:3:{i:0;s:6:"0x2B00";i:1;s:6:"0x2BFF";i:2;s:32:"Miscellaneous Symbols and Arrows";}i:78;a:3:{i:0;s:6:"0x2C00";i:1;s:6:"0x2C5F";i:2;s:10:"Glagolitic";}i:79;a:3:{i:0;s:6:"0x2C80";i:1;s:6:"0x2CFF";i:2;s:6:"Coptic";}i:80;a:3:{i:0;s:6:"0x2D00";i:1;s:6:"0x2D2F";i:2;s:19:"Georgian Supplement";}i:81;a:3:{i:0;s:6:"0x2D30";i:1;s:6:"0x2D7F";i:2;s:8:"Tifinagh";}i:82;a:3:{i:0;s:6:"0x2D80";i:1;s:6:"0x2DDF";i:2;s:17:"Ethiopic Extended";}i:83;a:3:{i:0;s:6:"0x2E00";i:1;s:6:"0x2E7F";i:2;s:24:"Supplemental Punctuation";}i:84;a:3:{i:0;s:6:"0x2E80";i:1;s:6:"0x2EFF";i:2;s:23:"CJK Radicals Supplement";}i:85;a:3:{i:0;s:6:"0x2F00";i:1;s:6:"0x2FDF";i:2;s:15:"Kangxi Radicals";}i:86;a:3:{i:0;s:6:"0x2FF0";i:1;s:6:"0x2FFF";i:2;s:34:"Ideographic Description Characters";}i:87;a:3:{i:0;s:6:"0x3000";i:1;s:6:"0x303F";i:2;s:27:"CJK Symbols and Punctuation";}i:88;a:3:{i:0;s:6:"0x3040";i:1;s:6:"0x309F";i:2;s:8:"Hiragana";}i:89;a:3:{i:0;s:6:"0x30A0";i:1;s:6:"0x30FF";i:2;s:8:"Katakana";}i:90;a:3:{i:0;s:6:"0x3100";i:1;s:6:"0x312F";i:2;s:8:"Bopomofo";}i:91;a:3:{i:0;s:6:"0x3130";i:1;s:6:"0x318F";i:2;s:25:"Hangul Compatibility Jamo";}i:92;a:3:{i:0;s:6:"0x3190";i:1;s:6:"0x319F";i:2;s:6:"Kanbun";}i:93;a:3:{i:0;s:6:"0x31A0";i:1;s:6:"0x31BF";i:2;s:17:"Bopomofo Extended";}i:94;a:3:{i:0;s:6:"0x31C0";i:1;s:6:"0x31EF";i:2;s:11:"CJK Strokes";}i:95;a:3:{i:0;s:6:"0x31F0";i:1;s:6:"0x31FF";i:2;s:28:"Katakana Phonetic Extensions";}i:96;a:3:{i:0;s:6:"0x3200";i:1;s:6:"0x32FF";i:2;s:31:"Enclosed CJK Letters and Months";}i:97;a:3:{i:0;s:6:"0x3300";i:1;s:6:"0x33FF";i:2;s:17:"CJK 
Compatibility";}i:98;a:3:{i:0;s:6:"0x3400";i:1;s:6:"0x4DBF";i:2;s:34:"CJK Unified Ideographs Extension A";}i:99;a:3:{i:0;s:6:"0x4DC0";i:1;s:6:"0x4DFF";i:2;s:23:"Yijing Hexagram Symbols";}i:100;a:3:{i:0;s:6:"0x4E00";i:1;s:6:"0x9FFF";i:2;s:22:"CJK Unified Ideographs";}i:101;a:3:{i:0;s:6:"0xA000";i:1;s:6:"0xA48F";i:2;s:12:"Yi Syllables";}i:102;a:3:{i:0;s:6:"0xA490";i:1;s:6:"0xA4CF";i:2;s:11:"Yi Radicals";}i:103;a:3:{i:0;s:6:"0xA700";i:1;s:6:"0xA71F";i:2;s:21:"Modifier Tone Letters";}i:104;a:3:{i:0;s:6:"0xA800";i:1;s:6:"0xA82F";i:2;s:12:"Syloti Nagri";}i:105;a:3:{i:0;s:6:"0xAC00";i:1;s:6:"0xD7AF";i:2;s:16:"Hangul Syllables";}i:106;a:3:{i:0;s:6:"0xD800";i:1;s:6:"0xDB7F";i:2;s:15:"High Surrogates";}i:107;a:3:{i:0;s:6:"0xDB80";i:1;s:6:"0xDBFF";i:2;s:27:"High Private Use Surrogates";}i:108;a:3:{i:0;s:6:"0xDC00";i:1;s:6:"0xDFFF";i:2;s:14:"Low Surrogates";}i:109;a:3:{i:0;s:6:"0xE000";i:1;s:6:"0xF8FF";i:2;s:16:"Private Use Area";}i:110;a:3:{i:0;s:6:"0xF900";i:1;s:6:"0xFAFF";i:2;s:28:"CJK Compatibility Ideographs";}i:111;a:3:{i:0;s:6:"0xFB00";i:1;s:6:"0xFB4F";i:2;s:29:"Alphabetic Presentation Forms";}i:112;a:3:{i:0;s:6:"0xFB50";i:1;s:6:"0xFDFF";i:2;s:27:"Arabic Presentation Forms-A";}i:113;a:3:{i:0;s:6:"0xFE00";i:1;s:6:"0xFE0F";i:2;s:19:"Variation Selectors";}i:114;a:3:{i:0;s:6:"0xFE10";i:1;s:6:"0xFE1F";i:2;s:14:"Vertical Forms";}i:115;a:3:{i:0;s:6:"0xFE20";i:1;s:6:"0xFE2F";i:2;s:20:"Combining Half Marks";}i:116;a:3:{i:0;s:6:"0xFE30";i:1;s:6:"0xFE4F";i:2;s:23:"CJK Compatibility Forms";}i:117;a:3:{i:0;s:6:"0xFE50";i:1;s:6:"0xFE6F";i:2;s:19:"Small Form Variants";}i:118;a:3:{i:0;s:6:"0xFE70";i:1;s:6:"0xFEFF";i:2;s:27:"Arabic Presentation Forms-B";}i:119;a:3:{i:0;s:6:"0xFF00";i:1;s:6:"0xFFEF";i:2;s:29:"Halfwidth and Fullwidth Forms";}i:120;a:3:{i:0;s:6:"0xFFF0";i:1;s:6:"0xFFFF";i:2;s:8:"Specials";}i:121;a:3:{i:0;s:7:"0x10000";i:1;s:7:"0x1007F";i:2;s:18:"Linear B Syllabary";}i:122;a:3:{i:0;s:7:"0x10080";i:1;s:7:"0x100FF";i:2;s:18:"Linear B 
Ideograms";}i:123;a:3:{i:0;s:7:"0x10100";i:1;s:7:"0x1013F";i:2;s:14:"Aegean Numbers";}i:124;a:3:{i:0;s:7:"0x10140";i:1;s:7:"0x1018F";i:2;s:21:"Ancient Greek Numbers";}i:125;a:3:{i:0;s:7:"0x10300";i:1;s:7:"0x1032F";i:2;s:10:"Old Italic";}i:126;a:3:{i:0;s:7:"0x10330";i:1;s:7:"0x1034F";i:2;s:6:"Gothic";}i:127;a:3:{i:0;s:7:"0x10380";i:1;s:7:"0x1039F";i:2;s:8:"Ugaritic";}i:128;a:3:{i:0;s:7:"0x103A0";i:1;s:7:"0x103DF";i:2;s:11:"Old Persian";}i:129;a:3:{i:0;s:7:"0x10400";i:1;s:7:"0x1044F";i:2;s:7:"Deseret";}i:130;a:3:{i:0;s:7:"0x10450";i:1;s:7:"0x1047F";i:2;s:7:"Shavian";}i:131;a:3:{i:0;s:7:"0x10480";i:1;s:7:"0x104AF";i:2;s:7:"Osmanya";}i:132;a:3:{i:0;s:7:"0x10800";i:1;s:7:"0x1083F";i:2;s:17:"Cypriot Syllabary";}i:133;a:3:{i:0;s:7:"0x10A00";i:1;s:7:"0x10A5F";i:2;s:10:"Kharoshthi";}i:134;a:3:{i:0;s:7:"0x1D000";i:1;s:7:"0x1D0FF";i:2;s:25:"Byzantine Musical Symbols";}i:135;a:3:{i:0;s:7:"0x1D100";i:1;s:7:"0x1D1FF";i:2;s:15:"Musical Symbols";}i:136;a:3:{i:0;s:7:"0x1D200";i:1;s:7:"0x1D24F";i:2;s:30:"Ancient Greek Musical Notation";}i:137;a:3:{i:0;s:7:"0x1D300";i:1;s:7:"0x1D35F";i:2;s:21:"Tai Xuan Jing Symbols";}i:138;a:3:{i:0;s:7:"0x1D400";i:1;s:7:"0x1D7FF";i:2;s:33:"Mathematical Alphanumeric Symbols";}i:139;a:3:{i:0;s:7:"0x20000";i:1;s:7:"0x2A6DF";i:2;s:34:"CJK Unified Ideographs Extension B";}i:140;a:3:{i:0;s:7:"0x2F800";i:1;s:7:"0x2FA1F";i:2;s:39:"CJK Compatibility Ideographs Supplement";}i:141;a:3:{i:0;s:7:"0xE0000";i:1;s:7:"0xE007F";i:2;s:4:"Tags";}i:142;a:3:{i:0;s:7:"0xE0100";i:1;s:7:"0xE01EF";i:2;s:30:"Variation Selectors Supplement";}i:143;a:3:{i:0;s:7:"0xF0000";i:1;s:7:"0xFFFFF";i:2;s:32:"Supplementary Private Use Area-A";}i:144;a:3:{i:0;s:8:"0x100000";i:1;s:8:"0x10FFFF";i:2;s:32:"Supplementary Private Use Area-B";}}
\ No newline at end of file
+a:145:{i:0;a:3:{i:0;i:0;i:1;i:127;i:2;s:11:"Basic Latin";}i:1;a:3:{i:0;i:128;i:1;i:255;i:2;s:18:"Latin-1 Supplement";}i:2;a:3:{i:0;i:256;i:1;i:383;i:2;s:16:"Latin Extended-A";}i:3;a:3:{i:0;i:384;i:1;i:591;i:2;s:16:"Latin Extended-B";}i:4;a:3:{i:0;i:592;i:1;i:687;i:2;s:14:"IPA Extensions";}i:5;a:3:{i:0;i:688;i:1;i:767;i:2;s:24:"Spacing Modifier Letters";}i:6;a:3:{i:0;i:768;i:1;i:879;i:2;s:27:"Combining Diacritical Marks";}i:7;a:3:{i:0;i:880;i:1;i:1023;i:2;s:16:"Greek and Coptic";}i:8;a:3:{i:0;i:1024;i:1;i:1279;i:2;s:8:"Cyrillic";}i:9;a:3:{i:0;i:1280;i:1;i:1327;i:2;s:19:"Cyrillic Supplement";}i:10;a:3:{i:0;i:1328;i:1;i:1423;i:2;s:8:"Armenian";}i:11;a:3:{i:0;i:1424;i:1;i:1535;i:2;s:6:"Hebrew";}i:12;a:3:{i:0;i:1536;i:1;i:1791;i:2;s:6:"Arabic";}i:13;a:3:{i:0;i:1792;i:1;i:1871;i:2;s:6:"Syriac";}i:14;a:3:{i:0;i:1872;i:1;i:1919;i:2;s:17:"Arabic Supplement";}i:15;a:3:{i:0;i:1920;i:1;i:1983;i:2;s:6:"Thaana";}i:16;a:3:{i:0;i:2304;i:1;i:2431;i:2;s:10:"Devanagari";}i:17;a:3:{i:0;i:2432;i:1;i:2559;i:2;s:7:"Bengali";}i:18;a:3:{i:0;i:2560;i:1;i:2687;i:2;s:8:"Gurmukhi";}i:19;a:3:{i:0;i:2688;i:1;i:2815;i:2;s:8:"Gujarati";}i:20;a:3:{i:0;i:2816;i:1;i:2943;i:2;s:5:"Oriya";}i:21;a:3:{i:0;i:2944;i:1;i:3071;i:2;s:5:"Tamil";}i:22;a:3:{i:0;i:3072;i:1;i:3199;i:2;s:6:"Telugu";}i:23;a:3:{i:0;i:3200;i:1;i:3327;i:2;s:7:"Kannada";}i:24;a:3:{i:0;i:3328;i:1;i:3455;i:2;s:9:"Malayalam";}i:25;a:3:{i:0;i:3456;i:1;i:3583;i:2;s:7:"Sinhala";}i:26;a:3:{i:0;i:3584;i:1;i:3711;i:2;s:4:"Thai";}i:27;a:3:{i:0;i:3712;i:1;i:3839;i:2;s:3:"Lao";}i:28;a:3:{i:0;i:3840;i:1;i:4095;i:2;s:7:"Tibetan";}i:29;a:3:{i:0;i:4096;i:1;i:4255;i:2;s:7:"Myanmar";}i:30;a:3:{i:0;i:4256;i:1;i:4351;i:2;s:8:"Georgian";}i:31;a:3:{i:0;i:4352;i:1;i:4607;i:2;s:11:"Hangul Jamo";}i:32;a:3:{i:0;i:4608;i:1;i:4991;i:2;s:8:"Ethiopic";}i:33;a:3:{i:0;i:4992;i:1;i:5023;i:2;s:19:"Ethiopic Supplement";}i:34;a:3:{i:0;i:5024;i:1;i:5119;i:2;s:8:"Cherokee";}i:35;a:3:{i:0;i:5120;i:1;i:5759;i:2;s:37:"Unified Canadian Aboriginal 
Syllabics";}i:36;a:3:{i:0;i:5760;i:1;i:5791;i:2;s:5:"Ogham";}i:37;a:3:{i:0;i:5792;i:1;i:5887;i:2;s:5:"Runic";}i:38;a:3:{i:0;i:5888;i:1;i:5919;i:2;s:7:"Tagalog";}i:39;a:3:{i:0;i:5920;i:1;i:5951;i:2;s:7:"Hanunoo";}i:40;a:3:{i:0;i:5952;i:1;i:5983;i:2;s:5:"Buhid";}i:41;a:3:{i:0;i:5984;i:1;i:6015;i:2;s:8:"Tagbanwa";}i:42;a:3:{i:0;i:6016;i:1;i:6143;i:2;s:5:"Khmer";}i:43;a:3:{i:0;i:6144;i:1;i:6319;i:2;s:9:"Mongolian";}i:44;a:3:{i:0;i:6400;i:1;i:6479;i:2;s:5:"Limbu";}i:45;a:3:{i:0;i:6480;i:1;i:6527;i:2;s:6:"Tai Le";}i:46;a:3:{i:0;i:6528;i:1;i:6623;i:2;s:11:"New Tai Lue";}i:47;a:3:{i:0;i:6624;i:1;i:6655;i:2;s:13:"Khmer Symbols";}i:48;a:3:{i:0;i:6656;i:1;i:6687;i:2;s:8:"Buginese";}i:49;a:3:{i:0;i:7424;i:1;i:7551;i:2;s:19:"Phonetic Extensions";}i:50;a:3:{i:0;i:7552;i:1;i:7615;i:2;s:30:"Phonetic Extensions Supplement";}i:51;a:3:{i:0;i:7616;i:1;i:7679;i:2;s:38:"Combining Diacritical Marks Supplement";}i:52;a:3:{i:0;i:7680;i:1;i:7935;i:2;s:25:"Latin Extended Additional";}i:53;a:3:{i:0;i:7936;i:1;i:8191;i:2;s:14:"Greek Extended";}i:54;a:3:{i:0;i:8192;i:1;i:8303;i:2;s:19:"General Punctuation";}i:55;a:3:{i:0;i:8304;i:1;i:8351;i:2;s:27:"Superscripts and Subscripts";}i:56;a:3:{i:0;i:8352;i:1;i:8399;i:2;s:16:"Currency Symbols";}i:57;a:3:{i:0;i:8400;i:1;i:8447;i:2;s:39:"Combining Diacritical Marks for Symbols";}i:58;a:3:{i:0;i:8448;i:1;i:8527;i:2;s:18:"Letterlike Symbols";}i:59;a:3:{i:0;i:8528;i:1;i:8591;i:2;s:12:"Number Forms";}i:60;a:3:{i:0;i:8592;i:1;i:8703;i:2;s:6:"Arrows";}i:61;a:3:{i:0;i:8704;i:1;i:8959;i:2;s:22:"Mathematical Operators";}i:62;a:3:{i:0;i:8960;i:1;i:9215;i:2;s:23:"Miscellaneous Technical";}i:63;a:3:{i:0;i:9216;i:1;i:9279;i:2;s:16:"Control Pictures";}i:64;a:3:{i:0;i:9280;i:1;i:9311;i:2;s:29:"Optical Character Recognition";}i:65;a:3:{i:0;i:9312;i:1;i:9471;i:2;s:22:"Enclosed Alphanumerics";}i:66;a:3:{i:0;i:9472;i:1;i:9599;i:2;s:11:"Box Drawing";}i:67;a:3:{i:0;i:9600;i:1;i:9631;i:2;s:14:"Block Elements";}i:68;a:3:{i:0;i:9632;i:1;i:9727;i:2;s:16:"Geometric 
Shapes";}i:69;a:3:{i:0;i:9728;i:1;i:9983;i:2;s:21:"Miscellaneous Symbols";}i:70;a:3:{i:0;i:9984;i:1;i:10175;i:2;s:8:"Dingbats";}i:71;a:3:{i:0;i:10176;i:1;i:10223;i:2;s:36:"Miscellaneous Mathematical Symbols-A";}i:72;a:3:{i:0;i:10224;i:1;i:10239;i:2;s:21:"Supplemental Arrows-A";}i:73;a:3:{i:0;i:10240;i:1;i:10495;i:2;s:16:"Braille Patterns";}i:74;a:3:{i:0;i:10496;i:1;i:10623;i:2;s:21:"Supplemental Arrows-B";}i:75;a:3:{i:0;i:10624;i:1;i:10751;i:2;s:36:"Miscellaneous Mathematical Symbols-B";}i:76;a:3:{i:0;i:10752;i:1;i:11007;i:2;s:35:"Supplemental Mathematical Operators";}i:77;a:3:{i:0;i:11008;i:1;i:11263;i:2;s:32:"Miscellaneous Symbols and Arrows";}i:78;a:3:{i:0;i:11264;i:1;i:11359;i:2;s:10:"Glagolitic";}i:79;a:3:{i:0;i:11392;i:1;i:11519;i:2;s:6:"Coptic";}i:80;a:3:{i:0;i:11520;i:1;i:11567;i:2;s:19:"Georgian Supplement";}i:81;a:3:{i:0;i:11568;i:1;i:11647;i:2;s:8:"Tifinagh";}i:82;a:3:{i:0;i:11648;i:1;i:11743;i:2;s:17:"Ethiopic Extended";}i:83;a:3:{i:0;i:11776;i:1;i:11903;i:2;s:24:"Supplemental Punctuation";}i:84;a:3:{i:0;i:11904;i:1;i:12031;i:2;s:23:"CJK Radicals Supplement";}i:85;a:3:{i:0;i:12032;i:1;i:12255;i:2;s:15:"Kangxi Radicals";}i:86;a:3:{i:0;i:12272;i:1;i:12287;i:2;s:34:"Ideographic Description Characters";}i:87;a:3:{i:0;i:12288;i:1;i:12351;i:2;s:27:"CJK Symbols and Punctuation";}i:88;a:3:{i:0;i:12352;i:1;i:12447;i:2;s:8:"Hiragana";}i:89;a:3:{i:0;i:12448;i:1;i:12543;i:2;s:8:"Katakana";}i:90;a:3:{i:0;i:12544;i:1;i:12591;i:2;s:8:"Bopomofo";}i:91;a:3:{i:0;i:12592;i:1;i:12687;i:2;s:25:"Hangul Compatibility Jamo";}i:92;a:3:{i:0;i:12688;i:1;i:12703;i:2;s:6:"Kanbun";}i:93;a:3:{i:0;i:12704;i:1;i:12735;i:2;s:17:"Bopomofo Extended";}i:94;a:3:{i:0;i:12736;i:1;i:12783;i:2;s:11:"CJK Strokes";}i:95;a:3:{i:0;i:12784;i:1;i:12799;i:2;s:28:"Katakana Phonetic Extensions";}i:96;a:3:{i:0;i:12800;i:1;i:13055;i:2;s:31:"Enclosed CJK Letters and Months";}i:97;a:3:{i:0;i:13056;i:1;i:13311;i:2;s:17:"CJK Compatibility";}i:98;a:3:{i:0;i:13312;i:1;i:19903;i:2;s:34:"CJK Unified Ideographs 
Extension A";}i:99;a:3:{i:0;i:19904;i:1;i:19967;i:2;s:23:"Yijing Hexagram Symbols";}i:100;a:3:{i:0;i:19968;i:1;i:40959;i:2;s:22:"CJK Unified Ideographs";}i:101;a:3:{i:0;i:40960;i:1;i:42127;i:2;s:12:"Yi Syllables";}i:102;a:3:{i:0;i:42128;i:1;i:42191;i:2;s:11:"Yi Radicals";}i:103;a:3:{i:0;i:42752;i:1;i:42783;i:2;s:21:"Modifier Tone Letters";}i:104;a:3:{i:0;i:43008;i:1;i:43055;i:2;s:12:"Syloti Nagri";}i:105;a:3:{i:0;i:44032;i:1;i:55215;i:2;s:16:"Hangul Syllables";}i:106;a:3:{i:0;i:55296;i:1;i:56191;i:2;s:15:"High Surrogates";}i:107;a:3:{i:0;i:56192;i:1;i:56319;i:2;s:27:"High Private Use Surrogates";}i:108;a:3:{i:0;i:56320;i:1;i:57343;i:2;s:14:"Low Surrogates";}i:109;a:3:{i:0;i:57344;i:1;i:63743;i:2;s:16:"Private Use Area";}i:110;a:3:{i:0;i:63744;i:1;i:64255;i:2;s:28:"CJK Compatibility Ideographs";}i:111;a:3:{i:0;i:64256;i:1;i:64335;i:2;s:29:"Alphabetic Presentation Forms";}i:112;a:3:{i:0;i:64336;i:1;i:65023;i:2;s:27:"Arabic Presentation Forms-A";}i:113;a:3:{i:0;i:65024;i:1;i:65039;i:2;s:19:"Variation Selectors";}i:114;a:3:{i:0;i:65040;i:1;i:65055;i:2;s:14:"Vertical Forms";}i:115;a:3:{i:0;i:65056;i:1;i:65071;i:2;s:20:"Combining Half Marks";}i:116;a:3:{i:0;i:65072;i:1;i:65103;i:2;s:23:"CJK Compatibility Forms";}i:117;a:3:{i:0;i:65104;i:1;i:65135;i:2;s:19:"Small Form Variants";}i:118;a:3:{i:0;i:65136;i:1;i:65279;i:2;s:27:"Arabic Presentation Forms-B";}i:119;a:3:{i:0;i:65280;i:1;i:65519;i:2;s:29:"Halfwidth and Fullwidth Forms";}i:120;a:3:{i:0;i:65520;i:1;i:65535;i:2;s:8:"Specials";}i:121;a:3:{i:0;i:65536;i:1;i:65663;i:2;s:18:"Linear B Syllabary";}i:122;a:3:{i:0;i:65664;i:1;i:65791;i:2;s:18:"Linear B Ideograms";}i:123;a:3:{i:0;i:65792;i:1;i:65855;i:2;s:14:"Aegean Numbers";}i:124;a:3:{i:0;i:65856;i:1;i:65935;i:2;s:21:"Ancient Greek Numbers";}i:125;a:3:{i:0;i:66304;i:1;i:66351;i:2;s:10:"Old Italic";}i:126;a:3:{i:0;i:66352;i:1;i:66383;i:2;s:6:"Gothic";}i:127;a:3:{i:0;i:66432;i:1;i:66463;i:2;s:8:"Ugaritic";}i:128;a:3:{i:0;i:66464;i:1;i:66527;i:2;s:11:"Old 
Persian";}i:129;a:3:{i:0;i:66560;i:1;i:66639;i:2;s:7:"Deseret";}i:130;a:3:{i:0;i:66640;i:1;i:66687;i:2;s:7:"Shavian";}i:131;a:3:{i:0;i:66688;i:1;i:66735;i:2;s:7:"Osmanya";}i:132;a:3:{i:0;i:67584;i:1;i:67647;i:2;s:17:"Cypriot Syllabary";}i:133;a:3:{i:0;i:68096;i:1;i:68191;i:2;s:10:"Kharoshthi";}i:134;a:3:{i:0;i:118784;i:1;i:119039;i:2;s:25:"Byzantine Musical Symbols";}i:135;a:3:{i:0;i:119040;i:1;i:119295;i:2;s:15:"Musical Symbols";}i:136;a:3:{i:0;i:119296;i:1;i:119375;i:2;s:30:"Ancient Greek Musical Notation";}i:137;a:3:{i:0;i:119552;i:1;i:119647;i:2;s:21:"Tai Xuan Jing Symbols";}i:138;a:3:{i:0;i:119808;i:1;i:120831;i:2;s:33:"Mathematical Alphanumeric Symbols";}i:139;a:3:{i:0;i:131072;i:1;i:173791;i:2;s:34:"CJK Unified Ideographs Extension B";}i:140;a:3:{i:0;i:194560;i:1;i:195103;i:2;s:39:"CJK Compatibility Ideographs Supplement";}i:141;a:3:{i:0;i:917504;i:1;i:917631;i:2;s:4:"Tags";}i:142;a:3:{i:0;i:917760;i:1;i:917999;i:2;s:30:"Variation Selectors Supplement";}i:143;a:3:{i:0;i:983040;i:1;i:1048575;i:2;s:32:"Supplementary Private Use Area-A";}i:144;a:3:{i:0;i:1048576;i:1;i:1114111;i:2;s:32:"Supplementary Private Use Area-B";}}
\ No newline at end of file
diff --git a/libraries/readability/Readability.php b/libraries/readability/Readability.php
index 8a3fb73..9ff38f4 100644
--- a/libraries/readability/Readability.php
+++ b/libraries/readability/Readability.php
@@ -122,6 +122,7 @@ class Readability
if ($parser=='gumbo') {
// Can we avoid this encoding/deocding step? Test on:
// http://www.medialens.org/index.php/alerts/alert-archive/2017/837-undermining-democracy-corporate-media-bias-on-jeremy-corbyn-boris-johnson-and-syria.html
+ $html = str_replace('&apos;', "'", $html); // other named entities handled okay
$html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
$html = mb_convert_encoding($html, "UTF-8", 'HTML-ENTITIES');
$this->dom = @Layershifter\Gumbo\Parser::load($html);
diff --git a/makefulltextfeed.php b/makefulltextfeed.php
index ebaa01d..7592bf3 100644
--- a/makefulltextfeed.php
+++ b/makefulltextfeed.php
@@ -3,8 +3,8 @@
// Author: Keyvan Minoukadeh
// Copyright (c) 2017 Keyvan Minoukadeh
// License: AGPLv3
-// Version: 3.7
-// Date: 2017-02-12
+// Version: 3.8
+// Date: 2017-09-25
// More info: http://fivefilters.org/content-only/
// Help: http://help.fivefilters.org
@@ -183,7 +183,9 @@ if (!isset($_REQUEST['url'])) {
die('No URL supplied');
}
$url = trim($_REQUEST['url']);
-if (strtolower(substr($url, 0, 7)) == 'feed://') {
+if (strtolower(substr($url, 0, 6)) == 'sec://') {
+ $url = 'https://'.substr($url, 6);
+} elseif (strtolower(substr($url, 0, 7)) == 'feed://') {
$url = 'http://'.substr($url, 7);
}
if (!preg_match('!^https?://.+!i', $url)) {
@@ -345,10 +347,10 @@ if ($options->content === 'user') {
// HTML5 output?
///////////////////////////////////////////////
if ($options->html5_output === 'user') {
- if (isset($_REQUEST['content']) && $_REQUEST['content'] === 'html5') {
- $options->html5_output = true;
- } else {
+ if (isset($_REQUEST['content']) && $_REQUEST['content'] === '1') {
$options->html5_output = false;
+ } else {
+ $options->html5_output = true;
}
}
@@ -820,7 +822,7 @@ foreach ($items as $key => $item) {
continue; // skip this feed item entry
}
}
- $base_url = get_base_url($readability->dom);
+ $base_url = get_base_url($readability->dom, $effective_url);
if (!$base_url) $base_url = $effective_url;
$content_block = ($extract_result) ? $extractor->getContent() : null;
$extracted_title = ($extract_result) ? $extractor->getTitle() : '';
@@ -945,6 +947,7 @@ foreach ($items as $key => $item) {
//unset($content_block);
// post-processing cleanup
$html = preg_replace('![\s\h\v]*
!u', '', $html);
+ $html = str_replace('
', '', $html);
if ($links == 'remove') {
$html = preg_replace('!]*>!', '', $html);
$html = preg_replace('!!', '', $html);
@@ -1080,6 +1083,7 @@ foreach ($items as $key => $item) {
$l_result = $l->detect($text_sample, 1);
if (count($l_result) > 0) {
$language = key($l_result);
+ debug('Language detected: '.$language);
}
}
} catch (Exception $e) {
@@ -1248,6 +1252,17 @@ function get_self_url() {
}
function validate_url($url) {
+ if (function_exists('idn_to_ascii')) {
+ if ($host = @parse_url($url, PHP_URL_HOST)) {
+ $puny = idn_to_ascii($host, 0, INTL_IDNA_VARIANT_UTS46);
+ if ($host != $puny) {
+ $pos = strpos($url, $host);
+ if ($pos !== false) {
+ $url = substr_replace($url, $puny, $pos, strlen($host));
+ }
+ }
+ }
+ }
$url = filter_var($url, FILTER_SANITIZE_URL);
$test = filter_var($url, FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED);
// deal with bug http://bugs.php.net/51192 (present in PHP 5.2.13 and PHP 5.3.2)
@@ -1261,9 +1276,14 @@ function validate_url($url) {
}
}
-function get_base_url($dom) {
+function get_base_url($dom, $url=null) {
$xpath = new DOMXPath($dom);
- return @$xpath->evaluate('string(//head/base/@href)', $dom);
+ $base = @$xpath->evaluate('string(//head/base/@href)', $dom);
+ if (!$base) return false;
+ if (isset($url) && !preg_match('!^https?://!i', $base)) {
+ $base = make_absolute_str($url, $base);
+ }
+ return $base;
}
function is_ssl() {
@@ -1436,7 +1456,7 @@ function make_absolute_attr($base, $e, $attr) {
$url = str_replace(' ', '%20', $url);
if (!preg_match('!https?://!i', $url)) {
if ($absolute = SimplePie_IRI::absolutize($base, $url)) {
- $e->setAttribute($attr, $absolute);
+ $e->setAttribute($attr, $absolute->get_uri());
}
}
}
@@ -1450,7 +1470,7 @@ function make_absolute_str($base, $url) {
return $url;
} else {
if ($absolute = SimplePie_IRI::absolutize($base, $url)) {
- return $absolute;
+ return $absolute->get_uri();
}
return false;
}
@@ -1529,7 +1549,7 @@ function get_single_page($item, $html, $url) {
}
}
}
- $base_url = get_base_url($readability->dom);
+ $base_url = get_base_url($readability->dom, $url);
if (!$base_url) $base_url = $url;
// If we've got URL, resolve against $base_url
if (isset($single_page_url) && ($single_page_url = make_absolute_str($base_url, $single_page_url))) {