From 1ec2f36b3e67eb589f88154ed7c02bdae951f021 Mon Sep 17 00:00:00 2001 From: "FiveFilters.org" Date: Thu, 4 Apr 2019 23:46:36 +0200 Subject: [PATCH] Full-Text RSS 3.8 --- changelog.txt | 62 +++-- config.php | 13 +- ftr_compatibility_test.php | 12 +- index.php | 5 +- .../content-extractor/ContentExtractor.php | 66 +++-- libraries/content-extractor/SiteConfig.php | 213 +++++++++------- libraries/feedwriter/FeedItem.php | 3 +- libraries/feedwriter/FeedWriter.php | 2 + libraries/htmLawed/htmLawed.php | 228 +++++++++--------- .../html5php/HTML5/Parser/DOMTreeBuilder.php | 5 +- libraries/html5php/HTML5/Parser/Tokenizer.php | 77 +++--- libraries/html5php/README.md | 13 +- libraries/html5php/RELEASE.md | 12 +- .../humble-http-agent/HumbleHttpAgent.php | 47 +++- libraries/language-detect/LanguageDetect.php | 159 +++++------- .../LanguageDetect/Exception.php | 12 + .../language-detect/LanguageDetect/ISO639.php | 16 +- .../language-detect/LanguageDetect/Parser.php | 188 +++++++-------- libraries/language-detect/unicode_blocks.dat | 2 +- libraries/readability/Readability.php | 1 + makefulltextfeed.php | 44 +++- 21 files changed, 635 insertions(+), 545 deletions(-) diff --git a/changelog.txt b/changelog.txt index 9918041..c66978a 100644 --- a/changelog.txt +++ b/changelog.txt @@ -2,6 +2,22 @@ FiveFilters.org: Full-Text RSS http://fivefilters.org/content-only/ CHANGELOG ------------------------------------ +3.8 (2017-09-25) + - New site config directive: strip_attr: XPath attribute selector (e.g. 
//img/@srcset) - remove attribute from element + - New site config directive: insert_detected_image: yes/no (default yes) - places image in og:image in the body if no other images extracted + - Bug fix: Better handling of Internationalized Domain Names (IDNs) + - Bug fix: Relative base URLs () now resolved against page URL + - Bug fix: Wrong site config file chosen in certain cases (when wildcard and exact subdomain files available and cached in APCu) + - Bug fix: ' HTML entities not converted correctly when parsing with Gumbo PHP + - Remove srcset (+ sizes) attributes on img elements if it looks like they only contain relative URLs (browser will use src attribute value instead) + - https:// URLs now re-written to sec:// before being submitted to avoid overzealous security software blocking request on some servers - no redirect, only affects newly submitted URLs on index.php + - HTML5-PHP library updated + - Language Detect library updated + - Site config files updated for better extraction + - Minimum PHP version is now 5.4. 
If you must use PHP 5.3, please stick with Full-Text RSS 3.7 + - Tested with PHP 7.2 + - Other fixes/improvements + 3.7 (2017-02-12) - Request HTML5 output using HTML5-PHP - new config option $options->html5_output and new request parameter &content=html5 - Improve support for lazy-loading images @@ -23,31 +39,31 @@ CHANGELOG - Other fixes/improvements 3.6 (2016-02-21) -- Insert og:image (if we find one) at the top of the article when no images have been extracted -- Additional lazy image load handling - helps preserve more images designed for JS-enabled browsers -- Original GUID values from feed items now preserved -- New config option favour_effective_url determines if item's effective URL (after redirects) should replace original item URL in feed output -- Adding &use_effective_url to querystring will replace original feed item URL with effective URL (unless disabled with config option above) -- APCu stats view in admin panel fixed to work with recent versions of APCu -- HTML5-PHP library updated -- Tested for PHP 7 compatibility -- VPS Puppet script (ubuntu-15.10.pp) updated - fixes issue with IDN encodings, among other things. (This is intended for setting up a new Ubuntu 15.10 instance for running Full-Text RSS.) 
-- Site config files updated for better extraction -- Other minor fixes/improvements + - Insert og:image (if we find one) at the top of the article when no images have been extracted + - Additional lazy image load handling - helps preserve more images designed for JS-enabled browsers + - Original GUID values from feed items now preserved + - New config option favour_effective_url determines if item's effective URL (after redirects) should replace original item URL in feed output + - Adding &use_effective_url to querystring will replace original feed item URL with effective URL (unless disabled with config option above) + - APCu stats view in admin panel fixed to work with recent versions of APCu + - HTML5-PHP library updated + - Tested for PHP 7 compatibility + - VPS Puppet script (ubuntu-15.10.pp) updated - fixes issue with IDN encodings, among other things. (This is intended for setting up a new Ubuntu 15.10 instance for running Full-Text RSS.) + - Site config files updated for better extraction + - Other minor fixes/improvements 3.5 (2015-06-13) -- Open Graph properties og:title, og:type, og:url, og:image, and og:description now returned if found in the page being processed -- Bug fix: certain XPath expressions weren't being evaluated correctly when HTML5 parsing was enabled -- Cookie handling now only on redirects - fixes issue with certain sites (thanks to Dave Vasilevsky) -- Compatibility test will no longer show HHVM as incompatible - Full-Text RSS worked with HHVM 3.7.1 in our tests (but without Tidy support and no automatic site config updates) -- Humble HTTP Agent updated to support version 2 of PHP's HTTP extension -- HTML5-PHP library updated -- Site config files can now include HTTP headers (user-agent, cookie, referer), e.g. http_header(user-agent): PHP/5.6 -- Config option removed: $options->user_agents - use site config files. -- Site config files which use single_page_link can now follow it with if_page_contains: XPath to make it conditional. 
-- Minimum supported PHP version is now 5.3. If you must use PHP 5.2, please download Full-Text RSS 3.4 -- Site config files updated for better extraction -- Other minor fixes/improvements + - Open Graph properties og:title, og:type, og:url, og:image, and og:description now returned if found in the page being processed + - Bug fix: certain XPath expressions weren't being evaluated correctly when HTML5 parsing was enabled + - Cookie handling now only on redirects - fixes issue with certain sites (thanks to Dave Vasilevsky) + - Compatibility test will no longer show HHVM as incompatible - Full-Text RSS worked with HHVM 3.7.1 in our tests (but without Tidy support and no automatic site config updates) + - Humble HTTP Agent updated to support version 2 of PHP's HTTP extension + - HTML5-PHP library updated + - Site config files can now include HTTP headers (user-agent, cookie, referer), e.g. http_header(user-agent): PHP/5.6 + - Config option removed: $options->user_agents - use site config files. + - Site config files which use single_page_link can now follow it with if_page_contains: XPath to make it conditional. + - Minimum supported PHP version is now 5.3. If you must use PHP 5.2, please download Full-Text RSS 3.4 + - Site config files updated for better extraction + - Other minor fixes/improvements 3.4 (2014-09-08) - New request parameter: siteconfig lets you submit extraction rules directly in request diff --git a/config.php b/config.php index 06f6ec2..6860d36 100644 --- a/config.php +++ b/config.php @@ -61,16 +61,15 @@ $options->content = 'user'; // HTML5 output // ---------------------- -// By default, Full-Text RSS uses libxml to convert the parsed DOM tree back into HTML. -// If this is enabled, we'll use HTML5-PHP to produce the HTML. This will be a little -// slower, but might produce better results, adhering to the HTML5 spec. -// -// Note: in a future release we might make HTML5 output the default. 
+// Full-Text RSS used to rely on libxml to output HTML extracted from +// a web page. Since version 3.8 we use HTML5-PHP by default. +// If you prefer the old output, either set this to false or pass &content=1 +// in the querystring. // // Possible values... // HTML5 (slower): true // libxml (faster): false -// libxml unless user overrides (&content=html5): 'user' (default) +// HTML5 unless user overrides (&content=1): 'user' (default) $options->html5_output = 'user'; // Excerpts @@ -524,7 +523,7 @@ $options->cache_cleanup = 100; /// DO NOT CHANGE ANYTHING BELOW THIS /////////// ///////////////////////////////////////////////// -if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '3.7'); +if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '3.8'); if (basename(__FILE__) == 'config.php') { if (file_exists(dirname(__FILE__).'/custom_config.php')) { diff --git a/ftr_compatibility_test.php b/ftr_compatibility_test.php index 98ac060..4ce9161 100644 --- a/ftr_compatibility_test.php +++ b/ftr_compatibility_test.php @@ -16,12 +16,12 @@ SimplePie.org. We have kept most of their checks intact as we use SimplePie in o http://github.com/simplepie/simplepie/tree/master/compatibility_test/ */ -$app_name = 'Full-Text RSS 3.7'; +$app_name = 'Full-Text RSS 3.8'; // Full-Text RSS is not yet compatible with HHVM, that's why we check for it with HHVM_VERSION. 
//$php_ok = (function_exists('version_compare') && version_compare(phpversion(), '5.2.0', '>=') && !defined('HHVM_VERSION')); // HHVM works okay, but no Tidy and autoupdate of site config files not working (tested 3.7.1) -$php_ok = (function_exists('version_compare') && version_compare(phpversion(), '5.3.0', '>=')); +$php_ok = (function_exists('version_compare') && version_compare(phpversion(), '5.4.0', '>=')); $pcre_ok = extension_loaded('pcre'); $zlib_ok = extension_loaded('zlib'); $mbstring_ok = extension_loaded('mbstring'); @@ -32,6 +32,7 @@ $parallel_ok = ((extension_loaded('http') && class_exists('http\Client\Request') $allow_url_fopen_ok = (bool)ini_get('allow_url_fopen'); $filter_ok = extension_loaded('filter'); $gumbo_ok = class_exists('Layershifter\Gumbo\Parser'); +$idn_ok = function_exists('idn_to_ascii'); if (extension_loaded('xmlreader')) { $xml_ok = true; @@ -204,7 +205,7 @@ div.chunk { PHP - 5.3 or higher + 5.4 or higher @@ -354,6 +355,11 @@ div.chunk {

Further info

+ +

IDN support

+

When handling an internationalized domain name (IDN), Full-Text RSS will try to make use of PHP's idn_to_ascii function to convert the domain to ASCII. If this function does not exist, you might have trouble retrieving article content from internationalized domains.

+

idn_to_ascii is <?php if (!$idn_ok) echo 'not'; ?> available on this server.

+

HTTP module

Full-Text RSS can make use of PHP's HTTP extension or curl_multi to make parallel HTTP requests when processing feeds. If neither is available, it will make sequential requests using file_get_contents.

content - 0, 1 (default), html5 - If set to 0, the extracted content will not be included in the output. If set to html5, we'll output HTML5. + 0, 1, html5 (default) + If set to 0, the extracted content will not be included in the output. If set to 1, we'll use regular libxml output - might not be HTML5 compliant. diff --git a/libraries/content-extractor/ContentExtractor.php b/libraries/content-extractor/ContentExtractor.php index 2519f2e..7f8c652 100644 --- a/libraries/content-extractor/ContentExtractor.php +++ b/libraries/content-extractor/ContentExtractor.php @@ -5,8 +5,8 @@ * Uses patterns specified in site config files and auto detection (hNews/PHP Readability) * to extract content from HTML files. * - * @version 1.3 - * @date 2017-02-12 + * @version 1.4 + * @date 2017-09-25 * @author Keyvan Minoukadeh * @copyright 2017 Keyvan Minoukadeh * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3 @@ -107,24 +107,13 @@ class ContentExtractor } // returns SiteConfig instance (joined in order: exact match, wildcard, fingerprint, global, default) - public function buildSiteConfig($url, $html='', $add_to_cache=true) { + public function buildSiteConfig($url, $html='') { // extract host name $host = @parse_url($url, PHP_URL_HOST); $host = strtolower($host); if (substr($host, 0, 4) == 'www.') $host = substr($host, 4); - // is merged version already cached? - if (SiteConfig::is_cached("$host.merged")) { - $config = SiteConfig::build("$host.merged"); - if ($config) { - $this->debug("Returning cached and merged site config for $host"); - return $config; - } - } // let's build from site_config/custom/ and standard/ $config = SiteConfig::build($host); - if ($add_to_cache && $config && !SiteConfig::is_cached("$host")) { - SiteConfig::add_to_cache($host, $config); - } // if no match, use defaults if (!$config) $config = new SiteConfig(); // load fingerprint config? 
@@ -134,10 +123,6 @@ class ContentExtractor if ($config_fingerprint = SiteConfig::build($_fphost)) { $this->debug("Appending site config settings from $_fphost (fingerprint match)"); $config->append($config_fingerprint); - if ($add_to_cache && !SiteConfig::is_cached($_fphost)) { - //$config_fingerprint->cache_in_apc = true; - SiteConfig::add_to_cache($_fphost, $config_fingerprint); - } } } } @@ -146,19 +131,8 @@ class ContentExtractor if ($config_global = SiteConfig::build('global', true)) { $this->debug('Appending site config settings from global.txt'); $config->append($config_global); - if ($add_to_cache && !SiteConfig::is_cached('global')) { - //$config_global->cache_in_apc = true; - SiteConfig::add_to_cache('global', $config_global); - } } } - // store copy of merged config - if ($add_to_cache) { - // do not store in APC if wildcard match - $use_apc = ($host == $config->cache_key); - $config->cache_key = null; - SiteConfig::add_to_cache("$host.merged", $config, $use_apc); - } return $config; } @@ -398,10 +372,14 @@ class ContentExtractor $elems = @$xpath->query($pattern, $this->readability->dom); // check for matches if ($elems && $elems->length > 0) { - $this->debug('Stripping '.$elems->length.' elements (strip)'); + $this->debug('Stripping '.$elems->length.' elements (strip: '.$pattern.')'); for ($i=$elems->length-1; $i >= 0; $i--) { if ($elems->item($i)->parentNode) { - $elems->item($i)->parentNode->removeChild($elems->item($i)); + if ($elems->item($i) instanceof DOMAttr) { + $elems->item($i)->parentNode->removeAttributeNode($elems->item($i)); + } else { + $elems->item($i)->parentNode->removeChild($elems->item($i)); + } } } } @@ -413,7 +391,7 @@ class ContentExtractor $elems = @$xpath->query("//*[contains(@class, '$string') or contains(@id, '$string')]", $this->readability->dom); // check for matches if ($elems && $elems->length > 0) { - $this->debug('Stripping '.$elems->length.' elements (strip_id_or_class)'); + $this->debug('Stripping '.$elems->length.' 
elements (strip_id_or_class: '.$string.')'); for ($i=$elems->length-1; $i >= 0; $i--) { $elems->item($i)->parentNode->removeChild($elems->item($i)); } @@ -426,12 +404,13 @@ class ContentExtractor $elems = @$xpath->query("//img[contains(@src, '$string')]", $this->readability->dom); // check for matches if ($elems && $elems->length > 0) { - $this->debug('Stripping '.$elems->length.' image elements'); + $this->debug('Stripping '.$elems->length.' elements (strip_image_src: '.$string.')'); for ($i=$elems->length-1; $i >= 0; $i--) { $elems->item($i)->parentNode->removeChild($elems->item($i)); } } } + // strip elements using Readability.com and Instapaper.com ignore class names // .entry-unrelated and .instapaper_ignore // See https://www.readability.com/publishers/guidelines/#view-plainGuidelines @@ -464,7 +443,22 @@ class ContentExtractor $elems->item($i)->parentNode->removeChild($elems->item($i)); } } - + + // strip img srcset/sizes attributes with relative URIs (src should be present and will be absolutised) + // TODO: absolutize srcet values rather than removing them + // To remove srcset from all image elements, site config files can contain: strip: //img/@srcset + $elems = $xpath->query("//img[@srcset and not(contains(@srcset, '//'))]", $this->readability->dom); + // check for matches + if ($elems && $elems->length > 0) { + $this->debug('Stripping '.$elems->length.' srcset attributes'); + foreach ($elems as $elem) { + $elem->removeAttribute('srcset'); + if ($elem->hasAttribute('sizes')) { + $elem->removeAttribute('sizes'); + } + } + } + // try to get body foreach ($this->config->body as $pattern) { $elems = @$xpath->query($pattern, $this->readability->dom); @@ -880,7 +874,7 @@ class ContentExtractor } } else { // If there's an og:image, but we have no images in the article, let's place it at the beginning of the article. 
- if ($this->body->hasChildNodes() && isset($this->opengraph['og:image']) && substr($this->opengraph['og:image'], 0, 4) === 'http') { + if ($this->config->insert_detected_image() && $this->body->hasChildNodes() && isset($this->opengraph['og:image']) && substr($this->opengraph['og:image'], 0, 4) === 'http') { $elems = @$xpath->query(".//img", $this->body); if ($elems->length === 0) { $_new_elem = $this->body->ownerDocument->createDocumentFragment(); @@ -902,7 +896,7 @@ class ContentExtractor return $this->success; } - + private function isDescendant(DOMElement $parent, DOMElement $child) { $node = $child->parentNode; while ($node != null) { diff --git a/libraries/content-extractor/SiteConfig.php b/libraries/content-extractor/SiteConfig.php index 3b90a75..fce2b04 100644 --- a/libraries/content-extractor/SiteConfig.php +++ b/libraries/content-extractor/SiteConfig.php @@ -5,10 +5,10 @@ * Each instance of this class should hold extraction patterns and other directives * for a website. See ContentExtractor class to see how it's used. * - * @version 1.0 - * @date 2015-06-09 + * @version 1.1 + * @date 2017-09-25 * @author Keyvan Minoukadeh - * @copyright 2015 Keyvan Minoukadeh + * @copyright 2017 Keyvan Minoukadeh * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3 */ @@ -43,7 +43,6 @@ class SiteConfig // Process HTML with tidy before creating DOM (bool or null if undeclared) public $tidy = null; - protected $default_tidy = true; // used if undeclared // Autodetect title/body if xpath expressions fail to produce results. 
@@ -93,6 +92,12 @@ class SiteConfig public $parser = null; protected $default_parser = 'libxml'; // used if undeclared + // Insert detected image (currently only og:image) into beginning of extracted article + // Only does this if extracted article contains no images + // bool or null if undeclared + public $insert_detected_image = null; + protected $default_insert_detected_image = true; // used if undeclared + // Strings to search for in HTML before processing begins (used with $replace_string) public $find_string = array(); // Strings to replace those found in $find_string before HTML processing begins @@ -101,10 +106,9 @@ class SiteConfig // the options below cannot be set in the config files which this class represents //public $cache_in_apc = false; // used to decide if we should cache in apc or not - public $cache_key = null; public static $debug = false; protected static $apc = false; - protected static $config_path; + protected static $config_path_custom; protected static $config_path_fallback; protected static $config_cache = array(); const HOSTNAME_REGEX = '/^(([a-zA-Z0-9-]*[a-zA-Z0-9])\.)*([A-Za-z0-9-]*[A-Za-z0-9])$/'; @@ -136,7 +140,13 @@ class SiteConfig self::$apc = $apc; return $apc; } - + + // return bool or null + public function insert_detected_image($use_default=true) { + if ($use_default) return (isset($this->insert_detected_image)) ? $this->insert_detected_image : $this->default_insert_detected_image; + return $this->insert_detected_image; + } + // return bool or null public function tidy($use_default=true) { if ($use_default) return (isset($this->tidy)) ? 
$this->tidy : $this->default_tidy; @@ -162,15 +172,32 @@ class SiteConfig } public static function set_config_path($path, $fallback=null) { - self::$config_path = $path; + self::$config_path_custom = $path; self::$config_path_fallback = $fallback; } - + + protected static function load_cached_merged($host, $exact_host_match) { + if ($exact_host_match) { + $key = $host.'.merged.ex'; + } else { + $key = $host.'.merged'; + } + return self::load_cached($key); + } + + protected static function add_to_cache_merged($host, $exact_host_match, SiteConfig $config=null) { + if ($exact_host_match) { + $key = $host.'.merged.ex'; + } else { + $key = $host.'.merged'; + } + if (!isset($config)) $config = new SiteConfig(); + self::add_to_cache($key, $config); + } + public static function add_to_cache($key, SiteConfig $config, $use_apc=true) { $key = strtolower($key); if (substr($key, 0, 4) == 'www.') $key = substr($key, 4); - if ($config->cache_key) $key = $config->cache_key; - $key .= '.'.self::get_key_suffix(); self::$config_cache[$key] = $config; if (self::$apc && $use_apc) { self::debug("Adding site config to APC cache with key sc.$key"); @@ -178,10 +205,23 @@ class SiteConfig } self::debug("Cached site config with key $key"); } - + + public static function load_cached($key) { + $key = strtolower($key); + if (substr($key, 0, 4) == 'www.') $key = substr($key, 4); + //var_dump('in cache?', $key, self::$config_cache); + if (array_key_exists($key, self::$config_cache)) { + self::debug("... site config for $key already loaded in this request"); + return self::$config_cache[$key]; + } elseif (self::$apc && ($sconfig = apc_fetch("sc.$key"))) { + self::debug("... 
site config for $key found in APCu"); + return $sconfig; + } + return false; + } + public static function is_cached($key) { $key = strtolower($key); - $key .= '.'.self::get_key_suffix(); if (substr($key, 0, 4) == 'www.') $key = substr($key, 4); if (array_key_exists($key, self::$config_cache)) { return true; @@ -212,7 +252,7 @@ class SiteConfig } // check for single statement commands // we do not overwrite existing non null values - foreach (array('tidy', 'prune', 'parser', 'autodetect_on_failure') as $var) { + foreach (array('tidy', 'prune', 'parser', 'autodetect_on_failure', 'insert_detected_image') as $var) { if ($this->$var === null) $this->$var = $newconfig->$var; } // treat find_string and replace_string separately (don't apply array_unique) (thanks fabrizio!) @@ -222,16 +262,6 @@ class SiteConfig $this->$var = array_merge($this->$var, $newconfig->$var); } } - - // This is used to make sure that when a different primary folder is chosen - // The key for the cached result includes that folder choice. - // Otherwise, a subsequent request choosing a different folder - // could return the wrong cached config. - public static function get_key_suffix() { - $key_suffix = basename(self::$config_path); - if ($key_suffix === 'custom') $key_suffix = ''; - return $key_suffix; - } // Add test_contains to last test_url public function add_test_contains($test_contains) { @@ -274,6 +304,12 @@ class SiteConfig $host = strtolower($host); if (substr($host, 0, 4) == 'www.') $host = substr($host, 4); if (!$host || (strlen($host) > 200) || !preg_match(self::HOSTNAME_REGEX, ltrim($host, '.'))) return false; + // got a merged one? + $config = self::load_cached_merged($host, $exact_host_match); + if ($config) { + //self::debug('. 
returned merged config from a previous request'); + return $config; + } // check for site configuration $try = array($host); // should we look for wildcard matches @@ -284,102 +320,87 @@ class SiteConfig $try[] = '.'.implode('.', $split); } } - - // Which primary folder should we look inside? - // If it's not the default ('custom'), we need - // a key suffix to distinguish site config fules - // held in this folder from those in other folders. - $key_suffix = self::get_key_suffix(); - // look for site config file in primary folder - self::debug(". looking for site config for $host in primary folder"); + // look for site config file in custom folder + self::debug(". looking for site config for $host in custom folder"); + //var_dump($try); + $config = null; + $config_std = null; foreach ($try as $h) { - $h_key = "$h.$key_suffix"; - if (array_key_exists($h_key, self::$config_cache)) { - self::debug("... site config for $h already loaded in this request"); - return self::$config_cache[$h_key]; - } elseif (self::$apc && ($sconfig = apc_fetch("sc.$h_key"))) { - self::debug("... site config for $h in APC cache"); - return $sconfig; - } elseif (file_exists(self::$config_path."/$h.txt")) { + //$h_key = $h.'.'.$key_suffix; + $h_key = $h.'.custom'; + //var_dump($h_key, $h); + if ($config = self::load_cached($h_key)) { + break; + } elseif (file_exists(self::$config_path_custom."/$h.txt")) { self::debug("... 
found site config ($h.txt)"); - $file_primary = self::$config_path."/$h.txt"; - $matched_name = $h; + $file_custom = self::$config_path_custom."/$h.txt"; + $config = self::build_from_file($file_custom); + //$matched_name = $h; break; } } // if we found site config, process it - if (isset($file_primary)) { - $config_lines = file($file_primary, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES); - if (!$config_lines || !is_array($config_lines)) return false; - $config = self::build_from_array($config_lines); - // if APC caching is available and enabled, mark this for cache - //$config->cache_in_apc = true; - $config->cache_key = $matched_name; - - // if autodetec on failure is off (on by default) we do not need to look - // in secondary folder - if (!$config->autodetect_on_failure()) { - self::debug('... autodetect on failure is disabled (no other site config files will be loaded)'); - return $config; - } + // if autodetec on failure is off (on by default) we do not need to look + // in secondary folder + if ($config && !$config->autodetect_on_failure()) { + self::debug('... autodetect on failure is disabled (no other site config files will be loaded)'); + self::add_to_cache_merged($host, $exact_host_match, $config); + return $config; } // look for site config file in secondary folder if (isset(self::$config_path_fallback)) { - self::debug(". looking for site config for $host in secondary folder"); + self::debug(". looking for site config for $host in standard folder"); foreach ($try as $h) { - if (file_exists(self::$config_path_fallback."/$h.txt")) { - self::debug("... found site config in secondary folder ($h.txt)"); + if ($config_std = self::load_cached($h)) { + break; + } elseif (file_exists(self::$config_path_fallback."/$h.txt")) { + self::debug("... 
found site config in standard folder ($h.txt)"); $file_secondary = self::$config_path_fallback."/$h.txt"; - $matched_name = $h; + $config_std = self::build_from_file($file_secondary); break; } } - if (!isset($file_secondary)) { - self::debug("... no site config match in secondary folder"); - } } // return false if no config file found - if (!isset($file_primary) && !isset($file_secondary)) { + if (!$config && !$config_std) { self::debug("... no site config match for $host"); + self::add_to_cache_merged($host, $exact_host_match); return false; } - // return primary config if secondary not found - if (!isset($file_secondary) && isset($config)) { - return $config; - } - - // process secondary config file - $config_lines = file($file_secondary, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES); - if (!$config_lines || !is_array($config_lines)) { - // failed to process secondary - if (isset($config)) { - // return primary config - return $config; - } else { - return false; - } - } - - // merge with primary and return - if (isset($config)) { + // final config handling + $config_final = null; + if (!$config_std && $config) { + $config_final = $config; + // merge with primary + } elseif ($config_std && $config) { self::debug('. 
merging config files'); - $config->append(self::build_from_array($config_lines)); - return $config; + $config->append($config_std); + $config_final = $config; } else { // return just secondary - $config = self::build_from_array($config_lines); + //$config = self::build_from_array($config_lines); // if APC caching is available and enabled, mark this for cache //$config->cache_in_apc = true; - $config->cache_key = $matched_name; - return $config; + $config_final = $config_std; } + self::add_to_cache_merged($host, $exact_host_match, $config_final); + return $config_final; } + public static function build_from_file($path, $cache=true) { + $key = basename($path, '.txt'); + $config_lines = file($path, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES); + if (!$config_lines || !is_array($config_lines)) return false; + $config = self::build_from_array($config_lines); + if ($cache) self::add_to_cache($key, $config); + return $config; + } + public static function build_from_string($string) { $config_lines = explode("\n", $string); return self::build_from_array($config_lines); @@ -399,13 +420,23 @@ class SiteConfig if (count($command) != 2) continue; $val = trim($command[1]); $command = trim($command[0]); - if ($command == '' || $val == '') continue; - + //if ($command == '' || $val == '') continue; + // $val can be empty, e.g. replace_string: + if ($command == '') continue; + + // strip_attr is now an alias for strip. + // In FTR 3.8 we can strip attributes from elements, not only the elements themselves + // e.g. strip: //img/@srcset (removes srcset attribute from all img elements) + // but for backward compatibility (to avoid errors with new config files + old version of FTR) + // we've introduced strip_attr and we'll recommend using that in our public site config rep. 
+ // strip_attr: //img/@srcset + if ($command == 'strip_attr') $command = 'strip'; + // check for commands where we accept multiple statements if (in_array($command, array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'native_ad_clue', 'http_header', 'test_url', 'find_string', 'replace_string'))) { array_push($config->$command, $val); // check for single statement commands that evaluate to true or false - } elseif (in_array($command, array('tidy', 'prune', 'autodetect_on_failure'))) { + } elseif (in_array($command, array('tidy', 'prune', 'autodetect_on_failure', 'insert_detected_image'))) { $config->$command = ($val == 'yes'); // check for single statement commands stored as strings } elseif (in_array($command, array('parser'))) { diff --git a/libraries/feedwriter/FeedItem.php b/libraries/feedwriter/FeedItem.php index ed7a8cf..e44268d 100644 --- a/libraries/feedwriter/FeedItem.php +++ b/libraries/feedwriter/FeedItem.php @@ -186,5 +186,4 @@ $this->setElement('enclosure','',$attributes); } - } // end of class FeedItem -?> + } \ No newline at end of file diff --git a/libraries/feedwriter/FeedWriter.php b/libraries/feedwriter/FeedWriter.php index 7061b02..42c7cd8 100644 --- a/libraries/feedwriter/FeedWriter.php +++ b/libraries/feedwriter/FeedWriter.php @@ -1,4 +1,6 @@ 1, 'abbr'=>1, 'acronym'=>1, 'address'=>1, 'applet'=>1, 'area'=>1, 'b'=>1, 'bdo'=>1, 'big'=>1, 'blockquote'=>1, 'br'=>1, 'button'=>1, 'caption'=>1, 'center'=>1, 'cite'=>1, 'code'=>1, 'col'=>1, 'colgroup'=>1, 'dd'=>1, 'del'=>1, 'dfn'=>1, 'dir'=>1, 'div'=>1, 'dl'=>1, 'dt'=>1, 'em'=>1, 'embed'=>1, 'fieldset'=>1, 'font'=>1, 'form'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'hr'=>1, 'i'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'ins'=>1, 'isindex'=>1, 'kbd'=>1, 'label'=>1, 'legend'=>1, 'li'=>1, 'map'=>1, 'menu'=>1, 'noscript'=>1, 'object'=>1, 'ol'=>1, 'optgroup'=>1, 'option'=>1, 'p'=>1, 
'param'=>1, 'pre'=>1, 'q'=>1, 'rb'=>1, 'rbc'=>1, 'rp'=>1, 'rt'=>1, 'rtc'=>1, 'ruby'=>1, 's'=>1, 'samp'=>1, 'script'=>1, 'select'=>1, 'small'=>1, 'span'=>1, 'strike'=>1, 'strong'=>1, 'sub'=>1, 'sup'=>1, 'table'=>1, 'tbody'=>1, 'td'=>1, 'textarea'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1, 'tt'=>1, 'u'=>1, 'ul'=>1, 'var'=>1); // 86/deprecated+embed+ruby +$e = array('a'=>1, 'abbr'=>1, 'acronym'=>1, 'address'=>1, 'applet'=>1, 'area'=>1, 'article'=>1, 'aside'=>1, 'audio'=>1, 'b'=>1, 'bdi'=>1, 'bdo'=>1, 'big'=>1, 'blockquote'=>1, 'br'=>1, 'button'=>1, 'canvas'=>1, 'caption'=>1, 'center'=>1, 'cite'=>1, 'code'=>1, 'col'=>1, 'colgroup'=>1, 'command'=>1, 'data'=>1, 'datalist'=>1, 'dd'=>1, 'del'=>1, 'details'=>1, 'dfn'=>1, 'dir'=>1, 'div'=>1, 'dl'=>1, 'dt'=>1, 'em'=>1, 'embed'=>1, 'fieldset'=>1, 'figcaption'=>1, 'figure'=>1, 'font'=>1, 'footer'=>1, 'form'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'header'=>1, 'hgroup'=>1, 'hr'=>1, 'i'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'ins'=>1, 'isindex'=>1, 'kbd'=>1, 'keygen'=>1, 'label'=>1, 'legend'=>1, 'li'=>1, 'link'=>1, 'main'=>1, 'map'=>1, 'mark'=>1, 'menu'=>1, 'meta'=>1, 'meter'=>1, 'nav'=>1, 'noscript'=>1, 'object'=>1, 'ol'=>1, 'optgroup'=>1, 'option'=>1, 'output'=>1, 'p'=>1, 'param'=>1, 'pre'=>1, 'progress'=>1, 'q'=>1, 'rb'=>1, 'rbc'=>1, 'rp'=>1, 'rt'=>1, 'rtc'=>1, 'ruby'=>1, 's'=>1, 'samp'=>1, 'script'=>1, 'section'=>1, 'select'=>1, 'small'=>1, 'source'=>1, 'span'=>1, 'strike'=>1, 'strong'=>1, 'style'=>1, 'sub'=>1, 'summary'=>1, 'sup'=>1, 'table'=>1, 'tbody'=>1, 'td'=>1, 'textarea'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'time'=>1, 'tr'=>1, 'track'=>1, 'tt'=>1, 'u'=>1, 'ul'=>1, 'var'=>1, 'video'=>1, 'wbr'=>1); // 118 incl. deprecated & some Ruby + if(!empty($C['safe'])){ - unset($e['applet'], $e['embed'], $e['iframe'], $e['object'], $e['script']); + unset($e['applet'], $e['audio'], $e['canvas'], $e['embed'], $e['iframe'], $e['object'], $e['script'], $e['video']); } $x = !empty($C['elements']) ? 
str_replace(array("\n", "\r", "\t", ' '), '', $C['elements']) : '*'; if($x == '-*'){$e = array();} @@ -39,21 +40,20 @@ else{ } $C['elements'] =& $e; // config attrs -$x = !empty($C['deny_attribute']) ? str_replace(array("\n", "\r", "\t", ' '), '', $C['deny_attribute']) : ''; -$x = array_flip((isset($x[0]) && $x[0] == '*') ? explode('-', $x) : explode(',', $x. (!empty($C['safe']) ? ',on*' : ''))); -if(isset($x['on*'])){ - unset($x['on*']); - $x += array('onblur'=>1, 'onchange'=>1, 'onclick'=>1, 'ondblclick'=>1, 'onfocus'=>1, 'onkeydown'=>1, 'onkeypress'=>1, 'onkeyup'=>1, 'onmousedown'=>1, 'onmousemove'=>1, 'onmouseout'=>1, 'onmouseover'=>1, 'onmouseup'=>1, 'onreset'=>1, 'onselect'=>1, 'onsubmit'=>1); -} +$x = !empty($C['deny_attribute']) ? strtolower(str_replace(array("\n", "\r", "\t", ' '), '', $C['deny_attribute'])) : ''; +$x = array_flip((isset($x[0]) && $x[0] == '*') ? str_replace('/', 'data-', explode('-', str_replace('data-', '/', $x))) : explode(',', $x. (!empty($C['safe']) ? ',on*' : ''))); $C['deny_attribute'] = $x; -// config URL -$x = (isset($C['schemes'][2]) && strpos($C['schemes'], ':')) ? strtolower($C['schemes']) : 'href: aim, feed, file, ftp, gopher, http, https, irc, mailto, news, nntp, sftp, ssh, telnet; *:file, http, https'; +// config URLs +$x = (isset($C['schemes'][2]) && strpos($C['schemes'], ':')) ? strtolower($C['schemes']) : 'href: aim, feed, file, ftp, gopher, http, https, irc, mailto, news, nntp, sftp, ssh, tel, telnet'. (empty($C['safe']) ? ', app, javascript; *: data, javascript, ' : '; *:'). 
'file, http, https'; $C['schemes'] = array(); foreach(explode(';', str_replace(array(' ', "\t", "\r", "\n"), '', $x)) as $v){ $x = $x2 = null; list($x, $x2) = explode(':', $v, 2); if($x2){$C['schemes'][$x] = array_flip(explode(',', $x2));} } -if(!isset($C['schemes']['*'])){$C['schemes']['*'] = array('file'=>1, 'http'=>1, 'https'=>1,);} +if(!isset($C['schemes']['*'])){ + $C['schemes']['*'] = array('file'=>1, 'http'=>1, 'https'=>1); + if(empty($C['safe'])){$C['schemes']['*'] += array('data'=>1, 'javascript'=>1);} +} if(!empty($C['safe']) && empty($C['schemes']['style'])){$C['schemes']['style'] = array('!'=>1);} $C['abs_url'] = isset($C['abs_url']) ? $C['abs_url'] : 0; if(!isset($C['base_url']) or !preg_match('`^[a-zA-Z\d.+\-]+://[^/]+/(.+?/)?$`', $C['base_url'])){ @@ -81,7 +81,7 @@ $C['parent'] = isset($C['parent'][0]) ? strtolower($C['parent']) : 'body'; $C['show_setting'] = !empty($C['show_setting']) ? $C['show_setting'] : 0; $C['style_pass'] = empty($C['style_pass']) ? 0 : 1; $C['tidy'] = empty($C['tidy']) ? 0 : $C['tidy']; -$C['unique_ids'] = isset($C['unique_ids']) ? $C['unique_ids'] : 1; +$C['unique_ids'] = isset($C['unique_ids']) && (!preg_match('`\W`', $C['unique_ids'])) ? $C['unique_ids'] : 1; $C['xml:lang'] = isset($C['xml:lang']) ? 
$C['xml:lang'] : 0; if(isset($GLOBALS['C'])){$reC = $GLOBALS['C'];} @@ -97,7 +97,7 @@ if($C['clean_ms_char']){ $t = strtr($t, $x); } if($C['cdata'] or $C['comment']){$t = preg_replace_callback('``sm', 'htmLawed::hl_cmtcd', $t);} -$t = preg_replace_callback('`&([A-Za-z][A-Za-z0-9]{1,30}|#(?:[0-9]{1,8}|[Xx][0-9A-Fa-f]{1,7}));`', 'htmLawed::hl_ent', str_replace('&', '&', $t)); +$t = preg_replace_callback('`&([a-zA-Z][a-zA-Z0-9]{1,30}|#(?:[0-9]{1,8}|[Xx][0-9A-Fa-f]{1,7}));`', 'htmLawed::hl_ent', str_replace('&', '&', $t)); if($C['unique_ids'] && !isset($GLOBALS['hl_Ids'])){$GLOBALS['hl_Ids'] = array();} if($C['hook']){$t = $C['hook']($t, $C, $S);} if($C['show_setting'] && preg_match('`^[a-z][a-z0-9_]*$`i', $C['show_setting'])){ @@ -112,18 +112,18 @@ unset($C, $e); if(isset($reC)){$GLOBALS['C'] = $reC;} if(isset($reS)){$GLOBALS['S'] = $reS;} return $t; -// eof } public static function hl_attrval($a, $t, $p){ // check attr val against $S -static $ma = array('accesskey', 'class', 'rel'); -$s = in_array($a, $ma) ? ' ' : ''; +static $ma = array('accesskey', 'class', 'itemtype', 'rel'); +$s = in_array($a, $ma) ? ' ' : ($a == 'srcset' ? ',': ''); $r = array(); $t = !empty($s) ? explode($s, $t) : array($t); foreach($t as $tk=>$tv){ - $o = 1; $l = strlen($tv); + $o = 1; $tv = trim($tv); $l = strlen($tv); foreach($p as $k=>$v){ + if(!$l){continue;} switch($k){ case 'maxlen': if($l > $v){$o = 0;} break; case 'minlen': if($l < $v){$o = 0;} @@ -146,30 +146,29 @@ foreach($t as $tk=>$tv){ } if($o){$r[] = $tv;} } +if($s == ','){$s = ', ';} $r = implode($s, $r); return (isset($r[0]) ? $r : (isset($p['default']) ? 
$p['default'] : 0)); -// eof } public static function hl_bal($t, $do=1, $in='div'){ // balance tags // by content $cB = array('blockquote'=>1, 'form'=>1, 'map'=>1, 'noscript'=>1); // Block -$cE = array('area'=>1, 'br'=>1, 'col'=>1, 'embed'=>1, 'hr'=>1, 'img'=>1, 'input'=>1, 'isindex'=>1, 'param'=>1); // Empty -$cF = array('button'=>1, 'del'=>1, 'div'=>1, 'dd'=>1, 'fieldset'=>1, 'iframe'=>1, 'ins'=>1, 'li'=>1, 'noscript'=>1, 'object'=>1, 'td'=>1, 'th'=>1); // Flow; later context-wise dynamic move of ins & del to $cI -$cI = array('a'=>1, 'abbr'=>1, 'acronym'=>1, 'address'=>1, 'b'=>1, 'bdo'=>1, 'big'=>1, 'caption'=>1, 'cite'=>1, 'code'=>1, 'dfn'=>1, 'dt'=>1, 'em'=>1, 'font'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'i'=>1, 'kbd'=>1, 'label'=>1, 'legend'=>1, 'p'=>1, 'pre'=>1, 'q'=>1, 'rb'=>1, 'rt'=>1, 's'=>1, 'samp'=>1, 'small'=>1, 'span'=>1, 'strike'=>1, 'strong'=>1, 'sub'=>1, 'sup'=>1, 'tt'=>1, 'u'=>1, 'var'=>1); // Inline -$cN = array('a'=>array('a'=>1), 'button'=>array('a'=>1, 'button'=>1, 'fieldset'=>1, 'form'=>1, 'iframe'=>1, 'input'=>1, 'label'=>1, 'select'=>1, 'textarea'=>1), 'fieldset'=>array('fieldset'=>1), 'form'=>array('form'=>1), 'label'=>array('label'=>1), 'noscript'=>array('script'=>1), 'pre'=>array('big'=>1, 'font'=>1, 'img'=>1, 'object'=>1, 'script'=>1, 'small'=>1, 'sub'=>1, 'sup'=>1), 'rb'=>array('ruby'=>1), 'rt'=>array('ruby'=>1)); // Illegal +$cE = array('area'=>1, 'br'=>1, 'col'=>1, 'command'=>1, 'embed'=>1, 'hr'=>1, 'img'=>1, 'input'=>1, 'isindex'=>1, 'keygen'=>1, 'link'=>1, 'meta'=>1, 'param'=>1, 'source'=>1, 'track'=>1, 'wbr'=>1); // Empty +$cF = array('a'=>1, 'article'=>1, 'aside'=>1, 'audio'=>1, 'button'=>1, 'canvas'=>1, 'del'=>1, 'details'=>1, 'div'=>1, 'dd'=>1, 'fieldset'=>1, 'figure'=>1, 'footer'=>1, 'header'=>1, 'iframe'=>1, 'ins'=>1, 'li'=>1, 'main'=>1, 'menu'=>1, 'nav'=>1, 'noscript'=>1, 'object'=>1, 'section'=>1, 'style'=>1, 'td'=>1, 'th'=>1, 'video'=>1); // Flow; later context-wise dynamic move of ins & del to $cI +$cI 
= array('abbr'=>1, 'acronym'=>1, 'address'=>1, 'b'=>1, 'bdi'=>1, 'bdo'=>1, 'big'=>1, 'caption'=>1, 'cite'=>1, 'code'=>1, 'data'=>1, 'datalist'=>1, 'dfn'=>1, 'dt'=>1, 'em'=>1, 'figcaption'=>1, 'font'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'hgroup'=>1, 'i'=>1, 'kbd'=>1, 'label'=>1, 'legend'=>1, 'mark'=>1, 'meter'=>1, 'output'=>1, 'p'=>1, 'pre'=>1, 'progress'=>1, 'q'=>1, 'rb'=>1, 'rt'=>1, 's'=>1, 'samp'=>1, 'small'=>1, 'span'=>1, 'strike'=>1, 'strong'=>1, 'sub'=>1, 'summary'=>1, 'sup'=>1, 'time'=>1, 'tt'=>1, 'u'=>1, 'var'=>1); // Inline +$cN = array('a'=>array('a'=>1, 'address'=>1, 'button'=>1, 'details'=>1, 'embed'=>1, 'keygen'=>1, 'label'=>1, 'select'=>1, 'textarea'=>1), 'address'=>array('address'=>1, 'article'=>1, 'aside'=>1, 'header'=>1, 'keygen'=>1, 'footer'=>1, 'nav'=>1, 'section'=>1), 'button'=>array('a'=>1, 'address'=>1, 'button'=>1, 'details'=>1, 'embed'=>1, 'fieldset'=>1, 'form'=>1, 'iframe'=>1, 'input'=>1, 'keygen'=>1, 'label'=>1, 'select'=>1, 'textarea'=>1), 'fieldset'=>array('fieldset'=>1), 'footer'=>array('header'=>1, 'footer'=>1), 'form'=>array('form'=>1), 'header'=>array('header'=>1, 'footer'=>1), 'label'=>array('label'=>1), 'main'=>array('main'=>1), 'meter'=>array('meter'=>1), 'noscript'=>array('script'=>1), 'pre'=>array('big'=>1, 'font'=>1, 'img'=>1, 'object'=>1, 'script'=>1, 'small'=>1, 'sub'=>1, 'sup'=>1), 'progress'=>array('progress'=>1), 'rb'=>array('ruby'=>1), 'rt'=>array('ruby'=>1), 'time'=>array('time'=>1), ); // Illegal $cN2 = array_keys($cN); -$cR = array('blockquote'=>1, 'dir'=>1, 'dl'=>1, 'form'=>1, 'map'=>1, 'menu'=>1, 'noscript'=>1, 'ol'=>1, 'optgroup'=>1, 'rbc'=>1, 'rtc'=>1, 'ruby'=>1, 'select'=>1, 'table'=>1, 'tbody'=>1, 'tfoot'=>1, 'thead'=>1, 'tr'=>1, 'ul'=>1); -$cS = array('colgroup'=>array('col'=>1), 'dir'=>array('li'=>1), 'dl'=>array('dd'=>1, 'dt'=>1), 'menu'=>array('li'=>1), 'ol'=>array('li'=>1), 'optgroup'=>array('option'=>1), 'option'=>array('#pcdata'=>1), 'rbc'=>array('rb'=>1), 'rp'=>array('#pcdata'=>1), 
'rtc'=>array('rt'=>1), 'ruby'=>array('rb'=>1, 'rbc'=>1, 'rp'=>1, 'rt'=>1, 'rtc'=>1), 'select'=>array('optgroup'=>1, 'option'=>1), 'script'=>array('#pcdata'=>1), 'table'=>array('caption'=>1, 'col'=>1, 'colgroup'=>1, 'tfoot'=>1, 'tbody'=>1, 'tr'=>1, 'thead'=>1), 'tbody'=>array('tr'=>1), 'tfoot'=>array('tr'=>1), 'textarea'=>array('#pcdata'=>1), 'thead'=>array('tr'=>1), 'tr'=>array('td'=>1, 'th'=>1), 'ul'=>array('li'=>1)); // Specific - immediate parent-child -if($GLOBALS['C']['direct_list_nest']){$cS['ol'] = $cS['ul'] += array('ol'=>1, 'ul'=>1);} -$cO = array('address'=>array('p'=>1), 'applet'=>array('param'=>1), 'blockquote'=>array('script'=>1), 'fieldset'=>array('legend'=>1, '#pcdata'=>1), 'form'=>array('script'=>1), 'map'=>array('area'=>1), 'object'=>array('param'=>1, 'embed'=>1)); // Other +$cS = array('colgroup'=>array('col'=>1), 'datalist'=>array('option'=>1), 'dir'=>array('li'=>1), 'dl'=>array('dd'=>1, 'dt'=>1), 'hgroup'=>array('h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1), 'menu'=>array('li'=>1), 'ol'=>array('li'=>1), 'optgroup'=>array('option'=>1), 'option'=>array('#pcdata'=>1), 'rbc'=>array('rb'=>1), 'rp'=>array('#pcdata'=>1), 'rtc'=>array('rt'=>1), 'ruby'=>array('rb'=>1, 'rbc'=>1, 'rp'=>1, 'rt'=>1, 'rtc'=>1), 'select'=>array('optgroup'=>1, 'option'=>1), 'script'=>array('#pcdata'=>1), 'table'=>array('caption'=>1, 'col'=>1, 'colgroup'=>1, 'tfoot'=>1, 'tbody'=>1, 'tr'=>1, 'thead'=>1), 'tbody'=>array('tr'=>1), 'tfoot'=>array('tr'=>1), 'textarea'=>array('#pcdata'=>1), 'thead'=>array('tr'=>1), 'tr'=>array('td'=>1, 'th'=>1), 'ul'=>array('li'=>1)); // Specific - immediate parent-child +if($GLOBALS['C']['direct_list_nest']){$cS['ol'] = $cS['ul'] = $cS['menu'] += array('menu'=>1, 'ol'=>1, 'ul'=>1);} +$cO = array('address'=>array('p'=>1), 'applet'=>array('param'=>1), 'audio'=>array('source'=>1, 'track'=>1), 'blockquote'=>array('script'=>1), 'details'=>array('summary'=>1), 'fieldset'=>array('legend'=>1, '#pcdata'=>1), 
'figure'=>array('figcaption'=>1),'form'=>array('script'=>1), 'map'=>array('area'=>1), 'object'=>array('param'=>1, 'embed'=>1), 'video'=>array('source'=>1, 'track'=>1)); // Other $cT = array('colgroup'=>1, 'dd'=>1, 'dt'=>1, 'li'=>1, 'option'=>1, 'p'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1); // Omitable closing -// block/inline type; ins & del both type; #pcdata: text -$eB = array('address'=>1, 'blockquote'=>1, 'center'=>1, 'del'=>1, 'dir'=>1, 'dl'=>1, 'div'=>1, 'fieldset'=>1, 'form'=>1, 'ins'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'hr'=>1, 'isindex'=>1, 'menu'=>1, 'noscript'=>1, 'ol'=>1, 'p'=>1, 'pre'=>1, 'table'=>1, 'ul'=>1); -$eI = array('#pcdata'=>1, 'a'=>1, 'abbr'=>1, 'acronym'=>1, 'applet'=>1, 'b'=>1, 'bdo'=>1, 'big'=>1, 'br'=>1, 'button'=>1, 'cite'=>1, 'code'=>1, 'del'=>1, 'dfn'=>1, 'em'=>1, 'embed'=>1, 'font'=>1, 'i'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'ins'=>1, 'kbd'=>1, 'label'=>1, 'map'=>1, 'object'=>1, 'q'=>1, 'ruby'=>1, 's'=>1, 'samp'=>1, 'select'=>1, 'script'=>1, 'small'=>1, 'span'=>1, 'strike'=>1, 'strong'=>1, 'sub'=>1, 'sup'=>1, 'textarea'=>1, 'tt'=>1, 'u'=>1, 'var'=>1); -$eN = array('a'=>1, 'big'=>1, 'button'=>1, 'fieldset'=>1, 'font'=>1, 'form'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'label'=>1, 'object'=>1, 'ruby'=>1, 'script'=>1, 'select'=>1, 'small'=>1, 'sub'=>1, 'sup'=>1, 'textarea'=>1); // Exclude from specific ele; $cN values -$eO = array('area'=>1, 'caption'=>1, 'col'=>1, 'colgroup'=>1, 'dd'=>1, 'dt'=>1, 'legend'=>1, 'li'=>1, 'optgroup'=>1, 'option'=>1, 'param'=>1, 'rb'=>1, 'rbc'=>1, 'rp'=>1, 'rt'=>1, 'rtc'=>1, 'script'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'thead'=>1, 'th'=>1, 'tr'=>1); // Missing in $eB & $eI +// block/inline type; a/ins/del both type; #pcdata: text +$eB = array('a'=>1, 'address'=>1, 'article'=>1, 'aside'=>1, 'blockquote'=>1, 'center'=>1, 'del'=>1, 'details'=>1, 'dir'=>1, 'dl'=>1, 'div'=>1, 'fieldset'=>1, 'figure'=>1, 'footer'=>1, 'form'=>1, 'ins'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 
'h5'=>1, 'h6'=>1, 'header'=>1, 'hr'=>1, 'isindex'=>1, 'main'=>1, 'menu'=>1, 'nav'=>1, 'noscript'=>1, 'ol'=>1, 'p'=>1, 'pre'=>1, 'section'=>1, 'style'=>1, 'table'=>1, 'ul'=>1); +$eI = array('#pcdata'=>1, 'a'=>1, 'abbr'=>1, 'acronym'=>1, 'applet'=>1, 'audio'=>1, 'b'=>1, 'bdi'=>1, 'bdo'=>1, 'big'=>1, 'br'=>1, 'button'=>1, 'canvas'=>1, 'cite'=>1, 'code'=>1, 'command'=>1, 'data'=>1, 'datalist'=>1, 'del'=>1, 'dfn'=>1, 'em'=>1, 'embed'=>1, 'figcaption'=>1, 'font'=>1, 'i'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'ins'=>1, 'kbd'=>1, 'label'=>1, 'link'=>1, 'map'=>1, 'mark'=>1, 'meta'=>1, 'meter'=>1, 'object'=>1, 'output'=>1, 'progress'=>1, 'q'=>1, 'ruby'=>1, 's'=>1, 'samp'=>1, 'select'=>1, 'script'=>1, 'small'=>1, 'span'=>1, 'strike'=>1, 'strong'=>1, 'sub'=>1, 'summary'=>1, 'sup'=>1, 'textarea'=>1, 'time'=>1, 'tt'=>1, 'u'=>1, 'var'=>1, 'video'=>1, 'wbr'=>1); +$eN = array('a'=>1, 'address'=>1, 'article'=>1, 'aside'=>1, 'big'=>1, 'button'=>1, 'details'=>1, 'embed'=>1, 'fieldset'=>1, 'font'=>1, 'footer'=>1, 'form'=>1, 'header'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'keygen'=>1, 'label'=>1, 'meter'=>1, 'nav'=>1, 'object'=>1, 'progress'=>1, 'ruby'=>1, 'script'=>1, 'select'=>1, 'small'=>1, 'sub'=>1, 'sup'=>1, 'textarea'=>1, 'time'=>1); // Exclude from specific ele; $cN values +$eO = array('area'=>1, 'caption'=>1, 'col'=>1, 'colgroup'=>1, 'command'=>1, 'dd'=>1, 'dt'=>1, 'hgroup'=>1, 'keygen'=>1, 'legend'=>1, 'li'=>1, 'optgroup'=>1, 'option'=>1, 'param'=>1, 'rb'=>1, 'rbc'=>1, 'rp'=>1, 'rt'=>1, 'rtc'=>1, 'script'=>1, 'source'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'thead'=>1, 'th'=>1, 'tr'=>1, 'track'=>1); // Missing in $eB & $eI $eF = $eB + $eI; // $in sets allowed child @@ -223,7 +222,7 @@ for($i=-1, $ci=count($t); ++$i<$ci;){ if(isset($cE[$e]) or !in_array($e, $q)){continue;} // Empty/unopen if($p == $e){array_pop($q); echo ''; unset($e); continue;} // Last open $add = ''; // Nesting - close open tags that need to be - for($j=-1, $cj=count($q); ++$j<$cj;){ + for($j=-1, $cj=count($q); 
++$j<$cj;){ if(($d = array_pop($q)) == $e){break;} else{$add .= "";} } @@ -304,7 +303,6 @@ while(!empty($q) && ($e = array_pop($q))){echo '';} $o = ob_get_contents(); ob_end_clean(); return $o; -// eof } public static function hl_cmtcd($t){ @@ -313,13 +311,12 @@ $t = $t[0]; global $C; if(!($v = $C[$n = $t[3] == '-' ? 'comment' : 'cdata'])){return $t;} if($v == 1){return '';} -if($n == 'comment'){ +if($n == 'comment' && $v < 4){ if(substr(($t = preg_replace('`--+`', '-', substr($t, 4, -3))), -1) != ' '){$t .= ' ';} } else{$t = substr($t, 1, -1);} $t = $v == 2 ? str_replace(array('&', '<', '>'), array('&', '<', '>'), $t) : $t; return str_replace(array('&', '<', '>'), array("\x03", "\x04", "\x05"), ($n == 'comment' ? "\x01\x02\x04!--$t--\x05\x02\x01" : "\x01\x01\x04$t\x05\x01\x01")); -// eof } public static function hl_ent($t){ @@ -335,7 +332,6 @@ if(($n = ctype_digit($t = substr($t, 1)) ? intval($t) : hexdec(substr($t, 1))) < return ($C['and_mark'] ? "\x06" : '&'). "amp;#{$t};"; } return ($C['and_mark'] ? "\x06" : '&'). '#'. (((ctype_digit($t) && $C['hexdec_entity'] < 2) or !$C['hexdec_entity']) ? $n : 'x'. dechex($n)). ';'; -// eof } public static function hl_prot($p, $c=null){ @@ -368,29 +364,36 @@ if($C['abs_url']){ } } return "{$b}{$p}{$a}"; -// eof } public static function hl_regex($p){ -// ?regex +// check regex if(empty($p)){return 0;} -if($t = ini_get('track_errors')){$o = isset($php_errormsg) ? $php_errormsg : null;} -else{ini_set('track_errors', 1);} -unset($php_errormsg); +if($v = function_exists('error_clear_last') && function_exists('error_get_last')){error_clear_last();} +else{ + if($t = ini_get('track_errors')){$o = isset($php_errormsg) ? $php_errormsg : null;} + else{ini_set('track_errors', 1);} + unset($php_errormsg); +} if(($d = ini_get('display_errors'))){ini_set('display_errors', 0);} preg_match($p, ''); +if($v){$r = error_get_last() == null ? 1 : 0; } +else{ + $r = isset($php_errormsg) ? 0 : 1; + if($t){$php_errormsg = isset($o) ? 
$o : null;} + else{ini_set('track_errors', 0);} +} if($d){ini_set('display_errors', 1);} -$r = isset($php_errormsg) ? 0 : 1; -if($t){$php_errormsg = isset($o) ? $o : null;} -else{ini_set('track_errors', 0);} return $r; -// eof } public static function hl_spec($t){ // final $spec $s = array(); -$t = str_replace(array("\t", "\r", "\n", ' '), '', preg_replace_callback('/"(?>(`.|[^"])*)"/sm', create_function('$m', 'return substr(str_replace(array(";", "|", "~", " ", ",", "/", "(", ")", \'`"\'), array("\x01", "\x02", "\x03", "\x04", "\x05", "\x06", "\x07", "\x08", "\""), $m[0]), 1, -1);'), trim($t))); +if(!function_exists('hl_aux1')){function hl_aux1($m){ + return substr(str_replace(array(";", "|", "~", " ", ",", "/", "(", ")", '`"'), array("\x01", "\x02", "\x03", "\x04", "\x05", "\x06", "\x07", "\x08", '"'), $m[0]), 1, -1); +}} +$t = str_replace(array("\t", "\r", "\n", ' '), '', preg_replace_callback('/"(?>(`.|[^"])*)"/sm', 'hl_aux1', trim($t))); for($i = count(($t = explode(';', $t))); --$i>=0;){ $w = $t[$i]; if(empty($w) or ($e = strpos($w, '=')) === false or !strlen(($a = substr($w, $e+1)))){continue;} @@ -410,12 +413,11 @@ for($i = count(($t = explode(';', $t))); --$i>=0;){ if(!count($y) && !count($n)){continue;} foreach(explode(',', substr($w, 0, $e)) as $v){ if(!strlen(($v = strtolower($v)))){continue;} - if(count($y)){$s[$v] = $y;} - if(count($n)){$s[$v]['n'] = $n;} + if(count($y)){if(!isset($s[$v])){$s[$v] = $y;} else{$s[$v] = array_merge($s[$v], $y);}} + if(count($n)){if(!isset($s[$v]['n'])){$s[$v]['n'] = $n;} else{$s[$v]['n'] = array_merge($s[$v]['n'], $n);}} } } return $s; -// eof } public static function hl_tag($t){ @@ -433,35 +435,37 @@ if(!preg_match('`^<(/?)([a-zA-Z][a-zA-Z1-6]*)([^>]*?)\s?>$`m', $t, $m)){ // attr string $a = str_replace(array("\n", "\r", "\t"), ' ', trim($m[3])); // tag transform -static $eD = array('applet'=>1, 'center'=>1, 'dir'=>1, 'embed'=>1, 'font'=>1, 'isindex'=>1, 'menu'=>1, 's'=>1, 'strike'=>1, 'u'=>1); // Deprecated +static $eD 
= array('acronym'=>1, 'applet'=>1, 'big'=>1, 'center'=>1, 'dir'=>1, 'font'=>1, 'isindex'=>1, 's'=>1, 'strike'=>1, 'tt'=>1); // Deprecated if($C['make_tag_strict'] && isset($eD[$e])){ $trt = htmLawed::hl_tag2($e, $a, $C['make_tag_strict']); if(!$e){return (($C['keep_bad']%2) ? str_replace(array('<', '>'), array('<', '>'), $t) : '');} } // close tag -static $eE = array('area'=>1, 'br'=>1, 'col'=>1, 'embed'=>1, 'hr'=>1, 'img'=>1, 'input'=>1, 'isindex'=>1, 'param'=>1); // Empty ele +static $eE = array('area'=>1, 'br'=>1, 'col'=>1, 'command'=>1, 'embed'=>1, 'hr'=>1, 'img'=>1, 'input'=>1, 'isindex'=>1, 'keygen'=>1, 'link'=>1, 'meta'=>1, 'param'=>1, 'source'=>1, 'track'=>1, 'wbr'=>1); // Empty ele if(!empty($m[1])){ return (!isset($eE[$e]) ? (empty($C['hook_tag']) ? "" : $C['hook_tag']($e)) : (($C['keep_bad'])%2 ? str_replace(array('<', '>'), array('<', '>'), $t) : '')); } // open tag & attr -static $aN = array('abbr'=>array('td'=>1, 'th'=>1), 'accept-charset'=>array('form'=>1), 'accept'=>array('form'=>1, 'input'=>1), 'accesskey'=>array('a'=>1, 'area'=>1, 'button'=>1, 'input'=>1, 'label'=>1, 'legend'=>1, 'textarea'=>1), 'action'=>array('form'=>1), 'align'=>array('caption'=>1, 'embed'=>1, 'applet'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'object'=>1, 'legend'=>1, 'table'=>1, 'hr'=>1, 'div'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'p'=>1, 'col'=>1, 'colgroup'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'allowfullscreen'=>array('iframe'=>1), 'alt'=>array('applet'=>1, 'area'=>1, 'img'=>1, 'input'=>1), 'archive'=>array('applet'=>1, 'object'=>1), 'axis'=>array('td'=>1, 'th'=>1), 'bgcolor'=>array('embed'=>1, 'table'=>1, 'tr'=>1, 'td'=>1, 'th'=>1), 'border'=>array('table'=>1, 'img'=>1, 'object'=>1), 'bordercolor'=>array('table'=>1, 'td'=>1, 'tr'=>1), 'cellpadding'=>array('table'=>1), 'cellspacing'=>array('table'=>1), 'char'=>array('col'=>1, 'colgroup'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 
'charoff'=>array('col'=>1, 'colgroup'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'charset'=>array('a'=>1, 'script'=>1), 'checked'=>array('input'=>1), 'cite'=>array('blockquote'=>1, 'q'=>1, 'del'=>1, 'ins'=>1), 'classid'=>array('object'=>1), 'clear'=>array('br'=>1), 'code'=>array('applet'=>1), 'codebase'=>array('object'=>1, 'applet'=>1), 'codetype'=>array('object'=>1), 'color'=>array('font'=>1), 'cols'=>array('textarea'=>1), 'colspan'=>array('td'=>1, 'th'=>1), 'compact'=>array('dir'=>1, 'dl'=>1, 'menu'=>1, 'ol'=>1, 'ul'=>1), 'coords'=>array('area'=>1, 'a'=>1), 'data'=>array('object'=>1), 'datetime'=>array('del'=>1, 'ins'=>1), 'declare'=>array('object'=>1), 'defer'=>array('script'=>1), 'dir'=>array('bdo'=>1), 'disabled'=>array('button'=>1, 'input'=>1, 'optgroup'=>1, 'option'=>1, 'select'=>1, 'textarea'=>1), 'enctype'=>array('form'=>1), 'face'=>array('font'=>1), 'for'=>array('label'=>1), 'frame'=>array('table'=>1), 'frameborder'=>array('iframe'=>1), 'headers'=>array('td'=>1, 'th'=>1), 'height'=>array('embed'=>1, 'iframe'=>1, 'td'=>1, 'th'=>1, 'img'=>1, 'object'=>1, 'applet'=>1), 'href'=>array('a'=>1, 'area'=>1), 'hreflang'=>array('a'=>1), 'hspace'=>array('applet'=>1, 'img'=>1, 'object'=>1), 'ismap'=>array('img'=>1, 'input'=>1), 'label'=>array('option'=>1, 'optgroup'=>1), 'language'=>array('script'=>1), 'longdesc'=>array('img'=>1, 'iframe'=>1), 'marginheight'=>array('iframe'=>1), 'marginwidth'=>array('iframe'=>1), 'maxlength'=>array('input'=>1), 'method'=>array('form'=>1), 'model'=>array('embed'=>1), 'multiple'=>array('select'=>1), 'name'=>array('button'=>1, 'embed'=>1, 'textarea'=>1, 'applet'=>1, 'select'=>1, 'form'=>1, 'iframe'=>1, 'img'=>1, 'a'=>1, 'input'=>1, 'object'=>1, 'map'=>1, 'param'=>1), 'nohref'=>array('area'=>1), 'noshade'=>array('hr'=>1), 'nowrap'=>array('td'=>1, 'th'=>1), 'object'=>array('applet'=>1), 'onblur'=>array('a'=>1, 'area'=>1, 'button'=>1, 'input'=>1, 'label'=>1, 'select'=>1, 'textarea'=>1), 'onchange'=>array('input'=>1, 
'select'=>1, 'textarea'=>1), 'onfocus'=>array('a'=>1, 'area'=>1, 'button'=>1, 'input'=>1, 'label'=>1, 'select'=>1, 'textarea'=>1), 'onreset'=>array('form'=>1), 'onselect'=>array('input'=>1, 'textarea'=>1), 'onsubmit'=>array('form'=>1), 'pluginspage'=>array('embed'=>1), 'pluginurl'=>array('embed'=>1), 'prompt'=>array('isindex'=>1), 'readonly'=>array('textarea'=>1, 'input'=>1), 'rel'=>array('a'=>1), 'rev'=>array('a'=>1), 'rows'=>array('textarea'=>1), 'rowspan'=>array('td'=>1, 'th'=>1), 'rules'=>array('table'=>1), 'scope'=>array('td'=>1, 'th'=>1), 'scrolling'=>array('iframe'=>1), 'selected'=>array('option'=>1), 'shape'=>array('area'=>1, 'a'=>1), 'size'=>array('hr'=>1, 'font'=>1, 'input'=>1, 'select'=>1), 'span'=>array('col'=>1, 'colgroup'=>1), 'src'=>array('embed'=>1, 'script'=>1, 'input'=>1, 'iframe'=>1, 'img'=>1), 'standby'=>array('object'=>1), 'start'=>array('ol'=>1), 'summary'=>array('table'=>1), 'tabindex'=>array('a'=>1, 'area'=>1, 'button'=>1, 'input'=>1, 'object'=>1, 'select'=>1, 'textarea'=>1), 'target'=>array('a'=>1, 'area'=>1, 'form'=>1), 'type'=>array('a'=>1, 'embed'=>1, 'object'=>1, 'param'=>1, 'script'=>1, 'input'=>1, 'li'=>1, 'ol'=>1, 'ul'=>1, 'button'=>1), 'usemap'=>array('img'=>1, 'input'=>1, 'object'=>1), 'valign'=>array('col'=>1, 'colgroup'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'value'=>array('input'=>1, 'option'=>1, 'param'=>1, 'button'=>1, 'li'=>1), 'valuetype'=>array('param'=>1), 'vspace'=>array('applet'=>1, 'img'=>1, 'object'=>1), 'width'=>array('embed'=>1, 'hr'=>1, 'iframe'=>1, 'img'=>1, 'object'=>1, 'table'=>1, 'td'=>1, 'th'=>1, 'applet'=>1, 'col'=>1, 'colgroup'=>1, 'pre'=>1), 'wmode'=>array('embed'=>1), 'xml:space'=>array('pre'=>1, 'script'=>1, 'style'=>1)); // Ele-specific -static $aNE = array('checked'=>1, 'compact'=>1, 'declare'=>1, 'defer'=>1, 'disabled'=>1, 'ismap'=>1, 'multiple'=>1, 'nohref'=>1, 'noresize'=>1, 'noshade'=>1, 'nowrap'=>1, 'readonly'=>1, 'selected'=>1); // Empty -static $aNP = 
array('action'=>1, 'cite'=>1, 'classid'=>1, 'codebase'=>1, 'data'=>1, 'href'=>1, 'longdesc'=>1, 'model'=>1, 'pluginspage'=>1, 'pluginurl'=>1, 'usemap'=>1); // Need scheme check; excludes style, on* & src -static $aNU = array('class'=>array('param'=>1, 'script'=>1), 'dir'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'iframe'=>1, 'param'=>1, 'script'=>1), 'id'=>array('script'=>1), 'lang'=>array('applet'=>1, 'br'=>1, 'iframe'=>1, 'param'=>1, 'script'=>1), 'xml:lang'=>array('applet'=>1, 'br'=>1, 'iframe'=>1, 'param'=>1, 'script'=>1), 'onclick'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'ondblclick'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onkeydown'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onkeypress'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onkeyup'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onmousedown'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onmousemove'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onmouseout'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onmouseover'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onmouseup'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'style'=>array('param'=>1, 'script'=>1), 'title'=>array('param'=>1, 'script'=>1)); // Univ & exceptions +static $aN = array('abbr'=>array('td'=>1, 'th'=>1), 'accept'=>array('form'=>1, 'input'=>1), 'accept-charset'=>array('form'=>1), 'action'=>array('form'=>1), 'align'=>array('applet'=>1, 'caption'=>1, 'col'=>1, 
'colgroup'=>1, 'div'=>1, 'embed'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'hr'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'legend'=>1, 'object'=>1, 'p'=>1, 'table'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'allowfullscreen'=>array('iframe'=>1), 'alt'=>array('applet'=>1, 'area'=>1, 'img'=>1, 'input'=>1), 'archive'=>array('applet'=>1, 'object'=>1), 'async'=>array('script'=>1), 'autocomplete'=>array('form'=>1, 'input'=>1), 'autofocus'=>array('button'=>1, 'input'=>1, 'keygen'=>1, 'select'=>1, 'textarea'=>1), 'autoplay'=>array('audio'=>1, 'video'=>1), 'axis'=>array('td'=>1, 'th'=>1), 'bgcolor'=>array('embed'=>1, 'table'=>1, 'td'=>1, 'th'=>1, 'tr'=>1), 'border'=>array('img'=>1, 'object'=>1, 'table'=>1), 'bordercolor'=>array('table'=>1, 'td'=>1, 'tr'=>1), 'cellpadding'=>array('table'=>1), 'cellspacing'=>array('table'=>1), 'challenge'=>array('keygen'=>1), 'char'=>array('col'=>1, 'colgroup'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'charoff'=>array('col'=>1, 'colgroup'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'charset'=>array('a'=>1, 'script'=>1), 'checked'=>array('command'=>1, 'input'=>1), 'cite'=>array('blockquote'=>1, 'del'=>1, 'ins'=>1, 'q'=>1), 'classid'=>array('object'=>1), 'clear'=>array('br'=>1), 'code'=>array('applet'=>1), 'codebase'=>array('applet'=>1, 'object'=>1), 'codetype'=>array('object'=>1), 'color'=>array('font'=>1), 'cols'=>array('textarea'=>1), 'colspan'=>array('td'=>1, 'th'=>1), 'compact'=>array('dir'=>1, 'dl'=>1, 'menu'=>1, 'ol'=>1, 'ul'=>1), 'content'=>array('meta'=>1), 'controls'=>array('audio'=>1, 'video'=>1), 'coords'=>array('a'=>1, 'area'=>1), 'crossorigin'=>array('img'=>1), 'data'=>array('object'=>1), 'datetime'=>array('del'=>1, 'ins'=>1, 'time'=>1), 'declare'=>array('object'=>1), 'default'=>array('track'=>1), 'defer'=>array('script'=>1), 'dirname'=>array('input'=>1, 'textarea'=>1), 'disabled'=>array('button'=>1, 'command'=>1, 'fieldset'=>1, 'input'=>1, 
'keygen'=>1, 'optgroup'=>1, 'option'=>1, 'select'=>1, 'textarea'=>1), 'download'=>array('a'=>1), 'enctype'=>array('form'=>1), 'face'=>array('font'=>1), 'flashvars'=>array('embed'=>1), 'for'=>array('label'=>1, 'output'=>1), 'form'=>array('button'=>1, 'fieldset'=>1, 'input'=>1, 'keygen'=>1, 'label'=>1, 'object'=>1, 'output'=>1, 'select'=>1, 'textarea'=>1), 'formaction'=>array('button'=>1, 'input'=>1), 'formenctype'=>array('button'=>1, 'input'=>1), 'formmethod'=>array('button'=>1, 'input'=>1), 'formnovalidate'=>array('button'=>1, 'input'=>1), 'formtarget'=>array('button'=>1, 'input'=>1), 'frame'=>array('table'=>1), 'frameborder'=>array('iframe'=>1), 'headers'=>array('td'=>1, 'th'=>1), 'height'=>array('applet'=>1, 'canvas'=>1, 'embed'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'object'=>1, 'td'=>1, 'th'=>1, 'video'=>1), 'high'=>array('meter'=>1), 'href'=>array('a'=>1, 'area'=>1, 'link'=>1), 'hreflang'=>array('a'=>1, 'area'=>1, 'link'=>1), 'hspace'=>array('applet'=>1, 'embed'=>1, 'img'=>1, 'object'=>1), 'icon'=>array('command'=>1), 'ismap'=>array('img'=>1, 'input'=>1), 'keyparams'=>array('keygen'=>1), 'keytype'=>array('keygen'=>1), 'kind'=>array('track'=>1), 'label'=>array('command'=>1, 'menu'=>1, 'option'=>1, 'optgroup'=>1, 'track'=>1), 'language'=>array('script'=>1), 'list'=>array('input'=>1), 'longdesc'=>array('img'=>1, 'iframe'=>1), 'loop'=>array('audio'=>1, 'video'=>1), 'low'=>array('meter'=>1), 'marginheight'=>array('iframe'=>1), 'marginwidth'=>array('iframe'=>1), 'max'=>array('input'=>1, 'meter'=>1, 'progress'=>1), 'maxlength'=>array('input'=>1, 'textarea'=>1), 'media'=>array('a'=>1, 'area'=>1, 'link'=>1, 'source'=>1, 'style'=>1), 'mediagroup'=>array('audio'=>1, 'video'=>1), 'method'=>array('form'=>1), 'min'=>array('input'=>1, 'meter'=>1), 'model'=>array('embed'=>1), 'multiple'=>array('input'=>1, 'select'=>1), 'muted'=>array('audio'=>1, 'video'=>1), 'name'=>array('a'=>1, 'applet'=>1, 'button'=>1, 'embed'=>1, 'fieldset'=>1, 'form'=>1, 'iframe'=>1, 'img'=>1, 
'input'=>1, 'keygen'=>1, 'map'=>1, 'object'=>1, 'output'=>1, 'param'=>1, 'select'=>1, 'textarea'=>1), 'nohref'=>array('area'=>1), 'noshade'=>array('hr'=>1), 'novalidate'=>array('form'=>1), 'nowrap'=>array('td'=>1, 'th'=>1), 'object'=>array('applet'=>1), 'open'=>array('details'=>1), 'optimum'=>array('meter'=>1), 'pattern'=>array('input'=>1), 'ping'=>array('a'=>1, 'area'=>1), 'placeholder'=>array('input'=>1, 'textarea'=>1), 'pluginspage'=>array('embed'=>1), 'pluginurl'=>array('embed'=>1), 'poster'=>array('video'=>1), 'pqg'=>array('keygen'=>1), 'preload'=>array('audio'=>1, 'video'=>1), 'prompt'=>array('isindex'=>1), 'pubdate'=>array('time'=>1), 'radiogroup'=>array('command'=>1), 'readonly'=>array('input'=>1, 'textarea'=>1), 'rel'=>array('a'=>1, 'area'=>1, 'link'=>1), 'required'=>array('input'=>1, 'select'=>1, 'textarea'=>1), 'rev'=>array('a'=>1), 'reversed'=>array('ol'=>1), 'rows'=>array('textarea'=>1), 'rowspan'=>array('td'=>1, 'th'=>1), 'rules'=>array('table'=>1), 'sandbox'=>array('iframe'=>1), 'scope'=>array('td'=>1, 'th'=>1), 'scoped'=>array('style'=>1), 'scrolling'=>array('iframe'=>1), 'seamless'=>array('iframe'=>1), 'selected'=>array('option'=>1), 'shape'=>array('a'=>1, 'area'=>1), 'size'=>array('font'=>1, 'hr'=>1, 'input'=>1, 'select'=>1), 'sizes'=>array('link'=>1), 'span'=>array('col'=>1, 'colgroup'=>1), 'src'=>array('audio'=>1, 'embed'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'script'=>1, 'source'=>1, 'track'=>1, 'video'=>1), 'srcdoc'=>array('iframe'=>1), 'srclang'=>array('track'=>1), 'srcset'=>array('img'=>1), 'standby'=>array('object'=>1), 'start'=>array('ol'=>1), 'step'=>array('input'=>1), 'summary'=>array('table'=>1), 'target'=>array('a'=>1, 'area'=>1, 'form'=>1), 'type'=>array('a'=>1, 'area'=>1, 'button'=>1, 'command'=>1, 'embed'=>1, 'input'=>1, 'li'=>1, 'link'=>1, 'menu'=>1, 'object'=>1, 'ol'=>1, 'param'=>1, 'script'=>1, 'source'=>1, 'style'=>1, 'ul'=>1), 'typemustmatch'=>array('object'=>1), 'usemap'=>array('img'=>1, 'input'=>1, 'object'=>1), 
'valign'=>array('col'=>1, 'colgroup'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'value'=>array('button'=>1, 'data'=>1, 'input'=>1, 'li'=>1, 'meter'=>1, 'option'=>1, 'param'=>1, 'progress'=>1), 'valuetype'=>array('param'=>1), 'vspace'=>array('applet'=>1, 'embed'=>1, 'img'=>1, 'object'=>1), 'width'=>array('applet'=>1, 'canvas'=>1, 'col'=>1, 'colgroup'=>1, 'embed'=>1, 'hr'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'object'=>1, 'pre'=>1, 'table'=>1, 'td'=>1, 'th'=>1, 'video'=>1), 'wmode'=>array('embed'=>1), 'wrap'=>array('textarea'=>1)); // Ele-specific +static $aNA = array('aria-activedescendant'=>1, 'aria-atomic'=>1, 'aria-autocomplete'=>1, 'aria-busy'=>1, 'aria-checked'=>1, 'aria-controls'=>1, 'aria-describedby'=>1, 'aria-disabled'=>1, 'aria-dropeffect'=>1, 'aria-expanded'=>1, 'aria-flowto'=>1, 'aria-grabbed'=>1, 'aria-haspopup'=>1, 'aria-hidden'=>1, 'aria-invalid'=>1, 'aria-label'=>1, 'aria-labelledby'=>1, 'aria-level'=>1, 'aria-live'=>1, 'aria-multiline'=>1, 'aria-multiselectable'=>1, 'aria-orientation'=>1, 'aria-owns'=>1, 'aria-posinset'=>1, 'aria-pressed'=>1, 'aria-readonly'=>1, 'aria-relevant'=>1, 'aria-required'=>1, 'aria-selected'=>1, 'aria-setsize'=>1, 'aria-sort'=>1, 'aria-valuemax'=>1, 'aria-valuemin'=>1, 'aria-valuenow'=>1, 'aria-valuetext'=>1); // ARIA +static $aNE = array('allowfullscreen'=>1, 'checkbox'=>1, 'checked'=>1, 'command'=>1, 'compact'=>1, 'declare'=>1, 'defer'=>1, 'default'=>1, 'disabled'=>1, 'hidden'=>1, 'inert'=>1, 'ismap'=>1, 'itemscope'=>1, 'multiple'=>1, 'nohref'=>1, 'noresize'=>1, 'noshade'=>1, 'nowrap'=>1, 'open'=>1, 'radio'=>1, 'readonly'=>1, 'required'=>1, 'reversed'=>1, 'selected'=>1); // Empty +static $aNO = array('onabort'=>1, 'onblur'=>1, 'oncanplay'=>1, 'oncanplaythrough'=>1, 'onchange'=>1, 'onclick'=>1, 'oncontextmenu'=>1, 'oncopy'=>1, 'oncuechange'=>1, 'oncut'=>1, 'ondblclick'=>1, 'ondrag'=>1, 'ondragend'=>1, 'ondragenter'=>1, 'ondragleave'=>1, 'ondragover'=>1, 'ondragstart'=>1, 'ondrop'=>1, 
'ondurationchange'=>1, 'onemptied'=>1, 'onended'=>1, 'onerror'=>1, 'onfocus'=>1, 'onformchange'=>1, 'onforminput'=>1, 'oninput'=>1, 'oninvalid'=>1, 'onkeydown'=>1, 'onkeypress'=>1, 'onkeyup'=>1, 'onload'=>1, 'onloadeddata'=>1, 'onloadedmetadata'=>1, 'onloadstart'=>1, 'onlostpointercapture'=>1, 'onmousedown'=>1, 'onmousemove'=>1, 'onmouseout'=>1, 'onmouseover'=>1, 'onmouseup'=>1, 'onmousewheel'=>1, 'onpaste'=>1, 'onpause'=>1, 'onplay'=>1, 'onplaying'=>1, 'onpointercancel'=>1, 'ongotpointercapture'=>1, 'onpointerdown'=>1, 'onpointerenter'=>1, 'onpointerleave'=>1, 'onpointermove'=>1, 'onpointerout'=>1, 'onpointerover'=>1, 'onpointerup'=>1, 'onprogress'=>1, 'onratechange'=>1, 'onreadystatechange'=>1, 'onreset'=>1, 'onsearch'=>1, 'onscroll'=>1, 'onseeked'=>1, 'onseeking'=>1, 'onselect'=>1, 'onshow'=>1, 'onstalled'=>1, 'onsubmit'=>1, 'onsuspend'=>1, 'ontimeupdate'=>1, 'ontoggle'=>1, 'ontouchcancel'=>1, 'ontouchend'=>1, 'ontouchmove'=>1, 'ontouchstart'=>1, 'onvolumechange'=>1, 'onwaiting'=>1, 'onwheel'=>1); // Event +static $aNP = array('action'=>1, 'cite'=>1, 'classid'=>1, 'codebase'=>1, 'data'=>1, 'href'=>1, 'itemtype'=>1, 'longdesc'=>1, 'model'=>1, 'pluginspage'=>1, 'pluginurl'=>1, 'src'=>1, 'srcset'=>1, 'usemap'=>1); // Need scheme check; excludes style, on* +static $aNU = array('accesskey'=>1, 'class'=>1, 'contenteditable'=>1, 'contextmenu'=>1, 'dir'=>1, 'draggable'=>1, 'dropzone'=>1, 'hidden'=>1, 'id'=>1, 'inert'=>1, 'itemid'=>1, 'itemprop'=>1, 'itemref'=>1, 'itemscope'=>1, 'itemtype'=>1, 'lang'=>1, 'role'=>1, 'spellcheck'=>1, 'style'=>1, 'tabindex'=>1, 'title'=>1, 'translate'=>1, 'xmlns'=>1, 'xml:base'=>1, 'xml:lang'=>1, 'xml:space'=>1); // Univ; excludes on*, aria* if($C['lc_std_val']){ // predef attr vals for $eAL & $aNE ele - static $aNL = array('all'=>1, 'baseline'=>1, 'bottom'=>1, 'button'=>1, 'center'=>1, 'char'=>1, 'checkbox'=>1, 'circle'=>1, 'col'=>1, 'colgroup'=>1, 'cols'=>1, 'data'=>1, 'default'=>1, 'file'=>1, 'get'=>1, 'groups'=>1, 'hidden'=>1, 
'image'=>1, 'justify'=>1, 'left'=>1, 'ltr'=>1, 'middle'=>1, 'none'=>1, 'object'=>1, 'password'=>1, 'poly'=>1, 'post'=>1, 'preserve'=>1, 'radio'=>1, 'rect'=>1, 'ref'=>1, 'reset'=>1, 'right'=>1, 'row'=>1, 'rowgroup'=>1, 'rows'=>1, 'rtl'=>1, 'submit'=>1, 'text'=>1, 'top'=>1); - static $eAL = array('a'=>1, 'area'=>1, 'bdo'=>1, 'button'=>1, 'col'=>1, 'form'=>1, 'img'=>1, 'input'=>1, 'object'=>1, 'optgroup'=>1, 'option'=>1, 'param'=>1, 'script'=>1, 'select'=>1, 'table'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1, 'xml:space'=>1); + static $aNL = array('all'=>1, 'auto'=>1, 'baseline'=>1, 'bottom'=>1, 'button'=>1, 'captions'=>1, 'center'=>1, 'chapters'=>1, 'char'=>1, 'checkbox'=>1, 'circle'=>1, 'col'=>1, 'colgroup'=>1, 'color'=>1, 'cols'=>1, 'data'=>1, 'date'=>1, 'datetime'=>1, 'datetime-local'=>1, 'default'=>1, 'descriptions'=>1, 'email'=>1, 'file'=>1, 'get'=>1, 'groups'=>1, 'hidden'=>1, 'image'=>1, 'justify'=>1, 'left'=>1, 'ltr'=>1, 'metadata'=>1, 'middle'=>1, 'month'=>1, 'none'=>1, 'number'=>1, 'object'=>1, 'password'=>1, 'poly'=>1, 'post'=>1, 'preserve'=>1, 'radio'=>1, 'range'=>1, 'rect'=>1, 'ref'=>1, 'reset'=>1, 'right'=>1, 'row'=>1, 'rowgroup'=>1, 'rows'=>1, 'rtl'=>1, 'search'=>1, 'submit'=>1, 'subtitles'=>1, 'tel'=>1, 'text'=>1, 'time'=>1, 'top'=>1, 'url'=>1, 'week'=>1); + static $eAL = array('a'=>1, 'area'=>1, 'bdo'=>1, 'button'=>1, 'col'=>1, 'fieldset'=>1, 'form'=>1, 'img'=>1, 'input'=>1, 'object'=>1, 'ol'=>1, 'optgroup'=>1, 'option'=>1, 'param'=>1, 'script'=>1, 'select'=>1, 'table'=>1, 'td'=>1, 'textarea'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1, 'track'=>1, 'xml:space'=>1); $lcase = isset($eAL[$e]) ? 
1 : 0; } $depTr = 0; if($C['no_deprecated_attr']){ - // dep attr:applicable ele - static $aND = array('align'=>array('caption'=>1, 'div'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'hr'=>1, 'img'=>1, 'input'=>1, 'legend'=>1, 'object'=>1, 'p'=>1, 'table'=>1), 'bgcolor'=>array('table'=>1, 'td'=>1, 'th'=>1, 'tr'=>1), 'border'=>array('img'=>1, 'object'=>1), 'bordercolor'=>array('table'=>1, 'td'=>1, 'tr'=>1), 'clear'=>array('br'=>1), 'compact'=>array('dl'=>1, 'ol'=>1, 'ul'=>1), 'height'=>array('td'=>1, 'th'=>1), 'hspace'=>array('img'=>1, 'object'=>1), 'language'=>array('script'=>1), 'name'=>array('a'=>1, 'form'=>1, 'iframe'=>1, 'img'=>1, 'map'=>1), 'noshade'=>array('hr'=>1), 'nowrap'=>array('td'=>1, 'th'=>1), 'size'=>array('hr'=>1), 'start'=>array('ol'=>1), 'type'=>array('li'=>1, 'ol'=>1, 'ul'=>1), 'value'=>array('li'=>1), 'vspace'=>array('img'=>1, 'object'=>1), 'width'=>array('hr'=>1, 'pre'=>1, 'td'=>1, 'th'=>1)); - static $eAD = array('a'=>1, 'br'=>1, 'caption'=>1, 'div'=>1, 'dl'=>1, 'form'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'hr'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'legend'=>1, 'li'=>1, 'map'=>1, 'object'=>1, 'ol'=>1, 'p'=>1, 'pre'=>1, 'script'=>1, 'table'=>1, 'td'=>1, 'th'=>1, 'tr'=>1, 'ul'=>1); + // depr attr:applicable ele + static $aND = array('align'=>array('caption'=>1, 'div'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'hr'=>1, 'img'=>1, 'input'=>1, 'legend'=>1, 'object'=>1, 'p'=>1, 'table'=>1), 'bgcolor'=>array('table'=>1, 'td'=>1, 'th'=>1, 'tr'=>1), 'border'=>array('object'=>1), 'bordercolor'=>array('table'=>1, 'td'=>1, 'tr'=>1), 'cellspacing'=>array('table'=>1), 'clear'=>array('br'=>1), 'compact'=>array('dl'=>1, 'ol'=>1, 'ul'=>1), 'height'=>array('td'=>1, 'th'=>1), 'hspace'=>array('img'=>1, 'object'=>1), 'language'=>array('script'=>1), 'name'=>array('a'=>1, 'form'=>1, 'iframe'=>1, 'img'=>1, 'map'=>1), 'noshade'=>array('hr'=>1), 'nowrap'=>array('td'=>1, 'th'=>1), 'size'=>array('hr'=>1), 
'vspace'=>array('img'=>1, 'object'=>1), 'width'=>array('hr'=>1, 'pre'=>1, 'table'=>1, 'td'=>1, 'th'=>1)); + static $eAD = array('a'=>1, 'br'=>1, 'caption'=>1, 'div'=>1, 'dl'=>1, 'form'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'hr'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'legend'=>1, 'map'=>1, 'object'=>1, 'ol'=>1, 'p'=>1, 'pre'=>1, 'script'=>1, 'table'=>1, 'td'=>1, 'th'=>1, 'tr'=>1, 'ul'=>1); $depTr = isset($eAD[$e]) ? 1 : 0; } @@ -472,7 +476,7 @@ while(strlen($a)){ $w = 0; switch($mode){ case 0: // Name - if(preg_match('`^[a-zA-Z][\-a-zA-Z:]+`', $a, $m)){ + if(preg_match('`^[a-zA-Z][^\s=/]+`', $a, $m)){ $nm = strtolower($m[0]); $w = $mode = 1; $a = ltrim(substr_replace($a, '', 0, strlen($m[0]))); } @@ -500,9 +504,9 @@ if($mode == 1){$aA[$nm] = '';} // clean attrs global $S; $rl = isset($S[$e]) ? $S[$e] : array(); -$a = array(); $nfr = 0; +$a = array(); $nfr = 0; $d = $C['deny_attribute']; foreach($aA as $k=>$v){ - if(((isset($C['deny_attribute']['*']) ? isset($C['deny_attribute'][$k]) : !isset($C['deny_attribute'][$k])) && (isset($aN[$k][$e]) or (isset($aNU[$k]) && !isset($aNU[$k][$e]))) && !isset($rl['n'][$k]) && !isset($rl['n']['*'])) or isset($rl[$k])){ + if(((isset($d['*']) ? isset($d[$k]) : !isset($d[$k])) && (isset($aN[$k][$e]) or isset($aNU[$k]) or (isset($aNO[$k]) && !isset($d['on*'])) or (isset($aNA[$k]) && !isset($d['aria*'])) or (!isset($d['data*']) && preg_match('`data-((?!xml)[^:]+$)`', $k))) && !isset($rl['n'][$k]) && !isset($rl['n']['*'])) or isset($rl[$k])){ if(isset($aNE[$k])){$v = $k;} elseif(!empty($lcase) && (($e != 'button' or $e != 'input') or $k == 'type')){ // Rather loose but ?not cause issues $v = (isset($aNL[($v2 = strtolower($v))])) ? $v2 : $v; @@ -514,9 +518,26 @@ foreach($aA as $k=>$v){ } $v = preg_replace_callback('`(url(?:\()(?: )*(?:\'|"|&(?:quot|apos);)?)(.+?)((?:\'|"|&(?:quot|apos);)?(?: )*(?:\)))`iS', 'htmLawed::hl_prot', $v); $v = !$C['css_expression'] ? 
preg_replace('`expression`i', ' ', preg_replace('`\\\\\S|(/|(%2f))(\*|(%2a))`i', ' ', $v)) : $v; - }elseif(isset($aNP[$k]) or strpos($k, 'src') !== false or $k[0] == 'o'){ - $v = str_replace("­", ' ', (strpos($v, '&') !== false ? str_replace(array('­', '­', '­'), ' ', $v) : $v)); # double-quoted char is soft-hyphen; appears here as "­" or hyphen or something else depending on viewing software - $v = htmLawed::hl_prot($v, $k); + }elseif(isset($aNP[$k]) or isset($aNO[$k])){ + $v = str_replace("­", ' ', (strpos($v, '&') !== false ? str_replace(array('­', '­', '­'), ' ', $v) : $v)); # double-quoted char: soft-hyphen; appears here as "­" or hyphen or something else depending on viewing software + if($k == 'srcset'){ + $v2 = ''; + foreach(explode(',', $v) as $k1=>$v1){ + $v1 = explode(' ', ltrim($v1), 2); + $k1 = isset($v1[1]) ? trim($v1[1]) : ''; + $v1 = trim($v1[0]); + if(isset($v1[0])){$v2 .= htmLawed::hl_prot($v1, $k). (empty($k1) ? '' : ' '. $k1). ', ';} + } + $v = trim($v2, ', '); + } + if($k == 'itemtype'){ + $v2 = ''; + foreach(explode(' ', $v) as $v1){ + if(isset($v1[0])){$v2 .= htmLawed::hl_prot($v1, $k). ' ';} + } + $v = trim($v2, ' '); + } + else{$v = htmLawed::hl_prot($v, $k);} if($k == 'href'){ // X-spam if($C['anti_mail_spam'] && strpos($v, 'mailto:') === 0){ $v = str_replace('@', htmlspecialchars($C['anti_mail_spam']), $v); @@ -541,18 +562,19 @@ foreach($aA as $k=>$v){ if($nfr){$a['rel'] = isset($a['rel']) ? $a['rel']. 
' nofollow' : 'nofollow';} // rqd attr -static $eAR = array('area'=>array('alt'=>'area'), 'bdo'=>array('dir'=>'ltr'), 'form'=>array('action'=>''), 'img'=>array('src'=>'', 'alt'=>'image'), 'map'=>array('name'=>''), 'optgroup'=>array('label'=>''), 'param'=>array('name'=>''), 'script'=>array('type'=>'text/javascript'), 'textarea'=>array('rows'=>'10', 'cols'=>'50')); +static $eAR = array('area'=>array('alt'=>'area'), 'bdo'=>array('dir'=>'ltr'), 'command'=>array('label'=>''), 'form'=>array('action'=>''), 'img'=>array('src'=>'', 'alt'=>'image'), 'map'=>array('name'=>''), 'optgroup'=>array('label'=>''), 'param'=>array('name'=>''), 'style'=>array('scoped'=>''), 'textarea'=>array('rows'=>'10', 'cols'=>'50')); if(isset($eAR[$e])){ foreach($eAR[$e] as $k=>$v){ if(!isset($a[$k])){$a[$k] = isset($v[0]) ? $v : $k;} } } -// depr attrs +// depr attr if($depTr){ $c = array(); foreach($a as $k=>$v){ if($k == 'style' or !isset($aND[$k][$e])){continue;} + $v = str_replace(array('\\', ':', ';', '&#'), '', $v); if($k == 'align'){ unset($a['align']); if($e == 'img' && ($v == 'left' or $v == 'right')){$c[] = 'float: '. $v;} @@ -565,6 +587,8 @@ if($depTr){ unset($a['border']); $c[] = "border: {$v}px"; }elseif($k == 'bordercolor'){ unset($a['bordercolor']); $c[] = 'border-color: '. $v; + }elseif($k == 'cellspacing'){ + unset($a['cellspacing']); $c[] = "border-spacing: {$v}px"; }elseif($k == 'clear'){ unset($a['clear']); $c[] = 'clear: '. ($v != 'all' ? $v : 'both'); }elseif($k == 'compact'){ @@ -578,19 +602,13 @@ if($depTr){ $a['type'] = 'text/'. 
strtolower($v); }elseif($k == 'name'){ if($C['no_deprecated_attr'] == 2 or ($e != 'a' && $e != 'map')){unset($a['name']);} - if(!isset($a['id']) && preg_match('`[a-zA-Z][a-zA-Z\d.:_\-]*`', $v)){$a['id'] = $v;} + if(!isset($a['id']) && !preg_match('`\W`', $v)){$a['id'] = $v;} }elseif($k == 'noshade'){ unset($a['noshade']); $c[] = 'border-style: none; border: 0; background-color: gray; color: gray'; }elseif($k == 'nowrap'){ unset($a['nowrap']); $c[] = 'white-space: nowrap'; }elseif($k == 'size'){ unset($a['size']); $c[] = 'size: '. $v. 'px'; - }elseif($k == 'start' or $k == 'value'){ - unset($a[$k]); - }elseif($k == 'type'){ - unset($a['type']); - static $ol_type = array('i'=>'lower-roman', 'I'=>'upper-roman', 'a'=>'lower-latin', 'A'=>'upper-latin', '1'=>'decimal'); - $c[] = 'list-style-type: '. (isset($ol_type[$v]) ? $ol_type[$v] : 'decimal'); }elseif($k == 'vspace'){ unset($a['vspace']); $c[] = "margin-top: {$v}px; margin-bottom: {$v}px"; } @@ -602,7 +620,7 @@ if($depTr){ } // unique ID if($C['unique_ids'] && isset($a['id'])){ - if(!preg_match('`^[A-Za-z][A-Za-z0-9_\-.:]*$`', ($id = $a['id'])) or (isset($GLOBALS['hl_Ids'][$id]) && $C['unique_ids'] == 1)){unset($a['id']); + if(preg_match('`\s`', ($id = $a['id'])) or (isset($GLOBALS['hl_Ids'][$id]) && $C['unique_ids'] == 1)){unset($a['id']); }else{ while(isset($GLOBALS['hl_Ids'][$id])){$id = $C['unique_ids']. $id;} $GLOBALS['hl_Ids'][($a['id'] = $id)] = 1; @@ -624,15 +642,14 @@ if(empty($C['hook_tag'])){ return "<{$e}{$aA}". (isset($eE[$e]) ? ' /' : ''). 
'>'; } else{return $C['hook_tag']($e, $a);} -// eof } public static function hl_tag2(&$e, &$a, $t=1){ // transform tag -if($e == 'center'){$e = 'div'; return 'text-align: center;';} -if($e == 'dir' or $e == 'menu'){$e = 'ul'; return '';} +if($e == 'big'){$e = 'span'; return 'font-size: larger;';} if($e == 's' or $e == 'strike'){$e = 'span'; return 'text-decoration: line-through;';} -if($e == 'u'){$e = 'span'; return 'text-decoration: underline;';} +if($e == 'tt'){$e = 'code'; return '';} +if($e == 'center'){$e = 'div'; return 'text-align: center;';} static $fs = array('0'=>'xx-small', '1'=>'xx-small', '2'=>'small', '3'=>'medium', '4'=>'large', '5'=>'x-large', '6'=>'xx-large', '7'=>'300%', '-1'=>'smaller', '-2'=>'60%', '+1'=>'larger', '+2'=>'150%', '+3'=>'200%', '+4'=>'300%'); if($e == 'font'){ $a2 = ''; @@ -646,15 +663,19 @@ if($e == 'font'){ } $e = 'span'; return ltrim(str_replace('<', '', $a2)); } +if($e == 'acronym'){$e = 'abbr'; return '';} +if($e == 'dir'){$e = 'ul'; return '';} if($t == 2){$e = 0; return 0;} return ''; -// eof } public static function hl_tidy($t, $w, $p){ -// Tidy/compact HTM +// tidy/compact HTM if(strpos(' pre,script,textarea', "$p,")){return $t;} -$t = preg_replace('`\s+`', ' ', preg_replace_callback(array('`(<(!\[CDATA\[))(.+?)(\]\]>)`sm', '`(<(!--))(.+?)(-->)`sm', '`(<(pre|script|textarea)[^>]*?>)(.+?)()`sm'), create_function('$m', 'return $m[1]. str_replace(array("<", ">", "\n", "\r", "\t", " "), array("\x01", "\x02", "\x03", "\x04", "\x05", "\x07"), $m[3]). $m[4];'), $t)); +if(!function_exists('hl_aux2')){function hl_aux2($m){ + return $m[1]. str_replace(array("<", ">", "\n", "\r", "\t", ' '), array("\x01", "\x02", "\x03", "\x04", "\x05", "\x07"), $m[3]). $m[4]; +}} +$t = preg_replace(array('`(<\w[^>]*(?)\s+`', '`\s+`', '`(<\w[^>]*(?) 
`'), array(' $1', ' ', '$1'), preg_replace_callback(array('`(<(!\[CDATA\[))(.+?)(\]\]>)`sm', '`(<(!--))(.+?)(-->)`sm', '`(<(pre|script|textarea)[^>]*?>)(.+?)()`sm'), 'hl_aux2', $t)); if(($w = strtolower($w)) == -1){ return str_replace(array("\x01", "\x02", "\x03", "\x04", "\x05", "\x07"), array('<', '>', "\n", "\r", "\t", ' '), $t); } @@ -662,9 +683,9 @@ $s = strpos(" $w", 't') ? "\t" : ' '; $s = preg_match('`\d`', $w, $m) ? str_repeat($s, $m[0]) : str_repeat($s, ($s == "\t" ? 1 : 2)); $N = preg_match('`[ts]([1-9])`', $w, $m) ? $m[1] : 0; $a = array('br'=>1); -$b = array('button'=>1, 'input'=>1, 'option'=>1, 'param'=>1); -$c = array('caption'=>1, 'dd'=>1, 'dt'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'isindex'=>1, 'label'=>1, 'legend'=>1, 'li'=>1, 'object'=>1, 'p'=>1, 'pre'=>1, 'td'=>1, 'textarea'=>1, 'th'=>1); -$d = array('address'=>1, 'blockquote'=>1, 'center'=>1, 'colgroup'=>1, 'dir'=>1, 'div'=>1, 'dl'=>1, 'fieldset'=>1, 'form'=>1, 'hr'=>1, 'iframe'=>1, 'map'=>1, 'menu'=>1, 'noscript'=>1, 'ol'=>1, 'optgroup'=>1, 'rbc'=>1, 'rtc'=>1, 'ruby'=>1, 'script'=>1, 'select'=>1, 'table'=>1, 'tbody'=>1, 'tfoot'=>1, 'thead'=>1, 'tr'=>1, 'ul'=>1); +$b = array('button'=>1, 'command'=>1, 'input'=>1, 'option'=>1, 'param'=>1, 'track'=>1); +$c = array('audio'=>1, 'canvas'=>1, 'caption'=>1, 'dd'=>1, 'dt'=>1, 'figcaption'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'isindex'=>1, 'label'=>1, 'legend'=>1, 'li'=>1, 'object'=>1, 'p'=>1, 'pre'=>1, 'style'=>1, 'summary'=>1, 'td'=>1, 'textarea'=>1, 'th'=>1, 'video'=>1); +$d = array('address'=>1, 'article'=>1, 'aside'=>1, 'blockquote'=>1, 'center'=>1, 'colgroup'=>1, 'datalist'=>1, 'details'=>1, 'dir'=>1, 'div'=>1, 'dl'=>1, 'fieldset'=>1, 'figure'=>1, 'footer'=>1, 'form'=>1, 'header'=>1, 'hgroup'=>1, 'hr'=>1, 'iframe'=>1, 'main'=>1, 'map'=>1, 'menu'=>1, 'nav'=>1, 'noscript'=>1, 'ol'=>1, 'optgroup'=>1, 'rbc'=>1, 'rtc'=>1, 'ruby'=>1, 'script'=>1, 'section'=>1, 'select'=>1, 'table'=>1, 'tbody'=>1, 'tfoot'=>1, 
'thead'=>1, 'tr'=>1, 'ul'=>1); $T = explode('<', $t); $X = 1; while($X){ @@ -703,33 +724,12 @@ if(($l = strpos(" $w", 'r') ? (strpos(" $w", 'n') ? "\r\n" : "\r") : 0)){ $t = str_replace("\n", $l, $t); } return str_replace(array("\x01", "\x02", "\x03", "\x04", "\x05", "\x07"), array('<', '>', "\n", "\r", "\t", ' '), $t); -// eof } public static function hl_version(){ -// rel -return '1.1.22'; -// eof +// version +return '1.2.4.1'; } -public static function kses($t, $h, $p=array('http', 'https', 'ftp', 'news', 'nntp', 'telnet', 'gopher', 'mailto')){ -// kses compat -foreach($h as $k=>$v){ - $h[$k]['n']['*'] = 1; -} -$C['cdata'] = $C['comment'] = $C['make_tag_strict'] = $C['no_deprecated_attr'] = $C['unique_ids'] = 0; -$C['keep_bad'] = 1; -$C['elements'] = count($h) ? strtolower(implode(',', array_keys($h))) : '-*'; -$C['hook'] = 'htmLawed::kses_hook'; -$C['schemes'] = '*:'. implode(',', $p); -return htmLawed::hl($t, $C, $h); -// eof -} - -public static function kses_hook($t, &$C, &$S){ -// kses compat -return $t; -// eof -} // end class } diff --git a/libraries/html5php/HTML5/Parser/DOMTreeBuilder.php b/libraries/html5php/HTML5/Parser/DOMTreeBuilder.php index ccad229..b26860d 100644 --- a/libraries/html5php/HTML5/Parser/DOMTreeBuilder.php +++ b/libraries/html5php/HTML5/Parser/DOMTreeBuilder.php @@ -274,7 +274,8 @@ class DOMTreeBuilder implements EventHandler // SPECIAL TAG HANDLING: // Spec says do this, and "don't ask." - if ($name == 'image') { + // find the spec where this is defined... 
looks problematic + if ($name == 'image' && !($this->insertMode === static::IM_IN_SVG || $this->insertMode === static::IM_IN_MATHML)) { $name = 'img'; } @@ -681,4 +682,4 @@ class DOMTreeBuilder implements EventHandler { return $this->current->tagName == $tagname; } -} \ No newline at end of file +} diff --git a/libraries/html5php/HTML5/Parser/Tokenizer.php b/libraries/html5php/HTML5/Parser/Tokenizer.php index 02b2aff..c42bc3d 100644 --- a/libraries/html5php/HTML5/Parser/Tokenizer.php +++ b/libraries/html5php/HTML5/Parser/Tokenizer.php @@ -83,11 +83,8 @@ class Tokenizer */ public function parse() { - $p = 0; do { - $p = $this->scanner->position(); $this->consumeData(); - // FIXME: Add infinite loop protection. } while ($this->carryOn); } @@ -145,7 +142,8 @@ class Tokenizer */ protected function characterData() { - if ($this->scanner->current() === false) { + $tok = $this->scanner->current(); + if ($tok === false) { return false; } switch ($this->textMode) { @@ -154,7 +152,6 @@ class Tokenizer case Elements::TEXT_RCDATA: return $this->rcdata(); default: - $tok = $this->scanner->current(); if (strspn($tok, "<&")) { return false; } @@ -408,24 +405,26 @@ class Tokenizer if ($tok == '/') { $this->scanner->next(); $this->scanner->whitespace(); - if ($this->scanner->current() == '>') { + $tok = $this->scanner->current(); + + if ($tok == '>') { $selfClose = true; return true; } - if ($this->scanner->current() === false) { + if ($tok === false) { $this->parseError("Unexpected EOF inside of tag."); return true; } // Basically, we skip the / token and go on. // See 8.2.4.43. 
- $this->parseError("Unexpected '%s' inside of a tag.", $this->scanner->current()); + $this->parseError("Unexpected '%s' inside of a tag.", $tok); return false; } - if ($this->scanner->current() == '>') { + if ($tok == '>') { return true; } - if ($this->scanner->current() === false) { + if ($tok === false) { $this->parseError("Unexpected EOF inside of tag."); return true; } @@ -541,15 +540,21 @@ class Tokenizer { $stoplist = "\f" . $quote; $val = ''; - $tok = $this->scanner->current(); - while (strspn($tok, $stoplist) == 0 && $tok !== false) { - if ($tok == '&') { - $val .= $this->decodeCharacterReference(true); - $tok = $this->scanner->current(); + + while (true) { + $tokens = $this->scanner->charsUntil($stoplist.'&'); + if ($tokens !== false) { + $val .= $tokens; } else { - $val .= $tok; - $tok = $this->scanner->next(); + break; } + + $tok = $this->scanner->current(); + if ($tok == '&') { + $val .= $this->decodeCharacterReference(true, $tok); + continue; + } + break; } $this->scanner->next(); return $val; @@ -591,18 +596,18 @@ class Tokenizer */ protected function bogusComment($leading = '') { - - // TODO: This can be done more efficiently when the - // scanner exposes a readUntil() method. $comment = $leading; + $tokens = $this->scanner->charsUntil('>'); + if ($tokens !== false) { + $comment .= $tokens; + } $tok = $this->scanner->current(); - do { + if ($tok !== false) { $comment .= $tok; - $tok = $this->scanner->next(); - } while ($tok !== false && $tok != '>'); + } $this->flushBuffer(); - $this->events->comment($comment . $tok); + $this->events->comment($comment); $this->scanner->next(); return true; @@ -646,15 +651,17 @@ class Tokenizer */ protected function isCommentEnd() { + $tok = $this->scanner->current(); + // EOF - if ($this->scanner->current() === false) { + if ($tok === false) { // Hit the end. $this->parseError("Unexpected EOF in a comment."); return true; } // If it doesn't start with -, not the end. 
- if ($this->scanner->current() != '-') { + if ($tok != '-') { return false; } @@ -737,7 +744,6 @@ class Tokenizer $pub = strtoupper($this->scanner->getAsciiAlpha()); $white = strlen($this->scanner->whitespace()); - $tok = $this->scanner->current(); // Get ID, and flag it as pub or system. if (($pub == 'PUBLIC' || $pub == 'SYSTEM') && $white > 0) { @@ -938,10 +944,11 @@ class Tokenizer $len = strlen($sequence); $buffer = ''; for ($i = 0; $i < $len; ++ $i) { - $buffer .= $this->scanner->current(); + $tok = $this->scanner->current(); + $buffer .= $tok; // EOF. Rewind and let the caller handle it. - if ($this->scanner->current() === false) { + if ($tok === false) { $this->scanner->unconsume($i); return false; } @@ -1067,18 +1074,22 @@ class Tokenizer } $entity = CharacterReference::lookupDecimal($numeric); } - } // String entity. - else { + } elseif ($tok === '=' && $inAttribute) { + return '&'; + } else { // String entity. + // Attempt to consume a string up to a ';'. // [a-zA-Z0-9]+; - $cname = $this->scanner->getAsciiAlpha(); + $cname = $this->scanner->getAsciiAlphaNum(); $entity = CharacterReference::lookupName($cname); // When no entity is found provide the name of the unmatched string // and continue on as the & is not part of an entity. The & will // be converted to & elsewhere. if ($entity == null) { - $this->parseError("No match in entity table for '%s'", $cname); + if (!$inAttribute || strlen($cname) === 0) { + $this->parseError("No match in entity table for '%s'", $cname); + } $this->scanner->unconsume($this->scanner->position() - $start); return '&'; } diff --git a/libraries/html5php/README.md b/libraries/html5php/README.md index 505a85f..e2cfdf9 100644 --- a/libraries/html5php/README.md +++ b/libraries/html5php/README.md @@ -1,14 +1,16 @@ # HTML5-PHP -The need for an HTML5 parser in PHP is clear. This project initially -began with the seemingly abandoned `html5lib` project [original source](https://code.google.com/p/html5lib/source/checkout). 
-But after some initial refactoring work, we began a new parser. +HTML5 is a standards-compliant HTML5 parser and writer written entirely in PHP. +It is stable and used in many production websites, and has +well over [one million downloads](https://packagist.org/packages/masterminds/html5). + +HTML5 provides the following features. - An HTML5 serializer - Support for PHP namespaces - Composer support - Event-based (SAX-like) parser -- DOM tree builder +- A DOM tree builder - Interoperability with [QueryPath](https://github.com/technosophos/querypath) - Runs on **PHP** 5.3.0 or newer and **HHVM** 3.2 or newer @@ -16,6 +18,7 @@ But after some initial refactoring work, we began a new parser. [![Latest Stable Version](https://poser.pugx.org/masterminds/html5/v/stable.png)](https://packagist.org/packages/masterminds/html5) [![Code Coverage](https://scrutinizer-ci.com/g/Masterminds/html5-php/badges/coverage.png?b=master)](https://scrutinizer-ci.com/g/Masterminds/html5-php/?branch=master) [![Scrutinizer Code Quality](https://scrutinizer-ci.com/g/Masterminds/html5-php/badges/quality-score.png?b=master)](https://scrutinizer-ci.com/g/Masterminds/html5-php/?branch=master) +[![Stability: Sustained](https://masterminds.github.io/stability/sustained.svg)](https://masterminds.github.io/stability/sustained.html) ## Installation @@ -23,7 +26,7 @@ Install HTML5-PHP using [composer](http://getcomposer.org/). 
To install, add `masterminds/html5` to your `composer.json` file: -``` +```json { "require" : { "masterminds/html5": "2.*" diff --git a/libraries/html5php/RELEASE.md b/libraries/html5php/RELEASE.md index 56d5fa1..b4ddf82 100644 --- a/libraries/html5php/RELEASE.md +++ b/libraries/html5php/RELEASE.md @@ -1,6 +1,13 @@ # Release Notes -2.2.2 (2016-10-22) +2.3.0 (2017-09-04) + +- #129: image within inline svg breaks system (fixed by #133) +- #131: ² does not work (fixed by #132) +- #134: Improve tokenizer performance by 20% (alternative version of #130 thanks to @MichaelHeerklotz) +- #135: Raw & in attributes + +2.2.2 (2016-09-22) - #116: In XML mode, tags are case sensitive - #115: Fix PHP Notice in OutputRules @@ -14,8 +21,7 @@ 2.2.0 (2016-04-11) - #105: Enable composer cache (for CI/CD) -- #100: Use mb_substitute_character inset of ini_set for environments where - ini_set is disable (e.g., shared hosting) +- #100: Use mb_substitute_character inset of ini_set for environments where ini_set is disable (e.g., shared hosting) - #98: Allow link, meta, style tags in noscript tags - #96: Fixed xml:href on svgs that use the "use" breaking - #94: Counting UTF8 characters performance improvement diff --git a/libraries/humble-http-agent/HumbleHttpAgent.php b/libraries/humble-http-agent/HumbleHttpAgent.php index 605a6ad..db0040d 100644 --- a/libraries/humble-http-agent/HumbleHttpAgent.php +++ b/libraries/humble-http-agent/HumbleHttpAgent.php @@ -7,8 +7,8 @@ * For environments which do not have these options, it reverts to standard sequential * requests (using file_get_contents()) * - * @version 1.7 - * @date 2016-11-28 + * @version 1.8 + * @date 2017-09-25 * @see http://devel-m6w6.rhcloud.com/mdref/http * @author Keyvan Minoukadeh * @copyright 2011-2016 Keyvan Minoukadeh @@ -21,8 +21,9 @@ class HumbleHttpAgent const METHOD_CURL_MULTI = 2; const METHOD_FILE_GET_CONTENTS = 4; //const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'; - const 
UA_BROWSER = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36'; - const UA_PHP = 'PHP/5.6'; + // popular user agents from https://techblog.willshouse.com/2012/01/03/most-common-user-agents/ + const UA_BROWSER = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'; + const UA_PHP = 'PHP/7.1'; const REF_GOOGLE = 'http://www.google.co.uk/url?sa=t&source=web&cd=1'; protected $requests = array(); @@ -194,6 +195,24 @@ class HumbleHttpAgent public function getMetaRefreshURL($url, $html) { if ($html == '') return false; + + // TODO: parse HTML properly + // For now, to deal with cases where meta refresh matches but shouldn't, e.g. CNN's + // + // we do the string replacements in the site config file before looking for the meta refresh + if (isset($this->siteConfigBuilder)) { + $sconfig = $this->siteConfigBuilder->buildSiteConfig($url); + // do string replacements + if (!empty($sconfig->find_string)) { + if (count($sconfig->find_string) == count($sconfig->replace_string)) { + $html = str_replace($sconfig->find_string, $sconfig->replace_string, $html, $_count); + //$this->debug("Strings replaced: $_count (find_string and/or replace_string)"); + } else { + //$this->debug('Skipped string replacement - incorrect number of find-replace strings in site config'); + } + } + } + // if (!preg_match('!]+)["\']?!i', $html, $match)) { return false; @@ -211,7 +230,7 @@ class HumbleHttpAgent if (isset($base->path)) $base->path = preg_replace('!//+!', '/', $base->path); if ($absolute = SimplePie_IRI::absolutize($base, $redirect_url)) { $this->debug('Meta refresh redirect found (http-equiv="refresh"), new URL: '.$absolute); - return $absolute->get_iri(); + return $absolute->get_uri(); } return false; } @@ -248,6 +267,21 @@ class HumbleHttpAgent } } + public function convertIdn($url) { + if (function_exists('idn_to_ascii')) { + if ($host = @parse_url($url, 
PHP_URL_HOST)) { + $puny = idn_to_ascii($host, 0, INTL_IDNA_VARIANT_UTS46); + if ($host != $puny) { + $pos = strpos($url, $host); + if ($pos !== false) { + $url = substr_replace($url, $puny, $pos, strlen($host)); + } + } + } + } + return $url; + } + public function rewriteUrls($url) { foreach ($this->rewriteUrls as $find => $action) { if (strpos($url, $find) !== false) { @@ -327,6 +361,7 @@ class HumbleHttpAgent } else { $this->debug("......adding to pool"); $req_url = $this->rewriteUrls($url); + $req_url = $this->convertIdn($req_url); $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url; $req_url = $this->removeFragment($req_url); if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) { @@ -507,6 +542,7 @@ class HumbleHttpAgent } else { $this->debug("......adding to pool"); $req_url = $this->rewriteUrls($url); + $req_url = $this->convertIdn($req_url); $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url; $req_url = $this->removeFragment($req_url); if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) { @@ -649,6 +685,7 @@ class HumbleHttpAgent $this->debug("Sending request for $url"); $this->requests[$orig]['original_url'] = $orig; $req_url = $this->rewriteUrls($url); + $req_url = $this->convertIdn($req_url); $req_url = ($this->rewriteHashbangFragment) ? 
$this->rewriteHashbangFragment($req_url) : $req_url; $req_url = $this->removeFragment($req_url); $httpContext = $this->httpContext; diff --git a/libraries/language-detect/LanguageDetect.php b/libraries/language-detect/LanguageDetect.php index a6922fa..52a4ee0 100644 --- a/libraries/language-detect/LanguageDetect.php +++ b/libraries/language-detect/LanguageDetect.php @@ -1,5 +1,4 @@ _data_dir will be ignored * - * @var string - * @access private + * @var string */ - var $_db_filename = 'lang.dat'; + protected $_db_filename = 'lang.dat'; /** * The filename that stores the unicode block definitions @@ -85,83 +83,74 @@ class Text_LanguageDetect * $this->_data_dir will be ignored * * @var string - * @access private */ - var $_unicode_db_filename = 'unicode_blocks.dat'; + protected $_unicode_db_filename = 'unicode_blocks.dat'; /** * The data directory * * Should be set by PEAR installer * - * @var string - * @access private + * @var string */ - var $_data_dir = '@data_dir@'; + protected $_data_dir = '@data_dir@'; /** * The trigram data for comparison * * Will be loaded on start from $this->_db_filename * - * @var array - * @access private - */ - var $_lang_db = array(); - - /** - * stores the map of the trigram data to unicode characters - * - * @access private * @var array */ - var $_unicode_map; + protected $_lang_db = array(); + + /** + * Stores the map of the trigram data to unicode characters + * + * @var array + */ + protected $_unicode_map; /** * The size of the trigram data arrays * - * @var int - * @access private + * @var int */ - var $_threshold = 300; + protected $_threshold = 300; /** - * the maximum possible score. + * The maximum possible score. * - * needed for score normalization. Different depending on the + * Needed for score normalization. 
Different depending on the * perl compatibility setting * - * @access private - * @var int - * @see setPerlCompatible() + * @var int + * @see setPerlCompatible() */ - var $_max_score = 0; + protected $_max_score = 0; /** * Whether or not to simulate perl's Language::Guess exactly * - * @access private - * @var bool - * @see setPerlCompatible() + * @var bool + * @see setPerlCompatible() */ - var $_perl_compatible = false; + protected $_perl_compatible = false; /** * Whether to use the unicode block detection to speed up processing * - * @access private * @var bool */ - var $_use_unicode_narrowing = true; + protected $_use_unicode_narrowing = true; /** - * stores the result of the clustering operation + * Stores the result of the clustering operation * - * @access private - * @var array - * @see clusterLanguages() + * @var array + * @see clusterLanguages() */ - var $_clusters; + protected $_clusters; /** * Which type of "language names" are accepted and returned: @@ -170,7 +159,7 @@ class Text_LanguageDetect * 2 - 2-letter ISO 639-1 code ("en") * 3 - 3-letter ISO 639-2 code ("eng") */ - var $_name_mode = 0; + protected $_name_mode = 0; /** * Constructor @@ -178,7 +167,7 @@ class Text_LanguageDetect * Will attempt to load the language database. If it fails, you will get * an exception. 
*/ - function __construct() + public function __construct() { $data = $this->_readdb($this->_db_filename); $this->_checkTrigram($data['trigram']); @@ -200,9 +189,8 @@ class Text_LanguageDetect * @param string $fname File name to load * * @return string expected path to the language model database - * @access private */ - function _get_data_loc($fname) + protected function _get_data_loc($fname) { return dirname(__FILE__).'/'.$fname; } @@ -216,9 +204,8 @@ class Text_LanguageDetect * * @return array the language model data * @throws Text_LanguageDetect_Exception - * @access private */ - function _readdb($fname) + protected function _readdb($fname) { // finds the correct data dir $fname = $this->_get_data_loc($fname); @@ -246,9 +233,8 @@ class Text_LanguageDetect * @param array $trigram Trigram data from database * * @return void - * @access private */ - function _checkTrigram($trigram) + protected function _checkTrigram($trigram) { if (!is_array($trigram)) { if (ini_get('magic_quotes_runtime')) { @@ -340,11 +326,10 @@ class Text_LanguageDetect /** * Returns the number of languages that this object can detect * - * @access public * @return int the number of languages - * @throws Text_LanguageDetect_Exception + * @throws Text_LanguageDetect_Exception */ - function getLanguageCount() + public function getLanguageCount() { return count($this->_lang_db); } @@ -382,11 +367,10 @@ class Text_LanguageDetect /** * Returns the list of detectable languages * - * @access public * @return array the names of the languages known to this object<<<<<<< - * @throws Text_LanguageDetect_Exception + * @throws Text_LanguageDetect_Exception */ - function getLanguages() + public function getLanguages() { return $this->_convertToNameMode( array_keys($this->_lang_db) @@ -424,7 +408,7 @@ class Text_LanguageDetect * * @return void */ - function setNameMode($name_mode) + public function setNameMode($name_mode) { $this->_name_mode = $name_mode; } @@ -454,10 +438,9 @@ class Text_LanguageDetect * 
@param string $text text to convert * * @return array array of trigram frequencies - * @access private * @deprecated Superceded by the Text_LanguageDetect_Parser class */ - function _trigram($text) + protected function _trigram($text) { $s = new Text_LanguageDetect_Parser($text); $s->prepareTrigram(); @@ -475,9 +458,8 @@ class Text_LanguageDetect * @param array $arr array of trigram * * @return array ranks of trigrams - * @access protected */ - function _arr_rank($arr) + protected function _arr_rank($arr) { // sorts alphabetically first as a standard way of breaking rank ties @@ -505,12 +487,11 @@ class Text_LanguageDetect /** * Sorts an array by value breaking ties alphabetically * - * @param array &$arr the array to sort + * @param array $arr the array to sort * * @return void - * @access private */ - function _bub_sort(&$arr) + protected function _bub_sort(&$arr) { // should do the same as this perl statement: // sort { $trigrams{$b} == $trigrams{$a} @@ -548,9 +529,8 @@ class Text_LanguageDetect * * @return int 1 if $a is greater, -1 if not * @see _bub_sort() - * @access private */ - function _sort_func($a, $b) + protected function _sort_func($a, $b) { // each is actually a key/value pair, so that it can compare using both list($a_key, $a_value) = $a; @@ -588,9 +568,8 @@ class Text_LanguageDetect * * @return int the sum of the differences between the ranks of * the two trigram sets - * @access private */ - function _distance($arr1, $arr2) + protected function _distance($arr1, $arr2) { $sumdist = 0; @@ -621,9 +600,8 @@ class Text_LanguageDetect * * @return float the normalized score * @see _distance() - * @access private */ - function _normalize_score($score, $base_count = null) + protected function _normalize_score($score, $base_count = null) { if ($base_count === null) { $base_count = $this->_threshold; @@ -699,7 +677,7 @@ class Text_LanguageDetect $sample_obj->setPadStart(!$this->_perl_compatible); $sample_obj->analyze(); - $trigram_freqs =& 
$sample_obj->getTrigramRanks(); + $trigram_freqs = $sample_obj->getTrigramRanks(); $trigram_count = count($trigram_freqs); if ($trigram_count == 0) { @@ -710,7 +688,7 @@ class Text_LanguageDetect // use unicode block detection to narrow down the possibilities if ($this->_use_unicode_narrowing) { - $blocks =& $sample_obj->getUnicodeBlocks(); + $blocks = $sample_obj->getUnicodeBlocks(); if (is_array($blocks)) { $present_blocks = array_keys($blocks); @@ -962,16 +940,15 @@ class Text_LanguageDetect * * @return mixed Block name, -1 if it failed * @see unicodeBlockName() - * @access protected */ - function _unicode_block_name($unicode, $blocks, $block_count = -1) + protected function _unicode_block_name($unicode, $blocks, $block_count = -1) { // for a reference, see // http://www.unicode.org/Public/UNIDATA/Blocks.txt // assume that ascii characters are the most common // so try it first for efficiency - if ($unicode <= hexdec($blocks[0][1])) { + if ($unicode <= $blocks[0][1]) { return $blocks[0]; } @@ -989,11 +966,11 @@ class Text_LanguageDetect while ($low <= $high) { $mid = floor(($low + $high) / 2); - if ($unicode < hexdec($blocks[$mid][0])) { + if ($unicode < $blocks[$mid][0]) { // if it's lower than the lower bound $high = $mid - 1; - } elseif ($unicode > hexdec($blocks[$mid][1])) { + } elseif ($unicode > $blocks[$mid][1]) { // if it's higher than the upper bound $low = $mid + 1; @@ -1015,9 +992,8 @@ class Text_LanguageDetect * * @return array the database of unicode block definitions * @throws Text_LanguageDetect_Exception - * @access protected */ - function _read_unicode_block_db() + protected function _read_unicode_block_db() { // since the unicode definitions are always going to be the same, // might as well share the memory for the db with all other instances @@ -1136,14 +1112,13 @@ class Text_LanguageDetect * Uses a nearest neighbor technique to generate the maximum possible * number of dendograms from the similarity data. 
* - * @access public - * @return array language cluster data - * @throws Text_LanguageDetect_Exception - * @see languageSimilarity() - * @deprecated this function will eventually be removed and placed into + * @return array language cluster data + * @throws Text_LanguageDetect_Exception + * @see languageSimilarity() + * @deprecated this function will eventually be removed and placed into * the model generation class */ - function clusterLanguages() + public function clusterLanguages() { // todo: set the maximum number of clusters // return cached result, if any @@ -1452,7 +1427,7 @@ class Text_LanguageDetect } /** - * ut8-safe strlen() + * UTF8-safe strlen() * * Returns the numbers of characters (not bytes) in a utf8 string * @@ -1476,10 +1451,9 @@ class Text_LanguageDetect * @param string $char a utf8 (possibly multi-byte) char * * @return int unicode value - * @access protected * @link http://en.wikipedia.org/wiki/UTF-8 */ - function _utf8char2unicode($char) + protected function _utf8char2unicode($char) { // strlen() here will actually get the binary length of a single char switch (strlen($char)) { @@ -1516,20 +1490,19 @@ class Text_LanguageDetect } /** - * utf8-safe fast character iterator + * UTF8-safe fast character iterator * * Will get the next character starting from $counter, which will then be * incremented. If a multi-byte char the bytes will be concatenated and * $counter will be incremeted by the number of bytes in the char. 
* * @param string $str the string being iterated over - * @param int &$counter the iterator, will increment by reference + * @param int $counter the iterator, will increment by reference * @param bool $special_convert whether to do special conversions * * @return char the next (possibly multi-byte) char from $counter - * @access private */ - static function _next_char($str, &$counter, $special_convert = false) + protected static function _next_char($str, &$counter, $special_convert = false) { $char = $str{$counter++}; $ord = ord($char); @@ -1621,7 +1594,7 @@ class Text_LanguageDetect * * @return string|array Language name */ - function _convertFromNameMode($lang, $convertKey = false) + protected function _convertFromNameMode($lang, $convertKey = false) { if ($this->_name_mode == 0) { return $lang; @@ -1661,7 +1634,7 @@ class Text_LanguageDetect * * @return string|array Language name */ - function _convertToNameMode($lang, $convertKey = false) + protected function _convertToNameMode($lang, $convertKey = false) { if ($this->_name_mode == 0) { return $lang; @@ -1688,6 +1661,4 @@ class Text_LanguageDetect } return $newlang; } -} - -/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */ \ No newline at end of file +} \ No newline at end of file diff --git a/libraries/language-detect/LanguageDetect/Exception.php b/libraries/language-detect/LanguageDetect/Exception.php index 196d994..6566e6b 100644 --- a/libraries/language-detect/LanguageDetect/Exception.php +++ b/libraries/language-detect/LanguageDetect/Exception.php @@ -1,4 +1,16 @@ + * @license BSD http://www.opensource.org/licenses/bsd-license.php + * @link http://pear.php.net/package/Text_LanguageDetect/ + */ + class Text_LanguageDetect_Exception extends Exception { /** diff --git a/libraries/language-detect/LanguageDetect/ISO639.php b/libraries/language-detect/LanguageDetect/ISO639.php index 05b0590..e76dd2d 100644 --- a/libraries/language-detect/LanguageDetect/ISO639.php +++ 
b/libraries/language-detect/LanguageDetect/ISO639.php @@ -1,18 +1,4 @@ - * @copyright 2011 Christian Weiske - * @license http://www.debian.org/misc/bsd.license BSD - * @version SVN: $Id$ - * @link http://pear.php.net/package/Text_LanguageDetect/ - */ - /** * Provides a mapping between the languages from lang.dat and the * ISO 639-1 and ISO-639-2 codes. @@ -23,7 +9,7 @@ * @package Text_LanguageDetect * @author Christian Weiske * @copyright 2011 Christian Weiske - * @license http://www.debian.org/misc/bsd.license BSD + * @license BSD http://www.opensource.org/licenses/bsd-license.php * @link http://www.loc.gov/standards/iso639-2/php/code_list.php */ class Text_LanguageDetect_ISO639 diff --git a/libraries/language-detect/LanguageDetect/Parser.php b/libraries/language-detect/LanguageDetect/Parser.php index e859218..3558b81 100644 --- a/libraries/language-detect/LanguageDetect/Parser.php +++ b/libraries/language-detect/LanguageDetect/Parser.php @@ -1,18 +1,4 @@ + * @copyright 2006 Nicholas Pisarro + * @license BSD http://www.opensource.org/licenses/bsd-license.php + * @version Release: 1.0.0 + * @link http://pear.php.net/package/Text_LanguageDetect/ */ class Text_LanguageDetect_Parser extends Text_LanguageDetect { /** - * the piece of text being parsed + * The piece of text being parsed * - * @access private - * @var string + * @var string */ - var $_string; + protected $_string; /** - * stores the trigram frequencies of the sample + * Stores the trigram frequencies of the sample * - * @access private - * @var string + * @var string */ - var $_trigrams = array(); + protected $_trigrams = array(); /** - * stores the trigram ranks of the sample + * Stores the trigram ranks of the sample * - * @access private - * @var array + * @var array */ - var $_trigram_ranks = array(); + protected $_trigram_ranks = array(); /** - * stores the unicode blocks of the sample + * Stores the unicode blocks of the sample * - * @access private - * @var array + * @var array */ - var 
$_unicode_blocks = array(); - + protected $_unicode_blocks = array(); + /** * Whether the parser should compile the unicode ranges - * - * @access private - * @var bool + * + * @var bool */ - var $_compile_unicode = false; + protected $_compile_unicode = false; /** * Whether the parser should compile trigrams * - * @access private - * @var bool + * @var bool */ - var $_compile_trigram = false; + protected $_compile_trigram = false; /** * Whether the trigram parser should pad the beginning of the string * - * @access private - * @var bool + * @var bool */ - var $_trigram_pad_start = false; + protected $_trigram_pad_start = false; /** * Whether the unicode parser should skip non-alphabetical ascii chars * - * @access private - * @var bool + * @var bool */ - var $_unicode_skip_symbols = true; + protected $_unicode_skip_symbols = true; /** * Constructor * - * @access private - * @param string $string string to be parsed + * @param string $string string to be parsed */ - function __construct($string) { + public function __construct($string) + { $this->_string = $string; } + /** + * PHP 4 constructor for backwards compatibility. 
+ * + * @param string $string string to be parsed + * + * @return void + */ + public function Text_LanguageDetect_Parser($string) + { + self::__construct($string); + } + /** * Returns true if a string is suitable for parsing * - * @param string $str input string to test - * @return bool true if acceptable, false if not + * @param string $str input string to test + * + * @return bool true if acceptable, false if not */ - public static function validateString($str) { + public static function validateString($str) + { if (!empty($str) && strlen($str) > 3 && preg_match('/\S/', $str)) { return true; } else { @@ -121,34 +114,37 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect } /** - * turn on/off trigram counting + * Turn on/off trigram counting * - * @access public - * @param bool $bool true for on, false for off + * @param bool $bool true for on, false for off + * + * @return void */ - function prepareTrigram($bool = true) + public function prepareTrigram($bool = true) { $this->_compile_trigram = $bool; } /** - * turn on/off unicode block counting + * Turn on/off unicode block counting * - * @access public - * @param bool $bool true for on, false for off + * @param bool $bool true for on, false for off + * + * @return void */ - function prepareUnicode($bool = true) + public function prepareUnicode($bool = true) { $this->_compile_unicode = $bool; } /** - * turn on/off padding the beginning of the sample string + * Turn on/off padding the beginning of the sample string * - * @access public - * @param bool $bool true for on, false for off + * @param bool $bool true for on, false for off + * + * @return void */ - function setPadStart($bool = true) + public function setPadStart($bool = true) { $this->_trigram_pad_start = $bool; } @@ -156,10 +152,11 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect /** * Should the unicode block counter skip non-alphabetical ascii chars? 
* - * @access public - * @param bool $bool true for on, false for off + * @param bool $bool true for on, false for off + * + * @return void */ - function setUnicodeSkipSymbols($bool = true) + public function setUnicodeSkipSymbols($bool = true) { $this->_unicode_skip_symbols = $bool; } @@ -167,10 +164,9 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect /** * Returns the trigram ranks for the text sample * - * @access public - * @return array trigram ranks in the text sample + * @return array Trigram ranks in the text sample */ - function &getTrigramRanks() + public function getTrigramRanks() { return $this->_trigram_ranks; } @@ -178,39 +174,37 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect /** * Return the trigram freqency table * - * only used in testing to make sure the parser is working + * Only used in testing to make sure the parser is working * - * @access public - * @return array trigram freqencies in the text sample + * @return array Trigram freqencies in the text sample */ - function &getTrigramFreqs() + public function getTrigramFreqs() { return $this->_trigram; } /** - * returns the array of unicode blocks + * Returns the array of unicode blocks * - * @access public - * @return array unicode blocks in the text sample + * @return array Unicode blocks in the text sample */ - function &getUnicodeBlocks() + public function getUnicodeBlocks() { return $this->_unicode_blocks; } /** * Executes the parsing operation - * - * Be sure to call the set*() functions to set options and the + * + * Be sure to call the set*() functions to set options and the * prepare*() functions first to tell it what kind of data to compute * * Afterwards the get*() functions can be used to access the compiled * information. 
* - * @access public + * @return void */ - function analyze() + public function analyze() { $len = strlen($this->_string); $byte_counter = 0; @@ -258,9 +252,9 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect if ($this->_compile_trigram) { if (!($b == ' ' && ($a == ' ' || $char == ' '))) { if (!isset($this->_trigram[$a . $b . $char])) { - $this->_trigram[$a . $b . $char] = 1; + $this->_trigram[$a . $b . $char] = 1; } else { - $this->_trigram[$a . $b . $char]++; + $this->_trigram[$a . $b . $char]++; } } @@ -271,10 +265,11 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect // unicode block detection if ($this->_compile_unicode) { if ($this->_unicode_skip_symbols - && strlen($char) == 1 - && ($char < 'A' || $char > 'z' - || ($char > 'Z' && $char < 'a')) - && $char != "'") { // does not skip the apostrophe + && strlen($char) == 1 + && ($char < 'A' || $char > 'z' + || ($char > 'Z' && $char < 'a')) + && $char != "'" + ) { // does not skip the apostrophe // since it's included in the language // models @@ -297,7 +292,8 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect if ($this->_compile_unicode) { foreach ($unicode_chars as $utf8_char => $count) { $search_result = $this->_unicode_block_name( - $this->_utf8char2unicode($utf8_char), $blocks, $block_count); + $this->_utf8char2unicode($utf8_char), $blocks, $block_count + ); if ($search_result != -1) { $block_name = $search_result[2]; @@ -342,6 +338,4 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect } } } -} - -/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */ \ No newline at end of file +} \ No newline at end of file diff --git a/libraries/language-detect/unicode_blocks.dat b/libraries/language-detect/unicode_blocks.dat index 3b24cd2..1f66cac 100644 --- a/libraries/language-detect/unicode_blocks.dat +++ b/libraries/language-detect/unicode_blocks.dat @@ -1 +1 @@ -a:145:{i:0;a:3:{i:0;s:6:"0x0000";i:1;s:6:"0x007F";i:2;s:11:"Basic 
Latin";}i:1;a:3:{i:0;s:6:"0x0080";i:1;s:6:"0x00FF";i:2;s:18:"Latin-1 Supplement";}i:2;a:3:{i:0;s:6:"0x0100";i:1;s:6:"0x017F";i:2;s:16:"Latin Extended-A";}i:3;a:3:{i:0;s:6:"0x0180";i:1;s:6:"0x024F";i:2;s:16:"Latin Extended-B";}i:4;a:3:{i:0;s:6:"0x0250";i:1;s:6:"0x02AF";i:2;s:14:"IPA Extensions";}i:5;a:3:{i:0;s:6:"0x02B0";i:1;s:6:"0x02FF";i:2;s:24:"Spacing Modifier Letters";}i:6;a:3:{i:0;s:6:"0x0300";i:1;s:6:"0x036F";i:2;s:27:"Combining Diacritical Marks";}i:7;a:3:{i:0;s:6:"0x0370";i:1;s:6:"0x03FF";i:2;s:16:"Greek and Coptic";}i:8;a:3:{i:0;s:6:"0x0400";i:1;s:6:"0x04FF";i:2;s:8:"Cyrillic";}i:9;a:3:{i:0;s:6:"0x0500";i:1;s:6:"0x052F";i:2;s:19:"Cyrillic Supplement";}i:10;a:3:{i:0;s:6:"0x0530";i:1;s:6:"0x058F";i:2;s:8:"Armenian";}i:11;a:3:{i:0;s:6:"0x0590";i:1;s:6:"0x05FF";i:2;s:6:"Hebrew";}i:12;a:3:{i:0;s:6:"0x0600";i:1;s:6:"0x06FF";i:2;s:6:"Arabic";}i:13;a:3:{i:0;s:6:"0x0700";i:1;s:6:"0x074F";i:2;s:6:"Syriac";}i:14;a:3:{i:0;s:6:"0x0750";i:1;s:6:"0x077F";i:2;s:17:"Arabic Supplement";}i:15;a:3:{i:0;s:6:"0x0780";i:1;s:6:"0x07BF";i:2;s:6:"Thaana";}i:16;a:3:{i:0;s:6:"0x0900";i:1;s:6:"0x097F";i:2;s:10:"Devanagari";}i:17;a:3:{i:0;s:6:"0x0980";i:1;s:6:"0x09FF";i:2;s:7:"Bengali";}i:18;a:3:{i:0;s:6:"0x0A00";i:1;s:6:"0x0A7F";i:2;s:8:"Gurmukhi";}i:19;a:3:{i:0;s:6:"0x0A80";i:1;s:6:"0x0AFF";i:2;s:8:"Gujarati";}i:20;a:3:{i:0;s:6:"0x0B00";i:1;s:6:"0x0B7F";i:2;s:5:"Oriya";}i:21;a:3:{i:0;s:6:"0x0B80";i:1;s:6:"0x0BFF";i:2;s:5:"Tamil";}i:22;a:3:{i:0;s:6:"0x0C00";i:1;s:6:"0x0C7F";i:2;s:6:"Telugu";}i:23;a:3:{i:0;s:6:"0x0C80";i:1;s:6:"0x0CFF";i:2;s:7:"Kannada";}i:24;a:3:{i:0;s:6:"0x0D00";i:1;s:6:"0x0D7F";i:2;s:9:"Malayalam";}i:25;a:3:{i:0;s:6:"0x0D80";i:1;s:6:"0x0DFF";i:2;s:7:"Sinhala";}i:26;a:3:{i:0;s:6:"0x0E00";i:1;s:6:"0x0E7F";i:2;s:4:"Thai";}i:27;a:3:{i:0;s:6:"0x0E80";i:1;s:6:"0x0EFF";i:2;s:3:"Lao";}i:28;a:3:{i:0;s:6:"0x0F00";i:1;s:6:"0x0FFF";i:2;s:7:"Tibetan";}i:29;a:3:{i:0;s:6:"0x1000";i:1;s:6:"0x109F";i:2;s:7:"Myanmar";}i:30;a:3:{i:0;s:6:"0x10A0";i:1;s:6:"0x10FF";i:2;s:8:"Georgian";}i:3
1;a:3:{i:0;s:6:"0x1100";i:1;s:6:"0x11FF";i:2;s:11:"Hangul Jamo";}i:32;a:3:{i:0;s:6:"0x1200";i:1;s:6:"0x137F";i:2;s:8:"Ethiopic";}i:33;a:3:{i:0;s:6:"0x1380";i:1;s:6:"0x139F";i:2;s:19:"Ethiopic Supplement";}i:34;a:3:{i:0;s:6:"0x13A0";i:1;s:6:"0x13FF";i:2;s:8:"Cherokee";}i:35;a:3:{i:0;s:6:"0x1400";i:1;s:6:"0x167F";i:2;s:37:"Unified Canadian Aboriginal Syllabics";}i:36;a:3:{i:0;s:6:"0x1680";i:1;s:6:"0x169F";i:2;s:5:"Ogham";}i:37;a:3:{i:0;s:6:"0x16A0";i:1;s:6:"0x16FF";i:2;s:5:"Runic";}i:38;a:3:{i:0;s:6:"0x1700";i:1;s:6:"0x171F";i:2;s:7:"Tagalog";}i:39;a:3:{i:0;s:6:"0x1720";i:1;s:6:"0x173F";i:2;s:7:"Hanunoo";}i:40;a:3:{i:0;s:6:"0x1740";i:1;s:6:"0x175F";i:2;s:5:"Buhid";}i:41;a:3:{i:0;s:6:"0x1760";i:1;s:6:"0x177F";i:2;s:8:"Tagbanwa";}i:42;a:3:{i:0;s:6:"0x1780";i:1;s:6:"0x17FF";i:2;s:5:"Khmer";}i:43;a:3:{i:0;s:6:"0x1800";i:1;s:6:"0x18AF";i:2;s:9:"Mongolian";}i:44;a:3:{i:0;s:6:"0x1900";i:1;s:6:"0x194F";i:2;s:5:"Limbu";}i:45;a:3:{i:0;s:6:"0x1950";i:1;s:6:"0x197F";i:2;s:6:"Tai Le";}i:46;a:3:{i:0;s:6:"0x1980";i:1;s:6:"0x19DF";i:2;s:11:"New Tai Lue";}i:47;a:3:{i:0;s:6:"0x19E0";i:1;s:6:"0x19FF";i:2;s:13:"Khmer Symbols";}i:48;a:3:{i:0;s:6:"0x1A00";i:1;s:6:"0x1A1F";i:2;s:8:"Buginese";}i:49;a:3:{i:0;s:6:"0x1D00";i:1;s:6:"0x1D7F";i:2;s:19:"Phonetic Extensions";}i:50;a:3:{i:0;s:6:"0x1D80";i:1;s:6:"0x1DBF";i:2;s:30:"Phonetic Extensions Supplement";}i:51;a:3:{i:0;s:6:"0x1DC0";i:1;s:6:"0x1DFF";i:2;s:38:"Combining Diacritical Marks Supplement";}i:52;a:3:{i:0;s:6:"0x1E00";i:1;s:6:"0x1EFF";i:2;s:25:"Latin Extended Additional";}i:53;a:3:{i:0;s:6:"0x1F00";i:1;s:6:"0x1FFF";i:2;s:14:"Greek Extended";}i:54;a:3:{i:0;s:6:"0x2000";i:1;s:6:"0x206F";i:2;s:19:"General Punctuation";}i:55;a:3:{i:0;s:6:"0x2070";i:1;s:6:"0x209F";i:2;s:27:"Superscripts and Subscripts";}i:56;a:3:{i:0;s:6:"0x20A0";i:1;s:6:"0x20CF";i:2;s:16:"Currency Symbols";}i:57;a:3:{i:0;s:6:"0x20D0";i:1;s:6:"0x20FF";i:2;s:39:"Combining Diacritical Marks for Symbols";}i:58;a:3:{i:0;s:6:"0x2100";i:1;s:6:"0x214F";i:2;s:18:"Letterlike 
Symbols";}i:59;a:3:{i:0;s:6:"0x2150";i:1;s:6:"0x218F";i:2;s:12:"Number Forms";}i:60;a:3:{i:0;s:6:"0x2190";i:1;s:6:"0x21FF";i:2;s:6:"Arrows";}i:61;a:3:{i:0;s:6:"0x2200";i:1;s:6:"0x22FF";i:2;s:22:"Mathematical Operators";}i:62;a:3:{i:0;s:6:"0x2300";i:1;s:6:"0x23FF";i:2;s:23:"Miscellaneous Technical";}i:63;a:3:{i:0;s:6:"0x2400";i:1;s:6:"0x243F";i:2;s:16:"Control Pictures";}i:64;a:3:{i:0;s:6:"0x2440";i:1;s:6:"0x245F";i:2;s:29:"Optical Character Recognition";}i:65;a:3:{i:0;s:6:"0x2460";i:1;s:6:"0x24FF";i:2;s:22:"Enclosed Alphanumerics";}i:66;a:3:{i:0;s:6:"0x2500";i:1;s:6:"0x257F";i:2;s:11:"Box Drawing";}i:67;a:3:{i:0;s:6:"0x2580";i:1;s:6:"0x259F";i:2;s:14:"Block Elements";}i:68;a:3:{i:0;s:6:"0x25A0";i:1;s:6:"0x25FF";i:2;s:16:"Geometric Shapes";}i:69;a:3:{i:0;s:6:"0x2600";i:1;s:6:"0x26FF";i:2;s:21:"Miscellaneous Symbols";}i:70;a:3:{i:0;s:6:"0x2700";i:1;s:6:"0x27BF";i:2;s:8:"Dingbats";}i:71;a:3:{i:0;s:6:"0x27C0";i:1;s:6:"0x27EF";i:2;s:36:"Miscellaneous Mathematical Symbols-A";}i:72;a:3:{i:0;s:6:"0x27F0";i:1;s:6:"0x27FF";i:2;s:21:"Supplemental Arrows-A";}i:73;a:3:{i:0;s:6:"0x2800";i:1;s:6:"0x28FF";i:2;s:16:"Braille Patterns";}i:74;a:3:{i:0;s:6:"0x2900";i:1;s:6:"0x297F";i:2;s:21:"Supplemental Arrows-B";}i:75;a:3:{i:0;s:6:"0x2980";i:1;s:6:"0x29FF";i:2;s:36:"Miscellaneous Mathematical Symbols-B";}i:76;a:3:{i:0;s:6:"0x2A00";i:1;s:6:"0x2AFF";i:2;s:35:"Supplemental Mathematical Operators";}i:77;a:3:{i:0;s:6:"0x2B00";i:1;s:6:"0x2BFF";i:2;s:32:"Miscellaneous Symbols and Arrows";}i:78;a:3:{i:0;s:6:"0x2C00";i:1;s:6:"0x2C5F";i:2;s:10:"Glagolitic";}i:79;a:3:{i:0;s:6:"0x2C80";i:1;s:6:"0x2CFF";i:2;s:6:"Coptic";}i:80;a:3:{i:0;s:6:"0x2D00";i:1;s:6:"0x2D2F";i:2;s:19:"Georgian Supplement";}i:81;a:3:{i:0;s:6:"0x2D30";i:1;s:6:"0x2D7F";i:2;s:8:"Tifinagh";}i:82;a:3:{i:0;s:6:"0x2D80";i:1;s:6:"0x2DDF";i:2;s:17:"Ethiopic Extended";}i:83;a:3:{i:0;s:6:"0x2E00";i:1;s:6:"0x2E7F";i:2;s:24:"Supplemental Punctuation";}i:84;a:3:{i:0;s:6:"0x2E80";i:1;s:6:"0x2EFF";i:2;s:23:"CJK Radicals 
Supplement";}i:85;a:3:{i:0;s:6:"0x2F00";i:1;s:6:"0x2FDF";i:2;s:15:"Kangxi Radicals";}i:86;a:3:{i:0;s:6:"0x2FF0";i:1;s:6:"0x2FFF";i:2;s:34:"Ideographic Description Characters";}i:87;a:3:{i:0;s:6:"0x3000";i:1;s:6:"0x303F";i:2;s:27:"CJK Symbols and Punctuation";}i:88;a:3:{i:0;s:6:"0x3040";i:1;s:6:"0x309F";i:2;s:8:"Hiragana";}i:89;a:3:{i:0;s:6:"0x30A0";i:1;s:6:"0x30FF";i:2;s:8:"Katakana";}i:90;a:3:{i:0;s:6:"0x3100";i:1;s:6:"0x312F";i:2;s:8:"Bopomofo";}i:91;a:3:{i:0;s:6:"0x3130";i:1;s:6:"0x318F";i:2;s:25:"Hangul Compatibility Jamo";}i:92;a:3:{i:0;s:6:"0x3190";i:1;s:6:"0x319F";i:2;s:6:"Kanbun";}i:93;a:3:{i:0;s:6:"0x31A0";i:1;s:6:"0x31BF";i:2;s:17:"Bopomofo Extended";}i:94;a:3:{i:0;s:6:"0x31C0";i:1;s:6:"0x31EF";i:2;s:11:"CJK Strokes";}i:95;a:3:{i:0;s:6:"0x31F0";i:1;s:6:"0x31FF";i:2;s:28:"Katakana Phonetic Extensions";}i:96;a:3:{i:0;s:6:"0x3200";i:1;s:6:"0x32FF";i:2;s:31:"Enclosed CJK Letters and Months";}i:97;a:3:{i:0;s:6:"0x3300";i:1;s:6:"0x33FF";i:2;s:17:"CJK Compatibility";}i:98;a:3:{i:0;s:6:"0x3400";i:1;s:6:"0x4DBF";i:2;s:34:"CJK Unified Ideographs Extension A";}i:99;a:3:{i:0;s:6:"0x4DC0";i:1;s:6:"0x4DFF";i:2;s:23:"Yijing Hexagram Symbols";}i:100;a:3:{i:0;s:6:"0x4E00";i:1;s:6:"0x9FFF";i:2;s:22:"CJK Unified Ideographs";}i:101;a:3:{i:0;s:6:"0xA000";i:1;s:6:"0xA48F";i:2;s:12:"Yi Syllables";}i:102;a:3:{i:0;s:6:"0xA490";i:1;s:6:"0xA4CF";i:2;s:11:"Yi Radicals";}i:103;a:3:{i:0;s:6:"0xA700";i:1;s:6:"0xA71F";i:2;s:21:"Modifier Tone Letters";}i:104;a:3:{i:0;s:6:"0xA800";i:1;s:6:"0xA82F";i:2;s:12:"Syloti Nagri";}i:105;a:3:{i:0;s:6:"0xAC00";i:1;s:6:"0xD7AF";i:2;s:16:"Hangul Syllables";}i:106;a:3:{i:0;s:6:"0xD800";i:1;s:6:"0xDB7F";i:2;s:15:"High Surrogates";}i:107;a:3:{i:0;s:6:"0xDB80";i:1;s:6:"0xDBFF";i:2;s:27:"High Private Use Surrogates";}i:108;a:3:{i:0;s:6:"0xDC00";i:1;s:6:"0xDFFF";i:2;s:14:"Low Surrogates";}i:109;a:3:{i:0;s:6:"0xE000";i:1;s:6:"0xF8FF";i:2;s:16:"Private Use Area";}i:110;a:3:{i:0;s:6:"0xF900";i:1;s:6:"0xFAFF";i:2;s:28:"CJK Compatibility 
Ideographs";}i:111;a:3:{i:0;s:6:"0xFB00";i:1;s:6:"0xFB4F";i:2;s:29:"Alphabetic Presentation Forms";}i:112;a:3:{i:0;s:6:"0xFB50";i:1;s:6:"0xFDFF";i:2;s:27:"Arabic Presentation Forms-A";}i:113;a:3:{i:0;s:6:"0xFE00";i:1;s:6:"0xFE0F";i:2;s:19:"Variation Selectors";}i:114;a:3:{i:0;s:6:"0xFE10";i:1;s:6:"0xFE1F";i:2;s:14:"Vertical Forms";}i:115;a:3:{i:0;s:6:"0xFE20";i:1;s:6:"0xFE2F";i:2;s:20:"Combining Half Marks";}i:116;a:3:{i:0;s:6:"0xFE30";i:1;s:6:"0xFE4F";i:2;s:23:"CJK Compatibility Forms";}i:117;a:3:{i:0;s:6:"0xFE50";i:1;s:6:"0xFE6F";i:2;s:19:"Small Form Variants";}i:118;a:3:{i:0;s:6:"0xFE70";i:1;s:6:"0xFEFF";i:2;s:27:"Arabic Presentation Forms-B";}i:119;a:3:{i:0;s:6:"0xFF00";i:1;s:6:"0xFFEF";i:2;s:29:"Halfwidth and Fullwidth Forms";}i:120;a:3:{i:0;s:6:"0xFFF0";i:1;s:6:"0xFFFF";i:2;s:8:"Specials";}i:121;a:3:{i:0;s:7:"0x10000";i:1;s:7:"0x1007F";i:2;s:18:"Linear B Syllabary";}i:122;a:3:{i:0;s:7:"0x10080";i:1;s:7:"0x100FF";i:2;s:18:"Linear B Ideograms";}i:123;a:3:{i:0;s:7:"0x10100";i:1;s:7:"0x1013F";i:2;s:14:"Aegean Numbers";}i:124;a:3:{i:0;s:7:"0x10140";i:1;s:7:"0x1018F";i:2;s:21:"Ancient Greek Numbers";}i:125;a:3:{i:0;s:7:"0x10300";i:1;s:7:"0x1032F";i:2;s:10:"Old Italic";}i:126;a:3:{i:0;s:7:"0x10330";i:1;s:7:"0x1034F";i:2;s:6:"Gothic";}i:127;a:3:{i:0;s:7:"0x10380";i:1;s:7:"0x1039F";i:2;s:8:"Ugaritic";}i:128;a:3:{i:0;s:7:"0x103A0";i:1;s:7:"0x103DF";i:2;s:11:"Old Persian";}i:129;a:3:{i:0;s:7:"0x10400";i:1;s:7:"0x1044F";i:2;s:7:"Deseret";}i:130;a:3:{i:0;s:7:"0x10450";i:1;s:7:"0x1047F";i:2;s:7:"Shavian";}i:131;a:3:{i:0;s:7:"0x10480";i:1;s:7:"0x104AF";i:2;s:7:"Osmanya";}i:132;a:3:{i:0;s:7:"0x10800";i:1;s:7:"0x1083F";i:2;s:17:"Cypriot Syllabary";}i:133;a:3:{i:0;s:7:"0x10A00";i:1;s:7:"0x10A5F";i:2;s:10:"Kharoshthi";}i:134;a:3:{i:0;s:7:"0x1D000";i:1;s:7:"0x1D0FF";i:2;s:25:"Byzantine Musical Symbols";}i:135;a:3:{i:0;s:7:"0x1D100";i:1;s:7:"0x1D1FF";i:2;s:15:"Musical Symbols";}i:136;a:3:{i:0;s:7:"0x1D200";i:1;s:7:"0x1D24F";i:2;s:30:"Ancient Greek Musical 
Notation";}i:137;a:3:{i:0;s:7:"0x1D300";i:1;s:7:"0x1D35F";i:2;s:21:"Tai Xuan Jing Symbols";}i:138;a:3:{i:0;s:7:"0x1D400";i:1;s:7:"0x1D7FF";i:2;s:33:"Mathematical Alphanumeric Symbols";}i:139;a:3:{i:0;s:7:"0x20000";i:1;s:7:"0x2A6DF";i:2;s:34:"CJK Unified Ideographs Extension B";}i:140;a:3:{i:0;s:7:"0x2F800";i:1;s:7:"0x2FA1F";i:2;s:39:"CJK Compatibility Ideographs Supplement";}i:141;a:3:{i:0;s:7:"0xE0000";i:1;s:7:"0xE007F";i:2;s:4:"Tags";}i:142;a:3:{i:0;s:7:"0xE0100";i:1;s:7:"0xE01EF";i:2;s:30:"Variation Selectors Supplement";}i:143;a:3:{i:0;s:7:"0xF0000";i:1;s:7:"0xFFFFF";i:2;s:32:"Supplementary Private Use Area-A";}i:144;a:3:{i:0;s:8:"0x100000";i:1;s:8:"0x10FFFF";i:2;s:32:"Supplementary Private Use Area-B";}} \ No newline at end of file +a:145:{i:0;a:3:{i:0;i:0;i:1;i:127;i:2;s:11:"Basic Latin";}i:1;a:3:{i:0;i:128;i:1;i:255;i:2;s:18:"Latin-1 Supplement";}i:2;a:3:{i:0;i:256;i:1;i:383;i:2;s:16:"Latin Extended-A";}i:3;a:3:{i:0;i:384;i:1;i:591;i:2;s:16:"Latin Extended-B";}i:4;a:3:{i:0;i:592;i:1;i:687;i:2;s:14:"IPA Extensions";}i:5;a:3:{i:0;i:688;i:1;i:767;i:2;s:24:"Spacing Modifier Letters";}i:6;a:3:{i:0;i:768;i:1;i:879;i:2;s:27:"Combining Diacritical Marks";}i:7;a:3:{i:0;i:880;i:1;i:1023;i:2;s:16:"Greek and Coptic";}i:8;a:3:{i:0;i:1024;i:1;i:1279;i:2;s:8:"Cyrillic";}i:9;a:3:{i:0;i:1280;i:1;i:1327;i:2;s:19:"Cyrillic Supplement";}i:10;a:3:{i:0;i:1328;i:1;i:1423;i:2;s:8:"Armenian";}i:11;a:3:{i:0;i:1424;i:1;i:1535;i:2;s:6:"Hebrew";}i:12;a:3:{i:0;i:1536;i:1;i:1791;i:2;s:6:"Arabic";}i:13;a:3:{i:0;i:1792;i:1;i:1871;i:2;s:6:"Syriac";}i:14;a:3:{i:0;i:1872;i:1;i:1919;i:2;s:17:"Arabic 
Supplement";}i:15;a:3:{i:0;i:1920;i:1;i:1983;i:2;s:6:"Thaana";}i:16;a:3:{i:0;i:2304;i:1;i:2431;i:2;s:10:"Devanagari";}i:17;a:3:{i:0;i:2432;i:1;i:2559;i:2;s:7:"Bengali";}i:18;a:3:{i:0;i:2560;i:1;i:2687;i:2;s:8:"Gurmukhi";}i:19;a:3:{i:0;i:2688;i:1;i:2815;i:2;s:8:"Gujarati";}i:20;a:3:{i:0;i:2816;i:1;i:2943;i:2;s:5:"Oriya";}i:21;a:3:{i:0;i:2944;i:1;i:3071;i:2;s:5:"Tamil";}i:22;a:3:{i:0;i:3072;i:1;i:3199;i:2;s:6:"Telugu";}i:23;a:3:{i:0;i:3200;i:1;i:3327;i:2;s:7:"Kannada";}i:24;a:3:{i:0;i:3328;i:1;i:3455;i:2;s:9:"Malayalam";}i:25;a:3:{i:0;i:3456;i:1;i:3583;i:2;s:7:"Sinhala";}i:26;a:3:{i:0;i:3584;i:1;i:3711;i:2;s:4:"Thai";}i:27;a:3:{i:0;i:3712;i:1;i:3839;i:2;s:3:"Lao";}i:28;a:3:{i:0;i:3840;i:1;i:4095;i:2;s:7:"Tibetan";}i:29;a:3:{i:0;i:4096;i:1;i:4255;i:2;s:7:"Myanmar";}i:30;a:3:{i:0;i:4256;i:1;i:4351;i:2;s:8:"Georgian";}i:31;a:3:{i:0;i:4352;i:1;i:4607;i:2;s:11:"Hangul Jamo";}i:32;a:3:{i:0;i:4608;i:1;i:4991;i:2;s:8:"Ethiopic";}i:33;a:3:{i:0;i:4992;i:1;i:5023;i:2;s:19:"Ethiopic Supplement";}i:34;a:3:{i:0;i:5024;i:1;i:5119;i:2;s:8:"Cherokee";}i:35;a:3:{i:0;i:5120;i:1;i:5759;i:2;s:37:"Unified Canadian Aboriginal Syllabics";}i:36;a:3:{i:0;i:5760;i:1;i:5791;i:2;s:5:"Ogham";}i:37;a:3:{i:0;i:5792;i:1;i:5887;i:2;s:5:"Runic";}i:38;a:3:{i:0;i:5888;i:1;i:5919;i:2;s:7:"Tagalog";}i:39;a:3:{i:0;i:5920;i:1;i:5951;i:2;s:7:"Hanunoo";}i:40;a:3:{i:0;i:5952;i:1;i:5983;i:2;s:5:"Buhid";}i:41;a:3:{i:0;i:5984;i:1;i:6015;i:2;s:8:"Tagbanwa";}i:42;a:3:{i:0;i:6016;i:1;i:6143;i:2;s:5:"Khmer";}i:43;a:3:{i:0;i:6144;i:1;i:6319;i:2;s:9:"Mongolian";}i:44;a:3:{i:0;i:6400;i:1;i:6479;i:2;s:5:"Limbu";}i:45;a:3:{i:0;i:6480;i:1;i:6527;i:2;s:6:"Tai Le";}i:46;a:3:{i:0;i:6528;i:1;i:6623;i:2;s:11:"New Tai Lue";}i:47;a:3:{i:0;i:6624;i:1;i:6655;i:2;s:13:"Khmer Symbols";}i:48;a:3:{i:0;i:6656;i:1;i:6687;i:2;s:8:"Buginese";}i:49;a:3:{i:0;i:7424;i:1;i:7551;i:2;s:19:"Phonetic Extensions";}i:50;a:3:{i:0;i:7552;i:1;i:7615;i:2;s:30:"Phonetic Extensions Supplement";}i:51;a:3:{i:0;i:7616;i:1;i:7679;i:2;s:38:"Combining 
Diacritical Marks Supplement";}i:52;a:3:{i:0;i:7680;i:1;i:7935;i:2;s:25:"Latin Extended Additional";}i:53;a:3:{i:0;i:7936;i:1;i:8191;i:2;s:14:"Greek Extended";}i:54;a:3:{i:0;i:8192;i:1;i:8303;i:2;s:19:"General Punctuation";}i:55;a:3:{i:0;i:8304;i:1;i:8351;i:2;s:27:"Superscripts and Subscripts";}i:56;a:3:{i:0;i:8352;i:1;i:8399;i:2;s:16:"Currency Symbols";}i:57;a:3:{i:0;i:8400;i:1;i:8447;i:2;s:39:"Combining Diacritical Marks for Symbols";}i:58;a:3:{i:0;i:8448;i:1;i:8527;i:2;s:18:"Letterlike Symbols";}i:59;a:3:{i:0;i:8528;i:1;i:8591;i:2;s:12:"Number Forms";}i:60;a:3:{i:0;i:8592;i:1;i:8703;i:2;s:6:"Arrows";}i:61;a:3:{i:0;i:8704;i:1;i:8959;i:2;s:22:"Mathematical Operators";}i:62;a:3:{i:0;i:8960;i:1;i:9215;i:2;s:23:"Miscellaneous Technical";}i:63;a:3:{i:0;i:9216;i:1;i:9279;i:2;s:16:"Control Pictures";}i:64;a:3:{i:0;i:9280;i:1;i:9311;i:2;s:29:"Optical Character Recognition";}i:65;a:3:{i:0;i:9312;i:1;i:9471;i:2;s:22:"Enclosed Alphanumerics";}i:66;a:3:{i:0;i:9472;i:1;i:9599;i:2;s:11:"Box Drawing";}i:67;a:3:{i:0;i:9600;i:1;i:9631;i:2;s:14:"Block Elements";}i:68;a:3:{i:0;i:9632;i:1;i:9727;i:2;s:16:"Geometric Shapes";}i:69;a:3:{i:0;i:9728;i:1;i:9983;i:2;s:21:"Miscellaneous Symbols";}i:70;a:3:{i:0;i:9984;i:1;i:10175;i:2;s:8:"Dingbats";}i:71;a:3:{i:0;i:10176;i:1;i:10223;i:2;s:36:"Miscellaneous Mathematical Symbols-A";}i:72;a:3:{i:0;i:10224;i:1;i:10239;i:2;s:21:"Supplemental Arrows-A";}i:73;a:3:{i:0;i:10240;i:1;i:10495;i:2;s:16:"Braille Patterns";}i:74;a:3:{i:0;i:10496;i:1;i:10623;i:2;s:21:"Supplemental Arrows-B";}i:75;a:3:{i:0;i:10624;i:1;i:10751;i:2;s:36:"Miscellaneous Mathematical Symbols-B";}i:76;a:3:{i:0;i:10752;i:1;i:11007;i:2;s:35:"Supplemental Mathematical Operators";}i:77;a:3:{i:0;i:11008;i:1;i:11263;i:2;s:32:"Miscellaneous Symbols and Arrows";}i:78;a:3:{i:0;i:11264;i:1;i:11359;i:2;s:10:"Glagolitic";}i:79;a:3:{i:0;i:11392;i:1;i:11519;i:2;s:6:"Coptic";}i:80;a:3:{i:0;i:11520;i:1;i:11567;i:2;s:19:"Georgian 
Supplement";}i:81;a:3:{i:0;i:11568;i:1;i:11647;i:2;s:8:"Tifinagh";}i:82;a:3:{i:0;i:11648;i:1;i:11743;i:2;s:17:"Ethiopic Extended";}i:83;a:3:{i:0;i:11776;i:1;i:11903;i:2;s:24:"Supplemental Punctuation";}i:84;a:3:{i:0;i:11904;i:1;i:12031;i:2;s:23:"CJK Radicals Supplement";}i:85;a:3:{i:0;i:12032;i:1;i:12255;i:2;s:15:"Kangxi Radicals";}i:86;a:3:{i:0;i:12272;i:1;i:12287;i:2;s:34:"Ideographic Description Characters";}i:87;a:3:{i:0;i:12288;i:1;i:12351;i:2;s:27:"CJK Symbols and Punctuation";}i:88;a:3:{i:0;i:12352;i:1;i:12447;i:2;s:8:"Hiragana";}i:89;a:3:{i:0;i:12448;i:1;i:12543;i:2;s:8:"Katakana";}i:90;a:3:{i:0;i:12544;i:1;i:12591;i:2;s:8:"Bopomofo";}i:91;a:3:{i:0;i:12592;i:1;i:12687;i:2;s:25:"Hangul Compatibility Jamo";}i:92;a:3:{i:0;i:12688;i:1;i:12703;i:2;s:6:"Kanbun";}i:93;a:3:{i:0;i:12704;i:1;i:12735;i:2;s:17:"Bopomofo Extended";}i:94;a:3:{i:0;i:12736;i:1;i:12783;i:2;s:11:"CJK Strokes";}i:95;a:3:{i:0;i:12784;i:1;i:12799;i:2;s:28:"Katakana Phonetic Extensions";}i:96;a:3:{i:0;i:12800;i:1;i:13055;i:2;s:31:"Enclosed CJK Letters and Months";}i:97;a:3:{i:0;i:13056;i:1;i:13311;i:2;s:17:"CJK Compatibility";}i:98;a:3:{i:0;i:13312;i:1;i:19903;i:2;s:34:"CJK Unified Ideographs Extension A";}i:99;a:3:{i:0;i:19904;i:1;i:19967;i:2;s:23:"Yijing Hexagram Symbols";}i:100;a:3:{i:0;i:19968;i:1;i:40959;i:2;s:22:"CJK Unified Ideographs";}i:101;a:3:{i:0;i:40960;i:1;i:42127;i:2;s:12:"Yi Syllables";}i:102;a:3:{i:0;i:42128;i:1;i:42191;i:2;s:11:"Yi Radicals";}i:103;a:3:{i:0;i:42752;i:1;i:42783;i:2;s:21:"Modifier Tone Letters";}i:104;a:3:{i:0;i:43008;i:1;i:43055;i:2;s:12:"Syloti Nagri";}i:105;a:3:{i:0;i:44032;i:1;i:55215;i:2;s:16:"Hangul Syllables";}i:106;a:3:{i:0;i:55296;i:1;i:56191;i:2;s:15:"High Surrogates";}i:107;a:3:{i:0;i:56192;i:1;i:56319;i:2;s:27:"High Private Use Surrogates";}i:108;a:3:{i:0;i:56320;i:1;i:57343;i:2;s:14:"Low Surrogates";}i:109;a:3:{i:0;i:57344;i:1;i:63743;i:2;s:16:"Private Use Area";}i:110;a:3:{i:0;i:63744;i:1;i:64255;i:2;s:28:"CJK Compatibility 
Ideographs";}i:111;a:3:{i:0;i:64256;i:1;i:64335;i:2;s:29:"Alphabetic Presentation Forms";}i:112;a:3:{i:0;i:64336;i:1;i:65023;i:2;s:27:"Arabic Presentation Forms-A";}i:113;a:3:{i:0;i:65024;i:1;i:65039;i:2;s:19:"Variation Selectors";}i:114;a:3:{i:0;i:65040;i:1;i:65055;i:2;s:14:"Vertical Forms";}i:115;a:3:{i:0;i:65056;i:1;i:65071;i:2;s:20:"Combining Half Marks";}i:116;a:3:{i:0;i:65072;i:1;i:65103;i:2;s:23:"CJK Compatibility Forms";}i:117;a:3:{i:0;i:65104;i:1;i:65135;i:2;s:19:"Small Form Variants";}i:118;a:3:{i:0;i:65136;i:1;i:65279;i:2;s:27:"Arabic Presentation Forms-B";}i:119;a:3:{i:0;i:65280;i:1;i:65519;i:2;s:29:"Halfwidth and Fullwidth Forms";}i:120;a:3:{i:0;i:65520;i:1;i:65535;i:2;s:8:"Specials";}i:121;a:3:{i:0;i:65536;i:1;i:65663;i:2;s:18:"Linear B Syllabary";}i:122;a:3:{i:0;i:65664;i:1;i:65791;i:2;s:18:"Linear B Ideograms";}i:123;a:3:{i:0;i:65792;i:1;i:65855;i:2;s:14:"Aegean Numbers";}i:124;a:3:{i:0;i:65856;i:1;i:65935;i:2;s:21:"Ancient Greek Numbers";}i:125;a:3:{i:0;i:66304;i:1;i:66351;i:2;s:10:"Old Italic";}i:126;a:3:{i:0;i:66352;i:1;i:66383;i:2;s:6:"Gothic";}i:127;a:3:{i:0;i:66432;i:1;i:66463;i:2;s:8:"Ugaritic";}i:128;a:3:{i:0;i:66464;i:1;i:66527;i:2;s:11:"Old Persian";}i:129;a:3:{i:0;i:66560;i:1;i:66639;i:2;s:7:"Deseret";}i:130;a:3:{i:0;i:66640;i:1;i:66687;i:2;s:7:"Shavian";}i:131;a:3:{i:0;i:66688;i:1;i:66735;i:2;s:7:"Osmanya";}i:132;a:3:{i:0;i:67584;i:1;i:67647;i:2;s:17:"Cypriot Syllabary";}i:133;a:3:{i:0;i:68096;i:1;i:68191;i:2;s:10:"Kharoshthi";}i:134;a:3:{i:0;i:118784;i:1;i:119039;i:2;s:25:"Byzantine Musical Symbols";}i:135;a:3:{i:0;i:119040;i:1;i:119295;i:2;s:15:"Musical Symbols";}i:136;a:3:{i:0;i:119296;i:1;i:119375;i:2;s:30:"Ancient Greek Musical Notation";}i:137;a:3:{i:0;i:119552;i:1;i:119647;i:2;s:21:"Tai Xuan Jing Symbols";}i:138;a:3:{i:0;i:119808;i:1;i:120831;i:2;s:33:"Mathematical Alphanumeric Symbols";}i:139;a:3:{i:0;i:131072;i:1;i:173791;i:2;s:34:"CJK Unified Ideographs Extension B";}i:140;a:3:{i:0;i:194560;i:1;i:195103;i:2;s:39:"CJK 
Compatibility Ideographs Supplement";}i:141;a:3:{i:0;i:917504;i:1;i:917631;i:2;s:4:"Tags";}i:142;a:3:{i:0;i:917760;i:1;i:917999;i:2;s:30:"Variation Selectors Supplement";}i:143;a:3:{i:0;i:983040;i:1;i:1048575;i:2;s:32:"Supplementary Private Use Area-A";}i:144;a:3:{i:0;i:1048576;i:1;i:1114111;i:2;s:32:"Supplementary Private Use Area-B";}} \ No newline at end of file diff --git a/libraries/readability/Readability.php b/libraries/readability/Readability.php index 8a3fb73..9ff38f4 100644 --- a/libraries/readability/Readability.php +++ b/libraries/readability/Readability.php @@ -122,6 +122,7 @@ class Readability if ($parser=='gumbo') { // Can we avoid this encoding/deocding step? Test on: // http://www.medialens.org/index.php/alerts/alert-archive/2017/837-undermining-democracy-corporate-media-bias-on-jeremy-corbyn-boris-johnson-and-syria.html + $html = str_replace(''', "'", $html); // other named entities handled okay $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"); $html = mb_convert_encoding($html, "UTF-8", 'HTML-ENTITIES'); $this->dom = @Layershifter\Gumbo\Parser::load($html); diff --git a/makefulltextfeed.php b/makefulltextfeed.php index ebaa01d..7592bf3 100644 --- a/makefulltextfeed.php +++ b/makefulltextfeed.php @@ -3,8 +3,8 @@ // Author: Keyvan Minoukadeh // Copyright (c) 2017 Keyvan Minoukadeh // License: AGPLv3 -// Version: 3.7 -// Date: 2017-02-12 +// Version: 3.8 +// Date: 2017-09-25 // More info: http://fivefilters.org/content-only/ // Help: http://help.fivefilters.org @@ -183,7 +183,9 @@ if (!isset($_REQUEST['url'])) { die('No URL supplied'); } $url = trim($_REQUEST['url']); -if (strtolower(substr($url, 0, 7)) == 'feed://') { +if (strtolower(substr($url, 0, 6)) == 'sec://') { + $url = 'https://'.substr($url, 6); +} elseif (strtolower(substr($url, 0, 7)) == 'feed://') { $url = 'http://'.substr($url, 7); } if (!preg_match('!^https?://.+!i', $url)) { @@ -345,10 +347,10 @@ if ($options->content === 'user') { // HTML5 output? 
/////////////////////////////////////////////// if ($options->html5_output === 'user') { - if (isset($_REQUEST['content']) && $_REQUEST['content'] === 'html5') { - $options->html5_output = true; - } else { + if (isset($_REQUEST['content']) && $_REQUEST['content'] === '1') { $options->html5_output = false; + } else { + $options->html5_output = true; } } @@ -820,7 +822,7 @@ foreach ($items as $key => $item) { continue; // skip this feed item entry } } - $base_url = get_base_url($readability->dom); + $base_url = get_base_url($readability->dom, $effective_url); if (!$base_url) $base_url = $effective_url; $content_block = ($extract_result) ? $extractor->getContent() : null; $extracted_title = ($extract_result) ? $extractor->getTitle() : ''; @@ -945,6 +947,7 @@ foreach ($items as $key => $item) { //unset($content_block); // post-processing cleanup $html = preg_replace('!

[\s\h\v]*

!u', '', $html); + $html = str_replace('

 

', '', $html); if ($links == 'remove') { $html = preg_replace('!]*>!', '', $html); $html = preg_replace('!!', '', $html); @@ -1080,6 +1083,7 @@ foreach ($items as $key => $item) { $l_result = $l->detect($text_sample, 1); if (count($l_result) > 0) { $language = key($l_result); + debug('Language detected: '.$language); } } } catch (Exception $e) { @@ -1248,6 +1252,17 @@ function get_self_url() { } function validate_url($url) { + if (function_exists('idn_to_ascii')) { + if ($host = @parse_url($url, PHP_URL_HOST)) { + $puny = idn_to_ascii($host, 0, INTL_IDNA_VARIANT_UTS46); + if ($host != $puny) { + $pos = strpos($url, $host); + if ($pos !== false) { + $url = substr_replace($url, $puny, $pos, strlen($host)); + } + } + } + } $url = filter_var($url, FILTER_SANITIZE_URL); $test = filter_var($url, FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED); // deal with bug http://bugs.php.net/51192 (present in PHP 5.2.13 and PHP 5.3.2) @@ -1261,9 +1276,14 @@ function validate_url($url) { } } -function get_base_url($dom) { +function get_base_url($dom, $url=null) { $xpath = new DOMXPath($dom); - return @$xpath->evaluate('string(//head/base/@href)', $dom); + $base = @$xpath->evaluate('string(//head/base/@href)', $dom); + if (!$base) return false; + if (isset($url) && !preg_match('!^https?://!i', $base)) { + $base = make_absolute_str($url, $base); + } + return $base; } function is_ssl() { @@ -1436,7 +1456,7 @@ function make_absolute_attr($base, $e, $attr) { $url = str_replace(' ', '%20', $url); if (!preg_match('!https?://!i', $url)) { if ($absolute = SimplePie_IRI::absolutize($base, $url)) { - $e->setAttribute($attr, $absolute); + $e->setAttribute($attr, $absolute->get_uri()); } } } @@ -1450,7 +1470,7 @@ function make_absolute_str($base, $url) { return $url; } else { if ($absolute = SimplePie_IRI::absolutize($base, $url)) { - return $absolute; + return $absolute->get_uri(); } return false; } @@ -1529,7 +1549,7 @@ function get_single_page($item, $html, $url) { } } } - $base_url = 
get_base_url($readability->dom); + $base_url = get_base_url($readability->dom, $url); if (!$base_url) $base_url = $url; // If we've got URL, resolve against $base_url if (isset($single_page_url) && ($single_page_url = make_absolute_str($base_url, $single_page_url))) {