From e7753953f6dd5d69b889956f42e130d2d62f6514 Mon Sep 17 00:00:00 2001 From: "FiveFilters.org" Date: Sun, 14 Jun 2015 02:03:20 +0200 Subject: [PATCH] Full-Text RSS 3.4 --- admin/edit-pattern.php | 1 + admin/update.php | 24 +- cache/index.php | 3 +- cache/rss-with-key/index.php | 2 + cache/rss/index.php | 2 + changelog.txt | 18 + config.php | 61 +- css/feed.xsl | 7 +- extract.php | 2 +- index.php | 69 +- libraries/DisableSimplePieSanitize.php | 6 + .../content-extractor/ContentExtractor.php | 33 +- libraries/content-extractor/SiteConfig.php | 12 +- libraries/feedwriter/FeedWriter.php | 52 +- libraries/html5php/HTML5.php | 406 +- libraries/html5php/HTML5/Elements.php | 1171 ++--- libraries/html5php/HTML5/Entities.php | 4460 +++++++++-------- libraries/html5php/HTML5/Exception.php | 5 +- .../html5php/HTML5/InstructionProcessor.php | 56 +- .../HTML5/Parser/CharacterReference.php | 91 +- .../html5php/HTML5/Parser/DOMTreeBuilder.php | 1063 ++-- .../html5php/HTML5/Parser/EventHandler.php | 203 +- .../html5php/HTML5/Parser/FileInputStream.php | 47 +- .../html5php/HTML5/Parser/InputStream.php | 141 +- .../html5php/HTML5/Parser/ParseError.php | 5 +- libraries/html5php/HTML5/Parser/Scanner.php | 371 +- .../HTML5/Parser/StringInputStream.php | 524 +- libraries/html5php/HTML5/Parser/Tokenizer.php | 1975 ++++---- .../HTML5/Parser/TreeBuildingRules.php | 220 +- libraries/html5php/HTML5/Parser/UTF8Utils.php | 258 +- .../HTML5/Serializer/HTML5Entities.php | 3035 +++++------ .../html5php/HTML5/Serializer/OutputRules.php | 711 ++- .../HTML5/Serializer/RulesInterface.php | 167 +- .../html5php/HTML5/Serializer/Traverser.php | 246 +- libraries/html5php/autoloader.php | 3 +- .../humble-http-agent/HumbleHttpAgent.php | 6 +- libraries/readability/Readability.php | 9 +- makefulltextfeed.php | 235 +- manifest.yml | 16 - 39 files changed, 8217 insertions(+), 7499 deletions(-) create mode 100644 cache/rss-with-key/index.php create mode 100644 cache/rss/index.php create mode 100644 
libraries/DisableSimplePieSanitize.php delete mode 100644 manifest.yml diff --git a/admin/edit-pattern.php b/admin/edit-pattern.php index d3ba4f1..d22c1a1 100644 --- a/admin/edit-pattern.php +++ b/admin/edit-pattern.php @@ -60,6 +60,7 @@ tpl_header('Edit site patterns'); $version = file_get_contents('../site_config/standard/version.txt'); function filter_only_text($filename) { + if ($filename === 'version.txt') return false; return (strtolower(substr($filename, -4)) == '.txt'); } function is_valid_hostname($host) { diff --git a/admin/update.php b/admin/update.php index a34c9e4..db15e5a 100644 --- a/admin/update.php +++ b/admin/update.php @@ -3,7 +3,7 @@ // Author: Keyvan Minoukadeh // Copyright (c) 2014 Keyvan Minoukadeh // License: AGPLv3 -// Date: 2013-05-02 +// Date: 2014-08-19 // More info: http://fivefilters.org/content-only/ // Help: http://help.fivefilters.org @@ -36,6 +36,8 @@ ini_set("display_errors", 1); //////////////////////////////// $admin_page = 'update'; require_once('../config.php'); +require_once('../libraries/humble-http-agent/HumbleHttpAgent.php'); +require_once('../libraries/humble-http-agent/CookieJar.php'); require_once 'template.php'; tpl_header('Update site patterns'); @@ -129,18 +131,21 @@ if ($_REQUEST['key'] !== $admin_hash) { // Check for updates ////////////////////////////////// //$ff_version = @file_get_contents('http://fivefilters.org/content-only/site_config/standard/version.txt'); -$_context = stream_context_create(array('http' => array('user_agent' => 'PHP/5.4'))); -$latest_info_json = @file_get_contents('https://api.github.com/repos/fivefilters/ftr-site-config', false, $_context); +$http = new HumbleHttpAgent(); +$latest_info_json = $http->get('https://api.github.com/repos/fivefilters/ftr-site-config'); +//$_context = stream_context_create(array('http' => array('user_agent' => 'PHP/5.5'), 'ssl'=>array('verify_peer'=>false))); +//$latest_info_json = file_get_contents('https://api.github.com/repos/fivefilters/ftr-site-config', 
false, $_context); if (!$latest_info_json) { println("Sorry, couldn't get info on latest site config files. Please try again later or contact us."); exit; } +$latest_info_json = $latest_info_json['body']; $latest_info_json = @json_decode($latest_info_json); if (!is_object($latest_info_json)) { println("Sorry, couldn't parse JSON from GitHub. Please try again later or contact us."); exit; } -$ff_version = $latest_info_json->updated_at; +$ff_version = $latest_info_json->pushed_at; if ($version == $ff_version) { die('Your site config files are up to date! If you have trouble extracting from a particular site, please email us: help@fivefilters.org'); } else { @@ -166,8 +171,15 @@ if (file_exists($tmp_old_local_dir)) { $standard_local_dir = '../site_config/standard/'; //@copy($latest_remote, $tmp_latest_local); //copy() does not appear to fill $http_response_header in certain environments -@file_put_contents($tmp_latest_local, @file_get_contents($latest_remote)); -$headers = implode("\n", $http_response_header); +//@file_put_contents($tmp_latest_local, @file_get_contents($latest_remote, false, $_context)); +$latest_remote_response = $http->get($latest_remote); +if (!is_array($latest_remote_response)) { + println("Sorry, something went wrong. Please contact us if the problem persists."); + exit; +} +@file_put_contents($tmp_latest_local, $latest_remote_response['body']); +//$headers = implode("\n", $http_response_header); +$headers = $latest_remote_response['headers']; //var_dump($headers); exit; if ((strpos($headers, 'HTTP/1.0 200') === false) && (strpos($headers, 'HTTP/1.1 200') === false)) { println("Sorry, something went wrong. 
Please contact us if the problem persists."); diff --git a/cache/index.php b/cache/index.php index a3d5f73..76ca8b3 100644 --- a/cache/index.php +++ b/cache/index.php @@ -1,3 +1,2 @@ \ No newline at end of file +// this is here to prevent directory listing over the web \ No newline at end of file diff --git a/cache/rss-with-key/index.php b/cache/rss-with-key/index.php new file mode 100644 index 0000000..76ca8b3 --- /dev/null +++ b/cache/rss-with-key/index.php @@ -0,0 +1,2 @@ +Native Ad for articles which appear to be native ads. + - New config option: user_submitted_config to determine whether siteconfig parameter is enabled or not + - Feed output now includes with URL of the generated feed + - Feed output now includes with URL of the original (input) URL + - Feed output now includes with URL to subscribe to the generated feed (using subtome.com) + - Feed preview stylesheet (feed.xsl) now presents a subscribe to feed link + - Fixed character encoding issue for certain texts + - Fixed character encoding issue for certain characters in HTML5 parsing mode + - Use base element, if present in HTML, when rewriting URLs + - HTML5-PHP library updated + - Other minor fixes/improvements + 3.3 (2014-05-13) - Content extractor now looks for Schema.org articleBody elements - New endpoint extract.php for developers looking for simpler JSON results (no RSS as input/output) diff --git a/config.php b/config.php index e8719d4..1deb0ca 100644 --- a/config.php +++ b/config.php @@ -187,11 +187,28 @@ $options->keep_enclosures = true; // Values will be placed inside the element inside each element // Possible values: // * Ignore language: 0 -// * Use article/feed metadata (e.g. HTML lang attribute): 1 (default) +// * Use article/feed metadata (e.g. HTML lang attribute): 1 // * As above, but guess if not present: 2 // * Always guess: 3 -// * User decides: 'user' (value of 0-3 can be passed in querystring: e.g. 
&lang=2) -$options->detect_language = 1; +// * User decides: 'user' (value of 0-3 can be passed in querystring: e.g. &lang=2, &lang=1 will be default if nothing supplied) +$options->detect_language = 'user'; + +// Allow user-submitted site config in request +// --------------- +// If enabled, a user can submit site config rules directly in the request +// using the siteconfig request parameter. Disabled (false) by default. +$options->user_submitted_config = false; + +// Remove items identified as native ads? +// --------------- +// Many news sites now carry native advertising - articles which have been +// paid for by a corporation to promote their brand or product. +// Full-Text RSS can identify such articles in certain sites. If an article +// is identified as being a native ad, we'll add a Native Ad +// element to the item. But you can also request that such ads be removed from +// the output altogether. To do so, set the option below to true. +// Note: this only has effect when the input URL is a feed, not a web page. +$options->remove_native_ads = false; ///////////////////////////////////////////////// /// RESTRICT ACCESS ///////////////////////////// @@ -213,6 +230,7 @@ $options->admin_credentials = array('username'=>'admin', 'password'=>''); // List of URLs (or parts of a URL) which the service will accept. // If the list is empty, all URLs (except those specified in the blocked list below) // will be permitted. +// Note: for feeds, this option applies to both feed URLs and item URLs within those feeds. // Empty: array(); // Non-empty example: array('example.com', 'anothersite.org'); $options->allowed_urls = array(); @@ -220,7 +238,8 @@ $options->allowed_urls = array(); // URLs to block // ---------------------- // List of URLs (or parts of a URL) which the service will not accept. -// Note: this list is ignored if allowed_urls is not empty +// Note: this list is ignored if allowed_urls is not empty. 
+// Note: for feeds, this option applies to both feed URLs and item URLs within those feeds. $options->blocked_urls = array(); // Key holder(s) only? @@ -231,22 +250,6 @@ $options->blocked_urls = array(); // key is provided. $options->key_required = false; -// Favour item titles in feed -// ---------------------- -// By default, when processing feeds, we assume item titles in the feed -// have not been truncated. So after processing web pages, the extracted titles -// are not used in the generated feed. If you prefer to have extracted titles in -// the feed you can either set this to false, in which case we will always favour -// extracted titles. Alternatively, if set to 'user' (default) we'll use the -// extracted title if you pass '&use_extracted_title' in the querystring. -// Possible values: -// * Favour feed titles: true -// * Favour extracted titles: false -// * Favour feed titles with user override: 'user' (default) -// Note: this has no effect when the input URL is to a web page - in these cases -// we always use the extracted title in the generated feed. -$options->favour_feed_titles = 'user'; - // Access keys (password protected access) // ------------------------------------ // NOTE: You do not need an API key from fivefilters.org to run your own @@ -307,6 +310,22 @@ $options->max_entries_with_key = 10; // false - disabled $options->xss_filter = 'user'; +// Favour item titles in feed +// ---------------------- +// By default, when processing feeds, we assume item titles in the feed +// have not been truncated. So after processing web pages, the extracted titles +// are not used in the generated feed. If you prefer to have extracted titles in +// the feed you can either set this to false, in which case we will always favour +// extracted titles. Alternatively, if set to 'user' (default) we'll use the +// extracted title if you pass '&use_extracted_title' in the querystring. 
+// Possible values: +// * Favour feed titles: true +// * Favour extracted titles: false +// * Favour feed titles with user override: 'user' (default) +// Note: this has no effect when the input URL is to a web page - in these cases +// we always use the extracted title in the generated feed. +$options->favour_feed_titles = 'user'; + // Allowed HTML parsers // ---------------------- // Full-Text RSS attempts to use PHP's libxml extension to process HTML. @@ -481,7 +500,7 @@ $options->cache_cleanup = 100; /// DO NOT CHANGE ANYTHING BELOW THIS /////////// ///////////////////////////////////////////////// -if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '3.3'); +if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '3.4'); if (basename(__FILE__) == 'config.php') { if (file_exists(dirname(__FILE__).'/custom_config.php')) { diff --git a/css/feed.xsl b/css/feed.xsl index 5684db7..89fd7a0 100644 --- a/css/feed.xsl +++ b/css/feed.xsl @@ -1,7 +1,8 @@ - + - + + @@ -11,7 +12,7 @@

(full-text feed)

-

You are viewing an auto-generated full-text RSS feed. RSS feeds allow you to stay up to date with the latest news and features you want from websites. To subscribe to it, you will need a News Reader or other similar device.

+

You are viewing an auto-generated full-text RSS feed. RSS feeds allow you to stay up to date with the latest news and features you want from websites.
Subscribe to this feed.

Below is the latest content available from this feed.

diff --git a/extract.php b/extract.php index 273319b..bed72da 100644 --- a/extract.php +++ b/extract.php @@ -45,7 +45,7 @@ HTTP/1.0 200 OK define('_FF_FTR_MODE', 'simple'); // Don't process URL as feed -$_POST['html'] = '1'; +$_POST['accept'] = 'html'; // JSON output only $_POST['format'] = 'json'; // Enable excerpts diff --git a/index.php b/index.php index 4dce4aa..27b1c3f 100644 --- a/index.php +++ b/index.php @@ -315,6 +315,12 @@ if (!defined('_FF_FTR_INDEX')) { html5php, libxml The default parser is libxml as it's the fastest. HTML5-PHP is an HTML5 parser implemented in PHP. It's slower than libxml, but can often produce better results. You can request HTML5-PHP be used as the parser in a site-specific config file (to ensure it gets used for all URLs for that site), or explicitly via this request parameter. + + + siteconfig + string + Site-specific extraction rules are usually stored in text files in the site_config folder. You can also submit extraction rules directly in your request using this parameter. + proxy @@ -361,43 +367,43 @@ if (!defined('_FF_FTR_INDEX')) { string (URL) This is the only required parameter. It should be the URL to a partial feed or a standard HTML page. You can omit the 'http://' prefix if you like. - + format rss (default), json The default Full-Text RSS output is RSS. The only other valid output format is JSON. To get JSON output, pass format=json in the querystring. Exclude it from the URL (or set it to ‘rss’) if you’d like RSS. - + summary 0 (default), 1 If set to 1, an excerpt will be included for each item in the output. - + content 0, 1 (default) If set to 0, the extracted content will not be included in the output. - - + + links preserve (default), footnotes, remove Links can either be preserved, made into footnotes, or removed. None of these options affect the link text, only the hyperlink itself. 
- + exc 0 (default), 1 If Full-Text RSS fails to extract the article body, the generated feed item will include a message saying extraction failed followed by the original item description (if present in the original feed). You ask Full-Text RSS to remove such items from the generated feed completely by passing 1 in this parameter. - + - html - 0 (default), 1 -

Treat input source as HTML (or parse-as-html-first mode). To enable, pass html=1 in the querystring. If enabled, Full-Text RSS will not attempt to parse the response as a feed. This increases performance slightly and should be used if you know that the URL is not a feed.

+ accept + auto (default), feed, html +

Tell Full-Text RSS what it should expect when fetching the input URL. By default Full-Text RSS tries to guess whether the response is a feed or regular HTML page. It's a good idea to be explicit by passing the appropriate type in this parameter. This is useful if, for example, a feed stops working and begins to return HTML or redirects to an HTML page as a result of site changes. In such a scenario, if you've been explicit about the URL being a feed, Full-Text RSS will not parse HTML returned in response. If you pass accept=html (previously html=1), Full-Text RSS will not attempt to parse the response as a feed. This increases performance slightly and should be used if you know that the URL is not a feed.

-

Note: If excluded, or set to 0, Full-Text RSS first tries to parse the server's response as a feed, and only if it fails to parse as a feed will it revert to HTML parsing. In the default parse-as-feed-first mode, Full-Text RSS will identify itself as PHP first and only if a valid feed is returned will it identify itself as a browser in subsequent requests to fetch the feed items. In parse-as-html-first mode, Full-Text RSS will identify itself as a browser from the very first request.

+

Note: If excluded, or set to auto, Full-Text RSS first tries to parse the server's response as a feed, and only if it fails to parse as a feed will it revert to HTML parsing. In the default parse-as-feed-first mode, Full-Text RSS will identify itself as PHP first and only if a valid feed is returned will it identify itself as a browser in subsequent requests to fetch the feed items. In parse-as-html mode, Full-Text RSS will identify itself as a browser from the very first request.

@@ -405,9 +411,9 @@ if (!defined('_FF_FTR_INDEX')) { 0 (default), 1

Use this to enable XSS filtering. We have not enabled this by default because we assume the majority of our users do not display the HTML retrieved by Full-Text RSS in a web page without further processing. If you subscribe to our generated feeds in your news reader application, it should, if it's good software, already filter the resulting HTML for XSS attacks, making it redundant for Full-Text RSS do the same. Similarly with frameworks/CMSs which display feed content - the content should be treated like any other user-submitted content.

-

If you are writing an application yourself which is processing feeds generated by Full-Text RSS, you can either filter the HTML yourself to remove potential XSS attacks or enable this option. This might be useful if you are processing our generated feeds with JavaScript on the client side - although there's client side xss filtering available too.

+

If you are writing an application yourself which is processing feeds generated by Full-Text RSS, you can either filter the HTML yourself to remove potential XSS attacks or enable this option. This might be useful if you are processing our generated feeds with JavaScript on the client side - although there's client side xss filtering available too.

-

If enabled, we'll pass retrieved HTML content through htmLawed (safe flag on and style attributes denied). Note: if enabled this will also remove certain elements you may want to preserve, such as iframes.

+

If enabled, we'll pass retrieved HTML content through htmLawed (safe flag on and style attributes denied). Note: if enabled this will also remove certain elements you may want to preserve, such as iframes.

@@ -444,6 +450,12 @@ if (!defined('_FF_FTR_INDEX')) { html5php, libxml The default parser is libxml as it's the fastest. HTML5-PHP is an HTML5 parser implemented in PHP. It's slower than libxml, but can often produce better results. You can request HTML5-PHP be used as the parser in a site-specific config file (to ensure it gets used for all URLs for that site), or explicitly via this request parameter. + + + siteconfig + string + Site-specific extraction rules are usually stored in text files in the site_config folder. You can also submit extraction rules directly in your request using this parameter. + proxy @@ -501,19 +513,24 @@ if (!defined('_FF_FTR_INDEX')) { - - key - string or number -

This parameter has two functions.

If you're calling Full-Text RSS programattically, it's better to use this parameter to provide the API key index number together with the hash parameter (see below) so that the actual API key does not get sent in the HTTP request.

If you pass the actual API key in this parameter, the hash parameter is not required. If you pass the actual API key to makefulltextfeed.php, Full-Text RSS will find the index number and generate the hash value automatically and redirect to a new URL to hide the API key. If you'd like to link to a generated feed publically while protecting your API key, make sure you copy and paste the URL that results after the redirect.

If you've configured Full-Text RSS to require a key, an invalid key will result in an error message.

- - - - hash - string - A SHA-1 hash value of the API key (actual key, not index number) and the URL supplied in the url parameter, concatenated. This parameter must be passed along with the API key's index number using the key parameter (see above). In PHP, for example: $hash = sha1($api_key.$url); - - - + + key + string or number +

This parameter has two functions.

If you're calling Full-Text RSS programmatically, it's better to use this parameter to provide the API key index number together with the hash parameter (see below) so that the actual API key does not get sent in the HTTP request.

If you pass the actual API key in this parameter, the hash parameter is not required. If you pass the actual API key, Full-Text RSS will find the index number and generate the hash value automatically and redirect to a new URL to hide the API key. If you'd like to link to a generated feed publicly while protecting your API key, make sure you copy and paste the URL that results after the redirect.

If you've configured Full-Text RSS to require a key, an invalid key will result in an error message.

+ + + + hash + string + A SHA-1 hash value of the API key (actual key, not index number) and the URL supplied in the url parameter, concatenated. This parameter must be passed along with the API key's index number using the key parameter (see above). In PHP, for example: $hash = sha1($api_key.$url); + + + + key_redirect + 0 or 1 (default) +

When supplying the API key with the key parameter, Full-Text RSS will generate a new URL and issue an HTTP redirect to the new URL to hide the API key (see description above). If you'd like to avoid an HTTP redirect, you can pass 0 in this parameter. We do not recommend you subscribe to feeds generated in this way.

+ + diff --git a/libraries/DisableSimplePieSanitize.php b/libraries/DisableSimplePieSanitize.php new file mode 100644 index 0000000..d276d19 --- /dev/null +++ b/libraries/DisableSimplePieSanitize.php @@ -0,0 +1,6 @@ +userSubmittedConfig (it gets reused) $this->html = null; $this->readability = null; $this->config = null; $this->title = null; + $this->nativeAd = false; $this->body = null; $this->author = array(); $this->language = null; @@ -156,7 +160,17 @@ class ContentExtractor // but it has problems of its own which we try to avoid with this option. public function process($html, $url, $smart_tidy=true) { $this->reset(); - $this->config = $this->buildSiteConfig($url, $html); + // use user submitted config and merge it with regular one + if (isset($this->userSubmittedConfig)) { + $this->debug('Using user-submitted site config'); + $this->config = $this->userSubmittedConfig; + if ($this->config->autodetect_on_failure()) { + $this->debug('Merging user-submitted site config with site config files associated with this URL and/or content'); + $this->config->append($this->buildSiteConfig($url, $html)); + } + } else { + $this->config = $this->buildSiteConfig($url, $html); + } // do string replacements if (!empty($this->config->find_string)) { @@ -225,6 +239,15 @@ class ContentExtractor } } + // check if this is a native ad + foreach ($this->config->native_ad_clue as $pattern) { + $elems = @$xpath->evaluate($pattern, $this->readability->dom); + if ($elems instanceof DOMNodeList && $elems->length > 0) { + $this->nativeAd = true; + break; + } + } + // try to get title foreach ($this->config->title as $pattern) { // $this->debug("Trying $pattern"); @@ -758,9 +781,17 @@ class ContentExtractor return false; } + public function setUserSubmittedConfig($config_string) { + $this->userSubmittedConfig = SiteConfig::build_from_string($config_string); + } + public function getContent() { return $this->body; } + + public function isNativeAd() { + return $this->nativeAd; + } public 
function getTitle() { return $this->title; diff --git a/libraries/content-extractor/SiteConfig.php b/libraries/content-extractor/SiteConfig.php index 8675ee2..60d465b 100644 --- a/libraries/content-extractor/SiteConfig.php +++ b/libraries/content-extractor/SiteConfig.php @@ -34,6 +34,9 @@ class SiteConfig // Strip images which contain these strings (0 or more) in the src attribute public $strip_image_src = array(); + + // Mark article as a native ad if any of these expressions match (0 or more xpath expressions) + public $native_ad_clue = array(); // Additional HTTP headers to send // NOT YET USED @@ -182,7 +185,7 @@ class SiteConfig public function append(SiteConfig $newconfig) { // check for commands where we accept multiple statements (no test_url) - foreach (array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header') as $var) { + foreach (array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'native_ad_clue', 'http_header') as $var) { // append array elements for this config variable from $newconfig to this config //$this->$var = $this->$var + $newconfig->$var; $this->$var = array_unique(array_merge($this->$var, $newconfig->$var)); @@ -323,6 +326,11 @@ class SiteConfig } } + public static function build_from_string($string) { + $config_lines = explode("\n", $string); + return self::build_from_array($config_lines); + } + public static function build_from_array(array $lines) { $config = new SiteConfig(); foreach ($lines as $line) { @@ -340,7 +348,7 @@ class SiteConfig if ($command == '' || $val == '') continue; // check for commands where we accept multiple statements - if (in_array($command, array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 
'http_header', 'test_url', 'find_string', 'replace_string'))) { + if (in_array($command, array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'native_ad_clue', 'http_header', 'test_url', 'find_string', 'replace_string'))) { array_push($config->$command, $val); // check for single statement commands that evaluate to true or false } elseif (in_array($command, array('tidy', 'prune', 'autodetect_on_failure'))) { diff --git a/libraries/feedwriter/FeedWriter.php b/libraries/feedwriter/FeedWriter.php index f9ea03b..8279d02 100644 --- a/libraries/feedwriter/FeedWriter.php +++ b/libraries/feedwriter/FeedWriter.php @@ -19,6 +19,8 @@ define('JSONP', 3, true); class FeedWriter { private $self = null; // self URL - http://feed2.w3.org/docs/warning/MissingAtomSelfLink.html + private $alternate = array(); // alternate URL and title + private $related = array(); // related URL and title private $hubs = array(); // PubSubHubbub hubs private $channels = array(); // Collection of channel elements private $items = array(); // Collection of items as object of FeedItem class. @@ -240,10 +242,36 @@ define('JSONP', 3, true); * @param string URL * @return void */ - public function setSelf($self) + public function setSelf($url) { - $this->self = $self; - } + $this->self = $url; + } + + /** + * Set alternate URL + * + * @access public + * @param string URL + * @param string title + * @return void + */ + public function setAlternate($url, $title) + { + $this->alternate = array('url'=>$url, 'title'=>$title); + } + + /** + * Set related URL + * + * @access public + * @param string URL + * @param string title + * @return void + */ + public function setRelated($url, $title) + { + $this->related = array('url'=>$url, 'title'=>$title); + } /** * Set the 'description' channel element @@ -299,7 +327,7 @@ define('JSONP', 3, true); { $out = ''."\n"; if ($this->xsl) $out .= 'xsl).'"?>' . 
PHP_EOL; - $out .= '' . PHP_EOL; + $out .= '' . PHP_EOL; echo $out; } elseif ($this->version == JSON || $this->version == JSONP) @@ -342,7 +370,7 @@ define('JSONP', 3, true); { foreach ($attributes as $key => $value) { - $attrText .= " $key=\"$value\" "; + $attrText .= " $key=\"".htmlspecialchars($value, ENT_COMPAT, 'UTF-8', false)."\" "; } } $nodeText .= "<{$tagName}{$attrText}>"; @@ -356,7 +384,7 @@ define('JSONP', 3, true); else { //$nodeText .= (in_array($tagName, $this->CDATAEncoding))? $tagContent : htmlentities($tagContent); - $nodeText .= htmlspecialchars($tagContent); + $nodeText .= htmlspecialchars($tagContent, ENT_COMPAT, 'UTF-8', false); } //$nodeText .= (in_array($tagName, $this->CDATAEncoding))? "]]>" : ""; $nodeText .= ""; @@ -408,13 +436,21 @@ define('JSONP', 3, true); // add hubs foreach ($this->hubs as $hub) { //echo $this->makeNode('link', '', array('rel'=>'hub', 'href'=>$hub, 'xmlns'=>'http://www.w3.org/2005/Atom')); - echo '' . PHP_EOL; + echo '' . PHP_EOL; } // add self if (isset($this->self)) { //echo $this->makeNode('link', '', array('rel'=>'self', 'href'=>$this->self, 'xmlns'=>'http://www.w3.org/2005/Atom')); - echo '' . PHP_EOL; + echo '' . PHP_EOL; } + // add alternate + if (isset($this->alternate)) { + echo '' . PHP_EOL; + } + // add related + if (isset($this->related)) { + echo '' . PHP_EOL; + } //Print Items of channel foreach ($this->channels as $key => $value) { diff --git a/libraries/html5php/HTML5.php b/libraries/html5php/HTML5.php index 7295fb4..2585fd4 100644 --- a/libraries/html5php/HTML5.php +++ b/libraries/html5php/HTML5.php @@ -1,220 +1,242 @@ false + ); - // If the serializer should encode all entities. - 'encode_entities' => FALSE, - ); + protected $errors = array(); - /** - * Load and parse an HTML file. - * - * This will apply the HTML5 parser, which is tolerant of many - * varieties of HTML, including XHTML 1, HTML 4, and well-formed HTML - * 3. Note that in these cases, not all of the old data will be - * preserved. 
For example, XHTML's XML declaration will be removed. - * - * The rules governing parsing are set out in the HTML 5 spec. - * - * @param string $file - * The path to the file to parse. If this is a resource, it is - * assumed to be an open stream whose pointer is set to the first - * byte of input. - * @return \DOMDocument - * A DOM document. These object type is defined by the libxml - * library, and should have been included with your version of PHP. - */ - public static function load($file) { - - // Handle the case where file is a resource. - if (is_resource($file)) { - // FIXME: We need a StreamInputStream class. - return static::loadHTML(stream_get_contents($file)); + public function __construct(array $options = array()) + { + $this->options = array_merge($this->options, $options); } - $input = new FileInputStream($file); - return static::parse($input); - } - - /** - * Parse a HTML Document from a string. - * - * Take a string of HTML 5 (or earlier) and parse it into a - * DOMDocument. - * - * @param string $string - * A html5 document as a string. - * @return \DOMDocument - * A DOM document. DOM is part of libxml, which is included with - * almost all distribtions of PHP. - */ - public static function loadHTML($string) { - $input = new StringInputStream($string); - return static::parse($input); - } - - /** - * Convenience function to load an HTML file. - * - * This is here to provide backwards compatibility with the - * PHP DOM implementation. It simply calls load(). - * - * @param string $file - * The path to the file to parse. If this is a resource, it is - * assumed to be an open stream whose pointer is set to the first - * byte of input. - * - * @return \DOMDocument - * A DOM document. These object type is defined by the libxml - * library, and should have been included with your version of PHP. - */ - public static function loadHTMLFile($file, $options = NULL) { - return static::load($file, $options); - } - - /** - * Parse a HTML fragment from a string. 
- * - * @param string $string - * The html5 fragment as a string. - * - * @return \DOMDocumentFragment - * A DOM fragment. The DOM is part of libxml, which is included with - * almost all distributions of PHP. - */ - public static function loadHTMLFragment($string) { - $input = new StringInputStream($string); - return static::parseFragment($input); - } - - /** - * Save a DOM into a given file as HTML5. - * - * @param mixed $dom - * The DOM to be serialized. - * @param string $file - * The filename to be written. - * @param array $options - * Configuration options when serializing the DOM. These include: - * - encode_entities: Text written to the output is escaped by default and not all - * entities are encoded. If this is set to TRUE all entities will be encoded. - * Defaults to FALSE. - */ - public static function save($dom, $file, $options = array()) { - $options = $options + static::options(); - $close = TRUE; - if (is_resource($file)) { - $stream = $file; - $close = FALSE; - } - else { - $stream = fopen($file, 'w'); + /** + * Get the default options. + * + * @return array The default options. + */ + public function getOptions() + { + return $this->options; } - $rules = new OutputRules($stream, $options); - $trav = new Traverser($dom, $stream, $rules, $options); - $trav->walk(); + /** + * Load and parse an HTML file. + * + * This will apply the HTML5 parser, which is tolerant of many + * varieties of HTML, including XHTML 1, HTML 4, and well-formed HTML + * 3. Note that in these cases, not all of the old data will be + * preserved. For example, XHTML's XML declaration will be removed. + * + * The rules governing parsing are set out in the HTML 5 spec. + * + * @param string $file + * The path to the file to parse. If this is a resource, it is + * assumed to be an open stream whose pointer is set to the first + * byte of input. + * @return \DOMDocument A DOM document. 
These object type is defined by the libxml + * library, and should have been included with your version of PHP. + */ + public function load($file) + { + // Handle the case where file is a resource. + if (is_resource($file)) { + // FIXME: We need a StreamInputStream class. + return $this->loadHTML(stream_get_contents($file)); + } - if ($close) { - fclose($stream); + $input = new FileInputStream($file); + + return $this->parse($input); } - } - /** - * Convert a DOM into an HTML5 string. - * - * @param mixed $dom - * The DOM to be serialized. - * @param array $options - * Configuration options when serializing the DOM. These include: - * - encode_entities: Text written to the output is escaped by default and not all - * entities are encoded. If this is set to TRUE all entities will be encoded. - * Defaults to FALSE. - * - * @return string - * A HTML5 documented generated from the DOM. - */ - public static function saveHTML($dom, $options = array()) { - $stream = fopen('php://temp', 'w'); - static::save($dom, $stream, $options); - return stream_get_contents($stream, -1, 0); - } + /** + * Parse a HTML Document from a string. + * + * Take a string of HTML 5 (or earlier) and parse it into a + * DOMDocument. + * + * @param string $string + * A html5 document as a string. + * @return \DOMDocument A DOM document. DOM is part of libxml, which is included with + * almost all distribtions of PHP. + */ + public function loadHTML($string) + { + $input = new StringInputStream($string); - /** - * Parse an input stream. - * - * Lower-level loading function. This requires an input stream instead - * of a string, file, or resource. - */ - public static function parse(\HTML5\Parser\InputStream $input) { - $events = new DOMTreeBuilder(); - $scanner = new Scanner($input); - $parser = new Tokenizer($scanner, $events); + return $this->parse($input); + } - $parser->parse(); + /** + * Convenience function to load an HTML file. 
+ * + * This is here to provide backwards compatibility with the + * PHP DOM implementation. It simply calls load(). + * + * @param string $file + * The path to the file to parse. If this is a resource, it is + * assumed to be an open stream whose pointer is set to the first + * byte of input. + * + * @return \DOMDocument A DOM document. These object type is defined by the libxml + * library, and should have been included with your version of PHP. + */ + public function loadHTMLFile($file) + { + return $this->load($file); + } - return $events->document(); - } + /** + * Parse a HTML fragment from a string. + * + * @param string $string + * The html5 fragment as a string. + * + * @return \DOMDocumentFragment A DOM fragment. The DOM is part of libxml, which is included with + * almost all distributions of PHP. + */ + public function loadHTMLFragment($string) + { + $input = new StringInputStream($string); - /** - * Parse an input stream where the stream is a fragment. - * - * Lower-level loading function. This requires an input stream instead - * of a string, file, or resource. - */ - public static function parseFragment(\HTML5\Parser\InputStream $input) { - $events = new DOMTreeBuilder(TRUE); - $scanner = new Scanner($input); - $parser = new Tokenizer($scanner, $events); + return $this->parseFragment($input); + } - $parser->parse(); + /** + * Return all errors encountered into parsing phase + * + * @return array + */ + public function getErrors() + { + return $this->errors; + } - return $events->fragment(); - } + /** + * Return true it some errors were encountered into parsing phase + * + * @return bool + */ + public function hasErrors() + { + return count($this->errors) > 0; + } - /** - * Get the default options. - * - * @return array - * The default options. - */ - public static function options() { - return static::$options; - } + /** + * Parse an input stream. + * + * Lower-level loading function. 
This requires an input stream instead + * of a string, file, or resource. + */ + public function parse(\Masterminds\HTML5\Parser\InputStream $input) + { + $this->errors = array(); + $events = new DOMTreeBuilder(false, $this->options); + $scanner = new Scanner($input); + $parser = new Tokenizer($scanner, $events); - /** - * Set a default option. - * - * @param string $name - * The option name. - * @param mixed $value - * The option value. - */ - public static function setOption($name, $value) { - static::$options[$name] = $value; - } + $parser->parse(); + $this->errors = $events->getErrors(); + return $events->document(); + } + + /** + * Parse an input stream where the stream is a fragment. + * + * Lower-level loading function. This requires an input stream instead + * of a string, file, or resource. + */ + public function parseFragment(\Masterminds\HTML5\Parser\InputStream $input) + { + $events = new DOMTreeBuilder(true, $this->options); + $scanner = new Scanner($input); + $parser = new Tokenizer($scanner, $events); + + $parser->parse(); + $this->errors = $events->getErrors(); + + return $events->fragment(); + } + + /** + * Save a DOM into a given file as HTML5. + * + * @param mixed $dom + * The DOM to be serialized. + * @param string $file + * The filename to be written. + * @param array $options + * Configuration options when serializing the DOM. These include: + * - encode_entities: Text written to the output is escaped by default and not all + * entities are encoded. If this is set to true all entities will be encoded. + * Defaults to false. 
+ */ + public function save($dom, $file, $options = array()) + { + $close = true; + if (is_resource($file)) { + $stream = $file; + $close = false; + } else { + $stream = fopen($file, 'w'); + } + $options = array_merge($this->getOptions(), $options); + $rules = new OutputRules($stream, $options); + $trav = new Traverser($dom, $stream, $rules, $options); + + $trav->walk(); + + if ($close) { + fclose($stream); + } + } + + /** + * Convert a DOM into an HTML5 string. + * + * @param mixed $dom + * The DOM to be serialized. + * @param array $options + * Configuration options when serializing the DOM. These include: + * - encode_entities: Text written to the output is escaped by default and not all + * entities are encoded. If this is set to true all entities will be encoded. + * Defaults to false. + * + * @return string A HTML5 documented generated from the DOM. + */ + public function saveHTML($dom, $options = array()) + { + $stream = fopen('php://temp', 'w'); + $this->save($dom, $stream, array_merge($this->getOptions(), $options)); + + return stream_get_contents($stream, - 1, 0); + } } diff --git a/libraries/html5php/HTML5/Elements.php b/libraries/html5php/HTML5/Elements.php index 69d3882..819ce0e 100644 --- a/libraries/html5php/HTML5/Elements.php +++ b/libraries/html5php/HTML5/Elements.php @@ -2,613 +2,628 @@ /** * Provide general element functions. */ -namespace HTML5; +namespace Masterminds\HTML5; /** - * This class provides general information about HTML5 elements, - * including syntactic and semantic issues. Parsers and serializers can - * use this class as a reference point for information about the rules + * This class provides general information about HTML5 elements, + * including syntactic and semantic issues. + * Parsers and serializers can + * use this class as a reference point for information about the rules * of various HTML5 elements. * * @todo consider using a bitmask table lookup. 
There is enough overlap in - * naming that this could significantly shrink the size and maybe make it - * faster. See the Go teams implementation at https://code.google.com/p/go/source/browse/html/atom. + * naming that this could significantly shrink the size and maybe make it + * faster. See the Go teams implementation at https://code.google.com/p/go/source/browse/html/atom. */ -class Elements { +class Elements +{ - /** Indicates an element is described in the specification. */ - const KNOWN_ELEMENT = 1; + /** + * Indicates an element is described in the specification. + */ + const KNOWN_ELEMENT = 1; - // From section 8.1.2: "script", "style" - // From 8.2.5.4.7 ("in body" insertion mode): "noembed", "noscript" - // From 8.4 "style", "xmp", "iframe", "noembed", "noframes" - /** Indicates the contained text should be processed as raw text. */ - const TEXT_RAW = 2; + // From section 8.1.2: "script", "style" + // From 8.2.5.4.7 ("in body" insertion mode): "noembed", "noscript" + // From 8.4 "style", "xmp", "iframe", "noembed", "noframes" + /** + * Indicates the contained text should be processed as raw text. + */ + const TEXT_RAW = 2; - // From section 8.1.2: "textarea", "title" - /** Indicates the contained text should be processed as RCDATA. */ - const TEXT_RCDATA = 4; + // From section 8.1.2: "textarea", "title" + /** + * Indicates the contained text should be processed as RCDATA. + */ + const TEXT_RCDATA = 4; - /** Indicates the tag cannot have content. */ - const VOID_TAG = 8; + /** + * Indicates the tag cannot have content. 
+ */ + const VOID_TAG = 8; - // "address", "article", "aside", "blockquote", "center", "details", "dialog", "dir", "div", "dl", - // "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "menu", - // "nav", "ol", "p", "section", "summary", "ul" - // "h1", "h2", "h3", "h4", "h5", "h6" - // "pre", "listing" - // "form" - // "plaintext" - /** - * Indicates that if a previous event is for a P tag, that element - * should be considered closed. - */ - const AUTOCLOSE_P = 16; + // "address", "article", "aside", "blockquote", "center", "details", "dialog", "dir", "div", "dl", + // "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "menu", + // "nav", "ol", "p", "section", "summary", "ul" + // "h1", "h2", "h3", "h4", "h5", "h6" + // "pre", "listing" + // "form" + // "plaintext" + /** + * Indicates that if a previous event is for a P tag, that element + * should be considered closed. + */ + const AUTOCLOSE_P = 16; - /** Indicates that the text inside is plaintext (pre). */ - const TEXT_PLAINTEXT = 32; + /** + * Indicates that the text inside is plaintext (pre). + */ + const TEXT_PLAINTEXT = 32; - // See https://developer.mozilla.org/en-US/docs/HTML/Block-level_elements - /** Indicates that the tag is a block. */ - const BLOCK_TAG = 64; + // See https://developer.mozilla.org/en-US/docs/HTML/Block-level_elements + /** + * Indicates that the tag is a block. + */ + const BLOCK_TAG = 64; + /** + * The HTML5 elements as defined in http://dev.w3.org/html5/markup/elements.html. 
+ * + * @var array + */ + public static $html5 = array( + "a" => 1, + "abbr" => 1, + "address" => 89, // NORMAL | VOID_TAG | AUTOCLOSE_P | BLOCK_TAG + "area" => 9, // NORMAL | VOID_TAG + "article" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG + "aside" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG + "audio" => 65, // NORMAL | BLOCK_TAG + "b" => 1, + "base" => 9, // NORMAL | VOID_TAG + "bdi" => 1, + "bdo" => 1, + "blockquote" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG + "body" => 1, + "br" => 9, // NORMAL | VOID_TAG + "button" => 1, + "canvas" => 65, // NORMAL | BLOCK_TAG + "caption" => 1, + "cite" => 1, + "code" => 1, + "col" => 9, // NORMAL | VOID_TAG + "colgroup" => 1, + "command" => 9, // NORMAL | VOID_TAG + // "data" => 1, // This is highly experimental and only part of the whatwg spec (not w3c). See https://developer.mozilla.org/en-US/docs/HTML/Element/data + "datalist" => 1, + "dd" => 65, // NORMAL | BLOCK_TAG + "del" => 1, + "details" => 17, // NORMAL | AUTOCLOSE_P, + "dfn" => 1, + "dialog" => 17, // NORMAL | AUTOCLOSE_P, + "div" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG + "dl" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG + "dt" => 1, + "em" => 1, + "embed" => 9, // NORMAL | VOID_TAG + "fieldset" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG + "figcaption" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG + "figure" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG + "footer" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG + "form" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG + "h1" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG + "h2" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG + "h3" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG + "h4" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG + "h5" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG + "h6" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG + "head" => 1, + "header" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG + "hgroup" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG + "hr" => 73, // NORMAL | VOID_TAG | BLOCK_TAG + "html" => 1, + "i" => 1, + "iframe" => 3, // 
NORMAL | TEXT_RAW + "img" => 9, // NORMAL | VOID_TAG + "input" => 9, // NORMAL | VOID_TAG + "kbd" => 1, + "ins" => 1, + "keygen" => 9, // NORMAL | VOID_TAG + "label" => 1, + "legend" => 1, + "li" => 1, + "link" => 9, // NORMAL | VOID_TAG + "map" => 1, + "mark" => 1, + "menu" => 17, // NORMAL | AUTOCLOSE_P, + "meta" => 9, // NORMAL | VOID_TAG + "meter" => 1, + "nav" => 17, // NORMAL | AUTOCLOSE_P, + "noscript" => 67, // NORMAL | TEXT_RAW | BLOCK_TAG + "object" => 1, + "ol" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG + "optgroup" => 1, + "option" => 1, + "output" => 65, // NORMAL | BLOCK_TAG + "p" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG + "param" => 9, // NORMAL | VOID_TAG + "pre" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG + "progress" => 1, + "q" => 1, + "rp" => 1, + "rt" => 1, + "ruby" => 1, + "s" => 1, + "samp" => 1, + "script" => 3, // NORMAL | TEXT_RAW + "section" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG + "select" => 1, + "small" => 1, + "source" => 9, // NORMAL | VOID_TAG + "span" => 1, + "strong" => 1, + "style" => 3, // NORMAL | TEXT_RAW + "sub" => 1, + "summary" => 17, // NORMAL | AUTOCLOSE_P, + "sup" => 1, + "table" => 65, // NORMAL | BLOCK_TAG + "tbody" => 1, + "td" => 1, + "textarea" => 5, // NORMAL | TEXT_RCDATA + "tfoot" => 65, // NORMAL | BLOCK_TAG + "th" => 1, + "thead" => 1, + "time" => 1, + "title" => 5, // NORMAL | TEXT_RCDATA + "tr" => 1, + "track" => 9, // NORMAL | VOID_TAG + "u" => 1, + "ul" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG + "var" => 1, + "video" => 65, // NORMAL | BLOCK_TAG + "wbr" => 9, // NORMAL | VOID_TAG - /** - * The HTML5 elements as defined in http://dev.w3.org/html5/markup/elements.html. 
- * @var array - */ - public static $html5 = array( - "a" => 1, - "abbr" => 1, - "address" => 89, // NORMAL | VOID_TAG | AUTOCLOSE_P | BLOCK_TAG - "area" => 9, // NORMAL | VOID_TAG - "article" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG - "aside" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG - "audio" => 65, // NORMAL | BLOCK_TAG - "b" => 1, - "base" => 9, // NORMAL | VOID_TAG - "bdi" => 1, - "bdo" => 1, - "blockquote" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG - "body" => 1, - "br" => 9, // NORMAL | VOID_TAG - "button" => 1, - "canvas" => 65, // NORMAL | BLOCK_TAG - "caption" => 1, - "cite" => 1, - "code" => 1, - "col" => 9, // NORMAL | VOID_TAG - "colgroup" => 1, - "command" => 9, // NORMAL | VOID_TAG - //"data" => 1, // This is highly experimental and only part of the whatwg spec (not w3c). See https://developer.mozilla.org/en-US/docs/HTML/Element/data - "datalist" => 1, - "dd" => 65, // NORMAL | BLOCK_TAG - "del" => 1, - "details" => 17, // NORMAL | AUTOCLOSE_P, - "dfn" => 1, - "dialog" => 17, // NORMAL | AUTOCLOSE_P, - "div" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG - "dl" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG - "dt" => 1, - "em" => 1, - "embed" => 9, // NORMAL | VOID_TAG - "fieldset" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG - "figcaption" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG - "figure" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG - "footer" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG - "form" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG - "h1" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG - "h2" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG - "h3" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG - "h4" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG - "h5" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG - "h6" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG - "head" => 1, - "header" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG - "hgroup" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG - "hr" => 73, // NORMAL | VOID_TAG | BLOCK_TAG - "html" => 1, - "i" => 1, - "iframe" => 3, // NORMAL | 
TEXT_RAW - "img" => 9, // NORMAL | VOID_TAG - "input" => 9, // NORMAL | VOID_TAG - "kbd" => 1, - "ins" => 1, - "keygen" => 9, // NORMAL | VOID_TAG - "label" => 1, - "legend" => 1, - "li" => 1, - "link" => 9, // NORMAL | VOID_TAG - "map" => 1, - "mark" => 1, - "menu" => 17, // NORMAL | AUTOCLOSE_P, - "meta" => 9, // NORMAL | VOID_TAG - "meter" => 1, - "nav" => 17, // NORMAL | AUTOCLOSE_P, - "noscript" => 67, // NORMAL | TEXT_RAW | BLOCK_TAG - "object" => 1, - "ol" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG - "optgroup" => 1, - "option" => 1, - "output" => 65, // NORMAL | BLOCK_TAG - "p" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG - "param" => 9, // NORMAL | VOID_TAG - "pre" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG - "progress" => 1, - "q" => 1, - "rp" => 1, - "rt" => 1, - "ruby" => 1, - "s" => 1, - "samp" => 1, - "script" => 3, // NORMAL | TEXT_RAW - "section" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG - "select" => 1, - "small" => 1, - "source" => 9, // NORMAL | VOID_TAG - "span" => 1, - "strong" => 1, - "style" => 1, - "sub" => 1, - "summary" => 17, // NORMAL | AUTOCLOSE_P, - "sup" => 1, - "table" => 65, // NORMAL | BLOCK_TAG - "tbody" => 1, - "td" => 1, - "textarea" => 5, // NORMAL | TEXT_RCDATA - "tfoot" => 65, // NORMAL | BLOCK_TAG - "th" => 1, - "thead" => 1, - "time" => 1, - "title" => 5, // NORMAL | TEXT_RCDATA - "tr" => 1, - "track" => 9, // NORMAL | VOID_TAG - "u" => 1, - "ul" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG - "var" => 1, - "video" => 65, // NORMAL | BLOCK_TAG - "wbr" => 9, // NORMAL | VOID_TAG + // Legacy? + 'basefont' => 8, // VOID_TAG + 'bgsound' => 8, // VOID_TAG + 'noframes' => 2, // RAW_TEXT + 'frame' => 9, // NORMAL | VOID_TAG + 'frameset' => 1, + 'center' => 16, + 'dir' => 16, + 'listing' => 16, // AUTOCLOSE_P + 'plaintext' => 48, // AUTOCLOSE_P | TEXT_PLAINTEXT + 'applet' => 0, + 'marquee' => 0, + 'isindex' => 8, // VOID_TAG + 'xmp' => 20, // AUTOCLOSE_P | VOID_TAG | RAW_TEXT + 'noembed' => 2 // RAW_TEXT + ); - // Legacy? 
- 'basefont' => 8, // VOID_TAG - 'bgsound' => 8, // VOID_TAG - 'noframes' => 2, // RAW_TEXT - 'frame' => 9, // NORMAL | VOID_TAG - 'frameset' => 1, - 'center' => 16, 'dir' => 16, 'listing' => 16, // AUTOCLOSE_P - 'plaintext' => 48, // AUTOCLOSE_P | TEXT_PLAINTEXT - 'applet' => 0, - 'marquee' => 0, - 'isindex' => 8, // VOID_TAG - 'xmp' => 20, // AUTOCLOSE_P | VOID_TAG | RAW_TEXT - 'noembed' => 2, // RAW_TEXT - ); + /** + * The MathML elements. + * See http://www.w3.org/wiki/MathML/Elements. + * + * In our case we are only concerned with presentation MathML and not content + * MathML. There is a nice list of this subset at https://developer.mozilla.org/en-US/docs/MathML/Element. + * + * @var array + */ + public static $mathml = array( + "maction" => 1, + "maligngroup" => 1, + "malignmark" => 1, + "math" => 1, + "menclose" => 1, + "merror" => 1, + "mfenced" => 1, + "mfrac" => 1, + "mglyph" => 1, + "mi" => 1, + "mlabeledtr" => 1, + "mlongdiv" => 1, + "mmultiscripts" => 1, + "mn" => 1, + "mo" => 1, + "mover" => 1, + "mpadded" => 1, + "mphantom" => 1, + "mroot" => 1, + "mrow" => 1, + "ms" => 1, + "mscarries" => 1, + "mscarry" => 1, + "msgroup" => 1, + "msline" => 1, + "mspace" => 1, + "msqrt" => 1, + "msrow" => 1, + "mstack" => 1, + "mstyle" => 1, + "msub" => 1, + "msup" => 1, + "msubsup" => 1, + "mtable" => 1, + "mtd" => 1, + "mtext" => 1, + "mtr" => 1, + "munder" => 1, + "munderover" => 1 + ); - /** - * The MathML elements. See http://www.w3.org/wiki/MathML/Elements. - * - * In our case we are only concerned with presentation MathML and not content - * MathML. There is a nice list of this subset at https://developer.mozilla.org/en-US/docs/MathML/Element. 
- * - * @var array - */ - public static $mathml = array( - "maction" => 1, - "maligngroup" => 1, - "malignmark" => 1, - "math" => 1, - "menclose" => 1, - "merror" => 1, - "mfenced" => 1, - "mfrac" => 1, - "mglyph" => 1, - "mi" => 1, - "mlabeledtr" => 1, - "mlongdiv" => 1, - "mmultiscripts" => 1, - "mn" => 1, - "mo" => 1, - "mover" => 1, - "mpadded" => 1, - "mphantom" => 1, - "mroot" => 1, - "mrow" => 1, - "ms" => 1, - "mscarries" => 1, - "mscarry" => 1, - "msgroup" => 1, - "msline" => 1, - "mspace" => 1, - "msqrt" => 1, - "msrow" => 1, - "mstack" => 1, - "mstyle" => 1, - "msub" => 1, - "msup" => 1, - "msubsup" => 1, - "mtable" => 1, - "mtd" => 1, - "mtext" => 1, - "mtr" => 1, - "munder" => 1, - "munderover" => 1, - ); + /** + * The svg elements. + * + * The Mozilla documentation has a good list at https://developer.mozilla.org/en-US/docs/SVG/Element. + * The w3c list appears to be lacking in some areas like filter effect elements. + * That list can be found at http://www.w3.org/wiki/SVG/Elements. + * + * Note, FireFox appears to do a better job rendering filter effects than chrome. + * While they are in the spec I'm not sure how widely implemented they are. 
+ * + * @var array + */ + public static $svg = array( + "a" => 1, + "altGlyph" => 1, + "altGlyphDef" => 1, + "altGlyphItem" => 1, + "animate" => 1, + "animateColor" => 1, + "animateMotion" => 1, + "animateTransform" => 1, + "circle" => 1, + "clipPath" => 1, + "color-profile" => 1, + "cursor" => 1, + "defs" => 1, + "desc" => 1, + "ellipse" => 1, + "feBlend" => 1, + "feColorMatrix" => 1, + "feComponentTransfer" => 1, + "feComposite" => 1, + "feConvolveMatrix" => 1, + "feDiffuseLighting" => 1, + "feDisplacementMap" => 1, + "feDistantLight" => 1, + "feFlood" => 1, + "feFuncA" => 1, + "feFuncB" => 1, + "feFuncG" => 1, + "feFuncR" => 1, + "feGaussianBlur" => 1, + "feImage" => 1, + "feMerge" => 1, + "feMergeNode" => 1, + "feMorphology" => 1, + "feOffset" => 1, + "fePointLight" => 1, + "feSpecularLighting" => 1, + "feSpotLight" => 1, + "feTile" => 1, + "feTurbulence" => 1, + "filter" => 1, + "font" => 1, + "font-face" => 1, + "font-face-format" => 1, + "font-face-name" => 1, + "font-face-src" => 1, + "font-face-uri" => 1, + "foreignObject" => 1, + "g" => 1, + "glyph" => 1, + "glyphRef" => 1, + "hkern" => 1, + "image" => 1, + "line" => 1, + "linearGradient" => 1, + "marker" => 1, + "mask" => 1, + "metadata" => 1, + "missing-glyph" => 1, + "mpath" => 1, + "path" => 1, + "pattern" => 1, + "polygon" => 1, + "polyline" => 1, + "radialGradient" => 1, + "rect" => 1, + "script" => 3, // NORMAL | RAW_TEXT + "set" => 1, + "stop" => 1, + "style" => 3, // NORMAL | RAW_TEXT + "svg" => 1, + "switch" => 1, + "symbol" => 1, + "text" => 1, + "textPath" => 1, + "title" => 1, + "tref" => 1, + "tspan" => 1, + "use" => 1, + "view" => 1, + "vkern" => 1 + ); - /** - * The svg elements. - * - * The Mozilla documentation has a good list at https://developer.mozilla.org/en-US/docs/SVG/Element. - * The w3c list appears to be lacking in some areas like filter effect elements. - * That list can be found at http://www.w3.org/wiki/SVG/Elements. 
- * - * Note, FireFox appears to do a better job rendering filter effects than chrome. - * While they are in the spec I'm not sure how widely implemented they are. - * - * @var array - */ - public static $svg = array( - "a" => 1, - "altGlyph" => 1, - "altGlyphDef" => 1, - "altGlyphItem" => 1, - "animate" => 1, - "animateColor" => 1, - "animateMotion" => 1, - "animateTransform" => 1, - "circle" => 1, - "clipPath" => 1, - "color-profile" => 1, - "cursor" => 1, - "defs" => 1, - "desc" => 1, - "ellipse" => 1, - "feBlend" => 1, - "feColorMatrix" => 1, - "feComponentTransfer" => 1, - "feComposite" => 1, - "feConvolveMatrix" => 1, - "feDiffuseLighting" => 1, - "feDisplacementMap" => 1, - "feDistantLight" => 1, - "feFlood" => 1, - "feFuncA" => 1, - "feFuncB" => 1, - "feFuncG" => 1, - "feFuncR" => 1, - "feGaussianBlur" => 1, - "feImage" => 1, - "feMerge" => 1, - "feMergeNode" => 1, - "feMorphology" => 1, - "feOffset" => 1, - "fePointLight" => 1, - "feSpecularLighting" => 1, - "feSpotLight" => 1, - "feTile" => 1, - "feTurbulence" => 1, - "filter" => 1, - "font" => 1, - "font-face" => 1, - "font-face-format" => 1, - "font-face-name" => 1, - "font-face-src" => 1, - "font-face-uri" => 1, - "foreignObject" => 1, - "g" => 1, - "glyph" => 1, - "glyphRef" => 1, - "hkern" => 1, - "image" => 1, - "line" => 1, - "linearGradient" => 1, - "marker" => 1, - "mask" => 1, - "metadata" => 1, - "missing-glyph" => 1, - "mpath" => 1, - "path" => 1, - "pattern" => 1, - "polygon" => 1, - "polyline" => 1, - "radialGradient" => 1, - "rect" => 1, - "script" => 3, // NORMAL | RAW_TEXT - "set" => 1, - "stop" => 1, - "style" => 3, // NORMAL | RAW_TEXT - "svg" => 1, - "switch" => 1, - "symbol" => 1, - "text" => 1, - "textPath" => 1, - "title" => 1, - "tref" => 1, - "tspan" => 1, - "use" => 1, - "view" => 1, - "vkern" => 1, - ); + /** + * Some attributes in SVG are case sensetitive. 
+ * + * This map contains key/value pairs with the key as the lowercase attribute + * name and the value with the correct casing. + */ + public static $svgCaseSensitiveAttributeMap = array( + 'attributename' => 'attributeName', + 'attributetype' => 'attributeType', + 'basefrequency' => 'baseFrequency', + 'baseprofile' => 'baseProfile', + 'calcmode' => 'calcMode', + 'clippathunits' => 'clipPathUnits', + 'contentscripttype' => 'contentScriptType', + 'contentstyletype' => 'contentStyleType', + 'diffuseconstant' => 'diffuseConstant', + 'edgemode' => 'edgeMode', + 'externalresourcesrequired' => 'externalResourcesRequired', + 'filterres' => 'filterRes', + 'filterunits' => 'filterUnits', + 'glyphref' => 'glyphRef', + 'gradienttransform' => 'gradientTransform', + 'gradientunits' => 'gradientUnits', + 'kernelmatrix' => 'kernelMatrix', + 'kernelunitlength' => 'kernelUnitLength', + 'keypoints' => 'keyPoints', + 'keysplines' => 'keySplines', + 'keytimes' => 'keyTimes', + 'lengthadjust' => 'lengthAdjust', + 'limitingconeangle' => 'limitingConeAngle', + 'markerheight' => 'markerHeight', + 'markerunits' => 'markerUnits', + 'markerwidth' => 'markerWidth', + 'maskcontentunits' => 'maskContentUnits', + 'maskunits' => 'maskUnits', + 'numoctaves' => 'numOctaves', + 'pathlength' => 'pathLength', + 'patterncontentunits' => 'patternContentUnits', + 'patterntransform' => 'patternTransform', + 'patternunits' => 'patternUnits', + 'pointsatx' => 'pointsAtX', + 'pointsaty' => 'pointsAtY', + 'pointsatz' => 'pointsAtZ', + 'preservealpha' => 'preserveAlpha', + 'preserveaspectratio' => 'preserveAspectRatio', + 'primitiveunits' => 'primitiveUnits', + 'refx' => 'refX', + 'refy' => 'refY', + 'repeatcount' => 'repeatCount', + 'repeatdur' => 'repeatDur', + 'requiredextensions' => 'requiredExtensions', + 'requiredfeatures' => 'requiredFeatures', + 'specularconstant' => 'specularConstant', + 'specularexponent' => 'specularExponent', + 'spreadmethod' => 'spreadMethod', + 'startoffset' => 'startOffset', + 
'stddeviation' => 'stdDeviation', + 'stitchtiles' => 'stitchTiles', + 'surfacescale' => 'surfaceScale', + 'systemlanguage' => 'systemLanguage', + 'tablevalues' => 'tableValues', + 'targetx' => 'targetX', + 'targety' => 'targetY', + 'textlength' => 'textLength', + 'viewbox' => 'viewBox', + 'viewtarget' => 'viewTarget', + 'xchannelselector' => 'xChannelSelector', + 'ychannelselector' => 'yChannelSelector', + 'zoomandpan' => 'zoomAndPan' + ); - /** - * Some attributes in SVG are case sensetitive. - * - * This map contains key/value pairs with the key as the lowercase attribute - * name and the value with the correct casing. - */ - public static $svgCaseSensitiveAttributeMap = array( - 'attributename' => 'attributeName', - 'attributetype' => 'attributeType', - 'basefrequency' => 'baseFrequency', - 'baseprofile' => 'baseProfile', - 'calcmode' => 'calcMode', - 'clippathunits' => 'clipPathUnits', - 'contentscripttype' => 'contentScriptType', - 'contentstyletype' => 'contentStyleType', - 'diffuseconstant' => 'diffuseConstant', - 'edgemode' => 'edgeMode', - 'externalresourcesrequired' => 'externalResourcesRequired', - 'filterres' => 'filterRes', - 'filterunits' => 'filterUnits', - 'glyphref' => 'glyphRef', - 'gradienttransform' => 'gradientTransform', - 'gradientunits' => 'gradientUnits', - 'kernelmatrix' => 'kernelMatrix', - 'kernelunitlength' => 'kernelUnitLength', - 'keypoints' => 'keyPoints', - 'keysplines' => 'keySplines', - 'keytimes' => 'keyTimes', - 'lengthadjust' => 'lengthAdjust', - 'limitingconeangle' => 'limitingConeAngle', - 'markerheight' => 'markerHeight', - 'markerunits' => 'markerUnits', - 'markerwidth' => 'markerWidth', - 'maskcontentunits' => 'maskContentUnits', - 'maskunits' => 'maskUnits', - 'numoctaves' => 'numOctaves', - 'pathlength' => 'pathLength', - 'patterncontentunits' => 'patternContentUnits', - 'patterntransform' => 'patternTransform', - 'patternunits' => 'patternUnits', - 'pointsatx' => 'pointsAtX', - 'pointsaty' => 'pointsAtY', - 'pointsatz' 
=> 'pointsAtZ', - 'preservealpha' => 'preserveAlpha', - 'preserveaspectratio' => 'preserveAspectRatio', - 'primitiveunits' => 'primitiveUnits', - 'refx' => 'refX', - 'refy' => 'refY', - 'repeatcount' => 'repeatCount', - 'repeatdur' => 'repeatDur', - 'requiredextensions' => 'requiredExtensions', - 'requiredfeatures' => 'requiredFeatures', - 'specularconstant' => 'specularConstant', - 'specularexponent' => 'specularExponent', - 'spreadmethod' => 'spreadMethod', - 'startoffset' => 'startOffset', - 'stddeviation' => 'stdDeviation', - 'stitchtiles' => 'stitchTiles', - 'surfacescale' => 'surfaceScale', - 'systemlanguage' => 'systemLanguage', - 'tablevalues' => 'tableValues', - 'targetx' => 'targetX', - 'targety' => 'targetY', - 'textlength' => 'textLength', - 'viewbox' => 'viewBox', - 'viewtarget' => 'viewTarget', - 'xchannelselector' => 'xChannelSelector', - 'ychannelselector' => 'yChannelSelector', - 'zoomandpan' => 'zoomAndPan', - ); + /** + * Some SVG elements are case sensetitive. + * This map contains these. + * + * The map contains key/value store of the name is lowercase as the keys and + * the correct casing as the value. 
+ */ + public static $svgCaseSensitiveElementMap = array( + 'altglyph' => 'altGlyph', + 'altglyphdef' => 'altGlyphDef', + 'altglyphitem' => 'altGlyphItem', + 'animatecolor' => 'animateColor', + 'animatemotion' => 'animateMotion', + 'animatetransform' => 'animateTransform', + 'clippath' => 'clipPath', + 'feblend' => 'feBlend', + 'fecolormatrix' => 'feColorMatrix', + 'fecomponenttransfer' => 'feComponentTransfer', + 'fecomposite' => 'feComposite', + 'feconvolvematrix' => 'feConvolveMatrix', + 'fediffuselighting' => 'feDiffuseLighting', + 'fedisplacementmap' => 'feDisplacementMap', + 'fedistantlight' => 'feDistantLight', + 'feflood' => 'feFlood', + 'fefunca' => 'feFuncA', + 'fefuncb' => 'feFuncB', + 'fefuncg' => 'feFuncG', + 'fefuncr' => 'feFuncR', + 'fegaussianblur' => 'feGaussianBlur', + 'feimage' => 'feImage', + 'femerge' => 'feMerge', + 'femergenode' => 'feMergeNode', + 'femorphology' => 'feMorphology', + 'feoffset' => 'feOffset', + 'fepointlight' => 'fePointLight', + 'fespecularlighting' => 'feSpecularLighting', + 'fespotlight' => 'feSpotLight', + 'fetile' => 'feTile', + 'feturbulence' => 'feTurbulence', + 'foreignobject' => 'foreignObject', + 'glyphref' => 'glyphRef', + 'lineargradient' => 'linearGradient', + 'radialgradient' => 'radialGradient', + 'textpath' => 'textPath' + ); - /** - * Some SVG elements are case sensetitive. This map contains these. - * - * The map contains key/value store of the name is lowercase as the keys and - * the correct casing as the value. 
- */ - public static $svgCaseSensitiveElementMap = array( - 'altglyph' => 'altGlyph', - 'altglyphdef' => 'altGlyphDef', - 'altglyphitem' => 'altGlyphItem', - 'animatecolor' => 'animateColor', - 'animatemotion' => 'animateMotion', - 'animatetransform' => 'animateTransform', - 'clippath' => 'clipPath', - 'feblend' => 'feBlend', - 'fecolormatrix' => 'feColorMatrix', - 'fecomponenttransfer' => 'feComponentTransfer', - 'fecomposite' => 'feComposite', - 'feconvolvematrix' => 'feConvolveMatrix', - 'fediffuselighting' => 'feDiffuseLighting', - 'fedisplacementmap' => 'feDisplacementMap', - 'fedistantlight' => 'feDistantLight', - 'feflood' => 'feFlood', - 'fefunca' => 'feFuncA', - 'fefuncb' => 'feFuncB', - 'fefuncg' => 'feFuncG', - 'fefuncr' => 'feFuncR', - 'fegaussianblur' => 'feGaussianBlur', - 'feimage' => 'feImage', - 'femerge' => 'feMerge', - 'femergenode' => 'feMergeNode', - 'femorphology' => 'feMorphology', - 'feoffset' => 'feOffset', - 'fepointlight' => 'fePointLight', - 'fespecularlighting' => 'feSpecularLighting', - 'fespotlight' => 'feSpotLight', - 'fetile' => 'feTile', - 'feturbulence' => 'feTurbulence', - 'foreignobject' => 'foreignObject', - 'glyphref' => 'glyphRef', - 'lineargradient' => 'linearGradient', - 'radialgradient' => 'radialGradient', - 'textpath' => 'textPath', - ); + /** + * Check whether the given element meets the given criterion. + * + * Example: + * + * Elements::isA('script', Elements::TEXT_RAW); // Returns true. + * + * Elements::isA('script', Elements::TEXT_RCDATA); // Returns false. + * + * @param string $name + * The element name. + * @param int $mask + * One of the constants on this class. + * @return boolean true if the element matches the mask, false otherwise. + */ + public static function isA($name, $mask) + { + if (! static::isElement($name)) { + return false; + } - /** - * Check whether the given element meets the given criterion. - * - * Example: - * - * Elements::isA('script', Elements::TEXT_RAW); // Returns true. 
- * - * Elements::isA('script', Elements::TEXT_RCDATA); // Returns false. - * - * @param string $name - * The element name. - * @param int $mask - * One of the constants on this class. - * @return boolean - * TRUE if the element matches the mask, FALSE otherwise. - */ - public static function isA($name, $mask) { - if (!static::isElement($name)) { - return FALSE; + return (static::element($name) & $mask) == $mask; } - return (static::element($name) & $mask) == $mask; - } - - /** - * Test if an element is a valid html5 element. - * - * @param string $name - * The name of the element. - * - * @return bool - * True if a html5 element and false otherwise. - */ - public static function isHtml5Element($name) { - - // html5 element names are case insensetitive. Forcing lowercase for the check. - // Do we need this check or will all data passed here already be lowercase? - return isset(static::$html5[strtolower($name)]); - } - - /** - * Test if an element name is a valid MathML presentation element. - * - * @param string $name - * The name of the element. - * - * @return bool - * True if a MathML name and false otherwise. - */ - public static function isMathMLElement($name) { - - // MathML is case-sensetitive unlike html5 elements. - return isset(static::$mathml[$name]); - } - - /** - * Test if an element is a valid SVG element. - * - * @param string $name - * The name of the element. - * - * @return boolean - * True if a SVG element and false otherise. - */ - public static function isSvgElement($name) { - - // SVG is case-sensetitive unlike html5 elements. - return isset(static::$svg[$name]); - } - - /** - * Is an element name valid in an html5 document. - * - * This includes html5 elements along with other allowed embedded content - * such as svg and mathml. - * - * @param string $name - * The name of the element. - * - * @return bool - * True if valid and false otherwise. 
- */ - public static function isElement($name) { - return static::isHtml5Element($name) || static::isMathMLElement($name) || static::isSvgElement($name); - } - - /** - * Get the element mask for the given element name. - * - * @param string $name - * The name of the element. - * - * @return int - * The element mask. - */ - public static function element($name) { - if (isset(static::$html5[$name])) { - return static::$html5[$name]; - } - if (isset(static::$svg[$name])) { - return static::$svg[$name]; - } - if (isset(static::$mathml[$name])) { - return static::$mathml[$name]; + /** + * Test if an element is a valid html5 element. + * + * @param string $name + * The name of the element. + * + * @return bool True if a html5 element and false otherwise. + */ + public static function isHtml5Element($name) + { + // html5 element names are case insensetitive. Forcing lowercase for the check. + // Do we need this check or will all data passed here already be lowercase? + return isset(static::$html5[strtolower($name)]); } - return FALSE; - } - - /** - * Normalize a SVG element name to its proper case and form. - * - * @param string $name - * The name of the element. - * - * @return string - * The normalized form of the element name. - */ - public static function normalizeSvgElement($name) { - $name = strtolower($name); - if (isset(static::$svgCaseSensitiveElementMap[$name])) { - $name = static::$svgCaseSensitiveElementMap[$name]; + /** + * Test if an element name is a valid MathML presentation element. + * + * @param string $name + * The name of the element. + * + * @return bool True if a MathML name and false otherwise. + */ + public static function isMathMLElement($name) + { + // MathML is case-sensetitive unlike html5 elements. + return isset(static::$mathml[$name]); } - return $name; - } - - /** - * Normalize a SVG attribute name to its proper case and form. - * - * @param string $name - * The name of the attribute. 
- * - * @return string - * The normalized form of the attribute name. - */ - public static function normalizeSvgAttribute($name) { - $name = strtolower($name); - if (isset(static::$svgCaseSensitiveAttributeMap[$name])) { - $name = static::$svgCaseSensitiveAttributeMap[$name]; + /** + * Test if an element is a valid SVG element. + * + * @param string $name + * The name of the element. + * + * @return boolean True if a SVG element and false otherise. + */ + public static function isSvgElement($name) + { + // SVG is case-sensetitive unlike html5 elements. + return isset(static::$svg[$name]); } - return $name; - } - - /** - * Normalize a MathML attribute name to its proper case and form. - * - * Note, all MathML element names are lowercase. - * - * @param string $name - * The name of the attribute. - * - * @return string - * The normalized form of the attribute name. - */ - public static function normalizeMathMlAttribute($name) { - $name = strtolower($name); - - // Only one attribute has a mixed case form for MathML. - if ($name == 'definitionurl') { - $name = 'definitionURL'; + /** + * Is an element name valid in an html5 document. + * + * This includes html5 elements along with other allowed embedded content + * such as svg and mathml. + * + * @param string $name + * The name of the element. + * + * @return bool True if valid and false otherwise. + */ + public static function isElement($name) + { + return static::isHtml5Element($name) || static::isMathMLElement($name) || static::isSvgElement($name); } - return $name; - } + /** + * Get the element mask for the given element name. + * + * @param string $name + * The name of the element. + * + * @return int The element mask. 
+ */ + public static function element($name) + { + if (isset(static::$html5[$name])) { + return static::$html5[$name]; + } + if (isset(static::$svg[$name])) { + return static::$svg[$name]; + } + if (isset(static::$mathml[$name])) { + return static::$mathml[$name]; + } + + return false; + } + + /** + * Normalize a SVG element name to its proper case and form. + * + * @param string $name + * The name of the element. + * + * @return string The normalized form of the element name. + */ + public static function normalizeSvgElement($name) + { + $name = strtolower($name); + if (isset(static::$svgCaseSensitiveElementMap[$name])) { + $name = static::$svgCaseSensitiveElementMap[$name]; + } + + return $name; + } + + /** + * Normalize a SVG attribute name to its proper case and form. + * + * @param string $name + * The name of the attribute. + * + * @return string The normalized form of the attribute name. + */ + public static function normalizeSvgAttribute($name) + { + $name = strtolower($name); + if (isset(static::$svgCaseSensitiveAttributeMap[$name])) { + $name = static::$svgCaseSensitiveAttributeMap[$name]; + } + + return $name; + } + + /** + * Normalize a MathML attribute name to its proper case and form. + * + * Note, all MathML element names are lowercase. + * + * @param string $name + * The name of the attribute. + * + * @return string The normalized form of the attribute name. + */ + public static function normalizeMathMlAttribute($name) + { + $name = strtolower($name); + + // Only one attribute has a mixed case form for MathML. 
+ if ($name == 'definitionurl') { + $name = 'definitionURL'; + } + + return $name; + } } diff --git a/libraries/html5php/HTML5/Entities.php b/libraries/html5php/HTML5/Entities.php index e5de9e2..2e605d6 100644 --- a/libraries/html5php/HTML5/Entities.php +++ b/libraries/html5php/HTML5/Entities.php @@ -1,2230 +1,2236 @@ 'Ã', - 'Aacut' => 'Ã', - 'aacute' => 'á', - 'aacut' => 'á', - 'Abreve' => 'Ä‚', - 'abreve' => 'ă', - 'ac' => '∾', - 'acd' => '∿', - 'acE' => '∾̳', - 'Acirc' => 'Â', - 'Acir' => 'Â', - 'acirc' => 'â', - 'acir' => 'â', - 'acute' => '´', - 'acut' => '´', - 'Acy' => 'Ð', - 'acy' => 'а', - 'AElig' => 'Æ', - 'AEli' => 'Æ', - 'aelig' => 'æ', - 'aeli' => 'æ', - 'af' => 'â¡', - 'Afr' => 'ð”„', - 'afr' => 'ð”ž', - 'Agrave' => 'À', - 'Agrav' => 'À', - 'agrave' => 'à', - 'agrav' => 'à', - 'alefsym' => 'ℵ', - 'aleph' => 'ℵ', - 'Alpha' => 'Α', - 'alpha' => 'α', - 'Amacr' => 'Ä€', - 'amacr' => 'Ä', - 'amalg' => '⨿', - 'AMP' => '&', - 'AM' => '&', - 'amp' => '&', - 'am' => '&', - 'And' => 'â©“', - 'and' => '∧', - 'andand' => 'â©•', - 'andd' => 'â©œ', - 'andslope' => '⩘', - 'andv' => 'â©š', - 'ang' => '∠', - 'ange' => '⦤', - 'angle' => '∠', - 'angmsd' => '∡', - 'angmsdaa' => '⦨', - 'angmsdab' => '⦩', - 'angmsdac' => '⦪', - 'angmsdad' => '⦫', - 'angmsdae' => '⦬', - 'angmsdaf' => '⦭', - 'angmsdag' => '⦮', - 'angmsdah' => '⦯', - 'angrt' => '∟', - 'angrtvb' => '⊾', - 'angrtvbd' => 'â¦', - 'angsph' => '∢', - 'angst' => 'Ã…', - 'angzarr' => 'â¼', - 'Aogon' => 'Ä„', - 'aogon' => 'Ä…', - 'Aopf' => 'ð”¸', - 'aopf' => 'ð•’', - 'ap' => '≈', - 'apacir' => '⩯', - 'apE' => 'â©°', - 'ape' => '≊', - 'apid' => '≋', - 'apos' => '\'', - 'ApplyFunction' => 'â¡', - 'approx' => '≈', - 'approxeq' => '≊', - 'Aring' => 'Ã…', - 'Arin' => 'Ã…', - 'aring' => 'Ã¥', - 'arin' => 'Ã¥', - 'Ascr' => 'ð’œ', - 'ascr' => 'ð’¶', - 'Assign' => '≔', - 'ast' => '*', - 'asymp' => '≈', - 'asympeq' => 'â‰', - 'Atilde' => 'Ã', - 'Atild' => 'Ã', - 'atilde' => 'ã', - 'atild' => 'ã', - 'Auml' => 'Ä', - 'Aum' => 
'Ä', - 'auml' => 'ä', - 'aum' => 'ä', - 'awconint' => '∳', - 'awint' => '⨑', - 'backcong' => '≌', - 'backepsilon' => '϶', - 'backprime' => '‵', - 'backsim' => '∽', - 'backsimeq' => 'â‹', - 'Backslash' => '∖', - 'Barv' => '⫧', - 'barvee' => '⊽', - 'Barwed' => '⌆', - 'barwed' => '⌅', - 'barwedge' => '⌅', - 'bbrk' => '⎵', - 'bbrktbrk' => '⎶', - 'bcong' => '≌', - 'Bcy' => 'Б', - 'bcy' => 'б', - 'bdquo' => '„', - 'becaus' => '∵', - 'Because' => '∵', - 'because' => '∵', - 'bemptyv' => '⦰', - 'bepsi' => '϶', - 'bernou' => 'ℬ', - 'Bernoullis' => 'ℬ', - 'Beta' => 'Î’', - 'beta' => 'β', - 'beth' => 'ℶ', - 'between' => '≬', - 'Bfr' => 'ð”…', - 'bfr' => 'ð”Ÿ', - 'bigcap' => 'â‹‚', - 'bigcirc' => 'â—¯', - 'bigcup' => '⋃', - 'bigodot' => '⨀', - 'bigoplus' => 'â¨', - 'bigotimes' => '⨂', - 'bigsqcup' => '⨆', - 'bigstar' => '★', - 'bigtriangledown' => 'â–½', - 'bigtriangleup' => 'â–³', - 'biguplus' => '⨄', - 'bigvee' => 'â‹', - 'bigwedge' => 'â‹€', - 'bkarow' => 'â¤', - 'blacklozenge' => '⧫', - 'blacksquare' => 'â–ª', - 'blacktriangle' => 'â–´', - 'blacktriangledown' => 'â–¾', - 'blacktriangleleft' => 'â—‚', - 'blacktriangleright' => 'â–¸', - 'blank' => 'â£', - 'blk12' => 'â–’', - 'blk14' => 'â–‘', - 'blk34' => 'â–“', - 'block' => 'â–ˆ', - 'bne' => '=⃥', - 'bnequiv' => '≡⃥', - 'bNot' => 'â«­', - 'bnot' => 'âŒ', - 'Bopf' => 'ð”¹', - 'bopf' => 'ð•“', - 'bot' => '⊥', - 'bottom' => '⊥', - 'bowtie' => '⋈', - 'boxbox' => '⧉', - 'boxDL' => 'â•—', - 'boxDl' => 'â•–', - 'boxdL' => 'â••', - 'boxdl' => 'â”', - 'boxDR' => 'â•”', - 'boxDr' => 'â•“', - 'boxdR' => 'â•’', - 'boxdr' => '┌', - 'boxH' => 'â•', - 'boxh' => '─', - 'boxHD' => '╦', - 'boxHd' => '╤', - 'boxhD' => 'â•¥', - 'boxhd' => '┬', - 'boxHU' => 'â•©', - 'boxHu' => '╧', - 'boxhU' => '╨', - 'boxhu' => 'â”´', - 'boxminus' => '⊟', - 'boxplus' => '⊞', - 'boxtimes' => '⊠', - 'boxUL' => 'â•', - 'boxUl' => 'â•œ', - 'boxuL' => 'â•›', - 'boxul' => '┘', - 'boxUR' => 'â•š', - 'boxUr' => 'â•™', - 'boxuR' => '╘', - 'boxur' => 'â””', - 'boxV' => 
'â•‘', - 'boxv' => '│', - 'boxVH' => '╬', - 'boxVh' => 'â•«', - 'boxvH' => '╪', - 'boxvh' => '┼', - 'boxVL' => 'â•£', - 'boxVl' => 'â•¢', - 'boxvL' => 'â•¡', - 'boxvl' => '┤', - 'boxVR' => 'â• ', - 'boxVr' => 'â•Ÿ', - 'boxvR' => 'â•ž', - 'boxvr' => '├', - 'bprime' => '‵', - 'Breve' => '˘', - 'breve' => '˘', - 'brvbar' => '¦', - 'brvba' => '¦', - 'Bscr' => 'ℬ', - 'bscr' => 'ð’·', - 'bsemi' => 'â', - 'bsim' => '∽', - 'bsime' => 'â‹', - 'bsol' => '\\', - 'bsolb' => '⧅', - 'bsolhsub' => '⟈', - 'bull' => '•', - 'bullet' => '•', - 'bump' => '≎', - 'bumpE' => '⪮', - 'bumpe' => 'â‰', - 'Bumpeq' => '≎', - 'bumpeq' => 'â‰', - 'Cacute' => 'Ć', - 'cacute' => 'ć', - 'Cap' => 'â‹’', - 'cap' => '∩', - 'capand' => 'â©„', - 'capbrcup' => '⩉', - 'capcap' => 'â©‹', - 'capcup' => '⩇', - 'capdot' => 'â©€', - 'CapitalDifferentialD' => 'â……', - 'caps' => '∩︀', - 'caret' => 'â', - 'caron' => 'ˇ', - 'Cayleys' => 'â„­', - 'ccaps' => 'â©', - 'Ccaron' => 'ÄŒ', - 'ccaron' => 'Ä', - 'Ccedil' => 'Ç', - 'Ccedi' => 'Ç', - 'ccedil' => 'ç', - 'ccedi' => 'ç', - 'Ccirc' => 'Ĉ', - 'ccirc' => 'ĉ', - 'Cconint' => '∰', - 'ccups' => 'â©Œ', - 'ccupssm' => 'â©', - 'Cdot' => 'ÄŠ', - 'cdot' => 'Ä‹', - 'cedil' => '¸', - 'cedi' => '¸', - 'Cedilla' => '¸', - 'cemptyv' => '⦲', - 'cent' => '¢', - 'cen' => '¢', - 'CenterDot' => '·', - 'centerdot' => '·', - 'Cfr' => 'â„­', - 'cfr' => 'ð” ', - 'CHcy' => 'Ч', - 'chcy' => 'ч', - 'check' => '✓', - 'checkmark' => '✓', - 'Chi' => 'Χ', - 'chi' => 'χ', - 'cir' => 'â—‹', - 'circ' => 'ˆ', - 'circeq' => '≗', - 'circlearrowleft' => '↺', - 'circlearrowright' => '↻', - 'circledast' => '⊛', - 'circledcirc' => '⊚', - 'circleddash' => 'âŠ', - 'CircleDot' => '⊙', - 'circledR' => '®', - 'circledS' => 'Ⓢ', - 'CircleMinus' => '⊖', - 'CirclePlus' => '⊕', - 'CircleTimes' => '⊗', - 'cirE' => '⧃', - 'cire' => '≗', - 'cirfnint' => 'â¨', - 'cirmid' => '⫯', - 'cirscir' => '⧂', - 'ClockwiseContourIntegral' => '∲', - 'CloseCurlyDoubleQuote' => 'â€', - 'CloseCurlyQuote' => '’', - 'clubs' => '♣', - 
'clubsuit' => '♣', - 'Colon' => '∷', - 'colon' => ':', - 'Colone' => 'â©´', - 'colone' => '≔', - 'coloneq' => '≔', - 'comma' => ',', - 'commat' => '@', - 'comp' => 'âˆ', - 'compfn' => '∘', - 'complement' => 'âˆ', - 'complexes' => 'â„‚', - 'cong' => '≅', - 'congdot' => 'â©­', - 'Congruent' => '≡', - 'Conint' => '∯', - 'conint' => '∮', - 'ContourIntegral' => '∮', - 'Copf' => 'â„‚', - 'copf' => 'ð•”', - 'coprod' => 'âˆ', - 'Coproduct' => 'âˆ', - 'COPY' => '©', - 'COP' => '©', - 'copy' => '©', - 'cop' => '©', - 'copysr' => 'â„—', - 'CounterClockwiseContourIntegral' => '∳', - 'crarr' => '↵', - 'Cross' => '⨯', - 'cross' => '✗', - 'Cscr' => 'ð’ž', - 'cscr' => 'ð’¸', - 'csub' => 'â«', - 'csube' => 'â«‘', - 'csup' => 'â«', - 'csupe' => 'â«’', - 'ctdot' => '⋯', - 'cudarrl' => '⤸', - 'cudarrr' => '⤵', - 'cuepr' => 'â‹ž', - 'cuesc' => 'â‹Ÿ', - 'cularr' => '↶', - 'cularrp' => '⤽', - 'Cup' => 'â‹“', - 'cup' => '∪', - 'cupbrcap' => '⩈', - 'CupCap' => 'â‰', - 'cupcap' => '⩆', - 'cupcup' => 'â©Š', - 'cupdot' => 'âŠ', - 'cupor' => 'â©…', - 'cups' => '∪︀', - 'curarr' => '↷', - 'curarrm' => '⤼', - 'curlyeqprec' => 'â‹ž', - 'curlyeqsucc' => 'â‹Ÿ', - 'curlyvee' => 'â‹Ž', - 'curlywedge' => 'â‹', - 'curren' => '¤', - 'curre' => '¤', - 'curvearrowleft' => '↶', - 'curvearrowright' => '↷', - 'cuvee' => 'â‹Ž', - 'cuwed' => 'â‹', - 'cwconint' => '∲', - 'cwint' => '∱', - 'cylcty' => '⌭', - 'Dagger' => '‡', - 'dagger' => '†', - 'daleth' => 'ℸ', - 'Darr' => '↡', - 'dArr' => '⇓', - 'darr' => '↓', - 'dash' => 'â€', - 'Dashv' => '⫤', - 'dashv' => '⊣', - 'dbkarow' => 'â¤', - 'dblac' => 'Ë', - 'Dcaron' => 'ÄŽ', - 'dcaron' => 'Ä', - 'Dcy' => 'Д', - 'dcy' => 'д', - 'DD' => 'â……', - 'dd' => 'â…†', - 'ddagger' => '‡', - 'ddarr' => '⇊', - 'DDotrahd' => '⤑', - 'ddotseq' => 'â©·', - 'deg' => '°', - 'de' => '°', - 'Del' => '∇', - 'Delta' => 'Δ', - 'delta' => 'δ', - 'demptyv' => '⦱', - 'dfisht' => '⥿', - 'Dfr' => 'ð”‡', - 'dfr' => 'ð”¡', - 'dHar' => '⥥', - 'dharl' => '⇃', - 'dharr' => '⇂', - 'DiacriticalAcute' 
=> '´', - 'DiacriticalDot' => 'Ë™', - 'DiacriticalDoubleAcute' => 'Ë', - 'DiacriticalGrave' => '`', - 'DiacriticalTilde' => 'Ëœ', - 'diam' => 'â‹„', - 'Diamond' => 'â‹„', - 'diamond' => 'â‹„', - 'diamondsuit' => '♦', - 'diams' => '♦', - 'die' => '¨', - 'DifferentialD' => 'â…†', - 'digamma' => 'Ï', - 'disin' => '⋲', - 'div' => '÷', - 'divide' => '÷', - 'divid' => '÷', - 'divideontimes' => '⋇', - 'divonx' => '⋇', - 'DJcy' => 'Ђ', - 'djcy' => 'Ñ’', - 'dlcorn' => '⌞', - 'dlcrop' => 'âŒ', - 'dollar' => '$', - 'Dopf' => 'ð”»', - 'dopf' => 'ð••', - 'Dot' => '¨', - 'dot' => 'Ë™', - 'DotDot' => '⃜', - 'doteq' => 'â‰', - 'doteqdot' => '≑', - 'DotEqual' => 'â‰', - 'dotminus' => '∸', - 'dotplus' => '∔', - 'dotsquare' => '⊡', - 'doublebarwedge' => '⌆', - 'DoubleContourIntegral' => '∯', - 'DoubleDot' => '¨', - 'DoubleDownArrow' => '⇓', - 'DoubleLeftArrow' => 'â‡', - 'DoubleLeftRightArrow' => '⇔', - 'DoubleLeftTee' => '⫤', - 'DoubleLongLeftArrow' => '⟸', - 'DoubleLongLeftRightArrow' => '⟺', - 'DoubleLongRightArrow' => '⟹', - 'DoubleRightArrow' => '⇒', - 'DoubleRightTee' => '⊨', - 'DoubleUpArrow' => '⇑', - 'DoubleUpDownArrow' => '⇕', - 'DoubleVerticalBar' => '∥', - 'DownArrow' => '↓', - 'Downarrow' => '⇓', - 'downarrow' => '↓', - 'DownArrowBar' => '⤓', - 'DownArrowUpArrow' => '⇵', - 'DownBreve' => 'Ì‘', - 'downdownarrows' => '⇊', - 'downharpoonleft' => '⇃', - 'downharpoonright' => '⇂', - 'DownLeftRightVector' => 'â¥', - 'DownLeftTeeVector' => '⥞', - 'DownLeftVector' => '↽', - 'DownLeftVectorBar' => '⥖', - 'DownRightTeeVector' => '⥟', - 'DownRightVector' => 'â‡', - 'DownRightVectorBar' => '⥗', - 'DownTee' => '⊤', - 'DownTeeArrow' => '↧', - 'drbkarow' => 'â¤', - 'drcorn' => '⌟', - 'drcrop' => '⌌', - 'Dscr' => 'ð’Ÿ', - 'dscr' => 'ð’¹', - 'DScy' => 'Ð…', - 'dscy' => 'Ñ•', - 'dsol' => '⧶', - 'Dstrok' => 'Ä', - 'dstrok' => 'Ä‘', - 'dtdot' => '⋱', - 'dtri' => 'â–¿', - 'dtrif' => 'â–¾', - 'duarr' => '⇵', - 'duhar' => '⥯', - 'dwangle' => '⦦', - 'DZcy' => 'Ð', - 'dzcy' => 'ÑŸ', - 'dzigrarr' 
=> '⟿', - 'Eacute' => 'É', - 'Eacut' => 'É', - 'eacute' => 'é', - 'eacut' => 'é', - 'easter' => 'â©®', - 'Ecaron' => 'Äš', - 'ecaron' => 'Ä›', - 'ecir' => 'ê', - 'Ecirc' => 'Ê', - 'Ecir' => 'Ê', - 'ecirc' => 'ê', - 'ecolon' => '≕', - 'Ecy' => 'Э', - 'ecy' => 'Ñ', - 'eDDot' => 'â©·', - 'Edot' => 'Ä–', - 'eDot' => '≑', - 'edot' => 'Ä—', - 'ee' => 'â…‡', - 'efDot' => '≒', - 'Efr' => 'ð”ˆ', - 'efr' => 'ð”¢', - 'eg' => '⪚', - 'Egrave' => 'È', - 'Egrav' => 'È', - 'egrave' => 'è', - 'egrav' => 'è', - 'egs' => '⪖', - 'egsdot' => '⪘', - 'el' => '⪙', - 'Element' => '∈', - 'elinters' => 'â§', - 'ell' => 'â„“', - 'els' => '⪕', - 'elsdot' => '⪗', - 'Emacr' => 'Ä’', - 'emacr' => 'Ä“', - 'empty' => '∅', - 'emptyset' => '∅', - 'EmptySmallSquare' => 'â—»', - 'emptyv' => '∅', - 'EmptyVerySmallSquare' => 'â–«', - 'emsp' => ' ', - 'emsp13' => ' ', - 'emsp14' => ' ', - 'ENG' => 'ÅŠ', - 'eng' => 'Å‹', - 'ensp' => ' ', - 'Eogon' => 'Ę', - 'eogon' => 'Ä™', - 'Eopf' => 'ð”¼', - 'eopf' => 'ð•–', - 'epar' => 'â‹•', - 'eparsl' => '⧣', - 'eplus' => '⩱', - 'epsi' => 'ε', - 'Epsilon' => 'Ε', - 'epsilon' => 'ε', - 'epsiv' => 'ϵ', - 'eqcirc' => '≖', - 'eqcolon' => '≕', - 'eqsim' => '≂', - 'eqslantgtr' => '⪖', - 'eqslantless' => '⪕', - 'Equal' => '⩵', - 'equals' => '=', - 'EqualTilde' => '≂', - 'equest' => '≟', - 'Equilibrium' => '⇌', - 'equiv' => '≡', - 'equivDD' => '⩸', - 'eqvparsl' => '⧥', - 'erarr' => '⥱', - 'erDot' => '≓', - 'Escr' => 'â„°', - 'escr' => 'ℯ', - 'esdot' => 'â‰', - 'Esim' => '⩳', - 'esim' => '≂', - 'Eta' => 'Η', - 'eta' => 'η', - 'ETH' => 'Ã', - 'ET' => 'Ã', - 'eth' => 'ð', - 'et' => 'ð', - 'Euml' => 'Ë', - 'Eum' => 'Ë', - 'euml' => 'ë', - 'eum' => 'ë', - 'euro' => '€', - 'excl' => '!', - 'exist' => '∃', - 'Exists' => '∃', - 'expectation' => 'â„°', - 'ExponentialE' => 'â…‡', - 'exponentiale' => 'â…‡', - 'fallingdotseq' => '≒', - 'Fcy' => 'Ф', - 'fcy' => 'Ñ„', - 'female' => '♀', - 'ffilig' => 'ffi', - 'fflig' => 'ff', - 'ffllig' => 'ffl', - 'Ffr' => 'ð”‰', - 'ffr' => 'ð”£', - 
'filig' => 'ï¬', - 'FilledSmallSquare' => 'â—¼', - 'FilledVerySmallSquare' => 'â–ª', - 'fjlig' => 'fj', - 'flat' => 'â™­', - 'fllig' => 'fl', - 'fltns' => 'â–±', - 'fnof' => 'Æ’', - 'Fopf' => 'ð”½', - 'fopf' => 'ð•—', - 'ForAll' => '∀', - 'forall' => '∀', - 'fork' => 'â‹”', - 'forkv' => 'â«™', - 'Fouriertrf' => 'ℱ', - 'fpartint' => 'â¨', - 'frac12' => '½', - 'frac1' => '¼', - 'frac13' => 'â…“', - 'frac14' => '¼', - 'frac15' => 'â…•', - 'frac16' => 'â…™', - 'frac18' => 'â…›', - 'frac23' => 'â…”', - 'frac25' => 'â…–', - 'frac34' => '¾', - 'frac3' => '¾', - 'frac35' => 'â…—', - 'frac38' => 'â…œ', - 'frac45' => 'â…˜', - 'frac56' => 'â…š', - 'frac58' => 'â…', - 'frac78' => 'â…ž', - 'frasl' => 'â„', - 'frown' => '⌢', - 'Fscr' => 'ℱ', - 'fscr' => 'ð’»', - 'gacute' => 'ǵ', - 'Gamma' => 'Γ', - 'gamma' => 'γ', - 'Gammad' => 'Ïœ', - 'gammad' => 'Ï', - 'gap' => '⪆', - 'Gbreve' => 'Äž', - 'gbreve' => 'ÄŸ', - 'Gcedil' => 'Ä¢', - 'Gcirc' => 'Äœ', - 'gcirc' => 'Ä', - 'Gcy' => 'Г', - 'gcy' => 'г', - 'Gdot' => 'Ä ', - 'gdot' => 'Ä¡', - 'gE' => '≧', - 'ge' => '≥', - 'gEl' => '⪌', - 'gel' => 'â‹›', - 'geq' => '≥', - 'geqq' => '≧', - 'geqslant' => '⩾', - 'ges' => '⩾', - 'gescc' => '⪩', - 'gesdot' => '⪀', - 'gesdoto' => '⪂', - 'gesdotol' => '⪄', - 'gesl' => '⋛︀', - 'gesles' => '⪔', - 'Gfr' => 'ð”Š', - 'gfr' => 'ð”¤', - 'Gg' => 'â‹™', - 'gg' => '≫', - 'ggg' => 'â‹™', - 'gimel' => 'â„·', - 'GJcy' => 'Ѓ', - 'gjcy' => 'Ñ“', - 'gl' => '≷', - 'gla' => '⪥', - 'glE' => '⪒', - 'glj' => '⪤', - 'gnap' => '⪊', - 'gnapprox' => '⪊', - 'gnE' => '≩', - 'gne' => '⪈', - 'gneq' => '⪈', - 'gneqq' => '≩', - 'gnsim' => '⋧', - 'Gopf' => 'ð”¾', - 'gopf' => 'ð•˜', - 'grave' => '`', - 'GreaterEqual' => '≥', - 'GreaterEqualLess' => 'â‹›', - 'GreaterFullEqual' => '≧', - 'GreaterGreater' => '⪢', - 'GreaterLess' => '≷', - 'GreaterSlantEqual' => '⩾', - 'GreaterTilde' => '≳', - 'Gscr' => 'ð’¢', - 'gscr' => 'â„Š', - 'gsim' => '≳', - 'gsime' => '⪎', - 'gsiml' => 'âª', - 'GT' => '>', - 'G' => '>', - 'Gt' => '≫', - 'gt' 
=> '>', - 'g' => '>', - 'gtcc' => '⪧', - 'gtcir' => '⩺', - 'gtdot' => 'â‹—', - 'gtlPar' => '⦕', - 'gtquest' => '⩼', - 'gtrapprox' => '⪆', - 'gtrarr' => '⥸', - 'gtrdot' => 'â‹—', - 'gtreqless' => 'â‹›', - 'gtreqqless' => '⪌', - 'gtrless' => '≷', - 'gtrsim' => '≳', - 'gvertneqq' => '≩︀', - 'gvnE' => '≩︀', - 'Hacek' => 'ˇ', - 'hairsp' => ' ', - 'half' => '½', - 'hamilt' => 'â„‹', - 'HARDcy' => 'Ъ', - 'hardcy' => 'ÑŠ', - 'hArr' => '⇔', - 'harr' => '↔', - 'harrcir' => '⥈', - 'harrw' => '↭', - 'Hat' => '^', - 'hbar' => 'â„', - 'Hcirc' => 'Ĥ', - 'hcirc' => 'Ä¥', - 'hearts' => '♥', - 'heartsuit' => '♥', - 'hellip' => '…', - 'hercon' => '⊹', - 'Hfr' => 'â„Œ', - 'hfr' => 'ð”¥', - 'HilbertSpace' => 'â„‹', - 'hksearow' => '⤥', - 'hkswarow' => '⤦', - 'hoarr' => '⇿', - 'homtht' => '∻', - 'hookleftarrow' => '↩', - 'hookrightarrow' => '↪', - 'Hopf' => 'â„', - 'hopf' => 'ð•™', - 'horbar' => '―', - 'HorizontalLine' => '─', - 'Hscr' => 'â„‹', - 'hscr' => 'ð’½', - 'hslash' => 'â„', - 'Hstrok' => 'Ħ', - 'hstrok' => 'ħ', - 'HumpDownHump' => '≎', - 'HumpEqual' => 'â‰', - 'hybull' => 'âƒ', - 'hyphen' => 'â€', - 'Iacute' => 'Ã', - 'Iacut' => 'Ã', - 'iacute' => 'í', - 'iacut' => 'í', - 'ic' => 'â£', - 'Icirc' => 'ÃŽ', - 'Icir' => 'ÃŽ', - 'icirc' => 'î', - 'icir' => 'î', - 'Icy' => 'И', - 'icy' => 'и', - 'Idot' => 'Ä°', - 'IEcy' => 'Е', - 'iecy' => 'е', - 'iexcl' => '¡', - 'iexc' => '¡', - 'iff' => '⇔', - 'Ifr' => 'â„‘', - 'ifr' => 'ð”¦', - 'Igrave' => 'ÃŒ', - 'Igrav' => 'ÃŒ', - 'igrave' => 'ì', - 'igrav' => 'ì', - 'ii' => 'â…ˆ', - 'iiiint' => '⨌', - 'iiint' => '∭', - 'iinfin' => '⧜', - 'iiota' => 'â„©', - 'IJlig' => 'IJ', - 'ijlig' => 'ij', - 'Im' => 'â„‘', - 'Imacr' => 'Ī', - 'imacr' => 'Ä«', - 'image' => 'â„‘', - 'ImaginaryI' => 'â…ˆ', - 'imagline' => 'â„', - 'imagpart' => 'â„‘', - 'imath' => 'ı', - 'imof' => '⊷', - 'imped' => 'Ƶ', - 'Implies' => '⇒', - 'in' => '∈', - 'incare' => 'â„…', - 'infin' => '∞', - 'infintie' => 'â§', - 'inodot' => 'ı', - 'Int' => '∬', - 'int' => '∫', - 'intcal' 
=> '⊺', - 'integers' => 'ℤ', - 'Integral' => '∫', - 'intercal' => '⊺', - 'Intersection' => 'â‹‚', - 'intlarhk' => '⨗', - 'intprod' => '⨼', - 'InvisibleComma' => 'â£', - 'InvisibleTimes' => 'â¢', - 'IOcy' => 'Ð', - 'iocy' => 'Ñ‘', - 'Iogon' => 'Ä®', - 'iogon' => 'į', - 'Iopf' => 'ð•€', - 'iopf' => 'ð•š', - 'Iota' => 'Ι', - 'iota' => 'ι', - 'iprod' => '⨼', - 'iquest' => '¿', - 'iques' => '¿', - 'Iscr' => 'â„', - 'iscr' => 'ð’¾', - 'isin' => '∈', - 'isindot' => '⋵', - 'isinE' => '⋹', - 'isins' => 'â‹´', - 'isinsv' => '⋳', - 'isinv' => '∈', - 'it' => 'â¢', - 'Itilde' => 'Ĩ', - 'itilde' => 'Ä©', - 'Iukcy' => 'І', - 'iukcy' => 'Ñ–', - 'Iuml' => 'Ã', - 'Ium' => 'Ã', - 'iuml' => 'ï', - 'ium' => 'ï', - 'Jcirc' => 'Ä´', - 'jcirc' => 'ĵ', - 'Jcy' => 'Й', - 'jcy' => 'й', - 'Jfr' => 'ð”', - 'jfr' => 'ð”§', - 'jmath' => 'È·', - 'Jopf' => 'ð•', - 'jopf' => 'ð•›', - 'Jscr' => 'ð’¥', - 'jscr' => 'ð’¿', - 'Jsercy' => 'Ј', - 'jsercy' => 'ј', - 'Jukcy' => 'Є', - 'jukcy' => 'Ñ”', - 'Kappa' => 'Κ', - 'kappa' => 'κ', - 'kappav' => 'Ï°', - 'Kcedil' => 'Ķ', - 'kcedil' => 'Ä·', - 'Kcy' => 'К', - 'kcy' => 'к', - 'Kfr' => 'ð”Ž', - 'kfr' => 'ð”¨', - 'kgreen' => 'ĸ', - 'KHcy' => 'Ð¥', - 'khcy' => 'Ñ…', - 'KJcy' => 'ÐŒ', - 'kjcy' => 'Ñœ', - 'Kopf' => 'ð•‚', - 'kopf' => 'ð•œ', - 'Kscr' => 'ð’¦', - 'kscr' => 'ð“€', - 'lAarr' => '⇚', - 'Lacute' => 'Ĺ', - 'lacute' => 'ĺ', - 'laemptyv' => '⦴', - 'lagran' => 'â„’', - 'Lambda' => 'Λ', - 'lambda' => 'λ', - 'Lang' => '⟪', - 'lang' => '⟨', - 'langd' => '⦑', - 'langle' => '⟨', - 'lap' => '⪅', - 'Laplacetrf' => 'â„’', - 'laquo' => '«', - 'laqu' => '«', - 'Larr' => '↞', - 'lArr' => 'â‡', - 'larr' => 'â†', - 'larrb' => '⇤', - 'larrbfs' => '⤟', - 'larrfs' => 'â¤', - 'larrhk' => '↩', - 'larrlp' => '↫', - 'larrpl' => '⤹', - 'larrsim' => '⥳', - 'larrtl' => '↢', - 'lat' => '⪫', - 'lAtail' => '⤛', - 'latail' => '⤙', - 'late' => '⪭', - 'lates' => '⪭︀', - 'lBarr' => '⤎', - 'lbarr' => '⤌', - 'lbbrk' => 'â²', - 'lbrace' => '{', - 'lbrack' => '[', - 'lbrke' => '⦋', - 
'lbrksld' => 'â¦', - 'lbrkslu' => 'â¦', - 'Lcaron' => 'Ľ', - 'lcaron' => 'ľ', - 'Lcedil' => 'Ä»', - 'lcedil' => 'ļ', - 'lceil' => '⌈', - 'lcub' => '{', - 'Lcy' => 'Л', - 'lcy' => 'л', - 'ldca' => '⤶', - 'ldquo' => '“', - 'ldquor' => '„', - 'ldrdhar' => '⥧', - 'ldrushar' => '⥋', - 'ldsh' => '↲', - 'lE' => '≦', - 'le' => '≤', - 'LeftAngleBracket' => '⟨', - 'LeftArrow' => 'â†', - 'Leftarrow' => 'â‡', - 'leftarrow' => 'â†', - 'LeftArrowBar' => '⇤', - 'LeftArrowRightArrow' => '⇆', - 'leftarrowtail' => '↢', - 'LeftCeiling' => '⌈', - 'LeftDoubleBracket' => '⟦', - 'LeftDownTeeVector' => '⥡', - 'LeftDownVector' => '⇃', - 'LeftDownVectorBar' => '⥙', - 'LeftFloor' => '⌊', - 'leftharpoondown' => '↽', - 'leftharpoonup' => '↼', - 'leftleftarrows' => '⇇', - 'LeftRightArrow' => '↔', - 'Leftrightarrow' => '⇔', - 'leftrightarrow' => '↔', - 'leftrightarrows' => '⇆', - 'leftrightharpoons' => '⇋', - 'leftrightsquigarrow' => '↭', - 'LeftRightVector' => '⥎', - 'LeftTee' => '⊣', - 'LeftTeeArrow' => '↤', - 'LeftTeeVector' => '⥚', - 'leftthreetimes' => 'â‹‹', - 'LeftTriangle' => '⊲', - 'LeftTriangleBar' => 'â§', - 'LeftTriangleEqual' => '⊴', - 'LeftUpDownVector' => '⥑', - 'LeftUpTeeVector' => '⥠', - 'LeftUpVector' => '↿', - 'LeftUpVectorBar' => '⥘', - 'LeftVector' => '↼', - 'LeftVectorBar' => '⥒', - 'lEg' => '⪋', - 'leg' => 'â‹š', - 'leq' => '≤', - 'leqq' => '≦', - 'leqslant' => '⩽', - 'les' => '⩽', - 'lescc' => '⪨', - 'lesdot' => 'â©¿', - 'lesdoto' => 'âª', - 'lesdotor' => '⪃', - 'lesg' => '⋚︀', - 'lesges' => '⪓', - 'lessapprox' => '⪅', - 'lessdot' => 'â‹–', - 'lesseqgtr' => 'â‹š', - 'lesseqqgtr' => '⪋', - 'LessEqualGreater' => 'â‹š', - 'LessFullEqual' => '≦', - 'LessGreater' => '≶', - 'lessgtr' => '≶', - 'LessLess' => '⪡', - 'lesssim' => '≲', - 'LessSlantEqual' => '⩽', - 'LessTilde' => '≲', - 'lfisht' => '⥼', - 'lfloor' => '⌊', - 'Lfr' => 'ð”', - 'lfr' => 'ð”©', - 'lg' => '≶', - 'lgE' => '⪑', - 'lHar' => '⥢', - 'lhard' => '↽', - 'lharu' => '↼', - 'lharul' => '⥪', - 'lhblk' => 'â–„', - 
'LJcy' => 'Љ', - 'ljcy' => 'Ñ™', - 'Ll' => '⋘', - 'll' => '≪', - 'llarr' => '⇇', - 'llcorner' => '⌞', - 'Lleftarrow' => '⇚', - 'llhard' => '⥫', - 'lltri' => 'â—º', - 'Lmidot' => 'Ä¿', - 'lmidot' => 'Å€', - 'lmoust' => '⎰', - 'lmoustache' => '⎰', - 'lnap' => '⪉', - 'lnapprox' => '⪉', - 'lnE' => '≨', - 'lne' => '⪇', - 'lneq' => '⪇', - 'lneqq' => '≨', - 'lnsim' => '⋦', - 'loang' => '⟬', - 'loarr' => '⇽', - 'lobrk' => '⟦', - 'LongLeftArrow' => '⟵', - 'Longleftarrow' => '⟸', - 'longleftarrow' => '⟵', - 'LongLeftRightArrow' => '⟷', - 'Longleftrightarrow' => '⟺', - 'longleftrightarrow' => '⟷', - 'longmapsto' => '⟼', - 'LongRightArrow' => '⟶', - 'Longrightarrow' => '⟹', - 'longrightarrow' => '⟶', - 'looparrowleft' => '↫', - 'looparrowright' => '↬', - 'lopar' => '⦅', - 'Lopf' => 'ð•ƒ', - 'lopf' => 'ð•', - 'loplus' => '⨭', - 'lotimes' => '⨴', - 'lowast' => '∗', - 'lowbar' => '_', - 'LowerLeftArrow' => '↙', - 'LowerRightArrow' => '↘', - 'loz' => 'â—Š', - 'lozenge' => 'â—Š', - 'lozf' => '⧫', - 'lpar' => '(', - 'lparlt' => '⦓', - 'lrarr' => '⇆', - 'lrcorner' => '⌟', - 'lrhar' => '⇋', - 'lrhard' => '⥭', - 'lrm' => '‎', - 'lrtri' => '⊿', - 'lsaquo' => '‹', - 'Lscr' => 'â„’', - 'lscr' => 'ð“', - 'Lsh' => '↰', - 'lsh' => '↰', - 'lsim' => '≲', - 'lsime' => 'âª', - 'lsimg' => 'âª', - 'lsqb' => '[', - 'lsquo' => '‘', - 'lsquor' => '‚', - 'Lstrok' => 'Å', - 'lstrok' => 'Å‚', - 'LT' => '<', - 'L' => '<', - 'Lt' => '≪', - 'lt' => '<', - 'l' => '<', - 'ltcc' => '⪦', - 'ltcir' => '⩹', - 'ltdot' => 'â‹–', - 'lthree' => 'â‹‹', - 'ltimes' => '⋉', - 'ltlarr' => '⥶', - 'ltquest' => 'â©»', - 'ltri' => 'â—ƒ', - 'ltrie' => '⊴', - 'ltrif' => 'â—‚', - 'ltrPar' => '⦖', - 'lurdshar' => '⥊', - 'luruhar' => '⥦', - 'lvertneqq' => '≨︀', - 'lvnE' => '≨︀', - 'macr' => '¯', - 'mac' => '¯', - 'male' => '♂', - 'malt' => '✠', - 'maltese' => '✠', - 'Map' => '⤅', - 'map' => '↦', - 'mapsto' => '↦', - 'mapstodown' => '↧', - 'mapstoleft' => '↤', - 'mapstoup' => '↥', - 'marker' => 'â–®', - 'mcomma' => '⨩', - 'Mcy' => 
'Ðœ', - 'mcy' => 'м', - 'mdash' => '—', - 'mDDot' => '∺', - 'measuredangle' => '∡', - 'MediumSpace' => 'âŸ', - 'Mellintrf' => 'ℳ', - 'Mfr' => 'ð”', - 'mfr' => 'ð”ª', - 'mho' => '℧', - 'micro' => 'µ', - 'micr' => 'µ', - 'mid' => '∣', - 'midast' => '*', - 'midcir' => 'â«°', - 'middot' => '·', - 'middo' => '·', - 'minus' => '−', - 'minusb' => '⊟', - 'minusd' => '∸', - 'minusdu' => '⨪', - 'MinusPlus' => '∓', - 'mlcp' => 'â«›', - 'mldr' => '…', - 'mnplus' => '∓', - 'models' => '⊧', - 'Mopf' => 'ð•„', - 'mopf' => 'ð•ž', - 'mp' => '∓', - 'Mscr' => 'ℳ', - 'mscr' => 'ð“‚', - 'mstpos' => '∾', - 'Mu' => 'Îœ', - 'mu' => 'μ', - 'multimap' => '⊸', - 'mumap' => '⊸', - 'nabla' => '∇', - 'Nacute' => 'Ń', - 'nacute' => 'Å„', - 'nang' => '∠⃒', - 'nap' => '≉', - 'napE' => '⩰̸', - 'napid' => '≋̸', - 'napos' => 'ʼn', - 'napprox' => '≉', - 'natur' => 'â™®', - 'natural' => 'â™®', - 'naturals' => 'â„•', - 'nbsp' => ' ', - 'nbs' => ' ', - 'nbump' => '≎̸', - 'nbumpe' => 'â‰Ì¸', - 'ncap' => '⩃', - 'Ncaron' => 'Ň', - 'ncaron' => 'ň', - 'Ncedil' => 'Å…', - 'ncedil' => 'ņ', - 'ncong' => '≇', - 'ncongdot' => '⩭̸', - 'ncup' => 'â©‚', - 'Ncy' => 'Ð', - 'ncy' => 'н', - 'ndash' => '–', - 'ne' => '≠', - 'nearhk' => '⤤', - 'neArr' => '⇗', - 'nearr' => '↗', - 'nearrow' => '↗', - 'nedot' => 'â‰Ì¸', - 'NegativeMediumSpace' => '​', - 'NegativeThickSpace' => '​', - 'NegativeThinSpace' => '​', - 'NegativeVeryThinSpace' => '​', - 'nequiv' => '≢', - 'nesear' => '⤨', - 'nesim' => '≂̸', - 'NestedGreaterGreater' => '≫', - 'NestedLessLess' => '≪', - 'NewLine' => ' +namespace Masterminds\HTML5; + +/** + * Entity lookup tables. + * This class is automatically generated. 
+ */ +class Entities +{ + + public static $byName = array( + 'Aacute' => 'Ã', + 'Aacut' => 'Ã', + 'aacute' => 'á', + 'aacut' => 'á', + 'Abreve' => 'Ä‚', + 'abreve' => 'ă', + 'ac' => '∾', + 'acd' => '∿', + 'acE' => '∾̳', + 'Acirc' => 'Â', + 'Acir' => 'Â', + 'acirc' => 'â', + 'acir' => 'â', + 'acute' => '´', + 'acut' => '´', + 'Acy' => 'Ð', + 'acy' => 'а', + 'AElig' => 'Æ', + 'AEli' => 'Æ', + 'aelig' => 'æ', + 'aeli' => 'æ', + 'af' => 'â¡', + 'Afr' => 'ð”„', + 'afr' => 'ð”ž', + 'Agrave' => 'À', + 'Agrav' => 'À', + 'agrave' => 'à', + 'agrav' => 'à', + 'alefsym' => 'ℵ', + 'aleph' => 'ℵ', + 'Alpha' => 'Α', + 'alpha' => 'α', + 'Amacr' => 'Ä€', + 'amacr' => 'Ä', + 'amalg' => '⨿', + 'AMP' => '&', + 'AM' => '&', + 'amp' => '&', + 'am' => '&', + 'And' => 'â©“', + 'and' => '∧', + 'andand' => 'â©•', + 'andd' => 'â©œ', + 'andslope' => '⩘', + 'andv' => 'â©š', + 'ang' => '∠', + 'ange' => '⦤', + 'angle' => '∠', + 'angmsd' => '∡', + 'angmsdaa' => '⦨', + 'angmsdab' => '⦩', + 'angmsdac' => '⦪', + 'angmsdad' => '⦫', + 'angmsdae' => '⦬', + 'angmsdaf' => '⦭', + 'angmsdag' => '⦮', + 'angmsdah' => '⦯', + 'angrt' => '∟', + 'angrtvb' => '⊾', + 'angrtvbd' => 'â¦', + 'angsph' => '∢', + 'angst' => 'Ã…', + 'angzarr' => 'â¼', + 'Aogon' => 'Ä„', + 'aogon' => 'Ä…', + 'Aopf' => 'ð”¸', + 'aopf' => 'ð•’', + 'ap' => '≈', + 'apacir' => '⩯', + 'apE' => 'â©°', + 'ape' => '≊', + 'apid' => '≋', + 'apos' => '\'', + 'ApplyFunction' => 'â¡', + 'approx' => '≈', + 'approxeq' => '≊', + 'Aring' => 'Ã…', + 'Arin' => 'Ã…', + 'aring' => 'Ã¥', + 'arin' => 'Ã¥', + 'Ascr' => 'ð’œ', + 'ascr' => 'ð’¶', + 'Assign' => '≔', + 'ast' => '*', + 'asymp' => '≈', + 'asympeq' => 'â‰', + 'Atilde' => 'Ã', + 'Atild' => 'Ã', + 'atilde' => 'ã', + 'atild' => 'ã', + 'Auml' => 'Ä', + 'Aum' => 'Ä', + 'auml' => 'ä', + 'aum' => 'ä', + 'awconint' => '∳', + 'awint' => '⨑', + 'backcong' => '≌', + 'backepsilon' => '϶', + 'backprime' => '‵', + 'backsim' => '∽', + 'backsimeq' => 'â‹', + 'Backslash' => '∖', + 'Barv' => '⫧', + 'barvee' => '⊽', + 
'Barwed' => '⌆', + 'barwed' => '⌅', + 'barwedge' => '⌅', + 'bbrk' => '⎵', + 'bbrktbrk' => '⎶', + 'bcong' => '≌', + 'Bcy' => 'Б', + 'bcy' => 'б', + 'bdquo' => '„', + 'becaus' => '∵', + 'Because' => '∵', + 'because' => '∵', + 'bemptyv' => '⦰', + 'bepsi' => '϶', + 'bernou' => 'ℬ', + 'Bernoullis' => 'ℬ', + 'Beta' => 'Î’', + 'beta' => 'β', + 'beth' => 'ℶ', + 'between' => '≬', + 'Bfr' => 'ð”…', + 'bfr' => 'ð”Ÿ', + 'bigcap' => 'â‹‚', + 'bigcirc' => 'â—¯', + 'bigcup' => '⋃', + 'bigodot' => '⨀', + 'bigoplus' => 'â¨', + 'bigotimes' => '⨂', + 'bigsqcup' => '⨆', + 'bigstar' => '★', + 'bigtriangledown' => 'â–½', + 'bigtriangleup' => 'â–³', + 'biguplus' => '⨄', + 'bigvee' => 'â‹', + 'bigwedge' => 'â‹€', + 'bkarow' => 'â¤', + 'blacklozenge' => '⧫', + 'blacksquare' => 'â–ª', + 'blacktriangle' => 'â–´', + 'blacktriangledown' => 'â–¾', + 'blacktriangleleft' => 'â—‚', + 'blacktriangleright' => 'â–¸', + 'blank' => 'â£', + 'blk12' => 'â–’', + 'blk14' => 'â–‘', + 'blk34' => 'â–“', + 'block' => 'â–ˆ', + 'bne' => '=⃥', + 'bnequiv' => '≡⃥', + 'bNot' => 'â«­', + 'bnot' => 'âŒ', + 'Bopf' => 'ð”¹', + 'bopf' => 'ð•“', + 'bot' => '⊥', + 'bottom' => '⊥', + 'bowtie' => '⋈', + 'boxbox' => '⧉', + 'boxDL' => 'â•—', + 'boxDl' => 'â•–', + 'boxdL' => 'â••', + 'boxdl' => 'â”', + 'boxDR' => 'â•”', + 'boxDr' => 'â•“', + 'boxdR' => 'â•’', + 'boxdr' => '┌', + 'boxH' => 'â•', + 'boxh' => '─', + 'boxHD' => '╦', + 'boxHd' => '╤', + 'boxhD' => 'â•¥', + 'boxhd' => '┬', + 'boxHU' => 'â•©', + 'boxHu' => '╧', + 'boxhU' => '╨', + 'boxhu' => 'â”´', + 'boxminus' => '⊟', + 'boxplus' => '⊞', + 'boxtimes' => '⊠', + 'boxUL' => 'â•', + 'boxUl' => 'â•œ', + 'boxuL' => 'â•›', + 'boxul' => '┘', + 'boxUR' => 'â•š', + 'boxUr' => 'â•™', + 'boxuR' => '╘', + 'boxur' => 'â””', + 'boxV' => 'â•‘', + 'boxv' => '│', + 'boxVH' => '╬', + 'boxVh' => 'â•«', + 'boxvH' => '╪', + 'boxvh' => '┼', + 'boxVL' => 'â•£', + 'boxVl' => 'â•¢', + 'boxvL' => 'â•¡', + 'boxvl' => '┤', + 'boxVR' => 'â• ', + 'boxVr' => 'â•Ÿ', + 'boxvR' => 'â•ž', + 'boxvr' => 
'├', + 'bprime' => '‵', + 'Breve' => '˘', + 'breve' => '˘', + 'brvbar' => '¦', + 'brvba' => '¦', + 'Bscr' => 'ℬ', + 'bscr' => 'ð’·', + 'bsemi' => 'â', + 'bsim' => '∽', + 'bsime' => 'â‹', + 'bsol' => '\\', + 'bsolb' => '⧅', + 'bsolhsub' => '⟈', + 'bull' => '•', + 'bullet' => '•', + 'bump' => '≎', + 'bumpE' => '⪮', + 'bumpe' => 'â‰', + 'Bumpeq' => '≎', + 'bumpeq' => 'â‰', + 'Cacute' => 'Ć', + 'cacute' => 'ć', + 'Cap' => 'â‹’', + 'cap' => '∩', + 'capand' => 'â©„', + 'capbrcup' => '⩉', + 'capcap' => 'â©‹', + 'capcup' => '⩇', + 'capdot' => 'â©€', + 'CapitalDifferentialD' => 'â……', + 'caps' => '∩︀', + 'caret' => 'â', + 'caron' => 'ˇ', + 'Cayleys' => 'â„­', + 'ccaps' => 'â©', + 'Ccaron' => 'ÄŒ', + 'ccaron' => 'Ä', + 'Ccedil' => 'Ç', + 'Ccedi' => 'Ç', + 'ccedil' => 'ç', + 'ccedi' => 'ç', + 'Ccirc' => 'Ĉ', + 'ccirc' => 'ĉ', + 'Cconint' => '∰', + 'ccups' => 'â©Œ', + 'ccupssm' => 'â©', + 'Cdot' => 'ÄŠ', + 'cdot' => 'Ä‹', + 'cedil' => '¸', + 'cedi' => '¸', + 'Cedilla' => '¸', + 'cemptyv' => '⦲', + 'cent' => '¢', + 'cen' => '¢', + 'CenterDot' => '·', + 'centerdot' => '·', + 'Cfr' => 'â„­', + 'cfr' => 'ð” ', + 'CHcy' => 'Ч', + 'chcy' => 'ч', + 'check' => '✓', + 'checkmark' => '✓', + 'Chi' => 'Χ', + 'chi' => 'χ', + 'cir' => 'â—‹', + 'circ' => 'ˆ', + 'circeq' => '≗', + 'circlearrowleft' => '↺', + 'circlearrowright' => '↻', + 'circledast' => '⊛', + 'circledcirc' => '⊚', + 'circleddash' => 'âŠ', + 'CircleDot' => '⊙', + 'circledR' => '®', + 'circledS' => 'Ⓢ', + 'CircleMinus' => '⊖', + 'CirclePlus' => '⊕', + 'CircleTimes' => '⊗', + 'cirE' => '⧃', + 'cire' => '≗', + 'cirfnint' => 'â¨', + 'cirmid' => '⫯', + 'cirscir' => '⧂', + 'ClockwiseContourIntegral' => '∲', + 'CloseCurlyDoubleQuote' => 'â€', + 'CloseCurlyQuote' => '’', + 'clubs' => '♣', + 'clubsuit' => '♣', + 'Colon' => '∷', + 'colon' => ':', + 'Colone' => 'â©´', + 'colone' => '≔', + 'coloneq' => '≔', + 'comma' => ',', + 'commat' => '@', + 'comp' => 'âˆ', + 'compfn' => '∘', + 'complement' => 'âˆ', + 'complexes' => 'â„‚', + 'cong' => 
'≅', + 'congdot' => 'â©­', + 'Congruent' => '≡', + 'Conint' => '∯', + 'conint' => '∮', + 'ContourIntegral' => '∮', + 'Copf' => 'â„‚', + 'copf' => 'ð•”', + 'coprod' => 'âˆ', + 'Coproduct' => 'âˆ', + 'COPY' => '©', + 'COP' => '©', + 'copy' => '©', + 'cop' => '©', + 'copysr' => 'â„—', + 'CounterClockwiseContourIntegral' => '∳', + 'crarr' => '↵', + 'Cross' => '⨯', + 'cross' => '✗', + 'Cscr' => 'ð’ž', + 'cscr' => 'ð’¸', + 'csub' => 'â«', + 'csube' => 'â«‘', + 'csup' => 'â«', + 'csupe' => 'â«’', + 'ctdot' => '⋯', + 'cudarrl' => '⤸', + 'cudarrr' => '⤵', + 'cuepr' => 'â‹ž', + 'cuesc' => 'â‹Ÿ', + 'cularr' => '↶', + 'cularrp' => '⤽', + 'Cup' => 'â‹“', + 'cup' => '∪', + 'cupbrcap' => '⩈', + 'CupCap' => 'â‰', + 'cupcap' => '⩆', + 'cupcup' => 'â©Š', + 'cupdot' => 'âŠ', + 'cupor' => 'â©…', + 'cups' => '∪︀', + 'curarr' => '↷', + 'curarrm' => '⤼', + 'curlyeqprec' => 'â‹ž', + 'curlyeqsucc' => 'â‹Ÿ', + 'curlyvee' => 'â‹Ž', + 'curlywedge' => 'â‹', + 'curren' => '¤', + 'curre' => '¤', + 'curvearrowleft' => '↶', + 'curvearrowright' => '↷', + 'cuvee' => 'â‹Ž', + 'cuwed' => 'â‹', + 'cwconint' => '∲', + 'cwint' => '∱', + 'cylcty' => '⌭', + 'Dagger' => '‡', + 'dagger' => '†', + 'daleth' => 'ℸ', + 'Darr' => '↡', + 'dArr' => '⇓', + 'darr' => '↓', + 'dash' => 'â€', + 'Dashv' => '⫤', + 'dashv' => '⊣', + 'dbkarow' => 'â¤', + 'dblac' => 'Ë', + 'Dcaron' => 'ÄŽ', + 'dcaron' => 'Ä', + 'Dcy' => 'Д', + 'dcy' => 'д', + 'DD' => 'â……', + 'dd' => 'â…†', + 'ddagger' => '‡', + 'ddarr' => '⇊', + 'DDotrahd' => '⤑', + 'ddotseq' => 'â©·', + 'deg' => '°', + 'de' => '°', + 'Del' => '∇', + 'Delta' => 'Δ', + 'delta' => 'δ', + 'demptyv' => '⦱', + 'dfisht' => '⥿', + 'Dfr' => 'ð”‡', + 'dfr' => 'ð”¡', + 'dHar' => '⥥', + 'dharl' => '⇃', + 'dharr' => '⇂', + 'DiacriticalAcute' => '´', + 'DiacriticalDot' => 'Ë™', + 'DiacriticalDoubleAcute' => 'Ë', + 'DiacriticalGrave' => '`', + 'DiacriticalTilde' => 'Ëœ', + 'diam' => 'â‹„', + 'Diamond' => 'â‹„', + 'diamond' => 'â‹„', + 'diamondsuit' => '♦', + 'diams' => '♦', + 'die' => 
'¨', + 'DifferentialD' => 'â…†', + 'digamma' => 'Ï', + 'disin' => '⋲', + 'div' => '÷', + 'divide' => '÷', + 'divid' => '÷', + 'divideontimes' => '⋇', + 'divonx' => '⋇', + 'DJcy' => 'Ђ', + 'djcy' => 'Ñ’', + 'dlcorn' => '⌞', + 'dlcrop' => 'âŒ', + 'dollar' => '$', + 'Dopf' => 'ð”»', + 'dopf' => 'ð••', + 'Dot' => '¨', + 'dot' => 'Ë™', + 'DotDot' => '⃜', + 'doteq' => 'â‰', + 'doteqdot' => '≑', + 'DotEqual' => 'â‰', + 'dotminus' => '∸', + 'dotplus' => '∔', + 'dotsquare' => '⊡', + 'doublebarwedge' => '⌆', + 'DoubleContourIntegral' => '∯', + 'DoubleDot' => '¨', + 'DoubleDownArrow' => '⇓', + 'DoubleLeftArrow' => 'â‡', + 'DoubleLeftRightArrow' => '⇔', + 'DoubleLeftTee' => '⫤', + 'DoubleLongLeftArrow' => '⟸', + 'DoubleLongLeftRightArrow' => '⟺', + 'DoubleLongRightArrow' => '⟹', + 'DoubleRightArrow' => '⇒', + 'DoubleRightTee' => '⊨', + 'DoubleUpArrow' => '⇑', + 'DoubleUpDownArrow' => '⇕', + 'DoubleVerticalBar' => '∥', + 'DownArrow' => '↓', + 'Downarrow' => '⇓', + 'downarrow' => '↓', + 'DownArrowBar' => '⤓', + 'DownArrowUpArrow' => '⇵', + 'DownBreve' => 'Ì‘', + 'downdownarrows' => '⇊', + 'downharpoonleft' => '⇃', + 'downharpoonright' => '⇂', + 'DownLeftRightVector' => 'â¥', + 'DownLeftTeeVector' => '⥞', + 'DownLeftVector' => '↽', + 'DownLeftVectorBar' => '⥖', + 'DownRightTeeVector' => '⥟', + 'DownRightVector' => 'â‡', + 'DownRightVectorBar' => '⥗', + 'DownTee' => '⊤', + 'DownTeeArrow' => '↧', + 'drbkarow' => 'â¤', + 'drcorn' => '⌟', + 'drcrop' => '⌌', + 'Dscr' => 'ð’Ÿ', + 'dscr' => 'ð’¹', + 'DScy' => 'Ð…', + 'dscy' => 'Ñ•', + 'dsol' => '⧶', + 'Dstrok' => 'Ä', + 'dstrok' => 'Ä‘', + 'dtdot' => '⋱', + 'dtri' => 'â–¿', + 'dtrif' => 'â–¾', + 'duarr' => '⇵', + 'duhar' => '⥯', + 'dwangle' => '⦦', + 'DZcy' => 'Ð', + 'dzcy' => 'ÑŸ', + 'dzigrarr' => '⟿', + 'Eacute' => 'É', + 'Eacut' => 'É', + 'eacute' => 'é', + 'eacut' => 'é', + 'easter' => 'â©®', + 'Ecaron' => 'Äš', + 'ecaron' => 'Ä›', + 'ecir' => 'ê', + 'Ecirc' => 'Ê', + 'Ecir' => 'Ê', + 'ecirc' => 'ê', + 'ecolon' => '≕', + 'Ecy' => 
'Э', + 'ecy' => 'Ñ', + 'eDDot' => 'â©·', + 'Edot' => 'Ä–', + 'eDot' => '≑', + 'edot' => 'Ä—', + 'ee' => 'â…‡', + 'efDot' => '≒', + 'Efr' => 'ð”ˆ', + 'efr' => 'ð”¢', + 'eg' => '⪚', + 'Egrave' => 'È', + 'Egrav' => 'È', + 'egrave' => 'è', + 'egrav' => 'è', + 'egs' => '⪖', + 'egsdot' => '⪘', + 'el' => '⪙', + 'Element' => '∈', + 'elinters' => 'â§', + 'ell' => 'â„“', + 'els' => '⪕', + 'elsdot' => '⪗', + 'Emacr' => 'Ä’', + 'emacr' => 'Ä“', + 'empty' => '∅', + 'emptyset' => '∅', + 'EmptySmallSquare' => 'â—»', + 'emptyv' => '∅', + 'EmptyVerySmallSquare' => 'â–«', + 'emsp' => ' ', + 'emsp13' => ' ', + 'emsp14' => ' ', + 'ENG' => 'ÅŠ', + 'eng' => 'Å‹', + 'ensp' => ' ', + 'Eogon' => 'Ę', + 'eogon' => 'Ä™', + 'Eopf' => 'ð”¼', + 'eopf' => 'ð•–', + 'epar' => 'â‹•', + 'eparsl' => '⧣', + 'eplus' => '⩱', + 'epsi' => 'ε', + 'Epsilon' => 'Ε', + 'epsilon' => 'ε', + 'epsiv' => 'ϵ', + 'eqcirc' => '≖', + 'eqcolon' => '≕', + 'eqsim' => '≂', + 'eqslantgtr' => '⪖', + 'eqslantless' => '⪕', + 'Equal' => '⩵', + 'equals' => '=', + 'EqualTilde' => '≂', + 'equest' => '≟', + 'Equilibrium' => '⇌', + 'equiv' => '≡', + 'equivDD' => '⩸', + 'eqvparsl' => '⧥', + 'erarr' => '⥱', + 'erDot' => '≓', + 'Escr' => 'â„°', + 'escr' => 'ℯ', + 'esdot' => 'â‰', + 'Esim' => '⩳', + 'esim' => '≂', + 'Eta' => 'Η', + 'eta' => 'η', + 'ETH' => 'Ã', + 'ET' => 'Ã', + 'eth' => 'ð', + 'et' => 'ð', + 'Euml' => 'Ë', + 'Eum' => 'Ë', + 'euml' => 'ë', + 'eum' => 'ë', + 'euro' => '€', + 'excl' => '!', + 'exist' => '∃', + 'Exists' => '∃', + 'expectation' => 'â„°', + 'ExponentialE' => 'â…‡', + 'exponentiale' => 'â…‡', + 'fallingdotseq' => '≒', + 'Fcy' => 'Ф', + 'fcy' => 'Ñ„', + 'female' => '♀', + 'ffilig' => 'ffi', + 'fflig' => 'ff', + 'ffllig' => 'ffl', + 'Ffr' => 'ð”‰', + 'ffr' => 'ð”£', + 'filig' => 'ï¬', + 'FilledSmallSquare' => 'â—¼', + 'FilledVerySmallSquare' => 'â–ª', + 'fjlig' => 'fj', + 'flat' => 'â™­', + 'fllig' => 'fl', + 'fltns' => 'â–±', + 'fnof' => 'Æ’', + 'Fopf' => 'ð”½', + 'fopf' => 'ð•—', + 'ForAll' => '∀', + 'forall' 
=> '∀', + 'fork' => 'â‹”', + 'forkv' => 'â«™', + 'Fouriertrf' => 'ℱ', + 'fpartint' => 'â¨', + 'frac12' => '½', + 'frac1' => '¼', + 'frac13' => 'â…“', + 'frac14' => '¼', + 'frac15' => 'â…•', + 'frac16' => 'â…™', + 'frac18' => 'â…›', + 'frac23' => 'â…”', + 'frac25' => 'â…–', + 'frac34' => '¾', + 'frac3' => '¾', + 'frac35' => 'â…—', + 'frac38' => 'â…œ', + 'frac45' => 'â…˜', + 'frac56' => 'â…š', + 'frac58' => 'â…', + 'frac78' => 'â…ž', + 'frasl' => 'â„', + 'frown' => '⌢', + 'Fscr' => 'ℱ', + 'fscr' => 'ð’»', + 'gacute' => 'ǵ', + 'Gamma' => 'Γ', + 'gamma' => 'γ', + 'Gammad' => 'Ïœ', + 'gammad' => 'Ï', + 'gap' => '⪆', + 'Gbreve' => 'Äž', + 'gbreve' => 'ÄŸ', + 'Gcedil' => 'Ä¢', + 'Gcirc' => 'Äœ', + 'gcirc' => 'Ä', + 'Gcy' => 'Г', + 'gcy' => 'г', + 'Gdot' => 'Ä ', + 'gdot' => 'Ä¡', + 'gE' => '≧', + 'ge' => '≥', + 'gEl' => '⪌', + 'gel' => 'â‹›', + 'geq' => '≥', + 'geqq' => '≧', + 'geqslant' => '⩾', + 'ges' => '⩾', + 'gescc' => '⪩', + 'gesdot' => '⪀', + 'gesdoto' => '⪂', + 'gesdotol' => '⪄', + 'gesl' => '⋛︀', + 'gesles' => '⪔', + 'Gfr' => 'ð”Š', + 'gfr' => 'ð”¤', + 'Gg' => 'â‹™', + 'gg' => '≫', + 'ggg' => 'â‹™', + 'gimel' => 'â„·', + 'GJcy' => 'Ѓ', + 'gjcy' => 'Ñ“', + 'gl' => '≷', + 'gla' => '⪥', + 'glE' => '⪒', + 'glj' => '⪤', + 'gnap' => '⪊', + 'gnapprox' => '⪊', + 'gnE' => '≩', + 'gne' => '⪈', + 'gneq' => '⪈', + 'gneqq' => '≩', + 'gnsim' => '⋧', + 'Gopf' => 'ð”¾', + 'gopf' => 'ð•˜', + 'grave' => '`', + 'GreaterEqual' => '≥', + 'GreaterEqualLess' => 'â‹›', + 'GreaterFullEqual' => '≧', + 'GreaterGreater' => '⪢', + 'GreaterLess' => '≷', + 'GreaterSlantEqual' => '⩾', + 'GreaterTilde' => '≳', + 'Gscr' => 'ð’¢', + 'gscr' => 'â„Š', + 'gsim' => '≳', + 'gsime' => '⪎', + 'gsiml' => 'âª', + 'GT' => '>', + 'G' => '>', + 'Gt' => '≫', + 'gt' => '>', + 'g' => '>', + 'gtcc' => '⪧', + 'gtcir' => '⩺', + 'gtdot' => 'â‹—', + 'gtlPar' => '⦕', + 'gtquest' => '⩼', + 'gtrapprox' => '⪆', + 'gtrarr' => '⥸', + 'gtrdot' => 'â‹—', + 'gtreqless' => 'â‹›', + 'gtreqqless' => '⪌', + 'gtrless' => '≷', + 
'gtrsim' => '≳', + 'gvertneqq' => '≩︀', + 'gvnE' => '≩︀', + 'Hacek' => 'ˇ', + 'hairsp' => ' ', + 'half' => '½', + 'hamilt' => 'â„‹', + 'HARDcy' => 'Ъ', + 'hardcy' => 'ÑŠ', + 'hArr' => '⇔', + 'harr' => '↔', + 'harrcir' => '⥈', + 'harrw' => '↭', + 'Hat' => '^', + 'hbar' => 'â„', + 'Hcirc' => 'Ĥ', + 'hcirc' => 'Ä¥', + 'hearts' => '♥', + 'heartsuit' => '♥', + 'hellip' => '…', + 'hercon' => '⊹', + 'Hfr' => 'â„Œ', + 'hfr' => 'ð”¥', + 'HilbertSpace' => 'â„‹', + 'hksearow' => '⤥', + 'hkswarow' => '⤦', + 'hoarr' => '⇿', + 'homtht' => '∻', + 'hookleftarrow' => '↩', + 'hookrightarrow' => '↪', + 'Hopf' => 'â„', + 'hopf' => 'ð•™', + 'horbar' => '―', + 'HorizontalLine' => '─', + 'Hscr' => 'â„‹', + 'hscr' => 'ð’½', + 'hslash' => 'â„', + 'Hstrok' => 'Ħ', + 'hstrok' => 'ħ', + 'HumpDownHump' => '≎', + 'HumpEqual' => 'â‰', + 'hybull' => 'âƒ', + 'hyphen' => 'â€', + 'Iacute' => 'Ã', + 'Iacut' => 'Ã', + 'iacute' => 'í', + 'iacut' => 'í', + 'ic' => 'â£', + 'Icirc' => 'ÃŽ', + 'Icir' => 'ÃŽ', + 'icirc' => 'î', + 'icir' => 'î', + 'Icy' => 'И', + 'icy' => 'и', + 'Idot' => 'Ä°', + 'IEcy' => 'Е', + 'iecy' => 'е', + 'iexcl' => '¡', + 'iexc' => '¡', + 'iff' => '⇔', + 'Ifr' => 'â„‘', + 'ifr' => 'ð”¦', + 'Igrave' => 'ÃŒ', + 'Igrav' => 'ÃŒ', + 'igrave' => 'ì', + 'igrav' => 'ì', + 'ii' => 'â…ˆ', + 'iiiint' => '⨌', + 'iiint' => '∭', + 'iinfin' => '⧜', + 'iiota' => 'â„©', + 'IJlig' => 'IJ', + 'ijlig' => 'ij', + 'Im' => 'â„‘', + 'Imacr' => 'Ī', + 'imacr' => 'Ä«', + 'image' => 'â„‘', + 'ImaginaryI' => 'â…ˆ', + 'imagline' => 'â„', + 'imagpart' => 'â„‘', + 'imath' => 'ı', + 'imof' => '⊷', + 'imped' => 'Ƶ', + 'Implies' => '⇒', + 'in' => '∈', + 'incare' => 'â„…', + 'infin' => '∞', + 'infintie' => 'â§', + 'inodot' => 'ı', + 'Int' => '∬', + 'int' => '∫', + 'intcal' => '⊺', + 'integers' => 'ℤ', + 'Integral' => '∫', + 'intercal' => '⊺', + 'Intersection' => 'â‹‚', + 'intlarhk' => '⨗', + 'intprod' => '⨼', + 'InvisibleComma' => 'â£', + 'InvisibleTimes' => 'â¢', + 'IOcy' => 'Ð', + 'iocy' => 'Ñ‘', + 'Iogon' => 'Ä®', 
+ 'iogon' => 'į', + 'Iopf' => 'ð•€', + 'iopf' => 'ð•š', + 'Iota' => 'Ι', + 'iota' => 'ι', + 'iprod' => '⨼', + 'iquest' => '¿', + 'iques' => '¿', + 'Iscr' => 'â„', + 'iscr' => 'ð’¾', + 'isin' => '∈', + 'isindot' => '⋵', + 'isinE' => '⋹', + 'isins' => 'â‹´', + 'isinsv' => '⋳', + 'isinv' => '∈', + 'it' => 'â¢', + 'Itilde' => 'Ĩ', + 'itilde' => 'Ä©', + 'Iukcy' => 'І', + 'iukcy' => 'Ñ–', + 'Iuml' => 'Ã', + 'Ium' => 'Ã', + 'iuml' => 'ï', + 'ium' => 'ï', + 'Jcirc' => 'Ä´', + 'jcirc' => 'ĵ', + 'Jcy' => 'Й', + 'jcy' => 'й', + 'Jfr' => 'ð”', + 'jfr' => 'ð”§', + 'jmath' => 'È·', + 'Jopf' => 'ð•', + 'jopf' => 'ð•›', + 'Jscr' => 'ð’¥', + 'jscr' => 'ð’¿', + 'Jsercy' => 'Ј', + 'jsercy' => 'ј', + 'Jukcy' => 'Є', + 'jukcy' => 'Ñ”', + 'Kappa' => 'Κ', + 'kappa' => 'κ', + 'kappav' => 'Ï°', + 'Kcedil' => 'Ķ', + 'kcedil' => 'Ä·', + 'Kcy' => 'К', + 'kcy' => 'к', + 'Kfr' => 'ð”Ž', + 'kfr' => 'ð”¨', + 'kgreen' => 'ĸ', + 'KHcy' => 'Ð¥', + 'khcy' => 'Ñ…', + 'KJcy' => 'ÐŒ', + 'kjcy' => 'Ñœ', + 'Kopf' => 'ð•‚', + 'kopf' => 'ð•œ', + 'Kscr' => 'ð’¦', + 'kscr' => 'ð“€', + 'lAarr' => '⇚', + 'Lacute' => 'Ĺ', + 'lacute' => 'ĺ', + 'laemptyv' => '⦴', + 'lagran' => 'â„’', + 'Lambda' => 'Λ', + 'lambda' => 'λ', + 'Lang' => '⟪', + 'lang' => '⟨', + 'langd' => '⦑', + 'langle' => '⟨', + 'lap' => '⪅', + 'Laplacetrf' => 'â„’', + 'laquo' => '«', + 'laqu' => '«', + 'Larr' => '↞', + 'lArr' => 'â‡', + 'larr' => 'â†', + 'larrb' => '⇤', + 'larrbfs' => '⤟', + 'larrfs' => 'â¤', + 'larrhk' => '↩', + 'larrlp' => '↫', + 'larrpl' => '⤹', + 'larrsim' => '⥳', + 'larrtl' => '↢', + 'lat' => '⪫', + 'lAtail' => '⤛', + 'latail' => '⤙', + 'late' => '⪭', + 'lates' => '⪭︀', + 'lBarr' => '⤎', + 'lbarr' => '⤌', + 'lbbrk' => 'â²', + 'lbrace' => '{', + 'lbrack' => '[', + 'lbrke' => '⦋', + 'lbrksld' => 'â¦', + 'lbrkslu' => 'â¦', + 'Lcaron' => 'Ľ', + 'lcaron' => 'ľ', + 'Lcedil' => 'Ä»', + 'lcedil' => 'ļ', + 'lceil' => '⌈', + 'lcub' => '{', + 'Lcy' => 'Л', + 'lcy' => 'л', + 'ldca' => '⤶', + 'ldquo' => '“', + 'ldquor' => '„', + 'ldrdhar' 
=> '⥧', + 'ldrushar' => '⥋', + 'ldsh' => '↲', + 'lE' => '≦', + 'le' => '≤', + 'LeftAngleBracket' => '⟨', + 'LeftArrow' => 'â†', + 'Leftarrow' => 'â‡', + 'leftarrow' => 'â†', + 'LeftArrowBar' => '⇤', + 'LeftArrowRightArrow' => '⇆', + 'leftarrowtail' => '↢', + 'LeftCeiling' => '⌈', + 'LeftDoubleBracket' => '⟦', + 'LeftDownTeeVector' => '⥡', + 'LeftDownVector' => '⇃', + 'LeftDownVectorBar' => '⥙', + 'LeftFloor' => '⌊', + 'leftharpoondown' => '↽', + 'leftharpoonup' => '↼', + 'leftleftarrows' => '⇇', + 'LeftRightArrow' => '↔', + 'Leftrightarrow' => '⇔', + 'leftrightarrow' => '↔', + 'leftrightarrows' => '⇆', + 'leftrightharpoons' => '⇋', + 'leftrightsquigarrow' => '↭', + 'LeftRightVector' => '⥎', + 'LeftTee' => '⊣', + 'LeftTeeArrow' => '↤', + 'LeftTeeVector' => '⥚', + 'leftthreetimes' => 'â‹‹', + 'LeftTriangle' => '⊲', + 'LeftTriangleBar' => 'â§', + 'LeftTriangleEqual' => '⊴', + 'LeftUpDownVector' => '⥑', + 'LeftUpTeeVector' => '⥠', + 'LeftUpVector' => '↿', + 'LeftUpVectorBar' => '⥘', + 'LeftVector' => '↼', + 'LeftVectorBar' => '⥒', + 'lEg' => '⪋', + 'leg' => 'â‹š', + 'leq' => '≤', + 'leqq' => '≦', + 'leqslant' => '⩽', + 'les' => '⩽', + 'lescc' => '⪨', + 'lesdot' => 'â©¿', + 'lesdoto' => 'âª', + 'lesdotor' => '⪃', + 'lesg' => '⋚︀', + 'lesges' => '⪓', + 'lessapprox' => '⪅', + 'lessdot' => 'â‹–', + 'lesseqgtr' => 'â‹š', + 'lesseqqgtr' => '⪋', + 'LessEqualGreater' => 'â‹š', + 'LessFullEqual' => '≦', + 'LessGreater' => '≶', + 'lessgtr' => '≶', + 'LessLess' => '⪡', + 'lesssim' => '≲', + 'LessSlantEqual' => '⩽', + 'LessTilde' => '≲', + 'lfisht' => '⥼', + 'lfloor' => '⌊', + 'Lfr' => 'ð”', + 'lfr' => 'ð”©', + 'lg' => '≶', + 'lgE' => '⪑', + 'lHar' => '⥢', + 'lhard' => '↽', + 'lharu' => '↼', + 'lharul' => '⥪', + 'lhblk' => 'â–„', + 'LJcy' => 'Љ', + 'ljcy' => 'Ñ™', + 'Ll' => '⋘', + 'll' => '≪', + 'llarr' => '⇇', + 'llcorner' => '⌞', + 'Lleftarrow' => '⇚', + 'llhard' => '⥫', + 'lltri' => 'â—º', + 'Lmidot' => 'Ä¿', + 'lmidot' => 'Å€', + 'lmoust' => '⎰', + 'lmoustache' => '⎰', + 
'lnap' => '⪉', + 'lnapprox' => '⪉', + 'lnE' => '≨', + 'lne' => '⪇', + 'lneq' => '⪇', + 'lneqq' => '≨', + 'lnsim' => '⋦', + 'loang' => '⟬', + 'loarr' => '⇽', + 'lobrk' => '⟦', + 'LongLeftArrow' => '⟵', + 'Longleftarrow' => '⟸', + 'longleftarrow' => '⟵', + 'LongLeftRightArrow' => '⟷', + 'Longleftrightarrow' => '⟺', + 'longleftrightarrow' => '⟷', + 'longmapsto' => '⟼', + 'LongRightArrow' => '⟶', + 'Longrightarrow' => '⟹', + 'longrightarrow' => '⟶', + 'looparrowleft' => '↫', + 'looparrowright' => '↬', + 'lopar' => '⦅', + 'Lopf' => 'ð•ƒ', + 'lopf' => 'ð•', + 'loplus' => '⨭', + 'lotimes' => '⨴', + 'lowast' => '∗', + 'lowbar' => '_', + 'LowerLeftArrow' => '↙', + 'LowerRightArrow' => '↘', + 'loz' => 'â—Š', + 'lozenge' => 'â—Š', + 'lozf' => '⧫', + 'lpar' => '(', + 'lparlt' => '⦓', + 'lrarr' => '⇆', + 'lrcorner' => '⌟', + 'lrhar' => '⇋', + 'lrhard' => '⥭', + 'lrm' => '‎', + 'lrtri' => '⊿', + 'lsaquo' => '‹', + 'Lscr' => 'â„’', + 'lscr' => 'ð“', + 'Lsh' => '↰', + 'lsh' => '↰', + 'lsim' => '≲', + 'lsime' => 'âª', + 'lsimg' => 'âª', + 'lsqb' => '[', + 'lsquo' => '‘', + 'lsquor' => '‚', + 'Lstrok' => 'Å', + 'lstrok' => 'Å‚', + 'LT' => '<', + 'L' => '<', + 'Lt' => '≪', + 'lt' => '<', + 'l' => '<', + 'ltcc' => '⪦', + 'ltcir' => '⩹', + 'ltdot' => 'â‹–', + 'lthree' => 'â‹‹', + 'ltimes' => '⋉', + 'ltlarr' => '⥶', + 'ltquest' => 'â©»', + 'ltri' => 'â—ƒ', + 'ltrie' => '⊴', + 'ltrif' => 'â—‚', + 'ltrPar' => '⦖', + 'lurdshar' => '⥊', + 'luruhar' => '⥦', + 'lvertneqq' => '≨︀', + 'lvnE' => '≨︀', + 'macr' => '¯', + 'mac' => '¯', + 'male' => '♂', + 'malt' => '✠', + 'maltese' => '✠', + 'Map' => '⤅', + 'map' => '↦', + 'mapsto' => '↦', + 'mapstodown' => '↧', + 'mapstoleft' => '↤', + 'mapstoup' => '↥', + 'marker' => 'â–®', + 'mcomma' => '⨩', + 'Mcy' => 'Ðœ', + 'mcy' => 'м', + 'mdash' => '—', + 'mDDot' => '∺', + 'measuredangle' => '∡', + 'MediumSpace' => 'âŸ', + 'Mellintrf' => 'ℳ', + 'Mfr' => 'ð”', + 'mfr' => 'ð”ª', + 'mho' => '℧', + 'micro' => 'µ', + 'micr' => 'µ', + 'mid' => '∣', + 'midast' => 
'*', + 'midcir' => 'â«°', + 'middot' => '·', + 'middo' => '·', + 'minus' => '−', + 'minusb' => '⊟', + 'minusd' => '∸', + 'minusdu' => '⨪', + 'MinusPlus' => '∓', + 'mlcp' => 'â«›', + 'mldr' => '…', + 'mnplus' => '∓', + 'models' => '⊧', + 'Mopf' => 'ð•„', + 'mopf' => 'ð•ž', + 'mp' => '∓', + 'Mscr' => 'ℳ', + 'mscr' => 'ð“‚', + 'mstpos' => '∾', + 'Mu' => 'Îœ', + 'mu' => 'μ', + 'multimap' => '⊸', + 'mumap' => '⊸', + 'nabla' => '∇', + 'Nacute' => 'Ń', + 'nacute' => 'Å„', + 'nang' => '∠⃒', + 'nap' => '≉', + 'napE' => '⩰̸', + 'napid' => '≋̸', + 'napos' => 'ʼn', + 'napprox' => '≉', + 'natur' => 'â™®', + 'natural' => 'â™®', + 'naturals' => 'â„•', + 'nbsp' => ' ', + 'nbs' => ' ', + 'nbump' => '≎̸', + 'nbumpe' => 'â‰Ì¸', + 'ncap' => '⩃', + 'Ncaron' => 'Ň', + 'ncaron' => 'ň', + 'Ncedil' => 'Å…', + 'ncedil' => 'ņ', + 'ncong' => '≇', + 'ncongdot' => '⩭̸', + 'ncup' => 'â©‚', + 'Ncy' => 'Ð', + 'ncy' => 'н', + 'ndash' => '–', + 'ne' => '≠', + 'nearhk' => '⤤', + 'neArr' => '⇗', + 'nearr' => '↗', + 'nearrow' => '↗', + 'nedot' => 'â‰Ì¸', + 'NegativeMediumSpace' => '​', + 'NegativeThickSpace' => '​', + 'NegativeThinSpace' => '​', + 'NegativeVeryThinSpace' => '​', + 'nequiv' => '≢', + 'nesear' => '⤨', + 'nesim' => '≂̸', + 'NestedGreaterGreater' => '≫', + 'NestedLessLess' => '≪', + 'NewLine' => ' ', - 'nexist' => '∄', - 'nexists' => '∄', - 'Nfr' => 'ð”‘', - 'nfr' => 'ð”«', - 'ngE' => '≧̸', - 'nge' => '≱', - 'ngeq' => '≱', - 'ngeqq' => '≧̸', - 'ngeqslant' => '⩾̸', - 'nges' => '⩾̸', - 'nGg' => '⋙̸', - 'ngsim' => '≵', - 'nGt' => '≫⃒', - 'ngt' => '≯', - 'ngtr' => '≯', - 'nGtv' => '≫̸', - 'nhArr' => '⇎', - 'nharr' => '↮', - 'nhpar' => '⫲', - 'ni' => '∋', - 'nis' => '⋼', - 'nisd' => '⋺', - 'niv' => '∋', - 'NJcy' => 'Њ', - 'njcy' => 'Ñš', - 'nlArr' => 'â‡', - 'nlarr' => '↚', - 'nldr' => '‥', - 'nlE' => '≦̸', - 'nle' => '≰', - 'nLeftarrow' => 'â‡', - 'nleftarrow' => '↚', - 'nLeftrightarrow' => '⇎', - 'nleftrightarrow' => '↮', - 'nleq' => '≰', - 'nleqq' => '≦̸', - 'nleqslant' => '⩽̸', - 'nles' => 
'⩽̸', - 'nless' => '≮', - 'nLl' => '⋘̸', - 'nlsim' => '≴', - 'nLt' => '≪⃒', - 'nlt' => '≮', - 'nltri' => '⋪', - 'nltrie' => '⋬', - 'nLtv' => '≪̸', - 'nmid' => '∤', - 'NoBreak' => 'â ', - 'NonBreakingSpace' => ' ', - 'Nopf' => 'â„•', - 'nopf' => 'ð•Ÿ', - 'Not' => '⫬', - 'not' => '¬', - 'no' => '¬', - 'NotCongruent' => '≢', - 'NotCupCap' => '≭', - 'NotDoubleVerticalBar' => '∦', - 'NotElement' => '∉', - 'NotEqual' => '≠', - 'NotEqualTilde' => '≂̸', - 'NotExists' => '∄', - 'NotGreater' => '≯', - 'NotGreaterEqual' => '≱', - 'NotGreaterFullEqual' => '≧̸', - 'NotGreaterGreater' => '≫̸', - 'NotGreaterLess' => '≹', - 'NotGreaterSlantEqual' => '⩾̸', - 'NotGreaterTilde' => '≵', - 'NotHumpDownHump' => '≎̸', - 'NotHumpEqual' => 'â‰Ì¸', - 'notin' => '∉', - 'notindot' => '⋵̸', - 'notinE' => '⋹̸', - 'notinva' => '∉', - 'notinvb' => 'â‹·', - 'notinvc' => '⋶', - 'NotLeftTriangle' => '⋪', - 'NotLeftTriangleBar' => 'â§Ì¸', - 'NotLeftTriangleEqual' => '⋬', - 'NotLess' => '≮', - 'NotLessEqual' => '≰', - 'NotLessGreater' => '≸', - 'NotLessLess' => '≪̸', - 'NotLessSlantEqual' => '⩽̸', - 'NotLessTilde' => '≴', - 'NotNestedGreaterGreater' => '⪢̸', - 'NotNestedLessLess' => '⪡̸', - 'notni' => '∌', - 'notniva' => '∌', - 'notnivb' => '⋾', - 'notnivc' => '⋽', - 'NotPrecedes' => '⊀', - 'NotPrecedesEqual' => '⪯̸', - 'NotPrecedesSlantEqual' => 'â‹ ', - 'NotReverseElement' => '∌', - 'NotRightTriangle' => 'â‹«', - 'NotRightTriangleBar' => 'â§Ì¸', - 'NotRightTriangleEqual' => 'â‹­', - 'NotSquareSubset' => 'âŠÌ¸', - 'NotSquareSubsetEqual' => 'â‹¢', - 'NotSquareSuperset' => 'âŠÌ¸', - 'NotSquareSupersetEqual' => 'â‹£', - 'NotSubset' => '⊂⃒', - 'NotSubsetEqual' => '⊈', - 'NotSucceeds' => 'âŠ', - 'NotSucceedsEqual' => '⪰̸', - 'NotSucceedsSlantEqual' => 'â‹¡', - 'NotSucceedsTilde' => '≿̸', - 'NotSuperset' => '⊃⃒', - 'NotSupersetEqual' => '⊉', - 'NotTilde' => 'â‰', - 'NotTildeEqual' => '≄', - 'NotTildeFullEqual' => '≇', - 'NotTildeTilde' => '≉', - 'NotVerticalBar' => '∤', - 'npar' => '∦', - 'nparallel' => 
'∦', - 'nparsl' => '⫽⃥', - 'npart' => '∂̸', - 'npolint' => '⨔', - 'npr' => '⊀', - 'nprcue' => 'â‹ ', - 'npre' => '⪯̸', - 'nprec' => '⊀', - 'npreceq' => '⪯̸', - 'nrArr' => 'â‡', - 'nrarr' => '↛', - 'nrarrc' => '⤳̸', - 'nrarrw' => 'â†Ì¸', - 'nRightarrow' => 'â‡', - 'nrightarrow' => '↛', - 'nrtri' => 'â‹«', - 'nrtrie' => 'â‹­', - 'nsc' => 'âŠ', - 'nsccue' => 'â‹¡', - 'nsce' => '⪰̸', - 'Nscr' => 'ð’©', - 'nscr' => 'ð“ƒ', - 'nshortmid' => '∤', - 'nshortparallel' => '∦', - 'nsim' => 'â‰', - 'nsime' => '≄', - 'nsimeq' => '≄', - 'nsmid' => '∤', - 'nspar' => '∦', - 'nsqsube' => 'â‹¢', - 'nsqsupe' => 'â‹£', - 'nsub' => '⊄', - 'nsubE' => '⫅̸', - 'nsube' => '⊈', - 'nsubset' => '⊂⃒', - 'nsubseteq' => '⊈', - 'nsubseteqq' => '⫅̸', - 'nsucc' => 'âŠ', - 'nsucceq' => '⪰̸', - 'nsup' => '⊅', - 'nsupE' => '⫆̸', - 'nsupe' => '⊉', - 'nsupset' => '⊃⃒', - 'nsupseteq' => '⊉', - 'nsupseteqq' => '⫆̸', - 'ntgl' => '≹', - 'Ntilde' => 'Ñ', - 'Ntild' => 'Ñ', - 'ntilde' => 'ñ', - 'ntild' => 'ñ', - 'ntlg' => '≸', - 'ntriangleleft' => '⋪', - 'ntrianglelefteq' => '⋬', - 'ntriangleright' => 'â‹«', - 'ntrianglerighteq' => 'â‹­', - 'Nu' => 'Î', - 'nu' => 'ν', - 'num' => '#', - 'numero' => 'â„–', - 'numsp' => ' ', - 'nvap' => 'â‰âƒ’', - 'nVDash' => '⊯', - 'nVdash' => '⊮', - 'nvDash' => '⊭', - 'nvdash' => '⊬', - 'nvge' => '≥⃒', - 'nvgt' => '>⃒', - 'nvHarr' => '⤄', - 'nvinfin' => '⧞', - 'nvlArr' => '⤂', - 'nvle' => '≤⃒', - 'nvlt' => '<⃒', - 'nvltrie' => '⊴⃒', - 'nvrArr' => '⤃', - 'nvrtrie' => '⊵⃒', - 'nvsim' => '∼⃒', - 'nwarhk' => '⤣', - 'nwArr' => '⇖', - 'nwarr' => '↖', - 'nwarrow' => '↖', - 'nwnear' => '⤧', - 'Oacute' => 'Ó', - 'Oacut' => 'Ó', - 'oacute' => 'ó', - 'oacut' => 'ó', - 'oast' => '⊛', - 'ocir' => 'ô', - 'Ocirc' => 'Ô', - 'Ocir' => 'Ô', - 'ocirc' => 'ô', - 'Ocy' => 'О', - 'ocy' => 'о', - 'odash' => 'âŠ', - 'Odblac' => 'Å', - 'odblac' => 'Å‘', - 'odiv' => '⨸', - 'odot' => '⊙', - 'odsold' => '⦼', - 'OElig' => 'Å’', - 'oelig' => 'Å“', - 'ofcir' => '⦿', - 'Ofr' => 'ð”’', - 'ofr' => 'ð”¬', - 'ogon' 
=> 'Ë›', - 'Ograve' => 'Ã’', - 'Ograv' => 'Ã’', - 'ograve' => 'ò', - 'ograv' => 'ò', - 'ogt' => 'â§', - 'ohbar' => '⦵', - 'ohm' => 'Ω', - 'oint' => '∮', - 'olarr' => '↺', - 'olcir' => '⦾', - 'olcross' => '⦻', - 'oline' => '‾', - 'olt' => '⧀', - 'Omacr' => 'ÅŒ', - 'omacr' => 'Å', - 'Omega' => 'Ω', - 'omega' => 'ω', - 'Omicron' => 'Ο', - 'omicron' => 'ο', - 'omid' => '⦶', - 'ominus' => '⊖', - 'Oopf' => 'ð•†', - 'oopf' => 'ð• ', - 'opar' => '⦷', - 'OpenCurlyDoubleQuote' => '“', - 'OpenCurlyQuote' => '‘', - 'operp' => '⦹', - 'oplus' => '⊕', - 'Or' => 'â©”', - 'or' => '∨', - 'orarr' => '↻', - 'ord' => 'º', - 'order' => 'â„´', - 'orderof' => 'â„´', - 'ordf' => 'ª', - 'ordm' => 'º', - 'origof' => '⊶', - 'oror' => 'â©–', - 'orslope' => 'â©—', - 'orv' => 'â©›', - 'oS' => 'Ⓢ', - 'Oscr' => 'ð’ª', - 'oscr' => 'â„´', - 'Oslash' => 'Ø', - 'Oslas' => 'Ø', - 'oslash' => 'ø', - 'oslas' => 'ø', - 'osol' => '⊘', - 'Otilde' => 'Õ', - 'Otild' => 'Õ', - 'otilde' => 'õ', - 'otild' => 'õ', - 'Otimes' => '⨷', - 'otimes' => '⊗', - 'otimesas' => '⨶', - 'Ouml' => 'Ö', - 'Oum' => 'Ö', - 'ouml' => 'ö', - 'oum' => 'ö', - 'ovbar' => '⌽', - 'OverBar' => '‾', - 'OverBrace' => 'âž', - 'OverBracket' => '⎴', - 'OverParenthesis' => 'âœ', - 'par' => '¶', - 'para' => '¶', - 'parallel' => '∥', - 'parsim' => '⫳', - 'parsl' => '⫽', - 'part' => '∂', - 'PartialD' => '∂', - 'Pcy' => 'П', - 'pcy' => 'п', - 'percnt' => '%', - 'period' => '.', - 'permil' => '‰', - 'perp' => '⊥', - 'pertenk' => '‱', - 'Pfr' => 'ð”“', - 'pfr' => 'ð”­', - 'Phi' => 'Φ', - 'phi' => 'φ', - 'phiv' => 'Ï•', - 'phmmat' => 'ℳ', - 'phone' => '☎', - 'Pi' => 'Π', - 'pi' => 'Ï€', - 'pitchfork' => 'â‹”', - 'piv' => 'Ï–', - 'planck' => 'â„', - 'planckh' => 'â„Ž', - 'plankv' => 'â„', - 'plus' => '+', - 'plusacir' => '⨣', - 'plusb' => '⊞', - 'pluscir' => '⨢', - 'plusdo' => '∔', - 'plusdu' => '⨥', - 'pluse' => '⩲', - 'PlusMinus' => '±', - 'plusmn' => '±', - 'plusm' => '±', - 'plussim' => '⨦', - 'plustwo' => '⨧', - 'pm' => '±', - 'Poincareplane' => 
'â„Œ', - 'pointint' => '⨕', - 'Popf' => 'â„™', - 'popf' => 'ð•¡', - 'pound' => '£', - 'poun' => '£', - 'Pr' => '⪻', - 'pr' => '≺', - 'prap' => '⪷', - 'prcue' => '≼', - 'prE' => '⪳', - 'pre' => '⪯', - 'prec' => '≺', - 'precapprox' => '⪷', - 'preccurlyeq' => '≼', - 'Precedes' => '≺', - 'PrecedesEqual' => '⪯', - 'PrecedesSlantEqual' => '≼', - 'PrecedesTilde' => '≾', - 'preceq' => '⪯', - 'precnapprox' => '⪹', - 'precneqq' => '⪵', - 'precnsim' => '⋨', - 'precsim' => '≾', - 'Prime' => '″', - 'prime' => '′', - 'primes' => 'â„™', - 'prnap' => '⪹', - 'prnE' => '⪵', - 'prnsim' => '⋨', - 'prod' => 'âˆ', - 'Product' => 'âˆ', - 'profalar' => '⌮', - 'profline' => '⌒', - 'profsurf' => '⌓', - 'prop' => 'âˆ', - 'Proportion' => '∷', - 'Proportional' => 'âˆ', - 'propto' => 'âˆ', - 'prsim' => '≾', - 'prurel' => '⊰', - 'Pscr' => 'ð’«', - 'pscr' => 'ð“…', - 'Psi' => 'Ψ', - 'psi' => 'ψ', - 'puncsp' => ' ', - 'Qfr' => 'ð””', - 'qfr' => 'ð”®', - 'qint' => '⨌', - 'Qopf' => 'â„š', - 'qopf' => 'ð•¢', - 'qprime' => 'â—', - 'Qscr' => 'ð’¬', - 'qscr' => 'ð“†', - 'quaternions' => 'â„', - 'quatint' => '⨖', - 'quest' => '?', - 'questeq' => '≟', - 'QUOT' => '"', - 'QUO' => '"', - 'quot' => '"', - 'quo' => '"', - 'rAarr' => '⇛', - 'race' => '∽̱', - 'Racute' => 'Å”', - 'racute' => 'Å•', - 'radic' => '√', - 'raemptyv' => '⦳', - 'Rang' => '⟫', - 'rang' => '⟩', - 'rangd' => '⦒', - 'range' => '⦥', - 'rangle' => '⟩', - 'raquo' => '»', - 'raqu' => '»', - 'Rarr' => '↠', - 'rArr' => '⇒', - 'rarr' => '→', - 'rarrap' => '⥵', - 'rarrb' => '⇥', - 'rarrbfs' => '⤠', - 'rarrc' => '⤳', - 'rarrfs' => '⤞', - 'rarrhk' => '↪', - 'rarrlp' => '↬', - 'rarrpl' => '⥅', - 'rarrsim' => '⥴', - 'Rarrtl' => '⤖', - 'rarrtl' => '↣', - 'rarrw' => 'â†', - 'rAtail' => '⤜', - 'ratail' => '⤚', - 'ratio' => '∶', - 'rationals' => 'â„š', - 'RBarr' => 'â¤', - 'rBarr' => 'â¤', - 'rbarr' => 'â¤', - 'rbbrk' => 'â³', - 'rbrace' => '}', - 'rbrack' => ']', - 'rbrke' => '⦌', - 'rbrksld' => '⦎', - 'rbrkslu' => 'â¦', - 'Rcaron' => 'Ř', - 'rcaron' => 
'Å™', - 'Rcedil' => 'Å–', - 'rcedil' => 'Å—', - 'rceil' => '⌉', - 'rcub' => '}', - 'Rcy' => 'Р', - 'rcy' => 'Ñ€', - 'rdca' => '⤷', - 'rdldhar' => '⥩', - 'rdquo' => 'â€', - 'rdquor' => 'â€', - 'rdsh' => '↳', - 'Re' => 'â„œ', - 'real' => 'â„œ', - 'realine' => 'â„›', - 'realpart' => 'â„œ', - 'reals' => 'â„', - 'rect' => 'â–­', - 'REG' => '®', - 'RE' => '®', - 'reg' => '®', - 're' => '®', - 'ReverseElement' => '∋', - 'ReverseEquilibrium' => '⇋', - 'ReverseUpEquilibrium' => '⥯', - 'rfisht' => '⥽', - 'rfloor' => '⌋', - 'Rfr' => 'â„œ', - 'rfr' => 'ð”¯', - 'rHar' => '⥤', - 'rhard' => 'â‡', - 'rharu' => '⇀', - 'rharul' => '⥬', - 'Rho' => 'Ρ', - 'rho' => 'Ï', - 'rhov' => 'ϱ', - 'RightAngleBracket' => '⟩', - 'RightArrow' => '→', - 'Rightarrow' => '⇒', - 'rightarrow' => '→', - 'RightArrowBar' => '⇥', - 'RightArrowLeftArrow' => '⇄', - 'rightarrowtail' => '↣', - 'RightCeiling' => '⌉', - 'RightDoubleBracket' => '⟧', - 'RightDownTeeVector' => 'â¥', - 'RightDownVector' => '⇂', - 'RightDownVectorBar' => '⥕', - 'RightFloor' => '⌋', - 'rightharpoondown' => 'â‡', - 'rightharpoonup' => '⇀', - 'rightleftarrows' => '⇄', - 'rightleftharpoons' => '⇌', - 'rightrightarrows' => '⇉', - 'rightsquigarrow' => 'â†', - 'RightTee' => '⊢', - 'RightTeeArrow' => '↦', - 'RightTeeVector' => '⥛', - 'rightthreetimes' => 'â‹Œ', - 'RightTriangle' => '⊳', - 'RightTriangleBar' => 'â§', - 'RightTriangleEqual' => '⊵', - 'RightUpDownVector' => 'â¥', - 'RightUpTeeVector' => '⥜', - 'RightUpVector' => '↾', - 'RightUpVectorBar' => '⥔', - 'RightVector' => '⇀', - 'RightVectorBar' => '⥓', - 'ring' => 'Ëš', - 'risingdotseq' => '≓', - 'rlarr' => '⇄', - 'rlhar' => '⇌', - 'rlm' => 'â€', - 'rmoust' => '⎱', - 'rmoustache' => '⎱', - 'rnmid' => 'â«®', - 'roang' => '⟭', - 'roarr' => '⇾', - 'robrk' => '⟧', - 'ropar' => '⦆', - 'Ropf' => 'â„', - 'ropf' => 'ð•£', - 'roplus' => '⨮', - 'rotimes' => '⨵', - 'RoundImplies' => '⥰', - 'rpar' => ')', - 'rpargt' => '⦔', - 'rppolint' => '⨒', - 'rrarr' => '⇉', - 'Rrightarrow' => '⇛', - 'rsaquo' 
=> '›', - 'Rscr' => 'â„›', - 'rscr' => 'ð“‡', - 'Rsh' => '↱', - 'rsh' => '↱', - 'rsqb' => ']', - 'rsquo' => '’', - 'rsquor' => '’', - 'rthree' => 'â‹Œ', - 'rtimes' => 'â‹Š', - 'rtri' => 'â–¹', - 'rtrie' => '⊵', - 'rtrif' => 'â–¸', - 'rtriltri' => '⧎', - 'RuleDelayed' => '⧴', - 'ruluhar' => '⥨', - 'rx' => 'â„ž', - 'Sacute' => 'Åš', - 'sacute' => 'Å›', - 'sbquo' => '‚', - 'Sc' => '⪼', - 'sc' => '≻', - 'scap' => '⪸', - 'Scaron' => 'Å ', - 'scaron' => 'Å¡', - 'sccue' => '≽', - 'scE' => '⪴', - 'sce' => '⪰', - 'Scedil' => 'Åž', - 'scedil' => 'ÅŸ', - 'Scirc' => 'Åœ', - 'scirc' => 'Å', - 'scnap' => '⪺', - 'scnE' => '⪶', - 'scnsim' => 'â‹©', - 'scpolint' => '⨓', - 'scsim' => '≿', - 'Scy' => 'С', - 'scy' => 'Ñ', - 'sdot' => 'â‹…', - 'sdotb' => '⊡', - 'sdote' => '⩦', - 'searhk' => '⤥', - 'seArr' => '⇘', - 'searr' => '↘', - 'searrow' => '↘', - 'sect' => '§', - 'sec' => '§', - 'semi' => ';', - 'seswar' => '⤩', - 'setminus' => '∖', - 'setmn' => '∖', - 'sext' => '✶', - 'Sfr' => 'ð”–', - 'sfr' => 'ð”°', - 'sfrown' => '⌢', - 'sharp' => '♯', - 'SHCHcy' => 'Щ', - 'shchcy' => 'щ', - 'SHcy' => 'Ш', - 'shcy' => 'ш', - 'ShortDownArrow' => '↓', - 'ShortLeftArrow' => 'â†', - 'shortmid' => '∣', - 'shortparallel' => '∥', - 'ShortRightArrow' => '→', - 'ShortUpArrow' => '↑', - 'shy' => '­', - 'sh' => '­', - 'Sigma' => 'Σ', - 'sigma' => 'σ', - 'sigmaf' => 'Ï‚', - 'sigmav' => 'Ï‚', - 'sim' => '∼', - 'simdot' => '⩪', - 'sime' => '≃', - 'simeq' => '≃', - 'simg' => '⪞', - 'simgE' => '⪠', - 'siml' => 'âª', - 'simlE' => '⪟', - 'simne' => '≆', - 'simplus' => '⨤', - 'simrarr' => '⥲', - 'slarr' => 'â†', - 'SmallCircle' => '∘', - 'smallsetminus' => '∖', - 'smashp' => '⨳', - 'smeparsl' => '⧤', - 'smid' => '∣', - 'smile' => '⌣', - 'smt' => '⪪', - 'smte' => '⪬', - 'smtes' => '⪬︀', - 'SOFTcy' => 'Ь', - 'softcy' => 'ÑŒ', - 'sol' => '/', - 'solb' => '⧄', - 'solbar' => '⌿', - 'Sopf' => 'ð•Š', - 'sopf' => 'ð•¤', - 'spades' => 'â™ ', - 'spadesuit' => 'â™ ', - 'spar' => '∥', - 'sqcap' => '⊓', - 'sqcaps' => '⊓︀', - 
'sqcup' => '⊔', - 'sqcups' => '⊔︀', - 'Sqrt' => '√', - 'sqsub' => 'âŠ', - 'sqsube' => '⊑', - 'sqsubset' => 'âŠ', - 'sqsubseteq' => '⊑', - 'sqsup' => 'âŠ', - 'sqsupe' => '⊒', - 'sqsupset' => 'âŠ', - 'sqsupseteq' => '⊒', - 'squ' => 'â–¡', - 'Square' => 'â–¡', - 'square' => 'â–¡', - 'SquareIntersection' => '⊓', - 'SquareSubset' => 'âŠ', - 'SquareSubsetEqual' => '⊑', - 'SquareSuperset' => 'âŠ', - 'SquareSupersetEqual' => '⊒', - 'SquareUnion' => '⊔', - 'squarf' => 'â–ª', - 'squf' => 'â–ª', - 'srarr' => '→', - 'Sscr' => 'ð’®', - 'sscr' => 'ð“ˆ', - 'ssetmn' => '∖', - 'ssmile' => '⌣', - 'sstarf' => '⋆', - 'Star' => '⋆', - 'star' => '☆', - 'starf' => '★', - 'straightepsilon' => 'ϵ', - 'straightphi' => 'Ï•', - 'strns' => '¯', - 'Sub' => 'â‹', - 'sub' => '⊂', - 'subdot' => '⪽', - 'subE' => 'â«…', - 'sube' => '⊆', - 'subedot' => '⫃', - 'submult' => 'â«', - 'subnE' => 'â«‹', - 'subne' => '⊊', - 'subplus' => '⪿', - 'subrarr' => '⥹', - 'Subset' => 'â‹', - 'subset' => '⊂', - 'subseteq' => '⊆', - 'subseteqq' => 'â«…', - 'SubsetEqual' => '⊆', - 'subsetneq' => '⊊', - 'subsetneqq' => 'â«‹', - 'subsim' => '⫇', - 'subsub' => 'â«•', - 'subsup' => 'â«“', - 'succ' => '≻', - 'succapprox' => '⪸', - 'succcurlyeq' => '≽', - 'Succeeds' => '≻', - 'SucceedsEqual' => '⪰', - 'SucceedsSlantEqual' => '≽', - 'SucceedsTilde' => '≿', - 'succeq' => '⪰', - 'succnapprox' => '⪺', - 'succneqq' => '⪶', - 'succnsim' => 'â‹©', - 'succsim' => '≿', - 'SuchThat' => '∋', - 'Sum' => '∑', - 'sum' => '∑', - 'sung' => '♪', - 'Sup' => 'â‹‘', - 'sup' => '³', - 'sup1' => '¹', - 'sup2' => '²', - 'sup3' => '³', - 'supdot' => '⪾', - 'supdsub' => '⫘', - 'supE' => '⫆', - 'supe' => '⊇', - 'supedot' => 'â«„', - 'Superset' => '⊃', - 'SupersetEqual' => '⊇', - 'suphsol' => '⟉', - 'suphsub' => 'â«—', - 'suplarr' => '⥻', - 'supmult' => 'â«‚', - 'supnE' => 'â«Œ', - 'supne' => '⊋', - 'supplus' => 'â«€', - 'Supset' => 'â‹‘', - 'supset' => '⊃', - 'supseteq' => '⊇', - 'supseteqq' => '⫆', - 'supsetneq' => '⊋', - 'supsetneqq' => 'â«Œ', - 
'supsim' => '⫈', - 'supsub' => 'â«”', - 'supsup' => 'â«–', - 'swarhk' => '⤦', - 'swArr' => '⇙', - 'swarr' => '↙', - 'swarrow' => '↙', - 'swnwar' => '⤪', - 'szlig' => 'ß', - 'szli' => 'ß', - 'Tab' => ' ', - 'target' => '⌖', - 'Tau' => 'Τ', - 'tau' => 'Ï„', - 'tbrk' => '⎴', - 'Tcaron' => 'Ť', - 'tcaron' => 'Å¥', - 'Tcedil' => 'Å¢', - 'tcedil' => 'Å£', - 'Tcy' => 'Т', - 'tcy' => 'Ñ‚', - 'tdot' => '⃛', - 'telrec' => '⌕', - 'Tfr' => 'ð”—', - 'tfr' => 'ð”±', - 'there4' => '∴', - 'Therefore' => '∴', - 'therefore' => '∴', - 'Theta' => 'Θ', - 'theta' => 'θ', - 'thetasym' => 'Ï‘', - 'thetav' => 'Ï‘', - 'thickapprox' => '≈', - 'thicksim' => '∼', - 'ThickSpace' => 'âŸâ€Š', - 'thinsp' => ' ', - 'ThinSpace' => ' ', - 'thkap' => '≈', - 'thksim' => '∼', - 'THORN' => 'Þ', - 'THOR' => 'Þ', - 'thorn' => 'þ', - 'thor' => 'þ', - 'Tilde' => '∼', - 'tilde' => 'Ëœ', - 'TildeEqual' => '≃', - 'TildeFullEqual' => '≅', - 'TildeTilde' => '≈', - 'times' => '×', - 'time' => '×', - 'timesb' => '⊠', - 'timesbar' => '⨱', - 'timesd' => '⨰', - 'tint' => '∭', - 'toea' => '⤨', - 'top' => '⊤', - 'topbot' => '⌶', - 'topcir' => '⫱', - 'Topf' => 'ð•‹', - 'topf' => 'ð•¥', - 'topfork' => 'â«š', - 'tosa' => '⤩', - 'tprime' => '‴', - 'TRADE' => 'â„¢', - 'trade' => 'â„¢', - 'triangle' => 'â–µ', - 'triangledown' => 'â–¿', - 'triangleleft' => 'â—ƒ', - 'trianglelefteq' => '⊴', - 'triangleq' => '≜', - 'triangleright' => 'â–¹', - 'trianglerighteq' => '⊵', - 'tridot' => 'â—¬', - 'trie' => '≜', - 'triminus' => '⨺', - 'TripleDot' => '⃛', - 'triplus' => '⨹', - 'trisb' => 'â§', - 'tritime' => '⨻', - 'trpezium' => 'â¢', - 'Tscr' => 'ð’¯', - 'tscr' => 'ð“‰', - 'TScy' => 'Ц', - 'tscy' => 'ц', - 'TSHcy' => 'Ћ', - 'tshcy' => 'Ñ›', - 'Tstrok' => 'Ŧ', - 'tstrok' => 'ŧ', - 'twixt' => '≬', - 'twoheadleftarrow' => '↞', - 'twoheadrightarrow' => '↠', - 'Uacute' => 'Ú', - 'Uacut' => 'Ú', - 'uacute' => 'ú', - 'uacut' => 'ú', - 'Uarr' => '↟', - 'uArr' => '⇑', - 'uarr' => '↑', - 'Uarrocir' => '⥉', - 'Ubrcy' => 'ÐŽ', - 'ubrcy' => 'Ñž', - 
'Ubreve' => 'Ŭ', - 'ubreve' => 'Å­', - 'Ucirc' => 'Û', - 'Ucir' => 'Û', - 'ucirc' => 'û', - 'ucir' => 'û', - 'Ucy' => 'У', - 'ucy' => 'у', - 'udarr' => '⇅', - 'Udblac' => 'Å°', - 'udblac' => 'ű', - 'udhar' => '⥮', - 'ufisht' => '⥾', - 'Ufr' => 'ð”˜', - 'ufr' => 'ð”²', - 'Ugrave' => 'Ù', - 'Ugrav' => 'Ù', - 'ugrave' => 'ù', - 'ugrav' => 'ù', - 'uHar' => '⥣', - 'uharl' => '↿', - 'uharr' => '↾', - 'uhblk' => 'â–€', - 'ulcorn' => '⌜', - 'ulcorner' => '⌜', - 'ulcrop' => 'âŒ', - 'ultri' => 'â—¸', - 'Umacr' => 'Ū', - 'umacr' => 'Å«', - 'uml' => '¨', - 'um' => '¨', - 'UnderBar' => '_', - 'UnderBrace' => 'âŸ', - 'UnderBracket' => '⎵', - 'UnderParenthesis' => 'â', - 'Union' => '⋃', - 'UnionPlus' => '⊎', - 'Uogon' => 'Ų', - 'uogon' => 'ų', - 'Uopf' => 'ð•Œ', - 'uopf' => 'ð•¦', - 'UpArrow' => '↑', - 'Uparrow' => '⇑', - 'uparrow' => '↑', - 'UpArrowBar' => '⤒', - 'UpArrowDownArrow' => '⇅', - 'UpDownArrow' => '↕', - 'Updownarrow' => '⇕', - 'updownarrow' => '↕', - 'UpEquilibrium' => '⥮', - 'upharpoonleft' => '↿', - 'upharpoonright' => '↾', - 'uplus' => '⊎', - 'UpperLeftArrow' => '↖', - 'UpperRightArrow' => '↗', - 'Upsi' => 'Ï’', - 'upsi' => 'Ï…', - 'upsih' => 'Ï’', - 'Upsilon' => 'Î¥', - 'upsilon' => 'Ï…', - 'UpTee' => '⊥', - 'UpTeeArrow' => '↥', - 'upuparrows' => '⇈', - 'urcorn' => 'âŒ', - 'urcorner' => 'âŒ', - 'urcrop' => '⌎', - 'Uring' => 'Å®', - 'uring' => 'ů', - 'urtri' => 'â—¹', - 'Uscr' => 'ð’°', - 'uscr' => 'ð“Š', - 'utdot' => 'â‹°', - 'Utilde' => 'Ũ', - 'utilde' => 'Å©', - 'utri' => 'â–µ', - 'utrif' => 'â–´', - 'uuarr' => '⇈', - 'Uuml' => 'Ãœ', - 'Uum' => 'Ãœ', - 'uuml' => 'ü', - 'uum' => 'ü', - 'uwangle' => '⦧', - 'vangrt' => '⦜', - 'varepsilon' => 'ϵ', - 'varkappa' => 'Ï°', - 'varnothing' => '∅', - 'varphi' => 'Ï•', - 'varpi' => 'Ï–', - 'varpropto' => 'âˆ', - 'vArr' => '⇕', - 'varr' => '↕', - 'varrho' => 'ϱ', - 'varsigma' => 'Ï‚', - 'varsubsetneq' => '⊊︀', - 'varsubsetneqq' => '⫋︀', - 'varsupsetneq' => '⊋︀', - 'varsupsetneqq' => '⫌︀', - 'vartheta' => 'Ï‘', - 
'vartriangleleft' => '⊲', - 'vartriangleright' => '⊳', - 'Vbar' => 'â««', - 'vBar' => '⫨', - 'vBarv' => 'â«©', - 'Vcy' => 'Ð’', - 'vcy' => 'в', - 'VDash' => '⊫', - 'Vdash' => '⊩', - 'vDash' => '⊨', - 'vdash' => '⊢', - 'Vdashl' => '⫦', - 'Vee' => 'â‹', - 'vee' => '∨', - 'veebar' => '⊻', - 'veeeq' => '≚', - 'vellip' => 'â‹®', - 'Verbar' => '‖', - 'verbar' => '|', - 'Vert' => '‖', - 'vert' => '|', - 'VerticalBar' => '∣', - 'VerticalLine' => '|', - 'VerticalSeparator' => 'â˜', - 'VerticalTilde' => '≀', - 'VeryThinSpace' => ' ', - 'Vfr' => 'ð”™', - 'vfr' => 'ð”³', - 'vltri' => '⊲', - 'vnsub' => '⊂⃒', - 'vnsup' => '⊃⃒', - 'Vopf' => 'ð•', - 'vopf' => 'ð•§', - 'vprop' => 'âˆ', - 'vrtri' => '⊳', - 'Vscr' => 'ð’±', - 'vscr' => 'ð“‹', - 'vsubnE' => '⫋︀', - 'vsubne' => '⊊︀', - 'vsupnE' => '⫌︀', - 'vsupne' => '⊋︀', - 'Vvdash' => '⊪', - 'vzigzag' => '⦚', - 'Wcirc' => 'Å´', - 'wcirc' => 'ŵ', - 'wedbar' => 'â©Ÿ', - 'Wedge' => 'â‹€', - 'wedge' => '∧', - 'wedgeq' => '≙', - 'weierp' => '℘', - 'Wfr' => 'ð”š', - 'wfr' => 'ð”´', - 'Wopf' => 'ð•Ž', - 'wopf' => 'ð•¨', - 'wp' => '℘', - 'wr' => '≀', - 'wreath' => '≀', - 'Wscr' => 'ð’²', - 'wscr' => 'ð“Œ', - 'xcap' => 'â‹‚', - 'xcirc' => 'â—¯', - 'xcup' => '⋃', - 'xdtri' => 'â–½', - 'Xfr' => 'ð”›', - 'xfr' => 'ð”µ', - 'xhArr' => '⟺', - 'xharr' => '⟷', - 'Xi' => 'Ξ', - 'xi' => 'ξ', - 'xlArr' => '⟸', - 'xlarr' => '⟵', - 'xmap' => '⟼', - 'xnis' => 'â‹»', - 'xodot' => '⨀', - 'Xopf' => 'ð•', - 'xopf' => 'ð•©', - 'xoplus' => 'â¨', - 'xotime' => '⨂', - 'xrArr' => '⟹', - 'xrarr' => '⟶', - 'Xscr' => 'ð’³', - 'xscr' => 'ð“', - 'xsqcup' => '⨆', - 'xuplus' => '⨄', - 'xutri' => 'â–³', - 'xvee' => 'â‹', - 'xwedge' => 'â‹€', - 'Yacute' => 'Ã', - 'Yacut' => 'Ã', - 'yacute' => 'ý', - 'yacut' => 'ý', - 'YAcy' => 'Я', - 'yacy' => 'Ñ', - 'Ycirc' => 'Ŷ', - 'ycirc' => 'Å·', - 'Ycy' => 'Ы', - 'ycy' => 'Ñ‹', - 'yen' => 'Â¥', - 'ye' => 'Â¥', - 'Yfr' => 'ð”œ', - 'yfr' => 'ð”¶', - 'YIcy' => 'Ї', - 'yicy' => 'Ñ—', - 'Yopf' => 'ð•', - 'yopf' => 'ð•ª', - 'Yscr' => 'ð’´', 
- 'yscr' => 'ð“Ž', - 'YUcy' => 'Ю', - 'yucy' => 'ÑŽ', - 'Yuml' => 'Ÿ', - 'yuml' => 'ÿ', - 'yum' => 'ÿ', - 'Zacute' => 'Ź', - 'zacute' => 'ź', - 'Zcaron' => 'Ž', - 'zcaron' => 'ž', - 'Zcy' => 'З', - 'zcy' => 'з', - 'Zdot' => 'Å»', - 'zdot' => 'ż', - 'zeetrf' => 'ℨ', - 'ZeroWidthSpace' => '​', - 'Zeta' => 'Ζ', - 'zeta' => 'ζ', - 'Zfr' => 'ℨ', - 'zfr' => 'ð”·', - 'ZHcy' => 'Ж', - 'zhcy' => 'ж', - 'zigrarr' => 'â‡', - 'Zopf' => 'ℤ', - 'zopf' => 'ð•«', - 'Zscr' => 'ð’µ', - 'zscr' => 'ð“', - 'zwj' => 'â€', - 'zwnj' => '‌', -); + 'nexist' => '∄', + 'nexists' => '∄', + 'Nfr' => 'ð”‘', + 'nfr' => 'ð”«', + 'ngE' => '≧̸', + 'nge' => '≱', + 'ngeq' => '≱', + 'ngeqq' => '≧̸', + 'ngeqslant' => '⩾̸', + 'nges' => '⩾̸', + 'nGg' => '⋙̸', + 'ngsim' => '≵', + 'nGt' => '≫⃒', + 'ngt' => '≯', + 'ngtr' => '≯', + 'nGtv' => '≫̸', + 'nhArr' => '⇎', + 'nharr' => '↮', + 'nhpar' => '⫲', + 'ni' => '∋', + 'nis' => '⋼', + 'nisd' => '⋺', + 'niv' => '∋', + 'NJcy' => 'Њ', + 'njcy' => 'Ñš', + 'nlArr' => 'â‡', + 'nlarr' => '↚', + 'nldr' => '‥', + 'nlE' => '≦̸', + 'nle' => '≰', + 'nLeftarrow' => 'â‡', + 'nleftarrow' => '↚', + 'nLeftrightarrow' => '⇎', + 'nleftrightarrow' => '↮', + 'nleq' => '≰', + 'nleqq' => '≦̸', + 'nleqslant' => '⩽̸', + 'nles' => '⩽̸', + 'nless' => '≮', + 'nLl' => '⋘̸', + 'nlsim' => '≴', + 'nLt' => '≪⃒', + 'nlt' => '≮', + 'nltri' => '⋪', + 'nltrie' => '⋬', + 'nLtv' => '≪̸', + 'nmid' => '∤', + 'NoBreak' => 'â ', + 'NonBreakingSpace' => ' ', + 'Nopf' => 'â„•', + 'nopf' => 'ð•Ÿ', + 'Not' => '⫬', + 'not' => '¬', + 'no' => '¬', + 'NotCongruent' => '≢', + 'NotCupCap' => '≭', + 'NotDoubleVerticalBar' => '∦', + 'NotElement' => '∉', + 'NotEqual' => '≠', + 'NotEqualTilde' => '≂̸', + 'NotExists' => '∄', + 'NotGreater' => '≯', + 'NotGreaterEqual' => '≱', + 'NotGreaterFullEqual' => '≧̸', + 'NotGreaterGreater' => '≫̸', + 'NotGreaterLess' => '≹', + 'NotGreaterSlantEqual' => '⩾̸', + 'NotGreaterTilde' => '≵', + 'NotHumpDownHump' => '≎̸', + 'NotHumpEqual' => 'â‰Ì¸', + 'notin' => '∉', + 'notindot' => 
'⋵̸', + 'notinE' => '⋹̸', + 'notinva' => '∉', + 'notinvb' => 'â‹·', + 'notinvc' => '⋶', + 'NotLeftTriangle' => '⋪', + 'NotLeftTriangleBar' => 'â§Ì¸', + 'NotLeftTriangleEqual' => '⋬', + 'NotLess' => '≮', + 'NotLessEqual' => '≰', + 'NotLessGreater' => '≸', + 'NotLessLess' => '≪̸', + 'NotLessSlantEqual' => '⩽̸', + 'NotLessTilde' => '≴', + 'NotNestedGreaterGreater' => '⪢̸', + 'NotNestedLessLess' => '⪡̸', + 'notni' => '∌', + 'notniva' => '∌', + 'notnivb' => '⋾', + 'notnivc' => '⋽', + 'NotPrecedes' => '⊀', + 'NotPrecedesEqual' => '⪯̸', + 'NotPrecedesSlantEqual' => 'â‹ ', + 'NotReverseElement' => '∌', + 'NotRightTriangle' => 'â‹«', + 'NotRightTriangleBar' => 'â§Ì¸', + 'NotRightTriangleEqual' => 'â‹­', + 'NotSquareSubset' => 'âŠÌ¸', + 'NotSquareSubsetEqual' => 'â‹¢', + 'NotSquareSuperset' => 'âŠÌ¸', + 'NotSquareSupersetEqual' => 'â‹£', + 'NotSubset' => '⊂⃒', + 'NotSubsetEqual' => '⊈', + 'NotSucceeds' => 'âŠ', + 'NotSucceedsEqual' => '⪰̸', + 'NotSucceedsSlantEqual' => 'â‹¡', + 'NotSucceedsTilde' => '≿̸', + 'NotSuperset' => '⊃⃒', + 'NotSupersetEqual' => '⊉', + 'NotTilde' => 'â‰', + 'NotTildeEqual' => '≄', + 'NotTildeFullEqual' => '≇', + 'NotTildeTilde' => '≉', + 'NotVerticalBar' => '∤', + 'npar' => '∦', + 'nparallel' => '∦', + 'nparsl' => '⫽⃥', + 'npart' => '∂̸', + 'npolint' => '⨔', + 'npr' => '⊀', + 'nprcue' => 'â‹ ', + 'npre' => '⪯̸', + 'nprec' => '⊀', + 'npreceq' => '⪯̸', + 'nrArr' => 'â‡', + 'nrarr' => '↛', + 'nrarrc' => '⤳̸', + 'nrarrw' => 'â†Ì¸', + 'nRightarrow' => 'â‡', + 'nrightarrow' => '↛', + 'nrtri' => 'â‹«', + 'nrtrie' => 'â‹­', + 'nsc' => 'âŠ', + 'nsccue' => 'â‹¡', + 'nsce' => '⪰̸', + 'Nscr' => 'ð’©', + 'nscr' => 'ð“ƒ', + 'nshortmid' => '∤', + 'nshortparallel' => '∦', + 'nsim' => 'â‰', + 'nsime' => '≄', + 'nsimeq' => '≄', + 'nsmid' => '∤', + 'nspar' => '∦', + 'nsqsube' => 'â‹¢', + 'nsqsupe' => 'â‹£', + 'nsub' => '⊄', + 'nsubE' => '⫅̸', + 'nsube' => '⊈', + 'nsubset' => '⊂⃒', + 'nsubseteq' => '⊈', + 'nsubseteqq' => '⫅̸', + 'nsucc' => 'âŠ', + 'nsucceq' => '⪰̸', + 
'nsup' => '⊅', + 'nsupE' => '⫆̸', + 'nsupe' => '⊉', + 'nsupset' => '⊃⃒', + 'nsupseteq' => '⊉', + 'nsupseteqq' => '⫆̸', + 'ntgl' => '≹', + 'Ntilde' => 'Ñ', + 'Ntild' => 'Ñ', + 'ntilde' => 'ñ', + 'ntild' => 'ñ', + 'ntlg' => '≸', + 'ntriangleleft' => '⋪', + 'ntrianglelefteq' => '⋬', + 'ntriangleright' => 'â‹«', + 'ntrianglerighteq' => 'â‹­', + 'Nu' => 'Î', + 'nu' => 'ν', + 'num' => '#', + 'numero' => 'â„–', + 'numsp' => ' ', + 'nvap' => 'â‰âƒ’', + 'nVDash' => '⊯', + 'nVdash' => '⊮', + 'nvDash' => '⊭', + 'nvdash' => '⊬', + 'nvge' => '≥⃒', + 'nvgt' => '>⃒', + 'nvHarr' => '⤄', + 'nvinfin' => '⧞', + 'nvlArr' => '⤂', + 'nvle' => '≤⃒', + 'nvlt' => '<⃒', + 'nvltrie' => '⊴⃒', + 'nvrArr' => '⤃', + 'nvrtrie' => '⊵⃒', + 'nvsim' => '∼⃒', + 'nwarhk' => '⤣', + 'nwArr' => '⇖', + 'nwarr' => '↖', + 'nwarrow' => '↖', + 'nwnear' => '⤧', + 'Oacute' => 'Ó', + 'Oacut' => 'Ó', + 'oacute' => 'ó', + 'oacut' => 'ó', + 'oast' => '⊛', + 'ocir' => 'ô', + 'Ocirc' => 'Ô', + 'Ocir' => 'Ô', + 'ocirc' => 'ô', + 'Ocy' => 'О', + 'ocy' => 'о', + 'odash' => 'âŠ', + 'Odblac' => 'Å', + 'odblac' => 'Å‘', + 'odiv' => '⨸', + 'odot' => '⊙', + 'odsold' => '⦼', + 'OElig' => 'Å’', + 'oelig' => 'Å“', + 'ofcir' => '⦿', + 'Ofr' => 'ð”’', + 'ofr' => 'ð”¬', + 'ogon' => 'Ë›', + 'Ograve' => 'Ã’', + 'Ograv' => 'Ã’', + 'ograve' => 'ò', + 'ograv' => 'ò', + 'ogt' => 'â§', + 'ohbar' => '⦵', + 'ohm' => 'Ω', + 'oint' => '∮', + 'olarr' => '↺', + 'olcir' => '⦾', + 'olcross' => '⦻', + 'oline' => '‾', + 'olt' => '⧀', + 'Omacr' => 'ÅŒ', + 'omacr' => 'Å', + 'Omega' => 'Ω', + 'omega' => 'ω', + 'Omicron' => 'Ο', + 'omicron' => 'ο', + 'omid' => '⦶', + 'ominus' => '⊖', + 'Oopf' => 'ð•†', + 'oopf' => 'ð• ', + 'opar' => '⦷', + 'OpenCurlyDoubleQuote' => '“', + 'OpenCurlyQuote' => '‘', + 'operp' => '⦹', + 'oplus' => '⊕', + 'Or' => 'â©”', + 'or' => '∨', + 'orarr' => '↻', + 'ord' => 'º', + 'order' => 'â„´', + 'orderof' => 'â„´', + 'ordf' => 'ª', + 'ordm' => 'º', + 'origof' => '⊶', + 'oror' => 'â©–', + 'orslope' => 'â©—', + 'orv' => 'â©›', + 
'oS' => 'Ⓢ', + 'Oscr' => 'ð’ª', + 'oscr' => 'â„´', + 'Oslash' => 'Ø', + 'Oslas' => 'Ø', + 'oslash' => 'ø', + 'oslas' => 'ø', + 'osol' => '⊘', + 'Otilde' => 'Õ', + 'Otild' => 'Õ', + 'otilde' => 'õ', + 'otild' => 'õ', + 'Otimes' => '⨷', + 'otimes' => '⊗', + 'otimesas' => '⨶', + 'Ouml' => 'Ö', + 'Oum' => 'Ö', + 'ouml' => 'ö', + 'oum' => 'ö', + 'ovbar' => '⌽', + 'OverBar' => '‾', + 'OverBrace' => 'âž', + 'OverBracket' => '⎴', + 'OverParenthesis' => 'âœ', + 'par' => '¶', + 'para' => '¶', + 'parallel' => '∥', + 'parsim' => '⫳', + 'parsl' => '⫽', + 'part' => '∂', + 'PartialD' => '∂', + 'Pcy' => 'П', + 'pcy' => 'п', + 'percnt' => '%', + 'period' => '.', + 'permil' => '‰', + 'perp' => '⊥', + 'pertenk' => '‱', + 'Pfr' => 'ð”“', + 'pfr' => 'ð”­', + 'Phi' => 'Φ', + 'phi' => 'φ', + 'phiv' => 'Ï•', + 'phmmat' => 'ℳ', + 'phone' => '☎', + 'Pi' => 'Π', + 'pi' => 'Ï€', + 'pitchfork' => 'â‹”', + 'piv' => 'Ï–', + 'planck' => 'â„', + 'planckh' => 'â„Ž', + 'plankv' => 'â„', + 'plus' => '+', + 'plusacir' => '⨣', + 'plusb' => '⊞', + 'pluscir' => '⨢', + 'plusdo' => '∔', + 'plusdu' => '⨥', + 'pluse' => '⩲', + 'PlusMinus' => '±', + 'plusmn' => '±', + 'plusm' => '±', + 'plussim' => '⨦', + 'plustwo' => '⨧', + 'pm' => '±', + 'Poincareplane' => 'â„Œ', + 'pointint' => '⨕', + 'Popf' => 'â„™', + 'popf' => 'ð•¡', + 'pound' => '£', + 'poun' => '£', + 'Pr' => '⪻', + 'pr' => '≺', + 'prap' => '⪷', + 'prcue' => '≼', + 'prE' => '⪳', + 'pre' => '⪯', + 'prec' => '≺', + 'precapprox' => '⪷', + 'preccurlyeq' => '≼', + 'Precedes' => '≺', + 'PrecedesEqual' => '⪯', + 'PrecedesSlantEqual' => '≼', + 'PrecedesTilde' => '≾', + 'preceq' => '⪯', + 'precnapprox' => '⪹', + 'precneqq' => '⪵', + 'precnsim' => '⋨', + 'precsim' => '≾', + 'Prime' => '″', + 'prime' => '′', + 'primes' => 'â„™', + 'prnap' => '⪹', + 'prnE' => '⪵', + 'prnsim' => '⋨', + 'prod' => 'âˆ', + 'Product' => 'âˆ', + 'profalar' => '⌮', + 'profline' => '⌒', + 'profsurf' => '⌓', + 'prop' => 'âˆ', + 'Proportion' => '∷', + 'Proportional' => 'âˆ', + 'propto' => 
'âˆ', + 'prsim' => '≾', + 'prurel' => '⊰', + 'Pscr' => 'ð’«', + 'pscr' => 'ð“…', + 'Psi' => 'Ψ', + 'psi' => 'ψ', + 'puncsp' => ' ', + 'Qfr' => 'ð””', + 'qfr' => 'ð”®', + 'qint' => '⨌', + 'Qopf' => 'â„š', + 'qopf' => 'ð•¢', + 'qprime' => 'â—', + 'Qscr' => 'ð’¬', + 'qscr' => 'ð“†', + 'quaternions' => 'â„', + 'quatint' => '⨖', + 'quest' => '?', + 'questeq' => '≟', + 'QUOT' => '"', + 'QUO' => '"', + 'quot' => '"', + 'quo' => '"', + 'rAarr' => '⇛', + 'race' => '∽̱', + 'Racute' => 'Å”', + 'racute' => 'Å•', + 'radic' => '√', + 'raemptyv' => '⦳', + 'Rang' => '⟫', + 'rang' => '⟩', + 'rangd' => '⦒', + 'range' => '⦥', + 'rangle' => '⟩', + 'raquo' => '»', + 'raqu' => '»', + 'Rarr' => '↠', + 'rArr' => '⇒', + 'rarr' => '→', + 'rarrap' => '⥵', + 'rarrb' => '⇥', + 'rarrbfs' => '⤠', + 'rarrc' => '⤳', + 'rarrfs' => '⤞', + 'rarrhk' => '↪', + 'rarrlp' => '↬', + 'rarrpl' => '⥅', + 'rarrsim' => '⥴', + 'Rarrtl' => '⤖', + 'rarrtl' => '↣', + 'rarrw' => 'â†', + 'rAtail' => '⤜', + 'ratail' => '⤚', + 'ratio' => '∶', + 'rationals' => 'â„š', + 'RBarr' => 'â¤', + 'rBarr' => 'â¤', + 'rbarr' => 'â¤', + 'rbbrk' => 'â³', + 'rbrace' => '}', + 'rbrack' => ']', + 'rbrke' => '⦌', + 'rbrksld' => '⦎', + 'rbrkslu' => 'â¦', + 'Rcaron' => 'Ř', + 'rcaron' => 'Å™', + 'Rcedil' => 'Å–', + 'rcedil' => 'Å—', + 'rceil' => '⌉', + 'rcub' => '}', + 'Rcy' => 'Р', + 'rcy' => 'Ñ€', + 'rdca' => '⤷', + 'rdldhar' => '⥩', + 'rdquo' => 'â€', + 'rdquor' => 'â€', + 'rdsh' => '↳', + 'Re' => 'â„œ', + 'real' => 'â„œ', + 'realine' => 'â„›', + 'realpart' => 'â„œ', + 'reals' => 'â„', + 'rect' => 'â–­', + 'REG' => '®', + 'RE' => '®', + 'reg' => '®', + 're' => '®', + 'ReverseElement' => '∋', + 'ReverseEquilibrium' => '⇋', + 'ReverseUpEquilibrium' => '⥯', + 'rfisht' => '⥽', + 'rfloor' => '⌋', + 'Rfr' => 'â„œ', + 'rfr' => 'ð”¯', + 'rHar' => '⥤', + 'rhard' => 'â‡', + 'rharu' => '⇀', + 'rharul' => '⥬', + 'Rho' => 'Ρ', + 'rho' => 'Ï', + 'rhov' => 'ϱ', + 'RightAngleBracket' => '⟩', + 'RightArrow' => '→', + 'Rightarrow' => '⇒', + 'rightarrow' 
=> '→', + 'RightArrowBar' => '⇥', + 'RightArrowLeftArrow' => '⇄', + 'rightarrowtail' => '↣', + 'RightCeiling' => '⌉', + 'RightDoubleBracket' => '⟧', + 'RightDownTeeVector' => 'â¥', + 'RightDownVector' => '⇂', + 'RightDownVectorBar' => '⥕', + 'RightFloor' => '⌋', + 'rightharpoondown' => 'â‡', + 'rightharpoonup' => '⇀', + 'rightleftarrows' => '⇄', + 'rightleftharpoons' => '⇌', + 'rightrightarrows' => '⇉', + 'rightsquigarrow' => 'â†', + 'RightTee' => '⊢', + 'RightTeeArrow' => '↦', + 'RightTeeVector' => '⥛', + 'rightthreetimes' => 'â‹Œ', + 'RightTriangle' => '⊳', + 'RightTriangleBar' => 'â§', + 'RightTriangleEqual' => '⊵', + 'RightUpDownVector' => 'â¥', + 'RightUpTeeVector' => '⥜', + 'RightUpVector' => '↾', + 'RightUpVectorBar' => '⥔', + 'RightVector' => '⇀', + 'RightVectorBar' => '⥓', + 'ring' => 'Ëš', + 'risingdotseq' => '≓', + 'rlarr' => '⇄', + 'rlhar' => '⇌', + 'rlm' => 'â€', + 'rmoust' => '⎱', + 'rmoustache' => '⎱', + 'rnmid' => 'â«®', + 'roang' => '⟭', + 'roarr' => '⇾', + 'robrk' => '⟧', + 'ropar' => '⦆', + 'Ropf' => 'â„', + 'ropf' => 'ð•£', + 'roplus' => '⨮', + 'rotimes' => '⨵', + 'RoundImplies' => '⥰', + 'rpar' => ')', + 'rpargt' => '⦔', + 'rppolint' => '⨒', + 'rrarr' => '⇉', + 'Rrightarrow' => '⇛', + 'rsaquo' => '›', + 'Rscr' => 'â„›', + 'rscr' => 'ð“‡', + 'Rsh' => '↱', + 'rsh' => '↱', + 'rsqb' => ']', + 'rsquo' => '’', + 'rsquor' => '’', + 'rthree' => 'â‹Œ', + 'rtimes' => 'â‹Š', + 'rtri' => 'â–¹', + 'rtrie' => '⊵', + 'rtrif' => 'â–¸', + 'rtriltri' => '⧎', + 'RuleDelayed' => '⧴', + 'ruluhar' => '⥨', + 'rx' => 'â„ž', + 'Sacute' => 'Åš', + 'sacute' => 'Å›', + 'sbquo' => '‚', + 'Sc' => '⪼', + 'sc' => '≻', + 'scap' => '⪸', + 'Scaron' => 'Å ', + 'scaron' => 'Å¡', + 'sccue' => '≽', + 'scE' => '⪴', + 'sce' => '⪰', + 'Scedil' => 'Åž', + 'scedil' => 'ÅŸ', + 'Scirc' => 'Åœ', + 'scirc' => 'Å', + 'scnap' => '⪺', + 'scnE' => '⪶', + 'scnsim' => 'â‹©', + 'scpolint' => '⨓', + 'scsim' => '≿', + 'Scy' => 'С', + 'scy' => 'Ñ', + 'sdot' => 'â‹…', + 'sdotb' => '⊡', + 'sdote' => 
'⩦', + 'searhk' => '⤥', + 'seArr' => '⇘', + 'searr' => '↘', + 'searrow' => '↘', + 'sect' => '§', + 'sec' => '§', + 'semi' => ';', + 'seswar' => '⤩', + 'setminus' => '∖', + 'setmn' => '∖', + 'sext' => '✶', + 'Sfr' => 'ð”–', + 'sfr' => 'ð”°', + 'sfrown' => '⌢', + 'sharp' => '♯', + 'SHCHcy' => 'Щ', + 'shchcy' => 'щ', + 'SHcy' => 'Ш', + 'shcy' => 'ш', + 'ShortDownArrow' => '↓', + 'ShortLeftArrow' => 'â†', + 'shortmid' => '∣', + 'shortparallel' => '∥', + 'ShortRightArrow' => '→', + 'ShortUpArrow' => '↑', + 'shy' => '­', + 'sh' => '­', + 'Sigma' => 'Σ', + 'sigma' => 'σ', + 'sigmaf' => 'Ï‚', + 'sigmav' => 'Ï‚', + 'sim' => '∼', + 'simdot' => '⩪', + 'sime' => '≃', + 'simeq' => '≃', + 'simg' => '⪞', + 'simgE' => '⪠', + 'siml' => 'âª', + 'simlE' => '⪟', + 'simne' => '≆', + 'simplus' => '⨤', + 'simrarr' => '⥲', + 'slarr' => 'â†', + 'SmallCircle' => '∘', + 'smallsetminus' => '∖', + 'smashp' => '⨳', + 'smeparsl' => '⧤', + 'smid' => '∣', + 'smile' => '⌣', + 'smt' => '⪪', + 'smte' => '⪬', + 'smtes' => '⪬︀', + 'SOFTcy' => 'Ь', + 'softcy' => 'ÑŒ', + 'sol' => '/', + 'solb' => '⧄', + 'solbar' => '⌿', + 'Sopf' => 'ð•Š', + 'sopf' => 'ð•¤', + 'spades' => 'â™ ', + 'spadesuit' => 'â™ ', + 'spar' => '∥', + 'sqcap' => '⊓', + 'sqcaps' => '⊓︀', + 'sqcup' => '⊔', + 'sqcups' => '⊔︀', + 'Sqrt' => '√', + 'sqsub' => 'âŠ', + 'sqsube' => '⊑', + 'sqsubset' => 'âŠ', + 'sqsubseteq' => '⊑', + 'sqsup' => 'âŠ', + 'sqsupe' => '⊒', + 'sqsupset' => 'âŠ', + 'sqsupseteq' => '⊒', + 'squ' => 'â–¡', + 'Square' => 'â–¡', + 'square' => 'â–¡', + 'SquareIntersection' => '⊓', + 'SquareSubset' => 'âŠ', + 'SquareSubsetEqual' => '⊑', + 'SquareSuperset' => 'âŠ', + 'SquareSupersetEqual' => '⊒', + 'SquareUnion' => '⊔', + 'squarf' => 'â–ª', + 'squf' => 'â–ª', + 'srarr' => '→', + 'Sscr' => 'ð’®', + 'sscr' => 'ð“ˆ', + 'ssetmn' => '∖', + 'ssmile' => '⌣', + 'sstarf' => '⋆', + 'Star' => '⋆', + 'star' => '☆', + 'starf' => '★', + 'straightepsilon' => 'ϵ', + 'straightphi' => 'Ï•', + 'strns' => '¯', + 'Sub' => 'â‹', + 'sub' => '⊂', + 
'subdot' => '⪽', + 'subE' => 'â«…', + 'sube' => '⊆', + 'subedot' => '⫃', + 'submult' => 'â«', + 'subnE' => 'â«‹', + 'subne' => '⊊', + 'subplus' => '⪿', + 'subrarr' => '⥹', + 'Subset' => 'â‹', + 'subset' => '⊂', + 'subseteq' => '⊆', + 'subseteqq' => 'â«…', + 'SubsetEqual' => '⊆', + 'subsetneq' => '⊊', + 'subsetneqq' => 'â«‹', + 'subsim' => '⫇', + 'subsub' => 'â«•', + 'subsup' => 'â«“', + 'succ' => '≻', + 'succapprox' => '⪸', + 'succcurlyeq' => '≽', + 'Succeeds' => '≻', + 'SucceedsEqual' => '⪰', + 'SucceedsSlantEqual' => '≽', + 'SucceedsTilde' => '≿', + 'succeq' => '⪰', + 'succnapprox' => '⪺', + 'succneqq' => '⪶', + 'succnsim' => 'â‹©', + 'succsim' => '≿', + 'SuchThat' => '∋', + 'Sum' => '∑', + 'sum' => '∑', + 'sung' => '♪', + 'Sup' => 'â‹‘', + 'sup' => '³', + 'sup1' => '¹', + 'sup2' => '²', + 'sup3' => '³', + 'supdot' => '⪾', + 'supdsub' => '⫘', + 'supE' => '⫆', + 'supe' => '⊇', + 'supedot' => 'â«„', + 'Superset' => '⊃', + 'SupersetEqual' => '⊇', + 'suphsol' => '⟉', + 'suphsub' => 'â«—', + 'suplarr' => '⥻', + 'supmult' => 'â«‚', + 'supnE' => 'â«Œ', + 'supne' => '⊋', + 'supplus' => 'â«€', + 'Supset' => 'â‹‘', + 'supset' => '⊃', + 'supseteq' => '⊇', + 'supseteqq' => '⫆', + 'supsetneq' => '⊋', + 'supsetneqq' => 'â«Œ', + 'supsim' => '⫈', + 'supsub' => 'â«”', + 'supsup' => 'â«–', + 'swarhk' => '⤦', + 'swArr' => '⇙', + 'swarr' => '↙', + 'swarrow' => '↙', + 'swnwar' => '⤪', + 'szlig' => 'ß', + 'szli' => 'ß', + 'Tab' => ' ', + 'target' => '⌖', + 'Tau' => 'Τ', + 'tau' => 'Ï„', + 'tbrk' => '⎴', + 'Tcaron' => 'Ť', + 'tcaron' => 'Å¥', + 'Tcedil' => 'Å¢', + 'tcedil' => 'Å£', + 'Tcy' => 'Т', + 'tcy' => 'Ñ‚', + 'tdot' => '⃛', + 'telrec' => '⌕', + 'Tfr' => 'ð”—', + 'tfr' => 'ð”±', + 'there4' => '∴', + 'Therefore' => '∴', + 'therefore' => '∴', + 'Theta' => 'Θ', + 'theta' => 'θ', + 'thetasym' => 'Ï‘', + 'thetav' => 'Ï‘', + 'thickapprox' => '≈', + 'thicksim' => '∼', + 'ThickSpace' => 'âŸâ€Š', + 'thinsp' => ' ', + 'ThinSpace' => ' ', + 'thkap' => '≈', + 'thksim' => '∼', + 'THORN' => 
'Þ', + 'THOR' => 'Þ', + 'thorn' => 'þ', + 'thor' => 'þ', + 'Tilde' => '∼', + 'tilde' => 'Ëœ', + 'TildeEqual' => '≃', + 'TildeFullEqual' => '≅', + 'TildeTilde' => '≈', + 'times' => '×', + 'time' => '×', + 'timesb' => '⊠', + 'timesbar' => '⨱', + 'timesd' => '⨰', + 'tint' => '∭', + 'toea' => '⤨', + 'top' => '⊤', + 'topbot' => '⌶', + 'topcir' => '⫱', + 'Topf' => 'ð•‹', + 'topf' => 'ð•¥', + 'topfork' => 'â«š', + 'tosa' => '⤩', + 'tprime' => '‴', + 'TRADE' => 'â„¢', + 'trade' => 'â„¢', + 'triangle' => 'â–µ', + 'triangledown' => 'â–¿', + 'triangleleft' => 'â—ƒ', + 'trianglelefteq' => '⊴', + 'triangleq' => '≜', + 'triangleright' => 'â–¹', + 'trianglerighteq' => '⊵', + 'tridot' => 'â—¬', + 'trie' => '≜', + 'triminus' => '⨺', + 'TripleDot' => '⃛', + 'triplus' => '⨹', + 'trisb' => 'â§', + 'tritime' => '⨻', + 'trpezium' => 'â¢', + 'Tscr' => 'ð’¯', + 'tscr' => 'ð“‰', + 'TScy' => 'Ц', + 'tscy' => 'ц', + 'TSHcy' => 'Ћ', + 'tshcy' => 'Ñ›', + 'Tstrok' => 'Ŧ', + 'tstrok' => 'ŧ', + 'twixt' => '≬', + 'twoheadleftarrow' => '↞', + 'twoheadrightarrow' => '↠', + 'Uacute' => 'Ú', + 'Uacut' => 'Ú', + 'uacute' => 'ú', + 'uacut' => 'ú', + 'Uarr' => '↟', + 'uArr' => '⇑', + 'uarr' => '↑', + 'Uarrocir' => '⥉', + 'Ubrcy' => 'ÐŽ', + 'ubrcy' => 'Ñž', + 'Ubreve' => 'Ŭ', + 'ubreve' => 'Å­', + 'Ucirc' => 'Û', + 'Ucir' => 'Û', + 'ucirc' => 'û', + 'ucir' => 'û', + 'Ucy' => 'У', + 'ucy' => 'у', + 'udarr' => '⇅', + 'Udblac' => 'Å°', + 'udblac' => 'ű', + 'udhar' => '⥮', + 'ufisht' => '⥾', + 'Ufr' => 'ð”˜', + 'ufr' => 'ð”²', + 'Ugrave' => 'Ù', + 'Ugrav' => 'Ù', + 'ugrave' => 'ù', + 'ugrav' => 'ù', + 'uHar' => '⥣', + 'uharl' => '↿', + 'uharr' => '↾', + 'uhblk' => 'â–€', + 'ulcorn' => '⌜', + 'ulcorner' => '⌜', + 'ulcrop' => 'âŒ', + 'ultri' => 'â—¸', + 'Umacr' => 'Ū', + 'umacr' => 'Å«', + 'uml' => '¨', + 'um' => '¨', + 'UnderBar' => '_', + 'UnderBrace' => 'âŸ', + 'UnderBracket' => '⎵', + 'UnderParenthesis' => 'â', + 'Union' => '⋃', + 'UnionPlus' => '⊎', + 'Uogon' => 'Ų', + 'uogon' => 'ų', + 'Uopf' => 'ð•Œ', + 
'uopf' => 'ð•¦', + 'UpArrow' => '↑', + 'Uparrow' => '⇑', + 'uparrow' => '↑', + 'UpArrowBar' => '⤒', + 'UpArrowDownArrow' => '⇅', + 'UpDownArrow' => '↕', + 'Updownarrow' => '⇕', + 'updownarrow' => '↕', + 'UpEquilibrium' => '⥮', + 'upharpoonleft' => '↿', + 'upharpoonright' => '↾', + 'uplus' => '⊎', + 'UpperLeftArrow' => '↖', + 'UpperRightArrow' => '↗', + 'Upsi' => 'Ï’', + 'upsi' => 'Ï…', + 'upsih' => 'Ï’', + 'Upsilon' => 'Î¥', + 'upsilon' => 'Ï…', + 'UpTee' => '⊥', + 'UpTeeArrow' => '↥', + 'upuparrows' => '⇈', + 'urcorn' => 'âŒ', + 'urcorner' => 'âŒ', + 'urcrop' => '⌎', + 'Uring' => 'Å®', + 'uring' => 'ů', + 'urtri' => 'â—¹', + 'Uscr' => 'ð’°', + 'uscr' => 'ð“Š', + 'utdot' => 'â‹°', + 'Utilde' => 'Ũ', + 'utilde' => 'Å©', + 'utri' => 'â–µ', + 'utrif' => 'â–´', + 'uuarr' => '⇈', + 'Uuml' => 'Ãœ', + 'Uum' => 'Ãœ', + 'uuml' => 'ü', + 'uum' => 'ü', + 'uwangle' => '⦧', + 'vangrt' => '⦜', + 'varepsilon' => 'ϵ', + 'varkappa' => 'Ï°', + 'varnothing' => '∅', + 'varphi' => 'Ï•', + 'varpi' => 'Ï–', + 'varpropto' => 'âˆ', + 'vArr' => '⇕', + 'varr' => '↕', + 'varrho' => 'ϱ', + 'varsigma' => 'Ï‚', + 'varsubsetneq' => '⊊︀', + 'varsubsetneqq' => '⫋︀', + 'varsupsetneq' => '⊋︀', + 'varsupsetneqq' => '⫌︀', + 'vartheta' => 'Ï‘', + 'vartriangleleft' => '⊲', + 'vartriangleright' => '⊳', + 'Vbar' => 'â««', + 'vBar' => '⫨', + 'vBarv' => 'â«©', + 'Vcy' => 'Ð’', + 'vcy' => 'в', + 'VDash' => '⊫', + 'Vdash' => '⊩', + 'vDash' => '⊨', + 'vdash' => '⊢', + 'Vdashl' => '⫦', + 'Vee' => 'â‹', + 'vee' => '∨', + 'veebar' => '⊻', + 'veeeq' => '≚', + 'vellip' => 'â‹®', + 'Verbar' => '‖', + 'verbar' => '|', + 'Vert' => '‖', + 'vert' => '|', + 'VerticalBar' => '∣', + 'VerticalLine' => '|', + 'VerticalSeparator' => 'â˜', + 'VerticalTilde' => '≀', + 'VeryThinSpace' => ' ', + 'Vfr' => 'ð”™', + 'vfr' => 'ð”³', + 'vltri' => '⊲', + 'vnsub' => '⊂⃒', + 'vnsup' => '⊃⃒', + 'Vopf' => 'ð•', + 'vopf' => 'ð•§', + 'vprop' => 'âˆ', + 'vrtri' => '⊳', + 'Vscr' => 'ð’±', + 'vscr' => 'ð“‹', + 'vsubnE' => '⫋︀', + 'vsubne' => 
'⊊︀', + 'vsupnE' => '⫌︀', + 'vsupne' => '⊋︀', + 'Vvdash' => '⊪', + 'vzigzag' => '⦚', + 'Wcirc' => 'Å´', + 'wcirc' => 'ŵ', + 'wedbar' => 'â©Ÿ', + 'Wedge' => 'â‹€', + 'wedge' => '∧', + 'wedgeq' => '≙', + 'weierp' => '℘', + 'Wfr' => 'ð”š', + 'wfr' => 'ð”´', + 'Wopf' => 'ð•Ž', + 'wopf' => 'ð•¨', + 'wp' => '℘', + 'wr' => '≀', + 'wreath' => '≀', + 'Wscr' => 'ð’²', + 'wscr' => 'ð“Œ', + 'xcap' => 'â‹‚', + 'xcirc' => 'â—¯', + 'xcup' => '⋃', + 'xdtri' => 'â–½', + 'Xfr' => 'ð”›', + 'xfr' => 'ð”µ', + 'xhArr' => '⟺', + 'xharr' => '⟷', + 'Xi' => 'Ξ', + 'xi' => 'ξ', + 'xlArr' => '⟸', + 'xlarr' => '⟵', + 'xmap' => '⟼', + 'xnis' => 'â‹»', + 'xodot' => '⨀', + 'Xopf' => 'ð•', + 'xopf' => 'ð•©', + 'xoplus' => 'â¨', + 'xotime' => '⨂', + 'xrArr' => '⟹', + 'xrarr' => '⟶', + 'Xscr' => 'ð’³', + 'xscr' => 'ð“', + 'xsqcup' => '⨆', + 'xuplus' => '⨄', + 'xutri' => 'â–³', + 'xvee' => 'â‹', + 'xwedge' => 'â‹€', + 'Yacute' => 'Ã', + 'Yacut' => 'Ã', + 'yacute' => 'ý', + 'yacut' => 'ý', + 'YAcy' => 'Я', + 'yacy' => 'Ñ', + 'Ycirc' => 'Ŷ', + 'ycirc' => 'Å·', + 'Ycy' => 'Ы', + 'ycy' => 'Ñ‹', + 'yen' => 'Â¥', + 'ye' => 'Â¥', + 'Yfr' => 'ð”œ', + 'yfr' => 'ð”¶', + 'YIcy' => 'Ї', + 'yicy' => 'Ñ—', + 'Yopf' => 'ð•', + 'yopf' => 'ð•ª', + 'Yscr' => 'ð’´', + 'yscr' => 'ð“Ž', + 'YUcy' => 'Ю', + 'yucy' => 'ÑŽ', + 'Yuml' => 'Ÿ', + 'yuml' => 'ÿ', + 'yum' => 'ÿ', + 'Zacute' => 'Ź', + 'zacute' => 'ź', + 'Zcaron' => 'Ž', + 'zcaron' => 'ž', + 'Zcy' => 'З', + 'zcy' => 'з', + 'Zdot' => 'Å»', + 'zdot' => 'ż', + 'zeetrf' => 'ℨ', + 'ZeroWidthSpace' => '​', + 'Zeta' => 'Ζ', + 'zeta' => 'ζ', + 'Zfr' => 'ℨ', + 'zfr' => 'ð”·', + 'ZHcy' => 'Ж', + 'zhcy' => 'ж', + 'zigrarr' => 'â‡', + 'Zopf' => 'ℤ', + 'zopf' => 'ð•«', + 'Zscr' => 'ð’µ', + 'zscr' => 'ð“', + 'zwj' => 'â€', + 'zwnj' => '‌' + ); } diff --git a/libraries/html5php/HTML5/Exception.php b/libraries/html5php/HTML5/Exception.php index aa650a6..8f33126 100644 --- a/libraries/html5php/HTML5/Exception.php +++ b/libraries/html5php/HTML5/Exception.php @@ -1,8 +1,9 @@ 
createDocumentType('html'); - //$this->doc = \DOMImplementation::createDocument(NULL, 'html', $dt); - $this->doc = $impl->createDocument(NULL, NULL, $dt); - $this->doc->errors = array(); - - // $this->current = $this->doc->documentElement; - $this->current = $this->doc; //->documentElement; - - // Create a rules engine for tags. - $this->rules = new TreeBuildingRules($this->doc); - - if ($isFragment) { - $this->isFragment = TRUE; - $this->insertMode = static::IM_IN_BODY; - $ele = $this->doc->createElement('html'); - $this->doc->appendChild($ele); - $this->current = $ele; - } - } - - /** - * Get the document. - */ - public function document() { - return $this->doc; - } - - /** - * Get the DOM fragment for the body. - * - * This returns a DOMNodeList because a fragment may have zero or more - * DOMNodes at its root. - * - * @see http://www.w3.org/TR/2012/CR-html5-20121217/syntax.html#concept-frag-parse-context - * - * @return \DOMFragmentDocumentFragment - */ - public function fragment() { - $append = $this->doc->documentElement->childNodes; - $frag = $this->doc->createDocumentFragment(); - - // appendChild() modifies the DOMNodeList, so we - // have to buffer up the items first, then use the - // array buffer and loop twice. - $buffer = array(); - foreach ($append as $node) { - $buffer[] = $node; - } - - foreach ($buffer as $node) { - $frag->appendChild($node); - } - - $frag->errors = $this->doc->errors; - return $frag; - } - - /** - * Provide an instruction processor. - * - * This is used for handling Processor Instructions as they are - * inserted. If omitted, PI's are inserted directly into the DOM tree. - */ - public function setInstructionProcessor(\HTML5\InstructionProcessor $proc) { - $this->processor = $proc; - } - - public function doctype($name, $idType = 0, $id = NULL, $quirks = FALSE) { - // This is used solely for setting quirks mode. Currently we don't - // try to preserve the inbound DT. We convert it to HTML5. 
- $this->quirks = $quirks; - - if ($this->insertMode > static::IM_INITIAL) { - $this->parseError("Illegal placement of DOCTYPE tag. Ignoring: " . $name); - return; - } - - $this->insertMode = static::IM_BEFORE_HTML; - } - - /** - * Process the start tag. - * - * @todo - * - XMLNS namespace handling (we need to parse, even if it's not valid) - * - XLink, MathML and SVG namespace handling - * - Omission rules: 8.1.2.4 Optional tags - */ - public function startTag($name, $attributes = array(), $selfClosing = FALSE) { - // fprintf(STDOUT, $name); - $lname = $this->normalizeTagName($name); - - // Make sure we have an html element. - if (!$this->doc->documentElement && $name !== 'html') { - $this->startTag('html'); - } - - // Set quirks mode if we're at IM_INITIAL with no doctype. - if ($this->insertMode == static::IM_INITIAL) { - $this->quirks = TRUE; - $this->parseError("No DOCTYPE specified."); - } - - // SPECIAL TAG HANDLING: - // Spec says do this, and "don't ask." - if ($name == 'image') { - $name = 'img'; - } - - - // Autoclose p tags where appropriate. - if ($this->insertMode >= static::IM_IN_BODY && Elements::isA($name, Elements::AUTOCLOSE_P)) { - $this->autoclose('p'); - } - - // Set insert mode: - switch ($name) { - case 'html': - $this->insertMode = static::IM_BEFORE_HEAD; - break; - case 'head': - if ($this->insertMode > static::IM_BEFORE_HEAD) { - $this->parseError("Unexpected head tag outside of head context."); - } - else { - $this->insertMode = static::IM_IN_HEAD; - } - break; - case 'body': - $this->insertMode = static::IM_IN_BODY; - break; - case 'svg': - $this->insertMode = static::IM_IN_SVG; - break; - case 'math': - $this->insertMode = static::IM_IN_MATHML; - break; - case 'noscript': - if ($this->insertMode == static::IM_IN_HEAD) { - $this->insertMode = static::IM_IN_HEAD_NOSCRIPT; - } - break; - - } - - // Special case handling for SVG. 
- if ($this->insertMode == static::IM_IN_SVG) { - $lname = Elements::normalizeSvgElement($lname); - } - - try { - $ele = $this->doc->createElement($lname); - } - catch(\DOMException $e) { - $this->parseError("Illegal tag name: <$lname>. Replaced with ."); - $ele = $this->doc->createElement('invalid'); - } - - foreach ($attributes as $aName => $aVal) { - - if ($this->insertMode == static::IM_IN_SVG) { - $aName = Elements::normalizeSvgAttribute($aName); - } - elseif ($this->insertMode == static::IM_IN_MATHML) { - $aName = Elements::normalizeMathMlAttribute($aName); - } - - try { - $ele->setAttribute($aName, $aVal); - } - catch(\DOMException $e) { - $this->parseError("Illegal attribute name for tag $name. Ignoring: $aName"); - continue; - } - - // This is necessary on a non-DTD schema, like HTML5. - if ($aName == 'id') { - $ele->setIdAttribute('id', TRUE); - } - } - - // Some elements have special processing rules. Handle those separately. - if ($this->rules->hasRules($name)) { - $this->current = $this->rules->evaluate($ele, $this->current); - } - // Otherwise, it's a standard element. - else { - $this->current->appendChild($ele); - - // XXX: Need to handle self-closing tags and unary tags. - if (!Elements::isA($name, Elements::VOID_TAG)) { - $this->current = $ele; - } - } - - // This is sort of a last-ditch attempt to correct for cases where no head/body - // elements are provided. - if ($this->insertMode <= static::IM_BEFORE_HEAD && $name != 'head' && $name != 'html') { - $this->insertMode = static::IM_IN_BODY; - } - - // Return the element mask, which the tokenizer can then use to set - // various processing rules. - return Elements::element($name); - } - - public function endTag($name) { - $lname = $this->normalizeTagName($name); - - // Ignore closing tags for unary elements. 
- if (Elements::isA($name, Elements::VOID_TAG)) { - return; - } - - if ($this->insertMode <= static::IM_BEFORE_HTML) { - // 8.2.5.4.2 - if (in_array($name, array('html', 'br', 'head', 'title'))) { - $this->startTag('html'); - $this->endTag($name); - $this->insertMode = static::IM_BEFORE_HEAD; - return; - } - - // Ignore the tag. - $this->parseError("Illegal closing tag at global scope."); - return; - } - - // Special case handling for SVG. - if ($this->insertMode == static::IM_IN_SVG) { - $lname = Elements::normalizeSvgElement($lname); - } - - // XXX: Not sure whether we need this anymore. - // if ($name != $lname) { - // return $this->quirksTreeResolver($lname); - //} - - // XXX: HTML has no parent. What do we do, though, - // if this element appears in the wrong place? - if ($lname == 'html') { - return; - } - - //$this->current = $this->current->parentNode; - if (!$this->autoclose($lname)) { - $this->parseError('Could not find closing tag for ' . $lname); - } - - //switch ($this->insertMode) { - switch ($lname) { - case "head": - $this->insertMode = static::IM_AFTER_HEAD; - break; - case "body": - $this->insertMode = static::IM_AFTER_BODY; - break; - case "svg": - case "mathml": - $this->insertMode = static::IM_IN_BODY; - break; - } - } - - public function comment($cdata) { - // TODO: Need to handle case where comment appears outside of the HTML tag. - $node = $this->doc->createComment($cdata); - $this->current->appendChild($node); - } - - public function text($data) { - // XXX: Hmmm.... should we really be this strict? - if ($this->insertMode < static::IM_IN_HEAD) { - // Per '8.2.5.4.3 The "before head" insertion mode' the characters - // " \t\n\r\f" should be ignored but no mention of a parse error. This is - // practical as most documents contain these characters. Other text is not - // expected here so recording a parse error is necessary. 
- $dataTmp = trim($data, " \t\n\r\f"); - if (!empty($dataTmp)) { - //fprintf(STDOUT, "Unexpected insert mode: %d", $this->insertMode); - $this->parseError("Unexpected text. Ignoring: " . $dataTmp); - } - return; - } - //fprintf(STDOUT, "Appending text %s.", $data); - $node = $this->doc->createTextNode($data); - $this->current->appendChild($node); - } - - public function eof() { - // If the $current isn't the $root, do we need to do anything? - } - - public function parseError($msg, $line = 0, $col = 0) { - $this->doc->errors[] = sprintf("Line %d, Col %d: %s", $line, $col, $msg); - } - - public function cdata($data) { - $node = $this->doc->createCDATASection($data); - $this->current->appendChild($node); - } - - public function processingInstruction($name, $data = NULL) { - // XXX: Ignore initial XML declaration, per the spec. - if ($this->insertMode == static::IM_INITIAL && 'xml' == strtolower($name)) { - return; - } - - // Important: The processor may modify the current DOM tree however - // it sees fit. - if (isset($this->processor)) { - $res = $this->processor->process($this->current, $name, $data); - if (!empty($res)) { - $this->current = $res; - } - return; - } - - // Otherwise, this is just a dumb PI element. - $node = $this->doc->createProcessingInstruction($name, $data); - - $this->current->appendChild($node); - } - - // ========================================================================== - // UTILITIES - // ========================================================================== - - /** - * Apply normalization rules to a tag name. - * - * See sections 2.9 and 8.1.2. - * - * @param string $name - * The tag name. - * @return string - * The normalized tag name. - */ - protected function normalizeTagName($name) { - /* Section 2.9 suggests that we should not do this. - if (strpos($name, ':') !== FALSE) { - // We know from the grammar that there must be at least one other - // char besides :, since : is not a legal tag start. 
- $parts = explode(':', $name); - return array_pop($parts); - } +class DOMTreeBuilder implements EventHandler +{ + /** + * Defined in http://www.w3.org/TR/html51/infrastructure.html#html-namespace-0 */ + const NAMESPACE_HTML = 'http://www.w3.org/1999/xhtml'; - return $name; - } + const NAMESPACE_MATHML = 'http://www.w3.org/1998/Math/MathML'; - protected function quirksTreeResolver($name) { - throw new \Exception("Not implemented."); + const NAMESPACE_SVG = 'http://www.w3.org/2000/svg'; - } + const NAMESPACE_XLINK = 'http://www.w3.org/1999/xlink'; - /** - * Automatically climb the tree and close the closest node with the matching $tag. - */ - protected function autoclose($tag) { - $working = $this->current; - do { - if ($working->nodeType != XML_ELEMENT_NODE) { - return FALSE; - } - if ($working->tagName == $tag) { - $this->current = $working->parentNode; - return TRUE; - } - } while ($working = $working->parentNode); - return FALSE; + const NAMESPACE_XML = 'http://www.w3.org/XML/1998/namespace'; - } + const NAMESPACE_XMLNS = 'http://www.w3.org/2000/xmlns/'; - /** - * Checks if the given tagname is an ancestor of the present candidate. - * - * If $this->current or anything above $this->current matches the given tag - * name, this returns TRUE. - */ - protected function isAncestor($tagname) { - $candidate = $this->current; - while ($candidate->nodeType === XML_ELEMENT_NODE) { - if ($candidate->tagName == $tagname) { - return TRUE; - } - $candidate = $candidate->parentNode; + /** + * Holds the HTML5 element names that causes a namespace switch + * + * @var array + */ + protected $nsRoots = array( + 'html' => self::NAMESPACE_HTML, + 'svg' => self::NAMESPACE_SVG, + 'math' => self::NAMESPACE_MATHML + ); + + /** + * Holds the always available namespaces (which does not require the XMLNS declaration). 
+ * + * @var array + */ + protected $implicitNamespaces = array( + 'xml' => self::NAMESPACE_XML, + 'xmlns' => self::NAMESPACE_XMLNS, + 'xlink' => self::NAMESPACE_XLINK + ); + + /** + * Holds a stack of currently active namespaces. + * + * @var array + */ + protected $nsStack = array(); + + /** + * Holds the number of namespaces declared by a node. + * + * @var array + */ + protected $pushes = array(); + + /** + * Defined in 8.2.5. + */ + const IM_INITIAL = 0; + + const IM_BEFORE_HTML = 1; + + const IM_BEFORE_HEAD = 2; + + const IM_IN_HEAD = 3; + + const IM_IN_HEAD_NOSCRIPT = 4; + + const IM_AFTER_HEAD = 5; + + const IM_IN_BODY = 6; + + const IM_TEXT = 7; + + const IM_IN_TABLE = 8; + + const IM_IN_TABLE_TEXT = 9; + + const IM_IN_CAPTION = 10; + + const IM_IN_COLUMN_GROUP = 11; + + const IM_IN_TABLE_BODY = 12; + + const IM_IN_ROW = 13; + + const IM_IN_CELL = 14; + + const IM_IN_SELECT = 15; + + const IM_IN_SELECT_IN_TABLE = 16; + + const IM_AFTER_BODY = 17; + + const IM_IN_FRAMESET = 18; + + const IM_AFTER_FRAMESET = 19; + + const IM_AFTER_AFTER_BODY = 20; + + const IM_AFTER_AFTER_FRAMESET = 21; + + const IM_IN_SVG = 22; + + const IM_IN_MATHML = 23; + + protected $options = array(); + + protected $stack = array(); + + protected $current; // Pointer in the tag hierarchy. + protected $doc; + + protected $frag; + + protected $processor; + + protected $insertMode = 0; + + /** + * Quirks mode is enabled by default. + * Any document that is missing the + * DT will be considered to be in quirks mode. + */ + protected $quirks = true; + + protected $errors = array(); + + public function __construct($isFragment = false, array $options = array()) + { + $this->options = $options; + + $impl = new \DOMImplementation(); + // XXX: + // Create the doctype. For now, we are always creating HTML5 + // documents, and attempting to up-convert any older DTDs to HTML5. 
+ $dt = $impl->createDocumentType('html'); + // $this->doc = \DOMImplementation::createDocument(NULL, 'html', $dt); + $this->doc = $impl->createDocument(null, null, $dt); + $this->errors = array(); + + $this->current = $this->doc; // ->documentElement; + + // Create a rules engine for tags. + $this->rules = new TreeBuildingRules($this->doc); + + // Fill $nsStack with the defalut HTML5 namespaces, plus the "implicitNamespaces" array taken form $options + array_unshift($this->nsStack, (isset($this->options["implicitNamespaces"]) ? $this->options["implicitNamespaces"] : array()) + array( + '' => self::NAMESPACE_HTML + ) + $this->implicitNamespaces); + + if ($isFragment) { + $this->insertMode = static::IM_IN_BODY; + $this->frag = $this->doc->createDocumentFragment(); + $this->current = $this->frag; + } } - return FALSE; - } - /** - * Returns TRUE if the immediate parent element is of the given tagname. - */ - protected function isParent($tagname) { - return $this->current->tagName == $tagname; - } + /** + * Get the document. + */ + public function document() + { + return $this->doc; + } + + /** + * Get the DOM fragment for the body. + * + * This returns a DOMNodeList because a fragment may have zero or more + * DOMNodes at its root. + * + * @see http://www.w3.org/TR/2012/CR-html5-20121217/syntax.html#concept-frag-parse-context + * + * @return \DOMFragmentDocumentFragment + */ + public function fragment() + { + return $this->frag; + } + + /** + * Provide an instruction processor. + * + * This is used for handling Processor Instructions as they are + * inserted. If omitted, PI's are inserted directly into the DOM tree. + */ + public function setInstructionProcessor(\Masterminds\HTML5\InstructionProcessor $proc) + { + $this->processor = $proc; + } + + public function doctype($name, $idType = 0, $id = null, $quirks = false) + { + // This is used solely for setting quirks mode. Currently we don't + // try to preserve the inbound DT. We convert it to HTML5. 
+ $this->quirks = $quirks; + + if ($this->insertMode > static::IM_INITIAL) { + $this->parseError("Illegal placement of DOCTYPE tag. Ignoring: " . $name); + + return; + } + + $this->insertMode = static::IM_BEFORE_HTML; + } + + /** + * Process the start tag. + * + * @todo - XMLNS namespace handling (we need to parse, even if it's not valid) + * - XLink, MathML and SVG namespace handling + * - Omission rules: 8.1.2.4 Optional tags + */ + public function startTag($name, $attributes = array(), $selfClosing = false) + { + // fprintf(STDOUT, $name); + $lname = $this->normalizeTagName($name); + + // Make sure we have an html element. + if (! $this->doc->documentElement && $name !== 'html' && ! $this->frag) { + $this->startTag('html'); + } + + // Set quirks mode if we're at IM_INITIAL with no doctype. + if ($this->insertMode == static::IM_INITIAL) { + $this->quirks = true; + $this->parseError("No DOCTYPE specified."); + } + + // SPECIAL TAG HANDLING: + // Spec says do this, and "don't ask." + if ($name == 'image') { + $name = 'img'; + } + + // Autoclose p tags where appropriate. + if ($this->insertMode >= static::IM_IN_BODY && Elements::isA($name, Elements::AUTOCLOSE_P)) { + $this->autoclose('p'); + } + + // Set insert mode: + switch ($name) { + case 'html': + $this->insertMode = static::IM_BEFORE_HEAD; + break; + case 'head': + if ($this->insertMode > static::IM_BEFORE_HEAD) { + $this->parseError("Unexpected head tag outside of head context."); + } else { + $this->insertMode = static::IM_IN_HEAD; + } + break; + case 'body': + $this->insertMode = static::IM_IN_BODY; + break; + case 'svg': + $this->insertMode = static::IM_IN_SVG; + break; + case 'math': + $this->insertMode = static::IM_IN_MATHML; + break; + case 'noscript': + if ($this->insertMode == static::IM_IN_HEAD) { + $this->insertMode = static::IM_IN_HEAD_NOSCRIPT; + } + break; + } + + // Special case handling for SVG. 
+ if ($this->insertMode == static::IM_IN_SVG) { + $lname = Elements::normalizeSvgElement($lname); + } + + $pushes = 0; + // when we found a tag thats appears inside $nsRoots, we have to switch the defalut namespace + if (isset($this->nsRoots[$lname]) && $this->nsStack[0][''] !== $this->nsRoots[$lname]) { + array_unshift($this->nsStack, array( + '' => $this->nsRoots[$lname] + ) + $this->nsStack[0]); + $pushes ++; + } + $needsWorkaround = false; + if (isset($this->options["xmlNamespaces"]) && $this->options["xmlNamespaces"]) { + // when xmlNamespaces is true a and we found a 'xmlns' or 'xmlns:*' attribute, we should add a new item to the $nsStack + foreach ($attributes as $aName => $aVal) { + if ($aName === 'xmlns') { + $needsWorkaround = $aVal; + array_unshift($this->nsStack, array( + '' => $aVal + ) + $this->nsStack[0]); + $pushes ++; + } elseif ((($pos = strpos($aName, ':')) ? substr($aName, 0, $pos) : '') === 'xmlns') { + array_unshift($this->nsStack, array( + substr($aName, $pos + 1) => $aVal + ) + $this->nsStack[0]); + $pushes ++; + } + } + } + + try { + $prefix = ($pos = strpos($lname, ':')) ? substr($lname, 0, $pos) : ''; + if ($needsWorkaround!==false) { + + $xml = "<$lname xmlns=\"$needsWorkaround\" ".(strlen($prefix) && isset($this->nsStack[0][$prefix])?("xmlns:$prefix=\"".$this->nsStack[0][$prefix]."\""):"")."/>"; + + $frag = new \DOMDocument('1.0', 'UTF-8'); + $frag->loadXML($xml); + + $ele = $this->doc->importNode($frag->documentElement, true); + + } else { + if (isset($this->nsStack[0][$prefix])) { + $ele = $this->doc->createElementNS($this->nsStack[0][$prefix], $lname); + } else { + $ele = $this->doc->createElement($lname); + } + } + + } catch (\DOMException $e) { + $this->parseError("Illegal tag name: <$lname>. Replaced with ."); + $ele = $this->doc->createElement('invalid'); + } + + // When we add some namespacess, we have to track them. Later, when "endElement" is invoked, we have to remove them. 
+ // When we are on a void tag, we do not need to care about namesapce nesting. + if ($pushes > 0 && !Elements::isA($name, Elements::VOID_TAG)) { + // PHP tends to free the memory used by DOM, + // to avoid spl_object_hash collisions whe have to avoid garbage collection of $ele storing it into $pushes + // see https://bugs.php.net/bug.php?id=67459 + $this->pushes[spl_object_hash($ele)] = array($pushes, $ele); + + // SEE https://github.com/facebook/hhvm/issues/2962 + if (defined('HHVM_VERSION')) { + $ele->setAttribute('html5-php-fake-id-attribute', spl_object_hash($ele)); + } + } + + foreach ($attributes as $aName => $aVal) { + // xmlns attributes can't be set + if ($aName === 'xmlns') { + continue; + } + + if ($this->insertMode == static::IM_IN_SVG) { + $aName = Elements::normalizeSvgAttribute($aName); + } elseif ($this->insertMode == static::IM_IN_MATHML) { + $aName = Elements::normalizeMathMlAttribute($aName); + } + + try { + $prefix = ($pos = strpos($aName, ':')) ? substr($aName, 0, $pos) : false; + + if ($prefix==='xmlns') { + $ele->setAttributeNs(self::NAMESPACE_XMLNS, $aName, $aVal); + } elseif ($prefix!==false && isset($this->nsStack[0][$prefix])) { + $ele->setAttributeNs($this->nsStack[0][$prefix], $aName, $aVal); + } else { + $ele->setAttribute($aName, $aVal); + } + } catch (\DOMException $e) { + $this->parseError("Illegal attribute name for tag $name. Ignoring: $aName"); + continue; + } + + // This is necessary on a non-DTD schema, like HTML5. + if ($aName == 'id') { + $ele->setIdAttribute('id', true); + } + } + + // Some elements have special processing rules. Handle those separately. + if ($this->rules->hasRules($name)) { + $this->current = $this->rules->evaluate($ele, $this->current); + } // Otherwise, it's a standard element. + else { + $this->current->appendChild($ele); + + // XXX: Need to handle self-closing tags and unary tags. + if (! 
Elements::isA($name, Elements::VOID_TAG)) { + $this->current = $ele; + } + } + + // This is sort of a last-ditch attempt to correct for cases where no head/body + // elements are provided. + if ($this->insertMode <= static::IM_BEFORE_HEAD && $name != 'head' && $name != 'html') { + $this->insertMode = static::IM_IN_BODY; + } + + // When we are on a void tag, we do not need to care about namesapce nesting, + // but we have to remove the namespaces pushed to $nsStack. + if ($pushes > 0 && Elements::isA($name, Elements::VOID_TAG)) { + // remove the namespaced definded by current node + for ($i = 0; $i < $pushes; $i ++) { + array_shift($this->nsStack); + } + } + // Return the element mask, which the tokenizer can then use to set + // various processing rules. + return Elements::element($name); + } + + public function endTag($name) + { + $lname = $this->normalizeTagName($name); + + // Ignore closing tags for unary elements. + if (Elements::isA($name, Elements::VOID_TAG)) { + return; + } + + if ($this->insertMode <= static::IM_BEFORE_HTML) { + // 8.2.5.4.2 + if (in_array($name, array( + 'html', + 'br', + 'head', + 'title' + ))) { + $this->startTag('html'); + $this->endTag($name); + $this->insertMode = static::IM_BEFORE_HEAD; + + return; + } + + // Ignore the tag. + $this->parseError("Illegal closing tag at global scope."); + + return; + } + + // Special case handling for SVG. + if ($this->insertMode == static::IM_IN_SVG) { + $lname = Elements::normalizeSvgElement($lname); + } + + // See https://github.com/facebook/hhvm/issues/2962 + if (defined('HHVM_VERSION') && ($cid = $this->current->getAttribute('html5-php-fake-id-attribute'))) { + $this->current->removeAttribute('html5-php-fake-id-attribute'); + } else { + $cid = spl_object_hash($this->current); + } + + // XXX: Not sure whether we need this anymore. + // if ($name != $lname) { + // return $this->quirksTreeResolver($lname); + // } + + // XXX: HTML has no parent. 
What do we do, though, + // if this element appears in the wrong place? + if ($lname == 'html') { + return; + } + + // remove the namespaced definded by current node + if (isset($this->pushes[$cid])) { + for ($i = 0; $i < $this->pushes[$cid][0]; $i ++) { + array_shift($this->nsStack); + } + unset($this->pushes[$cid]); + } + + if (! $this->autoclose($lname)) { + $this->parseError('Could not find closing tag for ' . $lname); + } + + // switch ($this->insertMode) { + switch ($lname) { + case "head": + $this->insertMode = static::IM_AFTER_HEAD; + break; + case "body": + $this->insertMode = static::IM_AFTER_BODY; + break; + case "svg": + case "mathml": + $this->insertMode = static::IM_IN_BODY; + break; + } + } + + public function comment($cdata) + { + // TODO: Need to handle case where comment appears outside of the HTML tag. + $node = $this->doc->createComment($cdata); + $this->current->appendChild($node); + } + + public function text($data) + { + // XXX: Hmmm.... should we really be this strict? + if ($this->insertMode < static::IM_IN_HEAD) { + // Per '8.2.5.4.3 The "before head" insertion mode' the characters + // " \t\n\r\f" should be ignored but no mention of a parse error. This is + // practical as most documents contain these characters. Other text is not + // expected here so recording a parse error is necessary. + $dataTmp = trim($data, " \t\n\r\f"); + if (! empty($dataTmp)) { + // fprintf(STDOUT, "Unexpected insert mode: %d", $this->insertMode); + $this->parseError("Unexpected text. Ignoring: " . $dataTmp); + } + + return; + } + // fprintf(STDOUT, "Appending text %s.", $data); + $node = $this->doc->createTextNode($data); + $this->current->appendChild($node); + } + + public function eof() + { + // If the $current isn't the $root, do we need to do anything? 
+ } + + public function parseError($msg, $line = 0, $col = 0) + { + $this->errors[] = sprintf("Line %d, Col %d: %s", $line, $col, $msg); + } + + public function getErrors() + { + return $this->errors; + } + + public function cdata($data) + { + $node = $this->doc->createCDATASection($data); + $this->current->appendChild($node); + } + + public function processingInstruction($name, $data = null) + { + // XXX: Ignore initial XML declaration, per the spec. + if ($this->insertMode == static::IM_INITIAL && 'xml' == strtolower($name)) { + return; + } + + // Important: The processor may modify the current DOM tree however + // it sees fit. + if (isset($this->processor)) { + $res = $this->processor->process($this->current, $name, $data); + if (! empty($res)) { + $this->current = $res; + } + + return; + } + + // Otherwise, this is just a dumb PI element. + $node = $this->doc->createProcessingInstruction($name, $data); + + $this->current->appendChild($node); + } + + // ========================================================================== + // UTILITIES + // ========================================================================== + + /** + * Apply normalization rules to a tag name. + * + * See sections 2.9 and 8.1.2. + * + * @param string $name + * The tag name. + * @return string The normalized tag name. + */ + protected function normalizeTagName($name) + { + /* + * Section 2.9 suggests that we should not do this. if (strpos($name, ':') !== false) { // We know from the grammar that there must be at least one other // char besides :, since : is not a legal tag start. $parts = explode(':', $name); return array_pop($parts); } + */ + return $name; + } + + protected function quirksTreeResolver($name) + { + throw new \Exception("Not implemented."); + } + + /** + * Automatically climb the tree and close the closest node with the matching $tag. 
+ */ + protected function autoclose($tag) + { + $working = $this->current; + do { + if ($working->nodeType != XML_ELEMENT_NODE) { + return false; + } + if ($working->tagName == $tag) { + $this->current = $working->parentNode; + + return true; + } + } while ($working = $working->parentNode); + return false; + } + + /** + * Checks if the given tagname is an ancestor of the present candidate. + * + * If $this->current or anything above $this->current matches the given tag + * name, this returns true. + */ + protected function isAncestor($tagname) + { + $candidate = $this->current; + while ($candidate->nodeType === XML_ELEMENT_NODE) { + if ($candidate->tagName == $tagname) { + return true; + } + $candidate = $candidate->parentNode; + } + + return false; + } + + /** + * Returns true if the immediate parent element is of the given tagname. + */ + protected function isParent($tagname) + { + return $this->current->tagName == $tagname; + } } diff --git a/libraries/html5php/HTML5/Parser/EventHandler.php b/libraries/html5php/HTML5/Parser/EventHandler.php index 4034938..2d55347 100644 --- a/libraries/html5php/HTML5/Parser/EventHandler.php +++ b/libraries/html5php/HTML5/Parser/EventHandler.php @@ -1,111 +1,122 @@ ) - * @return numeric - * One of the Tokenizer::TEXTMODE_* constants. - */ - public function startTag($name, $attributes = array(), $selfClosing = FALSE); - /** - * An end-tag. - */ - public function endTag($name); - /** - * A comment section (unparsed character data). - */ - public function comment($cdata); - /** - * A unit of parsed character data. - * - * Entities in this text are *already decoded*. - */ - public function text($cdata); - /** - * Indicates that the document has been entirely processed. - */ - public function eof(); - /** - * Emitted when the parser encounters an error condition. - */ - public function parseError($msg, $line, $col); +interface EventHandler +{ - /** - * A CDATA section. - * - * @param string $data - * The unparsed character data. 
- */ - public function cdata($data); - /** - * This is a holdover from the XML spec. - * - * While user agents don't get PIs, server-side does. - * - * @param string $name - * The name of the processor (e.g. 'php'). - * @param string $data - * The unparsed data. - */ - public function processingInstruction($name, $data = NULL); + const DOCTYPE_NONE = 0; + + const DOCTYPE_PUBLIC = 1; + + const DOCTYPE_SYSTEM = 2; + + /** + * A doctype declaration. + * + * @param string $name + * The name of the root element. + * @param int $idType + * One of DOCTYPE_NONE, DOCTYPE_PUBLIC, or DOCTYPE_SYSTEM. + * @param string $id + * The identifier. For DOCTYPE_PUBLIC, this is the public ID. If DOCTYPE_SYSTEM, + * then this is a system ID. + * @param boolean $quirks + * Indicates whether the builder should enter quirks mode. + */ + public function doctype($name, $idType = 0, $id = null, $quirks = false); + + /** + * A start tag. + * + * IMPORTANT: The parser watches the return value of this event. If this returns + * an integer, the parser will switch TEXTMODE patters according to the int. + * + * This is how the Tree Builder can tell the Tokenizer when a certain tag should + * cause the parser to go into RAW text mode. + * + * The HTML5 standard requires that the builder is the one that initiates this + * step, and this is the only way short of a circular reference that we can + * do that. + * + * Example: if a startTag even for a `script` name is fired, and the startTag() + * implementation returns Tokenizer::TEXTMODE_RAW, then the tokenizer will + * switch into RAW text mode and consume data until it reaches a closing + * `script` tag. + * + * The textmode is automatically reset to Tokenizer::TEXTMODE_NORMAL when the + * closing tag is encounter. **This behavior may change.** + * + * @param string $name + * The tag name. + * @param array $attributes + * An array with all of the tag's attributes. 
+ * @param boolean $selfClosing + * An indicator of whether or not this tag is self-closing () + * @return numeric One of the Tokenizer::TEXTMODE_* constants. + */ + public function startTag($name, $attributes = array(), $selfClosing = false); + + /** + * An end-tag. + */ + public function endTag($name); + + /** + * A comment section (unparsed character data). + */ + public function comment($cdata); + + /** + * A unit of parsed character data. + * + * Entities in this text are *already decoded*. + */ + public function text($cdata); + + /** + * Indicates that the document has been entirely processed. + */ + public function eof(); + + /** + * Emitted when the parser encounters an error condition. + */ + public function parseError($msg, $line, $col); + + /** + * A CDATA section. + * + * @param string $data + * The unparsed character data. + */ + public function cdata($data); + + /** + * This is a holdover from the XML spec. + * + * While user agents don't get PIs, server-side does. + * + * @param string $name + * The name of the processor (e.g. 'php'). + * @param string $data + * The unparsed data. + */ + public function processingInstruction($name, $data = null); } diff --git a/libraries/html5php/HTML5/Parser/FileInputStream.php b/libraries/html5php/HTML5/Parser/FileInputStream.php index c1bb128..e58006a 100644 --- a/libraries/html5php/HTML5/Parser/FileInputStream.php +++ b/libraries/html5php/HTML5/Parser/FileInputStream.php @@ -1,35 +1,32 @@ is = $input; - } + const CHARS_ALPHA = 'abcdefAghijklmnopqrstuvwxyABCDEFGHIJKLMNOPQRSTUVWXYZ'; - /** - * Get the current position. - * - * @return int - * The current intiger byte position. - */ - public function position() { - return $this->is->key(); - } + protected $is; - /** - * Take a peek at the next character in the data. - * - * @return string - * The next character. - */ - public function peek() { - return $this->is->peek(); - } + // Flipping this to true will give minisculely more debugging info. 
+ public $debug = false; - /** - * Get the next character. - * - * Note: This advances the pointer. - * - * @return string - * The next character. - */ - public function next() { - $this->is->next(); - if ($this->is->valid()) { - if ($this->debug) fprintf(STDOUT, "> %s\n", $this->is->current()); - return $this->is->current(); + /** + * Create a new Scanner. + * + * @param \Masterminds\HTML5\Parser\InputStream $input + * An InputStream to be scanned. + */ + public function __construct($input) + { + $this->is = $input; } - return FALSE; - } - /** - * Get the current character. - * - * Note, this does not advance the pointer. - * - * @return string - * The current character. - */ - public function current() { - if ($this->is->valid()) { - return $this->is->current(); + /** + * Get the current position. + * + * @return int The current intiger byte position. + */ + public function position() + { + return $this->is->key(); } - return FALSE; - } - /** - * Silently consume N chars. - */ - public function consume($count = 1) { - for ($i = 0; $i < $count; ++$i) { - $this->next(); + /** + * Take a peek at the next character in the data. + * + * @return string The next character. + */ + public function peek() + { + return $this->is->peek(); } - } - /** - * Unconsume some of the data. This moves the data pointer backwards. - * - * @param int $howMany - * The number of characters to move the pointer back. - */ - public function unconsume($howMany = 1) { - $this->is->unconsume($howMany); - } + /** + * Get the next character. + * + * Note: This advances the pointer. + * + * @return string The next character. + */ + public function next() + { + $this->is->next(); + if ($this->is->valid()) { + if ($this->debug) + fprintf(STDOUT, "> %s\n", $this->is->current()); + return $this->is->current(); + } - /** - * Get the next group of that contains hex characters. - * - * Note, along with getting the characters the pointer in the data will be - * moved as well. 
- * - * @return string - * The next group that is hex characters. - */ - public function getHex() { - return $this->is->charsWhile(static::CHARS_HEX); - } + return false; + } - /** - * Get the next group of characters that are ASCII Alpha characters. - * - * Note, along with getting the characters the pointer in the data will be - * moved as well. - * - * @return string - * The next group of ASCII alpha characters. - */ - public function getAsciiAlpha() { - return $this->is->charsWhile(static::CHARS_ALPHA); - } + /** + * Get the current character. + * + * Note, this does not advance the pointer. + * + * @return string The current character. + */ + public function current() + { + if ($this->is->valid()) { + return $this->is->current(); + } - /** - * Get the next group of characters that are ASCII Alpha characters and numbers. - * - * Note, along with getting the characters the pointer in the data will be - * moved as well. - * - * @return string - * The next group of ASCII alpha characters and numbers. - */ - public function getAsciiAlphaNum() { - return $this->is->charsWhile(static::CHARS_ALNUM); - } + return false; + } - /** - * Get the next group of numbers. - * - * Note, along with getting the characters the pointer in the data will be - * moved as well. - * - * @return string - * The next group of numbers. - */ - public function getNumeric() { - return $this->is->charsWhile('0123456789'); - } + /** + * Silently consume N chars. + */ + public function consume($count = 1) + { + for ($i = 0; $i < $count; ++ $i) { + $this->next(); + } + } - /** - * Consume whitespace. - * - * Whitespace in HTML5 is: formfeed, tab, newline, space. - */ - public function whitespace() { - return $this->is->charsWhile("\n\t\f "); - } + /** + * Unconsume some of the data. + * This moves the data pointer backwards. + * + * @param int $howMany + * The number of characters to move the pointer back. 
+ */ + public function unconsume($howMany = 1) + { + $this->is->unconsume($howMany); + } - /** - * Returns the current line that is being consumed. - * - * @return int - * The current line number. - */ - public function currentLine() { - return $this->is->currentLine(); - } + /** + * Get the next group of that contains hex characters. + * + * Note, along with getting the characters the pointer in the data will be + * moved as well. + * + * @return string The next group that is hex characters. + */ + public function getHex() + { + return $this->is->charsWhile(static::CHARS_HEX); + } - /** - * Read chars until something in the mask is encountered. - */ - public function charsUntil($mask) { - return $this->is->charsUntil($mask); - } - /** - * Read chars as long as the mask matches. - */ - public function charsWhile($mask) { - return $this->is->charsWhile($mask); - } + /** + * Get the next group of characters that are ASCII Alpha characters. + * + * Note, along with getting the characters the pointer in the data will be + * moved as well. + * + * @return string The next group of ASCII alpha characters. + */ + public function getAsciiAlpha() + { + return $this->is->charsWhile(static::CHARS_ALPHA); + } - /** - * Returns the current column of the current line that the tokenizer is at. - * - * Newlines are column 0. The first char after a newline is column 1. - * - * @return int - * The column number. - */ - public function columnOffset() { - return $this->is->columnOffset(); - } + /** + * Get the next group of characters that are ASCII Alpha characters and numbers. + * + * Note, along with getting the characters the pointer in the data will be + * moved as well. + * + * @return string The next group of ASCII alpha characters and numbers. + */ + public function getAsciiAlphaNum() + { + return $this->is->charsWhile(static::CHARS_ALNUM); + } - /** - * Get all characters until EOF. - * - * This consumes characters until the EOF. 
- * - * @return int - * The number of characters remaining. - */ - public function remainingChars() { - return $this->is->remainingChars(); - } + /** + * Get the next group of numbers. + * + * Note, along with getting the characters the pointer in the data will be + * moved as well. + * + * @return string The next group of numbers. + */ + public function getNumeric() + { + return $this->is->charsWhile('0123456789'); + } + + /** + * Consume whitespace. + * + * Whitespace in HTML5 is: formfeed, tab, newline, space. + */ + public function whitespace() + { + return $this->is->charsWhile("\n\t\f "); + } + + /** + * Returns the current line that is being consumed. + * + * @return int The current line number. + */ + public function currentLine() + { + return $this->is->currentLine(); + } + + /** + * Read chars until something in the mask is encountered. + */ + public function charsUntil($mask) + { + return $this->is->charsUntil($mask); + } + + /** + * Read chars as long as the mask matches. + */ + public function charsWhile($mask) + { + return $this->is->charsWhile($mask); + } + + /** + * Returns the current column of the current line that the tokenizer is at. + * + * Newlines are column 0. The first char after a newline is column 1. + * + * @return int The column number. + */ + public function columnOffset() + { + return $this->is->columnOffset(); + } + + /** + * Get all characters until EOF. + * + * This consumes characters until the EOF. + * + * @return int The number of characters remaining. + */ + public function remainingChars() + { + return $this->is->remainingChars(); + } } diff --git a/libraries/html5php/HTML5/Parser/StringInputStream.php b/libraries/html5php/HTML5/Parser/StringInputStream.php index ca5fee0..4cac3c2 100644 --- a/libraries/html5php/HTML5/Parser/StringInputStream.php +++ b/libraries/html5php/HTML5/Parser/StringInputStream.php @@ -2,17 +2,17 @@ /** * Loads a string to be parsed. 
*/ -namespace HTML5\Parser; +namespace Masterminds\HTML5\Parser; /* * - * Based on code from html5lib: +* Based on code from html5lib: Copyright 2009 Geoffrey Sneddon Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the -"Software"), to deal in the Software without restriction, including + "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to @@ -33,283 +33,299 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. // Some conventions: // - /* */ indicates verbatim text from the HTML 5 specification -// MPB: Not sure which version of the spec. Moving from HTML5lib to +// MPB: Not sure which version of the spec. Moving from HTML5lib to // HTML5-PHP, I have been using this version: // http://www.w3.org/TR/2012/CR-html5-20121217/Overview.html#contents // // - // indicates regular comments -class StringInputStream implements InputStream { - /** - * The string data we're parsing. - */ - private $data; +class StringInputStream implements InputStream +{ - /** - * The current integer byte position we are in $data - */ - private $char; + /** + * The string data we're parsing. + */ + private $data; - /** - * Length of $data; when $char === $data, we are at the end-of-file. - */ - private $EOF; + /** + * The current integer byte position we are in $data + */ + private $char; - /** - * Parse errors. - */ - public $errors = array(); + /** + * Length of $data; when $char === $data, we are at the end-of-file. + */ + private $EOF; - /** - * Create a new InputStream wrapper. - * - * @param $data Data to parse - */ - public function __construct($data, $encoding = 'UTF-8', $debug = '') { + /** + * Parse errors. 
+ */ + public $errors = array(); - $data = UTF8Utils::convertToUTF8($data, $encoding); - if ($debug) fprintf(STDOUT, $debug, $data, strlen($data)); + /** + * Create a new InputStream wrapper. + * + * @param $data Data + * to parse + */ + public function __construct($data, $encoding = 'UTF-8', $debug = '') + { + $data = UTF8Utils::convertToUTF8($data, $encoding); + if ($debug) + fprintf(STDOUT, $debug, $data, strlen($data)); - // There is good reason to question whether it makes sense to - // do this here, since most of these checks are done during - // parsing, and since this check doesn't actually *do* anything. - $this->errors = UTF8Utils::checkForIllegalCodepoints($data); - //if (!empty($e)) { - // throw new ParseError("UTF-8 encoding issues: " . implode(', ', $e)); - //} + // There is good reason to question whether it makes sense to + // do this here, since most of these checks are done during + // parsing, and since this check doesn't actually *do* anything. + $this->errors = UTF8Utils::checkForIllegalCodepoints($data); + // if (!empty($e)) { + // throw new ParseError("UTF-8 encoding issues: " . implode(', ', $e)); + // } - $data = $this->replaceLinefeeds($data); + $data = $this->replaceLinefeeds($data); - $this->data = $data; - $this->char = 0; - $this->EOF = strlen($data); - } - - /** - * Replace linefeed characters according to the spec. - */ - protected function replaceLinefeeds($data) { - /* U+000D CARRIAGE RETURN (CR) characters and U+000A LINE FEED - (LF) characters are treated specially. Any CR characters - that are followed by LF characters must be removed, and any - CR characters not followed by LF characters must be converted - to LF characters. Thus, newlines in HTML DOMs are represented - by LF characters, and there are never any CR characters in the - input to the tokenization stage. 
*/ - $crlfTable = array( - "\0" => "\xEF\xBF\xBD", - "\r\n" => "\n", - "\r" => "\n", - ); - return strtr($data, $crlfTable); - } - - /** - * Returns the current line that the tokenizer is at. - */ - public function currentLine() { - if (empty($this->EOF) || $this->char == 0) { - return 1; - } - // Add one to $this->char because we want the number for the next - // byte to be processed. - return substr_count($this->data, "\n", 0, min($this->char, $this->EOF)) + 1; - } - - /** - * @deprecated - */ - public function getCurrentLine() { - return currentLine(); - } - - /** - * Returns the current column of the current line that the tokenizer is at. - * - * Newlines are column 0. The first char after a newline is column 1. - * - * @return int - * The column number. - */ - public function columnOffset() { - - // Short circuit for the first char. - if ($this->char == 0) { - return 0; - } - // strrpos is weird, and the offset needs to be negative for what we - // want (i.e., the last \n before $this->char). This needs to not have - // one (to make it point to the next character, the one we want the - // position of) added to it because strrpos's behaviour includes the - // final offset byte. - $backwardFrom = $this->char - 1 - strlen($this->data); - $lastLine = strrpos($this->data, "\n", $backwardFrom); - - // However, for here we want the length up until the next byte to be - // processed, so add one to the current byte ($this->char). - if ($lastLine !== FALSE) { - $findLengthOf = substr($this->data, $lastLine + 1, $this->char - 1 - $lastLine); - } - else { - // After a newline. - $findLengthOf = substr($this->data, 0, $this->char); + $this->data = $data; + $this->char = 0; + $this->EOF = strlen($data); } - return UTF8Utils::countChars($findLengthOf); - } + /** + * Replace linefeed characters according to the spec. + */ + protected function replaceLinefeeds($data) + { + /* + * U+000D CARRIAGE RETURN (CR) characters and U+000A LINE FEED (LF) characters are treated specially. 
Any CR characters that are followed by LF characters must be removed, and any CR characters not followed by LF characters must be converted to LF characters. Thus, newlines in HTML DOMs are represented by LF characters, and there are never any CR characters in the input to the tokenization stage. + */ + $crlfTable = array( + "\0" => "\xEF\xBF\xBD", + "\r\n" => "\n", + "\r" => "\n" + ); - /** - * @deprecated - */ - public function getColumnOffset() { - return $this->columnOffset(); - } - - /** - * Get the current character. - * - * @return string - * The current character. - */ - public function current() { - return $this->data[$this->char]; - } - - /** - * Advance the pointer. This is part of the Iterator interface. - */ - public function next() { - $this->char++; - } - - /** - * Rewind to the start of the string. - */ - public function rewind() { - $this->char = 0; - } - - /** - * Is the current pointer location valid. - * - * @return bool - * Is the current pointer location valid. - */ - public function valid() { - if ($this->char < $this->EOF) { - return TRUE; + return strtr($data, $crlfTable); } - return FALSE; - } - - /** - * Get all characters until EOF. - * - * This reads to the end of the file, and sets the read marker at the - * end of the file. - * - * @note This performs bounds checking - * - * @return string - * Returns the remaining text. If called when the InputStream is - * already exhausted, it returns an empty string. - */ - public function remainingChars() { - if ($this->char < $this->EOF) { - $data = substr($this->data, $this->char); - $this->char = $this->EOF; - return $data; - } - return '';//FALSE; - } - - /** - * Read to a particular match (or until $max bytes are consumed). - * - * This operates on byte sequences, not characters. - * - * Matches as far as possible until we reach a certain set of bytes - * and returns the matched substring. - * - * @param string $bytes - * Bytes to match. 
- * @param int $max - * Maximum number of bytes to scan. - * @return mixed - * Index or FALSE if no match is found. You should use strong - * equality when checking the result, since index could be 0. - */ - public function charsUntil($bytes, $max = null) { - if ($this->char >= $this->EOF) { - return FALSE; + /** + * Returns the current line that the tokenizer is at. + */ + public function currentLine() + { + if (empty($this->EOF) || $this->char == 0) { + return 1; + } + // Add one to $this->char because we want the number for the next + // byte to be processed. + return substr_count($this->data, "\n", 0, min($this->char, $this->EOF)) + 1; } - if ($max === 0 || $max) { - $len = strcspn($this->data, $bytes, $this->char, $max); - } - else { - $len = strcspn($this->data, $bytes, $this->char); + /** + * + * @deprecated + * + */ + public function getCurrentLine() + { + return currentLine(); } - $string = (string) substr($this->data, $this->char, $len); - $this->char += $len; - return $string; - } + /** + * Returns the current column of the current line that the tokenizer is at. + * + * Newlines are column 0. The first char after a newline is column 1. + * + * @return int The column number. + */ + public function columnOffset() + { + // Short circuit for the first char. + if ($this->char == 0) { + return 0; + } + // strrpos is weird, and the offset needs to be negative for what we + // want (i.e., the last \n before $this->char). This needs to not have + // one (to make it point to the next character, the one we want the + // position of) added to it because strrpos's behaviour includes the + // final offset byte. + $backwardFrom = $this->char - 1 - strlen($this->data); + $lastLine = strrpos($this->data, "\n", $backwardFrom); - /** - * Returns the string so long as $bytes matches. - * - * Matches as far as possible with a certain set of bytes - * and returns the matched substring. - * - * @param string $bytes - * A mask of bytes to match. 
If ANY byte in this mask matches the - * current char, the pointer advances and the char is part of the - * substring. - * @param int $max - * The max number of chars to read. - */ - public function charsWhile($bytes, $max = null) { - if ($this->char >= $this->EOF) { - return FALSE; + // However, for here we want the length up until the next byte to be + // processed, so add one to the current byte ($this->char). + if ($lastLine !== false) { + $findLengthOf = substr($this->data, $lastLine + 1, $this->char - 1 - $lastLine); + } else { + // After a newline. + $findLengthOf = substr($this->data, 0, $this->char); + } + + return UTF8Utils::countChars($findLengthOf); } - if ($max === 0 || $max) { - $len = strspn($this->data, $bytes, $this->char, $max); - } - else { - $len = strspn($this->data, $bytes, $this->char); - } - $string = (string) substr($this->data, $this->char, $len); - $this->char += $len; - return $string; - } - - /** - * Unconsume characters. - * - * @param int $howMany - * The number of characters to unconsume. - */ - public function unconsume($howMany = 1) { - if (($this->char - $howMany) >= 0) { - $this->char = $this->char - $howMany; - } - } - - /** - * Look ahead without moving cursor. - */ - public function peek() { - if (($this->char + 1) <= $this->EOF) { - return $this->data[$this->char + 1]; + /** + * + * @deprecated + * + */ + public function getColumnOffset() + { + return $this->columnOffset(); } - return FALSE; - } + /** + * Get the current character. + * + * @return string The current character. + */ + public function current() + { + return $this->data[$this->char]; + } - public function key() { - return $this->char; - } + /** + * Advance the pointer. + * This is part of the Iterator interface. + */ + public function next() + { + $this->char ++; + } + + /** + * Rewind to the start of the string. + */ + public function rewind() + { + $this->char = 0; + } + + /** + * Is the current pointer location valid. 
+ * + * @return bool Is the current pointer location valid. + */ + public function valid() + { + if ($this->char < $this->EOF) { + return true; + } + + return false; + } + + /** + * Get all characters until EOF. + * + * This reads to the end of the file, and sets the read marker at the + * end of the file. + * + * @note This performs bounds checking + * + * @return string Returns the remaining text. If called when the InputStream is + * already exhausted, it returns an empty string. + */ + public function remainingChars() + { + if ($this->char < $this->EOF) { + $data = substr($this->data, $this->char); + $this->char = $this->EOF; + + return $data; + } + + return ''; // false; + } + + /** + * Read to a particular match (or until $max bytes are consumed). + * + * This operates on byte sequences, not characters. + * + * Matches as far as possible until we reach a certain set of bytes + * and returns the matched substring. + * + * @param string $bytes + * Bytes to match. + * @param int $max + * Maximum number of bytes to scan. + * @return mixed Index or false if no match is found. You should use strong + * equality when checking the result, since index could be 0. + */ + public function charsUntil($bytes, $max = null) + { + if ($this->char >= $this->EOF) { + return false; + } + + if ($max === 0 || $max) { + $len = strcspn($this->data, $bytes, $this->char, $max); + } else { + $len = strcspn($this->data, $bytes, $this->char); + } + + $string = (string) substr($this->data, $this->char, $len); + $this->char += $len; + + return $string; + } + + /** + * Returns the string so long as $bytes matches. + * + * Matches as far as possible with a certain set of bytes + * and returns the matched substring. + * + * @param string $bytes + * A mask of bytes to match. If ANY byte in this mask matches the + * current char, the pointer advances and the char is part of the + * substring. + * @param int $max + * The max number of chars to read. 
+ */ + public function charsWhile($bytes, $max = null) + { + if ($this->char >= $this->EOF) { + return false; + } + + if ($max === 0 || $max) { + $len = strspn($this->data, $bytes, $this->char, $max); + } else { + $len = strspn($this->data, $bytes, $this->char); + } + $string = (string) substr($this->data, $this->char, $len); + $this->char += $len; + + return $string; + } + + /** + * Unconsume characters. + * + * @param int $howMany + * The number of characters to unconsume. + */ + public function unconsume($howMany = 1) + { + if (($this->char - $howMany) >= 0) { + $this->char = $this->char - $howMany; + } + } + + /** + * Look ahead without moving cursor. + */ + public function peek() + { + if (($this->char + 1) <= $this->EOF) { + return $this->data[$this->char + 1]; + } + + return false; + } + + public function key() + { + return $this->char; + } } diff --git a/libraries/html5php/HTML5/Parser/Tokenizer.php b/libraries/html5php/HTML5/Parser/Tokenizer.php index a78cf23..92510de 100644 --- a/libraries/html5php/HTML5/Parser/Tokenizer.php +++ b/libraries/html5php/HTML5/Parser/Tokenizer.php @@ -1,7 +1,7 @@ scanner = $scanner; - $this->events = $eventHandler; - } - - /** - * Begin parsing. - * - * This will begin scanning the document, tokenizing as it goes. - * Tokens are emitted into the event handler. - * - * Tokenizing will continue until the document is completely - * read. Errors are emitted into the event handler, but - * the parser will attempt to continue parsing until the - * entire input stream is read. - */ - public function parse() { - $p = 0; - do { - $p = $this->scanner->position(); - $this->consumeData(); - - // FIXME: Add infinite loop protection. - } - while ($this->carryOn); - } - - /** - * Set the text mode for the character data reader. - * - * HTML5 defines three different modes for reading text: - * - Normal: Read until a tag is encountered. - * - RCDATA: Read until a tag is encountered, but skip a few otherwise- - * special characters. 
- * - Raw: Read until a special closing tag is encountered (viz. pre, script) - * - * This allows those modes to be set. - * - * Normally, setting is done by the event handler via a special return code on - * startTag(), but it can also be set manually using this function. - * - * @param integer $textmode - * One of Elements::TEXT_* - * @param string $untilTag - * The tag that should stop RAW or RCDATA mode. Normal mode does not - * use this indicator. - */ - public function setTextMode($textmode, $untilTag = NULL) { - $this->textMode = $textmode & (Elements::TEXT_RAW | Elements::TEXT_RCDATA); - $this->untilTag = $untilTag; - } - - /** - * Consume a character and make a move. - * HTML5 8.2.4.1 - */ - protected function consumeData() { - // Character Ref - /* - $this->characterReference() || - $this->tagOpen() || - $this->eof() || - $this->characterData(); + /** + * Buffer for text. */ + protected $text = ''; - $this->characterReference(); - $this->tagOpen(); - $this->eof(); - $this->characterData(); + // When this goes to false, the parser stops. + protected $carryOn = true; + protected $textMode = 0; // TEXTMODE_NORMAL; + protected $untilTag = null; - return $this->carryOn; - } + const WHITE = "\t\n\f "; - /** - * Parse anything that looks like character data. - * - * Different rules apply based on the current text mode. - * - * @see Elements::TEXT_RAW Elements::TEXT_RCDATA. - */ - protected function characterData() { - if ($this->scanner->current() === FALSE) { - return FALSE; - } - switch ($this->textMode) { - case Elements::TEXT_RAW: - case Elements::TEXT_RCDATA: - return $this->rawText(); - default: - $tok = $this->scanner->current(); - if (strspn($tok, "<&")) { - return FALSE; - } - return $this->text(); - } - } - - /** - * This buffers the current token as character data. - */ - protected function text() { - $tok = $this->scanner->current(); - - // This should never happen... 
- if ($tok === FALSE) { - return FALSE; - } - // Null - if ($tok === "\00") { - $this->parseError("Received NULL character."); - } - // fprintf(STDOUT, "Writing '%s'", $tok); - $this->buffer($tok); - $this->scanner->next(); - return TRUE; - } - - /** - * Read text in RAW mode. - */ - protected function rawText() { - if (is_null($this->untilTag)) { - return $this->text(); - } - $sequence = 'untilTag . '>'; - $txt = $this->readUntilSequence($sequence); - $this->events->text($txt); - $this->setTextMode(0); - return $this->endTag(); - } - - /** - * If the document is read, emit an EOF event. - */ - protected function eof() { - if ($this->scanner->current() === FALSE) { - //fprintf(STDOUT, "EOF"); - $this->flushBuffer(); - $this->events->eof(); - $this->carryOn = FALSE; - return TRUE; - } - return FALSE; - } - - /** - * Handle character references (aka entities). - * - * This version is specific to PCDATA, as it buffers data into the - * text buffer. For a generic version, see decodeCharacterReference(). - * - * HTML5 8.2.4.2 - */ - protected function characterReference() { - $ref = $this->decodeCharacterReference(); - if ($ref !== FALSE) { - $this->buffer($ref); - return TRUE; - } - return FALSE; - } - - - /** - * Emit a tagStart event on encountering a tag. - * - * 8.2.4.8 - */ - protected function tagOpen() { - if ($this->scanner->current() != '<') { - return FALSE; + /** + * Create a new tokenizer. + * + * Typically, parsing a document involves creating a new tokenizer, giving + * it a scanner (input) and an event handler (output), and then calling + * the Tokenizer::parse() method.` + * + * @param \Masterminds\HTML5\Parser\Scanner $scanner + * A scanner initialized with an input stream. + * @param \Masterminds\HTML5\Parser\EventHandler $eventHandler + * An event handler, initialized and ready to receive + * events. 
+ */ + public function __construct($scanner, $eventHandler) + { + $this->scanner = $scanner; + $this->events = $eventHandler; } - // Any buffered text data can go out now. - $this->flushBuffer(); + /** + * Begin parsing. + * + * This will begin scanning the document, tokenizing as it goes. + * Tokens are emitted into the event handler. + * + * Tokenizing will continue until the document is completely + * read. Errors are emitted into the event handler, but + * the parser will attempt to continue parsing until the + * entire input stream is read. + */ + public function parse() + { + $p = 0; + do { + $p = $this->scanner->position(); + $this->consumeData(); - $this->scanner->next(); - - return $this->markupDeclaration() || - $this->endTag() || - $this->processingInstruction() || - $this->tagName() || - // This always returns false. - $this->parseError("Illegal tag opening") || - $this->characterData(); - } - - /** - * Look for markup. - */ - protected function markupDeclaration() { - if ($this->scanner->current() != '!') { - return FALSE; + // FIXME: Add infinite loop protection. + } while ($this->carryOn); } - $tok = $this->scanner->next(); - - // Comment: - if ($tok == '-' && $this->scanner->peek() == '-') { - $this->scanner->next(); // Consume the other '-' - $this->scanner->next(); // Next char. - return $this->comment(); - } - // Doctype - elseif($tok == 'D' || $tok == 'd') { - return $this->doctype(''); - } - // CDATA section - elseif($tok == '[') { - return $this->cdataSection(); + /** + * Set the text mode for the character data reader. + * + * HTML5 defines three different modes for reading text: + * - Normal: Read until a tag is encountered. + * - RCDATA: Read until a tag is encountered, but skip a few otherwise- + * special characters. + * - Raw: Read until a special closing tag is encountered (viz. pre, script) + * + * This allows those modes to be set. 
+ * + * Normally, setting is done by the event handler via a special return code on + * startTag(), but it can also be set manually using this function. + * + * @param integer $textmode + * One of Elements::TEXT_* + * @param string $untilTag + * The tag that should stop RAW or RCDATA mode. Normal mode does not + * use this indicator. + */ + public function setTextMode($textmode, $untilTag = null) + { + $this->textMode = $textmode & (Elements::TEXT_RAW | Elements::TEXT_RCDATA); + $this->untilTag = $untilTag; } - // FINISH - $this->parseError("Expected . Emit an empty comment because 8.2.4.46 says to. + if ($tok == '>') { + // Parse error. Emit the comment token. + $this->parseError("Expected comment data, got '>'"); + $this->events->comment(''); + $this->scanner->next(); + return true; + } + + // Replace NULL with the replacement char. + if ($tok == "\0") { + $tok = UTF8Utils::FFFD; + } + while (! $this->isCommentEnd()) { + $comment .= $tok; + $tok = $this->scanner->next(); + } + + $this->events->comment($comment); + $this->scanner->next(); + return true; + } + + /** + * Check if the scanner has reached the end of a comment. + */ + protected function isCommentEnd() + { + // EOF + if ($this->scanner->current() === false) { + // Hit the end. + $this->parseError("Unexpected EOF in a comment."); + return true; + } + + // If it doesn't start with -, not the end. + if ($this->scanner->current() != '-') { + return false; + } + + // Advance one, and test for '->' + if ($this->scanner->next() == '-' && $this->scanner->peek() == '>') { + $this->scanner->next(); // Consume the last '>' + return true; + } + // Unread '-'; + $this->scanner->unconsume(1); + return false; + } + + /** + * Parse a DOCTYPE. + * + * Parse a DOCTYPE declaration. This method has strong bearing on whether or + * not Quirksmode is enabled on the event handler. + * + * @todo This method is a little long. Should probably refactor. 
+ */ + protected function doctype() + { + if (strcasecmp($this->scanner->current(), 'D')) { + return false; + } + // Check that string is DOCTYPE. + $chars = $this->scanner->charsWhile("DOCTYPEdoctype"); + if (strcasecmp($chars, 'DOCTYPE')) { + $this->parseError('Expected DOCTYPE, got %s', $chars); + return $this->bogusComment('scanner->whitespace(); + $tok = $this->scanner->current(); + + // EOF: die. + if ($tok === false) { + $this->events->doctype('html5', EventHandler::DOCTYPE_NONE, '', true); + return $this->eof(); + } + + $doctypeName = ''; + + // NULL char: convert. + if ($tok === "\0") { + $this->parseError("Unexpected null character in DOCTYPE."); + $doctypeName .= UTF8::FFFD; + $tok = $this->scanner->next(); + } + + $stop = " \n\f>"; + $doctypeName = $this->scanner->charsUntil($stop); + // Lowercase ASCII, replace \0 with FFFD + $doctypeName = strtolower(strtr($doctypeName, "\0", UTF8Utils::FFFD)); + + $tok = $this->scanner->current(); + + // If false, emit a parse error, DOCTYPE, and return. + if ($tok === false) { + $this->parseError('Unexpected EOF in DOCTYPE declaration.'); + $this->events->doctype($doctypeName, EventHandler::DOCTYPE_NONE, null, true); + return true; + } + + // Short DOCTYPE, like + if ($tok == '>') { + // DOCTYPE without a name. + if (strlen($doctypeName) == 0) { + $this->parseError("Expected a DOCTYPE name. Got nothing."); + $this->events->doctype($doctypeName, 0, null, true); + $this->scanner->next(); + return true; + } + $this->events->doctype($doctypeName); + $this->scanner->next(); + return true; + } + $this->scanner->whitespace(); + + $pub = strtoupper($this->scanner->getAsciiAlpha()); + $white = strlen($this->scanner->whitespace()); + $tok = $this->scanner->current(); + + // Get ID, and flag it as pub or system. + if (($pub == 'PUBLIC' || $pub == 'SYSTEM') && $white > 0) { + // Get the sys ID. + $type = $pub == 'PUBLIC' ? 
EventHandler::DOCTYPE_PUBLIC : EventHandler::DOCTYPE_SYSTEM; + $id = $this->quotedString("\0>"); + if ($id === false) { + $this->events->doctype($doctypeName, $type, $pub, false); + return false; + } + + // Premature EOF. + if ($this->scanner->current() === false) { + $this->parseError("Unexpected EOF in DOCTYPE"); + $this->events->doctype($doctypeName, $type, $id, true); + return true; + } + + // Well-formed complete DOCTYPE. + $this->scanner->whitespace(); + if ($this->scanner->current() == '>') { + $this->events->doctype($doctypeName, $type, $id, false); + $this->scanner->next(); + return true; + } + + // If we get here, we have scanner->charsUntil(">"); + $this->parseError("Malformed DOCTYPE."); + $this->events->doctype($doctypeName, $type, $id, true); + $this->scanner->next(); + return true; + } + + // Else it's a bogus DOCTYPE. + // Consume to > and trash. + $this->scanner->charsUntil('>'); + + $this->parseError("Expected PUBLIC or SYSTEM. Got %s.", $pub); + $this->events->doctype($doctypeName, 0, null, true); + $this->scanner->next(); + return true; + } + + /** + * Utility for reading a quoted string. + * + * @param string $stopchars + * Characters (in addition to a close-quote) that should stop the string. + * E.g. sometimes '>' is higher precedence than '"' or "'". + * @return mixed String if one is found (quotations omitted) + */ + protected function quotedString($stopchars) + { + $tok = $this->scanner->current(); + if ($tok == '"' || $tok == "'") { + $this->scanner->next(); + $ret = $this->scanner->charsUntil($tok . $stopchars); + if ($this->scanner->current() == $tok) { + $this->scanner->next(); + } else { + // Parse error because no close quote. + $this->parseError("Expected %s, got %s", $tok, $this->scanner->current()); + } + return $ret; + } + return false; + } + + /** + * Handle a CDATA section. 
+ */ + protected function cdataSection() + { + if ($this->scanner->current() != '[') { + return false; + } + $cdata = ''; + $this->scanner->next(); + + $chars = $this->scanner->charsWhile('CDAT'); + if ($chars != 'CDATA' || $this->scanner->current() != '[') { + $this->parseError('Expected [CDATA[, got %s', $chars); + return $this->bogusComment('scanner->next(); - } - } - return $val; - } + do { + if ($tok === false) { + $this->parseError('Unexpected EOF inside CDATA.'); + $this->bogusComment('scanner->next(); + } while (! $this->sequenceMatches(']]>')); + // Consume ]]> + $this->scanner->consume(3); - /** - * Consume malformed markup as if it were a comment. - * 8.2.4.44 - * - * The spec requires that the ENTIRE tag-like thing be enclosed inside of - * the comment. So this will generate comments like: - * - * <!--</+foo>--> - * - * @param string $leading - * Prepend any leading characters. This essentially - * negates the need to backtrack, but it's sort of - * a hack. - */ - protected function bogusComment($leading = '') { - - // TODO: This can be done more efficiently when the - // scanner exposes a readUntil() method. - $comment = $leading; - $tok = $this->scanner->current(); - do { - $comment .= $tok; - $tok = $this->scanner->next(); - } while ($tok !== FALSE && $tok != '>'); - - $this->flushBuffer(); - $this->events->comment($comment . $tok); - $this->scanner->next(); - - return TRUE; - } - - /** - * Read a comment. - * - * Expects the first tok to be inside of the comment. - */ - protected function comment() { - $tok = $this->scanner->current(); - $comment = ''; - - // . Emit an empty comment because 8.2.4.46 says to. - if ($tok == '>') { - // Parse error. Emit the comment token. - $this->parseError("Expected comment data, got '>'"); - $this->events->comment(''); - $this->scanner->next(); - return TRUE; + $this->events->cdata($cdata); + return true; } - // Replace NULL with the replacement char. 
- if ($tok == "\0") { - $tok = UTF8Utils::FFFD; - } - while (!$this->isCommentEnd()) { - $comment .= $tok; - $tok = $this->scanner->next(); - } + // ================================================================ + // Non-HTML5 + // ================================================================ + /** + * Handle a processing instruction. + * + * XML processing instructions are supposed to be ignored in HTML5, + * treated as "bogus comments". However, since we're not a user + * agent, we allow them. We consume until ?> and then issue a + * EventListener::processingInstruction() event. + */ + protected function processingInstruction() + { + if ($this->scanner->current() != '?') { + return false; + } - $this->events->comment($comment); - $this->scanner->next(); - return TRUE; - } + $tok = $this->scanner->next(); + $procName = $this->scanner->getAsciiAlpha(); + $white = strlen($this->scanner->whitespace()); - /** - * Check if the scanner has reached the end of a comment. - */ - protected function isCommentEnd() { - // EOF - if($this->scanner->current() === FALSE) { - // Hit the end. - $this->parseError("Unexpected EOF in a comment."); - return TRUE; - } + // If not a PI, send to bogusComment. + if (strlen($procName) == 0 || $white == 0 || $this->scanner->current() == false) { + $this->parseError("Expected processing instruction name, got $tok"); + $this->bogusComment('scanner->current() != '-') { - return FALSE; - } + $data = ''; + // As long as it's not the case that the next two chars are ? and >. + while (! ($this->scanner->current() == '?' 
&& $this->scanner->peek() == '>')) { + $data .= $this->scanner->current(); + $tok = $this->scanner->next(); + if ($tok === false) { + $this->parseError("Unexpected EOF in processing instruction."); + $this->events->processingInstruction($procName, $data); + return true; + } + } - // Advance one, and test for '->' - if ($this->scanner->next() == '-' - && $this->scanner->peek() == '>') { - $this->scanner->next(); // Consume the last '>' - return TRUE; - } - // Unread '-'; - $this->scanner->unconsume(1); - return FALSE; - } - - /** - * Parse a DOCTYPE. - * - * Parse a DOCTYPE declaration. This method has strong bearing on whether or - * not Quirksmode is enabled on the event handler. - * - * @todo This method is a little long. Should probably refactor. - */ - protected function doctype() { - if (strcasecmp($this->scanner->current(), 'D')) { - return FALSE; - } - // Check that string is DOCTYPE. - $chars = $this->scanner->charsWhile("DOCTYPEdoctype"); - if (strcasecmp($chars, 'DOCTYPE')) { - $this->parseError('Expected DOCTYPE, got %s', $chars); - return $this->bogusComment('scanner->whitespace(); - $tok = $this->scanner->current(); - - // EOF: die. - if ($tok === FALSE) { - $this->events->doctype('html5',EventHandler::DOCTYPE_NONE,'', TRUE); - return $this->eof(); - } - - $doctypeName = ''; - - // NULL char: convert. - if ($tok === "\0") { - $this->parseError("Unexpected NULL character in DOCTYPE."); - $doctypeName .= UTF8::FFFD; - $tok = $this->scanner->next(); - } - - $stop = " \n\f>"; - $doctypeName = $this->scanner->charsUntil($stop); - // Lowercase ASCII, replace \0 with FFFD - $doctypeName = strtolower(strtr($doctypeName, "\0", UTF8Utils::FFFD)); - - $tok = $this->scanner->current(); - - // If FALSE, emit a parse error, DOCTYPE, and return. 
- if ($tok === FALSE) { - $this->parseError('Unexpected EOF in DOCTYPE declaration.'); - $this->events->doctype($doctypeName, EventHandler::DOCTYPE_NONE, NULL, TRUE); - return TRUE; - } - - // Short DOCTYPE, like - if ($tok == '>') { - // DOCTYPE without a name. - if (strlen($doctypeName) == 0) { - $this->parseError("Expected a DOCTYPE name. Got nothing."); - $this->events->doctype($doctypeName, 0, NULL, TRUE); - $this->scanner->next(); - return TRUE; - } - $this->events->doctype($doctypeName); - $this->scanner->next(); - return TRUE; - } - $this->scanner->whitespace(); - - $pub = strtoupper($this->scanner->getAsciiAlpha()); - $white = strlen($this->scanner->whitespace()); - $tok = $this->scanner->current(); - - // Get ID, and flag it as pub or system. - if (($pub == 'PUBLIC' || $pub == 'SYSTEM') && $white > 0) { - // Get the sys ID. - $type = $pub == 'PUBLIC' ? EventHandler::DOCTYPE_PUBLIC : EventHandler::DOCTYPE_SYSTEM; - $id = $this->quotedString("\0>"); - if ($id === FALSE) { - $this->events->doctype($doctypeName, $type, $pub, FALSE); - return FALSE; - } - - // Premature EOF. - if ($this->scanner->current() === FALSE) { - $this->parseError("Unexpected EOF in DOCTYPE"); - $this->events->doctype($doctypeName, $type, $id, TRUE); - return TRUE; - } - - // Well-formed complete DOCTYPE. - $this->scanner->whitespace(); - if ($this->scanner->current() == '>') { - $this->events->doctype($doctypeName, $type, $id, FALSE); - $this->scanner->next(); - return TRUE; - } - - // If we get here, we have scanner->charsUntil(">"); - $this->parseError("Malformed DOCTYPE."); - $this->events->doctype($doctypeName, $type, $id, TRUE); - $this->scanner->next(); - return TRUE; - } - - // Else it's a bogus DOCTYPE. - // Consume to > and trash. - $this->scanner->charsUntil('>'); - - $this->parseError("Expected PUBLIC or SYSTEM. 
Got %s.", $pub); - $this->events->doctype($doctypeName, 0, NULL, TRUE); - $this->scanner->next(); - return TRUE; - - } - - /** - * Utility for reading a quoted string. - * - * @param string $stopchars - * Characters (in addition to a close-quote) that should stop the string. - * E.g. sometimes '>' is higher precedence than '"' or "'". - * @return mixed - * String if one is found (quotations omitted) - */ - protected function quotedString($stopchars) { - $tok = $this->scanner->current(); - if ($tok == '"' || $tok == "'") { - $this->scanner->next(); - $ret = $this->scanner->charsUntil($tok . $stopchars); - if ($this->scanner->current() == $tok) { - $this->scanner->next(); - } - else { - // Parse error because no close quote. - $this->parseError("Expected %s, got %s", $tok, $this->scanner->current()); - } - return $ret; - } - return FALSE; - } - - - /** - * Handle a CDATA section. - */ - protected function cdataSection() { - if ($this->scanner->current() != '[') { - return FALSE; - } - $cdata = ''; - $this->scanner->next(); - - $chars = $this->scanner->charsWhile('CDAT'); - if ($chars != 'CDATA' || $this->scanner->current() != '[') { - $this->parseError('Expected [CDATA[, got %s', $chars); - return $this->bogusComment('scanner->next(); - do { - if ($tok === FALSE) { - $this->parseError('Unexpected EOF inside CDATA.'); - $this->bogusComment('scanner->next(); - } - while (!$this->sequenceMatches(']]>')); - - // Consume ]]> - $this->scanner->consume(3); - - $this->events->cdata($cdata); - return TRUE; - - } - - // ================================================================ - // Non-HTML5 - // ================================================================ - /** - * Handle a processing instruction. - * - * XML processing instructions are supposed to be ignored in HTML5, - * treated as "bogus comments". However, since we're not a user - * agent, we allow them. We consume until ?> and then issue a - * EventListener::processingInstruction() event. 
- */ - protected function processingInstruction() { - if ($this->scanner->current() != '?') { - return FALSE; - } - - $tok = $this->scanner->next(); - $procName = $this->scanner->getAsciiAlpha(); - $white = strlen($this->scanner->whitespace()); - - // If not a PI, send to bogusComment. - if (strlen($procName) == 0 || $white == 0 || $this->scanner->current() == FALSE) { - $this->parseError("Expected processing instruction name, got $tok"); - $this->bogusComment('. - while (!($this->scanner->current() == '?' && $this->scanner->peek() == '>')) { - $data .= $this->scanner->current(); - - $tok = $this->scanner->next(); - if ($tok === FALSE) { - $this->parseError("Unexpected EOF in processing instruction."); + $this->scanner->next(); // > + $this->scanner->next(); // Next token. $this->events->processingInstruction($procName, $data); - return TRUE; - } - + return true; } - $this->scanner->next(); // > - $this->scanner->next(); // Next token. - $this->events->processingInstruction($procName, $data); - return TRUE; - } + // ================================================================ + // UTILITY FUNCTIONS + // ================================================================ + /** + * Read from the input stream until we get to the desired sequene + * or hit the end of the input stream. + */ + protected function readUntilSequence($sequence) + { + $buffer = ''; - // ================================================================ - // UTILITY FUNCTIONS - // ================================================================ + // Optimization for reading larger blocks faster. + $first = substr($sequence, 0, 1); + while ($this->scanner->current() !== false) { + $buffer .= $this->scanner->charsUntil($first); - /** - * Read from the input stream until we get to the desired sequene - * or hit the end of the input stream. - */ - protected function readUntilSequence($sequence) { - $buffer = ''; + // Stop as soon as we hit the stopping condition. 
+ if ($this->sequenceMatches($sequence) || $this->sequenceMatches(strtoupper($sequence))) { + return $buffer; + } + $buffer .= $this->scanner->current(); + $this->scanner->next(); + } - // Optimization for reading larger blocks faster. - $first = substr($sequence, 0, 1); - while ($this->scanner->current() !== FALSE) { - $buffer .= $this->scanner->charsUntil($first); - - // Stop as soon as we hit the stopping condition. - if ($this->sequenceMatches($sequence) || $this->sequenceMatches(strtoupper($sequence))) { + // If we get here, we hit the EOF. + $this->parseError("Unexpected EOF during text read."); return $buffer; - } - $buffer .= $this->scanner->current(); - $this->scanner->next(); } - // If we get here, we hit the EOF. - $this->parseError("Unexpected EOF during text read."); - return $buffer; - } + /** + * Check if upcomming chars match the given sequence. + * + * This will read the stream for the $sequence. If it's + * found, this will return true. If not, return false. + * Since this unconsumes any chars it reads, the caller + * will still need to read the next sequence, even if + * this returns true. + * + * Example: $this->sequenceMatches('') will + * see if the input stream is at the start of a + * '' string. + */ + protected function sequenceMatches($sequence) + { + $len = strlen($sequence); + $buffer = ''; + for ($i = 0; $i < $len; ++ $i) { + $buffer .= $this->scanner->current(); - /** - * Check if upcomming chars match the given sequence. - * - * This will read the stream for the $sequence. If it's - * found, this will return TRUE. If not, return FALSE. - * Since this unconsumes any chars it reads, the caller - * will still need to read the next sequence, even if - * this returns TRUE. - * - * Example: $this->sequenceMatches('') will - * see if the input stream is at the start of a - * '' string. 
- */ - protected function sequenceMatches($sequence) { - $len = strlen($sequence); - $buffer = ''; - for ($i = 0; $i < $len; ++$i) { - $buffer .= $this->scanner->current(); - - // EOF. Rewind and let the caller handle it. - if ($this->scanner->current() === FALSE) { - $this->scanner->unconsume($i); - return FALSE; - } - $this->scanner->next(); - } - - $this->scanner->unconsume($len); - return $buffer == $sequence; - - } - - /** - * Send a TEXT event with the contents of the text buffer. - * - * This emits an EventHandler::text() event with the current contents of the - * temporary text buffer. (The buffer is used to group as much PCDATA - * as we can instead of emitting lots and lots of TEXT events.) - */ - protected function flushBuffer() { - if (empty($this->text)) { - return; - } - $this->events->text($this->text); - $this->text = ''; - } - - /** - * Add text to the temporary buffer. - * - * @see flushBuffer() - */ - protected function buffer($str) { - $this->text .= $str; - } - - /** - * Emit a parse error. - * - * A parse error always returns FALSE because it never consumes any - * characters. - */ - protected function parseError($msg) { - $args = func_get_args(); - - if (count($args) > 1) { - array_shift($args); - $msg = vsprintf($msg, $args); - } - - $line = $this->scanner->currentLine(); - $col = $this->scanner->columnOffset(); - $this->events->parseError($msg, $line, $col); - return FALSE; - } - - /** - * Decode a character reference and return the string. - * - * Returns FALSE if the entity could not be found. If $inAttribute is set - * to TRUE, a bare & will be returned as-is. - * - * @param boolean $inAttribute - * Set to TRUE if the text is inside of an attribute value. - * FALSE otherwise. - */ - protected function decodeCharacterReference($inAttribute = FALSE) { - - // If it fails this, it's definitely not an entity. - if ($this->scanner->current() != '&') { - return FALSE; - } - - // Next char after &. 
- $tok = $this->scanner->next(); - $entity = ''; - $start = $this->scanner->position(); - - if ($tok == FALSE) { - return '&'; - } - - // These indicate not an entity. We return just - // the &. - if (strspn($tok, static::WHITE . "&<") == 1) { - //$this->scanner->next(); - return '&'; - } - - // Numeric entity - if ($tok == '#') { - $tok = $this->scanner->next(); - - // Hexidecimal encoding. - // X[0-9a-fA-F]+; - // x[0-9a-fA-F]+; - if ($tok == 'x' || $tok == 'X') { - $tok = $this->scanner->next(); // Consume x - - // Convert from hex code to char. - $hex = $this->scanner->getHex(); - if (empty($hex)) { - $this->parseError("Expected &#xHEX;, got &#x%s", $tok); - // We unconsume because we don't know what parser rules might - // be in effect for the remaining chars. For example. '&#>' - // might result in a specific parsing rule inside of tag - // contexts, while not inside of pcdata context. - $this->scanner->unconsume(2); - return '&'; + // EOF. Rewind and let the caller handle it. + if ($this->scanner->current() === false) { + $this->scanner->unconsume($i); + return false; + } + $this->scanner->next(); } - $entity = CharacterReference::lookupHex($hex); - } - // Decimal encoding. - // [0-9]+; - else { - // Convert from decimal to char. - $numeric = $this->scanner->getNumeric(); - if ($numeric === FALSE) { - $this->parseError("Expected &#DIGITS;, got &#%s", $tok); - $this->scanner->unconsume(2); - return '&'; + + $this->scanner->unconsume($len); + return $buffer == $sequence; + } + + /** + * Send a TEXT event with the contents of the text buffer. + * + * This emits an EventHandler::text() event with the current contents of the + * temporary text buffer. (The buffer is used to group as much PCDATA + * as we can instead of emitting lots and lots of TEXT events.) + */ + protected function flushBuffer() + { + if ($this->text === '') { + return; } - $entity = CharacterReference::lookupDecimal($numeric); - } - } - // String entity. 
- else { - // Attempt to consume a string up to a ';'. - // [a-zA-Z0-9]+; - $cname = $this->scanner->getAsciiAlpha(); - $entity = CharacterReference::lookupName($cname); - if ($entity == NULL) { - $this->parseError("No match in entity table for '%s'", $entity); - } + $this->events->text($this->text); + $this->text = ''; } - // The scanner has advanced the cursor for us. - $tok = $this->scanner->current(); - - // We have an entity. We're done here. - if ($tok == ';') { - $this->scanner->next(); - return $entity; + /** + * Add text to the temporary buffer. + * + * @see flushBuffer() + */ + protected function buffer($str) + { + $this->text .= $str; } - // If in an attribute, then failing to match ; means unconsume the - // entire string. Otherwise, failure to match is an error. - if ($inAttribute) { - $this->scanner->unconsume($this->scanner->position() - $start); - return '&'; + /** + * Emit a parse error. + * + * A parse error always returns false because it never consumes any + * characters. + */ + protected function parseError($msg) + { + $args = func_get_args(); + + if (count($args) > 1) { + array_shift($args); + $msg = vsprintf($msg, $args); + } + + $line = $this->scanner->currentLine(); + $col = $this->scanner->columnOffset(); + $this->events->parseError($msg, $line, $col); + return false; } - $this->parseError("Expected &ENTITY;, got &ENTITY%s (no trailing ;) ", $tok); - return '&' . $entity; + /** + * Decode a character reference and return the string. + * + * Returns false if the entity could not be found. If $inAttribute is set + * to true, a bare & will be returned as-is. + * + * @param boolean $inAttribute + * Set to true if the text is inside of an attribute value. + * false otherwise. + */ + protected function decodeCharacterReference($inAttribute = false) + { - } + // If it fails this, it's definitely not an entity. + if ($this->scanner->current() != '&') { + return false; + } + // Next char after &. 
+ $tok = $this->scanner->next(); + $entity = ''; + $start = $this->scanner->position(); + + if ($tok == false) { + return '&'; + } + + // These indicate not an entity. We return just + // the &. + if (strspn($tok, static::WHITE . "&<") == 1) { + // $this->scanner->next(); + return '&'; + } + + // Numeric entity + if ($tok == '#') { + $tok = $this->scanner->next(); + + // Hexidecimal encoding. + // X[0-9a-fA-F]+; + // x[0-9a-fA-F]+; + if ($tok == 'x' || $tok == 'X') { + $tok = $this->scanner->next(); // Consume x + + // Convert from hex code to char. + $hex = $this->scanner->getHex(); + if (empty($hex)) { + $this->parseError("Expected &#xHEX;, got &#x%s", $tok); + // We unconsume because we don't know what parser rules might + // be in effect for the remaining chars. For example. '&#>' + // might result in a specific parsing rule inside of tag + // contexts, while not inside of pcdata context. + $this->scanner->unconsume(2); + return '&'; + } + $entity = CharacterReference::lookupHex($hex); + } // Decimal encoding. + // [0-9]+; + else { + // Convert from decimal to char. + $numeric = $this->scanner->getNumeric(); + if ($numeric === false) { + $this->parseError("Expected &#DIGITS;, got &#%s", $tok); + $this->scanner->unconsume(2); + return '&'; + } + $entity = CharacterReference::lookupDecimal($numeric); + } + } // String entity. + else { + // Attempt to consume a string up to a ';'. + // [a-zA-Z0-9]+; + $cname = $this->scanner->getAsciiAlpha(); + $entity = CharacterReference::lookupName($cname); + if ($entity == null) { + $this->parseError("No match in entity table for '%s'", $entity); + } + } + + // The scanner has advanced the cursor for us. + $tok = $this->scanner->current(); + + // We have an entity. We're done here. + if ($tok == ';') { + $this->scanner->next(); + return $entity; + } + + // If in an attribute, then failing to match ; means unconsume the + // entire string. Otherwise, failure to match is an error. 
+ if ($inAttribute) { + $this->scanner->unconsume($this->scanner->position() - $start); + return '&'; + } + + $this->parseError("Expected &ENTITY;, got &ENTITY%s (no trailing ;) ", $tok); + return '&' . $entity; + } } diff --git a/libraries/html5php/HTML5/Parser/TreeBuildingRules.php b/libraries/html5php/HTML5/Parser/TreeBuildingRules.php index b87c6b5..2af3c66 100644 --- a/libraries/html5php/HTML5/Parser/TreeBuildingRules.php +++ b/libraries/html5php/HTML5/Parser/TreeBuildingRules.php @@ -1,114 +1,140 @@ 1, - 'dd' => 1, - 'dt' => 1, - 'rt' => 1, - 'rp' => 1, - 'tr' => 1, - 'th' => 1, - 'td' => 1, - 'thead' => 1, - 'tfoot' => 1, - 'tbody' => 1, - 'table' => 1, - 'optgroup' => 1, - 'option' => 1, - ); + protected static $tags = array( + 'li' => 1, + 'dd' => 1, + 'dt' => 1, + 'rt' => 1, + 'rp' => 1, + 'tr' => 1, + 'th' => 1, + 'td' => 1, + 'thead' => 1, + 'tfoot' => 1, + 'tbody' => 1, + 'table' => 1, + 'optgroup' => 1, + 'option' => 1 + ); - /** - * Build a new rules engine. - * - * @param \DOMDocument $doc - * The DOM document to use for evaluation and modification. - */ - public function __construct($doc) { - $this->doc = $doc; - } - - /** - * Returns TRUE if the given tagname has special processing rules. - */ - public function hasRules($tagname) { - return isset(static::$tags[$tagname]); - } - - /** - * Evaluate the rule for the current tag name. - * - * This may modify the existing DOM. - * - * @return \DOMElement - * The new Current DOM element. 
- */ - public function evaluate($new, $current) { - - switch($new->tagName) { - case 'li': - return $this->handleLI($new, $current); - case 'dt': - case 'dd': - return $this->handleDT($new, $current); - case 'rt': - case 'rp': - return $this->handleRT($new, $current); - case 'optgroup': - return $this->closeIfCurrentMatches($new, $current, array('optgroup')); - case 'option': - return $this->closeIfCurrentMatches($new, $current, array('option', 'optgroup')); - case 'tr': - return $this->closeIfCurrentMatches($new, $current, array('tr')); - case 'td': - case 'th': - return $this->closeIfCurrentMatches($new, $current, array('th', 'td')); - case 'tbody': - case 'thead': - case 'tfoot': - case 'table': // Spec isn't explicit about this, but it's necessary. - return $this->closeIfCurrentMatches($new, $current, array('thead', 'tfoot', 'tbody')); + /** + * Build a new rules engine. + * + * @param \DOMDocument $doc + * The DOM document to use for evaluation and modification. + */ + public function __construct($doc) + { + $this->doc = $doc; } - return $current; - } - - protected function handleLI($ele, $current) { - return $this->closeIfCurrentMatches($ele, $current, array('li')); - } - - protected function handleDT($ele, $current) { - return $this->closeIfCurrentMatches($ele, $current, array('dt','dd')); - } - protected function handleRT($ele, $current) { - return $this->closeIfCurrentMatches($ele, $current, array('rt','rp')); - } - - protected function closeIfCurrentMatches($ele, $current, $match) { - $tname = $current->tagName; - if (in_array($current->tagName, $match)) { - $current->parentNode->appendChild($ele); + /** + * Returns true if the given tagname has special processing rules. + */ + public function hasRules($tagname) + { + return isset(static::$tags[$tagname]); } - else { - $current->appendChild($ele); - } - return $ele; - } + /** + * Evaluate the rule for the current tag name. + * + * This may modify the existing DOM. 
+ * + * @return \DOMElement The new Current DOM element. + */ + public function evaluate($new, $current) + { + switch ($new->tagName) { + case 'li': + return $this->handleLI($new, $current); + case 'dt': + case 'dd': + return $this->handleDT($new, $current); + case 'rt': + case 'rp': + return $this->handleRT($new, $current); + case 'optgroup': + return $this->closeIfCurrentMatches($new, $current, array( + 'optgroup' + )); + case 'option': + return $this->closeIfCurrentMatches($new, $current, array( + 'option', + 'optgroup' + )); + case 'tr': + return $this->closeIfCurrentMatches($new, $current, array( + 'tr' + )); + case 'td': + case 'th': + return $this->closeIfCurrentMatches($new, $current, array( + 'th', + 'td' + )); + case 'tbody': + case 'thead': + case 'tfoot': + case 'table': // Spec isn't explicit about this, but it's necessary. + + return $this->closeIfCurrentMatches($new, $current, array( + 'thead', + 'tfoot', + 'tbody' + )); + } + + return $current; + } + + protected function handleLI($ele, $current) + { + return $this->closeIfCurrentMatches($ele, $current, array( + 'li' + )); + } + + protected function handleDT($ele, $current) + { + return $this->closeIfCurrentMatches($ele, $current, array( + 'dt', + 'dd' + )); + } + + protected function handleRT($ele, $current) + { + return $this->closeIfCurrentMatches($ele, $current, array( + 'rt', + 'rp' + )); + } + + protected function closeIfCurrentMatches($ele, $current, $match) + { + $tname = $current->tagName; + if (in_array($current->tagName, $match)) { + $current->parentNode->appendChild($ele); + } else { + $current->appendChild($ele); + } + + return $ele; + } } diff --git a/libraries/html5php/HTML5/Parser/UTF8Utils.php b/libraries/html5php/HTML5/Parser/UTF8Utils.php index 974a670..d319252 100644 --- a/libraries/html5php/HTML5/Parser/UTF8Utils.php +++ b/libraries/html5php/HTML5/Parser/UTF8Utils.php @@ -1,13 +1,14 @@ Permission is hereby granted, free of charge, to any person obtaining a copy of this software 
and associated documentation files (the -"Software"), to deal in the Software without restriction, including + "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to @@ -25,145 +26,130 @@ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -namespace HTML5\Parser; /** * UTF-8 Utilities */ -class UTF8Utils { - /** - * The Unicode replacement character.. - */ - const FFFD = "\xEF\xBF\xBD"; - /** - * Count the number of characters in a string. - * - * UTF-8 aware. This will try (in order) iconv, - * MB, libxml, and finally a custom counter. - * - * @todo Move this to a general utility class. - */ - public static function countChars($string) { - // Get the length for the string we need. - if(function_exists('iconv_strlen')) { - return iconv_strlen($string, 'utf-8'); - } - elseif(function_exists('mb_strlen')) { - return mb_strlen($string, 'utf-8'); - } - elseif(function_exists('utf8_decode')) { - // MPB: Will this work? Won't certain decodes lead to two chars - // extrapolated out of 2-byte chars? - return strlen(utf8_decode($string)); - } - $count = count_chars($string); - // 0x80 = 0x7F - 0 + 1 (one added to get inclusive range) - // 0x33 = 0xF4 - 0x2C + 1 (one added to get inclusive range) - return array_sum(array_slice($count, 0, 0x80)) + - array_sum(array_slice($count, 0xC2, 0x33)); - } +class UTF8Utils +{ - /** - * Convert data from the given encoding to UTF-8. - * - * This has not yet been tested with charactersets other than UTF-8. - * It should work with ISO-8859-1/-13 and standard Latin Win charsets. - * - * @param string $data - * The data to convert. - * @param string $encoding - * A valid encoding. 
Examples: http://www.php.net/manual/en/mbstring.supported-encodings.php - */ - public static function convertToUTF8($data, $encoding = 'UTF-8') { - /* - * From the HTML5 spec: - Given an encoding, the bytes in the input stream must be - converted to Unicode characters for the tokeniser, as - described by the rules for that encoding, except that the - leading U+FEFF BYTE ORDER MARK character, if any, must not - be stripped by the encoding layer (it is stripped by the rule below). + /** + * The Unicode replacement character.. + */ + const FFFD = "\xEF\xBF\xBD"; - Bytes or sequences of bytes in the original byte stream that - could not be converted to Unicode characters must be converted - to U+FFFD REPLACEMENT CHARACTER code points. */ - - // mb_convert_encoding is chosen over iconv because of a bug. The best - // details for the bug are on http://us1.php.net/manual/en/function.iconv.php#108643 - // which contains links to the actual but reports as well as work around - // details. - if (function_exists('mb_convert_encoding')) { - // mb library has the following behaviors: - // - UTF-16 surrogates result in FALSE. - // - Overlongs and outside Plane 16 result in empty strings. - - // Before we run mb_convert_encoding we need to tell it what to do with - // characters it does not know. This could be different than the parent - // application executing this library so we store the value, change it - // to our needs, and then change it back when we are done. This feels - // a little excessive and it would be great if there was a better way. - $save = ini_get('mbstring.substitute_character'); - ini_set('mbstring.substitute_character', "none"); - $data = mb_convert_encoding($data, 'UTF-8', $encoding); - ini_set('mbstring.substitute_character', $save); - } - // @todo Get iconv running in at least some environments if that is possible. 
- elseif (function_exists('iconv') && $encoding != 'auto') { - // fprintf(STDOUT, "iconv found\n"); - // iconv has the following behaviors: - // - Overlong representations are ignored. - // - Beyond Plane 16 is replaced with a lower char. - // - Incomplete sequences generate a warning. - $data = @iconv($encoding, 'UTF-8//IGNORE', $data); - } - else { - // we can make a conforming native implementation - throw new Exception('Not implemented, please install mbstring or iconv'); + /** + * Count the number of characters in a string. + * + * UTF-8 aware. This will try (in order) iconv, + * MB, libxml, and finally a custom counter. + * + * @todo Move this to a general utility class. + */ + public static function countChars($string) + { + // Get the length for the string we need. + if (function_exists('iconv_strlen')) { + return iconv_strlen($string, 'utf-8'); + } elseif (function_exists('mb_strlen')) { + return mb_strlen($string, 'utf-8'); + } elseif (function_exists('utf8_decode')) { + // MPB: Will this work? Won't certain decodes lead to two chars + // extrapolated out of 2-byte chars? + return strlen(utf8_decode($string)); + } + $count = count_chars($string); + // 0x80 = 0x7F - 0 + 1 (one added to get inclusive range) + // 0x33 = 0xF4 - 0x2C + 1 (one added to get inclusive range) + return array_sum(array_slice($count, 0, 0x80)) + array_sum(array_slice($count, 0xC2, 0x33)); } - /* One leading U+FEFF BYTE ORDER MARK character must be - ignored if any are present. */ - if (substr($data, 0, 3) === "\xEF\xBB\xBF") { - $data = substr($data, 3); + /** + * Convert data from the given encoding to UTF-8. + * + * This has not yet been tested with charactersets other than UTF-8. + * It should work with ISO-8859-1/-13 and standard Latin Win charsets. + * + * @param string $data + * The data to convert. + * @param string $encoding + * A valid encoding. 
Examples: http://www.php.net/manual/en/mbstring.supported-encodings.php + */ + public static function convertToUTF8($data, $encoding = 'UTF-8') + { + /* + * From the HTML5 spec: Given an encoding, the bytes in the input stream must be converted to Unicode characters for the tokeniser, as described by the rules for that encoding, except that the leading U+FEFF BYTE ORDER MARK character, if any, must not be stripped by the encoding layer (it is stripped by the rule below). Bytes or sequences of bytes in the original byte stream that could not be converted to Unicode characters must be converted to U+FFFD REPLACEMENT CHARACTER code points. + */ + + // mb_convert_encoding is chosen over iconv because of a bug. The best + // details for the bug are on http://us1.php.net/manual/en/function.iconv.php#108643 + // which contains links to the actual but reports as well as work around + // details. + if (function_exists('mb_convert_encoding')) { + // mb library has the following behaviors: + // - UTF-16 surrogates result in false. + // - Overlongs and outside Plane 16 result in empty strings. + + // Before we run mb_convert_encoding we need to tell it what to do with + // characters it does not know. This could be different than the parent + // application executing this library so we store the value, change it + // to our needs, and then change it back when we are done. This feels + // a little excessive and it would be great if there was a better way. + $save = ini_get('mbstring.substitute_character'); + ini_set('mbstring.substitute_character', "none"); + $data = mb_convert_encoding($data, 'UTF-8', $encoding); + ini_set('mbstring.substitute_character', $save); + } // @todo Get iconv running in at least some environments if that is possible. + elseif (function_exists('iconv') && $encoding != 'auto') { + // fprintf(STDOUT, "iconv found\n"); + // iconv has the following behaviors: + // - Overlong representations are ignored. 
+ // - Beyond Plane 16 is replaced with a lower char. + // - Incomplete sequences generate a warning. + $data = @iconv($encoding, 'UTF-8//IGNORE', $data); + } else { + // we can make a conforming native implementation + throw new Exception('Not implemented, please install mbstring or iconv'); + } + + /* + * One leading U+FEFF BYTE ORDER MARK character must be ignored if any are present. + */ + if (substr($data, 0, 3) === "\xEF\xBB\xBF") { + $data = substr($data, 3); + } + + return $data; } - return $data; - } + /** + * Checks for Unicode code points that are not valid in a document. + * + * @param string $data + * A string to analyze. + * @return array An array of (string) error messages produced by the scanning. + */ + public static function checkForIllegalCodepoints($data) + { + if (! function_exists('preg_match_all')) { + throw\Exception('The PCRE library is not loaded or is not available.'); + } - /** - * Checks for Unicode code points that are not valid in a document. - * - * @param string $data - * A string to analyze. - * @return array - * An array of (string) error messages produced by the scanning. - */ - public static function checkForIllegalCodepoints($data) { - if (!function_exists('preg_match_all')) { - throw \Exception('The PCRE library is not loaded or is not available.'); - } + // Vestigal error handling. + $errors = array(); - // Vestigal error handling. - $errors = array(); + /* + * All U+0000 null characters in the input must be replaced by U+FFFD REPLACEMENT CHARACTERs. Any occurrences of such characters is a parse error. + */ + for ($i = 0, $count = substr_count($data, "\0"); $i < $count; $i ++) { + $errors[] = 'null-character'; + } - /* All U+0000 NULL characters in the input must be replaced - by U+FFFD REPLACEMENT CHARACTERs. Any occurrences of such - characters is a parse error. 
*/ - for ($i = 0, $count = substr_count($data, "\0"); $i < $count; $i++) { - $errors[] = 'null-character'; - } - - /* Any occurrences of any characters in the ranges U+0001 to - U+0008, U+000B, U+000E to U+001F, U+007F to U+009F, - U+D800 to U+DFFF , U+FDD0 to U+FDEF, and - characters U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF, - U+3FFFE, U+3FFFF, U+4FFFE, U+4FFFF, U+5FFFE, U+5FFFF, U+6FFFE, - U+6FFFF, U+7FFFE, U+7FFFF, U+8FFFE, U+8FFFF, U+9FFFE, U+9FFFF, - U+AFFFE, U+AFFFF, U+BFFFE, U+BFFFF, U+CFFFE, U+CFFFF, U+DFFFE, - U+DFFFF, U+EFFFE, U+EFFFF, U+FFFFE, U+FFFFF, U+10FFFE, and - U+10FFFF are parse errors. (These are all control characters - or permanently undefined Unicode characters.) */ - // Check PCRE is loaded. - $count = preg_match_all( - '/(?: + /* + * Any occurrences of any characters in the ranges U+0001 to U+0008, U+000B, U+000E to U+001F, U+007F to U+009F, U+D800 to U+DFFF , U+FDD0 to U+FDEF, and characters U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF, U+3FFFE, U+3FFFF, U+4FFFE, U+4FFFF, U+5FFFE, U+5FFFF, U+6FFFE, U+6FFFF, U+7FFFE, U+7FFFF, U+8FFFE, U+8FFFF, U+9FFFE, U+9FFFF, U+AFFFE, U+AFFFF, U+BFFFE, U+BFFFF, U+CFFFE, U+CFFFF, U+DFFFE, U+DFFFF, U+EFFFE, U+EFFFF, U+FFFFE, U+FFFFF, U+10FFFE, and U+10FFFF are parse errors. (These are all control characters or permanently undefined Unicode characters.) + */ + // Check PCRE is loaded. 
+ $count = preg_match_all( + '/(?: [\x01-\x08\x0B\x0E-\x1F\x7F] # U+0001 to U+0008, U+000B, U+000E to U+001F and U+007F | \xC2[\x80-\x9F] # U+0080 to U+009F @@ -175,13 +161,11 @@ class UTF8Utils { \xEF\xBF[\xBE\xBF] # U+FFFE and U+FFFF | [\xF0-\xF4][\x8F-\xBF]\xBF[\xBE\xBF] # U+nFFFE and U+nFFFF (1 <= n <= 10_{16}) - )/x', - $data, - $matches - ); - for ($i = 0; $i < $count; $i++) { - $errors[] = 'invalid-codepoint'; + )/x', $data, $matches); + for ($i = 0; $i < $count; $i ++) { + $errors[] = 'invalid-codepoint'; + } + + return $errors; } - return $errors; - } } diff --git a/libraries/html5php/HTML5/Serializer/HTML5Entities.php b/libraries/html5php/HTML5/Serializer/HTML5Entities.php index 5717002..4f90f84 100644 --- a/libraries/html5php/HTML5/Serializer/HTML5Entities.php +++ b/libraries/html5php/HTML5/Serializer/HTML5Entities.php @@ -3,1528 +3,1531 @@ * @file * This contains HTML5 entities to use with serializing. * - * The list here is mildly different from the list at \HTML5\Entities because + * The list here is mildly different from the list at \Masterminds\HTML5\Entities because * that list was generated from the w3c. It contains some entities that are * not entirely proper such as &am; which maps to &. This list is meant to be * a fallback for PHP versions prior to PHP 5.4 when dealing with encoding. */ -namespace HTML5\Serializer; +namespace Masterminds\HTML5\Serializer; /** - * A mapping of entities to their html5 representation. Used for older PHP + * A mapping of entities to their html5 representation. + * Used for older PHP * versions that don't have the mapping. */ -class HTML5Entities { - public static $map = array ( - ' ' => ' ', - "\n" => ' ', - '!' => '!', - '"' => '"', - '#' => '#', - '$' => '$', - '%' => '%', - '&' => '&', - '\'' => ''', - '(' => '(', - ')' => ')', - '*' => '*', - '+' => '+', - ',' => ',', - '.' 
=> '.', - '/' => '/', - ':' => ':', - ';' => ';', - '<' => '<', - '<⃒' => '&nvlt', - '=' => '=', - '=⃥' => '&bne', - '>' => '>', - '>⃒' => '&nvgt', - '?' => '?', - '@' => '@', - '[' => '[', - '\\' => '\', - ']' => ']', - '^' => '^', - '_' => '_', - '`' => '`', - 'fj' => '&fjlig', - '{' => '{', - '|' => '|', - '}' => '}', - ' ' => ' ', - '¡' => '¡', - '¢' => '¢', - '£' => '£', - '¤' => '¤', - 'Â¥' => '¥', - '¦' => '¦', - '§' => '§', - '¨' => '¨', - '©' => '©', - 'ª' => 'ª', - '«' => '«', - '¬' => '¬', - '­' => '­', - '®' => '®', - '¯' => '¯', - '°' => '°', - '±' => '±', - '²' => '²', - '³' => '³', - '´' => '´', - 'µ' => 'µ', - '¶' => '¶', - '·' => '·', - '¸' => '¸', - '¹' => '¹', - 'º' => 'º', - '»' => '»', - '¼' => '¼', - '½' => '½', - '¾' => '¾', - '¿' => '¿', - 'À' => 'À', - 'Ã' => 'Á', - 'Â' => 'Â', - 'Ã' => 'Ã', - 'Ä' => 'Ä', - 'Ã…' => 'Å', - 'Æ' => 'Æ', - 'Ç' => 'Ç', - 'È' => 'È', - 'É' => 'É', - 'Ê' => 'Ê', - 'Ë' => 'Ë', - 'ÃŒ' => 'Ì', - 'Ã' => 'Í', - 'ÃŽ' => 'Î', - 'Ã' => 'Ï', - 'Ã' => 'Ð', - 'Ñ' => 'Ñ', - 'Ã’' => 'Ò', - 'Ó' => 'Ó', - 'Ô' => 'Ô', - 'Õ' => 'Õ', - 'Ö' => 'Ö', - '×' => '×', - 'Ø' => 'Ø', - 'Ù' => 'Ù', - 'Ú' => 'Ú', - 'Û' => 'Û', - 'Ãœ' => 'Ü', - 'Ã' => 'Ý', - 'Þ' => 'Þ', - 'ß' => 'ß', - 'à' => 'à', - 'á' => 'á', - 'â' => 'â', - 'ã' => 'ã', - 'ä' => 'ä', - 'Ã¥' => 'å', - 'æ' => 'æ', - 'ç' => 'ç', - 'è' => 'è', - 'é' => 'é', - 'ê' => 'ê', - 'ë' => 'ë', - 'ì' => 'ì', - 'í' => 'í', - 'î' => 'î', - 'ï' => 'ï', - 'ð' => 'ð', - 'ñ' => 'ñ', - 'ò' => 'ò', - 'ó' => 'ó', - 'ô' => 'ô', - 'õ' => 'õ', - 'ö' => 'ö', - '÷' => '÷', - 'ø' => 'ø', - 'ù' => 'ù', - 'ú' => 'ú', - 'û' => 'û', - 'ü' => 'ü', - 'ý' => 'ý', - 'þ' => 'þ', - 'ÿ' => 'ÿ', - 'Ä€' => 'Ā', - 'Ä' => 'ā', - 'Ä‚' => 'Ă', - 'ă' => 'ă', - 'Ä„' => 'Ą', - 'Ä…' => 'ą', - 'Ć' => 'Ć', - 'ć' => 'ć', - 'Ĉ' => 'Ĉ', - 'ĉ' => 'ĉ', - 'ÄŠ' => 'Ċ', - 'Ä‹' => 'ċ', - 'ÄŒ' => 'Č', - 'Ä' => 'č', - 'ÄŽ' => 'Ď', - 'Ä' => 'ď', - 'Ä' => 'Đ', - 'Ä‘' => 'đ', - 'Ä’' => 'Ē', - 'Ä“' => 'ē', - 'Ä–' => 'Ė', - 'Ä—' => 'ė', - 
'Ę' => 'Ę', - 'Ä™' => 'ę', - 'Äš' => 'Ě', - 'Ä›' => 'ě', - 'Äœ' => 'Ĝ', - 'Ä' => 'ĝ', - 'Äž' => 'Ğ', - 'ÄŸ' => 'ğ', - 'Ä ' => 'Ġ', - 'Ä¡' => 'ġ', - 'Ä¢' => 'Ģ', - 'Ĥ' => 'Ĥ', - 'Ä¥' => 'ĥ', - 'Ħ' => 'Ħ', - 'ħ' => 'ħ', - 'Ĩ' => 'Ĩ', - 'Ä©' => 'ĩ', - 'Ī' => 'Ī', - 'Ä«' => 'ī', - 'Ä®' => 'Į', - 'į' => 'į', - 'Ä°' => 'İ', - 'ı' => 'ı', - 'IJ' => 'IJ', - 'ij' => 'ij', - 'Ä´' => 'Ĵ', - 'ĵ' => 'ĵ', - 'Ķ' => 'Ķ', - 'Ä·' => 'ķ', - 'ĸ' => 'ĸ', - 'Ĺ' => 'Ĺ', - 'ĺ' => 'ĺ', - 'Ä»' => 'Ļ', - 'ļ' => 'ļ', - 'Ľ' => 'Ľ', - 'ľ' => 'ľ', - 'Ä¿' => 'Ŀ', - 'Å€' => 'ŀ', - 'Å' => 'Ł', - 'Å‚' => 'ł', - 'Ń' => 'Ń', - 'Å„' => 'ń', - 'Å…' => 'Ņ', - 'ņ' => 'ņ', - 'Ň' => 'Ň', - 'ň' => 'ň', - 'ʼn' => 'ʼn', - 'ÅŠ' => 'Ŋ', - 'Å‹' => 'ŋ', - 'ÅŒ' => 'Ō', - 'Å' => 'ō', - 'Å' => 'Ő', - 'Å‘' => 'ő', - 'Å’' => 'Œ', - 'Å“' => 'œ', - 'Å”' => 'Ŕ', - 'Å•' => 'ŕ', - 'Å–' => 'Ŗ', - 'Å—' => 'ŗ', - 'Ř' => 'Ř', - 'Å™' => 'ř', - 'Åš' => 'Ś', - 'Å›' => 'ś', - 'Åœ' => 'Ŝ', - 'Å' => 'ŝ', - 'Åž' => 'Ş', - 'ÅŸ' => 'ş', - 'Å ' => 'Š', - 'Å¡' => 'š', - 'Å¢' => 'Ţ', - 'Å£' => 'ţ', - 'Ť' => 'Ť', - 'Å¥' => 'ť', - 'Ŧ' => 'Ŧ', - 'ŧ' => 'ŧ', - 'Ũ' => 'Ũ', - 'Å©' => 'ũ', - 'Ū' => 'Ū', - 'Å«' => 'ū', - 'Ŭ' => 'Ŭ', - 'Å­' => 'ŭ', - 'Å®' => 'Ů', - 'ů' => 'ů', - 'Å°' => 'Ű', - 'ű' => 'ű', - 'Ų' => 'Ų', - 'ų' => 'ų', - 'Å´' => 'Ŵ', - 'ŵ' => 'ŵ', - 'Ŷ' => 'Ŷ', - 'Å·' => 'ŷ', - 'Ÿ' => 'Ÿ', - 'Ź' => 'Ź', - 'ź' => 'ź', - 'Å»' => 'Ż', - 'ż' => 'ż', - 'Ž' => 'Ž', - 'ž' => 'ž', - 'Æ’' => 'ƒ', - 'Ƶ' => 'Ƶ', - 'ǵ' => 'ǵ', - 'È·' => 'ȷ', - 'ˆ' => 'ˆ', - 'ˇ' => 'ˇ', - '˘' => '˘', - 'Ë™' => '˙', - 'Ëš' => '˚', - 'Ë›' => '˛', - 'Ëœ' => '˜', - 'Ë' => '˝', - 'Ì‘' => '̑', - 'Α' => 'Α', - 'Î’' => 'Β', - 'Γ' => 'Γ', - 'Δ' => 'Δ', - 'Ε' => 'Ε', - 'Ζ' => 'Ζ', - 'Η' => 'Η', - 'Θ' => 'Θ', - 'Ι' => 'Ι', - 'Κ' => 'Κ', - 'Λ' => 'Λ', - 'Îœ' => 'Μ', - 'Î' => 'Ν', - 'Ξ' => 'Ξ', - 'Ο' => 'Ο', - 'Π' => 'Π', - 'Ρ' => 'Ρ', - 'Σ' => 'Σ', - 'Τ' => 'Τ', - 'Î¥' => 'Υ', - 'Φ' => 'Φ', - 'Χ' => 'Χ', - 'Ψ' => 'Ψ', - 'Ω' => 'Ω', - 'α' => 'α', - 'β' => 'β', - 'γ' => 'γ', - 
'δ' => 'δ', - 'ε' => 'ε', - 'ζ' => 'ζ', - 'η' => 'η', - 'θ' => 'θ', - 'ι' => 'ι', - 'κ' => 'κ', - 'λ' => 'λ', - 'μ' => 'μ', - 'ν' => 'ν', - 'ξ' => 'ξ', - 'ο' => 'ο', - 'Ï€' => 'π', - 'Ï' => 'ρ', - 'Ï‚' => 'ς', - 'σ' => 'σ', - 'Ï„' => 'τ', - 'Ï…' => 'υ', - 'φ' => 'φ', - 'χ' => 'χ', - 'ψ' => 'ψ', - 'ω' => 'ω', - 'Ï‘' => 'ϑ', - 'Ï’' => 'ϒ', - 'Ï•' => 'ϕ', - 'Ï–' => 'ϖ', - 'Ïœ' => 'Ϝ', - 'Ï' => 'ϝ', - 'Ï°' => 'ϰ', - 'ϱ' => 'ϱ', - 'ϵ' => 'ϵ', - '϶' => '϶', - 'Ð' => 'Ё', - 'Ђ' => 'Ђ', - 'Ѓ' => 'Ѓ', - 'Є' => 'Є', - 'Ð…' => 'Ѕ', - 'І' => 'І', - 'Ї' => 'Ї', - 'Ј' => 'Ј', - 'Љ' => 'Љ', - 'Њ' => 'Њ', - 'Ћ' => 'Ћ', - 'ÐŒ' => 'Ќ', - 'ÐŽ' => 'Ў', - 'Ð' => 'Џ', - 'Ð' => 'А', - 'Б' => 'Б', - 'Ð’' => 'В', - 'Г' => 'Г', - 'Д' => 'Д', - 'Е' => 'Е', - 'Ж' => 'Ж', - 'З' => 'З', - 'И' => 'И', - 'Й' => 'Й', - 'К' => 'К', - 'Л' => 'Л', - 'Ðœ' => 'М', - 'Ð' => 'Н', - 'О' => 'О', - 'П' => 'П', - 'Р' => 'Р', - 'С' => 'С', - 'Т' => 'Т', - 'У' => 'У', - 'Ф' => 'Ф', - 'Ð¥' => 'Х', - 'Ц' => 'Ц', - 'Ч' => 'Ч', - 'Ш' => 'Ш', - 'Щ' => 'Щ', - 'Ъ' => 'Ъ', - 'Ы' => 'Ы', - 'Ь' => 'Ь', - 'Э' => 'Э', - 'Ю' => 'Ю', - 'Я' => 'Я', - 'а' => 'а', - 'б' => 'б', - 'в' => 'в', - 'г' => 'г', - 'д' => 'д', - 'е' => 'е', - 'ж' => 'ж', - 'з' => 'з', - 'и' => 'и', - 'й' => 'й', - 'к' => 'к', - 'л' => 'л', - 'м' => 'м', - 'н' => 'н', - 'о' => 'о', - 'п' => 'п', - 'Ñ€' => 'р', - 'Ñ' => 'с', - 'Ñ‚' => 'т', - 'у' => 'у', - 'Ñ„' => 'ф', - 'Ñ…' => 'х', - 'ц' => 'ц', - 'ч' => 'ч', - 'ш' => 'ш', - 'щ' => 'щ', - 'ÑŠ' => 'ъ', - 'Ñ‹' => 'ы', - 'ÑŒ' => 'ь', - 'Ñ' => 'э', - 'ÑŽ' => 'ю', - 'Ñ' => 'я', - 'Ñ‘' => 'ё', - 'Ñ’' => 'ђ', - 'Ñ“' => 'ѓ', - 'Ñ”' => 'є', - 'Ñ•' => 'ѕ', - 'Ñ–' => 'і', - 'Ñ—' => 'ї', - 'ј' => 'ј', - 'Ñ™' => 'љ', - 'Ñš' => 'њ', - 'Ñ›' => 'ћ', - 'Ñœ' => 'ќ', - 'Ñž' => 'ў', - 'ÑŸ' => 'џ', - ' ' => ' ', - ' ' => ' ', - ' ' => ' ', - ' ' => ' ', - ' ' => ' ', - ' ' => ' ', - ' ' => ' ', - ' ' => ' ', - '​' => '​', - '‌' => '‌', - 'â€' => '‍', - '‎' => '‎', - 'â€' => '‏', - 'â€' => '‐', - '–' => '–', - '—' => '—', - 
'―' => '―', - '‖' => '‖', - '‘' => '‘', - '’' => '’', - '‚' => '‚', - '“' => '“', - 'â€' => '”', - '„' => '„', - '†' => '†', - '‡' => '‡', - '•' => '•', - '‥' => '‥', - '…' => '…', - '‰' => '‰', - '‱' => '‱', - '′' => '′', - '″' => '″', - '‴' => '‴', - '‵' => '‵', - '‹' => '‹', - '›' => '›', - '‾' => '‾', - 'â' => '⁁', - 'âƒ' => '⁃', - 'â„' => '⁄', - 'â' => '⁏', - 'â—' => '⁗', - 'âŸ' => ' ', - 'âŸâ€Š' => '&ThickSpace', - 'â ' => '⁠', - 'â¡' => '⁡', - 'â¢' => '⁢', - 'â£' => '⁣', - '€' => '€', - '⃛' => '⃛', - '⃜' => '⃜', - 'â„‚' => 'ℂ', - 'â„…' => '℅', - 'â„Š' => 'ℊ', - 'â„‹' => 'ℋ', - 'â„Œ' => 'ℌ', - 'â„' => 'ℍ', - 'â„Ž' => 'ℎ', - 'â„' => 'ℏ', - 'â„' => 'ℐ', - 'â„‘' => 'ℑ', - 'â„’' => 'ℒ', - 'â„“' => 'ℓ', - 'â„•' => 'ℕ', - 'â„–' => '№', - 'â„—' => '℗', - '℘' => '℘', - 'â„™' => 'ℙ', - 'â„š' => 'ℚ', - 'â„›' => 'ℛ', - 'â„œ' => 'ℜ', - 'â„' => 'ℝ', - 'â„ž' => '℞', - 'â„¢' => '™', - 'ℤ' => 'ℤ', - '℧' => '℧', - 'ℨ' => 'ℨ', - 'â„©' => '℩', - 'ℬ' => 'ℬ', - 'â„­' => 'ℭ', - 'ℯ' => 'ℯ', - 'â„°' => 'ℰ', - 'ℱ' => 'ℱ', - 'ℳ' => 'ℳ', - 'â„´' => 'ℴ', - 'ℵ' => 'ℵ', - 'ℶ' => 'ℶ', - 'â„·' => 'ℷ', - 'ℸ' => 'ℸ', - 'â……' => 'ⅅ', - 'â…†' => 'ⅆ', - 'â…‡' => 'ⅇ', - 'â…ˆ' => 'ⅈ', - 'â…“' => '⅓', - 'â…”' => '⅔', - 'â…•' => '⅕', - 'â…–' => '⅖', - 'â…—' => '⅗', - 'â…˜' => '⅘', - 'â…™' => '⅙', - 'â…š' => '⅚', - 'â…›' => '⅛', - 'â…œ' => '⅜', - 'â…' => '⅝', - 'â…ž' => '⅞', - 'â†' => '←', - '↑' => '↑', - '→' => '→', - '↓' => '↓', - '↔' => '↔', - '↕' => '↕', - '↖' => '↖', - '↗' => '↗', - '↘' => '↘', - '↙' => '↙', - '↚' => '↚', - '↛' => '↛', - 'â†' => '↝', - 'â†Ì¸' => '&nrarrw', - '↞' => '↞', - '↟' => '↟', - '↠' => '↠', - '↡' => '↡', - '↢' => '↢', - '↣' => '↣', - '↤' => '↤', - '↥' => '↥', - '↦' => '↦', - '↧' => '↧', - '↩' => '↩', - '↪' => '↪', - '↫' => '↫', - '↬' => '↬', - '↭' => '↭', - '↮' => '↮', - '↰' => '↰', - '↱' => '↱', - '↲' => '↲', - '↳' => '↳', - '↵' => '↵', - '↶' => '↶', - '↷' => '↷', - '↺' => '↺', - '↻' => '↻', - '↼' => '↼', - '↽' => '↽', - '↾' => '↾', - '↿' => '↿', - '⇀' => '⇀', - 'â‡' => 
'⇁', - '⇂' => '⇂', - '⇃' => '⇃', - '⇄' => '⇄', - '⇅' => '⇅', - '⇆' => '⇆', - '⇇' => '⇇', - '⇈' => '⇈', - '⇉' => '⇉', - '⇊' => '⇊', - '⇋' => '⇋', - '⇌' => '⇌', - 'â‡' => '⇍', - '⇎' => '⇎', - 'â‡' => '⇏', - 'â‡' => '⇐', - '⇑' => '⇑', - '⇒' => '⇒', - '⇓' => '⇓', - '⇔' => '⇔', - '⇕' => '⇕', - '⇖' => '⇖', - '⇗' => '⇗', - '⇘' => '⇘', - '⇙' => '⇙', - '⇚' => '⇚', - '⇛' => '⇛', - 'â‡' => '⇝', - '⇤' => '⇤', - '⇥' => '⇥', - '⇵' => '⇵', - '⇽' => '⇽', - '⇾' => '⇾', - '⇿' => '⇿', - '∀' => '∀', - 'âˆ' => '∁', - '∂' => '∂', - '∂̸' => '&npart', - '∃' => '∃', - '∄' => '∄', - '∅' => '∅', - '∇' => '∇', - '∈' => '∈', - '∉' => '∉', - '∋' => '∋', - '∌' => '∌', - 'âˆ' => '∏', - 'âˆ' => '∐', - '∑' => '∑', - '−' => '−', - '∓' => '∓', - '∔' => '∔', - '∖' => '∖', - '∗' => '∗', - '∘' => '∘', - '√' => '√', - 'âˆ' => '∝', - '∞' => '∞', - '∟' => '∟', - '∠' => '∠', - '∠⃒' => '&nang', - '∡' => '∡', - '∢' => '∢', - '∣' => '∣', - '∤' => '∤', - '∥' => '∥', - '∦' => '∦', - '∧' => '∧', - '∨' => '∨', - '∩' => '∩', - '∩︀' => '&caps', - '∪' => '∪', - '∪︀' => '&cups', - '∫' => '∫', - '∬' => '∬', - '∭' => '∭', - '∮' => '∮', - '∯' => '∯', - '∰' => '∰', - '∱' => '∱', - '∲' => '∲', - '∳' => '∳', - '∴' => '∴', - '∵' => '∵', - '∶' => '∶', - '∷' => '∷', - '∸' => '∸', - '∺' => '∺', - '∻' => '∻', - '∼' => '∼', - '∼⃒' => '&nvsim', - '∽' => '∽', - '∽̱' => '&race', - '∾' => '∾', - '∾̳' => '&acE', - '∿' => '∿', - '≀' => '≀', - 'â‰' => '≁', - '≂' => '≂', - '≂̸' => '&nesim', - '≃' => '≃', - '≄' => '≄', - '≅' => '≅', - '≆' => '≆', - '≇' => '≇', - '≈' => '≈', - '≉' => '≉', - '≊' => '≊', - '≋' => '≋', - '≋̸' => '&napid', - '≌' => '≌', - 'â‰' => '≍', - 'â‰âƒ’' => '&nvap', - '≎' => '≎', - '≎̸' => '&nbump', - 'â‰' => '≏', - 'â‰Ì¸' => '&nbumpe', - 'â‰' => '≐', - 'â‰Ì¸' => '&nedot', - '≑' => '≑', - '≒' => '≒', - '≓' => '≓', - '≔' => '≔', - '≕' => '≕', - '≖' => '≖', - '≗' => '≗', - '≙' => '≙', - '≚' => '≚', - '≜' => '≜', - '≟' => '≟', - '≠' => '≠', - '≡' => '≡', - '≡⃥' => '&bnequiv', - '≢' => '≢', - '≤' => '≤', - '≤⃒' => '&nvle', 
- '≥' => '≥', - '≥⃒' => '&nvge', - '≦' => '≦', - '≦̸' => '&nlE', - '≧' => '≧', - '≧̸' => '&NotGreaterFullEqual', - '≨' => '≨', - '≨︀' => '&lvertneqq', - '≩' => '≩', - '≩︀' => '&gvertneqq', - '≪' => '≪', - '≪̸' => '&nLtv', - '≪⃒' => '&nLt', - '≫' => '≫', - '≫̸' => '&NotGreaterGreater', - '≫⃒' => '&nGt', - '≬' => '≬', - '≭' => '≭', - '≮' => '≮', - '≯' => '≯', - '≰' => '≰', - '≱' => '≱', - '≲' => '≲', - '≳' => '≳', - '≴' => '≴', - '≵' => '≵', - '≶' => '≶', - '≷' => '≷', - '≸' => '≸', - '≹' => '≹', - '≺' => '≺', - '≻' => '≻', - '≼' => '≼', - '≽' => '≽', - '≾' => '≾', - '≿' => '≿', - '≿̸' => '&NotSucceedsTilde', - '⊀' => '⊀', - 'âŠ' => '⊁', - '⊂' => '⊂', - '⊂⃒' => '&vnsub', - '⊃' => '⊃', - '⊃⃒' => '&nsupset', - '⊄' => '⊄', - '⊅' => '⊅', - '⊆' => '⊆', - '⊇' => '⊇', - '⊈' => '⊈', - '⊉' => '⊉', - '⊊' => '⊊', - '⊊︀' => '&vsubne', - '⊋' => '⊋', - '⊋︀' => '&vsupne', - 'âŠ' => '⊍', - '⊎' => '⊎', - 'âŠ' => '⊏', - 'âŠÌ¸' => '&NotSquareSubset', - 'âŠ' => '⊐', - 'âŠÌ¸' => '&NotSquareSuperset', - '⊑' => '⊑', - '⊒' => '⊒', - '⊓' => '⊓', - '⊓︀' => '&sqcaps', - '⊔' => '⊔', - '⊔︀' => '&sqcups', - '⊕' => '⊕', - '⊖' => '⊖', - '⊗' => '⊗', - '⊘' => '⊘', - '⊙' => '⊙', - '⊚' => '⊚', - '⊛' => '⊛', - 'âŠ' => '⊝', - '⊞' => '⊞', - '⊟' => '⊟', - '⊠' => '⊠', - '⊡' => '⊡', - '⊢' => '⊢', - '⊣' => '⊣', - '⊤' => '⊤', - '⊥' => '⊥', - '⊧' => '⊧', - '⊨' => '⊨', - '⊩' => '⊩', - '⊪' => '⊪', - '⊫' => '⊫', - '⊬' => '⊬', - '⊭' => '⊭', - '⊮' => '⊮', - '⊯' => '⊯', - '⊰' => '⊰', - '⊲' => '⊲', - '⊳' => '⊳', - '⊴' => '⊴', - '⊴⃒' => '&nvltrie', - '⊵' => '⊵', - '⊵⃒' => '&nvrtrie', - '⊶' => '⊶', - '⊷' => '⊷', - '⊸' => '⊸', - '⊹' => '⊹', - '⊺' => '⊺', - '⊻' => '⊻', - '⊽' => '⊽', - '⊾' => '⊾', - '⊿' => '⊿', - 'â‹€' => '⋀', - 'â‹' => '⋁', - 'â‹‚' => '⋂', - '⋃' => '⋃', - 'â‹„' => '⋄', - 'â‹…' => '⋅', - '⋆' => '⋆', - '⋇' => '⋇', - '⋈' => '⋈', - '⋉' => '⋉', - 'â‹Š' => '⋊', - 'â‹‹' => '⋋', - 'â‹Œ' => '⋌', - 'â‹' => '⋍', - 'â‹Ž' => '⋎', - 'â‹' => '⋏', - 'â‹' => '⋐', - 'â‹‘' => '⋑', - 'â‹’' => '⋒', - 'â‹“' => '⋓', - 'â‹”' => 
'⋔', - 'â‹•' => '⋕', - 'â‹–' => '⋖', - 'â‹—' => '⋗', - '⋘' => '⋘', - '⋘̸' => '&nLl', - 'â‹™' => '⋙', - '⋙̸' => '&nGg', - 'â‹š' => '⋚', - '⋚︀' => '&lesg', - 'â‹›' => '⋛', - '⋛︀' => '&gesl', - 'â‹ž' => '⋞', - 'â‹Ÿ' => '⋟', - 'â‹ ' => '⋠', - 'â‹¡' => '⋡', - 'â‹¢' => '⋢', - 'â‹£' => '⋣', - '⋦' => '⋦', - '⋧' => '⋧', - '⋨' => '⋨', - 'â‹©' => '⋩', - '⋪' => '⋪', - 'â‹«' => '⋫', - '⋬' => '⋬', - 'â‹­' => '⋭', - 'â‹®' => '⋮', - '⋯' => '⋯', - 'â‹°' => '⋰', - '⋱' => '⋱', - '⋲' => '⋲', - '⋳' => '⋳', - 'â‹´' => '⋴', - '⋵' => '⋵', - '⋵̸' => '¬indot', - '⋶' => '⋶', - 'â‹·' => '⋷', - '⋹' => '⋹', - '⋹̸' => '¬inE', - '⋺' => '⋺', - 'â‹»' => '⋻', - '⋼' => '⋼', - '⋽' => '⋽', - '⋾' => '⋾', - '⌅' => '⌅', - '⌆' => '⌆', - '⌈' => '⌈', - '⌉' => '⌉', - '⌊' => '⌊', - '⌋' => '⌋', - '⌌' => '⌌', - 'âŒ' => '⌍', - '⌎' => '⌎', - 'âŒ' => '⌏', - 'âŒ' => '⌐', - '⌒' => '⌒', - '⌓' => '⌓', - '⌕' => '⌕', - '⌖' => '⌖', - '⌜' => '⌜', - 'âŒ' => '⌝', - '⌞' => '⌞', - '⌟' => '⌟', - '⌢' => '⌢', - '⌣' => '⌣', - '⌭' => '⌭', - '⌮' => '⌮', - '⌶' => '⌶', - '⌽' => '⌽', - '⌿' => '⌿', - 'â¼' => '⍼', - '⎰' => '⎰', - '⎱' => '⎱', - '⎴' => '⎴', - '⎵' => '⎵', - '⎶' => '⎶', - 'âœ' => '⏜', - 'â' => '⏝', - 'âž' => '⏞', - 'âŸ' => '⏟', - 'â¢' => '⏢', - 'â§' => '⏧', - 'â£' => '␣', - 'Ⓢ' => 'Ⓢ', - '─' => '─', - '│' => '│', - '┌' => '┌', - 'â”' => '┐', - 'â””' => '└', - '┘' => '┘', - '├' => '├', - '┤' => '┤', - '┬' => '┬', - 'â”´' => '┴', - '┼' => '┼', - 'â•' => '═', - 'â•‘' => '║', - 'â•’' => '╒', - 'â•“' => '╓', - 'â•”' => '╔', - 'â••' => '╕', - 'â•–' => '╖', - 'â•—' => '╗', - '╘' => '╘', - 'â•™' => '╙', - 'â•š' => '╚', - 'â•›' => '╛', - 'â•œ' => '╜', - 'â•' => '╝', - 'â•ž' => '╞', - 'â•Ÿ' => '╟', - 'â• ' => '╠', - 'â•¡' => '╡', - 'â•¢' => '╢', - 'â•£' => '╣', - '╤' => '╤', - 'â•¥' => '╥', - '╦' => '╦', - '╧' => '╧', - '╨' => '╨', - 'â•©' => '╩', - '╪' => '╪', - 'â•«' => '╫', - '╬' => '╬', - 'â–€' => '▀', - 'â–„' => '▄', - 'â–ˆ' => '█', - 'â–‘' => '░', - 'â–’' => '▒', - 'â–“' => '▓', - 'â–¡' => '□', - 'â–ª' => '▪', - 'â–«' => '▫', - 
'â–­' => '▭', - 'â–®' => '▮', - 'â–±' => '▱', - 'â–³' => '△', - 'â–´' => '▴', - 'â–µ' => '▵', - 'â–¸' => '▸', - 'â–¹' => '▹', - 'â–½' => '▽', - 'â–¾' => '▾', - 'â–¿' => '▿', - 'â—‚' => '◂', - 'â—ƒ' => '◃', - 'â—Š' => '◊', - 'â—‹' => '○', - 'â—¬' => '◬', - 'â—¯' => '◯', - 'â—¸' => '◸', - 'â—¹' => '◹', - 'â—º' => '◺', - 'â—»' => '◻', - 'â—¼' => '◼', - '★' => '★', - '☆' => '☆', - '☎' => '☎', - '♀' => '♀', - '♂' => '♂', - 'â™ ' => '♠', - '♣' => '♣', - '♥' => '♥', - '♦' => '♦', - '♪' => '♪', - 'â™­' => '♭', - 'â™®' => '♮', - '♯' => '♯', - '✓' => '✓', - '✗' => '✗', - '✠' => '✠', - '✶' => '✶', - 'â˜' => '❘', - 'â²' => '❲', - 'â³' => '❳', - '⟈' => '⟈', - '⟉' => '⟉', - '⟦' => '⟦', - '⟧' => '⟧', - '⟨' => '⟨', - '⟩' => '⟩', - '⟪' => '⟪', - '⟫' => '⟫', - '⟬' => '⟬', - '⟭' => '⟭', - '⟵' => '⟵', - '⟶' => '⟶', - '⟷' => '⟷', - '⟸' => '⟸', - '⟹' => '⟹', - '⟺' => '⟺', - '⟼' => '⟼', - '⟿' => '⟿', - '⤂' => '⤂', - '⤃' => '⤃', - '⤄' => '⤄', - '⤅' => '⤅', - '⤌' => '⤌', - 'â¤' => '⤍', - '⤎' => '⤎', - 'â¤' => '⤏', - 'â¤' => '⤐', - '⤑' => '⤑', - '⤒' => '⤒', - '⤓' => '⤓', - '⤖' => '⤖', - '⤙' => '⤙', - '⤚' => '⤚', - '⤛' => '⤛', - '⤜' => '⤜', - 'â¤' => '⤝', - '⤞' => '⤞', - '⤟' => '⤟', - '⤠' => '⤠', - '⤣' => '⤣', - '⤤' => '⤤', - '⤥' => '⤥', - '⤦' => '⤦', - '⤧' => '⤧', - '⤨' => '⤨', - '⤩' => '⤩', - '⤪' => '⤪', - '⤳' => '⤳', - '⤳̸' => '&nrarrc', - '⤵' => '⤵', - '⤶' => '⤶', - '⤷' => '⤷', - '⤸' => '⤸', - '⤹' => '⤹', - '⤼' => '⤼', - '⤽' => '⤽', - '⥅' => '⥅', - '⥈' => '⥈', - '⥉' => '⥉', - '⥊' => '⥊', - '⥋' => '⥋', - '⥎' => '⥎', - 'â¥' => '⥏', - 'â¥' => '⥐', - '⥑' => '⥑', - '⥒' => '⥒', - '⥓' => '⥓', - '⥔' => '⥔', - '⥕' => '⥕', - '⥖' => '⥖', - '⥗' => '⥗', - '⥘' => '⥘', - '⥙' => '⥙', - '⥚' => '⥚', - '⥛' => '⥛', - '⥜' => '⥜', - 'â¥' => '⥝', - '⥞' => '⥞', - '⥟' => '⥟', - '⥠' => '⥠', - '⥡' => '⥡', - '⥢' => '⥢', - '⥣' => '⥣', - '⥤' => '⥤', - '⥥' => '⥥', - '⥦' => '⥦', - '⥧' => '⥧', - '⥨' => '⥨', - '⥩' => '⥩', - '⥪' => '⥪', - '⥫' => '⥫', - '⥬' => '⥬', - '⥭' => '⥭', - '⥮' => '⥮', - '⥯' => '⥯', - '⥰' => '⥰', - 
'⥱' => '⥱', - '⥲' => '⥲', - '⥳' => '⥳', - '⥴' => '⥴', - '⥵' => '⥵', - '⥶' => '⥶', - '⥸' => '⥸', - '⥹' => '⥹', - '⥻' => '⥻', - '⥼' => '⥼', - '⥽' => '⥽', - '⥾' => '⥾', - '⥿' => '⥿', - '⦅' => '⦅', - '⦆' => '⦆', - '⦋' => '⦋', - '⦌' => '⦌', - 'â¦' => '⦍', - '⦎' => '⦎', - 'â¦' => '⦏', - 'â¦' => '⦐', - '⦑' => '⦑', - '⦒' => '⦒', - '⦓' => '⦓', - '⦔' => '⦔', - '⦕' => '⦕', - '⦖' => '⦖', - '⦚' => '⦚', - '⦜' => '⦜', - 'â¦' => '⦝', - '⦤' => '⦤', - '⦥' => '⦥', - '⦦' => '⦦', - '⦧' => '⦧', - '⦨' => '⦨', - '⦩' => '⦩', - '⦪' => '⦪', - '⦫' => '⦫', - '⦬' => '⦬', - '⦭' => '⦭', - '⦮' => '⦮', - '⦯' => '⦯', - '⦰' => '⦰', - '⦱' => '⦱', - '⦲' => '⦲', - '⦳' => '⦳', - '⦴' => '⦴', - '⦵' => '⦵', - '⦶' => '⦶', - '⦷' => '⦷', - '⦹' => '⦹', - '⦻' => '⦻', - '⦼' => '⦼', - '⦾' => '⦾', - '⦿' => '⦿', - '⧀' => '⧀', - 'â§' => '⧁', - '⧂' => '⧂', - '⧃' => '⧃', - '⧄' => '⧄', - '⧅' => '⧅', - '⧉' => '⧉', - 'â§' => '⧍', - '⧎' => '⧎', - 'â§' => '⧏', - 'â§Ì¸' => '&NotLeftTriangleBar', - 'â§' => '⧐', - 'â§Ì¸' => '&NotRightTriangleBar', - '⧜' => '⧜', - 'â§' => '⧝', - '⧞' => '⧞', - '⧣' => '⧣', - '⧤' => '⧤', - '⧥' => '⧥', - '⧫' => '⧫', - '⧴' => '⧴', - '⧶' => '⧶', - '⨀' => '⨀', - 'â¨' => '⨁', - '⨂' => '⨂', - '⨄' => '⨄', - '⨆' => '⨆', - '⨌' => '⨌', - 'â¨' => '⨍', - 'â¨' => '⨐', - '⨑' => '⨑', - '⨒' => '⨒', - '⨓' => '⨓', - '⨔' => '⨔', - '⨕' => '⨕', - '⨖' => '⨖', - '⨗' => '⨗', - '⨢' => '⨢', - '⨣' => '⨣', - '⨤' => '⨤', - '⨥' => '⨥', - '⨦' => '⨦', - '⨧' => '⨧', - '⨩' => '⨩', - '⨪' => '⨪', - '⨭' => '⨭', - '⨮' => '⨮', - '⨯' => '⨯', - '⨰' => '⨰', - '⨱' => '⨱', - '⨳' => '⨳', - '⨴' => '⨴', - '⨵' => '⨵', - '⨶' => '⨶', - '⨷' => '⨷', - '⨸' => '⨸', - '⨹' => '⨹', - '⨺' => '⨺', - '⨻' => '⨻', - '⨼' => '⨼', - '⨿' => '⨿', - 'â©€' => '⩀', - 'â©‚' => '⩂', - '⩃' => '⩃', - 'â©„' => '⩄', - 'â©…' => '⩅', - '⩆' => '⩆', - '⩇' => '⩇', - '⩈' => '⩈', - '⩉' => '⩉', - 'â©Š' => '⩊', - 'â©‹' => '⩋', - 'â©Œ' => '⩌', - 'â©' => '⩍', - 'â©' => '⩐', - 'â©“' => '⩓', - 'â©”' => '⩔', - 'â©•' => '⩕', - 'â©–' => '⩖', - 'â©—' => '⩗', - '⩘' => '⩘', - 'â©š' => '⩚', 
- 'â©›' => '⩛', - 'â©œ' => '⩜', - 'â©' => '⩝', - 'â©Ÿ' => '⩟', - '⩦' => '⩦', - '⩪' => '⩪', - 'â©­' => '⩭', - '⩭̸' => '&ncongdot', - 'â©®' => '⩮', - '⩯' => '⩯', - 'â©°' => '⩰', - '⩰̸' => '&napE', - '⩱' => '⩱', - '⩲' => '⩲', - '⩳' => '⩳', - 'â©´' => '⩴', - '⩵' => '⩵', - 'â©·' => '⩷', - '⩸' => '⩸', - '⩹' => '⩹', - '⩺' => '⩺', - 'â©»' => '⩻', - '⩼' => '⩼', - '⩽' => '⩽', - '⩽̸' => '&nles', - '⩾' => '⩾', - '⩾̸' => '&nges', - 'â©¿' => '⩿', - '⪀' => '⪀', - 'âª' => '⪁', - '⪂' => '⪂', - '⪃' => '⪃', - '⪄' => '⪄', - '⪅' => '⪅', - '⪆' => '⪆', - '⪇' => '⪇', - '⪈' => '⪈', - '⪉' => '⪉', - '⪊' => '⪊', - '⪋' => '⪋', - '⪌' => '⪌', - 'âª' => '⪍', - '⪎' => '⪎', - 'âª' => '⪏', - 'âª' => '⪐', - '⪑' => '⪑', - '⪒' => '⪒', - '⪓' => '⪓', - '⪔' => '⪔', - '⪕' => '⪕', - '⪖' => '⪖', - '⪗' => '⪗', - '⪘' => '⪘', - '⪙' => '⪙', - '⪚' => '⪚', - 'âª' => '⪝', - '⪞' => '⪞', - '⪟' => '⪟', - '⪠' => '⪠', - '⪡' => '⪡', - '⪡̸' => '&NotNestedLessLess', - '⪢' => '⪢', - '⪢̸' => '&NotNestedGreaterGreater', - '⪤' => '⪤', - '⪥' => '⪥', - '⪦' => '⪦', - '⪧' => '⪧', - '⪨' => '⪨', - '⪩' => '⪩', - '⪪' => '⪪', - '⪫' => '⪫', - '⪬' => '⪬', - '⪬︀' => '&smtes', - '⪭' => '⪭', - '⪭︀' => '&lates', - '⪮' => '⪮', - '⪯' => '⪯', - '⪯̸' => '&NotPrecedesEqual', - '⪰' => '⪰', - '⪰̸' => '&NotSucceedsEqual', - '⪳' => '⪳', - '⪴' => '⪴', - '⪵' => '⪵', - '⪶' => '⪶', - '⪷' => '⪷', - '⪸' => '⪸', - '⪹' => '⪹', - '⪺' => '⪺', - '⪻' => '⪻', - '⪼' => '⪼', - '⪽' => '⪽', - '⪾' => '⪾', - '⪿' => '⪿', - 'â«€' => '⫀', - 'â«' => '⫁', - 'â«‚' => '⫂', - '⫃' => '⫃', - 'â«„' => '⫄', - 'â«…' => '⫅', - '⫅̸' => '&nsubE', - '⫆' => '⫆', - '⫆̸' => '&nsupseteqq', - '⫇' => '⫇', - '⫈' => '⫈', - 'â«‹' => '⫋', - '⫋︀' => '&vsubnE', - 'â«Œ' => '⫌', - '⫌︀' => '&varsupsetneqq', - 'â«' => '⫏', - 'â«' => '⫐', - 'â«‘' => '⫑', - 'â«’' => '⫒', - 'â«“' => '⫓', - 'â«”' => '⫔', - 'â«•' => '⫕', - 'â«–' => '⫖', - 'â«—' => '⫗', - '⫘' => '⫘', - 'â«™' => '⫙', - 'â«š' => '⫚', - 'â«›' => '⫛', - '⫤' => '⫤', - '⫦' => '⫦', - '⫧' => '⫧', - '⫨' => '⫨', - 'â«©' => '⫩', - 'â««' => '⫫', - '⫬' 
=> '⫬', - 'â«­' => '⫭', - 'â«®' => '⫮', - '⫯' => '⫯', - 'â«°' => '⫰', - '⫱' => '⫱', - '⫲' => '⫲', - '⫳' => '⫳', - '⫽︀' => '&varsupsetneqq', - 'ff' => 'ff', - 'ï¬' => 'fi', - 'fl' => 'fl', - 'ffi' => 'ffi', - 'ffl' => 'ffl', - 'ð’œ' => '𝒜', - 'ð’ž' => '𝒞', - 'ð’Ÿ' => '𝒟', - 'ð’¢' => '𝒢', - 'ð’¥' => '𝒥', - 'ð’¦' => '𝒦', - 'ð’©' => '𝒩', - 'ð’ª' => '𝒪', - 'ð’«' => '𝒫', - 'ð’¬' => '𝒬', - 'ð’®' => '𝒮', - 'ð’¯' => '𝒯', - 'ð’°' => '𝒰', - 'ð’±' => '𝒱', - 'ð’²' => '𝒲', - 'ð’³' => '𝒳', - 'ð’´' => '𝒴', - 'ð’µ' => '𝒵', - 'ð’¶' => '𝒶', - 'ð’·' => '𝒷', - 'ð’¸' => '𝒸', - 'ð’¹' => '𝒹', - 'ð’»' => '𝒻', - 'ð’½' => '𝒽', - 'ð’¾' => '𝒾', - 'ð’¿' => '𝒿', - 'ð“€' => '𝓀', - 'ð“' => '𝓁', - 'ð“‚' => '𝓂', - 'ð“ƒ' => '𝓃', - 'ð“…' => '𝓅', - 'ð“†' => '𝓆', - 'ð“‡' => '𝓇', - 'ð“ˆ' => '𝓈', - 'ð“‰' => '𝓉', - 'ð“Š' => '𝓊', - 'ð“‹' => '𝓋', - 'ð“Œ' => '𝓌', - 'ð“' => '𝓍', - 'ð“Ž' => '𝓎', - 'ð“' => '𝓏', - 'ð”„' => '𝔄', - 'ð”…' => '𝔅', - 'ð”‡' => '𝔇', - 'ð”ˆ' => '𝔈', - 'ð”‰' => '𝔉', - 'ð”Š' => '𝔊', - 'ð”' => '𝔍', - 'ð”Ž' => '𝔎', - 'ð”' => '𝔏', - 'ð”' => '𝔐', - 'ð”‘' => '𝔑', - 'ð”’' => '𝔒', - 'ð”“' => '𝔓', - 'ð””' => '𝔔', - 'ð”–' => '𝔖', - 'ð”—' => '𝔗', - 'ð”˜' => '𝔘', - 'ð”™' => '𝔙', - 'ð”š' => '𝔚', - 'ð”›' => '𝔛', - 'ð”œ' => '𝔜', - 'ð”ž' => '𝔞', - 'ð”Ÿ' => '𝔟', - 'ð” ' => '𝔠', - 'ð”¡' => '𝔡', - 'ð”¢' => '𝔢', - 'ð”£' => '𝔣', - 'ð”¤' => '𝔤', - 'ð”¥' => '𝔥', - 'ð”¦' => '𝔦', - 'ð”§' => '𝔧', - 'ð”¨' => '𝔨', - 'ð”©' => '𝔩', - 'ð”ª' => '𝔪', - 'ð”«' => '𝔫', - 'ð”¬' => '𝔬', - 'ð”­' => '𝔭', - 'ð”®' => '𝔮', - 'ð”¯' => '𝔯', - 'ð”°' => '𝔰', - 'ð”±' => '𝔱', - 'ð”²' => '𝔲', - 'ð”³' => '𝔳', - 'ð”´' => '𝔴', - 'ð”µ' => '𝔵', - 'ð”¶' => '𝔶', - 'ð”·' => '𝔷', - 'ð”¸' => '𝔸', - 'ð”¹' => '𝔹', - 'ð”»' => '𝔻', - 'ð”¼' => '𝔼', - 'ð”½' => '𝔽', - 'ð”¾' => '𝔾', - 'ð•€' => '𝕀', - 'ð•' => '𝕁', - 'ð•‚' => '𝕂', - 'ð•ƒ' => '𝕃', - 'ð•„' => '𝕄', - 'ð•†' => '𝕆', - 'ð•Š' => '𝕊', - 'ð•‹' => '𝕋', - 'ð•Œ' => '𝕌', - 'ð•' => '𝕍', - 'ð•Ž' => '𝕎', - 'ð•' => '𝕏', - 'ð•' => '𝕐', - 'ð•’' => '𝕒', - 'ð•“' => '𝕓', - 'ð•”' => '𝕔', - 'ð••' => '𝕕', - 'ð•–' 
=> '𝕖', - 'ð•—' => '𝕗', - 'ð•˜' => '𝕘', - 'ð•™' => '𝕙', - 'ð•š' => '𝕚', - 'ð•›' => '𝕛', - 'ð•œ' => '𝕜', - 'ð•' => '𝕝', - 'ð•ž' => '𝕞', - 'ð•Ÿ' => '𝕟', - 'ð• ' => '𝕠', - 'ð•¡' => '𝕡', - 'ð•¢' => '𝕢', - 'ð•£' => '𝕣', - 'ð•¤' => '𝕤', - 'ð•¥' => '𝕥', - 'ð•¦' => '𝕦', - 'ð•§' => '𝕧', - 'ð•¨' => '𝕨', - 'ð•©' => '𝕩', - 'ð•ª' => '𝕪', - 'ð•«' => '𝕫', - ); +class HTML5Entities +{ + + public static $map = array( + ' ' => ' ', + "\n" => ' ', + '!' => '!', + '"' => '"', + '#' => '#', + '$' => '$', + '%' => '%', + '&' => '&', + '\'' => ''', + '(' => '(', + ')' => ')', + '*' => '*', + '+' => '+', + ',' => ',', + '.' => '.', + '/' => '/', + ':' => ':', + ';' => ';', + '<' => '<', + '<⃒' => '&nvlt', + '=' => '=', + '=⃥' => '&bne', + '>' => '>', + '>⃒' => '&nvgt', + '?' => '?', + '@' => '@', + '[' => '[', + '\\' => '\', + ']' => ']', + '^' => '^', + '_' => '_', + '`' => '`', + 'fj' => '&fjlig', + '{' => '{', + '|' => '|', + '}' => '}', + ' ' => ' ', + '¡' => '¡', + '¢' => '¢', + '£' => '£', + '¤' => '¤', + 'Â¥' => '¥', + '¦' => '¦', + '§' => '§', + '¨' => '¨', + '©' => '©', + 'ª' => 'ª', + '«' => '«', + '¬' => '¬', + '­' => '­', + '®' => '®', + '¯' => '¯', + '°' => '°', + '±' => '±', + '²' => '²', + '³' => '³', + '´' => '´', + 'µ' => 'µ', + '¶' => '¶', + '·' => '·', + '¸' => '¸', + '¹' => '¹', + 'º' => 'º', + '»' => '»', + '¼' => '¼', + '½' => '½', + '¾' => '¾', + '¿' => '¿', + 'À' => 'À', + 'Ã' => 'Á', + 'Â' => 'Â', + 'Ã' => 'Ã', + 'Ä' => 'Ä', + 'Ã…' => 'Å', + 'Æ' => 'Æ', + 'Ç' => 'Ç', + 'È' => 'È', + 'É' => 'É', + 'Ê' => 'Ê', + 'Ë' => 'Ë', + 'ÃŒ' => 'Ì', + 'Ã' => 'Í', + 'ÃŽ' => 'Î', + 'Ã' => 'Ï', + 'Ã' => 'Ð', + 'Ñ' => 'Ñ', + 'Ã’' => 'Ò', + 'Ó' => 'Ó', + 'Ô' => 'Ô', + 'Õ' => 'Õ', + 'Ö' => 'Ö', + '×' => '×', + 'Ø' => 'Ø', + 'Ù' => 'Ù', + 'Ú' => 'Ú', + 'Û' => 'Û', + 'Ãœ' => 'Ü', + 'Ã' => 'Ý', + 'Þ' => 'Þ', + 'ß' => 'ß', + 'à' => 'à', + 'á' => 'á', + 'â' => 'â', + 'ã' => 'ã', + 'ä' => 'ä', + 'Ã¥' => 'å', + 'æ' => 'æ', + 'ç' => 'ç', + 'è' => 'è', + 'é' => 'é', + 'ê' => 'ê', + 'ë' => 
'ë', + 'ì' => 'ì', + 'í' => 'í', + 'î' => 'î', + 'ï' => 'ï', + 'ð' => 'ð', + 'ñ' => 'ñ', + 'ò' => 'ò', + 'ó' => 'ó', + 'ô' => 'ô', + 'õ' => 'õ', + 'ö' => 'ö', + '÷' => '÷', + 'ø' => 'ø', + 'ù' => 'ù', + 'ú' => 'ú', + 'û' => 'û', + 'ü' => 'ü', + 'ý' => 'ý', + 'þ' => 'þ', + 'ÿ' => 'ÿ', + 'Ä€' => 'Ā', + 'Ä' => 'ā', + 'Ä‚' => 'Ă', + 'ă' => 'ă', + 'Ä„' => 'Ą', + 'Ä…' => 'ą', + 'Ć' => 'Ć', + 'ć' => 'ć', + 'Ĉ' => 'Ĉ', + 'ĉ' => 'ĉ', + 'ÄŠ' => 'Ċ', + 'Ä‹' => 'ċ', + 'ÄŒ' => 'Č', + 'Ä' => 'č', + 'ÄŽ' => 'Ď', + 'Ä' => 'ď', + 'Ä' => 'Đ', + 'Ä‘' => 'đ', + 'Ä’' => 'Ē', + 'Ä“' => 'ē', + 'Ä–' => 'Ė', + 'Ä—' => 'ė', + 'Ę' => 'Ę', + 'Ä™' => 'ę', + 'Äš' => 'Ě', + 'Ä›' => 'ě', + 'Äœ' => 'Ĝ', + 'Ä' => 'ĝ', + 'Äž' => 'Ğ', + 'ÄŸ' => 'ğ', + 'Ä ' => 'Ġ', + 'Ä¡' => 'ġ', + 'Ä¢' => 'Ģ', + 'Ĥ' => 'Ĥ', + 'Ä¥' => 'ĥ', + 'Ħ' => 'Ħ', + 'ħ' => 'ħ', + 'Ĩ' => 'Ĩ', + 'Ä©' => 'ĩ', + 'Ī' => 'Ī', + 'Ä«' => 'ī', + 'Ä®' => 'Į', + 'į' => 'į', + 'Ä°' => 'İ', + 'ı' => 'ı', + 'IJ' => 'IJ', + 'ij' => 'ij', + 'Ä´' => 'Ĵ', + 'ĵ' => 'ĵ', + 'Ķ' => 'Ķ', + 'Ä·' => 'ķ', + 'ĸ' => 'ĸ', + 'Ĺ' => 'Ĺ', + 'ĺ' => 'ĺ', + 'Ä»' => 'Ļ', + 'ļ' => 'ļ', + 'Ľ' => 'Ľ', + 'ľ' => 'ľ', + 'Ä¿' => 'Ŀ', + 'Å€' => 'ŀ', + 'Å' => 'Ł', + 'Å‚' => 'ł', + 'Ń' => 'Ń', + 'Å„' => 'ń', + 'Å…' => 'Ņ', + 'ņ' => 'ņ', + 'Ň' => 'Ň', + 'ň' => 'ň', + 'ʼn' => 'ʼn', + 'ÅŠ' => 'Ŋ', + 'Å‹' => 'ŋ', + 'ÅŒ' => 'Ō', + 'Å' => 'ō', + 'Å' => 'Ő', + 'Å‘' => 'ő', + 'Å’' => 'Œ', + 'Å“' => 'œ', + 'Å”' => 'Ŕ', + 'Å•' => 'ŕ', + 'Å–' => 'Ŗ', + 'Å—' => 'ŗ', + 'Ř' => 'Ř', + 'Å™' => 'ř', + 'Åš' => 'Ś', + 'Å›' => 'ś', + 'Åœ' => 'Ŝ', + 'Å' => 'ŝ', + 'Åž' => 'Ş', + 'ÅŸ' => 'ş', + 'Å ' => 'Š', + 'Å¡' => 'š', + 'Å¢' => 'Ţ', + 'Å£' => 'ţ', + 'Ť' => 'Ť', + 'Å¥' => 'ť', + 'Ŧ' => 'Ŧ', + 'ŧ' => 'ŧ', + 'Ũ' => 'Ũ', + 'Å©' => 'ũ', + 'Ū' => 'Ū', + 'Å«' => 'ū', + 'Ŭ' => 'Ŭ', + 'Å­' => 'ŭ', + 'Å®' => 'Ů', + 'ů' => 'ů', + 'Å°' => 'Ű', + 'ű' => 'ű', + 'Ų' => 'Ų', + 'ų' => 'ų', + 'Å´' => 'Ŵ', + 'ŵ' => 'ŵ', + 'Ŷ' => 'Ŷ', + 'Å·' => 'ŷ', + 'Ÿ' => 'Ÿ', + 'Ź' => 'Ź', + 'ź' => 'ź', + 'Å»' => 'Ż', + 'ż' 
=> 'ż', + 'Ž' => 'Ž', + 'ž' => 'ž', + 'Æ’' => 'ƒ', + 'Ƶ' => 'Ƶ', + 'ǵ' => 'ǵ', + 'È·' => 'ȷ', + 'ˆ' => 'ˆ', + 'ˇ' => 'ˇ', + '˘' => '˘', + 'Ë™' => '˙', + 'Ëš' => '˚', + 'Ë›' => '˛', + 'Ëœ' => '˜', + 'Ë' => '˝', + 'Ì‘' => '̑', + 'Α' => 'Α', + 'Î’' => 'Β', + 'Γ' => 'Γ', + 'Δ' => 'Δ', + 'Ε' => 'Ε', + 'Ζ' => 'Ζ', + 'Η' => 'Η', + 'Θ' => 'Θ', + 'Ι' => 'Ι', + 'Κ' => 'Κ', + 'Λ' => 'Λ', + 'Îœ' => 'Μ', + 'Î' => 'Ν', + 'Ξ' => 'Ξ', + 'Ο' => 'Ο', + 'Π' => 'Π', + 'Ρ' => 'Ρ', + 'Σ' => 'Σ', + 'Τ' => 'Τ', + 'Î¥' => 'Υ', + 'Φ' => 'Φ', + 'Χ' => 'Χ', + 'Ψ' => 'Ψ', + 'Ω' => 'Ω', + 'α' => 'α', + 'β' => 'β', + 'γ' => 'γ', + 'δ' => 'δ', + 'ε' => 'ε', + 'ζ' => 'ζ', + 'η' => 'η', + 'θ' => 'θ', + 'ι' => 'ι', + 'κ' => 'κ', + 'λ' => 'λ', + 'μ' => 'μ', + 'ν' => 'ν', + 'ξ' => 'ξ', + 'ο' => 'ο', + 'Ï€' => 'π', + 'Ï' => 'ρ', + 'Ï‚' => 'ς', + 'σ' => 'σ', + 'Ï„' => 'τ', + 'Ï…' => 'υ', + 'φ' => 'φ', + 'χ' => 'χ', + 'ψ' => 'ψ', + 'ω' => 'ω', + 'Ï‘' => 'ϑ', + 'Ï’' => 'ϒ', + 'Ï•' => 'ϕ', + 'Ï–' => 'ϖ', + 'Ïœ' => 'Ϝ', + 'Ï' => 'ϝ', + 'Ï°' => 'ϰ', + 'ϱ' => 'ϱ', + 'ϵ' => 'ϵ', + '϶' => '϶', + 'Ð' => 'Ё', + 'Ђ' => 'Ђ', + 'Ѓ' => 'Ѓ', + 'Є' => 'Є', + 'Ð…' => 'Ѕ', + 'І' => 'І', + 'Ї' => 'Ї', + 'Ј' => 'Ј', + 'Љ' => 'Љ', + 'Њ' => 'Њ', + 'Ћ' => 'Ћ', + 'ÐŒ' => 'Ќ', + 'ÐŽ' => 'Ў', + 'Ð' => 'Џ', + 'Ð' => 'А', + 'Б' => 'Б', + 'Ð’' => 'В', + 'Г' => 'Г', + 'Д' => 'Д', + 'Е' => 'Е', + 'Ж' => 'Ж', + 'З' => 'З', + 'И' => 'И', + 'Й' => 'Й', + 'К' => 'К', + 'Л' => 'Л', + 'Ðœ' => 'М', + 'Ð' => 'Н', + 'О' => 'О', + 'П' => 'П', + 'Р' => 'Р', + 'С' => 'С', + 'Т' => 'Т', + 'У' => 'У', + 'Ф' => 'Ф', + 'Ð¥' => 'Х', + 'Ц' => 'Ц', + 'Ч' => 'Ч', + 'Ш' => 'Ш', + 'Щ' => 'Щ', + 'Ъ' => 'Ъ', + 'Ы' => 'Ы', + 'Ь' => 'Ь', + 'Э' => 'Э', + 'Ю' => 'Ю', + 'Я' => 'Я', + 'а' => 'а', + 'б' => 'б', + 'в' => 'в', + 'г' => 'г', + 'д' => 'д', + 'е' => 'е', + 'ж' => 'ж', + 'з' => 'з', + 'и' => 'и', + 'й' => 'й', + 'к' => 'к', + 'л' => 'л', + 'м' => 'м', + 'н' => 'н', + 'о' => 'о', + 'п' => 'п', + 'Ñ€' => 'р', + 'Ñ' => 'с', + 'Ñ‚' => 'т', + 'у' => 'у', + 
'Ñ„' => 'ф', + 'Ñ…' => 'х', + 'ц' => 'ц', + 'ч' => 'ч', + 'ш' => 'ш', + 'щ' => 'щ', + 'ÑŠ' => 'ъ', + 'Ñ‹' => 'ы', + 'ÑŒ' => 'ь', + 'Ñ' => 'э', + 'ÑŽ' => 'ю', + 'Ñ' => 'я', + 'Ñ‘' => 'ё', + 'Ñ’' => 'ђ', + 'Ñ“' => 'ѓ', + 'Ñ”' => 'є', + 'Ñ•' => 'ѕ', + 'Ñ–' => 'і', + 'Ñ—' => 'ї', + 'ј' => 'ј', + 'Ñ™' => 'љ', + 'Ñš' => 'њ', + 'Ñ›' => 'ћ', + 'Ñœ' => 'ќ', + 'Ñž' => 'ў', + 'ÑŸ' => 'џ', + ' ' => ' ', + ' ' => ' ', + ' ' => ' ', + ' ' => ' ', + ' ' => ' ', + ' ' => ' ', + ' ' => ' ', + ' ' => ' ', + '​' => '​', + '‌' => '‌', + 'â€' => '‍', + '‎' => '‎', + 'â€' => '‏', + 'â€' => '‐', + '–' => '–', + '—' => '—', + '―' => '―', + '‖' => '‖', + '‘' => '‘', + '’' => '’', + '‚' => '‚', + '“' => '“', + 'â€' => '”', + '„' => '„', + '†' => '†', + '‡' => '‡', + '•' => '•', + '‥' => '‥', + '…' => '…', + '‰' => '‰', + '‱' => '‱', + '′' => '′', + '″' => '″', + '‴' => '‴', + '‵' => '‵', + '‹' => '‹', + '›' => '›', + '‾' => '‾', + 'â' => '⁁', + 'âƒ' => '⁃', + 'â„' => '⁄', + 'â' => '⁏', + 'â—' => '⁗', + 'âŸ' => ' ', + 'âŸâ€Š' => '&ThickSpace', + 'â ' => '⁠', + 'â¡' => '⁡', + 'â¢' => '⁢', + 'â£' => '⁣', + '€' => '€', + '⃛' => '⃛', + '⃜' => '⃜', + 'â„‚' => 'ℂ', + 'â„…' => '℅', + 'â„Š' => 'ℊ', + 'â„‹' => 'ℋ', + 'â„Œ' => 'ℌ', + 'â„' => 'ℍ', + 'â„Ž' => 'ℎ', + 'â„' => 'ℏ', + 'â„' => 'ℐ', + 'â„‘' => 'ℑ', + 'â„’' => 'ℒ', + 'â„“' => 'ℓ', + 'â„•' => 'ℕ', + 'â„–' => '№', + 'â„—' => '℗', + '℘' => '℘', + 'â„™' => 'ℙ', + 'â„š' => 'ℚ', + 'â„›' => 'ℛ', + 'â„œ' => 'ℜ', + 'â„' => 'ℝ', + 'â„ž' => '℞', + 'â„¢' => '™', + 'ℤ' => 'ℤ', + '℧' => '℧', + 'ℨ' => 'ℨ', + 'â„©' => '℩', + 'ℬ' => 'ℬ', + 'â„­' => 'ℭ', + 'ℯ' => 'ℯ', + 'â„°' => 'ℰ', + 'ℱ' => 'ℱ', + 'ℳ' => 'ℳ', + 'â„´' => 'ℴ', + 'ℵ' => 'ℵ', + 'ℶ' => 'ℶ', + 'â„·' => 'ℷ', + 'ℸ' => 'ℸ', + 'â……' => 'ⅅ', + 'â…†' => 'ⅆ', + 'â…‡' => 'ⅇ', + 'â…ˆ' => 'ⅈ', + 'â…“' => '⅓', + 'â…”' => '⅔', + 'â…•' => '⅕', + 'â…–' => '⅖', + 'â…—' => '⅗', + 'â…˜' => '⅘', + 'â…™' => '⅙', + 'â…š' => '⅚', + 'â…›' => '⅛', + 'â…œ' => '⅜', + 'â…' => '⅝', + 'â…ž' => '⅞', + 'â†' => '←', + '↑' => 
'↑', + '→' => '→', + '↓' => '↓', + '↔' => '↔', + '↕' => '↕', + '↖' => '↖', + '↗' => '↗', + '↘' => '↘', + '↙' => '↙', + '↚' => '↚', + '↛' => '↛', + 'â†' => '↝', + 'â†Ì¸' => '&nrarrw', + '↞' => '↞', + '↟' => '↟', + '↠' => '↠', + '↡' => '↡', + '↢' => '↢', + '↣' => '↣', + '↤' => '↤', + '↥' => '↥', + '↦' => '↦', + '↧' => '↧', + '↩' => '↩', + '↪' => '↪', + '↫' => '↫', + '↬' => '↬', + '↭' => '↭', + '↮' => '↮', + '↰' => '↰', + '↱' => '↱', + '↲' => '↲', + '↳' => '↳', + '↵' => '↵', + '↶' => '↶', + '↷' => '↷', + '↺' => '↺', + '↻' => '↻', + '↼' => '↼', + '↽' => '↽', + '↾' => '↾', + '↿' => '↿', + '⇀' => '⇀', + 'â‡' => '⇁', + '⇂' => '⇂', + '⇃' => '⇃', + '⇄' => '⇄', + '⇅' => '⇅', + '⇆' => '⇆', + '⇇' => '⇇', + '⇈' => '⇈', + '⇉' => '⇉', + '⇊' => '⇊', + '⇋' => '⇋', + '⇌' => '⇌', + 'â‡' => '⇍', + '⇎' => '⇎', + 'â‡' => '⇏', + 'â‡' => '⇐', + '⇑' => '⇑', + '⇒' => '⇒', + '⇓' => '⇓', + '⇔' => '⇔', + '⇕' => '⇕', + '⇖' => '⇖', + '⇗' => '⇗', + '⇘' => '⇘', + '⇙' => '⇙', + '⇚' => '⇚', + '⇛' => '⇛', + 'â‡' => '⇝', + '⇤' => '⇤', + '⇥' => '⇥', + '⇵' => '⇵', + '⇽' => '⇽', + '⇾' => '⇾', + '⇿' => '⇿', + '∀' => '∀', + 'âˆ' => '∁', + '∂' => '∂', + '∂̸' => '&npart', + '∃' => '∃', + '∄' => '∄', + '∅' => '∅', + '∇' => '∇', + '∈' => '∈', + '∉' => '∉', + '∋' => '∋', + '∌' => '∌', + 'âˆ' => '∏', + 'âˆ' => '∐', + '∑' => '∑', + '−' => '−', + '∓' => '∓', + '∔' => '∔', + '∖' => '∖', + '∗' => '∗', + '∘' => '∘', + '√' => '√', + 'âˆ' => '∝', + '∞' => '∞', + '∟' => '∟', + '∠' => '∠', + '∠⃒' => '&nang', + '∡' => '∡', + '∢' => '∢', + '∣' => '∣', + '∤' => '∤', + '∥' => '∥', + '∦' => '∦', + '∧' => '∧', + '∨' => '∨', + '∩' => '∩', + '∩︀' => '&caps', + '∪' => '∪', + '∪︀' => '&cups', + '∫' => '∫', + '∬' => '∬', + '∭' => '∭', + '∮' => '∮', + '∯' => '∯', + '∰' => '∰', + '∱' => '∱', + '∲' => '∲', + '∳' => '∳', + '∴' => '∴', + '∵' => '∵', + '∶' => '∶', + '∷' => '∷', + '∸' => '∸', + '∺' => '∺', + '∻' => '∻', + '∼' => '∼', + '∼⃒' => '&nvsim', + '∽' => '∽', + '∽̱' => '&race', + '∾' => '∾', + '∾̳' => '&acE', + '∿' => '∿', + '≀' 
=> '≀', + 'â‰' => '≁', + '≂' => '≂', + '≂̸' => '&nesim', + '≃' => '≃', + '≄' => '≄', + '≅' => '≅', + '≆' => '≆', + '≇' => '≇', + '≈' => '≈', + '≉' => '≉', + '≊' => '≊', + '≋' => '≋', + '≋̸' => '&napid', + '≌' => '≌', + 'â‰' => '≍', + 'â‰âƒ’' => '&nvap', + '≎' => '≎', + '≎̸' => '&nbump', + 'â‰' => '≏', + 'â‰Ì¸' => '&nbumpe', + 'â‰' => '≐', + 'â‰Ì¸' => '&nedot', + '≑' => '≑', + '≒' => '≒', + '≓' => '≓', + '≔' => '≔', + '≕' => '≕', + '≖' => '≖', + '≗' => '≗', + '≙' => '≙', + '≚' => '≚', + '≜' => '≜', + '≟' => '≟', + '≠' => '≠', + '≡' => '≡', + '≡⃥' => '&bnequiv', + '≢' => '≢', + '≤' => '≤', + '≤⃒' => '&nvle', + '≥' => '≥', + '≥⃒' => '&nvge', + '≦' => '≦', + '≦̸' => '&nlE', + '≧' => '≧', + '≧̸' => '&NotGreaterFullEqual', + '≨' => '≨', + '≨︀' => '&lvertneqq', + '≩' => '≩', + '≩︀' => '&gvertneqq', + '≪' => '≪', + '≪̸' => '&nLtv', + '≪⃒' => '&nLt', + '≫' => '≫', + '≫̸' => '&NotGreaterGreater', + '≫⃒' => '&nGt', + '≬' => '≬', + '≭' => '≭', + '≮' => '≮', + '≯' => '≯', + '≰' => '≰', + '≱' => '≱', + '≲' => '≲', + '≳' => '≳', + '≴' => '≴', + '≵' => '≵', + '≶' => '≶', + '≷' => '≷', + '≸' => '≸', + '≹' => '≹', + '≺' => '≺', + '≻' => '≻', + '≼' => '≼', + '≽' => '≽', + '≾' => '≾', + '≿' => '≿', + '≿̸' => '&NotSucceedsTilde', + '⊀' => '⊀', + 'âŠ' => '⊁', + '⊂' => '⊂', + '⊂⃒' => '&vnsub', + '⊃' => '⊃', + '⊃⃒' => '&nsupset', + '⊄' => '⊄', + '⊅' => '⊅', + '⊆' => '⊆', + '⊇' => '⊇', + '⊈' => '⊈', + '⊉' => '⊉', + '⊊' => '⊊', + '⊊︀' => '&vsubne', + '⊋' => '⊋', + '⊋︀' => '&vsupne', + 'âŠ' => '⊍', + '⊎' => '⊎', + 'âŠ' => '⊏', + 'âŠÌ¸' => '&NotSquareSubset', + 'âŠ' => '⊐', + 'âŠÌ¸' => '&NotSquareSuperset', + '⊑' => '⊑', + '⊒' => '⊒', + '⊓' => '⊓', + '⊓︀' => '&sqcaps', + '⊔' => '⊔', + '⊔︀' => '&sqcups', + '⊕' => '⊕', + '⊖' => '⊖', + '⊗' => '⊗', + '⊘' => '⊘', + '⊙' => '⊙', + '⊚' => '⊚', + '⊛' => '⊛', + 'âŠ' => '⊝', + '⊞' => '⊞', + '⊟' => '⊟', + '⊠' => '⊠', + '⊡' => '⊡', + '⊢' => '⊢', + '⊣' => '⊣', + '⊤' => '⊤', + '⊥' => '⊥', + '⊧' => '⊧', + '⊨' => '⊨', + '⊩' => '⊩', + '⊪' => '⊪', + '⊫' => '⊫', 
+ '⊬' => '⊬', + '⊭' => '⊭', + '⊮' => '⊮', + '⊯' => '⊯', + '⊰' => '⊰', + '⊲' => '⊲', + '⊳' => '⊳', + '⊴' => '⊴', + '⊴⃒' => '&nvltrie', + '⊵' => '⊵', + '⊵⃒' => '&nvrtrie', + '⊶' => '⊶', + '⊷' => '⊷', + '⊸' => '⊸', + '⊹' => '⊹', + '⊺' => '⊺', + '⊻' => '⊻', + '⊽' => '⊽', + '⊾' => '⊾', + '⊿' => '⊿', + 'â‹€' => '⋀', + 'â‹' => '⋁', + 'â‹‚' => '⋂', + '⋃' => '⋃', + 'â‹„' => '⋄', + 'â‹…' => '⋅', + '⋆' => '⋆', + '⋇' => '⋇', + '⋈' => '⋈', + '⋉' => '⋉', + 'â‹Š' => '⋊', + 'â‹‹' => '⋋', + 'â‹Œ' => '⋌', + 'â‹' => '⋍', + 'â‹Ž' => '⋎', + 'â‹' => '⋏', + 'â‹' => '⋐', + 'â‹‘' => '⋑', + 'â‹’' => '⋒', + 'â‹“' => '⋓', + 'â‹”' => '⋔', + 'â‹•' => '⋕', + 'â‹–' => '⋖', + 'â‹—' => '⋗', + '⋘' => '⋘', + '⋘̸' => '&nLl', + 'â‹™' => '⋙', + '⋙̸' => '&nGg', + 'â‹š' => '⋚', + '⋚︀' => '&lesg', + 'â‹›' => '⋛', + '⋛︀' => '&gesl', + 'â‹ž' => '⋞', + 'â‹Ÿ' => '⋟', + 'â‹ ' => '⋠', + 'â‹¡' => '⋡', + 'â‹¢' => '⋢', + 'â‹£' => '⋣', + '⋦' => '⋦', + '⋧' => '⋧', + '⋨' => '⋨', + 'â‹©' => '⋩', + '⋪' => '⋪', + 'â‹«' => '⋫', + '⋬' => '⋬', + 'â‹­' => '⋭', + 'â‹®' => '⋮', + '⋯' => '⋯', + 'â‹°' => '⋰', + '⋱' => '⋱', + '⋲' => '⋲', + '⋳' => '⋳', + 'â‹´' => '⋴', + '⋵' => '⋵', + '⋵̸' => '¬indot', + '⋶' => '⋶', + 'â‹·' => '⋷', + '⋹' => '⋹', + '⋹̸' => '¬inE', + '⋺' => '⋺', + 'â‹»' => '⋻', + '⋼' => '⋼', + '⋽' => '⋽', + '⋾' => '⋾', + '⌅' => '⌅', + '⌆' => '⌆', + '⌈' => '⌈', + '⌉' => '⌉', + '⌊' => '⌊', + '⌋' => '⌋', + '⌌' => '⌌', + 'âŒ' => '⌍', + '⌎' => '⌎', + 'âŒ' => '⌏', + 'âŒ' => '⌐', + '⌒' => '⌒', + '⌓' => '⌓', + '⌕' => '⌕', + '⌖' => '⌖', + '⌜' => '⌜', + 'âŒ' => '⌝', + '⌞' => '⌞', + '⌟' => '⌟', + '⌢' => '⌢', + '⌣' => '⌣', + '⌭' => '⌭', + '⌮' => '⌮', + '⌶' => '⌶', + '⌽' => '⌽', + '⌿' => '⌿', + 'â¼' => '⍼', + '⎰' => '⎰', + '⎱' => '⎱', + '⎴' => '⎴', + '⎵' => '⎵', + '⎶' => '⎶', + 'âœ' => '⏜', + 'â' => '⏝', + 'âž' => '⏞', + 'âŸ' => '⏟', + 'â¢' => '⏢', + 'â§' => '⏧', + 'â£' => '␣', + 'Ⓢ' => 'Ⓢ', + '─' => '─', + '│' => '│', + '┌' => '┌', + 'â”' => '┐', + 'â””' => '└', + '┘' => '┘', + '├' => '├', + '┤' => '┤', + '┬' => '┬', + 'â”´' => 
'┴', + '┼' => '┼', + 'â•' => '═', + 'â•‘' => '║', + 'â•’' => '╒', + 'â•“' => '╓', + 'â•”' => '╔', + 'â••' => '╕', + 'â•–' => '╖', + 'â•—' => '╗', + '╘' => '╘', + 'â•™' => '╙', + 'â•š' => '╚', + 'â•›' => '╛', + 'â•œ' => '╜', + 'â•' => '╝', + 'â•ž' => '╞', + 'â•Ÿ' => '╟', + 'â• ' => '╠', + 'â•¡' => '╡', + 'â•¢' => '╢', + 'â•£' => '╣', + '╤' => '╤', + 'â•¥' => '╥', + '╦' => '╦', + '╧' => '╧', + '╨' => '╨', + 'â•©' => '╩', + '╪' => '╪', + 'â•«' => '╫', + '╬' => '╬', + 'â–€' => '▀', + 'â–„' => '▄', + 'â–ˆ' => '█', + 'â–‘' => '░', + 'â–’' => '▒', + 'â–“' => '▓', + 'â–¡' => '□', + 'â–ª' => '▪', + 'â–«' => '▫', + 'â–­' => '▭', + 'â–®' => '▮', + 'â–±' => '▱', + 'â–³' => '△', + 'â–´' => '▴', + 'â–µ' => '▵', + 'â–¸' => '▸', + 'â–¹' => '▹', + 'â–½' => '▽', + 'â–¾' => '▾', + 'â–¿' => '▿', + 'â—‚' => '◂', + 'â—ƒ' => '◃', + 'â—Š' => '◊', + 'â—‹' => '○', + 'â—¬' => '◬', + 'â—¯' => '◯', + 'â—¸' => '◸', + 'â—¹' => '◹', + 'â—º' => '◺', + 'â—»' => '◻', + 'â—¼' => '◼', + '★' => '★', + '☆' => '☆', + '☎' => '☎', + '♀' => '♀', + '♂' => '♂', + 'â™ ' => '♠', + '♣' => '♣', + '♥' => '♥', + '♦' => '♦', + '♪' => '♪', + 'â™­' => '♭', + 'â™®' => '♮', + '♯' => '♯', + '✓' => '✓', + '✗' => '✗', + '✠' => '✠', + '✶' => '✶', + 'â˜' => '❘', + 'â²' => '❲', + 'â³' => '❳', + '⟈' => '⟈', + '⟉' => '⟉', + '⟦' => '⟦', + '⟧' => '⟧', + '⟨' => '⟨', + '⟩' => '⟩', + '⟪' => '⟪', + '⟫' => '⟫', + '⟬' => '⟬', + '⟭' => '⟭', + '⟵' => '⟵', + '⟶' => '⟶', + '⟷' => '⟷', + '⟸' => '⟸', + '⟹' => '⟹', + '⟺' => '⟺', + '⟼' => '⟼', + '⟿' => '⟿', + '⤂' => '⤂', + '⤃' => '⤃', + '⤄' => '⤄', + '⤅' => '⤅', + '⤌' => '⤌', + 'â¤' => '⤍', + '⤎' => '⤎', + 'â¤' => '⤏', + 'â¤' => '⤐', + '⤑' => '⤑', + '⤒' => '⤒', + '⤓' => '⤓', + '⤖' => '⤖', + '⤙' => '⤙', + '⤚' => '⤚', + '⤛' => '⤛', + '⤜' => '⤜', + 'â¤' => '⤝', + '⤞' => '⤞', + '⤟' => '⤟', + '⤠' => '⤠', + '⤣' => '⤣', + '⤤' => '⤤', + '⤥' => '⤥', + '⤦' => '⤦', + '⤧' => '⤧', + '⤨' => '⤨', + '⤩' => '⤩', + '⤪' => '⤪', + '⤳' => '⤳', + '⤳̸' => '&nrarrc', + '⤵' => '⤵', + '⤶' => '⤶', + '⤷' => '⤷', + '⤸' => 
'⤸', + '⤹' => '⤹', + '⤼' => '⤼', + '⤽' => '⤽', + '⥅' => '⥅', + '⥈' => '⥈', + '⥉' => '⥉', + '⥊' => '⥊', + '⥋' => '⥋', + '⥎' => '⥎', + 'â¥' => '⥏', + 'â¥' => '⥐', + '⥑' => '⥑', + '⥒' => '⥒', + '⥓' => '⥓', + '⥔' => '⥔', + '⥕' => '⥕', + '⥖' => '⥖', + '⥗' => '⥗', + '⥘' => '⥘', + '⥙' => '⥙', + '⥚' => '⥚', + '⥛' => '⥛', + '⥜' => '⥜', + 'â¥' => '⥝', + '⥞' => '⥞', + '⥟' => '⥟', + '⥠' => '⥠', + '⥡' => '⥡', + '⥢' => '⥢', + '⥣' => '⥣', + '⥤' => '⥤', + '⥥' => '⥥', + '⥦' => '⥦', + '⥧' => '⥧', + '⥨' => '⥨', + '⥩' => '⥩', + '⥪' => '⥪', + '⥫' => '⥫', + '⥬' => '⥬', + '⥭' => '⥭', + '⥮' => '⥮', + '⥯' => '⥯', + '⥰' => '⥰', + '⥱' => '⥱', + '⥲' => '⥲', + '⥳' => '⥳', + '⥴' => '⥴', + '⥵' => '⥵', + '⥶' => '⥶', + '⥸' => '⥸', + '⥹' => '⥹', + '⥻' => '⥻', + '⥼' => '⥼', + '⥽' => '⥽', + '⥾' => '⥾', + '⥿' => '⥿', + '⦅' => '⦅', + '⦆' => '⦆', + '⦋' => '⦋', + '⦌' => '⦌', + 'â¦' => '⦍', + '⦎' => '⦎', + 'â¦' => '⦏', + 'â¦' => '⦐', + '⦑' => '⦑', + '⦒' => '⦒', + '⦓' => '⦓', + '⦔' => '⦔', + '⦕' => '⦕', + '⦖' => '⦖', + '⦚' => '⦚', + '⦜' => '⦜', + 'â¦' => '⦝', + '⦤' => '⦤', + '⦥' => '⦥', + '⦦' => '⦦', + '⦧' => '⦧', + '⦨' => '⦨', + '⦩' => '⦩', + '⦪' => '⦪', + '⦫' => '⦫', + '⦬' => '⦬', + '⦭' => '⦭', + '⦮' => '⦮', + '⦯' => '⦯', + '⦰' => '⦰', + '⦱' => '⦱', + '⦲' => '⦲', + '⦳' => '⦳', + '⦴' => '⦴', + '⦵' => '⦵', + '⦶' => '⦶', + '⦷' => '⦷', + '⦹' => '⦹', + '⦻' => '⦻', + '⦼' => '⦼', + '⦾' => '⦾', + '⦿' => '⦿', + '⧀' => '⧀', + 'â§' => '⧁', + '⧂' => '⧂', + '⧃' => '⧃', + '⧄' => '⧄', + '⧅' => '⧅', + '⧉' => '⧉', + 'â§' => '⧍', + '⧎' => '⧎', + 'â§' => '⧏', + 'â§Ì¸' => '&NotLeftTriangleBar', + 'â§' => '⧐', + 'â§Ì¸' => '&NotRightTriangleBar', + '⧜' => '⧜', + 'â§' => '⧝', + '⧞' => '⧞', + '⧣' => '⧣', + '⧤' => '⧤', + '⧥' => '⧥', + '⧫' => '⧫', + '⧴' => '⧴', + '⧶' => '⧶', + '⨀' => '⨀', + 'â¨' => '⨁', + '⨂' => '⨂', + '⨄' => '⨄', + '⨆' => '⨆', + '⨌' => '⨌', + 'â¨' => '⨍', + 'â¨' => '⨐', + '⨑' => '⨑', + '⨒' => '⨒', + '⨓' => '⨓', + '⨔' => '⨔', + '⨕' => '⨕', + '⨖' => '⨖', + '⨗' => '⨗', + '⨢' => '⨢', + '⨣' => '⨣', + '⨤' => '⨤', + 
'⨥' => '⨥', + '⨦' => '⨦', + '⨧' => '⨧', + '⨩' => '⨩', + '⨪' => '⨪', + '⨭' => '⨭', + '⨮' => '⨮', + '⨯' => '⨯', + '⨰' => '⨰', + '⨱' => '⨱', + '⨳' => '⨳', + '⨴' => '⨴', + '⨵' => '⨵', + '⨶' => '⨶', + '⨷' => '⨷', + '⨸' => '⨸', + '⨹' => '⨹', + '⨺' => '⨺', + '⨻' => '⨻', + '⨼' => '⨼', + '⨿' => '⨿', + 'â©€' => '⩀', + 'â©‚' => '⩂', + '⩃' => '⩃', + 'â©„' => '⩄', + 'â©…' => '⩅', + '⩆' => '⩆', + '⩇' => '⩇', + '⩈' => '⩈', + '⩉' => '⩉', + 'â©Š' => '⩊', + 'â©‹' => '⩋', + 'â©Œ' => '⩌', + 'â©' => '⩍', + 'â©' => '⩐', + 'â©“' => '⩓', + 'â©”' => '⩔', + 'â©•' => '⩕', + 'â©–' => '⩖', + 'â©—' => '⩗', + '⩘' => '⩘', + 'â©š' => '⩚', + 'â©›' => '⩛', + 'â©œ' => '⩜', + 'â©' => '⩝', + 'â©Ÿ' => '⩟', + '⩦' => '⩦', + '⩪' => '⩪', + 'â©­' => '⩭', + '⩭̸' => '&ncongdot', + 'â©®' => '⩮', + '⩯' => '⩯', + 'â©°' => '⩰', + '⩰̸' => '&napE', + '⩱' => '⩱', + '⩲' => '⩲', + '⩳' => '⩳', + 'â©´' => '⩴', + '⩵' => '⩵', + 'â©·' => '⩷', + '⩸' => '⩸', + '⩹' => '⩹', + '⩺' => '⩺', + 'â©»' => '⩻', + '⩼' => '⩼', + '⩽' => '⩽', + '⩽̸' => '&nles', + '⩾' => '⩾', + '⩾̸' => '&nges', + 'â©¿' => '⩿', + '⪀' => '⪀', + 'âª' => '⪁', + '⪂' => '⪂', + '⪃' => '⪃', + '⪄' => '⪄', + '⪅' => '⪅', + '⪆' => '⪆', + '⪇' => '⪇', + '⪈' => '⪈', + '⪉' => '⪉', + '⪊' => '⪊', + '⪋' => '⪋', + '⪌' => '⪌', + 'âª' => '⪍', + '⪎' => '⪎', + 'âª' => '⪏', + 'âª' => '⪐', + '⪑' => '⪑', + '⪒' => '⪒', + '⪓' => '⪓', + '⪔' => '⪔', + '⪕' => '⪕', + '⪖' => '⪖', + '⪗' => '⪗', + '⪘' => '⪘', + '⪙' => '⪙', + '⪚' => '⪚', + 'âª' => '⪝', + '⪞' => '⪞', + '⪟' => '⪟', + '⪠' => '⪠', + '⪡' => '⪡', + '⪡̸' => '&NotNestedLessLess', + '⪢' => '⪢', + '⪢̸' => '&NotNestedGreaterGreater', + '⪤' => '⪤', + '⪥' => '⪥', + '⪦' => '⪦', + '⪧' => '⪧', + '⪨' => '⪨', + '⪩' => '⪩', + '⪪' => '⪪', + '⪫' => '⪫', + '⪬' => '⪬', + '⪬︀' => '&smtes', + '⪭' => '⪭', + '⪭︀' => '&lates', + '⪮' => '⪮', + '⪯' => '⪯', + '⪯̸' => '&NotPrecedesEqual', + '⪰' => '⪰', + '⪰̸' => '&NotSucceedsEqual', + '⪳' => '⪳', + '⪴' => '⪴', + '⪵' => '⪵', + '⪶' => '⪶', + '⪷' => '⪷', + '⪸' => '⪸', + '⪹' => '⪹', + '⪺' => '⪺', + '⪻' => '⪻', + 
'⪼' => '⪼', + '⪽' => '⪽', + '⪾' => '⪾', + '⪿' => '⪿', + 'â«€' => '⫀', + 'â«' => '⫁', + 'â«‚' => '⫂', + '⫃' => '⫃', + 'â«„' => '⫄', + 'â«…' => '⫅', + '⫅̸' => '&nsubE', + '⫆' => '⫆', + '⫆̸' => '&nsupseteqq', + '⫇' => '⫇', + '⫈' => '⫈', + 'â«‹' => '⫋', + '⫋︀' => '&vsubnE', + 'â«Œ' => '⫌', + '⫌︀' => '&varsupsetneqq', + 'â«' => '⫏', + 'â«' => '⫐', + 'â«‘' => '⫑', + 'â«’' => '⫒', + 'â«“' => '⫓', + 'â«”' => '⫔', + 'â«•' => '⫕', + 'â«–' => '⫖', + 'â«—' => '⫗', + '⫘' => '⫘', + 'â«™' => '⫙', + 'â«š' => '⫚', + 'â«›' => '⫛', + '⫤' => '⫤', + '⫦' => '⫦', + '⫧' => '⫧', + '⫨' => '⫨', + 'â«©' => '⫩', + 'â««' => '⫫', + '⫬' => '⫬', + 'â«­' => '⫭', + 'â«®' => '⫮', + '⫯' => '⫯', + 'â«°' => '⫰', + '⫱' => '⫱', + '⫲' => '⫲', + '⫳' => '⫳', + '⫽︀' => '&varsupsetneqq', + 'ff' => 'ff', + 'ï¬' => 'fi', + 'fl' => 'fl', + 'ffi' => 'ffi', + 'ffl' => 'ffl', + 'ð’œ' => '𝒜', + 'ð’ž' => '𝒞', + 'ð’Ÿ' => '𝒟', + 'ð’¢' => '𝒢', + 'ð’¥' => '𝒥', + 'ð’¦' => '𝒦', + 'ð’©' => '𝒩', + 'ð’ª' => '𝒪', + 'ð’«' => '𝒫', + 'ð’¬' => '𝒬', + 'ð’®' => '𝒮', + 'ð’¯' => '𝒯', + 'ð’°' => '𝒰', + 'ð’±' => '𝒱', + 'ð’²' => '𝒲', + 'ð’³' => '𝒳', + 'ð’´' => '𝒴', + 'ð’µ' => '𝒵', + 'ð’¶' => '𝒶', + 'ð’·' => '𝒷', + 'ð’¸' => '𝒸', + 'ð’¹' => '𝒹', + 'ð’»' => '𝒻', + 'ð’½' => '𝒽', + 'ð’¾' => '𝒾', + 'ð’¿' => '𝒿', + 'ð“€' => '𝓀', + 'ð“' => '𝓁', + 'ð“‚' => '𝓂', + 'ð“ƒ' => '𝓃', + 'ð“…' => '𝓅', + 'ð“†' => '𝓆', + 'ð“‡' => '𝓇', + 'ð“ˆ' => '𝓈', + 'ð“‰' => '𝓉', + 'ð“Š' => '𝓊', + 'ð“‹' => '𝓋', + 'ð“Œ' => '𝓌', + 'ð“' => '𝓍', + 'ð“Ž' => '𝓎', + 'ð“' => '𝓏', + 'ð”„' => '𝔄', + 'ð”…' => '𝔅', + 'ð”‡' => '𝔇', + 'ð”ˆ' => '𝔈', + 'ð”‰' => '𝔉', + 'ð”Š' => '𝔊', + 'ð”' => '𝔍', + 'ð”Ž' => '𝔎', + 'ð”' => '𝔏', + 'ð”' => '𝔐', + 'ð”‘' => '𝔑', + 'ð”’' => '𝔒', + 'ð”“' => '𝔓', + 'ð””' => '𝔔', + 'ð”–' => '𝔖', + 'ð”—' => '𝔗', + 'ð”˜' => '𝔘', + 'ð”™' => '𝔙', + 'ð”š' => '𝔚', + 'ð”›' => '𝔛', + 'ð”œ' => '𝔜', + 'ð”ž' => '𝔞', + 'ð”Ÿ' => '𝔟', + 'ð” ' => '𝔠', + 'ð”¡' => '𝔡', + 'ð”¢' => '𝔢', + 'ð”£' => '𝔣', + 'ð”¤' => '𝔤', + 'ð”¥' => '𝔥', + 'ð”¦' => '𝔦', + 'ð”§' => '𝔧', + 'ð”¨' => '𝔨', 
+ 'ð”©' => '𝔩', + 'ð”ª' => '𝔪', + 'ð”«' => '𝔫', + 'ð”¬' => '𝔬', + 'ð”­' => '𝔭', + 'ð”®' => '𝔮', + 'ð”¯' => '𝔯', + 'ð”°' => '𝔰', + 'ð”±' => '𝔱', + 'ð”²' => '𝔲', + 'ð”³' => '𝔳', + 'ð”´' => '𝔴', + 'ð”µ' => '𝔵', + 'ð”¶' => '𝔶', + 'ð”·' => '𝔷', + 'ð”¸' => '𝔸', + 'ð”¹' => '𝔹', + 'ð”»' => '𝔻', + 'ð”¼' => '𝔼', + 'ð”½' => '𝔽', + 'ð”¾' => '𝔾', + 'ð•€' => '𝕀', + 'ð•' => '𝕁', + 'ð•‚' => '𝕂', + 'ð•ƒ' => '𝕃', + 'ð•„' => '𝕄', + 'ð•†' => '𝕆', + 'ð•Š' => '𝕊', + 'ð•‹' => '𝕋', + 'ð•Œ' => '𝕌', + 'ð•' => '𝕍', + 'ð•Ž' => '𝕎', + 'ð•' => '𝕏', + 'ð•' => '𝕐', + 'ð•’' => '𝕒', + 'ð•“' => '𝕓', + 'ð•”' => '𝕔', + 'ð••' => '𝕕', + 'ð•–' => '𝕖', + 'ð•—' => '𝕗', + 'ð•˜' => '𝕘', + 'ð•™' => '𝕙', + 'ð•š' => '𝕚', + 'ð•›' => '𝕛', + 'ð•œ' => '𝕜', + 'ð•' => '𝕝', + 'ð•ž' => '𝕞', + 'ð•Ÿ' => '𝕟', + 'ð• ' => '𝕠', + 'ð•¡' => '𝕡', + 'ð•¢' => '𝕢', + 'ð•£' => '𝕣', + 'ð•¤' => '𝕤', + 'ð•¥' => '𝕥', + 'ð•¦' => '𝕦', + 'ð•§' => '𝕧', + 'ð•¨' => '𝕨', + 'ð•©' => '𝕩', + 'ð•ª' => '𝕪', + 'ð•«' => '𝕫' + ); } diff --git a/libraries/html5php/HTML5/Serializer/OutputRules.php b/libraries/html5php/HTML5/Serializer/OutputRules.php index 3af1cde..7ea7c6a 100644 --- a/libraries/html5php/HTML5/Serializer/OutputRules.php +++ b/libraries/html5php/HTML5/Serializer/OutputRules.php @@ -6,309 +6,474 @@ * These output rules are likely to generate output similar to the document that * was parsed. It is not intended to output exactly the document that was parsed. */ -namespace HTML5\Serializer; +namespace Masterminds\HTML5\Serializer; -use \HTML5\Elements; +use Masterminds\HTML5\Elements; /** * Generate the output html5 based on element rules. 
*/ -class OutputRules implements \HTML5\Serializer\RulesInterface { +class OutputRules implements \Masterminds\HTML5\Serializer\RulesInterface +{ + /** + * Defined in http://www.w3.org/TR/html51/infrastructure.html#html-namespace-0 + */ + const NAMESPACE_HTML = 'http://www.w3.org/1999/xhtml'; - const IM_IN_HTML = 1; - const IM_IN_SVG = 2; - const IM_IN_MATHML = 3; + const NAMESPACE_MATHML = 'http://www.w3.org/1998/Math/MathML'; - protected $traverser; - protected $encode = FALSE; - protected $out; - protected $outputMode; + const NAMESPACE_SVG = 'http://www.w3.org/2000/svg'; - const DOCTYPE = ''; + const NAMESPACE_XLINK = 'http://www.w3.org/1999/xlink'; - public function __construct($output, $options = array()) { + const NAMESPACE_XML = 'http://www.w3.org/XML/1998/namespace'; - if (isset($options['encode_entities'])) { - $this->encode = $options['encode_entities']; + const NAMESPACE_XMLNS = 'http://www.w3.org/2000/xmlns/'; + + /** + * Holds the HTML5 element names that causes a namespace switch + * + * @var array + */ + protected $implicitNamespaces = array( + self::NAMESPACE_HTML, + self::NAMESPACE_SVG, + self::NAMESPACE_MATHML, + self::NAMESPACE_XML, + self::NAMESPACE_XMLNS, + ); + + const IM_IN_HTML = 1; + + const IM_IN_SVG = 2; + + const IM_IN_MATHML = 3; + + /** + * Used as cache to detect if is available ENT_HTML5 + * @var boolean + */ + private $hasHTML5 = false; + + protected $traverser; + + protected $encode = false; + + protected $out; + + protected $outputMode; + + private $xpath; + + protected $nonBooleanAttributes = array( + /* + array( + 'nodeNamespace'=>'http://www.w3.org/1999/xhtml', + 'attrNamespace'=>'http://www.w3.org/1999/xhtml', + + 'nodeName'=>'img', 'nodeName'=>array('img', 'a'), + 'attrName'=>'alt', 'attrName'=>array('title', 'alt'), + + + 'prefixes'=>['xh'=>'http://www.w3.org/1999/xhtml'), + 'xpath' => "@checked[../../xh:input[@type='radio' or @type='checkbox']]", + ), + */ + array( + 'nodeNamespace'=>'http://www.w3.org/1999/xhtml', + 
'attrName'=>array('alt', 'title'), + ), + + ); + + const DOCTYPE = ''; + + public function __construct($output, $options = array()) + { + if (isset($options['encode_entities'])) { + $this->encode = $options['encode_entities']; + } + + $this->outputMode = static::IM_IN_HTML; + $this->out = $output; + + // If HHVM, see https://github.com/facebook/hhvm/issues/2727 + $this->hasHTML5 = defined('ENT_HTML5') && !defined('HHVM_VERSION'); + } + public function addRule(array $rule) + { + $this->nonBooleanAttributes[] = $rule; } - $this->outputMode = static::IM_IN_HTML; - $this->out = $output; - } + public function setTraverser(\Masterminds\HTML5\Serializer\Traverser $traverser) + { + $this->traverser = $traverser; - public function setTraverser(\HTML5\Serializer\Traverser $traverser) { - $this->traverser = $traverser; - - return $this; - } - - public function document($dom) { - $this->doctype(); - $this->traverser->node($dom->documentElement); - $this->nl(); - } - - protected function doctype() { - $this->wr(static::DOCTYPE); - $this->nl(); - } - - public function element($ele) { - $name = $ele->tagName; - - // Per spec: - // If the element has a declared namespace in the HTML, MathML or - // SVG namespaces, we use the lname instead of the tagName. - if ($this->traverser->isLocalElement($ele)) { - $name = $ele->localName; + return $this; } - // If we are in SVG or MathML there is special handling. - // Using if/elseif instead of switch because it's faster in PHP. - if ($name == 'svg') { - $this->outputMode = static::IM_IN_SVG; - $name = Elements::normalizeSvgElement($name); - } - elseif ($name == 'math') { - $this->outputMode = static::IM_IN_MATHML; + public function document($dom) + { + $this->doctype(); + $this->traverser->node($dom->documentElement); + $this->nl(); } - $this->openTag($ele); - - // Handle children. 
- if ($ele->hasChildNodes()) { - $this->traverser->children($ele->childNodes); + protected function doctype() + { + $this->wr(static::DOCTYPE); + $this->nl(); } - // Close out the SVG or MathML special handling. - if ($name == 'svg' || $name == 'math') { - $this->outputMode = static::IM_IN_HTML; + public function element($ele) + { + $name = $ele->tagName; + + // Per spec: + // If the element has a declared namespace in the HTML, MathML or + // SVG namespaces, we use the lname instead of the tagName. + if ($this->traverser->isLocalElement($ele)) { + $name = $ele->localName; + } + + // If we are in SVG or MathML there is special handling. + // Using if/elseif instead of switch because it's faster in PHP. + if ($name == 'svg') { + $this->outputMode = static::IM_IN_SVG; + $name = Elements::normalizeSvgElement($name); + } elseif ($name == 'math') { + $this->outputMode = static::IM_IN_MATHML; + } + + $this->openTag($ele); + if (Elements::isA($name, Elements::TEXT_RAW)) { + foreach ($ele->childNodes as $child) { + $this->wr($child->data); + } + } else { + // Handle children. + if ($ele->hasChildNodes()) { + $this->traverser->children($ele->childNodes); + } + + // Close out the SVG or MathML special handling. + if ($name == 'svg' || $name == 'math') { + $this->outputMode = static::IM_IN_HTML; + } + } + + // If not unary, add a closing tag. + if (! Elements::isA($name, Elements::VOID_TAG)) { + $this->closeTag($ele); + } } - // If not unary, add a closing tag. - if (!Elements::isA($name, Elements::VOID_TAG)) { - $this->closeTag($ele); - } - } + /** + * Write a text node. + * + * @param \DOMText $ele + * The text node to write. + */ + public function text($ele) + { + if (isset($ele->parentNode) && isset($ele->parentNode->tagName) && Elements::isA($ele->parentNode->localName, Elements::TEXT_RAW)) { + $this->wr($ele->data); + return; + } - /** - * Write a text node. - * - * @param \DOMText $ele - * The text node to write. 
- */ - public function text($ele) { - if (isset($ele->parentNode) && isset($ele->parentNode->tagName) && Elements::isA($ele->parentNode->tagName, Elements::TEXT_RAW)) { - $this->wr($ele->data); - return; + // FIXME: This probably needs some flags set. + $this->wr($this->enc($ele->data)); } - // FIXME: This probably needs some flags set. - $this->wr($this->enc($ele->data)); - - } - - public function cdata($ele) { - // This encodes CDATA. - $this->wr($ele->ownerDocument->saveXML($ele)); - } - - public function comment($ele) { - // These produce identical output. - //$this->wr(''); - $this->wr($ele->ownerDocument->saveXML($ele)); - } - - public function processorInstruction($ele) { - $this->wr('wr($ele->target)->wr(' ')->wr($ele->data)->wr('?>'); - } - - /** - * Write the opening tag. - * - * Tags for HTML, MathML, and SVG are in the local name. Otherwise, use the - * qualified name (8.3). - * - * @param \DOMNode $ele - * The element being written. - */ - protected function openTag($ele) { - $this->wr('<')->wr($ele->tagName); - $this->attrs($ele); - - if ($this->outputMode == static::IM_IN_HTML) { - $this->wr('>'); - } - // If we are not in html mode we are in SVG, MathML, or XML embedded content. - else { - if ($ele->hasChildNodes()) { - $this->wr('>'); - } - // If there are no children this is self closing. - else { - $this->wr(' />'); - } - } - } - - protected function attrs($ele) { - // FIXME: Needs support for xml, xmlns, xlink, and namespaced elements. - if (!$ele->hasAttributes()) { - return $this; + public function cdata($ele) + { + // This encodes CDATA. + $this->wr($ele->ownerDocument->saveXML($ele)); } - // TODO: Currently, this always writes name="value", and does not do - // value-less attributes. 
- $map = $ele->attributes; - $len = $map->length; - for ($i = 0; $i < $len; ++$i) { - $node = $map->item($i); - $val = $this->enc($node->value, TRUE); - - // XXX: The spec says that we need to ensure that anything in - // the XML, XMLNS, or XLink NS's should use the canonical - // prefix. It seems that DOM does this for us already, but there - // may be exceptions. - $name = $node->name; - - // Special handling for attributes in SVG and MathML. - // Using if/elseif instead of switch because it's faster in PHP. - if ($this->outputMode == static::IM_IN_SVG) { - $name = Elements::normalizeSvgAttribute($name); - } - elseif ($this->outputMode == static::IM_IN_MATHML) { - $name = Elements::normalizeMathMlAttribute($name); - } - - $this->wr(' ')->wr($name); - if (isset($val) && $val !== '') { - $this->wr('="')->wr($val)->wr('"'); - } - } - } - - /** - * Write the closing tag. - * - * Tags for HTML, MathML, and SVG are in the local name. Otherwise, use the - * qualified name (8.3). - * - * @param \DOMNode $ele - * The element being written. - */ - protected function closeTag($ele) { - if ($this->outputMode == static::IM_IN_HTML || $ele->hasChildNodes()) { - $this->wr('wr($ele->tagName)->wr('>'); - } - } - - /** - * Write to the output. - * - * @param string $text - * The string to put into the output. - * - * @return HTML5\Serializer\Traverser - * $this so it can be used in chaining. - */ - protected function wr($text) { - fwrite($this->out, $text); - return $this; - } - - /** - * Write a new line character. - * - * @return HTML5\Serializer\Traverser - * $this so it can be used in chaining. - */ - protected function nl() { - fwrite($this->out, PHP_EOL); - return $this; - } - - /** - * Encode text. - * - * When encode is set to FALSE, the default value, the text passed in is - * escaped per section 8.3 of the html5 spec. For details on how text is - * escaped see the escape() method. 
- * - * When encoding is set to true the text is converted to named character - * references where appropriate. Section 8.1.4 Character references of the - * html5 spec refers to using named character references. This is useful for - * characters that can't otherwise legally be used in the text. - * - * The named character references are listed in section 8.5. - * - * @see http://www.w3.org/TR/2013/CR-html5-20130806/syntax.html#named-character-references - * - * True encoding will turn all named character references into their entities. - * This includes such characters as +.# and many other common ones. By default - * encoding here will just escape &'<>". - * - * Note, PHP 5.4+ has better html5 encoding. - * - * @todo Use the Entities class in php 5.3 to have html5 entities. - * - * @param string $text - * text to encode. - * @param boolean $attribute - * True if we are encoding an attrubute, false otherwise - * - * @return string - * The encoded text. - */ - protected function enc($text, $attribute = FALSE) { - - // Escape the text rather than convert to named character references. - if (!$this->encode) { - return $this->escape($text, $attribute); + public function comment($ele) + { + // These produce identical output. + // $this->wr(''); + $this->wr($ele->ownerDocument->saveXML($ele)); } - // If we are in PHP 5.4+ we can use the native html5 entity functionality to - // convert the named character references. - if (defined('ENT_HTML5')) { - return htmlentities($text, ENT_HTML5 | ENT_SUBSTITUTE | ENT_QUOTES, 'UTF-8', FALSE); + public function processorInstruction($ele) + { + $this->wr('wr($ele->target) + ->wr(' ') + ->wr($ele->data) + ->wr('?>'); } - // If a version earlier than 5.4 html5 entities are not entirely handled. - // This manually handles them. - else { - return strtr($text, \HTML5\Serializer\HTML5Entities::$map); - } - } + /** + * Write the namespace attributes + * + * + * @param \DOMNode $ele + * The element being written. 
+ */ + protected function namespaceAttrs($ele) + { + if (!$this->xpath || $this->xpath->document !== $ele->ownerDocument){ + $this->xpath = new \DOMXPath($ele->ownerDocument); + } - /** - * Escape test. - * - * According to the html5 spec section 8.3 Serializing HTML fragments, text - * within tags that are not style, script, xmp, iframe, noembed, and noframes - * need to be properly escaped. - * - * The & should be converted to &, no breaking space unicode characters - * converted to  , when in attribute mode the " should be converted to - * ", and when not in attribute mode the < and > should be converted to - * < and >. - * - * @see http://www.w3.org/TR/2013/CR-html5-20130806/syntax.html#escapingString - * - * @param string $text - * text to escape. - * @param boolean $attribute - * True if we are escaping an attrubute, false otherwise - */ - protected function escape($text, $attribute = FALSE) { - - // Not using htmlspecialchars because, while it does escaping, it doesn't - // match the requirements of section 8.5. For example, it doesn't handle - // non-breaking spaces. - if ($attribute) { - $replace = array('"'=>'"', '&'=>'&', "\xc2\xa0"=>' '); - } - else { - $replace = array('<'=>'<', '>'=>'>', '&'=>'&', "\xc2\xa0"=>' '); + foreach( $this->xpath->query('namespace::*[not(.=../../namespace::*)]', $ele ) as $nsNode ) { + if (!in_array($nsNode->nodeValue, $this->implicitNamespaces)) { + $this->wr(' ')->wr($nsNode->nodeName)->wr('="')->wr($nsNode->nodeValue)->wr('"'); + } + } } - return strtr($text, $replace); - } + /** + * Write the opening tag. + * + * Tags for HTML, MathML, and SVG are in the local name. Otherwise, use the + * qualified name (8.3). + * + * @param \DOMNode $ele + * The element being written. + */ + protected function openTag($ele) + { + $this->wr('<')->wr($this->traverser->isLocalElement($ele) ? 
$ele->localName : $ele->tagName); + + + $this->attrs($ele); + $this->namespaceAttrs($ele); + + + if ($this->outputMode == static::IM_IN_HTML) { + $this->wr('>'); + } // If we are not in html mode we are in SVG, MathML, or XML embedded content. + else { + if ($ele->hasChildNodes()) { + $this->wr('>'); + } // If there are no children this is self closing. + else { + $this->wr(' />'); + } + } + } + + protected function attrs($ele) + { + // FIXME: Needs support for xml, xmlns, xlink, and namespaced elements. + if (! $ele->hasAttributes()) { + return $this; + } + + // TODO: Currently, this always writes name="value", and does not do + // value-less attributes. + $map = $ele->attributes; + $len = $map->length; + for ($i = 0; $i < $len; ++ $i) { + $node = $map->item($i); + $val = $this->enc($node->value, true); + + // XXX: The spec says that we need to ensure that anything in + // the XML, XMLNS, or XLink NS's should use the canonical + // prefix. It seems that DOM does this for us already, but there + // may be exceptions. + $name = $node->name; + + // Special handling for attributes in SVG and MathML. + // Using if/elseif instead of switch because it's faster in PHP. 
+ if ($this->outputMode == static::IM_IN_SVG) { + $name = Elements::normalizeSvgAttribute($name); + } elseif ($this->outputMode == static::IM_IN_MATHML) { + $name = Elements::normalizeMathMlAttribute($name); + } + + $this->wr(' ')->wr($name); + + if ((isset($val) && $val !== '') || $this->nonBooleanAttribute($node)) { + $this->wr('="')->wr($val)->wr('"'); + } + } + } + + + protected function nonBooleanAttribute(\DOMAttr $attr) + { + $ele = $attr->ownerElement; + foreach($this->nonBooleanAttributes as $rule){ + + if(isset($rule['nodeNamespace']) && $rule['nodeNamespace']!==$ele->namespaceURI){ + continue; + } + if(isset($rule['attNamespace']) && $rule['attNamespace']!==$attr->namespaceURI){ + continue; + } + if(isset($rule['nodeName']) && !is_array($rule['nodeName']) && $rule['nodeName']!==$ele->localName){ + continue; + } + if(isset($rule['nodeName']) && is_array($rule['nodeName']) && !in_array($ele->localName, $rule['nodeName'], true)){ + continue; + } + if(isset($rule['attrName']) && !is_array($rule['attrName']) && $rule['attrName']!==$attr->localName){ + continue; + } + if(isset($rule['attrName']) && is_array($rule['attrName']) && !in_array($attr->localName, $rule['attrName'], true)){ + continue; + } + if(isset($rule['xpath'])){ + + $xp = $this->getXPath($attr); + if(isset($rule['prefixes'])){ + foreach($rule['prefixes'] as $nsPrefix => $ns){ + $xp->registerNamespace($nsPrefix, $ns); + } + } + if(!$xp->query($rule['xpath'], $attr->ownerElement)->length){ + continue; + } + } + + return true; + } + + return false; + } + + private function getXPath(\DOMNode $node){ + if(!$this->xpath){ + $this->xpath = new \DOMXPath($node->ownerDocument); + } + return $this->xpath; + } + + /** + * Write the closing tag. + * + * Tags for HTML, MathML, and SVG are in the local name. Otherwise, use the + * qualified name (8.3). + * + * @param \DOMNode $ele + * The element being written. 
+ */ + protected function closeTag($ele) + { + if ($this->outputMode == static::IM_IN_HTML || $ele->hasChildNodes()) { + $this->wr('wr($this->traverser->isLocalElement($ele) ? $ele->localName : $ele->tagName)->wr('>'); + } + } + + /** + * Write to the output. + * + * @param string $text + * The string to put into the output. + * + * @return \Masterminds\HTML5\Serializer\Traverser $this so it can be used in chaining. + */ + protected function wr($text) + { + fwrite($this->out, $text); + return $this; + } + + /** + * Write a new line character. + * + * @return \Masterminds\HTML5\Serializer\Traverser $this so it can be used in chaining. + */ + protected function nl() + { + fwrite($this->out, PHP_EOL); + return $this; + } + + /** + * Encode text. + * + * When encode is set to false, the default value, the text passed in is + * escaped per section 8.3 of the html5 spec. For details on how text is + * escaped see the escape() method. + * + * When encoding is set to true the text is converted to named character + * references where appropriate. Section 8.1.4 Character references of the + * html5 spec refers to using named character references. This is useful for + * characters that can't otherwise legally be used in the text. + * + * The named character references are listed in section 8.5. + * + * @see http://www.w3.org/TR/2013/CR-html5-20130806/syntax.html#named-character-references True encoding will turn all named character references into their entities. + * This includes such characters as +.# and many other common ones. By default + * encoding here will just escape &'<>". + * + * Note, PHP 5.4+ has better html5 encoding. + * + * @todo Use the Entities class in php 5.3 to have html5 entities. + * + * @param string $text + * text to encode. + * @param boolean $attribute + * True if we are encoding an attrubute, false otherwise + * + * @return string The encoded text. 
+ */ + protected function enc($text, $attribute = false) + { + + // Escape the text rather than convert to named character references. + if (! $this->encode) { + return $this->escape($text, $attribute); + } + + // If we are in PHP 5.4+ we can use the native html5 entity functionality to + // convert the named character references. + + if ($this->hasHTML5) { + return htmlentities($text, ENT_HTML5 | ENT_SUBSTITUTE | ENT_QUOTES, 'UTF-8', false); + } // If a version earlier than 5.4 html5 entities are not entirely handled. + // This manually handles them. + else { + return strtr($text, \Masterminds\HTML5\Serializer\HTML5Entities::$map); + } + } + + /** + * Escape test. + * + * According to the html5 spec section 8.3 Serializing HTML fragments, text + * within tags that are not style, script, xmp, iframe, noembed, and noframes + * need to be properly escaped. + * + * The & should be converted to &, no breaking space unicode characters + * converted to  , when in attribute mode the " should be converted to + * ", and when not in attribute mode the < and > should be converted to + * < and >. + * + * @see http://www.w3.org/TR/2013/CR-html5-20130806/syntax.html#escapingString + * + * @param string $text + * text to escape. + * @param boolean $attribute + * True if we are escaping an attrubute, false otherwise + */ + protected function escape($text, $attribute = false) + { + + // Not using htmlspecialchars because, while it does escaping, it doesn't + // match the requirements of section 8.5. For example, it doesn't handle + // non-breaking spaces. 
+ if ($attribute) { + $replace = array( + '"' => '"', + '&' => '&', + "\xc2\xa0" => ' ' + ); + } else { + $replace = array( + '<' => '<', + '>' => '>', + '&' => '&', + "\xc2\xa0" => ' ' + ); + } + + return strtr($text, $replace); + } } diff --git a/libraries/html5php/HTML5/Serializer/RulesInterface.php b/libraries/html5php/HTML5/Serializer/RulesInterface.php index 18ac8ca..6ef5e5e 100644 --- a/libraries/html5php/HTML5/Serializer/RulesInterface.php +++ b/libraries/html5php/HTML5/Serializer/RulesInterface.php @@ -3,100 +3,101 @@ * @file * The interface definition for Rules to generate output. */ -namespace HTML5\Serializer; +namespace Masterminds\HTML5\Serializer; /** * To create a new rule set for writing output the RulesInterface needs to be - * implemented. The resulting class can be specified in the options with the + * implemented. + * The resulting class can be specified in the options with the * key of rules. * - * For an example implementation see \HTML5\Serializer\OutputRules. + * For an example implementation see \Masterminds\HTML5\Serializer\OutputRules. */ -interface RulesInterface { +interface RulesInterface +{ - /** - * The class constructor. - * - * Note, before the rules can be used a traverser must be registered. - * - * @param mixed $output - * The output stream to write output to. - * @param array $options - * An array of options. - */ - public function __construct($output, $options = array()); + /** + * The class constructor. + * + * Note, before the rules can be used a traverser must be registered. + * + * @param mixed $output + * The output stream to write output to. + * @param array $options + * An array of options. + */ + public function __construct($output, $options = array()); - /** - * Register the traverser used in but the rules. - * - * Note, only one traverser can be used by the rules. - * - * @param \HTML5\Serializer\Traverser $traverser - * The traverser used in the rules. 
- * @return \HTML5\Serializer\RulesInterface - * $this for the current object. - */ - public function setTraverser(\HTML5\Serializer\Traverser $traverser); + /** + * Register the traverser used in but the rules. + * + * Note, only one traverser can be used by the rules. + * + * @param \Masterminds\HTML5\Serializer\Traverser $traverser + * The traverser used in the rules. + * @return \Masterminds\HTML5\Serializer\RulesInterface $this for the current object. + */ + public function setTraverser(\Masterminds\HTML5\Serializer\Traverser $traverser); - /** - * Write a document element (\DOMDocument). - * - * Instead of returning the result write it to the output stream ($output) - * that was passed into the constructor. - * - * @param \DOMDocument $dom - */ - public function document($dom); + /** + * Write a document element (\DOMDocument). + * + * Instead of returning the result write it to the output stream ($output) + * that was passed into the constructor. + * + * @param \DOMDocument $dom + */ + public function document($dom); - /** - * Write an element. - * - * Instead of returning the result write it to the output stream ($output) - * that was passed into the constructor. - * - * @param mixed $ele - */ - public function element($ele); + /** + * Write an element. + * + * Instead of returning the result write it to the output stream ($output) + * that was passed into the constructor. + * + * @param mixed $ele + */ + public function element($ele); - /** - * Write a text node. - * - * Instead of returning the result write it to the output stream ($output) - * that was passed into the constructor. - * - * @param mixed $ele - */ - public function text($ele); + /** + * Write a text node. + * + * Instead of returning the result write it to the output stream ($output) + * that was passed into the constructor. + * + * @param mixed $ele + */ + public function text($ele); - /** - * Write a CDATA node. 
- * - * Instead of returning the result write it to the output stream ($output) - * that was passed into the constructor. - * - * @param mixed $ele - */ - public function cdata($ele); + /** + * Write a CDATA node. + * + * Instead of returning the result write it to the output stream ($output) + * that was passed into the constructor. + * + * @param mixed $ele + */ + public function cdata($ele); - /** - * Write a comment node. - * - * Instead of returning the result write it to the output stream ($output) - * that was passed into the constructor. - * - * @param mixed $ele - */ - public function comment($ele); + /** + * Write a comment node. + * + * Instead of returning the result write it to the output stream ($output) + * that was passed into the constructor. + * + * @param mixed $ele + */ + public function comment($ele); - /** - * Write a processor instruction. - * - * To learn about processor instructions see \HTML5\InstructionProcessor - * - * Instead of returning the result write it to the output stream ($output) - * that was passed into the constructor. - * - * @param mixed $ele - */ - public function processorInstruction($ele); -} \ No newline at end of file + /** + * Write a processor instruction. + * + * To learn about processor instructions see \Masterminds\HTML5\InstructionProcessor + * + * Instead of returning the result write it to the output stream ($output) + * that was passed into the constructor. + * + * @param mixed $ele + */ + public function processorInstruction($ele); +} diff --git a/libraries/html5php/HTML5/Serializer/Traverser.php b/libraries/html5php/HTML5/Serializer/Traverser.php index 0794458..e910f3a 100644 --- a/libraries/html5php/HTML5/Serializer/Traverser.php +++ b/libraries/html5php/HTML5/Serializer/Traverser.php @@ -1,142 +1,150 @@ 'html', - 'http://www.w3.org/1998/Math/MathML' => 'math', - 'http://www.w3.org/2000/svg' => 'svg', - ); + /** + * Namespaces that should be treated as "local" to HTML5. 
+ */ + static $local_ns = array( + 'http://www.w3.org/1999/xhtml' => 'html', + 'http://www.w3.org/1998/Math/MathML' => 'math', + 'http://www.w3.org/2000/svg' => 'svg' + ); - protected $dom; - protected $options; - protected $encode = FALSE; - protected $rules; - protected $out; + protected $dom; - /** - * Create a traverser. - * - * @param DOMNode|DOMNodeList $dom - * The document or node to traverse. - * @param resource $out - * A stream that allows writing. The traverser will output into this - * stream. - * @param array $options - * An array or options for the traverser as key/value pairs. These include: - * - encode_entities: A bool to specify if full encding should happen for all named - * charachter references. Defaults to FALSE which escapes &'<>". - * - output_rules: The path to the class handling the output rules. - */ - public function __construct($dom, $out, RulesInterface $rules, $options = array()) { - $this->dom = $dom; - $this->out = $out; - $this->rules = $rules; - $this->options = $options; + protected $options; - $this->rules->setTraverser($this); - } + protected $encode = false; - /** - * Tell the traverser to walk the DOM. - * - * @return resource $out - * Returns the output stream. - */ - public function walk() { - - if ($this->dom instanceof \DOMDocument) { - $this->rules->document($this->dom); - } - elseif ($this->dom instanceof \DOMDocumentFragment) { - // Document fragments are a special case. Only the children need to - // be serialized. - if ($this->dom->hasChildNodes()) { - $this->children($this->dom->childNodes); - } - } - // If NodeList, loop - elseif ($this->dom instanceof \DOMNodeList) { - // If this is a NodeList of DOMDocuments this will not work. - $this->children($this->dom); - } - // Else assume this is a DOMNode-like datastructure. - else { - $this->node($this->dom); + protected $rules; + + protected $out; + + /** + * Create a traverser. + * + * @param DOMNode|DOMNodeList $dom + * The document or node to traverse. 
+ * @param resource $out + * A stream that allows writing. The traverser will output into this + * stream. + * @param array $options + * An array or options for the traverser as key/value pairs. These include: + * - encode_entities: A bool to specify if full encding should happen for all named + * charachter references. Defaults to false which escapes &'<>". + * - output_rules: The path to the class handling the output rules. + */ + public function __construct($dom, $out, RulesInterface $rules, $options = array()) + { + $this->dom = $dom; + $this->out = $out; + $this->rules = $rules; + $this->options = $options; + + $this->rules->setTraverser($this); } - return $this->out; - } + /** + * Tell the traverser to walk the DOM. + * + * @return resource $out + * Returns the output stream. + */ + public function walk() + { + if ($this->dom instanceof \DOMDocument) { + $this->rules->document($this->dom); + } elseif ($this->dom instanceof \DOMDocumentFragment) { + // Document fragments are a special case. Only the children need to + // be serialized. + if ($this->dom->hasChildNodes()) { + $this->children($this->dom->childNodes); + } + } // If NodeList, loop + elseif ($this->dom instanceof \DOMNodeList) { + // If this is a NodeList of DOMDocuments this will not work. + $this->children($this->dom); + } // Else assume this is a DOMNode-like datastructure. + else { + $this->node($this->dom); + } - /** - * Process a node in the DOM. - * - * @param mixed $node - * A node implementing \DOMNode. - */ - public function node($node) { - // A listing of types is at http://php.net/manual/en/dom.constants.php - switch ($node->nodeType) { - case XML_ELEMENT_NODE: - $this->rules->element($node); - break; - case XML_TEXT_NODE: - $this->rules->text($node); - break; - case XML_CDATA_SECTION_NODE: - $this->rules->cdata($node); - break; - // FIXME: It appears that the parser doesn't do PI's. 
- case XML_PI_NODE: - $this->rules->processorInstruction($node); - break; - case XML_COMMENT_NODE: - $this->rules->comment($node); - break; - // Currently we don't support embedding DTDs. - default: - print ''; - break; + return $this->out; } - } - /** - * Walk through all the nodes on a node list. - * - * @param \DOMNodeList $nl - * A list of child elements to walk through. - */ - public function children($nl) { - foreach ($nl as $node) { - $this->node($node); + /** + * Process a node in the DOM. + * + * @param mixed $node + * A node implementing \DOMNode. + */ + public function node($node) + { + // A listing of types is at http://php.net/manual/en/dom.constants.php + switch ($node->nodeType) { + case XML_ELEMENT_NODE: + $this->rules->element($node); + break; + case XML_TEXT_NODE: + $this->rules->text($node); + break; + case XML_CDATA_SECTION_NODE: + $this->rules->cdata($node); + break; + // FIXME: It appears that the parser doesn't do PI's. + case XML_PI_NODE: + $this->rules->processorInstruction($node); + break; + case XML_COMMENT_NODE: + $this->rules->comment($node); + break; + // Currently we don't support embedding DTDs. + default: + print ''; + break; + } } - } - /** - * Is an element local? - * - * @param mixed $ele - * An element that implement \DOMNode. - * - * @return bool - * True if local and false otherwise. - */ - public function isLocalElement($ele) { - $uri = $ele->namespaceURI; - if (empty($uri)) { - return FALSE; + /** + * Walk through all the nodes on a node list. + * + * @param \DOMNodeList $nl + * A list of child elements to walk through. + */ + public function children($nl) + { + foreach ($nl as $node) { + $this->node($node); + } + } + + /** + * Is an element local? + * + * @param mixed $ele + * An element that implement \DOMNode. + * + * @return bool True if local and false otherwise. 
+ */ + public function isLocalElement($ele) + { + $uri = $ele->namespaceURI; + if (empty($uri)) { + return false; + } + + return isset(static::$local_ns[$uri]); } - return isset(static::$local_ns[$uri]); - } } diff --git a/libraries/html5php/autoloader.php b/libraries/html5php/autoloader.php index 559f343..1976d30 100644 --- a/libraries/html5php/autoloader.php +++ b/libraries/html5php/autoloader.php @@ -23,10 +23,11 @@ class HTML5PHP_Autoloader public function autoload($class) { // Only load the class if it starts with "HTML5" - if (strpos($class, 'HTML5') !== 0) + if (strpos($class, 'Masterminds\HTML5') !== 0) { return; } + $class = substr($class, 12); //die($class); $filename = $this->path . DIRECTORY_SEPARATOR . str_replace('\\', DIRECTORY_SEPARATOR, $class) . '.php'; diff --git a/libraries/humble-http-agent/HumbleHttpAgent.php b/libraries/humble-http-agent/HumbleHttpAgent.php index 7028113..23af46f 100644 --- a/libraries/humble-http-agent/HumbleHttpAgent.php +++ b/libraries/humble-http-agent/HumbleHttpAgent.php @@ -394,7 +394,7 @@ class HumbleHttpAgent // for AJAX sites, e.g. Blogger with its dynamic views templates. // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification if (isset($this->requests[$orig]['body'])) { - $redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000)); + $redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 150000)); if ($redirectURL) { $this->redirectQueue[$orig] = $redirectURL; } @@ -515,7 +515,7 @@ class HumbleHttpAgent // for AJAX sites, e.g. Blogger with its dynamic views templates. 
// Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification if (isset($this->requests[$orig]['body'])) { - $redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000)); + $redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 150000)); if ($redirectURL) { $this->redirectQueue[$orig] = $redirectURL; } @@ -601,7 +601,7 @@ class HumbleHttpAgent // for AJAX sites, e.g. Blogger with its dynamic views templates. // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification if (isset($this->requests[$orig]['body'])) { - $redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000)); + $redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 150000)); if ($redirectURL) { $this->redirectQueue[$orig] = $redirectURL; } diff --git a/libraries/readability/Readability.php b/libraries/readability/Readability.php index fc4cce9..b99f3bf 100644 --- a/libraries/readability/Readability.php +++ b/libraries/readability/Readability.php @@ -113,19 +113,22 @@ class Readability function __construct($html, $url=null, $parser='libxml') { $this->url = $url; - /* Turn all double br's into p's */ + /* Turn all double
s into

s */ $html = preg_replace($this->regexps['replaceBrs'], '

', $html); $html = preg_replace($this->regexps['replaceFonts'], '<$1span>', $html); - $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"); if (trim($html) == '') $html = ''; if ($parser=='html5lib' || $parser=='html5php') { if (version_compare(PHP_VERSION, '5.3.0') >= 0) { - $this->dom = HTML5::loadHTML($html); + //use Masterminds\HTML5; + $html5class = 'Masterminds\HTML5'; + $html5 = new $html5class(); + $this->dom = $html5->loadHTML($html); } } if ($this->dom === null) { $this->dom = new DOMDocument(); $this->dom->preserveWhiteSpace = false; + $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"); @$this->dom->loadHTML($html); } $this->dom->registerNodeClass('DOMElement', 'JSLikeHTMLElement'); diff --git a/makefulltextfeed.php b/makefulltextfeed.php index 29642df..603aada 100644 --- a/makefulltextfeed.php +++ b/makefulltextfeed.php @@ -3,8 +3,8 @@ // Author: Keyvan Minoukadeh // Copyright (c) 2014 Keyvan Minoukadeh // License: AGPLv3 -// Version: 3.3 -// Date: 2014-05-07 +// Version: 3.4 +// Date: 2014-08-28 // More info: http://fivefilters.org/content-only/ // Help: http://help.fivefilters.org @@ -29,6 +29,7 @@ along with this program. If not, see . 
// For more request parameters, see http://help.fivefilters.org/customer/portal/articles/226660-usage error_reporting(E_ALL ^ E_NOTICE); +libxml_use_internal_errors(true); ini_set("display_errors", 1); @set_time_limit(120); @@ -82,9 +83,11 @@ function autoload($class_name) { // Language detect 'Text_LanguageDetect' => 'language-detect/LanguageDetect.php', // HTML5 PHP (can't be used unless PHP version is >= 5.3) - 'HTML5' => 'html5php/HTML5.php', + 'Masterminds\HTML5' => 'html5php/HTML5.php', // htmLawed - used if XSS filter is enabled (xss_filter) - 'htmLawed' => 'htmLawed/htmLawed.php' + 'htmLawed' => 'htmLawed/htmLawed.php', + // Disable SimplePie sanitization + 'DisableSimplePieSanitize' => 'DisableSimplePieSanitize.php' ); if (isset($mapping[$class_name])) { debug("** Loading class $class_name ({$mapping[$class_name]})"); @@ -180,19 +183,9 @@ if (strtolower(substr($url, 0, 7)) == 'feed://') { if (!preg_match('!^https?://.+!i', $url)) { $url = 'http://'.$url; } +$url = validate_url($url); +if (!$url) die('Invalid URL supplied'); -$url = filter_var($url, FILTER_SANITIZE_URL); -$test = filter_var($url, FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED); -// deal with bug http://bugs.php.net/51192 (present in PHP 5.2.13 and PHP 5.3.2) -if ($test === false) { - $test = filter_var(strtr($url, '-', '_'), FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED); -} -if ($test !== false && $test !== null && preg_match('!^https?://!', $url)) { - // all okay - unset($test); -} else { - die('Invalid URL supplied'); -} debug("Supplied URL: $url"); ///////////////////////////////// @@ -200,34 +193,19 @@ debug("Supplied URL: $url"); // (if in 'full' mode) ///////////////////////////////// if ((_FF_FTR_MODE == 'full') && isset($_REQUEST['key']) && ($key_index = array_search($_REQUEST['key'], $options->api_keys)) !== false) { - $host = $_SERVER['HTTP_HOST']; - $path = rtrim(dirname($_SERVER['SCRIPT_NAME']), '/\\'); - $_qs_url = (strtolower(substr($url, 0, 7)) == 'http://') ? 
substr($url, 7) : $url; - $redirect = 'http://'.htmlspecialchars($host.$path).'/makefulltextfeed.php?url='.urlencode($_qs_url); - $redirect .= '&key='.$key_index; - $redirect .= '&hash='.urlencode(sha1($_REQUEST['key'].$url)); - if (isset($_REQUEST['html'])) $redirect .= '&html='.urlencode($_REQUEST['html']); - if (isset($_REQUEST['max'])) $redirect .= '&max='.(int)$_REQUEST['max']; - if (isset($_REQUEST['links'])) $redirect .= '&links='.urlencode($_REQUEST['links']); - if (isset($_REQUEST['exc'])) $redirect .= '&exc='.urlencode($_REQUEST['exc']); - if (isset($_REQUEST['format'])) $redirect .= '&format='.urlencode($_REQUEST['format']); - if (isset($_REQUEST['callback'])) $redirect .= '&callback='.urlencode($_REQUEST['callback']); - if (isset($_REQUEST['l'])) $redirect .= '&l='.urlencode($_REQUEST['l']); - if (isset($_REQUEST['lang'])) $redirect .= '&lang='.urlencode($_REQUEST['lang']); - if (isset($_REQUEST['xss'])) $redirect .= '&xss'; - if (isset($_REQUEST['use_extracted_title'])) $redirect .= '&use_extracted_title'; - if (isset($_REQUEST['content'])) $redirect .= '&content='.urlencode($_REQUEST['content']); - if (isset($_REQUEST['summary'])) $redirect .= '&summary='.urlencode($_REQUEST['summary']); - if (isset($_REQUEST['debug'])) $redirect .= '&debug'; - if (isset($_REQUEST['parser'])) $redirect .= '&parser='.urlencode($_REQUEST['parser']); - if (isset($_REQUEST['proxy'])) $redirect .= '&proxy='.urlencode($_REQUEST['proxy']); - if ($debug_mode) { - debug('Redirecting to hide access key, follow URL below to continue'); - debug("Location: $redirect"); + if (isset($_REQUEST['key_redirect']) && $_REQUEST['key_redirect'] === '0') { + $_REQUEST['hash'] = sha1($_REQUEST['key'].$url); + $_REQUEST['key'] = $key_index; } else { - header("Location: $redirect"); + $redirect = get_self_url(); + if ($debug_mode) { + debug('Redirecting to hide access key, follow URL below to continue'); + debug("Location: $redirect"); + } else { + header("Location: $redirect"); + } + exit; } 
- exit; } /////////////////////////////////////////////// @@ -241,9 +219,25 @@ if (!ini_get('date.timezone') || !@date_default_timezone_set(ini_get('date.timez } /////////////////////////////////////////////// -// Check if the request is explicitly for an HTML page +// Should we treat input URL as feed or HTML? /////////////////////////////////////////////// -$html_only = (isset($_REQUEST['html']) && ($_REQUEST['html'] == '1' || $_REQUEST['html'] == 'true')); +$accept = 'auto'; +if (isset($_REQUEST['accept']) && in_array(strtolower($_REQUEST['accept']), array('html', 'feed', 'auto'))) { + $accept = strtolower($_REQUEST['accept']); +} elseif (isset($_REQUEST['html']) && ($_REQUEST['html'] == '1' || $_REQUEST['html'] == 'true')) { + $accept = 'html'; +} + +/////////////////////////////////////////////// +// User-submitted site config +/////////////////////////////////////////////// +$user_submitted_config = null; +if (isset($_REQUEST['siteconfig'])) { + $user_submitted_config = $_REQUEST['siteconfig']; + if (!$options->user_submitted_content && $user_submitted_config) { + die('User-submitted site configs are currently disabled. Please remove the siteconfig parameter.'); + } +} /////////////////////////////////////////////// // Check if valid key supplied @@ -463,8 +457,8 @@ if (isset($_REQUEST['inputhtml']) && _FF_FTR_MODE == 'simple') { ////////////////////////////////// if ($options->caching) { debug('Caching is enabled...'); - $cache_id = md5($max.$url.(int)$valid_key.$links.(int)$favour_feed_titles.(int)$options->content.(int)$options->summary. - (int)$xss_filter.(int)$exclude_on_fail.$format.$detect_language.$parser._FF_FTR_MODE); + $cache_id = md5($max.$url.(int)$valid_key.$accept.$links.(int)$favour_feed_titles.(int)$options->content.(int)$options->summary. 
+ (int)$xss_filter.(int)$exclude_on_fail.$format.$detect_language.$parser.$user_submitted_config._FF_FTR_MODE); $check_cache = true; if ($options->apc && $options->smart_cache) { apc_add("cache.$cache_id", 0, $options->cache_time*60); @@ -548,11 +542,14 @@ SiteConfig::use_apc($options->apc); $extractor->fingerprints = $options->fingerprints; $extractor->allowedParsers = $options->allowed_parsers; $extractor->parserOverride = $parser; +if ($options->user_submitted_config && $user_submitted_config) { + $extractor->setUserSubmittedConfig($user_submitted_config); +} //////////////////////////////// // Get RSS/Atom feed //////////////////////////////// -if (!$html_only) { +if ($accept !== 'html') { debug('--------'); debug("Attempting to process URL as feed"); // Send user agent header showing PHP (prevents a HTML response from feedburner) @@ -563,6 +560,9 @@ if (!$html_only) { // some feeds use the text/html content type - force_feed tells SimplePie to process anyway $feed->force_feed(true); $feed->set_file_class('SimplePie_HumbleHttpAgent'); + $feed->set_sanitize_class('DisableSimplePieSanitize'); + // need to assign this manually it seems + $feed->sanitize = new DisableSimplePieSanitize(); //$feed->set_feed_url($url); // colons appearing in the URL's path get encoded $feed->feed_url = $url; $feed->set_autodiscovery_level(SIMPLEPIE_LOCATOR_NONE); @@ -578,6 +578,8 @@ if (!$html_only) { //$feed->get_title(); if ($result && (!is_array($feed->data) || count($feed->data) == 0)) { die('Sorry, no feed items found'); + } elseif (!$result && $accept === 'feed') { + die('Sorry, couldn\'t parse as feed'); } // from now on, we'll identify ourselves as a browser $http->userAgentDefault = HumbleHttpAgent::UA_BROWSER; @@ -589,7 +591,7 @@ if (!$html_only) { // single-item feeds. 
//////////////////////////////////////////////////////////////////////////////// $isDummyFeed = false; -if ($html_only || !$result) { +if ($accept === 'html' || !$result) { debug('--------'); debug("Constructing a single-item feed from URL"); $isDummyFeed = true; @@ -627,6 +629,8 @@ if ($html_only || !$result) { //////////////////////////////////////////// $output = new FeedWriter(); if (_FF_FTR_MODE === 'simple') $output->enableSimpleJson(); +//$feed_title = $feed->get_title(); +//echo $feed_title; exit; $output->setTitle(strip_tags($feed->get_title())); $output->setDescription(strip_tags($feed->get_description())); $output->setXsl('css/feed.xsl'); // Chrome uses this, most browsers ignore it @@ -635,7 +639,9 @@ if ($ttl !== null) { $ttl = (int)$ttl[0]['data']; $output->setTtl($ttl); } -//$output->setSelf('http://'.$_SERVER['HTTP_HOST'].$_SERVER['REQUEST_URI']); +$output->setSelf(get_self_url()); +$output->setAlternate($url, 'Source URL'); +$output->setRelated('http://www.subtome.com/#/subscribe?feeds='.urlencode(get_self_url()).'&back='.urlencode(get_self_url()), 'Subscribe to feed'); $output->setLink($feed->get_link()); // Google Reader uses this for pulling in favicons if ($img_url = $feed->get_image_url()) { $output->setImage($feed->get_title(), $feed->get_link(), $img_url); @@ -656,7 +662,12 @@ foreach ($items as $key => $item) { // simplepie already sanitizes URLs so let's not do it again here. 
//$permalink = $http->validateUrl($permalink); if ($permalink) { - $urls_sanitized[] = $permalink; + if (!url_allowed($permalink)) { + debug('URL blocked, skipping...'); + $permalink = false; + } else { + $urls_sanitized[] = $permalink; + } } $urls[$key] = $permalink; } @@ -669,6 +680,7 @@ $http->fetchAll($urls_sanitized); $item_count = 0; foreach ($items as $key => $item) { + libxml_clear_errors(); debug('--------'); debug('Processing feed item '.($item_count+1)); $do_content_extraction = true; @@ -697,7 +709,10 @@ foreach ($items as $key => $item) { // errors being treated as valid responses. if ($permalink && ($response = $http->get($permalink, true)) && ($response['status_code'] < 300)) { $effective_url = $response['effective_url']; - if (!url_allowed($effective_url)) continue; + if (!url_allowed($effective_url)) { + debug('URL blocked, skipping...'); + continue; + } // check if action defined for returned Content-Type $mime_info = get_mime_action_info($response['headers']); if (isset($mime_info['action'])) { @@ -727,7 +742,7 @@ foreach ($items as $key => $item) { } // check site config for single page URL - fetch it if found $is_single_page = false; - if ($options->singlepage && ($single_page_response = getSinglePage($item, $html, $effective_url))) { + if ($options->singlepage && ($single_page_response = get_single_page($item, $html, $effective_url))) { $is_single_page = true; $effective_url = $single_page_response['effective_url']; // check if action defined for returned Content-Type @@ -765,6 +780,13 @@ foreach ($items as $key => $item) { debug("Here's the full HTML after it's been parsed by Full-Text RSS:"); die($readability->dom->saveXML($readability->dom->documentElement)); } + // is this a native ad? + if ($extract_result && $extractor->isNativeAd()) { + debug("This article appears to be a native ad"); + if (!$isDummyFeed && $options->remove_native_ads) { + continue; // skip this feed item entry + } + } $content_block = ($extract_result) ? 
$extractor->getContent() : null; $extracted_title = ($extract_result) ? $extractor->getTitle() : ''; // Deal with multi-page articles @@ -779,7 +801,7 @@ foreach ($items as $key => $item) { debug('--------'); debug('Processing next page: '.$next_page_url); // If we've got URL, resolve against $url - if ($next_page_url = makeAbsoluteStr($effective_url, $next_page_url)) { + if ($next_page_url = make_absolute_str($effective_url, $next_page_url)) { // check it's not what we have already! if (!in_array($next_page_url, $multi_page_urls)) { // it's not, so let's attempt to fetch it @@ -844,7 +866,12 @@ foreach ($items as $key => $item) { $html .= $item->get_description(); } else { $readability->clean($content_block, 'select'); - if ($options->rewrite_relative_urls) makeAbsolute($effective_url, $content_block); + if ($options->rewrite_relative_urls) { + $base_url = get_base_url($readability->dom); + if (!$base_url) $base_url = $effective_url; + // rewrite URLs + make_absolute($base_url, $content_block); + } // footnotes if (($links == 'footnotes') && (strpos($effective_url, 'wikipedia.org') === false)) { $readability->addFootnotes($content_block); @@ -987,12 +1014,16 @@ foreach ($items as $key => $item) { // add effective URL (URL after redirects) if (isset($effective_url)) { //TODO: ensure $effective_url is valid witout - sometimes it causes problems, e.g. - //http://www.siasat.pk/forum/showthread.php?108883-Pakistan-Chowk-by-Rana-Mubashir-–-25th-March-2012-Special-Program-from-Liari-(Karachi) + //http://www.siasat.pk/forum/showthread.php?108883-Pakistan-Chowk-by-Rana-Mubashir-–-25th-March-2012-Special-Program-from-Liari-(Karachi) //temporary measure: use utf8_encode() $newitem->addElement('dc:identifier', remove_url_cruft(utf8_encode($effective_url))); } else { $newitem->addElement('dc:identifier', remove_url_cruft($item->get_permalink())); } + // is this a native ad? 
+ if ($extractor->isNativeAd()) { + $newitem->addElement('dc:type', 'Native Ad'); + } // add categories if ($categories = $item->get_categories()) { @@ -1075,7 +1106,7 @@ if (!$debug_mode) { $output->generateFeed(); $output = ob_get_contents(); ob_end_clean(); - if ($html_only && $item_count == 0) { + if ($accept === 'html' && $item_count == 0) { // do not cache - in case of temporary server glitch at source URL } else { $cache = get_cache(); @@ -1092,10 +1123,77 @@ if (!$debug_mode) { // HELPER FUNCTIONS /////////////////////////////// +function get_self_url() { + global $options, $url; + $scheme = (is_ssl()) ? 'https://' : 'http://'; + $host = $_SERVER['HTTP_HOST']; + $path = rtrim(dirname($_SERVER['SCRIPT_NAME']), '/\\'); + $_qs_url = (strtolower(substr($url, 0, 7)) == 'http://') ? substr($url, 7) : $url; + $self = $scheme.htmlspecialchars($host.$path).'/makefulltextfeed.php?url='.urlencode($_qs_url); + + // hide API key if we can + if (isset($_GET['key']) && ($key_index = array_search($_GET['key'], $options->api_keys)) !== false) { + $_hash = sha1($_GET['key'].$url); + $self .= '&key='.$key_index; + $self .= '&hash='.urlencode($_hash); + } elseif(isset($_GET['key']) && isset($_GET['hash'])) { + $self .= '&key='.urlencode($_GET['key']); + $self .= '&hash='.urlencode($_GET['hash']); + } + + if (isset($_GET['html'])) $self .= '&html='.urlencode($_GET['html']); + if (isset($_GET['accept'])) $self .= '&accept='.urlencode($_GET['accept']); + if (isset($_GET['max'])) $self .= '&max='.(int)$_GET['max']; + if (isset($_GET['links'])) $self .= '&links='.urlencode($_GET['links']); + if (isset($_GET['exc'])) $self .= '&exc='.urlencode($_GET['exc']); + if (isset($_GET['format'])) $self .= '&format='.urlencode($_GET['format']); + if (isset($_GET['callback'])) $self .= '&callback='.urlencode($_GET['callback']); + if (isset($_GET['l'])) $self .= '&l='.urlencode($_GET['l']); + if (isset($_GET['lang'])) $self .= '&lang='.urlencode($_GET['lang']); + if (isset($_GET['xss'])) $self 
.= '&xss'; + if (isset($_GET['use_extracted_title'])) $self .= '&use_extracted_title'; + if (isset($_GET['content'])) $self .= '&content='.urlencode($_GET['content']); + if (isset($_GET['summary'])) $self .= '&summary='.urlencode($_GET['summary']); + if (isset($_GET['debug'])) $self .= '&debug'; + if (isset($_GET['parser'])) $self .= '&parser='.urlencode($_GET['parser']); + if (isset($_GET['proxy'])) $self .= '&proxy='.urlencode($_GET['proxy']); + if (isset($_GET['siteconfig'])) $self .= '&siteconfig='.urlencode($_GET['siteconfig']); + return $self; +} + +function validate_url($url) { + $url = filter_var($url, FILTER_SANITIZE_URL); + $test = filter_var($url, FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED); + // deal with bug http://bugs.php.net/51192 (present in PHP 5.2.13 and PHP 5.3.2) + if ($test === false) { + $test = filter_var(strtr($url, '-', '_'), FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED); + } + if ($test !== false && $test !== null && preg_match('!^https?://!i', $url)) { + return $url; + } else { + return false; + } +} + +function get_base_url($dom) { + $xpath = new DOMXPath($dom); + return @$xpath->evaluate('string(//head/base/@href)', $dom); +} + +function is_ssl() { + if (isset($_SERVER['HTTPS']) && ($_SERVER['HTTPS'] != '') && ($_SERVER['HTTPS'] != 'off')) { + return true; + } elseif (isset($_SERVER['HTTP_X_FORWARDED_PROTO']) && $_SERVER['HTTP_X_FORWARDED_PROTO'] == 'https') { + return true; + } else { + return false; + } +} + // Adapted from WordPress // http://core.trac.wordpress.org/browser/tags/3.5.1/wp-includes/formatting.php#L2173 function get_excerpt($text, $num_words=55, $more=null) { - if (null === $more) $more = '…'; + if (null === $more) $more = '…'; $text = strip_tags($text); //TODO: Check if word count is based on single characters (East Asian characters) /* @@ -1183,9 +1281,10 @@ function convert_to_utf8($html, $header=null) { } } } - if (isset($encoding)) $encoding = trim($encoding); - // trim is important here! 
- if (!$encoding || (strtolower($encoding) == 'iso-8859-1')) { + if (isset($encoding)) $encoding = strtolower(trim($encoding)); + // fix bad encoding values + if ($encoding === 'iso-8850-1') $encoding = 'iso-8859-1'; + if (!$encoding || ($encoding === 'iso-8859-1')) { // replace MS Word smart qutoes $trans = array(); $trans[chr(130)] = '‚'; // Single Low-9 Quotation Mark @@ -1219,7 +1318,7 @@ function convert_to_utf8($html, $header=null) { $encoding = 'utf-8'; } else { debug('Character encoding: '.$encoding); - if (strtolower($encoding) != 'utf-8') { + if ($encoding !== 'utf-8') { debug('Converting to UTF-8'); $html = SimplePie_Misc::change_encoding($html, $encoding, 'utf-8'); } @@ -1228,7 +1327,7 @@ function convert_to_utf8($html, $header=null) { return $html; } -function makeAbsolute($base, $elem) { +function make_absolute($base, $elem) { $base = new SimplePie_IRI($base); // remove '//' in URL path (used to prevent URLs from resolving properly) // TODO: check if this is still the case @@ -1238,12 +1337,12 @@ function makeAbsolute($base, $elem) { for ($i = $elems->length-1; $i >= 0; $i--) { $e = $elems->item($i); //$e->parentNode->replaceChild($articleContent->ownerDocument->createTextNode($e->textContent), $e); - makeAbsoluteAttr($base, $e, $attr); + make_absolute_attr($base, $e, $attr); } - if (strtolower($elem->tagName) == $tag) makeAbsoluteAttr($base, $elem, $attr); + if (strtolower($elem->tagName) == $tag) make_absolute_attr($base, $elem, $attr); } } -function makeAbsoluteAttr($base, $e, $attr) { +function make_absolute_attr($base, $e, $attr) { if ($e->hasAttribute($attr)) { // Trim leading and trailing white space. I don't really like this but // unfortunately it does appear on some sites. e.g. 
@@ -1256,7 +1355,7 @@ function makeAbsoluteAttr($base, $e, $attr) { } } } -function makeAbsoluteStr($base, $url) { +function make_absolute_str($base, $url) { $base = new SimplePie_IRI($base); // remove '//' in URL path (causes URLs not to resolve properly) if (isset($base->path)) $base->path = preg_replace('!//+!', '/', $base->path); @@ -1271,7 +1370,7 @@ function makeAbsoluteStr($base, $url) { } } // returns single page response, or false if not found -function getSinglePage($item, $html, $url) { +function get_single_page($item, $html, $url) { global $http, $extractor; debug('Looking for site config files to see if single page link exists'); $site_config = $extractor->buildSiteConfig($url, $html); @@ -1308,7 +1407,7 @@ function getSinglePage($item, $html, $url) { } } // If we've got URL, resolve against $url - if (isset($single_page_url) && ($single_page_url = makeAbsoluteStr($url, $single_page_url))) { + if (isset($single_page_url) && ($single_page_url = make_absolute_str($url, $single_page_url))) { // check it's not what we have already! if ($single_page_url != $url) { // it's not, so let's try to fetch it... diff --git a/manifest.yml b/manifest.yml deleted file mode 100644 index 0834cba..0000000 --- a/manifest.yml +++ /dev/null @@ -1,16 +0,0 @@ -# This file is only used when deploying Full-Text RSS to AppFog. -# See http://help.fivefilters.org/customer/portal/articles/1143210-hosting ---- -applications: - .: -# name: full-text-rss - framework: - name: php - info: - mem: 512M - description: PHP Application - exec: - infra: aws -# url: ${name}.${target-base} - mem: 512M - instances: 1