Full-Text RSS 3.4

This commit is contained in:
FiveFilters.org 2015-06-14 02:03:20 +02:00
parent cfe4c012ef
commit e7753953f6
39 changed files with 8217 additions and 7499 deletions

View File

@ -60,6 +60,7 @@ tpl_header('Edit site patterns');
$version = file_get_contents('../site_config/standard/version.txt'); $version = file_get_contents('../site_config/standard/version.txt');
function filter_only_text($filename) { function filter_only_text($filename) {
if ($filename === 'version.txt') return false;
return (strtolower(substr($filename, -4)) == '.txt'); return (strtolower(substr($filename, -4)) == '.txt');
} }
function is_valid_hostname($host) { function is_valid_hostname($host) {

View File

@ -3,7 +3,7 @@
// Author: Keyvan Minoukadeh // Author: Keyvan Minoukadeh
// Copyright (c) 2014 Keyvan Minoukadeh // Copyright (c) 2014 Keyvan Minoukadeh
// License: AGPLv3 // License: AGPLv3
// Date: 2013-05-02 // Date: 2014-08-19
// More info: http://fivefilters.org/content-only/ // More info: http://fivefilters.org/content-only/
// Help: http://help.fivefilters.org // Help: http://help.fivefilters.org
@ -36,6 +36,8 @@ ini_set("display_errors", 1);
//////////////////////////////// ////////////////////////////////
$admin_page = 'update'; $admin_page = 'update';
require_once('../config.php'); require_once('../config.php');
require_once('../libraries/humble-http-agent/HumbleHttpAgent.php');
require_once('../libraries/humble-http-agent/CookieJar.php');
require_once 'template.php'; require_once 'template.php';
tpl_header('Update site patterns'); tpl_header('Update site patterns');
@ -129,18 +131,21 @@ if ($_REQUEST['key'] !== $admin_hash) {
// Check for updates // Check for updates
////////////////////////////////// //////////////////////////////////
//$ff_version = @file_get_contents('http://fivefilters.org/content-only/site_config/standard/version.txt'); //$ff_version = @file_get_contents('http://fivefilters.org/content-only/site_config/standard/version.txt');
$_context = stream_context_create(array('http' => array('user_agent' => 'PHP/5.4'))); $http = new HumbleHttpAgent();
$latest_info_json = @file_get_contents('https://api.github.com/repos/fivefilters/ftr-site-config', false, $_context); $latest_info_json = $http->get('https://api.github.com/repos/fivefilters/ftr-site-config');
//$_context = stream_context_create(array('http' => array('user_agent' => 'PHP/5.5'), 'ssl'=>array('verify_peer'=>false)));
//$latest_info_json = file_get_contents('https://api.github.com/repos/fivefilters/ftr-site-config', false, $_context);
if (!$latest_info_json) { if (!$latest_info_json) {
println("Sorry, couldn't get info on latest site config files. Please try again later or contact us."); println("Sorry, couldn't get info on latest site config files. Please try again later or contact us.");
exit; exit;
} }
$latest_info_json = $latest_info_json['body'];
$latest_info_json = @json_decode($latest_info_json); $latest_info_json = @json_decode($latest_info_json);
if (!is_object($latest_info_json)) { if (!is_object($latest_info_json)) {
println("Sorry, couldn't parse JSON from GitHub. Please try again later or contact us."); println("Sorry, couldn't parse JSON from GitHub. Please try again later or contact us.");
exit; exit;
} }
$ff_version = $latest_info_json->updated_at; $ff_version = $latest_info_json->pushed_at;
if ($version == $ff_version) { if ($version == $ff_version) {
die('Your site config files are up to date! If you have trouble extracting from a particular site, please email us: help@fivefilters.org'); die('Your site config files are up to date! If you have trouble extracting from a particular site, please email us: help@fivefilters.org');
} else { } else {
@ -166,8 +171,15 @@ if (file_exists($tmp_old_local_dir)) {
$standard_local_dir = '../site_config/standard/'; $standard_local_dir = '../site_config/standard/';
//@copy($latest_remote, $tmp_latest_local); //@copy($latest_remote, $tmp_latest_local);
//copy() does not appear to fill $http_response_header in certain environments //copy() does not appear to fill $http_response_header in certain environments
@file_put_contents($tmp_latest_local, @file_get_contents($latest_remote)); //@file_put_contents($tmp_latest_local, @file_get_contents($latest_remote, false, $_context));
$headers = implode("\n", $http_response_header); $latest_remote_response = $http->get($latest_remote);
if (!is_array($latest_remote_response)) {
println("Sorry, something went wrong. Please contact us if the problem persists.");
exit;
}
@file_put_contents($tmp_latest_local, $latest_remote_response['body']);
//$headers = implode("\n", $http_response_header);
$headers = $latest_remote_response['headers'];
//var_dump($headers); exit; //var_dump($headers); exit;
if ((strpos($headers, 'HTTP/1.0 200') === false) && (strpos($headers, 'HTTP/1.1 200') === false)) { if ((strpos($headers, 'HTTP/1.0 200') === false) && (strpos($headers, 'HTTP/1.1 200') === false)) {
println("Sorry, something went wrong. Please contact us if the problem persists."); println("Sorry, something went wrong. Please contact us if the problem persists.");

1
cache/index.php vendored
View File

@ -1,3 +1,2 @@
<?php <?php
// this is here to prevent directory listing over the web // this is here to prevent directory listing over the web
?>

2
cache/rss-with-key/index.php vendored Normal file
View File

@ -0,0 +1,2 @@
<?php
// this is here to prevent directory listing over the web

2
cache/rss/index.php vendored Normal file
View File

@ -0,0 +1,2 @@
<?php
// this is here to prevent directory listing over the web

View File

@ -2,6 +2,24 @@ FiveFilters.org: Full-Text RSS
http://fivefilters.org/content-only/ http://fivefilters.org/content-only/
CHANGELOG CHANGELOG
------------------------------------ ------------------------------------
3.4 (2014-09-08)
- New request parameter: siteconfig lets you submit extraction rules directly in request
- New request parameter: accept=(auto|feed|html) determines what we'll accept as a response (deprecates html=1 parameter)
- New request parameter: key_redirect=0 to prevent HTTP redirect to hide API key
- Site config files can now contain native_ad_clue: [xpath] to check for elements which signify that the article is a native ad
- New config option: remove_native_ads - set to true and when we notice native ads (see above) we'll remove them from the output (only when processing feeds, doesn't affect output when input URL points to an HTML page).
- Feed output will include <dc:type>Native Ad</dc:type> for articles which appear to be native ads.
- New config option: user_submitted_config to determine whether siteconfig parameter is enabled or not
- Feed output now includes <atom:link rel="self"...> with URL of the generated feed
- Feed output now includes <atom:link rel="alternate"...> with URL of the original (input) URL
- Feed output now includes <atom:link rel="related"...> with URL to subscribe to the generated feed (using subtome.com)
- Feed preview stylesheet (feed.xsl) now presents a subscribe to feed link
- Fixed character encoding issue for certain texts
- Fixed character encoding issue for certain characters in HTML5 parsing mode
- Use base element, if present in HTML, when rewriting URLs
- HTML5-PHP library updated
- Other minor fixes/improvements
3.3 (2014-05-13) 3.3 (2014-05-13)
- Content extractor now looks for Schema.org articleBody elements - Content extractor now looks for Schema.org articleBody elements
- New endpoint extract.php for developers looking for simpler JSON results (no RSS as input/output) - New endpoint extract.php for developers looking for simpler JSON results (no RSS as input/output)

View File

@ -187,11 +187,28 @@ $options->keep_enclosures = true;
// Values will be placed inside the <dc:language> element inside each <item> element // Values will be placed inside the <dc:language> element inside each <item> element
// Possible values: // Possible values:
// * Ignore language: 0 // * Ignore language: 0
// * Use article/feed metadata (e.g. HTML lang attribute): 1 (default) // * Use article/feed metadata (e.g. HTML lang attribute): 1
// * As above, but guess if not present: 2 // * As above, but guess if not present: 2
// * Always guess: 3 // * Always guess: 3
// * User decides: 'user' (value of 0-3 can be passed in querystring: e.g. &lang=2) // * User decides: 'user' (value of 0-3 can be passed in querystring: e.g. &lang=2, &lang=1 will be default if nothing supplied)
$options->detect_language = 1; $options->detect_language = 'user';
// Allow user-submitted site config in request
// ---------------
// If enabled, a user can submit site config rules directly in the request
// using the siteconfig request parameter. Disabled (false) by default.
$options->user_submitted_config = false;
// Remove items identified as native ads?
// ---------------
// Many news sites now carry native advertising - articles which have been
// paid for by a corporation to promote their brand or product.
// Full-Text RSS can identify such articles in certain sites. If an article
// is identified as being a native ad, we'll add a <dc:type>Native Ad</dc:type>
// element to the item. But you can also request that such ads be removed from
// the output altogether. To do so, set the option below to true.
// Note: this only has effect when the input URL is a feed, not a web page.
$options->remove_native_ads = false;
///////////////////////////////////////////////// /////////////////////////////////////////////////
/// RESTRICT ACCESS ///////////////////////////// /// RESTRICT ACCESS /////////////////////////////
@ -213,6 +230,7 @@ $options->admin_credentials = array('username'=>'admin', 'password'=>'');
// List of URLs (or parts of a URL) which the service will accept. // List of URLs (or parts of a URL) which the service will accept.
// If the list is empty, all URLs (except those specified in the blocked list below) // If the list is empty, all URLs (except those specified in the blocked list below)
// will be permitted. // will be permitted.
// Note: for feeds, this option applies to both feed URLs and item URLs within those feeds.
// Empty: array(); // Empty: array();
// Non-empty example: array('example.com', 'anothersite.org'); // Non-empty example: array('example.com', 'anothersite.org');
$options->allowed_urls = array(); $options->allowed_urls = array();
@ -220,7 +238,8 @@ $options->allowed_urls = array();
// URLs to block // URLs to block
// ---------------------- // ----------------------
// List of URLs (or parts of a URL) which the service will not accept. // List of URLs (or parts of a URL) which the service will not accept.
// Note: this list is ignored if allowed_urls is not empty // Note: this list is ignored if allowed_urls is not empty.
// Note: for feeds, this option applies to both feed URLs and item URLs within those feeds.
$options->blocked_urls = array(); $options->blocked_urls = array();
// Key holder(s) only? // Key holder(s) only?
@ -231,22 +250,6 @@ $options->blocked_urls = array();
// key is provided. // key is provided.
$options->key_required = false; $options->key_required = false;
// Favour item titles in feed
// ----------------------
// By default, when processing feeds, we assume item titles in the feed
// have not been truncated. So after processing web pages, the extracted titles
// are not used in the generated feed. If you prefer to have extracted titles in
// the feed you can either set this to false, in which case we will always favour
// extracted titles. Alternatively, if set to 'user' (default) we'll use the
// extracted title if you pass '&use_extracted_title' in the querystring.
// Possible values:
// * Favour feed titles: true
// * Favour extracted titles: false
// * Favour feed titles with user override: 'user' (default)
// Note: this has no effect when the input URL is to a web page - in these cases
// we always use the extracted title in the generated feed.
$options->favour_feed_titles = 'user';
// Access keys (password protected access) // Access keys (password protected access)
// ------------------------------------ // ------------------------------------
// NOTE: You do not need an API key from fivefilters.org to run your own // NOTE: You do not need an API key from fivefilters.org to run your own
@ -307,6 +310,22 @@ $options->max_entries_with_key = 10;
// false - disabled // false - disabled
$options->xss_filter = 'user'; $options->xss_filter = 'user';
// Favour item titles in feed
// ----------------------
// By default, when processing feeds, we assume item titles in the feed
// have not been truncated. So after processing web pages, the extracted titles
// are not used in the generated feed. If you prefer to have extracted titles in
// the feed you can either set this to false, in which case we will always favour
// extracted titles. Alternatively, if set to 'user' (default) we'll use the
// extracted title if you pass '&use_extracted_title' in the querystring.
// Possible values:
// * Favour feed titles: true
// * Favour extracted titles: false
// * Favour feed titles with user override: 'user' (default)
// Note: this has no effect when the input URL is to a web page - in these cases
// we always use the extracted title in the generated feed.
$options->favour_feed_titles = 'user';
// Allowed HTML parsers // Allowed HTML parsers
// ---------------------- // ----------------------
// Full-Text RSS attempts to use PHP's libxml extension to process HTML. // Full-Text RSS attempts to use PHP's libxml extension to process HTML.
@ -481,7 +500,7 @@ $options->cache_cleanup = 100;
/// DO NOT CHANGE ANYTHING BELOW THIS /////////// /// DO NOT CHANGE ANYTHING BELOW THIS ///////////
///////////////////////////////////////////////// /////////////////////////////////////////////////
if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '3.3'); if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '3.4');
if (basename(__FILE__) == 'config.php') { if (basename(__FILE__) == 'config.php') {
if (file_exists(dirname(__FILE__).'/custom_config.php')) { if (file_exists(dirname(__FILE__).'/custom_config.php')) {

View File

@ -1,7 +1,8 @@
<?xml version="1.0" encoding="utf-8"?> <?xml version="1.0" encoding="utf-8"?>
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:content="http://purl.org/rss/1.0/modules/content/"> <xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:atom="http://www.w3.org/2005/Atom">
<xsl:output method="html" /> <xsl:output method="html" />
<xsl:variable name="title" select="/rss/channel/title"/> <xsl:variable name="title" select="/rss/channel/title"/>
<xsl:variable name="subscribe" select="/rss/channel/atom:link[@rel='related']/@href"/>
<xsl:template match="/"> <xsl:template match="/">
<html> <html>
<head> <head>
@ -11,7 +12,7 @@
<body> <body>
<div id="explanation"> <div id="explanation">
<h1><xsl:value-of select="$title"/> <span class="small"> (full-text feed)</span></h1> <h1><xsl:value-of select="$title"/> <span class="small"> (full-text feed)</span></h1>
<p>You are viewing an auto-generated full-text <acronym title="Really Simple Syndication">RSS</acronym> feed. RSS feeds allow you to stay up to date with the latest news and features you want from websites. To subscribe to it, you will need a News Reader or other similar device.</p> <p>You are viewing an auto-generated full-text <acronym title="Really Simple Syndication">RSS</acronym> feed. RSS feeds allow you to stay up to date with the latest news and features you want from websites.<br /><a href="{$subscribe}">Subscribe to this feed.</a></p>
<p>Below is the latest content available from this feed.</p> <p>Below is the latest content available from this feed.</p>
</div> </div>

View File

@ -45,7 +45,7 @@ HTTP/1.0 200 OK
define('_FF_FTR_MODE', 'simple'); define('_FF_FTR_MODE', 'simple');
// Don't process URL as feed // Don't process URL as feed
$_POST['html'] = '1'; $_POST['accept'] = 'html';
// JSON output only // JSON output only
$_POST['format'] = 'json'; $_POST['format'] = 'json';
// Enable excerpts // Enable excerpts

View File

@ -316,6 +316,12 @@ if (!defined('_FF_FTR_INDEX')) {
<td>The default parser is libxml as it's the fastest. HTML5-PHP is an HTML5 parser implemented in PHP. It's slower than libxml, but can often produce better results. You can request HTML5-PHP be used as the parser in a site-specific config file (to ensure it gets used for all URLs for that site), or explicitly via this request parameter.</td> <td>The default parser is libxml as it's the fastest. HTML5-PHP is an HTML5 parser implemented in PHP. It's slower than libxml, but can often produce better results. You can request HTML5-PHP be used as the parser in a site-specific config file (to ensure it gets used for all URLs for that site), or explicitly via this request parameter.</td>
</tr> </tr>
<tr>
<td>siteconfig</td>
<td>string</td>
<td>Site-specific extraction rules are usually stored in text files in the site_config folder. You can also submit <a href="http://help.fivefilters.org/customer/portal/articles/223153-site-patterns">extraction rules</a> directly in your request using this parameter.</td>
</tr>
<tr> <tr>
<td>proxy</td> <td>proxy</td>
<td><tt>0</tt>, <tt>1</tt>, string (proxy name)</td> <td><tt>0</tt>, <tt>1</tt>, string (proxy name)</td>
@ -393,11 +399,11 @@ if (!defined('_FF_FTR_INDEX')) {
</tr> </tr>
<tr> <tr>
<td>html</td> <td>accept</td>
<td><tt>0</tt> (default), <tt>1</tt></td> <td><tt>auto</tt> (default), <tt>feed</tt>, <tt>html</tt></td>
<td><p>Treat input source as HTML (or parse-as-html-first mode). To enable, pass html=1 in the querystring. If enabled, Full-Text RSS will not attempt to parse the response as a feed. This increases performance slightly and should be used if you know that the URL is not a feed.</p> <td><p>Tell Full-Text RSS what it should expect when fetching the input URL. By default Full-Text RSS tries to guess whether the response is a feed or regular HTML page. It's a good idea to be explicit by passing the appropriate type in this parameter. This is useful if, for example, a feed stops working and begins to return HTML or redirects to an HTML page as a result of site changes. In such a scenario, if you've been explicit about the URL being a feed, Full-Text RSS will not parse HTML returned in response. If you pass accept=html (previously html=1), Full-Text RSS will not attempt to parse the response as a feed. This increases performance slightly and should be used if you know that the URL is not a feed.</p>
<p>Note: If excluded, or set to 0, Full-Text RSS first tries to parse the server's response as a feed, and only if it fails to parse as a feed will it revert to HTML parsing. In the default parse-as-feed-first mode, Full-Text RSS will identify itself as PHP first and only if a valid feed is returned will it identify itself as a browser in subsequent requests to fetch the feed items. In parse-as-html-first mode, Full-Text RSS will identify itself as a browser from the very first request.</p></td> <p>Note: If excluded, or set to <tt>auto</tt>, Full-Text RSS first tries to parse the server's response as a feed, and only if it fails to parse as a feed will it revert to HTML parsing. In the default parse-as-feed-first mode, Full-Text RSS will identify itself as PHP first and only if a valid feed is returned will it identify itself as a browser in subsequent requests to fetch the feed items. In parse-as-html mode, Full-Text RSS will identify itself as a browser from the very first request.</p></td>
</tr> </tr>
<tr> <tr>
@ -445,6 +451,12 @@ if (!defined('_FF_FTR_INDEX')) {
<td>The default parser is libxml as it's the fastest. HTML5-PHP is an HTML5 parser implemented in PHP. It's slower than libxml, but can often produce better results. You can request HTML5-PHP be used as the parser in a site-specific config file (to ensure it gets used for all URLs for that site), or explicitly via this request parameter.</td> <td>The default parser is libxml as it's the fastest. HTML5-PHP is an HTML5 parser implemented in PHP. It's slower than libxml, but can often produce better results. You can request HTML5-PHP be used as the parser in a site-specific config file (to ensure it gets used for all URLs for that site), or explicitly via this request parameter.</td>
</tr> </tr>
<tr>
<td>siteconfig</td>
<td>string</td>
<td>Site-specific extraction rules are usually stored in text files in the site_config folder. You can also submit <a href="http://help.fivefilters.org/customer/portal/articles/223153-site-patterns">extraction rules</a> directly in your request using this parameter.</td>
</tr>
<tr> <tr>
<td>proxy</td> <td>proxy</td>
<td><tt>0</tt>, <tt>1</tt>, string (proxy name)</td> <td><tt>0</tt>, <tt>1</tt>, string (proxy name)</td>
@ -504,7 +516,7 @@ if (!defined('_FF_FTR_INDEX')) {
<tr> <tr>
<td>key</td> <td>key</td>
<td>string or number</td> <td>string or number</td>
<td><p>This parameter has two functions.</p><p>If you're calling Full-Text RSS programmatically, it's better to use this parameter to provide the API key index number together with the hash parameter (see below) so that the actual API key does not get sent in the HTTP request.</p><p>If you pass the actual API key in this parameter, the hash parameter is not required. If you pass the actual API key to makefulltextfeed.php, Full-Text RSS will find the index number and generate the hash value automatically and redirect to a new URL to hide the API key. If you'd like to link to a generated feed publicly while protecting your API key, make sure you copy and paste the URL that results after the redirect.</p><p>If you've configured Full-Text RSS to require a key, an invalid key will result in an error message.</p></td> <td><p>This parameter has two functions.</p><p>If you're calling Full-Text RSS programmatically, it's better to use this parameter to provide the API key index number together with the hash parameter (see below) so that the actual API key does not get sent in the HTTP request.</p><p>If you pass the actual API key in this parameter, the hash parameter is not required. If you pass the actual API key, Full-Text RSS will find the index number and generate the hash value automatically and redirect to a new URL to hide the API key. If you'd like to link to a generated feed publicly while protecting your API key, make sure you copy and paste the URL that results after the redirect.</p><p>If you've configured Full-Text RSS to require a key, an invalid key will result in an error message.</p></td>
</tr> </tr>
<tr> <tr>
@ -513,6 +525,11 @@ if (!defined('_FF_FTR_INDEX')) {
<td>A SHA-1 hash value of the API key (actual key, not index number) and the URL supplied in the <tt>url</tt> parameter, concatenated. This parameter must be passed along with the API key's index number using the <tt>key</tt> parameter (see above). In PHP, for example: <tt>$hash = sha1($api_key.$url);</tt></td> <td>A SHA-1 hash value of the API key (actual key, not index number) and the URL supplied in the <tt>url</tt> parameter, concatenated. This parameter must be passed along with the API key's index number using the <tt>key</tt> parameter (see above). In PHP, for example: <tt>$hash = sha1($api_key.$url);</tt></td>
</tr> </tr>
<tr>
<td>key_redirect</td>
<td>0 or 1 (default)</td>
<td><p>When supplying the API key with the <tt>key</tt> parameter, Full-Text RSS will generate a new URL and issue an HTTP redirect to the new URL to hide the API key (see description above). If you'd like to avoid an HTTP redirect, you can pass 0 in this parameter. We do not recommend you subscribe to feeds generated in this way.</p></td>
</tr>
</tbody> </tbody>
</table> </table>

View File

@ -0,0 +1,6 @@
<?php
/**
 * Sanitizer replacement that performs no sanitizing at all.
 *
 * Overrides the sanitize() method inherited from SimplePie_Sanitize so that
 * whatever data is handed in comes back exactly as it was received.
 */
class DisableSimplePieSanitize extends SimplePie_Sanitize
{
    /**
     * Return the supplied data untouched.
     *
     * @param mixed  $data data that would otherwise be sanitized
     * @param mixed  $type unused in this override
     * @param string $base unused in this override
     * @return mixed $data, unchanged
     */
    public function sanitize($data, $type, $base = '')
    {
        return $data;
    }
}

View File

@ -33,7 +33,9 @@ class ContentExtractor
); );
protected $html; protected $html;
protected $config; protected $config;
protected $userSubmittedConfig;
protected $title; protected $title;
protected $nativeAd = false;
protected $author = array(); protected $author = array();
protected $language; protected $language;
protected $date; protected $date;
@ -65,10 +67,12 @@ class ContentExtractor
} }
public function reset() { public function reset() {
// we do not reset $this->userSubmittedConfig (it gets reused)
$this->html = null; $this->html = null;
$this->readability = null; $this->readability = null;
$this->config = null; $this->config = null;
$this->title = null; $this->title = null;
$this->nativeAd = false;
$this->body = null; $this->body = null;
$this->author = array(); $this->author = array();
$this->language = null; $this->language = null;
@ -156,7 +160,17 @@ class ContentExtractor
// but it has problems of its own which we try to avoid with this option. // but it has problems of its own which we try to avoid with this option.
public function process($html, $url, $smart_tidy=true) { public function process($html, $url, $smart_tidy=true) {
$this->reset(); $this->reset();
// use user submitted config and merge it with regular one
if (isset($this->userSubmittedConfig)) {
$this->debug('Using user-submitted site config');
$this->config = $this->userSubmittedConfig;
if ($this->config->autodetect_on_failure()) {
$this->debug('Merging user-submitted site config with site config files associated with this URL and/or content');
$this->config->append($this->buildSiteConfig($url, $html));
}
} else {
$this->config = $this->buildSiteConfig($url, $html); $this->config = $this->buildSiteConfig($url, $html);
}
// do string replacements // do string replacements
if (!empty($this->config->find_string)) { if (!empty($this->config->find_string)) {
@ -225,6 +239,15 @@ class ContentExtractor
} }
} }
// check if this is a native ad
foreach ($this->config->native_ad_clue as $pattern) {
$elems = @$xpath->evaluate($pattern, $this->readability->dom);
if ($elems instanceof DOMNodeList && $elems->length > 0) {
$this->nativeAd = true;
break;
}
}
// try to get title // try to get title
foreach ($this->config->title as $pattern) { foreach ($this->config->title as $pattern) {
// $this->debug("Trying $pattern"); // $this->debug("Trying $pattern");
@ -758,10 +781,18 @@ class ContentExtractor
return false; return false;
} }
/**
 * Parse a site config supplied as text and remember it on this extractor.
 *
 * The text is handed to SiteConfig::build_from_string() and the result is
 * stored in $this->userSubmittedConfig.
 *
 * @param string $config_string site config rules as text
 * @return void
 */
public function setUserSubmittedConfig($config_string)
{
    $parsed = SiteConfig::build_from_string($config_string);
    $this->userSubmittedConfig = $parsed;
}
public function getContent() { public function getContent() {
return $this->body; return $this->body;
} }
/**
 * Report the native-ad flag currently held by this extractor.
 *
 * @return bool the value of $this->nativeAd
 */
public function isNativeAd()
{
    return $this->nativeAd;
}
public function getTitle() { public function getTitle() {
return $this->title; return $this->title;
} }

View File

@ -35,6 +35,9 @@ class SiteConfig
// Strip images which contain these strings (0 or more) in the src attribute // Strip images which contain these strings (0 or more) in the src attribute
public $strip_image_src = array(); public $strip_image_src = array();
// Mark article as a native ad if any of these expressions match (0 or more xpath expressions)
public $native_ad_clue = array();
// Additional HTTP headers to send // Additional HTTP headers to send
// NOT YET USED // NOT YET USED
public $http_header = array(); public $http_header = array();
@ -182,7 +185,7 @@ class SiteConfig
public function append(SiteConfig $newconfig) { public function append(SiteConfig $newconfig) {
// check for commands where we accept multiple statements (no test_url) // check for commands where we accept multiple statements (no test_url)
foreach (array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header') as $var) { foreach (array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'native_ad_clue', 'http_header') as $var) {
// append array elements for this config variable from $newconfig to this config // append array elements for this config variable from $newconfig to this config
//$this->$var = $this->$var + $newconfig->$var; //$this->$var = $this->$var + $newconfig->$var;
$this->$var = array_unique(array_merge($this->$var, $newconfig->$var)); $this->$var = array_unique(array_merge($this->$var, $newconfig->$var));
@ -323,6 +326,11 @@ class SiteConfig
} }
} }
/**
 * Build a site config from config text held in a single string.
 *
 * The string is split on "\n" and the resulting array of lines is passed
 * straight to build_from_array().
 *
 * @param string $string site config text
 * @return mixed whatever self::build_from_array() returns for those lines
 */
public static function build_from_string($string)
{
    return self::build_from_array(explode("\n", $string));
}
public static function build_from_array(array $lines) { public static function build_from_array(array $lines) {
$config = new SiteConfig(); $config = new SiteConfig();
foreach ($lines as $line) { foreach ($lines as $line) {
@ -340,7 +348,7 @@ class SiteConfig
if ($command == '' || $val == '') continue; if ($command == '' || $val == '') continue;
// check for commands where we accept multiple statements // check for commands where we accept multiple statements
if (in_array($command, array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header', 'test_url', 'find_string', 'replace_string'))) { if (in_array($command, array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'native_ad_clue', 'http_header', 'test_url', 'find_string', 'replace_string'))) {
array_push($config->$command, $val); array_push($config->$command, $val);
// check for single statement commands that evaluate to true or false // check for single statement commands that evaluate to true or false
} elseif (in_array($command, array('tidy', 'prune', 'autodetect_on_failure'))) { } elseif (in_array($command, array('tidy', 'prune', 'autodetect_on_failure'))) {

View File

@ -19,6 +19,8 @@ define('JSONP', 3, true);
class FeedWriter class FeedWriter
{ {
private $self = null; // self URL - http://feed2.w3.org/docs/warning/MissingAtomSelfLink.html private $self = null; // self URL - http://feed2.w3.org/docs/warning/MissingAtomSelfLink.html
private $alternate = array(); // alternate URL and title
private $related = array(); // related URL and title
private $hubs = array(); // PubSubHubbub hubs private $hubs = array(); // PubSubHubbub hubs
private $channels = array(); // Collection of channel elements private $channels = array(); // Collection of channel elements
private $items = array(); // Collection of items as object of FeedItem class. private $items = array(); // Collection of items as object of FeedItem class.
@ -240,9 +242,35 @@ define('JSONP', 3, true);
* @param string URL * @param string URL
* @return void * @return void
*/ */
public function setSelf($self) public function setSelf($url)
{ {
$this->self = $self; $this->self = $url;
}
/**
 * Set alternate URL
 *
 * Keeps the URL and its title together in $this->alternate as an array
 * with the keys 'url' and 'title', replacing any value set earlier.
 *
 * @access public
 * @param string URL
 * @param string title
 * @return void
 */
public function setAlternate($url, $title)
{
    $link = array();
    $link['url'] = $url;
    $link['title'] = $title;
    $this->alternate = $link;
}
/**
* Set related URL
*
* Stores the URL and its title together in $this->related as an array
* with the keys 'url' and 'title', replacing any value set earlier.
*
* @access public
* @param string URL
* @param string title
* @return void
*/
public function setRelated($url, $title)
{
$this->related = array('url'=>$url, 'title'=>$title);
} }
/** /**
@ -299,7 +327,7 @@ define('JSONP', 3, true);
{ {
$out = '<?xml version="1.0" encoding="utf-8"?>'."\n"; $out = '<?xml version="1.0" encoding="utf-8"?>'."\n";
if ($this->xsl) $out .= '<?xml-stylesheet type="text/xsl" href="'.htmlspecialchars($this->xsl).'"?>' . PHP_EOL; if ($this->xsl) $out .= '<?xml-stylesheet type="text/xsl" href="'.htmlspecialchars($this->xsl).'"?>' . PHP_EOL;
$out .= '<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:media="http://search.yahoo.com/mrss/">' . PHP_EOL; $out .= '<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:media="http://search.yahoo.com/mrss/">' . PHP_EOL;
echo $out; echo $out;
} }
elseif ($this->version == JSON || $this->version == JSONP) elseif ($this->version == JSON || $this->version == JSONP)
@ -342,7 +370,7 @@ define('JSONP', 3, true);
{ {
foreach ($attributes as $key => $value) foreach ($attributes as $key => $value)
{ {
$attrText .= " $key=\"$value\" "; $attrText .= " $key=\"".htmlspecialchars($value, ENT_COMPAT, 'UTF-8', false)."\" ";
} }
} }
$nodeText .= "<{$tagName}{$attrText}>"; $nodeText .= "<{$tagName}{$attrText}>";
@ -356,7 +384,7 @@ define('JSONP', 3, true);
else else
{ {
//$nodeText .= (in_array($tagName, $this->CDATAEncoding))? $tagContent : htmlentities($tagContent); //$nodeText .= (in_array($tagName, $this->CDATAEncoding))? $tagContent : htmlentities($tagContent);
$nodeText .= htmlspecialchars($tagContent); $nodeText .= htmlspecialchars($tagContent, ENT_COMPAT, 'UTF-8', false);
} }
//$nodeText .= (in_array($tagName, $this->CDATAEncoding))? "]]></$tagName>" : "</$tagName>"; //$nodeText .= (in_array($tagName, $this->CDATAEncoding))? "]]></$tagName>" : "</$tagName>";
$nodeText .= "</$tagName>"; $nodeText .= "</$tagName>";
@ -408,12 +436,20 @@ define('JSONP', 3, true);
// add hubs // add hubs
foreach ($this->hubs as $hub) { foreach ($this->hubs as $hub) {
//echo $this->makeNode('link', '', array('rel'=>'hub', 'href'=>$hub, 'xmlns'=>'http://www.w3.org/2005/Atom')); //echo $this->makeNode('link', '', array('rel'=>'hub', 'href'=>$hub, 'xmlns'=>'http://www.w3.org/2005/Atom'));
echo '<link rel="hub" href="'.htmlspecialchars($hub).'" xmlns="http://www.w3.org/2005/Atom" />' . PHP_EOL; echo '<atom:link rel="hub" href="'.htmlspecialchars($hub).'" />' . PHP_EOL;
} }
// add self // add self
if (isset($this->self)) { if (isset($this->self)) {
//echo $this->makeNode('link', '', array('rel'=>'self', 'href'=>$this->self, 'xmlns'=>'http://www.w3.org/2005/Atom')); //echo $this->makeNode('link', '', array('rel'=>'self', 'href'=>$this->self, 'xmlns'=>'http://www.w3.org/2005/Atom'));
echo '<link rel="self" href="'.htmlspecialchars($this->self).'" xmlns="http://www.w3.org/2005/Atom" />' . PHP_EOL; echo '<atom:link rel="self" href="'.htmlspecialchars($this->self).'" />' . PHP_EOL;
}
// add alternate
if (isset($this->alternate)) {
echo '<atom:link rel="alternate" title="'.htmlspecialchars($this->alternate['title']).'" href="'.htmlspecialchars($this->alternate['url']).'" />' . PHP_EOL;
}
// add related
if (isset($this->related)) {
echo '<atom:link rel="related" title="'.htmlspecialchars($this->related['title']).'" href="'.htmlspecialchars($this->related['url']).'" />' . PHP_EOL;
} }
//Print Items of channel //Print Items of channel
foreach ($this->channels as $key => $value) foreach ($this->channels as $key => $value)

View File

@ -1,14 +1,13 @@
<?php <?php
/** namespace Masterminds;
* The main HTML5 front end.
*/ use Masterminds\HTML5\Parser\FileInputStream;
use HTML5\Parser\StringInputStream; use Masterminds\HTML5\Parser\StringInputStream;
use HTML5\Parser\FileInputStream; use Masterminds\HTML5\Parser\DOMTreeBuilder;
use HTML5\Parser\Scanner; use Masterminds\HTML5\Parser\Scanner;
use HTML5\Parser\Tokenizer; use Masterminds\HTML5\Parser\Tokenizer;
use HTML5\Parser\DOMTreeBuilder; use Masterminds\HTML5\Serializer\OutputRules;
use HTML5\Serializer\OutputRules; use Masterminds\HTML5\Serializer\Traverser;
use HTML5\Serializer\Traverser;
/** /**
* This class offers convenience methods for parsing and serializing HTML5. * This class offers convenience methods for parsing and serializing HTML5.
@ -17,18 +16,36 @@ use HTML5\Serializer\Traverser;
* *
* EXPERIMENTAL. This may change or be completely replaced. * EXPERIMENTAL. This may change or be completely replaced.
*/ */
class HTML5 { class HTML5
{
/** /**
* Global options for the parser and serializer. * Global options for the parser and serializer.
*
* @var array * @var array
*/ */
public static $options = array( protected $options = array(
// If the serializer should encode all entities. // If the serializer should encode all entities.
'encode_entities' => FALSE, 'encode_entities' => false
); );
protected $errors = array();
/**
 * Create the front end and apply caller-supplied options.
 *
 * @param array $options Option overrides, combined with the values already
 *                       held in $this->options through array_merge().
 */
public function __construct(array $options = array())
{
    $effective = array_merge($this->options, $options);
    $this->options = $effective;
}
/**
* Get the options currently held by this instance.
*
* The constructor merges its $options argument into this array, so the
* result is not necessarily the untouched defaults.
*
* @return array The instance's option array.
*/
public function getOptions()
{
return $this->options;
}
/** /**
* Load and parse an HTML file. * Load and parse an HTML file.
* *
@ -43,20 +60,20 @@ class HTML5 {
* The path to the file to parse. If this is a resource, it is * The path to the file to parse. If this is a resource, it is
* assumed to be an open stream whose pointer is set to the first * assumed to be an open stream whose pointer is set to the first
* byte of input. * byte of input.
* @return \DOMDocument * @return \DOMDocument A DOM document. These object type is defined by the libxml
* A DOM document. These object type is defined by the libxml
* library, and should have been included with your version of PHP. * library, and should have been included with your version of PHP.
*/ */
public static function load($file) { public function load($file)
{
// Handle the case where file is a resource. // Handle the case where file is a resource.
if (is_resource($file)) { if (is_resource($file)) {
// FIXME: We need a StreamInputStream class. // FIXME: We need a StreamInputStream class.
return static::loadHTML(stream_get_contents($file)); return $this->loadHTML(stream_get_contents($file));
} }
$input = new FileInputStream($file); $input = new FileInputStream($file);
return static::parse($input);
return $this->parse($input);
} }
/** /**
@ -67,13 +84,14 @@ class HTML5 {
* *
* @param string $string * @param string $string
* A html5 document as a string. * A html5 document as a string.
* @return \DOMDocument * @return \DOMDocument A DOM document. DOM is part of libxml, which is included with
* A DOM document. DOM is part of libxml, which is included with
* almost all distribtions of PHP. * almost all distribtions of PHP.
*/ */
public static function loadHTML($string) { public function loadHTML($string)
{
$input = new StringInputStream($string); $input = new StringInputStream($string);
return static::parse($input);
return $this->parse($input);
} }
/** /**
@ -87,12 +105,12 @@ class HTML5 {
* assumed to be an open stream whose pointer is set to the first * assumed to be an open stream whose pointer is set to the first
* byte of input. * byte of input.
* *
* @return \DOMDocument * @return \DOMDocument A DOM document. These object type is defined by the libxml
* A DOM document. These object type is defined by the libxml
* library, and should have been included with your version of PHP. * library, and should have been included with your version of PHP.
*/ */
public static function loadHTMLFile($file, $options = NULL) { public function loadHTMLFile($file)
return static::load($file, $options); {
return $this->load($file);
} }
/** /**
@ -101,13 +119,71 @@ class HTML5 {
* @param string $string * @param string $string
* The html5 fragment as a string. * The html5 fragment as a string.
* *
* @return \DOMDocumentFragment * @return \DOMDocumentFragment A DOM fragment. The DOM is part of libxml, which is included with
* A DOM fragment. The DOM is part of libxml, which is included with
* almost all distributions of PHP. * almost all distributions of PHP.
*/ */
public static function loadHTMLFragment($string) { public function loadHTMLFragment($string)
{
$input = new StringInputStream($string); $input = new StringInputStream($string);
return static::parseFragment($input);
return $this->parseFragment($input);
}
/**
* Return the errors stored on this instance.
*
* parse() and parseFragment() overwrite this list with the errors reported
* by their tree builder.
*
* @return array
*/
public function getErrors()
{
return $this->errors;
}
/**
 * Tell whether the stored error list is non-empty.
 *
 * @return bool true when at least one error is stored, false otherwise.
 */
public function hasErrors()
{
    $errorCount = count($this->errors);

    return (bool) $errorCount;
}
/**
 * Build a document from an input stream.
 *
 * Lower-level loading function: it takes an input stream rather than a
 * string, file, or resource. The stored error list is cleared before the
 * run and then replaced by the errors the tree builder reports.
 */
public function parse(\Masterminds\HTML5\Parser\InputStream $input)
{
    $this->errors = array();

    $treeBuilder = new DOMTreeBuilder(false, $this->options);
    $tokenizer = new Tokenizer(new Scanner($input), $treeBuilder);
    $tokenizer->parse();

    $this->errors = $treeBuilder->getErrors();

    return $treeBuilder->document();
}
/**
* Parse an input stream where the stream is a fragment.
*
* Lower-level loading function. This requires an input stream instead
* of a string, file, or resource.
*/
public function parseFragment(\Masterminds\HTML5\Parser\InputStream $input)
{
$events = new DOMTreeBuilder(true, $this->options);
$scanner = new Scanner($input);
$parser = new Tokenizer($scanner, $events);
$parser->parse();
$this->errors = $events->getErrors();
return $events->fragment();
} }
/** /**
@ -120,19 +196,19 @@ class HTML5 {
* @param array $options * @param array $options
* Configuration options when serializing the DOM. These include: * Configuration options when serializing the DOM. These include:
* - encode_entities: Text written to the output is escaped by default and not all * - encode_entities: Text written to the output is escaped by default and not all
* entities are encoded. If this is set to TRUE all entities will be encoded. * entities are encoded. If this is set to true all entities will be encoded.
* Defaults to FALSE. * Defaults to false.
*/ */
public static function save($dom, $file, $options = array()) { public function save($dom, $file, $options = array())
$options = $options + static::options(); {
$close = TRUE; $close = true;
if (is_resource($file)) { if (is_resource($file)) {
$stream = $file; $stream = $file;
$close = FALSE; $close = false;
} } else {
else {
$stream = fopen($file, 'w'); $stream = fopen($file, 'w');
} }
$options = array_merge($this->getOptions(), $options);
$rules = new OutputRules($stream, $options); $rules = new OutputRules($stream, $options);
$trav = new Traverser($dom, $stream, $rules, $options); $trav = new Traverser($dom, $stream, $rules, $options);
@ -151,70 +227,16 @@ class HTML5 {
* @param array $options * @param array $options
* Configuration options when serializing the DOM. These include: * Configuration options when serializing the DOM. These include:
* - encode_entities: Text written to the output is escaped by default and not all * - encode_entities: Text written to the output is escaped by default and not all
* entities are encoded. If this is set to TRUE all entities will be encoded. * entities are encoded. If this is set to true all entities will be encoded.
* Defaults to FALSE. * Defaults to false.
* *
* @return string * @return string A HTML5 documented generated from the DOM.
* A HTML5 documented generated from the DOM.
*/ */
public static function saveHTML($dom, $options = array()) { public function saveHTML($dom, $options = array())
{
$stream = fopen('php://temp', 'w'); $stream = fopen('php://temp', 'w');
static::save($dom, $stream, $options); $this->save($dom, $stream, array_merge($this->getOptions(), $options));
return stream_get_contents($stream, - 1, 0); return stream_get_contents($stream, - 1, 0);
} }
/**
* Parse an input stream.
*
* Lower-level loading function. This requires an input stream instead
* of a string, file, or resource.
*/
public static function parse(\HTML5\Parser\InputStream $input) {
$events = new DOMTreeBuilder();
$scanner = new Scanner($input);
$parser = new Tokenizer($scanner, $events);
$parser->parse();
return $events->document();
}
/**
* Parse an input stream where the stream is a fragment.
*
* Lower-level loading function. This requires an input stream instead
* of a string, file, or resource.
*/
public static function parseFragment(\HTML5\Parser\InputStream $input) {
$events = new DOMTreeBuilder(TRUE);
$scanner = new Scanner($input);
$parser = new Tokenizer($scanner, $events);
$parser->parse();
return $events->fragment();
}
/**
* Get the default options.
*
* @return array
* The default options.
*/
public static function options() {
return static::$options;
}
/**
* Set a default option.
*
* @param string $name
* The option name.
* @param mixed $value
* The option value.
*/
public static function setOption($name, $value) {
static::$options[$name] = $value;
}
} }

View File

@ -2,11 +2,12 @@
/** /**
* Provide general element functions. * Provide general element functions.
*/ */
namespace HTML5; namespace Masterminds\HTML5;
/** /**
* This class provides general information about HTML5 elements, * This class provides general information about HTML5 elements,
* including syntactic and semantic issues. Parsers and serializers can * including syntactic and semantic issues.
* Parsers and serializers can
* use this class as a reference point for information about the rules * use this class as a reference point for information about the rules
* of various HTML5 elements. * of various HTML5 elements.
* *
@ -14,22 +15,31 @@ namespace HTML5;
* naming that this could significantly shrink the size and maybe make it * naming that this could significantly shrink the size and maybe make it
* faster. See the Go teams implementation at https://code.google.com/p/go/source/browse/html/atom. * faster. See the Go teams implementation at https://code.google.com/p/go/source/browse/html/atom.
*/ */
class Elements { class Elements
{
/** Indicates an element is described in the specification. */ /**
* Indicates an element is described in the specification.
*/
const KNOWN_ELEMENT = 1; const KNOWN_ELEMENT = 1;
// From section 8.1.2: "script", "style" // From section 8.1.2: "script", "style"
// From 8.2.5.4.7 ("in body" insertion mode): "noembed", "noscript" // From 8.2.5.4.7 ("in body" insertion mode): "noembed", "noscript"
// From 8.4 "style", "xmp", "iframe", "noembed", "noframes" // From 8.4 "style", "xmp", "iframe", "noembed", "noframes"
/** Indicates the contained text should be processed as raw text. */ /**
* Indicates the contained text should be processed as raw text.
*/
const TEXT_RAW = 2; const TEXT_RAW = 2;
// From section 8.1.2: "textarea", "title" // From section 8.1.2: "textarea", "title"
/** Indicates the contained text should be processed as RCDATA. */ /**
* Indicates the contained text should be processed as RCDATA.
*/
const TEXT_RCDATA = 4; const TEXT_RCDATA = 4;
/** Indicates the tag cannot have content. */ /**
* Indicates the tag cannot have content.
*/
const VOID_TAG = 8; const VOID_TAG = 8;
// "address", "article", "aside", "blockquote", "center", "details", "dialog", "dir", "div", "dl", // "address", "article", "aside", "blockquote", "center", "details", "dialog", "dir", "div", "dl",
@ -45,16 +55,20 @@ class Elements {
*/ */
const AUTOCLOSE_P = 16; const AUTOCLOSE_P = 16;
/** Indicates that the text inside is plaintext (pre). */ /**
* Indicates that the text inside is plaintext (pre).
*/
const TEXT_PLAINTEXT = 32; const TEXT_PLAINTEXT = 32;
// See https://developer.mozilla.org/en-US/docs/HTML/Block-level_elements // See https://developer.mozilla.org/en-US/docs/HTML/Block-level_elements
/** Indicates that the tag is a block. */ /**
* Indicates that the tag is a block.
*/
const BLOCK_TAG = 64; const BLOCK_TAG = 64;
/** /**
* The HTML5 elements as defined in http://dev.w3.org/html5/markup/elements.html. * The HTML5 elements as defined in http://dev.w3.org/html5/markup/elements.html.
*
* @var array * @var array
*/ */
public static $html5 = array( public static $html5 = array(
@ -148,7 +162,7 @@ class Elements {
"source" => 9, // NORMAL | VOID_TAG "source" => 9, // NORMAL | VOID_TAG
"span" => 1, "span" => 1,
"strong" => 1, "strong" => 1,
"style" => 1, "style" => 3, // NORMAL | TEXT_RAW
"sub" => 1, "sub" => 1,
"summary" => 17, // NORMAL | AUTOCLOSE_P, "summary" => 17, // NORMAL | AUTOCLOSE_P,
"sup" => 1, "sup" => 1,
@ -175,17 +189,20 @@ class Elements {
'noframes' => 2, // RAW_TEXT 'noframes' => 2, // RAW_TEXT
'frame' => 9, // NORMAL | VOID_TAG 'frame' => 9, // NORMAL | VOID_TAG
'frameset' => 1, 'frameset' => 1,
'center' => 16, 'dir' => 16, 'listing' => 16, // AUTOCLOSE_P 'center' => 16,
'dir' => 16,
'listing' => 16, // AUTOCLOSE_P
'plaintext' => 48, // AUTOCLOSE_P | TEXT_PLAINTEXT 'plaintext' => 48, // AUTOCLOSE_P | TEXT_PLAINTEXT
'applet' => 0, 'applet' => 0,
'marquee' => 0, 'marquee' => 0,
'isindex' => 8, // VOID_TAG 'isindex' => 8, // VOID_TAG
'xmp' => 20, // AUTOCLOSE_P | VOID_TAG | RAW_TEXT 'xmp' => 20, // AUTOCLOSE_P | VOID_TAG | RAW_TEXT
'noembed' => 2, // RAW_TEXT 'noembed' => 2 // RAW_TEXT
); );
/** /**
* The MathML elements. See http://www.w3.org/wiki/MathML/Elements. * The MathML elements.
* See http://www.w3.org/wiki/MathML/Elements.
* *
* In our case we are only concerned with presentation MathML and not content * In our case we are only concerned with presentation MathML and not content
* MathML. There is a nice list of this subset at https://developer.mozilla.org/en-US/docs/MathML/Element. * MathML. There is a nice list of this subset at https://developer.mozilla.org/en-US/docs/MathML/Element.
@ -231,7 +248,7 @@ class Elements {
"mtext" => 1, "mtext" => 1,
"mtr" => 1, "mtr" => 1,
"munder" => 1, "munder" => 1,
"munderover" => 1, "munderover" => 1
); );
/** /**
@ -326,7 +343,7 @@ class Elements {
"tspan" => 1, "tspan" => 1,
"use" => 1, "use" => 1,
"view" => 1, "view" => 1,
"vkern" => 1, "vkern" => 1
); );
/** /**
@ -397,11 +414,12 @@ class Elements {
'viewtarget' => 'viewTarget', 'viewtarget' => 'viewTarget',
'xchannelselector' => 'xChannelSelector', 'xchannelselector' => 'xChannelSelector',
'ychannelselector' => 'yChannelSelector', 'ychannelselector' => 'yChannelSelector',
'zoomandpan' => 'zoomAndPan', 'zoomandpan' => 'zoomAndPan'
); );
/** /**
* Some SVG elements are case sensetitive. This map contains these. * Some SVG elements are case sensetitive.
* This map contains these.
* *
* The map contains key/value store of the name is lowercase as the keys and * The map contains key/value store of the name is lowercase as the keys and
* the correct casing as the value. * the correct casing as the value.
@ -442,7 +460,7 @@ class Elements {
'glyphref' => 'glyphRef', 'glyphref' => 'glyphRef',
'lineargradient' => 'linearGradient', 'lineargradient' => 'linearGradient',
'radialgradient' => 'radialGradient', 'radialgradient' => 'radialGradient',
'textpath' => 'textPath', 'textpath' => 'textPath'
); );
/** /**
@ -458,12 +476,12 @@ class Elements {
* The element name. * The element name.
* @param int $mask * @param int $mask
* One of the constants on this class. * One of the constants on this class.
* @return boolean * @return boolean true if the element matches the mask, false otherwise.
* TRUE if the element matches the mask, FALSE otherwise.
*/ */
public static function isA($name, $mask) { public static function isA($name, $mask)
{
if (! static::isElement($name)) { if (! static::isElement($name)) {
return FALSE; return false;
} }
return (static::element($name) & $mask) == $mask; return (static::element($name) & $mask) == $mask;
@ -475,11 +493,10 @@ class Elements {
* @param string $name * @param string $name
* The name of the element. * The name of the element.
* *
* @return bool * @return bool True if a html5 element and false otherwise.
* True if a html5 element and false otherwise.
*/ */
public static function isHtml5Element($name) { public static function isHtml5Element($name)
{
// html5 element names are case insensetitive. Forcing lowercase for the check. // html5 element names are case insensetitive. Forcing lowercase for the check.
// Do we need this check or will all data passed here already be lowercase? // Do we need this check or will all data passed here already be lowercase?
return isset(static::$html5[strtolower($name)]); return isset(static::$html5[strtolower($name)]);
@ -491,11 +508,10 @@ class Elements {
* @param string $name * @param string $name
* The name of the element. * The name of the element.
* *
* @return bool * @return bool True if a MathML name and false otherwise.
* True if a MathML name and false otherwise.
*/ */
public static function isMathMLElement($name) { public static function isMathMLElement($name)
{
// MathML is case-sensetitive unlike html5 elements. // MathML is case-sensetitive unlike html5 elements.
return isset(static::$mathml[$name]); return isset(static::$mathml[$name]);
} }
@ -506,11 +522,10 @@ class Elements {
* @param string $name * @param string $name
* The name of the element. * The name of the element.
* *
* @return boolean * @return boolean True if a SVG element and false otherise.
* True if a SVG element and false otherise.
*/ */
public static function isSvgElement($name) { public static function isSvgElement($name)
{
// SVG is case-sensetitive unlike html5 elements. // SVG is case-sensetitive unlike html5 elements.
return isset(static::$svg[$name]); return isset(static::$svg[$name]);
} }
@ -524,10 +539,10 @@ class Elements {
* @param string $name * @param string $name
* The name of the element. * The name of the element.
* *
* @return bool * @return bool True if valid and false otherwise.
* True if valid and false otherwise.
*/ */
public static function isElement($name) { public static function isElement($name)
{
return static::isHtml5Element($name) || static::isMathMLElement($name) || static::isSvgElement($name); return static::isHtml5Element($name) || static::isMathMLElement($name) || static::isSvgElement($name);
} }
@ -537,10 +552,10 @@ class Elements {
* @param string $name * @param string $name
* The name of the element. * The name of the element.
* *
* @return int * @return int The element mask.
* The element mask.
*/ */
public static function element($name) { public static function element($name)
{
if (isset(static::$html5[$name])) { if (isset(static::$html5[$name])) {
return static::$html5[$name]; return static::$html5[$name];
} }
@ -551,7 +566,7 @@ class Elements {
return static::$mathml[$name]; return static::$mathml[$name];
} }
return FALSE; return false;
} }
/** /**
@ -560,10 +575,10 @@ class Elements {
* @param string $name * @param string $name
* The name of the element. * The name of the element.
* *
* @return string * @return string The normalized form of the element name.
* The normalized form of the element name.
*/ */
public static function normalizeSvgElement($name) { public static function normalizeSvgElement($name)
{
$name = strtolower($name); $name = strtolower($name);
if (isset(static::$svgCaseSensitiveElementMap[$name])) { if (isset(static::$svgCaseSensitiveElementMap[$name])) {
$name = static::$svgCaseSensitiveElementMap[$name]; $name = static::$svgCaseSensitiveElementMap[$name];
@ -578,10 +593,10 @@ class Elements {
* @param string $name * @param string $name
* The name of the attribute. * The name of the attribute.
* *
* @return string * @return string The normalized form of the attribute name.
* The normalized form of the attribute name.
*/ */
public static function normalizeSvgAttribute($name) { public static function normalizeSvgAttribute($name)
{
$name = strtolower($name); $name = strtolower($name);
if (isset(static::$svgCaseSensitiveAttributeMap[$name])) { if (isset(static::$svgCaseSensitiveAttributeMap[$name])) {
$name = static::$svgCaseSensitiveAttributeMap[$name]; $name = static::$svgCaseSensitiveAttributeMap[$name];
@ -598,10 +613,10 @@ class Elements {
* @param string $name * @param string $name
* The name of the attribute. * The name of the attribute.
* *
* @return string * @return string The normalized form of the attribute name.
* The normalized form of the attribute name.
*/ */
public static function normalizeMathMlAttribute($name) { public static function normalizeMathMlAttribute($name)
{
$name = strtolower($name); $name = strtolower($name);
// Only one attribute has a mixed case form for MathML. // Only one attribute has a mixed case form for MathML.

View File

@ -1,7 +1,13 @@
<?php <?php
namespace HTML5; namespace Masterminds\HTML5;
/** Entity lookup tables. This class is automatically generated. */
class Entities { /**
* Entity lookup tables.
* This class is automatically generated.
*/
class Entities
{
public static $byName = array( public static $byName = array(
'Aacute' => 'Á', 'Aacute' => 'Á',
'Aacut' => 'Á', 'Aacut' => 'Á',
@ -2225,6 +2231,6 @@ class Entities {
'Zscr' => '𝒵', 'Zscr' => '𝒵',
'zscr' => '𝓏', 'zscr' => '𝓏',
'zwj' => '', 'zwj' => '',
'zwnj' => '', 'zwnj' => ''
); );
} }

View File

@ -1,8 +1,9 @@
<?php <?php
namespace HTML5; namespace Masterminds\HTML5;
/** /**
* The base exception for the HTML5 project. * The base exception for the HTML5 project.
*/ */
class Exception extends \Exception { class Exception extends \Exception
{
} }

View File

@ -2,7 +2,7 @@
/** /**
* A handler for processor instructions. * A handler for processor instructions.
*/ */
namespace HTML5; namespace Masterminds\HTML5;
/** /**
* Provide an processor to handle embedded instructions. * Provide an processor to handle embedded instructions.
@ -16,7 +16,8 @@ namespace HTML5;
* One could, for example, use this mechanism to execute well-formed PHP * One could, for example, use this mechanism to execute well-formed PHP
* code embedded inside of an HTML5 document. * code embedded inside of an HTML5 document.
*/ */
interface InstructionProcessor { interface InstructionProcessor
{
/** /**
* Process an individual processing instruction. * Process an individual processing instruction.
@ -33,8 +34,7 @@ interface InstructionProcessor {
* The instruction's name. E.g. `&lt;?php` has the name `php`. * The instruction's name. E.g. `&lt;?php` has the name `php`.
* @param string $data * @param string $data
* All of the data between the opening and closing PI marks. * All of the data between the opening and closing PI marks.
* @return DOMElement * @return DOMElement The element that should be considered "Current". This may just be
* The element that should be considered "Current". This may just be
* the element passed in, but if the processor added more elements, * the element passed in, but if the processor added more elements,
* it may choose to reset the current element to one of the elements * it may choose to reset the current element to one of the elements
* it created. (When in doubt, return the element passed in.) * it created. (When in doubt, return the element passed in.)

View File

@ -1,29 +1,36 @@
<?php <?php
namespace HTML5\Parser; namespace Masterminds\HTML5\Parser;
use \HTML5\Entities; use Masterminds\HTML5\Entities;
/** /**
* Manage entity references. * Manage entity references.
* *
* This is a simple resolver for HTML5 character reference entitites. * This is a simple resolver for HTML5 character reference entitites.
* See \HTML5\Entities for the list of supported entities. * See \Masterminds\HTML5\Entities for the list of supported entities.
*/ */
class CharacterReference { class CharacterReference
{
protected static $numeric_mask = array(0x0, 0x2FFFF, 0, 0xFFFF); protected static $numeric_mask = array(
0x0,
0x2FFFF,
0,
0xFFFF
);
/** /**
* Given a name (e.g. 'amp'), lookup the UTF-8 character ('&') * Given a name (e.g.
* 'amp'), lookup the UTF-8 character ('&')
* *
* @param string $name * @param string $name
* The name to look up. * The name to look up.
* @return string * @return string The character sequence. In UTF-8 this may be more than one byte.
* The character sequence. In UTF-8 this may be more than one byte.
*/ */
public static function lookupName($name) { public static function lookupName($name)
{
// Do we really want to return NULL here? or FFFD // Do we really want to return NULL here? or FFFD
return isset(Entities::$byName[$name]) ? Entities::$byName[$name] : NULL; return isset(Entities::$byName[$name]) ? Entities::$byName[$name] : null;
} }
/** /**
@ -32,15 +39,14 @@ class CharacterReference {
* (NOT USED ANYWHERE) * (NOT USED ANYWHERE)
*/ */
/* /*
public static function lookupCode($codePoint) { * public static function lookupCode($codePoint) { return 'POINT'; }
return 'POINT';
}
*/ */
/** /**
* Given a decimal number, return the UTF-8 character. * Given a decimal number, return the UTF-8 character.
*/ */
public static function lookupDecimal($int) { public static function lookupDecimal($int)
{
$entity = '&#' . $int . ';'; $entity = '&#' . $int . ';';
// UNTESTED: This may fail on some planes. Couldn't find full documentation // UNTESTED: This may fail on some planes. Couldn't find full documentation
// on the value of the mask array. // on the value of the mask array.
@ -50,7 +56,8 @@ class CharacterReference {
/** /**
* Given a hexidecimal number, return the UTF-8 character. * Given a hexidecimal number, return the UTF-8 character.
*/ */
public static function lookupHex($hexdec) { public static function lookupHex($hexdec)
{
return static::lookupDecimal(hexdec($hexdec)); return static::lookupDecimal(hexdec($hexdec));
} }
} }

View File

@ -1,7 +1,8 @@
<?php <?php
namespace HTML5\Parser; namespace Masterminds\HTML5\Parser;
use Masterminds\HTML5\Elements;
use HTML5\Elements;
/** /**
* Create an HTML5 DOM tree from events. * Create an HTML5 DOM tree from events.
* *
@ -20,83 +21,167 @@ use HTML5\Elements;
* re-written to accomodate this. See, for example, the Go language HTML5 * re-written to accomodate this. See, for example, the Go language HTML5
* parser. * parser.
*/ */
class DOMTreeBuilder implements EventHandler { class DOMTreeBuilder implements EventHandler
{
/**
* Defined in http://www.w3.org/TR/html51/infrastructure.html#html-namespace-0
*/
const NAMESPACE_HTML = 'http://www.w3.org/1999/xhtml';
const NAMESPACE_MATHML = 'http://www.w3.org/1998/Math/MathML';
const NAMESPACE_SVG = 'http://www.w3.org/2000/svg';
const NAMESPACE_XLINK = 'http://www.w3.org/1999/xlink';
const NAMESPACE_XML = 'http://www.w3.org/XML/1998/namespace';
const NAMESPACE_XMLNS = 'http://www.w3.org/2000/xmlns/';
/**
* Holds the HTML5 element names that cause a namespace switch
*
* @var array
*/
protected $nsRoots = array(
'html' => self::NAMESPACE_HTML,
'svg' => self::NAMESPACE_SVG,
'math' => self::NAMESPACE_MATHML
);
/**
* Holds the always-available namespaces (which do not require an XMLNS declaration).
*
* @var array
*/
protected $implicitNamespaces = array(
'xml' => self::NAMESPACE_XML,
'xmlns' => self::NAMESPACE_XMLNS,
'xlink' => self::NAMESPACE_XLINK
);
/**
* Holds a stack of currently active namespaces.
*
* @var array
*/
protected $nsStack = array();
/**
* Holds the number of namespaces declared by a node.
*
* @var array
*/
protected $pushes = array();
/** /**
* Defined in 8.2.5. * Defined in 8.2.5.
*/ */
const IM_INITIAL = 0; const IM_INITIAL = 0;
const IM_BEFORE_HTML = 1; const IM_BEFORE_HTML = 1;
const IM_BEFORE_HEAD = 2; const IM_BEFORE_HEAD = 2;
const IM_IN_HEAD = 3; const IM_IN_HEAD = 3;
const IM_IN_HEAD_NOSCRIPT = 4; const IM_IN_HEAD_NOSCRIPT = 4;
const IM_AFTER_HEAD = 5; const IM_AFTER_HEAD = 5;
const IM_IN_BODY = 6; const IM_IN_BODY = 6;
const IM_TEXT = 7; const IM_TEXT = 7;
const IM_IN_TABLE = 8; const IM_IN_TABLE = 8;
const IM_IN_TABLE_TEXT = 9; const IM_IN_TABLE_TEXT = 9;
const IM_IN_CAPTION = 10; const IM_IN_CAPTION = 10;
const IM_IN_COLUMN_GROUP = 11; const IM_IN_COLUMN_GROUP = 11;
const IM_IN_TABLE_BODY = 12; const IM_IN_TABLE_BODY = 12;
const IM_IN_ROW = 13; const IM_IN_ROW = 13;
const IM_IN_CELL = 14; const IM_IN_CELL = 14;
const IM_IN_SELECT = 15; const IM_IN_SELECT = 15;
const IM_IN_SELECT_IN_TABLE = 16; const IM_IN_SELECT_IN_TABLE = 16;
const IM_AFTER_BODY = 17; const IM_AFTER_BODY = 17;
const IM_IN_FRAMESET = 18; const IM_IN_FRAMESET = 18;
const IM_AFTER_FRAMESET = 19; const IM_AFTER_FRAMESET = 19;
const IM_AFTER_AFTER_BODY = 20; const IM_AFTER_AFTER_BODY = 20;
const IM_AFTER_AFTER_FRAMESET = 21; const IM_AFTER_AFTER_FRAMESET = 21;
const IM_IN_SVG = 22; const IM_IN_SVG = 22;
const IM_IN_MATHML = 23; const IM_IN_MATHML = 23;
protected $options = array();
protected $stack = array(); protected $stack = array();
protected $current; // Pointer in the tag hierarchy. protected $current; // Pointer in the tag hierarchy.
protected $doc; protected $doc;
protected $frag;
protected $processor; protected $processor;
protected $insertMode = 0; protected $insertMode = 0;
/** /**
* Quirks mode is enabled by default. Any document that is missing the * Quirks mode is enabled by default.
* Any document that is missing the
* DT will be considered to be in quirks mode. * DT will be considered to be in quirks mode.
*/ */
protected $quirks = TRUE; protected $quirks = true;
public $isFragment = FALSE; protected $errors = array();
public function __construct($isFragment = false, array $options = array())
{
$this->options = $options;
public function __construct($isFragment = FALSE) {
$impl = new \DOMImplementation(); $impl = new \DOMImplementation();
// XXX: // XXX:
// Create the doctype. For now, we are always creating HTML5 // Create the doctype. For now, we are always creating HTML5
// documents, and attempting to up-convert any older DTDs to HTML5. // documents, and attempting to up-convert any older DTDs to HTML5.
$dt = $impl->createDocumentType('html'); $dt = $impl->createDocumentType('html');
// $this->doc = \DOMImplementation::createDocument(NULL, 'html', $dt); // $this->doc = \DOMImplementation::createDocument(NULL, 'html', $dt);
$this->doc = $impl->createDocument(NULL, NULL, $dt); $this->doc = $impl->createDocument(null, null, $dt);
$this->doc->errors = array(); $this->errors = array();
// $this->current = $this->doc->documentElement;
$this->current = $this->doc; // ->documentElement; $this->current = $this->doc; // ->documentElement;
// Create a rules engine for tags. // Create a rules engine for tags.
$this->rules = new TreeBuildingRules($this->doc); $this->rules = new TreeBuildingRules($this->doc);
// Fill $nsStack with the default HTML5 namespaces, plus the "implicitNamespaces" array taken from $options
array_unshift($this->nsStack, (isset($this->options["implicitNamespaces"]) ? $this->options["implicitNamespaces"] : array()) + array(
'' => self::NAMESPACE_HTML
) + $this->implicitNamespaces);
if ($isFragment) { if ($isFragment) {
$this->isFragment = TRUE;
$this->insertMode = static::IM_IN_BODY; $this->insertMode = static::IM_IN_BODY;
$ele = $this->doc->createElement('html'); $this->frag = $this->doc->createDocumentFragment();
$this->doc->appendChild($ele); $this->current = $this->frag;
$this->current = $ele;
} }
} }
/** /**
* Get the document. * Get the document.
*/ */
public function document() { public function document()
{
return $this->doc; return $this->doc;
} }
@ -110,24 +195,9 @@ class DOMTreeBuilder implements EventHandler {
* *
* @return \DOMDocumentFragment * @return \DOMDocumentFragment
*/ */
public function fragment() { public function fragment()
$append = $this->doc->documentElement->childNodes; {
$frag = $this->doc->createDocumentFragment(); return $this->frag;
// appendChild() modifies the DOMNodeList, so we
// have to buffer up the items first, then use the
// array buffer and loop twice.
$buffer = array();
foreach ($append as $node) {
$buffer[] = $node;
}
foreach ($buffer as $node) {
$frag->appendChild($node);
}
$frag->errors = $this->doc->errors;
return $frag;
} }
/** /**
@ -136,17 +206,20 @@ class DOMTreeBuilder implements EventHandler {
* This is used for handling Processor Instructions as they are * This is used for handling Processor Instructions as they are
* inserted. If omitted, PI's are inserted directly into the DOM tree. * inserted. If omitted, PI's are inserted directly into the DOM tree.
*/ */
public function setInstructionProcessor(\HTML5\InstructionProcessor $proc) { public function setInstructionProcessor(\Masterminds\HTML5\InstructionProcessor $proc)
{
$this->processor = $proc; $this->processor = $proc;
} }
public function doctype($name, $idType = 0, $id = NULL, $quirks = FALSE) { public function doctype($name, $idType = 0, $id = null, $quirks = false)
{
// This is used solely for setting quirks mode. Currently we don't // This is used solely for setting quirks mode. Currently we don't
// try to preserve the inbound DT. We convert it to HTML5. // try to preserve the inbound DT. We convert it to HTML5.
$this->quirks = $quirks; $this->quirks = $quirks;
if ($this->insertMode > static::IM_INITIAL) { if ($this->insertMode > static::IM_INITIAL) {
$this->parseError("Illegal placement of DOCTYPE tag. Ignoring: " . $name); $this->parseError("Illegal placement of DOCTYPE tag. Ignoring: " . $name);
return; return;
} }
@ -156,23 +229,23 @@ class DOMTreeBuilder implements EventHandler {
/** /**
* Process the start tag. * Process the start tag.
* *
* @todo * @todo - XMLNS namespace handling (we need to parse, even if it's not valid)
* - XMLNS namespace handling (we need to parse, even if it's not valid)
* - XLink, MathML and SVG namespace handling * - XLink, MathML and SVG namespace handling
* - Omission rules: 8.1.2.4 Optional tags * - Omission rules: 8.1.2.4 Optional tags
*/ */
public function startTag($name, $attributes = array(), $selfClosing = FALSE) { public function startTag($name, $attributes = array(), $selfClosing = false)
{
// fprintf(STDOUT, $name); // fprintf(STDOUT, $name);
$lname = $this->normalizeTagName($name); $lname = $this->normalizeTagName($name);
// Make sure we have an html element. // Make sure we have an html element.
if (!$this->doc->documentElement && $name !== 'html') { if (! $this->doc->documentElement && $name !== 'html' && ! $this->frag) {
$this->startTag('html'); $this->startTag('html');
} }
// Set quirks mode if we're at IM_INITIAL with no doctype. // Set quirks mode if we're at IM_INITIAL with no doctype.
if ($this->insertMode == static::IM_INITIAL) { if ($this->insertMode == static::IM_INITIAL) {
$this->quirks = TRUE; $this->quirks = true;
$this->parseError("No DOCTYPE specified."); $this->parseError("No DOCTYPE specified.");
} }
@ -182,7 +255,6 @@ class DOMTreeBuilder implements EventHandler {
$name = 'img'; $name = 'img';
} }
// Autoclose p tags where appropriate. // Autoclose p tags where appropriate.
if ($this->insertMode >= static::IM_IN_BODY && Elements::isA($name, Elements::AUTOCLOSE_P)) { if ($this->insertMode >= static::IM_IN_BODY && Elements::isA($name, Elements::AUTOCLOSE_P)) {
$this->autoclose('p'); $this->autoclose('p');
@ -196,8 +268,7 @@ class DOMTreeBuilder implements EventHandler {
case 'head': case 'head':
if ($this->insertMode > static::IM_BEFORE_HEAD) { if ($this->insertMode > static::IM_BEFORE_HEAD) {
$this->parseError("Unexpected head tag outside of head context."); $this->parseError("Unexpected head tag outside of head context.");
} } else {
else {
$this->insertMode = static::IM_IN_HEAD; $this->insertMode = static::IM_IN_HEAD;
} }
break; break;
@ -215,7 +286,6 @@ class DOMTreeBuilder implements EventHandler {
$this->insertMode = static::IM_IN_HEAD_NOSCRIPT; $this->insertMode = static::IM_IN_HEAD_NOSCRIPT;
} }
break; break;
} }
// Special case handling for SVG. // Special case handling for SVG.
@ -223,42 +293,110 @@ class DOMTreeBuilder implements EventHandler {
$lname = Elements::normalizeSvgElement($lname); $lname = Elements::normalizeSvgElement($lname);
} }
$pushes = 0;
// when we find a tag that appears inside $nsRoots, we have to switch the default namespace
if (isset($this->nsRoots[$lname]) && $this->nsStack[0][''] !== $this->nsRoots[$lname]) {
array_unshift($this->nsStack, array(
'' => $this->nsRoots[$lname]
) + $this->nsStack[0]);
$pushes ++;
}
$needsWorkaround = false;
if (isset($this->options["xmlNamespaces"]) && $this->options["xmlNamespaces"]) {
// when xmlNamespaces is true and we find an 'xmlns' or 'xmlns:*' attribute, we should add a new item to the $nsStack
foreach ($attributes as $aName => $aVal) {
if ($aName === 'xmlns') {
$needsWorkaround = $aVal;
array_unshift($this->nsStack, array(
'' => $aVal
) + $this->nsStack[0]);
$pushes ++;
} elseif ((($pos = strpos($aName, ':')) ? substr($aName, 0, $pos) : '') === 'xmlns') {
array_unshift($this->nsStack, array(
substr($aName, $pos + 1) => $aVal
) + $this->nsStack[0]);
$pushes ++;
}
}
}
try { try {
$prefix = ($pos = strpos($lname, ':')) ? substr($lname, 0, $pos) : '';
if ($needsWorkaround!==false) {
$xml = "<$lname xmlns=\"$needsWorkaround\" ".(strlen($prefix) && isset($this->nsStack[0][$prefix])?("xmlns:$prefix=\"".$this->nsStack[0][$prefix]."\""):"")."/>";
$frag = new \DOMDocument('1.0', 'UTF-8');
$frag->loadXML($xml);
$ele = $this->doc->importNode($frag->documentElement, true);
} else {
if (isset($this->nsStack[0][$prefix])) {
$ele = $this->doc->createElementNS($this->nsStack[0][$prefix], $lname);
} else {
$ele = $this->doc->createElement($lname); $ele = $this->doc->createElement($lname);
} }
catch(\DOMException $e) { }
} catch (\DOMException $e) {
$this->parseError("Illegal tag name: <$lname>. Replaced with <invalid>."); $this->parseError("Illegal tag name: <$lname>. Replaced with <invalid>.");
$ele = $this->doc->createElement('invalid'); $ele = $this->doc->createElement('invalid');
} }
// When we add some namespaces, we have to track them. Later, when endTag() is invoked, we have to remove them.
// When we are on a void tag, we do not need to care about namespace nesting.
if ($pushes > 0 && !Elements::isA($name, Elements::VOID_TAG)) {
// PHP tends to free the memory used by DOM,
// to avoid spl_object_hash collisions we have to prevent garbage collection of $ele by storing it into $pushes
// see https://bugs.php.net/bug.php?id=67459
$this->pushes[spl_object_hash($ele)] = array($pushes, $ele);
// SEE https://github.com/facebook/hhvm/issues/2962
if (defined('HHVM_VERSION')) {
$ele->setAttribute('html5-php-fake-id-attribute', spl_object_hash($ele));
}
}
foreach ($attributes as $aName => $aVal) { foreach ($attributes as $aName => $aVal) {
// xmlns attributes can't be set
if ($aName === 'xmlns') {
continue;
}
if ($this->insertMode == static::IM_IN_SVG) { if ($this->insertMode == static::IM_IN_SVG) {
$aName = Elements::normalizeSvgAttribute($aName); $aName = Elements::normalizeSvgAttribute($aName);
} } elseif ($this->insertMode == static::IM_IN_MATHML) {
elseif ($this->insertMode == static::IM_IN_MATHML) {
$aName = Elements::normalizeMathMlAttribute($aName); $aName = Elements::normalizeMathMlAttribute($aName);
} }
try { try {
$prefix = ($pos = strpos($aName, ':')) ? substr($aName, 0, $pos) : false;
if ($prefix==='xmlns') {
$ele->setAttributeNs(self::NAMESPACE_XMLNS, $aName, $aVal);
} elseif ($prefix!==false && isset($this->nsStack[0][$prefix])) {
$ele->setAttributeNs($this->nsStack[0][$prefix], $aName, $aVal);
} else {
$ele->setAttribute($aName, $aVal); $ele->setAttribute($aName, $aVal);
} }
catch(\DOMException $e) { } catch (\DOMException $e) {
$this->parseError("Illegal attribute name for tag $name. Ignoring: $aName"); $this->parseError("Illegal attribute name for tag $name. Ignoring: $aName");
continue; continue;
} }
// This is necessary on a non-DTD schema, like HTML5. // This is necessary on a non-DTD schema, like HTML5.
if ($aName == 'id') { if ($aName == 'id') {
$ele->setIdAttribute('id', TRUE); $ele->setIdAttribute('id', true);
} }
} }
// Some elements have special processing rules. Handle those separately. // Some elements have special processing rules. Handle those separately.
if ($this->rules->hasRules($name)) { if ($this->rules->hasRules($name)) {
$this->current = $this->rules->evaluate($ele, $this->current); $this->current = $this->rules->evaluate($ele, $this->current);
} } // Otherwise, it's a standard element.
// Otherwise, it's a standard element.
else { else {
$this->current->appendChild($ele); $this->current->appendChild($ele);
@ -274,12 +412,21 @@ class DOMTreeBuilder implements EventHandler {
$this->insertMode = static::IM_IN_BODY; $this->insertMode = static::IM_IN_BODY;
} }
// When we are on a void tag, we do not need to care about namespace nesting,
// but we have to remove the namespaces pushed to $nsStack.
if ($pushes > 0 && Elements::isA($name, Elements::VOID_TAG)) {
// remove the namespaces defined by the current node
for ($i = 0; $i < $pushes; $i ++) {
array_shift($this->nsStack);
}
}
// Return the element mask, which the tokenizer can then use to set // Return the element mask, which the tokenizer can then use to set
// various processing rules. // various processing rules.
return Elements::element($name); return Elements::element($name);
} }
public function endTag($name) { public function endTag($name)
{
$lname = $this->normalizeTagName($name); $lname = $this->normalizeTagName($name);
// Ignore closing tags for unary elements. // Ignore closing tags for unary elements.
@ -289,15 +436,22 @@ class DOMTreeBuilder implements EventHandler {
if ($this->insertMode <= static::IM_BEFORE_HTML) { if ($this->insertMode <= static::IM_BEFORE_HTML) {
// 8.2.5.4.2 // 8.2.5.4.2
if (in_array($name, array('html', 'br', 'head', 'title'))) { if (in_array($name, array(
'html',
'br',
'head',
'title'
))) {
$this->startTag('html'); $this->startTag('html');
$this->endTag($name); $this->endTag($name);
$this->insertMode = static::IM_BEFORE_HEAD; $this->insertMode = static::IM_BEFORE_HEAD;
return; return;
} }
// Ignore the tag. // Ignore the tag.
$this->parseError("Illegal closing tag at global scope."); $this->parseError("Illegal closing tag at global scope.");
return; return;
} }
@ -306,6 +460,13 @@ class DOMTreeBuilder implements EventHandler {
$lname = Elements::normalizeSvgElement($lname); $lname = Elements::normalizeSvgElement($lname);
} }
// See https://github.com/facebook/hhvm/issues/2962
if (defined('HHVM_VERSION') && ($cid = $this->current->getAttribute('html5-php-fake-id-attribute'))) {
$this->current->removeAttribute('html5-php-fake-id-attribute');
} else {
$cid = spl_object_hash($this->current);
}
// XXX: Not sure whether we need this anymore. // XXX: Not sure whether we need this anymore.
// if ($name != $lname) { // if ($name != $lname) {
// return $this->quirksTreeResolver($lname); // return $this->quirksTreeResolver($lname);
@ -317,7 +478,14 @@ class DOMTreeBuilder implements EventHandler {
return; return;
} }
//$this->current = $this->current->parentNode; // remove the namespaces defined by the current node
if (isset($this->pushes[$cid])) {
for ($i = 0; $i < $this->pushes[$cid][0]; $i ++) {
array_shift($this->nsStack);
}
unset($this->pushes[$cid]);
}
if (! $this->autoclose($lname)) { if (! $this->autoclose($lname)) {
$this->parseError('Could not find closing tag for ' . $lname); $this->parseError('Could not find closing tag for ' . $lname);
} }
@ -337,13 +505,15 @@ class DOMTreeBuilder implements EventHandler {
} }
} }
public function comment($cdata) { public function comment($cdata)
{
// TODO: Need to handle case where comment appears outside of the HTML tag. // TODO: Need to handle case where comment appears outside of the HTML tag.
$node = $this->doc->createComment($cdata); $node = $this->doc->createComment($cdata);
$this->current->appendChild($node); $this->current->appendChild($node);
} }
public function text($data) { public function text($data)
{
// XXX: Hmmm.... should we really be this strict? // XXX: Hmmm.... should we really be this strict?
if ($this->insertMode < static::IM_IN_HEAD) { if ($this->insertMode < static::IM_IN_HEAD) {
// Per '8.2.5.4.3 The "before head" insertion mode' the characters // Per '8.2.5.4.3 The "before head" insertion mode' the characters
@ -355,6 +525,7 @@ class DOMTreeBuilder implements EventHandler {
// fprintf(STDOUT, "Unexpected insert mode: %d", $this->insertMode); // fprintf(STDOUT, "Unexpected insert mode: %d", $this->insertMode);
$this->parseError("Unexpected text. Ignoring: " . $dataTmp); $this->parseError("Unexpected text. Ignoring: " . $dataTmp);
} }
return; return;
} }
// fprintf(STDOUT, "Appending text %s.", $data); // fprintf(STDOUT, "Appending text %s.", $data);
@ -362,20 +533,29 @@ class DOMTreeBuilder implements EventHandler {
$this->current->appendChild($node); $this->current->appendChild($node);
} }
public function eof() { public function eof()
{
// If the $current isn't the $root, do we need to do anything? // If the $current isn't the $root, do we need to do anything?
} }
public function parseError($msg, $line = 0, $col = 0) { public function parseError($msg, $line = 0, $col = 0)
$this->doc->errors[] = sprintf("Line %d, Col %d: %s", $line, $col, $msg); {
$this->errors[] = sprintf("Line %d, Col %d: %s", $line, $col, $msg);
} }
public function cdata($data) { public function getErrors()
{
return $this->errors;
}
public function cdata($data)
{
$node = $this->doc->createCDATASection($data); $node = $this->doc->createCDATASection($data);
$this->current->appendChild($node); $this->current->appendChild($node);
} }
public function processingInstruction($name, $data = NULL) { public function processingInstruction($name, $data = null)
{
// XXX: Ignore initial XML declaration, per the spec. // XXX: Ignore initial XML declaration, per the spec.
if ($this->insertMode == static::IM_INITIAL && 'xml' == strtolower($name)) { if ($this->insertMode == static::IM_INITIAL && 'xml' == strtolower($name)) {
return; return;
@ -388,6 +568,7 @@ class DOMTreeBuilder implements EventHandler {
if (! empty($res)) { if (! empty($res)) {
$this->current = $res; $this->current = $res;
} }
return; return;
} }
@ -408,68 +589,64 @@ class DOMTreeBuilder implements EventHandler {
* *
* @param string $name * @param string $name
* The tag name. * The tag name.
* @return string * @return string The normalized tag name.
* The normalized tag name.
*/ */
protected function normalizeTagName($name) { protected function normalizeTagName($name)
/* Section 2.9 suggests that we should not do this. {
if (strpos($name, ':') !== FALSE) { /*
// We know from the grammar that there must be at least one other * Section 2.9 suggests that we should not do this. if (strpos($name, ':') !== false) { // We know from the grammar that there must be at least one other // char besides :, since : is not a legal tag start. $parts = explode(':', $name); return array_pop($parts); }
// char besides :, since : is not a legal tag start.
$parts = explode(':', $name);
return array_pop($parts);
}
*/ */
return $name; return $name;
} }
protected function quirksTreeResolver($name) { protected function quirksTreeResolver($name)
{
throw new \Exception("Not implemented."); throw new \Exception("Not implemented.");
} }
/** /**
* Automatically climb the tree and close the closest node with the matching $tag. * Automatically climb the tree and close the closest node with the matching $tag.
*/ */
protected function autoclose($tag) { protected function autoclose($tag)
{
$working = $this->current; $working = $this->current;
do { do {
if ($working->nodeType != XML_ELEMENT_NODE) { if ($working->nodeType != XML_ELEMENT_NODE) {
return FALSE; return false;
} }
if ($working->tagName == $tag) { if ($working->tagName == $tag) {
$this->current = $working->parentNode; $this->current = $working->parentNode;
return TRUE;
return true;
} }
} while ($working = $working->parentNode); } while ($working = $working->parentNode);
return FALSE; return false;
} }
/** /**
* Checks if the given tagname is an ancestor of the present candidate. * Checks if the given tagname is an ancestor of the present candidate.
* *
* If $this->current or anything above $this->current matches the given tag * If $this->current or anything above $this->current matches the given tag
* name, this returns TRUE. * name, this returns true.
*/ */
protected function isAncestor($tagname) { protected function isAncestor($tagname)
{
$candidate = $this->current; $candidate = $this->current;
while ($candidate->nodeType === XML_ELEMENT_NODE) { while ($candidate->nodeType === XML_ELEMENT_NODE) {
if ($candidate->tagName == $tagname) { if ($candidate->tagName == $tagname) {
return TRUE; return true;
} }
$candidate = $candidate->parentNode; $candidate = $candidate->parentNode;
} }
return FALSE;
return false;
} }
/** /**
* Returns TRUE if the immediate parent element is of the given tagname. * Returns true if the immediate parent element is of the given tagname.
*/ */
protected function isParent($tagname) { protected function isParent($tagname)
{
return $this->current->tagName == $tagname; return $this->current->tagName == $tagname;
} }
} }

View File

@ -1,5 +1,5 @@
<?php <?php
namespace HTML5\Parser; namespace Masterminds\HTML5\Parser;
/** /**
* Standard events for HTML5. * Standard events for HTML5.
@ -18,10 +18,15 @@ namespace HTML5\Parser;
* *
* See HTML5 spec section 8.2.4 * See HTML5 spec section 8.2.4
*/ */
interface EventHandler { interface EventHandler
{
const DOCTYPE_NONE = 0; const DOCTYPE_NONE = 0;
const DOCTYPE_PUBLIC = 1; const DOCTYPE_PUBLIC = 1;
const DOCTYPE_SYSTEM = 2; const DOCTYPE_SYSTEM = 2;
/** /**
* A doctype declaration. * A doctype declaration.
* *
@ -35,7 +40,8 @@ interface EventHandler {
* @param boolean $quirks * @param boolean $quirks
* Indicates whether the builder should enter quirks mode. * Indicates whether the builder should enter quirks mode.
*/ */
public function doctype($name, $idType = 0, $id = NULL, $quirks = FALSE); public function doctype($name, $idType = 0, $id = null, $quirks = false);
/** /**
* A start tag. * A start tag.
* *
@ -63,28 +69,32 @@ interface EventHandler {
* An array with all of the tag's attributes. * An array with all of the tag's attributes.
* @param boolean $selfClosing * @param boolean $selfClosing
* An indicator of whether or not this tag is self-closing (<foo/>) * An indicator of whether or not this tag is self-closing (<foo/>)
* @return numeric * @return numeric One of the Tokenizer::TEXTMODE_* constants.
* One of the Tokenizer::TEXTMODE_* constants.
*/ */
public function startTag($name, $attributes = array(), $selfClosing = FALSE); public function startTag($name, $attributes = array(), $selfClosing = false);
/** /**
* An end-tag. * An end-tag.
*/ */
public function endTag($name); public function endTag($name);
/** /**
* A comment section (unparsed character data). * A comment section (unparsed character data).
*/ */
public function comment($cdata); public function comment($cdata);
/** /**
* A unit of parsed character data. * A unit of parsed character data.
* *
* Entities in this text are *already decoded*. * Entities in this text are *already decoded*.
*/ */
public function text($cdata); public function text($cdata);
/** /**
* Indicates that the document has been entirely processed. * Indicates that the document has been entirely processed.
*/ */
public function eof(); public function eof();
/** /**
* Emitted when the parser encounters an error condition. * Emitted when the parser encounters an error condition.
*/ */
@ -97,6 +107,7 @@ interface EventHandler {
* The unparsed character data. * The unparsed character data.
*/ */
public function cdata($data); public function cdata($data);
/** /**
* This is a holdover from the XML spec. * This is a holdover from the XML spec.
* *
@ -107,5 +118,5 @@ interface EventHandler {
* @param string $data * @param string $data
* The unparsed data. * The unparsed data.
*/ */
public function processingInstruction($name, $data = NULL); public function processingInstruction($name, $data = null);
} }

View File

@ -1,21 +1,20 @@
<?php <?php
namespace HTML5\Parser; namespace Masterminds\HTML5\Parser;
/** /**
* The FileInputStream loads a file to be parsed. * The FileInputStream loads a file to be parsed.
* *
* @todo A buffered input stream would be useful.
*/
class FileInputStream extends StringInputStream implements InputStream {
/*
* So right now we read files into strings and then process the * So right now we read files into strings and then process the
* string. We chose to do this largely for the sake of expediency of * string. We chose to do this largely for the sake of expediency of
* development, and also because we could optimize toward processing * development, and also because we could optimize toward processing
* arbitrarily large chunks of the input. But in the future, we'd * arbitrarily large chunks of the input. But in the future, we'd
* really like to rewrite this class to efficiently handle lower level * really like to rewrite this class to efficiently handle lower level
* stream reads (and thus efficiently handle large documents). * stream reads (and thus efficiently handle large documents).
*
* @todo A buffered input stream would be useful.
*/ */
class FileInputStream extends StringInputStream implements InputStream
{
/** /**
* Load a file input stream. * Load a file input stream.
@ -23,13 +22,11 @@ class FileInputStream extends StringInputStream implements InputStream {
* @param string $data * @param string $data
* The file or url path to load. * The file or url path to load.
*/ */
function __construct($data, $encoding = 'UTF-8', $debug = '') { public function __construct($data, $encoding = 'UTF-8', $debug = '')
{
// Get the contents of the file. // Get the contents of the file.
$content = file_get_contents($data); $content = file_get_contents($data);
parent::__construct($content, $encoding, $debug); parent::__construct($content, $encoding, $debug);
} }
} }

View File

@ -1,5 +1,5 @@
<?php <?php
namespace HTML5\Parser; namespace Masterminds\HTML5\Parser;
/** /**
* Interface for stream readers. * Interface for stream readers.
@ -10,7 +10,8 @@ namespace HTML5\Parser;
* Currently provided InputStream implementations include * Currently provided InputStream implementations include
* FileInputStream and StringInputStream. * FileInputStream and StringInputStream.
*/ */
interface InputStream extends \Iterator { interface InputStream extends \Iterator
{
/** /**
* Returns the current line that is being consumed. * Returns the current line that is being consumed.
@ -26,8 +27,7 @@ interface InputStream extends \Iterator {
* *
* @TODO Move this to the scanner. * @TODO Move this to the scanner.
* *
* @return int * @return int The column number.
* The column number.
*/ */
public function columnOffset(); public function columnOffset();
@ -51,8 +51,7 @@ interface InputStream extends \Iterator {
* Bytes to match. * Bytes to match.
* @param int $max * @param int $max
* Maximum number of bytes to scan. * Maximum number of bytes to scan.
* @return mixed * @return mixed Index or false if no match is found. You should use strong
* Index or FALSE if no match is found. You should use strong
* equality when checking the result, since index could be 0. * equality when checking the result, since index could be 0.
*/ */
public function charsUntil($bytes, $max = null); public function charsUntil($bytes, $max = null);

View File

@ -1,8 +1,9 @@
<?php <?php
namespace HTML5\Parser; namespace Masterminds\HTML5\Parser;
/** /**
* Emit when the parser has an error. * Emit when the parser has an error.
*/ */
class ParseError extends \Exception { class ParseError extends \Exception
{
} }

View File

@ -1,48 +1,53 @@
<?php <?php
namespace HTML5\Parser; namespace Masterminds\HTML5\Parser;
/** /**
* The scanner. * The scanner.
* *
* This scans over an input stream. * This scans over an input stream.
*/ */
class Scanner { class Scanner
{
const CHARS_HEX = 'abcdefABCDEF01234567890'; const CHARS_HEX = 'abcdefABCDEF01234567890';
const CHARS_ALNUM = 'abcdefAghijklmnopqrstuvwxyABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890'; const CHARS_ALNUM = 'abcdefAghijklmnopqrstuvwxyABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890';
const CHARS_ALPHA = 'abcdefAghijklmnopqrstuvwxyABCDEFGHIJKLMNOPQRSTUVWXYZ'; const CHARS_ALPHA = 'abcdefAghijklmnopqrstuvwxyABCDEFGHIJKLMNOPQRSTUVWXYZ';
protected $is; protected $is;
// Flipping this to TRUE will give minisculely more debugging info. // Flipping this to true will give minisculely more debugging info.
public $debug = FALSE; public $debug = false;
/** /**
* Create a new Scanner. * Create a new Scanner.
* *
* @param \HTML5\Parser\InputStream $input * @param \Masterminds\HTML5\Parser\InputStream $input
* An InputStream to be scanned. * An InputStream to be scanned.
*/ */
public function __construct($input) { public function __construct($input)
{
$this->is = $input; $this->is = $input;
} }
/** /**
* Get the current position. * Get the current position.
* *
* @return int * @return int The current integer byte position.
* The current integer byte position.
*/ */
public function position() { public function position()
{
return $this->is->key(); return $this->is->key();
} }
/** /**
* Take a peek at the next character in the data. * Take a peek at the next character in the data.
* *
* @return string * @return string The next character.
* The next character.
*/ */
public function peek() { public function peek()
{
return $this->is->peek(); return $this->is->peek();
} }
@ -51,16 +56,18 @@ class Scanner {
* *
* Note: This advances the pointer. * Note: This advances the pointer.
* *
* @return string * @return string The next character.
* The next character.
*/ */
public function next() { public function next()
{
$this->is->next(); $this->is->next();
if ($this->is->valid()) { if ($this->is->valid()) {
if ($this->debug) fprintf(STDOUT, "> %s\n", $this->is->current()); if ($this->debug)
fprintf(STDOUT, "> %s\n", $this->is->current());
return $this->is->current(); return $this->is->current();
} }
return FALSE;
return false;
} }
/** /**
@ -68,32 +75,36 @@ class Scanner {
* *
* Note, this does not advance the pointer. * Note, this does not advance the pointer.
* *
* @return string * @return string The current character.
* The current character.
*/ */
public function current() { public function current()
{
if ($this->is->valid()) { if ($this->is->valid()) {
return $this->is->current(); return $this->is->current();
} }
return FALSE;
return false;
} }
/** /**
* Silently consume N chars. * Silently consume N chars.
*/ */
public function consume($count = 1) { public function consume($count = 1)
{
for ($i = 0; $i < $count; ++ $i) { for ($i = 0; $i < $count; ++ $i) {
$this->next(); $this->next();
} }
} }
/** /**
* Unconsume some of the data. This moves the data pointer backwards. * Unconsume some of the data.
* This moves the data pointer backwards.
* *
* @param int $howMany * @param int $howMany
* The number of characters to move the pointer back. * The number of characters to move the pointer back.
*/ */
public function unconsume($howMany = 1) { public function unconsume($howMany = 1)
{
$this->is->unconsume($howMany); $this->is->unconsume($howMany);
} }
@ -103,10 +114,10 @@ class Scanner {
* Note, along with getting the characters the pointer in the data will be * Note, along with getting the characters the pointer in the data will be
* moved as well. * moved as well.
* *
* @return string * @return string The next group that is hex characters.
* The next group that is hex characters.
*/ */
public function getHex() { public function getHex()
{
return $this->is->charsWhile(static::CHARS_HEX); return $this->is->charsWhile(static::CHARS_HEX);
} }
@ -116,10 +127,10 @@ class Scanner {
* Note, along with getting the characters the pointer in the data will be * Note, along with getting the characters the pointer in the data will be
* moved as well. * moved as well.
* *
* @return string * @return string The next group of ASCII alpha characters.
* The next group of ASCII alpha characters.
*/ */
public function getAsciiAlpha() { public function getAsciiAlpha()
{
return $this->is->charsWhile(static::CHARS_ALPHA); return $this->is->charsWhile(static::CHARS_ALPHA);
} }
@ -129,10 +140,10 @@ class Scanner {
* Note, along with getting the characters the pointer in the data will be * Note, along with getting the characters the pointer in the data will be
* moved as well. * moved as well.
* *
* @return string * @return string The next group of ASCII alpha characters and numbers.
* The next group of ASCII alpha characters and numbers.
*/ */
public function getAsciiAlphaNum() { public function getAsciiAlphaNum()
{
return $this->is->charsWhile(static::CHARS_ALNUM); return $this->is->charsWhile(static::CHARS_ALNUM);
} }
@ -142,10 +153,10 @@ class Scanner {
* Note, along with getting the characters the pointer in the data will be * Note, along with getting the characters the pointer in the data will be
* moved as well. * moved as well.
* *
* @return string * @return string The next group of numbers.
* The next group of numbers.
*/ */
public function getNumeric() { public function getNumeric()
{
return $this->is->charsWhile('0123456789'); return $this->is->charsWhile('0123456789');
} }
@ -154,30 +165,34 @@ class Scanner {
* *
* Whitespace in HTML5 is: formfeed, tab, newline, space. * Whitespace in HTML5 is: formfeed, tab, newline, space.
*/ */
public function whitespace() { public function whitespace()
{
return $this->is->charsWhile("\n\t\f "); return $this->is->charsWhile("\n\t\f ");
} }
/** /**
* Returns the current line that is being consumed. * Returns the current line that is being consumed.
* *
* @return int * @return int The current line number.
* The current line number.
*/ */
public function currentLine() { public function currentLine()
{
return $this->is->currentLine(); return $this->is->currentLine();
} }
/** /**
* Read chars until something in the mask is encountered. * Read chars until something in the mask is encountered.
*/ */
public function charsUntil($mask) { public function charsUntil($mask)
{
return $this->is->charsUntil($mask); return $this->is->charsUntil($mask);
} }
/** /**
* Read chars as long as the mask matches. * Read chars as long as the mask matches.
*/ */
public function charsWhile($mask) { public function charsWhile($mask)
{
return $this->is->charsWhile($mask); return $this->is->charsWhile($mask);
} }
@ -186,10 +201,10 @@ class Scanner {
* *
* Newlines are column 0. The first char after a newline is column 1. * Newlines are column 0. The first char after a newline is column 1.
* *
* @return int * @return int The column number.
* The column number.
*/ */
public function columnOffset() { public function columnOffset()
{
return $this->is->columnOffset(); return $this->is->columnOffset();
} }
@ -198,10 +213,10 @@ class Scanner {
* *
* This consumes characters until the EOF. * This consumes characters until the EOF.
* *
* @return int * @return int The number of characters remaining.
* The number of characters remaining.
*/ */
public function remainingChars() { public function remainingChars()
{
return $this->is->remainingChars(); return $this->is->remainingChars();
} }
} }

View File

@ -2,7 +2,7 @@
/** /**
* Loads a string to be parsed. * Loads a string to be parsed.
*/ */
namespace HTML5\Parser; namespace Masterminds\HTML5\Parser;
/* /*
* *
@ -39,7 +39,9 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
// //
// - // indicates regular comments // - // indicates regular comments
class StringInputStream implements InputStream { class StringInputStream implements InputStream
{
/** /**
* The string data we're parsing. * The string data we're parsing.
*/ */
@ -63,12 +65,14 @@ class StringInputStream implements InputStream {
/** /**
* Create a new InputStream wrapper. * Create a new InputStream wrapper.
* *
* @param $data Data to parse * @param $data Data
* to parse
*/ */
public function __construct($data, $encoding = 'UTF-8', $debug = '') { public function __construct($data, $encoding = 'UTF-8', $debug = '')
{
$data = UTF8Utils::convertToUTF8($data, $encoding); $data = UTF8Utils::convertToUTF8($data, $encoding);
if ($debug) fprintf(STDOUT, $debug, $data, strlen($data)); if ($debug)
fprintf(STDOUT, $debug, $data, strlen($data));
// There is good reason to question whether it makes sense to // There is good reason to question whether it makes sense to
// do this here, since most of these checks are done during // do this here, since most of these checks are done during
@ -88,26 +92,25 @@ class StringInputStream implements InputStream {
/** /**
* Replace linefeed characters according to the spec. * Replace linefeed characters according to the spec.
*/ */
protected function replaceLinefeeds($data) { protected function replaceLinefeeds($data)
/* U+000D CARRIAGE RETURN (CR) characters and U+000A LINE FEED {
(LF) characters are treated specially. Any CR characters /*
that are followed by LF characters must be removed, and any * U+000D CARRIAGE RETURN (CR) characters and U+000A LINE FEED (LF) characters are treated specially. Any CR characters that are followed by LF characters must be removed, and any CR characters not followed by LF characters must be converted to LF characters. Thus, newlines in HTML DOMs are represented by LF characters, and there are never any CR characters in the input to the tokenization stage.
CR characters not followed by LF characters must be converted */
to LF characters. Thus, newlines in HTML DOMs are represented
by LF characters, and there are never any CR characters in the
input to the tokenization stage. */
$crlfTable = array( $crlfTable = array(
"\0" => "\xEF\xBF\xBD", "\0" => "\xEF\xBF\xBD",
"\r\n" => "\n", "\r\n" => "\n",
"\r" => "\n", "\r" => "\n"
); );
return strtr($data, $crlfTable); return strtr($data, $crlfTable);
} }
/** /**
* Returns the current line that the tokenizer is at. * Returns the current line that the tokenizer is at.
*/ */
public function currentLine() { public function currentLine()
{
if (empty($this->EOF) || $this->char == 0) { if (empty($this->EOF) || $this->char == 0) {
return 1; return 1;
} }
@ -117,9 +120,12 @@ class StringInputStream implements InputStream {
} }
/** /**
*
* @deprecated * @deprecated
*
*/ */
public function getCurrentLine() { public function getCurrentLine()
{
return currentLine(); return currentLine();
} }
@ -128,11 +134,10 @@ class StringInputStream implements InputStream {
* *
* Newlines are column 0. The first char after a newline is column 1. * Newlines are column 0. The first char after a newline is column 1.
* *
* @return int * @return int The column number.
* The column number.
*/ */
public function columnOffset() { public function columnOffset()
{
// Short circuit for the first char. // Short circuit for the first char.
if ($this->char == 0) { if ($this->char == 0) {
return 0; return 0;
@ -147,10 +152,9 @@ class StringInputStream implements InputStream {
// However, for here we want the length up until the next byte to be // However, for here we want the length up until the next byte to be
// processed, so add one to the current byte ($this->char). // processed, so add one to the current byte ($this->char).
if ($lastLine !== FALSE) { if ($lastLine !== false) {
$findLengthOf = substr($this->data, $lastLine + 1, $this->char - 1 - $lastLine); $findLengthOf = substr($this->data, $lastLine + 1, $this->char - 1 - $lastLine);
} } else {
else {
// After a newline. // After a newline.
$findLengthOf = substr($this->data, 0, $this->char); $findLengthOf = substr($this->data, 0, $this->char);
} }
@ -159,48 +163,54 @@ class StringInputStream implements InputStream {
} }
/** /**
*
* @deprecated * @deprecated
*
*/ */
public function getColumnOffset() { public function getColumnOffset()
{
return $this->columnOffset(); return $this->columnOffset();
} }
/** /**
* Get the current character. * Get the current character.
* *
* @return string * @return string The current character.
* The current character.
*/ */
public function current() { public function current()
{
return $this->data[$this->char]; return $this->data[$this->char];
} }
/** /**
* Advance the pointer. This is part of the Iterator interface. * Advance the pointer.
* This is part of the Iterator interface.
*/ */
public function next() { public function next()
{
$this->char ++; $this->char ++;
} }
/** /**
* Rewind to the start of the string. * Rewind to the start of the string.
*/ */
public function rewind() { public function rewind()
{
$this->char = 0; $this->char = 0;
} }
/** /**
* Is the current pointer location valid. * Is the current pointer location valid.
* *
* @return bool * @return bool Is the current pointer location valid.
* Is the current pointer location valid.
*/ */
public function valid() { public function valid()
{
if ($this->char < $this->EOF) { if ($this->char < $this->EOF) {
return TRUE; return true;
} }
return FALSE; return false;
} }
/** /**
@ -211,17 +221,19 @@ class StringInputStream implements InputStream {
* *
* @note This performs bounds checking * @note This performs bounds checking
* *
* @return string * @return string Returns the remaining text. If called when the InputStream is
* Returns the remaining text. If called when the InputStream is
* already exhausted, it returns an empty string. * already exhausted, it returns an empty string.
*/ */
public function remainingChars() { public function remainingChars()
{
if ($this->char < $this->EOF) { if ($this->char < $this->EOF) {
$data = substr($this->data, $this->char); $data = substr($this->data, $this->char);
$this->char = $this->EOF; $this->char = $this->EOF;
return $data; return $data;
} }
return '';//FALSE;
return ''; // false;
} }
/** /**
@ -236,24 +248,24 @@ class StringInputStream implements InputStream {
* Bytes to match. * Bytes to match.
* @param int $max * @param int $max
* Maximum number of bytes to scan. * Maximum number of bytes to scan.
* @return mixed * @return mixed Index or false if no match is found. You should use strong
* Index or FALSE if no match is found. You should use strong
* equality when checking the result, since index could be 0. * equality when checking the result, since index could be 0.
*/ */
public function charsUntil($bytes, $max = null) { public function charsUntil($bytes, $max = null)
{
if ($this->char >= $this->EOF) { if ($this->char >= $this->EOF) {
return FALSE; return false;
} }
if ($max === 0 || $max) { if ($max === 0 || $max) {
$len = strcspn($this->data, $bytes, $this->char, $max); $len = strcspn($this->data, $bytes, $this->char, $max);
} } else {
else {
$len = strcspn($this->data, $bytes, $this->char); $len = strcspn($this->data, $bytes, $this->char);
} }
$string = (string) substr($this->data, $this->char, $len); $string = (string) substr($this->data, $this->char, $len);
$this->char += $len; $this->char += $len;
return $string; return $string;
} }
@ -270,19 +282,20 @@ class StringInputStream implements InputStream {
* @param int $max * @param int $max
* The max number of chars to read. * The max number of chars to read.
*/ */
public function charsWhile($bytes, $max = null) { public function charsWhile($bytes, $max = null)
{
if ($this->char >= $this->EOF) { if ($this->char >= $this->EOF) {
return FALSE; return false;
} }
if ($max === 0 || $max) { if ($max === 0 || $max) {
$len = strspn($this->data, $bytes, $this->char, $max); $len = strspn($this->data, $bytes, $this->char, $max);
} } else {
else {
$len = strspn($this->data, $bytes, $this->char); $len = strspn($this->data, $bytes, $this->char);
} }
$string = (string) substr($this->data, $this->char, $len); $string = (string) substr($this->data, $this->char, $len);
$this->char += $len; $this->char += $len;
return $string; return $string;
} }
@ -292,7 +305,8 @@ class StringInputStream implements InputStream {
* @param int $howMany * @param int $howMany
* The number of characters to unconsume. * The number of characters to unconsume.
*/ */
public function unconsume($howMany = 1) { public function unconsume($howMany = 1)
{
if (($this->char - $howMany) >= 0) { if (($this->char - $howMany) >= 0) {
$this->char = $this->char - $howMany; $this->char = $this->char - $howMany;
} }
@ -301,15 +315,17 @@ class StringInputStream implements InputStream {
/** /**
* Look ahead without moving cursor. * Look ahead without moving cursor.
*/ */
public function peek() { public function peek()
{
if (($this->char + 1) <= $this->EOF) { if (($this->char + 1) <= $this->EOF) {
return $this->data[$this->char + 1]; return $this->data[$this->char + 1];
} }
return FALSE; return false;
} }
public function key() { public function key()
{
return $this->char; return $this->char;
} }
} }

File diff suppressed because it is too large Load Diff

View File

@ -1,7 +1,5 @@
<?php <?php
namespace HTML5\Parser; namespace Masterminds\HTML5\Parser;
use HTML5\Elements;
/** /**
* Handles special-case rules for the DOM tree builder. * Handles special-case rules for the DOM tree builder.
@ -11,11 +9,11 @@ use HTML5\Elements;
* *
* See section 8.1.2.4 of the spec. * See section 8.1.2.4 of the spec.
* *
* @todo * @todo - colgroup and col special behaviors
* - colgroup and col special behaviors
* - body and head special behaviors * - body and head special behaviors
*/ */
class TreeBuildingRules { class TreeBuildingRules
{
protected static $tags = array( protected static $tags = array(
'li' => 1, 'li' => 1,
@ -31,7 +29,7 @@ class TreeBuildingRules {
'tbody' => 1, 'tbody' => 1,
'table' => 1, 'table' => 1,
'optgroup' => 1, 'optgroup' => 1,
'option' => 1, 'option' => 1
); );
/** /**
@ -40,14 +38,16 @@ class TreeBuildingRules {
* @param \DOMDocument $doc * @param \DOMDocument $doc
* The DOM document to use for evaluation and modification. * The DOM document to use for evaluation and modification.
*/ */
public function __construct($doc) { public function __construct($doc)
{
$this->doc = $doc; $this->doc = $doc;
} }
/** /**
* Returns TRUE if the given tagname has special processing rules. * Returns true if the given tagname has special processing rules.
*/ */
public function hasRules($tagname) { public function hasRules($tagname)
{
return isset(static::$tags[$tagname]); return isset(static::$tags[$tagname]);
} }
@ -56,11 +56,10 @@ class TreeBuildingRules {
* *
* This may modify the existing DOM. * This may modify the existing DOM.
* *
* @return \DOMElement * @return \DOMElement The new Current DOM element.
* The new Current DOM element.
*/ */
public function evaluate($new, $current) { public function evaluate($new, $current)
{
switch ($new->tagName) { switch ($new->tagName) {
case 'li': case 'li':
return $this->handleLI($new, $current); return $this->handleLI($new, $current);
@ -71,44 +70,71 @@ class TreeBuildingRules {
case 'rp': case 'rp':
return $this->handleRT($new, $current); return $this->handleRT($new, $current);
case 'optgroup': case 'optgroup':
return $this->closeIfCurrentMatches($new, $current, array('optgroup')); return $this->closeIfCurrentMatches($new, $current, array(
'optgroup'
));
case 'option': case 'option':
return $this->closeIfCurrentMatches($new, $current, array('option', 'optgroup')); return $this->closeIfCurrentMatches($new, $current, array(
'option',
'optgroup'
));
case 'tr': case 'tr':
return $this->closeIfCurrentMatches($new, $current, array('tr')); return $this->closeIfCurrentMatches($new, $current, array(
'tr'
));
case 'td': case 'td':
case 'th': case 'th':
return $this->closeIfCurrentMatches($new, $current, array('th', 'td')); return $this->closeIfCurrentMatches($new, $current, array(
'th',
'td'
));
case 'tbody': case 'tbody':
case 'thead': case 'thead':
case 'tfoot': case 'tfoot':
case 'table': // Spec isn't explicit about this, but it's necessary. case 'table': // Spec isn't explicit about this, but it's necessary.
return $this->closeIfCurrentMatches($new, $current, array('thead', 'tfoot', 'tbody'));
return $this->closeIfCurrentMatches($new, $current, array(
'thead',
'tfoot',
'tbody'
));
} }
return $current; return $current;
} }
protected function handleLI($ele, $current) { protected function handleLI($ele, $current)
return $this->closeIfCurrentMatches($ele, $current, array('li')); {
return $this->closeIfCurrentMatches($ele, $current, array(
'li'
));
} }
protected function handleDT($ele, $current) { protected function handleDT($ele, $current)
return $this->closeIfCurrentMatches($ele, $current, array('dt','dd')); {
} return $this->closeIfCurrentMatches($ele, $current, array(
protected function handleRT($ele, $current) { 'dt',
return $this->closeIfCurrentMatches($ele, $current, array('rt','rp')); 'dd'
));
} }
protected function closeIfCurrentMatches($ele, $current, $match) { protected function handleRT($ele, $current)
{
return $this->closeIfCurrentMatches($ele, $current, array(
'rt',
'rp'
));
}
protected function closeIfCurrentMatches($ele, $current, $match)
{
$tname = $current->tagName; $tname = $current->tagName;
if (in_array($current->tagName, $match)) { if (in_array($current->tagName, $match)) {
$current->parentNode->appendChild($ele); $current->parentNode->appendChild($ele);
} } else {
else {
$current->appendChild($ele); $current->appendChild($ele);
} }
return $ele;
return $ele;
} }
} }

View File

@ -1,4 +1,5 @@
<?php <?php
namespace Masterminds\HTML5\Parser;
/* /*
* *
* Portions based on code from html5lib files with the following copyright: * Portions based on code from html5lib files with the following copyright:
@ -25,15 +26,17 @@ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/ */
namespace HTML5\Parser;
/** /**
* UTF-8 Utilities * UTF-8 Utilities
*/ */
class UTF8Utils { class UTF8Utils
{
/** /**
* The Unicode replacement character. * The Unicode replacement character.
*/ */
const FFFD = "\xEF\xBF\xBD"; const FFFD = "\xEF\xBF\xBD";
/** /**
* Count the number of characters in a string. * Count the number of characters in a string.
* *
@ -42,15 +45,14 @@ class UTF8Utils {
* *
* @todo Move this to a general utility class. * @todo Move this to a general utility class.
*/ */
public static function countChars($string) { public static function countChars($string)
{
// Get the length for the string we need. // Get the length for the string we need.
if (function_exists('iconv_strlen')) { if (function_exists('iconv_strlen')) {
return iconv_strlen($string, 'utf-8'); return iconv_strlen($string, 'utf-8');
} } elseif (function_exists('mb_strlen')) {
elseif(function_exists('mb_strlen')) {
return mb_strlen($string, 'utf-8'); return mb_strlen($string, 'utf-8');
} } elseif (function_exists('utf8_decode')) {
elseif(function_exists('utf8_decode')) {
// MPB: Will this work? Won't certain decodes lead to two chars // MPB: Will this work? Won't certain decodes lead to two chars
// extrapolated out of 2-byte chars? // extrapolated out of 2-byte chars?
return strlen(utf8_decode($string)); return strlen(utf8_decode($string));
@ -58,8 +60,7 @@ class UTF8Utils {
$count = count_chars($string); $count = count_chars($string);
// 0x80 = 0x7F - 0 + 1 (one added to get inclusive range) // 0x80 = 0x7F - 0 + 1 (one added to get inclusive range)
// 0x33 = 0xF4 - 0x2C + 1 (one added to get inclusive range) // 0x33 = 0xF4 - 0x2C + 1 (one added to get inclusive range)
return array_sum(array_slice($count, 0, 0x80)) + return array_sum(array_slice($count, 0, 0x80)) + array_sum(array_slice($count, 0xC2, 0x33));
array_sum(array_slice($count, 0xC2, 0x33));
} }
/** /**
@ -73,18 +74,11 @@ class UTF8Utils {
* @param string $encoding * @param string $encoding
* A valid encoding. Examples: http://www.php.net/manual/en/mbstring.supported-encodings.php * A valid encoding. Examples: http://www.php.net/manual/en/mbstring.supported-encodings.php
*/ */
public static function convertToUTF8($data, $encoding = 'UTF-8') { public static function convertToUTF8($data, $encoding = 'UTF-8')
{
/* /*
* From the HTML5 spec: * From the HTML5 spec: Given an encoding, the bytes in the input stream must be converted to Unicode characters for the tokeniser, as described by the rules for that encoding, except that the leading U+FEFF BYTE ORDER MARK character, if any, must not be stripped by the encoding layer (it is stripped by the rule below). Bytes or sequences of bytes in the original byte stream that could not be converted to Unicode characters must be converted to U+FFFD REPLACEMENT CHARACTER code points.
Given an encoding, the bytes in the input stream must be */
converted to Unicode characters for the tokeniser, as
described by the rules for that encoding, except that the
leading U+FEFF BYTE ORDER MARK character, if any, must not
be stripped by the encoding layer (it is stripped by the rule below).
Bytes or sequences of bytes in the original byte stream that
could not be converted to Unicode characters must be converted
to U+FFFD REPLACEMENT CHARACTER code points. */
// mb_convert_encoding is chosen over iconv because of a bug. The best // mb_convert_encoding is chosen over iconv because of a bug. The best
// details for the bug are on http://us1.php.net/manual/en/function.iconv.php#108643 // details for the bug are on http://us1.php.net/manual/en/function.iconv.php#108643
@ -92,7 +86,7 @@ class UTF8Utils {
// details. // details.
if (function_exists('mb_convert_encoding')) { if (function_exists('mb_convert_encoding')) {
// mb library has the following behaviors: // mb library has the following behaviors:
// - UTF-16 surrogates result in FALSE. // - UTF-16 surrogates result in false.
// - Overlongs and outside Plane 16 result in empty strings. // - Overlongs and outside Plane 16 result in empty strings.
// Before we run mb_convert_encoding we need to tell it what to do with // Before we run mb_convert_encoding we need to tell it what to do with
@ -104,8 +98,7 @@ class UTF8Utils {
ini_set('mbstring.substitute_character', "none"); ini_set('mbstring.substitute_character', "none");
$data = mb_convert_encoding($data, 'UTF-8', $encoding); $data = mb_convert_encoding($data, 'UTF-8', $encoding);
ini_set('mbstring.substitute_character', $save); ini_set('mbstring.substitute_character', $save);
} } // @todo Get iconv running in at least some environments if that is possible.
// @todo Get iconv running in at least some environments if that is possible.
elseif (function_exists('iconv') && $encoding != 'auto') { elseif (function_exists('iconv') && $encoding != 'auto') {
// fprintf(STDOUT, "iconv found\n"); // fprintf(STDOUT, "iconv found\n");
// iconv has the following behaviors: // iconv has the following behaviors:
@ -113,14 +106,14 @@ class UTF8Utils {
// - Beyond Plane 16 is replaced with a lower char. // - Beyond Plane 16 is replaced with a lower char.
// - Incomplete sequences generate a warning. // - Incomplete sequences generate a warning.
$data = @iconv($encoding, 'UTF-8//IGNORE', $data); $data = @iconv($encoding, 'UTF-8//IGNORE', $data);
} } else {
else {
// we can make a conforming native implementation // we can make a conforming native implementation
throw new Exception('Not implemented, please install mbstring or iconv'); throw new Exception('Not implemented, please install mbstring or iconv');
} }
/* One leading U+FEFF BYTE ORDER MARK character must be /*
ignored if any are present. */ * One leading U+FEFF BYTE ORDER MARK character must be ignored if any are present.
*/
if (substr($data, 0, 3) === "\xEF\xBB\xBF") { if (substr($data, 0, 3) === "\xEF\xBB\xBF") {
$data = substr($data, 3); $data = substr($data, 3);
} }
@ -133,10 +126,10 @@ class UTF8Utils {
* *
* @param string $data * @param string $data
* A string to analyze. * A string to analyze.
* @return array * @return array An array of (string) error messages produced by the scanning.
* An array of (string) error messages produced by the scanning.
*/ */
public static function checkForIllegalCodepoints($data) { public static function checkForIllegalCodepoints($data)
{
if (! function_exists('preg_match_all')) { if (! function_exists('preg_match_all')) {
throw\Exception('The PCRE library is not loaded or is not available.'); throw\Exception('The PCRE library is not loaded or is not available.');
} }
@ -144,23 +137,16 @@ class UTF8Utils {
// Vestigial error handling. // Vestigial error handling.
$errors = array(); $errors = array();
/* All U+0000 NULL characters in the input must be replaced /*
by U+FFFD REPLACEMENT CHARACTERs. Any occurrences of such * All U+0000 null characters in the input must be replaced by U+FFFD REPLACEMENT CHARACTERs. Any occurrences of such characters is a parse error.
characters is a parse error. */ */
for ($i = 0, $count = substr_count($data, "\0"); $i < $count; $i ++) { for ($i = 0, $count = substr_count($data, "\0"); $i < $count; $i ++) {
$errors[] = 'null-character'; $errors[] = 'null-character';
} }
/* Any occurrences of any characters in the ranges U+0001 to /*
U+0008, U+000B, U+000E to U+001F, U+007F to U+009F, * Any occurrences of any characters in the ranges U+0001 to U+0008, U+000B, U+000E to U+001F, U+007F to U+009F, U+D800 to U+DFFF , U+FDD0 to U+FDEF, and characters U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF, U+3FFFE, U+3FFFF, U+4FFFE, U+4FFFF, U+5FFFE, U+5FFFF, U+6FFFE, U+6FFFF, U+7FFFE, U+7FFFF, U+8FFFE, U+8FFFF, U+9FFFE, U+9FFFF, U+AFFFE, U+AFFFF, U+BFFFE, U+BFFFF, U+CFFFE, U+CFFFF, U+DFFFE, U+DFFFF, U+EFFFE, U+EFFFF, U+FFFFE, U+FFFFF, U+10FFFE, and U+10FFFF are parse errors. (These are all control characters or permanently undefined Unicode characters.)
U+D800 to U+DFFF , U+FDD0 to U+FDEF, and */
characters U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF,
U+3FFFE, U+3FFFF, U+4FFFE, U+4FFFF, U+5FFFE, U+5FFFF, U+6FFFE,
U+6FFFF, U+7FFFE, U+7FFFF, U+8FFFE, U+8FFFF, U+9FFFE, U+9FFFF,
U+AFFFE, U+AFFFF, U+BFFFE, U+BFFFF, U+CFFFE, U+CFFFF, U+DFFFE,
U+DFFFF, U+EFFFE, U+EFFFF, U+FFFFE, U+FFFFF, U+10FFFE, and
U+10FFFF are parse errors. (These are all control characters
or permanently undefined Unicode characters.) */
// Check PCRE is loaded. // Check PCRE is loaded.
$count = preg_match_all( $count = preg_match_all(
'/(?: '/(?:
@ -175,13 +161,11 @@ class UTF8Utils {
\xEF\xBF[\xBE\xBF] # U+FFFE and U+FFFF \xEF\xBF[\xBE\xBF] # U+FFFE and U+FFFF
| |
[\xF0-\xF4][\x8F-\xBF]\xBF[\xBE\xBF] # U+nFFFE and U+nFFFF (1 <= n <= 10_{16}) [\xF0-\xF4][\x8F-\xBF]\xBF[\xBE\xBF] # U+nFFFE and U+nFFFF (1 <= n <= 10_{16})
)/x', )/x', $data, $matches);
$data,
$matches
);
for ($i = 0; $i < $count; $i ++) { for ($i = 0; $i < $count; $i ++) {
$errors[] = 'invalid-codepoint'; $errors[] = 'invalid-codepoint';
} }
return $errors; return $errors;
} }
} }

View File

@ -3,18 +3,21 @@
* @file * @file
* This contains HTML5 entities to use with serializing. * This contains HTML5 entities to use with serializing.
* *
* The list here is mildly different from the list at \HTML5\Entities because * The list here is mildly different from the list at \Masterminds\HTML5\Entities because
* that list was generated from the w3c. It contains some entities that are * that list was generated from the w3c. It contains some entities that are
* not entirely proper such as &am; which maps to &. This list is meant to be * not entirely proper such as &am; which maps to &. This list is meant to be
* a fallback for PHP versions prior to PHP 5.4 when dealing with encoding. * a fallback for PHP versions prior to PHP 5.4 when dealing with encoding.
*/ */
namespace HTML5\Serializer; namespace Masterminds\HTML5\Serializer;
/** /**
* A mapping of entities to their html5 representation. Used for older PHP * A mapping of entities to their html5 representation.
* Used for older PHP
* versions that don't have the mapping. * versions that don't have the mapping.
*/ */
class HTML5Entities { class HTML5Entities
{
public static $map = array( public static $map = array(
' ' => '&Tab;', ' ' => '&Tab;',
"\n" => '&NewLine;', "\n" => '&NewLine;',
@ -1525,6 +1528,6 @@ class HTML5Entities {
'𝕨' => '&wopf;', '𝕨' => '&wopf;',
'𝕩' => '&xopf;', '𝕩' => '&xopf;',
'𝕪' => '&yopf;', '𝕪' => '&yopf;',
'𝕫' => '&zopf;', '𝕫' => '&zopf;'
); );
} }

View File

@ -6,54 +6,127 @@
* These output rules are likely to generate output similar to the document that * These output rules are likely to generate output similar to the document that
* was parsed. It is not intended to output exactly the document that was parsed. * was parsed. It is not intended to output exactly the document that was parsed.
*/ */
namespace HTML5\Serializer; namespace Masterminds\HTML5\Serializer;
use \HTML5\Elements; use Masterminds\HTML5\Elements;
/** /**
* Generate the output html5 based on element rules. * Generate the output html5 based on element rules.
*/ */
class OutputRules implements \HTML5\Serializer\RulesInterface { class OutputRules implements \Masterminds\HTML5\Serializer\RulesInterface
{
/**
* Defined in http://www.w3.org/TR/html51/infrastructure.html#html-namespace-0
*/
const NAMESPACE_HTML = 'http://www.w3.org/1999/xhtml';
const NAMESPACE_MATHML = 'http://www.w3.org/1998/Math/MathML';
const NAMESPACE_SVG = 'http://www.w3.org/2000/svg';
const NAMESPACE_XLINK = 'http://www.w3.org/1999/xlink';
const NAMESPACE_XML = 'http://www.w3.org/XML/1998/namespace';
const NAMESPACE_XMLNS = 'http://www.w3.org/2000/xmlns/';
/**
* Holds the namespace URIs that are implicit in HTML5 and are therefore not written out as explicit namespace attributes.
*
* @var array
*/
protected $implicitNamespaces = array(
self::NAMESPACE_HTML,
self::NAMESPACE_SVG,
self::NAMESPACE_MATHML,
self::NAMESPACE_XML,
self::NAMESPACE_XMLNS,
);
const IM_IN_HTML = 1; const IM_IN_HTML = 1;
const IM_IN_SVG = 2; const IM_IN_SVG = 2;
const IM_IN_MATHML = 3; const IM_IN_MATHML = 3;
/**
* Caches whether the ENT_HTML5 constant is available (treated as unavailable on HHVM).
* @var boolean
*/
private $hasHTML5 = false;
protected $traverser; protected $traverser;
protected $encode = FALSE;
protected $encode = false;
protected $out; protected $out;
protected $outputMode; protected $outputMode;
private $xpath;
protected $nonBooleanAttributes = array(
/*
array(
'nodeNamespace'=>'http://www.w3.org/1999/xhtml',
'attrNamespace'=>'http://www.w3.org/1999/xhtml',
'nodeName'=>'img', 'nodeName'=>array('img', 'a'),
'attrName'=>'alt', 'attrName'=>array('title', 'alt'),
'prefixes'=>array('xh'=>'http://www.w3.org/1999/xhtml'),
'xpath' => "@checked[../../xh:input[@type='radio' or @type='checkbox']]",
),
*/
array(
'nodeNamespace'=>'http://www.w3.org/1999/xhtml',
'attrName'=>array('alt', 'title'),
),
);
const DOCTYPE = '<!DOCTYPE html>'; const DOCTYPE = '<!DOCTYPE html>';
public function __construct($output, $options = array()) { public function __construct($output, $options = array())
{
if (isset($options['encode_entities'])) { if (isset($options['encode_entities'])) {
$this->encode = $options['encode_entities']; $this->encode = $options['encode_entities'];
} }
$this->outputMode = static::IM_IN_HTML; $this->outputMode = static::IM_IN_HTML;
$this->out = $output; $this->out = $output;
// If HHVM, see https://github.com/facebook/hhvm/issues/2727
$this->hasHTML5 = defined('ENT_HTML5') && !defined('HHVM_VERSION');
}
public function addRule(array $rule)
{
$this->nonBooleanAttributes[] = $rule;
} }
public function setTraverser(\HTML5\Serializer\Traverser $traverser) { public function setTraverser(\Masterminds\HTML5\Serializer\Traverser $traverser)
{
$this->traverser = $traverser; $this->traverser = $traverser;
return $this; return $this;
} }
public function document($dom) { public function document($dom)
{
$this->doctype(); $this->doctype();
$this->traverser->node($dom->documentElement); $this->traverser->node($dom->documentElement);
$this->nl(); $this->nl();
} }
protected function doctype() { protected function doctype()
{
$this->wr(static::DOCTYPE); $this->wr(static::DOCTYPE);
$this->nl(); $this->nl();
} }
public function element($ele) { public function element($ele)
{
$name = $ele->tagName; $name = $ele->tagName;
// Per spec: // Per spec:
@ -68,13 +141,16 @@ class OutputRules implements \HTML5\Serializer\RulesInterface {
if ($name == 'svg') { if ($name == 'svg') {
$this->outputMode = static::IM_IN_SVG; $this->outputMode = static::IM_IN_SVG;
$name = Elements::normalizeSvgElement($name); $name = Elements::normalizeSvgElement($name);
} } elseif ($name == 'math') {
elseif ($name == 'math') {
$this->outputMode = static::IM_IN_MATHML; $this->outputMode = static::IM_IN_MATHML;
} }
$this->openTag($ele); $this->openTag($ele);
if (Elements::isA($name, Elements::TEXT_RAW)) {
foreach ($ele->childNodes as $child) {
$this->wr($child->data);
}
} else {
// Handle children. // Handle children.
if ($ele->hasChildNodes()) { if ($ele->hasChildNodes()) {
$this->traverser->children($ele->childNodes); $this->traverser->children($ele->childNodes);
@ -84,6 +160,7 @@ class OutputRules implements \HTML5\Serializer\RulesInterface {
if ($name == 'svg' || $name == 'math') { if ($name == 'svg' || $name == 'math') {
$this->outputMode = static::IM_IN_HTML; $this->outputMode = static::IM_IN_HTML;
} }
}
// If not unary, add a closing tag. // If not unary, add a closing tag.
if (! Elements::isA($name, Elements::VOID_TAG)) { if (! Elements::isA($name, Elements::VOID_TAG)) {
@ -97,30 +174,56 @@ class OutputRules implements \HTML5\Serializer\RulesInterface {
* @param \DOMText $ele * @param \DOMText $ele
* The text node to write. * The text node to write.
*/ */
public function text($ele) { public function text($ele)
if (isset($ele->parentNode) && isset($ele->parentNode->tagName) && Elements::isA($ele->parentNode->tagName, Elements::TEXT_RAW)) { {
if (isset($ele->parentNode) && isset($ele->parentNode->tagName) && Elements::isA($ele->parentNode->localName, Elements::TEXT_RAW)) {
$this->wr($ele->data); $this->wr($ele->data);
return; return;
} }
// FIXME: This probably needs some flags set. // FIXME: This probably needs some flags set.
$this->wr($this->enc($ele->data)); $this->wr($this->enc($ele->data));
} }
public function cdata($ele) { public function cdata($ele)
{
// This encodes CDATA. // This encodes CDATA.
$this->wr($ele->ownerDocument->saveXML($ele)); $this->wr($ele->ownerDocument->saveXML($ele));
} }
public function comment($ele) { public function comment($ele)
{
// These produce identical output. // These produce identical output.
// $this->wr('<!--')->wr($ele->data)->wr('-->'); // $this->wr('<!--')->wr($ele->data)->wr('-->');
$this->wr($ele->ownerDocument->saveXML($ele)); $this->wr($ele->ownerDocument->saveXML($ele));
} }
public function processorInstruction($ele) { public function processorInstruction($ele)
$this->wr('<?')->wr($ele->target)->wr(' ')->wr($ele->data)->wr('?>'); {
$this->wr('<?')
->wr($ele->target)
->wr(' ')
->wr($ele->data)
->wr('?>');
}
/**
* Write the namespace attributes
*
*
* @param \DOMNode $ele
* The element being written.
*/
protected function namespaceAttrs($ele)
{
    // Reuse the cached XPath helper only while it is bound to this
    // element's document; otherwise build a fresh one.
    $doc = $ele->ownerDocument;
    $stale = !$this->xpath || $this->xpath->document !== $doc;
    if ($stale) {
        $this->xpath = new \DOMXPath($doc);
    }

    // Select namespace nodes on this element that differ from those
    // already in scope on its parent.
    $declared = $this->xpath->query('namespace::*[not(.=../../namespace::*)]', $ele);

    foreach ($declared as $nsNode) {
        // Namespaces listed as implicit are never written out.
        if (in_array($nsNode->nodeValue, $this->implicitNamespaces)) {
            continue;
        }

        $this->wr(' ');
        $this->wr($nsNode->nodeName);
        $this->wr('="');
        $this->wr($nsNode->nodeValue);
        $this->wr('"');
    }
}
/** /**
@ -132,26 +235,30 @@ class OutputRules implements \HTML5\Serializer\RulesInterface {
* @param \DOMNode $ele * @param \DOMNode $ele
* The element being written. * The element being written.
*/ */
protected function openTag($ele) { protected function openTag($ele)
$this->wr('<')->wr($ele->tagName); {
$this->wr('<')->wr($this->traverser->isLocalElement($ele) ? $ele->localName : $ele->tagName);
$this->attrs($ele); $this->attrs($ele);
$this->namespaceAttrs($ele);
if ($this->outputMode == static::IM_IN_HTML) { if ($this->outputMode == static::IM_IN_HTML) {
$this->wr('>'); $this->wr('>');
} } // If we are not in html mode we are in SVG, MathML, or XML embedded content.
// If we are not in html mode we are in SVG, MathML, or XML embedded content.
else { else {
if ($ele->hasChildNodes()) { if ($ele->hasChildNodes()) {
$this->wr('>'); $this->wr('>');
} } // If there are no children this is self closing.
// If there are no children this is self closing.
else { else {
$this->wr(' />'); $this->wr(' />');
} }
} }
} }
protected function attrs($ele) { protected function attrs($ele)
{
// FIXME: Needs support for xml, xmlns, xlink, and namespaced elements. // FIXME: Needs support for xml, xmlns, xlink, and namespaced elements.
if (! $ele->hasAttributes()) { if (! $ele->hasAttributes()) {
return $this; return $this;
@ -163,7 +270,7 @@ class OutputRules implements \HTML5\Serializer\RulesInterface {
$len = $map->length; $len = $map->length;
for ($i = 0; $i < $len; ++ $i) { for ($i = 0; $i < $len; ++ $i) {
$node = $map->item($i); $node = $map->item($i);
$val = $this->enc($node->value, TRUE); $val = $this->enc($node->value, true);
// XXX: The spec says that we need to ensure that anything in // XXX: The spec says that we need to ensure that anything in
// the XML, XMLNS, or XLink NS's should use the canonical // the XML, XMLNS, or XLink NS's should use the canonical
@ -175,18 +282,68 @@ class OutputRules implements \HTML5\Serializer\RulesInterface {
// Using if/elseif instead of switch because it's faster in PHP. // Using if/elseif instead of switch because it's faster in PHP.
if ($this->outputMode == static::IM_IN_SVG) { if ($this->outputMode == static::IM_IN_SVG) {
$name = Elements::normalizeSvgAttribute($name); $name = Elements::normalizeSvgAttribute($name);
} } elseif ($this->outputMode == static::IM_IN_MATHML) {
elseif ($this->outputMode == static::IM_IN_MATHML) {
$name = Elements::normalizeMathMlAttribute($name); $name = Elements::normalizeMathMlAttribute($name);
} }
$this->wr(' ')->wr($name); $this->wr(' ')->wr($name);
if (isset($val) && $val !== '') {
if ((isset($val) && $val !== '') || $this->nonBooleanAttribute($node)) {
$this->wr('="')->wr($val)->wr('"'); $this->wr('="')->wr($val)->wr('"');
} }
} }
} }
/**
 * Decide whether an attribute must always be written with a value.
 *
 * Walks the rules in $this->nonBooleanAttributes and returns true for the
 * first rule whose every present constraint matches the attribute and its
 * owner element. A constraint that is absent from a rule matches anything.
 *
 * @param \DOMAttr $attr
 *   The attribute being serialized.
 *
 * @return bool True if some rule matches, false otherwise.
 */
protected function nonBooleanAttribute(\DOMAttr $attr)
{
    $owner = $attr->ownerElement;

    foreach ($this->nonBooleanAttributes as $rule) {
        // Namespace constraints on the element and on the attribute.
        if (isset($rule['nodeNamespace']) && $rule['nodeNamespace'] !== $owner->namespaceURI) {
            continue;
        }
        if (isset($rule['attNamespace']) && $rule['attNamespace'] !== $attr->namespaceURI) {
            continue;
        }

        // Element name: the rule may give a single name or a list of names.
        if (isset($rule['nodeName'])) {
            $nodeNames = is_array($rule['nodeName']) ? $rule['nodeName'] : array($rule['nodeName']);
            if (!in_array($owner->localName, $nodeNames, true)) {
                continue;
            }
        }

        // Attribute name: same single-or-list convention.
        if (isset($rule['attrName'])) {
            $attrNames = is_array($rule['attrName']) ? $rule['attrName'] : array($rule['attrName']);
            if (!in_array($attr->localName, $attrNames, true)) {
                continue;
            }
        }

        // Optional XPath test, evaluated with the owner element as context.
        if (isset($rule['xpath'])) {
            $query = $this->getXPath($attr);
            if (isset($rule['prefixes'])) {
                foreach ($rule['prefixes'] as $prefix => $uri) {
                    $query->registerNamespace($prefix, $uri);
                }
            }
            if (!$query->query($rule['xpath'], $owner)->length) {
                continue;
            }
        }

        // Every constraint present on this rule matched.
        return true;
    }

    return false;
}
/**
 * Return a DOMXPath bound to the document that owns $node.
 *
 * The instance is cached in $this->xpath. It is rebuilt when nothing is
 * cached yet or when the cached instance belongs to a different document,
 * which is the same ownership check namespaceAttrs() applies to this
 * shared property. Without it, a helper created for another document
 * could be returned and queried against the wrong tree.
 *
 * @param \DOMNode $node
 *   Any node attached to the document to query.
 *
 * @return \DOMXPath The XPath helper for the node's owner document.
 */
private function getXPath(\DOMNode $node){
    $doc = $node->ownerDocument;
    if (!$this->xpath || $this->xpath->document !== $doc) {
        $this->xpath = new \DOMXPath($doc);
    }
    return $this->xpath;
}
/** /**
* Write the closing tag. * Write the closing tag.
* *
@ -196,9 +353,10 @@ class OutputRules implements \HTML5\Serializer\RulesInterface {
* @param \DOMNode $ele * @param \DOMNode $ele
* The element being written. * The element being written.
*/ */
protected function closeTag($ele) { protected function closeTag($ele)
{
if ($this->outputMode == static::IM_IN_HTML || $ele->hasChildNodes()) { if ($this->outputMode == static::IM_IN_HTML || $ele->hasChildNodes()) {
$this->wr('</')->wr($ele->tagName)->wr('>'); $this->wr('</')->wr($this->traverser->isLocalElement($ele) ? $ele->localName : $ele->tagName)->wr('>');
} }
} }
@ -208,10 +366,10 @@ class OutputRules implements \HTML5\Serializer\RulesInterface {
* @param string $text * @param string $text
* The string to put into the output. * The string to put into the output.
* *
* @return HTML5\Serializer\Traverser * @return \Masterminds\HTML5\Serializer\Traverser $this so it can be used in chaining.
* $this so it can be used in chaining.
*/ */
protected function wr($text) { protected function wr($text)
{
fwrite($this->out, $text); fwrite($this->out, $text);
return $this; return $this;
} }
@ -219,10 +377,10 @@ class OutputRules implements \HTML5\Serializer\RulesInterface {
/** /**
* Write a new line character. * Write a new line character.
* *
* @return HTML5\Serializer\Traverser * @return \Masterminds\HTML5\Serializer\Traverser $this so it can be used in chaining.
* $this so it can be used in chaining.
*/ */
protected function nl() { protected function nl()
{
fwrite($this->out, PHP_EOL); fwrite($this->out, PHP_EOL);
return $this; return $this;
} }
@ -230,7 +388,7 @@ class OutputRules implements \HTML5\Serializer\RulesInterface {
/** /**
* Encode text. * Encode text.
* *
* When encode is set to FALSE, the default value, the text passed in is * When encode is set to false, the default value, the text passed in is
* escaped per section 8.3 of the html5 spec. For details on how text is * escaped per section 8.3 of the html5 spec. For details on how text is
* escaped see the escape() method. * escaped see the escape() method.
* *
@ -241,9 +399,7 @@ class OutputRules implements \HTML5\Serializer\RulesInterface {
* *
* The named character references are listed in section 8.5. * The named character references are listed in section 8.5.
* *
* @see http://www.w3.org/TR/2013/CR-html5-20130806/syntax.html#named-character-references * @see http://www.w3.org/TR/2013/CR-html5-20130806/syntax.html#named-character-references True encoding will turn all named character references into their entities.
*
* True encoding will turn all named character references into their entities.
* This includes such characters as +.# and many other common ones. By default * This includes such characters as +.# and many other common ones. By default
* encoding here will just escape &'<>". * encoding here will just escape &'<>".
* *
@ -256,10 +412,10 @@ class OutputRules implements \HTML5\Serializer\RulesInterface {
* @param boolean $attribute * @param boolean $attribute
* True if we are encoding an attribute, false otherwise * True if we are encoding an attribute, false otherwise
* *
* @return string * @return string The encoded text.
* The encoded text.
*/ */
protected function enc($text, $attribute = FALSE) { protected function enc($text, $attribute = false)
{
// Escape the text rather than convert to named character references. // Escape the text rather than convert to named character references.
if (! $this->encode) { if (! $this->encode) {
@ -268,13 +424,13 @@ class OutputRules implements \HTML5\Serializer\RulesInterface {
// If we are in PHP 5.4+ we can use the native html5 entity functionality to // If we are in PHP 5.4+ we can use the native html5 entity functionality to
// convert the named character references. // convert the named character references.
if (defined('ENT_HTML5')) {
return htmlentities($text, ENT_HTML5 | ENT_SUBSTITUTE | ENT_QUOTES, 'UTF-8', FALSE); if ($this->hasHTML5) {
} return htmlentities($text, ENT_HTML5 | ENT_SUBSTITUTE | ENT_QUOTES, 'UTF-8', false);
// If a version earlier than 5.4 html5 entities are not entirely handled. } // If a version earlier than 5.4 html5 entities are not entirely handled.
// This manually handles them. // This manually handles them.
else { else {
return strtr($text, \HTML5\Serializer\HTML5Entities::$map); return strtr($text, \Masterminds\HTML5\Serializer\HTML5Entities::$map);
} }
} }
@ -297,16 +453,25 @@ class OutputRules implements \HTML5\Serializer\RulesInterface {
* @param boolean $attribute * @param boolean $attribute
* True if we are escaping an attribute, false otherwise * True if we are escaping an attribute, false otherwise
*/ */
protected function escape($text, $attribute = FALSE) { protected function escape($text, $attribute = false)
{
// Not using htmlspecialchars because, while it does escaping, it doesn't // Not using htmlspecialchars because, while it does escaping, it doesn't
// match the requirements of section 8.5. For example, it doesn't handle // match the requirements of section 8.5. For example, it doesn't handle
// non-breaking spaces. // non-breaking spaces.
if ($attribute) { if ($attribute) {
$replace = array('"'=>'&quot;', '&'=>'&amp;', "\xc2\xa0"=>'&nbsp;'); $replace = array(
} '"' => '&quot;',
else { '&' => '&amp;',
$replace = array('<'=>'&lt;', '>'=>'&gt;', '&'=>'&amp;', "\xc2\xa0"=>'&nbsp;'); "\xc2\xa0" => '&nbsp;'
);
} else {
$replace = array(
'<' => '&lt;',
'>' => '&gt;',
'&' => '&amp;',
"\xc2\xa0" => '&nbsp;'
);
} }
return strtr($text, $replace); return strtr($text, $replace);

View File

@ -3,16 +3,18 @@
* @file * @file
* The interface definition for Rules to generate output. * The interface definition for Rules to generate output.
*/ */
namespace HTML5\Serializer; namespace Masterminds\HTML5\Serializer;
/** /**
* To create a new rule set for writing output the RulesInterface needs to be * To create a new rule set for writing output the RulesInterface needs to be
* implemented. The resulting class can be specified in the options with the * implemented.
* The resulting class can be specified in the options with the
* key of rules. * key of rules.
* *
* For an example implementation see \HTML5\Serializer\OutputRules. * For an example implementation see \Masterminds\HTML5\Serializer\OutputRules.
*/ */
interface RulesInterface { interface RulesInterface
{
/** /**
* The class constructor. * The class constructor.
@ -31,12 +33,11 @@ interface RulesInterface {
* *
* Note, only one traverser can be used by the rules. * Note, only one traverser can be used by the rules.
* *
* @param \HTML5\Serializer\Traverser $traverser * @param \Masterminds\HTML5\Serializer\Traverser $traverser
* The traverser used in the rules. * The traverser used in the rules.
* @return \HTML5\Serializer\RulesInterface * @return \Masterminds\HTML5\Serializer\RulesInterface $this for the current object.
* $this for the current object.
*/ */
public function setTraverser(\HTML5\Serializer\Traverser $traverser); public function setTraverser(\Masterminds\HTML5\Serializer\Traverser $traverser);
/** /**
* Write a document element (\DOMDocument). * Write a document element (\DOMDocument).
@ -91,7 +92,7 @@ interface RulesInterface {
/** /**
* Write a processor instruction. * Write a processor instruction.
* *
* To learn about processor instructions see \HTML5\InstructionProcessor * To learn about processor instructions see \Masterminds\HTML5\InstructionProcessor
* *
* Instead of returning the result write it to the output stream ($output) * Instead of returning the result write it to the output stream ($output)
* that was passed into the constructor. * that was passed into the constructor.

View File

@ -1,5 +1,5 @@
<?php <?php
namespace HTML5\Serializer; namespace Masterminds\HTML5\Serializer;
/** /**
* Traverser for walking a DOM tree. * Traverser for walking a DOM tree.
@ -10,19 +10,26 @@ namespace HTML5\Serializer;
* *
* @see http://www.w3.org/TR/2012/CR-html5-20121217/syntax.html#serializing-html-fragments * @see http://www.w3.org/TR/2012/CR-html5-20121217/syntax.html#serializing-html-fragments
*/ */
class Traverser { class Traverser
{
/** Namespaces that should be treated as "local" to HTML5. */ /**
* Namespaces that should be treated as "local" to HTML5.
*/
static $local_ns = array( static $local_ns = array(
'http://www.w3.org/1999/xhtml' => 'html', 'http://www.w3.org/1999/xhtml' => 'html',
'http://www.w3.org/1998/Math/MathML' => 'math', 'http://www.w3.org/1998/Math/MathML' => 'math',
'http://www.w3.org/2000/svg' => 'svg', 'http://www.w3.org/2000/svg' => 'svg'
); );
protected $dom; protected $dom;
protected $options; protected $options;
protected $encode = FALSE;
protected $encode = false;
protected $rules; protected $rules;
protected $out; protected $out;
/** /**
@ -36,10 +43,11 @@ class Traverser {
* @param array $options * @param array $options
* An array or options for the traverser as key/value pairs. These include: * An array or options for the traverser as key/value pairs. These include:
* - encode_entities: A bool to specify if full encoding should happen for all named * - encode_entities: A bool to specify if full encoding should happen for all named
* character references. Defaults to FALSE which escapes &'<>". * character references. Defaults to false which escapes &'<>".
* - output_rules: The path to the class handling the output rules. * - output_rules: The path to the class handling the output rules.
*/ */
public function __construct($dom, $out, RulesInterface $rules, $options = array()) { public function __construct($dom, $out, RulesInterface $rules, $options = array())
{
$this->dom = $dom; $this->dom = $dom;
$this->out = $out; $this->out = $out;
$this->rules = $rules; $this->rules = $rules;
@ -54,24 +62,21 @@ class Traverser {
* @return resource $out * @return resource $out
* Returns the output stream. * Returns the output stream.
*/ */
public function walk() { public function walk()
{
if ($this->dom instanceof \DOMDocument) { if ($this->dom instanceof \DOMDocument) {
$this->rules->document($this->dom); $this->rules->document($this->dom);
} } elseif ($this->dom instanceof \DOMDocumentFragment) {
elseif ($this->dom instanceof \DOMDocumentFragment) {
// Document fragments are a special case. Only the children need to // Document fragments are a special case. Only the children need to
// be serialized. // be serialized.
if ($this->dom->hasChildNodes()) { if ($this->dom->hasChildNodes()) {
$this->children($this->dom->childNodes); $this->children($this->dom->childNodes);
} }
} } // If NodeList, loop
// If NodeList, loop
elseif ($this->dom instanceof \DOMNodeList) { elseif ($this->dom instanceof \DOMNodeList) {
// If this is a NodeList of DOMDocuments this will not work. // If this is a NodeList of DOMDocuments this will not work.
$this->children($this->dom); $this->children($this->dom);
} } // Else assume this is a DOMNode-like datastructure.
// Else assume this is a DOMNode-like datastructure.
else { else {
$this->node($this->dom); $this->node($this->dom);
} }
@ -85,7 +90,8 @@ class Traverser {
* @param mixed $node * @param mixed $node
* A node implementing \DOMNode. * A node implementing \DOMNode.
*/ */
public function node($node) { public function node($node)
{
// A listing of types is at http://php.net/manual/en/dom.constants.php // A listing of types is at http://php.net/manual/en/dom.constants.php
switch ($node->nodeType) { switch ($node->nodeType) {
case XML_ELEMENT_NODE: case XML_ELEMENT_NODE:
@ -117,7 +123,8 @@ class Traverser {
* @param \DOMNodeList $nl * @param \DOMNodeList $nl
* A list of child elements to walk through. * A list of child elements to walk through.
*/ */
public function children($nl) { public function children($nl)
{
foreach ($nl as $node) { foreach ($nl as $node) {
$this->node($node); $this->node($node);
} }
@ -129,14 +136,15 @@ class Traverser {
* @param mixed $ele * @param mixed $ele
* An element that implement \DOMNode. * An element that implement \DOMNode.
* *
* @return bool * @return bool True if local and false otherwise.
* True if local and false otherwise.
*/ */
public function isLocalElement($ele) { public function isLocalElement($ele)
{
$uri = $ele->namespaceURI; $uri = $ele->namespaceURI;
if (empty($uri)) { if (empty($uri)) {
return FALSE; return false;
} }
return isset(static::$local_ns[$uri]); return isset(static::$local_ns[$uri]);
} }
} }

View File

@ -23,10 +23,11 @@ class HTML5PHP_Autoloader
public function autoload($class) public function autoload($class)
{ {
// Only load the class if it starts with "HTML5" // Only load the class if it starts with "HTML5"
if (strpos($class, 'HTML5') !== 0) if (strpos($class, 'Masterminds\HTML5') !== 0)
{ {
return; return;
} }
$class = substr($class, 12);
//die($class); //die($class);
$filename = $this->path . DIRECTORY_SEPARATOR . str_replace('\\', DIRECTORY_SEPARATOR, $class) . '.php'; $filename = $this->path . DIRECTORY_SEPARATOR . str_replace('\\', DIRECTORY_SEPARATOR, $class) . '.php';

View File

@ -394,7 +394,7 @@ class HumbleHttpAgent
// for AJAX sites, e.g. Blogger with its dynamic views templates. // for AJAX sites, e.g. Blogger with its dynamic views templates.
// Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification
if (isset($this->requests[$orig]['body'])) { if (isset($this->requests[$orig]['body'])) {
$redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000)); $redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 150000));
if ($redirectURL) { if ($redirectURL) {
$this->redirectQueue[$orig] = $redirectURL; $this->redirectQueue[$orig] = $redirectURL;
} }
@ -515,7 +515,7 @@ class HumbleHttpAgent
// for AJAX sites, e.g. Blogger with its dynamic views templates. // for AJAX sites, e.g. Blogger with its dynamic views templates.
// Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification
if (isset($this->requests[$orig]['body'])) { if (isset($this->requests[$orig]['body'])) {
$redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000)); $redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 150000));
if ($redirectURL) { if ($redirectURL) {
$this->redirectQueue[$orig] = $redirectURL; $this->redirectQueue[$orig] = $redirectURL;
} }
@ -601,7 +601,7 @@ class HumbleHttpAgent
// for AJAX sites, e.g. Blogger with its dynamic views templates. // for AJAX sites, e.g. Blogger with its dynamic views templates.
// Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification
if (isset($this->requests[$orig]['body'])) { if (isset($this->requests[$orig]['body'])) {
$redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000)); $redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 150000));
if ($redirectURL) { if ($redirectURL) {
$this->redirectQueue[$orig] = $redirectURL; $this->redirectQueue[$orig] = $redirectURL;
} }

View File

@ -113,19 +113,22 @@ class Readability
function __construct($html, $url=null, $parser='libxml') function __construct($html, $url=null, $parser='libxml')
{ {
$this->url = $url; $this->url = $url;
/* Turn all double br's into p's */ /* Turn all double <br>s into <p>s */
$html = preg_replace($this->regexps['replaceBrs'], '</p><p>', $html); $html = preg_replace($this->regexps['replaceBrs'], '</p><p>', $html);
$html = preg_replace($this->regexps['replaceFonts'], '<$1span>', $html); $html = preg_replace($this->regexps['replaceFonts'], '<$1span>', $html);
$html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
if (trim($html) == '') $html = '<html></html>'; if (trim($html) == '') $html = '<html></html>';
if ($parser=='html5lib' || $parser=='html5php') { if ($parser=='html5lib' || $parser=='html5php') {
if (version_compare(PHP_VERSION, '5.3.0') >= 0) { if (version_compare(PHP_VERSION, '5.3.0') >= 0) {
$this->dom = HTML5::loadHTML($html); //use Masterminds\HTML5;
$html5class = 'Masterminds\HTML5';
$html5 = new $html5class();
$this->dom = $html5->loadHTML($html);
} }
} }
if ($this->dom === null) { if ($this->dom === null) {
$this->dom = new DOMDocument(); $this->dom = new DOMDocument();
$this->dom->preserveWhiteSpace = false; $this->dom->preserveWhiteSpace = false;
$html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
@$this->dom->loadHTML($html); @$this->dom->loadHTML($html);
} }
$this->dom->registerNodeClass('DOMElement', 'JSLikeHTMLElement'); $this->dom->registerNodeClass('DOMElement', 'JSLikeHTMLElement');

View File

@ -3,8 +3,8 @@
// Author: Keyvan Minoukadeh // Author: Keyvan Minoukadeh
// Copyright (c) 2014 Keyvan Minoukadeh // Copyright (c) 2014 Keyvan Minoukadeh
// License: AGPLv3 // License: AGPLv3
// Version: 3.3 // Version: 3.4
// Date: 2014-05-07 // Date: 2014-08-28
// More info: http://fivefilters.org/content-only/ // More info: http://fivefilters.org/content-only/
// Help: http://help.fivefilters.org // Help: http://help.fivefilters.org
@ -29,6 +29,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
// For more request parameters, see http://help.fivefilters.org/customer/portal/articles/226660-usage // For more request parameters, see http://help.fivefilters.org/customer/portal/articles/226660-usage
error_reporting(E_ALL ^ E_NOTICE); error_reporting(E_ALL ^ E_NOTICE);
libxml_use_internal_errors(true);
ini_set("display_errors", 1); ini_set("display_errors", 1);
@set_time_limit(120); @set_time_limit(120);
@ -82,9 +83,11 @@ function autoload($class_name) {
// Language detect // Language detect
'Text_LanguageDetect' => 'language-detect/LanguageDetect.php', 'Text_LanguageDetect' => 'language-detect/LanguageDetect.php',
// HTML5 PHP (can't be used unless PHP version is >= 5.3) // HTML5 PHP (can't be used unless PHP version is >= 5.3)
'HTML5' => 'html5php/HTML5.php', 'Masterminds\HTML5' => 'html5php/HTML5.php',
// htmLawed - used if XSS filter is enabled (xss_filter) // htmLawed - used if XSS filter is enabled (xss_filter)
'htmLawed' => 'htmLawed/htmLawed.php' 'htmLawed' => 'htmLawed/htmLawed.php',
// Disable SimplePie sanitization
'DisableSimplePieSanitize' => 'DisableSimplePieSanitize.php'
); );
if (isset($mapping[$class_name])) { if (isset($mapping[$class_name])) {
debug("** Loading class $class_name ({$mapping[$class_name]})"); debug("** Loading class $class_name ({$mapping[$class_name]})");
@ -180,19 +183,9 @@ if (strtolower(substr($url, 0, 7)) == 'feed://') {
if (!preg_match('!^https?://.+!i', $url)) { if (!preg_match('!^https?://.+!i', $url)) {
$url = 'http://'.$url; $url = 'http://'.$url;
} }
$url = validate_url($url);
if (!$url) die('Invalid URL supplied');
$url = filter_var($url, FILTER_SANITIZE_URL);
$test = filter_var($url, FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED);
// deal with bug http://bugs.php.net/51192 (present in PHP 5.2.13 and PHP 5.3.2)
if ($test === false) {
$test = filter_var(strtr($url, '-', '_'), FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED);
}
if ($test !== false && $test !== null && preg_match('!^https?://!', $url)) {
// all okay
unset($test);
} else {
die('Invalid URL supplied');
}
debug("Supplied URL: $url"); debug("Supplied URL: $url");
///////////////////////////////// /////////////////////////////////
@ -200,27 +193,11 @@ debug("Supplied URL: $url");
// (if in 'full' mode) // (if in 'full' mode)
///////////////////////////////// /////////////////////////////////
if ((_FF_FTR_MODE == 'full') && isset($_REQUEST['key']) && ($key_index = array_search($_REQUEST['key'], $options->api_keys)) !== false) { if ((_FF_FTR_MODE == 'full') && isset($_REQUEST['key']) && ($key_index = array_search($_REQUEST['key'], $options->api_keys)) !== false) {
$host = $_SERVER['HTTP_HOST']; if (isset($_REQUEST['key_redirect']) && $_REQUEST['key_redirect'] === '0') {
$path = rtrim(dirname($_SERVER['SCRIPT_NAME']), '/\\'); $_REQUEST['hash'] = sha1($_REQUEST['key'].$url);
$_qs_url = (strtolower(substr($url, 0, 7)) == 'http://') ? substr($url, 7) : $url; $_REQUEST['key'] = $key_index;
$redirect = 'http://'.htmlspecialchars($host.$path).'/makefulltextfeed.php?url='.urlencode($_qs_url); } else {
$redirect .= '&key='.$key_index; $redirect = get_self_url();
$redirect .= '&hash='.urlencode(sha1($_REQUEST['key'].$url));
if (isset($_REQUEST['html'])) $redirect .= '&html='.urlencode($_REQUEST['html']);
if (isset($_REQUEST['max'])) $redirect .= '&max='.(int)$_REQUEST['max'];
if (isset($_REQUEST['links'])) $redirect .= '&links='.urlencode($_REQUEST['links']);
if (isset($_REQUEST['exc'])) $redirect .= '&exc='.urlencode($_REQUEST['exc']);
if (isset($_REQUEST['format'])) $redirect .= '&format='.urlencode($_REQUEST['format']);
if (isset($_REQUEST['callback'])) $redirect .= '&callback='.urlencode($_REQUEST['callback']);
if (isset($_REQUEST['l'])) $redirect .= '&l='.urlencode($_REQUEST['l']);
if (isset($_REQUEST['lang'])) $redirect .= '&lang='.urlencode($_REQUEST['lang']);
if (isset($_REQUEST['xss'])) $redirect .= '&xss';
if (isset($_REQUEST['use_extracted_title'])) $redirect .= '&use_extracted_title';
if (isset($_REQUEST['content'])) $redirect .= '&content='.urlencode($_REQUEST['content']);
if (isset($_REQUEST['summary'])) $redirect .= '&summary='.urlencode($_REQUEST['summary']);
if (isset($_REQUEST['debug'])) $redirect .= '&debug';
if (isset($_REQUEST['parser'])) $redirect .= '&parser='.urlencode($_REQUEST['parser']);
if (isset($_REQUEST['proxy'])) $redirect .= '&proxy='.urlencode($_REQUEST['proxy']);
if ($debug_mode) { if ($debug_mode) {
debug('Redirecting to hide access key, follow URL below to continue'); debug('Redirecting to hide access key, follow URL below to continue');
debug("Location: $redirect"); debug("Location: $redirect");
@ -229,6 +206,7 @@ if ((_FF_FTR_MODE == 'full') && isset($_REQUEST['key']) && ($key_index = array_s
} }
exit; exit;
} }
}
/////////////////////////////////////////////// ///////////////////////////////////////////////
// Set timezone. // Set timezone.
@ -241,9 +219,25 @@ if (!ini_get('date.timezone') || !@date_default_timezone_set(ini_get('date.timez
} }
/////////////////////////////////////////////// ///////////////////////////////////////////////
// Check if the request is explicitly for an HTML page // Should we treat input URL as feed or HTML?
/////////////////////////////////////////////// ///////////////////////////////////////////////
$html_only = (isset($_REQUEST['html']) && ($_REQUEST['html'] == '1' || $_REQUEST['html'] == 'true')); $accept = 'auto';
if (isset($_REQUEST['accept']) && in_array(strtolower($_REQUEST['accept']), array('html', 'feed', 'auto'))) {
$accept = strtolower($_REQUEST['accept']);
} elseif (isset($_REQUEST['html']) && ($_REQUEST['html'] == '1' || $_REQUEST['html'] == 'true')) {
$accept = 'html';
}
///////////////////////////////////////////////
// User-submitted site config
///////////////////////////////////////////////
// Holds the raw site config text supplied via the 'siteconfig' request
// parameter, or null when the parameter is absent.
$user_submitted_config = null;
if (isset($_REQUEST['siteconfig'])) {
    $user_submitted_config = $_REQUEST['siteconfig'];
    // Gate on the same option that later decides whether the config is
    // handed to the extractor ($options->user_submitted_config). empty()
    // treats an unset option as "disabled" without raising a notice.
    if (empty($options->user_submitted_config) && $user_submitted_config) {
        die('User-submitted site configs are currently disabled. Please remove the siteconfig parameter.');
    }
}
/////////////////////////////////////////////// ///////////////////////////////////////////////
// Check if valid key supplied // Check if valid key supplied
@ -463,8 +457,8 @@ if (isset($_REQUEST['inputhtml']) && _FF_FTR_MODE == 'simple') {
////////////////////////////////// //////////////////////////////////
if ($options->caching) { if ($options->caching) {
debug('Caching is enabled...'); debug('Caching is enabled...');
$cache_id = md5($max.$url.(int)$valid_key.$links.(int)$favour_feed_titles.(int)$options->content.(int)$options->summary. $cache_id = md5($max.$url.(int)$valid_key.$accept.$links.(int)$favour_feed_titles.(int)$options->content.(int)$options->summary.
(int)$xss_filter.(int)$exclude_on_fail.$format.$detect_language.$parser._FF_FTR_MODE); (int)$xss_filter.(int)$exclude_on_fail.$format.$detect_language.$parser.$user_submitted_config._FF_FTR_MODE);
$check_cache = true; $check_cache = true;
if ($options->apc && $options->smart_cache) { if ($options->apc && $options->smart_cache) {
apc_add("cache.$cache_id", 0, $options->cache_time*60); apc_add("cache.$cache_id", 0, $options->cache_time*60);
@ -548,11 +542,14 @@ SiteConfig::use_apc($options->apc);
$extractor->fingerprints = $options->fingerprints; $extractor->fingerprints = $options->fingerprints;
$extractor->allowedParsers = $options->allowed_parsers; $extractor->allowedParsers = $options->allowed_parsers;
$extractor->parserOverride = $parser; $extractor->parserOverride = $parser;
if ($options->user_submitted_config && $user_submitted_config) {
$extractor->setUserSubmittedConfig($user_submitted_config);
}
//////////////////////////////// ////////////////////////////////
// Get RSS/Atom feed // Get RSS/Atom feed
//////////////////////////////// ////////////////////////////////
if (!$html_only) { if ($accept !== 'html') {
debug('--------'); debug('--------');
debug("Attempting to process URL as feed"); debug("Attempting to process URL as feed");
// Send user agent header showing PHP (prevents a HTML response from feedburner) // Send user agent header showing PHP (prevents a HTML response from feedburner)
@ -563,6 +560,9 @@ if (!$html_only) {
// some feeds use the text/html content type - force_feed tells SimplePie to process anyway // some feeds use the text/html content type - force_feed tells SimplePie to process anyway
$feed->force_feed(true); $feed->force_feed(true);
$feed->set_file_class('SimplePie_HumbleHttpAgent'); $feed->set_file_class('SimplePie_HumbleHttpAgent');
$feed->set_sanitize_class('DisableSimplePieSanitize');
// need to assign this manually it seems
$feed->sanitize = new DisableSimplePieSanitize();
//$feed->set_feed_url($url); // colons appearing in the URL's path get encoded //$feed->set_feed_url($url); // colons appearing in the URL's path get encoded
$feed->feed_url = $url; $feed->feed_url = $url;
$feed->set_autodiscovery_level(SIMPLEPIE_LOCATOR_NONE); $feed->set_autodiscovery_level(SIMPLEPIE_LOCATOR_NONE);
@ -578,6 +578,8 @@ if (!$html_only) {
//$feed->get_title(); //$feed->get_title();
if ($result && (!is_array($feed->data) || count($feed->data) == 0)) { if ($result && (!is_array($feed->data) || count($feed->data) == 0)) {
die('Sorry, no feed items found'); die('Sorry, no feed items found');
} elseif (!$result && $accept === 'feed') {
die('Sorry, couldn\'t parse as feed');
} }
// from now on, we'll identify ourselves as a browser // from now on, we'll identify ourselves as a browser
$http->userAgentDefault = HumbleHttpAgent::UA_BROWSER; $http->userAgentDefault = HumbleHttpAgent::UA_BROWSER;
@ -589,7 +591,7 @@ if (!$html_only) {
// single-item feeds. // single-item feeds.
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
$isDummyFeed = false; $isDummyFeed = false;
if ($html_only || !$result) { if ($accept === 'html' || !$result) {
debug('--------'); debug('--------');
debug("Constructing a single-item feed from URL"); debug("Constructing a single-item feed from URL");
$isDummyFeed = true; $isDummyFeed = true;
@ -627,6 +629,8 @@ if ($html_only || !$result) {
//////////////////////////////////////////// ////////////////////////////////////////////
$output = new FeedWriter(); $output = new FeedWriter();
if (_FF_FTR_MODE === 'simple') $output->enableSimpleJson(); if (_FF_FTR_MODE === 'simple') $output->enableSimpleJson();
//$feed_title = $feed->get_title();
//echo $feed_title; exit;
$output->setTitle(strip_tags($feed->get_title())); $output->setTitle(strip_tags($feed->get_title()));
$output->setDescription(strip_tags($feed->get_description())); $output->setDescription(strip_tags($feed->get_description()));
$output->setXsl('css/feed.xsl'); // Chrome uses this, most browsers ignore it $output->setXsl('css/feed.xsl'); // Chrome uses this, most browsers ignore it
@ -635,7 +639,9 @@ if ($ttl !== null) {
$ttl = (int)$ttl[0]['data']; $ttl = (int)$ttl[0]['data'];
$output->setTtl($ttl); $output->setTtl($ttl);
} }
//$output->setSelf('http://'.$_SERVER['HTTP_HOST'].$_SERVER['REQUEST_URI']); $output->setSelf(get_self_url());
$output->setAlternate($url, 'Source URL');
$output->setRelated('http://www.subtome.com/#/subscribe?feeds='.urlencode(get_self_url()).'&back='.urlencode(get_self_url()), 'Subscribe to feed');
$output->setLink($feed->get_link()); // Google Reader uses this for pulling in favicons $output->setLink($feed->get_link()); // Google Reader uses this for pulling in favicons
if ($img_url = $feed->get_image_url()) { if ($img_url = $feed->get_image_url()) {
$output->setImage($feed->get_title(), $feed->get_link(), $img_url); $output->setImage($feed->get_title(), $feed->get_link(), $img_url);
@ -656,8 +662,13 @@ foreach ($items as $key => $item) {
// simplepie already sanitizes URLs so let's not do it again here. // simplepie already sanitizes URLs so let's not do it again here.
//$permalink = $http->validateUrl($permalink); //$permalink = $http->validateUrl($permalink);
if ($permalink) { if ($permalink) {
if (!url_allowed($permalink)) {
debug('URL blocked, skipping...');
$permalink = false;
} else {
$urls_sanitized[] = $permalink; $urls_sanitized[] = $permalink;
} }
}
$urls[$key] = $permalink; $urls[$key] = $permalink;
} }
debug('--------'); debug('--------');
@ -669,6 +680,7 @@ $http->fetchAll($urls_sanitized);
$item_count = 0; $item_count = 0;
foreach ($items as $key => $item) { foreach ($items as $key => $item) {
libxml_clear_errors();
debug('--------'); debug('--------');
debug('Processing feed item '.($item_count+1)); debug('Processing feed item '.($item_count+1));
$do_content_extraction = true; $do_content_extraction = true;
@ -697,7 +709,10 @@ foreach ($items as $key => $item) {
// errors being treated as valid responses. // errors being treated as valid responses.
if ($permalink && ($response = $http->get($permalink, true)) && ($response['status_code'] < 300)) { if ($permalink && ($response = $http->get($permalink, true)) && ($response['status_code'] < 300)) {
$effective_url = $response['effective_url']; $effective_url = $response['effective_url'];
if (!url_allowed($effective_url)) continue; if (!url_allowed($effective_url)) {
debug('URL blocked, skipping...');
continue;
}
// check if action defined for returned Content-Type // check if action defined for returned Content-Type
$mime_info = get_mime_action_info($response['headers']); $mime_info = get_mime_action_info($response['headers']);
if (isset($mime_info['action'])) { if (isset($mime_info['action'])) {
@ -727,7 +742,7 @@ foreach ($items as $key => $item) {
} }
// check site config for single page URL - fetch it if found // check site config for single page URL - fetch it if found
$is_single_page = false; $is_single_page = false;
if ($options->singlepage && ($single_page_response = getSinglePage($item, $html, $effective_url))) { if ($options->singlepage && ($single_page_response = get_single_page($item, $html, $effective_url))) {
$is_single_page = true; $is_single_page = true;
$effective_url = $single_page_response['effective_url']; $effective_url = $single_page_response['effective_url'];
// check if action defined for returned Content-Type // check if action defined for returned Content-Type
@ -765,6 +780,13 @@ foreach ($items as $key => $item) {
debug("Here's the full HTML after it's been parsed by Full-Text RSS:"); debug("Here's the full HTML after it's been parsed by Full-Text RSS:");
die($readability->dom->saveXML($readability->dom->documentElement)); die($readability->dom->saveXML($readability->dom->documentElement));
} }
// is this a native ad?
if ($extract_result && $extractor->isNativeAd()) {
debug("This article appears to be a native ad");
if (!$isDummyFeed && $options->remove_native_ads) {
continue; // skip this feed item entry
}
}
$content_block = ($extract_result) ? $extractor->getContent() : null; $content_block = ($extract_result) ? $extractor->getContent() : null;
$extracted_title = ($extract_result) ? $extractor->getTitle() : ''; $extracted_title = ($extract_result) ? $extractor->getTitle() : '';
// Deal with multi-page articles // Deal with multi-page articles
@ -779,7 +801,7 @@ foreach ($items as $key => $item) {
debug('--------'); debug('--------');
debug('Processing next page: '.$next_page_url); debug('Processing next page: '.$next_page_url);
// If we've got URL, resolve against $url // If we've got URL, resolve against $url
if ($next_page_url = makeAbsoluteStr($effective_url, $next_page_url)) { if ($next_page_url = make_absolute_str($effective_url, $next_page_url)) {
// check it's not what we have already! // check it's not what we have already!
if (!in_array($next_page_url, $multi_page_urls)) { if (!in_array($next_page_url, $multi_page_urls)) {
// it's not, so let's attempt to fetch it // it's not, so let's attempt to fetch it
@ -844,7 +866,12 @@ foreach ($items as $key => $item) {
$html .= $item->get_description(); $html .= $item->get_description();
} else { } else {
$readability->clean($content_block, 'select'); $readability->clean($content_block, 'select');
if ($options->rewrite_relative_urls) makeAbsolute($effective_url, $content_block); if ($options->rewrite_relative_urls) {
$base_url = get_base_url($readability->dom);
if (!$base_url) $base_url = $effective_url;
// rewrite URLs
make_absolute($base_url, $content_block);
}
// footnotes // footnotes
if (($links == 'footnotes') && (strpos($effective_url, 'wikipedia.org') === false)) { if (($links == 'footnotes') && (strpos($effective_url, 'wikipedia.org') === false)) {
$readability->addFootnotes($content_block); $readability->addFootnotes($content_block);
@ -987,12 +1014,16 @@ foreach ($items as $key => $item) {
// add effective URL (URL after redirects) // add effective URL (URL after redirects)
if (isset($effective_url)) { if (isset($effective_url)) {
//TODO: ensure $effective_url is valid witout - sometimes it causes problems, e.g. //TODO: ensure $effective_url is valid witout - sometimes it causes problems, e.g.
//http://www.siasat.pk/forum/showthread.php?108883-Pakistan-Chowk-by-Rana-Mubashir--25th-March-2012-Special-Program-from-Liari-(Karachi) //http://www.siasat.pk/forum/showthread.php?108883-Pakistan-Chowk-by-Rana-Mubashir--25th-March-2012-Special-Program-from-Liari-(Karachi)
//temporary measure: use utf8_encode() //temporary measure: use utf8_encode()
$newitem->addElement('dc:identifier', remove_url_cruft(utf8_encode($effective_url))); $newitem->addElement('dc:identifier', remove_url_cruft(utf8_encode($effective_url)));
} else { } else {
$newitem->addElement('dc:identifier', remove_url_cruft($item->get_permalink())); $newitem->addElement('dc:identifier', remove_url_cruft($item->get_permalink()));
} }
// is this a native ad?
if ($extractor->isNativeAd()) {
$newitem->addElement('dc:type', 'Native Ad');
}
// add categories // add categories
if ($categories = $item->get_categories()) { if ($categories = $item->get_categories()) {
@ -1075,7 +1106,7 @@ if (!$debug_mode) {
$output->generateFeed(); $output->generateFeed();
$output = ob_get_contents(); $output = ob_get_contents();
ob_end_clean(); ob_end_clean();
if ($html_only && $item_count == 0) { if ($accept === 'html' && $item_count == 0) {
// do not cache - in case of temporary server glitch at source URL // do not cache - in case of temporary server glitch at source URL
} else { } else {
$cache = get_cache(); $cache = get_cache();
@ -1092,10 +1123,77 @@ if (!$debug_mode) {
// HELPER FUNCTIONS // HELPER FUNCTIONS
/////////////////////////////// ///////////////////////////////
/**
 * Rebuild the URL of the current makefulltextfeed.php request.
 *
 * The result is assembled from the server variables (scheme, host, script
 * directory), the global $url being processed and the recognised query
 * string parameters found in $_GET, always in the same fixed order.
 *
 * When the supplied key is one of $options->api_keys, the key itself is not
 * echoed back: it is replaced by its index in that list together with a
 * sha1 hash of key + URL.
 *
 * @return string the reconstructed request URL
 */
function get_self_url() {
	global $options, $url;

	// scheme://host/dir - host and directory are HTML-escaped as one string
	$base = (is_ssl()) ? 'https://' : 'http://';
	$base .= htmlspecialchars($_SERVER['HTTP_HOST'].rtrim(dirname($_SERVER['SCRIPT_NAME']), '/\\'));

	// a leading http:// is dropped from the source URL before it is encoded
	$source = (strncasecmp($url, 'http://', 7) === 0) ? substr($url, 7) : $url;
	$self = $base.'/makefulltextfeed.php?url='.urlencode($source);

	// hide API key if we can
	if (isset($_GET['key'])) {
		$key_index = array_search($_GET['key'], $options->api_keys);
		if ($key_index !== false) {
			// known key: expose only its index plus a hash tied to this URL
			$self .= '&key='.$key_index;
			$self .= '&hash='.urlencode(sha1($_GET['key'].$url));
		} elseif (isset($_GET['hash'])) {
			// key/hash pair supplied by the caller: pass both through
			$self .= '&key='.urlencode($_GET['key']);
			$self .= '&hash='.urlencode($_GET['hash']);
		}
	}

	// Remaining parameters, in output order.
	//   value: appended as &name=<urlencoded value>
	//   int:   appended as &name=<integer cast of value>
	//   flag:  appended as bare &name (value ignored)
	$passthrough = array(
		'html' => 'value',
		'accept' => 'value',
		'max' => 'int',
		'links' => 'value',
		'exc' => 'value',
		'format' => 'value',
		'callback' => 'value',
		'l' => 'value',
		'lang' => 'value',
		'xss' => 'flag',
		'use_extracted_title' => 'flag',
		'content' => 'value',
		'summary' => 'value',
		'debug' => 'flag',
		'parser' => 'value',
		'proxy' => 'value',
		'siteconfig' => 'value',
	);
	foreach ($passthrough as $name => $mode) {
		if (!isset($_GET[$name])) {
			continue;
		}
		if ($mode === 'flag') {
			$self .= '&'.$name;
		} elseif ($mode === 'int') {
			$self .= '&'.$name.'='.(int)$_GET[$name];
		} else {
			$self .= '&'.$name.'='.urlencode($_GET[$name]);
		}
	}
	return $self;
}
/**
 * Sanitise a URL and check that it is a syntactically valid http(s) URL.
 *
 * @param string $url URL to check
 * @return string|false the sanitised URL, or false if it is not a valid
 *                      http:// or https:// URL
 */
function validate_url($url) {
	// strip characters that are not permitted in URLs
	$url = filter_var($url, FILTER_SANITIZE_URL);
	// FILTER_VALIDATE_URL has required a scheme by default since PHP 5.2.1.
	// The explicit FILTER_FLAG_SCHEME_REQUIRED flag was deprecated in PHP 7.3
	// and removed in PHP 8.0, where referencing it is a fatal error, so it is
	// deliberately not passed here.
	$test = filter_var($url, FILTER_VALIDATE_URL);
	// deal with bug http://bugs.php.net/51192 (present in PHP 5.2.13 and PHP 5.3.2)
	// where hyphens cause valid URLs to be rejected: retry with underscores
	if ($test === false) {
		$test = filter_var(strtr($url, '-', '_'), FILTER_VALIDATE_URL);
	}
	// only http and https URLs are accepted
	if ($test !== false && $test !== null && preg_match('!^https?://!i', $url)) {
		return $url;
	}
	return false;
}
/**
 * Read the href attribute of the <base> element inside <head>.
 *
 * @param DOMDocument $dom parsed document to inspect
 * @return string the href value as returned by the XPath string() function
 *                (an empty string when no matching attribute exists)
 */
function get_base_url($dom) {
	$query = 'string(//head/base/@href)';
	$finder = new DOMXPath($dom);
	// warnings raised while evaluating the expression are suppressed
	$href = @$finder->evaluate($query, $dom);
	return $href;
}
/**
 * Report whether the current request is considered to be using HTTPS.
 *
 * True when $_SERVER['HTTPS'] is set to anything other than '' or 'off',
 * or when the X-Forwarded-Proto request header equals 'https'.
 *
 * @return bool
 */
function is_ssl() {
	// HTTPS flag set by the web server itself
	$direct = isset($_SERVER['HTTPS'])
		&& !($_SERVER['HTTPS'] == '' || $_SERVER['HTTPS'] == 'off');
	if ($direct) {
		return true;
	}
	// otherwise rely on the forwarded protocol header
	return isset($_SERVER['HTTP_X_FORWARDED_PROTO'])
		&& $_SERVER['HTTP_X_FORWARDED_PROTO'] == 'https';
}
// Adapted from WordPress // Adapted from WordPress
// http://core.trac.wordpress.org/browser/tags/3.5.1/wp-includes/formatting.php#L2173 // http://core.trac.wordpress.org/browser/tags/3.5.1/wp-includes/formatting.php#L2173
function get_excerpt($text, $num_words=55, $more=null) { function get_excerpt($text, $num_words=55, $more=null) {
if (null === $more) $more = '&hellip;'; if (null === $more) $more = '';
$text = strip_tags($text); $text = strip_tags($text);
//TODO: Check if word count is based on single characters (East Asian characters) //TODO: Check if word count is based on single characters (East Asian characters)
/* /*
@ -1183,9 +1281,10 @@ function convert_to_utf8($html, $header=null) {
} }
} }
} }
if (isset($encoding)) $encoding = trim($encoding); if (isset($encoding)) $encoding = strtolower(trim($encoding));
// trim is important here! // fix bad encoding values
if (!$encoding || (strtolower($encoding) == 'iso-8859-1')) { if ($encoding === 'iso-8850-1') $encoding = 'iso-8859-1';
if (!$encoding || ($encoding === 'iso-8859-1')) {
// replace MS Word smart qutoes // replace MS Word smart qutoes
$trans = array(); $trans = array();
$trans[chr(130)] = '&sbquo;'; // Single Low-9 Quotation Mark $trans[chr(130)] = '&sbquo;'; // Single Low-9 Quotation Mark
@ -1219,7 +1318,7 @@ function convert_to_utf8($html, $header=null) {
$encoding = 'utf-8'; $encoding = 'utf-8';
} else { } else {
debug('Character encoding: '.$encoding); debug('Character encoding: '.$encoding);
if (strtolower($encoding) != 'utf-8') { if ($encoding !== 'utf-8') {
debug('Converting to UTF-8'); debug('Converting to UTF-8');
$html = SimplePie_Misc::change_encoding($html, $encoding, 'utf-8'); $html = SimplePie_Misc::change_encoding($html, $encoding, 'utf-8');
} }
@ -1228,7 +1327,7 @@ function convert_to_utf8($html, $header=null) {
return $html; return $html;
} }
function makeAbsolute($base, $elem) { function make_absolute($base, $elem) {
$base = new SimplePie_IRI($base); $base = new SimplePie_IRI($base);
// remove '//' in URL path (used to prevent URLs from resolving properly) // remove '//' in URL path (used to prevent URLs from resolving properly)
// TODO: check if this is still the case // TODO: check if this is still the case
@ -1238,12 +1337,12 @@ function makeAbsolute($base, $elem) {
for ($i = $elems->length-1; $i >= 0; $i--) { for ($i = $elems->length-1; $i >= 0; $i--) {
$e = $elems->item($i); $e = $elems->item($i);
//$e->parentNode->replaceChild($articleContent->ownerDocument->createTextNode($e->textContent), $e); //$e->parentNode->replaceChild($articleContent->ownerDocument->createTextNode($e->textContent), $e);
makeAbsoluteAttr($base, $e, $attr); make_absolute_attr($base, $e, $attr);
} }
if (strtolower($elem->tagName) == $tag) makeAbsoluteAttr($base, $elem, $attr); if (strtolower($elem->tagName) == $tag) make_absolute_attr($base, $elem, $attr);
} }
} }
function makeAbsoluteAttr($base, $e, $attr) { function make_absolute_attr($base, $e, $attr) {
if ($e->hasAttribute($attr)) { if ($e->hasAttribute($attr)) {
// Trim leading and trailing white space. I don't really like this but // Trim leading and trailing white space. I don't really like this but
// unfortunately it does appear on some sites. e.g. <img src=" /path/to/image.jpg" /> // unfortunately it does appear on some sites. e.g. <img src=" /path/to/image.jpg" />
@ -1256,7 +1355,7 @@ function makeAbsoluteAttr($base, $e, $attr) {
} }
} }
} }
function makeAbsoluteStr($base, $url) { function make_absolute_str($base, $url) {
$base = new SimplePie_IRI($base); $base = new SimplePie_IRI($base);
// remove '//' in URL path (causes URLs not to resolve properly) // remove '//' in URL path (causes URLs not to resolve properly)
if (isset($base->path)) $base->path = preg_replace('!//+!', '/', $base->path); if (isset($base->path)) $base->path = preg_replace('!//+!', '/', $base->path);
@ -1271,7 +1370,7 @@ function makeAbsoluteStr($base, $url) {
} }
} }
// returns single page response, or false if not found // returns single page response, or false if not found
function getSinglePage($item, $html, $url) { function get_single_page($item, $html, $url) {
global $http, $extractor; global $http, $extractor;
debug('Looking for site config files to see if single page link exists'); debug('Looking for site config files to see if single page link exists');
$site_config = $extractor->buildSiteConfig($url, $html); $site_config = $extractor->buildSiteConfig($url, $html);
@ -1308,7 +1407,7 @@ function getSinglePage($item, $html, $url) {
} }
} }
// If we've got URL, resolve against $url // If we've got URL, resolve against $url
if (isset($single_page_url) && ($single_page_url = makeAbsoluteStr($url, $single_page_url))) { if (isset($single_page_url) && ($single_page_url = make_absolute_str($url, $single_page_url))) {
// check it's not what we have already! // check it's not what we have already!
if ($single_page_url != $url) { if ($single_page_url != $url) {
// it's not, so let's try to fetch it... // it's not, so let's try to fetch it...

View File

@ -1,16 +0,0 @@
# This file is only used when deploying Full-Text RSS to AppFog.
# See http://help.fivefilters.org/customer/portal/articles/1143210-hosting
---
applications:
.:
# name: full-text-rss
framework:
name: php
info:
mem: 512M
description: PHP Application
exec:
infra: aws
# url: ${name}.${target-base}
mem: 512M
instances: 1