Full-Text RSS 3.5

This commit is contained in:
FiveFilters.org 2017-02-18 16:06:19 +01:00
parent daedf214fe
commit bfed79edc7
25 changed files with 627 additions and 166 deletions

View File

@ -1,9 +1,9 @@
<?php <?php
// Update site config files for Full-Text RSS // Update site config files for Full-Text RSS
// Author: Keyvan Minoukadeh // Author: Keyvan Minoukadeh
// Copyright (c) 2014 Keyvan Minoukadeh // Copyright (c) 2015 Keyvan Minoukadeh
// License: AGPLv3 // License: AGPLv3
// Date: 2014-08-19 // Date: 2015-06-10
// More info: http://fivefilters.org/content-only/ // More info: http://fivefilters.org/content-only/
// Help: http://help.fivefilters.org // Help: http://help.fivefilters.org
@ -245,7 +245,7 @@ function println($txt) {
} }
function rrmdir($dir) { function rrmdir($dir) {
foreach(glob($dir . '/{*.txt,*.php,.*.txt,.*.php,.gitattributes,.gitignore,ftr-site-config-master,README.md}', GLOB_BRACE|GLOB_NOSORT) as $file) { foreach(glob($dir . '/{*.txt,*.php,*.com,.*.txt,.*.php,.*.com,.gitattributes,.gitignore,ftr-site-config-master,README.md}', GLOB_BRACE|GLOB_NOSORT) as $file) {
if(is_dir($file)) { if(is_dir($file)) {
rrmdir($file); rrmdir($file);
} else { } else {

View File

@ -2,6 +2,19 @@ FiveFilters.org: Full-Text RSS
http://fivefilters.org/content-only/ http://fivefilters.org/content-only/
CHANGELOG CHANGELOG
------------------------------------ ------------------------------------
3.5 (2015-06-13)
- Open Graph properties og:title, og:type, og:url, og:image, and og:description now returned if found in the page being processed
- Bug fix: certain XPath expressions weren't being evaluated correctly when HTML5 parsing was enabled
- Cookie handling now only on redirects - fixes issue with certain sites (thanks to Dave Vasilevsky)
- Compatibility test will no longer show HHVM as incompatible - Full-Text RSS worked with HHVM 3.7.1 in our tests (but without Tidy support and no automatic site config updates)
- Humble HTTP Agent updated to support version 2 of PHP's HTTP extension
- HTML5-PHP library updated
- Site config files can now include HTTP headers (user-agent, cookie, referer), e.g. http_header(user-agent): PHP/5.6
- Config option removed: $options->user_agents - use site config files.
- Site config files which use single_page_link can now follow it with if_page_contains: XPath to make it conditional.
- Minimum supported PHP version is now 5.3. If you must use PHP 5.2, please download Full-Text RSS 3.4
- Site config files updated for better extraction
- Other minor fixes/improvements
3.4.1 (unreleased) 3.4.1 (unreleased)
- Backporting Dave Vasilevsky cookie patch. Fixes issues with certain sites. See https://gist.github.com/fivefilters/0a758b6d64ce4fb5728c - Backporting Dave Vasilevsky cookie patch. Fixes issues with certain sites. See https://gist.github.com/fivefilters/0a758b6d64ce4fb5728c

View File

@ -430,22 +430,6 @@ $options->fingerprints = array(
'<meta name="generator" content="WordPress' => array('hostname'=>'fingerprint.wordpress.com', 'head'=>true) '<meta name="generator" content="WordPress' => array('hostname'=>'fingerprint.wordpress.com', 'head'=>true)
); );
// User Agent strings - mapping domain names
// ----------------------
// e.g. $options->user_agents = array('example.org' => 'PHP/5.2');
$options->user_agents = array( 'lifehacker.com' => 'PHP/5.2',
'gawker.com' => 'PHP/5.2',
'deadspin.com' => 'PHP/5.2',
'kotaku.com' => 'PHP/5.2',
'jezebel.com' => 'PHP/5.2',
'io9.com' => 'PHP/5.2',
'jalopnik.com' => 'PHP/5.2',
'gizmodo.com' => 'PHP/5.2',
'.wikipedia.org' => 'Mozilla/5.2',
'.fok.nl' => 'Googlebot/2.1',
'getpocket.com' => 'PHP/5.2'
);
// URL Rewriting // URL Rewriting
// ---------------------- // ----------------------
// Currently allows simple string replace of URLs. // Currently allows simple string replace of URLs.
@ -500,7 +484,7 @@ $options->cache_cleanup = 100;
/// DO NOT CHANGE ANYTHING BELOW THIS /////////// /// DO NOT CHANGE ANYTHING BELOW THIS ///////////
///////////////////////////////////////////////// /////////////////////////////////////////////////
if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '3.4'); if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '3.5');
if (basename(__FILE__) == 'config.php') { if (basename(__FILE__) == 'config.php') {
if (file_exists(dirname(__FILE__).'/custom_config.php')) { if (file_exists(dirname(__FILE__).'/custom_config.php')) {

View File

@ -16,17 +16,19 @@ SimplePie.org. We have kept most of their checks intact as we use SimplePie in o
http://github.com/simplepie/simplepie/tree/master/compatibility_test/ http://github.com/simplepie/simplepie/tree/master/compatibility_test/
*/ */
$app_name = 'Full-Text RSS 3.3'; $app_name = 'Full-Text RSS 3.5';
// Full-Text RSS is not yet compatible with HHVM, that's why we check for it with HHVM_VERSION. // Full-Text RSS is not yet compatible with HHVM, that's why we check for it with HHVM_VERSION.
$php_ok = (function_exists('version_compare') && version_compare(phpversion(), '5.2.0', '>=') && !defined('HHVM_VERSION')); //$php_ok = (function_exists('version_compare') && version_compare(phpversion(), '5.2.0', '>=') && !defined('HHVM_VERSION'));
// HHVM works okay, but no Tidy and autoupdate of site config files not working (tested 3.7.1)
$php_ok = (function_exists('version_compare') && version_compare(phpversion(), '5.3.0', '>='));
$pcre_ok = extension_loaded('pcre'); $pcre_ok = extension_loaded('pcre');
$zlib_ok = extension_loaded('zlib'); $zlib_ok = extension_loaded('zlib');
$mbstring_ok = extension_loaded('mbstring'); $mbstring_ok = extension_loaded('mbstring');
$iconv_ok = extension_loaded('iconv'); $iconv_ok = extension_loaded('iconv');
$tidy_ok = function_exists('tidy_parse_string'); $tidy_ok = function_exists('tidy_parse_string');
$curl_ok = function_exists('curl_exec'); $curl_ok = function_exists('curl_exec');
$parallel_ok = ((extension_loaded('http') && class_exists('HttpRequestPool')) || ($curl_ok && function_exists('curl_multi_init'))); $parallel_ok = ((extension_loaded('http') && class_exists('http\Client\Request')) || ($curl_ok && function_exists('curl_multi_init')));
$allow_url_fopen_ok = (bool)ini_get('allow_url_fopen'); $allow_url_fopen_ok = (bool)ini_get('allow_url_fopen');
$filter_ok = extension_loaded('filter'); $filter_ok = extension_loaded('filter');
@ -201,7 +203,7 @@ div.chunk {
<tbody> <tbody>
<tr class="<?php echo ($php_ok) ? 'enabled' : 'disabled'; ?>"> <tr class="<?php echo ($php_ok) ? 'enabled' : 'disabled'; ?>">
<td>PHP</td> <td>PHP</td>
<td>5.2.0 or higher</td> <td>5.3 or higher</td>
<td><?php echo phpversion(); ?></td> <td><?php echo phpversion(); ?></td>
</tr> </tr>
<tr class="<?php echo ($xml_ok) ? 'enabled, and sane' : 'disabled, or broken'; ?>"> <tr class="<?php echo ($xml_ok) ? 'enabled, and sane' : 'disabled, or broken'; ?>">
@ -306,9 +308,9 @@ div.chunk {
<?php endif; ?> <?php endif; ?>
<?php if ($parallel_ok): ?> <?php if ($parallel_ok): ?>
<li><strong>Parallel URL fetching:</strong> You have <code>HttpRequestPool</code> or <code>curl_multi</code> support installed. No problems here.</li> <li><strong>Parallel URL fetching:</strong> You have PHP's HTTP extension or <code>curl_multi</code> installed. No problems here.</li>
<?php else: ?> <?php else: ?>
<li class="highlight"><strong>Parallel URL fetching:</strong> <code>HttpRequestPool</code> or <code>curl_multi</code> support is not available. <?php echo $app_name; ?> will use <code>file_get_contents()</code> instead to fetch URLs sequentially rather than in parallel.</li> <li class="highlight"><strong>Parallel URL fetching:</strong> HTTP extension or <code>curl_multi</code> support is not available. <?php echo $app_name; ?> will use <code>file_get_contents()</code> instead to fetch URLs sequentially rather than in parallel.</li>
<?php endif; ?> <?php endif; ?>
<?php else: ?> <?php else: ?>
@ -352,11 +354,11 @@ div.chunk {
<div class="chunk"> <div class="chunk">
<h3>Further info</h3> <h3>Further info</h3>
<h4>HTTP module</h4> <h4>HTTP module</h4>
<p>Full-Text RSS can make use of <code>HttpRequestPool</code> or <code>curl_multi</code> to make parallel HTTP requests when processing feeds. If neither are available, it will make sequential requests using <code>file_get_contents</code>.</p> <p>Full-Text RSS can make use of PHP's HTTP extension or <code>curl_multi</code> to make parallel HTTP requests when processing feeds. If neither are available, it will make sequential requests using <code>file_get_contents</code>.</p>
<?php <?php
$http_type = 'file_get_contents'; $http_type = 'file_get_contents';
if (extension_loaded('http') && class_exists('HttpRequestPool')) { if (extension_loaded('http') && class_exists('http\Client\Request')) {
$http_type = 'HttpRequestPool'; $http_type = 'HTTP extension';
} elseif ($curl_ok && function_exists('curl_multi_init')) { } elseif ($curl_ok && function_exists('curl_multi_init')) {
$http_type = 'curl_multi'; $http_type = 'curl_multi';
} }

View File

@ -582,8 +582,8 @@ if (!defined('_FF_FTR_INDEX')) {
<h3>System Requirements</h3> <h3>System Requirements</h3>
<p>PHP 5.2 or above is required. A simple shared web hosting account will work fine. <p>PHP 5.3 or above is required. A simple shared web hosting account should work fine, but we recommend a <a href="http://help.fivefilters.org/customer/portal/articles/1143210-hosting">VPS with 1GB RAM</a>.
The code has been tested on Windows and Linux using the Apache web server. If you're a Windows user, you can try it on your own machine using <a href="http://www.wampserver.com/en/index.php">WampServer</a>. It has also been reported as working under IIS, but we have not tested this ourselves.</p> The code has been tested on Windows and Linux using the Apache web server. If you're a Windows user, you can try it on your own machine using <a href="http://www.uniformserver.com/">Uniform Server</a>. It has also been reported as working under IIS, but we have not tested this ourselves.</p>
<h3 id="download">Download</h3> <h3 id="download">Download</h3>
<p>Download from <a href="http://fivefilters.org/content-only/#download">fivefilters.org</a> &mdash; old versions are available in our <a href="http://code.fivefilters.org">code repository</a>.</p> <p>Download from <a href="http://fivefilters.org/content-only/#download">fivefilters.org</a> &mdash; old versions are available in our <a href="http://code.fivefilters.org">code repository</a>.</p>

View File

@ -15,12 +15,12 @@
class ContentExtractor class ContentExtractor
{ {
protected static $tidy_config = array( protected static $tidy_config = array(
'clean' => true, 'clean' => false, // can't preserve wbr tabs if this is set to true
'output-xhtml' => true, 'output-xhtml' => true,
'logical-emphasis' => true, 'logical-emphasis' => true,
'show-body-only' => false, 'show-body-only' => false,
'new-blocklevel-tags' => 'article, aside, footer, header, hgroup, menu, nav, section, details, datagrid', 'new-blocklevel-tags' => 'article aside footer header hgroup menu nav section details datagrid',
'new-inline-tags' => 'mark, time, meter, progress, data', 'new-inline-tags' => 'mark time meter progress data wbr',
'wrap' => 0, 'wrap' => 0,
'drop-empty-paras' => true, 'drop-empty-paras' => true,
'drop-proprietary-attributes' => false, 'drop-proprietary-attributes' => false,
@ -42,6 +42,7 @@ class ContentExtractor
protected $body; protected $body;
protected $success = false; protected $success = false;
protected $nextPageUrl; protected $nextPageUrl;
protected $opengraph = array();
public $allowedParsers = array('libxml', 'html5php'); public $allowedParsers = array('libxml', 'html5php');
public $defaultParser = 'libxml'; public $defaultParser = 'libxml';
public $parserOverride = null; public $parserOverride = null;
@ -79,6 +80,7 @@ class ContentExtractor
$this->date = null; $this->date = null;
$this->nextPageUrl = null; $this->nextPageUrl = null;
$this->success = false; $this->success = false;
$this->opengraph = array();
} }
public function findHostUsingFingerprints($html) { public function findHostUsingFingerprints($html) {
@ -109,8 +111,11 @@ class ContentExtractor
if (substr($host, 0, 4) == 'www.') $host = substr($host, 4); if (substr($host, 0, 4) == 'www.') $host = substr($host, 4);
// is merged version already cached? // is merged version already cached?
if (SiteConfig::is_cached("$host.merged")) { if (SiteConfig::is_cached("$host.merged")) {
$config = SiteConfig::build("$host.merged");
if ($config) {
$this->debug("Returning cached and merged site config for $host"); $this->debug("Returning cached and merged site config for $host");
return SiteConfig::build("$host.merged"); return $config;
}
} }
// let's build from site_config/custom/ and standard/ // let's build from site_config/custom/ and standard/
$config = SiteConfig::build($host); $config = SiteConfig::build($host);
@ -316,6 +321,24 @@ class ContentExtractor
} }
} }
// try to get Open Graph properties
$elems = @$xpath->query("//head//meta[@property='og:title' or @property='og:type' or @property='og:url' or @property='og:image' or @property='og:description']", $this->readability->dom);
// check for matches
if ($elems && $elems->length > 0) {
$this->debug('Extracting Open Graph elements');
foreach ($elems as $elem) {
if ($elem->hasAttribute('content')) {
$_prop = strtolower($elem->getAttribute('property'));
$_val = $elem->getAttribute('content');
// currently one of each is returned, so we keep the first one
if (!isset($this->opengraph[$_prop])) {
$this->opengraph[$_prop] = $_val;
}
}
}
unset($_prop, $_val);
}
// try to get date // try to get date
foreach ($this->config->date as $pattern) { foreach ($this->config->date as $pattern) {
$elems = @$xpath->evaluate($pattern, $this->readability->dom); $elems = @$xpath->evaluate($pattern, $this->readability->dom);
@ -398,6 +421,16 @@ class ContentExtractor
} }
} }
// strip empty a elements
$elems = $xpath->query("//a[not(./*) and normalize-space(.)='']", $this->readability->dom);
// check for matches
if ($elems && $elems->length > 0) {
$this->debug('Stripping '.$elems->length.' empty a elements');
for ($i=$elems->length-1; $i >= 0; $i--) {
$elems->item($i)->parentNode->removeChild($elems->item($i));
}
}
// try to get body // try to get body
foreach ($this->config->body as $pattern) { foreach ($this->config->body as $pattern) {
$elems = @$xpath->query($pattern, $this->readability->dom); $elems = @$xpath->query($pattern, $this->readability->dom);
@ -789,6 +822,10 @@ class ContentExtractor
return $this->body; return $this->body;
} }
/**
 * Return the Open Graph properties gathered during extraction.
 *
 * Keys are the lowercased property names (e.g. 'og:title') and values are
 * the corresponding content attribute. Only the first occurrence of each
 * property is kept. The array is empty when none were found.
 *
 * @return array
 */
public function getOpenGraph() {
return $this->opengraph;
}
public function isNativeAd() { public function isNativeAd() {
return $this->nativeAd; return $this->nativeAd;
} }

View File

@ -5,10 +5,10 @@
* Each instance of this class should hold extraction patterns and other directives * Each instance of this class should hold extraction patterns and other directives
* for a website. See ContentExtractor class to see how it's used. * for a website. See ContentExtractor class to see how it's used.
* *
* @version 0.8 * @version 1.0
* @date 2013-04-16 * @date 2015-06-09
* @author Keyvan Minoukadeh * @author Keyvan Minoukadeh
* @copyright 2013 Keyvan Minoukadeh * @copyright 2015 Keyvan Minoukadeh
* @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3 * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
*/ */
@ -38,8 +38,7 @@ class SiteConfig
// Mark article as a native ad if any of these expressions match (0 or more xpath expressions) // Mark article as a native ad if any of these expressions match (0 or more xpath expressions)
public $native_ad_clue = array(); public $native_ad_clue = array();
// Additional HTTP headers to send // Additional HTTP headers to send (associative array)
// NOT YET USED
public $http_header = array(); public $http_header = array();
// Process HTML with tidy before creating DOM (bool or null if undeclared) // Process HTML with tidy before creating DOM (bool or null if undeclared)
@ -67,6 +66,15 @@ class SiteConfig
// Test URL - if present, can be used to test the config above // Test URL - if present, can be used to test the config above
public $test_url = array(); public $test_url = array();
// Test URL contains - one or more snippets of text from the article body.
// Used to determine if the extraction rules for the site are still valid (ie. still extracting relevant content)
// Keys should be one or more of the test URLs supplied, and value an array of strings to look for.
public $test_contains = array();
// If page contains - XPath expression. Used to determine if the preceding rule gets evaluated or not.
// Currently only works with single_page_link.
public $if_page_contains = array();
// Single-page link - should identify a link element or URL pointing to the page holding the entire article // Single-page link - should identify a link element or URL pointing to the page holding the entire article
// This is useful for sites which split their articles across multiple pages. Links to such pages tend to // This is useful for sites which split their articles across multiple pages. Links to such pages tend to
// display the first page with links to the other pages at the bottom. Often there is also a link to a page // display the first page with links to the other pages at the bottom. Often there is also a link to a page
@ -185,11 +193,23 @@ class SiteConfig
public function append(SiteConfig $newconfig) { public function append(SiteConfig $newconfig) {
// check for commands where we accept multiple statements (no test_url) // check for commands where we accept multiple statements (no test_url)
foreach (array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'native_ad_clue', 'http_header') as $var) { foreach (array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'native_ad_clue') as $var) {
// append array elements for this config variable from $newconfig to this config // append array elements for this config variable from $newconfig to this config
//$this->$var = $this->$var + $newconfig->$var; //$this->$var = $this->$var + $newconfig->$var;
$this->$var = array_unique(array_merge($this->$var, $newconfig->$var)); $this->$var = array_unique(array_merge($this->$var, $newconfig->$var));
} }
// special handling of commands where key is important and config values being appended should not overwrite existing ones
foreach (array('http_header') as $var) {
$this->$var = array_merge($newconfig->$var, $this->$var);
}
// special handling of if_page_contains directive
foreach (array('single_page_link') as $var) {
if (isset($this->if_page_contains[$var]) && isset($newconfig->if_page_contains[$var])) {
$this->if_page_contains[$var] = array_merge($newconfig->if_page_contains[$var], $this->if_page_contains[$var]);
} elseif (isset($newconfig->if_page_contains[$var])) {
$this->if_page_contains[$var] = $newconfig->if_page_contains[$var];
}
}
// check for single statement commands // check for single statement commands
// we do not overwrite existing non null values // we do not overwrite existing non null values
foreach (array('tidy', 'prune', 'parser', 'autodetect_on_failure') as $var) { foreach (array('tidy', 'prune', 'parser', 'autodetect_on_failure') as $var) {
@ -213,6 +233,40 @@ class SiteConfig
return $key_suffix; return $key_suffix;
} }
// Add test_contains to last test_url
/**
 * File a text snippet under the most recently declared test URL.
 *
 * Nothing happens when no test URL has been recorded yet.
 *
 * @param mixed $test_contains snippet expected in the article body (cast to string)
 * @return void
 */
public function add_test_contains($test_contains) {
    // a snippet is meaningless without a test URL to attach it to
    if (empty($this->test_url)) {
        return;
    }
    $snippet = (string) $test_contains;
    // the last test_url value is the key the snippet is stored under
    $lastUrl = end($this->test_url);
    reset($this->test_url);
    if (!isset($this->test_contains[$lastUrl])) {
        $this->test_contains[$lastUrl] = array();
    }
    $this->test_contains[$lastUrl][] = $snippet;
}
// Add if_page_contains condition to the last single_page_link
// TODO: Expand so it can be used with other rules too
/**
 * Attach a condition to the most recently declared single_page_link rule.
 *
 * Nothing happens when no single_page_link has been recorded yet.
 *
 * @param mixed $if_page_contains condition expression (cast to string)
 * @return void
 */
public function add_if_page_contains_condition($if_page_contains) {
    // the condition always refers to a preceding single_page_link entry
    if (empty($this->single_page_link)) {
        return;
    }
    $condition = (string) $if_page_contains;
    // conditions are indexed by the value of the last single_page_link entry
    $lastLink = end($this->single_page_link);
    reset($this->single_page_link);
    $this->if_page_contains['single_page_link'][$lastLink] = $condition;
}
/**
 * Look up the condition stored for a given directive and directive value.
 *
 * @param string $directive_name  directive the condition belongs to (e.g. 'single_page_link')
 * @param string $directive_value value of that directive, used as the lookup key
 * @return mixed the stored condition, or null when none is registered
 */
public function get_if_page_contains_condition($directive_name, $directive_value) {
    // a multi-level isset() is false as soon as any level is missing
    $found = isset($this->if_page_contains[$directive_name][$directive_value]);
    return $found ? $this->if_page_contains[$directive_name][$directive_value] : null;
}
// returns SiteConfig instance if an appropriate one is found, false otherwise // returns SiteConfig instance if an appropriate one is found, false otherwise
// if $exact_host_match is true, we will not look for wildcard config matches // if $exact_host_match is true, we will not look for wildcard config matches
// by default if host is 'test.example.org' we will look for and load '.example.org.txt' if it exists // by default if host is 'test.example.org' we will look for and load '.example.org.txt' if it exists
@ -356,12 +410,20 @@ class SiteConfig
// check for single statement commands stored as strings // check for single statement commands stored as strings
} elseif (in_array($command, array('parser'))) { } elseif (in_array($command, array('parser'))) {
$config->$command = $val; $config->$command = $val;
// special treatment for test_contains
} elseif (in_array($command, array('test_contains'))) {
$config->add_test_contains($val);
// special treatment for if_page_contains
} elseif (in_array($command, array('if_page_contains'))) {
$config->add_if_page_contains_condition($val);
// check for replace_string(find): replace // check for replace_string(find): replace
} elseif ((substr($command, -1) == ')') && preg_match('!^([a-z0-9_]+)\((.*?)\)$!i', $command, $match)) { } elseif ((substr($command, -1) == ')') && preg_match('!^([a-z0-9_]+)\((.*?)\)$!i', $command, $match)) {
if (in_array($match[1], array('replace_string'))) { if (in_array($match[1], array('replace_string'))) {
$command = $match[1];
array_push($config->find_string, $match[2]); array_push($config->find_string, $match[2]);
array_push($config->$command, $val); array_push($config->replace_string, $val);
} elseif (in_array($match[1], array('http_header'))) {
$_header = strtolower(trim($match[2]));
$config->http_header[$_header] = $val;
} }
} }
} }

View File

@ -1,7 +1,7 @@
<?php <?php
define('RSS2', 1, true); define('RSS2', 1);
define('JSON', 2, true); define('JSON', 2);
define('JSONP', 3, true); define('JSONP', 3);
/** /**
* Universal Feed Writer class * Universal Feed Writer class
@ -131,6 +131,11 @@ define('JSONP', 3, true);
$simplejson->language = null; $simplejson->language = null;
$simplejson->url = null; $simplejson->url = null;
$simplejson->effective_url = null; $simplejson->effective_url = null;
$simplejson->og_url = null;
$simplejson->og_title = null;
$simplejson->og_description = null;
$simplejson->og_image = null;
$simplejson->og_type = null;
$simplejson->content = null; $simplejson->content = null;
// actual values // actual values
$simplejson->url = $jsonitem->link; $simplejson->url = $jsonitem->link;
@ -151,6 +156,11 @@ define('JSONP', 3, true);
if (isset($jsonitem->pubDate)) { if (isset($jsonitem->pubDate)) {
$simplejson->date = gmdate(DATE_ATOM, strtotime($jsonitem->pubDate)); $simplejson->date = gmdate(DATE_ATOM, strtotime($jsonitem->pubDate));
} }
if (isset($jsonitem->og_url)) $simplejson->og_url = $jsonitem->og_url;
if (isset($jsonitem->og_title)) $simplejson->og_title = $jsonitem->og_title;
if (isset($jsonitem->og_description)) $simplejson->og_description = $jsonitem->og_description;
if (isset($jsonitem->og_image)) $simplejson->og_image = $jsonitem->og_image;
if (isset($jsonitem->og_type)) $simplejson->og_type = $jsonitem->og_type;
echo json_encode($simplejson); echo json_encode($simplejson);
} }
} }
@ -327,7 +337,7 @@ define('JSONP', 3, true);
{ {
$out = '<?xml version="1.0" encoding="utf-8"?>'."\n"; $out = '<?xml version="1.0" encoding="utf-8"?>'."\n";
if ($this->xsl) $out .= '<?xml-stylesheet type="text/xsl" href="'.htmlspecialchars($this->xsl).'"?>' . PHP_EOL; if ($this->xsl) $out .= '<?xml-stylesheet type="text/xsl" href="'.htmlspecialchars($this->xsl).'"?>' . PHP_EOL;
$out .= '<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:media="http://search.yahoo.com/mrss/">' . PHP_EOL; $out .= '<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:media="http://search.yahoo.com/mrss/" xmlns:og="http://ogp.me/ns#">' . PHP_EOL;
echo $out; echo $out;
} }
elseif ($this->version == JSON || $this->version == JSONP) elseif ($this->version == JSON || $this->version == JSONP)
@ -370,7 +380,9 @@ define('JSONP', 3, true);
{ {
foreach ($attributes as $key => $value) foreach ($attributes as $key => $value)
{ {
$attrText .= " $key=\"".htmlspecialchars($value, ENT_COMPAT, 'UTF-8', false)."\" "; //$attrText .= " $key=\"".htmlspecialchars($value, ENT_COMPAT, 'UTF-8', false)."\" ";
// TODO: replace HTML entities not supported in XML with UTF8 equivalent characters
$attrText .= " $key=\"".htmlspecialchars($value, ENT_COMPAT, 'UTF-8')."\" ";
} }
} }
$nodeText .= "<{$tagName}{$attrText}>"; $nodeText .= "<{$tagName}{$attrText}>";
@ -384,7 +396,9 @@ define('JSONP', 3, true);
else else
{ {
//$nodeText .= (in_array($tagName, $this->CDATAEncoding))? $tagContent : htmlentities($tagContent); //$nodeText .= (in_array($tagName, $this->CDATAEncoding))? $tagContent : htmlentities($tagContent);
$nodeText .= htmlspecialchars($tagContent, ENT_COMPAT, 'UTF-8', false); //$nodeText .= htmlspecialchars($tagContent, ENT_COMPAT, 'UTF-8', false);
// TODO: replace HTML entities not supported in XML with UTF8 equivalent characters
$nodeText .= htmlspecialchars($tagContent, ENT_COMPAT, 'UTF-8');
} }
//$nodeText .= (in_array($tagName, $this->CDATAEncoding))? "]]></$tagName>" : "</$tagName>"; //$nodeText .= (in_array($tagName, $this->CDATAEncoding))? "]]></$tagName>" : "</$tagName>";
$nodeText .= "</$tagName>"; $nodeText .= "</$tagName>";

View File

@ -1,8 +1,8 @@
<?php <?php
/* /*
htmLawed 1.1.17, 11 March 2014 htmLawed 1.1.19, 19 January 2015
OOP code, 11 March 2014 OOP code, 19 January 2015
Copyright Santosh Patnaik Copyright Santosh Patnaik
Dual LGPL v3 and GPL v2+ license Dual LGPL v3 and GPL v2+ license
A PHP Labware internal utility; www.bioinformatics.org/phplabware/internal_utilities/htmLawed A PHP Labware internal utility; www.bioinformatics.org/phplabware/internal_utilities/htmLawed
@ -478,7 +478,7 @@ while(strlen($a)){
break; case 2: // Val break; case 2: // Val
if(preg_match('`^((?:"[^"]*")|(?:\'[^\']*\')|(?:\s*[^\s"\']+))(.*)`', $a, $m)){ if(preg_match('`^((?:"[^"]*")|(?:\'[^\']*\')|(?:\s*[^\s"\']+))(.*)`', $a, $m)){
$a = ltrim($m[2]); $m = $m[1]; $w = 1; $mode = 0; $a = ltrim($m[2]); $m = $m[1]; $w = 1; $mode = 0;
$aA[$nm] = trim(($m[0] == '"' or $m[0] == '\'') ? substr($m, 1, -1) : $m); $aA[$nm] = trim(str_replace('<', '&lt;', ($m[0] == '"' or $m[0] == '\'') ? substr($m, 1, -1) : $m));
} }
break; break;
} }
@ -507,7 +507,7 @@ foreach($aA as $k=>$v){
$v = preg_replace_callback('`(url(?:\()(?: )*(?:\'|"|&(?:quot|apos);)?)(.+?)((?:\'|"|&(?:quot|apos);)?(?: )*(?:\)))`iS', 'htmLawed::hl_prot', $v); $v = preg_replace_callback('`(url(?:\()(?: )*(?:\'|"|&(?:quot|apos);)?)(.+?)((?:\'|"|&(?:quot|apos);)?(?: )*(?:\)))`iS', 'htmLawed::hl_prot', $v);
$v = !$C['css_expression'] ? preg_replace('`expression`i', ' ', preg_replace('`\\\\\S|(/|(%2f))(\*|(%2a))`i', ' ', $v)) : $v; $v = !$C['css_expression'] ? preg_replace('`expression`i', ' ', preg_replace('`\\\\\S|(/|(%2f))(\*|(%2a))`i', ' ', $v)) : $v;
}elseif(isset($aNP[$k]) or strpos($k, 'src') !== false or $k[0] == 'o'){ }elseif(isset($aNP[$k]) or strpos($k, 'src') !== false or $k[0] == 'o'){
$v = str_replace("\xad", ' ', (strpos($v, '&') !== false ? str_replace(array('&#xad;', '&#173;', '&shy;'), ' ', $v) : $v)); $v = str_replace("­", ' ', (strpos($v, '&') !== false ? str_replace(array('&#xad;', '&#173;', '&shy;'), ' ', $v) : $v)); # double-quoted char is soft-hyphen; appears here as "­" or hyphen or something else depending on viewing software
$v = htmLawed::hl_prot($v, $k); $v = htmLawed::hl_prot($v, $k);
if($k == 'href'){ // X-spam if($k == 'href'){ // X-spam
if($C['anti_mail_spam'] && strpos($v, 'mailto:') === 0){ if($C['anti_mail_spam'] && strpos($v, 'mailto:') === 0){
@ -701,7 +701,7 @@ return str_replace(array("\x01", "\x02", "\x03", "\x04", "\x05", "\x07"), array(
public static function hl_version(){ public static function hl_version(){
// rel // rel
return '1.1.17'; return '1.1.19';
// eof // eof
} }

View File

@ -60,20 +60,22 @@ class HTML5
* The path to the file to parse. If this is a resource, it is * The path to the file to parse. If this is a resource, it is
* assumed to be an open stream whose pointer is set to the first * assumed to be an open stream whose pointer is set to the first
* byte of input. * byte of input.
* @param array $options
* Configuration options when parsing the HTML
* @return \DOMDocument A DOM document. This object type is defined by the libxml * @return \DOMDocument A DOM document. This object type is defined by the libxml
* library, and should have been included with your version of PHP. * library, and should have been included with your version of PHP.
*/ */
public function load($file) public function load($file, array $options = array())
{ {
// Handle the case where file is a resource. // Handle the case where file is a resource.
if (is_resource($file)) { if (is_resource($file)) {
// FIXME: We need a StreamInputStream class. // FIXME: We need a StreamInputStream class.
return $this->loadHTML(stream_get_contents($file)); return $this->loadHTML(stream_get_contents($file), $options);
} }
$input = new FileInputStream($file); $input = new FileInputStream($file);
return $this->parse($input); return $this->parse($input, $options);
} }
/** /**
@ -84,14 +86,16 @@ class HTML5
* *
* @param string $string * @param string $string
* A html5 document as a string. * A html5 document as a string.
* @param array $options
* Configuration options when parsing the HTML
* @return \DOMDocument A DOM document. DOM is part of libxml, which is included with * @return \DOMDocument A DOM document. DOM is part of libxml, which is included with
* almost all distributions of PHP. * almost all distributions of PHP.
*/ */
public function loadHTML($string) public function loadHTML($string, array $options = array())
{ {
$input = new StringInputStream($string); $input = new StringInputStream($string);
return $this->parse($input); return $this->parse($input, $options);
} }
/** /**
@ -104,13 +108,15 @@ class HTML5
* The path to the file to parse. If this is a resource, it is * The path to the file to parse. If this is a resource, it is
* assumed to be an open stream whose pointer is set to the first * assumed to be an open stream whose pointer is set to the first
* byte of input. * byte of input.
* @param array $options
* Configuration options when parsing the HTML
* *
* @return \DOMDocument A DOM document. This object type is defined by the libxml * @return \DOMDocument A DOM document. This object type is defined by the libxml
* library, and should have been included with your version of PHP. * library, and should have been included with your version of PHP.
*/ */
public function loadHTMLFile($file) public function loadHTMLFile($file, array $options = array())
{ {
return $this->load($file); return $this->load($file, $options);
} }
/** /**
@ -118,15 +124,17 @@ class HTML5
* *
* @param string $string * @param string $string
* The html5 fragment as a string. * The html5 fragment as a string.
* @param array $options
* Configuration options when parsing the HTML
* *
* @return \DOMDocumentFragment A DOM fragment. The DOM is part of libxml, which is included with * @return \DOMDocumentFragment A DOM fragment. The DOM is part of libxml, which is included with
* almost all distributions of PHP. * almost all distributions of PHP.
*/ */
public function loadHTMLFragment($string) public function loadHTMLFragment($string, array $options = array())
{ {
$input = new StringInputStream($string); $input = new StringInputStream($string);
return $this->parseFragment($input); return $this->parseFragment($input, $options);
} }
/** /**
@ -155,10 +163,10 @@ class HTML5
* Lower-level loading function. This requires an input stream instead * Lower-level loading function. This requires an input stream instead
* of a string, file, or resource. * of a string, file, or resource.
*/ */
public function parse(\Masterminds\HTML5\Parser\InputStream $input) public function parse(\Masterminds\HTML5\Parser\InputStream $input, array $options = array())
{ {
$this->errors = array(); $this->errors = array();
$events = new DOMTreeBuilder(false, $this->options); $events = new DOMTreeBuilder(false, array_merge($this->getOptions(), $options));
$scanner = new Scanner($input); $scanner = new Scanner($input);
$parser = new Tokenizer($scanner, $events); $parser = new Tokenizer($scanner, $events);
@ -174,9 +182,9 @@ class HTML5
* Lower-level loading function. This requires an input stream instead * Lower-level loading function. This requires an input stream instead
* of a string, file, or resource. * of a string, file, or resource.
*/ */
public function parseFragment(\Masterminds\HTML5\Parser\InputStream $input) public function parseFragment(\Masterminds\HTML5\Parser\InputStream $input, array $options = array())
{ {
$events = new DOMTreeBuilder(true, $this->options); $events = new DOMTreeBuilder(true, array_merge($this->getOptions(), $options));
$scanner = new Scanner($input); $scanner = new Scanner($input);
$parser = new Tokenizer($scanner, $events); $parser = new Tokenizer($scanner, $events);

View File

@ -66,6 +66,11 @@ class Elements
*/ */
const BLOCK_TAG = 64; const BLOCK_TAG = 64;
/**
* Indicates that the tag allows only inline elements as child nodes.
*/
const BLOCK_ONLY_INLINE = 128;
/** /**
* The HTML5 elements as defined in http://dev.w3.org/html5/markup/elements.html. * The HTML5 elements as defined in http://dev.w3.org/html5/markup/elements.html.
* *
@ -120,7 +125,7 @@ class Elements
"head" => 1, "head" => 1,
"header" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG "header" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG
"hgroup" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG "hgroup" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG
"hr" => 73, // NORMAL | VOID_TAG | BLOCK_TAG "hr" => 73, // NORMAL | VOID_TAG
"html" => 1, "html" => 1,
"i" => 1, "i" => 1,
"iframe" => 3, // NORMAL | TEXT_RAW "iframe" => 3, // NORMAL | TEXT_RAW
@ -145,7 +150,7 @@ class Elements
"optgroup" => 1, "optgroup" => 1,
"option" => 1, "option" => 1,
"output" => 65, // NORMAL | BLOCK_TAG "output" => 65, // NORMAL | BLOCK_TAG
"p" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG "p" => 209, // NORMAL | AUTOCLOSE_P | BLOCK_TAG | BLOCK_ONLY_INLINE
"param" => 9, // NORMAL | VOID_TAG "param" => 9, // NORMAL | VOID_TAG
"pre" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG "pre" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG
"progress" => 1, "progress" => 1,

View File

@ -38,6 +38,12 @@ class DOMTreeBuilder implements EventHandler
const NAMESPACE_XMLNS = 'http://www.w3.org/2000/xmlns/'; const NAMESPACE_XMLNS = 'http://www.w3.org/2000/xmlns/';
const OPT_DISABLE_HTML_NS = 'disable_html_ns';
const OPT_TARGET_DOC = 'target_document';
const OPT_IMPLICIT_NS = 'implicit_namespaces';
/** /**
* Holds the HTML5 element names that causes a namespace switch * Holds the HTML5 element names that causes a namespace switch
* *
@ -138,6 +144,12 @@ class DOMTreeBuilder implements EventHandler
protected $insertMode = 0; protected $insertMode = 0;
/**
* Track if we are in an element that allows only inline child nodes
* @var string|null
*/
protected $onlyInline;
/** /**
* Quirks mode is enabled by default. * Quirks mode is enabled by default.
* Any document that is missing the * Any document that is missing the
@ -151,6 +163,9 @@ class DOMTreeBuilder implements EventHandler
{ {
$this->options = $options; $this->options = $options;
if (isset($options[self::OPT_TARGET_DOC])) {
$this->doc = $options[self::OPT_TARGET_DOC];
} else {
$impl = new \DOMImplementation(); $impl = new \DOMImplementation();
// XXX: // XXX:
// Create the doctype. For now, we are always creating HTML5 // Create the doctype. For now, we are always creating HTML5
@ -158,6 +173,7 @@ class DOMTreeBuilder implements EventHandler
$dt = $impl->createDocumentType('html'); $dt = $impl->createDocumentType('html');
// $this->doc = \DOMImplementation::createDocument(NULL, 'html', $dt); // $this->doc = \DOMImplementation::createDocument(NULL, 'html', $dt);
$this->doc = $impl->createDocument(null, null, $dt); $this->doc = $impl->createDocument(null, null, $dt);
}
$this->errors = array(); $this->errors = array();
$this->current = $this->doc; // ->documentElement; $this->current = $this->doc; // ->documentElement;
@ -165,8 +181,15 @@ class DOMTreeBuilder implements EventHandler
// Create a rules engine for tags. // Create a rules engine for tags.
$this->rules = new TreeBuildingRules($this->doc); $this->rules = new TreeBuildingRules($this->doc);
$implicitNS = array();
if (isset($this->options[self::OPT_IMPLICIT_NS])) {
$implicitNS = $this->options[self::OPT_IMPLICIT_NS];
} elseif (isset($this->options["implicitNamespaces"])) {
$implicitNS = $this->options["implicitNamespaces"];
}
// Fill $nsStack with the defalut HTML5 namespaces, plus the "implicitNamespaces" array taken form $options // Fill $nsStack with the defalut HTML5 namespaces, plus the "implicitNamespaces" array taken form $options
array_unshift($this->nsStack, (isset($this->options["implicitNamespaces"]) ? $this->options["implicitNamespaces"] : array()) + array( array_unshift($this->nsStack, $implicitNS + array(
'' => self::NAMESPACE_HTML '' => self::NAMESPACE_HTML
) + $this->implicitNamespaces); ) + $this->implicitNamespaces);
@ -320,6 +343,11 @@ class DOMTreeBuilder implements EventHandler
} }
} }
if ($this->onlyInline && Elements::isA($lname, Elements::BLOCK_TAG)) {
$this->autoclose($this->onlyInline);
$this->onlyInline = null;
}
try { try {
$prefix = ($pos = strpos($lname, ':')) ? substr($lname, 0, $pos) : ''; $prefix = ($pos = strpos($lname, ':')) ? substr($lname, 0, $pos) : '';
@ -334,10 +362,10 @@ class DOMTreeBuilder implements EventHandler
$ele = $this->doc->importNode($frag->documentElement, true); $ele = $this->doc->importNode($frag->documentElement, true);
} else { } else {
if (isset($this->nsStack[0][$prefix])) { if (!isset($this->nsStack[0][$prefix]) || ($prefix === "" && isset($this->options[self::OPT_DISABLE_HTML_NS]) && $this->options[self::OPT_DISABLE_HTML_NS])) {
$ele = $this->doc->createElementNS($this->nsStack[0][$prefix], $lname);
} else {
$ele = $this->doc->createElement($lname); $ele = $this->doc->createElement($lname);
} else {
$ele = $this->doc->createElementNS($this->nsStack[0][$prefix], $lname);
} }
} }
@ -346,6 +374,10 @@ class DOMTreeBuilder implements EventHandler
$ele = $this->doc->createElement('invalid'); $ele = $this->doc->createElement('invalid');
} }
if (Elements::isA($lname, Elements::BLOCK_ONLY_INLINE)) {
$this->onlyInline = $lname;
}
// When we add some namespacess, we have to track them. Later, when "endElement" is invoked, we have to remove them. // When we add some namespacess, we have to track them. Later, when "endElement" is invoked, we have to remove them.
// When we are on a void tag, we do not need to care about namesapce nesting. // When we are on a void tag, we do not need to care about namesapce nesting.
if ($pushes > 0 && !Elements::isA($name, Elements::VOID_TAG)) { if ($pushes > 0 && !Elements::isA($name, Elements::VOID_TAG)) {
@ -394,7 +426,7 @@ class DOMTreeBuilder implements EventHandler
} }
// Some elements have special processing rules. Handle those separately. // Some elements have special processing rules. Handle those separately.
if ($this->rules->hasRules($name)) { if ($this->rules->hasRules($name) && $this->frag !== $this->current) {
$this->current = $this->rules->evaluate($ele, $this->current); $this->current = $this->rules->evaluate($ele, $this->current);
} // Otherwise, it's a standard element. } // Otherwise, it's a standard element.
else { else {

View File

@ -11,9 +11,9 @@ class Scanner
const CHARS_HEX = 'abcdefABCDEF01234567890'; const CHARS_HEX = 'abcdefABCDEF01234567890';
const CHARS_ALNUM = 'abcdefAghijklmnopqrstuvwxyABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890'; const CHARS_ALNUM = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890';
const CHARS_ALPHA = 'abcdefAghijklmnopqrstuvwxyABCDEFGHIJKLMNOPQRSTUVWXYZ'; const CHARS_ALPHA = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ';
protected $is; protected $is;

View File

@ -200,10 +200,12 @@ class Tokenizer
if (is_null($this->untilTag)) { if (is_null($this->untilTag)) {
return $this->text(); return $this->text();
} }
$sequence = '</' . $this->untilTag . '>'; $sequence = '</' . $this->untilTag;
$txt = ''; $txt = '';
$tok = $this->scanner->current(); $tok = $this->scanner->current();
while ($tok !== false && ! ($tok == '<' && ($this->sequenceMatches($sequence) || $this->sequenceMatches(strtoupper($sequence))))) {
$caseSensitive = !Elements::isHtml5Element($this->untilTag);
while ($tok !== false && ! ($tok == '<' && ($this->sequenceMatches($sequence, $caseSensitive)))) {
if ($tok == '&') { if ($tok == '&') {
$txt .= $this->decodeCharacterReference(); $txt .= $this->decodeCharacterReference();
$tok = $this->scanner->current(); $tok = $this->scanner->current();
@ -212,6 +214,13 @@ class Tokenizer
$tok = $this->scanner->next(); $tok = $this->scanner->next();
} }
} }
$len = strlen($sequence);
$this->scanner->consume($len);
$len += strlen($this->scanner->whitespace());
if ($this->scanner->current() !== '>') {
$this->parseError("Unclosed RCDATA end tag");
}
$this->scanner->unconsume($len);
$this->events->text($txt); $this->events->text($txt);
$this->setTextMode(0); $this->setTextMode(0);
return $this->endTag(); return $this->endTag();
@ -353,7 +362,7 @@ class Tokenizer
} }
// We know this is at least one char. // We know this is at least one char.
$name = strtolower($this->scanner->charsWhile(":0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz")); $name = strtolower($this->scanner->charsWhile(":_-0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"));
$attributes = array(); $attributes = array();
$selfClose = false; $selfClose = false;
@ -891,7 +900,7 @@ class Tokenizer
$buffer .= $this->scanner->charsUntil($first); $buffer .= $this->scanner->charsUntil($first);
// Stop as soon as we hit the stopping condition. // Stop as soon as we hit the stopping condition.
if ($this->sequenceMatches($sequence) || $this->sequenceMatches(strtoupper($sequence))) { if ($this->sequenceMatches($sequence, false)) {
return $buffer; return $buffer;
} }
$buffer .= $this->scanner->current(); $buffer .= $this->scanner->current();
@ -916,7 +925,7 @@ class Tokenizer
* see if the input stream is at the start of a * see if the input stream is at the start of a
* '</script>' string. * '</script>' string.
*/ */
protected function sequenceMatches($sequence) protected function sequenceMatches($sequence, $caseSensitive = true)
{ {
$len = strlen($sequence); $len = strlen($sequence);
$buffer = ''; $buffer = '';
@ -932,7 +941,7 @@ class Tokenizer
} }
$this->scanner->unconsume($len); $this->scanner->unconsume($len);
return $buffer == $sequence; return $caseSensitive ? $buffer == $sequence : strcasecmp($buffer, $sequence) === 0;
} }
/** /**
@ -1056,8 +1065,14 @@ class Tokenizer
// [a-zA-Z0-9]+; // [a-zA-Z0-9]+;
$cname = $this->scanner->getAsciiAlpha(); $cname = $this->scanner->getAsciiAlpha();
$entity = CharacterReference::lookupName($cname); $entity = CharacterReference::lookupName($cname);
// When no entity is found provide the name of the unmatched string
// and continue on as the & is not part of an entity. The & will
// be converted to &amp; elsewhere.
if ($entity == null) { if ($entity == null) {
$this->parseError("No match in entity table for '%s'", $entity); $this->parseError("No match in entity table for '%s'", $cname);
$this->scanner->unconsume($this->scanner->position() - $start);
return '&';
} }
} }

View File

@ -115,9 +115,11 @@ class OutputRules implements \Masterminds\HTML5\Serializer\RulesInterface
public function document($dom) public function document($dom)
{ {
$this->doctype(); $this->doctype();
if ($dom->documentElement) {
$this->traverser->node($dom->documentElement); $this->traverser->node($dom->documentElement);
$this->nl(); $this->nl();
} }
}
protected function doctype() protected function doctype()
{ {

View File

@ -112,7 +112,7 @@ class Traverser
break; break;
// Currently we don't support embedding DTDs. // Currently we don't support embedding DTDs.
default: default:
print '<!-- Skipped -->'; //print '<!-- Skipped -->';
break; break;
} }
} }

View File

@ -2,8 +2,9 @@
Copyright (c) 2013 The Authors of HTML5-PHP Copyright (c) 2013 The Authors of HTML5-PHP
Matt Butcher - technosophos@gmail.com Matt Butcher - mattbutcher@google.com
Matt Farina - matt@mattfarina.com Matt Farina - matt@mattfarina.com
Asmir Mustafic - goetas@gmail.com
Permission is hereby granted, free of charge, to any person obtaining a copy of Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in this software and associated documentation files (the "Software"), to deal in

View File

@ -10,6 +10,7 @@ But after some initial refactoring work, we began a new parser.
- Event-based (SAX-like) parser - Event-based (SAX-like) parser
- DOM tree builder - DOM tree builder
- Interoperability with QueryPath [[in progress](https://github.com/technosophos/querypath/issues/114)] - Interoperability with QueryPath [[in progress](https://github.com/technosophos/querypath/issues/114)]
- Runs on **PHP** 5.3.0 or newer and **HHVM** 3.2 or newer
[![Build Status](https://travis-ci.org/Masterminds/html5-php.png?branch=master)](https://travis-ci.org/Masterminds/html5-php) [![Latest Stable Version](https://poser.pugx.org/masterminds/html5/v/stable.png)](https://packagist.org/packages/masterminds/html5) [![Coverage Status](https://coveralls.io/repos/Masterminds/html5-php/badge.png?branch=master)](https://coveralls.io/r/Masterminds/html5-php?branch=master) [![Build Status](https://travis-ci.org/Masterminds/html5-php.png?branch=master)](https://travis-ci.org/Masterminds/html5-php) [![Latest Stable Version](https://poser.pugx.org/masterminds/html5/v/stable.png)](https://packagist.org/packages/masterminds/html5) [![Coverage Status](https://coveralls.io/repos/Masterminds/html5-php/badge.png?branch=master)](https://coveralls.io/r/Masterminds/html5-php?branch=master)
@ -22,12 +23,12 @@ To install, add `masterminds/html5` to your `composer.json` file:
``` ```
{ {
"require" : { "require" : {
"masterminds/html5": "1.*" "masterminds/html5": "2.*"
}, },
} }
``` ```
(You may substitute `1.*` for a more specific release tag, of (You may substitute `2.*` for a more specific release tag, of
course.) course.)
From there, use the `composer install` or `composer update` commands to From there, use the `composer install` or `composer update` commands to
@ -43,6 +44,7 @@ Here is how you use the high-level `HTML5` library API:
<?php <?php
// Assuming you installed from Composer: // Assuming you installed from Composer:
require "vendor/autoload.php"; require "vendor/autoload.php";
use Masterminds\HTML5;
// An example HTML document: // An example HTML document:
@ -59,13 +61,14 @@ $html = <<< 'HERE'
HERE; HERE;
// Parse the document. $dom is a DOMDocument. // Parse the document. $dom is a DOMDocument.
$dom = HTML5::loadHTML($html); $html5 = new HTML5();
$dom = $html5->loadHTML($html);
// Render it as HTML5: // Render it as HTML5:
print HTML5::saveHTML($dom); print $html5->saveHTML($dom);
// Or save it to a file: // Or save it to a file:
HTML5::save($dom, 'out.html'); $html5->save($dom, 'out.html');
?> ?>
``` ```
@ -73,6 +76,35 @@ HTML5::save($dom, 'out.html');
The `$dom` created by the parser is a full `DOMDocument` object. And the The `$dom` created by the parser is a full `DOMDocument` object. And the
`save()` and `saveHTML()` methods will take any DOMDocument. `save()` and `saveHTML()` methods will take any DOMDocument.
### Options
It is possible to pass in an array of configuration options when loading
an HTML5 document.
```php
// An associative array of options
$options = array(
'option_name' => 'option_value',
);
// Provide the options to the constructor
$html5 = new HTML5($options);
$dom = $html5->loadHTML($html);
```
The following options are supported:
* `encode_entities` (boolean): Indicates that the serializer should aggressively
encode characters as entities. Without this, it only encodes the bare
minimum.
* `disable_html_ns` (boolean): Prevents the parser from automatically
assigning the HTML5 namespace to the DOM document. This is for
non-namespace aware DOM tools.
* `target_document` (\DOMDocument): A DOM document that will be used as the
destination for the parsed nodes.
* `implicit_namespaces` (array): An assoc array of namespaces that should be
used by the parser. Name is tag prefix, value is NS URI.
## The Low-Level API ## The Low-Level API
@ -116,7 +148,7 @@ different rule sets to be used.
- The `Traverser`, which is a special-purpose tree walker. It visits - The `Traverser`, which is a special-purpose tree walker. It visits
each node in the tree and uses the `OutputRules` to transform the node each node in the tree and uses the `OutputRules` to transform the node
into a string. into a string.
- `\HTML5` manages the `Traverser` and stores the resultant data - `HTML5` manages the `Traverser` and stores the resultant data
in the correct place. in the correct place.
The serializer (`save()`, `saveHTML()`) follows the The serializer (`save()`, `saveHTML()`) follows the
@ -134,7 +166,9 @@ issues known issues that are not presently on the roadmap:
- Namespaces: HTML5 only [supports a selected list of namespaces](http://www.w3.org/TR/html5/infrastructure.html#namespaces) - Namespaces: HTML5 only [supports a selected list of namespaces](http://www.w3.org/TR/html5/infrastructure.html#namespaces)
and they do not operate in the same way as XML namespaces. A `:` has no special and they do not operate in the same way as XML namespaces. A `:` has no special
meaning. The parser does not support XML style namespaces via `:`. meaning.
By default, the parser does not support XML-style namespaces via `:`;
to enable XML namespaces, see the [XML Namespaces section](#xml-namespaces).
- Scripts: This parser does not contain a JavaScript or a CSS - Scripts: This parser does not contain a JavaScript or a CSS
interpreter. While one may be supplied, not all features will be interpreter. While one may be supplied, not all features will be
supported. supported.
@ -162,8 +196,45 @@ issues known issues that are not presently on the roadmap:
- PLAINTEXT: Unsupported. - PLAINTEXT: Unsupported.
- Adoption Agency Algorithm: Not yet implemented. (8.2.5.4.7) - Adoption Agency Algorithm: Not yet implemented. (8.2.5.4.7)
## XML Namespaces
To use XML-style namespaces, you have to configure the main `HTML5` instance accordingly.
```php
use Masterminds\HTML5;
$html = new HTML5(array(
"xmlNamespaces" => true
));
$dom = $html->loadHTML('<t:tag xmlns:t="http://www.example.com"/>');
$dom->documentElement->namespaceURI; // http://www.example.com
```
You can also add some default prefixes that will not require a namespace declaration,
but their elements will still be namespaced.
```php
use Masterminds\HTML5;
$html = new HTML5(array(
"implicitNamespaces"=>array(
"t"=>"http://www.example.com"
)
));
$dom = $html->loadHTML('<t:tag/>');
$dom->documentElement->namespaceURI; // http://www.example.com
```
## Thanks to... ## Thanks to...
The dedicated (and patient) contributors of patches small and large,
who have already made this library better. See the CREDITS file for
a list of contributors.
We owe a huge debt of gratitude to the original authors of html5lib. We owe a huge debt of gratitude to the original authors of html5lib.
While not much of the original parser remains, we learned a lot from While not much of the original parser remains, we learned a lot from

View File

@ -1,5 +1,42 @@
# Release Notes # Release Notes
2.1.1 (2015-03-23)
- #78: Fixes bug where an unmatched entity-like string drops everything after &.
2.1.0 (2015-02-01)
- #74: Added `disable_html_ns` and `target_document` DOM parsing options
- Unified option names
- #73: Fixed alphabet, &szlig; now can be detected
- #75 and #76: Allow whitespace in RCDATA tags
- #77: Fixed parsing blunder for json embeds
- #72: Add options to HTML methods
2.0.2 (2014-12-17)
- #50: empty document handling
- #63: tags with strange capitalization
- #65: dashes and underscores as allowed characters in tag names
- #68: Fixed issue with non-inline elements inside inline containers
2.0.1 (2014-09-23)
- #59: Fixed issue parsing some fragments.
- #56: Incorrectly saw 0 as empty string
- Sami as new documentation generator
2.0.0 (2014-07-28)
- #53: Improved boolean attributes handling
- #52: Facebook HHVM compatibility
- #48: Adopted PSR-2 as coding standard
- #47: Moved everything to Masterminds namespace
- #45: Added custom namespaces
- #44: Added support to XML-style namespaces
- #37: Refactored HTML5 class removing static methods
1.0.5 (2014-06-10)
- #38: Set the dev-master branch as the 1.0.x branch for composer (goetas)
- #34: Tests use PSR-4 for autoloading. (goetas)
- #40, #41: Fix entity handling in RCDATA sections. (KitaitiMakoto)
- #32: Fixed issue where character references were being incorrectly encoded in style tags.
1.0.4 (2014-04-29) 1.0.4 (2014-04-29)
- #30/#31 Don't throw an exception for invalid tag names. - #30/#31 Don't throw an exception for invalid tag names.

View File

@ -7,11 +7,11 @@
* For environments which do not have these options, it reverts to standard sequential * For environments which do not have these options, it reverts to standard sequential
* requests (using file_get_contents()) * requests (using file_get_contents())
* *
* @version 1.5 * @version 1.6
* @date 2014-03-28 * @date 2015-06-05
* @see http://php.net/HttpRequestPool * @see http://devel-m6w6.rhcloud.com/mdref/http
* @author Keyvan Minoukadeh * @author Keyvan Minoukadeh
* @copyright 2011-2014 Keyvan Minoukadeh * @copyright 2011-2015 Keyvan Minoukadeh
* @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3 * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
*/ */
@ -22,7 +22,7 @@ class HumbleHttpAgent
const METHOD_FILE_GET_CONTENTS = 4; const METHOD_FILE_GET_CONTENTS = 4;
//const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'; //const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1';
const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2'; const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2';
const UA_PHP = 'PHP/5.4'; const UA_PHP = 'PHP/5.5';
const REF_GOOGLE = 'http://www.google.co.uk/url?sa=t&source=web&cd=1'; const REF_GOOGLE = 'http://www.google.co.uk/url?sa=t&source=web&cd=1';
protected $requests = array(); protected $requests = array();
@ -38,6 +38,7 @@ class HumbleHttpAgent
public $debug = false; public $debug = false;
public $debugVerbose = false; public $debugVerbose = false;
public $rewriteHashbangFragment = true; // see http://code.google.com/web/ajaxcrawling/docs/specification.html public $rewriteHashbangFragment = true; // see http://code.google.com/web/ajaxcrawling/docs/specification.html
public $siteConfigBuilder = null; // can be set to an instance of ContentExtractor to have site config files used for custom HTTP headers
public $maxRedirects = 5; public $maxRedirects = 5;
public $userAgentMap = array(); public $userAgentMap = array();
public $rewriteUrls = array(); public $rewriteUrls = array();
@ -67,7 +68,7 @@ class HumbleHttpAgent
if (in_array($method, array(1,2,4))) { if (in_array($method, array(1,2,4))) {
$this->method = $method; $this->method = $method;
} else { } else {
if (class_exists('HttpRequestPool')) { if (class_exists('http\Client\Request')) {
$this->method = self::METHOD_REQUEST_POOL; $this->method = self::METHOD_REQUEST_POOL;
} elseif (function_exists('curl_multi_init')) { } elseif (function_exists('curl_multi_init')) {
$this->method = self::METHOD_CURL_MULTI; $this->method = self::METHOD_CURL_MULTI;
@ -192,6 +193,7 @@ class HumbleHttpAgent
return false; return false;
} }
$redirect_url = $match[1]; $redirect_url = $match[1];
$redirect_url = htmlspecialchars_decode($redirect_url); // For Facebook!
if (preg_match('!^https?://!i', $redirect_url)) { if (preg_match('!^https?://!i', $redirect_url)) {
// already absolute // already absolute
$this->debug('Meta refresh redirect found (http-equiv="refresh"), new URL: '.$redirect_url); $this->debug('Meta refresh redirect found (http-equiv="refresh"), new URL: '.$redirect_url);
@ -203,7 +205,7 @@ class HumbleHttpAgent
if (isset($base->path)) $base->path = preg_replace('!//+!', '/', $base->path); if (isset($base->path)) $base->path = preg_replace('!//+!', '/', $base->path);
if ($absolute = SimplePie_IRI::absolutize($base, $redirect_url)) { if ($absolute = SimplePie_IRI::absolutize($base, $redirect_url)) {
$this->debug('Meta refresh redirect found (http-equiv="refresh"), new URL: '.$absolute); $this->debug('Meta refresh redirect found (http-equiv="refresh"), new URL: '.$absolute);
return $absolute; return $absolute->get_iri();
} }
return false; return false;
} }
@ -293,14 +295,16 @@ class HumbleHttpAgent
if (empty($urls)) return; if (empty($urls)) return;
////////////////////////////////////////////////////// //////////////////////////////////////////////////////
// parallel (HttpRequestPool) // parallel (HTTP extension)
if ($this->method == self::METHOD_REQUEST_POOL) { if ($this->method == self::METHOD_REQUEST_POOL) {
$this->debug('Starting parallel fetch (HttpRequestPool)'); $this->debug('Starting parallel fetch (HTTP Extension)');
try { try {
while (count($urls) > 0) { while (count($urls) > 0) {
$this->debug('Processing set of '.min($this->maxParallelRequests, count($urls))); $this->debug('Processing set of '.min($this->maxParallelRequests, count($urls)));
$subset = array_splice($urls, 0, $this->maxParallelRequests); $subset = array_splice($urls, 0, $this->maxParallelRequests);
$pool = new HttpRequestPool(); //$pool = new HttpRequestPool();
$pool = new http\Client;
$pool->setOptions($this->requestOptions);
foreach ($subset as $orig => $url) { foreach ($subset as $orig => $url) {
if (!$isRedirect) $orig = $url; if (!$isRedirect) $orig = $url;
unset($this->redirectQueue[$orig]); unset($this->redirectQueue[$orig]);
@ -320,24 +324,62 @@ class HumbleHttpAgent
$req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url; $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url;
$req_url = $this->removeFragment($req_url); $req_url = $this->removeFragment($req_url);
if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) { if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) {
$_meth = HttpRequest::METH_HEAD; $_meth = "HEAD";
} else { } else {
$_meth = HttpRequest::METH_GET; $_meth = "GET";
unset($this->requests[$orig]['wrongGuess']); unset($this->requests[$orig]['wrongGuess']);
} }
$httpRequest = new HttpRequest($req_url, $_meth, $this->requestOptions); //$httpRequest = new HttpRequest($req_url, $_meth, $this->requestOptions);
// send cookies, if we have any $httpRequest = new http\Client\Request($_meth, $req_url);
if ($cookies = $this->getCookies($orig, $req_url)) { $httpRequest->setOptions($this->requestOptions);
$this->debug("......sending cookies: $cookies");
$httpRequest->addHeaders(array('Cookie' => $cookies)); // check site config for additional http headers
$scHeaders = array();
if (isset($this->siteConfigBuilder)) {
$scHeaders = $this->siteConfigBuilder->buildSiteConfig($req_url)->http_header;
} }
//$httpRequest->addHeaders(array('User-Agent' => $this->userAgent));
$httpRequest->addHeaders($this->getUserAgent($req_url, true)); // send cookies, if we have any
$_cookies = null;
if (isset($scHeaders['cookie'])) {
$_cookies = $scHeaders['cookie'];
} else {
//$_cookies = $this->cookieJar->getMatchingCookies($req_url);
$_cookies = $this->getCookies($orig, $req_url);
}
if ($_cookies) {
$this->debug("......sending cookies: $_cookies");
$httpRequest->addHeaders(array('Cookie' => $_cookies));
}
// send user agent
$_ua = null;
if (isset($scHeaders['user-agent'])) {
$_ua = $scHeaders['user-agent'];
} else {
$_ua = $this->getUserAgent($req_url, true);
$_ua = $_ua['User-Agent'];
}
if ($_ua) {
$this->debug("......user-agent set to: $_ua");
$httpRequest->addHeaders(array('User-Agent' => $_ua));
}
// add referer for picky sites // add referer for picky sites
$httpRequest->addheaders(array('Referer' => $this->referer)); $_referer = null;
if (isset($scHeaders['referer'])) {
$_referer = $scHeaders['referer'];
} else {
$_referer = $this->referer;
}
if ($_referer) {
$this->debug("......referer set to: $_referer");
$httpRequest->addheaders(array('Referer'=>$_referer));
}
$this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest); $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest);
$this->requests[$orig]['original_url'] = $orig; $this->requests[$orig]['original_url'] = $orig;
$pool->attach($httpRequest); $pool->enqueue($httpRequest);
} }
} }
// did we get anything into the pool? // did we get anything into the pool?
@ -345,16 +387,20 @@ class HumbleHttpAgent
$this->debug('Sending request...'); $this->debug('Sending request...');
try { try {
$pool->send(); $pool->send();
} catch (HttpRequestPoolException $e) { } catch (http\Exception $e) {
// do nothing // do nothing
} }
$this->debug('Received responses'); $this->debug('Received responses');
foreach($subset as $orig => $url) { foreach($subset as $orig => $url) {
if (!$isRedirect) $orig = $url; if (!$isRedirect) $orig = $url;
$request = $this->requests[$orig]['httpRequest']; $request = $this->requests[$orig]['httpRequest'];
$response = $pool->getResponse($request);
//$this->requests[$orig]['headers'] = $this->headersToString($request->getResponseHeader()); //$this->requests[$orig]['headers'] = $this->headersToString($request->getResponseHeader());
// getResponseHeader() doesn't return status line, so, for consistency... // getResponseHeader() doesn't return status line, so, for consistency...
$this->requests[$orig]['headers'] = substr($request->getRawResponseMessage(), 0, $request->getResponseInfo('header_size')); //$headers = $response->toString();
$this->requests[$orig]['headers'] = $response->getInfo()."\n".$this->headersToString($response->getHeaders(), true);
// v1 HTTP extension code
//$this->requests[$orig]['headers'] = substr($request->getRawResponseMessage(), 0, $request->getResponseInfo('header_size'));
// check content type // check content type
// TODO: use getResponseHeader('content-type') or getResponseInfo() // TODO: use getResponseHeader('content-type') or getResponseInfo()
if ($this->headerOnlyType($this->requests[$orig]['headers'])) { if ($this->headerOnlyType($this->requests[$orig]['headers'])) {
@ -362,25 +408,37 @@ class HumbleHttpAgent
$_header_only_type = true; $_header_only_type = true;
$this->debug('Header only type returned'); $this->debug('Header only type returned');
} else { } else {
$this->requests[$orig]['body'] = $request->getResponseBody(); $this->requests[$orig]['body'] = $response->getBody()->toString();
//var_dump($this->requests[$orig]['body']);exit;
// v1 HTTP ext. code
//$this->requests[$orig]['body'] = $request->getResponseBody();
$_header_only_type = false; $_header_only_type = false;
} }
$this->requests[$orig]['effective_url'] = $request->getResponseInfo('effective_url'); $this->requests[$orig]['effective_url'] = $response->getTransferInfo('effective_url');
$this->requests[$orig]['status_code'] = $status_code = $request->getResponseCode(); $this->requests[$orig]['status_code'] = $status_code = $response->getResponseCode();
// v1 HTTP ext. code
//$this->requests[$orig]['effective_url'] = $request->getResponseInfo('effective_url');
//$this->requests[$orig]['status_code'] = $status_code = $request->getResponseCode();
// is redirect? // is redirect?
if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && $request->getResponseHeader('location')) { if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && $response->getHeader('location')) {
$redirectURL = $request->getResponseHeader('location'); // v1 HTTP ext. code
//if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && $request->getResponseHeader('location')) {
$redirectURL = $response->getHeader('location');
if (!preg_match('!^https?://!i', $redirectURL)) { if (!preg_match('!^https?://!i', $redirectURL)) {
$redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url); $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
} }
if ($this->validateURL($redirectURL)) { if ($this->validateURL($redirectURL)) {
$this->debug('Redirect detected. Valid URL: '.$redirectURL); $this->debug('Redirect detected. Valid URL: '.$redirectURL);
// store any cookies
//$cookies = $request->getResponseHeader('set-cookie');
//if ($cookies && !is_array($cookies)) $cookies = array($cookies);
//if ($cookies) $this->cookieJar->storeCookies($url, $cookies);
$this->storeCookies($orig, $url); $this->storeCookies($orig, $url);
$this->redirectQueue[$orig] = $redirectURL; $this->redirectQueue[$orig] = $redirectURL;
} else { } else {
$this->debug('Redirect detected. Invalid URL: '.$redirectURL); $this->debug('Redirect detected. Invalid URL: '.$redirectURL);
} }
} elseif (!$_header_only_type && $request->getMethod() === HttpRequest::METH_HEAD) { } elseif (!$_header_only_type && $request->getRequestMethod() == "HEAD") {
// the response content-type did not match our 'header only' types, // the response content-type did not match our 'header only' types,
 // but we'd issued a HEAD request because we assumed it would. So // but we'd issued a HEAD request because we assumed it would. So
// let's queue a proper GET request for this item... // let's queue a proper GET request for this item...
@ -399,7 +457,7 @@ class HumbleHttpAgent
} }
} }
//die($url.' -multi- '.$request->getResponseInfo('effective_url')); //die($url.' -multi- '.$request->getResponseInfo('effective_url'));
$pool->detach($request); $pool->dequeue($request);
unset($this->requests[$orig]['httpRequest'], $request); unset($this->requests[$orig]['httpRequest'], $request);
/* /*
if ($this->minimiseMemoryUse) { if ($this->minimiseMemoryUse) {
@ -411,7 +469,7 @@ class HumbleHttpAgent
} }
} }
} }
} catch (HttpException $e) { } catch (http\Exception $e) {
$this->debug($e); $this->debug($e);
return false; return false;
} }
@ -452,15 +510,51 @@ class HumbleHttpAgent
unset($this->requests[$orig]['wrongGuess']); unset($this->requests[$orig]['wrongGuess']);
} }
$headers = array(); $headers = array();
//$headers[] = 'User-Agent: '.$this->userAgent;
$headers[] = $this->getUserAgent($req_url); // check site config for additional http headers
// add referer for picky sites $scHeaders = array();
$headers[] = 'Referer: '.$this->referer; if (isset($this->siteConfigBuilder)) {
// send cookies, if we have any $scHeaders = $this->siteConfigBuilder->buildSiteConfig($req_url)->http_header;
if ($cookies = $this->getCookies($orig, $req_url)) {
$this->debug("......sending cookies: $cookies");
$headers[] = 'Cookie: '.$cookies;
} }
// send cookies, if we have any
$_cookies = null;
if (isset($scHeaders['cookie'])) {
$_cookies = $scHeaders['cookie'];
} else {
//$_cookies = $this->cookieJar->getMatchingCookies($req_url);
$_cookies = $this->getCookies($orig, $req_url);
}
if ($_cookies) {
$this->debug("......sending cookies: $_cookies");
$headers[] = 'Cookie: '.$_cookies;
}
// send user agent
$_ua = null;
if (isset($scHeaders['user-agent'])) {
$_ua = $scHeaders['user-agent'];
} else {
$_ua = $this->getUserAgent($req_url, true);
$_ua = $_ua['User-Agent'];
}
if ($_ua) {
$this->debug("......user-agent set to: $_ua");
$headers[] = 'User-Agent: '.$_ua;
}
// add referer for picky sites
$_referer = null;
if (isset($scHeaders['referer'])) {
$_referer = $scHeaders['referer'];
} else {
$_referer = $this->referer;
}
if ($_referer) {
$this->debug("......referer set to: $_referer");
$headers[] = 'Referer: '.$_referer;
}
$httpRequest = new RollingCurlRequest($req_url, $_meth, null, $headers, $this->curlOptions); $httpRequest = new RollingCurlRequest($req_url, $_meth, null, $headers, $this->curlOptions);
$httpRequest->set_original_url($orig); $httpRequest->set_original_url($orig);
$this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest); $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest);
@ -494,6 +588,9 @@ class HumbleHttpAgent
} }
if ($this->validateURL($redirectURL)) { if ($this->validateURL($redirectURL)) {
$this->debug('Redirect detected. Valid URL: '.$redirectURL); $this->debug('Redirect detected. Valid URL: '.$redirectURL);
// store any cookies
//$cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']);
//if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies);
$this->storeCookies($orig, $url); $this->storeCookies($orig, $url);
$this->redirectQueue[$orig] = $redirectURL; $this->redirectQueue[$orig] = $redirectURL;
} else { } else {
@ -548,15 +645,52 @@ class HumbleHttpAgent
$req_url = $this->rewriteUrls($url); $req_url = $this->rewriteUrls($url);
$req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url; $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url;
$req_url = $this->removeFragment($req_url); $req_url = $this->removeFragment($req_url);
// send cookies, if we have any
$httpContext = $this->httpContext; $httpContext = $this->httpContext;
$httpContext['http']['header'] .= $this->getUserAgent($req_url)."\r\n";
// add referer for picky sites // check site config for additional http headers
$httpContext['http']['header'] .= 'Referer: '.$this->referer."\r\n"; $scHeaders = array();
if ($cookies = $this->getCookies($orig, $req_url)) { if (isset($this->siteConfigBuilder)) {
$this->debug("......sending cookies: $cookies"); $scHeaders = $this->siteConfigBuilder->buildSiteConfig($req_url)->http_header;
$httpContext['http']['header'] .= 'Cookie: '.$cookies."\r\n";
} }
// send cookies, if we have any
$_cookies = null;
if (isset($scHeaders['cookie'])) {
$_cookies = $scHeaders['cookie'];
} else {
//$_cookies = $this->cookieJar->getMatchingCookies($req_url);
$_cookies = $this->getCookies($orig, $req_url);
}
if ($_cookies) {
$this->debug("......sending cookies: $_cookies");
$httpContext['http']['header'] .= 'Cookie: '.$_cookies."\r\n";
}
// send user agent
$_ua = null;
if (isset($scHeaders['user-agent'])) {
$_ua = $scHeaders['user-agent'];
} else {
$_ua = $this->getUserAgent($req_url, true);
$_ua = $_ua['User-Agent'];
}
if ($_ua) {
$this->debug("......user-agent set to: $_ua");
$httpContext['http']['header'] .= 'User-Agent: '.$_ua."\r\n";
}
// add referer for picky sites
$_referer = null;
if (isset($scHeaders['referer'])) {
$_referer = $scHeaders['referer'];
} else {
$_referer = $this->referer;
}
if ($_referer) {
$this->debug("......referer set to: $_referer");
$httpContext['http']['header'] .= 'Referer: '.$_referer."\r\n";
}
if (false !== ($html = @file_get_contents($req_url, false, stream_context_create($httpContext)))) { if (false !== ($html = @file_get_contents($req_url, false, stream_context_create($httpContext)))) {
$this->debug('Received response'); $this->debug('Received response');
// get status code // get status code
@ -585,6 +719,9 @@ class HumbleHttpAgent
} }
if ($this->validateURL($redirectURL)) { if ($this->validateURL($redirectURL)) {
$this->debug('Redirect detected. Valid URL: '.$redirectURL); $this->debug('Redirect detected. Valid URL: '.$redirectURL);
// store any cookies
//$cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']);
//if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies);
$this->storeCookies($orig, $url); $this->storeCookies($orig, $url);
$this->redirectQueue[$orig] = $redirectURL; $this->redirectQueue[$orig] = $redirectURL;
} else { } else {
@ -680,7 +817,7 @@ class HumbleHttpAgent
} }
public function parallelSupport() { public function parallelSupport() {
return class_exists('HttpRequestPool') || function_exists('curl_multi_init'); return class_exists('http\Client') || function_exists('curl_multi_init');
} }
private function headerOnlyType($headers) { private function headerOnlyType($headers) {
@ -727,6 +864,7 @@ class HumbleHttpAgent
protected function deleteCookies() { protected function deleteCookies() {
$this->cookieJar = array(); $this->cookieJar = array();
} }
} }
// gzdecode from http://www.php.net/manual/en/function.gzdecode.php#82930 // gzdecode from http://www.php.net/manual/en/function.gzdecode.php#82930

View File

@ -22,6 +22,7 @@ class HumbleHttpAgentDummy
public $userAgentMap = array(); public $userAgentMap = array();
public $rewriteUrls = array(); public $rewriteUrls = array();
public $userAgentDefault; public $userAgentDefault;
public $siteConfigBuilder = null;
public $referer; public $referer;
protected $body = ''; protected $body = '';

View File

@ -12,7 +12,7 @@
* More information: http://fivefilters.org/content-only/ * More information: http://fivefilters.org/content-only/
* License: Apache License, Version 2.0 * License: Apache License, Version 2.0
* Requires: PHP5 * Requires: PHP5
* Date: 2014-03-27 * Date: 2015-06-01
* *
* Differences between the PHP port and the original * Differences between the PHP port and the original
* ------------------------------------------------------ * ------------------------------------------------------
@ -95,7 +95,7 @@ class Readability
// 'trimRe' => '/^\s+|\s+$/g', // PHP has trim() // 'trimRe' => '/^\s+|\s+$/g', // PHP has trim()
'normalize' => '/\s{2,}/', 'normalize' => '/\s{2,}/',
'killBreaks' => '/(<br\s*\/?>(\s|&nbsp;?)*){1,}/', 'killBreaks' => '/(<br\s*\/?>(\s|&nbsp;?)*){1,}/',
'video' => '!//(player\.|www\.)?(youtube\.com|vimeo\.com|viddler\.com|twitch\.tv)!i', 'video' => '!//(player\.|www\.)?(youtube\.com|vimeo\.com|viddler\.com|soundcloud\.com|twitch\.tv)!i',
'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i' 'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i'
); );
@ -121,8 +121,12 @@ class Readability
if (version_compare(PHP_VERSION, '5.3.0') >= 0) { if (version_compare(PHP_VERSION, '5.3.0') >= 0) {
//use Masterminds\HTML5; //use Masterminds\HTML5;
$html5class = 'Masterminds\HTML5'; $html5class = 'Masterminds\HTML5';
$html5 = new $html5class(); $html5 = new $html5class(array('disable_html_ns' => true));
$this->dom = $html5->loadHTML($html); $this->dom = $html5->loadHTML($html);
//echo $html5->saveHTML($this->dom);exit;
//$xpath = new DOMXPath($this->dom);
//$elems = $xpath->query("//a");
//print_r($elems);exit;
} }
} }
if ($this->dom === null) { if ($this->dom === null) {
@ -314,7 +318,11 @@ class Readability
$styleTags = $this->dom->getElementsByTagName('style'); $styleTags = $this->dom->getElementsByTagName('style');
for ($i = $styleTags->length-1; $i >= 0; $i--) for ($i = $styleTags->length-1; $i >= 0; $i--)
{ {
$styleTags->item($i)->parentNode->removeChild($styleTags->item($i)); try {
@$styleTags->item($i)->parentNode->removeChild($styleTags->item($i));
} catch (Exception $e) {
// Do nothing
}
} }
/* Turn all double br's into p's */ /* Turn all double br's into p's */
@ -832,7 +840,11 @@ class Readability
$scripts = $doc->getElementsByTagName('script'); $scripts = $doc->getElementsByTagName('script');
for($i = $scripts->length-1; $i >= 0; $i--) for($i = $scripts->length-1; $i >= 0; $i--)
{ {
try {
$scripts->item($i)->parentNode->removeChild($scripts->item($i)); $scripts->item($i)->parentNode->removeChild($scripts->item($i));
} catch (Exception $e) {
// do nothing
}
} }
} }

View File

@ -1,10 +1,10 @@
<?php <?php
// Full-Text RSS: Create Full-Text Feeds // Full-Text RSS: Create Full-Text Feeds
// Author: Keyvan Minoukadeh // Author: Keyvan Minoukadeh
// Copyright (c) 2014 Keyvan Minoukadeh // Copyright (c) 2015 Keyvan Minoukadeh
// License: AGPLv3 // License: AGPLv3
// Version: 3.4 // Version: 3.5
// Date: 2014-08-28 // Date: 2015-05-29
// More info: http://fivefilters.org/content-only/ // More info: http://fivefilters.org/content-only/
// Help: http://help.fivefilters.org // Help: http://help.fivefilters.org
@ -30,6 +30,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
error_reporting(E_ALL ^ E_NOTICE); error_reporting(E_ALL ^ E_NOTICE);
libxml_use_internal_errors(true); libxml_use_internal_errors(true);
libxml_disable_entity_loader(true);
ini_set("display_errors", 1); ini_set("display_errors", 1);
@set_time_limit(120); @set_time_limit(120);
@ -234,7 +235,7 @@ if (isset($_REQUEST['accept']) && in_array(strtolower($_REQUEST['accept']), arra
$user_submitted_config = null; $user_submitted_config = null;
if (isset($_REQUEST['siteconfig'])) { if (isset($_REQUEST['siteconfig'])) {
$user_submitted_config = $_REQUEST['siteconfig']; $user_submitted_config = $_REQUEST['siteconfig'];
if (!$options->user_submitted_content && $user_submitted_config) { if (!$options->user_submitted_config && $user_submitted_config) {
die('User-submitted site configs are currently disabled. Please remove the siteconfig parameter.'); die('User-submitted site configs are currently disabled. Please remove the siteconfig parameter.');
} }
} }
@ -526,7 +527,8 @@ if (isset($_REQUEST['inputhtml']) && _FF_FTR_MODE == 'simple') {
} }
$http = new HumbleHttpAgent($_req_options); $http = new HumbleHttpAgent($_req_options);
$http->debug = $debug_mode; $http->debug = $debug_mode;
$http->userAgentMap = $options->user_agents; // User agents can now be set in site config files using the http_header directive
//$http->userAgentMap = $options->user_agents;
$http->headerOnlyTypes = array_keys($options->content_type_exc); $http->headerOnlyTypes = array_keys($options->content_type_exc);
$http->rewriteUrls = $options->rewrite_url; $http->rewriteUrls = $options->rewrite_url;
unset($_req_options); unset($_req_options);
@ -545,6 +547,7 @@ $extractor->parserOverride = $parser;
if ($options->user_submitted_config && $user_submitted_config) { if ($options->user_submitted_config && $user_submitted_config) {
$extractor->setUserSubmittedConfig($user_submitted_config); $extractor->setUserSubmittedConfig($user_submitted_config);
} }
$http->siteConfigBuilder = $extractor;
//////////////////////////////// ////////////////////////////////
// Get RSS/Atom feed // Get RSS/Atom feed
@ -655,7 +658,7 @@ $items = $feed->get_items(0, $max);
$urls_sanitized = array(); $urls_sanitized = array();
$urls = array(); $urls = array();
foreach ($items as $key => $item) { foreach ($items as $key => $item) {
$permalink = htmlspecialchars_decode($item->get_permalink()); $permalink = htmlspecialchars_decode(trim($item->get_permalink()));
// Colons in URL path segments get encoded by SimplePie, yet some sites expect them unencoded // Colons in URL path segments get encoded by SimplePie, yet some sites expect them unencoded
$permalink = str_replace('%3A', ':', $permalink); $permalink = str_replace('%3A', ':', $permalink);
// validateUrl() strips non-ascii characters // validateUrl() strips non-ascii characters
@ -974,6 +977,13 @@ foreach ($items as $key => $item) {
} }
} }
// add open graph
if ($opengraph = $extractor->getOpenGraph()) {
foreach ($opengraph as $og_prop => $og_val) {
$newitem->addElement($og_prop, $og_val);
}
}
// add language // add language
if ($detect_language) { if ($detect_language) {
$language = $extractor->getLanguage(); $language = $extractor->getLanguage();
@ -1390,6 +1400,17 @@ function get_single_page($item, $html, $url) {
// Loop through single_page_link xpath expressions // Loop through single_page_link xpath expressions
$single_page_url = null; $single_page_url = null;
foreach ($splink as $pattern) { foreach ($splink as $pattern) {
// Do we have conditions?
$condition = $site_config->get_if_page_contains_condition('single_page_link', $pattern);
if ($condition) {
$elems = @$xpath->evaluate($condition, $readability->dom);
if ($elems instanceof DOMNodeList && $elems->length > 0) {
// all fine
} else {
// move on to next single page link XPath
continue;
}
}
$elems = @$xpath->evaluate($pattern, $readability->dom); $elems = @$xpath->evaluate($pattern, $readability->dom);
if (is_string($elems)) { if (is_string($elems)) {
$single_page_url = trim($elems); $single_page_url = trim($elems);

3
robots.txt Normal file
View File

@ -0,0 +1,3 @@
User-agent: *
Disallow: /makefulltextfeed.php
Disallow: /extract.php

View File

@ -0,0 +1,3 @@
<?php
// this is here to prevent directory listing over the web
?>