Full-Text RSS 3.5
This commit is contained in:
parent
daedf214fe
commit
bfed79edc7
@ -1,9 +1,9 @@
|
||||
<?php
|
||||
// Update site config files for Full-Text RSS
|
||||
// Author: Keyvan Minoukadeh
|
||||
// Copyright (c) 2014 Keyvan Minoukadeh
|
||||
// Copyright (c) 2015 Keyvan Minoukadeh
|
||||
// License: AGPLv3
|
||||
// Date: 2014-08-19
|
||||
// Date: 2015-06-10
|
||||
// More info: http://fivefilters.org/content-only/
|
||||
// Help: http://help.fivefilters.org
|
||||
|
||||
@ -245,7 +245,7 @@ function println($txt) {
|
||||
}
|
||||
|
||||
function rrmdir($dir) {
|
||||
foreach(glob($dir . '/{*.txt,*.php,.*.txt,.*.php,.gitattributes,.gitignore,ftr-site-config-master,README.md}', GLOB_BRACE|GLOB_NOSORT) as $file) {
|
||||
foreach(glob($dir . '/{*.txt,*.php,*.com,.*.txt,.*.php,.*.com,.gitattributes,.gitignore,ftr-site-config-master,README.md}', GLOB_BRACE|GLOB_NOSORT) as $file) {
|
||||
if(is_dir($file)) {
|
||||
rrmdir($file);
|
||||
} else {
|
||||
|
@ -2,6 +2,19 @@ FiveFilters.org: Full-Text RSS
|
||||
http://fivefilters.org/content-only/
|
||||
CHANGELOG
|
||||
------------------------------------
|
||||
3.5 (2015-06-13)
|
||||
- Open Graph properties og:title, og:type, og:url, og:image, and og:description now returned if found in the page being processed
|
||||
- Bug fix: certain XPath expressions weren't being evaluated correctly when HTML5 parsing was enabled
|
||||
- Cookie handling now only on redirects - fixes issue with certain sites (thanks to Dave Vasilevsky)
|
||||
- Compatibility test will no longer show HHVM as incompatible - Full-Text RSS worked with HHVM 3.7.1 in our tests (but without Tidy support and no automatic site config updates)
|
||||
- Humble HTTP Agent updated to support version 2 of PHP's HTTP extension
|
||||
- HTML5-PHP library updated
|
||||
- Site config files can now include HTTP headers (user-agent, cookie, referer), e.g. http_header(user-agent): PHP/5.6
|
||||
- Config option removed: $options->user_agents - use site config files.
|
||||
- Site config files which use single_page_link can now follow it with if_page_contains: XPath to make it conditional.
|
||||
- Minimum supported PHP version is now 5.3. If you must use PHP 5.2, please download Full-Text RSS 3.4
|
||||
- Site config files updated for better extraction
|
||||
- Other minor fixes/improvements
|
||||
|
||||
3.4.1 (unreleased)
|
||||
- Backporting Dave Vasilevsky cookie patch. Fixes issues with certain sites. See https://gist.github.com/fivefilters/0a758b6d64ce4fb5728c
|
||||
|
18
config.php
18
config.php
@ -430,22 +430,6 @@ $options->fingerprints = array(
|
||||
'<meta name="generator" content="WordPress' => array('hostname'=>'fingerprint.wordpress.com', 'head'=>true)
|
||||
);
|
||||
|
||||
// User Agent strings - mapping domain names
|
||||
// ----------------------
|
||||
// e.g. $options->user_agents = array('example.org' => 'PHP/5.2');
|
||||
$options->user_agents = array( 'lifehacker.com' => 'PHP/5.2',
|
||||
'gawker.com' => 'PHP/5.2',
|
||||
'deadspin.com' => 'PHP/5.2',
|
||||
'kotaku.com' => 'PHP/5.2',
|
||||
'jezebel.com' => 'PHP/5.2',
|
||||
'io9.com' => 'PHP/5.2',
|
||||
'jalopnik.com' => 'PHP/5.2',
|
||||
'gizmodo.com' => 'PHP/5.2',
|
||||
'.wikipedia.org' => 'Mozilla/5.2',
|
||||
'.fok.nl' => 'Googlebot/2.1',
|
||||
'getpocket.com' => 'PHP/5.2'
|
||||
);
|
||||
|
||||
// URL Rewriting
|
||||
// ----------------------
|
||||
// Currently allows simple string replace of URLs.
|
||||
@ -500,7 +484,7 @@ $options->cache_cleanup = 100;
|
||||
/// DO NOT CHANGE ANYTHING BELOW THIS ///////////
|
||||
/////////////////////////////////////////////////
|
||||
|
||||
if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '3.4');
|
||||
if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '3.5');
|
||||
|
||||
if (basename(__FILE__) == 'config.php') {
|
||||
if (file_exists(dirname(__FILE__).'/custom_config.php')) {
|
||||
|
@ -16,17 +16,19 @@ SimplePie.org. We have kept most of their checks intact as we use SimplePie in o
|
||||
http://github.com/simplepie/simplepie/tree/master/compatibility_test/
|
||||
*/
|
||||
|
||||
$app_name = 'Full-Text RSS 3.3';
|
||||
$app_name = 'Full-Text RSS 3.5';
|
||||
|
||||
// Full-Text RSS is not yet compatible with HHVM, that's why we check for it with HHVM_VERSION.
|
||||
$php_ok = (function_exists('version_compare') && version_compare(phpversion(), '5.2.0', '>=') && !defined('HHVM_VERSION'));
|
||||
//$php_ok = (function_exists('version_compare') && version_compare(phpversion(), '5.2.0', '>=') && !defined('HHVM_VERSION'));
|
||||
// HHVM works okay, but without Tidy support, and automatic updating of site config files does not work (tested with HHVM 3.7.1)
|
||||
$php_ok = (function_exists('version_compare') && version_compare(phpversion(), '5.3.0', '>='));
|
||||
$pcre_ok = extension_loaded('pcre');
|
||||
$zlib_ok = extension_loaded('zlib');
|
||||
$mbstring_ok = extension_loaded('mbstring');
|
||||
$iconv_ok = extension_loaded('iconv');
|
||||
$tidy_ok = function_exists('tidy_parse_string');
|
||||
$curl_ok = function_exists('curl_exec');
|
||||
$parallel_ok = ((extension_loaded('http') && class_exists('HttpRequestPool')) || ($curl_ok && function_exists('curl_multi_init')));
|
||||
$parallel_ok = ((extension_loaded('http') && class_exists('http\Client\Request')) || ($curl_ok && function_exists('curl_multi_init')));
|
||||
$allow_url_fopen_ok = (bool)ini_get('allow_url_fopen');
|
||||
$filter_ok = extension_loaded('filter');
|
||||
|
||||
@ -201,7 +203,7 @@ div.chunk {
|
||||
<tbody>
|
||||
<tr class="<?php echo ($php_ok) ? 'enabled' : 'disabled'; ?>">
|
||||
<td>PHP</td>
|
||||
<td>5.2.0 or higher</td>
|
||||
<td>5.3 or higher</td>
|
||||
<td><?php echo phpversion(); ?></td>
|
||||
</tr>
|
||||
<tr class="<?php echo ($xml_ok) ? 'enabled, and sane' : 'disabled, or broken'; ?>">
|
||||
@ -306,9 +308,9 @@ div.chunk {
|
||||
<?php endif; ?>
|
||||
|
||||
<?php if ($parallel_ok): ?>
|
||||
<li><strong>Parallel URL fetching:</strong> You have <code>HttpRequestPool</code> or <code>curl_multi</code> support installed. No problems here.</li>
|
||||
<li><strong>Parallel URL fetching:</strong> You have PHP's HTTP extension or <code>curl_multi</code> installed. No problems here.</li>
|
||||
<?php else: ?>
|
||||
<li class="highlight"><strong>Parallel URL fetching:</strong> <code>HttpRequestPool</code> or <code>curl_multi</code> support is not available. <?php echo $app_name; ?> will use <code>file_get_contents()</code> instead to fetch URLs sequentially rather than in parallel.</li>
|
||||
<li class="highlight"><strong>Parallel URL fetching:</strong> HTTP extension or <code>curl_multi</code> support is not available. <?php echo $app_name; ?> will use <code>file_get_contents()</code> instead to fetch URLs sequentially rather than in parallel.</li>
|
||||
<?php endif; ?>
|
||||
|
||||
<?php else: ?>
|
||||
@ -352,11 +354,11 @@ div.chunk {
|
||||
<div class="chunk">
|
||||
<h3>Further info</h3>
|
||||
<h4>HTTP module</h4>
|
||||
<p>Full-Text RSS can make use of <code>HttpRequestPool</code> or <code>curl_multi</code> to make parallel HTTP requests when processing feeds. If neither are available, it will make sequential requests using <code>file_get_contents</code>.</p>
|
||||
<p>Full-Text RSS can make use of PHP's HTTP extension or <code>curl_multi</code> to make parallel HTTP requests when processing feeds. If neither are available, it will make sequential requests using <code>file_get_contents</code>.</p>
|
||||
<?php
|
||||
$http_type = 'file_get_contents';
|
||||
if (extension_loaded('http') && class_exists('HttpRequestPool')) {
|
||||
$http_type = 'HttpRequestPool';
|
||||
if (extension_loaded('http') && class_exists('http\Client\Request')) {
|
||||
$http_type = 'HTTP extension';
|
||||
} elseif ($curl_ok && function_exists('curl_multi_init')) {
|
||||
$http_type = 'curl_multi';
|
||||
}
|
||||
|
@ -582,8 +582,8 @@ if (!defined('_FF_FTR_INDEX')) {
|
||||
|
||||
<h3>System Requirements</h3>
|
||||
|
||||
<p>PHP 5.2 or above is required. A simple shared web hosting account will work fine.
|
||||
The code has been tested on Windows and Linux using the Apache web server. If you're a Windows user, you can try it on your own machine using <a href="http://www.wampserver.com/en/index.php">WampServer</a>. It has also been reported as working under IIS, but we have not tested this ourselves.</p>
|
||||
<p>PHP 5.3 or above is required. A simple shared web hosting account should work fine, but we recommend a <a href="http://help.fivefilters.org/customer/portal/articles/1143210-hosting">VPS with 1GB RAM</a>.
|
||||
The code has been tested on Windows and Linux using the Apache web server. If you're a Windows user, you can try it on your own machine using <a href="http://www.uniformserver.com/">Uniform Server</a>. It has also been reported as working under IIS, but we have not tested this ourselves.</p>
|
||||
|
||||
<h3 id="download">Download</h3>
|
||||
<p>Download from <a href="http://fivefilters.org/content-only/#download">fivefilters.org</a> — old versions are available in our <a href="http://code.fivefilters.org">code repository</a>.</p>
|
||||
|
@ -15,12 +15,12 @@
|
||||
class ContentExtractor
|
||||
{
|
||||
protected static $tidy_config = array(
|
||||
'clean' => true,
|
||||
'clean' => false, // can't preserve wbr tabs if this is set to true
|
||||
'output-xhtml' => true,
|
||||
'logical-emphasis' => true,
|
||||
'show-body-only' => false,
|
||||
'new-blocklevel-tags' => 'article, aside, footer, header, hgroup, menu, nav, section, details, datagrid',
|
||||
'new-inline-tags' => 'mark, time, meter, progress, data',
|
||||
'new-blocklevel-tags' => 'article aside footer header hgroup menu nav section details datagrid',
|
||||
'new-inline-tags' => 'mark time meter progress data wbr',
|
||||
'wrap' => 0,
|
||||
'drop-empty-paras' => true,
|
||||
'drop-proprietary-attributes' => false,
|
||||
@ -42,6 +42,7 @@ class ContentExtractor
|
||||
protected $body;
|
||||
protected $success = false;
|
||||
protected $nextPageUrl;
|
||||
protected $opengraph = array();
|
||||
public $allowedParsers = array('libxml', 'html5php');
|
||||
public $defaultParser = 'libxml';
|
||||
public $parserOverride = null;
|
||||
@ -79,6 +80,7 @@ class ContentExtractor
|
||||
$this->date = null;
|
||||
$this->nextPageUrl = null;
|
||||
$this->success = false;
|
||||
$this->opengraph = array();
|
||||
}
|
||||
|
||||
public function findHostUsingFingerprints($html) {
|
||||
@ -109,8 +111,11 @@ class ContentExtractor
|
||||
if (substr($host, 0, 4) == 'www.') $host = substr($host, 4);
|
||||
// is merged version already cached?
|
||||
if (SiteConfig::is_cached("$host.merged")) {
|
||||
$this->debug("Returning cached and merged site config for $host");
|
||||
return SiteConfig::build("$host.merged");
|
||||
$config = SiteConfig::build("$host.merged");
|
||||
if ($config) {
|
||||
$this->debug("Returning cached and merged site config for $host");
|
||||
return $config;
|
||||
}
|
||||
}
|
||||
// let's build from site_config/custom/ and standard/
|
||||
$config = SiteConfig::build($host);
|
||||
@ -315,7 +320,25 @@ class ContentExtractor
|
||||
if ($this->language) break;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// try to get Open Graph properties
|
||||
$elems = @$xpath->query("//head//meta[@property='og:title' or @property='og:type' or @property='og:url' or @property='og:image' or @property='og:description']", $this->readability->dom);
|
||||
// check for matches
|
||||
if ($elems && $elems->length > 0) {
|
||||
$this->debug('Extracting Open Graph elements');
|
||||
foreach ($elems as $elem) {
|
||||
if ($elem->hasAttribute('content')) {
|
||||
$_prop = strtolower($elem->getAttribute('property'));
|
||||
$_val = $elem->getAttribute('content');
|
||||
// currently one of each is returned, so we keep the first one
|
||||
if (!isset($this->opengraph[$_prop])) {
|
||||
$this->opengraph[$_prop] = $_val;
|
||||
}
|
||||
}
|
||||
}
|
||||
unset($_prop, $_val);
|
||||
}
|
||||
|
||||
// try to get date
|
||||
foreach ($this->config->date as $pattern) {
|
||||
$elems = @$xpath->evaluate($pattern, $this->readability->dom);
|
||||
@ -397,6 +420,16 @@ class ContentExtractor
|
||||
$elems->item($i)->parentNode->removeChild($elems->item($i));
|
||||
}
|
||||
}
|
||||
|
||||
// strip empty a elements
|
||||
$elems = $xpath->query("//a[not(./*) and normalize-space(.)='']", $this->readability->dom);
|
||||
// check for matches
|
||||
if ($elems && $elems->length > 0) {
|
||||
$this->debug('Stripping '.$elems->length.' empty a elements');
|
||||
for ($i=$elems->length-1; $i >= 0; $i--) {
|
||||
$elems->item($i)->parentNode->removeChild($elems->item($i));
|
||||
}
|
||||
}
|
||||
|
||||
// try to get body
|
||||
foreach ($this->config->body as $pattern) {
|
||||
@ -789,6 +822,10 @@ class ContentExtractor
|
||||
return $this->body;
|
||||
}
|
||||
|
||||
public function getOpenGraph() {
|
||||
return $this->opengraph;
|
||||
}
|
||||
|
||||
public function isNativeAd() {
|
||||
return $this->nativeAd;
|
||||
}
|
||||
|
@ -5,10 +5,10 @@
|
||||
* Each instance of this class should hold extraction patterns and other directives
|
||||
* for a website. See ContentExtractor class to see how it's used.
|
||||
*
|
||||
* @version 0.8
|
||||
* @date 2013-04-16
|
||||
* @version 1.0
|
||||
* @date 2015-06-09
|
||||
* @author Keyvan Minoukadeh
|
||||
* @copyright 2013 Keyvan Minoukadeh
|
||||
* @copyright 2015 Keyvan Minoukadeh
|
||||
* @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
|
||||
*/
|
||||
|
||||
@ -38,8 +38,7 @@ class SiteConfig
|
||||
// Mark article as a native ad if any of these expressions match (0 or more xpath expressions)
|
||||
public $native_ad_clue = array();
|
||||
|
||||
// Additional HTTP headers to send
|
||||
// NOT YET USED
|
||||
// Additional HTTP headers to send (associative array)
|
||||
public $http_header = array();
|
||||
|
||||
// Process HTML with tidy before creating DOM (bool or null if undeclared)
|
||||
@ -66,6 +65,15 @@ class SiteConfig
|
||||
|
||||
// Test URL - if present, can be used to test the config above
|
||||
public $test_url = array();
|
||||
|
||||
// Test URL contains - one or more snippets of text from the article body.
|
||||
// Used to determine if the extraction rules for the site are still valid (i.e. still extracting relevant content)
|
||||
// Keys should be one or more of the test URLs supplied, and value an array of strings to look for.
|
||||
public $test_contains = array();
|
||||
|
||||
// If page contains - XPath expression. Used to determine if the preceding rule gets evaluated or not.
|
||||
// Currently only works with single_page_link.
|
||||
public $if_page_contains = array();
|
||||
|
||||
// Single-page link - should identify a link element or URL pointing to the page holding the entire article
|
||||
// This is useful for sites which split their articles across multiple pages. Links to such pages tend to
|
||||
@ -185,11 +193,23 @@ class SiteConfig
|
||||
|
||||
public function append(SiteConfig $newconfig) {
|
||||
// check for commands where we accept multiple statements (no test_url)
|
||||
foreach (array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'native_ad_clue', 'http_header') as $var) {
|
||||
foreach (array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'native_ad_clue') as $var) {
|
||||
// append array elements for this config variable from $newconfig to this config
|
||||
//$this->$var = $this->$var + $newconfig->$var;
|
||||
$this->$var = array_unique(array_merge($this->$var, $newconfig->$var));
|
||||
}
|
||||
// special handling of commands where key is important and config values being appended should not overwrite existing ones
|
||||
foreach (array('http_header') as $var) {
|
||||
$this->$var = array_merge($newconfig->$var, $this->$var);
|
||||
}
|
||||
// special handling of if_page_contains directive
|
||||
foreach (array('single_page_link') as $var) {
|
||||
if (isset($this->if_page_contains[$var]) && isset($newconfig->if_page_contains[$var])) {
|
||||
$this->if_page_contains[$var] = array_merge($newconfig->if_page_contains[$var], $this->if_page_contains[$var]);
|
||||
} elseif (isset($newconfig->if_page_contains[$var])) {
|
||||
$this->if_page_contains[$var] = $newconfig->if_page_contains[$var];
|
||||
}
|
||||
}
|
||||
// check for single statement commands
|
||||
// we do not overwrite existing non null values
|
||||
foreach (array('tidy', 'prune', 'parser', 'autodetect_on_failure') as $var) {
|
||||
@ -213,6 +233,40 @@ class SiteConfig
|
||||
return $key_suffix;
|
||||
}
|
||||
|
||||
// Add test_contains to last test_url
|
||||
public function add_test_contains($test_contains) {
|
||||
if (!empty($this->test_url)) {
|
||||
$test_contains = (string) $test_contains;
|
||||
$key = end($this->test_url);
|
||||
reset($this->test_url);
|
||||
if (isset($this->test_contains[$key])) {
|
||||
$this->test_contains[$key][] = $test_contains;
|
||||
} else {
|
||||
$this->test_contains[$key] = array($test_contains);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Add if_page_contains condition to last single_page_link
|
||||
// TODO: Expand so it can be used with other rules too
|
||||
public function add_if_page_contains_condition($if_page_contains) {
|
||||
if (!empty($this->single_page_link)) {
|
||||
$if_page_contains = (string) $if_page_contains;
|
||||
$key = end($this->single_page_link);
|
||||
reset($this->single_page_link);
|
||||
$this->if_page_contains['single_page_link'][$key] = $if_page_contains;
|
||||
}
|
||||
}
|
||||
|
||||
public function get_if_page_contains_condition($directive_name, $directive_value) {
|
||||
if (isset($this->if_page_contains[$directive_name])) {
|
||||
if (isset($this->if_page_contains[$directive_name][$directive_value])) {
|
||||
return $this->if_page_contains[$directive_name][$directive_value];
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
// returns SiteConfig instance if an appropriate one is found, false otherwise
|
||||
// if $exact_host_match is true, we will not look for wildcard config matches
|
||||
// by default if host is 'test.example.org' we will look for and load '.example.org.txt' if it exists
|
||||
@ -356,12 +410,20 @@ class SiteConfig
|
||||
// check for single statement commands stored as strings
|
||||
} elseif (in_array($command, array('parser'))) {
|
||||
$config->$command = $val;
|
||||
// special treatment for test_contains
|
||||
} elseif (in_array($command, array('test_contains'))) {
|
||||
$config->add_test_contains($val);
|
||||
// special treatment for if_page_contains
|
||||
} elseif (in_array($command, array('if_page_contains'))) {
|
||||
$config->add_if_page_contains_condition($val);
|
||||
// check for replace_string(find): replace
|
||||
} elseif ((substr($command, -1) == ')') && preg_match('!^([a-z0-9_]+)\((.*?)\)$!i', $command, $match)) {
|
||||
if (in_array($match[1], array('replace_string'))) {
|
||||
$command = $match[1];
|
||||
array_push($config->find_string, $match[2]);
|
||||
array_push($config->$command, $val);
|
||||
array_push($config->replace_string, $val);
|
||||
} elseif (in_array($match[1], array('http_header'))) {
|
||||
$_header = strtolower(trim($match[2]));
|
||||
$config->http_header[$_header] = $val;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1,7 +1,7 @@
|
||||
<?php
|
||||
define('RSS2', 1, true);
|
||||
define('JSON', 2, true);
|
||||
define('JSONP', 3, true);
|
||||
define('RSS2', 1);
|
||||
define('JSON', 2);
|
||||
define('JSONP', 3);
|
||||
|
||||
/**
|
||||
* Universal Feed Writer class
|
||||
@ -131,6 +131,11 @@ define('JSONP', 3, true);
|
||||
$simplejson->language = null;
|
||||
$simplejson->url = null;
|
||||
$simplejson->effective_url = null;
|
||||
$simplejson->og_url = null;
|
||||
$simplejson->og_title = null;
|
||||
$simplejson->og_description = null;
|
||||
$simplejson->og_image = null;
|
||||
$simplejson->og_type = null;
|
||||
$simplejson->content = null;
|
||||
// actual values
|
||||
$simplejson->url = $jsonitem->link;
|
||||
@ -151,6 +156,11 @@ define('JSONP', 3, true);
|
||||
if (isset($jsonitem->pubDate)) {
|
||||
$simplejson->date = gmdate(DATE_ATOM, strtotime($jsonitem->pubDate));
|
||||
}
|
||||
if (isset($jsonitem->og_url)) $simplejson->og_url = $jsonitem->og_url;
|
||||
if (isset($jsonitem->og_title)) $simplejson->og_title = $jsonitem->og_title;
|
||||
if (isset($jsonitem->og_description)) $simplejson->og_description = $jsonitem->og_description;
|
||||
if (isset($jsonitem->og_image)) $simplejson->og_image = $jsonitem->og_image;
|
||||
if (isset($jsonitem->og_type)) $simplejson->og_type = $jsonitem->og_type;
|
||||
echo json_encode($simplejson);
|
||||
}
|
||||
}
|
||||
@ -327,7 +337,7 @@ define('JSONP', 3, true);
|
||||
{
|
||||
$out = '<?xml version="1.0" encoding="utf-8"?>'."\n";
|
||||
if ($this->xsl) $out .= '<?xml-stylesheet type="text/xsl" href="'.htmlspecialchars($this->xsl).'"?>' . PHP_EOL;
|
||||
$out .= '<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:media="http://search.yahoo.com/mrss/">' . PHP_EOL;
|
||||
$out .= '<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:media="http://search.yahoo.com/mrss/" xmlns:og="http://ogp.me/ns#">' . PHP_EOL;
|
||||
echo $out;
|
||||
}
|
||||
elseif ($this->version == JSON || $this->version == JSONP)
|
||||
@ -370,7 +380,9 @@ define('JSONP', 3, true);
|
||||
{
|
||||
foreach ($attributes as $key => $value)
|
||||
{
|
||||
$attrText .= " $key=\"".htmlspecialchars($value, ENT_COMPAT, 'UTF-8', false)."\" ";
|
||||
//$attrText .= " $key=\"".htmlspecialchars($value, ENT_COMPAT, 'UTF-8', false)."\" ";
|
||||
// TODO: replace HTML entities not supported in XML with UTF8 equivalent characters
|
||||
$attrText .= " $key=\"".htmlspecialchars($value, ENT_COMPAT, 'UTF-8')."\" ";
|
||||
}
|
||||
}
|
||||
$nodeText .= "<{$tagName}{$attrText}>";
|
||||
@ -384,7 +396,9 @@ define('JSONP', 3, true);
|
||||
else
|
||||
{
|
||||
//$nodeText .= (in_array($tagName, $this->CDATAEncoding))? $tagContent : htmlentities($tagContent);
|
||||
$nodeText .= htmlspecialchars($tagContent, ENT_COMPAT, 'UTF-8', false);
|
||||
//$nodeText .= htmlspecialchars($tagContent, ENT_COMPAT, 'UTF-8', false);
|
||||
// TODO: replace HTML entities not supported in XML with UTF8 equivalent characters
|
||||
$nodeText .= htmlspecialchars($tagContent, ENT_COMPAT, 'UTF-8');
|
||||
}
|
||||
//$nodeText .= (in_array($tagName, $this->CDATAEncoding))? "]]></$tagName>" : "</$tagName>";
|
||||
$nodeText .= "</$tagName>";
|
||||
|
@ -1,8 +1,8 @@
|
||||
<?php
|
||||
|
||||
/*
|
||||
htmLawed 1.1.17, 11 March 2014
|
||||
OOP code, 11 March 2014
|
||||
htmLawed 1.1.19, 19 January 2015
|
||||
OOP code, 19 January 2015
|
||||
Copyright Santosh Patnaik
|
||||
Dual LGPL v3 and GPL v2+ license
|
||||
A PHP Labware internal utility; www.bioinformatics.org/phplabware/internal_utilities/htmLawed
|
||||
@ -478,7 +478,7 @@ while(strlen($a)){
|
||||
break; case 2: // Val
|
||||
if(preg_match('`^((?:"[^"]*")|(?:\'[^\']*\')|(?:\s*[^\s"\']+))(.*)`', $a, $m)){
|
||||
$a = ltrim($m[2]); $m = $m[1]; $w = 1; $mode = 0;
|
||||
$aA[$nm] = trim(($m[0] == '"' or $m[0] == '\'') ? substr($m, 1, -1) : $m);
|
||||
$aA[$nm] = trim(str_replace('<', '<', ($m[0] == '"' or $m[0] == '\'') ? substr($m, 1, -1) : $m));
|
||||
}
|
||||
break;
|
||||
}
|
||||
@ -507,7 +507,7 @@ foreach($aA as $k=>$v){
|
||||
$v = preg_replace_callback('`(url(?:\()(?: )*(?:\'|"|&(?:quot|apos);)?)(.+?)((?:\'|"|&(?:quot|apos);)?(?: )*(?:\)))`iS', 'htmLawed::hl_prot', $v);
|
||||
$v = !$C['css_expression'] ? preg_replace('`expression`i', ' ', preg_replace('`\\\\\S|(/|(%2f))(\*|(%2a))`i', ' ', $v)) : $v;
|
||||
}elseif(isset($aNP[$k]) or strpos($k, 'src') !== false or $k[0] == 'o'){
|
||||
$v = str_replace("\xad", ' ', (strpos($v, '&') !== false ? str_replace(array('­', '­', '­'), ' ', $v) : $v));
|
||||
$v = str_replace("", ' ', (strpos($v, '&') !== false ? str_replace(array('­', '­', '­'), ' ', $v) : $v)); # double-quoted char is soft-hyphen; appears here as "" or hyphen or something else depending on viewing software
|
||||
$v = htmLawed::hl_prot($v, $k);
|
||||
if($k == 'href'){ // X-spam
|
||||
if($C['anti_mail_spam'] && strpos($v, 'mailto:') === 0){
|
||||
@ -701,7 +701,7 @@ return str_replace(array("\x01", "\x02", "\x03", "\x04", "\x05", "\x07"), array(
|
||||
|
||||
public static function hl_version(){
|
||||
// rel
|
||||
return '1.1.17';
|
||||
return '1.1.19';
|
||||
// eof
|
||||
}
|
||||
|
||||
|
@ -60,20 +60,22 @@ class HTML5
|
||||
* The path to the file to parse. If this is a resource, it is
|
||||
* assumed to be an open stream whose pointer is set to the first
|
||||
* byte of input.
|
||||
* @param array $options
|
||||
* Configuration options when parsing the HTML
|
||||
* @return \DOMDocument A DOM document. This object type is defined by the libxml
|
||||
* library, and should have been included with your version of PHP.
|
||||
*/
|
||||
public function load($file)
|
||||
public function load($file, array $options = array())
|
||||
{
|
||||
// Handle the case where file is a resource.
|
||||
if (is_resource($file)) {
|
||||
// FIXME: We need a StreamInputStream class.
|
||||
return $this->loadHTML(stream_get_contents($file));
|
||||
return $this->loadHTML(stream_get_contents($file), $options);
|
||||
}
|
||||
|
||||
$input = new FileInputStream($file);
|
||||
|
||||
return $this->parse($input);
|
||||
return $this->parse($input, $options);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -84,14 +86,16 @@ class HTML5
|
||||
*
|
||||
* @param string $string
|
||||
* A html5 document as a string.
|
||||
* @param array $options
|
||||
* Configuration options when parsing the HTML
|
||||
* @return \DOMDocument A DOM document. DOM is part of libxml, which is included with
|
||||
* almost all distributions of PHP.
|
||||
*/
|
||||
public function loadHTML($string)
|
||||
public function loadHTML($string, array $options = array())
|
||||
{
|
||||
$input = new StringInputStream($string);
|
||||
|
||||
return $this->parse($input);
|
||||
return $this->parse($input, $options);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -104,13 +108,15 @@ class HTML5
|
||||
* The path to the file to parse. If this is a resource, it is
|
||||
* assumed to be an open stream whose pointer is set to the first
|
||||
* byte of input.
|
||||
* @param array $options
|
||||
* Configuration options when parsing the HTML
|
||||
*
|
||||
* @return \DOMDocument A DOM document. This object type is defined by the libxml
|
||||
* library, and should have been included with your version of PHP.
|
||||
*/
|
||||
public function loadHTMLFile($file)
|
||||
public function loadHTMLFile($file, array $options = array())
|
||||
{
|
||||
return $this->load($file);
|
||||
return $this->load($file, $options);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -118,15 +124,17 @@ class HTML5
|
||||
*
|
||||
* @param string $string
|
||||
* The html5 fragment as a string.
|
||||
* @param array $options
|
||||
* Configuration options when parsing the HTML
|
||||
*
|
||||
* @return \DOMDocumentFragment A DOM fragment. The DOM is part of libxml, which is included with
|
||||
* almost all distributions of PHP.
|
||||
*/
|
||||
public function loadHTMLFragment($string)
|
||||
public function loadHTMLFragment($string, array $options = array())
|
||||
{
|
||||
$input = new StringInputStream($string);
|
||||
|
||||
return $this->parseFragment($input);
|
||||
return $this->parseFragment($input, $options);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -155,10 +163,10 @@ class HTML5
|
||||
* Lower-level loading function. This requires an input stream instead
|
||||
* of a string, file, or resource.
|
||||
*/
|
||||
public function parse(\Masterminds\HTML5\Parser\InputStream $input)
|
||||
public function parse(\Masterminds\HTML5\Parser\InputStream $input, array $options = array())
|
||||
{
|
||||
$this->errors = array();
|
||||
$events = new DOMTreeBuilder(false, $this->options);
|
||||
$events = new DOMTreeBuilder(false, array_merge($this->getOptions(), $options));
|
||||
$scanner = new Scanner($input);
|
||||
$parser = new Tokenizer($scanner, $events);
|
||||
|
||||
@ -174,9 +182,9 @@ class HTML5
|
||||
* Lower-level loading function. This requires an input stream instead
|
||||
* of a string, file, or resource.
|
||||
*/
|
||||
public function parseFragment(\Masterminds\HTML5\Parser\InputStream $input)
|
||||
public function parseFragment(\Masterminds\HTML5\Parser\InputStream $input, array $options = array())
|
||||
{
|
||||
$events = new DOMTreeBuilder(true, $this->options);
|
||||
$events = new DOMTreeBuilder(true, array_merge($this->getOptions(), $options));
|
||||
$scanner = new Scanner($input);
|
||||
$parser = new Tokenizer($scanner, $events);
|
||||
|
||||
|
@ -66,6 +66,11 @@ class Elements
|
||||
*/
|
||||
const BLOCK_TAG = 64;
|
||||
|
||||
/**
|
||||
* Indicates that the tag allows only inline elements as child nodes.
|
||||
*/
|
||||
const BLOCK_ONLY_INLINE = 128;
|
||||
|
||||
/**
|
||||
* The HTML5 elements as defined in http://dev.w3.org/html5/markup/elements.html.
|
||||
*
|
||||
@ -120,7 +125,7 @@ class Elements
|
||||
"head" => 1,
|
||||
"header" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG
|
||||
"hgroup" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG
|
||||
"hr" => 73, // NORMAL | VOID_TAG | BLOCK_TAG
|
||||
"hr" => 73, // NORMAL | VOID_TAG
|
||||
"html" => 1,
|
||||
"i" => 1,
|
||||
"iframe" => 3, // NORMAL | TEXT_RAW
|
||||
@ -145,7 +150,7 @@ class Elements
|
||||
"optgroup" => 1,
|
||||
"option" => 1,
|
||||
"output" => 65, // NORMAL | BLOCK_TAG
|
||||
"p" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG
|
||||
"p" => 209, // NORMAL | AUTOCLOSE_P | BLOCK_TAG | BLOCK_ONLY_INLINE
|
||||
"param" => 9, // NORMAL | VOID_TAG
|
||||
"pre" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG
|
||||
"progress" => 1,
|
||||
|
@ -38,6 +38,12 @@ class DOMTreeBuilder implements EventHandler
|
||||
|
||||
const NAMESPACE_XMLNS = 'http://www.w3.org/2000/xmlns/';
|
||||
|
||||
const OPT_DISABLE_HTML_NS = 'disable_html_ns';
|
||||
|
||||
const OPT_TARGET_DOC = 'target_document';
|
||||
|
||||
const OPT_IMPLICIT_NS = 'implicit_namespaces';
|
||||
|
||||
/**
|
||||
* Holds the HTML5 element names that causes a namespace switch
|
||||
*
|
||||
@ -138,6 +144,12 @@ class DOMTreeBuilder implements EventHandler
|
||||
|
||||
protected $insertMode = 0;
|
||||
|
||||
/**
|
||||
* Track if we are in an element that allows only inline child nodes
|
||||
* @var string|null
|
||||
*/
|
||||
protected $onlyInline;
|
||||
|
||||
/**
|
||||
* Quirks mode is enabled by default.
|
||||
* Any document that is missing the
|
||||
@ -151,13 +163,17 @@ class DOMTreeBuilder implements EventHandler
|
||||
{
|
||||
$this->options = $options;
|
||||
|
||||
$impl = new \DOMImplementation();
|
||||
// XXX:
|
||||
// Create the doctype. For now, we are always creating HTML5
|
||||
// documents, and attempting to up-convert any older DTDs to HTML5.
|
||||
$dt = $impl->createDocumentType('html');
|
||||
// $this->doc = \DOMImplementation::createDocument(NULL, 'html', $dt);
|
||||
$this->doc = $impl->createDocument(null, null, $dt);
|
||||
if (isset($options[self::OPT_TARGET_DOC])) {
|
||||
$this->doc = $options[self::OPT_TARGET_DOC];
|
||||
} else {
|
||||
$impl = new \DOMImplementation();
|
||||
// XXX:
|
||||
// Create the doctype. For now, we are always creating HTML5
|
||||
// documents, and attempting to up-convert any older DTDs to HTML5.
|
||||
$dt = $impl->createDocumentType('html');
|
||||
// $this->doc = \DOMImplementation::createDocument(NULL, 'html', $dt);
|
||||
$this->doc = $impl->createDocument(null, null, $dt);
|
||||
}
|
||||
$this->errors = array();
|
||||
|
||||
$this->current = $this->doc; // ->documentElement;
|
||||
@ -165,8 +181,15 @@ class DOMTreeBuilder implements EventHandler
|
||||
// Create a rules engine for tags.
|
||||
$this->rules = new TreeBuildingRules($this->doc);
|
||||
|
||||
$implicitNS = array();
|
||||
if (isset($this->options[self::OPT_IMPLICIT_NS])) {
|
||||
$implicitNS = $this->options[self::OPT_IMPLICIT_NS];
|
||||
} elseif (isset($this->options["implicitNamespaces"])) {
|
||||
$implicitNS = $this->options["implicitNamespaces"];
|
||||
}
|
||||
|
||||
// Fill $nsStack with the defalut HTML5 namespaces, plus the "implicitNamespaces" array taken form $options
|
||||
array_unshift($this->nsStack, (isset($this->options["implicitNamespaces"]) ? $this->options["implicitNamespaces"] : array()) + array(
|
||||
array_unshift($this->nsStack, $implicitNS + array(
|
||||
'' => self::NAMESPACE_HTML
|
||||
) + $this->implicitNamespaces);
|
||||
|
||||
@ -320,6 +343,11 @@ class DOMTreeBuilder implements EventHandler
|
||||
}
|
||||
}
|
||||
|
||||
if ($this->onlyInline && Elements::isA($lname, Elements::BLOCK_TAG)) {
|
||||
$this->autoclose($this->onlyInline);
|
||||
$this->onlyInline = null;
|
||||
}
|
||||
|
||||
try {
|
||||
$prefix = ($pos = strpos($lname, ':')) ? substr($lname, 0, $pos) : '';
|
||||
|
||||
@ -334,10 +362,10 @@ class DOMTreeBuilder implements EventHandler
|
||||
$ele = $this->doc->importNode($frag->documentElement, true);
|
||||
|
||||
} else {
|
||||
if (isset($this->nsStack[0][$prefix])) {
|
||||
$ele = $this->doc->createElementNS($this->nsStack[0][$prefix], $lname);
|
||||
} else {
|
||||
if (!isset($this->nsStack[0][$prefix]) || ($prefix === "" && isset($this->options[self::OPT_DISABLE_HTML_NS]) && $this->options[self::OPT_DISABLE_HTML_NS])) {
|
||||
$ele = $this->doc->createElement($lname);
|
||||
} else {
|
||||
$ele = $this->doc->createElementNS($this->nsStack[0][$prefix], $lname);
|
||||
}
|
||||
}
|
||||
|
||||
@ -346,6 +374,10 @@ class DOMTreeBuilder implements EventHandler
|
||||
$ele = $this->doc->createElement('invalid');
|
||||
}
|
||||
|
||||
if (Elements::isA($lname, Elements::BLOCK_ONLY_INLINE)) {
|
||||
$this->onlyInline = $lname;
|
||||
}
|
||||
|
||||
// When we add some namespacess, we have to track them. Later, when "endElement" is invoked, we have to remove them.
|
||||
// When we are on a void tag, we do not need to care about namesapce nesting.
|
||||
if ($pushes > 0 && !Elements::isA($name, Elements::VOID_TAG)) {
|
||||
@ -394,7 +426,7 @@ class DOMTreeBuilder implements EventHandler
|
||||
}
|
||||
|
||||
// Some elements have special processing rules. Handle those separately.
|
||||
if ($this->rules->hasRules($name)) {
|
||||
if ($this->rules->hasRules($name) && $this->frag !== $this->current) {
|
||||
$this->current = $this->rules->evaluate($ele, $this->current);
|
||||
} // Otherwise, it's a standard element.
|
||||
else {
|
||||
@ -649,4 +681,4 @@ class DOMTreeBuilder implements EventHandler
|
||||
{
|
||||
return $this->current->tagName == $tagname;
|
||||
}
|
||||
}
|
||||
}
|
@ -11,9 +11,9 @@ class Scanner
|
||||
|
||||
const CHARS_HEX = 'abcdefABCDEF01234567890';
|
||||
|
||||
const CHARS_ALNUM = 'abcdefAghijklmnopqrstuvwxyABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890';
|
||||
const CHARS_ALNUM = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890';
|
||||
|
||||
const CHARS_ALPHA = 'abcdefAghijklmnopqrstuvwxyABCDEFGHIJKLMNOPQRSTUVWXYZ';
|
||||
const CHARS_ALPHA = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ';
|
||||
|
||||
protected $is;
|
||||
|
||||
|
@ -200,10 +200,12 @@ class Tokenizer
|
||||
if (is_null($this->untilTag)) {
|
||||
return $this->text();
|
||||
}
|
||||
$sequence = '</' . $this->untilTag . '>';
|
||||
$sequence = '</' . $this->untilTag;
|
||||
$txt = '';
|
||||
$tok = $this->scanner->current();
|
||||
while ($tok !== false && ! ($tok == '<' && ($this->sequenceMatches($sequence) || $this->sequenceMatches(strtoupper($sequence))))) {
|
||||
|
||||
$caseSensitive = !Elements::isHtml5Element($this->untilTag);
|
||||
while ($tok !== false && ! ($tok == '<' && ($this->sequenceMatches($sequence, $caseSensitive)))) {
|
||||
if ($tok == '&') {
|
||||
$txt .= $this->decodeCharacterReference();
|
||||
$tok = $this->scanner->current();
|
||||
@ -212,6 +214,13 @@ class Tokenizer
|
||||
$tok = $this->scanner->next();
|
||||
}
|
||||
}
|
||||
$len = strlen($sequence);
|
||||
$this->scanner->consume($len);
|
||||
$len += strlen($this->scanner->whitespace());
|
||||
if ($this->scanner->current() !== '>') {
|
||||
$this->parseError("Unclosed RCDATA end tag");
|
||||
}
|
||||
$this->scanner->unconsume($len);
|
||||
$this->events->text($txt);
|
||||
$this->setTextMode(0);
|
||||
return $this->endTag();
|
||||
@ -353,7 +362,7 @@ class Tokenizer
|
||||
}
|
||||
|
||||
// We know this is at least one char.
|
||||
$name = strtolower($this->scanner->charsWhile(":0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"));
|
||||
$name = strtolower($this->scanner->charsWhile(":_-0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"));
|
||||
$attributes = array();
|
||||
$selfClose = false;
|
||||
|
||||
@ -891,7 +900,7 @@ class Tokenizer
|
||||
$buffer .= $this->scanner->charsUntil($first);
|
||||
|
||||
// Stop as soon as we hit the stopping condition.
|
||||
if ($this->sequenceMatches($sequence) || $this->sequenceMatches(strtoupper($sequence))) {
|
||||
if ($this->sequenceMatches($sequence, false)) {
|
||||
return $buffer;
|
||||
}
|
||||
$buffer .= $this->scanner->current();
|
||||
@ -916,7 +925,7 @@ class Tokenizer
|
||||
* see if the input stream is at the start of a
|
||||
* '</script>' string.
|
||||
*/
|
||||
protected function sequenceMatches($sequence)
|
||||
protected function sequenceMatches($sequence, $caseSensitive = true)
|
||||
{
|
||||
$len = strlen($sequence);
|
||||
$buffer = '';
|
||||
@ -932,7 +941,7 @@ class Tokenizer
|
||||
}
|
||||
|
||||
$this->scanner->unconsume($len);
|
||||
return $buffer == $sequence;
|
||||
return $caseSensitive ? $buffer == $sequence : strcasecmp($buffer, $sequence) === 0;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -1056,8 +1065,14 @@ class Tokenizer
|
||||
// [a-zA-Z0-9]+;
|
||||
$cname = $this->scanner->getAsciiAlpha();
|
||||
$entity = CharacterReference::lookupName($cname);
|
||||
|
||||
// When no entity is found provide the name of the unmatched string
|
||||
// and continue on as the & is not part of an entity. The & will
|
||||
// be converted to & elsewhere.
|
||||
if ($entity == null) {
|
||||
$this->parseError("No match in entity table for '%s'", $entity);
|
||||
$this->parseError("No match in entity table for '%s'", $cname);
|
||||
$this->scanner->unconsume($this->scanner->position() - $start);
|
||||
return '&';
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -115,8 +115,10 @@ class OutputRules implements \Masterminds\HTML5\Serializer\RulesInterface
|
||||
public function document($dom)
|
||||
{
|
||||
$this->doctype();
|
||||
$this->traverser->node($dom->documentElement);
|
||||
$this->nl();
|
||||
if ($dom->documentElement) {
|
||||
$this->traverser->node($dom->documentElement);
|
||||
$this->nl();
|
||||
}
|
||||
}
|
||||
|
||||
protected function doctype()
|
||||
|
@ -112,7 +112,7 @@ class Traverser
|
||||
break;
|
||||
// Currently we don't support embedding DTDs.
|
||||
default:
|
||||
print '<!-- Skipped -->';
|
||||
//print '<!-- Skipped -->';
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -2,8 +2,9 @@
|
||||
|
||||
Copyright (c) 2013 The Authors of HTML5-PHP
|
||||
|
||||
Matt Butcher - technosophos@gmail.com
|
||||
Matt Butcher - mattbutcher@google.com
|
||||
Matt Farina - matt@mattfarina.com
|
||||
Asmir Mustafic - goetas@gmail.com
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
this software and associated documentation files (the "Software"), to deal in
|
||||
|
@ -10,6 +10,7 @@ But after some initial refactoring work, we began a new parser.
|
||||
- Event-based (SAX-like) parser
|
||||
- DOM tree builder
|
||||
- Interoperability with QueryPath [[in progress](https://github.com/technosophos/querypath/issues/114)]
|
||||
- Runs on **PHP** 5.3.0 or newer and **HHVM** 3.2 or newer
|
||||
|
||||
[![Build Status](https://travis-ci.org/Masterminds/html5-php.png?branch=master)](https://travis-ci.org/Masterminds/html5-php) [![Latest Stable Version](https://poser.pugx.org/masterminds/html5/v/stable.png)](https://packagist.org/packages/masterminds/html5) [![Coverage Status](https://coveralls.io/repos/Masterminds/html5-php/badge.png?branch=master)](https://coveralls.io/r/Masterminds/html5-php?branch=master)
|
||||
|
||||
@ -22,12 +23,12 @@ To install, add `masterminds/html5` to your `composer.json` file:
|
||||
```
|
||||
{
|
||||
"require" : {
|
||||
"masterminds/html5": "1.*"
|
||||
"masterminds/html5": "2.*"
|
||||
},
|
||||
}
|
||||
```
|
||||
|
||||
(You may substitute `1.*` for a more specific release tag, of
|
||||
(You may substitute `2.*` for a more specific release tag, of
|
||||
course.)
|
||||
|
||||
From there, use the `composer install` or `composer update` commands to
|
||||
@ -43,6 +44,7 @@ Here is how you use the high-level `HTML5` library API:
|
||||
<?php
|
||||
// Assuming you installed from Composer:
|
||||
require "vendor/autoload.php";
|
||||
use Masterminds\HTML5;
|
||||
|
||||
|
||||
// An example HTML document:
|
||||
@ -59,13 +61,14 @@ $html = <<< 'HERE'
|
||||
HERE;
|
||||
|
||||
// Parse the document. $dom is a DOMDocument.
|
||||
$dom = HTML5::loadHTML($html);
|
||||
$html5 = new HTML5();
|
||||
$dom = $html5->loadHTML($html);
|
||||
|
||||
// Render it as HTML5:
|
||||
print HTML5::saveHTML($dom);
|
||||
print $html5->saveHTML($dom);
|
||||
|
||||
// Or save it to a file:
|
||||
HTML5::save($dom, 'out.html');
|
||||
$html5->save($dom, 'out.html');
|
||||
|
||||
?>
|
||||
```
|
||||
@ -73,6 +76,35 @@ HTML5::save($dom, 'out.html');
|
||||
The `$dom` created by the parser is a full `DOMDocument` object. And the
|
||||
`save()` and `saveHTML()` methods will take any DOMDocument.
|
||||
|
||||
### Options
|
||||
|
||||
It is possible to pass in an array of configuration options when loading
|
||||
an HTML5 document.
|
||||
|
||||
```php
|
||||
// An associative array of options
|
||||
$options = array(
|
||||
'option_name' => 'option_value',
|
||||
);
|
||||
|
||||
// Provide the options to the constructor
|
||||
$html5 = new HTML5($options);
|
||||
|
||||
$dom = $html5->loadHTML($html);
|
||||
```
|
||||
|
||||
The following options are supported:
|
||||
|
||||
* `encode_entities` (boolean): Indicates that the serializer should aggressively
|
||||
encode characters as entities. Without this, it only encodes the bare
|
||||
minimum.
|
||||
* `disable_html_ns` (boolean): Prevents the parser from automatically
|
||||
assigning the HTML5 namespace to the DOM document. This is for
|
||||
non-namespace aware DOM tools.
|
||||
* `target_document` (\DOMDocument): A DOM document that will be used as the
|
||||
destination for the parsed nodes.
|
||||
* `implicit_namespaces` (array): An assoc array of namespaces that should be
|
||||
used by the parser. Name is tag prefix, value is NS URI.
|
||||
|
||||
## The Low-Level API
|
||||
|
||||
@ -116,7 +148,7 @@ different rule sets to be used.
|
||||
- The `Traverser`, which is a special-purpose tree walker. It visits
|
||||
each node in the tree and uses the `OutputRules` to transform the node
|
||||
into a string.
|
||||
- `\HTML5` manages the `Traverser` and stores the resultant data
|
||||
- `HTML5` manages the `Traverser` and stores the resultant data
|
||||
in the correct place.
|
||||
|
||||
The serializer (`save()`, `saveHTML()`) follows the
|
||||
@ -134,7 +166,9 @@ issues known issues that are not presently on the roadmap:
|
||||
|
||||
- Namespaces: HTML5 only [supports a selected list of namespaces](http://www.w3.org/TR/html5/infrastructure.html#namespaces)
|
||||
and they do not operate in the same way as XML namespaces. A `:` has no special
|
||||
meaning. The parser does not support XML style namespaces via `:`.
|
||||
meaning.
|
||||
By default the parser does not support XML style namespaces via `:`;
|
||||
to enable the XML namespaces see the [XML Namespaces section](#xml-namespaces)
|
||||
- Scripts: This parser does not contain a JavaScript or a CSS
|
||||
interpreter. While one may be supplied, not all features will be
|
||||
supported.
|
||||
@ -162,8 +196,45 @@ issues known issues that are not presently on the roadmap:
|
||||
- PLAINTEXT: Unsupported.
|
||||
- Adoption Agency Algorithm: Not yet implemented. (8.2.5.4.7)
|
||||
|
||||
## XML Namespaces
|
||||
|
||||
To use XML style namespaces, you have to configure the main `HTML5` instance accordingly.
|
||||
|
||||
```php
|
||||
use Masterminds\HTML5;
|
||||
$html = new HTML5(array(
|
||||
"xmlNamespaces" => true
|
||||
));
|
||||
|
||||
$dom = $html->loadHTML('<t:tag xmlns:t="http://www.example.com"/>');
|
||||
|
||||
$dom->documentElement->namespaceURI; // http://www.example.com
|
||||
|
||||
```
|
||||
|
||||
You can also add some default prefixes that will not require the namespace declaration,
|
||||
but whose elements will still be namespaced.
|
||||
|
||||
```php
|
||||
use Masterminds\HTML5;
|
||||
$html = new HTML5(array(
|
||||
"implicitNamespaces"=>array(
|
||||
"t"=>"http://www.example.com"
|
||||
)
|
||||
));
|
||||
|
||||
$dom = $html->loadHTML('<t:tag/>');
|
||||
|
||||
$dom->documentElement->namespaceURI; // http://www.example.com
|
||||
|
||||
```
|
||||
|
||||
## Thanks to...
|
||||
|
||||
The dedicated (and patient) contributors of patches small and large,
|
||||
who have already made this library better. See the CREDITS file for
|
||||
a list of contributors.
|
||||
|
||||
We owe a huge debt of gratitude to the original authors of html5lib.
|
||||
|
||||
While not much of the original parser remains, we learned a lot from
|
||||
|
@ -1,5 +1,42 @@
|
||||
# Release Notes
|
||||
|
||||
2.1.1 (2015-03-23)
|
||||
- #78: Fixes bug where unmatched entity like string drops everything after &.
|
||||
|
||||
2.1.0 (2015-02-01)
|
||||
- #74: Added `disable_html_ns` and `target_document` DOM parsing options
|
||||
- Unified option names
|
||||
- #73: Fixed alphabet, ß now can be detected
|
||||
- #75 and #76: Allow whitespace in RCDATA tags
|
||||
- #77: Fixed parsing blunder for json embeds
|
||||
- #72: Add options to HTML methods
|
||||
|
||||
2.0.2 (2014-12-17)
|
||||
- #50: empty document handling
|
||||
- #63: tags with strange capitalization
|
||||
- #65: dashes and underscores as allowed characters in tag names
|
||||
- #68: Fixed issue with non-inline elements inside inline containers
|
||||
|
||||
2.0.1 (2014-09-23)
|
||||
- #59: Fixed issue parsing some fragments.
|
||||
- #56: Incorrectly saw 0 as empty string
|
||||
- Sami as new documentation generator
|
||||
|
||||
2.0.0 (2014-07-28)
|
||||
- #53: Improved boolean attributes handling
|
||||
- #52: Facebook HHVM compatibility
|
||||
- #48: Adopted PSR-2 as coding standard
|
||||
- #47: Moved everything to Masterminds namespace
|
||||
- #45: Added custom namespaces
|
||||
- #44: Added support to XML-style namespaces
|
||||
- #37: Refactored HTML5 class removing static methods
|
||||
|
||||
1.0.5 (2014-06-10)
|
||||
- #38: Set the dev-master branch as the 1.0.x branch for composer (goetas)
|
||||
- #34: Tests use PSR-4 for autoloading. (goetas)
|
||||
- #40, #41: Fix entity handling in RCDATA sections. (KitaitiMakoto)
|
||||
- #32: Fixed issue where character references were being incorrectly encoded in style tags.
|
||||
|
||||
1.0.4 (2014-04-29)
|
||||
- #30/#31 Don't throw an exception for invalid tag names.
|
||||
|
||||
|
@ -7,11 +7,11 @@
|
||||
* For environments which do not have these options, it reverts to standard sequential
|
||||
* requests (using file_get_contents())
|
||||
*
|
||||
* @version 1.5
|
||||
* @date 2014-03-28
|
||||
* @see http://php.net/HttpRequestPool
|
||||
* @version 1.6
|
||||
* @date 2015-06-05
|
||||
* @see http://devel-m6w6.rhcloud.com/mdref/http
|
||||
* @author Keyvan Minoukadeh
|
||||
* @copyright 2011-2014 Keyvan Minoukadeh
|
||||
* @copyright 2011-2015 Keyvan Minoukadeh
|
||||
* @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
|
||||
*/
|
||||
|
||||
@ -22,7 +22,7 @@ class HumbleHttpAgent
|
||||
const METHOD_FILE_GET_CONTENTS = 4;
|
||||
//const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1';
|
||||
const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2';
|
||||
const UA_PHP = 'PHP/5.4';
|
||||
const UA_PHP = 'PHP/5.5';
|
||||
const REF_GOOGLE = 'http://www.google.co.uk/url?sa=t&source=web&cd=1';
|
||||
|
||||
protected $requests = array();
|
||||
@ -38,6 +38,7 @@ class HumbleHttpAgent
|
||||
public $debug = false;
|
||||
public $debugVerbose = false;
|
||||
public $rewriteHashbangFragment = true; // see http://code.google.com/web/ajaxcrawling/docs/specification.html
|
||||
public $siteConfigBuilder = null; // can be set to an instance of ContentExtractor to have site config files used for custom HTTP headers
|
||||
public $maxRedirects = 5;
|
||||
public $userAgentMap = array();
|
||||
public $rewriteUrls = array();
|
||||
@ -67,7 +68,7 @@ class HumbleHttpAgent
|
||||
if (in_array($method, array(1,2,4))) {
|
||||
$this->method = $method;
|
||||
} else {
|
||||
if (class_exists('HttpRequestPool')) {
|
||||
if (class_exists('http\Client\Request')) {
|
||||
$this->method = self::METHOD_REQUEST_POOL;
|
||||
} elseif (function_exists('curl_multi_init')) {
|
||||
$this->method = self::METHOD_CURL_MULTI;
|
||||
@ -192,6 +193,7 @@ class HumbleHttpAgent
|
||||
return false;
|
||||
}
|
||||
$redirect_url = $match[1];
|
||||
$redirect_url = htmlspecialchars_decode($redirect_url); // For Facebook!
|
||||
if (preg_match('!^https?://!i', $redirect_url)) {
|
||||
// already absolute
|
||||
$this->debug('Meta refresh redirect found (http-equiv="refresh"), new URL: '.$redirect_url);
|
||||
@ -203,7 +205,7 @@ class HumbleHttpAgent
|
||||
if (isset($base->path)) $base->path = preg_replace('!//+!', '/', $base->path);
|
||||
if ($absolute = SimplePie_IRI::absolutize($base, $redirect_url)) {
|
||||
$this->debug('Meta refresh redirect found (http-equiv="refresh"), new URL: '.$absolute);
|
||||
return $absolute;
|
||||
return $absolute->get_iri();
|
||||
}
|
||||
return false;
|
||||
}
|
||||
@ -293,14 +295,16 @@ class HumbleHttpAgent
|
||||
if (empty($urls)) return;
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
// parallel (HttpRequestPool)
|
||||
// parallel (HTTP extension)
|
||||
if ($this->method == self::METHOD_REQUEST_POOL) {
|
||||
$this->debug('Starting parallel fetch (HttpRequestPool)');
|
||||
$this->debug('Starting parallel fetch (HTTP Extension)');
|
||||
try {
|
||||
while (count($urls) > 0) {
|
||||
$this->debug('Processing set of '.min($this->maxParallelRequests, count($urls)));
|
||||
$subset = array_splice($urls, 0, $this->maxParallelRequests);
|
||||
$pool = new HttpRequestPool();
|
||||
//$pool = new HttpRequestPool();
|
||||
$pool = new http\Client;
|
||||
$pool->setOptions($this->requestOptions);
|
||||
foreach ($subset as $orig => $url) {
|
||||
if (!$isRedirect) $orig = $url;
|
||||
unset($this->redirectQueue[$orig]);
|
||||
@ -320,24 +324,62 @@ class HumbleHttpAgent
|
||||
$req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url;
|
||||
$req_url = $this->removeFragment($req_url);
|
||||
if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) {
|
||||
$_meth = HttpRequest::METH_HEAD;
|
||||
$_meth = "HEAD";
|
||||
} else {
|
||||
$_meth = HttpRequest::METH_GET;
|
||||
$_meth = "GET";
|
||||
unset($this->requests[$orig]['wrongGuess']);
|
||||
}
|
||||
$httpRequest = new HttpRequest($req_url, $_meth, $this->requestOptions);
|
||||
// send cookies, if we have any
|
||||
if ($cookies = $this->getCookies($orig, $req_url)) {
|
||||
$this->debug("......sending cookies: $cookies");
|
||||
$httpRequest->addHeaders(array('Cookie' => $cookies));
|
||||
//$httpRequest = new HttpRequest($req_url, $_meth, $this->requestOptions);
|
||||
$httpRequest = new http\Client\Request($_meth, $req_url);
|
||||
$httpRequest->setOptions($this->requestOptions);
|
||||
|
||||
// check site config for additional http headers
|
||||
$scHeaders = array();
|
||||
if (isset($this->siteConfigBuilder)) {
|
||||
$scHeaders = $this->siteConfigBuilder->buildSiteConfig($req_url)->http_header;
|
||||
}
|
||||
//$httpRequest->addHeaders(array('User-Agent' => $this->userAgent));
|
||||
$httpRequest->addHeaders($this->getUserAgent($req_url, true));
|
||||
|
||||
// send cookies, if we have any
|
||||
$_cookies = null;
|
||||
if (isset($scHeaders['cookie'])) {
|
||||
$_cookies = $scHeaders['cookie'];
|
||||
} else {
|
||||
//$_cookies = $this->cookieJar->getMatchingCookies($req_url);
|
||||
$_cookies = $this->getCookies($orig, $req_url);
|
||||
}
|
||||
if ($_cookies) {
|
||||
$this->debug("......sending cookies: $_cookies");
|
||||
$httpRequest->addHeaders(array('Cookie' => $_cookies));
|
||||
}
|
||||
|
||||
// send user agent
|
||||
$_ua = null;
|
||||
if (isset($scHeaders['user-agent'])) {
|
||||
$_ua = $scHeaders['user-agent'];
|
||||
} else {
|
||||
$_ua = $this->getUserAgent($req_url, true);
|
||||
$_ua = $_ua['User-Agent'];
|
||||
}
|
||||
if ($_ua) {
|
||||
$this->debug("......user-agent set to: $_ua");
|
||||
$httpRequest->addHeaders(array('User-Agent' => $_ua));
|
||||
}
|
||||
|
||||
// add referer for picky sites
|
||||
$httpRequest->addheaders(array('Referer' => $this->referer));
|
||||
$_referer = null;
|
||||
if (isset($scHeaders['referer'])) {
|
||||
$_referer = $scHeaders['referer'];
|
||||
} else {
|
||||
$_referer = $this->referer;
|
||||
}
|
||||
if ($_referer) {
|
||||
$this->debug("......referer set to: $_referer");
|
||||
$httpRequest->addheaders(array('Referer'=>$_referer));
|
||||
}
|
||||
|
||||
$this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest);
|
||||
$this->requests[$orig]['original_url'] = $orig;
|
||||
$pool->attach($httpRequest);
|
||||
$pool->enqueue($httpRequest);
|
||||
}
|
||||
}
|
||||
// did we get anything into the pool?
|
||||
@ -345,16 +387,20 @@ class HumbleHttpAgent
|
||||
$this->debug('Sending request...');
|
||||
try {
|
||||
$pool->send();
|
||||
} catch (HttpRequestPoolException $e) {
|
||||
} catch (http\Exception $e) {
|
||||
// do nothing
|
||||
}
|
||||
$this->debug('Received responses');
|
||||
foreach($subset as $orig => $url) {
|
||||
if (!$isRedirect) $orig = $url;
|
||||
$request = $this->requests[$orig]['httpRequest'];
|
||||
$response = $pool->getResponse($request);
|
||||
//$this->requests[$orig]['headers'] = $this->headersToString($request->getResponseHeader());
|
||||
// getResponseHeader() doesn't return status line, so, for consistency...
|
||||
$this->requests[$orig]['headers'] = substr($request->getRawResponseMessage(), 0, $request->getResponseInfo('header_size'));
|
||||
//$headers = $response->toString();
|
||||
$this->requests[$orig]['headers'] = $response->getInfo()."\n".$this->headersToString($response->getHeaders(), true);
|
||||
// v1 HTTP extension code
|
||||
//$this->requests[$orig]['headers'] = substr($request->getRawResponseMessage(), 0, $request->getResponseInfo('header_size'));
|
||||
// check content type
|
||||
// TODO: use getResponseHeader('content-type') or getResponseInfo()
|
||||
if ($this->headerOnlyType($this->requests[$orig]['headers'])) {
|
||||
@ -362,25 +408,37 @@ class HumbleHttpAgent
|
||||
$_header_only_type = true;
|
||||
$this->debug('Header only type returned');
|
||||
} else {
|
||||
$this->requests[$orig]['body'] = $request->getResponseBody();
|
||||
$this->requests[$orig]['body'] = $response->getBody()->toString();
|
||||
//var_dump($this->requests[$orig]['body']);exit;
|
||||
// v1 HTTP ext. code
|
||||
//$this->requests[$orig]['body'] = $request->getResponseBody();
|
||||
$_header_only_type = false;
|
||||
}
|
||||
$this->requests[$orig]['effective_url'] = $request->getResponseInfo('effective_url');
|
||||
$this->requests[$orig]['status_code'] = $status_code = $request->getResponseCode();
|
||||
$this->requests[$orig]['effective_url'] = $response->getTransferInfo('effective_url');
|
||||
$this->requests[$orig]['status_code'] = $status_code = $response->getResponseCode();
|
||||
// v1 HTTP ext. code
|
||||
//$this->requests[$orig]['effective_url'] = $request->getResponseInfo('effective_url');
|
||||
//$this->requests[$orig]['status_code'] = $status_code = $request->getResponseCode();
|
||||
// is redirect?
|
||||
if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && $request->getResponseHeader('location')) {
|
||||
$redirectURL = $request->getResponseHeader('location');
|
||||
if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && $response->getHeader('location')) {
|
||||
// v1 HTTP ext. code
|
||||
//if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && $request->getResponseHeader('location')) {
|
||||
$redirectURL = $response->getHeader('location');
|
||||
if (!preg_match('!^https?://!i', $redirectURL)) {
|
||||
$redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
|
||||
}
|
||||
if ($this->validateURL($redirectURL)) {
|
||||
$this->debug('Redirect detected. Valid URL: '.$redirectURL);
|
||||
// store any cookies
|
||||
//$cookies = $request->getResponseHeader('set-cookie');
|
||||
//if ($cookies && !is_array($cookies)) $cookies = array($cookies);
|
||||
//if ($cookies) $this->cookieJar->storeCookies($url, $cookies);
|
||||
$this->storeCookies($orig, $url);
|
||||
$this->redirectQueue[$orig] = $redirectURL;
|
||||
} else {
|
||||
$this->debug('Redirect detected. Invalid URL: '.$redirectURL);
|
||||
}
|
||||
} elseif (!$_header_only_type && $request->getMethod() === HttpRequest::METH_HEAD) {
|
||||
} elseif (!$_header_only_type && $request->getRequestMethod() == "HEAD") {
|
||||
// the response content-type did not match our 'header only' types,
|
||||
// but we'd issues a HEAD request because we assumed it would. So
|
||||
// let's queue a proper GET request for this item...
|
||||
@ -399,7 +457,7 @@ class HumbleHttpAgent
|
||||
}
|
||||
}
|
||||
//die($url.' -multi- '.$request->getResponseInfo('effective_url'));
|
||||
$pool->detach($request);
|
||||
$pool->dequeue($request);
|
||||
unset($this->requests[$orig]['httpRequest'], $request);
|
||||
/*
|
||||
if ($this->minimiseMemoryUse) {
|
||||
@ -411,7 +469,7 @@ class HumbleHttpAgent
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (HttpException $e) {
|
||||
} catch (http\Exception $e) {
|
||||
$this->debug($e);
|
||||
return false;
|
||||
}
|
||||
@ -450,17 +508,53 @@ class HumbleHttpAgent
|
||||
} else {
|
||||
$_meth = 'GET';
|
||||
unset($this->requests[$orig]['wrongGuess']);
|
||||
}
|
||||
$headers = array();
|
||||
//$headers[] = 'User-Agent: '.$this->userAgent;
|
||||
$headers[] = $this->getUserAgent($req_url);
|
||||
// add referer for picky sites
|
||||
$headers[] = 'Referer: '.$this->referer;
|
||||
// send cookies, if we have any
|
||||
if ($cookies = $this->getCookies($orig, $req_url)) {
|
||||
$this->debug("......sending cookies: $cookies");
|
||||
$headers[] = 'Cookie: '.$cookies;
|
||||
}
|
||||
$headers = array();
|
||||
|
||||
// check site config for additional http headers
|
||||
$scHeaders = array();
|
||||
if (isset($this->siteConfigBuilder)) {
|
||||
$scHeaders = $this->siteConfigBuilder->buildSiteConfig($req_url)->http_header;
|
||||
}
|
||||
|
||||
// send cookies, if we have any
|
||||
$_cookies = null;
|
||||
if (isset($scHeaders['cookie'])) {
|
||||
$_cookies = $scHeaders['cookie'];
|
||||
} else {
|
||||
//$_cookies = $this->cookieJar->getMatchingCookies($req_url);
|
||||
$_cookies = $this->getCookies($orig, $req_url);
|
||||
}
|
||||
if ($_cookies) {
|
||||
$this->debug("......sending cookies: $_cookies");
|
||||
$headers[] = 'Cookie: '.$_cookies;
|
||||
}
|
||||
|
||||
// send user agent
|
||||
$_ua = null;
|
||||
if (isset($scHeaders['user-agent'])) {
|
||||
$_ua = $scHeaders['user-agent'];
|
||||
} else {
|
||||
$_ua = $this->getUserAgent($req_url, true);
|
||||
$_ua = $_ua['User-Agent'];
|
||||
}
|
||||
if ($_ua) {
|
||||
$this->debug("......user-agent set to: $_ua");
|
||||
$headers[] = 'User-Agent: '.$_ua;
|
||||
}
|
||||
|
||||
// add referer for picky sites
|
||||
$_referer = null;
|
||||
if (isset($scHeaders['referer'])) {
|
||||
$_referer = $scHeaders['referer'];
|
||||
} else {
|
||||
$_referer = $this->referer;
|
||||
}
|
||||
if ($_referer) {
|
||||
$this->debug("......referer set to: $_referer");
|
||||
$headers[] = 'Referer: '.$_referer;
|
||||
}
|
||||
|
||||
$httpRequest = new RollingCurlRequest($req_url, $_meth, null, $headers, $this->curlOptions);
|
||||
$httpRequest->set_original_url($orig);
|
||||
$this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest);
|
||||
@ -494,7 +588,10 @@ class HumbleHttpAgent
|
||||
}
|
||||
if ($this->validateURL($redirectURL)) {
|
||||
$this->debug('Redirect detected. Valid URL: '.$redirectURL);
|
||||
$this->storeCookies($orig, $url);
|
||||
// store any cookies
|
||||
//$cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']);
|
||||
//if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies);
|
||||
$this->storeCookies($orig, $url);
|
||||
$this->redirectQueue[$orig] = $redirectURL;
|
||||
} else {
|
||||
$this->debug('Redirect detected. Invalid URL: '.$redirectURL);
|
||||
@ -548,15 +645,52 @@ class HumbleHttpAgent
|
||||
$req_url = $this->rewriteUrls($url);
|
||||
$req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url;
|
||||
$req_url = $this->removeFragment($req_url);
|
||||
// send cookies, if we have any
|
||||
$httpContext = $this->httpContext;
|
||||
$httpContext['http']['header'] .= $this->getUserAgent($req_url)."\r\n";
|
||||
// add referer for picky sites
|
||||
$httpContext['http']['header'] .= 'Referer: '.$this->referer."\r\n";
|
||||
if ($cookies = $this->getCookies($orig, $req_url)) {
|
||||
$this->debug("......sending cookies: $cookies");
|
||||
$httpContext['http']['header'] .= 'Cookie: '.$cookies."\r\n";
|
||||
|
||||
// check site config for additional http headers
|
||||
$scHeaders = array();
|
||||
if (isset($this->siteConfigBuilder)) {
|
||||
$scHeaders = $this->siteConfigBuilder->buildSiteConfig($req_url)->http_header;
|
||||
}
|
||||
|
||||
// send cookies, if we have any
|
||||
$_cookies = null;
|
||||
if (isset($scHeaders['cookie'])) {
|
||||
$_cookies = $scHeaders['cookie'];
|
||||
} else {
|
||||
//$_cookies = $this->cookieJar->getMatchingCookies($req_url);
|
||||
$_cookies = $this->getCookies($orig, $req_url);
|
||||
}
|
||||
if ($_cookies) {
|
||||
$this->debug("......sending cookies: $_cookies");
|
||||
$httpContext['http']['header'] .= 'Cookie: '.$_cookies."\r\n";
|
||||
}
|
||||
|
||||
// send user agent
|
||||
$_ua = null;
|
||||
if (isset($scHeaders['user-agent'])) {
|
||||
$_ua = $scHeaders['user-agent'];
|
||||
} else {
|
||||
$_ua = $this->getUserAgent($req_url, true);
|
||||
$_ua = $_ua['User-Agent'];
|
||||
}
|
||||
if ($_ua) {
|
||||
$this->debug("......user-agent set to: $_ua");
|
||||
$httpContext['http']['header'] .= 'User-Agent: '.$_ua."\r\n";
|
||||
}
|
||||
|
||||
// add referer for picky sites
|
||||
$_referer = null;
|
||||
if (isset($scHeaders['referer'])) {
|
||||
$_referer = $scHeaders['referer'];
|
||||
} else {
|
||||
$_referer = $this->referer;
|
||||
}
|
||||
if ($_referer) {
|
||||
$this->debug("......referer set to: $_referer");
|
||||
$httpContext['http']['header'] .= 'Referer: '.$_referer."\r\n";
|
||||
}
|
||||
|
||||
if (false !== ($html = @file_get_contents($req_url, false, stream_context_create($httpContext)))) {
|
||||
$this->debug('Received response');
|
||||
// get status code
|
||||
@ -585,6 +719,9 @@ class HumbleHttpAgent
|
||||
}
|
||||
if ($this->validateURL($redirectURL)) {
|
||||
$this->debug('Redirect detected. Valid URL: '.$redirectURL);
|
||||
// store any cookies
|
||||
//$cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']);
|
||||
//if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies);
|
||||
$this->storeCookies($orig, $url);
|
||||
$this->redirectQueue[$orig] = $redirectURL;
|
||||
} else {
|
||||
@ -680,7 +817,7 @@ class HumbleHttpAgent
|
||||
}
|
||||
|
||||
public function parallelSupport() {
|
||||
return class_exists('HttpRequestPool') || function_exists('curl_multi_init');
|
||||
return class_exists('http\Client') || function_exists('curl_multi_init');
|
||||
}
|
||||
|
||||
private function headerOnlyType($headers) {
|
||||
@ -703,7 +840,7 @@ class HumbleHttpAgent
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
protected function getCookies($orig, $req_url) {
|
||||
$jar = $this->cookieJar[$orig];
|
||||
if (!isset($jar)) {
|
||||
@ -727,6 +864,7 @@ class HumbleHttpAgent
|
||||
protected function deleteCookies() {
|
||||
$this->cookieJar = array();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// gzdecode from http://www.php.net/manual/en/function.gzdecode.php#82930
|
||||
|
@ -22,6 +22,7 @@ class HumbleHttpAgentDummy
|
||||
public $userAgentMap = array();
|
||||
public $rewriteUrls = array();
|
||||
public $userAgentDefault;
|
||||
public $siteConfigBuilder = null;
|
||||
public $referer;
|
||||
|
||||
protected $body = '';
|
||||
|
@ -12,7 +12,7 @@
|
||||
* More information: http://fivefilters.org/content-only/
|
||||
* License: Apache License, Version 2.0
|
||||
* Requires: PHP5
|
||||
* Date: 2014-03-27
|
||||
* Date: 2015-06-01
|
||||
*
|
||||
* Differences between the PHP port and the original
|
||||
* ------------------------------------------------------
|
||||
@ -95,7 +95,7 @@ class Readability
|
||||
// 'trimRe' => '/^\s+|\s+$/g', // PHP has trim()
|
||||
'normalize' => '/\s{2,}/',
|
||||
'killBreaks' => '/(<br\s*\/?>(\s| ?)*){1,}/',
|
||||
'video' => '!//(player\.|www\.)?(youtube\.com|vimeo\.com|viddler\.com|twitch\.tv)!i',
|
||||
'video' => '!//(player\.|www\.)?(youtube\.com|vimeo\.com|viddler\.com|soundcloud\.com|twitch\.tv)!i',
|
||||
'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i'
|
||||
);
|
||||
|
||||
@ -121,8 +121,12 @@ class Readability
|
||||
if (version_compare(PHP_VERSION, '5.3.0') >= 0) {
|
||||
//use Masterminds\HTML5;
|
||||
$html5class = 'Masterminds\HTML5';
|
||||
$html5 = new $html5class();
|
||||
$html5 = new $html5class(array('disable_html_ns' => true));
|
||||
$this->dom = $html5->loadHTML($html);
|
||||
//echo $html5->saveHTML($this->dom);exit;
|
||||
//$xpath = new DOMXPath($this->dom);
|
||||
//$elems = $xpath->query("//a");
|
||||
//print_r($elems);exit;
|
||||
}
|
||||
}
|
||||
if ($this->dom === null) {
|
||||
@ -314,7 +318,11 @@ class Readability
|
||||
$styleTags = $this->dom->getElementsByTagName('style');
|
||||
for ($i = $styleTags->length-1; $i >= 0; $i--)
|
||||
{
|
||||
$styleTags->item($i)->parentNode->removeChild($styleTags->item($i));
|
||||
try {
|
||||
@$styleTags->item($i)->parentNode->removeChild($styleTags->item($i));
|
||||
} catch (Exception $e) {
|
||||
// Do nothing
|
||||
}
|
||||
}
|
||||
|
||||
/* Turn all double br's into p's */
|
||||
@ -832,7 +840,11 @@ class Readability
|
||||
$scripts = $doc->getElementsByTagName('script');
|
||||
for($i = $scripts->length-1; $i >= 0; $i--)
|
||||
{
|
||||
$scripts->item($i)->parentNode->removeChild($scripts->item($i));
|
||||
try {
|
||||
$scripts->item($i)->parentNode->removeChild($scripts->item($i));
|
||||
} catch (Exception $e) {
|
||||
// do nothing
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1,10 +1,10 @@
|
||||
<?php
|
||||
// Full-Text RSS: Create Full-Text Feeds
|
||||
// Author: Keyvan Minoukadeh
|
||||
// Copyright (c) 2014 Keyvan Minoukadeh
|
||||
// Copyright (c) 2015 Keyvan Minoukadeh
|
||||
// License: AGPLv3
|
||||
// Version: 3.4
|
||||
// Date: 2014-08-28
|
||||
// Version: 3.5
|
||||
// Date: 2015-05-29
|
||||
// More info: http://fivefilters.org/content-only/
|
||||
// Help: http://help.fivefilters.org
|
||||
|
||||
@ -30,6 +30,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
error_reporting(E_ALL ^ E_NOTICE);
|
||||
libxml_use_internal_errors(true);
|
||||
libxml_disable_entity_loader(true);
|
||||
ini_set("display_errors", 1);
|
||||
@set_time_limit(120);
|
||||
|
||||
@ -234,7 +235,7 @@ if (isset($_REQUEST['accept']) && in_array(strtolower($_REQUEST['accept']), arra
|
||||
$user_submitted_config = null;
|
||||
if (isset($_REQUEST['siteconfig'])) {
|
||||
$user_submitted_config = $_REQUEST['siteconfig'];
|
||||
if (!$options->user_submitted_content && $user_submitted_config) {
|
||||
if (!$options->user_submitted_config && $user_submitted_config) {
|
||||
die('User-submitted site configs are currently disabled. Please remove the siteconfig parameter.');
|
||||
}
|
||||
}
|
||||
@ -526,7 +527,8 @@ if (isset($_REQUEST['inputhtml']) && _FF_FTR_MODE == 'simple') {
|
||||
}
|
||||
$http = new HumbleHttpAgent($_req_options);
|
||||
$http->debug = $debug_mode;
|
||||
$http->userAgentMap = $options->user_agents;
|
||||
// User agents can now be set in site config files using the http_header directive
|
||||
//$http->userAgentMap = $options->user_agents;
|
||||
$http->headerOnlyTypes = array_keys($options->content_type_exc);
|
||||
$http->rewriteUrls = $options->rewrite_url;
|
||||
unset($_req_options);
|
||||
@ -545,6 +547,7 @@ $extractor->parserOverride = $parser;
|
||||
if ($options->user_submitted_config && $user_submitted_config) {
|
||||
$extractor->setUserSubmittedConfig($user_submitted_config);
|
||||
}
|
||||
$http->siteConfigBuilder = $extractor;
|
||||
|
||||
////////////////////////////////
|
||||
// Get RSS/Atom feed
|
||||
@ -655,7 +658,7 @@ $items = $feed->get_items(0, $max);
|
||||
$urls_sanitized = array();
|
||||
$urls = array();
|
||||
foreach ($items as $key => $item) {
|
||||
$permalink = htmlspecialchars_decode($item->get_permalink());
|
||||
$permalink = htmlspecialchars_decode(trim($item->get_permalink()));
|
||||
// Colons in URL path segments get encoded by SimplePie, yet some sites expect them unencoded
|
||||
$permalink = str_replace('%3A', ':', $permalink);
|
||||
// validateUrl() strips non-ascii characters
|
||||
@ -973,6 +976,13 @@ foreach ($items as $key => $item) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// add open graph
|
||||
if ($opengraph = $extractor->getOpenGraph()) {
|
||||
foreach ($opengraph as $og_prop => $og_val) {
|
||||
$newitem->addElement($og_prop, $og_val);
|
||||
}
|
||||
}
|
||||
|
||||
// add language
|
||||
if ($detect_language) {
|
||||
@ -1390,6 +1400,17 @@ function get_single_page($item, $html, $url) {
|
||||
// Loop through single_page_link xpath expressions
|
||||
$single_page_url = null;
|
||||
foreach ($splink as $pattern) {
|
||||
// Do we have conditions?
|
||||
$condition = $site_config->get_if_page_contains_condition('single_page_link', $pattern);
|
||||
if ($condition) {
|
||||
$elems = @$xpath->evaluate($condition, $readability->dom);
|
||||
if ($elems instanceof DOMNodeList && $elems->length > 0) {
|
||||
// all fine
|
||||
} else {
|
||||
// move on to next single page link XPath
|
||||
continue;
|
||||
}
|
||||
}
|
||||
$elems = @$xpath->evaluate($pattern, $readability->dom);
|
||||
if (is_string($elems)) {
|
||||
$single_page_url = trim($elems);
|
||||
|
3
robots.txt
Normal file
3
robots.txt
Normal file
@ -0,0 +1,3 @@
|
||||
User-agent: *
|
||||
Disallow: /makefulltextfeed.php
|
||||
Disallow: /extract.php
|
3
site_config/standard/index.php
Normal file
3
site_config/standard/index.php
Normal file
@ -0,0 +1,3 @@
|
||||
<?php
|
||||
// this is here to prevent directory listing over the web
|
||||
?>
|
Loading…
Reference in New Issue
Block a user