Full-Text RSS 3.5

This commit is contained in:
FiveFilters.org 2017-02-18 16:06:19 +01:00
parent daedf214fe
commit bfed79edc7
25 changed files with 627 additions and 166 deletions

View File

@ -1,9 +1,9 @@
<?php
// Update site config files for Full-Text RSS
// Author: Keyvan Minoukadeh
// Copyright (c) 2014 Keyvan Minoukadeh
// Copyright (c) 2015 Keyvan Minoukadeh
// License: AGPLv3
// Date: 2014-08-19
// Date: 2015-06-10
// More info: http://fivefilters.org/content-only/
// Help: http://help.fivefilters.org
@ -245,7 +245,7 @@ function println($txt) {
}
function rrmdir($dir) {
foreach(glob($dir . '/{*.txt,*.php,.*.txt,.*.php,.gitattributes,.gitignore,ftr-site-config-master,README.md}', GLOB_BRACE|GLOB_NOSORT) as $file) {
foreach(glob($dir . '/{*.txt,*.php,*.com,.*.txt,.*.php,.*.com,.gitattributes,.gitignore,ftr-site-config-master,README.md}', GLOB_BRACE|GLOB_NOSORT) as $file) {
if(is_dir($file)) {
rrmdir($file);
} else {

View File

@ -2,6 +2,19 @@ FiveFilters.org: Full-Text RSS
http://fivefilters.org/content-only/
CHANGELOG
------------------------------------
3.5 (2015-06-13)
- Open Graph properties og:title, og:type, og:url, og:image, and og:description now returned if found in the page being processed
- Bug fix: certain XPath expressions weren't being evaluated correctly when HTML5 parsing was enabled
- Cookie handling now only on redirects - fixes issue with certain sites (thanks to Dave Vasilevsky)
- Compatibility test will no longer show HHVM as incompatible - Full-Text RSS worked with HHVM 3.7.1 in our tests (but without Tidy support and no automatic site config updates)
- Humble HTTP Agent updated to support version 2 of PHP's HTTP extension
- HTML5-PHP library updated
- Site config files can now include HTTP headers (user-agent, cookie, referer), e.g. http_header(user-agent): PHP/5.6
- Config option removed: $options->user_agents - use site config files.
- Site config files which use single_page_link can now follow it with if_page_contains: XPath to make it conditional.
- Minimum supported PHP version is now 5.3. If you must use PHP 5.2, please download Full-Text RSS 3.4
- Site config files updated for better extraction
- Other minor fixes/improvements
3.4.1 (unreleased)
- Backporting Dave Vasilevsky cookie patch. Fixes issues with certain sites. See https://gist.github.com/fivefilters/0a758b6d64ce4fb5728c

View File

@ -430,22 +430,6 @@ $options->fingerprints = array(
'<meta name="generator" content="WordPress' => array('hostname'=>'fingerprint.wordpress.com', 'head'=>true)
);
// User Agent strings - mapping domain names
// ----------------------
// e.g. $options->user_agents = array('example.org' => 'PHP/5.2');
$options->user_agents = array( 'lifehacker.com' => 'PHP/5.2',
'gawker.com' => 'PHP/5.2',
'deadspin.com' => 'PHP/5.2',
'kotaku.com' => 'PHP/5.2',
'jezebel.com' => 'PHP/5.2',
'io9.com' => 'PHP/5.2',
'jalopnik.com' => 'PHP/5.2',
'gizmodo.com' => 'PHP/5.2',
'.wikipedia.org' => 'Mozilla/5.2',
'.fok.nl' => 'Googlebot/2.1',
'getpocket.com' => 'PHP/5.2'
);
// URL Rewriting
// ----------------------
// Currently allows simple string replace of URLs.
@ -500,7 +484,7 @@ $options->cache_cleanup = 100;
/// DO NOT CHANGE ANYTHING BELOW THIS ///////////
/////////////////////////////////////////////////
if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '3.4');
if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '3.5');
if (basename(__FILE__) == 'config.php') {
if (file_exists(dirname(__FILE__).'/custom_config.php')) {

View File

@ -16,17 +16,19 @@ SimplePie.org. We have kept most of their checks intact as we use SimplePie in o
http://github.com/simplepie/simplepie/tree/master/compatibility_test/
*/
$app_name = 'Full-Text RSS 3.3';
$app_name = 'Full-Text RSS 3.5';
// Full-Text RSS is not yet compatible with HHVM, that's why we check for it with HHVM_VERSION.
$php_ok = (function_exists('version_compare') && version_compare(phpversion(), '5.2.0', '>=') && !defined('HHVM_VERSION'));
//$php_ok = (function_exists('version_compare') && version_compare(phpversion(), '5.2.0', '>=') && !defined('HHVM_VERSION'));
// HHVM works okay, but no Tidy and autoupdate of site config files not working (tested 3.7.1)
$php_ok = (function_exists('version_compare') && version_compare(phpversion(), '5.3.0', '>='));
$pcre_ok = extension_loaded('pcre');
$zlib_ok = extension_loaded('zlib');
$mbstring_ok = extension_loaded('mbstring');
$iconv_ok = extension_loaded('iconv');
$tidy_ok = function_exists('tidy_parse_string');
$curl_ok = function_exists('curl_exec');
$parallel_ok = ((extension_loaded('http') && class_exists('HttpRequestPool')) || ($curl_ok && function_exists('curl_multi_init')));
$parallel_ok = ((extension_loaded('http') && class_exists('http\Client\Request')) || ($curl_ok && function_exists('curl_multi_init')));
$allow_url_fopen_ok = (bool)ini_get('allow_url_fopen');
$filter_ok = extension_loaded('filter');
@ -201,7 +203,7 @@ div.chunk {
<tbody>
<tr class="<?php echo ($php_ok) ? 'enabled' : 'disabled'; ?>">
<td>PHP</td>
<td>5.2.0 or higher</td>
<td>5.3 or higher</td>
<td><?php echo phpversion(); ?></td>
</tr>
<tr class="<?php echo ($xml_ok) ? 'enabled, and sane' : 'disabled, or broken'; ?>">
@ -306,9 +308,9 @@ div.chunk {
<?php endif; ?>
<?php if ($parallel_ok): ?>
<li><strong>Parallel URL fetching:</strong> You have <code>HttpRequestPool</code> or <code>curl_multi</code> support installed. No problems here.</li>
<li><strong>Parallel URL fetching:</strong> You have PHP's HTTP extension or <code>curl_multi</code> installed. No problems here.</li>
<?php else: ?>
<li class="highlight"><strong>Parallel URL fetching:</strong> <code>HttpRequestPool</code> or <code>curl_multi</code> support is not available. <?php echo $app_name; ?> will use <code>file_get_contents()</code> instead to fetch URLs sequentially rather than in parallel.</li>
<li class="highlight"><strong>Parallel URL fetching:</strong> HTTP extension or <code>curl_multi</code> support is not available. <?php echo $app_name; ?> will use <code>file_get_contents()</code> instead to fetch URLs sequentially rather than in parallel.</li>
<?php endif; ?>
<?php else: ?>
@ -352,11 +354,11 @@ div.chunk {
<div class="chunk">
<h3>Further info</h3>
<h4>HTTP module</h4>
<p>Full-Text RSS can make use of <code>HttpRequestPool</code> or <code>curl_multi</code> to make parallel HTTP requests when processing feeds. If neither are available, it will make sequential requests using <code>file_get_contents</code>.</p>
<p>Full-Text RSS can make use of PHP's HTTP extension or <code>curl_multi</code> to make parallel HTTP requests when processing feeds. If neither are available, it will make sequential requests using <code>file_get_contents</code>.</p>
<?php
$http_type = 'file_get_contents';
if (extension_loaded('http') && class_exists('HttpRequestPool')) {
$http_type = 'HttpRequestPool';
if (extension_loaded('http') && class_exists('http\Client\Request')) {
$http_type = 'HTTP extension';
} elseif ($curl_ok && function_exists('curl_multi_init')) {
$http_type = 'curl_multi';
}

View File

@ -582,8 +582,8 @@ if (!defined('_FF_FTR_INDEX')) {
<h3>System Requirements</h3>
<p>PHP 5.2 or above is required. A simple shared web hosting account will work fine.
The code has been tested on Windows and Linux using the Apache web server. If you're a Windows user, you can try it on your own machine using <a href="http://www.wampserver.com/en/index.php">WampServer</a>. It has also been reported as working under IIS, but we have not tested this ourselves.</p>
<p>PHP 5.3 or above is required. A simple shared web hosting account should work fine, but we recommend a <a href="http://help.fivefilters.org/customer/portal/articles/1143210-hosting">VPS with 1GB RAM</a>.
The code has been tested on Windows and Linux using the Apache web server. If you're a Windows user, you can try it on your own machine using <a href="http://www.uniformserver.com/">Uniform Server</a>. It has also been reported as working under IIS, but we have not tested this ourselves.</p>
<h3 id="download">Download</h3>
<p>Download from <a href="http://fivefilters.org/content-only/#download">fivefilters.org</a> &mdash; old versions are available in our <a href="http://code.fivefilters.org">code repository</a>.</p>

View File

@ -15,12 +15,12 @@
class ContentExtractor
{
protected static $tidy_config = array(
'clean' => true,
'clean' => false, // can't preserve wbr tabs if this is set to true
'output-xhtml' => true,
'logical-emphasis' => true,
'show-body-only' => false,
'new-blocklevel-tags' => 'article, aside, footer, header, hgroup, menu, nav, section, details, datagrid',
'new-inline-tags' => 'mark, time, meter, progress, data',
'new-blocklevel-tags' => 'article aside footer header hgroup menu nav section details datagrid',
'new-inline-tags' => 'mark time meter progress data wbr',
'wrap' => 0,
'drop-empty-paras' => true,
'drop-proprietary-attributes' => false,
@ -42,6 +42,7 @@ class ContentExtractor
protected $body;
protected $success = false;
protected $nextPageUrl;
protected $opengraph = array();
public $allowedParsers = array('libxml', 'html5php');
public $defaultParser = 'libxml';
public $parserOverride = null;
@ -79,6 +80,7 @@ class ContentExtractor
$this->date = null;
$this->nextPageUrl = null;
$this->success = false;
$this->opengraph = array();
}
public function findHostUsingFingerprints($html) {
@ -109,8 +111,11 @@ class ContentExtractor
if (substr($host, 0, 4) == 'www.') $host = substr($host, 4);
// is merged version already cached?
if (SiteConfig::is_cached("$host.merged")) {
$config = SiteConfig::build("$host.merged");
if ($config) {
$this->debug("Returning cached and merged site config for $host");
return SiteConfig::build("$host.merged");
return $config;
}
}
// let's build from site_config/custom/ and standard/
$config = SiteConfig::build($host);
@ -316,6 +321,24 @@ class ContentExtractor
}
}
// try to get Open Graph properties
$elems = @$xpath->query("//head//meta[@property='og:title' or @property='og:type' or @property='og:url' or @property='og:image' or @property='og:description']", $this->readability->dom);
// check for matches
if ($elems && $elems->length > 0) {
$this->debug('Extracting Open Graph elements');
foreach ($elems as $elem) {
if ($elem->hasAttribute('content')) {
$_prop = strtolower($elem->getAttribute('property'));
$_val = $elem->getAttribute('content');
// currently one of each is returned, so we keep the first one
if (!isset($this->opengraph[$_prop])) {
$this->opengraph[$_prop] = $_val;
}
}
}
unset($_prop, $_val);
}
// try to get date
foreach ($this->config->date as $pattern) {
$elems = @$xpath->evaluate($pattern, $this->readability->dom);
@ -398,6 +421,16 @@ class ContentExtractor
}
}
// strip empty a elements
$elems = $xpath->query("//a[not(./*) and normalize-space(.)='']", $this->readability->dom);
// check for matches
if ($elems && $elems->length > 0) {
$this->debug('Stripping '.$elems->length.' empty a elements');
for ($i=$elems->length-1; $i >= 0; $i--) {
$elems->item($i)->parentNode->removeChild($elems->item($i));
}
}
// try to get body
foreach ($this->config->body as $pattern) {
$elems = @$xpath->query($pattern, $this->readability->dom);
@ -789,6 +822,10 @@ class ContentExtractor
return $this->body;
}
/**
 * Get the Open Graph properties stored by the extractor.
 *
 * @return array lowercased property name (e.g. 'og:title') => content value;
 *               an empty array when no properties have been stored
 */
public function getOpenGraph() {
return $this->opengraph;
}
/**
 * Return the current value of the nativeAd property.
 * NOTE(review): presumably a boolean set when a native_ad_clue expression matches - confirm.
 */
public function isNativeAd() {
return $this->nativeAd;
}

View File

@ -5,10 +5,10 @@
* Each instance of this class should hold extraction patterns and other directives
* for a website. See ContentExtractor class to see how it's used.
*
* @version 0.8
* @date 2013-04-16
* @version 1.0
* @date 2015-06-09
* @author Keyvan Minoukadeh
* @copyright 2013 Keyvan Minoukadeh
* @copyright 2015 Keyvan Minoukadeh
* @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
*/
@ -38,8 +38,7 @@ class SiteConfig
// Mark article as a native ad if any of these expressions match (0 or more xpath expressions)
public $native_ad_clue = array();
// Additional HTTP headers to send
// NOT YET USED
// Additional HTTP headers to send (associative array)
public $http_header = array();
// Process HTML with tidy before creating DOM (bool or null if undeclared)
@ -67,6 +66,15 @@ class SiteConfig
// Test URL - if present, can be used to test the config above
public $test_url = array();
// Test URL contains - one or more snippets of text from the article body.
// Used to determine if the extraction rules for the site are still valid (ie. still extracting relevant content)
// Keys should be one or more of the test URLs supplied, and value an array of strings to look for.
public $test_contains = array();
// If page contains - XPath expression. Used to determine if the preceding rule gets evaluated or not.
// Currently only works with single_page_link.
public $if_page_contains = array();
// Single-page link - should identify a link element or URL pointing to the page holding the entire article
// This is useful for sites which split their articles across multiple pages. Links to such pages tend to
// display the first page with links to the other pages at the bottom. Often there is also a link to a page
@ -185,11 +193,23 @@ class SiteConfig
public function append(SiteConfig $newconfig) {
// check for commands where we accept multiple statements (no test_url)
foreach (array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'native_ad_clue', 'http_header') as $var) {
foreach (array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'native_ad_clue') as $var) {
// append array elements for this config variable from $newconfig to this config
//$this->$var = $this->$var + $newconfig->$var;
$this->$var = array_unique(array_merge($this->$var, $newconfig->$var));
}
// special handling of commands where key is important and config values being appended should not overwrite existing ones
foreach (array('http_header') as $var) {
$this->$var = array_merge($newconfig->$var, $this->$var);
}
// special handling of if_page_contains directive
foreach (array('single_page_link') as $var) {
if (isset($this->if_page_contains[$var]) && isset($newconfig->if_page_contains[$var])) {
$this->if_page_contains[$var] = array_merge($newconfig->if_page_contains[$var], $this->if_page_contains[$var]);
} elseif (isset($newconfig->if_page_contains[$var])) {
$this->if_page_contains[$var] = $newconfig->if_page_contains[$var];
}
}
// check for single statement commands
// we do not overwrite existing non null values
foreach (array('tidy', 'prune', 'parser', 'autodetect_on_failure') as $var) {
@ -213,6 +233,40 @@ class SiteConfig
return $key_suffix;
}
// Add test_contains to last test_url
/**
 * Record a text snippet expected to appear in the page behind the most
 * recently declared test_url. Silently ignored when no test_url exists yet.
 *
 * @param mixed $test_contains snippet to record (cast to string)
 */
public function add_test_contains($test_contains) {
    // without a test_url there is nothing to attach the snippet to
    if (empty($this->test_url)) {
        return;
    }
    $snippet = (string) $test_contains;
    // end() yields the last test_url; reset() rewinds the array pointer afterwards
    $last_url = end($this->test_url);
    reset($this->test_url);
    // first snippet for this URL: start its list before appending
    if (!isset($this->test_contains[$last_url])) {
        $this->test_contains[$last_url] = array();
    }
    $this->test_contains[$last_url][] = $snippet;
}
// Add if_page_contains condition
// TODO: Expand so it can be used with other rules too
/**
 * Attach an if_page_contains condition to the most recently added
 * single_page_link entry. Does nothing when no single_page_link exists yet.
 *
 * @param mixed $if_page_contains condition to store (cast to string)
 */
public function add_if_page_contains_condition($if_page_contains) {
    // a condition only makes sense once a single_page_link has been declared
    if (empty($this->single_page_link)) {
        return;
    }
    $condition = (string) $if_page_contains;
    // end() yields the last single_page_link; reset() rewinds the array pointer afterwards
    $last_link = end($this->single_page_link);
    reset($this->single_page_link);
    // conditions are keyed by directive name, then by the directive value they guard
    $this->if_page_contains['single_page_link'][$last_link] = $condition;
}
/**
 * Look up the if_page_contains condition stored for a directive/value pair.
 *
 * @param string $directive_name  directive the condition belongs to (e.g. 'single_page_link')
 * @param string $directive_value directive value the condition was attached to
 * @return mixed the stored condition, or null when none is recorded
 */
public function get_if_page_contains_condition($directive_name, $directive_value) {
    $conditions = $this->if_page_contains;
    // chained isset() is false if either level is missing, without raising a notice
    if (!isset($conditions[$directive_name][$directive_value])) {
        return null;
    }
    return $conditions[$directive_name][$directive_value];
}
// returns SiteConfig instance if an appropriate one is found, false otherwise
// if $exact_host_match is true, we will not look for wildcard config matches
// by default if host is 'test.example.org' we will look for and load '.example.org.txt' if it exists
@ -356,12 +410,20 @@ class SiteConfig
// check for single statement commands stored as strings
} elseif (in_array($command, array('parser'))) {
$config->$command = $val;
// special treatment for test_contains
} elseif (in_array($command, array('test_contains'))) {
$config->add_test_contains($val);
// special treatment for if_page_contains
} elseif (in_array($command, array('if_page_contains'))) {
$config->add_if_page_contains_condition($val);
// check for replace_string(find): replace
} elseif ((substr($command, -1) == ')') && preg_match('!^([a-z0-9_]+)\((.*?)\)$!i', $command, $match)) {
if (in_array($match[1], array('replace_string'))) {
$command = $match[1];
array_push($config->find_string, $match[2]);
array_push($config->$command, $val);
array_push($config->replace_string, $val);
} elseif (in_array($match[1], array('http_header'))) {
$_header = strtolower(trim($match[2]));
$config->http_header[$_header] = $val;
}
}
}

View File

@ -1,7 +1,7 @@
<?php
define('RSS2', 1, true);
define('JSON', 2, true);
define('JSONP', 3, true);
define('RSS2', 1);
define('JSON', 2);
define('JSONP', 3);
/**
* Universal Feed Writer class
@ -131,6 +131,11 @@ define('JSONP', 3, true);
$simplejson->language = null;
$simplejson->url = null;
$simplejson->effective_url = null;
$simplejson->og_url = null;
$simplejson->og_title = null;
$simplejson->og_description = null;
$simplejson->og_image = null;
$simplejson->og_type = null;
$simplejson->content = null;
// actual values
$simplejson->url = $jsonitem->link;
@ -151,6 +156,11 @@ define('JSONP', 3, true);
if (isset($jsonitem->pubDate)) {
$simplejson->date = gmdate(DATE_ATOM, strtotime($jsonitem->pubDate));
}
if (isset($jsonitem->og_url)) $simplejson->og_url = $jsonitem->og_url;
if (isset($jsonitem->og_title)) $simplejson->og_title = $jsonitem->og_title;
if (isset($jsonitem->og_description)) $simplejson->og_description = $jsonitem->og_description;
if (isset($jsonitem->og_image)) $simplejson->og_image = $jsonitem->og_image;
if (isset($jsonitem->og_type)) $simplejson->og_type = $jsonitem->og_type;
echo json_encode($simplejson);
}
}
@ -327,7 +337,7 @@ define('JSONP', 3, true);
{
$out = '<?xml version="1.0" encoding="utf-8"?>'."\n";
if ($this->xsl) $out .= '<?xml-stylesheet type="text/xsl" href="'.htmlspecialchars($this->xsl).'"?>' . PHP_EOL;
$out .= '<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:media="http://search.yahoo.com/mrss/">' . PHP_EOL;
$out .= '<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:media="http://search.yahoo.com/mrss/" xmlns:og="http://ogp.me/ns#">' . PHP_EOL;
echo $out;
}
elseif ($this->version == JSON || $this->version == JSONP)
@ -370,7 +380,9 @@ define('JSONP', 3, true);
{
foreach ($attributes as $key => $value)
{
$attrText .= " $key=\"".htmlspecialchars($value, ENT_COMPAT, 'UTF-8', false)."\" ";
//$attrText .= " $key=\"".htmlspecialchars($value, ENT_COMPAT, 'UTF-8', false)."\" ";
// TODO: replace HTML entities not supported in XML with UTF8 equivalent characters
$attrText .= " $key=\"".htmlspecialchars($value, ENT_COMPAT, 'UTF-8')."\" ";
}
}
$nodeText .= "<{$tagName}{$attrText}>";
@ -384,7 +396,9 @@ define('JSONP', 3, true);
else
{
//$nodeText .= (in_array($tagName, $this->CDATAEncoding))? $tagContent : htmlentities($tagContent);
$nodeText .= htmlspecialchars($tagContent, ENT_COMPAT, 'UTF-8', false);
//$nodeText .= htmlspecialchars($tagContent, ENT_COMPAT, 'UTF-8', false);
// TODO: replace HTML entities not supported in XML with UTF8 equivalent characters
$nodeText .= htmlspecialchars($tagContent, ENT_COMPAT, 'UTF-8');
}
//$nodeText .= (in_array($tagName, $this->CDATAEncoding))? "]]></$tagName>" : "</$tagName>";
$nodeText .= "</$tagName>";

View File

@ -1,8 +1,8 @@
<?php
/*
htmLawed 1.1.17, 11 March 2014
OOP code, 11 March 2014
htmLawed 1.1.19, 19 January 2015
OOP code, 19 January 2015
Copyright Santosh Patnaik
Dual LGPL v3 and GPL v2+ license
A PHP Labware internal utility; www.bioinformatics.org/phplabware/internal_utilities/htmLawed
@ -478,7 +478,7 @@ while(strlen($a)){
break; case 2: // Val
if(preg_match('`^((?:"[^"]*")|(?:\'[^\']*\')|(?:\s*[^\s"\']+))(.*)`', $a, $m)){
$a = ltrim($m[2]); $m = $m[1]; $w = 1; $mode = 0;
$aA[$nm] = trim(($m[0] == '"' or $m[0] == '\'') ? substr($m, 1, -1) : $m);
$aA[$nm] = trim(str_replace('<', '&lt;', ($m[0] == '"' or $m[0] == '\'') ? substr($m, 1, -1) : $m));
}
break;
}
@ -507,7 +507,7 @@ foreach($aA as $k=>$v){
$v = preg_replace_callback('`(url(?:\()(?: )*(?:\'|"|&(?:quot|apos);)?)(.+?)((?:\'|"|&(?:quot|apos);)?(?: )*(?:\)))`iS', 'htmLawed::hl_prot', $v);
$v = !$C['css_expression'] ? preg_replace('`expression`i', ' ', preg_replace('`\\\\\S|(/|(%2f))(\*|(%2a))`i', ' ', $v)) : $v;
}elseif(isset($aNP[$k]) or strpos($k, 'src') !== false or $k[0] == 'o'){
$v = str_replace("\xad", ' ', (strpos($v, '&') !== false ? str_replace(array('&#xad;', '&#173;', '&shy;'), ' ', $v) : $v));
$v = str_replace("­", ' ', (strpos($v, '&') !== false ? str_replace(array('&#xad;', '&#173;', '&shy;'), ' ', $v) : $v)); # double-quoted char is soft-hyphen; appears here as "­" or hyphen or something else depending on viewing software
$v = htmLawed::hl_prot($v, $k);
if($k == 'href'){ // X-spam
if($C['anti_mail_spam'] && strpos($v, 'mailto:') === 0){
@ -701,7 +701,7 @@ return str_replace(array("\x01", "\x02", "\x03", "\x04", "\x05", "\x07"), array(
public static function hl_version(){
// rel
return '1.1.17';
return '1.1.19';
// eof
}

View File

@ -60,20 +60,22 @@ class HTML5
* The path to the file to parse. If this is a resource, it is
* assumed to be an open stream whose pointer is set to the first
* byte of input.
* @param array $options
* Configuration options when parsing the HTML
* @return \DOMDocument A DOM document. This object type is defined by the libxml
* library, and should have been included with your version of PHP.
*/
public function load($file)
public function load($file, array $options = array())
{
// Handle the case where file is a resource.
if (is_resource($file)) {
// FIXME: We need a StreamInputStream class.
return $this->loadHTML(stream_get_contents($file));
return $this->loadHTML(stream_get_contents($file), $options);
}
$input = new FileInputStream($file);
return $this->parse($input);
return $this->parse($input, $options);
}
/**
@ -84,14 +86,16 @@ class HTML5
*
* @param string $string
* A html5 document as a string.
* @param array $options
* Configuration options when parsing the HTML
* @return \DOMDocument A DOM document. DOM is part of libxml, which is included with
* almost all distributions of PHP.
*/
public function loadHTML($string)
public function loadHTML($string, array $options = array())
{
$input = new StringInputStream($string);
return $this->parse($input);
return $this->parse($input, $options);
}
/**
@ -104,13 +108,15 @@ class HTML5
* The path to the file to parse. If this is a resource, it is
* assumed to be an open stream whose pointer is set to the first
* byte of input.
* @param array $options
* Configuration options when parsing the HTML
*
* @return \DOMDocument A DOM document. This object type is defined by the libxml
* library, and should have been included with your version of PHP.
*/
public function loadHTMLFile($file)
public function loadHTMLFile($file, array $options = array())
{
return $this->load($file);
return $this->load($file, $options);
}
/**
@ -118,15 +124,17 @@ class HTML5
*
* @param string $string
* The html5 fragment as a string.
* @param array $options
* Configuration options when parsing the HTML
*
* @return \DOMDocumentFragment A DOM fragment. The DOM is part of libxml, which is included with
* almost all distributions of PHP.
*/
public function loadHTMLFragment($string)
public function loadHTMLFragment($string, array $options = array())
{
$input = new StringInputStream($string);
return $this->parseFragment($input);
return $this->parseFragment($input, $options);
}
/**
@ -155,10 +163,10 @@ class HTML5
* Lower-level loading function. This requires an input stream instead
* of a string, file, or resource.
*/
public function parse(\Masterminds\HTML5\Parser\InputStream $input)
public function parse(\Masterminds\HTML5\Parser\InputStream $input, array $options = array())
{
$this->errors = array();
$events = new DOMTreeBuilder(false, $this->options);
$events = new DOMTreeBuilder(false, array_merge($this->getOptions(), $options));
$scanner = new Scanner($input);
$parser = new Tokenizer($scanner, $events);
@ -174,9 +182,9 @@ class HTML5
* Lower-level loading function. This requires an input stream instead
* of a string, file, or resource.
*/
public function parseFragment(\Masterminds\HTML5\Parser\InputStream $input)
public function parseFragment(\Masterminds\HTML5\Parser\InputStream $input, array $options = array())
{
$events = new DOMTreeBuilder(true, $this->options);
$events = new DOMTreeBuilder(true, array_merge($this->getOptions(), $options));
$scanner = new Scanner($input);
$parser = new Tokenizer($scanner, $events);

View File

@ -66,6 +66,11 @@ class Elements
*/
const BLOCK_TAG = 64;
/**
* Indicates that the tag allows only inline elements as child nodes.
*/
const BLOCK_ONLY_INLINE = 128;
/**
* The HTML5 elements as defined in http://dev.w3.org/html5/markup/elements.html.
*
@ -120,7 +125,7 @@ class Elements
"head" => 1,
"header" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG
"hgroup" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG
"hr" => 73, // NORMAL | VOID_TAG | BLOCK_TAG
"hr" => 73, // NORMAL | VOID_TAG
"html" => 1,
"i" => 1,
"iframe" => 3, // NORMAL | TEXT_RAW
@ -145,7 +150,7 @@ class Elements
"optgroup" => 1,
"option" => 1,
"output" => 65, // NORMAL | BLOCK_TAG
"p" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG
"p" => 209, // NORMAL | AUTOCLOSE_P | BLOCK_TAG | BLOCK_ONLY_INLINE
"param" => 9, // NORMAL | VOID_TAG
"pre" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG
"progress" => 1,

View File

@ -38,6 +38,12 @@ class DOMTreeBuilder implements EventHandler
const NAMESPACE_XMLNS = 'http://www.w3.org/2000/xmlns/';
const OPT_DISABLE_HTML_NS = 'disable_html_ns';
const OPT_TARGET_DOC = 'target_document';
const OPT_IMPLICIT_NS = 'implicit_namespaces';
/**
* Holds the HTML5 element names that causes a namespace switch
*
@ -138,6 +144,12 @@ class DOMTreeBuilder implements EventHandler
protected $insertMode = 0;
/**
* Track if we are in an element that allows only inline child nodes
* @var string|null
*/
protected $onlyInline;
/**
* Quirks mode is enabled by default.
* Any document that is missing the
@ -151,6 +163,9 @@ class DOMTreeBuilder implements EventHandler
{
$this->options = $options;
if (isset($options[self::OPT_TARGET_DOC])) {
$this->doc = $options[self::OPT_TARGET_DOC];
} else {
$impl = new \DOMImplementation();
// XXX:
// Create the doctype. For now, we are always creating HTML5
@ -158,6 +173,7 @@ class DOMTreeBuilder implements EventHandler
$dt = $impl->createDocumentType('html');
// $this->doc = \DOMImplementation::createDocument(NULL, 'html', $dt);
$this->doc = $impl->createDocument(null, null, $dt);
}
$this->errors = array();
$this->current = $this->doc; // ->documentElement;
@ -165,8 +181,15 @@ class DOMTreeBuilder implements EventHandler
// Create a rules engine for tags.
$this->rules = new TreeBuildingRules($this->doc);
$implicitNS = array();
if (isset($this->options[self::OPT_IMPLICIT_NS])) {
$implicitNS = $this->options[self::OPT_IMPLICIT_NS];
} elseif (isset($this->options["implicitNamespaces"])) {
$implicitNS = $this->options["implicitNamespaces"];
}
// Fill $nsStack with the default HTML5 namespaces, plus the "implicitNamespaces" array taken from $options
array_unshift($this->nsStack, (isset($this->options["implicitNamespaces"]) ? $this->options["implicitNamespaces"] : array()) + array(
array_unshift($this->nsStack, $implicitNS + array(
'' => self::NAMESPACE_HTML
) + $this->implicitNamespaces);
@ -320,6 +343,11 @@ class DOMTreeBuilder implements EventHandler
}
}
if ($this->onlyInline && Elements::isA($lname, Elements::BLOCK_TAG)) {
$this->autoclose($this->onlyInline);
$this->onlyInline = null;
}
try {
$prefix = ($pos = strpos($lname, ':')) ? substr($lname, 0, $pos) : '';
@ -334,10 +362,10 @@ class DOMTreeBuilder implements EventHandler
$ele = $this->doc->importNode($frag->documentElement, true);
} else {
if (isset($this->nsStack[0][$prefix])) {
$ele = $this->doc->createElementNS($this->nsStack[0][$prefix], $lname);
} else {
if (!isset($this->nsStack[0][$prefix]) || ($prefix === "" && isset($this->options[self::OPT_DISABLE_HTML_NS]) && $this->options[self::OPT_DISABLE_HTML_NS])) {
$ele = $this->doc->createElement($lname);
} else {
$ele = $this->doc->createElementNS($this->nsStack[0][$prefix], $lname);
}
}
@ -346,6 +374,10 @@ class DOMTreeBuilder implements EventHandler
$ele = $this->doc->createElement('invalid');
}
if (Elements::isA($lname, Elements::BLOCK_ONLY_INLINE)) {
$this->onlyInline = $lname;
}
// When we add some namespaces, we have to track them. Later, when "endElement" is invoked, we have to remove them.
// When we are on a void tag, we do not need to care about namespace nesting.
if ($pushes > 0 && !Elements::isA($name, Elements::VOID_TAG)) {
@ -394,7 +426,7 @@ class DOMTreeBuilder implements EventHandler
}
// Some elements have special processing rules. Handle those separately.
if ($this->rules->hasRules($name)) {
if ($this->rules->hasRules($name) && $this->frag !== $this->current) {
$this->current = $this->rules->evaluate($ele, $this->current);
} // Otherwise, it's a standard element.
else {

View File

@ -11,9 +11,9 @@ class Scanner
const CHARS_HEX = 'abcdefABCDEF01234567890';
const CHARS_ALNUM = 'abcdefAghijklmnopqrstuvwxyABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890';
const CHARS_ALNUM = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890';
const CHARS_ALPHA = 'abcdefAghijklmnopqrstuvwxyABCDEFGHIJKLMNOPQRSTUVWXYZ';
const CHARS_ALPHA = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ';
protected $is;

View File

@ -200,10 +200,12 @@ class Tokenizer
if (is_null($this->untilTag)) {
return $this->text();
}
$sequence = '</' . $this->untilTag . '>';
$sequence = '</' . $this->untilTag;
$txt = '';
$tok = $this->scanner->current();
while ($tok !== false && ! ($tok == '<' && ($this->sequenceMatches($sequence) || $this->sequenceMatches(strtoupper($sequence))))) {
$caseSensitive = !Elements::isHtml5Element($this->untilTag);
while ($tok !== false && ! ($tok == '<' && ($this->sequenceMatches($sequence, $caseSensitive)))) {
if ($tok == '&') {
$txt .= $this->decodeCharacterReference();
$tok = $this->scanner->current();
@ -212,6 +214,13 @@ class Tokenizer
$tok = $this->scanner->next();
}
}
$len = strlen($sequence);
$this->scanner->consume($len);
$len += strlen($this->scanner->whitespace());
if ($this->scanner->current() !== '>') {
$this->parseError("Unclosed RCDATA end tag");
}
$this->scanner->unconsume($len);
$this->events->text($txt);
$this->setTextMode(0);
return $this->endTag();
@ -353,7 +362,7 @@ class Tokenizer
}
// We know this is at least one char.
$name = strtolower($this->scanner->charsWhile(":0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"));
$name = strtolower($this->scanner->charsWhile(":_-0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"));
$attributes = array();
$selfClose = false;
@ -891,7 +900,7 @@ class Tokenizer
$buffer .= $this->scanner->charsUntil($first);
// Stop as soon as we hit the stopping condition.
if ($this->sequenceMatches($sequence) || $this->sequenceMatches(strtoupper($sequence))) {
if ($this->sequenceMatches($sequence, false)) {
return $buffer;
}
$buffer .= $this->scanner->current();
@ -916,7 +925,7 @@ class Tokenizer
* see if the input stream is at the start of a
* '</script>' string.
*/
protected function sequenceMatches($sequence)
protected function sequenceMatches($sequence, $caseSensitive = true)
{
$len = strlen($sequence);
$buffer = '';
@ -932,7 +941,7 @@ class Tokenizer
}
$this->scanner->unconsume($len);
return $buffer == $sequence;
return $caseSensitive ? $buffer == $sequence : strcasecmp($buffer, $sequence) === 0;
}
/**
@ -1056,8 +1065,14 @@ class Tokenizer
// [a-zA-Z0-9]+;
$cname = $this->scanner->getAsciiAlpha();
$entity = CharacterReference::lookupName($cname);
// When no entity is found provide the name of the unmatched string
// and continue on as the & is not part of an entity. The & will
// be converted to &amp; elsewhere.
if ($entity == null) {
$this->parseError("No match in entity table for '%s'", $entity);
$this->parseError("No match in entity table for '%s'", $cname);
$this->scanner->unconsume($this->scanner->position() - $start);
return '&';
}
}

View File

@ -115,9 +115,11 @@ class OutputRules implements \Masterminds\HTML5\Serializer\RulesInterface
/**
 * Serialize a whole DOM document.
 *
 * Always emits the doctype first. If the document has a root element,
 * that element is handed to the traverser and nl() is called afterwards;
 * a document without a root element produces the doctype only.
 *
 * @param \DOMDocument $dom The document to serialize.
 */
public function document($dom)
{
    $this->doctype();

    // An empty document has no root element: nothing more to write.
    $root = $dom->documentElement;
    if (!$root) {
        return;
    }

    $this->traverser->node($root);
    $this->nl();
}
protected function doctype()
{

View File

@ -112,7 +112,7 @@ class Traverser
break;
// Currently we don't support embedding DTDs.
default:
print '<!-- Skipped -->';
//print '<!-- Skipped -->';
break;
}
}

View File

@ -2,8 +2,9 @@
Copyright (c) 2013 The Authors of HTML5-PHP
Matt Butcher - technosophos@gmail.com
Matt Butcher - mattbutcher@google.com
Matt Farina - matt@mattfarina.com
Asmir Mustafic - goetas@gmail.com
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in

View File

@ -10,6 +10,7 @@ But after some initial refactoring work, we began a new parser.
- Event-based (SAX-like) parser
- DOM tree builder
- Interoperability with QueryPath [[in progress](https://github.com/technosophos/querypath/issues/114)]
- Runs on **PHP** 5.3.0 or newer and **HHVM** 3.2 or newer
[![Build Status](https://travis-ci.org/Masterminds/html5-php.png?branch=master)](https://travis-ci.org/Masterminds/html5-php) [![Latest Stable Version](https://poser.pugx.org/masterminds/html5/v/stable.png)](https://packagist.org/packages/masterminds/html5) [![Coverage Status](https://coveralls.io/repos/Masterminds/html5-php/badge.png?branch=master)](https://coveralls.io/r/Masterminds/html5-php?branch=master)
@ -22,12 +23,12 @@ To install, add `masterminds/html5` to your `composer.json` file:
```
{
"require" : {
"masterminds/html5": "1.*"
"masterminds/html5": "2.*"
},
}
```
(You may substitute `1.*` for a more specific release tag, of
(You may substitute `2.*` for a more specific release tag, of
course.)
From there, use the `composer install` or `composer update` commands to
@ -43,6 +44,7 @@ Here is how you use the high-level `HTML5` library API:
<?php
// Assuming you installed from Composer:
require "vendor/autoload.php";
use Masterminds\HTML5;
// An example HTML document:
@ -59,13 +61,14 @@ $html = <<< 'HERE'
HERE;
// Parse the document. $dom is a DOMDocument.
$dom = HTML5::loadHTML($html);
$html5 = new HTML5();
$dom = $html5->loadHTML($html);
// Render it as HTML5:
print HTML5::saveHTML($dom);
print $html5->saveHTML($dom);
// Or save it to a file:
HTML5::save($dom, 'out.html');
$html5->save($dom, 'out.html');
?>
```
@ -73,6 +76,35 @@ HTML5::save($dom, 'out.html');
The `$dom` created by the parser is a full `DOMDocument` object. And the
`save()` and `saveHTML()` methods will take any DOMDocument.
### Options
It is possible to pass in an array of configuration options when loading
an HTML5 document.
```php
// An associative array of options
$options = array(
'option_name' => 'option_value',
);
// Provide the options to the constructor
$html5 = new HTML5($options);
$dom = $html5->loadHTML($html);
```
The following options are supported:
* `encode_entities` (boolean): Indicates that the serializer should aggressively
encode characters as entities. Without this, it only encodes the bare
minimum.
* `disable_html_ns` (boolean): Prevents the parser from automatically
assigning the HTML5 namespace to the DOM document. This is for
non-namespace aware DOM tools.
* `target_document` (\DOMDocument): A DOM document that will be used as the
destination for the parsed nodes.
* `implicit_namespaces` (array): An assoc array of namespaces that should be
used by the parser. Name is tag prefix, value is NS URI.
## The Low-Level API
@ -116,7 +148,7 @@ different rule sets to be used.
- The `Traverser`, which is a special-purpose tree walker. It visits
each node in the tree and uses the `OutputRules` to transform the node
into a string.
- `\HTML5` manages the `Traverser` and stores the resultant data
- `HTML5` manages the `Traverser` and stores the resultant data
in the correct place.
The serializer (`save()`, `saveHTML()`) follows the
@ -134,7 +166,9 @@ issues known issues that are not presently on the roadmap:
- Namespaces: HTML5 only [supports a selected list of namespaces](http://www.w3.org/TR/html5/infrastructure.html#namespaces)
and they do not operate in the same way as XML namespaces. A `:` has no special
meaning. The parser does not support XML style namespaces via `:`.
meaning.
By default the parser does not support XML style namespaces via `:`;
to enable the XML namespaces see the [XML Namespaces section](#xml-namespaces)
- Scripts: This parser does not contain a JavaScript or a CSS
interpreter. While one may be supplied, not all features will be
supported.
@ -162,8 +196,45 @@ issues known issues that are not presently on the roadmap:
- PLAINTEXT: Unsupported.
- Adoption Agency Algorithm: Not yet implemented. (8.2.5.4.7)
## XML Namespaces
To use XML-style namespaces, you have to configure the main `HTML5` instance accordingly.
```php
use Masterminds\HTML5;
$html = new HTML5(array(
"xmlNamespaces" => true
));
$dom = $html->loadHTML('<t:tag xmlns:t="http://www.example.com"/>');
$dom->documentElement->namespaceURI; // http://www.example.com
```
You can also add some default prefixes that will not require a namespace declaration,
but their elements will still be namespaced.
```php
use Masterminds\HTML5;
$html = new HTML5(array(
"implicitNamespaces"=>array(
"t"=>"http://www.example.com"
)
));
$dom = $html->loadHTML('<t:tag/>');
$dom->documentElement->namespaceURI; // http://www.example.com
```
## Thanks to...
The dedicated (and patient) contributors of patches small and large,
who have already made this library better. See the CREDITS file for
a list of contributors.
We owe a huge debt of gratitude to the original authors of html5lib.
While not much of the original parser remains, we learned a lot from

View File

@ -1,5 +1,42 @@
# Release Notes
2.1.1 (2015-03-23)
- #78: Fixes bug where an unmatched entity-like string drops everything after &.
2.1.0 (2015-02-01)
- #74: Added `disable_html_ns` and `target_document` dom parsing options
- Unified option names
- #73: Fixed alphabet, &szlig; now can be detected
- #75 and #76: Allow whitespace in RCDATA tags
- #77: Fixed parsing blunder for json embeds
- #72: Add options to HTML methods
2.0.2 (2014-12-17)
- #50: empty document handling
- #63: tags with strange capitalization
- #65: dashes and underscores as allowed characters in tag names
- #68: Fixed issue with non-inline elements inside inline containers
2.0.1 (2014-09-23)
- #59: Fixed issue parsing some fragments.
- #56: Incorrectly saw 0 as empty string
- Sami as new documentation generator
2.0.0 (2014-07-28)
- #53: Improved boolean attributes handling
- #52: Facebook HHVM compatibility
- #48: Adopted PSR-2 as coding standard
- #47: Moved everything to Masterminds namespace
- #45: Added custom namespaces
- #44: Added support to XML-style namespaces
- #37: Refactored HTML5 class removing static methods
1.0.5 (2014-06-10)
- #38: Set the dev-master branch as the 1.0.x branch for composer (goetas)
- #34: Tests use PSR-4 for autoloading. (goetas)
- #40, #41: Fix entity handling in RCDATA sections. (KitaitiMakoto)
- #32: Fixed issue where character references were being incorrectly encoded in style tags.
1.0.4 (2014-04-29)
- #30/#31 Don't throw an exception for invalid tag names.

View File

@ -7,11 +7,11 @@
* For environments which do not have these options, it reverts to standard sequential
* requests (using file_get_contents())
*
* @version 1.5
* @date 2014-03-28
* @see http://php.net/HttpRequestPool
* @version 1.6
* @date 2015-06-05
* @see http://devel-m6w6.rhcloud.com/mdref/http
* @author Keyvan Minoukadeh
* @copyright 2011-2014 Keyvan Minoukadeh
* @copyright 2011-2015 Keyvan Minoukadeh
* @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
*/
@ -22,7 +22,7 @@ class HumbleHttpAgent
const METHOD_FILE_GET_CONTENTS = 4;
//const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1';
const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2';
const UA_PHP = 'PHP/5.4';
const UA_PHP = 'PHP/5.5';
const REF_GOOGLE = 'http://www.google.co.uk/url?sa=t&source=web&cd=1';
protected $requests = array();
@ -38,6 +38,7 @@ class HumbleHttpAgent
public $debug = false;
public $debugVerbose = false;
public $rewriteHashbangFragment = true; // see http://code.google.com/web/ajaxcrawling/docs/specification.html
public $siteConfigBuilder = null; // can be set to an instance of ContentExtractor to have site config files used for custom HTTP headers
public $maxRedirects = 5;
public $userAgentMap = array();
public $rewriteUrls = array();
@ -67,7 +68,7 @@ class HumbleHttpAgent
if (in_array($method, array(1,2,4))) {
$this->method = $method;
} else {
if (class_exists('HttpRequestPool')) {
if (class_exists('http\Client\Request')) {
$this->method = self::METHOD_REQUEST_POOL;
} elseif (function_exists('curl_multi_init')) {
$this->method = self::METHOD_CURL_MULTI;
@ -192,6 +193,7 @@ class HumbleHttpAgent
return false;
}
$redirect_url = $match[1];
$redirect_url = htmlspecialchars_decode($redirect_url); // For Facebook!
if (preg_match('!^https?://!i', $redirect_url)) {
// already absolute
$this->debug('Meta refresh redirect found (http-equiv="refresh"), new URL: '.$redirect_url);
@ -203,7 +205,7 @@ class HumbleHttpAgent
if (isset($base->path)) $base->path = preg_replace('!//+!', '/', $base->path);
if ($absolute = SimplePie_IRI::absolutize($base, $redirect_url)) {
$this->debug('Meta refresh redirect found (http-equiv="refresh"), new URL: '.$absolute);
return $absolute;
return $absolute->get_iri();
}
return false;
}
@ -293,14 +295,16 @@ class HumbleHttpAgent
if (empty($urls)) return;
//////////////////////////////////////////////////////
// parallel (HttpRequestPool)
// parallel (HTTP extension)
if ($this->method == self::METHOD_REQUEST_POOL) {
$this->debug('Starting parallel fetch (HttpRequestPool)');
$this->debug('Starting parallel fetch (HTTP Extension)');
try {
while (count($urls) > 0) {
$this->debug('Processing set of '.min($this->maxParallelRequests, count($urls)));
$subset = array_splice($urls, 0, $this->maxParallelRequests);
$pool = new HttpRequestPool();
//$pool = new HttpRequestPool();
$pool = new http\Client;
$pool->setOptions($this->requestOptions);
foreach ($subset as $orig => $url) {
if (!$isRedirect) $orig = $url;
unset($this->redirectQueue[$orig]);
@ -320,24 +324,62 @@ class HumbleHttpAgent
$req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url;
$req_url = $this->removeFragment($req_url);
if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) {
$_meth = HttpRequest::METH_HEAD;
$_meth = "HEAD";
} else {
$_meth = HttpRequest::METH_GET;
$_meth = "GET";
unset($this->requests[$orig]['wrongGuess']);
}
$httpRequest = new HttpRequest($req_url, $_meth, $this->requestOptions);
// send cookies, if we have any
if ($cookies = $this->getCookies($orig, $req_url)) {
$this->debug("......sending cookies: $cookies");
$httpRequest->addHeaders(array('Cookie' => $cookies));
//$httpRequest = new HttpRequest($req_url, $_meth, $this->requestOptions);
$httpRequest = new http\Client\Request($_meth, $req_url);
$httpRequest->setOptions($this->requestOptions);
// check site config for additional http headers
$scHeaders = array();
if (isset($this->siteConfigBuilder)) {
$scHeaders = $this->siteConfigBuilder->buildSiteConfig($req_url)->http_header;
}
//$httpRequest->addHeaders(array('User-Agent' => $this->userAgent));
$httpRequest->addHeaders($this->getUserAgent($req_url, true));
// send cookies, if we have any
$_cookies = null;
if (isset($scHeaders['cookie'])) {
$_cookies = $scHeaders['cookie'];
} else {
//$_cookies = $this->cookieJar->getMatchingCookies($req_url);
$_cookies = $this->getCookies($orig, $req_url);
}
if ($_cookies) {
$this->debug("......sending cookies: $_cookies");
$httpRequest->addHeaders(array('Cookie' => $_cookies));
}
// send user agent
$_ua = null;
if (isset($scHeaders['user-agent'])) {
$_ua = $scHeaders['user-agent'];
} else {
$_ua = $this->getUserAgent($req_url, true);
$_ua = $_ua['User-Agent'];
}
if ($_ua) {
$this->debug("......user-agent set to: $_ua");
$httpRequest->addHeaders(array('User-Agent' => $_ua));
}
// add referer for picky sites
$httpRequest->addheaders(array('Referer' => $this->referer));
$_referer = null;
if (isset($scHeaders['referer'])) {
$_referer = $scHeaders['referer'];
} else {
$_referer = $this->referer;
}
if ($_referer) {
$this->debug("......referer set to: $_referer");
$httpRequest->addheaders(array('Referer'=>$_referer));
}
$this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest);
$this->requests[$orig]['original_url'] = $orig;
$pool->attach($httpRequest);
$pool->enqueue($httpRequest);
}
}
// did we get anything into the pool?
@ -345,16 +387,20 @@ class HumbleHttpAgent
$this->debug('Sending request...');
try {
$pool->send();
} catch (HttpRequestPoolException $e) {
} catch (http\Exception $e) {
// do nothing
}
$this->debug('Received responses');
foreach($subset as $orig => $url) {
if (!$isRedirect) $orig = $url;
$request = $this->requests[$orig]['httpRequest'];
$response = $pool->getResponse($request);
//$this->requests[$orig]['headers'] = $this->headersToString($request->getResponseHeader());
// getResponseHeader() doesn't return status line, so, for consistency...
$this->requests[$orig]['headers'] = substr($request->getRawResponseMessage(), 0, $request->getResponseInfo('header_size'));
//$headers = $response->toString();
$this->requests[$orig]['headers'] = $response->getInfo()."\n".$this->headersToString($response->getHeaders(), true);
// v1 HTTP extension code
//$this->requests[$orig]['headers'] = substr($request->getRawResponseMessage(), 0, $request->getResponseInfo('header_size'));
// check content type
// TODO: use getResponseHeader('content-type') or getResponseInfo()
if ($this->headerOnlyType($this->requests[$orig]['headers'])) {
@ -362,25 +408,37 @@ class HumbleHttpAgent
$_header_only_type = true;
$this->debug('Header only type returned');
} else {
$this->requests[$orig]['body'] = $request->getResponseBody();
$this->requests[$orig]['body'] = $response->getBody()->toString();
//var_dump($this->requests[$orig]['body']);exit;
// v1 HTTP ext. code
//$this->requests[$orig]['body'] = $request->getResponseBody();
$_header_only_type = false;
}
$this->requests[$orig]['effective_url'] = $request->getResponseInfo('effective_url');
$this->requests[$orig]['status_code'] = $status_code = $request->getResponseCode();
$this->requests[$orig]['effective_url'] = $response->getTransferInfo('effective_url');
$this->requests[$orig]['status_code'] = $status_code = $response->getResponseCode();
// v1 HTTP ext. code
//$this->requests[$orig]['effective_url'] = $request->getResponseInfo('effective_url');
//$this->requests[$orig]['status_code'] = $status_code = $request->getResponseCode();
// is redirect?
if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && $request->getResponseHeader('location')) {
$redirectURL = $request->getResponseHeader('location');
if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && $response->getHeader('location')) {
// v1 HTTP ext. code
//if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && $request->getResponseHeader('location')) {
$redirectURL = $response->getHeader('location');
if (!preg_match('!^https?://!i', $redirectURL)) {
$redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
}
if ($this->validateURL($redirectURL)) {
$this->debug('Redirect detected. Valid URL: '.$redirectURL);
// store any cookies
//$cookies = $request->getResponseHeader('set-cookie');
//if ($cookies && !is_array($cookies)) $cookies = array($cookies);
//if ($cookies) $this->cookieJar->storeCookies($url, $cookies);
$this->storeCookies($orig, $url);
$this->redirectQueue[$orig] = $redirectURL;
} else {
$this->debug('Redirect detected. Invalid URL: '.$redirectURL);
}
} elseif (!$_header_only_type && $request->getMethod() === HttpRequest::METH_HEAD) {
} elseif (!$_header_only_type && $request->getRequestMethod() == "HEAD") {
// the response content-type did not match our 'header only' types,
// but we'd issued a HEAD request because we assumed it would. So
// let's queue a proper GET request for this item...
@ -399,7 +457,7 @@ class HumbleHttpAgent
}
}
//die($url.' -multi- '.$request->getResponseInfo('effective_url'));
$pool->detach($request);
$pool->dequeue($request);
unset($this->requests[$orig]['httpRequest'], $request);
/*
if ($this->minimiseMemoryUse) {
@ -411,7 +469,7 @@ class HumbleHttpAgent
}
}
}
} catch (HttpException $e) {
} catch (http\Exception $e) {
$this->debug($e);
return false;
}
@ -452,15 +510,51 @@ class HumbleHttpAgent
unset($this->requests[$orig]['wrongGuess']);
}
$headers = array();
//$headers[] = 'User-Agent: '.$this->userAgent;
$headers[] = $this->getUserAgent($req_url);
// add referer for picky sites
$headers[] = 'Referer: '.$this->referer;
// send cookies, if we have any
if ($cookies = $this->getCookies($orig, $req_url)) {
$this->debug("......sending cookies: $cookies");
$headers[] = 'Cookie: '.$cookies;
// check site config for additional http headers
$scHeaders = array();
if (isset($this->siteConfigBuilder)) {
$scHeaders = $this->siteConfigBuilder->buildSiteConfig($req_url)->http_header;
}
// send cookies, if we have any
$_cookies = null;
if (isset($scHeaders['cookie'])) {
$_cookies = $scHeaders['cookie'];
} else {
//$_cookies = $this->cookieJar->getMatchingCookies($req_url);
$_cookies = $this->getCookies($orig, $req_url);
}
if ($_cookies) {
$this->debug("......sending cookies: $_cookies");
$headers[] = 'Cookie: '.$_cookies;
}
// send user agent
$_ua = null;
if (isset($scHeaders['user-agent'])) {
$_ua = $scHeaders['user-agent'];
} else {
$_ua = $this->getUserAgent($req_url, true);
$_ua = $_ua['User-Agent'];
}
if ($_ua) {
$this->debug("......user-agent set to: $_ua");
$headers[] = 'User-Agent: '.$_ua;
}
// add referer for picky sites
$_referer = null;
if (isset($scHeaders['referer'])) {
$_referer = $scHeaders['referer'];
} else {
$_referer = $this->referer;
}
if ($_referer) {
$this->debug("......referer set to: $_referer");
$headers[] = 'Referer: '.$_referer;
}
$httpRequest = new RollingCurlRequest($req_url, $_meth, null, $headers, $this->curlOptions);
$httpRequest->set_original_url($orig);
$this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest);
@ -494,6 +588,9 @@ class HumbleHttpAgent
}
if ($this->validateURL($redirectURL)) {
$this->debug('Redirect detected. Valid URL: '.$redirectURL);
// store any cookies
//$cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']);
//if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies);
$this->storeCookies($orig, $url);
$this->redirectQueue[$orig] = $redirectURL;
} else {
@ -548,15 +645,52 @@ class HumbleHttpAgent
$req_url = $this->rewriteUrls($url);
$req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url;
$req_url = $this->removeFragment($req_url);
// send cookies, if we have any
$httpContext = $this->httpContext;
$httpContext['http']['header'] .= $this->getUserAgent($req_url)."\r\n";
// add referer for picky sites
$httpContext['http']['header'] .= 'Referer: '.$this->referer."\r\n";
if ($cookies = $this->getCookies($orig, $req_url)) {
$this->debug("......sending cookies: $cookies");
$httpContext['http']['header'] .= 'Cookie: '.$cookies."\r\n";
// check site config for additional http headers
$scHeaders = array();
if (isset($this->siteConfigBuilder)) {
$scHeaders = $this->siteConfigBuilder->buildSiteConfig($req_url)->http_header;
}
// send cookies, if we have any
$_cookies = null;
if (isset($scHeaders['cookie'])) {
$_cookies = $scHeaders['cookie'];
} else {
//$_cookies = $this->cookieJar->getMatchingCookies($req_url);
$_cookies = $this->getCookies($orig, $req_url);
}
if ($_cookies) {
$this->debug("......sending cookies: $_cookies");
$httpContext['http']['header'] .= 'Cookie: '.$_cookies."\r\n";
}
// send user agent
$_ua = null;
if (isset($scHeaders['user-agent'])) {
$_ua = $scHeaders['user-agent'];
} else {
$_ua = $this->getUserAgent($req_url, true);
$_ua = $_ua['User-Agent'];
}
if ($_ua) {
$this->debug("......user-agent set to: $_ua");
$httpContext['http']['header'] .= 'User-Agent: '.$_ua."\r\n";
}
// add referer for picky sites
$_referer = null;
if (isset($scHeaders['referer'])) {
$_referer = $scHeaders['referer'];
} else {
$_referer = $this->referer;
}
if ($_referer) {
$this->debug("......referer set to: $_referer");
$httpContext['http']['header'] .= 'Referer: '.$_referer."\r\n";
}
if (false !== ($html = @file_get_contents($req_url, false, stream_context_create($httpContext)))) {
$this->debug('Received response');
// get status code
@ -585,6 +719,9 @@ class HumbleHttpAgent
}
if ($this->validateURL($redirectURL)) {
$this->debug('Redirect detected. Valid URL: '.$redirectURL);
// store any cookies
//$cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']);
//if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies);
$this->storeCookies($orig, $url);
$this->redirectQueue[$orig] = $redirectURL;
} else {
@ -680,7 +817,7 @@ class HumbleHttpAgent
}
public function parallelSupport() {
return class_exists('HttpRequestPool') || function_exists('curl_multi_init');
return class_exists('http\Client') || function_exists('curl_multi_init');
}
private function headerOnlyType($headers) {
@ -727,6 +864,7 @@ class HumbleHttpAgent
/**
 * Discard all stored cookies by resetting the cookie jar to an empty array.
 */
protected function deleteCookies() {
$this->cookieJar = array();
}
}
// gzdecode from http://www.php.net/manual/en/function.gzdecode.php#82930

View File

@ -22,6 +22,7 @@ class HumbleHttpAgentDummy
public $userAgentMap = array();
public $rewriteUrls = array();
public $userAgentDefault;
public $siteConfigBuilder = null;
public $referer;
protected $body = '';

View File

@ -12,7 +12,7 @@
* More information: http://fivefilters.org/content-only/
* License: Apache License, Version 2.0
* Requires: PHP5
* Date: 2014-03-27
* Date: 2015-06-01
*
* Differences between the PHP port and the original
* ------------------------------------------------------
@ -95,7 +95,7 @@ class Readability
// 'trimRe' => '/^\s+|\s+$/g', // PHP has trim()
'normalize' => '/\s{2,}/',
'killBreaks' => '/(<br\s*\/?>(\s|&nbsp;?)*){1,}/',
'video' => '!//(player\.|www\.)?(youtube\.com|vimeo\.com|viddler\.com|twitch\.tv)!i',
'video' => '!//(player\.|www\.)?(youtube\.com|vimeo\.com|viddler\.com|soundcloud\.com|twitch\.tv)!i',
'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i'
);
@ -121,8 +121,12 @@ class Readability
if (version_compare(PHP_VERSION, '5.3.0') >= 0) {
//use Masterminds\HTML5;
$html5class = 'Masterminds\HTML5';
$html5 = new $html5class();
$html5 = new $html5class(array('disable_html_ns' => true));
$this->dom = $html5->loadHTML($html);
//echo $html5->saveHTML($this->dom);exit;
//$xpath = new DOMXPath($this->dom);
//$elems = $xpath->query("//a");
//print_r($elems);exit;
}
}
if ($this->dom === null) {
@ -314,7 +318,11 @@ class Readability
$styleTags = $this->dom->getElementsByTagName('style');
for ($i = $styleTags->length-1; $i >= 0; $i--)
{
$styleTags->item($i)->parentNode->removeChild($styleTags->item($i));
try {
@$styleTags->item($i)->parentNode->removeChild($styleTags->item($i));
} catch (Exception $e) {
// Do nothing
}
}
/* Turn all double br's into p's */
@ -832,7 +840,11 @@ class Readability
$scripts = $doc->getElementsByTagName('script');
for($i = $scripts->length-1; $i >= 0; $i--)
{
try {
$scripts->item($i)->parentNode->removeChild($scripts->item($i));
} catch (Exception $e) {
// do nothing
}
}
}

View File

@ -1,10 +1,10 @@
<?php
// Full-Text RSS: Create Full-Text Feeds
// Author: Keyvan Minoukadeh
// Copyright (c) 2014 Keyvan Minoukadeh
// Copyright (c) 2015 Keyvan Minoukadeh
// License: AGPLv3
// Version: 3.4
// Date: 2014-08-28
// Version: 3.5
// Date: 2015-05-29
// More info: http://fivefilters.org/content-only/
// Help: http://help.fivefilters.org
@ -30,6 +30,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
error_reporting(E_ALL ^ E_NOTICE);
libxml_use_internal_errors(true);
libxml_disable_entity_loader(true);
ini_set("display_errors", 1);
@set_time_limit(120);
@ -234,7 +235,7 @@ if (isset($_REQUEST['accept']) && in_array(strtolower($_REQUEST['accept']), arra
$user_submitted_config = null;
if (isset($_REQUEST['siteconfig'])) {
$user_submitted_config = $_REQUEST['siteconfig'];
if (!$options->user_submitted_content && $user_submitted_config) {
if (!$options->user_submitted_config && $user_submitted_config) {
die('User-submitted site configs are currently disabled. Please remove the siteconfig parameter.');
}
}
@ -526,7 +527,8 @@ if (isset($_REQUEST['inputhtml']) && _FF_FTR_MODE == 'simple') {
}
$http = new HumbleHttpAgent($_req_options);
$http->debug = $debug_mode;
$http->userAgentMap = $options->user_agents;
// User agents can now be set in site config files using the http_header directive
//$http->userAgentMap = $options->user_agents;
$http->headerOnlyTypes = array_keys($options->content_type_exc);
$http->rewriteUrls = $options->rewrite_url;
unset($_req_options);
@ -545,6 +547,7 @@ $extractor->parserOverride = $parser;
if ($options->user_submitted_config && $user_submitted_config) {
$extractor->setUserSubmittedConfig($user_submitted_config);
}
$http->siteConfigBuilder = $extractor;
////////////////////////////////
// Get RSS/Atom feed
@ -655,7 +658,7 @@ $items = $feed->get_items(0, $max);
$urls_sanitized = array();
$urls = array();
foreach ($items as $key => $item) {
$permalink = htmlspecialchars_decode($item->get_permalink());
$permalink = htmlspecialchars_decode(trim($item->get_permalink()));
// Colons in URL path segments get encoded by SimplePie, yet some sites expect them unencoded
$permalink = str_replace('%3A', ':', $permalink);
// validateUrl() strips non-ascii characters
@ -974,6 +977,13 @@ foreach ($items as $key => $item) {
}
}
// add open graph
if ($opengraph = $extractor->getOpenGraph()) {
foreach ($opengraph as $og_prop => $og_val) {
$newitem->addElement($og_prop, $og_val);
}
}
// add language
if ($detect_language) {
$language = $extractor->getLanguage();
@ -1390,6 +1400,17 @@ function get_single_page($item, $html, $url) {
// Loop through single_page_link xpath expressions
$single_page_url = null;
foreach ($splink as $pattern) {
// Do we have conditions?
$condition = $site_config->get_if_page_contains_condition('single_page_link', $pattern);
if ($condition) {
$elems = @$xpath->evaluate($condition, $readability->dom);
if ($elems instanceof DOMNodeList && $elems->length > 0) {
// all fine
} else {
// move on to next single page link XPath
continue;
}
}
$elems = @$xpath->evaluate($pattern, $readability->dom);
if (is_string($elems)) {
$single_page_url = trim($elems);

3
robots.txt Normal file
View File

@ -0,0 +1,3 @@
User-agent: *
Disallow: /makefulltextfeed.php
Disallow: /extract.php

View File

@ -0,0 +1,3 @@
<?php
// this is here to prevent directory listing over the web
?>