Full-Text RSS 3.7

This commit is contained in:
FiveFilters.org 2019-04-04 23:23:27 +02:00
parent 9658f6a00b
commit d3009c43e3
62 changed files with 1979 additions and 656 deletions

View File

@ -2,6 +2,26 @@ FiveFilters.org: Full-Text RSS
http://fivefilters.org/content-only/
CHANGELOG
------------------------------------
3.7 (2017-02-12)
- Request HTML5 output using HTML5-PHP - new config option $options->html5_output and new request parameter &content=html5
- Improve support for lazy-loading images
- Feed preview now displays RTL content correctly (added dir='auto' to feed.xsl)
- New request parameter images=0 to remove all images from extracted content
- Open Graph and Twitter card metadata now returned in JSON output (no longer in RSS output)
- Metadata now returned in extract.php even if article extraction fails
- Additional data returned in extract.php for developers: 'domain', 'word_count'
- HTML5-PHP library updated
- SimplePie library updated (fixes PHP 7.1 issue)
- New VPS Puppet script (ubuntu-16.04.pp) - installs PHP 7 and Gumbo PHP extension for faster HTML5 parsing
- Bug fix: Language detection now works correctly with PHP 7
- Bug fix: Take base href URL into account when following next_page/single_page links (thanks Lukas!)
- Bug fix: VPS Puppet script installs new version of PECL HTTP extension that fixes problem when requesting punycode encoded domains
- Site config files updated for better extraction
- Compatibility test file updated (will tell you if Gumbo PHP will be used)
- Tidy won't be used to repair HTML if using an HTML5 parser (unless explicitly requested in site config file - tidy: yes)
- New config option $options->blocked_message - set what a user will see when a URL is blocked by Full-Text RSS
- Other fixes/improvements
3.6 (2016-02-21)
- Insert og:image (if we find one) at the top of the article when no images have been extracted
- Additional lazy image load handling - helps preserve more images designed for JS-enabled browsers

View File

@ -59,6 +59,20 @@ $options->max_entries = 10;
// from the output.
$options->content = 'user';
// HTML5 output
// ----------------------
// By default, Full-Text RSS uses libxml to convert the parsed DOM tree back into HTML.
// If this is enabled, we'll use HTML5-PHP to produce the HTML. This will be a little
// slower, but might produce better results, adhering to the HTML5 spec.
//
// Note: in a future release we might make HTML5 output the default.
//
// Possible values...
// HTML5 (slower): true
// libxml (faster): false
// libxml unless user overrides (&content=html5): 'user' (default)
$options->html5_output = 'user';
// Excerpts
// ----------------------
// By default Full-Text RSS does not include excerpts in the output.
@ -242,6 +256,16 @@ $options->allowed_urls = array();
// Note: for feeds, this option applies to both feed URLs and item URLs within those feeds.
$options->blocked_urls = array();
// Blocked message
// -----------------------
// If a request is blocked outright because of the two rules above, this is the message
// that is shown. Please note:
// * If the input URL is a feed and it's not blocked, feed items that are blocked will
// be skipped, and this message will not be shown.
// * If the input URL itself is blocked (feed or not), we will output this message instead
// of producing a feed.
$options->blocked_message = '<strong>URL blocked</strong>';
// Key holder(s) only?
// ----------------------
// Set this to true if you want to restrict access only to
@ -313,7 +337,7 @@ $options->xss_filter = 'user';
// Use effective URL in place of item URL
// ----------------------
// When we extract content for feed items, we often end up at a different URL than the
// one in the original feed. This is often a result of URL shorteners being used or
// one in the original feed. This is often a result of URL shorteners or
// tracking services being used by the feed publisher. We include the final
// (effective) URL we reached to get the content inside the dc:identifier field.
// If you enable this, we'll also use this URL in place of the original item URL
@ -500,7 +524,7 @@ $options->cache_cleanup = 100;
/// DO NOT CHANGE ANYTHING BELOW THIS ///////////
/////////////////////////////////////////////////
if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '3.6');
if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '3.7');
if (basename(__FILE__) == 'config.php') {
if (file_exists(dirname(__FILE__).'/custom_config.php')) {

View File

@ -10,13 +10,13 @@
<link rel="stylesheet" type="text/css" href="css/feed.css" />
</head>
<body>
<div id="explanation">
<div id="explanation" dir="auto">
<h1><xsl:value-of select="$title"/> <span class="small"> (full-text feed)</span></h1>
<p>You are viewing an auto-generated full-text <acronym title="Really Simple Syndication">RSS</acronym> feed. RSS feeds allow you to stay up to date with the latest news and features you want from websites.<br /><a href="{$subscribe}">Subscribe to this feed.</a></p>
<p>Below is the latest content available from this feed.</p>
</div>
<div id="content">
<div id="content" dir="auto">
<ul>
<xsl:for-each select="rss/channel/item">
<div class="article">

View File

@ -50,8 +50,12 @@ $_POST['accept'] = 'html';
$_POST['format'] = 'json';
// Enable excerpts
$_POST['summary'] = '1';
// Guess language if it's not already marked up
$_POST['lang'] = '2';
// Don't produce result if extraction fails
$_POST['exc'] = '1';
// Additional meta elements might still be useful (e.g. og/twitter elements),
// so we're commenting this out from FTR 3.7
//$_POST['exc'] = '1';
// Enable XSS filtering (unless explicitly disabled)
if (isset($_POST['xss']) && $_POST['xss'] !== '0') {
$_POST['xss'] = '1';

View File

@ -16,7 +16,7 @@ SimplePie.org. We have kept most of their checks intact as we use SimplePie in o
http://github.com/simplepie/simplepie/tree/master/compatibility_test/
*/
$app_name = 'Full-Text RSS 3.6';
$app_name = 'Full-Text RSS 3.7';
// Full-Text RSS is not yet compatible with HHVM, that's why we check for it with HHVM_VERSION.
//$php_ok = (function_exists('version_compare') && version_compare(phpversion(), '5.2.0', '>=') && !defined('HHVM_VERSION'));
@ -31,6 +31,7 @@ $curl_ok = function_exists('curl_exec');
$parallel_ok = ((extension_loaded('http') && class_exists('http\Client\Request')) || ($curl_ok && function_exists('curl_multi_init')));
$allow_url_fopen_ok = (bool)ini_get('allow_url_fopen');
$filter_ok = extension_loaded('filter');
$gumbo_ok = class_exists('Layershifter\Gumbo\Parser');
if (extension_loaded('xmlreader')) {
$xml_ok = true;
@ -376,15 +377,16 @@ div.chunk {
?>
<h4>HTML parser</h4>
<p>Full-Text RSS uses the fast libxml parser (the default PHP parser) but it can also make use of HTML5-PHP (an HTML5 parser written in PHP) if your version of PHP supports it. The latter might produce better results for some sites, especially if Tidy is not available on your server, however, it is slower than libxml.</p>
<p><?php echo $app_name; ?> uses the fast libxml parser (the default PHP parser) but it will automatically make use of Gumbo (a fast HTML5 parser) if the <a href="https://github.com/layershifter/gumbo-php">Gumbo PHP</a> extension is installed. Alternatively, HTML5-PHP (an HTML5 parser written in PHP) can be used by passing &amp;parser=html5 as a parameter. The latter might produce better results than libxml for some sites, but is a little slower.</p>
<?php
if (version_compare(PHP_VERSION, '5.3.0') >= 0) {
echo '<p class="highlight"><strong>HTML5-PHP</strong> can be used on this server.</p>';
if ($gumbo_ok) {
echo '<p class="highlight"><strong>Gumbo PHP</strong> will be used on this server.</p>';
} else {
echo '<p class="highlight">You need at least PHP 5.3 to be able to use HTML5-PHP.</p>';
echo '<p class="highlight">libxml will be used by default, unless HTML5 parsing is requested.</p>';
}
?>
<!--
<h4>Language detection</h4>
<p>Full-Text RSS can detect the language of each article processed. This occurs using <a href="http://pear.php.net/package/Text_LanguageDetect">Text_LanguageDetect</a> or <a href="https://github.com/lstrojny/php-cld">PHP-CLD</a> (if available).</p>
<?php
@ -394,7 +396,7 @@ div.chunk {
echo '<p class="highlight"><strong>Text_LanguageDetect</strong> will be used on this server.</p>';
}
?>
-->
<h4>Automatic site config updates</h4>
<p>Full-Text RSS can be configured to update its site config files (which determine how content should be extracted for certain sites) by downloading the latest set from our GitHub repository. This functionality is not required, and can be done manually. To configure this to occur automatically, you will need zip support enabled in PHP - we make use of the ZipArchive class.</p>
<?php

File diff suppressed because one or more lines are too long

View File

@ -5,10 +5,10 @@
* Uses patterns specified in site config files and auto detection (hNews/PHP Readability)
* to extract content from HTML files.
*
* @version 1.2
* @date 2016-02-21
* @version 1.3
* @date 2017-02-12
* @author Keyvan Minoukadeh
* @copyright 2016 Keyvan Minoukadeh
* @copyright 2017 Keyvan Minoukadeh
* @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
*/
@ -43,10 +43,12 @@ class ContentExtractor
protected $success = false;
protected $nextPageUrl;
protected $opengraph = array();
protected $twitterCard = array();
public $allowedParsers = array('libxml', 'html5php');
public $defaultParser = 'libxml';
public $parserOverride = null;
public $fingerprints = array();
public $stripImages = false;
public $readability;
public $debug = false;
public $debugVerbose = false;
@ -81,6 +83,7 @@ class ContentExtractor
$this->nextPageUrl = null;
$this->success = false;
$this->opengraph = array();
$this->twitterCard = array();
}
public function findHostUsingFingerprints($html) {
@ -188,22 +191,6 @@ class ContentExtractor
unset($_count);
}
// use tidy (if it exists)?
// This fixes problems with some sites which would otherwise
// trouble DOMDocument's HTML parsing. (Although sometimes it
// makes matters worse, which is why you can override it in site config files.)
$tidied = false;
if ($this->config->tidy() && function_exists('tidy_parse_string') && $smart_tidy) {
$this->debug('Using Tidy');
$tidy = tidy_parse_string($html, self::$tidy_config, 'UTF8');
if (tidy_clean_repair($tidy)) {
$original_html = $html;
$tidied = true;
$html = $tidy->value;
}
unset($tidy);
}
// load and parse html
if ($this->parserOverride) {
// from querystring: &parser=xxx
@ -218,6 +205,34 @@ class ContentExtractor
$this->debug("HTML parser $_parser not listed, using ".$this->defaultParser." instead");
$_parser = $this->defaultParser;
}
// Full-Text RSS 3.7...
if (class_exists('Layershifter\Gumbo\Parser')) {
$this->debug("Gumbo PHP extension will be used for HTML parsing");
$_parser = 'gumbo'; // fast HTML5 parser
}
// use tidy (if it exists)?
// This fixes problems with some sites which would otherwise
// trouble DOMDocument's HTML parsing. (Although sometimes it
// makes matters worse, which is why you can override it in site config files.)
$tidied = false;
if ($this->config->tidy() && function_exists('tidy_parse_string') && $smart_tidy) {
// if we're using HTML5 parser and no explicit tidy declaration in site config file
// we'll skip tidy
if (($_parser == 'gumbo' || $_parser == 'html5php') && ($this->config->tidy === null)) {
// No Tidy
} else {
$this->debug('Using Tidy');
$tidy = tidy_parse_string($html, self::$tidy_config, 'UTF8');
if (tidy_clean_repair($tidy)) {
$original_html = $html;
$tidied = true;
$html = $tidy->value;
}
unset($tidy);
}
}
$this->debug("Attempting to parse HTML with $_parser");
$this->readability = new Readability($html, $url, $_parser);
@ -321,7 +336,7 @@ class ContentExtractor
}
}
// try to open graph properties
// try to get open graph elements
$elems = @$xpath->query("//head//meta[@property='og:title' or @property='og:type' or @property='og:url' or @property='og:image' or @property='og:description']", $this->readability->dom);
// check for matches
if ($elems && $elems->length > 0) {
@ -339,6 +354,25 @@ class ContentExtractor
unset($_prop, $_val);
}
// try to get Twitter Card elements
// TODO: add more, but multiple colons, e.g. twitter:site:id cause problems for RSS validation (namespace). For the others, maybe only return in JSON output
$elems = @$xpath->query("//head//meta[@name='twitter:card' or @name='twitter:site' or @name='twitter:creator' or @name='twitter:description' or @name='twitter:title' or @name='twitter:image']", $this->readability->dom);
// check for matches
if ($elems && $elems->length > 0) {
$this->debug('Extracting Twiter Card elements');
foreach ($elems as $elem) {
if ($elem->hasAttribute('content')) {
$_prop = strtolower($elem->getAttribute('name'));
$_val = $elem->getAttribute('content');
// currently one of each is returned, so we keep the first one
if (!isset($this->twitterCard[$_prop])) {
$this->twitterCard[$_prop] = $_val;
}
}
}
unset($_prop, $_val);
}
// try to get date
foreach ($this->config->date as $pattern) {
$elems = @$xpath->evaluate($pattern, $this->readability->dom);
@ -707,6 +741,21 @@ class ContentExtractor
}
}
// Find date in Open Graph meta element
// http://ogp.me/#no_vertical
if ($detect_date) {
$elems = @$xpath->query("//meta[@property='article:published_time' and @content]", $this->readability->dom);
if ($elems && $elems->length == 1) {
$this->date = strtotime(trim($elems->item(0)->getAttribute('content')));
if ($this->date) {
$this->debug('Date found (article:published_time): '.date('Y-m-d H:i:s', $this->date));
$detect_date = false;
} else {
$this->date = null;
}
}
}
// Find date in pubdate marked time element
// For the same reason given above, we only use this
// if there's exactly one element.
@ -765,16 +814,29 @@ class ContentExtractor
}
}
// prevent self-closing iframes
if ($this->body->tagName === 'iframe') {
if (!$this->body->hasChildNodes()) {
$this->body->appendChild($this->body->ownerDocument->createTextNode('[embedded content]'));
}
} else {
$elems = $this->body->getElementsByTagName('iframe');
for ($i = $elems->length-1; $i >= 0; $i--) {
$e = $elems->item($i);
if (!$e->hasChildNodes()) {
$e->appendChild($this->body->ownerDocument->createTextNode('[embedded content]'));
// better to do this or to look for all elements not matching known void elements?
// Will requesting HTML5 output using HTML5-PHP fix this issue?
$_dont_self_close = array('iframe', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6');
foreach ($_dont_self_close as $_tagname) {
if ($this->body->tagName === $_tagname) {
if (!$this->body->hasChildNodes()) {
if ($_tagname === 'iframe') {
$this->body->appendChild($this->body->ownerDocument->createTextNode('[embedded content]'));
} else {
$this->body->appendChild($this->body->ownerDocument->createTextNode(''));
}
}
} else {
$elems = $this->body->getElementsByTagName($_tagname);
for ($i = $elems->length-1; $i >= 0; $i--) {
$e = $elems->item($i);
if (!$e->hasChildNodes()) {
if ($_tagname === 'iframe') {
$e->appendChild($this->body->ownerDocument->createTextNode('[embedded content]'));
} else {
$e->appendChild($this->body->ownerDocument->createTextNode(''));
}
}
}
}
}
@ -782,7 +844,7 @@ class ContentExtractor
// the plugin replaces the src attribute to point to a 1x1 gif and puts the original src
// inside the data-lazy-src attribute. It also places the original image inside a noscript element
// next to the amended one.
$elems = @$xpath->query("//img[@data-lazy-src]", $this->body);
$elems = @$xpath->query(".//img[@data-lazy-src]", $this->body);
for ($i = $elems->length-1; $i >= 0; $i--) {
$e = $elems->item($i);
// let's see if we can grab image from noscript
@ -800,19 +862,31 @@ class ContentExtractor
// now let's deal with another lazy load technique. Example:
// <img src="data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==" class="lazyload"
// data-src="http://i68.tinypic.com/2jabu8.jpg" alt="Image and video hosting by TinyPic" border="0" />
$elems = @$xpath->query("//img[@data-src and contains(@class, 'lazyload') and contains(@src, 'data:image')]", $this->body);
$elems = @$xpath->query(".//img[@data-src and (contains(@src, 'data:image') or contains(@src, '.gif'))]", $this->body);
for ($i = $elems->length-1; $i >= 0; $i--) {
$e = $elems->item($i);
$e->setAttribute('src', $e->getAttribute('data-src'));
$e->removeAttribute('data-src');
}
// If there's an og:image, but we have no images in the article, let's place it at the beginning of the article.
if ($this->body->hasChildNodes() && isset($this->opengraph['og:image']) && substr($this->opengraph['og:image'], 0, 4) === 'http') {
$elems = @$xpath->query("//img", $this->body);
if ($elems->length === 0) {
$_new_elem = $this->body->ownerDocument->createDocumentFragment();
@$_new_elem->appendXML('<div><img src="'.htmlspecialchars($this->opengraph['og:image']).'" class="ff-og-image-inserted" /></div>');
$this->body->insertBefore($_new_elem, $this->body->firstChild);
// Strip images?
if ($this->stripImages && $this->body->hasChildNodes()) {
$elems = @$xpath->query("//figure | //img | //figcaption", $this->body);
// check for matches
if ($elems && $elems->length > 0) {
$this->debug('Stripping images: '.$elems->length.' img/figure/figcaption elements');
for ($i=$elems->length-1; $i >= 0; $i--) {
@$elems->item($i)->parentNode->removeChild($elems->item($i));
}
}
} else {
// If there's an og:image, but we have no images in the article, let's place it at the beginning of the article.
if ($this->body->hasChildNodes() && isset($this->opengraph['og:image']) && substr($this->opengraph['og:image'], 0, 4) === 'http') {
$elems = @$xpath->query(".//img", $this->body);
if ($elems->length === 0) {
$_new_elem = $this->body->ownerDocument->createDocumentFragment();
@$_new_elem->appendXML('<div><img src="'.htmlspecialchars($this->opengraph['og:image']).'" class="ff-og-image-inserted" /></div>');
$this->body->insertBefore($_new_elem, $this->body->firstChild);
}
}
}
@ -850,6 +924,10 @@ class ContentExtractor
return $this->opengraph;
}
/**
 * Return the Twitter Card metadata gathered for the current document.
 *
 * The property is initialised to an empty array and is filled during
 * extraction with the 'content' value of matching twitter:* meta elements,
 * keyed by the lower-cased 'name' attribute (first occurrence wins).
 *
 * @return array Twitter Card values keyed by meta name; empty if none were found.
 */
public function getTwitterCard() {
return $this->twitterCard;
}
public function isNativeAd() {
return $this->nativeAd;
}

View File

@ -81,7 +81,7 @@ define('JSONP', 3);
*/
public function setChannelElementsFromArray($elementArray)
{
if(! is_array($elementArray)) return;
if(!is_array($elementArray)) return;
foreach ($elementArray as $elementName => $content)
{
$this->setChannelElement($elementName, $content);
@ -131,19 +131,33 @@ define('JSONP', 3);
$simplejson->language = null;
$simplejson->url = null;
$simplejson->effective_url = null;
$simplejson->domain = null;
$simplejson->word_count = null;
$simplejson->og_url = null;
$simplejson->og_title = null;
$simplejson->og_description = null;
$simplejson->og_image = null;
$simplejson->og_type = null;
$simplejson->twitter_card = null;
$simplejson->twitter_site = null;
$simplejson->twitter_creator = null;
$simplejson->twitter_image = null;
$simplejson->twitter_title = null;
$simplejson->twitter_description = null;
$simplejson->content = null;
// actual values
$simplejson->url = $jsonitem->link;
$simplejson->effective_url = $jsonitem->dc_identifier;
$simplejson->domain = strtolower(@parse_url($simplejson->effective_url, PHP_URL_HOST));
if (substr($simplejson->domain, 0, 4) === 'www.') {
$simplejson->domain = substr($simplejson->domain, 4);
}
if (isset($jsonitem->title)) $simplejson->title = $jsonitem->title;
if (isset($jsonitem->dc_language)) $simplejson->language = $jsonitem->dc_language;
if (isset($jsonitem->content_encoded)) {
$simplejson->content = $jsonitem->content_encoded;
// from http://php.net/manual/en/function.str-word-count.php#107363
$simplejson->word_count = count(preg_split('!\s+!', strip_tags($simplejson->content), -1, PREG_SPLIT_NO_EMPTY));
if (isset($jsonitem->description)) {
$simplejson->excerpt = $jsonitem->description;
}
@ -161,6 +175,12 @@ define('JSONP', 3);
if (isset($jsonitem->og_description)) $simplejson->og_description = $jsonitem->og_description;
if (isset($jsonitem->og_image)) $simplejson->og_image = $jsonitem->og_image;
if (isset($jsonitem->og_type)) $simplejson->og_type = $jsonitem->og_type;
if (isset($jsonitem->twitter_card)) $simplejson->twitter_card = $jsonitem->twitter_card;
if (isset($jsonitem->twitter_site)) $simplejson->twitter_site = $jsonitem->twitter_site;
if (isset($jsonitem->twitter_creator)) $simplejson->twitter_creator = $jsonitem->twitter_creator;
if (isset($jsonitem->twitter_image)) $simplejson->twitter_image = $jsonitem->twitter_image;
if (isset($jsonitem->twitter_title)) $simplejson->twitter_title = $jsonitem->twitter_title;
if (isset($jsonitem->twitter_description)) $simplejson->twitter_description = $jsonitem->twitter_description;
echo json_encode($simplejson);
}
}
@ -337,7 +357,8 @@ define('JSONP', 3);
{
$out = '<?xml version="1.0" encoding="utf-8"?>'."\n";
if ($this->xsl) $out .= '<?xml-stylesheet type="text/xsl" href="'.htmlspecialchars($this->xsl).'"?>' . PHP_EOL;
$out .= '<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:media="http://search.yahoo.com/mrss/" xmlns:og="http://ogp.me/ns#">' . PHP_EOL;
//$out .= '<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:media="http://search.yahoo.com/mrss/" xmlns:og="http://ogp.me/ns#" xmlns:twitter="https://dev.twitter.com/cards/markup">' . PHP_EOL;
$out .= '<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:media="http://search.yahoo.com/mrss/">' . PHP_EOL;
echo $out;
}
elseif ($this->version == JSON || $this->version == JSONP)
@ -495,6 +516,9 @@ define('JSONP', 3);
foreach ($itemElements as $thisElement) {
foreach ($thisElement as $instance) {
if ($this->version == RSS2) {
// Let's not include twitter and open graph elements in regular RSS output
// These are aimed more at developers, and so JSON is more appropriate
if (preg_match('/^(twitter|og):/i', $instance['name'])) continue;
echo $this->makeNode($instance['name'], $instance['content'], $instance['attributes']);
} elseif ($this->version == JSON || $this->version == JSONP) {
$_json_node = $this->makeNode($instance['name'], $instance['content'], $instance['attributes']);

View File

@ -1,8 +1,8 @@
<?php
/*
htmLawed 1.1.20, 9 June 2015
OOP code, 9 June 2015
htmLawed 1.1.22, 5 March 2016
OOP code, 27 February 2016
Copyright Santosh Patnaik
Dual LGPL v3 and GPL v2+ license
A PHP Labware internal utility; www.bioinformatics.org/phplabware/internal_utilities/htmLawed
@ -115,31 +115,39 @@ return $t;
// eof
}
public static function hl_attrval($t, $p){
public static function hl_attrval($a, $t, $p){
// check attr val against $S
$o = 1; $l = strlen($t);
foreach($p as $k=>$v){
switch($k){
case 'maxlen':if($l > $v){$o = 0;}
break; case 'minlen': if($l < $v){$o = 0;}
break; case 'maxval': if((float)($t) > $v){$o = 0;}
break; case 'minval': if((float)($t) < $v){$o = 0;}
break; case 'match': if(!preg_match($v, $t)){$o = 0;}
break; case 'nomatch': if(preg_match($v, $t)){$o = 0;}
break; case 'oneof':
$m = 0;
foreach(explode('|', $v) as $n){if($t == $n){$m = 1; break;}}
$o = $m;
break; case 'noneof':
$m = 1;
foreach(explode('|', $v) as $n){if($t == $n){$m = 0; break;}}
$o = $m;
break; default:
break;
static $ma = array('accesskey', 'class', 'rel');
$s = in_array($a, $ma) ? ' ' : '';
$r = array();
$t = !empty($s) ? explode($s, $t) : array($t);
foreach($t as $tk=>$tv){
$o = 1; $l = strlen($tv);
foreach($p as $k=>$v){
switch($k){
case 'maxlen': if($l > $v){$o = 0;}
break; case 'minlen': if($l < $v){$o = 0;}
break; case 'maxval': if((float)($tv) > $v){$o = 0;}
break; case 'minval': if((float)($tv) < $v){$o = 0;}
break; case 'match': if(!preg_match($v, $tv)){$o = 0;}
break; case 'nomatch': if(preg_match($v, $tv)){$o = 0;}
break; case 'oneof':
$m = 0;
foreach(explode('|', $v) as $n){if($tv == $n){$m = 1; break;}}
$o = $m;
break; case 'noneof':
$m = 1;
foreach(explode('|', $v) as $n){if($tv == $n){$m = 0; break;}}
$o = $m;
break; default:
break;
}
if(!$o){break;}
}
if(!$o){break;}
if($o){$r[] = $tv;}
}
return ($o ? $t : (isset($p['default']) ? $p['default'] : 0));
$r = implode($s, $r);
return (isset($r[0]) ? $r : (isset($p['default']) ? $p['default'] : 0));
// eof
}
@ -526,7 +534,7 @@ foreach($aA as $k=>$v){
}
}
}
if(isset($rl[$k]) && is_array($rl[$k]) && ($v = htmLawed::hl_attrval($v, $rl[$k])) === 0){continue;}
if(isset($rl[$k]) && is_array($rl[$k]) && ($v = htmLawed::hl_attrval($k, $v, $rl[$k])) === 0){continue;}
$a[$k] = str_replace('"', '&quot;', $v);
}
}
@ -628,16 +636,15 @@ if($e == 'u'){$e = 'span'; return 'text-decoration: underline;';}
static $fs = array('0'=>'xx-small', '1'=>'xx-small', '2'=>'small', '3'=>'medium', '4'=>'large', '5'=>'x-large', '6'=>'xx-large', '7'=>'300%', '-1'=>'smaller', '-2'=>'60%', '+1'=>'larger', '+2'=>'150%', '+3'=>'200%', '+4'=>'300%');
if($e == 'font'){
$a2 = '';
if(preg_match('`face\s*=\s*(\'|")([^=]+?)\\1`i', $a, $m) or preg_match('`face\s*=(\s*)(\S+)`i', $a, $m)){
$a2 .= ' font-family: '. str_replace('"', '\'', trim($m[2])). ';';
while(preg_match('`(^|\s)(color|size)\s*=\s*(\'|")?(.+?)(\\3|\s|$)`i', $a, $m)){
$a = str_replace($m[0], ' ', $a);
$a2 .= strtolower($m[2]) == 'color' ? (' color: '. str_replace('"', '\'', trim($m[4])). ';') : (isset($fs[($m = trim($m[4]))]) ? ($a2 .= ' font-size: '. str_replace('"', '\'', $fs[$m]). ';') : '');
}
if(preg_match('`color\s*=\s*(\'|")?(.+?)(\\1|\s|$)`i', $a, $m)){
$a2 .= ' color: '. str_replace('"', '\'', trim($m[2])). ';';
while(preg_match('`(^|\s)face\s*=\s*(\'|")?([^=]+?)\\2`i', $a, $m) or preg_match('`(^|\s)face\s*=(\s*)(\S+)`i', $a, $m)){
$a = str_replace($m[0], ' ', $a);
$a2 .= ' font-family: '. str_replace('"', '\'', trim($m[3])). ';';
}
if(preg_match('`size\s*=\s*(\'|")?(.+?)(\\1|\s|$)`i', $a, $m) && isset($fs[($m = trim($m[2]))])){
$a2 .= ' font-size: '. str_replace('"', '\'', $fs[$m]). ';';
}
$e = 'span'; return ltrim($a2);
$e = 'span'; return ltrim(str_replace('<', '', $a2));
}
if($t == 2){$e = 0; return 0;}
return '';
@ -701,7 +708,7 @@ return str_replace(array("\x01", "\x02", "\x03", "\x04", "\x05", "\x07"), array(
public static function hl_version(){
// rel
return '1.1.20';
return '1.1.22';
// eof
}

View File

@ -166,9 +166,10 @@ class HTML5
public function parse(\Masterminds\HTML5\Parser\InputStream $input, array $options = array())
{
$this->errors = array();
$events = new DOMTreeBuilder(false, array_merge($this->getOptions(), $options));
$options = array_merge($this->getOptions(), $options);
$events = new DOMTreeBuilder(false, $options);
$scanner = new Scanner($input);
$parser = new Tokenizer($scanner, $events);
$parser = new Tokenizer($scanner, $events, !empty($options['xmlNamespaces']) ? Tokenizer::CONFORMANT_XML: Tokenizer::CONFORMANT_HTML);
$parser->parse();
$this->errors = $events->getErrors();
@ -184,9 +185,10 @@ class HTML5
*/
public function parseFragment(\Masterminds\HTML5\Parser\InputStream $input, array $options = array())
{
$events = new DOMTreeBuilder(true, array_merge($this->getOptions(), $options));
$options = array_merge($this->getOptions(), $options);
$events = new DOMTreeBuilder(true, $options);
$scanner = new Scanner($input);
$parser = new Tokenizer($scanner, $events);
$parser = new Tokenizer($scanner, $events, !empty($options['xmlNamespaces']) ? Tokenizer::CONFORMANT_XML: Tokenizer::CONFORMANT_HTML);
$parser->parse();
$this->errors = $events->getErrors();

View File

@ -24,7 +24,7 @@ class Elements
const KNOWN_ELEMENT = 1;
// From section 8.1.2: "script", "style"
// From 8.2.5.4.7 ("in body" insertion mode): "noembed", "noscript"
// From 8.2.5.4.7 ("in body" insertion mode): "noembed"
// From 8.4 "style", "xmp", "iframe", "noembed", "noframes"
/**
* Indicates the contained text should be processed as raw text.
@ -79,7 +79,7 @@ class Elements
public static $html5 = array(
"a" => 1,
"abbr" => 1,
"address" => 89, // NORMAL | VOID_TAG | AUTOCLOSE_P | BLOCK_TAG
"address" => 65, // NORMAL | BLOCK_TAG
"area" => 9, // NORMAL | VOID_TAG
"article" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG
"aside" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG
@ -144,7 +144,7 @@ class Elements
"meta" => 9, // NORMAL | VOID_TAG
"meter" => 1,
"nav" => 17, // NORMAL | AUTOCLOSE_P,
"noscript" => 67, // NORMAL | TEXT_RAW | BLOCK_TAG
"noscript" => 65, // NORMAL | BLOCK_TAG
"object" => 1,
"ol" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG
"optgroup" => 1,
@ -557,7 +557,7 @@ class Elements
* @param string $name
* The name of the element.
*
* @return int The element mask.
* @return int|bool The element mask or false if element does not exist.
*/
public static function element($name)
{

View File

@ -69,7 +69,7 @@ interface EventHandler
* An array with all of the tag's attributes.
* @param boolean $selfClosing
* An indicator of whether or not this tag is self-closing (<foo/>)
* @return numeric One of the Tokenizer::TEXTMODE_* constants.
* @return int One of the Tokenizer::TEXTMODE_* constants.
*/
public function startTag($name, $attributes = array(), $selfClosing = false);

View File

@ -43,6 +43,10 @@ class Tokenizer
protected $textMode = 0; // TEXTMODE_NORMAL;
protected $untilTag = null;
const CONFORMANT_XML = 'xml';
const CONFORMANT_HTML = 'html';
protected $mode = self::CONFORMANT_HTML;
const WHITE = "\t\n\f ";
/**
@ -57,11 +61,13 @@ class Tokenizer
* @param \Masterminds\HTML5\Parser\EventHandler $eventHandler
* An event handler, initialized and ready to receive
* events.
* @param string $mode
*/
public function __construct($scanner, $eventHandler)
public function __construct($scanner, $eventHandler, $mode = self::CONFORMANT_HTML)
{
$this->scanner = $scanner;
$this->events = $eventHandler;
$this->mode = $mode;
}
/**
@ -299,7 +305,7 @@ class Tokenizer
}
elseif ($tok == 'D' || $tok == 'd') { // Doctype
return $this->doctype('');
return $this->doctype();
}
elseif ($tok == '[') { // CDATA section
@ -335,7 +341,8 @@ class Tokenizer
return $this->bogusComment('</');
}
$name = strtolower($this->scanner->charsUntil("\n\f \t>"));
$name = $this->scanner->charsUntil("\n\f \t>");
$name = $this->mode === self::CONFORMANT_XML ? $name: strtolower($name);
// Trash whitespace.
$this->scanner->whitespace();
@ -362,7 +369,8 @@ class Tokenizer
}
// We know this is at least one char.
$name = strtolower($this->scanner->charsWhile(":_-0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"));
$name = $this->scanner->charsWhile(":_-0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
$name = $this->mode === self::CONFORMANT_XML ? $name : strtolower($name);
$attributes = array();
$selfClose = false;

View File

@ -76,7 +76,6 @@ class TreeBuildingRules
case 'option':
return $this->closeIfCurrentMatches($new, $current, array(
'option',
'optgroup'
));
case 'tr':
return $this->closeIfCurrentMatches($new, $current, array(

View File

@ -48,10 +48,10 @@ class UTF8Utils
public static function countChars($string)
{
// Get the length for the string we need.
if (function_exists('iconv_strlen')) {
return iconv_strlen($string, 'utf-8');
} elseif (function_exists('mb_strlen')) {
if (function_exists('mb_strlen')) {
return mb_strlen($string, 'utf-8');
} elseif (function_exists('iconv_strlen')) {
return iconv_strlen($string, 'utf-8');
} elseif (function_exists('utf8_decode')) {
// MPB: Will this work? Won't certain decodes lead to two chars
// extrapolated out of 2-byte chars?
@ -94,10 +94,10 @@ class UTF8Utils
// application executing this library so we store the value, change it
// to our needs, and then change it back when we are done. This feels
// a little excessive and it would be great if there was a better way.
$save = ini_get('mbstring.substitute_character');
ini_set('mbstring.substitute_character', "none");
$save = mb_substitute_character();
mb_substitute_character('none');
$data = mb_convert_encoding($data, 'UTF-8', $encoding);
ini_set('mbstring.substitute_character', $save);
mb_substitute_character($save);
} // @todo Get iconv running in at least some environments if that is possible.
elseif (function_exists('iconv') && $encoding != 'auto') {
// fprintf(STDOUT, "iconv found\n");

View File

@ -185,7 +185,9 @@ class OutputRules implements \Masterminds\HTML5\Serializer\RulesInterface
{
$this->doctype();
if ($dom->documentElement) {
$this->traverser->node($dom->documentElement);
foreach ($dom->childNodes as $node) {
$this->traverser->node($node);
}
$this->nl();
}
}
@ -219,7 +221,11 @@ class OutputRules implements \Masterminds\HTML5\Serializer\RulesInterface
$this->openTag($ele);
if (Elements::isA($name, Elements::TEXT_RAW)) {
foreach ($ele->childNodes as $child) {
$this->wr($child->data);
if ($child instanceof \DOMCharacterData) {
$this->wr($child->data);
} elseif ($child instanceof \DOMElement) {
$this->element($child);
}
}
} else {
// Handle children.
@ -347,7 +353,7 @@ class OutputRules implements \Masterminds\HTML5\Serializer\RulesInterface
// the XML, XMLNS, or XLink NS's should use the canonical
// prefix. It seems that DOM does this for us already, but there
// may be exceptions.
$name = $node->name;
$name = $node->nodeName;
// Special handling for attributes in SVG and MathML.
// Using if/elseif instead of switch because it's faster in PHP.

View File

@ -103,7 +103,6 @@ class Traverser
case XML_CDATA_SECTION_NODE:
$this->rules->cdata($node);
break;
// FIXME: It appears that the parser doesn't do PI's.
case XML_PI_NODE:
$this->rules->processorInstruction($node);
break;

View File

@ -9,10 +9,13 @@ But after some initial refactoring work, we began a new parser.
- Composer support
- Event-based (SAX-like) parser
- DOM tree builder
- Interoperability with QueryPath [[in progress](https://github.com/technosophos/querypath/issues/114)]
- Interoperability with [QueryPath](https://github.com/technosophos/querypath)
- Runs on **PHP** 5.3.0 or newer and **HHVM** 3.2 or newer
[![Build Status](https://travis-ci.org/Masterminds/html5-php.png?branch=master)](https://travis-ci.org/Masterminds/html5-php) [![Latest Stable Version](https://poser.pugx.org/masterminds/html5/v/stable.png)](https://packagist.org/packages/masterminds/html5) [![Coverage Status](https://coveralls.io/repos/Masterminds/html5-php/badge.png?branch=master)](https://coveralls.io/r/Masterminds/html5-php?branch=master)
[![Build Status](https://travis-ci.org/Masterminds/html5-php.png?branch=master)](https://travis-ci.org/Masterminds/html5-php)
[![Latest Stable Version](https://poser.pugx.org/masterminds/html5/v/stable.png)](https://packagist.org/packages/masterminds/html5)
[![Code Coverage](https://scrutinizer-ci.com/g/Masterminds/html5-php/badges/coverage.png?b=master)](https://scrutinizer-ci.com/g/Masterminds/html5-php/?branch=master)
[![Scrutinizer Code Quality](https://scrutinizer-ci.com/g/Masterminds/html5-php/badges/quality-score.png?b=master)](https://scrutinizer-ci.com/g/Masterminds/html5-php/?branch=master)
## Installation
@ -36,7 +39,7 @@ install.
## Basic Usage
HTML5-PHP has a high-level API and a low-level API.
HTML5-PHP has a high-level API and a low-level API.
Here is how you use the high-level `HTML5` library API:
@ -144,14 +147,14 @@ The serializer is broken into three parts:
- The `OutputRules` contain the rules to turn DOM elements into strings. The
rules are an implementation of the interface `RulesInterface` allowing for
different rule sets to be used.
different rule sets to be used.
- The `Traverser`, which is a special-purpose tree walker. It visits
each node in the tree and uses the `OutputRules` to transform the node
into a string.
- `HTML5` manages the `Traverser` and stores the resultant data
in the correct place.
The serializer (`save()`, `saveHTML()`) follows the
The serializer (`save()`, `saveHTML()`) follows the
[section 8.9 of the HTML 5.0 spec](http://www.w3.org/TR/2012/CR-html5-20121217/syntax.html#serializing-html-fragments).
So tags are serialized according to these rules:
@ -166,8 +169,8 @@ issues known issues that are not presently on the roadmap:
- Namespaces: HTML5 only [supports a selected list of namespaces](http://www.w3.org/TR/html5/infrastructure.html#namespaces)
and they do not operate in the same way as XML namespaces. A `:` has no special
meaning.
By default the parser does not support XML style namespaces via `:`;
meaning.
By default the parser does not support XML style namespaces via `:`;
to enable the XML namespaces see the [XML Namespaces section](#xml-namespaces)
- Scripts: This parser does not contain a JavaScript or a CSS
interpreter. While one may be supplied, not all features will be
@ -184,13 +187,13 @@ issues known issues that are not presently on the roadmap:
* Per the spec, many legacy tags are admitted and correctly handled,
even though they are technically not part of HTML5.
- Attribute names and values: Due to the implementation details of the
PHP implementation of DOM, attribute names that do not follow the
PHP implementation of DOM, attribute names that do not follow the
XML 1.0 standard are not inserted into the DOM. (Effectively, they
are ignored.) If you've got a clever fix for this, jump in!
- Processor Instructions: The HTML5 spec does not allow processor
instructions. We do. Since this is a server-side library, we think
this is useful. And that means, dear reader, that in some cases you
can parse the HTML from a mixed PHP/HTML document. This, however,
can parse the HTML from a mixed PHP/HTML document. This, however,
is an incidental feature, not a core feature.
- HTML manifests: Unsupported.
- PLAINTEXT: Unsupported.

View File

@ -1,7 +1,31 @@
# Release Notes
2.2.2 (2016-10-22)
- #116: In XML mode, tags are case sensitive
- #115: Fix PHP Notice in OutputRules
- #112: fix parsing of options of an optgroup
- #111: Adding test for the address tag
2.2.1 (2016-05-10)
- #109: Fixed issue where address tag could be written without closing tag (thanks sylus)
2.2.0 (2016-04-11)
- #105: Enable composer cache (for CI/CD)
- #100: Use mb_substitute_character instead of ini_set for environments where
ini_set is disabled (e.g., shared hosting)
- #98: Allow link, meta, style tags in noscript tags
- #96: Fixed xml:href on svgs that use the "use" breaking
- #94: Counting UTF8 characters performance improvement
- #93: Use newer version of coveralls package
- #90: Remove duplicate test
- #87: Allow multiple root nodes
2.1.2 (2015-06-07)
- #82: Support for PHP7
- #84: Improved boolean attribute handling
- #84: Improved boolean attribute handling
2.1.1 (2015-03-23)
- #78: Fixes bug where unmatched entity like string drops everything after &.

View File

@ -7,11 +7,11 @@
* For environments which do not have these options, it reverts to standard sequential
* requests (using file_get_contents())
*
* @version 1.6
* @date 2015-06-05
* @version 1.7
* @date 2016-11-28
* @see http://devel-m6w6.rhcloud.com/mdref/http
* @author Keyvan Minoukadeh
* @copyright 2011-2015 Keyvan Minoukadeh
* @copyright 2011-2016 Keyvan Minoukadeh
* @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
*/
@ -21,8 +21,8 @@ class HumbleHttpAgent
const METHOD_CURL_MULTI = 2;
const METHOD_FILE_GET_CONTENTS = 4;
//const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1';
const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2';
const UA_PHP = 'PHP/5.5';
const UA_BROWSER = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36';
const UA_PHP = 'PHP/5.6';
const REF_GOOGLE = 'http://www.google.co.uk/url?sa=t&source=web&cd=1';
protected $requests = array();
@ -103,20 +103,26 @@ class HumbleHttpAgent
)
);
// HTTP cURL
$this->curlOptions = array(
CURLOPT_CONNECTTIMEOUT => $this->requestOptions['timeout'],
CURLOPT_TIMEOUT => $this->requestOptions['timeout']
if ($this->method === self::METHOD_CURL_MULTI) {
$this->curlOptions = array(
CURLOPT_CONNECTTIMEOUT => $this->requestOptions['timeout'],
CURLOPT_TIMEOUT => $this->requestOptions['timeout']
);
}
// Use proxy?
if ($this->requestOptions['proxyhost']) {
if (isset($this->requestOptions['proxyhost']) && $this->requestOptions['proxyhost']) {
// For file_get_contents (see http://stackoverflow.com/a/1336419/407938)
$this->httpContext['http']['proxy'] = 'tcp://'.$this->requestOptions['proxyhost'];
$this->httpContext['http']['request_fulluri'] = true;
// For cURL (see http://stackoverflow.com/a/9247672/407938)
$this->curlOptions[CURLOPT_PROXY] = $this->requestOptions['proxyhost'];
if ($this->method === self::METHOD_CURL_MULTI) {
$this->curlOptions[CURLOPT_PROXY] = $this->requestOptions['proxyhost'];
}
if (isset($this->requestOptions['proxyauth'])) {
$this->httpContext['http']['header'] .= "Proxy-Authorization: Basic ".base64_encode($this->requestOptions['proxyauth'])."\r\n";
$this->curlOptions[CURLOPT_PROXYUSERPWD] = $this->requestOptions['proxyauth'];
if ($this->method === self::METHOD_CURL_MULTI) {
$this->curlOptions[CURLOPT_PROXYUSERPWD] = $this->requestOptions['proxyauth'];
}
}
}
}
@ -842,6 +848,7 @@ class HumbleHttpAgent
}
protected function getCookies($orig, $req_url) {
if (!isset($this->cookieJar[$orig])) return null;
$jar = $this->cookieJar[$orig];
if (!isset($jar)) {
return null;

View File

@ -971,7 +971,7 @@ class Text_LanguageDetect
// assume that ascii characters are the most common
// so try it first for efficiency
if ($unicode <= $blocks[0][1]) {
if ($unicode <= hexdec($blocks[0][1])) {
return $blocks[0];
}
@ -989,11 +989,11 @@ class Text_LanguageDetect
while ($low <= $high) {
$mid = floor(($low + $high) / 2);
if ($unicode < $blocks[$mid][0]) {
if ($unicode < hexdec($blocks[$mid][0])) {
// if it's lower than the lower bound
$high = $mid - 1;
} elseif ($unicode > $blocks[$mid][1]) {
} elseif ($unicode > hexdec($blocks[$mid][1])) {
// if it's higher than the upper bound
$low = $mid + 1;

View File

@ -102,7 +102,7 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect
* @access private
* @param string $string string to be parsed
*/
function Text_LanguageDetect_Parser($string) {
function __construct($string) {
$this->_string = $string;
}

View File

@ -4,6 +4,7 @@
* Based on readability.js version 1.7.1 (without multi-page support)
* Updated to allow HTML5 parsing with html5lib
* Updated with lightClean mode to preserve more images and youtube/vimeo/viddler embeds
* Updated to allow HTML5 parsing with Gumbo PHP
* ------------------------------------------------------
* Original URL: http://lab.arc90.com/experiments/readability/js/readability.js
* Arc90's project URL: http://lab.arc90.com/experiments/readability/
@ -12,7 +13,7 @@
* More information: http://fivefilters.org/content-only/
* License: Apache License, Version 2.0
* Requires: PHP5
* Date: 2015-06-01
* Date: 2017-02-05
*
* Differences between the PHP port and the original
* ------------------------------------------------------
@ -117,17 +118,23 @@ class Readability
$html = preg_replace($this->regexps['replaceBrs'], '</p><p>', $html);
$html = preg_replace($this->regexps['replaceFonts'], '<$1span>', $html);
if (trim($html) == '') $html = '<html></html>';
if ($parser=='html5lib' || $parser=='html5php') {
if (version_compare(PHP_VERSION, '5.3.0') >= 0) {
//use Masterminds\HTML5;
$html5class = 'Masterminds\HTML5';
$html5 = new $html5class(array('disable_html_ns' => true));
$this->dom = $html5->loadHTML($html);
//echo $html5->saveHTML($this->dom);exit;
//$xpath = new DOMXPath($this->dom);
//$elems = $xpath->query("//a");
//print_r($elems);exit;
}
// Check for the Gumbo PHP extension https://github.com/layershifter/gumbo-php
if ($parser=='gumbo') {
// Can we avoid this encoding/decoding step? Test on:
// http://www.medialens.org/index.php/alerts/alert-archive/2017/837-undermining-democracy-corporate-media-bias-on-jeremy-corbyn-boris-johnson-and-syria.html
$html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
$html = mb_convert_encoding($html, "UTF-8", 'HTML-ENTITIES');
$this->dom = @Layershifter\Gumbo\Parser::load($html);
} elseif ($parser=='html5lib' || $parser=='html5php') {
//use Masterminds\HTML5;
//$html5class = 'Masterminds\HTML5';
//$html5 = new $html5class(array('disable_html_ns' => true));
$html5 = new Masterminds\HTML5(array('disable_html_ns' => true));
$this->dom = $html5->loadHTML($html);
//echo $html5->saveHTML($this->dom);exit;
//$xpath = new DOMXPath($this->dom);
//$elems = $xpath->query("//a");
//print_r($elems);exit;
}
if ($this->dom === null) {
$this->dom = new DOMDocument();

View File

@ -5,7 +5,7 @@
* A PHP-Based RSS and Atom Feed Framework.
* Takes the hard work out of managing a complete RSS/Atom solution.
*
* Copyright (c) 2004-2009, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* Copyright (c) 2004-2016, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are
@ -33,8 +33,7 @@
* POSSIBILITY OF SUCH DAMAGE.
*
* @package SimplePie
* @version 1.3.1
* @copyright 2004-2012 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @copyright 2004-2016 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @author Ryan Parman
* @author Geoffrey Sneddon
* @author Ryan McCue

View File

@ -5,7 +5,7 @@
* A PHP-Based RSS and Atom Feed Framework.
* Takes the hard work out of managing a complete RSS/Atom solution.
*
* Copyright (c) 2004-2012, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* Copyright (c) 2004-2016, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are
@ -33,8 +33,8 @@
* POSSIBILITY OF SUCH DAMAGE.
*
* @package SimplePie
* @version 1.3.1
* @copyright 2004-2012 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @version 1.4.3
* @copyright 2004-2016 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @author Ryan Parman
* @author Geoffrey Sneddon
* @author Ryan McCue
@ -50,7 +50,7 @@ define('SIMPLEPIE_NAME', 'SimplePie');
/**
* SimplePie Version
*/
define('SIMPLEPIE_VERSION', '1.3.1');
define('SIMPLEPIE_VERSION', '1.4.3');
/**
* SimplePie Build
@ -445,6 +445,13 @@ class SimplePie
*/
public $feed_url;
/**
* @var string Original feed URL, or new feed URL iff HTTP 301 Moved Permanently
* @see SimplePie::subscribe_url()
* @access private
*/
public $permanent_url = null;
/**
* @var object Instance of SimplePie_File to use as a feed
* @see SimplePie::set_file()
@ -466,6 +473,13 @@ class SimplePie
*/
public $timeout = 10;
/**
* @var array Custom curl options
* @see SimplePie::set_curl_options()
* @access private
*/
public $curl_options = array();
/**
* @var bool Forces fsockopen() to be used for remote files instead
* of cURL, even if a new enough version is installed
@ -489,6 +503,14 @@ class SimplePie
*/
public $cache = true;
/**
* @var bool Force SimplePie to fallback to expired cache, if enabled,
* when feed is unavailable.
* @see SimplePie::force_cache_fallback()
* @access private
*/
public $force_cache_fallback = false;
/**
* @var int Cache duration (in seconds)
* @see SimplePie::set_cache_duration()
@ -594,6 +616,12 @@ class SimplePie
*/
public $item_limit = 0;
/**
* @var bool Stores if last-modified and/or etag headers were sent with the
* request when checking a feed.
*/
public $check_modified = false;
/**
* @var array Stores the default attributes to be stripped by strip_attributes().
* @see SimplePie::strip_attributes()
@ -601,6 +629,13 @@ class SimplePie
*/
public $strip_attributes = array('bgsound', 'class', 'expr', 'id', 'style', 'onclick', 'onerror', 'onfinish', 'onmouseover', 'onmouseout', 'onfocus', 'onblur', 'lowsrc', 'dynsrc');
/**
* @var array Stores the default attributes to add to different tags by add_attributes().
* @see SimplePie::add_attributes()
* @access private
*/
public $add_attributes = array('audio' => array('preload' => 'none'), 'iframe' => array('sandbox' => 'allow-scripts allow-same-origin'), 'video' => array('preload' => 'none'));
/**
* @var array Stores the default tags to be stripped by strip_htmltags().
* @see SimplePie::strip_htmltags()
@ -624,9 +659,9 @@ class SimplePie
*/
public function __construct()
{
if (version_compare(PHP_VERSION, '5.2', '<'))
if (version_compare(PHP_VERSION, '5.3', '<'))
{
trigger_error('PHP 4.x, 5.0 and 5.1 are no longer supported. Please upgrade to PHP 5.2 or newer.');
trigger_error('Please upgrade to PHP 5.3 or newer.');
die();
}
@ -637,7 +672,7 @@ class SimplePie
if (func_num_args() > 0)
{
$level = defined('E_USER_DEPRECATED') ? E_USER_DEPRECATED : E_USER_WARNING;
trigger_error('Passing parameters to the constructor is no longer supported. Please use set_feed_url(), set_cache_location(), and set_cache_location() directly.', $level);
trigger_error('Passing parameters to the constructor is no longer supported. Please use set_feed_url(), set_cache_location(), and set_cache_duration() directly.', $level);
$args = func_get_args();
switch (count($args)) {
@ -728,6 +763,7 @@ class SimplePie
else
{
$this->feed_url = $this->registry->call('Misc', 'fix_protocol', array($url, 1));
$this->permanent_url = $this->feed_url;
}
}
@ -742,6 +778,7 @@ class SimplePie
if ($file instanceof SimplePie_File)
{
$this->feed_url = $file->url;
$this->permanent_url = $this->feed_url;
$this->file =& $file;
return true;
}
@ -780,6 +817,19 @@ class SimplePie
$this->timeout = (int) $timeout;
}
/**
* Set custom curl options
*
* This allows you to change default curl options. The array is stored in
* $this->curl_options, replacing whatever a previous call stored (the two
* arrays are not merged here). Calling it with no argument stores an empty
* array.
*
* @since 1.0 Beta 3
* @param array $curl_options Curl options to add to default settings
*/
public function set_curl_options(array $curl_options = array())
{
$this->curl_options = $curl_options;
}
/**
* Force SimplePie to use fsockopen() instead of cURL
*
@ -805,6 +855,21 @@ class SimplePie
$this->cache = (bool) $enable;
}
/**
* Force SimplePie to fall back to an expired cache, if enabled, when the
* feed is unavailable.
*
* This tells SimplePie to ignore any file errors and fall back to cache
* instead. This only works if caching is enabled and cached content
* still exists.
*
* @param bool $enable Force use of cache on fail. The value is cast to
* bool before it is stored in $this->force_cache_fallback.
*/
public function force_cache_fallback($enable = false)
{
$this->force_cache_fallback= (bool) $enable;
}
/**
* Set the length of time (in seconds) that the contents of a feed will be
* cached
@ -1073,6 +1138,7 @@ class SimplePie
$this->strip_comments(false);
$this->strip_htmltags(false);
$this->strip_attributes(false);
$this->add_attributes(false);
$this->set_image_handler(false);
}
}
@ -1119,16 +1185,25 @@ class SimplePie
$this->sanitize->strip_attributes($attribs);
}
/**
 * Set the attributes the sanitizer should add to specific tags.
 *
 * When called with the default empty string, the defaults held in
 * $this->add_attributes are used. Any other value is forwarded to the
 * sanitizer's add_attributes() unchanged.
 *
 * @param mixed $attribs Attributes to add, or '' to use the defaults
 */
public function add_attributes($attribs = '')
{
    // Only the exact empty string selects the defaults; values such as
    // false must reach the sanitizer as-is.
    $this->sanitize->add_attributes(
        $attribs === '' ? $this->add_attributes : $attribs
    );
}
/**
* Set the output encoding
*
* Allows you to override SimplePie's output to match that of your webpage.
* This is useful for times when your webpages are not being served as
* UTF-8. This setting will be obeyed by {@see handle_content_type()}, and
* UTF-8. This setting will be obeyed by {@see handle_content_type()}, and
* is similar to {@see set_input_encoding()}.
*
* It should be noted, however, that not all character encodings can support
* all characters. If your page is being served as ISO-8859-1 and you try
* all characters. If your page is being served as ISO-8859-1 and you try
* to display a Japanese feed, you'll likely see garbled characters.
* Because of this, it is highly recommended to ensure that your webpages
* are served as UTF-8.
@ -1195,10 +1270,20 @@ class SimplePie
$this->item_limit = (int) $limit;
}
/**
* Enable throwing exceptions
*
* Stores the flag in $this->enable_exceptions exactly as given (it is not
* cast to bool).
*
* @param boolean $enable Should we throw exceptions, or use the old-style error property?
*/
public function enable_exceptions($enable = true)
{
$this->enable_exceptions = $enable;
}
/**
* Initialize the feed object
*
* This is what makes everything happen. Period. This is where all of the
* This is what makes everything happen. Period. This is where all of the
* configuration options get processed, feeds are fetched, cached, and
* parsed, and all of that other good stuff.
*
@ -1209,6 +1294,7 @@ class SimplePie
// Check absolute bare minimum requirements.
if (!extension_loaded('xml') || !extension_loaded('pcre'))
{
$this->error = 'XML or PCRE extensions not loaded!';
return false;
}
// Then check the xml extension is sane (i.e., libxml 2.7.x issue on PHP < 5.2.9 and libxml 2.7.0 to 2.7.2 on any version) if we don't have xmlreader.
@ -1236,7 +1322,7 @@ class SimplePie
// Pass whatever was set with config options over to the sanitizer.
// Pass the classes in for legacy support; new classes should use the registry instead
$this->sanitize->pass_cache_data($this->cache, $this->cache_location, $this->cache_name_function, $this->registry->get_class('Cache'));
$this->sanitize->pass_file_data($this->registry->get_class('File'), $this->timeout, $this->useragent, $this->force_fsockopen);
$this->sanitize->pass_file_data($this->registry->get_class('File'), $this->timeout, $this->useragent, $this->force_fsockopen, $this->curl_options);
if (!empty($this->multifeed_url))
{
@ -1265,6 +1351,7 @@ class SimplePie
$this->error = null;
$this->data = array();
$this->check_modified = false;
$this->multifeed_objects = array();
$cache = false;
@ -1289,6 +1376,13 @@ class SimplePie
list($headers, $sniffed) = $fetched;
}
// Empty response check
if(empty($this->raw_data)){
$this->error = "A feed could not be found at `$this->feed_url`. Empty body.";
$this->registry->call('Misc', 'error', array($this->error, E_USER_NOTICE, __FILE__, __LINE__));
return false;
}
// Set up array of possible encodings
$encodings = array();
@ -1296,7 +1390,7 @@ class SimplePie
// First check to see if input has been overridden.
if ($this->input_encoding !== false)
{
$encodings[] = $this->input_encoding;
$encodings[] = strtoupper($this->input_encoding);
}
$application_types = array('application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity');
@ -1318,14 +1412,14 @@ class SimplePie
{
if (isset($headers['content-type']) && preg_match('/;\x20?charset=([^;]*)/i', $headers['content-type'], $charset))
{
$encodings[] = $charset[1];
$encodings[] = strtoupper($charset[1]);
}
$encodings[] = 'US-ASCII';
}
// Text MIME-type default
elseif (substr($sniffed, 0, 5) === 'text/')
{
$encodings[] = 'US-ASCII';
$encodings[] = 'UTF-8';
}
}
@ -1347,12 +1441,12 @@ class SimplePie
$parser = $this->registry->create('Parser');
// If it's parsed fine
if ($parser->parse($utf8_data, 'UTF-8'))
if ($parser->parse($utf8_data, 'UTF-8', $this->permanent_url))
{
$this->data = $parser->get_data();
if (!($this->get_type() & ~SIMPLEPIE_TYPE_NONE))
{
$this->error = "A feed could not be found at $this->feed_url. This does not appear to be a valid RSS or Atom feed.";
$this->error = "A feed could not be found at `$this->feed_url`. This does not appear to be a valid RSS or Atom feed.";
$this->registry->call('Misc', 'error', array($this->error, E_USER_NOTICE, __FILE__, __LINE__));
return false;
}
@ -1376,11 +1470,27 @@ class SimplePie
if (isset($parser))
{
// We have an error, just set SimplePie_Misc::error to it and quit
$this->error = sprintf('This XML document is invalid, likely due to invalid characters. XML error: %s at line %d, column %d', $parser->get_error_string(), $parser->get_current_line(), $parser->get_current_column());
$this->error = $this->feed_url;
$this->error .= sprintf(' is invalid XML, likely due to invalid characters. XML error: %s at line %d, column %d', $parser->get_error_string(), $parser->get_current_line(), $parser->get_current_column());
}
else
{
$this->error = 'The data could not be converted to UTF-8. You MUST have either the iconv or mbstring extension installed. Upgrading to PHP 5.x (which includes iconv) is highly recommended.';
$this->error = 'The data could not be converted to UTF-8.';
if (!extension_loaded('mbstring') && !extension_loaded('iconv') && !class_exists('\UConverter')) {
$this->error .= ' You MUST have either the iconv, mbstring or intl (PHP 5.5+) extension installed and enabled.';
} else {
$missingExtensions = array();
if (!extension_loaded('iconv')) {
$missingExtensions[] = 'iconv';
}
if (!extension_loaded('mbstring')) {
$missingExtensions[] = 'mbstring';
}
if (!class_exists('\UConverter')) {
$missingExtensions[] = 'intl (PHP 5.5+)';
}
$this->error .= ' Try installing/enabling the ' . implode(' or ', $missingExtensions) . ' extension.';
}
}
$this->registry->call('Misc', 'error', array($this->error, E_USER_NOTICE, __FILE__, __LINE__));
@ -1436,7 +1546,10 @@ class SimplePie
// Check if the cache has been updated
elseif ($cache->mtime() + $this->cache_duration < time())
{
// If we have last-modified and/or etag set
// Want to know if we tried to send last-modified and/or etag headers
// when requesting this file. (Note that it's up to the file to
// support this, but we don't always send the headers either.)
$this->check_modified = true;
if (isset($this->data['headers']['last-modified']) || isset($this->data['headers']['etag']))
{
$headers = array(
@ -1451,18 +1564,28 @@ class SimplePie
$headers['if-none-match'] = $this->data['headers']['etag'];
}
$file = $this->registry->create('File', array($this->feed_url, $this->timeout/10, 5, $headers, $this->useragent, $this->force_fsockopen));
$file = $this->registry->create('File', array($this->feed_url, $this->timeout/10, 5, $headers, $this->useragent, $this->force_fsockopen, $this->curl_options));
if ($file->success)
{
if ($file->status_code === 304)
{
// Set raw_data to false here too, to signify that the cache
// is still valid.
$this->raw_data = false;
$cache->touch();
return true;
}
}
else
{
$this->check_modified = false;
if($this->force_cache_fallback)
{
$cache->touch();
return true;
}
unset($file);
}
}
@ -1493,7 +1616,7 @@ class SimplePie
$headers = array(
'Accept' => 'application/atom+xml, application/rss+xml, application/rdf+xml;q=0.9, application/xml;q=0.8, text/xml;q=0.8, text/html;q=0.7, unknown/unknown;q=0.1, application/unknown;q=0.1, */*;q=0.1',
);
$file = $this->registry->create('File', array($this->feed_url, $this->timeout, 5, $headers, $this->useragent, $this->force_fsockopen));
$file = $this->registry->create('File', array($this->feed_url, $this->timeout, 5, $headers, $this->useragent, $this->force_fsockopen, $this->curl_options));
}
}
// If the file connection has an error, set SimplePie::error to that and quit
@ -1510,19 +1633,75 @@ class SimplePie
if (!$locate->is_feed($file))
{
// We need to unset this so that if SimplePie::set_file() has been called that object is untouched
unset($file);
$copyStatusCode = $file->status_code;
$copyContentType = $file->headers['content-type'];
try
{
if (!($file = $locate->find($this->autodiscovery, $this->all_discovered_feeds)))
$microformats = false;
if (function_exists('Mf2\parse')) {
// Check for both h-feed and h-entry, as both a feed with no entries
// and a list of entries without an h-feed wrapper are both valid.
$position = 0;
while ($position = strpos($file->body, 'h-feed', $position))
{
$start = $position < 200 ? 0 : $position - 200;
$check = substr($file->body, $start, 400);
if ($microformats = preg_match('/class="[^"]*h-feed/', $check))
{
break;
}
$position += 7;
}
$position = 0;
while ($position = strpos($file->body, 'h-entry', $position))
{
$start = $position < 200 ? 0 : $position - 200;
$check = substr($file->body, $start, 400);
if ($microformats = preg_match('/class="[^"]*h-entry/', $check))
{
break;
}
$position += 7;
}
}
// Now also do feed discovery, but if an h-entry was found don't
// overwrite the current value of file.
$discovered = $locate->find($this->autodiscovery,
$this->all_discovered_feeds);
if ($microformats)
{
$this->error = "A feed could not be found at $this->feed_url. A feed with an invalid mime type may fall victim to this error, or " . SIMPLEPIE_NAME . " was unable to auto-discover it.. Use force_feed() if you are certain this URL is a real feed.";
$this->registry->call('Misc', 'error', array($this->error, E_USER_NOTICE, __FILE__, __LINE__));
return false;
if ($hub = $locate->get_rel_link('hub'))
{
$self = $locate->get_rel_link('self');
$this->store_links($file, $hub, $self);
}
// Push the current file onto all_discovered feeds so the user can
// be shown this as one of the options.
if (isset($this->all_discovered_feeds)) {
$this->all_discovered_feeds[] = $file;
}
}
else
{
if ($discovered)
{
$file = $discovered;
}
else
{
// We need to unset this so that if SimplePie::set_file() has
// been called that object is untouched
unset($file);
$this->error = "A feed could not be found at `$this->feed_url`; the status code is `$copyStatusCode` and content-type is `$copyContentType`";
$this->registry->call('Misc', 'error', array($this->error, E_USER_NOTICE, __FILE__, __LINE__));
return false;
}
}
}
catch (SimplePie_Exception $e)
{
// We need to unset this so that if SimplePie::set_file() has been called that object is untouched
unset($file);
// This is usually because DOMDocument doesn't exist
$this->error = $e->getMessage();
$this->registry->call('Misc', 'error', array($this->error, E_USER_NOTICE, $e->getFile(), $e->getLine()));
@ -1543,7 +1722,7 @@ class SimplePie
}
$this->raw_data = $file->body;
$this->permanent_url = $file->permanent_url;
$headers = $file->headers;
$sniffer = $this->registry->create('Content_Type_Sniffer', array(&$file));
$sniffed = $sniffer->get_type();
@ -1729,26 +1908,44 @@ class SimplePie
/**
* Get the URL for the feed
*
* When the 'permanent' mode is enabled, returns the original feed URL,
* except in the case of an `HTTP 301 Moved Permanently` status response,
* in which case the location of the first redirection is returned.
*
* May or may not be different from the URL passed to {@see set_feed_url()},
* When the 'permanent' mode is disabled (default),
* may or may not be different from the URL passed to {@see set_feed_url()},
* depending on whether auto-discovery was used.
*
* @since Preview Release (previously called `get_feed_url()` since SimplePie 0.8.)
* @todo If we have a perm redirect we should return the new URL
* @todo When we make the above change, let's support <itunes:new-feed-url> as well
* @todo Support <itunes:new-feed-url>
* @todo Also, |atom:link|@rel=self
* @param bool $permanent Permanent mode to return only the original URL or the first redirection
* iff it is a 301 redirection
* @return string|null
*/
public function subscribe_url()
public function subscribe_url($permanent = false)
{
if ($this->feed_url !== null)
if ($permanent)
{
return $this->sanitize($this->feed_url, SIMPLEPIE_CONSTRUCT_IRI);
if ($this->permanent_url !== null)
{
// sanitize encodes ampersands which are required when used in a url.
return str_replace('&amp;', '&',
$this->sanitize($this->permanent_url,
SIMPLEPIE_CONSTRUCT_IRI));
}
}
else
{
return null;
if ($this->feed_url !== null)
{
return str_replace('&amp;', '&',
$this->sanitize($this->feed_url,
SIMPLEPIE_CONSTRUCT_IRI));
}
}
return null;
}
/**
@ -1963,7 +2160,21 @@ class SimplePie
*/
public function sanitize($data, $type, $base = '')
{
return $this->sanitize->sanitize($data, $type, $base);
try
{
return $this->sanitize->sanitize($data, $type, $base);
}
catch (SimplePie_Exception $e)
{
if (!$this->enable_exceptions)
{
$this->error = $e->getMessage();
$this->registry->call('Misc', 'error', array($this->error, E_USER_WARNING, $e->getFile(), $e->getLine()));
return '';
}
throw $e;
}
}
/**
@ -2014,7 +2225,7 @@ class SimplePie
* Get a category for the feed
*
* @since Unknown
* @param int $key The category that you want to return. Remember that arrays begin with 0, not 1
* @param int $key The category that you want to return. Remember that arrays begin with 0, not 1
* @return SimplePie_Category|null
*/
public function get_category($key = 0)
@ -2099,7 +2310,7 @@ class SimplePie
* Get an author for the feed
*
* @since 1.1
* @param int $key The author that you want to return. Remember that arrays begin with 0, not 1
* @param int $key The author that you want to return. Remember that arrays begin with 0, not 1
* @return SimplePie_Author|null
*/
public function get_author($key = 0)
@ -2197,7 +2408,7 @@ class SimplePie
* Get a contributor for the feed
*
* @since 1.1
* @param int $key The contributor that you want to return. Remember that arrays begin with 0, not 1
* @param int $key The contributor that you want to return. Remember that arrays begin with 0, not 1
* @return SimplePie_Author|null
*/
public function get_contributor($key = 0)
@ -2283,7 +2494,7 @@ class SimplePie
* Get a single link for the feed
*
* @since 1.0 (previously called `get_feed_link` since Preview Release, `get_feed_permalink()` since 0.8)
* @param int $key The link that you want to return. Remember that arrays begin with 0, not 1
* @param int $key The link that you want to return. Remember that arrays begin with 0, not 1
* @param string $rel The relationship of the link to return
* @return string|null Link URL
*/
@ -2393,6 +2604,12 @@ class SimplePie
{
return $this->data['links'][$rel];
}
else if (isset($this->data['headers']['link']) &&
preg_match('/<([^>]+)>; rel='.preg_quote($rel).'/',
$this->data['headers']['link'], $match))
{
return array($match[1]);
}
else
{
return null;
@ -2794,7 +3011,7 @@ class SimplePie
*
* @see get_item_quantity()
* @since Beta 2
* @param int $key The item that you want to return. Remember that arrays begin with 0, not 1
* @param int $key The item that you want to return. Remember that arrays begin with 0, not 1
* @return SimplePie_Item|null
*/
public function get_item($key = 0)
@ -2821,7 +3038,7 @@ class SimplePie
* @since Beta 2
* @param int $start Index to start at
* @param int $end Number of items to return. 0 for all items after `$start`
* @return array|null List of {@see SimplePie_Item} objects
* @return SimplePie_Item[]|null List of {@see SimplePie_Item} objects
*/
public function get_items($start = 0, $end = 0)
{
@ -2830,96 +3047,81 @@ class SimplePie
if (!empty($this->multifeed_objects))
{
$this->data['items'] = SimplePie::merge_items($this->multifeed_objects, $start, $end, $this->item_limit);
if (empty($this->data['items']))
{
return array();
}
return $this->data['items'];
}
else
$this->data['items'] = array();
if ($items = $this->get_feed_tags(SIMPLEPIE_NAMESPACE_ATOM_10, 'entry'))
{
$this->data['items'] = array();
if ($items = $this->get_feed_tags(SIMPLEPIE_NAMESPACE_ATOM_10, 'entry'))
$keys = array_keys($items);
foreach ($keys as $key)
{
$keys = array_keys($items);
foreach ($keys as $key)
{
$this->data['items'][] = $this->registry->create('Item', array($this, $items[$key]));
}
$this->data['items'][] = $this->registry->create('Item', array($this, $items[$key]));
}
if ($items = $this->get_feed_tags(SIMPLEPIE_NAMESPACE_ATOM_03, 'entry'))
}
if ($items = $this->get_feed_tags(SIMPLEPIE_NAMESPACE_ATOM_03, 'entry'))
{
$keys = array_keys($items);
foreach ($keys as $key)
{
$keys = array_keys($items);
foreach ($keys as $key)
{
$this->data['items'][] = $this->registry->create('Item', array($this, $items[$key]));
}
$this->data['items'][] = $this->registry->create('Item', array($this, $items[$key]));
}
if ($items = $this->get_feed_tags(SIMPLEPIE_NAMESPACE_RSS_10, 'item'))
}
if ($items = $this->get_feed_tags(SIMPLEPIE_NAMESPACE_RSS_10, 'item'))
{
$keys = array_keys($items);
foreach ($keys as $key)
{
$keys = array_keys($items);
foreach ($keys as $key)
{
$this->data['items'][] = $this->registry->create('Item', array($this, $items[$key]));
}
$this->data['items'][] = $this->registry->create('Item', array($this, $items[$key]));
}
if ($items = $this->get_feed_tags(SIMPLEPIE_NAMESPACE_RSS_090, 'item'))
}
if ($items = $this->get_feed_tags(SIMPLEPIE_NAMESPACE_RSS_090, 'item'))
{
$keys = array_keys($items);
foreach ($keys as $key)
{
$keys = array_keys($items);
foreach ($keys as $key)
{
$this->data['items'][] = $this->registry->create('Item', array($this, $items[$key]));
}
$this->data['items'][] = $this->registry->create('Item', array($this, $items[$key]));
}
if ($items = $this->get_channel_tags(SIMPLEPIE_NAMESPACE_RSS_20, 'item'))
}
if ($items = $this->get_channel_tags(SIMPLEPIE_NAMESPACE_RSS_20, 'item'))
{
$keys = array_keys($items);
foreach ($keys as $key)
{
$keys = array_keys($items);
foreach ($keys as $key)
{
$this->data['items'][] = $this->registry->create('Item', array($this, $items[$key]));
}
$this->data['items'][] = $this->registry->create('Item', array($this, $items[$key]));
}
}
}
if (!empty($this->data['items']))
if (empty($this->data['items']))
{
// If we want to order it by date, check if all items have a date, and then sort it
if ($this->order_by_date && empty($this->multifeed_objects))
{
if (!isset($this->data['ordered_items']))
{
$do_sort = true;
foreach ($this->data['items'] as $item)
{
if (!$item->get_date('U'))
{
$do_sort = false;
break;
}
}
$item = null;
$this->data['ordered_items'] = $this->data['items'];
if ($do_sort)
{
usort($this->data['ordered_items'], array(get_class($this), 'sort_items'));
}
}
$items = $this->data['ordered_items'];
}
else
{
$items = $this->data['items'];
}
return array();
}
// Slice the data as desired
if ($end === 0)
if ($this->order_by_date)
{
if (!isset($this->data['ordered_items']))
{
return array_slice($items, $start);
}
else
{
return array_slice($items, $start, $end);
}
$this->data['ordered_items'] = $this->data['items'];
usort($this->data['ordered_items'], array(get_class($this), 'sort_items'));
}
$items = $this->data['ordered_items'];
}
else
{
return array();
$items = $this->data['items'];
}
// Slice the data as desired
if ($end === 0)
{
return array_slice($items, $start);
}
else
{
return array_slice($items, $start, $end);
}
}
@ -2992,7 +3194,19 @@ class SimplePie
*/
public static function sort_items($a, $b)
{
    // usort() comparator: must return an integer (<0, 0, >0), never a
    // boolean. Both arguments only need a get_date('U') method that yields
    // a Unix timestamp, or a falsy value when the item has no date.
    $a_date = $a->get_date('U');
    $b_date = $b->get_date('U');

    if ($a_date && $b_date)
    {
        // Newest first; equal timestamps compare as equal.
        if ($a_date == $b_date)
        {
            return 0;
        }
        return $a_date > $b_date ? -1 : 1;
    }

    // Sort items without dates to the top.
    if ($a_date)
    {
        return 1;
    }
    if ($b_date)
    {
        return -1;
    }

    // Neither item has a date.
    return 0;
}
/**
@ -3025,20 +3239,7 @@ class SimplePie
}
}
$do_sort = true;
foreach ($items as $item)
{
if (!$item->get_date('U'))
{
$do_sort = false;
break;
}
}
$item = null;
if ($do_sort)
{
usort($items, array(get_class($urls[0]), 'sort_items'));
}
usort($items, array(get_class($urls[0]), 'sort_items'));
if ($end === 0)
{
@ -3055,4 +3256,42 @@ class SimplePie
return array();
}
}
/**
 * Store PubSubHubbub links as headers
 *
 * There is no way to find PuSH links in the body of a microformats feed,
 * so they are added to the headers when found, to be used later by get_links.
 *
 * Nothing is changed when the Link header already advertises a hub, or when
 * no hub URL is given. Otherwise the hub (and, if given, the self link) is
 * appended to any Link header value that is already present.
 *
 * @param SimplePie_File $file File object whose `headers['link']` entry is updated in place
 * @param string $hub Hub URL; a falsy value means there is nothing to store
 * @param string $self Self URL; only stored together with a hub
 */
private function store_links(&$file, $hub, $self) {
    // A hub is already advertised: leave the header untouched.
    if (isset($file->headers['link']['hub']) ||
        (isset($file->headers['link']) &&
         preg_match('/rel=hub/', $file->headers['link'])))
    {
        return;
    }

    if (!$hub)
    {
        return;
    }

    if (!isset($file->headers['link']))
    {
        $file->headers['link'] = '';
    }
    else if ($file->headers['link'] !== '')
    {
        // Append the separator; assigning here would discard the links
        // that the server already sent.
        $file->headers['link'] .= ', ';
    }

    $file->headers['link'] .= '<'.$hub.'>; rel=hub';
    if ($self)
    {
        $file->headers['link'] .= ', <'.$self.'>; rel=self';
    }
}
}

View File

@ -5,7 +5,7 @@
* A PHP-Based RSS and Atom Feed Framework.
* Takes the hard work out of managing a complete RSS/Atom solution.
*
* Copyright (c) 2004-2012, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* Copyright (c) 2004-2016, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are
@ -33,8 +33,7 @@
* POSSIBILITY OF SUCH DAMAGE.
*
* @package SimplePie
* @version 1.3.1
* @copyright 2004-2012 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @copyright 2004-2016 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @author Ryan Parman
* @author Geoffrey Sneddon
* @author Ryan McCue

View File

@ -5,7 +5,7 @@
* A PHP-Based RSS and Atom Feed Framework.
* Takes the hard work out of managing a complete RSS/Atom solution.
*
* Copyright (c) 2004-2012, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* Copyright (c) 2004-2016, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are
@ -33,8 +33,7 @@
* POSSIBILITY OF SUCH DAMAGE.
*
* @package SimplePie
* @version 1.3.1
* @copyright 2004-2012 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @copyright 2004-2016 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @author Ryan Parman
* @author Geoffrey Sneddon
* @author Ryan McCue
@ -62,8 +61,10 @@ class SimplePie_Cache
* @var array
*/
protected static $handlers = array(
'mysql' => 'SimplePie_Cache_MySQL',
'memcache' => 'SimplePie_Cache_Memcache',
'mysql' => 'SimplePie_Cache_MySQL',
'memcache' => 'SimplePie_Cache_Memcache',
'memcached' => 'SimplePie_Cache_Memcached',
'redis' => 'SimplePie_Cache_Redis'
);
/**

View File

@ -5,7 +5,7 @@
* A PHP-Based RSS and Atom Feed Framework.
* Takes the hard work out of managing a complete RSS/Atom solution.
*
* Copyright (c) 2004-2012, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* Copyright (c) 2004-2016, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are
@ -33,8 +33,7 @@
* POSSIBILITY OF SUCH DAMAGE.
*
* @package SimplePie
* @version 1.3.1
* @copyright 2004-2012 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @copyright 2004-2016 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @author Ryan Parman
* @author Geoffrey Sneddon
* @author Ryan McCue

View File

@ -5,7 +5,7 @@
* A PHP-Based RSS and Atom Feed Framework.
* Takes the hard work out of managing a complete RSS/Atom solution.
*
* Copyright (c) 2004-2012, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* Copyright (c) 2004-2016, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are
@ -33,8 +33,7 @@
* POSSIBILITY OF SUCH DAMAGE.
*
* @package SimplePie
* @version 1.3.1
* @copyright 2004-2012 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @copyright 2004-2016 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @author Ryan Parman
* @author Geoffrey Sneddon
* @author Ryan McCue

View File

@ -5,7 +5,7 @@
* A PHP-Based RSS and Atom Feed Framework.
* Takes the hard work out of managing a complete RSS/Atom solution.
*
* Copyright (c) 2004-2012, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* Copyright (c) 2004-2016, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are
@ -33,8 +33,7 @@
* POSSIBILITY OF SUCH DAMAGE.
*
* @package SimplePie
* @version 1.3.1
* @copyright 2004-2012 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @copyright 2004-2016 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @author Ryan Parman
* @author Geoffrey Sneddon
* @author Ryan McCue
@ -136,11 +135,7 @@ class SimplePie_Cache_File implements SimplePie_Cache_Base
*/
public function mtime()
{
    // Ask for the timestamp directly instead of checking file_exists()
    // first: one stat call, and no window in which the file can disappear
    // between the check and the read. filemtime() returns false on failure;
    // the accompanying warning is suppressed because a missing cache file
    // is an expected condition here.
    return @filemtime($this->name);
}
/**
@ -150,11 +145,7 @@ class SimplePie_Cache_File implements SimplePie_Cache_Base
*/
public function touch()
{
    // Only refresh a cache file that is already there. An unguarded
    // touch() would create an empty file, which would then look like a
    // cache entry without holding any data.
    if (!file_exists($this->name))
    {
        return false;
    }

    return touch($this->name);
}
/**

View File

@ -5,7 +5,7 @@
* A PHP-Based RSS and Atom Feed Framework.
* Takes the hard work out of managing a complete RSS/Atom solution.
*
* Copyright (c) 2004-2012, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* Copyright (c) 2004-2016, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are
@ -33,8 +33,7 @@
* POSSIBILITY OF SUCH DAMAGE.
*
* @package SimplePie
* @version 1.3.1
* @copyright 2004-2012 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @copyright 2004-2016 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @author Ryan Parman
* @author Geoffrey Sneddon
* @author Ryan McCue
@ -95,10 +94,8 @@ class SimplePie_Cache_Memcache implements SimplePie_Cache_Base
'prefix' => 'simplepie_',
),
);
$parsed = SimplePie_Cache::parse_URL($location);
$this->options['host'] = empty($parsed['host']) ? $this->options['host'] : $parsed['host'];
$this->options['port'] = empty($parsed['port']) ? $this->options['port'] : $parsed['port'];
$this->options['extras'] = array_merge($this->options['extras'], $parsed['extras']);
$this->options = SimplePie_Misc::array_merge_recursive($this->options, SimplePie_Cache::parse_URL($location));
$this->name = $this->options['extras']['prefix'] . md5("$name:$type");
$this->cache = new Memcache();
@ -147,7 +144,7 @@ class SimplePie_Cache_Memcache implements SimplePie_Cache_Base
if ($data !== false)
{
// essentially ignore the mtime because Memcache expires on it's own
// essentially ignore the mtime because Memcache expires on its own
return time();
}
@ -165,7 +162,7 @@ class SimplePie_Cache_Memcache implements SimplePie_Cache_Base
if ($data !== false)
{
return $this->cache->set($this->name, $data, MEMCACHE_COMPRESSED, (int) $this->duration);
return $this->cache->set($this->name, $data, MEMCACHE_COMPRESSED, (int) $this->options['extras']['timeout']);
}
return false;

View File

@ -0,0 +1,166 @@
<?php
/**
* SimplePie
*
* A PHP-Based RSS and Atom Feed Framework.
* Takes the hard work out of managing a complete RSS/Atom solution.
*
* Copyright (c) 2004-2016, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are
* permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice, this list
* of conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* * Neither the name of the SimplePie Team nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior
* written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS
* OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
* AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS
* AND CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* @package SimplePie
* @copyright 2004-2016 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @author Ryan Parman
* @author Geoffrey Sneddon
* @author Ryan McCue
* @link http://simplepie.org/ SimplePie
* @license http://www.opensource.org/licenses/bsd-license.php BSD License
*/
/**
 * Caches data to memcached
 *
 * Registered for URLs with the "memcached" protocol
 *
 * For example, `memcached://localhost:11211/?timeout=3600&prefix=sp_` will
 * connect to memcached on `localhost` on port 11211. All keys will be
 * prefixed with `sp_` and data will expire after 3600 seconds
 *
 * Each entry is stored under two keys: the entry itself, and a companion
 * `<key>_mtime` key that holds the time the entry was last written.
 *
 * @package SimplePie
 * @subpackage Caching
 * @author Paul L. McNeely
 * @uses Memcached
 */
class SimplePie_Cache_Memcached implements SimplePie_Cache_Base
{
    /**
     * Memcached instance
     * @var Memcached
     */
    protected $cache;

    /**
     * Options
     * @var array
     */
    protected $options;

    /**
     * Cache name
     * @var string
     */
    protected $name;

    /**
     * Create a new cache object
     * @param string $location Location string (from SimplePie::$cache_location)
     * @param string $name Unique ID for the cache
     * @param string $type Either TYPE_FEED for SimplePie data, or TYPE_IMAGE for image data
     */
    public function __construct($location, $name, $type) {
        // Defaults; anything parsed out of the location URL overrides them.
        $this->options = array(
            'host'   => '127.0.0.1',
            'port'   => 11211,
            'extras' => array(
                'timeout' => 3600, // one hour
                'prefix'  => 'simplepie_',
            ),
        );
        $this->options = SimplePie_Misc::array_merge_recursive($this->options, SimplePie_Cache::parse_URL($location));

        $this->name = $this->options['extras']['prefix'] . md5("$name:$type");

        $this->cache = new Memcached();
        $this->cache->addServer($this->options['host'], (int)$this->options['port']);
    }

    /**
     * Save data to the cache
     * @param array|SimplePie $data Data to store in the cache. If passed a SimplePie object, only cache the $data property
     * @return bool Successfulness
     */
    public function save($data) {
        if ($data instanceof SimplePie) {
            $data = $data->data;
        }

        return $this->setData(serialize($data));
    }

    /**
     * Retrieve the data saved to the cache
     * @return array|false Data for SimplePie::$data, or false when nothing is cached
     */
    public function load() {
        $data = $this->cache->get($this->name);

        if ($data !== false) {
            return unserialize($data);
        }
        return false;
    }

    /**
     * Retrieve the last modified time for the cache
     * @return int Timestamp, or 0 when no timestamp is stored
     */
    public function mtime() {
        $data = $this->cache->get($this->name . '_mtime');
        return (int) $data;
    }

    /**
     * Set the last modified time to the current time
     * @return bool Success status
     */
    public function touch() {
        // Re-store the current payload so both keys get a fresh expiry.
        $data = $this->cache->get($this->name);
        return $this->setData($data);
    }

    /**
     * Remove the cache
     * @return bool Success status of removing the data key
     */
    public function unlink() {
        // Remove the companion timestamp as well; otherwise mtime() would
        // keep reporting an entry that no longer exists.
        $this->cache->delete($this->name . '_mtime', 0);
        return $this->cache->delete($this->name, 0);
    }

    /**
     * Set the last modified time and data to Memcached
     * @param string|false $data Serialized payload; false means there is nothing to store
     * @return bool Success status of storing the data key
     */
    private function setData($data) {
        if ($data === false) {
            return false;
        }

        $expiration = (int)$this->options['extras']['timeout'];
        $this->cache->set($this->name . '_mtime', time(), $expiration);
        return $this->cache->set($this->name, $data, $expiration);
    }
}

View File

@ -5,7 +5,7 @@
* A PHP-Based RSS and Atom Feed Framework.
* Takes the hard work out of managing a complete RSS/Atom solution.
*
* Copyright (c) 2004-2012, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* Copyright (c) 2004-2016, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are
@ -33,8 +33,7 @@
* POSSIBILITY OF SUCH DAMAGE.
*
* @package SimplePie
* @version 1.3.1
* @copyright 2004-2012 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @copyright 2004-2016 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @author Ryan Parman
* @author Geoffrey Sneddon
* @author Ryan McCue
@ -94,9 +93,11 @@ class SimplePie_Cache_MySQL extends SimplePie_Cache_DB
'path' => '',
'extras' => array(
'prefix' => '',
'cache_purge_time' => 2592000
),
);
$this->options = array_merge_recursive($this->options, SimplePie_Cache::parse_URL($location));
$this->options = SimplePie_Misc::array_merge_recursive($this->options, SimplePie_Cache::parse_URL($location));
// Path is prefixed with a "/"
$this->options['dbname'] = substr($this->options['path'], 1);
@ -130,16 +131,20 @@ class SimplePie_Cache_MySQL extends SimplePie_Cache_DB
$query = $this->mysql->exec('CREATE TABLE `' . $this->options['extras']['prefix'] . 'cache_data` (`id` TEXT CHARACTER SET utf8 NOT NULL, `items` SMALLINT NOT NULL DEFAULT 0, `data` BLOB NOT NULL, `mtime` INT UNSIGNED NOT NULL, UNIQUE (`id`(125)))');
if ($query === false)
{
trigger_error("Can't create " . $this->options['extras']['prefix'] . "cache_data table, check permissions", E_USER_WARNING);
$this->mysql = null;
return;
}
}
if (!in_array($this->options['extras']['prefix'] . 'items', $db))
{
$query = $this->mysql->exec('CREATE TABLE `' . $this->options['extras']['prefix'] . 'items` (`feed_id` TEXT CHARACTER SET utf8 NOT NULL, `id` TEXT CHARACTER SET utf8 NOT NULL, `data` TEXT CHARACTER SET utf8 NOT NULL, `posted` INT UNSIGNED NOT NULL, INDEX `feed_id` (`feed_id`(125)))');
$query = $this->mysql->exec('CREATE TABLE `' . $this->options['extras']['prefix'] . 'items` (`feed_id` TEXT CHARACTER SET utf8 NOT NULL, `id` TEXT CHARACTER SET utf8 NOT NULL, `data` MEDIUMBLOB NOT NULL, `posted` INT UNSIGNED NOT NULL, INDEX `feed_id` (`feed_id`(125)))');
if ($query === false)
{
trigger_error("Can't create " . $this->options['extras']['prefix'] . "items table, check permissions", E_USER_WARNING);
$this->mysql = null;
return;
}
}
}
@ -157,6 +162,17 @@ class SimplePie_Cache_MySQL extends SimplePie_Cache_DB
return false;
}
$query = $this->mysql->prepare('DELETE i, cd FROM `' . $this->options['extras']['prefix'] . 'cache_data` cd, ' .
'`' . $this->options['extras']['prefix'] . 'items` i ' .
'WHERE cd.id = i.feed_id ' .
'AND cd.mtime < (unix_timestamp() - :purge_time)');
$query->bindValue(':purge_time', $this->options['extras']['cache_purge_time']);
if (!$query->execute())
{
return false;
}
if ($data instanceof SimplePie)
{
$data = clone $data;

View File

@ -0,0 +1,166 @@
<?php
/**
* SimplePie Redis Cache Extension
*
* @package SimplePie
* @author Jan Kozak <galvani78@gmail.com>
* @link http://galvani.cz/
* @license http://www.opensource.org/licenses/bsd-license.php BSD License
* @version 0.2.9
*/
/**
 * Caches data to redis
 *
 * Registered for URLs with the "redis" protocol
 *
 * For example, `redis://localhost:6379/` will connect to redis on
 * `localhost` on port 6379. Only the host and port of the location are
 * used by this class. Unless an options array is passed to the
 * constructor, keys are prefixed with `rss:simple_primary:` and entries
 * do not expire.
 *
 * @package SimplePie
 * @subpackage Caching
 * @uses Redis
 */
class SimplePie_Cache_Redis implements SimplePie_Cache_Base {
    /**
     * Redis instance
     *
     * @var \Redis
     */
    protected $cache;

    /**
     * Options
     *
     * Keys read by this class: `prefix` (string prepended to the cache
     * name) and `expire` (lifetime in seconds, 0 for no expiry).
     *
     * @var array
     */
    protected $options;

    /**
     * Cache name
     *
     * @var string
     */
    protected $name;

    /**
     * Cache Data
     *
     * Not read or written by this class itself.
     *
     * @var mixed
     */
    protected $data;

    /**
     * Create a new cache object
     *
     * @param string $location Location string (from SimplePie::$cache_location)
     * @param string $name Unique ID for the cache
     * @param array|mixed $options Options array with `prefix` and `expire` keys; any non-array value selects the defaults
     */
    public function __construct($location, $name, $options = null) {
        $parsed = SimplePie_Cache::parse_URL($location);
        $redis = new Redis();
        $redis->connect($parsed['host'], $parsed['port']);
        $this->cache = $redis;

        if (!is_null($options) && is_array($options)) {
            $this->options = $options;
        } else {
            $this->options = array (
                'prefix' => 'rss:simple_primary:',
                'expire' => 0,
            );
        }

        $this->name = $this->options['prefix'] . $name;
    }

    /**
     * Replace the Redis client used by this cache
     *
     * @param \Redis $cache
     */
    public function setRedisClient(\Redis $cache) {
        $this->cache = $cache;
    }

    /**
     * Save data to the cache
     *
     * @param array|SimplePie $data Data to store in the cache. If passed a SimplePie object, only cache the $data property
     * @return bool Successfulness
     */
    public function save($data) {
        if ($data instanceof SimplePie) {
            $data = $data->data;
        }
        $response = $this->cache->set($this->name, serialize($data));
        if ($this->options['expire']) {
            $this->cache->expire($this->name, $this->options['expire']);
        }

        return $response;
    }

    /**
     * Retrieve the data saved to the cache
     *
     * @return array|false Data for SimplePie::$data, or false when nothing is cached
     */
    public function load() {
        $data = $this->cache->get($this->name);

        if ($data !== false) {
            return unserialize($data);
        }
        return false;
    }

    /**
     * Retrieve the last modified time for the cache
     *
     * No separate timestamp is stored: an existing entry is reported as
     * modified "now".
     *
     * @return int|false Current time when the entry exists, false otherwise
     */
    public function mtime() {
        $data = $this->cache->get($this->name);

        if ($data !== false) {
            return time();
        }

        return false;
    }

    /**
     * Set the last modified time to the current time
     *
     * @return bool Success status
     */
    public function touch() {
        $data = $this->cache->get($this->name);

        if ($data === false) {
            return false;
        }

        $return = $this->cache->set($this->name, $data);
        if ($this->options['expire']) {
            // Re-apply the configured lifetime after rewriting the value.
            // This must use the same option as save(); the class has no
            // "ttl" property.
            return $this->cache->expire($this->name, $this->options['expire']);
        }
        return $return;
    }

    /**
     * Remove the cache
     *
     * @return bool True when the key existed and was removed
     */
    public function unlink() {
        // Delete the key rather than overwriting it with null: an
        // overwritten key still exists, so load() and mtime() would keep
        // treating the entry as present.
        return $this->cache->del($this->name) > 0;
    }
}

View File

@ -5,7 +5,7 @@
* A PHP-Based RSS and Atom Feed Framework.
* Takes the hard work out of managing a complete RSS/Atom solution.
*
* Copyright (c) 2004-2012, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* Copyright (c) 2004-2016, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are
@ -33,8 +33,7 @@
* POSSIBILITY OF SUCH DAMAGE.
*
* @package SimplePie
* @version 1.3.1
* @copyright 2004-2012 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @copyright 2004-2016 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @author Ryan Parman
* @author Geoffrey Sneddon
* @author Ryan McCue

View File

@ -5,7 +5,7 @@
* A PHP-Based RSS and Atom Feed Framework.
* Takes the hard work out of managing a complete RSS/Atom solution.
*
* Copyright (c) 2004-2012, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* Copyright (c) 2004-2016, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are
@ -33,8 +33,7 @@
* POSSIBILITY OF SUCH DAMAGE.
*
* @package SimplePie
* @version 1.3.1
* @copyright 2004-2012 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @copyright 2004-2016 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @author Ryan Parman
* @author Geoffrey Sneddon
* @author Ryan McCue

View File

@ -5,7 +5,7 @@
* A PHP-Based RSS and Atom Feed Framework.
* Takes the hard work out of managing a complete RSS/Atom solution.
*
* Copyright (c) 2004-2012, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* Copyright (c) 2004-2016, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are
@ -33,8 +33,7 @@
* POSSIBILITY OF SUCH DAMAGE.
*
* @package SimplePie
* @version 1.3.1
* @copyright 2004-2012 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @copyright 2004-2016 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @author Ryan Parman
* @author Geoffrey Sneddon
* @author Ryan McCue
@ -256,7 +255,7 @@ class SimplePie_Content_Type_Sniffer
public function feed_or_html()
{
$len = strlen($this->file->body);
$pos = strspn($this->file->body, "\x09\x0A\x0D\x20");
$pos = strspn($this->file->body, "\x09\x0A\x0D\x20\xEF\xBB\xBF");
while ($pos < $len)
{

View File

@ -5,7 +5,7 @@
* A PHP-Based RSS and Atom Feed Framework.
* Takes the hard work out of managing a complete RSS/Atom solution.
*
* Copyright (c) 2004-2012, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* Copyright (c) 2004-2016, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are
@ -33,8 +33,7 @@
* POSSIBILITY OF SUCH DAMAGE.
*
* @package SimplePie
* @version 1.3.1
* @copyright 2004-2012 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @copyright 2004-2016 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @author Ryan Parman
* @author Geoffrey Sneddon
* @author Ryan McCue

View File

@ -5,7 +5,7 @@
* A PHP-Based RSS and Atom Feed Framework.
* Takes the hard work out of managing a complete RSS/Atom solution.
*
* Copyright (c) 2004-2009, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* Copyright (c) 2004-2016, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are
@ -33,8 +33,7 @@
* POSSIBILITY OF SUCH DAMAGE.
*
* @package SimplePie
* @version 1.3.1
* @copyright 2004-2012 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @copyright 2004-2016 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @author Ryan Parman
* @author Geoffrey Sneddon
* @author Ryan McCue

View File

@ -5,7 +5,7 @@
* A PHP-Based RSS and Atom Feed Framework.
* Takes the hard work out of managing a complete RSS/Atom solution.
*
* Copyright (c) 2004-2012, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* Copyright (c) 2004-2016, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are
@ -33,8 +33,7 @@
* POSSIBILITY OF SUCH DAMAGE.
*
* @package SimplePie
* @version 1.3.1
* @copyright 2004-2012 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @copyright 2004-2016 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @author Ryan Parman
* @author Geoffrey Sneddon
* @author Ryan McCue

View File

@ -5,7 +5,7 @@
* A PHP-Based RSS and Atom Feed Framework.
* Takes the hard work out of managing a complete RSS/Atom solution.
*
* Copyright (c) 2004-2012, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* Copyright (c) 2004-2016, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are
@ -33,8 +33,7 @@
* POSSIBILITY OF SUCH DAMAGE.
*
* @package SimplePie
* @version 1.3.1
* @copyright 2004-2012 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @copyright 2004-2016 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @author Ryan Parman
* @author Geoffrey Sneddon
* @author Ryan McCue
@ -169,7 +168,6 @@ class SimplePie_Decode_HTML_Entities
case "\x09":
case "\x0A":
case "\x0B":
case "\x0B":
case "\x0C":
case "\x20":
case "\x3C":

View File

@ -5,7 +5,7 @@
* A PHP-Based RSS and Atom Feed Framework.
* Takes the hard work out of managing a complete RSS/Atom solution.
*
* Copyright (c) 2004-2012, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* Copyright (c) 2004-2016, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are
@ -33,8 +33,7 @@
* POSSIBILITY OF SUCH DAMAGE.
*
* @package SimplePie
* @version 1.3.1
* @copyright 2004-2012 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @copyright 2004-2016 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @author Ryan Parman
* @author Geoffrey Sneddon
* @author Ryan McCue
@ -451,7 +450,7 @@ class SimplePie_Enclosure
/**
* Get the duration of the enclosure
*
* @param string $convert Convert seconds into hh:mm:ss
* @param bool $convert Convert seconds into hh:mm:ss
* @return string|int|null 'hh:mm:ss' string if `$convert` was specified, otherwise integer (or null if none found)
*/
public function get_duration($convert = false)
@ -942,7 +941,7 @@ class SimplePie_Enclosure
* - `height` (integer): The height of the embedded media. Accepts any
* numeric pixel value (such as `360`) or `auto`. Defaults to `auto`,
* and it is recommended that you use this default.
* - `loop` (boolean): Do you want the media to loop when its done?
* - `loop` (boolean): Do you want the media to loop when it's done?
* Defaults to `false`.
* - `mediaplayer` (string): The location of the included
* `mediaplayer.swf` file. This allows for the playback of Flash Video

View File

@ -5,7 +5,7 @@
* A PHP-Based RSS and Atom Feed Framework.
* Takes the hard work out of managing a complete RSS/Atom solution.
*
* Copyright (c) 2004-2012, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* Copyright (c) 2004-2016, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are
@ -33,8 +33,7 @@
* POSSIBILITY OF SUCH DAMAGE.
*
* @package SimplePie
* @version 1.4-dev
* @copyright 2004-2012 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @copyright 2004-2016 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @author Ryan Parman
* @author Geoffrey Sneddon
* @author Ryan McCue

View File

@ -5,7 +5,7 @@
* A PHP-Based RSS and Atom Feed Framework.
* Takes the hard work out of managing a complete RSS/Atom solution.
*
* Copyright (c) 2004-2012, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* Copyright (c) 2004-2016, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are
@ -33,8 +33,7 @@
* POSSIBILITY OF SUCH DAMAGE.
*
* @package SimplePie
* @version 1.3.1
* @copyright 2004-2012 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @copyright 2004-2016 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @author Ryan Parman
* @author Geoffrey Sneddon
* @author Ryan McCue
@ -64,8 +63,9 @@ class SimplePie_File
var $redirects = 0;
var $error;
var $method = SIMPLEPIE_FILE_SOURCE_NONE;
var $permanent_url;
public function __construct($url, $timeout = 10, $redirects = 5, $headers = null, $useragent = null, $force_fsockopen = false)
public function __construct($url, $timeout = 10, $redirects = 5, $headers = null, $useragent = null, $force_fsockopen = false, $curl_options = array())
{
if (class_exists('idna_convert'))
{
@ -74,6 +74,7 @@ class SimplePie_File
$url = SimplePie_Misc::compress_parse_url($parsed['scheme'], $idn->encode($parsed['authority']), $parsed['path'], $parsed['query'], $parsed['fragment']);
}
$this->url = $url;
$this->permanent_url = $url;
$this->useragent = $useragent;
if (preg_match('/^http(s)?:\/\//i', $url))
{
@ -102,6 +103,7 @@ class SimplePie_File
curl_setopt($fp, CURLOPT_URL, $url);
curl_setopt($fp, CURLOPT_HEADER, 1);
curl_setopt($fp, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($fp, CURLOPT_FAILONERROR, 1);
curl_setopt($fp, CURLOPT_TIMEOUT, $timeout);
curl_setopt($fp, CURLOPT_CONNECTTIMEOUT, $timeout);
curl_setopt($fp, CURLOPT_REFERER, $url);
@ -112,6 +114,9 @@ class SimplePie_File
curl_setopt($fp, CURLOPT_FOLLOWLOCATION, 1);
curl_setopt($fp, CURLOPT_MAXREDIRS, $redirects);
}
foreach ($curl_options as $curl_param => $curl_value) {
curl_setopt($fp, $curl_param, $curl_value);
}
$this->headers = curl_exec($fp);
if (curl_errno($fp) === 23 || curl_errno($fp) === 61)
@ -126,7 +131,10 @@ class SimplePie_File
}
else
{
$info = curl_getinfo($fp);
// Use the updated url provided by curl_getinfo after any redirects.
if ($info = curl_getinfo($fp)) {
$this->url = $info['url'];
}
curl_close($fp);
$this->headers = explode("\r\n\r\n", $this->headers, $info['redirect_count'] + 1);
$this->headers = array_pop($this->headers);
@ -134,13 +142,16 @@ class SimplePie_File
if ($parser->parse())
{
$this->headers = $parser->headers;
$this->body = $parser->body;
$this->body = trim($parser->body);
$this->status_code = $parser->status_code;
if ((in_array($this->status_code, array(300, 301, 302, 303, 307)) || $this->status_code > 307 && $this->status_code < 400) && isset($this->headers['location']) && $this->redirects < $redirects)
{
$this->redirects++;
$location = SimplePie_Misc::absolutize_url($this->headers['location'], $url);
return $this->__construct($location, $timeout, $redirects, $headers, $useragent, $force_fsockopen);
$previousStatusCode = $this->status_code;
$this->__construct($location, $timeout, $redirects, $headers, $useragent, $force_fsockopen);
$this->permanent_url = ($previousStatusCode == 301) ? $location : $url;
return;
}
}
}
@ -222,7 +233,10 @@ class SimplePie_File
{
$this->redirects++;
$location = SimplePie_Misc::absolutize_url($this->headers['location'], $url);
return $this->__construct($location, $timeout, $redirects, $headers, $useragent, $force_fsockopen);
$previousStatusCode = $this->status_code;
$this->__construct($location, $timeout, $redirects, $headers, $useragent, $force_fsockopen);
$this->permanent_url = ($previousStatusCode == 301) ? $location : $url;
return;
}
if (isset($this->headers['content-encoding']))
{
@ -239,7 +253,7 @@ class SimplePie_File
}
else
{
$this->body = $decoder->data;
$this->body = trim($decoder->data);
}
break;
@ -282,7 +296,7 @@ class SimplePie_File
else
{
$this->method = SIMPLEPIE_FILE_SOURCE_LOCAL | SIMPLEPIE_FILE_SOURCE_FILE_GET_CONTENTS;
if (!$this->body = file_get_contents($url))
if (empty($url) || !($this->body = trim(file_get_contents($url))))
{
$this->error = 'file_get_contents could not read the file';
$this->success = false;

View File

@ -5,7 +5,7 @@
* A PHP-Based RSS and Atom Feed Framework.
* Takes the hard work out of managing a complete RSS/Atom solution.
*
* Copyright (c) 2004-2012, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* Copyright (c) 2004-2016, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are
@ -33,8 +33,7 @@
* POSSIBILITY OF SUCH DAMAGE.
*
* @package SimplePie
* @version 1.3.1
* @copyright 2004-2012 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @copyright 2004-2016 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @author Ryan Parman
* @author Geoffrey Sneddon
* @author Ryan McCue

View File

@ -5,7 +5,7 @@
* A PHP-Based RSS and Atom Feed Framework.
* Takes the hard work out of managing a complete RSS/Atom solution.
*
* Copyright (c) 2004-2012, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* Copyright (c) 2004-2016, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are
@ -33,8 +33,7 @@
* POSSIBILITY OF SUCH DAMAGE.
*
* @package SimplePie
* @version 1.3.1
* @copyright 2004-2012 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @copyright 2004-2016 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @author Ryan Parman
* @author Geoffrey Sneddon
* @author Ryan McCue
@ -259,6 +258,15 @@ class SimplePie_IRI
$this->set_iri($iri);
}
/**
 * Release the parsing caches when an IRI object is destroyed.
 *
 * set_iri(), set_path() and set_authority() each keep a function-static
 * `$cache`. Calling them with `$clear_cache = true` resets that cache to
 * null and returns immediately, so the null first argument is never parsed.
 */
public function __destruct()
{
    // Same order as before: IRI cache, then path cache, then authority cache.
    foreach (array('set_iri', 'set_path', 'set_authority') as $setter)
    {
        $this->$setter(null, true);
    }
}
/**
* Create a new IRI object by resolving a relative IRI
*
@ -768,24 +776,20 @@ class SimplePie_IRI
*/
public function is_valid()
{
$isauthority = $this->iuserinfo !== null || $this->ihost !== null || $this->port !== null;
if ($this->ipath !== '' &&
(
$isauthority && (
$this->ipath[0] !== '/' ||
substr($this->ipath, 0, 2) === '//'
) ||
(
$this->scheme === null &&
!$isauthority &&
strpos($this->ipath, ':') !== false &&
(strpos($this->ipath, '/') === false ? true : strpos($this->ipath, ':') < strpos($this->ipath, '/'))
)
)
)
{
return false;
}
if ($this->ipath === '') return true;
$isauthority = $this->iuserinfo !== null || $this->ihost !== null ||
$this->port !== null;
if ($isauthority && $this->ipath[0] === '/') return true;
if (!$isauthority && (substr($this->ipath, 0, 2) === '//')) return false;
// Relative urls cannot have a colon in the first path segment (and the
// slashes themselves are not included so skip the first character).
if (!$this->scheme && !$isauthority &&
strpos($this->ipath, ':') !== false &&
strpos($this->ipath, '/', 1) !== false &&
strpos($this->ipath, ':') < strpos($this->ipath, '/', 1)) return false;
return true;
}
@ -797,9 +801,14 @@ class SimplePie_IRI
* @param string $iri
* @return bool
*/
public function set_iri($iri)
public function set_iri($iri, $clear_cache = false)
{
static $cache;
if ($clear_cache)
{
$cache = null;
return;
}
if (!$cache)
{
$cache = array();
@ -879,9 +888,14 @@ class SimplePie_IRI
* @param string $authority
* @return bool
*/
public function set_authority($authority)
public function set_authority($authority, $clear_cache = false)
{
static $cache;
if ($clear_cache)
{
$cache = null;
return;
}
if (!$cache)
$cache = array();
@ -1049,9 +1063,14 @@ class SimplePie_IRI
* @param string $ipath
* @return bool
*/
public function set_path($ipath)
public function set_path($ipath, $clear_cache = false)
{
static $cache;
if ($clear_cache)
{
$cache = null;
return;
}
if (!$cache)
{
$cache = array();

View File

@ -5,7 +5,7 @@
* A PHP-Based RSS and Atom Feed Framework.
* Takes the hard work out of managing a complete RSS/Atom solution.
*
* Copyright (c) 2004-2012, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* Copyright (c) 2004-2016, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are
@ -33,8 +33,7 @@
* POSSIBILITY OF SUCH DAMAGE.
*
* @package SimplePie
* @version 1.3.1
* @copyright 2004-2012 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @copyright 2004-2016 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @author Ryan Parman
* @author Geoffrey Sneddon
* @author Ryan McCue
@ -203,14 +202,13 @@ class SimplePie_Item
*
* Uses `<atom:id>`, `<guid>`, `<dc:identifier>` or the `about` attribute
* for RDF. If none of these are supplied (or `$hash` is true), creates an
* MD5 hash based on the permalink and title. If either of those are not
* supplied, creates a hash based on the full feed data.
* MD5 hash based on the permalink, title and content.
*
* @since Beta 2
* @param boolean $hash Should we force using a hash instead of the supplied ID?
* @return string
*/
public function get_id($hash = false)
public function get_id($hash = false, $fn = '')
{
if (!$hash)
{
@ -238,23 +236,10 @@ class SimplePie_Item
{
return $this->sanitize($this->data['attribs'][SIMPLEPIE_NAMESPACE_RDF]['about'], SIMPLEPIE_CONSTRUCT_TEXT);
}
elseif (($return = $this->get_permalink()) !== null)
{
return $return;
}
elseif (($return = $this->get_title()) !== null)
{
return $return;
}
}
if ($this->get_permalink() !== null || $this->get_title() !== null)
{
return md5($this->get_permalink() . $this->get_title());
}
else
{
return md5(serialize($this->data));
}
if ($fn === '' || !is_callable($fn)) $fn = 'md5';
return call_user_func($fn,
$this->get_permalink().$this->get_title().$this->get_content());
}
/**
@ -322,41 +307,50 @@ class SimplePie_Item
*/
public function get_description($description_only = false)
{
if ($return = $this->get_item_tags(SIMPLEPIE_NAMESPACE_ATOM_10, 'summary'))
if (($tags = $this->get_item_tags(SIMPLEPIE_NAMESPACE_ATOM_10, 'summary')) &&
($return = $this->sanitize($tags[0]['data'], $this->registry->call('Misc', 'atom_10_construct_type', array($tags[0]['attribs'])), $this->get_base($tags[0]))))
{
return $this->sanitize($return[0]['data'], $this->registry->call('Misc', 'atom_10_construct_type', array($return[0]['attribs'])), $this->get_base($return[0]));
return $return;
}
elseif ($return = $this->get_item_tags(SIMPLEPIE_NAMESPACE_ATOM_03, 'summary'))
elseif (($tags = $this->get_item_tags(SIMPLEPIE_NAMESPACE_ATOM_03, 'summary')) &&
($return = $this->sanitize($tags[0]['data'], $this->registry->call('Misc', 'atom_03_construct_type', array($tags[0]['attribs'])), $this->get_base($tags[0]))))
{
return $this->sanitize($return[0]['data'], $this->registry->call('Misc', 'atom_03_construct_type', array($return[0]['attribs'])), $this->get_base($return[0]));
return $return;
}
elseif ($return = $this->get_item_tags(SIMPLEPIE_NAMESPACE_RSS_10, 'description'))
elseif (($tags = $this->get_item_tags(SIMPLEPIE_NAMESPACE_RSS_10, 'description')) &&
($return = $this->sanitize($tags[0]['data'], SIMPLEPIE_CONSTRUCT_MAYBE_HTML, $this->get_base($tags[0]))))
{
return $this->sanitize($return[0]['data'], SIMPLEPIE_CONSTRUCT_MAYBE_HTML, $this->get_base($return[0]));
return $return;
}
elseif ($return = $this->get_item_tags(SIMPLEPIE_NAMESPACE_RSS_20, 'description'))
elseif (($tags = $this->get_item_tags(SIMPLEPIE_NAMESPACE_RSS_20, 'description')) &&
($return = $this->sanitize($tags[0]['data'], SIMPLEPIE_CONSTRUCT_HTML, $this->get_base($tags[0]))))
{
return $this->sanitize($return[0]['data'], SIMPLEPIE_CONSTRUCT_HTML, $this->get_base($return[0]));
return $return;
}
elseif ($return = $this->get_item_tags(SIMPLEPIE_NAMESPACE_DC_11, 'description'))
elseif (($tags = $this->get_item_tags(SIMPLEPIE_NAMESPACE_DC_11, 'description')) &&
($return = $this->sanitize($tags[0]['data'], SIMPLEPIE_CONSTRUCT_TEXT)))
{
return $this->sanitize($return[0]['data'], SIMPLEPIE_CONSTRUCT_TEXT);
return $return;
}
elseif ($return = $this->get_item_tags(SIMPLEPIE_NAMESPACE_DC_10, 'description'))
elseif (($tags = $this->get_item_tags(SIMPLEPIE_NAMESPACE_DC_10, 'description')) &&
($return = $this->sanitize($tags[0]['data'], SIMPLEPIE_CONSTRUCT_TEXT)))
{
return $this->sanitize($return[0]['data'], SIMPLEPIE_CONSTRUCT_TEXT);
return $return;
}
elseif ($return = $this->get_item_tags(SIMPLEPIE_NAMESPACE_ITUNES, 'summary'))
elseif (($tags = $this->get_item_tags(SIMPLEPIE_NAMESPACE_ITUNES, 'summary')) &&
($return = $this->sanitize($tags[0]['data'], SIMPLEPIE_CONSTRUCT_HTML, $this->get_base($tags[0]))))
{
return $this->sanitize($return[0]['data'], SIMPLEPIE_CONSTRUCT_HTML, $this->get_base($return[0]));
return $return;
}
elseif ($return = $this->get_item_tags(SIMPLEPIE_NAMESPACE_ITUNES, 'subtitle'))
elseif (($tags = $this->get_item_tags(SIMPLEPIE_NAMESPACE_ITUNES, 'subtitle')) &&
($return = $this->sanitize($tags[0]['data'], SIMPLEPIE_CONSTRUCT_TEXT)))
{
return $this->sanitize($return[0]['data'], SIMPLEPIE_CONSTRUCT_TEXT);
return $return;
}
elseif ($return = $this->get_item_tags(SIMPLEPIE_NAMESPACE_RSS_090, 'description'))
elseif (($tags = $this->get_item_tags(SIMPLEPIE_NAMESPACE_RSS_090, 'description')) &&
($return = $this->sanitize($tags[0]['data'], SIMPLEPIE_CONSTRUCT_HTML)))
{
return $this->sanitize($return[0]['data'], SIMPLEPIE_CONSTRUCT_HTML);
return $return;
}
elseif (!$description_only)
@ -385,17 +379,20 @@ class SimplePie_Item
*/
public function get_content($content_only = false)
{
if ($return = $this->get_item_tags(SIMPLEPIE_NAMESPACE_ATOM_10, 'content'))
if (($tags = $this->get_item_tags(SIMPLEPIE_NAMESPACE_ATOM_10, 'content')) &&
($return = $this->sanitize($tags[0]['data'], $this->registry->call('Misc', 'atom_10_content_construct_type', array($tags[0]['attribs'])), $this->get_base($tags[0]))))
{
return $this->sanitize($return[0]['data'], $this->registry->call('Misc', 'atom_10_content_construct_type', array($return[0]['attribs'])), $this->get_base($return[0]));
return $return;
}
elseif ($return = $this->get_item_tags(SIMPLEPIE_NAMESPACE_ATOM_03, 'content'))
elseif (($tags = $this->get_item_tags(SIMPLEPIE_NAMESPACE_ATOM_03, 'content')) &&
($return = $this->sanitize($tags[0]['data'], $this->registry->call('Misc', 'atom_03_construct_type', array($tags[0]['attribs'])), $this->get_base($tags[0]))))
{
return $this->sanitize($return[0]['data'], $this->registry->call('Misc', 'atom_03_construct_type', array($return[0]['attribs'])), $this->get_base($return[0]));
return $return;
}
elseif ($return = $this->get_item_tags(SIMPLEPIE_NAMESPACE_RSS_10_MODULES_CONTENT, 'encoded'))
elseif (($tags = $this->get_item_tags(SIMPLEPIE_NAMESPACE_RSS_10_MODULES_CONTENT, 'encoded')) &&
($return = $this->sanitize($tags[0]['data'], SIMPLEPIE_CONSTRUCT_HTML, $this->get_base($tags[0]))))
{
return $this->sanitize($return[0]['data'], SIMPLEPIE_CONSTRUCT_HTML, $this->get_base($return[0]));
return $return;
}
elseif (!$content_only)
{
@ -406,6 +403,30 @@ class SimplePie_Item
return null;
}
}
/**
 * Get the media:thumbnail of the item
 *
 * Uses `<media:thumbnail>`. Only the first matching element is considered.
 * A non-null result is stored in `$this->data['thumbnail']` and reused on
 * later calls.
 *
 * @return array|null The un-namespaced attributes of the first
 *                    `<media:thumbnail>` element, or null when the item has
 *                    no such element or the element carries no such attributes
 */
public function get_thumbnail()
{
    if (!isset($this->data['thumbnail']))
    {
        $thumbnail = null;
        $tags = $this->get_item_tags(SIMPLEPIE_NAMESPACE_MEDIARSS, 'thumbnail');

        // Guard the lookup: the '' (no-namespace) attribute bucket may be
        // absent, in which case reading it would raise an undefined-index
        // notice instead of cleanly yielding null.
        if ($tags && isset($tags[0]['attribs']['']))
        {
            $thumbnail = $tags[0]['attribs'][''];
        }

        $this->data['thumbnail'] = $thumbnail;
    }
    return $this->data['thumbnail'];
}
/**
* Get a category for the item
@ -433,7 +454,7 @@ class SimplePie_Item
* Uses `<atom:category>`, `<category>` or `<dc:subject>`
*
* @since Beta 3
* @return array|null List of {@see SimplePie_Category} objects
* @return SimplePie_Category[]|null List of {@see SimplePie_Category} objects
*/
public function get_categories()
{
@ -446,15 +467,15 @@ class SimplePie_Item
$label = null;
if (isset($category['attribs']['']['term']))
{
$term = $this->sanitize($category['attribs']['']['term'], SIMPLEPIE_CONSTRUCT_TEXT);
$term = $this->sanitize($category['attribs']['']['term'], SIMPLEPIE_CONSTRUCT_HTML);
}
if (isset($category['attribs']['']['scheme']))
{
$scheme = $this->sanitize($category['attribs']['']['scheme'], SIMPLEPIE_CONSTRUCT_TEXT);
$scheme = $this->sanitize($category['attribs']['']['scheme'], SIMPLEPIE_CONSTRUCT_HTML);
}
if (isset($category['attribs']['']['label']))
{
$label = $this->sanitize($category['attribs']['']['label'], SIMPLEPIE_CONSTRUCT_TEXT);
$label = $this->sanitize($category['attribs']['']['label'], SIMPLEPIE_CONSTRUCT_HTML);
}
$categories[] = $this->registry->create('Category', array($term, $scheme, $label));
}
@ -462,10 +483,10 @@ class SimplePie_Item
{
// This is really the label, but keep this as the term also for BC.
// Label will also work on retrieving because that falls back to term.
$term = $this->sanitize($category['data'], SIMPLEPIE_CONSTRUCT_TEXT);
$term = $this->sanitize($category['data'], SIMPLEPIE_CONSTRUCT_HTML);
if (isset($category['attribs']['']['domain']))
{
$scheme = $this->sanitize($category['attribs']['']['domain'], SIMPLEPIE_CONSTRUCT_TEXT);
$scheme = $this->sanitize($category['attribs']['']['domain'], SIMPLEPIE_CONSTRUCT_HTML);
}
else
{
@ -475,11 +496,11 @@ class SimplePie_Item
}
foreach ((array) $this->get_item_tags(SIMPLEPIE_NAMESPACE_DC_11, 'subject') as $category)
{
$categories[] = $this->registry->create('Category', array($this->sanitize($category['data'], SIMPLEPIE_CONSTRUCT_TEXT), null, null));
$categories[] = $this->registry->create('Category', array($this->sanitize($category['data'], SIMPLEPIE_CONSTRUCT_HTML), null, null));
}
foreach ((array) $this->get_item_tags(SIMPLEPIE_NAMESPACE_DC_10, 'subject') as $category)
{
$categories[] = $this->registry->create('Category', array($this->sanitize($category['data'], SIMPLEPIE_CONSTRUCT_TEXT), null, null));
$categories[] = $this->registry->create('Category', array($this->sanitize($category['data'], SIMPLEPIE_CONSTRUCT_HTML), null, null));
}
if (!empty($categories))
@ -616,7 +637,7 @@ class SimplePie_Item
$email = null;
if (isset($author['child'][SIMPLEPIE_NAMESPACE_ATOM_10]['name'][0]['data']))
{
$name = $this->sanitize($author['child'][SIMPLEPIE_NAMESPACE_ATOM_10]['name'][0]['data'], SIMPLEPIE_CONSTRUCT_TEXT);
$name = $this->sanitize($author['child'][SIMPLEPIE_NAMESPACE_ATOM_10]['name'][0]['data'], SIMPLEPIE_CONSTRUCT_HTML);
}
if (isset($author['child'][SIMPLEPIE_NAMESPACE_ATOM_10]['uri'][0]['data']))
{
@ -624,7 +645,7 @@ class SimplePie_Item
}
if (isset($author['child'][SIMPLEPIE_NAMESPACE_ATOM_10]['email'][0]['data']))
{
$email = $this->sanitize($author['child'][SIMPLEPIE_NAMESPACE_ATOM_10]['email'][0]['data'], SIMPLEPIE_CONSTRUCT_TEXT);
$email = $this->sanitize($author['child'][SIMPLEPIE_NAMESPACE_ATOM_10]['email'][0]['data'], SIMPLEPIE_CONSTRUCT_HTML);
}
if ($name !== null || $email !== null || $uri !== null)
{
@ -638,7 +659,7 @@ class SimplePie_Item
$email = null;
if (isset($author[0]['child'][SIMPLEPIE_NAMESPACE_ATOM_03]['name'][0]['data']))
{
$name = $this->sanitize($author[0]['child'][SIMPLEPIE_NAMESPACE_ATOM_03]['name'][0]['data'], SIMPLEPIE_CONSTRUCT_TEXT);
$name = $this->sanitize($author[0]['child'][SIMPLEPIE_NAMESPACE_ATOM_03]['name'][0]['data'], SIMPLEPIE_CONSTRUCT_HTML);
}
if (isset($author[0]['child'][SIMPLEPIE_NAMESPACE_ATOM_03]['url'][0]['data']))
{
@ -646,7 +667,7 @@ class SimplePie_Item
}
if (isset($author[0]['child'][SIMPLEPIE_NAMESPACE_ATOM_03]['email'][0]['data']))
{
$email = $this->sanitize($author[0]['child'][SIMPLEPIE_NAMESPACE_ATOM_03]['email'][0]['data'], SIMPLEPIE_CONSTRUCT_TEXT);
$email = $this->sanitize($author[0]['child'][SIMPLEPIE_NAMESPACE_ATOM_03]['email'][0]['data'], SIMPLEPIE_CONSTRUCT_HTML);
}
if ($name !== null || $email !== null || $url !== null)
{
@ -655,19 +676,19 @@ class SimplePie_Item
}
if ($author = $this->get_item_tags(SIMPLEPIE_NAMESPACE_RSS_20, 'author'))
{
$authors[] = $this->registry->create('Author', array(null, null, $this->sanitize($author[0]['data'], SIMPLEPIE_CONSTRUCT_TEXT)));
$authors[] = $this->registry->create('Author', array(null, null, $this->sanitize($author[0]['data'], SIMPLEPIE_CONSTRUCT_HTML)));
}
foreach ((array) $this->get_item_tags(SIMPLEPIE_NAMESPACE_DC_11, 'creator') as $author)
{
$authors[] = $this->registry->create('Author', array($this->sanitize($author['data'], SIMPLEPIE_CONSTRUCT_TEXT), null, null));
$authors[] = $this->registry->create('Author', array($this->sanitize($author['data'], SIMPLEPIE_CONSTRUCT_HTML), null, null));
}
foreach ((array) $this->get_item_tags(SIMPLEPIE_NAMESPACE_DC_10, 'creator') as $author)
{
$authors[] = $this->registry->create('Author', array($this->sanitize($author['data'], SIMPLEPIE_CONSTRUCT_TEXT), null, null));
$authors[] = $this->registry->create('Author', array($this->sanitize($author['data'], SIMPLEPIE_CONSTRUCT_HTML), null, null));
}
foreach ((array) $this->get_item_tags(SIMPLEPIE_NAMESPACE_ITUNES, 'author') as $author)
{
$authors[] = $this->registry->create('Author', array($this->sanitize($author['data'], SIMPLEPIE_CONSTRUCT_TEXT), null, null));
$authors[] = $this->registry->create('Author', array($this->sanitize($author['data'], SIMPLEPIE_CONSTRUCT_HTML), null, null));
}
if (!empty($authors))
@ -738,6 +759,18 @@ class SimplePie_Item
{
$this->data['date']['raw'] = $return[0]['data'];
}
elseif ($return = $this->get_item_tags(SIMPLEPIE_NAMESPACE_RSS_20, 'pubDate'))
{
$this->data['date']['raw'] = $return[0]['data'];
}
elseif ($return = $this->get_item_tags(SIMPLEPIE_NAMESPACE_DC_11, 'date'))
{
$this->data['date']['raw'] = $return[0]['data'];
}
elseif ($return = $this->get_item_tags(SIMPLEPIE_NAMESPACE_DC_10, 'date'))
{
$this->data['date']['raw'] = $return[0]['data'];
}
elseif ($return = $this->get_item_tags(SIMPLEPIE_NAMESPACE_ATOM_10, 'updated'))
{
$this->data['date']['raw'] = $return[0]['data'];
@ -754,18 +787,6 @@ class SimplePie_Item
{
$this->data['date']['raw'] = $return[0]['data'];
}
elseif ($return = $this->get_item_tags(SIMPLEPIE_NAMESPACE_RSS_20, 'pubDate'))
{
$this->data['date']['raw'] = $return[0]['data'];
}
elseif ($return = $this->get_item_tags(SIMPLEPIE_NAMESPACE_DC_11, 'date'))
{
$this->data['date']['raw'] = $return[0]['data'];
}
elseif ($return = $this->get_item_tags(SIMPLEPIE_NAMESPACE_DC_10, 'date'))
{
$this->data['date']['raw'] = $return[0]['data'];
}
if (!empty($this->data['date']['raw']))
{
@ -821,7 +842,7 @@ class SimplePie_Item
if (!empty($this->data['updated']['raw']))
{
$parser = $this->registry->call('Parse_Date', 'get');
$this->data['updated']['parsed'] = $parser->parse($this->data['date']['raw']);
$this->data['updated']['parsed'] = $parser->parse($this->data['updated']['raw']);
}
else
{
@ -1080,8 +1101,8 @@ class SimplePie_Item
*
* @since Beta 2
* @todo Add support for end-user defined sorting of enclosures by type/handler (so we can prefer the faster-loading FLV over MP4).
* @todo If an element exists at a level, but it's value is empty, we should fall back to the value from the parent (if it exists).
* @return array|null List of SimplePie_Enclosure items
* @todo If an element exists at a level, but its value is empty, we should fall back to the value from the parent (if it exists).
* @return SimplePie_Enclosure[]|null List of SimplePie_Enclosure items
*/
public function get_enclosures()
{
@ -2658,7 +2679,9 @@ class SimplePie_Item
// PLAYER
if (isset($content['child'][SIMPLEPIE_NAMESPACE_MEDIARSS]['player']))
{
$player = $this->sanitize($content['child'][SIMPLEPIE_NAMESPACE_MEDIARSS]['player'][0]['attribs']['']['url'], SIMPLEPIE_CONSTRUCT_IRI);
if (isset($content['child'][SIMPLEPIE_NAMESPACE_MEDIARSS]['player'][0]['attribs']['']['url'])) {
$player = $this->sanitize($content['child'][SIMPLEPIE_NAMESPACE_MEDIARSS]['player'][0]['attribs']['']['url'], SIMPLEPIE_CONSTRUCT_IRI);
}
}
else
{
@ -2733,7 +2756,9 @@ class SimplePie_Item
{
foreach ($content['child'][SIMPLEPIE_NAMESPACE_MEDIARSS]['thumbnail'] as $thumbnail)
{
$thumbnails[] = $this->sanitize($thumbnail['attribs']['']['url'], SIMPLEPIE_CONSTRUCT_IRI);
if (isset($thumbnail['attribs']['']['url'])) {
$thumbnails[] = $this->sanitize($thumbnail['attribs']['']['url'], SIMPLEPIE_CONSTRUCT_IRI);
}
}
if (is_array($thumbnails))
{

View File

@ -5,7 +5,7 @@
* A PHP-Based RSS and Atom Feed Framework.
* Takes the hard work out of managing a complete RSS/Atom solution.
*
* Copyright (c) 2004-2012, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* Copyright (c) 2004-2016, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are
@ -33,8 +33,7 @@
* POSSIBILITY OF SUCH DAMAGE.
*
* @package SimplePie
* @version 1.3.1
* @copyright 2004-2012 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @copyright 2004-2016 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @author Ryan Parman
* @author Geoffrey Sneddon
* @author Ryan McCue
@ -121,34 +120,41 @@ class SimplePie_Locator
{
if ($type & SIMPLEPIE_LOCATOR_LOCAL_EXTENSION && $working = $this->extension($this->local))
{
return $working;
return $working[0];
}
if ($type & SIMPLEPIE_LOCATOR_LOCAL_BODY && $working = $this->body($this->local))
{
return $working;
return $working[0];
}
if ($type & SIMPLEPIE_LOCATOR_REMOTE_EXTENSION && $working = $this->extension($this->elsewhere))
{
return $working;
return $working[0];
}
if ($type & SIMPLEPIE_LOCATOR_REMOTE_BODY && $working = $this->body($this->elsewhere))
{
return $working;
return $working[0];
}
}
return null;
}
public function is_feed($file)
public function is_feed($file, $check_html = false)
{
if ($file->method & SIMPLEPIE_FILE_SOURCE_REMOTE)
{
$sniffer = $this->registry->create('Content_Type_Sniffer', array($file));
$sniffed = $sniffer->get_type();
if (in_array($sniffed, array('application/rss+xml', 'application/rdf+xml', 'text/rdf', 'application/atom+xml', 'text/xml', 'application/xml')))
$mime_types = array('application/rss+xml', 'application/rdf+xml',
'text/rdf', 'application/atom+xml', 'text/xml',
'application/xml', 'application/x-rss+xml');
if ($check_html)
{
$mime_types[] = 'text/html';
}
if (in_array($sniffed, $mime_types))
{
return true;
}
@ -226,7 +232,7 @@ class SimplePie_Locator
}
if ($link->hasAttribute('href') && $link->hasAttribute('rel'))
{
$rel = array_unique($this->registry->call('Misc', 'space_seperated_tokens', array(strtolower($link->getAttribute('rel')))));
$rel = array_unique($this->registry->call('Misc', 'space_separated_tokens', array(strtolower($link->getAttribute('rel')))));
$line = method_exists($link, 'getLineNo') ? $link->getLineNo() : 1;
if ($this->base_location < $line)
@ -242,14 +248,14 @@ class SimplePie_Locator
continue;
}
if (!in_array($href, $done) && in_array('feed', $rel) || (in_array('alternate', $rel) && !in_array('stylesheet', $rel) && $link->hasAttribute('type') && in_array(strtolower($this->registry->call('Misc', 'parse_mime', array($link->getAttribute('type')))), array('application/rss+xml', 'application/atom+xml'))) && !isset($feeds[$href]))
if (!in_array($href, $done) && in_array('feed', $rel) || (in_array('alternate', $rel) && !in_array('stylesheet', $rel) && $link->hasAttribute('type') && in_array(strtolower($this->registry->call('Misc', 'parse_mime', array($link->getAttribute('type')))), array('text/html', 'application/rss+xml', 'application/atom+xml'))) && !isset($feeds[$href]))
{
$this->checked_feeds++;
$headers = array(
'Accept' => 'application/atom+xml, application/rss+xml, application/rdf+xml;q=0.9, application/xml;q=0.8, text/xml;q=0.8, text/html;q=0.7, unknown/unknown;q=0.1, application/unknown;q=0.1, */*;q=0.1',
);
$feed = $this->registry->create('File', array($href, $this->timeout, 5, $headers, $this->useragent));
if ($feed->success && ($feed->method & SIMPLEPIE_FILE_SOURCE_REMOTE === 0 || ($feed->status_code === 200 || $feed->status_code > 206 && $feed->status_code < 300)) && $this->is_feed($feed))
if ($feed->success && ($feed->method & SIMPLEPIE_FILE_SOURCE_REMOTE === 0 || ($feed->status_code === 200 || $feed->status_code > 206 && $feed->status_code < 300)) && $this->is_feed($feed, true))
{
$feeds[$href] = $feed;
}
@ -275,9 +281,9 @@ class SimplePie_Locator
{
$href = trim($link->getAttribute('href'));
$parsed = $this->registry->call('Misc', 'parse_url', array($href));
if ($parsed['scheme'] === '' || preg_match('/^(http(s)|feed)?$/i', $parsed['scheme']))
if ($parsed['scheme'] === '' || preg_match('/^(https?|feed)?$/i', $parsed['scheme']))
{
if ($this->base_location < $link->getLineNo())
if (method_exists($link, 'getLineNo') && $this->base_location < $link->getLineNo())
{
$href = $this->registry->call('Misc', 'absolutize_url', array(trim($link->getAttribute('href')), $this->base));
}
@ -312,6 +318,57 @@ class SimplePie_Locator
return null;
}
/**
 * Find the first link in the document that carries a given `rel` value.
 *
 * Looks at every `<a>` and `<link>` element that has both a `rel` and an
 * `href` attribute, in the order the XPath query yields them. Links whose
 * href has a scheme other than http/https (or no scheme) are ignored.
 * The href is resolved against `$this->base` when the element's line number
 * is known and lies after `$this->base_location`, otherwise against
 * `$this->http_base`.
 *
 * @param string $rel Relationship token to look for; it is compared against
 *                    the lower-cased tokens of each link's `rel` attribute
 * @return string|null Absolute URL of the first matching link, or null when
 *                     no link matches or a candidate href cannot be resolved
 * @throws SimplePie_Exception When no DOM document is available or the
 *                             DOMXpath class does not exist
 */
public function get_rel_link($rel)
{
    if ($this->dom === null)
    {
        throw new SimplePie_Exception('DOMDocument not found, unable to use '.
                                      'locator');
    }
    if (!class_exists('DOMXpath'))
    {
        throw new SimplePie_Exception('DOMXpath not found, unable to use '.
                                      'get_rel_link');
    }

    $xpath = new DOMXpath($this->dom);
    $query = '//a[@rel and @href] | //link[@rel and @href]';
    foreach ($xpath->query($query) as $link)
    {
        $href = trim($link->getAttribute('href'));
        $parsed = $this->registry->call('Misc', 'parse_url', array($href));
        if ($parsed['scheme'] !== '' && !preg_match('/^https?$/i', $parsed['scheme']))
        {
            // Not a relative or http(s) link: skip it.
            continue;
        }

        // Pick the base URL once instead of duplicating the resolve call.
        $use_document_base = method_exists($link, 'getLineNo') &&
                             $this->base_location < $link->getLineNo();
        $base = $use_document_base ? $this->base : $this->http_base;

        $href = $this->registry->call('Misc', 'absolutize_url', array($href, $base));
        if ($href === false)
        {
            // Kept from the original: an unresolvable href ends the search.
            return null;
        }

        // rel is a set of space-separated tokens; the separator may be any
        // HTML space character (space, tab, LF, FF, CR), not just U+0020.
        $rel_values = preg_split('/[\x20\x09\x0A\x0C\x0D]+/',
                                 strtolower($link->getAttribute('rel')),
                                 -1, PREG_SPLIT_NO_EMPTY);
        if (in_array($rel, $rel_values))
        {
            return $href;
        }
    }
    return null;
}
public function extension(&$array)
{
foreach ($array as $key => $value)
@ -330,7 +387,7 @@ class SimplePie_Locator
$feed = $this->registry->create('File', array($value, $this->timeout, 5, $headers, $this->useragent));
if ($feed->success && ($feed->method & SIMPLEPIE_FILE_SOURCE_REMOTE === 0 || ($feed->status_code === 200 || $feed->status_code > 206 && $feed->status_code < 300)) && $this->is_feed($feed))
{
return $feed;
return array($feed);
}
else
{
@ -358,7 +415,7 @@ class SimplePie_Locator
$feed = $this->registry->create('File', array($value, $this->timeout, 5, null, $this->useragent));
if ($feed->success && ($feed->method & SIMPLEPIE_FILE_SOURCE_REMOTE === 0 || ($feed->status_code === 200 || $feed->status_code > 206 && $feed->status_code < 300)) && $this->is_feed($feed))
{
return $feed;
return array($feed);
}
else
{

View File

@ -5,7 +5,7 @@
* A PHP-Based RSS and Atom Feed Framework.
* Takes the hard work out of managing a complete RSS/Atom solution.
*
* Copyright (c) 2004-2012, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* Copyright (c) 2004-2016, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are
@ -33,8 +33,7 @@
* POSSIBILITY OF SUCH DAMAGE.
*
* @package SimplePie
* @version 1.3.1
* @copyright 2004-2012 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @copyright 2004-2016 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @author Ryan Parman
* @author Geoffrey Sneddon
* @author Ryan McCue
@ -124,7 +123,7 @@ class SimplePie_Misc
{
$attribs[$j][2] = $attribs[$j][1];
}
$return[$i]['attribs'][strtolower($attribs[$j][1])]['data'] = SimplePie_Misc::entities_decode(end($attribs[$j]), 'UTF-8');
$return[$i]['attribs'][strtolower($attribs[$j][1])]['data'] = SimplePie_Misc::entities_decode(end($attribs[$j]));
}
}
}
@ -138,7 +137,7 @@ class SimplePie_Misc
foreach ($element['attribs'] as $key => $value)
{
$key = strtolower($key);
$full .= " $key=\"" . htmlspecialchars($value['data']) . '"';
$full .= " $key=\"" . htmlspecialchars($value['data'], ENT_COMPAT, 'UTF-8') . '"';
}
if ($element['self_closing'])
{
@ -224,6 +223,23 @@ class SimplePie_Misc
}
}
/**
 * Recursively merge two arrays, letting the second one win.
 *
 * Unlike PHP's built-in array_merge_recursive(), a non-array value in
 * `$array2` replaces the value under the same key in `$array1` instead of
 * being collected into a list. Array values are merged key by key.
 *
 * @param array $array1 Base array
 * @param array $array2 Array whose values take precedence
 * @return array The merged array
 */
public static function array_merge_recursive($array1, $array2)
{
    foreach ($array2 as $key => $value)
    {
        if (is_array($value))
        {
            // Start from an empty array when the key is missing from the base
            // or holds a non-array; recursing into such a value would raise
            // an undefined-index notice or fail on scalar-as-array access.
            $existing = (isset($array1[$key]) && is_array($array1[$key])) ? $array1[$key] : array();
            $array1[$key] = SimplePie_Misc::array_merge_recursive($existing, $value);
        }
        else
        {
            $array1[$key] = $value;
        }
    }

    return $array1;
}
public static function parse_url($url)
{
$iri = new SimplePie_IRI($url);
@ -317,11 +333,16 @@ class SimplePie_Misc
{
return $return;
}
// This is last, as behaviour of this varies with OS userland and PHP version
// This is third, as behaviour of this varies with OS userland and PHP version
elseif (function_exists('iconv') && ($return = SimplePie_Misc::change_encoding_iconv($data, $input, $output)))
{
return $return;
}
// This is last, as behaviour of this varies with OS userland and PHP version
elseif (class_exists('\UConverter') && ($return = SimplePie_Misc::change_encoding_uconverter($data, $input, $output)))
{
return $return;
}
// If we can't do anything, just fail
else
{
@ -372,6 +393,17 @@ class SimplePie_Misc
return @iconv($input, $output, $data);
}
/**
 * Convert a string from one encoding to another via the UConverter class.
 *
 * @param string $data Text to convert
 * @param string $input Name of the encoding $data is currently in
 * @param string $output Name of the encoding to convert to
 * @return string|false
 */
protected static function change_encoding_uconverter($data, $input, $output)
{
    // transcode() expects the target encoding before the source encoding.
    // Errors are suppressed with @; the call's result is returned as-is.
    $converted = @\UConverter::transcode($data, $output, $input);
    return $converted;
}
/**
* Normalize an encoding name
*
@ -1926,7 +1958,7 @@ class SimplePie_Misc
return (bool) preg_match('/^([A-Za-z0-9\-._~\x{A0}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFEF}\x{10000}-\x{1FFFD}\x{20000}-\x{2FFFD}\x{30000}-\x{3FFFD}\x{40000}-\x{4FFFD}\x{50000}-\x{5FFFD}\x{60000}-\x{6FFFD}\x{70000}-\x{7FFFD}\x{80000}-\x{8FFFD}\x{90000}-\x{9FFFD}\x{A0000}-\x{AFFFD}\x{B0000}-\x{BFFFD}\x{C0000}-\x{CFFFD}\x{D0000}-\x{DFFFD}\x{E1000}-\x{EFFFD}!$&\'()*+,;=@]|(%[0-9ABCDEF]{2}))+$/u', $string);
}
public static function space_seperated_tokens($string)
public static function space_separated_tokens($string)
{
$space_characters = "\x20\x09\x0A\x0B\x0C\x0D";
$string_length = strlen($string);

View File

@ -5,7 +5,7 @@
* A PHP-Based RSS and Atom Feed Framework.
* Takes the hard work out of managing a complete RSS/Atom solution.
*
* Copyright (c) 2004-2012, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* Copyright (c) 2004-2016, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are
@ -33,8 +33,7 @@
* POSSIBILITY OF SUCH DAMAGE.
*
* @package SimplePie
* @version 1.3.1
* @copyright 2004-2012 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @copyright 2004-2016 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @author Ryan Parman
* @author Geoffrey Sneddon
* @author Ryan McCue

View File

@ -5,7 +5,7 @@
* A PHP-Based RSS and Atom Feed Framework.
* Takes the hard work out of managing a complete RSS/Atom solution.
*
* Copyright (c) 2004-2012, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* Copyright (c) 2004-2016, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are
@ -33,8 +33,7 @@
* POSSIBILITY OF SUCH DAMAGE.
*
* @package SimplePie
* @version 1.3.1
* @copyright 2004-2012 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @copyright 2004-2016 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @author Ryan Parman
* @author Geoffrey Sneddon
* @author Ryan McCue
@ -173,7 +172,7 @@ class SimplePie_Parse_Date
'aug' => 8,
'august' => 8,
'sep' => 9,
'september' => 8,
'september' => 9,
'oct' => 10,
'october' => 10,
'nov' => 11,
@ -331,6 +330,7 @@ class SimplePie_Parse_Date
'CCT' => 23400,
'CDT' => -18000,
'CEDT' => 7200,
'CEST' => 7200,
'CET' => 3600,
'CGST' => -7200,
'CGT' => -10800,
@ -630,7 +630,7 @@ class SimplePie_Parse_Date
/**
* Parse a superset of W3C-DTF (allows hyphens and colons to be omitted, as
* well as allowing any of upper or lower case "T", horizontal tabs, or
* spaces to be used as the time seperator (including more than one))
* spaces to be used as the time separator (including more than one))
*
* @access protected
* @return int Timestamp
@ -690,7 +690,7 @@ class SimplePie_Parse_Date
}
// Convert the number of seconds to an integer, taking decimals into account
$second = round($match[6] + $match[7] / pow(10, strlen($match[7])));
$second = round((int)$match[6] + (int)$match[7] / pow(10, strlen($match[7])));
return gmmktime($match[4], $match[5], $second, $match[2], $match[3], $match[1]) - $timezone;
}
@ -720,7 +720,7 @@ class SimplePie_Parse_Date
{
$output .= substr($string, $position, $pos - $position);
$position = $pos + 1;
if ($string[$pos - 1] !== '\\')
if ($pos === 0 || $string[$pos - 1] !== '\\')
{
$depth++;
while ($depth && $position < $length)

File diff suppressed because one or more lines are too long

View File

@ -5,7 +5,7 @@
* A PHP-Based RSS and Atom Feed Framework.
* Takes the hard work out of managing a complete RSS/Atom solution.
*
* Copyright (c) 2004-2012, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* Copyright (c) 2004-2016, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are
@ -33,8 +33,7 @@
* POSSIBILITY OF SUCH DAMAGE.
*
* @package SimplePie
* @version 1.3.1
* @copyright 2004-2012 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @copyright 2004-2016 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @author Ryan Parman
* @author Geoffrey Sneddon
* @author Ryan McCue

View File

@ -5,7 +5,7 @@
* A PHP-Based RSS and Atom Feed Framework.
* Takes the hard work out of managing a complete RSS/Atom solution.
*
* Copyright (c) 2004-2012, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* Copyright (c) 2004-2016, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are
@ -33,8 +33,7 @@
* POSSIBILITY OF SUCH DAMAGE.
*
* @package SimplePie
* @version 1.3.1
* @copyright 2004-2012 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @copyright 2004-2016 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @author Ryan Parman
* @author Geoffrey Sneddon
* @author Ryan McCue
@ -113,7 +112,7 @@ class SimplePie_Registry
*/
public function register($type, $class, $legacy = false)
{
if (!is_subclass_of($class, $this->default[$type]))
if (!@is_subclass_of($class, $this->default[$type]))
{
return false;
}
@ -222,4 +221,4 @@ class SimplePie_Registry
$result = call_user_func_array(array($class, $method), $parameters);
return $result;
}
}
}

View File

@ -5,7 +5,7 @@
* A PHP-Based RSS and Atom Feed Framework.
* Takes the hard work out of managing a complete RSS/Atom solution.
*
* Copyright (c) 2004-2012, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* Copyright (c) 2004-2016, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are
@ -33,8 +33,7 @@
* POSSIBILITY OF SUCH DAMAGE.
*
* @package SimplePie
* @version 1.3.1
* @copyright 2004-2012 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @copyright 2004-2016 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @author Ryan Parman
* @author Geoffrey Sneddon
* @author Ryan McCue

View File

@ -5,7 +5,7 @@
* A PHP-Based RSS and Atom Feed Framework.
* Takes the hard work out of managing a complete RSS/Atom solution.
*
* Copyright (c) 2004-2012, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* Copyright (c) 2004-2016, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are
@ -33,8 +33,7 @@
* POSSIBILITY OF SUCH DAMAGE.
*
* @package SimplePie
* @version 1.3.1
* @copyright 2004-2012 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @copyright 2004-2016 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @author Ryan Parman
* @author Geoffrey Sneddon
* @author Ryan McCue
@ -61,7 +60,8 @@ class SimplePie_Sanitize
var $image_handler = '';
var $strip_htmltags = array('base', 'blink', 'body', 'doctype', 'embed', 'font', 'form', 'frame', 'frameset', 'html', 'iframe', 'input', 'marquee', 'meta', 'noscript', 'object', 'param', 'script', 'style');
var $encode_instead_of_strip = false;
var $strip_attributes = array('bgsound', 'class', 'expr', 'id', 'style', 'onclick', 'onerror', 'onfinish', 'onmouseover', 'onmouseout', 'onfocus', 'onblur', 'lowsrc', 'dynsrc');
var $strip_attributes = array('bgsound', 'expr', 'id', 'style', 'onclick', 'onerror', 'onfinish', 'onmouseover', 'onmouseout', 'onfocus', 'onblur', 'lowsrc', 'dynsrc');
var $add_attributes = array('audio' => array('preload' => 'none'), 'iframe' => array('sandbox' => 'allow-scripts allow-same-origin'), 'video' => array('preload' => 'none'));
var $strip_comments = false;
var $output_encoding = 'UTF-8';
var $enable_cache = true;
@ -160,7 +160,7 @@ class SimplePie_Sanitize
$this->encode_instead_of_strip = (bool) $encode;
}
public function strip_attributes($attribs = array('bgsound', 'class', 'expr', 'id', 'style', 'onclick', 'onerror', 'onfinish', 'onmouseover', 'onmouseout', 'onfocus', 'onblur', 'lowsrc', 'dynsrc'))
public function strip_attributes($attribs = array('bgsound', 'expr', 'id', 'style', 'onclick', 'onerror', 'onfinish', 'onmouseover', 'onmouseout', 'onfocus', 'onblur', 'lowsrc', 'dynsrc'))
{
if ($attribs)
{
@ -179,6 +179,25 @@ class SimplePie_Sanitize
}
}
/**
 * Set the attributes that will be added to specific tags.
 *
 * @param array|string|false $attribs Array to store as given, a
 *        comma-separated string (split on ','), or a falsy value to
 *        disable attribute adding.
 */
public function add_attributes($attribs = array('audio' => array('preload' => 'none'), 'iframe' => array('sandbox' => 'allow-scripts allow-same-origin'), 'video' => array('preload' => 'none')))
{
    // Any falsy value switches the feature off.
    if (!$attribs)
    {
        $this->add_attributes = false;
        return;
    }

    // NOTE(review): the string form produces a flat list of strings, not
    // the tag => (attribute => value) shape of the default value - confirm
    // that consumers of $this->add_attributes can handle that form.
    $this->add_attributes = is_array($attribs) ? $attribs : explode(',', $attribs);
}
public function strip_comments($strip = false)
{
$this->strip_comments = (bool) $strip;
@ -247,18 +266,24 @@ class SimplePie_Sanitize
if ($type & (SIMPLEPIE_CONSTRUCT_HTML | SIMPLEPIE_CONSTRUCT_XHTML))
{
if (!class_exists('DOMDocument'))
{
throw new SimplePie_Exception('DOMDocument not found, unable to use sanitizer');
}
$document = new DOMDocument();
$document->encoding = 'UTF-8';
$data = $this->preprocess($data, $type);
set_error_handler(array('SimplePie_Misc', 'silence_errors'));
$document->loadHTML($data);
restore_error_handler();
$xpath = new DOMXPath($document);
// Strip comments
if ($this->strip_comments)
{
$xpath = new DOMXPath($document);
$comments = $xpath->query('//comment()');
foreach ($comments as $comment)
@ -274,7 +299,7 @@ class SimplePie_Sanitize
{
foreach ($this->strip_htmltags as $tag)
{
$this->strip_tag($tag, $document, $type);
$this->strip_tag($tag, $document, $xpath, $type);
}
}
@ -282,7 +307,15 @@ class SimplePie_Sanitize
{
foreach ($this->strip_attributes as $attrib)
{
$this->strip_attr($attrib, $document);
$this->strip_attr($attrib, $xpath);
}
}
if ($this->add_attributes)
{
foreach ($this->add_attributes as $tag => $valuePairs)
{
$this->add_attr($tag, $valuePairs, $document);
}
}
@ -310,7 +343,7 @@ class SimplePie_Sanitize
}
else
{
$file = $this->registry->create('File', array($img['attribs']['src']['data'], $this->timeout, 5, array('X-FORWARDED-FOR' => $_SERVER['REMOTE_ADDR']), $this->useragent, $this->force_fsockopen));
$file = $this->registry->create('File', array($img->getAttribute('src'), $this->timeout, 5, array('X-FORWARDED-FOR' => $_SERVER['REMOTE_ADDR']), $this->useragent, $this->force_fsockopen));
$headers = $file->headers;
if ($file->success && ($file->method & SIMPLEPIE_FILE_SOURCE_REMOTE === 0 || ($file->status_code === 200 || $file->status_code > 206 && $file->status_code < 300)))
@ -329,19 +362,17 @@ class SimplePie_Sanitize
}
}
// Remove the DOCTYPE
// Seems to cause segfaulting if we don't do this
if ($document->firstChild instanceof DOMDocumentType)
{
$document->removeChild($document->firstChild);
}
// Move everything from the body to the root
$real_body = $document->getElementsByTagName('body')->item(0)->childNodes->item(0);
$document->replaceChild($real_body, $document->firstChild);
// Get content node
$div = $document->getElementsByTagName('body')->item(0)->firstChild;
// Finally, convert to a HTML string
$data = trim($document->saveHTML());
if (version_compare(PHP_VERSION, '5.3.6', '>='))
{
$data = trim($document->saveHTML($div));
}
else
{
$data = trim($document->saveXML($div));
}
if ($this->remove_div)
{
@ -379,6 +410,7 @@ class SimplePie_Sanitize
protected function preprocess($html, $type)
{
$ret = '';
$html = preg_replace('%</?(?:html|body)[^>]*?'.'>%is', '', $html);
if ($type & ~SIMPLEPIE_CONSTRUCT_XHTML)
{
// Atom XHTML constructs are wrapped with a div by default
@ -451,9 +483,8 @@ class SimplePie_Sanitize
}
}
protected function strip_tag($tag, $document, $type)
protected function strip_tag($tag, $document, $xpath, $type)
{
$xpath = new DOMXPath($document);
$elements = $xpath->query('body//' . $tag);
if ($this->encode_instead_of_strip)
{
@ -536,9 +567,8 @@ class SimplePie_Sanitize
}
}
protected function strip_attr($attrib, $document)
protected function strip_attr($attrib, $xpath)
{
$xpath = new DOMXPath($document);
$elements = $xpath->query('//*[@' . $attrib . ']');
foreach ($elements as $element)
@ -546,4 +576,16 @@ class SimplePie_Sanitize
$element->removeAttribute($attrib);
}
}
/**
 * Set the given attributes on every element with the given tag name.
 *
 * @param string $tag Tag name to look up in the document
 * @param array $valuePairs Attribute name => attribute value
 * @param DOMDocument $document Document to modify in place
 */
protected function add_attr($tag, $valuePairs, $document)
{
    $matches = $document->getElementsByTagName($tag);
    foreach ($matches as $node)
    {
        // setAttribute() overwrites an existing attribute of the same name.
        foreach ($valuePairs as $name => $content)
        {
            $node->setAttribute($name, $content);
        }
    }
}
}

View File

@ -5,7 +5,7 @@
* A PHP-Based RSS and Atom Feed Framework.
* Takes the hard work out of managing a complete RSS/Atom solution.
*
* Copyright (c) 2004-2012, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* Copyright (c) 2004-2016, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are
@ -33,8 +33,7 @@
* POSSIBILITY OF SUCH DAMAGE.
*
* @package SimplePie
* @version 1.3.1
* @copyright 2004-2012 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @copyright 2004-2016 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @author Ryan Parman
* @author Geoffrey Sneddon
* @author Ryan McCue

View File

@ -5,7 +5,7 @@
* A PHP-Based RSS and Atom Feed Framework.
* Takes the hard work out of managing a complete RSS/Atom solution.
*
* Copyright (c) 2004-2012, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* Copyright (c) 2004-2016, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are
@ -33,8 +33,7 @@
* POSSIBILITY OF SUCH DAMAGE.
*
* @package SimplePie
* @version 1.3.1
* @copyright 2004-2012 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @copyright 2004-2016 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @author Ryan Parman
* @author Geoffrey Sneddon
* @author Ryan McCue

View File

@ -5,7 +5,7 @@
* A PHP-Based RSS and Atom Feed Framework.
* Takes the hard work out of managing a complete RSS/Atom solution.
*
* Copyright (c) 2004-2012, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* Copyright (c) 2004-2016, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are
@ -33,8 +33,7 @@
* POSSIBILITY OF SUCH DAMAGE.
*
* @package SimplePie
* @version 1.3.1
* @copyright 2004-2012 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @copyright 2004-2016 Ryan Parman, Geoffrey Sneddon, Ryan McCue
* @author Ryan Parman
* @author Geoffrey Sneddon
* @author Ryan McCue

View File

@ -1,10 +1,10 @@
<?php
// Full-Text RSS: Create Full-Text Feeds
// Author: Keyvan Minoukadeh
// Copyright (c) 2015 Keyvan Minoukadeh
// Copyright (c) 2017 Keyvan Minoukadeh
// License: AGPLv3
// Version: 3.6
// Date: 2016-02-17
// Version: 3.7
// Date: 2017-02-12
// More info: http://fivefilters.org/content-only/
// Help: http://help.fivefilters.org
@ -272,7 +272,7 @@ if (file_exists('custom_init.php')) require 'custom_init.php';
///////////////////////////////////////////////
// Check URL against list of blacklisted URLs
///////////////////////////////////////////////
if (!url_allowed($url)) die('URL blocked');
if (!url_allowed($url)) die($options->blocked_message);
///////////////////////////////////////////////
// Max entries
@ -302,12 +302,20 @@ if (isset($_REQUEST['links']) && in_array($_REQUEST['links'], array('preserve',
$links = 'preserve';
}
///////////////////////////////////////////////
// Image handling
///////////////////////////////////////////////
// Images are kept unless the request carries images=0 or images=remove.
$images = !(isset($_REQUEST['images']) && in_array($_REQUEST['images'], array('0', 'remove')));
///////////////////////////////////////////////
// Favour item titles in feed?
///////////////////////////////////////////////
$favour_feed_titles = true;
if ($options->favour_feed_titles == 'user') {
$favour_feed_titles = !isset($_REQUEST['use_extracted_title']);
$favour_feed_titles = (!isset($_REQUEST['use_extracted_title']) || $_REQUEST['use_extracted_title'] === '0');
} else {
$favour_feed_titles = $options->favour_feed_titles;
}
@ -317,7 +325,7 @@ if ($options->favour_feed_titles == 'user') {
///////////////////////////////////////////////
$favour_effective_url = false;
if ($options->favour_effective_url == 'user') {
$favour_effective_url = isset($_REQUEST['use_effective_url']);
$favour_effective_url = (isset($_REQUEST['use_effective_url']) && $_REQUEST['use_effective_url'] !== '0');
} else {
$favour_effective_url = $options->favour_effective_url;
}
@ -333,6 +341,17 @@ if ($options->content === 'user') {
}
}
///////////////////////////////////////////////
// HTML5 output?
///////////////////////////////////////////////
// When the config value is 'user', the request decides:
// HTML5 output is enabled only by content=html5.
if ($options->html5_output === 'user') {
	$options->html5_output = (isset($_REQUEST['content']) && $_REQUEST['content'] === 'html5');
}
///////////////////////////////////////////////
// Include summaries in output?
///////////////////////////////////////////////
@ -367,7 +386,7 @@ if ($options->detect_language === 'user') {
$detect_language = $options->detect_language;
}
$use_cld = extension_loaded('cld') && (version_compare(PHP_VERSION, '5.3.0') >= 0);
$use_cld = extension_loaded('cld');
/////////////////////////////////////
// Check for valid format
@ -468,8 +487,7 @@ if (isset($_REQUEST['inputhtml']) && _FF_FTR_MODE == 'simple') {
//////////////////////////////////
if ($options->caching) {
debug('Caching is enabled...');
$cache_id = md5($max.$url.(int)$valid_key.$accept.$links.(int)$favour_feed_titles.(int)$options->content.(int)$options->summary.
(int)$xss_filter.(int)$favour_effective_url.(int)$exclude_on_fail.$format.$detect_language.$parser.$user_submitted_config._FF_FTR_MODE);
$cache_id = md5($max.$url.(int)$valid_key.$accept.$links.$images.(int)$favour_feed_titles.(int)$options->content.(int)$options->html5_output.(int)$options->summary.(int)$xss_filter.(int)$favour_effective_url.(int)$exclude_on_fail.$format.$detect_language.$parser.$user_submitted_config._FF_FTR_MODE);
$check_cache = true;
if ($options->apc && $options->smart_cache) {
apc_add("cache.$cache_id", 0, $options->cache_time*60);
@ -554,6 +572,7 @@ SiteConfig::use_apc($options->apc);
$extractor->fingerprints = $options->fingerprints;
$extractor->allowedParsers = $options->allowed_parsers;
$extractor->parserOverride = $parser;
if (!$images) $extractor->stripImages = true;
if ($options->user_submitted_config && $user_submitted_config) {
$extractor->setUserSubmittedConfig($user_submitted_config);
}
@ -633,7 +652,7 @@ if ($accept === 'html' || !$result) {
public function get_enclosure($key=0, $prefer=null) { return null; }
public function get_enclosures() { return null; }
public function get_categories() { return null; }
public function get_item_tags($namespace='', $tag='') { return null; }
public function get_item_tags($namespace='', $tag='') { return null; }
}
$feed = new DummySingleItemFeed($url);
}
@ -792,7 +811,7 @@ foreach ($items as $key => $item) {
// if user has asked to see parsed HTML, show it and exit.
if ($debug_show_parsed_html) {
debug("Here's the full HTML after it's been parsed by Full-Text RSS:");
die($readability->dom->saveXML($readability->dom->documentElement));
die(make_html($readability->dom->documentElement));
}
// is this a native ad?
if ($extract_result && $extractor->isNativeAd()) {
@ -801,6 +820,8 @@ foreach ($items as $key => $item) {
continue; // skip this feed item entry
}
}
$base_url = get_base_url($readability->dom);
if (!$base_url) $base_url = $effective_url;
$content_block = ($extract_result) ? $extractor->getContent() : null;
$extracted_title = ($extract_result) ? $extractor->getTitle() : '';
// Deal with multi-page articles
@ -814,8 +835,8 @@ foreach ($items as $key => $item) {
while ($next_page_url = $extractor->getNextPageUrl()) {
debug('--------');
debug('Processing next page: '.$next_page_url);
// If we've got URL, resolve against $url
if ($next_page_url = make_absolute_str($effective_url, $next_page_url)) {
// If we've got URL, resolve against $base_url
if ($next_page_url = make_absolute_str($base_url, $next_page_url)) {
// check it's not what we have already!
if (!in_array($next_page_url, $multi_page_urls)) {
// it's not, so let's attempt to fetch it
@ -870,19 +891,24 @@ foreach ($items as $key => $item) {
if ($do_content_extraction) {
// if we failed to extract content...
if (!$extract_result) {
if ($exclude_on_fail) {
if ($exclude_on_fail && (_FF_FTR_MODE != 'simple')) {
debug('Failed to extract, so skipping (due to exclude on fail parameter)');
continue; // skip this and move to next item
}
//TODO: get text sample for language detection
$html = $options->error_message;
// keep the original item description
$html .= $item->get_description();
if (_FF_FTR_MODE === 'simple') {
$html = '';
} else {
//TODO: get text sample for language detection
$html = $options->error_message;
// keep the original item description
$html .= $item->get_description();
}
} else {
$readability->clean($content_block, 'select');
if ($options->rewrite_relative_urls) {
$base_url = get_base_url($readability->dom);
if (!$base_url) $base_url = $effective_url;
// we've got $base_url already above
//$base_url = get_base_url($readability->dom);
//if (!$base_url) $base_url = $effective_url;
// rewrite URLs
make_absolute($base_url, $content_block);
}
@ -908,20 +934,32 @@ foreach ($items as $key => $item) {
// convert content block to HTML string
// Need to preserve things like body: //img[@id='feature']
if (in_array(strtolower($content_block->tagName), array('div', 'article', 'section', 'header', 'footer', 'li', 'td'))) {
$html = $content_block->innerHTML;
//$html = $content_block->innerHTML;
$html = make_html($content_block, true); // true = innerHTML
//} elseif (in_array(strtolower($content_block->tagName), array('td', 'li'))) {
// $html = '<div>'.$content_block->innerHTML.'</div>';
} else {
$html = $content_block->ownerDocument->saveXML($content_block); // essentially outerHTML
//$html = $content_block->ownerDocument->saveXML($content_block); // essentially outerHTML
$html = make_html($content_block); // outerHTML
}
//unset($content_block);
// post-processing cleanup
$html = preg_replace('!<p>[\s\h\v]*</p>!u', '', $html);
if ($links == 'remove') {
$html = preg_replace('!</?a[^>]*>!', '', $html);
$html = preg_replace('!<a\s+[^>]*>!', '', $html);
$html = preg_replace('!</a>!', '', $html);
}
// get text sample for language detection
$text_sample = strip_tags(substr($html, 0, 500));
$_og = $extractor->getOpenGraph();
$text_sample = '';
if (isset($_og['og:title'])) {
$text_sample .= $_og['og:title'];
}
if (isset($_og['og:description'])) {
$text_sample .= ' '.$_og['og:description'];
}
$text_sample .= mb_substr($content_block->textContent, 0, 3000);
unset($_og);
$html = make_substitutions($options->message_to_prepend).$html;
$html .= make_substitutions($options->message_to_append);
}
@ -1007,10 +1045,17 @@ foreach ($items as $key => $item) {
// add open graph
if ($opengraph = $extractor->getOpenGraph()) {
foreach ($opengraph as $og_prop => $og_val) {
$newitem->addElement($og_prop, $og_val);
foreach ($opengraph as $_prop => $_val) {
$newitem->addElement($_prop, $_val);
}
}
// add Twitter Card
if ($twitterCard = $extractor->getTwitterCard()) {
foreach ($twitterCard as $_prop => $_val) {
$newitem->addElement($_prop, $_val);
}
}
unset($_prop, $_val);
// add language
if ($detect_language) {
@ -1184,6 +1229,7 @@ function get_self_url() {
if (isset($_GET['accept'])) $self .= '&accept='.urlencode($_GET['accept']);
if (isset($_GET['max'])) $self .= '&max='.(int)$_GET['max'];
if (isset($_GET['links'])) $self .= '&links='.urlencode($_GET['links']);
if (isset($_GET['images'])) $self .= '&images='.urlencode($_GET['images']);
if (isset($_GET['exc'])) $self .= '&exc='.urlencode($_GET['exc']);
if (isset($_GET['format'])) $self .= '&format='.urlencode($_GET['format']);
if (isset($_GET['callback'])) $self .= '&callback='.urlencode($_GET['callback']);
@ -1409,6 +1455,32 @@ function make_absolute_str($base, $url) {
return false;
}
}
// Serialise a DOM node to a markup string.
// $dom   - the node to serialise
// $inner - false: include the node itself (outerHTML);
//          true: only its children (innerHTML)
// Uses the HTML5 serialiser when $options->html5_output is truthy,
// otherwise saveXML() / the node's innerHTML property.
function make_html($dom, $inner=false) {
	global $options;
	static $html5 = null;

	// Non-HTML5 path: serialise with the node's own document.
	if (!$options->html5_output) {
		return $inner ? $dom->innerHTML : $dom->ownerDocument->saveXML($dom);
	}

	// Create the HTML5 serialiser once and reuse it across calls.
	if ($html5 === null) {
		$html5 = new Masterminds\HTML5(array('disable_html_ns' => true));
	}

	if (!$inner) {
		return $html5->saveHTML($dom);
	}

	// innerHTML: concatenate the serialisation of each child node.
	$markup = '';
	if ($dom->hasChildNodes()) {
		foreach ($dom->childNodes as $node) {
			$markup .= $html5->saveHTML($node);
		}
	}
	return $markup;
}
// returns single page response, or false if not found
function get_single_page($item, $html, $url) {
global $http, $extractor;
@ -1457,8 +1529,10 @@ function get_single_page($item, $html, $url) {
}
}
}
// If we've got URL, resolve against $url
if (isset($single_page_url) && ($single_page_url = make_absolute_str($url, $single_page_url))) {
$base_url = get_base_url($readability->dom);
if (!$base_url) $base_url = $url;
// If we've got URL, resolve against $base_url
if (isset($single_page_url) && ($single_page_url = make_absolute_str($base_url, $single_page_url))) {
// check it's not what we have already!
if ($single_page_url != $url) {
// it's not, so let's try to fetch it...

View File

@ -1,7 +1,7 @@
Full-Text RSS site config files
================
[Full-Text RSS](http://fivefilters.org/content-only/), our article extraction tool, makes use of site-specific extraction rules to improve results. Each time a URL is processed, it checks to see if there are extraction rules for the site being processed. If there are no rules found, it tries to detect the content block automatically.
[Full-Text RSS](http://fivefilters.org/content-only/), our article extraction tool, makes use of site-specific extraction rules to improve results. Each time a URL is processed, it checks to see if there are extraction rules for the site being processed. If no rules are found, it tries to detect the content block automatically.
This repository contains the site-specific extraction rules we rely on in Full-Text RSS.

View File

@ -1,9 +1,6 @@
# Puppet file intended to install server componenets for self-hosted FiveFilters.org web services
# Puppet file intended to install server components for FiveFilters.org web services
# This file is intended for base images of:
# Ubuntu 15.10
# Please see here for more information on how to use this:
# http://help.fivefilters.org/customer/en/portal/articles/1143210-hosting
# Ubuntu 16.04
Exec { path => "/bin:/usr/bin:/usr/local/bin" }
@ -31,6 +28,10 @@ class init {
APT::Periodic::Unattended-Upgrade "1";',
require => Package["unattended-upgrades"]
}
#exec { "configure-unattended-upgrades":
# require => Package["unattended-upgrades"],
# command => "sudo dpkg-reconfigure unattended-upgrades",
#}
}
# make sure apt-update run before package
@ -56,6 +57,11 @@ class apache {
require => Package["apache2"],
notify => Exec["restart-apache"]
}
exec { "enable-prefork":
require => Package["apache2"],
command => "sudo a2dismod mpm_event && sudo a2enmod mpm_prefork",
}
file { "/etc/apache2/sites-available/fivefilters.conf":
ensure => present,
@ -104,24 +110,34 @@ class apache {
}
class php {
package { "php5": ensure => latest }
package { "libapache2-mod-php5": ensure => latest }
package { "php5-cli": ensure => latest }
package { "php5-tidy": ensure => latest }
package { "php5-curl": ensure => latest }
package { "libcurl4-gnutls-dev": ensure => latest }
package { "php7.0": ensure => latest }
#package { "php-apc": ensure => latest }
package { "libapache2-mod-php7.0": ensure => latest }
package { "php7.0-cli": ensure => latest }
package { "php7.0-tidy": ensure => latest }
package { "php7.0-curl": ensure => latest }
#package { "libcurl4-gnutls-dev": ensure => latest }
package { "libcurl4-openssl-dev": ensure => latest }
package { "libpcre3-dev": ensure => latest }
package { "make": ensure=>latest }
package { "php-pear": ensure => latest }
package { "php5-dev": ensure => latest }
package { "php5-intl": ensure => latest }
package { "php5-gd": ensure => latest }
package { "php5-imagick": ensure => latest }
package { "php5-json": ensure => latest }
package { "php7.0-dev": ensure => latest }
package { "php7.0-intl": ensure => latest }
package { "php7.0-gd": ensure => latest }
package { "php7.0-mbstring": ensure => latest }
package { "php-imagick": ensure => latest }
package { "php7.0-json": ensure => latest }
#package { "php-http": ensure => latest }
package { "php5-raphf": ensure => latest }
package { "php5-propro": ensure => latest }
file { "/etc/php5/mods-available/fivefilters-php.ini":
package { "php-raphf": ensure => latest }
package { "php-propro": ensure => latest }
package { "php7.0-zip": ensure => latest }
# for gumbo-php
package { "libgumbo1": ensure => latest }
package { "libgumbo-dev": ensure => latest }
package { "libxml2": ensure => latest }
package { "libxml2-dev": ensure => latest }
file { "/etc/php/7.0/mods-available/fivefilters-php.ini":
ensure => present,
content => "engine = On
expose_php = Off
@ -134,17 +150,17 @@ class php {
default_socket_timeout = 120
file_uploads = Off
date.timezone = 'UTC'",
require => Package["php5"],
require => Package["php7.0"],
before => Exec["enable-fivefilters-php"],
}
exec { "enable-fivefilters-php":
command => "sudo php5enmod fivefilters-php",
command => "sudo phpenmod fivefilters-php",
}
}
class php_pecl_http {
# Important: this file needs to be in place before we install the HTTP extension
file { "/etc/php5/mods-available/http.ini":
file { "/etc/php/7.0/mods-available/http.ini":
ensure => present,
#owner => root, group => root, mode => 444,
content => "; priority=25
@ -156,7 +172,7 @@ extension=http.so",
}
exec { "enable-http":
command => "sudo php5enmod http",
command => "sudo phpenmod http",
require => Class["php"],
}
@ -171,10 +187,9 @@ extension=http.so",
}
exec { "install-http-pecl":
command => "pecl install https://pecl.php.net/get/pecl_http-2.5.5.tgz",
#command => "sudo pecl install pecl_http",
# the above is now version 3.0 - requires PHP7
#command => "pecl install http://pecl.php.net/get/pecl_http-1.7.6.tgz",
# For some reason this command doesn't return a success code, even though
# it appears to succeed. So we use || /bin/true
command => "sudo pecl install channel://pecl.php.net/pecl_http-3.1.0.tgz || /bin/true",
#creates => "/tmp/needed/directory",
require => Exec["enable-http"]
}
@ -182,12 +197,12 @@ extension=http.so",
class php_pecl_apcu {
exec { "install-apcu-pecl":
command => "sudo pecl install channel://pecl.php.net/APCu-4.0.10",
command => "sudo pecl install channel://pecl.php.net/APCu-5.1.8",
#creates => "/tmp/needed/directory",
require => Class["php"]
}
file { "/etc/php5/mods-available/apcu.ini":
file { "/etc/php/7.0/mods-available/apcu.ini":
ensure => present,
#owner => root, group => root, mode => 444,
content => "extension=apcu.so",
@ -195,63 +210,66 @@ class php_pecl_apcu {
before => Exec["enable-apcu"]
}
exec { "enable-apcu":
command => "sudo php5enmod apcu",
command => "sudo phpenmod apcu",
notify => Exec["restart-apache"],
}
}
# Builds a PHP extension from a git checkout under /tmp and enables it.
# NOTE(review): two class variants are interleaved here - php_cld (clone,
# pin to an older commit, build the bundled libcld, then build the extension)
# and php_gumbo (clone, then build the extension directly). Paired lines such
# as the two `class` headers and the two `file {` openers look like the
# before/after sides of a change - confirm which set is current.
class php_cld {
# see https://github.com/lstrojny/php-cld
class php_gumbo {
# see https://github.com/layershifter/gumbo-php
# Tooling needed for the clone and the compile steps below.
package { "git": ensure => latest }
package { "build-essential": ensure => latest }
# Remove any existing checkout directory (recursively, forced) before the
# download exec runs - presumably so git clone gets a fresh target.
file { "/tmp/cld":
file { "/tmp/gumbo":
ensure => absent,
before => Exec["download-cld"],
before => Exec["download-gumbo"],
recurse => true,
force => true
}
# Clone the extension source; needs the git package and Class["php"] first.
# NOTE(review): the clone URLs use the unauthenticated git:// protocol, which
# GitHub has since switched off - an https:// URL is likely required; verify.
exec { "download-cld":
command => "git clone git://github.com/lstrojny/php-cld.git /tmp/cld",
require => [Package["git"], Class["php"]],
before => Exec["build-cld"]
exec { "download-gumbo":
command => "git clone git://github.com/layershifter/gumbo-php.git /tmp/gumbo",
require => [Package["git"], Class["php"]]
}
# Pin the cld checkout to a fixed commit after download and before the build.
exec { "checkout-cld-version":
# recent version does not work, so we switch to an older one
command => "git reset --hard fd5aa5721b01bfe547ff6674fa0daa9c3b791ca3",
cwd => "/tmp/cld",
require => Exec["download-cld"],
before => Exec["build-cld"]
}
# Build the library vendored inside the cld checkout; run through the shell
# provider from the vendor/libcld directory.
exec { "build-cld":
command => "./build.sh",
#new cld:command => "sh compile_libs.sh",
cwd => "/tmp/cld/vendor/libcld",
require => Package["build-essential"],
provider => "shell"
}
# Standard phpize / configure / make / make install sequence, chained with &&
# so a failing step stops the rest; provider => "shell" is what permits &&.
exec { "install-cld-extension":
command => "phpize && ./configure --with-libcld-dir=/tmp/cld/vendor/libcld && make && sudo make install",
cwd => "/tmp/cld",
exec { "install-gumbo-extension":
command => "phpize && ./configure && make && sudo make install",
cwd => "/tmp/gumbo",
provider => "shell",
require => Exec["build-cld"]
require => Exec["download-gumbo"]
}
# ini file that loads the built extension; written after the install exec
# and before the enable exec.
file { "/etc/php5/mods-available/cld.ini":
file { "/etc/php/7.0/mods-available/gumbo.ini":
ensure => present,
#owner => root, group => root, mode => 444,
content => "extension=cld.so",
require => Exec["install-cld-extension"],
before => Exec["enable-cld"],
content => "extension=gumbo.so",
require => Exec["install-gumbo-extension"],
before => Exec["enable-gumbo"]
}
# Enable the module and signal the Apache restart exec defined elsewhere.
exec { "enable-cld":
command => "sudo php5enmod cld",
exec { "enable-gumbo":
command => "sudo phpenmod gumbo",
notify => Exec["restart-apache"],
require => Exec["install-gumbo-extension"]
}
}
# Installs the apcu_bc PECL package (pinned to 1.0.3) and enables the apc.so
# extension for PHP 7.0.
#
# Resource order is: pecl install -> write the mods-available ini -> phpenmod.
# Enabling the module notifies Exec['restart-apache'], which is declared
# outside this class. The install waits for Class['php_pecl_apcu'].
#
# The ini file is named z_apc_bc - presumably so that it is loaded after the
# apcu module it builds on; confirm against the PHP module load order.
# No creates/unless guard is set, so the install command runs on every apply.
class php_pecl_apc_bc {
  exec { 'install-apc-bc-pecl':
    require => Class['php_pecl_apcu'],
    command => 'sudo pecl install channel://pecl.php.net/apcu_bc-1.0.3',
  }

  file { '/etc/php/7.0/mods-available/z_apc_bc.ini':
    ensure  => present,
    content => 'extension=apc.so',
  }

  exec { 'enable-apc-bc':
    command => 'sudo phpenmod z_apc_bc',
    notify  => Exec['restart-apache'],
  }

  # Same ordering the require/before metaparameters would express.
  Exec['install-apc-bc-pecl']
  -> File['/etc/php/7.0/mods-available/z_apc_bc.ini']
  -> Exec['enable-apc-bc']
}
@ -261,12 +279,17 @@ class final {
command => "echo 'vm.swappiness = 10' >> /etc/sysctl.conf && sudo sysctl -p",
provider => "shell"
}
# Enables Apache's php7.0 module and then restarts apache2 in one command.
# provider => "shell" is what lets the two steps be chained with &&.
# No creates/unless guard is set, so this runs (and restarts Apache) on every
# apply.
exec { "enable-php":
command => "sudo a2enmod php7.0 && sudo service apache2 restart",
provider => "shell"
}
}
# Classes declared for this node, listed top to bottom.
# NOTE(review): both php_cld and php_gumbo are included here; they look like
# the before/after sides of the same change - confirm which one should stay.
include init
include apache
include php
include php_pecl_apcu
include php_cld
include php_pecl_apc_bc
include php_pecl_http
include php_gumbo
include final