You are running an older version of APCu ('.$apcversion.'),
+ echo '
';
diff --git a/changelog.txt b/changelog.txt
index 10a1a0e..85bf771 100644
--- a/changelog.txt
+++ b/changelog.txt
@@ -2,22 +2,32 @@ FiveFilters.org: Full-Text RSS
http://fivefilters.org/content-only/
CHANGELOG
------------------------------------
-3.5 (2015-06-13)
- - Open Graph properties og:title, og:type, og:url, og:image, and og:description now returned if found in the page being processed
- - Bug fix: certain XPath expressions weren't being evaluated correctly when HTML5 parsing was enabled
- - Cookie handling now only on redirects - fixes issue with certain sites (thanks to Dave Vasilevsky)
- - Compatibility test will no longer show HHVM as incompatible - Full-Text RSS worked with HHVM 3.7.1 in our tests (but without Tidy support and no automatic site config updates)
- - Humble HTTP Agent updated to support version 2 of PHP's HTTP extension
- - HTML5-PHP library updated
- - Site config files can now include HTTP headers (user-agent, cookie, referer), e.g. http_header(user-agent): PHP/5.6
- - Config option removed: $options->user_agents - use site config files.
- - Site config files which use single_page_link can now follow it with if_page_contains: XPath to make it conditional.
- - Minimum supported PHP version is now 5.3. If you must use PHP 5.2, please download Full-Text RSS 3.4
- - Site config files updated for better extraction
- - Other minor fixes/improvements
+3.6 (2016-02-21)
+- Insert og:image (if we find one) at the top of the article when no images have been extracted
+- Additional lazy image load handling - helps preserve more images designed for JS-enabled browsers
+- Original GUID values from feed items now preserved
+- New config option favour_effective_url determines if item's effective URL (after redirects) should replace original item URL in feed output
+- Adding &use_effective_url to querystring will replace original feed item URL with effective URL (unless disabled with config option above)
+- APCu stats view in admin panel fixed to work with recent versions of APCu
+- HTML5-PHP library updated
+- Tested for PHP 7 compatibility
+- VPS Puppet script (ubuntu-15.10.pp) updated - fixes issue with IDN encodings, among other things. (This is intended for setting up a new Ubuntu 15.10 instance for running Full-Text RSS.)
+- Site config files updated for better extraction
+- Other minor fixes/improvements
-3.4.1 (unreleased)
- - Backporting Dave Vasilevsky cookie patch. Fixes issues with certain sites. See https://gist.github.com/fivefilters/0a758b6d64ce4fb5728c
+3.5 (2015-06-13)
+- Open Graph properties og:title, og:type, og:url, og:image, and og:description now returned if found in the page being processed
+- Bug fix: certain XPath expressions weren't being evaluated correctly when HTML5 parsing was enabled
+- Cookie handling now only on redirects - fixes issue with certain sites (thanks to Dave Vasilevsky)
+- Compatibility test will no longer show HHVM as incompatible - Full-Text RSS worked with HHVM 3.7.1 in our tests (but without Tidy support and no automatic site config updates)
+- Humble HTTP Agent updated to support version 2 of PHP's HTTP extension
+- HTML5-PHP library updated
+- Site config files can now include HTTP headers (user-agent, cookie, referer), e.g. http_header(user-agent): PHP/5.6
+- Config option removed: $options->user_agents - use site config files.
+- Site config files which use single_page_link can now follow it with if_page_contains: XPath to make it conditional.
+- Minimum supported PHP version is now 5.3. If you must use PHP 5.2, please download Full-Text RSS 3.4
+- Site config files updated for better extraction
+- Other minor fixes/improvements
3.4 (2014-09-08)
- New request parameter: siteconfig lets you submit extraction rules directly in request
diff --git a/config.php b/config.php
index 5b58c30..a5d60b3 100644
--- a/config.php
+++ b/config.php
@@ -310,6 +310,22 @@ $options->max_entries_with_key = 10;
// false - disabled
$options->xss_filter = 'user';
+// Use effective URL in place of item URL
+// ----------------------
+// When we extract content for feed items, we often end up at a different URL than the
+// one in the original feed. This is often a result of URL shorteners being used or
+// tracking services being used by the feed publisher. We include the final
+// (effective) URL we reached to get the content inside the dc:identifier field.
+// If you enable this, we'll also use this URL in place of the original item URL
+// in the new feed we produce.
+// By default, we keep the original item URL but the user can request the effective
+// URL by passing '&use_effective_url' in the querystring.
+// Possible values:
+// * Use effective URL: true
+// * Keep item URL in original feed: false
+// * Keep item URL unless user requests effective URL: 'user' (default)
+$options->favour_effective_url = 'user';
+
// Favour item titles in feed
// ----------------------
// By default, when processing feeds, we assume item titles in the feed
@@ -484,7 +500,7 @@ $options->cache_cleanup = 100;
/// DO NOT CHANGE ANYTHING BELOW THIS ///////////
/////////////////////////////////////////////////
-if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '3.5');
+if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '3.6');
if (basename(__FILE__) == 'config.php') {
if (file_exists(dirname(__FILE__).'/custom_config.php')) {
diff --git a/ftr_compatibility_test.php b/ftr_compatibility_test.php
index e5443f0..43c361f 100644
--- a/ftr_compatibility_test.php
+++ b/ftr_compatibility_test.php
@@ -16,7 +16,7 @@ SimplePie.org. We have kept most of their checks intact as we use SimplePie in o
http://github.com/simplepie/simplepie/tree/master/compatibility_test/
*/
-$app_name = 'Full-Text RSS 3.5';
+$app_name = 'Full-Text RSS 3.6';
// Full-Text RSS is not yet compatible with HHVM, that's why we check for it with HHVM_VERSION.
//$php_ok = (function_exists('version_compare') && version_compare(phpversion(), '5.2.0', '>=') && !defined('HHVM_VERSION'));
diff --git a/libraries/content-extractor/ContentExtractor.php b/libraries/content-extractor/ContentExtractor.php
index 078c993..4ce954a 100644
--- a/libraries/content-extractor/ContentExtractor.php
+++ b/libraries/content-extractor/ContentExtractor.php
@@ -5,10 +5,10 @@
* Uses patterns specified in site config files and auto detection (hNews/PHP Readability)
* to extract content from HTML files.
*
- * @version 1.1
- * @date 2014-03-28
+ * @version 1.2
+ * @date 2016-02-21
* @author Keyvan Minoukadeh
- * @copyright 2014 Keyvan Minoukadeh
+ * @copyright 2016 Keyvan Minoukadeh
* @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
*/
@@ -765,11 +765,17 @@ class ContentExtractor
}
}
// prevent self-closing iframes
- $elems = $this->body->getElementsByTagName('iframe');
- for ($i = $elems->length-1; $i >= 0; $i--) {
- $e = $elems->item($i);
- if (!$e->hasChildNodes()) {
- $e->appendChild($this->body->ownerDocument->createTextNode('[embedded content]'));
+ if ($this->body->tagName === 'iframe') {
+ if (!$this->body->hasChildNodes()) {
+ $this->body->appendChild($this->body->ownerDocument->createTextNode('[embedded content]'));
+ }
+ } else {
+ $elems = $this->body->getElementsByTagName('iframe');
+ for ($i = $elems->length-1; $i >= 0; $i--) {
+ $e = $elems->item($i);
+ if (!$e->hasChildNodes()) {
+ $e->appendChild($this->body->ownerDocument->createTextNode('[embedded content]'));
+ }
}
}
// remove image lazy loading - WordPress plugin http://wordpress.org/extend/plugins/lazy-load/
@@ -791,6 +797,24 @@ class ContentExtractor
$e->removeAttribute('data-lazy-src');
}
}
+ // now let's deal with another lazy load technique. Example:
+ //
+ $elems = @$xpath->query("//img[@data-src and contains(@class, 'lazyload') and contains(@src, 'data:image')]", $this->body);
+ for ($i = $elems->length-1; $i >= 0; $i--) {
+ $e = $elems->item($i);
+ $e->setAttribute('src', $e->getAttribute('data-src'));
+ $e->removeAttribute('data-src');
+ }
+ // If there's an og:image, but we have no images in the article, let's place it at the beginning of the article.
+ if ($this->body->hasChildNodes() && isset($this->opengraph['og:image']) && substr($this->opengraph['og:image'], 0, 4) === 'http') {
+ $elems = @$xpath->query("//img", $this->body);
+ if ($elems->length === 0) {
+ $_new_elem = $this->body->ownerDocument->createDocumentFragment();
+ @$_new_elem->appendXML('
');
+ $this->body->insertBefore($_new_elem, $this->body->firstChild);
+ }
+ }
$this->success = true;
}
diff --git a/libraries/htmLawed/htmLawed.php b/libraries/htmLawed/htmLawed.php
index 5d6285e..35215c1 100644
--- a/libraries/htmLawed/htmLawed.php
+++ b/libraries/htmLawed/htmLawed.php
@@ -1,8 +1,8 @@
array('td'=>1, 'th'=>1), 'accept-charset'=>array('form'=>1), 'accept'=>array('form'=>1, 'input'=>1), 'accesskey'=>array('a'=>1, 'area'=>1, 'button'=>1, 'input'=>1, 'label'=>1, 'legend'=>1, 'textarea'=>1), 'action'=>array('form'=>1), 'align'=>array('caption'=>1, 'embed'=>1, 'applet'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'object'=>1, 'legend'=>1, 'table'=>1, 'hr'=>1, 'div'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'p'=>1, 'col'=>1, 'colgroup'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'alt'=>array('applet'=>1, 'area'=>1, 'img'=>1, 'input'=>1), 'archive'=>array('applet'=>1, 'object'=>1), 'axis'=>array('td'=>1, 'th'=>1), 'bgcolor'=>array('embed'=>1, 'table'=>1, 'tr'=>1, 'td'=>1, 'th'=>1), 'border'=>array('table'=>1, 'img'=>1, 'object'=>1), 'bordercolor'=>array('table'=>1, 'td'=>1, 'tr'=>1), 'cellpadding'=>array('table'=>1), 'cellspacing'=>array('table'=>1), 'char'=>array('col'=>1, 'colgroup'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'charoff'=>array('col'=>1, 'colgroup'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'charset'=>array('a'=>1, 'script'=>1), 'checked'=>array('input'=>1), 'cite'=>array('blockquote'=>1, 'q'=>1, 'del'=>1, 'ins'=>1), 'classid'=>array('object'=>1), 'clear'=>array('br'=>1), 'code'=>array('applet'=>1), 'codebase'=>array('object'=>1, 'applet'=>1), 'codetype'=>array('object'=>1), 'color'=>array('font'=>1), 'cols'=>array('textarea'=>1), 'colspan'=>array('td'=>1, 'th'=>1), 'compact'=>array('dir'=>1, 'dl'=>1, 'menu'=>1, 'ol'=>1, 'ul'=>1), 'coords'=>array('area'=>1, 'a'=>1), 'data'=>array('object'=>1), 'datetime'=>array('del'=>1, 'ins'=>1), 'declare'=>array('object'=>1), 'defer'=>array('script'=>1), 'dir'=>array('bdo'=>1), 'disabled'=>array('button'=>1, 'input'=>1, 'optgroup'=>1, 'option'=>1, 'select'=>1, 'textarea'=>1), 'enctype'=>array('form'=>1), 'face'=>array('font'=>1), 'for'=>array('label'=>1), 'frame'=>array('table'=>1), 'frameborder'=>array('iframe'=>1), 
'headers'=>array('td'=>1, 'th'=>1), 'height'=>array('embed'=>1, 'iframe'=>1, 'td'=>1, 'th'=>1, 'img'=>1, 'object'=>1, 'applet'=>1), 'href'=>array('a'=>1, 'area'=>1), 'hreflang'=>array('a'=>1), 'hspace'=>array('applet'=>1, 'img'=>1, 'object'=>1), 'ismap'=>array('img'=>1, 'input'=>1), 'label'=>array('option'=>1, 'optgroup'=>1), 'language'=>array('script'=>1), 'longdesc'=>array('img'=>1, 'iframe'=>1), 'marginheight'=>array('iframe'=>1), 'marginwidth'=>array('iframe'=>1), 'maxlength'=>array('input'=>1), 'method'=>array('form'=>1), 'model'=>array('embed'=>1), 'multiple'=>array('select'=>1), 'name'=>array('button'=>1, 'embed'=>1, 'textarea'=>1, 'applet'=>1, 'select'=>1, 'form'=>1, 'iframe'=>1, 'img'=>1, 'a'=>1, 'input'=>1, 'object'=>1, 'map'=>1, 'param'=>1), 'nohref'=>array('area'=>1), 'noshade'=>array('hr'=>1), 'nowrap'=>array('td'=>1, 'th'=>1), 'object'=>array('applet'=>1), 'onblur'=>array('a'=>1, 'area'=>1, 'button'=>1, 'input'=>1, 'label'=>1, 'select'=>1, 'textarea'=>1), 'onchange'=>array('input'=>1, 'select'=>1, 'textarea'=>1), 'onfocus'=>array('a'=>1, 'area'=>1, 'button'=>1, 'input'=>1, 'label'=>1, 'select'=>1, 'textarea'=>1), 'onreset'=>array('form'=>1), 'onselect'=>array('input'=>1, 'textarea'=>1), 'onsubmit'=>array('form'=>1), 'pluginspage'=>array('embed'=>1), 'pluginurl'=>array('embed'=>1), 'prompt'=>array('isindex'=>1), 'readonly'=>array('textarea'=>1, 'input'=>1), 'rel'=>array('a'=>1), 'rev'=>array('a'=>1), 'rows'=>array('textarea'=>1), 'rowspan'=>array('td'=>1, 'th'=>1), 'rules'=>array('table'=>1), 'scope'=>array('td'=>1, 'th'=>1), 'scrolling'=>array('iframe'=>1), 'selected'=>array('option'=>1), 'shape'=>array('area'=>1, 'a'=>1), 'size'=>array('hr'=>1, 'font'=>1, 'input'=>1, 'select'=>1), 'span'=>array('col'=>1, 'colgroup'=>1), 'src'=>array('embed'=>1, 'script'=>1, 'input'=>1, 'iframe'=>1, 'img'=>1), 'standby'=>array('object'=>1), 'start'=>array('ol'=>1), 'summary'=>array('table'=>1), 'tabindex'=>array('a'=>1, 'area'=>1, 'button'=>1, 'input'=>1, 'object'=>1, 
'select'=>1, 'textarea'=>1), 'target'=>array('a'=>1, 'area'=>1, 'form'=>1), 'type'=>array('a'=>1, 'embed'=>1, 'object'=>1, 'param'=>1, 'script'=>1, 'input'=>1, 'li'=>1, 'ol'=>1, 'ul'=>1, 'button'=>1), 'usemap'=>array('img'=>1, 'input'=>1, 'object'=>1), 'valign'=>array('col'=>1, 'colgroup'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'value'=>array('input'=>1, 'option'=>1, 'param'=>1, 'button'=>1, 'li'=>1), 'valuetype'=>array('param'=>1), 'vspace'=>array('applet'=>1, 'img'=>1, 'object'=>1), 'width'=>array('embed'=>1, 'hr'=>1, 'iframe'=>1, 'img'=>1, 'object'=>1, 'table'=>1, 'td'=>1, 'th'=>1, 'applet'=>1, 'col'=>1, 'colgroup'=>1, 'pre'=>1), 'wmode'=>array('embed'=>1), 'xml:space'=>array('pre'=>1, 'script'=>1, 'style'=>1)); // Ele-specific
+static $aN = array('abbr'=>array('td'=>1, 'th'=>1), 'accept-charset'=>array('form'=>1), 'accept'=>array('form'=>1, 'input'=>1), 'accesskey'=>array('a'=>1, 'area'=>1, 'button'=>1, 'input'=>1, 'label'=>1, 'legend'=>1, 'textarea'=>1), 'action'=>array('form'=>1), 'align'=>array('caption'=>1, 'embed'=>1, 'applet'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'object'=>1, 'legend'=>1, 'table'=>1, 'hr'=>1, 'div'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'p'=>1, 'col'=>1, 'colgroup'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'allowfullscreen'=>array('iframe'=>1), 'alt'=>array('applet'=>1, 'area'=>1, 'img'=>1, 'input'=>1), 'archive'=>array('applet'=>1, 'object'=>1), 'axis'=>array('td'=>1, 'th'=>1), 'bgcolor'=>array('embed'=>1, 'table'=>1, 'tr'=>1, 'td'=>1, 'th'=>1), 'border'=>array('table'=>1, 'img'=>1, 'object'=>1), 'bordercolor'=>array('table'=>1, 'td'=>1, 'tr'=>1), 'cellpadding'=>array('table'=>1), 'cellspacing'=>array('table'=>1), 'char'=>array('col'=>1, 'colgroup'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'charoff'=>array('col'=>1, 'colgroup'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'charset'=>array('a'=>1, 'script'=>1), 'checked'=>array('input'=>1), 'cite'=>array('blockquote'=>1, 'q'=>1, 'del'=>1, 'ins'=>1), 'classid'=>array('object'=>1), 'clear'=>array('br'=>1), 'code'=>array('applet'=>1), 'codebase'=>array('object'=>1, 'applet'=>1), 'codetype'=>array('object'=>1), 'color'=>array('font'=>1), 'cols'=>array('textarea'=>1), 'colspan'=>array('td'=>1, 'th'=>1), 'compact'=>array('dir'=>1, 'dl'=>1, 'menu'=>1, 'ol'=>1, 'ul'=>1), 'coords'=>array('area'=>1, 'a'=>1), 'data'=>array('object'=>1), 'datetime'=>array('del'=>1, 'ins'=>1), 'declare'=>array('object'=>1), 'defer'=>array('script'=>1), 'dir'=>array('bdo'=>1), 'disabled'=>array('button'=>1, 'input'=>1, 'optgroup'=>1, 'option'=>1, 'select'=>1, 'textarea'=>1), 'enctype'=>array('form'=>1), 'face'=>array('font'=>1), 
'for'=>array('label'=>1), 'frame'=>array('table'=>1), 'frameborder'=>array('iframe'=>1), 'headers'=>array('td'=>1, 'th'=>1), 'height'=>array('embed'=>1, 'iframe'=>1, 'td'=>1, 'th'=>1, 'img'=>1, 'object'=>1, 'applet'=>1), 'href'=>array('a'=>1, 'area'=>1), 'hreflang'=>array('a'=>1), 'hspace'=>array('applet'=>1, 'img'=>1, 'object'=>1), 'ismap'=>array('img'=>1, 'input'=>1), 'label'=>array('option'=>1, 'optgroup'=>1), 'language'=>array('script'=>1), 'longdesc'=>array('img'=>1, 'iframe'=>1), 'marginheight'=>array('iframe'=>1), 'marginwidth'=>array('iframe'=>1), 'maxlength'=>array('input'=>1), 'method'=>array('form'=>1), 'model'=>array('embed'=>1), 'multiple'=>array('select'=>1), 'name'=>array('button'=>1, 'embed'=>1, 'textarea'=>1, 'applet'=>1, 'select'=>1, 'form'=>1, 'iframe'=>1, 'img'=>1, 'a'=>1, 'input'=>1, 'object'=>1, 'map'=>1, 'param'=>1), 'nohref'=>array('area'=>1), 'noshade'=>array('hr'=>1), 'nowrap'=>array('td'=>1, 'th'=>1), 'object'=>array('applet'=>1), 'onblur'=>array('a'=>1, 'area'=>1, 'button'=>1, 'input'=>1, 'label'=>1, 'select'=>1, 'textarea'=>1), 'onchange'=>array('input'=>1, 'select'=>1, 'textarea'=>1), 'onfocus'=>array('a'=>1, 'area'=>1, 'button'=>1, 'input'=>1, 'label'=>1, 'select'=>1, 'textarea'=>1), 'onreset'=>array('form'=>1), 'onselect'=>array('input'=>1, 'textarea'=>1), 'onsubmit'=>array('form'=>1), 'pluginspage'=>array('embed'=>1), 'pluginurl'=>array('embed'=>1), 'prompt'=>array('isindex'=>1), 'readonly'=>array('textarea'=>1, 'input'=>1), 'rel'=>array('a'=>1), 'rev'=>array('a'=>1), 'rows'=>array('textarea'=>1), 'rowspan'=>array('td'=>1, 'th'=>1), 'rules'=>array('table'=>1), 'scope'=>array('td'=>1, 'th'=>1), 'scrolling'=>array('iframe'=>1), 'selected'=>array('option'=>1), 'shape'=>array('area'=>1, 'a'=>1), 'size'=>array('hr'=>1, 'font'=>1, 'input'=>1, 'select'=>1), 'span'=>array('col'=>1, 'colgroup'=>1), 'src'=>array('embed'=>1, 'script'=>1, 'input'=>1, 'iframe'=>1, 'img'=>1), 'standby'=>array('object'=>1), 'start'=>array('ol'=>1), 
'summary'=>array('table'=>1), 'tabindex'=>array('a'=>1, 'area'=>1, 'button'=>1, 'input'=>1, 'object'=>1, 'select'=>1, 'textarea'=>1), 'target'=>array('a'=>1, 'area'=>1, 'form'=>1), 'type'=>array('a'=>1, 'embed'=>1, 'object'=>1, 'param'=>1, 'script'=>1, 'input'=>1, 'li'=>1, 'ol'=>1, 'ul'=>1, 'button'=>1), 'usemap'=>array('img'=>1, 'input'=>1, 'object'=>1), 'valign'=>array('col'=>1, 'colgroup'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'value'=>array('input'=>1, 'option'=>1, 'param'=>1, 'button'=>1, 'li'=>1), 'valuetype'=>array('param'=>1), 'vspace'=>array('applet'=>1, 'img'=>1, 'object'=>1), 'width'=>array('embed'=>1, 'hr'=>1, 'iframe'=>1, 'img'=>1, 'object'=>1, 'table'=>1, 'td'=>1, 'th'=>1, 'applet'=>1, 'col'=>1, 'colgroup'=>1, 'pre'=>1), 'wmode'=>array('embed'=>1), 'xml:space'=>array('pre'=>1, 'script'=>1, 'style'=>1)); // Ele-specific
static $aNE = array('checked'=>1, 'compact'=>1, 'declare'=>1, 'defer'=>1, 'disabled'=>1, 'ismap'=>1, 'multiple'=>1, 'nohref'=>1, 'noresize'=>1, 'noshade'=>1, 'nowrap'=>1, 'readonly'=>1, 'selected'=>1); // Empty
static $aNP = array('action'=>1, 'cite'=>1, 'classid'=>1, 'codebase'=>1, 'data'=>1, 'href'=>1, 'longdesc'=>1, 'model'=>1, 'pluginspage'=>1, 'pluginurl'=>1, 'usemap'=>1); // Need scheme check; excludes style, on* & src
static $aNU = array('class'=>array('param'=>1, 'script'=>1), 'dir'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'iframe'=>1, 'param'=>1, 'script'=>1), 'id'=>array('script'=>1), 'lang'=>array('applet'=>1, 'br'=>1, 'iframe'=>1, 'param'=>1, 'script'=>1), 'xml:lang'=>array('applet'=>1, 'br'=>1, 'iframe'=>1, 'param'=>1, 'script'=>1), 'onclick'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'ondblclick'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onkeydown'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onkeypress'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onkeyup'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onmousedown'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onmousemove'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onmouseout'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onmouseover'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onmouseup'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'style'=>array('param'=>1, 'script'=>1), 'title'=>array('param'=>1, 'script'=>1)); // Univ & exceptions
@@ -632,10 +632,10 @@ if($e == 'font'){
$a2 .= ' font-family: '. str_replace('"', '\'', trim($m[2])). ';';
}
if(preg_match('`color\s*=\s*(\'|")?(.+?)(\\1|\s|$)`i', $a, $m)){
- $a2 .= ' color: '. trim($m[2]). ';';
+ $a2 .= ' color: '. str_replace('"', '\'', trim($m[2])). ';';
}
if(preg_match('`size\s*=\s*(\'|")?(.+?)(\\1|\s|$)`i', $a, $m) && isset($fs[($m = trim($m[2]))])){
- $a2 .= ' font-size: '. $fs[$m]. ';';
+ $a2 .= ' font-size: '. str_replace('"', '\'', $fs[$m]). ';';
}
$e = 'span'; return ltrim($a2);
}
@@ -701,7 +701,7 @@ return str_replace(array("\x01", "\x02", "\x03", "\x04", "\x05", "\x07"), array(
public static function hl_version(){
// rel
-return '1.1.19';
+return '1.1.20';
// eof
}
@@ -725,4 +725,4 @@ return $t;
// eof
}
// end class
-}
\ No newline at end of file
+}
diff --git a/libraries/html5php/HTML5/Serializer/OutputRules.php b/libraries/html5php/HTML5/Serializer/OutputRules.php
index c009698..4ad74f6 100644
--- a/libraries/html5php/HTML5/Serializer/OutputRules.php
+++ b/libraries/html5php/HTML5/Serializer/OutputRules.php
@@ -73,17 +73,86 @@ class OutputRules implements \Masterminds\HTML5\Serializer\RulesInterface
'nodeName'=>'img', 'nodeName'=>array('img', 'a'),
'attrName'=>'alt', 'attrName'=>array('title', 'alt'),
-
-
- 'prefixes'=>['xh'=>'http://www.w3.org/1999/xhtml'),
- 'xpath' => "@checked[../../xh:input[@type='radio' or @type='checkbox']]",
),
*/
array(
- 'nodeNamespace'=>'http://www.w3.org/1999/xhtml',
- 'attrName'=>array('alt', 'title'),
+ 'nodeNamespace' => 'http://www.w3.org/1999/xhtml',
+ 'attrName' => array('href',
+ 'hreflang',
+ 'http-equiv',
+ 'icon',
+ 'id',
+ 'keytype',
+ 'kind',
+ 'label',
+ 'lang',
+ 'language',
+ 'list',
+ 'maxlength',
+ 'media',
+ 'method',
+ 'name',
+ 'placeholder',
+ 'rel',
+ 'rows',
+ 'rowspan',
+ 'sandbox',
+ 'spellcheck',
+ 'scope',
+ 'seamless',
+ 'shape',
+ 'size',
+ 'sizes',
+ 'span',
+ 'src',
+ 'srcdoc',
+ 'srclang',
+ 'srcset',
+ 'start',
+ 'step',
+ 'style',
+ 'summary',
+ 'tabindex',
+ 'target',
+ 'title',
+ 'type',
+ 'value',
+ 'width',
+ 'border',
+ 'charset',
+ 'cite',
+ 'class',
+ 'code',
+ 'codebase',
+ 'color',
+ 'cols',
+ 'colspan',
+ 'content',
+ 'coords',
+ 'data',
+ 'datetime',
+ 'default',
+ 'dir',
+ 'dirname',
+ 'enctype',
+ 'for',
+ 'form',
+ 'formaction',
+ 'headers',
+ 'height',
+ 'accept',
+ 'accept-charset',
+ 'accesskey',
+ 'action',
+ 'align',
+ 'alt',
+ 'bgcolor',
+ ),
+ ),
+ array(
+ 'nodeNamespace' => 'http://www.w3.org/1999/xhtml',
+ 'xpath' => 'starts-with(local-name(), \'data-\')',
),
-
);
const DOCTYPE = '';
@@ -328,7 +397,7 @@ class OutputRules implements \Masterminds\HTML5\Serializer\RulesInterface
$xp->registerNamespace($nsPrefix, $ns);
}
}
- if(!$xp->query($rule['xpath'], $attr->ownerElement)->length){
+ if(!$xp->evaluate($rule['xpath'], $attr)){
continue;
}
}
diff --git a/libraries/html5php/RELEASE.md b/libraries/html5php/RELEASE.md
index d4b64a5..e3d70d3 100644
--- a/libraries/html5php/RELEASE.md
+++ b/libraries/html5php/RELEASE.md
@@ -1,4 +1,7 @@
# Release Notes
+2.1.2 (2015-06-07)
+- #82: Support for PHP7
+- #84: Improved boolean attribute handling
2.1.1 (2015-03-23)
- #78: Fixes bug where unmatched entity like string drops everything after &.
diff --git a/libraries/readability/Readability.php b/libraries/readability/Readability.php
index 9e41237..8c38e3c 100644
--- a/libraries/readability/Readability.php
+++ b/libraries/readability/Readability.php
@@ -95,7 +95,7 @@ class Readability
// 'trimRe' => '/^\s+|\s+$/g', // PHP has trim()
'normalize' => '/\s{2,}/',
'killBreaks' => '/(
(\s| ?)*){1,}/',
- 'video' => '!//(player\.|www\.)?(youtube\.com|vimeo\.com|viddler\.com|soundcloud\.com|twitch\.tv)!i',
+ 'video' => '!//(player\.|www\.)?(youtube\.com|vimeo\.com|viddler\.com|soundcloud\.com|twitch\.tv|openload\.co)!i',
'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i'
);
diff --git a/makefulltextfeed.php b/makefulltextfeed.php
index d17dabd..b19ec24 100644
--- a/makefulltextfeed.php
+++ b/makefulltextfeed.php
@@ -3,8 +3,8 @@
// Author: Keyvan Minoukadeh
// Copyright (c) 2015 Keyvan Minoukadeh
// License: AGPLv3
-// Version: 3.5
-// Date: 2015-05-29
+// Version: 3.6
+// Date: 2016-02-17
// More info: http://fivefilters.org/content-only/
// Help: http://help.fivefilters.org
@@ -129,6 +129,11 @@ if (!$options->enabled) {
die('The full-text RSS service is currently disabled');
}
+//////////////////////////////////
+// Enable Cross-Origin Resource Sharing (CORS)
+//////////////////////////////////
+if ($options->cors) header('Access-Control-Allow-Origin: *');
+
////////////////////////////////
// Debug mode?
// See the config file for debug options.
@@ -307,6 +312,16 @@ if ($options->favour_feed_titles == 'user') {
$favour_feed_titles = $options->favour_feed_titles;
}
+///////////////////////////////////////////////
+// Favour effective URL
+///////////////////////////////////////////////
+$favour_effective_url = false;
+if ($options->favour_effective_url === 'user') { // strict: with ==, boolean true also equals 'user'
+ $favour_effective_url = isset($_REQUEST['use_effective_url']); // presence of the flag alone opts in
+} else {
+ $favour_effective_url = $options->favour_effective_url; // fixed true/false taken from config
+}
+
///////////////////////////////////////////////
// Include full content in output?
///////////////////////////////////////////////
@@ -437,11 +452,6 @@ if (!empty($options->proxy_servers)) {
}
}
-//////////////////////////////////
-// Enable Cross-Origin Resource Sharing (CORS)
-//////////////////////////////////
-if ($options->cors) header('Access-Control-Allow-Origin: *');
-
//////////////////////////////////
// Has the HTML been given in the request?
//////////////////////////////////
@@ -459,7 +469,7 @@ if (isset($_REQUEST['inputhtml']) && _FF_FTR_MODE == 'simple') {
if ($options->caching) {
debug('Caching is enabled...');
$cache_id = md5($max.$url.(int)$valid_key.$accept.$links.(int)$favour_feed_titles.(int)$options->content.(int)$options->summary.
- (int)$xss_filter.(int)$exclude_on_fail.$format.$detect_language.$parser.$user_submitted_config._FF_FTR_MODE);
+ (int)$xss_filter.(int)$favour_effective_url.(int)$exclude_on_fail.$format.$detect_language.$parser.$user_submitted_config._FF_FTR_MODE);
$check_cache = true;
if ($options->apc && $options->smart_cache) {
apc_add("cache.$cache_id", 0, $options->cache_time*60);
@@ -623,6 +633,7 @@ if ($accept === 'html' || !$result) {
public function get_enclosure($key=0, $prefer=null) { return null; }
public function get_enclosures() { return null; }
public function get_categories() { return null; }
+ public function get_item_tags($namespace='', $tag='') { return null; }
}
$feed = new DummySingleItemFeed($url);
}
@@ -916,7 +927,24 @@ foreach ($items as $key => $item) {
}
}
- $newitem->addElement('guid', $item->get_permalink(), array('isPermaLink'=>'true'));
+ // guid
+ $_guid = $item->get_permalink();
+ $_ispermalink = 'true';
+ $_g = $item->get_item_tags('', 'guid');
+ if (is_array($_g) && count($_g) > 0) {
+ $_ispermalink = null;
+ $_guid = $_g[0]['data'];
+ if (isset($_g[0]['attribs']) && isset($_g[0]['attribs']['']) && isset($_g[0]['attribs']['']['isPermaLink'])) {
+ $_ispermalink = $_g[0]['attribs']['']['isPermaLink'];
+ if ($_ispermalink !== 'true') $_ispermalink = 'false';
+ }
+ }
+ if (isset($_ispermalink)) {
+ $newitem->addElement('guid', $_guid, array('isPermaLink'=>$_ispermalink));
+ } else {
+ $newitem->addElement('guid', $_guid);
+ }
+ unset($_g, $_guid, $_ispermalink);
// filter xss?
if ($xss_filter) {
@@ -1027,6 +1055,7 @@ foreach ($items as $key => $item) {
//http://www.siasat.pk/forum/showthread.php?108883-Pakistan-Chowk-by-Rana-Mubashir-–-25th-March-2012-Special-Program-from-Liari-(Karachi)
//temporary measure: use utf8_encode()
$newitem->addElement('dc:identifier', remove_url_cruft(utf8_encode($effective_url)));
+ if ($favour_effective_url) $newitem->setLink(remove_url_cruft(utf8_encode($effective_url)));
} else {
$newitem->addElement('dc:identifier', remove_url_cruft($item->get_permalink()));
}
@@ -1162,6 +1191,7 @@ function get_self_url() {
if (isset($_GET['lang'])) $self .= '&lang='.urlencode($_GET['lang']);
if (isset($_GET['xss'])) $self .= '&xss';
if (isset($_GET['use_extracted_title'])) $self .= '&use_extracted_title';
+ if (isset($_GET['use_effective_url'])) $self .= '&use_effective_url';
if (isset($_GET['content'])) $self .= '&content='.urlencode($_GET['content']);
if (isset($_GET['summary'])) $self .= '&summary='.urlencode($_GET['summary']);
if (isset($_GET['debug'])) $self .= '&debug';
diff --git a/site_config/standard/README.md b/site_config/standard/README.md
index d44ed4e..bf2766c 100644
--- a/site_config/standard/README.md
+++ b/site_config/standard/README.md
@@ -1,12 +1,14 @@
Full-Text RSS site config files
================
-[Full-Text RSS](http://fivefilters.org/content-only/), our article extraction tool, makes use of site-specific extraction rules to improve results. Each time a URL is processed, it checks to see if there are extraction rules for the site being processed. If there are no site patterns, it tries to detect the content block automatically.
+[Full-Text RSS](http://fivefilters.org/content-only/), our article extraction tool, makes use of site-specific extraction rules to improve results. Each time a URL is processed, it checks to see if there are extraction rules for the site being processed. If there are no rules found, it tries to detect the content block automatically.
-This repository contains the site config files we use in Full-Text RSS.
+This repository contains the site-specific extraction rules we rely on in Full-Text RSS.
### Contributing changes
+We run automated tests on these files to detect issues. If you'd like to help keep these up to date, please look at the [test results](http://siteconfig.fivefilters.org/test/) and see which files you'd like to contribute fixes for.
+
We chose GitHub for this set of files because they offer one feature which we hope will make contributing changes easier: [file editing](https://github.com/blog/844-forking-with-the-edit-button) through the web interface.
You can now make changes to any of our site config files and request that your changes be pulled into the main set we maintain. This is what GitHub calls the Fork and Pull model:
@@ -19,7 +21,9 @@ If a site is not in our set, you can create a file for it in the same way. See [
### How to write a site config file
-Please see our [help page](http://help.fivefilters.org/customer/portal/articles/223153-site-patterns) for a brief guide. We hope to have some tutorials up soon.
+The quickest and simplest way is to use our [point-and-click interface](http://siteconfig.fivefilters.org). It's a simple tool only intended to create a rule to extract the correct content block.
+
+For further refinements, e.g. selecting the title, stripping elements, dealing with multi-page articles, please see our [help page](http://help.fivefilters.org/customer/portal/articles/223153-site-patterns).
### Instapaper
@@ -29,7 +33,7 @@ Marco, Instapaper's creator, graciously opened up the database of contributions
> And, recognizing that your efforts could be useful to a wide range of other tools and services, I'll make the list of all of these site-specific configurations available to the public, free, with no strings attached.
-Most of the extraction rules in our set are borrowed from Instapaper. You can see the list maintained by Instapaper at [instapaper.com/bodytext/](http://instapaper.com/bodytext/) (login required).
+Most of the extraction rules in our set are borrowed from Instapaper. You can see the list maintained by Instapaper at [instapaper.com/bodytext/](http://instapaper.com/bodytext/) (no longer available since Instapaper was sold).
### Testing site config files
diff --git a/ubuntu-15.10.pp b/ubuntu-15.10.pp
new file mode 100644
index 0000000..af357f1
--- /dev/null
+++ b/ubuntu-15.10.pp
@@ -0,0 +1,272 @@
+# Puppet file intended to install server components for self-hosted FiveFilters.org web services
+# This file is intended for base images of:
+# Ubuntu 15.10
+
+# Please see here for more information on how to use this:
+# http://help.fivefilters.org/customer/en/portal/articles/1143210-hosting
+
+Exec { path => "/bin:/usr/bin:/usr/local/bin" } # default search path applied to every exec resource in this manifest
+
+stage { 'first': before => Stage['main'] } # extra run stage ordered ahead of the built-in main stage
+stage { 'last': require => Stage['main'] } # extra run stage ordered after the built-in main stage
+
+class { # assign the init and final classes (defined below) to the two extra stages
+ 'init': stage => first;
+ 'final': stage => last;
+}
+
+class init { # first-stage setup: apt index refresh, fail2ban and unattended-upgrades
+ exec { "apt-update": # refreshes the apt package index
+ command => "apt-get update"
+ }
+ package { "fail2ban":
+ ensure => latest
+ }
+ package { "unattended-upgrades":
+ ensure => latest
+ }
+ file { "/etc/apt/apt.conf.d/20auto-upgrades": # writes the two APT::Periodic settings below, both set to "1"
+ ensure => present,
+ content => 'APT::Periodic::Update-Package-Lists "1";
+APT::Periodic::Unattended-Upgrade "1";',
+ require => Package["unattended-upgrades"]
+ }
+}
+
+# Make sure apt-update runs before every package resource (the collector matches all packages)
+Exec["apt-update"] -> Package <| |>
+
+class apache { # installs Apache 2, writes prefork and site config, enables the fivefilters site
+ exec { "enable-mod_rewrite": # must run after the package is installed and before the service is managed
+ require => Package["apache2"],
+ before => Service["apache2"],
+ #command => "/usr/sbin/a2enmod rewrite",
+ command => "sudo a2enmod rewrite",
+ }
+
+ file { "/etc/apache2/mods-available/mpm_prefork.conf": # prefork worker limits; a change triggers restart-apache
+ ensure => present,
+ content => "
+ StartServers 5
+ MinSpareServers 5
+ MaxSpareServers 10
+ MaxRequestWorkers 80
+ MaxConnectionsPerChild 0
+",
+ require => Package["apache2"],
+ notify => Exec["restart-apache"]
+ }
+
+ file { "/etc/apache2/sites-available/fivefilters.conf": # \$ keeps ${APACHE_LOG_DIR} literal so Apache, not Puppet, expands it
+ ensure => present,
+ content => "
+ ServerAdmin webmaster@localhost
+ DocumentRoot /var/www/html
+
+ ErrorLog \${APACHE_LOG_DIR}/error.log
+ CustomLog /dev/null combined
+ #CustomLog \${APACHE_LOG_DIR}/access.log combined
+
+ KeepAliveTimeout 2
+ MaxKeepAliveRequests 10
+",
+ require => Package["apache2"],
+ before => Exec["enable-fivefilters-apache2"],
+ notify => Exec["restart-apache"]
+ }
+
+ exec { "enable-fivefilters-apache2": # swaps the default site for the fivefilters site written above
+ require => [Package["apache2"], Service["apache2"]],
+ command => "sudo a2dissite 000-default && sudo a2ensite fivefilters"
+ }
+
+ exec { "disable-mod_status":
+ require => Package["apache2"],
+ before => Service["apache2"],
+ command => "sudo a2dismod status",
+ }
+
+ package { "apache2":
+ ensure => latest
+ }
+
+ service { "apache2":
+ ensure => running,
+ require => Package["apache2"]
+ }
+
+ exec { "restart-apache": # refreshonly: runs only when another resource notifies it
+ command => "sudo service apache2 restart",
+ require => Package["apache2"],
+ refreshonly => true
+ }
+ #TODO: Set AllowOverride All in default config to enable .htaccess
+}
+
+class php { # installs PHP 5 with its extensions and build tools, then enables the FiveFilters ini overrides
+ package { "php5": ensure => latest }
+ package { "libapache2-mod-php5": ensure => latest }
+ package { "php5-cli": ensure => latest }
+ package { "php5-tidy": ensure => latest }
+ package { "php5-curl": ensure => latest }
+ package { "libcurl4-gnutls-dev": ensure => latest }
+ package { "libpcre3-dev": ensure => latest }
+ package { "make": ensure=>latest }
+ package { "php-pear": ensure => latest }
+ package { "php5-dev": ensure => latest }
+ package { "php5-intl": ensure => latest }
+ package { "php5-gd": ensure => latest }
+ package { "php5-imagick": ensure => latest }
+ package { "php5-json": ensure => latest }
+ #package { "php-http": ensure => latest }
+ package { "php5-raphf": ensure => latest }
+ package { "php5-propro": ensure => latest }
+ file { "/etc/php5/mods-available/fivefilters-php.ini": # ini overrides; the last directive is date.timezone (was misspelled, so it had no effect)
+ ensure => present,
+ content => "engine = On
+ expose_php = Off
+ max_execution_time = 120
+ memory_limit = 128M
+ error_reporting = E_ALL & ~E_DEPRECATED
+ display_errors = Off
+ display_startup_errors = Off
+ html_errors = Off
+ default_socket_timeout = 120
+ file_uploads = Off
+ date.timezone = 'UTC'",
+ require => Package["php5"],
+ before => Exec["enable-fivefilters-php"],
+ }
+ exec { "enable-fivefilters-php": # ordered after the ini file above through that file's before =>
+ command => "sudo php5enmod fivefilters-php",
+ }
+}
+
+class php_pecl_http { # writes http.ini, enables the module, then installs pecl_http 2.5.5
+ # Important: this file needs to be in place before we install the HTTP extension
+ file { "/etc/php5/mods-available/http.ini": # loads raphf and propro ahead of http.so
+ ensure => present,
+ #owner => root, group => root, mode => 444,
+ content => "; priority=25
+extension=raphf.so
+extension=propro.so
+extension=http.so",
+ before => [Exec["install-http-pecl"], Exec["enable-http"]],
+ require => Class["php"]
+ }
+
+ exec { "enable-http":
+ command => "sudo php5enmod http",
+ require => Class["php"],
+ }
+
+ package { "libidn11-dev": # installed before the pecl build runs
+ ensure => latest,
+ before => Exec["install-http-pecl"]
+ }
+
+ package { "libevent-dev": # installed before the pecl build runs
+ ensure => latest,
+ before => Exec["install-http-pecl"]
+ }
+
+ exec { "install-http-pecl": # pinned to the 2.5.5 tarball; see the note below about 3.0
+ command => "pecl install https://pecl.php.net/get/pecl_http-2.5.5.tgz",
+ #command => "sudo pecl install pecl_http",
+ # the above is now version 3.0 - requires PHP7
+ #command => "pecl install http://pecl.php.net/get/pecl_http-1.7.6.tgz",
+ #creates => "/tmp/needed/directory",
+ require => Exec["enable-http"]
+ }
+}
+
+class php_pecl_apcu { # installs APCu 4.0.10 through pecl, writes apcu.ini and enables the module
+ exec { "install-apcu-pecl": # runs after everything in class php
+ command => "sudo pecl install channel://pecl.php.net/APCu-4.0.10",
+ #creates => "/tmp/needed/directory",
+ require => Class["php"]
+ }
+
+ file { "/etc/php5/mods-available/apcu.ini": # written after the pecl install and before php5enmod
+ ensure => present,
+ #owner => root, group => root, mode => 444,
+ content => "extension=apcu.so",
+ require => Exec["install-apcu-pecl"],
+ before => Exec["enable-apcu"]
+ }
+ exec { "enable-apcu": # notifies restart-apache, which is declared in class apache
+ command => "sudo php5enmod apcu",
+ notify => Exec["restart-apache"],
+ }
+}
+
+class php_cld { # clones, pins, builds and enables the php-cld extension from /tmp/cld
+ # see https://github.com/lstrojny/php-cld
+ package { "git": ensure => latest }
+
+ package { "build-essential": ensure => latest }
+
+ file { "/tmp/cld": # removes any previous checkout so the clone starts from an empty path
+ ensure => absent,
+ before => Exec["download-cld"],
+ recurse => true,
+ force => true
+ }
+
+ exec { "download-cld": # cloned over https because GitHub no longer serves the git:// protocol
+ command => "git clone https://github.com/lstrojny/php-cld.git /tmp/cld",
+ require => [Package["git"], Class["php"]],
+ before => Exec["build-cld"]
+ }
+
+ exec { "checkout-cld-version":
+ # recent version does not work, so we switch to an older one
+ command => "git reset --hard fd5aa5721b01bfe547ff6674fa0daa9c3b791ca3",
+ cwd => "/tmp/cld",
+ require => Exec["download-cld"],
+ before => Exec["build-cld"]
+ }
+
+ exec { "build-cld": # builds the bundled libcld that the extension configure step points at
+ command => "./build.sh",
+ #new cld:command => "sh compile_libs.sh",
+ cwd => "/tmp/cld/vendor/libcld",
+ require => Package["build-essential"],
+ provider => "shell"
+ }
+
+ exec { "install-cld-extension": # compiles and installs the PHP extension against the libcld built above
+ command => "phpize && ./configure --with-libcld-dir=/tmp/cld/vendor/libcld && make && sudo make install",
+ cwd => "/tmp/cld",
+ provider => "shell",
+ require => Exec["build-cld"]
+ }
+
+ file { "/etc/php5/mods-available/cld.ini":
+ ensure => present,
+ #owner => root, group => root, mode => 444,
+ content => "extension=cld.so",
+ require => Exec["install-cld-extension"],
+ before => Exec["enable-cld"],
+ }
+
+ exec { "enable-cld": # notifies restart-apache, which is declared in class apache
+ command => "sudo php5enmod cld",
+ notify => Exec["restart-apache"],
+ }
+}
+
+class final { # last-stage tuning: sets vm.swappiness to 10 in /etc/sysctl.conf
+ exec { "lower-swappiness": # the unless guard stops the line being appended again on every run
+ command => "echo 'vm.swappiness = 10' >> /etc/sysctl.conf && sudo sysctl -p",
+ provider => "shell", unless => "grep -qxF 'vm.swappiness = 10' /etc/sysctl.conf"
+ }
+}
+
+include init # assigned to stage first by the class declaration near the top
+include apache
+include php
+include php_pecl_apcu
+include php_cld
+include php_pecl_http
+include final # assigned to stage last by the class declaration near the top
\ No newline at end of file