Full-Text RSS 3.6

This commit is contained in:
FiveFilters.org 2019-04-04 23:15:15 +02:00
parent b57a2b7eed
commit 9658f6a00b
12 changed files with 562 additions and 135 deletions

View File

@ -384,11 +384,11 @@ if (isset($MYREQUEST['IMG']))
break; break;
case 2: case 2:
$s=$cache['nhits']+$cache['nmisses']; $s=$cache['num_hits']+$cache['num_misses'];
$a=$cache['nhits']; $a=$cache['num_hits'];
fill_box($image, 30,$size,50,$s ? (-$a*($size-21)/$s) : 0,$col_black,$col_green,sprintf("%.1f%%",$s ? $cache['nhits']*100/$s : 0)); fill_box($image, 30,$size,50,$s ? (-$a*($size-21)/$s) : 0,$col_black,$col_green,sprintf("%.1f%%",$s ? $cache['num_hits']*100/$s : 0));
fill_box($image,130,$size,50,$s ? -max(4,($s-$a)*($size-21)/$s) : 0,$col_black,$col_red,sprintf("%.1f%%",$s ? $cache['nmisses']*100/$s : 0)); fill_box($image,130,$size,50,$s ? -max(4,($s-$a)*($size-21)/$s) : 0,$col_black,$col_red,sprintf("%.1f%%",$s ? $cache['num_misses']*100/$s : 0));
break; break;
case 3: case 3:
@ -433,11 +433,11 @@ if (isset($MYREQUEST['IMG']))
break; break;
case 4: case 4:
$s=$cache['nhits']+$cache['nmisses']; $s=$cache['num_hits']+$cache['num_misses'];
$a=$cache['nhits']; $a=$cache['num_hits'];
fill_box($image, 30,$size,50,$s ? -$a*($size-21)/$s : 0,$col_black,$col_green,sprintf("%.1f%%", $s ? $cache['nhits']*100/$s : 0)); fill_box($image, 30,$size,50,$s ? -$a*($size-21)/$s : 0,$col_black,$col_green,sprintf("%.1f%%", $s ? $cache['num_hits']*100/$s : 0));
fill_box($image,130,$size,50,$s ? -max(4,($s-$a)*($size-21)/$s) : 0,$col_black,$col_red,sprintf("%.1f%%", $s ? $cache['nmisses']*100/$s : 0)); fill_box($image,130,$size,50,$s ? -max(4,($s-$a)*($size-21)/$s) : 0,$col_black,$col_red,sprintf("%.1f%%", $s ? $cache['num_misses']*100/$s : 0));
break; break;
} }
@ -772,13 +772,13 @@ case OB_HOST_STATS:
$mem_avail= $mem['avail_mem']; $mem_avail= $mem['avail_mem'];
$mem_used = $mem_size-$mem_avail; $mem_used = $mem_size-$mem_avail;
$seg_size = bsize($mem['seg_size']); $seg_size = bsize($mem['seg_size']);
$req_rate_user = sprintf("%.2f", $cache['nhits'] ? (($cache['nhits']+$cache['nmisses'])/($time-$cache['stime'])) : 0); $req_rate_user = sprintf("%.2f", $cache['num_hits'] ? (($cache['num_hits']+$cache['num_misses'])/($time-$cache['start_time'])) : 0);
$hit_rate_user = sprintf("%.2f", $cache['nhits'] ? (($cache['nhits'])/($time-$cache['stime'])) : 0); $hit_rate_user = sprintf("%.2f", $cache['num_hits'] ? (($cache['num_hits'])/($time-$cache['start_time'])) : 0);
$miss_rate_user = sprintf("%.2f", $cache['nmisses'] ? (($cache['nmisses'])/($time-$cache['stime'])) : 0); $miss_rate_user = sprintf("%.2f", $cache['num_misses'] ? (($cache['num_misses'])/($time-$cache['start_time'])) : 0);
$insert_rate_user = sprintf("%.2f", $cache['ninserts'] ? (($cache['ninserts'])/($time-$cache['stime'])) : 0); $insert_rate_user = sprintf("%.2f", $cache['num_inserts'] ? (($cache['num_inserts'])/($time-$cache['start_time'])) : 0);
$apcversion = phpversion('apcu'); $apcversion = phpversion('apcu');
$phpversion = phpversion(); $phpversion = phpversion();
$number_vars = $cache['nentries']; $number_vars = $cache['num_entries'];
$size_vars = bsize($cache['mem_size']); $size_vars = bsize($cache['mem_size']);
$i=0; $i=0;
echo <<< EOB echo <<< EOB
@ -798,9 +798,8 @@ EOB;
<br/> ({$cache['memory_type']} memory) <br/> ({$cache['memory_type']} memory)
</td></tr> </td></tr>
EOB; EOB;
echo '<tr class=tr-1><td class=td-0>Start Time</td><td>',date(DATE_FORMAT,$cache['stime']),'</td></tr>'; echo '<tr class=tr-1><td class=td-0>Start Time</td><td>',date(DATE_FORMAT,$cache['start_time']),'</td></tr>';
echo '<tr class=tr-0><td class=td-0>Uptime</td><td>',duration($cache['stime']),'</td></tr>'; echo '<tr class=tr-0><td class=td-0>Uptime</td><td>',duration($cache['start_time']),'</td></tr>';
echo '<tr class=tr-1><td class=td-0>File Upload Support</td><td>',$cache['file_upload_progress'],'</td></tr>';
echo <<<EOB echo <<<EOB
</tbody></table> </tbody></table>
</div> </div>
@ -809,13 +808,13 @@ EOB;
<table cellspacing=0> <table cellspacing=0>
<tbody> <tbody>
<tr class=tr-0><td class=td-0>Cached Variables</td><td>$number_vars ($size_vars)</td></tr> <tr class=tr-0><td class=td-0>Cached Variables</td><td>$number_vars ($size_vars)</td></tr>
<tr class=tr-1><td class=td-0>Hits</td><td>{$cache['nhits']}</td></tr> <tr class=tr-1><td class=td-0>Hits</td><td>{$cache['num_hits']}</td></tr>
<tr class=tr-0><td class=td-0>Misses</td><td>{$cache['nmisses']}</td></tr> <tr class=tr-0><td class=td-0>Misses</td><td>{$cache['num_misses']}</td></tr>
<tr class=tr-1><td class=td-0>Request Rate (hits, misses)</td><td>$req_rate_user cache requests/second</td></tr> <tr class=tr-1><td class=td-0>Request Rate (hits, misses)</td><td>$req_rate_user cache requests/second</td></tr>
<tr class=tr-0><td class=td-0>Hit Rate</td><td>$hit_rate_user cache requests/second</td></tr> <tr class=tr-0><td class=td-0>Hit Rate</td><td>$hit_rate_user cache requests/second</td></tr>
<tr class=tr-1><td class=td-0>Miss Rate</td><td>$miss_rate_user cache requests/second</td></tr> <tr class=tr-1><td class=td-0>Miss Rate</td><td>$miss_rate_user cache requests/second</td></tr>
<tr class=tr-0><td class=td-0>Insert Rate</td><td>$insert_rate_user cache requests/second</td></tr> <tr class=tr-0><td class=td-0>Insert Rate</td><td>$insert_rate_user cache requests/second</td></tr>
<tr class=tr-1><td class=td-0>Cache full count</td><td>{$cache['nexpunges']}</td></tr> <tr class=tr-1><td class=td-0>Cache full count</td><td>{$cache['expunges']}</td></tr>
</tbody> </tbody>
</table> </table>
</div> </div>
@ -857,11 +856,11 @@ EOB;
: "", : "",
'<tr>', '<tr>',
'<td class=td-0><span class="green box">&nbsp;</span>Free: ',bsize($mem_avail).sprintf(" (%.1f%%)",$mem_avail*100/$mem_size),"</td>\n", '<td class=td-0><span class="green box">&nbsp;</span>Free: ',bsize($mem_avail).sprintf(" (%.1f%%)",$mem_avail*100/$mem_size),"</td>\n",
'<td class=td-1><span class="green box">&nbsp;</span>Hits: ',$cache['nhits'].@sprintf(" (%.1f%%)",$cache['nhits']*100/($cache['nhits']+$cache['nmisses'])),"</td>\n", '<td class=td-1><span class="green box">&nbsp;</span>Hits: ',$cache['num_hits'].@sprintf(" (%.1f%%)",$cache['num_hits']*100/($cache['num_hits']+$cache['num_misses'])),"</td>\n",
'</tr>', '</tr>',
'<tr>', '<tr>',
'<td class=td-0><span class="red box">&nbsp;</span>Used: ',bsize($mem_used).sprintf(" (%.1f%%)",$mem_used *100/$mem_size),"</td>\n", '<td class=td-0><span class="red box">&nbsp;</span>Used: ',bsize($mem_used).sprintf(" (%.1f%%)",$mem_used *100/$mem_size),"</td>\n",
'<td class=td-1><span class="red box">&nbsp;</span>Misses: ',$cache['nmisses'].@sprintf(" (%.1f%%)",$cache['nmisses']*100/($cache['nhits']+$cache['nmisses'])),"</td>\n"; '<td class=td-1><span class="red box">&nbsp;</span>Misses: ',$cache['num_misses'].@sprintf(" (%.1f%%)",$cache['num_misses']*100/($cache['num_hits']+$cache['num_misses'])),"</td>\n";
echo <<< EOB echo <<< EOB
</tr> </tr>
</tbody></table> </tbody></table>
@ -932,9 +931,9 @@ case OB_USER_CACHE:
echo '</div>'; echo '</div>';
break; break;
} }
$fieldname='key'; $fieldname='info';
$fieldheading='User Entry Label'; $fieldheading='User Entry Label';
$fieldkey='key'; $fieldkey='info';
$cols=6; $cols=6;
echo <<<EOB echo <<<EOB
@ -1008,14 +1007,14 @@ EOB;
foreach($cache[$scope_list[$MYREQUEST['SCOPE']]] as $i => $entry) { foreach($cache[$scope_list[$MYREQUEST['SCOPE']]] as $i => $entry) {
switch($MYREQUEST['SORT1']) { switch($MYREQUEST['SORT1']) {
case 'A': $k=sprintf('%015d-',$entry['atime']); break; case 'A': $k=sprintf('%015d-',$entry['access_time']); break;
case 'H': $k=sprintf('%015d-',$entry['nhits']); break; case 'H': $k=sprintf('%015d-',$entry['num_hits']); break;
case 'Z': $k=sprintf('%015d-',$entry['mem_size']); break; case 'Z': $k=sprintf('%015d-',$entry['mem_size']); break;
case 'M': $k=sprintf('%015d-',$entry['mtime']); break; case 'M': $k=sprintf('%015d-',$entry['mtime']); break;
case 'C': $k=sprintf('%015d-',$entry['ctime']); break; case 'C': $k=sprintf('%015d-',$entry['creation_time']); break;
case 'T': $k=sprintf('%015d-',$entry['ttl']); break; case 'T': $k=sprintf('%015d-',$entry['ttl']); break;
case 'D': $k=sprintf('%015d-',$entry['dtime']); break; case 'D': $k=sprintf('%015d-',$entry['deletion_time']); break;
case 'S': $k=$entry["key"]; break; case 'S': $k=$entry["info"]; break;
} }
if (!$AUTHENTICATED) { if (!$AUTHENTICATED) {
// hide all path entries if not logged in // hide all path entries if not logged in
@ -1037,16 +1036,16 @@ EOB;
$i=0; $i=0;
foreach($list as $k => $entry) { foreach($list as $k => $entry) {
if(!$MYREQUEST['SEARCH'] || preg_match($MYREQUEST['SEARCH'], $entry[$fieldname]) != 0) { if(!$MYREQUEST['SEARCH'] || preg_match($MYREQUEST['SEARCH'], $entry[$fieldname]) != 0) {
$sh=md5($entry["key"]); $sh=md5($entry["info"]);
$field_value = htmlentities(strip_tags($entry[$fieldname],''), ENT_QUOTES, 'UTF-8'); $field_value = htmlentities(strip_tags($entry[$fieldname],''), ENT_QUOTES, 'UTF-8');
echo echo
'<tr class=tr-',$i%2,'>', '<tr id="key-'. $sh .'" class=tr-',$i%2,'>',
"<td class=td-0><a href=\"$MY_SELF&OB=",$MYREQUEST['OB'],"&SH=",$sh,"\">",$field_value,'</a></td>', "<td class=td-0><a href=\"$MY_SELF&OB=",$MYREQUEST['OB'],"&SH=",$sh,"#key-". $sh ."\">",$field_value,'</a></td>',
'<td class="td-n center">',$entry['nhits'],'</td>', '<td class="td-n center">',$entry['num_hits'],'</td>',
'<td class="td-n right">',$entry['mem_size'],'</td>', '<td class="td-n right">',$entry['mem_size'],'</td>',
'<td class="td-n center">',date(DATE_FORMAT,$entry['atime']),'</td>', '<td class="td-n center">',date(DATE_FORMAT,$entry['access_time']),'</td>',
'<td class="td-n center">',date(DATE_FORMAT,$entry['mtime']),'</td>', '<td class="td-n center">',date(DATE_FORMAT,$entry['mtime']),'</td>',
'<td class="td-n center">',date(DATE_FORMAT,$entry['ctime']),'</td>'; '<td class="td-n center">',date(DATE_FORMAT,$entry['creation_time']),'</td>';
if($fieldname=='info') { if($fieldname=='info') {
if($entry['ttl']) if($entry['ttl'])
@ -1054,9 +1053,9 @@ EOB;
else else
echo '<td class="td-n center">None</td>'; echo '<td class="td-n center">None</td>';
} }
if ($entry['dtime']) { if ($entry['deletion_time']) {
echo '<td class="td-last center">', date(DATE_FORMAT,$entry['dtime']), '</td>'; echo '<td class="td-last center">', date(DATE_FORMAT,$entry['deletion_time']), '</td>';
} else if ($MYREQUEST['OB'] == OB_USER_CACHE) { } else if ($MYREQUEST['OB'] == OB_USER_CACHE) {
echo '<td class="td-last center">'; echo '<td class="td-last center">';
@ -1068,7 +1067,7 @@ EOB;
echo '</tr>'; echo '</tr>';
if ($sh == $MYREQUEST["SH"]) { if ($sh == $MYREQUEST["SH"]) {
echo '<tr>'; echo '<tr>';
echo '<td colspan="7"><pre>'.htmlentities(print_r(apcu_fetch($entry['key']), 1)).'</pre></td>'; echo '<td colspan="7"><pre>'.htmlentities(print_r(apcu_fetch($entry['info']), 1)).'</pre></td>';
echo '</tr>'; echo '</tr>';
} }
$i++; $i++;

View File

@ -2,6 +2,19 @@ FiveFilters.org: Full-Text RSS
http://fivefilters.org/content-only/ http://fivefilters.org/content-only/
CHANGELOG CHANGELOG
------------------------------------ ------------------------------------
3.6 (2016-02-21)
- Insert og:image (if we find one) at the top of the article when no images have been extracted
- Additional lazy image load handling - helps preserve more images designed for JS-enabled browsers
- Original GUID values from feed items now preserved
- New config option favour_effective_url determines if item's effective URL (after redirects) should replace original item URL in feed output
- Adding &use_effective_url to querystring will replace original feed item URL with effective URL (unless disabled with config option above)
- APCu stats view in admin panel fixed to work with recent versions of APCu
- HTML5-PHP library updated
- Tested for PHP 7 compatibility
- VPS Puppet script (ubuntu-15.10.pp) updated - fixes issue with IDN encodings, among other things. (This is intended for setting up a new Ubuntu 15.10 instance for running Full-Text RSS.)
- Site config files updated for better extraction
- Other minor fixes/improvements
3.5 (2015-06-13) 3.5 (2015-06-13)
- Open Graph properties og:title, og:type, og:url, og:image, and og:description now returned if found in the page being processed - Open Graph properties og:title, og:type, og:url, og:image, and og:description now returned if found in the page being processed
- Bug fix: certain XPath expressions weren't being evaluated correctly when HTML5 parsing was enabled - Bug fix: certain XPath expressions weren't being evaluated correctly when HTML5 parsing was enabled
@ -16,9 +29,6 @@ CHANGELOG
- Site config files updated for better extraction - Site config files updated for better extraction
- Other minor fixes/improvements - Other minor fixes/improvements
3.4.1 (unreleased)
- Backporting Dave Vasilevsky cookie patch. Fixes issues with certain sites. See https://gist.github.com/fivefilters/0a758b6d64ce4fb5728c
3.4 (2014-09-08) 3.4 (2014-09-08)
- New request parameter: siteconfig lets you submit extraction rules directly in request - New request parameter: siteconfig lets you submit extraction rules directly in request
- New request parameter: accept=(auto|feed|html) determines what we'll accept as a response (deprecates html=1 parameter) - New request parameter: accept=(auto|feed|html) determines what we'll accept as a response (deprecates html=1 parameter)

View File

@ -310,6 +310,22 @@ $options->max_entries_with_key = 10;
// false - disabled // false - disabled
$options->xss_filter = 'user'; $options->xss_filter = 'user';
// Use effective URL in place of item URL
// ----------------------
// When we extract content for feed items, we often end up at a different URL than the
// one in the original feed. This is often the result of URL shorteners or
// tracking services being used by the feed publisher. We put the final
// (effective) URL - the one we reached to get the content - in the dc:identifier field.
// If you enable this, we'll also use this URL in place of the original item URL
// in the new feed we produce.
// By default, we keep the original item URL but the user can request the effective
// URL by passing '&use_effective_url' in the querystring.
// Possible values:
// * Use effective URL: true
// * Keep item URL from original feed: false
// * Keep item URL unless user requests effective URL: 'user' (default)
$options->favour_effective_url = 'user';
// Favour item titles in feed // Favour item titles in feed
// ---------------------- // ----------------------
// By default, when processing feeds, we assume item titles in the feed // By default, when processing feeds, we assume item titles in the feed
@ -484,7 +500,7 @@ $options->cache_cleanup = 100;
/// DO NOT CHANGE ANYTHING BELOW THIS /////////// /// DO NOT CHANGE ANYTHING BELOW THIS ///////////
///////////////////////////////////////////////// /////////////////////////////////////////////////
if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '3.5'); if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '3.6');
if (basename(__FILE__) == 'config.php') { if (basename(__FILE__) == 'config.php') {
if (file_exists(dirname(__FILE__).'/custom_config.php')) { if (file_exists(dirname(__FILE__).'/custom_config.php')) {

View File

@ -16,7 +16,7 @@ SimplePie.org. We have kept most of their checks intact as we use SimplePie in o
http://github.com/simplepie/simplepie/tree/master/compatibility_test/ http://github.com/simplepie/simplepie/tree/master/compatibility_test/
*/ */
$app_name = 'Full-Text RSS 3.5'; $app_name = 'Full-Text RSS 3.6';
// Full-Text RSS is not yet compatible with HHVM, that's why we check for it with HHVM_VERSION. // Full-Text RSS is not yet compatible with HHVM, that's why we check for it with HHVM_VERSION.
//$php_ok = (function_exists('version_compare') && version_compare(phpversion(), '5.2.0', '>=') && !defined('HHVM_VERSION')); //$php_ok = (function_exists('version_compare') && version_compare(phpversion(), '5.2.0', '>=') && !defined('HHVM_VERSION'));

View File

@ -5,10 +5,10 @@
* Uses patterns specified in site config files and auto detection (hNews/PHP Readability) * Uses patterns specified in site config files and auto detection (hNews/PHP Readability)
* to extract content from HTML files. * to extract content from HTML files.
* *
* @version 1.1 * @version 1.2
* @date 2014-03-28 * @date 2016-02-21
* @author Keyvan Minoukadeh * @author Keyvan Minoukadeh
* @copyright 2014 Keyvan Minoukadeh * @copyright 2016 Keyvan Minoukadeh
* @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3 * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
*/ */
@ -765,6 +765,11 @@ class ContentExtractor
} }
} }
// prevent self-closing iframes // prevent self-closing iframes
if ($this->body->tagName === 'iframe') {
if (!$this->body->hasChildNodes()) {
$this->body->appendChild($this->body->ownerDocument->createTextNode('[embedded content]'));
}
} else {
$elems = $this->body->getElementsByTagName('iframe'); $elems = $this->body->getElementsByTagName('iframe');
for ($i = $elems->length-1; $i >= 0; $i--) { for ($i = $elems->length-1; $i >= 0; $i--) {
$e = $elems->item($i); $e = $elems->item($i);
@ -772,6 +777,7 @@ class ContentExtractor
$e->appendChild($this->body->ownerDocument->createTextNode('[embedded content]')); $e->appendChild($this->body->ownerDocument->createTextNode('[embedded content]'));
} }
} }
}
// remove image lazy loading - WordPress plugin http://wordpress.org/extend/plugins/lazy-load/ // remove image lazy loading - WordPress plugin http://wordpress.org/extend/plugins/lazy-load/
// the plugin replaces the src attribute to point to a 1x1 gif and puts the original src // the plugin replaces the src attribute to point to a 1x1 gif and puts the original src
// inside the data-lazy-src attribute. It also places the original image inside a noscript element // inside the data-lazy-src attribute. It also places the original image inside a noscript element
@ -791,6 +797,24 @@ class ContentExtractor
$e->removeAttribute('data-lazy-src'); $e->removeAttribute('data-lazy-src');
} }
} }
// now let's deal with another lazy load technique. Example:
// <img src="" class="lazyload"
// data-src="http://i68.tinypic.com/2jabu8.jpg" alt="Image and video hosting by TinyPic" border="0" />
$elems = @$xpath->query("//img[@data-src and contains(@class, 'lazyload') and contains(@src, 'data:image')]", $this->body);
for ($i = $elems->length-1; $i >= 0; $i--) {
$e = $elems->item($i);
$e->setAttribute('src', $e->getAttribute('data-src'));
$e->removeAttribute('data-src');
}
// If there's an og:image, but we have no images in the article, let's place it at the beginning of the article.
if ($this->body->hasChildNodes() && isset($this->opengraph['og:image']) && substr($this->opengraph['og:image'], 0, 4) === 'http') {
$elems = @$xpath->query("//img", $this->body);
if ($elems->length === 0) {
$_new_elem = $this->body->ownerDocument->createDocumentFragment();
@$_new_elem->appendXML('<div><img src="'.htmlspecialchars($this->opengraph['og:image']).'" class="ff-og-image-inserted" /></div>');
$this->body->insertBefore($_new_elem, $this->body->firstChild);
}
}
$this->success = true; $this->success = true;
} }

View File

@ -1,8 +1,8 @@
<?php <?php
/* /*
htmLawed 1.1.19, 19 January 2015 htmLawed 1.1.20, 9 June 2015
OOP code, 19 January 2015 OOP code, 9 June 2015
Copyright Santosh Patnaik Copyright Santosh Patnaik
Dual LGPL v3 and GPL v2+ license Dual LGPL v3 and GPL v2+ license
A PHP Labware internal utility; www.bioinformatics.org/phplabware/internal_utilities/htmLawed A PHP Labware internal utility; www.bioinformatics.org/phplabware/internal_utilities/htmLawed
@ -437,7 +437,7 @@ if(!empty($m[1])){
} }
// open tag & attr // open tag & attr
static $aN = array('abbr'=>array('td'=>1, 'th'=>1), 'accept-charset'=>array('form'=>1), 'accept'=>array('form'=>1, 'input'=>1), 'accesskey'=>array('a'=>1, 'area'=>1, 'button'=>1, 'input'=>1, 'label'=>1, 'legend'=>1, 'textarea'=>1), 'action'=>array('form'=>1), 'align'=>array('caption'=>1, 'embed'=>1, 'applet'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'object'=>1, 'legend'=>1, 'table'=>1, 'hr'=>1, 'div'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'p'=>1, 'col'=>1, 'colgroup'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'alt'=>array('applet'=>1, 'area'=>1, 'img'=>1, 'input'=>1), 'archive'=>array('applet'=>1, 'object'=>1), 'axis'=>array('td'=>1, 'th'=>1), 'bgcolor'=>array('embed'=>1, 'table'=>1, 'tr'=>1, 'td'=>1, 'th'=>1), 'border'=>array('table'=>1, 'img'=>1, 'object'=>1), 'bordercolor'=>array('table'=>1, 'td'=>1, 'tr'=>1), 'cellpadding'=>array('table'=>1), 'cellspacing'=>array('table'=>1), 'char'=>array('col'=>1, 'colgroup'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'charoff'=>array('col'=>1, 'colgroup'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'charset'=>array('a'=>1, 'script'=>1), 'checked'=>array('input'=>1), 'cite'=>array('blockquote'=>1, 'q'=>1, 'del'=>1, 'ins'=>1), 'classid'=>array('object'=>1), 'clear'=>array('br'=>1), 'code'=>array('applet'=>1), 'codebase'=>array('object'=>1, 'applet'=>1), 'codetype'=>array('object'=>1), 'color'=>array('font'=>1), 'cols'=>array('textarea'=>1), 'colspan'=>array('td'=>1, 'th'=>1), 'compact'=>array('dir'=>1, 'dl'=>1, 'menu'=>1, 'ol'=>1, 'ul'=>1), 'coords'=>array('area'=>1, 'a'=>1), 'data'=>array('object'=>1), 'datetime'=>array('del'=>1, 'ins'=>1), 'declare'=>array('object'=>1), 'defer'=>array('script'=>1), 'dir'=>array('bdo'=>1), 'disabled'=>array('button'=>1, 'input'=>1, 'optgroup'=>1, 'option'=>1, 'select'=>1, 'textarea'=>1), 'enctype'=>array('form'=>1), 'face'=>array('font'=>1), 'for'=>array('label'=>1), 'frame'=>array('table'=>1), 
'frameborder'=>array('iframe'=>1), 'headers'=>array('td'=>1, 'th'=>1), 'height'=>array('embed'=>1, 'iframe'=>1, 'td'=>1, 'th'=>1, 'img'=>1, 'object'=>1, 'applet'=>1), 'href'=>array('a'=>1, 'area'=>1), 'hreflang'=>array('a'=>1), 'hspace'=>array('applet'=>1, 'img'=>1, 'object'=>1), 'ismap'=>array('img'=>1, 'input'=>1), 'label'=>array('option'=>1, 'optgroup'=>1), 'language'=>array('script'=>1), 'longdesc'=>array('img'=>1, 'iframe'=>1), 'marginheight'=>array('iframe'=>1), 'marginwidth'=>array('iframe'=>1), 'maxlength'=>array('input'=>1), 'method'=>array('form'=>1), 'model'=>array('embed'=>1), 'multiple'=>array('select'=>1), 'name'=>array('button'=>1, 'embed'=>1, 'textarea'=>1, 'applet'=>1, 'select'=>1, 'form'=>1, 'iframe'=>1, 'img'=>1, 'a'=>1, 'input'=>1, 'object'=>1, 'map'=>1, 'param'=>1), 'nohref'=>array('area'=>1), 'noshade'=>array('hr'=>1), 'nowrap'=>array('td'=>1, 'th'=>1), 'object'=>array('applet'=>1), 'onblur'=>array('a'=>1, 'area'=>1, 'button'=>1, 'input'=>1, 'label'=>1, 'select'=>1, 'textarea'=>1), 'onchange'=>array('input'=>1, 'select'=>1, 'textarea'=>1), 'onfocus'=>array('a'=>1, 'area'=>1, 'button'=>1, 'input'=>1, 'label'=>1, 'select'=>1, 'textarea'=>1), 'onreset'=>array('form'=>1), 'onselect'=>array('input'=>1, 'textarea'=>1), 'onsubmit'=>array('form'=>1), 'pluginspage'=>array('embed'=>1), 'pluginurl'=>array('embed'=>1), 'prompt'=>array('isindex'=>1), 'readonly'=>array('textarea'=>1, 'input'=>1), 'rel'=>array('a'=>1), 'rev'=>array('a'=>1), 'rows'=>array('textarea'=>1), 'rowspan'=>array('td'=>1, 'th'=>1), 'rules'=>array('table'=>1), 'scope'=>array('td'=>1, 'th'=>1), 'scrolling'=>array('iframe'=>1), 'selected'=>array('option'=>1), 'shape'=>array('area'=>1, 'a'=>1), 'size'=>array('hr'=>1, 'font'=>1, 'input'=>1, 'select'=>1), 'span'=>array('col'=>1, 'colgroup'=>1), 'src'=>array('embed'=>1, 'script'=>1, 'input'=>1, 'iframe'=>1, 'img'=>1), 'standby'=>array('object'=>1), 'start'=>array('ol'=>1), 'summary'=>array('table'=>1), 'tabindex'=>array('a'=>1, 'area'=>1, 
'button'=>1, 'input'=>1, 'object'=>1, 'select'=>1, 'textarea'=>1), 'target'=>array('a'=>1, 'area'=>1, 'form'=>1), 'type'=>array('a'=>1, 'embed'=>1, 'object'=>1, 'param'=>1, 'script'=>1, 'input'=>1, 'li'=>1, 'ol'=>1, 'ul'=>1, 'button'=>1), 'usemap'=>array('img'=>1, 'input'=>1, 'object'=>1), 'valign'=>array('col'=>1, 'colgroup'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'value'=>array('input'=>1, 'option'=>1, 'param'=>1, 'button'=>1, 'li'=>1), 'valuetype'=>array('param'=>1), 'vspace'=>array('applet'=>1, 'img'=>1, 'object'=>1), 'width'=>array('embed'=>1, 'hr'=>1, 'iframe'=>1, 'img'=>1, 'object'=>1, 'table'=>1, 'td'=>1, 'th'=>1, 'applet'=>1, 'col'=>1, 'colgroup'=>1, 'pre'=>1), 'wmode'=>array('embed'=>1), 'xml:space'=>array('pre'=>1, 'script'=>1, 'style'=>1)); // Ele-specific static $aN = array('abbr'=>array('td'=>1, 'th'=>1), 'accept-charset'=>array('form'=>1), 'accept'=>array('form'=>1, 'input'=>1), 'accesskey'=>array('a'=>1, 'area'=>1, 'button'=>1, 'input'=>1, 'label'=>1, 'legend'=>1, 'textarea'=>1), 'action'=>array('form'=>1), 'align'=>array('caption'=>1, 'embed'=>1, 'applet'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'object'=>1, 'legend'=>1, 'table'=>1, 'hr'=>1, 'div'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'p'=>1, 'col'=>1, 'colgroup'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'allowfullscreen'=>array('iframe'=>1), 'alt'=>array('applet'=>1, 'area'=>1, 'img'=>1, 'input'=>1), 'archive'=>array('applet'=>1, 'object'=>1), 'axis'=>array('td'=>1, 'th'=>1), 'bgcolor'=>array('embed'=>1, 'table'=>1, 'tr'=>1, 'td'=>1, 'th'=>1), 'border'=>array('table'=>1, 'img'=>1, 'object'=>1), 'bordercolor'=>array('table'=>1, 'td'=>1, 'tr'=>1), 'cellpadding'=>array('table'=>1), 'cellspacing'=>array('table'=>1), 'char'=>array('col'=>1, 'colgroup'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'charoff'=>array('col'=>1, 'colgroup'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 
'charset'=>array('a'=>1, 'script'=>1), 'checked'=>array('input'=>1), 'cite'=>array('blockquote'=>1, 'q'=>1, 'del'=>1, 'ins'=>1), 'classid'=>array('object'=>1), 'clear'=>array('br'=>1), 'code'=>array('applet'=>1), 'codebase'=>array('object'=>1, 'applet'=>1), 'codetype'=>array('object'=>1), 'color'=>array('font'=>1), 'cols'=>array('textarea'=>1), 'colspan'=>array('td'=>1, 'th'=>1), 'compact'=>array('dir'=>1, 'dl'=>1, 'menu'=>1, 'ol'=>1, 'ul'=>1), 'coords'=>array('area'=>1, 'a'=>1), 'data'=>array('object'=>1), 'datetime'=>array('del'=>1, 'ins'=>1), 'declare'=>array('object'=>1), 'defer'=>array('script'=>1), 'dir'=>array('bdo'=>1), 'disabled'=>array('button'=>1, 'input'=>1, 'optgroup'=>1, 'option'=>1, 'select'=>1, 'textarea'=>1), 'enctype'=>array('form'=>1), 'face'=>array('font'=>1), 'for'=>array('label'=>1), 'frame'=>array('table'=>1), 'frameborder'=>array('iframe'=>1), 'headers'=>array('td'=>1, 'th'=>1), 'height'=>array('embed'=>1, 'iframe'=>1, 'td'=>1, 'th'=>1, 'img'=>1, 'object'=>1, 'applet'=>1), 'href'=>array('a'=>1, 'area'=>1), 'hreflang'=>array('a'=>1), 'hspace'=>array('applet'=>1, 'img'=>1, 'object'=>1), 'ismap'=>array('img'=>1, 'input'=>1), 'label'=>array('option'=>1, 'optgroup'=>1), 'language'=>array('script'=>1), 'longdesc'=>array('img'=>1, 'iframe'=>1), 'marginheight'=>array('iframe'=>1), 'marginwidth'=>array('iframe'=>1), 'maxlength'=>array('input'=>1), 'method'=>array('form'=>1), 'model'=>array('embed'=>1), 'multiple'=>array('select'=>1), 'name'=>array('button'=>1, 'embed'=>1, 'textarea'=>1, 'applet'=>1, 'select'=>1, 'form'=>1, 'iframe'=>1, 'img'=>1, 'a'=>1, 'input'=>1, 'object'=>1, 'map'=>1, 'param'=>1), 'nohref'=>array('area'=>1), 'noshade'=>array('hr'=>1), 'nowrap'=>array('td'=>1, 'th'=>1), 'object'=>array('applet'=>1), 'onblur'=>array('a'=>1, 'area'=>1, 'button'=>1, 'input'=>1, 'label'=>1, 'select'=>1, 'textarea'=>1), 'onchange'=>array('input'=>1, 'select'=>1, 'textarea'=>1), 'onfocus'=>array('a'=>1, 'area'=>1, 'button'=>1, 'input'=>1, 'label'=>1, 
'select'=>1, 'textarea'=>1), 'onreset'=>array('form'=>1), 'onselect'=>array('input'=>1, 'textarea'=>1), 'onsubmit'=>array('form'=>1), 'pluginspage'=>array('embed'=>1), 'pluginurl'=>array('embed'=>1), 'prompt'=>array('isindex'=>1), 'readonly'=>array('textarea'=>1, 'input'=>1), 'rel'=>array('a'=>1), 'rev'=>array('a'=>1), 'rows'=>array('textarea'=>1), 'rowspan'=>array('td'=>1, 'th'=>1), 'rules'=>array('table'=>1), 'scope'=>array('td'=>1, 'th'=>1), 'scrolling'=>array('iframe'=>1), 'selected'=>array('option'=>1), 'shape'=>array('area'=>1, 'a'=>1), 'size'=>array('hr'=>1, 'font'=>1, 'input'=>1, 'select'=>1), 'span'=>array('col'=>1, 'colgroup'=>1), 'src'=>array('embed'=>1, 'script'=>1, 'input'=>1, 'iframe'=>1, 'img'=>1), 'standby'=>array('object'=>1), 'start'=>array('ol'=>1), 'summary'=>array('table'=>1), 'tabindex'=>array('a'=>1, 'area'=>1, 'button'=>1, 'input'=>1, 'object'=>1, 'select'=>1, 'textarea'=>1), 'target'=>array('a'=>1, 'area'=>1, 'form'=>1), 'type'=>array('a'=>1, 'embed'=>1, 'object'=>1, 'param'=>1, 'script'=>1, 'input'=>1, 'li'=>1, 'ol'=>1, 'ul'=>1, 'button'=>1), 'usemap'=>array('img'=>1, 'input'=>1, 'object'=>1), 'valign'=>array('col'=>1, 'colgroup'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'value'=>array('input'=>1, 'option'=>1, 'param'=>1, 'button'=>1, 'li'=>1), 'valuetype'=>array('param'=>1), 'vspace'=>array('applet'=>1, 'img'=>1, 'object'=>1), 'width'=>array('embed'=>1, 'hr'=>1, 'iframe'=>1, 'img'=>1, 'object'=>1, 'table'=>1, 'td'=>1, 'th'=>1, 'applet'=>1, 'col'=>1, 'colgroup'=>1, 'pre'=>1), 'wmode'=>array('embed'=>1), 'xml:space'=>array('pre'=>1, 'script'=>1, 'style'=>1)); // Ele-specific
static $aNE = array('checked'=>1, 'compact'=>1, 'declare'=>1, 'defer'=>1, 'disabled'=>1, 'ismap'=>1, 'multiple'=>1, 'nohref'=>1, 'noresize'=>1, 'noshade'=>1, 'nowrap'=>1, 'readonly'=>1, 'selected'=>1); // Empty static $aNE = array('checked'=>1, 'compact'=>1, 'declare'=>1, 'defer'=>1, 'disabled'=>1, 'ismap'=>1, 'multiple'=>1, 'nohref'=>1, 'noresize'=>1, 'noshade'=>1, 'nowrap'=>1, 'readonly'=>1, 'selected'=>1); // Empty
static $aNP = array('action'=>1, 'cite'=>1, 'classid'=>1, 'codebase'=>1, 'data'=>1, 'href'=>1, 'longdesc'=>1, 'model'=>1, 'pluginspage'=>1, 'pluginurl'=>1, 'usemap'=>1); // Need scheme check; excludes style, on* & src static $aNP = array('action'=>1, 'cite'=>1, 'classid'=>1, 'codebase'=>1, 'data'=>1, 'href'=>1, 'longdesc'=>1, 'model'=>1, 'pluginspage'=>1, 'pluginurl'=>1, 'usemap'=>1); // Need scheme check; excludes style, on* & src
static $aNU = array('class'=>array('param'=>1, 'script'=>1), 'dir'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'iframe'=>1, 'param'=>1, 'script'=>1), 'id'=>array('script'=>1), 'lang'=>array('applet'=>1, 'br'=>1, 'iframe'=>1, 'param'=>1, 'script'=>1), 'xml:lang'=>array('applet'=>1, 'br'=>1, 'iframe'=>1, 'param'=>1, 'script'=>1), 'onclick'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'ondblclick'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onkeydown'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onkeypress'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onkeyup'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onmousedown'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onmousemove'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onmouseout'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onmouseover'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onmouseup'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'style'=>array('param'=>1, 'script'=>1), 'title'=>array('param'=>1, 'script'=>1)); // Univ & exceptions static $aNU = array('class'=>array('param'=>1, 'script'=>1), 'dir'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'iframe'=>1, 'param'=>1, 'script'=>1), 'id'=>array('script'=>1), 'lang'=>array('applet'=>1, 'br'=>1, 'iframe'=>1, 'param'=>1, 'script'=>1), 'xml:lang'=>array('applet'=>1, 'br'=>1, 'iframe'=>1, 'param'=>1, 'script'=>1), 'onclick'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 
'param'=>1, 'script'=>1), 'ondblclick'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onkeydown'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onkeypress'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onkeyup'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onmousedown'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onmousemove'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onmouseout'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onmouseover'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'onmouseup'=>array('applet'=>1, 'bdo'=>1, 'br'=>1, 'font'=>1, 'iframe'=>1, 'isindex'=>1, 'param'=>1, 'script'=>1), 'style'=>array('param'=>1, 'script'=>1), 'title'=>array('param'=>1, 'script'=>1)); // Univ & exceptions
@ -632,10 +632,10 @@ if($e == 'font'){
$a2 .= ' font-family: '. str_replace('"', '\'', trim($m[2])). ';'; $a2 .= ' font-family: '. str_replace('"', '\'', trim($m[2])). ';';
} }
if(preg_match('`color\s*=\s*(\'|")?(.+?)(\\1|\s|$)`i', $a, $m)){ if(preg_match('`color\s*=\s*(\'|")?(.+?)(\\1|\s|$)`i', $a, $m)){
$a2 .= ' color: '. trim($m[2]). ';'; $a2 .= ' color: '. str_replace('"', '\'', trim($m[2])). ';';
} }
if(preg_match('`size\s*=\s*(\'|")?(.+?)(\\1|\s|$)`i', $a, $m) && isset($fs[($m = trim($m[2]))])){ if(preg_match('`size\s*=\s*(\'|")?(.+?)(\\1|\s|$)`i', $a, $m) && isset($fs[($m = trim($m[2]))])){
$a2 .= ' font-size: '. $fs[$m]. ';'; $a2 .= ' font-size: '. str_replace('"', '\'', $fs[$m]). ';';
} }
$e = 'span'; return ltrim($a2); $e = 'span'; return ltrim($a2);
} }
@ -701,7 +701,7 @@ return str_replace(array("\x01", "\x02", "\x03", "\x04", "\x05", "\x07"), array(
public static function hl_version(){ public static function hl_version(){
// rel // rel
return '1.1.19'; return '1.1.20';
// eof // eof
} }

View File

@ -73,17 +73,86 @@ class OutputRules implements \Masterminds\HTML5\Serializer\RulesInterface
'nodeName'=>'img', 'nodeName'=>array('img', 'a'), 'nodeName'=>'img', 'nodeName'=>array('img', 'a'),
'attrName'=>'alt', 'attrName'=>array('title', 'alt'), 'attrName'=>'alt', 'attrName'=>array('title', 'alt'),
'prefixes'=>['xh'=>'http://www.w3.org/1999/xhtml'),
'xpath' => "@checked[../../xh:input[@type='radio' or @type='checkbox']]",
), ),
*/ */
array( array(
'nodeNamespace' => 'http://www.w3.org/1999/xhtml', 'nodeNamespace' => 'http://www.w3.org/1999/xhtml',
'attrName'=>array('alt', 'title'), 'attrName' => array('href',
'hreflang',
'http-equiv',
'icon',
'id',
'keytype',
'kind',
'label',
'lang',
'language',
'list',
'maxlength',
'media',
'method',
'name',
'placeholder',
'rel',
'rows',
'rowspan',
'sandbox',
'spellcheck',
'scope',
'seamless',
'shape',
'size',
'sizes',
'span',
'src',
'srcdoc',
'srclang',
'srcset',
'start',
'step',
'style',
'summary',
'tabindex',
'target',
'title',
'type',
'value',
'width',
'border',
'charset',
'cite',
'class',
'code',
'codebase',
'color',
'cols',
'colspan',
'content',
'coords',
'data',
'datetime',
'default',
'dir',
'dirname',
'enctype',
'for',
'form',
'formaction',
'headers',
'height',
'accept',
'accept-charset',
'accesskey',
'action',
'align',
'alt',
'bgcolor',
),
),
array(
'nodeNamespace' => 'http://www.w3.org/1999/xhtml',
'xpath' => 'starts-with(local-name(), \'data-\')',
), ),
); );
const DOCTYPE = '<!DOCTYPE html>'; const DOCTYPE = '<!DOCTYPE html>';
@ -328,7 +397,7 @@ class OutputRules implements \Masterminds\HTML5\Serializer\RulesInterface
$xp->registerNamespace($nsPrefix, $ns); $xp->registerNamespace($nsPrefix, $ns);
} }
} }
if(!$xp->query($rule['xpath'], $attr->ownerElement)->length){ if(!$xp->evaluate($rule['xpath'], $attr)){
continue; continue;
} }
} }

View File

@ -1,4 +1,7 @@
# Release Notes # Release Notes
2.1.2 (2015-06-07)
- #82: Support for PHP7
- #84: Improved boolean attribute handling
2.1.1 (2015-03-23) 2.1.1 (2015-03-23)
- #78: Fixes bug where unmatched entity like string drops everything after &. - #78: Fixes bug where unmatched entity like string drops everything after &.

View File

@ -95,7 +95,7 @@ class Readability
// 'trimRe' => '/^\s+|\s+$/g', // PHP has trim() // 'trimRe' => '/^\s+|\s+$/g', // PHP has trim()
'normalize' => '/\s{2,}/', 'normalize' => '/\s{2,}/',
'killBreaks' => '/(<br\s*\/?>(\s|&nbsp;?)*){1,}/', 'killBreaks' => '/(<br\s*\/?>(\s|&nbsp;?)*){1,}/',
'video' => '!//(player\.|www\.)?(youtube\.com|vimeo\.com|viddler\.com|soundcloud\.com|twitch\.tv)!i', 'video' => '!//(player\.|www\.)?(youtube\.com|vimeo\.com|viddler\.com|soundcloud\.com|twitch\.tv|openload\.co)!i',
'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i' 'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i'
); );

View File

@ -3,8 +3,8 @@
// Author: Keyvan Minoukadeh // Author: Keyvan Minoukadeh
// Copyright (c) 2015 Keyvan Minoukadeh // Copyright (c) 2015 Keyvan Minoukadeh
// License: AGPLv3 // License: AGPLv3
// Version: 3.5 // Version: 3.6
// Date: 2015-05-29 // Date: 2016-02-17
// More info: http://fivefilters.org/content-only/ // More info: http://fivefilters.org/content-only/
// Help: http://help.fivefilters.org // Help: http://help.fivefilters.org
@ -129,6 +129,11 @@ if (!$options->enabled) {
die('The full-text RSS service is currently disabled'); die('The full-text RSS service is currently disabled');
} }
//////////////////////////////////
// Enable Cross-Origin Resource Sharing (CORS)
//////////////////////////////////
if ($options->cors) header('Access-Control-Allow-Origin: *');
//////////////////////////////// ////////////////////////////////
// Debug mode? // Debug mode?
// See the config file for debug options. // See the config file for debug options.
@ -307,6 +312,16 @@ if ($options->favour_feed_titles == 'user') {
$favour_feed_titles = $options->favour_feed_titles; $favour_feed_titles = $options->favour_feed_titles;
} }
///////////////////////////////////////////////
// Favour effective URL
///////////////////////////////////////////////
$favour_effective_url = false;
if ($options->favour_effective_url == 'user') {
$favour_effective_url = isset($_REQUEST['use_effective_url']);
} else {
$favour_effective_url = $options->favour_effective_url;
}
/////////////////////////////////////////////// ///////////////////////////////////////////////
// Include full content in output? // Include full content in output?
/////////////////////////////////////////////// ///////////////////////////////////////////////
@ -437,11 +452,6 @@ if (!empty($options->proxy_servers)) {
} }
} }
//////////////////////////////////
// Enable Cross-Origin Resource Sharing (CORS)
//////////////////////////////////
if ($options->cors) header('Access-Control-Allow-Origin: *');
////////////////////////////////// //////////////////////////////////
// Has the HTML been given in the request? // Has the HTML been given in the request?
////////////////////////////////// //////////////////////////////////
@ -459,7 +469,7 @@ if (isset($_REQUEST['inputhtml']) && _FF_FTR_MODE == 'simple') {
if ($options->caching) { if ($options->caching) {
debug('Caching is enabled...'); debug('Caching is enabled...');
$cache_id = md5($max.$url.(int)$valid_key.$accept.$links.(int)$favour_feed_titles.(int)$options->content.(int)$options->summary. $cache_id = md5($max.$url.(int)$valid_key.$accept.$links.(int)$favour_feed_titles.(int)$options->content.(int)$options->summary.
(int)$xss_filter.(int)$exclude_on_fail.$format.$detect_language.$parser.$user_submitted_config._FF_FTR_MODE); (int)$xss_filter.(int)$favour_effective_url.(int)$exclude_on_fail.$format.$detect_language.$parser.$user_submitted_config._FF_FTR_MODE);
$check_cache = true; $check_cache = true;
if ($options->apc && $options->smart_cache) { if ($options->apc && $options->smart_cache) {
apc_add("cache.$cache_id", 0, $options->cache_time*60); apc_add("cache.$cache_id", 0, $options->cache_time*60);
@ -623,6 +633,7 @@ if ($accept === 'html' || !$result) {
public function get_enclosure($key=0, $prefer=null) { return null; } public function get_enclosure($key=0, $prefer=null) { return null; }
public function get_enclosures() { return null; } public function get_enclosures() { return null; }
public function get_categories() { return null; } public function get_categories() { return null; }
public function get_item_tags($namespace='', $tag='') { return null; }
} }
$feed = new DummySingleItemFeed($url); $feed = new DummySingleItemFeed($url);
} }
@ -916,7 +927,24 @@ foreach ($items as $key => $item) {
} }
} }
$newitem->addElement('guid', $item->get_permalink(), array('isPermaLink'=>'true')); // guid
$_guid = $item->get_permalink();
$_ispermalink = 'true';
$_g = $item->get_item_tags('', 'guid');
if (is_array($_g) && count($_g) > 0) {
$_ispermalink = null;
$_guid = $_g[0]['data'];
if (isset($_g[0]['attribs']) && isset($_g[0]['attribs']['']) && isset($_g[0]['attribs']['']['isPermaLink'])) {
$_ispermalink = $_g[0]['attribs']['']['isPermaLink'];
if ($_ispermalink !== 'true') $_ispermalink = 'false';
}
}
if (isset($_ispermalink)) {
$newitem->addElement('guid', $_guid, array('isPermaLink'=>$_ispermalink));
} else {
$newitem->addElement('guid', $_guid);
}
unset($_g, $_guid, $_ispermalink);
// filter xss? // filter xss?
if ($xss_filter) { if ($xss_filter) {
@ -1027,6 +1055,7 @@ foreach ($items as $key => $item) {
//http://www.siasat.pk/forum/showthread.php?108883-Pakistan-Chowk-by-Rana-Mubashir--25th-March-2012-Special-Program-from-Liari-(Karachi) //http://www.siasat.pk/forum/showthread.php?108883-Pakistan-Chowk-by-Rana-Mubashir--25th-March-2012-Special-Program-from-Liari-(Karachi)
//temporary measure: use utf8_encode() //temporary measure: use utf8_encode()
$newitem->addElement('dc:identifier', remove_url_cruft(utf8_encode($effective_url))); $newitem->addElement('dc:identifier', remove_url_cruft(utf8_encode($effective_url)));
if ($favour_effective_url) $newitem->setLink(remove_url_cruft(utf8_encode($effective_url)));
} else { } else {
$newitem->addElement('dc:identifier', remove_url_cruft($item->get_permalink())); $newitem->addElement('dc:identifier', remove_url_cruft($item->get_permalink()));
} }
@ -1162,6 +1191,7 @@ function get_self_url() {
if (isset($_GET['lang'])) $self .= '&lang='.urlencode($_GET['lang']); if (isset($_GET['lang'])) $self .= '&lang='.urlencode($_GET['lang']);
if (isset($_GET['xss'])) $self .= '&xss'; if (isset($_GET['xss'])) $self .= '&xss';
if (isset($_GET['use_extracted_title'])) $self .= '&use_extracted_title'; if (isset($_GET['use_extracted_title'])) $self .= '&use_extracted_title';
if (isset($_GET['use_effective_url'])) $self .= '&use_effective_url';
if (isset($_GET['content'])) $self .= '&content='.urlencode($_GET['content']); if (isset($_GET['content'])) $self .= '&content='.urlencode($_GET['content']);
if (isset($_GET['summary'])) $self .= '&summary='.urlencode($_GET['summary']); if (isset($_GET['summary'])) $self .= '&summary='.urlencode($_GET['summary']);
if (isset($_GET['debug'])) $self .= '&debug'; if (isset($_GET['debug'])) $self .= '&debug';

View File

@ -1,12 +1,14 @@
Full-Text RSS site config files Full-Text RSS site config files
================ ================
[Full-Text RSS](http://fivefilters.org/content-only/), our article extraction tool, makes use of site-specific extraction rules to improve results. Each time a URL is processed, it checks to see if there are extraction rules for the site being processed. If there are no site patterns, it tries to detect the content block automatically. [Full-Text RSS](http://fivefilters.org/content-only/), our article extraction tool, makes use of site-specific extraction rules to improve results. Each time a URL is processed, it checks to see if there are extraction rules for the site being processed. If there are no rules found, it tries to detect the content block automatically.
This repository contains the site config files we use in Full-Text RSS. This repository contains the site-specific extraction rules we rely on in Full-Text RSS.
### Contributing changes ### Contributing changes
We run automated tests on these files to detect issues. If you'd like to help keep these up to date, please look at the [test results](http://siteconfig.fivefilters.org/test/) and see which files you'd like to contribute fixes for.
We chose GitHub for this set of files because they offer one feature which we hope will make contributing changes easier: [file editing](https://github.com/blog/844-forking-with-the-edit-button) through the web interface. We chose GitHub for this set of files because they offer one feature which we hope will make contributing changes easier: [file editing](https://github.com/blog/844-forking-with-the-edit-button) through the web interface.
You can now make changes to any of our site config files and request that your changes be pulled into the main set we maintain. This is what GitHub calls the Fork and Pull model: You can now make changes to any of our site config files and request that your changes be pulled into the main set we maintain. This is what GitHub calls the Fork and Pull model:
@ -19,7 +21,9 @@ If a site is not in our set, you can create a file for it in the same way. See [
### How to write a site config file ### How to write a site config file
Please see our [help page](http://help.fivefilters.org/customer/portal/articles/223153-site-patterns) for a brief guide. We hope to have some tutorials up soon. The quickest and simplest way is to use our [point-and-click interface](http://siteconfig.fivefilters.org). It's a simple tool only intended to create a rule to extract the correct content block.
For further refinements, e.g. selecting the title, stripping elements, dealing with multi-page articles, please see our [help page](http://help.fivefilters.org/customer/portal/articles/223153-site-patterns).
### Instapaper ### Instapaper
@ -29,7 +33,7 @@ Marco, Instapaper's creator, graciously opened up the database of contributions
> And, recognizing that your efforts could be useful to a wide range of other tools and services, I'll make the list of all of these site-specific configurations available to the public, free, with no strings attached. > And, recognizing that your efforts could be useful to a wide range of other tools and services, I'll make the list of all of these site-specific configurations available to the public, free, with no strings attached.
Most of the extraction rules in our set are borrowed from Instapaper. You can see the list maintained by Instapaper at [instapaper.com/bodytext/](http://instapaper.com/bodytext/) (login required). Most of the extraction rules in our set are borrowed from Instapaper. You can see the list maintained by Instapaper at [instapaper.com/bodytext/](http://instapaper.com/bodytext/) (no longer available since Instapaper was sold).
### Testing site config files ### Testing site config files

272
ubuntu-15.10.pp Normal file
View File

@ -0,0 +1,272 @@
# Puppet file intended to install server components for self-hosted FiveFilters.org web services
# This file is intended for base images of:
# Ubuntu 15.10
# Please see here for more information on how to use this:
# http://help.fivefilters.org/customer/en/portal/articles/1143210-hosting
# Default search path applied to every exec resource in this manifest.
Exec {
  path => '/bin:/usr/bin:/usr/local/bin',
}

# Two extra run stages wrapped around the built-in 'main' stage.
stage { 'first':
  before => Stage['main'],
}
stage { 'last':
  require => Stage['main'],
}

# Assign the init class to the 'first' stage and the final class to the
# 'last' stage.
class { 'init':
  stage => 'first',
}
class { 'final':
  stage => 'last',
}
# Refreshes the apt package index, installs fail2ban and unattended-upgrades,
# and writes the APT::Periodic settings file.
class init {
  # Refresh the apt package index.
  exec { 'apt-update':
    command => 'apt-get update',
  }

  package { ['fail2ban', 'unattended-upgrades']:
    ensure => latest,
  }

  # APT::Periodic settings for package-list updates and unattended upgrades;
  # written only once the unattended-upgrades package is installed.
  file { '/etc/apt/apt.conf.d/20auto-upgrades':
    ensure  => present,
    require => Package['unattended-upgrades'],
    content => 'APT::Periodic::Update-Package-Lists "1";
APT::Periodic::Unattended-Upgrade "1";',
  }
}
# Order the apt-update exec ahead of every package resource in the catalog.
Exec['apt-update'] -> Package <| |>
# Installs Apache 2, tunes the prefork MPM, replaces the default site with a
# "fivefilters" virtual host, enables mod_rewrite and disables mod_status.
class apache {
  package { "apache2":
    ensure => latest
  }

  service { "apache2":
    ensure  => running,
    require => Package["apache2"]
  }

  # Only runs when another resource notifies it (refreshonly).
  exec { "restart-apache":
    command     => "sudo service apache2 restart",
    require     => Package["apache2"],
    refreshonly => true
  }

  exec { "enable-mod_rewrite":
    require => Package["apache2"],
    before  => Service["apache2"],
    #command => "/usr/sbin/a2enmod rewrite",
    command => "sudo a2enmod rewrite",
  }

  exec { "disable-mod_status":
    require => Package["apache2"],
    before  => Service["apache2"],
    command => "sudo a2dismod status",
  }

  # Prefork MPM limits; a change to this file triggers an Apache restart.
  file { "/etc/apache2/mods-available/mpm_prefork.conf":
    ensure  => present,
    content => "<IfModule mpm_prefork_module>
StartServers 5
MinSpareServers 5
MaxSpareServers 10
MaxRequestWorkers 80
MaxConnectionsPerChild 0
</IfModule>",
    require => Package["apache2"],
    notify  => Exec["restart-apache"]
  }

  # Virtual host definition. The content is deliberately single-quoted:
  # ${APACHE_LOG_DIR} must reach the file verbatim so that Apache expands it.
  # Inside a double-quoted Puppet string it would be treated as a Puppet
  # interpolation instead.
  file { "/etc/apache2/sites-available/fivefilters.conf":
    ensure  => present,
    content => '<VirtualHost *:80>
ServerAdmin webmaster@localhost
DocumentRoot /var/www/html
ErrorLog ${APACHE_LOG_DIR}/error.log
CustomLog /dev/null combined
#CustomLog ${APACHE_LOG_DIR}/access.log combined
KeepAliveTimeout 2
MaxKeepAliveRequests 10
</VirtualHost>',
    require => Package["apache2"],
    before  => Exec["enable-fivefilters-apache2"],
    notify  => Exec["restart-apache"]
  }

  # Swap the stock default site for the fivefilters one.
  # NOTE(review): nothing here orders the restart after this exec, so the
  # site switch may only take effect on a later Apache restart - confirm.
  exec { "enable-fivefilters-apache2":
    require => [Package["apache2"], Service["apache2"]],
    command => "sudo a2dissite 000-default && sudo a2ensite fivefilters"
  }

  #TODO: Set AllowOverride All in default config to enable .htaccess
}
# Installs PHP 5 with the extensions and build tooling used by the later
# classes, and drops in a fivefilters-specific php.ini fragment.
class php {
  package { "php5": ensure => latest }
  package { "libapache2-mod-php5": ensure => latest }
  package { "php5-cli": ensure => latest }
  package { "php5-tidy": ensure => latest }
  package { "php5-curl": ensure => latest }
  package { "libcurl4-gnutls-dev": ensure => latest }
  package { "libpcre3-dev": ensure => latest }
  package { "make": ensure => latest }
  package { "php-pear": ensure => latest }
  package { "php5-dev": ensure => latest }
  package { "php5-intl": ensure => latest }
  package { "php5-gd": ensure => latest }
  package { "php5-imagick": ensure => latest }
  package { "php5-json": ensure => latest }
  #package { "php-http": ensure => latest }
  package { "php5-raphf": ensure => latest }
  package { "php5-propro": ensure => latest }

  # ini overrides for the service. The last key was previously misspelled
  # "date.timezoe", which PHP ignores, so the timezone was never set.
  file { "/etc/php5/mods-available/fivefilters-php.ini":
    ensure  => present,
    content => "engine = On
expose_php = Off
max_execution_time = 120
memory_limit = 128M
error_reporting = E_ALL & ~E_DEPRECATED
display_errors = Off
display_startup_errors = Off
html_errors = Off
default_socket_timeout = 120
file_uploads = Off
date.timezone = 'UTC'",
    require => Package["php5"],
    before  => Exec["enable-fivefilters-php"],
  }

  # Activate the ini fragment written above.
  exec { "enable-fivefilters-php":
    command => "sudo php5enmod fivefilters-php",
  }
}
# Builds and enables the pecl_http extension (2.5.5) for PHP 5.
class php_pecl_http {
  # Development libraries that must be installed before the PECL install runs.
  package { ['libidn11-dev', 'libevent-dev']:
    ensure => latest,
    before => Exec['install-http-pecl'],
  }

  # Important: this ini file has to exist before the HTTP extension is
  # installed, hence the ordering against both execs below.
  file { '/etc/php5/mods-available/http.ini':
    ensure  => present,
    require => Class['php'],
    before  => [Exec['install-http-pecl'], Exec['enable-http']],
    #owner => root, group => root, mode => 444,
    content => '; priority=25
extension=raphf.so
extension=propro.so
extension=http.so',
  }

  exec { 'enable-http':
    require => Class['php'],
    command => 'sudo php5enmod http',
  }

  # Pinned to 2.5.5: plain "pecl install pecl_http" now resolves to
  # version 3.0, which requires PHP 7.
  exec { 'install-http-pecl':
    require => Exec['enable-http'],
    command => 'pecl install https://pecl.php.net/get/pecl_http-2.5.5.tgz',
    #command => "sudo pecl install pecl_http",
    #command => "pecl install http://pecl.php.net/get/pecl_http-1.7.6.tgz",
    #creates => "/tmp/needed/directory",
  }
}
# Installs APCu 4.0.10 from PECL, registers it with PHP 5 and restarts Apache.
class php_pecl_apcu {
  exec { 'install-apcu-pecl':
    require => Class['php'],
    command => 'sudo pecl install channel://pecl.php.net/APCu-4.0.10',
    #creates => "/tmp/needed/directory",
  }

  # ini fragment loading the extension; written after the PECL install and
  # before the module is enabled.
  file { '/etc/php5/mods-available/apcu.ini':
    ensure  => present,
    require => Exec['install-apcu-pecl'],
    before  => Exec['enable-apcu'],
    #owner => root, group => root, mode => 444,
    content => 'extension=apcu.so',
  }

  exec { 'enable-apcu':
    command => 'sudo php5enmod apcu',
    notify  => Exec['restart-apache'],
  }
}
# Builds and enables the php-cld extension from source.
# see https://github.com/lstrojny/php-cld
class php_cld {
  package { "git": ensure => latest }
  package { "build-essential": ensure => latest }

  # Remove any existing checkout so the clone below starts from scratch.
  file { "/tmp/cld":
    ensure  => absent,
    before  => Exec["download-cld"],
    recurse => true,
    force   => true
  }

  # Cloned over HTTPS: GitHub no longer serves the unauthenticated git://
  # protocol, so the previous git:// URL cannot be fetched.
  exec { "download-cld":
    command => "git clone https://github.com/lstrojny/php-cld.git /tmp/cld",
    require => [Package["git"], Class["php"]],
    before  => Exec["build-cld"]
  }

  exec { "checkout-cld-version":
    # recent version does not work, so we switch to an older one
    command => "git reset --hard fd5aa5721b01bfe547ff6674fa0daa9c3b791ca3",
    cwd     => "/tmp/cld",
    require => Exec["download-cld"],
    before  => Exec["build-cld"]
  }

  # Build the bundled libcld library.
  exec { "build-cld":
    command  => "./build.sh",
    #new cld:command => "sh compile_libs.sh",
    cwd      => "/tmp/cld/vendor/libcld",
    require  => Package["build-essential"],
    provider => "shell"
  }

  # Compile and install the PHP extension against the library built above.
  exec { "install-cld-extension":
    command  => "phpize && ./configure --with-libcld-dir=/tmp/cld/vendor/libcld && make && sudo make install",
    cwd      => "/tmp/cld",
    provider => "shell",
    require  => Exec["build-cld"]
  }

  file { "/etc/php5/mods-available/cld.ini":
    ensure  => present,
    #owner => root, group => root, mode => 444,
    content => "extension=cld.so",
    require => Exec["install-cld-extension"],
    before  => Exec["enable-cld"],
  }

  exec { "enable-cld":
    command => "sudo php5enmod cld",
    notify  => Exec["restart-apache"],
  }
}
# Lowers vm.swappiness to 10, both persistently and for the running kernel.
class final {
  # The "unless" guard skips the exec once the exact line is present in
  # /etc/sysctl.conf; without it every Puppet run appended a duplicate line.
  exec { "lower-swappiness":
    command  => "echo 'vm.swappiness = 10' >> /etc/sysctl.conf && sudo sysctl -p",
    unless   => "grep -qxF 'vm.swappiness = 10' /etc/sysctl.conf",
    provider => "shell"
  }
}
# Declare every class defined in this manifest, in the original order.
include init, apache, php, php_pecl_apcu, php_cld, php_pecl_http, final