Full-Text RSS 3.2

This commit is contained in:
FiveFilters.org 2014-05-15 23:03:31 +02:00
parent 4960fa9f2a
commit ec7275f58d
30 changed files with 2877 additions and 789 deletions

1383
admin/apc.php Normal file

File diff suppressed because it is too large Load Diff

View File

@ -3,7 +3,7 @@
// Author: Keyvan Minoukadeh // Author: Keyvan Minoukadeh
// Copyright (c) 2013 Keyvan Minoukadeh // Copyright (c) 2013 Keyvan Minoukadeh
// License: AGPLv3 // License: AGPLv3
// Date: 2013-02-25 // Date: 2013-05-09
// More info: http://fivefilters.org/content-only/ // More info: http://fivefilters.org/content-only/
// Help: http://help.fivefilters.org // Help: http://help.fivefilters.org
@ -57,7 +57,7 @@ require_once('require_login.php');
require_once('template.php'); require_once('template.php');
tpl_header('Edit site patterns'); tpl_header('Edit site patterns');
$version = include('../site_config/standard/version.php'); $version = file_get_contents('../site_config/standard/version.txt');
function filter_only_text($filename) { function filter_only_text($filename) {
return (strtolower(substr($filename, -4)) == '.txt'); return (strtolower(substr($filename, -4)) == '.txt');

View File

@ -40,4 +40,8 @@ tpl_header('Admin');
?> ?>
<p>The admin pages are intended to help you manage your copy of Full-Text RSS more easily.</p> <p>The admin pages are intended to help you manage your copy of Full-Text RSS more easily.</p>
<p>We currently offer an experimental <a href="update.php">update tool</a> which you can use to update your site patterns.</p> <ul>
<li><a href="update.php">Update patterns</a>: an easy way to keep site config files up to date.</li>
<li><a href="edit-pattern.php">Edit patterns</a>: need to fine-tune extraction for a certain site? Use this tool.</li>
<li><a href="apc.php?OB=3">APC</a>: If APC is enabled, you can use this tool to see what Full-Text RSS caches, and clear the cache if you need to.</li>
</ul>

View File

@ -1,4 +1,5 @@
<?php <?php
header("X-Robots-Tag: noindex, nofollow", true);
session_start(); session_start();
require_once(dirname(dirname(__FILE__)).'/config.php'); require_once(dirname(dirname(__FILE__)).'/config.php');
if (!isset($options->admin_credentials) || $options->admin_credentials['username'] == '' || $options->admin_credentials['password'] == '') { if (!isset($options->admin_credentials) || $options->admin_credentials['username'] == '' || $options->admin_credentials['password'] == '') {

View File

@ -1,9 +1,9 @@
<?php <?php
// Require login for admin access // Require login for admin access
// Author: Keyvan Minoukadeh // Author: Keyvan Minoukadeh
// Copyright (c) 2012 Keyvan Minoukadeh // Copyright (c) 2013 Keyvan Minoukadeh
// License: AGPLv3 // License: AGPLv3
// Date: 2012-08-30 // Date: 2013-05-09
// More info: http://fivefilters.org/content-only/ // More info: http://fivefilters.org/content-only/
// Help: http://help.fivefilters.org // Help: http://help.fivefilters.org
@ -38,58 +38,4 @@ if (!isset($_SESSION['auth']) || $_SESSION['auth'] != 1) {
header('Location: login.php'); header('Location: login.php');
} }
exit; exit;
} }
/* HTTP DIGEST authentication - doesn't work without server tweaks in FastCGI environments
$realm = 'Restricted area';
//user => password
$users = array($options->admin_credentials['username'] => $options->admin_credentials['password']);
if (empty($_SERVER['PHP_AUTH_DIGEST'])) {
header('HTTP/1.1 401 Unauthorized');
header('WWW-Authenticate: Digest realm="'.$realm.
'",qop="auth",nonce="'.uniqid().'",opaque="'.md5($realm).'"');
die('If you can\'t remember your admin credentials, open your custom_config.php and you\'ll find them in there.');
}
// analyze the PHP_AUTH_DIGEST variable
if (!($data = http_digest_parse($_SERVER['PHP_AUTH_DIGEST'])) ||
!isset($users[$data['username']]))
die('Wrong credentials!');
// generate the valid response
$A1 = md5($data['username'] . ':' . $realm . ':' . $users[$data['username']]);
$A2 = md5($_SERVER['REQUEST_METHOD'].':'.$data['uri']);
$valid_response = md5($A1.':'.$data['nonce'].':'.$data['nc'].':'.$data['cnonce'].':'.$data['qop'].':'.$A2);
if ($data['response'] != $valid_response)
die('Wrong credentials!');
// ok, valid username & password
// echo 'Thanks! You are now logged in.';
unset($realm, $users, $data, $A1, $A2, $valid_response);
// function to parse the http auth header
function http_digest_parse($txt)
{
// protect against missing data
$needed_parts = array('nonce'=>1, 'nc'=>1, 'cnonce'=>1, 'qop'=>1, 'username'=>1, 'uri'=>1, 'response'=>1);
$data = array();
$keys = implode('|', array_keys($needed_parts));
preg_match_all('@(' . $keys . ')=(?:([\'"])([^\2]+?)\2|([^\s,]+))@', $txt, $matches, PREG_SET_ORDER);
foreach ($matches as $m) {
$data[$m[1]] = $m[3] ? $m[3] : $m[4];
unset($needed_parts[$m[1]]);
}
return $needed_parts ? false : $data;
}
*/
?>

View File

@ -40,6 +40,7 @@ global $admin_page;
<ul class="nav"> <ul class="nav">
<li <?php if (@$admin_page == 'update') echo 'class="active"'; ?>><a href="update.php">Update patterns</a></li> <li <?php if (@$admin_page == 'update') echo 'class="active"'; ?>><a href="update.php">Update patterns</a></li>
<li <?php if (@$admin_page == 'edit-pattern') echo 'class="active"'; ?>><a href="edit-pattern.php">Edit patterns</a></li> <li <?php if (@$admin_page == 'edit-pattern') echo 'class="active"'; ?>><a href="edit-pattern.php">Edit patterns</a></li>
<li <?php if (@$admin_page == 'apc') echo 'class="active"'; ?>><a href="apc.php?OB=3">APC</a></li>
<li><a href="index.php?logout">Logout</a></li> <li><a href="index.php?logout">Logout</a></li>
</ul> </ul>
</div> </div>

View File

@ -1,9 +1,9 @@
<?php <?php
// Update site config files for Full-Text RSS // Update site config files for Full-Text RSS
// Author: Keyvan Minoukadeh // Author: Keyvan Minoukadeh
// Copyright (c) 2012 Keyvan Minoukadeh // Copyright (c) 2013 Keyvan Minoukadeh
// License: AGPLv3 // License: AGPLv3
// Date: 2012-04-13 // Date: 2013-05-12
// More info: http://fivefilters.org/content-only/ // More info: http://fivefilters.org/content-only/
// Help: http://help.fivefilters.org // Help: http://help.fivefilters.org
@ -24,7 +24,8 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
// Usage // Usage
// ----- // -----
// Access this file in your browser and follow the instructions to update your site config files. // * Access this file in your browser and follow the instructions to update your site config files.
// * See section on automatic updates for a URL you can fetch periodically (e.g. with cron) to update site config files
error_reporting(E_ALL ^ E_NOTICE); error_reporting(E_ALL ^ E_NOTICE);
ini_set("display_errors", 1); ini_set("display_errors", 1);
@ -35,20 +36,38 @@ ini_set("display_errors", 1);
//////////////////////////////// ////////////////////////////////
$admin_page = 'update'; $admin_page = 'update';
require_once('../config.php'); require_once('../config.php');
require_once('require_login.php'); require_once 'template.php';
require_once('template.php');
tpl_header('Update site patterns'); tpl_header('Update site patterns');
$version = include('../site_config/standard/version.php'); //////////////////////////////////
// Username and password must be available
//////////////////////////////////
if (!isset($options->admin_credentials) || $options->admin_credentials['username'] == '' || $options->admin_credentials['password'] == '') {
header("X-Robots-Tag: noindex, nofollow", true);
die('<h2>Username and password not set</h2><p>Full-Text RSS has not been configured with admin credentials.</p><p>If you are the administrator, please edit your <tt>custom_config.php</tt> file and enter the credentials in the appropriate section. When you\'ve done that, this page will prompt you for your admin credentials.</p>');
}
$admin_hash = sha1($options->admin_credentials['username'].'+'.$options->admin_credentials['password']);
$_self_host = $_SERVER['HTTP_HOST'];
$_self_path = rtrim(dirname($_SERVER['SCRIPT_NAME']), '/\\');
$self_update_url = 'http://'.htmlspecialchars($_self_host.$_self_path).'/update.php?key='.urlencode($admin_hash);
$latest_remote = 'https://codeload.github.com/fivefilters/ftr-site-config/zip/master';
$version = @file_get_contents('../site_config/standard/version.txt');
///////////////////////////////// /////////////////////////////////
// Check for valid update key // Check for update key
///////////////////////////////// /////////////////////////////////
if (!isset($_REQUEST['key']) || trim($_REQUEST['key']) == '') { if (!isset($_REQUEST['key']) || trim($_REQUEST['key']) == '') {
require_once 'require_login.php';
if ($_SERVER['REQUEST_METHOD'] == 'POST') { if ($_SERVER['REQUEST_METHOD'] == 'POST') {
header('Location: update.php'); header('Location: update.php');
exit; exit;
} }
$auto = true; $auto = true;
$no_auto_reasons = array(); $no_auto_reasons = array();
if (!class_exists('ZipArchive')) { if (!class_exists('ZipArchive')) {
@ -59,14 +78,8 @@ if (!isset($_REQUEST['key']) || trim($_REQUEST['key']) == '') {
$auto = false; $auto = false;
$no_auto_reasons[] = 'your <tt>site_config/</tt> folder is not writable - change permissions to 777 and try again.</p>'; $no_auto_reasons[] = 'your <tt>site_config/</tt> folder is not writable - change permissions to 777 and try again.</p>';
} }
if (!file_exists('../site_config/standard/version.php')) { if (!file_exists('../site_config/standard/version.txt')) {
die('Could not determine current version of your site pattern files (site_config/standard/version.php). Make sure you\'re using at least version 2.9.5 of Full-Text RSS.'); die('Could not determine current version of your site pattern files (site_config/standard/version.txt). Make sure you\'re using at least version 3.2 of Full-Text RSS.');
}
if (!@$options->registration_key) {
$input_field = '<label for="key">Registration key</label><input type="password" name="key" id="key" />';
} else {
$reg_key = preg_replace('/[^a-z0-9-]/i', '', $options->registration_key);
$input_field = '<input type="hidden" name="key" value="'.$reg_key.'" />';
} }
?> ?>
<p>You have Full-Text RSS <strong><?php echo _FF_FTR_VERSION; ?></strong> <p>You have Full-Text RSS <strong><?php echo _FF_FTR_VERSION; ?></strong>
@ -74,15 +87,19 @@ if (!isset($_REQUEST['key']) || trim($_REQUEST['key']) == '') {
</p> </p>
<p>To see if you have the latest versions, <a href="http://fivefilters.org/content-only/latest_version.php?version=<?php echo urlencode(_FF_FTR_VERSION).'&site_config='.urlencode(@$version); ?>">check for updates</a>.</p> <p>To see if you have the latest versions, <a href="http://fivefilters.org/content-only/latest_version.php?version=<?php echo urlencode(_FF_FTR_VERSION).'&site_config='.urlencode(@$version); ?>">check for updates</a>.</p>
<?php <?php
$reg_key_info = '<h3>Registration key</h3><p>This update tool requires a registration key issued by FiveFilters.org. You do not need a registration key to use Full-Text RSS, and none of the regular funtionality is affected if you do not have one. The update tool is simply a convenience service we offer our customers.</p>';
if ($auto) { if ($auto) {
echo '<p>This update tool will attempt to fetch the latest site patterns from FiveFilters.org and update yours.</p>'; echo '<p>This update tool will attempt to fetch the latest site patterns from our <a href="https://github.com/fivefilters/ftr-site-config/">GitHub repository</a>.</p>';
echo '<p><strong>Important: </strong>if you\'ve modified or added your own config files in the <tt>site_config/standard/</tt> folder, please move them to <tt>site_config/custom/</tt> &mdash; the update process will attempt to replace everything in <tt>site_config/standard/</tt> with our updated version.</p>'; echo '<p><strong>Important: </strong>if you\'ve modified or added your own config files in the <tt>site_config/standard/</tt> folder, please move them to <tt>site_config/custom/</tt> &mdash; the update process will attempt to replace everything in <tt>site_config/standard/</tt> with our updated version.</p>';
echo $reg_key_info; echo '<form method="post" action="update.php" class="well">';
if (!isset($reg_key)) { echo '<input type="hidden" name="key" value="'.$admin_hash.'" />';
echo '<p>Your registration key should be your PayPal or Avangate transaction ID. If you don\'t have a registration key, you will get one sent to you automatically when you <a href="http://fivefilters.org/content-only/">purchase Full-Text RSS</a> from FiveFilters.org.</p>'; echo '<input type="submit" value="Update site config files" />';
} echo '</form>';
echo '<form method="post" action="update.php" class="well">',$input_field,' <input type="submit" value="Update now" /></form>'; echo '<h3>Automatic updates</h3>';
echo '<p>You can schedule automatic updates using something like cron. The URL to call is:</p>';
echo '<p class="well">'.$self_update_url.'</p>';
echo '<p>We recommend you schedule this URL to be fetched once a day. If you do not have access to a scheduling service ';
echo 'you may want to consider one of these online services: <a href="http://www.easycron.com/">Easycron</a>, <a href="https://www.setcronjob.com/">SetCronJob</a>, <a href="http://www.onlinecronjobs.com/">onlinecronjobs.com</a>.</p>';
echo '<p>Note: the key contained in the URL is a hash value generated from your admin credentials. If you change these, the key will also change.</p>';
} else { } else {
echo '<div class="notice">'; echo '<div class="notice">';
echo '<p>We cannot automatically update your site pattern files because:</p>'; echo '<p>We cannot automatically update your site pattern files because:</p>';
@ -93,37 +110,56 @@ if (!isset($_REQUEST['key']) || trim($_REQUEST['key']) == '') {
echo '</ul>'; echo '</ul>';
echo '<p>You can still manually update by downloading the zip file and replacing everything in your <tt>site_config/standard/</tt> folder with the contents of the zip file.</p>'; echo '<p>You can still manually update by downloading the zip file and replacing everything in your <tt>site_config/standard/</tt> folder with the contents of the zip file.</p>';
echo '</div>'; echo '</div>';
echo $reg_key_info; echo '<p><a href="'.$latest_remote.'">Download site config files (zip)</a></p>';
if (!isset($reg_key)) {
echo '<p>Enter your registration key below to download the latest version of the site config files from FiveFilters.org</p>';
echo '<p>Your registration key should be your PayPal or Avangate transaction ID.</p>';
}
echo '<form method="post" class="well" action="http://fivefilters.org/content-only/update/get_site_config.php">',$input_field,' <input type="submit" value="Download site patterns" /></form>';
} }
echo '<h3>Help</h3>'; echo '<h3>Help</h3>';
echo '<p>If you have any trouble, please contact us via our <a href="http://help.fivefilters.org">support site</a>.</p>'; echo '<p>If you have any trouble, please contact us via our <a href="http://help.fivefilters.org">support site</a>.</p>';
exit; exit;
} }
//////////////////////////////////
// Check update key valid
//////////////////////////////////
if ($_REQUEST['key'] !== $admin_hash) {
println("Sorry, invalid key supplied.");
exit;
}
////////////////////////////////// //////////////////////////////////
// Check for updates // Check for updates
////////////////////////////////// //////////////////////////////////
$ff_version = (float)@file_get_contents('http://fivefilters.org/content-only/site_config/standard/version.txt'); //$ff_version = @file_get_contents('http://fivefilters.org/content-only/site_config/standard/version.txt');
if (version_compare($version, $ff_version) != -1) { $_context = stream_context_create(array('http' => array('user_agent' => 'PHP/5.4')));
$latest_info_json = @file_get_contents('https://api.github.com/repos/fivefilters/ftr-site-config', false, $_context);
if (!$latest_info_json) {
println("Sorry, couldn't get info on latest site config files. Please try again later or contact us.");
exit;
}
$latest_info_json = @json_decode($latest_info_json);
if (!is_object($latest_info_json)) {
println("Sorry, couldn't parse JSON from GitHub. Please try again later or contact us.");
exit;
}
$ff_version = $latest_info_json->updated_at;
if ($version == $ff_version) {
die('Your site config files are up to date! If you have trouble extracting from a particular site, please email us: help@fivefilters.org'); die('Your site config files are up to date! If you have trouble extracting from a particular site, please email us: help@fivefilters.org');
} else { } else {
println("Updated site patterns are available at FiveFilters.org (version $ff_version)..."); println("Updated site patterns are available (version $ff_version)...");
} }
////////////////////////////////// //////////////////////////////////
// Prepare // Prepare
////////////////////////////////// //////////////////////////////////
$latest_remote = 'http://fivefilters.org/content-only/update/get_site_config.php?key='.urlencode($_REQUEST['key']);
$tmp_latest_local = '../site_config/latest_site_config.zip'; $tmp_latest_local = '../site_config/latest_site_config.zip';
$tmp_latest_local_dir = '../site_config/standard_latest'; $tmp_latest_local_dir = '../site_config/standard_latest';
$tmp_old_local_dir = '../site_config/standard_old'; $tmp_old_local_dir = '../site_config/standard_old';
if (file_exists($tmp_latest_local)) unlink($tmp_latest_local); if (file_exists($tmp_latest_local)) unlink($tmp_latest_local);
if (file_exists($tmp_latest_local_dir)) rrmdir($tmp_latest_local_dir); if (file_exists($tmp_latest_local_dir)) {
if (!rrmdir($tmp_latest_local_dir)) {
println("Sorry, couldn't remove old folder from last update");
exit;
}
}
if (file_exists($tmp_old_local_dir)) { if (file_exists($tmp_old_local_dir)) {
rrmdir($tmp_old_local_dir); rrmdir($tmp_old_local_dir);
} }
@ -133,11 +169,8 @@ $standard_local_dir = '../site_config/standard/';
@file_put_contents($tmp_latest_local, @file_get_contents($latest_remote)); @file_put_contents($tmp_latest_local, @file_get_contents($latest_remote));
$headers = implode("\n", $http_response_header); $headers = implode("\n", $http_response_header);
//var_dump($headers); exit; //var_dump($headers); exit;
if (strpos($headers, 'HTTP/1.1 403') !== false) { if (strpos($headers, 'HTTP/1.0 200') === false) {
println("Invalid registration key supplied"); println("Sorry, something went wrong. Please contact us if the problem persists.");
exit;
} elseif (strpos($headers, 'HTTP/1.1 200') === false) {
println("Sorry, something went wrong. We're looking into it. Please contact us if the problem persists.");
exit; exit;
} }
if (class_exists('ZipArchive') && file_exists($tmp_latest_local)) { if (class_exists('ZipArchive') && file_exists($tmp_latest_local)) {
@ -149,15 +182,33 @@ if (class_exists('ZipArchive') && file_exists($tmp_latest_local)) {
@unlink($tmp_latest_local); @unlink($tmp_latest_local);
if (file_exists($tmp_latest_local_dir)) { if (file_exists($tmp_latest_local_dir)) {
println("Unzipped contents to $tmp_latest_local_dir"); println("Unzipped contents to $tmp_latest_local_dir");
if (!file_exists($tmp_latest_local_dir.'/version.php')) { if (!file_exists($tmp_latest_local_dir.'/ftr-site-config-master/README.md')) {
println("There was a problem extracting the latest site patterns archive - your current site patterns remain untouched."); println("There was a problem extracting the latest site patterns archive - your current site patterns remain untouched.");
println("Please <a href=\"$latest_remote\">update manually</a>."); println("Please <a href=\"$latest_remote\">update manually</a>.");
exit; exit;
} }
@file_put_contents($tmp_latest_local_dir.'/ftr-site-config-master/version.txt', $ff_version);
if (!file_exists($tmp_latest_local_dir.'/ftr-site-config-master/version.txt')) {
println("There was a problem writing the new version number - your current site patterns remain untouched.");
println("Please <a href=\"$latest_remote\">update manually</a>.");
exit;
}
rename($standard_local_dir, $tmp_old_local_dir); rename($standard_local_dir, $tmp_old_local_dir);
if (file_exists($tmp_old_local_dir)) println("Renamed $standard_local_dir to $tmp_old_local_dir"); if (file_exists($tmp_old_local_dir)) println("Renamed $standard_local_dir to $tmp_old_local_dir");
rename($tmp_latest_local_dir, $standard_local_dir); rename($tmp_latest_local_dir."/ftr-site-config-master", $standard_local_dir);
if (file_exists($standard_local_dir)) println("Renamed $tmp_latest_local_dir to $standard_local_dir"); if (file_exists($standard_local_dir)) println("Renamed $tmp_latest_local_dir/ftr-site-config-master to $standard_local_dir");
rmdir($tmp_latest_local_dir);
// clear cached site config files from APC
if ($options->apc && function_exists('apc_delete') && function_exists('apc_cache_info')) {
$_apc_data = apc_cache_info('user');
foreach ($_apc_data['cache_list'] as $_apc_item) {
if (substr($_apc_item['info'], 0, 3) == 'sc.') {
apc_delete($_apc_item['info']);
}
}
println('Cleared site config cache in APC.');
}
// all done!
println("<strong style=\"color: darkgreen;\">All done!</strong> Your old site config files are in $tmp_old_local_dir &mdash; these will be removed next time you go through the update process."); println("<strong style=\"color: darkgreen;\">All done!</strong> Your old site config files are in $tmp_old_local_dir &mdash; these will be removed next time you go through the update process.");
} else { } else {
if (file_exists($tmp_latest_local)) @unlink($tmp_latest_local); if (file_exists($tmp_latest_local)) @unlink($tmp_latest_local);
@ -179,13 +230,12 @@ function println($txt) {
} }
function rrmdir($dir) { function rrmdir($dir) {
foreach(glob($dir . '/{*.txt,*.php,.*.txt,.*.php}', GLOB_BRACE|GLOB_NOSORT) as $file) { foreach(glob($dir . '/{*.txt,*.php,.*.txt,.*.php,.gitattributes,.gitignore,ftr-site-config-master,README.md}', GLOB_BRACE|GLOB_NOSORT) as $file) {
if(is_dir($file)) { if(is_dir($file)) {
rrmdir($file); rrmdir($file);
} else { } else {
unlink($file); unlink($file);
} }
} }
rmdir($dir); return rmdir($dir);
} }
?>

View File

@ -2,11 +2,24 @@ FiveFilters.org: Full-Text RSS
http://fivefilters.org/content-only/ http://fivefilters.org/content-only/
CHANGELOG CHANGELOG
------------------------------------ ------------------------------------
3.2 (2013-05-14)
- A short excerpt from the first few lines of the extracted content can now be included in the output (pass &summary=1 in querystring, see $options->summary in config file for more info)
- Full content can now be excluded from the output (pass &content=0 in querystring, see $options->content in config file for more info)
- Site config files can now be automatically updated from our GitHub repository (URL to call visible in admin area)
- Site config files updated for better extraction
- PHP Readability updated to be more lenient when pruning HTML
- Language detection library updated
- HTML meta refresh redirects now also followed
- APC stats (if APC is available on your server) now visible in admin area
- Bug fix: Duplicate find_string and replace_string values in site config files no longer removed (thanks Fabrizio!)
- Bug fix: MIME type actions now applied when following single page URLs
- Other minor fixes/improvements
3.1 (2013-03-06) 3.1 (2013-03-06)
- PHP Readability updated to preserve more images/videos - PHP Readability updated to preserve more images/videos
- Site config files updated for better extraction - Site config files updated for better extraction
- SimplePie updated - SimplePie updated
- New site config option favour_feed_titles and request parameter use_extracted_title to allow extracted titles to be used in generated feed - New config option favour_feed_titles and request parameter use_extracted_title to allow extracted titles to be used in generated feed
- Remove image lazy loading (looks for markup used by http://wordpress.org/extend/plugins/lazy-load/) - Remove image lazy loading (looks for markup used by http://wordpress.org/extend/plugins/lazy-load/)
- <category> elements appearing inside <item> elements are now preserved in generated feed - <category> elements appearing inside <item> elements are now preserved in generated feed
- <media:thumbnail> elements now preserved - <media:thumbnail> elements now preserved

View File

@ -45,6 +45,60 @@ $options->default_entries = 5;
// 10, only 10 will be processed. // 10, only 10 will be processed.
$options->max_entries = 10; $options->max_entries = 10;
// Full content
// ----------------------
// By default Full-Text RSS includes the extracted content in the output.
// You can exclude this from the output by passing '&content=0' in the querystring.
//
// Possible values...
// Always include: true
// Never include: false
// Include unless user overrides (&content=0): 'user' (default)
//
// Note: currently this does not disable full content extraction. It simply omits it
// from the output.
$options->content = 'user';
// Excerpts
// ----------------------
// By default Full-Text RSS does not include excerpts in the output.
// You can enable this by passing '&summary=1' in the querystring.
// This will include a plain text excerpt from the extracted content.
//
// Possible values...
// Always include: true (recommended for new users)
// Never include: false
// Don't include unless user overrides (&summary=1): 'user' (default)
//
// Important: if both content and excerpts are requested, the excerpt will be
// placed in the description element and the full content inside content:encoded.
// If excerpts are not requested, the full content will go inside the description element.
//
// Why are we not returning both excerpts and content by default?
// Mainly for backward compatibility.
// Excerpts should appear in the feed item's description element. Previous versions
// of Full-Text RSS did not return excerpts, so the description element was always
// used for the full content (as recommended by the RSS advisory). When returning both,
// we need somewhere else to place the content (content:encoded).
// Having both enabled should not create any problems for news readers, but it may create
// problems for developers upgrading from one of our earlier versions who may now find
// their applications are returning excerpts instead of the full content they were
// expecting. To avoid such surprises for users who are upgrading Full-Text RSS,
// excerpts must be explicitly requested in the querystring by default.
//
// Why not use a different element name for excerpts?
// According to the RSS advisory:
// "Publishers who employ summaries should store the summary in description and
// the full content in content:encoded, ordering description first within the item.
// On items with no summary, the full content should be stored in description."
// See: http://www.rssboard.org/rss-profile#namespace-elements-content-encoded
//
// For more consistent element naming, we recommend new users set this option to true.
// The full content can still be excluded via the querystring, but the element names
// will not change: when $options->summary = true, the description element will always
// be reserved for the excerpt and content:encoded always for full content.
$options->summary = 'user';
// Rewrite relative URLs // Rewrite relative URLs
// ---------------------- // ----------------------
// With this enabled relative URLs found in the extracted content // With this enabled relative URLs found in the extracted content
@ -373,7 +427,7 @@ $options->cache_cleanup = 100;
/// DO NOT CHANGE ANYTHING BELOW THIS /////////// /// DO NOT CHANGE ANYTHING BELOW THIS ///////////
///////////////////////////////////////////////// /////////////////////////////////////////////////
if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '3.1'); if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '3.2');
if (basename(__FILE__) == 'config.php') { if (basename(__FILE__) == 'config.php') {
if (file_exists(dirname(__FILE__).'/custom_config.php')) { if (file_exists(dirname(__FILE__).'/custom_config.php')) {

View File

@ -1,5 +1,5 @@
<?xml version="1.0" encoding="utf-8"?> <?xml version="1.0" encoding="utf-8"?>
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"> <xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:content="http://purl.org/rss/1.0/modules/content/">
<xsl:output method="html" /> <xsl:output method="html" />
<xsl:variable name="title" select="/rss/channel/title"/> <xsl:variable name="title" select="/rss/channel/title"/>
<xsl:template match="/"> <xsl:template match="/">
@ -22,7 +22,12 @@
<xsl:for-each select="rss/channel/item"> <xsl:for-each select="rss/channel/item">
<div class="article"> <div class="article">
<li><a href="{link}" rel="bookmark"><xsl:value-of disable-output-escaping="yes" select="title"/></a> <li><a href="{link}" rel="bookmark"><xsl:value-of disable-output-escaping="yes" select="title"/></a>
<div><xsl:value-of disable-output-escaping="yes" select="description" /></div> <div>
<xsl:choose>
<xsl:when test="content:encoded"><xsl:value-of disable-output-escaping="yes" select="content:encoded" /></xsl:when>
<xsl:when test="description"><xsl:value-of disable-output-escaping="yes" select="description" /></xsl:when>
</xsl:choose>
</div>
</li> </li>
</div> </div>
</xsl:for-each> </xsl:for-each>

View File

@ -8,12 +8,15 @@ to it at www.example.com/ftr_compatibility_test.php
2) Open your web browser and go to the page you just uploaded. 2) Open your web browser and go to the page you just uploaded.
If things don't look right, have a look at our hosting suggestions:
http://help.fivefilters.org/customer/portal/articles/1143210-hosting
Note: This compatibility test has been borrowed (and slightly adapted) from the one supplied by Note: This compatibility test has been borrowed (and slightly adapted) from the one supplied by
SimplePie.org. We have kept most of their checks intact as we use SimplePie in our application. SimplePie.org. We have kept most of their checks intact as we use SimplePie in our application.
http://github.com/simplepie/simplepie/tree/master/compatibility_test/ http://github.com/simplepie/simplepie/tree/master/compatibility_test/
*/ */
$app_name = 'Full-Text RSS 3.1'; $app_name = 'Full-Text RSS 3.2';
$php_ok = (function_exists('version_compare') && version_compare(phpversion(), '5.2.0', '>=')); $php_ok = (function_exists('version_compare') && version_compare(phpversion(), '5.2.0', '>='));
$pcre_ok = extension_loaded('pcre'); $pcre_ok = extension_loaded('pcre');
@ -129,33 +132,33 @@ em strong {
text-transform: uppercase; text-transform: uppercase;
} }
table#chart { table.chart {
border-collapse:collapse; border-collapse:collapse;
} }
table#chart th { table.chart th {
background-color:#eee; background-color:#eee;
padding:2px 3px; padding:2px 3px;
border:1px solid #fff; border:1px solid #fff;
} }
table#chart td { table.chart td {
text-align:center; text-align:center;
padding:2px 3px; padding:2px 3px;
border:1px solid #eee; border:1px solid #eee;
} }
table#chart tr.enabled td { table.chart tr.enabled td {
/* Leave this alone */ /* Leave this alone */
} }
table#chart tr.disabled td, table.chart tr.disabled td,
table#chart tr.disabled td a { table.chart tr.disabled td a {
color:#999; color:#999;
font-style:italic; font-style:italic;
} }
table#chart tr.disabled td a { table.chart tr.disabled td a {
text-decoration:underline; text-decoration:underline;
} }
@ -186,7 +189,7 @@ div.chunk {
<div class="chunk"> <div class="chunk">
<h2 style="text-align:center;"><?php echo $app_name; ?>: Compatibility Test</h2> <h2 style="text-align:center;"><?php echo $app_name; ?>: Compatibility Test</h2>
<table cellpadding="0" cellspacing="0" border="0" width="100%" id="chart"> <table cellpadding="0" cellspacing="0" border="0" width="100%" class="chart">
<thead> <thead>
<tr> <tr>
<th>Test</th> <th>Test</th>
@ -253,7 +256,7 @@ div.chunk {
</tbody> </tbody>
</table> </table>
</div> </div>
<div class="chunk"> <div class="chunk">
<h3>What does this mean?</h3> <h3>What does this mean?</h3>
<ol> <ol>
@ -292,7 +295,7 @@ div.chunk {
<?php if ($tidy_ok): ?> <?php if ($tidy_ok): ?>
<li><strong>Tidy:</strong> You have <code>Tidy</code> support installed. No problems here.</li> <li><strong>Tidy:</strong> You have <code>Tidy</code> support installed. No problems here.</li>
<?php else: ?> <?php else: ?>
<li class="highlight"><strong>Tidy:</strong> The <code>Tidy</code> extension is not available. <?php echo $app_name; ?> should still work with most feeds/articles, but you may experience problems with some. If you do, we suggest you specify parsing with html5lib.</li> <li class="highlight"><strong>Tidy:</strong> The <code>Tidy</code> extension is not available. <?php echo $app_name; ?> should still work with most feeds/articles, but you may experience problems with some.</li>
<?php endif; ?> <?php endif; ?>
<?php if ($curl_ok): ?> <?php if ($curl_ok): ?>
@ -341,12 +344,58 @@ div.chunk {
<p><strong>Note</strong>: Passing this test does not guarantee that <?php echo $app_name; ?> will run on your webhost &mdash; it only ensures that the basic requirements have been addressed. If you experience any problems, please let us know.</p> <p><strong>Note</strong>: Passing this test does not guarantee that <?php echo $app_name; ?> will run on your webhost &mdash; it only ensures that the basic requirements have been addressed. If you experience any problems, please let us know.</p>
<?php } else { ?> <?php } else { ?>
<h3>Bottom Line: We're sorry…</h3> <h3>Bottom Line: We're sorry…</h3>
<p><em>Your webhost does not support the minimum requirements for <?php echo $app_name; ?>.</em> It may be a good idea to contact your webhost and point them to the results of this test. They may be able to enable/install the required components.</p> <p><em>Your webhost does not support the minimum requirements for <?php echo $app_name; ?>.</em> It may be a good idea to contact your webhost and point them to the results of this test. They may be able to enable/install the required components.</p> <p>Alternatively, you can try one of our <a href="http://help.fivefilters.org/customer/portal/articles/1143210-hosting">recommended hosts</a>.</p>
<?php } ?> <?php } ?>
</div> </div>
<div class="chunk">
<h3>Further info</h3>
<h4>HTTP module</h4>
<p>Full-Text RSS can make use of <code>HttpRequestPool</code> or <code>curl_multi</code> to make parallel HTTP requests when processing feeds. If neither are available, it will make sequential requests using <code>file_get_contents</code>.</p>
<?php
$http_type = 'file_get_contents';
if (extension_loaded('http') && class_exists('HttpRequestPool')) {
$http_type = 'HttpRequestPool';
} elseif ($curl_ok && function_exists('curl_multi_init')) {
$http_type = 'curl_multi';
}
?>
<p class="highlight"><strong><?php echo $http_type; ?></strong> will be used on this server.</p>
<h4>Alternative PHP Cache (APC)</h4>
<p>Full-Text RSS can make use of APC's memory cache to store site config files (when requested for the first time). This is not required, but if available it may improve performance slightly by reducing disk access.</p>
<?php
if (function_exists('apc_add')) {
echo '<p class="highlight"><strong>APC is available</strong> on this server.</p>';
} else {
echo '<p class="highlight">APC is not available on this server.</p>';
}
?>
<h4>Language detection</h4>
<p>Full-Text RSS can detect the language of each article processed. This occurs using <a href="http://pear.php.net/package/Text_LanguageDetect">Text_LanguageDetect</a> or <a href="https://github.com/lstrojny/php-cld">PHP-CLD</a> (if available).</p>
<?php
if (extension_loaded('cld') && (version_compare(PHP_VERSION, '5.3.0') >= 0)) {
echo '<p class="highlight"><strong>PHP-CLD</strong> will be used on this server.</p>';
} else {
echo '<p class="highlight"><strong>Text_LanguageDetect</strong> will be used on this server.</p>';
}
?>
<h4>Automatic site config updates</h4>
<p>Full-Text RSS can be configured to update its site config files (which determine how content should be extracted for certain sites) by downloading the latest set from our GitHub repository. This functionality is not required; the update can also be done manually. To configure this to occur automatically, you will need zip support enabled in PHP - we make use of the ZipArchive class.</p>
<?php
if (!class_exists('ZipArchive')) {
echo '<p class="highlight">ZipArchive is not available on this server. To update the site config files you will need to do it manually by downloading the latest set and uploading it to your server.</p>';
} else {
echo '<p class="highlight"><strong>ZipArchive is available</strong> on this server.</p>';
}
?>
</div>
<div class="chunk"> <div class="chunk">
<p class="footnote">This compatibility test has been borrowed (and slightly adapted) from the one supplied by <a href="http://simplepie.org/">SimplePie.org</a>. We have kept most of their checks intact as we use SimplePie in our application.</a></p> <p class="footnote">This compatibility test has been borrowed (and slightly adapted) from the one supplied by <a href="http://simplepie.org/">SimplePie.org</a>. We have kept most of their checks intact as we use SimplePie in our application.</a></p>
<p class="footnote">Date: <?php echo date('Y-m-d'); ?></p>
</div> </div>
</div> </div>

View File

@ -115,7 +115,16 @@ if (!defined('_FF_FTR_INDEX')) {
</div> </div>
<?php } ?> <?php } ?>
<?php if ($options->summary == 'user') { ?>
<div class="control-group"> <div class="control-group">
<label class="control-label" for="summary">Include excerpt</label>
<div class="controls">
<input type="checkbox" name="summary" value="1" id="summary" style="margin-top: 7px;" />
</div>
</div>
<?php } ?>
<div class="control-group" style="margin-top: -15px;">
<label class="control-label" for="json">JSON output</label> <label class="control-label" for="json">JSON output</label>
<div class="controls"> <div class="controls">
<input type="checkbox" name="format" value="json" id="json" style="margin-top: 7px;" /> <input type="checkbox" name="format" value="json" id="json" style="margin-top: 7px;" />
@ -170,7 +179,7 @@ if (!defined('_FF_FTR_INDEX')) {
<p><?php if (!file_exists('custom_config.php')) { ?>To change the configuration, save a copy of <tt>config.php</tt> as <tt>custom_config.php</tt> and make any changes you like to it.<?php } else { ?>To change the configuration, edit <tt>custom_config.php</tt> and make any changes you like.<?php } ?></p> <p><?php if (!file_exists('custom_config.php')) { ?>To change the configuration, save a copy of <tt>config.php</tt> as <tt>custom_config.php</tt> and make any changes you like to it.<?php } else { ?>To change the configuration, edit <tt>custom_config.php</tt> and make any changes you like.<?php } ?></p>
<h3>Manage and update site config files</h3> <h3>Manage and update site config files</h3>
<p>For best results, we suggest you update the site config files bundled with Full-Text RSS. If you've purchased Full-Text RSS from us, you'll receive an email when these are updated.</p> <p>For best results, we suggest you update the site config files bundled with Full-Text RSS.</p>
<p>The easiest way to update these is via the <a href="admin/">admin area</a>. (For advanced users, you'll also be able to edit and test the extraction rules contained in the site config files from the admin area.)</p> <p>The easiest way to update these is via the <a href="admin/">admin area</a>. (For advanced users, you'll also be able to edit and test the extraction rules contained in the site config files from the admin area.)</p>
<h3>Customise this page</h3> <h3>Customise this page</h3>
@ -253,9 +262,9 @@ if (!defined('_FF_FTR_INDEX')) {
<!-- UPDATES TAB --> <!-- UPDATES TAB -->
<div id="updates" class="tab-pane"> <div id="updates" class="tab-pane">
<?php <?php
$site_config_version_file = dirname(__FILE__).'/site_config/standard/version.php'; $site_config_version_file = dirname(__FILE__).'/site_config/standard/version.txt';
if (file_exists($site_config_version_file)) { if (file_exists($site_config_version_file)) {
$site_config_version = include($site_config_version_file); $site_config_version = file_get_contents($site_config_version_file);
} }
?> ?>
<p>Your version of Full-Text RSS: <strong><?php echo _FF_FTR_VERSION; ?></strong><br /> <p>Your version of Full-Text RSS: <strong><?php echo _FF_FTR_VERSION; ?></strong><br />

View File

@ -230,7 +230,7 @@ class ContentExtractor
$this->debug("...XPath match: $pattern"); $this->debug("...XPath match: $pattern");
// remove title from document // remove title from document
try { try {
$elems->item(0)->parentNode->removeChild($elems->item(0)); @$elems->item(0)->parentNode->removeChild($elems->item(0));
} catch (DOMException $e) { } catch (DOMException $e) {
// do nothing // do nothing
} }
@ -724,5 +724,4 @@ class ContentExtractor
public function getNextPageUrl() { public function getNextPageUrl() {
return $this->nextPageUrl; return $this->nextPageUrl;
} }
} }
?>

View File

@ -5,10 +5,10 @@
* Each instance of this class should hold extraction patterns and other directives * Each instance of this class should hold extraction patterns and other directives
* for a website. See ContentExtractor class to see how it's used. * for a website. See ContentExtractor class to see how it's used.
* *
* @version 0.7 * @version 0.8
* @date 2012-08-27 * @date 2013-04-16
* @author Keyvan Minoukadeh * @author Keyvan Minoukadeh
* @copyright 2012 Keyvan Minoukadeh * @copyright 2013 Keyvan Minoukadeh
* @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3 * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
*/ */
@ -180,7 +180,7 @@ class SiteConfig
public function append(SiteConfig $newconfig) { public function append(SiteConfig $newconfig) {
// check for commands where we accept multiple statements (no test_url) // check for commands where we accept multiple statements (no test_url)
foreach (array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header', 'find_string', 'replace_string') as $var) { foreach (array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header') as $var) {
// append array elements for this config variable from $newconfig to this config // append array elements for this config variable from $newconfig to this config
//$this->$var = $this->$var + $newconfig->$var; //$this->$var = $this->$var + $newconfig->$var;
$this->$var = array_unique(array_merge($this->$var, $newconfig->$var)); $this->$var = array_unique(array_merge($this->$var, $newconfig->$var));
@ -190,6 +190,12 @@ class SiteConfig
foreach (array('tidy', 'prune', 'parser', 'autodetect_on_failure') as $var) { foreach (array('tidy', 'prune', 'parser', 'autodetect_on_failure') as $var) {
if ($this->$var === null) $this->$var = $newconfig->$var; if ($this->$var === null) $this->$var = $newconfig->$var;
} }
// treat find_string and replace_string separately (don't apply array_unique) (thanks fabrizio!)
foreach (array('find_string', 'replace_string') as $var) {
// append array elements for this config variable from $newconfig to this config
//$this->$var = $this->$var + $newconfig->$var;
$this->$var = array_merge($this->$var, $newconfig->$var);
}
} }
// returns SiteConfig instance if an appropriate one is found, false otherwise // returns SiteConfig instance if an appropriate one is found, false otherwise
@ -334,5 +340,4 @@ class SiteConfig
} }
return $config; return $config;
} }
} }
?>

View File

@ -110,6 +110,11 @@ define('JSONP', 3, true);
} }
} }
/**
* Return this object's $items property.
*
* The method is declared with a leading & so the property is returned
* by reference rather than as a copy; callers must also assign with =&
* if they want modifications to affect $this->items.
*
* @return mixed Reference to $this->items
*/
public function &getItems()
{
return $this->items;
}
/** /**
* Create a new FeedItem. * Create a new FeedItem.
* *
@ -239,7 +244,7 @@ define('JSONP', 3, true);
{ {
$out = '<?xml version="1.0" encoding="utf-8"?>'."\n"; $out = '<?xml version="1.0" encoding="utf-8"?>'."\n";
if ($this->xsl) $out .= '<?xml-stylesheet type="text/xsl" href="'.htmlspecialchars($this->xsl).'"?>' . PHP_EOL; if ($this->xsl) $out .= '<?xml-stylesheet type="text/xsl" href="'.htmlspecialchars($this->xsl).'"?>' . PHP_EOL;
$out .= '<rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:media="http://search.yahoo.com/mrss/">' . PHP_EOL; $out .= '<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:media="http://search.yahoo.com/mrss/">' . PHP_EOL;
echo $out; echo $out;
} }
elseif ($this->version == JSON || $this->version == JSONP) elseif ($this->version == JSON || $this->version == JSONP)

View File

@ -134,6 +134,7 @@ class HTML5_TreeBuilder {
// Namespaces for foreign content // Namespaces for foreign content
const NS_HTML = null; // to prevent DOM from requiring NS on everything const NS_HTML = null; // to prevent DOM from requiring NS on everything
const NS_XHTML = 'http://www.w3.org/1999/xhtml';
const NS_MATHML = 'http://www.w3.org/1998/Math/MathML'; const NS_MATHML = 'http://www.w3.org/1998/Math/MathML';
const NS_SVG = 'http://www.w3.org/2000/svg'; const NS_SVG = 'http://www.w3.org/2000/svg';
const NS_XLINK = 'http://www.w3.org/1999/xlink'; const NS_XLINK = 'http://www.w3.org/1999/xlink';
@ -3157,11 +3158,19 @@ class HTML5_TreeBuilder {
} }
private function insertElement($token, $append = true) { private function insertElement($token, $append = true) {
$el = $this->dom->createElementNS(self::NS_HTML, $token['name']); //$el = $this->dom->createElementNS(self::NS_HTML, $token['name']);
$namespaceURI = strpos($token['name'], ':') ? self::NS_XHTML : self::NS_HTML;
$el = $this->dom->createElementNS($namespaceURI, $token['name']);
if (!empty($token['attr'])) { if (!empty($token['attr'])) {
foreach($token['attr'] as $attr) { foreach($token['attr'] as $attr) {
if(!$el->hasAttribute($attr['name'])) {
// mike@macgirvin.com 2011-11-17, check attribute name for
// validity (ignoring extenders and combiners) as illegal chars in names
// causes everything to abort
$valid = preg_match('/^[a-zA-Z\_\:]([\-a-zA-Z0-9\_\:\.]+$)/',$attr['name']);
if($attr['name'] && (!$el->hasAttribute($attr['name'])) && ($valid)) {
$el->setAttribute($attr['name'], $attr['value']); $el->setAttribute($attr['name'], $attr['value']);
} }
} }

View File

@ -400,5 +400,4 @@ class CookieJar
} }
return false; return false;
} }
} }
?>

View File

@ -7,11 +7,11 @@
* For environments which do not have these options, it reverts to standard sequential * For environments which do not have these options, it reverts to standard sequential
* requests (using file_get_contents()) * requests (using file_get_contents())
* *
* @version 1.1 * @version 1.4
* @date 2012-08-20 * @date 2013-05-10
* @see http://php.net/HttpRequestPool * @see http://php.net/HttpRequestPool
* @author Keyvan Minoukadeh * @author Keyvan Minoukadeh
* @copyright 2011-2012 Keyvan Minoukadeh * @copyright 2011-2013 Keyvan Minoukadeh
* @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3 * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
*/ */
@ -22,7 +22,7 @@ class HumbleHttpAgent
const METHOD_FILE_GET_CONTENTS = 4; const METHOD_FILE_GET_CONTENTS = 4;
//const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'; //const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1';
const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2'; const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2';
const UA_PHP = 'PHP/5.2'; const UA_PHP = 'PHP/5.4';
const REF_GOOGLE = 'http://www.google.co.uk/url?sa=t&source=web&cd=1'; const REF_GOOGLE = 'http://www.google.co.uk/url?sa=t&source=web&cd=1';
protected $requests = array(); protected $requests = array();
@ -82,6 +82,8 @@ class HumbleHttpAgent
// set request options (redirect must be 0) // set request options (redirect must be 0)
$this->requestOptions = array( $this->requestOptions = array(
'timeout' => 15, 'timeout' => 15,
'connecttimeout' => 15,
'dns_cache_timeout' => 300,
'redirect' => 0 // we handle redirects manually so we can rewrite the new hashbang URLs that are creeping up over the web 'redirect' => 0 // we handle redirects manually so we can rewrite the new hashbang URLs that are creeping up over the web
// TODO: test onprogress? // TODO: test onprogress?
); );
@ -155,6 +157,37 @@ class HumbleHttpAgent
return $iri->get_iri(); return $iri->get_iri();
} }
/**
 * Look for a redirect target declared inside an HTML document.
 *
 * A meta refresh redirect is tried first; when none is found the result
 * of getUglyURL() is returned instead.
 *
 * @param string $url  URL the HTML was fetched from
 * @param string $html HTML (or leading portion of it) to inspect
 * @return mixed whatever getMetaRefreshURL() returned if truthy, otherwise the getUglyURL() result
 */
public function getRedirectURLfromHTML($url, $html) {
    $metaRefreshTarget = $this->getMetaRefreshURL($url, $html);
    if ($metaRefreshTarget) {
        return $metaRefreshTarget;
    }
    return $this->getUglyURL($url, $html);
}
/**
 * Extract the target of a <meta http-equiv="refresh"> redirect.
 *
 * Example of a tag this recognises:
 * <meta HTTP-EQUIV="REFRESH" content="0; url=http://www.bernama.com/bernama/v6/newsindex.php?id=943513">
 *
 * @param string $url  URL the HTML was fetched from (base for relative targets)
 * @param string $html HTML to search
 * @return mixed the redirect target (absolute), or false when no usable redirect is found
 */
public function getMetaRefreshURL($url, $html) {
    if ($html == '') {
        return false;
    }
    $found = preg_match('!<meta http-equiv=["\']?refresh["\']? content=["\']?[0-9];\s*url=["\']?([^"\'>]+)["\']*>!i', $html, $match);
    if (!$found) {
        return false;
    }
    $target = $match[1];
    if (preg_match('!^https?://!i', $target)) {
        // target is already an absolute http(s) URL - use it as is
        $this->debug('Meta refresh redirect found (http-equiv="refresh"), new URL: '.$target);
        return $target;
    }
    // relative target: resolve it against the URL of the page
    $base = new SimplePie_IRI($url);
    if (isset($base->path)) {
        // collapse '//' runs in the base path (they cause URLs not to resolve properly)
        $base->path = preg_replace('!//+!', '/', $base->path);
    }
    $absolute = SimplePie_IRI::absolutize($base, $target);
    if (!$absolute) {
        return false;
    }
    $this->debug('Meta refresh redirect found (http-equiv="refresh"), new URL: '.$absolute);
    return $absolute;
}
public function getUglyURL($url, $html) { public function getUglyURL($url, $html) {
if ($html == '') return false; if ($html == '') return false;
$found = false; $found = false;
@ -173,7 +206,9 @@ class HumbleHttpAgent
} }
$query['_escaped_fragment_'] = ''; $query['_escaped_fragment_'] = '';
$iri->query = str_replace('%2F', '/', http_build_query($query)); // needed for some sites $iri->query = str_replace('%2F', '/', http_build_query($query)); // needed for some sites
return $iri->get_iri(); $ugly_url = $iri->get_iri();
$this->debug('AJAX trigger (meta name="fragment" content="!") found, new URL: '.$ugly_url);
return $ugly_url;
} }
public function removeFragment($url) { public function removeFragment($url) {
@ -339,9 +374,8 @@ class HumbleHttpAgent
// for AJAX sites, e.g. Blogger with its dynamic views templates. // for AJAX sites, e.g. Blogger with its dynamic views templates.
// Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification
if (isset($this->requests[$orig]['body'])) { if (isset($this->requests[$orig]['body'])) {
$redirectURL = $this->getUglyURL($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000)); $redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000));
if ($redirectURL) { if ($redirectURL) {
$this->debug('AJAX trigger (meta name="fragment" content="!") found. Queueing '.$redirectURL);
$this->redirectQueue[$orig] = $redirectURL; $this->redirectQueue[$orig] = $redirectURL;
} }
} }
@ -464,9 +498,8 @@ class HumbleHttpAgent
// for AJAX sites, e.g. Blogger with its dynamic views templates. // for AJAX sites, e.g. Blogger with its dynamic views templates.
// Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification
if (isset($this->requests[$orig]['body'])) { if (isset($this->requests[$orig]['body'])) {
$redirectURL = $this->getUglyURL($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000)); $redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000));
if ($redirectURL) { if ($redirectURL) {
$this->debug('AJAX trigger (meta name="fragment" content="!") found. Queueing '.$redirectURL);
$this->redirectQueue[$orig] = $redirectURL; $this->redirectQueue[$orig] = $redirectURL;
} }
} }
@ -551,9 +584,8 @@ class HumbleHttpAgent
// for AJAX sites, e.g. Blogger with its dynamic views templates. // for AJAX sites, e.g. Blogger with its dynamic views templates.
// Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification
if (isset($this->requests[$orig]['body'])) { if (isset($this->requests[$orig]['body'])) {
$redirectURL = $this->getUglyURL($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000)); $redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000));
if ($redirectURL) { if ($redirectURL) {
$this->debug('AJAX trigger (meta name="fragment" content="!") found. Queueing '.$redirectURL);
$this->redirectQueue[$orig] = $redirectURL; $this->redirectQueue[$orig] = $redirectURL;
} }
} }
@ -775,5 +807,4 @@ if (!function_exists('gzdecode')) {
} }
return $data; return $data;
} }
} }
?>

View File

@ -75,5 +75,4 @@ class SimplePie_HumbleHttpAgent extends SimplePie_File
$this->success = false; $this->success = false;
} }
} }
} }
?>

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,57 @@
<?php
/**
 * Exception type for Text_LanguageDetect.
 *
 * The class adds no behaviour to Exception; it only defines the numeric
 * codes below, grouped by tens: 1x database, 2x parameters, 3x language
 * lookup, 4x block detection, 5x clustering.
 */
class Text_LanguageDetect_Exception extends Exception
{
    /** The database file could not be found. */
    const DB_NOT_FOUND = 10;

    /** The database file exists but cannot be read. */
    const DB_NOT_READABLE = 11;

    /** The database file is empty. */
    const DB_EMPTY = 12;

    /** The database contents are not a PHP array. */
    const DB_NOT_ARRAY = 13;

    /** Magic quotes are activated. */
    const MAGIC_QUOTES = 14;

    /** A parameter of an invalid type was passed to a method. */
    const PARAM_TYPE = 20;

    /** A parameter contains an invalid character. */
    const INVALID_CHAR = 21;

    /** The language is not in the database. */
    const UNKNOWN_LANGUAGE = 30;

    /** An error occurred during block detection. */
    const BLOCK_DETECTION = 40;

    /** An error occurred while clustering languages. */
    const NO_HIGHEST_KEY = 50;
}

View File

@ -0,0 +1,339 @@
<?php
/**
* Part of Text_LanguageDetect
*
* PHP version 5
*
* @category Text
* @package Text_LanguageDetect
* @author Christian Weiske <cweiske@php.net>
* @copyright 2011 Christian Weiske <cweiske@php.net>
* @license http://www.debian.org/misc/bsd.license BSD
* @version SVN: $Id$
* @link http://pear.php.net/package/Text_LanguageDetect/
*/
/**
 * Provides a mapping between the languages from lang.dat and the
 * ISO 639-1 and ISO 639-2 codes.
 *
 * Note that this class contains only languages that exist in lang.dat.
 *
 * All four lookup methods are case-insensitive: the argument is
 * lower-cased before it is looked up in the corresponding table.
 *
 * @category Text
 * @package Text_LanguageDetect
 * @author Christian Weiske <cweiske@php.net>
 * @copyright 2011 Christian Weiske <cweiske@php.net>
 * @license http://www.debian.org/misc/bsd.license BSD
 * @link http://www.loc.gov/standards/iso639-2/php/code_list.php
 */
class Text_LanguageDetect_ISO639
{
    /**
     * Maps all language names from the language database to the
     * ISO 639-1 2-letter language code.
     *
     * NULL indicates that there is no 2-letter code.
     *
     * @var array
     */
    public static $nameToCode2 = array(
        'albanian' => 'sq',
        'arabic' => 'ar',
        'azeri' => 'az',
        'bengali' => 'bn',
        'bulgarian' => 'bg',
        'cebuano' => null,
        'croatian' => 'hr',
        'czech' => 'cs',
        'danish' => 'da',
        'dutch' => 'nl',
        'english' => 'en',
        'estonian' => 'et',
        'farsi' => 'fa',
        'finnish' => 'fi',
        'french' => 'fr',
        'german' => 'de',
        'hausa' => 'ha',
        'hawaiian' => null,
        'hindi' => 'hi',
        'hungarian' => 'hu',
        'icelandic' => 'is',
        'indonesian' => 'id',
        'italian' => 'it',
        'kazakh' => 'kk',
        'kyrgyz' => 'ky',
        'latin' => 'la',
        'latvian' => 'lv',
        'lithuanian' => 'lt',
        'macedonian' => 'mk',
        'mongolian' => 'mn',
        'nepali' => 'ne',
        'norwegian' => 'no',
        'pashto' => 'ps',
        'pidgin' => null,
        'polish' => 'pl',
        'portuguese' => 'pt',
        'romanian' => 'ro',
        'russian' => 'ru',
        'serbian' => 'sr',
        'slovak' => 'sk',
        'slovene' => 'sl',
        'somali' => 'so',
        'spanish' => 'es',
        'swahili' => 'sw',
        'swedish' => 'sv',
        'tagalog' => 'tl',
        'turkish' => 'tr',
        'ukrainian' => 'uk',
        'urdu' => 'ur',
        'uzbek' => 'uz',
        'vietnamese' => 'vi',
        'welsh' => 'cy',
    );

    /**
     * Maps all language names from the language database to the
     * ISO 639-2 3-letter language code.
     *
     * @var array
     */
    public static $nameToCode3 = array(
        'albanian' => 'sqi',
        'arabic' => 'ara',
        'azeri' => 'aze',
        'bengali' => 'ben',
        'bulgarian' => 'bul',
        'cebuano' => 'ceb',
        'croatian' => 'hrv',
        'czech' => 'ces',
        'danish' => 'dan',
        'dutch' => 'nld',
        'english' => 'eng',
        'estonian' => 'est',
        'farsi' => 'fas',
        'finnish' => 'fin',
        'french' => 'fra',
        'german' => 'deu',
        'hausa' => 'hau',
        'hawaiian' => 'haw',
        'hindi' => 'hin',
        'hungarian' => 'hun',
        'icelandic' => 'isl',
        'indonesian' => 'ind',
        'italian' => 'ita',
        'kazakh' => 'kaz',
        'kyrgyz' => 'kir',
        'latin' => 'lat',
        'latvian' => 'lav',
        'lithuanian' => 'lit',
        'macedonian' => 'mkd',
        'mongolian' => 'mon',
        'nepali' => 'nep',
        'norwegian' => 'nor',
        'pashto' => 'pus',
        'pidgin' => 'crp',
        'polish' => 'pol',
        'portuguese' => 'por',
        'romanian' => 'ron',
        'russian' => 'rus',
        'serbian' => 'srp',
        'slovak' => 'slk',
        'slovene' => 'slv',
        'somali' => 'som',
        'spanish' => 'spa',
        'swahili' => 'swa',
        'swedish' => 'swe',
        'tagalog' => 'tgl',
        'turkish' => 'tur',
        'ukrainian' => 'ukr',
        'urdu' => 'urd',
        'uzbek' => 'uzb',
        'vietnamese' => 'vie',
        'welsh' => 'cym',
    );

    /**
     * Maps ISO 639-1 2-letter language codes to the language names
     * in the language database
     *
     * Not all languages have a 2 letter code, so some are missing
     *
     * @var array
     */
    public static $code2ToName = array(
        'ar' => 'arabic',
        'az' => 'azeri',
        'bg' => 'bulgarian',
        'bn' => 'bengali',
        'cs' => 'czech',
        'cy' => 'welsh',
        'da' => 'danish',
        'de' => 'german',
        'en' => 'english',
        'es' => 'spanish',
        'et' => 'estonian',
        'fa' => 'farsi',
        'fi' => 'finnish',
        'fr' => 'french',
        'ha' => 'hausa',
        'hi' => 'hindi',
        'hr' => 'croatian',
        'hu' => 'hungarian',
        'id' => 'indonesian',
        'is' => 'icelandic',
        'it' => 'italian',
        'kk' => 'kazakh',
        'ky' => 'kyrgyz',
        'la' => 'latin',
        'lt' => 'lithuanian',
        'lv' => 'latvian',
        'mk' => 'macedonian',
        'mn' => 'mongolian',
        'ne' => 'nepali',
        'nl' => 'dutch',
        'no' => 'norwegian',
        'pl' => 'polish',
        'ps' => 'pashto',
        'pt' => 'portuguese',
        'ro' => 'romanian',
        'ru' => 'russian',
        'sk' => 'slovak',
        'sl' => 'slovene',
        'so' => 'somali',
        'sq' => 'albanian',
        'sr' => 'serbian',
        'sv' => 'swedish',
        'sw' => 'swahili',
        'tl' => 'tagalog',
        'tr' => 'turkish',
        'uk' => 'ukrainian',
        'ur' => 'urdu',
        'uz' => 'uzbek',
        'vi' => 'vietnamese',
    );

    /**
     * Maps ISO 639-2 3-letter language codes to the language names
     * in the language database.
     *
     * Every code here is the inverse of an entry in $nameToCode3, so
     * Romanian is keyed as 'ron' ('rom' is the ISO 639-2 code for Romany).
     *
     * @var array
     */
    public static $code3ToName = array(
        'ara' => 'arabic',
        'aze' => 'azeri',
        'ben' => 'bengali',
        'bul' => 'bulgarian',
        'ceb' => 'cebuano',
        'ces' => 'czech',
        'crp' => 'pidgin',
        'cym' => 'welsh',
        'dan' => 'danish',
        'deu' => 'german',
        'eng' => 'english',
        'est' => 'estonian',
        'fas' => 'farsi',
        'fin' => 'finnish',
        'fra' => 'french',
        'hau' => 'hausa',
        'haw' => 'hawaiian',
        'hin' => 'hindi',
        'hrv' => 'croatian',
        'hun' => 'hungarian',
        'ind' => 'indonesian',
        'isl' => 'icelandic',
        'ita' => 'italian',
        'kaz' => 'kazakh',
        'kir' => 'kyrgyz',
        'lat' => 'latin',
        'lav' => 'latvian',
        'lit' => 'lithuanian',
        'mkd' => 'macedonian',
        'mon' => 'mongolian',
        'nep' => 'nepali',
        'nld' => 'dutch',
        'nor' => 'norwegian',
        'pol' => 'polish',
        'por' => 'portuguese',
        'pus' => 'pashto',
        'ron' => 'romanian',
        'rus' => 'russian',
        'slk' => 'slovak',
        'slv' => 'slovene',
        'som' => 'somali',
        'spa' => 'spanish',
        'sqi' => 'albanian',
        'srp' => 'serbian',
        'swa' => 'swahili',
        'swe' => 'swedish',
        'tgl' => 'tagalog',
        'tur' => 'turkish',
        'ukr' => 'ukrainian',
        'urd' => 'urdu',
        'uzb' => 'uzbek',
        'vie' => 'vietnamese',
    );

    /**
     * Returns the 2-letter ISO 639-1 code for the given language name.
     *
     * @param string $lang English language name like "swedish"
     *
     * @return string Two-letter language code (e.g. "sv") or NULL if not found
     *                (also NULL for languages that have no 2-letter code)
     */
    public static function nameToCode2($lang)
    {
        $lang = strtolower($lang);
        // isset() is false for entries whose value is null, so languages
        // without a 2-letter code fall through to the null return as well
        if (!isset(self::$nameToCode2[$lang])) {
            return null;
        }
        return self::$nameToCode2[$lang];
    }

    /**
     * Returns the 3-letter ISO 639-2 code for the given language name.
     *
     * @param string $lang English language name like "swedish"
     *
     * @return string Three-letter language code (e.g. "swe") or NULL if not found
     */
    public static function nameToCode3($lang)
    {
        $lang = strtolower($lang);
        if (!isset(self::$nameToCode3[$lang])) {
            return null;
        }
        return self::$nameToCode3[$lang];
    }

    /**
     * Returns the language name for the given 2-letter ISO 639-1 code.
     *
     * @param string $code Two-letter language code (e.g. "sv"), any letter case
     *
     * @return string English language name like "swedish" or NULL if not found
     */
    public static function code2ToName($code)
    {
        // look up the lower-cased code, not the raw argument, so that
        // "SV" and "sv" resolve to the same entry
        $code = strtolower($code);
        if (!isset(self::$code2ToName[$code])) {
            return null;
        }
        return self::$code2ToName[$code];
    }

    /**
     * Returns the language name for the given 3-letter ISO 639-2 code.
     *
     * @param string $code Three-letter language code (e.g. "swe"), any letter case
     *
     * @return string English language name like "swedish" or NULL if not found
     */
    public static function code3ToName($code)
    {
        // look up the lower-cased code, not the raw argument, so that
        // "SWE" and "swe" resolve to the same entry
        $code = strtolower($code);
        if (!isset(self::$code3ToName[$code])) {
            return null;
        }
        return self::$code3ToName[$code];
    }
}

View File

@ -8,7 +8,7 @@
* @author Nicholas Pisarro * @author Nicholas Pisarro
* @copyright 2006 * @copyright 2006
* @license BSD * @license BSD
* @version CVS: $Id: Parser.php,v 1.5 2006/03/11 05:45:05 taak Exp $ * @version CVS: $Id: Parser.php 322327 2012-01-15 17:55:59Z cweiske $
* @link http://pear.php.net/package/Text_LanguageDetect/ * @link http://pear.php.net/package/Text_LanguageDetect/
* @link http://langdetect.blogspot.com/ * @link http://langdetect.blogspot.com/
*/ */
@ -28,7 +28,7 @@
* @author Nicholas Pisarro * @author Nicholas Pisarro
* @copyright 2006 * @copyright 2006
* @license BSD * @license BSD
* @version release: 0.2.3 * @version release: 0.3.0
*/ */
class Text_LanguageDetect_Parser extends Text_LanguageDetect class Text_LanguageDetect_Parser extends Text_LanguageDetect
{ {
@ -102,21 +102,17 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect
* @access private * @access private
* @param string $string string to be parsed * @param string $string string to be parsed
*/ */
function Text_LanguageDetect_Parser($string, $db=null, $unicode_db=null) { function Text_LanguageDetect_Parser($string) {
if (isset($db)) $this->_db_filename = $db;
if (isset($unicode_db)) $this->_unicode_db_filename = $unicode_db;
$this->_string = $string; $this->_string = $string;
} }
/** /**
* Returns true if a string is suitable for parsing * Returns true if a string is suitable for parsing
* *
* @static
* @access public
* @param string $str input string to test * @param string $str input string to test
* @return bool true if acceptable, false if not * @return bool true if acceptable, false if not
*/ */
function validateString($str) { public static function validateString($str) {
if (!empty($str) && strlen($str) > 3 && preg_match('/\S/', $str)) { if (!empty($str) && strlen($str) > 3 && preg_match('/\S/', $str)) {
return true; return true;
} else { } else {
@ -222,8 +218,7 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect
// unicode startup // unicode startup
if ($this->_compile_unicode) { if ($this->_compile_unicode) {
$blocks =& $this->_read_unicode_block_db(); $blocks = $this->_read_unicode_block_db();
$block_count = count($blocks); $block_count = count($blocks);
$skipped_count = 0; $skipped_count = 0;
@ -349,6 +344,4 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect
} }
} }
/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */ /* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */
?>

View File

@ -1059,8 +1059,8 @@ class Readability
} else if ( $input > floor($p/3) ) { } else if ( $input > floor($p/3) ) {
$this->dbg(' too many <input> elements'); $this->dbg(' too many <input> elements');
$toRemove = true; $toRemove = true;
} else if ($contentLength < 25 && ($embedCount === 0 && ($img === 0 || $img > 2))) { } else if ($contentLength < 10 && ($embedCount === 0 && ($img === 0 || $img > 2))) {
$this->dbg(' content length less than 25 chars, 0 embeds and either 0 images or more than 2 images'); $this->dbg(' content length less than 10 chars, 0 embeds and either 0 images or more than 2 images');
$toRemove = true; $toRemove = true;
} else if($weight < 25 && $linkDensity > 0.2) { } else if($weight < 25 && $linkDensity > 0.2) {
$this->dbg(' weight smaller than 25 and link density above 0.2'); $this->dbg(' weight smaller than 25 and link density above 0.2');

View File

@ -3,8 +3,8 @@
// Author: Keyvan Minoukadeh // Author: Keyvan Minoukadeh
// Copyright (c) 2013 Keyvan Minoukadeh // Copyright (c) 2013 Keyvan Minoukadeh
// License: AGPLv3 // License: AGPLv3
// Version: 3.1 // Version: 3.2
// Date: 2013-03-05 // Date: 2013-05-13
// More info: http://fivefilters.org/content-only/ // More info: http://fivefilters.org/content-only/
// Help: http://help.fivefilters.org // Help: http://help.fivefilters.org
@ -25,12 +25,8 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
// Usage // Usage
// ----- // -----
// Request this file passing it your feed in the querystring: makefulltextfeed.php?url=mysite.org // Request this file passing it a web page or feed URL in the querystring: makefulltextfeed.php?url=example.org/article
// The following options can be passed in the querystring: // For more request parameters, see http://help.fivefilters.org/customer/portal/articles/226660-usage
// * URL: url=[feed or website url] (required, should be URL-encoded - in php: urlencode($url))
// * URL points to HTML (not feed): html=true (optional, by default it's automatically detected)
// * API key: key=[api key] (optional, refer to config.php)
// * Max entries to process: max=[max number of items] (optional)
error_reporting(E_ALL ^ E_NOTICE); error_reporting(E_ALL ^ E_NOTICE);
ini_set("display_errors", 1); ini_set("display_errors", 1);
@ -199,6 +195,8 @@ if (isset($_GET['key']) && ($key_index = array_search($_GET['key'], $options->ap
if (isset($_GET['l'])) $redirect .= '&l='.urlencode($_GET['l']); if (isset($_GET['l'])) $redirect .= '&l='.urlencode($_GET['l']);
if (isset($_GET['xss'])) $redirect .= '&xss'; if (isset($_GET['xss'])) $redirect .= '&xss';
if (isset($_GET['use_extracted_title'])) $redirect .= '&use_extracted_title'; if (isset($_GET['use_extracted_title'])) $redirect .= '&use_extracted_title';
if (isset($_GET['content'])) $redirect .= '&content='.urlencode($_GET['content']);
if (isset($_GET['summary'])) $redirect .= '&summary='.urlencode($_GET['summary']);
if (isset($_GET['debug'])) $redirect .= '&debug'; if (isset($_GET['debug'])) $redirect .= '&debug';
if ($debug_mode) { if ($debug_mode) {
debug('Redirecting to hide access key, follow URL below to continue'); debug('Redirecting to hide access key, follow URL below to continue');
@ -284,6 +282,28 @@ if ($options->favour_feed_titles == 'user') {
$favour_feed_titles = $options->favour_feed_titles; $favour_feed_titles = $options->favour_feed_titles;
} }
///////////////////////////////////////////////
// Include full content in output?
///////////////////////////////////////////////
// When the config leaves this decision to the caller ('user'), full content
// is included unless the request explicitly passes content=0.
if ($options->content === 'user') {
	$options->content = !(isset($_GET['content']) && $_GET['content'] === '0');
}
///////////////////////////////////////////////
// Include summaries in output?
///////////////////////////////////////////////
// When the config leaves this decision to the caller ('user'), summaries
// are only included when the request explicitly passes summary=1.
if ($options->summary === 'user') {
	$options->summary = (isset($_GET['summary']) && $_GET['summary'] === '1');
}
/////////////////////////////////////////////// ///////////////////////////////////////////////
// Exclude items if extraction fails // Exclude items if extraction fails
/////////////////////////////////////////////// ///////////////////////////////////////////////
@ -306,15 +326,6 @@ if ($options->detect_language === 'user') {
$detect_language = $options->detect_language; $detect_language = $options->detect_language;
} }
if ($detect_language >= 2) {
$language_codes = array('albanian' => 'sq','arabic' => 'ar','azeri' => 'az','bengali' => 'bn','bulgarian' => 'bg',
'cebuano' => 'ceb', // ISO 639-2
'croatian' => 'hr','czech' => 'cs','danish' => 'da','dutch' => 'nl','english' => 'en','estonian' => 'et','farsi' => 'fa','finnish' => 'fi','french' => 'fr','german' => 'de','hausa' => 'ha',
'hawaiian' => 'haw', // ISO 639-2
'hindi' => 'hi','hungarian' => 'hu','icelandic' => 'is','indonesian' => 'id','italian' => 'it','kazakh' => 'kk','kyrgyz' => 'ky','latin' => 'la','latvian' => 'lv','lithuanian' => 'lt','macedonian' => 'mk','mongolian' => 'mn','nepali' => 'ne','norwegian' => 'no','pashto' => 'ps',
'pidgin' => 'cpe', // ISO 639-2
'polish' => 'pl','portuguese' => 'pt','romanian' => 'ro','russian' => 'ru','serbian' => 'sr','slovak' => 'sk','slovene' => 'sl','somali' => 'so','spanish' => 'es','swahili' => 'sw','swedish' => 'sv','tagalog' => 'tl','turkish' => 'tr','ukrainian' => 'uk','urdu' => 'ur','uzbek' => 'uz','vietnamese' => 'vi','welsh' => 'cy');
}
$use_cld = extension_loaded('cld') && (version_compare(PHP_VERSION, '5.3.0') >= 0); $use_cld = extension_loaded('cld') && (version_compare(PHP_VERSION, '5.3.0') >= 0);
///////////////////////////////////// /////////////////////////////////////
@ -364,7 +375,7 @@ if ($options->cors) header('Access-Control-Allow-Origin: *');
////////////////////////////////// //////////////////////////////////
if ($options->caching) { if ($options->caching) {
debug('Caching is enabled...'); debug('Caching is enabled...');
$cache_id = md5($max.$url.$valid_key.$links.$favour_feed_titles.$xss_filter.$exclude_on_fail.$format.$detect_language.(int)isset($_GET['pubsub'])); $cache_id = md5($max.$url.(int)$valid_key.$links.(int)$favour_feed_titles.(int)$options->content.(int)$options->summary.(int)$xss_filter.(int)$exclude_on_fail.$format.$detect_language.(int)isset($_GET['pubsub']));
$check_cache = true; $check_cache = true;
if ($options->apc && $options->smart_cache) { if ($options->apc && $options->smart_cache) {
apc_add("cache.$cache_id", 0, 10*60); apc_add("cache.$cache_id", 0, 10*60);
@ -605,14 +616,33 @@ foreach ($items as $key => $item) {
$is_single_page = false; $is_single_page = false;
if ($single_page_response = getSinglePage($item, $html, $effective_url)) { if ($single_page_response = getSinglePage($item, $html, $effective_url)) {
$is_single_page = true; $is_single_page = true;
$html = $single_page_response['body'];
// remove strange things
$html = str_replace('</[>', '', $html);
$html = convert_to_utf8($html, $single_page_response['headers']);
$effective_url = $single_page_response['effective_url']; $effective_url = $single_page_response['effective_url'];
debug("Retrieved single-page view from $effective_url"); // check if action defined for returned Content-Type
$mime_info = get_mime_action_info($single_page_response['headers']);
if (isset($mime_info['action'])) {
if ($mime_info['action'] == 'exclude') {
continue; // skip this feed item entry
} elseif ($mime_info['action'] == 'link') {
if ($mime_info['type'] == 'image') {
$html = "<a href=\"$effective_url\"><img src=\"$effective_url\" alt=\"{$mime_info['name']}\" /></a>";
} else {
$html = "<a href=\"$effective_url\">Download {$mime_info['name']}</a>";
}
$extracted_title = $mime_info['name'];
$do_content_extraction = false;
}
}
if ($do_content_extraction) {
$html = $single_page_response['body'];
// remove strange things
$html = str_replace('</[>', '', $html);
$html = convert_to_utf8($html, $single_page_response['headers']);
debug("Retrieved single-page view from $effective_url");
}
unset($single_page_response); unset($single_page_response);
} }
}
if ($do_content_extraction) {
debug('--------'); debug('--------');
debug('Attempting to extract content'); debug('Attempting to extract content');
$extract_result = $extractor->process($html, $effective_url); $extract_result = $extractor->process($html, $effective_url);
@ -622,7 +652,7 @@ foreach ($items as $key => $item) {
// Deal with multi-page articles // Deal with multi-page articles
//die('Next: '.$extractor->getNextPageUrl()); //die('Next: '.$extractor->getNextPageUrl());
$is_multi_page = (!$is_single_page && $extract_result && $extractor->getNextPageUrl()); $is_multi_page = (!$is_single_page && $extract_result && $extractor->getNextPageUrl());
if ($options->multipage && $is_multi_page) { if ($options->multipage && $is_multi_page && $options->content) {
debug('--------'); debug('--------');
debug('Attempting to process multi-page article'); debug('Attempting to process multi-page article');
$multi_page_urls = array(); $multi_page_urls = array();
@ -660,13 +690,15 @@ foreach ($items as $key => $item) {
// did we successfully deal with this multi-page article? // did we successfully deal with this multi-page article?
if (empty($multi_page_content)) { if (empty($multi_page_content)) {
debug('Failed to extract all parts of multi-page article, so not going to include them'); debug('Failed to extract all parts of multi-page article, so not going to include them');
$multi_page_content[] = $readability->dom->createElement('p')->innerHTML = '<em>This article appears to continue on subsequent pages which we could not extract</em>'; $_page = $readability->dom->createElement('p');
$_page->innerHTML = '<em>This article appears to continue on subsequent pages which we could not extract</em>';
$multi_page_content[] = $_page;
} }
foreach ($multi_page_content as $_page) { foreach ($multi_page_content as $_page) {
$_page = $content_block->ownerDocument->importNode($_page, true); $_page = $content_block->ownerDocument->importNode($_page, true);
$content_block->appendChild($_page); $content_block->appendChild($_page);
} }
unset($multi_page_urls, $multi_page_content, $page_mime_info, $next_page_url); unset($multi_page_urls, $multi_page_content, $page_mime_info, $next_page_url, $_page);
} }
} }
// use extracted title for both feed and item title if we're using single-item dummy feed // use extracted title for both feed and item title if we're using single-item dummy feed
@ -713,7 +745,7 @@ foreach ($items as $key => $item) {
} else { } else {
$html = $content_block->ownerDocument->saveXML($content_block); // essentially outerHTML $html = $content_block->ownerDocument->saveXML($content_block); // essentially outerHTML
} }
unset($content_block); //unset($content_block);
// post-processing cleanup // post-processing cleanup
$html = preg_replace('!<p>[\s\h\v]*</p>!u', '', $html); $html = preg_replace('!<p>[\s\h\v]*</p>!u', '', $html);
if ($links == 'remove') { if ($links == 'remove') {
@ -726,130 +758,156 @@ foreach ($items as $key => $item) {
} }
} }
if ($valid_key && isset($_GET['pubsub'])) { // used only on fivefilters.org at the moment if ($valid_key && isset($_GET['pubsub'])) { // used only on fivefilters.org at the moment
$newitem->addElement('guid', 'http://fivefilters.org/content-only/redirect.php?url='.urlencode($item->get_permalink()), array('isPermaLink'=>'false')); $newitem->addElement('guid', 'http://fivefilters.org/content-only/redirect.php?url='.urlencode($item->get_permalink()), array('isPermaLink'=>'false'));
} else {
$newitem->addElement('guid', $item->get_permalink(), array('isPermaLink'=>'true'));
}
// filter xss?
if ($xss_filter) {
debug('Filtering HTML to remove XSS');
$html = htmLawed::hl($html, array('safe'=>1, 'deny_attribute'=>'style', 'comment'=>1, 'cdata'=>1));
}
// add content
if ($options->summary === true) {
// get summary
$summary = '';
if (!$do_content_extraction) {
$summary = $html;
} else { } else {
$newitem->addElement('guid', $item->get_permalink(), array('isPermaLink'=>'true')); // Try to get first few paragraphs
} if (isset($content_block) && ($content_block instanceof DOMElement)) {
// filter xss? $_paras = $content_block->getElementsByTagName('p');
if ($xss_filter) { foreach ($_paras as $_para) {
debug('Filtering HTML to remove XSS'); $summary .= preg_replace("/[\n\r\t ]+/", ' ', $_para->textContent).' ';
$html = htmLawed::hl($html, array('safe'=>1, 'deny_attribute'=>'style', 'comment'=>1, 'cdata'=>1)); if (strlen($summary) > 200) break;
}
$newitem->setDescription($html);
// set date
if ((int)$item->get_date('U') > 0) {
$newitem->setDate((int)$item->get_date('U'));
} elseif ($extractor->getDate()) {
$newitem->setDate($extractor->getDate());
}
// add authors
if ($authors = $item->get_authors()) {
foreach ($authors as $author) {
// for some feeds, SimplePie stores author's name as email, e.g. http://feeds.feedburner.com/nymag/intel
if ($author->get_name() !== null) {
$newitem->addElement('dc:creator', $author->get_name());
} elseif ($author->get_email() !== null) {
$newitem->addElement('dc:creator', $author->get_email());
} }
} } else {
} elseif ($authors = $extractor->getAuthors()) { $summary = $html;
//TODO: make sure the list size is reasonable
foreach ($authors as $author) {
// TODO: xpath often selects authors from other articles linked from the page.
// for now choose first item
$newitem->addElement('dc:creator', $author);
break;
} }
} }
unset($_paras, $_para);
// add language $summary = get_excerpt($summary);
if ($detect_language) { $newitem->setDescription($summary);
$language = $extractor->getLanguage(); if ($options->content) $newitem->setElement('content:encoded', $html);
if (!$language) $language = $feed->get_language(); } else {
if (($detect_language == 3 || (!$language && $detect_language == 2)) && $text_sample) { if ($options->content) $newitem->setDescription($html);
try { }
if ($use_cld) {
// Use PHP-CLD extension // set date
$php_cld = 'CLD\detect'; // in quotes to prevent PHP 5.2 parse error if ((int)$item->get_date('U') > 0) {
$res = $php_cld($text_sample); $newitem->setDate((int)$item->get_date('U'));
if (is_array($res) && count($res) > 0) { } elseif ($extractor->getDate()) {
$language = $res[0]['code']; $newitem->setDate($extractor->getDate());
} }
} else {
//die('what'); // add authors
// Use PEAR's Text_LanguageDetect if ($authors = $item->get_authors()) {
if (!isset($l)) { foreach ($authors as $author) {
$l = new Text_LanguageDetect('libraries/language-detect/lang.dat', 'libraries/language-detect/unicode_blocks.dat'); // for some feeds, SimplePie stores author's name as email, e.g. http://feeds.feedburner.com/nymag/intel
} if ($author->get_name() !== null) {
$l_result = $l->detect($text_sample, 1); $newitem->addElement('dc:creator', $author->get_name());
if (count($l_result) > 0) { } elseif ($author->get_email() !== null) {
$language = $language_codes[key($l_result)]; $newitem->addElement('dc:creator', $author->get_email());
} }
}
} elseif ($authors = $extractor->getAuthors()) {
//TODO: make sure the list size is reasonable
foreach ($authors as $author) {
// TODO: xpath often selects authors from other articles linked from the page.
// for now choose first item
$newitem->addElement('dc:creator', $author);
break;
}
}
// add language
if ($detect_language) {
$language = $extractor->getLanguage();
if (!$language) $language = $feed->get_language();
if (($detect_language == 3 || (!$language && $detect_language == 2)) && $text_sample) {
try {
if ($use_cld) {
// Use PHP-CLD extension
$php_cld = 'CLD\detect'; // in quotes to prevent PHP 5.2 parse error
$res = $php_cld($text_sample);
if (is_array($res) && count($res) > 0) {
$language = $res[0]['code'];
}
} else {
//die('what');
// Use PEAR's Text_LanguageDetect
if (!isset($l)) {
$l = new Text_LanguageDetect();
$l->setNameMode(2); // return ISO 639-1 codes (e.g. "en")
} }
} catch (Exception $e) { $l_result = $l->detect($text_sample, 1);
//die('error: '.$e); if (count($l_result) > 0) {
// do nothing $language = key($l_result);
}
}
if ($language && (strlen($language) < 7)) {
$newitem->addElement('dc:language', $language);
}
}
// add MIME type (if it appeared in our exclusions lists)
if (isset($mime_info['mime'])) $newitem->addElement('dc:format', $mime_info['mime']);
// add effective URL (URL after redirects)
if (isset($effective_url)) {
//TODO: ensure $effective_url is valid witout - sometimes it causes problems, e.g.
//http://www.siasat.pk/forum/showthread.php?108883-Pakistan-Chowk-by-Rana-Mubashir--25th-March-2012-Special-Program-from-Liari-(Karachi)
//temporary measure: use utf8_encode()
$newitem->addElement('dc:identifier', remove_url_cruft(utf8_encode($effective_url)));
} else {
$newitem->addElement('dc:identifier', remove_url_cruft($item->get_permalink()));
}
// add categories
if ($categories = $item->get_categories()) {
foreach ($categories as $category) {
if ($category->get_label() !== null) {
$newitem->addElement('category', $category->get_label());
}
}
}
// check for enclosures
if ($options->keep_enclosures) {
if ($enclosures = $item->get_enclosures()) {
foreach ($enclosures as $enclosure) {
// thumbnails
foreach ((array)$enclosure->get_thumbnails() as $thumbnail) {
$newitem->addElement('media:thumbnail', '', array('url'=>$thumbnail));
} }
if (!$enclosure->get_link()) continue;
$enc = array();
// Media RSS spec ($enc): http://search.yahoo.com/mrss
// SimplePie methods ($enclosure): http://simplepie.org/wiki/reference/start#methods4
$enc['url'] = $enclosure->get_link();
if ($enclosure->get_length()) $enc['fileSize'] = $enclosure->get_length();
if ($enclosure->get_type()) $enc['type'] = $enclosure->get_type();
if ($enclosure->get_medium()) $enc['medium'] = $enclosure->get_medium();
if ($enclosure->get_expression()) $enc['expression'] = $enclosure->get_expression();
if ($enclosure->get_bitrate()) $enc['bitrate'] = $enclosure->get_bitrate();
if ($enclosure->get_framerate()) $enc['framerate'] = $enclosure->get_framerate();
if ($enclosure->get_sampling_rate()) $enc['samplingrate'] = $enclosure->get_sampling_rate();
if ($enclosure->get_channels()) $enc['channels'] = $enclosure->get_channels();
if ($enclosure->get_duration()) $enc['duration'] = $enclosure->get_duration();
if ($enclosure->get_height()) $enc['height'] = $enclosure->get_height();
if ($enclosure->get_width()) $enc['width'] = $enclosure->get_width();
if ($enclosure->get_language()) $enc['lang'] = $enclosure->get_language();
$newitem->addElement('media:content', '', $enc);
} }
} catch (Exception $e) {
//die('error: '.$e);
// do nothing
} }
} }
/* } */ if ($language && (strlen($language) < 7)) {
$newitem->addElement('dc:language', $language);
}
}
// add MIME type (if it appeared in our exclusions lists)
if (isset($mime_info['mime'])) $newitem->addElement('dc:format', $mime_info['mime']);
// add effective URL (URL after redirects)
if (isset($effective_url)) {
//TODO: ensure $effective_url is valid witout - sometimes it causes problems, e.g.
//http://www.siasat.pk/forum/showthread.php?108883-Pakistan-Chowk-by-Rana-Mubashir--25th-March-2012-Special-Program-from-Liari-(Karachi)
//temporary measure: use utf8_encode()
$newitem->addElement('dc:identifier', remove_url_cruft(utf8_encode($effective_url)));
} else {
$newitem->addElement('dc:identifier', remove_url_cruft($item->get_permalink()));
}
// add categories
if ($categories = $item->get_categories()) {
foreach ($categories as $category) {
if ($category->get_label() !== null) {
$newitem->addElement('category', $category->get_label());
}
}
}
// check for enclosures
if ($options->keep_enclosures) {
if ($enclosures = $item->get_enclosures()) {
foreach ($enclosures as $enclosure) {
// thumbnails
foreach ((array)$enclosure->get_thumbnails() as $thumbnail) {
$newitem->addElement('media:thumbnail', '', array('url'=>$thumbnail));
}
if (!$enclosure->get_link()) continue;
$enc = array();
// Media RSS spec ($enc): http://search.yahoo.com/mrss
// SimplePie methods ($enclosure): http://simplepie.org/wiki/reference/start#methods4
$enc['url'] = $enclosure->get_link();
if ($enclosure->get_length()) $enc['fileSize'] = $enclosure->get_length();
if ($enclosure->get_type()) $enc['type'] = $enclosure->get_type();
if ($enclosure->get_medium()) $enc['medium'] = $enclosure->get_medium();
if ($enclosure->get_expression()) $enc['expression'] = $enclosure->get_expression();
if ($enclosure->get_bitrate()) $enc['bitrate'] = $enclosure->get_bitrate();
if ($enclosure->get_framerate()) $enc['framerate'] = $enclosure->get_framerate();
if ($enclosure->get_sampling_rate()) $enc['samplingrate'] = $enclosure->get_sampling_rate();
if ($enclosure->get_channels()) $enc['channels'] = $enclosure->get_channels();
if ($enclosure->get_duration()) $enc['duration'] = $enclosure->get_duration();
if ($enclosure->get_height()) $enc['height'] = $enclosure->get_height();
if ($enclosure->get_width()) $enc['width'] = $enclosure->get_width();
if ($enclosure->get_language()) $enc['lang'] = $enclosure->get_language();
$newitem->addElement('media:content', '', $enc);
}
}
}
$output->addItem($newitem); $output->addItem($newitem);
unset($html); unset($html);
$item_count++; $item_count++;
@ -906,6 +964,38 @@ if (!$debug_mode) {
// HELPER FUNCTIONS // HELPER FUNCTIONS
/////////////////////////////// ///////////////////////////////
// Adapted from WordPress
// http://core.trac.wordpress.org/browser/tags/3.5.1/wp-includes/formatting.php#L2173
/**
 * Build a plain-text excerpt from a chunk of HTML or text.
 *
 * Tags are stripped, runs of spaces, tabs and newlines are collapsed to a
 * single space, and the result is cut after $num_words words. When words
 * were dropped, $more is appended to the excerpt.
 *
 * @param string      $text      HTML or plain text to summarise
 * @param int         $num_words maximum number of words to keep (default 55)
 * @param string|null $more      suffix appended when the text was truncated;
 *                               null means '&hellip;'
 * @return string the excerpt (never null, even for input that is not valid UTF-8)
 */
function get_excerpt($text, $num_words=55, $more=null) {
	if (null === $more) $more = '&hellip;';
	$text = strip_tags($text);
	//TODO: Check if word count is based on single characters (East Asian characters).
	// In that case the text would have to be split per character
	// (preg_match_all('/./u', ...)) and joined with an empty separator
	// instead of being split on whitespace.
	$sep = ' ';
	// Ask for one piece more than we need so we can tell whether anything was cut off.
	$words_array = preg_split("/[\n\r\t ]+/", $text, $num_words + 1, PREG_SPLIT_NO_EMPTY);
	$truncated = count($words_array) > $num_words;
	if ($truncated) {
		// the last piece holds the unsplit remainder of the text
		array_pop($words_array);
	}
	$text = implode($sep, $words_array);
	if ($truncated) $text .= $more;
	// trim whitespace at beginning or end of string
	// See: http://stackoverflow.com/questions/4166896/trim-unicode-whitespace-in-php-5-2
	$trimmed = preg_replace('/^[\pZ\pC]+|[\pZ\pC]+$/u', '', $text);
	// preg_replace() returns null when $text is not valid UTF-8 (the /u modifier
	// rejects it); keep the untrimmed excerpt rather than returning null.
	return ($trimmed === null) ? $text : $trimmed;
}
function url_allowed($url) { function url_allowed($url) {
global $options; global $options;
if (!empty($options->allowed_urls)) { if (!empty($options->allowed_urls)) {
@ -1005,14 +1095,6 @@ function convert_to_utf8($html, $header=null)
if (strtolower($encoding) != 'utf-8') { if (strtolower($encoding) != 'utf-8') {
debug('Converting to UTF-8'); debug('Converting to UTF-8');
$html = SimplePie_Misc::change_encoding($html, $encoding, 'utf-8'); $html = SimplePie_Misc::change_encoding($html, $encoding, 'utf-8');
/*
if (function_exists('iconv')) {
// iconv appears to handle certain character encodings better than mb_convert_encoding
$html = iconv($encoding, 'utf-8', $html);
} else {
$html = mb_convert_encoding($html, 'utf-8', $encoding);
}
*/
} }
} }
} }

View File

@ -1,3 +1,5 @@
# This file is only used when deploying Full-Text RSS to AppFog.
# See http://help.fivefilters.org/customer/portal/articles/1143210-hosting
--- ---
applications: applications:
.: .:

View File

@ -1,3 +1,2 @@
<?php <?php
// this is here to prevent directory listing over the web // this is here to prevent directory listing over the web
?>

View File

@ -1,3 +0,0 @@
<?php
// this is here to prevent directory listing over the web
?>

View File

@ -1 +0,0 @@
<?php return 4; ?>

View File

@ -1 +1 @@
4 2013-05-12T22:53:07Z