Full-Text RSS 2.8

This commit is contained in:
Keyvan 2012-05-01 00:51:43 +02:00
parent f9f03f14c0
commit eeec0f1982
8 changed files with 448 additions and 311 deletions

View File

@ -2,6 +2,15 @@ FiveFilters.org: Full-Text RSS
http://fivefilters.org/content-only/
CHANGELOG
------------------------------------
2.8 (2011-05-30)
- Tidy no longer stripping HTML5 elements
- JSON output (pass &format=json in querystring)
- New site patterns added and old ones updated
- New site config option to force full-page retrieval on multi-page articles: single_page_link
- User Guide (PDF) now included (although still a work in progress)
- URL placeholders now accepted in message_to_prepend/append config options
- Plus minor fixes...
2.7 (2011-03-21)
- Site patterns for better control over extraction (see site_config/README.txt)
- hNews support (improves content extraction for sites using hNews microformatting)

View File

@ -74,11 +74,17 @@ $options->cache_dir = dirname(__FILE__).'/cache';
// Message to prepend (without API key)
// ----------------------
// HTML to insert at the beginning of each feed item when no API key is supplied.
// Substitution tags:
// {url} - Feed item URL
// {effective-url} - Feed item URL after we've followed all redirects
$options->message_to_prepend = '';
// Message to append (without API key)
// ----------------------
// HTML to insert at the end of each feed item when no API key is supplied.
// Substitution tags:
// {url} - Feed item URL
// {effective-url} - Feed item URL after we've followed all redirects
$options->message_to_append = '';
// URLs to allow
@ -188,7 +194,7 @@ $options->error_message_with_key = '[unable to retrieve full-text content]';
/// DO NOT CHANGE ANYTHING BELOW THIS ///////////
/////////////////////////////////////////////////
if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '2.7');
if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '2.8');
if ((basename(__FILE__) == 'config.php') && (file_exists(dirname(__FILE__).'/custom_config.php'))) {
require_once(dirname(__FILE__).'/custom_config.php');

View File

@ -13,7 +13,7 @@ SimplePie.org. We have kept most of their checks intact as we use SimplePie in o
http://github.com/simplepie/simplepie/tree/master/compatibility_test/
*/
$app_name = 'Full-Text RSS 2.7';
$app_name = 'Full-Text RSS 2.8';
$php_ok = (function_exists('version_compare') && version_compare(phpversion(), '5.2.0', '>='));
$pcre_ok = extension_loaded('pcre');
@ -327,7 +327,7 @@ div.chunk {
<p><em>Your webhost has its act together!</em></p>
<p>You can download the latest version of <?php echo $app_name; ?> from <a href="http://fivefilters.org/content-only/#download">FiveFilters.org</a>.</p>
<p><strong>Note</strong>: Passing this test does not guarantee that <?php echo $app_name; ?> will run on your webhost &mdash; it only ensures that the basic requirements have been addressed. If you experience any problems, please let us know.</p>
<?php } else if ($php_ok && $xml_ok && $pcre_ok && $allow_url_fopen_ok && $filter_ok) { ?>
<?php } else if ($php_ok && $xml_ok && $pcre_ok && $mbstring_ok && $allow_url_fopen_ok && $filter_ok) { ?>
<h3>Bottom Line: Yes, you can!</h3>
<p><em>For most feeds, it'll run with no problems.</em> There are certain languages that you might have a hard time with though.</p>
<p>You can download the latest version of <?php echo $app_name; ?> from <a href="http://fivefilters.org/content-only/#download">FiveFilters.org</a>.</p>

View File

@ -96,14 +96,16 @@ if (!defined('_FF_FTR_INDEX')) {
<p>Thanks for downloading and setting this up. If you haven't done so already, <a href="ftr_compatibility_test.php">check server compatibility</a>
to see if your environment will support this application. Full-Text RSS runs on most shared web hosting environments.</p>
<h3>Configure</h3>
<p>In addition to the options above, Full-Text RSS comes with a configuration file which allows you to control how the application works. Features include:</p>
<p>In addition to the options above, Full-Text RSS can be configured to better suit your needs. Features include:</p>
<ul>
<li>Site patterns for better control over extraction (<a href="site_config/README.txt">more info</a>)</li>
<li>Restrict access to a pre-defined set of URLs or block certain URLs</li>
<li>Restrict the maximum number of feed items to be processed</li>
<li>JSON output</li>
<li>Prepend or append an HTML fragment to each feed item processed</li>
<li>Caching</li>
</ul>
<p>Please refer to the <a href="http://fivefilters.org/content-only/guide/user_guide_2.8.pdf">user guide</a> for more information.</p>
<p><?php if (!file_exists('custom_config.php')) { ?>To change the configuration, save a copy of <tt>config.php</tt> as <tt>custom_config.php</tt> and make any changes you like to it.<?php } else { ?>To change the configuration, edit <tt>custom_config.php</tt> and make any changes you like.<?php } ?></p>
<p>If everything works fine, feel free to modify this page by saving it as <tt>custom_index.php</tt> and changing it to whatever you like.</p>
@ -118,7 +120,8 @@ if (!defined('_FF_FTR_INDEX')) {
<p>To see if you're running the latest version, <a href="http://fivefilters.org/content-only/latest_version.php?version=<?php echo urlencode(_FF_FTR_VERSION); ?>">check for updates</a>.</p>
<h3 id="donate">Support</h3>
<p>We have more information in the section below, but if you need help with anything, please email <a href="mailto:fivefilters@fivefilters.org">fivefilters@fivefilters.org</a>.</p>
<p>We have a <a href="https://member.fivefilters.org/f/">public forum</a> which anyone can use to discuss any issues, post questions and find answers (it's free to join and post).</p>
<p>We provide a little more information in the section below, but if you need help with anything, you can also email us at <a href="mailto:fivefilters@fivefilters.org">fivefilters@fivefilters.org</a>.</p>
<hr />

View File

@ -5,8 +5,8 @@
* Uses patterns specified in site config files and auto detection (hNews/PHP Readability)
* to extract content from HTML files.
*
* @version 0.5
* @date 2011-03-07
* @version 0.6
* @date 2011-05-04
* @author Keyvan Minoukadeh
* @copyright 2011 Keyvan Minoukadeh
* @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
@ -14,13 +14,13 @@
class ContentExtractor
{
const HOSTNAME_REGEX = '/^(([a-zA-Z0-9-]*[a-zA-Z0-9])\.)*([A-Za-z0-9-]*[A-Za-z0-9])$/';
protected static $config_cache = array();
protected static $tidy_config = array(
'clean' => true,
'output-xhtml' => true,
'logical-emphasis' => true,
'show-body-only' => false,
'new-blocklevel-tags' => 'article, aside, footer, header, hgroup, menu, nav, section, details, datagrid',
// The value is just the list of tag names (as for 'new-blocklevel-tags' above);
// the option name must not be repeated inside the value.
'new-inline-tags' => 'mark, time, meter, progress',
'wrap' => 0,
'drop-empty-paras' => true,
'drop-proprietary-attributes' => false,
@ -31,19 +31,16 @@ class ContentExtractor
'char-encoding' => 'utf8',
'hide-comments' => true
);
protected $config_path;
protected $html;
protected $config;
protected $title;
protected $body;
protected $success = false;
protected $fallback;
public $readability;
public $debug = false;
function __construct($config_path=null, ContentExtractor $config_fallback=null) {
$this->config_path = $config_path;
$this->fallback = $config_fallback;
function __construct($path, $fallback=null) {
SiteConfig::set_config_path($path, $fallback);
}
protected function debug($msg) {
@ -66,71 +63,6 @@ class ContentExtractor
$this->success = false;
}
/**
 * Find the site config for a host name.
 *
 * The host is lower-cased and a leading "www." is dropped. Two names are then
 * tried in order: the host itself and ".parent.domain" (the host with its first
 * label removed). For each name the static cache is checked before the config
 * directory. If nothing is found and a fallback extractor was supplied, the
 * lookup is delegated to it.
 *
 * @param string $host host name taken from the URL being processed
 * @return SiteConfig|false parsed (or cached) config, false if none applies
 */
public function get_site_config($host) {
	$host = strtolower($host);
	if (substr($host, 0, 4) == 'www.') {
		$host = substr($host, 4);
	}
	$usable = $host && (strlen($host) <= 200) && preg_match(self::HOSTNAME_REGEX, $host);
	if (!$usable) {
		return false;
	}

	// names to look for: exact host first, then wildcard-style ".domain.tld"
	$candidates = array($host);
	$labels = explode('.', $host);
	if (count($labels) > 1) {
		$candidates[] = '.'.implode('.', array_slice($labels, 1));
	}

	$path = null;
	$matched = null;
	foreach ($candidates as $name) {
		if (array_key_exists($name, self::$config_cache)) {
			$this->debug("... cached ($name)");
			return self::$config_cache[$name];
		}
		$candidate_file = $this->config_path."/$name.txt";
		if (file_exists($candidate_file)) {
			$this->debug("... from file ($name)");
			$path = $candidate_file;
			$matched = $name;
			break;
		}
	}

	if ($path === null) {
		if (!isset($this->fallback)) {
			$this->debug("... no match ($host)");
			return false;
		}
		$this->debug("... trying fallback ($host)");
		return $this->fallback->get_site_config($host);
	}

	$lines = file($path, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
	if (!$lines || !is_array($lines)) {
		return false;
	}

	// directive groups, by how their values are stored
	$multi_valued = array('title', 'body', 'strip', 'strip_id_or_class', 'strip_image_src');
	$yes_no = array('tidy', 'prune', 'autodetect_on_failure');
	$plain_string = array('test_url');

	$config = new SiteConfig();
	foreach ($lines as $line) {
		$line = trim($line);
		// ignore blank lines and comments
		if ($line == '' || $line[0] == '#') {
			continue;
		}
		// a directive has the form "name: value"
		$parts = explode(':', $line, 2);
		if (count($parts) != 2) {
			continue;
		}
		$directive = trim($parts[0]);
		$value = trim($parts[1]);
		if ($directive == '' || $value == '') {
			continue;
		}
		if (in_array($directive, $multi_valued)) {
			// may appear several times; every value is kept
			array_push($config->$directive, $value);
		} elseif (in_array($directive, $yes_no)) {
			// only the literal "yes" turns the option on
			$config->$directive = ($value == 'yes');
		} elseif (in_array($directive, $plain_string)) {
			$config->$directive = $value;
		}
	}

	// remember the parsed config under the name that matched, for later URLs
	self::$config_cache[$matched] = $config;
	return $config;
}
// returns true on success, false on failure
// $smart_tidy indicates that if tidy is used and no results are produced, we will
// try again without it. Tidy helps us deal with PHP's patchy HTML parsing most of the time
@ -140,11 +72,12 @@ class ContentExtractor
// extract host name
$host = @parse_url($url, PHP_URL_HOST);
if (!($this->config = $this->get_site_config($host))) {
if (!($this->config = SiteConfig::build($host))) {
// no match, so use defaults
$this->config = new SiteConfig();
self::$config_cache[$host] = $this->config;
}
// store copy of config in our static cache array in case we need to process another URL
SiteConfig::add_to_cache($host, $this->config);
// use tidy (if it exists)?
// This fixes problems with some sites which would otherwise

View File

@ -47,5 +47,120 @@ class SiteConfig
// Test URL - if present, can be used to test the config above
public $test_url = null;
// Single-page link - should identify a link element or URL pointing to the page holding the entire article
// This is useful for sites which split their articles across multiple pages. Links to such pages tend to
// display the first page with links to the other pages at the bottom. Often there is also a link to a page
// which displays the entire article on one page (e.g. 'print view').
// This should be an XPath expression identifying the link to that page. If present and we find a match,
// we will retrieve that page and the rest of the options in this config will be applied to the new page.
public $single_page_link = array();
// Single-page link in feed? - same as above, but patterns applied to item description HTML taken from feed
public $single_page_link_in_feed = array();
// TODO: which parser to use for turning raw HTML into a DOMDocument
public $parser = 'libxml';
// the options below cannot be set in the config files which this class represents
public static $debug = false;
protected static $config_path;
protected static $config_path_fallback;
protected static $config_cache = array();
const HOSTNAME_REGEX = '/^(([a-zA-Z0-9-]*[a-zA-Z0-9])\.)*([A-Za-z0-9-]*[A-Za-z0-9])$/';
/**
 * Print a debug line with current and peak memory usage (in KB), then flush
 * the output. Does nothing unless SiteConfig::$debug is true.
 *
 * @param string $msg text to print
 */
protected static function debug($msg) {
	if (!self::$debug) {
		return;
	}
	$used_kb = round(memory_get_usage() / 1024, 2);
	$peak_kb = round(memory_get_peak_usage() / 1024, 2);
	echo '* ', $msg, ' - mem used: ', $used_kb, ' (peak: ', $peak_kb, ")\n";
	// send the line out immediately rather than waiting for the script to finish
	ob_flush();
	flush();
}
/**
 * Set the directories in which build() looks for site config files.
 *
 * @param string $path primary config directory
 * @param string|null $fallback directory consulted when the primary one has no match
 */
public static function set_config_path($path, $fallback=null) {
	self::$config_path_fallback = $fallback;
	self::$config_path = $path;
}
/**
 * Store a config in the static cache under the lower-cased host name.
 *
 * @param string $host host name used as the cache key
 * @param SiteConfig $config config to remember
 */
public static function add_to_cache($host, SiteConfig $config) {
	self::$config_cache[strtolower($host)] = $config;
}
/**
 * Build a SiteConfig for a host name.
 *
 * The host is lower-cased and a leading "www." is dropped. Two names are then
 * tried in order: the host itself and ".parent.domain" (the host with its first
 * label removed). For each name the static cache is checked before the primary
 * config directory. If neither yields a match, the fallback directory (when
 * set) is searched for the same names.
 *
 * @param string $host host name taken from the URL being processed
 * @return SiteConfig|false parsed (or cached) config, false if none applies
 */
public static function build($host) {
	$host = strtolower($host);
	if (substr($host, 0, 4) == 'www.') {
		$host = substr($host, 4);
	}
	$usable = $host && (strlen($host) <= 200) && preg_match(self::HOSTNAME_REGEX, $host);
	if (!$usable) {
		return false;
	}

	// names to look for: exact host first, then wildcard-style ".domain.tld"
	$candidates = array($host);
	$labels = explode('.', $host);
	if (count($labels) > 1) {
		$candidates[] = '.'.implode('.', array_slice($labels, 1));
	}

	// cache and primary directory
	$path = null;
	foreach ($candidates as $name) {
		if (array_key_exists($name, self::$config_cache)) {
			self::debug("... cached ($name)");
			return self::$config_cache[$name];
		}
		$in_primary = self::$config_path."/$name.txt";
		if (file_exists($in_primary)) {
			self::debug("... from file ($name)");
			$path = $in_primary;
			break;
		}
	}

	// fallback directory
	if ($path === null) {
		if (!isset(self::$config_path_fallback)) {
			self::debug("... no match ($host)");
			return false;
		}
		self::debug("... trying fallback ($host)");
		foreach ($candidates as $name) {
			$in_fallback = self::$config_path_fallback."/$name.txt";
			if (file_exists($in_fallback)) {
				self::debug("... from fallback file ($name)");
				$path = $in_fallback;
				break;
			}
		}
		if ($path === null) {
			self::debug("... no match in fallback directory");
			return false;
		}
	}

	$lines = file($path, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
	if (!$lines || !is_array($lines)) {
		return false;
	}

	// directive groups, by how their values are stored
	$multi_valued = array('title', 'body', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed');
	$yes_no = array('tidy', 'prune', 'autodetect_on_failure');
	$plain_string = array('test_url');

	$config = new SiteConfig();
	foreach ($lines as $line) {
		$line = trim($line);
		// ignore blank lines and comments
		if ($line == '' || $line[0] == '#') {
			continue;
		}
		// a directive has the form "name: value"
		$parts = explode(':', $line, 2);
		if (count($parts) != 2) {
			continue;
		}
		$directive = trim($parts[0]);
		$value = trim($parts[1]);
		if ($directive == '' || $value == '') {
			continue;
		}
		if (in_array($directive, $multi_valued)) {
			// may appear several times; every value is kept
			array_push($config->$directive, $value);
		} elseif (in_array($directive, $yes_no)) {
			// only the literal "yes" turns the option on
			$config->$directive = ($value == 'yes');
		} elseif (in_array($directive, $plain_string)) {
			$config->$directive = $value;
		}
	}
	return $config;
}
}
?>

View File

@ -7,8 +7,8 @@
* For environments which do not have these options, it reverts to standard sequential
* requests (using file_get_contents())
*
* @version 0.8
* @date 2011-02-28
* @version 0.9.5
* @date 2011-05-23
* @see http://php.net/HttpRequestPool
* @author Keyvan Minoukadeh
* @copyright 2011 Keyvan Minoukadeh
@ -104,6 +104,15 @@ class HumbleHttpAgent
return $iri->uri;
}
/**
 * Return the URL without its fragment, i.e. everything before the first '#'.
 *
 * @param string $url
 * @return string the URL unchanged if it contains no '#'
 */
public function removeFragment($url) {
	$hash_at = strpos($url, '#');
	return ($hash_at === false) ? $url : substr($url, 0, $hash_at);
}
public function enableDebug($bool=true) {
$this->debug = (bool)$bool;
}
@ -211,6 +220,7 @@ class HumbleHttpAgent
} else {
$this->debug("......adding to pool");
$req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($url) : $url;
$req_url = $this->removeFragment($req_url);
$httpRequest = new HttpRequest($req_url, HttpRequest::METH_GET, $this->requestOptions);
// send cookies, if we have any
if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
@ -225,7 +235,11 @@ class HumbleHttpAgent
// did we get anything into the pool?
if (count($pool) > 0) {
$this->debug('Sending request...');
$pool->send();
try {
$pool->send();
} catch (HttpRequestPoolException $e) {
// do nothing
}
$this->debug('Received responses');
foreach($subset as $orig => $url) {
if (!$isRedirect) $orig = $url;
@ -240,7 +254,9 @@ class HumbleHttpAgent
// is redirect?
if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && $request->getResponseHeader('location')) {
$redirectURL = $request->getResponseHeader('location');
$redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
if (!preg_match('!^https?://!i', $redirectURL)) {
$redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
}
if ($this->validateURL($redirectURL)) {
$this->debug('Redirect detected. Valid URL: '.$redirectURL);
// store any cookies
@ -298,6 +314,7 @@ class HumbleHttpAgent
} else {
$this->debug("......adding to pool");
$req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($url) : $url;
$req_url = $this->removeFragment($req_url);
$headers = array();
// send cookies, if we have any
if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
@ -327,7 +344,9 @@ class HumbleHttpAgent
$status_code = $this->requests[$orig]['status_code'];
if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) {
$redirectURL = $this->requests[$orig]['location'];
$redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
if (!preg_match('!^https?://!i', $redirectURL)) {
$redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
}
if ($this->validateURL($redirectURL)) {
$this->debug('Redirect detected. Valid URL: '.$redirectURL);
// store any cookies
@ -367,6 +386,7 @@ class HumbleHttpAgent
$this->debug("Sending request for $url");
$this->requests[$orig]['original_url'] = $orig;
$req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($url) : $url;
$req_url = $this->removeFragment($req_url);
// send cookies, if we have any
$httpContext = $this->httpContext;
if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
@ -391,7 +411,9 @@ class HumbleHttpAgent
}
if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) {
$redirectURL = $this->requests[$orig]['location'];
$redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
if (!preg_match('!^https?://!i', $redirectURL)) {
$redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
}
if ($this->validateURL($redirectURL)) {
$this->debug('Redirect detected. Valid URL: '.$redirectURL);
// store any cookies
@ -444,6 +466,7 @@ class HumbleHttpAgent
}
public function get($url, $remove=false) {
$url = "$url";
if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) {
$this->debug("URL already fetched - in memory ($url, effective: {$this->requests[$url]['effective_url']})");
$response = $this->requests[$url];

View File

@ -3,8 +3,8 @@
// Author: Keyvan Minoukadeh
// Copyright (c) 2011 Keyvan Minoukadeh
// License: AGPLv3
// Version: 2.7
// Date: 2011-03-21
// Version: 2.8
// Date: 2011-05-23
/*
This program is free software: you can redistribute it and/or modify
@ -73,131 +73,11 @@ function __autoload($class_name) {
}
}
/**
 * Decide whether a URL may be processed, based on the global $options.
 *
 * When $options->allowed_urls is non-empty it acts as a whitelist: the URL must
 * contain at least one of its entries, and the block list is not consulted.
 * Otherwise the URL is rejected if it contains any entry of
 * $options->blocked_urls. Matching is a case-insensitive substring test.
 *
 * @param string $url
 * @return bool
 */
function url_allowed($url) {
	global $options;

	if (empty($options->allowed_urls)) {
		// no whitelist: anything not on the block list is fine
		foreach ($options->blocked_urls as $fragment) {
			if (stristr($url, $fragment) !== false) {
				return false;
			}
		}
		return true;
	}

	// whitelist mode: first match wins
	foreach ($options->allowed_urls as $fragment) {
		if (stristr($url, $fragment) !== false) {
			return true;
		}
	}
	return false;
}
////////////////////////////////
// Load config file if it exists
////////////////////////////////
require_once(dirname(__FILE__).'/config.php');
//////////////////////////////////////////////
// Convert $html to UTF8
// (uses HTTP headers and HTML to find encoding)
// adapted from http://stackoverflow.com/questions/910793/php-detect-encoding-and-make-everything-utf-8
//////////////////////////////////////////////
/**
 * Convert $html to UTF-8.
 *
 * The source encoding is taken from, in order: the last Content-Type header
 * carrying a charset, the XML declaration at the start of the document, or a
 * <meta http-equiv="Content-Type"> tag. If none is found, or the encoding is
 * already UTF-8, the input is returned untouched.
 *
 * @param string $html document body
 * @param string|array|null $header raw response headers (string or list of lines)
 * @return string
 */
function convert_to_utf8($html, $header=null)
{
	if (!$html && !$header) {
		return $html;
	}

	$charset = null;

	// 1. HTTP headers - the last Content-Type wins (there may be several after redirects)
	if (is_array($header)) {
		$header = implode("\n", $header);
	}
	if ($header && preg_match_all('/^Content-Type:\s+([^;]+)(?:;\s*charset=["\']?([^;"\'\n]*))?/im', $header, $found, PREG_SET_ORDER)) {
		$last = end($found);
		if (isset($last[2])) {
			$charset = trim($last[2], '"\'');
		}
	}

	// 2. the document itself: XML declaration first, then meta tag
	if (!$charset) {
		if (preg_match('/^<\?xml\s+version=(?:"[^"]*"|\'[^\']*\')\s+encoding=("[^"]*"|\'[^\']*\')/s', $html, $found)) {
			$charset = trim($found[1], '"\'');
		} elseif (preg_match('/<meta\s+http-equiv=["\']Content-Type["\'] content=["\'][^;]+;\s*charset=["\']?([^;"\'>]+)/i', $html, $found)) {
			if (isset($found[1])) {
				$charset = trim($found[1]);
			}
		}
	}

	// unknown encoding is assumed to be UTF-8; UTF-8 needs no work
	if (!$charset || strtolower($charset) == 'utf-8') {
		return $html;
	}

	if (strtolower($charset) == 'iso-8859-1') {
		// replace MS Word smart quotes (and the other bytes in the 130-159 range) with entities
		$smart_chars = array(
			chr(130) => '&sbquo;',  // Single Low-9 Quotation Mark
			chr(131) => '&fnof;',   // Latin Small Letter F With Hook
			chr(132) => '&bdquo;',  // Double Low-9 Quotation Mark
			chr(133) => '&hellip;', // Horizontal Ellipsis
			chr(134) => '&dagger;', // Dagger
			chr(135) => '&Dagger;', // Double Dagger
			chr(136) => '&circ;',   // Modifier Letter Circumflex Accent
			chr(137) => '&permil;', // Per Mille Sign
			chr(138) => '&Scaron;', // Latin Capital Letter S With Caron
			chr(139) => '&lsaquo;', // Single Left-Pointing Angle Quotation Mark
			chr(140) => '&OElig;',  // Latin Capital Ligature OE
			chr(145) => '&lsquo;',  // Left Single Quotation Mark
			chr(146) => '&rsquo;',  // Right Single Quotation Mark
			chr(147) => '&ldquo;',  // Left Double Quotation Mark
			chr(148) => '&rdquo;',  // Right Double Quotation Mark
			chr(149) => '&bull;',   // Bullet
			chr(150) => '&ndash;',  // En Dash
			chr(151) => '&mdash;',  // Em Dash
			chr(152) => '&tilde;',  // Small Tilde
			chr(153) => '&trade;',  // Trade Mark Sign
			chr(154) => '&scaron;', // Latin Small Letter S With Caron
			chr(155) => '&rsaquo;', // Single Right-Pointing Angle Quotation Mark
			chr(156) => '&oelig;',  // Latin Small Ligature OE
			chr(159) => '&Yuml;'    // Latin Capital Letter Y With Diaeresis
		);
		$html = strtr($html, $smart_chars);
	}

	// the actual conversion is delegated to SimplePie
	return SimplePie_Misc::change_encoding($html, $charset, 'utf-8');
}
/**
 * Rewrite relative link and image URLs inside $elem to absolute ones.
 *
 * Handles the href of every <a> and the src of every <img> below $elem, and of
 * $elem itself when it is one of those tags.
 *
 * @param string $base URL the relative URLs are resolved against
 * @param object $elem DOM node to process (presumably a DOMElement - confirm against callers)
 */
function makeAbsolute($base, $elem) {
	$base = new IRI($base);
	$targets = array('a'=>'href', 'img'=>'src');
	foreach ($targets as $tag => $attr) {
		$nodes = $elem->getElementsByTagName($tag);
		// walk the node list from the last item to the first
		$idx = $nodes->length;
		while (--$idx >= 0) {
			makeAbsoluteAttr($base, $nodes->item($idx), $attr);
		}
		// getElementsByTagName() only returns descendants, so check the element itself too
		if (strtolower($elem->tagName) == $tag) {
			makeAbsoluteAttr($base, $elem, $attr);
		}
	}
}
/**
 * Make the URL held in one attribute of an element absolute.
 *
 * Nothing happens when the attribute is missing, when its value already starts
 * with http:// or https://, or when the URL cannot be resolved.
 *
 * @param mixed $base base URL handed to IRI::absolutize() (makeAbsolute() passes an IRI instance)
 * @param object $e element carrying the attribute
 * @param string $attr attribute name, e.g. 'href' or 'src'
 */
function makeAbsoluteAttr($base, $e, $attr) {
	if ($e->hasAttribute($attr)) {
		// Trim leading and trailing white space. I don't really like this but
		// unfortunately it does appear on some sites. e.g. <img src=" /path/to/image.jpg" />
		// Existing %20 sequences are decoded first so that they are not treated
		// differently from literal spaces when the value is re-encoded.
		$url = trim(str_replace('%20', ' ', $e->getAttribute($attr)));
		$url = str_replace(' ', '%20', $url);
		// The scheme must be at the very start of the value. Without the anchor a
		// relative URL such as "/go?to=http://example.org/" counted as absolute
		// and was left unresolved.
		if (!preg_match('!^https?://!i', $url)) {
			$absolute = IRI::absolutize($base, $url);
			if ($absolute) {
				$e->setAttribute($attr, $absolute);
			}
		}
	}
}
////////////////////////////////
// Check if service is enabled
////////////////////////////////
@ -211,7 +91,7 @@ if (!$options->enabled) {
if (!isset($_GET['url'])) {
die('No URL supplied');
}
$url = $_GET['url'];
$url = trim($_GET['url']);
if (!preg_match('!^https?://.+!i', $url)) {
$url = 'http://'.$url;
}
@ -240,6 +120,7 @@ if ($options->alternative_url != '' && !isset($_GET['redir']) && mt_rand(0, 100)
// Pass the optional request parameters on to the alternative URL. Values are
// URL-encoded so that characters such as '&', '#' or spaces cannot alter the
// structure of the redirect URL.
if (isset($_GET['links'])) $redirect .= '&links='.urlencode($_GET['links']);
if (isset($_GET['exc'])) $redirect .= '&exc='.urlencode($_GET['exc']);
if (isset($_GET['what'])) $redirect .= '&what='.urlencode($_GET['what']);
if (isset($_GET['format'])) $redirect .= '&format='.urlencode($_GET['format']);
header("Location: $redirect");
exit;
}
@ -258,6 +139,7 @@ if (isset($_GET['key']) && ($key_index = array_search($_GET['key'], $options->ap
if (isset($_GET['links'])) $redirect .= '&links='.urlencode($_GET['links']);
if (isset($_GET['exc'])) $redirect .= '&exc='.urlencode($_GET['exc']);
if (isset($_GET['what'])) $redirect .= '&what='.urlencode($_GET['what']);
if (isset($_GET['format'])) $redirect .= '&format='.urlencode($_GET['format']);
header("Location: $redirect");
exit;
}
@ -364,9 +246,13 @@ if (($extract_pattern != '') && ($extract_pattern != 'auto')) {
/////////////////////////////////////
// Check for valid format
// (stick to RSS for the time being)
// (stick to RSS (or RSS as JSON) for the time being)
/////////////////////////////////////
$format = 'rss';
if (isset($_GET['format']) && $_GET['format'] == 'json') {
$format = 'json';
} else {
$format = 'rss';
}
//////////////////////////////////
// Check for cached copy
@ -392,10 +278,14 @@ if ($options->caching) {
// getting a Zend_Cache_Core object
$cache = Zend_Cache::factory('Core', 'File', $frontendOptions, $backendOptions);
$cache_id = md5($max.$url.$valid_key.$links.$exclude_on_fail.$auto_extract.$extract_pattern.(int)isset($_GET['pubsub']));
$cache_id = md5($max.$url.$valid_key.$links.$exclude_on_fail.$auto_extract.$extract_pattern.$format.(int)isset($_GET['pubsub']));
if ($data = $cache->load($cache_id)) {
header("Content-type: text/xml; charset=UTF-8");
if ($format == 'json') {
header("Content-type: application/json; charset=UTF-8");
} else {
header("Content-type: text/xml; charset=UTF-8");
}
if (headers_sent()) die('Some data has already been output, can\'t send RSS file');
echo $data;
exit;
@ -419,7 +309,7 @@ $http = new HumbleHttpAgent();
//////////////////////////////////
// Set up Content Extractor
//////////////////////////////////
$extractor = new ContentExtractor(dirname(__FILE__).'/site_config/custom', new ContentExtractor(dirname(__FILE__).'/site_config/standard'));
$extractor = new ContentExtractor(dirname(__FILE__).'/site_config/custom', dirname(__FILE__).'/site_config/standard');
/*
if ($options->caching) {
@ -453,7 +343,8 @@ if (!$html_only) {
SimplePie_HumbleHttpAgent::set_agent($http);
$feed = new SimplePie();
$feed->set_file_class('SimplePie_HumbleHttpAgent');
$feed->set_feed_url($url);
//$feed->set_feed_url($url); // colons appearing in the URL's path get encoded
$feed->feed_url = $url;
$feed->set_autodiscovery_level(SIMPLEPIE_LOCATOR_NONE);
$feed->set_timeout(20);
$feed->enable_cache(false);
@ -471,97 +362,34 @@ if (!$html_only) {
}
////////////////////////////////////////////////////////////////////////////////
// Extract content from HTML (if URL is not feed or explicit HTML request has been made)
// Our given URL is not a feed, so let's create our own feed with a single item:
// the given URL. This basically treats all non-feed URLs as if they were
// single-item feeds.
////////////////////////////////////////////////////////////////////////////////
$isDummyFeed = false;
if ($html_only || !$result) {
$isDummyFeed = true;
unset($feed, $result);
if ($response = $http->get($url)) {
$effective_url = $response['effective_url'];
if (!url_allowed($effective_url)) die('URL blocked');
$html = $response['body'];
// remove strange things here
$html = str_replace('</[>', '', $html);
$html = convert_to_utf8($html, $response['headers']);
// create single item dummy feed object
/**
 * Stand-in for a feed object when the requested URL is not a feed: it exposes
 * the feed accessors used by this script and holds exactly one item, the URL itself.
 */
class DummySingleItemFeed {
	// the single DummySingleItem wrapping the requested URL
	public $item;

	public function __construct($url) {
		$this->item = new DummySingleItem($url);
	}

	// no feed title is known at this point
	public function get_title() {
		return '';
	}

	public function get_description() {
		return 'Content extracted from '.$this->item->url;
	}

	public function get_link() {
		return $this->item->url;
	}

	public function get_image_url() {
		return false;
	}

	// always the one item, whatever range is asked for
	public function get_items($start=0, $max=1) {
		return array($this->item);
	}
}
if (!$response || $response['status_code'] >= 300) {
die('Error retrieving '.$url);
/**
 * Stand-in for a feed item that knows nothing but its URL; every other
 * accessor returns an empty or false value.
 */
class DummySingleItem {
	public $url;

	public function __construct($url) {
		$this->url = $url;
	}

	public function get_permalink() {
		return $this->url;
	}

	public function get_title() {
		return '';
	}

	// no date is available; $format is accepted but unused
	public function get_date($format='') {
		return false;
	}

	public function get_author() {
		return false;
	}

	public function get_description() {
		return '';
	}
}
if ($auto_extract) {
$extract_result = $extractor->process($html, $effective_url);
if (!$extract_result) die($options->error_message);
$readability = $extractor->readability;
$content_block = $extractor->getContent();
$title = $extractor->getTitle();
} else {
$readability = new Readability($html, $effective_url);
// content block is entire document
$content_block = $readability->dom;
//TODO: get title
$title = '';
}
if ($extract_pattern) {
$xpath = new DOMXPath($readability->dom);
$elems = @$xpath->query($extract_pattern, $content_block);
// check if our custom extraction pattern matched
if ($elems && $elems->length > 0) {
// get the first matched element
$content_block = $elems->item(0);
// clean it up
$readability->removeScripts($content_block);
$readability->prepArticle($content_block);
} else {
die($options->error_message);
//$content_block = $readability->dom->createElement('p', 'Sorry, could not extract content');
}
}
$readability->clean($content_block, 'select');
if ($options->rewrite_relative_urls) makeAbsolute($effective_url, $content_block);
// footnotes
if (($links == 'footnotes') && (strpos($effective_url, 'wikipedia.org') === false)) {
$readability->addFootnotes($content_block);
}
if ($extract_pattern) {
// get outerHTML
$content = $content_block->ownerDocument->saveXML($content_block);
} else {
if ($content_block->childNodes->length == 1 && $content_block->firstChild->nodeType === XML_ELEMENT_NODE) {
$content = $content_block->firstChild->innerHTML;
} else {
$content = $content_block->innerHTML;
}
}
if ($links == 'remove') {
$content = preg_replace('!</?a[^>]*>!', '', $content);
}
if (!$valid_key) {
$content = $options->message_to_prepend.$content;
$content .= $options->message_to_append;
} else {
$content = $options->message_to_prepend_with_key.$content;
$content .= $options->message_to_append_with_key;
}
unset($readability, $html);
$output = new FeedWriter(); //ATOM an option
$output->setTitle($title);
$output->setDescription("Content extracted from $url");
$output->setXsl('css/feed.xsl'); // Chrome uses this, most browsers ignore it
if ($format == 'atom') {
$output->setChannelElement('updated', date(DATE_ATOM));
$output->setChannelElement('author', array('name'=>'Five Filters', 'uri'=>'http://fivefilters.org'));
}
$output->setLink($url);
$newitem = $output->createNewItem();
$newitem->setTitle($title);
$newitem->setLink($url);
if ($format == 'atom') {
$newitem->setDate(time());
$newitem->addElement('content', $content);
} else {
$newitem->setDescription($content);
}
$output->addItem($newitem);
$output->genarateFeed();
exit;
$feed = new DummySingleItemFeed($url);
}
////////////////////////////////////////////
@ -594,6 +422,8 @@ $urls_sanitized = array();
$urls = array();
foreach ($items as $key => $item) {
$permalink = htmlspecialchars_decode($item->get_permalink());
// Colons in URL path segments get encoded by SimplePie, yet some sites expect them unencoded
$permalink = str_replace('%3A', ':', $permalink);
$permalink = $http->validateUrl($permalink);
if ($permalink) {
$urls_sanitized[] = $permalink;
@ -625,17 +455,34 @@ foreach ($items as $key => $item) {
$effective_url = $response['effective_url'];
if (!url_allowed($effective_url)) continue;
$html = $response['body'];
// remove strange things here
$html = str_replace('</[>', '', $html);
// remove strange things
$html = str_replace('</[>', '', $html);
$html = convert_to_utf8($html, $response['headers']);
if ($auto_extract) {
// check site config for single page URL - fetch it if found
if ($single_page_response = getSinglePage($item, $html, $effective_url)) {
$html = $single_page_response['body'];
// remove strange things
$html = str_replace('</[>', '', $html);
$html = convert_to_utf8($html, $single_page_response['headers']);
$effective_url = $single_page_response['effective_url'];
unset($single_page_response);
}
$extract_result = $extractor->process($html, $effective_url);
$readability = $extractor->readability;
$content_block = ($extract_result) ? $extractor->getContent() : null;
$title = ($extract_result) ? $extractor->getTitle() : '';
} else {
$readability = new Readability($html, $effective_url);
// content block is entire document (for now...)
$content_block = $readability->dom;
$content_block = $readability->dom;
//TODO: get title
$title = '';
}
// use extracted title for both feed and item title if we're using single-item dummy feed
if ($isDummyFeed) {
$output->setTitle($title);
$newitem->setTitle($title);
}
if ($extract_pattern && isset($content_block)) {
$xpath = new DOMXPath($readability->dom);
@ -684,11 +531,11 @@ foreach ($items as $key => $item) {
$html = preg_replace('!</?a[^>]*>!', '', $html);
}
if (!$valid_key) {
$html = $options->message_to_prepend.$html;
$html .= $options->message_to_append;
$html = make_substitutions($options->message_to_prepend).$html;
$html .= make_substitutions($options->message_to_append);
} else {
$html = $options->message_to_prepend_with_key.$html;
$html .= $options->message_to_append_with_key;
$html = make_substitutions($options->message_to_prepend_with_key).$html;
$html .= make_substitutions($options->message_to_append_with_key);
}
}
if ($format == 'atom') {
@ -715,14 +562,215 @@ foreach ($items as $key => $item) {
unset($html);
}
// output feed
if ($options->caching) {
if ($options->caching || $format == 'json') {
ob_start();
$output->genarateFeed();
$output = ob_get_contents();
ob_end_clean();
$cache->save($output, $cache_id);
if ($format == 'json') {
$jsonrss = new stdClass();
$jsonrss->rss = @simplexml_load_string($output);
$output = json_encode($jsonrss);
header("Content-type: application/json; charset=UTF-8");
}
if ($options->caching) $cache->save($output, $cache_id);
echo $output;
} else {
$output->genarateFeed();
}
///////////////////////////////
// HELPER FUNCTIONS
///////////////////////////////
/**
 * Decide whether a URL may be processed, based on the configured URL lists.
 *
 * If $options->allowed_urls is non-empty it is the only list consulted: the
 * URL is accepted only when it contains at least one of the listed fragments.
 * Otherwise the URL is rejected when it contains any fragment listed in
 * $options->blocked_urls. Matching is a case-insensitive substring test.
 *
 * @param string $url URL to check
 * @return bool true if the URL may be processed, false otherwise
 */
function url_allowed($url) {
    global $options;

    if (empty($options->allowed_urls)) {
        // No whitelist configured: only the blacklist applies.
        foreach ($options->blocked_urls as $fragment) {
            if (stristr($url, $fragment) !== false) {
                return false;
            }
        }
        return true;
    }

    // Whitelist configured: one matching fragment is enough.
    foreach ($options->allowed_urls as $fragment) {
        if (stristr($url, $fragment) !== false) {
            return true;
        }
    }
    return false;
}
//////////////////////////////////////////////
// Convert $html to UTF8
// (uses HTTP headers and HTML to find encoding)
// adapted from http://stackoverflow.com/questions/910793/php-detect-encoding-and-make-everything-utf-8
//////////////////////////////////////////////
/**
 * Return $html converted to UTF-8.
 *
 * The source encoding is looked up in this order:
 *   1. the charset of the last Content-Type line found in $header,
 *   2. the encoding attribute of a leading XML declaration in $html,
 *   3. the charset of a <meta http-equiv="Content-Type"> tag in $html.
 * If none is found, or the encoding found is already UTF-8, $html is
 * returned untouched. Otherwise the conversion is delegated to
 * SimplePie_Misc::change_encoding().
 *
 * @param string            $html   document body
 * @param string|array|null $header raw HTTP headers (an array is joined with newlines)
 * @return string whatever SimplePie_Misc::change_encoding() returns, or $html unchanged
 */
function convert_to_utf8($html, $header=null)
{
    // Nothing to inspect at all.
    if (!$html && !$header) {
        return $html;
    }

    $charset = null;

    // 1. HTTP headers
    if (is_array($header)) {
        $header = implode("\n", $header);
    }
    if ($header && preg_match_all('/^Content-Type:\s+([^;]+)(?:;\s*charset=["\']?([^;"\'\n]*))?/im', $header, $found, PREG_SET_ORDER)) {
        // use the last Content-Type line (in case of redirects)
        $last = end($found);
        if (isset($last[2])) {
            $charset = trim($last[2], '"\'');
        }
    }

    // 2. + 3. the document itself
    if (!$charset) {
        if (preg_match('/^<\?xml\s+version=(?:"[^"]*"|\'[^\']*\')\s+encoding=("[^"]*"|\'[^\']*\')/s', $html, $found)) {
            $charset = trim($found[1], '"\'');
        } elseif (preg_match('/<meta\s+http-equiv=["\']Content-Type["\'] content=["\'][^;]+;\s*charset=["\']?([^;"\'>]+)/i', $html, $found)) {
            if (isset($found[1])) {
                $charset = trim($found[1]);
            }
        }
    }

    // Unknown encodings are treated as UTF-8, so there is nothing to convert.
    if (!$charset || strtolower($charset) == 'utf-8') {
        return $html;
    }

    if (strtolower($charset) == 'iso-8859-1') {
        // Replace MS Word smart quotes and friends (bytes 130-159) with
        // HTML entities before the conversion.
        $entities = array(
            130 => '&sbquo;',  // Single Low-9 Quotation Mark
            131 => '&fnof;',   // Latin Small Letter F With Hook
            132 => '&bdquo;',  // Double Low-9 Quotation Mark
            133 => '&hellip;', // Horizontal Ellipsis
            134 => '&dagger;', // Dagger
            135 => '&Dagger;', // Double Dagger
            136 => '&circ;',   // Modifier Letter Circumflex Accent
            137 => '&permil;', // Per Mille Sign
            138 => '&Scaron;', // Latin Capital Letter S With Caron
            139 => '&lsaquo;', // Single Left-Pointing Angle Quotation Mark
            140 => '&OElig;',  // Latin Capital Ligature OE
            145 => '&lsquo;',  // Left Single Quotation Mark
            146 => '&rsquo;',  // Right Single Quotation Mark
            147 => '&ldquo;',  // Left Double Quotation Mark
            148 => '&rdquo;',  // Right Double Quotation Mark
            149 => '&bull;',   // Bullet
            150 => '&ndash;',  // En Dash
            151 => '&mdash;',  // Em Dash
            152 => '&tilde;',  // Small Tilde
            153 => '&trade;',  // Trade Mark Sign
            154 => '&scaron;', // Latin Small Letter S With Caron
            155 => '&rsaquo;', // Single Right-Pointing Angle Quotation Mark
            156 => '&oelig;',  // Latin Small Ligature OE
            159 => '&Yuml;',   // Latin Capital Letter Y With Diaeresis
        );
        $table = array();
        foreach ($entities as $byte => $entity) {
            $table[chr($byte)] = $entity;
        }
        $html = strtr($html, $table);
    }

    // The conversion itself is left to SimplePie.
    return SimplePie_Misc::change_encoding($html, $charset, 'utf-8');
}
/**
 * Make the link and image URLs inside a DOM element absolute.
 *
 * Handles the href attribute of <a> elements and the src attribute of <img>
 * elements, both for descendants of $elem and for $elem itself when it is
 * one of those tags. The work on each attribute is done by makeAbsoluteAttr().
 *
 * @param string     $base base URL used to build the IRI that URLs are resolved against
 * @param DOMElement $elem root element to process
 * @return void
 */
function makeAbsolute($base, $elem) {
    $baseIri = new IRI($base);
    // remove '//' in URL path (causes URLs not to resolve properly)
    if (isset($baseIri->ipath)) {
        $baseIri->ipath = preg_replace('!//+!', '/', $baseIri->ipath);
    }

    $rootTag = strtolower($elem->tagName);
    $urlAttributes = array('a' => 'href', 'img' => 'src');

    foreach ($urlAttributes as $tag => $attr) {
        // descendants, walked from last to first
        $nodes = $elem->getElementsByTagName($tag);
        $index = $nodes->length;
        while ($index-- > 0) {
            makeAbsoluteAttr($baseIri, $nodes->item($index), $attr);
        }
        // the root element itself
        if ($rootTag == $tag) {
            makeAbsoluteAttr($baseIri, $elem, $attr);
        }
    }
}
/**
 * Rewrite one URL-bearing attribute of a DOM element as an absolute URL.
 *
 * Nothing happens when the attribute is missing, when its value already
 * starts with http:// or https://, or when IRI::absolutize() returns a
 * falsy value.
 *
 * @param mixed      $base base handed straight to IRI::absolutize() (makeAbsolute() passes an IRI object)
 * @param DOMElement $e    element whose attribute is rewritten in place
 * @param string     $attr attribute name, e.g. 'href' or 'src'
 * @return void
 */
function makeAbsoluteAttr($base, $e, $attr) {
    if (!$e->hasAttribute($attr)) {
        return;
    }

    // Trim leading and trailing white space. I don't really like this but
    // unfortunately it does appear on some sites. e.g. <img src=" /path/to/image.jpg" />
    // Spaces are normalised to %20 afterwards.
    $url = trim(str_replace('%20', ' ', $e->getAttribute($attr)));
    $url = str_replace(' ', '%20', $url);

    // The scheme test must be anchored (as in makeAbsoluteStr()): a relative
    // URL such as "/go?u=http://example.org/" only *contains* a scheme and
    // still has to be resolved against the base.
    if (preg_match('!^https?://!i', $url)) {
        return;
    }

    $absolute = IRI::absolutize($base, $url);
    if ($absolute) {
        $e->setAttribute($attr, $absolute);
    }
}
/**
 * Resolve a URL string against a base URL.
 *
 * @param string $base base URL
 * @param string $url  URL to resolve
 * @return mixed $url itself when it already starts with http:// or https://,
 *               otherwise the result of IRI::absolutize(), or false when
 *               that result is falsy
 */
function makeAbsoluteStr($base, $url) {
    $baseIri = new IRI($base);
    // remove '//' in URL path (causes URLs not to resolve properly)
    if (isset($baseIri->ipath)) {
        $baseIri->ipath = preg_replace('!//+!', '/', $baseIri->ipath);
    }

    // already absolute
    if (preg_match('!^https?://!i', $url)) {
        return $url;
    }

    $resolved = IRI::absolutize($baseIri, $url);
    return $resolved ? $resolved : false;
}
/**
 * Look for a "single page" version of a multi-page article and fetch it.
 *
 * The site config built for the URL's host supplies a list of XPath
 * expressions, either in single_page_link (evaluated against the page HTML)
 * or in single_page_link_in_feed (evaluated against the feed item's
 * description). The first expression that yields a URL wins. An expression
 * may evaluate to a string, to elements carrying an href attribute, or to
 * attribute nodes (e.g. //a/@href).
 *
 * @param object $item feed item; only get_description() is called on it
 * @param string $html HTML of the page fetched for the item
 * @param string $url  URL the HTML was fetched from; relative links are resolved against it
 * @return mixed the response array from $http->get() on success (status code
 *               below 300), or false if no single-page URL was found or
 *               the request failed
 */
function getSinglePage($item, $html, $url) {
    global $http;
    $host = @parse_url($url, PHP_URL_HOST);
    $site_config = SiteConfig::build($host);
    if ($site_config === false) return false;

    $splink = null;
    if (!empty($site_config->single_page_link)) {
        $splink = $site_config->single_page_link;
    } elseif (!empty($site_config->single_page_link_in_feed)) {
        // single page link xpath is targeted at feed
        $splink = $site_config->single_page_link_in_feed;
        // so let's replace HTML with feed item description
        $html = $item->get_description();
    }
    if (!isset($splink)) return false;

    // Build DOM tree from HTML
    $readability = new Readability($html, $url);
    $xpath = new DOMXPath($readability->dom);

    // Loop through single_page_link xpath expressions; the first one that
    // produces a URL wins.
    $single_page_url = null;
    foreach ($splink as $pattern) {
        $elems = @$xpath->evaluate($pattern, $readability->dom);
        if (is_string($elems)) {
            $single_page_url = trim($elems);
            break;
        } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
            // $node (not $item) so the feed item parameter is not clobbered
            foreach ($elems as $node) {
                if ($node instanceof DOMElement && $node->hasAttribute('href')) {
                    $single_page_url = $node->getAttribute('href');
                    // leave both loops: later patterns must not override this match
                    break 2;
                } elseif ($node instanceof DOMAttr && $node->value) {
                    // expression selected the attribute itself, e.g. //a/@href
                    $single_page_url = $node->value;
                    break 2;
                }
            }
        }
    }

    // If we've got URL, resolve against $url
    if (isset($single_page_url) && ($single_page_url = makeAbsoluteStr($url, $single_page_url))) {
        // check it's not what we have already!
        if ($single_page_url != $url) {
            // it's not, so let's try to fetch it...
            // NOTE(review): meaning of the second argument to $http->get() is not visible here - confirm
            if (($response = $http->get($single_page_url, true)) && $response['status_code'] < 300) {
                return $response;
            }
        }
    }
    return false;
}
/**
 * Expand the URL substitution tags in a configured message.
 *
 * Supported tags:
 *   {url}           - permalink of the global $item
 *   {effective-url} - the global $effective_url
 * Both values are passed through htmlspecialchars() before insertion.
 *
 * @param string $string message text, possibly containing substitution tags
 * @return string message with tags replaced (an empty message is returned as is)
 */
function make_substitutions($string) {
    global $item, $effective_url;

    if ($string == '') {
        return $string;
    }

    $tags = array('{url}', '{effective-url}');
    $values = array(
        htmlspecialchars($item->get_permalink()),
        htmlspecialchars($effective_url),
    );
    return str_replace($tags, $values, $string);
}
?>