Full-Text RSS 2.8
This commit is contained in:
parent
f9f03f14c0
commit
eeec0f1982
@ -2,6 +2,15 @@ FiveFilters.org: Full-Text RSS
|
|||||||
http://fivefilters.org/content-only/
|
http://fivefilters.org/content-only/
|
||||||
CHANGELOG
|
CHANGELOG
|
||||||
------------------------------------
|
------------------------------------
|
||||||
|
2.8 (2011-05-30)
|
||||||
|
- Tidy no longer stripping HTML5 elements
|
||||||
|
- JSON output (pass &format=json in querystring)
|
||||||
|
- New site patterns added and old ones updated
|
||||||
|
- New site config option to force full-page retrieval on multi-page articles: single_page_link
|
||||||
|
- User Guide (PDF) now included (although still a work in progress)
|
||||||
|
- URL placeholders now accepted in message_to_prepend/append config options
|
||||||
|
- Plus minor fixes...
|
||||||
|
|
||||||
2.7 (2011-03-21)
|
2.7 (2011-03-21)
|
||||||
- Site patterns for better control over extraction (see site_config/README.txt)
|
- Site patterns for better control over extraction (see site_config/README.txt)
|
||||||
- hNews support (improves content extraction for sites using hNews microformatting)
|
- hNews support (improves content extraction for sites using hNews microformatting)
|
||||||
|
@ -74,11 +74,17 @@ $options->cache_dir = dirname(__FILE__).'/cache';
|
|||||||
// Message to prepend (without API key)
|
// Message to prepend (without API key)
|
||||||
// ----------------------
|
// ----------------------
|
||||||
// HTML to insert at the beginning of each feed item when no API key is supplied.
|
// HTML to insert at the beginning of each feed item when no API key is supplied.
|
||||||
|
// Substitution tags:
|
||||||
|
// {url} - Feed item URL
|
||||||
|
// {effective-url} - Feed item URL after we've followed all redirects
|
||||||
$options->message_to_prepend = '';
|
$options->message_to_prepend = '';
|
||||||
|
|
||||||
// Message to append (without API key)
|
// Message to append (without API key)
|
||||||
// ----------------------
|
// ----------------------
|
||||||
// HTML to insert at the end of each feed item when no API key is supplied.
|
// HTML to insert at the end of each feed item when no API key is supplied.
|
||||||
|
// Substitution tags:
|
||||||
|
// {url} - Feed item URL
|
||||||
|
// {effective-url} - Feed item URL after we've followed all redirects
|
||||||
$options->message_to_append = '';
|
$options->message_to_append = '';
|
||||||
|
|
||||||
// URLs to allow
|
// URLs to allow
|
||||||
@ -188,7 +194,7 @@ $options->error_message_with_key = '[unable to retrieve full-text content]';
|
|||||||
/// DO NOT CHANGE ANYTHING BELOW THIS ///////////
|
/// DO NOT CHANGE ANYTHING BELOW THIS ///////////
|
||||||
/////////////////////////////////////////////////
|
/////////////////////////////////////////////////
|
||||||
|
|
||||||
if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '2.7');
|
if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '2.8');
|
||||||
|
|
||||||
if ((basename(__FILE__) == 'config.php') && (file_exists(dirname(__FILE__).'/custom_config.php'))) {
|
if ((basename(__FILE__) == 'config.php') && (file_exists(dirname(__FILE__).'/custom_config.php'))) {
|
||||||
require_once(dirname(__FILE__).'/custom_config.php');
|
require_once(dirname(__FILE__).'/custom_config.php');
|
||||||
|
@ -13,7 +13,7 @@ SimplePie.org. We have kept most of their checks intact as we use SimplePie in o
|
|||||||
http://github.com/simplepie/simplepie/tree/master/compatibility_test/
|
http://github.com/simplepie/simplepie/tree/master/compatibility_test/
|
||||||
*/
|
*/
|
||||||
|
|
||||||
$app_name = 'Full-Text RSS 2.7';
|
$app_name = 'Full-Text RSS 2.8';
|
||||||
|
|
||||||
$php_ok = (function_exists('version_compare') && version_compare(phpversion(), '5.2.0', '>='));
|
$php_ok = (function_exists('version_compare') && version_compare(phpversion(), '5.2.0', '>='));
|
||||||
$pcre_ok = extension_loaded('pcre');
|
$pcre_ok = extension_loaded('pcre');
|
||||||
@ -327,7 +327,7 @@ div.chunk {
|
|||||||
<p><em>Your webhost has its act together!</em></p>
|
<p><em>Your webhost has its act together!</em></p>
|
||||||
<p>You can download the latest version of <?php echo $app_name; ?> from <a href="http://fivefilters.org/content-only/#download">FiveFilters.org</a>.</p>
|
<p>You can download the latest version of <?php echo $app_name; ?> from <a href="http://fivefilters.org/content-only/#download">FiveFilters.org</a>.</p>
|
||||||
<p><strong>Note</strong>: Passing this test does not guarantee that <?php echo $app_name; ?> will run on your webhost — it only ensures that the basic requirements have been addressed. If you experience any problems, please let us know.</p>
|
<p><strong>Note</strong>: Passing this test does not guarantee that <?php echo $app_name; ?> will run on your webhost — it only ensures that the basic requirements have been addressed. If you experience any problems, please let us know.</p>
|
||||||
<?php } else if ($php_ok && $xml_ok && $pcre_ok && $allow_url_fopen_ok && $filter_ok) { ?>
|
<?php } else if ($php_ok && $xml_ok && $pcre_ok && $mbstring_ok && $allow_url_fopen_ok && $filter_ok) { ?>
|
||||||
<h3>Bottom Line: Yes, you can!</h3>
|
<h3>Bottom Line: Yes, you can!</h3>
|
||||||
<p><em>For most feeds, it'll run with no problems.</em> There are certain languages that you might have a hard time with though.</p>
|
<p><em>For most feeds, it'll run with no problems.</em> There are certain languages that you might have a hard time with though.</p>
|
||||||
<p>You can download the latest version of <?php echo $app_name; ?> from <a href="http://fivefilters.org/content-only/#download">FiveFilters.org</a>.</p>
|
<p>You can download the latest version of <?php echo $app_name; ?> from <a href="http://fivefilters.org/content-only/#download">FiveFilters.org</a>.</p>
|
||||||
|
@ -96,14 +96,16 @@ if (!defined('_FF_FTR_INDEX')) {
|
|||||||
<p>Thanks for downloading and setting this up. If you haven't done so already, <a href="ftr_compatibility_test.php">check server compatibility</a>
|
<p>Thanks for downloading and setting this up. If you haven't done so already, <a href="ftr_compatibility_test.php">check server compatibility</a>
|
||||||
to see if your environment will support this application. Full-Text RSS runs on most shared web hosting environments.</p>
|
to see if your environment will support this application. Full-Text RSS runs on most shared web hosting environments.</p>
|
||||||
<h3>Configure</h3>
|
<h3>Configure</h3>
|
||||||
<p>In addition to the options above, Full-Text RSS comes with a configuration file which allows you to control how the application works. Features include:</p>
|
<p>In addition to the options above, Full-Text RSS can be configured to better suit your needs. Features include:</p>
|
||||||
<ul>
|
<ul>
|
||||||
<li>Site patterns for better control over extraction (<a href="site_config/README.txt">more info</a>)</li>
|
<li>Site patterns for better control over extraction (<a href="site_config/README.txt">more info</a>)</li>
|
||||||
<li>Restrict access to a pre-defined set of URLs or block certain URLs</li>
|
<li>Restrict access to a pre-defined set of URLs or block certain URLs</li>
|
||||||
<li>Restrict the maximum number of feed items to be processed</li>
|
<li>Restrict the maximum number of feed items to be processed</li>
|
||||||
|
<li>JSON output</li>
|
||||||
<li>Prepend or append an HTML fragment to each feed item processed</li>
|
<li>Prepend or append an HTML fragment to each feed item processed</li>
|
||||||
<li>Caching</li>
|
<li>Caching</li>
|
||||||
</ul>
|
</ul>
|
||||||
|
<p>Please refer to the <a href="http://fivefilters.org/content-only/guide/user_guide_2.8.pdf">user guide</a> for more information.</p>
|
||||||
<p><?php if (!file_exists('custom_config.php')) { ?>To change the configuration, save a copy of <tt>config.php</tt> as <tt>custom_config.php</tt> and make any changes you like to it.<?php } else { ?>To change the configuration, edit <tt>custom_config.php</tt> and make any changes you like.<?php } ?></p>
|
<p><?php if (!file_exists('custom_config.php')) { ?>To change the configuration, save a copy of <tt>config.php</tt> as <tt>custom_config.php</tt> and make any changes you like to it.<?php } else { ?>To change the configuration, edit <tt>custom_config.php</tt> and make any changes you like.<?php } ?></p>
|
||||||
|
|
||||||
<p>If everything works fine, feel free to modify this page by saving it as <tt>custom_index.php</tt> and change it to whatever you like.</p>
|
<p>If everything works fine, feel free to modify this page by saving it as <tt>custom_index.php</tt> and change it to whatever you like.</p>
|
||||||
@ -118,7 +120,8 @@ if (!defined('_FF_FTR_INDEX')) {
|
|||||||
<p>To see if you're running the latest version, <a href="http://fivefilters.org/content-only/latest_version.php?version=<?php echo urlencode(_FF_FTR_VERSION); ?>">check for updates</a>.</p>
|
<p>To see if you're running the latest version, <a href="http://fivefilters.org/content-only/latest_version.php?version=<?php echo urlencode(_FF_FTR_VERSION); ?>">check for updates</a>.</p>
|
||||||
|
|
||||||
<h3 id="donate">Support</h3>
|
<h3 id="donate">Support</h3>
|
||||||
<p>We have more information in the section below, but if you need help with anything, please email <a href="mailto:fivefilters@fivefilters.org">fivefilters@fivefilters.org</a>.</p>
|
<p>We have a <a href="https://member.fivefilters.org/f/">public forum</a> which anyone can use to discuss any issues, post questions and find answers (it's free to join and post).</p>
|
||||||
|
<p>We provide a little more information in the section below, but if you need help with anything, you can also email us at <a href="mailto:fivefilters@fivefilters.org">fivefilters@fivefilters.org</a>.</p>
|
||||||
|
|
||||||
<hr />
|
<hr />
|
||||||
|
|
||||||
|
@ -5,8 +5,8 @@
|
|||||||
* Uses patterns specified in site config files and auto detection (hNews/PHP Readability)
|
* Uses patterns specified in site config files and auto detection (hNews/PHP Readability)
|
||||||
* to extract content from HTML files.
|
* to extract content from HTML files.
|
||||||
*
|
*
|
||||||
* @version 0.5
|
* @version 0.6
|
||||||
* @date 2011-03-07
|
* @date 2011-05-04
|
||||||
* @author Keyvan Minoukadeh
|
* @author Keyvan Minoukadeh
|
||||||
* @copyright 2011 Keyvan Minoukadeh
|
* @copyright 2011 Keyvan Minoukadeh
|
||||||
* @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
|
* @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
|
||||||
@ -14,13 +14,13 @@
|
|||||||
|
|
||||||
class ContentExtractor
|
class ContentExtractor
|
||||||
{
|
{
|
||||||
const HOSTNAME_REGEX = '/^(([a-zA-Z0-9-]*[a-zA-Z0-9])\.)*([A-Za-z0-9-]*[A-Za-z0-9])$/';
|
|
||||||
protected static $config_cache = array();
|
|
||||||
protected static $tidy_config = array(
|
protected static $tidy_config = array(
|
||||||
'clean' => true,
|
'clean' => true,
|
||||||
'output-xhtml' => true,
|
'output-xhtml' => true,
|
||||||
'logical-emphasis' => true,
|
'logical-emphasis' => true,
|
||||||
'show-body-only' => false,
|
'show-body-only' => false,
|
||||||
|
'new-blocklevel-tags' => 'article, aside, footer, header, hgroup, menu, nav, section, details, datagrid',
|
||||||
|
// Tidy expects a bare comma-separated list of tag names as the value (no "option-name:" prefix)
'new-inline-tags' => 'mark, time, meter, progress',
|
||||||
'wrap' => 0,
|
'wrap' => 0,
|
||||||
'drop-empty-paras' => true,
|
'drop-empty-paras' => true,
|
||||||
'drop-proprietary-attributes' => false,
|
'drop-proprietary-attributes' => false,
|
||||||
@ -31,19 +31,16 @@ class ContentExtractor
|
|||||||
'char-encoding' => 'utf8',
|
'char-encoding' => 'utf8',
|
||||||
'hide-comments' => true
|
'hide-comments' => true
|
||||||
);
|
);
|
||||||
protected $config_path;
|
|
||||||
protected $html;
|
protected $html;
|
||||||
protected $config;
|
protected $config;
|
||||||
protected $title;
|
protected $title;
|
||||||
protected $body;
|
protected $body;
|
||||||
protected $success = false;
|
protected $success = false;
|
||||||
protected $fallback;
|
|
||||||
public $readability;
|
public $readability;
|
||||||
public $debug = false;
|
public $debug = false;
|
||||||
|
|
||||||
function __construct($config_path=null, ContentExtractor $config_fallback=null) {
|
function __construct($path, $fallback=null) {
|
||||||
$this->config_path = $config_path;
|
SiteConfig::set_config_path($path, $fallback);
|
||||||
$this->fallback = $config_fallback;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
protected function debug($msg) {
|
protected function debug($msg) {
|
||||||
@ -66,71 +63,6 @@ class ContentExtractor
|
|||||||
$this->success = false;
|
$this->success = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// returns SiteConfig instance if an appropriate one is found, false otherwise
|
|
||||||
public function get_site_config($host) {
|
|
||||||
$host = strtolower($host);
|
|
||||||
if (substr($host, 0, 4) == 'www.') $host = substr($host, 4);
|
|
||||||
if (!$host || (strlen($host) > 200) || !preg_match(self::HOSTNAME_REGEX, $host)) return false;
|
|
||||||
// check for site configuration
|
|
||||||
$try = array($host);
|
|
||||||
$split = explode('.', $host);
|
|
||||||
if (count($split) > 1) {
|
|
||||||
array_shift($split);
|
|
||||||
$try[] = '.'.implode('.', $split);
|
|
||||||
}
|
|
||||||
foreach ($try as $h) {
|
|
||||||
if (array_key_exists($h, self::$config_cache)) {
|
|
||||||
$this->debug("... cached ($h)");
|
|
||||||
return self::$config_cache[$h];
|
|
||||||
} elseif (file_exists($this->config_path."/$h.txt")) {
|
|
||||||
$this->debug("... from file ($h)");
|
|
||||||
$file = $this->config_path."/$h.txt";
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (!isset($file)) {
|
|
||||||
if (isset($this->fallback)) {
|
|
||||||
$this->debug("... trying fallback ($host)");
|
|
||||||
return $this->fallback->get_site_config($host);
|
|
||||||
} else {
|
|
||||||
$this->debug("... no match ($host)");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
$config_file = file($file, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
|
|
||||||
if (!$config_file || !is_array($config_file)) return false;
|
|
||||||
$config = new SiteConfig();
|
|
||||||
foreach ($config_file as $line) {
|
|
||||||
$line = trim($line);
|
|
||||||
|
|
||||||
// skip comments, empty lines
|
|
||||||
if ($line == '' || $line[0] == '#') continue;
|
|
||||||
|
|
||||||
// get command
|
|
||||||
$command = explode(':', $line, 2);
|
|
||||||
// if there's no colon ':', skip this line
|
|
||||||
if (count($command) != 2) continue;
|
|
||||||
$val = trim($command[1]);
|
|
||||||
$command = trim($command[0]);
|
|
||||||
if ($command == '' || $val == '') continue;
|
|
||||||
|
|
||||||
// check for commands where we accept multiple statements
|
|
||||||
if (in_array($command, array('title', 'body', 'strip', 'strip_id_or_class', 'strip_image_src'))) {
|
|
||||||
array_push($config->$command, $val);
|
|
||||||
// check for single statement commands that evaluate to true or false
|
|
||||||
} elseif (in_array($command, array('tidy', 'prune', 'autodetect_on_failure'))) {
|
|
||||||
$config->$command = ($val == 'yes');
|
|
||||||
// check for single statement commands stored as strings
|
|
||||||
} elseif (in_array($command, array('test_url'))) {
|
|
||||||
$config->$command = $val;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// store copy of config in our static cache array in case we need to process another URL
|
|
||||||
self::$config_cache[$h] = $config;
|
|
||||||
|
|
||||||
return $config;
|
|
||||||
}
|
|
||||||
|
|
||||||
// returns true on success, false on failure
|
// returns true on success, false on failure
|
||||||
// $smart_tidy indicates that if tidy is used and no results are produced, we will
|
// $smart_tidy indicates that if tidy is used and no results are produced, we will
|
||||||
// try again without it. Tidy helps us deal with PHP's patchy HTML parsing most of the time
|
// try again without it. Tidy helps us deal with PHP's patchy HTML parsing most of the time
|
||||||
@ -140,11 +72,12 @@ class ContentExtractor
|
|||||||
|
|
||||||
// extract host name
|
// extract host name
|
||||||
$host = @parse_url($url, PHP_URL_HOST);
|
$host = @parse_url($url, PHP_URL_HOST);
|
||||||
if (!($this->config = $this->get_site_config($host))) {
|
if (!($this->config = SiteConfig::build($host))) {
|
||||||
// no match, so use defaults
|
// no match, so use defaults
|
||||||
$this->config = new SiteConfig();
|
$this->config = new SiteConfig();
|
||||||
self::$config_cache[$host] = $this->config;
|
|
||||||
}
|
}
|
||||||
|
// store copy of config in our static cache array in case we need to process another URL
|
||||||
|
SiteConfig::add_to_cache($host, $this->config);
|
||||||
|
|
||||||
// use tidy (if it exists)?
|
// use tidy (if it exists)?
|
||||||
// This fixes problems with some sites which would otherwise
|
// This fixes problems with some sites which would otherwise
|
||||||
|
@ -47,5 +47,120 @@ class SiteConfig
|
|||||||
|
|
||||||
// Test URL - if present, can be used to test the config above
|
// Test URL - if present, can be used to test the config above
|
||||||
public $test_url = null;
|
public $test_url = null;
|
||||||
|
|
||||||
|
// Single-page link - should identify a link element or URL pointing to the page holding the entire article
|
||||||
|
// This is useful for sites which split their articles across multiple pages. Links to such pages tend to
|
||||||
|
// display the first page with links to the other pages at the bottom. Often there is also a link to a page
|
||||||
|
// which displays the entire article on one page (e.g. 'print view').
|
||||||
|
// This should be an XPath expression identifying the link to that page. If present and we find a match,
|
||||||
|
// we will retrieve that page and the rest of the options in this config will be applied to the new page.
|
||||||
|
public $single_page_link = array();
|
||||||
|
|
||||||
|
// Single-page link in feed? - same as above, but patterns applied to item description HTML taken from feed
|
||||||
|
public $single_page_link_in_feed = array();
|
||||||
|
|
||||||
|
// TODO: which parser to use for turning raw HTML into a DOMDocument
|
||||||
|
public $parser = 'libxml';
|
||||||
|
|
||||||
|
// the options below cannot be set in the config files which this class represents
|
||||||
|
|
||||||
|
public static $debug = false;
|
||||||
|
protected static $config_path;
|
||||||
|
protected static $config_path_fallback;
|
||||||
|
protected static $config_cache = array();
|
||||||
|
const HOSTNAME_REGEX = '/^(([a-zA-Z0-9-]*[a-zA-Z0-9])\.)*([A-Za-z0-9-]*[A-Za-z0-9])$/';
|
||||||
|
|
||||||
|
protected static function debug($msg) {
|
||||||
|
if (self::$debug) {
|
||||||
|
$mem = round(memory_get_usage()/1024, 2);
|
||||||
|
$memPeak = round(memory_get_peak_usage()/1024, 2);
|
||||||
|
echo '* ',$msg;
|
||||||
|
echo ' - mem used: ',$mem," (peak: $memPeak)\n";
|
||||||
|
ob_flush();
|
||||||
|
flush();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public static function set_config_path($path, $fallback=null) {
|
||||||
|
self::$config_path = $path;
|
||||||
|
self::$config_path_fallback = $fallback;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static function add_to_cache($host, SiteConfig $config) {
|
||||||
|
$host = strtolower($host);
|
||||||
|
self::$config_cache[$host] = $config;
|
||||||
|
}
|
||||||
|
|
||||||
|
// returns SiteConfig instance if an appropriate one is found, false otherwise
|
||||||
|
public static function build($host) {
|
||||||
|
$host = strtolower($host);
|
||||||
|
if (substr($host, 0, 4) == 'www.') $host = substr($host, 4);
|
||||||
|
if (!$host || (strlen($host) > 200) || !preg_match(self::HOSTNAME_REGEX, $host)) return false;
|
||||||
|
// check for site configuration
|
||||||
|
$try = array($host);
|
||||||
|
$split = explode('.', $host);
|
||||||
|
if (count($split) > 1) {
|
||||||
|
array_shift($split);
|
||||||
|
$try[] = '.'.implode('.', $split);
|
||||||
|
}
|
||||||
|
foreach ($try as $h) {
|
||||||
|
if (array_key_exists($h, self::$config_cache)) {
|
||||||
|
self::debug("... cached ($h)");
|
||||||
|
return self::$config_cache[$h];
|
||||||
|
} elseif (file_exists(self::$config_path."/$h.txt")) {
|
||||||
|
self::debug("... from file ($h)");
|
||||||
|
$file = self::$config_path."/$h.txt";
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!isset($file)) {
|
||||||
|
if (isset(self::$config_path_fallback)) {
|
||||||
|
self::debug("... trying fallback ($host)");
|
||||||
|
foreach ($try as $h) {
|
||||||
|
if (file_exists(self::$config_path_fallback."/$h.txt")) {
|
||||||
|
self::debug("... from fallback file ($h)");
|
||||||
|
$file = self::$config_path_fallback."/$h.txt";
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!isset($file)) {
|
||||||
|
self::debug("... no match in fallback directory");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
self::debug("... no match ($host)");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
$config_file = file($file, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
|
||||||
|
if (!$config_file || !is_array($config_file)) return false;
|
||||||
|
$config = new SiteConfig();
|
||||||
|
foreach ($config_file as $line) {
|
||||||
|
$line = trim($line);
|
||||||
|
|
||||||
|
// skip comments, empty lines
|
||||||
|
if ($line == '' || $line[0] == '#') continue;
|
||||||
|
|
||||||
|
// get command
|
||||||
|
$command = explode(':', $line, 2);
|
||||||
|
// if there's no colon ':', skip this line
|
||||||
|
if (count($command) != 2) continue;
|
||||||
|
$val = trim($command[1]);
|
||||||
|
$command = trim($command[0]);
|
||||||
|
if ($command == '' || $val == '') continue;
|
||||||
|
|
||||||
|
// check for commands where we accept multiple statements
|
||||||
|
if (in_array($command, array('title', 'body', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed'))) {
|
||||||
|
array_push($config->$command, $val);
|
||||||
|
// check for single statement commands that evaluate to true or false
|
||||||
|
} elseif (in_array($command, array('tidy', 'prune', 'autodetect_on_failure'))) {
|
||||||
|
$config->$command = ($val == 'yes');
|
||||||
|
// check for single statement commands stored as strings
|
||||||
|
} elseif (in_array($command, array('test_url'))) {
|
||||||
|
$config->$command = $val;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return $config;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
?>
|
?>
|
@ -7,8 +7,8 @@
|
|||||||
* For environments which do not have these options, it reverts to standard sequential
|
* For environments which do not have these options, it reverts to standard sequential
|
||||||
* requests (using file_get_contents())
|
* requests (using file_get_contents())
|
||||||
*
|
*
|
||||||
* @version 0.8
|
* @version 0.9.5
|
||||||
* @date 2011-02-28
|
* @date 2011-05-23
|
||||||
* @see http://php.net/HttpRequestPool
|
* @see http://php.net/HttpRequestPool
|
||||||
* @author Keyvan Minoukadeh
|
* @author Keyvan Minoukadeh
|
||||||
* @copyright 2011 Keyvan Minoukadeh
|
* @copyright 2011 Keyvan Minoukadeh
|
||||||
@ -104,6 +104,15 @@ class HumbleHttpAgent
|
|||||||
return $iri->uri;
|
return $iri->uri;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public function removeFragment($url) {
|
||||||
|
$pos = strpos($url, '#');
|
||||||
|
if ($pos === false) {
|
||||||
|
return $url;
|
||||||
|
} else {
|
||||||
|
return substr($url, 0, $pos);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public function enableDebug($bool=true) {
|
public function enableDebug($bool=true) {
|
||||||
$this->debug = (bool)$bool;
|
$this->debug = (bool)$bool;
|
||||||
}
|
}
|
||||||
@ -211,6 +220,7 @@ class HumbleHttpAgent
|
|||||||
} else {
|
} else {
|
||||||
$this->debug("......adding to pool");
|
$this->debug("......adding to pool");
|
||||||
$req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($url) : $url;
|
$req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($url) : $url;
|
||||||
|
$req_url = $this->removeFragment($req_url);
|
||||||
$httpRequest = new HttpRequest($req_url, HttpRequest::METH_GET, $this->requestOptions);
|
$httpRequest = new HttpRequest($req_url, HttpRequest::METH_GET, $this->requestOptions);
|
||||||
// send cookies, if we have any
|
// send cookies, if we have any
|
||||||
if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
|
if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
|
||||||
@ -225,7 +235,11 @@ class HumbleHttpAgent
|
|||||||
// did we get anything into the pool?
|
// did we get anything into the pool?
|
||||||
if (count($pool) > 0) {
|
if (count($pool) > 0) {
|
||||||
$this->debug('Sending request...');
|
$this->debug('Sending request...');
|
||||||
|
try {
|
||||||
$pool->send();
|
$pool->send();
|
||||||
|
} catch (HttpRequestPoolException $e) {
|
||||||
|
// intentionally ignored: processing continues with the per-URL loop below
|
||||||
|
}
|
||||||
$this->debug('Received responses');
|
$this->debug('Received responses');
|
||||||
foreach($subset as $orig => $url) {
|
foreach($subset as $orig => $url) {
|
||||||
if (!$isRedirect) $orig = $url;
|
if (!$isRedirect) $orig = $url;
|
||||||
@ -240,7 +254,9 @@ class HumbleHttpAgent
|
|||||||
// is redirect?
|
// is redirect?
|
||||||
if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && $request->getResponseHeader('location')) {
|
if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && $request->getResponseHeader('location')) {
|
||||||
$redirectURL = $request->getResponseHeader('location');
|
$redirectURL = $request->getResponseHeader('location');
|
||||||
|
if (!preg_match('!^https?://!i', $redirectURL)) {
|
||||||
$redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
|
$redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
|
||||||
|
}
|
||||||
if ($this->validateURL($redirectURL)) {
|
if ($this->validateURL($redirectURL)) {
|
||||||
$this->debug('Redirect detected. Valid URL: '.$redirectURL);
|
$this->debug('Redirect detected. Valid URL: '.$redirectURL);
|
||||||
// store any cookies
|
// store any cookies
|
||||||
@ -298,6 +314,7 @@ class HumbleHttpAgent
|
|||||||
} else {
|
} else {
|
||||||
$this->debug("......adding to pool");
|
$this->debug("......adding to pool");
|
||||||
$req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($url) : $url;
|
$req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($url) : $url;
|
||||||
|
$req_url = $this->removeFragment($req_url);
|
||||||
$headers = array();
|
$headers = array();
|
||||||
// send cookies, if we have any
|
// send cookies, if we have any
|
||||||
if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
|
if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
|
||||||
@ -327,7 +344,9 @@ class HumbleHttpAgent
|
|||||||
$status_code = $this->requests[$orig]['status_code'];
|
$status_code = $this->requests[$orig]['status_code'];
|
||||||
if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) {
|
if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) {
|
||||||
$redirectURL = $this->requests[$orig]['location'];
|
$redirectURL = $this->requests[$orig]['location'];
|
||||||
|
if (!preg_match('!^https?://!i', $redirectURL)) {
|
||||||
$redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
|
$redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
|
||||||
|
}
|
||||||
if ($this->validateURL($redirectURL)) {
|
if ($this->validateURL($redirectURL)) {
|
||||||
$this->debug('Redirect detected. Valid URL: '.$redirectURL);
|
$this->debug('Redirect detected. Valid URL: '.$redirectURL);
|
||||||
// store any cookies
|
// store any cookies
|
||||||
@ -367,6 +386,7 @@ class HumbleHttpAgent
|
|||||||
$this->debug("Sending request for $url");
|
$this->debug("Sending request for $url");
|
||||||
$this->requests[$orig]['original_url'] = $orig;
|
$this->requests[$orig]['original_url'] = $orig;
|
||||||
$req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($url) : $url;
|
$req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($url) : $url;
|
||||||
|
$req_url = $this->removeFragment($req_url);
|
||||||
// send cookies, if we have any
|
// send cookies, if we have any
|
||||||
$httpContext = $this->httpContext;
|
$httpContext = $this->httpContext;
|
||||||
if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
|
if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
|
||||||
@ -391,7 +411,9 @@ class HumbleHttpAgent
|
|||||||
}
|
}
|
||||||
if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) {
|
if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) {
|
||||||
$redirectURL = $this->requests[$orig]['location'];
|
$redirectURL = $this->requests[$orig]['location'];
|
||||||
|
if (!preg_match('!^https?://!i', $redirectURL)) {
|
||||||
$redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
|
$redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
|
||||||
|
}
|
||||||
if ($this->validateURL($redirectURL)) {
|
if ($this->validateURL($redirectURL)) {
|
||||||
$this->debug('Redirect detected. Valid URL: '.$redirectURL);
|
$this->debug('Redirect detected. Valid URL: '.$redirectURL);
|
||||||
// store any cookies
|
// store any cookies
|
||||||
@ -444,6 +466,7 @@ class HumbleHttpAgent
|
|||||||
}
|
}
|
||||||
|
|
||||||
public function get($url, $remove=false) {
|
public function get($url, $remove=false) {
|
||||||
|
$url = "$url";
|
||||||
if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) {
|
if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) {
|
||||||
$this->debug("URL already fetched - in memory ($url, effective: {$this->requests[$url]['effective_url']})");
|
$this->debug("URL already fetched - in memory ($url, effective: {$this->requests[$url]['effective_url']})");
|
||||||
$response = $this->requests[$url];
|
$response = $this->requests[$url];
|
||||||
|
@ -3,8 +3,8 @@
|
|||||||
// Author: Keyvan Minoukadeh
|
// Author: Keyvan Minoukadeh
|
||||||
// Copyright (c) 2011 Keyvan Minoukadeh
|
// Copyright (c) 2011 Keyvan Minoukadeh
|
||||||
// License: AGPLv3
|
// License: AGPLv3
|
||||||
// Version: 2.7
|
// Version: 2.8
|
||||||
// Date: 2011-03-21
|
// Date: 2011-05-23
|
||||||
|
|
||||||
/*
|
/*
|
||||||
This program is free software: you can redistribute it and/or modify
|
This program is free software: you can redistribute it and/or modify
|
||||||
@ -73,131 +73,11 @@ function __autoload($class_name) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
function url_allowed($url) {
|
|
||||||
global $options;
|
|
||||||
if (!empty($options->allowed_urls)) {
|
|
||||||
$allowed = false;
|
|
||||||
foreach ($options->allowed_urls as $allowurl) {
|
|
||||||
if (stristr($url, $allowurl) !== false) {
|
|
||||||
$allowed = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (!$allowed) return false;
|
|
||||||
} else {
|
|
||||||
foreach ($options->blocked_urls as $blockurl) {
|
|
||||||
if (stristr($url, $blockurl) !== false) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
////////////////////////////////
|
////////////////////////////////
|
||||||
// Load config file if it exists
|
// Load config file if it exists
|
||||||
////////////////////////////////
|
////////////////////////////////
|
||||||
require_once(dirname(__FILE__).'/config.php');
|
require_once(dirname(__FILE__).'/config.php');
|
||||||
|
|
||||||
//////////////////////////////////////////////
|
|
||||||
// Convert $html to UTF8
|
|
||||||
// (uses HTTP headers and HTML to find encoding)
|
|
||||||
// adapted from http://stackoverflow.com/questions/910793/php-detect-encoding-and-make-everything-utf-8
|
|
||||||
//////////////////////////////////////////////
|
|
||||||
function convert_to_utf8($html, $header=null)
|
|
||||||
{
|
|
||||||
$encoding = null;
|
|
||||||
if ($html || $header) {
|
|
||||||
if (is_array($header)) $header = implode("\n", $header);
|
|
||||||
if (!$header || !preg_match_all('/^Content-Type:\s+([^;]+)(?:;\s*charset=["\']?([^;"\'\n]*))?/im', $header, $match, PREG_SET_ORDER)) {
|
|
||||||
// error parsing the response
|
|
||||||
} else {
|
|
||||||
$match = end($match); // get last matched element (in case of redirects)
|
|
||||||
if (isset($match[2])) $encoding = trim($match[2], '"\'');
|
|
||||||
}
|
|
||||||
if (!$encoding) {
|
|
||||||
if (preg_match('/^<\?xml\s+version=(?:"[^"]*"|\'[^\']*\')\s+encoding=("[^"]*"|\'[^\']*\')/s', $html, $match)) {
|
|
||||||
$encoding = trim($match[1], '"\'');
|
|
||||||
} elseif(preg_match('/<meta\s+http-equiv=["\']Content-Type["\'] content=["\'][^;]+;\s*charset=["\']?([^;"\'>]+)/i', $html, $match)) {
|
|
||||||
if (isset($match[1])) $encoding = trim($match[1]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (!$encoding) {
|
|
||||||
$encoding = 'utf-8';
|
|
||||||
} else {
|
|
||||||
if (strtolower($encoding) != 'utf-8') {
|
|
||||||
if (strtolower($encoding) == 'iso-8859-1') {
|
|
||||||
// replace MS Word smart qutoes
|
|
||||||
$trans = array();
|
|
||||||
$trans[chr(130)] = '‚'; // Single Low-9 Quotation Mark
|
|
||||||
$trans[chr(131)] = 'ƒ'; // Latin Small Letter F With Hook
|
|
||||||
$trans[chr(132)] = '„'; // Double Low-9 Quotation Mark
|
|
||||||
$trans[chr(133)] = '…'; // Horizontal Ellipsis
|
|
||||||
$trans[chr(134)] = '†'; // Dagger
|
|
||||||
$trans[chr(135)] = '‡'; // Double Dagger
|
|
||||||
$trans[chr(136)] = 'ˆ'; // Modifier Letter Circumflex Accent
|
|
||||||
$trans[chr(137)] = '‰'; // Per Mille Sign
|
|
||||||
$trans[chr(138)] = 'Š'; // Latin Capital Letter S With Caron
|
|
||||||
$trans[chr(139)] = '‹'; // Single Left-Pointing Angle Quotation Mark
|
|
||||||
$trans[chr(140)] = 'Œ'; // Latin Capital Ligature OE
|
|
||||||
$trans[chr(145)] = '‘'; // Left Single Quotation Mark
|
|
||||||
$trans[chr(146)] = '’'; // Right Single Quotation Mark
|
|
||||||
$trans[chr(147)] = '“'; // Left Double Quotation Mark
|
|
||||||
$trans[chr(148)] = '”'; // Right Double Quotation Mark
|
|
||||||
$trans[chr(149)] = '•'; // Bullet
|
|
||||||
$trans[chr(150)] = '–'; // En Dash
|
|
||||||
$trans[chr(151)] = '—'; // Em Dash
|
|
||||||
$trans[chr(152)] = '˜'; // Small Tilde
|
|
||||||
$trans[chr(153)] = '™'; // Trade Mark Sign
|
|
||||||
$trans[chr(154)] = 'š'; // Latin Small Letter S With Caron
|
|
||||||
$trans[chr(155)] = '›'; // Single Right-Pointing Angle Quotation Mark
|
|
||||||
$trans[chr(156)] = 'œ'; // Latin Small Ligature OE
|
|
||||||
$trans[chr(159)] = 'Ÿ'; // Latin Capital Letter Y With Diaeresis
|
|
||||||
$html = strtr($html, $trans);
|
|
||||||
}
|
|
||||||
$html = SimplePie_Misc::change_encoding($html, $encoding, 'utf-8');
|
|
||||||
|
|
||||||
/*
|
|
||||||
if (function_exists('iconv')) {
|
|
||||||
// iconv appears to handle certain character encodings better than mb_convert_encoding
|
|
||||||
$html = iconv($encoding, 'utf-8', $html);
|
|
||||||
} else {
|
|
||||||
$html = mb_convert_encoding($html, 'utf-8', $encoding);
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return $html;
|
|
||||||
}
|
|
||||||
|
|
||||||
function makeAbsolute($base, $elem) {
|
|
||||||
$base = new IRI($base);
|
|
||||||
foreach(array('a'=>'href', 'img'=>'src') as $tag => $attr) {
|
|
||||||
$elems = $elem->getElementsByTagName($tag);
|
|
||||||
for ($i = $elems->length-1; $i >= 0; $i--) {
|
|
||||||
$e = $elems->item($i);
|
|
||||||
//$e->parentNode->replaceChild($articleContent->ownerDocument->createTextNode($e->textContent), $e);
|
|
||||||
makeAbsoluteAttr($base, $e, $attr);
|
|
||||||
}
|
|
||||||
if (strtolower($elem->tagName) == $tag) makeAbsoluteAttr($base, $elem, $attr);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
function makeAbsoluteAttr($base, $e, $attr) {
|
|
||||||
if ($e->hasAttribute($attr)) {
|
|
||||||
// Trim leading and trailing white space. I don't really like this but
|
|
||||||
// unfortunately it does appear on some sites. e.g. <img src=" /path/to/image.jpg" />
|
|
||||||
$url = trim(str_replace('%20', ' ', $e->getAttribute($attr)));
|
|
||||||
$url = str_replace(' ', '%20', $url);
|
|
||||||
if (!preg_match('!https?://!i', $url)) {
|
|
||||||
$absolute = IRI::absolutize($base, $url);
|
|
||||||
if ($absolute) {
|
|
||||||
$e->setAttribute($attr, $absolute);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
////////////////////////////////
|
////////////////////////////////
|
||||||
// Check if service is enabled
|
// Check if service is enabled
|
||||||
////////////////////////////////
|
////////////////////////////////
|
||||||
@ -211,7 +91,7 @@ if (!$options->enabled) {
|
|||||||
if (!isset($_GET['url'])) {
|
if (!isset($_GET['url'])) {
|
||||||
die('No URL supplied');
|
die('No URL supplied');
|
||||||
}
|
}
|
||||||
$url = $_GET['url'];
|
$url = trim($_GET['url']);
|
||||||
if (!preg_match('!^https?://.+!i', $url)) {
|
if (!preg_match('!^https?://.+!i', $url)) {
|
||||||
$url = 'http://'.$url;
|
$url = 'http://'.$url;
|
||||||
}
|
}
|
||||||
@ -240,6 +120,7 @@ if ($options->alternative_url != '' && !isset($_GET['redir']) && mt_rand(0, 100)
|
|||||||
if (isset($_GET['links'])) $redirect .= '&links='.$_GET['links'];
|
if (isset($_GET['links'])) $redirect .= '&links='.$_GET['links'];
|
||||||
if (isset($_GET['exc'])) $redirect .= '&exc='.$_GET['exc'];
|
if (isset($_GET['exc'])) $redirect .= '&exc='.$_GET['exc'];
|
||||||
if (isset($_GET['what'])) $redirect .= '&what='.$_GET['what'];
|
if (isset($_GET['what'])) $redirect .= '&what='.$_GET['what'];
|
||||||
|
if (isset($_GET['format'])) $redirect .= '&format='.$_GET['format'];
|
||||||
header("Location: $redirect");
|
header("Location: $redirect");
|
||||||
exit;
|
exit;
|
||||||
}
|
}
|
||||||
@ -258,6 +139,7 @@ if (isset($_GET['key']) && ($key_index = array_search($_GET['key'], $options->ap
|
|||||||
if (isset($_GET['links'])) $redirect .= '&links='.urlencode($_GET['links']);
|
if (isset($_GET['links'])) $redirect .= '&links='.urlencode($_GET['links']);
|
||||||
if (isset($_GET['exc'])) $redirect .= '&exc='.urlencode($_GET['exc']);
|
if (isset($_GET['exc'])) $redirect .= '&exc='.urlencode($_GET['exc']);
|
||||||
if (isset($_GET['what'])) $redirect .= '&what='.urlencode($_GET['what']);
|
if (isset($_GET['what'])) $redirect .= '&what='.urlencode($_GET['what']);
|
||||||
|
if (isset($_GET['format'])) $redirect .= '&format='.urlencode($_GET['format']);
|
||||||
header("Location: $redirect");
|
header("Location: $redirect");
|
||||||
exit;
|
exit;
|
||||||
}
|
}
|
||||||
@ -364,9 +246,13 @@ if (($extract_pattern != '') && ($extract_pattern != 'auto')) {
|
|||||||
|
|
||||||
/////////////////////////////////////
|
/////////////////////////////////////
|
||||||
// Check for valid format
|
// Check for valid format
|
||||||
// (stick to RSS for the time being)
|
// (stick to RSS (or RSS as JSON) for the time being)
|
||||||
/////////////////////////////////////
|
/////////////////////////////////////
|
||||||
$format = 'rss';
|
if (isset($_GET['format']) && $_GET['format'] == 'json') {
|
||||||
|
$format = 'json';
|
||||||
|
} else {
|
||||||
|
$format = 'rss';
|
||||||
|
}
|
||||||
|
|
||||||
//////////////////////////////////
|
//////////////////////////////////
|
||||||
// Check for cached copy
|
// Check for cached copy
|
||||||
@ -392,10 +278,14 @@ if ($options->caching) {
|
|||||||
|
|
||||||
// getting a Zend_Cache_Core object
|
// getting a Zend_Cache_Core object
|
||||||
$cache = Zend_Cache::factory('Core', 'File', $frontendOptions, $backendOptions);
|
$cache = Zend_Cache::factory('Core', 'File', $frontendOptions, $backendOptions);
|
||||||
$cache_id = md5($max.$url.$valid_key.$links.$exclude_on_fail.$auto_extract.$extract_pattern.(int)isset($_GET['pubsub']));
|
$cache_id = md5($max.$url.$valid_key.$links.$exclude_on_fail.$auto_extract.$extract_pattern.$format.(int)isset($_GET['pubsub']));
|
||||||
|
|
||||||
if ($data = $cache->load($cache_id)) {
|
if ($data = $cache->load($cache_id)) {
|
||||||
|
if ($format == 'json') {
|
||||||
|
header("Content-type: application/json; charset=UTF-8");
|
||||||
|
} else {
|
||||||
header("Content-type: text/xml; charset=UTF-8");
|
header("Content-type: text/xml; charset=UTF-8");
|
||||||
|
}
|
||||||
if (headers_sent()) die('Some data has already been output, can\'t send RSS file');
|
if (headers_sent()) die('Some data has already been output, can\'t send RSS file');
|
||||||
echo $data;
|
echo $data;
|
||||||
exit;
|
exit;
|
||||||
@ -419,7 +309,7 @@ $http = new HumbleHttpAgent();
|
|||||||
//////////////////////////////////
|
//////////////////////////////////
|
||||||
// Set up Content Extractor
|
// Set up Content Extractor
|
||||||
//////////////////////////////////
|
//////////////////////////////////
|
||||||
$extractor = new ContentExtractor(dirname(__FILE__).'/site_config/custom', new ContentExtractor(dirname(__FILE__).'/site_config/standard'));
|
$extractor = new ContentExtractor(dirname(__FILE__).'/site_config/custom', dirname(__FILE__).'/site_config/standard');
|
||||||
|
|
||||||
/*
|
/*
|
||||||
if ($options->caching) {
|
if ($options->caching) {
|
||||||
@ -453,7 +343,8 @@ if (!$html_only) {
|
|||||||
SimplePie_HumbleHttpAgent::set_agent($http);
|
SimplePie_HumbleHttpAgent::set_agent($http);
|
||||||
$feed = new SimplePie();
|
$feed = new SimplePie();
|
||||||
$feed->set_file_class('SimplePie_HumbleHttpAgent');
|
$feed->set_file_class('SimplePie_HumbleHttpAgent');
|
||||||
$feed->set_feed_url($url);
|
//$feed->set_feed_url($url); // colons appearing in the URL's path get encoded
|
||||||
|
$feed->feed_url = $url;
|
||||||
$feed->set_autodiscovery_level(SIMPLEPIE_LOCATOR_NONE);
|
$feed->set_autodiscovery_level(SIMPLEPIE_LOCATOR_NONE);
|
||||||
$feed->set_timeout(20);
|
$feed->set_timeout(20);
|
||||||
$feed->enable_cache(false);
|
$feed->enable_cache(false);
|
||||||
@ -471,97 +362,34 @@ if (!$html_only) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
// Extract content from HTML (if URL is not feed or explicit HTML request has been made)
|
// Our given URL is not a feed, so let's create our own feed with a single item:
|
||||||
|
// the given URL. This basically treats all non-feed URLs as if they were
|
||||||
|
// single-item feeds.
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
$isDummyFeed = false;
|
||||||
if ($html_only || !$result) {
|
if ($html_only || !$result) {
|
||||||
|
$isDummyFeed = true;
|
||||||
unset($feed, $result);
|
unset($feed, $result);
|
||||||
if ($response = $http->get($url)) {
|
// create single item dummy feed object
|
||||||
$effective_url = $response['effective_url'];
|
class DummySingleItemFeed {
|
||||||
if (!url_allowed($effective_url)) die('URL blocked');
|
public $item;
|
||||||
$html = $response['body'];
|
function __construct($url) { $this->item = new DummySingleItem($url); }
|
||||||
// remove strange things here
|
public function get_title() { return ''; }
|
||||||
$html = str_replace('</[>', '', $html);
|
public function get_description() { return 'Content extracted from '.$this->item->url; }
|
||||||
$html = convert_to_utf8($html, $response['headers']);
|
public function get_link() { return $this->item->url; }
|
||||||
|
public function get_image_url() { return false; }
|
||||||
|
public function get_items($start=0, $max=1) { return array(0=>$this->item); }
|
||||||
}
|
}
|
||||||
if (!$response || $response['status_code'] >= 300) {
|
class DummySingleItem {
|
||||||
die('Error retrieving '.$url);
|
public $url;
|
||||||
|
function __construct($url) { $this->url = $url; }
|
||||||
|
public function get_permalink() { return $this->url; }
|
||||||
|
public function get_title() { return ''; }
|
||||||
|
public function get_date($format='') { return false; }
|
||||||
|
public function get_author() { return false; }
|
||||||
|
public function get_description() { return ''; }
|
||||||
}
|
}
|
||||||
if ($auto_extract) {
|
$feed = new DummySingleItemFeed($url);
|
||||||
$extract_result = $extractor->process($html, $effective_url);
|
|
||||||
if (!$extract_result) die($options->error_message);
|
|
||||||
$readability = $extractor->readability;
|
|
||||||
$content_block = $extractor->getContent();
|
|
||||||
$title = $extractor->getTitle();
|
|
||||||
} else {
|
|
||||||
$readability = new Readability($html, $effective_url);
|
|
||||||
// content block is entire document
|
|
||||||
$content_block = $readability->dom;
|
|
||||||
//TODO: get title
|
|
||||||
$title = '';
|
|
||||||
}
|
|
||||||
if ($extract_pattern) {
|
|
||||||
$xpath = new DOMXPath($readability->dom);
|
|
||||||
$elems = @$xpath->query($extract_pattern, $content_block);
|
|
||||||
// check if our custom extraction pattern matched
|
|
||||||
if ($elems && $elems->length > 0) {
|
|
||||||
// get the first matched element
|
|
||||||
$content_block = $elems->item(0);
|
|
||||||
// clean it up
|
|
||||||
$readability->removeScripts($content_block);
|
|
||||||
$readability->prepArticle($content_block);
|
|
||||||
} else {
|
|
||||||
die($options->error_message);
|
|
||||||
//$content_block = $readability->dom->createElement('p', 'Sorry, could not extract content');
|
|
||||||
}
|
|
||||||
}
|
|
||||||
$readability->clean($content_block, 'select');
|
|
||||||
if ($options->rewrite_relative_urls) makeAbsolute($effective_url, $content_block);
|
|
||||||
// footnotes
|
|
||||||
if (($links == 'footnotes') && (strpos($effective_url, 'wikipedia.org') === false)) {
|
|
||||||
$readability->addFootnotes($content_block);
|
|
||||||
}
|
|
||||||
if ($extract_pattern) {
|
|
||||||
// get outerHTML
|
|
||||||
$content = $content_block->ownerDocument->saveXML($content_block);
|
|
||||||
} else {
|
|
||||||
if ($content_block->childNodes->length == 1 && $content_block->firstChild->nodeType === XML_ELEMENT_NODE) {
|
|
||||||
$content = $content_block->firstChild->innerHTML;
|
|
||||||
} else {
|
|
||||||
$content = $content_block->innerHTML;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if ($links == 'remove') {
|
|
||||||
$content = preg_replace('!</?a[^>]*>!', '', $content);
|
|
||||||
}
|
|
||||||
if (!$valid_key) {
|
|
||||||
$content = $options->message_to_prepend.$content;
|
|
||||||
$content .= $options->message_to_append;
|
|
||||||
} else {
|
|
||||||
$content = $options->message_to_prepend_with_key.$content;
|
|
||||||
$content .= $options->message_to_append_with_key;
|
|
||||||
}
|
|
||||||
unset($readability, $html);
|
|
||||||
$output = new FeedWriter(); //ATOM an option
|
|
||||||
$output->setTitle($title);
|
|
||||||
$output->setDescription("Content extracted from $url");
|
|
||||||
$output->setXsl('css/feed.xsl'); // Chrome uses this, most browsers ignore it
|
|
||||||
if ($format == 'atom') {
|
|
||||||
$output->setChannelElement('updated', date(DATE_ATOM));
|
|
||||||
$output->setChannelElement('author', array('name'=>'Five Filters', 'uri'=>'http://fivefilters.org'));
|
|
||||||
}
|
|
||||||
$output->setLink($url);
|
|
||||||
$newitem = $output->createNewItem();
|
|
||||||
$newitem->setTitle($title);
|
|
||||||
$newitem->setLink($url);
|
|
||||||
if ($format == 'atom') {
|
|
||||||
$newitem->setDate(time());
|
|
||||||
$newitem->addElement('content', $content);
|
|
||||||
} else {
|
|
||||||
$newitem->setDescription($content);
|
|
||||||
}
|
|
||||||
$output->addItem($newitem);
|
|
||||||
$output->genarateFeed();
|
|
||||||
exit;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////
|
////////////////////////////////////////////
|
||||||
@ -594,6 +422,8 @@ $urls_sanitized = array();
|
|||||||
$urls = array();
|
$urls = array();
|
||||||
foreach ($items as $key => $item) {
|
foreach ($items as $key => $item) {
|
||||||
$permalink = htmlspecialchars_decode($item->get_permalink());
|
$permalink = htmlspecialchars_decode($item->get_permalink());
|
||||||
|
// Colons in URL path segments get encoded by SimplePie, yet some sites expect them unencoded
|
||||||
|
$permalink = str_replace('%3A', ':', $permalink);
|
||||||
$permalink = $http->validateUrl($permalink);
|
$permalink = $http->validateUrl($permalink);
|
||||||
if ($permalink) {
|
if ($permalink) {
|
||||||
$urls_sanitized[] = $permalink;
|
$urls_sanitized[] = $permalink;
|
||||||
@ -625,17 +455,34 @@ foreach ($items as $key => $item) {
|
|||||||
$effective_url = $response['effective_url'];
|
$effective_url = $response['effective_url'];
|
||||||
if (!url_allowed($effective_url)) continue;
|
if (!url_allowed($effective_url)) continue;
|
||||||
$html = $response['body'];
|
$html = $response['body'];
|
||||||
// remove strange things here
|
// remove strange things
|
||||||
$html = str_replace('</[>', '', $html);
|
$html = str_replace('</[>', '', $html);
|
||||||
$html = convert_to_utf8($html, $response['headers']);
|
$html = convert_to_utf8($html, $response['headers']);
|
||||||
if ($auto_extract) {
|
if ($auto_extract) {
|
||||||
|
// check site config for single page URL - fetch it if found
|
||||||
|
if ($single_page_response = getSinglePage($item, $html, $effective_url)) {
|
||||||
|
$html = $single_page_response['body'];
|
||||||
|
// remove strange things
|
||||||
|
$html = str_replace('</[>', '', $html);
|
||||||
|
$html = convert_to_utf8($html, $single_page_response['headers']);
|
||||||
|
$effective_url = $single_page_response['effective_url'];
|
||||||
|
unset($single_page_response);
|
||||||
|
}
|
||||||
$extract_result = $extractor->process($html, $effective_url);
|
$extract_result = $extractor->process($html, $effective_url);
|
||||||
$readability = $extractor->readability;
|
$readability = $extractor->readability;
|
||||||
$content_block = ($extract_result) ? $extractor->getContent() : null;
|
$content_block = ($extract_result) ? $extractor->getContent() : null;
|
||||||
|
$title = ($extract_result) ? $extractor->getTitle() : '';
|
||||||
} else {
|
} else {
|
||||||
$readability = new Readability($html, $effective_url);
|
$readability = new Readability($html, $effective_url);
|
||||||
// content block is entire document (for now...)
|
// content block is entire document (for now...)
|
||||||
$content_block = $readability->dom;
|
$content_block = $readability->dom;
|
||||||
|
//TODO: get title
|
||||||
|
$title = '';
|
||||||
|
}
|
||||||
|
// use extracted title for both feed and item title if we're using single-item dummy feed
|
||||||
|
if ($isDummyFeed) {
|
||||||
|
$output->setTitle($title);
|
||||||
|
$newitem->setTitle($title);
|
||||||
}
|
}
|
||||||
if ($extract_pattern && isset($content_block)) {
|
if ($extract_pattern && isset($content_block)) {
|
||||||
$xpath = new DOMXPath($readability->dom);
|
$xpath = new DOMXPath($readability->dom);
|
||||||
@ -684,11 +531,11 @@ foreach ($items as $key => $item) {
|
|||||||
$html = preg_replace('!</?a[^>]*>!', '', $html);
|
$html = preg_replace('!</?a[^>]*>!', '', $html);
|
||||||
}
|
}
|
||||||
if (!$valid_key) {
|
if (!$valid_key) {
|
||||||
$html = $options->message_to_prepend.$html;
|
$html = make_substitutions($options->message_to_prepend).$html;
|
||||||
$html .= $options->message_to_append;
|
$html .= make_substitutions($options->message_to_append);
|
||||||
} else {
|
} else {
|
||||||
$html = $options->message_to_prepend_with_key.$html;
|
$html = make_substitutions($options->message_to_prepend_with_key).$html;
|
||||||
$html .= $options->message_to_append_with_key;
|
$html .= make_substitutions($options->message_to_append_with_key);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if ($format == 'atom') {
|
if ($format == 'atom') {
|
||||||
@ -715,14 +562,215 @@ foreach ($items as $key => $item) {
|
|||||||
unset($html);
|
unset($html);
|
||||||
}
|
}
|
||||||
// output feed
|
// output feed
|
||||||
if ($options->caching) {
|
if ($options->caching || $format == 'json') {
|
||||||
ob_start();
|
ob_start();
|
||||||
$output->genarateFeed();
|
$output->genarateFeed();
|
||||||
$output = ob_get_contents();
|
$output = ob_get_contents();
|
||||||
ob_end_clean();
|
ob_end_clean();
|
||||||
$cache->save($output, $cache_id);
|
if ($format == 'json') {
|
||||||
|
$jsonrss = new stdClass();
|
||||||
|
$jsonrss->rss = @simplexml_load_string($output);
|
||||||
|
$output = json_encode($jsonrss);
|
||||||
|
header("Content-type: application/json; charset=UTF-8");
|
||||||
|
}
|
||||||
|
if ($options->caching) $cache->save($output, $cache_id);
|
||||||
echo $output;
|
echo $output;
|
||||||
} else {
|
} else {
|
||||||
$output->genarateFeed();
|
$output->genarateFeed();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
///////////////////////////////
|
||||||
|
// HELPER FUNCTIONS
|
||||||
|
///////////////////////////////
|
||||||
|
|
||||||
|
/**
 * Decide whether a URL may be processed, based on the configured URL lists.
 *
 * If $options->allowed_urls is non-empty it acts as a whitelist: the URL must
 * contain at least one entry and the blocked list is not consulted at all.
 * Otherwise $options->blocked_urls acts as a blacklist: the URL must not
 * contain any of its entries. Matching is a case-insensitive substring test.
 *
 * @param string $url URL to check
 * @return bool true if the URL is allowed, false otherwise
 */
function url_allowed($url) {
	global $options;
	if (!empty($options->allowed_urls)) {
		// whitelist mode
		foreach ((array)$options->allowed_urls as $allowurl) {
			$allowurl = (string)$allowurl;
			// An empty pattern would match everything (PHP 8) or raise a
			// warning (older versions), so it is ignored.
			if ($allowurl !== '' && stripos($url, $allowurl) !== false) {
				return true;
			}
		}
		return false;
	}
	// blacklist mode (the list may be missing or empty)
	if (!empty($options->blocked_urls)) {
		foreach ((array)$options->blocked_urls as $blockurl) {
			$blockurl = (string)$blockurl;
			if ($blockurl !== '' && stripos($url, $blockurl) !== false) {
				return false;
			}
		}
	}
	return true;
}
|
||||||
|
|
||||||
|
//////////////////////////////////////////////
|
||||||
|
// Convert $html to UTF8
|
||||||
|
// (uses HTTP headers and HTML to find encoding)
|
||||||
|
// adapted from http://stackoverflow.com/questions/910793/php-detect-encoding-and-make-everything-utf-8
|
||||||
|
//////////////////////////////////////////////
|
||||||
|
/**
 * Convert an HTML/XML document to UTF-8.
 *
 * The source encoding is looked up in this order:
 *   1. charset parameter of the last Content-Type HTTP header
 *   2. encoding attribute of a leading XML declaration
 *   3. <meta http-equiv="Content-Type" content="...; charset=...">
 *   4. HTML5 <meta charset="...">
 * When no encoding is found, or it is already UTF-8, $html is returned as is.
 *
 * @param string            $html   document body
 * @param string|array|null $header raw HTTP header(s); an array is joined with "\n"
 * @return string the (possibly converted) document
 */
function convert_to_utf8($html, $header=null)
{
	if (!$html && !$header) return $html;

	$encoding = null;
	if (is_array($header)) $header = implode("\n", $header);

	// 1. HTTP header. The media type must stay on the header's own line, and
	//    white space after the colon is optional.
	if ($header && preg_match_all('/^Content-Type:[ \t]*([^;\r\n]+)(?:;\s*charset=["\']?([^;"\'\r\n]*))?/im', $header, $match, PREG_SET_ORDER)) {
		$match = end($match); // get last matched element (in case of redirects)
		if (isset($match[2])) $encoding = trim($match[2], "\"' \t");
	}

	// 2-4. Declarations inside the document itself
	if (!$encoding) {
		if (preg_match('/^<\?xml\s+version=(?:"[^"]*"|\'[^\']*\')\s+encoding=("[^"]*"|\'[^\']*\')/s', $html, $match)) {
			$encoding = trim($match[1], '"\'');
		} elseif (preg_match('/<meta\s+http-equiv=["\']Content-Type["\'] content=["\'][^;]+;\s*charset=["\']?([^;"\'>]+)/i', $html, $match)) {
			$encoding = trim($match[1]);
		} elseif (preg_match('/<meta\s+charset=["\']?([^"\'\s\/>]+)/i', $html, $match)) {
			$encoding = trim($match[1]);
		}
	}

	// nothing declared: leave the document untouched
	if (!$encoding) return $html;

	$encoding_lc = strtolower($encoding);
	if ($encoding_lc == 'utf-8' || $encoding_lc == 'utf8') return $html;

	if ($encoding_lc == 'iso-8859-1') {
		// replace MS Word smart quotes
		// NOTE(review): the replacements are inserted before change_encoding()
		// runs, so they go through the conversion too - confirm the literals
		// below survive it (presumably HTML entities in the original file).
		$trans = array();
		$trans[chr(130)] = '‚'; // Single Low-9 Quotation Mark
		$trans[chr(131)] = 'ƒ'; // Latin Small Letter F With Hook
		$trans[chr(132)] = '„'; // Double Low-9 Quotation Mark
		$trans[chr(133)] = '…'; // Horizontal Ellipsis
		$trans[chr(134)] = '†'; // Dagger
		$trans[chr(135)] = '‡'; // Double Dagger
		$trans[chr(136)] = 'ˆ'; // Modifier Letter Circumflex Accent
		$trans[chr(137)] = '‰'; // Per Mille Sign
		$trans[chr(138)] = 'Š'; // Latin Capital Letter S With Caron
		$trans[chr(139)] = '‹'; // Single Left-Pointing Angle Quotation Mark
		$trans[chr(140)] = 'Œ'; // Latin Capital Ligature OE
		$trans[chr(145)] = '‘'; // Left Single Quotation Mark
		$trans[chr(146)] = '’'; // Right Single Quotation Mark
		$trans[chr(147)] = '“'; // Left Double Quotation Mark
		$trans[chr(148)] = '”'; // Right Double Quotation Mark
		$trans[chr(149)] = '•'; // Bullet
		$trans[chr(150)] = '–'; // En Dash
		$trans[chr(151)] = '—'; // Em Dash
		$trans[chr(152)] = '˜'; // Small Tilde
		$trans[chr(153)] = '™'; // Trade Mark Sign
		$trans[chr(154)] = 'š'; // Latin Small Letter S With Caron
		$trans[chr(155)] = '›'; // Single Right-Pointing Angle Quotation Mark
		$trans[chr(156)] = 'œ'; // Latin Small Ligature OE
		$trans[chr(159)] = 'Ÿ'; // Latin Capital Letter Y With Diaeresis
		$html = strtr($html, $trans);
	}

	return SimplePie_Misc::change_encoding($html, $encoding, 'utf-8');
}
|
||||||
|
|
||||||
|
/**
 * Rewrite relative link and image URLs inside a DOM node as absolute URLs.
 *
 * Handles the href attribute of <a> and the src attribute of <img>, for every
 * descendant of $elem and for $elem itself when it is one of those elements.
 *
 * @param string $base base URL to resolve against
 * @param DOMNode $elem element or document to process (anything offering
 *                      getElementsByTagName())
 * @return void
 */
function makeAbsolute($base, $elem) {
	$base = new IRI($base);
	// remove '//' in URL path (causes URLs not to resolve properly)
	if (isset($base->ipath)) $base->ipath = preg_replace('!//+!', '/', $base->ipath);
	// Only element nodes have a tagName; a whole document may be passed in
	// as the content block, and must not be inspected as if it were an element.
	$own_tag = ($elem instanceof DOMElement) ? strtolower($elem->tagName) : null;
	foreach (array('a'=>'href', 'img'=>'src') as $tag => $attr) {
		$elems = $elem->getElementsByTagName($tag);
		for ($i = $elems->length-1; $i >= 0; $i--) {
			makeAbsoluteAttr($base, $elems->item($i), $attr);
		}
		if ($own_tag === $tag) makeAbsoluteAttr($base, $elem, $attr);
	}
}
|
||||||
|
/**
 * Make a single URL attribute of an element absolute.
 *
 * Nothing happens if the attribute is missing, if its value already starts
 * with http:// or https://, or if IRI::absolutize() returns a falsy result.
 *
 * @param IRI        $base base IRI to resolve against
 * @param DOMElement $e    element carrying the attribute
 * @param string     $attr attribute name (e.g. 'href' or 'src')
 * @return void
 */
function makeAbsoluteAttr($base, $e, $attr) {
	if (!$e->hasAttribute($attr)) return;
	// Trim leading and trailing white space. I don't really like this but
	// unfortunately it does appear on some sites. e.g. <img src=" /path/to/image.jpg" />
	$url = trim(str_replace('%20', ' ', $e->getAttribute($attr)));
	$url = str_replace(' ', '%20', $url);
	// Anchored at the start: a relative URL that merely contains an absolute
	// one (e.g. /redirect?to=http://...) still has to be resolved.
	if (preg_match('!^https?://!i', $url)) return;
	$absolute = IRI::absolutize($base, $url);
	if ($absolute) {
		$e->setAttribute($attr, $absolute);
	}
}
|
||||||
|
/**
 * Resolve a URL string against a base URL.
 *
 * @param string $base base URL
 * @param string $url  URL to resolve; surrounding white space is trimmed and
 *                     spaces are encoded as %20, as makeAbsoluteAttr() does
 * @return mixed the cleaned $url if it already starts with http:// or
 *               https://, otherwise the result of IRI::absolutize(), or
 *               false if that result is falsy
 */
function makeAbsoluteStr($base, $url) {
	// attribute values taken straight from the DOM may carry stray white space
	$url = trim(str_replace('%20', ' ', $url));
	$url = str_replace(' ', '%20', $url);
	if (preg_match('!^https?://!i', $url)) {
		// already absolute - no need to parse the base at all
		return $url;
	}
	$base = new IRI($base);
	// remove '//' in URL path (causes URLs not to resolve properly)
	if (isset($base->ipath)) $base->ipath = preg_replace('!//+!', '/', $base->ipath);
	$absolute = IRI::absolutize($base, $url);
	if ($absolute) return $absolute;
	return false;
}
|
||||||
|
// returns single page response, or false if not found
|
||||||
|
/**
 * Look for a single-page version of an article and fetch it.
 *
 * The site config built for the host of $url may carry XPath expressions in
 * single_page_link (evaluated against $html) or, failing that, in
 * single_page_link_in_feed (evaluated against the feed item's description).
 * Expressions are tried in order and the first one producing a URL wins.
 * That URL is resolved against $url and fetched with the global $http agent.
 *
 * @param object $item feed item; only get_description() is called on it
 * @param string $html HTML retrieved from $url
 * @param string $url  URL the HTML came from; also the base for resolving
 * @return mixed the response from $http->get(), or false when there is no
 *               site config, no link was found, the link resolves to $url
 *               itself, or the request failed / had a status code >= 300
 */
function getSinglePage($item, $html, $url) {
	global $http;
	$host = @parse_url($url, PHP_URL_HOST);
	$site_config = SiteConfig::build($host);
	if ($site_config === false) return false;

	$splink = null;
	if (!empty($site_config->single_page_link)) {
		$splink = $site_config->single_page_link;
	} elseif (!empty($site_config->single_page_link_in_feed)) {
		// single page link xpath is targeted at feed
		$splink = $site_config->single_page_link_in_feed;
		// so let's replace HTML with feed item description
		$html = $item->get_description();
	}
	if (!isset($splink)) return false;

	// Build DOM tree from HTML
	$readability = new Readability($html, $url);
	$xpath = new DOMXPath($readability->dom);

	// Loop through single_page_link xpath expressions
	$single_page_url = null;
	foreach ($splink as $pattern) {
		// an invalid expression makes evaluate() complain, hence the @
		$result = @$xpath->evaluate($pattern, $readability->dom);
		if (is_string($result)) {
			$result = trim($result);
			// an empty string means "no match" - keep trying other patterns
			if ($result !== '') {
				$single_page_url = $result;
				break;
			}
		} elseif ($result instanceof DOMNodeList && $result->length > 0) {
			foreach ($result as $node) {
				// the node list may hold attribute or text nodes, which
				// have no hasAttribute() method
				if ($node instanceof DOMElement && $node->hasAttribute('href')) {
					$single_page_url = $node->getAttribute('href');
					break 2; // first match wins - leave the pattern loop too
				} elseif ($node instanceof DOMAttr && $node->value != '') {
					// expression selected the attribute itself (e.g. //a/@href)
					$single_page_url = $node->value;
					break 2;
				}
			}
		}
	}

	// If we've got URL, resolve against $url
	if (isset($single_page_url) && ($single_page_url = makeAbsoluteStr($url, $single_page_url))) {
		// check it's not what we have already!
		if ($single_page_url != $url) {
			// it's not, so let's try to fetch it...
			// NOTE(review): meaning of the second argument (true) is defined
			// by the HTTP agent and not visible here.
			if (($response = $http->get($single_page_url, true)) && $response['status_code'] < 300) {
				return $response;
			}
		}
	}
	return false;
}
|
||||||
|
|
||||||
|
/**
 * Replace URL placeholders in a configured message.
 *
 * Supported tags:
 *   {url}           - permalink of the global $item
 *   {effective-url} - the global $effective_url
 * Both values are HTML-escaped before insertion.
 *
 * @param string $string message possibly containing placeholders
 * @return string message with placeholders replaced
 */
function make_substitutions($string) {
	if ($string == '') return $string;
	global $item, $effective_url;
	$permalink = (is_object($item) && method_exists($item, 'get_permalink')) ? (string)$item->get_permalink() : '';
	// The permalink may already be HTML-encoded (the main loop decodes it
	// before use), so decode first to avoid producing &amp;amp;.
	$permalink = htmlspecialchars(htmlspecialchars_decode($permalink, ENT_QUOTES), ENT_QUOTES, 'UTF-8');
	$string = str_replace('{url}', $permalink, $string);
	$string = str_replace('{effective-url}', htmlspecialchars((string)$effective_url, ENT_QUOTES, 'UTF-8'), $string);
	return $string;
}
|
||||||
?>
|
?>
|
Loading…
Reference in New Issue
Block a user