diff --git a/changelog.txt b/changelog.txt
index 4a4dfb5..9e29ead 100644
--- a/changelog.txt
+++ b/changelog.txt
@@ -2,6 +2,15 @@ FiveFilters.org: Full-Text RSS
http://fivefilters.org/content-only/
CHANGELOG
------------------------------------
+2.8 (2011-05-30)
+ - Tidy no longer stripping HTML5 elements
+ - JSON output (pass &format=json in querystring)
+ - New site patterns added and old ones updated
+ - New site config option to force full-page retrieval on multi-page articles: single_page_link
+ - User Guide (PDF) now included (although still a work in progress)
+ - URL placeholders now accepted in message_to_prepend/append config options
+ - Plus minor fixes...
+
2.7 (2011-03-21)
- Site patterns for better control over extraction (see site_config/README.txt)
- hNews support (improves content extraction for sites using hNews microformatting)
diff --git a/config.php b/config.php
index 6b55583..68fcc5d 100644
--- a/config.php
+++ b/config.php
@@ -74,11 +74,17 @@ $options->cache_dir = dirname(__FILE__).'/cache';
// Message to prepend (without API key)
// ----------------------
// HTML to insert at the beginning of each feed item when no API key is supplied.
+// Substitution tags:
+// {url} - Feed item URL
+// {effective-url} - Feed item URL after we've followed all redirects
$options->message_to_prepend = '';
// Message to append (without API key)
// ----------------------
// HTML to insert at the end of each feed item when no API key is supplied.
+// Substitution tags:
+// {url} - Feed item URL
+// {effective-url} - Feed item URL after we've followed all redirects
$options->message_to_append = '';
// URLs to allow
@@ -188,7 +194,7 @@ $options->error_message_with_key = '[unable to retrieve full-text content]';
/// DO NOT CHANGE ANYTHING BELOW THIS ///////////
/////////////////////////////////////////////////
-if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '2.7');
+if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '2.8');
if ((basename(__FILE__) == 'config.php') && (file_exists(dirname(__FILE__).'/custom_config.php'))) {
require_once(dirname(__FILE__).'/custom_config.php');
diff --git a/ftr_compatibility_test.php b/ftr_compatibility_test.php
index ee35d06..aae5687 100644
--- a/ftr_compatibility_test.php
+++ b/ftr_compatibility_test.php
@@ -13,7 +13,7 @@ SimplePie.org. We have kept most of their checks intact as we use SimplePie in o
http://github.com/simplepie/simplepie/tree/master/compatibility_test/
*/
-$app_name = 'Full-Text RSS 2.7';
+$app_name = 'Full-Text RSS 2.8';
$php_ok = (function_exists('version_compare') && version_compare(phpversion(), '5.2.0', '>='));
$pcre_ok = extension_loaded('pcre');
@@ -327,7 +327,7 @@ div.chunk {
Your webhost has its act together!
You can download the latest version of from FiveFilters.org.
Note: Passing this test does not guarantee that will run on your webhost — it only ensures that the basic requirements have been addressed. If you experience any problems, please let us know.
-
+
Bottom Line: Yes, you can!
For most feeds, it'll run with no problems. There are certain languages that you might have a hard time with though.
You can download the latest version of from FiveFilters.org.
diff --git a/index.php b/index.php
index 7a55b56..f955f8e 100644
--- a/index.php
+++ b/index.php
@@ -96,14 +96,16 @@ if (!defined('_FF_FTR_INDEX')) {
Thanks for downloading and setting this up. If you haven't done so already, check server compatibility
to see if your environment will support this application. Full-Text RSS runs on most shared web hosting environments.
Configure
- In addition to the options above, Full-Text RSS comes with a configuration file which allows you to control how the application works. Features include:
+ In addition to the options above, Full-Text RSS can be configured to better suit your needs. Features include:
- Site patterns for better control over extraction (more info)
- Restrict access to a pre-defined set of URLs or block certain URLs
- Restrict the maximum number of feed items to be processed
+ - JSON output
- Prepend or append an HTML fragment to each feed item processed
- Caching
+ Please refer to the user guide for more information.
To change the configuration, save a copy of config.php as custom_config.php and make any changes you like to it.To change the configuration, edit custom_config.php and make any changes you like.
If everything works fine, feel free to modify this page by saving it as custom_index.php and change it to whatever you like.
@@ -118,7 +120,8 @@ if (!defined('_FF_FTR_INDEX')) {
To see if you're running the latest version, check for updates.
Support
- We have more information in the section below, but if you need help with anything, please email fivefilters@fivefilters.org.
+ We have a public forum which anyone can use to discuss any issues, post questions and find answers (it's free to join and post).
+ We provide a little more information in the section below, but if you need help with anything, you can also email us at fivefilters@fivefilters.org.
diff --git a/libraries/content-extractor/ContentExtractor.php b/libraries/content-extractor/ContentExtractor.php
index 131aab6..33e4955 100644
--- a/libraries/content-extractor/ContentExtractor.php
+++ b/libraries/content-extractor/ContentExtractor.php
@@ -5,8 +5,8 @@
* Uses patterns specified in site config files and auto detection (hNews/PHP Readability)
* to extract content from HTML files.
*
- * @version 0.5
- * @date 2011-03-07
+ * @version 0.6
+ * @date 2011-05-04
* @author Keyvan Minoukadeh
* @copyright 2011 Keyvan Minoukadeh
* @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
@@ -14,13 +14,13 @@
class ContentExtractor
{
- const HOSTNAME_REGEX = '/^(([a-zA-Z0-9-]*[a-zA-Z0-9])\.)*([A-Za-z0-9-]*[A-Za-z0-9])$/';
- protected static $config_cache = array();
protected static $tidy_config = array(
'clean' => true,
'output-xhtml' => true,
'logical-emphasis' => true,
'show-body-only' => false,
+ 'new-blocklevel-tags' => 'article, aside, footer, header, hgroup, menu, nav, section, details, datagrid',
+ 'new-inline-tags' => 'mark, time, meter, progress', // value is the tag list only; repeating the option name here would declare a bogus tag
'wrap' => 0,
'drop-empty-paras' => true,
'drop-proprietary-attributes' => false,
@@ -31,19 +31,16 @@ class ContentExtractor
'char-encoding' => 'utf8',
'hide-comments' => true
);
- protected $config_path;
protected $html;
protected $config;
protected $title;
protected $body;
protected $success = false;
- protected $fallback;
public $readability;
public $debug = false;
- function __construct($config_path=null, ContentExtractor $config_fallback=null) {
- $this->config_path = $config_path;
- $this->fallback = $config_fallback;
+ function __construct($path, $fallback=null) {
+ SiteConfig::set_config_path($path, $fallback);
}
protected function debug($msg) {
@@ -66,71 +63,6 @@ class ContentExtractor
$this->success = false;
}
- // returns SiteConfig instance if an appropriate one is found, false otherwise
- public function get_site_config($host) {
- $host = strtolower($host);
- if (substr($host, 0, 4) == 'www.') $host = substr($host, 4);
- if (!$host || (strlen($host) > 200) || !preg_match(self::HOSTNAME_REGEX, $host)) return false;
- // check for site configuration
- $try = array($host);
- $split = explode('.', $host);
- if (count($split) > 1) {
- array_shift($split);
- $try[] = '.'.implode('.', $split);
- }
- foreach ($try as $h) {
- if (array_key_exists($h, self::$config_cache)) {
- $this->debug("... cached ($h)");
- return self::$config_cache[$h];
- } elseif (file_exists($this->config_path."/$h.txt")) {
- $this->debug("... from file ($h)");
- $file = $this->config_path."/$h.txt";
- break;
- }
- }
- if (!isset($file)) {
- if (isset($this->fallback)) {
- $this->debug("... trying fallback ($host)");
- return $this->fallback->get_site_config($host);
- } else {
- $this->debug("... no match ($host)");
- return false;
- }
- }
- $config_file = file($file, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
- if (!$config_file || !is_array($config_file)) return false;
- $config = new SiteConfig();
- foreach ($config_file as $line) {
- $line = trim($line);
-
- // skip comments, empty lines
- if ($line == '' || $line[0] == '#') continue;
-
- // get command
- $command = explode(':', $line, 2);
- // if there's no colon ':', skip this line
- if (count($command) != 2) continue;
- $val = trim($command[1]);
- $command = trim($command[0]);
- if ($command == '' || $val == '') continue;
-
- // check for commands where we accept multiple statements
- if (in_array($command, array('title', 'body', 'strip', 'strip_id_or_class', 'strip_image_src'))) {
- array_push($config->$command, $val);
- // check for single statement commands that evaluate to true or false
- } elseif (in_array($command, array('tidy', 'prune', 'autodetect_on_failure'))) {
- $config->$command = ($val == 'yes');
- // check for single statement commands stored as strings
- } elseif (in_array($command, array('test_url'))) {
- $config->$command = $val;
- }
- }
- // store copy of config in our static cache array in case we need to process another URL
- self::$config_cache[$h] = $config;
-
- return $config;
- }
-
// returns true on success, false on failure
// $smart_tidy indicates that if tidy is used and no results are produced, we will
// try again without it. Tidy helps us deal with PHP's patchy HTML parsing most of the time
@@ -140,11 +72,12 @@ class ContentExtractor
// extract host name
$host = @parse_url($url, PHP_URL_HOST);
- if (!($this->config = $this->get_site_config($host))) {
+ if (!($this->config = SiteConfig::build($host))) {
// no match, so use defaults
$this->config = new SiteConfig();
- self::$config_cache[$host] = $this->config;
}
+ // store copy of config in our static cache array in case we need to process another URL
+ SiteConfig::add_to_cache($host, $this->config);
// use tidy (if it exists)?
// This fixes problems with some sites which would otherwise
diff --git a/libraries/content-extractor/SiteConfig.php b/libraries/content-extractor/SiteConfig.php
index b816d0a..9387702 100644
--- a/libraries/content-extractor/SiteConfig.php
+++ b/libraries/content-extractor/SiteConfig.php
@@ -47,5 +47,120 @@ class SiteConfig
// Test URL - if present, can be used to test the config above
public $test_url = null;
+
+ // Single-page link - should identify a link element or URL pointing to the page holding the entire article
+ // This is useful for sites which split their articles across multiple pages. Links to such pages tend to
+ // display the first page with links to the other pages at the bottom. Often there is also a link to a page
+ // which displays the entire article on one page (e.g. 'print view').
+ // This should be an XPath expression identifying the link to that page. If present and we find a match,
+ // we will retrieve that page and the rest of the options in this config will be applied to the new page.
+ public $single_page_link = array();
+
+ // Single-page link in feed? - same as above, but patterns applied to item description HTML taken from feed
+ public $single_page_link_in_feed = array();
+
+ // TODO: which parser to use for turning raw HTML into a DOMDocument
+ public $parser = 'libxml';
+
+ // the options below cannot be set in the config files which this class represents
+
+ public static $debug = false;
+ protected static $config_path;
+ protected static $config_path_fallback;
+ protected static $config_cache = array();
+ const HOSTNAME_REGEX = '/^(([a-zA-Z0-9-]*[a-zA-Z0-9])\.)*([A-Za-z0-9-]*[A-Za-z0-9])$/';
+
+ protected static function debug($msg) { // echoes $msg plus current/peak memory use (KB) when self::$debug is on
+ if (self::$debug) {
+ $mem = round(memory_get_usage()/1024, 2);
+ $memPeak = round(memory_get_peak_usage()/1024, 2);
+ echo '* ',$msg;
+ echo ' - mem used: ',$mem," (peak: $memPeak)\n";
+ if (ob_get_level() > 0) ob_flush(); // ob_flush() raises a notice when no output buffer is active
+ flush();
+ }
+ }
+
+ public static function set_config_path($path, $fallback=null) {
+ self::$config_path = $path;
+ self::$config_path_fallback = $fallback;
+ }
+
+ public static function add_to_cache($host, SiteConfig $config) { // stores $config in the static cache that build() consults
+ $host = preg_replace('/^www\./', '', strtolower($host)); // same key normalisation as build(): lower-case, no leading 'www.'
+ self::$config_cache[$host] = $config;
+ }
+
+ // returns SiteConfig instance if an appropriate one is found, false otherwise
+ public static function build($host) {
+ $host = strtolower($host);
+ if (substr($host, 0, 4) == 'www.') $host = substr($host, 4);
+ if (!$host || (strlen($host) > 200) || !preg_match(self::HOSTNAME_REGEX, $host)) return false;
+ // check for site configuration
+ $try = array($host);
+ $split = explode('.', $host);
+ if (count($split) > 1) {
+ array_shift($split);
+ $try[] = '.'.implode('.', $split);
+ }
+ foreach ($try as $h) {
+ if (array_key_exists($h, self::$config_cache)) {
+ self::debug("... cached ($h)");
+ return self::$config_cache[$h];
+ } elseif (file_exists(self::$config_path."/$h.txt")) {
+ self::debug("... from file ($h)");
+ $file = self::$config_path."/$h.txt";
+ break;
+ }
+ }
+ if (!isset($file)) {
+ if (isset(self::$config_path_fallback)) {
+ self::debug("... trying fallback ($host)");
+ foreach ($try as $h) {
+ if (file_exists(self::$config_path_fallback."/$h.txt")) {
+ self::debug("... from fallback file ($h)");
+ $file = self::$config_path_fallback."/$h.txt";
+ break;
+ }
+ }
+ if (!isset($file)) {
+ self::debug("... no match in fallback directory");
+ return false;
+ }
+ } else {
+ self::debug("... no match ($host)");
+ return false;
+ }
+ }
+ $config_file = file($file, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
+ if (!$config_file || !is_array($config_file)) return false;
+ $config = new SiteConfig();
+ foreach ($config_file as $line) {
+ $line = trim($line);
+
+ // skip comments, empty lines
+ if ($line == '' || $line[0] == '#') continue;
+
+ // get command
+ $command = explode(':', $line, 2);
+ // if there's no colon ':', skip this line
+ if (count($command) != 2) continue;
+ $val = trim($command[1]);
+ $command = trim($command[0]);
+ if ($command == '' || $val == '') continue;
+
+ // check for commands where we accept multiple statements
+ if (in_array($command, array('title', 'body', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed'))) {
+ array_push($config->$command, $val);
+ // check for single statement commands that evaluate to true or false
+ } elseif (in_array($command, array('tidy', 'prune', 'autodetect_on_failure'))) {
+ $config->$command = ($val == 'yes');
+ // check for single statement commands stored as strings
+ } elseif (in_array($command, array('test_url'))) {
+ $config->$command = $val;
+ }
+ }
+ return $config;
+ }
}
?>
\ No newline at end of file
diff --git a/libraries/humble-http-agent/HumbleHttpAgent.php b/libraries/humble-http-agent/HumbleHttpAgent.php
index 92c69af..fcdce01 100644
--- a/libraries/humble-http-agent/HumbleHttpAgent.php
+++ b/libraries/humble-http-agent/HumbleHttpAgent.php
@@ -7,8 +7,8 @@
* For environments which do not have these options, it reverts to standard sequential
* requests (using file_get_contents())
*
- * @version 0.8
- * @date 2011-02-28
+ * @version 0.9.5
+ * @date 2011-05-23
* @see http://php.net/HttpRequestPool
* @author Keyvan Minoukadeh
* @copyright 2011 Keyvan Minoukadeh
@@ -104,6 +104,15 @@ class HumbleHttpAgent
return $iri->uri;
}
+ public function removeFragment($url) {
+ // returns $url cut at its first '#'; URLs without a '#' come back unchanged
+ $hashAt = strpos($url, '#');
+ if ($hashAt !== false) {
+ return substr($url, 0, $hashAt);
+ }
+ return $url;
+ }
+
public function enableDebug($bool=true) {
$this->debug = (bool)$bool;
}
@@ -211,6 +220,7 @@ class HumbleHttpAgent
} else {
$this->debug("......adding to pool");
$req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($url) : $url;
+ $req_url = $this->removeFragment($req_url);
$httpRequest = new HttpRequest($req_url, HttpRequest::METH_GET, $this->requestOptions);
// send cookies, if we have any
if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
@@ -225,7 +235,11 @@ class HumbleHttpAgent
// did we get anything into the pool?
if (count($pool) > 0) {
$this->debug('Sending request...');
- $pool->send();
+ try {
+ $pool->send();
+ } catch (HttpRequestPoolException $e) {
+ $this->debug('Request pool exception (continuing with responses received): '.$e->getMessage()); // best-effort: report it, then carry on
+ }
$this->debug('Received responses');
foreach($subset as $orig => $url) {
if (!$isRedirect) $orig = $url;
@@ -240,7 +254,9 @@ class HumbleHttpAgent
// is redirect?
if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && $request->getResponseHeader('location')) {
$redirectURL = $request->getResponseHeader('location');
- $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
+ if (!preg_match('!^https?://!i', $redirectURL)) {
+ $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
+ }
if ($this->validateURL($redirectURL)) {
$this->debug('Redirect detected. Valid URL: '.$redirectURL);
// store any cookies
@@ -298,6 +314,7 @@ class HumbleHttpAgent
} else {
$this->debug("......adding to pool");
$req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($url) : $url;
+ $req_url = $this->removeFragment($req_url);
$headers = array();
// send cookies, if we have any
if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
@@ -327,7 +344,9 @@ class HumbleHttpAgent
$status_code = $this->requests[$orig]['status_code'];
if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) {
$redirectURL = $this->requests[$orig]['location'];
- $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
+ if (!preg_match('!^https?://!i', $redirectURL)) {
+ $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
+ }
if ($this->validateURL($redirectURL)) {
$this->debug('Redirect detected. Valid URL: '.$redirectURL);
// store any cookies
@@ -367,6 +386,7 @@ class HumbleHttpAgent
$this->debug("Sending request for $url");
$this->requests[$orig]['original_url'] = $orig;
$req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($url) : $url;
+ $req_url = $this->removeFragment($req_url);
// send cookies, if we have any
$httpContext = $this->httpContext;
if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
@@ -391,7 +411,9 @@ class HumbleHttpAgent
}
if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) {
$redirectURL = $this->requests[$orig]['location'];
- $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
+ if (!preg_match('!^https?://!i', $redirectURL)) {
+ $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
+ }
if ($this->validateURL($redirectURL)) {
$this->debug('Redirect detected. Valid URL: '.$redirectURL);
// store any cookies
@@ -444,6 +466,7 @@ class HumbleHttpAgent
}
public function get($url, $remove=false) {
+ $url = "$url";
if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) {
$this->debug("URL already fetched - in memory ($url, effective: {$this->requests[$url]['effective_url']})");
$response = $this->requests[$url];
diff --git a/makefulltextfeed.php b/makefulltextfeed.php
index fdff082..f2df18a 100644
--- a/makefulltextfeed.php
+++ b/makefulltextfeed.php
@@ -3,8 +3,8 @@
// Author: Keyvan Minoukadeh
// Copyright (c) 2011 Keyvan Minoukadeh
// License: AGPLv3
-// Version: 2.7
-// Date: 2011-03-21
+// Version: 2.8
+// Date: 2011-05-23
/*
This program is free software: you can redistribute it and/or modify
@@ -73,131 +73,11 @@ function __autoload($class_name) {
}
}
-function url_allowed($url) {
- global $options;
- if (!empty($options->allowed_urls)) {
- $allowed = false;
- foreach ($options->allowed_urls as $allowurl) {
- if (stristr($url, $allowurl) !== false) {
- $allowed = true;
- break;
- }
- }
- if (!$allowed) return false;
- } else {
- foreach ($options->blocked_urls as $blockurl) {
- if (stristr($url, $blockurl) !== false) {
- return false;
- }
- }
- }
- return true;
-}
-
////////////////////////////////
// Load config file if it exists
////////////////////////////////
require_once(dirname(__FILE__).'/config.php');
-//////////////////////////////////////////////
-// Convert $html to UTF8
-// (uses HTTP headers and HTML to find encoding)
-// adapted from http://stackoverflow.com/questions/910793/php-detect-encoding-and-make-everything-utf-8
-//////////////////////////////////////////////
-function convert_to_utf8($html, $header=null)
-{
- $encoding = null;
- if ($html || $header) {
- if (is_array($header)) $header = implode("\n", $header);
- if (!$header || !preg_match_all('/^Content-Type:\s+([^;]+)(?:;\s*charset=["\']?([^;"\'\n]*))?/im', $header, $match, PREG_SET_ORDER)) {
- // error parsing the response
- } else {
- $match = end($match); // get last matched element (in case of redirects)
- if (isset($match[2])) $encoding = trim($match[2], '"\'');
- }
- if (!$encoding) {
- if (preg_match('/^<\?xml\s+version=(?:"[^"]*"|\'[^\']*\')\s+encoding=("[^"]*"|\'[^\']*\')/s', $html, $match)) {
- $encoding = trim($match[1], '"\'');
- } elseif(preg_match('/]+)/i', $html, $match)) {
- if (isset($match[1])) $encoding = trim($match[1]);
- }
- }
- if (!$encoding) {
- $encoding = 'utf-8';
- } else {
- if (strtolower($encoding) != 'utf-8') {
- if (strtolower($encoding) == 'iso-8859-1') {
- // replace MS Word smart qutoes
- $trans = array();
- $trans[chr(130)] = '‚'; // Single Low-9 Quotation Mark
- $trans[chr(131)] = 'ƒ'; // Latin Small Letter F With Hook
- $trans[chr(132)] = '„'; // Double Low-9 Quotation Mark
- $trans[chr(133)] = '…'; // Horizontal Ellipsis
- $trans[chr(134)] = '†'; // Dagger
- $trans[chr(135)] = '‡'; // Double Dagger
- $trans[chr(136)] = 'ˆ'; // Modifier Letter Circumflex Accent
- $trans[chr(137)] = '‰'; // Per Mille Sign
- $trans[chr(138)] = 'Š'; // Latin Capital Letter S With Caron
- $trans[chr(139)] = '‹'; // Single Left-Pointing Angle Quotation Mark
- $trans[chr(140)] = 'Œ'; // Latin Capital Ligature OE
- $trans[chr(145)] = '‘'; // Left Single Quotation Mark
- $trans[chr(146)] = '’'; // Right Single Quotation Mark
- $trans[chr(147)] = '“'; // Left Double Quotation Mark
- $trans[chr(148)] = '”'; // Right Double Quotation Mark
- $trans[chr(149)] = '•'; // Bullet
- $trans[chr(150)] = '–'; // En Dash
- $trans[chr(151)] = '—'; // Em Dash
- $trans[chr(152)] = '˜'; // Small Tilde
- $trans[chr(153)] = '™'; // Trade Mark Sign
- $trans[chr(154)] = 'š'; // Latin Small Letter S With Caron
- $trans[chr(155)] = '›'; // Single Right-Pointing Angle Quotation Mark
- $trans[chr(156)] = 'œ'; // Latin Small Ligature OE
- $trans[chr(159)] = 'Ÿ'; // Latin Capital Letter Y With Diaeresis
- $html = strtr($html, $trans);
- }
- $html = SimplePie_Misc::change_encoding($html, $encoding, 'utf-8');
-
- /*
- if (function_exists('iconv')) {
- // iconv appears to handle certain character encodings better than mb_convert_encoding
- $html = iconv($encoding, 'utf-8', $html);
- } else {
- $html = mb_convert_encoding($html, 'utf-8', $encoding);
- }
- */
- }
- }
- }
- return $html;
-}
-
-function makeAbsolute($base, $elem) {
- $base = new IRI($base);
- foreach(array('a'=>'href', 'img'=>'src') as $tag => $attr) {
- $elems = $elem->getElementsByTagName($tag);
- for ($i = $elems->length-1; $i >= 0; $i--) {
- $e = $elems->item($i);
- //$e->parentNode->replaceChild($articleContent->ownerDocument->createTextNode($e->textContent), $e);
- makeAbsoluteAttr($base, $e, $attr);
- }
- if (strtolower($elem->tagName) == $tag) makeAbsoluteAttr($base, $elem, $attr);
- }
-}
-function makeAbsoluteAttr($base, $e, $attr) {
- if ($e->hasAttribute($attr)) {
- // Trim leading and trailing white space. I don't really like this but
- // unfortunately it does appear on some sites. e.g.
- $url = trim(str_replace('%20', ' ', $e->getAttribute($attr)));
- $url = str_replace(' ', '%20', $url);
- if (!preg_match('!https?://!i', $url)) {
- $absolute = IRI::absolutize($base, $url);
- if ($absolute) {
- $e->setAttribute($attr, $absolute);
- }
- }
- }
-}
-
////////////////////////////////
// Check if service is enabled
////////////////////////////////
@@ -211,7 +91,7 @@ if (!$options->enabled) {
if (!isset($_GET['url'])) {
die('No URL supplied');
}
-$url = $_GET['url'];
+$url = trim($_GET['url']);
if (!preg_match('!^https?://.+!i', $url)) {
$url = 'http://'.$url;
}
@@ -240,6 +120,7 @@ if ($options->alternative_url != '' && !isset($_GET['redir']) && mt_rand(0, 100)
if (isset($_GET['links'])) $redirect .= '&links='.$_GET['links'];
if (isset($_GET['exc'])) $redirect .= '&exc='.$_GET['exc'];
if (isset($_GET['what'])) $redirect .= '&what='.$_GET['what'];
+ if (isset($_GET['format'])) $redirect .= '&format='.urlencode($_GET['format']); // encode the user-supplied value before it goes into the Location header
header("Location: $redirect");
exit;
}
@@ -258,6 +139,7 @@ if (isset($_GET['key']) && ($key_index = array_search($_GET['key'], $options->ap
if (isset($_GET['links'])) $redirect .= '&links='.urlencode($_GET['links']);
if (isset($_GET['exc'])) $redirect .= '&exc='.urlencode($_GET['exc']);
if (isset($_GET['what'])) $redirect .= '&what='.urlencode($_GET['what']);
+ if (isset($_GET['format'])) $redirect .= '&format='.urlencode($_GET['format']);
header("Location: $redirect");
exit;
}
@@ -364,9 +246,13 @@ if (($extract_pattern != '') && ($extract_pattern != 'auto')) {
/////////////////////////////////////
// Check for valid format
-// (stick to RSS for the time being)
+// (stick to RSS (or RSS as JSON) for the time being)
/////////////////////////////////////
-$format = 'rss';
+if (isset($_GET['format']) && $_GET['format'] == 'json') {
+ $format = 'json';
+} else {
+ $format = 'rss';
+}
//////////////////////////////////
// Check for cached copy
@@ -392,10 +278,14 @@ if ($options->caching) {
// getting a Zend_Cache_Core object
$cache = Zend_Cache::factory('Core', 'File', $frontendOptions, $backendOptions);
- $cache_id = md5($max.$url.$valid_key.$links.$exclude_on_fail.$auto_extract.$extract_pattern.(int)isset($_GET['pubsub']));
+ $cache_id = md5($max.$url.$valid_key.$links.$exclude_on_fail.$auto_extract.$extract_pattern.$format.(int)isset($_GET['pubsub']));
if ($data = $cache->load($cache_id)) {
- header("Content-type: text/xml; charset=UTF-8");
+ if ($format == 'json') {
+ header("Content-type: application/json; charset=UTF-8");
+ } else {
+ header("Content-type: text/xml; charset=UTF-8");
+ }
if (headers_sent()) die('Some data has already been output, can\'t send RSS file');
echo $data;
exit;
@@ -419,7 +309,7 @@ $http = new HumbleHttpAgent();
//////////////////////////////////
// Set up Content Extractor
//////////////////////////////////
-$extractor = new ContentExtractor(dirname(__FILE__).'/site_config/custom', new ContentExtractor(dirname(__FILE__).'/site_config/standard'));
+$extractor = new ContentExtractor(dirname(__FILE__).'/site_config/custom', dirname(__FILE__).'/site_config/standard');
/*
if ($options->caching) {
@@ -453,7 +343,8 @@ if (!$html_only) {
SimplePie_HumbleHttpAgent::set_agent($http);
$feed = new SimplePie();
$feed->set_file_class('SimplePie_HumbleHttpAgent');
- $feed->set_feed_url($url);
+ //$feed->set_feed_url($url); // colons appearing in the URL's path get encoded
+ $feed->feed_url = $url;
$feed->set_autodiscovery_level(SIMPLEPIE_LOCATOR_NONE);
$feed->set_timeout(20);
$feed->enable_cache(false);
@@ -471,97 +362,34 @@ if (!$html_only) {
}
////////////////////////////////////////////////////////////////////////////////
-// Extract content from HTML (if URL is not feed or explicit HTML request has been made)
+// Our given URL is not a feed, so let's create our own feed with a single item:
+// the given URL. This basically treats all non-feed URLs as if they were
+// single-item feeds.
////////////////////////////////////////////////////////////////////////////////
+$isDummyFeed = false;
if ($html_only || !$result) {
+ $isDummyFeed = true;
unset($feed, $result);
- if ($response = $http->get($url)) {
- $effective_url = $response['effective_url'];
- if (!url_allowed($effective_url)) die('URL blocked');
- $html = $response['body'];
- // remove strange things here
- $html = str_replace('[>', '', $html);
- $html = convert_to_utf8($html, $response['headers']);
+ // create single item dummy feed object
+ class DummySingleItemFeed { // stand-in feed object wrapping one DummySingleItem built from the requested URL
+ public $item;
+ function __construct($url) { $this->item = new DummySingleItem($url); }
+ public function get_title() { return ''; } // always empty here
+ public function get_description() { return 'Content extracted from '.$this->item->url; }
+ public function get_link() { return $this->item->url; }
+ public function get_image_url() { return false; }
+ public function get_items($start=0, $max=1) { return array(0=>$this->item); } // $start and $max are ignored: always returns the single item
+ }
- if (!$response || $response['status_code'] >= 300) {
- die('Error retrieving '.$url);
+ class DummySingleItem { // the only item of a DummySingleItemFeed; it knows nothing but its URL
+ public $url;
+ function __construct($url) { $this->url = $url; }
+ public function get_permalink() { return $this->url; }
+ public function get_title() { return ''; } // placeholder: always empty
+ public function get_date($format='') { return false; } // $format is ignored: no date is available
+ public function get_author() { return false; }
+ public function get_description() { return ''; }
+ }
- if ($auto_extract) {
- $extract_result = $extractor->process($html, $effective_url);
- if (!$extract_result) die($options->error_message);
- $readability = $extractor->readability;
- $content_block = $extractor->getContent();
- $title = $extractor->getTitle();
- } else {
- $readability = new Readability($html, $effective_url);
- // content block is entire document
- $content_block = $readability->dom;
- //TODO: get title
- $title = '';
- }
- if ($extract_pattern) {
- $xpath = new DOMXPath($readability->dom);
- $elems = @$xpath->query($extract_pattern, $content_block);
- // check if our custom extraction pattern matched
- if ($elems && $elems->length > 0) {
- // get the first matched element
- $content_block = $elems->item(0);
- // clean it up
- $readability->removeScripts($content_block);
- $readability->prepArticle($content_block);
- } else {
- die($options->error_message);
- //$content_block = $readability->dom->createElement('p', 'Sorry, could not extract content');
- }
- }
- $readability->clean($content_block, 'select');
- if ($options->rewrite_relative_urls) makeAbsolute($effective_url, $content_block);
- // footnotes
- if (($links == 'footnotes') && (strpos($effective_url, 'wikipedia.org') === false)) {
- $readability->addFootnotes($content_block);
- }
- if ($extract_pattern) {
- // get outerHTML
- $content = $content_block->ownerDocument->saveXML($content_block);
- } else {
- if ($content_block->childNodes->length == 1 && $content_block->firstChild->nodeType === XML_ELEMENT_NODE) {
- $content = $content_block->firstChild->innerHTML;
- } else {
- $content = $content_block->innerHTML;
- }
- }
- if ($links == 'remove') {
- $content = preg_replace('!?a[^>]*>!', '', $content);
- }
- if (!$valid_key) {
- $content = $options->message_to_prepend.$content;
- $content .= $options->message_to_append;
- } else {
- $content = $options->message_to_prepend_with_key.$content;
- $content .= $options->message_to_append_with_key;
- }
- unset($readability, $html);
- $output = new FeedWriter(); //ATOM an option
- $output->setTitle($title);
- $output->setDescription("Content extracted from $url");
- $output->setXsl('css/feed.xsl'); // Chrome uses this, most browsers ignore it
- if ($format == 'atom') {
- $output->setChannelElement('updated', date(DATE_ATOM));
- $output->setChannelElement('author', array('name'=>'Five Filters', 'uri'=>'http://fivefilters.org'));
- }
- $output->setLink($url);
- $newitem = $output->createNewItem();
- $newitem->setTitle($title);
- $newitem->setLink($url);
- if ($format == 'atom') {
- $newitem->setDate(time());
- $newitem->addElement('content', $content);
- } else {
- $newitem->setDescription($content);
- }
- $output->addItem($newitem);
- $output->genarateFeed();
- exit;
+ $feed = new DummySingleItemFeed($url);
}
////////////////////////////////////////////
@@ -594,6 +422,8 @@ $urls_sanitized = array();
$urls = array();
foreach ($items as $key => $item) {
$permalink = htmlspecialchars_decode($item->get_permalink());
+ // Colons in URL path segments get encoded by SimplePie, yet some sites expect them unencoded
+ $permalink = str_replace('%3A', ':', $permalink);
$permalink = $http->validateUrl($permalink);
if ($permalink) {
$urls_sanitized[] = $permalink;
@@ -625,17 +455,34 @@ foreach ($items as $key => $item) {
$effective_url = $response['effective_url'];
if (!url_allowed($effective_url)) continue;
$html = $response['body'];
- // remove strange things here
- $html = str_replace('[>', '', $html);
+ // remove strange things
+ $html = str_replace('[>', '', $html);
$html = convert_to_utf8($html, $response['headers']);
if ($auto_extract) {
+ // check site config for single page URL - fetch it if found
+ if ($single_page_response = getSinglePage($item, $html, $effective_url)) {
+ $html = $single_page_response['body'];
+ // remove strange things
+ $html = str_replace('[>', '', $html);
+ $html = convert_to_utf8($html, $single_page_response['headers']);
+ $effective_url = $single_page_response['effective_url'];
+ unset($single_page_response);
+ }
$extract_result = $extractor->process($html, $effective_url);
$readability = $extractor->readability;
$content_block = ($extract_result) ? $extractor->getContent() : null;
+ $title = ($extract_result) ? $extractor->getTitle() : '';
} else {
$readability = new Readability($html, $effective_url);
// content block is entire document (for now...)
- $content_block = $readability->dom;
+ $content_block = $readability->dom;
+ //TODO: get title
+ $title = '';
+ }
+ // use extracted title for both feed and item title if we're using single-item dummy feed
+ if ($isDummyFeed) {
+ $output->setTitle($title);
+ $newitem->setTitle($title);
}
if ($extract_pattern && isset($content_block)) {
$xpath = new DOMXPath($readability->dom);
@@ -684,11 +531,11 @@ foreach ($items as $key => $item) {
$html = preg_replace('!</?a[^>]*>!', '', $html);
}
if (!$valid_key) {
- $html = $options->message_to_prepend.$html;
- $html .= $options->message_to_append;
+ $html = make_substitutions($options->message_to_prepend).$html;
+ $html .= make_substitutions($options->message_to_append);
} else {
- $html = $options->message_to_prepend_with_key.$html;
- $html .= $options->message_to_append_with_key;
+ $html = make_substitutions($options->message_to_prepend_with_key).$html;
+ $html .= make_substitutions($options->message_to_append_with_key);
}
}
if ($format == 'atom') {
@@ -715,14 +562,215 @@ foreach ($items as $key => $item) {
unset($html);
}
// output feed
-if ($options->caching) {
+if ($options->caching || $format == 'json') {
ob_start();
$output->genarateFeed();
$output = ob_get_contents();
ob_end_clean();
- $cache->save($output, $cache_id);
+ if ($format == 'json') {
+ $jsonrss = new stdClass();
+ $jsonrss->rss = @simplexml_load_string($output);
+ $output = json_encode($jsonrss);
+ header("Content-type: application/json; charset=UTF-8");
+ }
+ if ($options->caching) $cache->save($output, $cache_id);
echo $output;
} else {
$output->genarateFeed();
}
+
+///////////////////////////////
+// HELPER FUNCTIONS
+///////////////////////////////
+
+function url_allowed($url) {
+ global $options;
+ if (!empty($options->allowed_urls)) {
+ $allowed = false;
+ foreach ($options->allowed_urls as $allowurl) {
+ if (stristr($url, $allowurl) !== false) {
+ $allowed = true;
+ break;
+ }
+ }
+ if (!$allowed) return false;
+ } else {
+ foreach ($options->blocked_urls as $blockurl) {
+ if (stristr($url, $blockurl) !== false) {
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+//////////////////////////////////////////////
+// Convert $html to UTF8
+// (uses HTTP headers and HTML to find encoding)
+// adapted from http://stackoverflow.com/questions/910793/php-detect-encoding-and-make-everything-utf-8
+//////////////////////////////////////////////
+function convert_to_utf8($html, $header=null)
+{
+ $encoding = null;
+ if ($html || $header) {
+ if (is_array($header)) $header = implode("\n", $header);
+ if (!$header || !preg_match_all('/^Content-Type:\s+([^;]+)(?:;\s*charset=["\']?([^;"\'\n]*))?/im', $header, $match, PREG_SET_ORDER)) {
+ // error parsing the response
+ } else {
+ $match = end($match); // get last matched element (in case of redirects)
+ if (isset($match[2])) $encoding = trim($match[2], '"\'');
+ }
+ if (!$encoding) {
+ if (preg_match('/^<\?xml\s+version=(?:"[^"]*"|\'[^\']*\')\s+encoding=("[^"]*"|\'[^\']*\')/s', $html, $match)) {
+ $encoding = trim($match[1], '"\'');
+ } elseif(preg_match('/<meta\s+http-equiv=["\']Content-Type["\']\s+content=["\'][^;]+;\s*charset=["\']?([^;"\'>]+)/i', $html, $match)) {
+ if (isset($match[1])) $encoding = trim($match[1]);
+ }
+ }
+ if (!$encoding) {
+ $encoding = 'utf-8';
+ } else {
+ if (strtolower($encoding) != 'utf-8') {
+ if (strtolower($encoding) == 'iso-8859-1') {
+ // replace MS Word smart quotes
+ $trans = array();
+ $trans[chr(130)] = '&sbquo;'; // Single Low-9 Quotation Mark
+ $trans[chr(131)] = '&fnof;'; // Latin Small Letter F With Hook
+ $trans[chr(132)] = '&bdquo;'; // Double Low-9 Quotation Mark
+ $trans[chr(133)] = '&hellip;'; // Horizontal Ellipsis
+ $trans[chr(134)] = '&dagger;'; // Dagger
+ $trans[chr(135)] = '&Dagger;'; // Double Dagger
+ $trans[chr(136)] = '&circ;'; // Modifier Letter Circumflex Accent
+ $trans[chr(137)] = '&permil;'; // Per Mille Sign
+ $trans[chr(138)] = '&Scaron;'; // Latin Capital Letter S With Caron
+ $trans[chr(139)] = '&lsaquo;'; // Single Left-Pointing Angle Quotation Mark
+ $trans[chr(140)] = '&OElig;'; // Latin Capital Ligature OE
+ $trans[chr(145)] = '&lsquo;'; // Left Single Quotation Mark
+ $trans[chr(146)] = '&rsquo;'; // Right Single Quotation Mark
+ $trans[chr(147)] = '&ldquo;'; // Left Double Quotation Mark
+ $trans[chr(148)] = '&rdquo;'; // Right Double Quotation Mark
+ $trans[chr(149)] = '&bull;'; // Bullet
+ $trans[chr(150)] = '&ndash;'; // En Dash
+ $trans[chr(151)] = '&mdash;'; // Em Dash
+ $trans[chr(152)] = '&tilde;'; // Small Tilde
+ $trans[chr(153)] = '&trade;'; // Trade Mark Sign
+ $trans[chr(154)] = '&scaron;'; // Latin Small Letter S With Caron
+ $trans[chr(155)] = '&rsaquo;'; // Single Right-Pointing Angle Quotation Mark
+ $trans[chr(156)] = '&oelig;'; // Latin Small Ligature OE
+ $trans[chr(159)] = '&Yuml;'; // Latin Capital Letter Y With Diaeresis
+ $html = strtr($html, $trans);
+ }
+ $html = SimplePie_Misc::change_encoding($html, $encoding, 'utf-8');
+
+ /*
+ if (function_exists('iconv')) {
+ // iconv appears to handle certain character encodings better than mb_convert_encoding
+ $html = iconv($encoding, 'utf-8', $html);
+ } else {
+ $html = mb_convert_encoding($html, 'utf-8', $encoding);
+ }
+ */
+ }
+ }
+ }
+ return $html;
+}
+
+function makeAbsolute($base, $elem) {
+ $base = new IRI($base);
+ // remove '//' in URL path (causes URLs not to resolve properly)
+ if (isset($base->ipath)) $base->ipath = preg_replace('!//+!', '/', $base->ipath);
+ foreach(array('a'=>'href', 'img'=>'src') as $tag => $attr) {
+ $elems = $elem->getElementsByTagName($tag);
+ for ($i = $elems->length-1; $i >= 0; $i--) {
+ $e = $elems->item($i);
+ //$e->parentNode->replaceChild($articleContent->ownerDocument->createTextNode($e->textContent), $e);
+ makeAbsoluteAttr($base, $e, $attr);
+ }
+ if (strtolower($elem->tagName) == $tag) makeAbsoluteAttr($base, $elem, $attr);
+ }
+}
+function makeAbsoluteAttr($base, $e, $attr) {
+ if ($e->hasAttribute($attr)) {
+ // Trim leading and trailing white space. I don't really like this but
+ // unfortunately it does appear on some sites. e.g. <img src=" /path/to/image.jpg" />
+ $url = trim(str_replace('%20', ' ', $e->getAttribute($attr)));
+ $url = str_replace(' ', '%20', $url);
+ if (!preg_match('!https?://!i', $url)) {
+ $absolute = IRI::absolutize($base, $url);
+ if ($absolute) {
+ $e->setAttribute($attr, $absolute);
+ }
+ }
+ }
+}
+function makeAbsoluteStr($base, $url) {
+ $base = new IRI($base);
+ // remove '//' in URL path (causes URLs not to resolve properly)
+ if (isset($base->ipath)) $base->ipath = preg_replace('!//+!', '/', $base->ipath);
+ if (preg_match('!^https?://!i', $url)) {
+ // already absolute
+ return $url;
+ } else {
+ $absolute = IRI::absolutize($base, $url);
+ if ($absolute) return $absolute;
+ return false;
+ }
+}
+// returns single page response, or false if not found
+function getSinglePage($item, $html, $url) {
+ global $http;
+ $host = @parse_url($url, PHP_URL_HOST);
+ $site_config = SiteConfig::build($host);
+ if ($site_config === false) return false;
+ $splink = null;
+ if (!empty($site_config->single_page_link)) {
+ $splink = $site_config->single_page_link;
+ } elseif (!empty($site_config->single_page_link_in_feed)) {
+ // single page link xpath is targeted at feed
+ $splink = $site_config->single_page_link_in_feed;
+ // so let's replace HTML with feed item description
+ $html = $item->get_description();
+ }
+ if (isset($splink)) {
+ // Build DOM tree from HTML
+ $readability = new Readability($html, $url);
+ $xpath = new DOMXPath($readability->dom);
+ // Loop through single_page_link xpath expressions
+ $single_page_url = null;
+ foreach ($splink as $pattern) {
+ $elems = @$xpath->evaluate($pattern, $readability->dom);
+ if (is_string($elems)) {
+ $single_page_url = trim($elems);
+ break;
+ } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
+ foreach ($elems as $item) {
+ if ($item->hasAttribute('href')) {
+ $single_page_url = $item->getAttribute('href');
+ break;
+ }
+ }
+ }
+ }
+ // If we've got URL, resolve against $url
+ if (isset($single_page_url) && ($single_page_url = makeAbsoluteStr($url, $single_page_url))) {
+ // check it's not what we have already!
+ if ($single_page_url != $url) {
+ // it's not, so let's try to fetch it...
+ if (($response = $http->get($single_page_url, true)) && $response['status_code'] < 300) {
+ return $response;
+ }
+ }
+ }
+ }
+ return false;
+}
+
+function make_substitutions($string) {
+ if ($string == '') return $string;
+ global $item, $effective_url;
+ $string = str_replace('{url}', htmlspecialchars($item->get_permalink()), $string);
+ $string = str_replace('{effective-url}', htmlspecialchars($effective_url), $string);
+ return $string;
+}
?>
\ No newline at end of file