From eeec0f1982cb2bc87adefc7c63185499a9f7ca40 Mon Sep 17 00:00:00 2001 From: Keyvan Date: Tue, 1 May 2012 00:51:43 +0200 Subject: [PATCH] Full-Text RSS 2.8 --- changelog.txt | 9 + config.php | 8 +- ftr_compatibility_test.php | 4 +- index.php | 7 +- .../content-extractor/ContentExtractor.php | 85 +-- libraries/content-extractor/SiteConfig.php | 115 ++++ .../humble-http-agent/HumbleHttpAgent.php | 35 +- makefulltextfeed.php | 496 ++++++++++-------- 8 files changed, 448 insertions(+), 311 deletions(-) diff --git a/changelog.txt b/changelog.txt index 4a4dfb5..9e29ead 100644 --- a/changelog.txt +++ b/changelog.txt @@ -2,6 +2,15 @@ FiveFilters.org: Full-Text RSS http://fivefilters.org/content-only/ CHANGELOG ------------------------------------ +2.8 (2011-05-30) + - Tidy no longer stripping HTML5 elements + - JSON output (pass &format=json in querystring) + - New site patterns added and old ones updated + - New site config option to force full-page retrieval on multi-page articles: single_page_link + - User Guide (PDF) now included (although still a work in progress) + - URL placeholders now accepted in message_to_prepend/append config options + - Plus minor fixes... + 2.7 (2011-03-21) - Site patterns for better control over extraction (see site_config/README.txt) - hNews support (improves content extraction for sites using hNews microformatting) diff --git a/config.php b/config.php index 6b55583..68fcc5d 100644 --- a/config.php +++ b/config.php @@ -74,11 +74,17 @@ $options->cache_dir = dirname(__FILE__).'/cache'; // Message to prepend (without API key) // ---------------------- // HTML to insert at the beginning of each feed item when no API key is supplied. +// Substitution tags: +// {url} - Feed item URL +// {effective-url} - Feed item URL after we've followed all redirects $options->message_to_prepend = ''; // Message to append (without API key) // ---------------------- // HTML to insert at the end of each feed item when no API key is supplied. 
+// Substitution tags: +// {url} - Feed item URL +// {effective-url} - Feed item URL after we've followed all redirects $options->message_to_append = ''; // URLs to allow @@ -188,7 +194,7 @@ $options->error_message_with_key = '[unable to retrieve full-text content]'; /// DO NOT CHANGE ANYTHING BELOW THIS /////////// ///////////////////////////////////////////////// -if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '2.7'); +if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '2.8'); if ((basename(__FILE__) == 'config.php') && (file_exists(dirname(__FILE__).'/custom_config.php'))) { require_once(dirname(__FILE__).'/custom_config.php'); diff --git a/ftr_compatibility_test.php b/ftr_compatibility_test.php index ee35d06..aae5687 100644 --- a/ftr_compatibility_test.php +++ b/ftr_compatibility_test.php @@ -13,7 +13,7 @@ SimplePie.org. We have kept most of their checks intact as we use SimplePie in o http://github.com/simplepie/simplepie/tree/master/compatibility_test/ */ -$app_name = 'Full-Text RSS 2.7'; +$app_name = 'Full-Text RSS 2.8'; $php_ok = (function_exists('version_compare') && version_compare(phpversion(), '5.2.0', '>=')); $pcre_ok = extension_loaded('pcre'); @@ -327,7 +327,7 @@ div.chunk {

Your webhost has its act together!

You can download the latest version of Full-Text RSS from FiveFilters.org.

Note: Passing this test does not guarantee that Full-Text RSS will run on your webhost — it only ensures that the basic requirements have been addressed. If you experience any problems, please let us know.

- +

Bottom Line: Yes, you can!

For most feeds, it'll run with no problems. There are certain languages that you might have a hard time with though.

You can download the latest version of Full-Text RSS from FiveFilters.org.

diff --git a/index.php b/index.php index 7a55b56..f955f8e 100644 --- a/index.php +++ b/index.php @@ -96,14 +96,16 @@ if (!defined('_FF_FTR_INDEX')) {

Thanks for downloading and setting this up. If you haven't done so already, check server compatibility to see if your environment will support this application. Full-Text RSS runs on most shared web hosting environments.

Configure

-

In addition to the options above, Full-Text RSS comes with a configuration file which allows you to control how the application works. Features include:

+

In addition to the options above, Full-Text RSS can be configured to better suit your needs. Features include:

+

Please refer to the user guide for more information.

To change the configuration, save a copy of config.php as custom_config.php and make any changes you like to it. To change the configuration, edit custom_config.php and make any changes you like.

If everything works fine, feel free to modify this page by saving it as custom_index.php and change it to whatever you like.

@@ -118,7 +120,8 @@ if (!defined('_FF_FTR_INDEX')) {

To see if you're running the latest version, check for updates.

-

We have more information in the section below, but if you need help with anything, please email fivefilters@fivefilters.org.

+

We have a public forum which anyone can use to discuss any issues, post questions and find answers (it's free to join and post).

+

We provide a little more information in the section below, but if you need help with anything, you can also email us at fivefilters@fivefilters.org.


diff --git a/libraries/content-extractor/ContentExtractor.php b/libraries/content-extractor/ContentExtractor.php index 131aab6..33e4955 100644 --- a/libraries/content-extractor/ContentExtractor.php +++ b/libraries/content-extractor/ContentExtractor.php @@ -5,8 +5,8 @@ * Uses patterns specified in site config files and auto detection (hNews/PHP Readability) * to extract content from HTML files. * - * @version 0.5 - * @date 2011-03-07 + * @version 0.6 + * @date 2011-05-04 * @author Keyvan Minoukadeh * @copyright 2011 Keyvan Minoukadeh * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3 @@ -14,13 +14,13 @@ class ContentExtractor { - const HOSTNAME_REGEX = '/^(([a-zA-Z0-9-]*[a-zA-Z0-9])\.)*([A-Za-z0-9-]*[A-Za-z0-9])$/'; - protected static $config_cache = array(); protected static $tidy_config = array( 'clean' => true, 'output-xhtml' => true, 'logical-emphasis' => true, 'show-body-only' => false, + 'new-blocklevel-tags' => 'article, aside, footer, header, hgroup, menu, nav, section, details, datagrid', // HTML5 block-level elements declared to Tidy so it does not strip them + 'new-inline-tags' => 'mark, time, meter, progress', // value is the tag list only; the option name must not be repeated inside the value 'wrap' => 0, 'drop-empty-paras' => true, 'drop-proprietary-attributes' => false, @@ -31,19 +31,16 @@ class ContentExtractor 'char-encoding' => 'utf8', 'hide-comments' => true ); - protected $config_path; protected $html; protected $config; protected $title; protected $body; protected $success = false; - protected $fallback; public $readability; public $debug = false; - function __construct($config_path=null, ContentExtractor $config_fallback=null) { - $this->config_path = $config_path; - $this->fallback = $config_fallback; + function __construct($path, $fallback=null) { + SiteConfig::set_config_path($path, $fallback); } protected function debug($msg) { @@ -66,71 +63,6 @@ class ContentExtractor $this->success = false; } - // returns SiteConfig instance if an appropriate one is found, false otherwise - public function get_site_config($host) { - $host = strtolower($host); - if (substr($host, 0, 
4) == 'www.') $host = substr($host, 4); - if (!$host || (strlen($host) > 200) || !preg_match(self::HOSTNAME_REGEX, $host)) return false; - // check for site configuration - $try = array($host); - $split = explode('.', $host); - if (count($split) > 1) { - array_shift($split); - $try[] = '.'.implode('.', $split); - } - foreach ($try as $h) { - if (array_key_exists($h, self::$config_cache)) { - $this->debug("... cached ($h)"); - return self::$config_cache[$h]; - } elseif (file_exists($this->config_path."/$h.txt")) { - $this->debug("... from file ($h)"); - $file = $this->config_path."/$h.txt"; - break; - } - } - if (!isset($file)) { - if (isset($this->fallback)) { - $this->debug("... trying fallback ($host)"); - return $this->fallback->get_site_config($host); - } else { - $this->debug("... no match ($host)"); - return false; - } - } - $config_file = file($file, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES); - if (!$config_file || !is_array($config_file)) return false; - $config = new SiteConfig(); - foreach ($config_file as $line) { - $line = trim($line); - - // skip comments, empty lines - if ($line == '' || $line[0] == '#') continue; - - // get command - $command = explode(':', $line, 2); - // if there's no colon ':', skip this line - if (count($command) != 2) continue; - $val = trim($command[1]); - $command = trim($command[0]); - if ($command == '' || $val == '') continue; - - // check for commands where we accept multiple statements - if (in_array($command, array('title', 'body', 'strip', 'strip_id_or_class', 'strip_image_src'))) { - array_push($config->$command, $val); - // check for single statement commands that evaluate to true or false - } elseif (in_array($command, array('tidy', 'prune', 'autodetect_on_failure'))) { - $config->$command = ($val == 'yes'); - // check for single statement commands stored as strings - } elseif (in_array($command, array('test_url'))) { - $config->$command = $val; - } - } - // store copy of config in our static cache array in case 
we need to process another URL - self::$config_cache[$h] = $config; - - return $config; - } - // returns true on success, false on failure // $smart_tidy indicates that if tidy is used and no results are produced, we will // try again without it. Tidy helps us deal with PHP's patchy HTML parsing most of the time @@ -140,11 +72,12 @@ class ContentExtractor // extract host name $host = @parse_url($url, PHP_URL_HOST); - if (!($this->config = $this->get_site_config($host))) { + if (!($this->config = SiteConfig::build($host))) { // no match, so use defaults $this->config = new SiteConfig(); - self::$config_cache[$host] = $this->config; } + // store copy of config in our static cache array in case we need to process another URL + SiteConfig::add_to_cache($host, $this->config); // use tidy (if it exists)? // This fixes problems with some sites which would otherwise diff --git a/libraries/content-extractor/SiteConfig.php b/libraries/content-extractor/SiteConfig.php index b816d0a..9387702 100644 --- a/libraries/content-extractor/SiteConfig.php +++ b/libraries/content-extractor/SiteConfig.php @@ -47,5 +47,120 @@ class SiteConfig // Test URL - if present, can be used to test the config above public $test_url = null; + + // Single-page link - should identify a link element or URL pointing to the page holding the entire article + // This is useful for sites which split their articles across multiple pages. Links to such pages tend to + // display the first page with links to the other pages at the bottom. Often there is also a link to a page + // which displays the entire article on one page (e.g. 'print view'). + // This should be an XPath expression identifying the link to that page. If present and we find a match, + // we will retrieve that page and the rest of the options in this config will be applied to the new page. + public $single_page_link = array(); + + // Single-page link in feed? 
- same as above, but patterns applied to item description HTML taken from feed + public $single_page_link_in_feed = array(); + + // TODO: which parser to use for turning raw HTML into a DOMDocument + public $parser = 'libxml'; + + // the options below cannot be set in the config files which this class represents + + public static $debug = false; + protected static $config_path; + protected static $config_path_fallback; + protected static $config_cache = array(); + const HOSTNAME_REGEX = '/^(([a-zA-Z0-9-]*[a-zA-Z0-9])\.)*([A-Za-z0-9-]*[A-Za-z0-9])$/'; + + protected static function debug($msg) { + if (self::$debug) { + $mem = round(memory_get_usage()/1024, 2); + $memPeak = round(memory_get_peak_usage()/1024, 2); + echo '* ',$msg; + echo ' - mem used: ',$mem," (peak: $memPeak)\n"; + if (ob_get_level() > 0) ob_flush(); // only flush when an output buffer is active; ob_flush() raises a notice otherwise + flush(); + } + } + + public static function set_config_path($path, $fallback=null) { + self::$config_path = $path; + self::$config_path_fallback = $fallback; + } + + public static function add_to_cache($host, SiteConfig $config) { + $host = preg_replace('/^www\./', '', strtolower($host)); // normalise exactly as build() does (lower-case, leading "www." removed) so entries cached for www hosts are found on the next lookup + self::$config_cache[$host] = $config; + } + + // returns SiteConfig instance if an appropriate one is found, false otherwise + public static function build($host) { + $host = strtolower($host); + if (substr($host, 0, 4) == 'www.') $host = substr($host, 4); + if (!$host || (strlen($host) > 200) || !preg_match(self::HOSTNAME_REGEX, $host)) return false; + // check for site configuration + $try = array($host); + $split = explode('.', $host); + if (count($split) > 1) { + array_shift($split); + $try[] = '.'.implode('.', $split); + } + foreach ($try as $h) { + if (array_key_exists($h, self::$config_cache)) { + self::debug("... cached ($h)"); + return self::$config_cache[$h]; + } elseif (file_exists(self::$config_path."/$h.txt")) { + self::debug("... from file ($h)"); + $file = self::$config_path."/$h.txt"; + break; + } + } + if (!isset($file)) { + if (isset(self::$config_path_fallback)) { + self::debug("... 
trying fallback ($host)"); + foreach ($try as $h) { + if (file_exists(self::$config_path_fallback."/$h.txt")) { + self::debug("... from fallback file ($h)"); + $file = self::$config_path_fallback."/$h.txt"; + break; + } + } + if (!isset($file)) { + self::debug("... no match in fallback directory"); + return false; + } + } else { + self::debug("... no match ($host)"); + return false; + } + } + $config_file = file($file, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES); + if (!$config_file || !is_array($config_file)) return false; + $config = new SiteConfig(); + foreach ($config_file as $line) { + $line = trim($line); + + // skip comments, empty lines + if ($line == '' || $line[0] == '#') continue; + + // get command + $command = explode(':', $line, 2); + // if there's no colon ':', skip this line + if (count($command) != 2) continue; + $val = trim($command[1]); + $command = trim($command[0]); + if ($command == '' || $val == '') continue; + + // check for commands where we accept multiple statements + if (in_array($command, array('title', 'body', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed'))) { + array_push($config->$command, $val); + // check for single statement commands that evaluate to true or false + } elseif (in_array($command, array('tidy', 'prune', 'autodetect_on_failure'))) { + $config->$command = ($val == 'yes'); + // check for single statement commands stored as strings + } elseif (in_array($command, array('test_url'))) { + $config->$command = $val; + } + } + return $config; + } } ?> \ No newline at end of file diff --git a/libraries/humble-http-agent/HumbleHttpAgent.php b/libraries/humble-http-agent/HumbleHttpAgent.php index 92c69af..fcdce01 100644 --- a/libraries/humble-http-agent/HumbleHttpAgent.php +++ b/libraries/humble-http-agent/HumbleHttpAgent.php @@ -7,8 +7,8 @@ * For environments which do not have these options, it reverts to standard sequential * requests (using file_get_contents()) * - * 
@version 0.8 - * @date 2011-02-28 + * @version 0.9.5 + * @date 2011-05-23 * @see http://php.net/HttpRequestPool * @author Keyvan Minoukadeh * @copyright 2011 Keyvan Minoukadeh @@ -104,6 +104,15 @@ class HumbleHttpAgent return $iri->uri; } + public function removeFragment($url) { + $pos = strpos($url, '#'); + if ($pos === false) { + return $url; + } else { + return substr($url, 0, $pos); + } + } + public function enableDebug($bool=true) { $this->debug = (bool)$bool; } @@ -211,6 +220,7 @@ class HumbleHttpAgent } else { $this->debug("......adding to pool"); $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($url) : $url; + $req_url = $this->removeFragment($req_url); $httpRequest = new HttpRequest($req_url, HttpRequest::METH_GET, $this->requestOptions); // send cookies, if we have any if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) { @@ -225,7 +235,11 @@ class HumbleHttpAgent // did we get anything into the pool? if (count($pool) > 0) { $this->debug('Sending request...'); - $pool->send(); + try { + $pool->send(); + } catch (HttpRequestPoolException $e) { + // do nothing + } $this->debug('Received responses'); foreach($subset as $orig => $url) { if (!$isRedirect) $orig = $url; @@ -240,7 +254,9 @@ class HumbleHttpAgent // is redirect? if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && $request->getResponseHeader('location')) { $redirectURL = $request->getResponseHeader('location'); - $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url); + if (!preg_match('!^https?://!i', $redirectURL)) { + $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url); + } if ($this->validateURL($redirectURL)) { $this->debug('Redirect detected. Valid URL: '.$redirectURL); // store any cookies @@ -298,6 +314,7 @@ class HumbleHttpAgent } else { $this->debug("......adding to pool"); $req_url = ($this->rewriteHashbangFragment) ? 
$this->rewriteHashbangFragment($url) : $url; + $req_url = $this->removeFragment($req_url); $headers = array(); // send cookies, if we have any if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) { @@ -327,7 +344,9 @@ class HumbleHttpAgent $status_code = $this->requests[$orig]['status_code']; if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) { $redirectURL = $this->requests[$orig]['location']; - $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url); + if (!preg_match('!^https?://!i', $redirectURL)) { + $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url); + } if ($this->validateURL($redirectURL)) { $this->debug('Redirect detected. Valid URL: '.$redirectURL); // store any cookies @@ -367,6 +386,7 @@ class HumbleHttpAgent $this->debug("Sending request for $url"); $this->requests[$orig]['original_url'] = $orig; $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($url) : $url; + $req_url = $this->removeFragment($req_url); // send cookies, if we have any $httpContext = $this->httpContext; if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) { @@ -391,7 +411,9 @@ class HumbleHttpAgent } if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) { $redirectURL = $this->requests[$orig]['location']; - $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url); + if (!preg_match('!^https?://!i', $redirectURL)) { + $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url); + } if ($this->validateURL($redirectURL)) { $this->debug('Redirect detected. 
Valid URL: '.$redirectURL); // store any cookies @@ -444,6 +466,7 @@ class HumbleHttpAgent } public function get($url, $remove=false) { + $url = "$url"; if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) { $this->debug("URL already fetched - in memory ($url, effective: {$this->requests[$url]['effective_url']})"); $response = $this->requests[$url]; diff --git a/makefulltextfeed.php b/makefulltextfeed.php index fdff082..f2df18a 100644 --- a/makefulltextfeed.php +++ b/makefulltextfeed.php @@ -3,8 +3,8 @@ // Author: Keyvan Minoukadeh // Copyright (c) 2011 Keyvan Minoukadeh // License: AGPLv3 -// Version: 2.7 -// Date: 2011-03-21 +// Version: 2.8 +// Date: 2011-05-23 /* This program is free software: you can redistribute it and/or modify @@ -73,131 +73,11 @@ function __autoload($class_name) { } } -function url_allowed($url) { - global $options; - if (!empty($options->allowed_urls)) { - $allowed = false; - foreach ($options->allowed_urls as $allowurl) { - if (stristr($url, $allowurl) !== false) { - $allowed = true; - break; - } - } - if (!$allowed) return false; - } else { - foreach ($options->blocked_urls as $blockurl) { - if (stristr($url, $blockurl) !== false) { - return false; - } - } - } - return true; -} - //////////////////////////////// // Load config file if it exists //////////////////////////////// require_once(dirname(__FILE__).'/config.php'); -////////////////////////////////////////////// -// Convert $html to UTF8 -// (uses HTTP headers and HTML to find encoding) -// adapted from http://stackoverflow.com/questions/910793/php-detect-encoding-and-make-everything-utf-8 -////////////////////////////////////////////// -function convert_to_utf8($html, $header=null) -{ - $encoding = null; - if ($html || $header) { - if (is_array($header)) $header = implode("\n", $header); - if (!$header || !preg_match_all('/^Content-Type:\s+([^;]+)(?:;\s*charset=["\']?([^;"\'\n]*))?/im', $header, $match, PREG_SET_ORDER)) { - // error parsing the response - } 
else { - $match = end($match); // get last matched element (in case of redirects) - if (isset($match[2])) $encoding = trim($match[2], '"\''); - } - if (!$encoding) { - if (preg_match('/^<\?xml\s+version=(?:"[^"]*"|\'[^\']*\')\s+encoding=("[^"]*"|\'[^\']*\')/s', $html, $match)) { - $encoding = trim($match[1], '"\''); - } elseif(preg_match('/]+)/i', $html, $match)) { - if (isset($match[1])) $encoding = trim($match[1]); - } - } - if (!$encoding) { - $encoding = 'utf-8'; - } else { - if (strtolower($encoding) != 'utf-8') { - if (strtolower($encoding) == 'iso-8859-1') { - // replace MS Word smart qutoes - $trans = array(); - $trans[chr(130)] = '‚'; // Single Low-9 Quotation Mark - $trans[chr(131)] = 'ƒ'; // Latin Small Letter F With Hook - $trans[chr(132)] = '„'; // Double Low-9 Quotation Mark - $trans[chr(133)] = '…'; // Horizontal Ellipsis - $trans[chr(134)] = '†'; // Dagger - $trans[chr(135)] = '‡'; // Double Dagger - $trans[chr(136)] = 'ˆ'; // Modifier Letter Circumflex Accent - $trans[chr(137)] = '‰'; // Per Mille Sign - $trans[chr(138)] = 'Š'; // Latin Capital Letter S With Caron - $trans[chr(139)] = '‹'; // Single Left-Pointing Angle Quotation Mark - $trans[chr(140)] = 'Œ'; // Latin Capital Ligature OE - $trans[chr(145)] = '‘'; // Left Single Quotation Mark - $trans[chr(146)] = '’'; // Right Single Quotation Mark - $trans[chr(147)] = '“'; // Left Double Quotation Mark - $trans[chr(148)] = '”'; // Right Double Quotation Mark - $trans[chr(149)] = '•'; // Bullet - $trans[chr(150)] = '–'; // En Dash - $trans[chr(151)] = '—'; // Em Dash - $trans[chr(152)] = '˜'; // Small Tilde - $trans[chr(153)] = '™'; // Trade Mark Sign - $trans[chr(154)] = 'š'; // Latin Small Letter S With Caron - $trans[chr(155)] = '›'; // Single Right-Pointing Angle Quotation Mark - $trans[chr(156)] = 'œ'; // Latin Small Ligature OE - $trans[chr(159)] = 'Ÿ'; // Latin Capital Letter Y With Diaeresis - $html = strtr($html, $trans); - } - $html = SimplePie_Misc::change_encoding($html, $encoding, 
'utf-8'); - - /* - if (function_exists('iconv')) { - // iconv appears to handle certain character encodings better than mb_convert_encoding - $html = iconv($encoding, 'utf-8', $html); - } else { - $html = mb_convert_encoding($html, 'utf-8', $encoding); - } - */ - } - } - } - return $html; -} - -function makeAbsolute($base, $elem) { - $base = new IRI($base); - foreach(array('a'=>'href', 'img'=>'src') as $tag => $attr) { - $elems = $elem->getElementsByTagName($tag); - for ($i = $elems->length-1; $i >= 0; $i--) { - $e = $elems->item($i); - //$e->parentNode->replaceChild($articleContent->ownerDocument->createTextNode($e->textContent), $e); - makeAbsoluteAttr($base, $e, $attr); - } - if (strtolower($elem->tagName) == $tag) makeAbsoluteAttr($base, $elem, $attr); - } -} -function makeAbsoluteAttr($base, $e, $attr) { - if ($e->hasAttribute($attr)) { - // Trim leading and trailing white space. I don't really like this but - // unfortunately it does appear on some sites. e.g. - $url = trim(str_replace('%20', ' ', $e->getAttribute($attr))); - $url = str_replace(' ', '%20', $url); - if (!preg_match('!https?://!i', $url)) { - $absolute = IRI::absolutize($base, $url); - if ($absolute) { - $e->setAttribute($attr, $absolute); - } - } - } -} - //////////////////////////////// // Check if service is enabled //////////////////////////////// @@ -211,7 +91,7 @@ if (!$options->enabled) { if (!isset($_GET['url'])) { die('No URL supplied'); } -$url = $_GET['url']; +$url = trim($_GET['url']); if (!preg_match('!^https?://.+!i', $url)) { $url = 'http://'.$url; } @@ -240,6 +120,7 @@ if ($options->alternative_url != '' && !isset($_GET['redir']) && mt_rand(0, 100) if (isset($_GET['links'])) $redirect .= '&links='.$_GET['links']; if (isset($_GET['exc'])) $redirect .= '&exc='.$_GET['exc']; if (isset($_GET['what'])) $redirect .= '&what='.$_GET['what']; + if (isset($_GET['format'])) $redirect .= '&format='.$_GET['format']; header("Location: $redirect"); exit; } @@ -258,6 +139,7 @@ if 
(isset($_GET['key']) && ($key_index = array_search($_GET['key'], $options->ap if (isset($_GET['links'])) $redirect .= '&links='.urlencode($_GET['links']); if (isset($_GET['exc'])) $redirect .= '&exc='.urlencode($_GET['exc']); if (isset($_GET['what'])) $redirect .= '&what='.urlencode($_GET['what']); + if (isset($_GET['format'])) $redirect .= '&format='.urlencode($_GET['format']); header("Location: $redirect"); exit; } @@ -364,9 +246,13 @@ if (($extract_pattern != '') && ($extract_pattern != 'auto')) { ///////////////////////////////////// // Check for valid format -// (stick to RSS for the time being) +// (stick to RSS (or RSS as JSON) for the time being) ///////////////////////////////////// -$format = 'rss'; +if (isset($_GET['format']) && $_GET['format'] == 'json') { + $format = 'json'; +} else { + $format = 'rss'; +} ////////////////////////////////// // Check for cached copy @@ -392,10 +278,14 @@ if ($options->caching) { // getting a Zend_Cache_Core object $cache = Zend_Cache::factory('Core', 'File', $frontendOptions, $backendOptions); - $cache_id = md5($max.$url.$valid_key.$links.$exclude_on_fail.$auto_extract.$extract_pattern.(int)isset($_GET['pubsub'])); + $cache_id = md5($max.$url.$valid_key.$links.$exclude_on_fail.$auto_extract.$extract_pattern.$format.(int)isset($_GET['pubsub'])); if ($data = $cache->load($cache_id)) { - header("Content-type: text/xml; charset=UTF-8"); + if ($format == 'json') { + header("Content-type: application/json; charset=UTF-8"); + } else { + header("Content-type: text/xml; charset=UTF-8"); + } if (headers_sent()) die('Some data has already been output, can\'t send RSS file'); echo $data; exit; @@ -419,7 +309,7 @@ $http = new HumbleHttpAgent(); ////////////////////////////////// // Set up Content Extractor ////////////////////////////////// -$extractor = new ContentExtractor(dirname(__FILE__).'/site_config/custom', new ContentExtractor(dirname(__FILE__).'/site_config/standard')); +$extractor = new 
ContentExtractor(dirname(__FILE__).'/site_config/custom', dirname(__FILE__).'/site_config/standard'); /* if ($options->caching) { @@ -453,7 +343,8 @@ if (!$html_only) { SimplePie_HumbleHttpAgent::set_agent($http); $feed = new SimplePie(); $feed->set_file_class('SimplePie_HumbleHttpAgent'); - $feed->set_feed_url($url); + //$feed->set_feed_url($url); // colons appearing in the URL's path get encoded + $feed->feed_url = $url; $feed->set_autodiscovery_level(SIMPLEPIE_LOCATOR_NONE); $feed->set_timeout(20); $feed->enable_cache(false); @@ -471,97 +362,34 @@ if (!$html_only) { } //////////////////////////////////////////////////////////////////////////////// -// Extract content from HTML (if URL is not feed or explicit HTML request has been made) +// Our given URL is not a feed, so let's create our own feed with a single item: +// the given URL. This basically treats all non-feed URLs as if they were +// single-item feeds. //////////////////////////////////////////////////////////////////////////////// +$isDummyFeed = false; if ($html_only || !$result) { + $isDummyFeed = true; unset($feed, $result); - if ($response = $http->get($url)) { - $effective_url = $response['effective_url']; - if (!url_allowed($effective_url)) die('URL blocked'); - $html = $response['body']; - // remove strange things here - $html = str_replace('', '', $html); - $html = convert_to_utf8($html, $response['headers']); + // create single item dummy feed object + class DummySingleItemFeed { + public $item; + function __construct($url) { $this->item = new DummySingleItem($url); } + public function get_title() { return ''; } + public function get_description() { return 'Content extracted from '.$this->item->url; } + public function get_link() { return $this->item->url; } + public function get_image_url() { return false; } + public function get_items($start=0, $max=1) { return array(0=>$this->item); } } - if (!$response || $response['status_code'] >= 300) { - die('Error retrieving '.$url); + class 
DummySingleItem { + public $url; + function __construct($url) { $this->url = $url; } + public function get_permalink() { return $this->url; } + public function get_title() { return ''; } + public function get_date($format='') { return false; } + public function get_author() { return false; } + public function get_description() { return ''; } } - if ($auto_extract) { - $extract_result = $extractor->process($html, $effective_url); - if (!$extract_result) die($options->error_message); - $readability = $extractor->readability; - $content_block = $extractor->getContent(); - $title = $extractor->getTitle(); - } else { - $readability = new Readability($html, $effective_url); - // content block is entire document - $content_block = $readability->dom; - //TODO: get title - $title = ''; - } - if ($extract_pattern) { - $xpath = new DOMXPath($readability->dom); - $elems = @$xpath->query($extract_pattern, $content_block); - // check if our custom extraction pattern matched - if ($elems && $elems->length > 0) { - // get the first matched element - $content_block = $elems->item(0); - // clean it up - $readability->removeScripts($content_block); - $readability->prepArticle($content_block); - } else { - die($options->error_message); - //$content_block = $readability->dom->createElement('p', 'Sorry, could not extract content'); - } - } - $readability->clean($content_block, 'select'); - if ($options->rewrite_relative_urls) makeAbsolute($effective_url, $content_block); - // footnotes - if (($links == 'footnotes') && (strpos($effective_url, 'wikipedia.org') === false)) { - $readability->addFootnotes($content_block); - } - if ($extract_pattern) { - // get outerHTML - $content = $content_block->ownerDocument->saveXML($content_block); - } else { - if ($content_block->childNodes->length == 1 && $content_block->firstChild->nodeType === XML_ELEMENT_NODE) { - $content = $content_block->firstChild->innerHTML; - } else { - $content = $content_block->innerHTML; - } - } - if ($links == 'remove') 
{ - $content = preg_replace('!]*>!', '', $content); - } - if (!$valid_key) { - $content = $options->message_to_prepend.$content; - $content .= $options->message_to_append; - } else { - $content = $options->message_to_prepend_with_key.$content; - $content .= $options->message_to_append_with_key; - } - unset($readability, $html); - $output = new FeedWriter(); //ATOM an option - $output->setTitle($title); - $output->setDescription("Content extracted from $url"); - $output->setXsl('css/feed.xsl'); // Chrome uses this, most browsers ignore it - if ($format == 'atom') { - $output->setChannelElement('updated', date(DATE_ATOM)); - $output->setChannelElement('author', array('name'=>'Five Filters', 'uri'=>'http://fivefilters.org')); - } - $output->setLink($url); - $newitem = $output->createNewItem(); - $newitem->setTitle($title); - $newitem->setLink($url); - if ($format == 'atom') { - $newitem->setDate(time()); - $newitem->addElement('content', $content); - } else { - $newitem->setDescription($content); - } - $output->addItem($newitem); - $output->genarateFeed(); - exit; + $feed = new DummySingleItemFeed($url); } //////////////////////////////////////////// @@ -594,6 +422,8 @@ $urls_sanitized = array(); $urls = array(); foreach ($items as $key => $item) { $permalink = htmlspecialchars_decode($item->get_permalink()); + // Colons in URL path segments get encoded by SimplePie, yet some sites expect them unencoded + $permalink = str_replace('%3A', ':', $permalink); $permalink = $http->validateUrl($permalink); if ($permalink) { $urls_sanitized[] = $permalink; @@ -625,17 +455,34 @@ foreach ($items as $key => $item) { $effective_url = $response['effective_url']; if (!url_allowed($effective_url)) continue; $html = $response['body']; - // remove strange things here - $html = str_replace('', '', $html); + // remove strange things + $html = str_replace('', '', $html); $html = convert_to_utf8($html, $response['headers']); if ($auto_extract) { + // check site config for single page URL - 
fetch it if found + if ($single_page_response = getSinglePage($item, $html, $effective_url)) { + $html = $single_page_response['body']; + // remove strange things + $html = str_replace('', '', $html); + $html = convert_to_utf8($html, $single_page_response['headers']); + $effective_url = $single_page_response['effective_url']; + unset($single_page_response); + } $extract_result = $extractor->process($html, $effective_url); $readability = $extractor->readability; $content_block = ($extract_result) ? $extractor->getContent() : null; + $title = ($extract_result) ? $extractor->getTitle() : ''; } else { $readability = new Readability($html, $effective_url); // content block is entire document (for now...) - $content_block = $readability->dom; + $content_block = $readability->dom; + //TODO: get title + $title = ''; + } + // use extracted title for both feed and item title if we're using single-item dummy feed + if ($isDummyFeed) { + $output->setTitle($title); + $newitem->setTitle($title); } if ($extract_pattern && isset($content_block)) { $xpath = new DOMXPath($readability->dom); @@ -684,11 +531,11 @@ foreach ($items as $key => $item) { $html = preg_replace('!]*>!', '', $html); } if (!$valid_key) { - $html = $options->message_to_prepend.$html; - $html .= $options->message_to_append; + $html = make_substitutions($options->message_to_prepend).$html; + $html .= make_substitutions($options->message_to_append); } else { - $html = $options->message_to_prepend_with_key.$html; - $html .= $options->message_to_append_with_key; + $html = make_substitutions($options->message_to_prepend_with_key).$html; + $html .= make_substitutions($options->message_to_append_with_key); } } if ($format == 'atom') { @@ -715,14 +562,215 @@ foreach ($items as $key => $item) { unset($html); } // output feed -if ($options->caching) { +if ($options->caching || $format == 'json') { ob_start(); $output->genarateFeed(); $output = ob_get_contents(); ob_end_clean(); - $cache->save($output, $cache_id); + if 
($format == 'json') { + $jsonrss = new stdClass(); + $jsonrss->rss = @simplexml_load_string($output); + $output = json_encode($jsonrss); + header("Content-type: application/json; charset=UTF-8"); + } + if ($options->caching) $cache->save($output, $cache_id); echo $output; } else { $output->genarateFeed(); } + +/////////////////////////////// +// HELPER FUNCTIONS +/////////////////////////////// + +function url_allowed($url) { + global $options; + if (!empty($options->allowed_urls)) { + $allowed = false; + foreach ($options->allowed_urls as $allowurl) { + if (stristr($url, $allowurl) !== false) { + $allowed = true; + break; + } + } + if (!$allowed) return false; + } else { + foreach ($options->blocked_urls as $blockurl) { + if (stristr($url, $blockurl) !== false) { + return false; + } + } + } + return true; +} + +////////////////////////////////////////////// +// Convert $html to UTF8 +// (uses HTTP headers and HTML to find encoding) +// adapted from http://stackoverflow.com/questions/910793/php-detect-encoding-and-make-everything-utf-8 +////////////////////////////////////////////// +function convert_to_utf8($html, $header=null) +{ + $encoding = null; + if ($html || $header) { + if (is_array($header)) $header = implode("\n", $header); + if (!$header || !preg_match_all('/^Content-Type:\s+([^;]+)(?:;\s*charset=["\']?([^;"\'\n]*))?/im', $header, $match, PREG_SET_ORDER)) { + // error parsing the response + } else { + $match = end($match); // get last matched element (in case of redirects) + if (isset($match[2])) $encoding = trim($match[2], '"\''); + } + if (!$encoding) { + if (preg_match('/^<\?xml\s+version=(?:"[^"]*"|\'[^\']*\')\s+encoding=("[^"]*"|\'[^\']*\')/s', $html, $match)) { + $encoding = trim($match[1], '"\''); + } elseif(preg_match('/]+)/i', $html, $match)) { + if (isset($match[1])) $encoding = trim($match[1]); + } + } + if (!$encoding) { + $encoding = 'utf-8'; + } else { + if (strtolower($encoding) != 'utf-8') { + if (strtolower($encoding) == 'iso-8859-1') { 
+ // replace MS Word smart qutoes + $trans = array(); + $trans[chr(130)] = '‚'; // Single Low-9 Quotation Mark + $trans[chr(131)] = 'ƒ'; // Latin Small Letter F With Hook + $trans[chr(132)] = '„'; // Double Low-9 Quotation Mark + $trans[chr(133)] = '…'; // Horizontal Ellipsis + $trans[chr(134)] = '†'; // Dagger + $trans[chr(135)] = '‡'; // Double Dagger + $trans[chr(136)] = 'ˆ'; // Modifier Letter Circumflex Accent + $trans[chr(137)] = '‰'; // Per Mille Sign + $trans[chr(138)] = 'Š'; // Latin Capital Letter S With Caron + $trans[chr(139)] = '‹'; // Single Left-Pointing Angle Quotation Mark + $trans[chr(140)] = 'Œ'; // Latin Capital Ligature OE + $trans[chr(145)] = '‘'; // Left Single Quotation Mark + $trans[chr(146)] = '’'; // Right Single Quotation Mark + $trans[chr(147)] = '“'; // Left Double Quotation Mark + $trans[chr(148)] = '”'; // Right Double Quotation Mark + $trans[chr(149)] = '•'; // Bullet + $trans[chr(150)] = '–'; // En Dash + $trans[chr(151)] = '—'; // Em Dash + $trans[chr(152)] = '˜'; // Small Tilde + $trans[chr(153)] = '™'; // Trade Mark Sign + $trans[chr(154)] = 'š'; // Latin Small Letter S With Caron + $trans[chr(155)] = '›'; // Single Right-Pointing Angle Quotation Mark + $trans[chr(156)] = 'œ'; // Latin Small Ligature OE + $trans[chr(159)] = 'Ÿ'; // Latin Capital Letter Y With Diaeresis + $html = strtr($html, $trans); + } + $html = SimplePie_Misc::change_encoding($html, $encoding, 'utf-8'); + + /* + if (function_exists('iconv')) { + // iconv appears to handle certain character encodings better than mb_convert_encoding + $html = iconv($encoding, 'utf-8', $html); + } else { + $html = mb_convert_encoding($html, 'utf-8', $encoding); + } + */ + } + } + } + return $html; +} + +function makeAbsolute($base, $elem) { + $base = new IRI($base); + // remove '//' in URL path (causes URLs not to resolve properly) + if (isset($base->ipath)) $base->ipath = preg_replace('!//+!', '/', $base->ipath); + foreach(array('a'=>'href', 'img'=>'src') as $tag => $attr) { + 
$elems = $elem->getElementsByTagName($tag); + for ($i = $elems->length-1; $i >= 0; $i--) { + $e = $elems->item($i); + //$e->parentNode->replaceChild($articleContent->ownerDocument->createTextNode($e->textContent), $e); + makeAbsoluteAttr($base, $e, $attr); + } + if (strtolower($elem->tagName) == $tag) makeAbsoluteAttr($base, $elem, $attr); + } +} +function makeAbsoluteAttr($base, $e, $attr) { + if ($e->hasAttribute($attr)) { + // Trim leading and trailing white space. I don't really like this but + // unfortunately it does appear on some sites. e.g. + $url = trim(str_replace('%20', ' ', $e->getAttribute($attr))); + $url = str_replace(' ', '%20', $url); + if (!preg_match('!https?://!i', $url)) { + $absolute = IRI::absolutize($base, $url); + if ($absolute) { + $e->setAttribute($attr, $absolute); + } + } + } +} +function makeAbsoluteStr($base, $url) { + $base = new IRI($base); + // remove '//' in URL path (causes URLs not to resolve properly) + if (isset($base->ipath)) $base->ipath = preg_replace('!//+!', '/', $base->ipath); + if (preg_match('!^https?://!i', $url)) { + // already absolute + return $url; + } else { + $absolute = IRI::absolutize($base, $url); + if ($absolute) return $absolute; + return false; + } +} +// returns single page response, or false if not found +function getSinglePage($item, $html, $url) { + global $http; + $host = @parse_url($url, PHP_URL_HOST); + $site_config = SiteConfig::build($host); + if ($site_config === false) return false; + $splink = null; + if (!empty($site_config->single_page_link)) { + $splink = $site_config->single_page_link; + } elseif (!empty($site_config->single_page_link_in_feed)) { + // single page link xpath is targeted at feed + $splink = $site_config->single_page_link_in_feed; + // so let's replace HTML with feed item description + $html = $item->get_description(); + } + if (isset($splink)) { + // Build DOM tree from HTML + $readability = new Readability($html, $url); + $xpath = new DOMXPath($readability->dom); + // Loop 
through single_page_link xpath expressions + $single_page_url = null; + foreach ($splink as $pattern) { + $elems = @$xpath->evaluate($pattern, $readability->dom); + if (is_string($elems)) { + $single_page_url = trim($elems); + break; + } elseif ($elems instanceof DOMNodeList && $elems->length > 0) { + foreach ($elems as $item) { + if ($item->hasAttribute('href')) { + $single_page_url = $item->getAttribute('href'); + break; + } + } + } + } + // If we've got URL, resolve against $url + if (isset($single_page_url) && ($single_page_url = makeAbsoluteStr($url, $single_page_url))) { + // check it's not what we have already! + if ($single_page_url != $url) { + // it's not, so let's try to fetch it... + if (($response = $http->get($single_page_url, true)) && $response['status_code'] < 300) { + return $response; + } + } + } + } + return false; +} + +function make_substitutions($string) { + if ($string == '') return $string; + global $item, $effective_url; + $string = str_replace('{url}', htmlspecialchars($item->get_permalink()), $string); + $string = str_replace('{effective-url}', htmlspecialchars($effective_url), $string); + return $string; +} ?> \ No newline at end of file