From e2a9b81740367b575ab58fac4cf418e969ef8284 Mon Sep 17 00:00:00 2001 From: Keyvan Date: Fri, 4 Nov 2011 18:10:31 +0100 Subject: [PATCH] Full-Text RSS 2.6 --- README.txt | 29 ++ UPDATING.txt | 22 + changelog.txt | 10 + cleancache.php | 2 + config.php | 19 +- ftr_compatibility_test.php | 43 +- index.php | 18 +- .../humble-http-agent/HumbleHttpAgent.php | 338 +++++++++++---- libraries/humble-http-agent/RollingCurl.php | 392 ++++++++++++++++++ .../SimplePie_HumbleHttpAgent.php | 76 ++++ libraries/iri/iri.php | 2 +- libraries/readability/Readability.php | 1 + .../readability/examples/Readability.php | 9 + makefulltextfeed.php | 100 +++-- 14 files changed, 906 insertions(+), 155 deletions(-) create mode 100644 README.txt create mode 100644 UPDATING.txt create mode 100644 libraries/humble-http-agent/RollingCurl.php create mode 100644 libraries/humble-http-agent/SimplePie_HumbleHttpAgent.php diff --git a/README.txt b/README.txt new file mode 100644 index 0000000..27121aa --- /dev/null +++ b/README.txt @@ -0,0 +1,29 @@ +Full-Text RSS +============= + +About +----- + +See http://fivefilters.org/content-only/ for a description of the code. + + +Installation +------------ + +1. Extract the files in this ZIP archive to a folder on your computer. + +2. FTP the files up to your server + +3. Access index.php through your browser. E.g. http://my-host.com/full-text-rss/index.php + +4. Enter a URL in the form field to test the code + +5. If you get an RSS feed with full-text content, all is working well. :) + +Configuration (optional) +------------------------ + +1. Save a copy of config.php as custom_config.php and edit custom_config.php + +2. If you decide to enable caching, make sure the cache folder (and its 2 sub folders) is writable. +(you might need to change the permissions of these folders to 777 through your FTP client). 
\ No newline at end of file diff --git a/UPDATING.txt b/UPDATING.txt new file mode 100644 index 0000000..530362e --- /dev/null +++ b/UPDATING.txt @@ -0,0 +1,22 @@ +Updating Full-Text RSS +====================== + +To update your copy of Full-Text RSS to ensure feeds continue to be processed as they were before, we suggest the following steps: + +1. Keep your current installation in place for now (we'll deal with it later) + +2. Extract this updated package to a new folder -- for example, if the last version is in a folder called 'full-text-rss', extract this version to a new folder called 'full-text-rss-updated' + +3. FTP the new folder up to your server + +4. Access index.php in the new folder through your browser -- for example http://my-host.com/full-text-rss-updated/index.php + +5. Enter a URL in the form field to test the updated code + +6. If you'd configured the last version, copy custom_config.php from your old version to the new folder. + +7. Test the new copy again to make sure the config values are now applied to the new version. + +8. Now simply rename the folder with your old copy to 'full-text-rss-old' and then rename the folder with the new copy to 'full-text-rss' (or whatever name you'd given the original folder). + +That's all that's needed. Your feeds should continue to work as they did before. Let us know if you have any trouble: fivefilters@fivefilters.org. \ No newline at end of file diff --git a/changelog.txt b/changelog.txt index d90ad28..823f62d 100644 --- a/changelog.txt +++ b/changelog.txt @@ -2,6 +2,16 @@ FiveFilters.org: Full-Text RSS http://fivefilters.org/content-only/ CHANGELOG ------------------------------------ +2.6 (2011-03-02) + - Rewriting of hash-bang (#!) 
URLs (see http://www.tbray.org/ongoing/When/201x/2011/02/09/Hash-Blecch for an explanation) + - Improved parallel fetching support (HumbleHttpAgent uses curl_multi_* functions if PECL HTTP extension is not present) + - Improved HTTP redirect support (now handled in HumbleHttpAgent, no longer relies on PHP) + - Improved performance for single page (non-feed) requests: (SimplePie connected to HumbleHttpAgent) + - Improved memory use for processing large feeds (HumbleHttpAgent's stored responses cleared as they're retrieved) + - Bug fix: exclude on fail option no longer requires valid key + - Bug fix: workaround for PHP bug http://bugs.php.net/51192 (fixed in makefulltextfeed.php) + - Plus other minor changes... + 2.5 (2011-01-08) - New option: custom extraction pattern (CSS selectors) - New option: allowed URLs (restrict service to pre-defined feeds/domains) diff --git a/cleancache.php b/cleancache.php index 49c6a3e..d1d9711 100644 --- a/cleancache.php +++ b/cleancache.php @@ -53,6 +53,7 @@ function __autoload($class_name) { require_once(dirname(__FILE__).'/config.php'); if (!$options->caching) die('Caching is disabled'); +/* // clean http response cache $frontendOptions = array( 'lifetime' => 30*60, // cache lifetime of 30 minutes @@ -73,6 +74,7 @@ $backendOptions = array( ); $cache = Zend_Cache::factory('Core', 'File', $frontendOptions, $backendOptions); $cache->clean(Zend_Cache::CLEANING_MODE_OLD); +*/ // clean rss (non-key) cache $frontendOptions = array( diff --git a/config.php b/config.php index b85f127..bf8bd79 100644 --- a/config.php +++ b/config.php @@ -1,10 +1,15 @@ max_entries = 10; // ---------------------- // With this enabled relative URLs found in the extracted content // block are automatically rewritten as absolute URLs. -// Set to false if you want to preserve relative URLs appearing in -// the extracted content block. 
$options->rewrite_relative_urls = true; // Exclude items if extraction fails @@ -128,8 +131,8 @@ $options->cache_cleanup = 100; ///////////////////////////////////////////////// /// DEPRECATED OPTIONS -/// THESE OPTIONS WILL CHANGE IN THE NEXT -/// VERSION, WE RECOMMEND YOU DO NOT USE THEM +/// THESE OPTIONS WILL CHANGE IN VERSION 3.0 +/// WE RECOMMEND YOU DO NOT USE THEM ///////////////////////////////////////////////// // Restrict service (deprecated) @@ -182,6 +185,8 @@ $options->error_message_with_key = '[unable to retrieve full-text content]'; /// DO NOT CHANGE ANYTHING BELOW THIS /////////// ///////////////////////////////////////////////// -if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '2.5'); +if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '2.6'); -?> \ No newline at end of file +if ((basename(__FILE__) == 'config.php') && (file_exists(dirname(__FILE__).'/custom_config.php'))) { + require_once(dirname(__FILE__).'/custom_config.php'); +} \ No newline at end of file diff --git a/ftr_compatibility_test.php b/ftr_compatibility_test.php index cc34ba5..6973723 100644 --- a/ftr_compatibility_test.php +++ b/ftr_compatibility_test.php @@ -13,20 +13,7 @@ SimplePie.org. 
We have kept most of their checks intact as we use SimplePie in o http://github.com/simplepie/simplepie/tree/master/compatibility_test/ */ -$app_name = 'Full-Text RSS v2'; - -// test redirect -$url = parse_url('http://'.$_SERVER['HTTP_HOST'].$_SERVER['REQUEST_URI']); -$redirect_url = 'http://'.$url['host'].$url['path'].'?redirect=true'; -if (isset($_GET['redirect'])) { - $url = parse_url('http://'.$_SERVER['HTTP_HOST'].$_SERVER['REQUEST_URI']); - $url = 'http://'.$url['host'].$url['path'].'?redirected=true'; - header('Location: '.$url); - exit; -} -if (isset($_GET['redirected'])) { - die('Redirect works'); -} +$app_name = 'Full-Text RSS 2.6'; $php_ok = (function_exists('version_compare') && version_compare(phpversion(), '5.2.0', '>=')); $pcre_ok = extension_loaded('pcre'); @@ -35,9 +22,8 @@ $mbstring_ok = extension_loaded('mbstring'); $iconv_ok = extension_loaded('iconv'); $tidy_ok = function_exists('tidy_parse_string'); $curl_ok = function_exists('curl_exec'); -$http_ok = (extension_loaded('http') && class_exists('HttpRequestPool')); +$parallel_ok = ((extension_loaded('http') && class_exists('HttpRequestPool')) || ($curl_ok && function_exists('curl_multi_init'))); $allow_url_fopen_ok = (bool)ini_get('allow_url_fopen'); -$redirect_ok = ($allow_url_fopen_ok && file_get_contents($redirect_url) == 'Redirect works'); if (extension_loaded('xmlreader')) { $xml_ok = true; @@ -241,21 +227,16 @@ div.chunk { Enabled - - HttpRequestPool + + Parallel URL fetching Enabled - + allow_url_fopen Enabled - - HTTP Redirects - Enabled - - @@ -263,7 +244,7 @@ div.chunk {

What does this mean?

    - +
  1. You have everything you need to run Full-Text RSS properly! Congratulations!
  2. @@ -304,18 +285,12 @@ div.chunk {
  3. cURL: The cURL extension is not available. SimplePie will use fsockopen() instead.
  4. - -
  5. HttpRequestPool: You have HttpRequestPool support installed. No problems here.
  6. + +
  7. Parallel URL fetching: You have HttpRequestPool or curl_multi support installed. No problems here.
  8. -
  9. HttpRequestPool: The HttpRequestPool class is not available. Full-Text RSS will use file_get_contents() instead to fetch URLs sequentially rather than in parallel.
  10. +
  11. Parallel URL fetching: HttpRequestPool or curl_multi support is not available. Full-Text RSS will use file_get_contents() instead to fetch URLs sequentially rather than in parallel.
  12. - -
  13. HTTP Redirects: Your server appears to handle redirects ok. No problems here.
  14. - -
  15. HTTP Redirects: Your server appears not to be able to handle HTTP redirects. Full-Text RSS should still work with most feeds, but you may experience problems with some.
  16. - -
  17. allow_url_fopen: Your PHP configuration has allow_url_fopen disabled. Full-Text RSS will not work here.
  18. diff --git a/index.php b/index.php index a6787e5..cf3977b 100644 --- a/index.php +++ b/index.php @@ -135,7 +135,7 @@ if (!defined('_FF_FTR_INDEX')) { Then whenever you'd like a full-text feed, click the bookmarklet.

    Drag this:

    API

    @@ -143,11 +143,6 @@ if (!defined('_FF_FTR_INDEX')) {
    • /makefulltextfeed.php?url=[url]
    -

    If you have an API key, add that to the querystring:

    -
      -
    • /makefulltextfeed.php?key=[key]&url=[url]
    • -
    • /makefulltextfeed.php?key=[key]&max=[number of feed items]&url=[url]
    • -

    All the parameters in the form above can be passed in this way. Examine the URL in the address bar after you click 'Create Feed' to see the values.

    @@ -155,26 +150,27 @@ if (!defined('_FF_FTR_INDEX')) { project licensed under the AGPL. You're free to download your own copy.

    Source Code and Technologies

    -

    The application uses PHP, PHP Readability, SimplePie, FeedWriter, Humble HTTP Agent. Depending on configuration, these optional components may also be used: Zend Cache, Zend DOM Query and IRI. Readability is the magic piece of code that tries to identify and extract the content block from any given web page.

    +

    The application uses PHP, PHP Readability, SimplePie, FeedWriter, Humble HTTP Agent. Depending on your configuration, these optional components may also be used: Zend Cache, Zend DOM Query, Rolling Curl and IRI. Readability is the magic piece of code that tries to identify and extract the content block from any given web page.

    System Requirements

    PHP 5.2 or above is required. A simple shared web hosting account will work fine. - The code has been tested on Windows and Linux using the Apache web server. If you're a Windows user, you can try it on your own machine using WampServer.

    + The code has been tested on Windows and Linux using the Apache web server. If you're a Windows user, you can try it on your own machine using WampServer. It has also been reported as working under IIS, but we have not tested this ourselves.

    Download

    Download from fivefilters.org - old versions are available in the code repository.

    License

    -

    AGPL logo
    This web application is licensed under the AGPL version 3 — which basically means if you use the code to offer the same or similar service for your users, you are also required to share the code with your users so they can run it for themselves. (More on why this is important.)

    +

    AGPL logo
    This web application is licensed under the AGPL version 3 — which basically means if you use the code to offer the same or similar service for your users, you are also required to share the code with your users so they can examine the code and run it for themselves. (More on why this is important.)

    The libraries used by the application are licensed as follows...

diff --git a/libraries/humble-http-agent/HumbleHttpAgent.php b/libraries/humble-http-agent/HumbleHttpAgent.php index 50a1d32..2472767 100644 --- a/libraries/humble-http-agent/HumbleHttpAgent.php +++ b/libraries/humble-http-agent/HumbleHttpAgent.php @@ -3,37 +3,59 @@ * Humble HTTP Agent * * This class is designed to take advantage of parallel HTTP requests - * offered by PHP's PECL HTTP extension. For environments which - * do not have this extension, it reverts to standard sequential + * offered by PHP's PECL HTTP extension or the curl_multi_* functions. + * For environments which do not have these options, it reverts to standard sequential * requests (using file_get_contents()) * - * @version 2010-10-19 + * @version 0.8 + * @date 2011-02-28 * @see http://php.net/HttpRequestPool * @author Keyvan Minoukadeh - * @copyright 2010 Keyvan Minoukadeh + * @copyright 2011 Keyvan Minoukadeh * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3 */ class HumbleHttpAgent { + const METHOD_REQUEST_POOL = 1; + const METHOD_CURL_MULTI = 2; + const METHOD_FILE_GET_CONTENTS = 4; + protected $requests = array(); + protected $redirectQueue = array(); protected $requestOptions; - protected $parallelSupport; protected $maxParallelRequests = 5; - protected $cache = null; + protected $cache = null; //TODO protected $httpContext; - protected $minimiseMemoryUse = false; + protected $minimiseMemoryUse = false; //TODO protected $debug = false; + protected $method; + public $rewriteHashbangFragment = true; // see http://code.google.com/web/ajaxcrawling/docs/specification.html + public $maxRedirects = 5; //TODO: prevent certain file/mime types //TODO: set max file size //TODO: normalise headers - function __construct($requestOptions=null) { - $this->parallelSupport = class_exists('HttpRequestPool'); + function __construct($requestOptions=null, $method=null) { + // set the request method + if (in_array($method, array(1,2,4))) { + $this->method = $method; + } else { + if 
(class_exists('HttpRequestPool')) { + $this->method = self::METHOD_REQUEST_POOL; + } elseif (function_exists('curl_multi_init')) { + $this->method = self::METHOD_CURL_MULTI; + } else { + $this->method = self::METHOD_FILE_GET_CONTENTS; + } + } + if ($this->method == self::METHOD_CURL_MULTI) { + require_once(dirname(__FILE__).'/RollingCurl.php'); + } $this->requestOptions = array( 'timeout' => 10, - 'redirect' => 5 + 'redirect' => 0 // we handle redirects manually so we can rewrite the new hashbang URLs that are creeping up over the web // TODO: test onprogress? ); if (is_array($requestOptions)) { @@ -41,13 +63,14 @@ class HumbleHttpAgent } $this->httpContext = stream_context_create(array( 'http' => array( + 'ignore_errors' => true, 'timeout' => $this->requestOptions['timeout'], 'max_redirects' => $this->requestOptions['redirect'], - 'header' => "User-Agent: PHP/5.2\r\n". - "Accept: */*\r\n" + 'header' => "User-Agent: PHP/".phpversion()."\r\n". + "Accept: */*\r\n" ) ) - ); + ); } protected function debug($msg) { @@ -61,6 +84,23 @@ class HumbleHttpAgent } } + public function rewriteHashbangFragment($url) { + // return $url if there's no '#!' + if (strpos($url, '#!') === false) return $url; + // split $url and rewrite + $iri = new IRI($url); + $fragment = substr($iri->ifragment, 1); // strip '!' + $iri->fragment = null; + if (isset($iri->iquery)) { + parse_str($iri->iquery, $query); + } else { + $query = array(); + } + $query['_escaped_fragment_'] = (string)$fragment; + $iri->query = str_replace('%2F', '/', http_build_query($query)); // needed for some sites + return $iri->uri; + } + public function enableDebug($bool=true) { $this->debug = (bool)$bool; } @@ -73,29 +113,30 @@ class HumbleHttpAgent $this->maxParallelRequests = $max; } - /** - * Set cache object. 
- * The cache object passed should implement Zend_Cache_Backend_Interface - * @param Zend_Cache_Backend_Interface - */ - public function useCache($cache) { - $this->cache = $cache; - } - public function validateUrl($url) { - //TODO: run sanitize filter first! + $url = filter_var($url, FILTER_SANITIZE_URL); $test = filter_var($url, FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED); // deal with bug http://bugs.php.net/51192 (present in PHP 5.2.13 and PHP 5.3.2) if ($test === false) { $test = filter_var(strtr($url, '-', '_'), FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED); } if ($test !== false && $test !== null && preg_match('!^https?://!', $url)) { - return filter_var($url, FILTER_SANITIZE_URL); + return $url; } else { return false; } } + /** + * Set cache object. + * The cache object passed should implement Zend_Cache_Backend_Interface + * @param Zend_Cache_Backend_Interface + */ + /* all disk caching temporily disabled - needs work + public function useCache($cache) { + $this->cache = $cache; + } + public function isCached($url) { if (!isset($this->cache)) return false; return ($this->cache->test(md5($url)) !== false); @@ -115,7 +156,7 @@ class HumbleHttpAgent return ($res !== false); } return false; - } + } public function cacheAll() { if (isset($this->cache)) { @@ -126,30 +167,50 @@ class HumbleHttpAgent } return false; } + */ public function fetchAll(array $urls) { - $urls = array_unique($urls); - // parallel - if (count($urls) > 1 && $this->parallelSupport() && $this->maxParallelRequests > 1) { - $this->debug('Starting parallel fetch'); + $this->fetchAllOnce($urls, $isRedirect=false); + $redirects = 0; + while (!empty($this->redirectQueue) && ++$redirects <= $this->maxRedirects) { + $this->debug("Following redirects #$redirects..."); + $this->fetchAllOnce($this->redirectQueue, $isRedirect=true); + } + } + + // fetch all URLs without following redirects + public function fetchAllOnce(array $urls, $isRedirect=false) { + if (!$isRedirect) $urls = 
array_unique($urls); + if (empty($urls)) return; + + ////////////////////////////////////////////////////// + // parallel (HttpRequestPool) + if ($this->method == self::METHOD_REQUEST_POOL) { + $this->debug('Starting parallel fetch (HttpRequestPool)'); try { while (count($urls) > 0) { - $this->debug('Processing set of '.$this->maxParallelRequests); + $this->debug('Processing set of '.min($this->maxParallelRequests, count($urls))); $subset = array_splice($urls, 0, $this->maxParallelRequests); $pool = new HttpRequestPool(); - foreach ($subset as $url) { + foreach ($subset as $orig => $url) { + if (!$isRedirect) $orig = $url; + unset($this->redirectQueue[$orig]); $this->debug("...$url"); - if (isset($this->requests[$url])) { + if (!$isRedirect && isset($this->requests[$url])) { $this->debug("......in memory"); + /* } elseif ($this->isCached($url)) { $this->debug("......is cached"); if (!$this->minimiseMemoryUse) { $this->requests[$url] = $this->getCached($url); } + */ } else { $this->debug("......adding to pool"); - $httpRequest = new HttpRequest($url, HttpRequest::METH_GET, $this->requestOptions); - $this->requests[$url] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest); + $req_url = ($this->rewriteHashbangFragment) ? 
$this->rewriteHashbangFragment($url) : $url; + $httpRequest = new HttpRequest($req_url, HttpRequest::METH_GET, $this->requestOptions); + $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest); + $this->requests[$orig]['original_url'] = $orig; $pool->attach($httpRequest); } } @@ -158,21 +219,37 @@ class HumbleHttpAgent $this->debug('Sending request...'); $pool->send(); $this->debug('Received responses'); - foreach($subset as $url) { - if (!isset($this->requests[$url]['fromCache'])) { - $request = $this->requests[$url]['httpRequest']; - $this->requests[$url]['headers'] = $this->headersToString($request->getResponseHeader()); - $this->requests[$url]['body'] = $request->getResponseBody(); - $this->requests[$url]['effective_url'] = $request->getResponseInfo('effective_url'); + foreach($subset as $orig => $url) { + if (!$isRedirect) $orig = $url; + //if (!isset($this->requests[$url]['fromCache'])) { + $request = $this->requests[$orig]['httpRequest']; + //$this->requests[$orig]['headers'] = $this->headersToString($request->getResponseHeader()); + // getResponseHeader() doesn't return status line, so, for consistency... + $this->requests[$orig]['headers'] = substr($request->getRawResponseMessage(), 0, $request->getResponseInfo('header_size')); + $this->requests[$orig]['body'] = $request->getResponseBody(); + $this->requests[$orig]['effective_url'] = $request->getResponseInfo('effective_url'); + $this->requests[$orig]['status_code'] = $status_code = $request->getResponseCode(); + if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && $request->getResponseHeader('location')) { + $redirectURL = $request->getResponseHeader('location'); + $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url); + if ($this->validateURL($redirectURL)) { + $this->debug('Redirect detected. 
Valid URL: '.$redirectURL); + $this->redirectQueue[$orig] = $redirectURL; + } else { + $this->debug('Redirect detected. Invalid URL: '.$redirectURL); + } + } //die($url.' -multi- '.$request->getResponseInfo('effective_url')); $pool->detach($request); - unset($this->requests[$url]['httpRequest'], $request); + unset($this->requests[$orig]['httpRequest'], $request); + /* if ($this->minimiseMemoryUse) { if ($this->cache($url)) { unset($this->requests[$url]); } } - } + */ + //} } } } @@ -180,13 +257,144 @@ class HumbleHttpAgent $this->debug($e); return false; } - // sequential - } else { - $this->debug('Starting sequential fetch...'); - foreach($urls as $url) { - $this->get($url); + } + + ////////////////////////////////////////////////////////// + // parallel (curl_multi_*) + elseif ($this->method == self::METHOD_CURL_MULTI) { + $this->debug('Starting parallel fetch (curl_multi_*)'); + while (count($urls) > 0) { + $this->debug('Processing set of '.min($this->maxParallelRequests, count($urls))); + $subset = array_splice($urls, 0, $this->maxParallelRequests); + $pool = new RollingCurl(array($this, 'handleCurlResponse')); + $pool->window_size = count($subset); + + foreach ($subset as $orig => $url) { + if (!$isRedirect) $orig = $url; + unset($this->redirectQueue[$orig]); + $this->debug("...$url"); + if (!$isRedirect && isset($this->requests[$url])) { + $this->debug("......in memory"); + /* + } elseif ($this->isCached($url)) { + $this->debug("......is cached"); + if (!$this->minimiseMemoryUse) { + $this->requests[$url] = $this->getCached($url); + } + */ + } else { + $this->debug("......adding to pool"); + $req_url = ($this->rewriteHashbangFragment) ? 
$this->rewriteHashbangFragment($url) : $url; + + $httpRequest = new RollingCurlRequest($req_url, 'GET', null, null, array( + CURLOPT_CONNECTTIMEOUT => $this->requestOptions['timeout'], + CURLOPT_TIMEOUT => $this->requestOptions['timeout'] + )); + $httpRequest->set_original_url($orig); + $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest); + $this->requests[$orig]['original_url'] = $orig; // TODO: is this needed anymore? + $pool->add($httpRequest); + } + } + // did we get anything into the pool? + if (count($pool) > 0) { + $this->debug('Sending request...'); + $pool->execute(); // this will call handleCurlResponse() and populate $this->requests[$orig] + $this->debug('Received responses'); + foreach($subset as $orig => $url) { + if (!$isRedirect) $orig = $url; + // $this->requests[$orig]['headers'] + // $this->requests[$orig]['body'] + // $this->requests[$orig]['effective_url'] + $status_code = $this->requests[$orig]['status_code']; + if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) { + $redirectURL = $this->requests[$orig]['location']; + $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url); + if ($this->validateURL($redirectURL)) { + $this->debug('Redirect detected. Valid URL: '.$redirectURL); + $this->redirectQueue[$orig] = $redirectURL; + } else { + $this->debug('Redirect detected. Invalid URL: '.$redirectURL); + } + } + // die($url.' 
-multi- '.$request->getResponseInfo('effective_url')); + unset($this->requests[$orig]['httpRequest']); + } + } } } + + ////////////////////////////////////////////////////// + // sequential (file_get_contents) + else { + $this->debug('Starting sequential fetch (file_get_contents)'); + $this->debug('Processing set of '.count($urls)); + foreach ($urls as $orig => $url) { + if (!$isRedirect) $orig = $url; + unset($this->redirectQueue[$orig]); + $this->debug("...$url"); + if (!$isRedirect && isset($this->requests[$url])) { + $this->debug("......in memory"); + /* + } elseif ($this->isCached($url)) { + $this->debug("......is cached"); + if (!$this->minimiseMemoryUse) { + $this->requests[$url] = $this->getCached($url); + } + */ + } else { + $this->debug("Sending request for $url"); + $this->requests[$orig]['original_url'] = $orig; + $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($url) : $url; + if (false !== ($html = @file_get_contents($req_url, false, $this->httpContext))) { + $this->debug('Received response'); + // get status code + if (!isset($http_response_header[0]) || !preg_match('!^HTTP/\d+\.\d+\s+(\d+)!', trim($http_response_header[0]), $match)) { + $this->debug('Error: no status code found'); + // TODO: handle error - no status code + } else { + $this->requests[$orig]['headers'] = $this->headersToString($http_response_header, false); + $this->requests[$orig]['body'] = $html; + $this->requests[$orig]['effective_url'] = $req_url; + $this->requests[$orig]['status_code'] = $status_code = (int)$match[1]; + unset($match); + // handle redirect + if (preg_match('/^Location:(.*?)$/m', $this->requests[$orig]['headers'], $match)) { + $this->requests[$orig]['location'] = trim($match[1]); + } + if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) { + $redirectURL = $this->requests[$orig]['location']; + $redirectURL = 
SimplePie_Misc::absolutize_url($redirectURL, $url); + if ($this->validateURL($redirectURL)) { + $this->debug('Redirect detected. Valid URL: '.$redirectURL); + $this->redirectQueue[$orig] = $redirectURL; + } else { + $this->debug('Redirect detected. Invalid URL: '.$redirectURL); + } + } + } + } else { + $this->debug('Error retrieving URL'); + //print_r($req_url); + //print_r($http_response_header); + //print_r($html); + + // TODO: handle error - failed to retrieve URL + } + } + } + } + } + + public function handleCurlResponse($response, $info, $request) { + $orig = $request->url_original; + $this->requests[$orig]['headers'] = substr($response, 0, $info['header_size']); + $this->requests[$orig]['body'] = substr($response, $info['header_size']); + $this->requests[$orig]['effective_url'] = $info['url']; + $this->requests[$orig]['status_code'] = (int)$info['http_code']; + if (preg_match('/^Location:(.*?)$/m', $this->requests[$orig]['headers'], $match)) { + $this->requests[$orig]['location'] = trim($match[1]); + } } protected function headersToString(array $headers, $associative=true) { @@ -205,50 +413,38 @@ class HumbleHttpAgent } } - protected function getRedirectUrl($header) { - if (is_array($header)) $header = implode("\n", $header); - if (!$header || !preg_match_all('!^Location:\s*(https?://.+)!im', $header, $match, PREG_SET_ORDER)) { - // error parsing the response - return false; - } else { - $match = end($match); // get last matched element (in case of redirects) - return $match[1]; - } - } - - public function get($url) { + public function get($url, $remove=false) { if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) { - $this->debug("URL already fetched - in memory ($url)"); + $this->debug("URL already fetched - in memory ($url, effective: {$this->requests[$url]['effective_url']})"); $response = $this->requests[$url]; + /* } elseif ($this->isCached($url)) { $this->debug("URL already fetched - in disk cache ($url)"); $response = 
$this->getCached($url); $this->requests[$url] = $response; + */ } else { $this->debug("Fetching URL ($url)"); - if ($html = @file_get_contents($url, false, $this->httpContext)) { - $header = $this->headersToString($http_response_header, false); - $response = array('headers'=>$header, 'body'=>$html); - if ($last_url = $this->getRedirectUrl($header)) { - $response['effective_url'] = $last_url; - //die($url .' -single- '. $response['effective_url']); - } else { - $response['effective_url'] = $url; - } - $this->requests[$url] = $response; + $this->fetchAll(array($url)); + if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) { + $response = $this->requests[$url]; } else { + $this->debug("Request failed"); $response = false; } } + /* if ($this->minimiseMemoryUse && $response) { $this->cache($url); unset($this->requests[$url]); } + */ + if ($remove && $response) unset($this->requests[$url]); return $response; } public function parallelSupport() { - return $this->parallelSupport; + return class_exists('HttpRequestPool') || function_exists('curl_multi_init'); } } ?> \ No newline at end of file diff --git a/libraries/humble-http-agent/RollingCurl.php b/libraries/humble-http-agent/RollingCurl.php new file mode 100644 index 0000000..e33bca2 --- /dev/null +++ b/libraries/humble-http-agent/RollingCurl.php @@ -0,0 +1,392 @@ +url = $url; + $this->url_original = $url; + $this->method = $method; + $this->post_data = $post_data; + $this->headers = $headers; + $this->options = $options; + } + + /** + * @param string $url + * @return void + */ + public function set_original_url($url) { + $this->url_original = $url; + } + /** + * @return void + */ + public function __destruct() { + unset($this->url, $this->url_original, $this->method, $this->post_data, $this->headers, $this->options); + } +} + +/** + * RollingCurl custom exception + */ +class RollingCurlException extends Exception { +} + +/** + * Class that holds a rolling queue of curl requests. 
+ * + * @throws RollingCurlException + */ +class RollingCurl implements Countable { + /** + * @var int + * + * Window size is the max number of simultaneous connections allowed. + * + * REMEMBER TO RESPECT THE SERVERS: + * Sending too many requests at one time can easily be perceived + * as a DOS attack. Increase this window_size if you are making requests + * to multiple servers or have permission from the receving server admins. + */ + private $window_size = 5; + + /** + * @var float + * + * Timeout is the timeout used for curl_multi_select. + */ + private $timeout = 10; + + /** + * @var string|array + * + * Callback function to be applied to each result. + */ + private $callback; + + /** + * @var array + * + * Set your base options that you want to be used with EVERY request. + */ + protected $options = array( + CURLOPT_SSL_VERIFYPEER => 0, + CURLOPT_RETURNTRANSFER => 1, + CURLOPT_CONNECTTIMEOUT => 30, + CURLOPT_TIMEOUT => 30 + ); + + /** + * @var array + */ + private $headers = array(); + + /** + * @var Request[] + * + * The request queue + */ + private $requests = array(); + + /** + * @var RequestMap[] + * + * Maps handles to request indexes + */ + private $requestMap = array(); + + /** + * @param $callback + * Callback function to be applied to each result. + * + * Can be specified as 'my_callback_function' + * or array($object, 'my_callback_method'). + * + * Function should take three parameters: $response, $info, $request. + * $response is response body, $info is additional curl info. + * $request is the original request + * + * @return void + */ + function __construct($callback = null) { + $this->callback = $callback; + } + + /** + * @param string $name + * @return mixed + */ + public function __get($name) { + return (isset($this->{$name})) ? 
$this->{$name} : null; + } + + /** + * @param string $name + * @param mixed $value + * @return bool + */ + public function __set($name, $value) { + // append the base options & headers + if ($name == "options" || $name == "headers") { + $this->{$name} = $value + $this->{$name}; + } else { + $this->{$name} = $value; + } + return true; + } + + /** + * Count number of requests added (Countable interface) + * + * @return int + */ + public function count() { + return count($this->requests); + } + + /** + * Add a request to the request queue + * + * @param Request $request + * @return bool + */ + public function add($request) { + $this->requests[] = $request; + return true; + } + + /** + * Create new Request and add it to the request queue + * + * @param string $url + * @param string $method + * @param $post_data + * @param $headers + * @param $options + * @return bool + */ + public function request($url, $method = "GET", $post_data = null, $headers = null, $options = null) { + $this->requests[] = new RollingCurlRequest($url, $method, $post_data, $headers, $options); + return true; + } + + /** + * Perform GET request + * + * @param string $url + * @param $headers + * @param $options + * @return bool + */ + public function get($url, $headers = null, $options = null) { + return $this->request($url, "GET", null, $headers, $options); + } + + /** + * Perform POST request + * + * @param string $url + * @param $post_data + * @param $headers + * @param $options + * @return bool + */ + public function post($url, $post_data = null, $headers = null, $options = null) { + return $this->request($url, "POST", $post_data, $headers, $options); + } + + /** + * Execute processing + * + * @param int $window_size Max number of simultaneous connections + * @return string|bool + */ + public function execute($window_size = null) { + // rolling curl window must always be greater than 1 + if (sizeof($this->requests) == 1) { + return $this->single_curl(); + } else { + // start the rolling curl. 
window_size is the max number of simultaneous connections + return $this->rolling_curl($window_size); + } + } + + /** + * Performs a single curl request + * + * @access private + * @return string + */ + private function single_curl() { + $ch = curl_init(); + $request = array_shift($this->requests); + $options = $this->get_options($request); + curl_setopt_array($ch, $options); + $output = curl_exec($ch); + $info = curl_getinfo($ch); + + // it's not neccesary to set a callback for one-off requests + if ($this->callback) { + $callback = $this->callback; + if (is_callable($this->callback)) { + call_user_func($callback, $output, $info, $request); + } + } + else + return $output; + return true; + } + + /** + * Performs multiple curl requests + * + * @access private + * @throws RollingCurlException + * @param int $window_size Max number of simultaneous connections + * @return bool + */ + private function rolling_curl($window_size = null) { + if ($window_size) + $this->window_size = $window_size; + + // make sure the rolling window isn't greater than the # of urls + if (sizeof($this->requests) < $this->window_size) + $this->window_size = sizeof($this->requests); + + if ($this->window_size < 2) { + throw new RollingCurlException("Window size must be greater than 1"); + } + + $master = curl_multi_init(); + + // start the first batch of requests + for ($i = 0; $i < $this->window_size; $i++) { + $ch = curl_init(); + + $options = $this->get_options($this->requests[$i]); + + curl_setopt_array($ch, $options); + curl_multi_add_handle($master, $ch); + + // Add to our request Maps + $key = (string) $ch; + $this->requestMap[$key] = $i; + } + + do { + while (($execrun = curl_multi_exec($master, $running)) == CURLM_CALL_MULTI_PERFORM) ; + if ($execrun != CURLM_OK) + break; + // a request was just completed -- find out which one + while ($done = curl_multi_info_read($master)) { + + // get the info and content returned on the request + $info = curl_getinfo($done['handle']); + $output = 
curl_multi_getcontent($done['handle']); + + // send the return values to the callback function. + $callback = $this->callback; + if (is_callable($callback)) { + $key = (string) $done['handle']; + $request = $this->requests[$this->requestMap[$key]]; + unset($this->requestMap[$key]); + call_user_func($callback, $output, $info, $request); + } + + // start a new request (it's important to do this before removing the old one) + if ($i < sizeof($this->requests) && isset($this->requests[$i]) && $i < count($this->requests)) { + $ch = curl_init(); + $options = $this->get_options($this->requests[$i]); + curl_setopt_array($ch, $options); + curl_multi_add_handle($master, $ch); + + // Add to our request Maps + $key = (string) $ch; + $this->requestMap[$key] = $i; + $i++; + } + + // remove the curl handle that just completed + curl_multi_remove_handle($master, $done['handle']); + + } + + // Block for data in / output; error handling is done by curl_multi_exec + if ($running) + curl_multi_select($master, $this->timeout); + + } while ($running); + curl_multi_close($master); + return true; + } + + + /** + * Helper function to set up a new request by setting the appropriate options + * + * @access private + * @param Request $request + * @return array + */ + private function get_options($request) { + // options for this entire curl object + $options = $this->__get('options'); + // We're managing reirects in PHP - allows us to intervene and rewrite/block URLs + // before the next request goes out. 
+ $options[CURLOPT_FOLLOWLOCATION] = 0; + $options[CURLOPT_MAXREDIRS] = 0; + //if (ini_get('safe_mode') == 'Off' || !ini_get('safe_mode')) { + // $options[CURLOPT_FOLLOWLOCATION] = 1; + // $options[CURLOPT_MAXREDIRS] = 5; + //} + $headers = $this->__get('headers'); + + // append custom options for this specific request + if ($request->options) { + $options = $request->options + $options; + } + + // set the request URL + $options[CURLOPT_URL] = $request->url; + + if ($headers) { + $options[CURLOPT_HTTPHEADER] = $headers; + } + // return response headers + $options[CURLOPT_HEADER] = 1; + + return $options; + } + + /** + * @return void + */ + public function __destruct() { + unset($this->window_size, $this->callback, $this->options, $this->headers, $this->requests); + } +} \ No newline at end of file diff --git a/libraries/humble-http-agent/SimplePie_HumbleHttpAgent.php b/libraries/humble-http-agent/SimplePie_HumbleHttpAgent.php new file mode 100644 index 0000000..168bd32 --- /dev/null +++ b/libraries/humble-http-agent/SimplePie_HumbleHttpAgent.php @@ -0,0 +1,76 @@ +encode($parsed['authority']), $parsed['path'], $parsed['query'], $parsed['fragment']); + } + $this->url = $url; + $this->useragent = $useragent; + if (preg_match('/^http(s)?:\/\//i', $url)) + { + if (!is_array($headers)) + { + $headers = array(); + } + $this->method = SIMPLEPIE_FILE_SOURCE_REMOTE | SIMPLEPIE_FILE_SOURCE_CURL; + $headers2 = array(); + foreach ($headers as $key => $value) { + $headers2[] = "$key: $value"; + } + //TODO: allow for HTTP headers + // curl_setopt($fp, CURLOPT_HTTPHEADER, $headers2); + + $response = self::$agent->get($url); + + if ($response === false || !isset($response['status_code'])) { + $this->error = 'failed to fetch URL'; + $this->success = false; + } else { + $parser = new SimplePie_HTTP_Parser($response['headers']); + if ($parser->parse()) { + $this->headers = $parser->headers; + //$this->body = $parser->body; + $this->body = $response['body']; + $this->status_code = 
$parser->status_code; + } + } + } + else + { + $this->error = 'invalid URL'; + $this->success = false; + } + } +} +?> \ No newline at end of file diff --git a/libraries/iri/iri.php b/libraries/iri/iri.php index ed97a22..f1fc7cf 100644 --- a/libraries/iri/iri.php +++ b/libraries/iri/iri.php @@ -1086,7 +1086,7 @@ class IRI { $iri .= '//' . $iauthority; } - $iri .= $this->ipath; + $iri .= ($this->ipath) ? $this->ipath : '/'; if ($this->iquery !== null) { $iri .= '?' . $this->iquery; diff --git a/libraries/readability/Readability.php b/libraries/readability/Readability.php index b77a733..662e93b 100644 --- a/libraries/readability/Readability.php +++ b/libraries/readability/Readability.php @@ -114,6 +114,7 @@ class Readability $html = preg_replace($this->regexps['replaceFonts'], '<$1span>', $html); $html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8"); $this->dom = new DOMDocument(); + $this->dom->preserveWhiteSpace = false; $this->dom->registerNodeClass('DOMElement', 'JSLikeHTMLElement'); @$this->dom->loadHTML($html); $this->url = $url; diff --git a/libraries/readability/examples/Readability.php b/libraries/readability/examples/Readability.php index 8ac49a6..22f42c0 100644 --- a/libraries/readability/examples/Readability.php +++ b/libraries/readability/examples/Readability.php @@ -12,6 +12,15 @@ $html = file_get_contents($url); // first before passing it to PHP Readability. // Both iconv() and mb_convert_encoding() can do this. +// If we've got Tidy, let's clean up input. +// This step is highly recommended - PHP's default HTML parser +// often does a terrible job and results in strange output. +if (function_exists('tidy_parse_string')) { + $tidy = tidy_parse_string($html, array(), 'UTF8'); + $tidy->cleanRepair(); + $html = $tidy->value; +} + // give it to Readability $readability = new Readability($html, $url); // print debug output? 
diff --git a/makefulltextfeed.php b/makefulltextfeed.php index de73862..ba5a70e 100644 --- a/makefulltextfeed.php +++ b/makefulltextfeed.php @@ -3,8 +3,8 @@ // Author: Keyvan Minoukadeh // Copyright (c) 2011 Keyvan Minoukadeh // License: AGPLv3 -// Version: 2.5 -// Date: 2011-01-08 +// Version: 2.6 +// Date: 2011-03-02 /* This program is free software: you can redistribute it and/or modify @@ -43,7 +43,9 @@ function __autoload($class_name) { static $mapping = array( // Include SimplePie for RSS/Atom parsing 'SimplePie' => 'simplepie/simplepie.class.php', - 'SimplePie_Misc' => 'simplepie/simplepie.class.php', + 'SimplePie_Misc' => 'simplepie/simplepie.class.php', + 'SimplePie_HTTP_Parser' => 'simplepie/simplepie.class.php', + 'SimplePie_File' => 'simplepie/simplepie.class.php', // Include FeedCreator for RSS/Atom creation 'FeedWriter' => 'feedwriter/FeedWriter.php', 'FeedItem' => 'feedwriter/FeedItem.php', @@ -51,6 +53,7 @@ function __autoload($class_name) { 'Readability' => 'readability/Readability.php', // Include Humble HTTP Agent to allow parallel requests and response caching 'HumbleHttpAgent' => 'humble-http-agent/HumbleHttpAgent.php', + 'SimplePie_HumbleHttpAgent' => 'humble-http-agent/SimplePie_HumbleHttpAgent.php', // Include IRI class for resolving relative URLs 'IRI' => 'iri/iri.php', // Include Zend Cache to improve performance (cache results) @@ -67,13 +70,31 @@ function __autoload($class_name) { } } +function url_allowed($url) { + global $options; + if (!empty($options->allowed_urls)) { + $allowed = false; + foreach ($options->allowed_urls as $allowurl) { + if (stristr($url, $allowurl) !== false) { + $allowed = true; + break; + } + } + if (!$allowed) return false; + } else { + foreach ($options->blocked_urls as $blockurl) { + if (stristr($url, $blockurl) !== false) { + return false; + } + } + } + return true; +} + //////////////////////////////// // Load config file if it exists //////////////////////////////// 
require_once(dirname(__FILE__).'/config.php'); -if (file_exists(dirname(__FILE__).'/custom_config.php')) { - require_once(dirname(__FILE__).'/custom_config.php'); -} ////////////////////////////////////////////// // Convert $html to UTF8 @@ -191,9 +212,16 @@ $url = $_GET['url']; if (!preg_match('!^https?://.+!i', $url)) { $url = 'http://'.$url; } -$valid_url = filter_var($url, FILTER_VALIDATE_URL); -if ($valid_url !== false && $valid_url !== null && preg_match('!^https?://!', $valid_url)) { - $url = filter_var($url, FILTER_SANITIZE_URL); + +$url = filter_var($url, FILTER_SANITIZE_URL); +$test = filter_var($url, FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED); +// deal with bug http://bugs.php.net/51192 (present in PHP 5.2.13 and PHP 5.3.2) +if ($test === false) { + $test = filter_var(strtr($url, '-', '_'), FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED); +} +if ($test !== false && $test !== null && preg_match('!^https?://!', $url)) { + // all okay + unset($test); } else { die('Invalid URL supplied'); } @@ -231,6 +259,16 @@ if (isset($_GET['key']) && ($key_index = array_search($_GET['key'], $options->ap exit; } +/////////////////////////////////////////////// +// Set timezone. +// Prevents warnings, but needs more testing - +// perhaps if timezone is set in php.ini we +// don't need to set it at all... 
+/////////////////////////////////////////////// +if (!ini_get('date.timezone') || !@date_default_timezone_set(ini_get('date.timezone'))) { + date_default_timezone_set('UTC'); +} + /////////////////////////////////////////////// // Check if the request is explicitly for an HTML page /////////////////////////////////////////////// @@ -246,25 +284,8 @@ if (isset($_GET['key']) && isset($_GET['hash']) && isset($options->api_keys[(int /////////////////////////////////////////////// // Check URL against list of blacklisted URLs -// TODO: set up better system for this /////////////////////////////////////////////// - -if (!empty($options->allowed_urls)) { - $allowed = false; - foreach ($options->allowed_urls as $allowurl) { - if (strstr($url, $allowurl) !== false) { - $allowed = true; - break; - } - } - if (!$allowed) die('URL not allowed'); -} else { - foreach ($options->blocked_urls as $blockurl) { - if (strstr($url, $blockurl) !== false) { - die('URL blocked'); - } - } -} +if (!url_allowed($url)) die('URL blocked'); /////////////////////////////////////////////// // Max entries @@ -441,7 +462,10 @@ if (function_exists('tidy_parse_string')) { // Get RSS/Atom feed //////////////////////////////// if (!$html_only) { + // configure SimplePie HTTP extension class to use our HumbleHttpAgent instance + SimplePie_HumbleHttpAgent::set_agent($http); $feed = new SimplePie(); + $feed->set_file_class('SimplePie_HumbleHttpAgent'); $feed->set_feed_url($url); $feed->set_autodiscovery_level(SIMPLEPIE_LOCATOR_NONE); $feed->set_timeout(20); @@ -466,7 +490,10 @@ if ($html_only || !$result) { unset($feed, $result); if ($response = $http->get($url)) { $effective_url = $response['effective_url']; + if (!url_allowed($effective_url)) die('URL blocked'); $html = $response['body']; + // remove strange things here + $html = str_replace('', '', $html); $html = convert_to_utf8($html, $response['headers']); } else { die('Error retrieving '.$url); @@ -513,7 +540,11 @@ if ($html_only || !$result) { // 
get outerHTML $content = $content_block->ownerDocument->saveXML($content_block); } else { - $content = $content_block->innerHTML; + if ($content_block->childNodes->length == 1 && $content_block->firstChild->nodeType === XML_ELEMENT_NODE) { + $content = $content_block->firstChild->innerHTML; + } else { + $content = $content_block->innerHTML; + } } if ($links == 'remove') { $content = preg_replace('!]*>!', '', $content); @@ -586,7 +617,7 @@ foreach ($items as $key => $item) { $urls[$key] = $permalink; } $http->fetchAll($urls_sanitized); -$http->cacheAll(); +//$http->cacheAll(); foreach ($items as $key => $item) { $extract_result = false; @@ -606,9 +637,12 @@ foreach ($items as $key => $item) { $newitem->setLink($item->get_permalink()); } } - if ($permalink && $response = $http->get($permalink)) { + if ($permalink && $response = $http->get($permalink, true)) { $effective_url = $response['effective_url']; + if (!url_allowed($effective_url)) continue; $html = $response['body']; + // remove strange things here + $html = str_replace('', '', $html); $html = convert_to_utf8($html, $response['headers']); if ($auto_extract) { // Run through Tidy (if it exists). @@ -661,7 +695,11 @@ foreach ($items as $key => $item) { // get outerHTML $html = $content_block->ownerDocument->saveXML($content_block); } else { - $html = $content_block->innerHTML; + if ($content_block->childNodes->length == 1 && $content_block->firstChild->nodeType === XML_ELEMENT_NODE) { + $html = $content_block->firstChild->innerHTML; + } else { + $html = $content_block->innerHTML; + } } // post-processing cleanup $html = preg_replace('!

[\s\h\v]*

!u', '', $html);