Full-Text RSS 2.6

This commit is contained in:
Keyvan 2011-11-04 18:10:31 +01:00
parent 8f5d813459
commit e2a9b81740
14 changed files with 906 additions and 155 deletions

29
README.txt Normal file
View File

@ -0,0 +1,29 @@
Full-Text RSS
=============
About
-----
See http://fivefilters.org/content-only/ for a description of the code.
Installation
------------
1. Extract the files in this ZIP archive to a folder on your computer.
2. FTP the files up to your server
3. Access index.php through your browser. E.g. http://my-host.com/full-text-rss/index.php
4. Enter a URL in the form field to test the code
5. If you get an RSS feed with full-text content, all is working well. :)
Configuration (optional)
------------------------
1. Save a copy of config.php as custom_config.php and edit custom_config.php
2. If you decide to enable caching, make sure the cache folder and its 2 subfolders are writable.
(You might need to change the permissions of these folders to 777 through your FTP client.)

22
UPDATING.txt Normal file
View File

@ -0,0 +1,22 @@
Updating Full-Text RSS
======================
To update your copy of Full-Text RSS while ensuring that your feeds continue to be processed as they were before, we suggest the following steps:
1. Keep your current installation in place for now (we'll deal with it later)
2. Extract this updated package to a new folder -- for example, if the last version is in a folder called 'full-text-rss', extract this version to a new folder called 'full-text-rss-updated'
3. FTP the new folder up to your server
4. Access index.php in the new folder through your browser -- for example http://my-host.com/full-text-rss-updated/index.php
5. Enter a URL in the form field to test the updated code
6. If you'd configured the last version, copy custom_config.php from your old version to the new folder.
7. Test the new copy again to make sure the config values are now applied to the new version.
8. Now simply rename the folder with your old copy to 'full-text-rss-old' and then rename the folder with the new copy to 'full-text-rss' (or whatever name you'd given the original folder).
That's all that's needed. Your feeds should continue to work as they did before. Let us know if you have any trouble: fivefilters@fivefilters.org.

View File

@ -2,6 +2,16 @@ FiveFilters.org: Full-Text RSS
http://fivefilters.org/content-only/
CHANGELOG
------------------------------------
2.6 (2011-03-02)
- Rewriting of hash-bang (#!) URLs (see http://www.tbray.org/ongoing/When/201x/2011/02/09/Hash-Blecch for an explanation)
- Improved parallel fetching support (HumbleHttpAgent uses curl_multi_* functions if PECL HTTP extension is not present)
- Improved HTTP redirect support (now handled in HumbleHttpAgent, no longer relies on PHP)
- Improved performance for single-page (non-feed) requests (SimplePie connected to HumbleHttpAgent)
- Improved memory use for processing large feeds (HumbleHttpAgent's stored responses cleared as they're retrieved)
- Bug fix: exclude on fail option no longer requires valid key
- Bug fix: workaround for PHP bug http://bugs.php.net/51192 (fixed in makefulltextfeed.php)
- Plus other minor changes...
2.5 (2011-01-08)
- New option: custom extraction pattern (CSS selectors)
- New option: allowed URLs (restrict service to pre-defined feeds/domains)

View File

@ -53,6 +53,7 @@ function __autoload($class_name) {
require_once(dirname(__FILE__).'/config.php');
if (!$options->caching) die('Caching is disabled');
/*
// clean http response cache
$frontendOptions = array(
'lifetime' => 30*60, // cache lifetime of 30 minutes
@ -73,6 +74,7 @@ $backendOptions = array(
);
$cache = Zend_Cache::factory('Core', 'File', $frontendOptions, $backendOptions);
$cache->clean(Zend_Cache::CLEANING_MODE_OLD);
*/
// clean rss (non-key) cache
$frontendOptions = array(

View File

@ -1,10 +1,15 @@
<?php
/* Full-Text RSS config */
// ......IMPORTANT......................................
// .....................................................
// Please do not change this file (config.php) directly.
// Save a copy as custom_config.php and make your
// changes to that instead. It will automatically
// override anything in config.php.
// override anything in config.php. Because config.php
// always gets loaded anyway, you can simply specify
// options you'd like to override in custom_config.php.
// .....................................................
// Enable service
// ----------------------
@ -27,8 +32,6 @@ $options->max_entries = 10;
// ----------------------
// With this enabled relative URLs found in the extracted content
// block are automatically rewritten as absolute URLs.
// Set to false if you want to preserve relative URLs appearing in
// the extracted content block.
$options->rewrite_relative_urls = true;
// Exclude items if extraction fails
@ -128,8 +131,8 @@ $options->cache_cleanup = 100;
/////////////////////////////////////////////////
/// DEPRECATED OPTIONS
/// THESE OPTIONS WILL CHANGE IN THE NEXT
/// VERSION, WE RECOMMEND YOU DO NOT USE THEM
/// THESE OPTIONS WILL CHANGE IN VERSION 3.0
/// WE RECOMMEND YOU DO NOT USE THEM
/////////////////////////////////////////////////
// Restrict service (deprecated)
@ -182,6 +185,8 @@ $options->error_message_with_key = '[unable to retrieve full-text content]';
/// DO NOT CHANGE ANYTHING BELOW THIS ///////////
/////////////////////////////////////////////////
if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '2.5');
if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '2.6');
?>
if ((basename(__FILE__) == 'config.php') && (file_exists(dirname(__FILE__).'/custom_config.php'))) {
require_once(dirname(__FILE__).'/custom_config.php');
}

View File

@ -13,20 +13,7 @@ SimplePie.org. We have kept most of their checks intact as we use SimplePie in o
http://github.com/simplepie/simplepie/tree/master/compatibility_test/
*/
$app_name = 'Full-Text RSS v2';
// test redirect
$url = parse_url('http://'.$_SERVER['HTTP_HOST'].$_SERVER['REQUEST_URI']);
$redirect_url = 'http://'.$url['host'].$url['path'].'?redirect=true';
if (isset($_GET['redirect'])) {
$url = parse_url('http://'.$_SERVER['HTTP_HOST'].$_SERVER['REQUEST_URI']);
$url = 'http://'.$url['host'].$url['path'].'?redirected=true';
header('Location: '.$url);
exit;
}
if (isset($_GET['redirected'])) {
die('Redirect works');
}
$app_name = 'Full-Text RSS 2.6';
$php_ok = (function_exists('version_compare') && version_compare(phpversion(), '5.2.0', '>='));
$pcre_ok = extension_loaded('pcre');
@ -35,9 +22,8 @@ $mbstring_ok = extension_loaded('mbstring');
$iconv_ok = extension_loaded('iconv');
$tidy_ok = function_exists('tidy_parse_string');
$curl_ok = function_exists('curl_exec');
$http_ok = (extension_loaded('http') && class_exists('HttpRequestPool'));
$parallel_ok = ((extension_loaded('http') && class_exists('HttpRequestPool')) || ($curl_ok && function_exists('curl_multi_init')));
$allow_url_fopen_ok = (bool)ini_get('allow_url_fopen');
$redirect_ok = ($allow_url_fopen_ok && file_get_contents($redirect_url) == 'Redirect works');
if (extension_loaded('xmlreader')) {
$xml_ok = true;
@ -241,21 +227,16 @@ div.chunk {
<td>Enabled</td>
<td><?php echo (extension_loaded('curl')) ? 'Enabled' : 'Disabled'; ?></td>
</tr>
<tr class="<?php echo ($http_ok) ? 'enabled' : 'disabled'; ?>">
<td><a href="http://php.net/HttpRequestPool">HttpRequestPool</a></td>
<tr class="<?php echo ($parallel_ok) ? 'enabled' : 'disabled'; ?>">
<td>Parallel URL fetching</td>
<td>Enabled</td>
<td><?php echo ($http_ok) ? 'Enabled' : 'Disabled'; ?></td>
<td><?php echo ($parallel_ok) ? 'Enabled' : 'Disabled'; ?></td>
</tr>
<tr class="<?php echo ($allow_url_fopen_ok) ? 'enabled' : 'disabled'; ?>">
<td><a href="http://www.php.net/manual/en/filesystem.configuration.php#ini.allow-url-fopen">allow_url_fopen</a></td>
<td>Enabled</td>
<td><?php echo ($allow_url_fopen_ok) ? 'Enabled' : 'Disabled'; ?></td>
</tr>
<tr class="<?php echo ($redirect_ok) ? 'enabled' : 'disabled'; ?>">
<td>HTTP Redirects</td>
<td>Enabled</td>
<td><?php echo ($redirect_ok) ? 'Enabled' : 'Disabled'; ?></td>
</tr>
</tbody>
</table>
</div>
@ -263,7 +244,7 @@ div.chunk {
<div class="chunk">
<h3>What does this mean?</h3>
<ol>
<?php if ($php_ok && $xml_ok && $pcre_ok && $mbstring_ok && $iconv_ok && $zlib_ok && $tidy_ok && $curl_ok && $http_ok && $allow_url_fopen_ok && $redirect_ok): ?>
<?php if ($php_ok && $xml_ok && $pcre_ok && $mbstring_ok && $iconv_ok && $zlib_ok && $tidy_ok && $curl_ok && $parallel_ok && $allow_url_fopen_ok): ?>
<li><em>You have everything you need to run <?php echo $app_name; ?> properly! Congratulations!</em></li>
<?php else: ?>
<?php if ($php_ok): ?>
@ -304,16 +285,10 @@ div.chunk {
<li><strong>cURL:</strong> The <code>cURL</code> extension is not available. SimplePie will use <code>fsockopen()</code> instead.</li>
<?php endif; ?>
<?php if ($http_ok): ?>
<li><strong>HttpRequestPool:</strong> You have <code>HttpRequestPool</code> support installed. <em>No problems here.</em></li>
<?php if ($parallel_ok): ?>
<li><strong>Parallel URL fetching:</strong> You have <code>HttpRequestPool</code> or <code>curl_multi</code> support installed. <em>No problems here.</em></li>
<?php else: ?>
<li><strong>HttpRequestPool:</strong> The <code>HttpRequestPool</code> class is not available. <?php echo $app_name; ?> will use <code>file_get_contents()</code> instead to fetch URLs sequentially rather than in parallel.</li>
<?php endif; ?>
<?php if ($redirect_ok): ?>
<li><strong>HTTP Redirects:</strong> Your server appears to handle redirects ok. <em>No problems here.</em></li>
<?php else: ?>
<li><strong>HTTP Redirects:</strong> Your server appears not to be able to handle HTTP redirects. <?php echo $app_name; ?> should still work with most feeds, but you may experience problems with some.</li>
<li><strong>Parallel URL fetching:</strong> <code>HttpRequestPool</code> or <code>curl_multi</code> support is not available. <?php echo $app_name; ?> will use <code>file_get_contents()</code> instead to fetch URLs sequentially rather than in parallel.</li>
<?php endif; ?>
<?php else: ?>

View File

@ -135,7 +135,7 @@ if (!defined('_FF_FTR_INDEX')) {
Then whenever you'd like a full-text feed, click the bookmarklet.</p>
<p>Drag this:
<script type="text/javascript">
document.write('<a style="cursor: move;" onclick="alert(\'drag to bookmarks toolbar\'); return false;" href="javascript:location.href=\''+baseUrl+'/makefulltextfeed.php?url=\'+escape(document.location.href);">Full-Text RSS</a>');
document.write('<a style="cursor: move;" onclick="alert(\'drag to bookmarks toolbar\'); return false;" href="javascript:location.href=\''+baseUrl+'/makefulltextfeed.php?url=\'+encodeURIComponent(document.location.href);">Full-Text RSS</a>');
</script>
<h3 id="api">API</h3>
@ -143,11 +143,6 @@ if (!defined('_FF_FTR_INDEX')) {
<ul>
<li style="font-family: monospace;"><script type="text/javascript">document.write(baseUrl);</script>/makefulltextfeed.php?url=<strong>[url]</strong></li>
</ul>
<p>If you have an API key, add that to the querystring:</p>
<ul>
<li style="font-family: monospace; white-space:nowrap;"><script type="text/javascript">document.write(baseUrl);</script>/makefulltextfeed.php?key=<strong>[key]</strong>&amp;url=<strong>[url]</strong></li>
<li style="font-family: monospace; white-space:nowrap;"><script type="text/javascript">document.write(baseUrl);</script>/makefulltextfeed.php?key=<strong>[key]</strong>&amp;max=<strong>[number of feed items]</strong>&amp;url=<strong>[url]</strong></li>
</ul>
<p>All the parameters in the form above can be passed in this way. Examine the URL in the addressbar after you click 'Create Feed' to see the values.</p>
@ -155,26 +150,27 @@ if (!defined('_FF_FTR_INDEX')) {
project licensed under the AGPL. You're free to <a href="http://fivefilters.org/content-only/#download">download your own copy</a>.</p>
<h3>Source Code and Technologies</h3>
<p><p>The application uses <a href="http://en.wikipedia.org/wiki/PHP">PHP</a>, <a href="http://www.keyvan.net/2010/08/php-readability/">PHP Readability</a>, <a href="http://simplepie.org/">SimplePie</a>, <a href="http://www.ajaxray.com/blog/2008/03/08/php-universal-feed-generator-supports-rss-10-rss-20-and-atom/">FeedWriter</a>, Humble HTTP Agent. Depending on configuration, these optional components may also be used: <a href="http://framework.zend.com/manual/en/zend.cache.introduction.html">Zend Cache</a>, <a href="http://framework.zend.com/manual/en/zend.dom.query.html">Zend DOM Query</a> and <a href="http://hg.gsnedders.com/iri/">IRI</a>. Readability is the magic piece of code that tries to identify and extract the content block from any given web page.</p>
<p>The application uses <a href="http://en.wikipedia.org/wiki/PHP">PHP</a>, <a href="http://www.keyvan.net/2010/08/php-readability/">PHP Readability</a>, <a href="http://simplepie.org/">SimplePie</a>, <a href="http://www.ajaxray.com/blog/2008/03/08/php-universal-feed-generator-supports-rss-10-rss-20-and-atom/">FeedWriter</a>, Humble HTTP Agent. Depending on your configuration, these optional components may also be used: <a href="http://framework.zend.com/manual/en/zend.cache.introduction.html">Zend Cache</a>, <a href="http://framework.zend.com/manual/en/zend.dom.query.html">Zend DOM Query</a>, <a href="http://code.google.com/p/rolling-curl/">Rolling Curl</a> and <a href="http://hg.gsnedders.com/iri/">IRI</a>. Readability is the magic piece of code that tries to identify and extract the content block from any given web page.</p>
<h3>System Requirements</h3>
<p>PHP 5.2 or above is required. A simple shared web hosting account will work fine.
The code has been tested on Windows and Linux using the Apache web server. If you're a Windows user, you can try it on your own machine using <a href="http://www.wampserver.com/en/index.php">WampServer</a>.</p>
The code has been tested on Windows and Linux using the Apache web server. If you're a Windows user, you can try it on your own machine using <a href="http://www.wampserver.com/en/index.php">WampServer</a>. It has also been reported as working under IIS, but we have not tested this ourselves.</p>
<h3 id="download">Download</h3>
<p>Download from <a href="http://fivefilters.org/content-only/#download">fivefilters.org</a> - old versions are available in the code repository.</p>
<h3>License</h3>
<p><a href="http://en.wikipedia.org/wiki/Affero_General_Public_License" style="border-bottom: none;"><img src="images/agplv3.png" alt="AGPL logo" /></a><br />This web application is licensed under the <a href="http://en.wikipedia.org/wiki/Affero_General_Public_License">AGPL version 3</a> &mdash; which basically means if you use the code to offer the same or similar service for your users, you are also required to share the code with your users so they can run it for themselves. (<a href="http://www.clipperz.com/users/marco/blog/2008/05/30/freedom_and_privacy_cloud_call_action">More on why this is important.</a>)</p>
<p><a href="http://en.wikipedia.org/wiki/Affero_General_Public_License" style="border-bottom: none;"><img src="images/agplv3.png" alt="AGPL logo" /></a><br />This web application is licensed under the <a href="http://en.wikipedia.org/wiki/Affero_General_Public_License">AGPL version 3</a> &mdash; which basically means if you use the code to offer the same or similar service for your users, you are also required to share the code with your users so they can examine the code and run it for themselves. (<a href="http://www.clipperz.com/users/marco/blog/2008/05/30/freedom_and_privacy_cloud_call_action">More on why this is important.</a>)</p>
<p>The libraries used by the application are licensed as follows...</p>
<ul>
<li>Readability: <a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License v2</a></li>
<li>PHP Readability: <a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License v2</a></li>
<li>SimplePie: <a href="http://en.wikipedia.org/wiki/BSD_license">BSD</a></li>
<li>FeedWriter: <a href="http://www.gnu.org/licenses/old-licenses/gpl-2.0.html">GPL v2</a></li>
<li>Humble HTTP Agent: <a href="http://en.wikipedia.org/wiki/Affero_General_Public_License">AGPL v3</a></li>
<li>Zend: <a href="http://framework.zend.com/license/new-bsd">New BSD</a></li>
<li>IRI: <a href="http://en.wikipedia.org/wiki/BSD_license">BSD</a></li>
<li>Rolling Curl: <a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License v2</a></li>
</ul>
</div>

View File

@ -3,37 +3,59 @@
* Humble HTTP Agent
*
* This class is designed to take advantage of parallel HTTP requests
* offered by PHP's PECL HTTP extension. For environments which
* do not have this extension, it reverts to standard sequential
* offered by PHP's PECL HTTP extension or the curl_multi_* functions.
* For environments which do not have these options, it reverts to standard sequential
* requests (using file_get_contents())
*
* @version 2010-10-19
* @version 0.8
* @date 2011-02-28
* @see http://php.net/HttpRequestPool
* @author Keyvan Minoukadeh
* @copyright 2010 Keyvan Minoukadeh
* @copyright 2011 Keyvan Minoukadeh
* @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
*/
class HumbleHttpAgent
{
const METHOD_REQUEST_POOL = 1;
const METHOD_CURL_MULTI = 2;
const METHOD_FILE_GET_CONTENTS = 4;
protected $requests = array();
protected $redirectQueue = array();
protected $requestOptions;
protected $parallelSupport;
protected $maxParallelRequests = 5;
protected $cache = null;
protected $cache = null; //TODO
protected $httpContext;
protected $minimiseMemoryUse = false;
protected $minimiseMemoryUse = false; //TODO
protected $debug = false;
protected $method;
public $rewriteHashbangFragment = true; // see http://code.google.com/web/ajaxcrawling/docs/specification.html
public $maxRedirects = 5;
//TODO: prevent certain file/mime types
//TODO: set max file size
//TODO: normalise headers
function __construct($requestOptions=null) {
$this->parallelSupport = class_exists('HttpRequestPool');
function __construct($requestOptions=null, $method=null) {
// set the request method
if (in_array($method, array(1,2,4))) {
$this->method = $method;
} else {
if (class_exists('HttpRequestPool')) {
$this->method = self::METHOD_REQUEST_POOL;
} elseif (function_exists('curl_multi_init')) {
$this->method = self::METHOD_CURL_MULTI;
} else {
$this->method = self::METHOD_FILE_GET_CONTENTS;
}
}
if ($this->method == self::METHOD_CURL_MULTI) {
require_once(dirname(__FILE__).'/RollingCurl.php');
}
$this->requestOptions = array(
'timeout' => 10,
'redirect' => 5
'redirect' => 0 // we handle redirects manually so we can rewrite the new hashbang URLs that are creeping up over the web
// TODO: test onprogress?
);
if (is_array($requestOptions)) {
@ -41,9 +63,10 @@ class HumbleHttpAgent
}
$this->httpContext = stream_context_create(array(
'http' => array(
'ignore_errors' => true,
'timeout' => $this->requestOptions['timeout'],
'max_redirects' => $this->requestOptions['redirect'],
'header' => "User-Agent: PHP/5.2\r\n".
'header' => "User-Agent: PHP/".phpversion()."\r\n".
"Accept: */*\r\n"
)
)
@ -61,6 +84,23 @@ class HumbleHttpAgent
}
}
/**
 * Rewrite a hashbang (#!) URL so that its fragment is sent as a query parameter.
 *
 * URLs that do not contain '#!' are returned unchanged. Otherwise the URL is
 * parsed with IRI, the fragment (minus its first character) is removed from
 * the URL and added to the query string as '_escaped_fragment_'.
 * See http://code.google.com/web/ajaxcrawling/docs/specification.html
 *
 * NOTE(review): the existing query string is round-tripped through parse_str()
 * and http_build_query(); parse_str() is documented to turn dots and spaces in
 * parameter names into underscores, so such names may not survive unchanged.
 *
 * @param string $url URL to inspect
 * @return string the original URL, or the rewritten URL as reported by the IRI object
 */
public function rewriteHashbangFragment($url) {
	// nothing to rewrite unless the URL contains '#!'
	if (strpos($url, '#!') === false) {
		return $url;
	}

	$iri = new IRI($url);

	// drop the first character of the fragment (the '!') and clear the fragment
	$escapedFragment = substr($iri->ifragment, 1);
	$iri->fragment = null;

	// start from the existing query parameters, if there are any
	$params = array();
	if (isset($iri->iquery)) {
		parse_str($iri->iquery, $params);
	}
	$params['_escaped_fragment_'] = (string)$escapedFragment;

	// keep slashes unencoded - needed for some sites
	$queryString = http_build_query($params);
	$iri->query = str_replace('%2F', '/', $queryString);

	return $iri->uri;
}
/**
 * Switch the agent's debug flag on or off.
 *
 * @param mixed $bool value converted to a boolean before being stored (defaults to true)
 * @return void
 */
public function enableDebug($bool=true) {
	$this->debug = $bool ? true : false;
}
@ -73,29 +113,30 @@ class HumbleHttpAgent
$this->maxParallelRequests = $max;
}
/**
* Set cache object.
* The cache object passed should implement Zend_Cache_Backend_Interface
* @param Zend_Cache_Backend_Interface
*/
public function useCache($cache) {
$this->cache = $cache;
}
public function validateUrl($url) {
//TODO: run sanitize filter first!
$url = filter_var($url, FILTER_SANITIZE_URL);
$test = filter_var($url, FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED);
// deal with bug http://bugs.php.net/51192 (present in PHP 5.2.13 and PHP 5.3.2)
if ($test === false) {
$test = filter_var(strtr($url, '-', '_'), FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED);
}
if ($test !== false && $test !== null && preg_match('!^https?://!', $url)) {
return filter_var($url, FILTER_SANITIZE_URL);
return $url;
} else {
return false;
}
}
/**
* Set cache object.
* The cache object passed should implement Zend_Cache_Backend_Interface
* @param Zend_Cache_Backend_Interface
*/
/* all disk caching temporarily disabled - needs work
public function useCache($cache) {
$this->cache = $cache;
}
public function isCached($url) {
if (!isset($this->cache)) return false;
return ($this->cache->test(md5($url)) !== false);
@ -126,30 +167,50 @@ class HumbleHttpAgent
}
return false;
}
*/
public function fetchAll(array $urls) {
$urls = array_unique($urls);
// parallel
if (count($urls) > 1 && $this->parallelSupport() && $this->maxParallelRequests > 1) {
$this->debug('Starting parallel fetch');
$this->fetchAllOnce($urls, $isRedirect=false);
$redirects = 0;
while (!empty($this->redirectQueue) && ++$redirects <= $this->maxRedirects) {
$this->debug("Following redirects #$redirects...");
$this->fetchAllOnce($this->redirectQueue, $isRedirect=true);
}
}
// fetch all URLs without following redirects
public function fetchAllOnce(array $urls, $isRedirect=false) {
if (!$isRedirect) $urls = array_unique($urls);
if (empty($urls)) return;
//////////////////////////////////////////////////////
// parallel (HttpRequestPool)
if ($this->method == self::METHOD_REQUEST_POOL) {
$this->debug('Starting parallel fetch (HttpRequestPool)');
try {
while (count($urls) > 0) {
$this->debug('Processing set of '.$this->maxParallelRequests);
$this->debug('Processing set of '.min($this->maxParallelRequests, count($urls)));
$subset = array_splice($urls, 0, $this->maxParallelRequests);
$pool = new HttpRequestPool();
foreach ($subset as $url) {
foreach ($subset as $orig => $url) {
if (!$isRedirect) $orig = $url;
unset($this->redirectQueue[$orig]);
$this->debug("...$url");
if (isset($this->requests[$url])) {
if (!$isRedirect && isset($this->requests[$url])) {
$this->debug("......in memory");
/*
} elseif ($this->isCached($url)) {
$this->debug("......is cached");
if (!$this->minimiseMemoryUse) {
$this->requests[$url] = $this->getCached($url);
}
*/
} else {
$this->debug("......adding to pool");
$httpRequest = new HttpRequest($url, HttpRequest::METH_GET, $this->requestOptions);
$this->requests[$url] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest);
$req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($url) : $url;
$httpRequest = new HttpRequest($req_url, HttpRequest::METH_GET, $this->requestOptions);
$this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest);
$this->requests[$orig]['original_url'] = $orig;
$pool->attach($httpRequest);
}
}
@ -158,21 +219,37 @@ class HumbleHttpAgent
$this->debug('Sending request...');
$pool->send();
$this->debug('Received responses');
foreach($subset as $url) {
if (!isset($this->requests[$url]['fromCache'])) {
$request = $this->requests[$url]['httpRequest'];
$this->requests[$url]['headers'] = $this->headersToString($request->getResponseHeader());
$this->requests[$url]['body'] = $request->getResponseBody();
$this->requests[$url]['effective_url'] = $request->getResponseInfo('effective_url');
foreach($subset as $orig => $url) {
if (!$isRedirect) $orig = $url;
//if (!isset($this->requests[$url]['fromCache'])) {
$request = $this->requests[$orig]['httpRequest'];
//$this->requests[$orig]['headers'] = $this->headersToString($request->getResponseHeader());
// getResponseHeader() doesn't return status line, so, for consistency...
$this->requests[$orig]['headers'] = substr($request->getRawResponseMessage(), 0, $request->getResponseInfo('header_size'));
$this->requests[$orig]['body'] = $request->getResponseBody();
$this->requests[$orig]['effective_url'] = $request->getResponseInfo('effective_url');
$this->requests[$orig]['status_code'] = $status_code = $request->getResponseCode();
if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && $request->getResponseHeader('location')) {
$redirectURL = $request->getResponseHeader('location');
$redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
if ($this->validateURL($redirectURL)) {
$this->debug('Redirect detected. Valid URL: '.$redirectURL);
$this->redirectQueue[$orig] = $redirectURL;
} else {
$this->debug('Redirect detected. Invalid URL: '.$redirectURL);
}
}
//die($url.' -multi- '.$request->getResponseInfo('effective_url'));
$pool->detach($request);
unset($this->requests[$url]['httpRequest'], $request);
unset($this->requests[$orig]['httpRequest'], $request);
/*
if ($this->minimiseMemoryUse) {
if ($this->cache($url)) {
unset($this->requests[$url]);
}
}
}
*/
//}
}
}
}
@ -180,12 +257,143 @@ class HumbleHttpAgent
$this->debug($e);
return false;
}
// sequential
} else {
$this->debug('Starting sequential fetch...');
foreach($urls as $url) {
$this->get($url);
}
//////////////////////////////////////////////////////////
// parallel (curl_multi_*)
elseif ($this->method == self::METHOD_CURL_MULTI) {
$this->debug('Starting parallel fetch (curl_multi_*)');
while (count($urls) > 0) {
$this->debug('Processing set of '.min($this->maxParallelRequests, count($urls)));
$subset = array_splice($urls, 0, $this->maxParallelRequests);
$pool = new RollingCurl(array($this, 'handleCurlResponse'));
$pool->window_size = count($subset);
foreach ($subset as $orig => $url) {
if (!$isRedirect) $orig = $url;
unset($this->redirectQueue[$orig]);
$this->debug("...$url");
if (!$isRedirect && isset($this->requests[$url])) {
$this->debug("......in memory");
/*
} elseif ($this->isCached($url)) {
$this->debug("......is cached");
if (!$this->minimiseMemoryUse) {
$this->requests[$url] = $this->getCached($url);
}
*/
} else {
$this->debug("......adding to pool");
$req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($url) : $url;
$httpRequest = new RollingCurlRequest($req_url, 'GET', null, null, array(
CURLOPT_CONNECTTIMEOUT => $this->requestOptions['timeout'],
CURLOPT_TIMEOUT => $this->requestOptions['timeout']
));
$httpRequest->set_original_url($orig);
$this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest);
$this->requests[$orig]['original_url'] = $orig; // TODO: is this needed anymore?
$pool->add($httpRequest);
}
}
// did we get anything into the pool?
if (count($pool) > 0) {
$this->debug('Sending request...');
$pool->execute(); // this will call handleCurlResponse() and populate $this->requests[$orig]
$this->debug('Received responses');
foreach($subset as $orig => $url) {
if (!$isRedirect) $orig = $url;
// $this->requests[$orig]['headers']
// $this->requests[$orig]['body']
// $this->requests[$orig]['effective_url']
$status_code = $this->requests[$orig]['status_code'];
if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) {
$redirectURL = $this->requests[$orig]['location'];
$redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
if ($this->validateURL($redirectURL)) {
$this->debug('Redirect detected. Valid URL: '.$redirectURL);
$this->redirectQueue[$orig] = $redirectURL;
} else {
$this->debug('Redirect detected. Invalid URL: '.$redirectURL);
}
}
// die($url.' -multi- '.$request->getResponseInfo('effective_url'));
unset($this->requests[$orig]['httpRequest']);
}
}
}
}
//////////////////////////////////////////////////////
// sequential (file_get_contents)
else {
$this->debug('Starting sequential fetch (file_get_contents)');
$this->debug('Processing set of '.count($urls));
foreach ($urls as $orig => $url) {
if (!$isRedirect) $orig = $url;
unset($this->redirectQueue[$orig]);
$this->debug("...$url");
if (!$isRedirect && isset($this->requests[$url])) {
$this->debug("......in memory");
/*
} elseif ($this->isCached($url)) {
$this->debug("......is cached");
if (!$this->minimiseMemoryUse) {
$this->requests[$url] = $this->getCached($url);
}
*/
} else {
$this->debug("Sending request for $url");
$this->requests[$orig]['original_url'] = $orig;
$req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($url) : $url;
if (false !== ($html = @file_get_contents($req_url, false, $this->httpContext))) {
$this->debug('Received response');
// get status code
if (!isset($http_response_header[0]) || !preg_match('!^HTTP/\d+\.\d+\s+(\d+)!', trim($http_response_header[0]), $match)) {
$this->debug('Error: no status code found');
// TODO: handle error - no status code
} else {
$this->requests[$orig]['headers'] = $this->headersToString($http_response_header, false);
$this->requests[$orig]['body'] = $html;
$this->requests[$orig]['effective_url'] = $req_url;
$this->requests[$orig]['status_code'] = $status_code = (int)$match[1];
unset($match);
// handle redirect
if (preg_match('/^Location:(.*?)$/m', $this->requests[$orig]['headers'], $match)) {
$this->requests[$orig]['location'] = trim($match[1]);
}
if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) {
$redirectURL = $this->requests[$orig]['location'];
$redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
if ($this->validateURL($redirectURL)) {
$this->debug('Redirect detected. Valid URL: '.$redirectURL);
$this->redirectQueue[$orig] = $redirectURL;
} else {
$this->debug('Redirect detected. Invalid URL: '.$redirectURL);
}
}
}
} else {
$this->debug('Error retrieving URL');
//print_r($req_url);
//print_r($http_response_header);
//print_r($html);
// TODO: handle error - failed to retrieve URL
}
}
}
}
}
/**
 * Callback handed to RollingCurl: records one completed curl response.
 *
 * The raw response is split into headers and body using the reported header
 * size, and the result is stored in $this->requests under the request's
 * original URL, together with the effective URL and the status code.
 * If the headers contain a Location header, its value is stored as well so
 * the caller can queue the redirect.
 *
 * @param string $response raw response (headers followed by body)
 * @param array $info request info; the 'header_size', 'url' and 'http_code'
 *                    keys are read (presumably the curl_getinfo() array - confirm in RollingCurl)
 * @param RollingCurlRequest $request the request this response belongs to
 * @return void
 */
public function handleCurlResponse($response, $info, $request) {
	$orig = $request->url_original;
	$headers = substr($response, 0, $info['header_size']);
	$this->requests[$orig]['headers'] = $headers;
	$this->requests[$orig]['body'] = substr($response, $info['header_size']);
	$this->requests[$orig]['effective_url'] = $info['url'];
	$this->requests[$orig]['status_code'] = (int)$info['http_code'];
	// HTTP header names are case-insensitive, so match 'Location', 'location', 'LOCATION', ...
	if (preg_match('/^Location:(.*?)$/mi', $headers, $match)) {
		// trim() also removes the "\r" left over from the CRLF line ending
		$this->requests[$orig]['location'] = trim($match[1]);
	}
}
@ -205,50 +413,38 @@ class HumbleHttpAgent
}
}
protected function getRedirectUrl($header) {
if (is_array($header)) $header = implode("\n", $header);
if (!$header || !preg_match_all('!^Location:\s*(https?://.+)!im', $header, $match, PREG_SET_ORDER)) {
// error parsing the response
return false;
} else {
$match = end($match); // get last matched element (in case of redirects)
return $match[1];
}
}
public function get($url) {
public function get($url, $remove=false) {
if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) {
$this->debug("URL already fetched - in memory ($url)");
$this->debug("URL already fetched - in memory ($url, effective: {$this->requests[$url]['effective_url']})");
$response = $this->requests[$url];
/*
} elseif ($this->isCached($url)) {
$this->debug("URL already fetched - in disk cache ($url)");
$response = $this->getCached($url);
$this->requests[$url] = $response;
*/
} else {
$this->debug("Fetching URL ($url)");
if ($html = @file_get_contents($url, false, $this->httpContext)) {
$header = $this->headersToString($http_response_header, false);
$response = array('headers'=>$header, 'body'=>$html);
if ($last_url = $this->getRedirectUrl($header)) {
$response['effective_url'] = $last_url;
//die($url .' -single- '. $response['effective_url']);
} else {
$response['effective_url'] = $url;
}
$this->requests[$url] = $response;
$this->fetchAll(array($url));
if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) {
$response = $this->requests[$url];
} else {
$this->debug("Request failed");
$response = false;
}
}
/*
if ($this->minimiseMemoryUse && $response) {
$this->cache($url);
unset($this->requests[$url]);
}
*/
if ($remove && $response) unset($this->requests[$url]);
return $response;
}
public function parallelSupport() {
return $this->parallelSupport;
return class_exists('HttpRequestPool') || function_exists('curl_multi_init');
}
}
?>

View File

@ -0,0 +1,392 @@
<?php
/*
Authored by Josh Fraser (www.joshfraser.com)
Released under Apache License 2.0
Maintained by Alexander Makarov, http://rmcreative.ru/
Modified by Keyvan Minoukadeh for the Five Filters project: http://fivefilters.org
*/
/**
 * Plain data holder describing one request queued on a RollingCurl instance.
 */
class RollingCurlRequest {
	public $url = false;
	public $url_original = false; // used for tracking redirects
	public $method = 'GET';
	public $post_data = null;
	public $headers = null;
	public $options = null;

	/**
	 * Store the request parameters; the original URL starts out equal to $url.
	 *
	 * @param string $url
	 * @param string $method
	 * @param $post_data
	 * @param $headers
	 * @param $options
	 * @return void
	 */
	function __construct($url, $method = "GET", $post_data = null, $headers = null, $options = null) {
		$initial = array(
			'url'          => $url,
			'url_original' => $url,
			'method'       => $method,
			'post_data'    => $post_data,
			'headers'      => $headers,
			'options'      => $options,
		);
		foreach ($initial as $property => $value) {
			$this->{$property} = $value;
		}
	}

	/**
	 * Record the URL this request originally started from.
	 *
	 * @param string $url
	 * @return void
	 */
	public function set_original_url($url) {
		$this->url_original = $url;
	}

	/**
	 * Drop every property when the request object is destroyed.
	 *
	 * @return void
	 */
	public function __destruct() {
		$properties = array('url', 'url_original', 'method', 'post_data', 'headers', 'options');
		foreach ($properties as $property) {
			unset($this->{$property});
		}
	}
}
/**
 * RollingCurl custom exception.
 *
 * Adds nothing to the base Exception; it exists so callers can catch
 * failures raised by RollingCurl specifically (RollingCurl::rolling_curl()
 * throws it when the effective window size is below 2).
 */
class RollingCurlException extends Exception {
}
/**
 * Class that holds a rolling queue of curl requests.
 *
 * Requests are queued with add()/request()/get()/post() and dispatched by
 * execute(): a single queued request is run with a plain curl handle, more
 * than one with curl_multi and a sliding window of simultaneous transfers.
 * Each completed transfer is handed to the configured callback.
 *
 * @throws RollingCurlException
 */
class RollingCurl implements Countable {
	/**
	 * @var int
	 *
	 * Window size is the max number of simultaneous connections allowed.
	 *
	 * REMEMBER TO RESPECT THE SERVERS:
	 * Sending too many requests at one time can easily be perceived
	 * as a DOS attack. Increase this window_size if you are making requests
	 * to multiple servers or have permission from the receiving server admins.
	 */
	private $window_size = 5;

	/**
	 * @var float
	 *
	 * Timeout is the timeout used for curl_multi_select.
	 */
	private $timeout = 10;

	/**
	 * @var string|array
	 *
	 * Callback function to be applied to each result.
	 */
	private $callback;

	/**
	 * @var array
	 *
	 * Set your base options that you want to be used with EVERY request.
	 */
	protected $options = array(
		CURLOPT_SSL_VERIFYPEER => 0,
		CURLOPT_RETURNTRANSFER => 1,
		CURLOPT_CONNECTTIMEOUT => 30,
		CURLOPT_TIMEOUT => 30
	);

	/**
	 * @var array
	 *
	 * Headers sent with every request (CURLOPT_HTTPHEADER).
	 */
	private $headers = array();

	/**
	 * @var RollingCurlRequest[]
	 *
	 * The request queue
	 */
	private $requests = array();

	/**
	 * @var array
	 *
	 * Maps handle keys (see handle_key()) to request indexes
	 */
	private $requestMap = array();

	/**
	 * @param $callback
	 * Callback function to be applied to each result.
	 *
	 * Can be specified as 'my_callback_function'
	 * or array($object, 'my_callback_method').
	 *
	 * Function should take three parameters: $response, $info, $request.
	 * $response is response body, $info is additional curl info.
	 * $request is the original request
	 *
	 * @return void
	 */
	function __construct($callback = null) {
		$this->callback = $callback;
	}

	/**
	 * Read access to the non-public settings (window_size, timeout, options, ...).
	 *
	 * @param string $name
	 * @return mixed the property value, or null when it is unset
	 */
	public function __get($name) {
		return (isset($this->{$name})) ? $this->{$name} : null;
	}

	/**
	 * Write access to the non-public settings.
	 *
	 * "options" and "headers" are merged into the existing values (the new
	 * value wins on key conflicts); everything else is overwritten.
	 *
	 * @param string $name
	 * @param mixed $value
	 * @return bool
	 */
	public function __set($name, $value) {
		// append the base options & headers
		if ($name == "options" || $name == "headers") {
			$this->{$name} = $value + $this->{$name};
		} else {
			$this->{$name} = $value;
		}
		return true;
	}

	/**
	 * Count number of requests added (Countable interface)
	 *
	 * @return int
	 */
	#[\ReturnTypeWillChange]
	public function count() {
		return count($this->requests);
	}

	/**
	 * Add a request to the request queue
	 *
	 * @param RollingCurlRequest $request
	 * @return bool
	 */
	public function add($request) {
		$this->requests[] = $request;
		return true;
	}

	/**
	 * Create new Request and add it to the request queue
	 *
	 * @param string $url
	 * @param string $method
	 * @param $post_data
	 * @param $headers
	 * @param $options
	 * @return bool
	 */
	public function request($url, $method = "GET", $post_data = null, $headers = null, $options = null) {
		$this->requests[] = new RollingCurlRequest($url, $method, $post_data, $headers, $options);
		return true;
	}

	/**
	 * Perform GET request
	 *
	 * @param string $url
	 * @param $headers
	 * @param $options
	 * @return bool
	 */
	public function get($url, $headers = null, $options = null) {
		return $this->request($url, "GET", null, $headers, $options);
	}

	/**
	 * Perform POST request
	 *
	 * @param string $url
	 * @param $post_data
	 * @param $headers
	 * @param $options
	 * @return bool
	 */
	public function post($url, $post_data = null, $headers = null, $options = null) {
		return $this->request($url, "POST", $post_data, $headers, $options);
	}

	/**
	 * Execute processing
	 *
	 * @param int $window_size Max number of simultaneous connections
	 * @return string|bool
	 */
	public function execute($window_size = null) {
		// rolling curl window must always be greater than 1
		if (sizeof($this->requests) == 1) {
			return $this->single_curl();
		} else {
			// start the rolling curl. window_size is the max number of simultaneous connections
			return $this->rolling_curl($window_size);
		}
	}

	/**
	 * Performs a single curl request
	 *
	 * The request is removed from the queue. When a callback is configured
	 * it receives the result and true is returned; otherwise the raw
	 * curl_exec() output is returned.
	 *
	 * @access private
	 * @return string|bool
	 */
	private function single_curl() {
		$ch = curl_init();
		$request = array_shift($this->requests);
		curl_setopt_array($ch, $this->get_options($request));
		$output = curl_exec($ch);
		$info = curl_getinfo($ch);
		// everything we need has been read; release the handle
		$this->close_handle($ch);

		// it's not necessary to set a callback for one-off requests
		if (!$this->callback) {
			return $output;
		}
		if (is_callable($this->callback)) {
			call_user_func($this->callback, $output, $info, $request);
		}
		return true;
	}

	/**
	 * Performs multiple curl requests
	 *
	 * @access private
	 * @throws RollingCurlException
	 * @param int $window_size Max number of simultaneous connections
	 * @return bool
	 */
	private function rolling_curl($window_size = null) {
		if ($window_size)
			$this->window_size = $window_size;

		// make sure the rolling window isn't greater than the # of urls.
		// Clamp a local copy so a small batch does not permanently shrink
		// the configured window size.
		$window = min($this->window_size, sizeof($this->requests));
		if ($window < 2) {
			throw new RollingCurlException("Window size must be greater than 1");
		}

		$master = curl_multi_init();
		// handles currently attached to $master, keyed by handle_key()
		$active = array();

		// start the first batch of requests
		for ($i = 0; $i < $window; $i++) {
			$ch = $this->start_request($master, $i);
			$active[$this->handle_key($ch)] = $ch;
		}

		do {
			// set when a handle is attached after the curl_multi_exec() call
			// below; $running does not account for it yet
			$added = false;

			while (($execrun = curl_multi_exec($master, $running)) == CURLM_CALL_MULTI_PERFORM) ;
			if ($execrun != CURLM_OK)
				break;

			// a request was just completed -- find out which one
			while ($done = curl_multi_info_read($master)) {
				$handle = $done['handle'];
				$key = $this->handle_key($handle);

				// get the info and content returned on the request
				$info = curl_getinfo($handle);
				$output = curl_multi_getcontent($handle);

				// send the return values to the callback function.
				$callback = $this->callback;
				if (is_callable($callback)) {
					$request = $this->requests[$this->requestMap[$key]];
					call_user_func($callback, $output, $info, $request);
				}
				unset($this->requestMap[$key]);

				// start a new request (it's important to do this before removing the old one).
				// The callback above may have queued further requests.
				if ($i < count($this->requests) && isset($this->requests[$i])) {
					$ch = $this->start_request($master, $i);
					$active[$this->handle_key($ch)] = $ch;
					$i++;
					$added = true;
				}

				// remove the curl handle that just completed
				curl_multi_remove_handle($master, $handle);
				$this->close_handle($handle);
				unset($active[$key]);
			}

			// Block for data in / output; error handling is done by curl_multi_exec
			if ($running)
				curl_multi_select($master, $this->timeout);

			// keep going while transfers are in flight OR a handle was attached
			// after the last exec -- otherwise requests queued when every running
			// transfer finished at once would never be performed
		} while ($running || $added);

		// release anything still attached (only possible after an exec error)
		foreach ($active as $key => $ch) {
			curl_multi_remove_handle($master, $ch);
			$this->close_handle($ch);
			unset($this->requestMap[$key]);
		}

		curl_multi_close($master);
		return true;
	}

	/**
	 * Create a curl handle for the queued request at $index, attach it to
	 * the multi handle and record it in the request map.
	 *
	 * @access private
	 * @param mixed $master curl multi handle
	 * @param int $index position in the request queue
	 * @return mixed the new curl handle
	 */
	private function start_request($master, $index) {
		$ch = curl_init();
		curl_setopt_array($ch, $this->get_options($this->requests[$index]));
		curl_multi_add_handle($master, $ch);
		// Add to our request Maps
		$this->requestMap[$this->handle_key($ch)] = $index;
		return $ch;
	}

	/**
	 * Build a string key identifying a curl handle.
	 *
	 * Handles are resources before PHP 8 (castable to string) and objects
	 * from PHP 8 on (not castable), so objects are keyed by object id.
	 *
	 * @access private
	 * @param mixed $ch curl handle
	 * @return string
	 */
	private function handle_key($ch) {
		return is_object($ch) ? 'curl#' . spl_object_id($ch) : (string) $ch;
	}

	/**
	 * Close a curl handle where that is still required.
	 *
	 * Only resource handles (PHP < 8) need an explicit curl_close(); object
	 * handles are freed automatically.
	 *
	 * @access private
	 * @param mixed $ch curl handle
	 * @return void
	 */
	private function close_handle($ch) {
		if (is_resource($ch)) {
			curl_close($ch);
		}
	}

	/**
	 * Helper function to set up a new request by setting the appropriate options
	 *
	 * Precedence: base options, then redirect/POST settings, then the
	 * request's own options (which win), then URL and header settings.
	 *
	 * @access private
	 * @param RollingCurlRequest $request
	 * @return array
	 */
	private function get_options($request) {
		// options for this entire curl object
		$options = $this->__get('options');

		// We're managing redirects in PHP - allows us to intervene and rewrite/block URLs
		// before the next request goes out.
		$options[CURLOPT_FOLLOWLOCATION] = 0;
		$options[CURLOPT_MAXREDIRS] = 0;

		// posting data w/ cURL -- without this, post() would go out as a GET
		$is_post = isset($request->method) && strtoupper((string) $request->method) === 'POST';
		if ($is_post || !empty($request->post_data)) {
			$options[CURLOPT_POST] = 1;
			$options[CURLOPT_POSTFIELDS] = isset($request->post_data) ? $request->post_data : '';
		}

		// NOTE(review): only the object-wide headers are applied; the
		// per-request $request->headers value is not used here.
		$headers = $this->__get('headers');

		// append custom options for this specific request
		if ($request->options) {
			$options = $request->options + $options;
		}

		// set the request URL
		$options[CURLOPT_URL] = $request->url;

		if ($headers) {
			$options[CURLOPT_HTTPHEADER] = $headers;
		}

		// return response headers
		$options[CURLOPT_HEADER] = 1;

		return $options;
	}

	/**
	 * @return void
	 */
	public function __destruct() {
		unset($this->window_size, $this->callback, $this->options, $this->headers, $this->requests, $this->requestMap);
	}
}

View File

@ -0,0 +1,76 @@
<?php
/**
 * Humble HTTP Agent extension for SimplePie_File
 *
 * This class is designed to extend and override SimplePie_File
 * in order to prevent duplicate HTTP requests being sent out.
 * The idea is to initialise an instance of Humble HTTP Agent
 * and attach it, to a static class variable, of this class.
 * SimplePie will then automatically initialise this class
 *
 * @date 2011-02-28
 */
class SimplePie_HumbleHttpAgent extends SimplePie_File
{
	/** @var HumbleHttpAgent shared agent used for every fetch; see set_agent() */
	protected static $agent;
	var $url;
	var $useragent;
	var $success = true;
	var $headers = array();
	var $body;
	var $status_code;
	var $redirects = 0;
	var $error;
	var $method = SIMPLEPIE_FILE_SOURCE_NONE;

	/**
	 * Attach the HumbleHttpAgent instance that all instances of this class
	 * will fetch through.
	 *
	 * @param HumbleHttpAgent $agent
	 * @return void
	 */
	public static function set_agent(HumbleHttpAgent $agent) {
		self::$agent = $agent;
	}

	/**
	 * Fetch $url through the shared agent and populate headers, body and
	 * status code. On any failure $success is set to false and $error
	 * describes the problem.
	 *
	 * $timeout, $redirects, $headers and $force_fsockopen are accepted for
	 * signature compatibility with the parent class but are not used here.
	 *
	 * @param string $url
	 * @param int $timeout
	 * @param int $redirects
	 * @param array $headers
	 * @param string $useragent
	 * @param bool $force_fsockopen
	 */
	public function __construct($url, $timeout = 10, $redirects = 5, $headers = null, $useragent = null, $force_fsockopen = false) {
		if (class_exists('idna_convert'))
		{
			// encode internationalised host names before the URL is used
			$idn = new idna_convert();
			$parsed = SimplePie_Misc::parse_url($url);
			$url = SimplePie_Misc::compress_parse_url($parsed['scheme'], $idn->encode($parsed['authority']), $parsed['path'], $parsed['query'], $parsed['fragment']);
		}
		$this->url = $url;
		$this->useragent = $useragent;

		if (!preg_match('/^http(s)?:\/\//i', $url))
		{
			$this->error = 'invalid URL';
			$this->success = false;
			return;
		}

		$this->method = SIMPLEPIE_FILE_SOURCE_REMOTE | SIMPLEPIE_FILE_SOURCE_CURL;

		// without an agent the call below would be a fatal error
		if (self::$agent === null)
		{
			$this->error = 'HTTP agent not set';
			$this->success = false;
			return;
		}

		//TODO: allow for HTTP headers ($headers is currently not passed on)
		$response = self::$agent->get($url);

		if ($response === false || !isset($response['status_code'])) {
			$this->error = 'failed to fetch URL';
			$this->success = false;
			return;
		}

		$parser = new SimplePie_HTTP_Parser($response['headers']);
		if (!$parser->parse()) {
			// previously this left $success true with no headers or body set
			$this->error = 'failed to parse HTTP response headers';
			$this->success = false;
			return;
		}

		$this->headers = $parser->headers;
		// body comes from the agent response, not from the header parser
		$this->body = $response['body'];
		$this->status_code = $parser->status_code;
	}
}
?>

View File

@ -1086,7 +1086,7 @@ class IRI
{
$iri .= '//' . $iauthority;
}
$iri .= $this->ipath;
$iri .= ($this->ipath) ? $this->ipath : '/';
if ($this->iquery !== null)
{
$iri .= '?' . $this->iquery;

View File

@ -114,6 +114,7 @@ class Readability
$html = preg_replace($this->regexps['replaceFonts'], '<$1span>', $html);
$html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
$this->dom = new DOMDocument();
$this->dom->preserveWhiteSpace = false;
$this->dom->registerNodeClass('DOMElement', 'JSLikeHTMLElement');
@$this->dom->loadHTML($html);
$this->url = $url;

View File

@ -12,6 +12,15 @@ $html = file_get_contents($url);
// first before passing it to PHP Readability.
// Both iconv() and mb_convert_encoding() can do this.
// If we've got Tidy, let's clean up input.
// This step is highly recommended - PHP's default HTML parser
// often does a terrible job and results in strange output.
if (function_exists('tidy_parse_string')) {
$tidy = tidy_parse_string($html, array(), 'UTF8');
$tidy->cleanRepair();
$html = $tidy->value;
}
// give it to Readability
$readability = new Readability($html, $url);
// print debug output?

View File

@ -3,8 +3,8 @@
// Author: Keyvan Minoukadeh
// Copyright (c) 2011 Keyvan Minoukadeh
// License: AGPLv3
// Version: 2.5
// Date: 2011-01-08
// Version: 2.6
// Date: 2011-03-02
/*
This program is free software: you can redistribute it and/or modify
@ -44,6 +44,8 @@ function __autoload($class_name) {
// Include SimplePie for RSS/Atom parsing
'SimplePie' => 'simplepie/simplepie.class.php',
'SimplePie_Misc' => 'simplepie/simplepie.class.php',
'SimplePie_HTTP_Parser' => 'simplepie/simplepie.class.php',
'SimplePie_File' => 'simplepie/simplepie.class.php',
// Include FeedCreator for RSS/Atom creation
'FeedWriter' => 'feedwriter/FeedWriter.php',
'FeedItem' => 'feedwriter/FeedItem.php',
@ -51,6 +53,7 @@ function __autoload($class_name) {
'Readability' => 'readability/Readability.php',
// Include Humble HTTP Agent to allow parallel requests and response caching
'HumbleHttpAgent' => 'humble-http-agent/HumbleHttpAgent.php',
'SimplePie_HumbleHttpAgent' => 'humble-http-agent/SimplePie_HumbleHttpAgent.php',
// Include IRI class for resolving relative URLs
'IRI' => 'iri/iri.php',
// Include Zend Cache to improve performance (cache results)
@ -67,13 +70,31 @@ function __autoload($class_name) {
}
}
/**
 * Decide whether a URL may be fetched, based on the global $options.
 *
 * When $options->allowed_urls is non-empty it acts as a whitelist and the
 * blocklist is ignored: the URL must contain at least one of the entries.
 * Otherwise the URL is rejected if it contains any $options->blocked_urls
 * entry. Matching is a case-insensitive substring test; empty entries are
 * ignored.
 *
 * NOTE(review): substring matching means an entry can match anywhere in the
 * URL (path or query string included), not just the host.
 *
 * @param string $url
 * @return bool true when the URL is allowed
 */
function url_allowed($url) {
	global $options;

	if (!empty($options->allowed_urls)) {
		foreach ((array) $options->allowed_urls as $allowurl) {
			$allowurl = (string) $allowurl;
			// an empty needle would match every URL on PHP 8 (and warn before)
			if ($allowurl !== '' && stripos($url, $allowurl) !== false) {
				return true;
			}
		}
		return false;
	}

	// tolerate a config that does not define a blocklist at all
	$blocked = isset($options->blocked_urls) ? (array) $options->blocked_urls : array();
	foreach ($blocked as $blockurl) {
		$blockurl = (string) $blockurl;
		if ($blockurl !== '' && stripos($url, $blockurl) !== false) {
			return false;
		}
	}
	return true;
}
////////////////////////////////
// Load config file if it exists
////////////////////////////////
require_once(dirname(__FILE__).'/config.php');
if (file_exists(dirname(__FILE__).'/custom_config.php')) {
require_once(dirname(__FILE__).'/custom_config.php');
}
//////////////////////////////////////////////
// Convert $html to UTF8
@ -191,9 +212,16 @@ $url = $_GET['url'];
if (!preg_match('!^https?://.+!i', $url)) {
$url = 'http://'.$url;
}
$valid_url = filter_var($url, FILTER_VALIDATE_URL);
if ($valid_url !== false && $valid_url !== null && preg_match('!^https?://!', $valid_url)) {
$url = filter_var($url, FILTER_SANITIZE_URL);
$test = filter_var($url, FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED);
// deal with bug http://bugs.php.net/51192 (present in PHP 5.2.13 and PHP 5.3.2)
if ($test === false) {
$test = filter_var(strtr($url, '-', '_'), FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED);
}
if ($test !== false && $test !== null && preg_match('!^https?://!', $url)) {
// all okay
unset($test);
} else {
die('Invalid URL supplied');
}
@ -231,6 +259,16 @@ if (isset($_GET['key']) && ($key_index = array_search($_GET['key'], $options->ap
exit;
}
///////////////////////////////////////////////
// Set timezone.
// Prevents warnings, but needs more testing -
// perhaps if timezone is set in php.ini we
// don't need to set it at all...
///////////////////////////////////////////////
if (!ini_get('date.timezone') || !@date_default_timezone_set(ini_get('date.timezone'))) {
date_default_timezone_set('UTC');
}
///////////////////////////////////////////////
// Check if the request is explicitly for an HTML page
///////////////////////////////////////////////
@ -246,25 +284,8 @@ if (isset($_GET['key']) && isset($_GET['hash']) && isset($options->api_keys[(int
///////////////////////////////////////////////
// Check URL against list of blacklisted URLs
// TODO: set up better system for this
///////////////////////////////////////////////
if (!empty($options->allowed_urls)) {
$allowed = false;
foreach ($options->allowed_urls as $allowurl) {
if (strstr($url, $allowurl) !== false) {
$allowed = true;
break;
}
}
if (!$allowed) die('URL not allowed');
} else {
foreach ($options->blocked_urls as $blockurl) {
if (strstr($url, $blockurl) !== false) {
die('URL blocked');
}
}
}
if (!url_allowed($url)) die('URL blocked');
///////////////////////////////////////////////
// Max entries
@ -441,7 +462,10 @@ if (function_exists('tidy_parse_string')) {
// Get RSS/Atom feed
////////////////////////////////
if (!$html_only) {
// configure SimplePie HTTP extension class to use our HumbleHttpAgent instance
SimplePie_HumbleHttpAgent::set_agent($http);
$feed = new SimplePie();
$feed->set_file_class('SimplePie_HumbleHttpAgent');
$feed->set_feed_url($url);
$feed->set_autodiscovery_level(SIMPLEPIE_LOCATOR_NONE);
$feed->set_timeout(20);
@ -466,7 +490,10 @@ if ($html_only || !$result) {
unset($feed, $result);
if ($response = $http->get($url)) {
$effective_url = $response['effective_url'];
if (!url_allowed($effective_url)) die('URL blocked');
$html = $response['body'];
// remove strange things here
$html = str_replace('</[>', '', $html);
$html = convert_to_utf8($html, $response['headers']);
} else {
die('Error retrieving '.$url);
@ -512,9 +539,13 @@ if ($html_only || !$result) {
if ($extract_pattern) {
// get outerHTML
$content = $content_block->ownerDocument->saveXML($content_block);
} else {
if ($content_block->childNodes->length == 1 && $content_block->firstChild->nodeType === XML_ELEMENT_NODE) {
$content = $content_block->firstChild->innerHTML;
} else {
$content = $content_block->innerHTML;
}
}
if ($links == 'remove') {
$content = preg_replace('!</?a[^>]*>!', '', $content);
}
@ -586,7 +617,7 @@ foreach ($items as $key => $item) {
$urls[$key] = $permalink;
}
$http->fetchAll($urls_sanitized);
$http->cacheAll();
//$http->cacheAll();
foreach ($items as $key => $item) {
$extract_result = false;
@ -606,9 +637,12 @@ foreach ($items as $key => $item) {
$newitem->setLink($item->get_permalink());
}
}
if ($permalink && $response = $http->get($permalink)) {
if ($permalink && $response = $http->get($permalink, true)) {
$effective_url = $response['effective_url'];
if (!url_allowed($effective_url)) continue;
$html = $response['body'];
// remove strange things here
$html = str_replace('</[>', '', $html);
$html = convert_to_utf8($html, $response['headers']);
if ($auto_extract) {
// Run through Tidy (if it exists).
@ -660,9 +694,13 @@ foreach ($items as $key => $item) {
if ($extract_pattern) {
// get outerHTML
$html = $content_block->ownerDocument->saveXML($content_block);
} else {
if ($content_block->childNodes->length == 1 && $content_block->firstChild->nodeType === XML_ELEMENT_NODE) {
$html = $content_block->firstChild->innerHTML;
} else {
$html = $content_block->innerHTML;
}
}
// post-processing cleanup
$html = preg_replace('!<p>[\s\h\v]*</p>!u', '', $html);
if ($links == 'remove') {