full-text-rss/libraries/humble-http-agent/HumbleHttpAgent.php

480 lines
17 KiB
PHP
Raw Normal View History

2011-01-11 18:06:12 +00:00
<?php
/**
 * Humble HTTP Agent
 *
 * This class is designed to take advantage of parallel HTTP requests
 * offered by PHP's PECL HTTP extension or the curl_multi_* functions.
 * For environments which do not have these options, it reverts to standard
 * sequential requests (using file_get_contents()).
 *
 * @version 0.8
 * @date 2011-02-28
 * @see http://php.net/HttpRequestPool
 * @author Keyvan Minoukadeh
 * @copyright 2011 Keyvan Minoukadeh
 * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
 */
class HumbleHttpAgent
{
    /** Transport: PECL HTTP extension (HttpRequestPool), parallel. */
    const METHOD_REQUEST_POOL = 1;
    /** Transport: curl_multi_* via RollingCurl, parallel. */
    const METHOD_CURL_MULTI = 2;
    /** Transport: file_get_contents(), sequential. */
    const METHOD_FILE_GET_CONTENTS = 4;

    /** @var array responses keyed by the originally requested URL */
    protected $requests = array();
    /** @var array pending redirects: original URL => URL to fetch next */
    protected $redirectQueue = array();
    protected $requestOptions;
    protected $maxParallelRequests = 5;
    protected $cache = null; //TODO
    protected $httpContext;
    protected $minimiseMemoryUse = false; //TODO
    protected $debug = false;
    protected $method;
    protected $cookieJar;

    public $rewriteHashbangFragment = true; // see http://code.google.com/web/ajaxcrawling/docs/specification.html
    public $maxRedirects = 5;

    //TODO: prevent certain file/mime types
    //TODO: set max file size
    //TODO: normalise headers

    /**
     * @param array|null $requestOptions overrides for the defaults ('timeout', 'redirect')
     * @param int|null   $method         one of the METHOD_* constants; any other value
     *                                   selects the best transport available
     */
    public function __construct($requestOptions=null, $method=null) {
        // set the request method
        $supported = array(self::METHOD_REQUEST_POOL, self::METHOD_CURL_MULTI, self::METHOD_FILE_GET_CONTENTS);
        if (in_array($method, $supported)) {
            $this->method = (int)$method;
        } elseif (class_exists('HttpRequestPool')) {
            $this->method = self::METHOD_REQUEST_POOL;
        } elseif (function_exists('curl_multi_init')) {
            $this->method = self::METHOD_CURL_MULTI;
        } else {
            $this->method = self::METHOD_FILE_GET_CONTENTS;
        }
        if ($this->method == self::METHOD_CURL_MULTI) {
            require_once(dirname(__FILE__).'/RollingCurl.php');
        }
        // create cookie jar
        $this->cookieJar = new CookieJar();
        // set request options (redirect must be 0)
        $this->requestOptions = array(
            'timeout' => 10,
            'redirect' => 0 // we handle redirects manually so we can rewrite the new hashbang URLs that are creeping up over the web
            // TODO: test onprogress?
        );
        if (is_array($requestOptions)) {
            $this->requestOptions = array_merge($this->requestOptions, $requestOptions);
        }
        $this->httpContext = array(
            'http' => array(
                'ignore_errors' => true,
                'timeout' => $this->requestOptions['timeout'],
                'max_redirects' => $this->requestOptions['redirect'],
                'header' => "User-Agent: PHP/".phpversion()."\r\n".
                            "Accept: */*\r\n"
            )
        );
    }

    /**
     * Echo a debug line with current/peak memory use (in KiB) when debugging is enabled.
     *
     * @param mixed $msg anything that can be echoed
     */
    protected function debug($msg) {
        if (!$this->debug) return;
        $mem = round(memory_get_usage()/1024, 2);
        $memPeak = round(memory_get_peak_usage()/1024, 2);
        echo '* ',$msg;
        echo ' - mem used: ',$mem," (peak: $memPeak)\n";
        // ob_flush() raises a notice when no output buffer is active
        if (ob_get_level() > 0) ob_flush();
        flush();
    }

    /**
     * Rewrite a '#!' (hashbang) URL to its '_escaped_fragment_' form.
     * URLs without '#!' are returned unchanged.
     *
     * @param string $url
     * @return string
     */
    public function rewriteHashbangFragment($url) {
        // return $url if there's no '#!'
        if (strpos($url, '#!') === false) return $url;
        // split $url and rewrite
        $iri = new IRI($url);
        $fragment = substr($iri->ifragment, 1); // strip '!'
        $iri->fragment = null;
        if (isset($iri->iquery)) {
            parse_str($iri->iquery, $query);
        } else {
            $query = array();
        }
        $query['_escaped_fragment_'] = (string)$fragment;
        $iri->query = str_replace('%2F', '/', http_build_query($query)); // needed for some sites
        return $iri->uri;
    }

    public function enableDebug($bool=true) {
        $this->debug = (bool)$bool;
    }

    public function minimiseMemoryUse($bool = true) {
        $this->minimiseMemoryUse = $bool;
    }

    /**
     * @param int $max requests per parallel batch; values below 1 are raised to 1,
     *                 because a batch size of 0 would never drain the URL list
     */
    public function setMaxParallelRequests($max) {
        $this->maxParallelRequests = max(1, (int)$max);
    }

    /**
     * Sanitise and validate a URL; only http:// and https:// URLs are accepted.
     *
     * @param string $url
     * @return string|false the sanitised URL, or false when it is not acceptable
     */
    public function validateUrl($url) {
        $url = filter_var($url, FILTER_SANITIZE_URL);
        // FILTER_FLAG_SCHEME_REQUIRED is intentionally not passed: FILTER_VALIDATE_URL
        // always implied it, and the constant no longer exists as of PHP 8.0.
        $test = filter_var($url, FILTER_VALIDATE_URL);
        // deal with bug http://bugs.php.net/51192 (present in PHP 5.2.13 and PHP 5.3.2)
        if ($test === false) {
            $test = filter_var(strtr($url, '-', '_'), FILTER_VALIDATE_URL);
        }
        if ($test !== false && $test !== null && preg_match('!^https?://!', $url)) {
            return $url;
        }
        return false;
    }

    /**
     * Set cache object.
     * The cache object passed should implement Zend_Cache_Backend_Interface
     * @param Zend_Cache_Backend_Interface
     */
    /* all disk caching temporarily disabled - needs work
    public function useCache($cache) {
        $this->cache = $cache;
    }

    public function isCached($url) {
        if (!isset($this->cache)) return false;
        return ($this->cache->test(md5($url)) !== false);
    }

    public function getCached($url) {
        $cached = $this->cache->load(md5($url));
        $cached['fromCache'] = true;
        return $cached;
    }

    public function cache($url) {
        if (isset($this->cache) && !isset($this->requests[$url]['fromCache']) && isset($this->requests[$url]['body'])) {
            $this->debug("Saving to cache ($url)");
            $res = $this->cache->save($this->requests[$url], md5($url));
            //$res = @file_put_contents($this->cacheFolder.'/'.md5($url).'.txt', serialize($this->requests[$url]));
            return ($res !== false);
        }
        return false;
    }

    public function cacheAll() {
        if (isset($this->cache)) {
            foreach (array_keys($this->requests) as $url) {
                $this->cache($url);
            }
            return true;
        }
        return false;
    }
    */

    /**
     * Fetch all URLs, following redirects up to $maxRedirects rounds.
     * Results are retrieved afterwards with get().
     *
     * @param array $urls
     */
    public function fetchAll(array $urls) {
        $this->fetchAllOnce($urls, false);
        $redirects = 0;
        while (!empty($this->redirectQueue) && ++$redirects <= $this->maxRedirects) {
            $this->debug("Following redirects #$redirects...");
            $this->fetchAllOnce($this->redirectQueue, true);
        }
        // Anything still queued exceeded the redirect limit. Drop it so a later,
        // unrelated fetchAll() call does not resume following these redirects.
        $this->redirectQueue = array();
    }

    /**
     * Fetch all URLs without following redirects. Redirect responses are
     * recorded in the redirect queue instead.
     *
     * @param array $urls       list of URLs, or (when $isRedirect) a map of
     *                          original URL => redirect target
     * @param bool  $isRedirect whether $urls is a redirect map
     * @return false|null false when the HttpRequestPool transport throws an
     *                    HttpException, otherwise nothing
     */
    public function fetchAllOnce(array $urls, $isRedirect=false) {
        if (!$isRedirect) $urls = array_unique($urls);
        if (empty($urls)) return;

        if ($this->method == self::METHOD_REQUEST_POOL) {
            return $this->fetchWithRequestPool($urls, $isRedirect);
        }
        if ($this->method == self::METHOD_CURL_MULTI) {
            $this->fetchWithCurlMulti($urls, $isRedirect);
            return;
        }
        $this->fetchSequentially($urls, $isRedirect);
    }

    /**
     * Parallel fetch using HttpRequestPool, in batches of $maxParallelRequests.
     *
     * @return false|null false if an HttpException was thrown
     */
    protected function fetchWithRequestPool(array $urls, $isRedirect) {
        $this->debug('Starting parallel fetch (HttpRequestPool)');
        try {
            foreach ($this->batchUrls($urls) as $subset) {
                $this->debug('Processing set of '.count($subset));
                $pool = new HttpRequestPool();
                // only the URLs actually attached to the pool; in-memory URLs have
                // no 'httpRequest' entry and must not be post-processed
                $sent = array();
                foreach ($subset as $orig => $url) {
                    if (!$isRedirect) $orig = $url;
                    if (!$this->needsFetching($orig, $url, $isRedirect)) continue;
                    // TODO: consult the disk cache here once caching is re-enabled
                    $this->debug("......adding to pool");
                    $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($url) : $url;
                    $httpRequest = new HttpRequest($req_url, HttpRequest::METH_GET, $this->requestOptions);
                    // send cookies, if we have any
                    if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
                        $this->debug("......sending cookies: $cookies");
                        $httpRequest->addHeaders(array('Cookie' => $cookies));
                    }
                    $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest);
                    $this->requests[$orig]['original_url'] = $orig;
                    $pool->attach($httpRequest);
                    $sent[] = array($orig, $url);
                }
                // did we get anything into the pool?
                if (empty($sent)) continue;

                $this->debug('Sending request...');
                $pool->send();
                $this->debug('Received responses');
                foreach ($sent as $pair) {
                    list($orig, $url) = $pair;
                    $request = $this->requests[$orig]['httpRequest'];
                    // getResponseHeader() doesn't return status line, so, for consistency...
                    $this->requests[$orig]['headers'] = substr($request->getRawResponseMessage(), 0, $request->getResponseInfo('header_size'));
                    $this->requests[$orig]['body'] = $request->getResponseBody();
                    $this->requests[$orig]['effective_url'] = $request->getResponseInfo('effective_url');
                    $this->requests[$orig]['status_code'] = $status_code = $request->getResponseCode();
                    // is redirect?
                    if ($this->isRedirectStatus($status_code) && ($location = $request->getResponseHeader('location'))) {
                        // a single Set-Cookie header may come back as a plain string
                        $setCookie = $request->getResponseHeader('set-cookie');
                        $this->queueRedirect($orig, $url, $location, $setCookie ? (array)$setCookie : array());
                    }
                    $pool->detach($request);
                    unset($this->requests[$orig]['httpRequest'], $request);
                }
            }
        } catch (HttpException $e) {
            $this->debug($e);
            return false;
        }
    }

    /**
     * Parallel fetch using curl_multi_* (RollingCurl), in batches of
     * $maxParallelRequests. Responses arrive through handleCurlResponse().
     */
    protected function fetchWithCurlMulti(array $urls, $isRedirect) {
        $this->debug('Starting parallel fetch (curl_multi_*)');
        foreach ($this->batchUrls($urls) as $subset) {
            $this->debug('Processing set of '.count($subset));
            $pool = new RollingCurl(array($this, 'handleCurlResponse'));
            $pool->window_size = count($subset);
            $sent = array();
            foreach ($subset as $orig => $url) {
                if (!$isRedirect) $orig = $url;
                if (!$this->needsFetching($orig, $url, $isRedirect)) continue;
                // TODO: consult the disk cache here once caching is re-enabled
                $this->debug("......adding to pool");
                $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($url) : $url;
                $headers = array();
                // send cookies, if we have any
                if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
                    $this->debug("......sending cookies: $cookies");
                    $headers[] = 'Cookie: '.$cookies;
                }
                $httpRequest = new RollingCurlRequest($req_url, 'GET', null, $headers, array(
                    CURLOPT_CONNECTTIMEOUT => $this->requestOptions['timeout'],
                    CURLOPT_TIMEOUT => $this->requestOptions['timeout']
                ));
                $httpRequest->set_original_url($orig);
                $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest);
                $this->requests[$orig]['original_url'] = $orig; // TODO: is this needed anymore?
                $pool->add($httpRequest);
                $sent[] = array($orig, $url);
            }
            // did we get anything into the pool?
            if (empty($sent)) continue;

            $this->debug('Sending request...');
            $pool->execute(); // this will call handleCurlResponse() and populate $this->requests[$orig]
            $this->debug('Received responses');
            foreach ($sent as $pair) {
                list($orig, $url) = $pair;
                // status_code is absent if the response callback never ran for this request
                $status_code = isset($this->requests[$orig]['status_code']) ? $this->requests[$orig]['status_code'] : null;
                if ($this->isRedirectStatus($status_code) && isset($this->requests[$orig]['location'])) {
                    $this->queueRedirect($orig, $url, $this->requests[$orig]['location']);
                }
                unset($this->requests[$orig]['httpRequest']);
            }
        }
    }

    /**
     * Sequential fetch using file_get_contents().
     */
    protected function fetchSequentially(array $urls, $isRedirect) {
        $this->debug('Starting sequential fetch (file_get_contents)');
        $this->debug('Processing set of '.count($urls));
        foreach ($urls as $orig => $url) {
            if (!$isRedirect) $orig = $url;
            if (!$this->needsFetching($orig, $url, $isRedirect)) continue;
            // TODO: consult the disk cache here once caching is re-enabled
            $this->debug("Sending request for $url");
            // start from a clean slate (as the parallel transports do) so nothing
            // from a previous redirect hop - e.g. a stale 'location' - leaks through
            $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'original_url'=>$orig);
            $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($url) : $url;
            // send cookies, if we have any
            $httpContext = $this->httpContext;
            if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
                $this->debug("......sending cookies: $cookies");
                $httpContext['http']['header'] .= 'Cookie: '.$cookies."\r\n";
            }
            // warnings are suppressed on purpose: failure is reported via the return value
            $html = @file_get_contents($req_url, false, stream_context_create($httpContext));
            if ($html === false) {
                $this->debug('Error retrieving URL');
                // TODO: handle error - failed to retrieve URL
                continue;
            }
            $this->debug('Received response');
            // get status code ($http_response_header is populated by the HTTP stream wrapper)
            if (!isset($http_response_header[0]) || !preg_match('!^HTTP/\d+\.\d+\s+(\d+)!', trim($http_response_header[0]), $match)) {
                $this->debug('Error: no status code found');
                // TODO: handle error - no status code
                continue;
            }
            $this->requests[$orig]['headers'] = $this->headersToString($http_response_header, false);
            $this->requests[$orig]['body'] = $html;
            $this->requests[$orig]['effective_url'] = $req_url;
            $this->requests[$orig]['status_code'] = $status_code = (int)$match[1];
            unset($match);
            // handle redirect
            $location = $this->extractLocation($this->requests[$orig]['headers']);
            if ($location !== null) {
                $this->requests[$orig]['location'] = $location;
            }
            if ($this->isRedirectStatus($status_code) && isset($this->requests[$orig]['location'])) {
                $this->queueRedirect($orig, $url, $this->requests[$orig]['location']);
            }
        }
    }

    /**
     * Split the URL list into batches of at most $maxParallelRequests, keeping
     * keys (the redirect map is keyed by original URL).
     *
     * @return array list of batches
     */
    protected function batchUrls(array $urls) {
        return array_chunk($urls, max(1, (int)$this->maxParallelRequests), true);
    }

    /**
     * Common per-URL preamble: remove the URL from the redirect queue and
     * decide whether it still has to be requested.
     *
     * @return bool false when a response for $url is already held in memory
     */
    protected function needsFetching($orig, $url, $isRedirect) {
        unset($this->redirectQueue[$orig]);
        $this->debug("...$url");
        if (!$isRedirect && isset($this->requests[$url])) {
            $this->debug("......in memory");
            return false;
        }
        return true;
    }

    /**
     * @param int|null $status_code
     * @return bool true for 300-303, 307 and 308-399
     */
    protected function isRedirectStatus($status_code) {
        return in_array($status_code, array(300, 301, 302, 303, 307))
            || ($status_code > 307 && $status_code < 400);
    }

    /**
     * Find the value of the Location header in a raw header block.
     * Header names are matched case-insensitively.
     *
     * @param string $headers
     * @return string|null trimmed header value, or null when absent
     */
    protected function extractLocation($headers) {
        if (preg_match('/^Location:(.*?)$/mi', (string)$headers, $match)) {
            return trim($match[1]);
        }
        return null;
    }

    /**
     * Resolve a redirect target against the URL that produced it and, if it is
     * valid, store any cookies the response set and queue the target for the
     * next round.
     *
     * @param string     $orig     originally requested URL (key into $requests)
     * @param string     $url      URL that returned the redirect
     * @param string     $location raw Location header value
     * @param array|null $cookies  Set-Cookie values; null means extract them from
     *                             the stored response headers
     */
    protected function queueRedirect($orig, $url, $location, $cookies=null) {
        $redirectURL = SimplePie_Misc::absolutize_url($location, $url);
        if (!$this->validateUrl($redirectURL)) {
            $this->debug('Redirect detected. Invalid URL: '.$redirectURL);
            return;
        }
        $this->debug('Redirect detected. Valid URL: '.$redirectURL);
        // store any cookies
        if ($cookies === null) {
            $cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']);
        }
        if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies);
        $this->redirectQueue[$orig] = $redirectURL;
    }

    /**
     * RollingCurl callback: record a finished response under its original URL.
     *
     * @param string $response raw response (headers followed by body)
     * @param array  $info     curl info array; 'header_size', 'url' and 'http_code' are read
     * @param object $request  request object; its url_original property is read
     */
    public function handleCurlResponse($response, $info, $request) {
        $orig = $request->url_original;
        $response = (string)$response;
        $this->requests[$orig]['headers'] = substr($response, 0, $info['header_size']);
        $this->requests[$orig]['body'] = substr($response, $info['header_size']);
        $this->requests[$orig]['effective_url'] = $info['url'];
        $this->requests[$orig]['status_code'] = (int)$info['http_code'];
        $location = $this->extractLocation($this->requests[$orig]['headers']);
        if ($location !== null) {
            $this->requests[$orig]['location'] = $location;
        }
    }

    /**
     * Flatten headers into a newline-separated string.
     *
     * @param array $headers     list of raw header lines, or name => value(s) map
     * @param bool  $associative whether $headers is a name => value(s) map
     * @return string
     */
    protected function headersToString(array $headers, $associative=true) {
        if (!$associative) {
            return implode("\n", $headers);
        }
        $str = '';
        foreach ($headers as $key => $val) {
            if (is_array($val)) {
                foreach ($val as $v) $str .= "$key: $v\n";
            } else {
                $str .= "$key: $val\n";
            }
        }
        return rtrim($str);
    }

    /**
     * Return the response for a URL, fetching it first if it is not in memory.
     *
     * @param string $url
     * @param bool   $remove drop the response from memory after returning it
     * @return array|false response array (keys such as 'headers', 'body',
     *                     'effective_url', 'status_code'), or false on failure
     */
    public function get($url, $remove=false) {
        if (isset($this->requests[$url]['body'])) {
            $effective = isset($this->requests[$url]['effective_url']) ? $this->requests[$url]['effective_url'] : '';
            $this->debug("URL already fetched - in memory ($url, effective: $effective)");
            $response = $this->requests[$url];
        } else {
            $this->debug("Fetching URL ($url)");
            $this->fetchAll(array($url));
            if (isset($this->requests[$url]['body'])) {
                $response = $this->requests[$url];
            } else {
                $this->debug("Request failed");
                $response = false;
            }
        }
        if ($remove && $response) unset($this->requests[$url]);
        return $response;
    }

    /**
     * @return bool whether a parallel transport (HttpRequestPool or curl_multi) is available
     */
    public function parallelSupport() {
        return class_exists('HttpRequestPool') || function_exists('curl_multi_init');
    }
}
?>