Cookie handling only on redirect (Dave's patch)
This commit is contained in:
parent
e7753953f6
commit
daedf214fe
@ -2,6 +2,10 @@ FiveFilters.org: Full-Text RSS
|
|||||||
http://fivefilters.org/content-only/
|
http://fivefilters.org/content-only/
|
||||||
CHANGELOG
|
CHANGELOG
|
||||||
------------------------------------
|
------------------------------------
|
||||||
|
|
||||||
|
3.4.1 (unreleased)
|
||||||
|
- Backported Dave Vasilevsky's cookie patch. Fixes issues with certain sites. See https://gist.github.com/fivefilters/0a758b6d64ce4fb5728c
|
||||||
|
|
||||||
3.4 (2014-09-08)
|
3.4 (2014-09-08)
|
||||||
- New request parameter: siteconfig lets you submit extraction rules directly in request
|
- New request parameter: siteconfig lets you submit extraction rules directly in request
|
||||||
- New request parameter: accept=(auto|feed|html) determines what we'll accept as a response (deprecates html=1 parameter)
|
- New request parameter: accept=(auto|feed|html) determines what we'll accept as a response (deprecates html=1 parameter)
|
||||||
|
@ -229,7 +229,7 @@ class CookieJar
|
|||||||
}
|
}
|
||||||
|
|
||||||
// return array of set-cookie values extracted from HTTP response headers (string $h)
|
// return array of set-cookie values extracted from HTTP response headers (string $h)
|
||||||
public function extractCookies($h) {
|
public static function extractCookies($h) {
|
||||||
$x = 0;
|
$x = 0;
|
||||||
$lines = 0;
|
$lines = 0;
|
||||||
$headers = array();
|
$headers = array();
|
||||||
|
@ -34,7 +34,7 @@ class HumbleHttpAgent
|
|||||||
protected $curlOptions;
|
protected $curlOptions;
|
||||||
protected $minimiseMemoryUse = false; //TODO
|
protected $minimiseMemoryUse = false; //TODO
|
||||||
protected $method;
|
protected $method;
|
||||||
protected $cookieJar;
|
protected $cookieJar = array();
|
||||||
public $debug = false;
|
public $debug = false;
|
||||||
public $debugVerbose = false;
|
public $debugVerbose = false;
|
||||||
public $rewriteHashbangFragment = true; // see http://code.google.com/web/ajaxcrawling/docs/specification.html
|
public $rewriteHashbangFragment = true; // see http://code.google.com/web/ajaxcrawling/docs/specification.html
|
||||||
@ -79,7 +79,7 @@ class HumbleHttpAgent
|
|||||||
require_once(dirname(__FILE__).'/RollingCurl.php');
|
require_once(dirname(__FILE__).'/RollingCurl.php');
|
||||||
}
|
}
|
||||||
// create cookie jar
|
// create cookie jar
|
||||||
$this->cookieJar = new CookieJar();
|
// $this->cookieJar = new CookieJar();
|
||||||
// set request options (redirect must be 0)
|
// set request options (redirect must be 0)
|
||||||
// HTTP PECL (http://php.net/manual/en/http.request.options.php)
|
// HTTP PECL (http://php.net/manual/en/http.request.options.php)
|
||||||
$this->requestOptions = array(
|
$this->requestOptions = array(
|
||||||
@ -284,6 +284,7 @@ class HumbleHttpAgent
|
|||||||
$this->debug("Following redirects #$redirects...");
|
$this->debug("Following redirects #$redirects...");
|
||||||
$this->fetchAllOnce($this->redirectQueue, $isRedirect=true);
|
$this->fetchAllOnce($this->redirectQueue, $isRedirect=true);
|
||||||
}
|
}
|
||||||
|
$this->deleteCookies();
|
||||||
}
|
}
|
||||||
|
|
||||||
// fetch all URLs without following redirects
|
// fetch all URLs without following redirects
|
||||||
@ -326,7 +327,7 @@ class HumbleHttpAgent
|
|||||||
}
|
}
|
||||||
$httpRequest = new HttpRequest($req_url, $_meth, $this->requestOptions);
|
$httpRequest = new HttpRequest($req_url, $_meth, $this->requestOptions);
|
||||||
// send cookies, if we have any
|
// send cookies, if we have any
|
||||||
if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
|
if ($cookies = $this->getCookies($orig, $req_url)) {
|
||||||
$this->debug("......sending cookies: $cookies");
|
$this->debug("......sending cookies: $cookies");
|
||||||
$httpRequest->addHeaders(array('Cookie' => $cookies));
|
$httpRequest->addHeaders(array('Cookie' => $cookies));
|
||||||
}
|
}
|
||||||
@ -374,10 +375,7 @@ class HumbleHttpAgent
|
|||||||
}
|
}
|
||||||
if ($this->validateURL($redirectURL)) {
|
if ($this->validateURL($redirectURL)) {
|
||||||
$this->debug('Redirect detected. Valid URL: '.$redirectURL);
|
$this->debug('Redirect detected. Valid URL: '.$redirectURL);
|
||||||
// store any cookies
|
$this->storeCookies($orig, $url);
|
||||||
$cookies = $request->getResponseHeader('set-cookie');
|
|
||||||
if ($cookies && !is_array($cookies)) $cookies = array($cookies);
|
|
||||||
if ($cookies) $this->cookieJar->storeCookies($url, $cookies);
|
|
||||||
$this->redirectQueue[$orig] = $redirectURL;
|
$this->redirectQueue[$orig] = $redirectURL;
|
||||||
} else {
|
} else {
|
||||||
$this->debug('Redirect detected. Invalid URL: '.$redirectURL);
|
$this->debug('Redirect detected. Invalid URL: '.$redirectURL);
|
||||||
@ -459,7 +457,7 @@ class HumbleHttpAgent
|
|||||||
// add referer for picky sites
|
// add referer for picky sites
|
||||||
$headers[] = 'Referer: '.$this->referer;
|
$headers[] = 'Referer: '.$this->referer;
|
||||||
// send cookies, if we have any
|
// send cookies, if we have any
|
||||||
if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
|
if ($cookies = $this->getCookies($orig, $req_url)) {
|
||||||
$this->debug("......sending cookies: $cookies");
|
$this->debug("......sending cookies: $cookies");
|
||||||
$headers[] = 'Cookie: '.$cookies;
|
$headers[] = 'Cookie: '.$cookies;
|
||||||
}
|
}
|
||||||
@ -496,9 +494,7 @@ class HumbleHttpAgent
|
|||||||
}
|
}
|
||||||
if ($this->validateURL($redirectURL)) {
|
if ($this->validateURL($redirectURL)) {
|
||||||
$this->debug('Redirect detected. Valid URL: '.$redirectURL);
|
$this->debug('Redirect detected. Valid URL: '.$redirectURL);
|
||||||
// store any cookies
|
$this->storeCookies($orig, $url);
|
||||||
$cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']);
|
|
||||||
if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies);
|
|
||||||
$this->redirectQueue[$orig] = $redirectURL;
|
$this->redirectQueue[$orig] = $redirectURL;
|
||||||
} else {
|
} else {
|
||||||
$this->debug('Redirect detected. Invalid URL: '.$redirectURL);
|
$this->debug('Redirect detected. Invalid URL: '.$redirectURL);
|
||||||
@ -557,7 +553,7 @@ class HumbleHttpAgent
|
|||||||
$httpContext['http']['header'] .= $this->getUserAgent($req_url)."\r\n";
|
$httpContext['http']['header'] .= $this->getUserAgent($req_url)."\r\n";
|
||||||
// add referer for picky sites
|
// add referer for picky sites
|
||||||
$httpContext['http']['header'] .= 'Referer: '.$this->referer."\r\n";
|
$httpContext['http']['header'] .= 'Referer: '.$this->referer."\r\n";
|
||||||
if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
|
if ($cookies = $this->getCookies($orig, $req_url)) {
|
||||||
$this->debug("......sending cookies: $cookies");
|
$this->debug("......sending cookies: $cookies");
|
||||||
$httpContext['http']['header'] .= 'Cookie: '.$cookies."\r\n";
|
$httpContext['http']['header'] .= 'Cookie: '.$cookies."\r\n";
|
||||||
}
|
}
|
||||||
@ -589,9 +585,7 @@ class HumbleHttpAgent
|
|||||||
}
|
}
|
||||||
if ($this->validateURL($redirectURL)) {
|
if ($this->validateURL($redirectURL)) {
|
||||||
$this->debug('Redirect detected. Valid URL: '.$redirectURL);
|
$this->debug('Redirect detected. Valid URL: '.$redirectURL);
|
||||||
// store any cookies
|
$this->storeCookies($orig, $url);
|
||||||
$cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']);
|
|
||||||
if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies);
|
|
||||||
$this->redirectQueue[$orig] = $redirectURL;
|
$this->redirectQueue[$orig] = $redirectURL;
|
||||||
} else {
|
} else {
|
||||||
$this->debug('Redirect detected. Invalid URL: '.$redirectURL);
|
$this->debug('Redirect detected. Invalid URL: '.$redirectURL);
|
||||||
@ -709,6 +703,30 @@ class HumbleHttpAgent
|
|||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Return the Cookie header value to send with a request, if any.
 *
 * Cookie jars are kept per original request: $this->cookieJar is an
 * array keyed by $orig, and a jar is only created once a redirect
 * response for that request has set cookies.
 *
 * @param string $orig    key of the original request (same key used for
 *                        $this->requests and $this->redirectQueue)
 * @param string $req_url URL about to be requested
 * @return mixed null when no jar exists for $orig, otherwise whatever
 *               CookieJar::getMatchingCookies() returns for $req_url
 */
protected function getCookies($orig, $req_url) {
	// Test the array element itself: copying it into a local first would
	// raise an undefined index/array key notice when no jar exists yet.
	if (!isset($this->cookieJar[$orig])) {
		return null;
	}
	return $this->cookieJar[$orig]->getMatchingCookies($req_url);
}
|
||||||
|
|
||||||
|
/**
 * Record cookies set by the response to an original request.
 *
 * Reads the raw response headers saved under $this->requests[$orig],
 * extracts any Set-Cookie values and stores them in the cookie jar
 * belonging to $orig, creating that jar on first use. Does nothing
 * when the response set no cookies.
 *
 * @param string $orig key of the original request
 * @param string $url  URL the cookies are stored against
 * @return void
 */
protected function storeCookies($orig, $url) {
	$setCookies = CookieJar::extractCookies($this->requests[$orig]['headers']);
	if (!empty($setCookies)) {
		// jars are created lazily, one per original request
		if (!isset($this->cookieJar[$orig])) {
			$this->cookieJar[$orig] = new CookieJar();
		}
		$this->cookieJar[$orig]->storeCookies($url, $setCookies);
	}
}
|
||||||
|
|
||||||
|
/**
 * Discard every per-request cookie jar.
 *
 * Resets $this->cookieJar to an empty array so no cookies stored so far
 * are available to later requests.
 *
 * @return void
 */
protected function deleteCookies() {
	$this->cookieJar = array();
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// gzdecode from http://www.php.net/manual/en/function.gzdecode.php#82930
|
// gzdecode from http://www.php.net/manual/en/function.gzdecode.php#82930
|
||||||
|
Loading…
Reference in New Issue
Block a user