Cookie handling only on redirect (Dave's patch)

This commit is contained in:
FiveFilters.org 2015-06-14 02:28:37 +02:00
parent e7753953f6
commit daedf214fe
3 changed files with 38 additions and 16 deletions

View File

@ -2,6 +2,10 @@ FiveFilters.org: Full-Text RSS
http://fivefilters.org/content-only/
CHANGELOG
------------------------------------
3.4.1 (unreleased)
- Backported Dave Vasilevsky's cookie patch. Fixes issues with certain sites. See https://gist.github.com/fivefilters/0a758b6d64ce4fb5728c
3.4 (2014-09-08)
- New request parameter: siteconfig lets you submit extraction rules directly in request
- New request parameter: accept=(auto|feed|html) determines what we'll accept as a response (deprecates html=1 parameter)

View File

@ -229,7 +229,7 @@ class CookieJar
}
// return array of set-cookie values extracted from HTTP response headers (string $h)
public function extractCookies($h) {
public static function extractCookies($h) {
$x = 0;
$lines = 0;
$headers = array();

View File

@ -34,7 +34,7 @@ class HumbleHttpAgent
protected $curlOptions;
protected $minimiseMemoryUse = false; //TODO
protected $method;
protected $cookieJar;
protected $cookieJar = array();
public $debug = false;
public $debugVerbose = false;
public $rewriteHashbangFragment = true; // see http://code.google.com/web/ajaxcrawling/docs/specification.html
@ -79,7 +79,7 @@ class HumbleHttpAgent
require_once(dirname(__FILE__).'/RollingCurl.php');
}
// create cookie jar
$this->cookieJar = new CookieJar();
// $this->cookieJar = new CookieJar();
// set request options (redirect must be 0)
// HTTP PECL (http://php.net/manual/en/http.request.options.php)
$this->requestOptions = array(
@ -284,6 +284,7 @@ class HumbleHttpAgent
$this->debug("Following redirects #$redirects...");
$this->fetchAllOnce($this->redirectQueue, $isRedirect=true);
}
$this->deleteCookies();
}
// fetch all URLs without following redirects
@ -326,7 +327,7 @@ class HumbleHttpAgent
}
$httpRequest = new HttpRequest($req_url, $_meth, $this->requestOptions);
// send cookies, if we have any
if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
if ($cookies = $this->getCookies($orig, $req_url)) {
$this->debug("......sending cookies: $cookies");
$httpRequest->addHeaders(array('Cookie' => $cookies));
}
@ -374,10 +375,7 @@ class HumbleHttpAgent
}
if ($this->validateURL($redirectURL)) {
$this->debug('Redirect detected. Valid URL: '.$redirectURL);
// store any cookies
$cookies = $request->getResponseHeader('set-cookie');
if ($cookies && !is_array($cookies)) $cookies = array($cookies);
if ($cookies) $this->cookieJar->storeCookies($url, $cookies);
$this->storeCookies($orig, $url);
$this->redirectQueue[$orig] = $redirectURL;
} else {
$this->debug('Redirect detected. Invalid URL: '.$redirectURL);
@ -459,7 +457,7 @@ class HumbleHttpAgent
// add referer for picky sites
$headers[] = 'Referer: '.$this->referer;
// send cookies, if we have any
if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
if ($cookies = $this->getCookies($orig, $req_url)) {
$this->debug("......sending cookies: $cookies");
$headers[] = 'Cookie: '.$cookies;
}
@ -496,9 +494,7 @@ class HumbleHttpAgent
}
if ($this->validateURL($redirectURL)) {
$this->debug('Redirect detected. Valid URL: '.$redirectURL);
// store any cookies
$cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']);
if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies);
$this->storeCookies($orig, $url);
$this->redirectQueue[$orig] = $redirectURL;
} else {
$this->debug('Redirect detected. Invalid URL: '.$redirectURL);
@ -557,7 +553,7 @@ class HumbleHttpAgent
$httpContext['http']['header'] .= $this->getUserAgent($req_url)."\r\n";
// add referer for picky sites
$httpContext['http']['header'] .= 'Referer: '.$this->referer."\r\n";
if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
if ($cookies = $this->getCookies($orig, $req_url)) {
$this->debug("......sending cookies: $cookies");
$httpContext['http']['header'] .= 'Cookie: '.$cookies."\r\n";
}
@ -589,9 +585,7 @@ class HumbleHttpAgent
}
if ($this->validateURL($redirectURL)) {
$this->debug('Redirect detected. Valid URL: '.$redirectURL);
// store any cookies
$cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']);
if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies);
$this->storeCookies($orig, $url);
$this->redirectQueue[$orig] = $redirectURL;
} else {
$this->debug('Redirect detected. Invalid URL: '.$redirectURL);
@ -709,6 +703,30 @@ class HumbleHttpAgent
}
return false;
}
// Return the cookies to send with a request to $req_url, taken from the jar
// kept for the original request $orig.
//   $orig    - key into $this->cookieJar identifying the original request
//   $req_url - URL about to be requested; handed to the jar for matching
// Returns the result of CookieJar::getMatchingCookies(), or null when no jar
// has been created for $orig.
protected function getCookies($orig, $req_url) {
    // Test the array element itself: copying $this->cookieJar[$orig] into a
    // local first raises an "undefined index" notice whenever no jar exists.
    if (!isset($this->cookieJar[$orig])) {
        return null;
    }
    return $this->cookieJar[$orig]->getMatchingCookies($req_url);
}
// Extract the cookies from the response headers recorded for $orig and store
// them, against $url, in the jar belonging to $orig.
//   $orig - key into $this->requests / $this->cookieJar for the original request
//   $url  - URL passed to CookieJar::storeCookies() alongside the cookies
// Does nothing when no headers were recorded or they contain no cookies.
protected function storeCookies($orig, $url) {
    // No recorded headers means there is nothing to extract; return instead
    // of reading an undefined index and passing null to the parser.
    if (!isset($this->requests[$orig]['headers'])) {
        return;
    }
    $cookies = CookieJar::extractCookies($this->requests[$orig]['headers']);
    if (empty($cookies)) {
        return;
    }
    // Jars are created lazily, one per original request, the first time that
    // request has cookies to store.
    if (!isset($this->cookieJar[$orig])) {
        $this->cookieJar[$orig] = new CookieJar();
    }
    $this->cookieJar[$orig]->storeCookies($url, $cookies);
}
// Discard all stored cookie jars by resetting $this->cookieJar to an empty array.
protected function deleteCookies() {
$this->cookieJar = array();
}
}
// gzdecode from http://www.php.net/manual/en/function.gzdecode.php#82930