Cookie handling only on redirect (Dave's patch)
This commit is contained in:
parent
e7753953f6
commit
daedf214fe
@ -2,6 +2,10 @@ FiveFilters.org: Full-Text RSS
|
||||
http://fivefilters.org/content-only/
|
||||
CHANGELOG
|
||||
------------------------------------
|
||||
|
||||
3.4.1 (unreleased)
|
||||
- Backporting Dave Vasilevsky cookie patch. Fixes issues with certain sites. See https://gist.github.com/fivefilters/0a758b6d64ce4fb5728c
|
||||
|
||||
3.4 (2014-09-08)
|
||||
- New request parameter: siteconfig lets you submit extraction rules directly in request
|
||||
- New request parameter: accept=(auto|feed|html) determines what we'll accept as a response (deprecates html=1 parameter)
|
||||
|
@ -229,7 +229,7 @@ class CookieJar
|
||||
}
|
||||
|
||||
// return array of set-cookie values extracted from HTTP response headers (string $h)
|
||||
public function extractCookies($h) {
|
||||
public static function extractCookies($h) {
|
||||
$x = 0;
|
||||
$lines = 0;
|
||||
$headers = array();
|
||||
|
@ -34,7 +34,7 @@ class HumbleHttpAgent
|
||||
protected $curlOptions;
|
||||
protected $minimiseMemoryUse = false; //TODO
|
||||
protected $method;
|
||||
protected $cookieJar;
|
||||
protected $cookieJar = array();
|
||||
public $debug = false;
|
||||
public $debugVerbose = false;
|
||||
public $rewriteHashbangFragment = true; // see http://code.google.com/web/ajaxcrawling/docs/specification.html
|
||||
@ -79,7 +79,7 @@ class HumbleHttpAgent
|
||||
require_once(dirname(__FILE__).'/RollingCurl.php');
|
||||
}
|
||||
// create cookie jar
|
||||
$this->cookieJar = new CookieJar();
|
||||
// $this->cookieJar = new CookieJar();
|
||||
// set request options (redirect must be 0)
|
||||
// HTTP PECL (http://php.net/manual/en/http.request.options.php)
|
||||
$this->requestOptions = array(
|
||||
@ -284,6 +284,7 @@ class HumbleHttpAgent
|
||||
$this->debug("Following redirects #$redirects...");
|
||||
$this->fetchAllOnce($this->redirectQueue, $isRedirect=true);
|
||||
}
|
||||
$this->deleteCookies();
|
||||
}
|
||||
|
||||
// fetch all URLs without following redirects
|
||||
@ -326,7 +327,7 @@ class HumbleHttpAgent
|
||||
}
|
||||
$httpRequest = new HttpRequest($req_url, $_meth, $this->requestOptions);
|
||||
// send cookies, if we have any
|
||||
if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
|
||||
if ($cookies = $this->getCookies($orig, $req_url)) {
|
||||
$this->debug("......sending cookies: $cookies");
|
||||
$httpRequest->addHeaders(array('Cookie' => $cookies));
|
||||
}
|
||||
@ -374,10 +375,7 @@ class HumbleHttpAgent
|
||||
}
|
||||
if ($this->validateURL($redirectURL)) {
|
||||
$this->debug('Redirect detected. Valid URL: '.$redirectURL);
|
||||
// store any cookies
|
||||
$cookies = $request->getResponseHeader('set-cookie');
|
||||
if ($cookies && !is_array($cookies)) $cookies = array($cookies);
|
||||
if ($cookies) $this->cookieJar->storeCookies($url, $cookies);
|
||||
$this->storeCookies($orig, $url);
|
||||
$this->redirectQueue[$orig] = $redirectURL;
|
||||
} else {
|
||||
$this->debug('Redirect detected. Invalid URL: '.$redirectURL);
|
||||
@ -459,7 +457,7 @@ class HumbleHttpAgent
|
||||
// add referer for picky sites
|
||||
$headers[] = 'Referer: '.$this->referer;
|
||||
// send cookies, if we have any
|
||||
if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
|
||||
if ($cookies = $this->getCookies($orig, $req_url)) {
|
||||
$this->debug("......sending cookies: $cookies");
|
||||
$headers[] = 'Cookie: '.$cookies;
|
||||
}
|
||||
@ -496,9 +494,7 @@ class HumbleHttpAgent
|
||||
}
|
||||
if ($this->validateURL($redirectURL)) {
|
||||
$this->debug('Redirect detected. Valid URL: '.$redirectURL);
|
||||
// store any cookies
|
||||
$cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']);
|
||||
if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies);
|
||||
$this->storeCookies($orig, $url);
|
||||
$this->redirectQueue[$orig] = $redirectURL;
|
||||
} else {
|
||||
$this->debug('Redirect detected. Invalid URL: '.$redirectURL);
|
||||
@ -557,7 +553,7 @@ class HumbleHttpAgent
|
||||
$httpContext['http']['header'] .= $this->getUserAgent($req_url)."\r\n";
|
||||
// add referer for picky sites
|
||||
$httpContext['http']['header'] .= 'Referer: '.$this->referer."\r\n";
|
||||
if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
|
||||
if ($cookies = $this->getCookies($orig, $req_url)) {
|
||||
$this->debug("......sending cookies: $cookies");
|
||||
$httpContext['http']['header'] .= 'Cookie: '.$cookies."\r\n";
|
||||
}
|
||||
@ -589,9 +585,7 @@ class HumbleHttpAgent
|
||||
}
|
||||
if ($this->validateURL($redirectURL)) {
|
||||
$this->debug('Redirect detected. Valid URL: '.$redirectURL);
|
||||
// store any cookies
|
||||
$cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']);
|
||||
if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies);
|
||||
$this->storeCookies($orig, $url);
|
||||
$this->redirectQueue[$orig] = $redirectURL;
|
||||
} else {
|
||||
$this->debug('Redirect detected. Invalid URL: '.$redirectURL);
|
||||
@ -709,6 +703,30 @@ class HumbleHttpAgent
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
 * Return the cookies to send with a request, taken from the jar kept for $orig.
 *
 * Jars are stored in $this->cookieJar keyed by $orig (presumably the originally
 * requested URL - confirm against callers), so cookies gathered for one key are
 * never offered to requests made under another.
 *
 * @param mixed  $orig    key of the jar to consult
 * @param string $req_url URL about to be requested
 * @return mixed whatever CookieJar::getMatchingCookies() returns, or null when
 *               no jar has been created for $orig
 */
protected function getCookies($orig, $req_url) {
	// Test the array element itself. Copying $this->cookieJar[$orig] into a
	// local first would raise an "undefined index" notice whenever no jar
	// exists yet, which is the case until storeCookies() has created one.
	if (!isset($this->cookieJar[$orig])) {
		return null;
	}
	return $this->cookieJar[$orig]->getMatchingCookies($req_url);
}
|
||||
|
||||
/**
 * Extract Set-Cookie values from the response headers recorded for $orig and
 * store them, against $url, in the jar belonging to $orig.
 *
 * The jar is created on first use; nothing is created or stored when the
 * response carries no cookies.
 *
 * @param mixed  $orig key into $this->requests and $this->cookieJar
 * @param string $url  URL the cookies are stored against (passed to CookieJar::storeCookies())
 * @return void
 */
protected function storeCookies($orig, $url) {
	// No recorded headers means no cookies to extract; bail out instead of
	// reading an undefined index and handing null to extractCookies().
	if (!isset($this->requests[$orig]['headers'])) {
		return;
	}
	$cookies = CookieJar::extractCookies($this->requests[$orig]['headers']);
	if (empty($cookies)) {
		return;
	}
	// Lazily create the jar for this key.
	if (!isset($this->cookieJar[$orig])) {
		$this->cookieJar[$orig] = new CookieJar();
	}
	$this->cookieJar[$orig]->storeCookies($url, $cookies);
}
|
||||
|
||||
/**
 * Discard every stored cookie jar by resetting $this->cookieJar to an empty array.
 *
 * @return void
 */
protected function deleteCookies() {
	$this->cookieJar = array();
}
|
||||
}
|
||||
|
||||
// gzdecode from http://www.php.net/manual/en/function.gzdecode.php#82930
|
||||
|
Loading…
Reference in New Issue
Block a user