From daedf214febe8d944141bb92e2a4c06b8e4a6c12 Mon Sep 17 00:00:00 2001
From: "FiveFilters.org"
Date: Sun, 14 Jun 2015 02:28:37 +0200
Subject: [PATCH] Cookie handling only on redirect (Dave's patch)

---
Review notes (ignored by git am, not part of the commit message):
 * getCookies() now tests isset($this->cookieJar[$orig]) directly. Copying the
   element into a local before calling isset() raised an undefined index
   notice on every request that had no jar yet.
 * storeCookies() returns early when no response headers were recorded for
   the request instead of reading a missing array element.
 * The three new helpers carry doc comments, so the last hunk and the
   diffstat are larger than in the upstream commit; the index line of
   HumbleHttpAgent.php still names the upstream blobs.
 * Context-line indentation was reconstructed; apply with
   "git apply --ignore-whitespace" if it does not match the target tree.

 changelog.txt                                 |  4 +
 libraries/humble-http-agent/CookieJar.php     |  2 +-
 .../humble-http-agent/HumbleHttpAgent.php     | 75 +++++++++++++++----
 3 files changed, 65 insertions(+), 16 deletions(-)

diff --git a/changelog.txt b/changelog.txt
index 3d3de96..1c29b0e 100644
--- a/changelog.txt
+++ b/changelog.txt
@@ -2,6 +2,10 @@ FiveFilters.org: Full-Text RSS
 http://fivefilters.org/content-only/
 CHANGELOG
 ------------------------------------
+
+3.4.1 (unreleased)
+ - Backporting Dave Vasilevsky cookie patch. Fixes issues with certain sites. See https://gist.github.com/fivefilters/0a758b6d64ce4fb5728c
+
 3.4 (2014-09-08)
  - New request parameter: siteconfig lets you submit extraction rules directly in request
  - New request paramter: accept=(auto|feed|html) determines what we'll accept as a response (deprecates html=1 parameter)
diff --git a/libraries/humble-http-agent/CookieJar.php b/libraries/humble-http-agent/CookieJar.php
index e4d5f49..4d2c4fe 100644
--- a/libraries/humble-http-agent/CookieJar.php
+++ b/libraries/humble-http-agent/CookieJar.php
@@ -229,7 +229,7 @@ class CookieJar
 	}
 
 	// return array of set-cookie values extracted from HTTP response headers (string $h)
-	public function extractCookies($h) {
+	public static function extractCookies($h) {
 		$x = 0;
 		$lines = 0;
 		$headers = array();
diff --git a/libraries/humble-http-agent/HumbleHttpAgent.php b/libraries/humble-http-agent/HumbleHttpAgent.php
index 23af46f..6f60fe9 100644
--- a/libraries/humble-http-agent/HumbleHttpAgent.php
+++ b/libraries/humble-http-agent/HumbleHttpAgent.php
@@ -34,7 +34,7 @@ class HumbleHttpAgent
 	protected $curlOptions;
 	protected $minimiseMemoryUse = false; //TODO
 	protected $method;
-	protected $cookieJar;
+	protected $cookieJar = array();
 	public $debug = false;
 	public $debugVerbose = false;
 	public $rewriteHashbangFragment = true; // see http://code.google.com/web/ajaxcrawling/docs/specification.html
@@ -79,7 +79,7 @@ class HumbleHttpAgent
 			require_once(dirname(__FILE__).'/RollingCurl.php');
 		}
 		// create cookie jar
-		$this->cookieJar = new CookieJar();
+		// $this->cookieJar = new CookieJar();
 		// set request options (redirect must be 0)
 		// HTTP PECL (http://php.net/manual/en/http.request.options.php)
 		$this->requestOptions = array(
@@ -284,6 +284,7 @@ class HumbleHttpAgent
 			$this->debug("Following redirects #$redirects...");
 			$this->fetchAllOnce($this->redirectQueue, $isRedirect=true);
 		}
+		$this->deleteCookies();
 	}
 
 	// fetch all URLs without following redirects
@@ -326,7 +327,7 @@ class HumbleHttpAgent
 					}
 					$httpRequest = new HttpRequest($req_url, $_meth, $this->requestOptions);
 					// send cookies, if we have any
-					if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
+					if ($cookies = $this->getCookies($orig, $req_url)) {
 						$this->debug("......sending cookies: $cookies");
 						$httpRequest->addHeaders(array('Cookie' => $cookies));
 					}
@@ -374,10 +375,7 @@ class HumbleHttpAgent
 							}
 							if ($this->validateURL($redirectURL)) {
 								$this->debug('Redirect detected. Valid URL: '.$redirectURL);
-								// store any cookies
-								$cookies = $request->getResponseHeader('set-cookie');
-								if ($cookies && !is_array($cookies)) $cookies = array($cookies);
-								if ($cookies) $this->cookieJar->storeCookies($url, $cookies);
+								$this->storeCookies($orig, $url);
 								$this->redirectQueue[$orig] = $redirectURL;
 							} else {
 								$this->debug('Redirect detected. Invalid URL: '.$redirectURL);
@@ -459,7 +457,7 @@ class HumbleHttpAgent
 					// add referer for picky sites
 					$headers[] = 'Referer: '.$this->referer;
 					// send cookies, if we have any
-					if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
+					if ($cookies = $this->getCookies($orig, $req_url)) {
 						$this->debug("......sending cookies: $cookies");
 						$headers[] = 'Cookie: '.$cookies;
 					}
@@ -496,9 +494,7 @@ class HumbleHttpAgent
 						}
 						if ($this->validateURL($redirectURL)) {
 							$this->debug('Redirect detected. Valid URL: '.$redirectURL);
-							// store any cookies
-							$cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']);
-							if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies);
+							$this->storeCookies($orig, $url);
 							$this->redirectQueue[$orig] = $redirectURL;
 						} else {
 							$this->debug('Redirect detected. Invalid URL: '.$redirectURL);
@@ -557,7 +553,7 @@ class HumbleHttpAgent
 					$httpContext['http']['header'] .= $this->getUserAgent($req_url)."\r\n";
 					// add referer for picky sites
 					$httpContext['http']['header'] .= 'Referer: '.$this->referer."\r\n";
-					if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
+					if ($cookies = $this->getCookies($orig, $req_url)) {
 						$this->debug("......sending cookies: $cookies");
 						$httpContext['http']['header'] .= 'Cookie: '.$cookies."\r\n";
 					}
@@ -589,9 +585,7 @@ class HumbleHttpAgent
 						}
 						if ($this->validateURL($redirectURL)) {
 							$this->debug('Redirect detected. Valid URL: '.$redirectURL);
-							// store any cookies
-							$cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']);
-							if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies);
+							$this->storeCookies($orig, $url);
 							$this->redirectQueue[$orig] = $redirectURL;
 						} else {
 							$this->debug('Redirect detected. Invalid URL: '.$redirectURL);
@@ -709,6 +703,57 @@ class HumbleHttpAgent
 		}
 		return false;
 	}
+
+	/**
+	 * Return the Cookie header value to send to $req_url for the request chain $orig.
+	 *
+	 * Jars are kept per original request, so cookies picked up while following one
+	 * request's redirects are never offered to a different request.
+	 *
+	 * @param string $orig    key of the original request (as used for $this->requests and $this->redirectQueue)
+	 * @param string $req_url URL about to be fetched
+	 * @return mixed null if no jar exists for $orig, otherwise whatever CookieJar::getMatchingCookies() returns
+	 */
+	protected function getCookies($orig, $req_url) {
+		// Check the array element directly: copying a missing element into a local
+		// first raises an "undefined index" notice before isset() gets to run.
+		if (!isset($this->cookieJar[$orig])) {
+			return null;
+		}
+		return $this->cookieJar[$orig]->getMatchingCookies($req_url);
+	}
+
+	/**
+	 * Save any cookies set by the response recorded for $orig into that request's jar.
+	 *
+	 * The raw response headers are read from $this->requests[$orig]['headers'];
+	 * the jar is created on first use.
+	 *
+	 * @param string $orig key of the original request
+	 * @param string $url  URL that produced the response (passed to CookieJar::storeCookies())
+	 * @return void
+	 */
+	protected function storeCookies($orig, $url) {
+		// no recorded response headers means there is nothing to parse
+		if (!isset($this->requests[$orig]['headers'])) {
+			return;
+		}
+		$cookies = CookieJar::extractCookies($this->requests[$orig]['headers']);
+		if (empty($cookies)) {
+			return;
+		}
+		if (!isset($this->cookieJar[$orig])) {
+			$this->cookieJar[$orig] = new CookieJar();
+		}
+		$this->cookieJar[$orig]->storeCookies($url, $cookies);
+	}
+
+	/**
+	 * Drop every per-request cookie jar.
+	 */
+	protected function deleteCookies() {
+		$this->cookieJar = array();
+	}
 }
 
 // gzdecode from http://www.php.net/manual/en/function.gzdecode.php#82930