From eeec0f1982cb2bc87adefc7c63185499a9f7ca40 Mon Sep 17 00:00:00 2001
From: Keyvan <keyvan@k1m.com>
Date: Tue, 1 May 2012 00:51:43 +0200
Subject: [PATCH] Full-Text RSS 2.8

---
 changelog.txt                                 |   9 +
 config.php                                    |   8 +-
 ftr_compatibility_test.php                    |   4 +-
 index.php                                     |   7 +-
 .../content-extractor/ContentExtractor.php    |  85 +--
 libraries/content-extractor/SiteConfig.php    | 115 ++++
 .../humble-http-agent/HumbleHttpAgent.php     |  35 +-
 makefulltextfeed.php                          | 496 ++++++++++--------
 8 files changed, 448 insertions(+), 311 deletions(-)
diff --git a/changelog.txt b/changelog.txt
index 4a4dfb5..9e29ead 100644
--- a/changelog.txt
+++ b/changelog.txt
@@ -2,6 +2,15 @@ FiveFilters.org: Full-Text RSS
 http://fivefilters.org/content-only/
 CHANGELOG
 ------------------------------------
+2.8 (2011-05-30)
+ - Tidy no longer stripping HTML5 elements
+ - JSON output (pass &format=json in querystring)
+ - New site patterns added and old ones updated
+ - New site config option to force full-page retrieval on multi-page articles: single_page_link
+ - User Guide (PDF) now included (although still a work in progress)
+ - URL placeholders now accepted in message_to_prepend/append config options
+ - Plus minor fixes...
+
 2.7 (2011-03-21)
  - Site patterns for better control over extraction (see site_config/README.txt)
  - hNews support (improves content extraction for sites using hNews microformatting)
diff --git a/config.php b/config.php
index 6b55583..68fcc5d 100644
--- a/config.php
+++ b/config.php
@@ -74,11 +74,17 @@ $options->cache_dir = dirname(__FILE__).'/cache';
 // Message to prepend (without API key)
 // ----------------------
 // HTML to insert at the beginning of each feed item when no API key is supplied.
+// Substitution tags:
+// {url} - Feed item URL
+// {effective-url} - Feed item URL after we've followed all redirects
 $options->message_to_prepend = '';
 
 // Message to append (without API key)
 // ----------------------
 // HTML to insert at the end of each feed item when no API key is supplied.
+// Substitution tags:
+// {url} - Feed item URL
+// {effective-url} - Feed item URL after we've followed all redirects
 $options->message_to_append = '';
 
 // URLs to allow
@@ -188,7 +194,7 @@ $options->error_message_with_key = '[unable to retrieve full-text content]';
 /// DO NOT CHANGE ANYTHING BELOW THIS ///////////
 /////////////////////////////////////////////////
 
-if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '2.7');
+if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '2.8');
 
 if ((basename(__FILE__) == 'config.php') && (file_exists(dirname(__FILE__).'/custom_config.php'))) {
 	require_once(dirname(__FILE__).'/custom_config.php');
diff --git a/ftr_compatibility_test.php b/ftr_compatibility_test.php
index ee35d06..aae5687 100644
--- a/ftr_compatibility_test.php
+++ b/ftr_compatibility_test.php
@@ -13,7 +13,7 @@ SimplePie.org. We have kept most of their checks intact as we use SimplePie in o
 http://github.com/simplepie/simplepie/tree/master/compatibility_test/
 */
 
-$app_name = 'Full-Text RSS 2.7';
+$app_name = 'Full-Text RSS 2.8';
 
 $php_ok = (function_exists('version_compare') && version_compare(phpversion(), '5.2.0', '>='));
 $pcre_ok = extension_loaded('pcre');
@@ -327,7 +327,7 @@ div.chunk {
 				<p><em>Your webhost has its act together!</em></p>
 				<p>You can download the latest version of <?php echo $app_name; ?> from <a href="http://fivefilters.org/content-only/#download">FiveFilters.org</a>.</p>
 				<p><strong>Note</strong>: Passing this test does not guarantee that <?php echo $app_name; ?> will run on your webhost &mdash; it only ensures that the basic requirements have been addressed. If you experience any problems, please let us know.</p>
-			<?php } else if ($php_ok && $xml_ok && $pcre_ok && $allow_url_fopen_ok && $filter_ok) { ?>
+			<?php } else if ($php_ok && $xml_ok && $pcre_ok && $mbstring_ok && $allow_url_fopen_ok && $filter_ok) { ?>
 				<h3>Bottom Line: Yes, you can!</h3>
 				<p><em>For most feeds, it'll run with no problems.</em> There are certain languages that you might have a hard time with though.</p>
 				<p>You can download the latest version of <?php echo $app_name; ?> from <a href="http://fivefilters.org/content-only/#download">FiveFilters.org</a>.</p>
diff --git a/index.php b/index.php
index 7a55b56..f955f8e 100644
--- a/index.php
+++ b/index.php
@@ -96,14 +96,16 @@ if (!defined('_FF_FTR_INDEX')) {
 	<p>Thanks for downloading and setting this up. If you haven't done so already, <a href="ftr_compatibility_test.php">check server compatibility</a>
 	to see if your environment will support this application. Full-Text RSS runs on most shared web hosting environments.</p>
 	<h3>Configure</h3>
-	<p>In addition to the options above, Full-Text RSS comes with a configuration file which allows you to control how the application works. Features include:</p>
+	<p>In addition to the options above, Full-Text RSS can be configured to better suit your needs. Features include:</p>
 	<ul>
 		<li>Site patterns for better control over extraction (<a href="site_config/README.txt">more info</a>)</li>
 		<li>Restrict access to a pre-defined set of URLs or block certain URLs</li>
 		<li>Restrict the maximum number of feed items to be processed</li>
+		<li>JSON output</li>
 		<li>Prepend or append an HTML fragment to each feed item processed</li>
 		<li>Caching</li>		
 	</ul>
+	<p>Please refer to the <a href="http://fivefilters.org/content-only/guide/user_guide_2.8.pdf">user guide</a> for more information.</p>
 	<p><?php if (!file_exists('custom_config.php')) { ?>To change the configuration, save a copy of <tt>config.php</tt> as <tt>custom_config.php</tt> and make any changes you like to it.<?php } else { ?>To change the configuration, edit <tt>custom_config.php</tt> and make any changes you like.<?php } ?></p>
 
 	<p>If everything works fine, feel free to modify this page by saving it as <tt>custom_index.php</tt> and change it to whatever you like.</p>
@@ -118,7 +120,8 @@ if (!defined('_FF_FTR_INDEX')) {
 	<p>To see if you're running the latest version, <a href="http://fivefilters.org/content-only/latest_version.php?version=<?php echo urlencode(_FF_FTR_VERSION); ?>">check for updates</a>.</p>
 	
 	<h3 id="donate">Support</h3>
-	<p>We have more information in the section below, but if you need help with anything, please email <a href="mailto:fivefilters@fivefilters.org">fivefilters@fivefilters.org</a>.</p>
+	<p>We have a <a href="https://member.fivefilters.org/f/">public forum</a> which anyone can use to discuss any issues, post questions and find answers (it's free to join and post).</p>
+	<p>We provide a little more information in the section below, but if you need help with anything, you can also email us at <a href="mailto:fivefilters@fivefilters.org">fivefilters@fivefilters.org</a>.</p>
 	
 	<hr />
 	
diff --git a/libraries/content-extractor/ContentExtractor.php b/libraries/content-extractor/ContentExtractor.php
index 131aab6..33e4955 100644
--- a/libraries/content-extractor/ContentExtractor.php
+++ b/libraries/content-extractor/ContentExtractor.php
@@ -5,8 +5,8 @@
  * Uses patterns specified in site config files and auto detection (hNews/PHP Readability) 
  * to extract content from HTML files.
  * 
- * @version 0.5
- * @date 2011-03-07
+ * @version 0.6
+ * @date 2011-05-04
  * @author Keyvan Minoukadeh
  * @copyright 2011 Keyvan Minoukadeh
  * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
@@ -14,13 +14,13 @@
 
 class ContentExtractor
 {
-	const HOSTNAME_REGEX = '/^(([a-zA-Z0-9-]*[a-zA-Z0-9])\.)*([A-Za-z0-9-]*[A-Za-z0-9])$/';
-	protected static $config_cache = array();
 	protected static $tidy_config = array(
 				 'clean' => true,
 				 'output-xhtml' => true,
 				 'logical-emphasis' => true,
 				 'show-body-only' => false,
+				 'new-blocklevel-tags' => 'article, aside, footer, header, hgroup, menu, nav, section, details, datagrid',
+				 'new-inline-tags' => 'mark, time, meter, progress', // value is the tag list only, without the option name
 				 'wrap' => 0,
 				 'drop-empty-paras' => true,
 				 'drop-proprietary-attributes' => false,
@@ -31,19 +31,16 @@ class ContentExtractor
 				 'char-encoding' => 'utf8',
 				 'hide-comments' => true
 				 );
-	protected $config_path;
 	protected $html;
 	protected $config;
 	protected $title;
 	protected $body;
 	protected $success = false;
-	protected $fallback;
 	public $readability;	
 	public $debug = false;
 
-	function __construct($config_path=null, ContentExtractor $config_fallback=null) {
-		$this->config_path = $config_path;
-		$this->fallback = $config_fallback;
+	function __construct($path, $fallback=null) { // $path / $fallback: primary and optional fallback site config directories
+		SiteConfig::set_config_path($path, $fallback);	// stored in static SiteConfig properties, not on this instance
 	}
 	
 	protected function debug($msg) {
@@ -66,71 +63,6 @@ class ContentExtractor
 		$this->success = false;
 	}
 	
-	// returns SiteConfig instance if an appropriate one is found, false otherwise
-	public function get_site_config($host) {
-		$host = strtolower($host);
-		if (substr($host, 0, 4) == 'www.') $host = substr($host, 4);
-		if (!$host || (strlen($host) > 200) || !preg_match(self::HOSTNAME_REGEX, $host)) return false;
-		// check for site configuration
-		$try = array($host);
-		$split = explode('.', $host);
-		if (count($split) > 1) {
-			array_shift($split);
-			$try[] = '.'.implode('.', $split);
-		}
-		foreach ($try as $h) {
-			if (array_key_exists($h, self::$config_cache)) {
-				$this->debug("... cached ($h)");
-				return self::$config_cache[$h];
-			} elseif (file_exists($this->config_path."/$h.txt")) {
-				$this->debug("... from file ($h)");
-				$file = $this->config_path."/$h.txt";
-				break;
-			}
-		}
-		if (!isset($file)) {
-			if (isset($this->fallback)) {
-				$this->debug("... trying fallback ($host)");
-				return $this->fallback->get_site_config($host);
-			} else {
-				$this->debug("... no match ($host)");
-				return false;
-			}
-		}
-		$config_file = file($file, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
-		if (!$config_file || !is_array($config_file)) return false;
-		$config = new SiteConfig();
-		foreach ($config_file as $line) {
-			$line = trim($line);
-			
-			// skip comments, empty lines
-			if ($line == '' || $line[0] == '#') continue;
-			
-			// get command
-			$command = explode(':', $line, 2);
-			// if there's no colon ':', skip this line
-			if (count($command) != 2) continue;
-			$val = trim($command[1]);
-			$command = trim($command[0]);
-			if ($command == '' || $val == '') continue;
-			
-			// check for commands where we accept multiple statements
-			if (in_array($command, array('title', 'body', 'strip', 'strip_id_or_class', 'strip_image_src'))) {
-				array_push($config->$command, $val);
-			// check for single statement commands that evaluate to true or false
-			} elseif (in_array($command, array('tidy', 'prune', 'autodetect_on_failure'))) {
-				$config->$command = ($val == 'yes');
-			// check for single statement commands stored as strings
-			} elseif (in_array($command, array('test_url'))) {
-				$config->$command = $val;
-			}
-		}
-		// store copy of config in our static cache array in case we need to process another URL
-		self::$config_cache[$h] = $config;
-		
-		return $config;
-	}
-	
 	// returns true on success, false on failure
 	// $smart_tidy indicates that if tidy is used and no results are produced, we will
 	// try again without it. Tidy helps us deal with PHP's patchy HTML parsing most of the time
@@ -140,11 +72,12 @@ class ContentExtractor
 		
 		// extract host name
 		$host = @parse_url($url, PHP_URL_HOST);
-		if (!($this->config = $this->get_site_config($host))) {
+		if (!($this->config = SiteConfig::build($host))) {
 			// no match, so use defaults
 			$this->config = new SiteConfig();
-			self::$config_cache[$host] = $this->config;
 		}
+		// store copy of config in our static cache array in case we need to process another URL
+		SiteConfig::add_to_cache($host, $this->config);
 		
 		// use tidy (if it exists)?
 		// This fixes problems with some sites which would otherwise
diff --git a/libraries/content-extractor/SiteConfig.php b/libraries/content-extractor/SiteConfig.php
index b816d0a..9387702 100644
--- a/libraries/content-extractor/SiteConfig.php
+++ b/libraries/content-extractor/SiteConfig.php
@@ -47,5 +47,120 @@ class SiteConfig
 	
 	// Test URL - if present, can be used to test the config above
 	public $test_url = null;
+	
+	// Single-page link - should identify a link element or URL pointing to the page holding the entire article
+	// This is useful for sites which split their articles across multiple pages. Links to such pages tend to 
+	// display the first page with links to the other pages at the bottom. Often there is also a link to a page
+	// which displays the entire article on one page (e.g. 'print view').
+	// This should be an XPath expression identifying the link to that page. If present and we find a match,
+	// we will retrieve that page and the rest of the options in this config will be applied to the new page.
+	public $single_page_link = array();
+	
+	// Single-page link in feed? - same as above, but patterns applied to item description HTML taken from feed
+	public $single_page_link_in_feed = array();
+	
+	// TODO: which parser to use for turning raw HTML into a DOMDocument
+	public $parser = 'libxml';
+	
+	// the options below cannot be set in the config files which this class represents
+	
+	public static $debug = false;
+	protected static $config_path;
+	protected static $config_path_fallback;
+	protected static $config_cache = array();
+	const HOSTNAME_REGEX = '/^(([a-zA-Z0-9-]*[a-zA-Z0-9])\.)*([A-Za-z0-9-]*[A-Za-z0-9])$/';
+	
+	// Debug helper: when self::$debug is on, echoes $msg plus current/peak memory use (bytes/1024) and flushes output
+	protected static function debug($msg) {
+		if (!self::$debug) return;
+		$mem = round(memory_get_usage()/1024, 2);
+		$memPeak = round(memory_get_peak_usage()/1024, 2);
+		echo '* ',$msg,' - mem used: ',$mem," (peak: $memPeak)\n";
+		// ob_flush() raises an E_NOTICE when no output buffer is active, so only call it when one exists
+		if (ob_get_level() > 0) ob_flush();
+		flush();
+	}	
+	
+	public static function set_config_path($path, $fallback=null) { // sets the directories build() reads "<host>.txt" files from
+		self::$config_path = $path; // searched first
+		self::$config_path_fallback = $fallback; // searched only when the first directory has no match; null disables it
+	}
+	
+	// Caches $config for $host under the key form build() looks up: lower-cased with a leading 'www.' removed
+	public static function add_to_cache($host, SiteConfig $config) {
+		self::$config_cache[preg_replace('/^www\./', '', strtolower($host))] = $config;
+	}
+	
+	// Returns a SiteConfig for $host (from the cache, or parsed from a "<name>.txt" file in the config path or fallback path); false if the host is rejected, no file matches, or the file yields no lines
+	public static function build($host) {
+		$host = strtolower($host);
+		if (substr($host, 0, 4) == 'www.') $host = substr($host, 4);
+		if (!$host || (strlen($host) > 200) || !preg_match(self::HOSTNAME_REGEX, $host)) return false;
+		// names to try, in order: the exact host, then the host minus its first label with a leading dot (e.g. ".example.org")
+		$try = array($host);
+		$split = explode('.', $host);
+		if (count($split) > 1) {
+			array_shift($split);
+			$try[] = '.'.implode('.', $split);
+		}
+		foreach ($try as $h) {
+			if (array_key_exists($h, self::$config_cache)) {
+				self::debug("... cached ($h)");
+				return self::$config_cache[$h];
+			} elseif (file_exists(self::$config_path."/$h.txt")) {
+				self::debug("... from file ($h)");
+				$file = self::$config_path."/$h.txt";
+				break;
+			}
+		}
+		if (!isset($file)) {
+			if (isset(self::$config_path_fallback)) {
+				self::debug("... trying fallback ($host)");
+				foreach ($try as $h) {
+					if (file_exists(self::$config_path_fallback."/$h.txt")) {
+						self::debug("... from fallback file ($h)");
+						$file = self::$config_path_fallback."/$h.txt";
+						break;
+					}
+				}
+				if (!isset($file)) {
+					self::debug("... no match in fallback directory");
+					return false;
+				}
+			} else {
+				self::debug("... no match ($host)");
+				return false;
+			}
+		}
+		$config_file = file($file, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
+		if (!$config_file || !is_array($config_file)) return false;
+		$config = new SiteConfig();
+		foreach ($config_file as $line) {
+			$line = trim($line);
+			
+			// skip blank lines and comment lines (those starting with '#')
+			if ($line == '' || $line[0] == '#') continue;
+			
+			// each directive is "command: value", split on the first colon only so the value may itself contain colons
+			$command = explode(':', $line, 2);
+			// if there's no colon ':', skip this line
+			if (count($command) != 2) continue;
+			$val = trim($command[1]);
+			$command = trim($command[0]);
+			if ($command == '' || $val == '') continue;
+			
+			// commands that may appear several times: each value is appended to the matching array property
+			if (in_array($command, array('title', 'body', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed'))) {
+				array_push($config->$command, $val);
+			// boolean commands: true only for the exact value 'yes', anything else is false
+			} elseif (in_array($command, array('tidy', 'prune', 'autodetect_on_failure'))) {
+				$config->$command = ($val == 'yes');
+			// single-value commands stored as strings (a later occurrence overwrites an earlier one); other commands are ignored
+			} elseif (in_array($command, array('test_url'))) {
+				$config->$command = $val;
+			}
+		}
+		return $config;
+	}
 }
 ?>
\ No newline at end of file
diff --git a/libraries/humble-http-agent/HumbleHttpAgent.php b/libraries/humble-http-agent/HumbleHttpAgent.php
index 92c69af..fcdce01 100644
--- a/libraries/humble-http-agent/HumbleHttpAgent.php
+++ b/libraries/humble-http-agent/HumbleHttpAgent.php
@@ -7,8 +7,8 @@
  * For environments which do not have these options, it reverts to standard sequential 
  * requests (using file_get_contents())
  * 
- * @version 0.8
- * @date 2011-02-28
+ * @version 0.9.5
+ * @date 2011-05-23
  * @see http://php.net/HttpRequestPool
  * @author Keyvan Minoukadeh
  * @copyright 2011 Keyvan Minoukadeh
@@ -104,6 +104,15 @@ class HumbleHttpAgent
 		return $iri->uri;
 	}
 	
+	// Returns $url cut off at the first '#', i.e. without its fragment; a URL with no '#' comes back unchanged
+	public function removeFragment($url) {
+		$hashAt = strpos($url, '#');
+		if ($hashAt !== false) {
+			$url = substr($url, 0, $hashAt);
+		}
+		return $url;
+	}	
+	
 	public function enableDebug($bool=true) {
 		$this->debug = (bool)$bool;
 	}
@@ -211,6 +220,7 @@ class HumbleHttpAgent
 						} else {
 							$this->debug("......adding to pool");
 							$req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($url) : $url;
+							$req_url = $this->removeFragment($req_url);
 							$httpRequest = new HttpRequest($req_url, HttpRequest::METH_GET, $this->requestOptions);
 							// send cookies, if we have any
 							if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
@@ -225,7 +235,11 @@ class HumbleHttpAgent
 					// did we get anything into the pool?
 					if (count($pool) > 0) {
 						$this->debug('Sending request...');
-						$pool->send();
+						try {
+							$pool->send();
+						} catch (HttpRequestPoolException $e) {
+							$this->debug('Request pool error (continuing): '.$e->getMessage()); // still best-effort, but no longer silent
+						}
 						$this->debug('Received responses');
 						foreach($subset as $orig => $url) {
 							if (!$isRedirect) $orig = $url;
@@ -240,7 +254,9 @@ class HumbleHttpAgent
 								// is redirect?
 								if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && $request->getResponseHeader('location')) {
 									$redirectURL = $request->getResponseHeader('location');
-									$redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
+									if (!preg_match('!^https?://!i', $redirectURL)) {
+										$redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
+									}
 									if ($this->validateURL($redirectURL)) {
 										$this->debug('Redirect detected. Valid URL: '.$redirectURL);
 										// store any cookies
@@ -298,6 +314,7 @@ class HumbleHttpAgent
 					} else {
 						$this->debug("......adding to pool");
 						$req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($url) : $url;
+						$req_url = $this->removeFragment($req_url);
 						$headers = array();
 						// send cookies, if we have any
 						if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
@@ -327,7 +344,9 @@ class HumbleHttpAgent
 						$status_code = $this->requests[$orig]['status_code'];
 						if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) {
 							$redirectURL = $this->requests[$orig]['location'];
-							$redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
+							if (!preg_match('!^https?://!i', $redirectURL)) {
+								$redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
+							}
 							if ($this->validateURL($redirectURL)) {
 								$this->debug('Redirect detected. Valid URL: '.$redirectURL);
 								// store any cookies
@@ -367,6 +386,7 @@ class HumbleHttpAgent
 					$this->debug("Sending request for $url");
 					$this->requests[$orig]['original_url'] = $orig;					
 					$req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($url) : $url;
+					$req_url = $this->removeFragment($req_url);
 					// send cookies, if we have any
 					$httpContext = $this->httpContext;
 					if ($cookies = $this->cookieJar->getMatchingCookies($req_url)) {
@@ -391,7 +411,9 @@ class HumbleHttpAgent
 							}
 							if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && isset($this->requests[$orig]['location'])) {
 								$redirectURL = $this->requests[$orig]['location'];
-								$redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
+								if (!preg_match('!^https?://!i', $redirectURL)) {
+									$redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url);
+								}
 								if ($this->validateURL($redirectURL)) {
 									$this->debug('Redirect detected. Valid URL: '.$redirectURL);
 									// store any cookies
@@ -444,6 +466,7 @@ class HumbleHttpAgent
 	}
 	
 	public function get($url, $remove=false) {
+		$url = "$url";
 		if (isset($this->requests[$url]) && isset($this->requests[$url]['body'])) {
 			$this->debug("URL already fetched - in memory ($url, effective: {$this->requests[$url]['effective_url']})");
 			$response = $this->requests[$url];
diff --git a/makefulltextfeed.php b/makefulltextfeed.php
index fdff082..f2df18a 100644
--- a/makefulltextfeed.php
+++ b/makefulltextfeed.php
@@ -3,8 +3,8 @@
 // Author: Keyvan Minoukadeh
 // Copyright (c) 2011 Keyvan Minoukadeh
 // License: AGPLv3
-// Version: 2.7
-// Date: 2011-03-21
+// Version: 2.8
+// Date: 2011-05-23
 
 /*
 This program is free software: you can redistribute it and/or modify
@@ -73,131 +73,11 @@ function __autoload($class_name) {
 	}
 }
 
-function url_allowed($url) {
-	global $options;
-	if (!empty($options->allowed_urls)) {
-		$allowed = false;
-		foreach ($options->allowed_urls as $allowurl) {
-			if (stristr($url, $allowurl) !== false) {
-				$allowed = true;
-				break;
-			}
-		}
-		if (!$allowed) return false;
-	} else {
-		foreach ($options->blocked_urls as $blockurl) {
-			if (stristr($url, $blockurl) !== false) {
-				return false;
-			}
-		}
-	}
-	return true;
-}
-
 ////////////////////////////////
 // Load config file if it exists
 ////////////////////////////////
 require_once(dirname(__FILE__).'/config.php');
 
-//////////////////////////////////////////////
-// Convert $html to UTF8
-// (uses HTTP headers and HTML to find encoding)
-// adapted from http://stackoverflow.com/questions/910793/php-detect-encoding-and-make-everything-utf-8
-//////////////////////////////////////////////
-function convert_to_utf8($html, $header=null)
-{
-	$encoding = null;
-	if ($html || $header) {
-		if (is_array($header)) $header = implode("\n", $header);
-		if (!$header || !preg_match_all('/^Content-Type:\s+([^;]+)(?:;\s*charset=["\']?([^;"\'\n]*))?/im', $header, $match, PREG_SET_ORDER)) {
-			// error parsing the response
-		} else {
-			$match = end($match); // get last matched element (in case of redirects)
-			if (isset($match[2])) $encoding = trim($match[2], '"\'');
-		}
-		if (!$encoding) {
-			if (preg_match('/^<\?xml\s+version=(?:"[^"]*"|\'[^\']*\')\s+encoding=("[^"]*"|\'[^\']*\')/s', $html, $match)) {
-				$encoding = trim($match[1], '"\'');
-			} elseif(preg_match('/<meta\s+http-equiv=["\']Content-Type["\'] content=["\'][^;]+;\s*charset=["\']?([^;"\'>]+)/i', $html, $match)) {
-				if (isset($match[1])) $encoding = trim($match[1]);
-			}
-		}
-		if (!$encoding) {
-			$encoding = 'utf-8';
-		} else {
-			if (strtolower($encoding) != 'utf-8') {
-				if (strtolower($encoding) == 'iso-8859-1') {
-					// replace MS Word smart qutoes
-					$trans = array();
-					$trans[chr(130)] = '&sbquo;';    // Single Low-9 Quotation Mark
-					$trans[chr(131)] = '&fnof;';    // Latin Small Letter F With Hook
-					$trans[chr(132)] = '&bdquo;';    // Double Low-9 Quotation Mark
-					$trans[chr(133)] = '&hellip;';    // Horizontal Ellipsis
-					$trans[chr(134)] = '&dagger;';    // Dagger
-					$trans[chr(135)] = '&Dagger;';    // Double Dagger
-					$trans[chr(136)] = '&circ;';    // Modifier Letter Circumflex Accent
-					$trans[chr(137)] = '&permil;';    // Per Mille Sign
-					$trans[chr(138)] = '&Scaron;';    // Latin Capital Letter S With Caron
-					$trans[chr(139)] = '&lsaquo;';    // Single Left-Pointing Angle Quotation Mark
-					$trans[chr(140)] = '&OElig;';    // Latin Capital Ligature OE
-					$trans[chr(145)] = '&lsquo;';    // Left Single Quotation Mark
-					$trans[chr(146)] = '&rsquo;';    // Right Single Quotation Mark
-					$trans[chr(147)] = '&ldquo;';    // Left Double Quotation Mark
-					$trans[chr(148)] = '&rdquo;';    // Right Double Quotation Mark
-					$trans[chr(149)] = '&bull;';    // Bullet
-					$trans[chr(150)] = '&ndash;';    // En Dash
-					$trans[chr(151)] = '&mdash;';    // Em Dash
-					$trans[chr(152)] = '&tilde;';    // Small Tilde
-					$trans[chr(153)] = '&trade;';    // Trade Mark Sign
-					$trans[chr(154)] = '&scaron;';    // Latin Small Letter S With Caron
-					$trans[chr(155)] = '&rsaquo;';    // Single Right-Pointing Angle Quotation Mark
-					$trans[chr(156)] = '&oelig;';    // Latin Small Ligature OE
-					$trans[chr(159)] = '&Yuml;';    // Latin Capital Letter Y With Diaeresis
-					$html = strtr($html, $trans);
-				}
-				$html = SimplePie_Misc::change_encoding($html, $encoding, 'utf-8');
-
-				/*
-				if (function_exists('iconv')) {
-					// iconv appears to handle certain character encodings better than mb_convert_encoding
-					$html = iconv($encoding, 'utf-8', $html);
-				} else {
-					$html = mb_convert_encoding($html, 'utf-8', $encoding);
-				}
-				*/
-			}
-		}
-	}
-	return $html;
-}
-
-function makeAbsolute($base, $elem) {
-	$base = new IRI($base);
-	foreach(array('a'=>'href', 'img'=>'src') as $tag => $attr) {
-		$elems = $elem->getElementsByTagName($tag);
-		for ($i = $elems->length-1; $i >= 0; $i--) {
-			$e = $elems->item($i);
-			//$e->parentNode->replaceChild($articleContent->ownerDocument->createTextNode($e->textContent), $e);
-			makeAbsoluteAttr($base, $e, $attr);
-		}
-		if (strtolower($elem->tagName) == $tag) makeAbsoluteAttr($base, $elem, $attr);
-	}
-}
-function makeAbsoluteAttr($base, $e, $attr) {
-	if ($e->hasAttribute($attr)) {
-		// Trim leading and trailing white space. I don't really like this but 
-		// unfortunately it does appear on some sites. e.g.  <img src=" /path/to/image.jpg" />
-		$url = trim(str_replace('%20', ' ', $e->getAttribute($attr)));
-		$url = str_replace(' ', '%20', $url);
-		if (!preg_match('!https?://!i', $url)) {
-			$absolute = IRI::absolutize($base, $url);
-			if ($absolute) {
-				$e->setAttribute($attr, $absolute);
-			}
-		}
-	}
-}
-
 ////////////////////////////////
 // Check if service is enabled
 ////////////////////////////////
@@ -211,7 +91,7 @@ if (!$options->enabled) {
 if (!isset($_GET['url'])) { 
 	die('No URL supplied'); 
 }
-$url = $_GET['url'];
+$url = trim($_GET['url']);
 if (!preg_match('!^https?://.+!i', $url)) {
 	$url = 'http://'.$url;
 }
@@ -240,6 +120,7 @@ if ($options->alternative_url != '' && !isset($_GET['redir']) && mt_rand(0, 100)
 	if (isset($_GET['links'])) $redirect .= '&links='.$_GET['links'];
 	if (isset($_GET['exc'])) $redirect .= '&exc='.$_GET['exc'];
 	if (isset($_GET['what'])) $redirect .= '&what='.$_GET['what'];	
+	if (isset($_GET['format'])) $redirect .= '&format='.urlencode($_GET['format']); // $_GET values arrive decoded: re-encode for the URL
 	header("Location: $redirect");
 	exit;
 }
@@ -258,6 +139,7 @@ if (isset($_GET['key']) && ($key_index = array_search($_GET['key'], $options->ap
 	if (isset($_GET['links'])) $redirect .= '&links='.urlencode($_GET['links']);
 	if (isset($_GET['exc'])) $redirect .= '&exc='.urlencode($_GET['exc']);
 	if (isset($_GET['what'])) $redirect .= '&what='.urlencode($_GET['what']);
+	if (isset($_GET['format'])) $redirect .= '&format='.urlencode($_GET['format']);
 	header("Location: $redirect");
 	exit;
 }
@@ -364,9 +246,13 @@ if (($extract_pattern != '') && ($extract_pattern != 'auto')) {
 
 /////////////////////////////////////
 // Check for valid format
-// (stick to RSS for the time being)
+// (stick to RSS (or RSS as JSON) for the time being)
 /////////////////////////////////////
-$format = 'rss';
+if (isset($_GET['format']) && $_GET['format'] == 'json') {
+	$format = 'json';
+} else {
+	$format = 'rss';
+}
 
 //////////////////////////////////
 // Check for cached copy
@@ -392,10 +278,14 @@ if ($options->caching) {
 
 	// getting a Zend_Cache_Core object
 	$cache = Zend_Cache::factory('Core', 'File', $frontendOptions, $backendOptions);
-	$cache_id = md5($max.$url.$valid_key.$links.$exclude_on_fail.$auto_extract.$extract_pattern.(int)isset($_GET['pubsub']));
+	$cache_id = md5($max.$url.$valid_key.$links.$exclude_on_fail.$auto_extract.$extract_pattern.$format.(int)isset($_GET['pubsub']));
 	
 	if ($data = $cache->load($cache_id)) {
-		header("Content-type: text/xml; charset=UTF-8");
+		if ($format == 'json') {
+			header("Content-type: application/json; charset=UTF-8");
+		} else {
+			header("Content-type: text/xml; charset=UTF-8");
+		}
 		if (headers_sent()) die('Some data has already been output, can\'t send RSS file');
 		echo $data;
 		exit;
@@ -419,7 +309,7 @@ $http = new HumbleHttpAgent();
 //////////////////////////////////
 // Set up Content Extractor
 //////////////////////////////////
-$extractor = new ContentExtractor(dirname(__FILE__).'/site_config/custom', new ContentExtractor(dirname(__FILE__).'/site_config/standard'));
+$extractor = new ContentExtractor(dirname(__FILE__).'/site_config/custom', dirname(__FILE__).'/site_config/standard');
 
 /*
 if ($options->caching) {
@@ -453,7 +343,8 @@ if (!$html_only) {
 	SimplePie_HumbleHttpAgent::set_agent($http);
 	$feed = new SimplePie();
 	$feed->set_file_class('SimplePie_HumbleHttpAgent');
-	$feed->set_feed_url($url);
+	//$feed->set_feed_url($url); // colons appearing in the URL's path get encoded
+	$feed->feed_url = $url;
 	$feed->set_autodiscovery_level(SIMPLEPIE_LOCATOR_NONE);
 	$feed->set_timeout(20);
 	$feed->enable_cache(false);
@@ -471,97 +362,34 @@ if (!$html_only) {
 }
 
 ////////////////////////////////////////////////////////////////////////////////
-// Extract content from HTML (if URL is not feed or explicit HTML request has been made)
+// Our given URL is not a feed, so let's create our own feed with a single item:
+// the given URL. This basically treats all non-feed URLs as if they were
+// single-item feeds.
 ////////////////////////////////////////////////////////////////////////////////
+$isDummyFeed = false;
 if ($html_only || !$result) {
+	$isDummyFeed = true;
 	unset($feed, $result);
-	if ($response = $http->get($url)) {
-		$effective_url = $response['effective_url'];
-		if (!url_allowed($effective_url)) die('URL blocked');
-		$html = $response['body'];
-		// remove strange things here
-		$html = str_replace('</[>', '', $html);
-		$html = convert_to_utf8($html, $response['headers']);	
+	// create single item dummy feed object
+	class DummySingleItemFeed { // stands in for $feed when the given URL is not a feed
+		public $item; // the one DummySingleItem this feed holds
+		function __construct($url) { $this->item = new DummySingleItem($url); }
+		public function get_title() { return ''; } // left empty; the extracted title is set on the output later when $isDummyFeed is true
+		public function get_description() { return 'Content extracted from '.$this->item->url; }
+		public function get_link() { return $this->item->url; }
+		public function get_image_url() { return false; } // no feed image
+		public function get_items($start=0, $max=1) { return array(0=>$this->item); } // $start and $max are ignored
 	}
-	if (!$response || $response['status_code'] >= 300) {
-		die('Error retrieving '.$url);
+	class DummySingleItem { // the single item of a DummySingleItemFeed; only its URL is known
+		public $url; // URL passed to the constructor, also returned as the permalink
+		function __construct($url) { $this->url = $url; }
+		public function get_permalink() { return $this->url; }
+		public function get_title() { return ''; } // no title at this point
+		public function get_date($format='') { return false; } // $format is ignored; no date available
+		public function get_author() { return false; }
+		public function get_description() { return ''; }
 	}
-	if ($auto_extract) {
-		$extract_result = $extractor->process($html, $effective_url);
-		if (!$extract_result) die($options->error_message);
-		$readability = $extractor->readability;
-		$content_block = $extractor->getContent();	
-		$title = $extractor->getTitle();
-	} else {
-		$readability = new Readability($html, $effective_url);
-		// content block is entire document
-		$content_block = $readability->dom;
-		//TODO: get title
-		$title = '';
-	}
-	if ($extract_pattern) {
-		$xpath = new DOMXPath($readability->dom);
-		$elems = @$xpath->query($extract_pattern, $content_block);
-		// check if our custom extraction pattern matched
-		if ($elems && $elems->length > 0) {
-			// get the first matched element
-			$content_block = $elems->item(0);
-			// clean it up
-			$readability->removeScripts($content_block);
-			$readability->prepArticle($content_block);
-		} else {
-			die($options->error_message);
-			//$content_block = $readability->dom->createElement('p', 'Sorry, could not extract content');
-		}
-	}
-	$readability->clean($content_block, 'select');
-	if ($options->rewrite_relative_urls) makeAbsolute($effective_url, $content_block);
-	// footnotes
-	if (($links == 'footnotes') && (strpos($effective_url, 'wikipedia.org') === false)) {
-		$readability->addFootnotes($content_block);
-	}
-	if ($extract_pattern) {
-		// get outerHTML
-		$content = $content_block->ownerDocument->saveXML($content_block);
-	} else {
-		if ($content_block->childNodes->length == 1 && $content_block->firstChild->nodeType === XML_ELEMENT_NODE) {
-			$content = $content_block->firstChild->innerHTML;
-		} else {
-			$content = $content_block->innerHTML;
-		}
-	}
-	if ($links == 'remove') {
-		$content = preg_replace('!</?a[^>]*>!', '', $content);
-	}
-	if (!$valid_key) {
-		$content = $options->message_to_prepend.$content;
-		$content .= $options->message_to_append;
-	} else {
-		$content = $options->message_to_prepend_with_key.$content;	
-		$content .= $options->message_to_append_with_key;
-	}
-	unset($readability, $html);
-	$output = new FeedWriter(); //ATOM an option
-	$output->setTitle($title);
-	$output->setDescription("Content extracted from $url");
-	$output->setXsl('css/feed.xsl'); // Chrome uses this, most browsers ignore it
-	if ($format == 'atom') {
-		$output->setChannelElement('updated', date(DATE_ATOM));
-		$output->setChannelElement('author', array('name'=>'Five Filters', 'uri'=>'http://fivefilters.org'));
-	}
-	$output->setLink($url);
-	$newitem = $output->createNewItem();
-	$newitem->setTitle($title);
-	$newitem->setLink($url);
-	if ($format == 'atom') {
-		$newitem->setDate(time());
-		$newitem->addElement('content', $content);
-	} else {
-		$newitem->setDescription($content);
-	}
-	$output->addItem($newitem);
-	$output->genarateFeed(); 
-	exit;
+	$feed = new DummySingleItemFeed($url);
 }
 
 ////////////////////////////////////////////
@@ -594,6 +422,8 @@ $urls_sanitized = array();
 $urls = array();
 foreach ($items as $key => $item) {
 	$permalink = htmlspecialchars_decode($item->get_permalink());
+	// Colons in URL path segments get encoded by SimplePie, yet some sites expect them unencoded
+	$permalink = str_replace('%3A', ':', $permalink);
 	$permalink = $http->validateUrl($permalink);
 	if ($permalink) {
 		$urls_sanitized[] = $permalink;
@@ -625,17 +455,34 @@ foreach ($items as $key => $item) {
 		$effective_url = $response['effective_url'];
 		if (!url_allowed($effective_url)) continue;
 		$html = $response['body'];
-		// remove strange things here
-		$html = str_replace('</[>', '', $html);		
+		// remove strange things
+		$html = str_replace('</[>', '', $html);
 		$html = convert_to_utf8($html, $response['headers']);
 		if ($auto_extract) {
+			// check site config for single page URL - fetch it if found
+			if ($single_page_response = getSinglePage($item, $html, $effective_url)) {
+				$html = $single_page_response['body'];
+				// remove strange things
+				$html = str_replace('</[>', '', $html);	
+				$html = convert_to_utf8($html, $single_page_response['headers']);
+				$effective_url = $single_page_response['effective_url'];
+				unset($single_page_response);
+			}
 			$extract_result = $extractor->process($html, $effective_url);
 			$readability = $extractor->readability;
 			$content_block = ($extract_result) ? $extractor->getContent() : null;
+			$title = ($extract_result) ? $extractor->getTitle() : '';
 		} else {
 			$readability = new Readability($html, $effective_url);
 			// content block is entire document (for now...)
-			$content_block = $readability->dom;			
+			$content_block = $readability->dom;
+			//TODO: get title
+			$title = '';
+		}
+		// use extracted title for both feed and item title if we're using single-item dummy feed
+		if ($isDummyFeed) {
+			$output->setTitle($title);
+			$newitem->setTitle($title);
 		}
 		if ($extract_pattern && isset($content_block)) {
 			$xpath = new DOMXPath($readability->dom);
@@ -684,11 +531,11 @@ foreach ($items as $key => $item) {
 			$html = preg_replace('!</?a[^>]*>!', '', $html);
 		}
 		if (!$valid_key) {
-			$html = $options->message_to_prepend.$html;
-			$html .= $options->message_to_append;
+			$html = make_substitutions($options->message_to_prepend).$html;
+			$html .= make_substitutions($options->message_to_append);
 		} else {
-			$html = $options->message_to_prepend_with_key.$html;	
-			$html .= $options->message_to_append_with_key;
+			$html = make_substitutions($options->message_to_prepend_with_key).$html;	
+			$html .= make_substitutions($options->message_to_append_with_key);
 		}
 	}
 	if ($format == 'atom') {
@@ -715,14 +562,215 @@ foreach ($items as $key => $item) {
 	unset($html);
 }
 // output feed
-if ($options->caching) {
+if ($options->caching || $format == 'json') {
 	ob_start();
 	$output->genarateFeed();
 	$output = ob_get_contents();
 	ob_end_clean();
-	$cache->save($output, $cache_id);
+	if ($format == 'json') {
+		$jsonrss = new stdClass();
+		$jsonrss->rss = @simplexml_load_string($output);
+		$output = json_encode($jsonrss);
+		header("Content-type: application/json; charset=UTF-8");
+	}
+	if ($options->caching) $cache->save($output, $cache_id);
 	echo $output;
 } else {
 	$output->genarateFeed();
 }
+
+///////////////////////////////
+// HELPER FUNCTIONS
+///////////////////////////////
+
+function url_allowed($url) {
+	global $options;
+	// A non-empty whitelist takes over completely: the URL must contain
+	// (case-insensitively) at least one allowed fragment, and the
+	// blacklist is never consulted.
+	if (!empty($options->allowed_urls)) {
+		foreach ($options->allowed_urls as $fragment) {
+			if (stristr($url, $fragment) !== false) return true;
+		}
+		return false;
+	}
+	// No whitelist configured: reject the URL as soon as any blacklisted
+	// fragment is found in it.
+	foreach ($options->blocked_urls as $fragment) {
+		if (stristr($url, $fragment) !== false) {
+			return false;
+		}
+	}
+	return true;
+}
+
+//////////////////////////////////////////////
+// Convert $html to UTF8
+// (uses HTTP headers and HTML to find encoding)
+// adapted from http://stackoverflow.com/questions/910793/php-detect-encoding-and-make-everything-utf-8
+//////////////////////////////////////////////
+function convert_to_utf8($html, $header=null)
+{
+	// $header may be a header string or an array of header lines. The source
+	// encoding is taken from the first of these that declares one:
+	//   1. charset of the last HTTP Content-Type header (last, in case of redirects)
+	//   2. encoding="..." in an XML declaration at the very start of $html
+	//   3. <meta http-equiv="Content-Type" content="...; charset=...">
+	//   4. HTML5 <meta charset="...">
+	// $html is returned unchanged when nothing is declared (UTF-8 is assumed),
+	// when UTF-8 is declared, or when the conversion itself fails.
+	$encoding = null;
+	if ($html || $header) {
+		if (is_array($header)) $header = implode("\n", $header);
+		// White space after "Content-Type:" is optional in HTTP, and neither the media
+		// type nor the charset may run past the end of their own header line.
+		if ($header && preg_match_all('/^Content-Type:[ \t]*([^;\r\n]+)(?:;\s*charset=["\']?([^;"\'\s]*))?/im', $header, $match, PREG_SET_ORDER)) {
+			$match = end($match); // get last matched element (in case of redirects)
+			if (isset($match[2])) $encoding = trim($match[2], '"\'');
+		}
+		if (!$encoding) {
+			if (preg_match('/^<\?xml\s+version=(?:"[^"]*"|\'[^\']*\')\s+encoding=("[^"]*"|\'[^\']*\')/s', $html, $match)) {
+				$encoding = trim($match[1], '"\'');
+			} elseif (preg_match('/<meta\s+http-equiv=["\']Content-Type["\'] content=["\'][^;]+;\s*charset=["\']?([^;"\'>]+)/i', $html, $match)) {
+				$encoding = trim($match[1]);
+			} elseif (preg_match('/<meta\s+charset=["\']?([^"\'\s\/>]+)/i', $html, $match)) {
+				$encoding = trim($match[1]);
+			}
+		}
+		if ($encoding && strtolower($encoding) != 'utf-8') {
+			if (strtolower($encoding) == 'iso-8859-1') {
+				// replace MS Word smart quotes
+				$trans = array();
+				$trans[chr(130)] = '&sbquo;';    // Single Low-9 Quotation Mark
+				$trans[chr(131)] = '&fnof;';    // Latin Small Letter F With Hook
+				$trans[chr(132)] = '&bdquo;';    // Double Low-9 Quotation Mark
+				$trans[chr(133)] = '&hellip;';    // Horizontal Ellipsis
+				$trans[chr(134)] = '&dagger;';    // Dagger
+				$trans[chr(135)] = '&Dagger;';    // Double Dagger
+				$trans[chr(136)] = '&circ;';    // Modifier Letter Circumflex Accent
+				$trans[chr(137)] = '&permil;';    // Per Mille Sign
+				$trans[chr(138)] = '&Scaron;';    // Latin Capital Letter S With Caron
+				$trans[chr(139)] = '&lsaquo;';    // Single Left-Pointing Angle Quotation Mark
+				$trans[chr(140)] = '&OElig;';    // Latin Capital Ligature OE
+				$trans[chr(145)] = '&lsquo;';    // Left Single Quotation Mark
+				$trans[chr(146)] = '&rsquo;';    // Right Single Quotation Mark
+				$trans[chr(147)] = '&ldquo;';    // Left Double Quotation Mark
+				$trans[chr(148)] = '&rdquo;';    // Right Double Quotation Mark
+				$trans[chr(149)] = '&bull;';    // Bullet
+				$trans[chr(150)] = '&ndash;';    // En Dash
+				$trans[chr(151)] = '&mdash;';    // Em Dash
+				$trans[chr(152)] = '&tilde;';    // Small Tilde
+				$trans[chr(153)] = '&trade;';    // Trade Mark Sign
+				$trans[chr(154)] = '&scaron;';    // Latin Small Letter S With Caron
+				$trans[chr(155)] = '&rsaquo;';    // Single Right-Pointing Angle Quotation Mark
+				$trans[chr(156)] = '&oelig;';    // Latin Small Ligature OE
+				$trans[chr(159)] = '&Yuml;';    // Latin Capital Letter Y With Diaeresis
+				$html = strtr($html, $trans);
+			}
+			// change_encoding() gives false when it cannot convert from $encoding;
+			// keep the original markup then rather than handing back false.
+			$converted = SimplePie_Misc::change_encoding($html, $encoding, 'utf-8');
+			if ($converted !== false) $html = $converted;
+		}
+	}
+	return $html;
+}
+
+function makeAbsolute($base, $elem) {
+	// Makes the href of every <a> and the src of every <img> inside $elem absolute
+	// by resolving it against $base. $elem itself is handled too when it is an <a>
+	// or <img> element.
+	$base = new IRI($base);
+	// remove '//' in URL path (causes URLs not to resolve properly)
+	if (isset($base->ipath)) $base->ipath = preg_replace('!//+!', '/', $base->ipath);
+	foreach(array('a'=>'href', 'img'=>'src') as $tag => $attr) {
+		$elems = $elem->getElementsByTagName($tag);
+		for ($i = $elems->length-1; $i >= 0; $i--) makeAbsoluteAttr($base, $elems->item($i), $attr);
+		// A DOMDocument passed as $elem has no tagName, so only test real elements
+		if ($elem instanceof DOMElement && strtolower($elem->tagName) == $tag) makeAbsoluteAttr($base, $elem, $attr);
+	}
+}
+function makeAbsoluteAttr($base, $e, $attr) {
+	// Rewrites attribute $attr of element $e to an absolute URL resolved against
+	// $base; the attribute is left as it is when absent or when resolution fails.
+	if ($e->hasAttribute($attr)) {
+		// Decode %20, trim surrounding white space (it does appear on some sites,
+		// e.g. <img src=" /path/to/image.jpg" />), then re-encode the remaining spaces.
+		$url = str_replace(' ', '%20', trim(str_replace('%20', ' ', $e->getAttribute($attr))));
+		// Anchored at the start: a relative URL that merely contains "http://" still needs resolving
+		if (!preg_match('!^https?://!i', $url)) {
+			$absolute = IRI::absolutize($base, $url);
+			if ($absolute) $e->setAttribute($attr, $absolute);
+		}
+	}
+}
+function makeAbsoluteStr($base, $url) {
+	// Returns $url resolved against $base, $url itself when it already
+	// starts with http:// or https://, or false when resolution fails.
+	$baseIri = new IRI($base);
+	if (isset($baseIri->ipath)) {
+		// collapse runs of '/' in the base path (they stop URLs resolving properly)
+		$baseIri->ipath = preg_replace('!//+!', '/', $baseIri->ipath);
+	}
+	if (preg_match('!^https?://!i', $url)) return $url;
+	$resolved = IRI::absolutize($baseIri, $url);
+	// any falsy result from absolutize() is reported as a plain false
+	return $resolved ? $resolved : false;
+}
+// Looks up the site config for $url's host and, when it defines single-page link
+// XPath patterns, finds that link and fetches it. For single_page_link_in_feed the
+// patterns are run against the feed item's description instead of $html.
+// Returns the HTTP response array of the single-page version, or false if not found.
+function getSinglePage($item, $html, $url) {
+	global $http;
+	$host = @parse_url($url, PHP_URL_HOST);
+	$site_config = SiteConfig::build($host);
+	if ($site_config === false) return false;
+	$splink = null;
+	if (!empty($site_config->single_page_link)) {
+		$splink = $site_config->single_page_link;
+	} elseif (!empty($site_config->single_page_link_in_feed)) {
+		// single page link xpath is targeted at feed
+		$splink = $site_config->single_page_link_in_feed;
+		// so let's replace HTML with feed item description
+		$html = $item->get_description();
+	}
+	if (!isset($splink)) return false;
+	$readability = new Readability($html, $url);
+	$xpath = new DOMXPath($readability->dom);
+	// Try each pattern in turn; the first one that yields a URL wins
+	$single_page_url = null;
+	foreach ($splink as $pattern) {
+		$elems = @$xpath->evaluate($pattern, $readability->dom);
+		if (is_string($elems) && trim($elems) !== '') {
+			$single_page_url = trim($elems);
+			break;
+		} elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
+			foreach ($elems as $node) {
+				// patterns may select text or attribute nodes, which have no hasAttribute()
+				if ($node instanceof DOMElement && $node->hasAttribute('href')) {
+					$single_page_url = $node->getAttribute('href');
+					break 2; // leave the pattern loop too, or a later pattern would overwrite this match
+				}
+			}
+		}
+	}
+	// If we've got URL, resolve against $url
+	if (isset($single_page_url) && ($single_page_url = makeAbsoluteStr($url, $single_page_url))) {
+		// check it's not what we have already, then try to fetch it
+		if ($single_page_url != $url) {
+			$response = $http->get($single_page_url, true);
+			if ($response && $response['status_code'] < 300) return $response;
+		}
+	}
+	return false;
+}
+
+function make_substitutions($string) {
+	// Fills the {url} and {effective-url} placeholders from the feed item currently
+	// being processed (globals $item and $effective_url); values are HTML-escaped.
+	if ($string == '') return $string;
+	global $item, $effective_url;
+	return str_replace(array('{url}', '{effective-url}'), array(htmlspecialchars($item->get_permalink()), htmlspecialchars($effective_url)), $string);
+}
 ?>
\ No newline at end of file