Full-Text RSS 3.8

2019-04-04 23:46:36 +02:00 · 2019-04-04 23:46:36 +02:00 · 1ec2f36b3e
commit 1ec2f36b3e
parent 954e765b5a
21 changed files with 635 additions and 545 deletions
--- a/changelog.txt
+++ b/changelog.txt
@ -2,6 +2,22 @@ FiveFilters.org: Full-Text RSS
 http://fivefilters.org/content-only/
 CHANGELOG
 ------------------------------------
+3.8 (2017-09-25)
+ - New site config directive: strip_attr: XPath attribute selector (e.g. //img/@srcset) - remove attribute from element
+ - New site config directive: insert_detected_image: yes/no (default yes) - inserts the image from og:image at the beginning of the body if no other images were extracted
+ - Bug fix: Better handling of Internationalized Domain Names (IDNs)
+ - Bug fix: Relative base URLs (<base>) now resolved against page URL
+ - Bug fix: Wrong site config file chosen in certain cases (when both wildcard and exact subdomain files were available and cached in APCu)
+ - Bug fix: &apos; HTML entities not converted correctly when parsing with Gumbo PHP
+ - Remove srcset (+ sizes) attributes on img elements if it looks like they only contain relative URLs (browser will use src attribute value instead)
+ - https:// URLs now re-written to sec:// before being submitted to avoid overzealous security software blocking request on some servers - no redirect, only affects newly submitted URLs on index.php
+ - HTML5-PHP library updated
+ - Language Detect library updated
+ - Site config files updated for better extraction
+ - Minimum PHP version is now 5.4. If you must use PHP 5.3, please stick with Full-Text RSS 3.7
+ - Tested with PHP 7.2
+ - Other fixes/improvements
+ 
 3.7 (2017-02-12)
 - Request HTML5 output using HTML5-PHP - new config option $options->html5_output and new request parameter &content=html5
 - Improve support for lazy-loading images
--- a/config.php
+++ b/config.php
@ -61,16 +61,15 @@ $options->content = 'user';

 // HTML5 output
 // ----------------------
-// By default, Full-Text RSS uses libxml to convert the parsed DOM tree back into HTML.
-// If this is enabled, we'll use HTML5-PHP to produce the HTML. This will be a little
-// slower, but might produce better results, adhering to the HTML5 spec.
-//
-// Note: in a future release we might make HTML5 output the default.
+// Full-Text RSS used to rely on libxml to output HTML extracted from
+// a web page. Since version 3.8 we use HTML5-PHP by default.
+// If you prefer the old output, either set this to false or pass &content=1 
+// in the querystring.
 // 
 // Possible values...
 // HTML5 (slower): true
 // libxml (faster): false
-// libxml unless user overrides (&content=html5): 'user' (default)
+// HTML5 unless user overrides (&content=1): 'user' (default)
 $options->html5_output = 'user';

 // Excerpts
@ -524,7 +523,7 @@ $options->cache_cleanup = 100;
 /// DO NOT CHANGE ANYTHING BELOW THIS ///////////
 /////////////////////////////////////////////////

-if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '3.7');
+if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '3.8');

 if (basename(__FILE__) == 'config.php') {
 	if (file_exists(dirname(__FILE__).'/custom_config.php')) {
--- a/ftr_compatibility_test.php
+++ b/ftr_compatibility_test.php
@ -16,12 +16,12 @@ SimplePie.org. We have kept most of their checks intact as we use SimplePie in o
 http://github.com/simplepie/simplepie/tree/master/compatibility_test/
 */

-$app_name = 'Full-Text RSS 3.7';
+$app_name = 'Full-Text RSS 3.8';

 // Full-Text RSS is not yet compatible with HHVM, that's why we check for it with HHVM_VERSION.
 //$php_ok = (function_exists('version_compare') && version_compare(phpversion(), '5.2.0', '>=') && !defined('HHVM_VERSION'));
 // HHVM works okay, but no Tidy and autoupdate of site config files not working (tested 3.7.1)
-$php_ok = (function_exists('version_compare') && version_compare(phpversion(), '5.3.0', '>='));
+$php_ok = (function_exists('version_compare') && version_compare(phpversion(), '5.4.0', '>='));
 $pcre_ok = extension_loaded('pcre');
 $zlib_ok = extension_loaded('zlib');
 $mbstring_ok = extension_loaded('mbstring');
@ -32,6 +32,7 @@ $parallel_ok = ((extension_loaded('http') && class_exists('http\Client\Request')
 $allow_url_fopen_ok = (bool)ini_get('allow_url_fopen');
 $filter_ok = extension_loaded('filter');
 $gumbo_ok = class_exists('Layershifter\Gumbo\Parser');
+$idn_ok = function_exists('idn_to_ascii');

 if (extension_loaded('xmlreader')) {
 	$xml_ok = true;
@ -204,7 +205,7 @@ div.chunk {
 				<tbody>
 					<tr class="<?php echo ($php_ok) ? 'enabled' : 'disabled'; ?>">
 						<td>PHP</td>
-						<td>5.3 or higher</td>
+						<td>5.4 or higher</td>
 						<td><?php echo phpversion(); ?></td>
 					</tr>
 					<tr class="<?php echo ($xml_ok) ? 'enabled, and sane' : 'disabled, or broken'; ?>">
@ -354,6 +355,11 @@ div.chunk {

 		<div class="chunk">
 			<h3>Further info</h3>
+
+			<h4>IDN support</h4>
+			<p>When treating an <a href="https://en.wikipedia.org/wiki/Internationalized_domain_name">internationalized domain name (IDN)</a> Full-Text RSS will try to make use of PHP's <code>idn_to_ascii</code> function to convert the domain to ASCII. If this function does not exist, you might have trouble retrieving article content from internationalized domains.</p>
+			<p class="highlight"><strong>idn_to_ascii</strong> is <?php if (!$idn_ok) echo '<strong>not</strong>'; ?> available on this server.</p>
+
 			<h4>HTTP module</h4>
 			<p>Full-Text RSS can make use of PHP's HTTP extension or <code>curl_multi</code> to make parallel HTTP requests when processing feeds. If neither are available, it will make sequential requests using <code>file_get_contents</code>.</p>
 			<?php 
--- a/index.php
+++ b/index.php
@ -25,6 +25,7 @@ if (!defined('_FF_FTR_INDEX')) {
 		// remove http scheme from urls before submitting
 		$('#form').submit(function() {
 			$('#url').val($('#url').val().replace(/^http:\/\//i, ''));
+			$('#url').val($('#url').val().replace(/^https:\/\//i, 'sec://'));
 			return true;
 		});
 		// popovers
@ -271,8 +272,8 @@ if (!defined('_FF_FTR_INDEX')) {

 		<tr>
 			<td>content</td>
-			<td><tt>0</tt>, <tt>1</tt> (default), <tt>html5</tt></td>
-			<td>If set to 0, the extracted content will not be included in the output. If set to html5, we'll output HTML5.</td>
+			<td><tt>0</tt>, <tt>1</tt>, <tt>html5</tt> (default)</td>
+			<td>If set to 0, the extracted content will not be included in the output. If set to 1, we'll use regular libxml output - might not be HTML5 compliant.</td>
 		</tr>
 		
 		<tr>
--- a/libraries/content-extractor/ContentExtractor.php
+++ b/libraries/content-extractor/ContentExtractor.php
@ -5,8 +5,8 @@
 * Uses patterns specified in site config files and auto detection (hNews/PHP Readability) 
 * to extract content from HTML files.
 * 
- * @version 1.3
- * @date 2017-02-12
+ * @version 1.4
+ * @date 2017-09-25
 * @author Keyvan Minoukadeh
 * @copyright 2017 Keyvan Minoukadeh
 * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
@ -107,24 +107,13 @@ class ContentExtractor
 	}
 	
 	// returns SiteConfig instance (joined in order: exact match, wildcard, fingerprint, global, default)
-	public function buildSiteConfig($url, $html='', $add_to_cache=true) {
+	public function buildSiteConfig($url, $html='') {
 		// extract host name
 		$host = @parse_url($url, PHP_URL_HOST);
 		$host = strtolower($host);
 		if (substr($host, 0, 4) == 'www.') $host = substr($host, 4);
-		// is merged version already cached?
-		if (SiteConfig::is_cached("$host.merged")) {
-			$config = SiteConfig::build("$host.merged");
-			if ($config) {
-				$this->debug("Returning cached and merged site config for $host");
-				return $config;
-			}
-		}
 		// let's build from site_config/custom/ and standard/
 		$config = SiteConfig::build($host);
-		if ($add_to_cache && $config && !SiteConfig::is_cached("$host")) {
-			SiteConfig::add_to_cache($host, $config);
-		}
 		// if no match, use defaults
 		if (!$config) $config = new SiteConfig();
 		// load fingerprint config?
@ -134,10 +123,6 @@ class ContentExtractor
 				if ($config_fingerprint = SiteConfig::build($_fphost)) {
 					$this->debug("Appending site config settings from $_fphost (fingerprint match)");
 					$config->append($config_fingerprint);
-					if ($add_to_cache && !SiteConfig::is_cached($_fphost)) {
-						//$config_fingerprint->cache_in_apc = true;
-						SiteConfig::add_to_cache($_fphost, $config_fingerprint);
-					}
 				}
 			}
 		}
@ -146,19 +131,8 @@ class ContentExtractor
 			if ($config_global = SiteConfig::build('global', true)) {
 				$this->debug('Appending site config settings from global.txt');
 				$config->append($config_global);
-				if ($add_to_cache && !SiteConfig::is_cached('global')) {
-					//$config_global->cache_in_apc = true;
-					SiteConfig::add_to_cache('global', $config_global);
 			}
 		}
-		}
-		// store copy of merged config
-		if ($add_to_cache) {
-			// do not store in APC if wildcard match
-			$use_apc = ($host == $config->cache_key);
-			$config->cache_key = null;
-			SiteConfig::add_to_cache("$host.merged", $config, $use_apc);
-		}
 		return $config;
 	}
 	
@ -398,14 +372,18 @@ class ContentExtractor
 			$elems = @$xpath->query($pattern, $this->readability->dom);
 			// check for matches
 			if ($elems && $elems->length > 0) {
-				$this->debug('Stripping '.$elems->length.' elements (strip)');
+				$this->debug('Stripping '.$elems->length.' elements (strip: '.$pattern.')');
 				for ($i=$elems->length-1; $i >= 0; $i--) {
 					if ($elems->item($i)->parentNode) {
+						if ($elems->item($i) instanceof DOMAttr) {
+							$elems->item($i)->parentNode->removeAttributeNode($elems->item($i));
+						} else {
 							$elems->item($i)->parentNode->removeChild($elems->item($i));
 						}
 					}
 				}
 			}
+		}
 		
 		// strip elements (using id and class attribute values)
 		foreach ($this->config->strip_id_or_class as $string) {
@ -413,7 +391,7 @@ class ContentExtractor
 			$elems = @$xpath->query("//*[contains(@class, '$string') or contains(@id, '$string')]", $this->readability->dom);
 			// check for matches
 			if ($elems && $elems->length > 0) {
-				$this->debug('Stripping '.$elems->length.' elements (strip_id_or_class)');
+				$this->debug('Stripping '.$elems->length.' elements (strip_id_or_class: '.$string.')');
 				for ($i=$elems->length-1; $i >= 0; $i--) {
 					$elems->item($i)->parentNode->removeChild($elems->item($i));
 				}
@ -426,12 +404,13 @@ class ContentExtractor
 			$elems = @$xpath->query("//img[contains(@src, '$string')]", $this->readability->dom);
 			// check for matches
 			if ($elems && $elems->length > 0) {
-				$this->debug('Stripping '.$elems->length.' image elements');
+				$this->debug('Stripping '.$elems->length.' elements (strip_image_src: '.$string.')');
 				for ($i=$elems->length-1; $i >= 0; $i--) {
 					$elems->item($i)->parentNode->removeChild($elems->item($i));
 				}
 			}
 		}
+
 		// strip elements using Readability.com and Instapaper.com ignore class names
 		// .entry-unrelated and .instapaper_ignore
 		// See https://www.readability.com/publishers/guidelines/#view-plainGuidelines
@ -465,6 +444,21 @@ class ContentExtractor
 			}
 		}

+		// strip img srcset/sizes attributes with relative URIs (src should be present and will be absolutised)
+		// TODO: absolutize srcset values rather than removing them
+		// To remove srcset from all image elements, site config files can contain: strip: //img/@srcset
+		$elems = $xpath->query("//img[@srcset and not(contains(@srcset, '//'))]", $this->readability->dom);
+		// check for matches
+		if ($elems && $elems->length > 0) {
+			$this->debug('Stripping '.$elems->length.' srcset attributes');
+			foreach ($elems as $elem) {
+				$elem->removeAttribute('srcset');
+				if ($elem->hasAttribute('sizes')) {
+					$elem->removeAttribute('sizes');
+				}
+			}
+		}
+
 		// try to get body
 		foreach ($this->config->body as $pattern) {
 			$elems = @$xpath->query($pattern, $this->readability->dom);
@ -880,7 +874,7 @@ class ContentExtractor
 				}
 			} else {
 				// If there's an og:image, but we have no images in the article, let's place it at the beginning of the article.
-				if ($this->body->hasChildNodes() && isset($this->opengraph['og:image']) && substr($this->opengraph['og:image'], 0, 4) === 'http') {
+				if ($this->config->insert_detected_image() && $this->body->hasChildNodes() && isset($this->opengraph['og:image']) && substr($this->opengraph['og:image'], 0, 4) === 'http') {
 					$elems = @$xpath->query(".//img", $this->body);
 					if ($elems->length === 0) {
 						$_new_elem = $this->body->ownerDocument->createDocumentFragment();
--- a/libraries/content-extractor/SiteConfig.php
+++ b/libraries/content-extractor/SiteConfig.php
@ -5,10 +5,10 @@
 * Each instance of this class should hold extraction patterns and other directives
 * for a website. See ContentExtractor class to see how it's used.
 * 
- * @version 1.0
- * @date 2015-06-09
+ * @version 1.1
+ * @date 2017-09-25
 * @author Keyvan Minoukadeh
- * @copyright 2015 Keyvan Minoukadeh
+ * @copyright 2017 Keyvan Minoukadeh
 * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
 */

@ -43,7 +43,6 @@ class SiteConfig
 	
 	// Process HTML with tidy before creating DOM (bool or null if undeclared)
 	public $tidy = null;
-	
 	protected $default_tidy = true; // used if undeclared
 	
 	// Autodetect title/body if xpath expressions fail to produce results.
@ -93,6 +92,12 @@ class SiteConfig
 	public $parser = null;
 	protected $default_parser = 'libxml'; // used if undeclared
 	
+	// Insert detected image (currently only og:image) into beginning of extracted article
+	// Only does this if extracted article contains no images
+	// bool or null if undeclared
+	public $insert_detected_image = null;
+	protected $default_insert_detected_image = true; // used if undeclared
+
 	// Strings to search for in HTML before processing begins (used with $replace_string)
 	public $find_string = array();
 	// Strings to replace those found in $find_string before HTML processing begins
@ -101,10 +106,9 @@ class SiteConfig
 	// the options below cannot be set in the config files which this class represents
 	
 	//public $cache_in_apc = false; // used to decide if we should cache in apc or not
-	public $cache_key = null;
 	public static $debug = false;
 	protected static $apc = false;
-	protected static $config_path;
+	protected static $config_path_custom;
 	protected static $config_path_fallback;
 	protected static $config_cache = array();
 	const HOSTNAME_REGEX = '/^(([a-zA-Z0-9-]*[a-zA-Z0-9])\.)*([A-Za-z0-9-]*[A-Za-z0-9])$/';
@ -137,6 +141,12 @@ class SiteConfig
 		return $apc;
 	}

+	// return bool or null
+	public function insert_detected_image($use_default=true) {
+		if ($use_default) return (isset($this->insert_detected_image)) ? $this->insert_detected_image : $this->default_insert_detected_image;
+		return $this->insert_detected_image;
+	}
+
 	// return bool or null
 	public function tidy($use_default=true) {
 		if ($use_default) return (isset($this->tidy)) ? $this->tidy : $this->default_tidy;
@ -162,15 +172,32 @@ class SiteConfig
 	}
 	
 	public static function set_config_path($path, $fallback=null) {
-		self::$config_path = $path;
+		self::$config_path_custom = $path;
 		self::$config_path_fallback = $fallback;
 	}

+	protected static function load_cached_merged($host, $exact_host_match) {
+		if ($exact_host_match) {
+			$key = $host.'.merged.ex';
+		} else {
+			$key = $host.'.merged';
+		}
+		return self::load_cached($key);
+	}
+
+	protected static function add_to_cache_merged($host, $exact_host_match, SiteConfig $config=null) {
+		if ($exact_host_match) {
+			$key = $host.'.merged.ex';
+		} else {
+			$key = $host.'.merged';
+		}
+		if (!isset($config)) $config = new SiteConfig();
+		self::add_to_cache($key, $config);
+	}
+
 	public static function add_to_cache($key, SiteConfig $config, $use_apc=true) {
 		$key = strtolower($key);
 		if (substr($key, 0, 4) == 'www.') $key = substr($key, 4);
-		if ($config->cache_key) $key = $config->cache_key;
-		$key .= '.'.self::get_key_suffix();		
 		self::$config_cache[$key] = $config;
 		if (self::$apc && $use_apc) {
 			self::debug("Adding site config to APC cache with key sc.$key");
@ -179,9 +206,22 @@ class SiteConfig
 		self::debug("Cached site config with key $key");
 	}

+	public static function load_cached($key) {
+		$key = strtolower($key);
+		if (substr($key, 0, 4) == 'www.') $key = substr($key, 4);
+		//var_dump('in cache?', $key, self::$config_cache);
+		if (array_key_exists($key, self::$config_cache)) {
+			self::debug("... site config for $key already loaded in this request");
+			return self::$config_cache[$key];
+		} elseif (self::$apc && ($sconfig = apc_fetch("sc.$key"))) {
+			self::debug("... site config for $key found in APCu");
+			return $sconfig;
+		}
+		return false;
+	}
+
 	public static function is_cached($key) {
 		$key = strtolower($key);
-		$key .= '.'.self::get_key_suffix();
 		if (substr($key, 0, 4) == 'www.') $key = substr($key, 4);
 		if (array_key_exists($key, self::$config_cache)) {
 			return true;
@ -212,7 +252,7 @@ class SiteConfig
 		}
 		// check for single statement commands
 		// we do not overwrite existing non null values
-		foreach (array('tidy', 'prune', 'parser', 'autodetect_on_failure') as $var) {
+		foreach (array('tidy', 'prune', 'parser', 'autodetect_on_failure', 'insert_detected_image') as $var) {
 			if ($this->$var === null) $this->$var = $newconfig->$var;
 		}
 		// treat find_string and replace_string separately (don't apply array_unique) (thanks fabrizio!)
@ -223,16 +263,6 @@ class SiteConfig
 		}
 	}

-	// This is used to make sure that when a different primary folder is chosen
-	// The key for the cached result includes that folder choice.
-	// Otherwise, a subsequent request choosing a different folder
-	// could return the wrong cached config.
-	public static function get_key_suffix() {
-		$key_suffix = basename(self::$config_path);
-		if ($key_suffix === 'custom') $key_suffix = '';
-		return $key_suffix;
-	}
-
 	// Add test_contains to last test_url
 	public function add_test_contains($test_contains) {
 		if (!empty($this->test_url)) {
@ -274,6 +304,12 @@ class SiteConfig
 		$host = strtolower($host);
 		if (substr($host, 0, 4) == 'www.') $host = substr($host, 4);
 		if (!$host || (strlen($host) > 200) || !preg_match(self::HOSTNAME_REGEX, ltrim($host, '.'))) return false;
+		// got a merged one?
+		$config = self::load_cached_merged($host, $exact_host_match);
+		if ($config) {
+			//self::debug('. returned merged config from a previous request');
+			return $config;
+		}
 		// check for site configuration
 		$try = array($host);
 		// should we look for wildcard matches 
@ -285,99 +321,84 @@ class SiteConfig
 			}
 		}

-		// Which primary folder should we look inside?
-		// If it's not the default ('custom'), we need
-		// a key suffix to distinguish site config fules
-		// held in this folder from those in other folders.
-		$key_suffix = self::get_key_suffix();
-
-		// look for site config file in primary folder
-		self::debug(". looking for site config for $host in primary folder");
+		// look for site config file in custom folder
+		self::debug(". looking for site config for $host in custom folder");
+		//var_dump($try);
+		$config = null;
+		$config_std = null;
 		foreach ($try as $h) {
-			$h_key = "$h.$key_suffix";
-			if (array_key_exists($h_key, self::$config_cache)) {
-				self::debug("... site config for $h already loaded in this request");
-				return self::$config_cache[$h_key];
-			} elseif (self::$apc && ($sconfig = apc_fetch("sc.$h_key"))) {
-				self::debug("... site config for $h in APC cache");
-				return $sconfig;
-			} elseif (file_exists(self::$config_path."/$h.txt")) {
+			//$h_key = $h.'.'.$key_suffix;
+			$h_key = $h.'.custom';
+			//var_dump($h_key, $h);
+			if ($config = self::load_cached($h_key)) {
+				break;
+			} elseif (file_exists(self::$config_path_custom."/$h.txt")) {
 				self::debug("... found site config ($h.txt)");
-				$file_primary = self::$config_path."/$h.txt";
-				$matched_name = $h;
+				$file_custom = self::$config_path_custom."/$h.txt";
+				$config = self::build_from_file($file_custom);
+				//$matched_name = $h;
 				break;
 			}
 		}
 		
 		// if we found site config, process it
-		if (isset($file_primary)) {
-			$config_lines = file($file_primary, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
-			if (!$config_lines || !is_array($config_lines)) return false;
-			$config = self::build_from_array($config_lines);
-			// if APC caching is available and enabled, mark this for cache
-			//$config->cache_in_apc = true;
-			$config->cache_key = $matched_name;
-			
 		// if autodetec on failure is off (on by default) we do not need to look
 		// in secondary folder
-			if (!$config->autodetect_on_failure()) {
+		if ($config && !$config->autodetect_on_failure()) {
 			self::debug('... autodetect on failure is disabled (no other site config files will be loaded)');
+			self::add_to_cache_merged($host, $exact_host_match, $config);
 			return $config;
 		}
-		}
 		
 		// look for site config file in secondary folder
 		if (isset(self::$config_path_fallback)) {
-			self::debug(". looking for site config for $host in secondary folder");
+			self::debug(". looking for site config for $host in standard folder");
 			foreach ($try as $h) {
-				if (file_exists(self::$config_path_fallback."/$h.txt")) {
-					self::debug("... found site config in secondary folder ($h.txt)");
+				if ($config_std = self::load_cached($h)) {
+					break;
+				} elseif (file_exists(self::$config_path_fallback."/$h.txt")) {
+					self::debug("... found site config in standard folder ($h.txt)");
 					$file_secondary = self::$config_path_fallback."/$h.txt";
-					$matched_name = $h;
+					$config_std = self::build_from_file($file_secondary);
 					break;
 				}
 			}
-			if (!isset($file_secondary)) {
-				self::debug("... no site config match in secondary folder");
-			}
 		}
 		
 		// return false if no config file found
-		if (!isset($file_primary) && !isset($file_secondary)) {
+		if (!$config && !$config_std) {
 			self::debug("... no site config match for $host");
+			self::add_to_cache_merged($host, $exact_host_match);
 			return false;
 		}
 		
-		// return primary config if secondary not found
-		if (!isset($file_secondary) && isset($config)) {
-			return $config;
-		}
-		
-		// process secondary config file
-		$config_lines = file($file_secondary, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
-		if (!$config_lines || !is_array($config_lines)) {
-			// failed to process secondary
-			if (isset($config)) {
-				// return primary config
-				return $config;
-			} else {
-				return false;
-			}
-		}
-		
-		// merge with primary and return
-		if (isset($config)) {
+		// final config handling
+		$config_final = null;
+		if (!$config_std && $config) {
+			$config_final = $config;
+		// merge with primary
+		} elseif ($config_std && $config) {
 			self::debug('. merging config files');
-			$config->append(self::build_from_array($config_lines));
-			return $config;
+			$config->append($config_std);
+			$config_final = $config;
 		} else {
 			// return just secondary
-			$config = self::build_from_array($config_lines);
+			//$config = self::build_from_array($config_lines);
 			// if APC caching is available and enabled, mark this for cache
 			//$config->cache_in_apc = true;
-			$config->cache_key = $matched_name;
-			return $config;
+			$config_final = $config_std;
 		}
+		self::add_to_cache_merged($host, $exact_host_match, $config_final);
+		return $config_final;
+	}
+	
+	public static function build_from_file($path, $cache=true) {
+		$key = basename($path, '.txt');
+		$config_lines = file($path, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
+		if (!$config_lines || !is_array($config_lines)) return false;
+		$config = self::build_from_array($config_lines);
+		if ($cache) self::add_to_cache($key, $config);
+		return $config;
 	}

 	public static function build_from_string($string) {
@ -399,13 +420,23 @@ class SiteConfig
 			if (count($command) != 2) continue;
 			$val = trim($command[1]);
 			$command = trim($command[0]);
-			if ($command == '' || $val == '') continue;
+			//if ($command == '' || $val == '') continue;
+			// $val can be empty, e.g. replace_string: 
+			if ($command == '') continue;
+
+			// strip_attr is now an alias for strip.
+			// In FTR 3.8 we can strip attributes from elements, not only the elements themselves
+			// e.g. strip: //img/@srcset (removes srcset attribute from all img elements)
+			// but for backward compatibility (to avoid errors with new config files + old version of FTR)
+			// we've introduced strip_attr and we'll recommend using that in our public site config repository.
+			// strip_attr: //img/@srcset
+			if ($command == 'strip_attr') $command = 'strip';

 			// check for commands where we accept multiple statements
 			if (in_array($command, array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'native_ad_clue', 'http_header', 'test_url', 'find_string', 'replace_string'))) {
 				array_push($config->$command, $val);
 			// check for single statement commands that evaluate to true or false
-			} elseif (in_array($command, array('tidy', 'prune', 'autodetect_on_failure'))) {
+			} elseif (in_array($command, array('tidy', 'prune', 'autodetect_on_failure', 'insert_detected_image'))) {
 				$config->$command = ($val == 'yes');
 			// check for single statement commands stored as strings
 			} elseif (in_array($command, array('parser'))) {
--- a/libraries/feedwriter/FeedItem.php
+++ b/libraries/feedwriter/FeedItem.php
@ -186,5 +186,4 @@
 		$this->setElement('enclosure','',$attributes);
 	}
 	
- } // end of class FeedItem
-?>
+ }
--- a/libraries/feedwriter/FeedWriter.php
+++ b/libraries/feedwriter/FeedWriter.php
@ -1,4 +1,6 @@
 <?php
+define('ATOM', -1); // unused
+define('RSS1', 0); // unused
 define('RSS2', 1);
 define('JSON', 2);
 define('JSONP', 3);
--- a/libraries/htmLawed/htmLawed.php
+++ b/libraries/htmLawed/htmLawed.php
--- a/libraries/html5php/HTML5/Parser/DOMTreeBuilder.php
+++ b/libraries/html5php/HTML5/Parser/DOMTreeBuilder.php
@ -274,7 +274,8 @@ class DOMTreeBuilder implements EventHandler

        // SPECIAL TAG HANDLING:
        // Spec says do this, and "don't ask."
-        if ($name == 'image') {
+        // TODO: find where the spec defines this 'image' -> 'img' rewrite... it looks problematic (not applied while in SVG or MathML insert mode)
+        if ($name == 'image' && !($this->insertMode === static::IM_IN_SVG || $this->insertMode === static::IM_IN_MATHML)) {
            $name = 'img';
        }

--- a/libraries/html5php/HTML5/Parser/Tokenizer.php
+++ b/libraries/html5php/HTML5/Parser/Tokenizer.php
@ -83,11 +83,8 @@ class Tokenizer
     */
    public function parse()
    {
-        $p = 0;
        do {
-            $p = $this->scanner->position();
            $this->consumeData();
-
            // FIXME: Add infinite loop protection.
        } while ($this->carryOn);
    }
@ -145,7 +142,8 @@ class Tokenizer
     */
    protected function characterData()
    {
-        if ($this->scanner->current() === false) {
+        $tok = $this->scanner->current();
+        if ($tok === false) {
            return false;
        }
        switch ($this->textMode) {
@ -154,7 +152,6 @@ class Tokenizer
            case Elements::TEXT_RCDATA:
                return $this->rcdata();
            default:
-                $tok = $this->scanner->current();
                if (strspn($tok, "<&")) {
                    return false;
                }
@ -408,24 +405,26 @@ class Tokenizer
        if ($tok == '/') {
            $this->scanner->next();
            $this->scanner->whitespace();
-            if ($this->scanner->current() == '>') {
+            $tok = $this->scanner->current();
+
+            if ($tok == '>') {
                $selfClose = true;
                return true;
            }
-            if ($this->scanner->current() === false) {
+            if ($tok === false) {
                $this->parseError("Unexpected EOF inside of tag.");
                return true;
            }
            // Basically, we skip the / token and go on.
            // See 8.2.4.43.
-            $this->parseError("Unexpected '%s' inside of a tag.", $this->scanner->current());
+            $this->parseError("Unexpected '%s' inside of a tag.", $tok);
            return false;
        }

-        if ($this->scanner->current() == '>') {
+        if ($tok == '>') {
            return true;
        }
-        if ($this->scanner->current() === false) {
+        if ($tok === false) {
            $this->parseError("Unexpected EOF inside of tag.");
            return true;
        }
@ -541,15 +540,21 @@ class Tokenizer
    {
        $stoplist = "\f" . $quote;
        $val = '';
-        $tok = $this->scanner->current();
-        while (strspn($tok, $stoplist) == 0 && $tok !== false) {
-            if ($tok == '&') {
-                $val .= $this->decodeCharacterReference(true);
-                $tok = $this->scanner->current();
+
+        while (true) {
+            $tokens = $this->scanner->charsUntil($stoplist.'&');
+            if ($tokens !== false) {
+                $val .= $tokens;
            } else {
-                $val .= $tok;
-                $tok = $this->scanner->next();
+                break;
            }
+
+            $tok = $this->scanner->current();
+            if ($tok == '&') {
+                $val .= $this->decodeCharacterReference(true, $tok);
+                continue;
+            }
+            break;
        }
        $this->scanner->next();
        return $val;
@ -591,18 +596,18 @@ class Tokenizer
     */
    protected function bogusComment($leading = '')
    {
-
-        // TODO: This can be done more efficiently when the
-        // scanner exposes a readUntil() method.
        $comment = $leading;
+        $tokens = $this->scanner->charsUntil('>');
+        if ($tokens !== false) {
+            $comment .= $tokens;
+        }
        $tok = $this->scanner->current();
-        do {
+        if ($tok !== false) {
            $comment .= $tok;
-            $tok = $this->scanner->next();
-        } while ($tok !== false && $tok != '>');
+        }

        $this->flushBuffer();
-        $this->events->comment($comment . $tok);
+        $this->events->comment($comment);
        $this->scanner->next();

        return true;
@ -646,15 +651,17 @@ class Tokenizer
     */
    protected function isCommentEnd()
    {
+        $tok = $this->scanner->current();
+
        // EOF
-        if ($this->scanner->current() === false) {
+        if ($tok === false) {
            // Hit the end.
            $this->parseError("Unexpected EOF in a comment.");
            return true;
        }

        // If it doesn't start with -, not the end.
-        if ($this->scanner->current() != '-') {
+        if ($tok != '-') {
            return false;
        }

@ -737,7 +744,6 @@ class Tokenizer

        $pub = strtoupper($this->scanner->getAsciiAlpha());
        $white = strlen($this->scanner->whitespace());
-        $tok = $this->scanner->current();

        // Get ID, and flag it as pub or system.
        if (($pub == 'PUBLIC' || $pub == 'SYSTEM') && $white > 0) {
@ -938,10 +944,11 @@ class Tokenizer
        $len = strlen($sequence);
        $buffer = '';
        for ($i = 0; $i < $len; ++ $i) {
-            $buffer .= $this->scanner->current();
+            $tok = $this->scanner->current();
+            $buffer .= $tok;

            // EOF. Rewind and let the caller handle it.
-            if ($this->scanner->current() === false) {
+            if ($tok === false) {
                $this->scanner->unconsume($i);
                return false;
            }
@ -1067,18 +1074,22 @@ class Tokenizer
                }
                $entity = CharacterReference::lookupDecimal($numeric);
            }
-        }         // String entity.
-        else {
+        } elseif ($tok === '=' && $inAttribute) {
+            return '&';
+        } else { // String entity.
+
            // Attempt to consume a string up to a ';'.
            // [a-zA-Z0-9]+;
-            $cname = $this->scanner->getAsciiAlpha();
+            $cname = $this->scanner->getAsciiAlphaNum();
            $entity = CharacterReference::lookupName($cname);

            // When no entity is found provide the name of the unmatched string
            // and continue on as the & is not part of an entity. The & will
            // be converted to &amp; elsewhere.
            if ($entity == null) {
+                if (!$inAttribute || strlen($cname) === 0) {
                    $this->parseError("No match in entity table for '%s'", $cname);
+                }
                $this->scanner->unconsume($this->scanner->position() - $start);
                return '&';
            }
--- a/libraries/html5php/README.md
+++ b/libraries/html5php/README.md
@ -1,14 +1,16 @@
 # HTML5-PHP

-The need for an HTML5 parser in PHP is clear. This project initially
-began with the seemingly abandoned `html5lib` project [original source](https://code.google.com/p/html5lib/source/checkout).
-But after some initial refactoring work, we began a new parser.
+HTML5 is a standards-compliant HTML5 parser and writer written entirely in PHP.
+It is stable and used in many production websites, and has
+well over [one million downloads](https://packagist.org/packages/masterminds/html5).
+
+HTML5 provides the following features.

 - An HTML5 serializer
 - Support for PHP namespaces
 - Composer support
 - Event-based (SAX-like) parser
- DOM tree builder
+- A DOM tree builder
 - Interoperability with [QueryPath](https://github.com/technosophos/querypath)
 - Runs on **PHP** 5.3.0 or newer and **HHVM** 3.2 or newer

@ -16,6 +18,7 @@ But after some initial refactoring work, we began a new parser.
 [![Latest Stable Version](https://poser.pugx.org/masterminds/html5/v/stable.png)](https://packagist.org/packages/masterminds/html5)
 [![Code Coverage](https://scrutinizer-ci.com/g/Masterminds/html5-php/badges/coverage.png?b=master)](https://scrutinizer-ci.com/g/Masterminds/html5-php/?branch=master)
 [![Scrutinizer Code Quality](https://scrutinizer-ci.com/g/Masterminds/html5-php/badges/quality-score.png?b=master)](https://scrutinizer-ci.com/g/Masterminds/html5-php/?branch=master)
+[![Stability: Sustained](https://masterminds.github.io/stability/sustained.svg)](https://masterminds.github.io/stability/sustained.html)

 ## Installation

@ -23,7 +26,7 @@ Install HTML5-PHP using [composer](http://getcomposer.org/).

 To install, add `masterminds/html5` to your `composer.json` file:

-```
+```json
 {
  "require" : {
    "masterminds/html5": "2.*"
--- a/libraries/html5php/RELEASE.md
+++ b/libraries/html5php/RELEASE.md
@ -1,6 +1,13 @@
 # Release Notes

-2.2.2 (2016-10-22)
+2.3.0 (2017-09-04)
+
+- #129: image within inline svg breaks system (fixed by #133) 
+- #131: &sup2; does not work (fixed by #132)
+- #134: Improve tokenizer performance by 20% (alternative version of #130 thanks to @MichaelHeerklotz)
+- #135: Raw & in attributes
+
+2.2.2 (2016-09-22)

 - #116: In XML mode, tags are case sensitive
 - #115: Fix PHP Notice in OutputRules
@ -14,8 +21,7 @@
 2.2.0 (2016-04-11)

 - #105: Enable composer cache (for CI/CD)
- #100: Use mb_substitute_character inset of ini_set for environments where
-  ini_set is disable (e.g., shared hosting)
+- #100: Use mb_substitute_character instead of ini_set for environments where ini_set is disabled (e.g., shared hosting)
 - #98: Allow link, meta, style tags in noscript tags
 - #96: Fixed xml:href on svgs that use the "use" breaking
 - #94: Counting UTF8 characters performance improvement
--- a/libraries/humble-http-agent/HumbleHttpAgent.php
+++ b/libraries/humble-http-agent/HumbleHttpAgent.php
@ -7,8 +7,8 @@
 * For environments which do not have these options, it reverts to standard sequential 
 * requests (using file_get_contents())
 * 
- * @version 1.7
- * @date 2016-11-28
+ * @version 1.8
+ * @date 2017-09-25
 * @see http://devel-m6w6.rhcloud.com/mdref/http
 * @author Keyvan Minoukadeh
 * @copyright 2011-2016 Keyvan Minoukadeh
@ -21,8 +21,9 @@ class HumbleHttpAgent
 	const METHOD_CURL_MULTI = 2;
 	const METHOD_FILE_GET_CONTENTS = 4;
 	//const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1';
-	const UA_BROWSER = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36';
-	const UA_PHP = 'PHP/5.6';
+	// popular user agents from https://techblog.willshouse.com/2012/01/03/most-common-user-agents/
+	const UA_BROWSER = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36';
+	const UA_PHP = 'PHP/7.1';
 	const REF_GOOGLE = 'http://www.google.co.uk/url?sa=t&source=web&cd=1';
 	
 	protected $requests = array();
@ -194,6 +195,24 @@ class HumbleHttpAgent
 	
 	public function getMetaRefreshURL($url, $html) {
 		if ($html == '') return false;
+
+		// TODO: parse HTML properly
+		// For now, to deal with cases where meta refresh matches but shouldn't, e.g. CNN's 
+		// <!--[if lte IE 9]><meta http-equiv="refresh" content="1;url=/2.37.2/static/unsupp.html" /><![endif]-->
+		// we do the string replacements in the site config file before looking for the meta refresh
+		if (isset($this->siteConfigBuilder)) {
+			$sconfig = $this->siteConfigBuilder->buildSiteConfig($url);
+			// do string replacements
+			if (!empty($sconfig->find_string)) {
+				if (count($sconfig->find_string) == count($sconfig->replace_string)) {
+					$html = str_replace($sconfig->find_string, $sconfig->replace_string, $html, $_count);
+					//$this->debug("Strings replaced: $_count (find_string and/or replace_string)");
+				} else {
+					//$this->debug('Skipped string replacement - incorrect number of find-replace strings in site config');
+				}
+			}
+		}
+
 		// <meta HTTP-EQUIV="REFRESH" content="0; url=http://www.bernama.com/bernama/v6/newsindex.php?id=943513">
 		if (!preg_match('!<meta http-equiv=["\']?refresh["\']? content=["\']?[0-9];\s*url=["\']?([^"\'>]+)["\']?!i', $html, $match)) {
 			return false;
@ -211,7 +230,7 @@ class HumbleHttpAgent
 		if (isset($base->path)) $base->path = preg_replace('!//+!', '/', $base->path);
 		if ($absolute = SimplePie_IRI::absolutize($base, $redirect_url)) {
 			$this->debug('Meta refresh redirect found (http-equiv="refresh"), new URL: '.$absolute);
-			return $absolute->get_iri();
+			return $absolute->get_uri();
 		}
 		return false;
 	}	
@ -248,6 +267,21 @@ class HumbleHttpAgent
 		}
 	}
 	
+	/**
+	 * Convert an internationalized host name (IDN) in a URL to its ASCII form.
+	 *
+	 * The URL is returned unchanged when idn_to_ascii() or the UTS #46 variant
+	 * constant is not available, when no host can be parsed from the URL, when
+	 * the conversion fails, or when the host is already in ASCII form.
+	 *
+	 * @param string $url URL whose host may need converting
+	 * @return string URL with the host replaced by its ASCII form where applicable
+	 */
+	public function convertIdn($url) {
+		if (!function_exists('idn_to_ascii') || !defined('INTL_IDNA_VARIANT_UTS46')) {
+			return $url;
+		}
+		$host = @parse_url($url, PHP_URL_HOST);
+		if (!$host) {
+			return $url;
+		}
+		$puny = idn_to_ascii($host, 0, INTL_IDNA_VARIANT_UTS46);
+		// idn_to_ascii() returns false on failure; substituting that into the URL
+		// would silently strip the host, so keep the URL as it was in that case.
+		if (!is_string($puny) || $puny === '' || $puny === $host) {
+			return $url;
+		}
+		$pos = strpos($url, $host);
+		if ($pos !== false) {
+			$url = substr_replace($url, $puny, $pos, strlen($host));
+		}
+		return $url;
+	}
+
 	public function rewriteUrls($url) {
 		foreach ($this->rewriteUrls as $find => $action) {
 			if (strpos($url, $find) !== false) {
@ -327,6 +361,7 @@ class HumbleHttpAgent
 						} else {
 							$this->debug("......adding to pool");
 							$req_url = $this->rewriteUrls($url);
+							$req_url = $this->convertIdn($req_url);
 							$req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url;
 							$req_url = $this->removeFragment($req_url);
 							if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) {
@ -507,6 +542,7 @@ class HumbleHttpAgent
 					} else {
 						$this->debug("......adding to pool");
 						$req_url = $this->rewriteUrls($url);
+						$req_url = $this->convertIdn($req_url);
 						$req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url;
 						$req_url = $this->removeFragment($req_url);
 						if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) {
@ -649,6 +685,7 @@ class HumbleHttpAgent
 					$this->debug("Sending request for $url");
 					$this->requests[$orig]['original_url'] = $orig;
 					$req_url = $this->rewriteUrls($url);
+					$req_url = $this->convertIdn($req_url);
 					$req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url;
 					$req_url = $this->removeFragment($req_url);
 					$httpContext = $this->httpContext;
--- a/libraries/language-detect/LanguageDetect.php
+++ b/libraries/language-detect/LanguageDetect.php
@ -1,5 +1,4 @@
 <?php
-
 /**
 * Detects the language of a given piece of text.
 *
@ -74,9 +73,8 @@ class Text_LanguageDetect
     * $this->_data_dir will be ignored
     *
     * @var string
-     * @access   private
     */
-    var $_db_filename = 'lang.dat';
+    protected $_db_filename = 'lang.dat';

    /**
     * The filename that stores the unicode block definitions
@ -85,9 +83,8 @@ class Text_LanguageDetect
     * $this->_data_dir will be ignored
     *
     * @var string
-     * @access private
     */
-    var $_unicode_db_filename = 'unicode_blocks.dat';
+    protected $_unicode_db_filename = 'unicode_blocks.dat';

    /**
     * The data directory
@ -95,9 +92,8 @@ class Text_LanguageDetect
     * Should be set by PEAR installer
     *
     * @var string
-     * @access   private
     */
-    var $_data_dir = '@data_dir@';
+    protected $_data_dir = '@data_dir@';

    /**
     * The trigram data for comparison
@ -105,63 +101,56 @@ class Text_LanguageDetect
     * Will be loaded on start from $this->_db_filename
     *
     * @var array
-     * @access   private
     */
-    var $_lang_db = array();
+    protected $_lang_db = array();

    /**
-     * stores the map of the trigram data to unicode characters
+     * Stores the map of the trigram data to unicode characters
     *
-     * @access private
     * @var array
     */
-    var $_unicode_map;
+    protected $_unicode_map;

    /**
     * The size of the trigram data arrays
     *
     * @var int
-     * @access   private
     */
-    var $_threshold = 300;
+    protected $_threshold = 300;

    /**
-     * the maximum possible score.
+     * The maximum possible score.
     *
-     * needed for score normalization. Different depending on the
+     * Needed for score normalization. Different depending on the
     * perl compatibility setting
     *
-     * @access  private
     * @var int
     * @see setPerlCompatible()
     */
-    var $_max_score = 0;
+    protected $_max_score = 0;

    /**
     * Whether or not to simulate perl's Language::Guess exactly
     *
-     * @access  private
     * @var bool
     * @see setPerlCompatible()
     */
-    var $_perl_compatible = false;
+    protected $_perl_compatible = false;

    /**
     * Whether to use the unicode block detection to speed up processing
     *
-     * @access private
     * @var bool
     */
-    var $_use_unicode_narrowing = true;
+    protected $_use_unicode_narrowing = true;

    /**
-     * stores the result of the clustering operation
+     * Stores the result of the clustering operation
     *
-     * @access  private
     * @var array
     * @see clusterLanguages()
     */
-    var $_clusters;
+    protected $_clusters;

    /**
     * Which type of "language names" are accepted and returned:
@ -170,7 +159,7 @@ class Text_LanguageDetect
     * 2 - 2-letter ISO 639-1 code ("en")
     * 3 - 3-letter ISO 639-2 code ("eng")
     */
-    var $_name_mode = 0;
+    protected $_name_mode = 0;

    /**
     * Constructor
@ -178,7 +167,7 @@ class Text_LanguageDetect
     * Will attempt to load the language database. If it fails, you will get
     * an exception.
     */
-    function __construct()
+    public function __construct()
    {
        $data = $this->_readdb($this->_db_filename);
        $this->_checkTrigram($data['trigram']);
@ -200,9 +189,8 @@ class Text_LanguageDetect
     * @param string $fname File name to load
     *
     * @return string expected path to the language model database
-     * @access private
     */
-    function _get_data_loc($fname)
+    protected function _get_data_loc($fname)
    {
        return dirname(__FILE__).'/'.$fname;
    }
@ -216,9 +204,8 @@ class Text_LanguageDetect
     *
     * @return array the language model data
     * @throws Text_LanguageDetect_Exception
-     * @access private
     */
-    function _readdb($fname)
+    protected function _readdb($fname)
    {
        // finds the correct data dir
        $fname = $this->_get_data_loc($fname);
@ -246,9 +233,8 @@ class Text_LanguageDetect
     * @param array $trigram Trigram data from database
     *
     * @return void
-     * @access private
     */
-    function _checkTrigram($trigram)
+    protected function _checkTrigram($trigram)
    {
        if (!is_array($trigram)) {
            if (ini_get('magic_quotes_runtime')) {
@ -340,11 +326,10 @@ class Text_LanguageDetect
    /**
     * Returns the number of languages that this object can detect
     *
-     * @access public
     * @return int            the number of languages
     * @throws Text_LanguageDetect_Exception
     */
-    function getLanguageCount()
+    public function getLanguageCount()
    {
        return count($this->_lang_db);
    }
@ -382,11 +367,10 @@ class Text_LanguageDetect
    /**
     * Returns the list of detectable languages
     *
-     * @access public
     * @return array        the names of the languages known to this object<<<<<<<
     * @throws Text_LanguageDetect_Exception
     */
-    function getLanguages()
+    public function getLanguages()
    {
        return $this->_convertToNameMode(
            array_keys($this->_lang_db)
@ -424,7 +408,7 @@ class Text_LanguageDetect
     *
     * @return void
     */
-    function setNameMode($name_mode)
+    public function setNameMode($name_mode)
    {
        $this->_name_mode = $name_mode;
    }
@ -454,10 +438,9 @@ class Text_LanguageDetect
     * @param string $text text to convert
     *
     * @return     array array of trigram frequencies
-     * @access     private
     * @deprecated Superceded by the Text_LanguageDetect_Parser class
     */
-    function _trigram($text)
+    protected function _trigram($text)
    {
        $s = new Text_LanguageDetect_Parser($text);
        $s->prepareTrigram();
@ -475,9 +458,8 @@ class Text_LanguageDetect
     * @param array $arr array of trigram
     *
     * @return array ranks of trigrams
-     * @access protected
     */
-    function _arr_rank($arr)
+    protected function _arr_rank($arr)
    {

        // sorts alphabetically first as a standard way of breaking rank ties
@ -505,12 +487,11 @@ class Text_LanguageDetect
    /**
     * Sorts an array by value breaking ties alphabetically
     *
-     * @param array &$arr the array to sort
+     * @param array $arr the array to sort
     *
     * @return void
-     * @access private
     */
-    function _bub_sort(&$arr)
+    protected function _bub_sort(&$arr)
    {
        // should do the same as this perl statement:
        // sort { $trigrams{$b} == $trigrams{$a}
@ -548,9 +529,8 @@ class Text_LanguageDetect
     *
     * @return int 1 if $a is greater, -1 if not
     * @see    _bub_sort()
-     * @access private
     */
-    function _sort_func($a, $b)
+    protected function _sort_func($a, $b)
    {
        // each is actually a key/value pair, so that it can compare using both
        list($a_key, $a_value) = $a;
@ -588,9 +568,8 @@ class Text_LanguageDetect
     *
     * @return int the sum of the differences between the ranks of
     *             the two trigram sets
-     * @access private
     */
-    function _distance($arr1, $arr2)
+    protected function _distance($arr1, $arr2)
    {
        $sumdist = 0;

@ -621,9 +600,8 @@ class Text_LanguageDetect
     *
     * @return float the normalized score
     * @see    _distance()
-     * @access private
     */
-    function _normalize_score($score, $base_count = null)
+    protected function _normalize_score($score, $base_count = null)
    {
        if ($base_count === null) {
            $base_count = $this->_threshold;
@ -699,7 +677,7 @@ class Text_LanguageDetect
        $sample_obj->setPadStart(!$this->_perl_compatible);
        $sample_obj->analyze();

-        $trigram_freqs =& $sample_obj->getTrigramRanks();
+        $trigram_freqs = $sample_obj->getTrigramRanks();
        $trigram_count = count($trigram_freqs);

        if ($trigram_count == 0) {
@ -710,7 +688,7 @@ class Text_LanguageDetect

        // use unicode block detection to narrow down the possibilities
        if ($this->_use_unicode_narrowing) {
-            $blocks =& $sample_obj->getUnicodeBlocks();
+            $blocks = $sample_obj->getUnicodeBlocks();

            if (is_array($blocks)) {
                $present_blocks = array_keys($blocks);
@ -962,16 +940,15 @@ class Text_LanguageDetect
     *
     * @return mixed Block name, -1 if it failed
     * @see    unicodeBlockName()
-     * @access protected
     */
-    function _unicode_block_name($unicode, $blocks, $block_count = -1)
+    protected function _unicode_block_name($unicode, $blocks, $block_count = -1)
    {
        // for a reference, see
        // http://www.unicode.org/Public/UNIDATA/Blocks.txt

        // assume that ascii characters are the most common
        // so try it first for efficiency
-        if ($unicode <= hexdec($blocks[0][1])) {
+        if ($unicode <= $blocks[0][1]) {
            return $blocks[0];
        }

@ -989,11 +966,11 @@ class Text_LanguageDetect
        while ($low <= $high) {
            $mid = floor(($low + $high) / 2);

-            if ($unicode < hexdec($blocks[$mid][0])) {
+            if ($unicode < $blocks[$mid][0]) {
                // if it's lower than the lower bound
                $high = $mid - 1;

-            } elseif ($unicode > hexdec($blocks[$mid][1])) {
+            } elseif ($unicode > $blocks[$mid][1]) {
                // if it's higher than the upper bound
                $low = $mid + 1;

@ -1015,9 +992,8 @@ class Text_LanguageDetect
     *
     * @return array the database of unicode block definitions
     * @throws Text_LanguageDetect_Exception
-     * @access protected
     */
-    function _read_unicode_block_db()
+    protected function _read_unicode_block_db()
    {
        // since the unicode definitions are always going to be the same,
        // might as well share the memory for the db with all other instances
@ -1136,14 +1112,13 @@ class Text_LanguageDetect
     * Uses a nearest neighbor technique to generate the maximum possible
     * number of dendograms from the similarity data.
     *
-     * @access      public
     * @return     array language cluster data
     * @throws     Text_LanguageDetect_Exception
     * @see        languageSimilarity()
     * @deprecated this function will eventually be removed and placed into
     *              the model generation class
     */
-    function clusterLanguages()
+    public function clusterLanguages()
    {
        // todo: set the maximum number of clusters
        // return cached result, if any
@ -1452,7 +1427,7 @@ class Text_LanguageDetect
    }

    /**
-     * ut8-safe strlen()
+     * UTF8-safe strlen()
     *
     * Returns the numbers of characters (not bytes) in a utf8 string
     *
@ -1476,10 +1451,9 @@ class Text_LanguageDetect
     * @param string $char a utf8 (possibly multi-byte) char
     *
     * @return int unicode value
-     * @access protected
     * @link   http://en.wikipedia.org/wiki/UTF-8
     */
-    function _utf8char2unicode($char)
+    protected function _utf8char2unicode($char)
    {
        // strlen() here will actually get the binary length of a single char
        switch (strlen($char)) {
@ -1516,20 +1490,19 @@ class Text_LanguageDetect
    }

    /**
-     * utf8-safe fast character iterator
+     * UTF8-safe fast character iterator
     *
     * Will get the next character starting from $counter, which will then be
     * incremented. If a multi-byte char the bytes will be concatenated and
     * $counter will be incremeted by the number of bytes in the char.
     *
     * @param string $str             the string being iterated over
-     * @param int    &$counter        the iterator, will increment by reference
+     * @param int    $counter         the iterator, will increment by reference
     * @param bool   $special_convert whether to do special conversions
     *
     * @return char the next (possibly multi-byte) char from $counter
-     * @access private
     */
-    static function _next_char($str, &$counter, $special_convert = false)
+    protected static function _next_char($str, &$counter, $special_convert = false)
    {
        $char = $str{$counter++};
        $ord = ord($char);
@ -1621,7 +1594,7 @@ class Text_LanguageDetect
     *
     * @return string|array Language name
     */
-    function _convertFromNameMode($lang, $convertKey = false)
+    protected function _convertFromNameMode($lang, $convertKey = false)
    {
        if ($this->_name_mode == 0) {
            return $lang;
@ -1661,7 +1634,7 @@ class Text_LanguageDetect
     *
     * @return string|array Language name
     */
-    function _convertToNameMode($lang, $convertKey = false)
+    protected function _convertToNameMode($lang, $convertKey = false)
    {
        if ($this->_name_mode == 0) {
            return $lang;
@ -1689,5 +1662,3 @@ class Text_LanguageDetect
        return $newlang;
    }
 }
-
-/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */
--- a/libraries/language-detect/LanguageDetect/Exception.php
+++ b/libraries/language-detect/LanguageDetect/Exception.php
@ -1,4 +1,16 @@
 <?php
+/**
+ * Part of Text_LanguageDetect
+ *
+ * PHP version 5
+ *
+ * @category Text
+ * @package  Text_LanguageDetect
+ * @author   Nicholas Pisarro <infinityminusnine+pear@gmail.com>
+ * @license  BSD http://www.opensource.org/licenses/bsd-license.php
+ * @link     http://pear.php.net/package/Text_LanguageDetect/
+ */
+ 
 class Text_LanguageDetect_Exception extends Exception
 {
    /**
--- a/libraries/language-detect/LanguageDetect/ISO639.php
+++ b/libraries/language-detect/LanguageDetect/ISO639.php
@ -1,18 +1,4 @@
 <?php
-/**
- * Part of Text_LanguageDetect
- *
- * PHP version 5
- *
- * @category  Text
- * @package   Text_LanguageDetect
- * @author    Christian Weiske <cweiske@php.net>
- * @copyright 2011 Christian Weiske <cweiske@php.net>
- * @license   http://www.debian.org/misc/bsd.license BSD
- * @version   SVN: $Id$
- * @link      http://pear.php.net/package/Text_LanguageDetect/
- */
-
 /**
 * Provides a mapping between the languages from lang.dat and the
 * ISO 639-1 and ISO-639-2 codes.
@ -23,7 +9,7 @@
 * @package   Text_LanguageDetect
 * @author    Christian Weiske <cweiske@php.net>
 * @copyright 2011 Christian Weiske <cweiske@php.net>
- * @license   http://www.debian.org/misc/bsd.license BSD
+ * @license   BSD http://www.opensource.org/licenses/bsd-license.php
 * @link      http://www.loc.gov/standards/iso639-2/php/code_list.php
 */
 class Text_LanguageDetect_ISO639
--- a/libraries/language-detect/LanguageDetect/Parser.php
+++ b/libraries/language-detect/LanguageDetect/Parser.php
@ -1,18 +1,4 @@
 <?php
-
-/**
- * This class represents a text sample to be parsed.
- *
- * @category    Text
- * @package     Text_LanguageDetect
- * @author      Nicholas Pisarro
- * @copyright   2006
- * @license     BSD
- * @version     CVS: $Id: Parser.php 322327 2012-01-15 17:55:59Z cweiske $
- * @link        http://pear.php.net/package/Text_LanguageDetect/
- * @link        http://langdetect.blogspot.com/
- */
-
 /**
 * This class represents a text sample to be parsed.
 *
@ -25,94 +11,101 @@
 *
 * @category  Text
 * @package   Text_LanguageDetect
- * @author      Nicholas Pisarro
- * @copyright   2006
- * @license     BSD
- * @version     release: 0.3.0
+ * @author    Nicholas Pisarro <infinityminusnine+pear@gmail.com>
+ * @copyright 2006 Nicholas Pisarro
+ * @license   BSD http://www.opensource.org/licenses/bsd-license.php
+ * @version   Release: 1.0.0
+ * @link      http://pear.php.net/package/Text_LanguageDetect/
 */
 class Text_LanguageDetect_Parser extends Text_LanguageDetect
 {
    /**
-     * the piece of text being parsed
+     * The piece of text being parsed
     *
-     * @access  private
     * @var string
     */
-    var $_string;
+    protected $_string;

    /**
-     * stores the trigram frequencies of the sample
+     * Stores the trigram frequencies of the sample
     *
-     * @access  private
     * @var string
     */
-    var $_trigrams = array();
+    protected $_trigrams = array();

    /**
-     * stores the trigram ranks of the sample
+     * Stores the trigram ranks of the sample
     *
-     * @access  private
     * @var array
     */
-    var $_trigram_ranks = array();
+    protected $_trigram_ranks = array();

    /**
-     * stores the unicode blocks of the sample
+     * Stores the unicode blocks of the sample
     *
-     * @access  private
     * @var array
     */
-    var $_unicode_blocks = array();
+    protected $_unicode_blocks = array();

    /**
     * Whether the parser should compile the unicode ranges
     *
-     * @access  private
     * @var bool
     */
-    var $_compile_unicode = false;
+    protected $_compile_unicode = false;

    /**
     * Whether the parser should compile trigrams
     *
-     * @access  private
     * @var bool
     */
-    var $_compile_trigram = false;
+    protected $_compile_trigram = false;

    /**
     * Whether the trigram parser should pad the beginning of the string
     *
-     * @access  private
     * @var bool
     */
-    var $_trigram_pad_start = false;
+    protected $_trigram_pad_start = false;

    /**
     * Whether the unicode parser should skip non-alphabetical ascii chars
     *
-     * @access  private
     * @var bool
     */
-    var $_unicode_skip_symbols = true;
+    protected $_unicode_skip_symbols = true;

    /**
     * Constructor
     *
-     * @access  private
     * @param string $string string to be parsed
     */
-    function __construct($string) {
+    public function __construct($string)
+    {
        $this->_string = $string;
    }

+    /**
+     * PHP 4 constructor for backwards compatibility.
+     *
+     * Method named after the class; it only forwards its argument to
+     * __construct() and performs no work of its own.
+     *
+     * @param string $string string to be parsed
+     *
+     * @return void
+     */
+    public function Text_LanguageDetect_Parser($string)
+    {
+        self::__construct($string);
+    }
+
    /**
     * Returns true if a string is suitable for parsing
     *
     * @param string $str input string to test
+     *
     * @return bool true if acceptable, false if not
     */
-    public static function validateString($str) {
+    public static function validateString($str)
+    {
        if (!empty($str) && strlen($str) > 3 && preg_match('/\S/', $str)) {
            return true;
        } else {
@ -121,34 +114,37 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect
    }

    /**
-     * turn on/off trigram counting
+     * Turn on/off trigram counting
     *
-     * @access  public
     * @param bool $bool true for on, false for off
+     *
+     * @return void
     */
-    function prepareTrigram($bool = true)
+    public function prepareTrigram($bool = true)
    {
        $this->_compile_trigram = $bool;
    }

    /**
-     * turn on/off unicode block counting
+     * Turn on/off unicode block counting
     *
-     * @access  public
     * @param bool $bool true for on, false for off
+     *
+     * @return void
     */
-    function prepareUnicode($bool = true)
+    public function prepareUnicode($bool = true)
    {
        $this->_compile_unicode = $bool;
    }

    /**
-     * turn on/off padding the beginning of the sample string
+     * Turn on/off padding the beginning of the sample string
     *
-     * @access  public
     * @param bool $bool true for on, false for off
+     *
+     * @return void
     */
-    function setPadStart($bool = true)
+    public function setPadStart($bool = true)
    {
        $this->_trigram_pad_start = $bool;
    }
@ -156,10 +152,11 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect
    /**
     * Should the unicode block counter skip non-alphabetical ascii chars?
     *
-     * @access  public
     * @param bool $bool true for on, false for off
+     *
+     * @return void
     */
-    function setUnicodeSkipSymbols($bool = true)
+    public function setUnicodeSkipSymbols($bool = true)
    {
        $this->_unicode_skip_symbols = $bool;
    }
@ -167,10 +164,9 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect
    /**
     * Returns the trigram ranks for the text sample
     *
-     * @access  public
-     * @return  array    trigram ranks in the text sample
+     * @return array Trigram ranks in the text sample
     */
-    function &getTrigramRanks()
+    public function getTrigramRanks()
    {
        return $this->_trigram_ranks;
    }
@ -178,23 +174,21 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect
    /**
     * Return the trigram freqency table
     *
-     * only used in testing to make sure the parser is working
+     * Only used in testing to make sure the parser is working
     *
-     * @access  public
-     * @return  array    trigram freqencies in the text sample
+     * @return array Trigram frequencies in the text sample
     */
-    function &getTrigramFreqs()
+    public function getTrigramFreqs()
    {
        return $this->_trigram;
    }

    /**
-     * returns the array of unicode blocks
+     * Returns the array of unicode blocks
     *
-     * @access  public
-     * @return  array   unicode blocks in the text sample
+     * @return array Unicode blocks in the text sample
     */
-    function &getUnicodeBlocks()
+    public function getUnicodeBlocks()
    {
        return $this->_unicode_blocks;
    }
@ -208,9 +202,9 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect
     * Afterwards the get*() functions can be used to access the compiled
     * information.
     *
-     * @access public
+     * @return void
     */
-    function analyze()
+    public function analyze()
    {
        $len = strlen($this->_string);
        $byte_counter = 0;
@ -274,7 +268,8 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect
                    && strlen($char) == 1
                    && ($char < 'A' || $char > 'z'
                    || ($char > 'Z' && $char < 'a'))
-                        && $char != "'") {  // does not skip the apostrophe
+                    && $char != "'"
+                ) {  // does not skip the apostrophe
                                            // since it's included in the language
                                            // models

@ -297,7 +292,8 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect
        if ($this->_compile_unicode) {
            foreach ($unicode_chars as $utf8_char => $count) {
                $search_result = $this->_unicode_block_name(
-                        $this->_utf8char2unicode($utf8_char), $blocks, $block_count);
+                    $this->_utf8char2unicode($utf8_char), $blocks, $block_count
+                );

                if ($search_result != -1) {
                    $block_name = $search_result[2];
@ -343,5 +339,3 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect
        }
    }
 }
-
-/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */
--- a/libraries/language-detect/unicode_blocks.dat
+++ b/libraries/language-detect/unicode_blocks.dat
--- a/libraries/readability/Readability.php
+++ b/libraries/readability/Readability.php
@ -122,6 +122,7 @@ class Readability
 		if ($parser=='gumbo') {
 			// Can we avoid this encoding/decoding step? Test on:
 			// http://www.medialens.org/index.php/alerts/alert-archive/2017/837-undermining-democracy-corporate-media-bias-on-jeremy-corbyn-boris-johnson-and-syria.html
+			$html = str_replace('&apos;', "'", $html); // other named entities handled okay
 			$html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
 			$html = mb_convert_encoding($html, "UTF-8", 'HTML-ENTITIES');
 			$this->dom = @Layershifter\Gumbo\Parser::load($html);
--- a/makefulltextfeed.php
+++ b/makefulltextfeed.php
@ -3,8 +3,8 @@
 // Author: Keyvan Minoukadeh
 // Copyright (c) 2017 Keyvan Minoukadeh
 // License: AGPLv3
-// Version: 3.7
-// Date: 2017-02-12
+// Version: 3.8
+// Date: 2017-09-25
 // More info: http://fivefilters.org/content-only/
 // Help: http://help.fivefilters.org

@ -183,7 +183,9 @@ if (!isset($_REQUEST['url'])) {
 	die('No URL supplied'); 
 }
 $url = trim($_REQUEST['url']);
-if (strtolower(substr($url, 0, 7)) == 'feed://') {
+if (strtolower(substr($url, 0, 6)) == 'sec://') {
+	$url = 'https://'.substr($url, 6);
+} elseif (strtolower(substr($url, 0, 7)) == 'feed://') {
 	$url = 'http://'.substr($url, 7);
 }
 if (!preg_match('!^https?://.+!i', $url)) {
@ -345,10 +347,10 @@ if ($options->content === 'user') {
 // HTML5 output?
 ///////////////////////////////////////////////
 if ($options->html5_output === 'user') {
-	if (isset($_REQUEST['content']) && $_REQUEST['content'] === 'html5') {
-		$options->html5_output = true;
-	} else {
+	if (isset($_REQUEST['content']) && $_REQUEST['content'] === '1') {
 		$options->html5_output = false;
+	} else {
+		$options->html5_output = true;
 	}
 }

@ -820,7 +822,7 @@ foreach ($items as $key => $item) {
 					continue; // skip this feed item entry
 				}
 			}
-			$base_url = get_base_url($readability->dom);
+			$base_url = get_base_url($readability->dom, $effective_url);
 			if (!$base_url) $base_url = $effective_url;
 			$content_block = ($extract_result) ? $extractor->getContent() : null;			
 			$extracted_title = ($extract_result) ? $extractor->getTitle() : '';
@ -945,6 +947,7 @@ foreach ($items as $key => $item) {
 			//unset($content_block);
 			// post-processing cleanup
 			$html = preg_replace('!<p>[\s\h\v]*</p>!u', '', $html);
+			$html = str_replace('<p>&nbsp;</p>', '', $html);
 			if ($links == 'remove') {
 				$html = preg_replace('!<a\s+[^>]*>!', '', $html);
 				$html = preg_replace('!</a>!', '', $html);
@ -1080,6 +1083,7 @@ foreach ($items as $key => $item) {
 					$l_result = $l->detect($text_sample, 1);
 					if (count($l_result) > 0) {
 						$language = key($l_result);
+						debug('Language detected: '.$language);
 					}
 				}
 			} catch (Exception $e) {
@ -1248,6 +1252,17 @@ function get_self_url() {
 }

 function validate_url($url) {
+	if (function_exists('idn_to_ascii')) {
+		if ($host = @parse_url($url, PHP_URL_HOST)) {
+			$puny = idn_to_ascii($host, 0, INTL_IDNA_VARIANT_UTS46);
+			if ($host != $puny) {
+				$pos = strpos($url, $host);
+				if ($pos !== false) {
+					$url = substr_replace($url, $puny, $pos, strlen($host));
+				}
+			}
+		}
+	}
 	$url = filter_var($url, FILTER_SANITIZE_URL);
 	$test = filter_var($url, FILTER_VALIDATE_URL, FILTER_FLAG_SCHEME_REQUIRED);
 	// deal with bug http://bugs.php.net/51192 (present in PHP 5.2.13 and PHP 5.3.2)
@ -1261,9 +1276,14 @@ function validate_url($url) {
 	}
 }

-function get_base_url($dom) {
+function get_base_url($dom, $url=null) {
 	$xpath = new DOMXPath($dom);
-	return @$xpath->evaluate('string(//head/base/@href)', $dom);
+	$base = @$xpath->evaluate('string(//head/base/@href)', $dom);
+	if (!$base) return false;
+	if (isset($url) && !preg_match('!^https?://!i', $base)) {
+		$base = make_absolute_str($url, $base);
+	}
+	return $base;
 }

 function is_ssl() {
@ -1436,7 +1456,7 @@ function make_absolute_attr($base, $e, $attr) {
 		$url = str_replace(' ', '%20', $url);
 		if (!preg_match('!https?://!i', $url)) {
 			if ($absolute = SimplePie_IRI::absolutize($base, $url)) {
-				$e->setAttribute($attr, $absolute);
+				$e->setAttribute($attr, $absolute->get_uri());
 			}
 		}
 	}
@ -1450,7 +1470,7 @@ function make_absolute_str($base, $url) {
 		return $url;
 	} else {
 		if ($absolute = SimplePie_IRI::absolutize($base, $url)) {
-			return $absolute;
+			return $absolute->get_uri();
 		}
 		return false;
 	}
@ -1529,7 +1549,7 @@ function get_single_page($item, $html, $url) {
 				}
 			}
 		}
-		$base_url = get_base_url($readability->dom);
+		$base_url = get_base_url($readability->dom, $url);
 		if (!$base_url) $base_url = $url;
 		// If we've got URL, resolve against $base_url
 		if (isset($single_page_url) && ($single_page_url = make_absolute_str($base_url, $single_page_url))) {