2011-01-11 19:06:12 +01:00
<?php
2014-05-15 22:49:16 +02:00
// Full-Text RSS: Create Full-Text Feeds
2011-01-11 19:06:12 +01:00
// Author: Keyvan Minoukadeh
2019-04-04 23:23:27 +02:00
// Copyright (c) 2017 Keyvan Minoukadeh
2011-01-11 19:06:12 +01:00
// License: AGPLv3
2019-04-04 23:46:36 +02:00
// Version: 3.8
// Date: 2017-09-25
2013-04-18 16:11:06 +02:00
// More info: http://fivefilters.org/content-only/
// Help: http://help.fivefilters.org
2011-01-11 19:06:12 +01:00
/*
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/
// Usage
// -----
2014-05-15 23:03:31 +02:00
// Request this file passing it a web page or feed URL in the querystring: makefulltextfeed.php?url=example.org/article
// For more request parameters, see http://help.fivefilters.org/customer/portal/articles/226660-usage
2011-01-11 19:06:12 +01:00
// ---------------------------------------------------------------------------
// Runtime bootstrap: error handling, execution limit, operating mode, request
// parameters and include path.
// ---------------------------------------------------------------------------

// Report everything except notices. libxml parse errors are collected
// internally instead of being raised as PHP warnings.
error_reporting(E_ALL ^ E_NOTICE);
libxml_use_internal_errors(true);

// libxml_disable_entity_loader() is deprecated as of PHP 8.0. Calling it there
// raises a deprecation message which, with display_errors switched on below,
// would be printed into the response. Only call it on older runtimes.
if (PHP_VERSION_ID < 80000 && function_exists('libxml_disable_entity_loader')) {
	libxml_disable_entity_loader(true);
}

// The directive name has to be exact; a name padded with spaces is not a
// known directive and ini_set() would silently return false.
ini_set('display_errors', '1');

// Best effort only: set_time_limit() may be disabled by the host.
@set_time_limit(120);

// Operating mode. A script that includes this file may define _FF_FTR_MODE
// beforehand; otherwise we run in 'full' mode.
if (!defined('_FF_FTR_MODE')) define('_FF_FTR_MODE', 'full');

// In 'simple' mode parameters may arrive via GET or POST (POST wins on
// duplicate names); in every other mode only the query string is honoured.
if (_FF_FTR_MODE === 'simple') {
	$_REQUEST = array_merge($_GET, $_POST);
} else {
	$_REQUEST = $_GET;
}

// set include path: our bundled libraries directory is searched first
set_include_path(realpath(dirname(__FILE__) . '/libraries') . PATH_SEPARATOR . get_include_path());
// Autoloading of classes allows us to include files only when they're
// needed. If we've got a cached copy, for example, only Zend_Cache is loaded.
2013-04-18 16:11:06 +02:00
/**
 * Autoloader for the bundled libraries.
 *
 * Looks the class name up in a fixed class => file map (paths relative to the
 * libraries/ directory next to this script) and requires the file on first
 * use, so only the code a request actually needs gets loaded.
 *
 * @param string $class_name Class name as passed in by spl_autoload
 * @return bool true when the class is in the map and its file was required,
 *              false when the class is unknown to this loader
 */
function autoload($class_name) {
	static $dir = null;
	static $mapping = array(
		// Include FeedCreator for RSS/Atom creation
		'FeedWriter' => 'feedwriter/FeedWriter.php',
		'FeedItem' => 'feedwriter/FeedItem.php',
		// Include ContentExtractor and Readability for identifying and extracting content from URLs
		'ContentExtractor' => 'content-extractor/ContentExtractor.php',
		'SiteConfig' => 'content-extractor/SiteConfig.php',
		'Readability' => 'readability/Readability.php',
		// Include Humble HTTP Agent to allow parallel requests and response caching
		'HumbleHttpAgent' => 'humble-http-agent/HumbleHttpAgent.php',
		'SimplePie_HumbleHttpAgent' => 'humble-http-agent/SimplePie_HumbleHttpAgent.php',
		'CookieJar' => 'humble-http-agent/CookieJar.php',
		'HumbleHttpAgentDummy' => 'humble-http-agent/HumbleHttpAgentDummy.php',
		// Include Zend Cache to improve performance (cache results)
		'Zend_Cache' => 'Zend/Cache.php',
		// Language detect
		'Text_LanguageDetect' => 'language-detect/LanguageDetect.php',
		// HTML5 PHP (can't be used unless PHP version is >= 5.3)
		'Masterminds\HTML5' => 'html5php/HTML5.php',
		// htmLawed - used if XSS filter is enabled (xss_filter)
		'htmLawed' => 'htmLawed/htmLawed.php',
		// Disable SimplePie sanitization
		'DisableSimplePieSanitize' => 'DisableSimplePieSanitize.php'
	);

	// Unknown class: report failure so any other registered loader can try.
	if (!isset($mapping[$class_name])) {
		return false;
	}

	// Resolved once and kept for subsequent calls.
	if ($dir === null) $dir = dirname(__FILE__) . '/libraries/';

	$file = $mapping[$class_name];
	// Built by concatenation: an array element inside a double-quoted string
	// is only interpolated when the brackets directly follow the variable,
	// otherwise the whole array is rendered as "Array".
	debug('** Loading class ' . $class_name . ' (' . $file . ')');
	require $dir . $file;
	return true;
}
2013-04-18 16:11:06 +02:00
// Hook autoload() into SPL so mapped library classes are loaded on demand.
spl_autoload_register('autoload');

////////////////////////////////
// Load library autoloaders and config file
////////////////////////////////
// SimplePie and HTML5-PHP ship their own autoloader scripts; config.php sits
// next to this script. All three are required, so a missing file is fatal.
$_ftr_base_dir = dirname(__FILE__);
require $_ftr_base_dir . '/libraries/simplepie/autoloader.php';
require $_ftr_base_dir . '/libraries/html5php/autoloader.php';
require $_ftr_base_dir . '/config.php';
unset($_ftr_base_dir);
2011-01-11 19:06:12 +01:00
2013-04-18 16:11:06 +02:00
////////////////////////////////
// Prevent indexing/following by search engines because:
// 1. The content is already public and presumably indexed (why create duplicates?)
// 2. Not doing so might increase number of requests from search engines, thus increasing server load
// Note: feed readers and services such as Yahoo Pipes will not be affected by this header.
// Note: Using Disallow in a robots.txt file will be more effective (search engines will check
// that before even requesting makefulltextfeed.php).
////////////////////////////////
// Ask search engines not to index or follow our output.
header('X-Robots-Tag: noindex, nofollow');

////////////////////////////////
// Content security headers
////////////////////////////////
// The string handed to header() must start with the field name itself: a
// line beginning with whitespace is not a valid header field line.
header("Content-Security-Policy: script-src 'self'; connect-src 'none'; font-src 'none'; style-src 'self'");

////////////////////////////////
// Check if service is enabled
////////////////////////////////
if (!$options->enabled) {
	die('The full-text RSS service is currently disabled');
}

//////////////////////////////////
// Enable Cross-Origin Resource Sharing (CORS)
//////////////////////////////////
// When switched on in the config, any origin may read our responses.
if ($options->cors) header('Access-Control-Allow-Origin: *');
2014-05-15 22:49:16 +02:00
////////////////////////////////
// Debug mode?
// See the config file for debug options.
////////////////////////////////
// Debug state, all off unless the request carries a 'debug' parameter and the
// config permits it.
$debug_mode = false;
$debug_show_raw_html = false;
$debug_show_parsed_html = false;

if (isset($_REQUEST['debug'])) {
	$_debug_admin_only = false;
	if ($options->debug === true || $options->debug == 'user') {
		// Debug output is open to every caller.
		$debug_mode = true;
	} else {
		$_debug_admin_only = ($options->debug == 'admin');
		if ($_debug_admin_only) {
			// Debug output is reserved for a session flagged as authenticated.
			session_start();
			$debug_mode = (isset($_SESSION['auth']) && $_SESSION['auth'] == 1);
		}
	}

	if (!$debug_mode) {
		// Requested but not permitted: explain why and stop.
		die($_debug_admin_only
			? 'You must be logged in to the <a href="admin/">admin area</a> to see debug output.'
			: 'Debugging is disabled.');
	}

	// Debug output is plain text; the parameter value can additionally ask
	// for the raw or the parsed HTML to be shown.
	header('Content-Type: text/plain; charset=utf-8');
	$debug_show_raw_html = ($_REQUEST['debug'] === 'rawhtml');
	$debug_show_parsed_html = ($_REQUEST['debug'] === 'parsedhtml');
	unset($_debug_admin_only);
}
////////////////////////////////
// Check for APC
////////////////////////////////
// APC is only used when it is both enabled in the config and the extension
// function apc_add() exists on this server.
$options->apc = ($options->apc && function_exists('apc_add'));
debug($options->apc
	? 'APC is enabled and available on server'
	: 'APC is disabled or not available on server');

////////////////////////////////
// Check for smart cache
////////////////////////////////
// Smart caching additionally needs apc_inc().
$options->smart_cache = ($options->smart_cache && function_exists('apc_inc'));
2011-01-11 19:06:12 +01:00
////////////////////////////////
// Check for feed URL
////////////////////////////////
2014-09-15 22:24:06 +02:00
// A URL is mandatory.
if (!isset($_REQUEST['url'])) {
	die('No URL supplied');
}
// PHP turns ?url[]=... into an array. Passing that to trim() raises a
// TypeError on PHP 8 (a warning on older versions), so reject it up front.
if (!is_string($_REQUEST['url'])) {
	die('Invalid URL supplied');
}
$url = trim($_REQUEST['url']);

// Map the pseudo-schemes onto real ones: sec:// => https://, feed:// => http://
if (strtolower(substr($url, 0, 6)) == 'sec://') {
	$url = 'https://' . substr($url, 6);
} elseif (strtolower(substr($url, 0, 7)) == 'feed://') {
	$url = 'http://' . substr($url, 7);
}
// No http(s) scheme given: assume plain http.
if (!preg_match('!^https?://.+!i', $url)) {
	$url = 'http://' . $url;
}

// validate_url() returns a falsy value for URLs it rejects.
$url = validate_url($url);
if (!$url) die('Invalid URL supplied');

debug(" Supplied URL: $url ");
2011-03-23 23:39:01 +01:00
/////////////////////////////////
// Redirect to hide API key
2014-09-15 22:24:06 +02:00
// (if in 'full' mode)
2011-03-23 23:39:01 +01:00
/////////////////////////////////
2014-09-15 22:24:06 +02:00
// Only acts in 'full' mode when the request carries a key that matches one of
// the configured API keys. The comparison is an exact string match: a loose
// comparison treats numeric-looking strings as numbers ('1e3' == '1000').
if ((_FF_FTR_MODE == 'full')
	&& isset($_REQUEST['key'])
	&& is_string($_REQUEST['key'])
	&& ($key_index = array_search($_REQUEST['key'], array_map('strval', $options->api_keys), true)) !== false) {
	if (isset($_REQUEST['key_redirect']) && $_REQUEST['key_redirect'] === '0') {
		// Caller opted out of the redirect: swap the key for its index plus a
		// hash over key and URL and carry on within this request.
		$_REQUEST['hash'] = sha1($_REQUEST['key'] . $url);
		$_REQUEST['key'] = $key_index;
	} else {
		$redirect = get_self_url();
		if ($debug_mode) {
			// In debug mode we only print where we would have gone.
			debug('Redirecting to hide access key, follow URL below to continue');
			debug(" Location: $redirect ");
		} else {
			// The header line has to start with the field name; a line with
			// leading whitespace is not a valid Location header.
			header('Location: ' . $redirect);
		}
		exit;
	}
}
2011-11-04 18:10:31 +01:00
///////////////////////////////////////////////
// Set timezone.
// Prevents warnings, but needs more testing -
// perhaps if timezone is set in php.ini we
// don't need to set it at all...
///////////////////////////////////////////////
// Use the timezone configured in php.ini when there is one and PHP accepts
// it; in every other case fall back to UTC.
$_ini_timezone = ini_get('date.timezone');
$_timezone_applied = $_ini_timezone ? @date_default_timezone_set($_ini_timezone) : false;
if (!$_timezone_applied) {
	date_default_timezone_set('UTC');
}
unset($_ini_timezone, $_timezone_applied);
2011-01-11 19:06:12 +01:00
///////////////////////////////////////////////
2015-06-14 02:03:20 +02:00
// Should we treat input URL as feed or HTML?
///////////////////////////////////////////////
// How to treat the input URL: 'html', 'feed' or 'auto' (default).
$accept = 'auto';
// Only a string value is considered: ?accept[]=x arrives as an array and
// strtolower() would raise a TypeError on PHP 8.
$_accept_param = (isset($_REQUEST['accept']) && is_string($_REQUEST['accept']))
	? strtolower($_REQUEST['accept'])
	: null;
if ($_accept_param !== null && in_array($_accept_param, array('html', 'feed', 'auto'), true)) {
	$accept = $_accept_param;
} elseif (isset($_REQUEST['html']) && ($_REQUEST['html'] == '1' || $_REQUEST['html'] == 'true')) {
	// The 'html' parameter is honoured when no usable 'accept' was given.
	$accept = 'html';
}
unset($_accept_param);

///////////////////////////////////////////////
// User-submitted site config
///////////////////////////////////////////////
// Taken from the 'siteconfig' parameter. A non-empty value is refused when
// the feature is switched off in the config.
$user_submitted_config = isset($_REQUEST['siteconfig']) ? $_REQUEST['siteconfig'] : null;
if ($user_submitted_config && !$options->user_submitted_config) {
	die('User-submitted site configs are currently disabled. Please remove the siteconfig parameter.');
}
2011-01-11 19:06:12 +01:00
///////////////////////////////////////////////
// Check if valid key supplied
///////////////////////////////////////////////
2011-03-23 23:39:01 +01:00
$valid_key = false;
$key_index = false;

// First check for a hidden key: 'key' holds the index into the configured
// API keys and 'hash' must equal sha1(<api key> . <url>). This form can appear
// in both simple and full modes.
if (isset($_REQUEST['key']) && isset($_REQUEST['hash']) && isset($options->api_keys[(int)$_REQUEST['key']])) {
	$_expected_hash = sha1($options->api_keys[(int)$_REQUEST['key']] . $url);
	$_supplied_hash = $_REQUEST['hash'];
	// Compare as strings only. With == two digests that both look like
	// numbers ("0e123..." style) compare equal, and the comparison is not
	// constant-time. hash_equals() is used where the runtime provides it.
	if (is_string($_supplied_hash)) {
		$valid_key = function_exists('hash_equals')
			? hash_equals($_expected_hash, $_supplied_hash)
			: ($_expected_hash === $_supplied_hash);
	}
	if ($valid_key) $key_index = (int)$_REQUEST['key'];
	unset($_expected_hash, $_supplied_hash);
}

// Next check for the full key (string) passed in the request (simple mode
// only). Exact string match, for the same reason as above.
if (!$valid_key && _FF_FTR_MODE === 'simple' && isset($_REQUEST['key'])) {
	$key_index = is_string($_REQUEST['key'])
		? array_search($_REQUEST['key'], array_map('strval', $options->api_keys), true)
		: false;
	if ($key_index !== false) $valid_key = true;
}

// No valid key although the config demands one.
if (!$valid_key && $options->key_required) {
	die('A valid key must be supplied');
}
// A key was supplied but did not validate.
if (!$valid_key && isset($_REQUEST['key']) && $_REQUEST['key'] != '') {
	die('The entered key is invalid');
}

// Optional site-specific hook, run once key validation is complete.
if (file_exists('custom_init.php')) require 'custom_init.php';
2011-01-11 19:06:12 +01:00
///////////////////////////////////////////////
// Check URL against list of blacklisted URLs
///////////////////////////////////////////////
2019-04-04 23:23:27 +02:00
// Refuse URLs that url_allowed() rejects.
if (!url_allowed($url)) die($options->blocked_message);

///////////////////////////////////////////////
// Max entries
// see config.php to find these values
///////////////////////////////////////////////
// Callers with a valid key get their own cap and default.
if ($valid_key) {
	$_max_cap = $options->max_entries_with_key;
	$_max_default = $options->default_entries_with_key;
} else {
	$_max_cap = $options->max_entries;
	$_max_default = $options->default_entries;
}

$max = isset($_REQUEST['max']) ? (int)$_REQUEST['max'] : 0;
if ($max > 0) {
	$max = min($max, $_max_cap);
} else {
	// Absent, zero or negative: use the default. min() would let such a
	// value through unchanged, and $max is later handed to get_items() as
	// the end offset.
	// NOTE(review): SimplePie documents an end of 0 as "no limit", which
	// would sidestep the cap - confirm against the bundled SimplePie.
	$max = $_max_default;
}
unset($_max_cap, $_max_default);
///////////////////////////////////////////////
// Link handling
///////////////////////////////////////////////
2014-09-15 22:24:06 +02:00
// Link handling: 'preserve' unless the request names one of the other
// supported modes.
$links = 'preserve';
if (isset($_REQUEST['links']) && in_array($_REQUEST['links'], array('preserve', 'footnotes', 'remove'))) {
	$links = $_REQUEST['links'];
}

///////////////////////////////////////////////
// Image handling
///////////////////////////////////////////////
// Images are kept unless the request says images=0 or images=remove.
$images = !(isset($_REQUEST['images']) && in_array($_REQUEST['images'], array('0', 'remove')));
2014-05-15 22:56:02 +02:00
///////////////////////////////////////////////
// Favour item titles in feed?
///////////////////////////////////////////////
// Favour item titles from the feed?
// The config value is either the string 'user' (caller decides) or a fixed
// setting. The check must be strict: with == a boolean true in the config
// compares equal to 'user', so a fixed "true" would be handled as
// caller-controlled. The other 'user'-style options in this script already
// use ===.
if ($options->favour_feed_titles === 'user') {
	// Feed titles win unless use_extracted_title is present and not '0'.
	$favour_feed_titles = (!isset($_REQUEST['use_extracted_title']) || $_REQUEST['use_extracted_title'] === '0');
} else {
	$favour_feed_titles = $options->favour_feed_titles;
}

///////////////////////////////////////////////
// Favour effective URL
///////////////////////////////////////////////
if ($options->favour_effective_url === 'user') {
	// Off unless use_effective_url is present and not '0'.
	$favour_effective_url = (isset($_REQUEST['use_effective_url']) && $_REQUEST['use_effective_url'] !== '0');
} else {
	$favour_effective_url = $options->favour_effective_url;
}
2014-05-15 23:03:31 +02:00
///////////////////////////////////////////////
// Include full content in output?
///////////////////////////////////////////////
// Each option below is only resolved from the request when the config leaves
// the decision to the caller (value 'user'); a fixed config value is kept.

// Full content: included unless content=0.
if ($options->content === 'user') {
	$options->content = !(isset($_REQUEST['content']) && $_REQUEST['content'] === '0');
}

///////////////////////////////////////////////
// HTML5 output?
///////////////////////////////////////////////
// On unless content=1 is passed explicitly.
// NOTE(review): this reads the 'content' parameter rather than a dedicated
// one - looks deliberate, but confirm against the usage documentation.
if ($options->html5_output === 'user') {
	$options->html5_output = !(isset($_REQUEST['content']) && $_REQUEST['content'] === '1');
}

///////////////////////////////////////////////
// Include summaries in output?
///////////////////////////////////////////////
// Off unless summary=1.
if ($options->summary === 'user') {
	$options->summary = (isset($_REQUEST['summary']) && $_REQUEST['summary'] === '1');
}
2011-03-23 23:39:01 +01:00
///////////////////////////////////////////////
// Exclude items if extraction fails
///////////////////////////////////////////////
2014-05-15 22:49:16 +02:00
// Exclude items whose extraction fails: caller decides via exc=1 when the
// config says 'user', otherwise the config value applies.
$exclude_on_fail = ($options->exclude_items_on_fail === 'user')
	? (isset($_REQUEST['exc']) && ($_REQUEST['exc'] == '1'))
	: $options->exclude_items_on_fail;

///////////////////////////////////////////////
// Detect language
///////////////////////////////////////////////
if ($options->detect_language !== 'user') {
	$detect_language = $options->detect_language;
} else {
	// 'lang' is accepted as an alias of 'l' and overrides it.
	if (isset($_REQUEST['lang'])) $_REQUEST['l'] = $_REQUEST['lang'];
	// Integer value of 'l', or 1 when the parameter is absent.
	$detect_language = isset($_REQUEST['l']) ? (int)$_REQUEST['l'] : 1;
}
// Remember whether the 'cld' extension is loaded.
$use_cld = extension_loaded('cld');
2013-04-18 16:11:06 +02:00
2011-01-11 19:06:12 +01:00
/////////////////////////////////////
// Check for valid format
2012-05-01 00:51:43 +02:00
// (stick to RSS (or RSS as JSON) for the time being)
2011-01-11 19:06:12 +01:00
/////////////////////////////////////
2014-09-15 22:24:06 +02:00
// Output format: JSON only on explicit request, RSS otherwise.
$format = (isset($_REQUEST['format']) && $_REQUEST['format'] == 'json') ? 'json' : 'rss';

/////////////////////////////////////
// Should we do XSS filtering?
/////////////////////////////////////
// The caller asks for filtering with any 'xss' value other than '0'.
$_xss_requested = (isset($_REQUEST['xss']) && $_REQUEST['xss'] !== '0');
$xss_filter = ($options->xss_filter === 'user') ? $_xss_requested : $options->xss_filter;
// Asked for, but the config has it switched off: fail loudly rather than
// return unfiltered output.
if ($_xss_requested && !$xss_filter) {
	die('XSS filtering is disabled in config');
}
unset($_xss_requested);
/////////////////////////////////////
// Check for JSONP
// Regex from https://gist.github.com/1217080
/////////////////////////////////////
// JSONP callback name (JSON output only). null means "no callback".
$callback = null;
if ($format == 'json' && isset($_REQUEST['callback'])) {
	// ?callback[]=x arrives as an array; trim() would raise a TypeError on
	// PHP 8, so treat it like any other malformed callback.
	if (!is_string($_REQUEST['callback'])) {
		die('Invalid JSONP callback');
	}
	$callback = trim($_REQUEST['callback']);
	// Each dot-separated part must be an identifier, optionally followed by
	// bracketed subscripts: a number or a quoted string. Quoted strings may
	// not contain their own delimiter or a backslash - the previous ".+"
	// accepted text such as a["x"];alert(1);//"] and so let arbitrary script
	// through, since the callback is echoed verbatim into the response. The D
	// modifier stops $ from matching before a trailing newline.
	foreach (explode('.', $callback) as $_identifier) {
		if (!preg_match('/^[a-zA-Z_$][0-9a-zA-Z_$]*(?:\[(?:"[^"\\\\]+"|\'[^\'\\\\]+\'|\d+)\])*$/D', $_identifier)) {
			die('Invalid JSONP callback');
		}
	}
	debug(" JSONP callback: $callback ");
}
2014-09-15 22:24:06 +02:00
///////////////////////////////////////////////
// Override default HTML parser?
///////////////////////////////////////////////
// Parser override: only when the config allows overriding and the requested
// parser is one of the allowed ones.
$parser = null;
if ($options->allow_parser_override && isset($_REQUEST['parser']) && in_array($_REQUEST['parser'], $options->allowed_parsers)) {
	$parser = $_REQUEST['parser'];
}

///////////////////////////////////////////////
// Use proxy?
///////////////////////////////////////////////
// $proxy ends up as false (no proxy) or as the selected entry of
// $options->proxy_servers.
$proxy = false;
if (!empty($options->proxy_servers)) {
	if (isset($_REQUEST['proxy'])) {
		// We're choosing proxy based on &proxy value (unless it's not allowed...)
		if (!$options->allow_proxy_override) die('Proxy overriding is disabled.');
		$proxy = $_REQUEST['proxy'];
		if ($proxy === '0') {
			$proxy = false;
		} elseif ($proxy === '1') {
			$proxy = true; // random
		}
	} else {
		// We'll use proxy based on config setting
		$proxy = $options->proxy;
	}

	// Anything other than false/true must be an existing key of the proxy
	// list. The key is looked up directly: a loose in_array() over the keys
	// accepts values that merely compare equal to a key (on PHP < 8 any
	// non-numeric string equals the integer key 0), and the lookup further
	// down would then hit an undefined entry.
	if ($proxy !== false && $proxy !== true) {
		if (!(is_string($proxy) || is_int($proxy)) || !array_key_exists($proxy, $options->proxy_servers)) {
			die('Proxy not recognised.');
		}
	}

	if ($proxy === false) {
		debug('Proxy will not be used');
	} else {
		if ($proxy === true) {
			// Pick one of the configured entries at random.
			$proxy = array_rand($options->proxy_servers);
		}
		if (is_string($options->proxy_servers[$proxy]) && $options->proxy_servers[$proxy] === 'direct') {
			// The entry 'direct' stands for "connect without a proxy".
			debug('Proxy will not be used');
			$proxy = false;
		} else {
			debug('Proxy ' . $proxy . ' will be used.');
			$proxy = $options->proxy_servers[$proxy];
		}
	}
}
//////////////////////////////////
// Has the HTML been given in the request?
//////////////////////////////////
// HTML supplied with the request (simple mode only): there is nothing further
// to fetch, so single-page/multi-page processing and disk caching are all
// switched off.
if (_FF_FTR_MODE == 'simple' && isset($_REQUEST['inputhtml'])) {
	$options->caching = false;
	$options->multipage = false;
	$options->singlepage = false;
}
2011-01-11 19:06:12 +01:00
//////////////////////////////////
// Check for cached copy
//////////////////////////////////
// Serve a previously generated result when caching is on and a copy exists.
if ($options->caching) {
	debug('Caching is enabled...');

	// The cache id is a digest over every input that influences the output.
	$cache_id = md5(implode('', array(
		$max,
		$url,
		(int)$valid_key,
		$accept,
		$links,
		$images,
		(int)$favour_feed_titles,
		(int)$options->content,
		(int)$options->html5_output,
		(int)$options->summary,
		(int)$xss_filter,
		(int)$favour_effective_url,
		(int)$exclude_on_fail,
		$format,
		$detect_language,
		$parser,
		$user_submitted_config,
		_FF_FTR_MODE
	)));

	$check_cache = true;
	if ($options->apc && $options->smart_cache) {
		// Smart cache: APC keeps a per-id request counter for the duration of
		// the cache lifetime. The disk cache is only consulted once that
		// counter, read before this request's increment, has reached 2.
		$_apc_counter_key = " cache. $cache_id ";
		apc_add($_apc_counter_key, 0, $options->cache_time * 60);
		$apc_cache_hits = (int)apc_fetch($_apc_counter_key);
		$check_cache = ($apc_cache_hits >= 2);
		apc_inc($_apc_counter_key);
		unset($_apc_counter_key);
		debug($check_cache
			? 'Cache key found in APC, we\'ll try to load cache file from disk'
			: 'Cache key not found in APC');
	}

	if ($check_cache) {
		$cache = get_cache();
		$data = $cache->load($cache_id);
		if ($data) {
			// In debug mode we only report the hit.
			if ($debug_mode) {
				debug('Loaded cached copy');
				exit;
			}
			// Content type follows the output format; JSONP is served as
			// JavaScript.
			if ($format != 'json') {
				header('Content-type: text/xml; charset=UTF-8');
				header('X-content-type-options: nosniff');
			} elseif ($callback === null) {
				header('Content-type: application/json; charset=UTF-8');
			} else {
				header('Content-type: application/javascript; charset=UTF-8');
			}
			if (headers_sent()) die('Some data has already been output, can\'t send RSS file');
			// With a callback the cached payload is wrapped in a call to it.
			echo $callback ? " $callback ( $data ); " : $data;
			exit;
		}
	}
}
//////////////////////////////////
2014-09-15 22:24:06 +02:00
// Set cache header
2011-01-11 19:06:12 +01:00
//////////////////////////////////
2014-05-15 22:49:16 +02:00
// Outside debug mode, and only when a cache time is configured, let clients
// and intermediaries keep the response for that many minutes.
if (!$debug_mode && $options->cache_time) {
	$_cache_seconds = $options->cache_time * 60;
	header('Cache-Control: public, max-age=' . $_cache_seconds);
	header('Expires: ' . gmdate('D, d M Y H:i:s', time() + $_cache_seconds) . ' GMT');
	unset($_cache_seconds);
}
//////////////////////////////////
// Set up HTTP agent
//////////////////////////////////
2014-09-15 22:24:06 +02:00
if (_FF_FTR_MODE == 'simple' && isset($_REQUEST['inputhtml'])) {
	// The caller supplied the HTML, so the dummy agent is constructed with it
	// in place of a real HTTP client.
	$http = new HumbleHttpAgentDummy($_REQUEST['inputhtml']);
} else {
	// Real HTTP agent; request options are only passed when a proxy was
	// selected earlier.
	if ($proxy === false) {
		$_req_options = null;
	} else {
		$_req_options = array('proxyhost' => $proxy['host']);
		if (isset($proxy['auth'])) {
			$_req_options['proxyauth'] = $proxy['auth'];
		}
	}
	$http = new HumbleHttpAgent($_req_options);
	unset($_req_options);
	$http->debug = $debug_mode;
	// User agents are no longer assigned here: they can be set in site config
	// files using the http_header directive.
	$http->headerOnlyTypes = array_keys($options->content_type_exc);
	$http->rewriteUrls = $options->rewrite_url;
}
2011-01-11 19:06:12 +01:00
2011-11-04 18:40:29 +01:00
//////////////////////////////////
// Set up Content Extractor
//////////////////////////////////
2012-05-01 00:51:43 +02:00
// The extractor is given the custom and the standard site config directories,
// both located next to this script.
$_site_config_dir = dirname(__FILE__) . '/site_config';
$extractor = new ContentExtractor($_site_config_dir . '/custom', $_site_config_dir . '/standard');
unset($_site_config_dir);

// Debug flag and APC usage follow the request/config settings.
$extractor->debug = $debug_mode;
SiteConfig::$debug = $debug_mode;
SiteConfig::use_apc($options->apc);

// Extraction settings from the config, plus the per-request parser override.
$extractor->fingerprints = $options->fingerprints;
$extractor->allowedParsers = $options->allowed_parsers;
$extractor->parserOverride = $parser;
if (!$images) {
	$extractor->stripImages = true;
}

// A caller-supplied site config is only handed over when the feature is
// enabled in the config.
if ($user_submitted_config && $options->user_submitted_config) {
	$extractor->setUserSubmittedConfig($user_submitted_config);
}

// The HTTP agent is given the extractor as its site config builder.
$http->siteConfigBuilder = $extractor;
2011-01-11 19:06:12 +01:00
////////////////////////////////
// Get RSS/Atom feed
////////////////////////////////
2015-06-14 02:03:20 +02:00
// Try to read the URL as a feed unless the caller insisted on HTML.
if ($accept !== 'html') {
	debug('--------');
	debug(" Attempting to process URL as feed ");

	// Identify as PHP while fetching the feed itself (prevents a HTML
	// response from feedburner).
	$http->userAgentDefault = HumbleHttpAgent::UA_PHP;
	// Make SimplePie's HTTP extension class use our HumbleHttpAgent instance.
	SimplePie_HumbleHttpAgent::set_agent($http);

	$feed = new SimplePie();
	// Some feeds are served as text/html - force_feed makes SimplePie
	// process them anyway.
	$feed->force_feed(true);
	$feed->set_file_class('SimplePie_HumbleHttpAgent');
	$feed->set_sanitize_class('DisableSimplePieSanitize');
	// The sanitizer also has to be assigned by hand, it seems.
	$feed->sanitize = new DisableSimplePieSanitize();
	// Assigned directly rather than through set_feed_url(), which encodes
	// colons appearing in the URL's path.
	$feed->feed_url = $url;
	$feed->set_autodiscovery_level(SIMPLEPIE_LOCATOR_NONE);
	$feed->set_timeout(20);
	$feed->enable_cache(false);
	$feed->set_stupidly_fast(true);
	// We don't want SimplePie to do anything to the feed.
	$feed->enable_order_by_date(false);
	$feed->set_url_replacements(array());

	// Initialise the feed. The @ suppresses notices which on some servers
	// cause a 500 internal server error.
	$result = @$feed->init();

	if ($result) {
		// Parsed as a feed, but it carries no data.
		if (!is_array($feed->data) || count($feed->data) == 0) {
			die('Sorry, no feed items found');
		}
	} elseif ($accept === 'feed') {
		// Not a feed although the caller demanded one.
		die('Sorry, couldn\'t parse as feed');
	}

	// From now on, we'll identify ourselves as a browser.
	$http->userAgentDefault = HumbleHttpAgent::UA_BROWSER;
}
////////////////////////////////////////////////////////////////////////////////
2012-05-01 00:51:43 +02:00
// Our given URL is not a feed, so let's create our own feed with a single item:
// the given URL. This basically treats all non-feed URLs as if they were
// single-item feeds.
2011-01-11 19:06:12 +01:00
////////////////////////////////////////////////////////////////////////////////
2012-05-01 00:51:43 +02:00
$isDummyFeed = false ;
2015-06-14 02:03:20 +02:00
if ( $accept === 'html' || ! $result ) {
2014-05-15 22:49:16 +02:00
debug ( '--------' );
debug ( " Constructing a single-item feed from URL " );
2012-05-01 00:51:43 +02:00
$isDummyFeed = true ;
2011-01-11 19:06:12 +01:00
unset ( $feed , $result );
2012-05-01 00:51:43 +02:00
// create single item dummy feed object
/**
 * Stand-in for a feed object that wraps exactly one item: the given URL.
 *
 * Exposes the feed-level getters this script calls, so a plain web page can
 * be processed like a feed with a single entry.
 */
class DummySingleItemFeed {
	/** @var DummySingleItem the only item of this feed */
	public $item;

	/**
	 * @param string $url URL the single item points to
	 */
	public function __construct($url) {
		$this->item = new DummySingleItem($url);
	}

	/** @return string always empty */
	public function get_title() {
		return '';
	}

	/** @return string fixed text followed by the item URL */
	public function get_description() {
		return 'Content extracted from ' . $this->item->url;
	}

	/** @return string the item URL */
	public function get_link() {
		return $this->item->url;
	}

	/** @return bool always false */
	public function get_language() {
		return false;
	}

	/** @return bool always false */
	public function get_image_url() {
		return false;
	}

	/**
	 * Both arguments are ignored: there is only ever one item.
	 *
	 * @return array one-element list holding the item
	 */
	public function get_items($start = 0, $max = 1) {
		return array($this->item);
	}

	/** @return null always null */
	public function get_channel_tags($namespace = '', $tag = '') {
		return null;
	}
}
2012-05-01 00:51:43 +02:00
/**
 * The single item of a DummySingleItemFeed.
 *
 * Only the permalink carries information; every other getter returns a fixed
 * empty value.
 */
class DummySingleItem {
	/** @var string URL of the item */
	public $url;

	/**
	 * @param string $url URL of the item
	 */
	public function __construct($url) {
		$this->url = $url;
	}

	/** @return string the URL given at construction */
	public function get_permalink() {
		return $this->url;
	}

	/** @return null always null */
	public function get_title() {
		return null;
	}

	/** @return bool always false */
	public function get_date($format = '') {
		return false;
	}

	/** @return null always null */
	public function get_author($key = 0) {
		return null;
	}

	/** @return null always null */
	public function get_authors() {
		return null;
	}

	/** @return string always empty */
	public function get_description() {
		return '';
	}

	/** @return null always null */
	public function get_enclosure($key = 0, $prefer = null) {
		return null;
	}

	/** @return null always null */
	public function get_enclosures() {
		return null;
	}

	/** @return null always null */
	public function get_categories() {
		return null;
	}

	/** @return null always null */
	public function get_item_tags($namespace = '', $tag = '') {
		return null;
	}
}
2012-05-01 00:51:43 +02:00
$feed = new DummySingleItemFeed ( $url );
2011-01-11 19:06:12 +01:00
}
////////////////////////////////////////////
// Create full-text feed
////////////////////////////////////////////
// Channel-level setup of the output feed.
$output = new FeedWriter();
if (_FF_FTR_MODE === 'simple') $output->enableSimpleJson();

// Title and description may be absent; cast so strip_tags() never receives
// null (deprecated since PHP 8.1 - the notice would be printed into the feed
// with display_errors on). The resulting string is the same as before.
$output->setTitle(strip_tags((string)$feed->get_title()));
$output->setDescription(strip_tags((string)$feed->get_description()));
$output->setXsl('css/feed.xsl'); // Chrome uses this, most browsers ignore it

// Carry over the source feed's RSS 2.0 <ttl> if it has one.
// NOTE(review): on the single-item (dummy feed) path the SimplePie class file
// may not have been loaded, leaving the namespace constant undefined - which
// is fatal on PHP 8. The dummy feed ignores both arguments, so an empty
// namespace is passed in that case. Confirm whether the SimplePie autoloader
// script defines the constant.
$ttl = $feed->get_channel_tags(
	defined('SIMPLEPIE_NAMESPACE_RSS_20') ? SIMPLEPIE_NAMESPACE_RSS_20 : '',
	'ttl'
);
if ($ttl !== null) {
	$ttl = (int)$ttl[0]['data'];
	$output->setTtl($ttl);
}

// Links: self, source, and a subscribe link.
$output->setSelf(get_self_url());
$output->setAlternate($url, 'Source URL');
$output->setRelated('http://www.subtome.com/#/subscribe?feeds=' . urlencode(get_self_url()) . '&back=' . urlencode(get_self_url()), 'Subscribe to feed');
$output->setLink($feed->get_link()); // Google Reader uses this for pulling in favicons

// Channel image, when the source feed has one.
if ($img_url = $feed->get_image_url()) {
	$output->setImage($feed->get_title(), $feed->get_link(), $img_url);
}
////////////////////////////////////////////
// Loop through feed items
////////////////////////////////////////////
// Take at most $max items from the feed.
$items = $feed->get_items(0, $max);

// Request all feed items in parallel (if supported).
// $urls maps every item key to its permalink, or to false/'' when the item
// has no usable URL; $urls_sanitized lists only the URLs we will fetch.
$urls_sanitized = array();
$urls = array();
foreach ($items as $key => $item) {
	// Cast before trimming: trim(null) is deprecated since PHP 8.1 and the
	// notice would be printed into the feed with display_errors on.
	// NOTE(review): SimplePie documents get_permalink() as string|null -
	// confirm for the bundled version.
	$permalink = htmlspecialchars_decode(trim((string)$item->get_permalink()));
	// Colons in URL path segments get encoded by SimplePie, yet some sites expect them unencoded
	$permalink = str_replace('%3A', ':', $permalink);
	// No further validation here: validateUrl() strips non-ascii characters
	// and SimplePie already sanitizes URLs.
	if ($permalink) {
		if (!url_allowed($permalink)) {
			debug('URL blocked, skipping...');
			$permalink = false;
		} else {
			$urls_sanitized[] = $permalink;
		}
	}
	$urls[$key] = $permalink;
}

debug('--------');
debug('Fetching feed items');
$http->fetchAll($urls_sanitized);

// count number of items added to full feed
$item_count = 0;
2011-01-11 19:06:12 +01:00
foreach ( $items as $key => $item ) {
2015-06-14 02:03:20 +02:00
libxml_clear_errors ();
2014-05-15 22:49:16 +02:00
debug ( '--------' );
debug ( 'Processing feed item ' . ( $item_count + 1 ));
2013-04-18 16:11:06 +02:00
// Reset per-item state at the top of every iteration.
// $effective_url, $mime_info and $content_block are only assigned when the item's
// page is fetched/extracted successfully, but they are tested with isset() further
// down (summary, dc:format, dc:identifier). Without clearing them here, an item whose
// fetch fails would silently pick up the values left behind by the previous item.
$do_content_extraction = true;
$extract_result = false;
$text_sample = null;
unset($effective_url, $mime_info, $content_block);
$permalink = $urls[$key];
2014-05-15 22:49:16 +02:00
debug ( " Item URL: $permalink " );
2014-05-15 22:56:02 +02:00
$extracted_title = '' ;
$feed_item_title = $item -> get_title ();
if ( $feed_item_title !== null ) {
$feed_item_title = strip_tags ( htmlspecialchars_decode ( $feed_item_title ));
}
2011-01-11 19:06:12 +01:00
$newitem = $output -> createNewItem ();
2014-05-15 22:56:02 +02:00
$newitem -> setTitle ( $feed_item_title );
2014-09-15 22:24:06 +02:00
if ( $permalink !== false ) {
$newitem -> setLink ( $permalink );
2011-01-11 19:06:12 +01:00
} else {
2014-09-15 22:24:06 +02:00
$newitem -> setLink ( $item -> get_permalink ());
2011-01-11 19:06:12 +01:00
}
2014-09-15 22:24:06 +02:00
// Status codes to accept (200 range)
// Some sites might return correct content with error status codes
// e.g. prospectmagazine.co.uk returns 403 - in some earlier versions of FTR we accepted a wider range of status codes
// to allow for such cases:
//if ($permalink && ($response = $http->get($permalink, true)) && ($response['status_code'] < 300 || $response['status_code'] > 400)) {
// With the introduction of proxy support in 3.3, we're limiting range of acceptable status codes to avoid proxy
// errors being treated as valid responses.
if ( $permalink && ( $response = $http -> get ( $permalink , true )) && ( $response [ 'status_code' ] < 300 )) {
2011-01-11 19:06:12 +01:00
$effective_url = $response [ 'effective_url' ];
2015-06-14 02:03:20 +02:00
if ( ! url_allowed ( $effective_url )) {
debug ( 'URL blocked, skipping...' );
continue ;
}
2013-04-18 16:11:06 +02:00
// check if action defined for returned Content-Type
2014-05-15 22:49:16 +02:00
$mime_info = get_mime_action_info ( $response [ 'headers' ]);
if ( isset ( $mime_info [ 'action' ])) {
if ( $mime_info [ 'action' ] == 'exclude' ) {
continue ; // skip this feed item entry
} elseif ( $mime_info [ 'action' ] == 'link' ) {
if ( $mime_info [ 'type' ] == 'image' ) {
$html = " <a href= \" $effective_url\ " >< img src = \ " $effective_url\ " alt = \ " { $mime_info [ 'name' ] } \" /></a> " ;
} else {
$html = " <a href= \" $effective_url\ " > Download { $mime_info [ 'name' ]} </ a > " ;
2013-04-18 16:11:06 +02:00
}
2014-05-15 22:56:02 +02:00
$extracted_title = $mime_info [ 'name' ];
2014-05-15 22:49:16 +02:00
$do_content_extraction = false ;
2013-04-18 16:11:06 +02:00
}
}
if ( $do_content_extraction ) {
$html = $response [ 'body' ];
// remove strange things
$html = str_replace ( '</[>' , '' , $html );
$html = convert_to_utf8 ( $html , $response [ 'headers' ]);
2014-09-15 22:24:06 +02:00
// if user has asked to see raw HTML from remote server, show it and exit.
if ( $debug_show_raw_html ) {
debug ( " Here are the HTTP response headers from the remote server: " );
echo $response [ 'headers' ];
debug ( " Here's the raw HTML (after attempted UTF-8 conversion): " );
die ( $html );
}
2014-05-15 22:49:16 +02:00
// check site config for single page URL - fetch it if found
$is_single_page = false ;
2015-06-14 02:03:20 +02:00
if ( $options -> singlepage && ( $single_page_response = get_single_page ( $item , $html , $effective_url ))) {
2014-05-15 22:49:16 +02:00
$is_single_page = true ;
$effective_url = $single_page_response [ 'effective_url' ];
2014-05-15 23:03:31 +02:00
// check if action defined for returned Content-Type
$mime_info = get_mime_action_info ( $single_page_response [ 'headers' ]);
if ( isset ( $mime_info [ 'action' ])) {
if ( $mime_info [ 'action' ] == 'exclude' ) {
continue ; // skip this feed item entry
} elseif ( $mime_info [ 'action' ] == 'link' ) {
if ( $mime_info [ 'type' ] == 'image' ) {
$html = " <a href= \" $effective_url\ " >< img src = \ " $effective_url\ " alt = \ " { $mime_info [ 'name' ] } \" /></a> " ;
} else {
$html = " <a href= \" $effective_url\ " > Download { $mime_info [ 'name' ]} </ a > " ;
}
$extracted_title = $mime_info [ 'name' ];
$do_content_extraction = false ;
}
}
if ( $do_content_extraction ) {
$html = $single_page_response [ 'body' ];
// remove strange things
$html = str_replace ( '</[>' , '' , $html );
$html = convert_to_utf8 ( $html , $single_page_response [ 'headers' ]);
debug ( " Retrieved single-page view from $effective_url " );
}
2014-05-15 22:49:16 +02:00
unset ( $single_page_response );
}
2014-05-15 23:03:31 +02:00
}
if ( $do_content_extraction ) {
2014-05-15 22:49:16 +02:00
debug ( '--------' );
debug ( 'Attempting to extract content' );
$extract_result = $extractor -> process ( $html , $effective_url );
$readability = $extractor -> readability ;
2014-09-15 22:24:06 +02:00
// if user has asked to see parsed HTML, show it and exit.
if ( $debug_show_parsed_html ) {
debug ( " Here's the full HTML after it's been parsed by Full-Text RSS: " );
2019-04-04 23:23:27 +02:00
die ( make_html ( $readability -> dom -> documentElement ));
2014-09-15 22:24:06 +02:00
}
2015-06-14 02:03:20 +02:00
// is this a native ad?
if ( $extract_result && $extractor -> isNativeAd ()) {
debug ( " This article appears to be a native ad " );
if ( ! $isDummyFeed && $options -> remove_native_ads ) {
continue ; // skip this feed item entry
}
}
2019-04-04 23:46:36 +02:00
$base_url = get_base_url ( $readability -> dom , $effective_url );
2019-04-04 23:23:27 +02:00
if ( ! $base_url ) $base_url = $effective_url ;
2014-05-15 22:49:16 +02:00
$content_block = ( $extract_result ) ? $extractor -> getContent () : null ;
2014-05-15 22:56:02 +02:00
$extracted_title = ( $extract_result ) ? $extractor -> getTitle () : '' ;
2014-05-15 22:49:16 +02:00
// Deal with multi-page articles
//die('Next: '.$extractor->getNextPageUrl());
$is_multi_page = ( ! $is_single_page && $extract_result && $extractor -> getNextPageUrl ());
2014-05-15 23:03:31 +02:00
if ( $options -> multipage && $is_multi_page && $options -> content ) {
2014-05-15 22:49:16 +02:00
debug ( '--------' );
debug ( 'Attempting to process multi-page article' );
$multi_page_urls = array ();
$multi_page_content = array ();
while ( $next_page_url = $extractor -> getNextPageUrl ()) {
debug ( '--------' );
debug ( 'Processing next page: ' . $next_page_url );
2019-04-04 23:23:27 +02:00
// If we've got URL, resolve against $base_url
if ( $next_page_url = make_absolute_str ( $base_url , $next_page_url )) {
2014-05-15 22:49:16 +02:00
// check it's not what we have already!
if ( ! in_array ( $next_page_url , $multi_page_urls )) {
// it's not, so let's attempt to fetch it
$multi_page_urls [] = $next_page_url ;
$_prev_ref = $http -> referer ;
if (( $response = $http -> get ( $next_page_url , true )) && $response [ 'status_code' ] < 300 ) {
// make sure mime type is not something with a different action associated
$page_mime_info = get_mime_action_info ( $response [ 'headers' ]);
if ( ! isset ( $page_mime_info [ 'action' ])) {
$html = $response [ 'body' ];
// remove strange things
$html = str_replace ( '</[>' , '' , $html );
$html = convert_to_utf8 ( $html , $response [ 'headers' ]);
if ( $extractor -> process ( $html , $next_page_url )) {
$multi_page_content [] = $extractor -> getContent ();
continue ;
} else { debug ( 'Failed to extract content' ); }
} else { debug ( 'MIME type requires different action' ); }
} else { debug ( 'Failed to fetch URL' ); }
} else { debug ( 'URL already processed' ); }
} else { debug ( 'Failed to resolve against ' . $effective_url ); }
// failed to process next_page_url, so cancel further requests
$multi_page_content = array ();
break ;
2013-04-18 16:11:06 +02:00
}
2014-05-15 22:49:16 +02:00
// did we successfully deal with this multi-page article?
if ( empty ( $multi_page_content )) {
debug ( 'Failed to extract all parts of multi-page article, so not going to include them' );
2014-05-15 23:03:31 +02:00
$_page = $readability -> dom -> createElement ( 'p' );
$_page -> innerHTML = '<em>This article appears to continue on subsequent pages which we could not extract</em>' ;
$multi_page_content [] = $_page ;
2014-05-15 22:49:16 +02:00
}
foreach ( $multi_page_content as $_page ) {
$_page = $content_block -> ownerDocument -> importNode ( $_page , true );
$content_block -> appendChild ( $_page );
}
2014-05-15 23:03:31 +02:00
unset ( $multi_page_urls , $multi_page_content , $page_mime_info , $next_page_url , $_page );
2012-05-01 00:51:43 +02:00
}
}
// use extracted title for both feed and item title if we're using single-item dummy feed
if ( $isDummyFeed ) {
2014-05-15 22:56:02 +02:00
$output -> setTitle ( $extracted_title );
$newitem -> setTitle ( $extracted_title );
} else {
// use extracted title instead of feed item title?
if ( ! $favour_feed_titles && $extracted_title != '' ) {
debug ( 'Using extracted title in generated feed' );
$newitem -> setTitle ( $extracted_title );
}
2011-03-23 23:39:01 +01:00
}
}
2013-04-18 16:11:06 +02:00
if ( $do_content_extraction ) {
// if we failed to extract content...
if ( ! $extract_result ) {
2019-04-04 23:23:27 +02:00
if ( $exclude_on_fail && ( _FF_FTR_MODE != 'simple' )) {
2014-05-15 22:49:16 +02:00
debug ( 'Failed to extract, so skipping (due to exclude on fail parameter)' );
continue ; // skip this and move to next item
2011-11-04 18:10:31 +01:00
}
2019-04-04 23:23:27 +02:00
if ( _FF_FTR_MODE === 'simple' ) {
$html = '' ;
} else {
//TODO: get text sample for language detection
$html = $options -> error_message ;
// keep the original item description
$html .= $item -> get_description ();
}
2011-01-11 19:06:12 +01:00
} else {
2013-04-18 16:11:06 +02:00
$readability -> clean ( $content_block , 'select' );
2015-06-14 02:03:20 +02:00
if ( $options -> rewrite_relative_urls ) {
2019-04-04 23:23:27 +02:00
// we've got $base_url already above
//$base_url = get_base_url($readability->dom);
//if (!$base_url) $base_url = $effective_url;
2015-06-14 02:03:20 +02:00
// rewrite URLs
make_absolute ( $base_url , $content_block );
}
2013-04-18 16:11:06 +02:00
// footnotes
if (( $links == 'footnotes' ) && ( strpos ( $effective_url , 'wikipedia.org' ) === false )) {
$readability -> addFootnotes ( $content_block );
}
2014-09-15 22:24:06 +02:00
// normalise
$content_block->normalize();
// remove empty text nodes
// childNodes is a live list: removing a node while iterating over it makes the
// iteration skip siblings or stop early, so collect the nodes first and remove
// them in a second pass.
$_empty_text_nodes = array();
foreach ($content_block->childNodes as $_n) {
    if ($_n->nodeType === XML_TEXT_NODE && trim($_n->textContent) == '') {
        $_empty_text_nodes[] = $_n;
    }
}
foreach ($_empty_text_nodes as $_n) {
    $content_block->removeChild($_n);
}
unset($_empty_text_nodes);
2014-05-15 22:49:16 +02:00
// remove nesting: <div><div><div><p>test</p></div></div></div> = <p>test</p>
while ( $content_block -> childNodes -> length == 1 && $content_block -> firstChild -> nodeType === XML_ELEMENT_NODE ) {
// only follow these tag names
if ( ! in_array ( strtolower ( $content_block -> tagName ), array ( 'div' , 'article' , 'section' , 'header' , 'footer' ))) break ;
//$html = $content_block->firstChild->innerHTML; // FTR 2.9.5
$content_block = $content_block -> firstChild ;
}
// convert content block to HTML string
// Need to preserve things like body: //img[@id='feature']
2014-09-15 22:24:06 +02:00
if ( in_array ( strtolower ( $content_block -> tagName ), array ( 'div' , 'article' , 'section' , 'header' , 'footer' , 'li' , 'td' ))) {
2019-04-04 23:23:27 +02:00
//$html = $content_block->innerHTML;
$html = make_html ( $content_block , true ); // true = innerHTML
2014-09-15 22:24:06 +02:00
//} elseif (in_array(strtolower($content_block->tagName), array('td', 'li'))) {
// $html = '<div>'.$content_block->innerHTML.'</div>';
2013-04-18 16:11:06 +02:00
} else {
2019-04-04 23:23:27 +02:00
//$html = $content_block->ownerDocument->saveXML($content_block); // essentially outerHTML
$html = make_html ( $content_block ); // outerHTML
2013-04-18 16:11:06 +02:00
}
2014-05-15 23:03:31 +02:00
//unset($content_block);
2013-04-18 16:11:06 +02:00
// post-processing cleanup
$html = preg_replace ( '!<p>[\s\h\v]*</p>!u' , '' , $html );
2019-04-04 23:46:36 +02:00
$html = str_replace ( '<p> </p>' , '' , $html );
2013-04-18 16:11:06 +02:00
if ( $links == 'remove' ) {
2019-04-04 23:23:27 +02:00
$html = preg_replace ( '!<a\s+[^>]*>!' , '' , $html );
$html = preg_replace ( '!</a>!' , '' , $html );
2013-04-18 16:11:06 +02:00
}
// get text sample for language detection
2019-04-04 23:23:27 +02:00
$_og = $extractor -> getOpenGraph ();
$text_sample = '' ;
if ( isset ( $_og [ 'og:title' ])) {
$text_sample .= $_og [ 'og:title' ];
}
if ( isset ( $_og [ 'og:description' ])) {
$text_sample .= ' ' . $_og [ 'og:description' ];
}
$text_sample .= mb_substr ( $content_block -> textContent , 0 , 3000 );
unset ( $_og );
2014-05-15 22:49:16 +02:00
$html = make_substitutions ( $options -> message_to_prepend ) . $html ;
$html .= make_substitutions ( $options -> message_to_append );
2011-11-04 18:40:29 +01:00
}
2011-01-11 19:06:12 +01:00
}
2014-05-15 22:49:16 +02:00
2019-04-04 23:15:15 +02:00
// guid
$_guid = $item -> get_permalink ();
$_ispermalink = 'true' ;
$_g = $item -> get_item_tags ( '' , 'guid' );
if ( is_array ( $_g ) && count ( $_g ) > 0 ) {
$_ispermalink = null ;
$_guid = $_g [ 0 ][ 'data' ];
if ( isset ( $_g [ 0 ][ 'attribs' ]) && isset ( $_g [ 0 ][ 'attribs' ][ '' ]) && isset ( $_g [ 0 ][ 'attribs' ][ '' ][ 'isPermaLink' ])) {
$_ispermalink = $_g [ 0 ][ 'attribs' ][ '' ][ 'isPermaLink' ];
if ( $_ispermalink !== 'true' ) $_ispermalink = 'false' ;
}
}
if ( isset ( $_ispermalink )) {
$newitem -> addElement ( 'guid' , $_guid , array ( 'isPermaLink' => $_ispermalink ));
} else {
$newitem -> addElement ( 'guid' , $_guid );
}
unset ( $_g , $_guid , $_ispermalink );
2014-05-15 23:03:31 +02:00
// filter xss?
if ( $xss_filter ) {
debug ( 'Filtering HTML to remove XSS' );
$html = htmLawed :: hl ( $html , array ( 'safe' => 1 , 'deny_attribute' => 'style' , 'comment' => 1 , 'cdata' => 1 ));
}
// add content
if ( $options -> summary === true ) {
// get summary
$summary = '' ;
if ( ! $do_content_extraction ) {
$summary = $html ;
2011-03-23 23:39:01 +01:00
} else {
2014-05-15 23:03:31 +02:00
// Try to get first few paragraphs
if ( isset ( $content_block ) && ( $content_block instanceof DOMElement )) {
$_paras = $content_block -> getElementsByTagName ( 'p' );
foreach ( $_paras as $_para ) {
$summary .= preg_replace ( " /[ \n \r \t ]+/ " , ' ' , $_para -> textContent ) . ' ' ;
if ( strlen ( $summary ) > 200 ) break ;
2014-05-15 22:56:02 +02:00
}
2014-05-15 23:03:31 +02:00
} else {
$summary = $html ;
2013-04-18 16:11:06 +02:00
}
2014-05-15 23:03:31 +02:00
}
unset ( $_paras , $_para );
$summary = get_excerpt ( $summary );
$newitem -> setDescription ( $summary );
if ( $options -> content ) $newitem -> setElement ( 'content:encoded' , $html );
} else {
if ( $options -> content ) $newitem -> setDescription ( $html );
}
// set date
if (( int ) $item -> get_date ( 'U' ) > 0 ) {
$newitem -> setDate (( int ) $item -> get_date ( 'U' ));
} elseif ( $extractor -> getDate ()) {
$newitem -> setDate ( $extractor -> getDate ());
}
// add authors
if ( $authors = $item -> get_authors ()) {
foreach ( $authors as $author ) {
// for some feeds, SimplePie stores author's name as email, e.g. http://feeds.feedburner.com/nymag/intel
if ( $author -> get_name () !== null ) {
$newitem -> addElement ( 'dc:creator' , $author -> get_name ());
} elseif ( $author -> get_email () !== null ) {
$newitem -> addElement ( 'dc:creator' , $author -> get_email ());
2013-04-18 16:11:06 +02:00
}
2011-01-11 19:06:12 +01:00
}
2014-05-15 23:03:31 +02:00
} elseif ( $authors = $extractor -> getAuthors ()) {
//TODO: make sure the list size is reasonable
foreach ( $authors as $author ) {
// TODO: xpath often selects authors from other articles linked from the page.
// for now choose first item
$newitem -> addElement ( 'dc:creator' , $author );
break ;
}
}
2017-02-18 16:06:19 +01:00
// add open graph
if ( $opengraph = $extractor -> getOpenGraph ()) {
2019-04-04 23:23:27 +02:00
foreach ( $opengraph as $_prop => $_val ) {
$newitem -> addElement ( $_prop , $_val );
2017-02-18 16:06:19 +01:00
}
}
2019-04-04 23:23:27 +02:00
// add Twitter Card
if ( $twitterCard = $extractor -> getTwitterCard ()) {
foreach ( $twitterCard as $_prop => $_val ) {
$newitem -> addElement ( $_prop , $_val );
}
}
unset ( $_prop , $_val );
2014-05-15 23:03:31 +02:00
// add language
if ( $detect_language ) {
$language = $extractor -> getLanguage ();
if ( ! $language ) $language = $feed -> get_language ();
if (( $detect_language == 3 || ( ! $language && $detect_language == 2 )) && $text_sample ) {
try {
if ( $use_cld ) {
// Use PHP-CLD extension
$php_cld = 'CLD\detect' ; // in quotes to prevent PHP 5.2 parse error
$res = $php_cld ( $text_sample );
if ( is_array ( $res ) && count ( $res ) > 0 ) {
$language = $res [ 0 ][ 'code' ];
}
} else {
//die('what');
// Use PEAR's Text_LanguageDetect
if ( ! isset ( $l )) {
$l = new Text_LanguageDetect ();
$l -> setNameMode ( 2 ); // return ISO 639-1 codes (e.g. "en")
}
$l_result = $l -> detect ( $text_sample , 1 );
if ( count ( $l_result ) > 0 ) {
$language = key ( $l_result );
2019-04-04 23:46:36 +02:00
debug ( 'Language detected: ' . $language );
2013-04-18 16:11:06 +02:00
}
}
2014-05-15 23:03:31 +02:00
} catch ( Exception $e ) {
//die('error: '.$e);
// do nothing
2013-04-18 16:11:06 +02:00
}
}
2014-05-15 23:03:31 +02:00
if ( $language && ( strlen ( $language ) < 7 )) {
$newitem -> addElement ( 'dc:language' , $language );
2013-04-18 16:11:06 +02:00
}
2014-05-15 23:03:31 +02:00
}
// add MIME type (if it appeared in our exclusions lists)
if ( isset ( $mime_info [ 'mime' ])) $newitem -> addElement ( 'dc:format' , $mime_info [ 'mime' ]);
// add effective URL (URL after redirects)
if ( isset ( $effective_url )) {
//TODO: ensure $effective_url is valid witout - sometimes it causes problems, e.g.
2015-06-14 02:03:20 +02:00
//http://www.siasat.pk/forum/showthread.php?108883-Pakistan-Chowk-by-Rana-Mubashir-– -25th-March-2012-Special-Program-from-Liari-(Karachi)
2014-05-15 23:03:31 +02:00
//temporary measure: use utf8_encode()
$newitem -> addElement ( 'dc:identifier' , remove_url_cruft ( utf8_encode ( $effective_url )));
2019-04-04 23:15:15 +02:00
if ( $favour_effective_url ) $newitem -> setLink ( remove_url_cruft ( utf8_encode ( $effective_url )));
2014-05-15 23:03:31 +02:00
} else {
$newitem -> addElement ( 'dc:identifier' , remove_url_cruft ( $item -> get_permalink ()));
}
2015-06-14 02:03:20 +02:00
// is this a native ad?
if ( $extractor -> isNativeAd ()) {
$newitem -> addElement ( 'dc:type' , 'Native Ad' );
}
2014-05-15 23:03:31 +02:00
// add categories
if ( $categories = $item -> get_categories ()) {
foreach ( $categories as $category ) {
if ( $category -> get_label () !== null ) {
$newitem -> addElement ( 'category' , $category -> get_label ());
2014-05-15 22:56:02 +02:00
}
}
2014-05-15 23:03:31 +02:00
}
// check for enclosures
if ( $options -> keep_enclosures ) {
if ( $enclosures = $item -> get_enclosures ()) {
foreach ( $enclosures as $enclosure ) {
// thumbnails
foreach (( array ) $enclosure -> get_thumbnails () as $thumbnail ) {
$newitem -> addElement ( 'media:thumbnail' , '' , array ( 'url' => $thumbnail ));
2013-04-18 16:11:06 +02:00
}
2014-05-15 23:03:31 +02:00
if ( ! $enclosure -> get_link ()) continue ;
$enc = array ();
// Media RSS spec ($enc): http://search.yahoo.com/mrss
// SimplePie methods ($enclosure): http://simplepie.org/wiki/reference/start#methods4
$enc [ 'url' ] = $enclosure -> get_link ();
if ( $enclosure -> get_length ()) $enc [ 'fileSize' ] = $enclosure -> get_length ();
if ( $enclosure -> get_type ()) $enc [ 'type' ] = $enclosure -> get_type ();
if ( $enclosure -> get_medium ()) $enc [ 'medium' ] = $enclosure -> get_medium ();
if ( $enclosure -> get_expression ()) $enc [ 'expression' ] = $enclosure -> get_expression ();
if ( $enclosure -> get_bitrate ()) $enc [ 'bitrate' ] = $enclosure -> get_bitrate ();
if ( $enclosure -> get_framerate ()) $enc [ 'framerate' ] = $enclosure -> get_framerate ();
if ( $enclosure -> get_sampling_rate ()) $enc [ 'samplingrate' ] = $enclosure -> get_sampling_rate ();
if ( $enclosure -> get_channels ()) $enc [ 'channels' ] = $enclosure -> get_channels ();
if ( $enclosure -> get_duration ()) $enc [ 'duration' ] = $enclosure -> get_duration ();
if ( $enclosure -> get_height ()) $enc [ 'height' ] = $enclosure -> get_height ();
if ( $enclosure -> get_width ()) $enc [ 'width' ] = $enclosure -> get_width ();
if ( $enclosure -> get_language ()) $enc [ 'lang' ] = $enclosure -> get_language ();
$newitem -> addElement ( 'media:content' , '' , $enc );
2013-04-18 16:11:06 +02:00
}
}
2014-05-15 23:03:31 +02:00
}
2011-01-11 19:06:12 +01:00
$output -> addItem ( $newitem );
unset ( $html );
2013-04-18 16:11:06 +02:00
$item_count ++ ;
2011-01-11 19:06:12 +01:00
}
2013-04-18 16:11:06 +02:00
2011-01-11 19:06:12 +01:00
// output feed
2014-05-15 22:49:16 +02:00
debug ( 'Done!' );
/*
if ( $debug_mode ) {
$_apc_data = apc_cache_info ( 'user' );
var_dump ( $_apc_data ); exit ;
}
*/
if ( ! $debug_mode ) {
if ( $callback ) echo " $callback ( " ; // if $callback is set, $format also == 'json'
if ( $format == 'json' ) $output -> setFormat (( $callback === null ) ? JSON : JSONP );
$add_to_cache = $options -> caching ;
// is smart cache mode enabled?
if ( $add_to_cache && $options -> apc && $options -> smart_cache ) {
// yes, so only cache if this is the second request for this URL
$add_to_cache = ( $apc_cache_hits >= 2 );
// purge cache
if ( $options -> cache_cleanup > 0 ) {
if ( rand ( 1 , $options -> cache_cleanup ) == 1 ) {
// APC purge code adapted from http://www.thimbleopensource.com/tutorials-snippets/php-apc-expunge-script
$_apc_data = apc_cache_info ( 'user' );
foreach ( $_apc_data [ 'cache_list' ] as $_apc_item ) {
2014-09-15 22:24:06 +02:00
// APCu keys incompatible with original APC keys, apparently fixed in newer versions, but not in 4.0.4
// So let's look for those keys and fix here (ctime -> creation_time, key -> info).
if ( isset ( $_apc_item [ 'ctime' ])) $_apc_item [ 'creation_time' ] = $_apc_item [ 'ctime' ];
if ( isset ( $_apc_item [ 'key' ])) $_apc_item [ 'info' ] = $_apc_item [ 'key' ];
if ( $_apc_item [ 'ttl' ] > 0 && ( $_apc_item [ 'ttl' ] + $_apc_item [ 'creation_time' ] < time ())) {
apc_delete ( $_apc_item [ 'info' ]);
}
2014-05-15 22:49:16 +02:00
}
}
}
}
if ( $add_to_cache ) {
ob_start ();
2014-09-15 22:24:06 +02:00
$output -> generateFeed ();
2014-05-15 22:49:16 +02:00
$output = ob_get_contents ();
ob_end_clean ();
2015-06-14 02:03:20 +02:00
if ( $accept === 'html' && $item_count == 0 ) {
2014-05-15 22:49:16 +02:00
// do not cache - in case of temporary server glitch at source URL
} else {
$cache = get_cache ();
if ( $add_to_cache ) $cache -> save ( $output , $cache_id );
}
echo $output ;
2013-04-18 16:11:06 +02:00
} else {
2014-09-15 22:24:06 +02:00
$output -> generateFeed ();
2012-05-01 00:51:43 +02:00
}
2014-05-15 22:49:16 +02:00
if ( $callback ) echo ');' ;
2011-01-11 19:06:12 +01:00
}
2012-05-01 00:51:43 +02:00
///////////////////////////////
// HELPER FUNCTIONS
///////////////////////////////
2015-06-14 02:03:20 +02:00
/**
 * Build the URL of the current makefulltextfeed.php request.
 *
 * The URL is assembled from the request scheme (see is_ssl()), the Host header,
 * the directory of the running script and the global $url, followed by the
 * recognised query string parameters in a fixed order.
 *
 * If the supplied 'key' parameter matches one of $options->api_keys, the key is
 * replaced by its index in that list plus a sha1 hash of key+URL, so the real
 * key does not appear in the generated URL. Otherwise, if both 'key' and 'hash'
 * were supplied, they are passed through unchanged.
 *
 * Parameters that arrive as arrays (e.g. ?html[]=1) are ignored for valued
 * parameters, since they cannot be URL-encoded as a single value.
 *
 * @return string URL pointing back at this request
 */
function get_self_url() {
    global $options, $url;

    $scheme = (is_ssl()) ? 'https://' : 'http://';
    $host = isset($_SERVER['HTTP_HOST']) ? $_SERVER['HTTP_HOST'] : '';
    $script = isset($_SERVER['SCRIPT_NAME']) ? $_SERVER['SCRIPT_NAME'] : '';
    $path = rtrim(dirname($script), '/\\');
    // drop a leading http:// from the URL placed in the query string
    $_qs_url = (strtolower(substr($url, 0, 7)) == 'http://') ? substr($url, 7) : $url;
    $self = $scheme.htmlspecialchars($host.$path).'/makefulltextfeed.php?url='.urlencode($_qs_url);

    // hide API key if we can
    $key = (isset($_GET['key']) && is_string($_GET['key'])) ? $_GET['key'] : null;
    $hash = (isset($_GET['hash']) && is_string($_GET['hash'])) ? $_GET['hash'] : null;
    if ($key !== null) {
        // exact string match only: a loose comparison would let numeric-looking
        // strings such as "1e1" match a configured key of "10"
        $key_index = false;
        $api_keys = isset($options->api_keys) ? (array) $options->api_keys : array();
        foreach ($api_keys as $_index => $_api_key) {
            if ((string) $_api_key === $key) {
                $key_index = $_index;
                break;
            }
        }
        if ($key_index !== false) {
            $self .= '&key='.$key_index;
            $self .= '&hash='.urlencode(sha1($key.$url));
        } elseif ($hash !== null) {
            $self .= '&key='.urlencode($key);
            $self .= '&hash='.urlencode($hash);
        }
    }

    // parameters to carry over, in output order
    // 'value' = URL-encoded value, 'int' = integer cast, 'flag' = name only
    $params = array(
        'html' => 'value',
        'accept' => 'value',
        'max' => 'int',
        'links' => 'value',
        'images' => 'value',
        'exc' => 'value',
        'format' => 'value',
        'callback' => 'value',
        'l' => 'value',
        'lang' => 'value',
        'xss' => 'flag',
        'use_extracted_title' => 'flag',
        'use_effective_url' => 'flag',
        'content' => 'value',
        'summary' => 'value',
        'debug' => 'flag',
        'parser' => 'value',
        'proxy' => 'value',
        'siteconfig' => 'value'
    );
    foreach ($params as $_name => $_kind) {
        if (!isset($_GET[$_name])) continue;
        if ($_kind === 'flag') {
            $self .= '&'.$_name;
            continue;
        }
        // array values cannot be represented as a single parameter
        if (is_array($_GET[$_name])) continue;
        if ($_kind === 'int') {
            $self .= '&'.$_name.'='.(int) $_GET[$_name];
        } else {
            $self .= '&'.$_name.'='.urlencode($_GET[$_name]);
        }
    }
    return $self;
}
/**
 * Sanitise and validate a URL, accepting only http:// and https:// URLs.
 *
 * When the intl extension is available, the host is first converted to its
 * ASCII (punycode) form so internationalised domain names survive the
 * sanitising step, which strips non-ASCII characters.
 *
 * @param string $url URL to check
 * @return string|false the sanitised URL, or false if it is not a valid http(s) URL
 */
function validate_url($url) {
    if (!is_string($url)) return false;

    if (function_exists('idn_to_ascii')) {
        $host = @parse_url($url, PHP_URL_HOST);
        if ($host) {
            if (defined('INTL_IDNA_VARIANT_UTS46')) {
                $puny = idn_to_ascii($host, 0, INTL_IDNA_VARIANT_UTS46);
            } else {
                $puny = idn_to_ascii($host);
            }
            // idn_to_ascii() returns false on failure - in that case leave the
            // URL untouched rather than replacing the host with an empty string
            if (is_string($puny) && $puny !== '' && $puny !== $host) {
                $pos = strpos($url, $host);
                if ($pos !== false) {
                    $url = substr_replace($url, $puny, $pos, strlen($host));
                }
            }
        }
    }

    $url = filter_var($url, FILTER_SANITIZE_URL);
    $test = filter_var($url, FILTER_VALIDATE_URL);
    // deal with bug http://bugs.php.net/51192 (present in PHP 5.2.13 and PHP 5.3.2)
    if ($test === false) {
        $test = filter_var(strtr($url, '-', '_'), FILTER_VALIDATE_URL);
    }
    if ($test !== false && $test !== null && preg_match('!^https?://!i', $url)) {
        return $url;
    }
    return false;
}
2019-04-04 23:46:36 +02:00
/**
 * Read the document's <base href> value, if there is one.
 *
 * @param DOMDocument $dom parsed document to inspect
 * @param string|null $url if given, a base value that does not start with
 *                         http:// or https:// is resolved against this URL
 *                         using make_absolute_str()
 * @return string|false the base URL, or false when no (non-empty) base href is found
 */
function get_base_url($dom, $url=null) {
    $xpath = new DOMXPath($dom);
    // surrounding whitespace is not part of the URL and would otherwise make
    // an absolute base look relative
    $base = trim((string) $xpath->evaluate('string(//head/base/@href)', $dom));
    if (!$base) return false;
    if (isset($url) && !preg_match('!^https?://!i', $base)) {
        $base = make_absolute_str($url, $base);
    }
    return $base;
}
/**
 * Report whether the current request looks like it was made over HTTPS.
 *
 * True when $_SERVER['HTTPS'] is set to something other than '' or 'off',
 * or when the X-Forwarded-Proto request header is 'https'.
 *
 * @return bool
 */
function is_ssl() {
    $https_flag = isset($_SERVER['HTTPS']) ? $_SERVER['HTTPS'] : '';
    if ($https_flag != '' && $https_flag != 'off') {
        return true;
    }
    // fall back to the protocol reported via the forwarding header
    return isset($_SERVER['HTTP_X_FORWARDED_PROTO']) && $_SERVER['HTTP_X_FORWARDED_PROTO'] == 'https';
}
2014-05-15 23:03:31 +02:00
// Adapted from WordPress
// http://core.trac.wordpress.org/browser/tags/3.5.1/wp-includes/formatting.php#L2173
/**
 * Produce a plain-text excerpt of at most $num_words words.
 *
 * Tags are stripped, runs of spaces/tabs/newlines are collapsed to single
 * spaces, and $more is appended when the text had to be cut short.
 *
 * @param string $text HTML or plain text to shorten
 * @param int $num_words maximum number of whitespace-separated words to keep
 * @param string|null $more text appended to a truncated excerpt (defaults to an ellipsis)
 * @return string the excerpt
 */
function get_excerpt($text, $num_words = 55, $more = null) {
    if (null === $more) $more = '…';
    $text = strip_tags((string) $text);
    //TODO: Check if word count is based on single characters (East Asian characters)
    // split into at most $num_words+1 pieces: the extra piece tells us the text was longer
    $words_array = preg_split("/[\n\r\t ]+/", $text, $num_words + 1, PREG_SPLIT_NO_EMPTY);
    $sep = ' ';
    if (count($words_array) > $num_words) {
        array_pop($words_array);
        $text = implode($sep, $words_array).$more;
    } else {
        $text = implode($sep, $words_array);
    }
    // trim whitespace at beginning or end of string
    // See: http://stackoverflow.com/questions/4166896/trim-unicode-whitespace-in-php-5-2
    $trimmed = preg_replace('/^[\pZ\pC]+|[\pZ\pC]+$/u', '', $text);
    // preg_replace() returns null when $text is not valid UTF-8 (the /u modifier
    // rejects it); fall back to a byte-wise trim rather than losing the excerpt
    if ($trimmed === null) {
        return trim($text);
    }
    return $trimmed;
}
2012-05-01 00:51:43 +02:00
/**
 * Decide whether a URL may be fetched, based on the global $options.
 *
 * If $options->allowed_urls is non-empty, the URL must contain at least one of
 * its entries (case-insensitive substring match) and the block list is not
 * consulted. Otherwise the URL is rejected when it contains any entry of
 * $options->blocked_urls.
 *
 * @param string $url URL to test
 * @return bool true if the URL is permitted
 */
function url_allowed($url) {
    global $options;

    // no allow-list configured: only the block list applies
    if (empty($options->allowed_urls)) {
        foreach ($options->blocked_urls as $pattern) {
            if (stripos($url, $pattern) !== false) {
                return false;
            }
        }
        return true;
    }

    // allow-list configured: a single match is enough
    foreach ($options->allowed_urls as $pattern) {
        if (stripos($url, $pattern) !== false) {
            return true;
        }
    }
    return false;
}
//////////////////////////////////////////////
// Convert $html to UTF8
// (uses HTTP headers and HTML to find encoding)
// adapted from http://stackoverflow.com/questions/910793/php-detect-encoding-and-make-everything-utf-8
//////////////////////////////////////////////
2014-09-15 22:24:06 +02:00
/**
 * Convert an HTML document to UTF-8.
 *
 * The source encoding is taken from the last Content-Type header found in
 * $header; failing that, from an XML declaration or <meta> charset within the
 * first 50000 bytes of $html. When no encoding can be found the document is
 * treated as UTF-8.
 *
 * @param string $html document body
 * @param string|array|null $header raw HTTP response headers (string, or array of header lines)
 * @return string the document, converted to UTF-8 where a conversion was needed and possible
 */
function convert_to_utf8($html, $header=null) {
    $encoding = null;
    if ($html || $header) {
        if (is_array($header)) $header = implode("\n", $header);
        if (!$header || !preg_match_all('/^Content-Type:\s+([^;]+)(?:;\s*charset=["\']?([^;"\'\n]*))?/im', $header, $match, PREG_SET_ORDER)) {
            // error parsing the response
            debug('Could not find Content-Type header in HTTP response');
        } else {
            $match = end($match); // get last matched element (in case of redirects)
            if (isset($match[2])) $encoding = trim($match[2], "\"' \r\n\0\x0B\t");
        }
        // TODO: check to see if encoding is supported (can we convert it?)
        // Some sites return invalid encoding names, e.g. 'none'.
        // Problem URL: http://facta.co.jp/blog/archives/20111026001026.html
        // Treat such a value as "no encoding given" so it can never reach the converter.
        if ($encoding !== null && strtolower($encoding) === 'none') {
            $encoding = null;
        }
        if (!$encoding) {
            // search for encoding in HTML - only look at the first 50000 characters
            // Why 50000? See, for example, http://www.lemonde.fr/festival-de-cannes/article/2012/05/23/deux-cretes-en-goguette-sur-la-croisette_1705732_766360.html
            // TODO: improve this so it looks at smaller chunks first
            $html_head = substr((string) $html, 0, 50000);
            if (preg_match('/^<\?xml\s+version=(?:"[^"]*"|\'[^\']*\')\s+encoding=("[^"]*"|\'[^\']*\')/s', $html_head, $match)) {
                $encoding = trim($match[1], '"\'');
            } elseif (preg_match('/<meta\s+http-equiv=["\']?Content-Type["\']? content=["\'][^;]+;\s*charset=["\']?([^;"\'>]+)/i', $html_head, $match)) {
                $encoding = trim($match[1]);
            } elseif (preg_match_all('/<meta\s+([^>]+)>/i', $html_head, $match)) {
                foreach ($match[1] as $_test) {
                    if (preg_match('/charset=["\']?([^"\']+)/i', $_test, $_m)) {
                        $encoding = trim($_m[1]);
                        break;
                    }
                }
            }
        }
        if (isset($encoding)) $encoding = strtolower(trim($encoding));
        // fix bad encoding values
        if ($encoding === 'iso-8850-1') $encoding = 'iso-8859-1';

        // Bytes 130-159 are Windows-1252 punctuation (MS Word smart quotes etc.) when
        // they show up in single-byte content. Replace them when the document is
        // declared as ISO-8859-1, or when no encoding is known AND the content is not
        // valid UTF-8. In valid UTF-8 these byte values are continuation bytes of
        // multi-byte characters and must not be touched.
        $replace_c1_bytes = ($encoding === 'iso-8859-1')
            || (!$encoding && $html && !preg_match('//u', $html));
        if ($replace_c1_bytes) {
            // Replacements are ASCII-only entities so they survive the byte-level
            // conversion to UTF-8 below unchanged.
            $trans = array();
            $trans[chr(130)] = '&sbquo;';  // Single Low-9 Quotation Mark
            $trans[chr(131)] = '&fnof;';   // Latin Small Letter F With Hook
            $trans[chr(132)] = '&bdquo;';  // Double Low-9 Quotation Mark
            $trans[chr(133)] = '&hellip;'; // Horizontal Ellipsis
            $trans[chr(134)] = '&dagger;'; // Dagger
            $trans[chr(135)] = '&Dagger;'; // Double Dagger
            $trans[chr(136)] = '&circ;';   // Modifier Letter Circumflex Accent
            $trans[chr(137)] = '&permil;'; // Per Mille Sign
            $trans[chr(138)] = '&Scaron;'; // Latin Capital Letter S With Caron
            $trans[chr(139)] = '&lsaquo;'; // Single Left-Pointing Angle Quotation Mark
            $trans[chr(140)] = '&OElig;';  // Latin Capital Ligature OE
            $trans[chr(145)] = '&lsquo;';  // Left Single Quotation Mark
            $trans[chr(146)] = '&rsquo;';  // Right Single Quotation Mark
            $trans[chr(147)] = '&ldquo;';  // Left Double Quotation Mark
            $trans[chr(148)] = '&rdquo;';  // Right Double Quotation Mark
            $trans[chr(149)] = '&bull;';   // Bullet
            $trans[chr(150)] = '&ndash;';  // En Dash
            $trans[chr(151)] = '&mdash;';  // Em Dash
            $trans[chr(152)] = '&tilde;';  // Small Tilde
            $trans[chr(153)] = '&trade;';  // Trade Mark Sign
            $trans[chr(154)] = '&scaron;'; // Latin Small Letter S With Caron
            $trans[chr(155)] = '&rsaquo;'; // Single Right-Pointing Angle Quotation Mark
            $trans[chr(156)] = '&oelig;';  // Latin Small Ligature OE
            $trans[chr(159)] = '&Yuml;';   // Latin Capital Letter Y With Diaeresis
            $html = strtr($html, $trans);
        }

        if (!$encoding) {
            debug('No character encoding found, so treating as UTF-8');
            $encoding = 'utf-8';
        } else {
            debug('Character encoding: '.$encoding);
            if ($encoding !== 'utf-8') {
                debug('Converting to UTF-8');
                $converted = SimplePie_Misc::change_encoding($html, $encoding, 'utf-8');
                if ($converted === false) {
                    // unsupported/unknown encoding: keep the content as it is
                    // instead of replacing the whole document with false
                    debug('Conversion to UTF-8 failed, leaving content unconverted');
                } else {
                    $html = $converted;
                }
            }
        }
    }
    return $html;
}
2015-06-14 02:03:20 +02:00
// Make the link targets inside a DOM subtree absolute.
// Every <a href> and <img src> below $elem, plus $elem itself when it is
// an <a> or <img>, is passed to make_absolute_attr().
// param: $base - base URL, handed to the SimplePie_IRI constructor
// param: $elem - DOM node to process
function make_absolute($base, $elem) {
    $base = new SimplePie_IRI($base);
    // remove '//' in URL path (used to prevent URLs from resolving properly)
    // TODO: check if this is still the case
    if (isset($base->path)) {
        $base->path = preg_replace('!//+!', '/', $base->path);
    }
    $targets = array('a' => 'href', 'img' => 'src');
    foreach ($targets as $tag => $attr) {
        $nodes = $elem->getElementsByTagName($tag);
        // walk the node list from last to first
        $idx = $nodes->length;
        while (--$idx >= 0) {
            make_absolute_attr($base, $nodes->item($idx), $attr);
        }
        // the node list above only holds descendants, so check $elem too
        if (strtolower($elem->tagName) == $tag) {
            make_absolute_attr($base, $elem, $attr);
        }
    }
}
2015-06-14 02:03:20 +02:00
// Make one URL-carrying attribute of a DOM element absolute.
// param: $base - base to resolve against, passed to SimplePie_IRI::absolutize()
// param: $e    - element to update; untouched when it has no $attr attribute
// param: $attr - attribute name, e.g. 'href' or 'src'
// The attribute is only rewritten when its value does not already start
// with http:// or https:// and SimplePie_IRI::absolutize() succeeds.
function make_absolute_attr($base, $e, $attr) {
    if (!$e->hasAttribute($attr)) {
        return;
    }
    // Trim leading and trailing white space. I don't really like this but
    // unfortunately it does appear on some sites. e.g. <img src=" /path/to/image.jpg" />
    // Decoding %20 first lets trim() also strip encoded spaces at the edges;
    // any remaining spaces are then re-encoded.
    $url = trim(str_replace('%20', ' ', $e->getAttribute($attr)));
    $url = str_replace(' ', '%20', $url);
    // The test is anchored (as in make_absolute_str) so that a relative URL
    // which merely contains another URL, e.g. /go?to=http://example.com/,
    // is still resolved against $base.
    if (preg_match('!^https?://!i', $url)) {
        // already absolute
        return;
    }
    $absolute = SimplePie_IRI::absolutize($base, $url);
    if ($absolute) {
        $e->setAttribute($attr, $absolute->get_uri());
    }
}
2015-06-14 02:03:20 +02:00
// Resolve a URL string against a base URL.
// param: $base - base URL, handed to the SimplePie_IRI constructor
// param: $url  - URL to resolve
// return: $url unchanged if it already starts with http:// or https://,
//         otherwise the resolved URI, or false if it could not be resolved
function make_absolute_str($base, $url) {
    $base = new SimplePie_IRI($base);
    // remove '//' in URL path (causes URLs not to resolve properly)
    if (isset($base->path)) {
        $base->path = preg_replace('!//+!', '/', $base->path);
    }
    if (preg_match('!^https?://!i', $url)) {
        // already absolute
        return $url;
    }
    $resolved = SimplePie_IRI::absolutize($base, $url);
    return $resolved ? $resolved->get_uri() : false;
}
2019-04-04 23:23:27 +02:00
// Serialise a DOM node to markup.
// param: $dom   - DOM node to serialise
// param: $inner - false: include the node itself; true: only its contents
// return: markup string
// Uses the Masterminds HTML5 serialiser when $options->html5_output is set,
// otherwise saveXML() on the owner document / the node's innerHTML property.
function make_html($dom, $inner = false) {
    global $options;
    static $html5 = null;

    if (!$options->html5_output) {
        // NOTE(review): innerHTML is not a native DOMElement property -
        // presumably supplied by a DOM element subclass registered elsewhere; verify.
        return $inner ? $dom->innerHTML : $dom->ownerDocument->saveXML($dom);
    }

    // create the HTML5 serialiser once and reuse it on later calls
    if ($html5 === null) {
        $html5 = new Masterminds\HTML5(array('disable_html_ns' => true));
    }
    if (!$inner) {
        return $html5->saveHTML($dom);
    }
    // inner markup: concatenate the serialisation of each child node
    $markup = '';
    if ($dom->hasChildNodes()) {
        foreach ($dom->childNodes as $child) {
            $markup .= $html5->saveHTML($child);
        }
    }
    return $markup;
}
2012-05-01 00:51:43 +02:00
// Look for a single page view of an article and fetch it.
// The site config built for $url supplies XPath expressions in
// single_page_link (evaluated on $html) or, failing that, in
// single_page_link_in_feed (evaluated on the feed item description).
// param: $item - feed item; only used for get_description()
// param: $html - HTML of the page at $url
// param: $url  - URL of the page
// return: response from $http->get() for the single page URL (status code
//         below 300), or false if no usable single page was found
function get_single_page($item, $html, $url) {
    global $http, $extractor;
    debug('Looking for site config files to see if single page link exists');
    $site_config = $extractor->buildSiteConfig($url, $html);
    $splink = null;
    if (!empty($site_config->single_page_link)) {
        $splink = $site_config->single_page_link;
    } elseif (!empty($site_config->single_page_link_in_feed)) {
        // single page link xpath is targeted at feed
        $splink = $site_config->single_page_link_in_feed;
        // so let's replace HTML with feed item description
        $html = $item->get_description();
    }
    if (!isset($splink)) {
        return false;
    }

    // Build DOM tree from HTML
    $readability = new Readability($html, $url);
    $xpath = new DOMXPath($readability->dom);

    // Loop through single_page_link xpath expressions
    $single_page_url = null;
    foreach ($splink as $pattern) {
        // Do we have conditions?
        $condition = $site_config->get_if_page_contains_condition('single_page_link', $pattern);
        if ($condition) {
            // errors are suppressed: a bad expression simply means "no match"
            $found = @$xpath->evaluate($condition, $readability->dom);
            if (!($found instanceof DOMNodeList && $found->length > 0)) {
                // move on to next single page link XPath
                continue;
            }
        }
        $elems = @$xpath->evaluate($pattern, $readability->dom);
        if (is_string($elems)) {
            // An empty string is not a URL (the DOMAttr branch below also
            // rejects empty values), so keep trying the remaining patterns
            // instead of stopping.
            $candidate = trim($elems);
            if ($candidate !== '') {
                $single_page_url = $candidate;
                break;
            }
        } elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
            // $node rather than $item, so the $item parameter is not overwritten
            foreach ($elems as $node) {
                if ($node instanceof DOMElement && $node->hasAttribute('href')) {
                    $single_page_url = $node->getAttribute('href');
                    break 2;
                } elseif ($node instanceof DOMAttr && $node->value) {
                    $single_page_url = $node->value;
                    break 2;
                }
            }
        }
    }

    $base_url = get_base_url($readability->dom, $url);
    if (!$base_url) {
        $base_url = $url;
    }
    // If we've got URL, resolve against $base_url
    if (!isset($single_page_url)) {
        return false;
    }
    $single_page_url = make_absolute_str($base_url, $single_page_url);
    // check it's not what we have already!
    if (!$single_page_url || $single_page_url == $url) {
        return false;
    }

    // it's not, so let's try to fetch it...
    // the referer is changed for this request only and restored afterwards
    $_prev_ref = $http->referer;
    $http->referer = $single_page_url;
    $response = $http->get($single_page_url, true);
    $http->referer = $_prev_ref;
    if ($response && $response['status_code'] < 300) {
        return $response;
    }
    return false;
}
2014-05-15 22:49:16 +02:00
// based on content-type http header, decide what to do
// param: HTTP headers string
// return: array with keys: 'mime', 'type', 'subtype', 'action', 'name'
// e.g. array('mime'=>'image/jpeg', 'type'=>'image', 'subtype'=>'jpeg', 'action'=>'link', 'name'=>'Image')
// The array is empty when no Content-Type header is present; 'action' and
// 'name' are only set when $options->content_type_exc has an entry for the
// full mime type or, failing that, for its first part.
function get_mime_action_info($headers) {
    global $options;
    $info = array();
    // '.' is allowed in the subtype so that vendor types such as
    // application/vnd.ms-excel are captured in full.
    $pattern = '!^Content-Type:\s*(([-\w]+)/([-\w\+\.]+))!im';
    if (!preg_match_all($pattern, $headers, $matches, PREG_SET_ORDER)) {
        return $info;
    }
    // use the last Content-Type header found: when the headers of redirect
    // responses are included, the final response comes last
    $match = end($matches);
    // match[1] = full mime type, e.g. image/jpeg
    // match[2] = first part, e.g. image
    // match[3] = last part, e.g. jpeg
    $info['mime'] = strtolower(trim($match[1]));
    $info['type'] = strtolower(trim($match[2]));
    $info['subtype'] = strtolower(trim($match[3]));
    // look for full mime type (e.g. image/jpeg) or just type (e.g. image)
    foreach (array($info['mime'], $info['type']) as $_mime) {
        if (isset($options->content_type_exc[$_mime])) {
            $info['action'] = $options->content_type_exc[$_mime]['action'];
            $info['name'] = $options->content_type_exc[$_mime]['name'];
            break;
        }
    }
    return $info;
}
2013-04-18 16:11:06 +02:00
// Strip Google Analytics campaign parameters (utm_*) from a URL.
// param: $url - URL string
// return: URL without utm_* parameters; returned unchanged when it has none
// regex adapted from http://navitronic.co.uk/2010/12/removing-google-analytics-cruft-from-urls/
// https://gist.github.com/758177
function remove_url_cruft($url) {
    // Remove each utm_* parameter together with the '&' that follows it.
    // The separator in front is kept, so parameters after a removed one
    // stay attached to the '?'. The value stops at '#' so that a fragment
    // is never swallowed.
    $cleaned = preg_replace('/(?<=[?&])utm_[a-z]+=[^&#]+&?/', '', $url, -1, $removed);
    if ($cleaned === null || $removed === 0) {
        // nothing to strip (or regex failure): leave the URL exactly as it was
        return $url;
    }
    // drop a '?' or '&' left dangling at the end of the query string
    return preg_replace('/[?&]+(?=#|$)/', '', $cleaned);
}
2012-05-01 00:51:43 +02:00
// Replace the {url} and {effective-url} placeholders in a string.
// {url} becomes the permalink of the global $item and {effective-url} the
// global $effective_url, both passed through htmlspecialchars().
// param: $string - text that may contain placeholders
// return: $string with placeholders replaced (returned as is when empty)
function make_substitutions($string) {
    global $item, $effective_url;
    if ($string == '') {
        return $string;
    }
    $placeholders = array('{url}', '{effective-url}');
    $values = array(
        htmlspecialchars($item->get_permalink()),
        htmlspecialchars($effective_url)
    );
    // array form applies the replacements one after the other, in order
    return str_replace($placeholders, $values, $string);
}
2014-05-15 22:49:16 +02:00
// Return the shared Zend_Cache object, creating it on first use.
// Settings come from the global $options; the cache directory is
// <cache_dir>/rss-with-key/ when the global $valid_key is truthy and
// <cache_dir>/rss/ otherwise.
// return: object created by Zend_Cache::factory('Core', 'File', ...)
function get_cache() {
    global $options, $valid_key;
    static $cache = null;
    if ($cache !== null) {
        // already built by an earlier call
        return $cache;
    }

    $frontend = array(
        'lifetime' => $options->cache_time * 60, // cache lifetime
        'automatic_serialization' => false,
        'write_control' => false,
        'automatic_cleaning_factor' => $options->cache_cleanup,
        'ignore_user_abort' => false
    );

    // directory where to put the cache files
    $subdir = ($valid_key) ? '/rss-with-key/' : '/rss/';
    $backend = array(
        'cache_dir' => $options->cache_dir . $subdir,
        'file_locking' => false,
        'read_control' => true,
        'read_control_type' => 'strlen',
        'hashed_directory_level' => $options->cache_directory_level,
        'hashed_directory_perm' => 0777,
        'cache_file_perm' => 0664,
        'file_name_prefix' => 'ff'
    );

    // getting a Zend_Cache_Core object
    $cache = Zend_Cache::factory('Core', 'File', $frontend, $backend);
    return $cache;
}
// Print a debug message when the global $debug_mode is enabled.
// The message is echoed with a '* ' prefix and the output buffers are
// flushed straight away.
// param: $msg - message to print
function debug($msg) {
    global $debug_mode;
    if (!$debug_mode) {
        return;
    }
    echo '* ', $msg, " \n ";
    ob_flush();
    flush();
}