diff --git a/changelog.txt b/changelog.txt
index 9ab591d..d90ad28 100644
--- a/changelog.txt
+++ b/changelog.txt
@@ -2,6 +2,16 @@ FiveFilters.org: Full-Text RSS
http://fivefilters.org/content-only/
CHANGELOG
------------------------------------
+2.5 (2011-01-08)
+ - New option: custom extraction pattern (CSS selectors)
+ - New option: allowed URLs (restrict service to pre-defined feeds/domains)
+ - New option: exclude items on fail (remove items from feed if content extraction fails)
+ - Remove 'http://' from URL before form submission (prevents errors on hosts which have overly vigilant security software)
+ - Allow overriding of index.php with custom_index.php
+ - config.php now required (override with custom_config.php)
+ - index.php now uses config.php to determine what to display
+ - Bug fix: occasional fatal error in IRI::__toString() (IRI updated)
+ - Bug fix: workaround for PHP bug http://bugs.php.net/51192 (fixed in HumbleHttpAgent.php)
2.2 (2010-10-30)
- Character-encoding detection improved (minor change)
diff --git a/cleancache.php b/cleancache.php
new file mode 100644
index 0000000..49c6a3e
--- /dev/null
+++ b/cleancache.php
@@ -0,0 +1,119 @@
+.
+*/
+
+// Usage
+// -----
+// Set up your scheduler (e.g. cron) to request this file periodically.
+// Note: this file must not be named cleancache.php so please rename it.
+// We ask you to do this to prevent others from initiating
+// the cache cleanup process. It will not run if it's called cleancache.php.
+
+// Report every error level except notices, and print errors in the response.
+error_reporting(E_ALL & ~E_NOTICE);
+ini_set("display_errors", 1);
+// Allow up to two minutes; the @ silences the warning raised on hosts where
+// the time limit cannot be changed.
+@set_time_limit(120);
+
+// Refuse to run while the script still carries its distributed file name.
+if (basename(__FILE__) == 'cleancache.php') {
+    die('cleancache.php must be renamed');
+}
+
+// Put the bundled libraries directory at the front of the include path.
+set_include_path(implode(PATH_SEPARATOR, array(
+    realpath(dirname(__FILE__).'/libraries'),
+    get_include_path()
+)));
+
+// Autoloading of classes allows us to include files only when they're
+// needed. Zend_Cache is the only class this script refers to by name, so it
+// is the only entry in the map.
+//
+// The loader is registered with spl_autoload_register() instead of being
+// declared as the magic __autoload() function: __autoload() is deprecated as
+// of PHP 7.2, declaring it is a fatal error from PHP 8.0, and it is skipped
+// entirely once any other code registers an SPL autoloader.
+//
+// @param string $class_name Name of the class PHP is trying to load.
+// @return bool True when a mapped file was included, false otherwise.
+function ff_cleancache_autoload($class_name) {
+    static $mapping = array(
+        'Zend_Cache' => 'Zend/Cache.php'
+    );
+    if (!isset($mapping[$class_name])) {
+        return false;
+    }
+    // Resolved through the include path.
+    require_once $mapping[$class_name];
+    return true;
+}
+spl_autoload_register('ff_cleancache_autoload');
+require_once(dirname(__FILE__).'/config.php');
+if (!$options->caching) die('Caching is disabled');
+
+// Removes expired entries from one sub-directory of the cache.
+//
+// The three caches cleaned below share every backend setting and differ only
+// in directory name, frontend lifetime and serialization flag, so the common
+// Zend_Cache set-up lives here instead of being repeated for each of them.
+//
+// @param object $options   Configuration object available after config.php is
+//                          loaded; its cache_dir and cache_directory_level
+//                          properties are read.
+// @param string $sub_dir   Directory name below $options->cache_dir.
+// @param int    $lifetime  Frontend cache lifetime in seconds.
+// @param bool   $serialize Value for the frontend 'automatic_serialization' option.
+// @return mixed Whatever the cache object's clean() call returns.
+function ff_clean_cache_dir($options, $sub_dir, $lifetime, $serialize) {
+    $frontendOptions = array(
+        'lifetime' => $lifetime,
+        'automatic_serialization' => $serialize,
+        'write_control' => false,
+        // 0 turns off Zend_Cache's own random cleaning; this script cleans explicitly.
+        'automatic_cleaning_factor' => 0,
+        'ignore_user_abort' => false
+    );
+    $backendOptions = array(
+        'cache_dir' => $options->cache_dir.'/'.$sub_dir.'/',
+        'file_locking' => false,
+        'read_control' => true,
+        'read_control_type' => 'strlen',
+        'hashed_directory_level' => $options->cache_directory_level,
+        'hashed_directory_umask' => 0777,
+        'cache_file_umask' => 0664,
+        'file_name_prefix' => 'ff'
+    );
+    $cache = Zend_Cache::factory('Core', 'File', $frontendOptions, $backendOptions);
+    return $cache->clean(Zend_Cache::CLEANING_MODE_OLD);
+}
+
+// clean http response cache (30 minute lifetime, serialized entries)
+ff_clean_cache_dir($options, 'http-responses', 30*60, true);
+
+// clean rss (non-key) cache (20 minute lifetime)
+ff_clean_cache_dir($options, 'rss', 20*60, false);
+
+// clean rss (key) cache (20 minute lifetime)
+ff_clean_cache_dir($options, 'rss-with-key', 20*60, false);
+
+?>
\ No newline at end of file
diff --git a/config-sample.php b/config.php
similarity index 63%
rename from config-sample.php
rename to config.php
index 0336ecb..b85f127 100644
--- a/config-sample.php
+++ b/config.php
@@ -1,5 +1,10 @@
enabled = true;
-// Restrict service
-// ----------------------
-// Set this to true if you'd like certain features
-// to be available only to key holders.
-// Affected features:
-// * Link handling (disabled for non-key holders if set to true)
-// * Cache time (20 minutes for non-key holders if set to true)
-$options->restrict = false;
-
// Default entries (without API key)
// ----------------------
// The number of feed items to process when no API key is supplied.
@@ -35,6 +31,27 @@ $options->max_entries = 10;
// the extracted content block.
$options->rewrite_relative_urls = true;
+// Exclude items if extraction fails
+// ---------------------------------
+// Excludes items from the resulting feed
+// if we cannot extract any content from the
+// item URL.
+// Possible values...
+// Enable: true
+// Disable: false (default)
+// User decides: 'user' (this option will appear on the form)
+$options->exclude_items_on_fail = 'user';
+
+// Extraction pattern
+// ---------------------------------
+// Specify what should get extracted
+// Possible values:
+// Auto detect: 'auto'
+// Custom: css string (e.g. 'div#content')
+// Element within auto-detected block: 'auto ' + css string (e.g. 'auto p')
+// User decides: 'user' (default, this option will appear on the form)
+$options->extraction_pattern = 'user';
+
// Enable caching
// ----------------------
// Enable this if you'd like to cache results
@@ -58,9 +75,19 @@ $options->message_to_prepend = '';
// HTML to insert at the end of each feed item when no API key is supplied.
$options->message_to_append = '';
+// URLs to allow
+// ----------------------
+// List of URLs (or parts of a URL) which the service will accept.
+// If the list is empty, all URLs (except those specified in the blocked list below)
+// will be permitted.
+// Empty: array();
+// Non-empty example: array('example.com', 'anothersite.org');
+$options->allowed_urls = array();
+
// URLs to block
// ----------------------
-// List of URLs (or parts of a URL) which the service should not accept
+// List of URLs (or parts of a URL) which the service will not accept.
+// Note: this list is ignored if allowed_urls is not empty
$options->blocked_urls = array();
// Error message when content extraction fails (without API key)
@@ -71,43 +98,6 @@ $options->error_message = '[unable to retrieve full-text content]';
/// ADVANCED OPTIONS ////////////////////////////
/////////////////////////////////////////////////
-// API keys
-// ----------------------
-// NOTE: You do not need an API key from fivefilters.org to run your own
-// copy of the code. This is here if you'd like to offer others an API key
-// to access _your_ copy.
-// Keys let you group users - those with a key and those without - and
-// restrict access to the service to those without a key.
-// If you want everyone to access the service in the same way, you can
-// leave the array below empty and ignore the API key options further down.
-// The options further down in this file will allow you to specify
-// how the service should behave in each mode.
-$options->api_keys = array();
-
-// Default entries (with API key)
-// ----------------------
-// The number of feed items to process when a valid API key is supplied.
-$options->default_entries_with_key = 5;
-
-// Max entries (with API key)
-// ----------------------
-// The maximum number of feed items to process when a valid API key is supplied.
-$options->max_entries_with_key = 10;
-
-// Message to prepend (with API key)
-// ----------------------
-// HTML to insert at the beginning of each feed item when a valid API key is supplied.
-$options->message_to_prepend_with_key = '';
-
-// Message to append (with API key)
-// ----------------------
-// HTML to insert at the end of each feed item when a valid API key is supplied.
-$options->message_to_append_with_key = '';
-
-// Error message when content extraction fails (with API key)
-// ----------------------
-$options->error_message_with_key = '[unable to retrieve full-text content]';
-
// Alternative Full-Text RSS service URL
// ----------------------
// This option is to offer very simple load distribution for the service.
@@ -127,4 +117,71 @@ $options->alternative_url = '';
// It's best not to change this if you're unsure.
$options->cache_directory_level = 0;
+// Cache cleanup
+// -------------
+// 0 = script will not clean cache (rename cleancache.php and use it for scheduled (e.g. cron) cache cleanup)
+// 1 = clean cache every time the script runs (not recommended)
+// 100 = (roughly) clean cache once out of every 100 script runs
+// x = (roughly) clean cache once out of every x script runs
+// ...you get the idea :)
+$options->cache_cleanup = 100;
+
+/////////////////////////////////////////////////
+/// DEPRECATED OPTIONS
+/// THESE OPTIONS WILL CHANGE IN THE NEXT
+/// VERSION, WE RECOMMEND YOU DO NOT USE THEM
+/////////////////////////////////////////////////
+
+// Restrict service (deprecated)
+// -----------------------------
+// Set this to true if you'd like certain features
+// to be available only to key holders.
+// Affected features:
+// * Link handling (disabled for non-key holders if set to true)
+// * Cache time (20 minutes for non-key holders if set to true)
+$options->restrict = false;
+
+// API keys (deprecated)
+// ----------------------
+// NOTE: You do not need an API key from fivefilters.org to run your own
+// copy of the code. This is here if you'd like to offer others an API key
+// to access _your_ copy.
+// Keys let you group users - those with a key and those without - and
+// restrict what the service offers to those without a key.
+// If you want everyone to access the service in the same way, you can
+// leave the array below empty and ignore the API key options further down.
+// The options further down in this file will allow you to specify
+// how the service should behave in each mode.
+$options->api_keys = array();
+
+// Default entries (with API key) (deprecated)
+// ----------------------
+// The number of feed items to process when a valid API key is supplied.
+$options->default_entries_with_key = 5;
+
+// Max entries (with API key) (deprecated)
+// ----------------------
+// The maximum number of feed items to process when a valid API key is supplied.
+$options->max_entries_with_key = 10;
+
+// Message to prepend (with API key) (deprecated)
+// ----------------------
+// HTML to insert at the beginning of each feed item when a valid API key is supplied.
+$options->message_to_prepend_with_key = '';
+
+// Message to append (with API key) (deprecated)
+// ----------------------
+// HTML to insert at the end of each feed item when a valid API key is supplied.
+$options->message_to_append_with_key = '';
+
+// Error message when content extraction fails (with API key) (deprecated)
+// ----------------------
+$options->error_message_with_key = '[unable to retrieve full-text content]';
+
+/////////////////////////////////////////////////
+/// DO NOT CHANGE ANYTHING BELOW THIS ///////////
+/////////////////////////////////////////////////
+
+if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '2.5');
+
?>
\ No newline at end of file
diff --git a/css/feed.css b/css/feed.css
new file mode 100644
index 0000000..866ab76
--- /dev/null
+++ b/css/feed.css
@@ -0,0 +1,30 @@
+/* RSS CSS Document: presentation rules for a feed rendered as a web page (presumably via css/feed.xsl; confirm). */
+
+* { margin:0; padding:0; } /* reset margins and padding on every element */
+
+p { padding: .5em 0; } /* vertical spacing for paragraphs */
+
+h1,h2,h3,h4,h5,h6 { font-size: 1em; padding: .5em 0; } /* all heading levels share the body text size */
+
+html { display:block; padding-bottom:50px; } /* leave space below the page content */
+body { font:80% Verdana, sans-serif; color:#000; padding:25px 0 0 35px; } /* base font plus top and left page padding */
+
+a { color:#5BAB03; text-decoration:none; } /* green links, not underlined by default */
+a:hover { color:#5BAB03; text-decoration: underline;} /* same colour, underlined on hover */
+
+ul { margin-left:1.5em; } /* indent lists */
+li { margin-bottom:0.4em; } /* gap between list items */
+div#content>ul { list-style-type: none; } /* no bullets on lists directly inside #content */
+div.article>li>a { font-weight:bold; font-size: 1.3em;} /* emphasise links in an li directly inside div.article */
+
+
+div { line-height:1.6em; } /* line spacing for all div content */
+
+div#content { background:#fff; margin-right:15px; padding-left:1em;} /* main content container */
+div#content div { margin:0 1em 1em 0; } /* spacing for divs nested anywhere inside #content */
+
+div#explanation { padding:1em 1em 0 1em; border:1px solid #ddd; background:#efefef; margin:0 2em 2em 0; } /* bordered grey box */
+div#explanation h1 { font-weight:normal; font-size:1.8em; margin-bottom:0.3em; } /* large, non-bold title inside the box */
+div#explanation p { margin-bottom:1em; } /* paragraph spacing inside the box */
+
+.small { font-size: .7em; color: #666; } /* small grey text */
\ No newline at end of file
diff --git a/css/feed.xsl b/css/feed.xsl
new file mode 100644
index 0000000..216d34e
--- /dev/null
+++ b/css/feed.xsl
@@ -0,0 +1,34 @@
+
+
+
+
+
+
+
+ (full-text feed)
+
+
+
+
+
(full-text feed)
+
You are viewing an auto-generated full-text RSS feed. RSS feeds allow you to stay up to date with the latest news and features you want from websites. To subscribe to it, you will need a News Reader or other similar device.
+
Below is the latest content available from this feed.
Thanks for downloading and setting this up. If you haven't done so already, check server compatibility
- to see if your environment will support this application.
-
If everything's okay, feel free to edit this file (index.html) and make any changes you like. If you plan
- to offer this service to others, please keep a download link so users can grab a copy of the code if they
- want it (you can either offer the download yourself, or link to the download page on fivefilters.org).
- That's one requirement of the license.
-
Thanks! :)
-
-
For everyone else
-
-
About
-
This is a free software project to help people extract content from web pages. It can extract content from a standard HTML page and return a 1-item feed or it can transform an existing feed into a full-text feed. It is being developed as part of the Five Filters project to promote independent, non-corporate media.
-
-
Bookmarklet
-
To easily transform partial-feeds you encounter (or convert any content on a page into a 1-item feed), drag the link below to your browser's bookmarks toolbar.
- Then whenever you'd like a full-text feed, click the bookmarklet.
-
Drag this:
-
-
-
API
-
To extract content from a web page or to transform an existing partial feed to full text, pass the URL (encoded) in the querystring to the following URL:
-
-
/makefulltextfeed.php?url=[url]
-
-
If you have an API key, add that to the querystring:
-
-
/makefulltextfeed.php?key=[key]&url=[url]
-
/makefulltextfeed.php?key=[key]&max=[number of feed items]&url=[url]
-
-
-
If you're not hosting this yourself, you do not have to rely on an external API if you don't want to — this is a free software (open source)
- project licensed under the AGPL. You're free to download your own copy.
The application uses PHP, PHP Readability, SimplePie, FeedWriter, Humble HTTP Agent, Zend Cache and IRI. Readability is the magic piece of code that tries to identify and extract the content block from any given web page.
-
-
System Requirements
-
-
PHP 5.2 or above is required.
- The code has been tested on Windows and Linux using the Apache web server. If you're a Windows user, you can try it on your own machine using WampServer.
-
-
Download and Installation
-
The software can be downloaded free of charge through launchpad.net using a Bazaar client (see below).
- However, for those who'd like a simpler solution, you can also buy a zip package with the
- source code.
-
-
Installation with the Bazaar client
-
-
-
Log in to your host using SSH
-
Change to the directory where you want Full-Text RSS installed
-
Enter bzr export full-text-rss http://bazaar.launchpad.net/~keyvan/fivefilters/content-only/
-
Now enter chmod -R 0777 full-text-rss/cache/
-
That's it! Try accessing the full-text-rss folder through your web browser, you should see the form asking for a URL.
-
(Optional) If you'd like to customise the software, rename config-sample.php to config.php and edit the file.
-
-
-
If you'd like to create a feed without going through the form first, you can simply pass the URL in the query string to makefulltextfeed.php (see the API section above).
-
-
License
-
This web application is licensed under the AGPL version 3 — which basically means if you use the code to offer the same or similar service for your users, you are also required to share the code with your users so they can do the same themselves. (More on why this is important.)
-
The libraries used by the application are licensed as follows...
Thanks for downloading and setting this up. If you haven't done so already, check server compatibility
+ to see if your environment will support this application. Full-Text RSS runs on most shared web hosting environments.
+
Configure
+
In addition to the options above, Full-Text RSS comes with a configuration file which allows you to control how the application works. Features include:
+
+
Restrict access to a pre-defined set of URLs or block certain URLs
+
Restrict the maximum number of feed items to be processed
+
Prepend or append an HTML fragment to each feed item processed
+
Caching
+
+
To change the configuration, save a copy of config.php as custom_config.php and make any changes you like to it.To change the configuration, edit custom_config.php and make any changes you like.
+
+
If everything works fine, feel free to modify this page by saving it as custom_index.php and change it to whatever you like.
+
+
Sharing is caring
+ If you plan to offer this service to others through your hosted copy, please keep a download link so users can grab a copy of the code if they
+ want it (you can either offer the download yourself, or link to the download page on fivefilters.org to support us).
+ That's one requirement of the license.
+
We have more information in the section below, but if you need help with anything, please email fivefilters@fivefilters.org.
+
+
+
+
For everyone
+
+
About
+
This is a free software project to help people extract content from web pages. It can extract content from a standard HTML page and return a 1-item feed or it can transform an existing feed into a full-text feed. It is being developed as part of the Five Filters project to promote independent, non-corporate media.
+
+
Bookmarklet
+
To easily transform partial-feeds you encounter (or convert any content on a page into a 1-item feed), drag the link below to your browser's bookmarks toolbar.
+ Then whenever you'd like a full-text feed, click the bookmarklet.
+
Drag this:
+
+
+
API
+
To extract content from a web page or to transform an existing partial feed to full text, pass the URL (encoded) in the querystring to the following URL:
+
+
/makefulltextfeed.php?url=[url]
+
+
If you have an API key, add that to the querystring:
+
+
/makefulltextfeed.php?key=[key]&url=[url]
+
/makefulltextfeed.php?key=[key]&max=[number of feed items]&url=[url]
+
+
+
All the parameters in the form above can be passed in this way. Examine the URL in the address bar after you click 'Create Feed' to see the values.
+
+
Note: If you're not hosting this yourself, you do not have to rely on an external API if you don't want to — this is a free software (open source)
+ project licensed under the AGPL. You're free to download your own copy.
+
+
Source Code and Technologies
+
The application uses PHP, PHP Readability, SimplePie, FeedWriter, Humble HTTP Agent. Depending on configuration, these optional components may also be used: Zend Cache, Zend DOM Query and IRI. Readability is the magic piece of code that tries to identify and extract the content block from any given web page.
+
+
System Requirements
+
+
PHP 5.2 or above is required. A simple shared web hosting account will work fine.
+ The code has been tested on Windows and Linux using the Apache web server. If you're a Windows user, you can try it on your own machine using WampServer.
+
+
Download
+
Download from fivefilters.org - old versions are available in the code repository.
+
+
License
+
This web application is licensed under the AGPL version 3 — which basically means if you use the code to offer the same or similar service for your users, you are also required to share the code with your users so they can run it for themselves. (More on why this is important.)
+
The libraries used by the application are licensed as follows...