Full-Text RSS 3.3

This commit is contained in:
FiveFilters.org 2014-09-15 22:24:06 +02:00
parent ec7275f58d
commit cfe4c012ef
55 changed files with 9456 additions and 7232 deletions

View File

@ -22,7 +22,7 @@
*/
$VERSION='$Id: apc.php 307048 2011-01-03 23:53:17Z kalle $';
$VERSION='$Id$';
////////// READ OPTIONAL CONFIGURATION FILE ////////////
if (file_exists("apc.conf.php")) include("apc.conf.php");
@ -35,9 +35,10 @@ $admin_page = 'apc';
require_once('../config.php');
require_once('require_login.php');
require_once('template.php');
if (!isset($_REQUEST['IMG'])) tpl_header('APC');
if (!isset($_REQUEST['IMG'])) tpl_header('APCu');
////////// BEGIN OF DEFAULT CONFIG AREA ///////////////////////////////////////////////////////////
defaults('USE_AUTHENTICATION',0); // Use (internal) authentication - best choice if
// no other authentication is available
// If set to 0:
@ -46,8 +47,8 @@ defaults('USE_AUTHENTICATION',0); // Use (internal) authentication - best choi
// If set to 1:
// You need to change ADMIN_PASSWORD to make
// this work!
//defaults('ADMIN_USERNAME','admin'); // Admin Username
//defaults('ADMIN_PASSWORD',''); // Admin Password - CHANGE THIS TO ENABLE!!!
defaults('ADMIN_USERNAME','apc'); // Admin Username
defaults('ADMIN_PASSWORD','password'); // Admin Password - CHANGE THIS TO ENABLE!!!
// (beckerr) I'm using a clear-text password here because I have no good way to let
// users generate an md5 or crypt password easily to fill in above
@ -79,10 +80,8 @@ if (isset($_SERVER['SERVER_ADDR'])) {
// operation constants
define('OB_HOST_STATS',1);
define('OB_SYS_CACHE',2);
define('OB_USER_CACHE',3);
define('OB_SYS_CACHE_DIR',4);
define('OB_VERSION_CHECK',9);
define('OB_USER_CACHE',2);
define('OB_VERSION_CHECK',3);
// check validity of input variables
$vardom=array(
@ -99,12 +98,9 @@ $vardom=array(
'SORT1' => '/^[AHSMCDTZ]$/', // first sort key
'SORT2' => '/^[DA]$/', // second sort key
'AGGR' => '/^\d+$/', // aggregation by dir level
'SEARCH' => '~^[a-zA-Z0-1/_.-]*$~' // aggregation by dir level
'SEARCH' => '~^[a-zA-Z0-9/_.-]*$~' // search string matched against entry names (letters, digits and / _ . - only)
);
// default cache mode
$cache_mode='opcode';
// cache scope
$scope_list=array(
'A' => 'cache_list',
@ -183,28 +179,24 @@ EOB;
}
}
}
// select cache mode
if ($AUTHENTICATED && $MYREQUEST['OB'] == OB_USER_CACHE) {
$cache_mode='user';
}
// clear cache
if ($AUTHENTICATED && isset($MYREQUEST['CC']) && $MYREQUEST['CC']) {
apc_clear_cache($cache_mode);
apcu_clear_cache();
}
if ($AUTHENTICATED && !empty($MYREQUEST['DU'])) {
apc_delete($MYREQUEST['DU']);
apcu_delete($MYREQUEST['DU']);
}
if(!function_exists('apc_cache_info') || !($cache=@apc_cache_info($cache_mode))) {
if(!function_exists('apcu_cache_info')) {
echo "No cache info available. APC does not appear to be running.";
exit;
}
$cache_user = apc_cache_info('user', 1);
$mem=apc_sma_info();
if(!$cache['num_hits']) { $cache['num_hits']=1; $time++; } // Avoid division by 0 errors on a cache clear
$cache = apcu_cache_info();
$mem=apcu_sma_info();
// don't cache this page
//
@ -390,13 +382,13 @@ if (isset($MYREQUEST['IMG']))
text_arc($image,$x,$y,$size,$angle[0]*360,$angle[1]*360,$col_black,bsize($s*($angle[1]-$angle[0])));
}
break;
case 2:
$s=$cache['num_hits']+$cache['num_misses'];
$a=$cache['num_hits'];
$s=$cache['nhits']+$cache['nmisses'];
$a=$cache['nhits'];
fill_box($image, 30,$size,50,-$a*($size-21)/$s,$col_black,$col_green,sprintf("%.1f%%",$cache['num_hits']*100/$s));
fill_box($image,130,$size,50,-max(4,($s-$a)*($size-21)/$s),$col_black,$col_red,sprintf("%.1f%%",$cache['num_misses']*100/$s));
fill_box($image, 30,$size,50,$s ? (-$a*($size-21)/$s) : 0,$col_black,$col_green,sprintf("%.1f%%",$s ? $cache['nhits']*100/$s : 0));
fill_box($image,130,$size,50,$s ? -max(4,($s-$a)*($size-21)/$s) : 0,$col_black,$col_red,sprintf("%.1f%%",$s ? $cache['nmisses']*100/$s : 0));
break;
case 3:
@ -439,15 +431,16 @@ if (isset($MYREQUEST['IMG']))
}
}
break;
case 4:
$s=$cache['num_hits']+$cache['num_misses'];
$a=$cache['num_hits'];
fill_box($image, 30,$size,50,-$a*($size-21)/$s,$col_black,$col_green,sprintf("%.1f%%",$cache['num_hits']*100/$s));
fill_box($image,130,$size,50,-max(4,($s-$a)*($size-21)/$s),$col_black,$col_red,sprintf("%.1f%%",$cache['num_misses']*100/$s));
case 4:
$s=$cache['nhits']+$cache['nmisses'];
$a=$cache['nhits'];
fill_box($image, 30,$size,50,$s ? -$a*($size-21)/$s : 0,$col_black,$col_green,sprintf("%.1f%%", $s ? $cache['nhits']*100/$s : 0));
fill_box($image,130,$size,50,$s ? -max(4,($s-$a)*($size-21)/$s) : 0,$col_black,$col_red,sprintf("%.1f%%", $s ? $cache['nmisses']*100/$s : 0));
break;
}
header("Content-type: image/png");
imagepng($image);
exit;
@ -524,7 +517,7 @@ function block_sort($array1, $array2)
/*
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<html>
<head><title>APC INFO <?php echo $host ?></title>
<head><title>APCu INFO <?php echo $host ?></title>
*/
?>
<style><!--
@ -594,7 +587,7 @@ hr.apc {
ol,menu { margin:1em 0 0 0; padding:0.2em; margin-left:1em;}
ol.menu li { display:inline; margin-right:0.7em; list-style:none; font-size:85%}
.menu a {
ol.menu a {
background:rgb(153,153,204);
border:solid rgb(102,102,153) 2px;
color:white;
@ -602,7 +595,7 @@ ol.menu li { display:inline; margin-right:0.7em; list-style:none; font-size:85%}
margin-right:0em;
padding:0.1em 0.5em 0.1em 0.5em;
text-decoration:none;
margin-left: 0;
margin-left: 5px;
}
ol.menu a.child_active {
background:rgb(153,153,204);
@ -733,41 +726,35 @@ input {
<body>
<div class="head">
<h1 class="apc">
<div class="logo"><span class="logo"><a href="http://pecl.php.net/package/APC">APC</a></span></div>
<div class="nameinfo">Opcode Cache</div>
<div class="logo"><span class="logo"><a href="http://pecl.php.net/package/APCu">APCu</a></span></div>
<div class="nameinfo">User Cache</div>
</h1>
<div class="login">
<?php put_login_link(); ?>
</div>
<hr class="apc">
</div>
<?php
// Display main Menu
echo <<<EOB
<ol class=menu>
<li><a href="$MY_SELF&OB={$MYREQUEST['OB']}&SH={$MYREQUEST['SH']}">Refresh Data</a></li>
EOB;
echo
menu_entry(1,'View Host Stats'),
menu_entry(2,'System Cache Entries');
if ($AUTHENTICATED) {
echo menu_entry(4,'Per-Directory Entries');
}
echo
menu_entry(3,'User Cache Entries'),
menu_entry(9,'Version Check');
menu_entry(OB_HOST_STATS,'View Host Stats'),
menu_entry(OB_USER_CACHE,'User Cache Entries'),
menu_entry(OB_VERSION_CHECK,'Version Check');
if ($AUTHENTICATED) {
echo <<<EOB
<li><a class="aright" href="$MY_SELF&CC=1&OB={$MYREQUEST['OB']}" onClick="javascript:return confirm('Are you sure?');">Clear Cache</a></li>
EOB;
}
echo <<<EOB
</ol>
EOB;
if ($AUTHENTICATED) {
echo <<<EOB
<div class="menu" style="margin-top: 10px; text-align: right;"><a class="aright clearcache" href="$MY_SELF&CC=1&OB={$MYREQUEST['OB']}" onClick="javascript:return confirm('Are you sure?');">Clear $cache_mode Cache</a></div>
EOB;
}
// CONTENT
echo <<<EOB
@ -777,11 +764,6 @@ EOB;
// MAIN SWITCH STATEMENT
switch ($MYREQUEST['OB']) {
// -----------------------------------------------
// Host Stats
// -----------------------------------------------
@ -790,77 +772,59 @@ case OB_HOST_STATS:
$mem_avail= $mem['avail_mem'];
$mem_used = $mem_size-$mem_avail;
$seg_size = bsize($mem['seg_size']);
$req_rate = sprintf("%.2f",($cache['num_hits']+$cache['num_misses'])/($time-$cache['start_time']));
$hit_rate = sprintf("%.2f",($cache['num_hits'])/($time-$cache['start_time']));
$miss_rate = sprintf("%.2f",($cache['num_misses'])/($time-$cache['start_time']));
$insert_rate = sprintf("%.2f",($cache['num_inserts'])/($time-$cache['start_time']));
$req_rate_user = sprintf("%.2f",($cache_user['num_hits']+$cache_user['num_misses'])/($time-$cache_user['start_time']));
$hit_rate_user = sprintf("%.2f",($cache_user['num_hits'])/($time-$cache_user['start_time']));
$miss_rate_user = sprintf("%.2f",($cache_user['num_misses'])/($time-$cache_user['start_time']));
$insert_rate_user = sprintf("%.2f",($cache_user['num_inserts'])/($time-$cache_user['start_time']));
$apcversion = phpversion('apc');
// Per-second rates over the lifetime of the cache (seconds elapsed since $cache['stime']).
// The guard is on the elapsed time rather than on the counters: a cache that was
// started within the current second has zero elapsed seconds (division by zero),
// and a cache that has only recorded misses so far must still report a non-zero
// request rate.
$uptime_secs = $time - $cache['stime'];
$req_rate_user = sprintf("%.2f", $uptime_secs > 0 ? (($cache['nhits']+$cache['nmisses'])/$uptime_secs) : 0);
$hit_rate_user = sprintf("%.2f", $uptime_secs > 0 ? (($cache['nhits'])/$uptime_secs) : 0);
$miss_rate_user = sprintf("%.2f", $uptime_secs > 0 ? (($cache['nmisses'])/$uptime_secs) : 0);
$insert_rate_user = sprintf("%.2f", $uptime_secs > 0 ? (($cache['ninserts'])/$uptime_secs) : 0);
$apcversion = phpversion('apcu');
$phpversion = phpversion();
$number_files = $cache['num_entries'];
$size_files = bsize($cache['mem_size']);
$number_vars = $cache_user['num_entries'];
$size_vars = bsize($cache_user['mem_size']);
$number_vars = $cache['nentries'];
$size_vars = bsize($cache['mem_size']);
$i=0;
echo <<< EOB
<div class="info div1"><h2>General Cache Information</h2>
<table cellspacing=0><tbody>
<tr class=tr-0><td class=td-0>APC Version</td><td>$apcversion</td></tr>
<tr class=tr-0><td class=td-0>APCu Version</td><td>$apcversion</td></tr>
<tr class=tr-1><td class=td-0>PHP Version</td><td>$phpversion</td></tr>
EOB;
if(!empty($_SERVER['SERVER_NAME']))
echo "<tr class=tr-0><td class=td-0>APC Host</td><td>{$_SERVER['SERVER_NAME']} $host</td></tr>\n";
echo "<tr class=tr-0><td class=td-0>APCu Host</td><td>{$_SERVER['SERVER_NAME']} $host</td></tr>\n";
if(!empty($_SERVER['SERVER_SOFTWARE']))
echo "<tr class=tr-1><td class=td-0>Server Software</td><td>{$_SERVER['SERVER_SOFTWARE']}</td></tr>\n";
echo <<<EOB
<tr class=tr-0><td class=td-0>Shared Memory</td><td>{$mem['num_seg']} Segment(s) with $seg_size
<br/> ({$cache['memory_type']} memory, {$cache['locking_type']} locking)
<br/> ({$cache['memory_type']} memory)
</td></tr>
EOB;
echo '<tr class=tr-1><td class=td-0>Start Time</td><td>',date(DATE_FORMAT,$cache['start_time']),'</td></tr>';
echo '<tr class=tr-0><td class=td-0>Uptime</td><td>',duration($cache['start_time']),'</td></tr>';
echo '<tr class=tr-1><td class=td-0>Start Time</td><td>',date(DATE_FORMAT,$cache['stime']),'</td></tr>';
echo '<tr class=tr-0><td class=td-0>Uptime</td><td>',duration($cache['stime']),'</td></tr>';
echo '<tr class=tr-1><td class=td-0>File Upload Support</td><td>',$cache['file_upload_progress'],'</td></tr>';
echo <<<EOB
</tbody></table>
</div>
<div class="info div1"><h2>File Cache Information</h2>
<table cellspacing=0><tbody>
<tr class=tr-0><td class=td-0>Cached Files</td><td>$number_files ($size_files)</td></tr>
<tr class=tr-1><td class=td-0>Hits</td><td>{$cache['num_hits']}</td></tr>
<tr class=tr-0><td class=td-0>Misses</td><td>{$cache['num_misses']}</td></tr>
<tr class=tr-1><td class=td-0>Request Rate (hits, misses)</td><td>$req_rate cache requests/second</td></tr>
<tr class=tr-0><td class=td-0>Hit Rate</td><td>$hit_rate cache requests/second</td></tr>
<tr class=tr-1><td class=td-0>Miss Rate</td><td>$miss_rate cache requests/second</td></tr>
<tr class=tr-0><td class=td-0>Insert Rate</td><td>$insert_rate cache requests/second</td></tr>
<tr class=tr-1><td class=td-0>Cache full count</td><td>{$cache['expunges']}</td></tr>
</tbody></table>
</div>
<div class="info div1"><h2>User Cache Information</h2>
<table cellspacing=0><tbody>
<tr class=tr-0><td class=td-0>Cached Variables</td><td>$number_vars ($size_vars)</td></tr>
<tr class=tr-1><td class=td-0>Hits</td><td>{$cache_user['num_hits']}</td></tr>
<tr class=tr-0><td class=td-0>Misses</td><td>{$cache_user['num_misses']}</td></tr>
<tr class=tr-1><td class=td-0>Request Rate (hits, misses)</td><td>$req_rate_user cache requests/second</td></tr>
<tr class=tr-0><td class=td-0>Hit Rate</td><td>$hit_rate_user cache requests/second</td></tr>
<tr class=tr-1><td class=td-0>Miss Rate</td><td>$miss_rate_user cache requests/second</td></tr>
<tr class=tr-0><td class=td-0>Insert Rate</td><td>$insert_rate_user cache requests/second</td></tr>
<tr class=tr-1><td class=td-0>Cache full count</td><td>{$cache_user['expunges']}</td></tr>
</tbody></table>
<div class="info div1"><h2>Cache Information</h2>
<table cellspacing=0>
<tbody>
<tr class=tr-0><td class=td-0>Cached Variables</td><td>$number_vars ($size_vars)</td></tr>
<tr class=tr-1><td class=td-0>Hits</td><td>{$cache['nhits']}</td></tr>
<tr class=tr-0><td class=td-0>Misses</td><td>{$cache['nmisses']}</td></tr>
<tr class=tr-1><td class=td-0>Request Rate (hits, misses)</td><td>$req_rate_user cache requests/second</td></tr>
<tr class=tr-0><td class=td-0>Hit Rate</td><td>$hit_rate_user cache requests/second</td></tr>
<tr class=tr-1><td class=td-0>Miss Rate</td><td>$miss_rate_user cache requests/second</td></tr>
<tr class=tr-0><td class=td-0>Insert Rate</td><td>$insert_rate_user cache requests/second</td></tr>
<tr class=tr-1><td class=td-0>Cache full count</td><td>{$cache['nexpunges']}</td></tr>
</tbody>
</table>
</div>
<div class="info div2"><h2>Runtime Settings</h2><table cellspacing=0><tbody>
EOB;
$j = 0;
foreach (ini_get_all('apc') as $k => $v) {
foreach (ini_get_all('apcu') as $k => $v) {
echo "<tr class=tr-$j><td class=td-0>",$k,"</td><td>",str_replace(',',',<br />',$v['local_value']),"</td></tr>\n";
$j = 1 - $j;
}
@ -878,7 +842,7 @@ EOB;
<table cellspacing=0><tbody>
EOB;
$size='width='.(GRAPH_SIZE+50).' height='.(GRAPH_SIZE+10);
echo <<<EOB
echo <<<EOB
<tr>
<td class=td-0>$mem_note</td>
<td class=td-1>Hits &amp; Misses</td>
@ -893,11 +857,11 @@ EOB;
: "",
'<tr>',
'<td class=td-0><span class="green box">&nbsp;</span>Free: ',bsize($mem_avail).sprintf(" (%.1f%%)",$mem_avail*100/$mem_size),"</td>\n",
'<td class=td-1><span class="green box">&nbsp;</span>Hits: ',$cache['num_hits'].sprintf(" (%.1f%%)",$cache['num_hits']*100/($cache['num_hits']+$cache['num_misses'])),"</td>\n",
// Hit share of all lookups. The ternary yields 0 when nothing has been looked up yet,
// so no division by zero is attempted and no @ error suppression is needed.
'<td class=td-1><span class="green box">&nbsp;</span>Hits: ',$cache['nhits'].sprintf(" (%.1f%%)",($cache['nhits']+$cache['nmisses']) ? $cache['nhits']*100/($cache['nhits']+$cache['nmisses']) : 0),"</td>\n",
'</tr>',
'<tr>',
'<td class=td-0><span class="red box">&nbsp;</span>Used: ',bsize($mem_used ).sprintf(" (%.1f%%)",$mem_used *100/$mem_size),"</td>\n",
'<td class=td-1><span class="red box">&nbsp;</span>Misses: ',$cache['num_misses'].sprintf(" (%.1f%%)",$cache['num_misses']*100/($cache['num_hits']+$cache['num_misses'])),"</td>\n";
// Miss share of all lookups. The ternary yields 0 when nothing has been looked up yet,
// so no division by zero is attempted and no @ error suppression is needed.
'<td class=td-1><span class="red box">&nbsp;</span>Misses: ',$cache['nmisses'].sprintf(" (%.1f%%)",($cache['nhits']+$cache['nmisses']) ? $cache['nmisses']*100/($cache['nhits']+$cache['nmisses']) : 0),"</td>\n";
echo <<< EOB
</tr>
</tbody></table>
@ -968,67 +932,9 @@ case OB_USER_CACHE:
echo '</div>';
break;
}
$fieldname='info';
$fieldname='key';
$fieldheading='User Entry Label';
$fieldkey='info';
// -----------------------------------------------
// System Cache Entries
// -----------------------------------------------
case OB_SYS_CACHE:
if (!isset($fieldname))
{
$fieldname='filename';
$fieldheading='Script Filename';
if(ini_get("apc.stat")) $fieldkey='inode';
else $fieldkey='filename';
}
if (!empty($MYREQUEST['SH']))
{
echo <<< EOB
<div class="info"><table cellspacing=0><tbody>
<tr><th>Attribute</th><th>Value</th></tr>
EOB;
$m=0;
foreach($scope_list as $j => $list) {
foreach($cache[$list] as $i => $entry) {
if (md5($entry[$fieldkey])!=$MYREQUEST['SH']) continue;
foreach($entry as $k => $value) {
if (!$AUTHENTICATED) {
// hide all path entries if not logged in
$value=preg_replace('/^.*(\\/|\\\\)/','<i>&lt;hidden&gt;</i>/',$value);
}
if ($k == "num_hits") {
$value=sprintf("%s (%.2f%%)",$value,$value*100/$cache['num_hits']);
}
if ($k == 'deletion_time') {
if(!$entry['deletion_time']) $value = "None";
}
echo
"<tr class=tr-$m>",
"<td class=td-0>",ucwords(preg_replace("/_/"," ",$k)),"</td>",
"<td class=td-last>",(preg_match("/time/",$k) && $value!='None') ? date(DATE_FORMAT,$value) : htmlspecialchars($value, ENT_QUOTES, 'UTF-8'),"</td>",
"</tr>";
$m=1-$m;
}
if($fieldkey=='info') {
echo "<tr class=tr-$m><td class=td-0>Stored Value</td><td class=td-last><pre>";
$output = var_export(apc_fetch($entry[$fieldkey]),true);
echo htmlspecialchars($output, ENT_QUOTES, 'UTF-8');
echo "</pre></td></tr>\n";
}
break;
}
}
echo <<<EOB
</tbody></table>
</div>
EOB;
break;
}
$fieldkey='key';
$cols=6;
echo <<<EOB
@ -1040,7 +946,7 @@ EOB;
"<option value=A",$MYREQUEST['SCOPE']=='A' ? " selected":"",">Active</option>",
"<option value=D",$MYREQUEST['SCOPE']=='D' ? " selected":"",">Deleted</option>",
"</select>",
" Sorting: <select name=SORT1>",
", Sorting:<select name=SORT1>",
"<option value=H",$MYREQUEST['SORT1']=='H' ? " selected":"",">Hits</option>",
"<option value=Z",$MYREQUEST['SORT1']=='Z' ? " selected":"",">Size</option>",
"<option value=S",$MYREQUEST['SORT1']=='S' ? " selected":"",">$fieldheading</option>",
@ -1051,11 +957,11 @@ EOB;
if($fieldname=='info') echo
"<option value=D",$MYREQUEST['SORT1']=='T' ? " selected":"",">Timeout</option>";
echo
'</select> ',
'</select>',
'<select name=SORT2>',
'<option value=D',$MYREQUEST['SORT2']=='D' ? ' selected':'','>DESC</option>',
'<option value=A',$MYREQUEST['SORT2']=='A' ? ' selected':'','>ASC</option>',
'</select> ',
'</select>',
'<select name=COUNT onChange="form.submit()">',
'<option value=10 ',$MYREQUEST['COUNT']=='10' ? ' selected':'','>Top 10</option>',
'<option value=20 ',$MYREQUEST['COUNT']=='20' ? ' selected':'','>Top 20</option>',
@ -1066,8 +972,7 @@ EOB;
'<option value=500',$MYREQUEST['COUNT']=='500'? ' selected':'','>Top 500</option>',
'<option value=0 ',$MYREQUEST['COUNT']=='0' ? ' selected':'','>All</option>',
'</select>',
'<br />',
'Search: <input name=SEARCH value="',$MYREQUEST['SEARCH'],'" type=text size=25/>',
'&nbsp; Search: <input name=SEARCH value="',$MYREQUEST['SEARCH'],'" type=text size=25/>',
'&nbsp;<input type=submit value="GO!">',
'</form></div>';
@ -1100,16 +1005,17 @@ EOB;
// builds list with alpha numeric sortable keys
//
$list = array();
foreach($cache[$scope_list[$MYREQUEST['SCOPE']]] as $i => $entry) {
switch($MYREQUEST['SORT1']) {
case 'A': $k=sprintf('%015d-',$entry['access_time']); break;
case 'H': $k=sprintf('%015d-',$entry['num_hits']); break;
case 'A': $k=sprintf('%015d-',$entry['atime']); break;
case 'H': $k=sprintf('%015d-',$entry['nhits']); break;
case 'Z': $k=sprintf('%015d-',$entry['mem_size']); break;
case 'M': $k=sprintf('%015d-',$entry['mtime']); break;
case 'C': $k=sprintf('%015d-',$entry['creation_time']); break;
case 'C': $k=sprintf('%015d-',$entry['ctime']); break;
case 'T': $k=sprintf('%015d-',$entry['ttl']); break;
case 'D': $k=sprintf('%015d-',$entry['deletion_time']); break;
case 'S': $k=''; break;
case 'D': $k=sprintf('%015d-',$entry['dtime']); break;
case 'S': $k=$entry["key"]; break;
}
if (!$AUTHENTICATED) {
// hide all path entries if not logged in
@ -1120,7 +1026,6 @@ EOB;
}
if ($list) {
// sort list
//
switch ($MYREQUEST['SORT2']) {
@ -1131,16 +1036,17 @@ EOB;
// output list
$i=0;
foreach($list as $k => $entry) {
if(!$MYREQUEST['SEARCH'] || preg_match($MYREQUEST['SEARCH'], $entry[$fieldname]) != 0) {
if(!$MYREQUEST['SEARCH'] || preg_match($MYREQUEST['SEARCH'], $entry[$fieldname]) != 0) {
$sh=md5($entry["key"]);
$field_value = htmlentities(strip_tags($entry[$fieldname],''), ENT_QUOTES, 'UTF-8');
echo
'<tr class=tr-',$i%2,'>',
"<td class=td-0><a href=\"$MY_SELF&OB=",$MYREQUEST['OB'],"&SH=",md5($entry[$fieldkey]),"\">",$field_value,'</a></td>',
'<td class="td-n center">',$entry['num_hits'],'</td>',
"<td class=td-0><a href=\"$MY_SELF&OB=",$MYREQUEST['OB'],"&SH=",$sh,"\">",$field_value,'</a></td>',
'<td class="td-n center">',$entry['nhits'],'</td>',
'<td class="td-n right">',$entry['mem_size'],'</td>',
'<td class="td-n center">',date(DATE_FORMAT,$entry['access_time']),'</td>',
'<td class="td-n center">',date(DATE_FORMAT,$entry['atime']),'</td>',
'<td class="td-n center">',date(DATE_FORMAT,$entry['mtime']),'</td>',
'<td class="td-n center">',date(DATE_FORMAT,$entry['creation_time']),'</td>';
'<td class="td-n center">',date(DATE_FORMAT,$entry['ctime']),'</td>';
if($fieldname=='info') {
if($entry['ttl'])
@ -1148,9 +1054,9 @@ EOB;
else
echo '<td class="td-n center">None</td>';
}
if ($entry['deletion_time']) {
if ($entry['dtime']) {
echo '<td class="td-last center">', date(DATE_FORMAT,$entry['deletion_time']), '</td>';
echo '<td class="td-last center">', date(DATE_FORMAT,$entry['dtime']), '</td>';
} else if ($MYREQUEST['OB'] == OB_USER_CACHE) {
echo '<td class="td-last center">';
@ -1160,6 +1066,11 @@ EOB;
echo '<td class="td-last center"> &nbsp; </td>';
}
echo '</tr>';
// When this row is the entry selected through the SH request parameter (md5 of its key),
// print the stored value in an extra row directly beneath it.
if ($sh == $MYREQUEST["SH"]) {
echo '<tr>';
// Escape with explicit flags and charset (the same ones used for the entry label)
// so quotes and UTF-8 values are rendered correctly regardless of the PHP default charset.
echo '<td colspan="7"><pre>'.htmlentities(print_r(apcu_fetch($entry['key']), 1), ENT_QUOTES, 'UTF-8').'</pre></td>';
echo '</tr>';
}
$i++;
if ($i == $MYREQUEST['COUNT'])
break;
@ -1182,139 +1093,12 @@ EOB;
EOB;
break;
// -----------------------------------------------
// Per-Directory System Cache Entries
// -----------------------------------------------
case OB_SYS_CACHE_DIR:
if (!$AUTHENTICATED) {
break;
}
echo <<<EOB
<div class=sorting><form>Scope:
<input type=hidden name=OB value={$MYREQUEST['OB']}>
<select name=SCOPE>
EOB;
echo
"<option value=A",$MYREQUEST['SCOPE']=='A' ? " selected":"",">Active</option>",
"<option value=D",$MYREQUEST['SCOPE']=='D' ? " selected":"",">Deleted</option>",
"</select>",
" Sorting: <select name=SORT1>",
"<option value=H",$MYREQUEST['SORT1']=='H' ? " selected":"",">Total Hits</option>",
"<option value=Z",$MYREQUEST['SORT1']=='Z' ? " selected":"",">Total Size</option>",
"<option value=T",$MYREQUEST['SORT1']=='T' ? " selected":"",">Number of Files</option>",
"<option value=S",$MYREQUEST['SORT1']=='S' ? " selected":"",">Directory Name</option>",
"<option value=A",$MYREQUEST['SORT1']=='A' ? " selected":"",">Avg. Size</option>",
"<option value=C",$MYREQUEST['SORT1']=='C' ? " selected":"",">Avg. Hits</option>",
'</select> ',
'<select name=SORT2>',
'<option value=D',$MYREQUEST['SORT2']=='D' ? ' selected':'','>DESC</option>',
'<option value=A',$MYREQUEST['SORT2']=='A' ? ' selected':'','>ASC</option>',
'</select> ',
'<select name=COUNT onChange="form.submit()">',
'<option value=10 ',$MYREQUEST['COUNT']=='10' ? ' selected':'','>Top 10</option>',
'<option value=20 ',$MYREQUEST['COUNT']=='20' ? ' selected':'','>Top 20</option>',
'<option value=50 ',$MYREQUEST['COUNT']=='50' ? ' selected':'','>Top 50</option>',
'<option value=100',$MYREQUEST['COUNT']=='100'? ' selected':'','>Top 100</option>',
'<option value=150',$MYREQUEST['COUNT']=='150'? ' selected':'','>Top 150</option>',
'<option value=200',$MYREQUEST['COUNT']=='200'? ' selected':'','>Top 200</option>',
'<option value=500',$MYREQUEST['COUNT']=='500'? ' selected':'','>Top 500</option>',
'<option value=0 ',$MYREQUEST['COUNT']=='0' ? ' selected':'','>All</option>',
'</select> ',
"Group By Dir Level: <select name=AGGR>",
"<option value='' selected>None</option>";
for ($i = 1; $i < 10; $i++)
echo "<option value=$i",$MYREQUEST['AGGR']==$i ? " selected":"",">$i</option>";
echo '</select>',
'&nbsp;<input type=submit value="GO!">',
'</form></div>',
'<div class="info"><table cellspacing=0><tbody>',
'<tr>',
'<th>',sortheader('S','Directory Name', "&OB=".$MYREQUEST['OB']),'</th>',
'<th>',sortheader('T','Number of Files',"&OB=".$MYREQUEST['OB']),'</th>',
'<th>',sortheader('H','Total Hits', "&OB=".$MYREQUEST['OB']),'</th>',
'<th>',sortheader('Z','Total Size', "&OB=".$MYREQUEST['OB']),'</th>',
'<th>',sortheader('C','Avg. Hits', "&OB=".$MYREQUEST['OB']),'</th>',
'<th>',sortheader('A','Avg. Size', "&OB=".$MYREQUEST['OB']),'</th>',
'</tr>';
// builds list with alpha numeric sortable keys
//
$tmp = $list = array();
foreach($cache[$scope_list[$MYREQUEST['SCOPE']]] as $entry) {
$n = dirname($entry['filename']);
if ($MYREQUEST['AGGR'] > 0) {
$n = preg_replace("!^(/?(?:[^/\\\\]+[/\\\\]){".($MYREQUEST['AGGR']-1)."}[^/\\\\]*).*!", "$1", $n);
}
if (!isset($tmp[$n])) {
$tmp[$n] = array('hits'=>0,'size'=>0,'ents'=>0);
}
$tmp[$n]['hits'] += $entry['num_hits'];
$tmp[$n]['size'] += $entry['mem_size'];
++$tmp[$n]['ents'];
}
foreach ($tmp as $k => $v) {
switch($MYREQUEST['SORT1']) {
case 'A': $kn=sprintf('%015d-',$v['size'] / $v['ents']);break;
case 'T': $kn=sprintf('%015d-',$v['ents']); break;
case 'H': $kn=sprintf('%015d-',$v['hits']); break;
case 'Z': $kn=sprintf('%015d-',$v['size']); break;
case 'C': $kn=sprintf('%015d-',$v['hits'] / $v['ents']);break;
case 'S': $kn = $k; break;
}
$list[$kn.$k] = array($k, $v['ents'], $v['hits'], $v['size']);
}
if ($list) {
// sort list
//
switch ($MYREQUEST['SORT2']) {
case "A": krsort($list); break;
case "D": ksort($list); break;
}
// output list
$i = 0;
foreach($list as $entry) {
echo
'<tr class=tr-',$i%2,'>',
"<td class=td-0>",$entry[0],'</a></td>',
'<td class="td-n center">',$entry[1],'</td>',
'<td class="td-n center">',$entry[2],'</td>',
'<td class="td-n center">',$entry[3],'</td>',
'<td class="td-n center">',round($entry[2] / $entry[1]),'</td>',
'<td class="td-n center">',round($entry[3] / $entry[1]),'</td>',
'</tr>';
if (++$i == $MYREQUEST['COUNT']) break;
}
} else {
echo '<tr class=tr-0><td class="center" colspan=6><i>No data</i></td></tr>';
}
echo <<< EOB
</tbody></table>
EOB;
if ($list && $i < count($list)) {
echo "<a href=\"$MY_SELF&OB=",$MYREQUEST['OB'],"&COUNT=0\"><i>",count($list)-$i,' more available...</i></a>';
}
echo <<< EOB
</div>
EOB;
break;
// -----------------------------------------------
// Version check
// -----------------------------------------------
case OB_VERSION_CHECK:
echo <<<EOB
<div class="info"><h2>APC Version Information</h2>
<div class="info"><h2>APCu Version Information</h2>
<table cellspacing=0><tbody>
<tr>
<th></th>
@ -1322,24 +1106,24 @@ case OB_VERSION_CHECK:
EOB;
if (defined('PROXY')) {
$ctxt = stream_context_create( array( 'http' => array( 'proxy' => PROXY, 'request_fulluri' => True ) ) );
$rss = @file_get_contents("http://pecl.php.net/feeds/pkg_apc.rss", False, $ctxt);
$rss = @file_get_contents("http://pecl.php.net/feeds/pkg_apcu.rss", False, $ctxt);
} else {
$rss = @file_get_contents("http://pecl.php.net/feeds/pkg_apc.rss");
$rss = @file_get_contents("http://pecl.php.net/feeds/pkg_apcu.rss");
}
if (!$rss) {
echo '<tr class="td-last center"><td>Unable to fetch version information.</td></tr>';
} else {
// Installed version of the APCu extension; the extension registers itself as 'apcu',
// so asking for 'apc' returns false and every release would be reported as newer.
$apcversion = phpversion('apcu');
preg_match('!<title>APC ([0-9.]+)</title>!', $rss, $match);
preg_match('!<title>APCu ([0-9.]+)</title>!', $rss, $match);
echo '<tr class="tr-0 center"><td>';
if (version_compare($apcversion, $match[1], '>=')) {
echo '<div class="ok">You are running the latest version of APC ('.$apcversion.')</div>';
echo '<div class="ok">You are running the latest version of APCu ('.$apcversion.')</div>';
$i = 3;
} else {
echo '<div class="failed">You are running an older version of APC ('.$apcversion.'),
newer version '.$match[1].' is available at <a href="http://pecl.php.net/package/APC/'.$match[1].'">
http://pecl.php.net/package/APC/'.$match[1].'</a>
echo '<div class="failed">You are running an older version of APCu ('.$apcversion.'),
newer version '.$match[1].' is available at <a href="http://pecl.php.net/package/APCu/'.$match[1].'">
http://pecl.php.net/package/APCu/'.$match[1].'</a>
</div>';
$i = -1;
}
@ -1356,7 +1140,7 @@ EOB;
} else if (!$i--) {
break;
}
echo "<b><a href=\"http://pecl.php.net/package/APC/$ver\">".htmlspecialchars($v, ENT_QUOTES, 'UTF-8')."</a></b><br><blockquote>";
echo "<b><a href=\"http://pecl.php.net/package/APCu/$ver\">".htmlspecialchars($v, ENT_QUOTES, 'UTF-8')."</a></b><br><blockquote>";
echo nl2br(htmlspecialchars(current($match[2]), ENT_QUOTES, 'UTF-8'))."</blockquote>";
next($match[2]);
}

View File

@ -107,6 +107,9 @@ if ($_SERVER['REQUEST_METHOD'] == 'POST') {
if ($options->apc && function_exists('apc_delete') && function_exists('apc_cache_info')) {
$_apc_data = apc_cache_info('user');
foreach ($_apc_data['cache_list'] as $_apc_item) {
// APCu keys incompatible with original APC keys, apparently fixed in newer versions, but not in 4.0.4
// So let's look for those keys and fix here (key -> info).
if (isset($_apc_item['key'])) $_apc_item['info'] = $_apc_item['key'];
if (substr($_apc_item['info'], 0, 3) == 'sc.') {
apc_delete($_apc_item['info']);
}

View File

@ -40,7 +40,7 @@ global $admin_page;
<ul class="nav">
<li <?php if (@$admin_page == 'update') echo 'class="active"'; ?>><a href="update.php">Update patterns</a></li>
<li <?php if (@$admin_page == 'edit-pattern') echo 'class="active"'; ?>><a href="edit-pattern.php">Edit patterns</a></li>
<li <?php if (@$admin_page == 'apc') echo 'class="active"'; ?>><a href="apc.php?OB=3">APC</a></li>
<li <?php if (@$admin_page == 'apc') echo 'class="active"'; ?>><a href="apc.php?OB=2">APC</a></li>
<li><a href="index.php?logout">Logout</a></li>
</ul>
</div>

View File

@ -1,9 +1,9 @@
<?php
// Update site config files for Full-Text RSS
// Author: Keyvan Minoukadeh
// Copyright (c) 2013 Keyvan Minoukadeh
// Copyright (c) 2014 Keyvan Minoukadeh
// License: AGPLv3
// Date: 2013-05-12
// Date: 2013-05-02
// More info: http://fivefilters.org/content-only/
// Help: http://help.fivefilters.org
@ -169,7 +169,7 @@ $standard_local_dir = '../site_config/standard/';
@file_put_contents($tmp_latest_local, @file_get_contents($latest_remote));
$headers = implode("\n", $http_response_header);
//var_dump($headers); exit;
if (strpos($headers, 'HTTP/1.0 200') === false) {
if ((strpos($headers, 'HTTP/1.0 200') === false) && (strpos($headers, 'HTTP/1.1 200') === false)) {
println("Sorry, something went wrong. Please contact us if the problem persists.");
exit;
}
@ -202,6 +202,9 @@ if (class_exists('ZipArchive') && file_exists($tmp_latest_local)) {
if ($options->apc && function_exists('apc_delete') && function_exists('apc_cache_info')) {
$_apc_data = apc_cache_info('user');
foreach ($_apc_data['cache_list'] as $_apc_item) {
// APCu keys incompatible with original APC keys, apparently fixed in newer versions, but not in 4.0.4
// So let's look for those keys and fix here (key -> info).
if (isset($_apc_item['key'])) $_apc_item['info'] = $_apc_item['key'];
if (substr($_apc_item['info'], 0, 3) == 'sc.') {
apc_delete($_apc_item['info']);
}

View File

@ -2,6 +2,26 @@ FiveFilters.org: Full-Text RSS
http://fivefilters.org/content-only/
CHANGELOG
------------------------------------
3.3 (2014-05-13)
- Content extractor now looks for Schema.org articleBody elements
- New endpoint extract.php for developers looking for simpler JSON results (no RSS as input/output)
- New endpoint extract.php accepts POST requests and HTML as input (inputhtml request parameter)
- Proxy support added (proxy servers can now be added to the config file, see $options->proxy_servers, ->proxy and ->allow_proxy_override)
- New HTML5 parser: HTML5Lib has been replaced by HTML5-PHP (the old one had too many problems)
- New config option: cache time ($options->cache_time)
- New config option: enable/disable single-page retrieval ($options->singlepage)
- New config option: allow HTML parser override through querystring ($options->allow_parser_override)
- New request parameter: parser - use it to force new HTML5 parser to be used, &parser=html5php (it will be slower)
- Expanded debug request parameter: &debug=rawhtml (shows original response headers and body), &debug=parsedhtml (shows response body after parsing)
- APC stats page now expects APCu (older version of APC still supported, but stats within admin area won't be viewable)
- Auto update of site-specific extraction rules fixed
- Content security HTTP headers now used for the feed preview
- Request parameters and response examples now listed in a table on the index page (new Request Parameters tab)
- Compatibility test file updated to show if HTML5-PHP parser is supported (PHP 5.3 dependency), and to test for HHVM (not yet supported)
- Config option removed: $options->registration_key
- Preserve TTL element in RSS 2.0 feeds
- Other minor fixes/improvements
3.2 (2013-05-14)
- A short excerpt from the first few lines of the extracted content can now be included in the output (pass &summary=1 in querystring, see $options->summary in config file for more info)
- Full content can now be excluded from the output (pass &content=0 in querystring, see $options->content in config file for more info)

View File

@ -57,15 +57,20 @@ if (!$options->caching) die('Caching is disabled');
if ($options->apc && function_exists('apc_delete')) {
$_apc_data = apc_cache_info('user');
foreach ($_apc_data['cache_list'] as $_apc_item) {
if ($_apc_item['ttl'] > 0 && ($_apc_item['ttl'] + $_apc_item['creation_time'] < time())) {
apc_delete($_apc_item['info']);
}
//var_dump($_apc_item); exit;
// APCu keys incompatible with original APC keys, apparently fixed in newer versions, but not in 4.0.4
// So let's look for those keys and fix here (ctime -> creation_time, key -> info).
if (isset($_apc_item['ctime'])) $_apc_item['creation_time'] = $_apc_item['ctime'];
if (isset($_apc_item['key'])) $_apc_item['info'] = $_apc_item['key'];
if ($_apc_item['ttl'] > 0 && ($_apc_item['ttl'] + $_apc_item['creation_time'] < time())) {
apc_delete($_apc_item['info']);
}
}
}
// clean rss (non-key) cache
$frontendOptions = array(
'lifetime' => 20*60,
'lifetime' => $options->cache_time*60,
'automatic_serialization' => false,
'write_control' => false,
'automatic_cleaning_factor' => 0,
@ -86,11 +91,11 @@ $cache->clean(Zend_Cache::CLEANING_MODE_OLD);
// clean rss (key) cache
$frontendOptions = array(
'lifetime' => 20*60,
'automatic_serialization' => false,
'write_control' => false,
'automatic_cleaning_factor' => 0,
'ignore_user_abort' => false
'lifetime' => $options->cache_time*60,
'automatic_serialization' => false,
'write_control' => false,
'automatic_cleaning_factor' => 0,
'ignore_user_abort' => false
);
$backendOptions = array(
'cache_dir' => $options->cache_dir.'/rss-with-key/',
@ -103,6 +108,4 @@ $backendOptions = array(
'file_name_prefix' => 'ff'
);
$cache = Zend_Cache::factory('Core', 'File', $frontendOptions, $backendOptions);
$cache->clean(Zend_Cache::CLEANING_MODE_OLD);
?>
$cache->clean(Zend_Cache::CLEANING_MODE_OLD);

View File

@ -116,6 +116,13 @@ $options->rewrite_relative_urls = true;
// User decides: 'user' (this option will appear on the form)
$options->exclude_items_on_fail = 'user';
// Enable single-page support
// -------------------------
// If enabled, we will try to follow single page links (e.g. print view) on multi-page articles.
// Currently this only happens for sites where single_page_link has been defined
// in a site config file.
$options->singlepage = true;
// Enable multi-page support
// -------------------------
// If enabled, we will try to follow next page links on multi-page articles.
@ -123,16 +130,27 @@ $options->exclude_items_on_fail = 'user';
// in a site config file.
$options->multipage = true;
// Enable caching
// Enable disk caching
// ----------------------
// Enable this if you'd like to cache results
// for 10 minutes. Cache files are written to disk (in cache/ subfolders
// - which must be writable).
// Enable this if you'd like to cache results on disk.
// Possible options:
// * Disable caching: false (default)
// * Enable caching: true
// Cache files are written to disk (in cache/ subfolders - which must be writable).
// Initially it's best to keep this disabled to make sure everything works
// as expected. If you have APC enabled, please also see smart_cache in the
// advanced section.
$options->caching = false;
// Cache time (minutes)
// ----------------------
// How long should a response be cached?
// Note: this affects both disk caching and the cache headers
// sent in the Full-Text RSS HTTP response.
// So even if you disable disk caching, this value will still
// affect the cache headers in the HTTP response.
$options->cache_time = 10;
// Cache directory
// ----------------------
// Only used if caching is true
@ -172,22 +190,9 @@ $options->keep_enclosures = true;
// * Use article/feed metadata (e.g. HTML lang attribute): 1 (default)
// * As above, but guess if not present: 2
// * Always guess: 3
// * User decides: 'user' (value of 0-3 can be passed in querystring: e.g. &l=2)
// * User decides: 'user' (value of 0-3 can be passed in querystring: e.g. &lang=2)
$options->detect_language = 1;
// Registration key
// ---------------
// The registration key is optional. It is not required to use Full-Text RSS,
// and does not affect the normal operation of Full-Text RSS. It is currently
// only used on admin pages which help you update site patterns with the
// latest version offered by FiveFilters.org. For these admin-related
// tasks to complete, we will require a valid registration key.
// If you would like one, you can purchase the latest version of Full-Text RSS
// at http://fivefilters.org/content-only/
// Your registration key will automatically be sent in the confirmation email.
// Once you have it, simply copy and paste it here.
$options->registration_key = '';
/////////////////////////////////////////////////
/// RESTRICT ACCESS /////////////////////////////
/////////////////////////////////////////////////
@ -298,23 +303,33 @@ $options->max_entries_with_key = 10;
//
// Valid values:
// true - enabled, all content will be filtered
// 'user' (default) - user must pass &xss in makefulltextfeed.php querystring to enable
// 'user' (default) - user must pass &xss=1 in makefulltextfeed.php querystring to enable
// false - disabled
$options->xss_filter = 'user';
// Allowed parsers
// Allowed HTML parsers
// ----------------------
// Full-Text RSS attempts to use PHP's libxml extension to process HTML.
// While fast, on some sites it may not always produce good results.
// For these sites, you can specify an alternative HTML parser:
// parser: html5lib
// The html5lib parser is bundled with Full-Text RSS.
// see http://code.google.com/p/html5lib/
// parser: html5php
// The html5php parser is bundled with Full-Text RSS.
// see https://github.com/Masterminds/html5-php
//
// To disable HTML parsing with html5lib, you can remove it from this list.
// By default we allow both: libxml and html5lib.
$options->allowed_parsers = array('libxml', 'html5lib');
//$options->allowed_parsers = array('libxml'); //disable html5lib - forcing libxml in all cases
// To disable HTML parsing with html5php, remove it from this list.
// By default we allow both libxml and html5php.
// Note: html5php requires PHP 5.3 or higher. If you're running PHP 5.2,
// we'll always use libxml.
$options->allowed_parsers = array('libxml', 'html5php');
//$options->allowed_parsers = array('libxml'); //disable html5php - forcing libxml in all cases
// Parser override in querystring
// ---------------------
// If enabled, user can pass &parser=html5php to override default parser.
// Possible values:
// * false: Don't allow override in querystring
// * true: Allow (default)
$options->allow_parser_override = true;
// Enable Cross-Origin Resource Sharing (CORS)
// ----------------------
@ -323,6 +338,45 @@ $options->allowed_parsers = array('libxml', 'html5lib');
// see http://en.wikipedia.org/wiki/Cross-origin_resource_sharing
$options->cors = false;
// Proxy server(s)
// ----------------------
// You can specify proxy servers here and ask Full-Text RSS to
// route HTTP requests through these servers.
// If no proxy server is listed, all requests will be made directly.
// A proxy server should be given a unique name (key in the array)
// and as its value another array with key 'host' and, if required, 'auth'.
//
// Note: if you're listing proxies so Full-Text RSS randomly chooses one
// for each request, you can also specify 'direct' as a value to make
// sure direct requests are randomly made as well.
$options->proxy_servers = array();
// For example:
//$options->proxy_servers = array('example1'=>array('host'=>'127.0.0.1:8888'), 'example2'=>array('host'=>'127.0.0.1:8888', 'auth'=>'user:pass'), 'direct'=>array());
// If Polipo is installed and you want to use it as a caching proxy, uncomment the following line.
//$options->proxy_servers = array('polipo'=>array('host'=>'127.0.0.1:8123'));
// Proxy mode
// ----------------------
// How the proxy servers above should be used:
// Possible options:
// * Disable: false (no proxy will be used)
// * Named: specify which server should be used (e.g. 'example1')
// * Random: true (default) a random one from the set above will be used each time Full-Text RSS is called.
// Note: if no proxy servers are entered in $options->proxy_servers, no proxies will be used.
$options->proxy = true;
// Proxy override in querystring
// ----------------------
// If enabled, user can disable or change the proxy server used.
// Possible values:
// * false: Don't allow override in querystring
// * true: Allow user to disable or choose a proxy through a request parameter, like so...
// &proxy=0 to disable
// &proxy=1 for default behaviour (see $options->proxy) (default)
// &proxy=example1 to specify one of the proxies listed in $options->proxy_servers
// Note: Only proxy servers listed in the config file can be used.
$options->allow_proxy_override = true;
// Use APC user cache?
// ----------------------
// If enabled we will store site config files (when requested
@ -427,7 +481,7 @@ $options->cache_cleanup = 100;
/// DO NOT CHANGE ANYTHING BELOW THIS ///////////
/////////////////////////////////////////////////
if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '3.2');
if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '3.3');
if (basename(__FILE__) == 'config.php') {
if (file_exists(dirname(__FILE__).'/custom_config.php')) {

View File

@ -6,9 +6,7 @@
<html>
<head>
<title><xsl:value-of select="$title"/> (full-text feed)</title>
<style type="text/css">
@import url(css/feed.css);
</style>
<link rel="stylesheet" type="text/css" href="css/feed.css" />
</head>
<body>
<div id="explanation">

64
extract.php Normal file
View File

@ -0,0 +1,64 @@
<?php
// Full-Text RSS: Simple extraction - results in JSON
// Author: Keyvan Minoukadeh
// Copyright (c) 2014 Keyvan Minoukadeh
// License: AGPLv3
// Version: 3.3
// Date: 2014-05-07
// More info: http://fivefilters.org/content-only/
// Help: http://help.fivefilters.org
/*
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
// Usage
// -----
// Request this file passing it a web page URL in the querystring: extract.php?url=example.org
// You can use GET and POST requests.
// You'll get a simple JSON response:
/*
HTTP/1.0 200 OK
{
"title": "Blowing Smoke with Boxing's Big Voice",
"content": "<div><p>Content here</p><p>More content</p></div>",
"author": "Rafi Kohan",
"excerpt": "Short extract from the beginning of the article.",
"language": "en",
"url": "http://example.org/article.html",
"effective_url": "http://example.org/article.html",
"date": "2014-05-10"
}
*/
define('_FF_FTR_MODE', 'simple');

// Fixed request parameters, forced regardless of what the client sent.
// Treat the URL as a web page rather than a feed
$_POST['html'] = '1';
// Produce JSON only
$_POST['format'] = 'json';
// Include an excerpt
$_POST['summary'] = '1';
// Return no result when extraction fails
$_POST['exc'] = '1';

// XSS filtering.
// When POST carries no enabling value but GET has a value other than '0',
// the GET value is normalised to '1'. In every other case $_POST['xss']
// is set to '1'.
// NOTE(review): an explicit xss=0 (GET or POST) still ends with
// $_POST['xss'] = '1'. Whether filtering can actually be disabled depends on
// how makefulltextfeed.php reads the request - confirm this matches the
// intended "enabled unless explicitly disabled" behaviour.
if (!(isset($_POST['xss']) && $_POST['xss'] !== '0') && isset($_GET['xss']) && $_GET['xss'] !== '0') {
    $_GET['xss'] = '1';
} else {
    $_POST['xss'] = '1';
}

require 'makefulltextfeed.php';

View File

@ -16,9 +16,10 @@ SimplePie.org. We have kept most of their checks intact as we use SimplePie in o
http://github.com/simplepie/simplepie/tree/master/compatibility_test/
*/
$app_name = 'Full-Text RSS 3.2';
$app_name = 'Full-Text RSS 3.3';
$php_ok = (function_exists('version_compare') && version_compare(phpversion(), '5.2.0', '>='));
// Full-Text RSS is not yet compatible with HHVM, that's why we check for it with HHVM_VERSION.
$php_ok = (function_exists('version_compare') && version_compare(phpversion(), '5.2.0', '>=') && !defined('HHVM_VERSION'));
$pcre_ok = extension_loaded('pcre');
$zlib_ok = extension_loaded('zlib');
$mbstring_ok = extension_loaded('mbstring');
@ -295,7 +296,7 @@ div.chunk {
<?php if ($tidy_ok): ?>
<li><strong>Tidy:</strong> You have <code>Tidy</code> support installed. No problems here.</li>
<?php else: ?>
<li class="highlight"><strong>Tidy:</strong> The <code>Tidy</code> extension is not available. <?php echo $app_name; ?> should still work with most feeds/articles, but you may experience problems with some.</li>
<li class="highlight"><strong>Tidy:</strong> The <code>Tidy</code> extension is not available. <?php echo $app_name; ?> should still work with most feeds/articles, but you may experience problems with some. For problem feeds we recommend you use the HTML5 parser.</li>
<?php endif; ?>
<?php if ($curl_ok): ?>
@ -362,7 +363,7 @@ div.chunk {
?>
<p class="highlight"><strong><?php echo $http_type; ?></strong> will be used on this server.</p>
<h4>Alternative PHP Cache (APC)</h4>
<h4>Alternative PHP Cache (APC/APCu)</h4>
<p>Full-Text RSS can make use of APC's memory cache to store site config files (when requested for the first time). This is not required, but if available it may improve performance slightly by reducing disk access.</p>
<?php
if (function_exists('apc_add')) {
@ -372,6 +373,16 @@ div.chunk {
}
?>
<h4>HTML parser</h4>
<p>Full-Text RSS uses the fast libxml parser (the default PHP parser) but it can also make use of HTML5-PHP (an HTML5 parser written in PHP) if your version of PHP supports it. The latter might produce better results for some sites, especially if Tidy is not available on your server, however, it is slower than libxml.</p>
<?php
if (version_compare(PHP_VERSION, '5.3.0') >= 0) {
echo '<p class="highlight"><strong>HTML5-PHP</strong> can be used on this server.</p>';
} else {
echo '<p class="highlight">You need at least PHP 5.3 to be able to use HTML5-PHP.</p>';
}
?>
<h4>Language detection</h4>
<p>Full-Text RSS can detect the language of each article processed. This occurs using <a href="http://pear.php.net/package/Text_LanguageDetect">Text_LanguageDetect</a> or <a href="https://github.com/lstrojny/php-cld">PHP-CLD</a> (if available).</p>
<?php

334
index.php

File diff suppressed because one or more lines are too long

View File

@ -5,10 +5,10 @@
* Uses patterns specified in site config files and auto detection (hNews/PHP Readability)
* to extract content from HTML files.
*
* @version 1.0
* @date 2013-02-05
* @version 1.1
* @date 2014-03-28
* @author Keyvan Minoukadeh
* @copyright 2013 Keyvan Minoukadeh
* @copyright 2014 Keyvan Minoukadeh
* @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
*/
@ -40,7 +40,9 @@ class ContentExtractor
protected $body;
protected $success = false;
protected $nextPageUrl;
public $allowedParsers = array('libxml', 'html5lib');
public $allowedParsers = array('libxml', 'html5php');
public $defaultParser = 'libxml';
public $parserOverride = null;
public $fingerprints = array();
public $readability;
public $debug = false;
@ -184,10 +186,18 @@ class ContentExtractor
}
// load and parse html
$_parser = $this->config->parser();
if ($this->parserOverride) {
// from querystring: &parser=xxx
$_parser = $this->parserOverride;
} else {
// from site config file: parser: xxx
$_parser = $this->config->parser();
}
// for backward compatibility...
if ($_parser == 'html5lib') $_parser = 'html5php';
if (!in_array($_parser, $this->allowedParsers)) {
$this->debug("HTML parser $_parser not listed, using libxml instead");
$_parser = 'libxml';
$this->debug("HTML parser $_parser not listed, using ".$this->defaultParser." instead");
$_parser = $this->defaultParser;
}
$this->debug("Attempting to parse HTML with $_parser");
$this->readability = new Readability($html, $url, $_parser);
@ -310,7 +320,9 @@ class ContentExtractor
if ($elems && $elems->length > 0) {
$this->debug('Stripping '.$elems->length.' elements (strip)');
for ($i=$elems->length-1; $i >= 0; $i--) {
$elems->item($i)->parentNode->removeChild($elems->item($i));
if ($elems->item($i)->parentNode) {
$elems->item($i)->parentNode->removeChild($elems->item($i));
}
}
}
}
@ -456,7 +468,7 @@ class ContentExtractor
if ($detect_date) {
// check for time element with pubdate attribute
$elems = @$xpath->query(".//time[@pubdate] | .//abbr[contains(concat(' ',normalize-space(@class),' '),' published ')]", $hentry);
$elems = @$xpath->query(".//time[@pubdate or @pubDate] | .//abbr[contains(concat(' ',normalize-space(@class),' '),' published ')]", $hentry);
if ($elems && $elems->length > 0) {
$this->date = strtotime(trim($elems->item(0)->textContent));
// remove date from document
@ -572,6 +584,55 @@ class ContentExtractor
$detect_body = false;
}
}
// check for elements marked with itemprop="articleBody" (from Schema.org)
if ($detect_body) {
$elems = @$xpath->query("//*[@itemprop='articleBody']", $this->readability->dom);
if ($elems && $elems->length > 0) {
$this->debug('body found (Schema.org itemprop="articleBody")');
if ($elems->length == 1) {
// what if it's empty? (content placed outside an empty itemprop='articleBody' element)
$e = $elems->item(0);
if (($e->tagName == 'img') || (trim($e->textContent) != '')) {
$this->body = $elems->item(0);
// prune (clean up elements that may not be content)
if ($this->config->prune()) {
$this->debug('Pruning content');
$this->readability->prepArticle($this->body);
}
$detect_body = false;
} else {
$this->debug('Schema.org: skipping itemprop="articleBody" - appears not to contain content');
}
unset($e);
} else {
$this->body = $this->readability->dom->createElement('div');
$this->debug($elems->length.' itemprop="articleBody" elems found');
foreach ($elems as $elem) {
if (!isset($elem->parentNode)) continue;
$isDescendant = false;
foreach ($this->body->childNodes as $parent) {
if ($this->isDescendant($parent, $elem)) {
$isDescendant = true;
break;
}
}
if ($isDescendant) {
$this->debug('Element is child of another body element, skipping.');
} else {
// prune (clean up elements that may not be content)
if ($this->config->prune()) {
$this->debug('Pruning content');
$this->readability->prepArticle($elem);
}
$this->debug('Element added to body');
$this->body->appendChild($elem);
}
}
$detect_body = false;
}
}
}
// Find author in rel="author" marked element
// We only use this if there's exactly one.
@ -594,7 +655,7 @@ class ContentExtractor
// For the same reason given above, we only use this
// if there's exactly one element.
if ($detect_date) {
$elems = @$xpath->query("//time[@pubdate]", $this->readability->dom);
$elems = @$xpath->query("//time[@pubdate or @pubDate]", $this->readability->dom);
if ($elems && $elems->length == 1) {
$this->date = strtotime(trim($elems->item(0)->textContent));
// remove date from document

View File

@ -159,6 +159,7 @@ class SiteConfig
$key = strtolower($key);
if (substr($key, 0, 4) == 'www.') $key = substr($key, 4);
if ($config->cache_key) $key = $config->cache_key;
$key .= '.'.self::get_key_suffix();
self::$config_cache[$key] = $config;
if (self::$apc && $use_apc) {
self::debug("Adding site config to APC cache with key sc.$key");
@ -169,6 +170,7 @@ class SiteConfig
public static function is_cached($key) {
$key = strtolower($key);
$key .= '.'.self::get_key_suffix();
if (substr($key, 0, 4) == 'www.') $key = substr($key, 4);
if (array_key_exists($key, self::$config_cache)) {
return true;
@ -198,6 +200,16 @@ class SiteConfig
}
}
// This is used to make sure that when a different primary folder is chosen,
// the key for the cached result includes that folder choice.
// Otherwise, a subsequent request choosing a different folder
// could return the wrong cached config.
public static function get_key_suffix() {
// The suffix is the last path component of the configured primary folder;
// the default folder name 'custom' maps to an empty suffix.
$folder = basename(self::$config_path);
return ($folder === 'custom') ? '' : $folder;
}
// returns SiteConfig instance if an appropriate one is found, false otherwise
// if $exact_host_match is true, we will not look for wildcard config matches
// by default if host is 'test.example.org' we will look for and load '.example.org.txt' if it exists
@ -216,13 +228,20 @@ class SiteConfig
}
}
// Which primary folder should we look inside?
// If it's not the default ('custom'), we need
// a key suffix to distinguish site config files
// held in this folder from those in other folders.
$key_suffix = self::get_key_suffix();
// look for site config file in primary folder
self::debug(". looking for site config for $host in primary folder");
foreach ($try as $h) {
if (array_key_exists($h, self::$config_cache)) {
$h_key = "$h.$key_suffix";
if (array_key_exists($h_key, self::$config_cache)) {
self::debug("... site config for $h already loaded in this request");
return self::$config_cache[$h];
} elseif (self::$apc && ($sconfig = apc_fetch("sc.$h"))) {
return self::$config_cache[$h_key];
} elseif (self::$apc && ($sconfig = apc_fetch("sc.$h_key"))) {
self::debug("... site config for $h in APC cache");
return $sconfig;
} elseif (file_exists(self::$config_path."/$h.txt")) {

View File

@ -6,7 +6,7 @@ define('JSONP', 3, true);
/**
* Universal Feed Writer class
*
* Genarate RSS2 or JSON (original: RSS 1.0, RSS2.0 and ATOM Feed)
* Generate RSS2 or JSON (original: RSS 1.0, RSS2.0 and ATOM Feed)
*
* Modified for FiveFilters.org's Full-Text RSS project
* to allow for inclusion of hubs, JSON output.
@ -26,6 +26,7 @@ define('JSONP', 3, true);
private $CDATAEncoding = array(); // The tag names which have to encoded as CDATA
private $xsl = null; // stylesheet to render RSS (used by Chrome)
private $json = null; // JSON object
private $simplejson = false;
private $version = null;
@ -52,6 +53,10 @@ define('JSONP', 3, true);
// Start # public functions ---------------------------------------------
/**
* Enable or disable simplified JSON output
*
* When enabled and the feed version is JSON or JSONP, the feed generation
* method emits a single flat object (title, excerpt, date, author, language,
* url, effective_url, content) built from the first item, instead of the
* full RSS-style JSON structure.
*
* @access public
* @param bool $enable true to enable (default), false to disable
* @return void
*/
public function enableSimpleJson($enable=true) {
$this->simplejson = $enable;
}
/**
* Set a channel element
* @access public
@ -82,12 +87,12 @@ define('JSONP', 3, true);
}
/**
* Genarate the actual RSS/JSON file
* Generate the actual RSS/JSON file
*
* @access public
* @return void
*/
public function genarateFeed()
public function generateFeed()
{
if ($this->version == RSS2) {
header('Content-type: text/xml; charset=UTF-8');
@ -106,7 +111,46 @@ define('JSONP', 3, true);
$this->printItems();
$this->printTale();
if ($this->version == JSON || $this->version == JSONP) {
echo json_encode($this->json);
if (!$this->simplejson) {
echo json_encode($this->json);
} else {
$simplejson = new stdClass();
if (is_array($this->json->rss['channel']->item)) {
// get first item
$jsonitem = $this->json->rss['channel']->item[0];
} else {
$jsonitem = $this->json->rss['channel']->item;
}
// defaults
$simplejson->title = null;
$simplejson->excerpt = null;
$simplejson->date = null;
$simplejson->author = null;
$simplejson->language = null;
$simplejson->url = null;
$simplejson->effective_url = null;
$simplejson->content = null;
// actual values
$simplejson->url = $jsonitem->link;
$simplejson->effective_url = $jsonitem->dc_identifier;
if (isset($jsonitem->title)) $simplejson->title = $jsonitem->title;
if (isset($jsonitem->dc_language)) $simplejson->language = $jsonitem->dc_language;
if (isset($jsonitem->content_encoded)) {
$simplejson->content = $jsonitem->content_encoded;
if (isset($jsonitem->description)) {
$simplejson->excerpt = $jsonitem->description;
}
} else {
$simplejson->content = $jsonitem->description;
}
if (isset($jsonitem->dc_creator)) {
$simplejson->author = $jsonitem->dc_creator;
}
if (isset($jsonitem->pubDate)) {
$simplejson->date = gmdate(DATE_ATOM, strtotime($jsonitem->pubDate));
}
echo json_encode($simplejson);
}
}
}
@ -175,7 +219,19 @@ define('JSONP', 3, true);
public function setXsl($xsl)
{
$this->xsl = $xsl;
}
}
/**
* Set the 'ttl' (time to live) channel element
*
* The value is cast to an integer before being passed to setChannelElement().
*
* @access public
* @param int $ttl time to live (minutes)
* @return void
*/
public function setTtl($ttl)
{
$this->setChannelElement('ttl', (int)$ttl);
}
/**
* Set self URL
@ -196,10 +252,9 @@ define('JSONP', 3, true);
* @param string value of 'description' channel tag
* @return void
*/
public function setDescription($desciption)
{
$tag = ($this->version == ATOM)? 'subtitle' : 'description';
$this->setChannelElement($tag, $desciption);
public function setDescription($description)
{
$this->setChannelElement('description', $description);
}
/**
@ -404,9 +459,9 @@ define('JSONP', 3, true);
echo $this->endItem();
if ($this->version == JSON || $this->version == JSONP) {
if (count($this->items) > 1) {
$this->json->rss['channel']->item[] = $json_item;
$this->json->rss['channel']->item[] = (object)$json_item;
} else {
$this->json->rss['channel']->item = $json_item;
$this->json->rss['channel']->item = (object)$json_item;
}
}
}

View File

@ -1,8 +1,8 @@
<?php
/*
htmLawed 1.1.14, 8 August 2012
OOP code, 8 August 2012
htmLawed 1.1.17, 11 March 2014
OOP code, 11 March 2014
Copyright Santosh Patnaik
Dual LGPL v3 and GPL v2+ license
A PHP Labware internal utility; www.bioinformatics.org/phplabware/internal_utilities/htmLawed
@ -339,7 +339,7 @@ $c = isset($C['schemes'][$c]) ? $C['schemes'][$c] : $C['schemes']['*'];
static $d = 'denied:';
if(isset($c['!']) && substr($p, 0, 7) != $d){$p = "$d$p";}
if(isset($c['*']) or !strcspn($p, '#?;') or (substr($p, 0, 7) == $d)){return "{$b}{$p}{$a}";} // All ok, frag, query, param
if(preg_match('`^([a-z\d\-+.&#; ]+?)(:|&#(58|x3a);|%3a|\\\\0{0,4}3a).`i', $p, $m) && !isset($c[strtolower($m[1])])){ // Denied prot
if(preg_match('`^([^:?[@!$()*,=/\'\]]+?)(:|&#(58|x3a);|%3a|\\\\0{0,4}3a).`i', $p, $m) && !isset($c[strtolower($m[1])])){ // Denied prot
return "{$b}{$d}{$p}{$a}";
}
if($C['abs_url']){
@ -382,7 +382,7 @@ return $r;
public static function hl_spec($t){
// final $spec
$s = array();
$t = str_replace(array("\t", "\r", "\n", ' '), '', preg_replace('/"(?>(`.|[^"])*)"/sme', 'substr(str_replace(array(";", "|", "~", " ", ",", "/", "(", ")", \'`"\'), array("\x01", "\x02", "\x03", "\x04", "\x05", "\x06", "\x07", "\x08", "\""), "$0"), 1, -1)', trim($t)));
$t = str_replace(array("\t", "\r", "\n", ' '), '', preg_replace_callback('/"(?>(`.|[^"])*)"/sm', create_function('$m', 'return substr(str_replace(array(";", "|", "~", " ", ",", "/", "(", ")", \'`"\'), array("\x01", "\x02", "\x03", "\x04", "\x05", "\x06", "\x07", "\x08", "\""), $m[0]), 1, -1);'), trim($t)));
for($i = count(($t = explode(';', $t))); --$i>=0;){
$w = $t[$i];
if(empty($w) or ($e = strpos($w, '=')) === false or !strlen(($a = substr($w, $e+1)))){continue;}
@ -647,7 +647,7 @@ return '';
public static function hl_tidy($t, $w, $p){
// Tidy/compact HTM
if(strpos(' pre,script,textarea', "$p,")){return $t;}
$t = str_replace(' </', '</', preg_replace(array('`(<\w[^>]*(?<!/)>)\s+`', '`\s+`', '`(<\w[^>]*(?<!/)>) `'), array(' $1', ' ', '$1'), preg_replace_callback(array('`(<(!\[CDATA\[))(.+?)(\]\]>)`sm', '`(<(!--))(.+?)(-->)`sm', '`(<(pre|script|textarea)[^>]*?>)(.+?)(</\2>)`sm'), create_function('$m', 'return $m[1]. str_replace(array("<", ">", "\n", "\r", "\t", " "), array("\x01", "\x02", "\x03", "\x04", "\x05", "\x07"), $m[3]). $m[4];'), $t)));
$t = preg_replace('`\s+`', ' ', preg_replace_callback(array('`(<(!\[CDATA\[))(.+?)(\]\]>)`sm', '`(<(!--))(.+?)(-->)`sm', '`(<(pre|script|textarea)[^>]*?>)(.+?)(</\2>)`sm'), create_function('$m', 'return $m[1]. str_replace(array("<", ">", "\n", "\r", "\t", " "), array("\x01", "\x02", "\x03", "\x04", "\x05", "\x07"), $m[3]). $m[4];'), $t));
if(($w = strtolower($w)) == -1){
return str_replace(array("\x01", "\x02", "\x03", "\x04", "\x05", "\x07"), array('<', '>', "\n", "\r", "\t", ' '), $t);
}
@ -655,7 +655,7 @@ $s = strpos(" $w", 't') ? "\t" : ' ';
$s = preg_match('`\d`', $w, $m) ? str_repeat($s, $m[0]) : str_repeat($s, ($s == "\t" ? 1 : 2));
$N = preg_match('`[ts]([1-9])`', $w, $m) ? $m[1] : 0;
$a = array('br'=>1);
$b = array('button'=>1, 'input'=>1, 'option'=>1);
$b = array('button'=>1, 'input'=>1, 'option'=>1, 'param'=>1);
$c = array('caption'=>1, 'dd'=>1, 'dt'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'isindex'=>1, 'label'=>1, 'legend'=>1, 'li'=>1, 'object'=>1, 'p'=>1, 'pre'=>1, 'td'=>1, 'textarea'=>1, 'th'=>1);
$d = array('address'=>1, 'blockquote'=>1, 'center'=>1, 'colgroup'=>1, 'dir'=>1, 'div'=>1, 'dl'=>1, 'fieldset'=>1, 'form'=>1, 'hr'=>1, 'iframe'=>1, 'map'=>1, 'menu'=>1, 'noscript'=>1, 'ol'=>1, 'optgroup'=>1, 'rbc'=>1, 'rtc'=>1, 'ruby'=>1, 'script'=>1, 'select'=>1, 'table'=>1, 'tbody'=>1, 'tfoot'=>1, 'thead'=>1, 'tr'=>1, 'ul'=>1);
$T = explode('<', $t);
@ -677,20 +677,20 @@ while($X){
else{++$N; ob_end_clean(); continue 2;}
}
else{echo "\n", str_repeat($s, $n), "$e\n", str_repeat($s, ($x != 1 ? ++$n : $n));}
echo ltrim($r); continue;
echo $r; continue;
}
$f = "\n". str_repeat($s, $n);
if(isset($c[$y])){
if(!$x){echo $e, $f, ltrim($r);}
if(!$x){echo $e, $f, $r;}
else{echo $f, $e, $r;}
}elseif(isset($b[$y])){echo $f, $e, $r;
}elseif(isset($a[$y])){echo $e, $f, ltrim($r);
}elseif(!$y){echo $f, $e, $f, ltrim($r);
}elseif(isset($a[$y])){echo $e, $f, $r;
}elseif(!$y){echo $f, $e, $f, $r;
}else{echo $e, $r;}
}
$X = 0;
}
$t = preg_replace('`[\n]\s*?[\n]+`', "\n", ob_get_contents());
$t = str_replace(array("\n ", " \n"), "\n", preg_replace('`[\n]\s*?[\n]+`', "\n", ob_get_contents()));
ob_end_clean();
if(($l = strpos(" $w", 'r') ? (strpos(" $w", 'n') ? "\r\n" : "\r") : 0)){
$t = str_replace("\n", $l, $t);
@ -701,7 +701,7 @@ return str_replace(array("\x01", "\x02", "\x03", "\x04", "\x05", "\x07"), array(
public static function hl_version(){
// rel
return '1.1.14';
return '1.1.17';
// eof
}

View File

@ -1,114 +0,0 @@
<?php
// warning: this file is encoded in UTF-8!
class HTML5_Data
{
// at some point this should be moved to a .ser file. Another
// possible optimization is to give UTF-8 bytes, not Unicode
// codepoints
// XXX: Not quite sure why it's named this; this is
// actually the numeric entity dereference table.
protected static $realCodepointTable = array(
0x00 => 0xFFFD, // REPLACEMENT CHARACTER
0x0D => 0x000A, // LINE FEED (LF)
0x80 => 0x20AC, // EURO SIGN ('€')
0x81 => 0x0081, // <control>
0x82 => 0x201A, // SINGLE LOW-9 QUOTATION MARK ('')
0x83 => 0x0192, // LATIN SMALL LETTER F WITH HOOK ('ƒ')
0x84 => 0x201E, // DOUBLE LOW-9 QUOTATION MARK ('„')
0x85 => 0x2026, // HORIZONTAL ELLIPSIS ('…')
0x86 => 0x2020, // DAGGER ('†')
0x87 => 0x2021, // DOUBLE DAGGER ('‡')
0x88 => 0x02C6, // MODIFIER LETTER CIRCUMFLEX ACCENT ('ˆ')
0x89 => 0x2030, // PER MILLE SIGN ('‰')
0x8A => 0x0160, // LATIN CAPITAL LETTER S WITH CARON ('Š')
0x8B => 0x2039, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK ('')
0x8C => 0x0152, // LATIN CAPITAL LIGATURE OE ('Œ')
0x8D => 0x008D, // <control>
0x8E => 0x017D, // LATIN CAPITAL LETTER Z WITH CARON ('Ž')
0x8F => 0x008F, // <control>
0x90 => 0x0090, // <control>
0x91 => 0x2018, // LEFT SINGLE QUOTATION MARK ('')
0x92 => 0x2019, // RIGHT SINGLE QUOTATION MARK ('')
0x93 => 0x201C, // LEFT DOUBLE QUOTATION MARK ('“')
0x94 => 0x201D, // RIGHT DOUBLE QUOTATION MARK ('”')
0x95 => 0x2022, // BULLET ('•')
0x96 => 0x2013, // EN DASH ('')
0x97 => 0x2014, // EM DASH ('—')
0x98 => 0x02DC, // SMALL TILDE ('˜')
0x99 => 0x2122, // TRADE MARK SIGN ('™')
0x9A => 0x0161, // LATIN SMALL LETTER S WITH CARON ('š')
0x9B => 0x203A, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK ('')
0x9C => 0x0153, // LATIN SMALL LIGATURE OE ('œ')
0x9D => 0x009D, // <control>
0x9E => 0x017E, // LATIN SMALL LETTER Z WITH CARON ('ž')
0x9F => 0x0178, // LATIN CAPITAL LETTER Y WITH DIAERESIS ('Ÿ')
);
protected static $namedCharacterReferences;
protected static $namedCharacterReferenceMaxLength;
/**
* Returns the "real" Unicode codepoint of a malformed character
* reference.
*/
public static function getRealCodepoint($ref) {
if (!isset(self::$realCodepointTable[$ref])) return false;
else return self::$realCodepointTable[$ref];
}
/**
 * Returns the named character reference table, loading it on first use.
 *
 * The table is read from the serialized file
 * named-character-references.ser that sits next to this source file and
 * is then kept in a static property for subsequent calls.
 *
 * @return mixed Whatever the serialized file contains.
 */
public static function getNamedCharacterReferences() {
    // Already loaded (and non-empty): reuse the cached copy.
    if (self::$namedCharacterReferences) {
        return self::$namedCharacterReferences;
    }
    $serialized = file_get_contents(dirname(__FILE__) . '/named-character-references.ser');
    self::$namedCharacterReferences = unserialize($serialized);
    return self::$namedCharacterReferences;
}
/**
 * Encodes a Unicode codepoint as a UTF-8 byte sequence.
 *
 * No range validation is performed: surrogates and values above
 * 0x10FFFF are encoded as-is rather than replaced.
 *
 * @note Shamelessly stolen from HTML Purifier, which is also
 *       shamelessly stolen from Feyd (which is in public domain).
 *
 * @param int $code The codepoint to encode.
 * @return string One to four bytes.
 */
public static function utf8chr($code) {
    // Single byte: plain ASCII.
    if ($code < 0x80) {
        return chr($code);
    }

    // Every multi-byte form ends with a continuation byte carrying the
    // lowest six bits.
    $tail = chr(($code & 0x3F) | 0x80);

    // Two bytes: 110xxxxx 10xxxxxx
    if ($code < 0x800) {
        return chr((($code & 0x7FF) >> 6) | 0xC0) . $tail;
    }

    // Three and four byte forms share the second-to-last continuation byte.
    $tail = chr((($code & 0xFC0) >> 6) | 0x80) . $tail;

    // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
    if ($code < 0x10000) {
        return chr((($code >> 12) & 0x0F) | 0xE0) . $tail;
    }

    // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    $lead = chr((($code >> 18) & 0x07) | 0xF0);
    $second = chr((($code >> 12) & 0x3F) | 0x80);
    return $lead . $second . $tail;
}
}

View File

@ -1,284 +0,0 @@
<?php
/*
Copyright 2009 Geoffrey Sneddon <http://gsnedders.com/>
Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
// Some conventions:
// /* */ indicates verbatim text from the HTML 5 specification
// // indicates regular comments
class HTML5_InputStream {
    /**
     * The string data we're parsing.
     */
    private $data;

    /**
     * The current integer byte position we are in $data
     */
    private $char;

    /**
     * Length of $data; when $char === $EOF, we are at the end-of-file.
     */
    private $EOF;

    /**
     * Parse errors.
     */
    public $errors = array();

    /**
     * Normalises the input (invalid bytes dropped, BOM stripped, NULs
     * replaced, newlines unified) and records one parse error per NUL
     * and per invalid codepoint found.
     *
     * @param $data Data to parse
     * @throws Exception When the iconv extension is not loaded.
     */
    public function __construct($data) {
        /* Given an encoding, the bytes in the input stream must be
        converted to Unicode characters for the tokeniser, as
        described by the rules for that encoding, except that the
        leading U+FEFF BYTE ORDER MARK character, if any, must not
        be stripped by the encoding layer (it is stripped by the rule below).
        Bytes or sequences of bytes in the original byte stream that
        could not be converted to Unicode characters must be converted
        to U+FFFD REPLACEMENT CHARACTER code points. */
        // XXX currently assuming input data is UTF-8; once we
        // build encoding detection this will no longer be the case
        //
        // We previously had an mbstring implementation here, but that
        // implementation is heavily non-conforming, so it's been
        // omitted.
        if (extension_loaded('iconv')) {
            // non-conforming
            $data = @iconv('UTF-8', 'UTF-8//IGNORE', $data);
        } else {
            // we can make a conforming native implementation
            throw new Exception('Not implemented, please install mbstring or iconv');
        }

        /* One leading U+FEFF BYTE ORDER MARK character must be
        ignored if any are present. */
        if (substr($data, 0, 3) === "\xEF\xBB\xBF") {
            $data = substr($data, 3);
        }

        /* All U+0000 NULL characters in the input must be replaced
        by U+FFFD REPLACEMENT CHARACTERs. Any occurrences of such
        characters is a parse error. */
        for ($i = 0, $count = substr_count($data, "\0"); $i < $count; $i++) {
            $this->errors[] = array(
                'type' => HTML5_Tokenizer::PARSEERROR,
                'data' => 'null-character'
            );
        }

        /* U+000D CARRIAGE RETURN (CR) characters and U+000A LINE FEED
        (LF) characters are treated specially. Any CR characters
        that are followed by LF characters must be removed, and any
        CR characters not followed by LF characters must be converted
        to LF characters. Thus, newlines in HTML DOMs are represented
        by LF characters, and there are never any CR characters in the
        input to the tokenization stage. */
        $data = str_replace(
            array(
                "\0",
                "\r\n",
                "\r"
            ),
            array(
                "\xEF\xBF\xBD",
                "\n",
                "\n"
            ),
            $data
        );

        /* Any occurrences of any characters in the ranges U+0001 to
        U+0008, U+000B, U+000E to U+001F, U+007F to U+009F,
        U+D800 to U+DFFF , U+FDD0 to U+FDEF, and
        characters U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF,
        U+3FFFE, U+3FFFF, U+4FFFE, U+4FFFF, U+5FFFE, U+5FFFF, U+6FFFE,
        U+6FFFF, U+7FFFE, U+7FFFF, U+8FFFE, U+8FFFF, U+9FFFE, U+9FFFF,
        U+AFFFE, U+AFFFF, U+BFFFE, U+BFFFF, U+CFFFE, U+CFFFF, U+DFFFE,
        U+DFFFF, U+EFFFE, U+EFFFF, U+FFFFE, U+FFFFF, U+10FFFE, and
        U+10FFFF are parse errors. (These are all control characters
        or permanently undefined Unicode characters.) */
        // Check PCRE is loaded.
        if (extension_loaded('pcre')) {
            // The pattern uses the x modifier, so the layout whitespace and
            // the trailing "#" remarks inside it are ignored by PCRE.
            $count = preg_match_all(
                '/(?:
                [\x01-\x08\x0B\x0E-\x1F\x7F] # U+0001 to U+0008, U+000B, U+000E to U+001F and U+007F
                |
                \xC2[\x80-\x9F] # U+0080 to U+009F
                |
                \xED(?:\xA0[\x80-\xFF]|[\xA1-\xBE][\x00-\xFF]|\xBF[\x00-\xBF]) # U+D800 to U+DFFF
                |
                \xEF\xB7[\x90-\xAF] # U+FDD0 to U+FDEF
                |
                \xEF\xBF[\xBE\xBF] # U+FFFE and U+FFFF
                |
                [\xF0-\xF4][\x8F-\xBF]\xBF[\xBE\xBF] # U+nFFFE and U+nFFFF (1 <= n <= 10_{16})
                )/x',
                $data,
                $matches
            );
            for ($i = 0; $i < $count; $i++) {
                $this->errors[] = array(
                    'type' => HTML5_Tokenizer::PARSEERROR,
                    'data' => 'invalid-codepoint'
                );
            }
        } else {
            // XXX: Need non-PCRE impl, probably using substr_count
        }

        $this->data = $data;
        $this->char = 0;
        $this->EOF = strlen($data);
    }

    /**
     * Returns the current line that the tokenizer is at.
     *
     * @return int 1-based line number.
     */
    public function getCurrentLine() {
        // Check the string isn't empty
        if ($this->EOF) {
            // Count the newlines before the cursor; the cursor is clamped
            // to EOF because char() keeps incrementing it past the end.
            return substr_count($this->data, "\n", 0, min($this->char, $this->EOF)) + 1;
        } else {
            // If the string is empty, we are on the first line (sorta).
            return 1;
        }
    }

    /**
     * Returns the current column of the current line that the tokenizer is at.
     *
     * @return int Number of characters between the last newline before
     *             the cursor and the cursor itself.
     */
    public function getColumnOffset() {
        // char() keeps incrementing the cursor after EOF, so clamp it to the
        // data length just like getCurrentLine() does.
        $pos = min($this->char, $this->EOF);

        // Nothing consumed yet (or empty input): column 0. Without this
        // guard the strrpos() offset below would fall outside the string,
        // which is a warning on PHP < 8 and a ValueError on PHP 8.
        if ($pos <= 0) {
            return 0;
        }

        // strrpos is weird, and the offset needs to be negative for what we
        // want (i.e., the last \n before the cursor). This needs to not have
        // one (to make it point to the next character, the one we want the
        // position of) added to it because strrpos's behaviour includes the
        // final offset byte.
        $lastLine = strrpos($this->data, "\n", $pos - 1 - $this->EOF);

        // However, for here we want the length up until the next byte to be
        // processed, so add one to the current byte.
        if ($lastLine !== false) {
            $findLengthOf = substr($this->data, $lastLine + 1, $pos - 1 - $lastLine);
        } else {
            $findLengthOf = substr($this->data, 0, $pos);
        }

        // Get the length for the string we need.
        if (extension_loaded('iconv')) {
            return iconv_strlen($findLengthOf, 'utf-8');
        } elseif (extension_loaded('mbstring')) {
            return mb_strlen($findLengthOf, 'utf-8');
        } elseif (extension_loaded('xml')) {
            return strlen(utf8_decode($findLengthOf));
        } else {
            // Count only bytes that start a character: ASCII bytes and
            // UTF-8 lead bytes, skipping continuation bytes.
            $count = count_chars($findLengthOf);
            // 0x80 = 0x7F - 0 + 1 (one added to get inclusive range)
            // 0x33 = 0xF4 - 0xC2 + 1 (one added to get inclusive range)
            return array_sum(array_slice($count, 0, 0x80)) +
                array_sum(array_slice($count, 0xC2, 0x33));
        }
    }

    /**
     * Retrieve the currently consume character.
     * @note This performs bounds checking
     *
     * @return string|false The next byte, or false at end-of-file. The
     *                      cursor is advanced in both cases.
     */
    public function char() {
        return ($this->char++ < $this->EOF)
            ? $this->data[$this->char - 1]
            : false;
    }

    /**
     * Get all characters until EOF.
     * @note This performs bounds checking
     *
     * @return string|false The unread remainder, or false when already at
     *                      (or past) end-of-file.
     */
    public function remainingChars() {
        if ($this->char < $this->EOF) {
            $data = substr($this->data, $this->char);
            $this->char = $this->EOF;
            return $data;
        } else {
            return false;
        }
    }

    /**
     * Matches as far as possible until we reach a certain set of bytes
     * and returns the matched substring.
     *
     * @param $bytes Bytes to match.
     * @param $max Optional maximum number of bytes to examine.
     * @return string|false The consumed bytes (possibly empty), or false
     *                      at end-of-file.
     */
    public function charsUntil($bytes, $max = null) {
        if ($this->char < $this->EOF) {
            // An explicit 0 is a valid limit, so test for it separately
            // from the "no limit" default of null.
            if ($max === 0 || $max) {
                $len = strcspn($this->data, $bytes, $this->char, $max);
            } else {
                $len = strcspn($this->data, $bytes, $this->char);
            }
            $string = (string) substr($this->data, $this->char, $len);
            $this->char += $len;
            return $string;
        } else {
            return false;
        }
    }

    /**
     * Matches as far as possible with a certain set of bytes
     * and returns the matched substring.
     *
     * @param $bytes Bytes to match.
     * @param $max Optional maximum number of bytes to examine.
     * @return string|false The consumed bytes (possibly empty), or false
     *                      at end-of-file.
     */
    public function charsWhile($bytes, $max = null) {
        if ($this->char < $this->EOF) {
            if ($max === 0 || $max) {
                $len = strspn($this->data, $bytes, $this->char, $max);
            } else {
                $len = strspn($this->data, $bytes, $this->char);
            }
            $string = (string) substr($this->data, $this->char, $len);
            $this->char += $len;
            return $string;
        } else {
            return false;
        }
    }

    /**
     * Unconsume one character.
     *
     * Does nothing once the cursor has moved past end-of-file.
     */
    public function unget() {
        if ($this->char <= $this->EOF) {
            $this->char--;
        }
    }
}

View File

@ -1,36 +0,0 @@
<?php
require_once dirname(__FILE__) . '/Data.php';
require_once dirname(__FILE__) . '/InputStream.php';
require_once dirname(__FILE__) . '/TreeBuilder.php';
require_once dirname(__FILE__) . '/Tokenizer.php';
/**
 * Public entry point of the HTML5 parser: thin static wrappers that
 * drive an HTML5_Tokenizer and hand back what it produced.
 */
class HTML5_Parser
{
    /**
     * Parses a full HTML document.
     *
     * @param $text HTML text to parse
     * @param $builder Custom builder implementation
     * @return Parsed HTML as DOMDocument
     */
    public static function parse($text, $builder = null) {
        $tok = new HTML5_Tokenizer($text, $builder);
        $tok->parse();

        return $tok->save();
    }

    /**
     * Parses an HTML fragment.
     *
     * @param $text HTML text to parse
     * @param $context String name of context element to pretend parsing is in.
     * @param $builder Custom builder implementation
     * @return Parsed HTML as DOMDocument
     */
    public static function parseFragment($text, $context = null, $builder = null) {
        $tok = new HTML5_Tokenizer($text, $builder);
        $tok->parseFragment($context);

        return $tok->save();
    }
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,220 @@
<?php
/**
* The main HTML5 front end.
*/
use HTML5\Parser\StringInputStream;
use HTML5\Parser\FileInputStream;
use HTML5\Parser\Scanner;
use HTML5\Parser\Tokenizer;
use HTML5\Parser\DOMTreeBuilder;
use HTML5\Serializer\OutputRules;
use HTML5\Serializer\Traverser;
/**
 * This class offers convenience methods for parsing and serializing HTML5.
 * It is roughly designed to mirror the \DOMDocument class that is
 * provided with most versions of PHP.
 *
 * EXPERIMENTAL. This may change or be completely replaced.
 */
class HTML5 {
    /**
     * Global options for the parser and serializer.
     * @var array
     */
    public static $options = array(
        // If the serializer should encode all entities.
        'encode_entities' => FALSE,
    );

    /**
     * Load and parse an HTML file.
     *
     * This will apply the HTML5 parser, which is tolerant of many
     * varieties of HTML, including XHTML 1, HTML 4, and well-formed HTML
     * 3. Note that in these cases, not all of the old data will be
     * preserved. For example, XHTML's XML declaration will be removed.
     *
     * The rules governing parsing are set out in the HTML 5 spec.
     *
     * @param string $file
     *   The path to the file to parse. If this is a resource, it is
     *   assumed to be an open stream whose pointer is set to the first
     *   byte of input.
     * @return \DOMDocument
     *   A DOM document. This object type is defined by the libxml
     *   library, and should have been included with your version of PHP.
     */
    public static function load($file) {
        // Handle the case where file is a resource: read the rest of the
        // stream into memory and parse it as a string.
        if (is_resource($file)) {
            // FIXME: We need a StreamInputStream class.
            return static::loadHTML(stream_get_contents($file));
        }

        $input = new FileInputStream($file);
        return static::parse($input);
    }

    /**
     * Parse a HTML Document from a string.
     *
     * Take a string of HTML 5 (or earlier) and parse it into a
     * DOMDocument.
     *
     * @param string $string
     *   A html5 document as a string.
     * @return \DOMDocument
     *   A DOM document. DOM is part of libxml, which is included with
     *   almost all distributions of PHP.
     */
    public static function loadHTML($string) {
        $input = new StringInputStream($string);
        return static::parse($input);
    }

    /**
     * Convenience function to load an HTML file.
     *
     * This is here to provide backwards compatibility with the
     * PHP DOM implementation. It simply calls load().
     *
     * @param string $file
     *   The path to the file to parse. If this is a resource, it is
     *   assumed to be an open stream whose pointer is set to the first
     *   byte of input.
     * @param mixed $options
     *   Accepted for signature compatibility only; load() takes no
     *   options, so this value is currently ignored.
     *
     * @return \DOMDocument
     *   A DOM document. This object type is defined by the libxml
     *   library, and should have been included with your version of PHP.
     */
    public static function loadHTMLFile($file, $options = NULL) {
        return static::load($file);
    }

    /**
     * Parse a HTML fragment from a string.
     *
     * @param string $string
     *   The html5 fragment as a string.
     *
     * @return \DOMDocumentFragment
     *   A DOM fragment. The DOM is part of libxml, which is included with
     *   almost all distributions of PHP.
     */
    public static function loadHTMLFragment($string) {
        $input = new StringInputStream($string);
        return static::parseFragment($input);
    }

    /**
     * Save a DOM into a given file as HTML5.
     *
     * @param mixed $dom
     *   The DOM to be serialized.
     * @param string $file
     *   The filename to be written, or an already open stream resource.
     *   A resource passed in is left open; a file opened here is closed.
     * @param array $options
     *   Configuration options when serializing the DOM. These include:
     *   - encode_entities: Text written to the output is escaped by default and not all
     *     entities are encoded. If this is set to TRUE all entities will be encoded.
     *     Defaults to FALSE.
     */
    public static function save($dom, $file, $options = array()) {
        // Per-call options win over the global defaults.
        $options = $options + static::options();

        $close = TRUE;
        if (is_resource($file)) {
            // The caller owns this stream, so do not close it afterwards.
            $stream = $file;
            $close = FALSE;
        }
        else {
            $stream = fopen($file, 'w');
        }

        $rules = new OutputRules($stream, $options);
        $trav = new Traverser($dom, $stream, $rules, $options);
        $trav->walk();

        if ($close) {
            fclose($stream);
        }
    }

    /**
     * Convert a DOM into an HTML5 string.
     *
     * @param mixed $dom
     *   The DOM to be serialized.
     * @param array $options
     *   Configuration options when serializing the DOM. These include:
     *   - encode_entities: Text written to the output is escaped by default and not all
     *     entities are encoded. If this is set to TRUE all entities will be encoded.
     *     Defaults to FALSE.
     *
     * @return string
     *   A HTML5 document generated from the DOM.
     */
    public static function saveHTML($dom, $options = array()) {
        $stream = fopen('php://temp', 'w');
        static::save($dom, $stream, $options);

        // Read everything back from offset 0, then release the temporary
        // stream: save() never closes a resource it was handed.
        $html = stream_get_contents($stream, -1, 0);
        fclose($stream);

        return $html;
    }

    /**
     * Parse an input stream.
     *
     * Lower-level loading function. This requires an input stream instead
     * of a string, file, or resource.
     *
     * @param \HTML5\Parser\InputStream $input
     *   The stream to read from.
     * @return mixed
     *   The result of DOMTreeBuilder::document().
     */
    public static function parse(\HTML5\Parser\InputStream $input) {
        $events = new DOMTreeBuilder();
        $scanner = new Scanner($input);
        $parser = new Tokenizer($scanner, $events);

        $parser->parse();

        return $events->document();
    }

    /**
     * Parse an input stream where the stream is a fragment.
     *
     * Lower-level loading function. This requires an input stream instead
     * of a string, file, or resource.
     *
     * @param \HTML5\Parser\InputStream $input
     *   The stream to read from.
     * @return mixed
     *   The result of DOMTreeBuilder::fragment().
     */
    public static function parseFragment(\HTML5\Parser\InputStream $input) {
        $events = new DOMTreeBuilder(TRUE);
        $scanner = new Scanner($input);
        $parser = new Tokenizer($scanner, $events);

        $parser->parse();

        return $events->fragment();
    }

    /**
     * Get the default options.
     *
     * @return array
     *   The default options.
     */
    public static function options() {
        return static::$options;
    }

    /**
     * Set a default option.
     *
     * @param string $name
     *   The option name.
     * @param mixed $value
     *   The option value.
     */
    public static function setOption($name, $value) {
        static::$options[$name] = $value;
    }
}

View File

@ -0,0 +1,614 @@
<?php
/**
* Provide general element functions.
*/
namespace HTML5;
/**
 * This class provides general information about HTML5 elements,
 * including syntactic and semantic issues. Parsers and serializers can
 * use this class as a reference point for information about the rules
 * of various HTML5 elements.
 *
 * Each element maps to an integer bitmask built from the constants below.
 * In the per-element comments, "NORMAL" stands for KNOWN_ELEMENT (1) and
 * "RAW_TEXT" for TEXT_RAW (2).
 *
 * @todo consider using a bitmask table lookup. There is enough overlap in
 *   naming that this could significantly shrink the size and maybe make it
 *   faster. See the Go teams implementation at https://code.google.com/p/go/source/browse/html/atom.
 */
class Elements {
    /** Indicates an element is described in the specification. */
    const KNOWN_ELEMENT = 1;

    // From section 8.1.2: "script", "style"
    // From 8.2.5.4.7 ("in body" insertion mode): "noembed", "noscript"
    // From 8.4 "style", "xmp", "iframe", "noembed", "noframes"
    /** Indicates the contained text should be processed as raw text. */
    const TEXT_RAW = 2;

    // From section 8.1.2: "textarea", "title"
    /** Indicates the contained text should be processed as RCDATA. */
    const TEXT_RCDATA = 4;

    /** Indicates the tag cannot have content. */
    const VOID_TAG = 8;

    // "address", "article", "aside", "blockquote", "center", "details", "dialog", "dir", "div", "dl",
    // "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "menu",
    // "nav", "ol", "p", "section", "summary", "ul"
    // "h1", "h2", "h3", "h4", "h5", "h6"
    // "pre", "listing"
    // "form"
    // "plaintext"
    /**
     * Indicates that if a previous event is for a P tag, that element
     * should be considered closed.
     */
    const AUTOCLOSE_P = 16;

    /** Indicates that the text inside is plaintext (pre). */
    const TEXT_PLAINTEXT = 32;

    // See https://developer.mozilla.org/en-US/docs/HTML/Block-level_elements
    /** Indicates that the tag is a block. */
    const BLOCK_TAG = 64;

    /**
     * The HTML5 elements as defined in http://dev.w3.org/html5/markup/elements.html.
     * @var array
     */
    public static $html5 = array(
        "a" => 1,
        "abbr" => 1,
        "address" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG (not a void element)
        "area" => 9, // NORMAL | VOID_TAG
        "article" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG
        "aside" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG
        "audio" => 65, // NORMAL | BLOCK_TAG
        "b" => 1,
        "base" => 9, // NORMAL | VOID_TAG
        "bdi" => 1,
        "bdo" => 1,
        "blockquote" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG
        "body" => 1,
        "br" => 9, // NORMAL | VOID_TAG
        "button" => 1,
        "canvas" => 65, // NORMAL | BLOCK_TAG
        "caption" => 1,
        "cite" => 1,
        "code" => 1,
        "col" => 9, // NORMAL | VOID_TAG
        "colgroup" => 1,
        "command" => 9, // NORMAL | VOID_TAG
        //"data" => 1, // This is highly experimental and only part of the whatwg spec (not w3c). See https://developer.mozilla.org/en-US/docs/HTML/Element/data
        "datalist" => 1,
        "dd" => 65, // NORMAL | BLOCK_TAG
        "del" => 1,
        "details" => 17, // NORMAL | AUTOCLOSE_P,
        "dfn" => 1,
        "dialog" => 17, // NORMAL | AUTOCLOSE_P,
        "div" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG
        "dl" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG
        "dt" => 1,
        "em" => 1,
        "embed" => 9, // NORMAL | VOID_TAG
        "fieldset" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG
        "figcaption" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG
        "figure" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG
        "footer" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG
        "form" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG
        "h1" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG
        "h2" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG
        "h3" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG
        "h4" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG
        "h5" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG
        "h6" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG
        "head" => 1,
        "header" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG
        "hgroup" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG
        "hr" => 73, // NORMAL | VOID_TAG | BLOCK_TAG
        "html" => 1,
        "i" => 1,
        "iframe" => 3, // NORMAL | TEXT_RAW
        "img" => 9, // NORMAL | VOID_TAG
        "input" => 9, // NORMAL | VOID_TAG
        "kbd" => 1,
        "ins" => 1,
        "keygen" => 9, // NORMAL | VOID_TAG
        "label" => 1,
        "legend" => 1,
        "li" => 1,
        "link" => 9, // NORMAL | VOID_TAG
        "map" => 1,
        "mark" => 1,
        "menu" => 17, // NORMAL | AUTOCLOSE_P,
        "meta" => 9, // NORMAL | VOID_TAG
        "meter" => 1,
        "nav" => 17, // NORMAL | AUTOCLOSE_P,
        "noscript" => 67, // NORMAL | TEXT_RAW | BLOCK_TAG
        "object" => 1,
        "ol" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG
        "optgroup" => 1,
        "option" => 1,
        "output" => 65, // NORMAL | BLOCK_TAG
        "p" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG
        "param" => 9, // NORMAL | VOID_TAG
        "pre" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG
        "progress" => 1,
        "q" => 1,
        "rp" => 1,
        "rt" => 1,
        "ruby" => 1,
        "s" => 1,
        "samp" => 1,
        "script" => 3, // NORMAL | TEXT_RAW
        "section" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG
        "select" => 1,
        "small" => 1,
        "source" => 9, // NORMAL | VOID_TAG
        "span" => 1,
        "strong" => 1,
        "style" => 3, // NORMAL | TEXT_RAW (raw text, like script)
        "sub" => 1,
        "summary" => 17, // NORMAL | AUTOCLOSE_P,
        "sup" => 1,
        "table" => 65, // NORMAL | BLOCK_TAG
        "tbody" => 1,
        "td" => 1,
        "textarea" => 5, // NORMAL | TEXT_RCDATA
        "tfoot" => 65, // NORMAL | BLOCK_TAG
        "th" => 1,
        "thead" => 1,
        "time" => 1,
        "title" => 5, // NORMAL | TEXT_RCDATA
        "tr" => 1,
        "track" => 9, // NORMAL | VOID_TAG
        "u" => 1,
        "ul" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG
        "var" => 1,
        "video" => 65, // NORMAL | BLOCK_TAG
        "wbr" => 9, // NORMAL | VOID_TAG

        // Legacy?
        'basefont' => 8, // VOID_TAG
        'bgsound' => 8, // VOID_TAG
        'noframes' => 2, // RAW_TEXT
        'frame' => 9, // NORMAL | VOID_TAG
        'frameset' => 1,
        'center' => 16, 'dir' => 16, 'listing' => 16, // AUTOCLOSE_P
        'plaintext' => 48, // AUTOCLOSE_P | TEXT_PLAINTEXT
        'applet' => 0,
        'marquee' => 0,
        'isindex' => 8, // VOID_TAG
        // NOTE(review): 20 is AUTOCLOSE_P | TEXT_RCDATA; the previous comment
        // claimed "AUTOCLOSE_P | VOID_TAG | RAW_TEXT", which would be 26.
        // Value left untouched - confirm which one was intended.
        'xmp' => 20, // AUTOCLOSE_P | TEXT_RCDATA
        'noembed' => 2, // RAW_TEXT
    );

    /**
     * The MathML elements. See http://www.w3.org/wiki/MathML/Elements.
     *
     * In our case we are only concerned with presentation MathML and not content
     * MathML. There is a nice list of this subset at https://developer.mozilla.org/en-US/docs/MathML/Element.
     *
     * @var array
     */
    public static $mathml = array(
        "maction" => 1,
        "maligngroup" => 1,
        "malignmark" => 1,
        "math" => 1,
        "menclose" => 1,
        "merror" => 1,
        "mfenced" => 1,
        "mfrac" => 1,
        "mglyph" => 1,
        "mi" => 1,
        "mlabeledtr" => 1,
        "mlongdiv" => 1,
        "mmultiscripts" => 1,
        "mn" => 1,
        "mo" => 1,
        "mover" => 1,
        "mpadded" => 1,
        "mphantom" => 1,
        "mroot" => 1,
        "mrow" => 1,
        "ms" => 1,
        "mscarries" => 1,
        "mscarry" => 1,
        "msgroup" => 1,
        "msline" => 1,
        "mspace" => 1,
        "msqrt" => 1,
        "msrow" => 1,
        "mstack" => 1,
        "mstyle" => 1,
        "msub" => 1,
        "msup" => 1,
        "msubsup" => 1,
        "mtable" => 1,
        "mtd" => 1,
        "mtext" => 1,
        "mtr" => 1,
        "munder" => 1,
        "munderover" => 1,
    );

    /**
     * The svg elements.
     *
     * The Mozilla documentation has a good list at https://developer.mozilla.org/en-US/docs/SVG/Element.
     * The w3c list appears to be lacking in some areas like filter effect elements.
     * That list can be found at http://www.w3.org/wiki/SVG/Elements.
     *
     * Note, FireFox appears to do a better job rendering filter effects than chrome.
     * While they are in the spec I'm not sure how widely implemented they are.
     *
     * @var array
     */
    public static $svg = array(
        "a" => 1,
        "altGlyph" => 1,
        "altGlyphDef" => 1,
        "altGlyphItem" => 1,
        "animate" => 1,
        "animateColor" => 1,
        "animateMotion" => 1,
        "animateTransform" => 1,
        "circle" => 1,
        "clipPath" => 1,
        "color-profile" => 1,
        "cursor" => 1,
        "defs" => 1,
        "desc" => 1,
        "ellipse" => 1,
        "feBlend" => 1,
        "feColorMatrix" => 1,
        "feComponentTransfer" => 1,
        "feComposite" => 1,
        "feConvolveMatrix" => 1,
        "feDiffuseLighting" => 1,
        "feDisplacementMap" => 1,
        "feDistantLight" => 1,
        "feFlood" => 1,
        "feFuncA" => 1,
        "feFuncB" => 1,
        "feFuncG" => 1,
        "feFuncR" => 1,
        "feGaussianBlur" => 1,
        "feImage" => 1,
        "feMerge" => 1,
        "feMergeNode" => 1,
        "feMorphology" => 1,
        "feOffset" => 1,
        "fePointLight" => 1,
        "feSpecularLighting" => 1,
        "feSpotLight" => 1,
        "feTile" => 1,
        "feTurbulence" => 1,
        "filter" => 1,
        "font" => 1,
        "font-face" => 1,
        "font-face-format" => 1,
        "font-face-name" => 1,
        "font-face-src" => 1,
        "font-face-uri" => 1,
        "foreignObject" => 1,
        "g" => 1,
        "glyph" => 1,
        "glyphRef" => 1,
        "hkern" => 1,
        "image" => 1,
        "line" => 1,
        "linearGradient" => 1,
        "marker" => 1,
        "mask" => 1,
        "metadata" => 1,
        "missing-glyph" => 1,
        "mpath" => 1,
        "path" => 1,
        "pattern" => 1,
        "polygon" => 1,
        "polyline" => 1,
        "radialGradient" => 1,
        "rect" => 1,
        "script" => 3, // NORMAL | RAW_TEXT
        "set" => 1,
        "stop" => 1,
        "style" => 3, // NORMAL | RAW_TEXT
        "svg" => 1,
        "switch" => 1,
        "symbol" => 1,
        "text" => 1,
        "textPath" => 1,
        "title" => 1,
        "tref" => 1,
        "tspan" => 1,
        "use" => 1,
        "view" => 1,
        "vkern" => 1,
    );

    /**
     * Some attributes in SVG are case sensitive.
     *
     * This map contains key/value pairs with the key as the lowercase attribute
     * name and the value with the correct casing.
     */
    public static $svgCaseSensitiveAttributeMap = array(
        'attributename' => 'attributeName',
        'attributetype' => 'attributeType',
        'basefrequency' => 'baseFrequency',
        'baseprofile' => 'baseProfile',
        'calcmode' => 'calcMode',
        'clippathunits' => 'clipPathUnits',
        'contentscripttype' => 'contentScriptType',
        'contentstyletype' => 'contentStyleType',
        'diffuseconstant' => 'diffuseConstant',
        'edgemode' => 'edgeMode',
        'externalresourcesrequired' => 'externalResourcesRequired',
        'filterres' => 'filterRes',
        'filterunits' => 'filterUnits',
        'glyphref' => 'glyphRef',
        'gradienttransform' => 'gradientTransform',
        'gradientunits' => 'gradientUnits',
        'kernelmatrix' => 'kernelMatrix',
        'kernelunitlength' => 'kernelUnitLength',
        'keypoints' => 'keyPoints',
        'keysplines' => 'keySplines',
        'keytimes' => 'keyTimes',
        'lengthadjust' => 'lengthAdjust',
        'limitingconeangle' => 'limitingConeAngle',
        'markerheight' => 'markerHeight',
        'markerunits' => 'markerUnits',
        'markerwidth' => 'markerWidth',
        'maskcontentunits' => 'maskContentUnits',
        'maskunits' => 'maskUnits',
        'numoctaves' => 'numOctaves',
        'pathlength' => 'pathLength',
        'patterncontentunits' => 'patternContentUnits',
        'patterntransform' => 'patternTransform',
        'patternunits' => 'patternUnits',
        'pointsatx' => 'pointsAtX',
        'pointsaty' => 'pointsAtY',
        'pointsatz' => 'pointsAtZ',
        'preservealpha' => 'preserveAlpha',
        'preserveaspectratio' => 'preserveAspectRatio',
        'primitiveunits' => 'primitiveUnits',
        'refx' => 'refX',
        'refy' => 'refY',
        'repeatcount' => 'repeatCount',
        'repeatdur' => 'repeatDur',
        'requiredextensions' => 'requiredExtensions',
        'requiredfeatures' => 'requiredFeatures',
        'specularconstant' => 'specularConstant',
        'specularexponent' => 'specularExponent',
        'spreadmethod' => 'spreadMethod',
        'startoffset' => 'startOffset',
        'stddeviation' => 'stdDeviation',
        'stitchtiles' => 'stitchTiles',
        'surfacescale' => 'surfaceScale',
        'systemlanguage' => 'systemLanguage',
        'tablevalues' => 'tableValues',
        'targetx' => 'targetX',
        'targety' => 'targetY',
        'textlength' => 'textLength',
        'viewbox' => 'viewBox',
        'viewtarget' => 'viewTarget',
        'xchannelselector' => 'xChannelSelector',
        'ychannelselector' => 'yChannelSelector',
        'zoomandpan' => 'zoomAndPan',
    );

    /**
     * Some SVG elements are case sensitive. This map contains these.
     *
     * The map is a key/value store with the lowercase name as the key and
     * the correct casing as the value.
     */
    public static $svgCaseSensitiveElementMap = array(
        'altglyph' => 'altGlyph',
        'altglyphdef' => 'altGlyphDef',
        'altglyphitem' => 'altGlyphItem',
        'animatecolor' => 'animateColor',
        'animatemotion' => 'animateMotion',
        'animatetransform' => 'animateTransform',
        'clippath' => 'clipPath',
        'feblend' => 'feBlend',
        'fecolormatrix' => 'feColorMatrix',
        'fecomponenttransfer' => 'feComponentTransfer',
        'fecomposite' => 'feComposite',
        'feconvolvematrix' => 'feConvolveMatrix',
        'fediffuselighting' => 'feDiffuseLighting',
        'fedisplacementmap' => 'feDisplacementMap',
        'fedistantlight' => 'feDistantLight',
        'feflood' => 'feFlood',
        'fefunca' => 'feFuncA',
        'fefuncb' => 'feFuncB',
        'fefuncg' => 'feFuncG',
        'fefuncr' => 'feFuncR',
        'fegaussianblur' => 'feGaussianBlur',
        'feimage' => 'feImage',
        'femerge' => 'feMerge',
        'femergenode' => 'feMergeNode',
        'femorphology' => 'feMorphology',
        'feoffset' => 'feOffset',
        'fepointlight' => 'fePointLight',
        'fespecularlighting' => 'feSpecularLighting',
        'fespotlight' => 'feSpotLight',
        'fetile' => 'feTile',
        'feturbulence' => 'feTurbulence',
        'foreignobject' => 'foreignObject',
        'glyphref' => 'glyphRef',
        'lineargradient' => 'linearGradient',
        'radialgradient' => 'radialGradient',
        'textpath' => 'textPath',
    );

    /**
     * Check whether the given element meets the given criterion.
     *
     * Example:
     *
     *     Elements::isA('script', Elements::TEXT_RAW); // Returns true.
     *
     *     Elements::isA('script', Elements::TEXT_RCDATA); // Returns false.
     *
     * @param string $name
     *   The element name.
     * @param int $mask
     *   One of the constants on this class (or several OR-ed together, in
     *   which case all of them must be set).
     * @return boolean
     *   TRUE if the element matches the mask, FALSE otherwise.
     */
    public static function isA($name, $mask) {
        if (!static::isElement($name)) {
            return FALSE;
        }

        return (static::element($name) & $mask) == $mask;
    }

    /**
     * Test if an element is a valid html5 element.
     *
     * @param string $name
     *   The name of the element.
     *
     * @return bool
     *   True if a html5 element and false otherwise.
     */
    public static function isHtml5Element($name) {
        // html5 element names are case insensitive. Forcing lowercase for the check.
        // Do we need this check or will all data passed here already be lowercase?
        return isset(static::$html5[strtolower($name)]);
    }

    /**
     * Test if an element name is a valid MathML presentation element.
     *
     * @param string $name
     *   The name of the element.
     *
     * @return bool
     *   True if a MathML name and false otherwise.
     */
    public static function isMathMLElement($name) {
        // MathML is case-sensitive unlike html5 elements.
        return isset(static::$mathml[$name]);
    }

    /**
     * Test if an element is a valid SVG element.
     *
     * @param string $name
     *   The name of the element.
     *
     * @return boolean
     *   True if a SVG element and false otherwise.
     */
    public static function isSvgElement($name) {
        // SVG is case-sensitive unlike html5 elements.
        return isset(static::$svg[$name]);
    }

    /**
     * Is an element name valid in an html5 document.
     *
     * This includes html5 elements along with other allowed embedded content
     * such as svg and mathml.
     *
     * @param string $name
     *   The name of the element.
     *
     * @return bool
     *   True if valid and false otherwise.
     */
    public static function isElement($name) {
        return static::isHtml5Element($name) || static::isMathMLElement($name) || static::isSvgElement($name);
    }

    /**
     * Get the element mask for the given element name.
     *
     * Exact-case matches are tried first (html5, then svg, then mathml).
     * If none is found the name is retried in lowercase against the html5
     * table, because isHtml5Element() treats HTML names as case
     * insensitive; without this isA() would reject names that
     * isElement() accepts.
     *
     * @param string $name
     *   The name of the element.
     *
     * @return int|bool
     *   The element mask, or FALSE when the name is unknown.
     */
    public static function element($name) {
        if (isset(static::$html5[$name])) {
            return static::$html5[$name];
        }
        if (isset(static::$svg[$name])) {
            return static::$svg[$name];
        }
        if (isset(static::$mathml[$name])) {
            return static::$mathml[$name];
        }

        $lower = strtolower($name);
        if (isset(static::$html5[$lower])) {
            return static::$html5[$lower];
        }

        return FALSE;
    }

    /**
     * Normalize a SVG element name to its proper case and form.
     *
     * @param string $name
     *   The name of the element.
     *
     * @return string
     *   The normalized form of the element name.
     */
    public static function normalizeSvgElement($name) {
        $name = strtolower($name);
        if (isset(static::$svgCaseSensitiveElementMap[$name])) {
            $name = static::$svgCaseSensitiveElementMap[$name];
        }

        return $name;
    }

    /**
     * Normalize a SVG attribute name to its proper case and form.
     *
     * @param string $name
     *   The name of the attribute.
     *
     * @return string
     *   The normalized form of the attribute name.
     */
    public static function normalizeSvgAttribute($name) {
        $name = strtolower($name);
        if (isset(static::$svgCaseSensitiveAttributeMap[$name])) {
            $name = static::$svgCaseSensitiveAttributeMap[$name];
        }

        return $name;
    }

    /**
     * Normalize a MathML attribute name to its proper case and form.
     *
     * Note, all MathML element names are lowercase.
     *
     * @param string $name
     *   The name of the attribute.
     *
     * @return string
     *   The normalized form of the attribute name.
     */
    public static function normalizeMathMlAttribute($name) {
        $name = strtolower($name);

        // Only one attribute has a mixed case form for MathML.
        if ($name == 'definitionurl') {
            $name = 'definitionURL';
        }

        return $name;
    }
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,8 @@
<?php
namespace HTML5;
/**
* The base exception for the HTML5 project.
*
* Declares no members of its own; it only provides an exception type in the
* HTML5 namespace that extends PHP's built-in \Exception.
*/
class Exception extends \Exception {
}

View File

@ -0,0 +1,43 @@
<?php
/**
* A handler for processor instructions.
*/
namespace HTML5;
/**
* Provide a processor to handle embedded instructions.
*
* XML defines a mechanism for inserting instructions (like PHP) into a
* document. These are called "processing instructions." The HTML5 parser
* provides an opportunity to handle these processing instructions during
* the tree-building phase (before the DOM is constructed), which makes
* it possible to alter the document as it is being created.
*
* One could, for example, use this mechanism to execute well-formed PHP
* code embedded inside of an HTML5 document.
*/
interface InstructionProcessor {
/**
* Process an individual processing instruction.
*
* The process() function is responsible for doing the following:
* - Determining whether $name is an instruction type it can handle.
* - Determining what to do with the data passed in.
* - Making any subsequent modifications to the DOM by modifying the
* DOMElement or its attached DOM tree.
*
* @param \DOMElement $element
* The parent element for the current processing instruction.
* @param string $name
* The instruction's name. E.g. `<?php` has the name `php`.
* @param string $data
* All of the data between the opening and closing PI marks.
* @return \DOMElement
* The element that should be considered "Current". This may just be
* the element passed in, but if the processor added more elements,
* it may choose to reset the current element to one of the elements
* it created. (When in doubt, return the element passed in.)
*/
public function process(\DOMElement $element, $name, $data);
}

View File

@ -0,0 +1,56 @@
<?php
namespace HTML5\Parser;
use \HTML5\Entities;
/**
 * Resolve HTML5 character references to UTF-8 text.
 *
 * Named references are looked up in \HTML5\Entities::$byName; numeric
 * references (decimal or hexadecimal) are decoded through mbstring.
 */
class CharacterReference {

    /**
     * Conversion map handed to mb_decode_numericentity().
     *
     * NOTE(review): presumably laid out as (start, end, offset, mask), which
     * would limit decoding to U+0000..U+2FFFF -- confirm against the
     * mbstring documentation before relying on higher planes.
     */
    protected static $numeric_mask = array(0x0, 0x2FFFF, 0, 0xFFFF);

    /**
     * Resolve a named character reference (e.g. 'amp') to its character(s).
     *
     * @param string $name
     *   The entity name, without the leading '&' or trailing ';'.
     * @return string|null
     *   The UTF-8 sequence for the entity (possibly more than one byte), or
     *   NULL when the name is not present in Entities::$byName.
     */
    public static function lookupName($name) {
        if (isset(Entities::$byName[$name])) {
            return Entities::$byName[$name];
        }
        // Unknown names yield NULL rather than U+FFFD.
        return NULL;
    }

    /**
     * Resolve a decimal code point to its UTF-8 character.
     *
     * @param int|string $int
     *   The decimal code point.
     * @return string
     *   Whatever mb_decode_numericentity() produces for "&#<int>;".
     */
    public static function lookupDecimal($int) {
        // Build the decimal reference and let mbstring do the decoding.
        return mb_decode_numericentity('&#' . $int . ';', static::$numeric_mask, 'utf-8');
    }

    /**
     * Resolve a hexadecimal code point to its UTF-8 character.
     *
     * @param string $hexdec
     *   The hexadecimal digits of the code point (no "x" prefix needed).
     * @return string
     *   The result of lookupDecimal() for the converted value.
     */
    public static function lookupHex($hexdec) {
        $codePoint = hexdec($hexdec);
        return static::lookupDecimal($codePoint);
    }
}

View File

@ -0,0 +1,475 @@
<?php
namespace HTML5\Parser;
use HTML5\Elements;
/**
* Create an HTML5 DOM tree from events.
*
* This attempts to create a DOM from events emitted by a parser. This
* attempts (but does not guarantee) to up-convert older HTML documents
* to HTML5. It does this by applying HTML5's rules, but it will not
* change the architecture of the document itself.
*
* Many of the error correction and quirks features suggested in the specification
* are implemented herein; however, not all of them are. Since we do not
* assume a graphical user agent, no presentation-specific logic is conducted
* during tree building.
*
* FIXME: The present tree builder does not exactly follow the state machine rules
* for insert modes as outlined in the HTML5 spec. The processor needs to be
* re-written to accommodate this. See, for example, the Go language HTML5
* parser.
*/
class DOMTreeBuilder implements EventHandler {
/**
 * Insertion modes. Defined in 8.2.5.
 *
 * startTag()/endTag() compare these numerically (e.g. "<= IM_BEFORE_HEAD"),
 * so the relative order of the values matters.
 */
const IM_INITIAL = 0;
const IM_BEFORE_HTML = 1;
const IM_BEFORE_HEAD = 2;
const IM_IN_HEAD = 3;
const IM_IN_HEAD_NOSCRIPT = 4;
const IM_AFTER_HEAD = 5;
const IM_IN_BODY = 6;
const IM_TEXT = 7;
const IM_IN_TABLE = 8;
const IM_IN_TABLE_TEXT = 9;
const IM_IN_CAPTION = 10;
const IM_IN_COLUMN_GROUP = 11;
const IM_IN_TABLE_BODY = 12;
const IM_IN_ROW = 13;
const IM_IN_CELL = 14;
const IM_IN_SELECT = 15;
const IM_IN_SELECT_IN_TABLE = 16;
const IM_AFTER_BODY = 17;
const IM_IN_FRAMESET = 18;
const IM_AFTER_FRAMESET = 19;
const IM_AFTER_AFTER_BODY = 20;
const IM_AFTER_AFTER_FRAMESET = 21;
const IM_IN_SVG = 22;
const IM_IN_MATHML = 23;
protected $stack = array();
protected $current; // Pointer in the tag hierarchy.
protected $doc;
protected $processor;
/**
 * The TreeBuildingRules instance created in the constructor.
 *
 * Declared explicitly so that assigning it in the constructor does not
 * create a dynamic property (deprecated as of PHP 8.2).
 */
protected $rules;
protected $insertMode = 0;
/**
 * Quirks mode is enabled by default. Any document that is missing the
 * DT will be considered to be in quirks mode.
 */
protected $quirks = TRUE;
public $isFragment = FALSE;
/**
 * Create the target DOM document and prime the builder state.
 *
 * @param boolean $isFragment
 *   When TRUE, an <html> root element is created immediately, it becomes
 *   the current node, and the builder starts in IM_IN_BODY.
 */
public function __construct($isFragment = FALSE) {
    $implementation = new \DOMImplementation();
    // XXX: Every document is created with the HTML5 doctype; older DTDs
    // are up-converted rather than preserved.
    $doctype = $implementation->createDocumentType('html');
    $this->doc = $implementation->createDocument(NULL, NULL, $doctype);
    $this->doc->errors = array();
    // No root element exists yet, so the document node is the insertion point.
    $this->current = $this->doc;
    // Create a rules engine for tags.
    $this->rules = new TreeBuildingRules($this->doc);
    if (!$isFragment) {
        return;
    }
    $this->isFragment = TRUE;
    $this->insertMode = static::IM_IN_BODY;
    $root = $this->doc->createElement('html');
    $this->doc->appendChild($root);
    $this->current = $root;
}
/**
* Get the document.
*
* @return \DOMDocument
* The document object created in the constructor. Parse errors recorded
* through parseError() are stored in its "errors" property.
*/
public function document() {
return $this->doc;
}
/**
 * Get the parsed content as a DOM fragment.
 *
 * All children of the document element are moved into a new document
 * fragment, and the document's recorded parse errors are copied onto the
 * fragment's "errors" property.
 *
 * @see http://www.w3.org/TR/2012/CR-html5-20121217/syntax.html#concept-frag-parse-context
 *
 * @return \DOMDocumentFragment
 *   A fragment holding zero or more nodes.
 */
public function fragment() {
    $frag = $this->doc->createDocumentFragment();
    // appendChild() modifies the live DOMNodeList, so take a snapshot of
    // the children before moving any of them.
    $children = iterator_to_array($this->doc->documentElement->childNodes);
    foreach ($children as $child) {
        $frag->appendChild($child);
    }
    $frag->errors = $this->doc->errors;
    return $frag;
}
/**
* Provide an instruction processor.
*
* This is used for handling processing instructions as they are
* inserted. If omitted, PIs are inserted directly into the DOM tree.
*
* @param \HTML5\InstructionProcessor $proc
* The processor that processingInstruction() will delegate to.
*/
public function setInstructionProcessor(\HTML5\InstructionProcessor $proc) {
$this->processor = $proc;
}
/**
 * Handle a DOCTYPE declaration.
 *
 * The inbound doctype is not preserved (the document is always HTML5); this
 * event only records the quirks flag and advances the insert mode.
 *
 * @param string $name
 *   The doctype name; only used in the error message.
 * @param int $idType
 *   Unused by this implementation.
 * @param string $id
 *   Unused by this implementation.
 * @param boolean $quirks
 *   Stored as the builder's quirks-mode flag.
 */
public function doctype($name, $idType = 0, $id = NULL, $quirks = FALSE) {
    // The quirks flag is recorded even when the DOCTYPE is misplaced.
    $this->quirks = $quirks;
    if ($this->insertMode <= static::IM_INITIAL) {
        $this->insertMode = static::IM_BEFORE_HTML;
    }
    else {
        $this->parseError("Illegal placement of DOCTYPE tag. Ignoring: " . $name);
    }
}
/**
* Process the start tag.
*
* Creates an element for the tag, copies its attributes, attaches it to
* the tree (directly, or through the rules engine when the tag has special
* rules), and updates the insert mode.
*
* @todo
* - XMLNS namespace handling (we need to parse, even if it's not valid)
* - XLink, MathML and SVG namespace handling
* - Omission rules: 8.1.2.4 Optional tags
*
* @param string $name
* The tag name.
* @param array $attributes
* Attribute names mapped to values.
* @param boolean $selfClosing
* NOTE(review): accepted but never read in this method; self-closing
* syntax is currently not honoured (see the XXX note below).
* @return mixed
* The result of Elements::element($name): the element mask, or FALSE
* when the name is not a known element.
*/
public function startTag($name, $attributes = array(), $selfClosing = FALSE) {
$lname = $this->normalizeTagName($name);
// Make sure we have an html element.
if (!$this->doc->documentElement && $name !== 'html') {
$this->startTag('html');
}
// Set quirks mode if we're at IM_INITIAL with no doctype.
if ($this->insertMode == static::IM_INITIAL) {
$this->quirks = TRUE;
$this->parseError("No DOCTYPE specified.");
}
// SPECIAL TAG HANDLING:
// Spec says do this, and "don't ask."
// NOTE(review): only $name is rewritten here. $lname was computed above
// and is what createElement() receives, so the element that is actually
// created is still named "image" -- confirm whether that is intended.
if ($name == 'image') {
$name = 'img';
}
// Autoclose p tags where appropriate.
if ($this->insertMode >= static::IM_IN_BODY && Elements::isA($name, Elements::AUTOCLOSE_P)) {
$this->autoclose('p');
}
// Set insert mode:
switch ($name) {
case 'html':
$this->insertMode = static::IM_BEFORE_HEAD;
break;
case 'head':
if ($this->insertMode > static::IM_BEFORE_HEAD) {
$this->parseError("Unexpected head tag outside of head context.");
}
else {
$this->insertMode = static::IM_IN_HEAD;
}
break;
case 'body':
$this->insertMode = static::IM_IN_BODY;
break;
case 'svg':
$this->insertMode = static::IM_IN_SVG;
break;
case 'math':
$this->insertMode = static::IM_IN_MATHML;
break;
case 'noscript':
if ($this->insertMode == static::IM_IN_HEAD) {
$this->insertMode = static::IM_IN_HEAD_NOSCRIPT;
}
break;
}
// Special case handling for SVG.
if ($this->insertMode == static::IM_IN_SVG) {
$lname = Elements::normalizeSvgElement($lname);
}
// createElement() throws a DOMException for names it rejects; fall back
// to a placeholder element so the tree stays well-formed.
try {
$ele = $this->doc->createElement($lname);
}
catch(\DOMException $e) {
$this->parseError("Illegal tag name: <$lname>. Replaced with <invalid>.");
$ele = $this->doc->createElement('invalid');
}
foreach ($attributes as $aName => $aVal) {
if ($this->insertMode == static::IM_IN_SVG) {
$aName = Elements::normalizeSvgAttribute($aName);
}
elseif ($this->insertMode == static::IM_IN_MATHML) {
$aName = Elements::normalizeMathMlAttribute($aName);
}
// Attributes with illegal names are reported and skipped.
try {
$ele->setAttribute($aName, $aVal);
}
catch(\DOMException $e) {
$this->parseError("Illegal attribute name for tag $name. Ignoring: $aName");
continue;
}
// This is necessary on a non-DTD schema, like HTML5.
if ($aName == 'id') {
$ele->setIdAttribute('id', TRUE);
}
}
// Some elements have special processing rules. Handle those separately.
if ($this->rules->hasRules($name)) {
$this->current = $this->rules->evaluate($ele, $this->current);
}
// Otherwise, it's a standard element.
else {
$this->current->appendChild($ele);
// XXX: Need to handle self-closing tags and unary tags.
// Void elements cannot have children, so they never become the
// current insertion point.
if (!Elements::isA($name, Elements::VOID_TAG)) {
$this->current = $ele;
}
}
// This is sort of a last-ditch attempt to correct for cases where no head/body
// elements are provided.
if ($this->insertMode <= static::IM_BEFORE_HEAD && $name != 'head' && $name != 'html') {
$this->insertMode = static::IM_IN_BODY;
}
// Return the element mask, which the tokenizer can then use to set
// various processing rules.
return Elements::element($name);
}
/**
 * Process an end tag.
 *
 * Closes the nearest open element with the given name (moving the current
 * node to that element's parent) and updates the insert mode for the tags
 * that delimit a mode: head, body, svg and math.
 *
 * @param string $name
 *   The tag name.
 */
public function endTag($name) {
    $lname = $this->normalizeTagName($name);
    // Ignore closing tags for unary elements.
    if (Elements::isA($name, Elements::VOID_TAG)) {
        return;
    }
    if ($this->insertMode <= static::IM_BEFORE_HTML) {
        // 8.2.5.4.2
        if (in_array($name, array('html', 'br', 'head', 'title'))) {
            // startTag('html') moves the insert mode past IM_BEFORE_HTML,
            // so the recursive endTag() call does not re-enter this branch.
            $this->startTag('html');
            $this->endTag($name);
            $this->insertMode = static::IM_BEFORE_HEAD;
            return;
        }
        // Ignore the tag.
        $this->parseError("Illegal closing tag at global scope.");
        return;
    }
    // Special case handling for SVG.
    if ($this->insertMode == static::IM_IN_SVG) {
        $lname = Elements::normalizeSvgElement($lname);
    }
    // XXX: HTML has no parent. What do we do, though,
    // if this element appears in the wrong place?
    if ($lname == 'html') {
        return;
    }
    if (!$this->autoclose($lname)) {
        $this->parseError('Could not find closing tag for ' . $lname);
    }
    switch ($lname) {
        case "head":
            $this->insertMode = static::IM_AFTER_HEAD;
            break;
        case "body":
            $this->insertMode = static::IM_AFTER_BODY;
            break;
        case "svg":
        // startTag() enters IM_IN_MATHML on "math", so "math" is the tag
        // that has to leave it again. "mathml" is kept for backward
        // compatibility with the previous behaviour.
        case "math":
        case "mathml":
            $this->insertMode = static::IM_IN_BODY;
            break;
    }
}
/**
 * Append a comment node to the current node.
 *
 * @param string $cdata
 *   The comment text.
 */
public function comment($cdata) {
    // TODO: Need to handle case where comment appears outside of the HTML tag.
    $this->current->appendChild($this->doc->createComment($cdata));
}
/**
 * Append a text node to the current node.
 *
 * Text that arrives before the insert mode reaches IM_IN_HEAD is dropped.
 * If the dropped text contains anything other than the whitespace
 * characters " \t\n\r\f", a parse error is recorded.
 *
 * @param string $data
 *   The (already entity-decoded) character data.
 */
public function text($data) {
    // XXX: Hmmm.... should we really be this strict?
    if ($this->insertMode < static::IM_IN_HEAD) {
        // Per '8.2.5.4.3 The "before head" insertion mode' the characters
        // " \t\n\r\f" should be ignored but no mention of a parse error. This is
        // practical as most documents contain these characters. Other text is not
        // expected here so recording a parse error is necessary.
        $dataTmp = trim($data, " \t\n\r\f");
        // Compare against '' explicitly: empty() also treats the string "0"
        // as empty, which silently swallowed that text without an error.
        if ('' !== $dataTmp) {
            $this->parseError("Unexpected text. Ignoring: " . $dataTmp);
        }
        return;
    }
    $node = $this->doc->createTextNode($data);
    $this->current->appendChild($node);
}
/**
* End-of-document event. This implementation intentionally does nothing.
*/
public function eof() {
// If the $current isn't the $root, do we need to do anything?
}
/**
* Record a parse error on the document.
*
* The message is appended to the document's "errors" array in the form
* "Line <line>, Col <col>: <msg>".
*
* @param string $msg
* The error message.
* @param int $line
* The line number; defaults to 0 when the caller does not know it.
* @param int $col
* The column number; defaults to 0 when the caller does not know it.
*/
public function parseError($msg, $line = 0, $col = 0) {
$this->doc->errors[] = sprintf("Line %d, Col %d: %s", $line, $col, $msg);
}
/**
 * Append a CDATA section to the current node.
 *
 * @param string $data
 *   The unparsed character data.
 */
public function cdata($data) {
    $this->current->appendChild($this->doc->createCDATASection($data));
}
/**
 * Handle a processing instruction.
 *
 * If an instruction processor has been set, the instruction is handed to
 * it and nothing is inserted by this method; otherwise a plain PI node is
 * appended to the current node.
 *
 * @param string $name
 *   The name of the processor (e.g. 'php').
 * @param string $data
 *   The unparsed data.
 */
public function processingInstruction($name, $data = NULL) {
    // XXX: Ignore initial XML declaration, per the spec.
    if ($this->insertMode == static::IM_INITIAL && 'xml' == strtolower($name)) {
        return;
    }
    if (!isset($this->processor)) {
        // No processor registered: this is just a dumb PI element.
        $pi = $this->doc->createProcessingInstruction($name, $data);
        $this->current->appendChild($pi);
        return;
    }
    // Important: The processor may modify the current DOM tree however
    // it sees fit. A non-empty return value becomes the new current node.
    $replacement = $this->processor->process($this->current, $name, $data);
    if (!empty($replacement)) {
        $this->current = $replacement;
    }
}
// ==========================================================================
// UTILITIES
// ==========================================================================
/**
* Apply normalization rules to a tag name.
*
* See sections 2.9 and 8.1.2.
*
* Currently this returns the name unchanged; the namespace-prefix
* stripping below is deliberately disabled.
*
* @param string $name
* The tag name.
* @return string
* The normalized tag name.
*/
protected function normalizeTagName($name) {
/* Section 2.9 suggests that we should not do this.
if (strpos($name, ':') !== FALSE) {
// We know from the grammar that there must be at least one other
// char besides :, since : is not a legal tag start.
$parts = explode(':', $name);
return array_pop($parts);
}
*/
return $name;
}
/**
* Placeholder for quirks-mode tree resolution.
*
* @param string $name
* The tag name.
* @throws \Exception
* Always; the method is not implemented.
*/
protected function quirksTreeResolver($name) {
throw new \Exception("Not implemented.");
}
/**
 * Climb from the current node and close the nearest element named $tag.
 *
 * "Closing" means the parent of the matching element becomes the current
 * node. The climb stops at the first non-element node (such as the
 * document itself).
 *
 * @param string $tag
 *   The tag name to look for.
 * @return boolean
 *   TRUE if a matching element was found and closed, FALSE otherwise (in
 *   which case the current node is left untouched).
 */
protected function autoclose($tag) {
    $node = $this->current;
    while (TRUE) {
        if ($node->nodeType != XML_ELEMENT_NODE) {
            return FALSE;
        }
        if ($node->tagName == $tag) {
            $this->current = $node->parentNode;
            return TRUE;
        }
        $node = $node->parentNode;
        if (!$node) {
            return FALSE;
        }
    }
}
/**
 * Check whether the current node, or any element above it, has the tag name.
 *
 * The walk starts at $this->current and stops at the first node that is
 * not an element.
 *
 * @param string $tagname
 *   The tag name to look for.
 * @return boolean
 *   TRUE if a matching element was found, FALSE otherwise.
 */
protected function isAncestor($tagname) {
    for ($node = $this->current; $node->nodeType === XML_ELEMENT_NODE; $node = $node->parentNode) {
        if ($node->tagName == $tagname) {
            return TRUE;
        }
    }
    return FALSE;
}
/**
* Returns TRUE if the current node has the given tagname.
*
* The current node ($this->current) is the node that startTag() appends
* new elements to, i.e. the parent of whatever is inserted next.
*
* NOTE(review): the comparison reads ->tagName directly, so this presumably
* expects the current node to be an element -- confirm against callers.
*/
protected function isParent($tagname) {
return $this->current->tagName == $tagname;
}
}

View File

@ -0,0 +1,111 @@
<?php
namespace HTML5\Parser;
/**
* Standard events for HTML5.
*
* This is roughly analogous to a SAX2 or expat-style interface.
* However, it is tuned specifically for HTML5, according to section 8
* of the HTML5 specification.
*
* An event handler receives parser events. For a concrete
* implementation, see DOMTreeBuilder.
*
* Quirks support in the parser is limited to close-in syntax (malformed
* tags or attributes). Higher order syntax and semantic issues with a
* document (e.g. mismatched tags, illegal nesting, etc.) are the
* responsibility of the event handler implementation.
*
* See HTML5 spec section 8.2.4
*/
interface EventHandler {
const DOCTYPE_NONE = 0;
const DOCTYPE_PUBLIC = 1;
const DOCTYPE_SYSTEM = 2;
/**
* A doctype declaration.
*
* @param string $name
* The name of the root element.
* @param int $idType
* One of DOCTYPE_NONE, DOCTYPE_PUBLIC, or DOCTYPE_SYSTEM.
* @param string $id
* The identifier. For DOCTYPE_PUBLIC, this is the public ID. If DOCTYPE_SYSTEM,
* then this is a system ID.
* @param boolean $quirks
* Indicates whether the builder should enter quirks mode.
*/
public function doctype($name, $idType = 0, $id = NULL, $quirks = FALSE);
/**
* A start tag.
*
* IMPORTANT: The parser watches the return value of this event. If this returns
* an integer, the parser will switch TEXTMODE patterns according to the int.
*
* This is how the Tree Builder can tell the Tokenizer when a certain tag should
* cause the parser to go into RAW text mode.
*
* The HTML5 standard requires that the builder is the one that initiates this
* step, and this is the only way short of a circular reference that we can
* do that.
*
* Example: if a startTag event for a `script` name is fired, and the startTag()
* implementation returns Tokenizer::TEXTMODE_RAW, then the tokenizer will
* switch into RAW text mode and consume data until it reaches a closing
* `script` tag.
*
* The textmode is automatically reset to Tokenizer::TEXTMODE_NORMAL when the
* closing tag is encountered. **This behavior may change.**
*
* @param string $name
* The tag name.
* @param array $attributes
* An array with all of the tag's attributes.
* @param boolean $selfClosing
* An indicator of whether or not this tag is self-closing (<foo/>)
* @return numeric
* One of the Tokenizer::TEXTMODE_* constants.
*/
public function startTag($name, $attributes = array(), $selfClosing = FALSE);
/**
* An end-tag.
*
* @param string $name
* The tag name.
*/
public function endTag($name);
/**
* A comment section (unparsed character data).
*
* @param string $cdata
* The comment text.
*/
public function comment($cdata);
/**
* A unit of parsed character data.
*
* Entities in this text are *already decoded*.
*
* @param string $cdata
* The character data.
*/
public function text($cdata);
/**
* Indicates that the document has been entirely processed.
*/
public function eof();
/**
* Emitted when the parser encounters an error condition.
*
* @param string $msg
* A description of the error.
* @param int $line
* The line on which the error was detected.
* @param int $col
* The column at which the error was detected.
*/
public function parseError($msg, $line, $col);
/**
* A CDATA section.
*
* @param string $data
* The unparsed character data.
*/
public function cdata($data);
/**
* This is a holdover from the XML spec.
*
* While user agents don't get PIs, server-side does.
*
* @param string $name
* The name of the processor (e.g. 'php').
* @param string $data
* The unparsed data.
*/
public function processingInstruction($name, $data = NULL);
}

View File

@ -0,0 +1,35 @@
<?php
namespace HTML5\Parser;
/**
 * An input stream whose content is read from a file or URL.
 *
 * @todo A buffered input stream would be useful.
 */
class FileInputStream extends StringInputStream implements InputStream {

    /*
     * The whole file is currently read into a string and then handled by
     * StringInputStream. That was chosen for expediency and because it
     * allows processing arbitrarily large chunks of input at once; a future
     * rewrite should use lower-level stream reads so that large documents
     * can be handled efficiently.
     */

    /**
     * Load a file input stream.
     *
     * @param string $data
     *   The file or url path to load.
     * @param string $encoding
     *   The encoding of the file content; passed through to the parent.
     * @param string $debug
     *   Passed through to the parent constructor.
     */
    public function __construct($data, $encoding = 'UTF-8', $debug = '') {
        // NOTE(review): file_get_contents() returns FALSE on failure and
        // that value is forwarded unchanged -- confirm the parent copes.
        parent::__construct(file_get_contents($data), $encoding, $debug);
    }
}

View File

@ -0,0 +1,88 @@
<?php
namespace HTML5\Parser;
/**
* Interface for stream readers.
*
* The parser only reads from streams. Various input sources can write
* an adapter to this InputStream.
*
* Currently provided InputStream implementations include
* FileInputStream and StringInputStream.
*/
interface InputStream extends \Iterator {
/**
* Returns the current line that is being consumed.
*
* TODO: Move this to the scanner.
*/
public function currentLine();
/**
* Returns the current column of the current line that the tokenizer is at.
*
* Newlines are column 0. The first char after a newline is column 1.
*
* @TODO Move this to the scanner.
*
* @return int
* The column number.
*/
public function columnOffset();
/**
* Get all characters until EOF.
*
* This consumes characters until the EOF.
*/
public function remainingChars();
/**
* Read to a particular match (or until $max bytes are consumed).
*
* This operates on byte sequences, not characters.
*
* Matches as far as possible until we reach a certain set of bytes
* and returns the matched substring.
*
* @see strcspn
* @param string $bytes
* Bytes to match.
* @param int $max
* Maximum number of bytes to scan.
* @return mixed
* The matched substring, which may be empty, or FALSE when nothing
* can be read. Use strict comparison to tell the two apart.
*/
public function charsUntil($bytes, $max = null);
/**
* Returns the string so long as $bytes matches.
*
* Matches as far as possible with a certain set of bytes
* and returns the matched substring.
*
* @see strspn
* @param string $bytes
* A mask of bytes to match. If ANY byte in this mask matches the
* current char, the pointer advances and the char is part of the
* substring.
* @param int $max
* The max number of chars to read.
*/
public function charsWhile($bytes, $max = null);
/**
* Unconsume characters, moving the pointer backwards.
*
* @param int $howMany
* The number of characters to move the pointer back (one by default).
*/
public function unconsume($howMany = 1);
/**
* Retrieve the next character without advancing the pointer.
*/
public function peek();
}

View File

@ -0,0 +1,8 @@
<?php
namespace HTML5\Parser;
/**
* Emit when the parser has an error.
*
* Declares no members of its own; it extends PHP's built-in \Exception.
*/
class ParseError extends \Exception {
}

View File

@ -0,0 +1,53 @@
# The Parser Model
The parser model here follows the model in section
[8.2.1](http://www.w3.org/TR/2012/CR-html5-20121217/syntax.html#parsing)
of the HTML5 specification, though we do not assume a networking layer.
[ InputStream ] // Generic support for reading input.
||
[ Scanner ] // Breaks down the stream into characters.
||
[ Tokenizer ] // Groups characters into syntactic units.
||
[ Tree Builder ] // Organizes units into a tree of objects
||
[ DOM Document ] // The final state of the parsed document.
## InputStream
This is an interface with at least two concrete implementations:
- StringInputStream: Reads an HTML5 string.
- FileInputStream: Reads an HTML5 file.
## Scanner
This is a mechanical piece of the parser.
## Tokenizer
This follows section 8.2.4 of the HTML5 spec. It is (roughly) a recursive
descent parser. (Though there are plenty of optimizations that are less
than purely functional.)
## EventHandler and DOMTree
EventHandler is the interface for tree builders. Since not all
implementations will necessarily build trees, we've chosen a more
generic name.
The tokenizer emits tokens to the event handler during tokenization.
The DOMTree is an event handler that builds a DOM tree. The output of
the DOMTree builder is a DOMDocument.
## DOMDocument
PHP has a DOMDocument class built-in (technically, it's part of libxml.)
We use that, thus rendering the output of this process compatible with
SimpleXML, QueryPath, and many other XML/HTML processing tools.
For cases where the HTML5 input is a fragment of an HTML5 document, a
DOMDocumentFragment is returned instead. This is another built-in class.

View File

@ -0,0 +1,207 @@
<?php
namespace HTML5\Parser;
/**
 * The scanner.
 *
 * A thin layer over an input stream: it exposes character-level navigation
 * (peek/next/current/consume/unconsume) and helpers that read runs of
 * characters belonging to a given class.
 */
class Scanner {

    // Character classes used as strspn()-style masks by the get*() helpers.
    // The alphabetic masks must list every letter a-z and A-Z: a missing
    // letter makes the run-reading helpers stop early at that letter.
    const CHARS_HEX = 'abcdefABCDEF0123456789';
    const CHARS_ALNUM = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789';
    const CHARS_ALPHA = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ';

    /**
     * The input stream being scanned.
     */
    protected $is;

    // Flipping this to TRUE will give minisculely more debugging info.
    public $debug = FALSE;

    /**
     * Create a new Scanner.
     *
     * @param \HTML5\Parser\InputStream $input
     *   An InputStream to be scanned.
     */
    public function __construct($input) {
        $this->is = $input;
    }

    /**
     * Get the current position.
     *
     * @return int
     *   The current integer byte position.
     */
    public function position() {
        return $this->is->key();
    }

    /**
     * Take a peek at the next character in the data.
     *
     * @return string
     *   The next character.
     */
    public function peek() {
        return $this->is->peek();
    }

    /**
     * Get the next character.
     *
     * Note: This advances the pointer.
     *
     * @return string|bool
     *   The next character, or FALSE when the stream is exhausted.
     */
    public function next() {
        $this->is->next();
        if ($this->is->valid()) {
            if ($this->debug) fprintf(STDOUT, "> %s\n", $this->is->current());
            return $this->is->current();
        }
        return FALSE;
    }

    /**
     * Get the current character.
     *
     * Note, this does not advance the pointer.
     *
     * @return string|bool
     *   The current character, or FALSE when the stream is exhausted.
     */
    public function current() {
        if ($this->is->valid()) {
            return $this->is->current();
        }
        return FALSE;
    }

    /**
     * Silently consume N chars.
     *
     * @param int $count
     *   The number of characters to skip.
     */
    public function consume($count = 1) {
        for ($i = 0; $i < $count; ++$i) {
            $this->next();
        }
    }

    /**
     * Unconsume some of the data. This moves the data pointer backwards.
     *
     * @param int $howMany
     *   The number of characters to move the pointer back.
     */
    public function unconsume($howMany = 1) {
        $this->is->unconsume($howMany);
    }

    /**
     * Get the next group of hex characters.
     *
     * Note, along with getting the characters the pointer in the data will be
     * moved as well.
     *
     * @return string
     *   The next group that is hex characters.
     */
    public function getHex() {
        return $this->is->charsWhile(static::CHARS_HEX);
    }

    /**
     * Get the next group of characters that are ASCII Alpha characters.
     *
     * Note, along with getting the characters the pointer in the data will be
     * moved as well.
     *
     * @return string
     *   The next group of ASCII alpha characters.
     */
    public function getAsciiAlpha() {
        return $this->is->charsWhile(static::CHARS_ALPHA);
    }

    /**
     * Get the next group of characters that are ASCII Alpha characters and numbers.
     *
     * Note, along with getting the characters the pointer in the data will be
     * moved as well.
     *
     * @return string
     *   The next group of ASCII alpha characters and numbers.
     */
    public function getAsciiAlphaNum() {
        return $this->is->charsWhile(static::CHARS_ALNUM);
    }

    /**
     * Get the next group of numbers.
     *
     * Note, along with getting the characters the pointer in the data will be
     * moved as well.
     *
     * @return string
     *   The next group of numbers.
     */
    public function getNumeric() {
        return $this->is->charsWhile('0123456789');
    }

    /**
     * Consume whitespace.
     *
     * Whitespace in HTML5 is: formfeed, tab, newline, space.
     *
     * @return string
     *   Whatever the stream's charsWhile() returns for the whitespace mask.
     */
    public function whitespace() {
        return $this->is->charsWhile("\n\t\f ");
    }

    /**
     * Returns the current line that is being consumed.
     *
     * @return int
     *   The current line number.
     */
    public function currentLine() {
        return $this->is->currentLine();
    }

    /**
     * Read chars until something in the mask is encountered.
     *
     * @param string $mask
     *   The set of bytes that stops the read.
     */
    public function charsUntil($mask) {
        return $this->is->charsUntil($mask);
    }

    /**
     * Read chars as long as the mask matches.
     *
     * @param string $mask
     *   The set of bytes that may be consumed.
     */
    public function charsWhile($mask) {
        return $this->is->charsWhile($mask);
    }

    /**
     * Returns the current column of the current line that the tokenizer is at.
     *
     * Newlines are column 0. The first char after a newline is column 1.
     *
     * @return int
     *   The column number.
     */
    public function columnOffset() {
        return $this->is->columnOffset();
    }

    /**
     * Get all characters until EOF.
     *
     * This consumes characters until the EOF.
     *
     * @return string
     *   Whatever the stream's remainingChars() returns, i.e. the remaining
     *   text of the input.
     */
    public function remainingChars() {
        return $this->is->remainingChars();
    }
}

View File

@ -0,0 +1,315 @@
<?php
/**
* Loads a string to be parsed.
*/
namespace HTML5\Parser;
/*
*
* Based on code from html5lib:
Copyright 2009 Geoffrey Sneddon <http://gsnedders.com/>
Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
// Some conventions:
// - /* */ indicates verbatim text from the HTML 5 specification
// MPB: Not sure which version of the spec. Moving from HTML5lib to
// HTML5-PHP, I have been using this version:
// http://www.w3.org/TR/2012/CR-html5-20121217/Overview.html#contents
//
// - // indicates regular comments
class StringInputStream implements InputStream {
/**
* The string data we're parsing.
*/
private $data;
/**
* The current integer byte position we are in $data
*/
private $char;
/**
* Length of $data; when $char === $EOF, we are at the end-of-file.
*/
private $EOF;
/**
* Parse errors.
*/
public $errors = array();
/**
 * Create a new InputStream wrapper.
 *
 * The data is converted to UTF-8, checked for illegal code points (the
 * findings are stored in $this->errors; nothing is thrown), and has its
 * line endings normalized before the read pointer is reset to 0.
 *
 * @param string $data
 *   Data to parse.
 * @param string $encoding
 *   The encoding of $data.
 * @param string $debug
 *   When non-empty, used as an fprintf() format string that receives the
 *   converted data and its byte length; the result goes to STDOUT.
 */
public function __construct($data, $encoding = 'UTF-8', $debug = '') {
    $utf8 = UTF8Utils::convertToUTF8($data, $encoding);
    if ($debug) {
        fprintf(STDOUT, $debug, $utf8, strlen($utf8));
    }
    // There is good reason to question whether it makes sense to do this
    // here, since most of these checks are done during parsing, and since
    // the result is only recorded, never acted upon.
    $this->errors = UTF8Utils::checkForIllegalCodepoints($utf8);
    $this->data = $this->replaceLinefeeds($utf8);
    $this->char = 0;
    $this->EOF = strlen($this->data);
}
/**
 * Normalize line endings (and NUL bytes) according to the spec.
 *
 * @param string $data
 *   The input text.
 * @return string
 *   The text with every CRLF pair and every lone CR turned into LF, and
 *   every NUL byte replaced by the UTF-8 encoding of U+FFFD.
 */
protected function replaceLinefeeds($data) {
    /* U+000D CARRIAGE RETURN (CR) characters and U+000A LINE FEED
    (LF) characters are treated specially. Any CR characters
    that are followed by LF characters must be removed, and any
    CR characters not followed by LF characters must be converted
    to LF characters. Thus, newlines in HTML DOMs are represented
    by LF characters, and there are never any CR characters in the
    input to the tokenization stage. */
    return strtr($data, array(
        "\0" => "\xEF\xBF\xBD",
        "\r\n" => "\n",
        "\r" => "\n",
    ));
}
/**
 * Returns the current line that the tokenizer is at.
 *
 * @return int
 *   1 for empty input or when nothing has been consumed yet; otherwise one
 *   more than the number of LF bytes before the current position (the
 *   position is capped at the end of the data).
 */
public function currentLine() {
    if (!empty($this->EOF) && $this->char != 0) {
        $consumed = min($this->char, $this->EOF);
        return substr_count($this->data, "\n", 0, $consumed) + 1;
    }
    return 1;
}
/**
 * Alias of currentLine().
 *
 * @deprecated Use currentLine() instead.
 *
 * @return int
 *   The current line number.
 */
public function getCurrentLine() {
    // Must be invoked on $this: a bare currentLine() call refers to a
    // (non-existent) function, not to the method, and is a fatal error.
    return $this->currentLine();
}
/**
 * Returns the current column of the current line that the tokenizer is at.
 *
 * Newlines are column 0. The first char after a newline is column 1.
 * The column is counted with UTF8Utils::countChars() rather than in bytes.
 *
 * @return int
 *   The column number.
 */
public function columnOffset() {
    // Short circuit for the first char.
    if ($this->char == 0) {
        return 0;
    }
    // strrpos() needs a negative offset to search backwards for the last
    // "\n" before $this->char. Nothing is added to make it point at the
    // next character because strrpos() already includes the final offset
    // byte in its search.
    $searchOffset = $this->char - 1 - strlen($this->data);
    $newlineAt = strrpos($this->data, "\n", $searchOffset);
    if ($newlineAt === FALSE) {
        // No earlier newline: measure from the start of the data.
        $segment = substr($this->data, 0, $this->char);
    }
    else {
        // Measure the text between that newline and the current position.
        $segment = substr($this->data, $newlineAt + 1, $this->char - 1 - $newlineAt);
    }
    return UTF8Utils::countChars($segment);
}
/**
* Alias of columnOffset().
*
* @deprecated Use columnOffset() instead.
*
* @return int
* The column number.
*/
public function getColumnOffset() {
return $this->columnOffset();
}
/**
* Get the current character.
*
* @return string
* The current character.
*/
public function current() {
  // Read the byte under the pointer. No bounds check is done here.
  $position = $this->char;
  return $this->data[$position];
}
/**
* Advance the pointer. This is part of the Iterator interface.
*/
public function next() {
  // Step the read pointer forward by one position.
  $this->char += 1;
}
/**
* Rewind to the start of the string.
*/
public function rewind() {
// Move the read pointer back to offset 0 so reading restarts from the
// beginning of the data.
$this->char = 0;
}
/**
* Is the current pointer location valid.
*
* @return bool
* Is the current pointer location valid.
*/
public function valid() {
  // The pointer is usable only while it sits before the EOF marker.
  return $this->char < $this->EOF;
}
/**
* Get all characters until EOF.
*
* This reads to the end of the file, and sets the read marker at the
* end of the file.
*
* @note This performs bounds checking
*
* @return string
* Returns the remaining text. If called when the InputStream is
* already exhausted, it returns an empty string.
*/
public function remainingChars() {
  // Already at (or past) EOF: there is nothing left to hand back.
  if (!($this->char < $this->EOF)) {
    return '';
  }
  // Grab everything from the pointer onwards, then park the pointer at EOF.
  $rest = substr($this->data, $this->char);
  $this->char = $this->EOF;
  return $rest;
}
/**
* Read to a particular match (or until $max bytes are consumed).
*
* This operates on byte sequences, not characters.
*
* Matches as far as possible until we reach a certain set of bytes
* and returns the matched substring.
*
* @param string $bytes
* Bytes to match.
* @param int $max
* Maximum number of bytes to scan.
* @return mixed
* The substring that was read (possibly empty), or FALSE if the stream
* is already at EOF. Use strict comparison when checking the result,
* since an empty string is also falsy.
*/
public function charsUntil($bytes, $max = null) {
  // Nothing can be read once the pointer has reached EOF.
  if ($this->char >= $this->EOF) {
    return FALSE;
  }
  // Only pass a length to strcspn() when the caller supplied one; an
  // explicit 0 counts as a supplied limit.
  $hasLimit = ($max === 0 || $max);
  $span = $hasLimit
    ? strcspn($this->data, $bytes, $this->char, $max)
    : strcspn($this->data, $bytes, $this->char);
  // Copy out the run of bytes that contains none of $bytes, then skip it.
  $matched = (string) substr($this->data, $this->char, $span);
  $this->char += $span;
  return $matched;
}
/**
* Returns the string so long as $bytes matches.
*
* Matches as far as possible with a certain set of bytes
* and returns the matched substring.
*
* @param string $bytes
* A mask of bytes to match. If ANY byte in this mask matches the
* current char, the pointer advances and the char is part of the
* substring.
* @param int $max
* The max number of chars to read.
*/
public function charsWhile($bytes, $max = null) {
  // Nothing can be read once the pointer has reached EOF.
  if ($this->char >= $this->EOF) {
    return FALSE;
  }
  // Only pass a length to strspn() when the caller supplied one; an
  // explicit 0 counts as a supplied limit.
  $hasLimit = ($max === 0 || $max);
  $span = $hasLimit
    ? strspn($this->data, $bytes, $this->char, $max)
    : strspn($this->data, $bytes, $this->char);
  // Copy out the run of bytes that all belong to $bytes, then skip it.
  $matched = (string) substr($this->data, $this->char, $span);
  $this->char += $span;
  return $matched;
}
/**
* Unconsume characters.
*
* @param int $howMany
* The number of characters to unconsume.
*/
public function unconsume($howMany = 1) {
  $target = $this->char - $howMany;
  // Requests that would move the pointer before the start are ignored.
  if ($target >= 0) {
    $this->char = $target;
  }
}
/**
* Look ahead without moving cursor.
*/
public function peek() {
  // valid() treats EOF as the first position that may NOT be read, so the
  // look-ahead index has to be strictly below EOF. Using "<=" here let the
  // method read offset EOF itself when the pointer was on the last valid
  // position, which is an out-of-range string offset.
  $ahead = $this->char + 1;
  if ($ahead < $this->EOF) {
    return $this->data[$ahead];
  }
  // There is no following byte to look at.
  return FALSE;
}
public function key() {
  // The iterator key is simply the current pointer offset.
  $position = $this->char;
  return $position;
}
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,114 @@
<?php
namespace HTML5\Parser;
use HTML5\Elements;
/**
* Handles special-case rules for the DOM tree builder.
*
* Many tags have special rules that need to be accommodated on an
* individual basis. This class handles those rules.
*
* See section 8.1.2.4 of the spec.
*
* @todo
* - colgroup and col special behaviors
* - body and head special behaviors
*/
class TreeBuildingRules {

  /**
   * Tag names that have a special rule, used as a lookup set.
   *
   * Only the keys matter; hasRules() tests them with isset().
   */
  protected static $tags = array(
    'li' => 1,
    'dd' => 1,
    'dt' => 1,
    'rt' => 1,
    'rp' => 1,
    'tr' => 1,
    'th' => 1,
    'td' => 1,
    'thead' => 1,
    'tfoot' => 1,
    'tbody' => 1,
    'table' => 1,
    'optgroup' => 1,
    'option' => 1,
  );

  /**
   * The DOM document passed to the constructor.
   *
   * Declared explicitly so that assigning it does not create a dynamic
   * property. It is public because that is the visibility the implicitly
   * created property had before.
   *
   * @var \DOMDocument
   */
  public $doc;

  /**
   * Build a new rules engine.
   *
   * @param \DOMDocument $doc
   *   The DOM document to use for evaluation and modification.
   */
  public function __construct($doc) {
    $this->doc = $doc;
  }

  /**
   * Returns TRUE if the given tagname has special processing rules.
   *
   * @param string $tagname
   *   The tag name to look up.
   *
   * @return bool
   *   TRUE when the tag name is one of the keys of self::$tags.
   */
  public function hasRules($tagname) {
    return isset(static::$tags[$tagname]);
  }

  /**
   * Evaluate the rule for the current tag name.
   *
   * This may modify the existing DOM.
   *
   * @param \DOMElement $new
   *   The element that is about to be inserted.
   * @param \DOMElement $current
   *   The element that is currently open.
   *
   * @return \DOMElement
   *   The new Current DOM element. For tags without a rule this is
   *   $current, unchanged.
   */
  public function evaluate($new, $current) {
    switch ($new->tagName) {
      case 'li':
        return $this->handleLI($new, $current);

      case 'dt':
      case 'dd':
        return $this->handleDT($new, $current);

      case 'rt':
      case 'rp':
        return $this->handleRT($new, $current);

      case 'optgroup':
        return $this->closeIfCurrentMatches($new, $current, array('optgroup'));

      case 'option':
        return $this->closeIfCurrentMatches($new, $current, array('option', 'optgroup'));

      case 'tr':
        return $this->closeIfCurrentMatches($new, $current, array('tr'));

      case 'td':
      case 'th':
        return $this->closeIfCurrentMatches($new, $current, array('th', 'td'));

      case 'tbody':
      case 'thead':
      case 'tfoot':
      case 'table': // Spec isn't explicit about this, but it's necessary.
        return $this->closeIfCurrentMatches($new, $current, array('thead', 'tfoot', 'tbody'));
    }
    // No special rule: the current element stays as it is.
    return $current;
  }

  /**
   * Rule for li: a new li implicitly closes an open li.
   */
  protected function handleLI($ele, $current) {
    return $this->closeIfCurrentMatches($ele, $current, array('li'));
  }

  /**
   * Rule for dt/dd: either one implicitly closes an open dt or dd.
   */
  protected function handleDT($ele, $current) {
    return $this->closeIfCurrentMatches($ele, $current, array('dt', 'dd'));
  }

  /**
   * Rule for rt/rp: either one implicitly closes an open rt or rp.
   */
  protected function handleRT($ele, $current) {
    return $this->closeIfCurrentMatches($ele, $current, array('rt', 'rp'));
  }

  /**
   * Insert $ele as a sibling or as a child of $current.
   *
   * @param \DOMElement $ele
   *   The element to insert.
   * @param \DOMElement $current
   *   The element that is currently open.
   * @param array $match
   *   Tag names that $ele implicitly closes.
   *
   * @return \DOMElement
   *   Always $ele, which becomes the new current element.
   */
  protected function closeIfCurrentMatches($ele, $current, $match) {
    // Tag names are strings on both sides, so compare strictly.
    if (in_array($current->tagName, $match, TRUE)) {
      // The open element is implicitly closed: attach next to it.
      $current->parentNode->appendChild($ele);
    }
    else {
      // Otherwise the new element nests inside the open one.
      $current->appendChild($ele);
    }
    return $ele;
  }
}

View File

@ -0,0 +1,187 @@
<?php
/*
*
* Portions based on code from html5lib files with the following copyright:
Copyright 2009 Geoffrey Sneddon <http://gsnedders.com/>
Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
namespace HTML5\Parser;
/**
* UTF-8 Utilities
*/
class UTF8Utils {

  /**
   * The Unicode replacement character (U+FFFD), encoded as UTF-8.
   */
  const FFFD = "\xEF\xBF\xBD";

  /**
   * Count the number of characters in a string.
   *
   * UTF-8 aware. This will try (in order) iconv, mbstring, utf8_decode(),
   * and finally a custom byte counter.
   *
   * @todo Move this to a general utility class.
   *
   * @param string $string
   *   The UTF-8 string to measure.
   *
   * @return int
   *   The number of characters.
   */
  public static function countChars($string) {
    // Get the length for the string we need.
    if (function_exists('iconv_strlen')) {
      return iconv_strlen($string, 'utf-8');
    }
    elseif (function_exists('mb_strlen')) {
      return mb_strlen($string, 'utf-8');
    }
    elseif (function_exists('utf8_decode')) {
      // MPB: Will this work? Won't certain decodes lead to two chars
      // extrapolated out of 2-byte chars?
      return strlen(utf8_decode($string));
    }
    // Last resort: count only the bytes that can start a character, i.e.
    // ASCII bytes and UTF-8 lead bytes. Continuation bytes are skipped.
    $count = count_chars($string);
    // 0x80 = 0x7F - 0 + 1 (one added to get inclusive range)
    // 0x33 = 0xF4 - 0xC2 + 1 (one added to get inclusive range)
    return array_sum(array_slice($count, 0, 0x80)) +
      array_sum(array_slice($count, 0xC2, 0x33));
  }

  /**
   * Convert data from the given encoding to UTF-8.
   *
   * This has not yet been tested with character sets other than UTF-8.
   * It should work with ISO-8859-1/-13 and standard Latin Win charsets.
   *
   * @param string $data
   *   The data to convert.
   * @param string $encoding
   *   A valid encoding. Examples: http://www.php.net/manual/en/mbstring.supported-encodings.php
   *
   * @return string
   *   The converted data, with one leading byte order mark removed.
   *
   * @throws \Exception
   *   When neither mbstring nor a usable iconv is available.
   */
  public static function convertToUTF8($data, $encoding = 'UTF-8') {
    /*
     * From the HTML5 spec:
     * Given an encoding, the bytes in the input stream must be
     * converted to Unicode characters for the tokeniser, as
     * described by the rules for that encoding, except that the
     * leading U+FEFF BYTE ORDER MARK character, if any, must not
     * be stripped by the encoding layer (it is stripped by the rule below).
     * Bytes or sequences of bytes in the original byte stream that
     * could not be converted to Unicode characters must be converted
     * to U+FFFD REPLACEMENT CHARACTER code points.
     */
    // mb_convert_encoding is chosen over iconv because of a bug. The best
    // details for the bug are on http://us1.php.net/manual/en/function.iconv.php#108643
    // which contains links to the actual bug reports as well as work around
    // details.
    if (function_exists('mb_convert_encoding')) {
      // mb library has the following behaviors:
      // - UTF-16 surrogates result in FALSE.
      // - Overlongs and outside Plane 16 result in empty strings.
      // Before we run mb_convert_encoding we need to tell it what to do with
      // characters it does not know. This could be different than the parent
      // application executing this library so we store the value, change it
      // to our needs, and then change it back when we are done. This feels
      // a little excessive and it would be great if there was a better way.
      $save = ini_get('mbstring.substitute_character');
      ini_set('mbstring.substitute_character', "none");
      $data = mb_convert_encoding($data, 'UTF-8', $encoding);
      ini_set('mbstring.substitute_character', $save);
    }
    // @todo Get iconv running in at least some environments if that is possible.
    elseif (function_exists('iconv') && $encoding != 'auto') {
      // fprintf(STDOUT, "iconv found\n");
      // iconv has the following behaviors:
      // - Overlong representations are ignored.
      // - Beyond Plane 16 is replaced with a lower char.
      // - Incomplete sequences generate a warning.
      $data = @iconv($encoding, 'UTF-8//IGNORE', $data);
    }
    else {
      // we can make a conforming native implementation
      // The leading backslash matters: inside a namespace an unqualified
      // "Exception" resolves to a class in that namespace, which does not
      // exist, so the throw itself would be a fatal error.
      throw new \Exception('Not implemented, please install mbstring or iconv');
    }
    /* One leading U+FEFF BYTE ORDER MARK character must be
       ignored if any are present. */
    if (substr($data, 0, 3) === "\xEF\xBB\xBF") {
      $data = substr($data, 3);
    }
    return $data;
  }

  /**
   * Checks for Unicode code points that are not valid in a document.
   *
   * @param string $data
   *   A string to analyze.
   *
   * @return array
   *   An array of (string) error messages produced by the scanning.
   *
   * @throws \Exception
   *   When the PCRE functions are not available.
   */
  public static function checkForIllegalCodepoints($data) {
    // Check PCRE is loaded.
    if (!function_exists('preg_match_all')) {
      // "new" is required: without it PHP tries to call a function named
      // Exception() instead of creating the exception object.
      throw new \Exception('The PCRE library is not loaded or is not available.');
    }
    // Vestigal error handling.
    $errors = array();
    /* All U+0000 NULL characters in the input must be replaced
       by U+FFFD REPLACEMENT CHARACTERs. Any occurrences of such
       characters is a parse error. */
    for ($i = 0, $count = substr_count($data, "\0"); $i < $count; $i++) {
      $errors[] = 'null-character';
    }
    /* Any occurrences of any characters in the ranges U+0001 to
       U+0008, U+000B, U+000E to U+001F, U+007F to U+009F,
       U+D800 to U+DFFF , U+FDD0 to U+FDEF, and
       characters U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF,
       U+3FFFE, U+3FFFF, U+4FFFE, U+4FFFF, U+5FFFE, U+5FFFF, U+6FFFE,
       U+6FFFF, U+7FFFE, U+7FFFF, U+8FFFE, U+8FFFF, U+9FFFE, U+9FFFF,
       U+AFFFE, U+AFFFF, U+BFFFE, U+BFFFF, U+CFFFE, U+CFFFF, U+DFFFE,
       U+DFFFF, U+EFFFE, U+EFFFF, U+FFFFE, U+FFFFF, U+10FFFE, and
       U+10FFFF are parse errors. (These are all control characters
       or permanently undefined Unicode characters.) */
    // The pattern works on raw UTF-8 bytes (no /u modifier). Its text is
    // left exactly as it was, so the lines stay flush-left. The comment on
    // the \xED line should read U+D800 to U+DFFF (the surrogate range).
    $count = preg_match_all(
      '/(?:
[\x01-\x08\x0B\x0E-\x1F\x7F] # U+0001 to U+0008, U+000B, U+000E to U+001F and U+007F
|
\xC2[\x80-\x9F] # U+0080 to U+009F
|
\xED(?:\xA0[\x80-\xFF]|[\xA1-\xBE][\x00-\xFF]|\xBF[\x00-\xBF]) # U+D800 to U+DFFFF
|
\xEF\xB7[\x90-\xAF] # U+FDD0 to U+FDEF
|
\xEF\xBF[\xBE\xBF] # U+FFFE and U+FFFF
|
[\xF0-\xF4][\x8F-\xBF]\xBF[\xBE\xBF] # U+nFFFE and U+nFFFF (1 <= n <= 10_{16})
)/x',
      $data,
      $matches
    );
    // One error entry per offending sequence found.
    for ($i = 0; $i < $count; $i++) {
      $errors[] = 'invalid-codepoint';
    }
    return $errors;
  }
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,314 @@
<?php
/**
* @file
* The rules for generating output in the serializer.
*
* These output rules are likely to generate output similar to the document that
* was parsed. It is not intended to output exactly the document that was parsed.
*/
namespace HTML5\Serializer;
use \HTML5\Elements;
/**
* Generate the output html5 based on element rules.
*/
class OutputRules implements \HTML5\Serializer\RulesInterface {

  /**
   * Output modes: which vocabulary is currently being serialized.
   */
  const IM_IN_HTML = 1;
  const IM_IN_SVG = 2;
  const IM_IN_MATHML = 3;

  /**
   * The traverser registered through setTraverser().
   */
  protected $traverser;

  /**
   * Whether text is converted to named character references (see enc()).
   */
  protected $encode = FALSE;

  /**
   * The stream that all output is written to with fwrite().
   */
  protected $out;

  /**
   * The current output mode; one of the IM_IN_* constants.
   */
  protected $outputMode;

  /**
   * The doctype written at the top of every document.
   */
  const DOCTYPE = '<!DOCTYPE html>';

  /**
   * Create the rules.
   *
   * @param resource $output
   *   The stream to write to.
   * @param array $options
   *   Options; only 'encode_entities' is read here.
   */
  public function __construct($output, $options = array()) {
    if (isset($options['encode_entities'])) {
      $this->encode = $options['encode_entities'];
    }
    $this->outputMode = static::IM_IN_HTML;
    $this->out = $output;
  }

  /**
   * Register the traverser that walks child nodes for these rules.
   *
   * @return \HTML5\Serializer\OutputRules
   *   $this so it can be used in chaining.
   */
  public function setTraverser(\HTML5\Serializer\Traverser $traverser) {
    $this->traverser = $traverser;
    return $this;
  }

  /**
   * Write a whole document: doctype, root element, trailing newline.
   *
   * @param \DOMDocument $dom
   *   The document to write.
   */
  public function document($dom) {
    $this->doctype();
    $this->traverser->node($dom->documentElement);
    $this->nl();
  }

  /**
   * Write the doctype followed by a newline.
   */
  protected function doctype() {
    $this->wr(static::DOCTYPE);
    $this->nl();
  }

  /**
   * Write an element: opening tag, children, and closing tag if needed.
   *
   * @param \DOMElement $ele
   *   The element to write.
   */
  public function element($ele) {
    $name = $ele->tagName;
    // Per spec:
    // If the element has a declared namespace in the HTML, MathML or
    // SVG namespaces, we use the lname instead of the tagName.
    if ($this->traverser->isLocalElement($ele)) {
      $name = $ele->localName;
    }
    // If we are in SVG or MathML there is special handling.
    // Using if/elseif instead of switch because it's faster in PHP.
    if ($name == 'svg') {
      $this->outputMode = static::IM_IN_SVG;
      $name = Elements::normalizeSvgElement($name);
    }
    elseif ($name == 'math') {
      $this->outputMode = static::IM_IN_MATHML;
    }
    $this->openTag($ele);
    // Handle children.
    if ($ele->hasChildNodes()) {
      $this->traverser->children($ele->childNodes);
    }
    // Close out the SVG or MathML special handling.
    if ($name == 'svg' || $name == 'math') {
      $this->outputMode = static::IM_IN_HTML;
    }
    // If not unary, add a closing tag.
    if (!Elements::isA($name, Elements::VOID_TAG)) {
      $this->closeTag($ele);
    }
  }

  /**
   * Write a text node.
   *
   * @param \DOMText $ele
   *   The text node to write.
   */
  public function text($ele) {
    // Text inside raw-text elements is written verbatim, never escaped.
    if (isset($ele->parentNode) && isset($ele->parentNode->tagName) && Elements::isA($ele->parentNode->tagName, Elements::TEXT_RAW)) {
      $this->wr($ele->data);
      return;
    }
    // FIXME: This probably needs some flags set.
    $this->wr($this->enc($ele->data));
  }

  /**
   * Write a CDATA node, using the owner document's XML serialization.
   */
  public function cdata($ele) {
    // This encodes CDATA.
    $this->wr($ele->ownerDocument->saveXML($ele));
  }

  /**
   * Write a comment node, using the owner document's XML serialization.
   */
  public function comment($ele) {
    // These produce identical output.
    //$this->wr('<!--')->wr($ele->data)->wr('-->');
    $this->wr($ele->ownerDocument->saveXML($ele));
  }

  /**
   * Write a processing instruction as "<?target data?>".
   */
  public function processorInstruction($ele) {
    $this->wr('<?')->wr($ele->target)->wr(' ')->wr($ele->data)->wr('?>');
  }

  /**
   * Write the opening tag.
   *
   * Tags for HTML, MathML, and SVG are in the local name. Otherwise, use the
   * qualified name (8.3).
   *
   * @param \DOMNode $ele
   *   The element being written.
   */
  protected function openTag($ele) {
    $this->wr('<')->wr($ele->tagName);
    $this->attrs($ele);
    if ($this->outputMode == static::IM_IN_HTML) {
      $this->wr('>');
    }
    // If we are not in html mode we are in SVG, MathML, or XML embedded content.
    else {
      if ($ele->hasChildNodes()) {
        $this->wr('>');
      }
      // If there are no children this is self closing.
      else {
        $this->wr(' />');
      }
    }
  }

  /**
   * Write all attributes of an element.
   *
   * @param \DOMNode $ele
   *   The element whose attributes are written.
   *
   * @return \HTML5\Serializer\OutputRules
   *   $this so it can be used in chaining. This is returned on every path;
   *   previously only the no-attributes path returned a value.
   */
  protected function attrs($ele) {
    // FIXME: Needs support for xml, xmlns, xlink, and namespaced elements.
    if (!$ele->hasAttributes()) {
      return $this;
    }
    // TODO: Currently, this always writes name="value", and does not do
    // value-less attributes.
    $map = $ele->attributes;
    $len = $map->length;
    for ($i = 0; $i < $len; ++$i) {
      $node = $map->item($i);
      $val = $this->enc($node->value, TRUE);
      // XXX: The spec says that we need to ensure that anything in
      // the XML, XMLNS, or XLink NS's should use the canonical
      // prefix. It seems that DOM does this for us already, but there
      // may be exceptions.
      $name = $node->name;
      // Special handling for attributes in SVG and MathML.
      // Using if/elseif instead of switch because it's faster in PHP.
      if ($this->outputMode == static::IM_IN_SVG) {
        $name = Elements::normalizeSvgAttribute($name);
      }
      elseif ($this->outputMode == static::IM_IN_MATHML) {
        $name = Elements::normalizeMathMlAttribute($name);
      }
      $this->wr(' ')->wr($name);
      // Attributes with an empty value are written as a bare name.
      if (isset($val) && $val !== '') {
        $this->wr('="')->wr($val)->wr('"');
      }
    }
    return $this;
  }

  /**
   * Write the closing tag.
   *
   * Tags for HTML, MathML, and SVG are in the local name. Otherwise, use the
   * qualified name (8.3).
   *
   * @param \DOMNode $ele
   *   The element being written.
   */
  protected function closeTag($ele) {
    // Outside HTML mode a childless element was already written as
    // self-closing by openTag(), so it gets no separate closing tag.
    if ($this->outputMode == static::IM_IN_HTML || $ele->hasChildNodes()) {
      $this->wr('</')->wr($ele->tagName)->wr('>');
    }
  }

  /**
   * Write to the output.
   *
   * @param string $text
   *   The string to put into the output.
   *
   * @return \HTML5\Serializer\OutputRules
   *   $this so it can be used in chaining.
   */
  protected function wr($text) {
    fwrite($this->out, $text);
    return $this;
  }

  /**
   * Write a new line character.
   *
   * @return \HTML5\Serializer\OutputRules
   *   $this so it can be used in chaining.
   */
  protected function nl() {
    fwrite($this->out, PHP_EOL);
    return $this;
  }

  /**
   * Encode text.
   *
   * When encode is set to FALSE, the default value, the text passed in is
   * escaped per section 8.3 of the html5 spec. For details on how text is
   * escaped see the escape() method.
   *
   * When encoding is set to true the text is converted to named character
   * references where appropriate. Section 8.1.4 Character references of the
   * html5 spec refers to using named character references. This is useful for
   * characters that can't otherwise legally be used in the text.
   *
   * The named character references are listed in section 8.5.
   *
   * @see http://www.w3.org/TR/2013/CR-html5-20130806/syntax.html#named-character-references
   *
   * True encoding will turn all named character references into their entities.
   * This includes such characters as +.# and many other common ones. By default
   * only the characters handled by escape() are replaced.
   *
   * Note, PHP 5.4+ has better html5 encoding.
   *
   * @todo Use the Entities class in php 5.3 to have html5 entities.
   *
   * @param string $text
   *   text to encode.
   * @param boolean $attribute
   *   True if we are encoding an attribute, false otherwise
   *
   * @return string
   *   The encoded text.
   */
  protected function enc($text, $attribute = FALSE) {
    // Escape the text rather than convert to named character references.
    if (!$this->encode) {
      return $this->escape($text, $attribute);
    }
    // If we are in PHP 5.4+ we can use the native html5 entity functionality to
    // convert the named character references.
    if (defined('ENT_HTML5')) {
      return htmlentities($text, ENT_HTML5 | ENT_SUBSTITUTE | ENT_QUOTES, 'UTF-8', FALSE);
    }
    // If a version earlier than 5.4 html5 entities are not entirely handled.
    // This manually handles them.
    else {
      return strtr($text, \HTML5\Serializer\HTML5Entities::$map);
    }
  }

  /**
   * Escape text.
   *
   * According to the html5 spec section 8.3 Serializing HTML fragments, text
   * within tags that are not style, script, xmp, iframe, noembed, and noframes
   * need to be properly escaped.
   *
   * The & should be converted to &amp;, no breaking space unicode characters
   * converted to &nbsp;, when in attribute mode the " should be converted to
   * &quot;, and when not in attribute mode the < and > should be converted to
   * &lt; and &gt;.
   *
   * @see http://www.w3.org/TR/2013/CR-html5-20130806/syntax.html#escapingString
   *
   * @param string $text
   *   text to escape.
   * @param boolean $attribute
   *   True if we are escaping an attribute, false otherwise
   *
   * @return string
   *   The escaped text.
   */
  protected function escape($text, $attribute = FALSE) {
    // Not using htmlspecialchars because, while it does escaping, it doesn't
    // match the requirements of section 8.5. For example, it doesn't handle
    // non-breaking spaces.
    if ($attribute) {
      $replace = array('"'=>'&quot;', '&'=>'&amp;', "\xc2\xa0"=>'&nbsp;');
    }
    else {
      $replace = array('<'=>'&lt;', '>'=>'&gt;', '&'=>'&amp;', "\xc2\xa0"=>'&nbsp;');
    }
    return strtr($text, $replace);
  }
}

View File

@ -0,0 +1,33 @@
# The Serializer (Writer) Model
The serializer roughly follows sections _8.1 Writing HTML documents_ and section
_8.3 Serializing HTML fragments_ by converting DOMDocument, DOMDocumentFragment,
and DOMNodeList into HTML5.
[ HTML5 ] // Interface for saving.
||
[ Traverser ] // Walk the DOM
||
[ Rules ] // Convert DOM elements into strings.
||
[ HTML5 ] // HTML5 document or fragment in text.
## HTML5 Class
Provides the top level interface for saving.
## The Traverser
Walks the DOM finding each element and passing it off to the output rules to
convert to HTML5.
## Output Rules
The output rules are defined in the RulesInterface which can have multiple
implementations. Currently, the OutputRules is the default implementation that
converts a DOM as is into HTML5.
## HTML5 String
The output of the process is HTML5 as a string or saved to a file.

View File

@ -0,0 +1,102 @@
<?php
/**
* @file
* The interface definition for Rules to generate output.
*/
namespace HTML5\Serializer;
/**
* To create a new rule set for writing output the RulesInterface needs to be
* implemented. The resulting class can be specified in the options with the
* key of rules.
*
* For an example implementation see \HTML5\Serializer\OutputRules.
*/
interface RulesInterface {
/**
* The class constructor.
*
* Note, before the rules can be used a traverser must be registered.
*
* @param mixed $output
* The output stream to write output to.
* @param array $options
* An array of options.
*/
public function __construct($output, $options = array());
/**
* Register the traverser used by the rules.
*
* Note, only one traverser can be used by the rules.
*
* @param \HTML5\Serializer\Traverser $traverser
* The traverser used in the rules.
* @return \HTML5\Serializer\RulesInterface
* $this for the current object.
*/
public function setTraverser(\HTML5\Serializer\Traverser $traverser);
/**
* Write a document element (\DOMDocument).
*
* Instead of returning the result, write it to the output stream ($output)
* that was passed into the constructor.
*
* @param \DOMDocument $dom
* The document to write.
*/
public function document($dom);
/**
* Write an element.
*
* Instead of returning the result, write it to the output stream ($output)
* that was passed into the constructor.
*
* @param mixed $ele
* The element node to write.
*/
public function element($ele);
/**
* Write a text node.
*
* Instead of returning the result, write it to the output stream ($output)
* that was passed into the constructor.
*
* @param mixed $ele
* The text node to write.
*/
public function text($ele);
/**
* Write a CDATA node.
*
* Instead of returning the result, write it to the output stream ($output)
* that was passed into the constructor.
*
* @param mixed $ele
* The CDATA node to write.
*/
public function cdata($ele);
/**
* Write a comment node.
*
* Instead of returning the result, write it to the output stream ($output)
* that was passed into the constructor.
*
* @param mixed $ele
* The comment node to write.
*/
public function comment($ele);
/**
* Write a processing instruction.
*
* To learn about processing instructions see \HTML5\InstructionProcessor
*
* Instead of returning the result, write it to the output stream ($output)
* that was passed into the constructor.
*
* @param mixed $ele
* The processing instruction node to write.
*/
public function processorInstruction($ele);
}

View File

@ -0,0 +1,142 @@
<?php
namespace HTML5\Serializer;
/**
* Traverser for walking a DOM tree.
*
* This is a concrete traverser designed to convert a DOM tree into an
* HTML5 document. It is not intended to be a generic DOMTreeWalker
* implementation.
*
* @see http://www.w3.org/TR/2012/CR-html5-20121217/syntax.html#serializing-html-fragments
*/
class Traverser {

  /** Namespaces that should be treated as "local" to HTML5. */
  public static $local_ns = array(
    'http://www.w3.org/1999/xhtml' => 'html',
    'http://www.w3.org/1998/Math/MathML' => 'math',
    'http://www.w3.org/2000/svg' => 'svg',
  );

  /** The document, node, fragment or node list handed to the constructor. */
  protected $dom;

  /** The options array handed to the constructor. */
  protected $options;

  /** Not read anywhere in this class; kept for subclasses. */
  protected $encode = FALSE;

  /** The rules object that turns each visited node into output. */
  protected $rules;

  /** The output stream; walk() returns it. */
  protected $out;

  /**
   * Create a traverser.
   *
   * @param DOMNode|DOMNodeList $dom
   *   The document or node to traverse.
   * @param resource $out
   *   A stream that allows writing. The traverser will output into this
   *   stream.
   * @param RulesInterface $rules
   *   The rules used to write each node. This traverser registers itself
   *   with them through setTraverser().
   * @param array $options
   *   An array of options for the traverser as key/value pairs. These include:
   *   - encode_entities: A bool to specify if full encoding should happen for
   *     all named character references. Defaults to FALSE, which only escapes
   *     the characters required by the serialization rules.
   *   - output_rules: The path to the class handling the output rules.
   */
  public function __construct($dom, $out, RulesInterface $rules, $options = array()) {
    $this->dom = $dom;
    $this->out = $out;
    $this->rules = $rules;
    $this->options = $options;
    $this->rules->setTraverser($this);
  }

  /**
   * Tell the traverser to walk the DOM.
   *
   * @return resource $out
   *   Returns the output stream.
   */
  public function walk() {
    if ($this->dom instanceof \DOMDocument) {
      $this->rules->document($this->dom);
    }
    elseif ($this->dom instanceof \DOMDocumentFragment) {
      // Document fragments are a special case. Only the children need to
      // be serialized.
      if ($this->dom->hasChildNodes()) {
        $this->children($this->dom->childNodes);
      }
    }
    // If NodeList, loop
    elseif ($this->dom instanceof \DOMNodeList) {
      // If this is a NodeList of DOMDocuments this will not work.
      $this->children($this->dom);
    }
    // Else assume this is a DOMNode-like datastructure.
    else {
      $this->node($this->dom);
    }
    return $this->out;
  }

  /**
   * Process a node in the DOM.
   *
   * Dispatches on the node type to the matching method of the rules
   * object. Unsupported node types are replaced by a marker comment.
   *
   * @param mixed $node
   *   A node implementing \DOMNode.
   */
  public function node($node) {
    // A listing of types is at http://php.net/manual/en/dom.constants.php
    switch ($node->nodeType) {
      case XML_ELEMENT_NODE:
        $this->rules->element($node);
        break;

      case XML_TEXT_NODE:
        $this->rules->text($node);
        break;

      case XML_CDATA_SECTION_NODE:
        $this->rules->cdata($node);
        break;

      // FIXME: It appears that the parser doesn't do PI's.
      case XML_PI_NODE:
        $this->rules->processorInstruction($node);
        break;

      case XML_COMMENT_NODE:
        $this->rules->comment($node);
        break;

      // Currently we don't support embedding DTDs.
      default:
        // Write the marker into the output stream. Using print here sent
        // it to PHP's standard output instead, outside the serialized
        // document.
        fwrite($this->out, '<!-- Skipped -->');
        break;
    }
  }

  /**
   * Walk through all the nodes on a node list.
   *
   * @param \DOMNodeList $nl
   *   A list of child elements to walk through.
   */
  public function children($nl) {
    foreach ($nl as $node) {
      $this->node($node);
    }
  }

  /**
   * Is an element local?
   *
   * An element is local when its namespace URI is one of the keys of
   * static::$local_ns.
   *
   * @param mixed $ele
   *   An element that implement \DOMNode.
   *
   * @return bool
   *   True if local and false otherwise.
   */
  public function isLocalElement($ele) {
    $uri = $ele->namespaceURI;
    // Elements without a namespace are never local.
    if (empty($uri)) {
      return FALSE;
    }
    return isset(static::$local_ns[$uri]);
  }
}

View File

@ -0,0 +1,65 @@
## HTML5-PHP License
Copyright (c) 2013 The Authors of HTML5-PHP
Matt Butcher - technosophos@gmail.com
Matt Farina - matt@mattfarina.com
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
## HTML5Lib License
Portions of this are based on html5lib's PHP version, which was a
sub-project of html5lib. The following is the list of contributors from
html5lib:
html5lib:
Copyright (c) 2006-2009 The Authors
Contributors:
James Graham - jg307@cam.ac.uk
Anne van Kesteren - annevankesteren@gmail.com
Lachlan Hunt - lachlan.hunt@lachy.id.au
Matt McDonald - kanashii@kanashii.ca
Sam Ruby - rubys@intertwingly.net
Ian Hickson (Google) - ian@hixie.ch
Thomas Broyer - t.broyer@ltgt.net
Jacques Distler - distler@golem.ph.utexas.edu
Henri Sivonen - hsivonen@iki.fi
Adam Barth - abarth@webkit.org
Eric Seidel - eric@webkit.org
The Mozilla Foundation (contributions from Henri Sivonen since 2008)
David Flanagan (Mozilla) - dflanagan@mozilla.com
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

View File

@ -0,0 +1,182 @@
# HTML5-PHP
The need for an HTML5 parser in PHP is clear. This project initially
began with the seemingly abandoned `html5lib` project [original source](https://code.google.com/p/html5lib/source/checkout).
But after some initial refactoring work, we began a new parser.
- An HTML5 serializer
- Support for PHP namespaces
- Composer support
- Event-based (SAX-like) parser
- DOM tree builder
- Interoperability with QueryPath [[in progress](https://github.com/technosophos/querypath/issues/114)]
[![Build Status](https://travis-ci.org/Masterminds/html5-php.png?branch=master)](https://travis-ci.org/Masterminds/html5-php) [![Latest Stable Version](https://poser.pugx.org/masterminds/html5/v/stable.png)](https://packagist.org/packages/masterminds/html5) [![Coverage Status](https://coveralls.io/repos/Masterminds/html5-php/badge.png?branch=master)](https://coveralls.io/r/Masterminds/html5-php?branch=master)
## Installation
Install HTML5-PHP using [composer](http://getcomposer.org/).
To install, add `masterminds/html5` to your `composer.json` file:
```
{
"require" : {
"masterminds/html5": "1.*"
}
}
```
(You may replace `1.*` with a more specific release tag, of
course.)
From there, use the `composer install` or `composer update` commands to
install.
## Basic Usage
HTML5-PHP has a high-level API and a low-level API.
Here is how you use the high-level `HTML5` library API:
```php
<?php
// Assuming you installed from Composer:
require "vendor/autoload.php";
// An example HTML document:
$html = <<< 'HERE'
<html>
<head>
<title>TEST</title>
</head>
<body id='foo'>
<h1>Hello World</h1>
<p>This is a test of the HTML5 parser.</p>
</body>
</html>
HERE;
// Parse the document. $dom is a DOMDocument.
$dom = HTML5::loadHTML($html);
// Render it as HTML5:
print HTML5::saveHTML($dom);
// Or save it to a file:
HTML5::save($dom, 'out.html');
?>
```
The `$dom` created by the parser is a full `DOMDocument` object. And the
`save()` and `saveHTML()` methods will take any DOMDocument.
## The Low-Level API
This library provides the following low-level APIs that you can use to
create more customized HTML5 tools:
- An `InputStream` abstraction that can work with different kinds of
input source (not just files and strings).
- A SAX-like event-based parser that you can hook into for special kinds
of parsing.
- A flexible error-reporting mechanism that can be tuned to document
syntax checking.
- A DOM implementation that uses PHP's built-in DOM library.
The unit tests exercise each piece of the API, and every public function
is well-documented.
### Parser Design
The parser is designed as follows:
- The `InputStream` portion handles direct I/O.
- The `Scanner` handles scanning on behalf of the parser.
- The `Tokenizer` requests data off of the scanner, parses it, classifies
it, and sends it to an `EventHandler`. It is a *recursive descent parser.*
- The `EventHandler` receives notifications and data for each specific
semantic event that occurs during tokenization.
- The `DOMBuilder` is an `EventHandler` that listens for tokenizing
events and builds a document tree (`DOMDocument`) based on the events.
### Serializer Design
The serializer takes a data structure (the `DOMDocument`) and transforms
it into a character representation -- an HTML5 document.
The serializer is broken into three parts:
- The `OutputRules` contain the rules to turn DOM elements into strings. The
rules are an implementation of the interface `RulesInterface` allowing for
different rule sets to be used.
- The `Traverser`, which is a special-purpose tree walker. It visits
each node in the tree and uses the `OutputRules` to transform the node
into a string.
- `\HTML5` manages the `Traverser` and stores the resultant data
in the correct place.
The serializer (`save()`, `saveHTML()`) follows the
[section 8.9 of the HTML 5.0 spec](http://www.w3.org/TR/2012/CR-html5-20121217/syntax.html#serializing-html-fragments).
So tags are serialized according to these rules:
- A tag with children: &lt;foo&gt;CHILDREN&lt;/foo&gt;
- A tag that cannot have content: &lt;foo&gt; (no closing tag)
- A tag that could have content, but doesn't: &lt;foo&gt;&lt;/foo&gt;
## Known Issues (Or, Things We Designed Against the Spec)
Please check the issue queue for a full list, but the following are
known issues that are not presently on the roadmap:
- Namespaces: HTML5 only [supports a selected list of namespaces](http://www.w3.org/TR/html5/infrastructure.html#namespaces)
and they do not operate in the same way as XML namespaces. A `:` has no special
meaning. The parser does not support XML style namespaces via `:`.
- Scripts: This parser does not contain a JavaScript or a CSS
interpreter. While one may be supplied, not all features will be
supported.
- Reentrance: The current parser is not re-entrant. (Thus you can't pause
the parser to modify the HTML string mid-parse.)
- Validation: The current tree builder is **not** a validating parser.
While it will correct some HTML, it does not check that the HTML
conforms to the standard. (Should you wish, you can build a validating
parser by extending DOMTree or building your own EventHandler
implementation.)
* There is limited support for insertion modes.
* Some autocorrection is done automatically.
* Per the spec, many legacy tags are admitted and correctly handled,
even though they are technically not part of HTML5.
- Attribute names and values: Due to the implementation details of the
PHP implementation of DOM, attribute names that do not follow the
XML 1.0 standard are not inserted into the DOM. (Effectively, they
are ignored.) If you've got a clever fix for this, jump in!
- Processor Instructions: The HTML5 spec does not allow processor
instructions. We do. Since this is a server-side library, we think
this is useful. And that means, dear reader, that in some cases you
can parse the HTML from a mixed PHP/HTML document. This, however,
is an incidental feature, not a core feature.
- HTML manifests: Unsupported.
- PLAINTEXT: Unsupported.
- Adoption Agency Algorithm: Not yet implemented. (8.2.5.4.7)
## Thanks to...
We owe a huge debt of gratitude to the original authors of html5lib.
While not much of the original parser remains, we learned a lot from
reading the html5lib library. And some pieces remain here. In
particular, much of the UTF-8 and Unicode handling is derived from the
html5lib project.
## License
This software is released under the MIT license. The original html5lib
library was also released under the MIT license.
See LICENSE.txt
Certain files contain copyright assertions by specific individuals
involved with html5lib. Those have been retained where appropriate.

View File

@ -0,0 +1,26 @@
# Release Notes
1.0.4 (2014-04-29)
- #30/#31 Don't throw an exception for invalid tag names.
1.0.3 (2014-02-28)
- #23 and #29: Ignore attributes with illegal chars in name for the PHP DOM.
1.0.2 (2014-02-12)
- #23: Handle missing tag close in attribute list.
- #25: Fixed text escaping in the serializer (HTML5 8.3).
- #27: Fixed tests on Windows: changed "\n" -> PHP_EOL.
- #28: Fixed infinite loop for char "&" in unquoted attribute in parser.
- #26: Updated tag name case handling to deal with uppercase usage.
- #24: Newlines and tabs are allowed inside quoted attributes (HTML5 8.2.4).
- Fixed Travis CI testing.
1.0.1 (2013-11-07)
- CDATA encoding is improved. (Non-standard; Issue #19)
- Some parser rules were not returning the new current element. (Issue #20)
- Added, to the README, details on code test coverage and to packagist version.
- Fixed processor instructions.
- Improved test coverage and documentation coverage.
1.0.0 (2013-10-02)
- Initial release.

View File

@ -0,0 +1,35 @@
<?php
// autoloader
spl_autoload_register(array(new HTML5PHP_Autoloader(), 'autoload'));
/**
* Autoloader class
*/
class HTML5PHP_Autoloader
{
    /**
     * Directory that class files are resolved against.
     *
     * Declared explicitly so that the constructor does not create a dynamic
     * property (deprecated as of PHP 8.2). Kept public because the
     * previously undeclared property was publicly readable.
     *
     * @var string
     */
    public $path;

    /**
     * Constructor
     *
     * Records the directory containing this file as the lookup root.
     */
    public function __construct()
    {
        $this->path = dirname(__FILE__);
    }

    /**
     * Autoloader
     *
     * Translates namespace separators in the class name into directory
     * separators, appends ".php" and includes the resulting file from below
     * the lookup root, provided that file exists.
     *
     * @param string $class The name of the class to attempt to load.
     * @return void
     */
    public function autoload($class)
    {
        // Only load the class if it starts with "HTML5"
        if (strpos($class, 'HTML5') !== 0) {
            return;
        }

        $filename = $this->path . DIRECTORY_SEPARATOR . str_replace('\\', DIRECTORY_SEPARATOR, $class) . '.php';

        // A class can share the "HTML5" prefix without living under this
        // directory. Skip quietly in that case so that other registered
        // autoloaders get their turn without an include warning being raised.
        if (is_file($filename)) {
            include $filename;
        }
    }
}

View File

@ -7,11 +7,11 @@
* For environments which do not have these options, it reverts to standard sequential
* requests (using file_get_contents())
*
* @version 1.4
* @date 2013-05-10
* @version 1.5
* @date 2014-03-28
* @see http://php.net/HttpRequestPool
* @author Keyvan Minoukadeh
* @copyright 2011-2013 Keyvan Minoukadeh
* @copyright 2011-2014 Keyvan Minoukadeh
* @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
*/
@ -31,6 +31,7 @@ class HumbleHttpAgent
protected $maxParallelRequests = 5;
protected $cache = null; //TODO
protected $httpContext;
protected $curlOptions;
protected $minimiseMemoryUse = false; //TODO
protected $method;
protected $cookieJar;
@ -80,6 +81,7 @@ class HumbleHttpAgent
// create cookie jar
$this->cookieJar = new CookieJar();
// set request options (redirect must be 0)
// HTTP PECL (http://php.net/manual/en/http.request.options.php)
$this->requestOptions = array(
'timeout' => 15,
'connecttimeout' => 15,
@ -90,6 +92,7 @@ class HumbleHttpAgent
if (is_array($requestOptions)) {
$this->requestOptions = array_merge($this->requestOptions, $requestOptions);
}
// HTTP file_get_contents
$this->httpContext = array(
'http' => array(
'ignore_errors' => true,
@ -98,6 +101,23 @@ class HumbleHttpAgent
'header' => "Accept: */*\r\n"
)
);
// HTTP cURL
// The connect timeout and the overall timeout are mapped from their own
// request options, so a caller-supplied 'connecttimeout' also reaches the
// cURL transport (both default to 15 seconds).
$this->curlOptions = array(
    CURLOPT_CONNECTTIMEOUT => $this->requestOptions['connecttimeout'],
    CURLOPT_TIMEOUT => $this->requestOptions['timeout']
);
// Use proxy?
// 'proxyhost' may be absent from the request options, so it is tested with
// empty() instead of being read directly (avoids an undefined index).
if (!empty($this->requestOptions['proxyhost'])) {
    // For file_get_contents (see http://stackoverflow.com/a/1336419/407938)
    $this->httpContext['http']['proxy'] = 'tcp://'.$this->requestOptions['proxyhost'];
    $this->httpContext['http']['request_fulluri'] = true;
    // For cURL (see http://stackoverflow.com/a/9247672/407938)
    $this->curlOptions[CURLOPT_PROXY] = $this->requestOptions['proxyhost'];
    if (isset($this->requestOptions['proxyauth'])) {
        // The same "user:password" string feeds both transports: base64-encoded
        // in a Proxy-Authorization header for the stream context, verbatim for cURL.
        $this->httpContext['http']['header'] .= "Proxy-Authorization: Basic ".base64_encode($this->requestOptions['proxyauth'])."\r\n";
        $this->curlOptions[CURLOPT_PROXYUSERPWD] = $this->requestOptions['proxyauth'];
    }
}
}
protected function debug($msg) {
@ -168,7 +188,7 @@ class HumbleHttpAgent
public function getMetaRefreshURL($url, $html) {
if ($html == '') return false;
// <meta HTTP-EQUIV="REFRESH" content="0; url=http://www.bernama.com/bernama/v6/newsindex.php?id=943513">
if (!preg_match('!<meta http-equiv=["\']?refresh["\']? content=["\']?[0-9];\s*url=["\']?([^"\'>]+)["\']*>!i', $html, $match)) {
if (!preg_match('!<meta http-equiv=["\']?refresh["\']? content=["\']?[0-9];\s*url=["\']?([^"\'>]+)["\']?!i', $html, $match)) {
return false;
}
$redirect_url = $match[1];
@ -443,10 +463,7 @@ class HumbleHttpAgent
$this->debug("......sending cookies: $cookies");
$headers[] = 'Cookie: '.$cookies;
}
$httpRequest = new RollingCurlRequest($req_url, $_meth, null, $headers, array(
CURLOPT_CONNECTTIMEOUT => $this->requestOptions['timeout'],
CURLOPT_TIMEOUT => $this->requestOptions['timeout']
));
$httpRequest = new RollingCurlRequest($req_url, $_meth, null, $headers, $this->curlOptions);
$httpRequest->set_original_url($orig);
$this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest);
$this->requests[$orig]['original_url'] = $orig; // TODO: is this needed anymore?
@ -661,7 +678,7 @@ class HumbleHttpAgent
*/
if ($remove && $response) unset($this->requests[$url]);
if ($gzdecode && stripos($response['headers'], 'Content-Encoding: gzip')) {
if ($html = gzdecode($response['body'])) {
if ($html = @gzdecode($response['body'])) {
$response['body'] = $html;
}
}

View File

@ -0,0 +1,106 @@
<?php
/**
* Humble HTTP Agent Dummy
*
* This class is designed to respond to HumbleHttpAgent calls
* but to return a predefined HTML response rather than
* actually making HTTP requests.
*
* @version 1.5
* @date 2014-05-07
* @author Keyvan Minoukadeh
* @copyright 2014 Keyvan Minoukadeh
* @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3
*/
class HumbleHttpAgentDummy
{
    // Settings that callers may assign on an HTTP agent. They are declared
    // here so such assignments succeed; no method of this class reads them.
    public $debug = false;
    public $debugVerbose = false;
    public $rewriteHashbangFragment = true;
    public $maxRedirects = 5;
    public $userAgentMap = array();
    public $rewriteUrls = array();
    public $userAgentDefault;
    public $referer;

    // Body returned by every get() call.
    protected $body = '';
    // Raw header block returned by every get() call.
    protected $headers = "HTTP/1.1 200 OK\r\nContent-Type: text/html; charset=utf-8\r\n\r\n";

    /**
     * @param string      $body    Content to hand back from get().
     * @param string|null $headers Raw header block to hand back from get();
     *                             the default 200 OK text/html block is kept
     *                             when this is null.
     */
    public function __construct($body, $headers=null) {
        $this->body = $body;
        if (isset($headers)) $this->headers = $headers;
    }

    /** Returns the URL unchanged. */
    public function rewriteHashbangFragment($url) {
        return $url;
    }

    /** Never reports a redirect: always returns false. */
    public function getRedirectURLfromHTML($url, $html) {
        return false;
    }

    /** Never reports a meta refresh: always returns false. */
    public function getMetaRefreshURL($url, $html) {
        return false;
    }

    /** Never reports an alternative URL: always returns false. */
    public function getUglyURL($url, $html) {
        return false;
    }

    /** Returns the URL unchanged. */
    public function removeFragment($url) {
        return $url;
    }

    /** Returns the URL unchanged. */
    public function rewriteUrls($url) {
        return $url;
    }

    /** No-op. */
    public function enableDebug($bool=true) {
        return;
    }

    /** No-op. */
    public function minimiseMemoryUse($bool = true) {
        return;
    }

    /** No-op. */
    public function setMaxParallelRequests($max) {
        return;
    }

    /**
     * Sanitises a URL and checks that it is a valid http:// or https:// URL.
     *
     * @param string $url URL to check.
     * @return string|false The sanitised URL, or false when it is rejected.
     */
    public function validateUrl($url) {
        $url = filter_var($url, FILTER_SANITIZE_URL);
        // FILTER_VALIDATE_URL requires a scheme on its own. The separate
        // FILTER_FLAG_SCHEME_REQUIRED flag was deprecated in PHP 7.3 and
        // removed in PHP 8.0 (where referencing it is a fatal error), so it
        // is no longer passed.
        $test = filter_var($url, FILTER_VALIDATE_URL);
        // deal with bug http://bugs.php.net/51192 (present in PHP 5.2.13 and PHP 5.3.2)
        if ($test === false) {
            // Retry with hyphens swapped for underscores; only the verdict is
            // used, the URL returned below keeps its hyphens.
            $test = filter_var(strtr($url, '-', '_'), FILTER_VALIDATE_URL);
        }
        if ($test !== false && $test !== null && preg_match('!^https?://!', $url)) {
            return $url;
        }
        return false;
    }

    /** No-op: nothing is fetched. */
    public function fetchAll(array $urls) {
        return;
    }

    // fetch all URLs without following redirects
    /** No-op: nothing is fetched. */
    public function fetchAllOnce(array $urls, $isRedirect=false) {
        return;
    }

    /**
     * Returns the predefined response for any URL.
     *
     * @param string $url      Echoed back as 'effective_url'.
     * @param bool   $remove   Ignored.
     * @param bool   $gzdecode Ignored.
     * @return array Keys: 'body', 'headers', 'status_code' (always 200), 'effective_url'.
     */
    public function get($url, $remove=false, $gzdecode=true) {
        return array(
            'body' => $this->body,
            'headers' => $this->headers,
            'status_code' => 200,
            'effective_url' => $url
        );
    }

    /** Always false. */
    public function parallelSupport() {
        return false;
    }
}

View File

@ -12,7 +12,7 @@
* More information: http://fivefilters.org/content-only/
* License: Apache License, Version 2.0
* Requires: PHP5
* Date: 2012-09-19
* Date: 2014-03-27
*
* Differences between the PHP port and the original
* ------------------------------------------------------
@ -71,7 +71,7 @@ class Readability
public $revertForcedParagraphElements = true;
public $articleTitle;
public $articleContent;
public $dom;
public $dom = null;
public $url = null; // optional - URL where HTML was retrieved
public $debug = false;
public $lightClean = true; // preserves more content (experimental) added 2012-09-19
@ -95,7 +95,7 @@ class Readability
// 'trimRe' => '/^\s+|\s+$/g', // PHP has trim()
'normalize' => '/\s{2,}/',
'killBreaks' => '/(<br\s*\/?>(\s|&nbsp;?)*){1,}/',
'video' => '!//(player\.|www\.)?(youtube|vimeo|viddler)\.com!i',
'video' => '!//(player\.|www\.)?(youtube\.com|vimeo\.com|viddler\.com|twitch\.tv)!i',
'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i'
);
@ -118,9 +118,12 @@ class Readability
$html = preg_replace($this->regexps['replaceFonts'], '<$1span>', $html);
$html = mb_convert_encoding($html, 'HTML-ENTITIES', "UTF-8");
if (trim($html) == '') $html = '<html></html>';
if ($parser=='html5lib' && ($this->dom = HTML5_Parser::parse($html))) {
// all good
} else {
if ($parser=='html5lib' || $parser=='html5php') {
if (version_compare(PHP_VERSION, '5.3.0') >= 0) {
$this->dom = HTML5::loadHTML($html);
}
}
if ($this->dom === null) {
$this->dom = new DOMDocument();
$this->dom->preserveWhiteSpace = false;
@$this->dom->loadHTML($html);

View File

@ -1,10 +1,10 @@
<?php
// Full-Text RSS: Create Full-Text Feeds
// Author: Keyvan Minoukadeh
// Copyright (c) 2013 Keyvan Minoukadeh
// Copyright (c) 2014 Keyvan Minoukadeh
// License: AGPLv3
// Version: 3.2
// Date: 2013-05-13
// Version: 3.3
// Date: 2014-05-07
// More info: http://fivefilters.org/content-only/
// Help: http://help.fivefilters.org
@ -32,9 +32,17 @@ error_reporting(E_ALL ^ E_NOTICE);
ini_set("display_errors", 1);
@set_time_limit(120);
if (!defined('_FF_FTR_MODE')) define('_FF_FTR_MODE', 'full');
if (_FF_FTR_MODE === 'simple') {
$_REQUEST = array_merge($_GET, $_POST);
} else {
$_REQUEST = $_GET;
}
// Deal with magic quotes
if (get_magic_quotes_gpc()) {
$process = array(&$_GET, &$_POST, &$_REQUEST);
$process = array(&$_REQUEST);
while (list($key, $val) = each($process)) {
foreach ($val as $k => $v) {
unset($process[$key][$k]);
@ -68,12 +76,13 @@ function autoload($class_name) {
'HumbleHttpAgent' => 'humble-http-agent/HumbleHttpAgent.php',
'SimplePie_HumbleHttpAgent' => 'humble-http-agent/SimplePie_HumbleHttpAgent.php',
'CookieJar' => 'humble-http-agent/CookieJar.php',
'HumbleHttpAgentDummy' => 'humble-http-agent/HumbleHttpAgentDummy.php',
// Include Zend Cache to improve performance (cache results)
'Zend_Cache' => 'Zend/Cache.php',
// Language detect
'Text_LanguageDetect' => 'language-detect/LanguageDetect.php',
// HTML5 Lib
'HTML5_Parser' => 'html5/Parser.php',
// HTML5 PHP (can't be used unless PHP version is >= 5.3)
'HTML5' => 'html5php/HTML5.php',
// htmLawed - used if XSS filter is enabled (xss_filter)
'htmLawed' => 'htmLawed/htmLawed.php'
);
@ -87,6 +96,7 @@ function autoload($class_name) {
}
spl_autoload_register('autoload');
require dirname(__FILE__).'/libraries/simplepie/autoloader.php';
require dirname(__FILE__).'/libraries/html5php/autoloader.php';
////////////////////////////////
// Load config file
@ -103,6 +113,11 @@ require dirname(__FILE__).'/config.php';
////////////////////////////////
header('X-Robots-Tag: noindex, nofollow');
////////////////////////////////
// Content security headers
////////////////////////////////
header("Content-Security-Policy: script-src 'self'; connect-src 'none'; font-src 'none'; style-src 'self'");
////////////////////////////////
// Check if service is enabled
////////////////////////////////
@ -115,7 +130,9 @@ if (!$options->enabled) {
// See the config file for debug options.
////////////////////////////////
$debug_mode = false;
if (isset($_GET['debug'])) {
$debug_show_raw_html = false;
$debug_show_parsed_html = false;
if (isset($_REQUEST['debug'])) {
if ($options->debug === true || $options->debug == 'user') {
$debug_mode = true;
} elseif ($options->debug == 'admin') {
@ -124,6 +141,8 @@ if (isset($_GET['debug'])) {
}
if ($debug_mode) {
header('Content-Type: text/plain; charset=utf-8');
$debug_show_raw_html = ($_REQUEST['debug'] === 'rawhtml');
$debug_show_parsed_html = ($_REQUEST['debug'] === 'parsedhtml');
} else {
if ($options->debug == 'admin') {
die('You must be logged in to the <a href="admin/">admin area</a> to see debug output.');
@ -151,10 +170,10 @@ $options->smart_cache = $options->smart_cache && function_exists('apc_inc');
////////////////////////////////
// Check for feed URL
////////////////////////////////
if (!isset($_GET['url'])) {
if (!isset($_REQUEST['url'])) {
die('No URL supplied');
}
$url = trim($_GET['url']);
$url = trim($_REQUEST['url']);
if (strtolower(substr($url, 0, 7)) == 'feed://') {
$url = 'http://'.substr($url, 7);
}
@ -178,26 +197,30 @@ debug("Supplied URL: $url");
/////////////////////////////////
// Redirect to hide API key
// (if in 'full' mode)
/////////////////////////////////
if (isset($_GET['key']) && ($key_index = array_search($_GET['key'], $options->api_keys)) !== false) {
if ((_FF_FTR_MODE == 'full') && isset($_REQUEST['key']) && ($key_index = array_search($_REQUEST['key'], $options->api_keys)) !== false) {
$host = $_SERVER['HTTP_HOST'];
$path = rtrim(dirname($_SERVER['SCRIPT_NAME']), '/\\');
$_qs_url = (strtolower(substr($url, 0, 7)) == 'http://') ? substr($url, 7) : $url;
$redirect = 'http://'.htmlspecialchars($host.$path).'/makefulltextfeed.php?url='.urlencode($_qs_url);
$redirect .= '&key='.$key_index;
$redirect .= '&hash='.urlencode(sha1($_GET['key'].$url));
if (isset($_GET['html'])) $redirect .= '&html='.urlencode($_GET['html']);
if (isset($_GET['max'])) $redirect .= '&max='.(int)$_GET['max'];
if (isset($_GET['links'])) $redirect .= '&links='.urlencode($_GET['links']);
if (isset($_GET['exc'])) $redirect .= '&exc='.urlencode($_GET['exc']);
if (isset($_GET['format'])) $redirect .= '&format='.urlencode($_GET['format']);
if (isset($_GET['callback'])) $redirect .= '&callback='.urlencode($_GET['callback']);
if (isset($_GET['l'])) $redirect .= '&l='.urlencode($_GET['l']);
if (isset($_GET['xss'])) $redirect .= '&xss';
if (isset($_GET['use_extracted_title'])) $redirect .= '&use_extracted_title';
if (isset($_GET['content'])) $redirect .= '&content='.urlencode($_GET['content']);
if (isset($_GET['summary'])) $redirect .= '&summary='.urlencode($_GET['summary']);
if (isset($_GET['debug'])) $redirect .= '&debug';
$redirect .= '&hash='.urlencode(sha1($_REQUEST['key'].$url));
if (isset($_REQUEST['html'])) $redirect .= '&html='.urlencode($_REQUEST['html']);
if (isset($_REQUEST['max'])) $redirect .= '&max='.(int)$_REQUEST['max'];
if (isset($_REQUEST['links'])) $redirect .= '&links='.urlencode($_REQUEST['links']);
if (isset($_REQUEST['exc'])) $redirect .= '&exc='.urlencode($_REQUEST['exc']);
if (isset($_REQUEST['format'])) $redirect .= '&format='.urlencode($_REQUEST['format']);
if (isset($_REQUEST['callback'])) $redirect .= '&callback='.urlencode($_REQUEST['callback']);
if (isset($_REQUEST['l'])) $redirect .= '&l='.urlencode($_REQUEST['l']);
if (isset($_REQUEST['lang'])) $redirect .= '&lang='.urlencode($_REQUEST['lang']);
if (isset($_REQUEST['xss'])) $redirect .= '&xss';
if (isset($_REQUEST['use_extracted_title'])) $redirect .= '&use_extracted_title';
if (isset($_REQUEST['content'])) $redirect .= '&content='.urlencode($_REQUEST['content']);
if (isset($_REQUEST['summary'])) $redirect .= '&summary='.urlencode($_REQUEST['summary']);
if (isset($_REQUEST['debug'])) $redirect .= '&debug';
if (isset($_REQUEST['parser'])) $redirect .= '&parser='.urlencode($_REQUEST['parser']);
if (isset($_REQUEST['proxy'])) $redirect .= '&proxy='.urlencode($_REQUEST['proxy']);
if ($debug_mode) {
debug('Redirecting to hide access key, follow URL below to continue');
debug("Location: $redirect");
@ -220,20 +243,27 @@ if (!ini_get('date.timezone') || !@date_default_timezone_set(ini_get('date.timez
///////////////////////////////////////////////
// Check if the request is explicitly for an HTML page
///////////////////////////////////////////////
$html_only = (isset($_GET['html']) && ($_GET['html'] == '1' || $_GET['html'] == 'true'));
$html_only = (isset($_REQUEST['html']) && ($_REQUEST['html'] == '1' || $_REQUEST['html'] == 'true'));
///////////////////////////////////////////////
// Check if valid key supplied
///////////////////////////////////////////////
$valid_key = false;
if (isset($_GET['key']) && isset($_GET['hash']) && isset($options->api_keys[(int)$_GET['key']])) {
$valid_key = ($_GET['hash'] == sha1($options->api_keys[(int)$_GET['key']].$url));
$key_index = false;
// first check for hidden key using hash (key (int) + hash parameters) (can appear in both simple and full modes)
if (isset($_REQUEST['key']) && isset($_REQUEST['hash']) && isset($options->api_keys[(int)$_REQUEST['key']])) {
$valid_key = ($_REQUEST['hash'] == sha1($options->api_keys[(int)$_REQUEST['key']].$url));
if ($valid_key) $key_index = (int)$_REQUEST['key'];
}
// next check for full key (string) passed in request (only simple mode)
if (!$valid_key && _FF_FTR_MODE === 'simple' && isset($_REQUEST['key'])) {
$key_index = array_search($_REQUEST['key'], $options->api_keys);
if ($key_index !== false) $valid_key = true;
}
$key_index = ($valid_key) ? (int)$_GET['key'] : 0;
if (!$valid_key && $options->key_required) {
die('A valid key must be supplied');
}
if (!$valid_key && isset($_GET['key']) && $_GET['key'] != '') {
if (!$valid_key && isset($_REQUEST['key']) && $_REQUEST['key'] != '') {
die('The entered key is invalid');
}
@ -248,8 +278,8 @@ if (!url_allowed($url)) die('URL blocked');
// Max entries
// see config.php to find these values
///////////////////////////////////////////////
if (isset($_GET['max'])) {
$max = (int)$_GET['max'];
if (isset($_REQUEST['max'])) {
$max = (int)$_REQUEST['max'];
if ($valid_key) {
$max = min($max, $options->max_entries_with_key);
} else {
@ -266,8 +296,8 @@ if (isset($_GET['max'])) {
///////////////////////////////////////////////
// Link handling
///////////////////////////////////////////////
if (isset($_GET['links']) && in_array($_GET['links'], array('preserve', 'footnotes', 'remove'))) {
$links = $_GET['links'];
if (isset($_REQUEST['links']) && in_array($_REQUEST['links'], array('preserve', 'footnotes', 'remove'))) {
$links = $_REQUEST['links'];
} else {
$links = 'preserve';
}
@ -277,7 +307,7 @@ if (isset($_GET['links']) && in_array($_GET['links'], array('preserve', 'footnot
///////////////////////////////////////////////
$favour_feed_titles = true;
if ($options->favour_feed_titles == 'user') {
$favour_feed_titles = !isset($_GET['use_extracted_title']);
$favour_feed_titles = !isset($_REQUEST['use_extracted_title']);
} else {
$favour_feed_titles = $options->favour_feed_titles;
}
@ -286,7 +316,7 @@ if ($options->favour_feed_titles == 'user') {
// Include full content in output?
///////////////////////////////////////////////
if ($options->content === 'user') {
if (isset($_GET['content']) && $_GET['content'] === '0') {
if (isset($_REQUEST['content']) && $_REQUEST['content'] === '0') {
$options->content = false;
} else {
$options->content = true;
@ -297,7 +327,7 @@ if ($options->content === 'user') {
// Include summaries in output?
///////////////////////////////////////////////
if ($options->summary === 'user') {
if (isset($_GET['summary']) && $_GET['summary'] === '1') {
if (isset($_REQUEST['summary']) && $_REQUEST['summary'] === '1') {
$options->summary = true;
} else {
$options->summary = false;
@ -308,7 +338,7 @@ if ($options->summary === 'user') {
// Exclude items if extraction fails
///////////////////////////////////////////////
if ($options->exclude_items_on_fail === 'user') {
$exclude_on_fail = (isset($_GET['exc']) && ($_GET['exc'] == '1'));
$exclude_on_fail = (isset($_REQUEST['exc']) && ($_REQUEST['exc'] == '1'));
} else {
$exclude_on_fail = $options->exclude_items_on_fail;
}
@ -317,8 +347,9 @@ if ($options->exclude_items_on_fail === 'user') {
// Detect language
///////////////////////////////////////////////
if ($options->detect_language === 'user') {
if (isset($_GET['l'])) {
$detect_language = (int)$_GET['l'];
if (isset($_REQUEST['lang'])) $_REQUEST['l'] = $_REQUEST['lang'];
if (isset($_REQUEST['l'])) {
$detect_language = (int)$_REQUEST['l'];
} else {
$detect_language = 1;
}
@ -332,7 +363,7 @@ $use_cld = extension_loaded('cld') && (version_compare(PHP_VERSION, '5.3.0') >=
// Check for valid format
// (stick to RSS (or RSS as JSON) for the time being)
/////////////////////////////////////
if (isset($_GET['format']) && $_GET['format'] == 'json') {
if (isset($_REQUEST['format']) && $_REQUEST['format'] == 'json') {
$format = 'json';
} else {
$format = 'rss';
@ -342,11 +373,11 @@ if (isset($_GET['format']) && $_GET['format'] == 'json') {
// Should we do XSS filtering?
/////////////////////////////////////
if ($options->xss_filter === 'user') {
$xss_filter = isset($_GET['xss']);
$xss_filter = isset($_REQUEST['xss']) && $_REQUEST['xss'] !== '0';
} else {
$xss_filter = $options->xss_filter;
}
if (!$xss_filter && isset($_GET['xss'])) {
if (!$xss_filter && (isset($_REQUEST['xss']) && $_REQUEST['xss'] !== '0')) {
die('XSS filtering is disabled in config');
}
@ -355,8 +386,8 @@ if (!$xss_filter && isset($_GET['xss'])) {
// Regex from https://gist.github.com/1217080
/////////////////////////////////////
$callback = null;
if ($format =='json' && isset($_GET['callback'])) {
$callback = trim($_GET['callback']);
if ($format =='json' && isset($_REQUEST['callback'])) {
$callback = trim($_REQUEST['callback']);
foreach (explode('.', $callback) as $_identifier) {
if (!preg_match('/^[a-zA-Z_$][0-9a-zA-Z_$]*(?:\[(?:".+"|\'.+\'|\d+)\])*?$/', $_identifier)) {
die('Invalid JSONP callback');
@ -365,20 +396,78 @@ if ($format =='json' && isset($_GET['callback'])) {
debug("JSONP callback: $callback");
}
///////////////////////////////////////////////
// Override default HTML parser?
///////////////////////////////////////////////
$parser = null;
if ($options->allow_parser_override && isset($_REQUEST['parser']) && in_array($_REQUEST['parser'], $options->allowed_parsers)) {
$parser = $_REQUEST['parser'];
}
///////////////////////////////////////////////
// Use proxy?
// Leaves $proxy as false (no proxy) or as the chosen entry
// of $options->proxy_servers.
///////////////////////////////////////////////
$proxy = false;
if (!empty($options->proxy_servers)) {
    if (isset($_REQUEST['proxy'])) {
        // We're choosing proxy based on &proxy value (unless it's not allowed...)
        if (!$options->allow_proxy_override) die('Proxy overriding is disabled.');
        $proxy = $_REQUEST['proxy'];
        if ($proxy === '0') {
            $proxy = false;
        } elseif ($proxy === '1') {
            $proxy = true; // random
        }
    } else {
        // We'll use proxy based on config setting
        $proxy = $options->proxy;
    }
    // Is it a valid value (false, true, or one of the proxies in config)
    // The name is looked up as an array key instead of with a loose in_array():
    // before PHP 8 any non-numeric string compares equal to an integer key 0,
    // and a non-scalar &proxy value (e.g. proxy[]=x) must never reach the
    // array index used below.
    if ($proxy !== false && $proxy !== true) {
        $_proxy_known = (is_string($proxy) || is_int($proxy)) && array_key_exists($proxy, $options->proxy_servers);
        if (!$_proxy_known) {
            die('Proxy not recognised.');
        }
        unset($_proxy_known);
    }
    if ($proxy === false) {
        debug('Proxy will not be used');
    } else {
        if ($proxy === true) {
            // pick one of the configured proxies at random
            $proxy = array_rand($options->proxy_servers);
        }
        // An entry configured as the string 'direct' means: connect without a proxy
        if (is_string($options->proxy_servers[$proxy]) && $options->proxy_servers[$proxy] === 'direct') {
            debug('Proxy will not be used');
            $proxy = false;
        } else {
            debug('Proxy '.$proxy.' will be used.');
            // from here on $proxy holds the config entry, not its name
            $proxy = $options->proxy_servers[$proxy];
        }
    }
}
//////////////////////////////////
// Enable Cross-Origin Resource Sharing (CORS)
//////////////////////////////////
if ($options->cors) header('Access-Control-Allow-Origin: *');
//////////////////////////////////
// Has the HTML been given in the request?
//////////////////////////////////
if (isset($_REQUEST['inputhtml']) && _FF_FTR_MODE == 'simple') {
// disable multi-page processing (what we have is what we have)
$options->singlepage = false;
$options->multipage = false;
// disable disk caching
$options->caching = false;
}
//////////////////////////////////
// Check for cached copy
//////////////////////////////////
if ($options->caching) {
debug('Caching is enabled...');
$cache_id = md5($max.$url.(int)$valid_key.$links.(int)$favour_feed_titles.(int)$options->content.(int)$options->summary.(int)$xss_filter.(int)$exclude_on_fail.$format.$detect_language.(int)isset($_GET['pubsub']));
$cache_id = md5($max.$url.(int)$valid_key.$links.(int)$favour_feed_titles.(int)$options->content.(int)$options->summary.
(int)$xss_filter.(int)$exclude_on_fail.$format.$detect_language.$parser._FF_FTR_MODE);
$check_cache = true;
if ($options->apc && $options->smart_cache) {
apc_add("cache.$cache_id", 0, 10*60);
apc_add("cache.$cache_id", 0, $options->cache_time*60);
$apc_cache_hits = (int)apc_fetch("cache.$cache_id");
$check_cache = ($apc_cache_hits >= 2);
apc_inc("cache.$cache_id");
@ -417,20 +506,37 @@ if ($options->caching) {
}
//////////////////////////////////
// Set Expires header
// Set cache header
//////////////////////////////////
if (!$debug_mode) {
header('Expires: ' . gmdate('D, d M Y H:i:s', time()+(60*10)) . ' GMT');
if ($options->cache_time) {
header('Cache-Control: public, max-age='.($options->cache_time*60));
header('Expires: '.gmdate('D, d M Y H:i:s', time()+($options->cache_time*60)).' GMT');
}
}
//////////////////////////////////
// Set up HTTP agent
//////////////////////////////////
$http = new HumbleHttpAgent();
$http->debug = $debug_mode;
$http->userAgentMap = $options->user_agents;
$http->headerOnlyTypes = array_keys($options->content_type_exc);
$http->rewriteUrls = $options->rewrite_url;
if (isset($_REQUEST['inputhtml']) && _FF_FTR_MODE == 'simple') {
// the user has supplied the HTML, so we use the Dummy agent with
// the given HTML (it will always return this HTML)
$http = new HumbleHttpAgentDummy($_REQUEST['inputhtml']);
} else {
$_req_options = null;
if ($proxy !== false) {
$_req_options = array('proxyhost' => $proxy['host']);
if (isset($proxy['auth'])) {
$_req_options['proxyauth'] = $proxy['auth'];
}
}
$http = new HumbleHttpAgent($_req_options);
$http->debug = $debug_mode;
$http->userAgentMap = $options->user_agents;
$http->headerOnlyTypes = array_keys($options->content_type_exc);
$http->rewriteUrls = $options->rewrite_url;
unset($_req_options);
}
//////////////////////////////////
// Set up Content Extractor
@ -441,6 +547,7 @@ SiteConfig::$debug = $debug_mode;
SiteConfig::use_apc($options->apc);
$extractor->fingerprints = $options->fingerprints;
$extractor->allowedParsers = $options->allowed_parsers;
$extractor->parserOverride = $parser;
////////////////////////////////
// Get RSS/Atom feed
@ -497,6 +604,7 @@ if ($html_only || !$result) {
public function get_language() { return false; }
public function get_image_url() { return false; }
public function get_items($start=0, $max=1) { return array(0=>$this->item); }
public function get_channel_tags($namespace='', $tag='') { return null; }
}
class DummySingleItem {
public $url;
@ -518,14 +626,16 @@ if ($html_only || !$result) {
// Create full-text feed
////////////////////////////////////////////
$output = new FeedWriter();
if (_FF_FTR_MODE === 'simple') $output->enableSimpleJson();
$output->setTitle(strip_tags($feed->get_title()));
$output->setDescription(strip_tags($feed->get_description()));
$output->setXsl('css/feed.xsl'); // Chrome uses this, most browsers ignore it
if ($valid_key && isset($_GET['pubsub'])) { // used only on fivefilters.org at the moment
$output->addHub('http://fivefilters.superfeedr.com/');
$output->addHub('http://pubsubhubbub.appspot.com/');
$output->setSelf('http://'.$_SERVER['HTTP_HOST'].$_SERVER['REQUEST_URI']);
$ttl = $feed->get_channel_tags(SIMPLEPIE_NAMESPACE_RSS_20, 'ttl');
if ($ttl !== null) {
$ttl = (int)$ttl[0]['data'];
$output->setTtl($ttl);
}
//$output->setSelf('http://'.$_SERVER['HTTP_HOST'].$_SERVER['REQUEST_URI']);
$output->setLink($feed->get_link()); // Google Reader uses this for pulling in favicons
if ($img_url = $feed->get_image_url()) {
$output->setImage($feed->get_title(), $feed->get_link(), $img_url);
@ -573,23 +683,19 @@ foreach ($items as $key => $item) {
}
$newitem = $output->createNewItem();
$newitem->setTitle($feed_item_title);
if ($valid_key && isset($_GET['pubsub'])) { // used only on fivefilters.org at the moment
if ($permalink !== false) {
$newitem->setLink('http://fivefilters.org/content-only/redirect.php?url='.urlencode($permalink));
} else {
$newitem->setLink('http://fivefilters.org/content-only/redirect.php?url='.urlencode($item->get_permalink()));
}
if ($permalink !== false) {
$newitem->setLink($permalink);
} else {
if ($permalink !== false) {
$newitem->setLink($permalink);
} else {
$newitem->setLink($item->get_permalink());
}
$newitem->setLink($item->get_permalink());
}
//if ($permalink && ($response = $http->get($permalink, true)) && $response['status_code'] < 300) {
// Allowing error codes - some sites return correct content with error status
// e.g. prospectmagazine.co.uk returns 403
if ($permalink && ($response = $http->get($permalink, true)) && ($response['status_code'] < 300 || $response['status_code'] > 400)) {
// Status codes to accept (200 range)
// Some sites might return correct content with error status codes
// e.g. prospectmagazine.co.uk returns 403 - in some earlier versions of FTR we accepted a wider range of status codes
// to allow for such cases:
//if ($permalink && ($response = $http->get($permalink, true)) && ($response['status_code'] < 300 || $response['status_code'] > 400)) {
// With the introduction of proxy support in 3.3, we're limiting range of acceptable status codes to avoid proxy
// errors being treated as valid responses.
if ($permalink && ($response = $http->get($permalink, true)) && ($response['status_code'] < 300)) {
$effective_url = $response['effective_url'];
if (!url_allowed($effective_url)) continue;
// check if action defined for returned Content-Type
@ -612,9 +718,16 @@ foreach ($items as $key => $item) {
// remove strange things
$html = str_replace('</[>', '', $html);
$html = convert_to_utf8($html, $response['headers']);
// if user has asked to see raw HTML from remote server, show it and exit.
if ($debug_show_raw_html) {
debug("Here are the HTTP response headers from the remote server:");
echo $response['headers'];
debug("Here's the raw HTML (after attempted UTF-8 conversion):");
die($html);
}
// check site config for single page URL - fetch it if found
$is_single_page = false;
if ($single_page_response = getSinglePage($item, $html, $effective_url)) {
if ($options->singlepage && ($single_page_response = getSinglePage($item, $html, $effective_url))) {
$is_single_page = true;
$effective_url = $single_page_response['effective_url'];
// check if action defined for returned Content-Type
@ -647,6 +760,11 @@ foreach ($items as $key => $item) {
debug('Attempting to extract content');
$extract_result = $extractor->process($html, $effective_url);
$readability = $extractor->readability;
// if user has asked to see parsed HTML, show it and exit.
if ($debug_show_parsed_html) {
debug("Here's the full HTML after it's been parsed by Full-Text RSS:");
die($readability->dom->saveXML($readability->dom->documentElement));
}
$content_block = ($extract_result) ? $extractor->getContent() : null;
$extracted_title = ($extract_result) ? $extractor->getTitle() : '';
// Deal with multi-page articles
@ -731,6 +849,14 @@ foreach ($items as $key => $item) {
if (($links == 'footnotes') && (strpos($effective_url, 'wikipedia.org') === false)) {
$readability->addFootnotes($content_block);
}
// normalise (merges adjacent text nodes, so a whitespace run is a single node)
$content_block->normalize();
// remove empty (whitespace-only) text nodes.
// childNodes is a live list: calling removeChild() on the current node while
// iterating it directly with foreach disrupts the iteration, so later empty
// text nodes can be left behind. Iterate over a snapshot of the list instead,
// so removals cannot affect which nodes are visited.
foreach (iterator_to_array($content_block->childNodes) as $_n) {
if ($_n->nodeType === XML_TEXT_NODE && trim($_n->textContent) === '') {
$content_block->removeChild($_n);
}
}
// remove nesting: <div><div><div><p>test</p></div></div></div> = <p>test</p>
while ($content_block->childNodes->length == 1 && $content_block->firstChild->nodeType === XML_ELEMENT_NODE) {
// only follow these tag names
@ -740,8 +866,10 @@ foreach ($items as $key => $item) {
}
// convert content block to HTML string
// Need to preserve things like body: //img[@id='feature']
if (in_array(strtolower($content_block->tagName), array('div', 'article', 'section', 'header', 'footer'))) {
if (in_array(strtolower($content_block->tagName), array('div', 'article', 'section', 'header', 'footer', 'li', 'td'))) {
$html = $content_block->innerHTML;
//} elseif (in_array(strtolower($content_block->tagName), array('td', 'li'))) {
// $html = '<div>'.$content_block->innerHTML.'</div>';
} else {
$html = $content_block->ownerDocument->saveXML($content_block); // essentially outerHTML
}
@ -758,11 +886,7 @@ foreach ($items as $key => $item) {
}
}
if ($valid_key && isset($_GET['pubsub'])) { // used only on fivefilters.org at the moment
$newitem->addElement('guid', 'http://fivefilters.org/content-only/redirect.php?url='.urlencode($item->get_permalink()), array('isPermaLink'=>'false'));
} else {
$newitem->addElement('guid', $item->get_permalink(), array('isPermaLink'=>'true'));
}
$newitem->addElement('guid', $item->get_permalink(), array('isPermaLink'=>'true'));
// filter xss?
if ($xss_filter) {
@ -935,16 +1059,20 @@ if (!$debug_mode) {
// apc purge code adapted from http://www.thimbleopensource.com/tutorials-snippets/php-apc-expunge-script
$_apc_data = apc_cache_info('user');
foreach ($_apc_data['cache_list'] as $_apc_item) {
if ($_apc_item['ttl'] > 0 && ($_apc_item['ttl'] + $_apc_item['creation_time'] < time())) {
apc_delete($_apc_item['info']);
}
// APCu keys incompatible with original APC keys, apparently fixed in newer versions, but not in 4.0.4
// So let's look for those keys and fix here (ctime -> creation_time, key -> info).
if (isset($_apc_item['ctime'])) $_apc_item['creation_time'] = $_apc_item['ctime'];
if (isset($_apc_item['key'])) $_apc_item['info'] = $_apc_item['key'];
if ($_apc_item['ttl'] > 0 && ($_apc_item['ttl'] + $_apc_item['creation_time'] < time())) {
apc_delete($_apc_item['info']);
}
}
}
}
}
if ($add_to_cache) {
ob_start();
$output->genarateFeed();
$output->generateFeed();
$output = ob_get_contents();
ob_end_clean();
if ($html_only && $item_count == 0) {
@ -955,7 +1083,7 @@ if (!$debug_mode) {
}
echo $output;
} else {
$output->genarateFeed();
$output->generateFeed();
}
if ($callback) echo ');';
}
@ -1022,8 +1150,7 @@ function url_allowed($url) {
// (uses HTTP headers and HTML to find encoding)
// adapted from http://stackoverflow.com/questions/910793/php-detect-encoding-and-make-everything-utf-8
//////////////////////////////////////////////
function convert_to_utf8($html, $header=null)
{
function convert_to_utf8($html, $header=null) {
$encoding = null;
if ($html || $header) {
if (is_array($header)) $header = implode("\n", $header);
@ -1245,7 +1372,7 @@ function get_cache() {
static $cache = null;
if ($cache === null) {
$frontendOptions = array(
'lifetime' => 10*60, // cache lifetime of 10 minutes
'lifetime' => $options->cache_time*60, // cache lifetime
'automatic_serialization' => false,
'write_control' => false,
'automatic_cleaning_factor' => $options->cache_cleanup,

View File

@ -1,3 +1,2 @@
<?php
// this is here to prevent directory listing over the web
?>
// this is here to prevent directory listing over the web

View File

@ -3,17 +3,19 @@ body: //div[@id = 'bodyContent']
strip_id_or_class: editsection
#strip_id_or_class: toc
strip_id_or_class: vertical-navbox
strip: //table[@id='toc']
strip: //*[@id='toc']
strip: //div[@id='catlinks']
strip: //div[@id='jump-to-nav']
strip: //div[@class='thumbcaption']//div[@class='magnify']
strip: //table[@class='navbox']
strip: //table[contains(@class, 'infobox')]
#strip: //table[contains(@class, 'infobox')]
strip: //div[@class='dablink']
strip: //div[@id='contentSub']
strip: //table[contains(@class, 'metadata')]
strip: //*[contains(@class, 'noprint')]
strip: //span[@title='pronunciation:']
strip: //span[@class='noexcerpt']
prune: no
tidy: no
test_url: http://en.wikipedia.org/wiki/Christopher_Lloyd
test_url: http://en.wikipedia.org/wiki/Christopher_Lloyd
test_url: https://en.wikipedia.org/wiki/Ronnie_James_Dio
test_url: https://en.wikipedia.org/wiki/Metallica

View File

@ -0,0 +1,36 @@
Full-Text RSS site config files
================
[Full-Text RSS](http://fivefilters.org/content-only/), our article extraction tool, makes use of site-specific extraction rules to improve results. Each time a URL is processed, it checks to see if there are extraction rules for the site being processed. If there are no site patterns, it tries to detect the content block automatically.
This repository contains the site config files we use in Full-Text RSS.
### Contributing changes
We chose GitHub for this set of files because they offer one feature which we hope will make contributing changes easier: [file editing](https://github.com/blog/844-forking-with-the-edit-button) through the web interface.
You can now make changes to any of our site config files and request that your changes be pulled into the main set we maintain. This is what GitHub calls the Fork and Pull model:
> The Fork & Pull Model lets anyone fork an existing repository and push changes to their personal fork without requiring access be granted to the source repository. The changes must then be pulled into the source repository by the project maintainer. This model reduces the amount of friction for new contributors and is popular with open source projects because it allows people to work independently without upfront coordination.
When we receive a pull request we'll review the changes and if everything's okay we'll update our copy.
If a site is not in our set, you can create a file for it in the same way. See [Creating files on GitHub](https://github.com/blog/1327-creating-files-on-github).
### How to write a site config file
Please see our [help page](http://help.fivefilters.org/customer/portal/articles/223153-site-patterns) for a brief guide. We hope to have some tutorials up soon.
### Instapaper
When we introduced site patterns, we chose to adopt the [same format](http://blog.instapaper.com/post/730281947) used by Instapaper. This allows us to make use of the existing extraction rules contributed by Instapaper users.
Marco, Instapaper's creator, graciously opened up the database of contributions to everyone:
> And, recognizing that your efforts could be useful to a wide range of other tools and services, I'll make the list of all of these site-specific configurations available to the public, free, with no strings attached.
Most of the extraction rules in our set are borrowed from Instapaper. You can see the list maintained by Instapaper at [instapaper.com/bodytext/](http://instapaper.com/bodytext/) (login required).
### Testing site config files
Currently you need your own copy of Full-Text RSS to test changes to the site config files. In the future we will try to make this process easier.

View File

@ -1 +1 @@
2013-05-12T22:53:07Z
2014-05-05T08:36:15Z