diff --git a/admin/apc.php b/admin/apc.php new file mode 100644 index 0000000..45896c2 --- /dev/null +++ b/admin/apc.php @@ -0,0 +1,1383 @@ + | + | Rasmus Lerdorf | + | Ilia Alshanetsky | + +----------------------------------------------------------------------+ + + All other licensing and usage conditions are those of the PHP Group. + + */ + +$VERSION='$Id: apc.php 307048 2011-01-03 23:53:17Z kalle $'; + +////////// READ OPTIONAL CONFIGURATION FILE //////////// +if (file_exists("apc.conf.php")) include("apc.conf.php"); +//////////////////////////////////////////////////////// + +//////////////////////////////// +// Load config file +//////////////////////////////// +$admin_page = 'apc'; +require_once('../config.php'); +require_once('require_login.php'); +require_once('template.php'); +if (!isset($_REQUEST['IMG'])) tpl_header('APC'); + +////////// BEGIN OF DEFAULT CONFIG AREA /////////////////////////////////////////////////////////// +defaults('USE_AUTHENTICATION',0); // Use (internal) authentication - best choice if + // no other authentication is available + // If set to 0: + // There will be no further authentication. You + // will have to handle this by yourself! + // If set to 1: + // You need to change ADMIN_PASSWORD to make + // this work! +//defaults('ADMIN_USERNAME','admin'); // Admin Username +//defaults('ADMIN_PASSWORD',''); // Admin Password - CHANGE THIS TO ENABLE!!! + +// (beckerr) I'm using a clear text password here, because I've no good idea how to let +// users generate a md5 or crypt password in a easy way to fill it in above + +//defaults('DATE_FORMAT', "d.m.Y H:i:s"); // German +defaults('DATE_FORMAT', 'Y/m/d H:i:s'); // US + +defaults('GRAPH_SIZE',200); // Image size + +//defaults('PROXY', 'tcp://127.0.0.1:8080'); + +////////// END OF DEFAULT CONFIG AREA ///////////////////////////////////////////////////////////// + + +// "define if not defined" +function defaults($d,$v) { + if (!defined($d)) define($d,$v); // or just @define(...) 
+} + +// rewrite $PHP_SELF to block XSS attacks +// +$PHP_SELF= isset($_SERVER['PHP_SELF']) ? htmlentities(strip_tags($_SERVER['PHP_SELF'],''), ENT_QUOTES, 'UTF-8') : ''; +$time = time(); +$host = php_uname('n'); +if($host) { $host = '('.$host.')'; } +if (isset($_SERVER['SERVER_ADDR'])) { + $host .= ' ('.$_SERVER['SERVER_ADDR'].')'; +} + +// operation constants +define('OB_HOST_STATS',1); +define('OB_SYS_CACHE',2); +define('OB_USER_CACHE',3); +define('OB_SYS_CACHE_DIR',4); +define('OB_VERSION_CHECK',9); + +// check validity of input variables +$vardom=array( + 'OB' => '/^\d+$/', // operational mode switch + 'CC' => '/^[01]$/', // clear cache requested + 'DU' => '/^.*$/', // Delete User Key + 'SH' => '/^[a-z0-9]+$/', // shared object description + + 'IMG' => '/^[123]$/', // image to generate + 'LO' => '/^1$/', // login requested + + 'COUNT' => '/^\d+$/', // number of line displayed in list + 'SCOPE' => '/^[AD]$/', // list view scope + 'SORT1' => '/^[AHSMCDTZ]$/', // first sort key + 'SORT2' => '/^[DA]$/', // second sort key + 'AGGR' => '/^\d+$/', // aggregation by dir level + 'SEARCH' => '~^[a-zA-Z0-1/_.-]*$~' // aggregation by dir level +); + +// default cache mode +$cache_mode='opcode'; + +// cache scope +$scope_list=array( + 'A' => 'cache_list', + 'D' => 'deleted_list' +); + +// handle POST and GET requests +if (empty($_REQUEST)) { + if (!empty($_GET) && !empty($_POST)) { + $_REQUEST = array_merge($_GET, $_POST); + } else if (!empty($_GET)) { + $_REQUEST = $_GET; + } else if (!empty($_POST)) { + $_REQUEST = $_POST; + } else { + $_REQUEST = array(); + } +} + +// check parameter syntax +foreach($vardom as $var => $dom) { + if (!isset($_REQUEST[$var])) { + $MYREQUEST[$var]=NULL; + } else if (!is_array($_REQUEST[$var]) && preg_match($dom.'D',$_REQUEST[$var])) { + $MYREQUEST[$var]=$_REQUEST[$var]; + } else { + $MYREQUEST[$var]=$_REQUEST[$var]=NULL; + } +} + +// check parameter sematics +if (empty($MYREQUEST['SCOPE'])) $MYREQUEST['SCOPE']="A"; +if 
(empty($MYREQUEST['SORT1'])) $MYREQUEST['SORT1']="H"; +if (empty($MYREQUEST['SORT2'])) $MYREQUEST['SORT2']="D"; +if (empty($MYREQUEST['OB'])) $MYREQUEST['OB']=OB_HOST_STATS; +if (!isset($MYREQUEST['COUNT'])) $MYREQUEST['COUNT']=20; +if (!isset($scope_list[$MYREQUEST['SCOPE']])) $MYREQUEST['SCOPE']='A'; + +$MY_SELF= + "$PHP_SELF". + "?SCOPE=".$MYREQUEST['SCOPE']. + "&SORT1=".$MYREQUEST['SORT1']. + "&SORT2=".$MYREQUEST['SORT2']. + "&COUNT=".$MYREQUEST['COUNT']; +$MY_SELF_WO_SORT= + "$PHP_SELF". + "?SCOPE=".$MYREQUEST['SCOPE']. + "&COUNT=".$MYREQUEST['COUNT']; + +// authentication needed? +// +if (!USE_AUTHENTICATION) { + $AUTHENTICATED=1; +} else { + $AUTHENTICATED=0; + if (ADMIN_PASSWORD!='password' && ($MYREQUEST['LO'] == 1 || isset($_SERVER['PHP_AUTH_USER']))) { + + if (!isset($_SERVER['PHP_AUTH_USER']) || + !isset($_SERVER['PHP_AUTH_PW']) || + $_SERVER['PHP_AUTH_USER'] != ADMIN_USERNAME || + $_SERVER['PHP_AUTH_PW'] != ADMIN_PASSWORD) { + Header("WWW-Authenticate: Basic realm=\"APC Login\""); + Header("HTTP/1.0 401 Unauthorized"); + + echo << +

Rejected!

+ Wrong Username or Password!
 
  + Continue... + +EOB; + exit; + + } else { + $AUTHENTICATED=1; + } + } +} + +// select cache mode +if ($AUTHENTICATED && $MYREQUEST['OB'] == OB_USER_CACHE) { + $cache_mode='user'; +} +// clear cache +if ($AUTHENTICATED && isset($MYREQUEST['CC']) && $MYREQUEST['CC']) { + apc_clear_cache($cache_mode); +} + +if ($AUTHENTICATED && !empty($MYREQUEST['DU'])) { + apc_delete($MYREQUEST['DU']); +} + +if(!function_exists('apc_cache_info') || !($cache=@apc_cache_info($cache_mode))) { + echo "No cache info available. APC does not appear to be running."; + exit; +} + +$cache_user = apc_cache_info('user', 1); +$mem=apc_sma_info(); +if(!$cache['num_hits']) { $cache['num_hits']=1; $time++; } // Avoid division by 0 errors on a cache clear + +// don't cache this page +// +header("Cache-Control: no-store, no-cache, must-revalidate"); // HTTP/1.1 +header("Cache-Control: post-check=0, pre-check=0", false); +header("Pragma: no-cache"); // HTTP/1.0 + +function duration($ts) { + global $time; + $years = (int)((($time - $ts)/(7*86400))/52.177457); + $rem = (int)(($time-$ts)-($years * 52.177457 * 7 * 86400)); + $weeks = (int)(($rem)/(7*86400)); + $days = (int)(($rem)/86400) - $weeks*7; + $hours = (int)(($rem)/3600) - $days*24 - $weeks*7*24; + $mins = (int)(($rem)/60) - $hours*60 - $days*24*60 - $weeks*7*24*60; + $str = ''; + if($years==1) $str .= "$years year, "; + if($years>1) $str .= "$years years, "; + if($weeks==1) $str .= "$weeks week, "; + if($weeks>1) $str .= "$weeks weeks, "; + if($days==1) $str .= "$days day,"; + if($days>1) $str .= "$days days,"; + if($hours == 1) $str .= " $hours hour and"; + if($hours>1) $str .= " $hours hours and"; + if($mins == 1) $str .= " 1 minute"; + else $str .= " $mins minutes"; + return $str; +} + +// create graphics +// +function graphics_avail() { + return extension_loaded('gd'); +} +if (isset($MYREQUEST['IMG'])) +{ + if (!graphics_avail()) { + exit(0); + } + + function fill_arc($im, $centerX, $centerY, $diameter, $start, $end, 
$color1,$color2,$text='',$placeindex=0) { + $r=$diameter/2; + $w=deg2rad((360+$start+($end-$start)/2)%360); + + + if (function_exists("imagefilledarc")) { + // exists only if GD 2.0.1 is avaliable + imagefilledarc($im, $centerX+1, $centerY+1, $diameter, $diameter, $start, $end, $color1, IMG_ARC_PIE); + imagefilledarc($im, $centerX, $centerY, $diameter, $diameter, $start, $end, $color2, IMG_ARC_PIE); + imagefilledarc($im, $centerX, $centerY, $diameter, $diameter, $start, $end, $color1, IMG_ARC_NOFILL|IMG_ARC_EDGED); + } else { + imagearc($im, $centerX, $centerY, $diameter, $diameter, $start, $end, $color2); + imageline($im, $centerX, $centerY, $centerX + cos(deg2rad($start)) * $r, $centerY + sin(deg2rad($start)) * $r, $color2); + imageline($im, $centerX, $centerY, $centerX + cos(deg2rad($start+1)) * $r, $centerY + sin(deg2rad($start)) * $r, $color2); + imageline($im, $centerX, $centerY, $centerX + cos(deg2rad($end-1)) * $r, $centerY + sin(deg2rad($end)) * $r, $color2); + imageline($im, $centerX, $centerY, $centerX + cos(deg2rad($end)) * $r, $centerY + sin(deg2rad($end)) * $r, $color2); + imagefill($im,$centerX + $r*cos($w)/2, $centerY + $r*sin($w)/2, $color2); + } + if ($text) { + if ($placeindex>0) { + imageline($im,$centerX + $r*cos($w)/2, $centerY + $r*sin($w)/2,$diameter, $placeindex*12,$color1); + imagestring($im,4,$diameter, $placeindex*12,$text,$color1); + + } else { + imagestring($im,4,$centerX + $r*cos($w)/2, $centerY + $r*sin($w)/2,$text,$color1); + } + } + } + + function text_arc($im, $centerX, $centerY, $diameter, $start, $end, $color1,$text,$placeindex=0) { + $r=$diameter/2; + $w=deg2rad((360+$start+($end-$start)/2)%360); + + if ($placeindex>0) { + imageline($im,$centerX + $r*cos($w)/2, $centerY + $r*sin($w)/2,$diameter, $placeindex*12,$color1); + imagestring($im,4,$diameter, $placeindex*12,$text,$color1); + + } else { + imagestring($im,4,$centerX + $r*cos($w)/2, $centerY + $r*sin($w)/2,$text,$color1); + } + } + + function fill_box($im, $x, $y, $w, $h, 
$color1, $color2,$text='',$placeindex='') { + global $col_black; + $x1=$x+$w-1; + $y1=$y+$h-1; + + imagerectangle($im, $x, $y1, $x1+1, $y+1, $col_black); + if($y1>$y) imagefilledrectangle($im, $x, $y, $x1, $y1, $color2); + else imagefilledrectangle($im, $x, $y1, $x1, $y, $color2); + imagerectangle($im, $x, $y1, $x1, $y, $color1); + if ($text) { + if ($placeindex>0) { + + if ($placeindex<16) + { + $px=5; + $py=$placeindex*12+6; + imagefilledrectangle($im, $px+90, $py+3, $px+90-4, $py-3, $color2); + imageline($im,$x,$y+$h/2,$px+90,$py,$color2); + imagestring($im,2,$px,$py-6,$text,$color1); + + } else { + if ($placeindex<31) { + $px=$x+40*2; + $py=($placeindex-15)*12+6; + } else { + $px=$x+40*2+100*intval(($placeindex-15)/15); + $py=($placeindex%15)*12+6; + } + imagefilledrectangle($im, $px, $py+3, $px-4, $py-3, $color2); + imageline($im,$x+$w,$y+$h/2,$px,$py,$color2); + imagestring($im,2,$px+2,$py-6,$text,$color1); + } + } else { + imagestring($im,4,$x+5,$y1-16,$text,$color1); + } + } + } + + + $size = GRAPH_SIZE; // image size + if ($MYREQUEST['IMG']==3) + $image = imagecreate(2*$size+150, $size+10); + else + $image = imagecreate($size+50, $size+10); + + $col_white = imagecolorallocate($image, 0xFF, 0xFF, 0xFF); + $col_red = imagecolorallocate($image, 0xD0, 0x60, 0x30); + $col_green = imagecolorallocate($image, 0x60, 0xF0, 0x60); + $col_black = imagecolorallocate($image, 0, 0, 0); + imagecolortransparent($image,$col_white); + + switch ($MYREQUEST['IMG']) { + + case 1: + $s=$mem['num_seg']*$mem['seg_size']; + $a=$mem['avail_mem']; + $x=$y=$size/2; + $fuzz = 0.000001; + + // This block of code creates the pie chart. It is a lot more complex than you + // would expect because we try to visualize any memory fragmentation as well. 
+ $angle_from = 0; + $string_placement=array(); + for($i=0; $i<$mem['num_seg']; $i++) { + $ptr = 0; + $free = $mem['block_lists'][$i]; + uasort($free, 'block_sort'); + foreach($free as $block) { + if($block['offset']!=$ptr) { // Used block + $angle_to = $angle_from+($block['offset']-$ptr)/$s; + if(($angle_to+$fuzz)>1) $angle_to = 1; + if( ($angle_to*360) - ($angle_from*360) >= 1) { + fill_arc($image,$x,$y,$size,$angle_from*360,$angle_to*360,$col_black,$col_red); + if (($angle_to-$angle_from)>0.05) { + array_push($string_placement, array($angle_from,$angle_to)); + } + } + $angle_from = $angle_to; + } + $angle_to = $angle_from+($block['size'])/$s; + if(($angle_to+$fuzz)>1) $angle_to = 1; + if( ($angle_to*360) - ($angle_from*360) >= 1) { + fill_arc($image,$x,$y,$size,$angle_from*360,$angle_to*360,$col_black,$col_green); + if (($angle_to-$angle_from)>0.05) { + array_push($string_placement, array($angle_from,$angle_to)); + } + } + $angle_from = $angle_to; + $ptr = $block['offset']+$block['size']; + } + if ($ptr < $mem['seg_size']) { // memory at the end + $angle_to = $angle_from + ($mem['seg_size'] - $ptr)/$s; + if(($angle_to+$fuzz)>1) $angle_to = 1; + fill_arc($image,$x,$y,$size,$angle_from*360,$angle_to*360,$col_black,$col_red); + if (($angle_to-$angle_from)>0.05) { + array_push($string_placement, array($angle_from,$angle_to)); + } + } + } + foreach ($string_placement as $angle) { + text_arc($image,$x,$y,$size,$angle[0]*360,$angle[1]*360,$col_black,bsize($s*($angle[1]-$angle[0]))); + } + break; + + case 2: + $s=$cache['num_hits']+$cache['num_misses']; + $a=$cache['num_hits']; + + fill_box($image, 30,$size,50,-$a*($size-21)/$s,$col_black,$col_green,sprintf("%.1f%%",$cache['num_hits']*100/$s)); + fill_box($image,130,$size,50,-max(4,($s-$a)*($size-21)/$s),$col_black,$col_red,sprintf("%.1f%%",$cache['num_misses']*100/$s)); + break; + + case 3: + $s=$mem['num_seg']*$mem['seg_size']; + $a=$mem['avail_mem']; + $x=130; + $y=1; + $j=1; + + // This block of code creates the bar 
chart. It is a lot more complex than you + // would expect because we try to visualize any memory fragmentation as well. + for($i=0; $i<$mem['num_seg']; $i++) { + $ptr = 0; + $free = $mem['block_lists'][$i]; + uasort($free, 'block_sort'); + foreach($free as $block) { + if($block['offset']!=$ptr) { // Used block + $h=(GRAPH_SIZE-5)*($block['offset']-$ptr)/$s; + if ($h>0) { + $j++; + if($j<75) fill_box($image,$x,$y,50,$h,$col_black,$col_red,bsize($block['offset']-$ptr),$j); + else fill_box($image,$x,$y,50,$h,$col_black,$col_red); + } + $y+=$h; + } + $h=(GRAPH_SIZE-5)*($block['size'])/$s; + if ($h>0) { + $j++; + if($j<75) fill_box($image,$x,$y,50,$h,$col_black,$col_green,bsize($block['size']),$j); + else fill_box($image,$x,$y,50,$h,$col_black,$col_green); + } + $y+=$h; + $ptr = $block['offset']+$block['size']; + } + if ($ptr < $mem['seg_size']) { // memory at the end + $h = (GRAPH_SIZE-5) * ($mem['seg_size'] - $ptr) / $s; + if ($h > 0) { + fill_box($image,$x,$y,50,$h,$col_black,$col_red,bsize($mem['seg_size']-$ptr),$j++); + } + } + } + break; + case 4: + $s=$cache['num_hits']+$cache['num_misses']; + $a=$cache['num_hits']; + + fill_box($image, 30,$size,50,-$a*($size-21)/$s,$col_black,$col_green,sprintf("%.1f%%",$cache['num_hits']*100/$s)); + fill_box($image,130,$size,50,-max(4,($s-$a)*($size-21)/$s),$col_black,$col_red,sprintf("%.1f%%",$cache['num_misses']*100/$s)); + break; + + } + header("Content-type: image/png"); + imagepng($image); + exit; +} + +// pretty printer for byte values +// +function bsize($s) { + foreach (array('','K','M','G') as $i => $k) { + if ($s < 1024) break; + $s/=1024; + } + return sprintf("%5.1f %sBytes",$s,$k); +} + +// sortable table header in "scripts for this host" view +function sortheader($key,$name,$extra='') { + global $MYREQUEST, $MY_SELF_WO_SORT; + + if ($MYREQUEST['SORT1']==$key) { + $MYREQUEST['SORT2'] = $MYREQUEST['SORT2']=='A' ? 
'D' : 'A'; + } + return "$name"; + +} + +// create menu entry +function menu_entry($ob,$title) { + global $MYREQUEST,$MY_SELF; + if ($MYREQUEST['OB']!=$ob) { + return "
  • $title
  • "; + } else if (empty($MYREQUEST['SH'])) { + return "
  • $title
  • "; + } else { + return "
  • $title
  • "; + } +} + +function put_login_link($s="Login") +{ + global $MY_SELF,$MYREQUEST,$AUTHENTICATED; + // needs ADMIN_PASSWORD to be changed! + // + if (!USE_AUTHENTICATION) { + return; + } else if (ADMIN_PASSWORD=='password') + { + print <<$s +EOB; + } else if ($AUTHENTICATED) { + print <<$s +EOB; + } +} + +function block_sort($array1, $array2) +{ + if ($array1['offset'] > $array2['offset']) { + return 1; + } else { + return -1; + } +} + + +?> + + +APC INFO <?php echo $host ?> +*/ +?> + + + +
    +

    + +
    Opcode Cache
    +

    + +
    +
    + +
  • Refresh Data
  • +EOB; +echo + menu_entry(1,'View Host Stats'), + menu_entry(2,'System Cache Entries'); +if ($AUTHENTICATED) { + echo menu_entry(4,'Per-Directory Entries'); +} +echo + menu_entry(3,'User Cache Entries'), + menu_entry(9,'Version Check'); + +echo << +EOB; + +if ($AUTHENTICATED) { + echo <<Clear $cache_mode Cache +EOB; +} + +// CONTENT +echo << +EOB; + +// MAIN SWITCH STATEMENT + +switch ($MYREQUEST['OB']) { + + + + + +// ----------------------------------------------- +// Host Stats +// ----------------------------------------------- +case OB_HOST_STATS: + $mem_size = $mem['num_seg']*$mem['seg_size']; + $mem_avail= $mem['avail_mem']; + $mem_used = $mem_size-$mem_avail; + $seg_size = bsize($mem['seg_size']); + $req_rate = sprintf("%.2f",($cache['num_hits']+$cache['num_misses'])/($time-$cache['start_time'])); + $hit_rate = sprintf("%.2f",($cache['num_hits'])/($time-$cache['start_time'])); + $miss_rate = sprintf("%.2f",($cache['num_misses'])/($time-$cache['start_time'])); + $insert_rate = sprintf("%.2f",($cache['num_inserts'])/($time-$cache['start_time'])); + $req_rate_user = sprintf("%.2f",($cache_user['num_hits']+$cache_user['num_misses'])/($time-$cache_user['start_time'])); + $hit_rate_user = sprintf("%.2f",($cache_user['num_hits'])/($time-$cache_user['start_time'])); + $miss_rate_user = sprintf("%.2f",($cache_user['num_misses'])/($time-$cache_user['start_time'])); + $insert_rate_user = sprintf("%.2f",($cache_user['num_inserts'])/($time-$cache_user['start_time'])); + $apcversion = phpversion('apc'); + $phpversion = phpversion(); + $number_files = $cache['num_entries']; + $size_files = bsize($cache['mem_size']); + $number_vars = $cache_user['num_entries']; + $size_vars = bsize($cache_user['mem_size']); + $i=0; + echo <<< EOB +

    General Cache Information

    + + + +EOB; + + if(!empty($_SERVER['SERVER_NAME'])) + echo "\n"; + if(!empty($_SERVER['SERVER_SOFTWARE'])) + echo "\n"; + + echo << +EOB; + echo ''; + echo ''; + echo ''; + echo <<
    APC Version$apcversion
    PHP Version$phpversion
    APC Host{$_SERVER['SERVER_NAME']} $host
    Server Software{$_SERVER['SERVER_SOFTWARE']}
    Shared Memory{$mem['num_seg']} Segment(s) with $seg_size +
    ({$cache['memory_type']} memory, {$cache['locking_type']} locking) +
    Start Time',date(DATE_FORMAT,$cache['start_time']),'
    Uptime',duration($cache['start_time']),'
    File Upload Support',$cache['file_upload_progress'],'
    +
    + +

    File Cache Information

    + + + + + + + + + +
    Cached Files$number_files ($size_files)
    Hits{$cache['num_hits']}
    Misses{$cache['num_misses']}
    Request Rate (hits, misses)$req_rate cache requests/second
    Hit Rate$hit_rate cache requests/second
    Miss Rate$miss_rate cache requests/second
    Insert Rate$insert_rate cache requests/second
    Cache full count{$cache['expunges']}
    +
    + +

    User Cache Information

    + + + + + + + + + + +
    Cached Variables$number_vars ($size_vars)
    Hits{$cache_user['num_hits']}
    Misses{$cache_user['num_misses']}
    Request Rate (hits, misses)$req_rate_user cache requests/second
    Hit Rate$hit_rate_user cache requests/second
    Miss Rate$miss_rate_user cache requests/second
    Insert Rate$insert_rate_user cache requests/second
    Cache full count{$cache_user['expunges']}
    +
    + +

    Runtime Settings

    +EOB; + + $j = 0; + foreach (ini_get_all('apc') as $k => $v) { + echo "\n"; + $j = 1 - $j; + } + + if($mem['num_seg']>1 || $mem['num_seg']==1 && count($mem['block_lists'][0])>1) + $mem_note = "Memory Usage
    (multiple slices indicate fragments)"; + else + $mem_note = "Memory Usage"; + + echo <<< EOB +
    ",$k,"",str_replace(',',',
    ',$v['local_value']),"
    +
    + +

    Host Status Diagrams

    + +EOB; + $size='width='.(GRAPH_SIZE+50).' height='.(GRAPH_SIZE+10); + echo << + + + +EOB; + + echo + graphics_avail() ? + ''. + "". + "\n" + : "", + '', + '\n", + '\n", + '', + '', + '\n", + '\n"; + echo <<< EOB + +
    $mem_noteHits & Misses
    \"\"\"\"
     Free: ',bsize($mem_avail).sprintf(" (%.1f%%)",$mem_avail*100/$mem_size)," Hits: ',$cache['num_hits'].sprintf(" (%.1f%%)",$cache['num_hits']*100/($cache['num_hits']+$cache['num_misses'])),"
     Used: ',bsize($mem_used ).sprintf(" (%.1f%%)",$mem_used *100/$mem_size)," Misses: ',$cache['num_misses'].sprintf(" (%.1f%%)",$cache['num_misses']*100/($cache['num_hits']+$cache['num_misses'])),"
    + +
    +

    Detailed Memory Usage and Fragmentation

    + + + + +EOB; + if(isset($mem['adist'])) { + foreach($mem['adist'] as $i=>$v) { + $cur = pow(2,$i); $nxt = pow(2,$i+1)-1; + if($i==0) $range = "1"; + else $range = "$cur - $nxt"; + echo "\n"; + } + } + echo <<

    +EOB; + + // Fragementation: (freeseg - 1) / total_seg + $nseg = $freeseg = $fragsize = $freetotal = 0; + for($i=0; $i<$mem['num_seg']; $i++) { + $ptr = 0; + foreach($mem['block_lists'][$i] as $block) { + if ($block['offset'] != $ptr) { + ++$nseg; + } + $ptr = $block['offset'] + $block['size']; + /* Only consider blocks <5M for the fragmentation % */ + if($block['size']<(5*1024*1024)) $fragsize+=$block['size']; + $freetotal+=$block['size']; + } + $freeseg += count($mem['block_lists'][$i]); + } + + if ($freeseg > 1) { + $frag = sprintf("%.2f%% (%s out of %s in %d fragments)", ($fragsize/$freetotal)*100,bsize($fragsize),bsize($freetotal),$freeseg); + } else { + $frag = "0%"; + } + + if (graphics_avail()) { + $size='width='.(2*GRAPH_SIZE+150).' height='.(GRAPH_SIZE+10); + echo << +EOB; + } + echo <<Fragmentation: $frag +
    $range$v
    +
    +EOB; + + break; + + +// ----------------------------------------------- +// User Cache Entries +// ----------------------------------------------- +case OB_USER_CACHE: + if (!$AUTHENTICATED) { + echo '
    You need to login to see the user values here!
     
    '; + put_login_link("Login now!"); + echo '
    '; + break; + } + $fieldname='info'; + $fieldheading='User Entry Label'; + $fieldkey='info'; + +// ----------------------------------------------- +// System Cache Entries +// ----------------------------------------------- +case OB_SYS_CACHE: + if (!isset($fieldname)) + { + $fieldname='filename'; + $fieldheading='Script Filename'; + if(ini_get("apc.stat")) $fieldkey='inode'; + else $fieldkey='filename'; + } + if (!empty($MYREQUEST['SH'])) + { + echo <<< EOB +
    + +EOB; + + $m=0; + foreach($scope_list as $j => $list) { + foreach($cache[$list] as $i => $entry) { + if (md5($entry[$fieldkey])!=$MYREQUEST['SH']) continue; + foreach($entry as $k => $value) { + if (!$AUTHENTICATED) { + // hide all path entries if not logged in + $value=preg_replace('/^.*(\\/|\\\\)/','<hidden>/',$value); + } + + if ($k == "num_hits") { + $value=sprintf("%s (%.2f%%)",$value,$value*100/$cache['num_hits']); + } + if ($k == 'deletion_time') { + if(!$entry['deletion_time']) $value = "None"; + } + echo + "", + "", + "", + ""; + $m=1-$m; + } + if($fieldkey=='info') { + echo "\n"; + } + break; + } + } + + echo <<
    AttributeValue
    ",ucwords(preg_replace("/_/"," ",$k)),"",(preg_match("/time/",$k) && $value!='None') ? date(DATE_FORMAT,$value) : htmlspecialchars($value, ENT_QUOTES, 'UTF-8'),"
    Stored Value
    ";
    +					$output = var_export(apc_fetch($entry[$fieldkey]),true);
    +					echo htmlspecialchars($output, ENT_QUOTES, 'UTF-8');
    +					echo "
    +
    +EOB; + break; + } + + $cols=6; + echo <<
    Scope: + + ", + " Sorting: ', + ' ', + '', + '
    ', + 'Search: ', + ' ', + '
    '; + + if (isset($MYREQUEST['SEARCH'])) { + // Don't use preg_quote because we want the user to be able to specify a + // regular expression subpattern. + $MYREQUEST['SEARCH'] = '/'.str_replace('/', '\\/', $MYREQUEST['SEARCH']).'/i'; + if (preg_match($MYREQUEST['SEARCH'], 'test') === false) { + echo '
    Error: enter a valid regular expression as a search query.
    '; + break; + } + } + + echo + '
    ', + '', + '', + '', + '', + '', + '', + ''; + + if($fieldname=='info') { + $cols+=2; + echo ''; + } + echo ''; + + // builds list with alpha numeric sortable keys + // + $list = array(); + foreach($cache[$scope_list[$MYREQUEST['SCOPE']]] as $i => $entry) { + switch($MYREQUEST['SORT1']) { + case 'A': $k=sprintf('%015d-',$entry['access_time']); break; + case 'H': $k=sprintf('%015d-',$entry['num_hits']); break; + case 'Z': $k=sprintf('%015d-',$entry['mem_size']); break; + case 'M': $k=sprintf('%015d-',$entry['mtime']); break; + case 'C': $k=sprintf('%015d-',$entry['creation_time']); break; + case 'T': $k=sprintf('%015d-',$entry['ttl']); break; + case 'D': $k=sprintf('%015d-',$entry['deletion_time']); break; + case 'S': $k=''; break; + } + if (!$AUTHENTICATED) { + // hide all path entries if not logged in + $list[$k.$entry[$fieldname]]=preg_replace('/^.*(\\/|\\\\)/','*hidden*/',$entry); + } else { + $list[$k.$entry[$fieldname]]=$entry; + } + } + + if ($list) { + + // sort list + // + switch ($MYREQUEST['SORT2']) { + case "A": krsort($list); break; + case "D": ksort($list); break; + } + + // output list + $i=0; + foreach($list as $k => $entry) { + if(!$MYREQUEST['SEARCH'] || preg_match($MYREQUEST['SEARCH'], $entry[$fieldname]) != 0) { + $field_value = htmlentities(strip_tags($entry[$fieldname],''), ENT_QUOTES, 'UTF-8'); + echo + '', + "', + '', + '', + '', + '', + ''; + + if($fieldname=='info') { + if($entry['ttl']) + echo ''; + else + echo ''; + } + if ($entry['deletion_time']) { + + echo ''; + } else if ($MYREQUEST['OB'] == OB_USER_CACHE) { + + echo ''; + } else { + echo ''; + } + echo ''; + $i++; + if ($i == $MYREQUEST['COUNT']) + break; + } + } + + } else { + echo ''; + } + echo <<< EOB +
    ',sortheader('S',$fieldheading, "&OB=".$MYREQUEST['OB']),'',sortheader('H','Hits', "&OB=".$MYREQUEST['OB']),'',sortheader('Z','Size', "&OB=".$MYREQUEST['OB']),'',sortheader('A','Last accessed',"&OB=".$MYREQUEST['OB']),'',sortheader('M','Last modified',"&OB=".$MYREQUEST['OB']),'',sortheader('C','Created at', "&OB=".$MYREQUEST['OB']),'',sortheader('T','Timeout',"&OB=".$MYREQUEST['OB']),'',sortheader('D','Deleted at',"&OB=".$MYREQUEST['OB']),'
    ",$field_value,'',$entry['num_hits'],'',$entry['mem_size'],'',date(DATE_FORMAT,$entry['access_time']),'',date(DATE_FORMAT,$entry['mtime']),'',date(DATE_FORMAT,$entry['creation_time']),''.$entry['ttl'].' secondsNone', date(DATE_FORMAT,$entry['deletion_time']), ''; + echo '[Delete Now]'; + echo '  
    No data
    +EOB; + + if ($list && $i < count($list)) { + echo "",count($list)-$i,' more available...'; + } + + echo <<< EOB +
    +EOB; + break; + + +// ----------------------------------------------- +// Per-Directory System Cache Entries +// ----------------------------------------------- +case OB_SYS_CACHE_DIR: + if (!$AUTHENTICATED) { + break; + } + + echo <<
    Scope: + + ", + " Sorting: ', + ' ', + ' ', + "Group By Dir Level: ', + ' ', + '
    ', + + '
    ', + '', + '', + '', + '', + '', + '', + '', + ''; + + // builds list with alpha numeric sortable keys + // + $tmp = $list = array(); + foreach($cache[$scope_list[$MYREQUEST['SCOPE']]] as $entry) { + $n = dirname($entry['filename']); + if ($MYREQUEST['AGGR'] > 0) { + $n = preg_replace("!^(/?(?:[^/\\\\]+[/\\\\]){".($MYREQUEST['AGGR']-1)."}[^/\\\\]*).*!", "$1", $n); + } + if (!isset($tmp[$n])) { + $tmp[$n] = array('hits'=>0,'size'=>0,'ents'=>0); + } + $tmp[$n]['hits'] += $entry['num_hits']; + $tmp[$n]['size'] += $entry['mem_size']; + ++$tmp[$n]['ents']; + } + + foreach ($tmp as $k => $v) { + switch($MYREQUEST['SORT1']) { + case 'A': $kn=sprintf('%015d-',$v['size'] / $v['ents']);break; + case 'T': $kn=sprintf('%015d-',$v['ents']); break; + case 'H': $kn=sprintf('%015d-',$v['hits']); break; + case 'Z': $kn=sprintf('%015d-',$v['size']); break; + case 'C': $kn=sprintf('%015d-',$v['hits'] / $v['ents']);break; + case 'S': $kn = $k; break; + } + $list[$kn.$k] = array($k, $v['ents'], $v['hits'], $v['size']); + } + + if ($list) { + + // sort list + // + switch ($MYREQUEST['SORT2']) { + case "A": krsort($list); break; + case "D": ksort($list); break; + } + + // output list + $i = 0; + foreach($list as $entry) { + echo + '', + "', + '', + '', + '', + '', + '', + ''; + + if (++$i == $MYREQUEST['COUNT']) break; + } + + } else { + echo ''; + } + echo <<< EOB +
    ',sortheader('S','Directory Name', "&OB=".$MYREQUEST['OB']),'',sortheader('T','Number of Files',"&OB=".$MYREQUEST['OB']),'',sortheader('H','Total Hits', "&OB=".$MYREQUEST['OB']),'',sortheader('Z','Total Size', "&OB=".$MYREQUEST['OB']),'',sortheader('C','Avg. Hits', "&OB=".$MYREQUEST['OB']),'',sortheader('A','Avg. Size', "&OB=".$MYREQUEST['OB']),'
    ",$entry[0],'',$entry[1],'',$entry[2],'',$entry[3],'',round($entry[2] / $entry[1]),'',round($entry[3] / $entry[1]),'
    No data
    +EOB; + + if ($list && $i < count($list)) { + echo "",count($list)-$i,' more available...'; + } + + echo <<< EOB +
    +EOB; + break; + +// ----------------------------------------------- +// Version check +// ----------------------------------------------- +case OB_VERSION_CHECK: + echo <<

    APC Version Information

    + + + + +EOB; + if (defined('PROXY')) { + $ctxt = stream_context_create( array( 'http' => array( 'proxy' => PROXY, 'request_fulluri' => True ) ) ); + $rss = @file_get_contents("http://pecl.php.net/feeds/pkg_apc.rss", False, $ctxt); + } else { + $rss = @file_get_contents("http://pecl.php.net/feeds/pkg_apc.rss"); + } + if (!$rss) { + echo ''; + } else { + $apcversion = phpversion('apc'); + + preg_match('!APC ([0-9.]+)!', $rss, $match); + echo ''; + echo ''; + } + echo <<< EOB +
    Unable to fetch version information.
    '; + if (version_compare($apcversion, $match[1], '>=')) { + echo '
    You are running the latest version of APC ('.$apcversion.')
    '; + $i = 3; + } else { + echo '
    You are running an older version of APC ('.$apcversion.'), + newer version '.$match[1].' is available at + http://pecl.php.net/package/APC/'.$match[1].' +
    '; + $i = -1; + } + echo '

    Change Log:


    '; + + preg_match_all('!<(title|description)>([^<]+)!', $rss, $match); + next($match[2]); next($match[2]); + + while (list(,$v) = each($match[2])) { + list(,$ver) = explode(' ', $v, 2); + if ($i < 0 && version_compare($apcversion, $ver, '>=')) { + break; + } else if (!$i--) { + break; + } + echo "".htmlspecialchars($v, ENT_QUOTES, 'UTF-8')."
    "; + echo nl2br(htmlspecialchars(current($match[2]), ENT_QUOTES, 'UTF-8'))."
    "; + next($match[2]); + } + echo '
    + +EOB; + break; + +} + +echo <<< EOB + +EOB; + +?> + + + + +*/ \ No newline at end of file diff --git a/admin/edit-pattern.php b/admin/edit-pattern.php index b5b7285..821f219 100644 --- a/admin/edit-pattern.php +++ b/admin/edit-pattern.php @@ -3,7 +3,7 @@ // Author: Keyvan Minoukadeh // Copyright (c) 2013 Keyvan Minoukadeh // License: AGPLv3 -// Date: 2013-02-25 +// Date: 2013-05-09 // More info: http://fivefilters.org/content-only/ // Help: http://help.fivefilters.org @@ -57,7 +57,7 @@ require_once('require_login.php'); require_once('template.php'); tpl_header('Edit site patterns'); -$version = include('../site_config/standard/version.php'); +$version = file_get_contents('../site_config/standard/version.txt'); function filter_only_text($filename) { return (strtolower(substr($filename, -4)) == '.txt'); diff --git a/admin/index.php b/admin/index.php index b6ef012..ef5599c 100644 --- a/admin/index.php +++ b/admin/index.php @@ -40,4 +40,8 @@ tpl_header('Admin'); ?>

    The admin pages are intended to help you manage your copy of Full-Text RSS more easily.

    -

    We currently offer an experimental update tool which you can use to update your site patterns.

    \ No newline at end of file +
      +
    • Update patterns: an easy way to keep site config files up to date.
    • +
    • Edit patterns: need to fine-tune extraction for a certain site? Use this tool.
    • +
    • APC: If APC is enabled, you can use this tool to see what Full-Text RSS caches, and clear the cache if you need to.
    • +
    \ No newline at end of file diff --git a/admin/login.php b/admin/login.php index 24152d8..b0ff3dc 100644 --- a/admin/login.php +++ b/admin/login.php @@ -1,4 +1,5 @@ admin_credentials) || $options->admin_credentials['username'] == '' || $options->admin_credentials['password'] == '') { diff --git a/admin/require_login.php b/admin/require_login.php index e504f1f..0aa7382 100644 --- a/admin/require_login.php +++ b/admin/require_login.php @@ -1,9 +1,9 @@ password -$users = array($options->admin_credentials['username'] => $options->admin_credentials['password']); - -if (empty($_SERVER['PHP_AUTH_DIGEST'])) { - header('HTTP/1.1 401 Unauthorized'); - header('WWW-Authenticate: Digest realm="'.$realm. - '",qop="auth",nonce="'.uniqid().'",opaque="'.md5($realm).'"'); - - die('If you can\'t remember your admin credentials, open your custom_config.php and you\'ll find them in there.'); -} - - -// analyze the PHP_AUTH_DIGEST variable -if (!($data = http_digest_parse($_SERVER['PHP_AUTH_DIGEST'])) || - !isset($users[$data['username']])) - die('Wrong credentials!'); - - -// generate the valid response -$A1 = md5($data['username'] . ':' . $realm . ':' . $users[$data['username']]); -$A2 = md5($_SERVER['REQUEST_METHOD'].':'.$data['uri']); -$valid_response = md5($A1.':'.$data['nonce'].':'.$data['nc'].':'.$data['cnonce'].':'.$data['qop'].':'.$A2); - -if ($data['response'] != $valid_response) - die('Wrong credentials!'); - -// ok, valid username & password -// echo 'Thanks! You are now logged in.'; -unset($realm, $users, $data, $A1, $A2, $valid_response); - -// function to parse the http auth header -function http_digest_parse($txt) -{ - // protect against missing data - $needed_parts = array('nonce'=>1, 'nc'=>1, 'cnonce'=>1, 'qop'=>1, 'username'=>1, 'uri'=>1, 'response'=>1); - $data = array(); - $keys = implode('|', array_keys($needed_parts)); - - preg_match_all('@(' . $keys . 
')=(?:([\'"])([^\2]+?)\2|([^\s,]+))@', $txt, $matches, PREG_SET_ORDER); - - foreach ($matches as $m) { - $data[$m[1]] = $m[3] ? $m[3] : $m[4]; - unset($needed_parts[$m[1]]); - } - - return $needed_parts ? false : $data; -} -*/ -?> \ No newline at end of file +} \ No newline at end of file diff --git a/admin/template.php b/admin/template.php index 9a733ec..3d56942 100644 --- a/admin/template.php +++ b/admin/template.php @@ -40,6 +40,7 @@ global $admin_page; diff --git a/admin/update.php b/admin/update.php index f3eaf00..27655a2 100644 --- a/admin/update.php +++ b/admin/update.php @@ -1,9 +1,9 @@ . // Usage // ----- -// Access this file in your browser and follow the instructions to update your site config files. +// * Access this file in your browser and follow the instructions to update your site config files. +// * See section on automatic updates for a URL you can fetch periodically (e.g. with cron) to update site config files error_reporting(E_ALL ^ E_NOTICE); ini_set("display_errors", 1); @@ -35,20 +36,38 @@ ini_set("display_errors", 1); //////////////////////////////// $admin_page = 'update'; require_once('../config.php'); -require_once('require_login.php'); -require_once('template.php'); +require_once 'template.php'; tpl_header('Update site patterns'); -$version = include('../site_config/standard/version.php'); +////////////////////////////////// +// Username and password must be available +////////////////////////////////// +if (!isset($options->admin_credentials) || $options->admin_credentials['username'] == '' || $options->admin_credentials['password'] == '') { + header("X-Robots-Tag: noindex, nofollow", true); + + die('

    Username and password not set

    Full-Text RSS has not been configured with admin credentials.

    If you are the administrator, please edit your custom_config.php file and enter the credentials in the appropriate section. When you\'ve done that, this page will prompt you for your admin credentials.

    '); +} +$admin_hash = sha1($options->admin_credentials['username'].'+'.$options->admin_credentials['password']); + +$_self_host = $_SERVER['HTTP_HOST']; +$_self_path = rtrim(dirname($_SERVER['SCRIPT_NAME']), '/\\'); +$self_update_url = 'http://'.htmlspecialchars($_self_host.$_self_path).'/update.php?key='.urlencode($admin_hash); + +$latest_remote = 'https://codeload.github.com/fivefilters/ftr-site-config/zip/master'; +$version = @file_get_contents('../site_config/standard/version.txt'); ///////////////////////////////// -// Check for valid update key +// Check for update key ///////////////////////////////// if (!isset($_REQUEST['key']) || trim($_REQUEST['key']) == '') { + + require_once 'require_login.php'; + if ($_SERVER['REQUEST_METHOD'] == 'POST') { header('Location: update.php'); exit; - } + } + $auto = true; $no_auto_reasons = array(); if (!class_exists('ZipArchive')) { @@ -59,14 +78,8 @@ if (!isset($_REQUEST['key']) || trim($_REQUEST['key']) == '') { $auto = false; $no_auto_reasons[] = 'your site_config/ folder is not writable - change permissions to 777 and try again.

    '; } - if (!file_exists('../site_config/standard/version.php')) { - die('Could not determine current version of your site pattern files (site_config/standard/version.php). Make sure you\'re using at least version 2.9.5 of Full-Text RSS.'); - } - if (!@$options->registration_key) { - $input_field = ''; - } else { - $reg_key = preg_replace('/[^a-z0-9-]/i', '', $options->registration_key); - $input_field = ''; + if (!file_exists('../site_config/standard/version.txt')) { + die('Could not determine current version of your site pattern files (site_config/standard/version.txt). Make sure you\'re using at least version 3.2 of Full-Text RSS.'); } ?>

    You have Full-Text RSS @@ -74,15 +87,19 @@ if (!isset($_REQUEST['key']) || trim($_REQUEST['key']) == '') {

    To see if you have the latest versions, check for updates.

    Registration key

    This update tool requires a registration key issued by FiveFilters.org. You do not need a registration key to use Full-Text RSS, and none of the regular funtionality is affected if you do not have one. The update tool is simply a convenience service we offer our customers.

    '; if ($auto) { - echo '

    This update tool will attempt to fetch the latest site patterns from FiveFilters.org and update yours.

    '; + echo '

    This update tool will attempt to fetch the latest site patterns from our GitHub repository.

    '; echo '

    Important: if you\'ve modified or added your own config files in the site_config/standard/ folder, please move them to site_config/custom/ — the update process will attempt to replace everything in site_config/standard/ with our updated version.

    '; - echo $reg_key_info; - if (!isset($reg_key)) { - echo '

    Your registration key should be your PayPal or Avangate transaction ID. If you don\'t have a registration key, you will get one sent to you automatically when you purchase Full-Text RSS from FiveFilters.org.

    '; - } - echo '
    ',$input_field,'
    '; + echo '
    '; + echo ''; + echo ''; + echo '
    '; + echo '

    Automatic updates

    '; + echo '

    You can schedule automatic updates using something like cron. The URL to call is:

    '; + echo '

    '.$self_update_url.'

    '; + echo '

    We recommend you schedule this URL to be fetched once a day. If you do not have access to a scheduling service '; + echo 'you may want to consider one of these online services: Easycron, SetCronJob, onlinecronjobs.com.

    '; + echo '

    Note: the key contained in the URL is a hash value generated from your admin credentials. If you change these, the key will also change.

    '; } else { echo '
    '; echo '

    We cannot automatically update your site pattern files because:

    '; @@ -93,37 +110,56 @@ if (!isset($_REQUEST['key']) || trim($_REQUEST['key']) == '') { echo ''; echo '

    You can still manually update by downloading the zip file and replacing everything in your site_config/standard/ folder with the contents of the zip file.

    '; echo '
    '; - echo $reg_key_info; - if (!isset($reg_key)) { - echo '

    Enter your registration key below to download the latest version of the site config files from FiveFilters.org

    '; - echo '

    Your registration key should be your PayPal or Avangate transaction ID.

    '; - } - echo '
    ',$input_field,'
    '; + echo '

    Download site config files (zip)

    '; } echo '

    Help

    '; echo '

    If you have any trouble, please contact us via our support site.

    '; exit; } +////////////////////////////////// +// Check update key valid +////////////////////////////////// +if ($_REQUEST['key'] !== $admin_hash) { + println("Sorry, invalid key supplied."); + exit; +} + ////////////////////////////////// // Check for updates ////////////////////////////////// -$ff_version = (float)@file_get_contents('http://fivefilters.org/content-only/site_config/standard/version.txt'); -if (version_compare($version, $ff_version) != -1) { +//$ff_version = @file_get_contents('http://fivefilters.org/content-only/site_config/standard/version.txt'); +$_context = stream_context_create(array('http' => array('user_agent' => 'PHP/5.4'))); +$latest_info_json = @file_get_contents('https://api.github.com/repos/fivefilters/ftr-site-config', false, $_context); +if (!$latest_info_json) { + println("Sorry, couldn't get info on latest site config files. Please try again later or contact us."); + exit; +} +$latest_info_json = @json_decode($latest_info_json); +if (!is_object($latest_info_json)) { + println("Sorry, couldn't parse JSON from GitHub. Please try again later or contact us."); + exit; +} +$ff_version = $latest_info_json->updated_at; +if ($version == $ff_version) { die('Your site config files are up to date! 
If you have trouble extracting from a particular site, please email us: help@fivefilters.org'); } else { - println("Updated site patterns are available at FiveFilters.org (version $ff_version)..."); + println("Updated site patterns are available (version $ff_version)..."); } ////////////////////////////////// // Prepare ////////////////////////////////// -$latest_remote = 'http://fivefilters.org/content-only/update/get_site_config.php?key='.urlencode($_REQUEST['key']); $tmp_latest_local = '../site_config/latest_site_config.zip'; $tmp_latest_local_dir = '../site_config/standard_latest'; $tmp_old_local_dir = '../site_config/standard_old'; if (file_exists($tmp_latest_local)) unlink($tmp_latest_local); -if (file_exists($tmp_latest_local_dir)) rrmdir($tmp_latest_local_dir); +if (file_exists($tmp_latest_local_dir)) { + if (!rrmdir($tmp_latest_local_dir)) { + println("Sorry, couldn't remove old folder from last update"); + exit; + } +} if (file_exists($tmp_old_local_dir)) { rrmdir($tmp_old_local_dir); } @@ -133,11 +169,8 @@ $standard_local_dir = '../site_config/standard/'; @file_put_contents($tmp_latest_local, @file_get_contents($latest_remote)); $headers = implode("\n", $http_response_header); //var_dump($headers); exit; -if (strpos($headers, 'HTTP/1.1 403') !== false) { - println("Invalid registration key supplied"); - exit; -} elseif (strpos($headers, 'HTTP/1.1 200') === false) { - println("Sorry, something went wrong. We're looking into it. Please contact us if the problem persists."); +if (strpos($headers, 'HTTP/1.0 200') === false) { + println("Sorry, something went wrong. 
Please contact us if the problem persists."); exit; } if (class_exists('ZipArchive') && file_exists($tmp_latest_local)) { @@ -149,15 +182,33 @@ if (class_exists('ZipArchive') && file_exists($tmp_latest_local)) { @unlink($tmp_latest_local); if (file_exists($tmp_latest_local_dir)) { println("Unzipped contents to $tmp_latest_local_dir"); - if (!file_exists($tmp_latest_local_dir.'/version.php')) { + if (!file_exists($tmp_latest_local_dir.'/ftr-site-config-master/README.md')) { println("There was a problem extracting the latest site patterns archive - your current site patterns remain untouched."); println("Please update manually."); exit; } + @file_put_contents($tmp_latest_local_dir.'/ftr-site-config-master/version.txt', $ff_version); + if (!file_exists($tmp_latest_local_dir.'/ftr-site-config-master/version.txt')) { + println("There was a problem writing the new version number - your current site patterns remain untouched."); + println("Please update manually."); + exit; + } rename($standard_local_dir, $tmp_old_local_dir); if (file_exists($tmp_old_local_dir)) println("Renamed $standard_local_dir to $tmp_old_local_dir"); - rename($tmp_latest_local_dir, $standard_local_dir); - if (file_exists($standard_local_dir)) println("Renamed $tmp_latest_local_dir to $standard_local_dir"); + rename($tmp_latest_local_dir."/ftr-site-config-master", $standard_local_dir); + if (file_exists($standard_local_dir)) println("Renamed $tmp_latest_local_dir/ftr-site-config-master to $standard_local_dir"); + rmdir($tmp_latest_local_dir); + // clear cached site config files from APC + if ($options->apc && function_exists('apc_delete') && function_exists('apc_cache_info')) { + $_apc_data = apc_cache_info('user'); + foreach ($_apc_data['cache_list'] as $_apc_item) { + if (substr($_apc_item['info'], 0, 3) == 'sc.') { + apc_delete($_apc_item['info']); + } + } + println('Cleared site config cache in APC.'); + } + // all done! println("All done! 
Your old site config files are in $tmp_old_local_dir — these will be removed next time you go through the update process."); } else { if (file_exists($tmp_latest_local)) @unlink($tmp_latest_local); @@ -179,13 +230,12 @@ function println($txt) { } function rrmdir($dir) { - foreach(glob($dir . '/{*.txt,*.php,.*.txt,.*.php}', GLOB_BRACE|GLOB_NOSORT) as $file) { + foreach(glob($dir . '/{*.txt,*.php,.*.txt,.*.php,.gitattributes,.gitignore,ftr-site-config-master,README.md}', GLOB_BRACE|GLOB_NOSORT) as $file) { if(is_dir($file)) { rrmdir($file); } else { unlink($file); } } - rmdir($dir); -} -?> \ No newline at end of file + return rmdir($dir); +} \ No newline at end of file diff --git a/changelog.txt b/changelog.txt index 9b78221..a497bb4 100644 --- a/changelog.txt +++ b/changelog.txt @@ -2,11 +2,24 @@ FiveFilters.org: Full-Text RSS http://fivefilters.org/content-only/ CHANGELOG ------------------------------------ +3.2 (2013-05-14) + - A short excerpt from the first few lines of the extracted content can now be included in the output (pass &summary=1 in querystring, see $options->summary in config file for more info) + - Full content can now be excluded from the output (pass &content=0 in querystring, see $options->content in config file for more info) + - Site config files can now be automatically updated from our GitHub repository (URL to call visible in admin area) + - Site config files updated for better extraction + - PHP Readability updated to be more lenient when pruning HTML + - Language detection library updated + - HTML meta refresh redirects now also followed + - APC stats (if APC is available on your server) now visible in admin area + - Bug fix: Duplicate find_string and replace_string values in site config files no longer removed (thanks Fabrizio!) 
+ - Bug fix: MIME type actions now applied when following single page URLs + - Other minor fixes/improvements + 3.1 (2013-03-06) - PHP Readability updated to preserve more images/videos - Site config files updated for better extraction - SimplePie updated - - New site config option favour_feed_titles and request parameter use_extracted_title to allow extracted titles to be used in generated feed + - New config option favour_feed_titles and request parameter use_extracted_title to allow extracted titles to be used in generated feed - Remove image lazy loading (looks for markup used by http://wordpress.org/extend/plugins/lazy-load/) - elements appearing inside elements are now preserved in generated feed - elements now preserved diff --git a/config.php b/config.php index c0dbf7d..d382f4f 100644 --- a/config.php +++ b/config.php @@ -45,6 +45,60 @@ $options->default_entries = 5; // 10, only 10 will be processed. $options->max_entries = 10; +// Full content +// ---------------------- +// By default Full-Text RSS includes the extracted content in the output. +// You can exclude this from the output by passing '&content=0' in the querystring. +// +// Possible values... +// Always include: true +// Never include: false +// Include unless user overrides (&content=0): 'user' (default) +// +// Note: currently this does not disable full content extraction. It simply omits it +// from the output. +$options->content = 'user'; + +// Excerpts +// ---------------------- +// By default Full-Text RSS does not include excerpts in the output. +// You can enable this by passing '&summary=1' in the querystring. +// This will include a plain text excerpt from the extracted content. +// +// Possible values... 
+// Always include: true (recommended for new users) +// Never include: false +// Don't include unless user overrides (&summary=1): 'user' (default) +// +// Important: if both content and excerpts are requested, the excerpt will be +// placed in the description element and the full content inside content:encoded. +// If excerpts are not requested, the full content will go inside the description element. +// +// Why are we not returning both excerpts and content by default? +// Mainly for backward compatibility. +// Excerpts should appear in the feed item's description element. Previous versions +// of Full-Text RSS did not return excerpts, so the description element was always +// used for the full content (as recommended by the RSS advisory). When returning both, +// we need somewhere else to place the content (content:encoded). +// Having both enabled should not create any problems for news readers, but it may create +// problems for developers upgrading from one of our earlier versions who may now find +// their applications are returning excerpts instead of the full content they were +// expecting. To avoid such surprises for users who are upgrading Full-Text RSS, +// excerpts must be explicitly requested in the querystring by default. +// +// Why not use a different element name for excerpts? +// According to the RSS advisory: +// "Publishers who employ summaries should store the summary in description and +// the full content in content:encoded, ordering description first within the item. +// On items with no summary, the full content should be stored in description." +// See: http://www.rssboard.org/rss-profile#namespace-elements-content-encoded +// +// For more consistent element naming, we recommend new users set this option to true. 
+// The full content can still be excluded via the querystring, but the element names +// will not change: when $options->summary = true, the description element will always +// be reserved for the excerpt and content:encoded always for full content. +$options->summary = 'user'; + // Rewrite relative URLs // ---------------------- // With this enabled relative URLs found in the extracted content @@ -373,7 +427,7 @@ $options->cache_cleanup = 100; /// DO NOT CHANGE ANYTHING BELOW THIS /////////// ///////////////////////////////////////////////// -if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '3.1'); +if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '3.2'); if (basename(__FILE__) == 'config.php') { if (file_exists(dirname(__FILE__).'/custom_config.php')) { diff --git a/css/feed.xsl b/css/feed.xsl index 216d34e..9211f8e 100644 --- a/css/feed.xsl +++ b/css/feed.xsl @@ -1,5 +1,5 @@ - + @@ -22,7 +22,12 @@
  • -
    +
    + + + + +
  • diff --git a/ftr_compatibility_test.php b/ftr_compatibility_test.php index 0aada2c..bb19148 100644 --- a/ftr_compatibility_test.php +++ b/ftr_compatibility_test.php @@ -8,12 +8,15 @@ to it at www.example.com/ftr_compatibility_test.php 2) Open your web browser and go to the page you just uploaded. +If things don't look right, have a look at our hosting suggestions: +http://help.fivefilters.org/customer/portal/articles/1143210-hosting + Note: This compatibility test has been borrowed (and slightly adapted) from the one supplied by SimplePie.org. We have kept most of their checks intact as we use SimplePie in our application. http://github.com/simplepie/simplepie/tree/master/compatibility_test/ */ -$app_name = 'Full-Text RSS 3.1'; +$app_name = 'Full-Text RSS 3.2'; $php_ok = (function_exists('version_compare') && version_compare(phpversion(), '5.2.0', '>=')); $pcre_ok = extension_loaded('pcre'); @@ -129,33 +132,33 @@ em strong { text-transform: uppercase; } -table#chart { +table.chart { border-collapse:collapse; } -table#chart th { +table.chart th { background-color:#eee; padding:2px 3px; border:1px solid #fff; } -table#chart td { +table.chart td { text-align:center; padding:2px 3px; border:1px solid #eee; } -table#chart tr.enabled td { +table.chart tr.enabled td { /* Leave this alone */ } -table#chart tr.disabled td, -table#chart tr.disabled td a { +table.chart tr.disabled td, +table.chart tr.disabled td a { color:#999; font-style:italic; } -table#chart tr.disabled td a { +table.chart tr.disabled td a { text-decoration:underline; } @@ -186,7 +189,7 @@ div.chunk {

    : Compatibility Test

    - +
    @@ -253,7 +256,7 @@ div.chunk {
    Test
    - +

    What does this mean?

      @@ -292,7 +295,7 @@ div.chunk {
    1. Tidy: You have Tidy support installed. No problems here.
    2. -
    3. Tidy: The Tidy extension is not available. should still work with most feeds/articles, but you may experience problems with some. If you do, we suggest you specify parsing with html5lib.
    4. +
    5. Tidy: The Tidy extension is not available. should still work with most feeds/articles, but you may experience problems with some.
    6. @@ -341,12 +344,58 @@ div.chunk {

      Note: Passing this test does not guarantee that will run on your webhost — it only ensures that the basic requirements have been addressed. If you experience any problems, please let us know.

      Bottom Line: We're sorry…

      -

      Your webhost does not support the minimum requirements for . It may be a good idea to contact your webhost and point them to the results of this test. They may be able to enable/install the required components.

      +

      Your webhost does not support the minimum requirements for . It may be a good idea to contact your webhost and point them to the results of this test. They may be able to enable/install the required components.

      Alternatively, you can try one of our recommended hosts.

    +
    +

    Further info

    +

    HTTP module

    +

    Full-Text RSS can make use of HttpRequestPool or curl_multi to make parallel HTTP requests when processing feeds. If neither is available, it will make sequential requests using file_get_contents.

    + +

    will be used on this server.

    + +

    Alternative PHP Cache (APC)

    +

    Full-Text RSS can make use of APC's memory cache to store site config files (when requested for the first time). This is not required, but if available it may improve performance slightly by reducing disk access.

    + APC is available on this server.

    '; + } else { + echo '

    APC is not available on this server.

    '; + } + ?> + +

    Language detection

    +

    Full-Text RSS can detect the language of each article processed. This occurs using Text_LanguageDetect or PHP-CLD (if available).

    + = 0)) { + echo '

    PHP-CLD will be used on this server.

    '; + } else { + echo '

    Text_LanguageDetect will be used on this server.

    '; + } + ?> + +

    Automatic site config updates

    +

    Full-Text RSS can be configured to update its site config files (which determine how content should be extracted for certain sites) by downloading the latest set from our GitHub repository. This functionality is not required, and can be done manually. To configure this to occur automatically, you will need zip support enabled in PHP - we make use of the ZipArchive class.

    + ZipArchive is not available on this server. To update the site config files you will need to do it manually by downloading the latest set and uploading it to your server.

    '; + } else { + echo '

    ZipArchive is available on this server.

    '; + } + ?> +
    +

    This compatibility test has been borrowed (and slightly adapted) from the one supplied by SimplePie.org. We have kept most of their checks intact as we use SimplePie in our application.

    +

    Date:

    diff --git a/index.php b/index.php index 00c1e56..31d35a2 100644 --- a/index.php +++ b/index.php @@ -115,7 +115,16 @@ if (!defined('_FF_FTR_INDEX')) { + summary == 'user') { ?>
    + +
    + +
    +
    + + +
    @@ -170,7 +179,7 @@ if (!defined('_FF_FTR_INDEX')) {

    To change the configuration, save a copy of config.php as custom_config.php and make any changes you like to it.To change the configuration, edit custom_config.php and make any changes you like.

    Manage and update site config files

    -

    For best results, we suggest you update the site config files bundled with Full-Text RSS. If you've purchased Full-Text RSS from us, you'll receive an email when these are updated.

    +

    For best results, we suggest you update the site config files bundled with Full-Text RSS.

    The easiest way to update these is via the admin area. (For advanced users, you'll also be able to edit and test the extraction rules contained in the site config files from the admin area.)

    Customise this page

    @@ -253,9 +262,9 @@ if (!defined('_FF_FTR_INDEX')) {

    Your version of Full-Text RSS:
    diff --git a/libraries/content-extractor/ContentExtractor.php b/libraries/content-extractor/ContentExtractor.php index b004e19..21e693e 100644 --- a/libraries/content-extractor/ContentExtractor.php +++ b/libraries/content-extractor/ContentExtractor.php @@ -230,7 +230,7 @@ class ContentExtractor $this->debug("...XPath match: $pattern"); // remove title from document try { - $elems->item(0)->parentNode->removeChild($elems->item(0)); + @$elems->item(0)->parentNode->removeChild($elems->item(0)); } catch (DOMException $e) { // do nothing } @@ -724,5 +724,4 @@ class ContentExtractor public function getNextPageUrl() { return $this->nextPageUrl; } -} -?> \ No newline at end of file +} \ No newline at end of file diff --git a/libraries/content-extractor/SiteConfig.php b/libraries/content-extractor/SiteConfig.php index b73f1d7..1f6a760 100644 --- a/libraries/content-extractor/SiteConfig.php +++ b/libraries/content-extractor/SiteConfig.php @@ -5,10 +5,10 @@ * Each instance of this class should hold extraction patterns and other directives * for a website. See ContentExtractor class to see how it's used. 
* - * @version 0.7 - * @date 2012-08-27 + * @version 0.8 + * @date 2013-04-16 * @author Keyvan Minoukadeh - * @copyright 2012 Keyvan Minoukadeh + * @copyright 2013 Keyvan Minoukadeh * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3 */ @@ -180,7 +180,7 @@ class SiteConfig public function append(SiteConfig $newconfig) { // check for commands where we accept multiple statements (no test_url) - foreach (array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header', 'find_string', 'replace_string') as $var) { + foreach (array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'http_header') as $var) { // append array elements for this config variable from $newconfig to this config //$this->$var = $this->$var + $newconfig->$var; $this->$var = array_unique(array_merge($this->$var, $newconfig->$var)); @@ -190,6 +190,12 @@ class SiteConfig foreach (array('tidy', 'prune', 'parser', 'autodetect_on_failure') as $var) { if ($this->$var === null) $this->$var = $newconfig->$var; } + // treat find_string and replace_string separately (don't apply array_unique) (thanks fabrizio!) 
+ foreach (array('find_string', 'replace_string') as $var) { + // append array elements for this config variable from $newconfig to this config + //$this->$var = $this->$var + $newconfig->$var; + $this->$var = array_merge($this->$var, $newconfig->$var); + } } // returns SiteConfig instance if an appropriate one is found, false otherwise @@ -334,5 +340,4 @@ class SiteConfig } return $config; } -} -?> \ No newline at end of file +} \ No newline at end of file diff --git a/libraries/feedwriter/FeedWriter.php b/libraries/feedwriter/FeedWriter.php index 4b2fd17..0cd1ea0 100644 --- a/libraries/feedwriter/FeedWriter.php +++ b/libraries/feedwriter/FeedWriter.php @@ -110,6 +110,11 @@ define('JSONP', 3, true); } } + public function &getItems() + { + return $this->items; + } + /** * Create a new FeedItem. * @@ -239,7 +244,7 @@ define('JSONP', 3, true); { $out = ''."\n"; if ($this->xsl) $out .= 'xsl).'"?>' . PHP_EOL; - $out .= '' . PHP_EOL; + $out .= '' . PHP_EOL; echo $out; } elseif ($this->version == JSON || $this->version == JSONP) diff --git a/libraries/html5/TreeBuilder.php b/libraries/html5/TreeBuilder.php index 2f5244f..c4a48b2 100644 --- a/libraries/html5/TreeBuilder.php +++ b/libraries/html5/TreeBuilder.php @@ -134,6 +134,7 @@ class HTML5_TreeBuilder { // Namespaces for foreign content const NS_HTML = null; // to prevent DOM from requiring NS on everything + const NS_XHTML = 'http://www.w3.org/1999/xhtml'; const NS_MATHML = 'http://www.w3.org/1998/Math/MathML'; const NS_SVG = 'http://www.w3.org/2000/svg'; const NS_XLINK = 'http://www.w3.org/1999/xlink'; @@ -3157,11 +3158,19 @@ class HTML5_TreeBuilder { } private function insertElement($token, $append = true) { - $el = $this->dom->createElementNS(self::NS_HTML, $token['name']); + //$el = $this->dom->createElementNS(self::NS_HTML, $token['name']); + $namespaceURI = strpos($token['name'], ':') ? 
self::NS_XHTML : self::NS_HTML; + $el = $this->dom->createElementNS($namespaceURI, $token['name']); if (!empty($token['attr'])) { foreach($token['attr'] as $attr) { - if(!$el->hasAttribute($attr['name'])) { + + // mike@macgirvin.com 2011-11-17, check attribute name for + // validity (ignoring extenders and combiners) as illegal chars in names + // causes everything to abort + + $valid = preg_match('/^[a-zA-Z\_\:]([\-a-zA-Z0-9\_\:\.]+$)/',$attr['name']); + if($attr['name'] && (!$el->hasAttribute($attr['name'])) && ($valid)) { $el->setAttribute($attr['name'], $attr['value']); } } diff --git a/libraries/humble-http-agent/CookieJar.php b/libraries/humble-http-agent/CookieJar.php index d91b711..e4d5f49 100644 --- a/libraries/humble-http-agent/CookieJar.php +++ b/libraries/humble-http-agent/CookieJar.php @@ -400,5 +400,4 @@ class CookieJar } return false; } -} -?> \ No newline at end of file +} \ No newline at end of file diff --git a/libraries/humble-http-agent/HumbleHttpAgent.php b/libraries/humble-http-agent/HumbleHttpAgent.php index 0b30599..963f0c0 100644 --- a/libraries/humble-http-agent/HumbleHttpAgent.php +++ b/libraries/humble-http-agent/HumbleHttpAgent.php @@ -7,11 +7,11 @@ * For environments which do not have these options, it reverts to standard sequential * requests (using file_get_contents()) * - * @version 1.1 - * @date 2012-08-20 + * @version 1.4 + * @date 2013-05-10 * @see http://php.net/HttpRequestPool * @author Keyvan Minoukadeh - * @copyright 2011-2012 Keyvan Minoukadeh + * @copyright 2011-2013 Keyvan Minoukadeh * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3 */ @@ -22,7 +22,7 @@ class HumbleHttpAgent const METHOD_FILE_GET_CONTENTS = 4; //const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'; const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2'; - const UA_PHP = 'PHP/5.2'; + const UA_PHP = 'PHP/5.4'; const REF_GOOGLE = 
'http://www.google.co.uk/url?sa=t&source=web&cd=1'; protected $requests = array(); @@ -82,6 +82,8 @@ class HumbleHttpAgent // set request options (redirect must be 0) $this->requestOptions = array( 'timeout' => 15, + 'connecttimeout' => 15, + 'dns_cache_timeout' => 300, 'redirect' => 0 // we handle redirects manually so we can rewrite the new hashbang URLs that are creeping up over the web // TODO: test onprogress? ); @@ -155,6 +157,37 @@ class HumbleHttpAgent return $iri->get_iri(); } + public function getRedirectURLfromHTML($url, $html) { + $redirect_url = $this->getMetaRefreshURL($url, $html); + if (!$redirect_url) { + $redirect_url = $this->getUglyURL($url, $html); + } + return $redirect_url; + } + + public function getMetaRefreshURL($url, $html) { + if ($html == '') return false; + // + if (!preg_match('!]+)["\']*>!i', $html, $match)) { + return false; + } + $redirect_url = $match[1]; + if (preg_match('!^https?://!i', $redirect_url)) { + // already absolute + $this->debug('Meta refresh redirect found (http-equiv="refresh"), new URL: '.$redirect_url); + return $redirect_url; + } + // absolutize redirect URL + $base = new SimplePie_IRI($url); + // remove '//' in URL path (causes URLs not to resolve properly) + if (isset($base->path)) $base->path = preg_replace('!//+!', '/', $base->path); + if ($absolute = SimplePie_IRI::absolutize($base, $redirect_url)) { + $this->debug('Meta refresh redirect found (http-equiv="refresh"), new URL: '.$absolute); + return $absolute; + } + return false; + } + public function getUglyURL($url, $html) { if ($html == '') return false; $found = false; @@ -173,7 +206,9 @@ class HumbleHttpAgent } $query['_escaped_fragment_'] = ''; $iri->query = str_replace('%2F', '/', http_build_query($query)); // needed for some sites - return $iri->get_iri(); + $ugly_url = $iri->get_iri(); + $this->debug('AJAX trigger (meta name="fragment" content="!") found, new URL: '.$ugly_url); + return $ugly_url; } public function removeFragment($url) { @@ -339,9 
+374,8 @@ class HumbleHttpAgent // for AJAX sites, e.g. Blogger with its dynamic views templates. // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification if (isset($this->requests[$orig]['body'])) { - $redirectURL = $this->getUglyURL($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000)); + $redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000)); if ($redirectURL) { - $this->debug('AJAX trigger (meta name="fragment" content="!") found. Queueing '.$redirectURL); $this->redirectQueue[$orig] = $redirectURL; } } @@ -464,9 +498,8 @@ class HumbleHttpAgent // for AJAX sites, e.g. Blogger with its dynamic views templates. // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification if (isset($this->requests[$orig]['body'])) { - $redirectURL = $this->getUglyURL($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000)); + $redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000)); if ($redirectURL) { - $this->debug('AJAX trigger (meta name="fragment" content="!") found. Queueing '.$redirectURL); $this->redirectQueue[$orig] = $redirectURL; } } @@ -551,9 +584,8 @@ class HumbleHttpAgent // for AJAX sites, e.g. Blogger with its dynamic views templates. // Based on Google's spec: https://developers.google.com/webmasters/ajax-crawling/docs/specification if (isset($this->requests[$orig]['body'])) { - $redirectURL = $this->getUglyURL($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000)); + $redirectURL = $this->getRedirectURLfromHTML($this->requests[$orig]['effective_url'], substr($this->requests[$orig]['body'], 0, 4000)); if ($redirectURL) { - $this->debug('AJAX trigger (meta name="fragment" content="!") found. 
Queueing '.$redirectURL); $this->redirectQueue[$orig] = $redirectURL; } } @@ -775,5 +807,4 @@ if (!function_exists('gzdecode')) { } return $data; } -} -?> \ No newline at end of file +} \ No newline at end of file diff --git a/libraries/humble-http-agent/SimplePie_HumbleHttpAgent.php b/libraries/humble-http-agent/SimplePie_HumbleHttpAgent.php index ce76a92..c524a1e 100644 --- a/libraries/humble-http-agent/SimplePie_HumbleHttpAgent.php +++ b/libraries/humble-http-agent/SimplePie_HumbleHttpAgent.php @@ -75,5 +75,4 @@ class SimplePie_HumbleHttpAgent extends SimplePie_File $this->success = false; } } -} -?> \ No newline at end of file +} \ No newline at end of file diff --git a/libraries/language-detect/LanguageDetect.php b/libraries/language-detect/LanguageDetect.php index 09b1154..382d869 100644 --- a/libraries/language-detect/LanguageDetect.php +++ b/libraries/language-detect/LanguageDetect.php @@ -6,23 +6,24 @@ * Attempts to detect the language of a sample of text by correlating ranked * 3-gram frequencies to a table of 3-gram frequencies of known languages. 
* - * Implements a version of a technique originally proposed by Cavnar & Trenkle - * (1994): "N-Gram-Based Text Categorization" + * Implements a version of a technique originally proposed by Cavnar & Trenkle + * (1994): "N-Gram-Based Text Categorization" * - * PHP versions 4 and 5 + * PHP version 5 * - * @category Text - * @package Text_LanguageDetect - * @author Nicholas Pisarro - * @copyright 2005-2006 Nicholas Pisarro - * @license http://www.debian.org/misc/bsd.license BSD - * @version CVS: $Id: LanguageDetect.php,v 1.20 2008/07/01 02:09:15 taak Exp $ - * @link http://pear.php.net/package/Text_LanguageDetect/ - * @link http://langdetect.blogspot.com/ + * @category Text + * @package Text_LanguageDetect + * @author Nicholas Pisarro + * @copyright 2005-2006 Nicholas Pisarro + * @license http://www.debian.org/misc/bsd.license BSD + * @version SVN: $Id: LanguageDetect.php 322353 2012-01-16 08:41:43Z cweiske $ + * @link http://pear.php.net/package/Text_LanguageDetect/ + * @link http://langdetect.blogspot.com/ */ -//require_once 'PEAR.php'; -require_once 'Parser.php'; +require_once 'LanguageDetect/Exception.php'; +require_once 'LanguageDetect/Parser.php'; +require_once 'LanguageDetect/ISO639.php'; /** * Language detection class @@ -41,9 +42,10 @@ require_once 'Parser.php'; * * echo "Supported languages:\n"; * - * $langs = $l->getLanguages(); - * if (PEAR::isError($langs)) { - * die($langs->getMessage()); + * try { + * $langs = $l->getLanguages(); + * } catch (Text_LanguageDetect_Exception $e) { + * die($e->getMessage()); * } * * sort($langs); @@ -54,38 +56,38 @@ require_once 'Parser.php'; * } * * - * @category Text - * @package Text_LanguageDetect - * @author Nicholas Pisarro - * @copyright 2005 Nicholas Pisarro - * @license http://www.debian.org/misc/bsd.license BSD - * @version Release: @package_version@ - * @todo allow users to generate their own language models + * @category Text + * @package Text_LanguageDetect + * @author Nicholas Pisarro + * @copyright 2005 
Nicholas Pisarro + * @license http://www.debian.org/misc/bsd.license BSD + * @version Release: @package_version@ + * @link http://pear.php.net/package/Text_LanguageDetect/ + * @todo allow users to generate their own language models */ - class Text_LanguageDetect { - /** + /** * The filename that stores the trigram data for the detector * - * If this value starts with a slash (/) or a dot (.) the value of + * If this value starts with a slash (/) or a dot (.) the value of * $this->_data_dir will be ignored - * + * * @var string * @access private */ - var $_db_filename = './lang.dat'; + var $_db_filename = 'lang.dat'; /** * The filename that stores the unicode block definitions * - * If this value starts with a slash (/) or a dot (.) the value of + * If this value starts with a slash (/) or a dot (.) the value of * $this->_data_dir will be ignored - * + * * @var string * @access private */ - var $_unicode_db_filename = './unicode_blocks.dat'; + var $_unicode_db_filename = 'unicode_blocks.dat'; /** * The data directory @@ -99,11 +101,8 @@ class Text_LanguageDetect /** * The trigram data for comparison - * - * Will be loaded on start from $this->_db_filename * - * May be set to a PEAR_Error object if there is an error during its - * initialization + * Will be loaded on start from $this->_db_filename * * @var array * @access private @@ -120,7 +119,7 @@ class Text_LanguageDetect /** * The size of the trigram data arrays - * + * * @var int * @access private */ @@ -140,7 +139,7 @@ class Text_LanguageDetect /** * Whether or not to simulate perl's Language::Guess exactly - * + * * @access private * @var bool * @see setPerlCompatible() @@ -164,19 +163,25 @@ class Text_LanguageDetect */ var $_clusters; + /** + * Which type of "language names" are accepted and returned: + * + * 0 - language name ("english") + * 2 - 2-letter ISO 639-1 code ("en") + * 3 - 3-letter ISO 639-2 code ("eng") + */ + var $_name_mode = 0; + /** * Constructor * * Will attempt to load the language 
database. If it fails, you will get - * a PEAR_Error object returned when you try to use detect() - * + * an exception. */ - function Text_LanguageDetect($db=null, $unicode_db=null) + function __construct() { - if (isset($db)) $this->_db_filename = $db; - if (isset($unicode_db)) $this->_unicode_db_filename = $unicode_db; - $data = $this->_readdb($this->_db_filename); + $this->_checkTrigram($data['trigram']); $this->_lang_db = $data['trigram']; if (isset($data['trigram-unicodemap'])) { @@ -186,29 +191,32 @@ class Text_LanguageDetect // Not yet implemented: if (isset($data['trigram-clusters'])) { $this->_clusters = $data['trigram-clusters']; - } + } } /** * Returns the path to the location of the database * - * @access private - * @return string expected path to the language model database + * @param string $fname File name to load + * + * @return string expected path to the language model database + * @access private */ function _get_data_loc($fname) { - return $fname; + return dirname(__FILE__).'/'.$fname; } /** * Loads the language trigram database from filename * * Trigram datbase should be a serialize()'d array - * - * @access private - * @param string $fname the filename where the data is stored - * @return array the language model data - * @throws PEAR_Error + * + * @param string $fname the filename where the data is stored + * + * @return array the language model data + * @throws Text_LanguageDetect_Exception + * @access private */ function _readdb($fname) { @@ -217,79 +225,74 @@ class Text_LanguageDetect // input check if (!file_exists($fname)) { - throw new Exception('Language database does not exist.'); + throw new Text_LanguageDetect_Exception( + 'Language database does not exist: ' . $fname, + Text_LanguageDetect_Exception::DB_NOT_FOUND + ); } elseif (!is_readable($fname)) { - throw new Exception('Language database is not readable.'); + throw new Text_LanguageDetect_Exception( + 'Language database is not readable: ' . 
$fname, + Text_LanguageDetect_Exception::DB_NOT_READABLE + ); } - if (function_exists('file_get_contents')) { - return unserialize(file_get_contents($fname)); - } else { - // if you don't have file_get_contents(), - // then this is the next fastest way - ob_start(); - readfile($fname); - $contents = ob_get_contents(); - ob_end_clean(); - return unserialize($contents); - } + return unserialize(file_get_contents($fname)); } /** * Checks if this object is ready to detect languages - * - * @access private - * @param mixed &$err error object to be returned by reference, if any - * @return bool true if no errors + * + * @param array $trigram Trigram data from database + * + * @return void + * @access private */ - function _setup_ok(&$err) + function _checkTrigram($trigram) { - if (!is_array($this->_lang_db)) { + if (!is_array($trigram)) { if (ini_get('magic_quotes_runtime')) { - throw new Exception('Error loading database. Try turning magic_quotes_runtime off.'); - } else { - throw new Exception('Language database is not an array.'); + throw new Text_LanguageDetect_Exception( + 'Error loading database. 
Try turning magic_quotes_runtime off.', + Text_LanguageDetect_Exception::MAGIC_QUOTES + ); } - return false; - - } elseif (empty($this->_lang_db)) { - throw new Exception('Language database has no elements.'); - return false; - - } else { - return true; + throw new Text_LanguageDetect_Exception( + 'Language database is not an array.', + Text_LanguageDetect_Exception::DB_NOT_ARRAY + ); + } elseif (empty($trigram)) { + throw new Text_LanguageDetect_Exception( + 'Language database has no elements.', + Text_LanguageDetect_Exception::DB_EMPTY + ); } } /** * Omits languages * - * Pass this function the name of or an array of names of + * Pass this function the name of or an array of names of * languages that you don't want considered * - * If you're only expecting a limited set of languages, this can greatly + * If you're only expecting a limited set of languages, this can greatly * speed up processing * - * @access public - * @param mixed $omit_list language name or array of names to omit - * @param bool $include_only if true will include (rather than - * exclude) only those in the list - * @return int number of languages successfully deleted - * @throws PEAR_Error + * @param mixed $omit_list language name or array of names to omit + * @param bool $include_only if true will include (rather than + * exclude) only those in the list + * + * @return int number of languages successfully deleted + * @throws Text_LanguageDetect_Exception */ - function omitLanguages($omit_list, $include_only = false) + public function omitLanguages($omit_list, $include_only = false) { - - // setup check - if (!$this->_setup_ok($err)) { - return $err; - } - $deleted = 0; - // deleting the given languages + $omit_list = $this->_convertFromNameMode($omit_list); + if (!$include_only) { + // deleting the given languages if (!is_array($omit_list)) { $omit_list = strtolower($omit_list); // case desensitize if (isset($this->_lang_db[$omit_list])) { @@ -301,12 +304,12 @@ class Text_LanguageDetect if 
(isset($this->_lang_db[$omit_lang])) { unset($this->_lang_db[$omit_lang]); $deleted++; - } + } } } - // deleting all except the given languages } else { + // deleting all except the given languages if (!is_array($omit_list)) { $omit_list = array($omit_list); } @@ -327,7 +330,7 @@ class Text_LanguageDetect // reset the cluster cache if the number of languages changes // this will then have to be recalculated if (isset($this->_clusters) && $deleted > 0) { - unset($this->_clusters); + $this->_clusters = null; } return $deleted; @@ -339,49 +342,40 @@ class Text_LanguageDetect * * @access public * @return int the number of languages - * @throws PEAR_Error + * @throws Text_LanguageDetect_Exception */ function getLanguageCount() { - if (!$this->_setup_ok($err)) { - return $err; - } else { - return count($this->_lang_db); - } + return count($this->_lang_db); } /** - * Returns true if a given language exists + * Checks if the language with the given name exists in the database * - * If passed an array of names, will return true only if all exist + * @param mixed $lang Language name or array of language names * - * @access public - * @param mixed $lang language name or array of language names - * @return bool true if language model exists - * @throws PEAR_Error + * @return bool true if language model exists */ - function languageExists($lang) + public function languageExists($lang) { - if (!$this->_setup_ok($err)) { - return $err; - } else { - // string - if (is_string($lang)) { - return isset($this->_lang_db[strtolower($lang)]); + $lang = $this->_convertFromNameMode($lang); - // array - } elseif (is_array($lang)) { - foreach ($lang as $test_lang) { - if (!isset($this->_lang_db[strtolower($test_lang)])) { - return false; - } + if (is_string($lang)) { + return isset($this->_lang_db[strtolower($lang)]); + + } elseif (is_array($lang)) { + foreach ($lang as $test_lang) { + if (!isset($this->_lang_db[strtolower($test_lang)])) { + return false; } - return true; - - // other 
(error) - } else { - throw new Exception('Unknown type passed to languageExists()'); } + return true; + + } else { + throw new Text_LanguageDetect_Exception( + 'Unsupported parameter type passed to languageExists()', + Text_LanguageDetect_Exception::PARAM_TYPE + ); } } @@ -389,25 +383,24 @@ class Text_LanguageDetect * Returns the list of detectable languages * * @access public - * @return array the names of the languages known to this object - * @throws PEAR_Error + * @return array the names of the languages known to this object<<<<<<< + * @throws Text_LanguageDetect_Exception */ function getLanguages() { - if (!$this->_setup_ok($err)) { - return $err; - } else { - return array_keys($this->_lang_db); - } + return $this->_convertToNameMode( + array_keys($this->_lang_db) + ); } /** * Make this object behave like Language::Guess - * - * @access public - * @param bool $setting false to turn off perl compatibility + * + * @param bool $setting false to turn off perl compatibility + * + * @return void */ - function setPerlCompatible($setting = true) + public function setPerlCompatible($setting = true) { if (is_bool($setting)) { // input check $this->_perl_compatible = $setting; @@ -421,6 +414,21 @@ class Text_LanguageDetect } + /** + * Sets the way how language names are accepted and returned. + * + * @param integer $name_mode One of the following modes: + * 0 - language name ("english") + * 2 - 2-letter ISO 639-1 code ("en") + * 3 - 3-letter ISO 639-2 code ("eng") + * + * @return void + */ + function setNameMode($name_mode) + { + $this->_name_mode = $name_mode; + } + /** * Whether to use unicode block ranges in detection * @@ -429,10 +437,11 @@ class Text_LanguageDetect * in languages that use latin scripts. In other cases it should speed up * detection noticeably. 
* - * @access public - * @param bool $setting false to turn off + * @param bool $setting false to turn off + * + * @return void */ - function useUnicodeBlocks($setting = true) + public function useUnicodeBlocks($setting = true) { if (is_bool($setting)) { $this->_use_unicode_narrowing = $setting; @@ -442,15 +451,15 @@ class Text_LanguageDetect /** * Converts a piece of text into trigrams * - * Superceded by the Text_LanguageDetect_Parser class + * @param string $text text to convert * - * @access private - * @param string $text text to convert - * @return array array of trigram frequencies + * @return array array of trigram frequencies + * @access private + * @deprecated Superceded by the Text_LanguageDetect_Parser class */ function _trigram($text) { - $s = new Text_LanguageDetect_Parser($text, $this->_db_filename, $this->_unicode_db_filename); + $s = new Text_LanguageDetect_Parser($text); $s->prepareTrigram(); $s->prepareUnicode(false); $s->setPadStart(!$this->_perl_compatible); @@ -463,11 +472,12 @@ class Text_LanguageDetect * * Thresholds (cuts off) the list at $this->_threshold * - * @access protected - * @param array $arr array of trgram - * @return array ranks of trigrams + * @param array $arr array of trigram + * + * @return array ranks of trigrams + * @access protected */ - function _arr_rank(&$arr) + function _arr_rank($arr) { // sorts alphabetically first as a standard way of breaking rank ties @@ -494,14 +504,17 @@ class Text_LanguageDetect /** * Sorts an array by value breaking ties alphabetically - * - * @access private - * @param array &$arr the array to sort + * + * @param array &$arr the array to sort + * + * @return void + * @access private */ function _bub_sort(&$arr) { // should do the same as this perl statement: - // sort { $trigrams{$b} == $trigrams{$a} ? $a cmp $b : $trigrams{$b} <=> $trigrams{$a} } + // sort { $trigrams{$b} == $trigrams{$a} + // ? 
$a cmp $b : $trigrams{$b} <=> $trigrams{$a} } // needs to sort by both key and value at once // using the key to break ties for the value @@ -528,13 +541,14 @@ class Text_LanguageDetect /** * Sort function used by bubble sort * - * Callback function for usort(). + * Callback function for usort(). * - * @access private - * @param array first param passed by usort() - * @param array second param passed by usort() - * @return int 1 if $a is greater, -1 if not - * @see _bub_sort() + * @param array $a first param passed by usort() + * @param array $b second param passed by usort() + * + * @return int 1 if $a is greater, -1 if not + * @see _bub_sort() + * @access private */ function _sort_func($a, $b) { @@ -542,12 +556,12 @@ class Text_LanguageDetect list($a_key, $a_value) = $a; list($b_key, $b_value) = $b; - // if the values are the same, break ties using the key if ($a_value == $b_value) { + // if the values are the same, break ties using the key return strcmp($a_key, $b_key); - // if not, just sort normally } else { + // if not, just sort normally if ($a_value > $b_value) { return -1; } else { @@ -559,23 +573,24 @@ class Text_LanguageDetect } /** - * Calculates a linear rank-order distance statistic between two sets of + * Calculates a linear rank-order distance statistic between two sets of * ranked trigrams * - * Sums the differences in rank for each trigram. If the trigram does not + * Sums the differences in rank for each trigram. If the trigram does not * appear in both, consider it a difference of $this->_threshold. * * This distance measure was proposed by Cavnar & Trenkle (1994). Despite * its simplicity it has been shown to be highly accurate for language * identification tasks. 
* - * @access private - * @param array $arr1 the reference set of trigram ranks - * @param array $arr2 the target set of trigram ranks - * @return int the sum of the differences between the ranks of - * the two trigram sets + * @param array $arr1 the reference set of trigram ranks + * @param array $arr2 the target set of trigram ranks + * + * @return int the sum of the differences between the ranks of + * the two trigram sets + * @access private */ - function _distance(&$arr1, &$arr2) + function _distance($arr1, $arr2) { $sumdist = 0; @@ -598,14 +613,15 @@ class Text_LanguageDetect /** * Normalizes the score returned by _distance() - * + * * Different if perl compatible or not * - * @access private - * @param int $score the score from _distance() - * @param int $base_count the number of trigrams being considered - * @return float the normalized score - * @see _distance() + * @param int $score the score from _distance() + * @param int $base_count the number of trigrams being considered + * + * @return float the normalized score + * @see _distance() + * @access private */ function _normalize_score($score, $base_count = null) { @@ -630,29 +646,24 @@ class Text_LanguageDetect * * If perl compatible, the score is 300-0, 0 being most similar. * Otherwise, it's 0-1 with 1 being most similar. - * + * * The $sample text should be at least a few sentences in length; * should be ascii-7 or utf8 encoded, if another and the mbstring extension * is present it will try to detect and convert. However, experience has - * shown that mb_detect_encoding() *does not work very well* with at least + * shown that mb_detect_encoding() *does not work very well* with at least * some types of encoding. * - * @access public - * @param string $sample a sample of text to compare. - * @param int $limit if specified, return an array of the most likely - * $limit languages and their scores. 
- * @return mixed sorted array of language scores, blank array if no - * useable text was found, or PEAR_Error if error - * with the object setup - * @see _distance() - * @throws PEAR_Error + * @param string $sample a sample of text to compare. + * @param int $limit if specified, return an array of the most likely + * $limit languages and their scores. + * + * @return mixed sorted array of language scores, blank array if no + * useable text was found + * @see _distance() + * @throws Text_LanguageDetect_Exception */ - function detect($sample, $limit = 0) + public function detect($sample, $limit = 0) { - if (!$this->_setup_ok($err)) { - return $err; - } - // input check if (!Text_LanguageDetect_Parser::validateString($sample)) { return array(); @@ -660,36 +671,27 @@ class Text_LanguageDetect // check char encoding // (only if mbstring extension is compiled and PHP > 4.0.6) - if (function_exists('mb_detect_encoding') - && function_exists('mb_convert_encoding')) { - + if (function_exists('mb_detect_encoding') + && function_exists('mb_convert_encoding') + ) { // mb_detect_encoding isn't very reliable, to say the least - // detection should still work with a sufficient sample of ascii characters + // detection should still work with a sufficient sample + // of ascii characters $encoding = mb_detect_encoding($sample); // mb_detect_encoding() will return FALSE if detection fails // don't attempt conversion if that's the case - if ($encoding != 'ASCII' && $encoding != 'UTF-8' && $encoding !== false) { - - if (function_exists('mb_list_encodings')) { - - // verify the encoding exists in mb_list_encodings - if (in_array($encoding, mb_list_encodings())) { - $sample = mb_convert_encoding($sample, 'UTF-8', $encoding); - } - - // if the previous condition failed: - // somehow we detected an encoding that also we don't support - - } else { - // php 4 doesnt have mb_list_encodings() - // so attempt with error suppression - $sample = @mb_convert_encoding($sample, 'UTF-8', $encoding); 
+ if ($encoding != 'ASCII' && $encoding != 'UTF-8' + && $encoding !== false + ) { + // verify the encoding exists in mb_list_encodings + if (in_array($encoding, mb_list_encodings())) { + $sample = mb_convert_encoding($sample, 'UTF-8', $encoding); } } } - $sample_obj = new Text_LanguageDetect_Parser($sample, $this->_db_filename, $this->_unicode_db_filename); + $sample_obj = new Text_LanguageDetect_Parser($sample); $sample_obj->prepareTrigram(); if ($this->_use_unicode_narrowing) { $sample_obj->prepareUnicode(); @@ -713,7 +715,10 @@ class Text_LanguageDetect if (is_array($blocks)) { $present_blocks = array_keys($blocks); } else { - throw new Exception('Error during block detection'); + throw new Text_LanguageDetect_Exception( + 'Error during block detection', + Text_LanguageDetect_Exception::BLOCK_DETECTION + ); } $possible_langs = array(); @@ -731,30 +736,30 @@ class Text_LanguageDetect } // could also try an intersect operation rather than a union - // in other words, choose languages whose trigrams contain + // in other words, choose languages whose trigrams contain // ALL of the unicode blocks found in this sample // would improve speed but would be completely thrown off by an // unexpected character, like an umlaut appearing in english text $possible_langs = array_intersect( - array_keys($this->_lang_db), - array_unique($possible_langs) + array_keys($this->_lang_db), + array_unique($possible_langs) ); - // needs to intersect it with the keys of _lang_db in case + // needs to intersect it with the keys of _lang_db in case // languages have been omitted - // or just try 'em all } else { + // or just try 'em all $possible_langs = array_keys($this->_lang_db); } foreach ($possible_langs as $lang) { - $scores[$lang] = - $this->_normalize_score( - $this->_distance($this->_lang_db[$lang], $trigram_freqs), - $trigram_count); + $scores[$lang] = $this->_normalize_score( + $this->_distance($this->_lang_db[$lang], $trigram_freqs), + $trigram_count + ); } unset($sample_obj); 
@@ -772,7 +777,6 @@ class Text_LanguageDetect $limited_scores = array(); $i = 0; - foreach ($scores as $key => $value) { if ($i++ >= $limit) { break; @@ -781,9 +785,9 @@ class Text_LanguageDetect $limited_scores[$key] = $value; } - return $limited_scores; + return $this->_convertToNameMode($limited_scores, true); } else { - return $scores; + return $this->_convertToNameMode($scores, true); } } @@ -791,35 +795,33 @@ class Text_LanguageDetect * Returns only the most similar language to the text sample * * Calls $this->detect() and returns only the top result - * - * @access public - * @param string $sample text to detect the language of - * @return string the name of the most likely language - * or null if no language is similar - * @see detect() - * @throws PEAR_Error + * + * @param string $sample text to detect the language of + * + * @return string the name of the most likely language + * or null if no language is similar + * @see detect() + * @throws Text_LanguageDetect_Exception */ - function detectSimple($sample) + public function detectSimple($sample) { $scores = $this->detect($sample, 1); // if top language has the maximum possible score, // then the top score will have been picked at random - if ( !is_array($scores) - || empty($scores) - || current($scores) == $this->_max_score) { - + if (!is_array($scores) || empty($scores) + || current($scores) == $this->_max_score + ) { return null; - } else { - return ucfirst(key($scores)); + return key($scores); } } /** * Returns an array containing the most similar language and a confidence * rating - * + * * Confidence is a simple measure calculated from the similarity score * minus the similarity score from the next most similar language * divided by the highest possible score. Languages that have closely @@ -827,46 +829,43 @@ class Text_LanguageDetect * confidence scores. * * The similarity score answers the question "How likely is the text the - * returned language regardless of the other languages considered?" 
The + * returned language regardless of the other languages considered?" The * confidence score is one way of answering the question "how likely is the * text the detected language relative to the rest of the language model * set?" * * To see how similar languages are a priori, see languageSimilarity() - * - * @access public - * @param string $sample text for which language will be detected - * @return array most similar language, score and confidence rating - * or null if no language is similar - * @see detect() - * @throws PEAR_Error + * + * @param string $sample text for which language will be detected + * + * @return array most similar language, score and confidence rating + * or null if no language is similar + * @see detect() + * @throws Text_LanguageDetect_Exception */ - function detectConfidence($sample) + public function detectConfidence($sample) { $scores = $this->detect($sample, 2); - // if most similar language has the max score, it + // if most similar language has the max score, it // will have been picked at random - if ( !is_array($scores) - || empty($scores) - || current($scores) == $this->_max_score) { - + if (!is_array($scores) || empty($scores) + || current($scores) == $this->_max_score + ) { return null; } - $arr['language'] = ucfirst(key($scores)); + $arr['language'] = key($scores); $arr['similarity'] = current($scores); if (next($scores) !== false) { // if false then no next element // the goal is to return a higher value if the distance between // the similarity of the first score and the second score is high if ($this->_perl_compatible) { - - $arr['confidence'] = - (current($scores) - $arr['similarity']) / $this->_max_score; + $arr['confidence'] = (current($scores) - $arr['similarity']) + / $this->_max_score; } else { - $arr['confidence'] = $arr['similarity'] - current($scores); } @@ -882,32 +881,26 @@ class Text_LanguageDetect * Returns the distribution of unicode blocks in a given utf8 string * * For the block name of a single char, use 
unicodeBlockName() - * - * @access public - * @param string $str input string. Must be ascii or utf8 - * @param bool $skip_symbols if true, skip ascii digits, symbols and - * non-printing characters. Includes spaces, - * newlines and common punctutation characters. + * + * @param string $str input string. Must be ascii or utf8 + * @param bool $skip_symbols if true, skip ascii digits, symbols and + * non-printing characters. Includes spaces, + * newlines and common punctutation characters. + * * @return array - * @throws PEAR_Error + * @throws Text_LanguageDetect_Exception */ - function detectUnicodeBlocks($str, $skip_symbols) + public function detectUnicodeBlocks($str, $skip_symbols) { - // input check - if (!is_bool($skip_symbols)) { - throw new Exception('Second parameter must be boolean'); - } + $skip_symbols = (bool)$skip_symbols; + $str = (string)$str; - if (!is_string($str)) { - throw new Exception('First parameter was not a string'); - } - - $sample_obj = new Text_LanguageDetect_Parser($str, $this->_db_filename, $this->_unicode_db_filename); + $sample_obj = new Text_LanguageDetect_Parser($str); $sample_obj->prepareUnicode(); $sample_obj->prepareTrigram(false); $sample_obj->setUnicodeSkipSymbols($skip_symbols); $sample_obj->analyze(); - $blocks =& $sample_obj->getUnicodeBlocks(); + $blocks = $sample_obj->getUnicodeBlocks(); unset($sample_obj); return $blocks; } @@ -915,38 +908,37 @@ class Text_LanguageDetect /** * Returns the block name for a given unicode value * - * If passed a string, will assume it is being passed a UTF8-formatted + * If passed a string, will assume it is being passed a UTF8-formatted * character and will automatically convert. Otherwise it will assume it * is being passed a numeric unicode value. * * Make sure input is of the correct type! 
* - * @access public * @param mixed $unicode unicode value or utf8 char + * * @return mixed the block name string or false if not found - * @throws PEAR_Error + * @throws Text_LanguageDetect_Exception */ - function unicodeBlockName($unicode) { + public function unicodeBlockName($unicode) + { if (is_string($unicode)) { // assume it is being passed a utf8 char, so convert it - - // input check - if ($this->utf8strlen($unicode) > 1) { - throw new Exception('Pass this function only a single char'); + if (self::utf8strlen($unicode) > 1) { + throw new Text_LanguageDetect_Exception( + 'Pass a single char only to this method', + Text_LanguageDetect_Exception::PARAM_TYPE + ); } - $unicode = $this->_utf8char2unicode($unicode); - if ($unicode == -1) { - throw new Exception('Malformatted char'); - } - - // input check } elseif (!is_int($unicode)) { - throw new Exception('Input must be of type string or int.'); + throw new Text_LanguageDetect_Exception( + 'Input must be of type string or int.', + Text_LanguageDetect_Exception::PARAM_TYPE + ); } - $blocks =& $this->_read_unicode_block_db(); + $blocks = $this->_read_unicode_block_db(); $result = $this->_unicode_block_name($unicode, $blocks); @@ -964,14 +956,17 @@ class Text_LanguageDetect * the public interface for this function, which does input checks which * this function omits for speed. 
* - * @access protected - * @param int $unicode the unicode value - * @param array &$blocks the block database - * @param int $block_count the number of defined blocks in the database - * @see unicodeBlockName() + * @param int $unicode the unicode value + * @param array $blocks the block database + * @param int $block_count the number of defined blocks in the database + * + * @return mixed Block name, -1 if it failed + * @see unicodeBlockName() + * @access protected */ - function _unicode_block_name($unicode, &$blocks, $block_count = -1) { - // for a reference, see + function _unicode_block_name($unicode, $blocks, $block_count = -1) + { + // for a reference, see // http://www.unicode.org/Public/UNIDATA/Blocks.txt // assume that ascii characters are the most common @@ -994,35 +989,36 @@ class Text_LanguageDetect while ($low <= $high) { $mid = floor(($low + $high) / 2); - // if it's lower than the lower bound if ($unicode < $blocks[$mid][0]) { + // if it's lower than the lower bound $high = $mid - 1; - // if it's higher than the upper bound } elseif ($unicode > $blocks[$mid][1]) { + // if it's higher than the upper bound $low = $mid + 1; - // found it } else { + // found it return $blocks[$mid]; } } - // failed to find the block + // failed to find the block return -1; - // todo: differentiate when it's out of range or when it falls + // todo: differentiate when it's out of range or when it falls // into an unassigned range? 
} /** * Brings up the unicode block database * - * @access protected * @return array the database of unicode block definitions - * @throws PEAR_Error + * @throws Text_LanguageDetect_Exception + * @access protected */ - function &_read_unicode_block_db() { + function _read_unicode_block_db() + { // since the unicode definitions are always going to be the same, // might as well share the memory for the db with all other instances // of this class @@ -1037,29 +1033,27 @@ class Text_LanguageDetect /** * Calculate the similarities between the language models - * + * * Use this function to see how similar languages are to each other. * * If passed 2 language names, will return just those languages compared. * If passed 1 language name, will return that language compared to * all others. - * If passed none, will return an array of every language model compared + * If passed none, will return an array of every language model compared * to every other one. * - * @access public - * @param string $lang1 the name of the first language to be compared - * @param string $lang2 the name of the second language to be compared - * @return array scores of every language compared - * or the score of just the provided languages - * or null if one of the supplied languages does not exist - * @throws PEAR_Error + * @param string $lang1 the name of the first language to be compared + * @param string $lang2 the name of the second language to be compared + * + * @return array scores of every language compared + * or the score of just the provided languages + * or null if one of the supplied languages does not exist + * @throws Text_LanguageDetect_Exception */ - function languageSimilarity($lang1 = null, $lang2 = null) + public function languageSimilarity($lang1 = null, $lang2 = null) { - if (!$this->_setup_ok($err)) { - return $err; - } - + $lang1 = $this->_convertFromNameMode($lang1); + $lang2 = $this->_convertFromNameMode($lang2); if ($lang1 != null) { $lang1 = strtolower($lang1); @@ 
-1069,12 +1063,8 @@ class Text_LanguageDetect } if ($lang2 != null) { - - // can't only set the second param - if ($lang1 == null) { - return null; - // check if language model exists - } elseif (!isset($this->_lang_db[$lang2])) { + if (!isset($this->_lang_db[$lang2])) { + // check if language model exists return null; } @@ -1088,14 +1078,15 @@ class Text_LanguageDetect ) ); - - // compare just $lang1 to all languages } else { + // compare just $lang1 to all languages $return_arr = array(); foreach ($this->_lang_db as $key => $value) { - if ($key != $lang1) { // don't compare a language to itself + if ($key != $lang1) { + // don't compare a language to itself $return_arr[$key] = $this->_normalize_score( - $this->_distance($this->_lang_db[$lang1], $value)); + $this->_distance($this->_lang_db[$lang1], $value) + ); } } asort($return_arr); @@ -1104,30 +1095,27 @@ class Text_LanguageDetect } - // compare all languages to each other } else { + // compare all languages to each other $return_arr = array(); foreach (array_keys($this->_lang_db) as $lang1) { foreach (array_keys($this->_lang_db) as $lang2) { - // skip comparing languages to themselves - if ($lang1 != $lang2) { - - // don't re-calculate what's already been done + if ($lang1 != $lang2) { + if (isset($return_arr[$lang2][$lang1])) { + // don't re-calculate what's already been done + $return_arr[$lang1][$lang2] + = $return_arr[$lang2][$lang1]; - $return_arr[$lang1][$lang2] = - $return_arr[$lang2][$lang1]; - - // calculate } else { - - $return_arr[$lang1][$lang2] = - $this->_normalize_score( - $this->_distance( - $this->_lang_db[$lang1], - $this->_lang_db[$lang2] - ) + // calculate + $return_arr[$lang1][$lang2] + = $this->_normalize_score( + $this->_distance( + $this->_lang_db[$lang1], + $this->_lang_db[$lang2] + ) ); } @@ -1150,20 +1138,14 @@ class Text_LanguageDetect * * @access public * @return array language cluster data - * @throws PEAR_Error + * @throws Text_LanguageDetect_Exception * @see languageSimilarity() 
- * @deprecated this function will eventually be removed and placed into + * @deprecated this function will eventually be removed and placed into * the model generation class */ function clusterLanguages() { // todo: set the maximum number of clusters - - // setup check - if (!$this->_setup_ok($err)) { - return $err; - } - // return cached result, if any if (isset($this->_clusters)) { return $this->_clusters; @@ -1177,7 +1159,10 @@ class Text_LanguageDetect foreach ($langs as $lang) { if (!isset($this->_lang_db[$lang])) { - throw new Exception("missing $lang!\n"); + throw new Text_LanguageDetect_Exception( + "missing $lang!", + Text_LanguageDetect_Exception::UNKNOWN_LANGUAGE + ); } } @@ -1186,7 +1171,9 @@ class Text_LanguageDetect $langs[$lang1] = $lang1; unset($langs[$old_key]); } - + + $result_data = $really_map = array(); + $i = 0; while (count($langs) > 2 && $i++ < 200) { $highest_score = -1; @@ -1194,18 +1181,22 @@ class Text_LanguageDetect $highest_key2 = ''; foreach ($langs as $lang1) { foreach ($langs as $lang2) { - if ( $lang1 != $lang2 - && $arr[$lang1][$lang2] > $highest_score) { + if ($lang1 != $lang2 + && $arr[$lang1][$lang2] > $highest_score + ) { $highest_score = $arr[$lang1][$lang2]; $highest_key1 = $lang1; $highest_key2 = $lang2; } } } - + if (!$highest_key1) { // should not ever happen - throw new Exception("no highest key? (step: $i)"); + throw new Text_LanguageDetect_Exception( + "no highest key? 
(step: $i)", + Text_LanguageDetect_Exception::NO_HIGHEST_KEY + ); } if ($highest_score == 0) { @@ -1217,7 +1208,7 @@ class Text_LanguageDetect $sum1 = array_sum($arr[$highest_key1]); $sum2 = array_sum($arr[$highest_key2]); - // use the score for the one that is most similar to the rest of + // use the score for the one that is most similar to the rest of // the field as the score for the group // todo: could try averaging or "centroid" method instead // seems like that might make more sense @@ -1248,7 +1239,7 @@ class Text_LanguageDetect $really_lang = $replaceme; while (isset($really_map[$really_lang])) { $really_lang = $really_map[$really_lang]; - } + } $really_map[$newkey] = $really_lang; @@ -1259,8 +1250,8 @@ class Text_LanguageDetect $arr[$key1][$newkey] = $arr[$key1][$key2]; unset($arr[$key1][$key2]); // replacing $arr[$key1][$key2] with $arr[$key1][$newkey] - } - + } + if ($key1 == $replaceme) { $arr[$newkey][$key2] = $arr[$key1][$key2]; unset($arr[$key1][$key2]); @@ -1273,7 +1264,7 @@ class Text_LanguageDetect } } } - + unset($langs[$highest_key1]); unset($langs[$highest_key2]); @@ -1293,7 +1284,7 @@ class Text_LanguageDetect } $return_val = array( - 'open_forks' => $langs, + 'open_forks' => $langs, // the top level of clusters // clusters that are mutually exclusive // or specified by a specific maximum @@ -1323,11 +1314,11 @@ class Text_LanguageDetect * use, and it may disappear or its functionality may change in future * releases without notice. * - * This compares the sample text to top the top level of clusters. If the + * This compares the sample text to top the top level of clusters. If the * sample is similar to the cluster it will drop down and compare it to the * languages in the cluster, and so on until it hits a leaf node. 
* - * this should find the language in considerably fewer compares + * this should find the language in considerably fewer compares * (the equivalent of a binary search), however clusterLanguages() is costly * and the loss of accuracy from this technique is significant. * @@ -1337,15 +1328,14 @@ class Text_LanguageDetect * was very large, however in such cases some method of Bayesian inference * might be more helpful. * - * @see clusterLanguages() - * @access public - * @param string $str input string - * @return array language scores (only those compared) - * @throws PEAR_Error + * @param string $str input string + * + * @return array language scores (only those compared) + * @throws Text_LanguageDetect_Exception + * @see clusterLanguages() */ - function clusteredSearch($str) + public function clusteredSearch($str) { - // input check if (!Text_LanguageDetect_Parser::validateString($str)) { return array(); @@ -1359,7 +1349,7 @@ class Text_LanguageDetect $dendogram_data = $result['fork_data']; $dendogram_alias = $result['name_map']; - $sample_obj = new Text_LanguageDetect_Parser($str, $this->_db_filename, $this->_unicode_db_filename); + $sample_obj = new Text_LanguageDetect_Parser($str); $sample_obj->prepareTrigram(); $sample_obj->setPadStart(!$this->_perl_compatible); $sample_obj->analyze(); @@ -1372,7 +1362,7 @@ class Text_LanguageDetect } $i = 0; // counts the number of steps - + foreach ($dendogram_start as $lang) { if (isset($dendogram_alias[$lang])) { $lang_key = $dendogram_alias[$lang]; @@ -1382,7 +1372,8 @@ class Text_LanguageDetect $scores[$lang] = $this->_normalize_score( $this->_distance($this->_lang_db[$lang_key], $sample_result), - $sample_count); + $sample_count + ); $i++; } @@ -1411,7 +1402,8 @@ class Text_LanguageDetect $scores[$lang] = $this->_normalize_score( $this->_distance($this->_lang_db[$lang_key], $sample_result), - $sample_count); + $sample_count + ); //todo: does not need to do same comparison again } @@ -1428,8 +1420,8 @@ class 
Text_LanguageDetect $diff = $scores[$cur_key] - $scores[$loser_key]; - // $cur_key ({$dendogram_alias[$cur_key]}) wins - // over $loser_key ({$dendogram_alias[$loser_key]}) + // $cur_key ({$dendogram_alias[$cur_key]}) wins + // over $loser_key ({$dendogram_alias[$loser_key]}) // with a difference of $diff } @@ -1439,9 +1431,9 @@ class Text_LanguageDetect // which paths the algorithm decided to take along the tree // but sometimes the last item is only the second highest - if ( ($this->_perl_compatible && (end($scores) > prev($scores))) - || (!$this->_perl_compatible && (end($scores) < prev($scores)))) { - + if (($this->_perl_compatible && (end($scores) > prev($scores))) + || (!$this->_perl_compatible && (end($scores) < prev($scores))) + ) { $real_last_score = current($scores); $real_last_key = key($scores); @@ -1449,7 +1441,7 @@ class Text_LanguageDetect unset($scores[$real_last_key]); $scores[$real_last_key] = $real_last_score; } - + if (!$this->_perl_compatible) { $scores = array_reverse($scores, true); @@ -1464,12 +1456,11 @@ class Text_LanguageDetect * * Returns the numbers of characters (not bytes) in a utf8 string * - * @static - * @access public - * @param string $str string to get the length of - * @return int number of chars + * @param string $str string to get the length of + * + * @return int number of chars */ - function utf8strlen($str) + public static function utf8strlen($str) { // utf8_decode() will convert unknown chars to '?', which is actually // ideal for counting. 
@@ -1482,53 +1473,45 @@ class Text_LanguageDetect /** * Returns the unicode value of a utf8 char * - * @access protected - * @param string $char a utf8 (possibly multi-byte) char - * @return int unicode value or -1 if malformatted + * @param string $char a utf8 (possibly multi-byte) char + * + * @return int unicode value + * @access protected + * @link http://en.wikipedia.org/wiki/UTF-8 */ - function _utf8char2unicode($char) { - + function _utf8char2unicode($char) + { // strlen() here will actually get the binary length of a single char switch (strlen($char)) { + case 1: + // normal ASCII-7 byte + // 0xxxxxxx --> 0xxxxxxx + return ord($char{0}); - // for a reference, see http://en.wikipedia.org/wiki/UTF-8 + case 2: + // 2 byte unicode + // 110zzzzx 10xxxxxx --> 00000zzz zxxxxxxx + $z = (ord($char{0}) & 0x000001F) << 6; + $x = (ord($char{1}) & 0x0000003F); + return ($z | $x); - case 1: - // normal ASCII-7 byte - // 0xxxxxxx --> 0xxxxxxx - return ord($char{0}); + case 3: + // 3 byte unicode + // 1110zzzz 10zxxxxx 10xxxxxx --> zzzzzxxx xxxxxxxx + $z = (ord($char{0}) & 0x0000000F) << 12; + $x1 = (ord($char{1}) & 0x0000003F) << 6; + $x2 = (ord($char{2}) & 0x0000003F); + return ($z | $x1 | $x2); - case 2: - // 2 byte unicode - // 110zzzzx 10xxxxxx --> 00000zzz zxxxxxxx - $z = (ord($char{0}) & 0x000001F) << 6; - $x = (ord($char{1}) & 0x0000003F); - - return ($z | $x); - - case 3: - // 3 byte unicode - // 1110zzzz 10zxxxxx 10xxxxxx --> zzzzzxxx xxxxxxxx - $z = (ord($char{0}) & 0x0000000F) << 12; - $x1 = (ord($char{1}) & 0x0000003F) << 6; - $x2 = (ord($char{2}) & 0x0000003F); - - return ($z | $x1 | $x2); - - case 4: - // 4 byte unicode - // 11110zzz 10zzxxxx 10xxxxxx 10xxxxxx --> - // 000zzzzz xxxxxxxx xxxxxxxx - $z1 = (ord($char{0}) & 0x00000007) << 18; - $z2 = (ord($char{1}) & 0x0000003F) << 12; - $x1 = (ord($char{2}) & 0x0000003F) << 6; - $x2 = (ord($char{3}) & 0x0000003F); - - return ($z1 | $z2 | $x1 | $x2); - - default: - // error: malformatted char? 
- return -1; + case 4: + // 4 byte unicode + // 11110zzz 10zzxxxx 10xxxxxx 10xxxxxx --> + // 000zzzzz xxxxxxxx xxxxxxxx + $z1 = (ord($char{0}) & 0x00000007) << 18; + $z2 = (ord($char{1}) & 0x0000003F) << 12; + $x1 = (ord($char{2}) & 0x0000003F) << 6; + $x2 = (ord($char{3}) & 0x0000003F); + return ($z1 | $z2 | $x1 | $x2); } } @@ -1536,18 +1519,18 @@ class Text_LanguageDetect * utf8-safe fast character iterator * * Will get the next character starting from $counter, which will then be - * incremented. If a multi-byte char the bytes will be concatenated and + * incremented. If a multi-byte char the bytes will be concatenated and * $counter will be incremeted by the number of bytes in the char. * - * @access private - * @param string &$str the string being iterated over - * @param int &$counter the iterator, will increment by reference - * @param bool $special_convert whether to do special conversions - * @return char the next (possibly multi-byte) char from $counter + * @param string $str the string being iterated over + * @param int &$counter the iterator, will increment by reference + * @param bool $special_convert whether to do special conversions + * + * @return char the next (possibly multi-byte) char from $counter + * @access private */ - function _next_char(&$str, &$counter, $special_convert = false) + static function _next_char($str, &$counter, $special_convert = false) { - $char = $str{$counter++}; $ord = ord($char); @@ -1556,7 +1539,6 @@ class Text_LanguageDetect // normal ascii one byte char if ($ord <= 127) { - // special conversions needed for this package // (that only apply to regular ascii characters) // lower case, and convert all non-alphanumeric characters @@ -1571,8 +1553,8 @@ class Text_LanguageDetect return $char; - // multi-byte chars } elseif ($ord >> 5 == 6) { // two-byte char + // multi-byte chars $nextchar = $str{$counter++}; // get next byte // lower-casing of non-ascii characters is still incomplete @@ -1582,27 +1564,27 @@ class 
Text_LanguageDetect if ($ord == 195) { $nextord = ord($nextchar); $nextord_adj = $nextord + 64; - // for a reference, see + // for a reference, see // http://www.ramsch.org/martin/uni/fmi-hp/iso8859-1.html // À - Þ but not × - if ( $nextord_adj >= 192 - && $nextord_adj <= 222 - && $nextord_adj != 215) { - - $nextchar = chr($nextord + 32); + if ($nextord_adj >= 192 + && $nextord_adj <= 222 + && $nextord_adj != 215 + ) { + $nextchar = chr($nextord + 32); } - // lower case cyrillic alphabet } elseif ($ord == 208) { + // lower case cyrillic alphabet $nextord = ord($nextchar); // if A - Pe if ($nextord >= 144 && $nextord <= 159) { // lower case $nextchar = chr($nextord + 32); - // if Er - Ya } elseif ($nextord >= 160 && $nextord <= 175) { + // if Er - Ya // lower case $char = chr(209); // == $ord++ $nextchar = chr($nextord - 32); @@ -1611,12 +1593,11 @@ class Text_LanguageDetect } // tag on next byte - return $char . $nextchar; - + return $char . $nextchar; } elseif ($ord >> 4 == 14) { // three-byte char - + // tag on next 2 bytes - return $char . $str{$counter++} . $str{$counter++}; + return $char . $str{$counter++} . $str{$counter++}; } elseif ($ord >> 3 == 30) { // four-byte char @@ -1628,8 +1609,85 @@ class Text_LanguageDetect } } + /** + * Converts an $language input parameter from the configured mode + * to the language name that is used internally. + * + * Works for strings and arrays. + * + * @param string|array $lang A language description ("english"/"en"/"eng") + * @param boolean $convertKey If $lang is an array, setting $key + * converts the keys to the language name. 
+ * + * @return string|array Language name + */ + function _convertFromNameMode($lang, $convertKey = false) + { + if ($this->_name_mode == 0) { + return $lang; + } + + if ($this->_name_mode == 2) { + $method = 'code2ToName'; + } else { + $method = 'code3ToName'; + } + + if (is_string($lang)) { + return (string)Text_LanguageDetect_ISO639::$method($lang); + } + + $newlang = array(); + foreach ($lang as $key => $val) { + if ($convertKey) { + $newkey = (string)Text_LanguageDetect_ISO639::$method($key); + $newlang[$newkey] = $val; + } else { + $newlang[$key] = (string)Text_LanguageDetect_ISO639::$method($val); + } + } + return $newlang; + } + + /** + * Converts an $language output parameter from the language name that is + * used internally to the configured mode. + * + * Works for strings and arrays. + * + * @param string|array $lang A language description ("english"/"en"/"eng") + * @param boolean $convertKey If $lang is an array, setting $key + * converts the keys to the language name. + * + * @return string|array Language name + */ + function _convertToNameMode($lang, $convertKey = false) + { + if ($this->_name_mode == 0) { + return $lang; + } + + if ($this->_name_mode == 2) { + $method = 'nameToCode2'; + } else { + $method = 'nameToCode3'; + } + + if (is_string($lang)) { + return Text_LanguageDetect_ISO639::$method($lang); + } + + $newlang = array(); + foreach ($lang as $key => $val) { + if ($convertKey) { + $newkey = Text_LanguageDetect_ISO639::$method($key); + $newlang[$newkey] = $val; + } else { + $newlang[$key] = Text_LanguageDetect_ISO639::$method($val); + } + } + return $newlang; + } } -/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */ - -?> +/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */ \ No newline at end of file diff --git a/libraries/language-detect/LanguageDetect/Exception.php b/libraries/language-detect/LanguageDetect/Exception.php new file mode 100644 index 0000000..196d994 --- /dev/null +++ 
b/libraries/language-detect/LanguageDetect/Exception.php @@ -0,0 +1,57 @@ + + * @copyright 2011 Christian Weiske + * @license http://www.debian.org/misc/bsd.license BSD + * @version SVN: $Id$ + * @link http://pear.php.net/package/Text_LanguageDetect/ + */ + +/** + * Provides a mapping between the languages from lang.dat and the + * ISO 639-1 and ISO-639-2 codes. + * + * Note that this class contains only languages that exist in lang.dat. + * + * @category Text + * @package Text_LanguageDetect + * @author Christian Weiske + * @copyright 2011 Christian Weiske + * @license http://www.debian.org/misc/bsd.license BSD + * @link http://www.loc.gov/standards/iso639-2/php/code_list.php + */ +class Text_LanguageDetect_ISO639 +{ + /** + * Maps all language names from the language database to the + * ISO 639-1 2-letter language code. + * + * NULL indicates that there is no 2-letter code. + * + * @var array + */ + public static $nameToCode2 = array( + 'albanian' => 'sq', + 'arabic' => 'ar', + 'azeri' => 'az', + 'bengali' => 'bn', + 'bulgarian' => 'bg', + 'cebuano' => null, + 'croatian' => 'hr', + 'czech' => 'cs', + 'danish' => 'da', + 'dutch' => 'nl', + 'english' => 'en', + 'estonian' => 'et', + 'farsi' => 'fa', + 'finnish' => 'fi', + 'french' => 'fr', + 'german' => 'de', + 'hausa' => 'ha', + 'hawaiian' => null, + 'hindi' => 'hi', + 'hungarian' => 'hu', + 'icelandic' => 'is', + 'indonesian' => 'id', + 'italian' => 'it', + 'kazakh' => 'kk', + 'kyrgyz' => 'ky', + 'latin' => 'la', + 'latvian' => 'lv', + 'lithuanian' => 'lt', + 'macedonian' => 'mk', + 'mongolian' => 'mn', + 'nepali' => 'ne', + 'norwegian' => 'no', + 'pashto' => 'ps', + 'pidgin' => null, + 'polish' => 'pl', + 'portuguese' => 'pt', + 'romanian' => 'ro', + 'russian' => 'ru', + 'serbian' => 'sr', + 'slovak' => 'sk', + 'slovene' => 'sl', + 'somali' => 'so', + 'spanish' => 'es', + 'swahili' => 'sw', + 'swedish' => 'sv', + 'tagalog' => 'tl', + 'turkish' => 'tr', + 'ukrainian' => 'uk', + 'urdu' => 'ur', + 'uzbek' => 'uz', 
+ 'vietnamese' => 'vi', + 'welsh' => 'cy', + ); + + /** + * Maps all language names from the language database to the + * ISO 639-2 3-letter language code. + * + * @var array + */ + public static $nameToCode3 = array( + 'albanian' => 'sqi', + 'arabic' => 'ara', + 'azeri' => 'aze', + 'bengali' => 'ben', + 'bulgarian' => 'bul', + 'cebuano' => 'ceb', + 'croatian' => 'hrv', + 'czech' => 'ces', + 'danish' => 'dan', + 'dutch' => 'nld', + 'english' => 'eng', + 'estonian' => 'est', + 'farsi' => 'fas', + 'finnish' => 'fin', + 'french' => 'fra', + 'german' => 'deu', + 'hausa' => 'hau', + 'hawaiian' => 'haw', + 'hindi' => 'hin', + 'hungarian' => 'hun', + 'icelandic' => 'isl', + 'indonesian' => 'ind', + 'italian' => 'ita', + 'kazakh' => 'kaz', + 'kyrgyz' => 'kir', + 'latin' => 'lat', + 'latvian' => 'lav', + 'lithuanian' => 'lit', + 'macedonian' => 'mkd', + 'mongolian' => 'mon', + 'nepali' => 'nep', + 'norwegian' => 'nor', + 'pashto' => 'pus', + 'pidgin' => 'crp', + 'polish' => 'pol', + 'portuguese' => 'por', + 'romanian' => 'ron', + 'russian' => 'rus', + 'serbian' => 'srp', + 'slovak' => 'slk', + 'slovene' => 'slv', + 'somali' => 'som', + 'spanish' => 'spa', + 'swahili' => 'swa', + 'swedish' => 'swe', + 'tagalog' => 'tgl', + 'turkish' => 'tur', + 'ukrainian' => 'ukr', + 'urdu' => 'urd', + 'uzbek' => 'uzb', + 'vietnamese' => 'vie', + 'welsh' => 'cym', + ); + + /** + * Maps ISO 639-1 2-letter language codes to the language names + * in the language database + * + * Not all languages have a 2 letter code, so some are missing + * + * @var array + */ + public static $code2ToName = array( + 'ar' => 'arabic', + 'az' => 'azeri', + 'bg' => 'bulgarian', + 'bn' => 'bengali', + 'cs' => 'czech', + 'cy' => 'welsh', + 'da' => 'danish', + 'de' => 'german', + 'en' => 'english', + 'es' => 'spanish', + 'et' => 'estonian', + 'fa' => 'farsi', + 'fi' => 'finnish', + 'fr' => 'french', + 'ha' => 'hausa', + 'hi' => 'hindi', + 'hr' => 'croatian', + 'hu' => 'hungarian', + 'id' => 'indonesian', + 'is' => 
'icelandic', + 'it' => 'italian', + 'kk' => 'kazakh', + 'ky' => 'kyrgyz', + 'la' => 'latin', + 'lt' => 'lithuanian', + 'lv' => 'latvian', + 'mk' => 'macedonian', + 'mn' => 'mongolian', + 'ne' => 'nepali', + 'nl' => 'dutch', + 'no' => 'norwegian', + 'pl' => 'polish', + 'ps' => 'pashto', + 'pt' => 'portuguese', + 'ro' => 'romanian', + 'ru' => 'russian', + 'sk' => 'slovak', + 'sl' => 'slovene', + 'so' => 'somali', + 'sq' => 'albanian', + 'sr' => 'serbian', + 'sv' => 'swedish', + 'sw' => 'swahili', + 'tl' => 'tagalog', + 'tr' => 'turkish', + 'uk' => 'ukrainian', + 'ur' => 'urdu', + 'uz' => 'uzbek', + 'vi' => 'vietnamese', + ); + + /** + * Maps ISO 639-2 3-letter language codes to the language names + * in the language database. + * + * @var array + */ + public static $code3ToName = array( + 'ara' => 'arabic', + 'aze' => 'azeri', + 'ben' => 'bengali', + 'bul' => 'bulgarian', + 'ceb' => 'cebuano', + 'ces' => 'czech', + 'crp' => 'pidgin', + 'cym' => 'welsh', + 'dan' => 'danish', + 'deu' => 'german', + 'eng' => 'english', + 'est' => 'estonian', + 'fas' => 'farsi', + 'fin' => 'finnish', + 'fra' => 'french', + 'hau' => 'hausa', + 'haw' => 'hawaiian', + 'hin' => 'hindi', + 'hrv' => 'croatian', + 'hun' => 'hungarian', + 'ind' => 'indonesian', + 'isl' => 'icelandic', + 'ita' => 'italian', + 'kaz' => 'kazakh', + 'kir' => 'kyrgyz', + 'lat' => 'latin', + 'lav' => 'latvian', + 'lit' => 'lithuanian', + 'mkd' => 'macedonian', + 'mon' => 'mongolian', + 'nep' => 'nepali', + 'nld' => 'dutch', + 'nor' => 'norwegian', + 'pol' => 'polish', + 'por' => 'portuguese', + 'pus' => 'pashto', + 'rom' => 'romanian', + 'rus' => 'russian', + 'slk' => 'slovak', + 'slv' => 'slovene', + 'som' => 'somali', + 'spa' => 'spanish', + 'sqi' => 'albanian', + 'srp' => 'serbian', + 'swa' => 'swahili', + 'swe' => 'swedish', + 'tgl' => 'tagalog', + 'tur' => 'turkish', + 'ukr' => 'ukrainian', + 'urd' => 'urdu', + 'uzb' => 'uzbek', + 'vie' => 'vietnamese', + ); + + /** + * Returns the 2-letter ISO 639-1 code for the 
given language name. + * + * @param string $lang English language name like "swedish" + * + * @return string Two-letter language code (e.g. "sv") or NULL if not found + */ + public static function nameToCode2($lang) + { + $lang = strtolower($lang); + if (!isset(self::$nameToCode2[$lang])) { + return null; + } + return self::$nameToCode2[$lang]; + } + + /** + * Returns the 3-letter ISO 639-2 code for the given language name. + * + * @param string $lang English language name like "swedish" + * + * @return string Three-letter language code (e.g. "swe") or NULL if not found + */ + public static function nameToCode3($lang) + { + $lang = strtolower($lang); + if (!isset(self::$nameToCode3[$lang])) { + return null; + } + return self::$nameToCode3[$lang]; + } + + /** + * Returns the language name for the given 2-letter ISO 639-1 code. + * + * @param string $code Two-letter language code (e.g. "sv"); matched case-insensitively + * + * @return string English language name like "swedish" or NULL if not found + */ + public static function code2ToName($code) + { + $code = strtolower($code); + if (!isset(self::$code2ToName[$code])) { + return null; + } + return self::$code2ToName[$code]; + } + + /** + * Returns the language name for the given 3-letter ISO 639-2 code. + * + * @param string $code Three-letter language code (e.g. 
"swe"); matched case-insensitively + * + * @return string English language name like "swedish" or NULL if not found + */ + public static function code3ToName($code) + { + $code = strtolower($code); + if (!isset(self::$code3ToName[$code])) { + return null; + } + return self::$code3ToName[$code]; + } +} \ No newline at end of file diff --git a/libraries/language-detect/Parser.php b/libraries/language-detect/LanguageDetect/Parser.php similarity index 94% rename from libraries/language-detect/Parser.php rename to libraries/language-detect/LanguageDetect/Parser.php index 7f15fa9..fb0e1e2 100644 --- a/libraries/language-detect/Parser.php +++ b/libraries/language-detect/LanguageDetect/Parser.php @@ -8,7 +8,7 @@ * @author Nicholas Pisarro * @copyright 2006 * @license BSD - * @version CVS: $Id: Parser.php,v 1.5 2006/03/11 05:45:05 taak Exp $ + * @version CVS: $Id: Parser.php 322327 2012-01-15 17:55:59Z cweiske $ * @link http://pear.php.net/package/Text_LanguageDetect/ * @link http://langdetect.blogspot.com/ */ @@ -28,7 +28,7 @@ * @author Nicholas Pisarro * @copyright 2006 * @license BSD - * @version release: 0.2.3 + * @version release: 0.3.0 */ class Text_LanguageDetect_Parser extends Text_LanguageDetect { @@ -102,21 +102,17 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect * @access private * @param string $string string to be parsed */ - function Text_LanguageDetect_Parser($string, $db=null, $unicode_db=null) { - if (isset($db)) $this->_db_filename = $db; - if (isset($unicode_db)) $this->_unicode_db_filename = $unicode_db; + function Text_LanguageDetect_Parser($string) { $this->_string = $string; } /** * Returns true if a string is suitable for parsing * - * @static - * @access public * @param string $str input string to test * @return bool true if acceptable, false if not */ - function validateString($str) { + public static function validateString($str) { if (!empty($str) && strlen($str) > 3 && preg_match('/\S/', $str)) { return true; } else { @@ -222,8 +218,7 @@ class Text_LanguageDetect_Parser extends 
Text_LanguageDetect // unicode startup if ($this->_compile_unicode) { - $blocks =& $this->_read_unicode_block_db(); - + $blocks = $this->_read_unicode_block_db(); $block_count = count($blocks); $skipped_count = 0; @@ -349,6 +344,4 @@ class Text_LanguageDetect_Parser extends Text_LanguageDetect } } -/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */ - -?> +/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */ \ No newline at end of file diff --git a/libraries/readability/Readability.php b/libraries/readability/Readability.php index 442430f..d0f09d7 100644 --- a/libraries/readability/Readability.php +++ b/libraries/readability/Readability.php @@ -1059,8 +1059,8 @@ class Readability } else if ( $input > floor($p/3) ) { $this->dbg(' too many elements'); $toRemove = true; - } else if ($contentLength < 25 && ($embedCount === 0 && ($img === 0 || $img > 2))) { - $this->dbg(' content length less than 25 chars, 0 embeds and either 0 images or more than 2 images'); + } else if ($contentLength < 10 && ($embedCount === 0 && ($img === 0 || $img > 2))) { + $this->dbg(' content length less than 10 chars, 0 embeds and either 0 images or more than 2 images'); $toRemove = true; } else if($weight < 25 && $linkDensity > 0.2) { $this->dbg(' weight smaller than 25 and link density above 0.2'); diff --git a/makefulltextfeed.php b/makefulltextfeed.php index 20296ba..ac4c81b 100644 --- a/makefulltextfeed.php +++ b/makefulltextfeed.php @@ -3,8 +3,8 @@ // Author: Keyvan Minoukadeh // Copyright (c) 2013 Keyvan Minoukadeh // License: AGPLv3 -// Version: 3.1 -// Date: 2013-03-05 +// Version: 3.2 +// Date: 2013-05-13 // More info: http://fivefilters.org/content-only/ // Help: http://help.fivefilters.org @@ -25,12 +25,8 @@ along with this program. If not, see . 
// Usage // ----- -// Request this file passing it your feed in the querystring: makefulltextfeed.php?url=mysite.org -// The following options can be passed in the querystring: -// * URL: url=[feed or website url] (required, should be URL-encoded - in php: urlencode($url)) -// * URL points to HTML (not feed): html=true (optional, by default it's automatically detected) -// * API key: key=[api key] (optional, refer to config.php) -// * Max entries to process: max=[max number of items] (optional) +// Request this file passing it a web page or feed URL in the querystring: makefulltextfeed.php?url=example.org/article +// For more request parameters, see http://help.fivefilters.org/customer/portal/articles/226660-usage error_reporting(E_ALL ^ E_NOTICE); ini_set("display_errors", 1); @@ -199,6 +195,8 @@ if (isset($_GET['key']) && ($key_index = array_search($_GET['key'], $options->ap if (isset($_GET['l'])) $redirect .= '&l='.urlencode($_GET['l']); if (isset($_GET['xss'])) $redirect .= '&xss'; if (isset($_GET['use_extracted_title'])) $redirect .= '&use_extracted_title'; + if (isset($_GET['content'])) $redirect .= '&content='.urlencode($_GET['content']); + if (isset($_GET['summary'])) $redirect .= '&summary='.urlencode($_GET['summary']); if (isset($_GET['debug'])) $redirect .= '&debug'; if ($debug_mode) { debug('Redirecting to hide access key, follow URL below to continue'); @@ -284,6 +282,28 @@ if ($options->favour_feed_titles == 'user') { $favour_feed_titles = $options->favour_feed_titles; } +/////////////////////////////////////////////// +// Include full content in output? +/////////////////////////////////////////////// +if ($options->content === 'user') { + if (isset($_GET['content']) && $_GET['content'] === '0') { + $options->content = false; + } else { + $options->content = true; + } +} + +/////////////////////////////////////////////// +// Include summaries in output? 
+/////////////////////////////////////////////// +if ($options->summary === 'user') { + if (isset($_GET['summary']) && $_GET['summary'] === '1') { + $options->summary = true; + } else { + $options->summary = false; + } +} + /////////////////////////////////////////////// // Exclude items if extraction fails /////////////////////////////////////////////// @@ -306,15 +326,6 @@ if ($options->detect_language === 'user') { $detect_language = $options->detect_language; } -if ($detect_language >= 2) { - $language_codes = array('albanian' => 'sq','arabic' => 'ar','azeri' => 'az','bengali' => 'bn','bulgarian' => 'bg', - 'cebuano' => 'ceb', // ISO 639-2 - 'croatian' => 'hr','czech' => 'cs','danish' => 'da','dutch' => 'nl','english' => 'en','estonian' => 'et','farsi' => 'fa','finnish' => 'fi','french' => 'fr','german' => 'de','hausa' => 'ha', - 'hawaiian' => 'haw', // ISO 639-2 - 'hindi' => 'hi','hungarian' => 'hu','icelandic' => 'is','indonesian' => 'id','italian' => 'it','kazakh' => 'kk','kyrgyz' => 'ky','latin' => 'la','latvian' => 'lv','lithuanian' => 'lt','macedonian' => 'mk','mongolian' => 'mn','nepali' => 'ne','norwegian' => 'no','pashto' => 'ps', - 'pidgin' => 'cpe', // ISO 639-2 - 'polish' => 'pl','portuguese' => 'pt','romanian' => 'ro','russian' => 'ru','serbian' => 'sr','slovak' => 'sk','slovene' => 'sl','somali' => 'so','spanish' => 'es','swahili' => 'sw','swedish' => 'sv','tagalog' => 'tl','turkish' => 'tr','ukrainian' => 'uk','urdu' => 'ur','uzbek' => 'uz','vietnamese' => 'vi','welsh' => 'cy'); -} $use_cld = extension_loaded('cld') && (version_compare(PHP_VERSION, '5.3.0') >= 0); ///////////////////////////////////// @@ -364,7 +375,7 @@ if ($options->cors) header('Access-Control-Allow-Origin: *'); ////////////////////////////////// if ($options->caching) { debug('Caching is enabled...'); - $cache_id = md5($max.$url.$valid_key.$links.$favour_feed_titles.$xss_filter.$exclude_on_fail.$format.$detect_language.(int)isset($_GET['pubsub'])); + $cache_id = 
md5($max.$url.(int)$valid_key.$links.(int)$favour_feed_titles.(int)$options->content.(int)$options->summary.(int)$xss_filter.(int)$exclude_on_fail.$format.$detect_language.(int)isset($_GET['pubsub'])); $check_cache = true; if ($options->apc && $options->smart_cache) { apc_add("cache.$cache_id", 0, 10*60); @@ -605,14 +616,33 @@ foreach ($items as $key => $item) { $is_single_page = false; if ($single_page_response = getSinglePage($item, $html, $effective_url)) { $is_single_page = true; - $html = $single_page_response['body']; - // remove strange things - $html = str_replace('', '', $html); - $html = convert_to_utf8($html, $single_page_response['headers']); $effective_url = $single_page_response['effective_url']; - debug("Retrieved single-page view from $effective_url"); + // check if action defined for returned Content-Type + $mime_info = get_mime_action_info($single_page_response['headers']); + if (isset($mime_info['action'])) { + if ($mime_info['action'] == 'exclude') { + continue; // skip this feed item entry + } elseif ($mime_info['action'] == 'link') { + if ($mime_info['type'] == 'image') { + $html = "\"{$mime_info['name']}\""; + } else { + $html = "Download {$mime_info['name']}"; + } + $extracted_title = $mime_info['name']; + $do_content_extraction = false; + } + } + if ($do_content_extraction) { + $html = $single_page_response['body']; + // remove strange things + $html = str_replace('', '', $html); + $html = convert_to_utf8($html, $single_page_response['headers']); + debug("Retrieved single-page view from $effective_url"); + } unset($single_page_response); } + } + if ($do_content_extraction) { debug('--------'); debug('Attempting to extract content'); $extract_result = $extractor->process($html, $effective_url); @@ -622,7 +652,7 @@ foreach ($items as $key => $item) { // Deal with multi-page articles //die('Next: '.$extractor->getNextPageUrl()); $is_multi_page = (!$is_single_page && $extract_result && $extractor->getNextPageUrl()); - if ($options->multipage && 
$is_multi_page) { + if ($options->multipage && $is_multi_page && $options->content) { debug('--------'); debug('Attempting to process multi-page article'); $multi_page_urls = array(); @@ -660,13 +690,15 @@ foreach ($items as $key => $item) { // did we successfully deal with this multi-page article? if (empty($multi_page_content)) { debug('Failed to extract all parts of multi-page article, so not going to include them'); - $multi_page_content[] = $readability->dom->createElement('p')->innerHTML = 'This article appears to continue on subsequent pages which we could not extract'; + $_page = $readability->dom->createElement('p'); + $_page->innerHTML = 'This article appears to continue on subsequent pages which we could not extract'; + $multi_page_content[] = $_page; } foreach ($multi_page_content as $_page) { $_page = $content_block->ownerDocument->importNode($_page, true); $content_block->appendChild($_page); } - unset($multi_page_urls, $multi_page_content, $page_mime_info, $next_page_url); + unset($multi_page_urls, $multi_page_content, $page_mime_info, $next_page_url, $_page); } } // use extracted title for both feed and item title if we're using single-item dummy feed @@ -713,7 +745,7 @@ foreach ($items as $key => $item) { } else { $html = $content_block->ownerDocument->saveXML($content_block); // essentially outerHTML } - unset($content_block); + //unset($content_block); // post-processing cleanup $html = preg_replace('!

    [\s\h\v]*

    !u', '', $html); if ($links == 'remove') { @@ -726,130 +758,156 @@ foreach ($items as $key => $item) { } } - if ($valid_key && isset($_GET['pubsub'])) { // used only on fivefilters.org at the moment - $newitem->addElement('guid', 'http://fivefilters.org/content-only/redirect.php?url='.urlencode($item->get_permalink()), array('isPermaLink'=>'false')); + if ($valid_key && isset($_GET['pubsub'])) { // used only on fivefilters.org at the moment + $newitem->addElement('guid', 'http://fivefilters.org/content-only/redirect.php?url='.urlencode($item->get_permalink()), array('isPermaLink'=>'false')); + } else { + $newitem->addElement('guid', $item->get_permalink(), array('isPermaLink'=>'true')); + } + + // filter xss? + if ($xss_filter) { + debug('Filtering HTML to remove XSS'); + $html = htmLawed::hl($html, array('safe'=>1, 'deny_attribute'=>'style', 'comment'=>1, 'cdata'=>1)); + } + + // add content + if ($options->summary === true) { + // get summary + $summary = ''; + if (!$do_content_extraction) { + $summary = $html; } else { - $newitem->addElement('guid', $item->get_permalink(), array('isPermaLink'=>'true')); - } - // filter xss? - if ($xss_filter) { - debug('Filtering HTML to remove XSS'); - $html = htmLawed::hl($html, array('safe'=>1, 'deny_attribute'=>'style', 'comment'=>1, 'cdata'=>1)); - } - $newitem->setDescription($html); - - // set date - if ((int)$item->get_date('U') > 0) { - $newitem->setDate((int)$item->get_date('U')); - } elseif ($extractor->getDate()) { - $newitem->setDate($extractor->getDate()); - } - - // add authors - if ($authors = $item->get_authors()) { - foreach ($authors as $author) { - // for some feeds, SimplePie stores author's name as email, e.g. 
http://feeds.feedburner.com/nymag/intel - if ($author->get_name() !== null) { - $newitem->addElement('dc:creator', $author->get_name()); - } elseif ($author->get_email() !== null) { - $newitem->addElement('dc:creator', $author->get_email()); + // Try to get first few paragraphs + if (isset($content_block) && ($content_block instanceof DOMElement)) { + $_paras = $content_block->getElementsByTagName('p'); + foreach ($_paras as $_para) { + $summary .= preg_replace("/[\n\r\t ]+/", ' ', $_para->textContent).' '; + if (strlen($summary) > 200) break; } - } - } elseif ($authors = $extractor->getAuthors()) { - //TODO: make sure the list size is reasonable - foreach ($authors as $author) { - // TODO: xpath often selects authors from other articles linked from the page. - // for now choose first item - $newitem->addElement('dc:creator', $author); - break; + } else { + $summary = $html; } } - - // add language - if ($detect_language) { - $language = $extractor->getLanguage(); - if (!$language) $language = $feed->get_language(); - if (($detect_language == 3 || (!$language && $detect_language == 2)) && $text_sample) { - try { - if ($use_cld) { - // Use PHP-CLD extension - $php_cld = 'CLD\detect'; // in quotes to prevent PHP 5.2 parse error - $res = $php_cld($text_sample); - if (is_array($res) && count($res) > 0) { - $language = $res[0]['code']; - } - } else { - //die('what'); - // Use PEAR's Text_LanguageDetect - if (!isset($l)) { - $l = new Text_LanguageDetect('libraries/language-detect/lang.dat', 'libraries/language-detect/unicode_blocks.dat'); - } - $l_result = $l->detect($text_sample, 1); - if (count($l_result) > 0) { - $language = $language_codes[key($l_result)]; - } + unset($_paras, $_para); + $summary = get_excerpt($summary); + $newitem->setDescription($summary); + if ($options->content) $newitem->setElement('content:encoded', $html); + } else { + if ($options->content) $newitem->setDescription($html); + } + + // set date + if ((int)$item->get_date('U') > 0) { + 
$newitem->setDate((int)$item->get_date('U')); + } elseif ($extractor->getDate()) { + $newitem->setDate($extractor->getDate()); + } + + // add authors + if ($authors = $item->get_authors()) { + foreach ($authors as $author) { + // for some feeds, SimplePie stores author's name as email, e.g. http://feeds.feedburner.com/nymag/intel + if ($author->get_name() !== null) { + $newitem->addElement('dc:creator', $author->get_name()); + } elseif ($author->get_email() !== null) { + $newitem->addElement('dc:creator', $author->get_email()); + } + } + } elseif ($authors = $extractor->getAuthors()) { + //TODO: make sure the list size is reasonable + foreach ($authors as $author) { + // TODO: xpath often selects authors from other articles linked from the page. + // for now choose first item + $newitem->addElement('dc:creator', $author); + break; + } + } + + // add language + if ($detect_language) { + $language = $extractor->getLanguage(); + if (!$language) $language = $feed->get_language(); + if (($detect_language == 3 || (!$language && $detect_language == 2)) && $text_sample) { + try { + if ($use_cld) { + // Use PHP-CLD extension + $php_cld = 'CLD\detect'; // in quotes to prevent PHP 5.2 parse error + $res = $php_cld($text_sample); + if (is_array($res) && count($res) > 0) { + $language = $res[0]['code']; + } + } else { + //die('what'); + // Use PEAR's Text_LanguageDetect + if (!isset($l)) { + $l = new Text_LanguageDetect(); + $l->setNameMode(2); // return ISO 639-1 codes (e.g. "en") } - } catch (Exception $e) { - //die('error: '.$e); - // do nothing - } - } - if ($language && (strlen($language) < 7)) { - $newitem->addElement('dc:language', $language); - } - } - - // add MIME type (if it appeared in our exclusions lists) - if (isset($mime_info['mime'])) $newitem->addElement('dc:format', $mime_info['mime']); - // add effective URL (URL after redirects) - if (isset($effective_url)) { - //TODO: ensure $effective_url is valid witout - sometimes it causes problems, e.g. 
- //http://www.siasat.pk/forum/showthread.php?108883-Pakistan-Chowk-by-Rana-Mubashir-–-25th-March-2012-Special-Program-from-Liari-(Karachi) - //temporary measure: use utf8_encode() - $newitem->addElement('dc:identifier', remove_url_cruft(utf8_encode($effective_url))); - } else { - $newitem->addElement('dc:identifier', remove_url_cruft($item->get_permalink())); - } - - // add categories - if ($categories = $item->get_categories()) { - foreach ($categories as $category) { - if ($category->get_label() !== null) { - $newitem->addElement('category', $category->get_label()); - } - } - } - - // check for enclosures - if ($options->keep_enclosures) { - if ($enclosures = $item->get_enclosures()) { - foreach ($enclosures as $enclosure) { - // thumbnails - foreach ((array)$enclosure->get_thumbnails() as $thumbnail) { - $newitem->addElement('media:thumbnail', '', array('url'=>$thumbnail)); + $l_result = $l->detect($text_sample, 1); + if (count($l_result) > 0) { + $language = key($l_result); } - if (!$enclosure->get_link()) continue; - $enc = array(); - // Media RSS spec ($enc): http://search.yahoo.com/mrss - // SimplePie methods ($enclosure): http://simplepie.org/wiki/reference/start#methods4 - $enc['url'] = $enclosure->get_link(); - if ($enclosure->get_length()) $enc['fileSize'] = $enclosure->get_length(); - if ($enclosure->get_type()) $enc['type'] = $enclosure->get_type(); - if ($enclosure->get_medium()) $enc['medium'] = $enclosure->get_medium(); - if ($enclosure->get_expression()) $enc['expression'] = $enclosure->get_expression(); - if ($enclosure->get_bitrate()) $enc['bitrate'] = $enclosure->get_bitrate(); - if ($enclosure->get_framerate()) $enc['framerate'] = $enclosure->get_framerate(); - if ($enclosure->get_sampling_rate()) $enc['samplingrate'] = $enclosure->get_sampling_rate(); - if ($enclosure->get_channels()) $enc['channels'] = $enclosure->get_channels(); - if ($enclosure->get_duration()) $enc['duration'] = $enclosure->get_duration(); - if ($enclosure->get_height()) 
$enc['height'] = $enclosure->get_height(); - if ($enclosure->get_width()) $enc['width'] = $enclosure->get_width(); - if ($enclosure->get_language()) $enc['lang'] = $enclosure->get_language(); - $newitem->addElement('media:content', '', $enc); } + } catch (Exception $e) { + //die('error: '.$e); + // do nothing } } - /* } */ + if ($language && (strlen($language) < 7)) { + $newitem->addElement('dc:language', $language); + } + } + + // add MIME type (if it appeared in our exclusions lists) + if (isset($mime_info['mime'])) $newitem->addElement('dc:format', $mime_info['mime']); + // add effective URL (URL after redirects) + if (isset($effective_url)) { + //TODO: ensure $effective_url is valid witout - sometimes it causes problems, e.g. + //http://www.siasat.pk/forum/showthread.php?108883-Pakistan-Chowk-by-Rana-Mubashir-–-25th-March-2012-Special-Program-from-Liari-(Karachi) + //temporary measure: use utf8_encode() + $newitem->addElement('dc:identifier', remove_url_cruft(utf8_encode($effective_url))); + } else { + $newitem->addElement('dc:identifier', remove_url_cruft($item->get_permalink())); + } + + // add categories + if ($categories = $item->get_categories()) { + foreach ($categories as $category) { + if ($category->get_label() !== null) { + $newitem->addElement('category', $category->get_label()); + } + } + } + + // check for enclosures + if ($options->keep_enclosures) { + if ($enclosures = $item->get_enclosures()) { + foreach ($enclosures as $enclosure) { + // thumbnails + foreach ((array)$enclosure->get_thumbnails() as $thumbnail) { + $newitem->addElement('media:thumbnail', '', array('url'=>$thumbnail)); + } + if (!$enclosure->get_link()) continue; + $enc = array(); + // Media RSS spec ($enc): http://search.yahoo.com/mrss + // SimplePie methods ($enclosure): http://simplepie.org/wiki/reference/start#methods4 + $enc['url'] = $enclosure->get_link(); + if ($enclosure->get_length()) $enc['fileSize'] = $enclosure->get_length(); + if ($enclosure->get_type()) $enc['type'] 
= $enclosure->get_type(); + if ($enclosure->get_medium()) $enc['medium'] = $enclosure->get_medium(); + if ($enclosure->get_expression()) $enc['expression'] = $enclosure->get_expression(); + if ($enclosure->get_bitrate()) $enc['bitrate'] = $enclosure->get_bitrate(); + if ($enclosure->get_framerate()) $enc['framerate'] = $enclosure->get_framerate(); + if ($enclosure->get_sampling_rate()) $enc['samplingrate'] = $enclosure->get_sampling_rate(); + if ($enclosure->get_channels()) $enc['channels'] = $enclosure->get_channels(); + if ($enclosure->get_duration()) $enc['duration'] = $enclosure->get_duration(); + if ($enclosure->get_height()) $enc['height'] = $enclosure->get_height(); + if ($enclosure->get_width()) $enc['width'] = $enclosure->get_width(); + if ($enclosure->get_language()) $enc['lang'] = $enclosure->get_language(); + $newitem->addElement('media:content', '', $enc); + } + } + } $output->addItem($newitem); unset($html); $item_count++; @@ -906,6 +964,38 @@ if (!$debug_mode) { // HELPER FUNCTIONS /////////////////////////////// +// Returns a plain-text excerpt of $text: tags are stripped, at most $num_words whitespace-separated words are kept, and $more is appended only when words were cut off. Adapted from WordPress +// http://core.trac.wordpress.org/browser/tags/3.5.1/wp-includes/formatting.php#L2173 +function get_excerpt($text, $num_words=55, $more=null) { + if (null === $more) $more = '…'; + $text = strip_tags($text); + //TODO: Check if word count is based on single characters (East Asian characters) + /* + if (1==2) { + $text = trim(preg_replace("/[\n\r\t ]+/", ' ', $text), ' '); + preg_match_all('/./u', $text, $words_array); + $words_array = array_slice($words_array[0], 0, $num_words + 1); + $sep = ''; + } else { + $words_array = preg_split("/[\n\r\t ]+/", $text, $num_words + 1, PREG_SPLIT_NO_EMPTY); + $sep = ' '; + } + */ + $words_array = preg_split("/[\n\r\t ]+/", $text, $num_words + 1, PREG_SPLIT_NO_EMPTY); + $sep = ' '; + if (count($words_array) > $num_words) { + array_pop($words_array); + $text = implode($sep, $words_array); + $text = $text.$more; + } else { + $text = implode($sep, $words_array); + } + // trim 
whitespace (Unicode separator \pZ and "other" \pC characters) at beginning or end of string + // See: http://stackoverflow.com/questions/4166896/trim-unicode-whitespace-in-php-5-2 + $text = preg_replace('/^[\pZ\pC]+|[\pZ\pC]+$/u', '', $text); + return $text; +} + function url_allowed($url) { global $options; if (!empty($options->allowed_urls)) { @@ -1005,14 +1095,6 @@ function convert_to_utf8($html, $header=null) if (strtolower($encoding) != 'utf-8') { debug('Converting to UTF-8'); $html = SimplePie_Misc::change_encoding($html, $encoding, 'utf-8'); - /* - if (function_exists('iconv')) { - // iconv appears to handle certain character encodings better than mb_convert_encoding - $html = iconv($encoding, 'utf-8', $html); - } else { - $html = mb_convert_encoding($html, 'utf-8', $encoding); - } - */ } } } diff --git a/manifest.yml b/manifest.yml index 8d0c255..0834cba 100644 --- a/manifest.yml +++ b/manifest.yml @@ -1,3 +1,5 @@ +# This file is only used when deploying Full-Text RSS to AppFog. +# See http://help.fivefilters.org/customer/portal/articles/1143210-hosting --- applications: .: diff --git a/site_config/index.php b/site_config/index.php index a3d5f73..76ca8b3 100644 --- a/site_config/index.php +++ b/site_config/index.php @@ -1,3 +1,2 @@ \ No newline at end of file +// this is here to prevent directory listing over the web \ No newline at end of file diff --git a/site_config/standard/index.php b/site_config/standard/index.php deleted file mode 100644 index a3d5f73..0000000 --- a/site_config/standard/index.php +++ /dev/null @@ -1,3 +0,0 @@ - \ No newline at end of file diff --git a/site_config/standard/version.php b/site_config/standard/version.php deleted file mode 100644 index 34a8735..0000000 --- a/site_config/standard/version.php +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/site_config/standard/version.txt b/site_config/standard/version.txt index bf0d87a..eaf01eb 100644 --- a/site_config/standard/version.txt +++ b/site_config/standard/version.txt @@ -1 +1 @@ -4 \ No newline at end of 
file +2013-05-12T22:53:07Z \ No newline at end of file