2014-09-15 20:24:06 +00:00
<?php
/**
 * @file
 * The rules for generating output in the serializer.
 *
 * These output rules are likely to generate output similar to the document that
 * was parsed. They are not intended to reproduce the parsed document exactly.
 */
2015-06-14 00:03:20 +00:00
namespace Masterminds\HTML5\Serializer ;
2014-09-15 20:24:06 +00:00
2015-06-14 00:03:20 +00:00
use Masterminds\HTML5\Elements ;
2014-09-15 20:24:06 +00:00
/**
 * Generates HTML5 output based on element rules.
 */
2015-06-14 00:03:20 +00:00
class OutputRules implements \Masterminds\HTML5\Serializer\RulesInterface
{
/**
 * Defined in http://www.w3.org/TR/html51/infrastructure.html#html-namespace-0
 */
const NAMESPACE_HTML = 'http://www.w3.org/1999/xhtml' ;
2014-09-15 20:24:06 +00:00
2015-06-14 00:03:20 +00:00
const NAMESPACE_MATHML = 'http://www.w3.org/1998/Math/MathML' ;
2014-09-15 20:24:06 +00:00
2015-06-14 00:03:20 +00:00
const NAMESPACE_SVG = 'http://www.w3.org/2000/svg' ;
2014-09-15 20:24:06 +00:00
2015-06-14 00:03:20 +00:00
const NAMESPACE_XLINK = 'http://www.w3.org/1999/xlink' ;
2014-09-15 20:24:06 +00:00
2015-06-14 00:03:20 +00:00
const NAMESPACE_XML = 'http://www.w3.org/XML/1998/namespace' ;
2014-09-15 20:24:06 +00:00
2015-06-14 00:03:20 +00:00
const NAMESPACE_XMLNS = 'http://www.w3.org/2000/xmlns/' ;
/**
 * Namespace URIs that are treated as implicit: namespaceAttrs() does not
 * write a declaration attribute for any of them.
 *
 * @var array
 */
protected $implicitNamespaces = array (
self :: NAMESPACE_HTML ,
self :: NAMESPACE_SVG ,
self :: NAMESPACE_MATHML ,
self :: NAMESPACE_XML ,
self :: NAMESPACE_XMLNS ,
);
const IM_IN_HTML = 1 ;
const IM_IN_SVG = 2 ;
const IM_IN_MATHML = 3 ;
/**
 * Caches whether the ENT_HTML5 flag can be used when encoding entities.
 *
 * @var boolean
 */
private $hasHTML5 = false ;
protected $traverser ;
protected $encode = false ;
protected $out ;
protected $outputMode ;
private $xpath ;
/**
 * Rules naming the attributes that are always written as name="value",
 * even when their value is the empty string.
 *
 * Each rule is an array of optional constraints. nonBooleanAttribute()
 * walks the list and reports a match for the first rule whose constraints
 * all hold. Shape of a rule (node and attribute names may each be given
 * either as a single string or as a list of strings):
 *
 *     array(
 *         'nodeNamespace' => 'http://www.w3.org/1999/xhtml',
 *         'attrNamespace' => 'http://www.w3.org/1999/xhtml',
 *         'nodeName' => 'img',          // or array('img', 'a')
 *         'attrName' => 'alt',          // or array('title', 'alt')
 *     ),
 *
 * A rule may also carry an 'xpath' expression (evaluated with the attribute
 * as context node) and a 'prefixes' map of namespace prefixes to register
 * before evaluating it.
 *
 * NOTE(review): confirm the exact spelling of the attribute-namespace key
 * accepted by nonBooleanAttribute() before relying on it.
 *
 * @var array
 */
protected $nonBooleanAttributes = array(
    // Plain HTML attributes; the lookup is order-independent, so the names
    // are kept alphabetical for readability.
    array(
        'nodeNamespace' => 'http://www.w3.org/1999/xhtml',
        'attrName' => array(
            'accept',
            'accept-charset',
            'accesskey',
            'action',
            'align',
            'alt',
            'bgcolor',
            'border',
            'charset',
            'cite',
            'class',
            'code',
            'codebase',
            'color',
            'cols',
            'colspan',
            'content',
            'coords',
            'data',
            'datetime',
            'default',
            'dir',
            'dirname',
            'enctype',
            'for',
            'form',
            'formaction',
            'headers',
            'height',
            'href',
            'hreflang',
            'http-equiv',
            'icon',
            'id',
            'keytype',
            'kind',
            'label',
            'lang',
            'language',
            'list',
            'maxlength',
            'media',
            'method',
            'name',
            'placeholder',
            'rel',
            'rows',
            'rowspan',
            'sandbox',
            'scope',
            'seamless',
            'shape',
            'size',
            'sizes',
            'span',
            'spellcheck',
            'src',
            'srcdoc',
            'srclang',
            'srcset',
            'start',
            'step',
            'style',
            'summary',
            'tabindex',
            'target',
            'title',
            'type',
            'value',
            'width',
        ),
    ),
    // Any data-* attribute on an HTML element.
    array(
        'nodeNamespace' => 'http://www.w3.org/1999/xhtml',
        'xpath' => 'starts-with(local-name(), \'data-\')',
    ),
);
2014-09-15 20:24:06 +00:00
2015-06-14 00:03:20 +00:00
const DOCTYPE = '<!DOCTYPE html>' ;
2014-09-15 20:24:06 +00:00
2015-06-14 00:03:20 +00:00
/**
 * Set up the rules with an output stream and serializer options.
 *
 * @param resource $output
 *   Handle that all generated markup is written to (via fwrite()).
 * @param array $options
 *   Optional settings. Only 'encode_entities' is read here; when present
 *   its value replaces the default (false) of the encode flag.
 */
public function __construct($output, $options = array())
{
    $this->out = $output;
    $this->outputMode = static::IM_IN_HTML;

    // ENT_HTML5 is not trusted on HHVM,
    // see https://github.com/facebook/hhvm/issues/2727
    $this->hasHTML5 = !defined('HHVM_VERSION') && defined('ENT_HTML5');

    if (isset($options['encode_entities'])) {
        $this->encode = $options['encode_entities'];
    }
}
2015-06-14 00:03:20 +00:00
/**
 * Register an additional non-boolean attribute rule.
 *
 * @param array $rule
 *   Rule appended to the list consulted by nonBooleanAttribute().
 */
public function addRule(array $rule)
{
    array_push($this->nonBooleanAttributes, $rule);
}
/**
 * Attach the traverser that walks the DOM on behalf of these rules.
 *
 * @param \Masterminds\HTML5\Serializer\Traverser $traverser
 *   Traverser used to visit child nodes and to decide whether an element
 *   is written with its local name.
 *
 * @return $this
 *   The rules object, so the call can be chained.
 */
public function setTraverser(\Masterminds\HTML5\Serializer\Traverser $traverser)
{
    $this->traverser = $traverser;

    return $this;
}
2015-06-14 00:03:20 +00:00
/**
 * Write a whole document: the doctype followed by every top-level node.
 *
 * @param \DOMDocument $dom
 *   The document to serialize.
 */
public function document($dom)
{
    $this->doctype();

    // Without a root element only the doctype line is produced.
    if (!$dom->documentElement) {
        return;
    }

    foreach ($dom->childNodes as $child) {
        $this->traverser->node($child);
    }

    $this->nl();
}
2015-06-14 00:03:20 +00:00
/**
 * Write the doctype declaration followed by a line break.
 */
protected function doctype()
{
    // wr() hands back $this, so the line break can be chained onto it.
    $this->wr(static::DOCTYPE)->nl();
}
2014-09-15 20:24:06 +00:00
2015-06-14 00:03:20 +00:00
/**
 * Write an element: its opening tag, its children and, unless the element
 * is void, its closing tag.
 *
 * Entering an svg or math element switches the output mode, which changes
 * how openTag(), attrs() and closeTag() behave. The mode that was active on
 * entry is restored once the element has been written completely.
 *
 * @param \DOMElement $ele
 *   The element to write.
 */
public function element($ele)
{
    $name = $ele->tagName;

    // Per spec:
    // If the element has a declared namespace in the HTML, MathML or
    // SVG namespaces, we use the lname instead of the tagName.
    if ($this->traverser->isLocalElement($ele)) {
        $name = $ele->localName;
    }

    // Remember the enclosing mode so nested foreign content returns to it
    // instead of unconditionally falling back to HTML.
    $previousMode = $this->outputMode;

    // If we are in SVG or MathML there is special handling.
    if ($name == 'svg') {
        $this->outputMode = static::IM_IN_SVG;
        $name = Elements::normalizeSvgElement($name);
    } elseif ($name == 'math') {
        $this->outputMode = static::IM_IN_MATHML;
    }

    $this->openTag($ele);

    if (Elements::isA($name, Elements::TEXT_RAW)) {
        // Raw text elements: character data is written verbatim.
        foreach ($ele->childNodes as $child) {
            if ($child instanceof \DOMCharacterData) {
                $this->wr($child->data);
            } elseif ($child instanceof \DOMElement) {
                $this->element($child);
            }
        }
    } elseif ($ele->hasChildNodes()) {
        $this->traverser->children($ele->childNodes);
    }

    // If not unary, add a closing tag. This must run while the SVG/MathML
    // mode is still active: closeTag() relies on the mode to skip the end
    // tag of an element that openTag() already wrote as self-closing.
    if (!Elements::isA($name, Elements::VOID_TAG)) {
        $this->closeTag($ele);
    }

    // Close out the SVG or MathML special handling.
    $this->outputMode = $previousMode;
}
2015-06-14 00:03:20 +00:00
/**
 * Write a text node.
 *
 * Text whose parent is a raw text element is written verbatim; any other
 * text goes through enc() first.
 *
 * @param \DOMText $ele
 *   The text node to write.
 */
public function text($ele)
{
    $insideRawText = isset($ele->parentNode, $ele->parentNode->tagName)
        && Elements::isA($ele->parentNode->localName, Elements::TEXT_RAW);

    if ($insideRawText) {
        $this->wr($ele->data);

        return;
    }

    // FIXME: This probably needs some flags set.
    $encoded = $this->enc($ele->data);
    $this->wr($encoded);
}
2015-06-14 00:03:20 +00:00
/**
 * Write a CDATA section.
 *
 * @param \DOMNode $ele
 *   The CDATA node to write.
 */
public function cdata($ele)
{
    // The owning document produces the serialized form of the node.
    $markup = $ele->ownerDocument->saveXML($ele);
    $this->wr($markup);
}
2015-06-14 00:03:20 +00:00
/**
 * Write a comment node.
 *
 * @param \DOMNode $ele
 *   The comment node to write.
 */
public function comment($ele)
{
    // Serializing through the owning document gives the same output as
    // writing '<!--', the comment data and '-->' by hand.
    $markup = $ele->ownerDocument->saveXML($ele);
    $this->wr($markup);
}
2015-06-14 00:03:20 +00:00
/**
 * Write a processing instruction as "<?target data?>".
 *
 * @param \DOMNode $ele
 *   The processing instruction node; its target and data are written
 *   unmodified.
 */
public function processorInstruction($ele)
{
    $pieces = array('<?', $ele->target, ' ', $ele->data, '?>');

    foreach ($pieces as $piece) {
        $this->wr($piece);
    }
}
2015-06-14 00:03:20 +00:00
/**
 * Write the namespace attributes.
 *
 * Writes a declaration for each namespace node selected on the element by
 * the XPath query below, skipping the namespaces listed as implicit. The
 * namespace URI is escaped as an attribute value so that a quote or an
 * ampersand in it cannot break the generated markup.
 *
 * @param \DOMNode $ele
 *   The element being written.
 */
protected function namespaceAttrs($ele)
{
    // The cached XPath object is bound to one document; rebuild it when the
    // element belongs to a different one.
    if (!$this->xpath || $this->xpath->document !== $ele->ownerDocument) {
        $this->xpath = new \DOMXPath($ele->ownerDocument);
    }

    // Namespace nodes of the element that do not repeat one already in
    // scope one level further up.
    $declared = $this->xpath->query('namespace::*[not(.=../../namespace::*)]', $ele);

    foreach ($declared as $nsNode) {
        if (in_array($nsNode->nodeValue, $this->implicitNamespaces, true)) {
            continue;
        }

        $this->wr(' ')
            ->wr($nsNode->nodeName)
            ->wr('="')
            ->wr($this->escape($nsNode->nodeValue, true))
            ->wr('"');
    }
}
2015-06-14 00:03:20 +00:00
/**
 * Write the opening tag.
 *
 * Tags for HTML, MathML, and SVG are in the local name. Otherwise, use the
 * qualified name (8.3).
 *
 * Outside of HTML mode (SVG, MathML or XML embedded content) an element
 * without children is written as a self-closing tag.
 *
 * @param \DOMNode $ele
 *   The element being written.
 */
protected function openTag($ele)
{
    $tag = $this->traverser->isLocalElement($ele) ? $ele->localName : $ele->tagName;

    $this->wr('<')->wr($tag);
    $this->attrs($ele);
    $this->namespaceAttrs($ele);

    $selfClosing = $this->outputMode != static::IM_IN_HTML && !$ele->hasChildNodes();
    $this->wr($selfClosing ? ' />' : '>');
}
2015-06-14 00:03:20 +00:00
/**
 * Write the attributes of an element.
 *
 * Every attribute is written as name="value", except that an attribute
 * with an empty value is written as a bare name unless
 * nonBooleanAttribute() matches it. In SVG and MathML mode the attribute
 * name is normalized first.
 *
 * @param \DOMNode $ele
 *   The element whose attributes are written.
 *
 * @return $this
 *   The rules object, on every path, so the call can be chained.
 */
protected function attrs($ele)
{
    // FIXME: Needs support for xml, xmlns, xlink, and namespaced elements.
    if (!$ele->hasAttributes()) {
        return $this;
    }

    foreach ($ele->attributes as $node) {
        $val = $this->enc($node->value, true);

        // XXX: The spec says that we need to ensure that anything in
        // the XML, XMLNS, or XLink NS's should use the canonical
        // prefix. It seems that DOM does this for us already, but there
        // may be exceptions.
        $name = $node->nodeName;

        // Special handling for attributes in SVG and MathML.
        if ($this->outputMode == static::IM_IN_SVG) {
            $name = Elements::normalizeSvgAttribute($name);
        } elseif ($this->outputMode == static::IM_IN_MATHML) {
            $name = Elements::normalizeMathMlAttribute($name);
        }

        $this->wr(' ')->wr($name);

        // An empty value is only spelled out for non-boolean attributes.
        if ((isset($val) && $val !== '') || $this->nonBooleanAttribute($node)) {
            $this->wr('="')->wr($val)->wr('"');
        }
    }

    return $this;
}
2015-06-14 00:03:20 +00:00
/**
 * Decide whether an attribute must always be written with a value.
 *
 * The attribute is checked against each rule in $nonBooleanAttributes; a
 * rule matches when every constraint it defines holds.
 *
 * @param \DOMAttr $attr
 *   The attribute being written.
 *
 * @return boolean
 *   True as soon as one rule matches, false when none does.
 */
protected function nonBooleanAttribute(\DOMAttr $attr)
{
    $ele = $attr->ownerElement;

    foreach ($this->nonBooleanAttributes as $rule) {
        if (isset($rule['nodeNamespace']) && $rule['nodeNamespace'] !== $ele->namespaceURI) {
            continue;
        }

        // 'attrNamespace' is the spelling used in the rule example of this
        // class; 'attNamespace' is the key that was historically read here
        // and is still honoured for existing rules.
        foreach (array('attrNamespace', 'attNamespace') as $nsKey) {
            if (isset($rule[$nsKey]) && $rule[$nsKey] !== $attr->namespaceURI) {
                continue 2;
            }
        }

        // Element and attribute names may be a single name or a list.
        if (isset($rule['nodeName']) && !in_array($ele->localName, (array) $rule['nodeName'], true)) {
            continue;
        }
        if (isset($rule['attrName']) && !in_array($attr->localName, (array) $rule['attrName'], true)) {
            continue;
        }

        if (isset($rule['xpath'])) {
            $xp = $this->getXPath($attr);

            if (isset($rule['prefixes'])) {
                foreach ($rule['prefixes'] as $nsPrefix => $ns) {
                    $xp->registerNamespace($nsPrefix, $ns);
                }
            }

            // The expression is evaluated with the attribute as context.
            if (!$xp->evaluate($rule['xpath'], $attr)) {
                continue;
            }
        }

        return true;
    }

    return false;
}
2015-06-14 00:03:20 +00:00
/**
 * Get an XPath object bound to the document that owns the given node.
 *
 * The object is cached and reused while nodes of the same document are
 * processed. It is rebuilt when the node belongs to a different document,
 * the same check namespaceAttrs() applies to the shared cache.
 *
 * @param \DOMNode $node
 *   Node whose owner document the XPath object must operate on.
 *
 * @return \DOMXPath
 *   XPath object for the node's owner document.
 */
private function getXPath(\DOMNode $node)
{
    if (!$this->xpath || $this->xpath->document !== $node->ownerDocument) {
        $this->xpath = new \DOMXPath($node->ownerDocument);
    }

    return $this->xpath;
}
2015-06-14 00:03:20 +00:00
/**
 * Write the closing tag.
 *
 * Tags for HTML, MathML, and SVG are in the local name. Otherwise, use the
 * qualified name (8.3).
 *
 * Nothing is written for a childless element outside of HTML mode.
 *
 * @param \DOMNode $ele
 *   The element being written.
 */
protected function closeTag($ele)
{
    if ($this->outputMode != static::IM_IN_HTML && !$ele->hasChildNodes()) {
        return;
    }

    $tag = $this->traverser->isLocalElement($ele) ? $ele->localName : $ele->tagName;
    $this->wr('</')->wr($tag)->wr('>');
}
2015-06-14 00:03:20 +00:00
/**
 * Write to the output.
 *
 * @param string $text
 *   The string to put into the output.
 *
 * @return $this
 *   The rules object, so writes can be chained.
 */
protected function wr($text)
{
    fwrite($this->out, $text);

    return $this;
}
2015-06-14 00:03:20 +00:00
/**
 * Write a new line character (PHP_EOL).
 *
 * @return $this
 *   The rules object, so writes can be chained.
 */
protected function nl()
{
    return $this->wr(PHP_EOL);
}
2015-06-14 00:03:20 +00:00
/**
 * Encode text.
 *
 * When encode is set to false, the default value, the text passed in is
 * escaped per section 8.3 of the html5 spec. For details on how text is
 * escaped see the escape() method.
 *
 * When encode is set to true the text is converted to named character
 * references where appropriate. Section 8.1.4 Character references of the
 * html5 spec refers to using named character references. This is useful for
 * characters that can't otherwise legally be used in the text. The named
 * character references are listed in section 8.5. Note that this turns many
 * common characters (such as + . #) into entities, whereas the default
 * only escapes a handful of markup characters.
 *
 * @see http://www.w3.org/TR/2013/CR-html5-20130806/syntax.html#named-character-references
 *
 * Note, PHP 5.4+ has better html5 encoding.
 *
 * @todo Use the Entities class in php 5.3 to have html5 entities.
 *
 * @param string $text
 *   Text to encode.
 * @param boolean $attribute
 *   True if we are encoding an attribute, false otherwise. Only used when
 *   the text is escaped rather than entity-encoded.
 *
 * @return string
 *   The encoded text.
 */
protected function enc($text, $attribute = false)
{
    // Escape the text rather than convert to named character references.
    if (!$this->encode) {
        return $this->escape($text, $attribute);
    }

    // Without usable native html5 entity support (before PHP 5.4, or on
    // HHVM) the conversion is done with the library's own entity map.
    if (!$this->hasHTML5) {
        return strtr($text, \Masterminds\HTML5\Serializer\HTML5Entities::$map);
    }

    // PHP 5.4+: native html5 entity functionality.
    return htmlentities($text, ENT_HTML5 | ENT_SUBSTITUTE | ENT_QUOTES, 'UTF-8', false);
}
2015-06-14 00:03:20 +00:00
/**
 * Escape text.
 *
 * According to the html5 spec section 8.3 Serializing HTML fragments, text
 * within tags that are not style, script, xmp, iframe, noembed, and noframes
 * need to be properly escaped.
 *
 * The & should be converted to &amp;, no breaking space unicode characters
 * converted to &nbsp;, when in attribute mode the " should be converted to
 * &quot;, and when not in attribute mode the < and > should be converted to
 * &lt; and &gt;.
 *
 * @see http://www.w3.org/TR/2013/CR-html5-20130806/syntax.html#escapingString
 *
 * @param string $text
 *   Text to escape.
 * @param boolean $attribute
 *   True if we are escaping an attribute, false otherwise.
 *
 * @return string
 *   The escaped text.
 */
protected function escape($text, $attribute = false)
{
    // Not using htmlspecialchars because, while it does escaping, it doesn't
    // match the requirements of section 8.5. For example, it doesn't handle
    // non-breaking spaces.
    if ($attribute) {
        $replace = array(
            '"' => '&quot;',
            '&' => '&amp;',
            // UTF-8 byte sequence of U+00A0 NO-BREAK SPACE.
            "\xc2\xa0" => '&nbsp;',
        );
    } else {
        $replace = array(
            '<' => '&lt;',
            '>' => '&gt;',
            '&' => '&amp;',
            // UTF-8 byte sequence of U+00A0 NO-BREAK SPACE.
            "\xc2\xa0" => '&nbsp;',
        );
    }

    // strtr() with an array never re-scans text it has already replaced, so
    // the ampersands of the inserted entities are not escaped a second time.
    return strtr($text, $replace);
}
2014-09-15 20:24:06 +00:00
}