2014-09-15 22:24:06 +02:00
< ? php
2015-06-14 02:03:20 +02:00
namespace Masterminds\HTML5\Parser ;
use Masterminds\HTML5\Elements ;
2014-09-15 22:24:06 +02:00
/**
* Create an HTML5 DOM tree from events .
*
* This attempts to create a DOM from events emitted by a parser . This
* attempts ( but does not guarantee ) to up - convert older HTML documents
* to HTML5 . It does this by applying HTML5 ' s rules , but it will not
* change the architecture of the document itself .
*
* Many of the error correction and quirks features suggested in the specification
* are implemented herein ; however , not all of them are . Since we do not
* assume a graphical user agent , no presentation - specific logic is conducted
* during tree building .
*
* FIXME : The present tree builder does not exactly follow the state machine rules
* for insert modes as outlined in the HTML5 spec . The processor needs to be
* re - written to accomodate this . See , for example , the Go language HTML5
* parser .
*/
2015-06-14 02:03:20 +02:00
class DOMTreeBuilder implements EventHandler
{
/**
* Defined in http :// www . w3 . org / TR / html51 / infrastructure . html #html-namespace-0
*/
const NAMESPACE_HTML = 'http://www.w3.org/1999/xhtml' ;
2014-09-15 22:24:06 +02:00
2015-06-14 02:03:20 +02:00
const NAMESPACE_MATHML = 'http://www.w3.org/1998/Math/MathML' ;
2014-09-15 22:24:06 +02:00
2015-06-14 02:03:20 +02:00
const NAMESPACE_SVG = 'http://www.w3.org/2000/svg' ;
2014-09-15 22:24:06 +02:00
2015-06-14 02:03:20 +02:00
const NAMESPACE_XLINK = 'http://www.w3.org/1999/xlink' ;
2014-09-15 22:24:06 +02:00
2015-06-14 02:03:20 +02:00
const NAMESPACE_XML = 'http://www.w3.org/XML/1998/namespace' ;
2014-09-15 22:24:06 +02:00
2015-06-14 02:03:20 +02:00
const NAMESPACE_XMLNS = 'http://www.w3.org/2000/xmlns/' ;
2014-09-15 22:24:06 +02:00
2015-06-14 02:03:20 +02:00
/**
* Holds the HTML5 element names that causes a namespace switch
*
* @ var array
*/
protected $nsRoots = array (
'html' => self :: NAMESPACE_HTML ,
'svg' => self :: NAMESPACE_SVG ,
'math' => self :: NAMESPACE_MATHML
);
/**
* Holds the always available namespaces ( which does not require the XMLNS declaration ) .
*
* @ var array
*/
protected $implicitNamespaces = array (
'xml' => self :: NAMESPACE_XML ,
'xmlns' => self :: NAMESPACE_XMLNS ,
'xlink' => self :: NAMESPACE_XLINK
);
/**
* Holds a stack of currently active namespaces .
*
* @ var array
*/
protected $nsStack = array ();
2014-09-15 22:24:06 +02:00
2015-06-14 02:03:20 +02:00
/**
* Holds the number of namespaces declared by a node .
*
* @ var array
*/
protected $pushes = array ();
2014-09-15 22:24:06 +02:00
2015-06-14 02:03:20 +02:00
/**
* Defined in 8.2 . 5.
*/
const IM_INITIAL = 0 ;
2014-09-15 22:24:06 +02:00
2015-06-14 02:03:20 +02:00
const IM_BEFORE_HTML = 1 ;
2014-09-15 22:24:06 +02:00
2015-06-14 02:03:20 +02:00
const IM_BEFORE_HEAD = 2 ;
2014-09-15 22:24:06 +02:00
2015-06-14 02:03:20 +02:00
const IM_IN_HEAD = 3 ;
const IM_IN_HEAD_NOSCRIPT = 4 ;
const IM_AFTER_HEAD = 5 ;
const IM_IN_BODY = 6 ;
const IM_TEXT = 7 ;
const IM_IN_TABLE = 8 ;
const IM_IN_TABLE_TEXT = 9 ;
const IM_IN_CAPTION = 10 ;
const IM_IN_COLUMN_GROUP = 11 ;
const IM_IN_TABLE_BODY = 12 ;
const IM_IN_ROW = 13 ;
const IM_IN_CELL = 14 ;
const IM_IN_SELECT = 15 ;
const IM_IN_SELECT_IN_TABLE = 16 ;
const IM_AFTER_BODY = 17 ;
const IM_IN_FRAMESET = 18 ;
2014-09-15 22:24:06 +02:00
2015-06-14 02:03:20 +02:00
const IM_AFTER_FRAMESET = 19 ;
const IM_AFTER_AFTER_BODY = 20 ;
const IM_AFTER_AFTER_FRAMESET = 21 ;
const IM_IN_SVG = 22 ;
const IM_IN_MATHML = 23 ;
protected $options = array ();
protected $stack = array ();
protected $current ; // Pointer in the tag hierarchy.
protected $doc ;
protected $frag ;
protected $processor ;
protected $insertMode = 0 ;
/**
* Quirks mode is enabled by default .
* Any document that is missing the
* DT will be considered to be in quirks mode .
*/
protected $quirks = true ;
protected $errors = array ();
public function __construct ( $isFragment = false , array $options = array ())
{
$this -> options = $options ;
$impl = new \DOMImplementation ();
// XXX:
// Create the doctype. For now, we are always creating HTML5
// documents, and attempting to up-convert any older DTDs to HTML5.
$dt = $impl -> createDocumentType ( 'html' );
// $this->doc = \DOMImplementation::createDocument(NULL, 'html', $dt);
$this -> doc = $impl -> createDocument ( null , null , $dt );
$this -> errors = array ();
$this -> current = $this -> doc ; // ->documentElement;
// Create a rules engine for tags.
$this -> rules = new TreeBuildingRules ( $this -> doc );
// Fill $nsStack with the defalut HTML5 namespaces, plus the "implicitNamespaces" array taken form $options
array_unshift ( $this -> nsStack , ( isset ( $this -> options [ " implicitNamespaces " ]) ? $this -> options [ " implicitNamespaces " ] : array ()) + array (
'' => self :: NAMESPACE_HTML
) + $this -> implicitNamespaces );
if ( $isFragment ) {
$this -> insertMode = static :: IM_IN_BODY ;
$this -> frag = $this -> doc -> createDocumentFragment ();
$this -> current = $this -> frag ;
}
2014-09-15 22:24:06 +02:00
}
2015-06-14 02:03:20 +02:00
/**
* Get the document .
*/
public function document ()
{
return $this -> doc ;
2014-09-15 22:24:06 +02:00
}
2015-06-14 02:03:20 +02:00
/**
* Get the DOM fragment for the body .
*
* This returns a DOMNodeList because a fragment may have zero or more
* DOMNodes at its root .
*
* @ see http :// www . w3 . org / TR / 2012 / CR - html5 - 20121217 / syntax . html #concept-frag-parse-context
*
* @ return \DOMFragmentDocumentFragment
*/
public function fragment ()
{
return $this -> frag ;
2014-09-15 22:24:06 +02:00
}
2015-06-14 02:03:20 +02:00
/**
* Provide an instruction processor .
*
* This is used for handling Processor Instructions as they are
* inserted . If omitted , PI ' s are inserted directly into the DOM tree .
*/
public function setInstructionProcessor ( \Masterminds\HTML5\InstructionProcessor $proc )
{
$this -> processor = $proc ;
2014-09-15 22:24:06 +02:00
}
2015-06-14 02:03:20 +02:00
public function doctype ( $name , $idType = 0 , $id = null , $quirks = false )
{
// This is used solely for setting quirks mode. Currently we don't
// try to preserve the inbound DT. We convert it to HTML5.
$this -> quirks = $quirks ;
2014-09-15 22:24:06 +02:00
2015-06-14 02:03:20 +02:00
if ( $this -> insertMode > static :: IM_INITIAL ) {
$this -> parseError ( " Illegal placement of DOCTYPE tag. Ignoring: " . $name );
2014-09-15 22:24:06 +02:00
2015-06-14 02:03:20 +02:00
return ;
}
2014-09-15 22:24:06 +02:00
2015-06-14 02:03:20 +02:00
$this -> insertMode = static :: IM_BEFORE_HTML ;
2014-09-15 22:24:06 +02:00
}
2015-06-14 02:03:20 +02:00
/**
* Process the start tag .
*
* @ todo - XMLNS namespace handling ( we need to parse , even if it ' s not valid )
* - XLink , MathML and SVG namespace handling
* - Omission rules : 8.1 . 2.4 Optional tags
*/
public function startTag ( $name , $attributes = array (), $selfClosing = false )
{
// fprintf(STDOUT, $name);
$lname = $this -> normalizeTagName ( $name );
// Make sure we have an html element.
if ( ! $this -> doc -> documentElement && $name !== 'html' && ! $this -> frag ) {
$this -> startTag ( 'html' );
}
// Set quirks mode if we're at IM_INITIAL with no doctype.
if ( $this -> insertMode == static :: IM_INITIAL ) {
$this -> quirks = true ;
$this -> parseError ( " No DOCTYPE specified. " );
}
// SPECIAL TAG HANDLING:
// Spec says do this, and "don't ask."
if ( $name == 'image' ) {
$name = 'img' ;
}
// Autoclose p tags where appropriate.
if ( $this -> insertMode >= static :: IM_IN_BODY && Elements :: isA ( $name , Elements :: AUTOCLOSE_P )) {
$this -> autoclose ( 'p' );
}
// Set insert mode:
switch ( $name ) {
case 'html' :
$this -> insertMode = static :: IM_BEFORE_HEAD ;
break ;
case 'head' :
if ( $this -> insertMode > static :: IM_BEFORE_HEAD ) {
$this -> parseError ( " Unexpected head tag outside of head context. " );
} else {
$this -> insertMode = static :: IM_IN_HEAD ;
}
break ;
case 'body' :
$this -> insertMode = static :: IM_IN_BODY ;
break ;
case 'svg' :
$this -> insertMode = static :: IM_IN_SVG ;
break ;
case 'math' :
$this -> insertMode = static :: IM_IN_MATHML ;
break ;
case 'noscript' :
if ( $this -> insertMode == static :: IM_IN_HEAD ) {
$this -> insertMode = static :: IM_IN_HEAD_NOSCRIPT ;
}
break ;
}
// Special case handling for SVG.
if ( $this -> insertMode == static :: IM_IN_SVG ) {
$lname = Elements :: normalizeSvgElement ( $lname );
}
$pushes = 0 ;
// when we found a tag thats appears inside $nsRoots, we have to switch the defalut namespace
if ( isset ( $this -> nsRoots [ $lname ]) && $this -> nsStack [ 0 ][ '' ] !== $this -> nsRoots [ $lname ]) {
array_unshift ( $this -> nsStack , array (
'' => $this -> nsRoots [ $lname ]
) + $this -> nsStack [ 0 ]);
$pushes ++ ;
}
$needsWorkaround = false ;
if ( isset ( $this -> options [ " xmlNamespaces " ]) && $this -> options [ " xmlNamespaces " ]) {
// when xmlNamespaces is true a and we found a 'xmlns' or 'xmlns:*' attribute, we should add a new item to the $nsStack
foreach ( $attributes as $aName => $aVal ) {
if ( $aName === 'xmlns' ) {
$needsWorkaround = $aVal ;
array_unshift ( $this -> nsStack , array (
'' => $aVal
) + $this -> nsStack [ 0 ]);
$pushes ++ ;
} elseif ((( $pos = strpos ( $aName , ':' )) ? substr ( $aName , 0 , $pos ) : '' ) === 'xmlns' ) {
array_unshift ( $this -> nsStack , array (
substr ( $aName , $pos + 1 ) => $aVal
) + $this -> nsStack [ 0 ]);
$pushes ++ ;
}
}
}
try {
$prefix = ( $pos = strpos ( $lname , ':' )) ? substr ( $lname , 0 , $pos ) : '' ;
if ( $needsWorkaround !== false ) {
$xml = " < $lname xmlns= \" $needsWorkaround\ " " .(strlen( $prefix ) && isset( $this->nsStack [0][ $prefix ])?( " xmlns : $prefix = \ " " . $this -> nsStack [ 0 ][ $prefix ] . " \" " ) : " " ) . " /> " ;
$frag = new \DOMDocument ( '1.0' , 'UTF-8' );
$frag -> loadXML ( $xml );
$ele = $this -> doc -> importNode ( $frag -> documentElement , true );
} else {
if ( isset ( $this -> nsStack [ 0 ][ $prefix ])) {
$ele = $this -> doc -> createElementNS ( $this -> nsStack [ 0 ][ $prefix ], $lname );
} else {
$ele = $this -> doc -> createElement ( $lname );
}
}
} catch ( \DOMException $e ) {
$this -> parseError ( " Illegal tag name: < $lname >. Replaced with <invalid>. " );
$ele = $this -> doc -> createElement ( 'invalid' );
}
// When we add some namespacess, we have to track them. Later, when "endElement" is invoked, we have to remove them.
// When we are on a void tag, we do not need to care about namesapce nesting.
if ( $pushes > 0 && ! Elements :: isA ( $name , Elements :: VOID_TAG )) {
// PHP tends to free the memory used by DOM,
// to avoid spl_object_hash collisions whe have to avoid garbage collection of $ele storing it into $pushes
// see https://bugs.php.net/bug.php?id=67459
$this -> pushes [ spl_object_hash ( $ele )] = array ( $pushes , $ele );
// SEE https://github.com/facebook/hhvm/issues/2962
if ( defined ( 'HHVM_VERSION' )) {
$ele -> setAttribute ( 'html5-php-fake-id-attribute' , spl_object_hash ( $ele ));
}
}
foreach ( $attributes as $aName => $aVal ) {
// xmlns attributes can't be set
if ( $aName === 'xmlns' ) {
continue ;
}
if ( $this -> insertMode == static :: IM_IN_SVG ) {
$aName = Elements :: normalizeSvgAttribute ( $aName );
} elseif ( $this -> insertMode == static :: IM_IN_MATHML ) {
$aName = Elements :: normalizeMathMlAttribute ( $aName );
}
try {
$prefix = ( $pos = strpos ( $aName , ':' )) ? substr ( $aName , 0 , $pos ) : false ;
if ( $prefix === 'xmlns' ) {
$ele -> setAttributeNs ( self :: NAMESPACE_XMLNS , $aName , $aVal );
} elseif ( $prefix !== false && isset ( $this -> nsStack [ 0 ][ $prefix ])) {
$ele -> setAttributeNs ( $this -> nsStack [ 0 ][ $prefix ], $aName , $aVal );
} else {
$ele -> setAttribute ( $aName , $aVal );
}
} catch ( \DOMException $e ) {
$this -> parseError ( " Illegal attribute name for tag $name . Ignoring: $aName " );
continue ;
}
// This is necessary on a non-DTD schema, like HTML5.
if ( $aName == 'id' ) {
$ele -> setIdAttribute ( 'id' , true );
}
}
// Some elements have special processing rules. Handle those separately.
if ( $this -> rules -> hasRules ( $name )) {
$this -> current = $this -> rules -> evaluate ( $ele , $this -> current );
} // Otherwise, it's a standard element.
else {
$this -> current -> appendChild ( $ele );
// XXX: Need to handle self-closing tags and unary tags.
if ( ! Elements :: isA ( $name , Elements :: VOID_TAG )) {
$this -> current = $ele ;
}
}
// This is sort of a last-ditch attempt to correct for cases where no head/body
// elements are provided.
if ( $this -> insertMode <= static :: IM_BEFORE_HEAD && $name != 'head' && $name != 'html' ) {
$this -> insertMode = static :: IM_IN_BODY ;
}
// When we are on a void tag, we do not need to care about namesapce nesting,
// but we have to remove the namespaces pushed to $nsStack.
if ( $pushes > 0 && Elements :: isA ( $name , Elements :: VOID_TAG )) {
// remove the namespaced definded by current node
for ( $i = 0 ; $i < $pushes ; $i ++ ) {
array_shift ( $this -> nsStack );
}
}
// Return the element mask, which the tokenizer can then use to set
// various processing rules.
return Elements :: element ( $name );
2014-09-15 22:24:06 +02:00
}
2015-06-14 02:03:20 +02:00
public function endTag ( $name )
{
$lname = $this -> normalizeTagName ( $name );
// Ignore closing tags for unary elements.
if ( Elements :: isA ( $name , Elements :: VOID_TAG )) {
return ;
}
if ( $this -> insertMode <= static :: IM_BEFORE_HTML ) {
// 8.2.5.4.2
if ( in_array ( $name , array (
'html' ,
'br' ,
'head' ,
'title'
))) {
$this -> startTag ( 'html' );
$this -> endTag ( $name );
$this -> insertMode = static :: IM_BEFORE_HEAD ;
return ;
}
// Ignore the tag.
$this -> parseError ( " Illegal closing tag at global scope. " );
return ;
}
// Special case handling for SVG.
if ( $this -> insertMode == static :: IM_IN_SVG ) {
$lname = Elements :: normalizeSvgElement ( $lname );
}
// See https://github.com/facebook/hhvm/issues/2962
if ( defined ( 'HHVM_VERSION' ) && ( $cid = $this -> current -> getAttribute ( 'html5-php-fake-id-attribute' ))) {
$this -> current -> removeAttribute ( 'html5-php-fake-id-attribute' );
} else {
$cid = spl_object_hash ( $this -> current );
}
// XXX: Not sure whether we need this anymore.
// if ($name != $lname) {
// return $this->quirksTreeResolver($lname);
// }
// XXX: HTML has no parent. What do we do, though,
// if this element appears in the wrong place?
if ( $lname == 'html' ) {
return ;
}
// remove the namespaced definded by current node
if ( isset ( $this -> pushes [ $cid ])) {
for ( $i = 0 ; $i < $this -> pushes [ $cid ][ 0 ]; $i ++ ) {
array_shift ( $this -> nsStack );
}
unset ( $this -> pushes [ $cid ]);
}
if ( ! $this -> autoclose ( $lname )) {
$this -> parseError ( 'Could not find closing tag for ' . $lname );
}
// switch ($this->insertMode) {
switch ( $lname ) {
case " head " :
$this -> insertMode = static :: IM_AFTER_HEAD ;
break ;
case " body " :
$this -> insertMode = static :: IM_AFTER_BODY ;
break ;
case " svg " :
case " mathml " :
$this -> insertMode = static :: IM_IN_BODY ;
break ;
}
}
2014-09-15 22:24:06 +02:00
2015-06-14 02:03:20 +02:00
public function comment ( $cdata )
{
// TODO: Need to handle case where comment appears outside of the HTML tag.
$node = $this -> doc -> createComment ( $cdata );
$this -> current -> appendChild ( $node );
2014-09-15 22:24:06 +02:00
}
2015-06-14 02:03:20 +02:00
public function text ( $data )
{
// XXX: Hmmm.... should we really be this strict?
if ( $this -> insertMode < static :: IM_IN_HEAD ) {
// Per '8.2.5.4.3 The "before head" insertion mode' the characters
// " \t\n\r\f" should be ignored but no mention of a parse error. This is
// practical as most documents contain these characters. Other text is not
// expected here so recording a parse error is necessary.
$dataTmp = trim ( $data , " \t \n \r \ f " );
if ( ! empty ( $dataTmp )) {
// fprintf(STDOUT, "Unexpected insert mode: %d", $this->insertMode);
$this -> parseError ( " Unexpected text. Ignoring: " . $dataTmp );
}
return ;
}
// fprintf(STDOUT, "Appending text %s.", $data);
$node = $this -> doc -> createTextNode ( $data );
$this -> current -> appendChild ( $node );
2014-09-15 22:24:06 +02:00
}
2015-06-14 02:03:20 +02:00
public function eof ()
{
// If the $current isn't the $root, do we need to do anything?
2014-09-15 22:24:06 +02:00
}
2015-06-14 02:03:20 +02:00
public function parseError ( $msg , $line = 0 , $col = 0 )
{
$this -> errors [] = sprintf ( " Line %d, Col %d: %s " , $line , $col , $msg );
2014-09-15 22:24:06 +02:00
}
2015-06-14 02:03:20 +02:00
public function getErrors ()
{
return $this -> errors ;
2014-09-15 22:24:06 +02:00
}
2015-06-14 02:03:20 +02:00
public function cdata ( $data )
{
$node = $this -> doc -> createCDATASection ( $data );
$this -> current -> appendChild ( $node );
2014-09-15 22:24:06 +02:00
}
2015-06-14 02:03:20 +02:00
public function processingInstruction ( $name , $data = null )
{
// XXX: Ignore initial XML declaration, per the spec.
if ( $this -> insertMode == static :: IM_INITIAL && 'xml' == strtolower ( $name )) {
return ;
}
// Important: The processor may modify the current DOM tree however
// it sees fit.
if ( isset ( $this -> processor )) {
$res = $this -> processor -> process ( $this -> current , $name , $data );
if ( ! empty ( $res )) {
$this -> current = $res ;
}
return ;
}
// Otherwise, this is just a dumb PI element.
$node = $this -> doc -> createProcessingInstruction ( $name , $data );
$this -> current -> appendChild ( $node );
2014-09-15 22:24:06 +02:00
}
2015-06-14 02:03:20 +02:00
// ==========================================================================
// UTILITIES
// ==========================================================================
/**
* Apply normalization rules to a tag name .
*
* See sections 2.9 and 8.1 . 2.
*
* @ param string $name
* The tag name .
* @ return string The normalized tag name .
2014-09-15 22:24:06 +02:00
*/
2015-06-14 02:03:20 +02:00
protected function normalizeTagName ( $name )
{
/*
* Section 2.9 suggests that we should not do this . if ( strpos ( $name , ':' ) !== false ) { // We know from the grammar that there must be at least one other // char besides :, since : is not a legal tag start. $parts = explode(':', $name); return array_pop($parts); }
*/
return $name ;
}
2014-09-15 22:24:06 +02:00
2015-06-14 02:03:20 +02:00
protected function quirksTreeResolver ( $name )
{
throw new \Exception ( " Not implemented. " );
2014-09-15 22:24:06 +02:00
}
2015-06-14 02:03:20 +02:00
/**
* Automatically climb the tree and close the closest node with the matching $tag .
*/
protected function autoclose ( $tag )
{
$working = $this -> current ;
do {
if ( $working -> nodeType != XML_ELEMENT_NODE ) {
return false ;
}
if ( $working -> tagName == $tag ) {
$this -> current = $working -> parentNode ;
return true ;
}
} while ( $working = $working -> parentNode );
return false ;
}
2014-09-15 22:24:06 +02:00
2015-06-14 02:03:20 +02:00
/**
* Checks if the given tagname is an ancestor of the present candidate .
*
* If $this -> current or anything above $this -> current matches the given tag
* name , this returns true .
*/
protected function isAncestor ( $tagname )
{
$candidate = $this -> current ;
while ( $candidate -> nodeType === XML_ELEMENT_NODE ) {
if ( $candidate -> tagName == $tagname ) {
return true ;
}
$candidate = $candidate -> parentNode ;
}
return false ;
}
2014-09-15 22:24:06 +02:00
2015-06-14 02:03:20 +02:00
/**
* Returns true if the immediate parent element is of the given tagname .
*/
protected function isParent ( $tagname )
{
return $this -> current -> tagName == $tagname ;
}
2014-09-15 22:24:06 +02:00
}