2014-09-15 22:24:06 +02:00
|
|
|
<?php
|
2015-06-14 02:03:20 +02:00
|
|
|
namespace Masterminds;
|
|
|
|
|
|
|
|
use Masterminds\HTML5\Parser\FileInputStream;
|
|
|
|
use Masterminds\HTML5\Parser\StringInputStream;
|
|
|
|
use Masterminds\HTML5\Parser\DOMTreeBuilder;
|
|
|
|
use Masterminds\HTML5\Parser\Scanner;
|
|
|
|
use Masterminds\HTML5\Parser\Tokenizer;
|
|
|
|
use Masterminds\HTML5\Serializer\OutputRules;
|
|
|
|
use Masterminds\HTML5\Serializer\Traverser;
|
2014-09-15 22:24:06 +02:00
|
|
|
|
|
|
|
/**
 * This class offers convenience methods for parsing and serializing HTML5.
 * It is roughly designed to mirror the \DOMDocument class that is
 * provided with most versions of PHP.
 *
 * EXPERIMENTAL. This may change or be completely replaced.
 */
class HTML5
{
    /**
     * Global options for the parser and serializer.
     *
     * @var array
     */
    protected $options = array(
        // If the serializer should encode all entities.
        'encode_entities' => false
    );

    /**
     * Errors collected by the most recent parse() or parseFragment() call.
     *
     * @var array
     */
    protected $errors = array();

    /**
     * Create a parser/serializer facade.
     *
     * @param array $options
     *   Options merged over the built-in defaults. The merged set is then
     *   used as the base for every parse and save call on this instance.
     */
    public function __construct(array $options = array())
    {
        $this->options = array_merge($this->options, $options);
    }

    /**
     * Get the default options.
     *
     * @return array The built-in defaults merged with the options given to
     *               the constructor.
     */
    public function getOptions()
    {
        return $this->options;
    }

    /**
     * Load and parse an HTML file.
     *
     * This will apply the HTML5 parser, which is tolerant of many
     * varieties of HTML, including XHTML 1, HTML 4, and well-formed HTML
     * 3. Note that in these cases, not all of the old data will be
     * preserved. For example, XHTML's XML declaration will be removed.
     *
     * The rules governing parsing are set out in the HTML 5 spec.
     *
     * @param string $file
     *   The path to the file to parse. If this is a resource, it is
     *   assumed to be an open stream whose pointer is set to the first
     *   byte of input.
     * @param array $options
     *   Configuration options when parsing the HTML.
     *
     * @return \DOMDocument A DOM document. This object type is defined by the libxml
     *   library, and should have been included with your version of PHP.
     */
    public function load($file, array $options = array())
    {
        // Handle the case where file is a resource: the remaining stream
        // content is read into memory and parsed as a string.
        if (is_resource($file)) {
            // FIXME: We need a StreamInputStream class.
            return $this->loadHTML(stream_get_contents($file), $options);
        }

        $input = new FileInputStream($file);

        return $this->parse($input, $options);
    }

    /**
     * Parse an HTML document from a string.
     *
     * Take a string of HTML 5 (or earlier) and parse it into a
     * DOMDocument.
     *
     * @param string $string
     *   An HTML5 document as a string.
     * @param array $options
     *   Configuration options when parsing the HTML.
     *
     * @return \DOMDocument A DOM document. DOM is part of libxml, which is included with
     *   almost all distributions of PHP.
     */
    public function loadHTML($string, array $options = array())
    {
        $input = new StringInputStream($string);

        return $this->parse($input, $options);
    }

    /**
     * Convenience function to load an HTML file.
     *
     * This is here to provide backwards compatibility with the
     * PHP DOM implementation. It simply calls load().
     *
     * @param string $file
     *   The path to the file to parse. If this is a resource, it is
     *   assumed to be an open stream whose pointer is set to the first
     *   byte of input.
     * @param array $options
     *   Configuration options when parsing the HTML.
     *
     * @return \DOMDocument A DOM document. This object type is defined by the libxml
     *   library, and should have been included with your version of PHP.
     */
    public function loadHTMLFile($file, array $options = array())
    {
        return $this->load($file, $options);
    }

    /**
     * Parse an HTML fragment from a string.
     *
     * @param string $string
     *   The HTML5 fragment as a string.
     * @param array $options
     *   Configuration options when parsing the HTML.
     *
     * @return \DOMDocumentFragment A DOM fragment. The DOM is part of libxml, which is included with
     *   almost all distributions of PHP.
     */
    public function loadHTMLFragment($string, array $options = array())
    {
        $input = new StringInputStream($string);

        return $this->parseFragment($input, $options);
    }

    /**
     * Return all errors encountered during the parsing phase.
     *
     * @return array
     */
    public function getErrors()
    {
        return $this->errors;
    }

    /**
     * Return true if some errors were encountered during the parsing phase.
     *
     * @return bool
     */
    public function hasErrors()
    {
        return count($this->errors) > 0;
    }

    /**
     * Parse an input stream.
     *
     * Lower-level loading function. This requires an input stream instead
     * of a string, file, or resource.
     *
     * @param \Masterminds\HTML5\Parser\InputStream $input
     *   The stream to read the document from.
     * @param array $options
     *   Configuration options when parsing the HTML. They are merged over
     *   the options returned by getOptions().
     *
     * @return \DOMDocument The document produced by the tree builder.
     */
    public function parse(\Masterminds\HTML5\Parser\InputStream $input, array $options = array())
    {
        // Drop errors of a previous run so a failing parse never exposes them.
        $this->errors = array();
        $events = new DOMTreeBuilder(false, array_merge($this->getOptions(), $options));
        $scanner = new Scanner($input);
        $parser = new Tokenizer($scanner, $events);

        $parser->parse();
        $this->errors = $events->getErrors();

        return $events->document();
    }

    /**
     * Parse an input stream where the stream is a fragment.
     *
     * Lower-level loading function. This requires an input stream instead
     * of a string, file, or resource.
     *
     * @param \Masterminds\HTML5\Parser\InputStream $input
     *   The stream to read the fragment from.
     * @param array $options
     *   Configuration options when parsing the HTML. They are merged over
     *   the options returned by getOptions().
     *
     * @return \DOMDocumentFragment The fragment produced by the tree builder.
     */
    public function parseFragment(\Masterminds\HTML5\Parser\InputStream $input, array $options = array())
    {
        // Same reset as parse(): stale errors must not survive into this run.
        $this->errors = array();
        $events = new DOMTreeBuilder(true, array_merge($this->getOptions(), $options));
        $scanner = new Scanner($input);
        $parser = new Tokenizer($scanner, $events);

        $parser->parse();
        $this->errors = $events->getErrors();

        return $events->fragment();
    }

    /**
     * Save a DOM into a given file as HTML5.
     *
     * @param mixed $dom
     *   The DOM to be serialized.
     * @param string $file
     *   The filename to be written. If this is a resource, it is written to
     *   directly and left open for the caller.
     * @param array $options
     *   Configuration options when serializing the DOM. These include:
     *   - encode_entities: Text written to the output is escaped by default and not all
     *     entities are encoded. If this is set to true all entities will be encoded.
     *     Defaults to false.
     *
     * @throws \RuntimeException When $file is a path that cannot be opened for writing.
     */
    public function save($dom, $file, $options = array())
    {
        // Only a stream opened here is closed here; a caller-supplied
        // resource stays under the caller's control.
        $close = true;
        if (is_resource($file)) {
            $stream = $file;
            $close = false;
        } else {
            $stream = fopen($file, 'w');
            if ($stream === false) {
                // Fail before any serialization work instead of handing
                // "false" to the serializer and to fclose().
                throw new \RuntimeException(sprintf(
                    'Unable to open "%s" for writing.',
                    is_scalar($file) ? $file : gettype($file)
                ));
            }
        }
        $options = array_merge($this->getOptions(), $options);
        $rules = new OutputRules($stream, $options);
        $trav = new Traverser($dom, $stream, $rules, $options);

        $trav->walk();

        if ($close) {
            fclose($stream);
        }
    }

    /**
     * Convert a DOM into an HTML5 string.
     *
     * @param mixed $dom
     *   The DOM to be serialized.
     * @param array $options
     *   Configuration options when serializing the DOM. These include:
     *   - encode_entities: Text written to the output is escaped by default and not all
     *     entities are encoded. If this is set to true all entities will be encoded.
     *     Defaults to false.
     *
     * @return string An HTML5 document generated from the DOM.
     */
    public function saveHTML($dom, $options = array())
    {
        $stream = fopen('php://temp', 'w');
        $this->save($dom, $stream, array_merge($this->getOptions(), $options));

        // Offset 0 rewinds the stream so everything written by save() is read.
        $html = stream_get_contents($stream, -1, 0);

        // save() leaves caller-supplied streams open, so release the
        // temporary buffer explicitly.
        fclose($stream);

        return $html;
    }
}
|