diff --git a/admin/update.php b/admin/update.php index db15e5a..2852781 100644 --- a/admin/update.php +++ b/admin/update.php @@ -1,9 +1,9 @@ user_agents - use site config files. + - Site config files which use single_page_link can now follow it with if_page_contains: XPath to make it conditional. + - Minimum supported PHP version is now 5.3. If you must use PHP 5.2, please download Full-Text RSS 3.4 + - Site config files updated for better extraction + - Other minor fixes/improvements 3.4.1 (unreleased) - Backporting Dave Vasilevsky cookie patch. Fixes issues with certain sites. See https://gist.github.com/fivefilters/0a758b6d64ce4fb5728c diff --git a/config.php b/config.php index 1deb0ca..5b58c30 100644 --- a/config.php +++ b/config.php @@ -430,22 +430,6 @@ $options->fingerprints = array( '"> PHP - 5.2.0 or higher + 5.3 or higher @@ -306,9 +308,9 @@ div.chunk { -
  • Parallel URL fetching: You have HttpRequestPool or curl_multi support installed. No problems here.
  • +
  • Parallel URL fetching: You have PHP's HTTP extension or curl_multi installed. No problems here.
  • -
  • Parallel URL fetching: HttpRequestPool or curl_multi support is not available. will use file_get_contents() instead to fetch URLs sequentially rather than in parallel.
  • +
  • Parallel URL fetching: HTTP extension or curl_multi support is not available. Full-Text RSS will use file_get_contents() instead to fetch URLs sequentially rather than in parallel.
  • @@ -352,11 +354,11 @@ div.chunk {

    Further info

    HTTP module

    -

    Full-Text RSS can make use of HttpRequestPool or curl_multi to make parallel HTTP requests when processing feeds. If neither are available, it will make sequential requests using file_get_contents.

    +

    Full-Text RSS can make use of PHP's HTTP extension or curl_multi to make parallel HTTP requests when processing feeds. If neither is available, it will make sequential requests using file_get_contents.

    System Requirements -

    PHP 5.2 or above is required. A simple shared web hosting account will work fine. - The code has been tested on Windows and Linux using the Apache web server. If you're a Windows user, you can try it on your own machine using WampServer. It has also been reported as working under IIS, but we have not tested this ourselves.

    +

    PHP 5.3 or above is required. A simple shared web hosting account should work fine, but we recommend a VPS with 1GB RAM. + The code has been tested on Windows and Linux using the Apache web server. If you're a Windows user, you can try it on your own machine using Uniform Server. It has also been reported as working under IIS, but we have not tested this ourselves.

    Download

    Download from fivefilters.org — old versions are available in our code repository.

    diff --git a/libraries/content-extractor/ContentExtractor.php b/libraries/content-extractor/ContentExtractor.php index a637e1e..078c993 100644 --- a/libraries/content-extractor/ContentExtractor.php +++ b/libraries/content-extractor/ContentExtractor.php @@ -15,12 +15,12 @@ class ContentExtractor { protected static $tidy_config = array( - 'clean' => true, + 'clean' => false, // can't preserve wbr tabs if this is set to true 'output-xhtml' => true, 'logical-emphasis' => true, 'show-body-only' => false, - 'new-blocklevel-tags' => 'article, aside, footer, header, hgroup, menu, nav, section, details, datagrid', - 'new-inline-tags' => 'mark, time, meter, progress, data', + 'new-blocklevel-tags' => 'article aside footer header hgroup menu nav section details datagrid', + 'new-inline-tags' => 'mark time meter progress data wbr', 'wrap' => 0, 'drop-empty-paras' => true, 'drop-proprietary-attributes' => false, @@ -42,6 +42,7 @@ class ContentExtractor protected $body; protected $success = false; protected $nextPageUrl; + protected $opengraph = array(); public $allowedParsers = array('libxml', 'html5php'); public $defaultParser = 'libxml'; public $parserOverride = null; @@ -79,6 +80,7 @@ class ContentExtractor $this->date = null; $this->nextPageUrl = null; $this->success = false; + $this->opengraph = array(); } public function findHostUsingFingerprints($html) { @@ -109,8 +111,11 @@ class ContentExtractor if (substr($host, 0, 4) == 'www.') $host = substr($host, 4); // is merged version already cached? 
if (SiteConfig::is_cached("$host.merged")) { - $this->debug("Returning cached and merged site config for $host"); - return SiteConfig::build("$host.merged"); + $config = SiteConfig::build("$host.merged"); + if ($config) { + $this->debug("Returning cached and merged site config for $host"); + return $config; + } } // let's build from site_config/custom/ and standard/ $config = SiteConfig::build($host); @@ -315,7 +320,25 @@ class ContentExtractor if ($this->language) break; } } - + + // try to open graph properties + $elems = @$xpath->query("//head//meta[@property='og:title' or @property='og:type' or @property='og:url' or @property='og:image' or @property='og:description']", $this->readability->dom); + // check for matches + if ($elems && $elems->length > 0) { + $this->debug('Extracting Open Graph elements'); + foreach ($elems as $elem) { + if ($elem->hasAttribute('content')) { + $_prop = strtolower($elem->getAttribute('property')); + $_val = $elem->getAttribute('content'); + // currently one of each is returned, so we keep the first one + if (!isset($this->opengraph[$_prop])) { + $this->opengraph[$_prop] = $_val; + } + } + } + unset($_prop, $_val); + } + // try to get date foreach ($this->config->date as $pattern) { $elems = @$xpath->evaluate($pattern, $this->readability->dom); @@ -397,6 +420,16 @@ class ContentExtractor $elems->item($i)->parentNode->removeChild($elems->item($i)); } } + + // strip empty a elements + $elems = $xpath->query("//a[not(./*) and normalize-space(.)='']", $this->readability->dom); + // check for matches + if ($elems && $elems->length > 0) { + $this->debug('Stripping '.$elems->length.' 
empty a elements'); + for ($i=$elems->length-1; $i >= 0; $i--) { + $elems->item($i)->parentNode->removeChild($elems->item($i)); + } + } // try to get body foreach ($this->config->body as $pattern) { @@ -789,6 +822,10 @@ class ContentExtractor return $this->body; } + public function getOpenGraph() { + return $this->opengraph; + } + public function isNativeAd() { return $this->nativeAd; } diff --git a/libraries/content-extractor/SiteConfig.php b/libraries/content-extractor/SiteConfig.php index 60d465b..3b90a75 100644 --- a/libraries/content-extractor/SiteConfig.php +++ b/libraries/content-extractor/SiteConfig.php @@ -5,10 +5,10 @@ * Each instance of this class should hold extraction patterns and other directives * for a website. See ContentExtractor class to see how it's used. * - * @version 0.8 - * @date 2013-04-16 + * @version 1.0 + * @date 2015-06-09 * @author Keyvan Minoukadeh - * @copyright 2013 Keyvan Minoukadeh + * @copyright 2015 Keyvan Minoukadeh * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3 */ @@ -38,8 +38,7 @@ class SiteConfig // Mark article as a native ad if any of these expressions match (0 or more xpath expressions) public $native_ad_clue = array(); - // Additional HTTP headers to send - // NOT YET USED + // Additional HTTP headers to send (associative array) public $http_header = array(); // Process HTML with tidy before creating DOM (bool or null if undeclared) @@ -66,6 +65,15 @@ class SiteConfig // Test URL - if present, can be used to test the config above public $test_url = array(); + + // Test URL contains - one or more snippets of text from the article body. + // Used to determine if the extraction rules for the site are still valid (ie. still extracting relevant content) + // Keys should be one or more of the test URLs supplied, and value an array of strings to look for. + public $test_contains = array(); + + // If page contains - XPath expression. Used to determine if the preceding rule gets evaluated or not. 
+ // Currently only works with single_page_link. + public $if_page_contains = array(); // Single-page link - should identify a link element or URL pointing to the page holding the entire article // This is useful for sites which split their articles across multiple pages. Links to such pages tend to @@ -185,11 +193,23 @@ class SiteConfig public function append(SiteConfig $newconfig) { // check for commands where we accept multiple statements (no test_url) - foreach (array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'native_ad_clue', 'http_header') as $var) { + foreach (array('title', 'body', 'author', 'date', 'strip', 'strip_id_or_class', 'strip_image_src', 'single_page_link', 'single_page_link_in_feed', 'next_page_link', 'native_ad_clue') as $var) { // append array elements for this config variable from $newconfig to this config //$this->$var = $this->$var + $newconfig->$var; $this->$var = array_unique(array_merge($this->$var, $newconfig->$var)); } + // special handling of commands where key is important and config values being appended should not overwrite existing ones + foreach (array('http_header') as $var) { + $this->$var = array_merge($newconfig->$var, $this->$var); + } + // special handling of if_page_contains directive + foreach (array('single_page_link') as $var) { + if (isset($this->if_page_contains[$var]) && isset($newconfig->if_page_contains[$var])) { + $this->if_page_contains[$var] = array_merge($newconfig->if_page_contains[$var], $this->if_page_contains[$var]); + } elseif (isset($newconfig->if_page_contains[$var])) { + $this->if_page_contains[$var] = $newconfig->if_page_contains[$var]; + } + } // check for single statement commands // we do not overwrite existing non null values foreach (array('tidy', 'prune', 'parser', 'autodetect_on_failure') as $var) { @@ -213,6 +233,40 @@ class SiteConfig return $key_suffix; } + // Add test_contains to last 
test_url + public function add_test_contains($test_contains) { + if (!empty($this->test_url)) { + $test_contains = (string) $test_contains; + $key = end($this->test_url); + reset($this->test_url); + if (isset($this->test_contains[$key])) { + $this->test_contains[$key][] = $test_contains; + } else { + $this->test_contains[$key] = array($test_contains); + } + } + } + + // Add if_page_page_contains + // TODO: Expand so it can be used with other rules too + public function add_if_page_contains_condition($if_page_contains) { + if (!empty($this->single_page_link)) { + $if_page_contains = (string) $if_page_contains; + $key = end($this->single_page_link); + reset($this->single_page_link); + $this->if_page_contains['single_page_link'][$key] = $if_page_contains; + } + } + + public function get_if_page_contains_condition($directive_name, $directive_value) { + if (isset($this->if_page_contains[$directive_name])) { + if (isset($this->if_page_contains[$directive_name][$directive_value])) { + return $this->if_page_contains[$directive_name][$directive_value]; + } + } + return null; + } + // returns SiteConfig instance if an appropriate one is found, false otherwise // if $exact_host_match is true, we will not look for wildcard config matches // by default if host is 'test.example.org' we will look for and load '.example.org.txt' if it exists @@ -356,12 +410,20 @@ class SiteConfig // check for single statement commands stored as strings } elseif (in_array($command, array('parser'))) { $config->$command = $val; + // special treatment for test_contains + } elseif (in_array($command, array('test_contains'))) { + $config->add_test_contains($val); + // special treatment for if_page_contains + } elseif (in_array($command, array('if_page_contains'))) { + $config->add_if_page_contains_condition($val); // check for replace_string(find): replace } elseif ((substr($command, -1) == ')') && preg_match('!^([a-z0-9_]+)\((.*?)\)$!i', $command, $match)) { if (in_array($match[1], 
array('replace_string'))) { - $command = $match[1]; array_push($config->find_string, $match[2]); - array_push($config->$command, $val); + array_push($config->replace_string, $val); + } elseif (in_array($match[1], array('http_header'))) { + $_header = strtolower(trim($match[2])); + $config->http_header[$_header] = $val; } } } diff --git a/libraries/feedwriter/FeedWriter.php b/libraries/feedwriter/FeedWriter.php index 8279d02..c5f8c42 100644 --- a/libraries/feedwriter/FeedWriter.php +++ b/libraries/feedwriter/FeedWriter.php @@ -1,7 +1,7 @@ language = null; $simplejson->url = null; $simplejson->effective_url = null; + $simplejson->og_url = null; + $simplejson->og_title = null; + $simplejson->og_description = null; + $simplejson->og_image = null; + $simplejson->og_type = null; $simplejson->content = null; // actual values $simplejson->url = $jsonitem->link; @@ -151,6 +156,11 @@ define('JSONP', 3, true); if (isset($jsonitem->pubDate)) { $simplejson->date = gmdate(DATE_ATOM, strtotime($jsonitem->pubDate)); } + if (isset($jsonitem->og_url)) $simplejson->og_url = $jsonitem->og_url; + if (isset($jsonitem->og_title)) $simplejson->og_title = $jsonitem->og_title; + if (isset($jsonitem->og_description)) $simplejson->og_description = $jsonitem->og_description; + if (isset($jsonitem->og_image)) $simplejson->og_image = $jsonitem->og_image; + if (isset($jsonitem->og_type)) $simplejson->og_type = $jsonitem->og_type; echo json_encode($simplejson); } } @@ -327,7 +337,7 @@ define('JSONP', 3, true); { $out = ''."\n"; if ($this->xsl) $out .= 'xsl).'"?>' . PHP_EOL; - $out .= '' . PHP_EOL; + $out .= '' . 
PHP_EOL; echo $out; } elseif ($this->version == JSON || $this->version == JSONP) @@ -370,7 +380,9 @@ define('JSONP', 3, true); { foreach ($attributes as $key => $value) { - $attrText .= " $key=\"".htmlspecialchars($value, ENT_COMPAT, 'UTF-8', false)."\" "; + //$attrText .= " $key=\"".htmlspecialchars($value, ENT_COMPAT, 'UTF-8', false)."\" "; + // TODO: replace HTML entities not supported in XML with UTF8 equivalent characters + $attrText .= " $key=\"".htmlspecialchars($value, ENT_COMPAT, 'UTF-8')."\" "; } } $nodeText .= "<{$tagName}{$attrText}>"; @@ -384,7 +396,9 @@ define('JSONP', 3, true); else { //$nodeText .= (in_array($tagName, $this->CDATAEncoding))? $tagContent : htmlentities($tagContent); - $nodeText .= htmlspecialchars($tagContent, ENT_COMPAT, 'UTF-8', false); + //$nodeText .= htmlspecialchars($tagContent, ENT_COMPAT, 'UTF-8', false); + // TODO: replace HTML entities not supported in XML with UTF8 equivalent characters + $nodeText .= htmlspecialchars($tagContent, ENT_COMPAT, 'UTF-8'); } //$nodeText .= (in_array($tagName, $this->CDATAEncoding))? "]]>" : ""; $nodeText .= ""; diff --git a/libraries/htmLawed/htmLawed.php b/libraries/htmLawed/htmLawed.php index 032ef79..5d6285e 100644 --- a/libraries/htmLawed/htmLawed.php +++ b/libraries/htmLawed/htmLawed.php @@ -1,8 +1,8 @@ $v){ $v = preg_replace_callback('`(url(?:\()(?: )*(?:\'|"|&(?:quot|apos);)?)(.+?)((?:\'|"|&(?:quot|apos);)?(?: )*(?:\)))`iS', 'htmLawed::hl_prot', $v); $v = !$C['css_expression'] ? preg_replace('`expression`i', ' ', preg_replace('`\\\\\S|(/|(%2f))(\*|(%2a))`i', ' ', $v)) : $v; }elseif(isset($aNP[$k]) or strpos($k, 'src') !== false or $k[0] == 'o'){ - $v = str_replace("\xad", ' ', (strpos($v, '&') !== false ? str_replace(array('­', '­', '­'), ' ', $v) : $v)); + $v = str_replace("­", ' ', (strpos($v, '&') !== false ? 
str_replace(array('­', '­', '­'), ' ', $v) : $v)); # double-quoted char is soft-hyphen; appears here as "­" or hyphen or something else depending on viewing software $v = htmLawed::hl_prot($v, $k); if($k == 'href'){ // X-spam if($C['anti_mail_spam'] && strpos($v, 'mailto:') === 0){ @@ -701,7 +701,7 @@ return str_replace(array("\x01", "\x02", "\x03", "\x04", "\x05", "\x07"), array( public static function hl_version(){ // rel -return '1.1.17'; +return '1.1.19'; // eof } diff --git a/libraries/html5php/HTML5.php b/libraries/html5php/HTML5.php index 2585fd4..1c46c2b 100644 --- a/libraries/html5php/HTML5.php +++ b/libraries/html5php/HTML5.php @@ -60,20 +60,22 @@ class HTML5 * The path to the file to parse. If this is a resource, it is * assumed to be an open stream whose pointer is set to the first * byte of input. + * @param array $options + * Configuration options when parsing the HTML * @return \DOMDocument A DOM document. These object type is defined by the libxml * library, and should have been included with your version of PHP. */ - public function load($file) + public function load($file, array $options = array()) { // Handle the case where file is a resource. if (is_resource($file)) { // FIXME: We need a StreamInputStream class. - return $this->loadHTML(stream_get_contents($file)); + return $this->loadHTML(stream_get_contents($file), $options); } $input = new FileInputStream($file); - return $this->parse($input); + return $this->parse($input, $options); } /** @@ -84,14 +86,16 @@ class HTML5 * * @param string $string * A html5 document as a string. + * @param array $options + * Configuration options when parsing the HTML * @return \DOMDocument A DOM document. DOM is part of libxml, which is included with * almost all distribtions of PHP. 
*/ - public function loadHTML($string) + public function loadHTML($string, array $options = array()) { $input = new StringInputStream($string); - return $this->parse($input); + return $this->parse($input, $options); } /** @@ -104,13 +108,15 @@ class HTML5 * The path to the file to parse. If this is a resource, it is * assumed to be an open stream whose pointer is set to the first * byte of input. + * @param array $options + * Configuration options when parsing the HTML * * @return \DOMDocument A DOM document. These object type is defined by the libxml * library, and should have been included with your version of PHP. */ - public function loadHTMLFile($file) + public function loadHTMLFile($file, array $options = array()) { - return $this->load($file); + return $this->load($file, $options); } /** @@ -118,15 +124,17 @@ class HTML5 * * @param string $string * The html5 fragment as a string. + * @param array $options + * Configuration options when parsing the HTML * * @return \DOMDocumentFragment A DOM fragment. The DOM is part of libxml, which is included with * almost all distributions of PHP. */ - public function loadHTMLFragment($string) + public function loadHTMLFragment($string, array $options = array()) { $input = new StringInputStream($string); - return $this->parseFragment($input); + return $this->parseFragment($input, $options); } /** @@ -155,10 +163,10 @@ class HTML5 * Lower-level loading function. This requires an input stream instead * of a string, file, or resource. */ - public function parse(\Masterminds\HTML5\Parser\InputStream $input) + public function parse(\Masterminds\HTML5\Parser\InputStream $input, array $options = array()) { $this->errors = array(); - $events = new DOMTreeBuilder(false, $this->options); + $events = new DOMTreeBuilder(false, array_merge($this->getOptions(), $options)); $scanner = new Scanner($input); $parser = new Tokenizer($scanner, $events); @@ -174,9 +182,9 @@ class HTML5 * Lower-level loading function. 
This requires an input stream instead * of a string, file, or resource. */ - public function parseFragment(\Masterminds\HTML5\Parser\InputStream $input) + public function parseFragment(\Masterminds\HTML5\Parser\InputStream $input, array $options = array()) { - $events = new DOMTreeBuilder(true, $this->options); + $events = new DOMTreeBuilder(true, array_merge($this->getOptions(), $options)); $scanner = new Scanner($input); $parser = new Tokenizer($scanner, $events); diff --git a/libraries/html5php/HTML5/Elements.php b/libraries/html5php/HTML5/Elements.php index 819ce0e..6cf72aa 100644 --- a/libraries/html5php/HTML5/Elements.php +++ b/libraries/html5php/HTML5/Elements.php @@ -66,6 +66,11 @@ class Elements */ const BLOCK_TAG = 64; + /** + * Indicates that the tag allows only inline elements as child nodes. + */ + const BLOCK_ONLY_INLINE = 128; + /** * The HTML5 elements as defined in http://dev.w3.org/html5/markup/elements.html. * @@ -120,7 +125,7 @@ class Elements "head" => 1, "header" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG "hgroup" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG - "hr" => 73, // NORMAL | VOID_TAG | BLOCK_TAG + "hr" => 73, // NORMAL | VOID_TAG "html" => 1, "i" => 1, "iframe" => 3, // NORMAL | TEXT_RAW @@ -145,7 +150,7 @@ class Elements "optgroup" => 1, "option" => 1, "output" => 65, // NORMAL | BLOCK_TAG - "p" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG + "p" => 209, // NORMAL | AUTOCLOSE_P | BLOCK_TAG | BLOCK_ONLY_INLINE "param" => 9, // NORMAL | VOID_TAG "pre" => 81, // NORMAL | AUTOCLOSE_P | BLOCK_TAG "progress" => 1, diff --git a/libraries/html5php/HTML5/Parser/DOMTreeBuilder.php b/libraries/html5php/HTML5/Parser/DOMTreeBuilder.php index 42a55fe..ccad229 100644 --- a/libraries/html5php/HTML5/Parser/DOMTreeBuilder.php +++ b/libraries/html5php/HTML5/Parser/DOMTreeBuilder.php @@ -38,6 +38,12 @@ class DOMTreeBuilder implements EventHandler const NAMESPACE_XMLNS = 'http://www.w3.org/2000/xmlns/'; + const OPT_DISABLE_HTML_NS = 'disable_html_ns'; + + 
const OPT_TARGET_DOC = 'target_document'; + + const OPT_IMPLICIT_NS = 'implicit_namespaces'; + /** * Holds the HTML5 element names that causes a namespace switch * @@ -138,6 +144,12 @@ class DOMTreeBuilder implements EventHandler protected $insertMode = 0; + /** + * Track if we are in an element that allows only inline child nodes + * @var string|null + */ + protected $onlyInline; + /** * Quirks mode is enabled by default. * Any document that is missing the @@ -151,13 +163,17 @@ class DOMTreeBuilder implements EventHandler { $this->options = $options; - $impl = new \DOMImplementation(); - // XXX: - // Create the doctype. For now, we are always creating HTML5 - // documents, and attempting to up-convert any older DTDs to HTML5. - $dt = $impl->createDocumentType('html'); - // $this->doc = \DOMImplementation::createDocument(NULL, 'html', $dt); - $this->doc = $impl->createDocument(null, null, $dt); + if (isset($options[self::OPT_TARGET_DOC])) { + $this->doc = $options[self::OPT_TARGET_DOC]; + } else { + $impl = new \DOMImplementation(); + // XXX: + // Create the doctype. For now, we are always creating HTML5 + // documents, and attempting to up-convert any older DTDs to HTML5. + $dt = $impl->createDocumentType('html'); + // $this->doc = \DOMImplementation::createDocument(NULL, 'html', $dt); + $this->doc = $impl->createDocument(null, null, $dt); + } $this->errors = array(); $this->current = $this->doc; // ->documentElement; @@ -165,8 +181,15 @@ class DOMTreeBuilder implements EventHandler // Create a rules engine for tags. 
$this->rules = new TreeBuildingRules($this->doc); + $implicitNS = array(); + if (isset($this->options[self::OPT_IMPLICIT_NS])) { + $implicitNS = $this->options[self::OPT_IMPLICIT_NS]; + } elseif (isset($this->options["implicitNamespaces"])) { + $implicitNS = $this->options["implicitNamespaces"]; + } + // Fill $nsStack with the defalut HTML5 namespaces, plus the "implicitNamespaces" array taken form $options - array_unshift($this->nsStack, (isset($this->options["implicitNamespaces"]) ? $this->options["implicitNamespaces"] : array()) + array( + array_unshift($this->nsStack, $implicitNS + array( '' => self::NAMESPACE_HTML ) + $this->implicitNamespaces); @@ -320,6 +343,11 @@ class DOMTreeBuilder implements EventHandler } } + if ($this->onlyInline && Elements::isA($lname, Elements::BLOCK_TAG)) { + $this->autoclose($this->onlyInline); + $this->onlyInline = null; + } + try { $prefix = ($pos = strpos($lname, ':')) ? substr($lname, 0, $pos) : ''; @@ -334,10 +362,10 @@ class DOMTreeBuilder implements EventHandler $ele = $this->doc->importNode($frag->documentElement, true); } else { - if (isset($this->nsStack[0][$prefix])) { - $ele = $this->doc->createElementNS($this->nsStack[0][$prefix], $lname); - } else { + if (!isset($this->nsStack[0][$prefix]) || ($prefix === "" && isset($this->options[self::OPT_DISABLE_HTML_NS]) && $this->options[self::OPT_DISABLE_HTML_NS])) { $ele = $this->doc->createElement($lname); + } else { + $ele = $this->doc->createElementNS($this->nsStack[0][$prefix], $lname); } } @@ -346,6 +374,10 @@ class DOMTreeBuilder implements EventHandler $ele = $this->doc->createElement('invalid'); } + if (Elements::isA($lname, Elements::BLOCK_ONLY_INLINE)) { + $this->onlyInline = $lname; + } + // When we add some namespacess, we have to track them. Later, when "endElement" is invoked, we have to remove them. // When we are on a void tag, we do not need to care about namesapce nesting. 
if ($pushes > 0 && !Elements::isA($name, Elements::VOID_TAG)) { @@ -394,7 +426,7 @@ class DOMTreeBuilder implements EventHandler } // Some elements have special processing rules. Handle those separately. - if ($this->rules->hasRules($name)) { + if ($this->rules->hasRules($name) && $this->frag !== $this->current) { $this->current = $this->rules->evaluate($ele, $this->current); } // Otherwise, it's a standard element. else { @@ -649,4 +681,4 @@ class DOMTreeBuilder implements EventHandler { return $this->current->tagName == $tagname; } -} +} \ No newline at end of file diff --git a/libraries/html5php/HTML5/Parser/Scanner.php b/libraries/html5php/HTML5/Parser/Scanner.php index a92c608..f605c69 100644 --- a/libraries/html5php/HTML5/Parser/Scanner.php +++ b/libraries/html5php/HTML5/Parser/Scanner.php @@ -11,9 +11,9 @@ class Scanner const CHARS_HEX = 'abcdefABCDEF01234567890'; - const CHARS_ALNUM = 'abcdefAghijklmnopqrstuvwxyABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890'; + const CHARS_ALNUM = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890'; - const CHARS_ALPHA = 'abcdefAghijklmnopqrstuvwxyABCDEFGHIJKLMNOPQRSTUVWXYZ'; + const CHARS_ALPHA = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'; protected $is; diff --git a/libraries/html5php/HTML5/Parser/Tokenizer.php b/libraries/html5php/HTML5/Parser/Tokenizer.php index 92510de..a779191 100644 --- a/libraries/html5php/HTML5/Parser/Tokenizer.php +++ b/libraries/html5php/HTML5/Parser/Tokenizer.php @@ -200,10 +200,12 @@ class Tokenizer if (is_null($this->untilTag)) { return $this->text(); } - $sequence = 'untilTag . '>'; + $sequence = 'untilTag; $txt = ''; $tok = $this->scanner->current(); - while ($tok !== false && ! ($tok == '<' && ($this->sequenceMatches($sequence) || $this->sequenceMatches(strtoupper($sequence))))) { + + $caseSensitive = !Elements::isHtml5Element($this->untilTag); + while ($tok !== false && ! 
($tok == '<' && ($this->sequenceMatches($sequence, $caseSensitive)))) { if ($tok == '&') { $txt .= $this->decodeCharacterReference(); $tok = $this->scanner->current(); @@ -212,6 +214,13 @@ class Tokenizer $tok = $this->scanner->next(); } } + $len = strlen($sequence); + $this->scanner->consume($len); + $len += strlen($this->scanner->whitespace()); + if ($this->scanner->current() !== '>') { + $this->parseError("Unclosed RCDATA end tag"); + } + $this->scanner->unconsume($len); $this->events->text($txt); $this->setTextMode(0); return $this->endTag(); @@ -353,7 +362,7 @@ class Tokenizer } // We know this is at least one char. - $name = strtolower($this->scanner->charsWhile(":0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz")); + $name = strtolower($this->scanner->charsWhile(":_-0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz")); $attributes = array(); $selfClose = false; @@ -891,7 +900,7 @@ class Tokenizer $buffer .= $this->scanner->charsUntil($first); // Stop as soon as we hit the stopping condition. - if ($this->sequenceMatches($sequence) || $this->sequenceMatches(strtoupper($sequence))) { + if ($this->sequenceMatches($sequence, false)) { return $buffer; } $buffer .= $this->scanner->current(); @@ -916,7 +925,7 @@ class Tokenizer * see if the input stream is at the start of a * '' string. */ - protected function sequenceMatches($sequence) + protected function sequenceMatches($sequence, $caseSensitive = true) { $len = strlen($sequence); $buffer = ''; @@ -932,7 +941,7 @@ class Tokenizer } $this->scanner->unconsume($len); - return $buffer == $sequence; + return $caseSensitive ? $buffer == $sequence : strcasecmp($buffer, $sequence) === 0; } /** @@ -1056,8 +1065,14 @@ class Tokenizer // [a-zA-Z0-9]+; $cname = $this->scanner->getAsciiAlpha(); $entity = CharacterReference::lookupName($cname); + + // When no entity is found provide the name of the unmatched string + // and continue on as the & is not part of an entity. 
The & will + // be converted to & elsewhere. if ($entity == null) { - $this->parseError("No match in entity table for '%s'", $entity); + $this->parseError("No match in entity table for '%s'", $cname); + $this->scanner->unconsume($this->scanner->position() - $start); + return '&'; } } diff --git a/libraries/html5php/HTML5/Serializer/OutputRules.php b/libraries/html5php/HTML5/Serializer/OutputRules.php index 7ea7c6a..c009698 100644 --- a/libraries/html5php/HTML5/Serializer/OutputRules.php +++ b/libraries/html5php/HTML5/Serializer/OutputRules.php @@ -115,8 +115,10 @@ class OutputRules implements \Masterminds\HTML5\Serializer\RulesInterface public function document($dom) { $this->doctype(); - $this->traverser->node($dom->documentElement); - $this->nl(); + if ($dom->documentElement) { + $this->traverser->node($dom->documentElement); + $this->nl(); + } } protected function doctype() diff --git a/libraries/html5php/HTML5/Serializer/Traverser.php b/libraries/html5php/HTML5/Serializer/Traverser.php index e910f3a..9c700da 100644 --- a/libraries/html5php/HTML5/Serializer/Traverser.php +++ b/libraries/html5php/HTML5/Serializer/Traverser.php @@ -112,7 +112,7 @@ class Traverser break; // Currently we don't support embedding DTDs. 
default: - print ''; + //print ''; break; } } diff --git a/libraries/html5php/LICENSE.txt b/libraries/html5php/LICENSE.txt index 6ecbf3e..3c275b5 100644 --- a/libraries/html5php/LICENSE.txt +++ b/libraries/html5php/LICENSE.txt @@ -2,8 +2,9 @@ Copyright (c) 2013 The Authors of HTML5-PHP -Matt Butcher - technosophos@gmail.com +Matt Butcher - mattbutcher@google.com Matt Farina - matt@mattfarina.com +Asmir Mustafic - goetas@gmail.com Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in diff --git a/libraries/html5php/README.md b/libraries/html5php/README.md index 746ab4d..bbe003f 100644 --- a/libraries/html5php/README.md +++ b/libraries/html5php/README.md @@ -10,6 +10,7 @@ But after some initial refactoring work, we began a new parser. - Event-based (SAX-like) parser - DOM tree builder - Interoperability with QueryPath [[in progress](https://github.com/technosophos/querypath/issues/114)] +- Runs on **PHP** 5.3.0 or newer and **HHVM** 3.2 or newer [![Build Status](https://travis-ci.org/Masterminds/html5-php.png?branch=master)](https://travis-ci.org/Masterminds/html5-php) [![Latest Stable Version](https://poser.pugx.org/masterminds/html5/v/stable.png)](https://packagist.org/packages/masterminds/html5) [![Coverage Status](https://coveralls.io/repos/Masterminds/html5-php/badge.png?branch=master)](https://coveralls.io/r/Masterminds/html5-php?branch=master) @@ -22,12 +23,12 @@ To install, add `masterminds/html5` to your `composer.json` file: ``` { "require" : { - "masterminds/html5": "1.*" + "masterminds/html5": "2.*" }, } ``` -(You may substitute `1.*` for a more specific release tag, of +(You may substitute `2.*` for a more specific release tag, of course.) 
From there, use the `composer install` or `composer update` commands to @@ -43,6 +44,7 @@ Here is how you use the high-level `HTML5` library API: loadHTML($html); // Render it as HTML5: -print HTML5::saveHTML($dom); +print $html5->saveHTML($dom); // Or save it to a file: -HTML5::save($dom, 'out.html'); +$html5->save($dom, 'out.html'); ?> ``` @@ -73,6 +76,35 @@ HTML5::save($dom, 'out.html'); The `$dom` created by the parser is a full `DOMDocument` object. And the `save()` and `saveHTML()` methods will take any DOMDocument. +### Options + +It is possible to pass in an array of configuration options when loading +an HTML5 document. + +```php +// An associative array of options +$options = array( + 'option_name' => 'option_value', +); + +// Provide the options to the constructor +$html5 = new HTML5($options); + +$dom = $html5->loadHTML($html); +``` + +The following options are supported: + +* `encode_entities` (boolean): Indicates that the serializer should aggressively + encode characters as entities. Without this, it only encodes the bare + minimum. +* `disable_html_ns` (boolean): Prevents the parser from automatically + assigning the HTML5 namespace to the DOM document. This is for + non-namespace aware DOM tools. +* `target_document` (\DOMDocument): A DOM document that will be used as the + destination for the parsed nodes. +* `implicit_namespaces` (array): An assoc array of namespaces that should be + used by the parser. Name is tag prefix, value is NS URI. ## The Low-Level API @@ -116,7 +148,7 @@ different rule sets to be used. - The `Traverser`, which is a special-purpose tree walker. It visits each node node in the tree and uses the `OutputRules` to transform the node into a string. -- `\HTML5` manages the `Traverser` and stores the resultant data +- `HTML5` manages the `Traverser` and stores the resultant data in the correct place. 
The serializer (`save()`, `saveHTML()`) follows the @@ -134,7 +166,9 @@ issues known issues that are not presently on the roadmap: - Namespaces: HTML5 only [supports a selected list of namespaces](http://www.w3.org/TR/html5/infrastructure.html#namespaces) and they do not operate in the same way as XML namespaces. A `:` has no special - meaning. The parser does not support XML style namespaces via `:`. + meaning. + By default the parser does not support XML style namespaces via `:`; + to enable the XML namespaces see the [XML Namespaces section](#xml-namespaces) - Scripts: This parser does not contain a JavaScript or a CSS interpreter. While one may be supplied, not all features will be supported. @@ -162,8 +196,45 @@ issues known issues that are not presently on the roadmap: - PLAINTEXT: Unsupported. - Adoption Agency Algorithm: Not yet implemented. (8.2.5.4.7) +##XML Namespaces + +To use XML style namespaces you have to configure well the main `HTML5` instance. + +```php +use Masterminds\HTML5; +$html = new HTML5(array( + "xmlNamespaces" => true +)); + +$dom = $html->loadHTML(''); + +$dom->documentElement->namespaceURI; // http://www.example.com + +``` + +You can also add some default prefixes that will not require the namespace declaration, +but it's elements will be namespaced. + +```php +use Masterminds\HTML5; +$html = new HTML5(array( + "implicitNamespaces"=>array( + "t"=>"http://www.example.com" + ) +)); + +$dom = $html->loadHTML(''); + +$dom->documentElement->namespaceURI; // http://www.example.com + +``` + ## Thanks to... +The dedicated (and patient) contributors of patches small and large, +who have already made this library better.See the CREDITS file for +a list of contributors. + We owe a huge debt of gratitude to the original authors of html5lib. 
While not much of the orignal parser remains, we learned a lot from diff --git a/libraries/html5php/RELEASE.md b/libraries/html5php/RELEASE.md index 56e0cf0..d4b64a5 100644 --- a/libraries/html5php/RELEASE.md +++ b/libraries/html5php/RELEASE.md @@ -1,5 +1,42 @@ # Release Notes +2.1.1 (2015-03-23) +- #78: Fixes bug where unmatched entity like string drops everything after &. + +2.1.0 (2015-02-01) +- #74: Added `disable_html_ns` and `target_doc` dom parsing options +- Unified option names +- #73: Fixed alphabet, ß now can be detected +- #75 and #76: Allow whitespace in RCDATA tags +- #77: Fixed parsing blunder for json embeds +- #72: Add options to HTML methods + +2.0.2 (2014-12-17) +- #50: empty document handling +- #63: tags with strange capitalization +- #65: dashes and underscores as allowed characters in tag names +- #68: Fixed issue with non-inline elements inside inline containers + +2.0.1 (2014-09-23) +- #59: Fixed issue parsing some fragments. +- #56: Incorrectly saw 0 as empty string +- Sami as new documentation generator + +2.0.0 (2014-07-28) +- #53: Improved boolean attributes handling +- #52: Facebook HHVM compatibility +- #48: Adopted PSR-2 as coding standard +- #47: Moved everything to Masterminds namespace +- #45: Added custom namespaces +- #44: Added support to XML-style namespaces +- #37: Refactored HTML5 class removing static methods + +1.0.5 (2014-06-10) +- #38: Set the dev-master branch as the 1.0.x branch for composer (goetas) +- #34: Tests use PSR-4 for autoloading. (goetas) +- #40, #41: Fix entity handling in RCDATA sections. (KitaitiMakoto) +- #32: Fixed issue where wharacter references were being incorrectly encoded in style tags. + 1.0.4 (2014-04-29) - #30/#31 Don't throw an exception for invalid tag names. 
diff --git a/libraries/humble-http-agent/HumbleHttpAgent.php b/libraries/humble-http-agent/HumbleHttpAgent.php index 6f60fe9..4f3b83d 100644 --- a/libraries/humble-http-agent/HumbleHttpAgent.php +++ b/libraries/humble-http-agent/HumbleHttpAgent.php @@ -7,11 +7,11 @@ * For environments which do not have these options, it reverts to standard sequential * requests (using file_get_contents()) * - * @version 1.5 - * @date 2014-03-28 - * @see http://php.net/HttpRequestPool + * @version 1.6 + * @date 2015-06-05 + * @see http://devel-m6w6.rhcloud.com/mdref/http * @author Keyvan Minoukadeh - * @copyright 2011-2014 Keyvan Minoukadeh + * @copyright 2011-2015 Keyvan Minoukadeh * @license http://www.gnu.org/licenses/agpl-3.0.html AGPL v3 */ @@ -22,7 +22,7 @@ class HumbleHttpAgent const METHOD_FILE_GET_CONTENTS = 4; //const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'; const UA_BROWSER = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.92 Safari/535.2'; - const UA_PHP = 'PHP/5.4'; + const UA_PHP = 'PHP/5.5'; const REF_GOOGLE = 'http://www.google.co.uk/url?sa=t&source=web&cd=1'; protected $requests = array(); @@ -38,6 +38,7 @@ class HumbleHttpAgent public $debug = false; public $debugVerbose = false; public $rewriteHashbangFragment = true; // see http://code.google.com/web/ajaxcrawling/docs/specification.html + public $siteConfigBuilder = null; // can be set to an instance of ContentExtractor to have site config files used for custom HTTP headers public $maxRedirects = 5; public $userAgentMap = array(); public $rewriteUrls = array(); @@ -67,7 +68,7 @@ class HumbleHttpAgent if (in_array($method, array(1,2,4))) { $this->method = $method; } else { - if (class_exists('HttpRequestPool')) { + if (class_exists('http\Client\Request')) { $this->method = self::METHOD_REQUEST_POOL; } elseif (function_exists('curl_multi_init')) { $this->method = self::METHOD_CURL_MULTI; @@ -192,6 +193,7 @@ class 
HumbleHttpAgent return false; } $redirect_url = $match[1]; + $redirect_url = htmlspecialchars_decode($redirect_url); // For Facebook! if (preg_match('!^https?://!i', $redirect_url)) { // already absolute $this->debug('Meta refresh redirect found (http-equiv="refresh"), new URL: '.$redirect_url); @@ -203,7 +205,7 @@ class HumbleHttpAgent if (isset($base->path)) $base->path = preg_replace('!//+!', '/', $base->path); if ($absolute = SimplePie_IRI::absolutize($base, $redirect_url)) { $this->debug('Meta refresh redirect found (http-equiv="refresh"), new URL: '.$absolute); - return $absolute; + return $absolute->get_iri(); } return false; } @@ -293,14 +295,16 @@ class HumbleHttpAgent if (empty($urls)) return; ////////////////////////////////////////////////////// - // parallel (HttpRequestPool) + // parallel (HTTP extension) if ($this->method == self::METHOD_REQUEST_POOL) { - $this->debug('Starting parallel fetch (HttpRequestPool)'); + $this->debug('Starting parallel fetch (HTTP Extension)'); try { while (count($urls) > 0) { $this->debug('Processing set of '.min($this->maxParallelRequests, count($urls))); $subset = array_splice($urls, 0, $this->maxParallelRequests); - $pool = new HttpRequestPool(); + //$pool = new HttpRequestPool(); + $pool = new http\Client; + $pool->setOptions($this->requestOptions); foreach ($subset as $orig => $url) { if (!$isRedirect) $orig = $url; unset($this->redirectQueue[$orig]); @@ -320,24 +324,62 @@ class HumbleHttpAgent $req_url = ($this->rewriteHashbangFragment) ? 
$this->rewriteHashbangFragment($req_url) : $req_url; $req_url = $this->removeFragment($req_url); if (!empty($this->headerOnlyTypes) && !isset($this->requests[$orig]['wrongGuess']) && $this->possibleUnsupportedType($req_url)) { - $_meth = HttpRequest::METH_HEAD; + $_meth = "HEAD"; } else { - $_meth = HttpRequest::METH_GET; + $_meth = "GET"; unset($this->requests[$orig]['wrongGuess']); } - $httpRequest = new HttpRequest($req_url, $_meth, $this->requestOptions); - // send cookies, if we have any - if ($cookies = $this->getCookies($orig, $req_url)) { - $this->debug("......sending cookies: $cookies"); - $httpRequest->addHeaders(array('Cookie' => $cookies)); + //$httpRequest = new HttpRequest($req_url, $_meth, $this->requestOptions); + $httpRequest = new http\Client\Request($_meth, $req_url); + $httpRequest->setOptions($this->requestOptions); + + // check site config for additional http headers + $scHeaders = array(); + if (isset($this->siteConfigBuilder)) { + $scHeaders = $this->siteConfigBuilder->buildSiteConfig($req_url)->http_header; } - //$httpRequest->addHeaders(array('User-Agent' => $this->userAgent)); - $httpRequest->addHeaders($this->getUserAgent($req_url, true)); + + // send cookies, if we have any + $_cookies = null; + if (isset($scHeaders['cookie'])) { + $_cookies = $scHeaders['cookie']; + } else { + //$_cookies = $this->cookieJar->getMatchingCookies($req_url); + $_cookies = $this->getCookies($orig, $req_url); + } + if ($_cookies) { + $this->debug("......sending cookies: $_cookies"); + $httpRequest->addHeaders(array('Cookie' => $_cookies)); + } + + // send user agent + $_ua = null; + if (isset($scHeaders['user-agent'])) { + $_ua = $scHeaders['user-agent']; + } else { + $_ua = $this->getUserAgent($req_url, true); + $_ua = $_ua['User-Agent']; + } + if ($_ua) { + $this->debug("......user-agent set to: $_ua"); + $httpRequest->addHeaders(array('User-Agent' => $_ua)); + } + // add referer for picky sites - $httpRequest->addheaders(array('Referer' => 
$this->referer)); + $_referer = null; + if (isset($scHeaders['referer'])) { + $_referer = $scHeaders['referer']; + } else { + $_referer = $this->referer; + } + if ($_referer) { + $this->debug("......referer set to: $_referer"); + $httpRequest->addheaders(array('Referer'=>$_referer)); + } + $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest); $this->requests[$orig]['original_url'] = $orig; - $pool->attach($httpRequest); + $pool->enqueue($httpRequest); } } // did we get anything into the pool? @@ -345,16 +387,20 @@ class HumbleHttpAgent $this->debug('Sending request...'); try { $pool->send(); - } catch (HttpRequestPoolException $e) { + } catch (http\Exception $e) { // do nothing } $this->debug('Received responses'); foreach($subset as $orig => $url) { if (!$isRedirect) $orig = $url; $request = $this->requests[$orig]['httpRequest']; + $response = $pool->getResponse($request); //$this->requests[$orig]['headers'] = $this->headersToString($request->getResponseHeader()); // getResponseHeader() doesn't return status line, so, for consistency... 
- $this->requests[$orig]['headers'] = substr($request->getRawResponseMessage(), 0, $request->getResponseInfo('header_size')); + //$headers = $response->toString(); + $this->requests[$orig]['headers'] = $response->getInfo()."\n".$this->headersToString($response->getHeaders(), true); + // v1 HTTP extension code + //$this->requests[$orig]['headers'] = substr($request->getRawResponseMessage(), 0, $request->getResponseInfo('header_size')); // check content type // TODO: use getResponseHeader('content-type') or getResponseInfo() if ($this->headerOnlyType($this->requests[$orig]['headers'])) { @@ -362,25 +408,37 @@ class HumbleHttpAgent $_header_only_type = true; $this->debug('Header only type returned'); } else { - $this->requests[$orig]['body'] = $request->getResponseBody(); + $this->requests[$orig]['body'] = $response->getBody()->toString(); + //var_dump($this->requests[$orig]['body']);exit; + // v1 HTTP ext. code + //$this->requests[$orig]['body'] = $request->getResponseBody(); $_header_only_type = false; } - $this->requests[$orig]['effective_url'] = $request->getResponseInfo('effective_url'); - $this->requests[$orig]['status_code'] = $status_code = $request->getResponseCode(); + $this->requests[$orig]['effective_url'] = $response->getTransferInfo('effective_url'); + $this->requests[$orig]['status_code'] = $status_code = $response->getResponseCode(); + // v1 HTTP ext. code + //$this->requests[$orig]['effective_url'] = $request->getResponseInfo('effective_url'); + //$this->requests[$orig]['status_code'] = $status_code = $request->getResponseCode(); // is redirect? - if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && $request->getResponseHeader('location')) { - $redirectURL = $request->getResponseHeader('location'); + if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && $response->getHeader('location')) { + // v1 HTTP ext. 
code + //if ((in_array($status_code, array(300, 301, 302, 303, 307)) || $status_code > 307 && $status_code < 400) && $request->getResponseHeader('location')) { + $redirectURL = $response->getHeader('location'); if (!preg_match('!^https?://!i', $redirectURL)) { $redirectURL = SimplePie_Misc::absolutize_url($redirectURL, $url); } if ($this->validateURL($redirectURL)) { $this->debug('Redirect detected. Valid URL: '.$redirectURL); + // store any cookies + //$cookies = $request->getResponseHeader('set-cookie'); + //if ($cookies && !is_array($cookies)) $cookies = array($cookies); + //if ($cookies) $this->cookieJar->storeCookies($url, $cookies); $this->storeCookies($orig, $url); $this->redirectQueue[$orig] = $redirectURL; } else { $this->debug('Redirect detected. Invalid URL: '.$redirectURL); } - } elseif (!$_header_only_type && $request->getMethod() === HttpRequest::METH_HEAD) { + } elseif (!$_header_only_type && $request->getRequestMethod() == "HEAD") { // the response content-type did not match our 'header only' types, // but we'd issues a HEAD request because we assumed it would. So // let's queue a proper GET request for this item... @@ -399,7 +457,7 @@ class HumbleHttpAgent } } //die($url.' 
-multi- '.$request->getResponseInfo('effective_url')); - $pool->detach($request); + $pool->dequeue($request); unset($this->requests[$orig]['httpRequest'], $request); /* if ($this->minimiseMemoryUse) { @@ -411,7 +469,7 @@ class HumbleHttpAgent } } } - } catch (HttpException $e) { + } catch (http\Exception $e) { $this->debug($e); return false; } @@ -450,17 +508,53 @@ class HumbleHttpAgent } else { $_meth = 'GET'; unset($this->requests[$orig]['wrongGuess']); - } - $headers = array(); - //$headers[] = 'User-Agent: '.$this->userAgent; - $headers[] = $this->getUserAgent($req_url); - // add referer for picky sites - $headers[] = 'Referer: '.$this->referer; - // send cookies, if we have any - if ($cookies = $this->getCookies($orig, $req_url)) { - $this->debug("......sending cookies: $cookies"); - $headers[] = 'Cookie: '.$cookies; } + $headers = array(); + + // check site config for additional http headers + $scHeaders = array(); + if (isset($this->siteConfigBuilder)) { + $scHeaders = $this->siteConfigBuilder->buildSiteConfig($req_url)->http_header; + } + + // send cookies, if we have any + $_cookies = null; + if (isset($scHeaders['cookie'])) { + $_cookies = $scHeaders['cookie']; + } else { + //$_cookies = $this->cookieJar->getMatchingCookies($req_url); + $_cookies = $this->getCookies($orig, $req_url); + } + if ($_cookies) { + $this->debug("......sending cookies: $_cookies"); + $headers[] = 'Cookie: '.$_cookies; + } + + // send user agent + $_ua = null; + if (isset($scHeaders['user-agent'])) { + $_ua = $scHeaders['user-agent']; + } else { + $_ua = $this->getUserAgent($req_url, true); + $_ua = $_ua['User-Agent']; + } + if ($_ua) { + $this->debug("......user-agent set to: $_ua"); + $headers[] = 'User-Agent: '.$_ua; + } + + // add referer for picky sites + $_referer = null; + if (isset($scHeaders['referer'])) { + $_referer = $scHeaders['referer']; + } else { + $_referer = $this->referer; + } + if ($_referer) { + $this->debug("......referer set to: $_referer"); + $headers[] = 
'Referer: '.$_referer; + } + $httpRequest = new RollingCurlRequest($req_url, $_meth, null, $headers, $this->curlOptions); $httpRequest->set_original_url($orig); $this->requests[$orig] = array('headers'=>null, 'body'=>null, 'httpRequest'=>$httpRequest); @@ -494,7 +588,10 @@ class HumbleHttpAgent } if ($this->validateURL($redirectURL)) { $this->debug('Redirect detected. Valid URL: '.$redirectURL); - $this->storeCookies($orig, $url); + // store any cookies + //$cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']); + //if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies); + $this->storeCookies($orig, $url); $this->redirectQueue[$orig] = $redirectURL; } else { $this->debug('Redirect detected. Invalid URL: '.$redirectURL); @@ -548,15 +645,52 @@ class HumbleHttpAgent $req_url = $this->rewriteUrls($url); $req_url = ($this->rewriteHashbangFragment) ? $this->rewriteHashbangFragment($req_url) : $req_url; $req_url = $this->removeFragment($req_url); - // send cookies, if we have any $httpContext = $this->httpContext; - $httpContext['http']['header'] .= $this->getUserAgent($req_url)."\r\n"; - // add referer for picky sites - $httpContext['http']['header'] .= 'Referer: '.$this->referer."\r\n"; - if ($cookies = $this->getCookies($orig, $req_url)) { - $this->debug("......sending cookies: $cookies"); - $httpContext['http']['header'] .= 'Cookie: '.$cookies."\r\n"; + + // check site config for additional http headers + $scHeaders = array(); + if (isset($this->siteConfigBuilder)) { + $scHeaders = $this->siteConfigBuilder->buildSiteConfig($req_url)->http_header; } + + // send cookies, if we have any + $_cookies = null; + if (isset($scHeaders['cookie'])) { + $_cookies = $scHeaders['cookie']; + } else { + //$_cookies = $this->cookieJar->getMatchingCookies($req_url); + $_cookies = $this->getCookies($orig, $req_url); + } + if ($_cookies) { + $this->debug("......sending cookies: $_cookies"); + $httpContext['http']['header'] .= 'Cookie: 
'.$_cookies."\r\n"; + } + + // send user agent + $_ua = null; + if (isset($scHeaders['user-agent'])) { + $_ua = $scHeaders['user-agent']; + } else { + $_ua = $this->getUserAgent($req_url, true); + $_ua = $_ua['User-Agent']; + } + if ($_ua) { + $this->debug("......user-agent set to: $_ua"); + $httpContext['http']['header'] .= 'User-Agent: '.$_ua."\r\n"; + } + + // add referer for picky sites + $_referer = null; + if (isset($scHeaders['referer'])) { + $_referer = $scHeaders['referer']; + } else { + $_referer = $this->referer; + } + if ($_referer) { + $this->debug("......referer set to: $_referer"); + $httpContext['http']['header'] .= 'Referer: '.$_referer."\r\n"; + } + if (false !== ($html = @file_get_contents($req_url, false, stream_context_create($httpContext)))) { $this->debug('Received response'); // get status code @@ -585,6 +719,9 @@ class HumbleHttpAgent } if ($this->validateURL($redirectURL)) { $this->debug('Redirect detected. Valid URL: '.$redirectURL); + // store any cookies + //$cookies = $this->cookieJar->extractCookies($this->requests[$orig]['headers']); + //if (!empty($cookies)) $this->cookieJar->storeCookies($url, $cookies); $this->storeCookies($orig, $url); $this->redirectQueue[$orig] = $redirectURL; } else { @@ -680,7 +817,7 @@ class HumbleHttpAgent } public function parallelSupport() { - return class_exists('HttpRequestPool') || function_exists('curl_multi_init'); + return class_exists('http\Client') || function_exists('curl_multi_init'); } private function headerOnlyType($headers) { @@ -703,7 +840,7 @@ class HumbleHttpAgent } return false; } - + protected function getCookies($orig, $req_url) { $jar = $this->cookieJar[$orig]; if (!isset($jar)) { @@ -727,6 +864,7 @@ class HumbleHttpAgent protected function deleteCookies() { $this->cookieJar = array(); } + } // gzdecode from http://www.php.net/manual/en/function.gzdecode.php#82930 diff --git a/libraries/humble-http-agent/HumbleHttpAgentDummy.php b/libraries/humble-http-agent/HumbleHttpAgentDummy.php 
index f2a93e9..cd30c32 100644 --- a/libraries/humble-http-agent/HumbleHttpAgentDummy.php +++ b/libraries/humble-http-agent/HumbleHttpAgentDummy.php @@ -22,6 +22,7 @@ class HumbleHttpAgentDummy public $userAgentMap = array(); public $rewriteUrls = array(); public $userAgentDefault; + public $siteConfigBuilder = null; public $referer; protected $body = ''; diff --git a/libraries/readability/Readability.php b/libraries/readability/Readability.php index b99f3bf..9e41237 100644 --- a/libraries/readability/Readability.php +++ b/libraries/readability/Readability.php @@ -12,7 +12,7 @@ * More information: http://fivefilters.org/content-only/ * License: Apache License, Version 2.0 * Requires: PHP5 -* Date: 2014-03-27 +* Date: 2015-06-01 * * Differences between the PHP port and the original * ------------------------------------------------------ @@ -95,7 +95,7 @@ class Readability // 'trimRe' => '/^\s+|\s+$/g', // PHP has trim() 'normalize' => '/\s{2,}/', 'killBreaks' => '/((\s| ?)*){1,}/', - 'video' => '!//(player\.|www\.)?(youtube\.com|vimeo\.com|viddler\.com|twitch\.tv)!i', + 'video' => '!//(player\.|www\.)?(youtube\.com|vimeo\.com|viddler\.com|soundcloud\.com|twitch\.tv)!i', 'skipFootnoteLink' => '/^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i' ); @@ -121,8 +121,12 @@ class Readability if (version_compare(PHP_VERSION, '5.3.0') >= 0) { //use Masterminds\HTML5; $html5class = 'Masterminds\HTML5'; - $html5 = new $html5class(); + $html5 = new $html5class(array('disable_html_ns' => true)); $this->dom = $html5->loadHTML($html); + //echo $html5->saveHTML($this->dom);exit; + //$xpath = new DOMXPath($this->dom); + //$elems = $xpath->query("//a"); + //print_r($elems);exit; } } if ($this->dom === null) { @@ -314,7 +318,11 @@ class Readability $styleTags = $this->dom->getElementsByTagName('style'); for ($i = $styleTags->length-1; $i >= 0; $i--) { - $styleTags->item($i)->parentNode->removeChild($styleTags->item($i)); + try { + 
@$styleTags->item($i)->parentNode->removeChild($styleTags->item($i)); + } catch (Exception $e) { + // Do nothing + } } /* Turn all double br's into p's */ @@ -832,7 +840,11 @@ class Readability $scripts = $doc->getElementsByTagName('script'); for($i = $scripts->length-1; $i >= 0; $i--) { - $scripts->item($i)->parentNode->removeChild($scripts->item($i)); + try { + $scripts->item($i)->parentNode->removeChild($scripts->item($i)); + } catch (Exception $e) { + // do nothing + } } } diff --git a/makefulltextfeed.php b/makefulltextfeed.php index 603aada..d17dabd 100644 --- a/makefulltextfeed.php +++ b/makefulltextfeed.php @@ -1,10 +1,10 @@ . error_reporting(E_ALL ^ E_NOTICE); libxml_use_internal_errors(true); +libxml_disable_entity_loader(true); ini_set("display_errors", 1); @set_time_limit(120); @@ -234,7 +235,7 @@ if (isset($_REQUEST['accept']) && in_array(strtolower($_REQUEST['accept']), arra $user_submitted_config = null; if (isset($_REQUEST['siteconfig'])) { $user_submitted_config = $_REQUEST['siteconfig']; - if (!$options->user_submitted_content && $user_submitted_config) { + if (!$options->user_submitted_config && $user_submitted_config) { die('User-submitted site configs are currently disabled. 
Please remove the siteconfig parameter.'); } } @@ -526,7 +527,8 @@ if (isset($_REQUEST['inputhtml']) && _FF_FTR_MODE == 'simple') { } $http = new HumbleHttpAgent($_req_options); $http->debug = $debug_mode; - $http->userAgentMap = $options->user_agents; + // User agents can now be set in site config files using the http_header directive + //$http->userAgentMap = $options->user_agents; $http->headerOnlyTypes = array_keys($options->content_type_exc); $http->rewriteUrls = $options->rewrite_url; unset($_req_options); @@ -545,6 +547,7 @@ $extractor->parserOverride = $parser; if ($options->user_submitted_config && $user_submitted_config) { $extractor->setUserSubmittedConfig($user_submitted_config); } +$http->siteConfigBuilder = $extractor; //////////////////////////////// // Get RSS/Atom feed @@ -655,7 +658,7 @@ $items = $feed->get_items(0, $max); $urls_sanitized = array(); $urls = array(); foreach ($items as $key => $item) { - $permalink = htmlspecialchars_decode($item->get_permalink()); + $permalink = htmlspecialchars_decode(trim($item->get_permalink())); // Colons in URL path segments get encoded by SimplePie, yet some sites expect them unencoded $permalink = str_replace('%3A', ':', $permalink); // validateUrl() strips non-ascii characters @@ -973,6 +976,13 @@ foreach ($items as $key => $item) { break; } } + + // add open graph + if ($opengraph = $extractor->getOpenGraph()) { + foreach ($opengraph as $og_prop => $og_val) { + $newitem->addElement($og_prop, $og_val); + } + } // add language if ($detect_language) { @@ -1390,6 +1400,17 @@ function get_single_page($item, $html, $url) { // Loop through single_page_link xpath expressions $single_page_url = null; foreach ($splink as $pattern) { + // Do we have conditions? 
+ $condition = $site_config->get_if_page_contains_condition('single_page_link', $pattern); + if ($condition) { + $elems = @$xpath->evaluate($condition, $readability->dom); + if ($elems instanceof DOMNodeList && $elems->length > 0) { + // all fine + } else { + // move on to next single page link XPath + continue; + } + } $elems = @$xpath->evaluate($pattern, $readability->dom); if (is_string($elems)) { $single_page_url = trim($elems); diff --git a/robots.txt b/robots.txt new file mode 100644 index 0000000..f9efb86 --- /dev/null +++ b/robots.txt @@ -0,0 +1,3 @@ +User-agent: * +Disallow: /makefulltextfeed.php +Disallow: /extract.php \ No newline at end of file diff --git a/site_config/standard/index.php b/site_config/standard/index.php new file mode 100644 index 0000000..a3d5f73 --- /dev/null +++ b/site_config/standard/index.php @@ -0,0 +1,3 @@ + \ No newline at end of file