', $data);
- }
- }
-
- if ($type & (SIMPLEPIE_CONSTRUCT_HTML | SIMPLEPIE_CONSTRUCT_XHTML))
- {
- if ($this->strip_comments)
- {
- $data = SimplePie_Misc::strip_comments($data);
- }
-
- if ($this->strip_htmltags)
- {
- foreach ($this->strip_htmltags as $tag)
- {
- $pcre = "/<($tag)" . SIMPLEPIE_PCRE_HTML_ATTRIBUTE . "(>(.*)<\/$tag" . SIMPLEPIE_PCRE_HTML_ATTRIBUTE . '>|(\/)?>)/siU';
- while (preg_match($pcre, $data))
- {
- $data = preg_replace_callback($pcre, array(&$this, 'do_strip_htmltags'), $data);
- }
- }
- }
-
- if ($this->strip_attributes)
- {
- foreach ($this->strip_attributes as $attrib)
- {
- $data = preg_replace('/(<[A-Za-z][^\x09\x0A\x0B\x0C\x0D\x20\x2F\x3E]*)' . SIMPLEPIE_PCRE_HTML_ATTRIBUTE . trim($attrib) . '(?:\s*=\s*(?:"(?:[^"]*)"|\'(?:[^\']*)\'|(?:[^\x09\x0A\x0B\x0C\x0D\x20\x22\x27\x3E][^\x09\x0A\x0B\x0C\x0D\x20\x3E]*)?))?' . SIMPLEPIE_PCRE_HTML_ATTRIBUTE . '>/', '\1\2\3>', $data);
- }
- }
-
- $this->base = $base;
- foreach ($this->replace_url_attributes as $element => $attributes)
- {
- $data = $this->replace_urls($data, $element, $attributes);
- }
-
- if (isset($this->image_handler) && ((string) $this->image_handler) !== '' && $this->enable_cache)
- {
- $images = SimplePie_Misc::get_element('img', $data);
- foreach ($images as $img)
- {
- if (isset($img['attribs']['src']['data']))
- {
- $image_url = call_user_func($this->cache_name_function, $img['attribs']['src']['data']);
- $cache = call_user_func(array($this->cache_class, 'create'), $this->cache_location, $image_url, 'spi');
-
- if ($cache->load())
- {
- $img['attribs']['src']['data'] = $this->image_handler . $image_url;
- $data = str_replace($img['full'], SimplePie_Misc::element_implode($img), $data);
- }
- else
- {
- $file = new $this->file_class($img['attribs']['src']['data'], $this->timeout, 5, array('X-FORWARDED-FOR' => $_SERVER['REMOTE_ADDR']), $this->useragent, $this->force_fsockopen);
- $headers = $file->headers;
-
- if ($file->success && ($file->method & SIMPLEPIE_FILE_SOURCE_REMOTE === 0 || ($file->status_code === 200 || $file->status_code > 206 && $file->status_code < 300)))
- {
- if ($cache->save(array('headers' => $file->headers, 'body' => $file->body)))
- {
- $img['attribs']['src']['data'] = $this->image_handler . $image_url;
- $data = str_replace($img['full'], SimplePie_Misc::element_implode($img), $data);
- }
- else
- {
- trigger_error("$this->cache_location is not writeable. Make sure you've set the correct relative or absolute path, and that the location is server-writable.", E_USER_WARNING);
- }
- }
- }
- }
- }
- }
-
- $data = trim($data);
- }
-
- if ($type & SIMPLEPIE_CONSTRUCT_IRI)
- {
- $data = SimplePie_Misc::absolutize_url($data, $base);
- }
-
- if ($type & (SIMPLEPIE_CONSTRUCT_TEXT | SIMPLEPIE_CONSTRUCT_IRI))
- {
- $data = htmlspecialchars($data, ENT_COMPAT, 'UTF-8');
- }
-
- if ($this->output_encoding !== 'UTF-8')
- {
- $data = SimplePie_Misc::change_encoding($data, 'UTF-8', $this->output_encoding);
- }
- }
- return $data;
- }
-
- public function replace_urls($data, $tag, $attributes)
- {
- if (!is_array($this->strip_htmltags) || !in_array($tag, $this->strip_htmltags))
- {
- $elements = SimplePie_Misc::get_element($tag, $data);
- foreach ($elements as $element)
- {
- if (is_array($attributes))
- {
- foreach ($attributes as $attribute)
- {
- if (isset($element['attribs'][$attribute]['data']))
- {
- $element['attribs'][$attribute]['data'] = SimplePie_Misc::absolutize_url($element['attribs'][$attribute]['data'], $this->base);
- $new_element = SimplePie_Misc::element_implode($element);
- $data = str_replace($element['full'], $new_element, $data);
- $element['full'] = $new_element;
- }
- }
- }
- elseif (isset($element['attribs'][$attributes]['data']))
- {
- $element['attribs'][$attributes]['data'] = SimplePie_Misc::absolutize_url($element['attribs'][$attributes]['data'], $this->base);
- $data = str_replace($element['full'], SimplePie_Misc::element_implode($element), $data);
- }
- }
- }
- return $data;
- }
-
- public function do_strip_htmltags($match)
- {
- if ($this->encode_instead_of_strip)
- {
- if (isset($match[4]) && !in_array(strtolower($match[1]), array('script', 'style')))
- {
- $match[1] = htmlspecialchars($match[1], ENT_COMPAT, 'UTF-8');
- $match[2] = htmlspecialchars($match[2], ENT_COMPAT, 'UTF-8');
- return "<$match[1]$match[2]>$match[3]</$match[1]>";
- }
- else
- {
- return htmlspecialchars($match[0], ENT_COMPAT, 'UTF-8');
- }
- }
- elseif (isset($match[4]) && !in_array(strtolower($match[1]), array('script', 'style')))
- {
- return $match[4];
- }
- else
- {
- return '';
- }
- }
-}
-
-
-
-
-class SimplePie_Source
-{
- var $item;
- var $data = array();
-
- public function __construct($item, $data)
- {
- $this->item = $item;
- $this->data = $data;
- }
-
- public function __toString()
- {
- return md5(serialize($this->data));
- }
-
- public function get_source_tags($namespace, $tag)
- {
- if (isset($this->data['child'][$namespace][$tag]))
- {
- return $this->data['child'][$namespace][$tag];
- }
- else
- {
- return null;
- }
- }
-
- public function get_base($element = array())
- {
- return $this->item->get_base($element);
- }
-
- public function sanitize($data, $type, $base = '')
- {
- return $this->item->sanitize($data, $type, $base);
- }
-
- public function get_item()
- {
- return $this->item;
- }
-
- public function get_title()
- {
- if ($return = $this->get_source_tags(SIMPLEPIE_NAMESPACE_ATOM_10, 'title'))
- {
- return $this->sanitize($return[0]['data'], SimplePie_Misc::atom_10_construct_type($return[0]['attribs']), $this->get_base($return[0]));
- }
- elseif ($return = $this->get_source_tags(SIMPLEPIE_NAMESPACE_ATOM_03, 'title'))
- {
- return $this->sanitize($return[0]['data'], SimplePie_Misc::atom_03_construct_type($return[0]['attribs']), $this->get_base($return[0]));
- }
- elseif ($return = $this->get_source_tags(SIMPLEPIE_NAMESPACE_RSS_10, 'title'))
- {
- return $this->sanitize($return[0]['data'], SIMPLEPIE_CONSTRUCT_MAYBE_HTML, $this->get_base($return[0]));
- }
- elseif ($return = $this->get_source_tags(SIMPLEPIE_NAMESPACE_RSS_090, 'title'))
- {
- return $this->sanitize($return[0]['data'], SIMPLEPIE_CONSTRUCT_MAYBE_HTML, $this->get_base($return[0]));
- }
- elseif ($return = $this->get_source_tags(SIMPLEPIE_NAMESPACE_RSS_20, 'title'))
- {
- return $this->sanitize($return[0]['data'], SIMPLEPIE_CONSTRUCT_MAYBE_HTML, $this->get_base($return[0]));
- }
- elseif ($return = $this->get_source_tags(SIMPLEPIE_NAMESPACE_DC_11, 'title'))
- {
- return $this->sanitize($return[0]['data'], SIMPLEPIE_CONSTRUCT_TEXT);
- }
- elseif ($return = $this->get_source_tags(SIMPLEPIE_NAMESPACE_DC_10, 'title'))
- {
- return $this->sanitize($return[0]['data'], SIMPLEPIE_CONSTRUCT_TEXT);
- }
- else
- {
- return null;
- }
- }
-
- public function get_category($key = 0)
- {
- $categories = $this->get_categories();
- if (isset($categories[$key]))
- {
- return $categories[$key];
- }
- else
- {
- return null;
- }
- }
-
- public function get_categories()
- {
- $categories = array();
-
- foreach ((array) $this->get_source_tags(SIMPLEPIE_NAMESPACE_ATOM_10, 'category') as $category)
- {
- $term = null;
- $scheme = null;
- $label = null;
- if (isset($category['attribs']['']['term']))
- {
- $term = $this->sanitize($category['attribs']['']['term'], SIMPLEPIE_CONSTRUCT_TEXT);
- }
- if (isset($category['attribs']['']['scheme']))
- {
- $scheme = $this->sanitize($category['attribs']['']['scheme'], SIMPLEPIE_CONSTRUCT_TEXT);
- }
- if (isset($category['attribs']['']['label']))
- {
- $label = $this->sanitize($category['attribs']['']['label'], SIMPLEPIE_CONSTRUCT_TEXT);
- }
- $categories[] = new $this->item->feed->category_class($term, $scheme, $label);
- }
- foreach ((array) $this->get_source_tags(SIMPLEPIE_NAMESPACE_RSS_20, 'category') as $category)
- {
- $term = $this->sanitize($category['data'], SIMPLEPIE_CONSTRUCT_TEXT);
- if (isset($category['attribs']['']['domain']))
- {
- $scheme = $this->sanitize($category['attribs']['']['domain'], SIMPLEPIE_CONSTRUCT_TEXT);
- }
- else
- {
- $scheme = null;
- }
- $categories[] = new $this->item->feed->category_class($term, $scheme, null);
- }
- foreach ((array) $this->get_source_tags(SIMPLEPIE_NAMESPACE_DC_11, 'subject') as $category)
- {
- $categories[] = new $this->item->feed->category_class($this->sanitize($category['data'], SIMPLEPIE_CONSTRUCT_TEXT), null, null);
- }
- foreach ((array) $this->get_source_tags(SIMPLEPIE_NAMESPACE_DC_10, 'subject') as $category)
- {
- $categories[] = new $this->item->feed->category_class($this->sanitize($category['data'], SIMPLEPIE_CONSTRUCT_TEXT), null, null);
- }
-
- if (!empty($categories))
- {
- return SimplePie_Misc::array_unique($categories);
- }
- else
- {
- return null;
- }
- }
-
- public function get_author($key = 0)
- {
- $authors = $this->get_authors();
- if (isset($authors[$key]))
- {
- return $authors[$key];
- }
- else
- {
- return null;
- }
- }
-
- public function get_authors()
- {
- $authors = array();
- foreach ((array) $this->get_source_tags(SIMPLEPIE_NAMESPACE_ATOM_10, 'author') as $author)
- {
- $name = null;
- $uri = null;
- $email = null;
- if (isset($author['child'][SIMPLEPIE_NAMESPACE_ATOM_10]['name'][0]['data']))
- {
- $name = $this->sanitize($author['child'][SIMPLEPIE_NAMESPACE_ATOM_10]['name'][0]['data'], SIMPLEPIE_CONSTRUCT_TEXT);
- }
- if (isset($author['child'][SIMPLEPIE_NAMESPACE_ATOM_10]['uri'][0]['data']))
- {
- $uri = $this->sanitize($author['child'][SIMPLEPIE_NAMESPACE_ATOM_10]['uri'][0]['data'], SIMPLEPIE_CONSTRUCT_IRI, $this->get_base($author['child'][SIMPLEPIE_NAMESPACE_ATOM_10]['uri'][0]));
- }
- if (isset($author['child'][SIMPLEPIE_NAMESPACE_ATOM_10]['email'][0]['data']))
- {
- $email = $this->sanitize($author['child'][SIMPLEPIE_NAMESPACE_ATOM_10]['email'][0]['data'], SIMPLEPIE_CONSTRUCT_TEXT);
- }
- if ($name !== null || $email !== null || $uri !== null)
- {
- $authors[] = new $this->item->feed->author_class($name, $uri, $email);
- }
- }
- if ($author = $this->get_source_tags(SIMPLEPIE_NAMESPACE_ATOM_03, 'author'))
- {
- $name = null;
- $url = null;
- $email = null;
- if (isset($author[0]['child'][SIMPLEPIE_NAMESPACE_ATOM_03]['name'][0]['data']))
- {
- $name = $this->sanitize($author[0]['child'][SIMPLEPIE_NAMESPACE_ATOM_03]['name'][0]['data'], SIMPLEPIE_CONSTRUCT_TEXT);
- }
- if (isset($author[0]['child'][SIMPLEPIE_NAMESPACE_ATOM_03]['url'][0]['data']))
- {
- $url = $this->sanitize($author[0]['child'][SIMPLEPIE_NAMESPACE_ATOM_03]['url'][0]['data'], SIMPLEPIE_CONSTRUCT_IRI, $this->get_base($author[0]['child'][SIMPLEPIE_NAMESPACE_ATOM_03]['url'][0]));
- }
- if (isset($author[0]['child'][SIMPLEPIE_NAMESPACE_ATOM_03]['email'][0]['data']))
- {
- $email = $this->sanitize($author[0]['child'][SIMPLEPIE_NAMESPACE_ATOM_03]['email'][0]['data'], SIMPLEPIE_CONSTRUCT_TEXT);
- }
- if ($name !== null || $email !== null || $url !== null)
- {
- $authors[] = new $this->item->feed->author_class($name, $url, $email);
- }
- }
- foreach ((array) $this->get_source_tags(SIMPLEPIE_NAMESPACE_DC_11, 'creator') as $author)
- {
- $authors[] = new $this->item->feed->author_class($this->sanitize($author['data'], SIMPLEPIE_CONSTRUCT_TEXT), null, null);
- }
- foreach ((array) $this->get_source_tags(SIMPLEPIE_NAMESPACE_DC_10, 'creator') as $author)
- {
- $authors[] = new $this->item->feed->author_class($this->sanitize($author['data'], SIMPLEPIE_CONSTRUCT_TEXT), null, null);
- }
- foreach ((array) $this->get_source_tags(SIMPLEPIE_NAMESPACE_ITUNES, 'author') as $author)
- {
- $authors[] = new $this->item->feed->author_class($this->sanitize($author['data'], SIMPLEPIE_CONSTRUCT_TEXT), null, null);
- }
-
- if (!empty($authors))
- {
- return SimplePie_Misc::array_unique($authors);
- }
- else
- {
- return null;
- }
- }
-
- public function get_contributor($key = 0)
- {
- $contributors = $this->get_contributors();
- if (isset($contributors[$key]))
- {
- return $contributors[$key];
- }
- else
- {
- return null;
- }
- }
-
- public function get_contributors()
- {
- $contributors = array();
- foreach ((array) $this->get_source_tags(SIMPLEPIE_NAMESPACE_ATOM_10, 'contributor') as $contributor)
- {
- $name = null;
- $uri = null;
- $email = null;
- if (isset($contributor['child'][SIMPLEPIE_NAMESPACE_ATOM_10]['name'][0]['data']))
- {
- $name = $this->sanitize($contributor['child'][SIMPLEPIE_NAMESPACE_ATOM_10]['name'][0]['data'], SIMPLEPIE_CONSTRUCT_TEXT);
- }
- if (isset($contributor['child'][SIMPLEPIE_NAMESPACE_ATOM_10]['uri'][0]['data']))
- {
- $uri = $this->sanitize($contributor['child'][SIMPLEPIE_NAMESPACE_ATOM_10]['uri'][0]['data'], SIMPLEPIE_CONSTRUCT_IRI, $this->get_base($contributor['child'][SIMPLEPIE_NAMESPACE_ATOM_10]['uri'][0]));
- }
- if (isset($contributor['child'][SIMPLEPIE_NAMESPACE_ATOM_10]['email'][0]['data']))
- {
- $email = $this->sanitize($contributor['child'][SIMPLEPIE_NAMESPACE_ATOM_10]['email'][0]['data'], SIMPLEPIE_CONSTRUCT_TEXT);
- }
- if ($name !== null || $email !== null || $uri !== null)
- {
- $contributors[] = new $this->item->feed->author_class($name, $uri, $email);
- }
- }
- foreach ((array) $this->get_source_tags(SIMPLEPIE_NAMESPACE_ATOM_03, 'contributor') as $contributor)
- {
- $name = null;
- $url = null;
- $email = null;
- if (isset($contributor['child'][SIMPLEPIE_NAMESPACE_ATOM_03]['name'][0]['data']))
- {
- $name = $this->sanitize($contributor['child'][SIMPLEPIE_NAMESPACE_ATOM_03]['name'][0]['data'], SIMPLEPIE_CONSTRUCT_TEXT);
- }
- if (isset($contributor['child'][SIMPLEPIE_NAMESPACE_ATOM_03]['url'][0]['data']))
- {
- $url = $this->sanitize($contributor['child'][SIMPLEPIE_NAMESPACE_ATOM_03]['url'][0]['data'], SIMPLEPIE_CONSTRUCT_IRI, $this->get_base($contributor['child'][SIMPLEPIE_NAMESPACE_ATOM_03]['url'][0]));
- }
- if (isset($contributor['child'][SIMPLEPIE_NAMESPACE_ATOM_03]['email'][0]['data']))
- {
- $email = $this->sanitize($contributor['child'][SIMPLEPIE_NAMESPACE_ATOM_03]['email'][0]['data'], SIMPLEPIE_CONSTRUCT_TEXT);
- }
- if ($name !== null || $email !== null || $url !== null)
- {
- $contributors[] = new $this->item->feed->author_class($name, $url, $email);
- }
- }
-
- if (!empty($contributors))
- {
- return SimplePie_Misc::array_unique($contributors);
- }
- else
- {
- return null;
- }
- }
-
- public function get_link($key = 0, $rel = 'alternate')
- {
- $links = $this->get_links($rel);
- if (isset($links[$key]))
- {
- return $links[$key];
- }
- else
- {
- return null;
- }
- }
-
-
- public function get_permalink()
- {
- return $this->get_link(0);
- }
-
- public function get_links($rel = 'alternate')
- {
- if (!isset($this->data['links']))
- {
- $this->data['links'] = array();
- if ($links = $this->get_source_tags(SIMPLEPIE_NAMESPACE_ATOM_10, 'link'))
- {
- foreach ($links as $link)
- {
- if (isset($link['attribs']['']['href']))
- {
- $link_rel = (isset($link['attribs']['']['rel'])) ? $link['attribs']['']['rel'] : 'alternate';
- $this->data['links'][$link_rel][] = $this->sanitize($link['attribs']['']['href'], SIMPLEPIE_CONSTRUCT_IRI, $this->get_base($link));
- }
- }
- }
- if ($links = $this->get_source_tags(SIMPLEPIE_NAMESPACE_ATOM_03, 'link'))
- {
- foreach ($links as $link)
- {
- if (isset($link['attribs']['']['href']))
- {
- $link_rel = (isset($link['attribs']['']['rel'])) ? $link['attribs']['']['rel'] : 'alternate';
- $this->data['links'][$link_rel][] = $this->sanitize($link['attribs']['']['href'], SIMPLEPIE_CONSTRUCT_IRI, $this->get_base($link));
-
- }
- }
- }
- if ($links = $this->get_source_tags(SIMPLEPIE_NAMESPACE_RSS_10, 'link'))
- {
- $this->data['links']['alternate'][] = $this->sanitize($links[0]['data'], SIMPLEPIE_CONSTRUCT_IRI, $this->get_base($links[0]));
- }
- if ($links = $this->get_source_tags(SIMPLEPIE_NAMESPACE_RSS_090, 'link'))
- {
- $this->data['links']['alternate'][] = $this->sanitize($links[0]['data'], SIMPLEPIE_CONSTRUCT_IRI, $this->get_base($links[0]));
- }
- if ($links = $this->get_source_tags(SIMPLEPIE_NAMESPACE_RSS_20, 'link'))
- {
- $this->data['links']['alternate'][] = $this->sanitize($links[0]['data'], SIMPLEPIE_CONSTRUCT_IRI, $this->get_base($links[0]));
- }
-
- $keys = array_keys($this->data['links']);
- foreach ($keys as $key)
- {
- if (SimplePie_Misc::is_isegment_nz_nc($key))
- {
- if (isset($this->data['links'][SIMPLEPIE_IANA_LINK_RELATIONS_REGISTRY . $key]))
- {
- $this->data['links'][SIMPLEPIE_IANA_LINK_RELATIONS_REGISTRY . $key] = array_merge($this->data['links'][$key], $this->data['links'][SIMPLEPIE_IANA_LINK_RELATIONS_REGISTRY . $key]);
- $this->data['links'][$key] =& $this->data['links'][SIMPLEPIE_IANA_LINK_RELATIONS_REGISTRY . $key];
- }
- else
- {
- $this->data['links'][SIMPLEPIE_IANA_LINK_RELATIONS_REGISTRY . $key] =& $this->data['links'][$key];
- }
- }
- elseif (substr($key, 0, 41) === SIMPLEPIE_IANA_LINK_RELATIONS_REGISTRY)
- {
- $this->data['links'][substr($key, 41)] =& $this->data['links'][$key];
- }
- $this->data['links'][$key] = array_unique($this->data['links'][$key]);
- }
- }
-
- if (isset($this->data['links'][$rel]))
- {
- return $this->data['links'][$rel];
- }
- else
- {
- return null;
- }
- }
-
- public function get_description()
- {
- if ($return = $this->get_source_tags(SIMPLEPIE_NAMESPACE_ATOM_10, 'subtitle'))
- {
- return $this->sanitize($return[0]['data'], SimplePie_Misc::atom_10_construct_type($return[0]['attribs']), $this->get_base($return[0]));
- }
- elseif ($return = $this->get_source_tags(SIMPLEPIE_NAMESPACE_ATOM_03, 'tagline'))
- {
- return $this->sanitize($return[0]['data'], SimplePie_Misc::atom_03_construct_type($return[0]['attribs']), $this->get_base($return[0]));
- }
- elseif ($return = $this->get_source_tags(SIMPLEPIE_NAMESPACE_RSS_10, 'description'))
- {
- return $this->sanitize($return[0]['data'], SIMPLEPIE_CONSTRUCT_MAYBE_HTML, $this->get_base($return[0]));
- }
- elseif ($return = $this->get_source_tags(SIMPLEPIE_NAMESPACE_RSS_090, 'description'))
- {
- return $this->sanitize($return[0]['data'], SIMPLEPIE_CONSTRUCT_MAYBE_HTML, $this->get_base($return[0]));
- }
- elseif ($return = $this->get_source_tags(SIMPLEPIE_NAMESPACE_RSS_20, 'description'))
- {
- return $this->sanitize($return[0]['data'], SIMPLEPIE_CONSTRUCT_MAYBE_HTML, $this->get_base($return[0]));
- }
- elseif ($return = $this->get_source_tags(SIMPLEPIE_NAMESPACE_DC_11, 'description'))
- {
- return $this->sanitize($return[0]['data'], SIMPLEPIE_CONSTRUCT_TEXT);
- }
- elseif ($return = $this->get_source_tags(SIMPLEPIE_NAMESPACE_DC_10, 'description'))
- {
- return $this->sanitize($return[0]['data'], SIMPLEPIE_CONSTRUCT_TEXT);
- }
- elseif ($return = $this->get_source_tags(SIMPLEPIE_NAMESPACE_ITUNES, 'summary'))
- {
- return $this->sanitize($return[0]['data'], SIMPLEPIE_CONSTRUCT_HTML, $this->get_base($return[0]));
- }
- elseif ($return = $this->get_source_tags(SIMPLEPIE_NAMESPACE_ITUNES, 'subtitle'))
- {
- return $this->sanitize($return[0]['data'], SIMPLEPIE_CONSTRUCT_HTML, $this->get_base($return[0]));
- }
- else
- {
- return null;
- }
- }
-
- public function get_copyright()
- {
- if ($return = $this->get_source_tags(SIMPLEPIE_NAMESPACE_ATOM_10, 'rights'))
- {
- return $this->sanitize($return[0]['data'], SimplePie_Misc::atom_10_construct_type($return[0]['attribs']), $this->get_base($return[0]));
- }
- elseif ($return = $this->get_source_tags(SIMPLEPIE_NAMESPACE_ATOM_03, 'copyright'))
- {
- return $this->sanitize($return[0]['data'], SimplePie_Misc::atom_03_construct_type($return[0]['attribs']), $this->get_base($return[0]));
- }
- elseif ($return = $this->get_source_tags(SIMPLEPIE_NAMESPACE_RSS_20, 'copyright'))
- {
- return $this->sanitize($return[0]['data'], SIMPLEPIE_CONSTRUCT_TEXT);
- }
- elseif ($return = $this->get_source_tags(SIMPLEPIE_NAMESPACE_DC_11, 'rights'))
- {
- return $this->sanitize($return[0]['data'], SIMPLEPIE_CONSTRUCT_TEXT);
- }
- elseif ($return = $this->get_source_tags(SIMPLEPIE_NAMESPACE_DC_10, 'rights'))
- {
- return $this->sanitize($return[0]['data'], SIMPLEPIE_CONSTRUCT_TEXT);
- }
- else
- {
- return null;
- }
- }
-
- public function get_language()
- {
- if ($return = $this->get_source_tags(SIMPLEPIE_NAMESPACE_RSS_20, 'language'))
- {
- return $this->sanitize($return[0]['data'], SIMPLEPIE_CONSTRUCT_TEXT);
- }
- elseif ($return = $this->get_source_tags(SIMPLEPIE_NAMESPACE_DC_11, 'language'))
- {
- return $this->sanitize($return[0]['data'], SIMPLEPIE_CONSTRUCT_TEXT);
- }
- elseif ($return = $this->get_source_tags(SIMPLEPIE_NAMESPACE_DC_10, 'language'))
- {
- return $this->sanitize($return[0]['data'], SIMPLEPIE_CONSTRUCT_TEXT);
- }
- elseif (isset($this->data['xml_lang']))
- {
- return $this->sanitize($this->data['xml_lang'], SIMPLEPIE_CONSTRUCT_TEXT);
- }
- else
- {
- return null;
- }
- }
-
- public function get_latitude()
- {
- if ($return = $this->get_source_tags(SIMPLEPIE_NAMESPACE_W3C_BASIC_GEO, 'lat'))
- {
- return (float) $return[0]['data'];
- }
- elseif (($return = $this->get_source_tags(SIMPLEPIE_NAMESPACE_GEORSS, 'point')) && preg_match('/^((?:-)?[0-9]+(?:\.[0-9]+)) ((?:-)?[0-9]+(?:\.[0-9]+))$/', trim($return[0]['data']), $match))
- {
- return (float) $match[1];
- }
- else
- {
- return null;
- }
- }
-
- public function get_longitude()
- {
- if ($return = $this->get_source_tags(SIMPLEPIE_NAMESPACE_W3C_BASIC_GEO, 'long'))
- {
- return (float) $return[0]['data'];
- }
- elseif ($return = $this->get_source_tags(SIMPLEPIE_NAMESPACE_W3C_BASIC_GEO, 'lon'))
- {
- return (float) $return[0]['data'];
- }
- elseif (($return = $this->get_source_tags(SIMPLEPIE_NAMESPACE_GEORSS, 'point')) && preg_match('/^((?:-)?[0-9]+(?:\.[0-9]+)) ((?:-)?[0-9]+(?:\.[0-9]+))$/', trim($return[0]['data']), $match))
- {
- return (float) $match[2];
- }
- else
- {
- return null;
- }
- }
-
- public function get_image_url()
- {
- if ($return = $this->get_source_tags(SIMPLEPIE_NAMESPACE_ITUNES, 'image'))
- {
- return $this->sanitize($return[0]['attribs']['']['href'], SIMPLEPIE_CONSTRUCT_IRI);
- }
- elseif ($return = $this->get_source_tags(SIMPLEPIE_NAMESPACE_ATOM_10, 'logo'))
- {
- return $this->sanitize($return[0]['data'], SIMPLEPIE_CONSTRUCT_IRI, $this->get_base($return[0]));
- }
- elseif ($return = $this->get_source_tags(SIMPLEPIE_NAMESPACE_ATOM_10, 'icon'))
- {
- return $this->sanitize($return[0]['data'], SIMPLEPIE_CONSTRUCT_IRI, $this->get_base($return[0]));
- }
- else
- {
- return null;
- }
- }
-}
-
-
-
-
-
-
-class SimplePie_XML_Declaration_Parser
-{
-
- var $version = '1.0';
-
-
- var $encoding = 'UTF-8';
-
-
- var $standalone = false;
-
-
- var $state = 'before_version_name';
-
-
- var $data = '';
-
-
- var $data_length = 0;
-
-
- var $position = 0;
-
-
- public function __construct($data)
- {
- $this->data = $data;
- $this->data_length = strlen($this->data);
- }
-
-
- public function parse()
- {
- while ($this->state && $this->state !== 'emit' && $this->has_data())
- {
- $state = $this->state;
- $this->$state();
- }
- $this->data = '';
- if ($this->state === 'emit')
- {
- return true;
- }
- else
- {
- $this->version = '';
- $this->encoding = '';
- $this->standalone = '';
- return false;
- }
- }
-
-
- public function has_data()
- {
- return (bool) ($this->position < $this->data_length);
- }
-
-
- public function skip_whitespace()
- {
- $whitespace = strspn($this->data, "\x09\x0A\x0D\x20", $this->position);
- $this->position += $whitespace;
- return $whitespace;
- }
-
-
- public function get_value()
- {
- $quote = substr($this->data, $this->position, 1);
- if ($quote === '"' || $quote === "'")
- {
- $this->position++;
- $len = strcspn($this->data, $quote, $this->position);
- if ($this->has_data())
- {
- $value = substr($this->data, $this->position, $len);
- $this->position += $len + 1;
- return $value;
- }
- }
- return false;
- }
-
- public function before_version_name()
- {
- if ($this->skip_whitespace())
- {
- $this->state = 'version_name';
- }
- else
- {
- $this->state = false;
- }
- }
-
- public function version_name()
- {
- if (substr($this->data, $this->position, 7) === 'version')
- {
- $this->position += 7;
- $this->skip_whitespace();
- $this->state = 'version_equals';
- }
- else
- {
- $this->state = false;
- }
- }
-
- public function version_equals()
- {
- if (substr($this->data, $this->position, 1) === '=')
- {
- $this->position++;
- $this->skip_whitespace();
- $this->state = 'version_value';
- }
- else
- {
- $this->state = false;
- }
- }
-
- public function version_value()
- {
- if ($this->version = $this->get_value())
- {
- $this->skip_whitespace();
- if ($this->has_data())
- {
- $this->state = 'encoding_name';
- }
- else
- {
- $this->state = 'emit';
- }
- }
- else
- {
- $this->state = false;
- }
- }
-
- public function encoding_name()
- {
- if (substr($this->data, $this->position, 8) === 'encoding')
- {
- $this->position += 8;
- $this->skip_whitespace();
- $this->state = 'encoding_equals';
- }
- else
- {
- $this->state = 'standalone_name';
- }
- }
-
- public function encoding_equals()
- {
- if (substr($this->data, $this->position, 1) === '=')
- {
- $this->position++;
- $this->skip_whitespace();
- $this->state = 'encoding_value';
- }
- else
- {
- $this->state = false;
- }
- }
-
- public function encoding_value()
- {
- if ($this->encoding = $this->get_value())
- {
- $this->skip_whitespace();
- if ($this->has_data())
- {
- $this->state = 'standalone_name';
- }
- else
- {
- $this->state = 'emit';
- }
- }
- else
- {
- $this->state = false;
- }
- }
-
- public function standalone_name()
- {
- if (substr($this->data, $this->position, 10) === 'standalone')
- {
- $this->position += 10;
- $this->skip_whitespace();
- $this->state = 'standalone_equals';
- }
- else
- {
- $this->state = false;
- }
- }
-
- public function standalone_equals()
- {
- if (substr($this->data, $this->position, 1) === '=')
- {
- $this->position++;
- $this->skip_whitespace();
- $this->state = 'standalone_value';
- }
- else
- {
- $this->state = false;
- }
- }
-
- public function standalone_value()
- {
- if ($standalone = $this->get_value())
- {
- switch ($standalone)
- {
- case 'yes':
- $this->standalone = true;
- break;
-
- case 'no':
- $this->standalone = false;
- break;
-
- default:
- $this->state = false;
- return;
- }
-
- $this->skip_whitespace();
- if ($this->has_data())
- {
- $this->state = false;
- }
- else
- {
- $this->state = 'emit';
- }
- }
- else
- {
- $this->state = false;
- }
- }
-}
\ No newline at end of file
diff --git a/makefulltextfeed.php b/makefulltextfeed.php
index f2df18a..782c90e 100644
--- a/makefulltextfeed.php
+++ b/makefulltextfeed.php
@@ -1,10 +1,12 @@
'simplepie/simplepie.class.php',
- 'SimplePie_Misc' => 'simplepie/simplepie.class.php',
- 'SimplePie_HTTP_Parser' => 'simplepie/simplepie.class.php',
- 'SimplePie_File' => 'simplepie/simplepie.class.php',
+ // 'SimplePie' => 'simplepie/simplepie.class.php',
+ // 'SimplePie_Misc' => 'simplepie/simplepie.class.php',
+ // 'SimplePie_HTTP_Parser' => 'simplepie/simplepie.class.php',
+ // 'SimplePie_File' => 'simplepie/simplepie.class.php',
// Include FeedCreator for RSS/Atom creation
'FeedWriter' => 'feedwriter/FeedWriter.php',
'FeedItem' => 'feedwriter/FeedItem.php',
@@ -58,11 +59,13 @@ function __autoload($class_name) {
'SimplePie_HumbleHttpAgent' => 'humble-http-agent/SimplePie_HumbleHttpAgent.php',
'CookieJar' => 'humble-http-agent/CookieJar.php',
// Include IRI class for resolving relative URLs
- 'IRI' => 'iri/iri.php',
+ // 'IRI' => 'iri/iri.php',
// Include Zend Cache to improve performance (cache results)
'Zend_Cache' => 'Zend/Cache.php',
// Include Zend CSS to XPath for dealing with custom patterns
- 'Zend_Dom_Query_Css2Xpath' => 'Zend/Dom/Query/Css2Xpath.php'
+ 'Zend_Dom_Query_Css2Xpath' => 'Zend/Dom/Query/Css2Xpath.php',
+ // Language detect
+ 'Text_LanguageDetect' => 'language-detect/LanguageDetect.php'
);
if (isset($mapping[$class_name])) {
//echo "Loading $class_name\n
";
@@ -72,12 +75,27 @@ function __autoload($class_name) {
return false;
}
}
+spl_autoload_register('autoload');
+require_once 'libraries/simplepie/SimplePieAutoloader.php';
+// always include Simplepie_Core as it defines constants which other SimplePie components
+// assume will always be available.
+require_once 'libraries/simplepie/SimplePie/Core.php';
////////////////////////////////
-// Load config file if it exists
+// Load config file
////////////////////////////////
require_once(dirname(__FILE__).'/config.php');
+////////////////////////////////
+// Prevent indexing/following by search engines because:
+// 1. The content is already public and presumably indexed (why create duplicates?)
+// 2. Not doing so might increase number of requests from search engines, thus increasing server load
+// Note: feed readers and services such as Yahoo Pipes will not be affected by this header.
+// Note: Using Disallow in a robots.txt file will be more effective (search engines will check
+// that before even requesting makefulltextfeed.php).
+////////////////////////////////
+header('X-Robots-Tag: noindex, nofollow');
+
////////////////////////////////
// Check if service is enabled
////////////////////////////////
@@ -92,6 +110,9 @@ if (!isset($_GET['url'])) {
die('No URL supplied');
}
$url = trim($_GET['url']);
+if (strtolower(substr($url, 0, 7)) == 'feed://') {
+ $url = 'http://'.substr($url, 7);
+}
if (!preg_match('!^https?://.+!i', $url)) {
$url = 'http://'.$url;
}
@@ -114,13 +135,14 @@ if ($test !== false && $test !== null && preg_match('!^https?://!', $url)) {
////////////////////////////////
if ($options->alternative_url != '' && !isset($_GET['redir']) && mt_rand(0, 100) > 50) {
$redirect = $options->alternative_url.'?redir=true&url='.urlencode($url);
- if (isset($_GET['html'])) $redirect .= '&html='.urlencode($_GET['html']);
+ if (isset($_GET['html'])) $redirect .= '&html='.urlencode($_GET['html']);
if (isset($_GET['key'])) $redirect .= '&key='.urlencode($_GET['key']);
if (isset($_GET['max'])) $redirect .= '&max='.(int)$_GET['max'];
if (isset($_GET['links'])) $redirect .= '&links='.$_GET['links'];
if (isset($_GET['exc'])) $redirect .= '&exc='.$_GET['exc'];
- if (isset($_GET['what'])) $redirect .= '&what='.$_GET['what'];
- if (isset($_GET['format'])) $redirect .= '&format='.$_GET['format'];
+ if (isset($_GET['what'])) $redirect .= '&what='.$_GET['what'];
+ if (isset($_GET['format'])) $redirect .= '&format='.$_GET['format'];
+ if (isset($_GET['l'])) $redirect .= '&l='.urlencode($_GET['l']); // forward the language-detection option under its own name (was wrongly sent as 'format')
header("Location: $redirect");
exit;
}
@@ -140,6 +162,7 @@ if (isset($_GET['key']) && ($key_index = array_search($_GET['key'], $options->ap
if (isset($_GET['exc'])) $redirect .= '&exc='.urlencode($_GET['exc']);
if (isset($_GET['what'])) $redirect .= '&what='.urlencode($_GET['what']);
if (isset($_GET['format'])) $redirect .= '&format='.urlencode($_GET['format']);
+ if (isset($_GET['l'])) $redirect .= '&l='.urlencode($_GET['l']);
header("Location: $redirect");
exit;
}
@@ -166,6 +189,15 @@ $valid_key = false;
if (isset($_GET['key']) && isset($_GET['hash']) && isset($options->api_keys[(int)$_GET['key']])) {
$valid_key = ($_GET['hash'] == sha1($options->api_keys[(int)$_GET['key']].$url));
}
+$key_index = ($valid_key) ? (int)$_GET['key'] : 0;
+if (!$valid_key && $options->key_required) {
+ die('A valid key must be supplied');
+}
+if (!$valid_key && isset($_GET['key']) && $_GET['key'] != '') {
+ die('The entered key is invalid');
+}
+
+if (file_exists('custom_init.php')) require 'custom_init.php';
///////////////////////////////////////////////
// Check URL against list of blacklisted URLs
@@ -209,6 +241,30 @@ if ($options->exclude_items_on_fail == 'user') {
$exclude_on_fail = $options->exclude_items_on_fail;
}
+///////////////////////////////////////////////
+// Detect language
+///////////////////////////////////////////////
+if ((string)$options->detect_language == 'user') {
+ if (isset($_GET['l'])) {
+ $detect_language = (int)$_GET['l'];
+ } else {
+ $detect_language = 1;
+ }
+} else {
+ $detect_language = $options->detect_language;
+}
+
+if ($detect_language >= 2) {
+ $language_codes = array('albanian' => 'sq','arabic' => 'ar','azeri' => 'az','bengali' => 'bn','bulgarian' => 'bg',
+ 'cebuano' => 'ceb', // ISO 639-2
+ 'croatian' => 'hr','czech' => 'cs','danish' => 'da','dutch' => 'nl','english' => 'en','estonian' => 'et','farsi' => 'fa','finnish' => 'fi','french' => 'fr','german' => 'de','hausa' => 'ha',
+ 'hawaiian' => 'haw', // ISO 639-2
+ 'hindi' => 'hi','hungarian' => 'hu','icelandic' => 'is','indonesian' => 'id','italian' => 'it','kazakh' => 'kk','kyrgyz' => 'ky','latin' => 'la','latvian' => 'lv','lithuanian' => 'lt','macedonian' => 'mk','mongolian' => 'mn','nepali' => 'ne','norwegian' => 'no','pashto' => 'ps',
+ 'pidgin' => 'cpe', // ISO 639-2
+ 'polish' => 'pl','portuguese' => 'pt','romanian' => 'ro','russian' => 'ru','serbian' => 'sr','slovak' => 'sk','slovene' => 'sl','somali' => 'so','spanish' => 'es','swahili' => 'sw','swedish' => 'sv','tagalog' => 'tl','turkish' => 'tr','ukrainian' => 'uk','urdu' => 'ur','uzbek' => 'uz','vietnamese' => 'vi','welsh' => 'cy');
+}
+$use_cld = extension_loaded('cld') && (version_compare(PHP_VERSION, '5.3.0') >= 0);
+
///////////////////////////////////////////////
// Extraction pattern
///////////////////////////////////////////////
@@ -278,7 +334,7 @@ if ($options->caching) {
// getting a Zend_Cache_Core object
$cache = Zend_Cache::factory('Core', 'File', $frontendOptions, $backendOptions);
- $cache_id = md5($max.$url.$valid_key.$links.$exclude_on_fail.$auto_extract.$extract_pattern.$format.(int)isset($_GET['pubsub']));
+ $cache_id = md5($max.$url.$valid_key.$links.$exclude_on_fail.$auto_extract.$extract_pattern.$format.(int)$detect_language.(int)isset($_GET['pubsub'])); // key on the resolved language mode so each 'l' setting is cached separately
if ($data = $cache->load($cache_id)) {
if ($format == 'json') {
@@ -305,11 +361,15 @@ if ($valid_key) {
// Set up HTTP agent
//////////////////////////////////
$http = new HumbleHttpAgent();
+$http->userAgentMap = $options->user_agents;
+$http->headerOnlyTypes = array_keys($options->content_type_exc);
+$http->rewriteUrls = $options->rewrite_url;
//////////////////////////////////
// Set up Content Extractor
//////////////////////////////////
$extractor = new ContentExtractor(dirname(__FILE__).'/site_config/custom', dirname(__FILE__).'/site_config/standard');
+$extractor->fingerprints = $options->fingerprints;
/*
if ($options->caching) {
@@ -339,9 +399,13 @@ if ($options->caching) {
// Get RSS/Atom feed
////////////////////////////////
if (!$html_only) {
+ // Send user agent header showing PHP (prevents a HTML response from feedburner)
+ $http->userAgentDefault = HumbleHttpAgent::UA_PHP;
// configure SimplePie HTTP extension class to use our HumbleHttpAgent instance
SimplePie_HumbleHttpAgent::set_agent($http);
$feed = new SimplePie();
+ // some feeds use the text/html content type - force_feed tells SimplePie to process anyway
+ $feed->force_feed(true);
$feed->set_file_class('SimplePie_HumbleHttpAgent');
//$feed->set_feed_url($url); // colons appearing in the URL's path get encoded
$feed->feed_url = $url;
@@ -359,6 +423,8 @@ if (!$html_only) {
if ($result && (!is_array($feed->data) || count($feed->data) == 0)) {
die('Sorry, no feed items found');
}
+ // from now on, we'll identify ourselves as a browser
+ $http->userAgentDefault = HumbleHttpAgent::UA_BROWSER;
}
////////////////////////////////////////////////////////////////////////////////
@@ -377,6 +443,7 @@ if ($html_only || !$result) {
public function get_title() { return ''; }
public function get_description() { return 'Content extracted from '.$this->item->url; }
public function get_link() { return $this->item->url; }
+ public function get_language() { return false; }
public function get_image_url() { return false; }
public function get_items($start=0, $max=1) { return array(0=>$this->item); }
}
@@ -386,8 +453,11 @@ if ($html_only || !$result) {
public function get_permalink() { return $this->url; }
public function get_title() { return ''; }
public function get_date($format='') { return false; }
- public function get_author() { return false; }
+ public function get_author($key=0) { return null; }
+ public function get_authors() { return null; }
public function get_description() { return ''; }
+ public function get_enclosure($key=0, $prefer=null) { return null; }
+ public function get_enclosures() { return null; }
}
$feed = new DummySingleItemFeed($url);
}
@@ -408,10 +478,12 @@ $output->setLink($feed->get_link()); // Google Reader uses this for pulling in f
if ($img_url = $feed->get_image_url()) {
$output->setImage($feed->get_title(), $feed->get_link(), $img_url);
}
+/*
if ($format == 'atom') {
$output->setChannelElement('updated', date(DATE_ATOM));
$output->setChannelElement('author', array('name'=>'Five Filters', 'uri'=>'http://fivefilters.org'));
}
+*/
////////////////////////////////////////////
// Loop through feed items
@@ -424,7 +496,9 @@ foreach ($items as $key => $item) {
$permalink = htmlspecialchars_decode($item->get_permalink());
// Colons in URL path segments get encoded by SimplePie, yet some sites expect them unencoded
$permalink = str_replace('%3A', ':', $permalink);
- $permalink = $http->validateUrl($permalink);
+ // validateUrl() strips non-ascii characters
+ // simplepie already sanitizes URLs so let's not do it again here.
+ //$permalink = $http->validateUrl($permalink);
if ($permalink) {
$urls_sanitized[] = $permalink;
}
@@ -433,8 +507,13 @@ foreach ($items as $key => $item) {
$http->fetchAll($urls_sanitized);
//$http->cacheAll();
+// count number of items added to full feed
+$item_count = 0;
+
foreach ($items as $key => $item) {
+ $do_content_extraction = true;
$extract_result = false;
+ $text_sample = null;
$permalink = $urls[$key];
$newitem = $output->createNewItem();
$newitem->setTitle(htmlspecialchars_decode($item->get_title()));
@@ -451,129 +530,256 @@ foreach ($items as $key => $item) {
$newitem->setLink($item->get_permalink());
}
}
- if ($permalink && ($response = $http->get($permalink, true)) && $response['status_code'] < 300) {
+ // TODO: Allow error codes - some sites return correct content with error status
+ // e.g. prospectmagazine.co.uk returns 403
+ //if ($permalink && ($response = $http->get($permalink, true)) && $response['status_code'] < 300) {
+ if ($permalink && ($response = $http->get($permalink, true)) && ($response['status_code'] < 300 || $response['status_code'] > 400)) {
$effective_url = $response['effective_url'];
if (!url_allowed($effective_url)) continue;
- $html = $response['body'];
- // remove strange things
- $html = str_replace('[>', '', $html);
- $html = convert_to_utf8($html, $response['headers']);
- if ($auto_extract) {
- // check site config for single page URL - fetch it if found
- if ($single_page_response = getSinglePage($item, $html, $effective_url)) {
- $html = $single_page_response['body'];
- // remove strange things
- $html = str_replace('[>', '', $html);
- $html = convert_to_utf8($html, $single_page_response['headers']);
- $effective_url = $single_page_response['effective_url'];
- unset($single_page_response);
+ // check if action defined for returned Content-Type
+ $type = null;
+ if (preg_match('!^Content-Type:\s*(([-\w]+)/([-\w\+]+))!im', $response['headers'], $match)) {
+ // look for full mime type (e.g. image/jpeg) or just type (e.g. image)
+ $match[1] = strtolower(trim($match[1]));
+ $match[2] = strtolower(trim($match[2]));
+ foreach (array($match[1], $match[2]) as $_mime) {
+ if (isset($options->content_type_exc[$_mime])) {
+ $type = $match[1];
+ $_act = $options->content_type_exc[$_mime]['action'];
+ $_name = $options->content_type_exc[$_mime]['name'];
+ if ($_act == 'exclude') {
+ continue 2; // skip this feed item entry
+ } elseif ($_act == 'link') {
+ if ($match[2] == 'image') {
+ $html = "<a href=\"$effective_url\"><img src=\"$effective_url\" alt=\"$_name\" /></a>"; // show the image inline, linked to its source
+ } else {
+ $html = "<a href=\"$effective_url\">Download $_name</a>"; // other matched types become a download link
+ }
+ $title = $_name;
+ $do_content_extraction = false;
+ break;
+ }
+ }
+ }
+ unset($_mime, $_act, $_name, $match);
+ }
+ if ($do_content_extraction) {
+ $html = $response['body'];
+ // remove strange things
+ $html = str_replace('[>', '', $html);
+ $html = convert_to_utf8($html, $response['headers']);
+ if ($auto_extract) {
+ // check site config for single page URL - fetch it if found
+ if ($single_page_response = getSinglePage($item, $html, $effective_url)) {
+ $html = $single_page_response['body'];
+ // remove strange things
+ $html = str_replace('[>', '', $html);
+ $html = convert_to_utf8($html, $single_page_response['headers']);
+ $effective_url = $single_page_response['effective_url'];
+ unset($single_page_response);
+ }
+ $extract_result = $extractor->process($html, $effective_url);
+ $readability = $extractor->readability;
+ $content_block = ($extract_result) ? $extractor->getContent() : null;
+ $title = ($extract_result) ? $extractor->getTitle() : '';
+ } else {
+ $readability = new Readability($html, $effective_url);
+ // content block is entire document (for now...)
+ $content_block = $readability->dom;
+ //TODO: get title
+ $title = '';
}
- $extract_result = $extractor->process($html, $effective_url);
- $readability = $extractor->readability;
- $content_block = ($extract_result) ? $extractor->getContent() : null;
- $title = ($extract_result) ? $extractor->getTitle() : '';
- } else {
- $readability = new Readability($html, $effective_url);
- // content block is entire document (for now...)
- $content_block = $readability->dom;
- //TODO: get title
- $title = '';
}
// use extracted title for both feed and item title if we're using single-item dummy feed
if ($isDummyFeed) {
$output->setTitle($title);
$newitem->setTitle($title);
}
- if ($extract_pattern && isset($content_block)) {
- $xpath = new DOMXPath($readability->dom);
- $elems = @$xpath->query($extract_pattern, $content_block);
- // check if our custom extraction pattern matched
- if ($elems && $elems->length > 0) {
- $extract_result = true;
- // get the first matched element
- $content_block = $elems->item(0);
- // clean it up
- $readability->removeScripts($content_block);
- $readability->prepArticle($content_block);
+ if ($do_content_extraction) {
+ if ($extract_pattern && isset($content_block)) {
+ $xpath = new DOMXPath($readability->dom);
+ $elems = @$xpath->query($extract_pattern, $content_block);
+ // check if our custom extraction pattern matched
+ if ($elems && $elems->length > 0) {
+ $extract_result = true;
+ // get the first matched element
+ $content_block = $elems->item(0);
+ // clean it up
+ $readability->removeScripts($content_block);
+ $readability->prepArticle($content_block);
+ }
}
}
}
- // if we failed to extract content...
- if (!$extract_result) {
- if ($exclude_on_fail) continue; // skip this and move to next item
- if (!$valid_key) {
- $html = $options->error_message;
- } else {
- $html = $options->error_message_with_key;
- }
- // keep the original item description
- $html .= $item->get_description();
- } else {
- $readability->clean($content_block, 'select');
- if ($options->rewrite_relative_urls) makeAbsolute($effective_url, $content_block);
- // footnotes
- if (($links == 'footnotes') && (strpos($effective_url, 'wikipedia.org') === false)) {
- $readability->addFootnotes($content_block);
- }
- if ($extract_pattern) {
- // get outerHTML
- $html = $content_block->ownerDocument->saveXML($content_block);
- } else {
- if ($content_block->childNodes->length == 1 && $content_block->firstChild->nodeType === XML_ELEMENT_NODE) {
- $html = $content_block->firstChild->innerHTML;
+ if ($do_content_extraction) {
+ // if we failed to extract content...
+ if (!$extract_result) {
+ if ($exclude_on_fail) continue; // skip this and move to next item
+ //TODO: get text sample for language detection
+ if (!$valid_key) {
+ $html = $options->error_message;
} else {
- $html = $content_block->innerHTML;
+ $html = $options->error_message_with_key;
+ }
+ // keep the original item description
+ $html .= $item->get_description();
+ } else {
+ $readability->clean($content_block, 'select');
+ if ($options->rewrite_relative_urls) makeAbsolute($effective_url, $content_block);
+ // footnotes
+ if (($links == 'footnotes') && (strpos($effective_url, 'wikipedia.org') === false)) {
+ $readability->addFootnotes($content_block);
+ }
+ if ($extract_pattern) {
+ // get outerHTML
+ $html = $content_block->ownerDocument->saveXML($content_block);
+ } else {
+ if ($content_block->childNodes->length == 1 && $content_block->firstChild->nodeType === XML_ELEMENT_NODE) {
+ $html = $content_block->firstChild->innerHTML;
+ } else {
+ $html = $content_block->innerHTML;
+ }
+ }
+ // post-processing cleanup
+ $html = preg_replace('!<p>[\s\h\v]*</p>!u', '', $html); // drop paragraphs containing only whitespace
+ if ($links == 'remove') {
+ $html = preg_replace('!</?a[^>]*>!', '', $html); // strip <a> tags but keep their text
+ }
+ // get text sample for language detection
+ $text_sample = strip_tags(substr($html, 0, 500));
+ if (!$valid_key) {
+ $html = make_substitutions($options->message_to_prepend).$html;
+ $html .= make_substitutions($options->message_to_append);
+ } else {
+ $html = make_substitutions($options->message_to_prepend_with_key).$html;
+ $html .= make_substitutions($options->message_to_append_with_key);
}
}
- // post-processing cleanup
- $html = preg_replace('!<p>[\s\h\v]*</p>!u', '', $html);
- if ($links == 'remove') {
- $html = preg_replace('!</?a[^>]*>!', '', $html);
- }
- if (!$valid_key) {
- $html = make_substitutions($options->message_to_prepend).$html;
- $html .= make_substitutions($options->message_to_append);
- } else {
- $html = make_substitutions($options->message_to_prepend_with_key).$html;
- $html .= make_substitutions($options->message_to_append_with_key);
- }
}
+ /*
if ($format == 'atom') {
$newitem->addElement('content', $html);
$newitem->setDate((int)$item->get_date('U'));
if ($author = $item->get_author()) {
$newitem->addElement('author', array('name'=>$author->get_name()));
}
- } else {
+ } else {
+ */
if ($valid_key && isset($_GET['pubsub'])) { // used only on fivefilters.org at the moment
$newitem->addElement('guid', 'http://fivefilters.org/content-only/redirect.php?url='.urlencode($item->get_permalink()), array('isPermaLink'=>'false'));
} else {
$newitem->addElement('guid', $item->get_permalink(), array('isPermaLink'=>'true'));
}
$newitem->setDescription($html);
+
+ // set date
if ((int)$item->get_date('U') > 0) {
$newitem->setDate((int)$item->get_date('U'));
+ } elseif ($extractor->getDate()) {
+ $newitem->setDate($extractor->getDate());
}
- if ($author = $item->get_author()) {
- $newitem->addElement('dc:creator', $author->get_name());
+
+ // add authors
+ if ($authors = $item->get_authors()) {
+ foreach ($authors as $author) {
+ $newitem->addElement('dc:creator', $author->get_name());
+ }
+ } elseif ($authors = $extractor->getAuthors()) {
+ //TODO: make sure the list size is reasonable
+ foreach ($authors as $author) {
+ //TODO: addElement replaces this element each time
+ $newitem->addElement('dc:creator', $author);
+ }
}
- }
+
+ // add language
+ if ($detect_language) {
+ $language = $extractor->getLanguage();
+ if (!$language) $language = $feed->get_language();
+ if (($detect_language == 3 || (!$language && $detect_language == 2)) && $text_sample) {
+ try {
+ if ($use_cld) {
+ // Use PHP-CLD extension
+ $php_cld = 'CLD\detect'; // in quotes to prevent PHP 5.2 parse error
+ $res = $php_cld($text_sample);
+ if (is_array($res) && count($res) > 0) {
+ $language = $res[0]['code'];
+ }
+ } else {
+ //die('what');
+ // Use PEAR's Text_LanguageDetect
+ if (!isset($l)) {
+ $l = new Text_LanguageDetect('libraries/language-detect/lang.dat', 'libraries/language-detect/unicode_blocks.dat');
+ }
+ $l_result = $l->detect($text_sample, 1);
+ if (count($l_result) > 0) {
+ $language = $language_codes[key($l_result)];
+ }
+ }
+ } catch (Exception $e) {
+ //die('error: '.$e);
+ // do nothing
+ }
+ }
+ if ($language && (strlen($language) < 7)) {
+ $newitem->addElement('dc:language', $language);
+ }
+ }
+
+ // add MIME type (if it appeared in our exclusions lists)
+ if (isset($type)) $newitem->addElement('dc:format', $type);
+ // add effective URL (URL after redirects)
+ if (isset($effective_url)) {
+ //TODO: ensure $effective_url is valid - without validation it sometimes causes problems, e.g.
+ //http://www.siasat.pk/forum/showthread.php?108883-Pakistan-Chowk-by-Rana-Mubashir-–-25th-March-2012-Special-Program-from-Liari-(Karachi)
+ //temporary measure: use utf8_encode()
+ $newitem->addElement('dc:identifier', remove_url_cruft(utf8_encode($effective_url)));
+ } else {
+ $newitem->addElement('dc:identifier', remove_url_cruft($item->get_permalink()));
+ }
+ // check for enclosures
+ if ($options->keep_enclosures) {
+ if ($enclosures = $item->get_enclosures()) {
+ foreach ($enclosures as $enclosure) {
+ if (!$enclosure->get_link()) continue;
+ $enc = array();
+ // Media RSS spec ($enc): http://search.yahoo.com/mrss
+ // SimplePie methods ($enclosure): http://simplepie.org/wiki/reference/start#methods4
+ $enc['url'] = $enclosure->get_link();
+ if ($enclosure->get_length()) $enc['fileSize'] = $enclosure->get_length();
+ if ($enclosure->get_type()) $enc['type'] = $enclosure->get_type();
+ if ($enclosure->get_medium()) $enc['medium'] = $enclosure->get_medium();
+ if ($enclosure->get_expression()) $enc['expression'] = $enclosure->get_expression();
+ if ($enclosure->get_bitrate()) $enc['bitrate'] = $enclosure->get_bitrate();
+ if ($enclosure->get_framerate()) $enc['framerate'] = $enclosure->get_framerate();
+ if ($enclosure->get_sampling_rate()) $enc['samplingrate'] = $enclosure->get_sampling_rate();
+ if ($enclosure->get_channels()) $enc['channels'] = $enclosure->get_channels();
+ if ($enclosure->get_duration()) $enc['duration'] = $enclosure->get_duration();
+ if ($enclosure->get_height()) $enc['height'] = $enclosure->get_height();
+ if ($enclosure->get_width()) $enc['width'] = $enclosure->get_width();
+ if ($enclosure->get_language()) $enc['lang'] = $enclosure->get_language();
+ $newitem->addElement('media:content', '', $enc);
+ }
+ }
+ }
+ /* } */
$output->addItem($newitem);
unset($html);
+ $item_count++;
}
+
// output feed
-if ($options->caching || $format == 'json') {
+if ($format == 'json') $output->setFormat(JSON);
+if ($options->caching) {
ob_start();
$output->genarateFeed();
$output = ob_get_contents();
ob_end_clean();
- if ($format == 'json') {
- $jsonrss = new stdClass();
- $jsonrss->rss = @simplexml_load_string($output);
- $output = json_encode($jsonrss);
- header("Content-type: application/json; charset=UTF-8");
+ if ($html_only && $item_count == 0) {
+ // do not cache - in case of temporary server glitch at source URL
+ } else {
+ $cache->save($output, $cache_id);
}
- if ($options->caching) $cache->save($output, $cache_id);
echo $output;
} else {
$output->genarateFeed();
@@ -618,50 +824,64 @@ function convert_to_utf8($html, $header=null)
// error parsing the response
} else {
$match = end($match); // get last matched element (in case of redirects)
- if (isset($match[2])) $encoding = trim($match[2], '"\'');
+ if (isset($match[2])) $encoding = trim($match[2], "\"' \r\n\0\x0B\t");
}
- if (!$encoding) {
- if (preg_match('/^<\?xml\s+version=(?:"[^"]*"|\'[^\']*\')\s+encoding=("[^"]*"|\'[^\']*\')/s', $html, $match)) {
+ // TODO: check to see if encoding is supported (can we convert it?)
+ // If it's not, result will be empty string.
+ // For now we'll check for invalid encoding types returned by some sites, e.g. 'none'
+ // Problem URL: http://facta.co.jp/blog/archives/20111026001026.html
+ if (!$encoding || $encoding == 'none') {
+ // search for encoding in HTML - only look at the first 40000 bytes
+ $html_head = substr($html, 0, 40000);
+ if (preg_match('/^<\?xml\s+version=(?:"[^"]*"|\'[^\']*\')\s+encoding=("[^"]*"|\'[^\']*\')/s', $html_head, $match)) {
$encoding = trim($match[1], '"\'');
- } elseif(preg_match('/
]+)/i', $html, $match)) {
- if (isset($match[1])) $encoding = trim($match[1]);
+ } elseif (preg_match('/<meta\s+http-equiv=["\']?Content-Type["\']? content=["\'][^;]+;\s*charset=["\']?([^;"\'>]+)/i', $html_head, $match)) {
+ $encoding = trim($match[1]);
+ } elseif (preg_match_all('/<meta\s+([^>]+)>/i', $html_head, $match)) { // otherwise scan every <meta> tag for a charset attribute
+ foreach ($match[1] as $_test) {
+ if (preg_match('/charset=["\']?([^"\']+)/i', $_test, $_m)) {
+ $encoding = trim($_m[1]);
+ break;
+ }
+ }
}
}
+ if (isset($encoding)) $encoding = trim($encoding);
+ // trim is important here!
+ if (!$encoding || (strtolower($encoding) == 'iso-8859-1')) {
+ // replace MS Word smart quotes
+ $trans = array();
+ $trans[chr(130)] = '‚'; // Single Low-9 Quotation Mark
+ $trans[chr(131)] = 'ƒ'; // Latin Small Letter F With Hook
+ $trans[chr(132)] = '„'; // Double Low-9 Quotation Mark
+ $trans[chr(133)] = '…'; // Horizontal Ellipsis
+ $trans[chr(134)] = '†'; // Dagger
+ $trans[chr(135)] = '‡'; // Double Dagger
+ $trans[chr(136)] = 'ˆ'; // Modifier Letter Circumflex Accent
+ $trans[chr(137)] = '‰'; // Per Mille Sign
+ $trans[chr(138)] = 'Š'; // Latin Capital Letter S With Caron
+ $trans[chr(139)] = '‹'; // Single Left-Pointing Angle Quotation Mark
+ $trans[chr(140)] = 'Œ'; // Latin Capital Ligature OE
+ $trans[chr(145)] = '‘'; // Left Single Quotation Mark
+ $trans[chr(146)] = '’'; // Right Single Quotation Mark
+ $trans[chr(147)] = '“'; // Left Double Quotation Mark
+ $trans[chr(148)] = '”'; // Right Double Quotation Mark
+ $trans[chr(149)] = '•'; // Bullet
+ $trans[chr(150)] = '–'; // En Dash
+ $trans[chr(151)] = '—'; // Em Dash
+ $trans[chr(152)] = '˜'; // Small Tilde
+ $trans[chr(153)] = '™'; // Trade Mark Sign
+ $trans[chr(154)] = 'š'; // Latin Small Letter S With Caron
+ $trans[chr(155)] = '›'; // Single Right-Pointing Angle Quotation Mark
+ $trans[chr(156)] = 'œ'; // Latin Small Ligature OE
+ $trans[chr(159)] = 'Ÿ'; // Latin Capital Letter Y With Diaeresis
+ $html = strtr($html, $trans);
+ }
if (!$encoding) {
$encoding = 'utf-8';
} else {
if (strtolower($encoding) != 'utf-8') {
- if (strtolower($encoding) == 'iso-8859-1') {
- // replace MS Word smart qutoes
- $trans = array();
- $trans[chr(130)] = '‚'; // Single Low-9 Quotation Mark
- $trans[chr(131)] = 'ƒ'; // Latin Small Letter F With Hook
- $trans[chr(132)] = '„'; // Double Low-9 Quotation Mark
- $trans[chr(133)] = '…'; // Horizontal Ellipsis
- $trans[chr(134)] = '†'; // Dagger
- $trans[chr(135)] = '‡'; // Double Dagger
- $trans[chr(136)] = 'ˆ'; // Modifier Letter Circumflex Accent
- $trans[chr(137)] = '‰'; // Per Mille Sign
- $trans[chr(138)] = 'Š'; // Latin Capital Letter S With Caron
- $trans[chr(139)] = '‹'; // Single Left-Pointing Angle Quotation Mark
- $trans[chr(140)] = 'Œ'; // Latin Capital Ligature OE
- $trans[chr(145)] = '‘'; // Left Single Quotation Mark
- $trans[chr(146)] = '’'; // Right Single Quotation Mark
- $trans[chr(147)] = '“'; // Left Double Quotation Mark
- $trans[chr(148)] = '”'; // Right Double Quotation Mark
- $trans[chr(149)] = '•'; // Bullet
- $trans[chr(150)] = '–'; // En Dash
- $trans[chr(151)] = '—'; // Em Dash
- $trans[chr(152)] = '˜'; // Small Tilde
- $trans[chr(153)] = '™'; // Trade Mark Sign
- $trans[chr(154)] = 'š'; // Latin Small Letter S With Caron
- $trans[chr(155)] = '›'; // Single Right-Pointing Angle Quotation Mark
- $trans[chr(156)] = 'œ'; // Latin Small Ligature OE
- $trans[chr(159)] = 'Ÿ'; // Latin Capital Letter Y With Diaeresis
- $html = strtr($html, $trans);
- }
$html = SimplePie_Misc::change_encoding($html, $encoding, 'utf-8');
-
/*
if (function_exists('iconv')) {
// iconv appears to handle certain character encodings better than mb_convert_encoding
@@ -677,9 +897,10 @@ function convert_to_utf8($html, $header=null)
}
function makeAbsolute($base, $elem) {
- $base = new IRI($base);
- // remove '//' in URL path (causes URLs not to resolve properly)
- if (isset($base->ipath)) $base->ipath = preg_replace('!//+!', '/', $base->ipath);
+ $base = new SimplePie_IRI($base);
+ // remove '//' in URL path (used to prevent URLs from resolving properly)
+ // TODO: check if this is still the case
+ if (isset($base->path)) $base->path = preg_replace('!//+!', '/', $base->path);
foreach(array('a'=>'href', 'img'=>'src') as $tag => $attr) {
$elems = $elem->getElementsByTagName($tag);
for ($i = $elems->length-1; $i >= 0; $i--) {
@@ -697,32 +918,42 @@ function makeAbsoluteAttr($base, $e, $attr) {
$url = trim(str_replace('%20', ' ', $e->getAttribute($attr)));
$url = str_replace(' ', '%20', $url);
if (!preg_match('!https?://!i', $url)) {
- $absolute = IRI::absolutize($base, $url);
- if ($absolute) {
+ if ($absolute = SimplePie_IRI::absolutize($base, $url)) {
$e->setAttribute($attr, $absolute);
}
}
}
}
function makeAbsoluteStr($base, $url) {
- $base = new IRI($base);
+ $base = new SimplePie_IRI($base);
// remove '//' in URL path (causes URLs not to resolve properly)
- if (isset($base->ipath)) $base->ipath = preg_replace('!//+!', '/', $base->ipath);
+ if (isset($base->path)) $base->path = preg_replace('!//+!', '/', $base->path);
if (preg_match('!^https?://!i', $url)) {
// already absolute
return $url;
} else {
- $absolute = IRI::absolutize($base, $url);
- if ($absolute) return $absolute;
+ if ($absolute = SimplePie_IRI::absolutize($base, $url)) {
+ return $absolute;
+ }
return false;
}
}
// returns single page response, or false if not found
function getSinglePage($item, $html, $url) {
- global $http;
+ global $http, $extractor;
$host = @parse_url($url, PHP_URL_HOST);
$site_config = SiteConfig::build($host);
- if ($site_config === false) return false;
+ if ($site_config === false) {
+ // check for fingerprints
+ if (!empty($extractor->fingerprints) && ($_fphost = $extractor->findHostUsingFingerprints($html))) {
+ $site_config = SiteConfig::build($_fphost);
+ }
+ if ($site_config === false) $site_config = new SiteConfig();
+ SiteConfig::add_to_cache($host, $site_config);
+ return false;
+ } else {
+ SiteConfig::add_to_cache($host, $site_config);
+ }
$splink = null;
if (!empty($site_config->single_page_link)) {
$splink = $site_config->single_page_link;
@@ -745,9 +976,12 @@ function getSinglePage($item, $html, $url) {
break;
} elseif ($elems instanceof DOMNodeList && $elems->length > 0) {
foreach ($elems as $item) {
- if ($item->hasAttribute('href')) {
+ if ($item instanceof DOMElement && $item->hasAttribute('href')) {
$single_page_url = $item->getAttribute('href');
break;
+ } elseif ($item instanceof DOMAttr && $item->value) {
+ $single_page_url = $item->value;
+ break;
}
}
}
@@ -757,15 +991,26 @@ function getSinglePage($item, $html, $url) {
// check it's not what we have already!
if ($single_page_url != $url) {
// it's not, so let's try to fetch it...
+ $_prev_ref = $http->referer;
+ $http->referer = $single_page_url;
if (($response = $http->get($single_page_url, true)) && $response['status_code'] < 300) {
+ $http->referer = $_prev_ref;
return $response;
}
+ $http->referer = $_prev_ref;
}
}
}
return false;
}
+function remove_url_cruft($url) {
+ // Remove Google Analytics utm_* parameters from $url while keeping the rest of the query string (and any #fragment) well-formed.
+ // Regex adapted from http://navitronic.co.uk/2010/12/removing-google-analytics-cruft-from-urls/ (https://gist.github.com/758177)
+ $url = preg_replace('/(?<=[?&])utm_[a-z]+=[^&#]*&?/', '', $url); // lookbehind keeps the '?'/'&' so "?utm_x=1&id=2" becomes "?id=2", not "&id=2"
+ return preg_replace('/[?&](?=#|$)/', '', $url); // drop a separator left dangling before '#' or at the end
+}
+
function make_substitutions($string) {
if ($string == '') return $string;
global $item, $effective_url;
diff --git a/site_config/README.txt b/site_config/README.txt
index 251cc9d..0aff456 100644
--- a/site_config/README.txt
+++ b/site_config/README.txt
@@ -1,59 +1,6 @@
Full-Text RSS Site Patterns
---------------------------
-NOTE: The information here is not up to date, but probably covers what you need for this version. For the most up to date information on Full-Text RSS site patterns, please see http://help.fivefilters.org/customer/portal/articles/223153-site-patterns
-
-We recommend using the latest release of Full-Text RSS - available for purchase at http://fivefilters.org/content-only/#download - which also comes bundled with hundreds of site patterns to improve extraction.
-
Site patterns allow you to specify what should be extracted from specific sites.
-How it works
-------------
-After we fetch the contents of a URL, we use the hostname (e.g. example.org) and check to see if a config file exists for that hostname. If there is a matching file (e.g. example.org.txt) in one of the config folders, we will fetch the rules it contains. If no such file is found, we attempt to detect the appropriate content block and title automatically. When there is a matching site config, we first try to use the patterns within to extract the content. If these patterns fail to match, we will, by default, revert to auto-detection - this gives us another chance to get at the content (useful when site redesigns invalidate our stored patterns).
-
-The 'standard' folder contains site config files bundled with Full-Text RSS. Users may contribute their own patterns and we will try to update with each release.
-
-The 'custom' folder can be used for sites not listed in standard, or to override sites in standard. If a site has an entry in both folders, only the one in 'custom' will be used. The custom folder allows you to separate your entries from the bundled ones, which also makes the task of upgrading to a new release easier (you benefit from the updated patterns in standard and copy over your existing patterns to custom).
-
-The pattern format has been borrowed from Instapaper. Please see http://blog.instapaper.com/post/730281947 and http://www.instapaper.com/bodytext (requires login). We make use of the patterns provided by Instapaper and, in the same spirit, will soon make available our own additions.
-
-
-Command reference (based on Instapaper)
----------------------------------------
-title: [XPath]
-The page title.
-XPaths evaluating to strings are also accepted.
-Multiple statements accepted.
-Will evaluate in order until a result is found.
-
-body: [XPath]
-The body-text container. Auto-detected by default.
-Multiple statements accepted.
-Will evaluate in order until a result is found.
-
-strip: [XPath]
-Strip any matching element and its children.
-Multiple statements accepted.
-
-strip_id_or_class: [string]
-Strip any element whose @id or @class contains this substring.
-Multiple statements accepted.
-
-strip_image_src: [string]
-Strip any <img> whose @src contains this substring.
-Multiple statements accepted.
-
-tidy: [yes|no] (default: yes)
-Preprocess with Tidy. May cause "no text" errors.
-
-prune: [yes|no] (default: yes)
-Strip elements within body that do not resemble content elements.
-
-autodetect_on_failure: [yes|no] (default: yes)
-If set to no, we will not attempt to auto-detect the title or content block.
-
-test_url: [string]
-Must be URL of an article from this site, not the site's front page.
-
-# comments
-Lines beginning with # are ignored.
\ No newline at end of file
+Please see http://help.fivefilters.org/customer/portal/articles/223153-site-patterns for more information.
\ No newline at end of file
diff --git a/site_config/standard/.wikipedia.org.txt b/site_config/standard/.wikipedia.org.txt
index c8a020c..8b98ae4 100644
--- a/site_config/standard/.wikipedia.org.txt
+++ b/site_config/standard/.wikipedia.org.txt
@@ -1,5 +1,19 @@
-body: //div[@id = 'content']
+title: //h1[@id='firstHeading']
+body: //div[@id = 'bodyContent']
strip_id_or_class: editsection
-strip_id_or_class: toc
+#strip_id_or_class: toc
+strip_id_or_class: vertical-navbox
+strip: //table[@id='toc']
+strip: //div[@id='catlinks']
+strip: //div[@id='jump-to-nav']
+strip: //div[@class='thumbcaption']//div[@class='magnify']
+strip: //table[@class='navbox']
+strip: //table[contains(@class, 'infobox')]
+strip: //div[@class='dablink']
+strip: //div[@id='contentSub']
+strip: //table[contains(@class, 'metadata')]
+strip: //*[contains(@class, 'noprint')]
+strip: //span[@title='pronunciation:']
prune: no
+tidy: no
test_url: http://en.wikipedia.org/wiki/Christopher_Lloyd
\ No newline at end of file