<?php
/**
* ReFilter: a dodgy RSS filter built with dodgy regular expressions.
*
* It processes a string of filters such as
*
* (in:link:slashdot AND -has:enclosure) OR "taco singing"
*
* Which will filter out all RSS items except those which
* 1. have the text "slashdot" in the <link> element
* AND
* 2. do not have an <enclosure> element
* OR
* 3. have any element containing the text "taco singing".
*
* See documentation at http://re.rephrase.net/filter/ for more
* information.
*
* LICENSE
* Do whatever, just keep attribution and tell me if you
* use it for anything cool. (That seems unlikely, though,
* since it's messy as all hell and hardly efficient. :)
* Contact me if you need it under a real license.
*
* ReFilter
* version 0.82, 2007-03-10
* copyright 2005 Sam Angove <sam@rephrase.net>
*/
/**
 * Filters the <item>/<entry> elements of a feed against a boolean expression.
 *
 * Usage:
 *
 *     $filter = new ReFilter('(in:link:slashdot AND -has:enclosure) OR "taco singing"');
 *     $xml    = $filter->filter_rss($xml);
 *
 * set_filter() parses the expression into the flat list $_filters. Leaf entries
 * describe one search term; entries with mode 'sub' combine two other entries
 * (referenced by index) with 'and' / 'or'. An item is kept when the LAST entry
 * of the list evaluates to true for it.
 *
 * Everything here is regex based: feeds are not parsed as XML.
 */
class ReFilter {
    /** Parsed filters (leaf terms and 'sub' logic nodes), indexed from 0. */
    var $_filters = array();
    /** The raw expression last handed to set_filter(). */
    var $_filterstring = '';
    /** Working copy of $_filters for the item being examined; 'match' flags are set on it. */
    var $_item_filters = array();
    /** Not used by this class; kept so code that touches the property keeps working. */
    var $_logic;
    /** md5 placeholder => original quoted term, filled by _process_filter_string(). */
    var $_quoted = array();
    /** rdf:about values of the items removed by the most recent filter_rss() call. */
    var $_seq = array();

    /**
     * @param string $filterstring Optional filter expression to parse immediately.
     * @param string $rss          Optional feed to filter right away. The result of
     *                             that run cannot be returned from a constructor;
     *                             call filter_rss() to obtain filtered XML.
     */
    function __construct($filterstring = '', $rss = '') {
        // Declared before ReFilter() so PHP 5 does not complain about a
        // redefined constructor; PHP 8 no longer treats ReFilter() as one.
        $this->ReFilter($filterstring, $rss);
    }

    /**
     * Legacy (PHP 4 style) initialiser, kept as a regular method.
     *
     * @param string $filterstring Filter expression; ignored when empty.
     * @param string $rss          Feed XML to filter with that expression.
     * @return string|null Filtered XML when both arguments are given, otherwise null.
     */
    function ReFilter($filterstring = '', $rss = '') {
        if ((string) $filterstring !== '') {
            $this->set_filter($filterstring);
            if ($rss) return $this->filter_rss($rss);
        }
        return null;
    }

    /**
     * Replaces the current filter with a newly parsed expression.
     *
     * @param string $filterstring Filter expression.
     */
    function set_filter($filterstring) {
        $this->_filterstring = $filterstring;
        // Start from a clean slate so terms of a previous expression do not linger.
        $this->_filters = array();
        $this->_quoted = array();
        $this->_process_filter_string($this->_filterstring);
    }

    /*
    * Filtering functions. Traverse a feed and remove items matching or not matching
    * a set of filters.
    */

    /**
     * Removes every item/entry that does not satisfy the current filter and
     * appends " | Filtered: <expression>" to the first <title> of the feed.
     *
     * @param string $xml Feed source.
     * @return string The filtered feed; $xml unchanged when no filter is set.
     */
    function filter_rss($xml) {
        if (empty($this->_filters)) return $xml;

        $this->_seq = array();
        $xml = preg_replace_callback("#<(item|entry)( .*?)?>(.*?)</\\1>#si", array($this, '_filter_item'), $xml);

        // Escape for XML (htmlentities() would emit HTML-only entities), then
        // neutralise "\" and "$" so the expression is never read as a
        // back-reference inside the replacement string.
        $title = htmlspecialchars((string) $this->_filterstring, ENT_QUOTES);
        $title = strtr($title, array('\\' => '\\\\', '$' => '\\$'));
        $xml = preg_replace("#<title>(.*?)</title>#", '<title>$1 | Filtered: ' . $title . '</title>', $xml, 1);

        // For RSS 0.9x/1.0 feeds that have an <rdf:Seq>, remove the entries
        // of filtered-out items from it as well.
        if (count($this->_seq) && stripos($xml, 'rdf:Seq') !== false) {
            foreach ($this->_seq as $li) {
                $pattern = '#<rdf:li\s+rdf:resource\s*=\s*(["\'])' . preg_quote($li, '#') . '\\1\s*/>#';
                $xml = preg_replace($pattern, '', $xml);
            }
        }
        return $xml;
    }

    /**
     * preg_replace_callback() handler for one item/entry.
     *
     * @param array $matches [0] whole item, [2] attribute string of the opening
     *                       tag, [3] inner markup.
     * @return string The item unchanged when it passes the filter, '' otherwise.
     */
    function _filter_item($matches) {
        $this->_item_filters = $this->_filters;

        preg_match_all("#<([a-zA-Z0-9:]*)(.*?)(>(.*?)</\\1>|/>)#s", $matches[3], $ematches, PREG_SET_ORDER);
        foreach ($ematches as $element) {
            // change (e.g.) "dc:date" to "dcdate"
            $tag = strtolower(str_replace(':', '', $element[1]));
            $attributes = $this->_get_attributes($element[2]);
            // Self-closing elements have no content group.
            $content = isset($element[4]) ? $element[4] : '';
            $this->_filter_element($tag, $attributes, $content);
        }

        // The last filter is the root of the expression.
        if ($this->_test_filter(count($this->_filters) - 1)) {
            return $matches[0];
        }

        // If items have rdf:about attributes and are filtered out, we
        // probably need to remove them from the <rdf:Seq> as well.
        $itat = $this->_get_attributes(isset($matches[2]) ? $matches[2] : '');
        if (isset($itat['rdf:about'])) $this->_seq[] = $itat['rdf:about'];
        return '';
    }

    /**
     * Marks every leaf filter of the current item that this element satisfies.
     *
     * @param string $element    Lower-cased tag name with colons removed.
     * @param array  $attributes Attribute name => value, as built by _get_attributes().
     * @param string $content    Raw inner markup of the element.
     */
    function _filter_element($element, $attributes, $content) {
        foreach ($this->_item_filters as $id => $filter) {
            if ($filter['mode'] == 'sub') continue;

            $any_element = ((string) $filter['element'] === '_');
            $same_element = ((string) $filter['element'] === (string) $element);
            $hit = false;

            if ($filter['mode'] == 'has') {
                if ($any_element) {
                    // has:name -> the item contains an element called "name"
                    $hit = ((string) $filter['search'] === (string) $element);
                } elseif ($same_element) {
                    // has:element:name -> that element carries attribute "name"
                    $hit = isset($attributes[$filter['search']]);
                }
            } elseif ($same_element || $any_element) {
                if (!empty($filter['attribute'])) {
                    if (isset($attributes[$filter['attribute']])) {
                        $hit = $this->_filter($filter['mode'], $attributes[$filter['attribute']], $filter['search']);
                    }
                } else {
                    $hit = $this->_filter($filter['mode'], $content, $filter['search']);
                }
            }

            if ($hit) $this->_item_filters[$id]['match'] = true;
        }
    }

    /**
     * Case-insensitive text test.
     *
     * @param string $mode     'start', 'end' or 'in' (anything else behaves as 'in').
     * @param string $haystack Text to search in.
     * @param string $needle   Text to look for.
     * @return bool False whenever either string is empty.
     */
    function _filter($mode = 'in', $haystack = '', $needle = '') {
        $haystack = (string) $haystack;
        $needle = (string) $needle;
        if ($haystack === '' || $needle === '') return false;

        $length = strlen($needle);
        switch ($mode) {
            case 'start':
                return strncasecmp($haystack, $needle, $length) === 0;
            case 'end':
                if ($length > strlen($haystack)) return false;
                return strcasecmp(substr($haystack, -$length), $needle) === 0;
            case 'in':
            default:
                return stripos($haystack, $needle) !== false;
        }
    }

    /*
    * Filter string Processing functions
    *
    * Extract something useful from a string like "(d AND (e OR f) && (g OR h))".
    * The filters array contains search terms; logical relations have mode 'sub' and
    * reference other filters.
    *
    * E.g., from "(a AND b)" (simplified):
    * [0] => Array (
    * [search] => a
    * )
    * [1] => Array (
    * [search] => b
    * )
    * [2] => Array (
    * [mode] => sub
    * [logic] => and
    * [0] => 0
    * [1] => 1
    * )
    */

    /**
     * Parses a filter expression into $_filters.
     *
     * @param string $str Filter expression.
     */
    function _process_filter_string($str) {
        $str = (string) $str;

        // Before doing anything, replace "strings with spaces" (including any
        // "-" / "in:title:" prefix) with md5sums of themselves; they are
        // substituted back in _process_filter().
        preg_match_all("#-*([a-z0-9:]+:)*(\"[^\"]+\")#i", $str, $matches, PREG_SET_ORDER);
        foreach ($matches as $match) {
            $whole = $match[0];
            $key = md5($whole);
            $this->_quoted[$key] = $whole;
            $str = str_replace($whole, $key, $str);
        }

        // if parentheses are mismatched, try to balance them
        $open = substr_count($str, '(');
        $close = substr_count($str, ')');
        if ($open > $close) $str .= str_repeat(')', $open - $close);
        elseif ($open < $close) $str = str_repeat('(', $close - $open) . $str;

        // No parentheses at all (checked after balancing): disambiguate directly.
        if (strpos($str, '(') === false) $str = $this->_disambiguate($str);

        // Reduce innermost groups until no parentheses are left.
        while (strpos($str, '(') !== false) {
            $reduced = preg_replace_callback("#-*\(([^()]*)\)#", array($this, '_process_filter_string_callback'), $str);
            if ($reduced === null || $reduced === $str) {
                // Parentheses remain but none forms a "(...)" pair, e.g. ")a(".
                // Drop them rather than spin forever.
                $reduced = str_replace(array('(', ')'), ' ', $str);
            }
            $str = $reduced;
            if (strpos($str, '(') === false) {
                $str = $this->_disambiguate($str);
            }
        }
    }

    /**
     * Reduces one innermost "(...)" group, honouring leading "-" signs.
     *
     * @param array $matches [0] group with its "-" prefix, [1] text inside the parentheses.
     * @return string Replacement produced by _disambiguate().
     */
    function _process_filter_string_callback($matches) {
        $positive = $this->_is_positive($matches[0]);
        return $this->_disambiguate($matches[1], $positive);
    }

    /**
     * Resolves double/triple/etc. negatives.
     *
     * @param string $str Term, possibly prefixed with "-" characters.
     * @return bool True when the number of leading "-" is even.
     */
    function _is_positive($str) {
        $str = (string) $str;
        $dashes = strlen($str) - strlen(ltrim($str, '-'));
        return $dashes % 2 == 0;
    }

    /**
     * Registers the filters of a parenthesis-free expression.
     *
     * One term is stored as a leaf, two terms as a 'sub' node. Longer
     * expressions are not stored; they are returned re-grouped in pairs
     * (e.g. "a AND b AND c" -> "((a AND b) AND c)") for the caller to
     * reduce further.
     *
     * Note: the operator pattern has no word boundaries, so it also matches
     * AND / OR inside a longer upper-case word.
     *
     * @param string $ambiguous Expression without parentheses.
     * @param bool   $positive  False when the whole expression is negated.
     * @return string "~<index>" reference, or a parenthesised expression.
     */
    function _disambiguate($ambiguous, $positive = true) {
        // Stray outer spaces would otherwise yield empty terms.
        $ambiguous = trim((string) $ambiguous);
        $terms = preg_split("#[ ]*( |AND|OR|[|&]{1,2})[ ]*(?!\))#", $ambiguous, -1, PREG_SPLIT_DELIM_CAPTURE);
        $total = count($terms);

        if ($total == 1) {
            return '~' . $this->_process_filter($positive ? $ambiguous : "-$ambiguous");
        }

        if ($total == 3) {
            $one = $this->_process_filter($terms[0]);
            $two = $this->_process_filter($terms[2]);
            $key = count($this->_filters);
            switch ($terms[1]) {
                case 'OR':
                case '|':
                case '||':
                    $logic = 'or';
                    break;
                default: // ' ', 'AND', '&', '&&'
                    $logic = 'and';
                    break;
            }
            $this->_filters[$key] = array('mode' => 'sub', 'logic' => $logic, 'positive' => $positive, $one, $two);
            // "~" marks a reference into $_filters.
            return '~' . $key;
        }

        // Longer expression: pair terms up with added parentheses.
        $out = '';
        $popen = false;
        foreach ($terms as $index => $term) {
            if ($index % 2 != 0) {
                $out .= " $term ";
            } elseif ($index == $total - 1 && !$popen) {
                $out .= $term;
            } elseif (!$popen) {
                $out .= "($term";
                $popen = true;
            } else {
                $out .= "$term)";
                $popen = false;
            }
        }
        // Keep the sign of the group so "-(a b c)" stays negated.
        return ($positive ? '' : '-') . "($out)";
    }

    /**
     * Turns a single term into a filter entry, or resolves a "~<index>"
     * reference to an existing entry (flipping its sign for a "-" prefix).
     *
     * @param string $filterstring Term such as 'chicken', '-in:title:"a b"' or '-~3'.
     * @return int Index of the filter in $_filters.
     */
    function _process_filter($filterstring) {
        $filterstring = (string) $filterstring;

        // Is the filter a reference to an existing entry?
        if (preg_match('#^\s*(-*)~([0-9]+)#', $filterstring, $ref) && isset($this->_filters[(int) $ref[2]])) {
            $id = (int) $ref[2];
            if (!$this->_is_positive($ref[1])) {
                $this->_filters[$id]['positive'] = !$this->_filters[$id]['positive'];
            }
            return $id;
        }

        // Sub in the quoted strings we removed earlier.
        if ($this->_quoted) {
            foreach ($this->_quoted as $key => $val) {
                $filterstring = str_replace($key, $val, $filterstring);
            }
        }

        if (!preg_match("#-*([a-z0-9:]+:)*(\"[^\"]+\"|[^ ]+)#i", $filterstring, $filter)) {
            // Empty term: stored with an empty search text, which never matches.
            $filter = array('', '', '');
        }

        // positive or negative filter
        $positive = $this->_is_positive($filter[0]);
        // search term (e.g.: "chicken")
        $search = strtolower(trim($filter[2], '"'));
        // search in what? (e.g.: "in:title"); lower-cased because tag and
        // attribute names are lower-cased before comparison
        $select = explode(':', strtolower(trim($filter[1], ':')));

        $mode = 'in';
        if (in_array($select[0], array('in', 'start', 'end', 'has'), true)) {
            $mode = array_shift($select);
        }

        // "_" stands for "any element"
        $element = (isset($select[0]) && $select[0] !== '') ? $select[0] : '_';
        $attribute = isset($select[1]) ? $select[1] : null;

        $this->_filters[] = array('element' => $element, 'positive' => $positive,
                                  'mode' => $mode, 'search' => $search,
                                  'attribute' => $attribute);
        return count($this->_filters) - 1;
    }

    /**
     * Evaluates a 'sub' node for the current item.
     *
     * @param array $rule Entry with 'logic', 'positive' and operand indexes at [0] and [1].
     * @return bool
     */
    function _test_logic($rule) {
        $result = false;
        if ($rule['logic'] == 'and') {
            $result = $this->_test_filter($rule[0]) && $this->_test_filter($rule[1]);
        } elseif ($rule['logic'] == 'or') {
            $result = $this->_test_filter($rule[0]) || $this->_test_filter($rule[1]);
        }
        return $rule['positive'] ? $result : !$result;
    }

    /**
     * Evaluates filter $id for the current item.
     *
     * @param int $id Index into $_item_filters.
     * @return bool True when there is no such filter (nothing to object to).
     */
    function _test_filter($id) {
        if (!isset($this->_item_filters[$id])) return true;

        $filter = $this->_item_filters[$id];
        if ($filter['mode'] == 'sub') {
            return $this->_test_logic($filter);
        }
        // 'match' only exists once some element satisfied the filter.
        $matched = !empty($filter['match']);
        return $filter['positive'] ? $matched : !$matched;
    }

    /**
     * From 'a="b" c=\'d\'' to array('a' => 'b', 'c' => 'd').
     *
     * Names are lower-cased; values keep their case and lose their quotes.
     *
     * @param string $attrstring Attribute part of an opening tag.
     * @return array
     */
    function _get_attributes($attrstring) {
        $attributes = array();
        if (preg_match_all('#([a-zA-Z0-9:_.\-]+)\s*=\s*(["\'])(.*?)\\2#s', (string) $attrstring, $matches, PREG_SET_ORDER)) {
            foreach ($matches as $match) {
                $attributes[strtolower($match[1])] = $match[3];
            }
        }
        return $attributes;
    }
}
?>