# Excerpt of lib/mauve/alert.rb (module Mauve) as changed by commit
# 372547b31b51d871f104ae5679e98420a99a12c1,
# "Non-UTF8 stuff is now stripped. Added encoding tests."

#
# If the raw bytes of a string match this regex, it is valid UTF8.
#
# The pattern is byte-oriented (/n), so it must be matched against a
# binary view of the string. It is anchored with \A and \z rather than
# ^ and $: the latter match per line, which would let a string through as
# soon as any single line of it was well formed.
#
UTF8_REGEXP = /\A(?:
    [\x00-\x7F]                        # ASCII
  | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
  | \xE0[\xA0-\xBF][\x80-\xBF]         # excluding overlongs
  | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
  | \xED[\x80-\x9F][\x80-\xBF]         # excluding surrogates
  | \xF0[\x90-\xBF][\x80-\xBF]{2}      # planes 1-3
  | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
  | \xF4[\x80-\x8F][\x80-\xBF]{2}      # plane 16
)*\z/nx

# Cleans a string for use as HTML: the string is first passed through
# clean_utf8, then handed to remove_html with Sanitize's RELAXED
# configuration merged with :remove_contents => true.
#
# @param [String] str String to clean
# @return [String]
def clean_html(str)
  str = clean_utf8(str)
  remove_html(str, Sanitize::Config::RELAXED.merge({:remove_contents => true}))
end

# Makes sure a string can safely be treated as UTF8.
#
# A string whose bytes already form valid UTF8 is returned untouched.
# Otherwise every byte outside the ASCII range is replaced with "?", so the
# result is always valid, if lossy.
#
# @param [String] str String to check
# @return [String]
def clean_utf8(str)
  # Strings that know about encodings (Ruby 1.9+) have to be viewed as raw
  # bytes before a byte-oriented regex can be applied to them; matching
  # possibly-malformed text with an encoding-aware regex can raise.
  raw = str.respond_to?(:force_encoding) ? str.dup.force_encoding("BINARY") : str

  return str if raw =~ UTF8_REGEXP

  cleaned = raw.gsub(/[^\x00-\x7F]/n, "?")

  # Only ASCII is left at this point, so re-tagging as UTF-8 is always valid.
  cleaned.respond_to?(:force_encoding) ? cleaned.force_encoding("UTF-8") : cleaned
end