From 0c20e0fa286fdd948cf941a03a350fe28e2b751e Mon Sep 17 00:00:00 2001 From: Alex Young Date: Thu, 16 Apr 2015 12:32:55 +0100 Subject: Use 1.9's Encoding to do away with UTF8-checking in mauve/alert.rb --- lib/mauve/alert.rb | 25 ++----------------------- test/tc_mauve_alert.rb | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 23 deletions(-) diff --git a/lib/mauve/alert.rb b/lib/mauve/alert.rb index 4cee65f..d27b5d4 100644 --- a/lib/mauve/alert.rb +++ b/lib/mauve/alert.rb @@ -1,3 +1,4 @@ +# encoding: utf-8 require 'mauve/proto' require 'mauve/alert_changed' require 'mauve/history' @@ -80,22 +81,6 @@ module Mauve include DataMapper::Resource - # - # If a string matches this regex, it is valid UTF8. This regex is - # in ASCII-8BIT, so we have to force the encoding of the string to - # match it. - # - UTF8_REGEXP = Regexp.new(/^(?:#{[ - "[\x00-\x7F]", # ASCII - "[\xC2-\xDF][\x80-\xBF]", # non-overlong 2-byte - "\xE0[\xA0-\xBF][\x80-\xBF]", # excluding overlongs - "[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}", # straight 3-byte - "\xED[\x80-\x9F][\x80-\xBF]", # excluding surrogates - "\xF0[\x90-\xBF][\x80-\xBF]{2}", # planes 1-3 - "[\xF1-\xF3][\x80-\xBF]{3}", # planes 4-15 - "\xF4[\x80-\x8F][\x80-\xBF]{2}" # plane 16 - ].join("|")})*$/) - property :id, Serial property :alert_id, String, :required => true, :unique_index => :alert_index, :length=>256, :lazy => false property :source, String, :required => true, :unique_index => :alert_index, :length=>512, :lazy => false @@ -702,13 +687,7 @@ module Mauve end def clean_utf8(str) - # We're explicitly throwing away non-valid data here. 
- forced = str.force_encoding("ASCII-8BIT") - unless UTF8_REGEXP.match(str) - str.gsub(/[^\x00-\x7F]/,'?') - else - str - end + str.encode("utf-8", :invalid => :replace, :replace => '?', :undef => :replace) end # All alerts currently raised diff --git a/test/tc_mauve_alert.rb b/test/tc_mauve_alert.rb index 3f22030..7456d20 100644 --- a/test/tc_mauve_alert.rb +++ b/test/tc_mauve_alert.rb @@ -1,3 +1,4 @@ +# encoding: utf-8 $:.unshift "../lib" @@ -494,4 +495,17 @@ EOF end + + def test_remove_html_utf_8 + problem_string = "
This is a ûŧđ ™ message.\n\n

" + fixed_string = Alert.remove_html(problem_string) + assert_equal "This is a ûŧđ ™ message.", fixed_string.strip + end + + def test_remove_html_invalid_character + problem_string = "caf\xa9".force_encoding("ascii") + fixed_string = Alert.remove_html(problem_string) + assert_equal "caf?", fixed_string + end + end -- cgit v1.2.1