aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Alex Young <alex@bytemark.co.uk> 2015-04-16 12:32:55 +0100
committer: Alex Young <alex@bytemark.co.uk> 2015-04-16 12:32:55 +0100
commit: 0c20e0fa286fdd948cf941a03a350fe28e2b751e (patch)
tree: 4ab9f0cb3b39afbc256c9be27c858ea9eb0c14a3
parent: affb4ec63d93eb0c9b4e6eec56fb2dc741918e1d (diff)
Use 1.9's Encoding to do away with UTF8-checking in mauve/alert.rb
-rw-r--r-- lib/mauve/alert.rb 25
-rw-r--r-- test/tc_mauve_alert.rb 14
2 files changed, 16 insertions, 23 deletions
diff --git a/lib/mauve/alert.rb b/lib/mauve/alert.rb
index 4cee65f..d27b5d4 100644
--- a/lib/mauve/alert.rb
+++ b/lib/mauve/alert.rb
@@ -1,3 +1,4 @@
+# encoding: utf-8
require 'mauve/proto'
require 'mauve/alert_changed'
require 'mauve/history'
@@ -80,22 +81,6 @@ module Mauve
include DataMapper::Resource
- #
- # If a string matches this regex, it is valid UTF8. This regex is
- # in ASCII-8BIT, so we have to force the encoding of the string to
- # match it.
- #
- UTF8_REGEXP = Regexp.new(/^(?:#{[
- "[\x00-\x7F]", # ASCII
- "[\xC2-\xDF][\x80-\xBF]", # non-overlong 2-byte
- "\xE0[\xA0-\xBF][\x80-\xBF]", # excluding overlongs
- "[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}", # straight 3-byte
- "\xED[\x80-\x9F][\x80-\xBF]", # excluding surrogates
- "\xF0[\x90-\xBF][\x80-\xBF]{2}", # planes 1-3
- "[\xF1-\xF3][\x80-\xBF]{3}", # planes 4-15
- "\xF4[\x80-\x8F][\x80-\xBF]{2}" # plane 16
- ].join("|")})*$/)
-
property :id, Serial
property :alert_id, String, :required => true, :unique_index => :alert_index, :length=>256, :lazy => false
property :source, String, :required => true, :unique_index => :alert_index, :length=>512, :lazy => false
@@ -702,13 +687,7 @@ module Mauve
end
def clean_utf8(str)
- # We're explicitly throwing away non-valid data here.
- forced = str.force_encoding("ASCII-8BIT")
- unless UTF8_REGEXP.match(str)
- str.gsub(/[^\x00-\x7F]/,'?')
- else
- str
- end
+ str.encode("utf-8", :invalid => :replace, :replace => '?', :undef => :replace)
end
# All alerts currently raised
diff --git a/test/tc_mauve_alert.rb b/test/tc_mauve_alert.rb
index 3f22030..7456d20 100644
--- a/test/tc_mauve_alert.rb
+++ b/test/tc_mauve_alert.rb
@@ -1,3 +1,4 @@
+# encoding: utf-8
$:.unshift "../lib"
@@ -494,4 +495,17 @@ EOF
end
+
+ def test_remove_html_utf_8
+ problem_string = "<pre>This is a ûŧđ ™ message.\n\n</pre><hr/>"
+ fixed_string = Alert.remove_html(problem_string)
+ assert_equal "This is a ûŧđ ™ message.", fixed_string.strip
+ end
+
+ def test_remove_html_invalid_character
+ problem_string = "caf\xa9".force_encoding("ascii")
+ fixed_string = Alert.remove_html(problem_string)
+ assert_equal "caf?", fixed_string
+ end
+
end