1 # BEGIN BPS TAGGED BLOCK {{{
5 # This software is Copyright (c) 1996-2013 Best Practical Solutions, LLC
6 # <sales@bestpractical.com>
8 # (Except where explicitly superseded by other copyright notices)
13 # This work is made available to you under the terms of Version 2 of
14 # the GNU General Public License. A copy of that license should have
15 # been provided with this software, but in any event can be snarfed
18 # This work is distributed in the hope that it will be useful, but
19 # WITHOUT ANY WARRANTY; without even the implied warranty of
20 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 # General Public License for more details.
23 # You should have received a copy of the GNU General Public License
24 # along with this program; if not, write to the Free Software
25 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
26 # 02110-1301 or visit their web page on the internet at
27 # http://www.gnu.org/licenses/old-licenses/gpl-2.0.html.
30 # CONTRIBUTION SUBMISSION POLICY:
32 # (The following paragraph is not intended to limit the rights granted
33 # to you to modify and distribute this software under the terms of
34 # the GNU General Public License and is only of importance to you if
35 # you choose to contribute your changes and enhancements to the
36 # community by submitting them to Best Practical Solutions, LLC.)
38 # By intentionally submitting any modifications, corrections or
39 # derivatives to this work, or any other work intended for use with
40 # Request Tracker, to Best Practical Solutions, LLC, you confirm that
41 # you are the copyright holder for those contributions and you grant
42 # Best Practical Solutions, LLC a nonexclusive, worldwide, irrevocable,
43 # royalty-free, perpetual, license to use, copy, create derivative
44 # works based on those contributions, and sublicense and distribute
45 # those contributions and any derivatives thereof.
47 # END BPS TAGGED BLOCK }}}
51 RT::I18N - a base class for localization of RT
61 use Locale::Maketext 1.04;
62 use Locale::Maketext::Lexicon 0.25;
63 use base 'Locale::Maketext::Fuzzy';
70 # I decree that this project's first language is English.
73 'TEST_STRING' => 'Concrete Mixer',
75 '__Content-Type' => 'text/plain; charset=utf-8',
78 # That means that lookup failures can't happen -- if we get as far
79 # as looking for something in this lexicon, and we don't find it,
80 # then automagically set $Lexicon{$key} = $key, before possibly
83 # The exception is keys that start with "_" -- they aren't auto-makeable.
90 Initializes the lexicons used for localization.
97 my @lang = RT->Config->Get('LexiconLanguages');
98 @lang = ('*') unless @lang;
100 # load default functions
101 require substr(__FILE__, 0, -3) . '/i_default.pm';
103 # Load language-specific functions
104 foreach my $file ( File::Glob::bsd_glob(substr(__FILE__, 0, -3) . "/*.pm") ) {
105 unless ( $file =~ /^([-\w\s\.\/\\~:]+)$/ ) {
106 warn("$file is tainted. not loading");
111 my ($lang) = ($file =~ /([^\\\/]+?)\.pm$/);
112 next unless grep $_ eq '*' || $_ eq $lang, @lang;
117 foreach my $l ( @lang ) {
119 Gettext => $RT::LexiconPath."/$l.po",
121 push @{ $import{$l} }, map {(Gettext => "$_/$l.po")} RT->PluginDirs('po');
122 push @{ $import{$l} }, (Gettext => $RT::LocalLexiconPath."/*/$l.po",
123 Gettext => $RT::LocalLexiconPath."/$l.po");
126 # Acquire all .po files and iterate them into lexicons
127 Locale::Maketext::Lexicon->import({ _decode => 1, %import });
135 foreach my $k (keys %{RT::I18N::} ) {
136 next if $k eq 'main::';
137 next unless index($k, '::', -2) >= 0;
138 next unless exists ${ 'RT::I18N::'. $k }{'Lexicon'};
140 my $lex = *{ ${'RT::I18N::'. $k }{'Lexicon'} }{HASH};
141 # run fetch to force load
142 my $tmp = $lex->{'foo'};
143 # XXX: untie may fail with "untie attempted
144 # while 1 inner references still exist"
145 # TODO: untie that has to lower fetch impact
146 # untie %$lex if tied %$lex;
152 Returns the encoding of the current lexicon. It is always 'utf-8', the same charset that the lexicon's __Content-Type entry declares.
153 The value is fixed, so there is no fallback to 'ISO-8859-1'.
# Constant accessor: takes no arguments into account and always returns
# the literal string 'utf-8' as the lexicon encoding.
160 sub encoding { 'utf-8' }
163 =head2 SetMIMEEntityToUTF8 $entity
165 A utility function which will try to convert the entity body into UTF-8.
166 It is now a thin wrapper around SetMIMEEntityToEncoding($entity, 'utf-8').
170 sub SetMIMEEntityToUTF8 {
171 RT::I18N::SetMIMEEntityToEncoding(shift, 'utf-8');
176 =head2 IsTextualContentType $type
178 A utility function that determines whether $type is I<textual>, meaning
179 that it can sensibly be converted to Unicode text.
181 Currently, it returns true iff $type matches this regular expression
182 (case-insensitively):
184 ^(?:text/(?:plain|html)|message/rfc822)\b
189 sub IsTextualContentType {
191 ($type =~ m{^(?:text/(?:plain|html)|message/rfc822)\b}i) ? 1 : 0;
195 =head2 SetMIMEEntityToEncoding Entity => ENTITY, Encoding => ENCODING, PreserveWords => BOOL, IsOut => BOOL
197 A utility function which will try to convert the entity body into the specified
198 charset encoding (encoded as octets, *not* unicode-strings). It will
199 iterate over all the entities in $entity, and try to convert each one into
200 the specified charset if its Content-Type is textual (see IsTextualContentType).
202 If PreserveWords is true, MIME-encoded words in the header values are left encoded instead of being decoded (default is false).
204 Incoming and outgoing mails are handled differently. If IsOut is true (default
205 is false), the entity is treated as outgoing mail, otherwise as incoming mail:
209 2) if found then try to convert to utf-8 in croak mode, return if success
211 4) if guessed differently then try to convert to utf-8 in croak mode, return
213 5) mark part as application/octet-stream instead of falling back to any
218 2) if didn't find then do nothing, send as is, let MUA deal with it
219 3) if found then try to convert it to outgoing encoding in croak mode, return
221 4) do nothing otherwise, keep original encoding
223 This function doesn't return anything meaningful.
227 sub SetMIMEEntityToEncoding {
228 my ( $entity, $enc, $preserve_words, $is_out );
231 ( $entity, $enc, $preserve_words ) = @_;
237 PreserveWords => undef,
242 $entity = $args{Entity};
243 $enc = $args{Encoding};
244 $preserve_words = $args{PreserveWords};
245 $is_out = $args{IsOut};
248 unless ( $entity && $enc ) {
249 RT->Logger->error("Missing Entity or Encoding arguments");
253 # do the same for parts first of all
254 SetMIMEEntityToEncoding(
257 PreserveWords => $preserve_words,
259 ) foreach $entity->parts;
261 my $head = $entity->head;
263 my $charset = _FindOrGuessCharset($entity);
265 unless( Encode::find_encoding($charset) ) {
266 $RT::Logger->warning("Encoding '$charset' is not supported");
270 unless ( $charset ) {
271 $head->replace( "X-RT-Original-Content-Type" => $head->mime_attr('Content-Type') );
272 $head->mime_attr('Content-Type' => 'application/octet-stream');
276 SetMIMEHeadToEncoding(
278 From => _FindOrGuessCharset( $entity, 1 ),
280 PreserveWords => $preserve_words,
284 # If this is a textual entity, we'd need to preserve its original encoding
285 $head->replace( "X-RT-Original-Encoding" => $charset )
286 if $head->mime_attr('content-type.charset') or IsTextualContentType($head->mime_type);
288 return unless IsTextualContentType($head->mime_type);
290 my $body = $entity->bodyhandle;
292 if ( $body && ($enc ne $charset || $enc =~ /^utf-?8(?:-strict)?$/i) ) {
293 my $string = $body->as_string or return;
295 $RT::Logger->debug( "Converting '$charset' to '$enc' for "
296 . $head->mime_type . " - "
297 . ( $head->get('subject') || 'Subjectless message' ) );
299 # NOTE:: see the comments at the end of the sub.
300 Encode::_utf8_off($string);
301 my $orig_string = $string;
302 ( my $success, $string ) = EncodeFromToWithCroak( $orig_string, $charset => $enc );
307 my $guess = _GuessCharset($orig_string);
308 if ( $guess && $guess ne $charset ) {
309 $RT::Logger->error( "Encoding error: " . $error . " falling back to Guess($guess) => $enc" );
310 ( $success, $string ) = EncodeFromToWithCroak( $orig_string, $guess, $enc );
311 $error = $string unless $success;
315 $RT::Logger->error( "Encoding error: " . $error . " falling back to application/octet-stream" );
316 $head->mime_attr( "content-type" => 'application/octet-stream' );
321 my $new_body = MIME::Body::InCore->new($string);
323 # set up the new entity
324 $head->mime_attr( "content-type" => 'text/plain' )
325 unless ( $head->mime_attr("content-type") );
326 $head->mime_attr( "content-type.charset" => $enc );
327 $entity->bodyhandle($new_body);
331 # NOTES: Why Encode::_utf8_off before Encode::from_to
333 # All the strings in RT are utf-8 now. Quotes from Encode POD:
335 # [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])
336 # ... The data in $octets must be encoded as octets and not as
337 # characters in Perl's internal format. ...
339 # Not turning off the UTF-8 flag in the string will prevent the string
344 =head2 DecodeMIMEWordsToUTF8 $raw
346 A utility method which mimics MIME::Words::decode_mimewords, but with only
347 limited functionality. This function returns a utf-8 string.
349 It returns the decoded string, or the original string if it's not
350 encoded. Since the subroutine converts specified string into utf-8
351 charset, it should not alter a subject written in English.
353 Why not use MIME::Words directly? Because it fails in RT when I
354 tried. Maybe it's ok now.
358 sub DecodeMIMEWordsToUTF8 {
360 return DecodeMIMEWordsToEncoding($str, 'utf-8', @_);
363 sub DecodeMIMEWordsToEncoding {
365 my $to_charset = _CanonicalizeCharset(shift);
366 my $field = shift || '';
367 $RT::Logger->warning(
368 "DecodeMIMEWordsToEncoding was called without field name."
369 ."It's known to cause troubles with decoding fields properly."
372 # XXX TODO: RT doesn't currently do the right thing with mime-encoded headers
373 # We _should_ be preserving them encoded until after parsing is completed and
374 # THEN undo the mime-encoding.
376 # This routine should be translating the existing mimeencoding to utf8 but leaving
379 # It's legal for headers to contain mime-encoded commas and semicolons which
380 # should not be treated as address separators. (Encoding == quoting here)
382 # until this is fixed, we must escape any string containing a comma or semicolon
383 # this is only a bandaid
385 # Some _other_ MUAs encode quotes _already_, and double quotes
386 # confuse us a lot, so only quote it if it isn't quoted
389 # handle filename*=ISO-8859-1''%74%E9%73%74%2E%74%78%74, parameter value
390 # continuations, and similar syntax from RFC 2231
391 if ($field =~ /^Content-/i) {
392 # This concatenates continued parameters and normalizes encoded params
393 # to QB encoded-words which we handle below
394 my $params = MIME::Field::ParamVal->parse_params($str);
395 foreach my $v ( values %$params ) {
396 $v = _DecodeMIMEWordsToEncoding( $v, $to_charset );
397 # de-quote in case those were hidden inside encoded part
398 $v =~ s/\\(.)/$1/g if $v =~ s/^"(.*)"$/$1/;
400 $str = bless({}, 'MIME::Field::ParamVal')->set($params)->stringify;
402 elsif ( $field =~ /^(?:Resent-)?(?:To|From|B?Cc|Sender|Reply-To)$/i ) {
403 my @addresses = RT::EmailParser->ParseEmailAddress( $str );
404 foreach my $address ( @addresses ) {
405 foreach my $field (qw(phrase comment)) {
406 my $v = $address->$field() or next;
407 $v = _DecodeMIMEWordsToEncoding( $v, $to_charset );
408 if ( $field eq 'phrase' ) {
409 # de-quote in case quoted value were hidden inside encoded part
410 $v =~ s/\\(.)/$1/g if $v =~ s/^"(.*)"$/$1/;
412 $address->$field($v);
415 $str = join ', ', map $_->format, @addresses;
418 $str = _DecodeMIMEWordsToEncoding( $str, $to_charset );
422 # We might have \n without trailing whitespace, which will result in
429 sub _DecodeMIMEWordsToEncoding {
431 my $to_charset = shift;
433 # Pre-parse by removing all whitespace between encoded words
434 my $encoded_word = qr/
437 (?:\*[^?]+)? # optional '*language'
441 ([^?]+) # encoded string
444 $str =~ s/($encoded_word)\s+(?=$encoded_word)/$1/g;
446 # Also merge quoted-printable sections together, in case multiple
447 # octets of a single encoded character were split between chunks.
448 # Though not valid according to RFC 2047, this has been seen in the
450 1 while $str =~ s/(=\?[^?]+\?[Qq]\?)([^?]+)\?=\1([^?]+)\?=/$1$2$3?=/i;
452 # XXX TODO: use decode('MIME-Header', ...) and Encode::Alias to replace our
453 # custom MIME word decoding and charset canonicalization. We can't do this
454 # until we parse before decode, instead of the other way around.
455 my @list = $str =~ m/(.*?) # prefix
459 return $str unless @list;
461 # Append everything that hasn't matched to the last string in the
462 # array. This happens when we have 'key="=?encoded?="; key="plain"'
463 $list[-1] .= substr($str, pos $str);
467 my ($prefix, $charset, $encoding, $enc_str, $trailing) =
469 $charset = _CanonicalizeCharset($charset);
470 $encoding = lc $encoding;
472 $trailing =~ s/\s?\t?$//; # Observed from Outlook Express
474 if ( $encoding eq 'q' ) {
475 use MIME::QuotedPrint;
476 $enc_str =~ tr/_/ /; # Observed from Outlook Express
477 $enc_str = decode_qp($enc_str);
478 } elsif ( $encoding eq 'b' ) {
480 $enc_str = decode_base64($enc_str);
482 $RT::Logger->warning("Incorrect encoding '$encoding' in '$str', "
483 ."only Q(uoted-printable) and B(ase64) are supported");
486 # now we have got a decoded subject, try to convert into the encoding
487 if ( $charset ne $to_charset || $charset =~ /^utf-?8(?:-strict)?$/i ) {
488 if ( Encode::find_encoding($charset) ) {
489 Encode::from_to( $enc_str, $charset, $to_charset );
491 $RT::Logger->warning("Charset '$charset' is not supported");
492 $enc_str =~ s/[^[:print:]]/\357\277\275/g;
493 Encode::from_to( $enc_str, 'UTF-8', $to_charset )
494 unless $to_charset eq 'utf-8';
497 $str .= $prefix . $enc_str . $trailing;
504 =head2 _FindOrGuessCharset MIME::Entity, $head_only
506 When handed a MIME::Entity, this will first attempt to read what charset the message is encoded in. Failing that, it will try to guess the charset from the content (see _GuessCharset).
508 If $head_only is true, the charset is only guessed from the head. This is because the header's encoding (e.g. filename="...") may be different from the body's.
512 sub _FindOrGuessCharset {
514 my $head_only = shift;
515 my $head = $entity->head;
517 if ( my $charset = $head->mime_attr("content-type.charset") ) {
518 return _CanonicalizeCharset($charset);
521 if ( !$head_only and $head->mime_type =~ m{^text/} ) {
522 my $body = $entity->bodyhandle or return;
523 return _GuessCharset( $body->as_string );
527 # potentially binary data -- don't guess the body
528 return _GuessCharset( $head->as_string );
534 =head2 _GuessCharset STRING
536 Uses Encode::Guess (or Encode::Detect::Detector, when the first entry of EmailInputEncodings is '*') to try to figure out the string's encoding.
# Compile-time feature flags for the optional charset-detection modules.
# Each eval block yields 1 when its "require" succeeds; when the "require"
# dies, the eval yields no true value, so the constant tests false.
# "local $@" keeps a load failure from leaking into the surrounding $@.
# Keep these as two separate "use constant" statements: the do-block is
# evaluated in list context, so a failed eval contributes an empty list,
# which would misalign the pairs in the hash form of "use constant".
540 use constant HAS_ENCODE_GUESS => do { local $@; eval { require Encode::Guess; 1 } };
541 use constant HAS_ENCODE_DETECT => do { local $@; eval { require Encode::Detect::Detector; 1 } };
544 my $fallback = _CanonicalizeCharset('iso-8859-1');
546 # if $_[0] is null/empty, we don't guess its encoding
548 unless defined $_[0] && length $_[0];
550 my @encodings = RT->Config->Get('EmailInputEncodings');
551 unless ( @encodings ) {
552 $RT::Logger->warning("No EmailInputEncodings set, fallback to $fallback");
556 if ( $encodings[0] eq '*' ) {
558 if ( HAS_ENCODE_DETECT ) {
559 my $charset = Encode::Detect::Detector::detect( $_[0] );
561 $RT::Logger->debug("Encode::Detect::Detector guessed encoding: $charset");
562 return _CanonicalizeCharset( Encode::resolve_alias( $charset ) );
565 $RT::Logger->debug("Encode::Detect::Detector failed to guess encoding");
570 "You requested to guess encoding, but we couldn't"
571 ." load Encode::Detect::Detector module"
576 unless ( @encodings ) {
577 $RT::Logger->warning("No EmailInputEncodings set except '*', fallback to $fallback");
581 unless ( HAS_ENCODE_GUESS ) {
582 $RT::Logger->error("We couldn't load Encode::Guess module, fallback to $fallback");
586 Encode::Guess->set_suspects( @encodings );
587 my $decoder = Encode::Guess->guess( $_[0] );
588 unless ( defined $decoder ) {
589 $RT::Logger->warning("Encode::Guess failed: decoder is undefined; fallback to $fallback");
593 if ( ref $decoder ) {
594 my $charset = $decoder->name;
595 $RT::Logger->debug("Encode::Guess guessed encoding: $charset");
596 return _CanonicalizeCharset( $charset );
598 elsif ($decoder =~ /(\S+ or .+)/) {
599 my %matched = map { $_ => 1 } split(/ or /, $1);
600 return 'utf-8' if $matched{'utf8'}; # one and only normalization
602 foreach my $suspect (RT->Config->Get('EmailInputEncodings')) {
603 next unless $matched{$suspect};
604 $RT::Logger->debug("Encode::Guess ambiguous ($decoder); using $suspect");
605 return _CanonicalizeCharset( $suspect );
609 $RT::Logger->warning("Encode::Guess failed: $decoder; fallback to $fallback");
615 =head2 _CanonicalizeCharset NAME
617 Canonicalizes the charset name and returns the lowercase version.
618 Special cases are: gb2312 => gbk, utf8 => utf-8
622 sub _CanonicalizeCharset {
623 my $charset = lc shift;
624 return $charset unless $charset;
626 # Canonicalize aliases if they're known
627 if (my $canonical = Encode::resolve_alias($charset)) {
628 $charset = $canonical;
631 if ( $charset eq 'utf8' || $charset eq 'utf-8-strict' ) {
634 elsif ( $charset eq 'euc-cn' ) {
635 # gbk is superset of gb2312/euc-cn so it's safe
637 # XXX TODO: gb18030 is an even larger, more permissive superset of gbk,
638 # but needs Encode::HanExtra installed
646 =head2 SetMIMEHeadToEncoding MIMEHead => HEAD, From => OLD_ENCODING, To => NEW_Encoding, PreserveWords => BOOL, IsOut => BOOL
648 Converts a MIME Head from one encoding to another. This totally violates the RFC.
649 We should never need this. But, Surprise!, MUAs are badly broken and do this kind of stuff
655 sub SetMIMEHeadToEncoding {
656 my ( $head, $charset, $enc, $preserve_words, $is_out );
659 ( $head, $charset, $enc, $preserve_words ) = @_;
666 PreserveWords => undef,
672 $charset = $args{From};
674 $preserve_words = $args{PreserveWords};
675 $is_out = $args{IsOut};
678 unless ( $head && $charset && $enc ) {
680 "Missing Head or From or To arguments");
684 $charset = _CanonicalizeCharset($charset);
685 $enc = _CanonicalizeCharset($enc);
687 return if $charset eq $enc and $preserve_words;
689 foreach my $tag ( $head->tags ) {
690 next unless $tag; # seen in wild: headers with no name
691 my @values = $head->get_all($tag);
693 foreach my $value (@values) {
694 if ( $charset ne $enc || $enc =~ /^utf-?8(?:-strict)?$/i ) {
695 Encode::_utf8_off($value);
696 my $orig_value = $value;
697 ( my $success, $value ) = EncodeFromToWithCroak( $orig_value, $charset => $enc );
701 $value = $orig_value;
702 $head->add( $tag, $value );
706 my $guess = _GuessCharset($orig_value);
707 if ( $guess && $guess ne $charset ) {
708 $RT::Logger->error( "Encoding error: " . $error . " falling back to Guess($guess) => $enc" );
709 ( $success, $value ) = EncodeFromToWithCroak( $orig_value, $guess, $enc );
710 $error = $value unless $success;
714 $RT::Logger->error( "Encoding error: " . $error . " forcing conversion to $charset => $enc" );
715 $value = $orig_value;
716 Encode::from_to( $value, $charset => $enc );
721 $value = DecodeMIMEWordsToEncoding( $value, $enc, $tag )
722 unless $preserve_words;
724 # We intentionally add a leading space when re-adding the
725 # header; Mail::Header strips it before storing, but it
726 # serves to prevent it from "helpfully" canonicalizing
727 # $head->add("Subject", "Subject: foo") into the same as
728 # $head->add("Subject", "foo");
729 $head->add( $tag, " " . $value );
735 =head2 EncodeFromToWithCroak $string, $from, $to
737 Tries to convert the string from encoding $from to encoding $to in croak mode.
739 Returns (1, $encoded_string) on success, otherwise (0, $error).
743 sub EncodeFromToWithCroak {
748 eval { Encode::from_to( $string, $from => $to, Encode::FB_CROAK ); };
749 return $@ ? ( 0, $@ ) : ( 1, $string );
# Invokes the _ImportOverlays class method on RT::Base at file load time.
# NOTE(review): presumably this pulls in site-local overlay code for this
# package -- confirm against RT::Base, which is not visible here.
752 RT::Base->_ImportOverlays();