[usit-rt.git] / lib / RT / I18N.pm

# BEGIN BPS TAGGED BLOCK {{{
#
# COPYRIGHT:
#
# This software is Copyright (c) 1996-2012 Best Practical Solutions, LLC
#                                          <sales@bestpractical.com>
#
# (Except where explicitly superseded by other copyright notices)
#
#
# LICENSE:
#
# This work is made available to you under the terms of Version 2 of
# the GNU General Public License. A copy of that license should have
# been provided with this software, but in any event can be snarfed
# from www.gnu.org.
#
# This work is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301 or visit their web page on the internet at
# http://www.gnu.org/licenses/old-licenses/gpl-2.0.html.
#
#
# CONTRIBUTION SUBMISSION POLICY:
#
# (The following paragraph is not intended to limit the rights granted
# to you to modify and distribute this software under the terms of
# the GNU General Public License and is only of importance to you if
# you choose to contribute your changes and enhancements to the
# community by submitting them to Best Practical Solutions, LLC.)
#
# By intentionally submitting any modifications, corrections or
# derivatives to this work, or any other work intended for use with
# Request Tracker, to Best Practical Solutions, LLC, you confirm that
# you are the copyright holder for those contributions and you grant
# Best Practical Solutions,  LLC a nonexclusive, worldwide, irrevocable,
# royalty-free, perpetual, license to use, copy, create derivative
# works based on those contributions, and sublicense and distribute
# those contributions and any derivatives thereof.
#
# END BPS TAGGED BLOCK }}}

=head1 NAME

RT::I18N - a base class for localization of RT

=cut

package RT::I18N;

use strict;
use warnings;


use Locale::Maketext 1.04;
use Locale::Maketext::Lexicon 0.25;
use base 'Locale::Maketext::Fuzzy';

use Encode;
use MIME::Entity;
use MIME::Head;
use File::Glob;

# I decree that this project's first language is English.

our %Lexicon = (
   'TEST_STRING' => 'Concrete Mixer',

    '__Content-Type' => 'text/plain; charset=utf-8',

  '_AUTO' => 1,
  # That means that lookup failures can't happen -- if we get as far
  #  as looking for something in this lexicon, and we don't find it,
  #  then automagically set $Lexicon{$key} = $key, before possibly
  #  compiling it.
  
  # The exception is keys that start with "_" -- they aren't auto-makeable.

);
# End of lexicon.

=head2 Init

Initializes the lexicons used for localization.


=cut

sub Init {

    my @lang = RT->Config->Get('LexiconLanguages');
    @lang = ('*') unless @lang;

    # load default functions
    require substr(__FILE__, 0, -3) . '/i_default.pm';

    # Load language-specific functions
    foreach my $file ( File::Glob::bsd_glob(substr(__FILE__, 0, -3) . "/*.pm") ) {
        unless ( $file =~ /^([-\w\s\.\/\\~:]+)$/ ) {
            warn("$file is tainted. not loading");
            next;
        }
        $file = $1;

        my ($lang) = ($file =~ /([^\\\/]+?)\.pm$/);
        next unless grep $_ eq '*' || $_ eq $lang, @lang;
        require $file;
    }

    my %import;
    foreach my $l ( @lang ) {
        $import{$l} = [
            Gettext => $RT::LexiconPath."/$l.po",
        ];
        push @{ $import{$l} }, map {(Gettext => "$_/$l.po")} RT->PluginDirs('po');
        push @{ $import{$l} }, (Gettext => $RT::LocalLexiconPath."/*/$l.po",
                                Gettext => $RT::LocalLexiconPath."/$l.po");
    }

    # Acquire all .po files and iterate them into lexicons
    Locale::Maketext::Lexicon->import({ _decode => 1, %import });

    return 1;
}

sub LoadLexicons {

    no strict 'refs';
    foreach my $k (keys %{RT::I18N::} ) {
        next if $k eq 'main::';
        next unless index($k, '::', -2) >= 0;
        next unless exists ${ 'RT::I18N::'. $k }{'Lexicon'};

        my $lex = *{ ${'RT::I18N::'. $k }{'Lexicon'} }{HASH};
        # run fetch to force load
        my $tmp = $lex->{'foo'};
        # XXX: untie may fail with "untie attempted
        # while 1 inner references still exist"
        # TODO: untie that has to lower fetch impact
        # untie %$lex if tied %$lex;
    }
}

=head2 encoding

Returns the encoding of the current lexicon, as yanked out of __ContentType's "charset" field.
If it can't find anything, it returns 'ISO-8859-1'


=cut


sub encoding { 'utf-8' }


=head2 SetMIMEEntityToUTF8 $entity

An utility function which will try to convert entity body into utf8.
It's now a wrap-up of SetMIMEEntityToEncoding($entity, 'utf-8').

=cut

sub SetMIMEEntityToUTF8 {
    RT::I18N::SetMIMEEntityToEncoding(shift, 'utf-8');
}


=head2 IsTextualContentType $type

An utility function that determines whether $type is I<textual>, meaning
that it can sensibly be converted to Unicode text.

Currently, it returns true iff $type matches this regular expression
(case-insensitively):

    ^(?:text/(?:plain|html)|message/rfc822)\b


=cut

sub IsTextualContentType {
    my $type = shift;
    ($type =~ m{^(?:text/(?:plain|html)|message/rfc822)\b}i) ? 1 : 0;
}


=head2 SetMIMEEntityToEncoding $entity, $encoding

An utility function which will try to convert entity body into specified
charset encoding (encoded as octets, *not* unicode-strings).  It will
iterate all the entities in $entity, and try to convert each one into
specified charset if whose Content-Type is 'text/plain'.

This function doesn't return anything meaningful.

=cut

sub SetMIMEEntityToEncoding {
    my ( $entity, $enc, $preserve_words ) = ( shift, shift, shift );

    # do the same for parts first of all
    SetMIMEEntityToEncoding( $_, $enc, $preserve_words ) foreach $entity->parts;

    my $charset = _FindOrGuessCharset($entity) or return;

    SetMIMEHeadToEncoding(
	$entity->head,
	_FindOrGuessCharset($entity, 1) => $enc,
	$preserve_words
    );

    my $head = $entity->head;

    # If this is a textual entity, we'd need to preserve its original encoding
    $head->replace( "X-RT-Original-Encoding" => $charset )
	if $head->mime_attr('content-type.charset') or IsTextualContentType($head->mime_type);

    return unless IsTextualContentType($head->mime_type);

    my $body = $entity->bodyhandle;

    if ( $body && ($enc ne $charset || $enc =~ /^utf-?8(?:-strict)?$/i) ) {
        my $string = $body->as_string or return;

        $RT::Logger->debug( "Converting '$charset' to '$enc' for "
              . $head->mime_type . " - "
              . ( $head->get('subject') || 'Subjectless message' ) );

        # NOTE:: see the comments at the end of the sub.
        Encode::_utf8_off($string);
        Encode::from_to( $string, $charset => $enc );

        my $new_body = MIME::Body::InCore->new($string);

        # set up the new entity
        $head->mime_attr( "content-type" => 'text/plain' )
          unless ( $head->mime_attr("content-type") );
        $head->mime_attr( "content-type.charset" => $enc );
        $entity->bodyhandle($new_body);
    }
}

# NOTES:  Why Encode::_utf8_off before Encode::from_to
#
# All the strings in RT are utf-8 now.  Quotes from Encode POD:
#
# [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])
# ... The data in $octets must be encoded as octets and not as
# characters in Perl's internal format. ...
#
# Not turning off the UTF-8 flag in the string will prevent the string
# from conversion.


=head2 DecodeMIMEWordsToUTF8 $raw

An utility method which mimics MIME::Words::decode_mimewords, but only
limited functionality.  This function returns an utf-8 string.

It returns the decoded string, or the original string if it's not
encoded.  Since the subroutine converts specified string into utf-8
charset, it should not alter a subject written in English.

Why not use MIME::Words directly?  Because it fails in RT when I
tried.  Maybe it's ok now.

=cut

sub DecodeMIMEWordsToUTF8 {
    my $str = shift;
    return DecodeMIMEWordsToEncoding($str, 'utf-8', @_);
}

sub DecodeMIMEWordsToEncoding {
    my $str = shift;
    my $to_charset = _CanonicalizeCharset(shift);
    my $field = shift || '';

    # handle filename*=ISO-8859-1''%74%E9%73%74%2E%74%78%74, parameter value
    # continuations, and similar syntax from RFC 2231
    if ($field =~ /^Content-(Type|Disposition)/i) {
        # This concatenates continued parameters and normalizes encoded params
        # to QB encoded-words which we handle below
        $str = MIME::Field::ParamVal->parse($str)->stringify;
    }

    # XXX TODO: use decode('MIME-Header', ...) and Encode::Alias to replace our
    # custom MIME word decoding and charset canonicalization.  We can't do this
    # until we parse before decode, instead of the other way around.
    my @list = $str =~ m/(.*?)          # prefix
                         =\?            # =?
                         ([^?]+?)       # charset
                         (?:\*[^?]+)?   # optional '*language'
                         \?             # ?
                         ([QqBb])       # encoding
                         \?             # ?
                         ([^?]+)        # encoded string
                         \?=            # ?=
                         ([^=]*)        # trailing
                        /xgcs;

    if ( @list ) {
        # add everything that hasn't matched to the end of the latest
        # string in array this happen when we have 'key="=?encoded?="; key="plain"'
        $list[-1] .= substr($str, pos $str);

        $str = "";
        while (@list) {
            my ($prefix, $charset, $encoding, $enc_str, $trailing) =
                    splice @list, 0, 5;
            $charset  = _CanonicalizeCharset($charset);
            $encoding = lc $encoding;

            $trailing =~ s/\s?\t?$//;               # Observed from Outlook Express

            if ( $encoding eq 'q' ) {
                use MIME::QuotedPrint;
                $enc_str =~ tr/_/ /;		# Observed from Outlook Express
                $enc_str = decode_qp($enc_str);
            } elsif ( $encoding eq 'b' ) {
                use MIME::Base64;
                $enc_str = decode_base64($enc_str);
            } else {
                $RT::Logger->warning("Incorrect encoding '$encoding' in '$str', "
                    ."only Q(uoted-printable) and B(ase64) are supported");
            }

            # now we have got a decoded subject, try to convert into the encoding
            if ( $charset ne $to_charset || $charset =~ /^utf-?8(?:-strict)?$/i ) {
                Encode::from_to( $enc_str, $charset, $to_charset );
            }

            # XXX TODO: RT doesn't currently do the right thing with mime-encoded headers
            # We _should_ be preserving them encoded until after parsing is completed and
            # THEN undo the mime-encoding.
            #
            # This routine should be translating the existing mimeencoding to utf8 but leaving
            # things encoded.
            #
            # It's legal for headers to contain mime-encoded commas and semicolons which
            # should not be treated as address separators. (Encoding == quoting here)
            #
            # until this is fixed, we must escape any string containing a comma or semicolon
            # this is only a bandaid

            # Some _other_ MUAs encode quotes _already_, and double quotes
            # confuse us a lot, so only quote it if it isn't quoted
            # already.
            $enc_str = qq{"$enc_str"}
                if $enc_str =~ /[,;]/
                and $enc_str !~ /^".*"$/
                and (!$field || $field =~ /^(?:To$|From$|B?Cc$|Content-)/i);

            $str .= $prefix . $enc_str . $trailing;
        }
    }

    # We might have \n without trailing whitespace, which will result in
    # invalid headers.
    $str =~ s/\n//g;

    return ($str)
}


=head2 _FindOrGuessCharset MIME::Entity, $head_only

When handed a MIME::Entity will first attempt to read what charset the message is encoded in. Failing that, will use Encode::Guess to try to figure it out

If $head_only is true, only guesses charset for head parts.  This is because header's encoding (e.g. filename="...") may be different from that of body's.

=cut

sub _FindOrGuessCharset {
    my $entity = shift;
    my $head_only = shift;
    my $head = $entity->head;

    if ( my $charset = $head->mime_attr("content-type.charset") ) {
        return _CanonicalizeCharset($charset);
    }

    if ( !$head_only and $head->mime_type =~ m{^text/} ) {
        my $body = $entity->bodyhandle or return;
        return _GuessCharset( $body->as_string );
    }
    else {

        # potentially binary data -- don't guess the body
        return _GuessCharset( $head->as_string );
    }
}


=head2 _GuessCharset STRING

use Encode::Guess to try to figure it out the string's encoding.

=cut

use constant HAS_ENCODE_GUESS => do { local $@; eval { require Encode::Guess; 1 } };
use constant HAS_ENCODE_DETECT => do { local $@; eval { require Encode::Detect::Detector; 1 } };

sub _GuessCharset {
    my $fallback = _CanonicalizeCharset('iso-8859-1');

    # if $_[0] is null/empty, we don't guess its encoding
    return $fallback
        unless defined $_[0] && length $_[0];

    my @encodings = RT->Config->Get('EmailInputEncodings');
    unless ( @encodings ) {
        $RT::Logger->warning("No EmailInputEncodings set, fallback to $fallback");
        return $fallback;
    }

    if ( $encodings[0] eq '*' ) {
        shift @encodings;
        if ( HAS_ENCODE_DETECT ) {
            my $charset = Encode::Detect::Detector::detect( $_[0] );
            if ( $charset ) {
                $RT::Logger->debug("Encode::Detect::Detector guessed encoding: $charset");
                return _CanonicalizeCharset( Encode::resolve_alias( $charset ) );
            }
            else {
                $RT::Logger->debug("Encode::Detect::Detector failed to guess encoding");
            }
        }
        else {
	    $RT::Logger->error(
                "You requested to guess encoding, but we couldn't"
                ." load Encode::Detect::Detector module"
            );
        }
    }

    unless ( @encodings ) {
        $RT::Logger->warning("No EmailInputEncodings set except '*', fallback to $fallback");
        return $fallback;
    }

    unless ( HAS_ENCODE_GUESS ) {
        $RT::Logger->error("We couldn't load Encode::Guess module, fallback to $fallback");
        return $fallback;
    }

    Encode::Guess->set_suspects( @encodings );
    my $decoder = Encode::Guess->guess( $_[0] );
    unless ( defined $decoder ) {
        $RT::Logger->warning("Encode::Guess failed: decoder is undefined; fallback to $fallback");
        return $fallback;
    }

    if ( ref $decoder ) {
        my $charset = $decoder->name;
        $RT::Logger->debug("Encode::Guess guessed encoding: $charset");
        return _CanonicalizeCharset( $charset );
    }
    elsif ($decoder =~ /(\S+ or .+)/) {
        my %matched = map { $_ => 1 } split(/ or /, $1);
        return 'utf-8' if $matched{'utf8'}; # one and only normalization

        foreach my $suspect (RT->Config->Get('EmailInputEncodings')) {
            next unless $matched{$suspect};
            $RT::Logger->debug("Encode::Guess ambiguous ($decoder); using $suspect");
            return _CanonicalizeCharset( $suspect );
        }
    }
    else {
        $RT::Logger->warning("Encode::Guess failed: $decoder; fallback to $fallback");
    }

    return $fallback;
}

=head2 _CanonicalizeCharset NAME

canonicalize charset, return lowercase version.
special cases are: gb2312 => gbk, utf8 => utf-8

=cut

sub _CanonicalizeCharset {
    my $charset = lc shift;
    return $charset unless $charset;

    # Canonicalize aliases if they're known
    if (my $canonical = Encode::resolve_alias($charset)) {
        $charset = $canonical;
    }

    if ( $charset eq 'utf8' || $charset eq 'utf-8-strict' ) {
        return 'utf-8';
    }
    elsif ( $charset eq 'euc-cn' ) {
        # gbk is superset of gb2312/euc-cn so it's safe
        return 'gbk';
        # XXX TODO: gb18030 is an even larger, more permissive superset of gbk,
        # but needs Encode::HanExtra installed
    }
    else {
        return $charset;
    }
}


=head2 SetMIMEHeadToEncoding HEAD OLD_CHARSET NEW_CHARSET

Converts a MIME Head from one encoding to another. This totally violates the RFC.
We should never need this. But, Surprise!, MUAs are badly broken and do this kind of stuff
all the time


=cut

sub SetMIMEHeadToEncoding {
    my ( $head, $charset, $enc, $preserve_words ) = ( shift, shift, shift, shift );

    $charset = _CanonicalizeCharset($charset);
    $enc     = _CanonicalizeCharset($enc);

    return if $charset eq $enc and $preserve_words;

    foreach my $tag ( $head->tags ) {
        next unless $tag; # seen in wild: headers with no name
        my @values = $head->get_all($tag);
        $head->delete($tag);
        foreach my $value (@values) {
            if ( $charset ne $enc || $enc =~ /^utf-?8(?:-strict)?$/i ) {
                Encode::_utf8_off($value);
                Encode::from_to( $value, $charset => $enc );
            }
            $value = DecodeMIMEWordsToEncoding( $value, $enc, $tag )
                unless $preserve_words;

            # We intentionally add a leading space when re-adding the
            # header; Mail::Header strips it before storing, but it
            # serves to prevent it from "helpfully" canonicalizing
            # $head->add("Subject", "Subject: foo") into the same as
            # $head->add("Subject", "foo");
            $head->add( $tag, " " . $value );
        }
    }

}

RT::Base->_ImportOverlays();

1;  # End of module.
Commit	Line	Data
84fb5b46 MKG	1	# BEGIN BPS TAGGED BLOCK {{{
	2	#
	3	# COPYRIGHT:
	4	#
	5	# This software is Copyright (c) 1996-2012 Best Practical Solutions, LLC
	6	# <sales@bestpractical.com>
	7	#
	8	# (Except where explicitly superseded by other copyright notices)
	9	#
	10	#
	11	# LICENSE:
	12	#
	13	# This work is made available to you under the terms of Version 2 of
	14	# the GNU General Public License. A copy of that license should have
	15	# been provided with this software, but in any event can be snarfed
	16	# from www.gnu.org.
	17	#
	18	# This work is distributed in the hope that it will be useful, but
	19	# WITHOUT ANY WARRANTY; without even the implied warranty of
	20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	21	# General Public License for more details.
	22	#
	23	# You should have received a copy of the GNU General Public License
	24	# along with this program; if not, write to the Free Software
	25	# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
	26	# 02110-1301 or visit their web page on the internet at
	27	# http://www.gnu.org/licenses/old-licenses/gpl-2.0.html.
	28	#
	29	#
	30	# CONTRIBUTION SUBMISSION POLICY:
	31	#
	32	# (The following paragraph is not intended to limit the rights granted
	33	# to you to modify and distribute this software under the terms of
	34	# the GNU General Public License and is only of importance to you if
	35	# you choose to contribute your changes and enhancements to the
	36	# community by submitting them to Best Practical Solutions, LLC.)
	37	#
	38	# By intentionally submitting any modifications, corrections or
	39	# derivatives to this work, or any other work intended for use with
	40	# Request Tracker, to Best Practical Solutions, LLC, you confirm that
	41	# you are the copyright holder for those contributions and you grant
	42	# Best Practical Solutions, LLC a nonexclusive, worldwide, irrevocable,
	43	# royalty-free, perpetual, license to use, copy, create derivative
	44	# works based on those contributions, and sublicense and distribute
	45	# those contributions and any derivatives thereof.
	46	#
	47	# END BPS TAGGED BLOCK }}}
	48
	49	=head1 NAME
	50
	51	RT::I18N - a base class for localization of RT
	52
	53	=cut
	54
	55	package RT::I18N;
	56
	57	use strict;
	58	use warnings;
	59
	60
	61	use Locale::Maketext 1.04;
	62	use Locale::Maketext::Lexicon 0.25;
	63	use base 'Locale::Maketext::Fuzzy';
	64
65	use Encode;
66	use MIME::Entity;
67	use MIME::Head;
68	use File::Glob;
69
70	# I decree that this project's first language is English.
71
72	our %Lexicon = (
73	'TEST_STRING' => 'Concrete Mixer',
74
75	'__Content-Type' => 'text/plain; charset=utf-8',
76
77	'_AUTO' => 1,
78	# That means that lookup failures can't happen -- if we get as far
79	# as looking for something in this lexicon, and we don't find it,
80	# then automagically set $Lexicon{$key} = $key, before possibly
81	# compiling it.
82
83	# The exception is keys that start with "_" -- they aren't auto-makeable.
84
85	);
86	# End of lexicon.
87
88	=head2 Init
89
90	Initializes the lexicons used for localization.
91
92
93	=cut
94
95	sub Init {
96
97	my @lang = RT->Config->Get('LexiconLanguages');
98	@lang = ('*') unless @lang;
99
100	# load default functions
101	require substr(__FILE__, 0, -3) . '/i_default.pm';
102
103	# Load language-specific functions
104	foreach my $file ( File::Glob::bsd_glob(substr(__FILE__, 0, -3) . "/*.pm") ) {
105	unless ( $file =~ /^([-\w\s\.\/\\~:]+)$/ ) {
106	warn("$file is tainted. not loading");
107	next;
108	}
109	$file = $1;
110
111	my ($lang) = ($file =~ /([^\\\/]+?)\.pm$/);
112	next unless grep $_ eq '*' \|\| $_ eq $lang, @lang;
113	require $file;
114	}
115
116	my %import;
117	foreach my $l ( @lang ) {
118	$import{$l} = [
119	Gettext => $RT::LexiconPath."/$l.po",
120	];
121	push @{ $import{$l} }, map {(Gettext => "$_/$l.po")} RT->PluginDirs('po');
122	push @{ $import{$l} }, (Gettext => $RT::LocalLexiconPath."/*/$l.po",
123	Gettext => $RT::LocalLexiconPath."/$l.po");
124	}
125
126	# Acquire all .po files and iterate them into lexicons
127	Locale::Maketext::Lexicon->import({ _decode => 1, %import });
128
129	return 1;
130	}
131
132	sub LoadLexicons {
133
134	no strict 'refs';
135	foreach my $k (keys %{RT::I18N::} ) {
136	next if $k eq 'main::';
137	next unless index($k, '::', -2) >= 0;
138	next unless exists ${ 'RT::I18N::'. $k }{'Lexicon'};
139
140	my $lex = *{ ${'RT::I18N::'. $k }{'Lexicon'} }{HASH};
141	# run fetch to force load
142	my $tmp = $lex->{'foo'};
143	# XXX: untie may fail with "untie attempted
144	# while 1 inner references still exist"
145	# TODO: untie that has to lower fetch impact
146	# untie %$lex if tied %$lex;
147	}
148	}
149
150	=head2 encoding
151
152	Returns the encoding of the current lexicon, as yanked out of __ContentType's "charset" field.
153	If it can't find anything, it returns 'ISO-8859-1'
154
155
156
157	=cut
158
159
160	sub encoding { 'utf-8' }
161
162
163	=head2 SetMIMEEntityToUTF8 $entity
164
165	An utility function which will try to convert entity body into utf8.
166	It's now a wrap-up of SetMIMEEntityToEncoding($entity, 'utf-8').
167
168	=cut
169
170	sub SetMIMEEntityToUTF8 {
171	RT::I18N::SetMIMEEntityToEncoding(shift, 'utf-8');
172	}
173
174
175
176	=head2 IsTextualContentType $type
177
178	An utility function that determines whether $type is I<textual>, meaning
179	that it can sensibly be converted to Unicode text.
180
181	Currently, it returns true iff $type matches this regular expression
182	(case-insensitively):
183
184	^(?:text/(?:plain\|html)\|message/rfc822)\b
185
186
187	=cut
188
189	sub IsTextualContentType {
190	my $type = shift;
191	($type =~ m{^(?:text/(?:plain\|html)\|message/rfc822)\b}i) ? 1 : 0;
192	}
193
194
195	=head2 SetMIMEEntityToEncoding $entity, $encoding
196
197	An utility function which will try to convert entity body into specified
198	charset encoding (encoded as octets, not unicode-strings). It will
199	iterate all the entities in $entity, and try to convert each one into
200	specified charset if whose Content-Type is 'text/plain'.
201
202	This function doesn't return anything meaningful.
203
204	=cut
205
206	sub SetMIMEEntityToEncoding {
207	my ( $entity, $enc, $preserve_words ) = ( shift, shift, shift );
208
209	# do the same for parts first of all
210	SetMIMEEntityToEncoding( $_, $enc, $preserve_words ) foreach $entity->parts;
211
212	my $charset = _FindOrGuessCharset($entity) or return;
213
214	SetMIMEHeadToEncoding(
215	$entity->head,
216	_FindOrGuessCharset($entity, 1) => $enc,
217	$preserve_words
218	);
219
220	my $head = $entity->head;
221
84fb5b46 MKG	222	# If this is a textual entity, we'd need to preserve its original encoding
	223	$head->replace( "X-RT-Original-Encoding" => $charset )
	224	if $head->mime_attr('content-type.charset') or IsTextualContentType($head->mime_type);
	225
	226	return unless IsTextualContentType($head->mime_type);
	227
	228	my $body = $entity->bodyhandle;
	229
dab09ea8	230	if ( $body && ($enc ne $charset \|\| $enc =~ /^utf-?8(?:-strict)?$/i) ) {
84fb5b46 MKG	231	my $string = $body->as_string or return;
	232
	233	$RT::Logger->debug( "Converting '$charset' to '$enc' for "
	234	. $head->mime_type . " - "
	235	. ( $head->get('subject') \|\| 'Subjectless message' ) );
	236
	237	# NOTE:: see the comments at the end of the sub.
	238	Encode::_utf8_off($string);
	239	Encode::from_to( $string, $charset => $enc );
	240
	241	my $new_body = MIME::Body::InCore->new($string);
	242
	243	# set up the new entity
	244	$head->mime_attr( "content-type" => 'text/plain' )
	245	unless ( $head->mime_attr("content-type") );
	246	$head->mime_attr( "content-type.charset" => $enc );
	247	$entity->bodyhandle($new_body);
	248	}
	249	}
	250
	251	# NOTES: Why Encode::_utf8_off before Encode::from_to
	252	#
	253	# All the strings in RT are utf-8 now. Quotes from Encode POD:
	254	#
	255	# [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])
	256	# ... The data in $octets must be encoded as octets and not as
	257	# characters in Perl's internal format. ...
	258	#
	259	# Not turning off the UTF-8 flag in the string will prevent the string
	260	# from conversion.
	261
	262
	263
	264	=head2 DecodeMIMEWordsToUTF8 $raw
	265
	266	An utility method which mimics MIME::Words::decode_mimewords, but only
	267	limited functionality. This function returns an utf-8 string.
	268
	269	It returns the decoded string, or the original string if it's not
	270	encoded. Since the subroutine converts specified string into utf-8
	271	charset, it should not alter a subject written in English.
	272
	273	Why not use MIME::Words directly? Because it fails in RT when I
	274	tried. Maybe it's ok now.
	275
	276	=cut
	277
	278	sub DecodeMIMEWordsToUTF8 {
	279	my $str = shift;
	280	return DecodeMIMEWordsToEncoding($str, 'utf-8', @_);
	281	}
	282
	283	sub DecodeMIMEWordsToEncoding {
	284	my $str = shift;
	285	my $to_charset = _CanonicalizeCharset(shift);
	286	my $field = shift \|\| '';
	287
b5747ff2 MKG	288	# handle filename*=ISO-8859-1''%74%E9%73%74%2E%74%78%74, parameter value
	289	# continuations, and similar syntax from RFC 2231
	290	if ($field =~ /^Content-(Type\|Disposition)/i) {
	291	# This concatenates continued parameters and normalizes encoded params
	292	# to QB encoded-words which we handle below
	293	$str = MIME::Field::ParamVal->parse($str)->stringify;
	294	}
	295
	296	# XXX TODO: use decode('MIME-Header', ...) and Encode::Alias to replace our
	297	# custom MIME word decoding and charset canonicalization. We can't do this
	298	# until we parse before decode, instead of the other way around.
	299	my @list = $str =~ m/(.*?) # prefix
	300	=\? # =?
	301	([^?]+?) # charset
	302	(?:\[^?]+)? # optional 'language'
	303	\? # ?
	304	([QqBb]) # encoding
	305	\? # ?
	306	([^?]+) # encoded string
	307	\?= # ?=
	308	([^=]*) # trailing
	309	/xgcs;
84fb5b46 MKG	310
	311	if ( @list ) {
	312	# add everything that hasn't matched to the end of the latest
	313	# string in array this happen when we have 'key="=?encoded?="; key="plain"'
	314	$list[-1] .= substr($str, pos $str);
	315
	316	$str = "";
	317	while (@list) {
	318	my ($prefix, $charset, $encoding, $enc_str, $trailing) =
	319	splice @list, 0, 5;
	320	$charset = _CanonicalizeCharset($charset);
	321	$encoding = lc $encoding;
	322
	323	$trailing =~ s/\s?\t?$//; # Observed from Outlook Express
	324
	325	if ( $encoding eq 'q' ) {
	326	use MIME::QuotedPrint;
	327	$enc_str =~ tr/_/ /; # Observed from Outlook Express
	328	$enc_str = decode_qp($enc_str);
	329	} elsif ( $encoding eq 'b' ) {
	330	use MIME::Base64;
	331	$enc_str = decode_base64($enc_str);
	332	} else {
	333	$RT::Logger->warning("Incorrect encoding '$encoding' in '$str', "
	334	."only Q(uoted-printable) and B(ase64) are supported");
	335	}
	336
	337	# now we have got a decoded subject, try to convert into the encoding
dab09ea8	338	if ( $charset ne $to_charset \|\| $charset =~ /^utf-?8(?:-strict)?$/i ) {
84fb5b46 MKG	339	Encode::from_to( $enc_str, $charset, $to_charset );
	340	}
	341
	342	# XXX TODO: RT doesn't currently do the right thing with mime-encoded headers
	343	# We _should_ be preserving them encoded until after parsing is completed and
	344	# THEN undo the mime-encoding.
	345	#
	346	# This routine should be translating the existing mimeencoding to utf8 but leaving
	347	# things encoded.
	348	#
	349	# It's legal for headers to contain mime-encoded commas and semicolons which
	350	# should not be treated as address separators. (Encoding == quoting here)
	351	#
	352	# until this is fixed, we must escape any string containing a comma or semicolon
	353	# this is only a bandaid
	354
	355	# Some _other_ MUAs encode quotes _already_, and double quotes
	356	# confuse us a lot, so only quote it if it isn't quoted
	357	# already.
	358	$enc_str = qq{"$enc_str"}
	359	if $enc_str =~ /[,;]/
	360	and $enc_str !~ /^".*"$/
	361	and (!$field \|\| $field =~ /^(?:To$\|From$\|B?Cc$\|Content-)/i);
	362
	363	$str .= $prefix . $enc_str . $trailing;
	364	}
	365	}
	366
84fb5b46 MKG	367	# We might have \n without trailing whitespace, which will result in
	368	# invalid headers.
	369	$str =~ s/\n//g;
	370
	371	return ($str)
	372	}
	373
	374
	375
	376	=head2 _FindOrGuessCharset MIME::Entity, $head_only
	377
	378	When handed a MIME::Entity will first attempt to read what charset the message is encoded in. Failing that, will use Encode::Guess to try to figure it out
	379
	380	If $head_only is true, only guesses charset for head parts. This is because header's encoding (e.g. filename="...") may be different from that of body's.
	381
	382	=cut
	383
	384	sub _FindOrGuessCharset {
	385	my $entity = shift;
	386	my $head_only = shift;
	387	my $head = $entity->head;
	388
	389	if ( my $charset = $head->mime_attr("content-type.charset") ) {
	390	return _CanonicalizeCharset($charset);
	391	}
	392
	393	if ( !$head_only and $head->mime_type =~ m{^text/} ) {
	394	my $body = $entity->bodyhandle or return;
	395	return _GuessCharset( $body->as_string );
	396	}
	397	else {
	398
	399	# potentially binary data -- don't guess the body
	400	return _GuessCharset( $head->as_string );
	401	}
	402	}
	403
	404
	405
	406	=head2 _GuessCharset STRING
	407
	408	use Encode::Guess to try to figure it out the string's encoding.
	409
	410	=cut
	411
	412	use constant HAS_ENCODE_GUESS => do { local $@; eval { require Encode::Guess; 1 } };
	413	use constant HAS_ENCODE_DETECT => do { local $@; eval { require Encode::Detect::Detector; 1 } };
	414
	415	sub _GuessCharset {
	416	my $fallback = _CanonicalizeCharset('iso-8859-1');
	417
	418	# if $_[0] is null/empty, we don't guess its encoding
	419	return $fallback
	420	unless defined $_[0] && length $_[0];
	421
	422	my @encodings = RT->Config->Get('EmailInputEncodings');
	423	unless ( @encodings ) {
	424	$RT::Logger->warning("No EmailInputEncodings set, fallback to $fallback");
	425	return $fallback;
	426	}
	427
	428	if ( $encodings[0] eq '*' ) {
	429	shift @encodings;
	430	if ( HAS_ENCODE_DETECT ) {
431	my $charset = Encode::Detect::Detector::detect( $_[0] );
432	if ( $charset ) {
433	$RT::Logger->debug("Encode::Detect::Detector guessed encoding: $charset");
434	return _CanonicalizeCharset( Encode::resolve_alias( $charset ) );
435	}
436	else {
437	$RT::Logger->debug("Encode::Detect::Detector failed to guess encoding");
438	}
439	}
440	else {
441	$RT::Logger->error(
442	"You requested to guess encoding, but we couldn't"
443	." load Encode::Detect::Detector module"
444	);
445	}
446	}
447
448	unless ( @encodings ) {
449	$RT::Logger->warning("No EmailInputEncodings set except '*', fallback to $fallback");
450	return $fallback;
451	}
452
453	unless ( HAS_ENCODE_GUESS ) {
454	$RT::Logger->error("We couldn't load Encode::Guess module, fallback to $fallback");
455	return $fallback;
456	}
457
458	Encode::Guess->set_suspects( @encodings );
459	my $decoder = Encode::Guess->guess( $_[0] );
460	unless ( defined $decoder ) {
461	$RT::Logger->warning("Encode::Guess failed: decoder is undefined; fallback to $fallback");
462	return $fallback;
463	}
464
465	if ( ref $decoder ) {
466	my $charset = $decoder->name;
467	$RT::Logger->debug("Encode::Guess guessed encoding: $charset");
468	return _CanonicalizeCharset( $charset );
469	}
470	elsif ($decoder =~ /(\S+ or .+)/) {
471	my %matched = map { $_ => 1 } split(/ or /, $1);
472	return 'utf-8' if $matched{'utf8'}; # one and only normalization
473
474	foreach my $suspect (RT->Config->Get('EmailInputEncodings')) {
475	next unless $matched{$suspect};
476	$RT::Logger->debug("Encode::Guess ambiguous ($decoder); using $suspect");
477	return _CanonicalizeCharset( $suspect );
478	}
479	}
480	else {
481	$RT::Logger->warning("Encode::Guess failed: $decoder; fallback to $fallback");
482	}
483
484	return $fallback;
485	}
486
487	=head2 _CanonicalizeCharset NAME
488
489	canonicalize charset, return lowercase version.
490	special cases are: gb2312 => gbk, utf8 => utf-8
491
492	=cut
493
494	sub _CanonicalizeCharset {
495	my $charset = lc shift;
496	return $charset unless $charset;
497
498	# Canonicalize aliases if they're known
499	if (my $canonical = Encode::resolve_alias($charset)) {
500	$charset = $canonical;
501	}
502
503	if ( $charset eq 'utf8' \|\| $charset eq 'utf-8-strict' ) {
504	return 'utf-8';
505	}
506	elsif ( $charset eq 'euc-cn' ) {
507	# gbk is superset of gb2312/euc-cn so it's safe
508	return 'gbk';
509	# XXX TODO: gb18030 is an even larger, more permissive superset of gbk,
510	# but needs Encode::HanExtra installed
511	}
512	else {
513	return $charset;
514	}
515	}
516
517
518	=head2 SetMIMEHeadToEncoding HEAD OLD_CHARSET NEW_CHARSET
519
520	Converts a MIME Head from one encoding to another. This totally violates the RFC.
521	We should never need this. But, Surprise!, MUAs are badly broken and do this kind of stuff
522	all the time
523
524
525	=cut
526
527	sub SetMIMEHeadToEncoding {
528	my ( $head, $charset, $enc, $preserve_words ) = ( shift, shift, shift, shift );
529
530	$charset = _CanonicalizeCharset($charset);
531	$enc = _CanonicalizeCharset($enc);
532
533	return if $charset eq $enc and $preserve_words;
534
535	foreach my $tag ( $head->tags ) {
536	next unless $tag; # seen in wild: headers with no name
537	my @values = $head->get_all($tag);
538	$head->delete($tag);
539	foreach my $value (@values) {
dab09ea8	540	if ( $charset ne $enc \|\| $enc =~ /^utf-?8(?:-strict)?$/i ) {
84fb5b46 MKG	541	Encode::_utf8_off($value);
	542	Encode::from_to( $value, $charset => $enc );
	543	}
	544	$value = DecodeMIMEWordsToEncoding( $value, $enc, $tag )
	545	unless $preserve_words;
	546
	547	# We intentionally add a leading space when re-adding the
	548	# header; Mail::Header strips it before storing, but it
	549	# serves to prevent it from "helpfully" canonicalizing
	550	# $head->add("Subject", "Subject: foo") into the same as
	551	# $head->add("Subject", "foo");
	552	$head->add( $tag, " " . $value );
	553	}
	554	}
	555
	556	}
	557
	558	RT::Base->_ImportOverlays();
	559
	560	1; # End of module.
	561