Upgrade to 4.0.10.
[usit-rt.git] / lib / RT / I18N.pm
CommitLineData
84fb5b46
MKG
1# BEGIN BPS TAGGED BLOCK {{{
2#
3# COPYRIGHT:
4#
403d7b0b 5# This software is Copyright (c) 1996-2013 Best Practical Solutions, LLC
84fb5b46
MKG
6# <sales@bestpractical.com>
7#
8# (Except where explicitly superseded by other copyright notices)
9#
10#
11# LICENSE:
12#
13# This work is made available to you under the terms of Version 2 of
14# the GNU General Public License. A copy of that license should have
15# been provided with this software, but in any event can be snarfed
16# from www.gnu.org.
17#
18# This work is distributed in the hope that it will be useful, but
19# WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21# General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
26# 02110-1301 or visit their web page on the internet at
27# http://www.gnu.org/licenses/old-licenses/gpl-2.0.html.
28#
29#
30# CONTRIBUTION SUBMISSION POLICY:
31#
32# (The following paragraph is not intended to limit the rights granted
33# to you to modify and distribute this software under the terms of
34# the GNU General Public License and is only of importance to you if
35# you choose to contribute your changes and enhancements to the
36# community by submitting them to Best Practical Solutions, LLC.)
37#
38# By intentionally submitting any modifications, corrections or
39# derivatives to this work, or any other work intended for use with
40# Request Tracker, to Best Practical Solutions, LLC, you confirm that
41# you are the copyright holder for those contributions and you grant
42# Best Practical Solutions, LLC a nonexclusive, worldwide, irrevocable,
43# royalty-free, perpetual, license to use, copy, create derivative
44# works based on those contributions, and sublicense and distribute
45# those contributions and any derivatives thereof.
46#
47# END BPS TAGGED BLOCK }}}
48
49=head1 NAME
50
51RT::I18N - a base class for localization of RT
52
53=cut
54
55package RT::I18N;
56
57use strict;
58use warnings;
59
60
61use Locale::Maketext 1.04;
62use Locale::Maketext::Lexicon 0.25;
63use base 'Locale::Maketext::Fuzzy';
64
65use Encode;
66use MIME::Entity;
67use MIME::Head;
68use File::Glob;
69
70# I decree that this project's first language is English.
71
# Base (English) lexicon.  Every language-specific lexicon falls back to
# this one.
our %Lexicon = (

    # '_AUTO' means that lookup failures can't happen: if we get as far
    # as looking for something in this lexicon and don't find it, then
    # $Lexicon{$key} = $key is set automatically before it is possibly
    # compiled.  The exception is keys that start with "_" -- those are
    # never auto-made.
    '_AUTO' => 1,

    # Content type (and therefore charset) of the strings in this lexicon.
    '__Content-Type' => 'text/plain; charset=utf-8',

    # Fixed entry that can be used to check that lookups work.
    'TEST_STRING' => 'Concrete Mixer',
);
86# End of lexicon.
87
88=head2 Init
89
90Initializes the lexicons used for localization.
91
92
93=cut
94
sub Init {

    # Languages to load, from the LexiconLanguages config option; when
    # nothing is configured the wildcard '*' is used instead.
    my @lang = RT->Config->Get('LexiconLanguages');
    @lang = ('*') unless @lang;

    # Directory that holds the per-language modules: this file's own path
    # with the trailing ".pm" removed.
    my $module_dir = substr( __FILE__, 0, -3 );

    # load default functions
    require $module_dir . '/i_default.pm';

    # Load language-specific functions
    foreach my $file ( File::Glob::bsd_glob( $module_dir . "/*.pm" ) ) {

        # Only accept paths made of a conservative character set; the
        # capture doubles as the untainted copy of the path.
        my ($clean_path) = $file =~ /^([-\w\s\.\/\\~:]+)$/;
        if ( !defined $clean_path ) {
            warn("$file is tainted. not loading");
            next;
        }

        # The language code is the file's base name without ".pm".
        my ($lang) = ( $clean_path =~ /([^\\\/]+?)\.pm$/ );
        next unless grep { $_ eq '*' || $_ eq $lang } @lang;

        require $clean_path;
    }

    # For every requested language, list the .po sources in the order
    # core lexicon, plugin lexicons, then local lexicons.
    my %import;
    foreach my $code (@lang) {
        my @sources = ( Gettext => $RT::LexiconPath . "/$code.po" );

        foreach my $plugin_po_dir ( RT->PluginDirs('po') ) {
            push @sources, Gettext => "$plugin_po_dir/$code.po";
        }

        push @sources,
            Gettext => $RT::LocalLexiconPath . "/*/$code.po",
            Gettext => $RT::LocalLexiconPath . "/$code.po";

        $import{$code} = \@sources;
    }

    # Acquire all .po files and iterate them into lexicons
    Locale::Maketext::Lexicon->import( { _decode => 1, %import } );

    return 1;
}
131
sub LoadLexicons {

    # Walks the RT::I18N:: symbol table and reads one key from every
    # nested package's %Lexicon, which forces the lexicon to be loaded.
    # Symbolic references are needed to reach the stashes by name.
    no strict 'refs';

    foreach my $entry ( keys %{RT::I18N::} ) {
        next if $entry eq 'main::';

        # Only entries containing '::' (nested package stashes) matter.
        next if index( $entry, '::', -2 ) < 0;

        my $stash_name = 'RT::I18N::' . $entry;
        next unless exists ${$stash_name}{'Lexicon'};

        my $lex = *{ ${$stash_name}{'Lexicon'} }{HASH};

        # run fetch to force load
        my $tmp = $lex->{'foo'};

        # XXX: untie may fail with "untie attempted
        # while 1 inner references still exist"
        # TODO: untie that has to lower fetch impact
        # untie %$lex if tied %$lex;
    }
}
149
150=head2 encoding
151
Returns the encoding of the current lexicon. This implementation does not
inspect the lexicon's C<__Content-Type> entry; it always returns 'utf-8'.
154
155
156
157=cut
158
159
sub encoding {
    # The lexicon encoding is fixed; nothing is looked up.
    return 'utf-8';
}
161
162
163=head2 SetMIMEEntityToUTF8 $entity
164
A utility function which will try to convert the entity body into utf8.
It is now a thin wrapper around SetMIMEEntityToEncoding($entity, 'utf-8').
167
168=cut
169
sub SetMIMEEntityToUTF8 {
    # Convenience wrapper: convert the given entity to 'utf-8'.
    my ($entity) = @_;
    return RT::I18N::SetMIMEEntityToEncoding( $entity, 'utf-8' );
}
173
174
175
176=head2 IsTextualContentType $type
177
178An utility function that determines whether $type is I<textual>, meaning
179that it can sensibly be converted to Unicode text.
180
181Currently, it returns true iff $type matches this regular expression
182(case-insensitively):
183
184 ^(?:text/(?:plain|html)|message/rfc822)\b
185
186
187=cut
188
sub IsTextualContentType {
    # Returns 1 when $type is text/plain, text/html or message/rfc822
    # (case-insensitively, anything may follow after a word boundary,
    # e.g. "; charset=..."), and 0 otherwise.
    my $type = shift;

    # A missing content type is simply "not textual"; checking here avoids
    # an "uninitialized value" warning from the pattern match.
    return 0 unless defined $type;

    return ( $type =~ m{^(?:text/(?:plain|html)|message/rfc822)\b}i ) ? 1 : 0;
}
193
194
195=head2 SetMIMEEntityToEncoding $entity, $encoding
196
A utility function which will try to convert the entity body into the specified
charset encoding (encoded as octets, *not* unicode-strings). It recurses
through all the parts of $entity, converts the headers of each one, and converts
the body of each one whose Content-Type is textual (see IsTextualContentType).
201
202This function doesn't return anything meaningful.
203
204=cut
205
sub SetMIMEEntityToEncoding {
    # Arguments: the MIME entity, the target encoding, and a flag that is
    # passed through to SetMIMEHeadToEncoding (true keeps MIME encoded-words
    # in headers undecoded).
    my ( $entity, $enc, $preserve_words ) = @_;

    # do the same for parts first of all
    foreach my $part ( $entity->parts ) {
        SetMIMEEntityToEncoding( $part, $enc, $preserve_words );
    }

    my $head    = $entity->head;
    my $charset = _FindOrGuessCharset($entity);

    # A charset that Encode does not know is treated as "no charset".
    if ( $charset && !Encode::find_encoding($charset) ) {
        $RT::Logger->warning("Encoding '$charset' is not supported");
        $charset = undef;
    }

    # Without a usable charset the entity is relabelled as opaque binary
    # data; the original type is kept in a separate header.
    if ( !$charset ) {
        $head->replace( "X-RT-Original-Content-Type" => $head->mime_attr('Content-Type') );
        $head->mime_attr('Content-Type' => 'application/octet-stream');
        return;
    }

    # Headers are converted using a charset determined for the head only,
    # which may differ from the one found for the body.
    SetMIMEHeadToEncoding(
        $head,
        _FindOrGuessCharset($entity, 1) => $enc,
        $preserve_words
    );

    my $is_textual = IsTextualContentType( $head->mime_type );

    # If this is a textual entity, we'd need to preserve its original encoding
    if ( $head->mime_attr('content-type.charset') or $is_textual ) {
        $head->replace( "X-RT-Original-Encoding" => $charset );
    }

    return unless $is_textual;

    my $body = $entity->bodyhandle;

    # Convert when the encodings differ; utf-8 targets are always run
    # through the conversion even when the source is utf-8 too.
    if ( $body && ($enc ne $charset || $enc =~ /^utf-?8(?:-strict)?$/i) ) {
        my $string = $body->as_string or return;

        my $description = $head->mime_type . " - "
            . ( $head->get('subject') || 'Subjectless message' );
        $RT::Logger->debug( "Converting '$charset' to '$enc' for " . $description );

        # Encode::from_to works on octets, not on characters in Perl's
        # internal format, so the UTF-8 flag is switched off first;
        # otherwise the string would not be converted.
        Encode::_utf8_off($string);
        Encode::from_to( $string, $charset => $enc );

        my $new_body = MIME::Body::InCore->new($string);

        # set up the new entity
        if ( !$head->mime_attr("content-type") ) {
            $head->mime_attr( "content-type" => 'text/plain' );
        }
        $head->mime_attr( "content-type.charset" => $enc );
        $entity->bodyhandle($new_body);
    }
}
261
262# NOTES: Why Encode::_utf8_off before Encode::from_to
263#
264# All the strings in RT are utf-8 now. Quotes from Encode POD:
265#
266# [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])
267# ... The data in $octets must be encoded as octets and not as
268# characters in Perl's internal format. ...
269#
270# Not turning off the UTF-8 flag in the string will prevent the string
271# from conversion.
272
273
274
275=head2 DecodeMIMEWordsToUTF8 $raw
276
277An utility method which mimics MIME::Words::decode_mimewords, but only
278limited functionality. This function returns an utf-8 string.
279
280It returns the decoded string, or the original string if it's not
281encoded. Since the subroutine converts specified string into utf-8
282charset, it should not alter a subject written in English.
283
284Why not use MIME::Words directly? Because it fails in RT when I
285tried. Maybe it's ok now.
286
287=cut
288
sub DecodeMIMEWordsToUTF8 {
    # Same as DecodeMIMEWordsToEncoding with the target fixed to 'utf-8';
    # any further arguments (e.g. the header field name) are passed along.
    my ( $str, @rest ) = @_;
    return DecodeMIMEWordsToEncoding( $str, 'utf-8', @rest );
}
293
sub DecodeMIMEWordsToEncoding {
    # Decodes MIME encoded-words ("=?charset?Q|B?data?=") found in $str and
    # returns the result as octets in the requested charset.
    #
    # Arguments: the raw string, the target charset, and optionally the
    # name of the header field the string came from (used for the RFC 2231
    # handling and for the quoting bandaid below).
    use MIME::QuotedPrint;
    use MIME::Base64;

    my ( $str, $wanted_charset, $field ) = @_;
    my $to_charset = _CanonicalizeCharset($wanted_charset);
    $field ||= '';

    # handle filename*=ISO-8859-1''%74%E9%73%74%2E%74%78%74, parameter value
    # continuations, and similar syntax from RFC 2231
    if ( $field =~ /^Content-(Type|Disposition)/i ) {
        # This concatenates continued parameters and normalizes encoded params
        # to QB encoded-words which we handle below
        $str = MIME::Field::ParamVal->parse($str)->stringify;
    }

    # Pre-parse by removing all whitespace between encoded words
    my $encoded_word = qr/
                 =\?            # =?
                 ([^?]+?)       # charset
                 (?:\*[^?]+)?   # optional '*language'
                 \?             # ?
                 ([QqBb])       # encoding
                 \?             # ?
                 ([^?]+)        # encoded string
                 \?=            # ?=
                 /x;
    $str =~ s/($encoded_word)\s+(?=$encoded_word)/$1/g;

    # Also merge quoted-printable sections together, in case multiple
    # octets of a single encoded character were split between chunks.
    # Though not valid according to RFC 2047, this has been seen in the
    # wild.
    while ( $str =~ s/(=\?[^?]+\?[Qq]\?)([^?]+)\?=\1([^?]+)\?=/$1$2$3?=/i ) {
        # keep merging until no adjacent pair is left
    }

    # XXX TODO: use decode('MIME-Header', ...) and Encode::Alias to replace our
    # custom MIME word decoding and charset canonicalization.  We can't do this
    # until we parse before decode, instead of the other way around.
    #
    # Each match contributes five items: prefix, charset, encoding,
    # encoded string and trailing text.
    my @pieces = $str =~ m/(.*?)        # prefix
                           $encoded_word
                           ([^=]*)      # trailing
                          /xgcs;

    if (@pieces) {
        # add everything that hasn't matched to the end of the latest
        # string in array this happen when we have 'key="=?encoded?="; key="plain"'
        $pieces[-1] .= substr( $str, pos $str );

        # The result is rebuilt from scratch out of the decoded pieces.
        $str = "";
        while (@pieces) {
            my ( $lead, $charset, $encoding, $word, $tail ) =
                    splice @pieces, 0, 5;
            $charset  = _CanonicalizeCharset($charset);
            $encoding = lc $encoding;

            $tail =~ s/\s?\t?$//;               # Observed from Outlook Express

            if ( $encoding eq 'q' ) {
                $word =~ tr/_/ /;               # Observed from Outlook Express
                $word = decode_qp($word);
            }
            elsif ( $encoding eq 'b' ) {
                $word = decode_base64($word);
            }
            else {
                $RT::Logger->warning("Incorrect encoding '$encoding' in '$str', "
                    ."only Q(uoted-printable) and B(ase64) are supported");
            }

            # now we have got a decoded subject, try to convert into the encoding
            my $wants_recode = $charset ne $to_charset
                || $charset =~ /^utf-?8(?:-strict)?$/i;
            if ($wants_recode) {
                if ( Encode::find_encoding($charset) ) {
                    Encode::from_to( $word, $charset, $to_charset );
                }
                else {
                    $RT::Logger->warning("Charset '$charset' is not supported");

                    # Replace every non-printable octet with the UTF-8
                    # bytes of U+FFFD, then convert if the target is not
                    # utf-8 itself.
                    $word =~ s/[^[:print:]]/\357\277\275/g;
                    Encode::from_to( $word, 'UTF-8', $to_charset )
                        unless $to_charset eq 'utf-8';
                }
            }

            # XXX TODO: RT doesn't currently do the right thing with mime-encoded headers
            # We _should_ be preserving them encoded until after parsing is completed and
            # THEN undo the mime-encoding.
            #
            # This routine should be translating the existing mimeencoding to utf8 but leaving
            # things encoded.
            #
            # It's legal for headers to contain mime-encoded commas and semicolons which
            # should not be treated as address separators. (Encoding == quoting here)
            #
            # until this is fixed, we must escape any string containing a comma or semicolon
            # this is only a bandaid

            # Some _other_ MUAs encode quotes _already_, and double quotes
            # confuse us a lot, so only quote it if it isn't quoted
            # already.
            my $needs_quotes =
                   $word =~ /[,;]/
                && $word !~ /^".*"$/
                && ( !$field || $field =~ /^(?:To$|From$|B?Cc$|Content-)/i );
            $word = qq{"$word"} if $needs_quotes;

            $str .= $lead . $word . $tail;
        }
    }

    # We might have \n without trailing whitespace, which will result in
    # invalid headers.
    $str =~ s/\n//g;

    return ($str);
}
403
404
405
406=head2 _FindOrGuessCharset MIME::Entity, $head_only
407
408When handed a MIME::Entity will first attempt to read what charset the message is encoded in. Failing that, will use Encode::Guess to try to figure it out
409
410If $head_only is true, only guesses charset for head parts. This is because header's encoding (e.g. filename="...") may be different from that of body's.
411
412=cut
413
sub _FindOrGuessCharset {
    # Arguments: a MIME entity and a flag; when the flag is true only the
    # header text is used for guessing.
    my ( $entity, $head_only ) = @_;
    my $head = $entity->head;

    # An explicitly declared charset always wins.
    my $declared = $head->mime_attr("content-type.charset");
    return _CanonicalizeCharset($declared) if $declared;

    # Head-only requests and non-text entities are guessed from the header
    # text: the body is potentially binary data, so it is not examined.
    if ( $head_only or $head->mime_type !~ m{^text/} ) {
        return _GuessCharset( $head->as_string );
    }

    # Text entity: guess from the body, or return nothing when there is
    # no body handle.
    my $body = $entity->bodyhandle or return;
    return _GuessCharset( $body->as_string );
}
433
434
435
436=head2 _GuessCharset STRING
437
Uses Encode::Guess (and Encode::Detect::Detector, when '*' is listed first in
the EmailInputEncodings config option) to try to figure out the string's encoding.
439
440=cut
441
# Compile-time flags telling whether the optional guessing modules could be
# loaded.  They are deliberately two separate statements.
use constant HAS_ENCODE_GUESS  => do { local $@; eval { require Encode::Guess; 1 } };
use constant HAS_ENCODE_DETECT => do { local $@; eval { require Encode::Detect::Detector; 1 } };

sub _GuessCharset {
    # Guesses the charset of the string passed as the first argument and
    # returns a canonical charset name.  The string is read through $_[0]
    # rather than copied into a lexical.
    my $fallback = _CanonicalizeCharset('iso-8859-1');

    # if $_[0] is null/empty, we don't guess its encoding
    return $fallback
        unless defined $_[0] && length $_[0];

    my @encodings = RT->Config->Get('EmailInputEncodings');
    if ( !@encodings ) {
        $RT::Logger->warning("No EmailInputEncodings set, fallback to $fallback");
        return $fallback;
    }

    # A leading '*' asks for automatic detection before the listed
    # encodings are tried as suspects.
    if ( $encodings[0] eq '*' ) {
        shift @encodings;

        if ( !HAS_ENCODE_DETECT ) {
            $RT::Logger->error(
                "You requested to guess encoding, but we couldn't"
                ." load Encode::Detect::Detector module"
            );
        }
        elsif ( my $charset = Encode::Detect::Detector::detect( $_[0] ) ) {
            $RT::Logger->debug("Encode::Detect::Detector guessed encoding: $charset");
            return _CanonicalizeCharset( Encode::resolve_alias( $charset ) );
        }
        else {
            $RT::Logger->debug("Encode::Detect::Detector failed to guess encoding");
        }
    }

    if ( !@encodings ) {
        $RT::Logger->warning("No EmailInputEncodings set except '*', fallback to $fallback");
        return $fallback;
    }

    if ( !HAS_ENCODE_GUESS ) {
        $RT::Logger->error("We couldn't load Encode::Guess module, fallback to $fallback");
        return $fallback;
    }

    Encode::Guess->set_suspects( @encodings );
    my $decoder = Encode::Guess->guess( $_[0] );
    if ( !defined $decoder ) {
        $RT::Logger->warning("Encode::Guess failed: decoder is undefined; fallback to $fallback");
        return $fallback;
    }

    # A reference is an encoding object: the guess was unambiguous.
    if ( ref $decoder ) {
        my $charset = $decoder->name;
        $RT::Logger->debug("Encode::Guess guessed encoding: $charset");
        return _CanonicalizeCharset( $charset );
    }

    # Otherwise $decoder is a message.  "A or B [or C ...]" lists several
    # candidates that all fit; anything else is treated as a failure.
    if ( $decoder =~ /(\S+ or .+)/ ) {
        my %matched = map { $_ => 1 } split(/ or /, $1);
        return 'utf-8' if $matched{'utf8'}; # one and only normalization

        # Take the first configured encoding that is among the candidates.
        foreach my $suspect ( RT->Config->Get('EmailInputEncodings') ) {
            next unless $matched{$suspect};
            $RT::Logger->debug("Encode::Guess ambiguous ($decoder); using $suspect");
            return _CanonicalizeCharset( $suspect );
        }
    }
    else {
        $RT::Logger->warning("Encode::Guess failed: $decoder; fallback to $fallback");
    }

    return $fallback;
}
516
517=head2 _CanonicalizeCharset NAME
518
519canonicalize charset, return lowercase version.
520special cases are: gb2312 => gbk, utf8 => utf-8
521
522=cut
523
sub _CanonicalizeCharset {
    # Returns the canonical, lowercase name for a charset:
    #   * aliases known to Encode are resolved to Encode's own name,
    #   * 'utf8' and 'utf-8-strict' both become 'utf-8',
    #   * 'euc-cn' (which gb2312 resolves to) becomes 'gbk',
    #   * anything else is returned lowercased.
    # A missing or empty name yields the empty string.
    my $charset = shift;

    # Callers may pass nothing (for example when an alias lookup found no
    # match); answer with the empty string without triggering an
    # "uninitialized value" warning from lc.
    return '' unless defined $charset;

    $charset = lc $charset;
    return $charset unless $charset;

    # Canonicalize aliases if they're known
    my $canonical = Encode::resolve_alias($charset);
    $charset = $canonical if $canonical;

    return 'utf-8'
        if $charset eq 'utf8' || $charset eq 'utf-8-strict';

    # gbk is superset of gb2312/euc-cn so it's safe
    # XXX TODO: gb18030 is an even larger, more permissive superset of gbk,
    # but needs Encode::HanExtra installed
    return 'gbk' if $charset eq 'euc-cn';

    return $charset;
}
546
547
548=head2 SetMIMEHeadToEncoding HEAD OLD_CHARSET NEW_CHARSET
549
550Converts a MIME Head from one encoding to another. This totally violates the RFC.
551We should never need this. But, Surprise!, MUAs are badly broken and do this kind of stuff
552all the time
553
554
555=cut
556
sub SetMIMEHeadToEncoding {
    # Arguments: the MIME head, the charset its values are currently in,
    # the target encoding, and a flag; when the flag is true, MIME
    # encoded-words in the values are left undecoded.
    my ( $head, $charset, $enc, $preserve_words ) = @_;

    $charset = _CanonicalizeCharset($charset);
    $enc     = _CanonicalizeCharset($enc);

    # Nothing to recode and nothing to decode.
    return if $charset eq $enc and $preserve_words;

    # The same for every header, so decide once: recode when the charsets
    # differ, and always when the target is utf-8.
    my $recode = ( $charset ne $enc || $enc =~ /^utf-?8(?:-strict)?$/i );

    foreach my $tag ( $head->tags ) {
        next unless $tag; # seen in wild: headers with no name

        # Take all values of this header out and re-add them one by one
        # after conversion.
        my @values = $head->get_all($tag);
        $head->delete($tag);

        foreach my $value (@values) {
            if ($recode) {
                # from_to needs octets, so the UTF-8 flag is cleared first
                Encode::_utf8_off($value);
                Encode::from_to( $value, $charset => $enc );
            }

            if ( !$preserve_words ) {
                $value = DecodeMIMEWordsToEncoding( $value, $enc, $tag );
            }

            # We intentionally add a leading space when re-adding the
            # header; Mail::Header strips it before storing, but it
            # serves to prevent it from "helpfully" canonicalizing
            # $head->add("Subject", "Subject: foo") into the same as
            # $head->add("Subject", "foo");
            $head->add( $tag, " " . $value );
        }
    }

}
587
588RT::Base->_ImportOverlays();
589
5901; # End of module.
591