RT count still running on 3.8.10 - porting to RT4.
[usit-rt.git] / lib / RT / I18N.pm
CommitLineData
84fb5b46
MKG
1# BEGIN BPS TAGGED BLOCK {{{
2#
3# COPYRIGHT:
4#
5# This software is Copyright (c) 1996-2012 Best Practical Solutions, LLC
6# <sales@bestpractical.com>
7#
8# (Except where explicitly superseded by other copyright notices)
9#
10#
11# LICENSE:
12#
13# This work is made available to you under the terms of Version 2 of
14# the GNU General Public License. A copy of that license should have
15# been provided with this software, but in any event can be snarfed
16# from www.gnu.org.
17#
18# This work is distributed in the hope that it will be useful, but
19# WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21# General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
26# 02110-1301 or visit their web page on the internet at
27# http://www.gnu.org/licenses/old-licenses/gpl-2.0.html.
28#
29#
30# CONTRIBUTION SUBMISSION POLICY:
31#
32# (The following paragraph is not intended to limit the rights granted
33# to you to modify and distribute this software under the terms of
34# the GNU General Public License and is only of importance to you if
35# you choose to contribute your changes and enhancements to the
36# community by submitting them to Best Practical Solutions, LLC.)
37#
38# By intentionally submitting any modifications, corrections or
39# derivatives to this work, or any other work intended for use with
40# Request Tracker, to Best Practical Solutions, LLC, you confirm that
41# you are the copyright holder for those contributions and you grant
42# Best Practical Solutions, LLC a nonexclusive, worldwide, irrevocable,
43# royalty-free, perpetual, license to use, copy, create derivative
44# works based on those contributions, and sublicense and distribute
45# those contributions and any derivatives thereof.
46#
47# END BPS TAGGED BLOCK }}}
48
49=head1 NAME
50
51RT::I18N - a base class for localization of RT
52
53=cut
54
55package RT::I18N;
56
57use strict;
58use warnings;
59
60
61use Locale::Maketext 1.04;
62use Locale::Maketext::Lexicon 0.25;
63use base 'Locale::Maketext::Fuzzy';
64
65use Encode;
66use MIME::Entity;
67use MIME::Head;
68use File::Glob;
69
70# I decree that this project's first language is English.
71
# The base (English) lexicon for the project.
our %Lexicon = (

    # With _AUTO switched on a lookup can never fail: a key that is not
    # present is entered as $Lexicon{$key} = $key on the fly, before it is
    # possibly compiled.  Keys beginning with "_" are the exception; they
    # are never auto-created.
    _AUTO => 1,

    TEST_STRING => 'Concrete Mixer',

    '__Content-Type' => 'text/plain; charset=utf-8',
);
86# End of lexicon.
87
88=head2 Init
89
90Initializes the lexicons used for localization.
91
92
93=cut
94
# Load the per-language helper modules that live next to this file and
# register the gettext (.po) lexicons for every configured language.
# Returns 1.
sub Init {
    my @lang = RT->Config->Get('LexiconLanguages');
    @lang = ('*') unless @lang;

    # Directory named after this module: ".../I18N.pm" => ".../I18N"
    my $module_dir = substr( __FILE__, 0, -3 );

    # Default functions are always needed
    require "$module_dir/i_default.pm";

    # Language-specific functions, limited to the requested languages
    for my $path ( File::Glob::bsd_glob("$module_dir/*.pm") ) {

        # Only accept (and thereby untaint) paths made of expected characters
        my ($clean_path) = $path =~ /^([-\w\s\.\/\\~:]+)$/;
        unless ( defined $clean_path ) {
            warn("$path is tainted. not loading");
            next;
        }

        my ($code) = $clean_path =~ /([^\\\/]+?)\.pm$/;
        next unless grep { $_ eq '*' || $_ eq $code } @lang;

        require $clean_path;
    }

    # For each language: core lexicon first, then plugin lexicons, then
    # the site-local ones.
    my %import;
    for my $code (@lang) {
        $import{$code} = [
            Gettext => $RT::LexiconPath . "/$code.po",
            ( map { ( Gettext => "$_/$code.po" ) } RT->PluginDirs('po') ),
            Gettext => $RT::LocalLexiconPath . "/*/$code.po",
            Gettext => $RT::LocalLexiconPath . "/$code.po",
        ];
    }

    # Acquire all .po files and iterate them into lexicons
    Locale::Maketext::Lexicon->import( { _decode => 1, %import } );

    return 1;
}
131
# Touch every %Lexicon found in a package directly below RT::I18N:: so
# that lazily loaded (tied) lexicons are actually read in.
sub LoadLexicons {

    no strict 'refs';

    # Symbol-table entries of RT::I18N that name nested packages
    my @nested = grep { $_ ne 'main::' && index( $_, '::', -2 ) >= 0 }
        keys %RT::I18N::;

    for my $entry (@nested) {
        my $stash_name = 'RT::I18N::' . $entry;
        next unless exists ${$stash_name}{'Lexicon'};

        my $lexicon = *{ ${$stash_name}{'Lexicon'} }{HASH};

        # A single fetch is enough to force the load.
        my $ignored = $lexicon->{'foo'};

        # XXX: untie may fail with "untie attempted
        # while 1 inner references still exist"
        # TODO: untie that has to lower fetch impact
        # untie %$lexicon if tied %$lexicon;
    }
}
149
150=head2 encoding
151
152Returns the encoding used for RT's localized strings.
153This is always 'utf-8'.
154
155
156
157=cut
158
159
# The lexicon encoding is fixed: always 'utf-8'.
sub encoding {
    return 'utf-8';
}
161
162
163=head2 SetMIMEEntityToUTF8 $entity
164
165A utility function which will try to convert the entity body into utf8.
166It is now a thin wrapper around SetMIMEEntityToEncoding($entity, 'utf-8').
167
168=cut
169
# Convenience wrapper: convert $entity (and its parts) to utf-8.
sub SetMIMEEntityToUTF8 {
    my $entity = shift;
    return RT::I18N::SetMIMEEntityToEncoding( $entity, 'utf-8' );
}
173
174
175
176=head2 IsTextualContentType $type
177
178An utility function that determines whether $type is I<textual>, meaning
179that it can sensibly be converted to Unicode text.
180
181Currently, it returns true iff $type matches this regular expression
182(case-insensitively):
183
184 ^(?:text/(?:plain|html)|message/rfc822)\b
185
186
187=cut
188
# Returns 1 when $type is text/plain, text/html or message/rfc822
# (case-insensitively, parameters allowed after the type), else 0.
sub IsTextualContentType {
    my ($type) = @_;

    return 1 if $type =~ m{^(?:text/(?:plain|html)|message/rfc822)\b}i;
    return 0;
}
193
194
195=head2 SetMIMEEntityToEncoding $entity, $encoding
196
197A utility function which will try to convert the entity body into the specified
198charset encoding (encoded as octets, *not* unicode-strings). It will
199iterate over all the parts of $entity, and try to convert each one into the
200specified charset if its Content-Type is textual (see IsTextualContentType).
201
202This function doesn't return anything meaningful.
203
204=cut
205
# Convert a MIME entity, and recursively all of its parts, to $enc.
#
#   $entity         - MIME::Entity to convert in place
#   $enc            - target charset name (octets in this charset result)
#   $preserve_words - when true, MIME encoded-words in the headers and in
#                     the attachment file names are left encoded
#
# Headers are always processed; the body is only transcoded for textual
# content types.  The return value is not meaningful.
sub SetMIMEEntityToEncoding {
    my ( $entity, $enc, $preserve_words ) = @_;

    # do the same for parts first of all
    SetMIMEEntityToEncoding( $_, $enc, $preserve_words ) for $entity->parts;

    my $charset = _FindOrGuessCharset($entity) or return;

    # The header charset may differ from the body's (e.g. filename="...").
    # Fetch it in scalar context so exactly one argument is passed on.
    my $head_charset = _FindOrGuessCharset( $entity, 1 );
    SetMIMEHeadToEncoding( $entity->head, $head_charset, $enc, $preserve_words );

    my $head = $entity->head;

    # convert at least MIME word encoded attachment filename
    foreach my $attr (qw(content-type.name content-disposition.filename)) {
        if ( my $name = $head->mime_attr($attr) and !$preserve_words ) {
            $head->mime_attr( $attr => DecodeMIMEWordsToUTF8($name) );
        }
    }

    my $is_textual = IsTextualContentType( $head->mime_type );

    # If this is a textual entity, we'd need to preserve its original encoding
    $head->replace( "X-RT-Original-Encoding" => $charset )
        if $head->mime_attr('content-type.charset') or $is_textual;

    return unless $is_textual;

    my $body = $entity->bodyhandle;
    return unless $body && $enc ne $charset;

    my $string = $body->as_string or return;

    # Encode::from_to croaks on a source charset it does not know, and the
    # charset comes straight from the (untrusted) message.  Leave the body
    # as it is rather than aborting the processing of the whole message.
    unless ( Encode::find_encoding($charset) ) {
        $RT::Logger->warning(
            "Encoding '$charset' is not supported; leaving body unconverted"
        );
        return;
    }

    $RT::Logger->debug( "Converting '$charset' to '$enc' for "
          . $head->mime_type . " - "
          . ( $head->get('subject') || 'Subjectless message' ) );

    # from_to wants octets, not characters in Perl's internal format, so
    # the UTF-8 flag has to be switched off first or nothing is converted.
    Encode::_utf8_off($string);
    Encode::from_to( $string, $charset => $enc );

    my $new_body = MIME::Body::InCore->new($string);

    # set up the new entity
    $head->mime_attr( "content-type" => 'text/plain' )
        unless ( $head->mime_attr("content-type") );
    $head->mime_attr( "content-type.charset" => $enc );
    $entity->bodyhandle($new_body);

    return;
}
257
258# NOTES: Why Encode::_utf8_off before Encode::from_to
259#
260# All the strings in RT are utf-8 now. Quotes from Encode POD:
261#
262# [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])
263# ... The data in $octets must be encoded as octets and not as
264# characters in Perl's internal format. ...
265#
266# Not turning off the UTF-8 flag in the string will prevent the string
267# from conversion.
268
269
270
271=head2 DecodeMIMEWordsToUTF8 $raw
272
273A utility method which mimics MIME::Words::decode_mimewords, but with only
274limited functionality. This function returns a utf-8 string.
275
276It returns the decoded string, or the original string if it's not
277encoded. Since the subroutine converts specified string into utf-8
278charset, it should not alter a subject written in English.
279
280Why not use MIME::Words directly? Because it fails in RT when I
281tried. Maybe it's ok now.
282
283=cut
284
# Decode MIME encoded-words in a string into utf-8.  Any arguments after
# the string are handed on to DecodeMIMEWordsToEncoding unchanged.
sub DecodeMIMEWordsToUTF8 {
    my ( $str, @rest ) = @_;
    return DecodeMIMEWordsToEncoding( $str, 'utf-8', @rest );
}
289
# Decode the MIME encoded-words (RFC 2047, "=?charset?Q|B?text?=") and the
# RFC 2231 extended parameter values ("name*=charset'lang'%XX...") found in
# a header value and return the result as octets in the target charset.
#
#   $str        - raw header value; returned as is when undefined or empty
#   $to_charset - name of the charset to convert the decoded pieces into
#   $field      - optional header name; decoded text containing "," or ";"
#                 is wrapped in double quotes for address-like and
#                 Content-* headers, or when no header name is given
#
# Newlines are stripped from the result.  Pieces labelled with a charset
# that Encode does not know are kept, with their non-ASCII bytes replaced,
# instead of raising an exception.
sub DecodeMIMEWordsToEncoding {
    # Compile-time imports of decode_qp and decode_base64.
    use MIME::QuotedPrint;
    use MIME::Base64;

    my $str        = shift;
    my $to_charset = _CanonicalizeCharset(shift);
    my $field      = shift || '';

    # Nothing to decode
    return $str unless defined $str && length $str;

    my @list = $str =~ m/(.*?)=\?([^?]+)\?([QqBb])\?([^?]+)\?=([^=]*)/gcs;

    if (@list) {
        # add everything that hasn't matched to the end of the latest
        # string in array this happen when we have 'key="=?encoded?="; key="plain"'
        $list[-1] .= substr( $str, pos $str );

        $str = "";
        while (@list) {
            my ( $prefix, $charset, $encoding, $enc_str, $trailing ) =
                splice @list, 0, 5;

            # RFC 2231 allows "charset*language" inside an encoded-word;
            # the language tag is of no use for the conversion.
            $charset =~ s/\*.*\z//s;
            $charset = _CanonicalizeCharset($charset);

            $trailing =~ s/\s?\t?$//;    # Observed from Outlook Express

            # The pattern above only lets Q/q and B/b through.
            if ( lc $encoding eq 'q' ) {
                $enc_str =~ tr/_/ /;     # Observed from Outlook Express
                $enc_str = decode_qp($enc_str);
            }
            else {
                $enc_str = decode_base64($enc_str);
            }

            # now we have got a decoded subject, try to convert into the encoding
            $enc_str = _TranscodeHeaderOctets( $enc_str, $charset, $to_charset );

            # XXX TODO: RT doesn't currently do the right thing with mime-encoded headers
            # We _should_ be preserving them encoded until after parsing is completed and
            # THEN undo the mime-encoding.
            #
            # It's legal for headers to contain mime-encoded commas and semicolons which
            # should not be treated as address separators. (Encoding == quoting here)
            # Until this is fixed, quoting such strings is only a bandaid.
            $enc_str = _QuoteAddressLikeValue( $enc_str, $field );

            $str .= $prefix . $enc_str . $trailing;
        }
    }

    # handle filename*=ISO-8859-1''%74%E9%73%74%2E%74%78%74, see also rfc 2231
    @list = $str =~ m/(.*?\*=)([^']*?)'([^']*?)'(\S+)(.*?)(?=(?:\*=|$))/gcs;
    if (@list) {
        # Keep whatever follows the last extended parameter; without this
        # the unmatched remainder of the header would silently be dropped.
        $list[-1] .= substr( $str, pos $str );

        $str = '';
        while (@list) {
            # the third captured field is the language tag, which is unused
            my ( $prefix, $charset, undef, $enc_str, $trailing ) =
                splice @list, 0, 5;
            $prefix =~ s/\*=$/=/;    # remove the *
            $charset = _CanonicalizeCharset($charset);

            # only real percent-escapes (two hex digits) are decoded
            $enc_str =~ s/%([0-9A-Fa-f]{2})/chr hex $1/eg;

            $enc_str = _TranscodeHeaderOctets( $enc_str, $charset, $to_charset );
            $enc_str = _QuoteAddressLikeValue( $enc_str, $field );
            $str .= $prefix . $enc_str . $trailing;
        }
    }

    # We might have \n without trailing whitespace, which will result in
    # invalid headers.
    $str =~ s/\n//g;

    return ($str);
}

# Convert $octets from charset $from to charset $to and return the result.
# An empty or identical source charset means there is nothing to convert.
# For a source charset unknown to Encode the bytes outside printable ASCII
# are replaced by U+FFFD, because Encode::from_to would croak.
sub _TranscodeHeaderOctets {
    my ( $octets, $from, $to ) = @_;

    return $octets if $from eq '' or $from eq $to;

    if ( Encode::find_encoding($from) ) {
        Encode::from_to( $octets, $from, $to );
        return $octets;
    }

    $RT::Logger->warning("Charset '$from' is not supported");
    $octets =~ s/[^\x20-\x7e]/\xEF\xBF\xBD/g;    # UTF-8 bytes of U+FFFD
    Encode::from_to( $octets, 'utf-8', $to ) unless $to eq 'utf-8';
    return $octets;
}

# Wrap $value in double quotes when it contains "," or ";" and is not
# quoted already, for To/From/Cc/Bcc/Content-* headers or when the header
# name is not known.  Some MUAs encode the quotes themselves, hence the
# check for existing quotes.
sub _QuoteAddressLikeValue {
    my ( $value, $field ) = @_;

    return $value unless $value =~ /[,;]/;
    return $value if $value =~ /^".*"$/;
    return $value
        unless !$field || $field =~ /^(?:To$|From$|B?Cc$|Content-)/i;

    return qq{"$value"};
}
380
381
382
383=head2 _FindOrGuessCharset MIME::Entity, $head_only
384
385When handed a MIME::Entity will first attempt to read what charset the message is encoded in. Failing that, will use Encode::Guess to try to figure it out
386
387If $head_only is true, only guesses charset for head parts. This is because header's encoding (e.g. filename="...") may be different from that of body's.
388
389=cut
390
# Work out the charset of a MIME::Entity: use the declared Content-Type
# charset when present, otherwise guess.  With a true $head_only, or for
# a non-text entity, the guess is based on the header; otherwise on the
# body.  Returns nothing when a text entity has no body handle.
sub _FindOrGuessCharset {
    my ( $entity, $head_only ) = @_;
    my $head = $entity->head;

    my $declared = $head->mime_attr("content-type.charset");
    return _CanonicalizeCharset($declared) if $declared;

    # potentially binary data -- don't guess the body
    return _GuessCharset( $head->as_string )
        if $head_only or $head->mime_type !~ m{^text/};

    my $body = $entity->bodyhandle;
    return unless $body;

    return _GuessCharset( $body->as_string );
}
410
411
412
413=head2 _GuessCharset STRING
414
415Uses Encode::Detect::Detector and/or Encode::Guess, as configured by EmailInputEncodings, to try to figure out the string's encoding.
416
417=cut
418
# Feature flags: true when the optional guessing modules could be loaded.
use constant HAS_ENCODE_GUESS  => do { local $@; eval { require Encode::Guess; 1 } };
use constant HAS_ENCODE_DETECT => do { local $@; eval { require Encode::Detect::Detector; 1 } };

# Guess the charset of the string passed as the first argument, driven by
# the EmailInputEncodings configuration.  A leading '*' in that list asks
# for Encode::Detect::Detector first; the remaining entries are handed to
# Encode::Guess as suspects.  Falls back to iso-8859-1 whenever no answer
# can be produced.  The argument is read through $_[0] and never copied.
sub _GuessCharset {
    my $fallback = _CanonicalizeCharset('iso-8859-1');

    # if $_[0] is null/empty, we don't guess its encoding
    return $fallback if !defined $_[0] || !length $_[0];

    my @suspects = RT->Config->Get('EmailInputEncodings');
    if ( !@suspects ) {
        $RT::Logger->warning("No EmailInputEncodings set, fallback to $fallback");
        return $fallback;
    }

    if ( $suspects[0] eq '*' ) {
        shift @suspects;

        if ( !HAS_ENCODE_DETECT ) {
            $RT::Logger->error(
                "You requested to guess encoding, but we couldn't load Encode::Detect::Detector module"
            );
        }
        elsif ( my $detected = Encode::Detect::Detector::detect( $_[0] ) ) {
            $RT::Logger->debug("Encode::Detect::Detector guessed encoding: $detected");
            return _CanonicalizeCharset( Encode::resolve_alias($detected) );
        }
        else {
            $RT::Logger->debug("Encode::Detect::Detector failed to guess encoding");
        }
    }

    if ( !@suspects ) {
        $RT::Logger->warning("No EmailInputEncodings set except '*', fallback to $fallback");
        return $fallback;
    }

    if ( !HAS_ENCODE_GUESS ) {
        $RT::Logger->error("We couldn't load Encode::Guess module, fallback to $fallback");
        return $fallback;
    }

    Encode::Guess->set_suspects(@suspects);
    my $decoder = Encode::Guess->guess( $_[0] );

    if ( !defined $decoder ) {
        $RT::Logger->warning("Encode::Guess failed: decoder is undefined; fallback to $fallback");
        return $fallback;
    }

    # An object means an unambiguous answer
    if ( ref $decoder ) {
        my $charset = $decoder->name;
        $RT::Logger->debug("Encode::Guess guessed encoding: $charset");
        return _CanonicalizeCharset($charset);
    }

    # Otherwise it is a message: either "A or B ..." or a plain failure
    my ($alternatives) = $decoder =~ /(\S+ or .+)/;
    if ( !defined $alternatives ) {
        $RT::Logger->warning("Encode::Guess failed: $decoder; fallback to $fallback");
        return $fallback;
    }

    my %matched = map { $_ => 1 } split / or /, $alternatives;
    return 'utf-8' if $matched{'utf8'};    # one and only normalization

    # Ambiguous: the first configured encoding among the candidates wins
    for my $suspect ( RT->Config->Get('EmailInputEncodings') ) {
        next unless $matched{$suspect};
        $RT::Logger->debug("Encode::Guess ambiguous ($decoder); using $suspect");
        return _CanonicalizeCharset($suspect);
    }

    return $fallback;
}
493
494=head2 _CanonicalizeCharset NAME
495
496canonicalize charset, return lowercase version.
497special cases are: gb2312 => gbk, utf8 => utf-8
498
499=cut
500
# Map a charset name to the spelling used throughout this module: the
# lowercased name, replaced by Encode's canonical name when the alias is
# known, with utf8/utf-8-strict folded to 'utf-8' and euc-cn widened to
# 'gbk'.  A false name is returned (lowercased) as is.
sub _CanonicalizeCharset {
    my ($name) = @_;

    my $charset = lc $name;
    return $charset unless $charset;

    # Canonicalize aliases if they're known
    $charset = Encode::resolve_alias($charset) || $charset;

    my %special_case = (
        'utf8'         => 'utf-8',
        'utf-8-strict' => 'utf-8',

        # gbk is superset of gb2312/euc-cn so it's safe
        # XXX TODO: gb18030 is an even larger, more permissive superset of gbk,
        # but needs Encode::HanExtra installed
        'euc-cn' => 'gbk',
    );

    return exists $special_case{$charset} ? $special_case{$charset} : $charset;
}
523
524
525=head2 SetMIMEHeadToEncoding HEAD OLD_CHARSET NEW_CHARSET
526
527Converts a MIME Head from one encoding to another. This totally violates the RFC.
528We should never need this. But, Surprise!, MUAs are badly broken and do this kind of stuff
529all the time
530
531
532=cut
533
# Convert every header of a MIME head from one charset to another and,
# unless $preserve_words is true, decode MIME encoded-words as well.
#
#   $head           - MIME::Head, modified in place
#   $charset        - charset the raw header octets are assumed to be in
#   $enc            - charset to convert to
#   $preserve_words - when true, encoded-words are left untouched
#
# When $charset is not known to Encode the octets are left as they are
# (a warning is logged) instead of dying half-way through the rewrite.
sub SetMIMEHeadToEncoding {
    my ( $head, $charset, $enc, $preserve_words ) = @_;

    $charset = _CanonicalizeCharset($charset);
    $enc     = _CanonicalizeCharset($enc);

    my $transcode = $charset ne $enc;

    # Encode::from_to croaks on an unknown source charset.  Inside the
    # loop below that would happen after the header was already deleted,
    # so decide once, up front, whether conversion is possible at all.
    if ( $transcode and not Encode::find_encoding($charset) ) {
        $RT::Logger->warning(
            "Encoding '$charset' is not supported; leaving headers unconverted"
        );
        $transcode = 0;
    }

    # nothing to convert and nothing to decode
    return if !$transcode and $preserve_words;

    foreach my $tag ( $head->tags ) {
        next unless $tag;    # seen in wild: headers with no name
        my @values = $head->get_all($tag);
        $head->delete($tag);
        foreach my $value (@values) {
            if ($transcode) {
                # from_to works on octets, so drop the UTF-8 flag first
                Encode::_utf8_off($value);
                Encode::from_to( $value, $charset => $enc );
            }
            $value = DecodeMIMEWordsToEncoding( $value, $enc, $tag )
                unless $preserve_words;

            # We intentionally add a leading space when re-adding the
            # header; Mail::Header strips it before storing, but it
            # serves to prevent it from "helpfully" canonicalizing
            # $head->add("Subject", "Subject: foo") into the same as
            # $head->add("Subject", "foo");
            $head->add( $tag, " " . $value );
        }
    }

    return;
}
564
565RT::Base->_ImportOverlays();
566
5671; # End of module.
568