]>
Commit | Line | Data |
---|---|---|
84fb5b46 MKG |
1 | # BEGIN BPS TAGGED BLOCK {{{ |
2 | # | |
3 | # COPYRIGHT: | |
4 | # | |
5 | # This software is Copyright (c) 1996-2012 Best Practical Solutions, LLC | |
6 | # <sales@bestpractical.com> | |
7 | # | |
8 | # (Except where explicitly superseded by other copyright notices) | |
9 | # | |
10 | # | |
11 | # LICENSE: | |
12 | # | |
13 | # This work is made available to you under the terms of Version 2 of | |
14 | # the GNU General Public License. A copy of that license should have | |
15 | # been provided with this software, but in any event can be snarfed | |
16 | # from www.gnu.org. | |
17 | # | |
18 | # This work is distributed in the hope that it will be useful, but | |
19 | # WITHOUT ANY WARRANTY; without even the implied warranty of | |
20 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
21 | # General Public License for more details. | |
22 | # | |
23 | # You should have received a copy of the GNU General Public License | |
24 | # along with this program; if not, write to the Free Software | |
25 | # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA | |
26 | # 02110-1301 or visit their web page on the internet at | |
27 | # http://www.gnu.org/licenses/old-licenses/gpl-2.0.html. | |
28 | # | |
29 | # | |
30 | # CONTRIBUTION SUBMISSION POLICY: | |
31 | # | |
32 | # (The following paragraph is not intended to limit the rights granted | |
33 | # to you to modify and distribute this software under the terms of | |
34 | # the GNU General Public License and is only of importance to you if | |
35 | # you choose to contribute your changes and enhancements to the | |
36 | # community by submitting them to Best Practical Solutions, LLC.) | |
37 | # | |
38 | # By intentionally submitting any modifications, corrections or | |
39 | # derivatives to this work, or any other work intended for use with | |
40 | # Request Tracker, to Best Practical Solutions, LLC, you confirm that | |
41 | # you are the copyright holder for those contributions and you grant | |
42 | # Best Practical Solutions, LLC a nonexclusive, worldwide, irrevocable, | |
43 | # royalty-free, perpetual, license to use, copy, create derivative | |
44 | # works based on those contributions, and sublicense and distribute | |
45 | # those contributions and any derivatives thereof. | |
46 | # | |
47 | # END BPS TAGGED BLOCK }}} | |
48 | ||
49 | =head1 NAME | |
50 | ||
51 | RT::I18N - a base class for localization of RT | |
52 | ||
53 | =cut | |
54 | ||
55 | package RT::I18N; | |
56 | ||
57 | use strict; | |
58 | use warnings; | |
59 | ||
60 | ||
61 | use Locale::Maketext 1.04; | |
62 | use Locale::Maketext::Lexicon 0.25; | |
63 | use base 'Locale::Maketext::Fuzzy'; | |
64 | ||
65 | use Encode; | |
66 | use MIME::Entity; | |
67 | use MIME::Head; | |
68 | use File::Glob; | |
69 | ||
70 | # I decree that this project's first language is English. | |
71 | ||
72 | our %Lexicon = ( | |
73 | 'TEST_STRING' => 'Concrete Mixer', | |
74 | ||
75 | '__Content-Type' => 'text/plain; charset=utf-8', | |
76 | ||
77 | '_AUTO' => 1, | |
78 | # That means that lookup failures can't happen -- if we get as far | |
79 | # as looking for something in this lexicon, and we don't find it, | |
80 | # then automagically set $Lexicon{$key} = $key, before possibly | |
81 | # compiling it. | |
82 | ||
83 | # The exception is keys that start with "_" -- they aren't auto-makeable. | |
84 | ||
85 | ); | |
86 | # End of lexicon. | |
87 | ||
88 | =head2 Init | |
89 | ||
90 | Initializes the lexicons used for localization. | |
91 | ||
92 | ||
93 | =cut | |
94 | ||
95 | sub Init { | |
96 | ||
97 | my @lang = RT->Config->Get('LexiconLanguages'); | |
98 | @lang = ('*') unless @lang; | |
99 | ||
100 | # load default functions | |
101 | require substr(__FILE__, 0, -3) . '/i_default.pm'; | |
102 | ||
103 | # Load language-specific functions | |
104 | foreach my $file ( File::Glob::bsd_glob(substr(__FILE__, 0, -3) . "/*.pm") ) { | |
105 | unless ( $file =~ /^([-\w\s\.\/\\~:]+)$/ ) { | |
106 | warn("$file is tainted. not loading"); | |
107 | next; | |
108 | } | |
109 | $file = $1; | |
110 | ||
111 | my ($lang) = ($file =~ /([^\\\/]+?)\.pm$/); | |
112 | next unless grep $_ eq '*' || $_ eq $lang, @lang; | |
113 | require $file; | |
114 | } | |
115 | ||
116 | my %import; | |
117 | foreach my $l ( @lang ) { | |
118 | $import{$l} = [ | |
119 | Gettext => $RT::LexiconPath."/$l.po", | |
120 | ]; | |
121 | push @{ $import{$l} }, map {(Gettext => "$_/$l.po")} RT->PluginDirs('po'); | |
122 | push @{ $import{$l} }, (Gettext => $RT::LocalLexiconPath."/*/$l.po", | |
123 | Gettext => $RT::LocalLexiconPath."/$l.po"); | |
124 | } | |
125 | ||
126 | # Acquire all .po files and iterate them into lexicons | |
127 | Locale::Maketext::Lexicon->import({ _decode => 1, %import }); | |
128 | ||
129 | return 1; | |
130 | } | |
131 | ||
132 | sub LoadLexicons { | |
133 | ||
134 | no strict 'refs'; | |
135 | foreach my $k (keys %{RT::I18N::} ) { | |
136 | next if $k eq 'main::'; | |
137 | next unless index($k, '::', -2) >= 0; | |
138 | next unless exists ${ 'RT::I18N::'. $k }{'Lexicon'}; | |
139 | ||
140 | my $lex = *{ ${'RT::I18N::'. $k }{'Lexicon'} }{HASH}; | |
141 | # run fetch to force load | |
142 | my $tmp = $lex->{'foo'}; | |
143 | # XXX: untie may fail with "untie attempted | |
144 | # while 1 inner references still exist" | |
145 | # TODO: untie that has to lower fetch impact | |
146 | # untie %$lex if tied %$lex; | |
147 | } | |
148 | } | |
149 | ||
150 | =head2 encoding | |
151 | ||
152 | Returns the encoding of the current lexicon, as yanked out of __ContentType's "charset" field. | |
153 | If it can't find anything, it returns 'ISO-8859-1' | |
154 | ||
155 | ||
156 | ||
157 | =cut | |
158 | ||
159 | ||
160 | sub encoding { 'utf-8' } | |
161 | ||
162 | ||
163 | =head2 SetMIMEEntityToUTF8 $entity | |
164 | ||
165 | An utility function which will try to convert entity body into utf8. | |
166 | It's now a wrap-up of SetMIMEEntityToEncoding($entity, 'utf-8'). | |
167 | ||
168 | =cut | |
169 | ||
170 | sub SetMIMEEntityToUTF8 { | |
171 | RT::I18N::SetMIMEEntityToEncoding(shift, 'utf-8'); | |
172 | } | |
173 | ||
174 | ||
175 | ||
176 | =head2 IsTextualContentType $type | |
177 | ||
178 | An utility function that determines whether $type is I<textual>, meaning | |
179 | that it can sensibly be converted to Unicode text. | |
180 | ||
181 | Currently, it returns true iff $type matches this regular expression | |
182 | (case-insensitively): | |
183 | ||
184 | ^(?:text/(?:plain|html)|message/rfc822)\b | |
185 | ||
186 | ||
187 | =cut | |
188 | ||
189 | sub IsTextualContentType { | |
190 | my $type = shift; | |
191 | ($type =~ m{^(?:text/(?:plain|html)|message/rfc822)\b}i) ? 1 : 0; | |
192 | } | |
193 | ||
194 | ||
195 | =head2 SetMIMEEntityToEncoding $entity, $encoding | |
196 | ||
197 | An utility function which will try to convert entity body into specified | |
198 | charset encoding (encoded as octets, *not* unicode-strings). It will | |
199 | iterate all the entities in $entity, and try to convert each one into | |
200 | specified charset if whose Content-Type is 'text/plain'. | |
201 | ||
202 | This function doesn't return anything meaningful. | |
203 | ||
204 | =cut | |
205 | ||
206 | sub SetMIMEEntityToEncoding { | |
207 | my ( $entity, $enc, $preserve_words ) = ( shift, shift, shift ); | |
208 | ||
209 | # do the same for parts first of all | |
210 | SetMIMEEntityToEncoding( $_, $enc, $preserve_words ) foreach $entity->parts; | |
211 | ||
212 | my $charset = _FindOrGuessCharset($entity) or return; | |
213 | ||
214 | SetMIMEHeadToEncoding( | |
215 | $entity->head, | |
216 | _FindOrGuessCharset($entity, 1) => $enc, | |
217 | $preserve_words | |
218 | ); | |
219 | ||
220 | my $head = $entity->head; | |
221 | ||
84fb5b46 MKG |
222 | # If this is a textual entity, we'd need to preserve its original encoding |
223 | $head->replace( "X-RT-Original-Encoding" => $charset ) | |
224 | if $head->mime_attr('content-type.charset') or IsTextualContentType($head->mime_type); | |
225 | ||
226 | return unless IsTextualContentType($head->mime_type); | |
227 | ||
228 | my $body = $entity->bodyhandle; | |
229 | ||
dab09ea8 | 230 | if ( $body && ($enc ne $charset || $enc =~ /^utf-?8(?:-strict)?$/i) ) { |
84fb5b46 MKG |
231 | my $string = $body->as_string or return; |
232 | ||
233 | $RT::Logger->debug( "Converting '$charset' to '$enc' for " | |
234 | . $head->mime_type . " - " | |
235 | . ( $head->get('subject') || 'Subjectless message' ) ); | |
236 | ||
237 | # NOTE:: see the comments at the end of the sub. | |
238 | Encode::_utf8_off($string); | |
239 | Encode::from_to( $string, $charset => $enc ); | |
240 | ||
241 | my $new_body = MIME::Body::InCore->new($string); | |
242 | ||
243 | # set up the new entity | |
244 | $head->mime_attr( "content-type" => 'text/plain' ) | |
245 | unless ( $head->mime_attr("content-type") ); | |
246 | $head->mime_attr( "content-type.charset" => $enc ); | |
247 | $entity->bodyhandle($new_body); | |
248 | } | |
249 | } | |
250 | ||
251 | # NOTES: Why Encode::_utf8_off before Encode::from_to | |
252 | # | |
253 | # All the strings in RT are utf-8 now. Quotes from Encode POD: | |
254 | # | |
255 | # [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK]) | |
256 | # ... The data in $octets must be encoded as octets and not as | |
257 | # characters in Perl's internal format. ... | |
258 | # | |
259 | # Not turning off the UTF-8 flag in the string will prevent the string | |
260 | # from conversion. | |
261 | ||
262 | ||
263 | ||
264 | =head2 DecodeMIMEWordsToUTF8 $raw | |
265 | ||
266 | An utility method which mimics MIME::Words::decode_mimewords, but only | |
267 | limited functionality. This function returns an utf-8 string. | |
268 | ||
269 | It returns the decoded string, or the original string if it's not | |
270 | encoded. Since the subroutine converts specified string into utf-8 | |
271 | charset, it should not alter a subject written in English. | |
272 | ||
273 | Why not use MIME::Words directly? Because it fails in RT when I | |
274 | tried. Maybe it's ok now. | |
275 | ||
276 | =cut | |
277 | ||
278 | sub DecodeMIMEWordsToUTF8 { | |
279 | my $str = shift; | |
280 | return DecodeMIMEWordsToEncoding($str, 'utf-8', @_); | |
281 | } | |
282 | ||
283 | sub DecodeMIMEWordsToEncoding { | |
284 | my $str = shift; | |
285 | my $to_charset = _CanonicalizeCharset(shift); | |
286 | my $field = shift || ''; | |
287 | ||
b5747ff2 MKG |
288 | # handle filename*=ISO-8859-1''%74%E9%73%74%2E%74%78%74, parameter value |
289 | # continuations, and similar syntax from RFC 2231 | |
290 | if ($field =~ /^Content-(Type|Disposition)/i) { | |
291 | # This concatenates continued parameters and normalizes encoded params | |
292 | # to QB encoded-words which we handle below | |
293 | $str = MIME::Field::ParamVal->parse($str)->stringify; | |
294 | } | |
295 | ||
296 | # XXX TODO: use decode('MIME-Header', ...) and Encode::Alias to replace our | |
297 | # custom MIME word decoding and charset canonicalization. We can't do this | |
298 | # until we parse before decode, instead of the other way around. | |
299 | my @list = $str =~ m/(.*?) # prefix | |
300 | =\? # =? | |
301 | ([^?]+?) # charset | |
302 | (?:\*[^?]+)? # optional '*language' | |
303 | \? # ? | |
304 | ([QqBb]) # encoding | |
305 | \? # ? | |
306 | ([^?]+) # encoded string | |
307 | \?= # ?= | |
308 | ([^=]*) # trailing | |
309 | /xgcs; | |
84fb5b46 MKG |
310 | |
311 | if ( @list ) { | |
312 | # add everything that hasn't matched to the end of the latest | |
313 | # string in array this happen when we have 'key="=?encoded?="; key="plain"' | |
314 | $list[-1] .= substr($str, pos $str); | |
315 | ||
316 | $str = ""; | |
317 | while (@list) { | |
318 | my ($prefix, $charset, $encoding, $enc_str, $trailing) = | |
319 | splice @list, 0, 5; | |
320 | $charset = _CanonicalizeCharset($charset); | |
321 | $encoding = lc $encoding; | |
322 | ||
323 | $trailing =~ s/\s?\t?$//; # Observed from Outlook Express | |
324 | ||
325 | if ( $encoding eq 'q' ) { | |
326 | use MIME::QuotedPrint; | |
327 | $enc_str =~ tr/_/ /; # Observed from Outlook Express | |
328 | $enc_str = decode_qp($enc_str); | |
329 | } elsif ( $encoding eq 'b' ) { | |
330 | use MIME::Base64; | |
331 | $enc_str = decode_base64($enc_str); | |
332 | } else { | |
333 | $RT::Logger->warning("Incorrect encoding '$encoding' in '$str', " | |
334 | ."only Q(uoted-printable) and B(ase64) are supported"); | |
335 | } | |
336 | ||
337 | # now we have got a decoded subject, try to convert into the encoding | |
dab09ea8 | 338 | if ( $charset ne $to_charset || $charset =~ /^utf-?8(?:-strict)?$/i ) { |
84fb5b46 MKG |
339 | Encode::from_to( $enc_str, $charset, $to_charset ); |
340 | } | |
341 | ||
342 | # XXX TODO: RT doesn't currently do the right thing with mime-encoded headers | |
343 | # We _should_ be preserving them encoded until after parsing is completed and | |
344 | # THEN undo the mime-encoding. | |
345 | # | |
346 | # This routine should be translating the existing mimeencoding to utf8 but leaving | |
347 | # things encoded. | |
348 | # | |
349 | # It's legal for headers to contain mime-encoded commas and semicolons which | |
350 | # should not be treated as address separators. (Encoding == quoting here) | |
351 | # | |
352 | # until this is fixed, we must escape any string containing a comma or semicolon | |
353 | # this is only a bandaid | |
354 | ||
355 | # Some _other_ MUAs encode quotes _already_, and double quotes | |
356 | # confuse us a lot, so only quote it if it isn't quoted | |
357 | # already. | |
358 | $enc_str = qq{"$enc_str"} | |
359 | if $enc_str =~ /[,;]/ | |
360 | and $enc_str !~ /^".*"$/ | |
361 | and (!$field || $field =~ /^(?:To$|From$|B?Cc$|Content-)/i); | |
362 | ||
363 | $str .= $prefix . $enc_str . $trailing; | |
364 | } | |
365 | } | |
366 | ||
84fb5b46 MKG |
367 | # We might have \n without trailing whitespace, which will result in |
368 | # invalid headers. | |
369 | $str =~ s/\n//g; | |
370 | ||
371 | return ($str) | |
372 | } | |
373 | ||
374 | ||
375 | ||
376 | =head2 _FindOrGuessCharset MIME::Entity, $head_only | |
377 | ||
378 | When handed a MIME::Entity will first attempt to read what charset the message is encoded in. Failing that, will use Encode::Guess to try to figure it out | |
379 | ||
380 | If $head_only is true, only guesses charset for head parts. This is because header's encoding (e.g. filename="...") may be different from that of body's. | |
381 | ||
382 | =cut | |
383 | ||
384 | sub _FindOrGuessCharset { | |
385 | my $entity = shift; | |
386 | my $head_only = shift; | |
387 | my $head = $entity->head; | |
388 | ||
389 | if ( my $charset = $head->mime_attr("content-type.charset") ) { | |
390 | return _CanonicalizeCharset($charset); | |
391 | } | |
392 | ||
393 | if ( !$head_only and $head->mime_type =~ m{^text/} ) { | |
394 | my $body = $entity->bodyhandle or return; | |
395 | return _GuessCharset( $body->as_string ); | |
396 | } | |
397 | else { | |
398 | ||
399 | # potentially binary data -- don't guess the body | |
400 | return _GuessCharset( $head->as_string ); | |
401 | } | |
402 | } | |
403 | ||
404 | ||
405 | ||
406 | =head2 _GuessCharset STRING | |
407 | ||
408 | use Encode::Guess to try to figure it out the string's encoding. | |
409 | ||
410 | =cut | |
411 | ||
412 | use constant HAS_ENCODE_GUESS => do { local $@; eval { require Encode::Guess; 1 } }; | |
413 | use constant HAS_ENCODE_DETECT => do { local $@; eval { require Encode::Detect::Detector; 1 } }; | |
414 | ||
415 | sub _GuessCharset { | |
416 | my $fallback = _CanonicalizeCharset('iso-8859-1'); | |
417 | ||
418 | # if $_[0] is null/empty, we don't guess its encoding | |
419 | return $fallback | |
420 | unless defined $_[0] && length $_[0]; | |
421 | ||
422 | my @encodings = RT->Config->Get('EmailInputEncodings'); | |
423 | unless ( @encodings ) { | |
424 | $RT::Logger->warning("No EmailInputEncodings set, fallback to $fallback"); | |
425 | return $fallback; | |
426 | } | |
427 | ||
428 | if ( $encodings[0] eq '*' ) { | |
429 | shift @encodings; | |
430 | if ( HAS_ENCODE_DETECT ) { | |
431 | my $charset = Encode::Detect::Detector::detect( $_[0] ); | |
432 | if ( $charset ) { | |
433 | $RT::Logger->debug("Encode::Detect::Detector guessed encoding: $charset"); | |
434 | return _CanonicalizeCharset( Encode::resolve_alias( $charset ) ); | |
435 | } | |
436 | else { | |
437 | $RT::Logger->debug("Encode::Detect::Detector failed to guess encoding"); | |
438 | } | |
439 | } | |
440 | else { | |
441 | $RT::Logger->error( | |
442 | "You requested to guess encoding, but we couldn't" | |
443 | ." load Encode::Detect::Detector module" | |
444 | ); | |
445 | } | |
446 | } | |
447 | ||
448 | unless ( @encodings ) { | |
449 | $RT::Logger->warning("No EmailInputEncodings set except '*', fallback to $fallback"); | |
450 | return $fallback; | |
451 | } | |
452 | ||
453 | unless ( HAS_ENCODE_GUESS ) { | |
454 | $RT::Logger->error("We couldn't load Encode::Guess module, fallback to $fallback"); | |
455 | return $fallback; | |
456 | } | |
457 | ||
458 | Encode::Guess->set_suspects( @encodings ); | |
459 | my $decoder = Encode::Guess->guess( $_[0] ); | |
460 | unless ( defined $decoder ) { | |
461 | $RT::Logger->warning("Encode::Guess failed: decoder is undefined; fallback to $fallback"); | |
462 | return $fallback; | |
463 | } | |
464 | ||
465 | if ( ref $decoder ) { | |
466 | my $charset = $decoder->name; | |
467 | $RT::Logger->debug("Encode::Guess guessed encoding: $charset"); | |
468 | return _CanonicalizeCharset( $charset ); | |
469 | } | |
470 | elsif ($decoder =~ /(\S+ or .+)/) { | |
471 | my %matched = map { $_ => 1 } split(/ or /, $1); | |
472 | return 'utf-8' if $matched{'utf8'}; # one and only normalization | |
473 | ||
474 | foreach my $suspect (RT->Config->Get('EmailInputEncodings')) { | |
475 | next unless $matched{$suspect}; | |
476 | $RT::Logger->debug("Encode::Guess ambiguous ($decoder); using $suspect"); | |
477 | return _CanonicalizeCharset( $suspect ); | |
478 | } | |
479 | } | |
480 | else { | |
481 | $RT::Logger->warning("Encode::Guess failed: $decoder; fallback to $fallback"); | |
482 | } | |
483 | ||
484 | return $fallback; | |
485 | } | |
486 | ||
487 | =head2 _CanonicalizeCharset NAME | |
488 | ||
489 | canonicalize charset, return lowercase version. | |
490 | special cases are: gb2312 => gbk, utf8 => utf-8 | |
491 | ||
492 | =cut | |
493 | ||
494 | sub _CanonicalizeCharset { | |
495 | my $charset = lc shift; | |
496 | return $charset unless $charset; | |
497 | ||
498 | # Canonicalize aliases if they're known | |
499 | if (my $canonical = Encode::resolve_alias($charset)) { | |
500 | $charset = $canonical; | |
501 | } | |
502 | ||
503 | if ( $charset eq 'utf8' || $charset eq 'utf-8-strict' ) { | |
504 | return 'utf-8'; | |
505 | } | |
506 | elsif ( $charset eq 'euc-cn' ) { | |
507 | # gbk is superset of gb2312/euc-cn so it's safe | |
508 | return 'gbk'; | |
509 | # XXX TODO: gb18030 is an even larger, more permissive superset of gbk, | |
510 | # but needs Encode::HanExtra installed | |
511 | } | |
512 | else { | |
513 | return $charset; | |
514 | } | |
515 | } | |
516 | ||
517 | ||
518 | =head2 SetMIMEHeadToEncoding HEAD OLD_CHARSET NEW_CHARSET | |
519 | ||
520 | Converts a MIME Head from one encoding to another. This totally violates the RFC. | |
521 | We should never need this. But, Surprise!, MUAs are badly broken and do this kind of stuff | |
522 | all the time | |
523 | ||
524 | ||
525 | =cut | |
526 | ||
527 | sub SetMIMEHeadToEncoding { | |
528 | my ( $head, $charset, $enc, $preserve_words ) = ( shift, shift, shift, shift ); | |
529 | ||
530 | $charset = _CanonicalizeCharset($charset); | |
531 | $enc = _CanonicalizeCharset($enc); | |
532 | ||
533 | return if $charset eq $enc and $preserve_words; | |
534 | ||
535 | foreach my $tag ( $head->tags ) { | |
536 | next unless $tag; # seen in wild: headers with no name | |
537 | my @values = $head->get_all($tag); | |
538 | $head->delete($tag); | |
539 | foreach my $value (@values) { | |
dab09ea8 | 540 | if ( $charset ne $enc || $enc =~ /^utf-?8(?:-strict)?$/i ) { |
84fb5b46 MKG |
541 | Encode::_utf8_off($value); |
542 | Encode::from_to( $value, $charset => $enc ); | |
543 | } | |
544 | $value = DecodeMIMEWordsToEncoding( $value, $enc, $tag ) | |
545 | unless $preserve_words; | |
546 | ||
547 | # We intentionally add a leading space when re-adding the | |
548 | # header; Mail::Header strips it before storing, but it | |
549 | # serves to prevent it from "helpfully" canonicalizing | |
550 | # $head->add("Subject", "Subject: foo") into the same as | |
551 | # $head->add("Subject", "foo"); | |
552 | $head->add( $tag, " " . $value ); | |
553 | } | |
554 | } | |
555 | ||
556 | } | |
557 | ||
558 | RT::Base->_ImportOverlays(); | |
559 | ||
560 | 1; # End of module. | |
561 |