]>
Commit | Line | Data |
---|---|---|
84fb5b46 MKG |
1 | # BEGIN BPS TAGGED BLOCK {{{ |
2 | # | |
3 | # COPYRIGHT: | |
4 | # | |
3ffc5f4f | 5 | # This software is Copyright (c) 1996-2014 Best Practical Solutions, LLC |
84fb5b46 MKG |
6 | # <sales@bestpractical.com> |
7 | # | |
8 | # (Except where explicitly superseded by other copyright notices) | |
9 | # | |
10 | # | |
11 | # LICENSE: | |
12 | # | |
13 | # This work is made available to you under the terms of Version 2 of | |
14 | # the GNU General Public License. A copy of that license should have | |
15 | # been provided with this software, but in any event can be snarfed | |
16 | # from www.gnu.org. | |
17 | # | |
18 | # This work is distributed in the hope that it will be useful, but | |
19 | # WITHOUT ANY WARRANTY; without even the implied warranty of | |
20 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
21 | # General Public License for more details. | |
22 | # | |
23 | # You should have received a copy of the GNU General Public License | |
24 | # along with this program; if not, write to the Free Software | |
25 | # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA | |
26 | # 02110-1301 or visit their web page on the internet at | |
27 | # http://www.gnu.org/licenses/old-licenses/gpl-2.0.html. | |
28 | # | |
29 | # | |
30 | # CONTRIBUTION SUBMISSION POLICY: | |
31 | # | |
32 | # (The following paragraph is not intended to limit the rights granted | |
33 | # to you to modify and distribute this software under the terms of | |
34 | # the GNU General Public License and is only of importance to you if | |
35 | # you choose to contribute your changes and enhancements to the | |
36 | # community by submitting them to Best Practical Solutions, LLC.) | |
37 | # | |
38 | # By intentionally submitting any modifications, corrections or | |
39 | # derivatives to this work, or any other work intended for use with | |
40 | # Request Tracker, to Best Practical Solutions, LLC, you confirm that | |
41 | # you are the copyright holder for those contributions and you grant | |
42 | # Best Practical Solutions, LLC a nonexclusive, worldwide, irrevocable, | |
43 | # royalty-free, perpetual, license to use, copy, create derivative | |
44 | # works based on those contributions, and sublicense and distribute | |
45 | # those contributions and any derivatives thereof. | |
46 | # | |
47 | # END BPS TAGGED BLOCK }}} | |
48 | ||
49 | =head1 NAME | |
50 | ||
51 | RT::I18N - a base class for localization of RT | |
52 | ||
53 | =cut | |
54 | ||
55 | package RT::I18N; | |
56 | ||
57 | use strict; | |
58 | use warnings; | |
59 | ||
60 | ||
61 | use Locale::Maketext 1.04; | |
62 | use Locale::Maketext::Lexicon 0.25; | |
63 | use base 'Locale::Maketext::Fuzzy'; | |
64 | ||
84fb5b46 MKG |
65 | use MIME::Entity; |
66 | use MIME::Head; | |
67 | use File::Glob; | |
68 | ||
69 | # I decree that this project's first language is English. | |
70 | ||
71 | our %Lexicon = ( | |
72 | 'TEST_STRING' => 'Concrete Mixer', | |
73 | ||
74 | '__Content-Type' => 'text/plain; charset=utf-8', | |
75 | ||
76 | '_AUTO' => 1, | |
77 | # That means that lookup failures can't happen -- if we get as far | |
78 | # as looking for something in this lexicon, and we don't find it, | |
79 | # then automagically set $Lexicon{$key} = $key, before possibly | |
80 | # compiling it. | |
81 | ||
82 | # The exception is keys that start with "_" -- they aren't auto-makeable. | |
83 | ||
84 | ); | |
85 | # End of lexicon. | |
86 | ||
87 | =head2 Init | |
88 | ||
89 | Initializes the lexicons used for localization. | |
90 | ||
91 | ||
92 | =cut | |
93 | ||
94 | sub Init { | |
95 | ||
96 | my @lang = RT->Config->Get('LexiconLanguages'); | |
97 | @lang = ('*') unless @lang; | |
98 | ||
99 | # load default functions | |
100 | require substr(__FILE__, 0, -3) . '/i_default.pm'; | |
101 | ||
102 | # Load language-specific functions | |
103 | foreach my $file ( File::Glob::bsd_glob(substr(__FILE__, 0, -3) . "/*.pm") ) { | |
104 | unless ( $file =~ /^([-\w\s\.\/\\~:]+)$/ ) { | |
105 | warn("$file is tainted. not loading"); | |
106 | next; | |
107 | } | |
108 | $file = $1; | |
109 | ||
110 | my ($lang) = ($file =~ /([^\\\/]+?)\.pm$/); | |
111 | next unless grep $_ eq '*' || $_ eq $lang, @lang; | |
112 | require $file; | |
113 | } | |
114 | ||
115 | my %import; | |
116 | foreach my $l ( @lang ) { | |
117 | $import{$l} = [ | |
118 | Gettext => $RT::LexiconPath."/$l.po", | |
119 | ]; | |
120 | push @{ $import{$l} }, map {(Gettext => "$_/$l.po")} RT->PluginDirs('po'); | |
121 | push @{ $import{$l} }, (Gettext => $RT::LocalLexiconPath."/*/$l.po", | |
122 | Gettext => $RT::LocalLexiconPath."/$l.po"); | |
123 | } | |
124 | ||
125 | # Acquire all .po files and iterate them into lexicons | |
126 | Locale::Maketext::Lexicon->import({ _decode => 1, %import }); | |
127 | ||
128 | return 1; | |
129 | } | |
130 | ||
131 | sub LoadLexicons { | |
132 | ||
133 | no strict 'refs'; | |
134 | foreach my $k (keys %{RT::I18N::} ) { | |
135 | next if $k eq 'main::'; | |
136 | next unless index($k, '::', -2) >= 0; | |
137 | next unless exists ${ 'RT::I18N::'. $k }{'Lexicon'}; | |
138 | ||
139 | my $lex = *{ ${'RT::I18N::'. $k }{'Lexicon'} }{HASH}; | |
140 | # run fetch to force load | |
141 | my $tmp = $lex->{'foo'}; | |
142 | # XXX: untie may fail with "untie attempted | |
143 | # while 1 inner references still exist" | |
144 | # TODO: untie that has to lower fetch impact | |
145 | # untie %$lex if tied %$lex; | |
146 | } | |
147 | } | |
148 | ||
149 | =head2 encoding | |
150 | ||
151 | Returns the encoding of the current lexicon, as yanked out of __ContentType's "charset" field. | |
152 | If it can't find anything, it returns 'ISO-8859-1' | |
153 | ||
154 | ||
155 | ||
156 | =cut | |
157 | ||
158 | ||
159 | sub encoding { 'utf-8' } | |
160 | ||
161 | ||
162 | =head2 SetMIMEEntityToUTF8 $entity | |
163 | ||
164 | An utility function which will try to convert entity body into utf8. | |
165 | It's now a wrap-up of SetMIMEEntityToEncoding($entity, 'utf-8'). | |
166 | ||
167 | =cut | |
168 | ||
169 | sub SetMIMEEntityToUTF8 { | |
170 | RT::I18N::SetMIMEEntityToEncoding(shift, 'utf-8'); | |
171 | } | |
172 | ||
173 | ||
174 | ||
175 | =head2 IsTextualContentType $type | |
176 | ||
177 | An utility function that determines whether $type is I<textual>, meaning | |
178 | that it can sensibly be converted to Unicode text. | |
179 | ||
180 | Currently, it returns true iff $type matches this regular expression | |
181 | (case-insensitively): | |
182 | ||
183 | ^(?:text/(?:plain|html)|message/rfc822)\b | |
184 | ||
185 | ||
186 | =cut | |
187 | ||
188 | sub IsTextualContentType { | |
189 | my $type = shift; | |
190 | ($type =~ m{^(?:text/(?:plain|html)|message/rfc822)\b}i) ? 1 : 0; | |
191 | } | |
192 | ||
193 | ||
3ffc5f4f | 194 | =head2 SetMIMEEntityToEncoding Entity => ENTITY, Encoding => ENCODING, PreserveWords => BOOL, IsOut => BOOL |
84fb5b46 MKG |
195 | |
196 | An utility function which will try to convert entity body into specified | |
197 | charset encoding (encoded as octets, *not* unicode-strings). It will | |
198 | iterate all the entities in $entity, and try to convert each one into | |
199 | specified charset if whose Content-Type is 'text/plain'. | |
200 | ||
3ffc5f4f MKG |
201 | If PreserveWords is true, values in mime head will be decoded.(default is false) |
202 | ||
203 | Incoming and outgoing mails are handled differently, if IsOut is true(default | |
204 | is false), it'll be treated as outgoing mail, otherwise incomding mail: | |
205 | ||
206 | incoming mail: | |
207 | 1) find encoding | |
208 | 2) if found then try to convert to utf-8 in croak mode, return if success | |
209 | 3) guess encoding | |
210 | 4) if guessed differently then try to convert to utf-8 in croak mode, return | |
211 | if success | |
212 | 5) mark part as application/octet-stream instead of falling back to any | |
213 | encoding | |
214 | ||
215 | outgoing mail: | |
216 | 1) find encoding | |
217 | 2) if didn't find then do nothing, send as is, let MUA deal with it | |
218 | 3) if found then try to convert it to outgoing encoding in croak mode, return | |
219 | if success | |
220 | 4) do nothing otherwise, keep original encoding | |
221 | ||
84fb5b46 MKG |
222 | This function doesn't return anything meaningful. |
223 | ||
224 | =cut | |
225 | ||
226 | sub SetMIMEEntityToEncoding { | |
3ffc5f4f MKG |
227 | my ( $entity, $enc, $preserve_words, $is_out ); |
228 | ||
229 | if ( @_ <= 3 ) { | |
230 | ( $entity, $enc, $preserve_words ) = @_; | |
231 | } | |
232 | else { | |
233 | my %args = ( | |
234 | Entity => undef, | |
235 | Encoding => undef, | |
236 | PreserveWords => undef, | |
237 | IsOut => undef, | |
238 | @_, | |
239 | ); | |
240 | ||
241 | $entity = $args{Entity}; | |
242 | $enc = $args{Encoding}; | |
243 | $preserve_words = $args{PreserveWords}; | |
244 | $is_out = $args{IsOut}; | |
245 | } | |
246 | ||
247 | unless ( $entity && $enc ) { | |
248 | RT->Logger->error("Missing Entity or Encoding arguments"); | |
249 | return; | |
250 | } | |
84fb5b46 MKG |
251 | |
252 | # do the same for parts first of all | |
3ffc5f4f MKG |
253 | SetMIMEEntityToEncoding( |
254 | Entity => $_, | |
255 | Encoding => $enc, | |
256 | PreserveWords => $preserve_words, | |
257 | IsOut => $is_out, | |
258 | ) foreach $entity->parts; | |
84fb5b46 | 259 | |
403d7b0b MKG |
260 | my $head = $entity->head; |
261 | ||
262 | my $charset = _FindOrGuessCharset($entity); | |
263 | if ( $charset ) { | |
264 | unless( Encode::find_encoding($charset) ) { | |
265 | $RT::Logger->warning("Encoding '$charset' is not supported"); | |
266 | $charset = undef; | |
267 | } | |
268 | } | |
269 | unless ( $charset ) { | |
270 | $head->replace( "X-RT-Original-Content-Type" => $head->mime_attr('Content-Type') ); | |
271 | $head->mime_attr('Content-Type' => 'application/octet-stream'); | |
272 | return; | |
273 | } | |
84fb5b46 MKG |
274 | |
275 | SetMIMEHeadToEncoding( | |
3ffc5f4f MKG |
276 | Head => $head, |
277 | From => _FindOrGuessCharset( $entity, 1 ), | |
278 | To => $enc, | |
279 | PreserveWords => $preserve_words, | |
280 | IsOut => $is_out, | |
84fb5b46 MKG |
281 | ); |
282 | ||
84fb5b46 | 283 | # If this is a textual entity, we'd need to preserve its original encoding |
3ffc5f4f MKG |
284 | $head->replace( "X-RT-Original-Encoding" => Encode::encode( "UTF-8", $charset ) ) |
285 | if $head->mime_attr('content-type.charset') or IsTextualContentType($head->mime_type); | |
84fb5b46 MKG |
286 | |
287 | return unless IsTextualContentType($head->mime_type); | |
288 | ||
289 | my $body = $entity->bodyhandle; | |
290 | ||
dab09ea8 | 291 | if ( $body && ($enc ne $charset || $enc =~ /^utf-?8(?:-strict)?$/i) ) { |
84fb5b46 | 292 | my $string = $body->as_string or return; |
3ffc5f4f | 293 | RT::Util::assert_bytes($string); |
84fb5b46 MKG |
294 | |
295 | $RT::Logger->debug( "Converting '$charset' to '$enc' for " | |
296 | . $head->mime_type . " - " | |
3ffc5f4f MKG |
297 | . ( Encode::decode("UTF-8",$head->get('subject')) || 'Subjectless message' ) ); |
298 | ||
299 | my $orig_string = $string; | |
300 | ( my $success, $string ) = EncodeFromToWithCroak( $orig_string, $charset => $enc ); | |
301 | if ( !$success ) { | |
302 | return if $is_out; | |
303 | my $error = $string; | |
304 | ||
305 | my $guess = _GuessCharset($orig_string); | |
306 | if ( $guess && $guess ne $charset ) { | |
307 | $RT::Logger->error( "Encoding error: " . $error . " falling back to Guess($guess) => $enc" ); | |
308 | ( $success, $string ) = EncodeFromToWithCroak( $orig_string, $guess, $enc ); | |
309 | $error = $string unless $success; | |
310 | } | |
84fb5b46 | 311 | |
3ffc5f4f MKG |
312 | if ( !$success ) { |
313 | $RT::Logger->error( "Encoding error: " . $error . " falling back to application/octet-stream" ); | |
314 | $head->mime_attr( "content-type" => 'application/octet-stream' ); | |
315 | return; | |
316 | } | |
317 | } | |
84fb5b46 MKG |
318 | |
319 | my $new_body = MIME::Body::InCore->new($string); | |
320 | ||
321 | # set up the new entity | |
322 | $head->mime_attr( "content-type" => 'text/plain' ) | |
323 | unless ( $head->mime_attr("content-type") ); | |
324 | $head->mime_attr( "content-type.charset" => $enc ); | |
325 | $entity->bodyhandle($new_body); | |
326 | } | |
327 | } | |
328 | ||
84fb5b46 MKG |
329 | =head2 DecodeMIMEWordsToUTF8 $raw |
330 | ||
331 | An utility method which mimics MIME::Words::decode_mimewords, but only | |
3ffc5f4f MKG |
332 | limited functionality. Despite its name, this function returns the |
333 | bytes of the string, in UTF-8. | |
84fb5b46 MKG |
334 | |
335 | =cut | |
336 | ||
337 | sub DecodeMIMEWordsToUTF8 { | |
338 | my $str = shift; | |
339 | return DecodeMIMEWordsToEncoding($str, 'utf-8', @_); | |
340 | } | |
341 | ||
342 | sub DecodeMIMEWordsToEncoding { | |
343 | my $str = shift; | |
344 | my $to_charset = _CanonicalizeCharset(shift); | |
345 | my $field = shift || ''; | |
3ffc5f4f MKG |
346 | $RT::Logger->warning( |
347 | "DecodeMIMEWordsToEncoding was called without field name." | |
348 | ."It's known to cause troubles with decoding fields properly." | |
349 | ) unless $field; | |
350 | ||
351 | # XXX TODO: RT doesn't currently do the right thing with mime-encoded headers | |
352 | # We _should_ be preserving them encoded until after parsing is completed and | |
353 | # THEN undo the mime-encoding. | |
354 | # | |
355 | # This routine should be translating the existing mimeencoding to utf8 but leaving | |
356 | # things encoded. | |
357 | # | |
358 | # It's legal for headers to contain mime-encoded commas and semicolons which | |
359 | # should not be treated as address separators. (Encoding == quoting here) | |
360 | # | |
361 | # until this is fixed, we must escape any string containing a comma or semicolon | |
362 | # this is only a bandaid | |
363 | ||
364 | # Some _other_ MUAs encode quotes _already_, and double quotes | |
365 | # confuse us a lot, so only quote it if it isn't quoted | |
366 | # already. | |
84fb5b46 | 367 | |
b5747ff2 MKG |
368 | # handle filename*=ISO-8859-1''%74%E9%73%74%2E%74%78%74, parameter value |
369 | # continuations, and similar syntax from RFC 2231 | |
3ffc5f4f | 370 | if ($field =~ /^Content-/i) { |
b5747ff2 MKG |
371 | # This concatenates continued parameters and normalizes encoded params |
372 | # to QB encoded-words which we handle below | |
3ffc5f4f MKG |
373 | my $params = MIME::Field::ParamVal->parse_params($str); |
374 | foreach my $v ( values %$params ) { | |
375 | $v = _DecodeMIMEWordsToEncoding( $v, $to_charset ); | |
376 | # de-quote in case those were hidden inside encoded part | |
377 | $v =~ s/\\(.)/$1/g if $v =~ s/^"(.*)"$/$1/; | |
378 | } | |
379 | $str = bless({}, 'MIME::Field::ParamVal')->set($params)->stringify; | |
380 | } | |
381 | elsif ( $field =~ /^(?:Resent-)?(?:To|From|B?Cc|Sender|Reply-To)$/i ) { | |
382 | my @addresses = RT::EmailParser->ParseEmailAddress( $str ); | |
383 | foreach my $address ( @addresses ) { | |
384 | foreach my $field (qw(phrase comment)) { | |
385 | my $v = $address->$field() or next; | |
386 | $v = _DecodeMIMEWordsToEncoding( $v, $to_charset ); | |
387 | if ( $field eq 'phrase' ) { | |
388 | # de-quote in case quoted value were hidden inside encoded part | |
389 | $v =~ s/\\(.)/$1/g if $v =~ s/^"(.*)"$/$1/; | |
390 | } | |
391 | $address->$field($v); | |
392 | } | |
393 | } | |
394 | $str = join ', ', map $_->format, @addresses; | |
b5747ff2 | 395 | } |
3ffc5f4f MKG |
396 | else { |
397 | $str = _DecodeMIMEWordsToEncoding( $str, $to_charset ); | |
398 | } | |
399 | ||
400 | ||
401 | # We might have \n without trailing whitespace, which will result in | |
402 | # invalid headers. | |
403 | $str =~ s/\n//g; | |
404 | ||
405 | return ($str) | |
406 | } | |
407 | ||
408 | sub _DecodeMIMEWordsToEncoding { | |
409 | my $str = shift; | |
410 | my $to_charset = shift; | |
b5747ff2 | 411 | |
403d7b0b MKG |
412 | # Pre-parse by removing all whitespace between encoded words |
413 | my $encoded_word = qr/ | |
414 | =\? # =? | |
415 | ([^?]+?) # charset | |
416 | (?:\*[^?]+)? # optional '*language' | |
417 | \? # ? | |
418 | ([QqBb]) # encoding | |
419 | \? # ? | |
420 | ([^?]+) # encoded string | |
421 | \?= # ?= | |
422 | /x; | |
423 | $str =~ s/($encoded_word)\s+(?=$encoded_word)/$1/g; | |
424 | ||
425 | # Also merge quoted-printable sections together, in case multiple | |
426 | # octets of a single encoded character were split between chunks. | |
427 | # Though not valid according to RFC 2047, this has been seen in the | |
428 | # wild. | |
429 | 1 while $str =~ s/(=\?[^?]+\?[Qq]\?)([^?]+)\?=\1([^?]+)\?=/$1$2$3?=/i; | |
430 | ||
b5747ff2 MKG |
431 | # XXX TODO: use decode('MIME-Header', ...) and Encode::Alias to replace our |
432 | # custom MIME word decoding and charset canonicalization. We can't do this | |
433 | # until we parse before decode, instead of the other way around. | |
434 | my @list = $str =~ m/(.*?) # prefix | |
403d7b0b | 435 | $encoded_word |
b5747ff2 MKG |
436 | ([^=]*) # trailing |
437 | /xgcs; | |
3ffc5f4f MKG |
438 | return $str unless @list; |
439 | ||
440 | # add everything that hasn't matched to the end of the latest | |
441 | # string in array this happen when we have 'key="=?encoded?="; key="plain"' | |
442 | $list[-1] .= substr($str, pos $str); | |
443 | ||
444 | $str = ''; | |
445 | while (@list) { | |
446 | my ($prefix, $charset, $encoding, $enc_str, $trailing) = | |
447 | splice @list, 0, 5; | |
448 | $charset = _CanonicalizeCharset($charset); | |
449 | $encoding = lc $encoding; | |
450 | ||
451 | $trailing =~ s/\s?\t?$//; # Observed from Outlook Express | |
452 | ||
453 | if ( $encoding eq 'q' ) { | |
454 | use MIME::QuotedPrint; | |
455 | $enc_str =~ tr/_/ /; # Observed from Outlook Express | |
456 | $enc_str = decode_qp($enc_str); | |
457 | } elsif ( $encoding eq 'b' ) { | |
458 | use MIME::Base64; | |
459 | $enc_str = decode_base64($enc_str); | |
460 | } else { | |
461 | $RT::Logger->warning("Incorrect encoding '$encoding' in '$str', " | |
462 | ."only Q(uoted-printable) and B(ase64) are supported"); | |
463 | } | |
84fb5b46 | 464 | |
3ffc5f4f MKG |
465 | # now we have got a decoded subject, try to convert into the encoding |
466 | if ( $charset ne $to_charset || $charset =~ /^utf-?8(?:-strict)?$/i ) { | |
467 | if ( Encode::find_encoding($charset) ) { | |
468 | Encode::from_to( $enc_str, $charset, $to_charset ); | |
84fb5b46 | 469 | } else { |
3ffc5f4f MKG |
470 | $RT::Logger->warning("Charset '$charset' is not supported"); |
471 | $enc_str =~ s/[^[:print:]]/\357\277\275/g; | |
472 | Encode::from_to( $enc_str, 'UTF-8', $to_charset ) | |
473 | unless $to_charset eq 'utf-8'; | |
84fb5b46 | 474 | } |
84fb5b46 | 475 | } |
3ffc5f4f | 476 | $str .= $prefix . $enc_str . $trailing; |
84fb5b46 MKG |
477 | } |
478 | ||
84fb5b46 MKG |
479 | return ($str) |
480 | } | |
481 | ||
482 | ||
84fb5b46 MKG |
483 | =head2 _FindOrGuessCharset MIME::Entity, $head_only |
484 | ||
485 | When handed a MIME::Entity will first attempt to read what charset the message is encoded in. Failing that, will use Encode::Guess to try to figure it out | |
486 | ||
487 | If $head_only is true, only guesses charset for head parts. This is because header's encoding (e.g. filename="...") may be different from that of body's. | |
488 | ||
489 | =cut | |
490 | ||
491 | sub _FindOrGuessCharset { | |
492 | my $entity = shift; | |
493 | my $head_only = shift; | |
494 | my $head = $entity->head; | |
495 | ||
496 | if ( my $charset = $head->mime_attr("content-type.charset") ) { | |
497 | return _CanonicalizeCharset($charset); | |
498 | } | |
499 | ||
500 | if ( !$head_only and $head->mime_type =~ m{^text/} ) { | |
501 | my $body = $entity->bodyhandle or return; | |
502 | return _GuessCharset( $body->as_string ); | |
503 | } | |
504 | else { | |
505 | ||
506 | # potentially binary data -- don't guess the body | |
507 | return _GuessCharset( $head->as_string ); | |
508 | } | |
509 | } | |
510 | ||
511 | ||
512 | ||
513 | =head2 _GuessCharset STRING | |
514 | ||
515 | use Encode::Guess to try to figure it out the string's encoding. | |
516 | ||
517 | =cut | |
518 | ||
3ffc5f4f MKG |
519 | use constant HAS_ENCODE_GUESS => Encode::Guess->require; |
520 | use constant HAS_ENCODE_DETECT => Encode::Detect::Detector->require; | |
84fb5b46 MKG |
521 | |
522 | sub _GuessCharset { | |
523 | my $fallback = _CanonicalizeCharset('iso-8859-1'); | |
524 | ||
525 | # if $_[0] is null/empty, we don't guess its encoding | |
526 | return $fallback | |
527 | unless defined $_[0] && length $_[0]; | |
528 | ||
529 | my @encodings = RT->Config->Get('EmailInputEncodings'); | |
530 | unless ( @encodings ) { | |
531 | $RT::Logger->warning("No EmailInputEncodings set, fallback to $fallback"); | |
532 | return $fallback; | |
533 | } | |
534 | ||
535 | if ( $encodings[0] eq '*' ) { | |
536 | shift @encodings; | |
537 | if ( HAS_ENCODE_DETECT ) { | |
538 | my $charset = Encode::Detect::Detector::detect( $_[0] ); | |
539 | if ( $charset ) { | |
540 | $RT::Logger->debug("Encode::Detect::Detector guessed encoding: $charset"); | |
541 | return _CanonicalizeCharset( Encode::resolve_alias( $charset ) ); | |
542 | } | |
543 | else { | |
544 | $RT::Logger->debug("Encode::Detect::Detector failed to guess encoding"); | |
545 | } | |
546 | } | |
547 | else { | |
3ffc5f4f | 548 | $RT::Logger->error( |
84fb5b46 MKG |
549 | "You requested to guess encoding, but we couldn't" |
550 | ." load Encode::Detect::Detector module" | |
551 | ); | |
552 | } | |
553 | } | |
554 | ||
555 | unless ( @encodings ) { | |
556 | $RT::Logger->warning("No EmailInputEncodings set except '*', fallback to $fallback"); | |
557 | return $fallback; | |
558 | } | |
559 | ||
560 | unless ( HAS_ENCODE_GUESS ) { | |
561 | $RT::Logger->error("We couldn't load Encode::Guess module, fallback to $fallback"); | |
562 | return $fallback; | |
563 | } | |
564 | ||
565 | Encode::Guess->set_suspects( @encodings ); | |
566 | my $decoder = Encode::Guess->guess( $_[0] ); | |
567 | unless ( defined $decoder ) { | |
568 | $RT::Logger->warning("Encode::Guess failed: decoder is undefined; fallback to $fallback"); | |
569 | return $fallback; | |
570 | } | |
571 | ||
572 | if ( ref $decoder ) { | |
573 | my $charset = $decoder->name; | |
574 | $RT::Logger->debug("Encode::Guess guessed encoding: $charset"); | |
575 | return _CanonicalizeCharset( $charset ); | |
576 | } | |
577 | elsif ($decoder =~ /(\S+ or .+)/) { | |
578 | my %matched = map { $_ => 1 } split(/ or /, $1); | |
579 | return 'utf-8' if $matched{'utf8'}; # one and only normalization | |
580 | ||
581 | foreach my $suspect (RT->Config->Get('EmailInputEncodings')) { | |
582 | next unless $matched{$suspect}; | |
583 | $RT::Logger->debug("Encode::Guess ambiguous ($decoder); using $suspect"); | |
584 | return _CanonicalizeCharset( $suspect ); | |
585 | } | |
586 | } | |
587 | else { | |
588 | $RT::Logger->warning("Encode::Guess failed: $decoder; fallback to $fallback"); | |
589 | } | |
590 | ||
591 | return $fallback; | |
592 | } | |
593 | ||
594 | =head2 _CanonicalizeCharset NAME | |
595 | ||
596 | canonicalize charset, return lowercase version. | |
597 | special cases are: gb2312 => gbk, utf8 => utf-8 | |
598 | ||
599 | =cut | |
600 | ||
601 | sub _CanonicalizeCharset { | |
602 | my $charset = lc shift; | |
603 | return $charset unless $charset; | |
604 | ||
605 | # Canonicalize aliases if they're known | |
606 | if (my $canonical = Encode::resolve_alias($charset)) { | |
607 | $charset = $canonical; | |
608 | } | |
609 | ||
610 | if ( $charset eq 'utf8' || $charset eq 'utf-8-strict' ) { | |
611 | return 'utf-8'; | |
612 | } | |
613 | elsif ( $charset eq 'euc-cn' ) { | |
614 | # gbk is superset of gb2312/euc-cn so it's safe | |
615 | return 'gbk'; | |
3ffc5f4f MKG |
616 | } |
617 | elsif ( $charset =~ /^(?:(?:big5(-1984|-2003|ext|plus))|cccii|unisys|euc-tw|gb18030|(?:cns11643-\d+))$/ ) { | |
618 | unless ( Encode::HanExtra->require ) { | |
619 | RT->Logger->error("Please install Encode::HanExtra to handle $charset"); | |
620 | } | |
621 | return $charset; | |
84fb5b46 MKG |
622 | } |
623 | else { | |
624 | return $charset; | |
625 | } | |
626 | } | |
627 | ||
628 | ||
3ffc5f4f | 629 | =head2 SetMIMEHeadToEncoding MIMEHead => HEAD, From => OLD_ENCODING, To => NEW_Encoding, PreserveWords => BOOL, IsOut => BOOL |
84fb5b46 MKG |
630 | |
631 | Converts a MIME Head from one encoding to another. This totally violates the RFC. | |
632 | We should never need this. But, Surprise!, MUAs are badly broken and do this kind of stuff | |
633 | all the time | |
634 | ||
635 | ||
636 | =cut | |
637 | ||
638 | sub SetMIMEHeadToEncoding { | |
3ffc5f4f MKG |
639 | my ( $head, $charset, $enc, $preserve_words, $is_out ); |
640 | ||
641 | if ( @_ <= 4 ) { | |
642 | ( $head, $charset, $enc, $preserve_words ) = @_; | |
643 | } | |
644 | else { | |
645 | my %args = ( | |
646 | Head => undef, | |
647 | From => undef, | |
648 | To => undef, | |
649 | PreserveWords => undef, | |
650 | IsOut => undef, | |
651 | @_, | |
652 | ); | |
653 | ||
654 | $head = $args{Head}; | |
655 | $charset = $args{From}; | |
656 | $enc = $args{To}; | |
657 | $preserve_words = $args{PreserveWords}; | |
658 | $is_out = $args{IsOut}; | |
659 | } | |
660 | ||
661 | unless ( $head && $charset && $enc ) { | |
662 | RT->Logger->error( | |
663 | "Missing Head or From or To arguments"); | |
664 | return; | |
665 | } | |
84fb5b46 MKG |
666 | |
667 | $charset = _CanonicalizeCharset($charset); | |
668 | $enc = _CanonicalizeCharset($enc); | |
669 | ||
670 | return if $charset eq $enc and $preserve_words; | |
671 | ||
3ffc5f4f | 672 | RT::Util::assert_bytes( $head->as_string ); |
84fb5b46 MKG |
673 | foreach my $tag ( $head->tags ) { |
674 | next unless $tag; # seen in wild: headers with no name | |
675 | my @values = $head->get_all($tag); | |
676 | $head->delete($tag); | |
677 | foreach my $value (@values) { | |
dab09ea8 | 678 | if ( $charset ne $enc || $enc =~ /^utf-?8(?:-strict)?$/i ) { |
3ffc5f4f MKG |
679 | my $orig_value = $value; |
680 | ( my $success, $value ) = EncodeFromToWithCroak( $orig_value, $charset => $enc ); | |
681 | if ( !$success ) { | |
682 | my $error = $value; | |
683 | if ($is_out) { | |
684 | $value = $orig_value; | |
685 | $head->add( $tag, $value ); | |
686 | next; | |
687 | } | |
688 | ||
689 | my $guess = _GuessCharset($orig_value); | |
690 | if ( $guess && $guess ne $charset ) { | |
691 | $RT::Logger->error( "Encoding error: " . $error . " falling back to Guess($guess) => $enc" ); | |
692 | ( $success, $value ) = EncodeFromToWithCroak( $orig_value, $guess, $enc ); | |
693 | $error = $value unless $success; | |
694 | } | |
695 | ||
696 | if ( !$success ) { | |
697 | $RT::Logger->error( "Encoding error: " . $error . " forcing conversion to $charset => $enc" ); | |
698 | $value = $orig_value; | |
699 | Encode::from_to( $value, $charset => $enc ); | |
700 | } | |
701 | } | |
84fb5b46 | 702 | } |
3ffc5f4f | 703 | |
84fb5b46 MKG |
704 | $value = DecodeMIMEWordsToEncoding( $value, $enc, $tag ) |
705 | unless $preserve_words; | |
706 | ||
707 | # We intentionally add a leading space when re-adding the | |
708 | # header; Mail::Header strips it before storing, but it | |
709 | # serves to prevent it from "helpfully" canonicalizing | |
710 | # $head->add("Subject", "Subject: foo") into the same as | |
711 | # $head->add("Subject", "foo"); | |
712 | $head->add( $tag, " " . $value ); | |
713 | } | |
714 | } | |
715 | ||
716 | } | |
717 | ||
3ffc5f4f MKG |
718 | =head2 EncodeFromToWithCroak $string, $from, $to |
719 | ||
720 | Try to encode string from encoding $from to encoding $to in croak mode | |
721 | ||
722 | return (1, $encoded_string) if success, otherwise (0, $error) | |
723 | ||
724 | =cut | |
725 | ||
726 | sub EncodeFromToWithCroak { | |
727 | my $string = shift; | |
728 | my $from = shift; | |
729 | my $to = shift; | |
730 | ||
731 | eval { Encode::from_to( $string, $from => $to, Encode::FB_CROAK ); }; | |
732 | return $@ ? ( 0, $@ ) : ( 1, $string ); | |
733 | } | |
734 | ||
84fb5b46 MKG |
735 | RT::Base->_ImportOverlays(); |
736 | ||
737 | 1; # End of module. | |
738 |