Upgrade to 4.0.13
[usit-rt.git] / sbin / rt-fulltext-indexer
... / ...
CommitLineData
1#!/usr/bin/perl
2# BEGIN BPS TAGGED BLOCK {{{
3#
4# COPYRIGHT:
5#
6# This software is Copyright (c) 1996-2013 Best Practical Solutions, LLC
7# <sales@bestpractical.com>
8#
9# (Except where explicitly superseded by other copyright notices)
10#
11#
12# LICENSE:
13#
14# This work is made available to you under the terms of Version 2 of
15# the GNU General Public License. A copy of that license should have
16# been provided with this software, but in any event can be snarfed
17# from www.gnu.org.
18#
19# This work is distributed in the hope that it will be useful, but
20# WITHOUT ANY WARRANTY; without even the implied warranty of
21# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22# General Public License for more details.
23#
24# You should have received a copy of the GNU General Public License
25# along with this program; if not, write to the Free Software
26# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
27# 02110-1301 or visit their web page on the internet at
28# http://www.gnu.org/licenses/old-licenses/gpl-2.0.html.
29#
30#
31# CONTRIBUTION SUBMISSION POLICY:
32#
33# (The following paragraph is not intended to limit the rights granted
34# to you to modify and distribute this software under the terms of
35# the GNU General Public License and is only of importance to you if
36# you choose to contribute your changes and enhancements to the
37# community by submitting them to Best Practical Solutions, LLC.)
38#
39# By intentionally submitting any modifications, corrections or
40# derivatives to this work, or any other work intended for use with
41# Request Tracker, to Best Practical Solutions, LLC, you confirm that
42# you are the copyright holder for those contributions and you grant
43# Best Practical Solutions, LLC a nonexclusive, worldwide, irrevocable,
44# royalty-free, perpetual, license to use, copy, create derivative
45# works based on those contributions, and sublicense and distribute
46# those contributions and any derivatives thereof.
47#
48# END BPS TAGGED BLOCK }}}
49use strict;
50use warnings;
51no warnings 'once';
52
# Prepend RT's library directories to @INC.  Relative entries are anchored
# at the parent of the directory that holds this script.
BEGIN {
    require File::Spec;

    my $script_dir;    # resolved lazily, the first time a relative entry is seen
    my @lib_dirs = ( "lib", "local/lib" );

    for my $dir (@lib_dirs) {
        if ( !File::Spec->file_name_is_absolute($dir) ) {
            if ( !$script_dir ) {
                if ( File::Spec->file_name_is_absolute(__FILE__) ) {
                    # splitpath returns (volume, directories, file)
                    ( undef, $script_dir ) = File::Spec->splitpath(__FILE__);
                }
                else {
                    require FindBin;
                    no warnings "once";
                    $script_dir = $FindBin::Bin;
                }
            }
            $dir = File::Spec->catfile( $script_dir, File::Spec->updir, $dir );
        }

        # unshift in list order, so the last entry ends up first in @INC
        unshift @INC, $dir;
    }
}
76
# Bring RT up at compile time: load the RT module, then read the
# configuration and initialise the system before the rest of the script
# is compiled.
use RT;
BEGIN {
    RT::LoadConfig();
    RT::Init();
}
82use RT::Interface::CLI ();
83
use Getopt::Long qw(GetOptions);

# Options accepted regardless of the configured database backend.
my %OPT = (
    help  => 0,
    debug => 0,
);
my @OPT_LIST = qw(help|h! debug!);

# The backend decides which extra options exist.  Backends without an entry
# below only get the generic options above.
my $db_type = RT->Config->Get('DatabaseType');
{
    my %backend_options = (
        Pg => {
            defaults => { limit => 0, all => 0 },
            specs    => [ 'limit=i', 'all!' ],
        },
        mysql => {
            defaults => { limit => 0, all => 0, xmlpipe2 => 0 },
            specs    => [ 'limit=i', 'all!', 'xmlpipe2!' ],
        },
        Oracle => {
            defaults => { memory => '2M' },
            specs    => ['memory=s'],
        },
    );

    if ( my $extra = $backend_options{$db_type} ) {
        %OPT = ( %OPT, %{ $extra->{defaults} } );
        push @OPT_LIST, @{ $extra->{specs} };
    }
}

# GetOptions returns false when it met an unknown or malformed option (it
# has already printed the details to STDERR).  Stop here instead of
# silently indexing with default settings.
unless ( GetOptions( \%OPT, @OPT_LIST ) ) {
    print STDERR "Invalid command-line options; run with --help for usage.\n";
    exit 1;
}
118
# --help: show the generic POD sections plus the one named after the
# backend (upper-cased database type).
RT::Interface::CLI->ShowHelp(
    Sections => 'NAME|DESCRIPTION|'. uc($db_type),
) if $OPT{'help'};

# Refuse to run unless full text search is both enabled and backed by a
# database index.
my $fts_config = RT->Config->Get('FullTextSearch') || {};
if ( !$fts_config->{'Enable'} ) {
    print STDERR <<EOT;

Full text search is disabled in your RT configuration. Run
/www/var/rt/sbin/rt-setup-fulltext-index to configure and enable it.

EOT
    exit 1;
}
if ( !$fts_config->{'Indexed'} ) {
    print STDERR <<EOT;

Full text search is enabled in your RT configuration, but not with any
full-text database indexing -- hence this tool is not required. Read
the documentation for %FullTextSearch in your RT_Config for more details.

EOT
    exit 1;
}

# Oracle: the database maintains the index itself; just ask it to sync and
# finish.
if ( $db_type eq 'Oracle' ) {
    my $index_name = $fts_config->{'IndexName'} || 'rt_fts_index';
    my @bind       = ( $index_name, $OPT{'memory'} );
    $RT::Handle->dbh->do(
        "begin ctx_ddl.sync_index(?, ?); end;", undef,
        @bind
    );
    exit;
}

# mysql: this script only produces xmlpipe2 output; without --xmlpipe2 it
# points the user at the Sphinx `indexer` tool and stops.
if ( $db_type eq 'mysql' && !$OPT{'xmlpipe2'} ) {
    print STDERR <<EOT;

Updates to the external Sphinx index are done via running the sphinx
`indexer` tool:

 indexer rt

EOT
    exit 1;
}
166
# Index every supported content type, one batch of attachments at a time.
#
# Each batch holds at most --limit rows (100 when no limit is given) with
# ids above a cursor.  The cursor starts at last_indexed($type) and is then
# moved to the id of every attachment looked at, whether it was indexed,
# filtered out, or had no extractable text.  With --all, further batches
# are fetched until one comes back short.
#
# Advancing the cursor in-process guarantees termination: restarting every
# pass from last_indexed() would fetch the very same batch forever whenever
# a full batch produced nothing that moves the stored marker.
my @types = qw(text html);
foreach my $type (@types) {
    my $batch_size = $OPT{'limit'} || 100;
    my $cursor     = last_indexed($type);

    BATCH: while (1) {
        my $attachments = attachments($type);
        $attachments->Limit(
            FIELD    => 'id',
            OPERATOR => '>',
            VALUE    => $cursor,
        );
        $attachments->OrderBy( FIELD => 'id', ORDER => 'asc' );
        $attachments->RowsPerPage($batch_size);

        my $found = 0;
        ATTACHMENT: while ( my $attachment = $attachments->Next ) {
            # Rows arrive in ascending id order, so this only moves forward.
            $cursor = $attachment->id;

            next ATTACHMENT if filter( $type, $attachment );
            debug( "Found attachment #" . $attachment->id );
            my $txt = extract( $type, $attachment ) or next ATTACHMENT;
            $found++;
            process( $type, $attachment, $txt );
            debug( "Processed attachment #" . $attachment->id );
        }
        finalize( $type, $attachments ) if $found;
        clean($type);

        # A short batch means there is nothing left for this type.
        last BATCH
            unless $OPT{'all'} && $attachments->Count == $batch_size;
    }
}
192
# Build the base RT::Attachments collection for one content type.
#
# The collection is restricted to attachments whose transaction belongs to
# an RT::Ticket and whose ticket does not have Status 'deleted'.  The
# type-specific restriction is then applied by attachments_<type>, which
# receives the collection as its only argument.  Croaks when no such sub
# exists.
sub attachments {
    my ($type) = @_;

    my $collection = RT::Attachments->new( RT->SystemUser );

    # Attachments -> Transactions, ticket transactions only
    my $txns = $collection->Join(
        ALIAS1 => 'main',
        FIELD1 => 'TransactionId',
        TABLE2 => 'Transactions',
        FIELD2 => 'id',
    );
    $collection->Limit(
        ALIAS => $txns,
        FIELD => 'ObjectType',
        VALUE => 'RT::Ticket',
    );

    # Transactions -> Tickets, leaving out deleted tickets
    my $tickets = $collection->Join(
        ALIAS1 => $txns,
        FIELD1 => 'ObjectId',
        TABLE2 => 'Tickets',
        FIELD2 => 'id',
    );
    $collection->Limit(
        ALIAS    => $tickets,
        FIELD    => 'Status',
        OPERATOR => '!=',
        VALUE    => 'deleted',
    );

    # Newer DBIx::SearchBuilder releases honour this flag and skip the
    # DISTINCT, which the joins above never need; this is a large speed-up
    # when fetching attachments.
    $collection->{joins_are_distinct} = 1;

    # Must be called from this sub directly: goto_specific derives the
    # target name from its caller.
    return goto_specific(
        suffix    => $type,
        error     => "Don't know how to find $type attachments",
        arguments => [$collection],
    );
}
231
# Return the id of the last indexed attachment of $type by delegating to
# last_indexed_<db type>, which receives the same arguments.  Croaks when
# the configured database has no such sub.
sub last_indexed {
    my $type = $_[0];

    my %dispatch = (
        suffix    => $db_type,
        error     => "Don't know how to find last indexed $type attachment for $db_type DB",
        arguments => \@_,
    );
    return goto_specific(%dispatch);
}
240
# Ask filter_<type> whether an attachment should be skipped.  The type is
# stripped from the argument list before delegating.  Returns undef when
# no filter exists for the type.
sub filter {
    my ( $type, @rest ) = @_;

    return goto_specific(
        suffix    => $type,
        arguments => \@rest,
    );
}
248
# Turn an attachment into indexable text via extract_<type>.  The type is
# stripped from the argument list before delegating.  Croaks when the type
# has no extractor.
sub extract {
    my ( $type, @rest ) = @_;

    return goto_specific(
        suffix    => $type,
        error     => "No way to convert $type attachment into text",
        arguments => \@rest,
    );
}
257
# Hand one extracted attachment to process_<db type>, passing all
# arguments through unchanged.  Croaks when the configured database has no
# processor.
sub process {
    my @passed = @_;

    return goto_specific(
        suffix    => $db_type,
        error     => "No processer for $db_type DB",
        arguments => \@passed,
    );
}
265
# Optional per-batch hook: runs finalize_<db type> with the arguments
# passed through unchanged.  Returns undef when the backend defines none.
sub finalize {
    my @passed = @_;

    return goto_specific(
        suffix    => $db_type,
        arguments => \@passed,
    );
}
272
# Optional per-batch cleanup hook: runs clean_<db type> with the arguments
# passed through unchanged.  Returns undef when the backend defines none.
sub clean {
    my @passed = @_;

    return goto_specific(
        suffix    => $db_type,
        arguments => \@passed,
    );
}
279
# mysql backend: documents are collected into a Sphinx xmlpipe2 docset and
# written to STDOUT once per batch.
{
    # The docset under construction; created on demand by sphinx_template
    # and dropped again by clean_mysql.
    my $sphinx_doc;

    # Return the stored "last indexed" marker for $type, or 0 when there
    # is none.
    #
    # NOTE(review): this looks $type up directly in the hash of the object
    # returned by FirstAttribute, not in the attribute's stored content --
    # confirm that this is what was intended.
    sub last_indexed_mysql {
        my ($type) = @_;

        my $attr = $RT::System->FirstAttribute('LastIndexedAttachments');
        return 0 if !$attr || !exists $attr->{$type};
        return $attr->{$type} || 0;
    }

    # Append one <sphinx:document> carrying the attachment id and its text
    # (passed as a scalar reference) to the current docset.
    sub process_mysql {
        my ( undef, $attachment, $text ) = @_;

        my $docset = sphinx_template();

        my $entry = $docset->createElement('sphinx:document');
        $entry->setAttribute( id => $attachment->id );
        $entry->appendTextChild( content => $$text );

        $docset->documentElement->appendChild($entry);
    }

    # Return the current docset, creating an empty one (root element plus a
    # schema declaring the single "content" field) on first use.
    sub sphinx_template {
        return $sphinx_doc if $sphinx_doc;

        require XML::LibXML;
        $sphinx_doc = XML::LibXML::Document->new( '1.0', 'UTF-8' );

        my $docset_root = $sphinx_doc->createElement('sphinx:docset');
        $sphinx_doc->setDocumentElement($docset_root);

        my $schema = $sphinx_doc->createElement('sphinx:schema');
        $docset_root->appendChild($schema);
        for my $field_name ('content') {
            my $field = $sphinx_doc->createElement('sphinx:field');
            $field->setAttribute( name => $field_name );
            $schema->appendChild($field);
        }

        return $sphinx_doc;
    }

    # Write the docset to STDOUT.  The arguments are accepted for symmetry
    # with the other finalize hooks but are not used.
    sub finalize_mysql {
        sphinx_template()->toFH( *STDOUT, 1 );
    }

    # Forget the current docset so the next batch starts a fresh one.
    sub clean_mysql {
        $sphinx_doc = undef;
    }
}
331
# Pg backend: return the highest id among attachments of $type whose
# full-text column is not NULL, or 0 when there is none.
sub last_indexed_pg {
    my ($type) = @_;

    my $attachments = attachments($type);

    # The column lives on Attachments itself ('main') unless a separate
    # table is configured, in which case that table is left-joined on id.
    my $fts_table    = $fts_config->{'Table'};
    my $column_alias = 'main';
    if ( $fts_table && $fts_table ne 'Attachments' ) {
        $column_alias = $attachments->Join(
            TYPE   => 'left',
            FIELD1 => 'id',
            TABLE2 => $fts_table,
            FIELD2 => 'id',
        );
    }

    $attachments->Limit(
        ALIAS    => $column_alias,
        FIELD    => $fts_config->{'Column'},
        OPERATOR => 'IS NOT',
        VALUE    => 'NULL',
    );

    # Newest first, one row is all we need.
    $attachments->OrderBy( FIELD => 'id', ORDER => 'desc' );
    $attachments->RowsPerPage(1);

    my $newest = $attachments->First;
    return $newest ? $newest->id : 0;
}
356
# Pg backend: store the tsvector of one attachment's text.
#
# Arguments: the content type (unused), the attachment object and a scalar
# reference to its text.  With a configured table the row is updated if it
# already exists and inserted otherwise; without one the column on
# Attachments is updated.  When Pg rejects the text for having too many
# unique words or invalid UTF-8, a warning is issued and an empty tsvector
# is stored instead; any other failure is fatal.
sub process_pg {
    my ( undef, $attachment, $text ) = @_;

    my $dbh = $RT::Handle->dbh;
    my ( $table, $column ) = @{$fts_config}{ 'Table', 'Column' };

    my $query;
    if ( !$table ) {
        $query = "UPDATE Attachments SET $column = to_tsvector(?) WHERE id = ?";
    }
    elsif ( my ($existing) = $dbh->selectrow_array("SELECT id FROM $table WHERE id = ?", undef, $attachment->id) ) {
        $query = "UPDATE $table SET $column = to_tsvector(?) WHERE id = ?";
    }
    else {
        $query = "INSERT INTO $table($column, id) VALUES(to_tsvector(?), ?)";
    }

    my $indexed = eval { $dbh->do( $query, undef, $$text, $attachment->id ) };
    unless ( $indexed ) {
        # err 7 with these SQLSTATEs marks text that can never be indexed;
        # everything else is treated as a real failure.
        if ( $dbh->err == 7 && $dbh->state eq '54000' ) {
            warn "Attachment @{[$attachment->id]} cannot be indexed, as it contains too many unique words";
        }
        elsif ( $dbh->err == 7 && $dbh->state eq '22021' ) {
            warn "Attachment @{[$attachment->id]} cannot be indexed, as it contains invalid UTF8 bytes";
        }
        else {
            die "error: ". $dbh->errstr;
        }

        # Store an empty tsvector so the row still counts as "indexed"
        # when working out where the next run should pick up.
        eval { $dbh->do( $query, undef, "", $attachment->id ) }
            or die "Failed to insert empty tsvector: " . $dbh->errstr;
    }
}
391
# Restrict the given collection to text/plain attachments and hand it back.
sub attachments_text {
    my ($collection) = @_;

    my @plain_only = ( FIELD => 'ContentType', VALUE => 'text/plain' );
    $collection->Limit(@plain_only);

    return $collection;
}
397
# Return a reference to the attachment's content, or undef when the
# content is undefined or empty.
sub extract_text {
    my ($attachment) = @_;

    my $content = $attachment->Content;
    if ( defined $content && length $content ) {
        return \$content;
    }
    return undef;
}
404
# Restrict the given collection to text/html attachments and hand it back.
sub attachments_html {
    my ($collection) = @_;

    my @html_only = ( FIELD => 'ContentType', VALUE => 'text/html' );
    $collection->Limit(@html_only);

    return $collection;
}
410
# Decide whether an HTML attachment should be skipped: returns 1 when its
# parent has an id and the parent's ContentType equals the literal below,
# 0 otherwise.  The stated intent is to skip html parts that are
# alternatives.
#
# NOTE(review): the literal is spelled 'mulitpart/alternative', so it can
# never equal the MIME type 'multipart/alternative' and this filter
# effectively never skips anything.  Correcting only the spelling is not
# enough: the main loop in this script restarts each run from the last
# indexed id and skipped rows are not marked as indexed, so a run of
# skipped alternatives could hold up later HTML attachments.  Confirm the
# intended behaviour before changing the string.
sub filter_html {
    my ($attachment) = @_;

    my $parent = $attachment->ParentObj;
    if ($parent) {
        return 1
            if $parent->id
            && $parent->ContentType eq 'mulitpart/alternative';
    }
    return 0;
}
420
# Return a reference to the attachment's raw content, or undef when the
# content is undefined or empty.
# TODO: html -> text
sub extract_html {
    my ($attachment) = @_;

    my $markup = $attachment->Content;
    if ( defined $markup && length $markup ) {
        return \$markup;
    }
    return undef;
}
428
# Tail-call the specialised variant of the calling sub.
#
# The target is named "<caller>_<lower-cased suffix>", where <caller> is
# the unqualified name of the sub that invoked goto_specific.  Named
# arguments:
#   suffix    - selects the variant (a content type or database type)
#   arguments - array reference; becomes the target's @_
#   error     - optional; croak with this message when the target does not
#               exist.  Without it, a missing target yields undef.
sub goto_specific {
    my %args = @_;

    # Name of our caller, with the package qualifier stripped.
    ( my $caller_name = ( caller(1) )[3] ) =~ s/.*:://;
    my $target = join '_', $caller_name, lc( $args{'suffix'} );

    if ( defined &$target ) {
        # Replace our own frame, so the target runs as if the original
        # sub had called it.
        @_ = @{ $args{'arguments'} };
        goto &$target;
    }

    return undef unless $args{'error'};
    require Carp;
    Carp::croak( $args{'error'} );
}
442
443
# helper functions

# Print the arguments followed by a newline when --debug is on.  Always
# returns 1.
sub debug {
    if ( $OPT{debug} ) {
        print @_, "\n";
    }
    1;
}
# Log the concatenated arguments at error level through RT's logger.
# Always returns 1.  The text is handed to the logger as is: no sub named
# "_" is defined or imported in this script, so routing it through _()
# would die on the first call.
sub error { $RT::Logger->error( join '', @_ ); 1 }
# Log the concatenated arguments at warn level through RT's logger.
# Always returns 1.  The text is handed to the logger as is: no sub named
# "_" is defined or imported in this script, so routing it through _()
# would die on the first call.
sub warning { $RT::Logger->warn( join '', @_ ); 1 }
448
449=head1 NAME
450
451rt-fulltext-indexer - Indexer for full text search
452
453=head1 DESCRIPTION
454
455This is a helper script to keep full text indexes in sync with data.
456Read F<docs/full_text_indexing.pod> for complete details on how and when
457to run it.
458
459=head1 AUTHOR
460
461Ruslan Zakirov E<lt>ruz@bestpractical.comE<gt>,
462Alex Vandiver E<lt>alexmv@bestpractical.comE<gt>
463
464=cut
465