Merge branch 'master' of git.uio.no:usit-rt
[usit-rt.git] / sbin / rt-fulltext-indexer
CommitLineData
84fb5b46
MKG
1#!/usr/bin/perl
2# BEGIN BPS TAGGED BLOCK {{{
3#
4# COPYRIGHT:
5#
320f0092 6# This software is Copyright (c) 1996-2014 Best Practical Solutions, LLC
84fb5b46
MKG
7# <sales@bestpractical.com>
8#
9# (Except where explicitly superseded by other copyright notices)
10#
11#
12# LICENSE:
13#
14# This work is made available to you under the terms of Version 2 of
15# the GNU General Public License. A copy of that license should have
16# been provided with this software, but in any event can be snarfed
17# from www.gnu.org.
18#
19# This work is distributed in the hope that it will be useful, but
20# WITHOUT ANY WARRANTY; without even the implied warranty of
21# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22# General Public License for more details.
23#
24# You should have received a copy of the GNU General Public License
25# along with this program; if not, write to the Free Software
26# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
27# 02110-1301 or visit their web page on the internet at
28# http://www.gnu.org/licenses/old-licenses/gpl-2.0.html.
29#
30#
31# CONTRIBUTION SUBMISSION POLICY:
32#
33# (The following paragraph is not intended to limit the rights granted
34# to you to modify and distribute this software under the terms of
35# the GNU General Public License and is only of importance to you if
36# you choose to contribute your changes and enhancements to the
37# community by submitting them to Best Practical Solutions, LLC.)
38#
39# By intentionally submitting any modifications, corrections or
40# derivatives to this work, or any other work intended for use with
41# Request Tracker, to Best Practical Solutions, LLC, you confirm that
42# you are the copyright holder for those contributions and you grant
43# Best Practical Solutions, LLC a nonexclusive, worldwide, irrevocable,
44# royalty-free, perpetual, license to use, copy, create derivative
45# works based on those contributions, and sublicense and distribute
46# those contributions and any derivatives thereof.
47#
48# END BPS TAGGED BLOCK }}}
49use strict;
50use warnings;
51no warnings 'once';
52
53# fix lib paths, some may be relative
af59614d 54BEGIN { # BEGIN RT CMD BOILERPLATE
84fb5b46 55 require File::Spec;
af59614d 56 require Cwd;
84fb5b46
MKG
57 my @libs = ("lib", "local/lib");
58 my $bin_path;
59
60 for my $lib (@libs) {
61 unless ( File::Spec->file_name_is_absolute($lib) ) {
af59614d 62 $bin_path ||= ( File::Spec->splitpath(Cwd::abs_path(__FILE__)) )[1];
84fb5b46
MKG
63 $lib = File::Spec->catfile( $bin_path, File::Spec->updir, $lib );
64 }
65 unshift @INC, $lib;
66 }
af59614d 67
84fb5b46
MKG
68}
69
70BEGIN {
71 use RT;
72 RT::LoadConfig();
73 RT::Init();
74};
75use RT::Interface::CLI ();
76
77my %OPT = (
78 help => 0,
79 debug => 0,
c33a4027 80 quiet => 0,
84fb5b46 81);
c33a4027 82my @OPT_LIST = qw(help|h! debug! quiet);
84fb5b46
MKG
83
84my $db_type = RT->Config->Get('DatabaseType');
85if ( $db_type eq 'Pg' ) {
86 %OPT = (
87 %OPT,
88 limit => 0,
89 all => 0,
90 );
91 push @OPT_LIST, 'limit=i', 'all!';
92}
93elsif ( $db_type eq 'mysql' ) {
94 %OPT = (
95 %OPT,
96 limit => 0,
97 all => 0,
98 xmlpipe2 => 0,
99 );
100 push @OPT_LIST, 'limit=i', 'all!', 'xmlpipe2!';
101}
102elsif ( $db_type eq 'Oracle' ) {
103 %OPT = (
104 %OPT,
105 memory => '2M',
106 );
107 push @OPT_LIST, qw(memory=s);
108}
109
110use Getopt::Long qw(GetOptions);
111GetOptions( \%OPT, @OPT_LIST );
112
113if ( $OPT{'help'} ) {
114 RT::Interface::CLI->ShowHelp(
115 Sections => 'NAME|DESCRIPTION|'. uc($db_type),
116 );
117}
118
c33a4027
MKG
119use Fcntl ':flock';
120if ( !flock main::DATA, LOCK_EX | LOCK_NB ) {
121 if ( $OPT{quiet} ) {
122 RT::Logger->info("$0 is already running; aborting silently, as requested");
123 exit;
124 }
125 else {
126 print STDERR "$0 is already running\n";
127 exit 1;
128 }
129}
130
84fb5b46
MKG
131my $fts_config = RT->Config->Get('FullTextSearch') || {};
132unless ( $fts_config->{'Enable'} ) {
133 print STDERR <<EOT;
134
135Full text search is disabled in your RT configuration. Run
b5747ff2 136/www/var/rt/sbin/rt-setup-fulltext-index to configure and enable it.
84fb5b46
MKG
137
138EOT
139 exit 1;
140}
141unless ( $fts_config->{'Indexed'} ) {
142 print STDERR <<EOT;
143
144Full text search is enabled in your RT configuration, but not with any
145full-text database indexing -- hence this tool is not required. Read
146the documentation for %FullTextSearch in your RT_Config for more details.
147
148EOT
149 exit 1;
150}
151
152if ( $db_type eq 'Oracle' ) {
153 my $index = $fts_config->{'IndexName'} || 'rt_fts_index';
154 $RT::Handle->dbh->do(
155 "begin ctx_ddl.sync_index(?, ?); end;", undef,
156 $index, $OPT{'memory'}
157 );
158 exit;
159} elsif ( $db_type eq 'mysql' ) {
160 unless ($OPT{'xmlpipe2'}) {
161 print STDERR <<EOT;
162
163Updates to the external Sphinx index are done via running the sphinx
164`indexer` tool:
165
166 indexer rt
167
168EOT
169 exit 1;
170 }
171}
172
173my @types = qw(text html);
174foreach my $type ( @types ) {
175 REDO:
176 my $attachments = attachments($type);
177 $attachments->Limit(
178 FIELD => 'id',
179 OPERATOR => '>',
180 VALUE => last_indexed($type)
181 );
182 $attachments->OrderBy( FIELD => 'id', ORDER => 'asc' );
183 $attachments->RowsPerPage( $OPT{'limit'} || 100 );
184
185 my $found = 0;
186 while ( my $a = $attachments->Next ) {
187 next if filter( $type, $a );
188 debug("Found attachment #". $a->id );
189 my $txt = extract($type, $a) or next;
190 $found++;
191 process( $type, $a, $txt );
192 debug("Processed attachment #". $a->id );
193 }
194 finalize( $type, $attachments ) if $found;
195 clean( $type );
196 goto REDO if $OPT{'all'} and $attachments->Count == ($OPT{'limit'} || 100)
197}
198
199sub attachments {
200 my $type = shift;
201 my $res = RT::Attachments->new( RT->SystemUser );
202 my $txn_alias = $res->Join(
203 ALIAS1 => 'main',
204 FIELD1 => 'TransactionId',
205 TABLE2 => 'Transactions',
206 FIELD2 => 'id',
207 );
208 $res->Limit(
209 ALIAS => $txn_alias,
210 FIELD => 'ObjectType',
211 VALUE => 'RT::Ticket',
212 );
213 my $ticket_alias = $res->Join(
214 ALIAS1 => $txn_alias,
215 FIELD1 => 'ObjectId',
216 TABLE2 => 'Tickets',
217 FIELD2 => 'id',
218 );
219 $res->Limit(
220 ALIAS => $ticket_alias,
221 FIELD => 'Status',
222 OPERATOR => '!=',
223 VALUE => 'deleted'
224 );
225
226 return goto_specific(
227 suffix => $type,
228 error => "Don't know how to find $type attachments",
229 arguments => [$res],
230 );
231}
232
233sub last_indexed {
234 my ($type) = (@_);
235 return goto_specific(
236 suffix => $db_type,
237 error => "Don't know how to find last indexed $type attachment for $db_type DB",
238 arguments => \@_,
239 );
240}
241
242sub filter {
243 my $type = shift;
244 return goto_specific(
245 suffix => $type,
246 arguments => \@_,
247 );
248}
249
250sub extract {
251 my $type = shift;
252 return goto_specific(
253 suffix => $type,
254 error => "No way to convert $type attachment into text",
255 arguments => \@_,
256 );
257}
258
259sub process {
260 return goto_specific(
261 suffix => $db_type,
262 error => "No processer for $db_type DB",
263 arguments => \@_,
264 );
265}
266
267sub finalize {
268 return goto_specific(
269 suffix => $db_type,
270 arguments => \@_,
271 );
272}
273
274sub clean {
275 return goto_specific(
276 suffix => $db_type,
277 arguments => \@_,
278 );
279}
280
281{
282sub last_indexed_mysql {
283 my $type = shift;
284 my $attr = $RT::System->FirstAttribute('LastIndexedAttachments');
285 return 0 unless $attr;
286 return 0 unless exists $attr->{ $type };
287 return $attr->{ $type } || 0;
288}
289
290sub process_mysql {
291 my ($type, $attachment, $text) = (@_);
292
293 my $doc = sphinx_template();
294
295 my $element = $doc->createElement('sphinx:document');
296 $element->setAttribute( id => $attachment->id );
297 $element->appendTextChild( content => $$text );
298
299 $doc->documentElement->appendChild( $element );
300}
301
302my $doc = undef;
303sub sphinx_template {
304 return $doc if $doc;
305
306 require XML::LibXML;
307 $doc = XML::LibXML::Document->new('1.0', 'UTF-8');
308 my $root = $doc->createElement('sphinx:docset');
309 $doc->setDocumentElement( $root );
310
311 my $schema = $doc->createElement('sphinx:schema');
312 $root->appendChild( $schema );
313 foreach ( qw(content) ) {
314 my $field = $doc->createElement('sphinx:field');
315 $field->setAttribute( name => $_ );
316 $schema->appendChild( $field );
317 }
318
319 return $doc;
320}
321
322sub finalize_mysql {
323 my ($type, $attachments) = @_;
324 sphinx_template()->toFH(*STDOUT, 1);
325}
326
327sub clean_mysql {
328 $doc = undef;
329}
330
331}
332
333sub last_indexed_pg {
334 my $type = shift;
335 my $attachments = attachments( $type );
336 my $alias = 'main';
337 if ( $fts_config->{'Table'} && $fts_config->{'Table'} ne 'Attachments' ) {
338 $alias = $attachments->Join(
339 TYPE => 'left',
340 FIELD1 => 'id',
341 TABLE2 => $fts_config->{'Table'},
342 FIELD2 => 'id',
343 );
344 }
345 $attachments->Limit(
346 ALIAS => $alias,
347 FIELD => $fts_config->{'Column'},
348 OPERATOR => 'IS NOT',
349 VALUE => 'NULL',
350 );
351 $attachments->OrderBy( FIELD => 'id', ORDER => 'desc' );
352 $attachments->RowsPerPage( 1 );
353 my $res = $attachments->First;
354 return 0 unless $res;
355 return $res->id;
356}
357
358sub process_pg {
359 my ($type, $attachment, $text) = (@_);
360
361 my $dbh = $RT::Handle->dbh;
362 my $table = $fts_config->{'Table'};
363 my $column = $fts_config->{'Column'};
364
365 my $query;
366 if ( $table ) {
367 if ( my ($id) = $dbh->selectrow_array("SELECT id FROM $table WHERE id = ?", undef, $attachment->id) ) {
368 $query = "UPDATE $table SET $column = to_tsvector(?) WHERE id = ?";
369 } else {
370 $query = "INSERT INTO $table($column, id) VALUES(to_tsvector(?), ?)";
371 }
372 } else {
373 $query = "UPDATE Attachments SET $column = to_tsvector(?) WHERE id = ?";
374 }
375
376 my $status = eval { $dbh->do( $query, undef, $$text, $attachment->id ) };
377 unless ( $status ) {
5b0d0914 378 if ( $dbh->err == 7 && $dbh->state eq '54000' ) {
01e3b242 379 warn "Attachment @{[$attachment->id]} cannot be indexed. Most probably it contains too many unique words. Error: ". $dbh->errstr;
5b0d0914 380 } elsif ( $dbh->err == 7 && $dbh->state eq '22021' ) {
01e3b242 381 warn "Attachment @{[$attachment->id]} cannot be indexed. Most probably it contains invalid UTF8 bytes. Error: ". $dbh->errstr;
84fb5b46
MKG
382 } else {
383 die "error: ". $dbh->errstr;
384 }
403d7b0b
MKG
385
386 # Insert an empty tsvector, so we count this row as "indexed"
387 # for purposes of knowing where to pick up
388 eval { $dbh->do( $query, undef, "", $attachment->id ) }
389 or die "Failed to insert empty tsvector: " . $dbh->errstr;
84fb5b46
MKG
390 }
391}
392
393sub attachments_text {
394 my $res = shift;
395 $res->Limit( FIELD => 'ContentType', VALUE => 'text/plain' );
396 return $res;
397}
398
399sub extract_text {
400 my $attachment = shift;
401 my $text = $attachment->Content;
402 return undef unless defined $text && length($text);
403 return \$text;
404}
405
406sub attachments_html {
407 my $res = shift;
408 $res->Limit( FIELD => 'ContentType', VALUE => 'text/html' );
409 return $res;
410}
411
412sub filter_html {
413 my $attachment = shift;
414 if ( my $parent = $attachment->ParentObj ) {
415# skip html parts that are alternatives
416 return 1 if $parent->id
417 && $parent->ContentType eq 'mulitpart/alternative';
418 }
419 return 0;
420}
421
422sub extract_html {
423 my $attachment = shift;
424 my $text = $attachment->Content;
425 return undef unless defined $text && length($text);
426# TODO: html -> text
427 return \$text;
428}
429
430sub goto_specific {
431 my %args = (@_);
432
433 my $func = (caller(1))[3];
434 $func =~ s/.*:://;
435 my $call = $func ."_". lc $args{'suffix'};
436 unless ( defined &$call ) {
437 return undef unless $args{'error'};
438 require Carp; Carp::croak( $args{'error'} );
439 }
440 @_ = @{ $args{'arguments'} };
441 goto &$call;
442}
443
444
445# helper functions
446sub debug { print @_, "\n" if $OPT{debug}; 1 }
447sub error { $RT::Logger->error(_(@_)); 1 }
448sub warning { $RT::Logger->warn(_(@_)); 1 }
449
450=head1 NAME
451
452rt-fulltext-indexer - Indexer for full text search
453
454=head1 DESCRIPTION
455
456This is a helper script to keep full text indexes in sync with data.
457Read F<docs/full_text_indexing.pod> for complete details on how and when
458to run it.
459
460=head1 AUTHOR
461
462Ruslan Zakirov E<lt>ruz@bestpractical.comE<gt>,
463Alex Vandiver E<lt>alexmv@bestpractical.comE<gt>
464
465=cut
466
c33a4027 467__DATA__