#!/usr/bin/env python # # Authors: # rafael@postgresql.org.es / http://www.postgresql.org.es/ # # Copyright (c) 2016 USIT-University of Oslo # # zabbix_elasticsearch_node_stats.py: Used by zabbix_agent to pull # elasticsearch cluster health information from an ES cluster and send # this information to Zabbix via trappers. # # zabbix_elasticsearch_node_stats.py is free software: you can # redistribute it and/or modify it under the terms of the GNU General # Public License as published by the Free Software Foundation, either # version 3 of the License, or (at your option) any later version. # # zabbix_elasticsearch_node_stats.py is distributed in the hope # that it will be useful, but WITHOUT ANY WARRANTY; without even the # implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR # PURPOSE. See the GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with sms_send. If not, see . # # # This script gets ES node stats information from the ES rest API, # extracts the parameters we have defined in stats_keys{} and sends # data back to zabbix via zabbix_sender to defined trap-items. # # The script is executed via zabbix_agent and is defined in an # UserParameter that will return 0 (execution OK) or 1 (execution # ERROR) so zabbix can register if it cannot get data from ES. # # UserParameter=get.es.node.stats[*],/usr/bin/zabbix_elasticsearch_node_stats.py $1 # import requests import json import sys import os import time import tempfile # Elasticsearch clients elasticsearch_clients = ['es-client-in.uio.no','es-client-out.uio.no'] # Zabbix proxy zabbix_proxy = ['zabbix-proxy-prod03.uio.no','zabbix-proxy-prod04.uio.no'] # Path to zabbix_sender zabbix_sender = '/usr/bin/zabbix_sender' # Temp file with full json output tmp_stat_file = tempfile.NamedTemporaryFile(delete=False,dir='/tmp') # Item prefix item_prefix = 'es.node.' # keys for health page stats_keys = { "indices.docs.count", "indices.docs.deleted", "indices.store.size_in_bytes", "indices.store.throttle_time_in_millis", "indices.indexing.index_total", "indices.indexing.index_time_in_millis", "indices.indexing.index_current", "indices.indexing.index_failed", "indices.indexing.delete_total", "indices.indexing.delete_time_in_millis", "indices.indexing.delete_current", "indices.get.total", "indices.get.time_in_millis", "indices.get.exists_total", "indices.get.exists_time_in_millis", "indices.get.missing_total", "indices.get.missing_time_in_millis", "indices.get.current", "indices.search.open_contexts", "indices.search.query_total", "indices.search.query_time_in_millis", "indices.search.query_current", "indices.search.fetch_total", "indices.search.fetch_time_in_millis", "indices.search.fetch_current", "indices.merges.current", "indices.merges.current_docs", "indices.merges.total_size_in_bytes", "indices.merges.total", "indices.merges.total_time_in_millis", "indices.merges.total_docs", "indices.merges.total_size_in_bytes", "indices.merges.total_stopped_time_in_millis", "indices.query_cache.memory_size_in_bytes", "indices.query_cache.total_count", "indices.query_cache.hit_count", "indices.query_cache.miss_count", "indices.query_cache.cache_size", "indices.query_cache.cache_count", "indices.query_cache.evictions", "indices.fielddata.memory_size_in_bytes", "indices.fielddata.evictions", "indices.segments.count", "indices.segments.memory_in_bytes", "indices.segments.terms_memory_in_bytes", "indices.segments.stored_fields_memory_in_bytes", "indices.segments.norms_memory_in_bytes", "indices.segments.points_memory_in_bytes", "indices.segments.doc_values_memory_in_bytes", "indices.segments.index_writer_memory_in_bytes", "indices.segments.version_map_memory_in_bytes", "indices.segments.fixed_bit_set_memory_in_bytes", "indices.translog.operations", "indices.translog.size_in_bytes", "indices.request_cache.memory_size_in_bytes", "indices.request_cache.evictions", "indices.request_cache.hit_count", "indices.request_cache.miss_count", "process.open_file_descriptors", "process.max_file_descriptors", "jvm.mem.heap_used_in_bytes", "jvm.mem.heap_used_percent", "jvm.mem.heap_committed_in_bytes", "jvm.mem.heap_max_in_bytes", "jvm.mem.non_heap_used_in_bytes", "jvm.mem.non_heap_committed_in_bytes", "jvm.mem.pools.young.used_in_bytes", "jvm.mem.pools.young.max_in_bytes", "jvm.mem.pools.young.peak_used_in_bytes", "jvm.mem.pools.young.peak_max_in_bytes", "jvm.mem.pools.survivor.used_in_bytes", "jvm.mem.pools.survivor.max_in_bytes", "jvm.mem.pools.survivor.peak_used_in_bytes", "jvm.mem.pools.survivor.peak_max_in_bytes", "jvm.mem.pools.old.used_in_bytes", "jvm.mem.pools.old.max_in_bytes", "jvm.mem.pools.old.peak_used_in_bytes", "jvm.mem.pools.old.peak_max_in_bytes", "jvm.threads.count", "jvm.threads.peak_count", "jvm.gc.collectors.young.collection_count", "jvm.gc.collectors.young.collection_time_in_millis", "jvm.gc.collectors.old.collection_count", "jvm.gc.collectors.old.collection_time_in_millis", "jvm.buffer_pools.direct.count", "jvm.buffer_pools.direct.used_in_bytes", "jvm.buffer_pools.direct.total_capacity_in_bytes", "jvm.buffer_pools.mapped.count", "jvm.buffer_pools.mapped.used_in_bytes", "jvm.buffer_pools.mapped.total_capacity_in_bytes", "transport.server_open", "transport.rx_count", "transport.rx_size_in_bytes", "transport.tx_count", "transport.tx_size_in_bytes", "thread_pool.index.threads", "thread_pool.index.active", "thread_pool.index.queue", "thread_pool.index.rejected", "thread_pool.index.completed", "thread_pool.bulk.threads", "thread_pool.bulk.active", "thread_pool.bulk.queue", "thread_pool.bulk.rejected", "thread_pool.bulk.completed", "thread_pool.search.threads", "thread_pool.search.active", "thread_pool.search.queue", "thread_pool.search.rejected", "thread_pool.search.completed", "thread_pool.flush.threads", "thread_pool.flush.active", "thread_pool.flush.queue", "thread_pool.flush.rejected", "thread_pool.flush.completed", "thread_pool.management.threads", "thread_pool.management.active", "thread_pool.management.queue", "thread_pool.management.rejected", "thread_pool.management.completed", "thread_pool.warmer.threads", "thread_pool.warmer.active", "thread_pool.warmer.queue", "thread_pool.warmer.rejected", "thread_pool.warmer.completed", "thread_pool.refresh.threads", "thread_pool.refresh.active", "thread_pool.refresh.queue", "thread_pool.refresh.rejected", "thread_pool.refresh.completed", "thread_pool.generic.threads", "thread_pool.generic.active", "thread_pool.generic.queue", "thread_pool.generic.rejected", "thread_pool.generic.completed", "breakers.request.limit_size_in_bytes", "breakers.request.estimated_size_in_bytes", "breakers.request.overhead", "breakers.request.tripped", "breakers.fielddata.limit_size_in_bytes", "breakers.fielddata.estimated_size_in_bytes", "breakers.fielddata.overhead", "breakers.fielddata.tripped", "breakers.in_flight_requests.limit_size_in_bytes", "breakers.in_flight_requests.estimated_size_in_bytes", "breakers.in_flight_requests.overhead", "breakers.in_flight_requests.tripped", "breakers.parent.limit_size_in_bytes", "breakers.parent.estimated_size_in_bytes", "breakers.parent.overhead", "breakers.parent.tripped" } # ############################################ # getKeys() # ############################################ def getKeys(json_data,keys,node_fqdn): result='' for key in keys: attributes=key.split('.') value=json_data for index in range(len(attributes)): value=value.get(attributes.pop(0),{}) if value=={}: continue result += node_fqdn + ' ' + item_prefix + "{0} {1}\n".format(key,value) return result # ############################################ # get_node_stats_data() # ############################################ def get_node_stats_data(node_fqdn): """ Get ES node stats data """ # # We try all ES clients defined in elasticsearch_clients[] until # one of them returns an ansver with a 200 status code. # for client in elasticsearch_clients: try: request_data = requests.get("http://" + client + ":9200/_nodes/" + node_fqdn + "/stats") if request_data.status_code != 200: continue stats_data = request_data.json() break except Exception: pass try: for node_id in stats_data['nodes']: result = getKeys(stats_data['nodes'][node_id],stats_keys,node_fqdn) except Exception, e: print "1" sys.exit(1) return result # ############################################ # Main # ############################################ if __name__ == '__main__': try: if len(sys.argv) == 2: node_fqdn = sys.argv[1].lower() result = get_node_stats_data(node_fqdn) # # We create a file with the data that zabbix_sender will # send in a bulk execution. # with open(tmp_stat_file.name,'w') as f: f.write(result) # # The monitoring of this host can be done by any of the # zabbix proxyer defined in zabbix_proxy[]. We try all of # them until one of them accepts our data # for proxy in zabbix_proxy: returncode = os.system(zabbix_sender + ' -z ' + proxy + ' -i ' + tmp_stat_file.name + ' > /dev/null 2>&1') if returncode == 0: break if returncode != 0: print "1" sys.exit(1) else: print "1" sys.exit(1) except Exception, e: print "1" sys.exit(1) # Delete temp file with zabbix_sender data os.remove(tmp_stat_file.name) # Return value 0 = execution OK print "0"