#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2013-2019
# This file is part of Shinken Enterprise, all rights reserved.

import json

from shinkensolutions.lib_checks import schecks
from shinkensolutions.lib_checks.common import BREAK_LINE, COLOR, EXIT_STATUS, HTMLList, HTMLTable, HTMLTag, ParseOptionError, RaiseOnExitOptionParser, Result, ShinkenUtils, Utils
from shinkensolutions.lib_checks.graphite import CheckGraphite, TAG_CRITICAL, TAG_FOR_STATE, TAG_OK, TAG_UNKNOWN

# Version of this check; appended to the program name for the parser 'version' string.
VERSION = '0.1'
# Daemon type given to ShinkenUtils.request_get_daemon() and ShinkenUtils.minimal_check().
DAEMON_TYPE = 'broker'
# File read with 'cat' over SSH on the graphite server to get the disk I/O samples (JSON content).
IOSTATS_FILE = '/tmp/__check_graphite_iostats.tmp'
# Carbon configuration file path handed to the graphite checks as 'graphite_conf_file'.
GRAPHITE_CONF_FILE = '/opt/graphite/conf/carbon.conf'


class WRITE_STATUS(object):
    """Known values of the 'status' field of the '/check_graphite_write_status' answer.

    Only NO_MODULE is compared against in this file; the other values are
    declared for completeness.
    """
    # Nominal value
    OK = 'OK'
    # Declared but not referenced in this file
    NO_CONF = 'NO_CONF'
    # The broker answers this when it has no module to check
    NO_MODULE = 'NO_MODULE'


# Module-level result of the whole check: every function of this file adds its checks / perf data to it.
result = Result()

# Command line definition. The parser is used by _parse_args(), which catches ParseOptionError.
parser = RaiseOnExitOptionParser('%prog [options] [--help]', version='%prog ' + VERSION)
# --- Shinken daemon to query (hostname and port are checked as mandatory in _parse_args)
parser.add_option('-H', '--hostname', dest='hostname', help='The hostname of the shinken daemon')
parser.add_option('-p', '--port', dest='port', type='int', help='The port of the shinken daemon')
parser.add_option('-t', '--timeout', dest='timeout', type='int', default=3, help='timeout to connect to the shinken daemon. Default : 3')

# --- SSH access to the graphite server(s) and graphite settings
parser.add_option('-P', '--ssh-port', dest='ssh_port', type='int', default=22, help='SSH port to connect to. Default : 22')
parser.add_option('-i', '--ssh-key', dest='ssh_key_file', default='~/.ssh/id_rsa', help='SSH key file to use. By default it will take ~/.ssh/id_rsa.')
parser.add_option('-u', '--ssh-user', dest='user', default='shinken', help='remote user to use. By default shinken.')
parser.add_option('-r', '--passphrase', dest='passphrase', default='', help='SSH key passphrase. By default void will be used.')
parser.add_option('-d', '--graphite_location', dest='graphite_location', default='/opt/graphite/storage/whisper', help='Graphite Data Location. Default : /opt/graphite/storage/whisper')
parser.add_option('-U', '--graphite_user', dest='graphite_user', default='apache', help='Graphite user. Default : apache')
parser.add_option('-n', '--graphite_process_name', dest='graphite_process_name', default='carbon-cache', help='Graphite writer process name. Default : carbon-cache')
# NOTE(review): the four threshold options below declare no 'type', so (assuming standard optparse behaviour)
# a value given on the command line is a string while the default is an int.
# graphite_io() converts the I/O thresholds with int() before comparing them.
parser.add_option('-w', '--warning', dest='storage_usage_warning', default=85, help='Warning value for Graphite space usage. In percent. Default : 85%')
parser.add_option('-c', '--critical', dest='storage_usage_critical', default=95, help='Critical value for Graphite space usage. In percent. Default : 95%')
# Comma separated list of disk names; an empty value means that every disk of the iostats file is kept
parser.add_option('-D', '--graphite_disks', dest='graphite_disks', default='', help='Filter for disk I/O : list of disks separated by commas. Default no filter')
parser.add_option('-W', '--warning-io', dest='storage_io_warning', default=85, help='Warning value for Graphite io usage. In percent. Default : 85%')
parser.add_option('-C', '--critical-io', dest='storage_io_critical', default=95, help='Critical value for Graphite io usage. In percent. Default : 95%')

# Forwarded as is to ShinkenUtils.minimal_check() in main()
parser.add_option('--shinkenversion', dest='shinken_supervisor_version', default='', help='The shinken version number used to compare with the monitored shinken. Mandatory if in shinken mode.')


class CheckGraphiteForWriter(CheckGraphite):
    """Graphite check used for the writer side: the CheckGraphite steps plus a disk I/O usage step.

    Attributes such as graphite_hostname, ssh_port, summary, result, client, is_relay or nodes
    are expected to be provided by CheckGraphite (not visible in this file).
    """

    def __init__(self, graphite_disks, storage_io_warning, storage_io_critical, *argv, **kwarg):
        """Store the disk I/O settings and forward every other argument to CheckGraphite.

        :param graphite_disks: comma separated disk names kept by the I/O step ('' keeps every disk)
        :param storage_io_warning: average I/O usage (percent) above which a disk is WARNING
        :param storage_io_critical: average I/O usage (percent) above which a disk is CRITICAL
        """
        super(CheckGraphiteForWriter, self).__init__(*argv, **kwarg)
        self.graphite_disks = graphite_disks
        self.storage_io_warning = storage_io_warning
        self.storage_io_critical = storage_io_critical

    def get_check_node(self, node):
        """Build the check object of one node, reusing the settings of this check.

        :param node: indexable whose item 0 is the node hostname and item 1 the node port
        :return: a CheckGraphiteNodeWriter instance
        """
        return CheckGraphiteNodeWriter(
            graphite_disks=self.graphite_disks,
            storage_io_warning=self.storage_io_warning,
            storage_io_critical=self.storage_io_critical,
            graphite_hostname=node[0],
            graphite_port=node[1],
            ssh_port=self.ssh_port,
            ssh_key_file=self.ssh_key_file,
            passphrase=self.passphrase,
            user=self.user,
            graphite_location=self.graphite_location,
            graphite_user=self.graphite_user,
            storage_usage_warning=self.storage_usage_warning,
            storage_usage_critical=self.storage_usage_critical,
            # NOTE(review): nodes get an empty process name instead of self.graphite_process_name
            # (kept as it was) - confirm this is intended.
            graphite_process_name='',
            graphite_conf_file=self.graphite_conf_file,
        )

    def do_all_check(self):
        """Connect to the server, run every step (relay or cache flavour) then close the connection."""
        connect_state = self.connect(quit_on_fail=False)
        if connect_state != EXIT_STATUS.OK:
            self.result_add_check(EXIT_STATUS.UNKNOWN, 'Can not connect to server %s through SSH with user %s to get informations.' % (HTMLTag.color_text(self.graphite_hostname), HTMLTag.color_text(self.user)), step_name='Backend stats')
            return

        # The connection is released even if one of the steps raises
        try:
            self.read_carbon_conf()

            if self.is_relay:
                self.meta_check_relay()
                for i, node in enumerate(self.nodes):
                    self.meta_check_node(node, i + 1)
            else:
                self.graphite_port_check(step_index=1, step_name='Port \'%s\' status' % HTMLTag.color_text(self.graphite_port))
                self.find_process_carbon_cache(step_index=2, step_name='Process \'%s\' status' % self.graphite_process_name)
                self.graphite_space(step_index=3, step_name='Storage size usage')
                self.graphite_io(step_index=4, step_name='Storage i/o usage')
                self.graphite_file_permission(step_index=5, step_name='File permission')
        finally:
            self.close()

    def result_add_check(self, status, output, step_index=1, step_name='STEP', extra_summary=''):
        """Record the outcome of one step.

        The step is always appended to self.summary; it is added to self.result only when it is not OK.

        :param status: EXIT_STATUS value of the step
        :param output: text of the step
        :param step_index: not used here, kept for the callers passing it
        :param step_name: label of the step
        :param extra_summary: text appended to the output in the summary only
        """
        self.summary.append('%s - %s : %s' % (TAG_FOR_STATE.get(status, TAG_UNKNOWN), step_name, '%s%s' % (output, extra_summary)))
        if status != EXIT_STATUS.OK:
            self.result.add_check(status, '%s : %s' % (step_name, output))

    def _load_iostats(self, retry):
        """Read IOSTATS_FILE through SSH and decode its JSON content.

        :param retry: number of attempts; at least one attempt is always done
        :return: (True, decoded data) on success, (False, None) when every attempt failed
        """
        command_io = r"""cat %s""" % IOSTATS_FILE
        # max(1, ...) : a retry value <= 0 must neither skip the read nor loop forever
        for _attempt in range(max(1, retry)):
            try:
                _stdin, stdout, _stderr = self.client.exec_command(command_io)
                return True, json.loads(''.join(stdout))
            except Exception:
                # Best effort: any failure (SSH error, missing file, invalid JSON) only triggers a new attempt
                continue
        return False, None

    def graphite_io(self, step_index, step_name, retry=4):
        """Check the average I/O usage of the disks listed in the iostats file of the server.

        The iostats file is expected to hold a JSON object: disk name -> list of usage samples (percent).
        Each kept disk gets a perf data and is compared to the I/O thresholds; the step status
        is the worst disk status. The step is UNKNOWN when the file cannot be read or holds no usable disk.

        :param step_index: index of the step, forwarded to result_add_check
        :param step_name: label of the step, forwarded to result_add_check
        :param retry: number of attempts to read the iostats file
        """
        read_ok, data = self._load_iostats(retry)
        if not read_ok:
            self.result_add_check(
                EXIT_STATUS.UNKNOWN,
                '%s - On server %s, can not read the file %s.' % (TAG_UNKNOWN, HTMLTag.color_text(self.graphite_hostname), HTMLTag.color_text(IOSTATS_FILE)),
                step_index,
                step_name,
                extra_summary='The graphite server may not be up to date, or the carbon-cache service is incorrectly started. Try to restart carbon-cache process.'
            )
            return

        if not data:
            self.result_add_check(EXIT_STATUS.UNKNOWN, 'Cannot find any disks in the iostats file.', step_index, step_name)
            return

        # Thresholds can be strings (command line) or ints (defaults): convert them once
        io_critical = int(self.storage_io_critical)
        io_warning = int(self.storage_io_warning)

        exit_status = EXIT_STATUS.OK
        lines = []
        disk_filter = [d.strip() for d in self.graphite_disks.split(',') if d.strip()]
        # Sorted by disk name to get the same output order on every run
        for disk_name, data_for_disk in sorted(data.iteritems()):
            if disk_filter and disk_name not in disk_filter:
                continue
            values = [float(v) for v in data_for_disk]
            if not values:
                # No sample for this disk: no average / min / max can be computed, so skip it
                continue

            avg = sum(values) / len(values)

            # The perf data needs the raw average, before it is formatted for the output
            self.result.add_perf_data('%s_graphite_disk_%s_IO' % (self.graphite_hostname, disk_name), '%s%%' % avg)

            if avg > io_critical:
                disk_status = EXIT_STATUS.CRITICAL
                disk_color = COLOR.RED
                exit_status = EXIT_STATUS.CRITICAL
            elif avg > io_warning:
                disk_status = EXIT_STATUS.WARNING
                disk_color = COLOR.ORANGE
                # Never downgrade a CRITICAL raised by a previous disk
                if exit_status < EXIT_STATUS.WARNING:
                    exit_status = EXIT_STATUS.WARNING
            else:
                disk_status = EXIT_STATUS.OK
                disk_color = COLOR.GREEN

            avg_text = HTMLTag.color_text('%.2f' % avg, disk_color)
            lines.append('Disk %s %s : Average : %s%% - Minimum : %s%% - Maximum : %s%%' % (HTMLTag.color_text(disk_name), TAG_FOR_STATE[disk_status], avg_text, min(values), max(values)))

        if not lines:
            self.result_add_check(EXIT_STATUS.UNKNOWN, 'Cannot find any disks in the iostats file. Maybye the disk filter is not correct. Actual value : %s' % disk_filter, step_index, step_name)
            return

        output = HTMLList.simple_list(lines)

        self.result_add_check(exit_status, output, step_index, step_name)


class CheckGraphiteNodeWriter(CheckGraphiteForWriter):
    """Writer check of one node, with its own way to open the SSH connection."""

    def connect(self, quit_on_fail=True, timeout=None):
        """Open the SSH connection with schecks.connect.

        On success the client is stored in self.client; otherwise the returned message
        is added to both self.summary and self.result.

        :return: the state returned by schecks.connect
        """
        state, payload = schecks.connect(
            self.graphite_hostname,
            self.ssh_port,
            self.ssh_key_file,
            self.passphrase,
            self.user,
            quit_on_fail=quit_on_fail,
            timeout=timeout
        )
        if state != EXIT_STATUS.OK:
            # On failure the payload is the error message
            self.summary.append(payload)
            self.result.add_check(state, payload)
            return state

        # On success the payload is the connected client
        self.client = payload
        return state


def _parse_args():
    """Parse the command line and check the mandatory options.

    Positional arguments are refused; hostname and port are mandatory.
    When the parser raises a ParseOptionError with a message, the check hard exits
    as CRITICAL with that message; without message the program exits with code 0.

    :return: the parsed options
    """
    # (option attribute, error reported when the option is missing), checked in this order
    mandatory_options = (
        ('hostname', 'Missing parameter hostname (-H/--hostname)'),
        ('port', 'Missing parameter port (-p/--port)'),
    )
    opts = None
    try:
        opts, positional_args = parser.parse_args()
        if positional_args:
            parser.error('Does not accept arguments.')

        for option_name, error_message in mandatory_options:
            if opts and not getattr(opts, option_name):
                parser.error(error_message)

    except ParseOptionError as e:
        if e.msg:
            # The message is displayed as HTML: line feeds become BREAK_LINE
            html_message = BREAK_LINE.join(e.msg.split('\n'))
            result.hard_exit(EXIT_STATUS.CRITICAL, 'Fail to parse command argument : %s %s' % (BREAK_LINE, html_message))
        exit(0)

    return opts


def metacheck_graphite_server(write_status, opts):
    """Check every graphite server listed in the write status and add the outcome to the global result.

    For each entry of write_status['data'] a CheckGraphiteForWriter is run; its summary becomes one row
    of the 'Graphite servers' table. The problems and perf data gathered during these checks
    are then copied into the module-level result.

    :param write_status: decoded answer of '/check_graphite_write_status'; each item of its 'data' list
                         is read through the 'host', 'port', 'module_name' and 'can_post_data' keys
    :param opts: parsed command line options
    """
    headers = []
    lines = []
    # NOTE(review): this Result is shared by every server check, so 'result.status' read in the loop
    # presumably also reflects the problems of the servers checked before - confirm whether
    # the per-server header state is expected to be cumulative.
    graphite_result = Result()
    for write_status_server in write_status['data']:

        server_host = write_status_server['host']
        # A loopback address is relative to the daemon: use the daemon hostname instead
        server_host = opts.hostname if server_host in ['127.0.0.1', 'localhost'] else server_host
        server_port = write_status_server['port']
        module_name = write_status_server['module_name']
        can_post_data = write_status_server['can_post_data']
        server_state = EXIT_STATUS.OK if can_post_data else EXIT_STATUS.CRITICAL

        check_graphite_for_writer = CheckGraphiteForWriter(
            graphite_disks=opts.graphite_disks,
            storage_io_warning=opts.storage_io_warning,
            storage_io_critical=opts.storage_io_critical,
            graphite_hostname=server_host,
            graphite_port=server_port,
            ssh_port=opts.ssh_port,
            ssh_key_file=opts.ssh_key_file,
            passphrase=opts.passphrase,
            user=opts.user,
            graphite_location=opts.graphite_location,
            graphite_user=opts.graphite_user,
            storage_usage_warning=opts.storage_usage_warning,
            storage_usage_critical=opts.storage_usage_critical,
            graphite_process_name=opts.graphite_process_name,
            graphite_conf_file=GRAPHITE_CONF_FILE,
        )

        check_graphite_for_writer.result = graphite_result
        check_graphite_for_writer.do_all_check()

        _title = 'Server %s used by module : %s' % (server_host, HTMLTag.color_text(module_name))
        if can_post_data:
            post_data_summary = '%s - Module can post data.' % TAG_OK
        else:
            post_data_summary = '%s - Module cannot post data' % TAG_CRITICAL
        check_graphite_for_writer.summary.insert(0, post_data_summary)
        check_list_str = '%s' % (HTMLList.header_list(_title, check_graphite_for_writer.summary))
        if check_graphite_for_writer.is_relay:
            check_list_str += HTMLList.header_list('Node information', check_graphite_for_writer.nodes_summary)
        server_state = max(server_state, check_graphite_for_writer.result.status)

        headers.append('%s:%s is %s<br>%s' % (server_host, server_port, TAG_FOR_STATE[server_state], check_graphite_for_writer.get_mode()))
        lines.append([check_list_str])

    long_output = HTMLTable.table([], lines, left_headers=headers, title='Graphite servers', compact_title=True, all_col_same_width=False)

    unknown_summary = 'Graphite servers : Some information are not available :'
    summary_by_status = {
        EXIT_STATUS.OK      : 'Graphite servers : All servers are available.',
        EXIT_STATUS.WARNING : 'Graphite servers : Some problems have been detected :',
        EXIT_STATUS.CRITICAL: 'Graphite servers : Critical errors have been detected :',
        EXIT_STATUS.UNKNOWN : unknown_summary,
    }
    # An unexpected status is reported like UNKNOWN instead of leaving 'output' undefined
    output = summary_by_status.get(graphite_result.status, unknown_summary)

    result.add_check(output=output, long_output=long_output)

    # Copy the problems found on the servers into the global result, worst level first
    outputs_by_level = (
        (EXIT_STATUS.CRITICAL, COLOR.RED, graphite_result.criticals),
        (EXIT_STATUS.WARNING, COLOR.ORANGE, graphite_result.warnings),
        (EXIT_STATUS.UNKNOWN, COLOR.BLACK, graphite_result.outputs),
    )
    for level_status, level_color, level_outputs in outputs_by_level:
        for _output in level_outputs:
            _output = HTMLTag.tag_border('<div class="skn-ln">%s%s</div>' % (HTMLTag.color_text(u'=> ', level_color), _output), level_color)
            result.add_check(output=_output, status=level_status)

    for _key, _value in graphite_result.perf_data.iteritems():
        result.add_perf_data(_key, _value)


def _load_percent(busy_time, available_time):
    """Return busy_time / available_time as a percentage, always computed with a float division."""
    # float() : with Python 2, dividing two integers would truncate the ratio (45 / 60 == 0)
    return (float(busy_time) / available_time) * 100


def _build_worker_report(worker_id, worker_data):
    """Return (state, html description, work time) for one worker of a graphite_perfdata module."""
    work_time, work_range = worker_data['work_time']
    # -1 samples are left out of the display (presumably samples without measure - to confirm)
    work_time_sampling = [i for i in worker_data['work_time_sampling'] if i != -1]
    # The work time is compared to a 60 seconds window
    _worker_load = _load_percent(work_time, 60)
    worker_state = EXIT_STATUS.WARNING if _worker_load > 80.0 else EXIT_STATUS.OK

    broks = [
        '%d broks were processed during the last minute' % (worker_data['last_minute_broks_sent_nb']),
        HTMLList.simple_list(['%s metrics were processed.' % (worker_data['last_minute_metrics_sent_nb'])]),
    ]
    worker_lines = [
        '%s elements are managed by this worker.' % (worker_data['number_of_managed_items']),
        ''.join(broks),
        '%s sent to graphite in the last minute' % (Utils.print_human_readable_size(worker_data['last_minute_sent_size'])),
        'Work time during the last %.3f seconds : %.3f seconds (%3d%%).' % (work_range, work_time, _worker_load),
        'Work time per 10 second sample : %s.' % ' - '.join(['%.3fs' % i for i in work_time_sampling]),
    ]
    worker_info = HTMLList.header_list('Worker %s %s' % (worker_id, TAG_FOR_STATE[worker_state]), worker_lines)
    return worker_state, worker_info, work_time


def check_modules(raw_stats):
    """Check the graphite_perfdata modules described in the broker raw stats.

    Adds to the module-level result a 'Module stats' table (one row per module) and two perf data per worker.
    A module is CRITICAL when it has no worker or not the configured number of workers,
    WARNING when the module load or one worker load is above 80%.

    :param raw_stats: decoded answer of '/get_raw_stats'; the modules are read
                      from raw_stats['module_stats']['graphite_perfdata']
    """
    graphite_modules = raw_stats.get('module_stats', {}).get('graphite_perfdata', {})
    result.add_check(output='Module stats :')
    if not graphite_modules:
        _output = '<div class="skn-ln">%s%s</div>' % (HTMLTag.color_text(u'=> ', COLOR.RED), '%s - There is no information about graphite_perfdata module on the Broker daemon.' % TAG_CRITICAL)
        _output = HTMLTag.tag_border(_output, COLOR.RED)

        _long_output = HTMLTable.table(
            [],
            [['There is no information about graphite_perfdata module on this Broker daemon. Maybye, the broker cannot compute the stats in time. Please check the broker logs']],
            left_headers=[TAG_CRITICAL],
            title='Module stats',
            compact_title=True,
            all_col_same_width=False
        )

        result.add_check(EXIT_STATUS.CRITICAL, _output, long_output=_long_output)
        return

    headers = []
    modules_result = Result()
    for module_name, module_stats in sorted(graphite_modules.iteritems()):
        module_result = Result()
        workers = module_stats['workers']
        expected_nb_workers = module_stats['nb_workers']
        started_nb_workers = len(workers)
        if started_nb_workers == 0:
            module_result.add_check(EXIT_STATUS.CRITICAL, '%s - The module does not have any workers.' % TAG_CRITICAL)

        # Check that the workers are here and in good number (real versus expected)
        elif started_nb_workers != expected_nb_workers:
            module_result.add_check(EXIT_STATUS.CRITICAL, '%s - You only have %d workers but the module is configured to have %s.' % (TAG_CRITICAL, started_nb_workers, expected_nb_workers))
        else:
            last_minute_nb_metric_sent = module_stats['last_minute_nb_metric_sent']
            last_minute_sent_size = float(module_stats['last_minute_sent_size'])
            last_minute_nb_broks_sent = module_stats['last_minute_broks_sent_nb']

            # Module load : cumulated times divided by the number of workers (cannot be 0 in this branch)
            time_sum = module_stats['cumulative_other_time'] + module_stats['cumulative_parse_time'] + module_stats['cumulative_connection_time']
            module_load = _load_percent(time_sum, started_nb_workers)

            module_result.add_check(EXIT_STATUS.OK, 'Number of metrics sent to graphite in the last minute: %s' % last_minute_nb_metric_sent)
            module_result.add_check(EXIT_STATUS.OK, 'Number of broks managed in the last minute : %s' % last_minute_nb_broks_sent)
            module_result.add_check(EXIT_STATUS.OK, 'Volume of metrics sent to graphite in the last minute : %s' % (Utils.print_human_readable_size(last_minute_sent_size)))
            module_result.add_check(EXIT_STATUS.WARNING if module_load > 80.0 else EXIT_STATUS.OK, 'The module load is %d%%' % module_load)
            module_result.add_check(EXIT_STATUS.OK, 'All workers (%s/%s) are running <br>' % (started_nb_workers, expected_nb_workers))
            for index, (worker_id, worker_data) in enumerate(sorted(workers.iteritems())):
                worker_state, worker_info, work_time = _build_worker_report(worker_id, worker_data)

                _is_last_worker = (index == (started_nb_workers - 1))
                module_result.add_check(worker_state, worker_info, no_new_line=_is_last_worker)
                result.add_perf_data('worker_%s_last_minute_metrics_sent_nb' % worker_id, worker_data['last_minute_metrics_sent_nb'])
                result.add_perf_data('worker_%s_last_minute_work_time' % worker_id, work_time)

        headers.append('%s : %s' % (module_name, TAG_FOR_STATE[module_result.status]))

        check_list_str = HTMLList.simple_list(module_result.outputs_no_sort)
        modules_result.add_check(module_result.status, [check_list_str], no_new_line=True)

    stats = HTMLTable.table([], modules_result.outputs_no_sort, left_headers=headers, title='Module stats', compact_title=True, all_col_same_width=False)

    output = 'Modules are running properly.'
    if modules_result.status != EXIT_STATUS.OK:
        output = 'The modules are not running properly.'

    result.add_check(status=modules_result.status, output=output, long_output=stats)


def main():
    """Entry point: query the broker daemon, then check the graphite servers and the graphite modules.

    The check hard exits as CRITICAL when the timeout is not strictly positive, and as OK when
    the broker reports that it has no graphite_perfdata module.
    """
    HTMLTag.EXTRA_STYLE = u'.skn-met{margin:7px 7px;border:1px solid;width:calc(100% - 14px);}'
    HTMLTag.EXTRA_CLASS = u'skn-met'
    opts = _parse_args()

    timeout = opts.timeout
    if timeout <= 0:
        result.hard_exit(EXIT_STATUS.CRITICAL, 'The --timeout option (%s) must be greater than 0' % timeout)

    daemon_uri = '%s:%s' % (opts.hostname, opts.port)

    def _get_json(route):
        # Ask one route to the daemon and decode its JSON answer (the connection time is not used)
        html, _connection_time = ShinkenUtils.request_get_daemon(result, DAEMON_TYPE, daemon_uri, route, timeout=timeout)
        return json.loads(html)

    raw_stats = _get_json('/get_raw_stats')
    ShinkenUtils.minimal_check(result, raw_stats, DAEMON_TYPE, opts.shinken_supervisor_version)

    write_status = _get_json('/check_graphite_write_status')
    no_graphite_module = (write_status['status'] == WRITE_STATUS.NO_MODULE)
    if no_graphite_module:
        result.hard_exit(EXIT_STATUS.OK, 'No module "graphite_perfdata" on this broker')

    metacheck_graphite_server(write_status, opts)
    check_modules(raw_stats)

    result.exit(sorted_by_level=False)


# Run the check only when the file is executed as a script (not when it is imported)
if __name__ == '__main__':
    main()
