#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2013-2019
# This file is part of Shinken Enterprise, all rights reserved.


import ConfigParser
import json
import socket
import shutil
import time
import os
import random

try:
    import pwd
    import grp
except ImportError:  # windows
    pwd = grp = None
from subprocess import Popen, PIPE

from shinken.misc.type_hint import Optional, List, Tuple
from shinken.log import logger
from shinkensolutions.lib_checks import schecks
from shinkensolutions.lib_checks.common import Result, HTMLTag, COLOR, EXIT_STATUS, HTMLList
from shinkensolutions.lib_checks.libs.paramiko.client import SSHClient


class GRAPHITE_STATS_KEY(object):
    """Names of the timestamp keys used in Graphite statistics dictionaries."""
    # Not referenced in this part of the file - presumably the local time of the reader, TODO confirm
    LOCAL_TIME = 'graphite_local_time'
    # Key under which GraphiteMetricsCounter stores the time the metric count was computed
    TIME_READ = 'graphite_stats_time'


# Not referenced in this part of the file - presumably the Graphite version these checks target, TODO confirm
GRAPHITE_API_VERSION = '0.9.11'
# Not referenced in this part of the file - presumably a maximum age in seconds for the stats file, TODO confirm
GRAPHITE_STATS_FILE_IS_TOO_OLD = 180

# Colored status labels inserted in the check outputs
TAG_OK = HTMLTag.color_text('OK', COLOR.GREEN)
TAG_WARNING = HTMLTag.color_text('WARNING', COLOR.ORANGE)
TAG_CRITICAL = HTMLTag.color_text('CRITICAL', COLOR.RED)
TAG_UNKNOWN = HTMLTag.color_text('UNKNOWN', COLOR.BLACK)

# Process names CheckGraphite looks for when the caller does not give one (cache mode / relay mode)
CARBON_CACHE_PROCESS_NAME = 'carbon-cache'
CARBON_RELAY_PROCESS_NAME = 'carbon-relay'

# Exit status => colored label shown in the step and node summaries
TAG_FOR_STATE = {
    EXIT_STATUS.OK      : TAG_OK,
    EXIT_STATUS.WARNING : TAG_WARNING,
    EXIT_STATUS.CRITICAL: TAG_CRITICAL,
    EXIT_STATUS.UNKNOWN : TAG_UNKNOWN,
}

# Directory scanned by GraphiteMetricsCounter to count the '*.wsp' files
WHISPER_DIR = '/opt/graphite/storage/whisper'
# JSON file in which GraphiteMetricsCounter saves the computed counts
NB_METRICS_COUNT_FILE = '/opt/graphite/storage/whisper/.nb_metrics'


class GraphiteMetricsCounter(object):
    """Count the whisper files stored under WHISPER_DIR and save the totals in NB_METRICS_COUNT_FILE.
    
    Attributes:
        metrics: number of '*.wsp' files found outside of the unwanted directories.
        level_0: number of top level entries of WHISPER_DIR, unwanted ones excluded.
    """
    # Top level entries of WHISPER_DIR that must not be counted
    UNWANTED_METRIC_DIRECTORIES = ('carbon', '.cacheinvalidation', '.nb_metrics')
    
    
    def __init__(self):
        self.metrics = 0
        self.level_0 = 0
    
    
    def update_count(self):
        """Recompute both counters and write them, with the current time, in NB_METRICS_COUNT_FILE.
        
        Raises whatever _write_metric_file raises when the file cannot be saved.
        """
        self._update_level_0()
        self._do_compute_number_of_metrics_file()
        graphite_stats_time = int(time.time())
        self._write_metric_file(graphite_stats_time)
    
    
    def _update_level_0(self):
        """Count the direct entries of WHISPER_DIR whose name is not an unwanted one."""
        self.level_0 = len([folder for folder in os.listdir(WHISPER_DIR) if folder not in self.UNWANTED_METRIC_DIRECTORIES])
    
    
    # This will be computed by the iostats_collector script
    def _do_compute_number_of_metrics_file(self):
        """Set self.metrics to the number of '*.wsp' files, using a shell pipeline first and os.walk as a fallback."""
        self.metrics = 0
        
        try:
            # A find command is faster than walking the tree ourselves
            cmd = "find %s -type f  -name '*.wsp'  | grep -v '%s/carbon' | wc -l" % (WHISPER_DIR, WHISPER_DIR)
            p = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True)
            output, _stderr = p.communicate()
            self.metrics = int(output.strip())
            return
        except Exception:
            # The command could not be launched or its output was not a number: count by hand below
            # (maybe we are not allowed to launch commands any more, like under apache)
            pass
        
        for root, dirs, files in os.walk(WHISPER_DIR):
            # Path relative to WHISPER_DIR WITHOUT its leading separator: with the separator kept,
            # startswith() can never match an unwanted directory name and nothing would be skipped
            relative_root = root[len(WHISPER_DIR):].lstrip(os.sep)
            if relative_root.startswith(self.UNWANTED_METRIC_DIRECTORIES):
                continue
            for basename in files:
                if basename.endswith('.wsp'):
                    self.metrics += 1
    
    
    def _get_count_export(self, graphite_stats_time):
        """Return the dictionary that is serialized in the metric count file."""
        return {'metrics': self.metrics, 'level_0': self.level_0, GRAPHITE_STATS_KEY.TIME_READ: graphite_stats_time}
    
    
    def _write_metric_file(self, graphite_stats_time):
        """Write the counts as JSON in NB_METRICS_COUNT_FILE, owned by apache:apache.
        
        The data is written in a temporary file that is then moved over the final one, so a reader
        does not see a half written file. Any error (unwritable directory, missing apache user,
        no pwd/grp modules on windows...) is raised to the caller, after the temporary file has
        been removed.
        """
        tmp_file = NB_METRICS_COUNT_FILE + '.tmp.%d' % random.randint(1, 100000)
        count = self._get_count_export(graphite_stats_time)
        try:
            with open(tmp_file, 'w') as f:
                f.write(json.dumps(count))
            os.chown(tmp_file, pwd.getpwnam("apache").pw_uid, grp.getgrnam("apache").gr_gid)
            shutil.move(tmp_file, NB_METRICS_COUNT_FILE)  # atomic move
        finally:
            # After a successful move the temporary file is gone: it only still exists if a step failed
            if os.path.exists(tmp_file):
                try:
                    os.remove(tmp_file)
                except OSError:
                    pass


def _split_node_graphite(def_node):
    # safe split
    def_node = ('%s:::' % def_node).split(':')
    return def_node[0].strip(), def_node[1].strip(), def_node[2].strip()


class GraphiteConfReader(object):
    """Read the carbon configuration file of a Graphite server through SSH.
    
    After read_carbon_conf():
        carbon_conf: the parsed configuration file.
        is_relay: True when graphite_port is one of the receiver ports of the 'relay' section.
        nodes: the 'relay' DESTINATIONS entries, each one split in three fields by _split_node_graphite.
    """
    
    def __init__(self, graphite_hostname, graphite_port, ssh_port, ssh_key_file, passphrase, graphite_conf_file, user):
        self.graphite_hostname = graphite_hostname
        self.graphite_port = graphite_port
        self.ssh_port = ssh_port
        self.user = user
        self.ssh_key_file = ssh_key_file
        self.passphrase = passphrase
        self.graphite_conf_file = graphite_conf_file
        
        self.nodes = []  # type: List[Tuple[basestring, basestring, basestring]]
        self.carbon_conf = None  # type: Optional[ConfigParser]
        self.is_relay = None  # type: Optional[bool]
        self.client = None  # type: Optional[SSHClient]
    
    
    def connect(self, quit_on_fail=True, timeout=None):
        """Open the SSH connection with schecks.connect and return an exit status.
        
        With quit_on_fail the value returned by schecks.connect is kept as the client and
        EXIT_STATUS.OK is returned (presumably schecks.connect quits by itself on failure - TODO confirm).
        Without it, schecks.connect returns a (status, client) pair: the client is kept only when
        the status is EXIT_STATUS.OK, and the status is returned.
        """
        if quit_on_fail:
            self.client = schecks.connect(self.graphite_hostname, self.ssh_port, self.ssh_key_file, self.passphrase, self.user, quit_on_fail=quit_on_fail, timeout=timeout)
            return EXIT_STATUS.OK
        
        return_status, client = schecks.connect(self.graphite_hostname, self.ssh_port, self.ssh_key_file, self.passphrase, self.user, quit_on_fail=quit_on_fail, timeout=timeout)
        if return_status == EXIT_STATUS.OK:
            self.client = client
        
        return return_status
    
    
    def close(self):
        """Close the SSH connection, if one was opened."""
        # connect() may never have been called, or may have failed without giving a client
        if self.client is not None:
            self.client.close()
    
    
    def get_mode(self):
        """Return the bold HTML text describing the mode (relay or cache) found by read_carbon_conf."""
        return HTMLTag.color_text('relay (sending data to Node)' if self.is_relay else 'cache (where data is stored)', bold=True)
    
    
    def read_carbon_conf(self, retry=4):
        # type: (int) -> ConfigParser
        """Read the configuration file on the server and fill carbon_conf, is_relay and nodes.
        
        The retry parameter is accepted but not used. Note that an empty DESTINATIONS gives
        one node made of three empty strings.
        """
        stdin, stdout, stderr = self.client.exec_command('LC_ALL=C cat %s' % self.graphite_conf_file)
        # The defaults are the values returned for the options the file does not define
        self.carbon_conf = ConfigParser.SafeConfigParser({'LINE_RECEIVER_PORT': '--no port--', 'PICKLE_RECEIVER_PORT': '--no port--', 'DESTINATIONS': ''})
        self.carbon_conf.readfp(stdout)
        self.is_relay = self._is_relay_check()
        self.nodes = [_split_node_graphite(i) for i in self._get_conf_option('relay', 'DESTINATIONS').split(',')]
        return self.carbon_conf
    
    
    def _get_conf_option(self, section, option):
        """Return an option of the configuration, or its default value when the whole section is missing."""
        try:
            return self.carbon_conf.get(section, option)
        except ConfigParser.NoSectionError:
            # The parser only applies its defaults inside an existing section: do it ourselves here,
            # so a file without this section (or an empty / unreadable file) does not end in a traceback
            return self.carbon_conf.defaults().get(self.carbon_conf.optionxform(option), '')
    
    
    def _is_relay_check(self):
        """Return True when graphite_port is a relay receiver port, False when it is a cache one.
        
        When the port is in neither section, the check is ended with a CRITICAL status by Result.hard_exit.
        """
        relay_ports = (
            self._get_conf_option('relay', 'LINE_RECEIVER_PORT'),
            self._get_conf_option('relay', 'PICKLE_RECEIVER_PORT')
        )
        cache_ports = (
            self._get_conf_option('cache', 'LINE_RECEIVER_PORT'),
            self._get_conf_option('cache', 'PICKLE_RECEIVER_PORT')
        )
        
        # The configuration values are strings: a port given as an integer would never match them
        port = '%s' % (self.graphite_port,)
        
        if port not in cache_ports and port not in relay_ports:
            # This class does not create a result by itself (the subclasses do): make one if needed
            result = getattr(self, 'result', None)
            if result is None:
                result = self.result = Result()
            result.hard_exit(EXIT_STATUS.CRITICAL, 'Graphite port %s was not found in configuration file %s on %s server.' % (self.graphite_port, self.graphite_conf_file, self.graphite_hostname))
        
        return port in relay_ports


class CheckGraphite(GraphiteConfReader):
    """Run the health checks of a Graphite server through SSH and collect their results.
    
    Attributes:
        result: the Result that receives the non OK checks and the perf data.
        summary: one HTML line per executed step, whatever its status.
        nodes_summary: one HTML line per destination node checked (relay mode only).
    """
    
    def __init__(self, graphite_location, graphite_user, storage_usage_warning, storage_usage_critical, graphite_process_name, *argv, **kwarg):
        super(CheckGraphite, self).__init__(*argv, **kwarg)
        self.graphite_location = graphite_location
        self.graphite_user = graphite_user
        self.storage_usage_warning = storage_usage_warning
        self.storage_usage_critical = storage_usage_critical
        self.graphite_process_name = graphite_process_name
        
        self.result = Result()
        self.summary = []
        self.nodes_summary = []
    
    
    def read_carbon_conf(self, retry=4):
        """Read the configuration file, then fill the location, user and process name the caller left empty."""
        super(CheckGraphite, self).read_carbon_conf(retry=retry)
        self.graphite_location = self.graphite_location or self.carbon_conf.get('cache', 'LOCAL_DATA_DIR')
        self.graphite_user = self.graphite_user or self.carbon_conf.get('cache', 'USER')
        self.graphite_process_name = self.graphite_process_name or (CARBON_RELAY_PROCESS_NAME if self.is_relay else CARBON_CACHE_PROCESS_NAME)
        return self.carbon_conf
    
    
    def result_add_check(self, status, output, step_index=1, step_name='STEP', extra_summary=''):
        """Add a step line to the summary and, when the status is not OK, report the step to the result.
        
        extra_summary is only shown in the summary line, not in the result output.
        """
        step_index_as_text = HTMLTag.color_text('STEP %s' % step_index)
        self.summary.append('%s : %s - %s : %s' % (step_index_as_text, TAG_FOR_STATE.get(status, TAG_UNKNOWN), step_name, '%s%s' % (output, extra_summary)))
        if status != EXIT_STATUS.OK:
            self.result.add_check(status, '%s : %s' % (step_name, output))
    
    
    # We are requesting the server as it's real public interface, and for this we are requesting the total number of metrics
    # * our graphite : /metrics/get-metrics-count
    # * vanilla graphite: /metrics/index.json (but you get ALL metrics names)
    def api_graphite_metric_check(self, step_index, step_name, retry=4):
        """Ask the Graphite HTTP API for the number of metrics and report it (also as 'nb_metrics' perf data).
        
        The request is tried 'retry' times (at least once) before the step is reported CRITICAL.
        """
        while True:
            try:
                retry -= 1
                # The curl command is executed on the server itself, against its own hostname
                command = 'curl "http://%s/metrics/get-metrics-count"' % self.graphite_hostname
                stdin, stdout, stderr = self.client.exec_command(command)
                resp = stdout.readline()
                try:
                    count = json.loads(resp)
                    metrics_nb = count['metrics']
                except Exception:  # is not the expected json, so must be an old graphite
                    command = 'curl "http://%s/metrics/index.json"' % self.graphite_hostname
                    stdin, stdout, stderr = self.client.exec_command(command)
                    resp = stdout.readline()
                    metrics = json.loads(resp)
                    # The carbon agents own metrics are not counted
                    metrics = [m for m in metrics if 'carbon.agents' not in m]
                    metrics_nb = len(metrics)
                
                self.result_add_check(EXIT_STATUS.OK, '%s - %s metrics found.' % (TAG_OK, metrics_nb), step_index, step_name)
                self.result.add_perf_data('nb_metrics', metrics_nb)
                return
            except Exception:
                # '<=' and not '==': a retry given as 0 or less must not loop forever
                if retry <= 0:
                    self.result_add_check(EXIT_STATUS.CRITICAL, '%s - Fail to request metric to graphite server.' % TAG_CRITICAL, step_index, step_name)
                    return
    
    
    def graphite_port_check(self, step_index, step_name, retry=4):
        # type: (int, basestring, int) -> None
        """Check that a TCP connection (0.5s timeout) can be opened on the graphite port.
        
        The connection is tried 'retry' times (at least once) before the step is reported CRITICAL.
        """
        sock = None
        while True:
            try:
                retry -= 1
                sock = socket.create_connection((str(self.graphite_hostname), int(self.graphite_port)), 0.5)
                self.result_add_check(EXIT_STATUS.OK, '%s.' % HTMLTag.color_text('OPEN', COLOR.GREEN), step_index, step_name)
                return
            except Exception as exp:
                # '<=' and not '==': a retry given as 0 or less must not loop forever
                if retry <= 0:
                    self.result_add_check(EXIT_STATUS.CRITICAL, '%s. %s' % (HTMLTag.color_text('CLOSE', COLOR.RED), exp), step_index, step_name)
                    return
            finally:
                if sock is not None:
                    sock.close()
    
    
    def graphite_conf_type_check(self, step_index, step_name):
        """Report, as an OK step, the mode (RELAY or CACHE) found in the configuration."""
        conf_type = "RELAY" if self.is_relay else "CACHE"
        self.result_add_check(EXIT_STATUS.OK, HTMLTag.color_text(conf_type, COLOR.GREEN), step_index, step_name)
    
    
    def find_process_carbon_cache(self, step_index, step_name, retry=4):
        """Check with pgrep that exactly one process with the graphite process name is running.
        
        The command is tried 'retry' times (at least once) before the step is reported CRITICAL.
        """
        while True:
            try:
                retry -= 1
                raw = r"""pgrep %s""" % self.graphite_process_name
                stdin, stdout, stderr = self.client.exec_command('LC_ALL=C %s' % raw)
                count = 0
                pid_found = None
                # pgrep prints one pid per line
                for line in stdout:
                    line = line.strip()
                    if not line:
                        continue
                    
                    count += 1
                    pid_found = int(line)
                
                if count == 0:
                    output = '%s - No \'%s\' process have been found.' % (HTMLTag.color_text('Not running.', COLOR.RED), self.graphite_process_name)
                    exit_status = EXIT_STATUS.CRITICAL
                elif count > 1:
                    output = '%s - You should have only one \'%s\' process running' % (HTMLTag.color_text('%s \'%s\' process found.' % (count, self.graphite_process_name), COLOR.RED), self.graphite_process_name)
                    exit_status = EXIT_STATUS.CRITICAL
                else:
                    output = '%s - (process \'%s\' with pid %s)' % (HTMLTag.color_text('Running', COLOR.GREEN), self.graphite_process_name, pid_found)
                    exit_status = EXIT_STATUS.OK
                
                self.result_add_check(exit_status, output, step_index, step_name)
                return
            except Exception as exp:
                # '<=' and not '==': a retry given as 0 or less must not loop forever
                if retry <= 0:
                    self.result_add_check(EXIT_STATUS.CRITICAL, '%s - Fail to get graphite server process info. [%s]' % (TAG_CRITICAL, exp), step_index, step_name)
                    return
    
    
    def graphite_space(self, step_index, step_name, retry=4):
        """Compare the disk usage of the graphite location (read with df) with the warning and critical thresholds.
        
        The used percentage is also added as perf data when it could be read.
        The command is tried 'retry' times (at least once) before the step is reported CRITICAL.
        """
        while True:
            retry -= 1
            try:
                command_partition_usage = 'LC_ALL=C df -P %s' % self.graphite_location
                stdin, stdout, stderr = self.client.exec_command(command_partition_usage)
                
                used_pct = None
                for line in stdout:
                    line = line.strip()
                    # By pass the header line, we already know about it
                    if not line or line.startswith('Filesystem'):
                        continue
                    # Only keep non void elements
                    tmp = [s for s in line.split(' ') if s]
                    # 5th column of 'df -P' is the used capacity, like '42%'
                    used_pct = int(tmp[4][:-1])
                
                if used_pct is None:
                    exit_status = EXIT_STATUS.CRITICAL
                    output = 'Parse for size of the command [%s] fail.' % command_partition_usage
                elif used_pct >= int(self.storage_usage_critical):
                    exit_status = EXIT_STATUS.CRITICAL
                    output = '%s - Greater than your critical threshold (> %s%%).' % (HTMLTag.color_text(str(used_pct) + '%', COLOR.RED), self.storage_usage_critical)
                elif used_pct >= int(self.storage_usage_warning):
                    exit_status = EXIT_STATUS.WARNING
                    output = '%s - Greater than your warning threshold (> %s%%).' % (HTMLTag.color_text(str(used_pct) + '%', COLOR.ORANGE), self.storage_usage_warning)
                else:
                    exit_status = EXIT_STATUS.OK
                    output = '%s - Correct, below the limits (< %s%%).' % (HTMLTag.color_text(str(used_pct) + '%', COLOR.GREEN), self.storage_usage_warning)
                
                self.result_add_check(exit_status, output, step_index, step_name)
                # No perf data without a value: it would be sent as 'None%'
                if used_pct is not None:
                    self.result.add_perf_data('%s_graphite_storage_size' % self.graphite_hostname, '%s%%' % used_pct)
                return
            except Exception as exp:
                # '<=' and not '==': a retry given as 0 or less must not loop forever
                if retry <= 0:
                    self.result_add_check(EXIT_STATUS.CRITICAL, 'Fail to get server storage usage. [%s]' % exp, step_index, step_name)
                    return
    
    
    def graphite_file_permission(self, step_index, step_name, retry=4):
        """Check that every file of the graphite location is owned by the graphite user.
        
        At most 10 of the files with another owner are listed in the step summary.
        The command is tried 'retry' times (at least once) before the step is reported CRITICAL.
        """
        while True:
            retry -= 1
            try:
                # 11 lines are asked to know if there are more than the 10 that are displayed
                command_file_permission = """LC_ALL=C find "%s" ! -user "%s" | head -11""" % (self.graphite_location, self.graphite_user)
                stdin, stdout, stderr = self.client.exec_command(command_file_permission)
                
                data = [line.strip() for line in stdout if line.strip()]
                have_extra_line = len(data) > 10
                data = data[:10]
                
                if data:
                    exit_status = EXIT_STATUS.CRITICAL
                    output = 'On server %s, the user %s does not have ownership on some files in %s.' % (HTMLTag.color_text(self.graphite_hostname), HTMLTag.color_text(self.graphite_user), HTMLTag.color_text(self.graphite_location))
                    if have_extra_line:
                        extra_summary = ' These are the first 10 concerned files/folder: %s' % (HTMLList.simple_list(data))
                    else:
                        extra_summary = ' These are the concerned files/folder: %s' % (HTMLList.simple_list(data))
                else:
                    exit_status = EXIT_STATUS.OK
                    output = 'No problems detected with file permissions.'
                    extra_summary = ''
                
                self.result_add_check(exit_status, output, step_index, step_name, extra_summary=extra_summary)
                return
            except Exception as exp:
                # '<=' and not '==': a retry given as 0 or less must not loop forever
                if retry <= 0:
                    self.result_add_check(EXIT_STATUS.CRITICAL, 'Fail to get server file permission. [%s]' % exp, step_index, step_name)
                    return
    
    
    def meta_check_relay(self):
        """Run the steps done on a relay: port, process and HTTP API."""
        # STEP 1 - check of Graphite port
        self.graphite_port_check(step_index=1, step_name='Port \'%s\' status' % HTMLTag.color_text(self.graphite_port))
        # STEP 2 - check of the carbon process
        self.find_process_carbon_cache(step_index=2, step_name='Process \'%s\' status' % self.graphite_process_name)
        # STEP 3 - check API Metrics
        self.api_graphite_metric_check(step_index=3, step_name='HTTP API status')
    
    
    def meta_check_node(self, node, node_index):
        # type: (Tuple, int) -> None
        """Run all the checks on a destination node, add its line to nodes_summary and report it when it is not OK."""
        check_graphite_node = self.get_check_node(node)
        
        check_graphite_node.do_all_check()
        node_hostname_html = HTMLTag.color_text(u'%s:%s' % (check_graphite_node.graphite_hostname, check_graphite_node.graphite_port))
        # Same fallback as in result_add_check: a status without tag is shown as UNKNOWN instead of raising a KeyError
        node_state_tag = TAG_FOR_STATE.get(check_graphite_node.result.status, TAG_UNKNOWN)
        _summary = u'Node %s ( Address: %s ) Type: %s is %s' % (node_index, node_hostname_html, check_graphite_node.get_mode(), node_state_tag)
        if check_graphite_node.nodes_summary:
            _summary = u'%s %s' % (_summary, HTMLList.simple_list(check_graphite_node.nodes_summary))
        self.nodes_summary.append(_summary)
        
        if check_graphite_node.result.status != EXIT_STATUS.OK:
            self.result.add_check(status=check_graphite_node.result.status, output=_summary)
    
    
    def get_check_node(self, node):
        """Build the CheckGraphiteNode of a destination node (hostname, port, ...).
        
        The node uses our SSH parameters, location, user, thresholds and configuration file path.
        Its process name is left empty, so it is chosen when the node reads its own configuration.
        """
        return CheckGraphiteNode(
            graphite_hostname=node[0],
            graphite_port=node[1],
            ssh_port=self.ssh_port,
            ssh_key_file=self.ssh_key_file,
            passphrase=self.passphrase,
            user=self.user,
            graphite_location=self.graphite_location,
            graphite_user=self.graphite_user,
            storage_usage_warning=self.storage_usage_warning,
            storage_usage_critical=self.storage_usage_critical,
            graphite_process_name='',
            graphite_conf_file=self.graphite_conf_file,
        )
    
    
    def do_all_check(self):
        """Connect to the server, read its configuration and run the steps of its mode (relay or cache).
        
        Nothing is done when the connection fails. The connection is always closed at the end,
        even if a step did raise.
        """
        connect_state = self.connect()
        if connect_state != EXIT_STATUS.OK:
            return
        
        try:
            self.read_carbon_conf()
            
            if self.is_relay:
                self.meta_check_relay()
                for i, node in enumerate(self.nodes):
                    self.meta_check_node(node, i + 1)
            else:
                # STEP 1 - check of Graphite port
                self.graphite_port_check(step_index=1, step_name='Port \'%s\' status' % HTMLTag.color_text(self.graphite_port))
                # STEP 2 - check of carbon-cache process
                self.find_process_carbon_cache(step_index=2, step_name='Process \'%s\' status' % self.graphite_process_name)
                # STEP 3 - check API Metrics
                self.api_graphite_metric_check(step_index=3, step_name='HTTP API status')
                # STEP 4 - check Graphite Storage
                self.graphite_space(step_index=4, step_name='Storage size usage')
                # STEP 6 - check Graphite file permission (there is no step 5 here)
                self.graphite_file_permission(step_index=6, step_name='File permission')
        finally:
            self.close()


class CheckGraphiteNode(CheckGraphite):
    """CheckGraphite of a destination node: a failed connection is reported in the result instead of quitting by default."""
    
    def connect(self, quit_on_fail=False, timeout=None):
        """Open the SSH connection and return the status given by schecks.connect.
        
        The second value returned by schecks.connect is the client when the status is OK;
        otherwise it is added to the summary and to the result with the failure status.
        """
        connect_outcome = schecks.connect(self.graphite_hostname, self.ssh_port, self.ssh_key_file, self.passphrase, self.user, quit_on_fail=quit_on_fail, timeout=timeout)
        state, client_or_message = connect_outcome
        
        if state != EXIT_STATUS.OK:
            self.summary.append(client_or_message)
            self.result.add_check(state, client_or_message)
            return state
        
        self.client = client_or_message
        return state
