#!/usr/bin/python

# -*- coding: utf-8 -*-

# Copyright (C) 2009-2012:
#    Gabes Jean, naparuba@gmail.com
#    Gerhard Lausser, Gerhard.Lausser@consol.de
#    Gregory Starck, g.starck@gmail.com
#    Hartmut Goebel, h.goebel@goebel-consult.de
#
# This file is part of Shinken.
#
# Shinken is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Shinken is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with Shinken.  If not, see <http://www.gnu.org/licenses/>.

"""This Class is a plugin for the Shinken Broker. It is in charge
of sending the service/host perfdata carried by the broks to the Graphite
backend. http://graphite.wikidot.com/start
"""

import json
import os
import re
import struct
import threading
import time
import traceback
import urllib2
from socket import socket, SOL_SOCKET, SO_SNDTIMEO
from threading import Thread, RLock, Event

from shinken.brokermodule import WorkerBasedBrokerModule
from shinken.brokermoduleworker import BrokerModuleWorker
from shinken.load import AvgForFixSizeCall
from shinken.misc.perfdata import PerfDatas
from shinken.misc.type_hint import Optional, Union

# Module description: target daemons, module type and external flag
# (presumably read by the Shinken module loader -- TODO confirm)
properties = {
    'daemons' : ['broker'],
    'type'    : 'graphite_perfdata',
    'external': False,
}

# Number of most recent values kept per metric path while the migration is not done
RETENTION_SIZE_BY_METRIC = 5
# Time window (in seconds) given to the AvgForFixSizeCall timers
ONE_MINUTE = 60
# Brok types accepted by Graphite_broker.want_brok
GRAPHITE_ACCEPTED_BROK_TYPES = ('host_check_result', 'service_check_result')

# Set SHINKEN_DEBUG_PRINT_GRAPHITE_PACKETS=1 in the environment to log (debug level) every flushed packet
PRINT_DEBUG_PACKETS = os.environ.get('SHINKEN_DEBUG_PRINT_GRAPHITE_PACKETS', '0') == '1'


# Called by the plugin manager to get a broker
def get_instance(mod_conf):
    """Build and return the Graphite broker module for the given module configuration."""
    return Graphite_broker(mod_conf)


class GraphiteModuleWorker(BrokerModuleWorker):
    # Class-level defaults. Every one of them is re-assigned per instance by init_worker().
    
    # -- migration state (thread running _migrate and its synchronisation flags)
    _migration_thread = None  # type: Optional[Thread]
    _migration_done = None  # type: Event
    _migration_running = None  # type: Event
    _migration_lock = None  # type: RLock
    _migration_thread_joined = False  # type: bool
    # Compiled regex matching the characters replaced by '_' in names
    illegal_char = None
    # -- timers, averaged over ONE_MINUTE
    con_time = None  # type: AvgForFixSizeCall
    parse_time = None  # type: AvgForFixSizeCall
    other_time = None  # type: AvgForFixSizeCall
    parse_pure_perfdata = None  # type: AvgForFixSizeCall
    # Number of check result broks handled so far
    nb_managed = 0
    # -- Carbon / Graphite endpoints, read from the module configuration
    host = ''
    port = 0
    webapp_port = 0
    tcp_send_timeout = 0
    module_name = ''
    # -- send buffer: packets waiting for the next flush, and time of the last flush
    last_flush = 0
    buffer = []
    ticks = 0
    graphite_data_source = ''
    # [] means no realm filter; None means the parameter was set but empty (rejected by _init_connection)
    realm_store_only = None  # type: Union[str,list,None]
    # -- per-minute statistics, maintained by _update_stats
    last_minute_sent_size = 0
    last_minute_metrics_sent_nb = 0
    last_minute_broks_sent_nb = 0
    current_minute = None
    current_minute_metrics_sent_nb = 0
    current_minute_broks_sent_nb = 0
    current_minute_sent_size = 0
    # Raw perfdata metric name -> sanitized name
    metric_name_cache = {}
    elements_name_cache = {}
    # List of [name based path, uuid based path] pairs posted to the graphite webapp by _migrate
    _migration_table = []
    # Metric path -> last "value timestamp" strings kept while the migration is not done
    _migration_retention_broks_by_path = {}
    external_info = ""
    
    
    def worker_main(self):
        """Worker main loop: log a heartbeat line then sleep 60s, until the worker is interrupted."""
        while True:
            if self.interrupted:
                break
            self.logger.info('[worker:%s]Loop for graphite worker process' % self._worker_id)
            self.interruptable_sleep(60)
    
    
    def init_worker(self, modconf):
        """Initialise the worker state from the module configuration, then connect to Carbon.

        Reads host, port, webapp_port, tcp_send_timeout, module_name,
        graphite_data_source and realm_store_only from `modconf`, resets the
        timers, counters, caches and migration state, and finally opens the
        Carbon connection (retried for up to 10s).
        """
        
        def _int_option(name, default):
            # Options may be given as strings in the configuration
            return int(getattr(modconf, name, default))
        
        
        self.illegal_char = re.compile(r'[^\w-]')
        
        # Timers, averaged over one minute
        for timer_name in ('con_time', 'parse_time', 'other_time', 'parse_pure_perfdata'):
            setattr(self, timer_name, AvgForFixSizeCall(time_limit=ONE_MINUTE))
        self.nb_managed = 0
        
        # Where to send the data
        self.host = getattr(modconf, 'host', 'localhost')
        self.port = _int_option('port', '2003')
        self.webapp_port = _int_option('webapp_port', '80')
        self.tcp_send_timeout = _int_option('tcp_send_timeout', '4')
        self.module_name = getattr(modconf, 'module_name', '')
        
        # Send buffer
        self.last_flush = 0
        self.buffer = []
        self.ticks = 0
        
        # Migration state
        self._migration_thread = None
        self._migration_done = Event()
        self._migration_running = Event()
        self._migration_lock = RLock()
        self._migration_thread_joined = False
        self._migration_table = []
        self._migration_retention_broks_by_path = {}
        
        # optional "sub-folder" in graphite to hold the data of a specific host
        raw_data_source = getattr(modconf, 'graphite_data_source', '')
        self.graphite_data_source = self.illegal_char.sub('_', raw_data_source)
        
        # Realm filtering: unset -> no filter, empty -> bad configuration (None), else the list of realms
        configured_realms = getattr(modconf, 'realm_store_only', '__UNSET__')
        if configured_realms == '__UNSET__':
            self.realm_store_only = []
        elif configured_realms == '':
            self.realm_store_only = None
        else:
            realms = []
            for chunk in configured_realms.split(','):
                chunk = chunk.strip()
                if chunk:
                    realms.append(chunk)
            self.realm_store_only = realms
        
        # Per-minute statistics
        self.current_minute = None
        self.last_minute_sent_size = 0
        self.last_minute_metrics_sent_nb = 0
        self.last_minute_broks_sent_nb = 0
        self.current_minute_sent_size = 0
        self.current_minute_metrics_sent_nb = 0
        self.current_minute_broks_sent_nb = 0
        
        # Name caches
        self.metric_name_cache = {}
        self.elements_name_cache = {}
        
        self.external_info = ""
        
        # at boot, let 10s to find the carbon server
        self._init_connection(timeout=10)
    
    
    # Open the TCP connection to the Carbon server, retrying until the given timeout is exceeded
    def _init_connection(self, timeout=0):
        """Open the TCP connection to the Carbon server and store it in self.con.
        
        Failed attempts are retried every 0.5s until one succeeds or until
        `timeout` seconds have elapsed since the first attempt.
        
        :param timeout: number of seconds during which failed attempts are retried
        :raises Exception: if realm_store_only is None (parameter set but empty)
        :raises IOError: the last connection error, once the timeout is exceeded
        """
        if self.realm_store_only is None:
            raise Exception('The realm_store_only parameter is void. Please set a comma separated list of realms you only want to store. ')
        start = time.time()
        # Send timeout given to SO_SNDTIMEO as two longs (seconds, microseconds), so a stuck server
        # does not lock us. As callers catch and retry once, each try gets half of the configured timeout.
        sendtimeout = struct.pack('ll', max(1, int(self.tcp_send_timeout / 2)), 0)
        # Connection we are replacing (None at the first call): closed once the new one is ready
        previous_con = getattr(self, 'con', None)
        while True:  # will be timeouted
            self.logger.info("I try to init the %s server connection to %s:%d" % (self.get_name(), str(self.host), self.port))
            con = None
            try:
                con = socket()
                con.connect((self.host, self.port))
                con.setsockopt(SOL_SOCKET, SO_SNDTIMEO, sendtimeout)
            except IOError as err:
                # Do not leak one file descriptor for each failed attempt
                if con is not None:
                    con.close()
                if time.time() > start + timeout:
                    self.logger.error("Graphite Carbon instance network socket! IOError:%s" % str(err))
                    raise
                time.sleep(0.5)
                continue
            
            self.con = con
            self.logger.info("Connection successful to  %s:%d" % (str(self.host), self.port))
            if previous_con is not None:
                try:
                    previous_con.close()
                except IOError:
                    # The old connection is already broken, nothing more to release
                    pass
            return
    
    
    def get_raw_stats(self):
        """Return the stats of the parent class, completed with the Graphite worker timers and counters.
        
        If init_worker() has not run yet, the parent stats are returned untouched.
        """
        data = super(GraphiteModuleWorker, self).get_raw_stats()
        
        # get_raw_stats can be called before init_worker. The timers exist as class attributes
        # set to None, so hasattr() is always True: we must look at the values themselves.
        timers = {
            'con_time'           : self.con_time,
            'parse_time'         : self.parse_time,
            'other_time'         : self.other_time,
            'parse_pure_perfdata': self.parse_pure_perfdata,
        }
        if None in timers.values():
            return data
        
        for timer_name, timer in timers.items():
            data[timer_name] = timer.get_avg(avg_on_time=True, with_range_size=False)
        data['nb_managed'] = self.nb_managed
        data['last_minute_sent_size'] = self.last_minute_sent_size
        data['last_minute_metrics_sent_nb'] = self.last_minute_metrics_sent_nb
        data['last_minute_broks_sent_nb'] = self.last_minute_broks_sent_nb
        
        return data
    
    
    # Sending data to Carbon. In case of failure, try to reconnect and send again.
    # If carbon instance is down, or the migration is still running, data are buffered.
    def _send_packet(self, p):
        """Buffer a packet and flush the whole buffer to Carbon when it is time to.
        
        The packet is only buffered while the migration is not done. Once it is
        done, the values retained during the migration are added to the buffer,
        and the buffer is sent if it holds more than 20 packets or if the last
        flush is more than 10s old. On a send failure we reconnect (3s max) and
        retry once; the buffer is only emptied after a successful send.
        
        :param p: metric lines separated by "\\n", without trailing "\\n"
        :raises IOError: if the send fails again after the reconnection
        """
        t0 = time.time()
        now = t0
        
        # if we get back in time, reset last_flush
        if now < self.last_flush:
            self.last_flush = now
        
        # An empty packet would only put a blank line in the stream
        if p:
            self.buffer.append(p)
        
        if not self._is_migration_done():
            self.con_time.update_avg(time.time() - t0)
            return
        
        migration_thread = self._migration_thread
        if not self._migration_thread_joined and migration_thread is not None:
            try:
                migration_thread.join()
                self._migration_thread_joined = True
            except RuntimeError as exp:
                # join() refuses a thread that is not started, or the current thread: best effort, retry next time
                self.logger.debug('[worker:%s] Cannot join the migration thread: %s' % (self._worker_id, exp))
        
        if self._migration_retention_broks_by_path:
            retention_lines = []
            for path_metric, values in self._migration_retention_broks_by_path.items():
                retention_lines.extend("%s %s" % (path_metric, value) for value in values)
            # NOTE: no trailing \n here, the join done at flush time already separates the packets.
            # An extra one would send a blank line to carbon.
            if retention_lines:
                self.buffer.append("\n".join(retention_lines))
            self._migration_retention_broks_by_path = None
        
        # Flush if we have too much buffer elements, or if it's time (force a flush every 10s)
        need_flush = len(self.buffer) > 20 or now > self.last_flush + 10
        
        # Maybe we did not need to flush to graphite now, or there is nothing to send
        if not need_flush or not self.buffer:
            self.con_time.update_avg(time.time() - t0)
            return
        
        # Ok time to flush!
        # NOTE: +\n is important because if not packet1 last line will be stuck to line 1 of packet 2
        whole_packet = '\n'.join(self.buffer) + '\n'
        if PRINT_DEBUG_PACKETS:
            self.logger.debug('GRAPHITE PACKET SEND: ==>%s<==' % whole_packet)
        
        self.last_flush = now
        try:
            self.con.sendall(whole_packet)
        except IOError:
            self.logger.error("[worker:%s] Failed sending data to the Graphite Carbon instance after %.4fs! Trying to reconnect ... " % (self._worker_id, time.time() - t0))
            try:
                self._init_connection(timeout=3)  # let 3s to find the carbon server
                self.con.sendall(whole_packet)
            except IOError:
                self.logger.error('[worker:%s] error in send time after: %.4fs' % (self._worker_id, time.time() - t0))
                raise
        
        # We did send, reset buffer
        self.buffer = []
        
        self.con_time.update_avg(time.time() - t0)
    
    
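    # For a perf_data like /=30MB;4899;4568;1234;0  /var=50MB;4899;4568;1234;0 /toto=
    # return (name, value) couples with names sanitized for Graphite ('/' becomes 'SLASH'):
    # ('SLASH', 30), ('SLASHvar', 50), each followed by its '_warn' / '_crit' couples when both thresholds are set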
    def _get_metric_and_value(self, perf_data):
        """Parse a perfdata string into a list of (graphite metric name, value) couples.
        
        Names are sanitized ('/' becomes 'SLASH', '"' is dropped, every other
        character matched by illegal_char becomes '_') and the result is cached
        by raw name. Metrics whose sanitized name is empty are skipped. When
        both thresholds are set, '<name>_warn' and '<name>_crit' couples follow
        the value couple.
        """
        started_at = time.time()
        parsed_metrics = PerfDatas(perf_data)
        self.parse_pure_perfdata.update_avg(time.time() - started_at)
        
        couples = []
        for metric in parsed_metrics:
            raw_name = metric.name
            try:
                # Name cleaning may already be in the cache
                clean_name = self.metric_name_cache[raw_name]
            except KeyError:
                # Manage / so we don't lose it
                without_slash_and_quote = raw_name.replace('/', 'SLASH').replace('"', '')
                clean_name = self.illegal_char.sub('_', without_slash_and_quote)
                self.metric_name_cache[raw_name] = clean_name
            
            # Skip void name
            if clean_name == '':
                continue
            
            couples.append((clean_name, metric.value))
            
            # Thresholds are sent only if both of them exist
            if metric.warning and metric.critical:
                couples.extend([
                    (clean_name + '_warn', metric.warning),
                    (clean_name + '_crit', metric.critical),
                ])
        
        self.parse_time.update_avg(time.time() - started_at)
        return couples
    
    
    # A service check result brok has just arrived, we UPDATE data info with this
    def manage_service_check_result_brok(self, b):
        """Handle a service check result brok: turn its perfdata into Graphite lines and send them.
        
        The brok is skipped when a realm filter is set and its realm is not in
        it, or when its perfdata gives no metric. Lines are only sent when the
        migration is done (otherwise _store_metric keeps them in retention).
        
        :raises IOError: when the packet cannot be sent to Carbon
        """
        started_at = time.time()
        data = b.data
        
        self.nb_managed += 1
        
        allowed_realms = self.realm_store_only
        realm = data.get('realm', '')
        if allowed_realms and realm not in allowed_realms:
            self.logger.debug('SKIPPING BROK from realm [%s] (%s)' % (realm, self.realm_store_only))
            return
        
        raw_perf_data = data['perf_data']
        parse_start = time.time()
        couples = self._get_metric_and_value(raw_perf_data)
        parse_end = time.time()
        
        # Nothing to send
        if not couples:
            return
        
        check_time = int(data['last_chk'])
        
        # instance_uuid is made of ${host_uuid}-${service_uuid}: "-" becomes "." to build the Graphite path
        path = data['instance_uuid'].replace('-', '.')
        
        # Build a bulk of all metrics at once; each line counts for its length plus its "\n"
        lines = []
        data_length = sum(
            self._store_metric(check_time, lines, "%s.%s" % (path, metric), value) + len('\n')
            for (metric, value) in couples
        )
        # \n between every line, the last one is added at flush time
        packet = '\n'.join(lines)
        self._update_stats(len(couples), data_length)
        build_end = time.time()
        
        if self._is_migration_done():
            try:
                self._send_packet(packet)
            except IOError:
                self.logger.error("[worker:%s] Failed sending to the Graphite Carbon. Data are lost" % self._worker_id)
                # Massive error: instead of retrying the init for EACH metric, let the error go up
                # so the broker restarts us and the retry is done only once a turn (aka 1s)
                raise
        
        # Time spent here, parsing excluded
        self.other_time.update_avg(((parse_start - started_at) + (build_end - parse_end)))
    
    
    # A host check result brok has just arrived, we UPDATE data info with this
    def manage_host_check_result_brok(self, b):
        """Handle a host check result brok: turn its perfdata into Graphite lines and send them.
        
        Host metrics are stored under "<instance_uuid>.__HOST__.<metric>". The
        brok is skipped when a realm filter is set and its realm is not in it,
        or when its perfdata gives no metric. Lines are only sent when the
        migration is done (otherwise _store_metric keeps them in retention).
        
        :raises IOError: when the packet cannot be sent to Carbon
        """
        started_at = time.time()
        data = b.data
        
        self.nb_managed += 1
        
        # If we have a filter and the brok is not in an allowed realm, skip it
        allowed_realms = self.realm_store_only
        realm = data.get('realm', '')
        if allowed_realms and realm not in allowed_realms:
            return
        
        raw_perf_data = data['perf_data']
        
        parse_start = time.time()
        couples = self._get_metric_and_value(raw_perf_data)
        parse_end = time.time()
        
        # Nothing to send
        if not couples:
            return
        
        check_time = int(data['last_chk'])
        host_path = data['instance_uuid']
        
        # Build a bulk of all metrics at once; each line counts for its length plus its "\n"
        lines = []
        data_length = sum(
            self._store_metric(check_time, lines, "%s.__HOST__.%s" % (host_path, metric), value) + len('\n')
            for (metric, value) in couples
        )
        # \n between every line, the last one is added at flush time
        packet = '\n'.join(lines)
        self._update_stats(len(couples), data_length)
        build_end = time.time()
        
        if self._is_migration_done():
            try:
                self._send_packet(packet)
            except IOError:
                self.logger.error("[worker:%s] Failed sending to the Graphite Carbon. Data are lost" % self._worker_id)
                # Massive error: instead of retrying the init for EACH metric, let the error go up
                # so the broker restarts us and the retry is done only once a turn (aka 1s)
                raise
        
        # Time spent here, parsing excluded
        self.other_time.update_avg((parse_start - started_at) + (build_end - parse_end))
    
    
    def _store_metric(self, check_time, lines, path, value):
        """Record one metric value and return the length of its Graphite line.
        
        When the migration is done, the "path value timestamp" line is appended
        to `lines`. Otherwise "value timestamp" is kept in the retention of this
        path, limited to the last RETENTION_SIZE_BY_METRIC entries.
        """
        metric_line = "%s %s %d" % (path, value, check_time)
        
        if self._is_migration_done():
            lines.append(metric_line)  # Necessary to update stats
            return len(metric_line)
        
        # The retention is set to None once it has been flushed
        if self._migration_retention_broks_by_path is None:
            self._migration_retention_broks_by_path = {}
        retention = self._migration_retention_broks_by_path
        
        point = "%s %d" % (value, check_time)
        history = retention.get(path)
        if history is None:
            retention[path] = [point]
        else:
            history.append(point)
            retention[path] = history[-RETENTION_SIZE_BY_METRIC:]
        
        return len(metric_line)
    
    
    def _migrate(self):
        """Ask the graphite webapp to migrate the name based metric paths to uuid based ones.
        
        The migration table is dumped to a json file, then posted by chunks of
        5000 entries to http://<host>:<webapp_port>/migrate/. The migration is
        abandoned if the server cannot be contacted or does not answer json.
        The table is emptied once all chunks were posted.
        
        Meant to be the target of the migration thread: errors are logged and
        never raised, and the migration is flagged as done on every exit path.
        """
        self._set_migration_start()
        header = '*' * 160
        try:
            self.logger.info("[worker:%s] Start to check that all graphite metrics are valid (in uuids format since the 2.5.0 version). Migration table size: %s" % (self._worker_id, len(self._migration_table)))
            if not self._migration_table:
                self.logger.info('[worker:%s] OK No metrics to check' % self._worker_id)
                return
            
            graphite_migration_url = "http://%s:%s/migrate/" % (self.host, self.webapp_port)
            json_data = json.dumps(self._migration_table)
            
            # Keep a copy of what we are about to send
            pth = "/var/log/shinken/graphite-migration-data-worker%s.json" % self._worker_id
            with open(pth, "w") as fd:
                fd.write(json_data)
            self.logger.info('[worker:%s] migration data (size:%s) is saved to the %s file' % (self._worker_id, len(self._migration_table), pth))
            
            chunk_size = 5000
            total_directories = 0
            total_migrated_directories = 0
            for chunk_start in range(0, len(self._migration_table), chunk_size):
                sublist = self._migration_table[chunk_start:chunk_start + chunk_size]
                try:
                    result = urllib2.urlopen(graphite_migration_url, data=json.dumps(sublist))
                except (urllib2.HTTPError, urllib2.URLError) as error:
                    self.logger.error(header)
                    self.logger.error('CRITICAL ERROR:: [worker:%s] Issue during migration to UUIDs : your graphite server could not be contacted at %s : %s ; please check your configuration.' % (self._worker_id, graphite_migration_url, error))
                    self.logger.error(header)
                    return
                try:
                    migration_result = json.load(result)
                except ValueError:  # not a json? graphite is not up to date!
                    self.logger.error(header)
                    self.logger.error('CRITICAL ERROR:: [worker:%s] Issue during migration to UUIDs : your graphite server is not up to date, please install the Shinken Enterprise one.' % self._worker_id)
                    self.logger.error(header)
                    return
                finally:
                    # Always release the HTTP connection, whatever the answer was
                    result.close()
                if result.code != 200:
                    self.logger.error("[worker:%s] Issue during migration to UUIDs : %s" % (self._worker_id, migration_result))
                    break
                if migration_result['error_count'] != 0:
                    self.logger.error(header)
                    self.logger.error(
                        '\nCRITICAL ERROR:: [worker:%s] There was %d errors during the metrics migration. The last error is on the graphite server: %s.\n' % (self._worker_id, migration_result['error_count'], migration_result['last_error']))
                    self.logger.error(header)
                    continue
                total_migrated_directories += migration_result['nb_migrated']
                total_directories += migration_result['ok_count']
            
            # Show the stats only once
            # NOTE(review): these success messages are also logged when a chunk reported errors above -- confirm it is wanted
            if total_migrated_directories != 0:
                self.logger.info('[worker:%s] The graphite did migrate from name to uuids %d directories on a total of %d directories to uuids' % (self._worker_id, total_migrated_directories, total_directories))
            else:
                self.logger.info('[worker:%s] All %d directories on the graphite server are already migrated to uuids.' % (self._worker_id, total_directories))
            self._migration_table = []
            
            self.logger.info("[worker:%s] All metrics UUID paths are checked and are in the uuid format on the graphite server" % self._worker_id)
        except Exception:
            self.logger.error('[worker:%s] CRITICAL: the graphite migration thread did fail: %s' % (self._worker_id, traceback.format_exc()))
        finally:
            # Whatever happened, metrics must not stay blocked behind the migration
            self._set_migration_done()
    
    
    def _set_migration_done(self):
        """Flag the migration as finished: clear the 'running' event then set the 'done' one, under the migration lock."""
        lock = self._migration_lock
        lock.acquire()
        try:
            self._migration_running.clear()
            self._migration_done.set()
        finally:
            lock.release()
    
    
    def _set_migration_start(self):
        """Flag the migration as started: set 'running', clear 'done' and forget any previous join, under the migration lock."""
        lock = self._migration_lock
        lock.acquire()
        try:
            self._migration_running.set()
            self._migration_done.clear()
            self._migration_thread_joined = False
        finally:
            lock.release()
    
    
    def _is_migration_done(self):
        """Return True if the 'done' event is set (read under the migration lock)."""
        with self._migration_lock:
            done = self._migration_done.is_set()
        return done
    
    
    def _is_migration_running(self):
        """Return True if the 'running' event is set (read under the migration lock)."""
        with self._migration_lock:
            running = self._migration_running.is_set()
        return running
    
    
    def _update_migration_table_with_host(self, host):
        """Stack in the migration table the [name based path, uuid based path] pairs of a host and of its checks.
        
        Names are sanitized with illegal_char. The host pair uses the
        "__HOST__" suffix on both sides; check uuids get their "-" replaced
        by ".". A failure while logging an entry is itself only logged.
        """
        sanitize = self.illegal_char.sub
        
        host_part = sanitize('_', host.get_name())
        host_name_path = "%s.__HOST__" % host_part
        self._migration_table.append([host_name_path, "%s.__HOST__" % host.get_uuid()])
        try:
            self.logger.debug('[worker:%s] Stacking Host information to verify UUIDs in graphite:  %s/%s (current size:%s)' % (
                self._worker_id, host_name_path, host.get_uuid(), len(self._migration_table)))
        except Exception as exp:
            self.logger.debug('Cannot log a Host information (%s)' % exp)
        
        for check_uuid, check in host.get_checks().iteritems():
            check_part = sanitize('_', check.get_name())
            check_name_path = "%s.%s" % (host_part, check_part)
            self._migration_table.append([check_name_path, '%s' % (check_uuid.replace('-', '.'))])
            try:
                self.logger.debug('[worker:%s] Stacking Check information to verify UUIDs in graphite:  %s/%s (current size:%s)' % (
                    self._worker_id, check_name_path, check_uuid.replace("-", "."), len(self._migration_table)))
            except Exception as exp:
                self.logger.debug('Cannot log a Check information (%s)' % exp)
    
    
    def _update_stats(self, nblines, size):
        """Add one brok to the statistics of the current minute.
        
        When the minute changes, the current counters become the "last minute"
        ones and are reset before the new values are added.
        
        :param nblines: number of metrics of the brok
        :param size: size of the metric lines of the brok
        """
        # Floor division: the minute number must stay an integer whatever the division
        # semantics are, if not the counters would be switched at each call
        cur_min = int(time.time()) // 60
        # Ok we just switch minute, switch counters
        if cur_min != self.current_minute:
            self.current_minute = cur_min
            self.last_minute_metrics_sent_nb = self.current_minute_metrics_sent_nb
            self.last_minute_sent_size = self.current_minute_sent_size
            self.last_minute_broks_sent_nb = self.current_minute_broks_sent_nb
            self.current_minute_metrics_sent_nb = 0
            self.current_minute_sent_size = 0
            self.current_minute_broks_sent_nb = 0
        # And now sum on the new value
        self.current_minute_metrics_sent_nb += nblines
        self.current_minute_sent_size += size
        self.current_minute_broks_sent_nb += 1
    
    
    def callback__a_new_host_added(self, host_uuid):
        """A new host was added: stack it and its checks in the migration table."""
        new_host = self.get_host_from_uuid(host_uuid)
        self._update_migration_table_with_host(new_host)
        message = '[worker:%s] New host received: %s' % (self._worker_id, new_host.get_name())
        self.logger.debug(message)
    
    
    def callback__a_host_updated(self, host_uuid):
        """A host was updated: stack it and its checks in the migration table."""
        updated_host = self.get_host_from_uuid(host_uuid)
        self._update_migration_table_with_host(updated_host)
        message = '[worker:%s] Updated host received: %s' % (self._worker_id, updated_host.get_name())
        self.logger.debug(message)
    
    
    def _launch_migration(self, realm_name):
        """Start the migration thread, after waiting for a running migration to finish.
        
        :param realm_name: name of the realm that triggered the migration (only used in the log)
        """
        if self._is_migration_running():
            self._migration_done.wait()
            
            previous_thread = self._migration_thread
            if not self._migration_thread_joined and previous_thread is not None:
                try:
                    previous_thread.join()
                    self._migration_thread_joined = True
                except RuntimeError as exp:
                    # join() refuses a thread that is not started, or the current thread: best effort
                    self.logger.debug('[worker:%s] Cannot join the previous migration thread: %s' % (self._worker_id, exp))
        
        self._migration_thread = threading.Thread(target=self._migrate, name="migration_thread")
        # Flag the migration as started right now and not only inside the thread: if not, another
        # launch asked before the thread really runs would not see it and start a concurrent migration
        self._set_migration_start()
        try:
            self._migration_thread.start()
        except Exception:
            # No thread will ever flag the end of this migration, so do not block the metrics
            self._set_migration_done()
            raise
        self.logger.info('[worker:%s] ******* We have all information for a realm %s, we can start to check if graphite is up to date with the metrics UUIDS. (migration size: %s)' % (self._worker_id, realm_name, len(self._migration_table)))
    
    
    def callback__a_new_realm_added(self, realm_name):
        """A new realm was added: launch the migration of the graphite metrics."""
        return self._launch_migration(realm_name)
    
    
    def callback__a_realm_updated(self, realm_name):
        """A realm was updated: launch the migration of the graphite metrics."""
        return self._launch_migration(realm_name)


# Class for the Graphite Broker
# Get broks and send them to a Carbon instance of Graphite
class Graphite_broker(WorkerBasedBrokerModule):
    """Broker module for Graphite: its workers (GraphiteModuleWorker) get the check result
    broks and send their perfdata to a Carbon instance.
    """
    MODULE_WORKER_CLASS = GraphiteModuleWorker
    
    
    def __init__(self, modconf):
        WorkerBasedBrokerModule.__init__(self, modconf)
    
    
    # We only want to manage 2 types of broks
    def want_brok(self, brok):
        """Return True for the brok types listed in GRAPHITE_ACCEPTED_BROK_TYPES."""
        return brok.type in GRAPHITE_ACCEPTED_BROK_TYPES
    
    
    def get_raw_stats(self, param=''):
        """Return the stats of the parent class, completed with the sums of the workers stats.
        
        Workers without data are ignored, and a stat missing in a worker
        (a worker that is not initialised yet only gives the parent stats)
        counts for 0 instead of breaking the whole stats.
        """
        data = super(Graphite_broker, self).get_raw_stats(param=param)
        
        # worker stat name -> sum over all the workers
        totals = {
            'con_time'                   : 0.0,
            'parse_time'                 : 0.0,
            'other_time'                 : 0.0,
            'parse_pure_perfdata'        : 0.0,
            'last_minute_sent_size'      : 0,
            'last_minute_metrics_sent_nb': 0,
            'last_minute_broks_sent_nb'  : 0,
            # number of broks managed by the workers, was never summed before (always 0)
            'nb_managed'                 : 0,
        }
        
        for worker_data in data.get('workers', {}).itervalues():
            if not worker_data:
                continue
            
            for stat_name in totals:
                totals[stat_name] += worker_data.get(stat_name, 0)
        
        data.update({
            'nb_workers'                    : self._nb_workers,
            'last_minute_nb_metric_sent'    : totals['last_minute_metrics_sent_nb'],
            'last_minute_broks_sent_nb'     : totals['last_minute_broks_sent_nb'],
            'last_minute_sent_size'         : totals['last_minute_sent_size'],
            'cumulative_connection_time'    : totals['con_time'],
            'cumulative_parse_time'         : totals['parse_time'],
            'cumulative_parse_pure_perfdata': totals['parse_pure_perfdata'],
            'cumulative_other_time'         : totals['other_time'],
            'cumulative_broks'              : totals['nb_managed'],
        })
        return data
