#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
# Copyright (C) 2009-2016:
#     Gabes Jean, naparuba@gmail.com
#     Gerhard Lausser, Gerhard.Lausser@consol.de
#     Gregory Starck, g.starck@gmail.com
#     Hartmut Goebel, h.goebel@goebel-consult.de
#     Martin Benjamin, b.martin@shinken-solutions.com
#
# This file is part of Shinken.
#
# Shinken is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Shinken is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with Shinken.  If not, see <http://www.gnu.org/licenses/>.


import base64
import cPickle
import copy
import json
import logging
import os
import socket
import sys
import threading
import time
import traceback
import zlib
from Queue import Empty
from multiprocessing import active_children, cpu_count
from threading import RLock

from .action import ACTION_TYPES
from .daemon import Interface, IStatsInterface
from .executor_stats import ExecutorStats
from .http_client import HTTPExceptions
from .load import AvgInRange
from .log import logger, get_chapter_string, LoggerFactory
from .misc.type_hint import TYPE_CHECKING
from .property import BoolProp
from .runtime_stats.cpu_stats import cpu_stats_helper
from .runtime_stats.threads_dumper import WatchDogThreadDumper
from .safepickle import SafeUnpickler
from .satellite import BaseSatellite
from .util import from_int_micro_sec_to_float_sec, get_memory_used_percent, get_cpu_running_procs
from .worker import Worker, WORKER_MAX_RAM_PERCENT, WORKER_MAX_CPU_QUEUE_PER_CPU, WORKER_MAX_LOAD_QUEUE
from .worker_stats import InMainProcessStatsAggregateAndPrinter

if TYPE_CHECKING:
    from .misc.type_hint import List, Dict, Any, Optional
    from .log import PartLogger
    from .action import Action

# NOTE(review): not referenced in the visible part of this file; presumably the number of values kept per command for the execution stats - confirm
_NB_VALUE_SAVE_FOR_EXEC_STAT = 10
# Template of the JSON file where the execution stats are persisted; '%s' receives the daemon name reduced to its alphanumeric characters
_PATH_EXEC_STAT = '/var/lib/shinken/poller_exec_stat_%s.dat'

# NOTE(review): these look like field indexes of a "check in timeout" record (name, time, at, nb); they are not used in the visible code - confirm
_TIMEOUT_NAME = 0
_TIMEOUT_TIME = 1
_TIMEOUT_AT = 2
_TIMEOUT_NB = 3

# NOTE(review): not used in the visible code; logging.NOTSET is the lowest standard logging level - confirm the intent
_BEACON_LOG_LEVEL = logging.NOTSET

# NOTE(review): presumably the expected duration (in seconds) of one worker loop; not used in the visible code - confirm
WORKER_EXPECTED_LOOP_TIME = 0.5

# Chapter strings built once at import time; the ones referenced in the visible code are used as prefixes of log lines
CHAPTER_STATS = get_chapter_string(u'STATS')
CHAPTER_CONNECTION = get_chapter_string(u'CONNECTION')
CHAPTER_CHECKS = get_chapter_string(u'CHECKS')
CHAPTER_ACTION = get_chapter_string(u'ACTIONS')
CHAPTER_ACTION_RESULT = get_chapter_string(u'ACTIONS RESULTS')
CHAPTER_CHECKS_RESULT = get_chapter_string(u'CHECKS RESULTS')
CHAPTER_GET = get_chapter_string(u'GET')
CHAPTER_RECEIVED = get_chapter_string(u'RECEIVED')
CHAPTER_PUSHED = get_chapter_string(u'PUSHED')
CHAPTER_GIVEN = get_chapter_string(u'GIVEN')

# Logger named 'CONFIGURATION', used for the messages about configuration handling (see Executor.wait_new_conf)
logger_configuration = LoggerFactory.get_logger(u'CONFIGURATION')


class ACTION_DIRECTIONS(object):
    """Names of the two ways actions can reach a poller/reactionner."""
    
    # The poller/reactionner asks the other daemons for jobs
    ACTIVE_DIRECTION = 'active'
    # Another daemon sends the jobs to the poller/reactionner
    PASSIVE_DIRECTION = 'passive'


class NotWorkerMod(Exception):
    """Raised to tell that we are facing a standard module and not a worker one."""


class IStatsExecutor(IStatsInterface):
    # HTTP interface giving various stats about the executor (poller/reactionner) activity
    
    def get_raw_stats(self, param=u''):
        # type: (unicode) -> Dict[unicode, Any]
        # Plain delegation to the parent interface implementation
        parent_interface = super(IStatsExecutor, self)
        return parent_interface.get_raw_stats(param=param)
    
    
    get_raw_stats.doc = u'get stats of the daemon'
    get_raw_stats.need_lock = False
    
    
    # Build the executor specific part of the raw stats: one entry by scheduler managed by the arbiter,
    # the execution/timeout stats, the load indicators and the state flags of the daemon.
    # The time spent to build them is logged (in INFO instead of DEBUG when it is too long).
    def _daemon_get_raw_stats(self, param=u''):
        # type: (unicode) -> Dict[unicode, Any]
        start = time.time()
        after_common = time.time()
        
        app = self.app  # type: Executor
        
        # Connection state of each scheduler, the entries not managed by the arbiter are not listed
        stats_schedulers = []
        for scheduler in app.schedulers.itervalues():
            if not scheduler.get(u'unmanaged_by_arbiter', False):
                _uri = u'%s:%s' % (scheduler[u'address'], scheduler[u'port'])
                stats_schedulers.append({
                    u'addr'   : _uri,
                    u'name'   : scheduler[u'name'],
                    u'con'    : (scheduler.get(u'con', None) is not None),
                    u'info'   : scheduler.get(u'con_info', u'Poller did not try to connect to scheduler [%s]' % _uri),
                    u'latency': scheduler.get(u'con_latency', -1)
                })
        
        list_timeout_actions = app.get_timeout_stats()
        exec_stats = app.get_execution_stats()
        checks_top_usage = app.get_top_usage()
        module_stats = self._get_module_stats(getattr(app, u'modules_manager', None), param)
        
        # Load = cpu time we did get / cpu time we did ask for (0 when nothing was asked)
        asking_cpu_time_get_avg = app.avg_asking_cpu_time.get_avg(0)
        if asking_cpu_time_get_avg == 0:
            executor_load = 0
        else:
            executor_load = app.avg_get_cpu_time.get_avg(0) / asking_cpu_time_get_avg
        
        raw_stats = {
            u'tags'                   : self.get_managed_tags(),
            u'realm'                  : app.realm,
            u'type'                   : u'PASSIVE' if app.passive else u'ACTIVE',
            u'have_conf'              : app.cur_conf is not None,
            u'nb_action_done_per_sec' : app.get_nb_finished_actions_average(),
            u'nb_cpus'                : app.nb_cpus,
            u'schedulers'             : stats_schedulers,
            u'checks_top_usage'       : checks_top_usage,
            u'nb_check_in_timeout'    : len(list_timeout_actions),
            u'checks_in_timeout'      : list_timeout_actions,
            u'keep_timeout_time'      : app.get_keep_timeout_time(),
            u'exec_stats'             : exec_stats,
            u'executor_load'          : executor_load,
            u'cpu_usage'              : app.get_nb_finished_actions_cpu_time_average(),
            u'dead_worker_stat'       : app.dead_worker_stat,
            u'workers_restarts'       : app.last_restarts,
            u'http_errors_count'      : app.http_errors_count,
            u'last_action_launch_time': app.last_action_launch_time,
            u'module_stats'           : module_stats,
            u'ram_usage'              : app.avg_ram_percent.get_avg(default_value=get_memory_used_percent() * 100),
            u'max_ram_usage'          : app.max_ram_percent,
            u'cpu_running_queue'      : app.get_cpu_running_queue(),
            u'max_cpu_queue_par_cpu'  : app.max_cpu_queue_per_cpu,
            u'platform'               : app.platform,
            u'activated'              : app.activated,
            u'spare'                  : app.spare,
            u'executor_in_overload'   : app.executor_in_overload,
        }
        
        end = time.time()
        total_time = end - start
        # The compute time is logged in INFO only if it is higher than display_statistics_compute_time_if_higher (cfg parameter, in ms)
        if total_time >= app.display_statistics_compute_time_if_higher / 1000.0:
            log_f = logger.info
            add_line = u'(NOTE: log is displayed in INFO because %.3f is higher than display_statistics_compute_time_if_higher=%sms in the daemon cfg)' % (total_time, app.display_statistics_compute_time_if_higher)
        else:
            log_f = logger.debug
            add_line = ''
        log_f(u'%s Daemon stats were computed in %.3fs (%.3f for daemon common part, %.3f for %s part) %s' % (CHAPTER_STATS, total_time, after_common - start, end - after_common, app.daemon_type, add_line))
        
        return raw_stats
    
    
    # Sorted list, without duplicates, of the tags we manage: the poller tags if we do checks
    # plus the reactionner tags if we do actions
    def get_managed_tags(self, param=u''):  # noqa => no problem for not using the param
        tags = self.app.poller_tags if self.app.do_checks is True else []
        
        if self.app.do_actions is True:
            tags = tags + self.app.reactionner_tags
        
        return sorted(set(tags))
    
    
    get_managed_tags.doc = u'Return managed tags for poller or reactionner'
    get_managed_tags.need_lock = True


class ISynchronizer(Interface):
    """
    Interface used by the Synchronizers for the try check
    """
    
    
    # A synchronizer pushes us actions to do. The first time we see this sync_id, an entry
    # flagged as 'unmanaged_by_arbiter' is created for it in the schedulers of the daemon.
    def synchronizer_push_actions(self, actions, sync_id):
        known_entries = self.app.schedulers
        if sync_id not in known_entries:
            known_entries[sync_id] = {
                'active'              : False,
                'actions'             : {},
                'wait_homerun'        : {},
                'uri'                 : 'localhost',
                'wait_homerun_lock'   : threading.RLock(),
                'unmanaged_by_arbiter': True,
                'thread'              : None,
                'instance_id'         : sync_id,
                'name'                : sync_id,
            }
        
        synchronizer_entry = known_entries[sync_id]
        self.app.add_actions(actions, synchronizer_entry, actions_direction=ACTION_DIRECTIONS.PASSIVE_DIRECTION)
    
    
    synchronizer_push_actions.method = 'POST'
    synchronizer_push_actions.doc = 'Push new actions'
    synchronizer_push_actions.display_name = u'Push of Check from Synchronizer for a Try Check'
    
    
    # A synchronizer asks for the results of its actions: they are pickled, zlib compressed then base64 encoded
    def synchronizer_get_returns(self, sync_id):
        results = self.app.get_return_for_passive(sync_id)
        return base64.b64encode(zlib.compress(cPickle.dumps(results)))
    
    
    synchronizer_get_returns.doc = 'Get the returns of the actions'


class ISchedulers(Interface):
    """
    Interface for Schedulers
    If we are passive, they connect to this and send/get actions
    """
    
    # NOTE: 'doc' is re-assigned before each method, and copied into the 'doc' attribute of
    # the method just after its definition.
    doc = 'Push new actions from the scheduler (internal)'
    
    
    # A Scheduler sends me actions to do.
    # Returns True when the actions were accepted, and False when the scheduler is unknown:
    # * quietly (INFO log) when we do not have any configuration or scheduler yet, or when the configuration did change recently
    # * with ERROR logs and an increase of the http error counter of the current hour otherwise
    def push_actions(self, actions, sched_id):
        scheduler_link = self.app.get_link_from_type(u'scheduler', sched_id)
        if scheduler_link is not None:
            self.app.add_actions(actions, scheduler_link, actions_direction=ACTION_DIRECTIONS.PASSIVE_DIRECTION)
            return True
        
        scheduler_from = self.app.http_daemon.bottle.request.headers.get(u'X-Forwarded-For', u'unknown scheduler')
        # Maybe the daemon do not have any configuration currently (just started or maybe arbiter is starting to dispatch)
        # in this case, do not raise errors at all, or if the arbiter talk us since not long, maybe we are in a configuration
        # change in progress
        if not self.app.have_configuration or not self.app.get_any_link_from_type(u'scheduler') or self.app.is_configuration_change_recent():
            logger.info(u'The scheduler from %s (shard_id=%s) pushed me actions but is unknown. We can maybe not have any configuration or just have it, so we are refusing theses actions.' % (scheduler_from, sched_id))
            return False
        
        logger.error(u'A scheduler push me actions but it is not in my configuration so I drop these actions')
        logger.error(u'Maybe I\'m not ready or there is a rogue scheduler')
        logger.error(u'Unknown Scheduler detected with host name:[%s] and sched_id[%s]' % (scheduler_from, sched_id))
        # all app daemons will have this but for schedulers app can be Scheduler and not a daemon
        if hasattr(self.app, u'http_errors_count'):
            # HTTP errors are counted by hour (number of hours since epoch). Explicit floor division,
            # so the key is an integer whatever the division mode is.
            # NOTE(review): the old hours are not removed here, the 24 hours limit must be enforced elsewhere - confirm
            hours_since_epoch = int(time.time()) // 3600
            if hours_since_epoch in self.app.http_errors_count:
                self.app.http_errors_count[hours_since_epoch] += 1
            else:
                self.app.http_errors_count[hours_since_epoch] = 1
        return False
    
    
    push_actions.method = 'POST'
    push_actions.doc = doc
    push_actions.display_name = u'Scheduler push checks/notifications/event handlers to passive poller/reactionners'
    
    doc = 'Get the returns of the actions (internal)'
    
    
    # A scheduler ask us the action return value: they are pickled, zlib compressed then base64 encoded
    def get_returns(self, sched_id):
        get_checks_return = self.app.get_return_for_passive(int(sched_id))
        
        get_checks_return = cPickle.dumps(get_checks_return)
        get_checks_return = zlib.compress(get_checks_return)
        get_checks_return = base64.b64encode(get_checks_return)
        
        return get_checks_return
    
    
    get_returns.doc = doc
    
    doc = 'Get the limit for push_actions '
    
    
    # A scheduler ask us the limit we did compute for its next push, the answer is pickled
    def get_request_limit_cpu(self):
        ret = self.app.compute_request_limit()
        return cPickle.dumps(ret)
    
    
    # FIX: the doc was set on get_returns (overwriting its own doc) instead of get_request_limit_cpu
    get_request_limit_cpu.doc = doc
    
    doc = 'Check if the scheduler ID is known by '
    
    
    # Returns 'True' or 'False' (as a string) if the scheduler id is one of our schedulers or not
    def is_scheduler_known(self, sched_id):
        sched_id = int(sched_id)
        return str(sched_id in self.app.schedulers)
    
    
    is_scheduler_known.doc = doc


# Class for reactionner, poller
class Executor(BaseSatellite):
    properties = BaseSatellite.properties.copy()
    properties.update({
        'debug_display_launched_command': BoolProp(default='0'),
    })
    
    # Must be overloaded by subclass, by default we are doing nothing
    do_checks = False  # I do checks?
    do_actions = False  # and other? (notif, event handlers, etc)
    
    managed_actions_str = '_NOT_SET_'  # for display of what we are managing
    
    
    def __init__(self, name, config_file, is_daemon, do_replace, debug, debug_file, daemon_id=0):
        """
        Build the initial state of the executor: HTTP interfaces, statistics holders, locks and default values.
        
        All the parameters are given untouched to BaseSatellite.__init__.
        """
        super(Executor, self).__init__(name, config_file, is_daemon, do_replace, debug, debug_file, daemon_id)
        logger.debug(u'[executor] executor init.')
        
        self.realm = 'not set'
        self.modules = []
        
        # dict of active workers
        # first key is the worker type (the u'fork' one is added by _do_post_daemon_init)
        self._workers = {}  # type: Dict[unicode,Dict[int,Worker]]
        
        # HTTP interfaces: for the schedulers, for the synchronizers (try check) and for the stats
        self._add_http_interface(ISchedulers(self))
        self._add_http_interface(ISynchronizer(self))
        self._add_http_interface(IStatsExecutor(self))
        
        self.debug_display_launched_command = False
        
        try:
            self.nb_cpus = cpu_count()
        except Exception:
            # not implemented?
            self.nb_cpus = 4
        
        self.cpu_percent = {}
        
        # The cpu time limit starts with the number of cpus
        self._cpu_time_limit = self.nb_cpus
        self._round_robin_counter = 0
        
        # The executor stats are in their own object (how many checks are between 10ms->50ms, etc.)
        self._executor_stats = ExecutorStats()
        
        self._workers_stats_aggregator_and_printer = InMainProcessStatsAggregateAndPrinter(self.managed_actions_str)
        
        # Execution stats by action hash: the u'average' entry is read by really_add_actions, and the
        # whole dict is saved/loaded as JSON by _save_exec_stat/_load_exec_stat
        self._exec_stat = {}
        
        # Actions received but not given to a worker yet, always accessed under its lock
        self._actions_to_give_at_worker = []
        self._actions_to_give_at_worker_lock = threading.RLock()
        self._tick_count = 0
        
        # Stats about loop time and number of action we did grok from the schedulers,
        # so we can know about our internal loop, and limit it if we need (if > 1sec)
        self.avg_loop_time = AvgInRange(60)
        self.executor_in_overload = False
        
        # Cpu time we did ask to the schedulers / cpu time we did get from them (see really_add_actions)
        self.avg_asking_cpu_time = AvgInRange(60)
        self.avg_get_cpu_time = AvgInRange(60)
        
        # The daemon need to keep a count of the actions it needs to push back to schedulers
        self._to_give_back_actions_nb_avg = AvgInRange(60)
        self._to_give_back_actions_cpu_time_avg = AvgInRange(60)
        
        self.avg_ram_percent = AvgInRange(60, get_memory_used_percent())
        
        # Values of the last call of really_add_actions
        self.raw_nb_action_todo_by_sec = 0
        self.raw_cpu_time_action_todo_by_sec = 0
        
        self._action_with_stat = 0
        
        # Set in _setup_new_conf
        self.passive = False
        
        # NOTE(review): the negative values below look like "not configured yet" markers - confirm
        self._min_workers = 0
        self._processes_by_worker = -1
        self._polling_interval = -1
        self._timeout = -10
        self.keep_timeout_time = 1200
        self.max_ram_percent = WORKER_MAX_RAM_PERCENT
        self._cpu_running_queue = AvgInRange(60, self.get_cpu_running_queue())
        self.max_cpu_queue_per_cpu = WORKER_MAX_CPU_QUEUE_PER_CPU
        
        self.dead_worker_stat = {}
        self.last_restarts = {}
        self.last_restarts_lock = RLock()
        self.last_restarts_keep = 86400  # lasts 24h (86400 seconds) of restarts
        
        # The 'None' tag means "not tagged" (see __print_tag_line)
        self.poller_tags = ['None']
        self.reactionner_tags = ['None']
        self._max_plugins_output_length = 8192
        
        # Our arbiters
        self.arbiters = {}
        
        # Our pollers and reactionners
        self.pollers = {}
        self.reactionners = {}
        self.receivers = {}
        
        # We will have a tread by distant satellites, so we must protect our access
        self.satellite_lock = threading.RLock()
        
        self.request_limit = 0  # will know if we must get new checks or not
        
        self.actions_lock = threading.RLock()
        
        self.have_modules = False  # to know if we must update or just load modules in the modules manager
        self.platform = os.name
        
        self.last_action_launch_time = -1
        
        # If the get_raw_stats call is higher to this ms, then display as INFO
        self.display_statistics_compute_time_if_higher = 100
        self.mainloop_watchdog = WatchDogThreadDumper(u'Main loop', wait_time=60 * 30, dump_interval=60 * 5, fatal_dead_lock_delay=60 * 30, multi_usage=True)
        self._loop_number = 1
        # Loggers for the loop times, the first part is named after the class of the daemon
        self._loop_logger = LoggerFactory.get_logger().get_sub_part(u'%s TIME' % self.__class__.__name__.upper())
        self._loop_logger_workers = self._loop_logger.get_sub_part(u'WORKERS')
        self._loop_logger_start = self._loop_logger.get_sub_part(u'=== Loop start ===')
        self._loop_logger_stop = self._loop_logger.get_sub_part(u'=== Loop stop  ===')
    
    
    def wait_new_conf(self):
        """Do the parent wait_new_conf job, then stop all our workers until a new configuration is received."""
        parent = super(Executor, self)
        parent.wait_new_conf()
        logger_configuration.info(u'Stopping our workers as the arbiter ask us to wait until we receive a new configuration')
        self._do_stop_workers()
    
    
    def get_return_for_passive(self, sched_id):
        """
        Give (and forget) all the results waiting for the scheduler sched_id.
        
        Returns an empty list if this scheduler is unknown, and the list of the values of its
        'wait_homerun' dict otherwise. This dict is cleared under its lock.
        """
        if sched_id not in self.schedulers:
            # I do not know this scheduler
            logger.debug(u'[executor][%s] I do not know this scheduler: [%s]' % (self.name, sched_id))
            return []
        
        sched = self.schedulers[sched_id]
        
        # Take a copy of the waiting results, and clear the dict, in the same lock
        with sched['wait_homerun_lock']:
            waiting_results = sched['wait_homerun']
            ret = list(waiting_results.values())
            waiting_results.clear()
        
        if not ret:
            return ret
        
        if self.do_actions:
            nb_notifs = sum(1 for action in ret if action.is_a == u'notification')
            nb_events = sum(1 for action in ret if action.is_a == u'eventhandler')
            logger.info(u'%s %s %s %d action\'s result(s) given to answer scheduler request [%d notifications / %d events] ' % (CHAPTER_ACTION_RESULT, get_chapter_string(sched['name']), CHAPTER_GIVEN, len(ret), nb_notifs, nb_events))
        elif self.do_checks:
            logger.info(u'%s %s %s %d check\'s result(s) given to answer scheduler request ' % (CHAPTER_CHECKS_RESULT, get_chapter_string(sched['name']), CHAPTER_GIVEN, len(ret)))
        return ret
    
    
    def _do_stop_workers(self):
        """Terminate all our workers (waiting up to 1s for each of them), then forget them."""
        all_workers = self._get_all_workers()
        logger.info('Stopping all my %d workers' % len(all_workers))
        for worker in all_workers:
            try:
                worker.terminate()
                worker.join(timeout=1)
            except (AttributeError, AssertionError):
                # An already dead worker or in a worker
                pass
        
        # No more workers, if we need, we will create new ones later
        for workers_of_type in self._workers.values():
            workers_of_type.clear()
    
    
    def do_stop(self):
        """
        Main stop of this daemon: quit the main loop watchdog, save the execution stats,
        stop all the workers and finally call the stop of the parent class.
        """
        self.mainloop_watchdog.quit()
        self._save_exec_stat()
        self._do_stop_workers()
        
        # Our own things are stopped, let the satellite code do its part
        parent = super(Executor, self)
        parent.do_stop()
    
    
    def add(self, elt):
        """
        Store elt in the container matching its type (elt.__class__.my_type):
        the broks go in self.broks, the external commands in self.external_commands.
        The other types are ignored.
        """
        kind = elt.__class__.my_type
        if kind == u'externalcommand':
            logger.debug(u'Enqueuing an external command \'%s\'' % str(elt.__dict__))
            with self.external_commands_lock:
                self.external_commands.append(elt)
        elif kind == u'brok':
            # For brok, we TAG brok with our instance_id
            # TODO: better tag ID : use poller name
            elt.instance_id = 0
            self.broks[elt.id] = elt
    
    
    def add_actions(self, actions, sat_entry, request_limit=-1, actions_direction=ACTION_DIRECTIONS.ACTIVE_DIRECTION):
        """Same as really_add_actions, but done while holding the lock of the actions waiting for a worker."""
        queue_lock = self._actions_to_give_at_worker_lock
        with queue_lock:
            self.really_add_actions(actions, sat_entry, request_limit, actions_direction=actions_direction)
    
    
    def really_add_actions(self, actions, sat_entry, request_limit=-1, actions_direction=ACTION_DIRECTIONS.ACTIVE_DIRECTION):
        """
        Add a list of actions to the queue of the actions waiting for a worker.
        
        The caller is expected to hold self._actions_to_give_at_worker_lock (see add_actions).
        
        :param actions: the actions to queue; the ones whose uuid is already in sat_entry[u'actions'] are skipped
        :param sat_entry: entry (dict) of the daemon giving the actions: its u'name', u'actions' and u'instance_id' keys are read
        :param request_limit: cpu time that was asked to this daemon, -1 when nothing was asked; the
                              asking/get cpu time averages are updated only when it is set
        :param actions_direction: ACTION_DIRECTIONS.ACTIVE_DIRECTION if we did ask for these actions, the
                                  PASSIVE_DIRECTION one if they were pushed to us. Only the log message depends on it.
        """
        scheduler_name = sat_entry[u'name']
        total_cpu_time_give = 0
        action_with_stat = 0
        nb_action_todo = 0
        nb_action_event = 0
        nb_action_notification = 0
        nb_action_check = 0
        
        for action in actions:  # type: Action
            if action.get_uuid() in sat_entry[u'actions']:
                continue
            # we try to set average_cpu_time
            try:
                action_hash = action.get_hash()
                action.average_cpu_time = self._exec_stat[action_hash][u'average']
                total_cpu_time_give += action.average_cpu_time
                action_with_stat += 1
            except Exception:
                # Best effort: no (usable) execution stat for this action, so it is queued without
                # average cpu time. Only Exception is caught: a bare except did also swallow
                # KeyboardInterrupt and SystemExit.
                pass
            if action.is_a == ACTION_TYPES.EVENTHANDLER:
                logger.debug(u'[EVENTHANDLER] stacking event handler id %s to be executed' % action.get_uuid())
                nb_action_event += 1
            elif action.is_a == ACTION_TYPES.NOTIFICATION:
                nb_action_notification += 1
            elif action.is_a == ACTION_TYPES.CHECK:
                nb_action_check += 1
            
            action.sched_id = sat_entry[u'instance_id']
            self._actions_to_give_at_worker.append(action)
            nb_action_todo += 1
        
        # Log must change if we ask or receive action
        if actions_direction == ACTION_DIRECTIONS.ACTIVE_DIRECTION:  # ask for actions
            if self.do_actions:
                logger.info(u'%s %s %s Requesting actions todo from this scheduler for %.3fs cpu time  [received=%d notification(s) / %d event(s) for %.3fs cpu time]' % (
                    CHAPTER_ACTION, get_chapter_string(scheduler_name), CHAPTER_GET, self.request_limit, nb_action_notification, nb_action_event, total_cpu_time_give))
            if self.do_checks:
                logger.info(u'%s %s %s Requesting checks todo from this scheduler for %.3fs cpu time [received=%d check(s) for %.3fs cpu time]' % (
                    CHAPTER_CHECKS, get_chapter_string(scheduler_name), CHAPTER_GET, self.request_limit, nb_action_check, total_cpu_time_give))
        else:
            if self.do_actions:
                logger.info(u'%s %s %s We received actions todo from this scheduler for %.3fs cpu time [received=%d notification(s) / %d event(s) for %.3fs cpu time]' % (
                    CHAPTER_ACTION, get_chapter_string(scheduler_name), CHAPTER_RECEIVED, self.request_limit, nb_action_notification, nb_action_event, total_cpu_time_give))
            if self.do_checks:
                logger.info(u'%s %s %s We received checks todo from this scheduler for %.3fs cpu time [received=%d check(s) for %.3fs cpu time]' % (
                    CHAPTER_CHECKS, get_chapter_string(scheduler_name), CHAPTER_RECEIVED, self.request_limit, nb_action_check, total_cpu_time_give))
        
        # Keep the values of this call
        self.raw_nb_action_todo_by_sec = nb_action_todo
        self.raw_cpu_time_action_todo_by_sec = total_cpu_time_give
        self._action_with_stat = action_with_stat
        
        # The load averages have a meaning only if we did ask for a cpu time and if we know the cost of some actions
        if request_limit != -1 and action_with_stat != 0:
            self.avg_asking_cpu_time.update_avg(request_limit)
            self.avg_get_cpu_time.update_avg(total_cpu_time_give)
            
            avg_asking_cpu_time = self.avg_asking_cpu_time.get_avg(0)
            logger.debug(u'[executor][%s] total_cpu_time_give [%s] request_limit [%s] load [%.3f] avg [%.3f] ' % (
                self.name,
                total_cpu_time_give,
                request_limit,
                0 if request_limit == 0 else total_cpu_time_give / request_limit,
                0 if avg_asking_cpu_time == 0 else self.avg_get_cpu_time.get_avg(0) / avg_asking_cpu_time))
    
    
    def get_top_usage(self):
        """Return the top usage computed by the executor stats object."""
        top_usage = self._executor_stats.get_top_usage()
        return top_usage
    
    
    def get_execution_stats(self):
        """Return the stats computed by the executor stats object."""
        execution_stats = self._executor_stats.get_stats()
        return execution_stats
    
    
    def get_keep_timeout_time(self):
        """Return the keep timeout time known by the executor stats object."""
        keep_timeout_time = self._executor_stats.get_keep_timeout_time()
        return keep_timeout_time
    
    
    def get_timeout_stats(self):
        """Return the timeout stats computed by the executor stats object."""
        timeout_stats = self._executor_stats.get_timeout_stats()
        return timeout_stats
    
    
    def _get_all_workers(self):
        """Return a flat list of all our workers, whatever their worker type is."""
        return [worker for workers_of_type in self._workers.values() for worker in workers_of_type.values()]
    
    
    def _get_cpu_time_action_todo(self):
        """
        Sum the average cpu time of the actions waiting for a worker.
        The actions without execution stat are counted for 0.
        """
        with self._actions_to_give_at_worker_lock:
            known_stats = (self._exec_stat.get(action.get_hash(), None) for action in self._actions_to_give_at_worker)
            return sum(stat['average'] for stat in known_stats if stat)
    
    
    def _get_cpu_time_action_todo_in_workers(self):
        # type: () -> float
        """Sum the load_todo_actions value of all our workers, converted from micro seconds (int) to seconds (float)."""
        todo_in_micro_sec = sum([worker.load_todo_actions.value for worker in self._get_all_workers()])
        return from_int_micro_sec_to_float_sec(todo_in_micro_sec)
    
    
    @staticmethod
    def get_cpu_running_queue():
        """Return the value given by get_cpu_running_procs()."""
        running_procs = get_cpu_running_procs()
        return running_procs
    
    
    def _load_exec_stat(self):
        """
        Load the execution stats saved by _save_exec_stat (JSON file, named after the alphanumeric
        characters of the daemon name) and set the 'last_update' of every loaded stat to now.
        
        Best effort: on any error (missing or invalid file, ...) a warning is logged and the current
        stats are kept as they are.
        """
        clean_name = 'fail to make name'
        try:
            clean_name = "".join(x for x in self.name if x.isalnum())
            file_path = _PATH_EXEC_STAT % clean_name
            stat_directory = os.path.dirname(file_path)
            if not os.path.exists(stat_directory):
                os.makedirs(stat_directory)
            # The with block closes the file, even if the JSON parsing fails
            with open(file_path, 'r') as stat_file:
                loaded_exec_stat = json.load(stat_file)
            
            now = time.time()
            for stat in loaded_exec_stat.itervalues():
                stat['last_update'] = now
            # Replace our stats only when the loaded ones are fully usable
            self._exec_stat = loaded_exec_stat
        except Exception:
            logger.warning('[executor][%s] Warning previous stats cannot be load from file [%s].' % (self.name, _PATH_EXEC_STAT % clean_name))
    
    
    def _save_exec_stat(self):
        """
        Save the execution stats as JSON in the file named after the alphanumeric characters of
        the daemon name. The directory of the file is created if needed.
        
        Best effort: on any error a warning with the traceback is logged and nothing is raised.
        """
        clean_name = 'fail to make name'
        try:
            clean_name = "".join(x for x in self.name if x.isalnum())
            file_path = _PATH_EXEC_STAT % clean_name
            stat_directory = os.path.dirname(file_path)
            if not os.path.exists(stat_directory):
                os.makedirs(stat_directory)
            # The with block flushes and closes the file, even if the dump fails
            with open(file_path, 'w') as stat_file:
                json.dump(self._exec_stat, stat_file)
        except Exception:
            logger.warning('[executor][%s] save exec stat in file [%s] fail : [%s]' % (self.name, _PATH_EXEC_STAT % clean_name, traceback.format_exc()))
    
    
    def main(self):
        """
        Entry point of the daemon: do the init, then run the main loop.
        
        Any exception is logged as CRITICAL (with the stack), then the process is
        stopped with a hard exit (exit code 2).
        """
        try:
            self._init()
            
            # Now main loop
            self.do_mainloop()
        except Exception as fatal_error:
            logger.critical(u'The daemon did have an unrecoverable error. It must exit.')
            logger.critical(u'You can log a bug to your Shinken integrator with the error message:')
            logger.critical(u'ERROR: %s %s' % (fatal_error, type(fatal_error)))
            # The details of the exception are only a bonus, never fail because of them
            try:
                logger.critical(u'ERROR: %s' % fatal_error.__dict__)
            except:
                pass
            logger.print_stack(level=logging.CRITICAL)
            # NOTE: yes these lines are useless, but seems that without them, os._exit() do not let the time to logger to flush its buffer.
            print('\n')
            sys.stdout.flush()
            os._exit(2)  # noqa => need this hard exit
            raise
    
    
    """ ----------------- INIT ----------------- """
    
    
    def _init(self):
        """
        Startup sequence of the daemon, called by main() before the main loop:
        log the header, load the configuration file, start the daemon mode, register the HTTP
        interfaces, wait for the initial configuration, apply it, load the modules and start the workers.
        
        Returns early, without modules nor workers, if the daemon is interrupted while it waits for its
        initial configuration.
        """
        for line in self.get_header():
            logger.info(line)
        self.daily_log_version()
        
        self.load_config_file()
        # Look if we are enabled or not. If ok, start the daemon mode
        self.look_for_early_exit()
        self.do_daemon_init_and_start()
        self._do_post_daemon_init()
        
        self.load_modules_manager()
        
        # We wait for initial conf
        self.wait_for_initial_conf()
        
        # Maybe we never have any conf from the arbiter, but we have a stop
        if self.interrupted:
            return
        
        self._setup_new_conf()
        self._setup_command_logger()
        # We can load our modules now
        
        # The manager is given back the list of modules it already holds
        self.modules_manager.set_modules(self.modules_manager.modules)
        self.do_load_modules()
        
        # And even start external ones
        self.modules_manager.start_external_instances()
        
        # Start our workers
        self._allocate_workers()
    
    
    def _allocate_workers(self):
        """
        Create and launch self._min_workers workers for each worker type.
        The worker types that raise NotWorkerMod are removed from self._workers.
        """
        for _round in xrange(self._min_workers):
            not_worker_types = []
            for worker_type in self._workers:
                try:
                    self._create_and_launch_worker(worker_type=worker_type)
                except NotWorkerMod:
                    # Maybe this module is not a true worker one. If so, it will be deleted from
                    # _workers, but only after the loop as we are currently iterating on it
                    not_worker_types.append(worker_type)
            
            for worker_type in not_worker_types:
                logger.debug('[executor][%s] The module %s is not a worker one,  I remove it from the worker list' % (self.name, worker_type))
                del self._workers[worker_type]
    
    
    def _do_post_daemon_init(self):
        """
        Post 'daemonize' init of this satellite (poller or reactionner): register our HTTP
        interfaces, declare the 'fork' worker type and remove the default socket timeout.
        """
        # Register all interface
        self._register_http_interfaces()
        
        # No worker of the 'fork' type for now
        fork_workers = {}  # type: Dict[int, Worker]
        self._workers[u'fork'] = fork_workers
        
        # For multiprocess things, we should not have socket timeouts.
        socket.setdefaulttimeout(None)
    
    
    def _setup_command_logger(self):
        """
        If debug_display_launched_command is set, set self.command_logger as the 'CommandLogger'
        logger (DEBUG level) writing in the debug file of this daemon.
        If not, the debug file is removed if it exists.
        """
        path = '/var/log/shinken/DEBUG_commands_launched_by_%s.log' % self.name
        if not self.debug_display_launched_command:
            # For security reasons, the file is removed when the option isn't set up.
            if os.path.isfile(path):
                os.unlink(path)
            return
        
        self.command_logger = command_logger = logging.getLogger('CommandLogger')
        command_logger.setLevel(logging.DEBUG)
        command_logger.addHandler(logging.FileHandler(path))
    
    
    def _set_passive(self, is_passive, _logger):
        # type: (bool, PartLogger) -> None
        """Set our passive mode, and log it (INFO) only if the mode did change."""
        previous_mode = self.passive
        self.passive = is_passive
        if previous_mode == self.passive:
            return
        
        if self.passive:
            message = 'The arbiter ask us to go in passive mode.'
        else:
            message = 'The arbiter ask us to go in active mode.'
        _logger.info(message)
    
    
    @staticmethod
    def __print_tag_line(tag, what, _logger):
        # type: (unicode, unicode, PartLogger) -> None
        """Log (INFO) one line for a tag and what happened to it. The u'None' tag is displayed as not tagged."""
        if tag == u'None':
            additional_info = u' (=>not tagged)'
        else:
            additional_info = u''
        line = u'   - %-10s => %-10s %s' % (what, tag, additional_info)
        _logger.info(line)
    
    
    def _update_tags(self, new_poller_tags, _property, property_display, _logger):
        # type: (List[unicode], unicode, unicode, PartLogger) -> None
        new_poller_tags.sort()  # order for all display things
        old_poller_tags = getattr(self, _property)
        setattr(self, _property, new_poller_tags)
        
        # for display, we need them as set
        new_poller_tags = set(new_poller_tags)
        old_poller_tags = set(old_poller_tags)
        new_poller_tags_sorted = list(new_poller_tags)
        new_poller_tags_sorted.sort()
        # Nothing to display
        if new_poller_tags == old_poller_tags:
            return
        added_tags = sorted(list(new_poller_tags - old_poller_tags))
        removed_tags = sorted(list(old_poller_tags - new_poller_tags))
        unchanged_tags = sorted(list(new_poller_tags.intersection(old_poller_tags)))
        
        _logger.info(u'%s did changed:' % property_display)
        for tag in removed_tags:
            self.__print_tag_line(tag, u'REMOVED', _logger)
        for tag in added_tags:
            self.__print_tag_line(tag, u'NEW', _logger)
        for tag in unchanged_tags:
            self.__print_tag_line(tag, u'UNCHANGED', _logger)
    
    
    # Set properties we want to set in our new schedulers
    def _set_default_values_to_scheduler_entry(self, entry):
        
        # IMPORTANT: mut be LOCAL, so each scheduler have their own {} and []
        default_scheduler_properties = {
            u'wait_homerun'      : {},
            u'wait_homerun_lock' : threading.RLock(),
            u'actions'           : {},
            u'running_id'        : 0,
            u'con'               : None,
            u'thread'            : None,
            u'type'              : u'scheduler',
            u'daemon_incarnation': {},
            u'last_connection'   : 0
        }
        
        entry.update(default_scheduler_properties)
    
    
    def _set_daemon_id_of_scheduler(self, daemon, daemon_id):
        daemon['instance_id'] = daemon_id
    
    
    # Set up a new conf, but beware of global lock management.
    # Note: don't do locking thing here, as we have the satellite lock!
    def _setup_new_conf(self):
        with self.satellite_lock:
            self.really_setup_new_conf()
    
    
    # Set up the new received conf from arbiter
    def really_setup_new_conf(self):
        """Apply the configuration stored in self.new_conf.
        
        The pending configuration becomes self.cur_conf and self.new_conf is
        reset to None. The 'global' part gives the daemon name, realm, spare
        and passive flags, tags, worker limits, timezone and modules; the
        'schedulers' part gives the scheduler entries.
        
        When the daemon is not activated, only the identity, tags and passive
        parts are applied before returning. In both cases
        self.have_configuration is set to True.
        """
        start = time.time()
        _logger = self._print_new_update_conf_received()
        
        # If the configuration was giving us a new configuration incarnation, show it
        self.print_configuration_incarnation_log_entry_if_need(_logger)
        
        conf = self.new_conf
        _logger.debug('[executor][%s] Setup configuration %s' % (self.name, conf))
        self.reset_configuration_change()  # arbiter talk to us, we note it
        # The pending configuration is consumed: it becomes the current one
        self.new_conf = None
        self.cur_conf = conf
        global_conf = conf['global']
        
        # ------ Name part
        # The name key depends on the daemon type (poller or reactionner)
        if 'poller_name' in global_conf:
            name = global_conf['poller_name']
        elif 'reactionner_name' in global_conf:
            name = global_conf['reactionner_name']
        else:
            name = 'Unnamed satellite'
        
        self.name = name
        self.realm = global_conf.get('realm', 'not set')
        
        self.save_daemon_name_into_configuration_file(name)
        
        # Should we enable/disable human log format
        logger.set_human_format(on=global_conf.get('human_timestamp_log', True))
        
        self._set_spare(global_conf.get('spare', False), _logger)
        
        # Get the display_statistics_compute_time_if_higher from the .cfg
        self.display_statistics_compute_time_if_higher = global_conf.get('display_statistics_compute_time_if_higher', 100)
        
        # NOTE: 'activated' is read from the root of the configuration, not from its 'global' part
        activated = conf.get('activated', True)
        was_activated = self._set_is_activated(activated, _logger)
        
        # Now set tags ['None'] is the default tags, display only if we need
        self._update_tags(global_conf.get('poller_tags', ['None']), 'poller_tags', 'Poller tags', _logger)
        self._update_tags(global_conf.get('reactionner_tags', ['None']), 'reactionner_tags', 'Reactionner tags', _logger)
        
        # ------ Passive part
        self._set_passive(global_conf['passive'], _logger)  # set and log
        
        # Not activated: the schedulers, workers and modules parts are skipped
        if not self.activated:
            self._go_as_not_active(was_activated, _logger)
            self.have_configuration = True
            return
        
        _logger.debug('Configuration received')
        
        new_schedulers = []  # For logging
        deleted_schedulers = []  # for logging
        # ------ Schedulers part
        for (daemon_id, daemon) in conf['schedulers'].iteritems():
            self._set_or_update_scheduler_from_configuration(daemon, daemon_id, global_conf, new_schedulers, deleted_schedulers, _logger)
            # NOTE(review): this continue is the last statement of the loop body, it has no effect
            continue
        
        # print a bloc about the new schedulers this turn, note: we got no arbiters
        self._print_new_and_deleted_daemons(new_schedulers=new_schedulers, deleted_schedulers=deleted_schedulers, _logger=_logger)
        
        # ------ Nb workers
        self._min_workers = global_conf['min_workers']
        # Min workers = 0 means it's the number of cpu
        if self._min_workers == 0:
            self._min_workers = self.nb_cpus
        else:
            # NOTE(review): a non-zero min_workers is also stored as the cpu time limit — confirm it is intended
            self._cpu_time_limit = self._min_workers
        
        self.max_ram_percent = global_conf.get('max_ram_percent', WORKER_MAX_RAM_PERCENT)
        self.max_cpu_queue_per_cpu = global_conf.get('max_cpu_queue_per_cpu', WORKER_MAX_CPU_QUEUE_PER_CPU)
        self._processes_by_worker = global_conf['processes_by_worker']
        self._polling_interval = global_conf['polling_interval']
        # The timeout follows the polling interval
        self._timeout = self._polling_interval
        
        self._executor_stats.set_keep_timeout_time(global_conf['keep_timeout_time'])
        logger.debug('[executor][%s] workers [%s] processes_by_worker:[%s] polling interval:[%s]' % (self.name, self._min_workers, self._processes_by_worker, self._polling_interval))
        
        self._max_plugins_output_length = global_conf.get('max_plugins_output_length', 8192)
        
        # if the .cfg have a specific exec stat range(s) then parse them and warn
        # the executor stats, so it can (maybe) reset its structure
        if global_conf['exec_stat_range']:
            # exec_stat_range is a comma separated list of integers
            exec_stat_range = [int(k) for k in global_conf['exec_stat_range'].split(',')]
            self._executor_stats.update_ranges(exec_stat_range)
        
        # Set our giving timezone from arbiter
        self.set_tz(global_conf['use_timezone'])
        
        # Set extra worker_types define in module.
        self.modules = global_conf['modules']
        
        logger.debug('[executor][configuration] Receiving modules:[%s] i already load modules:[%s]' % (','.join([m.get_name() for m in self.modules]), self.have_modules))
        
        # First configuration with modules: full load. Later ones: only an update.
        if not self.have_modules:
            if self.modules:
                _logger.info('New modules received: %s' % (','.join([m.get_name() for m in self.modules])))
            # Ok now start, or restart them!
            # Set modules, init them and start external ones
            self.modules_manager.set_modules(self.modules)
            self.do_load_modules()
            self.modules_manager.start_external_instances()
            self.have_modules = True
        else:  # just update the one we need
            self.modules_manager.update_modules(self.modules)
        
        # Each module type gets its own (empty) worker table
        for module in self.modules:
            # If we already got it, bypass
            if module.module_type not in self._workers:
                self._workers[module.module_type] = {}
        
        self._load_exec_stat()
        
        # Start threads if we need, not a problem as starting thread is cheap and not timeout prone
        self._assert_valid_satellite_threads()
        
        self.have_configuration = True
        _logger.debug('configuration loaded into %.3fs' % (time.time() - start))
    
    
    # We will look for satellites, and if we don't have a thread or a dead one, start a new one
    # unless the distant satellite is not active (like an idle spare scheduler)
    def _assert_valid_satellite_threads(self):
        """Check the thread of every active satellite entry we know.
        
        Under the satellite lock, each entry of the schedulers, pollers,
        reactionners and receivers whose 'active' value is true is given to
        _assert_one_satellite_thread. The other entries are skipped.
        """
        with self.satellite_lock:
            satellites_by_type = {'scheduler': self.schedulers, 'poller': self.pollers, 'reactionner': self.reactionners, 'receiver': self.receivers}
            for (kind, entries) in satellites_by_type.iteritems():
                for (entry_id, entry) in entries.iteritems():
                    if entry['active']:
                        self._assert_one_satellite_thread(kind, entry_id, entry)
    
    
    # Get the new actions from a distant daemon, then send it the results of our workers
    def get_and_send_jobs_from_distant(self, e):
        """Do one exchange with the distant entry e: get new actions, then send the workers results."""
        distant_entry = e
        self.get_new_actions(distant_entry)
        self.send_workers_result_to_schedulers(distant_entry)
    
    
    def do_satellite_thread(self, s_type, s_id, t_name):
        """Body of the thread in charge of one distant daemon.
        
        Loops until the link (s_type, s_id) is no more known or its uri did
        change. Each turn, unless we are passive, the distant daemon is pinged
        and jobs are exchanged with it, then the thread sleeps for the rest of
        the polling interval.
        
        :param s_type: type of the distant daemon, given to get_link_from_type
        :param s_id: id of the distant daemon, given to get_link_from_type
        :param t_name: thread name, only used in the logs
        """
        logger.debug('[satellite] Starting thread [%s]' % t_name)
        
        with self.satellite_lock:
            distant_link = self.get_link_from_type(s_type, s_id)
            if distant_link is None:  # already down?
                return
        
        # The uri seen at start is the reference: the thread stops if it changes
        uri = distant_link['uri']
        while True:
            start = time.time()
            with self.satellite_lock:
                # first look if we are still need or not
                distant_link = self.get_link_from_type(s_type, s_id)
            
            if distant_link is None or (uri != distant_link['uri']):  # no more present, or uri did change? EXIT!!!
                logger.info('[satellite] The connection thread to the %s with the id %s (%s) is not longer need as this daemon is no more that same as before. Restarting it.' % (s_type, s_id, t_name))
                return
            
            if not self.passive:
                # we have an entry, so we can ping it
                self.ping_and_check_distant_daemon(distant_link)
                self.get_and_send_jobs_from_distant(distant_link)
            else:
                logger.debug('[satellite] Skipping %s actions as we are passive' % t_name)
            
            end = time.time()
            diff = end - start
            diff = max(0, min(1, diff))  # noqa force it to be between 0 and 1
            logger.debug('[satellite] get_and_send_jobs_from_distant load [%3d]%%' % (diff * 100))
            # diff can be up to 1s: with a polling interval under 1s the remaining time
            # would be negative, and time.sleep() refuses negative values
            time.sleep(max(0, self._polling_interval - diff))
    
    
    def is_connection_try_too_close(self, elt):
        """Return True when elt['last_connection'] is less than 5 seconds old."""
        elapsed = time.time() - elt['last_connection']
        return elapsed < 5
    
    
    def ping_and_check_distant_daemon(self, sat_entry):
        """Ping a distant daemon and save its current incarnation.
        
        :param sat_entry: entry of the distant daemon ('con', 'type', 'name' and 'daemon_incarnation' are used)
        :return: True when the daemon did answer, False when there is no
                 connection or when the exchange failed. On failure the
                 connection of the entry is reset to None.
        """
        con = sat_entry['con']
        daemon_type = sat_entry['type']
        if con is None:
            return False
        
        try:
            # initial ping must be quick
            con.get('ping')
            new_incarnation = con.get('get_daemon_incarnation')
            # protect daemon_incarnation from modification
            with self.satellite_lock:
                # data transfer can be longer
                daemon_incarnation = sat_entry['daemon_incarnation']
                
                # The schedulers have been restarted: it has a new run_id.
                if new_incarnation != daemon_incarnation:
                    if daemon_incarnation:
                        # "from" is the incarnation we had, "to" is the one we just received
                        logger_configuration.info('The %s change it\'s configuration from %s to %s' % (sat_entry['name'], daemon_incarnation['configuration_incarnation_uuid'], new_incarnation['configuration_incarnation_uuid']))
                    else:  # it was without any configuration
                        logger_configuration.info('The %s received a new configuration (uuid=%s)' % (sat_entry['name'], new_incarnation['configuration_incarnation_uuid']))
                # Ok all is done, we can save this new incarnation
                sat_entry['daemon_incarnation'] = new_incarnation
                return True
        except HTTPExceptions as exp:
            logger.error('Connection problem to the %s %s: %s' % (daemon_type, sat_entry['name'], str(exp)))
            sat_entry['con'] = None
            return False
        except KeyError as exp:
            # A key is missing in the incarnation, or in the entry itself
            logger.error('the %s \'%s\' is not initialized: %s' % (daemon_type, sat_entry['name'], str(exp)))
            sat_entry['con'] = None
            return False
    
    
    def _create_and_launch_worker(self, worker_type=u'fork'):
        """Create a worker of the given type, save it into self._workers and start it.
        
        The u'fork' type has no target. For any other type, the target is the
        'work' attribute of the alive module instance of that type.
        
        :raise NotWorkerMod: the module of this type is not worker_capable, or no 'work' target was found
        """
        target = None
        if worker_type != u'fork':
            for instance in self.modules_manager.get_all_alive_instances():
                if instance.properties[u'type'] != worker_type:
                    continue
                # First, see if the module is a 'worker' one or not
                if not instance.properties.get(u'worker_capable', False):
                    logger.warning(u'[executor][%s] Worker type [%s] module is not worker_capable' % (self.name, worker_type))
                    raise NotWorkerMod()
                target = getattr(instance, u'work', None)
            
            if target is None:
                logger.warning(u'[executor][%s] Worker type [%s] has no \'work\' method.' % (self.name, worker_type))
                raise NotWorkerMod()
        
        # The worker is told which daemon (poller or reactionner) it is loaded into
        daemon_class_name = self.__class__.__name__.lower()
        new_worker = Worker(
            self._processes_by_worker,
            max_plugins_output_length=self._max_plugins_output_length,
            target=target,
            worker_type=worker_type,
            loaded_into=daemon_class_name,
            executor_id=self.name,
            http_daemon=self.http_daemon,
            daemon_display_name=self.daemon_display_name,
            max_ram_percent=self.max_ram_percent,
            max_cpu_queue_per_cpu=self.max_cpu_queue_per_cpu
        )
        
        # Keep it in the table of its type, by worker id
        self._workers[worker_type][new_worker.id] = new_worker
        
        logger.debug('[executor][%s] allocating new [%s] worker [%s]' % (self.name, worker_type, new_worker.id))
        
        # Ok, all is good. Start it!
        new_worker.start()
    
    
    """ ----------------- LOOP ----------------- """
    
    
    def do_loop_turn(self):
        """Run one loop turn under the main loop watchdog, and log about it.
        
        After the turn itself (_do_loop_turn), the time since the last loop of
        each worker is logged, with a warning (2 x WORKER_EXPECTED_LOOP_TIME) or
        an error (4 x) for the late ones. The duration of the turn is then
        logged, at WARNING or ERROR level when we are activated and the turn
        took more than 2 or 4 polling intervals.
        """
        with self.mainloop_watchdog:
            self._loop_logger_start.info(u'[ Loop number=%-5d ] ===-===-===-===-===-===-===-===-===-===-===-===-===' % self._loop_number)
            loop_start = time.time()
            self._do_loop_turn()
            
            # (worker id, seconds since its last loop) for every worker
            worker_loop_stats = []
            for worker in self._get_all_workers():
                worker_loop_stats.append((worker.id, time.time() - worker.last_loop_time.value))
            
            descriptions = [u'WORKER %s: %.3fs ago' % stat for stat in worker_loop_stats]
            self._loop_logger_workers.info(u'Last activity [ %s]' % u', '.join(descriptions))
            
            warning_limit = 2 * WORKER_EXPECTED_LOOP_TIME
            critical_limit = 4 * WORKER_EXPECTED_LOOP_TIME
            for (worker_id, idle_time) in worker_loop_stats:
                if idle_time >= critical_limit:
                    worker_logger = self._loop_logger.get_sub_part(u'WORKER %s' % worker_id)
                    worker_logger.error(u'is late, last tick was %.3fs ago, over limit of %.3fs' % (idle_time, critical_limit))
                elif idle_time >= warning_limit:
                    worker_logger = self._loop_logger.get_sub_part(u'WORKER %s' % worker_id)
                    worker_logger.warning(u'is slow, last tick was %.3fs ago, over limit of %.3fs' % (idle_time, warning_limit))
            
            # A long turn is only a problem when we are activated
            elapsed_time = time.time() - loop_start
            level = logging.INFO
            if self.activated:
                if elapsed_time > (4 * self._polling_interval):
                    level = logging.ERROR
                elif elapsed_time > (2 * self._polling_interval):
                    level = logging.WARNING
            
            self._loop_logger_stop.log(level, u'[ Loop number=%-5d ] [PERF] [ %6.3f ]s' % (self._loop_number, time.time() - loop_start))
            self._loop_number += 1
    
    def _do_loop_turn(self):
        """Do the work of one loop turn, each step being timed by a [beacon] log.
        
        The turn stops early when no configuration could be set up, and waits
        1s when there is no scheduler or when we are a not activated spare.
        Otherwise it runs the steps below, then sleeps for the rest of the
        polling interval.
        """
        start_snap = cpu_stats_helper.get_thread_cpu_snapshot()
        logger.debug('[executor][%s] Loop turn [%s]' % (self.name, self._tick_count))
        loop_start = time.time()
        
        # P1 - _check_new_configuration
        before_ = time.time()
        try:
            self._check_new_configuration()
        except:  # noqa NOTE(review): every error is silently taken as "no configuration this turn" — confirm nothing should be logged
            return
        after_ = time.time()
        logger.log(_BEACON_LOG_LEVEL, '[beacon] 00_check_new_configuration [%.3f]' % (after_ - before_))
        
        # Maybe we get a conf without schedulers, that means that I'm a spare so do nothing
        if not self.schedulers or (self.spare and not self.activated):
            time.sleep(1)
            return
        
        before_ = time.time()
        self.compute_request_limit()
        after_ = time.time()
        logger.log(_BEACON_LOG_LEVEL, '[beacon] 01_compute_request_limit [%.3f]' % (after_ - before_))
        
        # P2 - _clean_zombie
        before_ = time.time()
        self._clean_zombie()
        after_ = time.time()
        logger.log(_BEACON_LOG_LEVEL, '[beacon] 02_clean_zombie [%.3f]' % (after_ - before_))
        
        # Look if we have all need threads for our satellites
        before_ = time.time()
        self._assert_valid_satellite_threads()
        after_ = time.time()
        logger.log(_BEACON_LOG_LEVEL, '[beacon] 03_assert_valid_satellite_threads [%.3f]' % (after_ - before_))
        
        # P4 - _manage_workers_result
        before_ = time.time()
        self._manage_workers_result()
        after_ = time.time()
        logger.log(_BEACON_LOG_LEVEL, '[beacon] 04_manage_workers_result [%.3f]' % (after_ - before_))
        
        # P5 - _clean_worker_restart
        before_ = time.time()
        self._clean_worker_restart()
        after_ = time.time()
        # This beacon has its own label, so it is not mixed with the _get_loop_stat one
        logger.log(_BEACON_LOG_LEVEL, '[beacon] 05_clean_worker_restart [%.3f]' % (after_ - before_))
        
        # P5.5 - _get_loop_stat
        before_ = time.time()
        self._get_loop_stat()
        after_ = time.time()
        logger.log(_BEACON_LOG_LEVEL, '[beacon] 05_get_loop_stat [%.3f]' % (after_ - before_))
        
        # P6 - _put_actions_in_workers_queue
        self._put_actions_in_workers_queue()
        
        # Get objects from our modules that are not worker based
        # P8 - _get_objects_from_external_queues
        before_ = time.time()
        self._get_objects_from_external_queues()
        after_ = time.time()
        logger.log(_BEACON_LOG_LEVEL, '[beacon] 07_get_objects_from_external_queues [%.3f]' % (after_ - before_))
        
        # Say to modules it's a new tick :)
        before_ = time.time()
        self.hook_point('tick')
        after_ = time.time()
        logger.log(_BEACON_LOG_LEVEL, '[beacon] 08_hook_point [%.3f]' % (after_ - before_))
        
        # Periodic jobs, counted in loop turns
        if self._tick_count % 3600 == 0:
            self._save_exec_stat()
        
        if self._tick_count % 300 == 0:
            self._executor_stats.clean_timeouts()
        
        self._tick_count += 1
        
        diff_time = time.time() - loop_start
        logger.debug('[executor] poller working time [%.3f]s/[%.3f]s [%3d]%%' % (diff_time, self.avg_loop_time.get_avg(0), diff_time * 100 / self._polling_interval))
        
        # Protect it against time shifting from system
        self.avg_loop_time.update_avg(diff_time)
        # Sleep what remains of the polling interval
        self.sleep(self._polling_interval - diff_time)
        logger.debug('[executor] ======================== %s' % start_snap.get_diff())
    
    
    """ ----------------- LOOP P1 - _check_new_configuration ----------------- """
    
    
    def _check_new_configuration(self):
        # Maybe the arbiter ask us to wait for a new conf If true, we must restart all...
        if self.cur_conf is None:
            # Clean previous run from useless objects and close modules
            self._clean_previous_run()
            
            self.wait_for_initial_conf()
            # we may have been interrupted or so; then just return from this loop turn
            if not self.new_conf:
                raise Exception(u'[executor][%s] We fail to get new configuration. Maybe we are asking to shutdown.', self.name)
            self._setup_new_conf()
        
        self.watch_for_new_conf(0.0)
        if self.new_conf:
            self._setup_new_conf()
    
    
    # An arbiter ask us to wait a new conf, so we must clean all the mess we did, and close modules too
    def _clean_previous_run(self):
        # Clean all lists
        self.schedulers.clear()
        self.broks.clear()
        self._executor_stats.reset()
        with self.external_commands_lock:
            self.external_commands = self.external_commands[:]
    
    
    """ ----------------- LOOP P2 - _clean_zombie ----------------- """
    
    
    def _clean_zombie(self):
        # Check if zombies workers are among us :)
        # If so: KILL THEM ALL!!!
        self._check_and_del_zombie_workers()
        # But also modules
        self.check_and_del_zombie_modules()
    
    
    def _check_and_del_zombie_workers(self):
        """Remove the dead workers and create new ones up to _min_workers by worker type.
        
        Workers are processes, they can die in a number of ways (bug in the
        code, kill command...), so the dead ones must be detected: they are
        terminated, counted in dead_worker_stat, registered as a restart and
        removed from self._workers. Then each worker type is filled again up
        to _min_workers. A type whose module is not a worker one is removed
        from self._workers.
        """
        # Active children make a join with everyone, useful :)
        active_children()
        
        workers_to_del = []
        for worker in self._get_all_workers():
            # If a worker goes down, and we did not ask him, it's not good: we can think that we have a worker, and it's not True
            # So we del it
            if not worker.is_alive():
                logger.warning('[executor][%s] The worker %s goes down unexpectedly!' % (self.name, worker.id))
                # Terminate immediately
                worker.terminate()
                worker.join(timeout=1)
                workers_to_del.append((worker.worker_type, worker.id))
                self._register_worker_restart(worker.worker_type)
                # Keep track of the count of dead worker by type
                self.dead_worker_stat[worker.worker_type] = self.dead_worker_stat.get(worker.worker_type, 0) + 1
        
        # Delete workers from our queues
        # TODO requeue the actions it was managed
        for (worker_type, worker_id) in workers_to_del:
            del self._workers[worker_type][worker_id]
        
        # Say which worker types are under the minimum
        for mod in self._workers:
            nb_workers = len(self._workers[mod])
            if nb_workers < self._min_workers:
                # The logged number is the workers count of THIS type (not the number of worker types)
                logger.info('[executor][%s] Trying to adjust worker [%s] number. Actual workers number : %d, [%d]' % (self.name, mod, nb_workers, self._min_workers))
        
        # I want at least _min_workers by module
        to_del = []
        for mod in self._workers:
            # At least _min_workers
            while len(self._workers[mod]) < self._min_workers:
                try:
                    self._create_and_launch_worker(worker_type=mod)
                # Maybe this module is not a true worker one. if so, just delete it from _workers
                except NotWorkerMod:
                    to_del.append(mod)
                    break
        
        # Deleted after the loop, as we cannot remove a key while iterating on _workers
        for mod in to_del:
            logger.debug('[executor][%s] The module %s is not a worker one, I remove it from the worker list' % (self.name, mod))
            del self._workers[mod]
    
    
    """ ----------------- LOOP P5.5 - _get_loop_stat ----------------- """
    
    
    def get_nb_finished_actions_cpu_time_average(self):
        """Return the value given by the workers stats aggregator for the same named getter."""
        aggregator = self._workers_stats_aggregator_and_printer
        return aggregator.get_nb_finished_actions_cpu_time_average()
    
    
    def get_nb_finished_actions_average(self):
        """Return the value given by the workers stats aggregator for the same named getter."""
        aggregator = self._workers_stats_aggregator_and_printer
        return aggregator.get_nb_finished_actions_average()
    
    
    def _get_loop_stat(self):
        """Clean the old exec stats, then give the figures of this turn to the workers stats printer."""
        # Snapshot of what is waiting to be given to the workers
        with self._actions_to_give_at_worker_lock:
            to_give_cpu_time = self._get_cpu_time_action_todo()
            nb_to_give = len(self._actions_to_give_at_worker)
        
        self._clean_exec_stat()
        
        stats_printer = self._workers_stats_aggregator_and_printer
        all_workers = self._get_all_workers()
        stats_printer.print_stats(
            logger,
            all_workers,
            nb_to_give,
            to_give_cpu_time,
            self._to_give_back_actions_nb,
            self._to_give_back_actions_nb_avg,
            self._to_give_back_actions_cpu_time,
            self._to_give_back_actions_cpu_time_avg
        )
    
    
    """ ----------------- LOOP P4 - _manage_workers_result ----------------- """
    
    
    def _manage_workers_result(self):
        """Collect the results of the workers and refresh the "to give back" counters.
        
        Each action returned by a worker is given to _manage_action_return.
        The ram and cpu running queue averages are updated, then the actions
        waiting in the 'wait_homerun' of every scheduler are counted (number
        and cpu time) and saved with their averages.
        """
        # We loop on workers, and get all results from their queues
        for worker in self._get_all_workers():
            for action in worker.get_results():
                self._manage_action_return(action)
                # This worker has one action less to do
                worker.nb_doing_action -= 1
        
        self.avg_ram_percent.update_avg(get_memory_used_percent() * 100)
        self._cpu_running_queue.update_avg(self.get_cpu_running_queue())
        
        # Compute how many actions are waiting to go back to the schedulers
        actions_to_return_nb = 0
        actions_to_return_cpu_time = 0.0
        with self.satellite_lock:
            for scheduler in self.schedulers.values():
                with scheduler['wait_homerun_lock']:
                    for action in scheduler['wait_homerun'].values():
                        actions_to_return_nb += 1
                        actions_to_return_cpu_time += action.cpu_time
        
        self._to_give_back_actions_nb = actions_to_return_nb
        self._to_give_back_actions_nb_avg.update_avg(self._to_give_back_actions_nb)
        self._to_give_back_actions_cpu_time = actions_to_return_cpu_time
        self._to_give_back_actions_cpu_time_avg.update_avg(self._to_give_back_actions_cpu_time)
    
    
    # Manage action returned from Workers We just put them into the corresponding sched, and we clean unused properties like sched_id
    def _manage_action_return(self, action):
        """Move an action returned by a worker into the 'wait_homerun' of its scheduler.
        
        An object that is not a check, a notification or an event handler is
        just given to self.add(). For an action, the sched_id and _worker_id
        tags are removed, the action leaves the 'actions' of its scheduler and
        the exec stats are updated.
        """
        # Maybe our workers send us something else than an action: it goes in the other queues
        if action.__class__.my_type not in ['check', 'notification', 'eventhandler']:
            self.add(action)
            return
        
        if action.is_a == ACTION_TYPES.EVENTHANDLER:
            logger.debug('[EVENTHANDLER] the event handler id %s was executed by the workers, and will be returns to the scheduler' % action.get_uuid())
        
        # The sched_id says where to put the action, then the tag is useless
        sched_id = action.sched_id
        del action.sched_id
        
        # Same for the tag of the worker, if any
        try:
            delattr(action, '_worker_id')
        except AttributeError:
            pass
        
        # The action is no more a running one for its scheduler
        try:
            self.schedulers[sched_id]['actions'].pop(action.get_uuid())  # noqa => sched_id DOES exists
        except KeyError:
            pass
        
        # Put it in the wait return queue, the scheduler entry may be gone meanwhile
        # NOTE: a 'timeout' status is kept, the scheduler needs this information later
        sched_entry = self.schedulers.get(sched_id)
        if sched_entry is not None:
            with sched_entry['wait_homerun_lock']:
                try:
                    sched_entry['wait_homerun'][action.get_uuid()] = action
                except KeyError:
                    pass
        
        self._compute_exec_stat(action)
    
    
    def _compute_exec_stat(self, action):
        """Update the execution statistics with a finished action.
        
        The executor stats get either a timeout or the cpu time of the action.
        Then, when the action has a command and a non-zero cpu time, the
        entry of self._exec_stat for the hash of the action is created or
        refreshed: it keeps the last _NB_VALUE_SAVE_FOR_EXEC_STAT cpu times as
        a ring buffer and their average, which is also saved into
        action.average_cpu_time.
        """
        action_command = getattr(action, 'command', None)
        action_command_name = getattr(action, 'command_name', None)
        action_cpu_time = action.get_cpu_time()
        now = time.time()
        
        if action.status == 'timeout':
            # Same tolerant command name as the non-timeout branch
            self._executor_stats.add_timeout(action_command_name, action.execution_time, int(now))
        else:
            self._executor_stats.update_action_time(action_command_name, action_cpu_time)
        
        if action_cpu_time != 0 and action_command is not None:
            action_hash = action.get_hash()
            
            if action_hash in self._exec_stat:
                stat = self._exec_stat[action_hash]
                # The new value takes the next slot of the ring buffer
                i = (stat['offset'] + 1) % _NB_VALUE_SAVE_FOR_EXEC_STAT
                stat['last_update'] = now
                stat['saving_periode'] = action.get_saving_period()
                stat['values'][i] = action_cpu_time
                stat['offset'] = i
                stat['average'] = sum(stat['values']) / _NB_VALUE_SAVE_FOR_EXEC_STAT
                action.average_cpu_time = stat['average']
            else:
                # First value for this hash: the whole buffer starts with it, so the average is the value itself
                stat = {
                    'last_update'   : now,
                    'saving_periode': action.get_saving_period(),
                    'offset'        : 0,
                    'average'       : action_cpu_time,
                    'values'        : [action_cpu_time] * _NB_VALUE_SAVE_FOR_EXEC_STAT,
                }
                self._exec_stat[action_hash] = stat
                action.average_cpu_time = stat['average']
                logger.debug('[executor][%s] CREATION First stat on [%s] cpu_time[%.5f] average[%.5f] check_interval[%d] hash[%s].' % (self.name, action_command_name, action_cpu_time, stat['average'], action.check_interval, action_hash))
            
            # Context of the daemon when the value was saved
            stat['ram_usage'] = self.avg_ram_percent.get_avg()
            stat['cpu_running_queue'] = self._cpu_running_queue.get_avg()
    
    
    def _clean_exec_stat(self):
        # Remove old stat from the _exec_stat
        
        to_remove = []
        now = time.time()
        for hash_command in self._exec_stat:
            stat = self._exec_stat[hash_command]
            if (now - stat['last_update']) / 60 > stat['saving_periode']:
                to_remove.append(hash_command)
        # logger.debug('[executor][%s]  exec stat total size: %s' % (self.name, len(self._exec_stat)))
        if to_remove:
            logger.debug('[executor][%s] Clean [%d] stats (total:%d):' % (self.name, len(to_remove), len(self._exec_stat)))
        for hash_command in to_remove:
            # logger.debug('[executor][%s]    - Hash: %s' % (self.name, hash_command))
            del self._exec_stat[hash_command]
    
    
    """ ----------------- LOOP P5 - get_new_actions ----------------- """
    
    
    # We get new actions (checks / notifications / event handlers) from schedulers
    # REF: doc/broker-modules.png (2)
    def get_new_actions(self, sat_entry):
        """Ask one scheduler for new actions and hand them to self.add_actions().
        
        Nothing is asked when self.request_limit is 0: only the asking/get cpu time averages are updated.
        When the entry has no connection yet (sat_entry['con'] is None) the connection is initialized
        and nothing is asked this turn.
        
        :param sat_entry: scheduler entry, the keys 'type', 'con' and 'name' are read and 'con_latency' is written
        :return: None
        
        Error handling:
          * KeyError: the connection is (re)initialized
          * HTTPExceptions: sat_entry['con'] is reset to None
          * AttributeError: only logged
          * any other exception: logged with its traceback, then the process exits with sys.exit(1)
        """
        # We check for new check in each Scheduler and put
        # the result in new_checks
        sat_type = sat_entry['type']
        
        # Here are the differences between a poller and a reactionner:
        # Poller will only do checks,
        # Reactionner do actions (notif + event handlers)
        do_checks = self.__class__.do_checks
        do_actions = self.__class__.do_actions
        
        if self.request_limit == 0:
            self.avg_asking_cpu_time.update_avg(self._cpu_time_limit)
            self.avg_get_cpu_time.update_avg(self._cpu_time_limit)
            
            logger.info('[executor][%s] We don\'t ask the schedulers for new checks because we already have enough' % self.name)
            return
        
        try:
            con = sat_entry['con']
            if con is not None:  # None = not initialized
                # OK, go for it :)
                # Before ask a call that can be long, do a simple ping to be sure it is alive
                start_time = time.time()
                con.get('ping')
                sat_entry['con_latency'] = time.time() - start_time
                
                get_checks_return = con.get('get_checks', {
                    'do_checks'        : json.dumps(do_checks),
                    'do_actions'       : json.dumps(do_actions),
                    'poller_tags'      : json.dumps(self.poller_tags),
                    'reactionner_tags' : json.dumps(self.reactionner_tags),
                    'worker_name'      : self.name,
                    'module_types'     : json.dumps(self._workers.keys()),
                    'request_limit'    : -1,
                    'request_limit_cpu': self.request_limit
                }, wait='long')
                # The answer is a base64 encoded, zlib compressed pickle: decode it step by step
                get_checks_return = base64.b64decode(get_checks_return)
                get_checks_return = zlib.decompress(get_checks_return)
                get_checks_return = SafeUnpickler.loads(str(get_checks_return), u'Checks/notification/event handlers get from %s' % sat_entry['name'])
                # logger.debug('get_new_action:: from %s (%d)  in %.3f sec' % (sat_entry['name'], len(get_checks_return), time.time() - start_time))
                self.add_actions(get_checks_return, sat_entry, self.request_limit, actions_direction=ACTION_DIRECTIONS.ACTIVE_DIRECTION)
            else:  # no con? make the connection
                self.pynag_con_init(sat_entry)
        # Ok, con is not known, so we create it
        except KeyError as exp:
            # The call done here is get_checks (the message used to name get_broks)
            logger.debug('Key error for get_checks : %s' % str(exp))
            self.pynag_con_init(sat_entry)
        except HTTPExceptions as exp:
            logger.warning('Connection problem to the %s %s: %s' % (sat_type, sat_entry['name'], str(exp)))
            sat_entry['con'] = None
        # scheduler must not be initialized
        except AttributeError as exp:
            logger.warning('The %s %s should not be initialized: %s' % (sat_type, sat_entry['name'], str(exp)))
        # We do not know what happened, so we log everything we can and stop the process
        except Exception as x:
            logger.error(str(x))
            logger.error(traceback.format_exc())
            sys.exit(1)


    def compute_request_limit(self):
        """Compute the cpu time to ask to the schedulers this turn, store it in self.request_limit and return it.
        
        * -1 on Windows (os.name == 'nt')
        * 0 when there is no configuration, when half of the cpu time already queued in the main process
          reaches self._cpu_time_limit, or when the average loop time is above self._polling_interval
        * otherwise self._cpu_time_limit minus half of the cpu time already queued
        """
        early_limit = None
        if os.name == 'nt':
            early_limit = -1
        elif not self.have_configuration:
            early_limit = 0
        
        if early_limit is not None:
            self.request_limit = early_limit
            return self.request_limit
        
        queued_cpu_time = self._get_cpu_time_action_todo()
        # Only half of what is already queued is counted against the target
        queued_with_margin = queued_cpu_time / 2.0
        
        # We want to have 1s of job ahead us in our main process buffer
        target = 1 * self._cpu_time_limit
        asked = 0 if queued_with_margin >= target else target - queued_with_margin
        
        # If the loop turn is too high, slow a bit until we reach an acceptable time
        # NOTE: this is a warning because such case should not be seen, even on large env
        self.executor_in_overload = self.avg_loop_time.get_avg(0) > self._polling_interval
        if self.executor_in_overload:
            logger.warning('[limiter] loop time too high [%.3f] we will limit the executor.' % (self.avg_loop_time.get_avg(0)))
            asked = 0
        
        logger.debug(u'[limiter] [todo main daemon=%.3f] [ Hard limit=%d ] => [ asking this turn=%.3f ]' % (queued_cpu_time, self._cpu_time_limit, asked))
        
        self.request_limit = asked
        return asked
    
    
    """ ----------------- LOOP P6 - _put_actions_in_workers_queue ----------------- """
    
    
    def _put_actions_in_workers_queue(self):
        """Dispatch the pending actions to the workers while holding self._actions_to_give_at_worker_lock.
        
        The time spent waiting for the lock is logged as a beacon before the dispatch itself.
        """
        lock_asked_at = time.time()
        with self._actions_to_give_at_worker_lock:
            lock_wait = time.time() - lock_asked_at
            logger.log(_BEACON_LOG_LEVEL, '[beacon] 061_put_actions_in_workers_queue_lock [%.3f]' % lock_wait)
            self.really_put_actions_in_workers_queue()
    
    
    def really_put_actions_in_workers_queue(self):
        """Give the actions waiting in self._actions_to_give_at_worker to the workers.
        
        It is called by _put_actions_in_workers_queue() while self._actions_to_give_at_worker_lock is held.
        
        For each waiting action a worker is selected and the action is added to its todo list:
          * when no worker can be selected, the dispatch stops for this turn
          * when the add fails, the worker is skipped for the rest of the turn and the action stays in the
            main daemon queue; if the failure is not a "pipe is full" error the worker is terminated
          * actions given to a worker are removed from self._actions_to_give_at_worker at the end
        
        Timing beacons and a [STATS] line with the distribution by worker are logged at the end.
        """
        import errno  # local import: only needed here to name the "pipe is full" error code
        
        _cum_get_worker = 0.0
        _cum_add_todo = 0.0
        _cum_append = 0.0
        
        # New turn: reset the per turn counter of every worker
        for _worker in self._get_all_workers():
            _worker.print_worker_full()
            _worker.action_counter_by_tick = 0
        
        actions_to_remove_from_actions_to_queue = []
        worker_to_skip = []
        for action in self._actions_to_give_at_worker:
            
            if action.status != 'inpoller':
                # NOTE(review): the action is not removed from self._actions_to_give_at_worker here,
                # so it will be seen (and logged) again next turn - confirm this is wanted
                logger.warning('[executor] Scheduler give us an action with status:[%s] instead of inpoller status !!. This action:[%s] will be ignore.' % (action.status, action.command_name))
                continue
            
            t0 = time.time()
            worker = self._select_worker_for_next_action(action, worker_to_skip)
            # All worker are overload
            if worker is None:
                break
            
            t1 = time.time()
            try:
                worker.add_todo_action(action)
            except Exception as e:
                # This worker will not be selected again this turn and the action stays in the main daemon queue
                worker_to_skip.append(worker)
                worker.rollback_add_todo_action(action)
                
                # EAGAIN = Resource temporarily unavailable, pipe is full, we roll back code on action for resend it next time
                if isinstance(e, IOError) and e.errno == errno.EAGAIN:
                    logger.info(u'[executor][%s] The worker [%s-%s] skip action because this pipe is full. It has [%d] actions ([%d] sent this turn). '
                                u'The action [%s] will be resend next turn.' % (self.name, worker.worker_type, worker.id, worker.nb_doing_action, worker.action_counter_by_tick, action.command_name))
                else:
                    worker.terminate()
                    logger.error('[executor][%s] Worker [%s-%d] failed to add todo action, error: [%s]' % (self.name, worker.worker_type, worker.id, e))
                continue
            
            if self.debug_display_launched_command:
                self.command_logger.debug('[%d][%s] Command launched [ %s ]' % (time.time(), self.name, action.command))
            t2 = time.time()
            actions_to_remove_from_actions_to_queue.append(action)
            t3 = time.time()
            _cum_get_worker += (t1 - t0)
            _cum_add_todo += (t2 - t1)
            _cum_append += (t3 - t2)
        
        # The removal is done after the loop, so the queue is never modified while it is iterated
        t0 = time.time()
        for action in actions_to_remove_from_actions_to_queue:
            self._actions_to_give_at_worker.remove(action)
        t1 = time.time()
        logger.log(_BEACON_LOG_LEVEL, '[beacon] 062_put_actions_in_workers_queue_really_remove part [%.3f]' % (t1 - t0))
        logger.log(_BEACON_LOG_LEVEL, '[beacon] 062_put_actions_in_workers_queue_really_cum_get_worker [%.3f]' % _cum_get_worker)
        logger.log(_BEACON_LOG_LEVEL, '[beacon] 062_put_actions_in_workers_queue_really_cum_add_todo [%.3f]' % _cum_add_todo)
        logger.log(_BEACON_LOG_LEVEL, '[beacon] 062_put_actions_in_workers_queue_really_cum_append [%.3f]' % _cum_append)
        
        worker_stats = []
        total = 0
        for _worker in self._get_all_workers():  # type: Worker
            q_sz = 0
            # Not every pipe implementation exposes get_queues_size: 0 is reported when it is missing
            if hasattr(_worker.pipe_out_actions_done_by_worker_to_send_at_executor, u'get_queues_size'):
                q_sz = _worker.pipe_out_actions_done_by_worker_to_send_at_executor.get_queues_size()
            worker_stats.append(u'[ WORKER %s: %s done this turn / %s total pending queues_size: %s ]' % (_worker.id, _worker.action_counter_by_tick, _worker.nb_doing_action, q_sz))
            total += _worker.nb_doing_action
        in_daemon = len(self._actions_to_give_at_worker)
        total += in_daemon
        logger.info(u'[STATS] action in main daemon to be dispatched to workers: [ %s ], distribution by worker %s total: [ %s ]' % (in_daemon, u' '.join(worker_stats), total))
    
    
    def _select_worker_for_next_action(self, action, worker_to_skip):
        # type: (Any, Optional[List[Worker]]) -> Optional[Worker]
        all_workers = self._get_all_workers()
        nb_worker = len(all_workers)
        
        for i in xrange(nb_worker):
            self._round_robin_counter = (self._round_robin_counter + 1) % nb_worker
            worker = all_workers[self._round_robin_counter]
            
            if action.module_type != worker.worker_type or worker in worker_to_skip:
                continue
            
            # Fork worker are able to export execution time, use it to choose valid worker
            if worker.worker_type == 'fork' and worker.load_todo_actions.value >= WORKER_MAX_LOAD_QUEUE:
                continue  # skip this fork worker as it's too busy
            # ok so a valid worker
            return worker
        
        return None
    
    
    """ ----------------- LOOP P7 - send_workers_result_to_schedulers ----------------- """
    
    
    # Return the action to scheduler and clean them REF: doc/shinken-action-queues.png (6)
    def send_workers_result_to_schedulers(self, sat_entry):
        """Post to one scheduler the results waiting in sat_entry['wait_homerun'].
        
        The waiting results are taken out of sat_entry['wait_homerun'] under sat_entry['wait_homerun_lock'],
        then posted on 'put_results'. Whenever the post did not succeed (no connection, falsy answer or
        exception raised before the post succeeded) the results are put back in sat_entry['wait_homerun'],
        so they can be sent again later.
        
        :param sat_entry: scheduler entry, the keys 'active', 'wait_homerun_lock', 'wait_homerun', 'con' and 'name' are used
        :return: None
        :raise Exception: any exception that is not HTTPExceptions, KeyError or AttributeError is logged and re-raised
        """
        # If sched is not active, I do not try return
        if not sat_entry['active']:
            return
        
        # Take the waiting results: new results can be queued by others while we are sending these ones
        with sat_entry['wait_homerun_lock']:
            done_actions = sat_entry['wait_homerun']
            sat_entry['wait_homerun'] = {}
        
        if not done_actions:
            return
        
        
        def _give_back_results():
            # Put the not sent results back in the waiting dict
            with sat_entry['wait_homerun_lock']:
                sat_entry['wait_homerun'].update(done_actions)
        
        
        send_ok = False
        try:
            start_time = time.time()
            scheduler_connection = sat_entry['con']
            if scheduler_connection is not None:  # None = not initialized
                send_ok = scheduler_connection.post('put_results', {'results': done_actions.values()})
                if self.do_checks:
                    logger.info(u'%s %s %s %d check\'s result(s) sends to this scheduler in [%.3f]s' % (
                        CHAPTER_CHECKS_RESULT, get_chapter_string(sat_entry['name']), CHAPTER_PUSHED, len(done_actions), time.time() - start_time))
                elif self.do_actions:
                    nb_action_event = 0
                    nb_action_notification = 0
                    for action in done_actions.values():
                        if action.is_a == ACTION_TYPES.EVENTHANDLER:
                            nb_action_event += 1
                        elif action.is_a == ACTION_TYPES.NOTIFICATION:
                            nb_action_notification += 1
                    logger.info(u'%s %s %s %d action\'s result(s) sends to this scheduler in [%.3f]s [%d notifications / %d events] ' % (
                        CHAPTER_ACTION_RESULT, get_chapter_string(sat_entry[u'name']), CHAPTER_PUSHED, len(done_actions), time.time() - start_time, nb_action_notification, nb_action_event))
        # Not connected or sched is gone
        except (HTTPExceptions, KeyError) as exp:
            # The results were already taken out of wait_homerun: without this they would be lost.
            # If the post already succeeded they must not be sent twice, so they are not given back.
            if not send_ok:
                _give_back_results()
            logger.error('[executor] Send workers result to schedulers[%s] fail, we will try to reconnect to scheduler %s,%s ' % (sat_entry['name'], type(exp), str(exp)))
            self.pynag_con_init(sat_entry)
            return
        except AttributeError as exp:  # the scheduler must not be initialized
            logger.error('[executor] Send workers result to schedulers[%s] fail, the scheduler must not be initialized this error is ignore. %s,%s ' % (sat_entry['name'], type(exp), str(exp)))
        except Exception as exp:
            # Same as above: do not lose the results because of an unexpected error
            if not send_ok:
                _give_back_results()
            logger.error('[executor] Send workers result to schedulers[%s] fail. %s,%s ' % (sat_entry['name'], type(exp), str(exp)))
            raise
        
        # We clean ONLY if sending is OK
        if send_ok:
            # we can exit, we already clean the home_return
            return
        
        # The send did not succeed: we must reinsert the done_actions and reconnect
        _give_back_results()
        logger.warning('[executor] Send workers result to schedulers[%s] fail, we will try to reconnect to scheduler' % (sat_entry['name']))
        self.pynag_con_init(sat_entry)
    
    
    """ ----------------- LOOP P8 - _get_objects_from_external_queues ----------------- """
    
    
    # Get 'objects' from external modules. For now nobody uses it, but it can be useful for a module like livestatus to raise external commands, for example
    def _get_objects_from_external_queues(self):
        for f in self.modules_manager.get_external_from_queues():
            full_queue = True
            while full_queue:
                try:
                    o = f.get(block=False)
                    self.add(o)
                except Empty:
                    full_queue = False
    
    
    #########################################################
    # # Worker Restarts
    #########################################################
    def _register_worker_restart(self, worker_type, reason=''):
        with self.last_restarts_lock:
            if worker_type not in self.last_restarts:
                self.last_restarts[worker_type] = []
            self.last_restarts[worker_type].append({'timestamp': time.time(), 'reason': reason.rstrip()})
    
    
    def _clean_worker_restart(self):
        with self.last_restarts_lock:
            now_ts = time.time()
            clean_ts = now_ts - self.last_restarts_keep
            
            for last_restarts in self.last_restarts.values():
                delete_count = 0
                
                for restart_ts in last_restarts:
                    if restart_ts < clean_ts:
                        delete_count += 1
                    else:
                        break
                
                del last_restarts[:delete_count]
    
    
    def get_jobs_from_distant(self, e):
        """Do nothing: this implementation ignores its argument and returns None."""
        return None
