#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Copyright (C) 2009-2012:
#     Gabes Jean, naparuba@gmail.com
#     Gerhard Lausser, Gerhard.Lausser@consol.de
#     Gregory Starck, g.starck@gmail.com
#     Hartmut Goebel, h.goebel@goebel-consult.de
#
# This file is part of Shinken.
#
# Shinken is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Shinken is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with Shinken.  If not, see <http://www.gnu.org/licenses/>.

import base64
import cPickle
import cStringIO
import copy
import itertools
import json
import logging
import os
import shutil
import tempfile
import threading
import time
import traceback
import zlib
from Queue import Empty
from datetime import date

from shinken.action import ACTION_TYPES
from shinken.brok import Brok
from shinken.check import Check, CHECK_CAUSE, CHECK_STATUS
from shinken.comment import Comment  # Still load so pickle won't have a problem finding them if it load an old retention
from shinken.contactdowntime import ContactDowntime
from shinken.downtime import Downtime
from shinken.eventhandler import EventHandler
from shinken.external_command import ExternalCommand
from shinken.http_client import HTTPExceptions
from shinken.load import Load, AvgInRange
from shinken.log import logger, get_chapter_string, get_section_string
from shinken.notification import Notification
from shinken.objects.proxyitem import proxyitemsmgr, proxyitemsgraph
from shinken.rawdata import RawData
from shinken.external_command import ExternalCommandManager
from shinken.configuration_incarnation import PartConfigurationIncarnation

# Path pattern of the scheduler stat file; the %d placeholder is filled in by code
# outside this section (presumably with the scheduler id -- TODO confirm).
_PATH_SCHEDULER_STAT_PATTERN = "/var/lib/shinken/scheduler_stat-%d.dat"
# Execution stats part
# The execution stats of the pollers & reactionners are saved as retention in a file,
# one file for each scheduler of the local system
_PATH_EXEC_STAT_PATTERN = "/var/lib/shinken/scheduler_exec_stat-%d.dat"
# If we find the old stat file (< 2.5.0) then we move it to the new path and load it
_OLD_PATH_EXEC_STAT = "/var/lib/shinken/scheduler_exec_stat.dat"
_DEFAULT_ACTION_CPU_TIME = 0.1
# Key of the default entry of Scheduler._exec_stat (the value itself is an opaque token)
_DEFAULT_EXECUTOR = "iQIcBAABCAAGBQJXmOX0AAoJEDjbvchgkmk"
# Keys used inside each entry of Scheduler.providers
_PROVIDERS_KEY_RAW_DATAS = 'raw_datas'
_PROVIDERS_KEY_INITAL_DONE = 'has_initial_raw_data'
_BEACON_LOG_LEVEL = logging.NOTSET
_KEEP_CHECKS_WARNING_THRESHOLD_CPU_USAGE_TIME = 20  # 20min

# Numeric state id -> displayed status. Note that host ids 1 and 2 both map to "DOWN".
_HOST_STATUS_ID_TO_STATUS = {0: "UP", 1: "DOWN", 2: "DOWN", 3: "UNKNOWN"}
_SERVICE_STATUS_ID_TO_STATUS = {0: "OK", 1: "WARNING", 2: "CRITICAL", 3: "UNKNOWN"}

# Debug flag: True only when the environment variable is exactly '1'
MONITORING_CHECK_CONSUME_DEBUG_FLAG = os.environ.get('MONITORING_CHECK_CONSUME_DEBUG_FLAG', '0') == '1'

# Module-level timing accumulators (seconds), updated by Scheduler.add_Check and Scheduler.add_Brok
get_new_actions_stats = {}
ADD_CHECKS_STATS = {'nb_add': 0, 'total_time': 0.0, 'lock_time': 0.0, 'set_time': 0.0, 'dispatch_time': 0.0, 'hash_time': 0.0, 'brok_create_time': 0.0, 'brok_add_time': 0.0}
ADD_BROKS_STATS = {'nb_add': 0, 'total_time': 0.0, 'sat_lock_time': 0.0, 'brokers_lock_time': 0.0, 'to_one_broker_time': 0.0, 'to_all_brokers_time': 0.0, 'to_global_list_time': 0.0}

# Debug flag: True only when the environment variable is exactly '1'
LOG_SCHEDULER_RECURRENT_TIMES = os.environ.get('SHINKEN_LOG_SCHEDULER_RECURRENT_TIMES_FLAG', '0') == '1'

# Pre-built chapter/section prefixes used at the beginning of the scheduler log lines
CHAPTER_CONFIGURATION = get_chapter_string('CONFIGURATION')
CHAPTER_STATS = get_chapter_string('STATS')
CHAPTER_CHECKS_AND_NOTIF = get_chapter_string('check/notification/event handler')
SECTION_BROKERS = get_section_string('BROKERS')
SECTION_POLLERS_REACTIONNERS = get_section_string('POLLERS/REACTIONNERS')
SECTION_GET = get_section_string('GET')


class Scheduler:
    """Scheduling core owned by the scheduler daemon: holds the check, action, brok and downtime queues and the table of recurrent works."""
    
    
    def __init__(self, scheduler_daemon):
        """Build an empty scheduler bound to its daemon.

        Only in-memory structures are created here: locks, queues, counters,
        averages and the table of recurrent works. No monitoring configuration
        is attached yet (configuration_incarnation is None). The last step
        calls self._load_scheduler_stat().

        :param scheduler_daemon: the owning daemon, kept as self.sched_daemon; its
            check_for_system_time_change method is registered as a recurrent work.
        """
        self.sched_daemon = scheduler_daemon
        # When set to False by us, we die and the arbiter launches a new Scheduler
        self.must_run = True
        # Set when the retention was read so the scheduler is ready
        self.scheduler_is_ready = False
        
        # While a configuration is being loaded, we should avoid some errors that occur because
        # some structures are not all updated yet, like rogue pollers/reactionners
        self._new_configuration_load_in_progress = False
        
        # By default we have no configuration, so no incarnation either
        self.configuration_incarnation = None
        self.part_configuration_incarnation = None
        
        # protects the waiting_results list below
        self.waiting_results_lock = threading.RLock()
        self.waiting_results = []  # satellites return us results
        # and to not wait for them, we put them here and
        # use them later
        
        # Every N seconds we call functions like consume, del zombies
        # etc. All of these functions are in recurrent_works, stored as
        # (name, function, tick) where tick is the period and must be an integer > 0
        # The order is important, so the keys are ints.
        # TODO: at load, change value by configuration one (like reaper time, etc)
        self.recurrent_works = {
            0 : ('update_downtimes', self.update_downtimes, 1),
            1 : ('schedule', self.schedule, 1),  # just schedule
            2 : ('consume_results', self.consume_results, 1),  # incorporate checks and dependencies
            3 : ('get_new_actions', self.get_new_actions, 1),  # now get the news actions (checks, notif) raised
            4 : ('get_new_broks', self.get_new_broks, 1),  # and broks
            5 : ('get_new_raw_datas', self.get_new_raw_datas, 1),  # and raw data
            6 : ('scatter_master_notifications', self.scatter_master_notifications, 1),
            7 : ('delete_zombie_checks', self.delete_zombie_checks, 1),
            8 : ('delete_zombie_actions', self.delete_zombie_actions, 1),
            # 3: (self.delete_unwanted_notifications, 1),
            9 : ('check_freshness', self.check_freshness, 10),
            10: ('clean_caches', self.clean_caches, 1),
            # default tick; load_conf() replaces it with conf.retention_update_interval * 60
            11: ('update_retention_file', self.update_retention_file, 3600),
            12: ('check_orphaned', self.check_orphaned, 60),
            # For NagVis like tools: update our status every 10s
            13: ('get_and_register_update_program_status_brok', self.get_and_register_update_program_status_brok, 10),
            # Check for system time change. And AFTER get new checks
            # so they are changed too.
            14: ('check_for_system_time_change', self.sched_daemon.check_for_system_time_change, 1),
            # launch if need all internal checks
            15: ('manage_internal_checks', self.manage_internal_checks, 1),
            # launch automatic ack computations
            16: ('compute_automatic_acknowledge', self.compute_automatic_acknowledge, 1),
            # launch automatic flapping computations
            17: ('compute_automatic_flapping', self.compute_automatic_flapping, 1),
            # launch automatic root problems computations
            18: ('compute_cluster_root_problems', self.compute_cluster_root_problems, 1),
            
            # NOTE: 19 is free, was clean_queues
            
            # Look for new business_impact change by modulation every minute
            20: ('update_business_values', self.update_business_values, 60),
            # Reset the topology change flag if need
            21: ('reset_topology_change_flag', self.reset_topology_change_flag, 1),
            22: ('check_for_expire_acknowledge', self.check_for_expire_acknowledge, 1),
            23: ('send_broks_to_modules', self.send_broks_to_modules, 1),
            24: ('get_objects_from_from_queues', self.get_objects_from_from_queues, 1),
            
            25: ('_clean_exec_stat', self._clean_exec_stat, 60),
            26: ('clean_incidents', self.clean_incidents, 5),
            27: ('compute_cluster_flapping', self.compute_cluster_flapping, 60),
        }
        
        # stats part
        self.scheduler_stat = {}
        self._nb_checks_with_stat_send = 0
        self.raw_nb_checks_to_send = 0
        self.raw_nb_notification_to_send = 0
        self.raw_cpu_time_checks_to_send = 0
        
        self.avg_checks_todo_by_sec = AvgInRange(60)
        
        self.avg_notification_todo_by_sec = AvgInRange(60)
        self.avg_cpu_time_checks_to_send = AvgInRange(60)
        self.avg_total_checks_send = AvgInRange(60)
        self.avg_total_checks_received = AvgInRange(60)
        
        self.avg_nb_checks_send = {}
        
        # checks stats by cause
        ## raw stats
        self.raw_nb_checks_received_schedule = 0
        self.raw_nb_checks_received_force = 0
        self.raw_nb_checks_received_retry = 0
        self.raw_nb_checks_received_dependency = 0
        ## computed avg
        self.avg_checks_received_schedule_by_sec = AvgInRange(60)
        self.avg_checks_received_force_by_sec = AvgInRange(60)
        self.avg_checks_received_retry_by_sec = AvgInRange(60)
        self.avg_checks_received_dependency_by_sec = AvgInRange(60)
        
        self.avg_cpu_time_checks_send = {}
        self.avg_nb_checks_received = {}
        self.avg_cpu_time_checks_received = {}
        
        self._executor_type = {}
        # executor name -> {'type', 'tag', 'realm'}, filled by _add_execturor_stat()
        self.stat_by_executor = {}
        self.late_checks_by_tags = {}
        self.checks_warning_threshold_cpu_usage = []
        
        # executor key -> {action hash -> stats}; add_Check() reads 'action_cpu_time' from it
        self._exec_stat = {_DEFAULT_EXECUTOR: {}}
        self.nb_broks_send = 0
        self.nb_raw_datas_send = 0
        
        # Log init
        logger.load_obj(self)
        
        self.instance_id = 0  # Temporary value. Will be overwritten by load_conf()
        
        # Our queues
        self.checks_n_actions_lock = threading.RLock()
        self.checks = {}  # check id -> check
        self.check_to_launch = {}  # int timestamp (t_to_go) -> list of checks
        self.actions = {}  # action id -> notification / event handler
        self.downtimes = {}
        self.contact_downtimes = {}
        self.broks = {}  # global brok queue, used while no broker is known
        self.raw_datas = {}  # global raw data queue, used while no provider is known
        self.hosts = ()
        self.clusters = {}
        # Some flags
        self.has_full_broks = False  # have a initial_broks in broks queue?
        self.need_objects_dump = False  # set by signal 2
        
        # And a dummy push flavor
        self.push_flavor = 0
        
        # Now fake initialize for our satellites
        self.brokers = {}  # broker name -> {'broks': {...}, 'has_full_broks': bool}
        self.brokers_lock = threading.RLock()
        self.providers = {}
        self.providers_lock = threading.RLock()
        self.pollers = {}
        self.reactionners = {}
        self.pollers_name = set()
        self.reactionners_name = set()
        self.rogue_satellites_lock = threading.RLock()
        self.rogue_satellites = {}
        
        # Keep up counters
        self.nb_scheduled = 0
        self.nb_late = 0
        # NOTE(review): already initialised in the stats part above, this second assignment is redundant
        self.late_checks_by_tags = {}
        self.nb_inpoller = 0
        self.nb_zombies = 0
        self.nb_notifications = 0
        self.lat_avg, self.lat_min, self.lat_max = 0, 0, 0
        
        # Keep a trace of loop time
        self.loop_time_avg = AvgInRange(60)
        
        # Keep a set of our uuids (hosts+services)
        self.elements_uuids = set()
        
        self.skip_rentention_save = False
        
        self._retention_data = None
        self._save_retention_thread = None
        self._delete_old_retention_thread = None
        self._load_scheduler_stat()
    
    
    # We are getting a new configuration: either we are going to be a spare,
    # or we are loading a new configuration and will keep running.
    # * spare: drop everything
    # * active: keep the waiting results so we do not lose notifications and checks
    def _reset(self, is_going_to_spare=False):
        self.must_run = True
        if is_going_to_spare:
            with self.waiting_results_lock:
                del self.waiting_results[:]
        with self.checks_n_actions_lock:
            for o in self.checks, self.check_to_launch, self.actions, self.downtimes, self.contact_downtimes, self.broks, self.raw_datas, self.brokers, self.providers, self.clusters:
                o.clear()
            self.nb_late = 0
            self.late_checks_by_tags = 0
            self.services = ()
            self.hosts = ()
        with self.rogue_satellites_lock:
            self.rogue_satellites.clear()
    
    
    # The daemon is warning us that a new configuration is being load, so beware of some structures that can be
    # not finish to load, like rogue satellites
    def warn_about_a_new_configuration_load_in_progress(self):
        """Raise the flag telling that a new configuration is currently being loaded."""
        self._new_configuration_load_in_progress = True
    
    
    # The daemon is warning us the new configuration load is finish
    def warn_about_the_end_of_the_configuration_load(self):
        """Clear the flag raised by warn_about_a_new_configuration_load_in_progress()."""
        self._new_configuration_load_in_progress = False
    
    
    def get_current_satellites(self):
        broker_names = self.brokers.keys()
        receiver_names = []  # currently void
        poller_names = list(self.pollers_name)
        reactionner_names = list(self.reactionners_name)
        return {
            'broker'     : broker_names,
            'receiver'   : receiver_names,
            'poller'     : poller_names,
            'reactionner': reactionner_names,
        }
    
    
    def create_broker_entry(self, broker_name):
        with self.sched_daemon.satellite_lock:
            if broker_name in self.brokers:  # already have
                return
            logger.info("%s %s [%10s] This new broker contacted us. Creating a brok queue for this new broker." % (CHAPTER_CONFIGURATION, SECTION_BROKERS, broker_name))
            self.brokers[broker_name] = {'broks': {}, 'has_full_broks': False}
    
    
    def remove_broker(self, broker_name):
        """Forget a broker and its brok queue; an unknown broker is only logged.

        :param broker_name: name of the broker the arbiter asks us to remove
        """
        with self.sched_daemon.satellite_lock:
            if broker_name in self.brokers:
                # Ok we have it, clean it
                del self.brokers[broker_name]
                logger.info('%s %s [%10s] The arbiter asks us to remove broker "%s" that is no more need.' % (CHAPTER_CONFIGURATION, SECTION_BROKERS, broker_name, broker_name))
            else:
                logger.info('%s %s [%10s] The arbiter asks us to remove broker "%s" but we does not have it. Skipping order.' % (CHAPTER_CONFIGURATION, SECTION_BROKERS, broker_name, broker_name))
    
    
    def create_providers_entry(self, provider_name):
        """Currently a no-op: returns None without registering anything for provider_name."""
        return
    
    
    def set_as_inactive(self, instance_name, push_flavor, configuration_incarnation, is_spare):
        """Put the scheduler in an inactive state (spare or idle) and drop its runtime state.

        :param instance_name: name to keep as self.instance_name
        :param push_flavor: flavor of the received shard, kept as self.push_flavor
        :param configuration_incarnation: kept as self.configuration_incarnation
        :param is_spare: only changes the text of the log line (spare versus idle)
        """
        if is_spare:
            role_message = 'Setting us as a spare'
        else:
            role_message = 'Setting us as idle until we receive an active shard.'
        logger.info('[CONFIGURATION] Did receive shard flavor [%s]. %s' % (push_flavor, role_message))
        # We are not activated: reset everything, pending satellite results included
        self._reset(is_going_to_spare=True)
        # Remember who we are and which shard flavor we were given
        self.instance_name = instance_name
        self.push_flavor = push_flavor
        self.configuration_incarnation = configuration_incarnation
    
    
    def load_configuration_from_arbiter(self, conf, push_flavor, pollers, reactionners, configuration_incarnation):
        """Replace the current state with a configuration pushed by the arbiter.

        The steps are order-dependent: reset the queues (waiting results are kept),
        load the configuration, register the satellites, load the execution stats,
        then wire the external command manager and build the part configuration incarnation.

        :param conf: configuration object to load (see load_conf)
        :param push_flavor: flavor of this configuration push
        :param pollers: pollers definitions, passed to load_satellites
        :param reactionners: reactionners definitions, passed to load_satellites
        :param configuration_incarnation: kept as self.configuration_incarnation
        """
        # we give sched it's conf
        start = time.time()
        self._reset()
        self.load_conf(conf, push_flavor)
        self.load_satellites(pollers, reactionners)
        self._load_exec_stat()
        
        # Now create the external commander
        # it's an applyer: its role is not to dispatch commands, but to apply them
        external_commander = ExternalCommandManager(conf, 'applyer')
        
        # Scheduler need to know about external command to activate it if necessary
        self.load_external_command(external_commander)
        
        # External command need the sched because he can raise checks
        external_commander.load_scheduler(self)
        
        self.conf.push_flavor = push_flavor
        self.configuration_incarnation = configuration_incarnation
        # instance_id and instance_name were set by load_conf() above
        self.part_configuration_incarnation = PartConfigurationIncarnation(self.configuration_incarnation, self.instance_id, self.instance_name)
        
        logger.info("[schedulerdeamon] The configuration %s was loaded in [%s]s" % (self.part_configuration_incarnation, time.time() - start))
    
    
    # Load conf for future use we are in_test if the data are from an arbiter object like, so only for tests
    def load_conf(self, conf, push_flavor, in_test=False):
        """Attach a configuration to the scheduler and prepare its objects for running.

        Copies the object collections of conf onto self, builds their reversed
        lists, refills the running properties of services, hosts and contacts,
        detects the clusters, tags hosts and services with our instance_id,
        resolves the static macros of notes_url / notes_multi_url and finally
        updates the tick of the retention update recurrent work.

        :param conf: configuration object received for this scheduler
        :param push_flavor: flavor of this configuration push, kept as self.push_flavor
        :param in_test: when True, conf.late_linkify() is not called
        """
        self.stat_by_executor = {}
        self.program_start = int(time.time())
        self.conf = conf
        # We must update our Config dict macro with good value from the config parameters
        self.conf.fill_resource_macros_names_macros()
        self.hostgroups = conf.hostgroups
        self.hostgroups.create_reversed_list()
        self.services = conf.services
        # Refil time: 0.5s by 1K checks
        self.services.refil_running_properties()  # the arbiter did not create some state properties, we must have them now
        
        # We need reversed list for search in the retention
        # file read
        self.services.create_reversed_list()
        self.services.optimize_service_search(conf.hosts)
        self.hosts = conf.hosts
        self.hosts.refil_running_properties()  # the arbiter did not create some state properties, we must have them now
        self.hosts.create_reversed_list()
        
        self.notificationways = conf.notificationways
        self.checkmodulations = conf.checkmodulations
        self.macromodulations = conf.macromodulations
        self.contacts = conf.contacts
        self.contacts.refil_running_properties()  # the arbiter did not create some state properties, we must have them now
        self.contacts.create_reversed_list()
        self.contactgroups = conf.contactgroups
        self.contactgroups.create_reversed_list()
        self.servicegroups = conf.servicegroups
        self.servicegroups.create_reversed_list()
        self.timeperiods = conf.timeperiods
        self.timeperiods.create_reversed_list()
        self.commands = conf.commands
        self.commands.create_reversed_list()
        
        # We must update our Config dict macro with good value from the config parameters because we use it to calculate notes_url and notes_multi_url
        # NOTE(review): this call is made three times in this method -- confirm each one is needed
        self.conf.fill_resource_macros_names_macros()
        
        # Clusters: every host or service with a business rule or flagged is_cluster, by instance uuid
        self.clusters = {}
        for potential_cluster in itertools.chain(self.hosts, self.services):
            if potential_cluster.got_business_rule or potential_cluster.is_cluster:
                self.clusters[potential_cluster.get_instance_uuid()] = potential_cluster
        
        # Log to the admin how much elements we did load in this configuration
        nb_clusters = len(self.clusters)
        logger.info('[ CONFIGURATION ] LOADED => clusters[ %d ] -- hosts[ %d ] -- hostgroups[ %d ] -- checks[ %d ] --commands[ %d ] -- contacts[ %d ]  -- contacgroups[ %d ]' % (
            nb_clusters, len(self.hosts) - nb_clusters, len(self.hostgroups), len(self.services), len(self.commands), len(self.contacts), len(self.contactgroups)))
        
        if not in_test:
            # Commands in the host/services/contacts are not real one
            # we must relink them
            t0 = time.time()
            self.conf.late_linkify()
            # %d truncates the duration to whole seconds
            logger.debug("Late command relink in %d" % (time.time() - t0))
        
        # From Arbiter. Use for Broker to differentiate schedulers
        self.instance_id = conf.instance_id
        
        self.conf.fill_resource_macros_names_macros()
        # Tag our hosts with our instance_id
        for h in self.hosts:
            h.instance_id = conf.instance_id
        for s in self.services:
            s.instance_id = conf.instance_id
        # self for instance_name
        self.instance_name = conf.instance_name
        # and push flavor
        self.push_flavor = push_flavor
        
        self.elements_uuids = set()  # be sure to reset it
        for h in self.hosts:
            self.elements_uuids.add(h.get_instance_uuid())
        for s in self.services:
            self.elements_uuids.add(s.get_instance_uuid())
        
        # Resolve static macro in notes_url / notes_multi_url of hosts and service
        for item in itertools.chain(self.hosts, self.services):
            notes_url = getattr(item, 'notes_url', None)
            if notes_url:
                data = item.get_data_for_checks()
                notes_url = self.sched_daemon.macro_resolver.resolve_simple_macros_in_string(notes_url, data, only_static_macros=True)
                item.notes_url = notes_url
            notes_multi_url = getattr(item, 'notes_multi_url', None)
            if notes_multi_url:
                data = item.get_data_for_checks()
                notes_multi_url = self.sched_daemon.macro_resolver.resolve_simple_macros_in_string(notes_multi_url, data, only_static_macros=True)
                item.notes_multi_url = notes_multi_url
        
        # Now we can update our 'ticks' for special calls
        # like the retention one, etc
        # (retention_update_interval is multiplied by 60 to get the tick)
        self.update_recurrent_works_tick('update_retention_file', self.conf.retention_update_interval * 60)
    
    
    # Update the 'tick' for a function call in our recurrent work
    def update_recurrent_works_tick(self, f_name, new_tick):
        for i in self.recurrent_works:
            (name, f, old_tick) = self.recurrent_works[i]
            if name == f_name:
                logger.debug("Changing the tick to %d for the function %s" % (new_tick, name))
                self.recurrent_works[i] = (name, f, new_tick)
    
    
    # Load the pollers/reactionners from our app master
    def load_satellites(self, pollers, reactionners):
        self.pollers = pollers
        self.reactionners = reactionners
        # avoid having old satellites name in our sets
        self.pollers_name = set()
        self.reactionners_name = set()
        # reset the potential rogue rogue satellites
        self.rogue_satellites = {}
        # SEF-1143 add pollers and reactionners so stats can be aware about all configured satellites
        # add pollers information
        for poller in pollers.itervalues():
            self.pollers_name.add(poller['name'])
            self._add_execturor_stat(poller['name'], 'Poller', poller['poller_tags'], poller['realm'])
        # add reactionners information
        for reactionner in reactionners.itervalues():
            self.reactionners_name.add(reactionner['name'])
            self._add_execturor_stat(reactionner['name'], 'Reactionner', reactionner['reactionner_tags'], reactionner['realm'])
    
    
    # add executors definition for stats
    def _add_execturor_stat(self, id, type, tag, realm):
        stat_by_executor = self.stat_by_executor.get(id, {})
        stat_by_executor['type'] = type
        stat_by_executor['tag'] = ",".join(tag)
        stat_by_executor['realm'] = realm
        self.stat_by_executor[id] = stat_by_executor
    
    
    # Oh... Arbiter want us to die... To launch a new Scheduler
    # "Mais qu'a-t-il de plus que je n'ais pas?"
    # "But.. On which point it is better than me?"
    def die(self):
        """Ask the scheduler to stop by clearing the must_run flag."""
        self.must_run = False
    
    
    def dump_objects(self):
        """Write a text snapshot of the checks, actions and global broks into a file.

        The file is created in the system temporary directory and named
        'scheduler-obj-dump-<timestamp>'. Any error raised while writing is
        logged and not propagated.
        """
        dump_path = os.path.join(tempfile.gettempdir(), 'scheduler-obj-dump-%d' % time.time())
        logger.info('Opening the DUMP FILE %s' % (dump_path))
        try:
            # The context manager closes the file even when a write fails
            with open(dump_path, 'w') as dump_file:
                dump_file.write('Scheduler DUMP at %d\n' % time.time())
                # Checks and actions are read under their lock so the snapshot is consistent
                with self.checks_n_actions_lock:
                    for c in self.checks.values():
                        dump_file.write('CHECK: %s:%s:%s:%s:%s:%s\n' % (c.id, c.status, c.t_to_go, c.poller_tag, c.command, c.executor_id))
                    for a in self.actions.values():
                        dump_file.write('%s: %s:%s:%s:%s:%s:%s\n' % (a.__class__.my_type.upper(), a.id, a.status, a.t_to_go, a.reactionner_tag, a.command, a.executor_id))
                for b in self.broks.values():
                    dump_file.write('BROK: %s:%s\n' % (b.id, b.type))
        except Exception as exp:
            logger.error("Error in writing the dump file %s : %s" % (dump_path, str(exp)))
    
    
    # Load the external command
    def load_external_command(self, e):
        """Keep the external command manager as self.external_command.

        :param e: object exposing resolve_command(), used to apply external commands
        """
        self.external_command = e
    
    
    # We've got activity in the fifo, we get and run commands
    def run_external_commands(self, cmds):
        for command in cmds:
            self.run_external_command(command)
    
    
    def run_external_command(self, command):
        """Wrap a raw command into an ExternalCommand and let the command manager resolve it.

        :param command: raw external command
        """
        logger.debug("scheduler resolves command '%s'" % command)
        wrapped_command = ExternalCommand(command)
        command_manager = self.external_command
        command_manager.resolve_command(wrapped_command)
    
    
    # add_Brok is a bit more complex than the others: at startup, while no broker is known, the broks are put in the global queue self.broks.
    # When the first broker connects, its initial broks are generated in its own queue (so broker_name is not None).
    # In "normal" run, we just need to put the brok in the queue of every known broker.
    def add_Brok(self, brok, broker_name=None):
        """Tag a brok with our part configuration incarnation and queue it for the broker(s).

        :param brok: brok to queue, stored under brok.id
        :param broker_name: when set, only the queue of this broker receives the brok;
            otherwise every known broker receives it, or the global self.broks queue
            when no broker is known yet.

        The time spent in each step is accumulated in the module-level ADD_BROKS_STATS dict.
        """
        stats = ADD_BROKS_STATS
        stats['nb_add'] += 1
        
        lock_wait_start = time.time()
        # IMPORTANT: always take the satellite lock BEFORE the brokers lock
        with self.sched_daemon.satellite_lock:
            stats['sat_lock_time'] += time.time() - lock_wait_start
            
            brokers_wait_start = time.time()
            with self.brokers_lock:
                stats['brokers_lock_time'] += time.time() - brokers_wait_start
                
                dispatch_start = time.time()
                # The brok carries our instance_id and configuration_incarnation through this tag
                brok.set_part_configuration_incarnation(self.part_configuration_incarnation)
                if broker_name:
                    # Only for one broker
                    self.brokers[broker_name]['broks'][brok.id] = brok
                    timing_key = 'to_one_broker_time'
                elif len(self.brokers) > 0:
                    # For all the known brokers
                    for broker_entry in self.brokers.values():
                        broker_entry['broks'][brok.id] = brok
                    timing_key = 'to_all_brokers_time'
                else:
                    # No broker yet (maybe at startup, for logs): use the global queue,
                    # the first broker connection will get it all
                    self.broks[brok.id] = brok
                    timing_key = 'to_global_list_time'
                stats[timing_key] += time.time() - dispatch_start
            
            stats['total_time'] += time.time() - lock_wait_start
    
    
    def add_RawData(self, raw_data, _provider_name=None):
        raw_data.instance_id = self.instance_id
        if _provider_name:
            raw_datas = self.providers[_provider_name][_PROVIDERS_KEY_RAW_DATAS]
            raw_datas[raw_data.id] = raw_data
        else:
            if len(self.providers) > 0:
                for provider_name in self.providers:
                    raw_datas = self.providers[provider_name][_PROVIDERS_KEY_RAW_DATAS]
                    raw_datas[raw_data.id] = raw_data
            else:
                self.raw_datas[raw_data.id] = raw_data
    
    
    def add_Notification(self, notif):
        self.actions[notif.id] = notif
        self.raw_nb_notification_to_send += 1
        # A notification ask for a brok
        if notif.contact is not None:
            b = notif.get_initial_status_brok()
            self.add(b)
    
    
    # The time spent in each step of add_Check is accumulated in the module-level ADD_CHECKS_STATS dict
    def add_Check(self, check):
        """Register a check and put it in the launch bucket of its t_to_go second.

        If a check with the same id is already known, it is first removed from
        its own launch bucket (when it is still there), then replaced.
        The counters of checks to send are updated, and the time spent in each
        step is accumulated in the module-level ADD_CHECKS_STATS dict.

        :param check: check to register, stored under check.id
        """
        ADD_CHECKS_STATS['nb_add'] += 1
        before_checks_n_actions_lock = time.time()
        with self.checks_n_actions_lock:
            ADD_CHECKS_STATS['lock_time'] += time.time() - before_checks_n_actions_lock
            
            before_set = time.time()
            old_check = self.checks.get(check.id, None)
            if old_check:
                try:
                    self.check_to_launch[int(old_check.t_to_go)].remove(old_check)
                except (KeyError, ValueError):
                    # KeyError: no bucket for this second; ValueError: the bucket exists but the
                    # old check is not in it. In both cases there is nothing to remove
                    # (same handling as in on_reschedule).
                    pass
            self.checks[check.id] = check
            ADD_CHECKS_STATS['set_time'] += time.time() - before_set
            
            before_dispatch_time = time.time()
            launch_second = int(check.t_to_go)
            # Try first instead of a preventive test, for performance reasons
            try:
                self.check_to_launch[launch_second].append(check)
            except KeyError:
                self.check_to_launch[launch_second] = [check]
            ADD_CHECKS_STATS['dispatch_time'] += time.time() - before_dispatch_time
            
            before_hash = time.time()
            self.raw_nb_checks_to_send += 1
            action_hash = check.get_hash()
            # Add the known cpu time of this action, if we have execution stats for its hash
            if action_hash and action_hash in self._exec_stat[_DEFAULT_EXECUTOR]:
                self.raw_cpu_time_checks_to_send += self._exec_stat[_DEFAULT_EXECUTOR][action_hash]['action_cpu_time']
            ADD_CHECKS_STATS['hash_time'] += time.time() - before_hash
        
        ADD_CHECKS_STATS['total_time'] += time.time() - before_checks_n_actions_lock
    
    
    def add_EventHandler(self, action):
        """Register an event handler in the actions queue and log it at debug level.

        :param action: event handler to register, stored under action.id
        """
        self.actions[action.id] = action
        log_arguments = (action.id, action.ref.get_full_name())
        logger.debug('[EVENTHANDLER] registering in the scheduler queues the event handler number %d (on the object %s)' % log_arguments)
    
    
    def add_Downtime(self, dt):
        """Register a downtime in self.downtimes under its id."""
        self.downtimes[dt.id] = dt
    
    
    def add_ContactDowntime(self, contact_dt):
        """Register a contact downtime in self.contact_downtimes under its id."""
        self.contact_downtimes[contact_dt.id] = contact_dt
    
    
    def add_Comment(self, comment):
        """No-op registered as the handler of Comment objects: the comment is ignored."""
        return
    
    
    # Ok one of our modules send us a command? just run it!
    def add_ExternalCommand(self, ext_cmd):
        """Hand an external command object straight to the command manager for resolution."""
        self.external_command.resolve_command(ext_cmd)
    
    
    # Schedulers have some queues. We can simplify call by adding
    # elements into the proper queue just by looking at their type
    # Brok -> self.broks
    # Check -> self.checks
    # Notification -> self.actions
    # Downtime -> self.downtimes
    # ContactDowntime -> self.contact_downtimes
    def add(self, elt):
        f = self.__add_actions.get(elt.__class__, None)
        if f:
            # print("found action for %s: %s" % (elt.__class__.__name__, f.__name__))
            f(self, elt)
    
    
    # Dispatch table used by add(): exact class of the element -> handler.
    # The handlers are the plain functions defined above (taken at class-definition time),
    # which is why add() calls them as f(self, elt).
    __add_actions = {
        Check          : add_Check,
        Brok           : add_Brok,
        RawData        : add_RawData,
        Notification   : add_Notification,
        EventHandler   : add_EventHandler,
        Downtime       : add_Downtime,
        ContactDowntime: add_ContactDowntime,
        Comment        : add_Comment,  # Disable, do nothing
        ExternalCommand: add_ExternalCommand,
    }
    
    
    def on_reschedule(self, old_t_to_go, t_to_go, check):
        """Move `check` from the `old_t_to_go` bucket of self.check_to_launch to the `t_to_go` bucket.

        A missing old bucket, or a check that is not in it, is not an error.
        The destination bucket is created when it does not exist yet.
        """
        launch_table = self.check_to_launch
        try:
            launch_table[old_t_to_go].remove(check)
        except (KeyError, ValueError):
            # Nothing to remove: the check was not registered at its old time
            pass
        launch_table.setdefault(t_to_go, []).append(check)
    
    
    # We call the function of modules that got the
    # hook function
    # TODO: find a way to merge this and the version in daemon.py
    def hook_point(self, hook_name):
        """Call the `hook_<hook_name>` function of every alive module instance that defines one.

        Each hook is called with this scheduler as its only argument.
        A hook that raises is logged (with its traceback), reported to the modules
        manager through did_crash(), and the loop goes on with the next instance,
        except for the 'save_retention' hook where the failure is re-raised.

        :param hook_name: hook name without the 'hook_' prefix
        :return: the value returned by the last hook that ended without raising,
                 False when no hook returned
        :raises RuntimeError: when a 'save_retention' hook raised; the message is str() of the original exception
        """
        done = False
        full_hook_name = 'hook_' + hook_name
        modules_manager = self.sched_daemon.modules_manager
        for inst in modules_manager.get_all_alive_instances():
            if not hasattr(inst, full_hook_name):
                continue
            f = getattr(inst, full_hook_name)
            try:
                logger.debug("hook_point: %s %s" % (inst.get_name(), hook_name))
                done = f(self)
            except Exception as exp:
                # str(exp) instead of exp.message: .message is deprecated and is empty
                # for exceptions built with several arguments
                reason = str(exp)
                logger.error("The instance %s raise an exception %s. I disable it and set it to restart it later" % (inst.get_name(), reason))
                logger.error("Exception trace follows: %s" % (traceback.format_exc()))
                modules_manager.did_crash(inst, reason="The instance %s raise an exception %s." % (inst.get_name(), reason))
                if hook_name == 'save_retention':
                    # A failed retention save must not go unnoticed by the caller
                    raise RuntimeError(reason)
        return done
    
    
    # For tuning purposes we use caches, but we do not want them to grow without bound,
    # so we clean them
    def clean_caches(self):
        """Call clean_cache() on every timeperiod of self.timeperiods."""
        for timeperiod in self.timeperiods:
            timeperiod.clean_cache()
    
    
    # Ask item (host or service) an update_status
    # and add it to our broks queue
    def get_and_register_status_brok(self, item):
        """Ask `item` for its update-status brok and hand that brok to self.add()."""
        self.add(item.get_update_status_brok())
    
    
    # We do not want this downtime id
    def del_downtime(self, dt_id):
        """Drop the downtime `dt_id`: tell its ref to delete it, then forget it in self.downtimes.

        An unknown id is ignored.
        """
        if dt_id not in self.downtimes:
            return
        downtime = self.downtimes[dt_id]
        # The owner is updated first, the scheduler entry is removed afterwards
        downtime.ref.del_downtime(dt_id)
        del self.downtimes[dt_id]
    
    
    # We do not want this contact downtime id
    def del_contact_downtime(self, dt_id):
        """Drop the contact downtime `dt_id`: tell its ref to delete it, then forget it in self.contact_downtimes.

        An unknown id is ignored.
        """
        if dt_id not in self.contact_downtimes:
            return
        contact_downtime = self.contact_downtimes[dt_id]
        # The owner is updated first, the scheduler entry is removed afterwards
        contact_downtime.ref.del_downtime(dt_id)
        del self.contact_downtimes[dt_id]
    
    
    # We are looking for outdated acks, and if so, remove them
    def check_for_expire_acknowledge(self):
        """Call check_for_expire_acknowledge() on every host, then on every service."""
        for item in itertools.chain(self.hosts, self.services):
            item.check_for_expire_acknowledge()
    
    
    # We update all business_impact to look at new modulation
    # start for impacts, and so update broks status and
    # problems value too
    def update_business_values(self):
        """Recompute the business_impact of every host and service, and emit a status brok for each one that changed.

        Two passes are done: first the elements that are not problems (impacts and
        classic elements), then the problems, because the value of a problem depends
        on its impacts and so must be computed after them.
        """
        for want_problems in (False, True):
            for items in (self.hosts, self.services):
                # The selection is fully built before any element is updated
                selected = [elt for elt in items if bool(elt.is_problem) == want_problems]
                for elt in selected:
                    previous = elt.business_impact
                    elt.update_business_impact_value()
                    # The business_impact changed (for a problem: maybe one of its impacts
                    # raised its own value), so the broks must be updated
                    if elt.business_impact != previous:
                        self.get_and_register_status_brok(elt)
    
    
    # Each second we search for master notification that are scatterisable and we do the job
    # we take the sons and we put them into our actions queue
    def scatter_master_notifications(self):
        """Turn every due "master" notification into child notifications, then reschedule or retire the master.

        A master notification is a scheduled notification without contact: it is never
        sent itself, it is only used to create the "child" notifications (one per
        contact / notification command) that are executed in the reactionner.
        """
        now = time.time()
        for master in self.actions.values():
            # We only want notifications
            if master.is_a != ACTION_TYPES.NOTIFICATION:
                continue
            # ... that are scheduled, due, and without contact (so "master" ones)
            if not (master.status == 'scheduled' and master.t_to_go <= now and not master.contact):
                continue

            item = master.ref
            children = []
            if not item.notification_is_blocked_by_item(master.type, now):
                # Notifications of this type can be sent right now: create one per contact of the item
                children = item.scatter_notification(master)
                for child in children:
                    child.status = 'scheduled'
                    self.add(child)  # this will send a brok

            keep_master = False
            if master.type == 'PROBLEM':
                if len(children) != 0:
                    # notif_nb of the master was already current_notification_number+1:
                    # as notifications were sent, the counter of the host/service follows
                    item.current_notification_number = master.notif_nb
                # Only problems with a notification_interval are repeated
                keep_master = item.notification_interval != 0 and master.t_to_go is not None

            if keep_master:
                # Leave it in the actions list as "scheduled" so it will be found again later.
                # The item computes the next time: it can be t_to_go + the notification interval,
                # or sooner when an escalation must raise before
                master.previous_t_to_go = master.t_to_go
                master.t_to_go = item.get_next_notification_time(master)

                master.notif_nb = item.current_notification_number + 1
                master.status = 'scheduled'
            else:
                # Wipe out this master notification: a non repeated problem notification is enough,
                # and recover/downtime/flap/etc... are never repeated
                item.remove_in_progress_notification(master)
                self.actions[master.id].status = 'zombie'
    
    
    # Called by poller to get checks
    # Can get checks and actions (notifications and co)
    def get_to_run_checks(self, do_checks=False, do_actions=False, poller_tags=['None'], reactionner_tags=['None'], worker_name='none', module_types=['fork'], request_limit=-1, request_limit_cpu=-1, protect_from_rogues=True):
        """Return what _get_to_run_checks() selects for this worker, computed while holding checks_n_actions_lock.

        All parameters are forwarded unchanged to _get_to_run_checks(), `module_types`
        being passed as its `worker_types` parameter. The time spent waiting for the
        lock is logged: debug above 5ms, warning above 50ms.
        """
        lock_requested_at = time.time()
        with self.checks_n_actions_lock:
            waited = time.time() - lock_requested_at
            if waited > 0.05:
                logger.warning('get_to_run_checks long acquire lock time [%.3f] for worker [%s]' % (waited, worker_name))
            elif waited > 0.005:
                logger.debug('get_to_run_checks acquire lock time [%.3f] for worker [%s]' % (waited, worker_name))
            forwarded = {
                'do_checks'          : do_checks,
                'do_actions'         : do_actions,
                'poller_tags'        : poller_tags,
                'reactionner_tags'   : reactionner_tags,
                'worker_name'        : worker_name,
                'worker_types'       : module_types,
                'request_limit'      : request_limit,
                'request_limit_cpu'  : request_limit_cpu,
                'protect_from_rogues': protect_from_rogues,
            }
            return self._get_to_run_checks(**forwarded)
    
    
    # Forget the rogue satellites whose last recorded contact is older than `timeout` seconds
    def cleanup_rogue_satellite(self, timeout=60):
        """Forget the rogue satellites whose last recorded time is older than `timeout` seconds.

        self.rogue_satellites maps a satellite type to a {name: last recorded time} dict.
        The whole cleanup is done under rogue_satellites_lock, and all the entries are
        compared with the same timestamp, taken once when the call starts.

        :param timeout: maximum age, in seconds, of an entry that is kept
        """
        now = time.time()
        with self.rogue_satellites_lock:
            for satellite_type, satellite_info in self.rogue_satellites.items():
                # Names are collected first: the dict cannot be changed while it is iterated
                to_del = set(name for name, last_check_time in satellite_info.items() if now - last_check_time > timeout)
                if not to_del:
                    continue
                logger.debug("[scheduler][%s] cleanup rogue [%s] named [%s]" % (self.instance_id, satellite_type, to_del))
                for key in to_del:
                    del satellite_info[key]
    
    
    # Store satellites that shouldn't talk to us as rogue satellites
    def _add_rogue_satellite(self, satellite_type, satellite_name, now=None):
        # Maybe we are currently loading a configuration, so :
        # * we are still with self.pollers/reactionners not clean
        # * the poller/reactionner is still aware about us, so is talking with us
        # => wrong error, if it's a real thing, will be raise as soon as we finish to load
        #    the configuration
        if self._new_configuration_load_in_progress:
            return
        
        # maybe we did not have any conf at all (arbiter reset us)
        if self.sched_daemon.cur_conf is None:
            logger.debug("[scheduler][%s] the %s named [%s] send a check/notification but we have no conf" % (self.instance_id, satellite_type, satellite_name))
            return
        
        with self.rogue_satellites_lock:
            if now is None:
                now = time.time()
            if satellite_type not in self.rogue_satellites:
                self.rogue_satellites[satellite_type] = {}
            if satellite_name not in self.rogue_satellites[satellite_type]:
                self.rogue_satellites[satellite_type][satellite_name] = now
            else:
                last_check_time = self.rogue_satellites[satellite_type][satellite_name]
                # We previously store a date in future, the server probably update the date/time
                # clear the rogue cache by security
                if last_check_time > now:
                    self.rogue_satellites = {}
                else:
                    self.rogue_satellites[satellite_type][satellite_name] = now
                    logger.warning("[scheduler][%s] the %s named [%s] send a check/notification but shouldn't" % (self.instance_id, satellite_type, satellite_name))
    
    
    # Called by executor to get action to run.
    def _get_to_run_checks(self, do_checks=False, do_actions=False, poller_tags=['None'], reactionner_tags=['None'], worker_name='none', worker_types=['fork'], request_limit=-1, request_limit_cpu=-1, protect_from_rogues=True):
        """Select the checks and/or actions to give to the executor `worker_name`.

        Every selected element gets the 'inpoller' status and the executor id, and
        a minimal copy of it (copy_shell) is put in the returned list.

        :param do_checks: the caller wants checks (poller side)
        :param do_actions: the caller wants actions (reactionner side); master notifications are never given
        :param poller_tags: a check is given only if its poller_tag is in this list
        :param reactionner_tags: an action is given only if its reactionner_tag is in this list
        :param worker_name: executor name, used as executor id
        :param worker_types: an element is given only if its module_type is in this list
        :param request_limit: maximum number of elements, see _test_limit_for_giving_action_to_executor
        :param request_limit_cpu: maximum estimated cpu time, see _test_limit_for_giving_action_to_executor
        :param protect_from_rogues: when True, an executor whose name is not in the known names
                                    is recorded as rogue and gets nothing
        :return: list of copies to execute, or an empty tuple for a rogue executor
        """
        start = time.time()
        in_check_time = 0.0
        action_to_give_to_executor = []
        self.nb_action_give_to_executor = 0
        self.cpu_time_action_give_to_executor = 0
        now = time.time()
        executor_type = 'Poller' if do_checks else 'Reactionner'
        tag = poller_tags if do_checks else reactionner_tags
        executor_id = worker_name
        if protect_from_rogues:
            # Reject the request of an unknown executor
            known_names = self.pollers_name if do_checks else self.reactionners_name
            if worker_name not in known_names:
                self._add_rogue_satellite(executor_type, worker_name, now)
                return ()

        # If poller want to do checks
        if do_checks:
            # If the command is untagged, and the poller too, or if both are tagged with same name, go for it
            # by default poller_tag is 'None' and poller_tags is ['None']
            # and same for module_type, the default is the 'fork' type
            key_to_del = []
            in_ask_limit = False
            for t_to_go in sorted(self.check_to_launch):
                if now < t_to_go or in_ask_limit:
                    break
                checks = self.check_to_launch[t_to_go]
                # Loop over a snapshot: the given checks are removed from `checks` inside the loop,
                # and removing from the list being iterated would skip the element after each removal
                for check in list(checks):
                    if not (check.status == 'scheduled' and not check.internal and check.poller_tag in poller_tags and check.module_type in worker_types):
                        continue
                    c_time = time.time()
                    check.status = 'inpoller'
                    check.executor_id = executor_id
                    # We make a minimal copy with info for exec the check.
                    action_to_give_to_executor.append(check.copy_shell())
                    in_ask_limit = self._test_limit_for_giving_action_to_executor(check, executor_id, request_limit, request_limit_cpu)
                    in_check_time += (time.time() - c_time)
                    checks.remove(check)
                    if in_ask_limit:
                        break
                if not checks:
                    key_to_del.append(t_to_go)

            # Time slots without any check left are dropped
            for todel in key_to_del:
                del self.check_to_launch[todel]

        # If reactionner want to notify too
        if do_actions:
            for action in self.actions.values():
                is_master = (action.is_a == ACTION_TYPES.NOTIFICATION and not action.contact)

                # Master notifications should not be launched
                if is_master:
                    continue

                # by default reactionner_tag is 'None' and reactionner_tags is ['None'] too
                # So if not the good one, loop for next :)
                if action.reactionner_tag not in reactionner_tags:
                    continue

                # same for module_type
                if action.module_type not in worker_types:
                    continue

                if action.is_a == ACTION_TYPES.EVENTHANDLER:
                    logger.debug('[EVENTHANDLER] looking at event handler %d (on object %s) state %s to know if we should give it to a reactionner' % (action.id, action.ref.get_full_name(), action.status))

                # And now look for can launch or not :)
                if action.status == 'scheduled' and action.is_launchable(now):
                    if action.is_a == ACTION_TYPES.EVENTHANDLER:
                        logger.debug('[EVENTHANDLER] we can give the event handler %d (on object %s) to the reactionner' % (action.id, action.ref.get_full_name()))
                    action.status = 'inpoller'
                    action.executor_id = executor_id
                    # This is for child notifications and eventhandlers
                    new_a = action.copy_shell()

                    action_to_give_to_executor.append(new_a)
                    # Look if we have too much request to send
                    in_ask_limit = self._test_limit_for_giving_action_to_executor(action, executor_id, request_limit, request_limit_cpu)

                    if in_ask_limit:
                        break

        if self.nb_action_give_to_executor > 0:
            qlimit = '%s' % request_limit if request_limit != -1 else 'unlimited'
            logger.info(
                '%s %s [%s-%s][tag:%s][worker_type:%s] Querying with limit of [%s] objects and [%.3f]s execution time and we give it [%d] objects for a total of [%.3f]s execution time' %
                (CHAPTER_CHECKS_AND_NOTIF, SECTION_GET, executor_type, executor_id, ','.join(tag), ','.join(worker_types), qlimit, request_limit_cpu, self.nb_action_give_to_executor, self.cpu_time_action_give_to_executor)
            )

        # Per executor counters: accumulated when they already exist, initialized otherwise
        stat_by_executor = self.stat_by_executor.setdefault(executor_id, {})
        if 'raw_nb_checks_send' in stat_by_executor:
            stat_by_executor['raw_nb_checks_send'] += len(action_to_give_to_executor)
            stat_by_executor['raw_cpu_time_checks_send'] += self.cpu_time_action_give_to_executor
        else:
            stat_by_executor['raw_nb_checks_send'] = len(action_to_give_to_executor)
            stat_by_executor['raw_cpu_time_checks_send'] = self.cpu_time_action_give_to_executor

        logger.log_perf(start, 'get_check', 'OVERALLTIME:[%.3f] in_action_time:[%.3f] for [%s] actions' % (time.time() - start, in_check_time, len(action_to_give_to_executor)))
        return action_to_give_to_executor
    
    
    def _test_limit_for_giving_action_to_executor(self, check, executor_id, request_limit, request_limit_cpu):
        in_ask_limit = False
        
        self.nb_action_give_to_executor += 1
        action_commande = getattr(check, 'command', None)
        if action_commande:
            action_hash = check.get_hash()
            exec_stat_executor = self._exec_stat.get(executor_id, {})
            default_executor = self._exec_stat[_DEFAULT_EXECUTOR]
            if action_hash in exec_stat_executor:
                self.cpu_time_action_give_to_executor += exec_stat_executor[action_hash]['action_cpu_time']
                self._nb_checks_with_stat_send += 1
            elif action_hash in default_executor:
                self.cpu_time_action_give_to_executor += default_executor[action_hash]['action_cpu_time']
                logger.debug('[scheduler][%s] Give action with _DEFAULT_EXECUTOR time. [%s]' % (self.instance_id, check.command_name))
            else:
                # Only limit fork based checks
                if check.module_type == 'fork':
                    self.cpu_time_action_give_to_executor += _DEFAULT_ACTION_CPU_TIME
                    logger.debug('[scheduler][%s] Give action with _DEFAULT_ACTION_CPU_TIME time. [%s]' % (self.instance_id, check.command_name))
        
        if request_limit != -1 and request_limit != 0 and self.nb_action_give_to_executor >= request_limit:
            in_ask_limit = True
        if request_limit_cpu != -1 and self.cpu_time_action_give_to_executor > request_limit_cpu:
            in_ask_limit = True
        return in_ask_limit
    
    
    def _update_check_causes_stats(self, cause):
        """Increment the raw_nb_checks_received_* counter matching `cause`; an unknown cause is only logged."""
        counter_by_cause = (
            (CHECK_CAUSE.SCHEDULE, 'raw_nb_checks_received_schedule'),
            (CHECK_CAUSE.FORCE, 'raw_nb_checks_received_force'),
            (CHECK_CAUSE.RETRY, 'raw_nb_checks_received_retry'),
            (CHECK_CAUSE.DEPENDENCY, 'raw_nb_checks_received_dependency'),
        )
        for known_cause, counter_name in counter_by_cause:
            if cause == known_cause:
                setattr(self, counter_name, getattr(self, counter_name) + 1)
                return
        logger.warning('check return with unknown cause : [%s]' % cause)
    
    
    # Called by poller and reactionner to send result
    def put_results(self, action):
        """Integrate the result `action` sent back by an executor.

        Depending on action.is_a:
        * notification: the original notification (looked up by id in self.actions) takes the
          result, is removed from the in progress notifications of its item, and a status brok
          of the item is registered. A timeout or a non zero exit status is logged as a warning.
          A result with missing attributes (AttributeError) is logged as a warning.
        * check: the original check (looked up by id in self.checks) takes the result and waits
          to be consumed. A result for an unknown check id is silently dropped.
        * event handler: the outcome is logged and the original event handler becomes a zombie.
        * anything else: an error is logged and nothing more is done.

        A result for an unknown notification or event handler returns immediately. In the other
        handled cases the execution stats are updated with _compute_exec_stat(action).
        """
        if action.is_a == ACTION_TYPES.NOTIFICATION:
            # We will only see child notifications here
            try:
                # Add protection for strange charset
                if isinstance(action.output, str):
                    action.output = action.output.decode('utf8', 'ignore')
                # Add protection for strange charset
                if isinstance(action.long_output, str):
                    action.long_output = action.long_output.decode('utf8', 'ignore')

                original_action = self.actions.get(action.id, None)
                if original_action is None:
                    if MONITORING_CHECK_CONSUME_DEBUG_FLAG:
                        logger.info('[NOTIFICATION] We received a notification return (id=%s) for a unknown notification. this can be because this host/check was disabled.' % action.id)
                    return
                original_action.get_return_from(action)
                item = original_action.ref
                item.remove_in_progress_notification(original_action)
                item.last_notification = action.check_time

                # And we ask the item to update it's state
                self.get_and_register_status_brok(item)

                # If we' ve got a problem with the notification, raise a Warning log
                if action.status == 'timeout':
                    logger.warning("Contact %s %s notification command '%s ' timed out after %d seconds on the reactionner %s" %
                                   (original_action.contact.contact_name,
                                    item.__class__.my_type,
                                    original_action.command_name,
                                    int(action.execution_time),
                                    action.executor_id,
                                    ))
                elif action.exit_status != 0:
                    logger.warning("The notification command '%s' raised an error (exit code=%d on the reactionner=%s): '%s %s'" % (action.command_name, action.exit_status, action.executor_id, action.output, action.long_output))
            except AttributeError as exp:  # bad object, drop it
                logger.warning('put_results:: get bad notification : %s ' % str(exp))

        elif action.is_a == ACTION_TYPES.CHECK:

            try:
                if action.status == 'timeout':
                    action.exit_status = self.conf.timeout_exit_status

                # Raises KeyError when we do not know this check
                original_check = self.checks[action.id]
                cpu_usage_threshold = original_check.warning_threshold_cpu_usage
                if action.get_cpu_time() > cpu_usage_threshold:
                    self.checks_warning_threshold_cpu_usage.append((action.command_name, action.get_cpu_time(), cpu_usage_threshold, action.check_time))

                original_check.get_return_from(action, self.conf.language)
                original_check.status = CHECK_STATUS.WAITCONSUME
            except KeyError:
                # Result for a check we do not have: deliberately dropped without any log
                pass

        elif action.is_a == ACTION_TYPES.EVENTHANDLER:
            original_event_handler = self.actions.get(action.id, None)

            # Maybe we got a return of a old even handler, so we can forget it
            if original_event_handler is None:
                logger.warning('put_results:: get unknown event handler : %s ' % action.id)
                return

            # Add protection for strange charset as we can print it
            if isinstance(action.output, str):
                action.output = action.output.decode('utf8', 'ignore')

            # Add protection for strange charset as we can print it
            if isinstance(action.long_output, str):
                action.long_output = action.long_output.decode('utf8', 'ignore')

            reference_object = original_event_handler.ref
            reference_name = reference_object.get_full_name()
            # Arguments follow the message order: reactionner name first, then the state
            logger.debug('[EVENTHANDLER] the event handler %d (on element %s) just came back from the reactionner %s with state %s' % (original_event_handler.id, reference_name, action.executor_id, action.status))

            # It just die
            if action.status == 'timeout':
                logger.warning("[EVENTHANDLER] the event handler %s on the element %s timed out after %d seconds on the reactionner %s" % (action.id, reference_name, int(action.execution_time), action.executor_id))
            elif action.exit_status != 0:
                logger.warning(
                    "[EVENTHANDLER] The event handler %s on the element %s raised an error (exit code=%d on the reactionner:%s): '%s %s'" % (action.id, reference_name, action.exit_status, action.executor_id, action.output, action.long_output))
            else:
                logger.debug('[EVENTHANDLER] The event handler %d (on element %s) just came back without errors (exit code=%s on the reactionner:%s)' % (action.id, reference_name, action.exit_status, action.executor_id))

            # Let's set it to be clean asap
            original_event_handler.status = 'zombie'

        else:
            logger.error("[scheduler][%s] The received result type in unknown! [%s]" % (self.instance_id, str(action.is_a)))
            return

        self._compute_exec_stat(action)
    
    
    def _compute_exec_stat(self, action):
        now = time.time()
        action_commande = getattr(action, 'command', None)
        action_cpu_time = getattr(action, 'average_cpu_time', 0)
        executor_id = getattr(action, 'executor_id', None)
        if action_cpu_time != 0 and action_commande is not None and executor_id is not None:
            action_hash = action.get_hash()
            executor_stat = {}
            if executor_id in self._exec_stat:
                executor_stat = self._exec_stat[executor_id]
            
            self._exec_stat[executor_id] = executor_stat
            
            action_stat = {}
            if action_hash in executor_stat:
                action_stat = executor_stat[action_hash]
            
            action_stat['action_cpu_time'] = action_cpu_time
            action_stat['last_update'] = now
            action_stat['saving_periode'] = action.get_saving_period()
            executor_stat[action_hash] = action_stat
            if action.is_a == "check":
                if hasattr(action, 'cause'):
                    # update stats counter about the cause of received checks
                    self._update_check_causes_stats(action.cause)
                else:
                    raise Exception('The poller [%s] give use a action without cause. You should update your pollers.' % executor_id)
            
            self._exec_stat[_DEFAULT_EXECUTOR][action_hash] = action_stat
            
            # logger.debug("[scheduler][%s] Stat on action[%s] [%.50s] with average[%s] from executor[%s]" % (self.instance_id, action.id, action.command_name, action_cpu_time, executor_id))
    
    
    def _clean_exec_stat(self):
        # Remove old stat from the _exec_stat
        now = time.time()
        
        for executor_id in self._exec_stat:
            executor_stat = self._exec_stat[executor_id]
            to_remove = []
            for hash_commande in executor_stat:
                stat = executor_stat[hash_commande]
                if (now - stat['last_update']) / 60 > stat['saving_periode']:
                    to_remove.append(hash_commande)
            # logger.debug('[scheduler][%s][%s] Total stat size: %d' % (self.instance_id, executor_id, len(executor_stat)))
            if to_remove:
                logger.debug("[scheduler][%s] Clean [%d] stats for executor [%s]" % (self.instance_id, len(to_remove), executor_id))
            for hash_commande in to_remove:
                del executor_stat[hash_commande]
        
        todel = []
        for entry in self.checks_warning_threshold_cpu_usage:
            if time.time() - entry[3] > _KEEP_CHECKS_WARNING_THRESHOLD_CPU_USAGE_TIME * 60:
                todel.append(entry)
        
        for entry in todel:
            self.checks_warning_threshold_cpu_usage.remove(entry)
    
    
    # We should push actions to our passives satellites
    def push_actions_to_passives_satellites(self, distant_link):
        """Select the elements to run for a passive satellite and post them to it.

        `distant_link` is a dict read with the keys 'type', 'name', 'con' and
        '<type>_tags'; its 'latency' entry is updated, and 'con' is reset to None
        when the push fails with an HTTP error or a KeyError.

        The satellite is first asked for its cpu limit; when that fails for any reason
        a warning is logged and no cpu limit is applied. A reactionner gets actions
        filtered by its tags as reactionner tags, any other type gets checks filtered by
        its tags as poller tags. Only 'fork' elements are selected, without rogue protection.
        """
        daemon_type = distant_link['type']
        daemon_name = distant_link['name']
        con = distant_link['con']
        daemon_tags = distant_link['%s_tags' % daemon_type]
        try:
            start_time = time.time()
            request_limit_cpu = con.get('get_request_limit_cpu')
            distant_link['latency'] = time.time() - start_time
            # NOTE(review): this unpickles data received from the satellite; pickle is only
            # safe if the satellite and the transport are trusted - TODO confirm
            request_limit_cpu = float(cPickle.loads(str(request_limit_cpu)))
        except Exception as exp:
            request_limit_cpu = -1
            logger.warning("[scheduler][%s] The [%s:%s] don't give this cpu limit: [%s]" % (self.instance_id, daemon_type, daemon_name, str(exp)))

        if daemon_type == 'reactionner':
            # The tags must be given as reactionner_tags: passed as third positional argument
            # they would be taken as poller_tags and the actions would be filtered with ['None']
            lst = self.get_to_run_checks(do_checks=False, do_actions=True, reactionner_tags=daemon_tags, worker_name=daemon_name, module_types=['fork'], request_limit_cpu=request_limit_cpu, protect_from_rogues=False)
        else:
            lst = self.get_to_run_checks(do_checks=True, do_actions=False, poller_tags=daemon_tags, worker_name=daemon_name, module_types=['fork'], request_limit_cpu=request_limit_cpu, protect_from_rogues=False)
        try:
            if len(lst) > 0:
                logger.debug("Sending [%s] actions to [%s]" % (len(lst), daemon_name))
            # The post is done even when the list is empty
            con.post('push_actions', {'actions': lst, 'sched_id': self.instance_id})
        except HTTPExceptions as exp:
            logger.warning("[scheduler][%s] Connection problem to the [%s:%s]: [%s]" % (self.instance_id, daemon_type, daemon_name, str(exp)))
            distant_link['con'] = None
        except KeyError as exp:
            logger.warning("[scheduler][%s] The [%s:%s] is not initialized: [%s]" % (self.instance_id, daemon_type, daemon_name, str(exp)))
            distant_link['con'] = None
    
    
    # We should get returns from satellites
    def get_actions_from_passives_satellites(self, distant_link):
        """Fetch the results a passive satellite holds for us and queue them in self.waiting_results.

        `distant_link` is a dict read with the keys 'type', 'name' and 'con'; 'con' is
        reset to None when the request fails with an HTTP error or a KeyError.
        The answer of the satellite is base64 decoded, zlib decompressed, then unpickled.
        """
        daemon_type = distant_link['type']
        daemon_name = distant_link['name']
        con = distant_link['con']

        try:
            payload = con.get('get_returns', {'sched_id': self.instance_id}, wait='long')
            # NOTE(review): unpickling is only safe if the satellite is trusted - TODO confirm
            results = cPickle.loads(str(zlib.decompress(base64.b64decode(payload))))

            nb_received = len(results)
            if nb_received > 0:
                logger.debug("Received [%d] passive results from [%s]" % (nb_received, daemon_name))
            with self.waiting_results_lock:
                self.waiting_results.extend(results)
        except HTTPExceptions as exp:
            logger.warning("Connection problem to the %s %s: %s" % (daemon_type, daemon_name, str(exp)))
            distant_link['con'] = None
        except KeyError as exp:
            logger.warning("The %s '%s' is not initialized: %s" % (daemon_type, daemon_name, str(exp)))
            distant_link['con'] = None
    
    
    # Some checks are purely internal, like business based one
    # simply ask their ref to manage it when it's ok to run
    def manage_internal_checks(self):
        """Run the internal checks that are due, under self.checks_n_actions_lock.

        Two steps:
        * every cluster flagged by the proxy items graph is force-scheduled for now, and the
          actions it produced are registered through self.add()
        * every 'scheduled' internal check whose launch time is reached is handed to its ref,
          switched to 'waitconsume' and removed from its self.check_to_launch bucket
        """
        now = time.time()
        
        with self.checks_n_actions_lock:
            # Clusters flagged as dirty need a fresh check right now
            clusters_to_refresh = proxyitemsgraph.get_and_reset_clusters_to_recompute_state()
            if len(clusters_to_refresh) > 0:
                logger.debug('Clusters to refresh: %s' % clusters_to_refresh)
            for cluster_uuid in clusters_to_refresh:
                cluster = self.clusters.get(cluster_uuid, None)
                if cluster is None:
                    continue
                
                cluster.schedule(force=True, force_time=now)
                if len(cluster.checks_in_progress) == 0:
                    logger.error('Cannot schedule the cluster %s and force a immediate check' % cluster.get_full_name())
                    continue
                # Register what the cluster produced (same thing get_new_actions does), then empty its stack
                for action in cluster.actions:
                    self.add(action)
                cluster.actions = []
            
            # Walk the launch buckets from the oldest to the newest, and stop at the first one in the future
            for launch_time in sorted(self.check_to_launch.keys()):
                bucket = self.check_to_launch[launch_time]
                if now < launch_time:
                    break
                handled = []
                for check in bucket:
                    if check.status == 'scheduled' and check.internal:
                        check.ref.manage_internal_check(check)
                        # The ref did the job, the check can now be consumed like any other one
                        check.status = 'waitconsume'
                        handled.append(check)
                # Handled checks leave the bucket, like the checks given to a poller
                for check in handled:
                    bucket.remove(check)
    
    
    # We look for possible automatic ack creation if ALL our sources are in acknowledge too.
    # * CREATION: if all our source problems are ack, we are going ack
    # * REMOVING: we remove our ack if one of our source pb is not ack,
    #             BUT only if the ack was an automatic one
    def compute_automatic_acknowledge(self):
        for i in self.hosts:
            i.compute_automatic_acknowledge()
    
    
    # We look for possible automatic flapping creation if ALL our sources are flapping too
    # * CREATION: all of our sources are flapping but we are not in flapping
    # * REMOVING: we remove if we are in flapping
    def compute_automatic_flapping(self):
        for i in self.hosts:
            i.compute_automatic_flapping()
    
    
    # We look for clusters that need to update their own root problems computation from proxy items
    def compute_cluster_root_problems(self):
        """Recompute the root problems of the clusters flagged by the proxy items graph.

        For each flagged cluster known in self.clusters, the new root problems are built from its
        fathers: a father in a non-zero state without root problems is a root problem itself,
        any other father gives its own root problems. When the result differs from what the proxy
        item currently holds, the proxy is updated, an update status brok is stacked on the cluster
        and the clusters relying on this one are flagged in turn.
        """
        for cluster_uuid in proxyitemsgraph.get_and_reset_clusters_to_recompute_root_problems():
            cluster = self.clusters.get(cluster_uuid, None)
            if cluster is None:
                logger.debug('Cannot find dep cluster %s for root problems' % cluster_uuid)
                continue
            
            proxy = proxyitemsmgr[cluster_uuid]
            previous_root_problems = proxy.root_problems
            computed_root_problems = set()
            
            for father_uuid in proxyitemsgraph.son_to_fathers.get(cluster_uuid, []):
                father = proxyitemsmgr[father_uuid]
                if father.state != 0 and len(father.root_problems) == 0:
                    # Bad state and nobody to blame: this father is the root problem
                    computed_root_problems.add(father.uuid)
                else:
                    # Otherwise inherit whatever root problems it has (maybe none)
                    computed_root_problems.update(father.root_problems)
            
            if computed_root_problems != previous_root_problems:
                proxy.root_problems = computed_root_problems
                # The broker must know about the new root problems
                cluster.broks.append(cluster.get_update_status_brok())
                # Clusters that depend on this one have to recompute their own root problems
                proxyitemsgraph.trigger_my_clusters_root_problem(cluster_uuid)
    
    
    # Call by brokers to have broks
    # We give them, and clean them!
    def get_broks(self, bname):
        # If we are here, we are sure the broker entry exists
        res = self.brokers[bname]['broks']
        # They are gone, we keep none!
        self.brokers[bname]['broks'] = {}
        
        # Also put in the result the possible first log broks if so
        res.update(self.broks)
        # and clean the global broks too now
        self.broks.clear()
        
        return res
    
    
    def get_raw_datas(self, provider_name):
        """Hand over the raw datas waiting for the provider `provider_name` and forget them.

        The returned dict is the provider's own raw data dict, completed with the global
        self.raw_datas; both stores are left empty.
        """
        provider_entry = self.providers[provider_name]
        pending_raw_datas = provider_entry[_PROVIDERS_KEY_RAW_DATAS]
        provider_entry[_PROVIDERS_KEY_RAW_DATAS] = {}
        
        # Global raw datas go with this delivery, then are dropped
        pending_raw_datas.update(self.raw_datas)
        self.raw_datas.clear()
        
        logger.debug("[raw_data] get_raw_datas [%s]" % provider_name)
        return pending_raw_datas
    
    
    # An element can have its topology changed by an external command
    # if so a brok will be generated with this flag. No need to reset all of
    # them.
    def reset_topology_change_flag(self):
        for i in self.hosts:
            i.topology_change = False
        for i in self.services:
            i.topology_change = False
    
    
    # Update the retention file and give all the data in
    # a dict so the read function can pick up what it wants
    # For now compression is not used, but it can be added easily
    # just uncomment :)
    def update_retention_file(self, forced=False):
        if self.skip_rentention_save:
            return
        t0 = time.time()
        exec_stat = copy.deepcopy(self._exec_stat)
        self._prepare_retention_data()
        time_to_prepare = time.time() - t0
        if time_to_prepare > 1:
            logger.warning('[Support Information]prepare retention data take [%.3f]s' % time_to_prepare)
        
        if self._save_retention_thread:
            self._save_retention_thread.join()
        self._save_retention_thread = threading.Thread(target=self._save_retention, name="scheduler retention saving", args=(exec_stat, forced))
        self._save_retention_thread.start()
    
    
    def _save_retention(self, exec_stat, forced):
        try:
            # save_retention_time = -1 -> save in progress
            self._update_scheduler_stat('save_retention_time', -1)
            self._update_scheduler_stat('save_retention_error', '')
            self._update_scheduler_stat('last_retention_save', '%s' % date.today())
            t0 = time.time()
            self._save_exec_stat(exec_stat)
            # If we set the update to 0, we do not want of this if we do not forced (like at stopping)
            if self.conf.retention_update_interval != 0 or forced:
                try:
                    self.hook_point('save_retention')
                except RuntimeError as err:
                    logger.debug(err.message)
                    time_to_save = time.time() - t0
                    self._update_scheduler_stat('save_retention_time', time_to_save)
                    self._update_scheduler_stat('save_retention_error', err.message)
                    return
            
            time_to_save = time.time() - t0
            if time_to_save > 2:
                logger.warning('[Support Information]saving retention data take [%.3f]s' % time_to_save)
            self._update_scheduler_stat('save_retention_time', time_to_save)
        except Exception as exp:
            time_to_save = time.time() - t0
            self._update_scheduler_stat('save_retention_time', time_to_save)
            self._update_scheduler_stat('save_retention_error', exp.message)
            logger.error(exp)
            logger.print_stack()
    
    
    # Delete the old mongo retention
    def old_retention_delete(self):
        if self._delete_old_retention_thread:
            self._delete_old_retention_thread.join()
        self.hook_point('delete_old_retention')
    
    
    # Load the retention file and get status from it. It does not get all checks in progress
    # for the moment, just the status and the notifications.
    def retention_load(self):
        if self._save_retention_thread:
            self._save_retention_thread.join()
        
        done = self.hook_point('load_retention')
        if not done:
            logger.error('Failed to load retention. Shutting down daemon')
            self.skip_rentention_save = True
            self.die()
            self.sched_daemon.must_run = False
            self.sched_daemon.interrupted = True
        
        # In all cases we drop old retention data
        self._retention_data = None
    
    
    # Call by retention module, it give all data to save.
    # You must call _prepare_retention_data before this method
    def get_retention_data(self):
        return self._retention_data
    
    
    def _prepare_retention_data(self):
        """Build the retention data of all hosts and services into self._retention_data.

        The result is {'hosts': {uuid: values}, 'services': {uuid: values}}, where `values`
        holds every running property flagged for retention, as given by _get_retention_value.
        """
        
        def _retained_values(item):
            # Only the running properties flagged with retention are kept
            kept = {}
            for prop, entry in item.__class__.running_properties.iteritems():
                if entry.retention:
                    kept[prop] = Scheduler._get_retention_value(item, entry, prop)
            return kept
        
        
        hosts_data = {}
        for host in self.hosts:
            hosts_data[host.get_instance_uuid()] = _retained_values(host)
        
        services_data = {}
        for service in self.services:
            services_data[service.get_instance_uuid()] = _retained_values(service)
        
        self._retention_data = {'hosts': hosts_data, 'services': services_data}
    
    
    @staticmethod
    def _get_retention_value(host, entry, prop):
        v = getattr(host, prop)
        f = entry.retention_preparation
        if f:
            v = f(host, v)
        return v
    
    
    # A retention module is asking us which uuid it need to retrieve:
    # * hosts & services
    # * maybe we already have in self._retention_data and so don't ask for theses ones
    #   * => means that we did have on the configuration just before, and so it not interesting to be ask from
    #        database
    def get_instances_uuids_to_restore_retention(self):
        r = {'hosts': {'total': len(self.hosts), 'to_load': []}, 'services': {'total': len(self.services), 'to_load': []}}
        # Hosts:
        hosts_to_load = r['hosts']['to_load']
        current_host_retention_cache = {}
        if self._retention_data:
            current_host_retention_cache = self._retention_data['hosts']
        for host in self.hosts:
            host_uuid = host.get_instance_uuid()
            if host_uuid in current_host_retention_cache:
                continue
            hosts_to_load.append(host_uuid)
        
        # Services:
        services_to_load = r['services']['to_load']
        current_service_retention_cache = {}
        if self._retention_data:
            current_service_retention_cache = self._retention_data['services']
        for service in self.services:
            service_uuid = service.get_instance_uuid()
            if service_uuid in current_service_retention_cache:
                continue
            services_to_load.append(service_uuid)
        
        return r
    
    
    # For a host uuid, we can have the retention data in :
    # * the self._retention_data so we are sure it's up to date (take first)
    # * the module data
    # * no where, no luck.
    def _get_host_retention_data_from(self, host_uuid, module_data):
        if self._retention_data and host_uuid in self._retention_data['hosts']:
            return self._retention_data['hosts'][host_uuid]
        if host_uuid in module_data['hosts']:
            return module_data['hosts'][host_uuid]
        # found no one, skip it
        return None
    
    
    # Same for checks
    def _get_service_retention_data_from(self, service_uuid, module_data):
        if self._retention_data and service_uuid in self._retention_data['services']:
            return self._retention_data['services'][service_uuid]
        if service_uuid in module_data['services']:
            return module_data['services'][service_uuid]
        # found no one, skip it
        return None
    
    
    # Get back our broks from a retention module :)
    def restore_retention_data(self, data):
        # Now load interesting properties in hosts/services Tagging retention=False prop that not be directly load
        # Items will be with theirs status, but not in checking, so
        # a new check will be launched like with a normal beginning (random distributed scheduling)
        
        for host in self.hosts:
            host_uuid = host.get_instance_uuid()
            # we can have (or not) retention data from cache or from module
            host_retention_data = self._get_host_retention_data_from(host_uuid, data)
            if host_retention_data is None:  # not found, skip this host retention
                continue
            
            # First manage all running properties
            running_properties = host.__class__.running_properties
            for prop, entry in running_properties.iteritems():
                # Maybe the saved one was not with this value, so we just bypass this
                if entry.retention and prop in host_retention_data:
                    setattr(host, prop, host_retention_data[prop])
            
            # If the retention comes from an older version without last_state_as_string, we need to recompute it
            # roughly from last_state_id
            if 'last_state_as_string' not in host_retention_data:
                setattr(host, 'last_state_as_string', _HOST_STATUS_ID_TO_STATUS[getattr(host, "last_state_id", 3)])
            
            # Ok, some are in properties too (like active check enabled
            # or not. Will OVERRIDE THE CONFIGURATION VALUE!
            properties = host.__class__.properties
            for prop, entry in properties.iteritems():
                if entry.retention:
                    # Maybe the saved one was not with this value, so
                    # we just bypass this
                    if prop in host_retention_data:
                        setattr(host, prop, host_retention_data[prop])
            # Now manage all linked objects load from previous run
            if 'must_respread' not in host_retention_data:
                host.must_respread = True
            # Relink the notified_contacts as a set() of true contacts objects
            # it it was load from the retention, it's now a list of contacts  names
            if 'notified_contacts' in host_retention_data:
                new_notified_contacts = set()
                for cname in host.notified_contacts:
                    contact = self.contacts.find_by_name(cname)
                    # Maybe the contact is gone. Skip it
                    if contact:
                        new_notified_contacts.add(contact)
                host.notified_contacts = new_notified_contacts
            for notification in host.notifications_in_progress.values():
                notification.ref = host
                self.add(notification)
                # Also raises the action id, so do not overlap ids
                notification.assume_at_least_id(notification.id)
            for check in host.checks_in_progress:
                logger.debug('[retention][loading] the host %s is restoring a check from retention: id=%s status=%s' % (host.get_full_name(), check.id, check.status))
                check.ref = host
                self.add(check)
                # Also raises the action id, so do not overlap ids
                check.assume_at_least_id(check.id)
                # IMPORTANT: checks here are NOT ready to be launched/analysed!
                # so we will need a final step at the end of the retention load, because
                # we need to have load ALL checks before be ready
            host.update_in_checking()
            # And also add downtimes
            for dt in host.downtimes:
                dt.ref = host
                dt.extra_comment = None  # No more set (2.05.01 and more)
                # raises the downtime id to do not overlap
                Downtime.id = max(Downtime.id, dt.id + 1)
                self.add(dt)
            if host.acknowledgement is not None:
                host.acknowledgement.ref = host
        
        # Same for services
        for service in self.services:
            service_uuid = service.get_instance_uuid()
            # we can have (or not) retention data from cache or from module
            check_retention_data = self._get_service_retention_data_from(service_uuid, data)
            if check_retention_data is None:  # not found, skip this service retention
                continue
            
            # Load the major values from running properties
            running_properties = service.__class__.running_properties
            for prop, entry in running_properties.iteritems():
                # Maybe the saved one was not with this value, so we just bypass this
                if entry.retention and prop in check_retention_data:
                    setattr(service, prop, check_retention_data[prop])
            
            # If the retention comes from an older version without last_state_as_string, we need to recompute it
            # roughly from last_state_id
            if 'last_state_as_string' not in check_retention_data:
                setattr(service, 'last_state_as_string', _SERVICE_STATUS_ID_TO_STATUS[getattr(service, "last_state_id", 3)])
            
            # And some others from properties dict too
            properties = service.__class__.properties
            for prop, entry in properties.iteritems():
                if entry.retention:
                    # Maybe the saved one was not with this value, so
                    # we just bypass this
                    if prop in check_retention_data:
                        setattr(service, prop, check_retention_data[prop])
            if 'must_respread' not in check_retention_data:
                service.must_respread = True
            # Relink the notified_contacts as a set() of true contacts objects
            # it it was load from the retention, it's now a list of contacts
            # names
            if 'notified_contacts' in check_retention_data:
                new_notified_contacts = set()
                for cname in service.notified_contacts:
                    contact = self.contacts.find_by_name(cname)
                    # Maybe the contact is gone. Skip it
                    if contact:
                        new_notified_contacts.add(contact)
                service.notified_contacts = new_notified_contacts
            for notification in service.notifications_in_progress.values():
                notification.ref = service
                self.add(notification)
                # Also raises the action id, so do not overlap id
                notification.assume_at_least_id(notification.id)
            for check in service.checks_in_progress:
                logger.debug('[retention][loading] the check %s is restoring a check from retention: id=%s status=%s' % (service.get_full_name(), check.id, check.status))
                check.ref = service
                self.add(check)
                # Also raises the action id, so do not overlap ids
                check.assume_at_least_id(check.id)
                # IMPORTANT: checks here are NOT ready to be launched/analysed!
                # so we will need a final step at the end of the retention load, because
                # we need to have load ALL checks before be ready
            service.update_in_checking()
            # And also add downtimes
            for dt in service.downtimes:
                dt.ref = service
                dt.extra_comment = None
                # raises the downtime id to do not overlap
                Downtime.id = max(Downtime.id, dt.id + 1)
                self.add(dt)
            if service.acknowledgement is not None:
                service.acknowledgement.ref = service
        
        # Final step for checks: we did load checks that are in the middle of they analysis
        # so we must prepare them
        # NOTE: they already have they ref
        # IMPORTANT: only loop for the check we have at the begining of the loop!
        restored_checks = list(self.checks.values())
        for check in restored_checks:
            check.restore_from_retention()
    
    
    # Fill the self.broks with broks of self (process id, and co)
    # broks of service and hosts (initial status)
    def fill_initial_broks(self, broker_name, with_logs=False):
        """Create the initial broks for the broker `broker_name`.

        In order: a 'clean_all_my_instance_id' brok, the program status, the initial status of
        every object (unless conf.skip_initial_broks is set), the proxy items graph and finally
        an 'initial_broks_done' brok. With `with_logs`, hosts and services are also asked to
        raise their initial state.

        Raises Exception when the scheduler has no configuration.
        """
        if not getattr(self, 'conf', None):
            raise Exception("scheduler has no conf.")
        
        # The broker must first forget all it knows about this instance
        self.add_Brok(Brok('clean_all_my_instance_id', {'instance_id': self.instance_id}), broker_name)
        
        # Then the program status
        self.add_Brok(self.get_program_status_brok(), broker_name)
        
        # The order is important, service need host...
        item_collections = (self.timeperiods, self.commands,
                            self.contactgroups, self.contacts,
                            self.hosts, self.hostgroups,
                            self.services, self.servicegroups)
        
        self.conf.skip_initial_broks = getattr(self.conf, 'skip_initial_broks', False)
        logger.debug("Skipping initial broks? %s" % str(self.conf.skip_initial_broks))
        if not self.conf.skip_initial_broks:
            for collection in item_collections:
                for item in collection:
                    self.add_Brok(item.get_initial_status_brok(), broker_name)
        
        # Initial state logs of hosts and services, only when asked (scheduler startup)
        if with_logs:
            for host in self.hosts:
                host.raise_initial_state()
            for service in self.services:
                service.raise_initial_state()
        
        self.add_Brok(self.get_proxy_items_graph_brok(), broker_name)
        
        # Tell the broker that the initial pass is over
        self.add_Brok(Brok('initial_broks_done', {'instance_id': self.instance_id}), broker_name)
        
        # We now have all full broks
        self.has_full_broks = True
        
        nb_broks = len(self.brokers[broker_name]['broks'])
        logger.info('%s %s [%10s] Created %5d initial Broks (data exchanged between scheduler and broker for all objects) for broker named "%s" and configuration flavor: %s'
                    % (CHAPTER_CONFIGURATION, SECTION_BROKERS, broker_name, nb_broks, broker_name, self.conf.push_flavor))
    
    
    def fill_initial_raw_datas(self, provider_name):
        """Create the initial raw datas for the provider `provider_name`.

        In order: a 'clean_all_my_instance_id' raw data, the initial status of every object
        (unless conf.skip_initial_raw_datas is set) and an 'initial_raw_datas_done' raw data.
        """
        conf = self.conf
        conf.skip_initial_raw_datas = getattr(conf, 'skip_initial_raw_datas', False)
        logger.debug("[raw_data] fill_initial_raw_datas for [%s] - skip [%s]" % (provider_name, conf.skip_initial_raw_datas))
        
        self.add_RawData(RawData('clean_all_my_instance_id', {'instance_id': self.instance_id}), provider_name)
        
        item_collections = (self.timeperiods, self.commands,
                            self.contacts, self.contactgroups,
                            self.hosts, self.hostgroups,
                            self.services, self.servicegroups)
        
        if not conf.skip_initial_raw_datas:
            for collection in item_collections:
                for item in collection:
                    self.add_RawData(item.get_initial_status_raw_data(), provider_name)
        
        self.add_RawData(RawData('initial_raw_datas_done', {'instance_id': self.instance_id}), provider_name)
        
        nb_raw_datas = len(self.providers[provider_name][_PROVIDERS_KEY_RAW_DATAS])
        logger.info("[raw_data] Created [%d] initial raw_datas for provider [%s] and configuration flavor [%s]" %
                    (nb_raw_datas, provider_name, self.conf.push_flavor))
    
    
    # Create a brok with program status info
    def get_and_register_program_status_brok(self):
        b = self.get_program_status_brok()
        self.add(b)
    
    
    # Create a brok with program status info, typed as 'update_program_status'
    def get_and_register_update_program_status_brok(self):
        b = self.get_program_status_brok()
        b.type = 'update_program_status'
        self.add(b)
    
    
    # Get a brok with program status
    def get_program_status_brok(self):
        """Return a new 'program_status' Brok describing this scheduler.

        Values come from the scheduler itself (instance id and name, program start), from its
        configuration, from the current process (pid) and from the current time; a few entries
        are constants.
        """
        conf = self.conf
        timestamp = int(time.time())
        return Brok('program_status', {
            'is_running'                    : 1,
            'instance_id'                   : self.instance_id,
            'instance_name'                 : self.instance_name,
            'last_alive'                    : timestamp,
            'interval_length'               : conf.interval_length,
            'program_start'                 : self.program_start,
            'pid'                           : os.getpid(),
            'daemon_mode'                   : 1,
            'last_command_check'            : timestamp,
            'last_log_rotation'             : timestamp,
            'notifications_enabled'         : conf.enable_notifications,
            'active_service_checks_enabled' : conf.execute_service_checks,
            'passive_service_checks_enabled': conf.accept_passive_service_checks,
            'active_host_checks_enabled'    : conf.execute_host_checks,
            'passive_host_checks_enabled'   : conf.accept_passive_host_checks,
            'event_handlers_enabled'        : conf.enable_event_handlers,
            'flap_detection_enabled'        : conf.enable_flap_detection,
            'failure_prediction_enabled'    : 0,
            'process_performance_data'      : conf.process_performance_data,
            'obsess_over_hosts'             : conf.obsess_over_hosts,
            'obsess_over_services'          : conf.obsess_over_services,
            'modified_host_attributes'      : 0,
            'modified_service_attributes'   : 0,
            'global_host_event_handler'     : conf.global_host_event_handler,
            'global_service_event_handler'  : conf.global_service_event_handler,
            'check_external_commands'       : conf.check_external_commands,
            'check_service_freshness'       : conf.check_service_freshness,
            'check_host_freshness'          : conf.check_host_freshness,
            'command_file'                  : conf.command_file,
            'default_properties_values'     : conf.default_properties_values,
        })
    
    
    # Get a brok with the item graph
    def get_proxy_items_graph_brok(self):
        """Return a new 'proxy_items_graph' Brok with the son->fathers graph and our instance id."""
        return Brok('proxy_items_graph', {
            'proxy_items_graph_son_to_fathers': proxyitemsgraph.son_to_fathers,
            'instance_id'                     : self.instance_id,
        })
    
    
    # Called every 1sec to consume every result in services or hosts
    # with these results, they are OK, CRITICAL, UP/DOWN, etc...
    def consume_results(self):
        """Consume the results received since the last call.

        Three timed steps:
        * the waiting results are taken (self.waiting_results is swapped for a new list under
          its lock) and each one is counted in the executor stats and given to put_results
        * every check in WAITCONSUME status is consumed; the ones in ALL_DEP_ARE_FINISH status
          are put aside
        * the checks put aside are consumed if they can be
        A debug line with the timings is logged when at least one result was received.
        """
        started_at = time.time()
        
        # Take what is waiting and let the other threads fill a new list
        with self.waiting_results_lock:
            incoming_results, self.waiting_results = self.waiting_results, []
        
        for action in incoming_results:
            self._set_raw_stat_for_checks_received(action)
            self.put_results(action)
        nb_results = len(incoming_results)
        
        results_stored_at = time.time()
        
        # Consume what is ready, keep for later the checks that were waiting for their dependencies
        checks_with_deps_done = []
        for check in self.checks.values():
            status = check.status
            if status == CHECK_STATUS.WAITCONSUME:
                check.launch_consume()
            elif status == CHECK_STATUS.ALL_DEP_ARE_FINISH:
                checks_with_deps_done.append(check)
        
        waitconsume_done_at = time.time()
        
        # Finish to consume checks that have all their deps done
        for check in checks_with_deps_done:
            if check.can_be_consume():
                check.launch_consume()
        finished_at = time.time()
        
        total_time = finished_at - started_at
        if total_time != 0 and nb_results != 0:
            logger.debug('%s [consume] [%.3f] for [%d] results (speed consume/s:[%d]) ->  put_results:[%.3f]   waitconsume:[%.3f]   all_dep_are_finishs:[%.3f]' %
                         (CHAPTER_STATS, total_time, nb_results, (nb_results / total_time),
                          results_stored_at - started_at, waitconsume_done_at - results_stored_at, finished_at - waitconsume_done_at))
    
    
    def _set_raw_stat_for_checks_received(self, action):
        executor_id = action.executor_id
        if executor_id in self.stat_by_executor and 'raw_nb_checks_received' in self.stat_by_executor[executor_id]:
            stat_by_executor = self.stat_by_executor[executor_id]
            stat_by_executor['raw_nb_checks_received'] += 1
            stat_by_executor['raw_cpu_time_checks_received'] += action.average_cpu_time
        else:
            stat_by_executor = self.stat_by_executor.get(executor_id, {})
            self.stat_by_executor[executor_id] = stat_by_executor
            stat_by_executor['raw_nb_checks_received'] = 1
            stat_by_executor['raw_cpu_time_checks_received'] = action.average_cpu_time
    
    
    # Called every 1sec to delete all checks in a zombie state
    # zombie = not useful anymore
    def delete_zombie_checks(self):
        # print "**********Delete zombies checks****"
        id_to_del = []
        for c in self.checks.values():
            if c.status == 'zombie':
                id_to_del.append(c.id)
        with self.checks_n_actions_lock:
            # une petite tape dans le dos et tu t'en vas, merci...
            # *pat pat* GFTO, thks :)
            for id in id_to_del:
                del self.checks[id]  # ZANKUSEN!
    
    
    # Called every 1sec to delete all actions in a zombie state
    # zombie = not useful anymore
    def delete_zombie_actions(self):
        """Remove from `self.actions` every action whose status is 'zombie'.

        The whole operation is done under `self.checks_n_actions_lock`. A debug
        line is logged for each zombie event handler that is about to be removed.
        """
        zombie_ids = []
        with self.checks_n_actions_lock:
            # First pass: only collect, so the dict is not modified while we walk it
            for action in self.actions.values():
                if action.status != 'zombie':
                    continue
                zombie_ids.append(action.id)
                if action.is_a == ACTION_TYPES.EVENTHANDLER:
                    logger.debug('[EVENTHANDLER] the event handler %s (on the element %s) will be deleted' % (action.id, action.ref.get_full_name()))

            # Second pass: actually drop them
            for zombie_id in zombie_ids:
                del self.actions[zombie_id]
    
    
    # Check for downtimes start and stop, and register them if needed
    def update_downtimes(self):
        """Start, stop and clean the downtimes, then register the resulting broks.

        Steps, in this order:
        1. hosts/services with a maintenance_period: create the automatic
           downtime when the period is valid now, or forget the current one when
           it is no longer in `self.downtimes`;
        2. ask every contact downtime to check its activation;
        3. delete the downtimes, then the contact downtimes, flagged `can_be_deleted`;
        4. exit the expired downtimes and enter the fixed ones that must start;
        5. refresh the inherited downtime state of the hosts with a business rule.
        All broks collected by steps 3 to 5 are given to `self.add` at the end.
        """
        now = time.time()
        pending_broks = []

        def _status_broks(ref):
            # Status brok of the element, plus the ones of its services when it is a host
            status_broks = [ref.get_update_status_brok()]
            if ref.my_type == 'host':
                for service in ref.services:
                    status_broks.append(service.get_update_status_brok())
            return status_broks

        # 1. Maintenance periods
        monitored_elements = [x for x in self.hosts] + [x for x in self.services]
        for element in monitored_elements:
            if element.maintenance_period is None:
                continue

            if element.in_maintenance is not None:
                if element.in_maintenance not in self.downtimes:
                    # the main downtime has expired or was manually deleted
                    element.in_maintenance = None
                continue

            if not element.maintenance_period.is_time_valid(now):
                continue

            start_dt = element.maintenance_period.get_next_valid_time_from_t(now)
            end_dt = element.maintenance_period.get_next_invalid_time_from_t(start_dt + 1) - 1
            maintenance_dt = Downtime(element, start_dt, end_dt, 1, 0, 0, "system", "this downtime was automatically scheduled through a maintenance_period")
            element.add_downtime(maintenance_dt)
            self.add(maintenance_dt)
            self.get_and_register_status_brok(element)
            element.in_maintenance = maintenance_dt.id

        # 2. Validity of the contact downtimes
        for contact in self.contacts:
            for contact_dt in contact.downtimes:
                contact_dt.check_activation()

        # 3. Remove what was marked for deletion (mostly by dt.exit())
        for downtime in self.downtimes.values():
            if downtime.can_be_deleted == True:
                ref = downtime.ref
                self.del_downtime(downtime.id)
                pending_broks.extend(_status_broks(ref))

        for contact_dt in self.contact_downtimes.values():
            if contact_dt.can_be_deleted == True:
                ref = contact_dt.ref
                self.del_contact_downtime(contact_dt.id)
                pending_broks.append(ref.get_update_status_brok())

        # 4. Start and stop times
        for downtime in self.downtimes.values():
            if downtime.real_end_time < now:
                # expired: exit() returns the downtimestop notifications
                pending_broks.extend(downtime.exit())
                continue
            if now >= downtime.start_time and downtime.fixed and not downtime.is_in_effect:
                # must start now: enter() returns the downtimestart notifications
                pending_broks.extend(downtime.enter())
                pending_broks.extend(_status_broks(downtime.ref))

        # 5. Cluster case: check the sons for the inherited downtime status
        for host in self.hosts:
            if host.got_business_rule:
                pending_broks.extend(self._check_cluster_downtime(host))

        for brok in pending_broks:
            self.add(brok)
    
    
    def _check_cluster_downtime(self, cluster):
        for dt in self.downtimes.values():
            if dt.ref == cluster and dt.is_in_effect:
                return []
        is_inherited_dt, is_partial_dt = cluster.business_rule.get_downtime_state()
        if cluster.in_inherited_downtime != is_inherited_dt or cluster.in_partial_downtime != is_partial_dt or cluster.in_scheduled_downtime:
            cluster.in_inherited_downtime = is_inherited_dt
            cluster.in_partial_downtime = is_partial_dt
            cluster.in_scheduled_downtime = False
            return [cluster.get_update_status_brok()]
        return []
    
    
    # Main schedule function to make the regular scheduling
    def schedule(self):
        """Ask every service, then every host, to schedule its next check."""
        for element in itertools.chain(self.services, self.hosts):
            element.schedule()
    
    
    # Main actions reaper function: it gets all the new checks,
    # notifications and event handlers from hosts and services
    def get_new_actions(self):
        """Move the pending actions of every service and host into the scheduler.

        Each action found in `item.actions` is given to `self.add`, then the
        item list is emptied. Per element type, the number of elements and the
        elapsed time are stored in the module-level `get_new_actions_stats`.
        When LOG_SCHEDULER_RECURRENT_TIMES is set, timing lines are logged; the
        ADD_CHECKS_STATS / ADD_BROKS_STATS accumulators are reset in all cases.
        """
        self.hook_point('get_new_actions')
        # Number of actions stacked for each type: kept apart so the timing log prints the right
        # count for each type (the per-type counter is reset at each turn of the loop below)
        nb_actions_by_type = {}
        # ask for service and hosts their next check
        for type_tab in [self.services, self.hosts]:
            start_of_type = time.time()
            nb_actions_stacked = 0
            for i in type_tab:
                for a in i.actions:
                    nb_actions_stacked += 1
                    self.add(a)
                # We take all, we can clear it
                i.actions = []
            end_of_type = time.time()
            type_name = type_tab.inner_class.my_type
            get_new_actions_stats[type_name] = (len(type_tab), end_of_type - start_of_type)
            nb_actions_by_type[type_name] = nb_actions_stacked

        if LOG_SCHEDULER_RECURRENT_TIMES:
            for type_class in get_new_actions_stats:
                nb_elements, elapsed_time = get_new_actions_stats[type_class]
                # A type we did not walk this turn has stacked nothing
                nb_actions_of_type = nb_actions_by_type.get(type_class, 0)
                logger.debug('[TIMING] get_new_actions_stats:: %s total=%s => nb actions:%d in %.3fs' % (type_class, nb_elements, nb_actions_of_type, elapsed_time))

            logger.debug('[TIMING] Adding checks times: (nb add this turn:%4d) (total time=%.3f) (lock aquired time=%.3f) (set time=%.3f) (dispatch time=%.3f) (hash time=%.3f) (brok create time=%.3f) (brok add time=%.3f)' % (
                ADD_CHECKS_STATS['nb_add'], ADD_CHECKS_STATS['total_time'], ADD_CHECKS_STATS['lock_time'], ADD_CHECKS_STATS['set_time'], ADD_CHECKS_STATS['dispatch_time'], ADD_CHECKS_STATS['hash_time'], ADD_CHECKS_STATS['brok_create_time'],
                ADD_CHECKS_STATS['brok_add_time']))

            logger.debug('[TIMING] Adding broks times : (nb add this turn:%4d) (total time=%.3f) (sat lock aquired=%.3f) (brokers lock aquired=%.3f) (one broker=%.3f) (to all brokers=%.3f) (to global list=%.3f)' % (
                ADD_BROKS_STATS['nb_add'], ADD_BROKS_STATS['total_time'], ADD_BROKS_STATS['sat_lock_time'], ADD_BROKS_STATS['brokers_lock_time'], ADD_BROKS_STATS['to_one_broker_time'], ADD_BROKS_STATS['to_all_brokers_time'],
                ADD_BROKS_STATS['to_global_list_time']))

        # The accumulators are per turn: start the next one from zero
        ADD_CHECKS_STATS['nb_add'] = 0
        ADD_CHECKS_STATS['total_time'] = ADD_CHECKS_STATS['lock_time'] = ADD_CHECKS_STATS['set_time'] = ADD_CHECKS_STATS['dispatch_time'] = ADD_CHECKS_STATS['hash_time'] = ADD_CHECKS_STATS['brok_create_time'] = ADD_CHECKS_STATS['brok_add_time'] = 0.0

        ADD_BROKS_STATS['nb_add'] = 0
        ADD_BROKS_STATS['total_time'] = ADD_BROKS_STATS['sat_lock_time'] = ADD_BROKS_STATS['brokers_lock_time'] = ADD_BROKS_STATS['to_one_broker_time'] = ADD_BROKS_STATS['to_all_brokers_time'] = ADD_BROKS_STATS['to_global_list_time'] = 0.0

        if LOG_SCHEDULER_RECURRENT_TIMES:
            # Time a fixed amount of pure python work and log it with the other timings
            before = time.time()
            i = 0
            while i < 50000:
                i += 1
            logger.debug('[TIMING] computing exec time: %.3fs' % (time.time() - before))
    
    
    # Similar as above, but for broks
    # Important: broks will be set into the self.brokers, so need
    #            to lock the access to it
    def get_new_broks(self):
        """Move the pending broks of every service and host into the scheduler.

        For each item only the last brok of each brok type is kept and given to
        `self.add`; the item brok list is then emptied. Everything is done while
        holding the satellite lock and the brokers lock, taken in that order.
        """
        # IMPORTANT: always the satellite lock BEFORE brokers lock
        with self.sched_daemon.satellite_lock:
            with self.brokers_lock:
                for item in itertools.chain(self.services, self.hosts):
                    # A later brok of the same type replaces the earlier one
                    last_brok_by_type = dict((brok.type, brok) for brok in item.broks)
                    if last_brok_by_type:
                        for brok in last_brok_by_type.itervalues():
                            self.add(brok)
                    # We took everything, the item can start a fresh list
                    item.broks = []
    
    
    def get_new_raw_datas(self):
        """Do nothing: collecting the raw datas of hosts and services is disabled."""
        # The disabled implementation is kept here for reference:
        # with self.providers_lock:
        #    for items in [self.services, self.hosts]:
        #        for item in items:
        #            raw_datas = {}
        #            for raw_data in item.raw_datas:
        #                raw_datas[raw_data.type] = raw_data
        #            if raw_datas:
        #                for raw_data in raw_datas.itervalues():
        #                    self.add(raw_data)
        #            # We take all, we can clear it
        #            item.raw_datas = []
        return None
    
    
    # Raises checks for no fresh states for services and hosts
    def check_freshness(self):
        """Add to the scheduler the checks raised by the freshness test of each item.

        Every service, then every host, is asked `do_check_freshness()`; each
        result that is not None is given to `self.add`.
        """
        for item in itertools.chain(self.services, self.hosts):
            freshness_check = item.do_check_freshness()
            if freshness_check is None:
                continue
            self.add(freshness_check)
    
    
    # Check for orphaned checks: checks that never returns back
    # so if inpoller and t_to_go < now - 300s: pb!
    # Warn only one time for each "worker"
    # XXX I think we should make "time_to_orphanage" configurable
    #     each action type, each for notification, event_handler & check
    #     I think it will be a little more useful that way, not sure tho
    def check_orphaned(self):
        """Run `_check_orphaned` while holding `self.checks_n_actions_lock` and return its result."""
        with self.checks_n_actions_lock:
            result = self._check_orphaned()
        return result
    
    
    # real version without the lock inside
    def _check_orphaned(self):
        """Reschedule the checks and actions that never came back (lock-free version).

        An element is an orphan when its ref gives a non-zero time to orphanage,
        its status is 'inpoller' and its `t_to_go` is older than now minus that
        time. Orphans are set back to 'scheduled' with `t_to_go` set to now;
        checks are also queued in `self.check_to_launch[now]`. One warning is
        logged for each executor that lost something. The caller is expected to
        hold the checks/actions lock (see `check_orphaned`).
        """
        now = int(time.time())
        late_check_ids = []
        late_action_ids = []
        nb_orphans_by_executor = {}

        for check in self.checks.values():
            orphanage_delay = check.ref.get_time_to_orphanage()
            if not orphanage_delay:
                continue
            if not (check.status == 'inpoller' and check.t_to_go < now - orphanage_delay):
                continue
            # It is an orphan: only its time to go is changed to now, the original time to go
            # is not touched so the real latency can still be computed
            check.status = 'scheduled'
            check.t_to_go = now
            # try/except instead of a preventive test, for performance reasons
            try:
                self.check_to_launch[now].append(check)
            except KeyError:
                self.check_to_launch[now] = [check]
            late_check_ids.append(check.id)
            nb_orphans_by_executor[check.executor_id] = nb_orphans_by_executor.get(check.executor_id, 0) + 1

        for action in self.actions.values():
            orphanage_delay = action.ref.get_time_to_orphanage()
            if not orphanage_delay:
                continue
            if not (action.status == 'inpoller' and action.t_to_go < now - orphanage_delay):
                continue
            # same as for checks, but there is no launch queue to fill
            action.status = 'scheduled'
            action.t_to_go = now
            late_action_ids.append(action.id)
            nb_orphans_by_executor[action.executor_id] = nb_orphans_by_executor.get(action.executor_id, 0) + 1

        for executor_id in nb_orphans_by_executor:
            logger.warning("%d actions (check, notification, event handler, ...) never came back for the satellite '%s'. Reenabling them for immediate execution." % (nb_orphans_by_executor[executor_id], executor_id))
        if MONITORING_CHECK_CONSUME_DEBUG_FLAG:
            logger.info('The checks that are late are: (%s) %s' % (len(late_check_ids), late_check_ids))
            logger.info('The notifs that are late are: (%s) %s' % (len(late_action_ids), late_action_ids))
    
    
    # Each loop we are going to send our broks to our modules (if need)
    def send_broks_to_modules(self):
        """Give to each external module the broks it wants and has not received yet.

        For every external module instance, the broks of `self.broks` that are
        not flagged `sent_to_sched_externals` and that the module accepts
        (`want_brok`) are put, as one list, in the module `to_q` queue. All the
        broks are then flagged as sent. A debug line is logged when the whole
        operation took more than 0.1s.
        """
        start = time.time()
        total_sent = 0
        for module in self.sched_daemon.modules_manager.get_external_instances():
            logger.debug("Look for sending to module %s" % module.get_name())
            module_queue = module.to_q
            batch = []
            for brok in self.broks.values():
                if getattr(brok, 'sent_to_sched_externals', False):
                    continue
                if module.want_brok(brok):
                    batch.append(brok)
            module_queue.put(batch)
            total_sent += len(batch)

        # No more need to send them
        for brok in self.broks.values():
            brok.sent_to_sched_externals = True

        if time.time() - start > 0.1:
            logger.debug("[Broks] Time to send %s broks (after %d secs)" % (total_sent, time.time() - start))
    
    
    # Get 'objects' from external modules.
    # Right now nobody uses it, but it can be useful for a module like livestatus to raise external commands, for example
    def get_objects_from_from_queues(self):
        """Drain the 'from' queues of the external modules into the scheduler.

        Each queue is read without blocking until it raises `Empty`; every
        object read is given to `self.add`.
        """
        for from_queue in self.sched_daemon.modules_manager.get_external_from_queues():
            while True:
                try:
                    received = from_queue.get(block=False)
                    self.add(received)
                except Empty:
                    # Nothing left in this queue, go to the next one
                    break
    
    
    # Filter to drop incidents that are older than a week
    def _is_newer_than_a_week(self, incident):
        return 'bloc_end' in incident and incident['bloc_end'] >= time.time() - 7 * 24 * 60 * 60
    
    
    # Closes and filters incidents
    def clean_incidents(self):
        """Normalize, filter and close the notification incidents of every item.

        For each service, then each host:
        * a 'bloc_content' stored as a dict (old format) is replaced by the list
          of its values, ordered by sorted key;
        * only the incidents accepted by `_is_newer_than_a_week` are kept;
        * when the item has at least 20 consecutive OK and still has incidents,
          the 'bloc_end' of the last one is set to the newest 'time' found in
          its 'bloc_content'.
        """
        for item in itertools.chain(self.services, self.hosts):
            # Old format (dict) => ordered list
            for incident in item.notification_list:
                content = incident['bloc_content']
                if isinstance(content, dict):
                    incident['bloc_content'] = [content[key] for key in sorted(content.keys())]

            item.notification_list = [incident for incident in item.notification_list if self._is_newer_than_a_week(incident)]

            # End the incident with the newest notification time
            if item.consecutive_ok >= 20 and item.notification_list:
                last_incident = item.notification_list[-1]
                last_incident['bloc_end'] = max([notif['time'] for notif in last_incident['bloc_content']])
    
    
    # Update and clean data load from retention with new configuration
    def _sanatize_item_post_retention_load(self):
        self._recomput_impact_and_root_problem()
        self._reschedule_notification_with_new_notification_interval()
        self._set_state_validity_period()
    
    
    def _recomput_impact_and_root_problem(self):
        for items in [self.hosts, self.services]:
            for item in items:
                if item.state != 'PENDING' and item.state_type == 'HARD' and item.state_id != 0 and item.is_root_problem():
                    # SEF-5621 After retention load we do not change state of the impact, because it was done before retention loading and it must not be done twice
                    item.set_myself_as_problem(enable_impact_state=False)
    
    
    def _set_state_validity_period(self):
        for items in [self.hosts, self.services]:
            for item in items:
                if item.state_validity_period == 0:
                    if item.check_interval:
                        item.state_validity_period = item.check_interval * item.__class__.interval_length
                    else:
                        item.state_validity_period = 5 * 60
    
    
    def _reschedule_notification_with_new_notification_interval(self):
        for notification in self.actions.values():
            is_master_notification = notification.is_a == ACTION_TYPES.NOTIFICATION and notification.status == 'scheduled' and not notification.contact
            if is_master_notification and notification.previous_t_to_go:
                ref_item = notification.ref
                old_t_to_go = notification.t_to_go
                new_t_to_go = ref_item.get_next_notification_time(notification, notification.previous_t_to_go)
                if new_t_to_go is not None and old_t_to_go != new_t_to_go:
                    notification.t_to_go = new_t_to_go
                    logger.debug("reschedule notification of:[%s] old_t_to_go:[%s]->t_to_go[%s]" % (ref_item.get_full_name(), old_t_to_go, notification.t_to_go))
    
    
    # Force an update of the proxies after loading the retention data
    def update_proxy_items_states(self):
        """Ask every service, then every host, to update its proxy."""
        for element in itertools.chain(self.services, self.hosts):
            element.update_proxy()
    
    
    # For all clusters, check the flapping change periodically, and not only when one of the hosts changes state
    def compute_cluster_flapping(self):
        """Run the flapping detection of every cluster and queue a check result brok.

        Each cluster compares its current `state_id` with its
        `last_cluster_state_id` (the current state is used when the last one is
        -1, i.e. not set), then remembers its state and appends its check result
        brok to its own brok list.
        """
        for cluster in self.clusters.itervalues():
            current_state_id = cluster.state_id
            previous_state_id = cluster.last_cluster_state_id
            # last_cluster_state_id can be -1 when not set
            reference_state_id = current_state_id if previous_state_id == -1 else previous_state_id
            cluster.check_flapping_change(current_state_id, reference_state_id)
            cluster.last_cluster_state_id = cluster.state_id
            cluster.broks.append(cluster.get_check_result_brok())
    
    
    # Main function
    def run(self):
        """Main function: initialize the scheduler, then loop until `self.must_run` is false.

        Does nothing when the daemon is not activated (spare). Otherwise it
        loads the retention, fixes up the loaded data, starts the late external
        modules, makes the first scheduling, then each turn of the loop:
        * runs the recurrent works whose tick period matches `self.ticks`,
        * computes and logs the statistics,
        * cleans the proxy history and dumps the objects when asked,
        * sleeps for what is left of the second.
        When the loop ends, the retention is saved.
        """
        # I'm a spare so do nothing
        if not self.sched_daemon.activated:
            return
        # Then we see if we've got info in the retention file
        self.retention_load()

        # We need to delete old retention we don't need anymore (ONLY AVAILABLE FOR MONGO RETENTION)
        if self.must_run and "mongodb_retention" in [module.module_type for module in self.sched_daemon.modules]:
            self.old_retention_delete()

        # Since we did load the retention, update the proxy elements with new values, to be sure to have something
        self.update_proxy_items_states()

        # Update and clean data load from retention with new configuration
        self._sanatize_item_post_retention_load()

        # Finally start the external modules now we got our data
        self.hook_point('pre_scheduler_mod_start')
        self.sched_daemon.modules_manager.start_external_instances(late_start=True)

        self.scheduler_is_ready = True

        # Ok, now all is initialized, we can make the initial broks
        logger.info("[%s] First scheduling is launching on host [ %d ] and checks [ %d ]" % (self.instance_name, len(self.hosts), len(self.services)))
        self.schedule()

        # And do a force for all clusters
        for cluster in self.clusters.itervalues():
            cluster.schedule(force=True)
        logger.info("[%s] First scheduling done" % self.instance_name)

        # Ticks are for recurrent function call like consume del zombies etc
        self.ticks = 0

        # We must reset it if we received a new conf from the Arbiter.
        # Otherwise, the stat check average won't be correct

        self.load_one_min = Load(initial_value=1)

        # Recompute initial business impacts to prevent sending erroneous notifications
        self.update_business_values()

        logger.debug("[scheduler][%s] First loop at %d" % (self.instance_id, time.time()))

        while self.must_run:
            start_of_loop = time.time()
            logger.debug("[scheduler][%s] Loop tick [%s]" % (self.instance_id, self.ticks))

            # The daemon must check if its threads are ok or not
            self.sched_daemon.assert_valid_satellite_threads()

            self.sched_daemon.sleep_time = 0.0
            recurrent_work_stats = []
            self.ticks += 1
            before_recurrent_works = time.time()
            # Do recurrent works like schedule, consume delete_zombie_checks
            for i in self.recurrent_works:
                (name, f, nb_ticks) = self.recurrent_works[i]
                # A 0 in the tick will just disable it
                if nb_ticks != 0:
                    _before = time.time()
                    if self.ticks % nb_ticks == 0:
                        f()
                    _after = time.time()
                    _diff = (_after - _before)
                    logger.log(_BEACON_LOG_LEVEL, '[beacon] %s [%.3f]' % (name, _diff))
                    if _diff > 0.1:
                        logger.warning('Long perf function call: %s => %.3f' % (name, _diff))
                    star = '' if _diff < 0.100 else '-'
                    recurrent_work_stats.append([name, _diff, star])

            if LOG_SCHEDULER_RECURRENT_TIMES:
                # Flag the slowest works of this turn, slowest first. zip() stops at the shortest
                # input, so having less than three enabled works does not raise an IndexError
                recurrent_work_stats.sort(key=lambda x: -x[1])
                for work_stat, marker in zip(recurrent_work_stats, ('***', '**', '*')):
                    work_stat[2] = marker

                recurrent_work_stats.sort(key=lambda x: x[0])  # sort by name

                for name, _diff, star in recurrent_work_stats:
                    logger.debug('[TIMING] Loop turn action : %-45s => %.3fs %s' % (name, _diff, star))
                logger.debug('[TIMING] => Total of turn actions: %.3fs' % (time.time() - before_recurrent_works))

            # stats
            before_ = time.time()
            with self.checks_n_actions_lock:
                self._compute_and_print_stat()
            after_ = time.time()
            logger.log(_BEACON_LOG_LEVEL, '[beacon] _compute_and_print_stat [%.3f]' % (after_ - before_))

            # We are cleaning old proxy change entries so we don't leak memory
            # all over the time
            proxyitemsmgr.clean_history()

            if self.need_objects_dump:
                logger.debug('I need to dump my objects!')
                self.dump_objects()
                self.need_objects_dump = False

            with self.sched_daemon.satellite_lock:
                logger.debug('%s %s [QUEUE] Current brokers and broks queues: %s' % (CHAPTER_STATS, SECTION_BROKERS, ' '.join(['[%s=>%s broks in queue]' % (bname, len(broker['broks'])) for (bname, broker) in self.brokers.iteritems()])))

            # Now we should loop at time
            end_of_loop = time.time()

            elapsed_time = end_of_loop - start_of_loop
            # If we got back in time, skip this sleep
            if elapsed_time < 0:
                continue

            self.loop_time_avg.update_avg(elapsed_time)

            log_level = logging.WARNING if elapsed_time > 0.5 else logging.DEBUG
            logger.log(log_level, '%s loop time raw/avg[%.3f]/[%.3f] ' % (CHAPTER_STATS, elapsed_time, self.loop_time_avg.get_avg(0, flatten=False)))
            logger.debug("%s ==================================" % CHAPTER_STATS)
            self.sched_daemon.sleep(1.0 - elapsed_time)

        self.scheduler_is_ready = False
        # WE must save the retention at the quit BY OURSELF
        # because our daemon will not be able to do it for us
        self.update_retention_file(True)
    
    
    def _compute_and_print_stat(self):
        """Update the averages from the raw counters of this turn, log them, then reset the counters.

        Per executor: the raw send/received counters of `self.stat_by_executor`
        feed the executor averages and are logged, then reset to 0. The global
        raw counters (checks to send, notifications to send, checks received by
        cause) feed their averages and are reset too. Finally the checks are
        counted by status (scheduled, todo, late, inpoller, zombie), under
        `self.checks_n_actions_lock`, and the result is logged.
        """
        total_checks_send = 0
        total_checks_received = 0
        for executor_id, stat_by_executor in self.stat_by_executor.iteritems():
            nb_checks_send = stat_by_executor.get('raw_nb_checks_send', 0)
            cpu_time_checks_send = stat_by_executor.get('raw_cpu_time_checks_send', 0)
            nb_checks_received = stat_by_executor.get('raw_nb_checks_received', 0)
            cpu_time_checks_received = stat_by_executor.get('raw_cpu_time_checks_received', 0)

            self._compute_executor_average_stat(executor_id, cpu_time_checks_received, cpu_time_checks_send, nb_checks_received, nb_checks_send)

            total_checks_send += nb_checks_send
            total_checks_received += nb_checks_received
            logger.debug("%s %s [%30s] Raw/Avg : "
                         "give:[%4d]nb/[%0.3f]s | [%4d]nb/[%0.3f]s "
                         "return:[%4d]nb/[%0.3f]s | [%4d]nb/[%0.3f]s" % (
                             CHAPTER_STATS,
                             SECTION_POLLERS_REACTIONNERS,
                             executor_id,
                             nb_checks_send,
                             cpu_time_checks_send,
                             self.avg_nb_checks_send[executor_id].get_avg(0),
                             self.avg_cpu_time_checks_send[executor_id].get_avg(0),
                             nb_checks_received,
                             cpu_time_checks_received,
                             self.avg_nb_checks_received[executor_id].get_avg(0),
                             self.avg_cpu_time_checks_received[executor_id].get_avg(0)))
        self.avg_checks_todo_by_sec.update_avg(self.raw_nb_checks_to_send)
        self.avg_notification_todo_by_sec.update_avg(self.raw_nb_notification_to_send)
        self.avg_cpu_time_checks_to_send.update_avg(self.raw_cpu_time_checks_to_send)

        self.avg_total_checks_send.update_avg(total_checks_send)
        self.avg_total_checks_received.update_avg(total_checks_received)

        # update checks by causes
        self.avg_checks_received_schedule_by_sec.update_avg(self.raw_nb_checks_received_schedule)
        self.avg_checks_received_force_by_sec.update_avg(self.raw_nb_checks_received_force)
        self.avg_checks_received_retry_by_sec.update_avg(self.raw_nb_checks_received_retry)
        self.avg_checks_received_dependency_by_sec.update_avg(self.raw_nb_checks_received_dependency)
        self.raw_nb_checks_received_schedule = 0
        self.raw_nb_checks_received_force = 0
        self.raw_nb_checks_received_retry = 0
        self.raw_nb_checks_received_dependency = 0

        logger.debug("%s %s broks send:[%6d] "
                     "checks : todo:[%4d]/[%4d]nb - [%0.3f]/[%0.3f]s "
                     "give:[%4d]/[%4d]nb "
                     "return:[%4d]/[%4d]nb" %
                     (CHAPTER_STATS, SECTION_BROKERS, self.nb_broks_send,
                      self.raw_nb_checks_to_send,
                      self.avg_checks_todo_by_sec.get_avg(0),
                      self.raw_cpu_time_checks_to_send,
                      self.avg_cpu_time_checks_to_send.get_avg(0),
                      total_checks_send,
                      self.avg_total_checks_send.get_avg(0),
                      total_checks_received,
                      self.avg_total_checks_received.get_avg(0)
                      )
                     )

        # logger.debug("[stats] Raw :action todo : [%4d]/[%4d] without stats/total send" %
        #              ( total_checks_send - self._nb_checks_with_stat_send, total_checks_send))

        # The raw counters are per turn: start the next one from zero
        self.raw_nb_checks_to_send = 0
        self.raw_nb_notification_to_send = 0
        self.raw_cpu_time_checks_to_send = 0
        self._nb_checks_with_stat_send = 0

        for executor_id, stat_by_executor in self.stat_by_executor.iteritems():
            if 'raw_nb_checks_send' in stat_by_executor:
                stat_by_executor['raw_nb_checks_send'] = 0
                stat_by_executor['raw_cpu_time_checks_send'] = 0
            if 'raw_nb_checks_received' in stat_by_executor:
                stat_by_executor['raw_nb_checks_received'] = 0
                stat_by_executor['raw_cpu_time_checks_received'] = 0

        with self.checks_n_actions_lock:
            self.nb_scheduled = 0
            self.nb_todo = 0
            self.nb_late = 0
            self.late_checks_by_tags = {}
            self.nb_inpoller = 0
            self.nb_zombies = 0
            # One time reference for the whole count: no need to ask the clock for each check
            now = time.time()
            for check in self.checks.values():
                if check.status == 'scheduled':
                    self.nb_scheduled += 1
                    # todo means should be take by poller from now
                    if now >= check.t_to_go:
                        self.nb_todo += 1
                    # is late means that is is still scheduled, but not taken by poller since long (10s)
                    if check.is_late():
                        self.nb_late += 1
                        if check.poller_tag in self.late_checks_by_tags:
                            self.late_checks_by_tags[check.poller_tag] += 1
                        else:
                            self.late_checks_by_tags[check.poller_tag] = 1
                if check.status == 'inpoller':
                    self.nb_inpoller += 1
                if check.status == 'zombie':
                    self.nb_zombies += 1
            self.nb_notifications = len(self.actions)

            logger.debug("%s %s Raw : Action todo[ total: [%d], scheduled: [%d], nb_todo: [%d], inpoller: [%d], late: [%d], zombies: [%d], notifications: [%d]]" % (
                CHAPTER_STATS, SECTION_POLLERS_REACTIONNERS, len(self.checks), self.nb_scheduled, self.nb_todo, self.nb_inpoller, self.nb_late, self.nb_zombies, self.nb_notifications))
    
    
    def _compute_executor_average_stat(self, executor_id, cpu_time_checks_received, cpu_time_checks_send, nb_checks_received, nb_checks_send):
        avg_nb_checks_send = self.avg_nb_checks_send.get(executor_id, None)
        avg_cpu_time_checks_send = self.avg_cpu_time_checks_send.get(executor_id, None)
        avg_nb_checks_received = self.avg_nb_checks_received.get(executor_id, None)
        avg_cpu_time_checks_received = self.avg_cpu_time_checks_received.get(executor_id, None)
        
        if avg_nb_checks_send is None:
            avg_nb_checks_send = AvgInRange(60)
        avg_nb_checks_send.update_avg(nb_checks_send)
        self.avg_nb_checks_send[executor_id] = avg_nb_checks_send
        
        if avg_cpu_time_checks_send is None:
            avg_cpu_time_checks_send = AvgInRange(60)
        avg_cpu_time_checks_send.update_avg(cpu_time_checks_send)
        self.avg_cpu_time_checks_send[executor_id] = avg_cpu_time_checks_send
        
        if avg_nb_checks_received is None:
            avg_nb_checks_received = AvgInRange(60)
        avg_nb_checks_received.update_avg(nb_checks_received)
        self.avg_nb_checks_received[executor_id] = avg_nb_checks_received
        
        if avg_cpu_time_checks_received is None:
            avg_cpu_time_checks_received = AvgInRange(60)
        avg_cpu_time_checks_received.update_avg(cpu_time_checks_received)
        self.avg_cpu_time_checks_received[executor_id] = avg_cpu_time_checks_received
        
        executor_stat = self.stat_by_executor.get(executor_id, {})
        executor_stat['avg_nb_checks_send'] = avg_nb_checks_send.get_avg(0)
        executor_stat['avg_cpu_time_checks_send'] = avg_cpu_time_checks_send.get_avg(0)
        executor_stat['avg_nb_checks_received'] = avg_nb_checks_received.get_avg(0)
        executor_stat['avg_cpu_time_checks_received'] = avg_cpu_time_checks_received.get_avg(0)
        self.stat_by_executor[executor_id] = executor_stat
    
    
    def _load_exec_stat(self):
        """Load the execution stats of this daemon from its json file into `self._exec_stat`.

        `self._exec_stat` is first reset to an empty default executor entry, and
        it keeps that value whenever the file is missing, unreadable, is not
        valid json or does not hold a json object. When only the file of the old
        path exists, it is first moved to the new path. From the loaded data,
        every executor that is neither a known poller nor a known reactionner is
        dropped, and the default executor entry is created when missing.
        NOTE(review): the cleaning also drops the loaded default executor entry
        unless its name is a known satellite name; confirm this is intended.
        """
        # Always reset the exec stat as maybe we are from a corrupted exec_stats
        # and so we must be sure we will clean it, even if it means be void
        self._exec_stat = {_DEFAULT_EXECUTOR: {}}
        _exec_stat_path = _PATH_EXEC_STAT_PATTERN % self.sched_daemon.daemon_id
        if not os.path.exists(_exec_stat_path):
            # maybe it's just we are migrating from a 2.4.X version
            # but if not exists, there is so nothing to load
            if not os.path.exists(_OLD_PATH_EXEC_STAT):
                logger.debug("The load exec stat file %s is missing" % _exec_stat_path)
                return
            # Try to move the old stats so we don't start from zero
            # but it can fail, if so, we will have nothing to load
            try:
                shutil.move(_OLD_PATH_EXEC_STAT, _exec_stat_path)
                logger.info('Migrate the old execution time stats file %s to the new path %s' % (_OLD_PATH_EXEC_STAT, _exec_stat_path))
            except Exception as exp:
                logger.debug('Cannot move old execution time stats file %s to the new path %s : %s' % (_OLD_PATH_EXEC_STAT, _exec_stat_path, exp))
                return

        try:
            with open(_exec_stat_path, 'r') as f:
                buf = f.read()
            # Work on a local object: self._exec_stat must only be replaced by data that was
            # fully validated and cleaned, so a failure below keeps the clean default set above
            loaded_exec_stat = json.loads(buf)
            if not isinstance(loaded_exec_stat, dict):
                raise ValueError('the file content is a %s and not an object' % type(loaded_exec_stat).__name__)

            # Clean unknown pollers & reactionners
            known_executors = self.pollers_name | self.reactionners_name
            for sat_name in list(loaded_exec_stat.keys()):
                if sat_name not in known_executors:
                    del loaded_exec_stat[sat_name]

            if _DEFAULT_EXECUTOR not in loaded_exec_stat:
                loaded_exec_stat[_DEFAULT_EXECUTOR] = {}

            self._exec_stat = loaded_exec_stat
            logger.info('Sucessfully load the execution stats from the file %s' % _exec_stat_path)
        except Exception as exp:
            logger.warning("[scheduler][%s] cannot load the exec stat from file [%s] : [%s]" % (self.instance_id, _exec_stat_path, exp))
    
    
    def _save_exec_stat(self, exec_stat):
        """Dump exec_stat as JSON into the per-daemon execution stats file.

        The data is written to a '<path>.tmp' file, flushed and fsync'ed, then
        moved over the final path. If exec_stat cannot be serialized
        (TypeError), the error is logged and the in-memory stats are reloaded
        through _load_exec_stat(). Any other error is only logged as a warning.
        """
        target_path = _PATH_EXEC_STAT_PATTERN % self.sched_daemon.daemon_id
        try:
            serialized = json.dumps(exec_stat)
            # Write into a temporary file first: being killed in the middle of
            # the write must not corrupt the final file
            tmp_path = target_path + '.tmp'
            with open(tmp_path, 'w') as tmp_fd:
                tmp_fd.write(serialized)
                tmp_fd.flush()
                os.fsync(tmp_fd.fileno())
            # Then put the complete file in place in one move
            shutil.move(tmp_path, target_path)
        except TypeError:
            # The exec stat holds something that cannot be turned into json: it is corrupted
            separator = '*' * 80
            logger.error('%s\nThe scheduler stats is corrupted and must be reset. Please open a support ticket with this log\n%s\n' % (separator, separator))
            logger.error('Corrupted stats: %s' % exec_stat)
            self._load_exec_stat()
        except Exception as exp:
            logger.warning("[scheduler][%s] cannot save the exec stat: [%s]" % (self.instance_id, exp))
    
    
    def _load_scheduler_stat(self):
        """Load self.scheduler_stat from the per-daemon scheduler stats file.

        Does nothing when os.name is 'nt'. If the file cannot be read, is not
        valid JSON, or does not contain a JSON object, a warning is logged and
        the current self.scheduler_stat is kept untouched.

        After loading, a 'save_retention_time' equal to -1 is reported by
        setting 'save_retention_error' to a timeout message.
        """
        if os.name == 'nt':  # in the tests as scheduler is not officially on windows
            return
        
        _scheduler_stat_path = _PATH_SCHEDULER_STAT_PATTERN % self.sched_daemon.daemon_id
        try:
            with open(_scheduler_stat_path, 'r') as f:
                loaded_stat = json.loads(f.read())
            # Valid json is not always an object (list, null, number...): only a dict
            # can be used as scheduler stat, the .get() below would crash otherwise
            if not isinstance(loaded_stat, dict):
                raise ValueError('the scheduler stat must be a json object, not a %s' % type(loaded_stat).__name__)
            self.scheduler_stat = loaded_stat
            logger.info('Successfully load the scheduler stats from the file %s' % _scheduler_stat_path)
        except Exception as exp:
            logger.warning("[scheduler][%s] cannot load the scheduler stat from file [%s] : [%s]" % (self.instance_id, _scheduler_stat_path, exp))
        if self.scheduler_stat.get('save_retention_time', 0) == -1:
            self.scheduler_stat['save_retention_error'] = 'Timeout error ( 2 minutes )'
    
    
    def _update_scheduler_stat(self, stat_name, value):
        """Set one entry of self.scheduler_stat and save the whole dict on disk.

        stat_name: key to set in self.scheduler_stat
        value: value stored under that key; it is dumped with json.dumps

        The dict is written as JSON to a '<path>.tmp' file, flushed and
        fsync'ed, then moved over the per-daemon scheduler stats file.
        Errors are not raised: they are logged as a warning.
        """
        _scheduler_stat_path = _PATH_SCHEDULER_STAT_PATTERN % self.sched_daemon.daemon_id
        try:
            self.scheduler_stat[stat_name] = value
            buf = json.dumps(self.scheduler_stat)
            # first save into a tmp file, so we won't corrupt the final file even if we are killed
            tmp_file = '%s.tmp' % _scheduler_stat_path
            with open(tmp_file, 'w') as f:
                f.write(buf)
                f.flush()
                os.fsync(f.fileno())
            # Then move the complete file over the final path
            shutil.move(tmp_file, _scheduler_stat_path)
        except Exception as exp:
            # This is the scheduler stat file, not the exec stat one: say so in the log
            logger.warning("[scheduler][%s] cannot save the scheduler stat: [%s]" % (self.instance_id, exp))
