#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2009-2022:
#     Gabes Jean, naparuba@gmail.com
#     Gerhard Lausser, Gerhard.Lausser@consol.de
#     Gregory Starck, g.starck@gmail.com
#     Hartmut Goebel, h.goebel@goebel-consult.de
#
# This file is part of Shinken.
#
# Shinken is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Shinken is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with Shinken.  If not, see <http://www.gnu.org/licenses/>.

import base64
import copy
import hashlib
import itertools
import json
import logging
import os
import pickle
import shutil
import tempfile
import threading
import time
import zlib
from collections import deque

from shinken.action import ACTION_TYPES
from shinken.brok import Brok
from shinken.check import Check, CHECK_CAUSE, CHECK_STATUS
from shinken.comment import Comment  # Still loaded so pickle won't have a problem finding them if it loads an old retention
from shinken.compat import SHINKEN_PICKLE_PROTOCOL
from shinken.configuration_incarnation import PartConfigurationIncarnation
from shinken.contactdowntime import ContactDowntime
from shinken.downtime import Downtime
from shinken.eventhandler import EventHandler
from shinken.exceptions.system import ShinkenNoConfig
from shinken.external_command import ExternalCommand
from shinken.external_command import ExternalCommandManager
from shinken.http_client import HTTPExceptions
from shinken.load import Load, AvgInRange
from shinken.log import logger, get_chapter_string, get_section_string, LoggerFactory
from shinken.misc.type_hint import TYPE_CHECKING
from shinken.notification import Notification
from shinken.objects import timeperiod
from shinken.objects.checkmodulation import CheckModulations
from shinken.objects.command import Commands
from shinken.objects.contact import Contacts
from shinken.objects.contactgroup import Contactgroups
from shinken.objects.host import Hosts
from shinken.objects.hostgroup import Hostgroups
from shinken.objects.macromodulation import MacroModulations
from shinken.objects.notificationway import NotificationWays
from shinken.objects.proxyitem import proxyitemsmgr, proxyitemsgraph
from shinken.objects.schedulingitem import PENDING_OUTPUT
from shinken.objects.service import Services
from shinken.objects.servicegroup import Servicegroups
from shinken.objects.timeperiod import Timeperiods
from .actions_container import JOB_EXECUTION_FAST_INDEX, LOG_SCHEDULER_JOB_EXECUTION_FAST_INDEX
from .checks_container import checks_container
from .initial_broks_factory import InitialBroksFactory
from .network_exchange_sequencer import NetworkExchangeSequencer
from .runtime_stats.cpu_stats import cpu_stats_helper
from .runtime_stats.threads_dumper import WatchDogThreadDumper
from .safepickle import SafeUnpickler

if TYPE_CHECKING:
    from shinken.log import PartLogger
    from shinken.misc.type_hint import Optional, Union, List, Dict, Any
    from .daemons.schedulerdaemon import Shinken
    from .objects.host import Host
    from .objects.schedulingitem import SchedulingItem
    from .objects.service import Service
    from .objects.shardedconfiguration import ShardedConfiguration
    from shinken.configuration_incarnation import ConfigurationIncarnation

# Scheduler stats part
# Path pattern of the scheduler stats file; the '%d' is filled with a number by the code that uses it
# (presumably the scheduler id -- TODO confirm against the stat load/save code)
_PATH_SCHEDULER_STAT_PATTERN = "/var/lib/shinken/scheduler_stat-%d.dat"
# Execution stats part
# We save as retention the execution stats for pollers & reactionners in a file,
# one file for each scheduler of the local system
_PATH_EXEC_STAT_PATTERN = "/var/lib/shinken/scheduler_exec_stat-%d.dat"
# If we find the old stat file (< 2.5.0) then we move it to the new path and load it
_OLD_PATH_EXEC_STAT = "/var/lib/shinken/scheduler_exec_stat.dat"
# NOTE(review): presumably the cpu time (in seconds) assumed for an action without known stats -- confirm against its users
_DEFAULT_ACTION_CPU_TIME = 0.1
# Key of the default entry of the execution stats (see Scheduler._exec_stat)
# noinspection SpellCheckingInspection
_DEFAULT_EXECUTOR = "iQIcBAABCAAGBQJXmOX0AAoJEDjbvchgkmk"
_BEACON_LOG_LEVEL = logging.NOTSET
_KEEP_CHECKS_WARNING_THRESHOLD_CPU_USAGE_TIME = 20  # 20min

# Numeric status id => status name. NOTE: for hosts, both 1 and 2 give "DOWN"
_HOST_STATUS_ID_TO_STATUS = {0: "UP", 1: "DOWN", 2: "DOWN", 3: "UNKNOWN"}
_SERVICE_STATUS_ID_TO_STATUS = {0: "OK", 1: "WARNING", 2: "CRITICAL", 3: "UNKNOWN"}

# Debug flag: True only if the environment variable MONITORING_CHECK_CONSUME_DEBUG_FLAG is exactly '1'
MONITORING_CHECK_CONSUME_DEBUG_FLAG = os.environ.get('MONITORING_CHECK_CONSUME_DEBUG_FLAG', '0') == '1'

# Module level counters/timers (all start at zero), filled by code that is outside this part of the file
get_new_actions_stats = {}
ADD_CHECKS_STATS = {'nb_add': 0, 'total_time': 0.0, 'lock_time': 0.0, 'set_time': 0.0, 'dispatch_time': 0.0, 'hash_time': 0.0, 'brok_create_time': 0.0, 'brok_add_time': 0.0}
ADD_BROKS_STATS = {'nb_add': 0, 'total_time': 0.0, 'sat_lock_time': 0.0, 'brokers_lock_time': 0.0, 'to_one_broker_time': 0.0, 'to_all_brokers_time': 0.0, 'to_global_list_time': 0.0}

# Debug flag: True only if the environment variable SHINKEN_LOG_SCHEDULER_RECURRENT_TIMES_FLAG is exactly '1'
LOG_SCHEDULER_RECURRENT_TIMES = os.environ.get('SHINKEN_LOG_SCHEDULER_RECURRENT_TIMES_FLAG', '0') == '1'

# Loggers: each sub part is derived from the previous one, so the creation order matters
logger_raw = LoggerFactory.get_logger()
logger_give_broks = logger_raw.get_sub_part('GIVE BROKS')
logger_retention = logger_raw.get_sub_part('RETENTION')
logger_retention_analyse = logger_retention.get_sub_part('ANALYSE')
logger_retention_checks_in_progress = logger_retention_analyse.get_sub_part('CHECKS_IN_PROGRESS')

# Pre-computed chapter/section strings, used as prefixes of the log messages
CHAPTER_CONFIGURATION = get_chapter_string('CONFIGURATION')
CHAPTER_STATS = get_chapter_string('STATS')
CHAPTER_CHECKS_AND_NOTIF = get_chapter_string('check/notification/event handler')
SECTION_BROKERS = get_section_string('BROKERS')
SECTION_POLLERS = get_section_string('POLLERS')
SECTION_REACTIONNERS = get_section_string('REACTIONNERS')
SECTION_GET = get_section_string('GET')

SCHEDULING_CHAPTER = get_chapter_string('SCHEDULING')

# NOTE(review): its use is not visible in this part of the file -- presumably a ratio used when compressing broks, confirm
BROK_COMPRESSION_RATE = .25

_SCHEDULER_TIME_STR = get_chapter_string('SCHEDULER TIME')
_LOOP_START_STR = get_section_string('=== Loop start ===')
_LOOP_STOP_STR = get_section_string('=== Loop stop  ===')
_DEBUG_PERF_TAG = get_section_string('UPDATE DOWNTIMES')


class LOOP_TIME_GROUP:
    """Names of the time groups a recurrent work of the scheduler loop belongs to.
    
    Every entry of Scheduler.recurrent_works carries one of these values as its last field.
    """
    
    SCHEDULING = 'SCHEDULING'
    CONTEXT = 'CONTEXT_UPDATE'
    CLEANING = 'CLEANING'
    CHECK_ENV = 'CHECK_ENV'
    DEBUG = 'DEBUG'
    
    # Every known group, in declaration order
    ALL_GROUP = (
        SCHEDULING,
        CONTEXT,
        CLEANING,
        CHECK_ENV,
        DEBUG,
    )


class Scheduler:
    
    def __init__(self, scheduler_daemon):
        # type: (Shinken) -> None
        """Build an empty scheduler attached to its daemon: no configuration is loaded here.
        
        The constructor prepares the locks, the recurrent works table, the stats counters and the
        empty queues (checks, actions, downtimes, brokers...). It then loads the scheduler stats
        (self._load_scheduler_stat), gives the checks/actions lock to the checks container and
        creates the initial broks factory.
        
        :param scheduler_daemon: the daemon that owns this scheduler. It gives force_check_spread_out
                                 and check_for_system_time_change; its satellite_lock is used by the
                                 brokers related methods.
        """
        self.sched_daemon = scheduler_daemon
        # These attributes are only declared when TYPE_CHECKING is true, for the type checkers.
        # NOTE(review): presumably TYPE_CHECKING is False at run time, so they do NOT exist until _reset()/load_conf()
        # set them: this is why some methods are testing hasattr(self, 'conf') -- confirm
        if TYPE_CHECKING:
            self.conf = None  # type: Optional[ShardedConfiguration]
            self.program_start = 0
            self.instance_name = ''
            self.hostgroups = None  # type: Optional[Hostgroups]
            self.services = None  # type: Optional[Services]
            self.notificationways = None  # type: Optional[NotificationWays]
            self.checkmodulations = None  # type: Optional[CheckModulations]
            self.macromodulations = None  # type: Optional[MacroModulations]
            self.contacts = None  # type: Optional[Contacts]
            self.contactgroups = None  # type: Optional[Contactgroups]
            self.servicegroups = None  # type: Optional[Servicegroups]
            self.timeperiods = None  # type: Optional[Timeperiods]
            self.commands = None  # type: Optional[Commands]
            self.load_one_min = None  # type: Optional[Load]
            self.external_command = None  # type: Optional[ExternalCommandManager]
            self.missing_data_startup_delay = 0
        
        # When set to false by us, we die and arbiter launch a new Scheduler
        self.must_run = True
        # Set when the retention was read so the scheduler is ready
        self._scheduler_is_ready = False
        self._scheduler_is_ready_lock = threading.Condition(threading.RLock())
        self.force_check_spread_out = scheduler_daemon.force_check_spread_out
        
        self.last_retention_save_start = 0
        
        # When a configuration is being loaded, we should avoid some errors that occur because
        # some structures are not all updated, like rogues' poller/reactionners
        self.new_configuration_load_in_progress = False
        
        # By default, we got no configuration, so no incarnation too
        self.configuration_incarnation = None  # type: Optional[ConfigurationIncarnation]
        self.part_configuration_incarnation = None  # type: Optional[PartConfigurationIncarnation]
        
        # protect this unique list
        self.waiting_results_lock = threading.RLock()
        self.waiting_results = []  # satellites returns us results
        # and to not wait for them, we put them here and
        # use them later
        
        # Every N seconds we call functions like consume, del zombies
        # etc. All of these functions are in recurrent_works with the
        # every tick to run. So must be an integer > 0
        # The order is important, so make key an int.
        # Each entry is: (name, function to call, tick, LOOP_TIME_GROUP of the function)
        # TODO: at load, change value by configuration one (like reaper time, etc)
        self.recurrent_works = {
            0 : ('update_downtimes', self.update_downtimes, 1, LOOP_TIME_GROUP.CONTEXT),
            1 : ('schedule', self.schedule, 1, LOOP_TIME_GROUP.SCHEDULING),  # just schedule
            2 : ('consume_results', self.consume_results, 1, LOOP_TIME_GROUP.SCHEDULING),  # incorporate checks and dependencies
            3 : ('get_new_actions', self.get_new_actions, 1, LOOP_TIME_GROUP.SCHEDULING),  # now get the news actions (checks, notif) raised
            4 : ('get_new_broks', self.get_new_broks, 1, LOOP_TIME_GROUP.SCHEDULING),  # and broks
            
            # NOTE: 5 is free, was get_new_raw_datas
            
            6 : ('scatter_master_notifications', self.scatter_master_notifications, 1, LOOP_TIME_GROUP.SCHEDULING),
            7 : ('delete_zombie_checks', self.delete_zombie_checks, 1, LOOP_TIME_GROUP.CLEANING),
            8 : ('delete_zombie_actions', self.delete_zombie_actions, 1, LOOP_TIME_GROUP.CLEANING),
            # 3: (self.delete_unwanted_notifications, 1),
            9 : ('check_freshness', self.check_freshness, 10, LOOP_TIME_GROUP.SCHEDULING),
            10: ('clean_caches', self.clean_caches, 1, LOOP_TIME_GROUP.CLEANING),
            # NOTE: this tick is replaced by load_conf() with the configured retention interval
            11: ('update_retention_file', self.update_retention_file, 3600, LOOP_TIME_GROUP.CHECK_ENV),
            12: ('check_orphaned', self.check_orphaned, 60, LOOP_TIME_GROUP.CHECK_ENV),
            # For NagVis like tools: update our status every 10s
            13: ('get_and_register_update_program_status_brok', self.get_and_register_update_program_status_brok, 10, LOOP_TIME_GROUP.CHECK_ENV),
            # Check for system time change. And AFTER get new checks,
            # so they are changed too.
            14: ('check_for_system_time_change', self.sched_daemon.check_for_system_time_change, 1, LOOP_TIME_GROUP.CHECK_ENV),
            # launch all internal checks
            15: ('manage_internal_checks', self.manage_internal_checks, 1, LOOP_TIME_GROUP.SCHEDULING),
            # launch automatic ack computations
            16: ('compute_automatic_acknowledge', self.compute_automatic_acknowledge, 1, LOOP_TIME_GROUP.CONTEXT),
            # launch automatic flapping computations
            17: ('compute_automatic_flapping', self.compute_automatic_flapping, 1, LOOP_TIME_GROUP.CONTEXT),
            # launch automatic root problems computations
            18: ('compute_cluster_root_problems', self.compute_cluster_root_problems, 1, LOOP_TIME_GROUP.CONTEXT),
            
            # NOTE: 19 is free, was clean_queues
            
            # Look for new business_impact change by modulation every minute
            20: ('update_business_values', self.update_business_values, 60, LOOP_TIME_GROUP.CONTEXT),
            # Reset the topology change flag
            21: ('reset_topology_change_flag', self.reset_topology_change_flag, 1, LOOP_TIME_GROUP.CONTEXT),
            22: ('check_for_expire_acknowledge', self.check_for_expire_acknowledge, 1, LOOP_TIME_GROUP.CONTEXT),
            
            # NOTE: 23 is free, was send_broks_to_modules
            # NOTE: 24 is free, was get_objects_from_from_queues
            
            25: ('_clean_exec_stat', self._clean_exec_stat, 60, LOOP_TIME_GROUP.CLEANING),
            26: ('clean_incidents', self.clean_incidents, 5, LOOP_TIME_GROUP.CLEANING),
            27: ('compute_cluster_flapping', self.compute_cluster_flapping, 60, LOOP_TIME_GROUP.CONTEXT),
            28: ('cleanup_old_forgotten_zombies', self.cleanup_old_forgotten_zombies, 60, LOOP_TIME_GROUP.CLEANING),
        }
        
        # stats part
        self.scheduler_stat = {}
        self._nb_checks_with_stat_send = 0
        self.raw_nb_checks_to_send = 0
        self.raw_nb_notification_to_send = 0
        self.raw_nb_event_handler_to_send = 0
        self.raw_cpu_time_checks_to_send = 0
        
        self.avg_checks_todo_by_sec = AvgInRange(60)
        
        self.avg_notification_todo_by_sec = AvgInRange(60)
        self.avg_event_handler_todo_by_sec = AvgInRange(60)
        self.avg_cpu_time_checks_to_send = AvgInRange(60)
        self.avg_total_checks_send = AvgInRange(60)
        self.avg_total_checks_received = AvgInRange(60)
        
        self.avg_nb_checks_send = {}
        
        # checks stats by cause
        # raw stats
        self.raw_nb_checks_received_schedule = 0
        self.raw_nb_checks_received_force = 0
        self.raw_nb_checks_received_retry = 0
        self.raw_nb_checks_received_dependency = 0
        # computed avg
        self.avg_checks_received_schedule_by_sec = AvgInRange(60)
        self.avg_checks_received_force_by_sec = AvgInRange(60)
        self.avg_checks_received_retry_by_sec = AvgInRange(60)
        self.avg_checks_received_dependency_by_sec = AvgInRange(60)
        
        self.avg_cpu_time_checks_send = {}
        self.avg_nb_checks_received = {}
        self.avg_cpu_time_checks_received = {}
        
        self._executor_type = {}
        self.stat_by_executor = {}
        self.checks_warning_threshold_cpu_usage = []
        
        # The execution stats always have an (empty) entry for the default executor
        self._exec_stat = {_DEFAULT_EXECUTOR: {}}
        self.nb_broks_send = 0
        self.nb_raw_datas_send = 0
        
        # Log init
        logger.load_obj(self)
        
        self.instance_id = 0  # Temporary set. Will be erased later
        
        # Our queues
        self.checks_n_actions_lock = threading.RLock()
        self.checks = {}
        # self.check_to_launch = {}
        # each turn we are filling
        self._checks_created_this_turn_lock = threading.RLock()
        self._checks_created_this_turn = deque()
        self.actions = {}
        self.downtimes = {}
        self.contact_downtimes = {}
        self.raw_datas = {}
        self.hosts = Hosts({})
        self.clusters = {}
        # Some flags
        self.need_objects_dump = False  # set by signal 2
        
        # Now fake initialize for our satellites
        self.brokers = {}
        self.brokers_lock = threading.RLock()
        self.pollers = {}
        self.reactionners = {}
        self.pollers_name = set()
        self.reactionners_name = set()
        self.rogue_satellites_lock = threading.RLock()
        self.rogue_satellites = {}
        # Names of the brokers that did ask for the initial broks of the current configuration incarnation
        self.already_generated_initial_broks_for_this_configuration = set([])
        
        # Keep up counters
        self.checks_n_actions_stats = {
            'check'       : {'nb_total': 0, 'scheduled': 0, 'todo': 0, 'in_executor': 0, 'nb_late': 0, 'zombies': 0, 'late_by_tags': {}},
            'notification': {'nb_total': 0, 'scheduled': 0, 'todo': 0, 'in_executor': 0, 'nb_late': 0, 'zombies': 0, 'late_by_tags': {}},
            'eventhandler': {'nb_total': 0, 'scheduled': 0, 'todo': 0, 'in_executor': 0, 'nb_late': 0, 'zombies': 0, 'late_by_tags': {}},
        }  # type: Dict[str, Dict[str, Union[int, Any]]]
        
        self.lat_avg = 0
        self.lat_min = 0
        self.lat_max = 0
        
        # Keep a trace of loop time
        self.loop_time = {}  # Will be reset all loop_turn
        self.loop_time_avg = AvgInRange(60)
        
        # Keep a set of our uuids (hosts+services)
        self.elements_uuids = set()
        
        self.skip_retention_save = False
        
        self._retention_data = None
        self._save_retention_thread: 'threading.Thread|None' = None
        self._save_retention_lock: 'threading.Condition' = threading.Condition(threading.RLock())
        self._save_retention_thread_has_started: 'bool' = False
        self._delete_old_retention_thread = None
        self._load_scheduler_stat()
        
        # The check's container need the lock to secure its access, in the future, this lock will
        # need to be fully on but it and we will only ask it from the exterior some data/change
        checks_container.load_lock(self.checks_n_actions_lock)
        self._loop_number = 0
        
        # The Initial Broks have a special factory, so we can factorise several requests into
        # one unique generation instead of X
        # NOTE: WE(Scheduler) are generating the Broks, but the factory is managing the requests and synchronization
        self.initial_broks_factory = InitialBroksFactory(self)
    
    
    # We are getting a new configuration, so either we are going into spare, or
    # we are loading a new configuration and will run.
    # * spare: drop everything
    # * active: keep the waiting results, so we do not lose notifications and checks
    def _reset(self, is_going_to_spare=False):
        """Drop our runtime state, so we are clean before a new configuration (or before going idle).
        
        :param is_going_to_spare: if True, the results that are waiting from the satellites are dropped too.
                                  If False they are kept.
        """
        self.must_run = True
        self.elements_uuids = set()
        
        # Only a scheduler that stops running is allowed to forget the waiting results
        if is_going_to_spare:
            with self.waiting_results_lock:
                del self.waiting_results[:]
        
        with self.checks_n_actions_lock:
            containers_to_empty = (self.checks, self.actions, self.downtimes, self.contact_downtimes, self.raw_datas, self.brokers, self.clusters)
            for container in containers_to_empty:
                container.clear()
            # the index of the checks must be cleaned too
            checks_container.reset()
            self._reset_stats()
            
            # Start again with empty configuration objects
            self.services = Services({})
            self.hosts = Hosts({})
            self.hostgroups = Hostgroups({})
            self.notificationways = NotificationWays({})
            self.checkmodulations = CheckModulations({})
            self.macromodulations = MacroModulations({})
            self.contacts = Contacts({})
            self.contactgroups = Contactgroups({})
            self.servicegroups = Servicegroups({})
            self.timeperiods = Timeperiods({})
            self.commands = Commands({})
            self.external_command = ExternalCommandManager({}, 'applicators')
        
        with self.rogue_satellites_lock:
            self.rogue_satellites.clear()
    
    
    # The daemon is warning us that a new configuration is being loaded, so beware of some structures that may
    # not be fully loaded yet, like the rogue satellites
    def warn_about_a_new_configuration_load_in_progress(self):
        """Raise the flag that says a new configuration is currently being loaded."""
        self.new_configuration_load_in_progress = True
    
    
    # The daemon is warning us that the load of the new configuration is finished
    def warn_about_the_end_of_the_configuration_load(self):
        """Lower the flag that says a new configuration is currently being loaded."""
        self.new_configuration_load_in_progress = False
    
    
    def get_current_satellites(self):
        """Give the names of the satellites we currently know, by satellite type.
        
        :return: a dict with the keys 'broker', 'receiver', 'poller' and 'reactionner', each value is a list of names
        """
        return {
            'broker'     : list(self.brokers),
            'receiver'   : [],  # currently void
            'poller'     : list(self.pollers_name),
            'reactionner': list(self.reactionners_name),
        }
    
    
    def get_retention_save_interval(self):
        """Give the retention update interval of our configuration, or 0 if we have no configuration.
        
        :return: self.conf.retention_update_interval, or 0 without a (non-empty) configuration
        """
        # The attribute can even be missing: it is the case for a spare that is sleeping since its start
        current_conf = getattr(self, 'conf', None)
        if current_conf:
            return current_conf.retention_update_interval
        # No configuration, so the retention is not our main problem
        return 0
    
    
    def reset_broker_entry(self, broker_name):
        """Give a brand-new entry to this broker, replacing any previous one (done under the satellite lock).
        
        :param broker_name: name of the broker, it is the key in self.brokers
        """
        with self.sched_daemon.satellite_lock:
            fresh_entry = {
                # broks only for this broker
                'broks'                          : deque(),
                # the first broks for a configuration incarnation MUST be the initial ones, so have a flag to know when it's not
                'already_generated_initial_broks': False,
                'last_contact_with_broker'       : time.time(),
                # used to know at which point in flow we are with this Broker
                'network_sequencer'              : NetworkExchangeSequencer(),
            }
            self.brokers[broker_name] = fresh_entry
    
    
    def create_broker_entry(self, broker_name):
        """Create the entry of a broker if we do not know it yet. A known broker is left untouched.
        
        :param broker_name: name of the broker that contacted us
        """
        with self.sched_daemon.satellite_lock:
            is_new_broker = broker_name not in self.brokers
            if is_new_broker:
                logger.info('%s %s [%10s] This new broker contacted us. Creating a brok queue for this new broker.' % (CHAPTER_CONFIGURATION, SECTION_BROKERS, broker_name))
                self.reset_broker_entry(broker_name)
    
    
    # We want to clear all broks from a specific broker
    # NOTE: it's up to YOU to be sure the broker exists, and YOU have the self.sched_daemon.satellite_lock
    def _clear_broks_of_broker(self, broker_name):
        """Replace the broks queue of this broker by a new empty one.
        
        The caller must be sure that the broker exists, and must own self.sched_daemon.satellite_lock.
        
        :param broker_name: name of a broker that is in self.brokers
        """
        broker_entry = self.brokers[broker_name]
        broker_entry['broks'] = deque()
    
    
    # NOTE: YOU must be sure the broker does exist, and YOU already have the self.sched_daemon.satellite_lock
    def _append_brok_to_broker(self, broker_name, brok):
        """Stack a brok for a broker, but only if this broker is allowed to receive broks.
        
        The caller must be sure that the broker exists, and must own self.sched_daemon.satellite_lock.
        
        :param broker_name: name of a broker that is in self.brokers
        :param brok: the brok to add at the end of the broker queue
        """
        broker_entry = self.brokers[broker_name]
        
        # Maybe this broker is inhibited until it does ask for initial broks for our current configuration incarnation
        if not broker_entry['already_generated_initial_broks']:
            if broker_name not in self.already_generated_initial_broks_for_this_configuration:
                # not stack any broks, the first one we want are initial broks, not this one
                logger_give_broks.debug('[ %s ] The broker did not asked for initial broks, not stacking the brok' % broker_name)
                return
            
            # When the Arbiter removes a Broker from our list without sending a new ConfigurationIncarnation,
            # we remember every Broker that already asked us the initial broks:
            # if such a Broker comes back and the ConfigurationIncarnation did not change,
            # it will ask for broks without asking for the initial broks again (it already has the configuration)
            # See SEF-10073
            broker_entry['already_generated_initial_broks'] = True
            logger.info('The Broker %s which was remove by the Arbiter is return. We allow brok for him.' % broker_name)
        
        broker_entry['broks'].append(brok)
    
    
    def reset_broker_entry_for_initial_broks(self, broker_name):
        """Prepare the entry of a broker that is asking for the initial broks.
        
        A known broker gets a brand-new entry, an unknown one is created. In both cases the broker is
        then allowed to stack broks, and it is remembered as having asked the initial broks for the
        current configuration.
        
        :param broker_name: name of the broker that is asking for the initial broks
        """
        with self.sched_daemon.satellite_lock:
            if broker_name not in self.brokers:
                # new one
                self.create_broker_entry(broker_name)
            else:
                # already have, just reset the broks
                self.reset_broker_entry(broker_name)
                logger_give_broks.debug('[ %s ] Re-enabled the broker broks generation (asked for initial broks)' % broker_name)
            
            # The broker is asking for initial broks, we can allow it again to stack broks, the FIRST one will be the initial ones
            broker_entry = self.brokers[broker_name]
            broker_entry['already_generated_initial_broks'] = True
            self.already_generated_initial_broks_for_this_configuration.add(broker_name)
    
    
    def remove_broker(self, broker_name):
        """Forget a broker (and so its broks queue) because the arbiter asks for it.
        
        An unknown broker is only logged, it is not an error.
        
        :param broker_name: name of the broker to remove from self.brokers
        """
        with self.sched_daemon.satellite_lock:
            log_arguments = (CHAPTER_CONFIGURATION, SECTION_BROKERS, broker_name, broker_name)
            if broker_name in self.brokers:
                # Ok we have it, clean it
                del self.brokers[broker_name]
                logger.info('%s %s [%10s] The arbiter asks us to remove broker "%s" that is no more need.' % log_arguments)
            else:
                logger.info('%s %s [%10s] The arbiter asks us to remove broker "%s" but we does not have it. Skipping order.' % log_arguments)
    
    
    def create_providers_entry(self, provider_name):
        """Do nothing: the scheduler keeps no entry for the providers.
        
        :param provider_name: ignored
        """
        return None
    
    
    def set_as_inactive(self, instance_name: 'str', configuration_incarnation: 'ConfigurationIncarnation', is_spare: 'bool'):
        """Put the scheduler in an inactive state: everything is dropped and we have no configuration.
        
        :param instance_name: the name to keep for this scheduler
        :param configuration_incarnation: the configuration incarnation to keep
        :param is_spare: only used to choose the text of the log
        """
        if is_spare:
            txt = 'Waiting until we receive and order to start.'
        else:
            txt = 'Setting us as idle until we receive an active shard.'
        logger.info('[CONFIGURATION] Did receive shard . %s' % txt)
        
        # We are not activated: clean the scheduler, the waiting results included
        self._reset(is_going_to_spare=True)
        
        self.instance_name = instance_name
        self.configuration_incarnation = configuration_incarnation
        self.conf = None
    
    
    def load_configuration_from_arbiter(self, conf, pollers, reactionners, configuration_incarnation, _logger: 'PartLogger|None' = None):
        """Replace our current state by the configuration the arbiter is giving us.
        
        :param conf: the configuration to load
        :param pollers: the pollers, given to load_satellites
        :param reactionners: the reactionners, given to load_satellites
        :param configuration_incarnation: the incarnation of this configuration
        :param _logger: logger to use; if missing, the CONFIGURATION sub part of the default logger is used for the final log
        """
        load_start_time = time.time()
        
        # Clean ourselves, then take the configuration, the satellites and the execution stats
        self._reset()
        self.load_conf(conf, configuration_incarnation, _logger=_logger)
        self.load_satellites(pollers, reactionners)
        self._load_exec_stat(_logger)
        
        # The external commander is an applicators: its role is not to dispatch commands, but to apply them
        applicator = ExternalCommandManager(conf, 'applicators')
        # We need to know about the external commands, to activate them if necessary
        self.load_external_command(applicator)
        # and the external commands need us, because they can raise checks
        applicator.load_scheduler(self)
        
        _logger = _logger or LoggerFactory.get_logger().get_sub_part(CHAPTER_CONFIGURATION)
        _logger.info("The configuration %s was loaded in [%.3f]s" % (self.part_configuration_incarnation, time.time() - load_start_time))
    
    
    # Load the configuration for future use. in_test is True only for the tests, when the data come from an arbiter-like object
    def load_conf(self, conf: 'ShardedConfiguration', configuration_incarnation: 'ConfigurationIncarnation', in_test: 'bool' = False, _logger: 'PartLogger|None' = None) -> None:
        """Take a configuration as our current one and prepare its objects for the run.
        
        Steps, in this order:
        * keep the configuration, its instance id/name and its incarnation
        * forget the brokers that asked for the initial broks if the incarnation did change
        * take the objects collections of the configuration and build their reversed lists
        * index the clusters (items with got_business_rule or is_cluster) by instance uuid
        * log how many elements are loaded
        * relink the commands (late_linkify), unless in_test
        * tag the hosts and services with our instance_id and rebuild self.elements_uuids
        * resolve the static macros of notes_url / notes_multi_url / thresholds display
        * set the tick of 'update_retention_file' from the configuration
        
        :param conf: the configuration to load
        :param configuration_incarnation: the incarnation of this configuration
        :param in_test: if True, the late command relink is skipped
        :param _logger: logger to use; if missing, the CONFIGURATION sub part of the default logger is used
        """
        if not _logger:
            _logger = LoggerFactory.get_logger().get_sub_part(CHAPTER_CONFIGURATION)
        self.stat_by_executor = {}
        self.program_start = int(time.time())
        self.conf = conf
        
        # From Arbiter. Use for Broker to differentiate schedulers
        self.instance_id = conf.instance_id
        self.instance_name = conf.instance_name
        
        # Keep the previous incarnation: it is compared with the new one just below
        old_configuration_incarnation = self.configuration_incarnation
        self.configuration_incarnation = configuration_incarnation
        self.part_configuration_incarnation = PartConfigurationIncarnation(self.configuration_incarnation, self.instance_id, self.instance_name)
        
        self._reset_brokers_that_have_already_generated_initial_broks_for_this_configuration(old_configuration_incarnation)
        
        # We must update our Config dict macro with good value from the config parameters
        self.conf.fill_resource_macros_names_macros()
        self.hostgroups = conf.hostgroups
        self.hostgroups.create_reversed_list()
        self.services = conf.services
        # Refill time: 0.5s by 1K checks
        self.services.refil_running_properties()  # the arbiter did not create some state properties, we must have them now
        
        # We need reversed list for search in the retention
        # file read
        self.services.create_reversed_list()
        self.services.optimize_service_search(conf.hosts, build_service_on_host_index=True)
        self.hosts = conf.hosts
        self.hosts.refil_running_properties()  # the arbiter did not create some state properties, we must have them now
        self.hosts.create_reversed_list()
        
        self.notificationways = conf.notificationways
        self.checkmodulations = conf.checkmodulations
        self.macromodulations = conf.macromodulations
        self.contacts = conf.contacts
        self.contacts.refil_running_properties()  # the arbiter did not create some state properties, we must have them now
        self.contacts.create_reversed_list()
        self.contactgroups = conf.contactgroups
        self.contactgroups.create_reversed_list()
        self.servicegroups = conf.servicegroups
        self.servicegroups.create_reversed_list()
        self.timeperiods = conf.timeperiods
        self.timeperiods.create_reversed_list()
        self.commands = conf.commands
        self.commands.create_reversed_list()
        
        # Missing data tolerance thresholds (0 if the configuration does not have the parameter)
        self.missing_data_startup_delay = getattr(self.conf, 'minimal_time_before_an_element_become_missing_data_at_startup', 0)
        
        # We must update our Config dict macro with good value from the config parameters because we use it to calculate notes_url and notes_multi_url
        self.conf.fill_resource_macros_names_macros()
        
        # Index the clusters by their instance uuid
        self.clusters = {}
        for potential_cluster in itertools.chain(self.hosts, self.services):
            if potential_cluster.got_business_rule or potential_cluster.is_cluster:
                self.clusters[potential_cluster.get_instance_uuid()] = potential_cluster
        
        # Log to the admin how many elements we loaded in this configuration
        nb_clusters = len(self.clusters)
        
        _logger.info('The configuration %s is loaded with theses elements:' % self.part_configuration_incarnation)
        _logger.info('  - clusters    : %d' % nb_clusters)
        # NOTE(review): the clusters count is removed from the hosts count, but self.clusters can also
        # contain services (see the loop above) -- confirm that this is the wanted number
        _logger.info('  - hosts       : %d' % (len(self.hosts) - nb_clusters))
        _logger.info('  - host groups : %d' % len(self.hostgroups))
        _logger.info('  - checks      : %d' % len(self.services))
        _logger.info('  - commands    : %d' % len(self.commands))
        _logger.info('  - users       : %d' % len(self.contacts))
        _logger.info('  - user groups : %d' % len(self.contactgroups))
        
        if not in_test:
            # Commands in the host/services/contacts are not real one
            # we must relink them
            t0 = time.time()
            self.conf.late_linkify()
            # NOTE(review): '%d' truncates the elapsed time (a float, in seconds) to an integer
            _logger.debug("Late command relink in %d" % (time.time() - t0))
        
        self.conf.fill_resource_macros_names_macros()
        # Tag our hosts and services with our instance_id
        for h in self.hosts:
            h.instance_id = conf.instance_id
        for s in self.services:
            s.instance_id = conf.instance_id
        
        self.elements_uuids = set()  # be sure to reset it
        for h in self.hosts:
            self.elements_uuids.add(h.get_instance_uuid())
        for s in self.services:
            self.elements_uuids.add(s.get_instance_uuid())
        
        # Resolve static macro in notes_url / notes_multi_url of hosts and service
        for item in itertools.chain(self.hosts, self.services):
            notes_url = getattr(item, 'notes_url', None)
            if notes_url:
                data = item.get_data_for_checks()
                notes_url = self.sched_daemon.macro_resolver.resolve_simple_macros_in_string(notes_url, data, only_static_macros=True)
                item.notes_url = notes_url
            
            notes_multi_url = getattr(item, 'notes_multi_url', None)
            if notes_multi_url:
                data = item.get_data_for_checks()
                notes_multi_url = self.sched_daemon.macro_resolver.resolve_simple_macros_in_string(notes_multi_url, data, only_static_macros=True)
                item.notes_multi_url = notes_multi_url
            
            # Same thing for the thresholds display, but it is the item that does the job
            thresholds_display_definition = getattr(item, 'thresholds_display_definition', None)
            if thresholds_display_definition:
                item.resolve_static_macro_from_thresholds_display_at_load_conf(self.conf.language)
        
        # Now we can update our 'ticks' for special calls
        # like the retention one, etc
        # NOTE(review): the '* 60' suggests retention_update_interval is in minutes and the tick in seconds -- confirm
        self.update_recurrent_works_tick('update_retention_file', self.conf.retention_update_interval * 60, _logger)
    
    
    def _reset_brokers_that_have_already_generated_initial_broks_for_this_configuration(self, old_configuration_incarnation):
        # type: (ConfigurationIncarnation) -> None
        """Empty the set of brokers that already generated their initial broks when the configuration changed.

        The set is reset when there was no previous incarnation (None) or when the previous incarnation is not
        equal to self.configuration_incarnation. When both are equal the set is left untouched.
        """
        incarnation_unchanged = old_configuration_incarnation is not None and old_configuration_incarnation.is_equal(self.configuration_incarnation)
        if incarnation_unchanged:
            return
        self.already_generated_initial_broks_for_this_configuration = set()
    
    
    def get_daemon_incarnation(self):
        """Describe the configuration this scheduler currently runs.

        Returns a dict with 'shard_id' (the instance_id of self.conf) and 'configuration_incarnation_uuid'.
        An empty dict is returned while there is no conf attribute, no conf, or no configuration incarnation.
        """
        conf = getattr(self, 'conf', None)
        if conf is None:
            return {}
        incarnation = self.configuration_incarnation
        if incarnation is None:
            return {}
        return {'shard_id': conf.instance_id, 'configuration_incarnation_uuid': incarnation.get_uuid()}
    
    
    # Update the 'tick' for a function call in our recurrent work
    def update_recurrent_works_tick(self, f_name, new_tick, _logger: 'PartLogger'):
        """Replace the tick of every recurrent work registered under the name f_name.

        :param f_name: name of the recurrent work to update
        :param new_tick: new tick value stored in the work entry
        :param _logger: logger used for the debug trace, skipped when falsy
        """
        works = self.recurrent_works
        for work_id in works:
            work_name, work_function, _previous_tick, run_time_group = works[work_id]
            if work_name != f_name:
                continue
            if _logger:
                _logger.debug("Changing the tick to %d for the function %s" % (new_tick, work_name))
            # Entries are tuples, so the whole entry is rebuilt with the new tick
            works[work_id] = (work_name, work_function, new_tick, run_time_group)
    
    
    # Load the pollers/reactionners from our app master
    def load_satellites(self, pollers, reactionners):
        """Register the pollers and reactionners of the current configuration.

        :param pollers: mapping whose values are dicts with the keys 'name', 'poller_tags' and 'realm'
        :param reactionners: mapping whose values are dicts with the keys 'name', 'reactionner_tags' and 'realm'
        """
        self.pollers = pollers
        self.reactionners = reactionners
        # Start from empty sets so names from a previous configuration do not survive
        self.pollers_name = set()
        self.reactionners_name = set()
        # Forget the potential rogue satellites
        self.rogue_satellites = {}
        # SEF-1143: declare every configured satellite in the stats, pollers first, then reactionners
        executor_families = (
            (pollers, self.pollers_name, 'Poller', 'poller_tags'),
            (reactionners, self.reactionners_name, 'Reactionner', 'reactionner_tags'),
        )
        for satellites, known_names, executor_type, tags_key in executor_families:
            for satellite in satellites.values():
                satellite_name = satellite['name']
                known_names.add(satellite_name)
                self._add_execturor_stat(satellite_name, executor_type, satellite[tags_key], satellite['realm'])
    
    
    # add executors definition for stats
    def _add_execturor_stat(self, id_, type_, tag, realm):
        """Create or refresh the identity fields of one executor entry in self.stat_by_executor.

        :param id_: key of the executor in self.stat_by_executor
        :param type_: stored as is under 'type'
        :param tag: iterable of strings, stored comma-joined under 'tag'
        :param realm: stored as is under 'realm'
        """
        executor_stats = self.stat_by_executor
        # Keep the counters of an already known executor, only its identity fields are overwritten
        entry = executor_stats[id_] if id_ in executor_stats else {}
        entry['type'] = type_
        entry['tag'] = ",".join(tag)
        entry['realm'] = realm
        executor_stats[id_] = entry
    
    
    def die(self):
        """Ask the scheduler to stop by clearing its must_run flag."""
        self.must_run = False
    
    
    def dump_objects(self):
        """Write a text dump of every check and action to a timestamped file in the system temp directory.

        One line is written per check (uuid, status, t_to_go, poller_tag, command, executor_id) and one per action
        (type, uuid, status, t_to_go, reactionner_tag, command, executor_id). Any error is logged and swallowed:
        this is a best-effort debugging helper and never raises.
        """
        dump_path = os.path.join(tempfile.gettempdir(), 'scheduler-obj-dump-%d' % time.time())
        logger.info('Opening the DUMP FILE %s' % dump_path)
        try:
            # The context manager closes the file even when a write or an attribute access fails in the
            # middle of the dump (the previous explicit close() was skipped on error and leaked the handle).
            with open(dump_path, 'w') as dump_file:
                dump_file.write('Scheduler DUMP at %d\n' % time.time())
                # Hold the lock for the whole dump so checks and actions are not modified while we iterate
                with self.checks_n_actions_lock:
                    for c in self.checks.values():
                        dump_file.write('CHECK: %s:%s:%s:%s:%s:%s\n' % (c.get_uuid(), c.status, c.t_to_go, c.poller_tag, c.command, c.executor_id))
                    for a in self.actions.values():
                        dump_file.write('%s: %s:%s:%s:%s:%s:%s\n' % (a.__class__.my_type.upper(), a.get_uuid(), a.status, a.t_to_go, a.reactionner_tag, a.command, a.executor_id))
        except Exception as exp:
            logger.error("Error in writing the dump file %s : %s" % (dump_path, str(exp)))
    
    
    # Load the external command
    def load_external_command(self, e):
        """Store e as self.external_command, the object later used to resolve external commands."""
        self.external_command = e
    
    
    # We've got activity in the fifo, we get and run commands
    def run_external_commands(self, cmds):
        """Run every command of cmds, in order, through run_external_command."""
        for raw_command in cmds:
            self.run_external_command(raw_command)
    
    
    def run_external_command(self, command):
        """Wrap the raw command in an ExternalCommand and give it to self.external_command for resolution."""
        logger.debug("scheduler resolves command '%s'" % command)
        self.external_command.resolve_command(ExternalCommand(command))
    
    
    # add_Brok is a bit more complex than the other add_* methods: at startup the broks are put in a global queue (self.broks),
    # then when the first broker connects, it generates the initial broks in its own queue (so broker_name is not None),
    # and in "normal" run we just need to put the brok in every broker queue
    def add_Brok(self, brok, broker_name=None):  # noqa => upper for lisibility, deal with it
        # type: (Brok, Optional[str]) -> None
        """Tag the brok with our part configuration incarnation and queue it for one broker or for all of them.

        :param brok: the brok to dispatch
        :param broker_name: when truthy, only this broker receives the brok; otherwise every broker of self.brokers does

        Lock wait and dispatch durations are accumulated in ADD_BROKS_STATS.
        """
        ADD_BROKS_STATS['nb_add'] += 1

        satellite_lock_asked_at = time.time()
        # IMPORTANT: always take the satellite lock BEFORE the brokers lock
        with self.sched_daemon.satellite_lock:
            ADD_BROKS_STATS['sat_lock_time'] += time.time() - satellite_lock_asked_at

            brokers_lock_asked_at = time.time()
            with self.brokers_lock:
                ADD_BROKS_STATS['brokers_lock_time'] += time.time() - brokers_lock_asked_at

                dispatch_started_at = time.time()
                brok.set_part_configuration_incarnation(self.part_configuration_incarnation)
                if broker_name:
                    # Only one broker is targeted (like for the initial broks)
                    target_names = (broker_name,)
                    stat_key = 'to_one_broker_time'
                else:
                    # Every known broker gets the brok
                    target_names = self.brokers
                    stat_key = 'to_all_brokers_time'
                for target_name in target_names:
                    self._append_brok_to_broker(target_name, brok)
                ADD_BROKS_STATS[stat_key] += time.time() - dispatch_started_at

            ADD_BROKS_STATS['total_time'] += time.time() - satellite_lock_asked_at
    
    
    def add_Notification(self, notif):  # noqa => upper for lisibility, deal with it
        # type: (Notification) -> None
        """Register a notification in self.actions and, when it has a contact, queue its initial status brok."""
        self.actions[notif.get_uuid()] = notif
        # Only real notification launches are counted, not the master ones
        if not notif.is_master_notification():
            self.raw_nb_notification_to_send += 1
        # A notification without contact does not produce a brok
        if notif.contact is None:
            return
        self.add(notif.get_initial_status_brok())
    
    
    # 'nb_add':0, 'total_time':0.0, 'lock_time':0.0, 'set_time':0.0,'dispatch_time':0.0, 'hash_time':0.0, 'brok_time':0.0
    def add_Check(self, check):  # noqa => upper for lisibility, deal with it
        """Append the check to the checks created this turn; _stack_all_checks later stacks them."""
        pending_lock = self._checks_created_this_turn_lock
        with pending_lock:
            self._checks_created_this_turn.append(check)
    
    
    def _stack_all_checks(self):
        """Stack every check created since the last call, under a single acquisition of checks_n_actions_lock."""
        # Swap the pending container for a fresh one so add_Check can keep appending while we work
        with self._checks_created_this_turn_lock:
            pending_checks, self._checks_created_this_turn = self._checks_created_this_turn, deque()

        # One lock acquisition for the whole batch instead of one per check
        lock_asked_at = time.time()
        with self.checks_n_actions_lock:
            ADD_CHECKS_STATS['lock_time'] += time.time() - lock_asked_at
            for pending_check in pending_checks:
                self._stack_check(pending_check)
        ADD_CHECKS_STATS['total_time'] += time.time() - lock_asked_at
    
    
    def _stack_check(self, check):
        """Store one check in self.checks, index it for execution and update the 'to send' counters.

        A previous check stored under the same uuid is removed from the execution index first.
        The duration of each step is accumulated in ADD_CHECKS_STATS.
        """
        ADD_CHECKS_STATS['nb_add'] += 1

        set_started_at = time.time()
        check_uuid = check.get_uuid()
        previous_check = self.checks.get(check_uuid, None)
        if previous_check:
            # Be sure the replaced check is no longer indexed
            checks_container.delete_job_from_index_seconds(previous_check)
        self.checks[check_uuid] = check
        ADD_CHECKS_STATS['set_time'] += time.time() - set_started_at

        dispatch_started_at = time.time()
        checks_container.do_index_job_execution(check)
        ADD_CHECKS_STATS['dispatch_time'] += time.time() - dispatch_started_at

        hash_started_at = time.time()
        self.raw_nb_checks_to_send += 1
        action_hash = check.get_hash()
        if action_hash:
            # Add the known cpu time of this command, taken from the default executor stats
            default_executor_stats = self._exec_stat[_DEFAULT_EXECUTOR]
            if action_hash in default_executor_stats:
                self.raw_cpu_time_checks_to_send += default_executor_stats[action_hash]['action_cpu_time']
        ADD_CHECKS_STATS['hash_time'] += time.time() - hash_started_at
    
    
    def add_EventHandler(self, action):  # noqa => upper for lisibility, deal with it
        """Register an event handler in self.actions and count it as an event handler to send."""
        handler_uuid = action.get_uuid()
        self.actions[handler_uuid] = action
        self.raw_nb_event_handler_to_send += 1
        logger.debug('[EVENTHANDLER] registering in the scheduler queues the event handler number %s (on the object %s)' % (handler_uuid, action.ref.get_full_name()))
    
    
    def add_Downtime(self, dt):  # noqa => upper for lisibility, deal with it
        """Register the downtime in self.downtimes under its id."""
        self.downtimes[dt.id] = dt
    
    
    def add_ContactDowntime(self, contact_dt):  # noqa => upper for lisibility, deal with it
        """Register the contact downtime in self.contact_downtimes under its id."""
        self.contact_downtimes[contact_dt.id] = contact_dt
    
    
    def add_Comment(self, comment):  # noqa => upper for lisibility, deal with it
        """Do nothing: comments are accepted by add() but deliberately ignored."""
        return
    
    
    # Ok one of our modules send us a command? just run it!
    def add_ExternalCommand(self, ext_cmd):  # noqa => upper for lisibility, deal with it
        """Resolve immediately an external command object (for instance one sent by a module)."""
        self.external_command.resolve_command(ext_cmd)
    
    
    # Schedulers have some queues. We can simplify call by adding
    # elements into the proper queue just by looking at their type
    # Brok -> self.broks
    # Check -> self.checks
    # Notification -> self.actions
    # Downtime -> self.downtimes
    # ContactDowntime -> self.contact_downtimes
    def add(self, elt):
        """Dispatch elt to the add_* method registered for its exact class.

        The lookup uses elt.__class__ as key of the class-level dispatch table, so an element whose class is not
        registered there is silently ignored.
        """
        handler = self.__add_actions.get(elt.__class__)
        if not handler:
            return
        # The table stores plain functions, so self is passed explicitly
        handler(self, elt)  # noqa => typing, you are wrong
    
    
    # Dispatch table used by add(): maps the exact class of an element to the add_* function that handles it.
    # It is built in the class body, after the add_* definitions, so the values are plain functions
    # (add() calls them with self as first argument).
    # The double leading underscore makes the attribute name-mangled, so it is only reachable as
    # self.__add_actions from inside this class.
    __add_actions = {
        Check          : add_Check,
        Brok           : add_Brok,
        Notification   : add_Notification,
        EventHandler   : add_EventHandler,
        Downtime       : add_Downtime,
        ContactDowntime: add_ContactDowntime,
        Comment        : add_Comment,  # Disabled: add_Comment does nothing
        ExternalCommand: add_ExternalCommand,
    }
    
    
    # We call the function of modules that got the
    # hook function
    # TODO: find a way to merge this and the version in daemon.py
    def hook_point(self, hook_name):
        """Call the method hook_<hook_name> of every alive module instance that defines it.

        :param hook_name: hook name without the 'hook_' prefix
        :return: the value returned by the last hook called (False when none was called). The loop stops and
                 returns as soon as one hook returns a falsy value.
        :raises ShinkenNoConfig: re-raised untouched when a hook raises it
        :raises RuntimeError: when a hook fails during the 'save_retention' hook

        Any other hook exception marks the module as crashed through the modules manager, then the next
        instances are still called. The duration of each hook call is always logged.
        """
        hook_method_name = 'hook_' + hook_name
        last_result = False
        for module_instance in self.sched_daemon.modules_manager.get_all_alive_instances():
            if not hasattr(module_instance, hook_method_name):
                continue
            hook = getattr(module_instance, hook_method_name)
            started_at = time.time()
            try:
                logger.debug("hook_point: %s %s" % (module_instance.get_name(), hook_name))
                last_result = hook(self)
                if not last_result:
                    return last_result
            except ShinkenNoConfig:
                raise
            except Exception as exp:
                crash_reason = 'The instance %s raised an error: %s. I disable it and set it to restart it later' % (module_instance.get_name(), str(exp))
                self.sched_daemon.modules_manager.did_crash(module_instance, reason=crash_reason)
                if hook_name == 'save_retention':
                    raise RuntimeError(str(exp))
            finally:
                logger.log_perf(started_at, module_instance.get_name(), f'hook {hook_name} duration', min_time=0)
        return last_result
    
    
    # For tuning purpose we use caches, but we do not want them to explode
    # So we clean them
    def clean_caches(self):
        """Ask every timeperiod of self.timeperiods to clean its cache."""
        for timeperiod in self.timeperiods:
            timeperiod.clean_cache()
    
    
    # Ask item (host or service) an update_status
    # and add it to our broks queue
    def get_and_register_status_brok(self, item):
        """Ask item for its update status brok and give it to add()."""
        self.add(item.get_update_status_brok())
    
    
    # We do not want this downtime id
    def del_downtime(self, dt_id):
        """Remove the downtime dt_id from its referenced item and from self.downtimes; unknown ids are ignored."""
        if dt_id not in self.downtimes:
            return
        downtime = self.downtimes[dt_id]
        # Detach from the item first, the registry entry is only dropped once that succeeded
        downtime.ref.del_downtime(dt_id)
        del self.downtimes[dt_id]
    
    
    # We do not want this downtime id
    def del_contact_downtime(self, dt_id):
        """Remove the contact downtime dt_id from its referenced object and from self.contact_downtimes; unknown ids are ignored."""
        if dt_id not in self.contact_downtimes:
            return
        contact_downtime = self.contact_downtimes[dt_id]
        # Detach from the referenced object first, the registry entry is only dropped once that succeeded
        contact_downtime.ref.del_downtime(dt_id)
        del self.contact_downtimes[dt_id]
    
    
    # We are looking for outdated acknowledgements, and if so, remove them
    def check_for_expire_acknowledge(self):
        """Ask every host, then every service, to check whether its acknowledgement expired."""
        for item in itertools.chain(self.hosts, self.services):
            item.check_for_expire_acknowledge()
    
    
    # We update all business_impact to look at new modulation
    # start for impacts, and so update broks status and
    # problems value too
    # * allow_status_broks : if we are in the first call, we do not want status_broks, as initial_brok will do the job
    def update_business_values(self, allow_status_broks=True):
        """Recompute the business_impact of every host and service, non-problems first, problems afterwards.

        :param allow_status_broks: when True, a status brok is registered for each element whose business_impact
                                   changed; pass False on the first call, the initial broks will do the job.

        Problems are handled in a second pass because their value depends on their impacts.
        """
        for problems_pass in (False, True):
            for collection in (self.hosts, self.services):
                # Select on is_problem before touching any element of this collection
                selected_items = [item for item in collection if bool(item.is_problem) == problems_pass]
                for item in selected_items:
                    previous_impact = item.business_impact
                    item.update_business_impact_value()
                    current_impact = item.business_impact
                    # The business_impact changed: publish the new status
                    if allow_status_broks and current_impact != previous_impact:
                        self.get_and_register_status_brok(item)
    
    
    # Each second we search for master notification that can be scattered, and we do the job
    # we take the sons, and we put them into our actions queue
    def scatter_master_notifications(self):
        """Scatter every due master notification into child notifications, then reschedule or retire the master.

        A master notification is a notification without contact. When it is 'scheduled' and its t_to_go is reached:
        * if the item does not block this notification type and the time is allowed, the item creates the
          children, which are set to 'scheduled' and given to add();
        * a 'PROBLEM' master of an item with a non-zero notification_interval is rescheduled at the next
          notification time computed by the item;
        * any other master is removed from the item's in-progress notifications and set to 'zombie'.
        """
        now = time.time()
        # Iterate a snapshot of the actions: add() is called while we loop
        for master in list(self.actions.values()):  # type: Notification
            # Only notifications are of interest here
            if master.is_a != ACTION_TYPES.NOTIFICATION:
                continue
            # A master has no contact, so it never sends itself; it only spawns children once due
            is_due_master = master.status == 'scheduled' and master.t_to_go <= now and not master.contact
            if not is_due_master:
                continue

            item = master.ref  # type: SchedulingItem
            children = []
            if not item.notification_is_blocked_by_item(master.type) and item.is_notification_time_allowed(now):
                # One child notification per contact of the item
                children = item.scatter_notification(master)
                for child in children:
                    child.status = 'scheduled'
                    self.add(child)  # this will send a brok

            rescheduled = False
            if master.type == 'PROBLEM':
                if len(children) != 0:
                    # notif_nb of the master was already current_notification_number + 1:
                    # as notifications were created, the item counter follows
                    item.current_notification_number = master.notif_nb

                if item.notification_interval != 0 and master.t_to_go is not None:
                    # Notifications must continue: keep the master in the actions and let the item compute the
                    # next time (it can come earlier than the interval when an escalation must raise up)
                    master.previous_t_to_go = master.t_to_go
                    master.t_to_go = item.get_next_notification_time(master)
                    master.notif_nb = item.current_notification_number + 1
                    master.status = 'scheduled'
                    rescheduled = True

            if not rescheduled:
                # One problem notification is enough, and recover/downtime/flap/etc. are never repeated
                item.remove_in_progress_notification(master)
                self.actions[master.get_uuid()].status = 'zombie'
    
    
    # Called by executor (poller/reactionner) to get checks
    # Can get checks and actions (notifications and co)
    def get_to_run_checks(self, do_checks=False, do_actions=False, poller_tags=None, reactionner_tags=None, worker_name='none', module_types=None, request_limit=-1, request_limit_cpu=-1, protect_from_rogues=True):
        """Entry point used by an executor to get checks and/or actions to run.

        Returns an empty list when the scheduler is not activated or not ready. Otherwise the missing filters get
        their default (['None'] for the tags, ['fork'] for module_types) and the work is delegated to
        _get_to_run_checks while holding checks_n_actions_lock.
        """
        # A sleeping daemon gives nothing
        if not self.is_activated():
            logger.info(f'The daemon {worker_name} asked us checks/notifications, but we are in a sleeping state, giving nothing.')
            return []
        if not self.scheduler_is_ready():
            logger.debug(f'The daemon {worker_name} asked us checks/notifications, but we are not ready yet, giving nothing.')
            return []

        module_types = ['fork'] if module_types is None else module_types
        reactionner_tags = ['None'] if reactionner_tags is None else reactionner_tags
        poller_tags = ['None'] if poller_tags is None else poller_tags

        lock_asked_at = time.time()
        with self.checks_n_actions_lock:
            lock_wait_time = time.time() - lock_asked_at
            logger.debug('get_to_run_checks acquire lock time [%.3f] for worker [%s]' % (lock_wait_time, worker_name))
            forwarded_arguments = {
                'do_checks'          : do_checks,
                'do_actions'         : do_actions,
                'poller_tags'        : poller_tags,
                'reactionner_tags'   : reactionner_tags,
                'worker_name'        : worker_name,
                'worker_types'       : module_types,
                'request_limit'      : request_limit,
                'request_limit_cpu'  : request_limit_cpu,
                'protect_from_rogues': protect_from_rogues,
            }
            return self._get_to_run_checks(**forwarded_arguments)
    
    
    # We can have bugs in the check indexing: some checks can stay in self.check_to_launch and never
    # be deleted (bug fix soon, 20 Jan 2021). Whatever the cause, if it happens again we must detect and correct it
    # if possible. So we look at all past times and delete the zombie checks we find.
    # NOTE: as this looks up a lot of checks, it should not run every second
    @staticmethod
    def cleanup_old_forgotten_zombies():
        """Delegate the cleanup of old forgotten zombie checks to checks_container, under the scheduling chapter."""
        checks_container.cleanup_old_forgotten_zombies(SCHEDULING_CHAPTER)
    
    
    # We are printing check indexing stats, so we can check if this computation is huge or not
    @staticmethod
    def _print_action_indexing_stats():
        """Ask checks_container to log its indexing stats, under the scheduling chapter."""
        checks_container.print_log_stats(SCHEDULING_CHAPTER)
    
    
    # Remove the rogue satellites that doesn't send any check since n sec
    def cleanup_rogue_satellite(self, timeout=60):
        """Forget the rogue satellites whose last recorded time is older than timeout seconds.

        :param timeout: maximum age, in seconds, of a rogue satellite entry
        """
        with self.rogue_satellites_lock:
            for satellite_type, last_seen_by_name in self.rogue_satellites.items():
                # Collect first, delete afterwards: the dict cannot shrink while it is iterated
                expired_names = {name for name, last_check_time in last_seen_by_name.items() if time.time() - last_check_time > timeout}
                if not expired_names:
                    continue
                logger.debug("[scheduler][%s] cleanup rogue [%s] named [%s]" % (self.instance_id, satellite_type, expired_names))
                for expired_name in expired_names:
                    del last_seen_by_name[expired_name]
    
    
    # Store satellites that shouldn't talk to us as rogue satellites
    def _add_rogue_satellite(self, satellite_type, satellite_name, now=None):
        """Record that a satellite which should not talk to us just did.

        :param satellite_type: executor type, used as first level key of self.rogue_satellites
        :param satellite_name: name of the satellite
        :param now: timestamp to record, time.time() when None

        Nothing is recorded while a new configuration is loading or when the daemon has no configuration.
        The first occurrence is stored silently; a warning is logged for the following ones.
        """
        # While a configuration is loading, self.pollers/reactionners are not clean yet and the satellite may
        # legitimately still talk to us: a real rogue will be caught once the load is finished
        if self.new_configuration_load_in_progress:
            return

        # Maybe we do not have any conf at all (the arbiter reset us)
        if self.sched_daemon.cur_conf is None:
            logger.debug("[scheduler][%s] the %s named [%s] send a check/notification but we have no conf" % (self.instance_id, satellite_type, satellite_name))
            return

        with self.rogue_satellites_lock:
            if now is None:
                now = time.time()
            seen_for_type = self.rogue_satellites.setdefault(satellite_type, {})
            if satellite_name not in seen_for_type:
                seen_for_type[satellite_name] = now
                return
            if seen_for_type[satellite_name] > now:
                # The stored date is in the future: the server date/time was probably changed,
                # so the whole rogue cache is cleared by security
                self.rogue_satellites = {}
                return
            seen_for_type[satellite_name] = now
            logger.warning("[scheduler][%s] the %s named [%s] send a check/notification but shouldn't" % (self.instance_id, satellite_type, satellite_name))
    
    
    # Called by executor to get action to run.
    def _get_to_run_checks(self, do_checks=False, do_actions=False, poller_tags=None, reactionner_tags=None, worker_name='none', worker_types=None, request_limit=-1, request_limit_cpu=-1, protect_from_rogues=True):
        """Select the checks and/or actions to give to one executor and mark them as given.

        :param do_checks: when True the caller is treated as a 'Poller' and past due checks are selected
        :param do_actions: when True the launchable actions of self.actions are selected
        :param poller_tags: accepted poller tags for the checks, ['None'] when None
        :param reactionner_tags: accepted reactionner tags for the actions, ['None'] when None
        :param worker_name: name of the executor, used as executor id
        :param worker_types: accepted module types, ['fork'] when None
        :param request_limit: limit on the number of elements given, see _test_limit_for_giving_action_to_executor
        :param request_limit_cpu: limit on the estimated cpu time given, see _test_limit_for_giving_action_to_executor
        :param protect_from_rogues: when True, an executor whose name is not in the known pollers/reactionners
                                    names is recorded as rogue and gets an empty tuple
        :return: list of the shell copies (copy_shell) of the selected checks/actions

        Each selected element is set to the 'inpoller' status and tagged with the executor id. The per-call
        counters nb_action_give_to_executor / cpu_time_action_give_to_executor are reset at the beginning, and the
        'raw_nb_checks_send' / 'raw_cpu_time_checks_send' stats of the executor are updated at the end.
        """
        if worker_types is None:
            worker_types = ['fork']
        if reactionner_tags is None:
            reactionner_tags = ['None']
        if poller_tags is None:
            poller_tags = ['None']
        start = time.time()
        in_check_time = 0.0
        action_to_give_to_executor = []
        # Per-call counters, incremented by _test_limit_for_giving_action_to_executor
        self.nb_action_give_to_executor = 0
        self.cpu_time_action_give_to_executor = 0
        now = time.time()
        # The executor type is only deduced from do_checks
        executor_type = 'Poller' if do_checks else 'Reactionner'
        tag = poller_tags if do_checks else reactionner_tags
        executor_id = worker_name
        if protect_from_rogues:
            # Reject the request of an unknown executor
            if executor_type == 'Poller' and worker_name not in self.pollers_name:
                self._add_rogue_satellite(executor_type, worker_name, now)
                return ()
            elif executor_type == 'Reactionner' and worker_name not in self.reactionners_name:
                self._add_rogue_satellite(executor_type, worker_name, now)
                return ()
        
        # If poller want to do checks
        if do_checks:
            # If the command is untagged, and the poller too, or if both are tagged  with same name, go for it.
            # If "do_check" is True, call for poller, and poller_tags by default is ['None']
            # by default poller_tag is 'None' and poller_tags is ['None']
            # and same for module_type, the default is the 'fork' type
            in_ask_limit = False
            
            past_second_entries = checks_container.get_sorted_past_job_execution_index_seconds()
            
            for t_to_go in past_second_entries:
                checks_at_t = checks_container.get_jobs_at_second(t_to_go)  # self.check_to_launch[t_to_go]
                if in_ask_limit:
                    break
                # Checks given during this second, removed from the index once the second is processed
                _checks_to_del = deque()
                for check in checks_at_t:
                    if check.status == 'scheduled' and not check.internal and check.poller_tag in poller_tags and check.module_type in worker_types:
                        c_time = time.time()
                        check.status = 'inpoller'
                        check.executor_id = executor_id
                        # We make a minimal copy with info for exec the check.
                        action_to_give_to_executor.append(check.copy_shell())
                        in_ask_limit = self._test_limit_for_giving_action_to_executor(check, executor_id, request_limit, request_limit_cpu)
                        in_check_time += (time.time() - c_time)
                        _checks_to_del.append(check)
                        if in_ask_limit:
                            break
                # We can clean them all from index
                checks_container.clean_jobs_from_index_seconds(_checks_to_del, t_to_go)
        
        # If reactionner want to notify too
        if do_actions:
            # Iterate a snapshot of the actions
            for action in list(self.actions.values()):
                is_master = (action.is_a == ACTION_TYPES.NOTIFICATION and not action.contact)
                
                # Master notifications should not be launched
                if is_master:
                    continue
                
                # if "do_action", call the reactionner, and reactionner_tags by default is ['None']
                # by default reactionner_tag is 'None' and reactionner_tags is ['None'] too
                # So if not the good one, loop for next :)
                if action.reactionner_tag not in reactionner_tags:
                    continue
                
                # same for module_type
                if action.module_type not in worker_types:
                    continue
                
                if action.is_a == ACTION_TYPES.EVENTHANDLER:
                    logger.debug('[EVENTHANDLER] looking at event handler %s (on object %s) state %s to know if we should give it to a reactionner' % (action.get_uuid(), action.ref.get_full_name(), action.status))
                
                # And now look for can launch or not :)
                if action.status == 'scheduled' and action.is_launchable(now):
                    if action.is_a == ACTION_TYPES.EVENTHANDLER:
                        logger.debug('[EVENTHANDLER] we can give the event handler %s (on object %s) to the reactionner' % (action.get_uuid(), action.ref.get_full_name()))
                    action.status = 'inpoller'
                    action.executor_id = executor_id
                    # This is for child notifications and eventhandlers
                    new_a = action.copy_shell()
                    
                    action_to_give_to_executor.append(new_a)
                    # Look if we have too much request to send
                    in_ask_limit = self._test_limit_for_giving_action_to_executor(action, executor_id, request_limit, request_limit_cpu)
                    
                    if in_ask_limit:
                        break
        
        # Only log when something was really given
        if self.nb_action_give_to_executor > 0:
            qlimit = '%s' % request_limit if request_limit != -1 else 'unlimited'
            logger.info(
                '%s %s [%s-%s][tag:%s][worker_type:%s] Querying with limit of [%s] objects and [%.3f]s execution time and we give it [%d] objects for a total of [%.3f]s execution time' %
                (CHAPTER_CHECKS_AND_NOTIF, SECTION_GET, executor_type, executor_id, ','.join(tag), ','.join(worker_types), qlimit, request_limit_cpu, self.nb_action_give_to_executor, self.cpu_time_action_give_to_executor)
            )
        # Accumulate the 'send' stats of this executor, creating the counters on its first request
        if 'raw_nb_checks_send' in self.stat_by_executor.get(executor_id, {}):
            stat_by_executor = self.stat_by_executor[executor_id]
            stat_by_executor['raw_nb_checks_send'] += len(action_to_give_to_executor)
            stat_by_executor['raw_cpu_time_checks_send'] += self.cpu_time_action_give_to_executor
        else:
            stat_by_executor = self.stat_by_executor.get(executor_id, {})
            self.stat_by_executor[executor_id] = stat_by_executor
            stat_by_executor['raw_nb_checks_send'] = len(action_to_give_to_executor)
            stat_by_executor['raw_cpu_time_checks_send'] = self.cpu_time_action_give_to_executor
        
        logger.log_perf(start, 'get_check', 'OVERALL-TIME:[%.3f] in_action_time:[%.3f] for [%s] actions' % (time.time() - start, in_check_time, len(action_to_give_to_executor)))
        return action_to_give_to_executor
    
    
    def _test_limit_for_giving_action_to_executor(self, check, executor_id, request_limit, request_limit_cpu):
        """Account for one more element given to the executor and tell whether a request limit is reached.

        :param check: the check or action just given
        :param executor_id: id of the executor, used to look up its execution stats
        :param request_limit: maximum number of elements, -1 or 0 meaning no limit
        :param request_limit_cpu: maximum estimated cpu time, -1 meaning no limit
        :return: True when the number of elements reached request_limit or the estimated cpu time exceeds
                 request_limit_cpu, False otherwise

        The cpu time of an element with a command is taken from the stats of this executor, else from the default
        executor stats, else (fork based elements only) from _DEFAULT_ACTION_CPU_TIME.
        """
        self.nb_action_give_to_executor += 1
        if getattr(check, 'command', None):
            action_hash = check.get_hash()
            executor_stats = self._exec_stat.get(executor_id, {})
            default_executor_stats = self._exec_stat[_DEFAULT_EXECUTOR]
            if action_hash in executor_stats:
                self.cpu_time_action_give_to_executor += executor_stats[action_hash]['action_cpu_time']
                self._nb_checks_with_stat_send += 1
            elif action_hash in default_executor_stats:
                self.cpu_time_action_give_to_executor += default_executor_stats[action_hash]['action_cpu_time']
                logger.debug('[scheduler][%s] Give action with _DEFAULT_EXECUTOR time. [%s]' % (self.instance_id, check.command_name))
            elif check.module_type == 'fork':
                # Only fork based checks are limited when no stat is known
                self.cpu_time_action_give_to_executor += _DEFAULT_ACTION_CPU_TIME
                logger.debug('[scheduler][%s] Give action with _DEFAULT_ACTION_CPU_TIME time. [%s]' % (self.instance_id, check.command_name))

        count_limit_reached = request_limit not in (-1, 0) and self.nb_action_give_to_executor >= request_limit
        cpu_limit_reached = request_limit_cpu != -1 and self.cpu_time_action_give_to_executor > request_limit_cpu
        return count_limit_reached or cpu_limit_reached
    
    
    def _update_check_causes_stats(self, cause):
        """Increment the counter of received checks matching the given cause.

        :param cause: one of the CHECK_CAUSE values; any other value only logs a warning
        """
        counter_name_by_cause = (
            (CHECK_CAUSE.SCHEDULE, 'raw_nb_checks_received_schedule'),
            (CHECK_CAUSE.FORCE, 'raw_nb_checks_received_force'),
            (CHECK_CAUSE.RETRY, 'raw_nb_checks_received_retry'),
            (CHECK_CAUSE.DEPENDENCY, 'raw_nb_checks_received_dependency'),
        )
        for known_cause, counter_name in counter_name_by_cause:
            if cause == known_cause:
                setattr(self, counter_name, getattr(self, counter_name) + 1)
                return
        logger.warning('check return with unknown cause : [%s]' % cause)
    
    
    # Called by poller and reactionner to send result
    def put_results(self, action):
        """Integrate a result sent back by an executor (poller or reactionner).

        Depending on action.is_a:
        * NOTIFICATION: the result is merged into the original notification found in
          self.actions, the notification is removed from the in-progress ones of its
          element and a status brok is registered. Timeouts and non zero exit codes are logged.
        * CHECK: the result is merged into the original check found in self.checks, which
          is set to WAITCONSUME and removed from the time index if it is still indexed.
        * EVENTHANDLER: the element consumes the event result and the original event
          handler is set as 'zombie'.

        Results of an unknown type, of an unknown notification or of an unknown event
        handler are dropped. In every other case the execution statistics are updated
        with the received action.

        :param action: the result object received from the executor
        """
        if action.is_a == ACTION_TYPES.NOTIFICATION:
            # We will only see child notifications here
            try:
                # Add protection for strange charset
                if isinstance(action.output, bytes):
                    action.output = action.output.decode('utf8', 'ignore')
                # Add protection for strange charset
                if isinstance(action.long_output, bytes):
                    action.long_output = action.long_output.decode('utf8', 'ignore')
                
                original_notification = self.actions.get(action.get_uuid(), None)
                if original_notification is None:
                    if MONITORING_CHECK_CONSUME_DEBUG_FLAG:
                        logger.info('[NOTIFICATION] We received a notification return (uuid=%s) for a unknown notification. this can be because this host/check was disabled.' % action.get_uuid())
                    return
                original_notification.get_return_from(action)
                item = original_notification.ref
                item.remove_in_progress_notification(original_notification)
                
                # And we ask the item to update its state
                self.get_and_register_status_brok(item)
                
                # If we have got a problem with the notification, raise a Warning log
                if action.status == 'timeout':
                    logger.warning('Contact %s %s notification command "%s" timed out after %d seconds on the reactionner %s' %
                                   (original_notification.contact.contact_name,
                                    item.__class__.my_type,
                                    original_notification.command_name,
                                    int(action.execution_time),
                                    action.executor_id,
                                    ))
                elif action.exit_status != 0:
                    logger.warning('The notification command "%s" raised an error (exit code=%d on the reactionner=%s): "%s %s"' % (action.command_name, action.exit_status, action.executor_id, action.output, action.long_output))
            except AttributeError as exp:  # bad object, drop it
                logger.warning('put_results:: get bad notification : %s ' % str(exp))
        
        elif action.is_a == ACTION_TYPES.CHECK:
            try:
                if action.status == 'timeout':
                    action.exit_status = self.conf.timeout_exit_status
                
                # Look up the original check only once; a KeyError means it is unknown
                cpu_time = action.get_cpu_time()
                destination_check = self.checks[action.get_uuid()]  # type: Check
                cpu_usage_threshold = destination_check.warning_threshold_cpu_usage
                if cpu_time > cpu_usage_threshold:
                    self.checks_warning_threshold_cpu_usage.append((action.command_name, cpu_time, cpu_usage_threshold, action.check_time))
                
                destination_check.get_return_from(action, self.conf.language)
                destination_check.status = CHECK_STATUS.WAITCONSUME
                # Maybe the return was from not a poller (like old scheduler run, or something else), so maybe
                # the check is still registered into the time index, if so, unregister it, but it is not a crime
                if destination_check.is_indexed_at_epoch():
                    if LOG_SCHEDULER_JOB_EXECUTION_FAST_INDEX:
                        logger.debug('%s The Check %s is having a saved return (or internal one), so must remove from index (cleaning)' % (JOB_EXECUTION_FAST_INDEX, destination_check.get_printable_name()))
                    checks_container.delete_job_from_index_seconds(destination_check)
            except KeyError:
                # Not a crime, the check is a new, and will be redone soon
                logger.debug('The check return %s was unable to find the check-execution it refer.' % action.get_printable_name())
        
        elif action.is_a == ACTION_TYPES.EVENTHANDLER:
            original_event_handler = self.actions.get(action.get_uuid(), None)  # type: EventHandler
            
            # Maybe we got a return of an old event handler, so we can forget it
            if original_event_handler is None:
                logger.warning('put_results:: get unknown event handler : %s ' % action.get_uuid())
                return
            
            # Add protection for strange charset as we can print it
            if isinstance(action.output, bytes):
                action.output = action.output.decode('utf8', 'ignore')
            
            # Add protection for strange charset as we can print it
            if isinstance(action.long_output, bytes):
                action.long_output = action.long_output.decode('utf8', 'ignore')
            
            item = original_event_handler.ref
            
            item.consume_event_result(original_event_handler)
            reference_name = item.get_full_name()
            # The message expects the reactionner first, then the state
            logger.debug('[EVENTHANDLER] the event handler %s (on element %s) just came back from the reactionner %s with state %s' % (original_event_handler.get_uuid(), reference_name, action.executor_id, action.status))
            
            # It just died
            if action.status == 'timeout':
                logger.warning('[EVENTHANDLER] the event handler %s on the element %s timed out after %d seconds on the reactionner %s' % (action.get_uuid(), reference_name, int(action.execution_time), action.executor_id))
            elif action.exit_status != 0:
                logger.warning(
                    '[EVENTHANDLER] The event handler %s on the element %s raised an error (exit code=%d on the reactionner:%s): "%s %s"' % (
                        action.get_uuid(), reference_name, action.exit_status, action.executor_id, action.output, action.long_output))
            else:
                logger.debug('[EVENTHANDLER] The event handler %s (on element %s) just came back without errors (exit code=%s on the reactionner:%s)' % (action.get_uuid(), reference_name, action.exit_status, action.executor_id))
            
            # Let's set it to be clean asap
            original_event_handler.status = 'zombie'
        
        else:
            logger.error('[scheduler][%s] The received result type in unknown! [%s]' % (self.instance_id, str(action.is_a)))
            return
        
        self._compute_exec_stat(action)
    
    
    def _compute_exec_stat(self, action):
        now = time.time()
        action_commande = getattr(action, 'command', None)
        action_cpu_time = getattr(action, 'average_cpu_time', 0)
        executor_id = getattr(action, 'executor_id', None)
        if action_cpu_time != 0 and action_commande is not None and executor_id is not None:
            action_hash = action.get_hash()
            executor_stat = {}
            if executor_id in self._exec_stat:
                executor_stat = self._exec_stat[executor_id]
            
            self._exec_stat[executor_id] = executor_stat
            
            action_stat = {}
            if action_hash in executor_stat:
                action_stat = executor_stat[action_hash]
            
            action_stat['action_cpu_time'] = action_cpu_time
            action_stat['last_update'] = now
            action_stat['saving_periode'] = action.get_saving_period()
            executor_stat[action_hash] = action_stat
            if action.is_a == "check":
                if hasattr(action, 'cause'):
                    # update stats counter about the cause of received checks
                    self._update_check_causes_stats(action.cause)
                else:
                    raise Exception('The poller [%s] give use a action without cause. You should update your pollers.' % executor_id)
            
            self._exec_stat[_DEFAULT_EXECUTOR][action_hash] = action_stat
            
            # logger.debug("[scheduler][%s] Stat on action[%s] [%.50s] with average[%s] from executor[%s]" % (self.instance_id, action.get_uuid(), action.command_name, action_cpu_time, executor_id))
    
    
    def _clean_exec_stat(self):
        # Remove old stat from the _exec_stat
        now = time.time()
        
        for executor_id in self._exec_stat:
            executor_stat = self._exec_stat[executor_id]
            to_remove = []
            for hash_commande in executor_stat:
                stat = executor_stat[hash_commande]
                if (now - stat['last_update']) / 60 > stat['saving_periode']:
                    to_remove.append(hash_commande)
            # logger.debug('[scheduler][%s][%s] Total stat size: %d' % (self.instance_id, executor_id, len(executor_stat)))
            if to_remove:
                logger.debug("[scheduler][%s] Clean [%d] stats for executor [%s]" % (self.instance_id, len(to_remove), executor_id))
            for hash_commande in to_remove:
                del executor_stat[hash_commande]
        
        to_del = []
        for entry in self.checks_warning_threshold_cpu_usage:
            if time.time() - entry[3] > _KEEP_CHECKS_WARNING_THRESHOLD_CPU_USAGE_TIME * 60:
                to_del.append(entry)
        
        for entry in to_del:
            self.checks_warning_threshold_cpu_usage.remove(entry)
    
    
    def _reinject_refused_actions_by_poller(self, daemon_name, daemon_type, actions):
        """Put back to 'run now' the actions a satellite refused to take.

        :param daemon_name: name of the satellite that refused the actions
        :param daemon_type: type of the satellite; 'poller' means the actions are checks
        :param actions: shells/copies of the refused actions, only used for their uuid
        """
        logger.info('The %s %s did refused our %d checks/notifications, reset them in our immediate sending queue so another poller can take it.' % (daemon_type, daemon_name, len(actions)))
        is_poller = daemon_type == 'poller'
        # The received objects are only copies: the real ones are in self.checks (poller) or self.actions (others)
        known_jobs = self.checks if is_poller else self.actions
        for refused_shell in actions:
            job = known_jobs[refused_shell.get_uuid()]
            logger.debug('[Re-inject] The action action for %s is set back to run now (for polller=%s)' % (job.ref.get_full_name(), is_poller))
            # Only the time to go is changed to now, the original time to go is kept
            # so the real latency can still be computed
            job.reset_schedule_now()
            if is_poller:
                # IMPORTANT: ONLY FOR POLLER! The check index must be updated too
                checks_container.do_index_job_execution(job)
    
    
    # We should push actions to our passives satellites
    def push_actions_to_passives_satellites(self, distant_link):
        """Send the actions to run to a passive satellite (poller or reactionner).

        The satellite is first asked for its CPU limit (no limit, -1, if it does not
        answer), then the matching actions are taken with get_to_run_checks and posted to
        the satellite. If the satellite answers 'false', the actions are re-injected so
        they can be given again. On a connection problem or if the satellite is not
        initialized, the connection is closed and dropped from the link.

        :param distant_link: dict describing the satellite, read keys are 'type', 'name',
                             'con' and '<type>_tags'; 'latency' and 'con' can be updated
        """
        daemon_type = distant_link['type']
        daemon_name = distant_link['name']
        # logger.debug("[scheduler][%s] I will send actions to the %s %s" % (self.instance_id, daemon_type, daemon_name))
        con = distant_link['con']
        if con is None:  # not ready
            return
        daemon_tags = distant_link['%s_tags' % daemon_type]
        try:
            start_time = time.time()
            request_limit_cpu = con.get('get_request_limit_cpu')
            distant_link['latency'] = time.time() - start_time
            request_limit_cpu = float(request_limit_cpu)
        except Exception as exp:
            request_limit_cpu = -1
            logger.warning("[scheduler][%s] The [%s:%s] don't give this cpu limit: [%s]" % (self.instance_id, daemon_type, daemon_name, str(exp)))
        
        if daemon_type == 'reactionner':
            lst = self.get_to_run_checks(False, True, reactionner_tags=daemon_tags, worker_name=daemon_name, module_types=['fork'], request_limit_cpu=request_limit_cpu, protect_from_rogues=False)
        else:
            lst = self.get_to_run_checks(True, False, poller_tags=daemon_tags, worker_name=daemon_name, module_types=['fork'], request_limit_cpu=request_limit_cpu, protect_from_rogues=False)
        
        connection_is_broken = False
        try:
            if len(lst) == 0:
                return
            logger.debug("Sending [%s] actions to [%s]" % (len(lst), daemon_name))
            res = con.post('push_actions', {'actions': lst, 'sched_id': self.instance_id})
            if res == 'false':  # oups, the poller was not ready
                self._reinject_refused_actions_by_poller(daemon_name, daemon_type, lst)
            elif res != 'true':
                logger.debug('[push actions] The check/notification push to %s returns a unknown value: %s type=%s' % (daemon_name, res, type(res)))
        except HTTPExceptions as exp:
            logger.warning("[scheduler][%s] Connection problem to the [%s:%s]: [%s]" % (self.instance_id, daemon_type, daemon_name, str(exp)))
            connection_is_broken = True
        except KeyError as exp:
            logger.warning("[scheduler][%s] The [%s:%s] is not initialized: [%s]" % (self.instance_id, daemon_type, daemon_name, str(exp)))
            connection_is_broken = True
        
        if connection_is_broken:
            # Closing is only a best effort, but KeyboardInterrupt/SystemExit must not be swallowed
            try:
                distant_link['con'].con.close()
            except Exception as exp:
                logger.debug("[scheduler][%s] Cannot close the connection to the [%s:%s]: [%s]" % (self.instance_id, daemon_type, daemon_name, str(exp)))
            distant_link['con'] = None
    
    
    # We should get returns from satellites
    def get_actions_from_passives_satellites(self, distant_link):
        """Fetch the results a passive satellite has for us and queue them.

        The answer of the satellite is base64 decoded, decompressed with zlib and loaded
        with SafeUnpickler, then appended to self.waiting_results under
        self.waiting_results_lock. On a connection problem or if the satellite is not
        initialized, the connection is closed and dropped from the link.

        :param distant_link: dict describing the satellite, read keys are 'type', 'name'
                             and 'con'; 'con' is set to None when the connection is broken
        """
        daemon_type = distant_link['type']
        daemon_name = distant_link['name']
        con = distant_link['con']
        # logger.debug("[scheduler][%s] I will get actions from the %s %s" % (self.instance_id, daemon_type, daemon_name))
        if con is None:  # not ready
            return
        
        connection_is_broken = False
        try:
            results = con.get('get_returns', {'sched_id': self.instance_id}, wait='long')
            results = base64.b64decode(results)
            results = zlib.decompress(results)
            results = SafeUnpickler.loads(results, 'Checks/notifications/event handlers get from passive %s' % daemon_name)
            
            nb_received = len(results)
            if nb_received > 0:
                logger.debug("GET [%d] passive results from [%s]" % (nb_received, daemon_name))
            with self.waiting_results_lock:
                self.waiting_results.extend(results)
        except HTTPExceptions as exp:
            logger.warning("Connection problem to the %s %s: %s" % (daemon_type, daemon_name, str(exp)))
            connection_is_broken = True
        except KeyError as exp:
            logger.warning("The %s '%s' is not initialized: %s" % (daemon_type, daemon_name, str(exp)))
            connection_is_broken = True
        
        if connection_is_broken:
            # Closing is only a best effort, but KeyboardInterrupt/SystemExit must not be swallowed
            try:
                distant_link['con'].con.close()
            except Exception as exp:
                logger.debug("Cannot close the connection to the %s %s: %s" % (daemon_type, daemon_name, str(exp)))
            distant_link['con'] = None
    
    
    # Some checks are purely internal, like business based one
    # simply ask their ref to manage it when it's ok to run
    def manage_internal_checks(self):
        """Run the checks that are managed by the scheduler itself.

        Under self.checks_n_actions_lock:
        * every cluster flagged as needing a state recomputation is scheduled with a
          forced check at the current time, and its new actions are added to the scheduler;
        * for every past second of the job index, the internal checks with the
          'scheduled' status are given to their element and removed from the index.
        """
        now = time.time()
        
        with self.checks_n_actions_lock:
            # Dirty clusters first: they need a new check to recompute their state
            clusters_to_refresh = proxyitemsgraph.get_and_reset_clusters_to_recompute_state()
            if len(clusters_to_refresh) > 0:
                logger.debug('Clusters to refresh: %s' % clusters_to_refresh)
            for cluster_uuid in clusters_to_refresh:
                cluster = self.clusters.get(cluster_uuid, None)
                if cluster is None:
                    continue
                
                cluster.schedule(force=True, force_time=now)
                if len(cluster.checks_in_progress) == 0:
                    logger.error('Cannot schedule the cluster %s and force a immediate check' % cluster.get_full_name())
                    continue
                # Stack the new actions as it is done for all checks (same code as get_new_actions)
                for pending_action in cluster.actions:
                    self.add(pending_action)
                # Everything was taken, the list can be reset
                cluster.actions = []
            
            for second in checks_container.get_sorted_past_job_execution_index_seconds():
                managed_checks = []
                for check in checks_container.get_jobs_at_second(second):
                    if check.status != 'scheduled' or not check.internal:
                        continue
                    check.ref.manage_internal_check(check, lang=self.conf.language)
                    managed_checks.append(check)
                # The managed checks have nothing more to do in the index
                checks_container.clean_jobs_from_index_seconds(managed_checks, second)
    
    
    # We look for possible automatic ack creation if ALL our sources are in acknowledge too.
    # * CREATION: all our source problems are ack, we are going ack
    # * REMOVING: we remove our ack if one of our source pb is not ack,
    #             BUT only if the ack was an automatic one
    def compute_automatic_acknowledge(self):
        """Ask every host to compute its automatic acknowledge."""
        for host in self.hosts:
            host.compute_automatic_acknowledge()
    
    
    # We look for possible automatic flapping creation if ALL our sources are flapping too
    # * CREATION : all of our source are flapping, but we are not in flapping
    # * REMOVING: we remove if we are in flapping
    def compute_automatic_flapping(self):
        """Ask every host to compute its automatic flapping."""
        for host in self.hosts:
            host.compute_automatic_flapping()
    
    
    # We look for clusters that need to update their own root's problems computation from proxy items
    def compute_cluster_root_problems(self):
        """Recompute the root problems of the clusters flagged for it in the proxy items graph.

        For each flagged cluster, the new root problems are built from its fathers: a
        father in a bad state without root problems is a root problem itself, otherwise
        its own root problems are taken. When the result differs from the current one,
        the proxy item is updated, an update status brok is added to the cluster and the
        clusters relying on it are triggered to recompute their own.
        """
        for cluster_uuid in proxyitemsgraph.get_and_reset_clusters_to_recompute_root_problems():
            cluster = self.clusters.get(cluster_uuid, None)
            if cluster is None:
                logger.debug('Cannot find dep cluster %s for root problems' % cluster_uuid)
                continue
            
            previous_root_problems = proxyitemsmgr[cluster_uuid].root_problems
            computed_root_problems = set()
            
            for father_uuid in proxyitemsgraph.son_to_fathers.get(cluster_uuid, []):
                father = proxyitemsmgr[father_uuid]
                if father.state != 0 and len(father.root_problems) == 0:
                    # Bad state and no root problems: this father is a root problem itself
                    computed_root_problems.add(father.uuid)
                else:
                    # Take its root problems, if there are some
                    computed_root_problems.update(father.root_problems)
            
            if computed_root_problems != previous_root_problems:
                proxyitemsmgr.update_root_problems(cluster_uuid, computed_root_problems)
                # A brok is raised so the broker will know about the new root problems
                cluster.broks.append(cluster.get_update_status_brok())
                # The clusters that rely on this one must recompute their own root problems
                proxyitemsgraph.trigger_my_clusters_root_problem(cluster_uuid)
    
    
    # Called by the brokers to fetch their broks.
    # Returns all the pending broks if size_hint=0.
    # We give them, and clean them from the broker queue!
    def get_broks(self, broker_name, size_hint=0):
        # type: (str, int) -> List[Brok]
        """Give the pending broks of a broker and remove them from its queue.

        :param broker_name: name of the broker asking for its broks
        :param size_hint: 0 to get every pending brok, otherwise broks are taken until
                          their estimated size reaches this value
        :return: the list of the broks taken from the queue of the broker
        """
        if size_hint == 0:
            every_brok = self.brokers[broker_name]['broks']
            self._clear_broks_of_broker(broker_name)
            return list(every_brok)
        
        taken_broks = []
        broker_queue = self.brokers[broker_name]['broks']
        estimated_size = 0.0
        while broker_queue and estimated_size < size_hint:
            brok = broker_queue.popleft()
            raw_data = brok.data if isinstance(brok.data, bytes) else pickle.dumps(brok.data, SHINKEN_PICKLE_PROTOCOL)
            # very crude approximation of compressed-pickled size of the brok
            # .25 factor (ie 75% compression rate)
            # cf SEF-6358
            # TODO: refine with some stats
            estimated_size += len(raw_data) * BROK_COMPRESSION_RATE
            taken_broks.append(brok)
        return taken_broks
    
    
    # An element can have its topology changed by an external command
    # if so a brok will be generated with this flag. No need to reset all of
    # them.
    def reset_topology_change_flag(self):
        """Set the topology_change flag of every host and every service to False."""
        for element in itertools.chain(self.hosts, self.services):
            element.topology_change = False
    
    
    # Update the retention file and give all the data in
    # a dict so the read function can pick up what it wants.
    # For now compression is not used, but it can be added easily:
    # just uncomment :)
    def update_retention_file(self, forced=False):
        """Prepare the retention data and start a thread that saves it.

        Nothing is done when self.skip_retention_save is set. The execution statistics
        are deep copied and the retention data is prepared in the calling thread, then
        the saving itself (_save_retention) is run in a new thread. This method returns
        once the new thread has signaled that it has started.

        :param forced: given as is to _save_retention
        """
        if self.skip_retention_save:
            return
        t0 = time.time()
        # Work on a copy so the saving thread does not share the live statistics
        exec_stat = copy.deepcopy(self._exec_stat)
        self._prepare_retention_data()
        time_to_prepare = time.time() - t0
        if time_to_prepare > 1:
            logger.warning('[Support Information]prepare retention data take [%.3f]s' % time_to_prepare)
        
        # Only one saving thread at a time: wait for the previous one to finish
        if self._save_retention_thread:
            self._save_retention_thread.join()
        
        # NOTE(review): _save_retention_lock is used with wait()/notify_all(), presumably a threading.Condition - confirm
        with self._save_retention_lock:
            self._save_retention_thread_has_started = False
        
        self._save_retention_thread = threading.Thread(target=self._save_retention, name='scheduler retention saving', args=(exec_stat, forced))
        self._save_retention_thread.start()
        
        # Block until _save_retention has set the started flag and notified us
        with self._save_retention_lock:
            while not self._save_retention_thread_has_started:
                self._save_retention_lock.wait()
            self._save_retention_thread_has_started = False
    
    
    def wait_for_retention_update_thread(self):
        """Wait for the end of the retention saving thread, if there is one, and forget it."""
        saving_thread = self._save_retention_thread
        if not saving_thread:
            return
        saving_thread.join()
        self._save_retention_thread = None
    
    
    def _save_retention(self, exec_stat, forced):
        """Save the execution statistics and the retention data (target of the saving thread).

        The scheduler statistics 'save_retention_time', 'save_retention_error',
        'last_retention_save' and 'last_retention_save_try' are updated to reflect the
        result of the save. Exceptions are not propagated: they are logged and stored in
        'save_retention_error'.

        :param exec_stat: copy of the execution statistics, given to _save_exec_stat
        :param forced: when True, the 'save_retention' hook is called even if
                       conf.retention_update_interval is 0
        """
        start = time.time()
        self.last_retention_save_start = start
        
        # Tell update_retention_file, which is waiting on the lock, that we have started
        with self._save_retention_lock:
            self._save_retention_thread_has_started = True
            self._save_retention_lock.notify_all()
        
        try:
            # save_retention_time = -1 -> save in progress
            self._update_scheduler_stat('save_retention_time', -1)
            self._update_scheduler_stat('save_retention_error', '')
            self._save_exec_stat(exec_stat)
            # If we set the update to 0, we do not want of this if we do not force (like at stopping)
            if self.conf.retention_update_interval != 0 or forced:
                try:
                    self.hook_point('save_retention')
                except RuntimeError as err:
                    # The error is only recorded: the 'last_retention_save*' statistics are not updated here
                    logger.debug(str(err))
                    time_to_save = time.time() - start
                    self._update_scheduler_stat('save_retention_time', time_to_save)
                    self._update_scheduler_stat('save_retention_error', str(err))
                    return
            
            time_to_save = time.time() - start
            if time_to_save > 2:
                logger.warning('[Support Information]saving retention data take [%.3f]s' % time_to_save)
            self._update_scheduler_stat('save_retention_time', time_to_save)
            saved_at = int(time.time())
            self._update_scheduler_stat('last_retention_save', saved_at)  # we only update the value if we did succeed
            self._update_scheduler_stat('last_retention_save_try', saved_at)
        except Exception as exp:
            # Any other failure: keep the duration, the error and the time of this try
            time_to_save = time.time() - start
            self._update_scheduler_stat('save_retention_time', time_to_save)
            self._update_scheduler_stat('save_retention_error', str(exp))
            self._update_scheduler_stat('last_retention_save_try', int(time.time()))
            logger.error(exp)
            logger.print_stack()
    
    
    # Delete the old mongo retention
    def old_retention_delete(self):
        """Call the 'delete_old_retention' hook, after the end of a previous deletion thread if any."""
        previous_thread = self._delete_old_retention_thread
        if previous_thread:
            previous_thread.join()
        self.hook_point('delete_old_retention')
    
    
    # Load the retention file and get status from it. It does not get all checks in progress
    # for the moment, just the status and the notifications.
    def retention_load(self):
        """Load the retention through the 'load_retention' hook.

        A running retention save is waited for first, and the modules are checked and
        restarted if needed. When the hook succeeds, the post retention actions are
        launched. When it fails, retention saving is disabled and the daemon is asked
        to stop. In both cases the load statistics are updated and the cached retention
        data is dropped.
        """
        self.wait_for_retention_update_thread()
        
        logger.info('Start to load the retention')
        # Be sure to restart dead modules, even if they are init since not long
        self.sched_daemon.check_and_del_zombie_modules(force_start=True)
        
        load_started_at = time.time()
        if self.hook_point('load_retention'):
            # Some actions must be done only once, even if we have two retention modules
            self._launch_post_retention()
        else:
            logger.error('[ RETENTION        ] Failed to load the retention data. Shutting down daemon: we prefer to shut the daemon and leave a spare take the role instead of start and loose data like notifications or downtimes.')
            self.skip_retention_save = True
            self.die()
            self.sched_daemon.must_run = False
            self.sched_daemon.interrupted = True
        
        self._update_scheduler_stat('last_retention_load_epoch', int(load_started_at))
        self._update_scheduler_stat('last_retention_load_duration', time.time() - load_started_at)
        
        # In all cases we drop old retention data
        self._retention_data = None
    
    
    # Called by the retention module: it gives all the data to save.
    # You must call _prepare_retention_data before this method.
    def get_retention_data(self):
        """Return the retention data currently stored in self._retention_data.

        :return: the value of self._retention_data, as is
        """
        return self._retention_data
    
    
    def _prepare_retention_data(self):
        all_data = {'hosts': {}, 'services': {}}
        for host in self.hosts:
            _host_to_save = {}
            running_properties = host.__class__.running_properties
            for prop, entry in running_properties.items():
                if entry.retention:
                    _host_to_save[prop] = Scheduler._get_retention_value(host, entry, prop)
            all_data['hosts'][host.get_instance_uuid()] = _host_to_save
        
        for service in self.services:
            _service_to_save = {}
            running_properties = service.__class__.running_properties
            for prop, entry in running_properties.items():
                if entry.retention:
                    _service_to_save[prop] = Scheduler._get_retention_value(service, entry, prop)
            all_data['services'][service.get_instance_uuid()] = _service_to_save
        
        self._retention_data = all_data
    
    
    @staticmethod
    def _get_retention_value(host, entry, prop):
        v = getattr(host, prop)
        f = entry.retention_preparation
        if f:
            v = f(host, v)
        return v
    
    
    # A retention module is asking us which uuid it needs to retrieve:
    # * hosts & services
    # * maybe we already have in self._retention_data and so don't ask for these
    #   * => means that we had on the configuration just before, and so it is not interesting to be asked from
    #        database
    def get_instances_uuids_to_restore_retention(self):
        """Give the uuids of the hosts and services whose retention must be loaded by a module.

        The elements already present in self._retention_data are not listed.

        :return: a dict with the 'hosts' and 'services' keys, each one being a dict with
                 'total' (number of elements) and 'to_load' (list of uuids to load)
        """
        
        def _uuids_missing_from(elements, known_retention):
            return [uuid for uuid in (element.get_instance_uuid() for element in elements) if uuid not in known_retention]
        
        nb_hosts = len(self.hosts)
        nb_services = len(self.services)
        
        # Hosts:
        hosts_in_cache = self._retention_data['hosts'] if self._retention_data else {}
        hosts_to_load = _uuids_missing_from(self.hosts, hosts_in_cache)
        
        # Services:
        services_in_cache = self._retention_data['services'] if self._retention_data else {}
        services_to_load = _uuids_missing_from(self.services, services_in_cache)
        
        return {
            'hosts'   : {'total': nb_hosts, 'to_load': hosts_to_load},
            'services': {'total': nb_services, 'to_load': services_to_load},
        }
    
    
    # For a host uuid, we can have the retention data in :
    # * the self._retention_data so we are sure it's up-to-date (take first)
    # * the module data
    # * no where, no luck.
    def _get_host_retention_data_from(self, host_uuid, module_data):
        if self._retention_data and host_uuid in self._retention_data['hosts']:
            return self._retention_data['hosts'][host_uuid]
        if host_uuid in module_data['hosts']:
            return module_data['hosts'][host_uuid]
        # found no one, skip it
        return None
    
    
    # Same for checks
    def _get_service_retention_data_from(self, service_uuid, module_data):
        if self._retention_data and service_uuid in self._retention_data['services']:
            return self._retention_data['services'][service_uuid]
        if service_uuid in module_data['services']:
            return module_data['services'][service_uuid]
        # found no one, skip it
        return None
    
    
    # Get back our broks from a retention module :)
    def restore_retention_data(self, data):
        """Re-apply the retention data on every host, then on every service.

        For each item the retention entry is looked up with ``_get_host_retention_data_from`` /
        ``_get_service_retention_data_from``; an item without entry is left untouched.
        The per-item work is the same for hosts and services and lives in
        ``_restore_item_retention_data``; only the state-id mapping and the debug message differ.

        Items get their status back but the restored checks are NOT ready to be launched/analysed:
        a final step is needed once ALL the checks have been loaded.

        :param data: retention data coming from a retention module, given as-is to the lookup helpers
        """
        for host in self.hosts:
            # we can have (or not) retention data from cache or from module
            host_retention_data = self._get_host_retention_data_from(host.get_instance_uuid(), data)
            if host_retention_data is None:  # not found, skip this host retention
                continue
            self._restore_item_retention_data(host, host_retention_data, _HOST_STATUS_ID_TO_STATUS, 'The host %s is restoring a check from retention: id=%s status=%s')
        
        # Same for services
        for service in self.services:
            # we can have (or not) retention data from cache or from module
            check_retention_data = self._get_service_retention_data_from(service.get_instance_uuid(), data)
            if check_retention_data is None:  # not found, skip this service retention
                continue
            self._restore_item_retention_data(service, check_retention_data, _SERVICE_STATUS_ID_TO_STATUS, 'The check %s is restoring a check from retention: uuid=%s status=%s')
    
    
    def _restore_item_retention_data(self, item, retention_data, status_id_to_status, restored_check_log_format):
        """Restore one host or service from its retention entry and re-link its sub objects.

        :param item: the host or service to update
        :param retention_data: mapping property name -> saved value for this item
        :param status_id_to_status: mapping state id -> state string, used to rebuild ``last_state_as_string``
        :param restored_check_log_format: debug format taking (item full name, check uuid, check status)
        """
        # First manage all running properties
        for prop, entry in item.__class__.running_properties.items():
            # Maybe the saved one was not with this value, so we just bypass this
            if entry.retention and prop in retention_data:
                setattr(item, prop, retention_data[prop])
        
        # If the retention comes from an older version without last_state_as_string, we need to recompute it
        # roughly from last_state_id
        if 'last_state_as_string' not in retention_data:
            item.last_state_as_string = status_id_to_status[getattr(item, 'last_state_id', 3)]
        
        # Ok, some are in properties too (like active check enabled) or not.
        # Will OVERRIDE THE CONFIGURATION VALUE!
        for prop, entry in item.__class__.properties.items():
            # Maybe the saved one was not with this value, so we just bypass this
            if entry.retention and prop in retention_data:
                setattr(item, prop, retention_data[prop])
        
        if 'must_respread' not in retention_data:
            item.must_respread = True
        
        # Relink the notified_contacts as a set() of true contacts objects:
        # it was loaded from the retention, so it is currently a list of contact names
        if 'notified_contacts' in retention_data:
            new_notified_contacts = set()
            for cname in item.notified_contacts:
                contact = self.contacts.find_by_name(cname)
                # Maybe the contact is gone. Skip it
                if contact:
                    new_notified_contacts.add(contact)
            item.notified_contacts = new_notified_contacts
        
        # Iterate on a copy of the keys as invalid notifications are deleted on the fly
        for notif_id in list(item.notifications_in_progress.keys()):
            notification = item.notifications_in_progress[notif_id]
            # SEF-7822: protect against bad old notification objects
            if not notification.is_format_valid_after_retention_load():
                del item.notifications_in_progress[notif_id]
                logger_retention_analyse.warning('[%s] The notification [%s] was detected as invalid (bug from the old code), and was dropped.' % (item.get_full_name(), notification))
                continue
            notification.ref = item
            self.add(notification)
        
        item.check_and_error_if_too_much_checks_in_progress(logger_retention_checks_in_progress)
        for check in item.checks_in_progress:
            logger_retention_analyse.debug(restored_check_log_format % (item.get_full_name(), check.get_uuid(), check.status))
            check.ref = item
            self.add(check)
            # IMPORTANT: checks here are NOT ready to be launched/analysed!
            # so we will need a final step at the end of the retention load, because
            # we need to have load ALL checks before be ready
        item.update_in_checking()
        
        # And also add downtimes
        self._reload_and_add_item_downtimes_on_retention_loading(item)
        # Update item acknowledges
        self._reload_item_acknowledgement_on_retention_loading(item)
    
    
    def _reload_and_add_item_downtimes_on_retention_loading(self, item: 'SchedulingItem') -> None:
        """Re-link the downtimes of ``item`` after a retention load and register them with ``self.add``.

        Each downtime gets ``item`` as ``ref``, loses its ``extra_comment``, and has a bytes
        comment decoded to text. ``Downtime.id`` is raised above every loaded id.
        Once done, the item is asked to detect and repair downtime incoherency.
        """
        for downtime in item.downtimes:
            downtime.ref = item
            downtime.extra_comment = None  # No more set (2.05.01 and more)
            # Keep the class level id above all the loaded ones, so new downtimes do not overlap
            Downtime.id = max(Downtime.id, downtime.id + 1)
            
            raw_comment = downtime.comment
            if raw_comment and isinstance(raw_comment, bytes):
                try:
                    decoded_comment = raw_comment.decode('utf8', 'ignore')
                except Exception:
                    decoded_comment = 'Fail to decode downtime comment in utf8. Shinken drops it'
                downtime.comment = decoded_comment
            
            self.add(downtime)
        
        # As we did load downtime values, try to detect if there was incoherency in it #SEF-6735
        item.detect_and_repair_downtime_incoherency()
    
    
    @staticmethod
    def _reload_item_acknowledgement_on_retention_loading(item: 'SchedulingItem') -> None:
        for acknowledgement in [item.acknowledgement, item.partial_acknowledge]:
            if acknowledgement is None:
                continue
            
            acknowledgement.ref = item
            acknowledgement_comment = acknowledgement.comment
            if acknowledgement_comment and isinstance(acknowledgement_comment, bytes):
                try:
                    acknowledgement.comment = acknowledgement_comment.decode('utf8', 'ignore')
                except Exception:
                    acknowledgement.comment = 'Fail to decode acknowledgement comment in utf8. Shinken drops it.'
            
            if acknowledgement.state_when_has_been_acknowledged == 'PENDING':
                acknowledgement.state_when_has_been_acknowledged = item.state if item.state not in ('UNKNOWN', 'UNREACHABLE') else item.last_state
    
    
    # Note: after the retention(s) load we launch these fixes only once, even if there are two retentions
    def _launch_post_retention(self):
        # type: () -> None
        """Final retention step: prepare the restored checks and empty the perf data.

        Every check known at call time (the stacked ones and the ones created this turn) is
        restored from retention and gets its perf data, and the perf data of its ref, emptied.
        Then the perf data of every host is emptied.
        """
        # IMPORTANT: freeze the list of checks now, we only want the ones present at the beginning.
        # NOTE: they already have their ref
        with self._checks_created_this_turn_lock:
            checks_to_prepare = list(self.checks.values())
            checks_to_prepare.extend(self._checks_created_this_turn)  # also the ones not already stacked
        
        for restored_check in checks_to_prepare:
            restored_check.restore_from_retention(logger_retention_analyse)
            restored_check.empty_perf_data()
            restored_check.ref.empty_perf_data()
        
        for one_host in self.hosts:
            one_host.empty_perf_data()
    
    
    @staticmethod
    def __add_generate_brok_time(ref_time, brok_type, generate_broks_stats):
        # type: (float, str, Dict[str, Any]) -> None
        generate_broks_stats['nb_generate'] += 1
        prev_cum_time = generate_broks_stats['time_by_brok_type'].get(brok_type, 0.0)
        diff_time = time.time() - ref_time
        prev_cum_time += diff_time
        generate_broks_stats['time_by_brok_type'][brok_type] = prev_cum_time
        generate_broks_stats['total_time'] += diff_time
    
    
    # Get an initial Broks pack for the InitialBroksFactory, so it can dispatch it to multiple Brokers in one time
    def get_initial_broks(self):
        # type: () -> deque[Brok]
        """Build the pack of initial broks describing this scheduler.

        The pack contains, in this order: a ``clean_all_my_instance_id`` brok, the program status
        brok, one initial status brok per configuration item (skipped when
        ``self.conf.skip_initial_broks`` is set), the proxy items graph brok and a final
        ``initial_broks_done`` brok. The generation time of each brok type is measured and logged.

        :return: the broks, in generation order
        :raises Exception: when the scheduler has no (or a falsy) ``conf``
        """
        if not getattr(self, 'conf', None):
            raise Exception('scheduler has no conf.')
        
        # Local timing accumulator, only used for the [PERF] log at the end of this call
        generate_broks_stats = {'nb_generate': 0, 'total_time': 0.0, 'time_by_brok_type': {}}
        
        broks = deque()
        
        logger.info('[TIMING] generating initial broks for configuration %s (currently have %s hosts and %s services)' % (self.part_configuration_incarnation, len(self.hosts), len(self.services)))
        
        # First a Brok for delete all from my instance_id
        t_before = time.time()
        brok = Brok('clean_all_my_instance_id', {'instance_id': self.instance_id})
        self.__add_generate_brok_time(t_before, 'clean_all_my_instance_id', generate_broks_stats)
        broks.append(brok)
        
        # first the program status
        t_before = time.time()
        brok = self.get_program_status_brok()
        self.__add_generate_brok_time(t_before, 'program_status', generate_broks_stats)
        broks.append(brok)
        
        #  We can't call initial_status from all this types
        #  The order is important, service need host...
        initial_status_types = (self.timeperiods, self.commands,
                                self.contactgroups, self.contacts,
                                self.hosts, self.hostgroups,
                                self.services, self.servicegroups)
        
        # Make sure the flag exists on the conf (False when the configuration did not set it)
        self.conf.skip_initial_broks = getattr(self.conf, 'skip_initial_broks', False)
        logger.debug('Skipping initial broks? %s' % str(self.conf.skip_initial_broks))
        if not self.conf.skip_initial_broks:
            for tab in initial_status_types:
                for i in tab:
                    t_before = time.time()
                    brok = i.get_initial_status_brok()
                    self.__add_generate_brok_time(t_before, 'initial_%s_status' % i.my_type, generate_broks_stats)
                    broks.append(brok)
        
        t_before = time.time()
        brok = self.get_proxy_items_graph_brok()
        self.__add_generate_brok_time(t_before, 'proxy_items_graph', generate_broks_stats)
        broks.append(brok)
        
        # Add a brok to say that we finished all initial_pass
        t_before = time.time()
        brok = Brok('initial_broks_done', {'instance_id': self.instance_id})
        self.__add_generate_brok_time(t_before, 'initial_broks_done', generate_broks_stats)
        broks.append(brok)
        
        logger.debug('%s %s [PERF] Created initial Broks : (nb add this turn:%4d) (total time=%.3f) (by type: %s) ' % (
            CHAPTER_CONFIGURATION, SECTION_BROKERS, generate_broks_stats['nb_generate'], generate_broks_stats['total_time'],
            ', '.join('%s=>%.3fs' % (k, v) for (k, v) in generate_broks_stats['time_by_brok_type'].items())))
        # No need to reset generate_broks_stats: it is local to this call and dropped on return
        
        return broks
    
    
    # Create a brok with program status info
    def get_and_register_program_status_brok(self):
        """Build the program status brok and hand it to ``self.add``. Returns nothing."""
        self.add(self.get_program_status_brok())
    
    
    # Create a brok with program status info, typed as 'update_program_status', and register it
    def get_and_register_update_program_status_brok(self):
        """Build the program status brok, retype it as an update, and hand it to ``self.add``."""
        update_brok = self.get_program_status_brok()
        # Same payload as the initial program status, only the brok type differs
        update_brok.type = 'update_program_status'
        self.add(update_brok)
    
    
    # Get a brok with program status
    def get_program_status_brok(self):
        """Return a new ``program_status`` brok.

        The payload mixes scheduler identity (instance id/name, pid, program start), the current
        time (as an int, used for the ``last_*`` fields), fixed values, and settings read from ``self.conf``.
        """
        conf = self.conf
        current_time = int(time.time())
        status = {
            'is_running'                    : 1,
            'instance_id'                   : self.instance_id,
            'instance_name'                 : self.instance_name,
            'last_alive'                    : current_time,
            'interval_length'               : conf.interval_length,
            'program_start'                 : self.program_start,
            'pid'                           : os.getpid(),
            'daemon_mode'                   : 1,
            'last_command_check'            : current_time,
            'last_log_rotation'             : current_time,
            'notifications_enabled'         : conf.enable_notifications,
            'active_service_checks_enabled' : conf.execute_service_checks,
            'passive_service_checks_enabled': conf.accept_passive_service_checks,
            'active_host_checks_enabled'    : conf.execute_host_checks,
            'passive_host_checks_enabled'   : conf.accept_passive_host_checks,
            'event_handlers_enabled'        : conf.enable_event_handlers,
            'flap_detection_enabled'        : conf.enable_flap_detection,
            'failure_prediction_enabled'    : 0,
            'process_performance_data'      : conf.process_performance_data,
            'obsess_over_hosts'             : conf.obsess_over_hosts,
            'obsess_over_services'          : conf.obsess_over_services,
            'modified_host_attributes'      : 0,
            'modified_service_attributes'   : 0,
            'global_host_event_handler'     : conf.global_host_event_handler,
            'global_service_event_handler'  : conf.global_service_event_handler,
            'check_external_commands'       : conf.check_external_commands,
            'check_service_freshness'       : conf.check_service_freshness,
            'check_host_freshness'          : conf.check_host_freshness,
            'command_file'                  : conf.command_file,
            'default_properties_values'     : conf.default_properties_values,
        }
        return Brok('program_status', status)
    
    
    # Get a brok with the item graph
    def get_proxy_items_graph_brok(self):
        """Return a new ``proxy_items_graph`` brok with the son -> fathers graph and our instance id."""
        return Brok('proxy_items_graph', {
            'proxy_items_graph_son_to_fathers': proxyitemsgraph.son_to_fathers,
            'instance_id'                     : self.instance_id,
        })
    
    
    # Called every 1sec to consume every result in services or hosts
    # with these results, they are OK, CRITICAL, UP/DOWN, etc...
    def consume_results(self):
        """Integrate the results received since the last call and consume the checks that are ready.

        Steps:
          1. swap ``self.waiting_results`` with a fresh list (under ``waiting_results_lock``)
          2. give each received result to ``put_results`` (and count it in the executor stats)
          3. launch the consume of every check in ``WAITCONSUME`` status
          4. launch the consume of the ``ALL_DEP_ARE_FINISH`` checks that can be consumed

        The time spent in each step is logged at debug level.
        """
        start = time.time()
        # All results are in self.waiting_results: take them and switch to a new list,
        # so the lock is held as little as possible
        with self.waiting_results_lock:
            t0 = time.time()
            old_waiting_results = self.waiting_results
            self.waiting_results = []
        
        # Now loop around checks we received
        received_results = 0
        for action in old_waiting_results:
            self._set_raw_stat_for_checks_received(action)
            self.put_results(action)
            received_results += 1
        
        t1 = time.time()
        # Then we consume them. The checks with all their dependencies done are only collected here
        # and managed in a second pass, after all the WAITCONSUME ones
        _all_dep_are_finishs = []
        for check in self.checks.values():
            current_check_status = check.status
            if current_check_status == CHECK_STATUS.WAITCONSUME:
                check.launch_consume()
            elif current_check_status == CHECK_STATUS.ALL_DEP_ARE_FINISH:
                _all_dep_are_finishs.append(check)
        
        t2 = time.time()
        
        # Finish to consume checks that have all their deps done
        for check in _all_dep_are_finishs:
            if check.can_be_consume():
                check.launch_consume()
        t3 = time.time()
        
        lock_time = t0 - start
        put_result_time = t1 - t0
        waitconsume_time = t2 - t1
        _all_dep_are_finishs_time = t3 - t2
        consume_time = t3 - start
        # Avoid a division by zero in the speed computation below
        consume_time = 1 if consume_time == 0 else consume_time
        
        logger.debug('%s [consume] [%.3f] for [%d] results (speed consume/s:[%d]) ->  wait_lock:[%.3f] put_results:[%.3f]   waitconsume:[%.3f]   all_dep_are_finishs:[%.3f]' %
                     (CHAPTER_STATS, consume_time, received_results, (received_results / consume_time), lock_time, put_result_time, waitconsume_time, _all_dep_are_finishs_time))
    
    
    def _set_raw_stat_for_checks_received(self, action):
        executor_id = action.executor_id
        if executor_id in self.stat_by_executor and 'raw_nb_checks_received' in self.stat_by_executor[executor_id]:
            stat_by_executor = self.stat_by_executor[executor_id]
            stat_by_executor['raw_nb_checks_received'] += 1
            stat_by_executor['raw_cpu_time_checks_received'] += action.average_cpu_time
        else:
            stat_by_executor = self.stat_by_executor.get(executor_id, {})
            self.stat_by_executor[executor_id] = stat_by_executor
            stat_by_executor['raw_nb_checks_received'] = 1
            stat_by_executor['raw_cpu_time_checks_received'] = action.average_cpu_time
    
    
    # Called every 1sec to delete all checks in a zombie state
    # zombie = not useful anymore
    def delete_zombie_checks(self):
        """Remove from ``self.checks`` every check whose status is ``'zombie'``.

        The zombies are collected first; the removal itself is done under ``checks_n_actions_lock``.
        """
        zombie_uuids = [check.get_uuid() for check in self.checks.values() if check.status == 'zombie']
        with self.checks_n_actions_lock:
            for zombie_uuid in zombie_uuids:
                del self.checks[zombie_uuid]
    
    
    # Called every 1sec to delete all actions in a zombie state
    # zombie = not useful anymore
    def delete_zombie_actions(self):
        """Remove from ``self.actions`` every action whose status is ``'zombie'``.

        Both the scan and the removal are done under ``checks_n_actions_lock``.
        The deletion of an event handler is traced at debug level.
        """
        with self.checks_n_actions_lock:
            zombie_uuids = []
            for action in self.actions.values():  # type: Union[Check,EventHandler]
                if action.status != 'zombie':
                    continue
                zombie_uuids.append(action.get_uuid())
                if action.is_a == ACTION_TYPES.EVENTHANDLER:
                    logger.debug('[EVENTHANDLER] the event handler %s (on the element %s) will be deleted' % (action.get_uuid(), action.ref.get_full_name()))
            
            # Delete after the scan: we cannot remove entries while iterating on the dict
            for zombie_uuid in zombie_uuids:
                del self.actions[zombie_uuid]
    
    
    # Check for downtimes start and stop, and register them if needed
    def update_downtimes(self):
        """Run one downtime pass: maintenance periods, starts/stops, cluster inheritance and cleaning.

        Steps, each one timed and logged at debug level:
          1. items with a maintenance period: create the matching downtime when the period is
             currently valid, or forget a maintenance downtime that is no longer registered
          2. check the activation of the contact downtimes
          3. delete the contact downtimes that can be deleted
          4. exit the expired host/service downtimes, enter the fixed ones that must start
          5. recompute the inherited/partial downtime state of the hosts with a business rule
          6. delete the host/service downtimes that can be deleted
          7. register every brok produced by the previous steps
        """
        pending_broks = []
        now = time.time()
        logger.debug('%s At the start of update_downtimes, there is [%d] downtimes on this scheduler and [%d] are active' % (_DEBUG_PERF_TAG, len(self.downtimes), sum(1 for downtime in self.downtimes.values() if downtime.is_in_effect)))
        
        # 1. Check maintenance periods
        items_with_period = [item for item in itertools.chain(self.hosts, self.services) if item.maintenance_period is not None]
        for item in items_with_period:  # type: Union[Host, Service]
            if item.in_maintenance is not None:
                if item.in_maintenance not in self.downtimes:
                    # the main downtimes has expired or was manually deleted
                    item.in_maintenance = None
                continue
            
            period = item.maintenance_period
            if not period.is_time_valid(now):
                continue
            
            start_dt = period.get_next_valid_time_from_t(now)
            next_invalid = period.get_next_invalid_time_from_t(start_dt + 1)
            # -1 for go back to the very last valid second.
            # If no next invalid time was found we set the end of the downtime to the next year
            end_dt = next_invalid - 1 if next_invalid else start_dt + timeperiod.ONE_YEAR_AS_SECONDS
            maintenance_downtime = Downtime(item, start_dt, end_dt, 1, 0, 0, "system", "this downtime was automatically scheduled through a maintenance_period")
            item.add_downtime(maintenance_downtime)
            self.add(maintenance_downtime)
            self.get_and_register_status_brok(item)
            item.in_maintenance = maintenance_downtime.id
        t1 = time.time()
        logger.debug('%s Check %d host/cluster/service maintenance period in %.3fs' % (_DEBUG_PERF_TAG, len(items_with_period), t1 - now))
        
        # 2. Check the validity of contact downtimes
        contact_downtime_number = 0
        for contact in self.contacts:
            for contact_downtime in contact.downtimes:
                contact_downtime.check_activation()
                contact_downtime_number += 1
        t2 = time.time()
        logger.debug('%s Check %s contact downtimes validity in %.3fs' % (_DEBUG_PERF_TAG, contact_downtime_number, t2 - t1))
        
        # 3. Clean the contact downtimes that can be deleted (loop on a copy as we delete entries)
        clean_contact_dt_number = 0
        for contact_downtime in list(self.contact_downtimes.values()):
            if not contact_downtime.can_be_deleted:
                continue
            clean_contact_dt_number += 1
            contact_ref = contact_downtime.ref
            self.del_contact_downtime(contact_downtime.id)
            pending_broks.append(contact_ref.get_update_status_brok())
        t3 = time.time()
        logger.debug('%s Delete %d obsolete contact downtimes in %.3fs' % (_DEBUG_PERF_TAG, clean_contact_dt_number, t3 - t2))
        
        # 4. Check start and stop times
        downtimes_checked = 0
        for downtime in list(self.downtimes.values()):
            # if it can be deleted, will be done after
            if downtime.can_be_deleted:
                continue
            downtimes_checked += 1
            if downtime.real_end_time < now:
                # this one has expired
                pending_broks.extend(downtime.exit())  # returns downtimestop notifications
                continue
            if now >= downtime.start_time and downtime.fixed and not downtime.is_in_effect:
                # this one has to start now
                pending_broks.extend(downtime.enter())  # returns downtimestart notifications
                pending_broks.append(downtime.ref.get_update_status_brok())
                # also get news from services if they are updated
                if downtime.ref.my_type == 'host':
                    pending_broks.extend(service.get_update_status_brok() for service in downtime.ref.services)
        t4 = time.time()
        logger.debug('%s Check %d host/service downtimes validity in %.3fs' % (_DEBUG_PERF_TAG, downtimes_checked, t4 - t3))
        
        # 5. Cluster case: check for sons for inherited dt status
        cluster_downtime_number = 0
        for host in self.hosts:
            if not host.got_business_rule:
                continue
            cluster_downtime_number += 1
            pending_broks.extend(self._check_cluster_downtime(host))
        t5 = time.time()
        logger.debug('%s Check %d cluster downtimes validity in %.3fs' % (_DEBUG_PERF_TAG, cluster_downtime_number, t5 - t4))
        
        # 6. Remove the downtimes which were marked for deletion (mostly by the exit() calls above)
        clean_dt_number = 0
        for downtime in list(self.downtimes.values()):
            if not downtime.can_be_deleted:
                continue
            clean_dt_number += 1
            downtime_ref = downtime.ref
            self.del_downtime(downtime.id)
            pending_broks.append(downtime_ref.get_update_status_brok())
            # also get news from services if they are updated
            if downtime_ref.my_type == 'host':
                pending_broks.extend(service.get_update_status_brok() for service in downtime_ref.services)
        t6 = time.time()
        logger.debug('%s Delete %d obsolete host/service/cluster downtimes in %.3fs' % (_DEBUG_PERF_TAG, clean_dt_number, t6 - t5))
        
        # 7. Register all the broks we did produce
        for brok in pending_broks:
            self.add(brok)
        t7 = time.time()
        logger.debug('%s Update %s broks for DOWNTIMESTART/DOWNTIMESTOP and FULL_STATUS for elements in %.3fs' % (_DEBUG_PERF_TAG, len(pending_broks), t7 - t6))
        
        logger.debug('%s At the end   of update_downtimes, there is [%d] downtimes on this scheduler and [%d] are active. update_downtimes took %.3fs' % (
            _DEBUG_PERF_TAG, len(self.downtimes), sum(1 for downtime in self.downtimes.values() if downtime.is_in_effect), t7 - now))
    
    
    def _check_cluster_downtime(self, cluster: 'Host') -> 'list[Brok]':
        for dt in self.downtimes.values():
            if dt.ref == cluster and dt.is_in_effect:
                return []
        is_inherited_dt, is_partial_dt = cluster.business_rule.get_downtime_state()
        if cluster.in_inherited_downtime != is_inherited_dt or cluster.in_partial_downtime != is_partial_dt or cluster.in_scheduled_downtime:
            cluster.in_inherited_downtime = is_inherited_dt
            cluster.in_partial_downtime = is_partial_dt
            cluster.in_scheduled_downtime = False
            proxyitemsmgr.update_in_inherited_downtime(cluster.get_instance_uuid(), is_inherited_dt)
            proxyitemsmgr.update_partial_downtime(cluster.get_instance_uuid(), is_partial_dt)
            return [cluster.get_update_status_brok()]
        return []
    
    
    # Main schedule function to make the regular scheduling
    def schedule(self):
        """Ask every service, then every host, to schedule itself."""
        for item in itertools.chain(self.services, self.hosts):  # type: Optional[SchedulingItem]
            item.schedule()
    
    
    # Main actions reaper function: it gets all new checks,
    # notification and event handler from hosts and services
    def get_new_actions(self):
        """Take the pending actions of every service and host, register them, and stack the checks.

        For each item every entry of ``item.actions`` is given to ``self.add`` and the list is
        reset. Then ``_stack_all_checks`` is called. When ``LOG_SCHEDULER_RECURRENT_TIMES`` is set
        the timings of this turn are logged; the ``ADD_CHECKS_STATS`` / ``ADD_BROKS_STATS``
        counters are always reset at the end.
        """
        self.hook_point('get_new_actions')
        # Number of actions taken this turn, by item type. Kept aside from get_new_actions_stats
        # so each type is logged with its OWN count (and the shape of the shared stats is unchanged)
        nb_actions_by_type = {}
        # ask for service and hosts their next check
        for type_tab in [self.services, self.hosts]:
            start_of_type = time.time()
            nb_actions_stacked = 0
            for i in type_tab:
                for a in i.actions:
                    nb_actions_stacked += 1
                    self.add(a)
                # We take all, we can clear it
                i.actions = []
            end_of_type = time.time()
            type_name = type_tab.inner_class.my_type
            get_new_actions_stats[type_name] = (len(type_tab), end_of_type - start_of_type)
            nb_actions_by_type[type_name] = nb_actions_stacked
        
        # Now we can stack all to self.checks
        self._stack_all_checks()
        
        if LOG_SCHEDULER_RECURRENT_TIMES:
            for type_class in get_new_actions_stats:
                nb_elements, elapsed_time = get_new_actions_stats[type_class]
                logger.debug('[TIMING] get_new_actions_stats:: %s total=%s => nb actions:%d in %.3fs' % (type_class, nb_elements, nb_actions_by_type.get(type_class, 0), elapsed_time))
            
            logger.debug('[TIMING] Adding checks times: (nb add this turn:%4d) (total time=%.3f) (lock acquired time=%.3f) (set time=%.3f) (dispatch time=%.3f) (hash time=%.3f) (brok create time=%.3f) (brok add time=%.3f)' % (
                ADD_CHECKS_STATS['nb_add'], ADD_CHECKS_STATS['total_time'], ADD_CHECKS_STATS['lock_time'], ADD_CHECKS_STATS['set_time'], ADD_CHECKS_STATS['dispatch_time'], ADD_CHECKS_STATS['hash_time'], ADD_CHECKS_STATS['brok_create_time'],
                ADD_CHECKS_STATS['brok_add_time']))
            
            logger.debug('[TIMING] Adding broks times : (nb add this turn:%4d) (total time=%.3f) (sat lock acquired=%.3f) (brokers lock acquired=%.3f) (one broker=%.3f) (to all brokers=%.3f) (to global list=%.3f)' % (
                ADD_BROKS_STATS['nb_add'], ADD_BROKS_STATS['total_time'], ADD_BROKS_STATS['sat_lock_time'], ADD_BROKS_STATS['brokers_lock_time'], ADD_BROKS_STATS['to_one_broker_time'], ADD_BROKS_STATS['to_all_brokers_time'],
                ADD_BROKS_STATS['to_global_list_time']))
        
        # The add stats are by turn: reset them for the next one
        ADD_CHECKS_STATS['nb_add'] = 0
        ADD_CHECKS_STATS['total_time'] = ADD_CHECKS_STATS['lock_time'] = ADD_CHECKS_STATS['set_time'] = ADD_CHECKS_STATS['dispatch_time'] = ADD_CHECKS_STATS['hash_time'] = ADD_CHECKS_STATS['brok_create_time'] = ADD_CHECKS_STATS['brok_add_time'] = 0.0
        
        ADD_BROKS_STATS['nb_add'] = 0
        ADD_BROKS_STATS['total_time'] = ADD_BROKS_STATS['sat_lock_time'] = ADD_BROKS_STATS['brokers_lock_time'] = ADD_BROKS_STATS['to_one_broker_time'] = ADD_BROKS_STATS['to_all_brokers_time'] = ADD_BROKS_STATS['to_global_list_time'] = 0.0
        
        if LOG_SCHEDULER_RECURRENT_TIMES:
            # Time a fixed 50000 iterations loop and log it as 'computing exec time'
            # NOTE(review): presumably a rough probe of the current interpreter speed - confirm
            before = time.time()
            i = 0
            while i < 50000:
                i += 1
            logger.debug('[TIMING] computing exec time: %.3fs' % (time.time() - before))
    
    
    # Similar as above, but for broks
    # Important: broks will be set into the self.brokers, so need
    #            to lock the access to it
    def get_new_broks(self):
        """Take the pending broks of every service, then every host, and register them.

        For each item only the LAST brok of each brok type is given to ``self.add``;
        the item brok list is then emptied.
        """
        # IMPORTANT: always the satellite lock BEFORE brokers lock
        with self.sched_daemon.satellite_lock, self.brokers_lock:
            for element in itertools.chain(self.services, self.hosts):  # type: SchedulingItem
                # A later brok replaces the previous one of the same type
                newest_by_type = {}
                for brok in element.broks:
                    newest_by_type[brok.type] = brok
                for brok in newest_by_type.values():
                    self.add(brok)
                # We took all we wanted, we can clear it
                element.broks.clear()
    
    
    # Raises checks for no fresh states for services and hosts
    def check_freshness(self):
        """Ask every service, then every host, for a freshness check and register the ones returned."""
        for item in itertools.chain(self.services, self.hosts):
            freshness_check = item.do_check_freshness()
            if freshness_check is None:
                continue
            self.add(freshness_check)
    
    
    # Check for orphaned checks: checks that never returns
    # so if in poller and t_to_go < now - 300s: pb!
    # Warn only one time for each "worker"
    # XXX I think we should make "time_to_orphanage" configurable
    #     each action type, each for notification, event_handler & check
    #     I think it will be a little more useful that way, not sure tho
    def check_orphaned(self):
        """Run ``_check_orphaned`` while holding ``checks_n_actions_lock`` and return its result."""
        with self.checks_n_actions_lock:
            result = self._check_orphaned()
        return result
    
    
    # real version without the lock inside
    def _check_orphaned(self):
        """Re-enable the checks and actions that are ``'inpoller'`` for too long. No lock is taken here.

        An element is late when its ref gives a truthy time to orphanage and its ``t_to_go`` is
        older than ``now - time_to_orphanage``. Late checks are reset to be scheduled now and
        re-indexed in ``checks_container``; late actions go back to ``'scheduled'`` with
        ``t_to_go = now``. One warning is logged by executor with the number of late elements.
        """
        now = int(time.time())
        late_check_uuids = []
        late_action_uuids = []
        late_count_by_executor = {}
        
        for check in list(self.checks.values()):  # type: Check
            orphan_delay = check.ref.get_time_to_orphanage()
            if not orphan_delay:
                continue
            if not (check.status == 'inpoller' and check.t_to_go < now - orphan_delay):
                continue
            # ok it's an orphan, so change it's time to go to now, but don't touch the original time to go,
            # so we can compute real latency
            check.reset_schedule_now()
            checks_container.do_index_job_execution(check)
            late_check_uuids.append(check.get_uuid())
            late_count_by_executor[check.executor_id] = late_count_by_executor.get(check.executor_id, 0) + 1
        
        for action in list(self.actions.values()):
            orphan_delay = action.ref.get_time_to_orphanage()
            if not orphan_delay:
                continue
            if not (action.status == 'inpoller' and action.t_to_go < now - orphan_delay):
                continue
            # same idea as for checks, but the action is directly put back as scheduled
            action.status = 'scheduled'
            action.t_to_go = now
            late_action_uuids.append(action.get_uuid())
            late_count_by_executor[action.executor_id] = late_count_by_executor.get(action.executor_id, 0) + 1
        
        for executor_id, late_count in late_count_by_executor.items():
            logger.warning("%d actions (check, notification, event handler, ...) never came back for the satellite '%s'. Re-enabling them for immediate execution." % (late_count, executor_id))
        if MONITORING_CHECK_CONSUME_DEBUG_FLAG:
            logger.info('The checks that are late are: (%s) %s' % (len(late_check_uuids), late_check_uuids))
            logger.info('The notifs that are late are: (%s) %s' % (len(late_action_uuids), late_action_uuids))
    
    
    # Closes and filters incidents
    def clean_incidents(self):
        """Call clean_incidents() on every service, then on every host."""
        for item in itertools.chain(self.services, self.hosts):  # type: Union[Host, Service]
            item.clean_incidents()
    
    
    # Update and clean data load from retention with new configuration
    def _sanitize_item_post_retention_load(self):
        # Two steps, in this order:
        # 1. recompute the hosts and services data
        # 2. recompute the time to go of the scheduled master notifications
        self._recompute_hosts_and_services_data()
        self._reschedule_notification_with_new_notification_interval()
    
    
    def _recompute_hosts_and_services_data(self):
        for items in [self.hosts, self.services]:
            for item in items:
                self._recompute_impact_and_root_problem(item)
                # Order is important here : first compute next schedule, then update new items
                self._set_next_chk_validity_period_and_missing_data_time(item)
                self._update_new_pending_element(item)
    
    
    @staticmethod
    def _recompute_impact_and_root_problem(item):
        # type: (SchedulingItem) -> None
        if item.state != 'PENDING' and item.state_type == 'HARD' and item.state_id != 0 and item.is_root_problem():
            # SEF-5621 After retention load we do not change state of the impact, because it was done before retention loading, and it must not be done twice
            item.set_myself_as_problem(enable_impact_state=False)
        # Proxy may not have been updated if root_problem is empty after retention loading
        if not item.source_problems:
            proxyitemsmgr.update_root_problems(item.get_instance_uuid(), [])
    
    
    def _set_next_chk_validity_period_and_missing_data_time(self, item):
        # type: (SchedulingItem) -> None
        # Delegates to item.schedule(), giving it the scheduler's missing_data_startup_delay
        # and force_check_spread_out settings.
        item.schedule(startup_minimal_time_before_missing=self.missing_data_startup_delay, force_check_spread_out=self.force_check_spread_out)
    
    
    @staticmethod
    def _update_new_pending_element(item):
        if item.state in ('PENDING', 'UNKNOWN', 'UNREACHABLE') and item.last_chk == 0:
            if (time.time() - item.monitoring_start_time) < 3600:
                item.last_chk = item.monitoring_start_time
            else:
                item.last_chk = time.time()
            if item.last_state_change == 0:
                item.last_state_change = item.last_chk
            if item.last_hard_state_change == 0:
                item.last_hard_state_change = item.last_chk
            if item.next_chk and item.next_chk > 0:
                next_chk = ' ( %s )' % time.strftime('%H:%M:%S %d-%m-%Y (%Z)', time.localtime(item.next_chk))
                if item.state_validity_end_time == 0:
                    item.state_validity_end_time = item.next_chk
            else:
                next_chk = ''
            last_chk = time.strftime('%H:%M:%S %d-%m-%Y (%Z)', time.localtime(item.last_chk))
            item.output = PENDING_OUTPUT[item.__class__.language] % (last_chk, next_chk)
    
    
    def _reschedule_notification_with_new_notification_interval(self):
        """Recompute the time to go of the scheduled master notifications.

        A master notification is a scheduled notification without contact. Only the ones that
        have a previous_t_to_go are recomputed, by asking their ref item for the next notification time.
        """
        for action in list(self.actions.values()):
            if action.is_a != ACTION_TYPES.NOTIFICATION or action.status != 'scheduled' or action.contact:
                continue
            if not action.previous_t_to_go:
                continue
            
            owner = action.ref
            former_t_to_go = action.t_to_go
            recomputed_t_to_go = owner.get_next_notification_time(action, action.previous_t_to_go)
            # Nothing to do when there is no new time, or when it did not move
            if recomputed_t_to_go is None or former_t_to_go == recomputed_t_to_go:
                continue
            action.t_to_go = recomputed_t_to_go
            logger.debug("reschedule notification of:[%s] old_t_to_go:[%s]->t_to_go[%s]" % (owner.get_full_name(), former_t_to_go, action.t_to_go))
    
    
    # Force an update of the proxies after loading the retention data
    def update_proxy_items_states(self):
        """Call update_proxy() on every service, then on every host."""
        for element in itertools.chain(self.services, self.hosts):
            element.update_proxy()
    
    
    # For all clusters, compute the flapping change periodically, and not only when one of the host states changes
    def compute_cluster_flapping(self):
        """Feed the flapping detection of every cluster and queue a check result brok for each of them."""
        for cluster in self.clusters.values():
            current_state_id = cluster.state_id
            previous_state_id = cluster.last_cluster_state_id
            # -1 means "not set yet": in that case compare the current state with itself
            if previous_state_id == -1:
                previous_state_id = current_state_id
            cluster.check_flapping_change(current_state_id, previous_state_id)
            cluster.last_cluster_state_id = cluster.state_id
            result_brok = cluster.get_check_result_brok()
            cluster.broks.append(result_brok)
    
    
    def is_activated(self):
        # Simple delegation to the scheduler daemon (run() treats a non activated scheduler as a spare)
        return self.sched_daemon.is_activated()
    
    
    def _increase_loop_number(self):
        # Called once per main loop turn by run()
        self._loop_number += 1
    
    
    def _get_loop_number(self):
        # Current value of the main loop counter
        return self._loop_number
    
    
    # Main function
    def run(self):
        """Main function of the scheduler.

        Sequence:
        * launch the InitialBroksFactory thread, and stop here if the scheduler is not activated (spare)
        * load the retention, refresh the proxies and sanitize the items
        * start the late external modules, do the first scheduling and compute the business values
        * declare the scheduler ready, then loop while self.must_run, one turn by second at most
        * when the loop ends: stop the watchdog, save the retention, declare the scheduler not ready
        """
        # In all cases, be sure the InitialBroksFactory thread is up
        self.initial_broks_factory.launch_main_thread()
        
        # I'm a spare so do nothing
        if not self.is_activated():
            return
        
        # see SEF-9777
        logger_monitoring = LoggerFactory.get_logger('SCHEDULING').get_sub_part('STATUS DEPENDENCY UPDATE')
        logger_monitoring.set_enable(False)
        LoggerFactory.get_logger('SCHEDULE ITEM').set_enable(False)
        
        # Then we see if we've got info in the retention file
        # NOTE: killing watch dog after 30min, so only real deadlock are raised here
        _30_min = 60 * 30
        with WatchDogThreadDumper('Load retention', wait_time=_30_min, dump_interval=60 * 5, fatal_dead_lock_delay=_30_min):
            self.retention_load()
            
            # We need to delete old retention we don't need any more (ONLY AVAILABLE FOR MONGO RETENTION)
            if self.must_run and "mongodb_retention" in [module.module_type for module in self.sched_daemon.modules]:
                self.old_retention_delete()
        
        # Since we did load the retention, update the proxy elements with new values, to be sure to have something
        self.update_proxy_items_states()
        
        if self.force_check_spread_out:
            logger.info('[ MAINTENANCE ] [ CHECK SPREAD OUT ] Scheduler will force all check spread out')
        # Update and clean data load from retention with new configuration
        self._sanitize_item_post_retention_load()
        # The forced spread out is only used by the sanitize step above
        self.force_check_spread_out = False
        
        # Finally start the external modules now we got our data
        self.hook_point('pre_scheduler_mod_start')
        self.sched_daemon.modules_manager.start_external_instances(late_start=True)
        
        # Ok, now all is initialized, we can make the initial broks
        logger.info("[%s] First scheduling is launching on host [ %d ] and checks [ %d ]" % (self.instance_name, len(self.hosts), len(self.services)))
        self.schedule()
        
        # And do a force for all clusters
        for cluster in self.clusters.values():
            cluster.schedule(force=True, allow_next_schedule_brok=False)
        logger.info("[%s] First scheduling done" % self.instance_name)
        
        # We must reset it if we received a new conf from the Arbiter.
        # Otherwise, the stat check average won't be correct
        
        self.load_one_min = Load(initial_value=1)
        
        # Recompute initial business impacts to prevent sending erroneous notifications
        # NOTE: in this very first call, we do not want status_broks as the initial_broks will
        #       do the job just after
        self.update_business_values(allow_status_broks=False)
        
        # We are now ready because:
        # * we have our retention
        # * the scheduling is done, we can have real initial_broks with real next check
        # * valid business values
        self.set_scheduler_is_ready(True)
        
        logger.debug("[scheduler][%s] First loop at %d" % (self.instance_id, time.time()))
        mainloop_watchdog = WatchDogThreadDumper('Main loop', wait_time=_30_min, dump_interval=60 * 5, fatal_dead_lock_delay=_30_min, multi_usage=True)
        while self.must_run:
            # Go dead if lock more than 30min (big, but ok for a deadlock detection)
            with mainloop_watchdog:
                self.sched_daemon.daily_log_version()
                start_snap = cpu_stats_helper.get_thread_cpu_snapshot()
                start_of_loop = time.time()
                self._reset_loop_run_time()
                self._increase_loop_number()
                loop_number = self._get_loop_number()
                
                logger.info(_SCHEDULER_TIME_STR)
                logger.info('%s %s [ Loop number=%-5d ] ===-===-===-===-===-===-===-===-===-===-===-===-===' % (_SCHEDULER_TIME_STR, _LOOP_START_STR, loop_number))
                
                # The daemon must check if its threads are ok or not
                self.sched_daemon.assert_valid_satellite_threads()
                
                t1 = time.time()
                logger.debug('%s took [ %6.3f ]s in assert_valid_satellite_threads' % (_SCHEDULER_TIME_STR, t1 - start_of_loop))
                
                # if we have dead modules, we must restart them
                self.sched_daemon.check_and_del_zombie_modules()
                t2 = time.time()
                
                logger.debug('%s took [ %6.3f ]s in check_and_del_zombie_modules' % (_SCHEDULER_TIME_STR, t2 - t1))
                self.sched_daemon.sleep_time = 0.0
                
                hook_tick_start_time = time.time()
                self.hook_point('tick')
                
                t3 = time.time()
                logger.log_perf(hook_tick_start_time, _SCHEDULER_TIME_STR, '''modules' tick''', min_time=0, info_time=0.1)
                
                # Everything from the start of the loop until here is counted in the CHECK_ENV group
                self.update_loop_run_time(LOOP_TIME_GROUP.CHECK_ENV, t3, start_of_loop)
                # Do recurrent works like schedule, consume delete_zombie_checks
                for i in self.recurrent_works:
                    (name, f, nb_ticks, loop_run_time_group) = self.recurrent_works[i]
                    # A 0 in the tick will just disable it
                    if nb_ticks != 0:
                        _before = time.time()
                        snap = cpu_stats_helper.get_thread_cpu_snapshot()
                        # The work is only executed every nb_ticks loops
                        if loop_number % nb_ticks == 0:
                            f()
                        _after = time.time()
                        # NOTE(review): _diff is never read after this assignment
                        _diff = (_after - _before)
                        self.update_loop_run_time(loop_run_time_group, _after, _before)
                        logger.debug('%s took [ %6.3f ]s %s in recurrent_works : %s' % (_SCHEDULER_TIME_STR, _after - _before, snap.get_diff(), name))
                
                t4 = time.time()
                logger.debug('%s took [ %6.3f ]s in all recurrent_works' % (_SCHEDULER_TIME_STR, t4 - t3))
                
                with self.checks_n_actions_lock:
                    self._compute_and_print_stat()
                t5 = time.time()
                logger.debug('%s took [ %6.3f ]s in _compute_and_print_stat' % (_SCHEDULER_TIME_STR, t5 - t4))
                self.update_loop_run_time(LOOP_TIME_GROUP.CHECK_ENV, t5, t4)
                
                # We are cleaning old proxy change entries, so we don't leak memory
                # all over the time
                proxyitemsmgr.clean_history()
                t6 = time.time()
                logger.debug('%s took [ %6.3f ]s in proxyitemsmgr.clean_history' % (_SCHEDULER_TIME_STR, t6 - t5))
                self.update_loop_run_time(LOOP_TIME_GROUP.CLEANING, t6, t5)
                
                if self.need_objects_dump:
                    logger.debug('I need to dump my objects!')
                    self.dump_objects()
                    self.need_objects_dump = False
                t7 = time.time()
                logger.debug('%s took [ %6.3f ]s in dump_objects' % (_SCHEDULER_TIME_STR, t7 - t6))
                
                with self.sched_daemon.satellite_lock:
                    logger.debug(
                        '%s %s [QUEUE] Current brokers and broks queues: %s' % (CHAPTER_STATS, SECTION_BROKERS, ' '.join(['[%s=>%s broks in queue]' % (broker_name, len(broker['broks'])) for (broker_name, broker) in self.brokers.items()])))
                
                # Now we should loop at time
                end_of_loop = time.time()
                
                # The DEBUG group covers the objects dump and the brokers queue log (from t6 to the end of the loop)
                self.update_loop_run_time(LOOP_TIME_GROUP.DEBUG, end_of_loop, t6)
                
                elapsed_time = end_of_loop - start_of_loop
                logger.info('%s took [ %6.3f ]s to schedule checks, consume results and create broks' % (_SCHEDULER_TIME_STR, self.loop_time['SCHEDULING']))
                logger.info('%s took [ %6.3f ]s to update items context ( downtimes, acknowledge, flapping, root_problems, business values )' % (_SCHEDULER_TIME_STR, self.loop_time['CONTEXT_UPDATE']))
                logger.info('%s took [ %6.3f ]s to clean data ( cache, zombies, proxy, stats, notifications list )' % (_SCHEDULER_TIME_STR, self.loop_time['CLEANING']))
                logger.info('%s took [ %6.3f ]s to check environment and update stats ( satellite_thread, time, retention, orphan, modules )' % (_SCHEDULER_TIME_STR, self.loop_time['CHECK_ENV']))
                logger.debug('%s took [ %6.3f ]s to debug operations ( log, dump objects )' % (_SCHEDULER_TIME_STR, self.loop_time['DEBUG']))
                
                # A loop longer than one second is logged as a warning
                log_level = logging.WARNING if elapsed_time > 1 else logging.INFO
                logger.log(log_level, '%s %s [ Loop number=%-5d ] [PERF] [ %6.3f ]s' % (_SCHEDULER_TIME_STR, _LOOP_STOP_STR, loop_number, elapsed_time))
                # Special debug about CPU time: show real cpu time taken
                logger.debug('%s %s [ Loop number=%-5d ] [PERF] %s' % (_SCHEDULER_TIME_STR, _LOOP_STOP_STR, loop_number, start_snap.get_diff()))
                
                # If we got back in time, skip this sleep
                if elapsed_time < 0:
                    continue
                
                self.loop_time_avg.update_avg(elapsed_time)
                
                # Sleep for what is left of the second
                # NOTE(review): the value is negative when the loop took more than 1s — confirm sleep() copes with it
                self.sched_daemon.sleep(1.0 - elapsed_time)
        
        # Main Loop has ended, we no more need the watchdog
        mainloop_watchdog.quit()
        # WE must save the retention at the quit BY OURSELVES
        # because our daemon will not be able to do it for us
        self.update_retention_file(True)
        self.set_scheduler_is_ready(False)
    
    
    def set_scheduler_is_ready(self, is_ready: 'bool' = True):
        # Store the new readiness flag under its lock, then wake up every thread
        # waiting on that lock (see _wait_scheduler_readiness)
        with self._scheduler_is_ready_lock:
            self._scheduler_is_ready = is_ready
            self._scheduler_is_ready_lock.notify_all()
    
    
    def scheduler_is_ready(self) -> bool:
        # Read the readiness flag under its lock; run() sets it to True just before
        # the main loop and back to False once the loop has ended
        with self._scheduler_is_ready_lock:
            return self._scheduler_is_ready
    
    
    def wait_main_loop_has_started(self, timeout: 'float|None') -> 'bool':
        # Wait until the scheduler is ready, at most timeout seconds (None: no limit).
        # Returns True when it is ready, False on timeout.
        return self._wait_scheduler_readiness(True, timeout)
    
    
    def wait_main_loop_has_exited(self, timeout: 'float|None' = None):
        # Wait until the scheduler is no more ready, at most timeout seconds (None, the default: no limit).
        # Returns True when it is no more ready, False on timeout.
        return self._wait_scheduler_readiness(False, timeout)
    
    
    def _wait_scheduler_readiness(self, is_ready: 'bool', timeout: 'float|None') -> bool:
        start_wait_time = time.time()
        with self._scheduler_is_ready_lock:
            while self._scheduler_is_ready is not is_ready:
                if timeout is not None:
                    remaining_wait_time = timeout - (time.time() - start_wait_time)
                else:
                    remaining_wait_time = None
                if remaining_wait_time is None or remaining_wait_time > 0:
                    self._scheduler_is_ready_lock.wait(remaining_wait_time)
                    continue
                break
            return self._scheduler_is_ready == is_ready
    
    
    def _compute_and_print_stat(self):
        """Update the averages with the raw counters of this loop, log them, then reset the raw counters.

        Steps:
        * per executor: update the averages and log the raw and averaged send / receive numbers
        * update the global averages (todo, send, received, received by cause) and log a summary
        * reset all the raw counters (global and per executor)
        * recount the checks and actions by status into self.checks_n_actions_stats and log them

        NOTE(review): run() calls this method while already holding checks_n_actions_lock, and the
        method takes it again below, so the lock has to be re-entrant — confirm.
        """
        start_snap = cpu_stats_helper.get_thread_cpu_snapshot()
        # Let's print stats of our useful object, like our index
        self._print_action_indexing_stats()
        total_checks_send = 0
        total_checks_received = 0
        for executor_id, stat_by_executor in self.stat_by_executor.items():
            nb_checks_send = stat_by_executor.get('raw_nb_checks_send', 0)
            cpu_time_checks_send = stat_by_executor.get('raw_cpu_time_checks_send', 0)
            nb_checks_received = stat_by_executor.get('raw_nb_checks_received', 0)
            cpu_time_checks_received = stat_by_executor.get('raw_cpu_time_checks_received', 0)
            _type = stat_by_executor.get('type', None)
            if _type is None:  # not finish entry
                continue
            
            self._compute_executor_average_stat(executor_id, cpu_time_checks_received, cpu_time_checks_send, nb_checks_received, nb_checks_send)
            
            total_checks_send += nb_checks_send
            total_checks_received += nb_checks_received
            # Every type that is not 'Poller' is logged in the reactionners section
            section = SECTION_POLLERS if _type == 'Poller' else SECTION_REACTIONNERS
            logger.debug('%s %s [ %-25s ] Raw data from daemon : give : [%4d] for [%0.3f]s CPU time. Return : [%4d] for [%0.3f]s CPU time' % (
                CHAPTER_STATS,
                section,
                executor_id,
                nb_checks_send,
                cpu_time_checks_send,
                nb_checks_received,
                cpu_time_checks_received))
            logger.debug('%s %s [ %-25s ] Data daemon average  : give : [%4d] for [%0.3f]s CPU time. Return : [%4d] for [%0.3f]s CPU time' % (
                CHAPTER_STATS,
                section,
                executor_id,
                self.avg_nb_checks_send[executor_id].get_avg(0),
                self.avg_cpu_time_checks_send[executor_id].get_avg(0),
                self.avg_nb_checks_received[executor_id].get_avg(0),
                self.avg_cpu_time_checks_received[executor_id].get_avg(0)))
        
        # Global averages, fed with the raw counters of this loop
        self.avg_checks_todo_by_sec.update_avg(self.raw_nb_checks_to_send)
        self.avg_notification_todo_by_sec.update_avg(self.raw_nb_notification_to_send)
        self.avg_event_handler_todo_by_sec.update_avg(self.raw_nb_event_handler_to_send)
        self.avg_cpu_time_checks_to_send.update_avg(self.raw_cpu_time_checks_to_send)
        
        self.avg_total_checks_send.update_avg(total_checks_send)
        self.avg_total_checks_received.update_avg(total_checks_received)
        
        # update checks by causes
        self.avg_checks_received_schedule_by_sec.update_avg(self.raw_nb_checks_received_schedule)
        self.avg_checks_received_force_by_sec.update_avg(self.raw_nb_checks_received_force)
        self.avg_checks_received_retry_by_sec.update_avg(self.raw_nb_checks_received_retry)
        self.avg_checks_received_dependency_by_sec.update_avg(self.raw_nb_checks_received_dependency)
        self.raw_nb_checks_received_schedule = 0
        self.raw_nb_checks_received_force = 0
        self.raw_nb_checks_received_retry = 0
        self.raw_nb_checks_received_dependency = 0
        
        logger.debug('%s %s broks send:[%6d] checks : todo:[%4d]/[%4d]nb - [%0.3f]/[%0.3f]s give:[%4d]/[%4d]nb return:[%4d]/[%4d]nb' %
                     (CHAPTER_STATS,
                      SECTION_BROKERS,
                      self.nb_broks_send,
                      self.raw_nb_checks_to_send,
                      self.avg_checks_todo_by_sec.get_avg(0),
                      self.raw_cpu_time_checks_to_send,
                      self.avg_cpu_time_checks_to_send.get_avg(0),
                      total_checks_send,
                      self.avg_total_checks_send.get_avg(0),
                      total_checks_received,
                      self.avg_total_checks_received.get_avg(0)))
        
        # The raw counters were consumed by the averages and the log above: start again from zero
        self.raw_nb_checks_to_send = 0
        self.raw_nb_notification_to_send = 0
        self.raw_nb_event_handler_to_send = 0
        self.raw_cpu_time_checks_to_send = 0
        self._nb_checks_with_stat_send = 0
        
        # Same reset for the per executor raw counters, only for the entries that already have them
        for executor_id, stat_by_executor in self.stat_by_executor.items():
            if 'raw_nb_checks_send' in stat_by_executor:
                stat_by_executor['raw_nb_checks_send'] = 0
                stat_by_executor['raw_cpu_time_checks_send'] = 0
            if 'raw_nb_checks_received' in stat_by_executor:
                stat_by_executor['raw_nb_checks_received'] = 0
                stat_by_executor['raw_cpu_time_checks_received'] = 0
        
        with self.checks_n_actions_lock:
            
            self._reset_stats()
            
            # Recount every check and action by status; master notifications are not counted
            for check_or_action in list(self.actions.values()) + list(self.checks.values()):
                if check_or_action.is_a == 'notification' and check_or_action.is_master_notification():
                    continue
                _stat = self.checks_n_actions_stats[check_or_action.is_a]
                _stat['nb_total'] += 1
                # Checks are grouped by poller tag, all the other actions by reactionner tag
                _tag = check_or_action.poller_tag if check_or_action.is_a == 'check' else check_or_action.reactionner_tag
                
                if check_or_action.status == 'scheduled':
                    _stat['scheduled'] += 1
                    # 'todo': scheduled and its time to go is reached
                    if time.time() >= check_or_action.t_to_go:
                        _stat['todo'] += 1
                    if check_or_action.is_late():
                        _stat['nb_late'] += 1
                        if _tag in _stat['late_by_tags']:
                            _stat['late_by_tags'][_tag] += 1
                        else:
                            _stat['late_by_tags'][_tag] = 1
                
                elif check_or_action.status == 'inpoller':
                    _stat['in_executor'] += 1
                elif check_or_action.status == 'zombie':
                    _stat['zombies'] += 1
            
            for type_as_str, stat_type, in (('Checks', 'check'), ('Notifications', 'notification'), ('Event Handlers', 'eventhandler')):
                _stat = self.checks_n_actions_stats[stat_type]
                if stat_type == 'check':
                    section = SECTION_POLLERS
                    executor_daemon = 'poller'
                else:
                    section = SECTION_REACTIONNERS
                    executor_daemon = 'reactionner'
                
                logger.debug('%s %s %-15s [ total: [%6d], scheduled: [%6d], nb_todo: [%6d], in %-11s: [%6d], late: [%6d], zombies: [%6d]]' % (
                    CHAPTER_STATS, section, type_as_str, _stat['nb_total'], _stat['scheduled'], _stat['todo'], executor_daemon, _stat['in_executor'], _stat['nb_late'], _stat['zombies']))
        
        # NOTE: section still holds the value of the last turn of the loop above ('eventhandler'),
        #       so this line is always logged in the reactionners section
        logger.debug('%s %s compute_and_print_stat::  %s' % (CHAPTER_STATS, section, start_snap.get_diff()))
    
    
    def _reset_stats(self):
        for counter in self.checks_n_actions_stats.values():
            counter['nb_total'] = 0
            counter['scheduled'] = 0
            counter['todo'] = 0
            counter['in_executor'] = 0
            counter['nb_late'] = 0
            counter['zombies'] = 0
            counter['late_by_tags'] = {}
    
    
    def _reset_loop_run_time(self):
        # type: () -> None
        """Set the run time of every group of LOOP_TIME_GROUP.ALL_GROUP to 0.0 in self.loop_time."""
        self.loop_time.update(dict.fromkeys(LOOP_TIME_GROUP.ALL_GROUP, 0.0))
    
    
    def update_loop_run_time(self, group, now, previous_time):
        # type: (str, float, float)-> None
        """Add the duration (now - previous_time) to the run time of group.

        A group that is not in self.loop_time is only reported with a debug log.
        """
        if group not in self.loop_time:
            logger.debug('Want to update loop time with a group that does not exist : %s. Skip it, it will not be logged ' % group)
            return
        self.loop_time[group] += now - previous_time
    
    
    def _compute_executor_average_stat(self, executor_id, cpu_time_checks_received, cpu_time_checks_send, nb_checks_received, nb_checks_send):
        avg_nb_checks_send = self.avg_nb_checks_send.get(executor_id, None)
        avg_cpu_time_checks_send = self.avg_cpu_time_checks_send.get(executor_id, None)
        avg_nb_checks_received = self.avg_nb_checks_received.get(executor_id, None)
        avg_cpu_time_checks_received = self.avg_cpu_time_checks_received.get(executor_id, None)
        
        if avg_nb_checks_send is None:
            avg_nb_checks_send = AvgInRange(60)
        avg_nb_checks_send.update_avg(nb_checks_send)
        self.avg_nb_checks_send[executor_id] = avg_nb_checks_send
        
        if avg_cpu_time_checks_send is None:
            avg_cpu_time_checks_send = AvgInRange(60)
        avg_cpu_time_checks_send.update_avg(cpu_time_checks_send)
        self.avg_cpu_time_checks_send[executor_id] = avg_cpu_time_checks_send
        
        if avg_nb_checks_received is None:
            avg_nb_checks_received = AvgInRange(60)
        avg_nb_checks_received.update_avg(nb_checks_received)
        self.avg_nb_checks_received[executor_id] = avg_nb_checks_received
        
        if avg_cpu_time_checks_received is None:
            avg_cpu_time_checks_received = AvgInRange(60)
        avg_cpu_time_checks_received.update_avg(cpu_time_checks_received)
        self.avg_cpu_time_checks_received[executor_id] = avg_cpu_time_checks_received
        
        executor_stat = self.stat_by_executor.get(executor_id, {})
        executor_stat['avg_nb_checks_send'] = avg_nb_checks_send.get_avg(0)
        executor_stat['avg_cpu_time_checks_send'] = avg_cpu_time_checks_send.get_avg(0)
        executor_stat['avg_nb_checks_received'] = avg_nb_checks_received.get_avg(0)
        executor_stat['avg_cpu_time_checks_received'] = avg_cpu_time_checks_received.get_avg(0)
        self.stat_by_executor[executor_id] = executor_stat
    
    
    def _load_exec_stat(self, _logger):
        """Load the execution stats from their JSON file into self._exec_stat.

        self._exec_stat is always reset to its void default first, and is only replaced once the
        file has been fully read, validated and cleaned: whatever fails, it never keeps a partially
        loaded or malformed content.

        :param _logger: logger used to report what happened during the load
        """
        # Always reset the exec stat as maybe we are from a corrupted exec_stats,
        # and so we must be sure we will clean it, even if it means be void
        self._exec_stat = {_DEFAULT_EXECUTOR: {}}
        _exec_stat_path = _PATH_EXEC_STAT_PATTERN % self.sched_daemon.daemon_id
        if not os.path.exists(_exec_stat_path):
            # maybe it's just we are migrating from a 2.4.X version
            # but if not exists, there is so nothing to load
            if not os.path.exists(_OLD_PATH_EXEC_STAT):
                _logger.debug("The load exec stat file %s is missing" % _exec_stat_path)
                return
            # Try to move the old stats, so we don't start from zero,
            # but it can fail, if so, we will have nothing to load
            try:
                shutil.move(_OLD_PATH_EXEC_STAT, _exec_stat_path)
                _logger.info('Migrate the old execution time stats file %s to the new path %s' % (_OLD_PATH_EXEC_STAT, _exec_stat_path))
            except Exception as exp:
                _logger.debug('Cannot move old execution time stats file %s to the new path %s : %s' % (_OLD_PATH_EXEC_STAT, _exec_stat_path, exp))
                return
        
        try:
            with open(_exec_stat_path, 'r') as f:
                loaded_stat = json.loads(f.read())
            # A valid JSON document is not always an object: refuse anything else, it is logged by the except below
            if not isinstance(loaded_stat, dict):
                raise ValueError('unexpected content of type %s, a JSON object was expected' % type(loaded_stat).__name__)
            
            # Clean unknown pollers & reactionners
            known_executors = self.pollers_name | self.reactionners_name
            loaded_stat = {sat_name: stat for (sat_name, stat) in loaded_stat.items() if sat_name in known_executors}
            if _DEFAULT_EXECUTOR not in loaded_stat:
                loaded_stat[_DEFAULT_EXECUTOR] = {}
            
            # Everything went well: only now the loaded stats replace the void default
            self._exec_stat = loaded_stat
            _logger.info('Successfully loaded the execution stats from the file %s' % _exec_stat_path)
        except Exception as exp:
            _logger.warning("[scheduler][%s] cannot load the exec stat from file [%s] : [%s]" % (self.instance_id, _exec_stat_path, exp))
    
    
    def _save_exec_stat(self, exec_stat):
        """Write exec_stat as JSON into the execution stats file of this daemon.

        The data is written and synced into a '.tmp' file that is then moved over the final file.
        If exec_stat cannot be serialized (TypeError), errors are logged and the stats are reloaded
        from the file. Any other failure is logged as a warning.
        """
        final_path = _PATH_EXEC_STAT_PATTERN % self.sched_daemon.daemon_id
        try:
            serialized_stat = json.dumps(exec_stat)
            # Never write directly into the final file: being killed in the middle must not corrupt it
            temporary_path = '%s.tmp' % final_path
            with open(temporary_path, 'w') as stat_file:
                stat_file.write(serialized_stat)
                stat_file.flush()
                os.fsync(stat_file.fileno())
            # The complete temporary file takes the place of the final one
            shutil.move(temporary_path, final_path)
        except TypeError:
            # The exec stat holds something that cannot be dumped as json: it has a critical error in it
            separator = '*' * 80
            logger.error('%s\nThe scheduler stats is corrupted and must be reset. Please open a support ticket with this log\n%s\n' % (separator, separator))
            logger.error('Corrupted stats: %s' % exec_stat)
            self._load_exec_stat(logger)
        except Exception as exp:
            logger.warning("[scheduler][%s] cannot save the exec stat: [%s]" % (self.instance_id, exp))
    
    
    def _load_scheduler_stat(self):
        """Load the scheduler stats from their JSON file into self.scheduler_stat (nothing is done on Windows).

        When the file cannot be read, or does not hold a JSON object, a warning is logged and the
        current self.scheduler_stat is kept. The stats are then fixed up:
        * a save_retention_time of -1 sets save_retention_error to a timeout message
        * a last_retention_save that is not an int is reset to 0
        """
        if os.name == 'nt':
            return
        
        _scheduler_stat_path = _PATH_SCHEDULER_STAT_PATTERN % self.sched_daemon.daemon_id
        try:
            with open(_scheduler_stat_path, 'r') as f:
                loaded_stat = json.loads(f.read())
            # A valid JSON document is not always an object, and the code below calls .get() on the stats
            # outside of this try: refuse anything else here, so it is only logged as a warning
            if not isinstance(loaded_stat, dict):
                raise ValueError('unexpected content of type %s, a JSON object was expected' % type(loaded_stat).__name__)
            self.scheduler_stat = loaded_stat
            logger.info('Successfully load the scheduler stats from the file %s' % _scheduler_stat_path)
        except Exception as exp:
            logger.warning("[scheduler][%s] cannot load the scheduler stat from file [%s] : [%s]" % (self.instance_id, _scheduler_stat_path, exp))
        if self.scheduler_stat.get('save_retention_time', 0) == -1:
            self.scheduler_stat['save_retention_error'] = 'Timeout error ( 2 minutes )'
        # SEF-8099 V02.08.01-Patched-06.02, last_retention_save was string before, change to int with default value 0 if needed convert
        if not isinstance(self.scheduler_stat.get('last_retention_save', 0), int):
            logger.info('The stats file loading is migrating the last_retention_save entry (was %s)' % self.scheduler_stat.get('last_retention_save', 0))
            self.scheduler_stat['last_retention_save'] = 0
    
    
    def _update_scheduler_stat(self, stat_name, value):
        # Set one entry of self.scheduler_stat, then persist the whole structure as json in the stats file
        # of this daemon.
        # * stat_name => key to set in self.scheduler_stat
        # * value => value stored under this key
        # Any error (set, json dump, file write, move) is only logged as a warning.
        stat_file_path = _PATH_SCHEDULER_STAT_PATTERN % self.sched_daemon.daemon_id
        try:
            self.scheduler_stat[stat_name] = value
            serialized_stat = json.dumps(self.scheduler_stat)
            # Write into a tmp file first, so the final file is not corrupted even if we are killed while writing
            tmp_file_path = '%s.tmp' % stat_file_path
            with open(tmp_file_path, 'w') as tmp_fd:
                tmp_fd.write(serialized_stat)
                # Be sure the data is really on disk before the tmp file takes the place of the final one
                tmp_fd.flush()
                os.fsync(tmp_fd.fileno())
            # Then put the tmp file in place of the final one
            shutil.move(tmp_file_path, stat_file_path)
        except Exception as exp:
            logger.warning("[scheduler][%s] cannot save the exec stat: [%s]" % (self.instance_id, exp))
    
    
    @staticmethod
    def __export_data_create_anonymous_hash(name):
        # Make as bytes so hashlib is happy
        if isinstance(name, str):
            name = name.encode('utf8', 'ignore')
        return 'anonymous-hash-%s' % (hashlib.sha1(name).hexdigest())[:10]
    
    
    def _dump_execution_stats(self, host, service, realm_name, dump_with_private_information, entries):
        # type: (Host, Optional[Service], str, bool, List) -> None
        # Append to entries the export row (entry version 1) of one check: the check of the service if one
        # is given, the check of the host itself otherwise.
        # * realm_name => realm name, always exported as an anonymous hash, and in clear only with private information
        # * dump_with_private_information => if False, the host/check/command/realm names are blanked in the row
        # * entries => list that receives the new row (modified in place)
        
        # The command and scheduling properties are read on the service if there is one, on the host otherwise.
        # A host row has no check uuid and no check name.
        if service:
            checked_item = service
            check_uuid = service.uuid
            check_name = service.get_name()
        else:
            checked_item = host
            check_uuid = ''
            check_name = ''
        command_hash = checked_item.last_command_hash
        full_command_name = checked_item.last_command_name
        check_interval = checked_item.check_interval
        retry_interval = checked_item.retry_interval
        scheduled_epoch = checked_item.next_chk or 0  # no next check => 0
        last_normal_chk = checked_item.last_normal_chk
        
        # Only the part after the last '-//-' separator of the command name is exported
        command_name = full_command_name.split('-//-')[-1]
        
        # The anonymous hashes are computed from the real names, BEFORE the names are blanked
        command_name_hash = self.__export_data_create_anonymous_hash(command_name)
        realm_name_hash = self.__export_data_create_anonymous_hash(realm_name)
        host_uuid = host.get_instance_uuid()
        host_name = host.get_full_name()
        
        realm = realm_name
        if not dump_with_private_information:
            # No private information allowed: only uuids and hashes will identify the row
            host_name = ''
            check_name = ''
            command_name = ''
            realm = ''
        
        # The cpu time is only known if the last command has an entry in the default executor stats
        cpu_time_s = ''
        if command_hash:
            executor_stats = self._exec_stat[_DEFAULT_EXECUTOR]
            if command_hash in executor_stats:
                cpu_time_s = '%.3f' % executor_stats[command_hash]['action_cpu_time']
        
        entry_version_1 = [
            host_uuid,
            host_name,
            check_uuid,
            check_name,
            command_name,
            command_name_hash,
            realm,
            realm_name_hash,
            check_interval,
            retry_interval,
            cpu_time_s,
            scheduled_epoch,
            last_normal_chk,
        ]
        logger.debug('DUMP: %s' % entry_version_1)
        entries.append(entry_version_1)
    
    
    @staticmethod
    def get_export_data_struct():
        # Give a new, empty export structure: a format version and an empty list of entries.
        # IMPORTANT: as we will give tuple/list to reduce exchange size, we MUST warn the other side about our format
        return {'format_version': 1, 'entries': []}
    
    
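    # The daemon is asking us to dump our check scheduling (one entry per host and one entry per service).
    # This method only builds and returns the data: writing it into a csv file in /tmp is not done here (presumably by the caller)
    # NOTE: called from a http thread query
    # * dump_with_private_information => if True, give names too (call was authenticated)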
    def export_data(self, realm_name, dump_with_private_information):
        # type: (str, bool) -> Dict[str,Any]
        # Build the export structure and fill its 'entries' list: first one entry per host, then one entry per service.
        # * realm_name => forwarded as is to _dump_execution_stats for every entry
        # * dump_with_private_information => forwarded as is to _dump_execution_stats for every entry
        # Returns the structure given by get_export_data_struct(), with its entries filled.
        export_struct = self.get_export_data_struct()
        collected_entries = export_struct['entries']
        # The satellite lock is held during the whole walk: protect against loading a new configuration
        with self.sched_daemon.satellite_lock:
            # The hosts have no service to give
            for one_host in self.hosts:
                self._dump_execution_stats(one_host, None, realm_name, dump_with_private_information, collected_entries)
            # The services are dumped with the host they are attached to
            for one_service in self.services:
                self._dump_execution_stats(one_service.host, one_service, realm_name, dump_with_private_information, collected_entries)
        return export_struct
