#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2009-2022:
#    Gabes Jean, naparuba@gmail.com
#    Gerhard Lausser, Gerhard.Lausser@consol.de
#    Gregory Starck, g.starck@gmail.com
#    Hartmut Goebel, h.goebel@goebel-consult.de
#
# This file is part of Shinken.
#
# Shinken is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Shinken is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with Shinken.  If not, see <http://www.gnu.org/licenses/>.


import base64
import pickle
import copy
import json
import logging
import os
import socket
import sys
import threading
import time
import traceback
import urllib.request, urllib.error, urllib.parse
import zlib
from collections import defaultdict, deque
from hashlib import md5
from multiprocessing import active_children

from shinken.brok import Brok
from shinken.daemon import IStatsInterface
from shinken.external_command import ExternalCommand
from shinken.http_client import HTTPException, HTTPExceptions, HTTPClient
from shinken.ipc.share_item import ShareItem
from shinken.load import AvgInRange
from shinken.log import logger, get_chapter_string, LoggerFactory, get_section_string
from shinken.misc.type_hint import TYPE_CHECKING
from shinken.network_exchange_sequencer import BROKS_EXCHANGE_PROTOCOLS
from shinken.objects.graphitebackend import GraphiteBackend
from shinken.property import IntegerProp, PathProp
from shinken.runtime_stats.cpu_stats import cpu_stats_helper
from shinken.runtime_stats.threads_dumper import WatchDogThreadDumper
from shinken.safepickle import SafeUnpickler
from shinken.toolbox.url_helper import BaseUrl
from shinken.util import to_bool
from shinken.withinventorysatellite import IArbiterToInventorySatellite, WithInventorySatellite
from shinkensolutions.lib_checks.graphite import GraphiteVersionChecker
from shinkensolutions.lib_checks.graphite_metrics_counter import GRAPHITE_STATS_KEY

try:
    from statsd import StatsClient  # noqa => no need in requirements
    
    statsd = StatsClient()
except ImportError:
    StatsClient = None
    statsd = None

if TYPE_CHECKING:
    from shinken.brok import Brok
    from shinken.log import PartLogger
    from shinken.misc.type_hint import Any, List, Dict, Optional

# Default values of the brok pusher sub-process settings, they are the initial values of the
# manage_brok_sub_process_broks_pusher_* attributes set in Broker.__init__
DEFAULT_INJECTOR_PUSHER_WAIT_TIME = 240  # initial pusher max execution timeout (presumably seconds - TODO confirm)
DEFAULT_MIN_PUSHER_WAIT_TIME = 5  # initial pusher min execution timeout (presumably seconds - TODO confirm)
DEFAULT_PUSHER_SECURITY_RATIO_TIME = 5  # initial pusher security ratio
DEFAULT_PUSHER_MAX_RETRY = 3  # initial pusher max retry
DEFAULT_PUSHER_QUEUE_BATCH_SIZE = 100000  # we do not want too many broks on the socket in one pass

# Strings for log chapters
_MANAGE_BROKS_STR = 'MANAGE BROKS'
_GET_BROKS_STR = 'GET BROKS'
_RECEIVE_BROKS_STR = 'RECEIVE BROKS'
_BROKER_TIME_STR = 'BROKER TIME'
_MODULES_STR = 'MODULES'

# Pre-formatted chapter string for the 'MANAGE BROKS' chapter
CHAPTER_MANAGE_BROKS = get_chapter_string(_MANAGE_BROKS_STR)

# Pre-formatted section strings and names of the logger sub parts
_EXTERNAL_MODULE_STR = get_section_string('EXTERNAL MODULE')
_INTERNAL_MODULE_STR = get_section_string('INTERNAL MODULE')
_NEED_DATA_STR = 'NEED DATA'
_LOOP_START_STR = '=== Loop start ==='
_LOOP_STOP_STR = '=== Loop stop  ==='
_BUS_COMMANDS_STR = 'BUS COMMANDS'

# Source name given to the broks queues when the broks were pushed by the arbiter
_ARBITER_AS_BROKS_SOURCE = 'arbiter'


class IStatsBroker(IStatsInterface):
    """
    Stats interface of the broker: graphite connectivity checks and raw running stats of the daemon
    """
    
    doc = 'Return a connection status from my network location to all graphite server I should write to (graphite_perfdata type modules).'
    
    
    def check_graphite_write_status(self):
        # Simple pass-through to the broker daemon
        broker = self.app
        return broker.check_graphite_write_status()
    
    
    check_graphite_write_status.doc = doc
    check_graphite_write_status.need_lock = False
    
    doc = 'Return a connection status from my network location to all graphite server I should read from (webui type modules).'
    
    
    def check_graphite_read_status(self):
        # Simple pass-through to the broker daemon
        broker = self.app
        return broker.check_graphite_read_status()
    
    
    check_graphite_read_status.doc = doc
    check_graphite_read_status.need_lock = False
    
    
    def get_raw_stats(self, param='', module=''):
        # type: (unicode, unicode) -> Dict[unicode, Any]
        # Only here to attach the doc / need_lock attributes, the work is done by the parent class
        parent = super(IStatsBroker, self)
        return parent.get_raw_stats(param=param, module=module)
    
    
    get_raw_stats.doc = 'get stats of the daemon'
    get_raw_stats.need_lock = False
    
    
    def _daemon_get_raw_stats(self, param='', module_wanted=None):
        # type: (unicode, List) -> Dict[unicode, Any]
        # Build the broker specific part of the raw stats: queue sizes, external modules queue sizes,
        # modules stats and master/spare information
        wanted_modules = [] if module_wanted is None else module_wanted
        broker = self.app
        logger.info('Query broker running stats')
        stats = {
            'modules'              : [],
            'len_external_commands': len(broker.external_commands)
        }
        
        # The brok queues are filled by other threads: read their size with the matching lock
        with broker.broks_lock:
            stats['len_broks'] = len(broker.broks)
        stats['len_internal_broks'] = len(broker.broks_internal_raised)
        with broker.arbiter_broks_lock:
            stats['len_arbiter_broks'] = len(broker.arbiter_broks)
        
        # Only external modules are listed here, with a queue size of -1 when it cannot be read
        external_instances = [instance for instance in broker.modules_manager.get_all_instances() if instance.is_external]
        for instance in external_instances:
            try:
                module_entry = {'module_name': instance.get_name(), 'queue_size': instance.to_q.qsize()}
            except Exception:  # noqa => need to catch all about queues
                module_entry = {'module_name': instance.get_name(), 'queue_size': -1}
            stats['modules'].append(module_entry)
        
        stats['module_stats'] = self._get_module_stats(getattr(broker, 'modules_manager', None), param, wanted_modules)
        stats['http_errors_count'] = broker.http_errors_count
        stats['have_conf'] = broker.cur_conf is not None
        stats['activated'] = broker.activated
        stats['spare'] = broker.spare
        stats['master_daemon'] = broker.master_daemon_name
        stats['spare_daemon'] = broker.spare_daemon_name
        stats['spare_must_have_the_same_list_of_module_type'] = broker.spare_must_have_the_same_list_of_module_type
        stats['last_too_long_injector_time'] = broker.last_too_long_injector_time
        stats['known_realms'] = getattr(broker, 'known_realms', None)
        return stats


# Our main APP class
class Broker(WithInventorySatellite):
    # Daemon properties: start from a copy of the parent class ones and add the broker specific defaults
    properties = WithInventorySatellite.properties.copy()
    properties.update({
        'pidfile'                  : PathProp(default='brokerd.pid'),
        'port'                     : IntegerProp(default='7772'),
        'local_log'                : PathProp(default='brokerd.log'),
        'max_file_descriptor_limit': IntegerProp(default='0'),  # the broker need to set to maximum value available on the system
    })
    
    # Given as first argument to the parent class constructor in __init__
    daemon_type = 'broker'
    
    
    def __init__(self, config_file, is_daemon, do_replace, debug, debug_file, profile='', daemon_id=0):
        """
        Initialise the broker daemon state: satellites registries, broks queues and their locks,
        HTTP interfaces, brok pusher settings, spare information and loggers.
        
        All parameters but `profile` are forwarded to the parent class constructor.
        NOTE: `profile` is accepted but is not used in this constructor.
        """
        super(Broker, self).__init__(self.__class__.daemon_type, config_file, is_daemon, do_replace, debug, debug_file, daemon_id)
        
        # Our arbiters
        self.arbiters = {}
        
        # Our pollers, reactionners and receivers
        self.pollers = {}
        self.reactionners = {}
        self.receivers = {}
        
        # Modules are loaded only one time
        self.have_modules = False
        self.modules = None
        
        # Can have a queue of external_commands given by modules
        # will be processed by arbiter
        self.external_commands = []
        
        # All broks to manage
        self.broks = deque()  # broks to manage, note: WILL overwrite base satellite value
        self.external_module_broks = deque()  # type:deque[Brok]  # broks managed by external modules
        self.broks_lock = threading.RLock()  # as satellites will be accessed by threads, protect our broks list by a lock
        # broks raised this turn and that needs to be put in self.broks
        self.broks_internal_raised = []
        self.broks_packet_size = 0
        # broks raised by the arbiters, we need a lock so the push can be in parallel
        # to our current activities and won't lock the arbiter
        self.arbiter_broks = []
        self.arbiter_broks_lock = threading.RLock()
        
        self.timeout = 1.0
        
        # HTTP interfaces exposed by this daemon: broker stats and the arbiter -> inventory one
        self._add_http_interface(IStatsBroker(self))
        self._add_http_interface(IArbiterToInventorySatellite(self))
        
        # Stats: time spent in each internal module (filled by manage_brok) and number of broks managed
        self.local_module_stats = {}
        self.stats_time_sla = None
        self.stats_nb_sla = None
        self.broks_done = 0
        self.last_too_long_injector_time = 0.0
        
        # We will have a thread by distant satellite, so we must protect our access
        self.satellite_lock = threading.RLock()
        
        self._tmp_bucket = {}  # used by injector workers to inherit jobs during the fork()
        
        # Memory protection: we will have parameters to choose if we do memory protection or not
        self.manage_brok_enable_sub_processes_memory_usage_protection = False
        self.manage_brok_sub_process_memory_usage_system_reserved_memory = 0
        self.manage_brok_sub_processes_memory_usage_protection_max_retry_time = 5
        
        # Broks pusher sub-process settings, starting with the module level DEFAULT_* values
        self.manage_brok_sub_process_broks_pusher_max_execution_timeout = DEFAULT_INJECTOR_PUSHER_WAIT_TIME
        self.manage_brok_sub_process_broks_pusher_security_ratio = DEFAULT_PUSHER_SECURITY_RATIO_TIME
        self.manage_brok_sub_process_broks_pusher_min_execution_timeout = DEFAULT_MIN_PUSHER_WAIT_TIME
        self.manage_brok_sub_process_broks_pusher_max_retry = DEFAULT_PUSHER_MAX_RETRY
        self.manage_brok_sub_process_broks_pusher_queue_batch_size = DEFAULT_PUSHER_QUEUE_BATCH_SIZE
        self.avg_brok_send_speed = AvgInRange(300)  # 0 broks/s by default, so we will take manage_brok_sub_process_broks_pusher_max_wait_time for wait time
        
        # Spare information
        self.spare_daemon_name = ''
        self.spare_must_have_the_same_list_of_module_type = True  # by default spare must have the same modules
        self.master_daemon_name = ''
        # Watch dog of the main loop (presumably dumps the threads when the loop is stuck - TODO confirm)
        self.mainloop_watchdog = WatchDogThreadDumper('Main loop', wait_time=60 * 30, dump_interval=60 * 5, fatal_dead_lock_delay=60 * 30, multi_usage=True)
        
        self.computed_address = ''  # This is the address used by user to access the broker. If the broker address is localhost, this will be the first address found in the network interfaces
        self.daemon_configuration_for_modules = None  # type: Optional[ShareItem]
        
        self.all_monitoring_configuration_part = []
        
        # Loggers
        self.broker_time_logger = LoggerFactory.get_logger(_BROKER_TIME_STR)
        self.loop_start_logger = self.broker_time_logger.get_sub_part(_LOOP_START_STR)
        self.loop_stop_logger = self.broker_time_logger.get_sub_part(_LOOP_STOP_STR)
        self.get_brok_logger = LoggerFactory.get_logger(_GET_BROKS_STR)
        self.protocol_logger = self.get_brok_logger.get_sub_part('PROTOCOL')
        self.need_data_logger = self.get_brok_logger.get_sub_part(_NEED_DATA_STR)
        self.modules_logger = LoggerFactory.get_logger(_MODULES_STR)
    
    
    # BEWARE: you should have the lock to call this one
    def _find_scheduler_by_shard_id(self, shard_id):
        return next((scheduler for scheduler in self.schedulers.values() if scheduler['instance_id'] == shard_id), None)
    
    
    # Schedulers have some queues. We can simplify the call by adding
    # elements into the proper queue just by looking at their type
    # Brok -> self.broks
    # External commands -> self.external_commands
    def add(self, elt):
        """
        Dispatch an element according to the `my_type` of its class:
        * 'brok'            : its instance_id is forced to 0 and it is queued in self.broks_internal_raised
        * 'externalcommand' : queued in self.external_commands
        * 'message'         : managed here if its type is 'NeedData' (reset the scheduler connection so its
                              initial broks will be asked again) or 'ICrash' (log the module last words and
                              declare it as crashed to the modules manager)
        Elements of any other type are ignored.
        """
        cls_type = elt.__class__.my_type
        if cls_type == 'brok':
            # Like this brok was set by scheduler 0
            elt.instance_id = 0
            self.broks_internal_raised.append(elt)
            return
        elif cls_type == 'externalcommand':
            # Log the command we are enqueuing (and not the ExternalCommand class namespace).
            # The getattr is a protection for objects without a __dict__.
            logger.debug('[broker][add_broks] Enqueuing an external command [ %s ]' % str(getattr(elt, '__dict__', elt)))
            self.external_commands.append(elt)
        elif cls_type == 'message':
            # Maybe we got a Message from the modules, it's way to ask something like from now a full data from a scheduler for example.
            logger.debug('[broker][add_broks] message [ %s ]' % str(elt.__dict__))
            if elt.get_type() == 'NeedData':
                data = elt.get_data()
                source = elt.source
                logger_source_need_data = self.need_data_logger.get_sub_part(source, register=False)
                if 'full_instance_id' not in data:
                    logger_source_need_data.warning('The asking for initial broks is malformed. (full_instance_id is missing).')
                    return
                
                shard_id = data['full_instance_id']
                with self.satellite_lock:
                    scheduler = self._find_scheduler_by_shard_id(shard_id)
                    # Maybe the arbiter did JUST remove the scheduler during the fact that the module get it, and now
                    # it's quite rare, but not an error because as soon as the scheduler will be given back by the arbiter
                    # we will have a full broks generation, so we don't have to "force" it
                    if scheduler is None:
                        module_logger_source_need_data = self.modules_logger.get_sub_part(_NEED_DATA_STR).get_sub_part(source, register=False)
                        module_logger_source_need_data.warning(
                            'The module is asking elements information from scheduler with shard (%s) but we cannot find it in our schedulers. Data will be available for the module as soon as the arbiter will give us this scheduler.' % shard_id)
                        module_logger_source_need_data.warning('Current schedulers are: %s' % (', '.join(['%s(shard=%s)' % (known['name'], known['instance_id']) for known in list(self.schedulers.values())])))
                        return
                # Reset the scheduler, so it will ask for a full initial brok generation
                # NOTE(review): this reset is done after the satellite_lock was released - confirm it is wanted
                scheduler['con'] = None
                scheduler['daemon_incarnation'] = {}
                logger.info('Requesting scheduler [ %s ] (managing shard [ %s ]) initial broks.' % (scheduler['name'], shard_id))
            
            elif elt.get_type() == 'ICrash':
                # Maybe a module tells me that it's dead, I must log it's last words... and directly set the module to be restarted
                data = elt.get_data()
                module_name = data['name']
                instance = self.modules_manager.get_module_by_name(module_name)
                if instance is None:
                    logger.error('Received an error from a unknown module: %s (%s)' % (module_name, data['trace']))
                    return
                module_crash_logger = LoggerFactory.get_logger('MODULES-MANAGER').get_sub_part('MODULE-INSTANCE-CRASH').get_sub_part(module_name).get_sub_part('module_type=%s' % instance.properties.get('type', ''))
                module_crash_logger.error('The module %s just stopped. Last ERROR received:' % module_name)
                if data['trace']:
                    for line in data['trace'].splitlines():
                        module_crash_logger.error('     %s' % line)
                # We can kill it
                self.modules_manager.did_crash(instance, reason=str(data['exception']), do_log=False)  # note: no more log here, already done ^^
    
    
    # Check that we do not try to connect too often to this element
    def is_connection_try_too_close(self, elt):
        """Return True when elt['last_connection'] is less than 5 seconds old, False otherwise."""
        elapsed_since_last_try = time.time() - elt['last_connection']
        return elapsed_since_last_try < 5
    
    
    ##
    # Iterate over all graphite servers I should write to and return connection status.
    # They are defined in my graphite_perfdata type modules.
    def check_graphite_write_status(self):
        """
        Build the connection status to every graphite server declared in the graphite_perfdata modules
        of the current configuration.
        
        Returned dict: 'status' is 'NO_CONF' (no configuration yet), 'NO_MODULE' (no graphite_perfdata
        module) or 'OK'; 'data' has one entry by module with the result of the connection test.
        """
        if not self.cur_conf:
            # No configuration received: give a single void entry
            void_entry = {
                'module_name'  : None,
                'host'         : None,
                'port'         : None,
                'can_post_data': False
            }
            return {'status': 'NO_CONF', 'data': [void_entry]}
        
        graphite_mods = [module for module in self.cur_conf['global']['modules'] if module.module_type == 'graphite_perfdata']
        reply = {'status': 'OK' if graphite_mods else 'NO_MODULE', 'data': []}
        for graphite_mod in graphite_mods:
            host, port = graphite_mod.host, graphite_mod.port
            is_alive = Broker.graphite_service_is_alive(host, port)
            reply['data'].append({
                'module_name'  : graphite_mod.module_name,
                'host'         : host,
                'port'         : port,
                'can_post_data': is_alive})
        return reply
    
    
    ##
    # Iterate over all graphite servers I should read from and return connection status.
    # They are defined in my webui type modules.
    def check_graphite_read_status(self):
        """
        Build the status of every graphite backend declared in the webui modules of the current configuration.
        
        One status dict is returned by backend (a single void one if there is no configuration). The
        network checks are run in threads that fill the status dicts in place; all threads are joined
        before the list is returned.
        """
        if not self.cur_conf:
            return [{
                'module_name': None,
                'host'       : None,
                'ip'         : None,
                'resolved_ip': None,
                'port'       : None,
                'use_ssl'    : None,
                'realm'      : None,
                'nb_metrics' : 0,
                'reachable'  : False,
                'cfg_url'    : '',
                'url_errors' : [],
                'version'    : None
            }]
        
        reply = []
        check_threads = []
        for webui_module in self.cur_conf['global']['modules']:
            if webui_module.module_type != 'webui':
                continue
            
            backends_setting = getattr(webui_module, 'graphite_backends', '')
            backend_definitions = [part.strip() for part in backends_setting.split(',') if part.strip()]
            webui_use_ssl = to_bool(webui_module.use_ssl)
            # Shared by all the statuses of this webui, filled by the webui thread
            graphs_errors = {}
            webui_thread_name = 'webui-checks-errors-thread-%s' % webui_module.module_name
            check_threads.append(threading.Thread(None, target=Broker.webui_graphs_errors, name=webui_thread_name, args=('127.0.0.1', webui_module.port, webui_use_ssl, graphs_errors)))
            
            for backend_definition in backend_definitions:
                backend = GraphiteBackend(backend_definition, strict=False)
                if backend.has_errors():
                    # Bad definition: only report the errors, there is nothing to query
                    reply.append({
                        'module_name': webui_module.module_name,
                        'host'       : None,
                        'ip'         : None,
                        'resolved_ip': None,
                        'port'       : None,
                        'use_ssl'    : None,
                        'realm'      : backend.realm,
                        'errors'     : None,
                        'cfg_url'    : backend.cfg_url,
                        'url_errors' : backend.get_errors(),
                        'version'    : None
                    })
                else:
                    # This is ran locally on broker, so this is the correct IP
                    base_url = backend.base_url
                    broker_addr = BaseUrl.get_local_ip()
                    resolved_address = base_url.create_from(host=broker_addr) if base_url.is_localhost() else base_url
                    
                    backend_status = {
                        'module_name': webui_module.module_name,
                        'host'       : base_url.get_host(),
                        'ip'         : base_url.get_ip(),
                        'resolved_ip': resolved_address.get_ip(),
                        'port'       : base_url.get_port(),
                        'use_ssl'    : base_url.get_use_ssl(),
                        'realm'      : backend.realm,
                        'errors'     : graphs_errors,
                        'cfg_url'    : backend.cfg_url,
                        'url_errors' : [],
                        'version'    : None
                    }
                    reply.append(backend_status)
                    backend_thread_name = 'graphite-backend-thread-%s' % base_url.get_host()
                    check_threads.append(threading.Thread(None, target=Broker.graphite_ui_data, name=backend_thread_name, args=(base_url, backend_status)))
        
        # Start all the checks, then wait for all of them
        for check_thread in check_threads:
            check_thread.daemon = True
            check_thread.start()
        
        for check_thread in check_threads:
            check_thread.join()
        
        return reply
    
    
    @staticmethod
    # Always returns None: the result (version, metrics counters) is written into the given status dict.
    def graphite_ui_data(base_url, status, retry=4):
        # type: (BaseUrl, Dict, int) -> None
        """
        Fill `status` with the graphite version, then with the metrics counters of the graphite server.
        
        The counters are only asked when GraphiteVersionChecker says that the version is valid. They are
        read from the '/metrics/get-metrics-count' path, with at most `retry` tries (at least one).
        When all tries failed, 'nb_metrics' and 'nb_hosts_clusters' are set to 0.
        """
        Broker.get_graphite_version(base_url, status, retry)
        # We can only count nodes on our own versions, so check it's ok
        if not GraphiteVersionChecker().is_version_valid(status['version']):
            return
        while True:
            retry -= 1
            try:
                # The with: closes the HTTP response, even if the read did fail
                with urllib.request.urlopen(base_url.build_url_with_path('/metrics/get-metrics-count'), timeout=3) as resp:
                    buf = resp.read()
                json_data = json.loads(buf)
                status['nb_metrics'] = json_data.get('metrics', None)
                status['nb_hosts_clusters'] = json_data.get('level_0', None)
                status[GRAPHITE_STATS_KEY.TIME_READ] = json_data.get(GRAPHITE_STATS_KEY.TIME_READ, -1)
                status[GRAPHITE_STATS_KEY.LOCAL_TIME] = json_data.get(GRAPHITE_STATS_KEY.LOCAL_TIME, None)
                return
            except Exception:
                # <= 0 and not == 0: a retry given as 0 (or less) must not loop forever
                if retry <= 0:
                    status['nb_metrics'] = 0
                    status['nb_hosts_clusters'] = 0
                    return
    
    
    @staticmethod
    # Always returns None: the result is written into status['reachable'] and status['version'].
    def get_graphite_version(base_url, status, retry=4):
        # type: (BaseUrl, Dict, int) -> None
        while True:
            try:
                retry -= 1
                resp = urllib.request.urlopen(base_url.build_url_with_path('/version/'), timeout=3)
                buf = resp.read()
                status['reachable'] = True
                status['version'] = buf.strip()
                return
            except Exception:
                if retry == 0:
                    status['reachable'] = False
                    status['version'] = None
                    return
                else:
                    continue
    
    
    @staticmethod
    # Always returns None: the json reply is merged into the given graphs_errors dict (left as is on failure).
    def webui_graphs_errors(host, port, use_ssl, graphs_errors, retry=4):
        while True:
            try:
                retry -= 1
                proto = 'https' if use_ssl else 'http'
                resp = urllib.request.urlopen('%s://%s:%s/api/graphs/errors' % (proto, host, port), timeout=3)
                html = resp.read()
                json_val = json.loads(html)
                graphs_errors.update(json_val)
                return
            except Exception:
                if retry == 0:
                    return
                else:
                    continue
    
    
    @staticmethod
    def graphite_service_is_alive(host, port, retry=4):
        """
        Return True if a TCP connection to (host, port) can be opened (0.5s timeout by try), False if
        all the tries failed. At most `retry` tries are done (at least one); the last error is logged
        as a warning.
        """
        while True:
            retry -= 1
            try:
                # We only want to know if the connection can be opened: the with: closes it at once
                with socket.create_connection((str(host), int(port)), 0.5):
                    return True
            except Exception as exp:
                # <= 0 and not == 0: a retry given as 0 (or less) must not loop forever
                if retry <= 0:
                    logger.warning('graphite_service_is_alive failed with exp: [%s]' % exp)
                    return False
    
    
    # Get a brok. Our role is to put it in the modules
    # DO NOT CHANGE data of b!!!
    # REF: doc/broker-modules.png (4-5)
    def manage_brok(self, b):
        """
        Give the brok `b` to every internal module and add the time spent in each module to
        self.local_module_stats. A module that raises is logged and declared as crashed to the
        modules manager; the other modules still get the brok.
        """
        self.broks_done += 1
        for internal_module in self.modules_manager.get_internal_instances():
            try:
                # Un-serialize if needed, but outside the module time (it's common to all modules)
                b.prepare()
                
                started_at = time.time()
                internal_module.manage_brok(b)
                module_name = internal_module.get_name()
                self.local_module_stats.setdefault(module_name, 0)
                self.local_module_stats[module_name] += time.time() - started_at
            except Exception as exp:
                crash_reason = "The mod %s raise an exception: %s" % (internal_module.get_name(), str(exp))
                logger.debug(str(exp.__dict__))
                logger.warning(crash_reason + ", I'm tagging it to restart later")
                logger.warning("Exception type: %s" % type(exp))
                logger.warning("Back trace of this kill: %s" % (traceback.format_exc()))
                self.modules_manager.did_crash(internal_module, crash_reason)
    
    
    # Add broks (a tab) to different queues for internal and external modules
    def _add_broks_to_queues(self, broks, source, duration=0.0, sent_broks=0, do_log=False, sat_entry=None):
        # type: (List[Brok], unicode, float, int, bool, Dict) -> None
        # If we are idle (spare waiting to start) we can't handle brok
        if not self.activated:
            return
        
        # Ok now put in queue broks to be managed by
        # internal modules
        with self.broks_lock:
            self.broks.extend(broks)
            self.external_module_broks.extend(broks)
            nb_internal_broks = len(self.broks)  # Save the numbers in the with: part
            nb_external_broks = len(self.external_module_broks)
        
        # We only have to log now
        if not do_log:
            return
        
        if source == 'arbiter':
            _logger = LoggerFactory.get_logger(_RECEIVE_BROKS_STR).get_sub_part(source)
        else:
            _logger = self.get_brok_logger.get_sub_part(source)
        _logger.get_sub_part('PERF').info('[ %.3f ]s - Add %4s broks into INTERNAL queue (new size=%s) and the EXTERNAL queue (new size=%s)' % (duration, sent_broks, nb_internal_broks, nb_external_broks))
        
        # We don't show void list of types, if we have 0 broks, the fact that we really ask and ge nothing is enough (cf SEF-9763)
        if len(broks) != 0:
            number_by_types = {}
            for brok in broks:
                btype = brok.type
                number_by_types[btype] = number_by_types.get(btype, 0) + 1
            btypes = sorted(list(number_by_types.keys()))
            s_printed = ', '.join(['%s=%s' % (k, number_by_types[k]) for k in btypes])
            _logger.info('                    ----- %4s composed of: %s' % (sent_broks, s_printed))
            if sat_entry:
                have_program_status = 'program_status' in number_by_types
                have_initial_broks_done = 'initial_broks_done' in number_by_types
                loading_configuration_logger = LoggerFactory.get_loading_configuration_logger()
                if have_program_status:
                    loading_configuration_logger.info('Broker start receive initial broks from 〖%s〗. Scheduler incarnation :〖%s〗' % (sat_entry['name'], sat_entry['daemon_incarnation']))
                if have_initial_broks_done:
                    loading_configuration_logger.info('Broker receive all initial broks from 〖%s〗. Scheduler incarnation :〖%s〗' % (sat_entry['name'], sat_entry['daemon_incarnation']))
    
    
    # We will get in the broks list the broks from the arbiters,
    # but as the arbiter_broks list can be push by arbiter without Global lock,
    # we must protect this with the list lock
    def interger_arbiter_broks(self):
        """Put the broks pushed by the arbiter into the modules queues, then start a new void arbiter list."""
        with self.arbiter_broks_lock:
            pending_arbiter_broks = self.arbiter_broks
            self._add_broks_to_queues(pending_arbiter_broks, _ARBITER_AS_BROKS_SOURCE)
            # A new list is set (and not a clear of the old one), and only after the add did succeed
            self.arbiter_broks = []
    
    
    # Get 'objects' from external modules
    # right now on nobody uses it, but it can be useful
    # for modules like live status to raise external
    # commands for example
    def get_objects_from_from_queues(self):
        """
        Empty the 'from' queue of every external module and give each object read to self.add().
        A queue that fails is logged (unless its module is already set to restart) and skipped.
        Returns the number of objects read.
        """
        started_at = time.time()
        read_count = 0
        bus_logger = self.modules_logger.get_sub_part(_BUS_COMMANDS_STR)
        for module, from_queue in self.modules_manager.get_external_modules_and_from_queues():
            try:
                while not from_queue.empty():
                    self.add(from_queue.get(block=False))
                    read_count += 1
            except Exception:
                # NOTE: maybe the instance was crashing, and was stopped by a crash message in self.add(), if so, don't cry about it as
                # it is already dead
                if not self.modules_manager.is_instance_set_to_restart(module):
                    bus_logger.warning('Cannot read shinken internal commands (like recheck, set acknowledge, etc) from module [%s]. We will retry it.' % module.get_name())
                    bus_logger.print_stack(level=logging.WARNING)
        elapsed = time.time() - started_at
        bus_logger.info('[PERF] [ %.3f ]s Did read %d shinken internal commands (like recheck, set acknowledge, etc) from modules' % (elapsed, read_count))
        return read_count
    
    
    # For a new distant daemon, if it is a scheduler, ask for a new full broks generation
    def _manage_new_distant_daemon_incarnation(self, entry, old_incar, new_incar):
        daemon_type = entry['type']
        entry['broks'].clear()
        entry['received_chunks'] = []
        entry['remaining_chunks_nb'] = 0
        con = entry['con']
        if con is None:  # maybe another thread did close the connection on a new configuration
            entry['daemon_incarnation'] = {}  # reset it, so we will get back in this method on the next loop
            return
        # we must ask for a new full broks if  it's a scheduler
        if daemon_type == 'scheduler':
            scheduler_name = entry['name']
            
            loading_configuration_logger = LoggerFactory.get_loading_configuration_logger()
            get_broks_logger = self.need_data_logger.get_sub_part(scheduler_name, register=False)
            
            new_monitoring_configuration_part_id_for_log = '%(shard_id)s-%(configuration_incarnation_uuid)s' % new_incar if new_incar else 'empty'
            old_monitoring_configuration_part_id_for_log = '%(shard_id)s-%(configuration_incarnation_uuid)s' % old_incar if old_incar else 'empty'
            
            if not new_incar:  # maybe the scheduler is currently void, wait until it have a real configuration loaded
                msg = 'I do not ask to scheduler 〖%s〗 for initial broks generation because it is currently empty. Old monitoring configuration part was 〖%s〗' % (scheduler_name, old_monitoring_configuration_part_id_for_log)
                get_broks_logger.info(msg)
                loading_configuration_logger.info(msg)
                return
            msg = 'I ask to the scheduler 〖%s〗 for a initial broks generation with new monitoring configuration part 〖%s〗. Old monitoring configuration part was 〖%s〗' % (
                scheduler_name, new_monitoring_configuration_part_id_for_log, old_monitoring_configuration_part_id_for_log)
            get_broks_logger.info(msg)
            loading_configuration_logger.info(msg)
            
            schedulers = self.daemon_configuration_for_modules.schedulers
            scheduler_entry = next((i for i in schedulers if i['name'] == scheduler_name), None)
            if scheduler_entry:
                scheduler_entry['daemon_incarnation'] = new_incar
                self.daemon_configuration_for_modules.schedulers = schedulers
            
            self._add_broks_to_queues([Brok('asking_initial_broks', {'instance_id': entry['instance_id']})], 'internal broks')
            
            try:
                did_generate_raw = con.get('fill_initial_broks', {'bname': self.name}, wait='long')
            except HTTPExceptions as exp:  # If fail (as error 500 or something like this) it means that the
                # scheduler did fail to generate broks, it's a huge problem and we must restart this
                msg = 'The scheduler %s failed to generate initial broks data with error: %s. we will retry it' % (scheduler_name, exp)
                entry['daemon_incarnation'] = {}
                raise HTTPException(msg)
            # maybe the scheduler was not ready, if so, retry it.
            # NOTE: we want all schedulers to be up-to-date, so ask for a real true here
            if did_generate_raw != 'true':
                msg = 'The scheduler %s failed to generate initial broks data, restarting the request' % scheduler_name
                entry['daemon_incarnation'] = {}
                raise HTTPException(msg)
    
    
    @staticmethod
    def __detect_protocol_and_get_input(input_data):
        """Identify the broks exchange protocol of a raw answer.

        * NO_LIMIT_AND_NO_ACK_PROTOCOL (old) was gzip, no ack, all data in one query:
          such an answer is not a pickle and is returned untouched.
        * any other answer is a pickle holding (protocol, seq_number, data, ...):
          the protocol is peeled off and the rest of the sequence is returned.

        :param input_data: decoded answer of the distant daemon
        :return: (protocol, payload)
        """
        try:
            unpickled = SafeUnpickler.loads(input_data, 'broks')
            detected_protocol = unpickled[0]
            payload = unpickled[1:]
        except pickle.UnpicklingError:
            # Not a pickle: this is the old protocol, the raw data is the payload
            return BROKS_EXCHANGE_PROTOCOLS.NO_LIMIT_AND_NO_ACK_PROTOCOL, input_data
        
        return detected_protocol, payload
    
    
    @staticmethod
    def __build_data_from_received_chunk(debug_is_on, protocol_logger, sat_entry, protocol, data, nb_remaining_chunks):
        # type: (bool, PartLogger, Dict, int, bytes, int) -> Optional[bytes]
        """Accumulate the chunks of a split packet and return the packet once complete.

        Only SPLIT_FULL_DATA_PROTOCOL splits packets; for any other protocol ``data``
        is returned unchanged.

        :param debug_is_on: when True, debug traces are written to ``protocol_logger``
        :param protocol_logger: logger used for the protocol traces and errors
        :param sat_entry: distant daemon entry; 'received_chunks', 'remaining_chunks_nb',
                          'loop_delay' and 'con' may be updated
        :param protocol: protocol detected for the received answer
        :param data: payload (bytes) of the received answer
        :param nb_remaining_chunks: number of chunks still to be sent by the daemon
        :return: the full packet, or None when more chunks are expected or when the
                 chunk number is not the expected one (the connection is then dropped)
        """
        scheduler_name = sat_entry['name']
        if protocol == BROKS_EXCHANGE_PROTOCOLS.SPLIT_FULL_DATA_PROTOCOL:
            received_chunks = sat_entry.get('received_chunks', [])  # type: List
            to_receive_chunks_nb = sat_entry.get('remaining_chunks_nb', 0)
            if nb_remaining_chunks > 0:
                # Intermediate chunk: the countdown must decrease one by one
                if to_receive_chunks_nb > 1 and nb_remaining_chunks != (to_receive_chunks_nb - 1):
                    protocol_logger.error('Wrong chunk number received (%s instead of %s) from scheduler %s' % (nb_remaining_chunks, to_receive_chunks_nb - 1, scheduler_name))
                    sat_entry['con'] = None
                    return None
                
                if debug_is_on:
                    protocol_logger.debug('Received split packet from scheduler %s, %s chunks remaining, received chunk size: %s' % (scheduler_name, nb_remaining_chunks, len(data)))
                
                received_chunks.append(data)
                sat_entry['received_chunks'] = received_chunks
                sat_entry['remaining_chunks_nb'] = nb_remaining_chunks
                # Ask for the next chunk without waiting
                sat_entry['loop_delay'] = 0
                return None
            else:
                if received_chunks:
                    if debug_is_on:
                        protocol_logger.debug('Received last chunk of split packet from scheduler %s, chunk size %s' % (scheduler_name, len(data)))
                    
                    received_chunks.append(data)
                    # Chunks are bytes: they must be joined with a bytes separator,
                    # a str separator raises TypeError on Python 3
                    data = b''.join(received_chunks)  # type: bytes
                    
                    if debug_is_on:
                        protocol_logger.debug('Received packet from scheduler %s size:%s [ %s ]' % (scheduler_name, len(data), md5(data).hexdigest()))
                    
                    sat_entry['received_chunks'] = []
                    sat_entry['remaining_chunks_nb'] = 0
                elif debug_is_on:
                    protocol_logger.debug('Received full packet from scheduler %s, packet size: %s' % (scheduler_name, len(data)))
        
        return data
    
    
    # We get new broks from schedulers
    # REF: doc/broker-modules.png (2)
    def get_new_broks(self, sat_entry):
        """Ask one distant daemon for a packet of broks and push it to our queues.

        Steps, as done below: ping the daemon, call 'get_broks', base64-decode the
        answer, detect the exchange protocol, rebuild the packet when it is split in
        chunks, decompress it with zlib, unpickle it, then hand the broks over to
        ``_add_broks_to_queues``.

        Nothing is returned. On a protocol, decompression, unpickling or HTTP error
        ``sat_entry['con']`` is set to None. Any other unexpected exception is logged
        and ends with ``sys.exit(1)``.

        :param sat_entry: dict describing the distant daemon. Keys read here: 'type',
                          'name', 'con', 'daemon_incarnation', 'sequence_number',
                          'remaining_chunks_nb', 'sent_broks_this_turn', 'last_log_time'
        """
        # If we are idle (spare waiting to start) we do not ask our satellites new broks
        if not self.activated:
            time.sleep(0.5)
            return
        # Kept at hand for the log messages of this method, including the error handlers
        sat_type = sat_entry['type']
        debug_is_on = logger.is_debug()
        try:
            con = sat_entry['con']  # type: HTTPClient
            if con is None:  # None = not initialized
                self.pynag_con_init(sat_entry)
                return  # we will get broks the next turn
            
            # 'before' is the reference of all the timings logged below (the ping is included)
            before = time.time()
            # Before ask a call that can be long, do a simple ping to be sure it is alive
            con.get('ping')
            scheduler_name = sat_entry['name']
            shard_id = ''
            configuration_incarnation_uuid = ''
            daemon_incarnation = sat_entry['daemon_incarnation']
            if daemon_incarnation:
                shard_id = daemon_incarnation.get('shard_id', '')
                configuration_incarnation_uuid = daemon_incarnation.get('configuration_incarnation_uuid', '')
            
            # Parameters of the 'get_broks' call
            param = {
                'bname'                         : self.name,
                'packet_size'                   : self.broks_packet_size,
                'seq_number'                    : sat_entry['sequence_number'],
                'shard_id'                      : shard_id,
                'configuration_incarnation_uuid': configuration_incarnation_uuid,
                'protocol_version'              : BROKS_EXCHANGE_PROTOCOLS.SPLIT_FULL_DATA_PROTOCOL,
                'remaining_chunks'              : sat_entry.get('remaining_chunks_nb', 0)
            }
            tmp_broks_b64 = con.get('get_broks', param, wait='long')
            receive_time = time.time()
            input_data = base64.b64decode(tmp_broks_b64)
            del tmp_broks_b64  # release memory asap
            
            b64_time = time.time()
            
            protocol, input_data = self.__detect_protocol_and_get_input(input_data)
            nb_remaining_broks = 0
            # Now based on the protocol version, get broks as pickle
            try:
                if protocol == BROKS_EXCHANGE_PROTOCOLS.NO_LIMIT_AND_NO_ACK_PROTOCOL:
                    # Old protocol
                    pickle_broks = zlib.decompress(input_data)
                else:
                    # The number of fields carried by the answer depends on the protocol
                    if protocol == BROKS_EXCHANGE_PROTOCOLS.SPLIT_FULL_DATA_PROTOCOL:
                        seq_number, data, nb_remaining_broks, nb_remaining_chunks = input_data
                    elif protocol == BROKS_EXCHANGE_PROTOCOLS.BY_BROK_SIZE_ESTIMATION_LIMITED_DATA_SIZE_PROTOCOL:
                        nb_remaining_chunks = 0
                        seq_number, data, nb_remaining_broks = input_data
                    else:
                        self.protocol_logger.error('Invalid protocol version received from scheduler "%s" : %s' % (scheduler_name, protocol))
                        sat_entry['con'] = None
                        return
                    
                    # Stored so it is sent back as 'seq_number' in the next 'get_broks' call
                    sat_entry['sequence_number'] = seq_number  # type: unicode
                    
                    data = self.__build_data_from_received_chunk(debug_is_on, self.protocol_logger, sat_entry, protocol, data, nb_remaining_chunks)
                    # No data: the packet is not complete yet (or the chunk was refused), nothing to load this turn
                    if not data:
                        return
                    
                    pickle_broks = zlib.decompress(data)
            
            except zlib.error as exp:
                self.protocol_logger.error('Fail to decompress loaded broks from %s %s with : [%s]' % (sat_type, scheduler_name, exp))
                sat_entry['con'] = None
                return
            
            uncompress_time = time.time()
            # Final Step: load the broks from pickle
            try:
                broks = SafeUnpickler.loads(pickle_broks, 'broks')
                
                del pickle_broks  # release memory asap
            except Exception as exp:
                self.get_brok_logger.error('Fail to load broks data from %s %s with : [%s]' % (sat_type, sat_entry['name'], exp))
                self.get_brok_logger.print_stack()
                sat_entry['con'] = None
                return
            unpickle_time = time.time()
            if debug_is_on:
                self.get_brok_logger.debug('%s Got %4d broks, remaining broks: %4s broks, protocol:%s, in total time:%.3fs [recv:%.3fs, b64_decode:%.3fs, uncompress:%.3fs, deserialize:%.3fs]' % (
                    sat_entry['name'],
                    len(broks),
                    nb_remaining_broks,
                    protocol,
                    unpickle_time - before,
                    receive_time - before,
                    b64_time - receive_time,
                    uncompress_time - b64_time,
                    unpickle_time - uncompress_time))
            
            sat_entry['sent_broks_this_turn'] += len(broks)
            
            # Ok, we can add theses broks to our queues
            now = time.time()
            # Log only if more than 1s since last log
            do_log = False
            if abs(now - sat_entry['last_log_time']) > 1:  # abs() is for time-jumps
                sat_entry['last_log_time'] = now
                do_log = True
            
            self._add_broks_to_queues(broks, sat_entry['name'], time.time() - before, sat_entry['sent_broks_this_turn'], do_log)
            sat_entry['sent_broks_this_turn'] = 0
            
            # We will re-ask broks if the scheduler say us that there is remaining broks, if not, wait normally
            loop_delay = 0 if nb_remaining_broks != 0 else 1
            sat_entry['loop_delay'] = loop_delay
        
        # A key is missing in the entry: the connection is considered not known, so we create it
        except KeyError as exp:
            self.get_brok_logger.info('We fail to get new broks from %s %s with: [%s]. Connection must not be initialize. It will be initialize.' % (sat_type, sat_entry['name'], str(exp)))
            self.pynag_con_init(sat_entry)
        except HTTPExceptions as exp:
            err_string = str(exp)
            did_log = False
            # IMPORTANT: to be removed ASAP we have broks transfer retries
            # NOTE: pycurl does not tell us if the timeout occurred during the transfer or during the connection attempt, so must look at the error part
            if 'Operation timed out after' in err_string:  # is a timeout
                if 'with 0 out of ' not in err_string:  # only raise a true error if there was a start of transfer
                    self.get_brok_logger.error('*' * 100)
                    self.get_brok_logger.error('Failed to transfer broks from the %s "%s": [%s]. THESES BROKS ARE LOST AND CANNOT BE RETRIEVED.' % (sat_type, sat_entry['name'], str(exp)))
                    self.get_brok_logger.error('*' * 100)
                    self._add_http_error_count()  # let the checks know that we have a HUGE errors here
                    did_log = True
            if not did_log:  # do not print a warning if it was already logged as an ERROR
                self.get_brok_logger.warning('Failed to get new broks from %s %s with: [%s]. Connection fail and it will be reinitialize.' % (sat_type, sat_entry['name'], str(exp)))
            # Drop the connection: it will be initialized again on a next call
            sat_entry['con'] = None
        # The distant daemon is probably not initialized yet: only warn, the connection is kept
        except AttributeError as exp:
            self.get_brok_logger.warning('Failed to get new broks from %s %s with: [%s]. The %s must be initialized.' % (sat_type, sat_entry['name'], str(exp), sat_type))
        # Anything else is unexpected: log the stack and exit
        except Exception as exp:
            self.get_brok_logger.error('Failed to get new broks from %s %s with: [%s]. Broker will be kill.' % (sat_type, sat_entry['name'], str(exp)))
            self.get_brok_logger.print_stack()
            sys.exit(1)
    
    
    # Helper function for module, will give our broks
    def get_retention_data(self):
        """Return the data to give to a retention module: always a new empty dict."""
        return dict()
    
    
    # Get back our broks from a retention module
    def restore_retention_data(self, data):
        """Accept the data given back by a retention module and ignore it.

        :param data: retention data, not used
        :return: None
        """
        return None
    
    
    def do_stop(self):
        """Flag the daemon as stopping, quit the main loop watchdog, then run the parent class stop."""
        self.print_log_block('Stopping daemon')
        # Both stop flags are raised before delegating to the parent implementation
        self.daemon_info.daemon_is_requested_to_stop.value = True
        self.interrupted = True
        self.mainloop_watchdog.quit()
        super(Broker, self).do_stop()
    
    
    def _set_spare_daemon_name(self, spare_daemon_name, spare_must_have_the_same_list_of_module_type, _logger):
        # type: (str, bool, PartLogger) -> None
        previous_spare_daemon_name = self.spare_daemon_name
        self.spare_daemon_name = spare_daemon_name
        if previous_spare_daemon_name != self.spare_daemon_name:
            _logger.info('MASTER', 'My spare daemon is now "%s"' % self.spare_daemon_name)
        previous_spare_must_have_the_same_list_of_module_type = self.spare_must_have_the_same_list_of_module_type
        self.spare_must_have_the_same_list_of_module_type = spare_must_have_the_same_list_of_module_type
        if self.spare_daemon_name and previous_spare_must_have_the_same_list_of_module_type != self.spare_must_have_the_same_list_of_module_type:
            not_s = '' if self.spare_must_have_the_same_list_of_module_type else 'NOT '
            _logger.info('MASTER', 'The spare daemon "%s" is now %srequiring the same modules types as the master' % (self.spare_daemon_name, not_s))
    
    
    def _set_master_daemon_name(self, master_daemon_name, _logger):
        # type: (str, PartLogger) -> None
        previous_master_daemon_name = self.master_daemon_name
        self.master_daemon_name = master_daemon_name
        if previous_master_daemon_name != self.master_daemon_name:
            _logger.info('SPARE', 'I am now the spare of the master daemon "%s"' % self.master_daemon_name)
    
    
    # Set properties we want to set in our new schedulers
    def _set_default_values_to_scheduler_entry(self, entry):
        
        # IMPORTANT: mut be LOCAL, so each scheduler have their own {} and []
        default_scheduler_properties = {'broks'          : {}, 'thread': None, 'daemon_incarnation': {},
                                        'con'            : None, 'sequence_number': '', 'loop_delay': 1,
                                        'last_log_time'  : 0, 'sent_broks_this_turn': 0,
                                        'last_connection': 0, 'type': 'scheduler',
                                        }
        entry.update(default_scheduler_properties)
    
    
    def _set_daemon_id_of_scheduler(self, daemon, daemon_id):
        daemon['instance_id'] = daemon_id
    
    
    # set up a new conf, but beware of global lock management.
    # Note: don't do locking thing here, as we have the satellite lock!
    def setup_new_conf(self):
        """Load the pending configuration while holding ``self.satellite_lock``."""
        # The whole loading is done under the satellite lock
        with self.satellite_lock:
            self.really_setup_new_conf()
    
    
    def really_setup_new_conf(self):
        """Apply the configuration stored in ``self.new_conf``.

        ``self.new_conf`` is moved to ``self.cur_conf``, the global parameters are
        read, then the spare/master/activation state is updated. When the daemon is
        not activated the loading stops there. Otherwise schedulers and arbiters are
        registered, the configuration shared with the modules is refreshed and, when
        at least one scheduler is known, the modules are loaded (or updated), the
        timezone is set and the satellite threads are checked.

        ``self.have_configuration`` is True when this method returns.
        """
        _logger = self._print_new_update_conf_received()
        
        # If the configuration was giving us a new configuration incarnation, show it
        self.print_configuration_incarnation_log_entry_if_need(_logger)
        
        # Summary of what was received, for the configuration loading log
        loading_configuration_logger = LoggerFactory.get_loading_configuration_logger()
        schedulers = [i['name'] for i in self.new_conf.get('schedulers', {}).values()]
        all_monitoring_configuration_part = self.new_conf['all_monitoring_configuration_part']
        loading_configuration_logger.info('Broker receive a new configuration from arbiter : 〖%s〗 with schedulers : 〖%s〗. Part to handle : 〖%s〗. %s' % (
            self.configuration_incarnation.get_author(), ','.join(schedulers), ','.join(map(str, all_monitoring_configuration_part)), self.configuration_incarnation.build_log_message()))
        
        t0 = time.time()
        # The pending configuration becomes the current one
        conf = self.new_conf
        self.new_conf = None
        self.cur_conf = conf
        # Got our name from the globals
        g_conf = conf['global']
        if 'broker_name' in g_conf:
            name = g_conf['broker_name']
        else:
            name = 'Unnamed broker'
        self.name = name
        logger.load_obj(self, name)
        
        # NOTE(review): the configured value is multiplied by 1024, so it is presumably given in KiB - to confirm
        self.broks_packet_size = g_conf.get('broks_packet_size', 2001024) * 1024
        # Parameters for the sub processes used by manage_brok, each with its default value
        self.manage_brok_enable_sub_processes_memory_usage_protection = g_conf.get('broker__manage_brok__enable_sub_processes_memory_usage_protection', True)
        self.manage_brok_sub_process_memory_usage_system_reserved_memory = g_conf.get('broker__manage_brok__sub_process_memory_usage_system_reserved_memory', 0)
        self.manage_brok_sub_processes_memory_usage_protection_max_retry_time = g_conf.get('broker__manage_brok__sub_processes_memory_usage_protection_max_retry_time', 5)
        
        self.manage_brok_sub_process_broks_pusher_max_execution_timeout = g_conf.get('broker__manage_brok__sub_process_broks_pusher_max_execution_timeout', DEFAULT_INJECTOR_PUSHER_WAIT_TIME)
        self.manage_brok_sub_process_broks_pusher_security_ratio = g_conf.get('broker__manage_brok__sub_process_broks_pusher_security_ratio', DEFAULT_PUSHER_SECURITY_RATIO_TIME)
        self.manage_brok_sub_process_broks_pusher_min_execution_timeout = g_conf.get('broker__manage_brok__sub_process_broks_pusher_min_execution_timeout', DEFAULT_MIN_PUSHER_WAIT_TIME)
        self.manage_brok_sub_process_broks_pusher_max_retry = g_conf.get('broker__manage_brok__sub_process_broks_pusher_max_retry', DEFAULT_PUSHER_MAX_RETRY)
        self.manage_brok_sub_process_broks_pusher_queue_batch_size = g_conf.get('broker__manage_brok__sub_process_broks_pusher_queue_batch_size', DEFAULT_PUSHER_QUEUE_BATCH_SIZE)
        
        # Look at human log format. Enable by default
        logger.set_human_format(on=g_conf.get('human_timestamp_log', True))
        
        self.save_daemon_name_into_configuration_file(name)
        
        # The arbiter let us know about the realms that are allowed to talk to us.
        # it let us know also if a realm that was present before did disappear and so need to be deleted
        self.known_realms = conf['known_realms']
        self.all_monitoring_configuration_part = conf['all_monitoring_configuration_part']
        
        # Spare / master relationship
        spare_daemon_name = g_conf.get('spare_daemon', '')
        spare_must_have_the_same_list_of_module_type = g_conf.get('broker__manage_spare__spare_must_have_the_same_list_of_module_type', False)
        self._set_spare_daemon_name(spare_daemon_name, spare_must_have_the_same_list_of_module_type, _logger)
        self._set_master_daemon_name(g_conf.get('master_daemon', ''), _logger)
        
        self._set_spare(g_conf.get('spare', True), _logger)
        activated = conf.get('activated', True)
        was_activated = self._set_is_activated(activated, _logger)
        
        # A not activated daemon stops here: no scheduler, arbiter or module is loaded
        if not self.activated:
            self._go_as_not_active(was_activated, _logger)
            self.have_configuration = True
            return
        
        _logger.debug('Configuration received, set as active daemon')
        
        new_schedulers = []  # for logging
        deleted_schedulers = []
        new_arbiters = []
        
        # Load Schedulers
        for (daemon_id, daemon) in conf['schedulers'].items():
            self._set_or_update_scheduler_from_configuration(daemon, daemon_id, g_conf, new_schedulers, deleted_schedulers, _logger)
        
        # Now get arbiter
        for arb_id in conf['arbiters']:
            # Must look if we already have it
            already_got = arb_id in self.arbiters
            # Broks of an already known arbiter are kept
            if already_got:
                broks = self.arbiters[arb_id]['broks']
            else:
                broks = {}
            daemon = conf['arbiters'][arb_id]
            
            # replacing the daemon address and port by those defined in satellite map
            # IMPORTANT: do this BEFORE get uri
            self._update_daemon_addr_port_with_satellitemap(daemon, g_conf['satellitemap'], _logger)
            uri = self._get_daemon_uri(daemon)
            daemon['uri'] = uri
            
            daemon['broks'] = broks
            daemon['instance_id'] = 0  # No use so all to 0
            daemon['daemon_incarnation'] = {}  # no use
            daemon['last_connection'] = 0
            daemon['id'] = arb_id
            daemon['type'] = 'arbiter'
            
            # We do not connect to the arbiter. Connection hangs
            if not already_got:
                new_arbiters.append((daemon['name'], daemon['uri']))
            self.arbiters[arb_id] = daemon
        
        self._print_new_and_deleted_daemons(new_schedulers=new_schedulers, deleted_schedulers=deleted_schedulers, new_arbiters=new_arbiters, _logger=_logger)
        
        # Refresh the configuration exposed to the modules; every scheduler starts with an empty daemon_incarnation
        with self.daemon_configuration_for_modules:
            self.daemon_configuration_for_modules.configuration_incarnation = self.configuration_incarnation
            self.daemon_configuration_for_modules.all_monitoring_configuration_part = self.all_monitoring_configuration_part
            self.daemon_configuration_for_modules.schedulers = [{'name': s['name'], 'address': s['address'], 'port': s['port'], 'active': s['active'], 'daemon_incarnation': {}} for s in self.schedulers.values()]
        
        # Modules, timezone and satellite threads are only handled when at least one scheduler is known
        if self.schedulers:
            self.modules = conf['global']['modules']
            # NOTE(review): list comprehension used only for its side effect, a plain for loop would be clearer
            [module.set_father_config(conf['global']) for module in self.modules]
            _logger.debug('Receiving modules:[%s] i already load modules:[%s]' % (','.join([m.get_name() for m in self.modules]), self.have_modules))
            
            if not self.have_modules:
                _logger.info('New modules received: %s' % (','.join([m.get_name() for m in self.modules])))
                # Ok now start, or restart them!
                # Set modules, init them and start external ones
                self.modules_manager.set_modules(self.modules)
                self.do_load_modules()
                self.modules_manager.start_external_instances()
                self.modules_manager.start_worker_based_instances()
                self.have_modules = True
            else:  # just update the one we need
                self.modules_manager.update_modules(self.modules)
            
            # Set our giving timezone from arbiter
            self.set_tz(conf['global']['use_timezone'])
            
            # Start threads if needed, not a problem as starting thread is cheap and not timeout prone
            self.assert_valid_satellite_threads()
        
        self.have_configuration = True
        _logger.debug('configuration loaded into %.3fs' % (time.time() - t0))
    
    
    # An arbiter ask us to wait for a new conf, so we must clean
    # all our mess we did, and close modules too
    def clean_previous_run(self):
        """Drop the state of the previous run: satellites, pending broks and module instances."""
        with self.satellite_lock:
            # Clean all lists
            self.schedulers.clear()
            self.pollers.clear()
            self.reactionners.clear()
            self.receivers.clear()
        # Pending broks are replaced by new empty queues
        with self.broks_lock:
            self.broks = deque()
            self.external_module_broks = deque()
        # NOTE(review): the three `[:]` assignments below rebind the attribute to a COPY holding
        # the same elements, they do not empty anything - confirm whether a real reset was intended
        self.broks_internal_raised = self.broks_internal_raised[:]
        with self.arbiter_broks_lock:
            self.arbiter_broks = self.arbiter_broks[:]
        self.external_commands = self.external_commands[:]
        
        # And now modules
        self.have_modules = False
        self.modules_manager.clear_instances()
    
    
    # Broker ask for Broks objects
    def get_jobs_from_distant(self, sat_entry):
        """Get the jobs of a distant daemon: for the broker this means fetching its broks.

        :param sat_entry: entry of the distant daemon, given as is to ``get_new_broks``
        """
        self.get_new_broks(sat_entry)
    
    
    def print_modules_stats(self, b_times, b_types, nb_broks):
        """Write the broks statistics of the loop in the debug log.

        Three lines are logged: broks done this loop, time spent by each internal
        module (also sent to statsd when it is set), then number and time by brok type.

        :param b_times: time spent by brok type
        :param b_types: number of broks by brok type
        :param nb_broks: number of broks that had to be handled this loop
        """
        logger.debug('[ broks ] broks done this loop %d/%d' % (self.broks_done, nb_broks))
        
        module_chunks = []
        for mod in self.modules_manager.get_internal_instances():
            mod_name = mod.get_name()
            spent_time = self.local_module_stats.get(mod_name, 0)
            
            if mod_name == 'sla' and hasattr(mod, 'external_speed_counter'):
                # The sla module exposes its own counters: keep a running average of them
                if self.stats_time_sla is None:
                    self.stats_time_sla = AvgInRange(60, initial_value=mod.external_speed_counter)
                    self.stats_nb_sla = AvgInRange(60, initial_value=mod.external_done_counter)
                else:
                    self.stats_time_sla.update_avg(mod.external_speed_counter)
                    self.stats_nb_sla.update_avg(mod.external_done_counter)
                
                avg_speed = self.stats_time_sla.get_avg(mod.external_speed_counter)
                avg_done = self.stats_nb_sla.get_avg(mod.external_done_counter)
                sla_values = (
                    'sla',
                    spent_time,
                    mod.external_speed_counter,
                    mod.external_done_counter,
                    0 if mod.external_speed_counter == 0 else mod.external_done_counter / mod.external_speed_counter,
                    avg_speed,
                    avg_done,
                    0 if avg_speed == 0 else avg_done / avg_speed,
                )
                module_chunks.append('[%s][%.4fs update_time:%.4f, nb_done:%4d, speed:%4d /avg update_time:%.4f, nb_done:%4d, speed:%4d ]  |  ' % sla_values)
                # Counters are consumed: start again from zero for the next loop
                mod.external_speed_counter = 0
                mod.external_done_counter = 0
            elif mod_name == 'Graphite-Perfdata' and hasattr(mod, 'external_info'):
                module_chunks.append('[%s][%.4fs %s]  |  ' % (mod_name, spent_time, mod.external_info))
            else:
                module_chunks.append('[%s][%.4fs]  |  ' % (mod_name, spent_time))
            
            if statsd is not None:
                statsd.timing('broker.%s' % mod_name, spent_time)
        logger.debug('[ broks ] stats by local module %s' % ''.join(module_chunks))
        
        type_chunks = [
            '{ [%s] : [%d][%.4f]s }' % (brok_type, brok_count, b_times[brok_type])
            for brok_type, brok_count in b_types.items()
        ]
        logger.debug('[ broks ] stats by type %s' % ''.join(type_chunks))
    
    
    def print_external_modules_queue_size(self):
        """Log, in one info line, the size of the input queue of every external module.

        Nothing is logged when there is no external module. A module whose queue
        size cannot be read is reported with the exception instead of the size.
        """
        external_modules = self.modules_manager.get_external_instances()
        if not external_modules:
            return
        
        def _queue_size_text(module):
            # One module must never prevent the others from being reported
            try:
                return ' (%s): %s  ' % (module.get_name(), module.to_q.qsize())
            except Exception as exp:
                return ' (%s): Exception! %s   ' % (module.get_name(), exp)
        
        header = '%s %s => Number of "Broks Sets" not eaten in MODULE queues' % (CHAPTER_MANAGE_BROKS, _EXTERNAL_MODULE_STR)
        sizes = ''.join([_queue_size_text(module) for module in external_modules])
        logger.info(header + sizes)
    
    
    def _get_avg_speed(self):
        """Return the average read from ``self.avg_brok_send_speed`` (asked with flatten=False, and 0 as default value)."""
        return self.avg_brok_send_speed.get_avg(flatten=False, default_value=0)
    
    
    # We will wait for the pusher process based on the broks send speed
    # * avg speed * size of broks to send = expected_time
    # Then we will multiply to allow more security and not kill process when we can avoid
    # * expected_time * security = secure_expected_time
    # Maybe we have very few broks, but the fork() have a minimal time, so at least will be a security value
    # * min(max_execution_timeout, max(secure_expected_time, min_execution_timeout)) = ceil_secure_expected_time
    def _get_pusher_wait_time(self, nb_broks_sent):
        """Compute how long the pusher process is allowed to run to send ``nb_broks_sent`` broks.

        Without any known average speed the maximum execution timeout is returned.
        Otherwise the expected time (broks / speed) is multiplied by the security
        ratio, then kept between the minimum and the maximum execution timeout.

        :param nb_broks_sent: number of broks the pusher has to send
        :return: the allowed wait time
        """
        speed = self._get_avg_speed()
        if speed == 0:
            fallback_wait = self.manage_brok_sub_process_broks_pusher_max_execution_timeout
            logger.info('%s %s We do not have any broks spend speed, so will allow a wait of %.1fs for sending %d broks' % (CHAPTER_MANAGE_BROKS, _EXTERNAL_MODULE_STR, fallback_wait, nb_broks_sent))
            return fallback_wait
        
        secured_time = (nb_broks_sent / speed) * self.manage_brok_sub_process_broks_pusher_security_ratio
        # Never below the minimum (the fork() itself takes time), never above the maximum
        at_least_min = max(secured_time, self.manage_brok_sub_process_broks_pusher_min_execution_timeout)
        wait_time = min(self.manage_brok_sub_process_broks_pusher_max_execution_timeout, at_least_min)
        
        logger.debug('%s %s We will allow a wait of %.1fs for sending %d broks (average speed = %5d broks/s)' % (CHAPTER_MANAGE_BROKS, _EXTERNAL_MODULE_STR, wait_time, nb_broks_sent, speed))
        return wait_time
    
    
    def _push_broks_to_external_queues(self, broks, elapsed_put_stats):
        # We are sending broks as a big list, more efficient than one by one
        modules_and_queues = self.modules_manager.get_external_modules_and_queues()
        
        if len(modules_and_queues) == 0:
            return
        
        broks = pickle.dumps(broks, pickle.HIGHEST_PROTOCOL)
        for (instance, queue) in modules_and_queues:
            try:
                queue.send(broks)
            except Exception as exp:
                module_name = instance.get_name()
                logger.error('%s %s cannot send broks to module %s ( module might be dead ) : %s' % (CHAPTER_MANAGE_BROKS, _EXTERNAL_MODULE_STR, module_name, exp))
        
        for (instance, queue) in modules_and_queues:
            try:
                queue.flush()
            except:
                pass
    
    
    def do_loop_turn(self):
        """Run one turn of the main loop inside the ``self.mainloop_watchdog`` context."""
        # If the loop is NOT finished after 30min, it's a problem, and a FATAL one. Dump stack every 5minutes
        # until the daemon is restarted
        # NOTE: 30min because in the past we had a long bug with 20min time, so take a delay to only have
        #       real deadlocks
        with self.mainloop_watchdog:
            self._do_loop_turn()
    
    
    def _do_loop_turn(self):
        """Run one turn of the broker main loop.

        A turn goes through the following phases, in this order:

        * housekeeping: reap finished child processes, reset the per-turn module statistics,
          check modules and satellite threads;
        * configuration: when there is no current configuration, wait for one and set it up,
          then look for a newly pushed configuration and update the inventories;
        * STEP1: integrate the broks sent by the arbiters (an inactive broker, i.e. a spare
          waiting to start, drops its broks and ends the turn here);
        * STEP2: push the pending broks to the external modules queues, by batches, for at most
          about one second;
        * STEP3: give the pending broks to the internal modules, for at most about one second;
        * STEP4: get the objects raised by the modules and call the modules 'tick' hook.

        Broks that could not be handled within the time limit are put back at the head of their
        queue so that the next turn starts with them.

        The turn ends with ``self.sleep(1.0 - loop_time)``.
        NOTE(review): the value can be negative when the turn lasted more than one second; this
        assumes ``self.sleep`` accepts it -- to be confirmed against the daemon base class.
        """
        start_snap = cpu_stats_helper.get_thread_cpu_snapshot()
        loop_start = time.time()
        self._increase_loop_number()
        loop_number = self._get_loop_number()
        self.broker_time_logger.info('')
        self.loop_start_logger.info('[ Loop number=%-5d ] ===-===-===-===-===-===-===-===-===-===-===-===-===' % loop_number)

        # If there are some zombie processes, wait for all of them: active_children() joins the
        # finished ones as a side effect. This is only best effort, so errors are ignored, but
        # KeyboardInterrupt / SystemExit must not be swallowed here.
        try:
            active_children()
        except Exception:
            pass

        # #### STEP1:

        with self.broks_lock:
            nb_broks = len(self.broks)
        logger.debug('[ broks ] [ manage ] Begin loop with [%d] broks to handles' % nb_broks)

        self.broks_done = 0
        self.local_module_stats.clear()  # be sure to remove old unused modules
        for mod in self.modules_manager.get_internal_instances():
            self.local_module_stats[mod.get_name()] = 0

        # Dumping modules Queues size
        self.print_external_modules_queue_size()

        # Begin to clean modules
        self.check_and_del_zombie_modules()

        # Look if we have all needed threads for our satellites
        self.assert_valid_satellite_threads()

        # Maybe the arbiter asked us to wait for a new conf
        # If true, we must restart all...
        if self.cur_conf is None:
            # Clean previous run from useless objects and close modules
            self.clean_previous_run()

            self.wait_for_initial_conf()
            # we may have been interrupted or so; then just return from this loop turn
            if not self.new_conf:
                return

            self.setup_new_conf()

        # Now we check if the arbiter speaks to us.
        # If so, we listen to it
        # When it pushes a conf to us, we re-initialise connections
        self.watch_for_new_conf(0.0)
        if self.new_conf:
            self.setup_new_conf()

        # We need to be sure that our inventory does not keep deleted realms
        self._clean_old_realms_in_inventory()

        # If an inventory did change, warn the modules about it, so they can update their own inventory
        self.assert_module_inventory_are_updated()

        # Only logged when the turn happens to run on a second that is a multiple of 3600
        if int(time.time()) % 3600 == 0:
            # Iterate over a snapshot of the items
            for (realm_name, inventory) in list(self._realms_inventory.items()):
                logger.info('The realm %s inventory have currently %s elements' % (realm_name, inventory.get_len()))

        # Reap broks sent from the arbiters
        self.interger_arbiter_broks()

        # Spare mode stops the loop here
        if not self.activated:
            # we clear the broks lists: when idle (spare waiting to start) we do not handle broks
            with self.broks_lock:
                self.broks = deque()
                self.external_module_broks = deque()

            self.broker_time_logger.info('Broker is idle (spare waiting to start)')
            self.loop_stop_logger.info('[ Loop number=%-5d ] ===-===-===-===-===-===-===-===-===-===-===-===-===' % loop_number)
            self.loop_stop_logger.debug('[ Loop number=%-5d ] %s' % (loop_number, start_snap.get_diff()))

            loop_time = (time.time() - loop_start)
            self.sleep(1.0 - loop_time)

            return

        # #### STEP2: PREPARE and send broks to external queues

        step2_start = time.time()
        # EXTERNAL BROKS
        # We will work this turn with a copy of the broks, so we won't be impacted by the other threads
        with self.broks_lock:
            broks = copy.copy(self.broks)  # type: deque[Brok]
            # 'log' broks are not sent to the external modules
            broks_to_send_to_externals = [b for b in self.external_module_broks if b.type != 'log']

            self.broks = deque()
            self.external_module_broks = deque()
        logger.info('%s %s [PERF] [ %.3f ]s, preparing broks lists for INTERNAL and EXTERNAL modules' % (CHAPTER_MANAGE_BROKS, get_section_string('PREPARING BROKS'), time.time() - step2_start))

        # and for external queues
        # REF: doc/broker-modules.png (3)
        # We put to external queues the broks that were not already sent
        before_external = time.time()
        # We are sending broks as a big list, more efficient than one by one
        modules_and_queues = self.modules_manager.get_external_modules_and_queues()

        elapsed_put_stats = {}
        nb_external_broks_at_start = len(broks_to_send_to_externals)
        nb_external_broks_managed = 0

        while broks_to_send_to_externals:
            now = time.time()
            if now < before_external or now > before_external + 1:  # if got back in time, or too long, skip this turn
                # so we must re-merge our last broks with the main broks to not lose them
                with self.broks_lock:
                    # IMPORTANT: extendleft() inserts the items in REVERSED order, so we reverse them
                    # once before, to keep the original broks order at the head of the queue
                    self.external_module_broks.extendleft(reversed(broks_to_send_to_externals))
                break

            # Take the next batch of max self.manage_brok_sub_process_broks_pusher_queue_batch_size
            batch_size = self.manage_brok_sub_process_broks_pusher_queue_batch_size
            shard_to_send = broks_to_send_to_externals[:batch_size]
            broks_to_send_to_externals = broks_to_send_to_externals[batch_size:]
            self._push_broks_to_external_queues(shard_to_send, elapsed_put_stats)
            nb_external_broks_managed += len(shard_to_send)

        ceil_secure_expected_time = self._get_pusher_wait_time(nb_external_broks_at_start)
        avg_speed = self._get_avg_speed()
        logger.info('%s %s - PUSHED   [ %.3fs, limit=%.3fs ]s, EXTERNAL queue evolution: [ %5d broks => %5d broks remaining ] [ %5d broks managed ] [ Push average speed = %5d broks/s]' % (
            CHAPTER_MANAGE_BROKS, _EXTERNAL_MODULE_STR, time.time() - before_external, ceil_secure_expected_time, nb_external_broks_at_start, len(self.external_module_broks), nb_external_broks_managed, avg_speed))
        if elapsed_put_stats:
            elapsed_puts_sorted = sorted(elapsed_put_stats.keys())
            logger.info('%s                            ----- Sent to [%d] EXTERNAL queues: %s' % (CHAPTER_MANAGE_BROKS, len(modules_and_queues), ', '.join(['(%s=%.3fs)' % (k, elapsed_put_stats[k]) for k in elapsed_puts_sorted])))

        # #### STEP3: MANAGE internal broks
        start_step3 = time.time()

        # Now INTERNAL
        b_types = defaultdict(int)  # number of managed broks, by brok type
        b_times = defaultdict(float)  # time spent in manage_brok(), by brok type

        nb_broks_managed = 0
        nb_broks_at_start = len(broks)

        brok_prepare_time = 0

        internal_modules = self.modules_manager.get_internal_instances()
        have_internal_modules = len(internal_modules) != 0
        if have_internal_modules:
            while len(broks) != 0:
                now = time.time()

                # Do not 'manage' more than 1s, we must get new broks
                # every 1s
                if now - start_step3 > 1:
                    # so we must re-merge our last broks with the main broks to not lose them
                    with self.broks_lock:
                        # IMPORTANT: extendleft() inserts the items in REVERSED order, so we reverse them
                        # once before, to keep the original broks order at the head of the queue
                        self.broks.extendleft(reversed(broks))
                    break

                try:
                    b = broks.popleft()
                except IndexError:  # no more broks, maybe a daemon stop, not a problem, catch it
                    break

                nb_broks_managed += 1
                # We un-serialize the brok before consuming it
                before_prepare = time.time()
                b.prepare()
                brok_prepare_time += (time.time() - before_prepare)
                # Ok, we can get the brok, and do something with it
                # REF: doc/broker-modules.png (4-5)
                before_manage = time.time()
                self.manage_brok(b)
                after_manage = time.time()
                b_types[b.type] += 1
                b_times[b.type] += (after_manage - before_manage)
        else:  # no internal modules, we can fake the result (the copied broks are just dropped)
            nb_broks_managed = len(broks)

        with self.broks_lock:
            nb_remaining_broks = len(self.broks)
        internal_broks_time = time.time() - start_step3
        if have_internal_modules:
            logger.info('%s %s - EXECUTED [ %.3f ]s, INTERNAL queue evolution: [ %5d broks => %5d broks remaining ] [ %5d broks managed ]' % (
                CHAPTER_MANAGE_BROKS, _INTERNAL_MODULE_STR, internal_broks_time, nb_broks_at_start, nb_remaining_broks, nb_broks_managed))

        sorted_modules_stats = sorted(self.local_module_stats.keys())
        if sorted_modules_stats:
            logger.info('%s                            ----- Details of INTERNAL brok deserialization time:%.3fs modules execution time: %s' % (
                CHAPTER_MANAGE_BROKS, brok_prepare_time, ', '.join(['(%s=%.3fs)' % (k, self.local_module_stats[k]) for k in sorted_modules_stats])))

        self.manage_inter_daemon_messages()

        # #### STEP4: get modules objects and call modules ticks
        # Maybe external modules raised 'objects', we should get them
        self.get_objects_from_from_queues()

        # Say to modules it's a new tick :)
        self.hook_point('tick')

        self.print_modules_stats(b_times, b_types, nb_broks)

        loop_time = (time.time() - loop_start)

        self.loop_stop_logger.info('[ Loop number=%-5d ] [PERF] [ %.3f ]s' % (loop_number, loop_time))
        self.loop_stop_logger.debug('[ Loop number=%-5d ] %s' % (loop_number, start_snap.get_diff()))
        self.sleep(1.0 - loop_time)
    
    
    def compute_real_address(self, daemon_address):
        # type: (str) -> None
        """Store in self.computed_address the address to use for this daemon.

        A loopback name ('localhost' or '127.0.0.1') is replaced by the address the local
        host name resolves to; any other value is stored unchanged.
        """
        is_loopback = daemon_address in ('localhost', '127.0.0.1')
        self.computed_address = socket.gethostbyname(socket.gethostname()) if is_loopback else daemon_address
    
    
    def print_daemon_start(self):
        """Do the parent class start printing, then log 'Broker is starting' on the loading configuration logger."""
        super(Broker, self).print_daemon_start()
        LoggerFactory.get_loading_configuration_logger().info('Broker is starting')
    
    
    #  Main function, will loop forever
    def main(self):
        """Entry point of the broker daemon.

        Loads the configuration file, starts the daemon and its modules manager, resets the
        shared 'configuration for modules' item, waits for the initial configuration and then
        enters the main loop. When no configuration was received the method just returns.

        Any Exception is logged as CRITICAL with its stack, then the process is killed with
        exit code 2.
        """
        try:
            self.load_config_file()

            for header_line in self.get_header():
                logger.info(header_line)
            self.daily_log_version()

            working_directory = os.path.abspath(self.workdir)
            logger.info('[ broks ] Using working directory: %s' % working_directory)

            # Check if we are enabled or not; if so, start the daemon mode
            self.look_for_early_exit()
            self.do_daemon_init_and_start()
            self.print_daemon_start()
            self.load_modules_manager()

            # Start with an empty shared item (presumably read by the modules -- to be confirmed)
            shared_configuration = ShareItem(key_name='shinken_broker_daemon_configuration_for_modules', reinit=True)
            self.daemon_configuration_for_modules = shared_configuration
            with shared_configuration:
                shared_configuration.configuration_incarnation = None
                shared_configuration.all_monitoring_configuration_part = []
                shared_configuration.schedulers = []

            # Nothing to do without an initial configuration
            self.wait_for_initial_conf()
            if self.new_conf:
                self.setup_new_conf()

                # Now the main loop
                self.do_mainloop()
        except Exception:
            for critical_message in (
                    'The daemon had an unrecoverable error. It must exit.',
                    'You can log a bug to your Shinken integrator with the error message:',
            ):
                logger.critical(critical_message)
            logger.print_stack(level=logging.CRITICAL)
            os._exit(2)  # noqa => force process exit, skip all hooks
            # os._exit() does not return: this line is only reached if os._exit has been replaced
            raise
    
    
    def do_after_fork_cleanup(self, after_fork_new_top_instance):
        """SEF-9050: free, after a fork, the resources inherited from the father daemon.

        This runs in the son process: it is no more the daemon process, so the loggers, the
        watchdog, the satellites links, the broks queues and the configuration of the father
        are useless here and are reset to empty values.

        :param after_fork_new_top_instance: passed unchanged to the parent class cleanup
        """
        # Loggers and main loop watchdog of the father
        self.broker_time_logger = self.loop_start_logger = self.loop_stop_logger = None
        self.get_brok_logger = self.protocol_logger = self.need_data_logger = None
        self.modules_logger = self.mainloop_watchdog = None

        super(Broker, self).do_after_fork_cleanup(after_fork_new_top_instance)

        # Satellites: each kind gets its own new empty dict
        for satellite_kind in ('arbiters', 'pollers', 'reactionners', 'receivers', 'schedulers'):
            setattr(self, satellite_kind, {})

        # Modules, external commands and broks
        self.modules = None
        self.external_commands = []
        self.broks = self.external_module_broks = None
        self.broks_internal_raised = []
        self.arbiter_broks = []

        # Statistics
        self.local_module_stats = {}
        self.stats_time_sla = self.stats_nb_sla = None
        self._tmp_bucket = {}

        # Configuration
        self.new_conf = self.cur_conf = None
