#!/usr/bin/python
# -*- coding: utf-8 -*-

# Copyright (C) 2009-2017:
#    Gabes Jean, naparuba@gmail.com
#    Gerhard Lausser, Gerhard.Lausser@consol.de
#    Gregory Starck, g.starck@gmail.com
#    Hartmut Goebel, h.goebel@goebel-consult.de
#
# This file is part of Shinken.
#
# Shinken is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Shinken is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with Shinken.  If not, see <http://www.gnu.org/licenses/>.


import base64
import cPickle
import copy
import json
import multiprocessing
import os
import signal
import socket
import sys
import threading
import time
import traceback
import urllib2
import zlib
from collections import defaultdict, deque
from multiprocessing import active_children

from shinken.basesubprocess import LookAtMyFatherThread
from shinken.daemon import Interface
from shinken.external_command import ExternalCommand
from shinken.http_client import HTTPException, HTTPExceptions
from shinken.load import AvgInRange
from shinken.log import logger, get_chapter_string, get_section_string
from shinken.property import IntegerProp, PathProp
from shinken.util import mem_wait_for_fork_possible, set_process_name, sort_by_ids
from shinken.withinventorysatellite import IArbiterToInventorySatellite, WithInventorySatellite
from shinken.vmware_stats import vmware_stats_reader
from shinkensolutions.lib_checks.graphite import GRAPHITE_API_VERSION, GRAPHITE_STATS_KEY

try:
    from statsd import StatsClient
    
    statsd = StatsClient()
except ImportError:
    statsd = None

# Default values of the Broker.manage_brok_sub_process_broks_pusher_* settings.
# Broker.__init__ uses them as initial values and really_setup_new_conf() uses
# them as fallback when the arbiter configuration does not provide the matching key.
DEFAULT_INJECTOR_PUSHER_WAIT_TIME = 240  # default of ..._broks_pusher_max_execution_timeout (presumably seconds - TODO confirm)
DEFAULT_MIN_PUSHER_WAIT_TIME = 5  # default of ..._broks_pusher_min_execution_timeout (presumably seconds - TODO confirm)
DEFAULT_PUSHER_SECURITY_RATIO_TIME = 5  # default of ..._broks_pusher_security_ratio
DEFAULT_PUSHER_MAX_RETRY = 3  # default of ..._broks_pusher_max_retry

# Strings for log chapters (first prefix of the log lines written in this file)
_MANAGE_BROKS_STR = get_chapter_string('MANAGE BROKS')
_GET_BROKS_STR = get_chapter_string('GET BROKS')
_RECEIVE_BROKS_STR = get_chapter_string('RECEIVE BROKS')
_BROKER_TIME_STR = get_chapter_string('BROKER TIME')
_MODULES_STR = get_chapter_string('MODULES')

# Strings for log sections (second prefix of the log lines, after the chapter)
_EXTERNAL_MODULE_STR = get_section_string('EXTERNAL MODULE')
_INTERNAL_MODULE_STR = get_section_string('INTERNAL MODULE')
_NEED_DATA_STR = get_section_string('NEED DATA')
_LOOP_START_STR = get_section_string('=== Loop start ===')
_LOOP_STOP_STR = get_section_string('=== Loop stop  ===')
_EXTERNAL_COMMANDS_STR = get_section_string('EXTERNAL COMMANDS')

# Source label given to add_broks_to_queue() for the broks pushed by the arbiter;
# it selects the RECEIVE BROKS chapter instead of the GET BROKS one in the logs.
_ARBITER_AS_BROKS_SOURCE = 'arbiter'


class IStatsBroker(Interface):
    """
    Interface exposing statistics about the broker activity.
    
    Each exposed method gets a `doc` attribute (its description) and a
    `need_lock` attribute set to False.
    """
    
    doc = 'Return a connexion status from my network location to all graphite server I should write to (graphite_perfdata type modules).'
    
    
    def check_graphite_write_status(self):
        """Delegate to the broker application and return its reply unchanged."""
        return self.app.check_graphite_write_status()
    
    
    check_graphite_write_status.doc = doc
    check_graphite_write_status.need_lock = False
    
    doc = 'Return a connexion status from my network location to all graphite server I should read from (webui type modules).'
    
    
    def check_graphite_read_status(self):
        """Delegate to the broker application and return its reply unchanged."""
        return self.app.check_graphite_read_status()
    
    
    check_graphite_read_status.doc = doc
    check_graphite_read_status.need_lock = False
    
    doc = 'Get raw stats from the daemon'
    
    
    def get_raw_stats(self, param=''):
        """
        Return the parent interface raw stats completed with the broker ones.
        
        The whole collection is done while holding the broker satellite lock.
        Added keys: queue lengths, one entry per external module queue,
        the module stats, and some state flags of the broker.
        """
        broker = self.app
        with broker.satellite_lock:
            logger.info('Query broker running stats')
            stats = super(IStatsBroker, self).get_raw_stats(param)
            stats.update({'modules': []})
            
            # Sizes of the broker queues; broks and arbiter_broks are read under their own lock
            stats['len_external_commands'] = len(broker.external_commands)
            with broker.broks_lock:
                stats['len_broks'] = len(broker.broks)
            stats['len_internal_broks'] = len(broker.broks_internal_raised)
            with broker.arbiter_broks_lock:
                stats['len_arbiter_broks'] = len(broker.arbiter_broks)
            
            # One entry per external module; -1 means that the queue size cannot be read
            external_modules = [module for module in broker.modules_manager.get_all_instances() if module.is_external]
            for module in external_modules:
                try:
                    queue_entry = {'module_name': module.get_name(), 'queue_size': module.to_q.qsize()}
                except Exception:
                    queue_entry = {'module_name': module.get_name(), 'queue_size': -1}
                stats['modules'].append(queue_entry)
            
            stats['module_stats'] = self._get_module_stats(getattr(broker, 'modules_manager', None), param)
            stats['http_errors_count'] = broker.http_errors_count
            stats['have_conf'] = broker.cur_conf is not None
            stats['activated'] = broker.activated
            stats['spare'] = broker.spare
            stats['last_too_long_injector_time'] = broker.last_too_long_injector_time
            stats['known_realms'] = getattr(broker, 'known_realms', None)
            return stats
    
    
    get_raw_stats.doc = doc
    get_raw_stats.need_lock = False


# Our main APP class
class Broker(WithInventorySatellite):
    # Daemon properties: start from a copy of the parent ones (so the parent dict is not
    # modified) and add / override the broker specific entries.
    properties = WithInventorySatellite.properties.copy()
    properties.update({
        'pidfile'                         : PathProp(default='brokerd.pid'),
        'port'                            : IntegerProp(default='7772'),
        'local_log'                       : PathProp(default='brokerd.log'),
        'external_module_queue_batch_size': IntegerProp(default='1000000'),
        'max_file_descriptor_limit'       : IntegerProp(default='0'),  # the broker need to set to maximum value available on the system
    })
    
    # Daemon type string, given to the parent constructor by __init__
    daemon_type = 'broker'
    
    
    def __init__(self, config_file, is_daemon, do_replace, debug, debug_file, profile='', daemon_id=0):
        """
        Build the broker daemon and create its (empty) runtime state.
        
        All the parameters but `profile` are forwarded to the parent constructor,
        together with the class daemon_type. `profile` is not used by this method.
        """
        super(Broker, self).__init__(self.__class__.daemon_type, config_file, is_daemon, do_replace, debug, debug_file, daemon_id)
        
        # Our arbiters
        self.arbiters = {}
        
        # Our pollers, reactionners and receivers
        self.pollers = {}
        self.reactionners = {}
        self.receivers = {}
        
        # Modules are loaded only one time
        self.have_modules = False
        
        # Can have a queue of external_commands given by modules
        # will be processed by arbiter
        self.external_commands = []
        
        # All broks to manage
        self.broks = deque()  # broks to manage, note: WILL overwrite basesatellite value
        self.external_module_broks = deque()  # broks managed by external modules
        self.broks_lock = threading.RLock()  # as satellites will be accessed by threads, protect our broks list by a lock
        # broks raised this turn and that needs to be put in self.broks
        self.broks_internal_raised = []
        # broks raised by the arbiters, we need a lock so the push can be in parallel
        # to our current activities and won't lock the arbiter
        self.arbiter_broks = []
        self.arbiter_broks_lock = threading.RLock()
        
        self.timeout = 1.0
        
        # Register the interfaces: broker stats and arbiter -> inventory satellite
        self._add_http_interface(IStatsBroker(self))
        self._add_http_interface(IArbiterToInventorySatellite(self))
        
        self.local_module_stats = {}  # module name -> cumulated time spent in its manage_brok (see Broker.manage_brok)
        self.stats_time_sla = None
        self.stats_nb_sla = None
        self.broks_done = 0  # number of broks given to manage_brok
        self.last_too_long_injector_time = 0.0  # exposed as is by IStatsBroker.get_raw_stats
        
        # We will have a thread by distant satellite, so we must protect our access
        self.satellite_lock = threading.RLock()
        
        self._tmp_bucket = {}  # used by injector workers to inherit jobs during the fork()
        
        # Memory protection: we will have parameters to choose if we do memory protection or not
        # (these values are replaced by the ones from the configuration in really_setup_new_conf)
        self.manage_brok_enable_sub_processes_memory_usage_protection = False
        self.manage_brok_sub_process_memory_usage_system_reserved_memory = 0
        self.manage_brok_sub_processes_memory_usage_protection_max_retry_time = 5
        
        # Broks pusher settings, also replaced by the configuration values in really_setup_new_conf
        self.manage_brok_sub_process_broks_pusher_max_execution_timeout = DEFAULT_INJECTOR_PUSHER_WAIT_TIME
        self.manage_brok_sub_process_broks_pusher_security_ratio = DEFAULT_PUSHER_SECURITY_RATIO_TIME
        self.manage_brok_sub_process_broks_pusher_min_execution_timeout = DEFAULT_MIN_PUSHER_WAIT_TIME
        self.manage_brok_sub_process_broks_pusher_max_retry = DEFAULT_PUSHER_MAX_RETRY
        # NOTE(review): the original comment names manage_brok_sub_process_broks_pusher_max_wait_time, an attribute
        # that is not defined here; it presumably means ..._broks_pusher_max_execution_timeout - TODO confirm
        self.avg_brok_send_speed = AvgInRange(300)  # 0 broks/s by default, so we will take manage_brok_sub_process_broks_pusher_max_wait_time for wait time
    
    
    # BEWARE: you should have the lock to call this one
    def _find_scheduler_by_shard_id(self, shard_id):
        for scheduler in self.schedulers.values():
            if scheduler['instance_id'] == shard_id:
                return scheduler
        return None
    
    
    # Schedulers have some queues. We can simplify the call by adding
    # elements into the proper queue just by looking at their type
    # Brok -> self.broks
    # External commands -> self.external_commands
    def add(self, elt):
        cls_type = elt.__class__.my_type
        if cls_type == 'brok':
            # Like this brok was set by scheduler 0
            elt.instance_id = 0
            self.broks_internal_raised.append(elt)
            return
        elif cls_type == 'externalcommand':
            logger.debug("[broker][add_broks] Enqueuing an external command '%s'" % str(ExternalCommand.__dict__))
            self.external_commands.append(elt)
        elif cls_type == 'message':
            # Maybe we got a Message from the modules, it's way to ask something like from now a full data from a scheduler for example.
            logger.debug('[broker][add_broks] message [%s]' % str(elt.__dict__))
            if elt.get_type() == 'NeedData':
                data = elt.get_data()
                # Full instance id means: I got no data for this scheduler so give me all dumbass!
                if 'full_instance_id' in data:
                    shard_id = data['full_instance_id']
                    source = elt.source
                    with self.satellite_lock:
                        scheduler = self._find_scheduler_by_shard_id(shard_id)
                        # Maybe the arbiter did JUST remove the scheduler during the fact that the module get it, and now
                        # it's quite rare, but not an error because as soon as the scheduler will be given back by the arbiter
                        # we will have a full broks generation, so we don't have to "force" it
                        if scheduler is None:
                            logger.warning(
                                "%s %s [ %s ] The module is asking need to have all the elements from a scheduler. It is asking for shard (%s) but we cannot find it in our schedulers, because the arbiter did ask us to remove it. We will give it to the module as soon as the arbiter will give us this scheduler." % (
                                    _MODULES_STR, _NEED_DATA_STR, source, shard_id))
                            logger.warning("%s %s [ %s ] Current schedulers are: %s" % (_MODULES_STR, _NEED_DATA_STR, source, ', '.join(['%s(shard=%s)' % (sched['name'], sched['instance_id']) for sched in self.schedulers.values()])))
                            return
                    # Reset the scheduler, so it will ask for a full initial brok generation
                    scheduler['con'] = None
                    scheduler['daemon_incarnation'] = 0
                    
                    logger.info("%s %s Requesting scheduler '%s' (managing shard %s) initial broks." % (_GET_BROKS_STR, _NEED_DATA_STR, scheduler['name'], shard_id))
            
            elif elt.get_type() == 'ICrash':
                # Maybe a module tells me that it's dead, I must log it's last words...
                # The module death will be looked for elsewhere and restarted.
                data = elt.get_data()
                logger.error('the module %s just crash! Please look at the traceback:' % data['name'])
                if data['trace']:
                    for line in data['trace'].splitlines():
                        logger.error(line)
    
    
    # Check that we do not try to connect too often to this element
    def is_connection_try_too_close(self, elt):
        """Return True when elt['last_connection'] is less than 5 seconds old, False otherwise."""
        elapsed_since_last_try = time.time() - elt['last_connection']
        return elapsed_since_last_try < 5
    
    
    ##
    # Iterate over all graphite servers I should write to and return connexion status.
    # They are defined in my graphite_perfdata type modules.
    def check_graphite_write_status(self):
        reply = {'status': 'OK', 'data': []}
        
        if not self.cur_conf:
            reply['status'] = 'NO_CONF'
            reply['data'].append({
                'module_name'  : None,
                'host'         : None,
                'port'         : None,
                'can_post_data': False
            })
            return reply
        
        graphite_mods = [mod for mod in self.cur_conf['global']['modules'] if mod.module_type == "graphite_perfdata"]
        if not graphite_mods:
            reply['status'] = 'NO_MODULE'
        for mod in graphite_mods:
            host = mod.host
            port = mod.port
            can_post_data = Broker.graphite_service_is_alive(host, port)
            
            reply['data'].append({
                'module_name'  : mod.module_name,
                'host'         : host,
                'port'         : port,
                'can_post_data': can_post_data})
        
        return reply
    
    
    ##
    # Iterate over all graphite servers I should read from and return connexion status.
    # They are defined in my webui type modules.
    def check_graphite_read_status(self):
        reply = []
        
        if not self.cur_conf:
            status = {
                'module_name': None,
                'host'       : None,
                'realm'      : None,
                'nb_metrics' : 0,
                'reachable'  : False,
                'version'    : None
            }
            reply.append(status)
            return reply
        
        webui_modules = [mod for mod in self.cur_conf['global']['modules'] if mod.module_type == "webui"]
        webui_and_graphite_threads = []
        for webui_module in webui_modules:
            graphite_backends = [g.strip() for g in webui_module.graphite_backends.split(',') if g.strip()]
            use_ssl = webui_module.use_ssl in ['1', True]
            graphs_errors = {}
            thread_name = 'webui-checks-errors-thread-%s' % webui_module.module_name
            webui_thread = threading.Thread(None, target=Broker.webui_graphs_errors, name=thread_name, args=('127.0.0.1', webui_module.port, use_ssl, graphs_errors))
            webui_and_graphite_threads.append(webui_thread)
            
            for graphite_backend in graphite_backends:
                graphite_backend_def = graphite_backend.split(':')
                if len(graphite_backend_def) != 2:
                    status = {
                        'module_name': webui_module.module_name,
                        'host'       : None,
                        'realm'      : None,
                        'nb_metrics' : 0,
                        'reachable'  : False,
                        'errors'     : graphs_errors,
                        'version'    : None
                    }
                    reply.append(status)
                    # TODO: Error management if graphite_backend value is not correct
                    continue
                
                realm = graphite_backend_def[0].strip()
                host = graphite_backend_def[1].strip()
                
                status = {
                    'module_name': webui_module.module_name,
                    'host'       : host,
                    'realm'      : realm,
                    'errors'     : graphs_errors,
                    'version'    : None
                }
                reply.append(status)
                thread_name = 'graphite-backend-thread-%s' % host
                graphite_backend_thread = threading.Thread(None, target=Broker.graphite_ui_data, name=thread_name, args=(host, status))
                webui_and_graphite_threads.append(graphite_backend_thread)
        
        for t in webui_and_graphite_threads:
            t.daemon = True
            t.start()
        
        for t in webui_and_graphite_threads:
            t.join()
        
        return reply
    
    
    @staticmethod
    def graphite_ui_data(host, status, retry=4):
        """
        Fill `status` in place with the metrics counters of the graphite backend `host`.
        
        The backend version is first read with Broker.get_graphite_version (which sets
        status['reachable'] and status['version']). If that version is not GRAPHITE_API_VERSION,
        nothing more is done. Otherwise 'http://<host>/metrics/get-metrics-count' is queried,
        at most `retry` times (at least once), with a 3s timeout by try:
        * on success status gets 'nb_metrics', 'nb_hosts_clusters' and the GRAPHITE_STATS_KEY
          TIME_READ / LOCAL_TIME values;
        * when all the tries did fail, 'nb_metrics' and 'nb_hosts_clusters' are set to 0.
        
        Always returns None: the result is only given through `status`.
        """
        Broker.get_graphite_version(host, status, retry)
        if status['version'] != GRAPHITE_API_VERSION:
            return
        while True:
            retry -= 1
            try:
                resp = urllib2.urlopen('http://%s/metrics/get-metrics-count' % host, timeout=3)
                try:
                    buf = resp.read()
                finally:
                    # Always give back the connection, even if the read did fail
                    resp.close()
                jval = json.loads(buf)
                status['nb_metrics'] = jval.get('metrics', None)
                status['nb_hosts_clusters'] = jval.get('level_0', None)
                status[GRAPHITE_STATS_KEY.TIME_READ] = jval.get(GRAPHITE_STATS_KEY.TIME_READ, -1)
                status[GRAPHITE_STATS_KEY.LOCAL_TIME] = jval.get(GRAPHITE_STATS_KEY.LOCAL_TIME, None)
                return
            except Exception:
                # <= and not ==: a retry given as 0 (or less) must not loop forever
                if retry <= 0:
                    status['nb_metrics'] = 0
                    status['nb_hosts_clusters'] = 0
                    return
    
    
    @staticmethod
    ## Will return None in case of failure, or json reply.
    def get_graphite_version(host, status, retry=4):
        while True:
            try:
                retry -= 1
                resp = urllib2.urlopen('http://%s/version/' % host, timeout=3)
                buf = resp.read()
                status['reachable'] = True
                status['version'] = buf.strip()
                return
            except Exception as e:
                if retry == 0:
                    status['reachable'] = False
                    status['version'] = None
                    return
                else:
                    continue
    
    
    @staticmethod
    ## Will return None in case of failure, or json reply.
    def webui_graphs_errors(host, port, use_ssl, graphs_errors, retry=4):
        while True:
            try:
                retry -= 1
                proto = 'https' if use_ssl else 'http'
                resp = urllib2.urlopen('%s://%s:%s/api/graphs/errors' % (proto, host, port), timeout=3)
                html = resp.read()
                jval = json.loads(html)
                graphs_errors.update(jval)
                return
            except Exception as err:
                if retry == 0:
                    return
                else:
                    continue
    
    
    @staticmethod
    def graphite_service_is_alive(host, port, retry=4):
        sock = None
        while True:
            try:
                retry -= 1
                sock = socket.create_connection((str(host), int(port)), 0.5)
                return True
            except Exception as exp:
                if retry == 0:
                    logger.warning("graphite_service_is_alive failed with exp: [%s]" % exp)
                    return False
                else:
                    continue
            finally:
                if sock is not None:
                    sock.close()
    
    
    # Get a brok. Our role is to put it in the modules
    # DO NOT CHANGE data of b!!!
    # REF: doc/broker-modules.png (4-5)
    def manage_brok(self, b):
        # Call all modules if they catch the call
        # data = b.data
        # if 'host' in b.type and 'group' not in b.type and "schedule" not in b.type:
        #         update = {"btype": b.type, "now": int(time.time()), "diff": int(time.time() - data.get('last_state_change',0))}
        #         update.update(data)
        #         try:
        #             logger.debug("[BROKSLE] %(now)s ; %(btype)20s ; %(host_name)10s ; %(last_state_id)s=>%(last_state_as_string)-10s ; %(state_id)s=>%(state)-10s ; %(last_state_change)s ; -%(diff)s\n" % update)
        #         except KeyError:
        #             logger.debug('[BROKSLE] ERROR for brok type %s' % b.type)
        
        self.broks_done += 1
        for mod in self.modules_manager.get_internal_instances():
            try:
                # Unserialize if need, but outside the module time (it's common to all modules)
                b.prepare()
                
                before = time.time()
                mod.manage_brok(b)
                mod_name = mod.get_name()
                if mod_name not in self.local_module_stats:
                    self.local_module_stats[mod_name] = 0
                self.local_module_stats[mod_name] += time.time() - before
            except Exception as exp:
                logger.debug(str(exp.__dict__))
                logger.warning("The mod %s raise an exception: %s, I'm tagging it to restart later" % (mod.get_name(), str(exp)))
                logger.warning("Exception type: %s" % type(exp))
                logger.warning("Back trace of this kill: %s" % (traceback.format_exc()))
                self.modules_manager.did_crash(mod, "The mod %s raise an exception: %s" % (mod.get_name(), str(exp)))
    
    
    # Add broks (a tab) to different queues for internal and external modules
    def add_broks_to_queue(self, broks, source, duration=0):
        # If we are idle (spare waiting to start) we can't handle brok
        if not self.activated:
            return
        
        if len(broks) == 0:
            return
        
        log_label = _RECEIVE_BROKS_STR if source == _ARBITER_AS_BROKS_SOURCE else _GET_BROKS_STR  # if we did get or not broks, we change the log
        
        # Ok now put in queue broks to be managed by
        # internal modules
        with self.broks_lock:
            self.broks.extend(broks)
            self.external_module_broks.extend(broks)
            logger.info(
                '%s %s [PERF] [ %.3f ]s - Add %4s broks into INTERNAL queue (new size=%s) and the EXTERNAL queue (new size=%s)' % (log_label, get_section_string(source), duration, len(broks), len(self.broks), len(self.external_module_broks)))
        number_by_types = {}
        for brok in broks:
            btype = brok.type
            number_by_types[btype] = number_by_types.get(btype, 0) + 1
        btypes = number_by_types.keys()
        btypes.sort()
        s_printed = ', '.join(['%s=%s' % (k, number_by_types[k]) for k in btypes])
        logger.info('%s %s                   ----- %4s composed of: %s' % (log_label, get_section_string(source), len(broks), s_printed))
    
    
    # We will put in the broks list the broks from the arbiters,
    # but as the arbiter_broks list can be pushed by the arbiter without the global lock,
    # we must protect this with the list lock
    def interger_arbiter_broks(self):
        """
        Move the broks pushed by the arbiter into the broker queues.
        
        Done under self.arbiter_broks_lock: the pending list is given to add_broks_to_queue
        (with the arbiter as source), then replaced by a new empty list.
        """
        with self.arbiter_broks_lock:
            pending_broks = self.arbiter_broks
            self.add_broks_to_queue(pending_broks, _ARBITER_AS_BROKS_SOURCE)
            self.arbiter_broks = []
    
    
    # Get 'objects' from external modules
    # right now on nobody uses it, but it can be useful
    # for modules like livestatus to raise external
    # commands for example
    def get_objects_from_from_queues(self):
        """
        Read all the objects available in the "from" queues of the external modules.
        
        Each object is given to self.add(). A queue raising an error (while reading it, or while
        adding the object) is skipped with a warning log. The number of objects and the spent
        time are logged; the number of objects that were read is returned.
        """
        started_at = time.time()
        received = 0
        for from_queue in self.modules_manager.get_external_from_queues():
            try:
                while not from_queue.empty():
                    self.add(from_queue.get(block=False))
                    received += 1
            except Exception as exp:
                logger.warning('Cannot get objects from a module (%s). skipping it.' % exp)
        elapsed = time.time() - started_at
        logger.info('%s %s [PERF] [ %.3f ]s Did read %d external commands (like recheck, set acknowledge, etc) from modules' % (_MODULES_STR, _EXTERNAL_COMMANDS_STR, elapsed, received))
        return received
    
    
    # For a new distant daemon, if it is a scheduler, ask for a new full broks generation
    def _manage_new_distant_daemon_incarnation(self, entry, old_incar, new_incar):
        daemon_type = entry['type']
        entry['broks'].clear()
        con = entry['con']
        if con is None:  # maybe another thread did close the connection on a new configuration
            entry['daemon_incarnation'] = 0  # reset it so we will get back in this method on the next loop
            return
        # we must ask for a new full broks if  it's a scheduler
        if daemon_type == 'scheduler':
            logger.info("%s %s [ %s ] I ask for a initial broks generation to the scheduler with new daemon incarnation %s (old incarnation was %s)" % (_GET_BROKS_STR, _NEED_DATA_STR, entry['name'], new_incar, old_incar))
            try:
                did_generate_raw = con.get('fill_initial_broks', {'bname': self.name}, wait='long')
            except HTTPExceptions as exp:  # If fail (as error 500 or somethign like this) it means that the
                # scheduler did fail to generate broks, it's a huge problem and we must restart this
                msg = 'The scheduler %s did fail to generate initial broks data with an error: %s. we will retry it' % (entry['name'], exp)
                entry['daemon_incarnation'] = 0
                raise HTTPException(msg)
            # maybe the scheduler was not ready, if so, retry it
            # NOTE: 'true' or even ''/None is OK, just means the scheduler is old
            if did_generate_raw == 'false':
                msg = 'The scheduler %s did fail to generate initial broks data in this turn, retrying to ask it' % entry['name']
                entry['daemon_incarnation'] = 0
                raise HTTPException(msg)
    
    
    # We get new broks from schedulers
    # REF: doc/broker-modules.png (2)
    def get_new_broks(self, sat_entry):
        # If we are idle (spare waiting to start) we do not ask our satellites new broks
        if not self.activated:
            time.sleep(0.5)
            return
        # We check for new check in each schedulers and put the result in new_checks
        sat_type = sat_entry['type']
        try:
            con = sat_entry['con']
            if con is not None:  # None = not initialized
                before = time.time()
                # Before ask a call that can be long, do a simple ping to be sure it is alive
                con.get('ping')
                tmp_broks = con.get('get_broks', {'bname': self.name}, wait='long')
                try:
                    _t = base64.b64decode(tmp_broks)
                    _t = zlib.decompress(_t)
                    tmp_broks = cPickle.loads(_t)
                except (TypeError, zlib.error, cPickle.PickleError), exp:
                    logger.error('[broks][get_new_broks] Fail to load broks data from %s %s with : [%s]' % (sat_type, sat_entry['name'], exp))
                    sat_entry['con'] = None
                    return
                
                for b in tmp_broks.values():
                    b.instance_id = sat_entry['instance_id']
                    b.part_configuration_incarnation._part_id = sat_entry['instance_id']
                    
                    # Ok, we can add theses broks to our queues
                tmp_broks = tmp_broks.values()
                tmp_broks.sort(sort_by_ids)
                self.add_broks_to_queue(tmp_broks, sat_entry['name'], time.time() - before)
            
            else:  # no con? make the connection
                self.pynag_con_init(sat_entry)
        # Ok, con is not known, so we create it
        except KeyError as exp:
            logger.info("[broks][get_new_broks] We fail to get new broks from %s %s with: [%s]. Connection must not be initialize. It will be initialize." % (sat_type, sat_entry['name'], str(exp)))
            self.pynag_con_init(sat_entry)
        except HTTPExceptions as exp:
            logger.warning("[broks][get_new_broks] We fail to get new broks from %s %s with: [%s]. Connection fail and it will be reinitialize." % (sat_type, sat_entry['name'], str(exp)))
            sat_entry['con'] = None
        # scheduler must not #be initialized
        except AttributeError as exp:
            logger.warning("[broks][get_new_broks] We fail to get new broks from %s %s with: [%s]. The %s must be initialized." % (sat_type, sat_entry['name'], str(exp), sat_type))
        except Exception as exp:
            logger.error("[broks][get_new_broks] We fail to get new broks from %s %s with: [%s]. Broker will be kill." % (sat_type, sat_entry['name'], str(exp)))
            logger.print_stack()
            sys.exit(1)
    
    
    # Helper function for module, will give our broks
    def get_retention_data(self):
        """Retention helper for the modules: the broker gives nothing, an empty dict is returned."""
        nothing_to_save = dict()
        return nothing_to_save
    
    
    # Get back our broks from a retention module
    def restore_retention_data(self, data):
        """Retention helper for the modules: `data` is ignored, nothing is restored."""
        return None
    
    
    def do_stop(self):
        """
        Stop the daemon: terminate our sub processes, then call the parent do_stop.
        
        The termination of the children is a best effort: each child is terminated and joined
        (1s max) on its own, so an error on one child is logged and does not prevent
        the termination of the other ones, nor the parent do_stop.
        """
        self.print_log_block('Stopping daemon')
        try:
            children = active_children()
        except Exception as exp:
            logger.warning('Cannot list the sub processes to stop: %s' % exp)
            children = []
        for child in children:
            try:
                child.terminate()
                child.join(1)
            except Exception as exp:
                logger.warning('Cannot stop the sub process %s: %s' % (child.name, exp))
        super(Broker, self).do_stop()
    
    
    # Set up a new conf, but beware about the global lock management.
    # Note: do not do any locking here, as we have the satellite lock!
    def setup_new_conf(self):
        with self.satellite_lock:
            self.really_setup_new_conf()
    
    
    def _find_previous_daemon_by_uri(self, into, uri):
        for (_id, daemon_entry) in into.iteritems():
            if daemon_entry['uri'] == uri:
                return _id
        return None
    
    
    @staticmethod
    def _get_daemon_uri(daemon):
        proto = 'https' if daemon['use_ssl'] else 'http'
        uri = '%s://%s:%s/' % (proto, daemon['address'], daemon['port'])
        return uri
    
    
    def really_setup_new_conf(self):
        """Apply the configuration currently stored in self.new_conf.

        The pending configuration is consumed (self.new_conf is reset to None) and kept as self.cur_conf. Then:
        * the daemon name, the vmware stats switch, the `manage_brok_*` tuning attributes and the log format
          are read from conf['global'];
        * if the configuration is not 'activated': the previous run is cleaned, arbiters are forgotten,
          all modules are stopped and the method returns;
        * otherwise the self.schedulers and self.arbiters entries are (re)built, and when there is at least
          one scheduler the modules are loaded (first time) or updated, the timezone is set and the
          satellite threads are checked.
        """
        self.print_log_block('Loading a configuration from the arbiter')
        t0 = time.time()
        # Consume the pending configuration: it becomes the current one
        conf = self.new_conf
        self.new_conf = None
        self.cur_conf = conf
        # Got our name from the globals
        g_conf = conf['global']
        if 'broker_name' in g_conf:
            name = g_conf['broker_name']
        else:
            name = 'Unnamed broker'
        self.name = name
        logger.load_obj(self, name)
        
        # Let the vmware stats part know if it's enabled or not. Can change while running
        vmware_stats_reader.set_enabled(g_conf.get('vmware__statistics_compute_enable', True))
        
        # Memory protection settings, read back when forking the broks pusher sub processes
        self.manage_brok_enable_sub_processes_memory_usage_protection = g_conf.get('broker__manage_brok__enable_sub_processes_memory_usage_protection', True)
        self.manage_brok_sub_process_memory_usage_system_reserved_memory = g_conf.get('broker__manage_brok__sub_process_memory_usage_system_reserved_memory', 0)
        self.manage_brok_sub_processes_memory_usage_protection_max_retry_time = g_conf.get('broker__manage_brok__sub_processes_memory_usage_protection_max_retry_time', 5)
        
        # Timeouts, security ratio and max retry used to bound the wait for the broks pusher sub processes
        self.manage_brok_sub_process_broks_pusher_max_execution_timeout = g_conf.get('broker__manage_brok__sub_process_broks_pusher_max_execution_timeout', DEFAULT_INJECTOR_PUSHER_WAIT_TIME)
        self.manage_brok_sub_process_broks_pusher_security_ratio = g_conf.get('broker__manage_brok__sub_process_broks_pusher_security_ratio', DEFAULT_PUSHER_SECURITY_RATIO_TIME)
        self.manage_brok_sub_process_broks_pusher_min_execution_timeout = g_conf.get('broker__manage_brok__sub_process_broks_pusher_min_execution_timeout', DEFAULT_MIN_PUSHER_WAIT_TIME)
        self.manage_brok_sub_process_broks_pusher_max_retry = g_conf.get('broker__manage_brok__sub_process_broks_pusher_max_retry', DEFAULT_PUSHER_MAX_RETRY)
        
        # Look at human log format. Enabled by default
        logger.set_human_format(on=g_conf.get('human_timestamp_log', True))
        
        self.save_daemon_name_into_configuration_file(name)
        
        # The arbiter lets us know about the realms that are allowed to talk to us.
        # It also lets us know if a realm that was present before did disappear and so needs to be deleted
        self.known_realms = conf['known_realms']
        
        logger.info("[broker][configuration] new configuration received")
        self.activated = conf.get('activated', True)
        self.spare = g_conf.get('spare', False)
        if not self.activated:
            # I'm not activated AKA spare, make some clean and log that
            self.clean_previous_run()
            self.arbiters.clear()
            
            logger.info('Stopping all modules')
            self.modules_manager.stop_all()
            self.have_modules = False
            
            logger.info("[broker][configuration] Configuration received, I'm configured as Spare")
            return
        
        logger.debug("[broker][configuration] Configuration received")
        
        # Load Schedulers
        for (daemon_id, daemon) in conf['schedulers'].iteritems():
            daemon_name = daemon['name']
            # replacing scheduler address and port by those defined in satellitemap
            map_entry = g_conf['satellitemap'].get(daemon_name, None)
            if map_entry:
                # MAP ENTRY will look like   {'port': 7768, 'address': u'51.15.255.102'}
                logger.debug('[broker][configuration] Remap the daemon %s to new address %s and port %s as defined in the daemon %s configuration (satellitemap property)'
                             % (daemon_name, map_entry.get('address', ''), map_entry.get('port', ''), self.name))
                # NOTE: the daemon dict received in the conf is updated in place
                daemon.update(map_entry)
            
            # The uri (built after the remap) is what identifies an already known scheduler
            uri = self._get_daemon_uri(daemon)
            daemon['uri'] = uri
            previous_id = self._find_previous_daemon_by_uri(self.schedulers, uri)
            
            # default value: void
            previous_thread = None
            previous_broks = {}
            previous_daemon_incarnation = 0
            previous_con = None
            # But maybe we need to keep them from the past (if same uri => aka same daemon)
            if previous_id is not None:  # ok we did already have it in the past, keep important info
                previous_daemon = self.schedulers[previous_id]
                previous_broks = previous_daemon['broks']
                previous_daemon_incarnation = previous_daemon['daemon_incarnation']
                previous_thread = previous_daemon['thread']
                previous_con = previous_daemon['con']
                # we now can del the old id entry
                if previous_id != daemon_id:
                    logger.debug('[broker][configuration] the daemon %s did switch internal id from %s to %s' % (daemon_name, previous_id, daemon_id))
                del self.schedulers[previous_id]
            
            # Now set values into the daemon
            daemon['broks'] = previous_broks
            daemon['daemon_incarnation'] = previous_daemon_incarnation
            daemon['thread'] = previous_thread
            daemon['con'] = previous_con
            # and reset some value
            daemon['last_connection'] = 0
            daemon['id'] = daemon_id
            daemon['type'] = 'scheduler'
            
            # Now save the object
            self.schedulers[daemon_id] = daemon
            
            logger.info("[broker][configuration] Scheduler definition update : scheduler_id[%s] name[%s] instance_id[%s] push_flavor[%s] uri[%s]" % (daemon_id, daemon_name, daemon['instance_id'], daemon['push_flavor'], uri))
        
        # Now get arbiter
        for arb_id in conf['arbiters']:
            # Must look if we already have it, to keep its broks
            already_got = arb_id in self.arbiters
            if already_got:
                broks = self.arbiters[arb_id]['broks']
            else:
                broks = {}
            a = conf['arbiters'][arb_id]
            self.arbiters[arb_id] = a
            
            # replacing arbiter address and port by those defined in satellitemap
            # NOTE: the remap is done on a copy, so it is only used to build the uri below:
            #       the entry stored in self.arbiters keeps the address/port received in the conf
            if a['name'] in g_conf['satellitemap']:
                a = dict(a)  # make a copy
                a.update(g_conf['satellitemap'][a['name']])
            
            proto = 'http'
            if a['use_ssl']:
                proto = 'https'
            uri = '%s://%s:%s/' % (proto, a['address'], a['port'])
            self.arbiters[arb_id]['uri'] = uri
            
            self.arbiters[arb_id]['broks'] = broks
            self.arbiters[arb_id]['instance_id'] = 0  # No use so all to 0
            self.arbiters[arb_id]['daemon_incarnation'] = 0
            self.arbiters[arb_id]['last_connection'] = 0
            self.arbiters[arb_id]['id'] = arb_id
            self.arbiters[arb_id]['type'] = 'arbiter'
            
            # We do not connect to the arbiter. Connection hangs
            logger.info("[broker][configuration] Arbiter definition update : name[%s] uri[%s]" % (a['name'], self.arbiters[arb_id]['uri']))
        
        # Modules, timezone and satellite threads are only handled when we have at least one scheduler
        if self.schedulers:
            self.modules = conf['global']['modules']
            logger.info("[broker][configuration] Receiving modules:[%s] i already load modules:[%s]" % (','.join([m.get_name() for m in self.modules]), self.have_modules))
            
            if not self.have_modules:
                # Ok now start, or restart them!
                # Set modules, init them and start external ones
                self.modules_manager.set_modules(self.modules)
                self.do_load_modules()
                self.modules_manager.start_external_instances()
                self.modules_manager.start_worker_based_instances()
                self.have_modules = True
            else:  # just update the one we need
                self.modules_manager.update_modules(self.modules)
            
            # Set the timezone given by the arbiter
            self.set_tz(conf['global']['use_timezone'])
            
            # Start threads if needed, not a problem as starting a thread is cheap and not timeout prone
            self.assert_valid_satellite_threads()
        
        logger.log_perf(t0, '[broker][configuration]', "update new configuration done")
    
    
    # An arbiter asked us to wait for a new conf, so we must clean
    # all the mess we made, and close the modules too
    def clean_previous_run(self):
        """Reset the state kept from the previous run.

        * the known satellites (schedulers, pollers, reactionners, receivers) are forgotten;
        * the broks queues (internal and external modules ones) are replaced by empty ones;
        * the modules instances are cleared and flagged as not loaded.

        NOTE: broks_internal_raised, arbiter_broks and external_commands are only rebound
        to a shallow copy of themselves: their content is kept.
        """
        with self.satellite_lock:
            for satellites in (self.schedulers, self.pollers, self.reactionners, self.receivers):
                satellites.clear()
        
        with self.broks_lock:
            self.broks, self.external_module_broks = deque(), deque()
        
        # Shallow copies: same content, new list objects
        self.broks_internal_raised = self.broks_internal_raised[:]
        with self.arbiter_broks_lock:
            self.arbiter_broks = self.arbiter_broks[:]
        self.external_commands = self.external_commands[:]
        
        # Modules will have to be loaded again
        self.have_modules = False
        self.modules_manager.clear_instances()
    
    
    # The broker asks a distant element for its Brok objects
    def get_jobs_from_distant(self, e):
        """Get the new broks of the distant element `e` (plain delegation to get_new_broks, returns nothing)."""
        fetch_new_broks = self.get_new_broks
        fetch_new_broks(e)
    
    
    def print_modules_stats(self, b_times, b_types, nb_broks):
        """Log (debug level) the statistics of this loop turn.

        :param b_times: mapping brok type -> time spent managing the broks of this type
        :param b_types: mapping brok type -> number of broks of this type managed
        :param nb_broks: number of broks that were waiting at the beginning of the loop

        Three debug lines are produced: broks done, time spent by each internal module, and stats by brok type.
        The 'sla' module (when it exposes external counters) also gets its averages updated and its
        counters reset to 0. When a statsd client is available, each module time is sent to it.
        """
        logger.debug('[broks] broks done this loop %d/%d' % (self.broks_done, nb_broks))
        
        modules_parts = []
        for instance in self.modules_manager.get_internal_instances():
            instance_name = instance.get_name()
            spent_time = self.local_module_stats.get(instance_name, 0)
            
            if instance_name == 'sla' and hasattr(instance, 'external_speed_counter'):
                update_time = instance.external_speed_counter
                nb_done = instance.external_done_counter
                
                # First values seen: create the averages, else feed them
                if self.stats_time_sla is None:
                    self.stats_time_sla = AvgInRange(60, initial_value=update_time)
                    self.stats_nb_sla = AvgInRange(60, initial_value=nb_done)
                else:
                    self.stats_time_sla.update_avg(update_time)
                    self.stats_nb_sla.update_avg(nb_done)
                
                avg_update_time = self.stats_time_sla.get_avg(update_time)
                avg_nb_done = self.stats_nb_sla.get_avg(nb_done)
                
                # Speeds are 0 when there is no time to divide by
                current_speed = 0 if update_time == 0 else nb_done / update_time
                average_speed = 0 if avg_update_time == 0 else avg_nb_done / avg_update_time
                
                modules_parts.append(
                    '[%s][%.4fs update_time:%.4f, nb_done:%4d, speed:%4d /avg update_time:%.4f, nb_done:%4d, speed:%4d ]  |  ' % (
                        'sla', spent_time, update_time, nb_done, current_speed, avg_update_time, avg_nb_done, average_speed))
                
                # The counters are per loop turn: restart from 0
                instance.external_speed_counter = 0
                instance.external_done_counter = 0
            elif instance_name == 'Graphite-Perfdata' and hasattr(instance, 'external_info'):
                modules_parts.append('[%s][%.4fs %s]  |  ' % (instance_name, spent_time, instance.external_info))
            else:
                modules_parts.append('[%s][%.4fs]  |  ' % (instance_name, spent_time))
            
            if statsd is not None:
                statsd.timing('broker.%s' % instance_name, spent_time)
        logger.debug('[broks] stats by local module %s' % ''.join(modules_parts))
        
        types_parts = ['{ [%s] : [%d][%.4f]s }' % (brok_type, nb_of_type, b_times[brok_type]) for (brok_type, nb_of_type) in b_types.iteritems()]
        logger.debug('[broks] stats by type %s' % ''.join(types_parts))
    
    
    def print_external_modules_queue_size(self):
        """Log (info level, one line) the size of the input queue of every external module.

        Nothing is logged when there is no external module. A module whose queue size
        cannot be read is reported with the exception instead of the size.
        """
        instances = self.modules_manager.get_external_instances()
        if instances:
            parts = ['%s %s => Number of "Broks Sets" not eaten in MODULE queues' % (_MANAGE_BROKS_STR, _EXTERNAL_MODULE_STR)]
            for instance in instances:
                try:
                    part = ' (%s): %s  ' % (instance.get_name(), instance.to_q.qsize())
                except Exception as exp:
                    part = ' (%s): Exception! %s   ' % (instance.get_name(), exp)
                parts.append(part)
            logger.info(''.join(parts))
    
    
    def _in_injector_send_to_one_queue(self, queue, broks):
        """Put the whole `broks` list on `queue` in a single call, and log how long it took.

        The put is bounded by DEFAULT_INJECTOR_PUSHER_WAIT_TIME: the exception raised by
        queue.put() on timeout is not caught here.
        """
        logger.debug('Starting to send %d broks to the module' % len(broks))
        put_start = time.time()
        # Bounded put, so we won't lock forever
        queue.put(broks, timeout=DEFAULT_INJECTOR_PUSHER_WAIT_TIME)
        put_duration = time.time() - put_start
        logger.debug('The broks were sent in %.3fs' % put_duration)
    
    
    def _in_queue_injector(self, before_start, module_name, father_pid):
        """Body of the 'queue brok pusher' process: send the broks prepared for `module_name` to its queue.

        The (queue, broks) couple is read from self._tmp_bucket[module_name].
        This method never returns: the process is ended with os._exit(0) when the broks were sent,
        or os._exit(2) (after an error log with the traceback) if anything did raise.

        :param before_start: time.time() taken by the caller before starting this process (only used in a debug log)
        :param module_name: name of the module that must receive the broks
        :param father_pid: pid given to the LookAtMyFatherThread
        """
        try:
            real_start = time.time()
            # PERFORMANCE HACK/CHOICE:
            # IMPORTANT: we need to do as the multiprocessing lib is doing manually as
            #            we did fork() and not call the Process that is too slow for us
            
            # BEWARE: if we directly call the multiprocessing.util._run_after_forkers()
            #         it will be too long (500ms!) for EACH fork. it's not possible.
            #         it seems that only the multiprocessing.managers.AutoProxy[Queue]._after_for
            #         are long (500ms each) because they are doing a connection, so we will only
            #         clean the only queue we are using.
            # BEWARE2: do not try to thread clean phase calls, I did try, and it dead lock :(
            (injector_queue, broks) = self._tmp_bucket[module_name]
            self._tmp_bucket.clear()  # free the memory as soon as possible
            auto_proxy_queue_class = type(injector_queue)  # get the queue class, because it can move/rename in the future
            
            # Code taken from multiprocessing.Process._bootstrap
            multiprocessing.util._finalizer_registry.clear()
            #  => code from : multiprocessing.util._run_after_forkers()
            items = list(multiprocessing.util._afterfork_registry.items())
            items.sort()
            for (index, ident, func), obj in items:
                # only clean the queue we will use in this process: the other queues are skipped
                if isinstance(obj, auto_proxy_queue_class) and obj != injector_queue:
                    continue
                try:
                    func(obj)
                except Exception:
                    # best effort: a failing after-fork hook must not prevent the push
                    pass
            
            injector_start_time = time.time()
            # Give this process (and its logs) a name that identifies the module it is working for
            process_name = "%s [ - Module: %s - queue brok pusher ]" % (self.daemon_display_name, module_name)
            set_process_name(process_name)
            logger_name = process_name + '(pid=%s)' % os.getpid()
            logger_name = logger_name.replace('[', '(').replace(']', ')')
            logger.set_name(logger_name)
            logger.debug('The worker process was started in %.3fs (clean phase=%.3f)' % (time.time() - before_start, injector_start_time - real_start))
            
            # NOTE: cannot call the clean_previous_run because it needs locks, and we are not sure threads lock
            # are not release in this fork().
            self.broks = deque()
            self.external_module_broks = deque()
            
            # Classic sub process behavior/protection
            # start_malloc_trim_thread()  # NOT this one, it's too long for a short time worker (~200ms)
            look_at_my_father_thread = LookAtMyFatherThread(father_pid, self.daemon_display_name, process_name, loop_speed=0.5)
            look_at_my_father_thread.start_thread()
            
            # Ask the http to release the socket
            self.http_daemon.shutdown(quiet=True)
            
            # Do not want the daemon configuration any more from here
            self.new_conf = None
            self.cur_conf = None
            if hasattr(self, 'conf'):
                del self.conf
            
            # Allow the memory dump
            self.set_exit_handler()
            
            logger.debug('Finish to initialize the injector worker in %.3fs' % (time.time() - injector_start_time))
            # Ok, now work..
            self._in_injector_send_to_one_queue(injector_queue, broks)
            # os._exit: end this process right now, without running the normal interpreter exit
            os._exit(0)
        except Exception as exp:
            logger.error('Cannot send broks to the instance %s: %s' % (module_name, traceback.format_exc()))
            os._exit(2)
    
    
    def _get_avg_speed(self):
        return self.avg_brok_send_speed.get_avg(flatten=False, default_value=0)
    
    
    # We will wait for the pusher process based on the average broks send speed:
    # * number of broks to send / average speed = expected time
    # Then we multiply it by a security ratio, so we do not kill a process when we can avoid it:
    # * expected time * security ratio = secured expected time
    # Maybe we have very few broks, but the fork() has a minimal cost, so we never wait less than
    # the min execution timeout, and never more than the max execution timeout:
    # * min(max execution timeout, max(secured expected time, min execution timeout)) = allowed wait time
    def _get_pusher_wait_time(self, nb_broks_sent):
        """Compute how long (seconds) we accept to wait for a pusher process sending `nb_broks_sent` broks.

        Without any known average speed (0), the max execution timeout is returned.
        Otherwise the time is (nb_broks_sent / average speed) * security ratio, kept between
        the min and the max execution timeouts.
        """
        push_speed = self._get_avg_speed()
        max_wait = self.manage_brok_sub_process_broks_pusher_max_execution_timeout
        if push_speed == 0:
            logger.info('%s %s We do not have any broks spend speed, so will allow a wait of %.1fs for sending %d broks' % (_MANAGE_BROKS_STR, _EXTERNAL_MODULE_STR, max_wait, nb_broks_sent))
            return max_wait
        
        secured_wait = (nb_broks_sent / push_speed) * self.manage_brok_sub_process_broks_pusher_security_ratio
        # Never less than the min timeout...
        floored_wait = max(secured_wait, self.manage_brok_sub_process_broks_pusher_min_execution_timeout)
        # ...and never more than the max one
        allowed_wait = min(max_wait, floored_wait)
        
        logger.debug('%s %s We will allow a wait of %.1fs for sending %d broks (average speed = %5d broks/s)' % (_MANAGE_BROKS_STR, _EXTERNAL_MODULE_STR, allowed_wait, nb_broks_sent, push_speed))
        return allowed_wait
    
    
    def _wait_for_all_injectors_to_finish(self, all_processes, elapsed_put_stats, nb_broks_sent):
        not_finish = [e for e in all_processes.values() if e['end'] is None]
        wait_start = time.time()
        
        errored_modules = []
        
        ceiled_secure_espected_time = self._get_pusher_wait_time(nb_broks_sent)
        
        warning_printed = False
        
        while len(not_finish) != 0:
            for e in all_processes.values():
                if e['end'] is not None:  # already finished
                    continue
                module = e['module']
                module_name = module.get_name()
                pid = e['pid']
                last_warning = e['last_warning']  # do not raise warnings too much
                now = time.time()
                # raise a warning about long process, but at least once a second
                if now > wait_start + (ceiled_secure_espected_time / 2) and now > last_warning + 1:
                    if not warning_printed:
                        logger.warning(('%s %s [PERF-ALERT] [%s] |' % (_MANAGE_BROKS_STR, _EXTERNAL_MODULE_STR, module_name)) + ('-' * (98)) + '|')
                        warning_printed = True
                    logger.warning('%s %s [PERF-ALERT] [%s] The queue pushing process (pid=%s) is running since a very long time: %ds' % (_MANAGE_BROKS_STR, _EXTERNAL_MODULE_STR, module_name, pid, now - wait_start))
                    e['last_warning'] = now
                    self.last_too_long_injector_time = now  # let the daemon stats know about this error
                
                # If VERY too long, just kill it
                if now > wait_start + ceiled_secure_espected_time:
                    self.last_too_long_injector_time = now  # let the daemon stats know about this error
                    logger.warning('%s %s [PERF-ALERT] [%s] The queue pushing process (pid=%s) seems to be in a dead lock state (running since %ds>%ds limit). Stopping the process.' % (
                        _MANAGE_BROKS_STR, _EXTERNAL_MODULE_STR, module_name, pid, now - wait_start, ceiled_secure_espected_time))
                    # kill the injector process
                    try:
                        os.kill(pid, signal.SIGKILL)
                        os.waitpid(pid, 0)
                    except OSError as exp:
                        pass
                    e['end'] = now  # do not reenter in this kill part
                    errored_modules.append(module_name)
                    continue
                
                try:
                    os.kill(pid, 0)
                    os.waitpid(pid, os.WNOHANG)
                    os.kill(pid, 0)
                except OSError:  # Not alive
                    # just to be sure it's dead-dead
                    try:
                        os.kill(pid, signal.SIGKILL)
                        os.waitpid(pid, 0)
                    except OSError:
                        pass
                    
                    e['end'] = time.time()
                    elapsed_put = e['end'] - e['start']
                    broks_speed = nb_broks_sent / elapsed_put
                    logger.debug('%s %s [%s] The queue injector (pid=%s) did finished in %.3fs (speed=%s broks/s)' % (_MANAGE_BROKS_STR, _EXTERNAL_MODULE_STR, module_name, pid, elapsed_put, broks_speed))
                    self.avg_brok_send_speed.update_avg(broks_speed)
                    elapsed_put_stats[module_name] = elapsed_put_stats.get(module_name, 0) + elapsed_put
            
            not_finish = [e for e in all_processes.values() if e['end'] is None]
            if len(not_finish) != 0:
                time.sleep(0.01)
        
        return errored_modules
    
    
    def _push_broks_to_external_queues(self, broks, elapsed_put_stats, broks_to_send_to_externals):
        # We are sending broks as a big list, more efficient than one by one
        modules_and_queues = self.modules_manager.get_external_modules_and_queues()
        
        all_processes = {}
        self._tmp_bucket.clear()
        
        queues_jobs = {}
        for (instance, queue) in modules_and_queues:
            queues_jobs[instance.get_name()] = (instance, queue)
        
        try_nb = 0
        while len(queues_jobs) != 0:
            try_nb += 1
            if try_nb > self.manage_brok_sub_process_broks_pusher_max_retry:
                for module_name in queues_jobs.keys():
                    ceiled_secure_espected_time = self._get_pusher_wait_time(len(broks))
                    err = '%s %s [PERF-ALERT] [%s] The module did failed to send broks after %d try with a %.1fs execution timeout -- min=%d(seconds) < %d broks / %d (speed in broks/s) * %d (security ratio) < max=%ds .' % (
                        _MANAGE_BROKS_STR, _EXTERNAL_MODULE_STR, module_name, self.manage_brok_sub_process_broks_pusher_max_retry, ceiled_secure_espected_time,
                        self.manage_brok_sub_process_broks_pusher_min_execution_timeout, len(broks), self._get_avg_speed(),
                        self.manage_brok_sub_process_broks_pusher_security_ratio, self.manage_brok_sub_process_broks_pusher_max_execution_timeout,
                    )
                    logger.error(err)
                    logger.error(('%s %s [PERF-ALERT] [%s] |' % (_MANAGE_BROKS_STR, _EXTERNAL_MODULE_STR, module_name)) + ('-' * (30)) + ' Killing and restarting the module ' + ('-' * (30)) + '|')
                    self.modules_manager.did_crash(queues_jobs[module_name][0], reason=err)  # instance
                    # won't be possible to send this one, remove it
                    del queues_jobs[module_name]
                break
            
            for (instance, queue) in modules_and_queues:
                # before_put = time.time()
                module_name = instance.get_name()
                self._tmp_bucket[module_name] = (queue, broks)
                if self.manage_brok_enable_sub_processes_memory_usage_protection:
                    fork_is_possible = mem_wait_for_fork_possible('brok pusher for %s' % module_name,
                                                                  retry_time=self.manage_brok_sub_processes_memory_usage_protection_max_retry_time,
                                                                  reserved_memory=self.manage_brok_sub_process_memory_usage_system_reserved_memory)
                    if not fork_is_possible:
                        err = '[PERFORMANCE] [MEMORY] There are not enough memory to start the brok pusher process for module %s. Killing the module and restarting it.' % (module_name)
                        self.modules_manager.did_crash(instance, reason=err)
                        # won't be possible to send this one, remove it
                        del queues_jobs[module_name]
                        continue
                before = time.time()
                father_pid = os.getpid()
                pid = os.fork()
                if pid == 0:  # the son
                    self._in_queue_injector(before, module_name, father_pid)  # will exit
                entry = {'module': instance, 'pid': pid, 'start': time.time(), 'end': None, 'last_warning': 0.0}
                all_processes[instance.get_name()] = entry
            
            # We will wait until all injectors are done
            not_finished_modules = self._wait_for_all_injectors_to_finish(all_processes, elapsed_put_stats, len(broks))
            
            # if some errored modules occur, we don't remove them
            finished_modules = [module_name for module_name in queues_jobs.keys() if module_name not in not_finished_modules]
            for module_name in finished_modules:
                # SEF-4969: The module might have been removed frome the queue by another thread in the meantime
                queues_jobs.pop(module_name, None)
        
        # We no more need the tmp bucket now, all was sent
        self._tmp_bucket.clear()
    
    
    def do_loop_turn(self):
        loop_start = time.time()
        logger.info(_BROKER_TIME_STR)
        logger.info('%s %s ===-===-===-===-===-===-===-===-===-===-===-===-===' % (_BROKER_TIME_STR, _LOOP_START_STR))
        
        # If there are some zombie process, wait for all of them and allow them to go to hell
        try:
            active_children()
        except:
            pass
        
        ##### STEP1:
        
        with self.broks_lock:
            nb_broks = len(self.broks)
        logger.debug("[broks][manage] Begin loop with [%d] broks to handles" % nb_broks)
        
        self.broks_done = 0
        for mod in self.modules_manager.get_internal_instances():
            self.local_module_stats[mod.get_name()] = 0
        
        # Dump modules Queues size
        self.print_external_modules_queue_size()
        
        # Begin to clean modules
        self.check_and_del_zombie_modules()
        
        # Look if we have all need threads for our satellites
        self.assert_valid_satellite_threads()
        
        # Maybe the arbiter ask us to wait for a new conf
        # If true, we must restart all...
        if self.cur_conf is None:
            # Clean previous run from useless objects and close modules
            self.clean_previous_run()
            
            self.wait_for_initial_conf()
            # we may have been interrupted or so; then
            # just return from this loop turn
            if not self.new_conf:
                return
            
            self.setup_new_conf()
        
        # Now we check if arbiter speak to us in the pyro_daemon.
        # If so, we listen for it
        # When it pushes conf to us, we reinit connections
        self.watch_for_new_conf(0.0)
        if self.new_conf:
            self.setup_new_conf()
        
        # We need to be sure that our inventory do not keep
        # deleted realms
        self._clean_old_realms_in_inventory()
        
        # If an inventory did change, warn the modules about it
        # so they can update their own inventory about it
        self.assert_module_inventory_are_updated()
        
        if int(time.time()) % 3600 == 0:
            for (realm_name, inventory) in self._realms_inventory.items():
                logger.info('The realm %s inventory have currently %s elements' % (realm_name, inventory.get_len()))
        
        # Reap broks sent from the arbiters
        self.interger_arbiter_broks()
        
        # Spare mode stop loop here
        if not self.activated:
            # we clear broks list as in idle (spare waiting to start) we do not handle broks
            with self.broks_lock:
                self.broks = deque()
                self.external_module_broks = deque()
            
            logger.info('[BROKER TIME  ] Broker is idle (spare waiting to start)')
            logger.info('[BROKER TIME  ] [ === Loop end === ] ===-===-===-===-===-===-===-===-===-===-===-===-===')
            
            loop_time = (time.time() - loop_start)
            self.sleep(1.0 - loop_time)
            
            return
        
        ##### STEP2: PREPARE and send broks to external queues
        
        step2_start = time.time()
        # EXTERNAL BROKS
        # We will works this turn with a copy of the broks, so
        # we won't be impacted by the others threads
        with self.broks_lock:
            broks = copy.copy(self.broks)
            broks_to_send_to_externals = list([b for b in self.external_module_broks if b.type != 'log'])
            
            self.broks = deque()
            self.external_module_broks = deque()
        logger.info('%s %s [PERF] [ %.3f ]s, preparing broks lists for INTERNAL and EXTERNAL modules' % (_MANAGE_BROKS_STR, get_section_string('PREPARING BROKS'), time.time() - step2_start))
        
        # and for external queues
        # REF: doc/broker-modules.png (3)
        # We put to external queues broks that was not already send
        before_external = time.time()
        # We are sending broks as a big list, more efficient than one by one
        modules_and_queues = self.modules_manager.get_external_modules_and_queues()
        
        elapsed_put_stats = {}
        nb_external_broks_at_start = len(broks_to_send_to_externals)
        nb_external_broks_managed = 0
        
        while len(broks_to_send_to_externals) != 0:
            now = time.time()
            if now < before_external or now > before_external + 1:  # if get back in time, or too long, skip this turn
                # so we must remerge our last broks with the main broks to do not lost them
                with self.broks_lock:
                    # IMPORTANT: extendleft does REVERSE broks orders so we do once before to put them in good order
                    # note: broks.reverse() is not availabe on python 2.6
                    self.external_module_broks.extendleft(reversed(broks_to_send_to_externals))
                break
            
            shard_to_send = broks_to_send_to_externals[:self.external_module_queue_batch_size]
            broks_to_send_to_externals = broks_to_send_to_externals[self.external_module_queue_batch_size:]
            self._push_broks_to_external_queues(shard_to_send, elapsed_put_stats, broks_to_send_to_externals)
            nb_external_broks_managed += len(shard_to_send)
        
        ceiled_secure_espected_time = self._get_pusher_wait_time(nb_external_broks_at_start)
        avg_speed = self._get_avg_speed()
        logger.info('%s %s - PUSHED   [ %.3fs, limit=%.3fs ]s, EXTERNAL queue evolution: [ %5d broks => %5d broks remaining ] [ %5d broks managed ] [ Push average speed = %5d broks/s]' % (
            _MANAGE_BROKS_STR, _EXTERNAL_MODULE_STR, time.time() - before_external, ceiled_secure_espected_time, nb_external_broks_at_start, len(self.external_module_broks), nb_external_broks_managed, avg_speed))
        if len(elapsed_put_stats) != 0:
            elapsed_puts_sorted = sorted(elapsed_put_stats.keys())
            logger.info('%s                            ----- Sent to [%d] EXTERNAL queues: %s' % (_MANAGE_BROKS_STR, len(modules_and_queues), ', '.join(['(%s=%.3fs)' % (k, elapsed_put_stats[k]) for k in elapsed_puts_sorted])))
        
        ##### STEP3: MANAGE internal broks
        start_step3 = time.time()
        
        # Now INTERNAL
        b_types = defaultdict(int)
        b_times = defaultdict(float)
        
        nb_broks_managed = 0
        nb_broks_at_start = len(broks)
        
        internal_modules = self.modules_manager.get_internal_instances()
        if len(internal_modules) != 0:
            while len(broks) != 0:
                now = time.time()
                
                # Do not 'manage' more than 1s, we must get new broks
                # every 1s
                if now - start_step3 > 1:
                    # so we must remerge our last broks with the main broks to do not
                    # lost them
                    with self.broks_lock:
                        # IMPORTANT: extendleft does REVERSE broks orders so we do once before to put them in good order
                        # note: broks.reverse() is not availabe on python 2.6
                        self.broks.extendleft(reversed(broks))
                    break
                
                try:
                    b = broks.popleft()
                except IndexError:  # no more broks, maybe a daemon stop, not a problem, catch it
                    break
                
                nb_broks_managed += 1
                
                # Ok, we can get the brok, and doing something with it
                # REF: doc/broker-modules.png (4-5)
                # We un serialize the brok before consume it
                before_manage = time.time()
                self.manage_brok(b)
                after_manage = time.time()
                b_types[b.type] += 1
                b_times[b.type] += (after_manage - before_manage)
        else:  # no internal modules, we can fake the result
            nb_broks_managed = len(broks)
        
        with self.broks_lock:
            nb_remaining_broks = len(self.broks)
        internal_broks_time = time.time() - start_step3
        logger.info(
            '%s %s - EXECUTED [ %.3f ]s, INTERNAL queue evolution: [ %5d broks => %5d broks remaining ] [ %5d broks managed ]' % (
                _MANAGE_BROKS_STR, _INTERNAL_MODULE_STR, internal_broks_time, nb_broks_at_start, nb_remaining_broks, nb_broks_managed))
        
        sorted_modules_stats = self.local_module_stats.keys()
        sorted_modules_stats.sort()
        if sorted_modules_stats:
            logger.info('%s                            ----- Details of INTERNAL modules execution time: %s' % (_MANAGE_BROKS_STR, ', '.join(['(%s=%.3fs)' % (k, self.local_module_stats[k]) for k in sorted_modules_stats])))
        
        ##### STEP4: get modules objects and call modules ticks
        # Maybe external modules raised 'objects'; we should get them
        self.get_objects_from_from_queues()
        
        # Say to modules it's a new tick :)
        self.hook_point('tick')
        
        self.print_modules_stats(b_times, b_types, nb_broks)
        
        loop_time = (time.time() - loop_start)
        
        logger.info('%s %s [PERF] [ %.3f ]s' % (_BROKER_TIME_STR, _LOOP_STOP_STR, loop_time))
        self.sleep(1.0 - loop_time)
    
    
    #  Main function, will loop forever
    def main(self):
        """Daemon entry point: initialise everything, then run the main loop.
        
        Steps, in order: load the configuration file, log the header lines and
        the working directory, check for an early exit, initialise and start
        the daemon, load the modules manager, register the HTTP interfaces and
        wait for the initial configuration. If no configuration was received
        (``self.new_conf`` is falsy) the method simply returns; otherwise the
        configuration is set up and the main loop is started.
        
        Any ``Exception`` raised along the way is logged at critical level
        together with its traceback, then the process is terminated with exit
        status 2 through ``os._exit``.
        """
        try:
            self.load_config_file()
            
            for header_line in self.get_header():
                logger.info(header_line)
            
            logger.info("[broks] Using working directory: %s" % os.path.abspath(self.workdir))
            
            # Look if we are enabled or not. If ok, start the daemon mode
            self.look_for_early_exit()
            self.do_daemon_init_and_start()
            self.print_daemon_start()
            self.load_modules_manager()
            
            self._register_http_interfaces()
            
            # We wait for the initial conf; without one there is nothing to run
            self.wait_for_initial_conf()
            if not self.new_conf:
                return
            
            self.setup_new_conf()
            
            # Now the main loop
            self.do_mainloop()
        except Exception:
            logger.critical("The daemon did have an unrecoverable error. It must exit.")
            logger.critical("You can log a bug to your Shinken integrator with the error message:")
            logger.critical("%s" % (traceback.format_exc()))
            # Force process exit, skip all hooks. os._exit() never returns, so
            # nothing placed after this call can run.
            os._exit(2)
