#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2009-2012:
#    Gabes Jean, naparuba@gmail.com
#    Gerhard Lausser, Gerhard.Lausser@consol.de
#    Gregory Starck, g.starck@gmail.com
#    Hartmut Goebel, h.goebel@goebel-consult.de
#
# This file is part of Shinken.
#
# Shinken is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Shinken is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with Shinken.  If not, see <http://www.gnu.org/licenses/>.


import base64
import cPickle
import ctypes
import gc
import os
import signal
import sys
import time
import traceback
from datetime import datetime, timedelta
from multiprocessing import Process, cpu_count

from pymongo.errors import AutoReconnect, ConnectionFailure

from shinken.basemodule import BaseModule, ModuleState
from shinken.basesubprocess import LookAtMyFatherThread
from shinken.macroresolver import MacroResolver
from shinken.objects.proxyitem import proxyitemsmgr, proxyitemsgraph
from shinken.util import malloc_trim, mem_wait_for_fork_possible, get_memory_consumption, split_list_by_pack
from shinkensolutions.ssh_mongodb.sshtunnelmongomgr import mongo_by_ssh_mgr

# Static description of this module: it targets the scheduler daemon, is of
# type 'mongodb_retention' and is not an external module.
# NOTE(review): presumably read by the Shinken module loader -- confirm against the loader code.
properties = {
    'daemons' : ['scheduler'],
    'type'    : 'mongodb_retention',
    'external': False,
}

# True when the interpreter runs on a Linux platform. manage_signal() does
# nothing beyond logging when this is False.
ON_LINUX = sys.platform.startswith("linux")


def get_instance(plugin):
    """Build the MongodbRetentionScheduler described by a module configuration.

    ``plugin`` must expose ``uri`` and ``database``; ``replica_set`` is
    optional and defaults to an empty string.
    """
    mongo_uri, database_name = plugin.uri, plugin.database
    replica_set_name = getattr(plugin, 'replica_set', '')
    return MongodbRetentionScheduler(plugin, mongo_uri, database_name, replica_set_name)


def chunks(_list, n):
    """Yield successive ``n``-sized slices of ``_list``.

    The last slice is shorter when ``len(_list)`` is not a multiple of ``n``.
    ``_list`` must support ``len()`` and slicing. ``n`` must not be 0
    (``range`` raises ValueError for a zero step on first iteration).
    """
    # range() instead of xrange(): same result, and available on every Python version.
    for start in range(0, len(_list), n):
        yield _list[start:start + n]


# This is a scheduler module to save host/service retention data into a mongodb database
class MongodbRetentionScheduler(BaseModule):
    def __init__(self, modconf, uri, database, replica_set):
        """Read the module options from ``modconf`` and reset connection state.

        :param modconf: module configuration object; every option is read
                        with ``getattr`` and falls back to a default.
        :param uri: mongodb uri to connect to.
        :param database: name of the database holding the retention collections.
        :param replica_set: replica set name given to the connection manager.
        """
        BaseModule.__init__(self, modconf)
        self.uri = uri
        self.database = database
        self.replica_set = replica_set
        # Default value, recomputed by hook_save_retention() before each save
        self.nb_workers = 4
        self.use_ssh_tunnel = getattr(modconf, 'use_ssh_tunnel', False) in ['1', True]
        self.use_ssh_retry_failure = int(getattr(modconf, 'use_ssh_retry_failure', 1))
        
        self.ssh_user = getattr(modconf, 'ssh_user', os.getenv('USER'))
        self.ssh_keyfile = getattr(modconf, 'ssh_keyfile', '~shinken/.ssh/id_rsa')
        # The option is multiplied by 1000 and used as $maxTimeMS in queries
        self.mongo_timeout = int(getattr(modconf, 'mongo_timeout', '10')) * 1000
        self.mongo_max_connections_retry = int(getattr(modconf, 'mongo_max_connections_retry', 3))
        self.mongo_wait_before_retry = int(getattr(modconf, 'mongo_wait_before_retry', 1))
        self.size_chunk_to_load = int(getattr(modconf, 'size_chunk_to_load', 1000))
        self.size_chunk_to_delete = int(getattr(modconf, 'size_chunk_to_delete', '1000'))
        
        # We have con, db etc for load and save parts
        self.con_load = None
        self.db_load = None
        self.hosts_fs_load = None
        self.services_fs_load = None
        
        self.con_save = None
        self.db_save = None
        self.hosts_fs_save = None
        self.services_fs_save = None
        
        # When we will fork, we will need to stop the HTTP sockets inside the
        # daemon, so if the scheduler goes down, we won't lock the socket ports
        self.scheduler_daemon = None
        
        # Global timeout for the whole save, and timeout for one try of one worker.
        # The per-try timeout has its own option: it used to be read from
        # 'worker_timeout' by mistake, which made it follow the global timeout.
        self.worker_timeout = int(getattr(modconf, 'worker_timeout', '120'))
        self.worker_one_try_timeout = int(getattr(modconf, 'worker_one_try_timeout', '30'))
        
        # Not public option, for debug use only
        self.enable_sub_processes_memory_usage_protection = getattr(modconf, 'scheduler__retention_mongo__enable_sub_processes_memory_usage_protection', '1') == '1'
        self.sub_process_memory_usage_system_reserved_memory = int(getattr(modconf, 'scheduler__retention_mongo__sub_process_memory_usage_system_reserved_memory', '0'))
        self.sub_processes_memory_usage_protection_max_retry_time = int(getattr(modconf, 'scheduler__retention_mongo__sub_processes_memory_usage_protection_max_retry_time', '5'))
        
        self.nb_of_retention_day = int(getattr(modconf, 'nb_of_max_retention_day', '7'))
        
        # Upper bound for nb_workers (see hook_save_retention)
        self.max_number_of_workers = int(getattr(modconf, 'max_number_of_workers', '4'))
        
        # Retention data to save, set by hook_save_retention() and read by the workers
        self.all_data = None
        self.in_debug_mode = False
    
    def get_state(self):
        """Return the module status dict based on the 'load' connection.

        The status is OK only when both the load connection and its database
        handle are set; otherwise it is CRITICAL.
        """
        is_connected = self.con_load is not None and self.db_load is not None
        if is_connected:
            return {"status": ModuleState.OK, "output": "OK"}
        return {"status": ModuleState.CRITICAL, "output": "No connection to Mongo Database."}
    
    
    def create_connections(self, what_for, logger):
        """Open a mongo connection and store its handles on the module.

        The connection is attempted up to ``mongo_max_connections_retry``
        times (at least once), sleeping ``mongo_wait_before_retry`` seconds
        between two attempts.

        :param what_for: 'load' or 'delete' store the handles in the ``*_load``
                         attributes, any other value in the ``*_save`` ones.
        :param logger: logger used for the progress messages; its ``name`` is
                       part of the requestor string given to the connection manager.
        :raises Exception: the error of the last attempt when every attempt failed.
        """
        before = time.time()
        logger.info('We are creating mongo connection')
        # Always try at least once, even when the option is set to 0 or less
        max_tries = max(1, self.mongo_max_connections_retry)
        # Keep the last error in a dedicated name: the "as" target of an
        # except clause must not be relied on outside of its handler.
        last_error = None
        for try_nb in range(1, max_tries + 1):
            try:
                requestor = '%s-%s-%s' % (self.daemon_display_name, self.name, logger.name)
                con_result = mongo_by_ssh_mgr.get_connection(self.uri, replica_set=self.replica_set, use_ssh=self.use_ssh_tunnel, ssh_keyfile=self.ssh_keyfile, ssh_user=self.ssh_user, ssh_retry=self.use_ssh_retry_failure, requestor=requestor)
                con = con_result.get_connection()
                db = getattr(con, self.database)
                hosts_fs = getattr(db, 'retention_hosts_raw')
                services_fs = getattr(db, 'retention_services_raw')
                break
            except Exception as exp:
                last_error = exp
                # Only announce a retry (and wait) when there is one left
                if try_nb < max_tries:
                    if try_nb == 1:
                        logger.warning('Mongo connection failed 1/%d time, we will try again' % max_tries)
                    else:
                        logger.warning('Mongo connection failed %d/%d times, we will try again' % (try_nb, max_tries))
                    time.sleep(self.mongo_wait_before_retry)
        else:
            # No break: every attempt failed
            logger.error('Mongo connection failed %d/%d times, we stop trying' % (max_tries, max_tries))
            raise last_error
        
        if what_for == 'load' or what_for == 'delete':
            self.con_load = con
            self.db_load = db
            self.hosts_fs_load = hosts_fs
            self.services_fs_load = services_fs
        else:  # save
            self.con_save = con
            self.db_save = db
            self.hosts_fs_save = hosts_fs
            self.services_fs_save = services_fs
        logger.info('Connection created in : %.3fs' % (time.time() - before))
    
    
    def try_create_connection(self, what_for, logger):
        logger.info('We need to create a mongo connection')
        try:
            self.create_connections(what_for, logger)
        except Exception as e:
            logger.error('Could not create mongo connection')
            raise e
    
    
    def check_connection(self, what_for):
        if what_for == 'load' or what_for == 'delete':
            try:
                _host_test = self.hosts_fs_load.find_one({})
                _service_test = self.services_fs_load.find_one({})
            except:
                return False
        else:
            try:
                _host_test = self.hosts_fs_save.find_one({})
                _service_test = self.services_fs_save.find_one({})
            except:
                return False
        return True
    
    
    def init(self):
        self.logger.get_sub_part('INIT').debug('Initialization of the module')
    
    
    def manage_signal(self, sig, frame):
        """Signal handler installed in the save workers.

        Outside of Linux the signal is only logged. On SIGUSR1 a memory dump
        is attempted (guppy when the minor python version is 6, meliae when
        it is 7). Any other signal closes the save connection and the ssh
        tunnels, then exits the process with code 0.
        """
        current_pid = os.getpid()
        self.logger.get_sub_part('MANAGE SIGNAL').info("The worker with the pid %d received a signal %s" % (current_pid, sig))
        dump_logger = self.logger.get_sub_part('WORKER pid=%d' % current_pid)
        if not ON_LINUX:
            return
        
        if sig != signal.SIGUSR1:
            # Stop request: release the mongo connection before leaving
            if self.con_save:
                self.con_save.fsync()
                self.con_save.close()
            # On the worker exit, we ask to close all ssh tunnels
            # NOTE: in worker, atexit calls won't be executed (thanks multiprocessing...)
            mongo_by_ssh_mgr.close_all_tunnels()
            sys.exit(0)
        
        # SIGUSR1: memory dump request
        python_minor = sys.version_info[1]
        if python_minor == 6:  # python 2.6
            try:
                from guppy import hpy
                heap_report = hpy().heap()
                dump_logger.error('(support-only) MEMORY DUMP (to be sent to the support):\n%s' % heap_report)
                return
            except ImportError:
                dump_logger.error('(support-only) MEMORY DUMP: FAIL check if guppy lib is installed')
        if python_minor == 7:  # python 2.7
            try:
                import meliae.scanner
                import meliae.loader
                dump_path = "/tmp/memorydump-%s.json" % self.name
                meliae.scanner.dump_all_objects(dump_path)
                dump_logger.error('(support-only) Memory information dumped to file %s (to be sent to the support)' % dump_path)
            except ImportError:
                dump_logger.error('(support-only) MEMORY DUMP: FAIL check if meliae lib is installed')
    
    
    def _look_at_father(self, worker_id, fatherpid):
        """Loop forever, checking every 10 seconds that ``fatherpid`` still exists.

        As soon as signal 0 cannot be sent to the father process, an error
        is logged and the current process is terminated with ``os._exit(0)``.
        """
        worker_logger = self.logger.get_sub_part('WORKER:%d' % worker_id)
        worker_logger.debug('Starting father process lookup loop')
        while True:
            try:
                # Signal 0 does not kill anything: it only checks the pid can be signaled
                os.kill(fatherpid, 0)
            except:  # the father process cannot be reached any more
                dead_father_message = 'I am a worker with pid: %d and my master process %s is dead, I exit.' % (os.getpid(), fatherpid)
                worker_logger.error(dead_father_message)
                os._exit(0)  # in a thread, raw kill
            else:
                time.sleep(10)
    
    
    def _massive_delete_in_mongo(self, collection, nb_item_to_del, where, retry):
        delete_logger = self.logger.get_sub_part('DELETE OLD RETENTION')
        try:
            group_ids = list(collection.find(where, {'_id': 1}, modifiers={"$maxTimeMS": self.mongo_timeout}).limit(self.size_chunk_to_delete))
            batch = [i['_id'] for i in group_ids if i is not None]
            collection.remove({'_id': {'$in': batch}}, modifiers={"$maxTimeMS": self.mongo_timeout})
            nb_item_to_del -= len(group_ids)
        except ConnectionFailure as e:
            if retry < self.mongo_max_connections_retry:
                delete_logger.warning('We have been disconnected of mongo. Will retry [%s/%s]' % (retry, self.mongo_max_connections_retry))
                time.sleep(self.mongo_wait_before_retry)
                # Must raise to increment retry
                raise
            else:
                delete_logger.error('After %s tries, we could not connect to mongo :[%s]' % (self.mongo_max_connections_retry, e))
                delete_logger.print_stack()
                sys.exit(-1)
        except Exception as e:
            delete_logger.error('We have an error:[%s]' % e)
            delete_logger.print_stack()
            sys.exit(-1)
        return nb_item_to_del
    
    
    @staticmethod
    def _worker_exit_process(exit_code):
        """Close every ssh tunnel, then leave the process with ``exit_code``.

        The exit is done by raising SystemExit, exactly like ``sys.exit``.
        """
        # The tunnels are closed by hand here.
        # NOTE: in worker, at exit calls won't be executed (thanks multiprocessing...)
        mongo_by_ssh_mgr.close_all_tunnels()
        raise SystemExit(exit_code)
    
    
    def save_job(self, wid, father_pid, worker_name, retry=0):
        """Entry point of a save worker sub-process.

        Prepares the process (gc disabled, malloc trim, signal handlers,
        father watchdog thread, process title) then runs _save_job(), which
        is retried when mongo reports an AutoReconnect.

        :param wid: worker id, selects the share of elements this worker saves.
        :param father_pid: pid of the process that launched the worker.
        :param worker_name: title given to the process (best effort).
        :param retry: number of retries already done.

        On failure the process exits with code -1 through _worker_exit_process().
        """
        actual_pid = os.getpid()
        gc.disable()
        libc6 = ctypes.CDLL('libc.so.6')
        libc6.malloc_trim(0)
        
        save_logger = self.logger.get_sub_part('SAVE WORKER %d' % wid)
        for sig in (signal.SIGINT, signal.SIGTERM, signal.SIGUSR1):
            signal.signal(sig, self.manage_signal)
        
        # Launch a thread so this worker process raw exit if the master process die
        look_at_my_father_thread = LookAtMyFatherThread(father_pid, self.daemon_display_name, '[MongodbRetention][worker:%d]' % wid, loop_speed=10)
        look_at_my_father_thread.start_thread()
        
        # The process title is cosmetic: never fail because of it
        try:
            from setproctitle import setproctitle
            setproctitle(worker_name)
        except Exception:
            pass
        
        # Retry in a loop instead of calling save_job() again: the process
        # setup above is done only once, and the retry counter is really
        # incremented (the previous recursive call passed ``retry`` in place
        # of ``worker_name`` and so restarted from 0 forever).
        while True:
            try:
                self._save_job(wid)
                # NOTE: _save_job() ends with os._exit(0), so this is only a safety net
                return
            except AutoReconnect as e:
                if retry < self.mongo_max_connections_retry:
                    retry += 1
                    save_logger.warning('worker has been disconnected of mongo. Will retry [%s/%s]' % (retry, self.mongo_max_connections_retry))
                    time.sleep(self.mongo_wait_before_retry)
                else:
                    save_logger.error('After %s tries, worker could not connect to mongo :[%s]' % (self.mongo_max_connections_retry, e))
                    save_logger.print_stack(prefix='(pid=%d)' % actual_pid)
                    self._worker_exit_process(-1)
            except ConnectionFailure as e:
                save_logger.error('Worker has an error:[%s]' % e)
                self._worker_exit_process(-1)
            except Exception as e:
                save_logger.error('Worker has an error:[%s]' % e)
                save_logger.print_stack(prefix='(pid=%d) ' % actual_pid)
                self._worker_exit_process(-1)
    
    
    def _save_job(self, wid):
        """Save this worker's share of ``self.all_data`` into mongo, then exit.

        The element at position ``i`` of the hosts (resp. services) is handled
        by this worker when ``i % self.nb_workers == wid``. Each element is
        pickled, base64 encoded and upserted by bulks of 1000 documents.
        The process is terminated with ``os._exit(0)`` at the end.

        :raises Exception: connection errors are logged and re-raised; any
                           error of the bulk operations propagates.
        """
        actual_pid = os.getpid()
        worker_start = time.time()
        all_data = self.all_data
        offset = self.nb_workers
        save_logger = self.logger.get_sub_part('SAVE WORKER %d' % wid)
        save_logger.debug('Worker spawned as process with pid %d' % actual_pid)
        
        os.nice(1)
        # A new process needs its own mongodb connection
        save_logger.debug('Create connection timeout:[%s]' % self.mongo_timeout)
        try:
            self.create_connections('save', save_logger)
        except ConnectionFailure as e:
            save_logger.error('Failed connection with the following exception : [%s]' % e.message)
            raise
        except Exception:
            save_logger.error('Failed connection with the following exception : [%s]' % traceback.format_exc(1))
            raise
        
        date = datetime.utcnow()
        hosts = all_data['hosts']
        services = all_data['services']
        
        before = time.time()
        # Encode the hosts managed by this worker
        host_docs = {}
        nb_host_save = 0
        for position, h_key in enumerate(hosts):
            # Only the elements whose position matches the worker id are for us
            if position % offset != wid:
                continue
            nb_host_save += 1
            doc_id = "HOST-%s" % h_key
            pickled = cPickle.dumps(hosts[h_key], protocol=cPickle.HIGHEST_PROTOCOL)
            host_docs[doc_id] = {'_id': doc_id, 'value': base64.b64encode(pickled), 'date': date}
        
        # Encode the services managed by this worker
        service_docs = {}
        nb_service_save = 0
        for position, s_key in enumerate(services):
            if position % offset != wid:
                continue
            nb_service_save += 1
            # space are not allowed in a key.. so change it by SPACE token
            doc_id = ("SERVICE-%s" % s_key).replace(' ', 'SPACE')
            pickled = cPickle.dumps(services[s_key], protocol=cPickle.HIGHEST_PROTOCOL)
            service_docs[doc_id] = {'_id': doc_id, 'value': base64.b64encode(pickled), 'date': date}
        save_logger.info('Will save %s hosts and %s checks' % (nb_host_save, nb_service_save))
        save_logger.debug('hosts+services pickles times: %.3f' % (time.time() - before))
        
        before = time.time()
        if host_docs:
            # Stack 1000 hosts together in a bulk operation
            for host_stack in list(chunks(host_docs.values(), 1000)):
                bulk = self.hosts_fs_save.initialize_unordered_bulk_op()
                for host_retention in host_stack:
                    bulk.find({'_id': host_retention['_id']}).upsert().replace_one(host_retention)
                bulk.execute()
        save_logger.debug('hosts times: %.3f' % (time.time() - before))
        
        before = time.time()
        if service_docs:
            # Stack 1000 services together in a bulk operation
            for service_stack in list(chunks(service_docs.values(), 1000)):
                bulk = self.services_fs_save.initialize_unordered_bulk_op()
                for service_retention in service_stack:
                    bulk.find({'_id': service_retention['_id']}).upsert().replace_one(service_retention)
                bulk.execute()
        save_logger.debug('Services times: %.3f' % (time.time() - before))
        
        # Do not fsync the data, as it will block other workers. Disk write will wait that the mongo
        # will automatically do it, by default every 60s.
        save_logger.debug('Closing')
        self.con_save.close()
        save_logger.info('SUCCESS did saved %s hosts and %s checks retention data into mongodb in %.2fs' % (nb_host_save, nb_service_save, time.time() - worker_start))
        self.db_save = None
        self.con_save = None
        self.hosts_fs_save = None
        self.services_fs_save = None
        if self.in_debug_mode:
            save_logger.debug('Memory usage after saving :[%s]Mo' % get_memory_consumption()[0])
        
        # On the worker exit, we ask to close all ssh tunnels
        # NOTE: in worker, at exit calls won't be executed (thanks multiprocessing...)
        mongo_by_ssh_mgr.close_all_tunnels()
        os._exit(0)
        # Never reached: os._exit() does not return
        return 0
    
    
    def _kill_process(self, proc_entry):
        process = proc_entry['process']
        if process is None:  # already did
            return
        worker_id = proc_entry['worker_id']
        save_logger = self.logger.get_sub_part('SAVE WORKER %d' % worker_id)
        proc_pid = process.pid
        try_nb = proc_entry['try']
        save_logger.warning('The worker (pid:%d | try:%d) did not exit on time (%d s). We are restarting it.' % (proc_pid, try_nb, self.worker_one_try_timeout))
        process.terminate()
        # Give a small time to exit
        time.sleep(0.5)
        # And in all case: ATOMISE THIS
        try:
            os.kill(proc_pid, 9)
        except Exception:  # maybe was not exist
            pass
        process.join(0.1)
        proc_entry['process'] = None
    
    
    def launch_and_check_workers(self):
        try:
            self._really_launch_and_check_workers()
        except Exception:
            self.logger.get_sub_part('SAVE WORKERS').print_stack()
            sys.exit(2)
    
    
    def _really_launch_and_check_workers(self):
        """Clean the process memory, then start and supervise the save workers.

        Steps, in order:
        - shut down the http daemon of the scheduler daemon;
        - empty the scheduler objects with atomize() (self.all_data is kept);
        - start ``self.nb_workers`` processes running save_job() and loop
          until all of them exited with code 0.

        A worker that exits with another code, or that runs longer than
        ``self.worker_one_try_timeout``, is killed and started again, up to
        ``self.mongo_max_connections_retry`` tries per worker.

        Returns None when every worker succeeded. Calls ``sys.exit(2)`` when
        the global ``self.worker_timeout`` is reached, when a worker used all
        its tries, when there is not enough memory to fork, or when a
        process cannot be created.
        """
        gc.disable()
        self.scheduler_daemon.sched_daemon.http_daemon.shutdown(quiet=True)
        save_workers_logger = self.logger.get_sub_part('SAVE WORKERS')
        save_workers_logger.debug('closing self.scheduler_daemon.sched_daemon.http_daemon')
        # worker_id -> state dict of the worker (see its creation below)
        processes = {}
        # Reference time of the global timeout
        start = time.time()
        
        # ATOMIZATION
        # here before launch any worker we clean all our mapped memory from scheduler
        # If you don't clean mem here the copy on write will duplicate the scheduler memory because scheduler will continue this work
        # We do not exec do remove all because we want keep self.all_data with all data to save in retention
        self.in_debug_mode = self.scheduler_daemon.sched_daemon.debug
        # The last '%d' is left in the template: it is filled with the worker id at start time
        worker_name_template = '%s [ - %s - worker %s ]' % (self.scheduler_daemon.sched_daemon.daemon_display_name, self.get_name(), '%d')
        
        if self.in_debug_mode:
            save_workers_logger.debug('[prepare worker] memory usage before cleaning memory :[%s]Mo' % get_memory_consumption()[0])
        start_atomization = time.time()
        
        # Empty the shared state of the MacroResolver
        mr = MacroResolver()
        for k in mr._Borg__shared_state.keys():
            del mr._Borg__shared_state[k]
        
        # Empty every object of the scheduler collections
        # NOTE(review): relies on map() being eager (python 2); a lazy map() would do nothing here
        for t in ('hostgroups', 'services', 'hosts', 'notificationways', 'checkmodulations', 'macromodulations', 'contacts', 'contactgroups', 'servicegroups', 'timeperiods', 'commands'):
            map(self.atomize, (i for i in getattr(self.scheduler_daemon, t)))
        
        proxyitemsmgr.refresh_items([])
        proxyitemsgraph.reset()
        
        # Then the configuration and the daemons themselves
        c4 = self.scheduler_daemon.sched_daemon.conf
        self.atomize(c4)
        self.atomize(self.scheduler_daemon.sched_daemon)
        self.atomize(self.scheduler_daemon)
        
        malloc_trim(False)
        
        save_workers_logger.log_perf(start_atomization, 'MongodbRetention', 'atomization duration')
        if self.in_debug_mode:
            save_workers_logger.debug('[prepare worker] memory usage after cleaning memory :[%s]Mo' % (get_memory_consumption()[0]))
        
        # Ids of the workers that exited with code 0
        finished_workers = set()
        
        # One entry per worker; 'process' is None while the worker is not running
        for worker_id in xrange(self.nb_workers):
            processes[worker_id] = {'worker_id': worker_id, 'process': None, 'pid': -1, 'start_time': 0.0, 'try': 0}
        
        while True:  # will be timeout or void
            now = time.time()
            
            # Did finish well
            if len(finished_workers) == self.nb_workers:  # No more process: we did finish
                return
            
            # Not good, Global timeout reach too late
            if now > start + self.worker_timeout:
                for (worker_id, proc_entry) in processes.items():
                    self._kill_process(proc_entry)
                save_workers_logger.error('some workers did fail to exit or encountered an error. The retention save can be incomplete.')
                sys.exit(2)
            
            # Join the process that did finished
            for (worker_id, proc_entry) in processes.items():
                if worker_id in finished_workers:
                    continue
                process = proc_entry['process']
                if process is None:  # was not started, will be done after
                    continue
                # Ok was started, try to join it
                process.join(0.1)
                if process.is_alive():  # if the process is still alive, timeout will manage it
                    continue
                return_code = process.exitcode
                if return_code == 0:  # all was well, we can skip this worker now
                    finished_workers.add(worker_id)
                    save_workers_logger.info('The worker %d did SUCCESS (after %d try)' % (worker_id, proc_entry['try']))
                else:  # ok was not a success, retry it
                    # Resets proc_entry['process'] to None, so the start loop below launches it again
                    self._kill_process(proc_entry)
            
            # Start the process that need it
            for (worker_id, proc_entry) in processes.items():
                if worker_id in finished_workers:  # already finished, skip it
                    continue
                if proc_entry['process'] is None:  # No process, start one
                    proc_entry['try'] += 1
                    if proc_entry['try'] > self.mongo_max_connections_retry:
                        save_workers_logger.error('Too many tries failed')
                        sys.exit(2)
                    if self.enable_sub_processes_memory_usage_protection:
                        is_fork_possible = mem_wait_for_fork_possible('%s' % self.get_name(), reserved_memory=self.sub_process_memory_usage_system_reserved_memory, retry_time=self.sub_processes_memory_usage_protection_max_retry_time)
                        if not is_fork_possible:
                            # NOTE: this inner loop rebinds proc_entry; harmless because we exit right after
                            for proc_entry in processes.values():
                                self._kill_process(proc_entry)
                            # Will be catch by create_queues => try_instance_init
                            save_workers_logger.error('Cannot start the %s worker process as there is not enough memory' % self.get_name())
                            sys.exit(2)
                    
                    proc = None
                    try:
                        proc = Process(target=self.save_job, args=(worker_id, os.getpid(), worker_name_template % worker_id))
                    except Exception as exp:
                        save_workers_logger.error('Cannot start the worker %d process: %s. Exiting the retention save, killing all currently launched workers' % (worker_id, exp))
                        for proc_entry in processes.values():
                            self._kill_process(proc_entry)
                        sys.exit(2)
                    
                    if self.in_debug_mode:
                        save_workers_logger.debug('[prepare worker] memory usage before worker fork :[%s]Mo' % (get_memory_consumption()[0]))
                    proc.start()
                    proc_entry['process'] = proc
                    proc_entry['pid'] = proc.pid
                    proc_entry['start_time'] = time.time()
                    save_workers_logger.info('Starting worker %d with pid %d. Try: %d/%d' % (worker_id, proc_entry['pid'], proc_entry['try'], self.mongo_max_connections_retry))
            
            # worker timeout, not great
            # 'now' was taken at the top of this iteration, so a worker started just above is never over its timeout here
            for (worker_id, proc_entry) in processes.items():
                if worker_id in finished_workers:
                    continue
                if now > proc_entry['start_time'] + self.worker_one_try_timeout:
                    self._kill_process(proc_entry)
            
            time.sleep(0.1)
    
    
    @staticmethod
    def atomize(obj):
        if hasattr(obj, 'properties'):
            for k in obj.properties.keys():
                try:
                    delattr(obj, k)
                except:
                    pass
        if hasattr(obj, 'running_properties'):
            for k in obj.running_properties.keys():
                try:
                    delattr(obj, k)
                except:
                    pass
        for k in obj.__dict__.keys():
            delattr(obj, k)
    
    
    def hook_save_retention(self, daemon):
        """Save the retention data of ``daemon`` into mongodb.

        The number of workers is the smallest of ``max_number_of_workers``
        and the cpu count. The data is stored on ``self.all_data`` and the
        save itself runs in a sub-process (launch_and_check_workers), which
        is waited for.

        :raises Exception: when the sub-process cannot be run (OSError) or
                           when its exit code is not 0.
        """
        # Keep the scheduler daemon as we will need it in the sub-process
        self.scheduler_daemon = daemon
        try:
            available_cpus = cpu_count()
        except NotImplementedError:
            available_cpus = self.max_number_of_workers
        self.nb_workers = min(self.max_number_of_workers, available_cpus)
        
        save_start = time.time()
        retention_data = daemon.get_retention_data()
        save_global_logger = self.logger.get_sub_part('SAVE GLOBAL')
        save_global_logger.info('Starting to save retention data with %s worker(s). [%d:hosts] [%d:checks] (Database used = %s, use ssh = %s)' % (self.nb_workers, len(retention_data['hosts']), len(retention_data['services']), self.uri, self.use_ssh_tunnel))
        
        # The worker processes read the data from this attribute
        self.all_data = retention_data
        
        # A dedicated process does the memory cleaning and launches the workers
        try:
            p = Process(target=self.launch_and_check_workers)
            p.start()
            p.join()
        except OSError as exc:
            message = '%s SAVE FAILED Retention data could not be saved in mongodb : %s' % (self.name, exc.strerror)
            save_global_logger.error(message)
            raise Exception(message)
        
        if p.exitcode == 0:
            save_global_logger.info('SUCCESS Retention data was saved into mongodb. Total time %.2fs' % (time.time() - save_start))
            return
        
        message = '%s SAVE FAILED with error code : %s Retention data could not be saved in mongodb. Total time %.2fs' % (self.name, p.exitcode, time.time() - save_start)
        save_global_logger.error(message)
        raise Exception(message)
    
    
    # Must report whether the retention load succeeded or not
    def hook_load_retention(self, daemon):
        load_logger = self.logger.get_sub_part('LOAD RETENTION')
        if not self.hosts_fs_load or not self.check_connection('load'):
            try:
                self.try_create_connection('load', load_logger)
            except (ConnectionFailure, OSError) as exc:
                load_logger.error('FAILED Retention data could not be loaded from mongodb: %s' % (getattr(exc, 'message', str(exc))))
                return False
        
        host_and_cluster_string = '[ HOSTS / CLUSTERS ]'
        check_string = '[ CHECKS           ]'
        
        start = time.time()
        
        # The scheduler can already have a retention cache (if it already have element before) so maybe we won't have to load all elements
        instances_uuids_to_get = daemon.get_instances_uuids_to_restore_retention()
        nb_hosts_in_scheduler = instances_uuids_to_get['hosts']['total']
        nb_checks_in_scheduler = instances_uuids_to_get['services']['total']
        total_scheduler_elements = nb_hosts_in_scheduler + nb_checks_in_scheduler
        
        host_uuids_to_get = instances_uuids_to_get['hosts']['to_load']
        
        # We got list of loaded data from retention uri
        ret_hosts = {}
        ret_services = {}
        nb_host_load = 0
        nb_service_load = 0
        # Hosts:
        if len(host_uuids_to_get) != 0:
            _host_packs_list = split_list_by_pack(host_uuids_to_get, self.size_chunk_to_load)
            for _host_pack in _host_packs_list:
                try:
                    tmp_found_hosts = self.hosts_fs_load.find({'_id': {'$in': ['HOST-%s' % _host_uuid for _host_uuid in _host_pack]}}, modifiers={"$maxTimeMS": self.mongo_timeout})
                except Exception as exp:
                    load_logger.error('%s %s error querying host entries: %s. Module exiting.' % exp)
                    return False
                for item in tmp_found_hosts:
                    val = item.get('value', None)
                    if val is None:
                        continue
                    val = base64.b64decode(val)
                    val = cPickle.loads(val)
                    ret_hosts[item['_id'][5:]] = val  # to remove the HOST-
                    nb_host_load += 1
            load_logger.info('%s [ %.3fs ] We took %-4d hosts/clusters  from the retention [ in scheduler hosts/clusters : without retention=%-4d / total=%-4d ]' %
                             (host_and_cluster_string, time.time() - start, nb_host_load, len(host_uuids_to_get), nb_hosts_in_scheduler))
        else:
            load_logger.info('%s No host/cluster are need for retention load (scheduler already have all %-4d hosts retention data).' % (host_and_cluster_string, nb_checks_in_scheduler))
        host_ended_time = time.time()
        
        # Services/Checks:
        service_uuids_to_get = instances_uuids_to_get['services']['to_load']
        if len(service_uuids_to_get) != 0:
            _service_packs_list = split_list_by_pack(service_uuids_to_get, self.size_chunk_to_load)
            
            for _service_pack in _service_packs_list:
                try:
                    tmp_found_services = self.services_fs_load.find({'_id': {'$in': ['SERVICE-%s' % _service_id for _service_id in _service_pack]}}, modifiers={"$maxTimeMS": self.mongo_timeout})
                except Exception as exp:
                    load_logger.error('error querying checks entries: %s. Module exiting.' % exp)
                    return False
                for item in tmp_found_services:
                    val = item.get('value', None)
                    if val is not None:
                        val = base64.b64decode(val)
                        val = cPickle.loads(val)
                        ret_services[item['_id'][8:]] = val  # to remove the SERVICE- part
                        nb_service_load += 1
            load_logger.info('%s [ %.3fs ] We took %-4d checks          from the retention [ in scheduler checks         : without retention=%-4d / total=%-4d ]' %
                             (check_string, time.time() - host_ended_time, nb_service_load, len(service_uuids_to_get), nb_checks_in_scheduler,))
        else:
            load_logger.info('%s No checks       are need for retention load (scheduler already have all %-4d checks retention data).' % (check_string, nb_checks_in_scheduler))
        
        all_data = {'hosts': ret_hosts, 'services': ret_services}
        
        nb_load = nb_host_load + nb_service_load
        
        load_logger.info('[ %.3fs ] Total number of elements load from mongo database: %-4d ( scheduler have a total of %-4d elements )' %
                         (time.time() - start, nb_load, total_scheduler_elements))
        
        # Ok, now load them into the scheduler :)
        daemon.restore_retention_data(all_data)
        
        load_logger.info('[ %.3fs ] SUCCESS Retention data loaded successfully' % (time.time() - start))
        return True
    
    
    def hook_delete_old_retention(self, daemon):
        """Delete host and service retention entries older than the configured age.
        
        Every document whose 'date' field is at or before
        (UTC now - self.nb_of_retention_day days) is removed, first from
        self.hosts_fs_load, then from self.services_fs_load. The actual removal
        is delegated to self._massive_delete_in_mongo, which is called
        repeatedly until it reports that nothing is left to delete.
        
        :param daemon: calling daemon; not used by this hook.
        :return: None
        :raises Exception: whatever self.try_create_connection raises when the
            'load' connection has to be (re)created and that fails. The
            exception propagates untouched, with its original traceback.
        """
        delete_logger = self.logger.get_sub_part('DELETE OLD RETENTION')
        if not self.hosts_fs_load or not self.check_connection('load'):
            # No try/except here: re-raising with `raise e` would only throw away
            # the original traceback (Python 2) without handling anything.
            self.try_create_connection('load', delete_logger)
        
        _start_time = time.time()
        cutoff_date = datetime.utcnow() - timedelta(days=self.nb_of_retention_day)
        _where = {'date': {'$lte': cutoff_date}}
        delete_logger.info('We will delete all retention data that were saved before the %s UTC (%d days)' % (cutoff_date.strftime("%Y-%m-%d %H:%M"), self.nb_of_retention_day))
        
        total_to_del = 0
        # The hosts duration is measured from the very start of the hook, the
        # services duration from the end of the hosts step.
        step_start_time = _start_time
        # Collections are referenced by attribute name and looked up on each use,
        # so a handle rebound on self during the purge is picked up immediately.
        for label, collection_attr in (('hosts', 'hosts_fs_load'), ('services', 'services_fs_load')):
            nb_to_del = total_step_to_del = getattr(self, collection_attr).find(_where, {'_id': 1}, modifiers={"$maxTimeMS": self.mongo_timeout}).count()
            if total_step_to_del != 0:
                delete_logger.info('  - Deleting %d %s from old retention [%d by %d]' % (nb_to_del, label, self.size_chunk_to_delete, self.size_chunk_to_delete))
            
            retry = 0
            while nb_to_del > 0:
                try:
                    nb_to_del = self._massive_delete_in_mongo(getattr(self, collection_attr), nb_to_del, _where, retry)
                except Exception:
                    # NOTE(review): the failure is swallowed and the loop has no
                    # upper bound of its own; presumably _massive_delete_in_mongo
                    # uses `retry` to back off or give up - confirm, otherwise a
                    # persistent error makes this loop spin forever.
                    retry += 1
            
            if total_step_to_del != 0:
                delete_logger.info('  - - %d %s deleted in %.3fs' % (total_step_to_del, label, time.time() - step_start_time))
            total_to_del += total_step_to_del
            step_start_time = time.time()
        
        if total_to_del == 0:
            delete_logger.info('  - There is no data to delete')
        
        delete_logger.info(' Total time for deleting %d entries = %.3fs' % (total_to_del, time.time() - _start_time))
