#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
# Copyright (C) 2009-2016:
#     Gabes Jean, naparuba@gmail.com
#     Gerhard Lausser, Gerhard.Lausser@consol.de
#     Gregory Starck, g.starck@gmail.com
#     Hartmut Goebel, h.goebel@goebel-consult.de
#     Martin Benjamin, b.martin@shinken-solutions.com
#
# This file is part of Shinken.
#
# Shinken is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Shinken is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with Shinken.  If not, see <http://www.gnu.org/licenses/>.

import pickle
import io
import logging
import os
import signal
import time
import traceback
from queue import Empty
from ctypes import c_double
from multiprocessing import Process, cpu_count, Value

import psutil

from .action import TooManyOpenFiles, ACTION_TYPES
from .check import CHECK_STATUS
from shinken.ipc.shinken_queue.shinken_queue import build_shinken_queue
from shinken.basesubprocess import LookAtMyFatherThread
from shinken.log import logger, LoggerFactory, PartLogger
from shinken.message import Message
from shinken.misc.type_hint import TYPE_CHECKING
from shinken.safepickle import SafeUnpickler
from shinken.util import start_malloc_trim_thread, get_memory_used_percent, get_cpu_running_procs, from_float_sec_to_int_micro_sec
from shinken.worker_stats import InWorkerStats

if TYPE_CHECKING:
    from shinken.misc.type_hint import Optional
    from shinken.check import Check
    from shinken.notification import Notification
    from shinken.eventhandler import EventHandler

# Counter of Worker objects built in this process: Worker.__init__ increments it and uses the new value as the worker id
id_worker = 0

# A CPU whose usage percent is below this value is considered to still have availability
_CPU_USAGE_100PCT_AVAILABILITY_THRESHOLD = 80

# Max expected CPU time waiting in a worker queue: at or above it, the worker is reported as full.
# 1.5 second, converted by from_float_sec_to_int_micro_sec (integer micro seconds, going by the helper name)
WORKER_MAX_LOAD_QUEUE = from_float_sec_to_int_micro_sec(1.5)

# Default threshold of running queue per CPU: above (this value * number of CPUs), no new action is launched
WORKER_MAX_CPU_QUEUE_PER_CPU = 4

# Default threshold of RAM usage (in percent): above it, no new action is launched
WORKER_MAX_RAM_PERCENT = 95.0

_CHECK_RESOURCE_PERIOD = 0.010  # 10ms wait between check for CPU/RAM/Load Average

NO_CPU_DIFFERENCE_TIME = -9999.0  # Need a float as we will share between process


class Worker(object):
    """
    This class is used by the poller and the reactionner to launch actions.
    """
    
    
    def __init__(self, processes_by_worker, max_plugins_output_length=8192, target=None, worker_type="fork", loaded_into='unknown', executor_id='none', http_daemon=None, daemon_display_name='',
                 max_ram_percent=WORKER_MAX_RAM_PERCENT, max_cpu_queue_per_cpu=WORKER_MAX_CPU_QUEUE_PER_CPU):
        """
        Build the worker: take the next worker id, create the communication pipes, the shared counters
        and the child Process (created here, only started by start()).
        
        :param processes_by_worker: max number of actions this worker keeps launched at the same time
        :param max_plugins_output_length: given to action.check_finished() when looking at launched actions
        :param target: callable run in the child process; when None, self._do_work is used and worker_type is forced to "fork"
        :param worker_type: label of the worker kind, printed in the "is full" log
        :param loaded_into: only stored in self._loaded_into, not read in this class
        :param executor_id: executor id set on every action given to this worker
        :param http_daemon: daemon shut down by _do_work inside the child process; forced to None on Windows
        :param daemon_display_name: name used for the process title and given to the father-watching thread
        :param max_ram_percent: RAM usage (percent) above which no new action is launched
        :param max_cpu_queue_per_cpu: running queue per CPU above which no new action is launched
        """
        global id_worker
        id_worker += 1
        
        # Only set in the child process, by _work()
        self.logger = None  # type: Optional[PartLogger]
        
        self.id = id_worker
        self.executor_id = executor_id
        
        # counter of actions given at each tick
        self.action_counter_by_tick = 0
        
        # Pipe used by the executor to send the actions to do to the worker
        self.pipe_in_actions_todo_send_by_executor_to_worker, self.pipe_out_actions_todo_send_by_executor_to_worker = build_shinken_queue('', 'Main', 'W[%s]' % self.id)
        
        self.worker_type = worker_type
        
        # windows forker do not like pickle http/lock, nor socket
        if os.name == 'nt':
            self.http_daemon = None
            # sockets cannot be pickled, we have to use multiprocessing.queue We need one queue for each way of communication to avoid reading data we have just sent
            self.pipe_in_actions_done_by_worker_to_send_at_executor, self.pipe_out_actions_done_by_worker_to_send_at_executor = build_shinken_queue('DONE', 'Main', 'W[%s]' % self.id)
        else:
            self.http_daemon = http_daemon
            # each way of communication has its own buffers, just use them, no risk of interference
            # NOTE: the "done" names are then the very same objects as the "todo" ones
            self.pipe_in_actions_done_by_worker_to_send_at_executor, self.pipe_out_actions_done_by_worker_to_send_at_executor = self.pipe_in_actions_todo_send_by_executor_to_worker, self.pipe_out_actions_todo_send_by_executor_to_worker
        
        self.nb_doing_action = 0  # used by the poller to compute the actions in the worker
        
        # Last resource measures, filled by the _compute_* methods
        self._ram_percent = {}
        self._cpu_percent = {}
        self._cpu_running_queue = {}
        self._max_cpu_queue_per_cpu = max_cpu_queue_per_cpu
        self._i_am_dying = False
        self._loaded_into = loaded_into
        
        self._todo_actions = []
        # Shared doubles (multiprocessing.Value): sum of the expected cpu time of the actions not launched yet, and time of the last worker loop
        self.load_todo_actions = Value(c_double, 0)
        self.last_loop_time = Value(c_double, 0)
        self._processes_by_worker = processes_by_worker
        self._max_ram_percent = max_ram_percent
        
        self._max_plugins_output_length = max_plugins_output_length
        self._full_since = -1  # -1 means "not full"; otherwise the time when the worker was first seen full
        self._launched_actions = 0
        
        # To give stats at 1s for nb running, nb launched, real execution cpu time, etc
        self._execution_stats = InWorkerStats()
        
        # By default, take our own code
        if target is None:
            target = self._do_work
            self.worker_type = "fork"
        
        self.daemon_display_name = daemon_display_name  # for ps aux display
        # Placeholder for the third argument of the work function (master_orders in _work)
        deprecated_parameter = None
        self._process = Process(target=self._work, args=(target, self.pipe_out_actions_todo_send_by_executor_to_worker, self.pipe_in_actions_done_by_worker_to_send_at_executor, deprecated_parameter, os.getpid(), logger.get_log_file_path()))
    
    
    def start(self):
        """
        Start the child process built in __init__.
        """
        self._process.start()
    
    
    # Kill the background process
    # AND correctly close the queues (input and output).
    # Each queue has its own thread, so close it too.
    def terminate(self):
        # logger.debug("[worker][%d] asking terminate" % self.id)
        # We can just terminate process, not threads
        self._process.terminate()
        
        self.pipe_in_actions_todo_send_by_executor_to_worker.close()
        self.pipe_out_actions_todo_send_by_executor_to_worker.close()
        self.pipe_in_actions_done_by_worker_to_send_at_executor.close()
        self.pipe_out_actions_done_by_worker_to_send_at_executor.close()
    
    
    def is_alive(self):
        """
        Return what Process.is_alive() says about the worker process.
        """
        return self._process.is_alive()
    
    
    def join(self, timeout=None):
        """
        Wait for the worker process to end.
        
        :param timeout: given as is to Process.join() (None: no time limit)
        """
        self._process.join(timeout)
    
    
    # Call from master process, get all worker stats
    def get_stats_from_master_process(self):
        """
        Return the result of InWorkerStats.get_stats_from_master_process() for this worker.
        """
        return self._execution_stats.get_stats_from_master_process()
    
    
    def add_todo_action(self, action):
        # type: (Optional[Check, Notification, EventHandler]) -> None
        """
        Give an action to the worker: mark it ready to run, stamp it with this executor/worker,
        update the counters and the pending load, then send it (pickled in a 'Do' Message) in the todo pipe.
        """
        # The Check status is valid for every kind of Action
        action.status = CHECK_STATUS.READY_TO_RUN
        # The action remembers who is in charge of it
        action.executor_id = self.executor_id
        action.worker_id = self.id
        
        message_for_worker = Message(id=0, type='Do', data=action)
        
        self.nb_doing_action += 1
        self.action_counter_by_tick += 1
        pending_load = self.load_todo_actions
        with pending_load.get_lock():
            pending_load.value += action.average_cpu_time_in_micro
        
        # The pipe receives an already pickled payload (protocol 2)
        self.pipe_in_actions_todo_send_by_executor_to_worker.send(pickle.dumps(message_for_worker, 2))
    
    
    def rollback_add_todo_action(self, action):
        action.status = 'inpoller'
        action.executor_id = -1
        action.worker_id = -1
        with self.load_todo_actions.get_lock():
            self.load_todo_actions.value -= action.average_cpu_time_in_micro
        self.nb_doing_action -= 1
        self.action_counter_by_tick -= 1
    
    
    def get_results(self):
        """
        Read every action currently available in the "done" pipe and return them as a list.
        
        If a reception fails, the actions already read are returned and the rest is left for the next call.
        """
        done_pipe = self.pipe_out_actions_done_by_worker_to_send_at_executor
        finished_actions = []
        while done_pipe.poll():
            # NOTE: the recv can fail if the other side is pushing into it, if so, stop for this turn
            try:
                raw_action = done_pipe.recv()
            except MemoryError as exp:
                # Need to investigate for this ERROR. Should we continue to run and retry next turn (like EOFError or IOError) or manage it by another way ? See #SEF-6681
                logger.error('The worker %s reception did fail this turn (%s), skip to the next turn to receive more. The Traceback is as follows :' % (self.id, exp))
                logger.print_stack()
                break
            except (EOFError, IOError, pickle.UnpicklingError) as exp:  # NOTE: UnpicklingError is possible if the worker crashed and did not flush the socket
                logger.warning('The worker %s reception did fail this turn (%s), skip to the next turn to receive more.' % (self.id, exp))
                break
            # The payload was pickled by the worker before the send, unpickle it ourselves
            finished_actions.append(SafeUnpickler.loads(raw_action, 'action from worker'))
        return finished_actions
    
    
    def print_worker_full(self):
        """
        Log when the pending load of the worker has reached WORKER_MAX_LOAD_QUEUE.
        
        The log is at DEBUG level, and becomes a WARNING once the worker has been full for more than 5 seconds.
        When the worker is not full, the "full since" time is reset.
        """
        if self.load_todo_actions.value < WORKER_MAX_LOAD_QUEUE:
            self._full_since = -1
            return
        
        # First time we see it full: remember when
        if self._full_since == -1:
            self._full_since = time.time()
        
        full_duration = time.time() - self._full_since
        log_level = logging.WARNING if full_duration > 5 else logging.DEBUG
        logger.log(log_level, '[worker-%s] [%d] is full since 〖%s〗. It has 〖%s〗 checks pending for %s cpu time.' % (self.worker_type, self.id, PartLogger.format_duration_in_sec(full_duration), self.nb_doing_action, self.load_todo_actions.value))
    
    
    def _compute_ram_usage(self):
        """
        Store in self._ram_percent the value of get_memory_used_percent() times 100, truncated to an integer.
        """
        # presumably get_memory_used_percent() gives a 0..1 ratio - to confirm in shinken.util
        self._ram_percent = {'ram_percent': int(get_memory_used_percent() * 100)}
    
    
    def _compute_cpu_usage(self):
        """
        Store in self._cpu_percent the usage of each CPU ('cpu_percent', one value by CPU)
        and the time of this measure ('at', taken before asking psutil).
        """
        now = time.time()
        
        # First measure: there is no previous measure of ours to compare with, so let psutil block during one check period.
        # Next measures: interval=None, psutil compares with its previous call and returns immediately.
        interval = None if self._cpu_percent else _CHECK_RESOURCE_PERIOD
        
        try:
            current_cpu_percent = psutil.cpu_percent(interval=interval, percpu=True)
        except ZeroDivisionError:  # old psutil versions can crash with ZeroDivisionError, if so, like new psutil versions, give 0.0
            # cf: https://chromium.googlesource.com/external/github.com/giampaolo/psutil/+/master/psutil/__init__.py#1724
            current_cpu_percent = [0.0] * cpu_count()
        
        self._cpu_percent = {'cpu_percent': current_cpu_percent, 'at': now}
    
    
    def _compute_cpu_running_queue(self):
        now = time.time()
        
        # Feature not available on Windows
        if os.name == 'nt':
            self._cpu_running_queue = {'running_queue': -1, 'at': now, 'compute_period': _CHECK_RESOURCE_PERIOD}
            return
        
        self._cpu_running_queue = {'running_queue': self._get_running_procs()}
    
    
    @staticmethod
    def _get_running_procs():
        """
        Return the result of shinken.util.get_cpu_running_procs(); kept as a method so it can be overridden.
        """
        return get_cpu_running_procs()
    
    
    def _is_allowed_to_launch_action(self):
        # type: () -> bool
        """
        Refresh the CPU / RAM / running queue measures and tell if a new action can be launched now.
        
        It is allowed only when the three resources are available at the same time.
        """
        for refresh_measure in (self._compute_cpu_usage, self._compute_ram_usage, self._compute_cpu_running_queue):
            refresh_measure()
        
        # One CPU with a little availability is enough to consider the global CPU as available
        cpu_is_available = any(usage < _CPU_USAGE_100PCT_AVAILABILITY_THRESHOLD for usage in self._cpu_percent['cpu_percent'])
        
        max_running_queue = self._max_cpu_queue_per_cpu * cpu_count()
        running_queue_is_available = self._cpu_running_queue['running_queue'] <= max_running_queue
        ram_is_available = self._ram_percent['ram_percent'] <= self._max_ram_percent
        return cpu_is_available and ram_is_available and running_queue_is_available
    
    
    # Launch checks that are in status
    # REF: doc/shinken-action-queues.png (4)
    def _launch_actions(self):
        """
        Launch the READY_TO_RUN actions of self._todo_actions, within the limits of this worker.
        
        The loop is aborted (and flagged as such in the loop stats) when:
        * more than `timeout` (0.5s) was spent in this call, time waiting for the resources included
        * self._launched_actions has reached self._processes_by_worker
        Before each launch, we wait by steps of _CHECK_RESOURCE_PERIOD until self._is_allowed_to_launch_action() accepts.
        Any exception raised by action.execute() sets self._i_am_dying.
        """
        # self.logger.info('WORKER STATS launch actions')
        start = time.time()
        
        timeout = 0.5  # end the loop after 0.5s
        
        # self._wait_time_for_resource = 0.0
        _nb_action_launched_by_type = {ACTION_TYPES.CHECK: 0, ACTION_TYPES.NOTIFICATION: 0, ACTION_TYPES.EVENTHANDLER: 0}
        _nb_action_launched = 0
        _sum_launched_expected_cpu_time = 0
        
        is_loop_aborted = False  # was the loop over the `timeout` limit (or the running limit) and aborted
        
        # Get all actions that are ready to be launched, and compute the overall CPU time if we launch ALL
        actions_to_launch = [action for action in self._todo_actions if action.status == CHECK_STATUS.READY_TO_RUN]
        _all_action_expected_cpu_time = 0.0
        if len(actions_to_launch) >= 1:  # with nothing to launch, keep the float 0.0 (sum([]) would give the int 0)
            _all_action_expected_cpu_time = sum([action.average_cpu_time for action in actions_to_launch])
        
        for action in actions_to_launch:  # type: Optional[Check, Notification, EventHandler]
            # We must protect against this loop to be too long, we limit it to `timeout` (0.5s) overall time
            if abs(time.time() - start) > timeout:
                is_loop_aborted = True
                break
            
            # Are we ok with limits to launch a new action .
            # 1. If number of running actions is too high, no need to wait here, as only self._manage_finished_checks will decrease the counter
            if self._launched_actions >= self._processes_by_worker:
                is_loop_aborted = True
                break
            
            is_timeout_reached = False  # used to exit inside a double break
            # 2. We are waiting CPU and MEMORY to be available before launch it
            while not self._is_allowed_to_launch_action():
                time.sleep(_CHECK_RESOURCE_PERIOD)
                self._execution_stats.increase_wait_time_for_resource(_CHECK_RESOURCE_PERIOD)
                # self._wait_time_for_resource_this_turn += _CHECK_RESOURCE_PERIOD
                # We must protect against this loop to be too long, we limit it to `timeout` (0.5s) overall time
                if abs(time.time() - start) > timeout:
                    is_timeout_reached = True
                    break
            
            if is_timeout_reached:  # `timeout` was reached during the waiting for CPU/RAM/load average, must quit now
                is_loop_aborted = True
                break
            
            # We can launch it now
            _nb_action_launched_by_type[action.my_type] += 1
            _nb_action_launched += 1
            _sum_launched_expected_cpu_time += action.average_cpu_time
            try:
                action.execute()
                # The action is launched: it does not count in the pending load anymore
                with self.load_todo_actions.get_lock():
                    self.load_todo_actions.value -= action.average_cpu_time_in_micro
                self._launched_actions = self._launched_actions + 1
            # NOTE(review): when execute() fails, the launch counters above are already increased,
            # and the loop goes on with the next action - confirm this is wanted
            except TooManyOpenFiles as e:
                logger.error('[worker][%d] I am dying : Too many open files by [%s].' % (self.id, action))
                logger.error('UNMANAGED EXCEPTION: %s %s' % (e, type(e)))
                self._i_am_dying = True
            except Exception as e:
                logger.error('[worker][%d] Exception %s' % (self.id, e))
                self._i_am_dying = True
        
        # Give to the stats what was done in this call: time spent, aborted or not, launched vs ready (number and expected cpu time)
        self._execution_stats.print_loop_debug_stats(
            time.time() - start,
            is_loop_aborted,
            _nb_action_launched_by_type,
            _nb_action_launched,
            len(actions_to_launch),
            _sum_launched_expected_cpu_time,
            _all_action_expected_cpu_time
        )
        
        self._execution_stats.increase_nb_launch_actions(_nb_action_launched, _sum_launched_expected_cpu_time)
    
    
    def _set_proctitle(self):
        try:
            from setproctitle import setproctitle
            setproctitle('%s [ - Worker %d ] ' % (self.daemon_display_name, self.id))
        except Exception:  # noqa : no t sure about all possible case errors here
            pass
    
    
    # Get new action if less than _processes_by_worker
    # If no new action got and no check in queue, I sleep for 1 sec
    # REF: doc/shinken-action-queues.png (3)
    def _read_todo_actions(self):
        """
        Read every message currently available in the "todo" pipe and add its action to self._todo_actions.
        
        :return: 1 if the read raised Empty while there is no action to do, 0 in every other case.
                 EOFError is logged then raised again, IOError is ignored.
        """
        todo_pipe = self.pipe_out_actions_todo_send_by_executor_to_worker
        try:
            while todo_pipe.poll():
                # poll() said there is an element, so the recv will not wait
                message = SafeUnpickler.loads(todo_pipe.recv(), 'Action get from main daemon')
                if message is None:
                    continue
                new_action = message.get_data()
                self._todo_actions.append(new_action)
                # We have a new action, update stats
                self._execution_stats.add_ready_to_launch(new_action)
        except EOFError:
            logger.error('[worker][%s] End _read_todo_actions EOFError, queue still exist??) ' % self.id)
            raise
        except Empty:
            if self.nb_action_todo() == 0:
                return 1
        except IOError:
            # Maybe the Queue() is not available, if so, just get back to work :)
            pass
        return 0
    
    
    def nb_action_todo(self):
        """
        Return the number of actions in self._todo_actions, whatever their status.
        """
        return len(self._todo_actions)
    
    
    # Check the status of checks if done, return message finished :)
    # REF: doc/shinken-action-queues.png (5)
    def _manage_finished_checks(self):
        """
        Look at every action of self._todo_actions:
        * READY_TO_RUN: was not launched yet, only counted
        * LAUNCHED: ask the action to look if it is finished (action.check_finished)
        * finished: send it back (pickled) in the "done" pipe, and forget it once sent
        
        A finished action is accounted (launched counter, stats) only when it was really sent:
        when the pipe is full the action is kept and retried at the next call, so accounting it
        before the send would count it once by try.
        
        If the send fails for another reason than a full pipe, the process exits with code 2.
        
        :return: the time (in seconds, at most 0.1) the caller should wait before the next look
        """
        import errno  # local import: only needed to recognize a full pipe
        
        to_del = []
        wait_time = 0.1
        
        _stats_by_type = {
            CHECK_STATUS.READY_TO_RUN: {'nb': 0, 'estimated_cpu': 0.0},
            CHECK_STATUS.LAUNCHED    : {'nb': 0, 'estimated_cpu': 0.0},
            'finish'                 : {'nb': 0, 'estimated_cpu': 0.0, 'real_cpu': 0.0},  # finish is done or timeout
        }
        for action in self._todo_actions:
            # Ready to run: we did fail to launch it this turn, so only stat it
            if action.status == CHECK_STATUS.READY_TO_RUN:
                _stats_by_type[CHECK_STATUS.READY_TO_RUN]['nb'] += 1
                _stats_by_type[CHECK_STATUS.READY_TO_RUN]['estimated_cpu'] += action.average_cpu_time
            # Launched: is currently running, so look if finished since looked at
            elif action.status == CHECK_STATUS.LAUNCHED:
                action.check_finished(self._max_plugins_output_length)
                # if this action need a small finish look soon, take it
                wait_time = min(wait_time, action.check_finished_period)
                
                _stats_by_type[CHECK_STATUS.LAUNCHED]['nb'] += 1
                _stats_by_type[CHECK_STATUS.LAUNCHED]['estimated_cpu'] += action.average_cpu_time
            
            # Finish: we can send it back
            elif action.is_finish():
                # The action is done, so we put it in the done pipe in order to return the action info to the scheduler.
                try:
                    action_ser = pickle.dumps(action, 2)
                    self.pipe_in_actions_done_by_worker_to_send_at_executor.send(action_ser)
                except IOError as exp:
                    # Resource temporarily unavailable: the pipe is full, the action stays in the list and will be retried later
                    if exp.errno in (errno.EAGAIN, errno.EWOULDBLOCK):
                        continue
                    logger.error("[worker][%d] Worker fail to give back action to master process: %s" % (self.id, exp))
                    os._exit(2)  # noqa forced exit, if we have other threads, we don't care
                else:
                    # Only account and remove the action if we managed to send it in the pipe
                    _stats_by_type['finish']['nb'] += 1
                    _stats_by_type['finish']['estimated_cpu'] += action.average_cpu_time
                    _stats_by_type['finish']['real_cpu'] += action.cpu_time  # We have the real cpu time
                    self._launched_actions = self._launched_actions - 1
                    to_del.append(action)
                    self._execution_stats.add_finished_action(action)
        
        for action in to_del:
            self._todo_actions.remove(action)
        
        # Update the stats as we did grok all finish actions, and count running one
        self._execution_stats.increase_finished_and_running_stats(len(to_del),
                                                                  _stats_by_type['finish']['real_cpu'],
                                                                  _stats_by_type['finish']['estimated_cpu'],
                                                                  _stats_by_type[CHECK_STATUS.LAUNCHED]['nb'],
                                                                  _stats_by_type[CHECK_STATUS.LAUNCHED]['estimated_cpu'])
        
        return wait_time
    
    
    # Wrapper function for work in order to catch the exception
    # to see the real work, look at do_work.
    def _work(self, work_function, todo_queue_actions, done_queue_actions, master_orders, father_pid, log_file_path):
        """
        Entry point of the child process: prepare the logs and the helper threads, then run work_function.
        
        Any exception is logged with its traceback and raised again, so the process ends.
        
        :param work_function: called with (todo_queue_actions, done_queue_actions, master_orders)
        :param father_pid: pid given to the thread looking at the father process (not used on Windows)
        :param log_file_path: prefix of the local log file registered on Windows
        """
        logger.set_name('WORKER %s' % self.id)
        self.logger = LoggerFactory.get_logger()
        self._execution_stats.add_logger(self.logger)
        
        try:
            if os.name == 'nt':
                # On Windows we need to reinitialise the log as multiprocessing cannot share file,
                # so log was set as non-inheritable for its handlers
                logger.register_local_log('%s.worker%s' % (log_file_path, self.id))
            else:
                # WARNING: this do not work on windows as the kill(0) will really kill the father process.
                # TODO: find a way on windows to detect a process is alive
                father_watcher = LookAtMyFatherThread(father_pid, self.daemon_display_name, '[worker:%d]' % self.id, loop_speed=60)
                father_watcher.start_thread()
            
            # also launch the malloc trim thread
            start_malloc_trim_thread()
            # Go
            work_function(todo_queue_actions, done_queue_actions, master_orders)
        except Exception:
            # Catch any exception, try to print it and exit anyway
            logger.error("[worker][%s] Worker exit with an unmanaged exception : %s" % (self.id, traceback.format_exc()))
            # Ok I die now
            raise
    
    
    def _do_work(self, _deprecated1, _deprecated2, _deprecated3):
        """
        Default main loop of the worker, run in the child process.
        
        Each cycle (padded to last about 0.5s): send back the finished actions, read the new ones, launch what can be launched.
        The loop only ends when the worker is dying (an action launch did fail) and has no more action to manage.
        The three parameters are not used.
        """
        # logger.debug("[worker][%d] I start working." % self.id)
        # restore default signal handler for the workers:
        # but on android, we are a thread, so don't do it
        signal.signal(signal.SIGTERM, signal.SIG_DFL)
        
        self._set_proctitle()
        
        # When we fork we must close our self.http_daemon (launched in a thread so don't lock us)
        if self.http_daemon:
            self.http_daemon.shutdown(quiet=True)
        
        # Start with an empty list of actions in the child process
        self._todo_actions = []
        
        loop_idx = 0
        
        # NOTE: * we have 2 loops in 1s, so we have to sum some stats every 2 loops to have a classic 1s loop stats
        #       * currently not very sure why 0.5s loop, we think it was to push as fast new actions as we can
        while True:
            loop_idx = (loop_idx + 1) & 1  # keep last bit of counter, equivalent to (+ 1) % 2
            
            start_cycle_time = time.time()
            
            # Reset our stats at the start of the second
            if loop_idx == 0:
                self._execution_stats.compute_stats_after_two_loops_before_manage_returns()
            
            # REF: doc/shinken-action-queues.png (5)
            # logger.debug('[worker][%d][nb_action:%s] manage_finished_checks' % (self.id, self.nb_action_todo()))
            wait_time = self._manage_finished_checks()
            
            # If we are dying (big problem!) we do not take new jobs, we just finished the current one
            if not self._i_am_dying:
                # We want values stack over 2 loops (of 0.5s)
                if loop_idx == 0:
                    self._execution_stats.compute_stats_after_two_loops_after_manage_returns()
                
                # REF: doc/shinken-action-queues.png (3)
                # logger.debug('[worker][%d][nb_action:%s] _read_todo_actions' % (self.id, self.nb_action_todo()))
                # _read_todo_actions gives 0 or 1: the 1 is a "nothing to do" hint, limited by the 0.1s cap below
                wait_time += self._read_todo_actions()
                # REF: doc/shinken-action-queues.png (4)
                # logger.debug('[worker][%d][nb_action:%s] _launch_actions' % (self.id, self.nb_action_todo()))
                self._launch_actions()
            
            # Look if we are dying, and if we finish all current checks
            # if so, we really die, our master poller will launch a new
            # worker because we were too weak to manage our job :(
            if self.nb_action_todo() == 0 and self._i_am_dying:
                logger.warning('[worker][%d] I DIE because I cannot do my job as I should (too many open files?)... forgot me please.' % self.id)
                break
            
            # Shared value: publish the time of this loop
            self.last_loop_time.value = time.time()
            
            # Never sleep more than 0.1s here
            if wait_time > 0.1:
                wait_time = 0.1
            
            # Little sleep
            time.sleep(wait_time)
            
            # Time missing to make this cycle last 0.5s
            wait_for_fix_cycle = 0.5 - (time.time() - start_cycle_time)
            
            if 0 < wait_for_fix_cycle < 1:
                if wait_for_fix_cycle < 0.1:  # only log if we need: less than 0.1s is left in the cycle
                    self.logger.debug('Sleep time: %.3fs' % wait_for_fix_cycle)
                time.sleep(wait_for_fix_cycle)
            
            # Publish again after the sleeps
            self.last_loop_time.value = time.time()
