#!/usr/bin/env python
# -*- coding: utf-8 -*-

#
# Copyright (C) 2009-2016:
#     Gabes Jean, naparuba@gmail.com
#     Gerhard Lausser, Gerhard.Lausser@consol.de
#     Gregory Starck, g.starck@gmail.com
#     Hartmut Goebel, h.goebel@goebel-consult.de
#     Martin Benjamin, b.martin@shinken-solutions.com
#
# This file is part of Shinken.
#
# Shinken is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Shinken is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with Shinken.  If not, see <http://www.gnu.org/licenses/>.

import cPickle
import cStringIO
import errno
import logging
import os
import signal
import sys
import time
import traceback
from Queue import Empty
from ctypes import c_double
from multiprocessing.sharedctypes import Value

import psutil
from action import TooManyOpenFiles
from shinken.basesubprocess import LookAtMyFatherThread
from shinken.log import logger
from shinken.message import Message
from shinken.util import get_inter_process_pipes, start_malloc_trim_thread, get_memory_used_percent

# On android we should use threads, not processes.
# The platform is detected by trying to import the `android` module.
is_android = True
try:
    import android
except ImportError:
    is_android = False

if not is_android:
    from multiprocessing import Process, Queue, cpu_count
else:
    from Queue import Queue
    from threading import Thread as Process
    
    # cpu_count() is called by the resource checks of the Worker, so the name
    # must exist on this branch too (it used to be a NameError here).
    try:
        from multiprocessing import cpu_count
    except ImportError:
        def cpu_count():
            """Fallback when multiprocessing is not usable: assume a single CPU."""
            return 1

# Module-wide counter, incremented by Worker.__init__ to give each worker its id
id_worker = 0

# Load threshold of a worker queue: print_worker_full() reports the worker as full when
# its load_todo_actions value (sum of the average_cpu_time of the actions given to it and
# not executed yet) reaches this value
WORKER_MAX_LOAD_QUEUE = 1.5

# Default threshold of running queue per CPU for launch check
# (multiplied by cpu_count() before being compared with the running queue)
WORKER_MAX_CPU_QUEUE_PER_CPU = 4

# Default threshold of RAM usage for launch check
# (compared with get_memory_used_percent() * 100)
WORKER_MAX_RAM_PERCENT = 95.0

# When True, the worker waits for available resources (cpu / ram / running queue)
# while launching its actions
_CHECK_RESOURCE_ENABLE = True
# Period, in seconds, between two resource checks while waiting. It is also the
# measure interval given to psutil.cpu_percent() for the very first cpu measure
_CHECK_RESOURCE_PERIOD = 0.010


class Worker(object):
    """
    This class is used by the poller and the reactionner to launch actions.
    """
    
    
    def __init__(self, processes_by_worker, max_plugins_output_length=8192, target=None, worker_type="fork", loaded_into='unknown', executor_id='none', http_daemon=None, daemon_display_name='',
                 max_ram_percent=WORKER_MAX_RAM_PERCENT, max_cpu_queue_per_cpu=WORKER_MAX_CPU_QUEUE_PER_CPU):
        """
        Build the worker and its (not yet started) background process.
        
        :param processes_by_worker: maximum number of actions the worker keeps in its todo list
        :param max_plugins_output_length: value given to action.check_finished()
        :param target: work function run in the background process; when None, the
                       default _do_work is used and worker_type is forced to "fork"
        :param worker_type: label of the worker, used in the "is full" log
        :param loaded_into: stored as is in _loaded_into
        :param executor_id: id stamped on each action given to this worker
        :param http_daemon: daemon that _do_work shuts down in the worker; always None on Windows
        :param daemon_display_name: name used to build the process title
        :param max_ram_percent: RAM usage threshold for the resource check
        :param max_cpu_queue_per_cpu: running queue threshold (per CPU) for the resource check
        """
        global id_worker
        id_worker += 1
        
        # Identity of the worker
        self.id = id_worker
        self.executor_id = executor_id
        self.daemon_display_name = daemon_display_name  # for ps aux display
        self._loaded_into = loaded_into
        
        # counter of action give each ticks
        self.action_counter_by_tick = 0
        
        # Communication channels with the background process
        self.master_orders = Queue()
        todo_pipe_ends = get_inter_process_pipes()
        self.pipe_in_actions_todo_send_by_executor_to_worker, self.pipe_out_actions_todo_send_by_executor_to_worker = todo_pipe_ends
        done_pipe_ends = get_inter_process_pipes()
        self.pipe_in_actions_done_by_worker_to_send_at_executor, self.pipe_out_actions_done_by_worker_to_send_at_executor = done_pipe_ends
        
        # By default, take our own code
        if target is None:
            target = self._do_work
            worker_type = "fork"
        self.worker_type = worker_type
        
        # windows forker do not like pickle http/lock
        self.http_daemon = None if os.name == 'nt' else http_daemon
        
        self.nb_doing_action = 0  # use by the poller to compute the action in worker
        
        # Resource measures and their thresholds
        self._ram_percent = {}
        self._cpu_percent = {}
        self._cpu_running_queue = {}
        self._max_cpu_queue_per_cpu = max_cpu_queue_per_cpu
        self._max_ram_percent = max_ram_percent
        self.wait_time_for_resource = 0
        
        # State of the work
        self._i_am_dying = False
        self._todo_actions = []
        self.load_todo_actions = Value(c_double, 0)
        self._processes_by_worker = processes_by_worker
        self._idletime = 0
        self._max_plugins_output_length = max_plugins_output_length
        self._full_counter = 0
        
        process_args = (
            target,
            self.pipe_out_actions_todo_send_by_executor_to_worker,
            self.pipe_in_actions_done_by_worker_to_send_at_executor,
            self.master_orders,
            os.getpid(),
            logger.get_log_file_path(),
        )
        self._process = Process(target=self._work, args=process_args)
    
    
    def start(self):
        self._process.start()
    
    
    # Kill the background process
    # AND correctly close the queues (input and output).
    # Each queue has its own thread, so close it too.
    def terminate(self):
        """
        Stop the background process and close every communication channel.
        
        The process is only terminated when we are not on android (threads
        cannot be terminated). The master orders queue is closed and its thread
        joined when it offers a close() method, then the four pipe ends are closed.
        """
        # We can just terminate process, not threads
        if not is_android:
            self._process.terminate()
        
        # If we are with a Manager() way there should be not such functions
        orders_queue = self.master_orders
        if hasattr(orders_queue, 'close'):
            orders_queue.close()
            orders_queue.join_thread()
        
        pipe_ends = (
            self.pipe_in_actions_todo_send_by_executor_to_worker,
            self.pipe_out_actions_todo_send_by_executor_to_worker,
            self.pipe_in_actions_done_by_worker_to_send_at_executor,
            self.pipe_out_actions_done_by_worker_to_send_at_executor,
        )
        for pipe_end in pipe_ends:
            pipe_end.close()
    
    
    def is_alive(self):
        return self._process.is_alive()
    
    
    def join(self, timeout=None):
        self._process.join(timeout)
    
    
    def send_order(self, order):
        self.master_orders.put(order)
    
    
    def add_todo_action(self, action):
        """
        Give an action to the worker process.
        
        The action is flagged as 'queue' and stamped with the executor / worker ids,
        the counters and the load of the worker are increased, then the action is
        wrapped in a 'Do' Message and sent, serialized, in the todo pipe.
        
        :param action: action to send; its average_cpu_time is added to the worker load
        """
        action.status = 'queue'
        # Save in the action the current executor_id / worker_id
        action.executor_id = self.executor_id
        action.worker_id = self.id
        do_message = Message(id=0, type='Do', data=action)
        
        # Accounting of what was given to this worker
        self.nb_doing_action += 1
        self.action_counter_by_tick += 1
        self.load_todo_actions.value += action.average_cpu_time
        
        # Now it's a Pipe(): we serialize by ourselves with cPickle before the send
        todo_pipe = self.pipe_in_actions_todo_send_by_executor_to_worker
        todo_pipe.send(cPickle.dumps(do_message, 2))
    
    
    def rollback_add_todo_action(self, action):
        action.status = 'inpoller'
        action.executor_id = -1
        action.worker_id = -1
        self.load_todo_actions.value -= action.average_cpu_time
        self.nb_doing_action -= 1
        self.action_counter_by_tick -= 1
    
    
    def get_results(self):
        res = []
        while self.pipe_out_actions_done_by_worker_to_send_at_executor.poll():
            # NOTE: the recv can fail if the other side is pushing into it, if so, skip this turn
            try:
                action_raw = self.pipe_out_actions_done_by_worker_to_send_at_executor.recv()
            except (IOError, cPickle.UnpicklingError) as exp:  # NOTE: UnpicklingError is possible if the worker is crash and didnt't flush the socket
                logger.warning('The worker %s reception did fail this turn (%s), skip to the next turn to receive more.' % (self.id, exp))
                return res
            action = cPickle.loads(action_raw)  # cPickle because Pipe() use a slow pickle instead of cPickle
            res.append(action)
        return res
    
    
    def print_worker_full(self):
        """
        Log that the worker is full when its load reaches WORKER_MAX_LOAD_QUEUE.
        
        The log is a DEBUG one, and becomes a WARNING once the worker was seen full
        more than 5 times in a row. The counter of full states is reset as soon as
        the worker is seen not full.
        """
        is_full = self.load_todo_actions.value >= WORKER_MAX_LOAD_QUEUE
        if not is_full:
            self._full_counter = 0
            return
        
        log_level = logging.WARNING if self._full_counter > 5 else logging.DEBUG
        logger.log(log_level, '[worker-%s][%d] is full [%s] reject [%s] actions' % (self.worker_type, self.id, self.load_todo_actions.value, self._full_counter))
        self._full_counter += 1
    
    
    def _compute_ram_usage(self):
        """
        Refresh the _ram_percent measure.
        
        The stored value is get_memory_used_percent() * 100 truncated to an int.
        """
        # NOTE(review): get_memory_used_percent() presumably returns a ratio between 0 and 1, to be confirmed
        memory_used = get_memory_used_percent()
        self._ram_percent = {'ram_percent': int(memory_used * 100)}
    
    
    def _compute_cpu_usage(self):
        """
        Refresh the _cpu_percent measure with the usage of each CPU.
        
        The very first measure blocks during _CHECK_RESOURCE_PERIOD; the next ones
        are not blocking (interval=None given to psutil.cpu_percent).
        """
        now = time.time()
        previous_measure = self._cpu_percent
        
        # -1 means that there is no previous measure
        elapsed = (now - previous_measure['at']) if previous_measure else -1
        interval = _CHECK_RESOURCE_PERIOD if elapsed == -1 else None
        
        try:
            per_cpu_percent = psutil.cpu_percent(interval=interval, percpu=True)
        except ZeroDivisionError:  # old psutil versions can crash with ZeroDivisionError, if so, like new psutil versions, give 0.0
            # cf: https://chromium.googlesource.com/external/github.com/giampaolo/psutil/+/master/psutil/__init__.py#1724
            per_cpu_percent = [0.0] * cpu_count()
        
        self._cpu_percent = {'cpu_percent': per_cpu_percent, 'at': now}
    
    
    def _compute_cpu_running_queue(self):
        now = time.time()
        
        # Feature not available on Windows
        if os.name == 'nt':
            self._cpu_running_queue = {'running_queue': -1, 'at': now, 'compute_period': _CHECK_RESOURCE_PERIOD}
            return
        
        self._cpu_running_queue = {'running_queue': self._get_running_procs()}
    
    
    def _get_running_procs(self):
        with open("/proc/stat") as stats:
            try:
                line = stats.readline()
                while not line.startswith('procs_running'):
                    line = stats.readline()
                nb_running = int(line.split(' ')[-1])
                line = stats.readline()
                # nb_running = procs_running + procs_blocked (the next line)
                nb_running += int(line.split(' ')[-1])
            except EOFError:
                nb_running = -1
            except Exception:
                nb_running = -1
        return nb_running
    
    
    def _is_resources_availability(self):
        """
        Refresh the cpu / ram / running queue measures and tell if a check can be launched.
        
        :return: True when at least one CPU is used under 80%, the RAM usage is not
                 over _max_ram_percent and the running queue is not over
                 _max_cpu_queue_per_cpu * number of CPUs
        """
        self._compute_cpu_usage()
        self._compute_ram_usage()
        self._compute_cpu_running_queue()
        
        # One CPU under the threshold is enough
        cpu_availability = any(cpu_usage < 80 for cpu_usage in self._cpu_percent['cpu_percent'])
        
        max_running_queue = self._max_cpu_queue_per_cpu * cpu_count()
        cpu_queue_availability = self._cpu_running_queue['running_queue'] <= max_running_queue
        ram_availability = self._ram_percent['ram_percent'] <= self._max_ram_percent
        return cpu_availability and ram_availability and cpu_queue_availability
    
    
    # Launch the checks that are in 'queue' status
    # REF: doc/shinken-action-queues.png (4)
    def _launch_actions(self):
        """
        Execute every action of the todo list that is still in 'queue' status.
        
        Before each launch, and when _CHECK_RESOURCE_ENABLE is set, the worker
        sleeps by steps of _CHECK_RESOURCE_PERIOD until resources are available;
        the slept time is added to wait_time_for_resource.
        
        Any exception raised by the execution of an action flags the worker as
        dying (_i_am_dying), it is never raised to the caller.
        """
        for action in self._todo_actions:
            # Only the actions to launch need resources: do not block the worker
            # loop for the actions that are already launched or finished
            if action.status != 'queue':
                continue
            
            while _CHECK_RESOURCE_ENABLE and not self._is_resources_availability():
                time.sleep(_CHECK_RESOURCE_PERIOD)
                self.wait_time_for_resource += _CHECK_RESOURCE_PERIOD
            
            self._idletime = 0
            
            try:
                action.execute()
                # The action is launched: its estimated cpu time leaves the load of the queue
                self.load_todo_actions.value = max(0, self.load_todo_actions.value - action.average_cpu_time)
            except TooManyOpenFiles as e:
                logger.error("[worker][%d] I am dying : Too many open files by [%s]." % (self.id, action))
                logger.error('UNMANAGED EXCEPTION: %s %s' % (e, type(e)))
                self._i_am_dying = True
            except Exception as e:
                # NOTE: e.message is deprecated and empty for many exceptions, so format the exception itself
                logger.error("[worker][%d] Exception %s" % (self.id, e))
                self._i_am_dying = True
    
    
    def _set_proctitle(self):
        try:
            from setproctitle import setproctitle
            setproctitle("%s [ - Worker %d ] " % (self.daemon_display_name, self.id))
        except Exception:
            # logger.debug("[worker][%d] Python module setproctitle not found." % self.id)
            pass
    
    
    # Get new actions while we have less than _processes_by_worker of them
    # If no new action was received and no check is in queue, ask for a 1 sec sleep
    # REF: doc/shinken-action-queues.png (3)
    def _read_todo_actions(self):
        actions_get_in_worker = 0
        try:
            while self.nb_action_todo() < self._processes_by_worker:
                has_element = self.pipe_out_actions_todo_send_by_executor_to_worker.poll()
                if not has_element:
                    break
                msg_raw = self.pipe_out_actions_todo_send_by_executor_to_worker.recv()  # we are sure there is an element there
                msg = cPickle.loads(msg_raw)
                if msg is not None:
                    action = msg.get_data()
                    self._todo_actions.append(action)
                    actions_get_in_worker += action.average_cpu_time
        
        except EOFError as exp:
            logger.error("[worker][%s] End _read_todo_actions EOFError, queue still exist??) " % self.id)
            raise
        except Empty as exp:
            if self.nb_action_todo() == 0:
                self._idletime = self._idletime + 1
                return 1
                
                # logger.debug("[worker][%s] End _read_todo_actions Empty [%s] [%s]) " % (self.id, exp, self.todo_queue_actions.qsize()))
        # Maybe the Queue() is not available, if so, just return get back to work :)
        except IOError, exp:
            pass
            # logger.debug("[worker][%s] End _read_todo_actions IOError [%s]) " % (self.id, exp))
        # logger.debug("[worker][%s] Get [%s] actions for [%.6f]s " % (self.id, len(self._todo_actions), actions_get_in_worker))
        return 0
    
    
    def nb_action_todo(self):
        return len(self._todo_actions)
    
    
    # Check the status of the launched checks, and send back the finished ones :)
    # REF: doc/shinken-action-queues.png (5)
    def _manage_finished_checks(self):
        to_del = []
        wait_time = 0.1
        
        for action in self._todo_actions:
            if action.status == 'launched':
                action.check_finished(self._max_plugins_output_length)
                wait_time = min(wait_time, action.check_finished_period)
            elif action.is_finish():
                # The action is done so we put it in done_queue_actions in order to return to scheduler the action info.
                
                try:
                    action_ser = cPickle.dumps(action, 2)
                    self.pipe_in_actions_done_by_worker_to_send_at_executor.send(action_ser)
                    # Only remove action if we manage to send to Pipe()
                    to_del.append(action)
                except IOError as exp:
                    if exp.errno == 11:  # Resource temporarily unavailable, pipe is full, skip new send and retry later
                        break
                    logger.error("[worker][%d] Worker fail to answer master: %s" % (self.id, exp))
                    sys.exit(2)
        
        for action in to_del:
            self._todo_actions.remove(action)
        
        return wait_time
    
    
    # Wrapper around the work function, in order to catch and log its exceptions.
    # To see the real work, look at _do_work.
    def _work(self, work_function, todo_queue_actions, done_queue_actions, master_orders, father_pid, log_file_path):
        """
        Entry point of the background process: prepare it, then run the work function.
        
        :param work_function: callable run with (todo_queue_actions, done_queue_actions, master_orders)
        :param todo_queue_actions: given as is to work_function
        :param done_queue_actions: given as is to work_function
        :param master_orders: given as is to work_function
        :param father_pid: pid given to the LookAtMyFatherThread (not used on Windows)
        :param log_file_path: base path of the local log registered on Windows
        
        Any exception is logged with its traceback, then raised again.
        """
        try:
            if os.name == 'nt':
                # On windows we need to reinitialise the log as multiprocessing cannot share file, so log was set as
                # non-inheritable for its handlers
                logger.register_local_log('%s.worker%s' % (log_file_path, self.id))
            else:
                # WARNING: this do not work on windows as the kill(0) will really kill the
                # father process.
                # TODO: find a way on windows to detect a process is alive
                father_watcher = LookAtMyFatherThread(father_pid, self.daemon_display_name, '[worker:%d]' % self.id, loop_speed=60)
                father_watcher.start_thread()
            
            # also launch the malloc trim thread
            start_malloc_trim_thread()
            # Go
            work_function(todo_queue_actions, done_queue_actions, master_orders)
        except Exception:
            # Catch any exception, try to print it and exit anyway
            logger.error("[worker][%s] Worker exit with an unmanaged exception : %s" % (self.id, traceback.format_exc()))
            # Ok I die now
            raise
    
    
    # master_orders = Control queue for the worker
    def _do_work(self, _unused1, _unused2, master_orders):
        """
        Main loop of the worker, run in the background process.
        
        Each cycle: manage the finished checks, read and launch new actions
        (unless the worker is dying), look for an order of the master, then sleep
        so that a cycle lasts about 0.5 second at least.
        
        The loop ends when the master sends a 'Die' order, or when the worker is
        dying and has no more action in its todo list.
        
        :param _unused1: not used (the pipes of the instance are used instead)
        :param _unused2: not used (the pipes of the instance are used instead)
        :param master_orders: control queue for the worker
        """
        # logger.debug("[worker][%d] I start working." % self.id)
        # restore default signal handler for the workers:
        # but on android, we are a thread, so don't do it
        if not is_android:
            signal.signal(signal.SIGTERM, signal.SIG_DFL)
        
        self._set_proctitle()
        
        # When we fork we must close our self.http_daemon (launched in a thread so don't lock us)
        if self.http_daemon:
            self.http_daemon.shutdown(quiet=True)
        
        self._todo_actions = []
        
        start_resource_cycle_time = time.time()
        while True:
            start_cycle_time = time.time()
            # REF: doc/shinken-action-queues.png (5)
            # logger.debug('[worker][%d][nb_action:%s] manage_finished_checks' % (self.id, self.nb_action_todo()))
            wait_time = self._manage_finished_checks()
            
            # If we are dying (big problem!) we do not take new jobs, we just finished the current one
            if not self._i_am_dying:
                # REF: doc/shinken-action-queues.png (3)
                # logger.debug('[worker][%d][nb_action:%s] _read_todo_actions' % (self.id, self.nb_action_todo()))
                wait_time += self._read_todo_actions()
                # REF: doc/shinken-action-queues.png (4)
                # logger.debug('[worker][%d][nb_action:%s] _launch_actions' % (self.id, self.nb_action_todo()))
                self._launch_actions()
            
            # Now get order from master
            # logger.debug('[worker][%d][nb_action:%s] get master_orders' % (self.id, self.nb_action_todo()))
            try:
                order = master_orders.get(block=False)
                if order.get_type() == 'Die':
                    # logger.debug("[worker][%d] Master as worker to DIE." % self.id)
                    break
            except Empty:
                # No Master order
                pass
            except Exception:
                # Error in master_orders, we ignore it. Only Exception is caught, so
                # KeyboardInterrupt and SystemExit are no more swallowed here
                # logger.debug("[worker][%d] Error in master_orders, we ignore it" % self.id)
                pass
            
            # Look if we are dying, and if we finish all current checks
            # if so, we really die, our master poller will launch a new
            # worker because we were too weak to manage our job :(
            if self.nb_action_todo() == 0 and self._i_am_dying:
                logger.warning("[worker][%d] I DIE because I cannot do my job as I should (too many open files?)... forgot me please." % self.id)
                break
            
            # Little sleep
            # logger.debug('[worker][%d][nb_action:%s] start wait_time for [%s]' % (self.id, self.nb_action_todo(), wait_time))
            time.sleep(wait_time)
            
            # Complete the cycle so that it lasts about 0.5 second
            wait_for_fix_cycle = 0.5 - (time.time() - start_cycle_time)
            # logger.debug('[worker][%d][nb_action:%s] start cycle sleep for [%s]' % (self.id, self.nb_action_todo(), wait_for_fix_cycle))
            if 0 < wait_for_fix_cycle < 1:
                time.sleep(wait_for_fix_cycle)
                # logger.debug("[worker][%d] Cycle time [%.6f] wait time [%.6f]." % (self.id, (time.time() - start_cycle), wait_time))
                # logger.debug("[worker][%d] actions to do in queue[%s] in list[%s]." % (self.id, self.todo_queue_actions.qsize(), len(self._todo_actions)))
            
            # Every 10 seconds, log the share of time spent waiting for resources
            for_time = time.time() - start_resource_cycle_time
            if for_time > 10:
                logger.debug("[worker][%d] worker wait resource [%.6f] for [%.6f] ([%.2f]%%)" % (self.id, self.wait_time_for_resource, for_time, self.wait_time_for_resource * 100.0 / for_time))
                start_resource_cycle_time = time.time()
                self.wait_time_for_resource = 0
