| /* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ |
| |
| #ifndef RR_REC_SCHED_H_ |
| #define RR_REC_SCHED_H_ |
| |
| #include <sched.h> |
| |
| #include <deque> |
| #include <map> |
| #include <random> |
| #include <set> |
| #include <vector> |
| |
| #include "Ticks.h" |
| #include "TraceFrame.h" |
| #include "core.h" |
| #include "util.h" |
| |
| namespace rr { |
| |
| class RecordSession; |
| class RecordTask; |
| class WaitAggregator; |
| |
| /** |
| * Overview of rr scheduling: |
| * |
| * rr honours priorities set by setpriority(2) --- even in situations where the |
| * kernel doesn't, e.g. when a non-privileged task tries to increase its |
| * priority. Normally rr honors priorities strictly by scheduling the highest |
| * priority runnable task; tasks with equal priorities are scheduled in |
| * round-robin fashion. Strict priority scheduling helps find bugs due to |
| * starvation. |
| * |
| * When a task calls sched_yield we temporarily switch to a completely |
| * fair scheduler that ignores priorities. All tasks are placed on a queue |
| * and while the queue is non-empty we take the next task from the queue and |
| * run it for a quantum if it's runnable. We do this because tasks calling |
| * sched_yield are often expecting some kind of fair scheduling and may deadlock |
| * (e.g. trying to acquire a spinlock) if some other tasks don't get a chance |
| * to run. |
| * |
| * The scheduler only runs during recording. During replay we're just replaying |
| * the recorded scheduling decisions. |
| * |
| * The main interface to the scheduler is |get_next_thread|. This gets called |
| * after every rr event to decide which task to run next. |
| * |
| * The scheduler gives the current task a 'timeslice', a ticks deadline after |
| * which we will try to switch to another task. So |get_next_thread| first |
| * checks whether the currently running task has exceeded that deadline. If |
| * not, and the current task is runnable, we schedule it again. If it's blocked |
| * or has exceeded its deadline, we search for another task to run: |
| * taking tasks from the round-robin queue until we find one that's runnable, |
| * and then if the round-robin queue is empty, choosing the highest-priority |
| * task that's runnable. If the highest-priority runnable task has the same |
| * priority as the current task, choose the next runnable task after the |
| * current task (so equal priority tasks run in round-robin order). |
| * |
| * The main parameter to the scheduler is |max_ticks|, which controls the |
| * length of each timeslice. |
| */ |
| class Scheduler { |
| public: |
| /** |
| * Like most task schedulers, there are conflicting goals to balance. Lower |
| * max-ticks generally makes the application more "interactive", generally |
| * speaking lower latency. (And wrt catching bugs, this setting generally |
| * creates more opportunity for bugs to arise in multi-threaded/process |
| * applications.) This comes at the cost of more overhead from scheduling and |
| * context switching. Context switches during recording are expensive because |
| * we must context switch to the rr process and then to the next tracee task. |
| * Increasing max-ticks generally gives the application higher throughput. |
| * |
| * Using ticks (retired conditional branches) to compute timeslices is quite |
| * crude, since they don't correspond to any unit of time in general. |
| * Hopefully that can be improved, but empirical data from Firefox |
| * demonstrate, surprisingly consistently, a distribution of insns/rcb massed |
| * around 10. Somewhat arbitrarily guessing ~4cycles/insn on average |
| * (fair amount of pointer chasing), that implies for a nominal 2GHz CPU |
| * 50,000 ticks per millisecond. We choose the default max ticks to give us |
| * 50ms timeslices, i.e. 2,500,000 ticks. |
| */ |
| enum { DEFAULT_MAX_TICKS = 2500000 }; |
| /** |
| * Don't allow max_ticks to get above this value. |
| */ |
| enum { MAX_MAX_TICKS = 1000000000 }; |
| |
| Scheduler(RecordSession& session); |
| |
| void set_max_ticks(Ticks max_ticks) { |
| DEBUG_ASSERT(max_ticks <= MAX_MAX_TICKS); |
| max_ticks_ = max_ticks; |
| } |
| Ticks max_ticks() { return max_ticks_; } |
| void set_always_switch(bool always_switch) { |
| this->always_switch = always_switch; |
| } |
| void set_enable_chaos(bool enable_chaos); |
| void set_num_cores(int cores); |
| |
| /** |
| * Schedule a new runnable task (which may be the same as current()). |
| * |
| * The new current() task is guaranteed to either have already been |
| * runnable, or have been made runnable by a waitpid status change (in |
| * which case, result.by_waitpid will be true. |
| * |
| * After this, if Rescheduled::interrupted_by_signal is false, |
| * and there is a new current task, its is_stopped() must |
| * be true. |
| */ |
| struct Rescheduled { |
| bool interrupted_by_signal; |
| bool by_waitpid; |
| bool started_new_timeslice; |
| }; |
| Rescheduled reschedule(Switchable switchable); |
| |
| /** |
| * Set the priority of |t| to |value| and update related |
| * state. |
| */ |
| void update_task_priority(RecordTask* t, int value); |
| |
| /** |
| * Do one round of round-robin scheduling if we're not already doing one. |
| * If we start round-robin scheduling now, make last_task the last |
| * task to be scheduled. |
| * If the task_round_robin_queue is empty this moves all tasks into it, |
| * putting last_task last. |
| */ |
| void schedule_one_round_robin(RecordTask* last_task); |
| |
| void on_create(RecordTask* t); |
| /** |
| * De-register a thread. This function should be called when a thread exits. |
| */ |
| void on_destroy(RecordTask* t); |
| |
| RecordTask* current() const { return current_; } |
| void set_current(RecordTask* t) { current_ = t; } |
| |
| Ticks current_timeslice_end() const { return current_timeslice_end_; } |
| |
| void expire_timeslice() { current_timeslice_end_ = 0; } |
| |
| double interrupt_after_elapsed_time() const; |
| |
| /** |
| * Return the number of cores we should report to applications. |
| */ |
| int pretend_num_cores() const { return pretend_num_cores_; } |
| /** |
| * Return the processor affinity masks we should report to applications. |
| */ |
| const cpu_set_t& pretend_affinity_mask() const { |
| return pretend_affinity_mask_; |
| } |
| |
| void in_stable_exit(RecordTask* t); |
| |
| /** |
| * In unlimited ticks mode, only one task is runnable while every other task |
| * is blocked in the kernel. Check whether we're in that situation. |
| */ |
| bool may_use_unlimited_ticks(); |
| |
| /** |
| * Let the scheduler know that the previously stopped task has resumed. |
| */ |
| void started_task(RecordTask* t); |
| |
| /** |
| * Let the scheduler know that the previously running task has reached a kernel stop |
| * (typically a ptrace stop). Tasks that are blocked but not in a stop |
| * are still "running" for our purposes here. |
| */ |
| void stopped_task(RecordTask* t); |
| |
| /** |
| * Let the scheduler know that the task has entered an execve. |
| */ |
| void did_enter_execve(RecordTask* t); |
| /** |
| * Let the scheduler know that the task has exited an execve. |
| */ |
| void did_exit_execve(RecordTask* t); |
| |
| private: |
| struct CompareByScheduleOrder { |
| bool operator()(RecordTask* a, RecordTask* b) const; |
| }; |
| struct SamePriorityTasks { |
| // Tasks ordered in of last-scheduled, most recently scheduled last |
| std::set<RecordTask*, CompareByScheduleOrder> tasks; |
| int consecutive_uses_of_attention_set; |
| |
| SamePriorityTasks() : consecutive_uses_of_attention_set(0) {} |
| }; |
| // Tasks sorted by priority. |
| typedef std::map<int, SamePriorityTasks> TaskPrioritySet; |
| typedef std::deque<RecordTask*> TaskQueue; |
| typedef std::default_random_engine Random; |
| |
| /** |
| * Pull a task from the round-robin queue if available. Otherwise, |
| * find the highest-priority task that is runnable. If the highest-priority |
| * runnable task has the same priority as 't', return 't' or |
| * the next runnable task after 't' in round-robin order. |
| * Sets 'by_waitpid' to true if we determined the task was runnable by |
| * calling waitpid on it and observing a state change. This task *must* |
| * be returned by get_next_thread, and is_runnable_task must not be called |
| * on it again until it has run. |
| * Considers only tasks with priority <= priority_threshold. |
| */ |
| RecordTask* find_next_runnable_task(WaitAggregator& wait_aggregator, |
| std::map<int, std::vector<RecordTask*>>& attention_set_by_priority, |
| bool* by_waitpid, int priority_threshold); |
| /** |
| * Returns the first task in the round-robin queue or null if it's empty, |
| * removing it from the round-robin queue. |
| */ |
| RecordTask* get_round_robin_task(); |
| void maybe_pop_round_robin_task(RecordTask* t); |
| void setup_new_timeslice(); |
| void maybe_reset_priorities(double now); |
| int choose_random_priority(RecordTask* t); |
| void update_task_priority_internal(RecordTask* t, int value); |
| void maybe_reset_high_priority_only_intervals(double now); |
| bool in_high_priority_only_interval(double now); |
| bool treat_as_high_priority(RecordTask* t); |
| bool is_task_runnable(RecordTask* t, WaitAggregator& wait_aggregator, bool* by_waitpid); |
| void validate_scheduled_task(); |
| void regenerate_affinity_mask(); |
| |
| void insert_into_task_priority_set(RecordTask* t); |
| void remove_from_task_priority_set(RecordTask* t); |
| |
| uint64_t reschedule_count; |
| |
| Random random; |
| |
| RecordSession& session; |
| |
| /** |
| * Every task of this session is either in task_priority_set |
| * (when in_round_robin_queue is false), or in task_round_robin_queue |
| * (when in_round_robin_queue is true). |
| * |
| * task_priority_set is a set of pairs of (task->priority, task). This |
| * lets us efficiently iterate over the tasks with a given priority, or |
| * all tasks in priority order. |
| */ |
| TaskPrioritySet task_priority_set; |
| size_t task_priority_set_total_count; |
| TaskQueue task_round_robin_queue; |
| |
| /** |
| * The currently scheduled task. This may be nullptr if the last scheduled |
| * task |
| * has been destroyed. |
| */ |
| RecordTask* current_; |
| Ticks current_timeslice_end_; |
| |
| /** |
| * At this time (or later) we should refresh these values. |
| */ |
| double high_priority_only_intervals_refresh_time; |
| double high_priority_only_intervals_start; |
| double high_priority_only_intervals_duration; |
| double high_priority_only_intervals_period; |
| /** |
| * At this time (or later) we should rerandomize RecordTask priorities. |
| */ |
| double priorities_refresh_time; |
| |
| Ticks max_ticks_; |
| |
| RecordTask* must_run_task; |
| |
| cpu_set_t pretend_affinity_mask_; |
| int pretend_num_cores_; |
| |
| /** |
| * If nonzero, the threadgroup ID of a threadgroup where one of the |
| * threads is currently in an execve. |
| */ |
| pid_t in_exec_tgid; |
| |
| /** |
| * The number of tasks that have is_stopped_ set. |
| */ |
| int ntasks_stopped; |
| |
| /** |
| * When true, context switch at every possible point. |
| */ |
| bool always_switch; |
| /** |
| * When true, make random scheduling decisions to try to increase the |
| * probability of finding buggy schedules. |
| */ |
| bool enable_chaos; |
| |
| bool enable_poll; |
| bool last_reschedule_in_high_priority_only_interval; |
| |
| bool unlimited_ticks_mode; |
| }; |
| |
| } // namespace rr |
| |
| #endif /* RR_REC_SCHED_H_ */ |