/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */
#ifndef RR_REC_SCHED_H_
#define RR_REC_SCHED_H_
#include <sched.h>
#include <deque>
#include <map>
#include <random>
#include <set>
#include <vector>
#include "Ticks.h"
#include "TraceFrame.h"
#include "core.h"
#include "util.h"
namespace rr {
class RecordSession;
class RecordTask;
class WaitAggregator;
/**
* Overview of rr scheduling:
*
 * rr honors priorities set by setpriority(2) --- even in situations where the
 * kernel doesn't, e.g. when a non-privileged task tries to increase its
 * priority. Normally rr honors priorities strictly by scheduling the
 * highest-priority runnable task; tasks with equal priorities are scheduled in
* round-robin fashion. Strict priority scheduling helps find bugs due to
* starvation.
*
* When a task calls sched_yield we temporarily switch to a completely
* fair scheduler that ignores priorities. All tasks are placed on a queue
* and while the queue is non-empty we take the next task from the queue and
* run it for a quantum if it's runnable. We do this because tasks calling
* sched_yield are often expecting some kind of fair scheduling and may deadlock
* (e.g. trying to acquire a spinlock) if some other tasks don't get a chance
* to run.
*
* The scheduler only runs during recording. During replay we're just replaying
* the recorded scheduling decisions.
*
 * The main interface to the scheduler is |reschedule|. This gets called
* after every rr event to decide which task to run next.
*
* The scheduler gives the current task a 'timeslice', a ticks deadline after
 * which we will try to switch to another task. So |reschedule| first
 * checks whether the currently running task has exceeded that deadline. If
 * not, and the current task is runnable, we schedule it again. If it's blocked
 * or has exceeded its deadline, we search for another task to run:
 * taking tasks from the round-robin queue until we find one that's runnable,
 * or, if the round-robin queue is empty, choosing the highest-priority
 * task that's runnable. If the highest-priority runnable task has the same
 * priority as the current task, we choose the next runnable task after the
 * current task (so equal-priority tasks run in round-robin order).
*
* The main parameter to the scheduler is |max_ticks|, which controls the
* length of each timeslice.
*/
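/**
 * A rough sketch of that decision flow (illustrative only; task_runnable,
 * pick_runnable, keep_running and run are hypothetical helpers standing in
 * for the real queue and priority-set scans):
 *
 *   if (current() && task_runnable(current()) &&
 *       ticks_now < current_timeslice_end()) {
 *     keep_running(current());            // timeslice not yet exhausted
 *   } else if (RecordTask* t = pick_runnable(task_round_robin_queue)) {
 *     run(t);                             // fair mode after sched_yield
 *   } else {
 *     // Highest-priority runnable task; ties with the current task are
 *     // broken in round-robin order after current().
 *     run(pick_runnable(task_priority_set));
 *   }
 */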
class Scheduler {
public:
/**
 * Like most task schedulers, this one must balance conflicting goals. A lower
 * max-ticks generally makes the application more "interactive", i.e. gives it
 * lower latency. (With respect to catching bugs, it also creates more
 * opportunity for bugs to arise in multi-threaded/process applications.) This
 * comes at the cost of more overhead from scheduling and context switching.
 * Context switches during recording are expensive because we must context
 * switch to the rr process and then to the next tracee task. A higher
 * max-ticks generally gives the application higher throughput.
*
* Using ticks (retired conditional branches) to compute timeslices is quite
* crude, since they don't correspond to any unit of time in general.
* Hopefully that can be improved, but empirical data from Firefox
* demonstrate, surprisingly consistently, a distribution of insns/rcb massed
 * around 10. Somewhat arbitrarily guessing ~4 cycles/insn on average
 * (a fair amount of pointer chasing), that implies for a nominal 2GHz CPU
* 50,000 ticks per millisecond. We choose the default max ticks to give us
* 50ms timeslices, i.e. 2,500,000 ticks.
*/
enum { DEFAULT_MAX_TICKS = 2500000 };
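/**
 * The arithmetic behind that default, spelled out:
 *   2e9 cycles/s   /  4 cycles/insn = 5e8 insns/s
 *   5e8 insns/s    / 10 insns/tick  = 5e7 ticks/s = 50,000 ticks/ms
 *   50,000 ticks/ms * 50 ms         = 2,500,000 ticks per timeslice
 */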
/**
* Don't allow max_ticks to get above this value.
*/
enum { MAX_MAX_TICKS = 1000000000 };
Scheduler(RecordSession& session);
void set_max_ticks(Ticks max_ticks) {
DEBUG_ASSERT(max_ticks <= MAX_MAX_TICKS);
max_ticks_ = max_ticks;
}
Ticks max_ticks() { return max_ticks_; }
void set_always_switch(bool always_switch) {
this->always_switch = always_switch;
}
void set_enable_chaos(bool enable_chaos);
void set_num_cores(int cores);
/**
* Schedule a new runnable task (which may be the same as current()).
*
* The new current() task is guaranteed to either have already been
* runnable, or have been made runnable by a waitpid status change (in
 * which case, result.by_waitpid will be true).
*
* After this, if Rescheduled::interrupted_by_signal is false,
* and there is a new current task, its is_stopped() must
* be true.
*/
struct Rescheduled {
bool interrupted_by_signal;
bool by_waitpid;
bool started_new_timeslice;
};
Rescheduled reschedule(Switchable switchable);
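/**
 * Hypothetical caller sketch (not rr's actual record loop), showing how
 * the result flags are meant to be read:
 *
 *   Scheduler::Rescheduled r = scheduler.reschedule(ALLOW_SWITCH);
 *   if (r.interrupted_by_signal) {
 *     // A signal arrived for rr itself; no guarantees about current().
 *   } else if (RecordTask* t = scheduler.current()) {
 *     DEBUG_ASSERT(t->is_stopped());
 *     if (r.started_new_timeslice) {
 *       // t begins a fresh ticks budget ending at
 *       // scheduler.current_timeslice_end().
 *     }
 *     // r.by_waitpid: t became runnable via an observed waitpid status
 *     // change, rather than having already been runnable.
 *   }
 */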
/**
* Set the priority of |t| to |value| and update related
* state.
*/
void update_task_priority(RecordTask* t, int value);
/**
* Do one round of round-robin scheduling if we're not already doing one.
* If we start round-robin scheduling now, make last_task the last
* task to be scheduled.
 * If the task_round_robin_queue is empty, this moves all tasks into it,
* putting last_task last.
*/
void schedule_one_round_robin(RecordTask* last_task);
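/**
 * Typical use (illustrative): when a tracee calls sched_yield, the
 * recorder can invoke schedule_one_round_robin(t) with the yielding task,
 * so every other task gets a chance to run before |t| runs again.
 */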
void on_create(RecordTask* t);
/**
* De-register a thread. This function should be called when a thread exits.
*/
void on_destroy(RecordTask* t);
RecordTask* current() const { return current_; }
void set_current(RecordTask* t) { current_ = t; }
Ticks current_timeslice_end() const { return current_timeslice_end_; }
void expire_timeslice() { current_timeslice_end_ = 0; }
double interrupt_after_elapsed_time() const;
/**
* Return the number of cores we should report to applications.
*/
int pretend_num_cores() const { return pretend_num_cores_; }
/**
* Return the processor affinity masks we should report to applications.
*/
const cpu_set_t& pretend_affinity_mask() const {
return pretend_affinity_mask_;
}
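/**
 * Let the scheduler know that |t| has entered a stable exit and will not
 * run any more user code.
 */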
void in_stable_exit(RecordTask* t);
/**
* In unlimited ticks mode, only one task is runnable while every other task
* is blocked in the kernel. Check whether we're in that situation.
*/
bool may_use_unlimited_ticks();
/**
* Let the scheduler know that the previously stopped task has resumed.
*/
void started_task(RecordTask* t);
/**
* Let the scheduler know that the previously running task has reached a kernel stop
* (typically a ptrace stop). Tasks that are blocked but not in a stop
* are still "running" for our purposes here.
*/
void stopped_task(RecordTask* t);
/**
* Let the scheduler know that the task has entered an execve.
*/
void did_enter_execve(RecordTask* t);
/**
* Let the scheduler know that the task has exited an execve.
*/
void did_exit_execve(RecordTask* t);
private:
struct CompareByScheduleOrder {
bool operator()(RecordTask* a, RecordTask* b) const;
};
struct SamePriorityTasks {
// Tasks ordered by time last scheduled, most recently scheduled last
std::set<RecordTask*, CompareByScheduleOrder> tasks;
int consecutive_uses_of_attention_set;
SamePriorityTasks() : consecutive_uses_of_attention_set(0) {}
};
// Tasks sorted by priority.
typedef std::map<int, SamePriorityTasks> TaskPrioritySet;
typedef std::deque<RecordTask*> TaskQueue;
typedef std::default_random_engine Random;
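/**
 * Note on ordering (illustrative): TaskPrioritySet is a std::map keyed by
 * priority value, so iterating from begin() visits priority buckets in
 * ascending numeric order, and within a bucket CompareByScheduleOrder
 * yields tasks in last-scheduled order:
 *
 *   for (auto& bucket : task_priority_set) {
 *     // bucket.first == t->priority for every t in bucket.second.tasks
 *     for (RecordTask* t : bucket.second.tasks) {
 *       // least recently scheduled tasks come first
 *     }
 *   }
 */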
/**
* Pull a task from the round-robin queue if available. Otherwise,
* find the highest-priority task that is runnable. If the highest-priority
* runnable task has the same priority as 't', return 't' or
* the next runnable task after 't' in round-robin order.
* Sets 'by_waitpid' to true if we determined the task was runnable by
 * calling waitpid on it and observing a state change. This task *must*
 * be the task chosen to run next, and is_task_runnable must not be called
 * on it again until it has run.
* Considers only tasks with priority <= priority_threshold.
*/
RecordTask* find_next_runnable_task(WaitAggregator& wait_aggregator,
std::map<int, std::vector<RecordTask*>>& attention_set_by_priority,
bool* by_waitpid, int priority_threshold);
/**
 * Removes and returns the first task in the round-robin queue, or returns
 * null if the queue is empty.
*/
RecordTask* get_round_robin_task();
void maybe_pop_round_robin_task(RecordTask* t);
void setup_new_timeslice();
void maybe_reset_priorities(double now);
int choose_random_priority(RecordTask* t);
void update_task_priority_internal(RecordTask* t, int value);
void maybe_reset_high_priority_only_intervals(double now);
bool in_high_priority_only_interval(double now);
bool treat_as_high_priority(RecordTask* t);
bool is_task_runnable(RecordTask* t, WaitAggregator& wait_aggregator, bool* by_waitpid);
void validate_scheduled_task();
void regenerate_affinity_mask();
void insert_into_task_priority_set(RecordTask* t);
void remove_from_task_priority_set(RecordTask* t);
uint64_t reschedule_count;
Random random;
RecordSession& session;
/**
* Every task of this session is either in task_priority_set
* (when in_round_robin_queue is false), or in task_round_robin_queue
* (when in_round_robin_queue is true).
*
 * task_priority_set maps each priority value to the set of tasks with that
 * priority. This lets us efficiently iterate over the tasks with a given
 * priority, or over all tasks in priority order.
*/
TaskPrioritySet task_priority_set;
size_t task_priority_set_total_count;
TaskQueue task_round_robin_queue;
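/**
 * A sketch of the intended invariant (assuming task_priority_set_total_count
 * counts the tasks currently in task_priority_set):
 *
 *   task_priority_set_total_count + task_round_robin_queue.size()
 *       == number of live tasks registered via on_create()
 */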
/**
 * The currently scheduled task. This may be nullptr if the last scheduled
 * task has been destroyed.
*/
RecordTask* current_;
Ticks current_timeslice_end_;
/**
* At this time (or later) we should refresh these values.
*/
double high_priority_only_intervals_refresh_time;
double high_priority_only_intervals_start;
double high_priority_only_intervals_duration;
double high_priority_only_intervals_period;
/**
* At this time (or later) we should rerandomize RecordTask priorities.
*/
double priorities_refresh_time;
Ticks max_ticks_;
RecordTask* must_run_task;
cpu_set_t pretend_affinity_mask_;
int pretend_num_cores_;
/**
* If nonzero, the threadgroup ID of a threadgroup where one of the
* threads is currently in an execve.
*/
pid_t in_exec_tgid;
/**
* The number of tasks that have is_stopped_ set.
*/
int ntasks_stopped;
/**
* When true, context switch at every possible point.
*/
bool always_switch;
/**
* When true, make random scheduling decisions to try to increase the
* probability of finding buggy schedules.
*/
bool enable_chaos;
bool enable_poll;
bool last_reschedule_in_high_priority_only_interval;
bool unlimited_ticks_mode;
};
} // namespace rr
#endif /* RR_REC_SCHED_H_ */