Skip to main content

clone

Version

  • Linux v5.8.13 (64-bit)

Abstract

long sys_clone(unsigned long clone_flags, unsigned long newsp,
int __user *parent_tidptr,
int __user *child_tidptr,
unsigned long tls);

Arguments

arg1: unsigned long clone_flags

from include/uapi/linux/sched.h

/*
* cloning flags:
*/
#define CSIGNAL 0x000000ff /* signal mask to be sent at exit */
#define CLONE_VM 0x00000100 /* set if VM shared between processes */
#define CLONE_FS 0x00000200 /* set if fs info shared between processes */
#define CLONE_FILES 0x00000400 /* set if open files shared between processes */
#define CLONE_SIGHAND 0x00000800 /* set if signal handlers and blocked signals shared */
#define CLONE_PIDFD 0x00001000 /* set if a pidfd should be placed in parent */
#define CLONE_PTRACE 0x00002000 /* set if we want to let tracing continue on the child too */
#define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */
#define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */
#define CLONE_THREAD 0x00010000 /* Same thread group? */
#define CLONE_NEWNS 0x00020000 /* New mount namespace group */
#define CLONE_SYSVSEM 0x00040000 /* share system V SEM_UNDO semantics */
#define CLONE_SETTLS 0x00080000 /* create a new TLS for the child */
#define CLONE_PARENT_SETTID 0x00100000 /* set the TID in the parent */
#define CLONE_CHILD_CLEARTID 0x00200000 /* clear the TID in the child */
#define CLONE_DETACHED 0x00400000 /* Unused, ignored */
#define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */
#define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */
#define CLONE_NEWCGROUP 0x02000000 /* New cgroup namespace */
#define CLONE_NEWUTS 0x04000000 /* New utsname namespace */
#define CLONE_NEWIPC 0x08000000 /* New ipc namespace */
#define CLONE_NEWUSER 0x10000000 /* New user namespace */
#define CLONE_NEWPID 0x20000000 /* New pid namespace */
#define CLONE_NEWNET 0x40000000 /* New network namespace */
#define CLONE_IO 0x80000000 /* Clone io context */

arg5: unsigned long tls

新しいスレッドの TLS の位置を示す fsbase のアドレス.

arg1 の clone_flags で CLONE_SETTLS のビットが true の場合に下記の順に tls が設定される.

fsbase の設定は do_arch_prctl_64() で行う.

sys_clone()
-> _do_fork()
-> copy_process()
-> copy_thread_tls()
-> set_new_tls() <-- ここで tls を更新 (file: arch/x86/kernel/process.c)

Return

返り値は生成されたスレッドの tid.失敗した場合は -1 が返る.

Definitions

sys_clone()

システムコールの実装は kernel/fork.c にて次のように行われている.

config によって定義が異なっているが,実験環境では CONFIG_CLONE_BACKWARDS* が定義されていなかったので一番下のものが有効になっていた.

  • kernel/fork.c
#ifdef __ARCH_WANT_SYS_CLONE
#ifdef CONFIG_CLONE_BACKWARDS
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
int __user *, parent_tidptr,
unsigned long, tls,
int __user *, child_tidptr)
#elif defined(CONFIG_CLONE_BACKWARDS2)
SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
int __user *, parent_tidptr,
int __user *, child_tidptr,
unsigned long, tls)
#elif defined(CONFIG_CLONE_BACKWARDS3)
SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp,
int, stack_size,
int __user *, parent_tidptr,
int __user *, child_tidptr,
unsigned long, tls)
#else
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
int __user *, parent_tidptr,
int __user *, child_tidptr,
unsigned long, tls)
#endif
{
struct kernel_clone_args args = {
.flags = (lower_32_bits(clone_flags) & ~CSIGNAL),
.pidfd = parent_tidptr,
.child_tid = child_tidptr,
.parent_tid = parent_tidptr,
.exit_signal = (lower_32_bits(clone_flags) & CSIGNAL),
.stack = newsp,
.tls = tls,
};

if (!legacy_clone_args_valid(&args))
return -EINVAL;

return _do_fork(&args);
}
#endif

sys_fork でも呼ばれる _do_fork() が実際の sys_clone の処理を行っている.

気になった関数などをピックアップした関数呼び出しフロー.

  • sys_clone()
    • _do_fork()
      • copy_process()
        • dup_task_struct()
          • alloc_thread_stack_node()
          • arch_dup_task_struct()
          • setup_thread_task()
        • sched_fork()
        • copy_files()
        • copy_fs()
        • copy_sighand()
        • copy_mm()
        • copy_thread_tls()
          • set_new_tls()
      • wake_up_new_task()
        • activate_task()
          • enqueue_task()
            • p->sched_class->enqueue_task()

_do_fork()

  • kernel/fork.c
/*
* Ok, this is the main fork-routine.
*
* It copies the process, and if successful kick-starts
* it and waits for it to finish using the VM if required.
*
* args->exit_signal is expected to be checked for sanity by the caller.
*/
long _do_fork(struct kernel_clone_args *args)
{
u64 clone_flags = args->flags;
struct completion vfork;
struct pid *pid;
struct task_struct *p;
int trace = 0;
long nr;

/*
* Determine whether and which event to report to ptracer. When
* called from kernel_thread or CLONE_UNTRACED is explicitly
* requested, no event is reported; otherwise, report if the event
* for the type of forking is enabled.
*/
if (!(clone_flags & CLONE_UNTRACED)) {
if (clone_flags & CLONE_VFORK)
trace = PTRACE_EVENT_VFORK;
else if (args->exit_signal != SIGCHLD)
trace = PTRACE_EVENT_CLONE;
else
trace = PTRACE_EVENT_FORK;

if (likely(!ptrace_event_enabled(current, trace)))
trace = 0;
}

/* Create thread */
p = copy_process(NULL, trace, NUMA_NO_NODE, args);
...
/* Prepare for return */
pid = get_task_pid(p, PIDTYPE_PID);
nr = pid_vnr(pid);

if (clone_flags & CLONE_PARENT_SETTID)
put_user(nr, args->parent_tid);

if (clone_flags & CLONE_VFORK) {
p->vfork_done = &vfork;
init_completion(&vfork);
get_task_struct(p);
}

/* Set newly created task as running task */
wake_up_new_task(p);

/* forking complete and child started to run, tell ptracer */
if (unlikely(trace))
ptrace_event_pid(trace, pid);

if (clone_flags & CLONE_VFORK) {
if (!wait_for_vfork_done(p, &vfork))
ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
}

put_pid(pid);
return nr;
}

copy_process()

  • kernel/fork.c
/*
* This creates a new process as a copy of the old one,
* but does not actually start it yet.
*
* It copies the registers, and all the appropriate
* parts of the process environment (as per the clone
* flags). The actual kick-off is left to the caller.
*/
static __latent_entropy struct task_struct *copy_process(
struct pid *pid,
int trace,
int node,
struct kernel_clone_args *args)
{
int pidfd = -1, retval;
struct task_struct *p;
struct multiprocess_signals delayed;
struct file *pidfile = NULL;
u64 clone_flags = args->flags;
struct nsproxy *nsp = current->nsproxy;

/* Per-flag sanitizer check */
...

/*
* Force any signals received before this point to be delivered
* before the fork happens. Collect up signals sent to multiple
* processes that happen during the fork and delay them so that
* they appear to happen after the fork.
*/
...


/* p is the new task_struct, copied from current */
p = dup_task_struct(current, node);
if (!p)
goto fork_out;

/* goto error handler in some cases */
...

/* Initialize parameters of new task_struct */
delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE);
p->flags |= PF_FORKNOEXEC;
INIT_LIST_HEAD(&p->children);
INIT_LIST_HEAD(&p->sibling);
...

/* Perform scheduler related setup. Assign this task to a CPU. */
retval = sched_fork(clone_flags, p);
if (retval)
goto bad_fork_cleanup_policy;

retval = perf_event_init_task(p);
if (retval)
goto bad_fork_cleanup_policy;
retval = audit_alloc(p);
if (retval)
goto bad_fork_cleanup_perf;
/* copy all the process information */
shm_init_task(p);
retval = security_task_alloc(p, clone_flags);
if (retval)
goto bad_fork_cleanup_audit;
retval = copy_semundo(clone_flags, p);
if (retval)
goto bad_fork_cleanup_security;
retval = copy_files(clone_flags, p);
if (retval)
goto bad_fork_cleanup_semundo;
retval = copy_fs(clone_flags, p);
if (retval)
goto bad_fork_cleanup_files;
retval = copy_sighand(clone_flags, p);
if (retval)
goto bad_fork_cleanup_fs;
retval = copy_signal(clone_flags, p);
if (retval)
goto bad_fork_cleanup_sighand;
retval = copy_mm(clone_flags, p);
if (retval)
goto bad_fork_cleanup_signal;
retval = copy_namespaces(clone_flags, p);
if (retval)
goto bad_fork_cleanup_mm;
retval = copy_io(clone_flags, p);
if (retval)
goto bad_fork_cleanup_namespaces;
/* Copy or create new thread local storage */
retval = copy_thread_tls(clone_flags, args->stack, args->stack_size, p,
args->tls);
if (retval)
goto bad_fork_cleanup_io;

stackleak_task_init(p);

if (pid != &init_struct_pid) {
pid = alloc_pid(p->nsproxy->pid_ns_for_children, args->set_tid,
args->set_tid_size);
if (IS_ERR(pid)) {
retval = PTR_ERR(pid);
goto bad_fork_cleanup_thread;
}
}

/*
* This has to happen after we've potentially unshared the file
* descriptor table (so that the pidfd doesn't leak into the child
* if the fd table isn't shared).
*/
if (clone_flags & CLONE_PIDFD) {
retval = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
if (retval < 0)
goto bad_fork_free_pid;

pidfd = retval;

pidfile = anon_inode_getfile("[pidfd]", &pidfd_fops, pid,
O_RDWR | O_CLOEXEC);
if (IS_ERR(pidfile)) {
put_unused_fd(pidfd);
retval = PTR_ERR(pidfile);
goto bad_fork_free_pid;
}
get_pid(pid); /* held by pidfile now */

retval = put_user(pidfd, args->pidfd);
if (retval)
goto bad_fork_put_pidfd;
}

...

/* ok, now we should be set up.. */
p->pid = pid_nr(pid);
if (clone_flags & CLONE_THREAD) {
p->exit_signal = -1;
p->group_leader = current->group_leader;
p->tgid = current->tgid;
} else {
if (clone_flags & CLONE_PARENT)
p->exit_signal = current->group_leader->exit_signal;
else
p->exit_signal = args->exit_signal;
p->group_leader = p;
p->tgid = p->pid;
}

p->nr_dirtied = 0;
p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
p->dirty_paused_when = 0;

p->pdeath_signal = 0;
INIT_LIST_HEAD(&p->thread_group);
p->task_works = NULL;

/*
* Ensure that the cgroup subsystem policies allow the new process to be
* forked. It should be noted the the new process's css_set can be changed
* between here and cgroup_post_fork() if an organisation operation is in
* progress.
*/
retval = cgroup_can_fork(p, args);
if (retval)
goto bad_fork_put_pidfd;

/*
* From this point on we must avoid any synchronous user-space
* communication until we take the tasklist-lock. In particular, we do
* not want user-space to be able to predict the process start-time by
* stalling fork(2) after we recorded the start_time but before it is
* visible to the system.
*/

p->start_time = ktime_get_ns();
p->start_boottime = ktime_get_boottime_ns();

/*
* Make it visible to the rest of the system, but dont wake it up yet.
* Need tasklist lock for parent etc handling!
*/
write_lock_irq(&tasklist_lock);

/* CLONE_PARENT re-uses the old parent */
if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
p->real_parent = current->real_parent;
p->parent_exec_id = current->parent_exec_id;
} else {
p->real_parent = current;
p->parent_exec_id = current->self_exec_id;
}

klp_copy_process(p);

spin_lock(&current->sighand->siglock);

/*
* Copy seccomp details explicitly here, in case they were changed
* before holding sighand lock.
*/
copy_seccomp(p);

rseq_fork(p, clone_flags);

/* Don't start children in a dying pid namespace */
if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) {
retval = -ENOMEM;
goto bad_fork_cancel_cgroup;
}

/* Let kill terminate clone/fork in the middle */
if (fatal_signal_pending(current)) {
retval = -EINTR;
goto bad_fork_cancel_cgroup;
}

/* past the last point of failure */
if (pidfile)
fd_install(pidfd, pidfile);

init_task_pid_links(p);
if (likely(p->pid)) {
ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);

init_task_pid(p, PIDTYPE_PID, pid);
if (thread_group_leader(p)) {
init_task_pid(p, PIDTYPE_TGID, pid);
init_task_pid(p, PIDTYPE_PGID, task_pgrp(current));
init_task_pid(p, PIDTYPE_SID, task_session(current));

if (is_child_reaper(pid)) {
ns_of_pid(pid)->child_reaper = p;
p->signal->flags |= SIGNAL_UNKILLABLE;
}
p->signal->shared_pending.signal = delayed.signal;
p->signal->tty = tty_kref_get(current->signal->tty);
/*
* Inherit has_child_subreaper flag under the same
* tasklist_lock with adding child to the process tree
* for propagate_has_child_subreaper optimization.
*/
p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper ||
p->real_parent->signal->is_child_subreaper;
list_add_tail(&p->sibling, &p->real_parent->children);
list_add_tail_rcu(&p->tasks, &init_task.tasks);
attach_pid(p, PIDTYPE_TGID);
attach_pid(p, PIDTYPE_PGID);
attach_pid(p, PIDTYPE_SID);
__this_cpu_inc(process_counts);
} else {
current->signal->nr_threads++;
atomic_inc(&current->signal->live);
refcount_inc(&current->signal->sigcnt);
task_join_group_stop(p);
list_add_tail_rcu(&p->thread_group,
&p->group_leader->thread_group);
list_add_tail_rcu(&p->thread_node,
&p->signal->thread_head);
}
attach_pid(p, PIDTYPE_PID);
nr_threads++;
}
...
trace_task_newtask(p, clone_flags);
uprobe_copy_process(p, clone_flags);

return p;

/* Error handlers e.g. bad_fork_free, fork_out, ... */
...
}

dup_task_struct()

  • kernel/fork.c
static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
{
struct task_struct *tsk;
unsigned long *stack;
struct vm_struct *stack_vm_area __maybe_unused;
int err;

if (node == NUMA_NO_NODE)
node = tsk_fork_get_node(orig);
tsk = alloc_task_struct_node(node);
if (!tsk)
return NULL;

stack = alloc_thread_stack_node(tsk, node);
if (!stack)
goto free_tsk;

if (memcg_charge_kernel_stack(tsk))
goto free_stack;

stack_vm_area = task_stack_vm_area(tsk);

err = arch_dup_task_struct(tsk, orig);

/*
* arch_dup_task_struct() clobbers the stack-related fields. Make
* sure they're properly initialized before using any stack-related
* functions again.
*/
tsk->stack = stack;
#ifdef CONFIG_VMAP_STACK
tsk->stack_vm_area = stack_vm_area;
#endif
#ifdef CONFIG_THREAD_INFO_IN_TASK
refcount_set(&tsk->stack_refcount, 1);
#endif

if (err)
goto free_stack;

err = scs_prepare(tsk, node);
if (err)
goto free_stack;

#ifdef CONFIG_SECCOMP
/*
* We must handle setting up seccomp filters once we're under
* the sighand lock in case orig has changed between now and
* then. Until then, filter must be NULL to avoid messing up
* the usage counts on the error path calling free_task.
*/
tsk->seccomp.filter = NULL;
#endif

setup_thread_stack(tsk, orig);
clear_user_return_notifier(tsk);
clear_tsk_need_resched(tsk);
set_task_stack_end_magic(tsk);
...

#ifdef CONFIG_MEMCG
tsk->active_memcg = NULL;
#endif
return tsk;

free_stack:
free_thread_stack(tsk);
free_tsk:
free_task_struct(tsk);
return NULL;
}

arch_dup_task_struct()

  • file: arch/x86/kernel/process.c
/*
* this gets called so that we can store lazy state into memory and copy the
* current task into the new thread.
*/
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
memcpy(dst, src, arch_task_struct_size);
#ifdef CONFIG_VM86
dst->thread.vm86 = NULL;
#endif

return fpu__copy(dst, src);
}

copy_files()

  • kernel/fork.c
static int copy_files(unsigned long clone_flags, struct task_struct *tsk)
{
struct files_struct *oldf, *newf;
int error = 0;

/*
* A background process may not have any files ...
*/
oldf = current->files;
if (!oldf)
goto out;

if (clone_flags & CLONE_FILES) {
atomic_inc(&oldf->count);
goto out;
}

newf = dup_fd(oldf, &error);
if (!newf)
goto out;

tsk->files = newf;
error = 0;
out:
return error;
}

copy_fs()

  • kernel/fork.c
static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
{
struct fs_struct *fs = current->fs;
if (clone_flags & CLONE_FS) {
/* tsk->fs is already what we want */
spin_lock(&fs->lock);
if (fs->in_exec) {
spin_unlock(&fs->lock);
return -EAGAIN;
}
fs->users++;
spin_unlock(&fs->lock);
return 0;
}
tsk->fs = copy_fs_struct(fs);
if (!tsk->fs)
return -ENOMEM;
return 0;
}

copy_thread_tls()

  • arch/x86/kernel/process.c
int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
unsigned long arg, struct task_struct *p, unsigned long tls)
{
struct inactive_task_frame *frame;
struct fork_frame *fork_frame;
struct pt_regs *childregs;
int ret = 0;

childregs = task_pt_regs(p);
fork_frame = container_of(childregs, struct fork_frame, regs);
frame = &fork_frame->frame;

frame->bp = encode_frame_pointer(childregs);
frame->ret_addr = (unsigned long) ret_from_fork;
p->thread.sp = (unsigned long) fork_frame;
p->thread.io_bitmap = NULL;
memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));

#ifdef CONFIG_X86_64
savesegment(gs, p->thread.gsindex);
p->thread.gsbase = p->thread.gsindex ? 0 : current->thread.gsbase;
savesegment(fs, p->thread.fsindex);
p->thread.fsbase = p->thread.fsindex ? 0 : current->thread.fsbase;
savesegment(es, p->thread.es);
savesegment(ds, p->thread.ds);
#else
p->thread.sp0 = (unsigned long) (childregs + 1);
/*
* Clear all status flags including IF and set fixed bit. 64bit
* does not have this initialization as the frame does not contain
* flags. The flags consistency (especially vs. AC) is there
* ensured via objtool, which lacks 32bit support.
*/
frame->flags = X86_EFLAGS_FIXED;
#endif

/* Kernel thread ? */
if (unlikely(p->flags & PF_KTHREAD)) {
memset(childregs, 0, sizeof(struct pt_regs));
kthread_frame_init(frame, sp, arg);
return 0;
}

frame->bx = 0;
*childregs = *current_pt_regs();
childregs->ax = 0;
if (sp)
childregs->sp = sp;

#ifdef CONFIG_X86_32
task_user_gs(p) = get_user_gs(current_pt_regs());
#endif

/* Set a new TLS for the child thread? */
if (clone_flags & CLONE_SETTLS)
ret = set_new_tls(p, tls);

if (!ret && unlikely(test_tsk_thread_flag(current, TIF_IO_BITMAP)))
io_bitmap_share(p);

return ret;
}

wake_up_new_task()

  • file: kernel/sched/core.c
/*
* wake_up_new_task - wake up a newly created task for the first time.
*
* This function will do some initial scheduler statistics housekeeping
* that must be done for every newly created context, then puts the task
* on the runqueue and wakes it.
*/
void wake_up_new_task(struct task_struct *p)
{
struct rq_flags rf;
struct rq *rq;

raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
p->state = TASK_RUNNING;
#ifdef CONFIG_SMP
/*
* Fork balancing, do it here and not earlier because:
* - cpus_ptr can change in the fork path
* - any previously selected CPU might disappear through hotplug
*
* Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
* as we're not fully set-up yet.
*/
p->recent_used_cpu = task_cpu(p);
rseq_migrate(p);
__set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
#endif
rq = __task_rq_lock(p, &rf);
update_rq_clock(rq);
post_init_entity_util_avg(p);

activate_task(rq, p, ENQUEUE_NOCLOCK);
trace_sched_wakeup_new(p);
check_preempt_curr(rq, p, WF_FORK);
#ifdef CONFIG_SMP
if (p->sched_class->task_woken) {
/*
* Nothing relies on rq->lock after this, so its fine to
* drop it.
*/
rq_unpin_lock(rq, &rf);
p->sched_class->task_woken(rq, p);
rq_repin_lock(rq, &rf);
}
#endif
task_rq_unlock(rq, p, &rf);
}

activate_task()

  • file: kernel/sched/core.c
void activate_task(struct rq *rq, struct task_struct *p, int flags)
{
enqueue_task(rq, p, flags);

p->on_rq = TASK_ON_RQ_QUEUED;
}

enqueue_task()

  • file: kernel/sched/core.c
static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
if (!(flags & ENQUEUE_NOCLOCK))
update_rq_clock(rq);

if (!(flags & ENQUEUE_RESTORE)) {
sched_info_queued(rq, p);
psi_enqueue(p, flags & ENQUEUE_WAKEUP);
}

uclamp_rq_inc(rq, p);
p->sched_class->enqueue_task(rq, p, flags);
}

sched_class->enqueue_task()

enqueue_task_fair()

  • kernel/sched/fair.c
/*
* The enqueue_task method is called before nr_running is
* increased. Here we update the fair scheduling stats and
* then put the task into the rbtree:
*/
static void
enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
int idle_h_nr_running = task_has_idle_policy(p);

/*
* The code below (indirectly) updates schedutil which looks at
* the cfs_rq utilization to select a frequency.
* Let's add the task's estimated utilization to the cfs_rq's
* estimated utilization, before we update schedutil.
*/
util_est_enqueue(&rq->cfs, p);

/*
* If in_iowait is set, the code below may not trigger any cpufreq
* utilization updates, so do it here explicitly with the IOWAIT flag
* passed.
*/
if (p->in_iowait)
cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);

for_each_sched_entity(se) {
if (se->on_rq)
break;
cfs_rq = cfs_rq_of(se);
enqueue_entity(cfs_rq, se, flags);

cfs_rq->h_nr_running++;
cfs_rq->idle_h_nr_running += idle_h_nr_running;

/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
goto enqueue_throttle;

flags = ENQUEUE_WAKEUP;
}

for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);

update_load_avg(cfs_rq, se, UPDATE_TG);
se_update_runnable(se);
update_cfs_group(se);

cfs_rq->h_nr_running++;
cfs_rq->idle_h_nr_running += idle_h_nr_running;

/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
goto enqueue_throttle;

/*
* One parent has been throttled and cfs_rq removed from the
* list. Add it back to not break the leaf list.
*/
if (throttled_hierarchy(cfs_rq))
list_add_leaf_cfs_rq(cfs_rq);
}

/* At this point se is NULL and we are at root level*/
add_nr_running(rq, 1);

/*
* Since new tasks are assigned an initial util_avg equal to
* half of the spare capacity of their CPU, tiny tasks have the
* ability to cross the overutilized threshold, which will
* result in the load balancer ruining all the task placement
* done by EAS. As a way to mitigate that effect, do not account
* for the first enqueue operation of new tasks during the
* overutilized flag detection.
*
* A better way of solving this problem would be to wait for
* the PELT signals of tasks to converge before taking them
* into account, but that is not straightforward to implement,
* and the following generally works well enough in practice.
*/
if (flags & ENQUEUE_WAKEUP)
update_overutilized_status(rq);

enqueue_throttle:
if (cfs_bandwidth_used()) {
/*
* When bandwidth control is enabled; the cfs_rq_throttled()
* breaks in the above iteration can result in incomplete
* leaf list maintenance, resulting in triggering the assertion
* below.
*/
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);

if (list_add_leaf_cfs_rq(cfs_rq))
break;
}
}

assert_list_leaf_cfs_rq(rq);

hrtick_update(rq);
}

...

/*
* All the scheduling class methods:
*/
const struct sched_class fair_sched_class = {
.next = &idle_sched_class,
.enqueue_task = enqueue_task_fair,
...

enqueue_task_dl()

  • kernel/sched/deadline.c
static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
{
struct task_struct *pi_task = rt_mutex_get_top_task(p);
struct sched_dl_entity *pi_se = &p->dl;

/*
* Use the scheduling parameters of the top pi-waiter task if:
* - we have a top pi-waiter which is a SCHED_DEADLINE task AND
* - our dl_boosted is set (i.e. the pi-waiter's (absolute) deadline is
* smaller than our deadline OR we are a !SCHED_DEADLINE task getting
* boosted due to a SCHED_DEADLINE pi-waiter).
* Otherwise we keep our runtime and deadline.
*/
if (pi_task && dl_prio(pi_task->normal_prio) && p->dl.dl_boosted) {
pi_se = &pi_task->dl;
} else if (!dl_prio(p->normal_prio)) {
/*
* Special case in which we have a !SCHED_DEADLINE task
* that is going to be deboosted, but exceeds its
* runtime while doing so. No point in replenishing
* it, as it's going to return back to its original
* scheduling class after this.
*/
BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH);
return;
}

/*
* Check if a constrained deadline task was activated
* after the deadline but before the next period.
* If that is the case, the task will be throttled and
* the replenishment timer will be set to the next period.
*/
if (!p->dl.dl_throttled && !dl_is_implicit(&p->dl))
dl_check_constrained_dl(&p->dl);

if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & ENQUEUE_RESTORE) {
add_rq_bw(&p->dl, &rq->dl);
add_running_bw(&p->dl, &rq->dl);
}

/*
* If p is throttled, we do not enqueue it. In fact, if it exhausted
* its budget it needs a replenishment and, since it now is on
* its rq, the bandwidth timer callback (which clearly has not
* run yet) will take care of this.
* However, the active utilization does not depend on the fact
* that the task is on the runqueue or not (but depends on the
* task's state - in GRUB parlance, "inactive" vs "active contending").
* In other words, even if a task is throttled its utilization must
* be counted in the active utilization; hence, we need to call
* add_running_bw().
*/
if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH)) {
if (flags & ENQUEUE_WAKEUP)
task_contending(&p->dl, flags);

return;
}

enqueue_dl_entity(&p->dl, pi_se, flags);

if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
enqueue_pushable_dl_task(rq, p);
}

...

const struct sched_class dl_sched_class = {
.next = &rt_sched_class,
.enqueue_task = enqueue_task_dl,
...

enqueue_task_rt()

  • kernel/sched/rt.c
/*
* Adding/removing a task to/from a priority array:
*/
static void
enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
{
struct sched_rt_entity *rt_se = &p->rt;

if (flags & ENQUEUE_WAKEUP)
rt_se->timeout = 0;

enqueue_rt_entity(rt_se, flags);

if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
}

...

const struct sched_class rt_sched_class = {
.next = &fair_sched_class,
.enqueue_task = enqueue_task_rt,
...

enqueue_task_stop()

  • kernel/sched/stop_task.c
static void
enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
{
add_nr_running(rq, 1);
}

...

/*
* Simple, special scheduling class for the per-CPU stop tasks:
*/
const struct sched_class stop_sched_class = {
.next = &dl_sched_class,

.enqueue_task = enqueue_task_stop,
...
struct kernel_clone_args {
u64 flags;
int __user *pidfd;
int __user *child_tid;
int __user *parent_tid;
int exit_signal;
unsigned long stack;
unsigned long stack_size;
unsigned long tls;
pid_t *set_tid;
/* Number of elements in *set_tid */
size_t set_tid_size;
int cgroup;
struct cgroup *cgrp;
struct css_set *cset;
};

Examples