/*
 *  linux/kernel/fork.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 *  'fork.c' contains the help-routines for the 'fork' system call
 * (see also entry.S and others).
 * Fork is rather simple, once you get the hang of it, but the memory
 * management can be a bitch. See 'mm/memory.c': 'copy_page_range()'
 */

#include <linux/slab.h>
#include <linux/init.h>
#include <linux/unistd.h>
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/completion.h>
#include <linux/personality.h>
#include <linux/mempolicy.h>
#include <linux/sem.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/iocontext.h>
#include <linux/key.h>
#include <linux/binfmts.h>
#include <linux/mman.h>
#include <linux/mmu_notifier.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/vmacache.h>
#include <linux/nsproxy.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/cgroup.h>
#include <linux/security.h>
#include <linux/hugetlb.h>
#include <linux/seccomp.h>
#include <linux/swap.h>
#include <linux/syscalls.h>
#include <linux/jiffies.h>
#include <linux/futex.h>
#include <linux/compat.h>
#include <linux/kthread.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/rcupdate.h>
#include <linux/ptrace.h>
#include <linux/mount.h>
#include <linux/audit.h>
#include <linux/memcontrol.h>
#include <linux/ftrace.h>
#include <linux/proc_fs.h>
#include <linux/profile.h>
#include <linux/rmap.h>
#include <linux/ksm.h>
#include <linux/acct.h>
#include <linux/tsacct_kern.h>
#include <linux/cn_proc.h>
#include <linux/freezer.h>
#include <linux/delayacct.h>
#include <linux/taskstats_kern.h>
#include <linux/random.h>
#include <linux/tty.h>
#include <linux/blkdev.h>
#include <linux/fs_struct.h>
#include <linux/magic.h>
#include <linux/perf_event.h>
#include <linux/posix-timers.h>
#include <linux/user-return-notifier.h>
#include <linux/oom.h>
#include <linux/khugepaged.h>
#include <linux/signalfd.h>
#include <linux/uprobes.h>
#include <linux/aio.h>
#include <linux/compiler.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/uaccess.h>
#include <asm/mmu_context.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>

#include <trace/events/sched.h>

#define CREATE_TRACE_POINTS
#include <trace/events/task.h>

/*
 * Counters protected by write_lock_irq(&tasklist_lock)
 */
unsigned long total_forks;	/* Handle normal Linux uptimes. */
int nr_threads;			/* The idle threads do not count.. */

int max_threads;		/* tunable limit on nr_threads */

DEFINE_PER_CPU(unsigned long, process_counts) = 0;

__cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */

#ifdef CONFIG_PROVE_RCU
int lockdep_tasklist_lock_is_held(void)
{
	return lockdep_is_held(&tasklist_lock);
}
EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held);
#endif /* #ifdef CONFIG_PROVE_RCU */

int nr_processes(void)
{
	int cpu;
	int total = 0;

	for_each_possible_cpu(cpu)
		total += per_cpu(process_counts, cpu);

	return total;
}

void __weak arch_release_task_struct(struct task_struct *tsk)
{
}

#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
static struct kmem_cache *task_struct_cachep;

static inline struct task_struct *alloc_task_struct_node(int node)
{
	return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node);
}

static inline void free_task_struct(struct task_struct *tsk)
{
	kmem_cache_free(task_struct_cachep, tsk);
}
#endif

void __weak arch_release_thread_info(struct thread_info *ti)
{
}

#ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR

/*
 * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
 * kmem_cache based allocator.
 */
# if THREAD_SIZE >= PAGE_SIZE
static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
						  int node)
{
	struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP,
						  THREAD_SIZE_ORDER);

	return page ? page_address(page) : NULL;
}

static inline void free_thread_info(struct thread_info *ti)
{
	free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER);
}
# else
static struct kmem_cache *thread_info_cache;

static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
						  int node)
{
	return kmem_cache_alloc_node(thread_info_cache, THREADINFO_GFP, node);
}

static void free_thread_info(struct thread_info *ti)
{
	kmem_cache_free(thread_info_cache, ti);
}

void thread_info_cache_init(void)
{
	thread_info_cache = kmem_cache_create("thread_info", THREAD_SIZE,
					      THREAD_SIZE, 0, NULL);
	BUG_ON(thread_info_cache == NULL);
}
# endif
#endif

/* SLAB cache for signal_struct structures (tsk->signal) */
static struct kmem_cache *signal_cachep;

/* SLAB cache for sighand_struct structures (tsk->sighand) */
struct kmem_cache *sighand_cachep;

/* SLAB cache for files_struct structures (tsk->files) */
struct kmem_cache *files_cachep;

/* SLAB cache for fs_struct structures (tsk->fs) */
struct kmem_cache *fs_cachep;

/* SLAB cache for vm_area_struct structures */
struct kmem_cache *vm_area_cachep;

/* SLAB cache for mm_struct structures (tsk->mm) */
static struct kmem_cache *mm_cachep;

static void account_kernel_stack(struct thread_info *ti, int account)
{
	struct zone *zone = page_zone(virt_to_page(ti));

	mod_zone_page_state(zone, NR_KERNEL_STACK, account);
}

void free_task(struct task_struct *tsk)
{
	account_kernel_stack(tsk->stack, -1);
	arch_release_thread_info(tsk->stack);
	free_thread_info(tsk->stack);
	rt_mutex_debug_task_free(tsk);
	ftrace_graph_exit_task(tsk);
	put_seccomp_filter(tsk);
	arch_release_task_struct(tsk);
	free_task_struct(tsk);
}
EXPORT_SYMBOL(free_task);

static inline void free_signal_struct(struct signal_struct *sig)
{
	taskstats_tgid_free(sig);
	sched_autogroup_exit(sig);
	kmem_cache_free(signal_cachep, sig);
}

static inline void put_signal_struct(struct signal_struct *sig)
{
	if (atomic_dec_and_test(&sig->sigcnt))
		free_signal_struct(sig);
}

void __put_task_struct(struct task_struct *tsk)
{
	WARN_ON(!tsk->exit_state);
	WARN_ON(atomic_read(&tsk->usage));
	WARN_ON(tsk == current);

	task_numa_free(tsk);
	security_task_free(tsk);
	exit_creds(tsk);
	delayacct_tsk_free(tsk);
	put_signal_struct(tsk->signal);

	if (!profile_handoff_task(tsk))
		free_task(tsk);
}
EXPORT_SYMBOL_GPL(__put_task_struct);

void __init __weak arch_task_cache_init(void) { }

void __init fork_init(unsigned long mempages)
{
#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
#ifndef ARCH_MIN_TASKALIGN
#define ARCH_MIN_TASKALIGN	L1_CACHE_BYTES
#endif
	/* create a slab on which task_structs can be allocated */
	task_struct_cachep =
		kmem_cache_create("task_struct", sizeof(struct task_struct),
			ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL);
#endif

	/* do the arch specific task caches init */
	arch_task_cache_init();

	/*
	 * The default maximum number of threads is set to a safe
	 * value: the thread structures can take up at most one
	 * eighth of memory.
	 */
	max_threads = mempages / (8 * THREAD_SIZE / PAGE_SIZE);

	/*
	 * we need to allow at least 20 threads to boot a system
	 */
	if (max_threads < 20)
		max_threads = 20;

	init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
	init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
	init_task.signal->rlim[RLIMIT_SIGPENDING] =
		init_task.signal->rlim[RLIMIT_NPROC];
}
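
/*
 * Worked example of the sizing above (illustrative only; the numbers
 * depend on the architecture): with 4 GiB of RAM, PAGE_SIZE = 4 KiB and
 * THREAD_SIZE = 8 KiB, mempages is 1048576, so
 *
 *	max_threads = 1048576 / (8 * 8192 / 4096) = 65536
 *
 * and RLIMIT_NPROC defaults to max_threads/2 = 32768 tasks per user.
 */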

int __weak arch_dup_task_struct(struct task_struct *dst,
					       struct task_struct *src)
{
	*dst = *src;
	return 0;
}

void set_task_stack_end_magic(struct task_struct *tsk)
{
	unsigned long *stackend;

	stackend = end_of_stack(tsk);
	*stackend = STACK_END_MAGIC;	/* for overflow detection */
}

static struct task_struct *dup_task_struct(struct task_struct *orig)
{
	struct task_struct *tsk;
	struct thread_info *ti;
	int node = tsk_fork_get_node(orig);
	int err;

	tsk = alloc_task_struct_node(node);
	if (!tsk)
		return NULL;

	ti = alloc_thread_info_node(tsk, node);
	if (!ti)
		goto free_tsk;

	err = arch_dup_task_struct(tsk, orig);
	if (err)
		goto free_ti;

	tsk->stack = ti;
#ifdef CONFIG_SECCOMP
	/*
	 * We must handle setting up seccomp filters once we're under
	 * the sighand lock in case orig has changed between now and
	 * then. Until then, filter must be NULL to avoid messing up
	 * the usage counts on the error path calling free_task.
	 */
	tsk->seccomp.filter = NULL;
#endif

	setup_thread_stack(tsk, orig);
	clear_user_return_notifier(tsk);
	clear_tsk_need_resched(tsk);
	set_task_stack_end_magic(tsk);

#ifdef CONFIG_CC_STACKPROTECTOR
	tsk->stack_canary = get_random_int();
#endif

	/*
	 * One for us, one for whoever does the "release_task()" (usually
	 * parent)
	 */
	atomic_set(&tsk->usage, 2);
#ifdef CONFIG_BLK_DEV_IO_TRACE
	tsk->btrace_seq = 0;
#endif
	tsk->splice_pipe = NULL;
	tsk->task_frag.page = NULL;

	account_kernel_stack(ti, 1);

	return tsk;

free_ti:
	free_thread_info(ti);
free_tsk:
	free_task_struct(tsk);
	return NULL;
}

#ifdef CONFIG_MMU
static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
{
	struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
	struct rb_node **rb_link, *rb_parent;
	int retval;
	unsigned long charge;

	uprobe_start_dup_mmap();
	down_write(&oldmm->mmap_sem);
	flush_cache_dup_mm(oldmm);
	uprobe_dup_mmap(oldmm, mm);
	/*
	 * Not linked in yet - no deadlock potential:
	 */
	down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);

	mm->total_vm = oldmm->total_vm;
	mm->shared_vm = oldmm->shared_vm;
	mm->exec_vm = oldmm->exec_vm;
	mm->stack_vm = oldmm->stack_vm;

	rb_link = &mm->mm_rb.rb_node;
	rb_parent = NULL;
	pprev = &mm->mmap;
	retval = ksm_fork(mm, oldmm);
	if (retval)
		goto out;
	retval = khugepaged_fork(mm, oldmm);
	if (retval)
		goto out;

	prev = NULL;
	for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
		struct file *file;

		if (mpnt->vm_flags & VM_DONTCOPY) {
			vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
							-vma_pages(mpnt));
			continue;
		}
		charge = 0;
		if (mpnt->vm_flags & VM_ACCOUNT) {
			unsigned long len = vma_pages(mpnt);

			if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
				goto fail_nomem;
			charge = len;
		}
		tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
		if (!tmp)
			goto fail_nomem;
		*tmp = *mpnt;
		INIT_LIST_HEAD(&tmp->anon_vma_chain);
		retval = vma_dup_policy(mpnt, tmp);
		if (retval)
			goto fail_nomem_policy;
		tmp->vm_mm = mm;
		if (anon_vma_fork(tmp, mpnt))
			goto fail_nomem_anon_vma_fork;
		tmp->vm_flags &= ~VM_LOCKED;
		tmp->vm_next = tmp->vm_prev = NULL;
		file = tmp->vm_file;
		if (file) {
			struct inode *inode = file_inode(file);
			struct address_space *mapping = file->f_mapping;

			get_file(file);
			if (tmp->vm_flags & VM_DENYWRITE)
				atomic_dec(&inode->i_writecount);
			mutex_lock(&mapping->i_mmap_mutex);
			if (tmp->vm_flags & VM_SHARED)
				atomic_inc(&mapping->i_mmap_writable);
			flush_dcache_mmap_lock(mapping);
			/* insert tmp into the share list, just after mpnt */
			if (unlikely(tmp->vm_flags & VM_NONLINEAR))
				vma_nonlinear_insert(tmp,
						&mapping->i_mmap_nonlinear);
			else
				vma_interval_tree_insert_after(tmp, mpnt,
							&mapping->i_mmap);
			flush_dcache_mmap_unlock(mapping);
			mutex_unlock(&mapping->i_mmap_mutex);
		}

		/*
		 * Clear hugetlb-related page reserves for children. This only
		 * affects MAP_PRIVATE mappings. Faults generated by the child
		 * are not guaranteed to succeed, even if read-only
		 */
		if (is_vm_hugetlb_page(tmp))
			reset_vma_resv_huge_pages(tmp);

		/*
		 * Link in the new vma and copy the page table entries.
		 */
		*pprev = tmp;
		pprev = &tmp->vm_next;
		tmp->vm_prev = prev;
		prev = tmp;

		__vma_link_rb(mm, tmp, rb_link, rb_parent);
		rb_link = &tmp->vm_rb.rb_right;
		rb_parent = &tmp->vm_rb;

		mm->map_count++;
		retval = copy_page_range(mm, oldmm, mpnt);

		if (tmp->vm_ops && tmp->vm_ops->open)
			tmp->vm_ops->open(tmp);

		if (retval)
			goto out;
	}
	/* a new mm has just been created */
	arch_dup_mmap(oldmm, mm);
	retval = 0;
out:
	up_write(&mm->mmap_sem);
	flush_tlb_mm(oldmm);
	up_write(&oldmm->mmap_sem);
	uprobe_end_dup_mmap();
	return retval;
fail_nomem_anon_vma_fork:
	mpol_put(vma_policy(tmp));
fail_nomem_policy:
	kmem_cache_free(vm_area_cachep, tmp);
fail_nomem:
	retval = -ENOMEM;
	vm_unacct_memory(charge);
	goto out;
}

static inline int mm_alloc_pgd(struct mm_struct *mm)
{
	mm->pgd = pgd_alloc(mm);
	if (unlikely(!mm->pgd))
		return -ENOMEM;
	return 0;
}

static inline void mm_free_pgd(struct mm_struct *mm)
{
	pgd_free(mm, mm->pgd);
}
#else
#define dup_mmap(mm, oldmm)	(0)
#define mm_alloc_pgd(mm)	(0)
#define mm_free_pgd(mm)
#endif /* CONFIG_MMU */

__cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);

#define allocate_mm()	(kmem_cache_alloc(mm_cachep, GFP_KERNEL))
#define free_mm(mm)	(kmem_cache_free(mm_cachep, (mm)))

static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT;

static int __init coredump_filter_setup(char *s)
{
	default_dump_filter =
		(simple_strtoul(s, NULL, 0) << MMF_DUMP_FILTER_SHIFT) &
		MMF_DUMP_FILTER_MASK;
	return 1;
}

__setup("coredump_filter=", coredump_filter_setup);
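
/*
 * Example (illustrative only): booting with "coredump_filter=0x23" seeds
 * the per-mm dump filter with bits 0, 1 and 5, which is then inherited
 * by every mm that is not created from an existing one (see mm_init()
 * below).  The same bits can be changed at run time through
 * /proc/<pid>/coredump_filter.
 */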

#include <linux/init_task.h>

static void mm_init_aio(struct mm_struct *mm)
{
#ifdef CONFIG_AIO
	spin_lock_init(&mm->ioctx_lock);
	mm->ioctx_table = NULL;
#endif
}

static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
{
#ifdef CONFIG_MEMCG
	mm->owner = p;
#endif
}

static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
{
	mm->mmap = NULL;
	mm->mm_rb = RB_ROOT;
	mm->vmacache_seqnum = 0;
	atomic_set(&mm->mm_users, 1);
	atomic_set(&mm->mm_count, 1);
	init_rwsem(&mm->mmap_sem);
	INIT_LIST_HEAD(&mm->mmlist);
	mm->core_state = NULL;
	atomic_long_set(&mm->nr_ptes, 0);
	mm->map_count = 0;
	mm->locked_vm = 0;
	mm->pinned_vm = 0;
	memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
	spin_lock_init(&mm->page_table_lock);
	mm_init_cpumask(mm);
	mm_init_aio(mm);
	mm_init_owner(mm, p);
	mmu_notifier_mm_init(mm);
	clear_tlb_flush_pending(mm);
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
	mm->pmd_huge_pte = NULL;
#endif

	if (current->mm) {
		mm->flags = current->mm->flags & MMF_INIT_MASK;
		mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK;
	} else {
		mm->flags = default_dump_filter;
		mm->def_flags = 0;
	}

	if (mm_alloc_pgd(mm))
		goto fail_nopgd;

	if (init_new_context(p, mm))
		goto fail_nocontext;

	return mm;

fail_nocontext:
	mm_free_pgd(mm);
fail_nopgd:
	free_mm(mm);
	return NULL;
}

static void check_mm(struct mm_struct *mm)
{
	int i;

	for (i = 0; i < NR_MM_COUNTERS; i++) {
		long x = atomic_long_read(&mm->rss_stat.count[i]);

		if (unlikely(x))
			printk(KERN_ALERT "BUG: Bad rss-counter state "
					  "mm:%p idx:%d val:%ld\n", mm, i, x);
	}
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
	VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
#endif
}

/*
 * Allocate and initialize an mm_struct.
 */
struct mm_struct *mm_alloc(void)
{
	struct mm_struct *mm;

	mm = allocate_mm();
	if (!mm)
		return NULL;

	memset(mm, 0, sizeof(*mm));
	return mm_init(mm, current);
}

/*
 * Called when the last reference to the mm
 * is dropped: either by a lazy thread or by
 * mmput. Free the page directory and the mm.
 */
void __mmdrop(struct mm_struct *mm)
{
	BUG_ON(mm == &init_mm);
	mm_free_pgd(mm);
	destroy_context(mm);
	mmu_notifier_mm_destroy(mm);
	check_mm(mm);
	free_mm(mm);
}
EXPORT_SYMBOL_GPL(__mmdrop);

/*
 * Decrement the use count and release all resources for an mm.
 */
void mmput(struct mm_struct *mm)
{
	might_sleep();

	if (atomic_dec_and_test(&mm->mm_users)) {
		uprobe_clear_state(mm);
		exit_aio(mm);
		ksm_exit(mm);
		khugepaged_exit(mm); /* must run before exit_mmap */
		exit_mmap(mm);
		set_mm_exe_file(mm, NULL);
		if (!list_empty(&mm->mmlist)) {
			spin_lock(&mmlist_lock);
			list_del(&mm->mmlist);
			spin_unlock(&mmlist_lock);
		}
		if (mm->binfmt)
			module_put(mm->binfmt->module);
		mmdrop(mm);
	}
}
EXPORT_SYMBOL_GPL(mmput);

void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
{
	if (new_exe_file)
		get_file(new_exe_file);
	if (mm->exe_file)
		fput(mm->exe_file);
	mm->exe_file = new_exe_file;
}

struct file *get_mm_exe_file(struct mm_struct *mm)
{
	struct file *exe_file;

	/* We need mmap_sem to protect against races with removal of exe_file */
	down_read(&mm->mmap_sem);
	exe_file = mm->exe_file;
	if (exe_file)
		get_file(exe_file);
	up_read(&mm->mmap_sem);
	return exe_file;
}

static void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm)
{
	/* It's safe to write the exe_file pointer without exe_file_lock because
	 * this is called during fork when the task is not yet in /proc */
	newmm->exe_file = get_mm_exe_file(oldmm);
}

/**
 * get_task_mm - acquire a reference to the task's mm
 *
 * Returns %NULL if the task has no mm or if its PF_KTHREAD flag is set
 * (meaning the kernel thread has only transiently adopted a user mm,
 * e.g. with use_mm() to do AIO).  Otherwise returns the mm after
 * bumping up its use count; the caller must release it via mmput()
 * after use.  Typically used by /proc and ptrace.
 */
struct mm_struct *get_task_mm(struct task_struct *task)
{
	struct mm_struct *mm;

	task_lock(task);
	mm = task->mm;
	if (mm) {
		if (task->flags & PF_KTHREAD)
			mm = NULL;
		else
			atomic_inc(&mm->mm_users);
	}
	task_unlock(task);
	return mm;
}
EXPORT_SYMBOL_GPL(get_task_mm);
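
/*
 * Typical usage sketch (illustrative only, error handling trimmed):
 *
 *	struct mm_struct *mm = get_task_mm(task);
 *	if (mm) {
 *		down_read(&mm->mmap_sem);
 *		... walk mm->mmap ...
 *		up_read(&mm->mmap_sem);
 *		mmput(mm);
 *	}
 *
 * The reference taken here pins the address space, not the task, so
 * callers usually hold a task reference as well.
 */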

struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
{
	struct mm_struct *mm;
	int err;

	err =  mutex_lock_killable(&task->signal->cred_guard_mutex);
	if (err)
		return ERR_PTR(err);

	mm = get_task_mm(task);
	if (mm && mm != current->mm &&
			!ptrace_may_access(task, mode)) {
		mmput(mm);
		mm = ERR_PTR(-EACCES);
	}
	mutex_unlock(&task->signal->cred_guard_mutex);

	return mm;
}

static void complete_vfork_done(struct task_struct *tsk)
{
	struct completion *vfork;

	task_lock(tsk);
	vfork = tsk->vfork_done;
	if (likely(vfork)) {
		tsk->vfork_done = NULL;
		complete(vfork);
	}
	task_unlock(tsk);
}

static int wait_for_vfork_done(struct task_struct *child,
				struct completion *vfork)
{
	int killed;

	freezer_do_not_count();
	killed = wait_for_completion_killable(vfork);
	freezer_count();

	if (killed) {
		task_lock(child);
		child->vfork_done = NULL;
		task_unlock(child);
	}

	put_task_struct(child);
	return killed;
}

/* Please note the differences between mmput and mm_release.
 * mmput is called whenever we stop holding onto a mm_struct,
 * error success whatever.
 *
 * mm_release is called after a mm_struct has been removed
 * from the current process.
 *
 * This difference is important for error handling, when we
 * only half set up a mm_struct for a new process and need to restore
 * the old one.  Because we mmput the new mm_struct before
 * restoring the old one. . .
 * Eric Biederman 10 January 1998
 */
void mm_release(struct task_struct *tsk, struct mm_struct *mm)
{
	/* Get rid of any futexes when releasing the mm */
#ifdef CONFIG_FUTEX
	if (unlikely(tsk->robust_list)) {
		exit_robust_list(tsk);
		tsk->robust_list = NULL;
	}
#ifdef CONFIG_COMPAT
	if (unlikely(tsk->compat_robust_list)) {
		compat_exit_robust_list(tsk);
		tsk->compat_robust_list = NULL;
	}
#endif
	if (unlikely(!list_empty(&tsk->pi_state_list)))
		exit_pi_state_list(tsk);
#endif

	uprobe_free_utask(tsk);

	/* Get rid of any cached register state */
	deactivate_mm(tsk, mm);

	/*
	 * If we're exiting normally, clear the user-space tid field if
	 * requested.  We leave this alone when dying by signal, to keep
	 * the value intact in a core dump and to avoid unnecessary work:
	 * a killed vfork parent, for example, shouldn't touch this mm.
	 * Userland only wants this done for a sys_exit.
	 */
	if (tsk->clear_child_tid) {
		if (!(tsk->flags & PF_SIGNALED) &&
		    atomic_read(&mm->mm_users) > 1) {
			/*
			 * We don't check the error code - if userspace has
			 * not set up a proper pointer then tough luck.
			 */
			put_user(0, tsk->clear_child_tid);
			sys_futex(tsk->clear_child_tid, FUTEX_WAKE,
					1, NULL, NULL, 0);
		}
		tsk->clear_child_tid = NULL;
	}

	/*
	 * All done, finally we can wake up parent and return this mm to him.
	 * Also kthread_stop() uses this completion for synchronization.
	 */
	if (tsk->vfork_done)
		complete_vfork_done(tsk);
}

/*
 * Allocate a new mm structure and copy contents from the
 * mm structure of the passed in task structure.
 */
static struct mm_struct *dup_mm(struct task_struct *tsk)
{
	struct mm_struct *mm, *oldmm = current->mm;
	int err;

	mm = allocate_mm();
	if (!mm)
		goto fail_nomem;

	memcpy(mm, oldmm, sizeof(*mm));

	if (!mm_init(mm, tsk))
		goto fail_nomem;

	dup_mm_exe_file(oldmm, mm);

	err = dup_mmap(mm, oldmm);
	if (err)
		goto free_pt;

	mm->hiwater_rss = get_mm_rss(mm);
	mm->hiwater_vm = mm->total_vm;

	if (mm->binfmt && !try_module_get(mm->binfmt->module))
		goto free_pt;

	return mm;

free_pt:
	/* don't put binfmt in mmput, we haven't got module yet */
	mm->binfmt = NULL;
	mmput(mm);

fail_nomem:
	return NULL;
}

static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
{
	struct mm_struct *mm, *oldmm;
	int retval;

	tsk->min_flt = tsk->maj_flt = 0;
	tsk->nvcsw = tsk->nivcsw = 0;
#ifdef CONFIG_DETECT_HUNG_TASK
	tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw;
#endif

	tsk->mm = NULL;
	tsk->active_mm = NULL;

	/*
	 * Are we cloning a kernel thread?
	 *
	 * We need to steal an active VM for that.
	 */
	oldmm = current->mm;
	if (!oldmm)
		return 0;

	/* initialize the new vmacache entries */
	vmacache_flush(tsk);

	if (clone_flags & CLONE_VM) {
		atomic_inc(&oldmm->mm_users);
		mm = oldmm;
		goto good_mm;
	}

	retval = -ENOMEM;
	mm = dup_mm(tsk);
	if (!mm)
		goto fail_nomem;

good_mm:
	tsk->mm = mm;
	tsk->active_mm = mm;
	return 0;

fail_nomem:
	return retval;
}

static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
{
	struct fs_struct *fs = current->fs;
	if (clone_flags & CLONE_FS) {
		/* tsk->fs is already what we want */
		spin_lock(&fs->lock);
		if (fs->in_exec) {
			spin_unlock(&fs->lock);
			return -EAGAIN;
		}
		fs->users++;
		spin_unlock(&fs->lock);
		return 0;
	}
	tsk->fs = copy_fs_struct(fs);
	if (!tsk->fs)
		return -ENOMEM;
	return 0;
}

static int copy_files(unsigned long clone_flags, struct task_struct *tsk)
{
	struct files_struct *oldf, *newf;
	int error = 0;

	/*
	 * A background process may not have any files ...
	 */
	oldf = current->files;
	if (!oldf)
		goto out;

	if (clone_flags & CLONE_FILES) {
		atomic_inc(&oldf->count);
		goto out;
	}

	newf = dup_fd(oldf, &error);
	if (!newf)
		goto out;

	tsk->files = newf;
	error = 0;
out:
	return error;
}

static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
{
#ifdef CONFIG_BLOCK
	struct io_context *ioc = current->io_context;
	struct io_context *new_ioc;

	if (!ioc)
		return 0;
	/*
	 * Share io context with parent, if CLONE_IO is set
	 */
	if (clone_flags & CLONE_IO) {
		ioc_task_link(ioc);
		tsk->io_context = ioc;
	} else if (ioprio_valid(ioc->ioprio)) {
		new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE);
		if (unlikely(!new_ioc))
			return -ENOMEM;

		new_ioc->ioprio = ioc->ioprio;
		put_io_context(new_ioc);
	}
#endif
	return 0;
}

static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
{
	struct sighand_struct *sig;

	if (clone_flags & CLONE_SIGHAND) {
		atomic_inc(&current->sighand->count);
		return 0;
	}
	sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
	rcu_assign_pointer(tsk->sighand, sig);
	if (!sig)
		return -ENOMEM;
	atomic_set(&sig->count, 1);
	memcpy(sig->action, current->sighand->action, sizeof(sig->action));
	return 0;
}

void __cleanup_sighand(struct sighand_struct *sighand)
{
	if (atomic_dec_and_test(&sighand->count)) {
		signalfd_cleanup(sighand);
		kmem_cache_free(sighand_cachep, sighand);
	}
}


/*
 * Initialize POSIX timer handling for a thread group.
 */
static void posix_cpu_timers_init_group(struct signal_struct *sig)
{
	unsigned long cpu_limit;

	/* Thread group counters. */
	thread_group_cputime_init(sig);

	cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
	if (cpu_limit != RLIM_INFINITY) {
		sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
		sig->cputimer.running = 1;
	}

	/* The timer lists. */
	INIT_LIST_HEAD(&sig->cpu_timers[0]);
	INIT_LIST_HEAD(&sig->cpu_timers[1]);
	INIT_LIST_HEAD(&sig->cpu_timers[2]);
}

static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
{
	struct signal_struct *sig;

	if (clone_flags & CLONE_THREAD)
		return 0;

	sig = kmem_cache_zalloc(signal_cachep, GFP_KERNEL);
	tsk->signal = sig;
	if (!sig)
		return -ENOMEM;

	sig->nr_threads = 1;
	atomic_set(&sig->live, 1);
	atomic_set(&sig->sigcnt, 1);

	/* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */
	sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node);
	tsk->thread_node = (struct list_head)LIST_HEAD_INIT(sig->thread_head);

	init_waitqueue_head(&sig->wait_chldexit);
	sig->curr_target = tsk;
	init_sigpending(&sig->shared_pending);
	INIT_LIST_HEAD(&sig->posix_timers);
	seqlock_init(&sig->stats_lock);

	hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	sig->real_timer.function = it_real_fn;

	task_lock(current->group_leader);
	memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
	task_unlock(current->group_leader);

	posix_cpu_timers_init_group(sig);

	tty_audit_fork(sig);
	sched_autogroup_fork(sig);

#ifdef CONFIG_CGROUPS
	init_rwsem(&sig->group_rwsem);
#endif

	sig->oom_score_adj = current->signal->oom_score_adj;
	sig->oom_score_adj_min = current->signal->oom_score_adj_min;

	sig->has_child_subreaper = current->signal->has_child_subreaper ||
				   current->signal->is_child_subreaper;

	mutex_init(&sig->cred_guard_mutex);

	return 0;
}

static void copy_seccomp(struct task_struct *p)
{
#ifdef CONFIG_SECCOMP
	/*
	 * Must be called with sighand->lock held, which is common to
	 * all threads in the group. Holding cred_guard_mutex is not
	 * needed because this new task is not yet running and cannot
	 * be racing exec.
	 */
	assert_spin_locked(&current->sighand->siglock);

	/* Ref-count the new filter user, and assign it. */
	get_seccomp_filter(current);
	p->seccomp = current->seccomp;

	/*
	 * Explicitly enable no_new_privs here in case it got set
	 * between the task_struct being duplicated and holding the
	 * sighand lock. The seccomp state and nnp must be in sync.
	 */
	if (task_no_new_privs(current))
		task_set_no_new_privs(p);

	/*
	 * If the parent gained a seccomp mode after the thread flags
	 * were copied but before we took the sighand lock, we have to
	 * manually enable the seccomp thread flag here.
	 */
	if (p->seccomp.mode != SECCOMP_MODE_DISABLED)
		set_tsk_thread_flag(p, TIF_SECCOMP);
#endif
}

SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
{
	current->clear_child_tid = tidptr;

	return task_pid_vnr(current);
}
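
/*
 * Sketch of how a threading library typically uses clear_child_tid
 * (hypothetical user-space code, not part of the kernel):
 *
 *	pid_t tid;
 *	clone(thread_fn, stack_top,
 *	      CLONE_VM | CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID | ...,
 *	      arg, NULL, NULL, &tid);
 *	// "join": wait until mm_release() above zeroes 'tid' and wakes us
 *	while (tid != 0)
 *		syscall(SYS_futex, &tid, FUTEX_WAIT, tid, NULL, NULL, 0);
 *
 * sys_set_tid_address() simply lets a task change that pointer later on.
 */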

static void rt_mutex_init_task(struct task_struct *p)
{
	raw_spin_lock_init(&p->pi_lock);
#ifdef CONFIG_RT_MUTEXES
	p->pi_waiters = RB_ROOT;
	p->pi_waiters_leftmost = NULL;
	p->pi_blocked_on = NULL;
#endif
}

/*
 * Initialize POSIX timer handling for a single task.
 */
static void posix_cpu_timers_init(struct task_struct *tsk)
{
	tsk->cputime_expires.prof_exp = 0;
	tsk->cputime_expires.virt_exp = 0;
	tsk->cputime_expires.sched_exp = 0;
	INIT_LIST_HEAD(&tsk->cpu_timers[0]);
	INIT_LIST_HEAD(&tsk->cpu_timers[1]);
	INIT_LIST_HEAD(&tsk->cpu_timers[2]);
}

static inline void
init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid)
{
	 task->pids[type].pid = pid;
}

/*
 * This creates a new process as a copy of the old one,
 * but does not actually start it yet.
 *
 * It copies the registers, and all the appropriate
 * parts of the process environment (as per the clone
 * flags). The actual kick-off is left to the caller.
 */
static struct task_struct *copy_process(unsigned long clone_flags,
					unsigned long stack_start,
					unsigned long stack_size,
					int __user *child_tidptr,
					struct pid *pid,
					int trace)
{
	int retval;
	struct task_struct *p;

	if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
		return ERR_PTR(-EINVAL);

	if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
		return ERR_PTR(-EINVAL);

	/*
	 * Thread groups must share signals as well, and detached threads
	 * can only be started up within the thread group.
	 */
	if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
		return ERR_PTR(-EINVAL);

	/*
	 * Shared signal handlers imply shared VM. By way of the above,
	 * thread groups also imply shared VM. Blocking this case allows
	 * for various simplifications in other code.
	 */
	if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
		return ERR_PTR(-EINVAL);

	/*
	 * Siblings of global init remain as zombies on exit since they are
	 * not reaped by their parent (swapper). To solve this and to avoid
	 * multi-rooted process trees, prevent global and container-inits
	 * from creating siblings.
	 */
	if ((clone_flags & CLONE_PARENT) &&
				current->signal->flags & SIGNAL_UNKILLABLE)
		return ERR_PTR(-EINVAL);

	/*
	 * If the new process will be in a different pid or user namespace
	 * do not allow it to share a thread group or signal handlers or
	 * parent with the forking task.
	 */
	if (clone_flags & CLONE_SIGHAND) {
		if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
		    (task_active_pid_ns(current) !=
				current->nsproxy->pid_ns_for_children))
			return ERR_PTR(-EINVAL);
	}

	retval = security_task_create(clone_flags);
	if (retval)
		goto fork_out;

	retval = -ENOMEM;
	p = dup_task_struct(current);
	if (!p)
		goto fork_out;

	ftrace_graph_init_task(p);

	rt_mutex_init_task(p);

#ifdef CONFIG_PROVE_LOCKING
	DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
	DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif
	retval = -EAGAIN;
	if (atomic_read(&p->real_cred->user->processes) >=
			task_rlimit(p, RLIMIT_NPROC)) {
		if (p->real_cred->user != INIT_USER &&
		    !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
			goto bad_fork_free;
	}
	current->flags &= ~PF_NPROC_EXCEEDED;

	retval = copy_creds(p, clone_flags);
	if (retval < 0)
		goto bad_fork_free;

	/*
	 * If multiple threads are within copy_process(), then this check
	 * triggers too late. This doesn't hurt, the check is only there
	 * to stop root fork bombs.
	 */
	retval = -EAGAIN;
	if (nr_threads >= max_threads)
		goto bad_fork_cleanup_count;

	if (!try_module_get(task_thread_info(p)->exec_domain->module))
		goto bad_fork_cleanup_count;

	delayacct_tsk_init(p);	/* Must remain after dup_task_struct() */
	p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
	p->flags |= PF_FORKNOEXEC;
	INIT_LIST_HEAD(&p->children);
	INIT_LIST_HEAD(&p->sibling);
	rcu_copy_process(p);
	p->vfork_done = NULL;
	spin_lock_init(&p->alloc_lock);

	init_sigpending(&p->pending);

	p->utime = p->stime = p->gtime = 0;
	p->utimescaled = p->stimescaled = 0;
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
	p->prev_cputime.utime = p->prev_cputime.stime = 0;
#endif
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
	seqlock_init(&p->vtime_seqlock);
	p->vtime_snap = 0;
	p->vtime_snap_whence = VTIME_SLEEPING;
#endif

#if defined(SPLIT_RSS_COUNTING)
	memset(&p->rss_stat, 0, sizeof(p->rss_stat));
#endif

	p->default_timer_slack_ns = current->timer_slack_ns;

	task_io_accounting_init(&p->ioac);
	acct_clear_integrals(p);

	posix_cpu_timers_init(p);

	p->start_time = ktime_get_ns();
	p->real_start_time = ktime_get_boot_ns();
	p->io_context = NULL;
	p->audit_context = NULL;
	if (clone_flags & CLONE_THREAD)
		threadgroup_change_begin(current);
	cgroup_fork(p);
#ifdef CONFIG_NUMA
	p->mempolicy = mpol_dup(p->mempolicy);
	if (IS_ERR(p->mempolicy)) {
		retval = PTR_ERR(p->mempolicy);
		p->mempolicy = NULL;
		goto bad_fork_cleanup_threadgroup_lock;
	}
#endif
#ifdef CONFIG_CPUSETS
	p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
	p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
	seqcount_init(&p->mems_allowed_seq);
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
	p->irq_events = 0;
	p->hardirqs_enabled = 0;
	p->hardirq_enable_ip = 0;
	p->hardirq_enable_event = 0;
	p->hardirq_disable_ip = _THIS_IP_;
	p->hardirq_disable_event = 0;
	p->softirqs_enabled = 1;
	p->softirq_enable_ip = _THIS_IP_;
	p->softirq_enable_event = 0;
	p->softirq_disable_ip = 0;
	p->softirq_disable_event = 0;
	p->hardirq_context = 0;
	p->softirq_context = 0;
#endif
#ifdef CONFIG_LOCKDEP
	p->lockdep_depth = 0; /* no locks held yet */
	p->curr_chain_key = 0;
	p->lockdep_recursion = 0;
#endif

#ifdef CONFIG_DEBUG_MUTEXES
	p->blocked_on = NULL; /* not blocked yet */
#endif
#ifdef CONFIG_BCACHE
	p->sequential_io	= 0;
	p->sequential_io_avg	= 0;
#endif

	/* Perform scheduler related setup. Assign this task to a CPU. */
	retval = sched_fork(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_policy;

	retval = perf_event_init_task(p);
	if (retval)
		goto bad_fork_cleanup_policy;
	retval = audit_alloc(p);
	if (retval)
		goto bad_fork_cleanup_perf;
	/* copy all the process information */
	shm_init_task(p);
	retval = copy_semundo(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_audit;
	retval = copy_files(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_semundo;
	retval = copy_fs(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_files;
	retval = copy_sighand(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_fs;
	retval = copy_signal(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_sighand;
	retval = copy_mm(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_signal;
	retval = copy_namespaces(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_mm;
	retval = copy_io(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_namespaces;
	retval = copy_thread(clone_flags, stack_start, stack_size, p);
	if (retval)
		goto bad_fork_cleanup_io;

	if (pid != &init_struct_pid) {
		retval = -ENOMEM;
		pid = alloc_pid(p->nsproxy->pid_ns_for_children);
		if (!pid)
			goto bad_fork_cleanup_io;
	}

	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
	/*
	 * Clear TID on mm_release()?
	 */
	p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;
#ifdef CONFIG_BLOCK
	p->plug = NULL;
#endif
#ifdef CONFIG_FUTEX
	p->robust_list = NULL;
#ifdef CONFIG_COMPAT
	p->compat_robust_list = NULL;
#endif
	INIT_LIST_HEAD(&p->pi_state_list);
	p->pi_state_cache = NULL;
#endif
	/*
	 * sigaltstack should be cleared when sharing the same VM
	 */
	if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
		p->sas_ss_sp = p->sas_ss_size = 0;

	/*
	 * Syscall tracing and stepping should be turned off in the
	 * child regardless of CLONE_PTRACE.
	 */
	user_disable_single_step(p);
	clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
#ifdef TIF_SYSCALL_EMU
	clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
#endif
	clear_all_latency_tracing(p);

	/* ok, now we should be set up.. */
	p->pid = pid_nr(pid);
	if (clone_flags & CLONE_THREAD) {
		p->exit_signal = -1;
		p->group_leader = current->group_leader;
		p->tgid = current->tgid;
	} else {
		if (clone_flags & CLONE_PARENT)
			p->exit_signal = current->group_leader->exit_signal;
		else
			p->exit_signal = (clone_flags & CSIGNAL);
		p->group_leader = p;
		p->tgid = p->pid;
	}

	p->nr_dirtied = 0;
	p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
	p->dirty_paused_when = 0;

	p->pdeath_signal = 0;
	INIT_LIST_HEAD(&p->thread_group);
	p->task_works = NULL;

	/*
	 * Make it visible to the rest of the system, but don't wake it up yet.
	 * Need tasklist lock for parent etc. handling!
	 */
	write_lock_irq(&tasklist_lock);

	/* CLONE_PARENT re-uses the old parent */
	if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
		p->real_parent = current->real_parent;
		p->parent_exec_id = current->parent_exec_id;
	} else {
		p->real_parent = current;
		p->parent_exec_id = current->self_exec_id;
	}

	spin_lock(&current->sighand->siglock);

	/*
	 * Copy seccomp details explicitly here, in case they were changed
	 * before holding sighand lock.
	 */
	copy_seccomp(p);

	/*
	 * Process group and session signals need to be delivered to just the
	 * parent before the fork or both the parent and the child after the
	 * fork. Restart if a signal comes in before we add the new process to
	 * its process group.
	 * A fatal signal pending means that current will exit, so the new
	 * thread can't slip out of an OOM kill (or normal SIGKILL).
	 */
	recalc_sigpending();
	if (signal_pending(current)) {
		spin_unlock(&current->sighand->siglock);
		write_unlock_irq(&tasklist_lock);
		retval = -ERESTARTNOINTR;
		goto bad_fork_free_pid;
	}

	if (likely(p->pid)) {
		ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);

		init_task_pid(p, PIDTYPE_PID, pid);
		if (thread_group_leader(p)) {
			init_task_pid(p, PIDTYPE_PGID, task_pgrp(current));
			init_task_pid(p, PIDTYPE_SID, task_session(current));

			if (is_child_reaper(pid)) {
				ns_of_pid(pid)->child_reaper = p;
				p->signal->flags |= SIGNAL_UNKILLABLE;
			}

			p->signal->leader_pid = pid;
			p->signal->tty = tty_kref_get(current->signal->tty);
			list_add_tail(&p->sibling, &p->real_parent->children);
			list_add_tail_rcu(&p->tasks, &init_task.tasks);
			attach_pid(p, PIDTYPE_PGID);
			attach_pid(p, PIDTYPE_SID);
			__this_cpu_inc(process_counts);
		} else {
			current->signal->nr_threads++;
			atomic_inc(&current->signal->live);
			atomic_inc(&current->signal->sigcnt);
			list_add_tail_rcu(&p->thread_group,
					  &p->group_leader->thread_group);
			list_add_tail_rcu(&p->thread_node,
					  &p->signal->thread_head);
		}
		attach_pid(p, PIDTYPE_PID);
		nr_threads++;
	}

	total_forks++;
	spin_unlock(&current->sighand->siglock);
	syscall_tracepoint_update(p);
	write_unlock_irq(&tasklist_lock);

	proc_fork_connector(p);
	cgroup_post_fork(p);
	if (clone_flags & CLONE_THREAD)
		threadgroup_change_end(current);
	perf_event_fork(p);

	trace_task_newtask(p, clone_flags);
	uprobe_copy_process(p, clone_flags);

	return p;

bad_fork_free_pid:
	if (pid != &init_struct_pid)
		free_pid(pid);
bad_fork_cleanup_io:
	if (p->io_context)
		exit_io_context(p);
bad_fork_cleanup_namespaces:
	exit_task_namespaces(p);
bad_fork_cleanup_mm:
	if (p->mm)
		mmput(p->mm);
bad_fork_cleanup_signal:
	if (!(clone_flags & CLONE_THREAD))
		free_signal_struct(p->signal);
bad_fork_cleanup_sighand:
	__cleanup_sighand(p->sighand);
bad_fork_cleanup_fs:
	exit_fs(p); /* blocking */
bad_fork_cleanup_files:
	exit_files(p); /* blocking */
bad_fork_cleanup_semundo:
	exit_sem(p);
bad_fork_cleanup_audit:
	audit_free(p);
bad_fork_cleanup_perf:
	perf_event_free_task(p);
bad_fork_cleanup_policy:
#ifdef CONFIG_NUMA
	mpol_put(p->mempolicy);
bad_fork_cleanup_threadgroup_lock:
#endif
	if (clone_flags & CLONE_THREAD)
		threadgroup_change_end(current);
	delayacct_tsk_free(p);
	module_put(task_thread_info(p)->exec_domain->module);
bad_fork_cleanup_count:
	atomic_dec(&p->cred->user->processes);
	exit_creds(p);
bad_fork_free:
	free_task(p);
fork_out:
	return ERR_PTR(retval);
}

static inline void init_idle_pids(struct pid_link *links)
{
	enum pid_type type;

	for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) {
		INIT_HLIST_NODE(&links[type].node); /* not really needed */
		links[type].pid = &init_struct_pid;
	}
}

struct task_struct *fork_idle(int cpu)
{
	struct task_struct *task;
	task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0);
	if (!IS_ERR(task)) {
		init_idle_pids(task->pids);
		init_idle(task, cpu);
	}

	return task;
}

/*
 *  Ok, this is the main fork-routine.
 *
 * It copies the process, and if successful kick-starts
 * it and waits for it to finish using the VM if required.
 */
long do_fork(unsigned long clone_flags,
	      unsigned long stack_start,
	      unsigned long stack_size,
	      int __user *parent_tidptr,
	      int __user *child_tidptr)
{
	struct task_struct *p;
	int trace = 0;
	long nr;

	/*
	 * Determine whether and which event to report to ptracer.  When
	 * called from kernel_thread or CLONE_UNTRACED is explicitly
	 * requested, no event is reported; otherwise, report if the event
	 * for the type of forking is enabled.
	 */
	if (!(clone_flags & CLONE_UNTRACED)) {
		if (clone_flags & CLONE_VFORK)
			trace = PTRACE_EVENT_VFORK;
		else if ((clone_flags & CSIGNAL) != SIGCHLD)
			trace = PTRACE_EVENT_CLONE;
		else
			trace = PTRACE_EVENT_FORK;

		if (likely(!ptrace_event_enabled(current, trace)))
			trace = 0;
	}

	p = copy_process(clone_flags, stack_start, stack_size,
			 child_tidptr, NULL, trace);
	/*
	 * Do this prior to waking up the new thread - the thread pointer
	 * might get invalid after that point, if the thread exits quickly.
	 */
	if (!IS_ERR(p)) {
		struct completion vfork;
		struct pid *pid;

		trace_sched_process_fork(current, p);

		pid = get_task_pid(p, PIDTYPE_PID);
		nr = pid_vnr(pid);

		if (clone_flags & CLONE_PARENT_SETTID)
			put_user(nr, parent_tidptr);

		if (clone_flags & CLONE_VFORK) {
			p->vfork_done = &vfork;
			init_completion(&vfork);
			get_task_struct(p);
		}

		wake_up_new_task(p);

		/* forking complete and child started to run, tell ptracer */
		if (unlikely(trace))
			ptrace_event_pid(trace, pid);

		if (clone_flags & CLONE_VFORK) {
			if (!wait_for_vfork_done(p, &vfork))
				ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
		}

		put_pid(pid);
	} else {
		nr = PTR_ERR(p);
	}
	return nr;
}

/*
 * Create a kernel thread.
 */
pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
{
	return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
		(unsigned long)arg, NULL, NULL);
}
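
/*
 * In-kernel callers normally do not use kernel_thread() directly any
 * more; kthread_run() is the preferred interface.  A minimal sketch
 * (illustrative only, "my_worker" is a made-up example):
 *
 *	static int my_worker(void *data)
 *	{
 *		while (!kthread_should_stop())
 *			schedule_timeout_interruptible(HZ);
 *		return 0;
 *	}
 *
 *	struct task_struct *t = kthread_run(my_worker, NULL, "my_worker");
 *	...
 *	kthread_stop(t);
 */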

#ifdef __ARCH_WANT_SYS_FORK
SYSCALL_DEFINE0(fork)
{
#ifdef CONFIG_MMU
	return do_fork(SIGCHLD, 0, 0, NULL, NULL);
#else
	/* can not support in nommu mode */
	return -EINVAL;
#endif
}
#endif

#ifdef __ARCH_WANT_SYS_VFORK
SYSCALL_DEFINE0(vfork)
{
	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
			0, NULL, NULL);
}
#endif

#ifdef __ARCH_WANT_SYS_CLONE
#ifdef CONFIG_CLONE_BACKWARDS
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
		 int __user *, parent_tidptr,
		 int, tls_val,
		 int __user *, child_tidptr)
#elif defined(CONFIG_CLONE_BACKWARDS2)
SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
		 int __user *, parent_tidptr,
		 int __user *, child_tidptr,
		 int, tls_val)
#elif defined(CONFIG_CLONE_BACKWARDS3)
SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp,
		int, stack_size,
		int __user *, parent_tidptr,
		int __user *, child_tidptr,
		int, tls_val)
#else
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
		 int __user *, parent_tidptr,
		 int __user *, child_tidptr,
		 int, tls_val)
#endif
{
	return do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr);
}
#endif
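
/*
 * User-space sketch of the clone entry points above (hypothetical
 * example using the glibc wrapper; STACK_SZ and child_fn are made up):
 *
 *	char *stack = malloc(STACK_SZ);
 *	pid_t pid = clone(child_fn, stack + STACK_SZ,
 *			  CLONE_VM | CLONE_FS | CLONE_FILES | SIGCHLD,
 *			  arg);
 *
 * The CONFIG_CLONE_BACKWARDS* variants above only shuffle the order of
 * the tid/tls arguments to match what older architecture ABIs expect.
 */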

#ifndef ARCH_MIN_MMSTRUCT_ALIGN
#define ARCH_MIN_MMSTRUCT_ALIGN 0
#endif

static void sighand_ctor(void *data)
{
	struct sighand_struct *sighand = data;

	spin_lock_init(&sighand->siglock);
	init_waitqueue_head(&sighand->signalfd_wqh);
}

void __init proc_caches_init(void)
{
	sighand_cachep = kmem_cache_create("sighand_cache",
			sizeof(struct sighand_struct), 0,
			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU|
			SLAB_NOTRACK, sighand_ctor);
	signal_cachep = kmem_cache_create("signal_cache",
			sizeof(struct signal_struct), 0,
			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
	files_cachep = kmem_cache_create("files_cache",
			sizeof(struct files_struct), 0,
			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
	fs_cachep = kmem_cache_create("fs_cache",
			sizeof(struct fs_struct), 0,
			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
	/*
	 * FIXME! The "sizeof(struct mm_struct)" currently includes the
	 * whole struct cpumask for the OFFSTACK case. We could change
	 * this to *only* allocate as much of it as required by the
	 * maximum number of CPUs we can ever have.  The cpumask_allocation
	 * is at the end of the structure, exactly for that reason.
	 */
	mm_cachep = kmem_cache_create("mm_struct",
			sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
	vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC);
	mmap_init();
	nsproxy_cache_init();
}

/*
 * Check constraints on flags passed to the unshare system call.
 */
static int check_unshare_flags(unsigned long unshare_flags)
{
	if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
				CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
				CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
				CLONE_NEWUSER|CLONE_NEWPID))
		return -EINVAL;
	/*
	 * Not implemented, but pretend it works if there is nothing to
	 * unshare. Note that unsharing CLONE_THREAD or CLONE_SIGHAND
	 * needs to unshare vm.
	 */
	if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) {
		/* FIXME: get_task_mm() increments ->mm_users */
		if (atomic_read(&current->mm->mm_users) > 1)
			return -EINVAL;
	}

	return 0;
}

/*
 * Unshare the filesystem structure if it is being shared
 */
static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
{
	struct fs_struct *fs = current->fs;

	if (!(unshare_flags & CLONE_FS) || !fs)
		return 0;

	/* don't need lock here; in the worst case we'll do useless copy */
	if (fs->users == 1)
		return 0;

	*new_fsp = copy_fs_struct(fs);
	if (!*new_fsp)
		return -ENOMEM;

	return 0;
}

/*
 * Unshare file descriptor table if it is being shared
 */
static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
{
	struct files_struct *fd = current->files;
	int error = 0;

	if ((unshare_flags & CLONE_FILES) &&
	    (fd && atomic_read(&fd->count) > 1)) {
		*new_fdp = dup_fd(fd, &error);
		if (!*new_fdp)
			return error;
	}

	return 0;
}

/*
 * unshare allows a process to 'unshare' part of the process
 * context which was originally shared using clone.  copy_*
 * functions used by do_fork() cannot be used here directly
 * because they modify an inactive task_struct that is being
 * constructed. Here we are modifying the current, active,
 * task_struct.
 */
SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
{
	struct fs_struct *fs, *new_fs = NULL;
	struct files_struct *fd, *new_fd = NULL;
	struct cred *new_cred = NULL;
	struct nsproxy *new_nsproxy = NULL;
	int do_sysvsem = 0;
	int err;

	/*
	 * If unsharing a user namespace, we must also unshare the thread
	 * group and the filesystem information.
	 */
	if (unshare_flags & CLONE_NEWUSER)
		unshare_flags |= CLONE_THREAD | CLONE_FS;
	/*
	 * If unsharing a thread from a thread group, must also unshare vm.
	 */
	if (unshare_flags & CLONE_THREAD)
		unshare_flags |= CLONE_VM;
	/*
	 * If unsharing vm, must also unshare signal handlers.
	 */
	if (unshare_flags & CLONE_VM)
		unshare_flags |= CLONE_SIGHAND;
	/*
	 * If unsharing namespace, must also unshare filesystem information.
	 */
	if (unshare_flags & CLONE_NEWNS)
		unshare_flags |= CLONE_FS;

	err = check_unshare_flags(unshare_flags);
	if (err)
		goto bad_unshare_out;
	/*
	 * CLONE_NEWIPC must also detach from the undolist: after switching
	 * to a new ipc namespace, the semaphore arrays from the old
	 * namespace are unreachable.
	 */
	if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM))
		do_sysvsem = 1;
	err = unshare_fs(unshare_flags, &new_fs);
	if (err)
		goto bad_unshare_out;
	err = unshare_fd(unshare_flags, &new_fd);
	if (err)
		goto bad_unshare_cleanup_fs;
	err = unshare_userns(unshare_flags, &new_cred);
	if (err)
		goto bad_unshare_cleanup_fd;
	err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
					 new_cred, new_fs);
	if (err)
		goto bad_unshare_cleanup_cred;

	if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) {
		if (do_sysvsem) {
			/*
			 * CLONE_SYSVSEM is equivalent to sys_exit().
			 */
			exit_sem(current);
		}
		if (unshare_flags & CLONE_NEWIPC) {
			/* Orphan segments in old ns (see sem above). */
			exit_shm(current);
			shm_init_task(current);
		}

		if (new_nsproxy)
			switch_task_namespaces(current, new_nsproxy);

		task_lock(current);

		if (new_fs) {
			fs = current->fs;
			spin_lock(&fs->lock);
			current->fs = new_fs;
			if (--fs->users)
				new_fs = NULL;
			else
				new_fs = fs;
			spin_unlock(&fs->lock);
		}

		if (new_fd) {
			fd = current->files;
			current->files = new_fd;
			new_fd = fd;
		}

		task_unlock(current);

		if (new_cred) {
			/* Install the new user namespace */
			commit_creds(new_cred);
			new_cred = NULL;
		}
	}

bad_unshare_cleanup_cred:
	if (new_cred)
		put_cred(new_cred);
bad_unshare_cleanup_fd:
	if (new_fd)
		put_files_struct(new_fd);

bad_unshare_cleanup_fs:
	if (new_fs)
		free_fs_struct(new_fs);

bad_unshare_out:
	return err;
}
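
/*
 * User-space sketch (illustrative only): a process that wants a private
 * mount namespace without forking can do
 *
 *	if (unshare(CLONE_NEWNS) == 0)
 *		mount("none", "/tmp", "tmpfs", 0, NULL);
 *
 * after which the new mount is visible only to this process and to any
 * children it creates later, matching the CLONE_NEWNS|CLONE_FS handling
 * above (subject to the usual mount propagation settings).
 */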

/*
 *	Helper to unshare the files of the current task.
 *	We don't want to expose copy_files internals to
 *	the exec layer of the kernel.
 */

int unshare_files(struct files_struct **displaced)
{
	struct task_struct *task = current;
	struct files_struct *copy = NULL;
	int error;

	error = unshare_fd(CLONE_FILES, &copy);
	if (error || !copy) {
		*displaced = NULL;
		return error;
	}
	*displaced = task->files;
	task_lock(task);
	task->files = copy;
	task_unlock(task);
	return 0;
}
