Re: 2.2.0 SECURITY (fwd)

From: Andrea Arcangeli (andrea@E-MIND.COM)
Date: Wed Jan 27 1999 - 12:38:37 PST

    On Wed, 27 Jan 1999, //Stany wrote:
    
    > Later on down the thread there are other people than Dan Burcaw who say
    > that the bug crashes their boxes too.
    
    There's another way to cause 2.2.0 to generate many Oopses and to leave
    processes stuck in D state; once it even locked up the machine while I
    was in X.
    
    Last night I was able to write a simple script that reproduces the bad
    behavior:
    
    #!/bin/sh
    # Copyright (C) 1999  Andrea Arcangeli
    # Crash-2.2.0.sh
    
    # Fork a flood of short-lived processes while `ps' walks /proc:
    while :; do
    	free &>/dev/null &
    	ps &>/dev/null &
    done
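
    To see why this bites (if I read the vanilla code correctly): each `ps'
    walks /proc and takes a reference on the mm of every task it reports,
    while the short-lived `free'/`ps' children exit and drop their own mm
    at the same time. In sketch form, the vanilla 2.2.0 reader side does:

        /* simplified from vanilla fs/proc/array.c */
        read_lock(&tasklist_lock);
        tsk = find_task_by_pid(pid);
        if (tsk && tsk->mm && tsk->mm != &init_mm)
                mmget(tsk->mm);         /* atomic_inc(&mm->count) */
        read_unlock(&tasklist_lock);

    Nothing stops the task from running __exit_mm() -> mmput() between the
    tsk->mm check and the use of the mm: if the count reaches zero there,
    the reader increments and then dereferences an mm_struct that is
    already being freed. Hence the D states and the Oopses.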
    
    And here is my race fix against linux-2.2.0 (which I wrote today and
    last night). With it applied I can no longer put processes into D state
    or trigger Oopses or lockups (once the other munmap bugfix by Ingo
    Molnar is applied too, of course ;).

    In short: mm->count goes back to being a plain int, serialized by a new
    mm_lock spinlock; mmget() now fails (returns 0) once the count has
    already reached zero; and mm_lock is taken around mmget()/mmput() and
    around the tsk->mm handling in __exit_mm(), so a /proc reader can no
    longer resurrect an mm that is in the middle of being released.
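
    With the patch the reader side becomes (again just a sketch; the real
    code is in the fs/proc/array.c hunks below):

        struct mm_struct *mm = NULL;

        spin_lock(&mm_lock);
        if (tsk && tsk->mm && tsk->mm != &init_mm)
                if (!mmget(mm = tsk->mm))       /* fails once count hit 0 */
                        mm = NULL;
        spin_unlock(&mm_lock);
        /* if mm != NULL here we own a reference and the mm cannot
           disappear under us */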
    
    Index: arch/i386/kernel/ldt.c
    ===================================================================
    RCS file: /var/cvs/linux/arch/i386/kernel/ldt.c,v
    retrieving revision 1.1.1.1
    diff -u -r1.1.1.1 ldt.c
    --- ldt.c	1999/01/18 01:28:57	1.1.1.1
    +++ linux/arch/i386/kernel/ldt.c	1999/01/27 19:36:10
    @@ -83,7 +83,7 @@
     			set_ldt_desc(i, ldt, LDT_ENTRIES);
     			current->tss.ldt = _LDT(i);
     			load_ldt(i);
    -			if (atomic_read(&mm->count) > 1)
    +			if (mm->count > 1)
     				printk(KERN_WARNING
     					"LDT allocated for cloned task!\n");
     		} else {
    Index: fs/binfmt_aout.c
    ===================================================================
    RCS file: /var/cvs/linux/fs/binfmt_aout.c,v
    retrieving revision 1.1.1.1
    diff -u -r1.1.1.1 binfmt_aout.c
    --- binfmt_aout.c	1999/01/18 01:26:49	1.1.1.1
    +++ linux/fs/binfmt_aout.c	1999/01/27 20:06:16
    @@ -101,7 +101,7 @@
     #       define START_STACK(u)   (u.start_stack)
     #endif
    
    -	if (!current->dumpable || atomic_read(&current->mm->count) != 1)
    +	if (!current->dumpable || current->mm->count != 1)
     		return 0;
     	current->dumpable = 0;
    
    Index: fs/binfmt_elf.c
    ===================================================================
    RCS file: /var/cvs/linux/fs/binfmt_elf.c,v
    retrieving revision 1.1.1.1
    diff -u -r1.1.1.1 binfmt_elf.c
    --- binfmt_elf.c	1999/01/18 01:26:49	1.1.1.1
    +++ linux/fs/binfmt_elf.c	1999/01/27 19:36:10
    @@ -1067,7 +1067,7 @@
    
     	if (!current->dumpable ||
     	    limit < ELF_EXEC_PAGESIZE ||
    -	    atomic_read(&current->mm->count) != 1)
    +	    current->mm->count != 1)
     		return 0;
     	current->dumpable = 0;
    
    Index: fs/exec.c
    ===================================================================
    RCS file: /var/cvs/linux/fs/exec.c,v
    retrieving revision 1.1.1.2
    diff -u -r1.1.1.2 exec.c
    --- exec.c	1999/01/23 16:28:07	1.1.1.2
    +++ linux/fs/exec.c	1999/01/27 19:36:10
    @@ -378,7 +378,7 @@
     	struct mm_struct * mm, * old_mm;
     	int retval, nr;
    
    -	if (atomic_read(&current->mm->count) == 1) {
    +	if (current->mm->count == 1) {
     		flush_cache_mm(current->mm);
     		mm_release();
     		release_segments(current->mm);
    @@ -409,7 +409,9 @@
     	activate_context(current);
     	up(&mm->mmap_sem);
     	mm_release();
    +	spin_lock(&mm_lock);
     	mmput(old_mm);
    +	spin_unlock(&mm_lock);
     	return 0;
    
     	/*
    Index: fs/proc/array.c
    ===================================================================
    RCS file: /var/cvs/linux/fs/proc/array.c,v
    retrieving revision 1.1.1.4
    diff -u -r1.1.1.4 array.c
    --- array.c	1999/01/26 18:30:44	1.1.1.4
    +++ linux/fs/proc/array.c	1999/01/27 19:59:22
    @@ -43,6 +43,8 @@
      *			<Alan.Coxat_private>
      *
      * Andi Kleen	     :  Race Fixes. 	
    + *
    + * Andrea Arcangeli  :  do_exit/mmput/mmget race fix on the Race Fixes ;).
      *
      */
    
    @@ -65,6 +67,7 @@
     #include <linux/slab.h>
     #include <linux/smp.h>
     #include <linux/signal.h>
    +#include <linux/init.h>
    
     #include <asm/uaccess.h>
     #include <asm/pgtable.h>
    @@ -77,6 +80,7 @@
     int get_malloc(char * buffer);
     #endif
    
    +static kmem_cache_t * task_struct_head_cache;
    
     static ssize_t read_core(struct file * file, char * buf,
     			 size_t count, loff_t *ppos)
    @@ -399,8 +403,11 @@
    
     	read_lock(&tasklist_lock);
     	tsk = find_task_by_pid(pid);
    +	spin_lock(&mm_lock);
     	if (tsk && tsk->mm && tsk->mm != &init_mm)
    -		mmget(mm = tsk->mm);
    +		if (!mmget(mm = tsk->mm))
    +			mm = NULL;
    +	spin_unlock(&mm_lock);
     	read_unlock(&tasklist_lock);
     	if (mm != NULL)
     		down(&mm->mmap_sem);
    @@ -410,7 +417,9 @@
     static void release_mm(struct mm_struct *mm)
     {
     	up(&mm->mmap_sem);
    +	spin_lock(&mm_lock);
     	mmput(mm);
    +	spin_unlock(&mm_lock);
     }
    
     static unsigned long get_phys_addr(struct mm_struct *mm, unsigned long ptr)
    @@ -504,17 +513,9 @@
     	return res;
     }
    
    -/*
    - * These bracket the sleeping functions..
    - */
    -extern void scheduling_functions_start_here(void);
    -extern void scheduling_functions_end_here(void);
    -#define first_sched	((unsigned long) scheduling_functions_start_here)
    -#define last_sched	((unsigned long) scheduling_functions_end_here)
    -
     static unsigned long get_wchan(struct task_struct *p)
     {
    -	if (!p || p == current || p->state == TASK_RUNNING)
    +	if (!p || real_tsk(p) == current || p->state == TASK_RUNNING)
     		return 0;
     #if defined(__i386__)
     	{
    @@ -522,7 +523,7 @@
     		unsigned long stack_page;
     		int count = 0;
    
    -		stack_page = (unsigned long)p;
    +		stack_page = (unsigned long)real_tsk(p);
     		esp = p->tss.esp;
     		if (!stack_page || esp < stack_page || esp >= 8188+stack_page)
     			return 0;
    @@ -564,7 +565,7 @@
     	    unsigned long stack_page;
     	    int count = 0;
    
    -	    stack_page = (unsigned long)p;
    +	    stack_page = (unsigned long)real_tsk(p);
     	    fp = ((struct switch_stack *)p->tss.ksp)->a6;
     	    do {
     		    if (fp < stack_page+sizeof(struct task_struct) ||
    @@ -585,7 +586,7 @@
     		unsigned long stack_page;
     		int count = 0;
    
    -		stack_page = 4096 + (unsigned long)p;
    +		stack_page = 4096 + (unsigned long)real_tsk(p);
     		fp = get_css_fp (&p->tss);
     		do {
     			if (fp < stack_page || fp > 4092+stack_page)
    @@ -599,7 +600,7 @@
     #elif defined (__sparc__)
     	{
     		unsigned long pc, fp, bias = 0;
    -		unsigned long task_base = (unsigned long) p;
    +		unsigned long task_base = (unsigned long) real_tsk(p);
     		struct reg_window *rw;
     		int count = 0;
    
    @@ -624,8 +625,8 @@
     }
    
     #if defined(__i386__)
    -# define KSTK_EIP(tsk)	(((unsigned long *)(4096+(unsigned long)(tsk)))[1019])
    -# define KSTK_ESP(tsk)	(((unsigned long *)(4096+(unsigned long)(tsk)))[1022])
    +# define KSTK_EIP(tsk)	(((unsigned long *)(4096+(unsigned long)real_tsk(tsk)))[1019])
    +# define KSTK_ESP(tsk)	(((unsigned long *)(4096+(unsigned long)real_tsk(tsk)))[1022])
     #elif defined(__alpha__)
       /*
        * See arch/alpha/kernel/ptrace.c for details.
    @@ -633,11 +634,11 @@
     # define PT_REG(reg)		(PAGE_SIZE - sizeof(struct pt_regs)	\
     				 + (long)&((struct pt_regs *)0)->reg)
     # define KSTK_EIP(tsk) \
    -    (*(unsigned long *)(PT_REG(pc) + PAGE_SIZE + (unsigned long)(tsk)))
    +    (*(unsigned long *)(PT_REG(pc) + PAGE_SIZE + (unsigned long)real_tsk(tsk)))
     # define KSTK_ESP(tsk)	((tsk) == current ? rdusp() : (tsk)->tss.usp)
     #elif defined(CONFIG_ARM)
    -# define KSTK_EIP(tsk)	(((unsigned long *)(4096+(unsigned long)(tsk)))[1022])
    -# define KSTK_ESP(tsk)	(((unsigned long *)(4096+(unsigned long)(tsk)))[1020])
    +# define KSTK_EIP(tsk)	(((unsigned long *)(4096+(unsigned long)real_tsk(tsk)))[1022])
    +# define KSTK_ESP(tsk)	(((unsigned long *)(4096+(unsigned long)real_tsk(tsk)))[1020])
     #elif defined(__mc68000__)
     #define	KSTK_EIP(tsk)	\
         ({			\
    @@ -646,7 +647,7 @@
     	    MAP_NR((tsk)->tss.esp0) < max_mapnr) \
     	      eip = ((struct pt_regs *) (tsk)->tss.esp0)->pc;	 \
     	eip; })
    -#define	KSTK_ESP(tsk)	((tsk) == current ? rdusp() : (tsk)->tss.usp)
    +#define	KSTK_ESP(tsk)	(real_tsk(tsk) == current ? rdusp() : (tsk)->tss.usp)
     #elif defined(__powerpc__)
     #define KSTK_EIP(tsk)	((tsk)->tss.regs->nip)
     #define KSTK_ESP(tsk)	((tsk)->tss.regs->gpr[1])
    @@ -851,21 +852,42 @@
    
     static struct task_struct *grab_task(int pid)
     {
    -	struct task_struct *tsk = current;
    +	struct task_struct *tsk = current, *dst;
     	if (pid != tsk->pid) {
    +		dst = kmem_cache_alloc(task_struct_head_cache, SLAB_KERNEL);
    +		if (!dst)
    +			return NULL;
     		read_lock(&tasklist_lock);
     		tsk = find_task_by_pid(pid);
    -		if (tsk && tsk->mm && tsk->mm != &init_mm)
    -			mmget(tsk->mm);
    +		if (tsk) {
    +			spin_lock(&mm_lock);
    +			if (tsk->mm && tsk->mm != &init_mm)
    +				if (!mmget(tsk->mm))
    +					tsk = NULL;
    +			spin_unlock(&mm_lock);
    +			if (tsk)
    +			{
    +				memcpy(dst, tsk, sizeof(struct task_struct));
    +				tsk = dst;
    +			}
    +		}
     		read_unlock(&tasklist_lock);
    -	}	
    +		if (!tsk)
    +			kmem_cache_free(task_struct_head_cache, dst);
    +	}
     	return tsk;
     }
    
     static void release_task(struct task_struct *tsk)
     {
    -	if (tsk != current && tsk->mm && tsk->mm != &init_mm)
    -		mmput(tsk->mm);
    +	if (tsk != current)
    +	{
    +		spin_lock(&mm_lock);
    +		if (tsk->mm && tsk->mm != &init_mm)
    +			mmput(tsk->mm);
    +		spin_unlock(&mm_lock);
    +		kmem_cache_free(task_struct_head_cache, tsk);
    +	}
     }
    
     static int get_status(int pid, char * buffer)
    @@ -1149,7 +1171,7 @@
     		goto getlen_out;
    
     	/* Check whether the mmaps could change if we sleep */
    -	volatile_task = (p != current || atomic_read(&p->mm->count) > 1);
    +	volatile_task = (p != current || p->mm->count > 1);
    
     	/* decode f_pos */
     	lineno = *ppos >> MAPS_LINE_SHIFT;
    @@ -1600,3 +1622,14 @@
     	NULL,			/* truncate */
     	NULL			/* permission */
     };
    +
    +void __init proc_array_init(void)
    +{
    +	task_struct_head_cache = kmem_cache_create("task_struct",
    +						   sizeof(struct task_struct),
    +						   0,
    +						   SLAB_HWCACHE_ALIGN,
    +						   NULL, NULL);
    +	if (!task_struct_head_cache)
    +		panic("cannot create task_struct cache");
    +}
    Index: fs/proc/root.c
    ===================================================================
    RCS file: /var/cvs/linux/fs/proc/root.c,v
    retrieving revision 1.1.1.2
    diff -u -r1.1.1.2 root.c
    --- root.c	1999/01/18 13:39:15	1.1.1.2
    +++ linux/fs/proc/root.c	1999/01/27 19:49:40
    @@ -671,6 +671,7 @@
    
     __initfunc(void proc_root_init(void))
     {
    +	proc_array_init();
     	proc_base_init();
     	proc_register(&proc_root, &proc_root_loadavg);
     	proc_register(&proc_root, &proc_root_uptime);
    Index: include/asm-i386/pgtable.h
    ===================================================================
    RCS file: /var/cvs/linux/include/asm-i386/pgtable.h,v
    retrieving revision 1.1.1.1
    diff -u -r1.1.1.1 pgtable.h
    --- pgtable.h	1999/01/18 01:27:15	1.1.1.1
    +++ linux/include/asm-i386/pgtable.h	1999/01/27 19:45:56
    @@ -101,7 +101,7 @@
     static inline void flush_tlb_current_task(void)
     {
     	/* just one copy of this mm? */
    -	if (atomic_read(&current->mm->count) == 1)
    +	if (current->mm->count == 1)
     		local_flush_tlb();	/* and that's us, so.. */
     	else
     		smp_flush_tlb();
    @@ -113,7 +113,7 @@
    
     static inline void flush_tlb_mm(struct mm_struct * mm)
     {
    -	if (mm == current->mm && atomic_read(&mm->count) == 1)
    +	if (mm == current->mm && mm->count == 1)
     		local_flush_tlb();
     	else
     		smp_flush_tlb();
    @@ -122,7 +122,7 @@
     static inline void flush_tlb_page(struct vm_area_struct * vma,
     	unsigned long va)
     {
    -	if (vma->vm_mm == current->mm && atomic_read(&current->mm->count) == 1)
    +	if (vma->vm_mm == current->mm && current->mm->count == 1)
     		__flush_tlb_one(va);
     	else
     		smp_flush_tlb();
    Index: include/linux/proc_fs.h
    ===================================================================
    RCS file: /var/cvs/linux/include/linux/proc_fs.h,v
    retrieving revision 1.1.1.1
    diff -u -r1.1.1.1 proc_fs.h
    --- proc_fs.h	1999/01/18 01:27:09	1.1.1.1
    +++ linux/include/linux/proc_fs.h	1999/01/27 19:48:49
    @@ -312,6 +312,7 @@
     extern struct inode_operations proc_scsi_inode_operations;
    
     extern void proc_root_init(void);
    +extern void proc_array_init(void);
     extern void proc_base_init(void);
    
     extern int proc_register(struct proc_dir_entry *, struct proc_dir_entry *);
    Index: include/linux/sched.h
    ===================================================================
    RCS file: /var/cvs/linux/include/linux/sched.h,v
    retrieving revision 1.1.1.3
    diff -u -r1.1.1.3 sched.h
    --- sched.h	1999/01/23 16:29:58	1.1.1.3
    +++ linux/include/linux/sched.h	1999/01/27 19:45:56
    @@ -114,6 +114,15 @@
     extern rwlock_t tasklist_lock;
     extern spinlock_t scheduler_lock;
    
    +/*
    + * These bracket the sleeping functions..
    + */
    +extern void scheduling_functions_start_here(void);
    +extern void scheduling_functions_end_here(void);
    +#define first_sched	((unsigned long) scheduling_functions_start_here)
    +#define last_sched	((unsigned long) scheduling_functions_end_here)
    +#define real_tsk(tsk)	(*(tsk)->tarray_ptr)
    +
     extern void sched_init(void);
     extern void show_state(void);
     extern void trap_init(void);
    @@ -164,7 +173,7 @@
     	struct vm_area_struct *mmap_avl;	/* tree of VMAs */
     	struct vm_area_struct *mmap_cache;	/* last find_vma result */
     	pgd_t * pgd;
    -	atomic_t count;
    +	int count;
     	int map_count;				/* number of VMAs */
     	struct semaphore mmap_sem;
     	unsigned long context;
    @@ -184,7 +193,7 @@
     #define INIT_MM {					\
     		&init_mmap, NULL, NULL,			\
     		swapper_pg_dir, 			\
    -		ATOMIC_INIT(1), 1,			\
    +		1, 1,					\
     		MUTEX,					\
     		0,					\
     		0, 0, 0, 0,				\
    @@ -391,6 +400,7 @@
    
     extern struct task_struct **tarray_freelist;
     extern spinlock_t taskslot_lock;
    +extern spinlock_t mm_lock;
    
     extern __inline__ void add_free_taskslot(struct task_struct **t)
     {
    @@ -609,11 +619,8 @@
      * Routines for handling mm_structs
      */
     extern struct mm_struct * mm_alloc(void);
    -static inline void mmget(struct mm_struct * mm)
    -{
    -	atomic_inc(&mm->count);
    -}
    -extern void mmput(struct mm_struct *);
    +extern int FASTCALL(mmget(struct mm_struct *));
    +extern void FASTCALL(mmput(struct mm_struct *));
     /* Remove the current tasks stale references to the old mm_struct */
     extern void mm_release(void);
    
    Index: kernel/exit.c
    ===================================================================
    RCS file: /var/cvs/linux/kernel/exit.c,v
    retrieving revision 1.1.1.2
    diff -u -r1.1.1.2 exit.c
    --- exit.c	1999/01/23 16:30:27	1.1.1.2
    +++ linux/kernel/exit.c	1999/01/27 19:36:10
    @@ -248,8 +248,10 @@
    
     static inline void __exit_mm(struct task_struct * tsk)
     {
    -	struct mm_struct * mm = tsk->mm;
    +	struct mm_struct * mm;
    
    +	spin_lock(&mm_lock);
    +	mm = tsk->mm;
     	/* Set us up to use the kernel mm state */
     	if (mm != &init_mm) {
     		flush_cache_mm(mm);
    @@ -261,6 +263,7 @@
     		mm_release();
     		mmput(mm);
     	}
    +	spin_unlock(&mm_lock);
     }
    
     void exit_mm(struct task_struct *tsk)
    Index: kernel/fork.c
    ===================================================================
    RCS file: /var/cvs/linux/kernel/fork.c,v
    retrieving revision 1.1.1.3
    diff -u -r1.1.1.3 fork.c
    --- fork.c	1999/01/23 16:30:27	1.1.1.3
    +++ linux/kernel/fork.c	1999/01/27 19:36:23
    @@ -10,6 +10,11 @@
      * Fork is rather simple, once you get the hang of it, but the memory
      * management can be a bitch. See 'mm/mm.c': 'copy_page_tables()'
      */
    +/*
    + * Fixed a race between mmget() and mmput(): added the mm_lock spinlock
    + * to serialize accesses to the tsk->mm field.
    + * Copyright (C) 1999  Andrea Arcangeli
    + */
    
     #include <linux/malloc.h>
     #include <linux/init.h>
    @@ -39,6 +44,7 @@
    
     struct task_struct **tarray_freelist = NULL;
     spinlock_t taskslot_lock = SPIN_LOCK_UNLOCKED;
    +spinlock_t mm_lock = SPIN_LOCK_UNLOCKED;
    
     /* UID task count cache, to prevent walking entire process list every
      * single fork() operation.
    @@ -261,7 +267,7 @@
     	if (mm) {
     		*mm = *current->mm;
     		init_new_context(mm);
    -		atomic_set(&mm->count, 1);
    +		mm->count = 1;
     		mm->map_count = 0;
     		mm->def_flags = 0;
     		mm->mmap_sem = MUTEX_LOCKED;
    @@ -303,12 +309,25 @@
     	}
     }
    
    +int mmget(struct mm_struct * mm)
    +{
    +	int retval = 0;
    +
    +	if (mm->count)
    +	{
    +		mm->count++;
    +		retval = 1;
    +	}
    +
    +	return retval;
    +}
    +
     /*
      * Decrement the use count and release all resources for an mm.
      */
     void mmput(struct mm_struct *mm)
     {
    -	if (atomic_dec_and_test(&mm->count)) {
    +	if (!--mm->count) {
     		release_segments(mm);
     		exit_mmap(mm);
     		free_page_tables(mm);
    @@ -322,7 +341,9 @@
     	int retval;
    
     	if (clone_flags & CLONE_VM) {
    +		spin_lock(&mm_lock);
     		mmget(current->mm);
    +		spin_unlock(&mm_lock);
     		/*
     		 * Set up the LDT descriptor for the clone task.
     		 */
    
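
    One last note, in case the fs/proc/array.c changes above are not
    obvious: grab_task() now returns a private copy of the task_struct
    (allocated from the new task_struct_head_cache and freed again in
    release_task()), the idea being that once tasklist_lock is dropped the
    /proc readers work on a snapshot instead of on a task that may go away
    under them. The few places that really need the live task (the kernel
    stack walks in get_wchan() and the KSTK_* macros) reach it through the
    new macro:

        #define real_tsk(tsk)   (*(tsk)->tarray_ptr)

    which follows the copied tarray_ptr back to the real slot in the task
    array.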


