immunant / IA2-Phase2

4 stars 0 forks source link

Syscall categorization #231

Open fw-immunant opened 1 year ago

fw-immunant commented 1 year ago

Controlling system calls isn't as simple as filtering them on syscall identity, but identity is the first criterion we filter on. I've gone through x86_64 syscalls and categorized them on the basis of what I think we ultimately want/need to do to support them and how they interact with our sandbox.

In general, we will disallow forbidden syscalls with seccomp-bpf discriminating on the syscall number; syscalls that require more complex filtering will have some amount of argument inspection done directly by logic in seccomp-bpf and some by a userspace helper.

In some cases we can give the userspace helper more latitude for inspecting arguments (e.g. those in buffers pointed at by syscall arguments) by wrapping syscalls in a shim that "freezes" these arguments by copying them to a read-only memory region. This allows us to avoid TOCTTOU by checking pointers against a known map of frozen memory regions and to racelessly inspect pointed-to data if it is within a frozen region.

I've attempted to categorize all syscalls for x86_64, but we should implement filtering as an allowlist prioritized based on the syscalls used by common programs, our tests, and nginx. I'll file a separate issue on syscall usage characterization and which subsets of the below table to prioritize.

List of x86_64 syscalls is from here: https://syscalls.mebeim.net/?table=x86/64/x64/v6.3

The table (editable here) is inlined below, as converted with this tool followed by sed -r -e 's/ +/ /g' to collapse spaces and make it fit in GH's comment character limit:

syscalls, categorization, and policy | %rax | name | entrypoint | implementation | category | policy | |-------|-------------------------|-----------------------------|------------------------------------|---------------|------------------------------------------------------------------------------------------| | 0 | read | sys_read | fs/read_write.c | io | allow | | 1 | write | sys_write | fs/read_write.c | io | allow | | 2 | open | sys_open | fs/open.c | fs | not special devices (/proc/self/mem) | | 3 | close | sys_close | fs/open.c | io | allow | | 4 | stat | sys_newstat | fs/stat.c | fs | allow | | 5 | fstat | sys_newfstat | fs/stat.c | fs | allow | | 6 | lstat | sys_newlstat | fs/stat.c | fs | allow | | 7 | poll | sys_poll | fs/select.c | io | allow | | 8 | lseek | sys_lseek | fs/read_write.c | io | allow | | 9 | mmap | sys_mmap | arch/x86/kernel/sys_x86_64.c | mem | complex filtering (mmap) | | 10 | mprotect | sys_mprotect | mm/mprotect.c | mem | complex filtering (mmap) | | 11 | munmap | sys_munmap | mm/mmap.c | mem | complex filtering (mmap) | | 12 | brk | sys_brk | mm/mmap.c | mem | allow | | 13 | rt_sigaction | sys_rt_sigaction | kernel/signal.c | signal | complex filtering (signal) | | 14 | rt_sigprocmask | sys_rt_sigprocmask | kernel/signal.c | signal | complex filtering (signal) | | 15 | rt_sigreturn | stub_rt_sigreturn | arch/x86/kernel/signal.c | signal | complex filtering (signal) | | 16 | ioctl | sys_ioctl | fs/ioctl.c | multiplexed | complex filtering (allowlist of simple ioctls like tty ones) | | 17 | pread64 | sys_pread64 | fs/read_write.c | io | allow | | 18 | pwrite64 | sys_pwrite64 | fs/read_write.c | io | allow | | 19 | readv | sys_readv | fs/read_write.c | io | allow | | 20 | writev | sys_writev | fs/read_write.c | io | allow | | 21 | access | sys_access | fs/open.c | fs | allow | | 22 | pipe | sys_pipe | fs/pipe.c | fd | allow | | 23 | select | sys_select | fs/select.c | io | allow | | 24 | sched_yield | sys_sched_yield | 
kernel/sched/core.c | sched | allow | | 25 | mremap | sys_mremap | mm/mmap.c | mem | complex filtering (mmap) | | 26 | msync | sys_msync | mm/msync.c | mem | allow | | 27 | mincore | sys_mincore | mm/mincore.c | mem | allow | | 28 | madvise | sys_madvise | mm/madvise.c | mem | allow? (can this change semantics in a dangerous way?) | | 29 | shmget | sys_shmget | ipc/shm.c | shm | allow | | 30 | shmat | sys_shmat | ipc/shm.c | shm | forbid (alters memory mappings, out-of-scope for now) | | 31 | shmctl | sys_shmctl | ipc/shm.c | shm | allow | | 32 | dup | sys_dup | fs/file.c | fd | allow | | 33 | dup2 | sys_dup2 | fs/file.c | fd | allow | | 34 | pause | sys_pause | kernel/signal.c | sched | allow | | 35 | nanosleep | sys_nanosleep | kernel/hrtimer.c | sched | allow | | 36 | getitimer | sys_getitimer | kernel/itimer.c | sched | allow | | 37 | alarm | sys_alarm | kernel/timer.c | sched | allow | | 38 | setitimer | sys_setitimer | kernel/itimer.c | sched | allow | | 39 | getpid | sys_getpid | kernel/sys.c | process | allow | | 40 | sendfile | sys_sendfile64 | fs/read_write.c | io | allow | | 41 | socket | sys_socket | net/socket.c | fd | allow | | 42 | connect | sys_connect | net/socket.c | fd | allow | | 43 | accept | sys_accept | net/socket.c | fd | allow | | 44 | sendto | sys_sendto | net/socket.c | io | allow | | 45 | recvfrom | sys_recvfrom | net/socket.c | io | allow | | 46 | sendmsg | sys_sendmsg | net/socket.c | io | allow | | 47 | recvmsg | sys_recvmsg | net/socket.c | io | allow | | 48 | shutdown | sys_shutdown | net/socket.c | io | allow | | 49 | bind | sys_bind | net/socket.c | io | allow | | 50 | listen | sys_listen | net/socket.c | fd | allow | | 51 | getsockname | sys_getsockname | net/socket.c | io | allow | | 52 | getpeername | sys_getpeername | net/socket.c | io | allow | | 53 | socketpair | sys_socketpair | net/socket.c | fd | allow | | 54 | setsockopt | sys_setsockopt | net/socket.c | io | allow | | 55 | getsockopt | sys_getsockopt | net/socket.c | io 
| allow | | 56 | clone | stub_clone | kernel/fork.c | thread | complex filtering (only allowed from our wrapper with args we pass) | | 57 | fork | stub_fork | kernel/fork.c | thread | allow (we support fork) | | 58 | vfork | stub_vfork | kernel/fork.c | thread | allow (is vfork in IA2 safe in the same conditions as outside?) | | 59 | execve | stub_execve | fs/exec.c | exec | allow | | 60 | exit | sys_exit | kernel/exit.c | process | allow | | 61 | wait4 | sys_wait4 | kernel/exit.c | ipc | allow | | 62 | kill | sys_kill | kernel/signal.c | process | allow | | 63 | uname | sys_newuname | kernel/sys.c | ambient | allow | | 64 | semget | sys_semget | ipc/sem.c | obscure | forbid (obscure) | | 65 | semop | sys_semop | ipc/sem.c | obscure | forbid (obscure) | | 66 | semctl | sys_semctl | ipc/sem.c | obscure | forbid (obscure) | | 67 | shmdt | sys_shmdt | ipc/shm.c | obscure | forbid (obscure) | | 68 | msgget | sys_msgget | ipc/msg.c | obscure | forbid (obscure) | | 69 | msgsnd | sys_msgsnd | ipc/msg.c | obscure | forbid (obscure) | | 70 | msgrcv | sys_msgrcv | ipc/msg.c | obscure | forbid (obscure) | | 71 | msgctl | sys_msgctl | ipc/msg.c | obscure | forbid (obscure) | | 72 | fcntl | sys_fcntl | fs/fcntl.c | fd | allow | | 73 | flock | sys_flock | fs/locks.c | fs | allow | | 74 | fsync | sys_fsync | fs/sync.c | fs | allow | | 75 | fdatasync | sys_fdatasync | fs/sync.c | fs | allow | | 76 | truncate | sys_truncate | fs/open.c | fs | allow | | 77 | ftruncate | sys_ftruncate | fs/open.c | fs | allow | | 78 | getdents | sys_getdents | fs/readdir.c | fs | allow | | 79 | getcwd | sys_getcwd | fs/dcache.c | fs | allow | | 80 | chdir | sys_chdir | fs/open.c | fs | allow | | 81 | fchdir | sys_fchdir | fs/open.c | fs | allow | | 82 | rename | sys_rename | fs/namei.c | fs | allow | | 83 | mkdir | sys_mkdir | fs/namei.c | fs | allow | | 84 | rmdir | sys_rmdir | fs/namei.c | fs | allow | | 85 | creat | sys_creat | fs/open.c | fs | allow | | 86 | link | sys_link | fs/namei.c | fs | 
allow | | 87 | unlink | sys_unlink | fs/namei.c | fs | allow | | 88 | symlink | sys_symlink | fs/namei.c | fs | allow | | 89 | readlink | sys_readlink | fs/stat.c | fs | allow | | 90 | chmod | sys_chmod | fs/open.c | fs | allow | | 91 | fchmod | sys_fchmod | fs/open.c | fs | allow | | 92 | chown | sys_chown | fs/open.c | fs | allow | | 93 | fchown | sys_fchown | fs/open.c | fs | allow | | 94 | lchown | sys_lchown | fs/open.c | fs | allow | | 95 | umask | sys_umask | kernel/sys.c | process | allow | | 96 | gettimeofday | sys_gettimeofday | kernel/time.c | ambient | allow | | 97 | getrlimit | sys_getrlimit | kernel/sys.c | ambient | allow | | 98 | getrusage | sys_getrusage | kernel/sys.c | ambient | allow | | 99 | sysinfo | sys_sysinfo | kernel/sys.c | ambient | allow | | 100 | times | sys_times | kernel/sys.c | ambient | allow | | 101 | ptrace | sys_ptrace | kernel/ptrace.c | debug | complex filtering (may be used as part of sandbox, forbid otherwise) | | 102 | getuid | sys_getuid | kernel/sys.c | process | allow | | 103 | syslog | sys_syslog | kernel/printk/printk.c | io | allow | | 104 | getgid | sys_getgid | kernel/sys.c | process | allow | | 105 | setuid | sys_setuid | kernel/sys.c | process | forbid (security) | | 106 | setgid | sys_setgid | kernel/sys.c | process | forbid (security) | | 107 | geteuid | sys_geteuid | kernel/sys.c | process | allow | | 108 | getegid | sys_getegid | kernel/sys.c | process | allow | | 109 | setpgid | sys_setpgid | kernel/sys.c | process | forbid (security) | | 110 | getppid | sys_getppid | kernel/sys.c | process | allow | | 111 | getpgrp | sys_getpgrp | kernel/sys.c | process | allow | | 112 | setsid | sys_setsid | kernel/sys.c | process | forbid (security) | | 113 | setreuid | sys_setreuid | kernel/sys.c | process | forbid (security) | | 114 | setregid | sys_setregid | kernel/sys.c | process | forbid (security) | | 115 | getgroups | sys_getgroups | kernel/groups.c | process | allow | | 116 | setgroups | sys_setgroups | 
kernel/groups.c | process | forbid (security) | | 117 | setresuid | sys_setresuid | kernel/sys.c | process | forbid (security) | | 118 | getresuid | sys_getresuid | kernel/sys.c | process | allow | | 119 | setresgid | sys_setresgid | kernel/sys.c | process | forbid (security) | | 120 | getresgid | sys_getresgid | kernel/sys.c | process | allow | | 121 | getpgid | sys_getpgid | kernel/sys.c | process | allow | | 122 | setfsuid | sys_setfsuid | kernel/sys.c | process | forbid (security) | | 123 | setfsgid | sys_setfsgid | kernel/sys.c | process | forbid (security) | | 124 | getsid | sys_getsid | kernel/sys.c | process | forbid (security) | | 125 | capget | sys_capget | kernel/capability.c | caps | forbid (security) | | 126 | capset | sys_capset | kernel/capability.c | caps | forbid (security) | | 127 | rt_sigpending | sys_rt_sigpending | kernel/signal.c | signal | complex filtering (signal) | | 128 | rt_sigtimedwait | sys_rt_sigtimedwait | kernel/signal.c | signal | complex filtering (signal) | | 129 | rt_sigqueueinfo | sys_rt_sigqueueinfo | kernel/signal.c | signal | complex filtering (signal) | | 130 | rt_sigsuspend | sys_rt_sigsuspend | kernel/signal.c | signal | complex filtering (signal) | | 131 | sigaltstack | sys_sigaltstack | kernel/signal.c | signal | complex filtering (signal) | | 132 | utime | sys_utime | fs/utimes.c | ambient | allow | | 133 | mknod | sys_mknod | fs/namei.c | device | forbid (privileged, avoid devices quagmire) | | 134 | uselib | | fs/exec.c | obsolete | forbid (obsolete) | | 135 | personality | sys_personality | kernel/exec_domain.c | process | forbid (too weird) | | 136 | ustat | sys_ustat | fs/statfs.c | fs | allow | | 137 | statfs | sys_statfs | fs/statfs.c | fs | allow | | 138 | fstatfs | sys_fstatfs | fs/statfs.c | fs | allow | | 139 | sysfs | sys_sysfs | fs/filesystems.c | fs | forbid (obsolete) | | 140 | getpriority | sys_getpriority | kernel/sys.c | sched | allow | | 141 | setpriority | sys_setpriority | kernel/sys.c | sched | 
allow | | 142 | sched_setparam | sys_sched_setparam | kernel/sched/core.c | sched | allow | | 143 | sched_getparam | sys_sched_getparam | kernel/sched/core.c | sched | allow | | 144 | sched_setscheduler | sys_sched_setscheduler | kernel/sched/core.c | sched | allow | | 145 | sched_getscheduler | sys_sched_getscheduler | kernel/sched/core.c | sched | allow | | 146 | sched_get_priority_max | sys_sched_get_priority_max | kernel/sched/core.c | sched | allow | | 147 | sched_get_priority_min | sys_sched_get_priority_min | kernel/sched/core.c | sched | allow | | 148 | sched_rr_get_interval | sys_sched_rr_get_interval | kernel/sched/core.c | sched | allow | | 149 | mlock | sys_mlock | mm/mlock.c | mem | allow (resident vs not does not change protection semantics) | | 150 | munlock | sys_munlock | mm/mlock.c | mem | allow (resident vs not does not change protection semantics) | | 151 | mlockall | sys_mlockall | mm/mlock.c | mem | allow (resident vs not does not change protection semantics) | | 152 | munlockall | sys_munlockall | mm/mlock.c | mem | allow (resident vs not does not change protection semantics) | | 153 | vhangup | sys_vhangup | fs/open.c | tty | allow | | 154 | modify_ldt | sys_modify_ldt | arch/x86/um/ldt.c | arch-specific | forbid (obscure) | | 155 | pivot_root | sys_pivot_root | fs/namespace.c | process | forbid (privileged) | | 156 | _sysctl | sys_sysctl | kernel/sysctl_binary.c | obsolete | forbid (obsolete) | | 157 | prctl | sys_prctl | kernel/sys.c | process | complex filtering (prctl) | | 158 | arch_prctl | sys_arch_prctl | arch/x86/um/syscalls_64.c | process | complex filtering (arch_prctl needed for SET_FS; glibc seems to do ARCH_CET_STATUS also) | | 159 | adjtimex | sys_adjtimex | kernel/time.c | system | forbid (privileged) | | 160 | setrlimit | sys_setrlimit | kernel/sys.c | process | allow | | 161 | chroot | sys_chroot | fs/open.c | process | forbid (privileged) | | 162 | sync | sys_sync | fs/sync.c | fs | allow | | 163 | acct | sys_acct | 
kernel/acct.c | fs | forbid (obscure) | | 164 | settimeofday | sys_settimeofday | kernel/time.c | system | forbid (privileged) | | 165 | mount | sys_mount | fs/namespace.c | fs | forbid (privileged) | | 166 | umount2 | sys_umount | fs/namespace.c | fs | forbid (privileged) | | 167 | swapon | sys_swapon | mm/swapfile.c | fs | forbid (privileged) | | 168 | swapoff | sys_swapoff | mm/swapfile.c | fs | forbid (privileged) | | 169 | reboot | sys_reboot | kernel/reboot.c | system | forbid (privileged) | | 170 | sethostname | sys_sethostname | kernel/sys.c | system | allow | | 171 | setdomainname | sys_setdomainname | kernel/sys.c | system | allow | | 172 | iopl | stub_iopl | arch/x86/kernel/ioport.c | process | forbid (obscure) | | 173 | ioperm | sys_ioperm | arch/x86/kernel/ioport.c | process | forbid (obscure) | | 174 | create_module | | NOT IMPLEMENTED | unimpl | forbid (unimpl) | | 175 | init_module | sys_init_module | kernel/module.c | kernel | forbid (privileged) | | 176 | delete_module | sys_delete_module | kernel/module.c | kernel | forbid (privileged) | | 177 | get_kernel_syms | | NOT IMPLEMENTED | unimpl | forbid (unimpl) | | 178 | query_module | | NOT IMPLEMENTED | unimpl | forbid (unimpl) | | 179 | quotactl | sys_quotactl | fs/quota/quota.c | fs | allow | | 180 | nfsservctl | | NOT IMPLEMENTED | unimpl | forbid (unimpl) | | 181 | getpmsg | | NOT IMPLEMENTED | unimpl | forbid (unimpl) | | 182 | putpmsg | | NOT IMPLEMENTED | unimpl | forbid (unimpl) | | 183 | afs_syscall | | NOT IMPLEMENTED | unimpl | forbid (unimpl) | | 184 | tuxcall | | NOT IMPLEMENTED | unimpl | forbid (unimpl) | | 185 | security | | NOT IMPLEMENTED | unimpl | forbid (unimpl) | | 186 | gettid | sys_gettid | kernel/sys.c | ambient | allow | | 187 | readahead | sys_readahead | mm/readahead.c | io | allow | | 188 | setxattr | sys_setxattr | fs/xattr.c | fs | allow | | 189 | lsetxattr | sys_lsetxattr | fs/xattr.c | fs | allow | | 190 | fsetxattr | sys_fsetxattr | fs/xattr.c | fs | allow | | 191 
| getxattr | sys_getxattr | fs/xattr.c | fs | allow | | 192 | lgetxattr | sys_lgetxattr | fs/xattr.c | fs | allow | | 193 | fgetxattr | sys_fgetxattr | fs/xattr.c | fs | allow | | 194 | listxattr | sys_listxattr | fs/xattr.c | fs | allow | | 195 | llistxattr | sys_llistxattr | fs/xattr.c | fs | allow | | 196 | flistxattr | sys_flistxattr | fs/xattr.c | fs | allow | | 197 | removexattr | sys_removexattr | fs/xattr.c | fs | allow | | 198 | lremovexattr | sys_lremovexattr | fs/xattr.c | fs | allow | | 199 | fremovexattr | sys_fremovexattr | fs/xattr.c | fs | allow | | 200 | tkill | sys_tkill | kernel/signal.c | thread | allow | | 201 | time | sys_time | kernel/time.c | ambient | allow | | 202 | futex | sys_futex | kernel/futex.c | thread | allow | | 203 | sched_setaffinity | sys_sched_setaffinity | kernel/sched/core.c | sched | allow | | 204 | sched_getaffinity | sys_sched_getaffinity | kernel/sched/core.c | sched | allow | | 205 | set_thread_area | | arch/x86/kernel/tls.c | thread | forbid (afaik unused by glibc threading) | | 206 | io_setup | sys_io_setup | fs/aio.c | aio | forbid (out-of-scope) | | 207 | io_destroy | sys_io_destroy | fs/aio.c | aio | forbid (out-of-scope) | | 208 | io_getevents | sys_io_getevents | fs/aio.c | aio | forbid (out-of-scope) | | 209 | io_submit | sys_io_submit | fs/aio.c | aio | forbid (out-of-scope) | | 210 | io_cancel | sys_io_cancel | fs/aio.c | aio | forbid (out-of-scope) | | 211 | get_thread_area | | arch/x86/kernel/tls.c | thread | allow | | 212 | lookup_dcookie | sys_lookup_dcookie | fs/dcookies.c | fs | allow | | 213 | epoll_create | sys_epoll_create | fs/eventpoll.c | fd | allow | | 214 | epoll_ctl_old | | NOT IMPLEMENTED | unimpl | forbid (unimpl) | | 215 | epoll_wait_old | | NOT IMPLEMENTED | unimpl | forbid (unimpl) | | 216 | remap_file_pages | sys_remap_file_pages | mm/fremap.c | mem | forbid (obscure) | | 217 | getdents64 | sys_getdents64 | fs/readdir.c | fs | allow | | 218 | set_tid_address | sys_set_tid_address | 
kernel/fork.c | thread | complex filtering (thread setup uses, should not be used otherwise) | | 219 | restart_syscall | sys_restart_syscall | kernel/signal.c | signal | allow (glibc wrappers for poll etc. use) | | 220 | semtimedop | sys_semtimedop | ipc/sem.c | thread | allow | | 221 | fadvise64 | sys_fadvise64 | mm/fadvise.c | fs | allow | | 222 | timer_create | sys_timer_create | kernel/posix-timers.c | sched | allow | | 223 | timer_settime | sys_timer_settime | kernel/posix-timers.c | sched | allow | | 224 | timer_gettime | sys_timer_gettime | kernel/posix-timers.c | sched | allow | | 225 | timer_getoverrun | sys_timer_getoverrun | kernel/posix-timers.c | sched | allow | | 226 | timer_delete | sys_timer_delete | kernel/posix-timers.c | sched | allow | | 227 | clock_settime | sys_clock_settime | kernel/posix-timers.c | sched | allow | | 228 | clock_gettime | sys_clock_gettime | kernel/posix-timers.c | sched | allow | | 229 | clock_getres | sys_clock_getres | kernel/posix-timers.c | sched | allow | | 230 | clock_nanosleep | sys_clock_nanosleep | kernel/posix-timers.c | sched | allow | | 231 | exit_group | sys_exit_group | kernel/exit.c | process | allow | | 232 | epoll_wait | sys_epoll_wait | fs/eventpoll.c | io | allow | | 233 | epoll_ctl | sys_epoll_ctl | fs/eventpoll.c | io | allow | | 234 | tgkill | sys_tgkill | kernel/signal.c | process | allow | | 235 | utimes | sys_utimes | fs/utimes.c | fs | allow | | 236 | vserver | | NOT IMPLEMENTED | unimpl | forbid (unimpl) | | 237 | mbind | sys_mbind | mm/mempolicy.c | numa | forbid (out-of-scope) | | 238 | set_mempolicy | sys_set_mempolicy | mm/mempolicy.c | numa | forbid (out-of-scope) | | 239 | get_mempolicy | sys_get_mempolicy | mm/mempolicy.c | numa | forbid (out-of-scope) | | 240 | mq_open | sys_mq_open | ipc/mqueue.c | mq | allow | | 241 | mq_unlink | sys_mq_unlink | ipc/mqueue.c | mq | allow | | 242 | mq_timedsend | sys_mq_timedsend | ipc/mqueue.c | mq | allow | | 243 | mq_timedreceive | sys_mq_timedreceive | 
ipc/mqueue.c | mq | allow | | 244 | mq_notify | sys_mq_notify | ipc/mqueue.c | mq | allow | | 245 | mq_getsetattr | sys_mq_getsetattr | ipc/mqueue.c | mq | allow | | 246 | kexec_load | sys_kexec_load | kernel/kexec.c | kernel | forbid (privileged) | | 247 | waitid | sys_waitid | kernel/exit.c | ipc | allow | | 248 | add_key | sys_add_key | security/keys/keyctl.c | key | forbid (obscure) | | 249 | request_key | sys_request_key | security/keys/keyctl.c | key | forbid (obscure) | | 250 | keyctl | sys_keyctl | security/keys/keyctl.c | key | forbid (obscure) | | 251 | ioprio_set | sys_ioprio_set | fs/ioprio.c | sched | allow | | 252 | ioprio_get | sys_ioprio_get | fs/ioprio.c | sched | allow | | 253 | inotify_init | sys_inotify_init | fs/notify/inotify/inotify_user.c | inotify | allow | | 254 | inotify_add_watch | sys_inotify_add_watch | fs/notify/inotify/inotify_user.c | inotify | allow | | 255 | inotify_rm_watch | sys_inotify_rm_watch | fs/notify/inotify/inotify_user.c | inotify | allow | | 256 | migrate_pages | sys_migrate_pages | mm/mempolicy.c | numa | forbid (out-of-scope) | | 257 | openat | sys_openat | fs/open.c | fs | complex filtering (see open) | | 258 | mkdirat | sys_mkdirat | fs/namei.c | fs | allow | | 259 | mknodat | sys_mknodat | fs/namei.c | device | forbid (privileged, avoid devices quagmire) | | 260 | fchownat | sys_fchownat | fs/open.c | fs | allow | | 261 | futimesat | sys_futimesat | fs/utimes.c | fs | allow | | 262 | newfstatat | sys_newfstatat | fs/stat.c | fs | allow | | 263 | unlinkat | sys_unlinkat | fs/namei.c | fs | allow | | 264 | renameat | sys_renameat | fs/namei.c | fs | allow | | 265 | linkat | sys_linkat | fs/namei.c | fs | allow | | 266 | symlinkat | sys_symlinkat | fs/namei.c | fs | allow | | 267 | readlinkat | sys_readlinkat | fs/stat.c | fs | allow | | 268 | fchmodat | sys_fchmodat | fs/open.c | fs | allow | | 269 | faccessat | sys_faccessat | fs/open.c | fs | allow | | 270 | pselect6 | sys_pselect6 | fs/select.c | io | allow | | 
271 | ppoll | sys_ppoll | fs/select.c | io | allow | | 272 | unshare | sys_unshare | kernel/fork.c | process | complex filtering (see clone) | | 273 | set_robust_list | sys_set_robust_list | kernel/futex.c | thread | allow (thread setup uses) | | 274 | get_robust_list | sys_get_robust_list | kernel/futex.c | thread | allow | | 275 | splice | sys_splice | fs/splice.c | io | allow | | 276 | tee | sys_tee | fs/splice.c | io | allow | | 277 | sync_file_range | sys_sync_file_range | fs/sync.c | mem | allow | | 278 | vmsplice | sys_vmsplice | fs/splice.c | io | allow | | 279 | move_pages | sys_move_pages | mm/migrate.c | numa | forbid (out-of-scope) | | 280 | utimensat | sys_utimensat | fs/utimes.c | fs | allow | | 281 | epoll_pwait | sys_epoll_pwait | fs/eventpoll.c | io | allow | | 282 | signalfd | sys_signalfd | fs/signalfd.c | signal | forbid (signals) | | 283 | timerfd_create | sys_timerfd_create | fs/timerfd.c | fd | allow | | 284 | eventfd | sys_eventfd | fs/eventfd.c | fd | allow | | 285 | fallocate | sys_fallocate | fs/open.c | fs | allow | | 286 | timerfd_settime | sys_timerfd_settime | fs/timerfd.c | fd | allow | | 287 | timerfd_gettime | sys_timerfd_gettime | fs/timerfd.c | fd | allow | | 288 | accept4 | sys_accept4 | net/socket.c | fd | allow | | 289 | signalfd4 | sys_signalfd4 | fs/signalfd.c | signal | complex filtering (signal) | | 290 | eventfd2 | sys_eventfd2 | fs/eventfd.c | fd | allow | | 291 | epoll_create1 | sys_epoll_create1 | fs/eventpoll.c | fd | allow | | 292 | dup3 | sys_dup3 | fs/file.c | fd | allow | | 293 | pipe2 | sys_pipe2 | fs/pipe.c | fd | allow | | 294 | inotify_init1 | sys_inotify_init1 | fs/notify/inotify/inotify_user.c | inotify | allow | | 295 | preadv | sys_preadv | fs/read_write.c | io | allow | | 296 | pwritev | sys_pwritev | fs/read_write.c | io | allow | | 297 | rt_tgsigqueueinfo | sys_rt_tgsigqueueinfo | kernel/signal.c | signal | complex filtering (signal) | | 298 | perf_event_open | sys_perf_event_open | kernel/events/core.c 
| kernel | forbid (privileged) | | 299 | recvmmsg | sys_recvmmsg | net/socket.c | io | allow | | 300 | fanotify_init | sys_fanotify_init | fs/notify/fanotify/fanotify_user.c | fanotify | allow | | 301 | fanotify_mark | sys_fanotify_mark | fs/notify/fanotify/fanotify_user.c | fanotify | allow | | 302 | prlimit64 | sys_prlimit64 | kernel/sys.c | process | allow | | 303 | name_to_handle_at | sys_name_to_handle_at | fs/fhandle.c | fs | allow | | 304 | open_by_handle_at | sys_open_by_handle_at | fs/fhandle.c | fs | allow | | 305 | clock_adjtime | sys_clock_adjtime | kernel/posix-timers.c | system | allow | | 306 | syncfs | sys_syncfs | fs/sync.c | fs | allow | | 307 | sendmmsg | sys_sendmmsg | net/socket.c | io | allow | | 308 | setns | sys_setns | kernel/nsproxy.c | process | forbid (namespaces) | | 309 | getcpu | sys_getcpu | kernel/sys.c | ambient | allow | | 310 | process_vm_readv | sys_process_vm_readv | mm/process_vm_access.c | debug | forbid (debugger) | | 311 | process_vm_writev | sys_process_vm_writev | mm/process_vm_access.c | debug | forbid (debugger) | | 312 | kcmp | sys_kcmp | kernel/kcmp.c | kernel | forbid (obscure) | | 313 | finit_module | sys_finit_module | kernel/module.c | kernel | forbid (privileged) | | 314 | sched_setattr | sys_sched_setattr | kernel/sched/core.c:7987 | sched | allow | | 315 | sched_getattr | sys_sched_getattr | kernel/sched/core.c:8140 | sched | allow | | 316 | renameat2 | sys_renameat2 | fs/namei.c:4953 | fs | allow | | 317 | seccomp | sys_seccomp | kernel/seccomp.c:2007 | security | complex filtering (part of sandbox) | | 318 | getrandom | sys_getrandom | drivers/char/random.c:1364 | io | allow | | 319 | memfd_create | sys_memfd_create | mm/memfd.c:275 | mem | complex filtering (memory) | | 320 | kexec_file_load | sys_kexec_file_load | kernel/kexec_file.c:325 | kernel | forbid (privileged) | | 321 | bpf | sys_bpf | kernel/bpf/syscall.c:5094 | security | complex filtering (part of sandbox) | | 322 | execveat | sys_execveat | 
fs/exec.c:2113 | exec | allow | | 323 | userfaultfd | sys_userfaultfd | fs/userfaultfd.c:2141 | signal | forbid (signals) | | 324 | membarrier | sys_membarrier | kernel/sched/membarrier.c:614 | thread | allow | | 325 | mlock2 | sys_mlock2 | mm/mlock.c:618 | mem | allow (resident vs not does not change protection semantics) | | 326 | copy_file_range | sys_copy_file_range | fs/read_write.c:1559 | io | allow | | 327 | preadv2 | sys_preadv2 | fs/read_write.c:1061 | io | allow | | 328 | pwritev2 | sys_pwritev2 | fs/read_write.c:1081 | io | allow | | 329 | pkey_mprotect | sys_pkey_mprotect | mm/mprotect.c:857 | pkey | complex filtering (part of sandbox) | | 330 | pkey_alloc | sys_pkey_alloc | mm/mprotect.c:863 | pkey | complex filtering (part of sandbox) | | 331 | pkey_free | sys_pkey_free | mm/mprotect.c:893 | pkey | complex filtering (part of sandbox) | | 332 | statx | sys_statx | fs/stat.c:677 | fs | allow | | 333 | io_pgetevents | sys_io_pgetevents | fs/aio.c:2247 | aio | forbid (out-of-scope) | | 334 | rseq | sys_rseq | kernel/rseq.c:365 | rseq | allow | | (gap) | | | | | | | 424 | pidfd_send_signal | sys_pidfd_send_signal | kernel/signal.c:3849 | ipc | allow | | 425 | io_uring_setup | sys_io_uring_setup | io_uring/io_uring.c:3852 | io_uring | forbid (out-of-scope) | | 426 | io_uring_enter | sys_io_uring_enter | io_uring/io_uring.c:3392 | io_uring | forbid (out-of-scope) | | 427 | io_uring_register | sys_io_uring_register | io_uring/io_uring.c:4303 | io_uring | forbid (out-of-scope) | | 428 | open_tree | sys_open_tree | fs/namespace.c:2506 | fs | forbid (privileged) | | 429 | move_mount | sys_move_mount | fs/namespace.c:3768 | fs | forbid (privileged) | | 430 | fsopen | sys_fsopen | fs/fsopen.c:115 | fs | forbid (privileged) | | 431 | fsconfig | sys_fsconfig | fs/fsopen.c:314 | fs | forbid (privileged) | | 432 | fsmount | sys_fsmount | fs/namespace.c:3639 | fs | forbid (privileged) | | 433 | fspick | sys_fspick | fs/fsopen.c:158 | fs | forbid (privileged) | | 434 | 
pidfd_open | sys_pidfd_open | kernel/pid.c:629 | fd | allow | | 435 | clone3 | sys_clone3 | kernel/fork.c:2966 | process | complex filtering (see clone) | | 436 | close_range | sys_close_range | fs/open.c:1501 | fd | allow | | 437 | openat2 | sys_openat2 | fs/open.c:1383 | fs | complex filtering (see open) | | 438 | pidfd_getfd | sys_pidfd_getfd | kernel/pid.c:724 | fd | allow | | 439 | faccessat2 | sys_faccessat2 | fs/open.c:534 | fs | allow | | 440 | process_madvise | sys_process_madvise | mm/madvise.c:1455 | mem | forbid (mem) | | 441 | epoll_pwait2 | sys_epoll_pwait2 | fs/eventpoll.c:2310 | io | allow | | 442 | mount_setattr | sys_mount_setattr | fs/namespace.c:4334 | fs | forbid (privileged) | | 443 | quotactl_fd | sys_quotactl_fd | fs/quota/quota.c:972 | fs | allow | | 444 | landlock_create_ruleset | sys_landlock_create_ruleset | security/landlock/syscalls.c:157 | security | allow (use to restrict what can be open()ed) | | 445 | landlock_add_rule | sys_landlock_add_rule | security/landlock/syscalls.c:305 | security | allow (use to restrict what can be open()ed) | | 446 | landlock_restrict_self | sys_landlock_restrict_self | security/landlock/syscalls.c:397 | security | allow (use to restrict what can be open()ed) | | 447 | memfd_secret | sys_memfd_secret | mm/secretmem.c:231 | mem | forbid (out-of-scope) | | 448 | process_mrelease | sys_process_mrelease | mm/oom_kill.c:1200 | mem | forbid (out-of-scope) | | 449 | futex_waitv | sys_futex_waitv | kernel/futex/syscalls.c:246 | thread | allow | | 450 | set_mempolicy_home_node | sys_set_mempolicy_home_node | mm/mempolicy.c:1485 | numa | forbid (out-of-scope) |
rinon commented 1 year ago

open can be dealt with by selinux, can we mix selinux and seccomp-bpf?

brk seems like it might be dangerous, we need to test how it interacts with existing mappings. In general it is deprecated anyway, so we may not need to allow it.

shmat is dangerous - I believe that can remap memory.

We can go through the whole list in more detail while allow-listing each that we need, these just stuck out to me.

fw-immunant commented 1 year ago

> open can be dealt with by selinux, can we mix selinux and seccomp-bpf?

We can (I think ChromeOS folks investigated this though they didn't end up going with it as they were able to just modify the kernel), but after reading about it I think actually the Landlock LSM is probably more appropriate, if we can reasonably depend on it. Landlock gets us unprivileged access to control over access to kernel objects without the system-wide changes that SELinux requires. This means we'll end up allowing the three landlock syscalls landlock_create_ruleset, landlock_add_rule, and landlock_restrict_self.

> brk seems like it might be dangerous, we need to test how it interacts with existing mappings. In general it is deprecated anyway, so we may not need to allow it.

In my tests (see #233) we do need to allow brk, but probably not after initial program startup.

> shmat is dangerous - I believe that can remap memory.

I lumped this in with other shm operations without actually knowing its precise semantics. Thanks for catching it.

EDIT: I've updated the table to forbid shmat.

> We can go through the whole list in more detail while allow-listing each that we need, these just stuck out to me.

:+1:

fw-immunant commented 1 year ago

For brk/sbrk, this test program seems to indicate that they're benign w/r/t existing mappings:

// Verify that brk()/sbrk() will not stomp on existing mappings.
// Exits with status 0 if this property holds.

#include <unistd.h>
#include <sys/mman.h>
#include <stdio.h>

int main(void) {
    // Size of the mapping and of the sbrk() request, in bytes.
    enum { STEP = 4096 };

    // Determine the current program break. It is held as `char *` because
    // arithmetic on `void *` is a GNU extension, not ISO C.
    char *prog_break = sbrk(0);
    if (prog_break == (char *)-1) {
        perror("sbrk(0)");
        return 1;
    }

    // Place an anonymous mapping STEP bytes above the current break.
    char *target = prog_break + STEP;
    void *alloc = mmap(target, STEP, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED, -1, 0);
    if (alloc != (void *)target) {
        printf("mmap of prog_break + 4096 failed\n");
        return 1;
    }

    // Try to get more memory via sbrk(). On success sbrk() returns the
    // *previous* break, not the new one.
    // NOTE(review): the requested range [prog_break, prog_break + STEP) ends
    // exactly where the mapping begins, so success alone would not prove an
    // overlap -- consider requesting two steps to make the check unambiguous.
    void *old_prog_break = sbrk(STEP);

    // If sbrk() failed, it is presumably because our mapping prevented it.
    if (old_prog_break == (void *)-1) {
        printf("sbrk(4096) failed\n");
        return 0;
    }

    // Otherwise the break moved; report where it went by re-querying it.
    printf("%p -> %p\n", (void *)prog_break, sbrk(0));
    return 1;
}

Output:

$ ./sbrk-test
sbrk(4096) failed
fw-immunant commented 5 months ago

We also need to trace madvise as evinced by the failure of the assertion in this test program:

#define _GNU_SOURCE

#include <assert.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#define PAGE_SIZE 4096

/*
 * Read one byte of this process's memory through /proc/self/mem instead of
 * dereferencing ptr directly (circumvents pkeys; for debugging).
 *
 * ptr: address of the byte to fetch.
 *
 * Returns the byte read, or 0 when /proc/self/mem cannot be opened or read
 * (a diagnostic is printed with perror on an open/pread error).
 */
unsigned char read_proc_self_mem_byte(void* ptr)
{
    unsigned char byte = 0;

    /* Read-only access is enough: this helper never writes through the fd. */
    int fd = open("/proc/self/mem", O_RDONLY);
    if (fd < 0) {
        perror("open(/proc/self/mem)");
        return 0;
    }

    /* The file offset is the virtual address; convert via uintptr_t. */
    ssize_t got = pread(fd, &byte, sizeof(byte), (off_t)(uintptr_t)ptr);
    if (got < 0) {
        perror("pread(/proc/self/mem)");
        byte = 0;
    }

    /* Close on every path so repeated calls do not leak descriptors. */
    close(fd);
    return byte;
}

/*
 * Demonstration: fill a page with 0x5a, protect it with a protection key,
 * call madvise(MADV_DONTNEED) on it, then read the first byte back through
 * /proc/self/mem. Setup failures are reported and end the run with status 1
 * so that a failed assertion can only come from the madvise step.
 */
int main(int argc, char** argv)
{
    (void)argc;
    (void)argv;

    /* allocate memory to protect with pkey */
    void *mem = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0);
    if (mem == MAP_FAILED) {
        perror("mmap");
        return 1;
    }
    memset(mem, 0x5a, PAGE_SIZE);

    /* without this check a failed allocation (-1) would be handed on to
     * pkey_mprotect and the run would not exercise a real key */
    int pkey = pkey_alloc(0 /* reserved */, PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE);
    if (pkey < 0) {
        perror("pkey_alloc");
        return 1;
    }

    /* use pkey to forbid access */
    if (pkey_mprotect(mem, PAGE_SIZE, PROT_NONE, pkey) < 0) {
        perror("pkey_mprotect");
        return 1;
    }

    if (madvise(mem, PAGE_SIZE, MADV_DONTNEED) < 0)
        perror("madvise(MADV_DONTNEED)");

    printf("[0]=%02x\n", read_proc_self_mem_byte(mem));
    /* this assertion fails because madvise(MADV_DONTNEED) bypasses pkeys */
    assert(read_proc_self_mem_byte(mem) == 0x5a);

    return 0;
}