浅谈Linux SECCOMP安全机制在容器中的使用

CNCF

发布于 2021-03-15 17:23:35

7.8K0

文章被收录于专栏：CNCFCNCF

Linux自身安全机制之SECCOMP

SECCOMP的由来

Seccomp是 "secure computing" 的缩写，在Linux内核2.6.12版本（2005年3月8日）中引入。最初引入的目的是把服务器上多余的CPU出借出去，跑一些安全系数低的程序；所以当时只允许4个系统调用：

read，write，_exit，sigreturn

如果调用其它系统API，就会收到 SIGKILL 信号退出。

测试代码：

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/prctl.h>
#include <linux/seccomp.h>

/*
 * Put the caller into seccomp strict mode.
 *
 * Once the request succeeds, only read, write, _exit and sigreturn may be
 * used; any other system call terminates the program with SIGKILL.
 *
 * The function returns nothing. If the kernel rejects the request, the
 * reason is printed to stderr so that the caller is not silently left
 * running without the sandbox.
 */
void configure_seccomp(void) {
    printf("Configuring seccomp\n");
    if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT) == -1) {
        perror("prctl(PR_SET_SECCOMP)");
    }
}

/*
 * Write all `len` bytes of `data` to `fd`, continuing after short writes.
 * Returns 0 on success, -1 if write() fails or makes no progress.
 */
static int write_all(int fd, const char *data, size_t len) {
    while (len > 0) {
        ssize_t written = write(fd, data, len);
        if (written <= 0) return -1;
        data += written;
        len -= (size_t)written;
    }
    return 0;
}

/*
 * Copy the file named by argv[1] to the path named by argv[2].
 *
 * Before copying, the user is asked whether seccomp strict mode should be
 * enabled, so that the effect of the sandbox on the following open() calls
 * can be observed.
 *
 * Returns 0 when the copy succeeded, -1 on a usage error or when opening,
 * reading, writing or closing a file failed.
 */
int main(int argc, char* argv[]) {
    int status = 0;

    if (argc < 3) {
        printf("Usage:\n\t%s <input path> <output_path>\n", argv[0]);
        return -1;
    }

    printf("Starting test seccomp Y/N?");
    /* getchar() returns int so that EOF stays distinguishable from a char. */
    int c = getchar();
    if (c == 'y' || c == 'Y') configure_seccomp();

    printf("Opening '%s' for reading\n", argv[1]);
    /* open() signals failure with -1; descriptor 0 is a valid result. */
    int infd = open(argv[1], O_RDONLY);
    if (infd == -1) {
        perror("open input");
        status = -1;
    } else {
        ssize_t read_bytes;
        char buffer[1024];

        printf("Opening '%s' for writing\n", argv[2]);
        /* O_TRUNC: a shorter source must not leave stale bytes at the end
           of an already existing destination file. */
        int outfd = open(argv[2], O_WRONLY | O_CREAT | O_TRUNC, 0644);
        if (outfd == -1) {
            perror("open output");
            status = -1;
        } else {
            while ((read_bytes = read(infd, buffer, sizeof buffer)) > 0) {
                if (write_all(outfd, buffer, (size_t)read_bytes) == -1) {
                    perror("write");
                    status = -1;
                    break;
                }
            }
            if (read_bytes == -1) {
                perror("read");
                status = -1;
            }
            /* Only close the output descriptor when it was really opened. */
            if (close(outfd) == -1) {
                perror("close output");
                status = -1;
            }
        }
        close(infd);
    }
    printf("End!\n");

    return status;
}

简单的文件复制代码，当seccomp功能打开的时候，代码执行到25行“open(argv[1], O_RDONLY)”时就会退出，如图：

Seccomp升级Seccomp-BPF

直到2012年7月12日Linux 3.5内核版本中，引入seccomp第二种匹配模式：SECCOMP_MODE_FILTER。(以下Seccomp-BPF皆指seccomp的过滤模式)

在该模式下，进程可以指定允许哪些系统调用，而不是像最开始那样被限制在4个系统调用之内。过滤模式使用Berkeley数据包过滤器(Berkeley Packet Filter)来做过滤规则匹配，也就是这里的BPF。使用seccomp-BPF的程序，必须具有CAP_SYS_ADMIN权限；或者通过prctl把no_new_privs位设置成1：

prctl(PR_SET_NO_NEW_PRIVS, 1);

在过滤模式下，使用seccomp功能的程序对任意系统调用及其参数进行过滤匹配，这里需要注意的是匹配参数仅仅只能匹配常数参数，如果是指针类型是不会对指针进行解引用操作，去匹配指针指向的内存。

Seccomp-BPF 使用的也只是BPF的子集功能：

指令集
- Conditional JMP(条件判断跳转)
  - 当匹配条件为真，跳转到true指定位置
  - 当匹配条件为假，跳转到false指定位置
  - 跳转偏移量最大255
- JMP(直接跳转)
  - 跳转目标是指令偏移量
  - 跳转偏移量使用32位的k字段，不受255的限制
- Load(数据读取)
  - 读取程序参数
  - 读取暂存内存(scratch memory，共16个32位槽位)
- Store(数据存储)
  - 保存数据到暂存内存的16个槽位之一
- 支持的运算
  - + - * / & | ^ >> << !
- 返回值
  - SECCOMP_RET_ALLOW - 允许继续使用系统调用
  - SECCOMP_RET_KILL - 不执行该系统调用，并立即终止进程(线程)
  - SECCOMP_RET_ERRNO - 返回设置的errno值
  - SECCOMP_RET_TRACE - 通知附加的ptrace（如果存在）
  - SECCOMP_RET_TRAP - 往进程发送 SIGSYS信号
最多只能有4096条命令
不能出现循环

Seccomp-BPF程序接收以下结构作为输入参数：

// See /usr/include/linux/seccomp.h (the path may differ between Linux versions).
// Input record that a seccomp-BPF program receives for the system call being filtered.
struct seccomp_data {
  int nr ;                    /* System call number */
  __u32 arch ;                /* AUDIT_ARCH_* value identifying the architecture */
  __u64 instruction_pointer ; /* CPU instruction pointer */
  __u64 args [6];             /* System call arguments as raw values; pointers are not dereferenced by the filter */
};

Seccomp-BPF是过滤系统调用。

具体的系统调用的接口API可以通过查看对应系统的声明头文件。

比如我现在用的这台机器是64位的Linux系统，就查看/usr/include/asm/unistd_64.h文件。如果是32位的Linux系统就查看/usr/include/asm/unistd_32.h。

里面去掉前缀"__NR_"就是对应的系统接口API，也就是seccomp可以接管的系统调用。

#ifndef _ASM_X86_UNISTD_64_H
#define _ASM_X86_UNISTD_64_H 1

#define __NR_read 0
#define __NR_write 1
#define __NR_open 2
#define __NR_close 3
#define __NR_stat 4
#define __NR_fstat 5
#define __NR_lstat 6
#define __NR_poll 7
#define __NR_lseek 8
#define __NR_mmap 9
#define __NR_mprotect 10
#define __NR_munmap 11
#define __NR_brk 12
#define __NR_rt_sigaction 13
#define __NR_rt_sigprocmask 14
#define __NR_rt_sigreturn 15
....

Seccomp-BPF 简单应用代码：

#include <errno.h>
#include <linux/audit.h>
#include <linux/filter.h>
#include <linux/seccomp.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/prctl.h>
#include <unistd.h>

/*
 * Install a seccomp-BPF filter that makes a single system call fail.
 *
 * syscall_nr  number of the system call to block
 * f_errno     value the blocked call reports through errno
 *
 * Every other system call is allowed. A call made with an architecture
 * other than x86-64, or with a system call number above 0x3fffffff, kills
 * the process.
 *
 * Returns 0 when the filter was installed; on failure prints the reason
 * with perror() and returns 1.
 */
static int install_filter(int syscall_nr, int f_errno) {
    /* The jump offsets below are relative to the following instruction, so
       the order of the entries must not change. */
    struct sock_filter insns[] = {
        /* [0] accumulator = seccomp_data.arch */
        BPF_STMT(BPF_LD | BPF_W | BPF_ABS, (offsetof(struct seccomp_data, arch))),

        /* [1] x86-64: fall through to [2]; any other architecture: skip 5, landing on [7] */
        BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, AUDIT_ARCH_X86_64, 0, 5),

        /* [2] accumulator = seccomp_data.nr */
        BPF_STMT(BPF_LD | BPF_W | BPF_ABS, (offsetof(struct seccomp_data, nr))),

        /* [3] ABI check, needed on x86-64 for deny-list filters: a number
               greater than 0x40000000 - 1 skips 3, landing on [7]. BPF_JGT
               is used rather than a bit-mask test so that the system call
               number stays in the accumulator. */
        BPF_JUMP(BPF_JMP | BPF_JGT | BPF_K, 0x40000000 - 1, 3, 0),

        /* [4] number equals 'syscall_nr': fall through to [5]; otherwise skip 1, landing on [6] */
        BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, syscall_nr, 0, 1),

        /* [5] blocked call: do not run it, report 'f_errno' through errno */
        BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ERRNO | (f_errno & SECCOMP_RET_DATA)),

        /* [6] any other system call: allow */
        BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),

        /* [7] architecture or ABI mismatch: kill the process */
        BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS),
    };
    struct sock_fprog program;

    program.len = sizeof(insns) / sizeof(insns[0]);
    program.filter = insns;

    if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &program) != -1)
        return 0;

    perror("seccomp");
    return 1;
}

/*
 * Parse `text` as an integer (base auto-detected, as with strtol base 0).
 * On success stores the value in *out and returns 0. On malformed or
 * out-of-range input prints a message naming `what` and returns -1.
 */
static int parse_int_arg(const char *text, const char *what, int *out) {
    char *end;
    long value;

    errno = 0;
    value = strtol(text, &end, 0);
    if (errno != 0 || end == text || *end != '\0' || value != (long)(int)value) {
        fprintf(stderr, "Invalid %s: '%s'\n", what, text);
        return -1;
    }
    *out = (int)value;
    return 0;
}

/*
 * Usage: <syscall_nr> <errno> <prog> [<args>]
 *
 * Sets no_new_privs, installs a filter that makes system call <syscall_nr>
 * fail with <errno>, then replaces this process with <prog>, passing along
 * any further arguments. Exits with EXIT_FAILURE if any step fails; does not
 * return when execv() succeeds.
 */
int main(int argc, char **argv) {
    int syscall_nr;
    int f_errno;

    if (argc < 4) {
        fprintf(stderr, "Usage: %s <syscall_nr> <errno> <prog> [<args>]\n", argv[0]);
        exit(EXIT_FAILURE);
    }

    /* Reject typos up front: an unparsable number would otherwise become 0
       and block system call 0 instead of the intended one. */
    if (parse_int_arg(argv[1], "syscall_nr", &syscall_nr) ||
        parse_int_arg(argv[2], "errno", &f_errno))
        exit(EXIT_FAILURE);

    if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
        perror("prctl");
        exit(EXIT_FAILURE);
    }

    if (install_filter(syscall_nr, f_errno))
        exit(EXIT_FAILURE);

    /* <prog> is the third argument, so the program and its argument vector
       start at argv[3] (argv[argc] is the terminating NULL). */
    execv(argv[3], &argv[3]);
    perror("execv");
    exit(EXIT_FAILURE);
}

注：只匹配x86_64位架构下的系统调用

把想要阻断的系统调用号、要返回的错误码和需要运行的程序作为参数，运行上面的样例。如图，阻断了59号系统调用(64位 Linux对应的是execve系统接口)，并返回指定的错误码。

seccomp与capabilities的区别

一句话总结：seccomp是比capabilities更细粒度的权限限制机制，用来限制进程能够使用的系统内核能力。

如果针对单一容器来说，配置的工作量都差不多。但是如果需要大批量的配置多个相同的容器，seccomp就相对来说容易得多；定义好一份seccomp的配置文件，在多个容器加载的时候，指定该份配置文件就可以省掉单个容器的配置。

capabilities一共限制了39个系统能力：

CAP_AUDIT_CONTROL (since Linux 2.6.11)
CAP_AUDIT_READ (since Linux 3.16)
CAP_AUDIT_WRITE (since Linux 2.6.11)
CAP_BLOCK_SUSPEND (since Linux 3.5)
CAP_BPF (since Linux 5.8)
CAP_CHECKPOINT_RESTORE (since Linux 5.9)
CAP_CHOWN
CAP_DAC_OVERRIDE
CAP_DAC_READ_SEARCH
CAP_FOWNER
CAP_FSETID
CAP_IPC_LOCK
CAP_IPC_OWNER
CAP_KILL
CAP_LEASE (since Linux 2.4)
CAP_LINUX_IMMUTABLE
CAP_MAC_ADMIN (since Linux 2.6.25)
CAP_MAC_OVERRIDE (since Linux 2.6.25)
CAP_MKNOD (since Linux 2.4)
CAP_NET_ADMIN
CAP_NET_BIND_SERVICE
CAP_NET_BROADCAST
CAP_NET_RAW
CAP_PERFMON (since Linux 5.8)
CAP_SETGID
CAP_SETFCAP (since Linux 2.6.24)
CAP_SETPCAP
CAP_SETUID
CAP_SYS_ADMIN
CAP_SYS_BOOT
CAP_SYS_CHROOT
CAP_SYS_MODULE
CAP_SYS_NICE
CAP_SYS_PACCT
CAP_SYS_PTRACE
CAP_SYS_RAWIO
CAP_SYS_RESOURCE
CAP_SYSLOG (since Linux 2.6.37)
CAP_WAKE_ALARM (since Linux 3.0)

Seccomp是对系统接口的限制，也就是系统接口有多少个，Seccomp就能管理多少个。查看上面提到的unistd_64.h头文件，系统调用号最大到427(编号并不连续，例如下面从334直接跳到425，所以实际数量要少一些；不同的Linux版本会有差异)：

#define __NR_statx 332
#define __NR_io_pgetevents 333
#define __NR_rseq 334
#define __NR_io_uring_setup 425
#define __NR_io_uring_enter 426
#define __NR_io_uring_register 427

#endif /* _ASM_X86_UNISTD_64_H */

容器中seccomp的使用

容器中seccomp的使用，本质是对Seccomp-BPF的再封装使用；通过简单的配置文件来达到快速设置多个容器的seccomp安全策略的目的(以下全部以docker为例)。

docker中，通过配置一个profile.json文件来告知容器需要限制的系统 API，比如：

{
    "defaultAction": "SCMP_ACT_ALLOW",
    "syscalls": [
        {
            "name": "mkdir",
            "action": "SCMP_ACT_ERRNO",
            "args": []
        }
    ]
}

在这个配置文件中，默认情况下允许容器执行除“ mkdir”以外的全部系统调用。如图：在容器内执行“ mkdir /home/test”生成新目录失败

而docker默认加载的seccomp配置内容在github上可以查看：https://github.com/moby/moby/blob/master/profiles/seccomp/default.json

配置文件里面禁用了40+的系统调用，允许了300+的系统调用。有点黑白名单的意思。

总结

在容器环境里面有AppArmor、 SElinux、Capability、Seccomp等安全加固技术。从一个攻击者的角度，如果Java/Python等攻击软件已经在容器内，想获取到root权限，那么就需要突破三层防护(JVM/Python->libc->Seccomp-BPF)到达内核获取最高的权限直接root。而 Seccomp-BPF就做为容器的最后一层安全防线。

seccomp作为容器中最后一道安全防御机制，本质是对seccomp-BPF的再封装使用，以最小权限来运行Docker容器，从而避免恶意软件对容器本身的越权行为，把恶意行为限制在容器内，避免扩散。

参考：

https://lwn.net/Articles/656307/

https://www.man7.org/linux/man-pages/man2/seccomp.2.html

https://man7.org/linux/man-pages/man7/capabilities.7.html

https://lwn.net/Articles/120647/

https://www.kernel.org/doc/Documentation/networking/filter.txt

https://man7.org/conf/lpc2015/limiting_kernel_attack_surface_with_seccomp-LPC_2015-Kerrisk.pdf

https://github.com/seccomp/libseccomp

https://docs.docker.com/engine/security/seccomp/

END

关于鲲鹏安全实验室