Intel E810/ICE DPU RDMA 及MLX中断原理分析2(CE/AE)

原创

晓兵

发布于 2025-03-29 21:59:27

3730

文章被收录于专栏：Linux内核Linux内核 DPU

接上文: https://cloud.tencent.com/developer/article/2472554

创建CQ时的comp_vector参数

/*
 * Create a completion queue.
 *
 * comp_vector is the MSI-X completion vector used to signal completion
 * events. Valid values are [0..context->num_comp_vectors). When the IRQ
 * affinity masks spread the MSI-X interrupts over different cores, this
 * parameter can be used to spread completion handling over those cores.
 */
struct ibv_cq *ibv_create_cq(struct ibv_context *context, int cqe,
                             void *cq_context,
                             struct ibv_comp_channel *channel,
                             int comp_vector);


comp_vector: MSI-X 完成向量，用于发出完成事件信号。如果已配置这些中断的 IRQ 关联掩码以将每个 MSI-X 中断分散到不同的核心处理，则可以使用此参数将完成工作负载分散到多个核心。值可以是 [0..context->num_comp_vectors)

E810设置 num_comp_vectors

/*
 * Fill in the generic ib_device fields of an irdma device: run the
 * roce or iw specific init helper, then set the port count, the number of
 * completion vectors, the parent device and the device ops table.
 */
static void irdma_init_rdma_device(struct irdma_device *iwdev)
{
        struct pci_dev *pcidev = iwdev->rf->pcidev;

        if (iwdev->roce_mode)
                irdma_init_roce_device(iwdev);
        else
                irdma_init_iw_device(iwdev);

        iwdev->ibdev.phys_port_cnt = 1;
        /* The number of completion vectors equals the number of CEQs created. */
        iwdev->ibdev.num_comp_vectors = iwdev->rf->ceqs_count;
        iwdev->ibdev.dev.parent = &pcidev->dev;
        ib_set_device_ops(&iwdev->ibdev, &irdma_dev_ops);
}

获取上下文时, 直接返回设备上的num_comp_vectors

/*
 * Handle the uverbs "get context" command.
 *
 * Allocates the ucontext and an async-event uobject, then answers user space
 * with the number of completion vectors and the async event fd. Returns 0 on
 * success or the error code of the step that failed; on failure the objects
 * allocated so far are released again.
 */
static int ib_uverbs_get_context(struct uverbs_attr_bundle *attrs)
{
    struct ib_uverbs_get_context_resp resp;
    struct ib_uverbs_get_context cmd;
    struct ib_device *ib_dev;
    struct ib_uobject *uobj;
    int ret;

    ret = uverbs_request(attrs, &cmd, sizeof(cmd));
    if (ret)
        return ret;

    ret = ib_alloc_ucontext(attrs);
    if (ret)
        return ret;

    uobj = uobj_alloc(UVERBS_OBJECT_ASYNC_EVENT, attrs, &ib_dev);
    if (IS_ERR(uobj)) {
        ret = PTR_ERR(uobj);
        goto err_ucontext;
    }

    resp = (struct ib_uverbs_get_context_resp){
        /* Report the num_comp_vectors value stored on the uverbs device as is. */
        .num_comp_vectors = attrs->ufile->device->num_comp_vectors,
        .async_fd = uobj->id,
    };
    ret = uverbs_response(attrs, &resp, sizeof(resp));
    if (ret)
        goto err_uobj;

    ret = ib_init_ucontext(attrs);
    if (ret)
        goto err_uobj;

    ib_uverbs_init_async_event_file(
        container_of(uobj, struct ib_uverbs_async_event_file, uobj));
    rdma_alloc_commit_uobject(uobj, attrs);
    return 0;

err_uobj:
    /* Undo the async-event uobject allocation, then fall through. */
    rdma_alloc_abort_uobject(uobj, attrs, false);
err_ucontext:
    /* Drop the ucontext allocated by ib_alloc_ucontext(). */
    rdma_restrack_put(&attrs->context->res);
    kfree(attrs->context);
    attrs->context = NULL;
    return ret;
}


/*
 * Create the uverbs character device for a newly registered ib_device.
 *
 * Returns -EOPNOTSUPP when the device has no alloc_ucontext op or is of type
 * RDMA_DEVICE_TYPE_SMI, -ENOMEM on allocation failures, the error of the
 * failing step otherwise, and 0 on success.
 */
static int ib_uverbs_add_one(struct ib_device *device)
{
    int devnum;
    dev_t base;
    struct ib_uverbs_device *uverbs_dev;
    int ret;

    if (!device->ops.alloc_ucontext ||
        device->type == RDMA_DEVICE_TYPE_SMI)
        return -EOPNOTSUPP;

    uverbs_dev = kzalloc(sizeof(*uverbs_dev), GFP_KERNEL);
    if (!uverbs_dev)
        return -ENOMEM;

    ret = init_srcu_struct(&uverbs_dev->disassociate_srcu);
    if (ret) {
        kfree(uverbs_dev);
        return -ENOMEM;
    }

    device_initialize(&uverbs_dev->dev);
    uverbs_dev->dev.class = &uverbs_class;
    uverbs_dev->dev.parent = device->dev.parent;
    uverbs_dev->dev.release = ib_uverbs_release_dev;
    uverbs_dev->groups[0] = &dev_attr_group;
    uverbs_dev->dev.groups = uverbs_dev->groups;
    refcount_set(&uverbs_dev->refcount, 1);
    init_completion(&uverbs_dev->comp);
    uverbs_dev->xrcd_tree = RB_ROOT;
    mutex_init(&uverbs_dev->xrcd_tree_mutex);
    mutex_init(&uverbs_dev->lists_mutex);
    INIT_LIST_HEAD(&uverbs_dev->uverbs_file_list);
    rcu_assign_pointer(uverbs_dev->ib_dev, device);
    /* Copy the ib_device's num_comp_vectors onto the uverbs device. */
    uverbs_dev->num_comp_vectors = device->num_comp_vectors;

    devnum = ida_alloc_max(&uverbs_ida, IB_UVERBS_MAX_DEVICES - 1,
                   GFP_KERNEL);
    if (devnum < 0) {
        ret = -ENOMEM;
        goto err;
    }
    uverbs_dev->devnum = devnum;
    /* Device numbers beyond the fixed minor range use the dynamic base. */
    if (devnum >= IB_UVERBS_NUM_FIXED_MINOR)
        base = dynamic_uverbs_dev + devnum - IB_UVERBS_NUM_FIXED_MINOR;
    else
        base = IB_UVERBS_BASE_DEV + devnum;

    ret = ib_uverbs_create_uapi(device, uverbs_dev);
    if (ret)
        goto err_uapi;

    uverbs_dev->dev.devt = base;
    dev_set_name(&uverbs_dev->dev, "uverbs%d", uverbs_dev->devnum);

    /* Devices providing an mmap op get the mmap-capable file operations. */
    cdev_init(&uverbs_dev->cdev,
          device->ops.mmap ? &uverbs_mmap_fops : &uverbs_fops);
    uverbs_dev->cdev.owner = THIS_MODULE;

    ret = cdev_device_add(&uverbs_dev->cdev, &uverbs_dev->dev);
    if (ret)
        goto err_uapi;

    ib_set_client_data(device, &uverbs_client, uverbs_dev);
    return 0;

err_uapi:
    /* Give the device number back before dropping the device itself. */
    ida_free(&uverbs_ida, devnum);
err:
    /* Drop the initial reference, wait for the completion, release the device. */
    if (refcount_dec_and_test(&uverbs_dev->refcount))
        ib_uverbs_comp_dev(uverbs_dev);
    wait_for_completion(&uverbs_dev->comp);
    put_device(&uverbs_dev->dev);
    return ret;
}

irdma_sc_dev中定义了记录ceq的数组

/*
 * Per-device state of the irdma "sc" layer: doorbell and register pointers,
 * hardware attributes and the CQP/AEQ/CEQ/CCQ objects owned by the device.
 */
struct irdma_sc_dev {
    struct list_head cqp_cmd_head; /* head of the CQP command list */
    spinlock_t cqp_lock; /* protect CQP list access */
    bool stats_idx_array[IRDMA_MAX_STATS_COUNT_GEN_1];
    struct irdma_dma_mem vf_fpm_query_buf[IRDMA_MAX_PE_ENA_VF_COUNT];
    u64 fpm_query_buf_pa;
    u64 fpm_commit_buf_pa;
    __le64 *fpm_query_buf;
    __le64 *fpm_commit_buf;
    struct irdma_hw *hw;
    u8 __iomem *db_addr;
    u32 __iomem *wqe_alloc_db;
    u32 __iomem *cq_arm_db;
    u32 __iomem *aeq_alloc_db;
    u32 __iomem *cqp_db;
    u32 __iomem *cq_ack_db;
    u32 __iomem *ceq_itr_mask_db;
    u32 __iomem *aeq_itr_mask_db;
    u32 __iomem *hw_regs[IRDMA_MAX_REGS];
    u32 ceq_itr;   /* Interrupt throttle, usecs between interrupts: 0 disabled. 2 - 8160 */
    u64 hw_masks[IRDMA_MAX_MASKS];
    u64 hw_shifts[IRDMA_MAX_SHIFTS];
    const struct irdma_hw_stat_map *hw_stats_map;
    u64 hw_stats_regs[IRDMA_HW_STAT_INDEX_MAX_GEN_1];
    u64 feature_info[IRDMA_MAX_FEATURES];
    u64 cqp_cmd_stats[IRDMA_MAX_CQP_OPS];
    struct irdma_hw_attrs hw_attrs;
    struct irdma_hmc_info *hmc_info;
    struct irdma_sc_cqp *cqp;
    struct irdma_sc_aeq *aeq;
    /* CEQ array; per the article IRDMA_CEQ_MAX_COUNT = 1024 (TODO confirm) */
    struct irdma_sc_ceq *ceq[IRDMA_CEQ_MAX_COUNT];
    struct irdma_sc_cq *ccq;
    const struct irdma_irq_ops *irq_ops;
    struct irdma_hmc_fpm_misc hmc_fpm_misc;
    struct irdma_ws_node *ws_tree_root;
    struct mutex ws_mutex; /* ws tree mutex */
    u16 num_vfs;
    u8 hmc_fn_id;
    u8 vf_id;
    bool vchnl_up:1;
    bool ceq_valid:1;
    u8 pci_rev;
    int (*ws_add)(struct irdma_sc_vsi *vsi, u8 user_pri);
    void (*ws_remove)(struct irdma_sc_vsi *vsi, u8 user_pri);
    void (*ws_reset)(struct irdma_sc_vsi *vsi);
};

创建CEQ时递增RF上的CEQ计数器(ceqs_count)

/*
 * Create the CEQs with id >= 1 and bind each one to an MSI-X vector.
 *
 * Every successfully configured CEQ bumps rf->ceqs_count. On any failure the
 * CEQs are torn down through irdma_del_ceqs() and the error is returned;
 * otherwise 0 is returned.
 */
static int irdma_setup_ceqs(struct irdma_pci_f *rf, struct irdma_sc_vsi *vsi)
{
    u32 i;
    u32 ceq_id;
    struct irdma_ceq *iwceq;
    struct irdma_msix_vector *msix_vec;
    int status;
    u32 num_ceqs;

    /*
     * The CEQ count is the smaller of the MSI-X vector count and the maximum
     * CEQ count recorded in hmc_fpm_misc.
     */
    num_ceqs = min(rf->msix_count, rf->sc_dev.hmc_fpm_misc.max_ceqs);
    /* i indexes iw_msixtbl: start at 1 when MSI-X is shared, else at 2. */
    i = (rf->msix_shared) ? 1 : 2;
    /* ceq_id increases by one per iteration, starting at 1. */
    for (ceq_id = 1; i < num_ceqs; i++, ceq_id++) {
        /*
         * Slot in rf->ceqlist; per the article this list is what gets walked
         * at processing time, see irdma_process_ceq(rf, rf->ceqlist).
         */
        iwceq = &rf->ceqlist[ceq_id];
        status = irdma_create_ceq(rf, iwceq, ceq_id, vsi);
        if (status) {
            ibdev_dbg(&rf->iwdev->ibdev,
                  "ERR: create ceq status = %d\n", status);
            goto del_ceqs;
        }
        spin_lock_init(&iwceq->ce_lock);
        msix_vec = &rf->iw_msixtbl[i];
        iwceq->irq = msix_vec->irq;
        iwceq->msix_idx = msix_vec->idx;
        status = irdma_cfg_ceq_vector(rf, iwceq, ceq_id, msix_vec);
        if (status) {
            /* This CEQ is not counted yet, so destroy it here explicitly. */
            irdma_destroy_ceq(rf, iwceq);
            goto del_ceqs;
        }
        irdma_ena_intr(&rf->sc_dev, msix_vec->idx);
        rf->ceqs_count++; /* one more CEQ is live */
    }

    return 0;

del_ceqs:
    irdma_del_ceqs(rf);

    return status;
}

删除所有CEQ(ceq0除外)时, CEQ计数器设置为1

/*
 * Tear down every CEQ except CEQ 0.
 *
 * For each CEQ counted in rf->ceqs_count beyond the first, the CEQ/vector
 * configuration is turned off, the IRQ is destroyed, a CEQ destroy command is
 * issued and the CEQ's DMA memory is freed. The counter ends up at 1.
 */
static void irdma_del_ceqs(struct irdma_pci_f *rf)
{
    struct irdma_ceq *iwceq = &rf->ceqlist[1];
    struct irdma_msix_vector *msix_vec;
    u32 i = 0;

    /* First vector used by CEQs >= 1: index 1 when MSI-X is shared, else 2. */
    if (rf->msix_shared)
        msix_vec = &rf->iw_msixtbl[1];
    else
        msix_vec = &rf->iw_msixtbl[2];

    for (i = 1; i < rf->ceqs_count; i++, msix_vec++, iwceq++) {
        rf->sc_dev.irq_ops->irdma_cfg_ceq(&rf->sc_dev, msix_vec->ceq_id,
                          msix_vec->idx, false);
        irdma_destroy_irq(rf, msix_vec, iwceq);
        irdma_cqp_ceq_cmd(&rf->sc_dev, &iwceq->sc_ceq,
                  IRDMA_OP_CEQ_DESTROY);
        dma_free_coherent(rf->sc_dev.hw->device, iwceq->mem.size,
                  iwceq->mem.va, iwceq->mem.pa);
        iwceq->mem.va = NULL;
    }
    /* Only CEQ 0 is left, so the counter is set to 1. */
    rf->ceqs_count = 1;
}

删除ceq0时, 将CEQ计数器设置为0

/*
 * Tear down CEQ 0 (the first entry of rf->ceqlist), mark the device's CEQ as
 * invalid and reset the CEQ counter to 0.
 */
static void irdma_del_ceq_0(struct irdma_pci_f *rf)
{
    struct irdma_ceq *iwceq = rf->ceqlist;
    struct irdma_msix_vector *msix_vec;

    if (rf->msix_shared) {
        /* Shared case: vector 0 is used and the IRQ is destroyed with rf as its argument. */
        msix_vec = &rf->iw_msixtbl[0];
        rf->sc_dev.irq_ops->irdma_cfg_ceq(&rf->sc_dev,
                          msix_vec->ceq_id,
                          msix_vec->idx, false);
        irdma_destroy_irq(rf, msix_vec, rf);
    } else {
        /* Non-shared case: vector 1 is used and the IRQ is destroyed with the CEQ. */
        msix_vec = &rf->iw_msixtbl[1];
        irdma_destroy_irq(rf, msix_vec, iwceq);
    }

    irdma_destroy_ceq(rf, iwceq);
    rf->sc_dev.ceq_valid = false;
    /* With CEQ 0 gone, the CEQ counter drops to 0. */
    rf->ceqs_count = 0;
}

创建CQ时设置ceq_id为comp_vector

irdma_create_cq
    ...
    if (attr->comp_vector < rf->ceqs_count)
        info.ceq_id = attr->comp_vector; // 创建CQ时设置ceq_id为comp_vector
    ...
        
/*
 * Initialize a CQ object from the caller-supplied init info.
 *
 * Returns -EINVAL when the CQ is virtually mapped and its first PBL index is
 * not below the PBLE object count; returns 0 otherwise.
 */
int irdma_sc_cq_init(struct irdma_sc_cq *cq, struct irdma_cq_init_info *info)
{
    u32 pble_limit = info->dev->hmc_info->hmc_obj[IRDMA_HMC_IW_PBLE].cnt;

    if (info->virtual_map && info->first_pm_pbl_idx >= pble_limit)
        return -EINVAL;

    /* Device, memory location and the CEQ this CQ reports to. */
    cq->dev = info->dev;
    cq->cq_pa = info->cq_base_pa;
    cq->ceq_id = info->ceq_id;

    /* Hand the device doorbells to the user-kernel CQ part before its init. */
    info->cq_uk_init_info.cqe_alloc_db = info->dev->cq_arm_db;
    info->cq_uk_init_info.cq_ack_db = info->dev->cq_ack_db;
    irdma_uk_cq_init(&cq->cq_uk, &info->cq_uk_init_info);

    /* A zero type in the init info falls back to the iWARP CQ type. */
    if (info->type)
        cq->cq_type = info->type;
    else
        cq->cq_type = IRDMA_CQ_TYPE_IWARP;

    /* Remaining attributes are copied over verbatim. */
    cq->vsi = info->vsi;
    cq->virtual_map = info->virtual_map;
    cq->first_pm_pbl_idx = info->first_pm_pbl_idx;
    cq->pbl_chunk_size = info->pbl_chunk_size;
    cq->shadow_area_pa = info->shadow_area_pa;
    cq->shadow_read_threshold = info->shadow_read_threshold;
    cq->ceqe_mask = info->ceqe_mask;
    cq->ceq_id_valid = info->ceq_id_valid;
    cq->tph_en = info->tph_en;
    cq->tph_val = info->tph_val;

    return 0;
}

ceq上记录ceq_id

/* State of one completion event queue (CEQ). */
struct irdma_sc_ceq {
    u32 size;
    u64 ceq_elem_pa;
    struct irdma_sc_dev *dev;
    struct irdma_ceqe *ceqe_base;
    void *pbl_list;
    u32 ceq_id; /* identifier of this CEQ */
    u32 elem_cnt;
    struct irdma_ring ceq_ring;
    u8 pbl_chunk_size;
    u8 tph_val;
    u32 first_pm_pbl_idx;
    u8 polarity;
    struct irdma_sc_vsi *vsi;
    struct irdma_sc_cq **reg_cq;
    u32 reg_cq_size;
    spinlock_t req_cq_lock; /* protect access to reg_cq array */
    bool virtual_map:1;
    bool tph_en:1;
    bool itr_no_expire:1;
};

初始化CEQ时将ceq记录到dev的ceq数组中

/*
 * Create the CEQs with id >= 1, one per remaining MSI-X vector, and count
 * them in rf->ceqs_count. Returns 0 on success; on failure the CEQs are
 * removed through irdma_del_ceqs() and the error status is returned.
 */
static int irdma_setup_ceqs(struct irdma_pci_f *rf, struct irdma_sc_vsi *vsi)
{
    u32 i;
    u32 ceq_id;
    struct irdma_ceq *iwceq;
    struct irdma_msix_vector *msix_vec;
    int status;
    u32 num_ceqs;

    num_ceqs = min(rf->msix_count, rf->sc_dev.hmc_fpm_misc.max_ceqs);
    i = (rf->msix_shared) ? 1 : 2;
    for (ceq_id = 1; i < num_ceqs; i++, ceq_id++) {
        iwceq = &rf->ceqlist[ceq_id];
        /* ceq_id starts at 1 and is incremented for every CEQ created. */
        status = irdma_create_ceq(rf, iwceq, ceq_id, vsi);
        if (status) {
            ibdev_dbg(&rf->iwdev->ibdev,
                  "ERR: create ceq status = %d\n", status);
            goto del_ceqs;
        }
        spin_lock_init(&iwceq->ce_lock);
        msix_vec = &rf->iw_msixtbl[i];
        iwceq->irq = msix_vec->irq;
        iwceq->msix_idx = msix_vec->idx;
        status = irdma_cfg_ceq_vector(rf, iwceq, ceq_id, msix_vec);
        if (status) {
            irdma_destroy_ceq(rf, iwceq);
            goto del_ceqs;
        }
        irdma_ena_intr(&rf->sc_dev, msix_vec->idx);
        rf->ceqs_count++;
    }

    return 0;

del_ceqs:
    irdma_del_ceqs(rf);

    return status;
}


/*
 * Initialize a CEQ object from the init info and register it on the device.
 *
 * Returns -EINVAL when the element count is outside the hardware's CEQ size
 * limits, when ceq_id is not below max_ceqs, or when a virtually mapped CEQ
 * has a first PBL index that is not below the PBLE object count. Returns 0
 * otherwise.
 */
int irdma_sc_ceq_init(struct irdma_sc_ceq *ceq,
              struct irdma_ceq_init_info *info)
{
    u32 pble_obj_cnt;

    if (info->elem_cnt < info->dev->hw_attrs.min_hw_ceq_size ||
        info->elem_cnt > info->dev->hw_attrs.max_hw_ceq_size)
        return -EINVAL;

    if (info->ceq_id >= info->dev->hmc_fpm_misc.max_ceqs)
        return -EINVAL;
    pble_obj_cnt = info->dev->hmc_info->hmc_obj[IRDMA_HMC_IW_PBLE].cnt;

    if (info->virtual_map && info->first_pm_pbl_idx >= pble_obj_cnt)
        return -EINVAL;

    ceq->size = sizeof(*ceq);
    ceq->ceqe_base = (struct irdma_ceqe *)info->ceqe_base;
    ceq->ceq_id = info->ceq_id;
    ceq->dev = info->dev;
    ceq->elem_cnt = info->elem_cnt;
    ceq->ceq_elem_pa = info->ceqe_pa;
    ceq->virtual_map = info->virtual_map;
    ceq->itr_no_expire = info->itr_no_expire;
    ceq->reg_cq = info->reg_cq;
    ceq->reg_cq_size = 0;
    spin_lock_init(&ceq->req_cq_lock);
    /* The PBL related fields are only taken over for a virtually mapped CEQ. */
    ceq->pbl_chunk_size = (ceq->virtual_map ? info->pbl_chunk_size : 0);
    ceq->first_pm_pbl_idx = (ceq->virtual_map ? info->first_pm_pbl_idx : 0);
    ceq->pbl_list = (ceq->virtual_map ? info->pbl_list : NULL);
    ceq->tph_en = info->tph_en;
    ceq->tph_val = info->tph_val;
    ceq->vsi = info->vsi;
    ceq->polarity = 1;
    IRDMA_RING_INIT(ceq->ceq_ring, ceq->elem_cnt);
    /* Record the CEQ in the device's ceq array, using ceq_id as the index. */
    ceq->dev->ceq[info->ceq_id] = ceq;

    return 0;
}

销毁CQ时清理CEQE

/*
 * Destroy a CQ: flush the generated-completion and resize lists, drop the
 * reference and wait until the CQ is free, destroy the CQ work queue, clean
 * up the CEQ entries that refer to the CQ and free its resources.
 * Always returns 0.
 */
static int irdma_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
{
    struct irdma_device *iwdev = to_iwdev(ib_cq->device);
    struct irdma_cq *iwcq = to_iwcq(ib_cq);
    struct irdma_sc_cq *cq = &iwcq->sc_cq;
    struct irdma_sc_dev *dev = cq->dev;
    /* Look up the CEQ in the device's ceq array through the CQ's ceq_id. */
    struct irdma_sc_ceq *ceq = dev->ceq[cq->ceq_id];
    struct irdma_ceq *iwceq = container_of(ceq, struct irdma_ceq, sc_ceq);
    unsigned long flags;

    spin_lock_irqsave(&iwcq->lock, flags);
    if (!list_empty(&iwcq->cmpl_generated))
        irdma_remove_cmpls_list(iwcq);
    if (!list_empty(&iwcq->resize_list))
        irdma_process_resize_list(iwcq, iwdev, NULL);
    spin_unlock_irqrestore(&iwcq->lock, flags);

    irdma_cq_rem_ref(ib_cq);
    wait_for_completion(&iwcq->free_cq);

    irdma_cq_wq_destroy(iwdev->rf, cq);

    /* The CEQE cleanup runs under the CEQ's ce_lock. */
    spin_lock_irqsave(&iwceq->ce_lock, flags);
    irdma_sc_cleanup_ceqes(cq, ceq);
    spin_unlock_irqrestore(&iwceq->ce_lock, flags);
    irdma_cq_free_rsrc(iwdev->rf, iwcq);

    return 0;
}

rdma-core实现

ibv_get_cq_event

接口定义:
/**
 * ibv_get_cq_event - Read next CQ event
 * @channel: Channel to get next event from.
 * @cq: Used to return pointer to CQ.
 * @cq_context: Used to return consumer-supplied CQ context.
 *
 * All completion events returned by ibv_get_cq_event() must
 * eventually be acknowledged with ibv_ack_cq_events().
 *
 * Returns 0 on success and -1 when a complete event could not be read
 * from the channel.
 */
int ibv_get_cq_event(struct ibv_comp_channel *channel,
             struct ibv_cq **cq, void **cq_context);

实现:
/*
 * Read one completion event descriptor from the channel's fd, hand the CQ and
 * its consumer context back to the caller and run the provider's cq_event
 * hook. Returns 0 on success, -1 when a full descriptor could not be read.
 */
LATEST_SYMVER_FUNC(ibv_get_cq_event, 1_1, "IBVERBS_1.1",
           int,
           struct ibv_comp_channel *channel,
           struct ibv_cq **cq, void **cq_context)
{
    struct ib_uverbs_comp_event_desc desc;
    struct ibv_cq *event_cq;
    ssize_t nread;

    /* Anything other than one whole descriptor counts as a failure. */
    nread = read(channel->fd, &desc, sizeof(desc));
    if (nread != sizeof(desc))
        return -1;

    /* The descriptor carries the CQ pointer as an integer handle. */
    event_cq = (struct ibv_cq *)(uintptr_t)desc.cq_handle;

    *cq = event_cq;
    *cq_context = event_cq->cq_context;

    get_ops(event_cq->context)->cq_event(event_cq);

    return 0;
}

perftest

v2.0提交记录

https://github.com/linux-rdma/perftest/commit/d66d7888ec4513ca0fd826216c64e20129d0bef8

All tests use shared resources (functions and structs): 所有测试共享资源(以下方法和结构体)

* alloc_ctx * ctx_set_send_wqes * ctx_set_recv_wqes * run_iter_bw * run_iter_bw_server

All BW tests support multi QPs execution (-q flag).
All BW tests support Dual port mode (-O flag)
All BW tests support Post list WQEs feature (-l flag).
All BW tests support Duration (instead of iterations) mode.(所有 BW 测试都支持持续时间（而不是迭代）模式)
bug fixes in all tests.

Still known issues:

RDMA CM QPs in read and atomic benchmarks.
RDMA CM UD and UC SEND mode.
SEND BW bidirectional mode with UD/UC.

发送和写立即数需要创建接收CQ

/*
 * Decide the CQ depth and whether a separate receive CQ is required, then
 * create the CQs through create_reg_cqs() and return its result.
 */
int create_cqs(struct pingpong_context *ctx, struct perftest_parameters *user_param)
{
    const int is_dc = (user_param->connection_type == DC);
    int server_only_dct = 0;
    int recv_cq_wanted = 0;
    int cq_depth;

    /* A DC server that is neither duplex nor a latency test is DCT only. */
    if (is_dc && user_param->machine == SERVER &&
        !(user_param->duplex || user_param->tst == LAT))
        server_only_dct = 1;

    /* The DCT-only side sizes the CQ by rx depth, everyone else by tx depth. */
    if (server_only_dct)
        cq_depth = user_param->rx_depth;
    else
        cq_depth = user_param->tx_depth;

    /* SEND and WRITE_IMM need a receive CQ, as does DC that is not DCT only. */
    if (is_dc && !server_only_dct)
        recv_cq_wanted = 1;
    else if (user_param->verb == SEND || user_param->verb == WRITE_IMM)
        recv_cq_wanted = 1;

    return create_reg_cqs(ctx, user_param, cq_depth, recv_cq_wanted);
}

创建QP的时候决定RECV_CQ是共用SEND_CQ或者之前单独创建的RECV_CQ

struct ibv_qp* ctx_qp_create(struct pingpong_context *ctx,
        struct perftest_parameters *user_param, int qp_index)
{
    struct ibv_qp* qp = NULL;
    int dc_num_of_qps = user_param->num_of_qps / 2;

    int is_dc_server_side = 0;
    struct ibv_qp_init_attr attr;
    memset(&attr, 0, sizeof(struct ibv_qp_init_attr));
    struct ibv_qp_cap *qp_cap = &attr.cap;

    #ifdef HAVE_IBV_WR_API
    enum ibv_wr_opcode opcode;
    struct ibv_qp_init_attr_ex attr_ex;
    memset(&attr_ex, 0, sizeof(struct ibv_qp_init_attr_ex));
    #ifdef HAVE_MLX5DV
    struct mlx5dv_qp_init_attr attr_dv;
    memset(&attr_dv, 0, sizeof(attr_dv));
    #ifdef HAVE_OOO_RECV_WRS
    struct mlx5dv_context ctx_dv;
    memset(&ctx_dv, 0, sizeof(ctx_dv));
    #endif
    #endif
    #ifdef HAVE_SRD
    struct efadv_qp_init_attr efa_attr = {};
    #endif
    #endif
    #ifdef HAVE_HNSDV
    struct hnsdv_qp_init_attr hns_attr = {};
    #endif

    attr.send_cq = ctx->send_cq;
    attr.recv_cq = (user_param->verb == SEND || user_param->verb == WRITE_IMM) ? ctx->recv_cq : ctx->send_cq; // 创建QP的时候决定RECV_CQ是共用SEND_CQ或者之前单独创建的RECV_CQ

    is_dc_server_side = ((!(user_param->duplex || user_param->tst == LAT) &&
                          (user_param->machine == SERVER)) ||
                         ((user_param->duplex || user_param->tst == LAT) &&
                          (qp_index >= dc_num_of_qps)));

中断转忙轮询设计


run_iter_bw
    ...
if (totccnt < tot_iters || (user_param->test_type == DURATION &&  totccnt < totscnt)) {
                /* Make sure all completions from previous event were polled before waiting for another */
                if (user_param->use_event && ne == 0) {
                    fprintf(stdout, "Client: ctx_notify_events send_channel %s(), %s:%d\n", __FUNCTION__, __FILE__, __LINE__);
                    if (ctx_notify_events(ctx->send_channel)) {
                        fprintf(stderr, "Couldn't request CQ notification\n");
                        return_value = FAILURE;
                        goto cleaning;
                    }
                }
                ne = ibv_poll_cq(ctx->send_cq, CTX_POLL_BATCH, wc);
                if (ne > 0) {
                    for (i = 0; i < ne; i++) {
                        wc_id = (int)wc[i].wr_id;

                        if (wc[i].status != IBV_WC_SUCCESS) {
                            NOTIFY_COMP_ERROR_SEND(wc[i],totscnt,totccnt);
                            return_value = FAILURE;
                            goto cleaning;
                        }
                        int fill = user_param->cq_mod;
                        if (user_param->fill_count && ctx->ccnt[wc_id] + user_param->cq_mod > user_param->iters) {
                            fill = user_param->iters - ctx->ccnt[wc_id];
                        }
                        ctx->ccnt[wc_id] += fill;
                        totccnt += fill;

                        if (user_param->noPeak == OFF) {
                            if (totccnt > tot_iters)
                                user_param->tcompleted[user_param->iters*num_of_qps - 1] = get_cycles();
                            else
                                user_param->tcompleted[totccnt-1] = get_cycles();
                        }

在设置发送WQE中设置发送标记(决定是否生成CQE)

启动参数中设置cq_mod(参数Q)

case 'Q': CHECK_VALUE(user_param->cq_mod,int,MIN_CQ_MOD,MAX_CQ_MOD,"CQ moderation");
      user_param->req_cq_mod = 1;
      break;

在ctx_set_send_reg_wqes中根据cq_mod设置发送标记

void ctx_set_send_reg_wqes(struct pingpong_context *ctx,
        struct perftest_parameters *user_param,
        struct pingpong_dest *rem_dest)
{
    int i,j;
    ...
    if ((j + 1) % user_param->cq_mod == 0) { // 已发送个数+1个如果是cq_mod的倍数则设置当前WQE需要产生CQE标记位
        ctx->wr[i*user_param->post_list + j].send_flags = IBV_SEND_SIGNALED;
    } else {
        ctx->wr[i*user_param->post_list + j].send_flags = 0;
    }
    ...
        
最新开源实现:    
if (j == (user_param->post_list - 1)) {
    ctx->wr[i*user_param->post_list + j].next = NULL;
} else {
    ctx->wr[i*user_param->post_list + j].next = &ctx->wr[i*user_param->post_list+j+1];
}
if ((j + 1) % user_param->cq_mod == 0) {
    ctx->wr[i*user_param->post_list + j].send_flags = IBV_SEND_SIGNALED;
    #ifdef HAVE_IBV_WR_API
    ctx->qpx[i]->wr_flags = IBV_SEND_SIGNALED;
    #endif
} else {
...

如果没有设置req_cq_mod, 则设置其为post_list大小(post_list大小必须为cq_mod的倍数)

/*
 * Reconcile CQ moderation with post-list mode. This only applies when more
 * than one WQE is posted per list.
 */
if (user_param->post_list > 1) {
    if (!user_param->req_cq_mod) {
        /* No explicit CQ moderation was requested: use the post list size. */
        user_param->cq_mod = user_param->post_list;
        printf(RESULT_LINE);
        printf("Post List requested - CQ moderation will be the size of the post list\n");
    } else if ((user_param->post_list % user_param->cq_mod) != 0) {
        /* An explicit cq_mod must divide the post list size evenly. */
        printf(RESULT_LINE);
        fprintf(stderr, " Post list size must be a multiple of CQ moderation\n");
        exit(1);
    }
}

提交记录

https://github.com/linux-rdma/perftest/commit/56d025e4f19a6ebe0aaf45bbc6abf2186aa85dfb

允许在发送列表(post_list)模式下覆盖 CQ 调节(moderation)(控制CQE生成) (https://github.com/linux-rdma/perftest/pull/58) * 使用发送列表(post_list)和 cq mod 修复报告的峰值 BW 如果启用了发送列表(post_list)模式，则每个 post_list 迭代仅获取一次“已发布”时间戳。同样，如果启用了 CQ 调节(控制CQE生成)，则每个 cq_mod 迭代仅获取一次完成时间戳。通过考虑此问题并跳过 tposted 和 tcompleted 数组中的空字段来修复报告的峰值 BW。还修复了填充 tcompleted 数组时的错误。签名人：Firas Jahjah * 允许在发送列表(post_list)模式下覆盖 CQ 调节(控制CQE生成) 发布 WQE 列表时，允许指定 CQ 调节(控制CQE生成)而不是覆盖它。为此，每 X WQE 请求一次 SIGNAL，并修复 BW 测试以支持 cq_mod != post_list 的情况

显式生成CQE(explicit)

explicit CQE (where #tot_iterations % cq_mod != 0).

报告带宽统计时根据cq_mod处理

if (user_param->noPeak == OFF) {
    /* Find the peak bandwidth unless asked not to in command line */
    /*
     * Posted timestamps are sampled at post_list granularity (step of i) and
     * completion timestamps at cq_mod granularity (step of j). t is the time
     * per iteration between post i and completion j; opt_delta keeps the
     * smallest one seen.
     */
    for (i = 0; i < num_of_calculated_iters * num_of_qps; i += user_param->post_list) {
        /* NOTE(review): assumes ROUND_UP rounds up to a multiple of cq_mod - confirm. */
        for (j = ROUND_UP(i + 1, user_param->cq_mod) - 1; j < num_of_calculated_iters * num_of_qps;
                j += user_param->cq_mod) {
            t = (user_param->tcompleted[j] - user_param->tposted[i]) / (j - i + 1);
            if (t < opt_delta)
                opt_delta  = t;
        }
        /* Handle case where CQE was explicitly signaled on last iteration. */
        if ((num_of_calculated_iters * num_of_qps) % user_param->cq_mod) {
            j = num_of_calculated_iters * num_of_qps - 1;
            t = (user_param->tcompleted[j] - user_param->tposted[i]) / (j - i + 1);
            if (t < opt_delta)
                opt_delta  = t;
        }
    }
}

linux

SMC协议中会使用内核中断模式

/*
 * Bring the link's QP up: move it to INIT and RTR, arm the receive CQ, post
 * the initial receive work requests and, for the server role, move the QP to
 * RTS. Returns 0 on success or the error code of the first failing step.
 */
int smc_ib_ready_link(struct smc_link *lnk)
{
    struct smc_link_group *lgr = smc_get_lgr(lnk);
    int rc = 0;

    rc = smc_ib_modify_qp_init(lnk);
    if (rc)
        goto out;

    rc = smc_ib_modify_qp_rtr(lnk);
    if (rc)
        goto out;
    smc_wr_remember_qp_attr(lnk);
    /*
     * Arm the receive CQ from the kernel so that the HW reports CQE
     * generation in interrupt mode.
     */
    rc = ib_req_notify_cq(lnk->smcibdev->roce_cq_recv,
                  IB_CQ_SOLICITED_MASK);
    if (rc)
        goto out;
    rc = smc_wr_rx_post_init(lnk);
    if (rc)
        goto out;
    smc_wr_remember_qp_attr(lnk);

    /* Only the server side moves the QP to RTS here. */
    if (lgr->role == SMC_SERV) {
        rc = smc_ib_modify_qp_rts(lnk);
        if (rc)
            goto out;
        smc_wr_remember_qp_attr(lnk);
    }
out:
    return rc;
}

阿里erdma设备回调函数

/* Verbs callback table of the erdma device. */
static const struct ib_device_ops erdma_device_ops = {
    .owner = THIS_MODULE,
    .driver_id = RDMA_DRIVER_ERDMA,
    .uverbs_abi_ver = ERDMA_ABI_VERSION,

    .alloc_hw_port_stats = erdma_alloc_hw_port_stats,
    .alloc_mr = erdma_ib_alloc_mr,
    .alloc_pd = erdma_alloc_pd,
    .alloc_ucontext = erdma_alloc_ucontext,
    .create_cq = erdma_create_cq,
    .create_qp = erdma_create_qp,
    .dealloc_pd = erdma_dealloc_pd,
    .dealloc_ucontext = erdma_dealloc_ucontext,
    .dereg_mr = erdma_dereg_mr,
    .destroy_cq = erdma_destroy_cq,
    .destroy_qp = erdma_destroy_qp,
    .disassociate_ucontext = erdma_disassociate_ucontext,
    .get_dma_mr = erdma_get_dma_mr,
    .get_hw_stats = erdma_get_hw_stats,
    .get_port_immutable = erdma_get_port_immutable,
    .map_mr_sg = erdma_map_mr_sg,
    .mmap = erdma_mmap,
    .mmap_free = erdma_mmap_free,
    .post_recv = erdma_post_recv,
    .post_send = erdma_post_send,
    .poll_cq = erdma_poll_cq,
    .query_device = erdma_query_device,
    .query_gid = erdma_query_gid,
    .query_port = erdma_query_port,
    .query_qp = erdma_query_qp,
    .req_notify_cq = erdma_req_notify_cq, /* CQ arm implementation */
    .reg_user_mr = erdma_reg_user_mr,
    .modify_qp = erdma_modify_qp,

    INIT_RDMA_OBJ_SIZE(ib_cq, erdma_cq, ibcq),
    INIT_RDMA_OBJ_SIZE(ib_pd, erdma_pd, ibpd),
    INIT_RDMA_OBJ_SIZE(ib_ucontext, erdma_ucontext, ibucontext),
    INIT_RDMA_OBJ_SIZE(ib_qp, erdma_qp, ibqp),
};

ARM实现:
/*
 * Arm the CQ for a completion notification.
 *
 * Returns 1 when IB_CQ_REPORT_MISSED_EVENTS was requested and a valid CQE is
 * already available after arming, 0 otherwise.
 */
int erdma_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
{
    struct erdma_cq *cq = to_ecq(ibcq);
    unsigned long irq_flags;
    int ret = 0;

    spin_lock_irqsave(&cq->kern_cq.lock, irq_flags);

    /* The second argument is true only for a solicited-only request. */
    notify_cq(cq, (flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED);

    if ((flags & IB_CQ_REPORT_MISSED_EVENTS) && get_next_valid_cqe(cq))
        ret = 1;

    /* Count this arm request; done while still holding the CQ lock. */
    cq->kern_cq.notify_cnt++;

    spin_unlock_irqrestore(&cq->kern_cq.lock, irq_flags);

    return ret;
}

/*
 * Build the CQ doorbell value with the ARM bit set and publish it, first to
 * the doorbell record and then to the doorbell register.
 */
static void notify_cq(struct erdma_cq *cq, u8 solcitied)
{
    u64 doorbell = 0;

    /* Notification index, CQ number and the arm request itself. */
    doorbell |= FIELD_PREP(ERDMA_CQDB_IDX_MASK, (cq->kern_cq.notify_cnt));
    doorbell |= FIELD_PREP(ERDMA_CQDB_CQN_MASK, cq->cqn);
    doorbell |= FIELD_PREP(ERDMA_CQDB_ARM_MASK, 1);
    /* Whether the notification is restricted to solicited completions. */
    doorbell |= FIELD_PREP(ERDMA_CQDB_SOL_MASK, solcitied);
    /* Command sequence number and consumer index. */
    doorbell |= FIELD_PREP(ERDMA_CQDB_CMDSN_MASK, cq->kern_cq.cmdsn);
    doorbell |= FIELD_PREP(ERDMA_CQDB_CI_MASK, cq->kern_cq.ci);

    *cq->kern_cq.dbrec = doorbell;
    writeq(doorbell, cq->kern_cq.db);
}

UCX

SE

UCX中使用SE模式

/*
 * Arm the interface's CQ for the given direction. solicited_only is passed
 * straight through to ibv_req_notify_cq(). Returns UCS_OK on success and
 * UCS_ERR_IO_ERROR (after logging) when arming fails.
 */
ucs_status_t uct_ib_iface_arm_cq(uct_ib_iface_t *iface,
                                 uct_ib_dir_t dir,
                                 int solicited_only)
{
    if (ibv_req_notify_cq(iface->cq[dir], solicited_only) == 0) {
        return UCS_OK;
    }

    ucs_error("ibv_req_notify_cq("UCT_IB_IFACE_FMT", %d, sol=%d) failed: %m",
              UCT_IB_IFACE_ARG(iface), dir, solicited_only);
    return UCS_ERR_IO_ERROR;
}

UCT的IB设备结构中的ARM与IBV_SEND_SOLICITED标记位

/**
 * IB device (corresponds to HCA)
 *
 * Caches the verbs context together with device/port attributes, capability
 * flags and the hash tables used for address handles and async event
 * subscribers.
 */
typedef struct uct_ib_device {
    struct ibv_context          *ibv_context;    /* Verbs context */
    uct_ib_device_attr          dev_attr;        /* Cached device attributes */
    uint8_t                     first_port;      /* Number of first port (usually 1) */
    uint8_t                     num_ports;       /* Amount of physical ports */
    ucs_sys_cpuset_t            local_cpus;      /* CPUs local to device */
    int                         async_events;    /* Whether async events are handled */
    int                         max_zcopy_log_sge; /* Maximum sges log for zcopy am */
    UCS_STATS_NODE_DECLARE(stats)
    struct ibv_port_attr        port_attr[UCT_IB_DEV_MAX_PORTS]; /* Cached port attributes */
    uct_ib_pci_id_t             pci_id;          /* PCI identifiers */
    ucs_sys_device_t            sys_dev;         /* System device id */
    double                      pci_bw;          /* Supported PCI bandwidth */
    unsigned                    flags;
    uint8_t                     atomic_arg_sizes;
    uint8_t                     atomic_arg_sizes_be;
    uint8_t                     ext_atomic_arg_sizes;
    uint8_t                     ext_atomic_arg_sizes_be;
    uint8_t                     pci_fadd_arg_sizes;
    uint8_t                     pci_cswap_arg_sizes;
    uint8_t                     atomic_align;
    uint8_t                     lag_level;
    uint8_t                     req_notify_cq_support; /* Also indicates
                                                          IBV_SEND_SOLICITED
                                                          support.
                                                          NOTE(review): are CQ
                                                          arming and the send
                                                          solicited-event flag
                                                          always supported
                                                          together? */
    uint8_t                     ordered_send_comp;
    uint64_t                    mr_access_flags;
    uint32_t                    max_inline_data;

    /* AH hash */
    khash_t(uct_ib_ah)          ah_hash;
    ucs_recursive_spinlock_t    ah_lock;
    /* Async event subscribers */
    ucs_spinlock_t              async_event_lock;
    khash_t(uct_ib_async_event) async_events_hash;
} uct_ib_device_t;

IB协议要求

请求完成通知(11.4.2.2 REQUEST COMPLETION NOTIFICATION)

对于特定的 CQ(particular)，每次请求完成通知调用时最多调用一次处理程序
调用 CQ 事件处理程序时，它仅表示已将新条目添加到指定的 CQ(HW先产生CQE, 然后再触发中断, 由驱动处理CEQ并调用完成回调)

请求在将指定类型的下一个完成条目添加到指定 CQ 时调用 CQ 事件处理程序。对于特定的 CQ(particular)，每次请求完成通知调用时最多调用一次处理程序。启用通知之前存在的任何 CQ 条目都不会导致调用处理程序。完成事件有两种类型：请求的(solicited)或未经请求的(unsolicited)。当传入的发送或 RDMA 写入即时(Send or RDMA Write with Immediate)数据消息（设置了请求的事件标头位）导致成功的接收工作完成添加到 CQ 时，或者当不成功的工作完成添加到 CQ 时，会发生请求的完成事件(Solicited Completion Event)。当任何其他成功的接收工作完成或任何成功的发送工作完成添加到 CQ 时，会发生未经请求的完成事件。C11-29：CI 应支持请求的和未经请求的完成事件类型(Unsolicited Completion Event)。消费者请求完成通知时，必须指定是否为以下事件调用通知回调：

• 仅限下一个请求的完成事件(next Solicited Completion Event only)，或

• 下一个请求或非请求的完成事件(the next Solicited or Unsolicited Completion Event)

C11-29.1.1：当“仅限下一个请求的完成事件”未完成时，CI 应在以下任一情况下调用通知回调：

• 设置了请求的事件标头位的传入发送导致成功的接收工作完成被添加到指定的 CQ(Send with the Solicited Event Header bit set)。

• 设置了请求的事件标头位的传入 RDMA 写入和立即数据导致成功的接收工作完成被添加到指定的 CQ(Write with Immediate Data with the Solicited Event Header bit set )

• 将不成功的发送或接收工作完成添加到指定的 CQ(An unsuccessful Send or Receive Work Completion), 也就是发送或接收失败

C11-29.1.2：当“下一个请求的或非请求的完成事件”未完成时，CI 应在将任何工作完成添加到指定的 CQ 时调用通知回调(只要有CQE产生, 就需要调用一次完成回调)。如果请求完成通知处于待处理状态，则在完成事件之前对同一 CQ 的请求完成通知的后续调用仅在通知发生时才会生效(如果之前的ARM被阻塞, 后续需要等回调完成, ARM才能生效)。下一个完成事件的请求完成(next completion event )通知优先于同一 CQ 的请求事件完成的请求完成通知(next SE)。如果对同一 CQ 进行了多次请求完成通知调用，并且至少有一个请求将类型设置为下一个完成，则在将下一个完成添加到该 CQ 时将调用 CQ 事件处理程序。即使在指定 CQ 的完成事件之前进行了多个 CQ 通知请求，CQ 事件处理程序也只会被调用一次。一旦调用 CQ 事件处理程序，必须先注册另一个完成通知请求，然后才能再次调用 CQ 事件处理程序。C11-30：当 CQ 上请求的完成类型的完成通知请求未完成，并且对该 CQ 进行了另一个指定下一个完成通知的请求时，CI 应将未完成的完成通知类型更改为下一个完成。 C11-31：当 CQ 上下一个完成的完成通知请求未完成，并且对该 CQ 发出了另一个通知请求时，CI 不得更改未完成的完成通知类型。在调用此例程之前，必须指定 CQ 事件处理程序。如果在生成事件时尚未注册 CQ 事件处理程序，则不会进行处理程序调用。调用 CQ 事件处理程序时，它仅表示已将新条目添加到指定的 CQ(HW先产生CQE, 然后在触发中断到驱动这边处理CEQ, 调用完成回调)。HCA 和 CQ 句柄被传递给 CQ 事件处理程序，因此 CQ 事件处理程序可以确定哪个 CQ 导致它被调用。调用处理程序例程后，消费者必须再次调用请求完成通知，以便在向该 CQ 添加新条目时收到通知。消费者有责任调用轮询完成动词来检索工作完成。注意：如果消费者在没有与 CQ 关联的 CQ 事件处理程序 ID 的 CQ 句柄上请求完成通知，则该操作将不起作用。也就是说，不会生成任何完成事件。输入修饰符：• HCA 句柄。• CQ 句柄。• 请求的完成通知类型。类型是下一个完成或请求的完成发生时。输出修饰符：• 动词结果：• 操作已成功完成。• 无效的 HCA 句柄。• 无效的 CQ 句柄。• 无效的完成通知类型

设置完成事件回调(11.5.1 SET COMPLETION EVENT HANDLER)

描述：将完成处理程序标识符(Completion Handler Identifier)与完成事件处理程序地址关联。如果 HCA 支持基本队列管理扩展，则每个 HCA 可以注册多个 CQ 事件处理程序。对于给定的完成处理程序标识符，对此动词的其他调用将覆盖与完成处理程序标识符关联的完成事件处理程序地址。此调用不会自动请求完成事件的通知。必须调用请求完成通知动词才能请求通知。传递给 CQ 事件处理程序的参数包括：• HCA 句柄。• CQ 句柄。输入修饰符：• HCA 句柄。• 完成事件处理程序地址。• 完成事件处理程序标识符：• 如果为零，CI 将创建完成处理程序标识符并分配完成事件处理程序地址。• 如果非零，CI 将替换与完成事件处理程序标识符标识的现有完成处理程序关联的完成事件处理程序地址。如果完成事件处理程序地址为零，则清除完成事件处理程序地址。注意：当 CQ 与已清除的完成事件处理程序相关联时，不得生成完成事件。输出修饰符：• 完成事件处理程序标识符。仅当输入修饰符“完成事件处理程序标识符”设置为零时才返回。• 动词结果：• 操作成功完成。• HCA 句柄无效。• 完成事件处理程序标识符无效。• HCA 不支持基本队列管理扩展 • 资源不足，无法完成请求

设置异步事件回调(11.5.2 SET ASYNCHRONOUS EVENT HANDLER)

描述：注册异步事件处理程序。每个 HCA 只能注册一个异步事件处理程序。对此动词的额外调用将覆盖要调用的处理程序例程。额外的调用不会生成额外的处理程序例程。C11-32：即使在已注册现有异步事件处理程序的情况下，CI 也应使用此动词中指定的异步事件处理程序。注册异步事件处理程序后，所有后续异步事件都将导致调用该处理程序。在注册异步事件处理程序之前，异步事件将丢失。调用异步事件处理程序时传递给它的参数包括：• HCA 句柄。• 事件记录。这包含指示资源类型和标识符以及发生哪个事件的信息。有关更多信息，请参阅异步事件。输入修饰符：• HCA 句柄。• 处理程序地址。输出修饰符：• 动词结果：• 操作已成功完成。• 无效的 HCA 句柄