Continued from the previous article: https://cloud.tencent.com/developer/article/2472554
struct ibv_cq *ibv_create_cq(struct ibv_context *context, int cqe,
void *cq_context,
struct ibv_comp_channel *channel,
int comp_vector);
comp_vector: the MSI-X completion vector used to signal completion events. If the IRQ affinity masks of these interrupts have been configured to spread each MSI-X interrupt onto a different core, this parameter can be used to spread the completion workload across multiple cores. Valid values are in the range [0..context->num_comp_vectors).
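As a minimal userspace sketch (illustrative only; the helper name and the cqe depth of 256 are assumptions, not part of any driver), one CQ and one completion channel can be created per completion vector so that, with suitable MSI-X IRQ affinity, completion processing is spread over several cores:

#include <infiniband/verbs.h>

/* Sketch: one CQ (and one completion channel) per completion vector, so that
 * completion interrupts can be spread over multiple cores when the MSI-X IRQ
 * affinity is configured accordingly. */
static int create_spread_cqs(struct ibv_context *ctx,
                             struct ibv_comp_channel **chans,
                             struct ibv_cq **cqs)
{
    int vec;

    for (vec = 0; vec < ctx->num_comp_vectors; vec++) {
        chans[vec] = ibv_create_comp_channel(ctx);
        if (!chans[vec])
            return -1;

        /* comp_vector must lie in [0, ctx->num_comp_vectors) */
        cqs[vec] = ibv_create_cq(ctx, 256 /* cqe */, NULL /* cq_context */,
                                 chans[vec], vec);
        if (!cqs[vec])
            return -1;
    }
    return 0;
}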
static void irdma_init_rdma_device(struct irdma_device *iwdev)
{
struct pci_dev *pcidev = iwdev->rf->pcidev;
if (iwdev->roce_mode)
irdma_init_roce_device(iwdev);
else
irdma_init_iw_device(iwdev);
iwdev->ibdev.phys_port_cnt = 1;
iwdev->ibdev.num_comp_vectors = iwdev->rf->ceqs_count; // the number of completion vectors equals the total number of CEQs created
iwdev->ibdev.dev.parent = &pcidev->dev;
ib_set_device_ops(&iwdev->ibdev, &irdma_dev_ops);
}
static int ib_uverbs_get_context(struct uverbs_attr_bundle *attrs)
{
struct ib_uverbs_get_context_resp resp;
struct ib_uverbs_get_context cmd;
struct ib_device *ib_dev;
struct ib_uobject *uobj;
int ret;
ret = uverbs_request(attrs, &cmd, sizeof(cmd));
if (ret)
return ret;
ret = ib_alloc_ucontext(attrs);
if (ret)
return ret;
uobj = uobj_alloc(UVERBS_OBJECT_ASYNC_EVENT, attrs, &ib_dev);
if (IS_ERR(uobj)) {
ret = PTR_ERR(uobj);
goto err_ucontext;
}
resp = (struct ib_uverbs_get_context_resp){
.num_comp_vectors = attrs->ufile->device->num_comp_vectors, // when getting the context, simply return the num_comp_vectors stored on the uverbs device
.async_fd = uobj->id,
};
ret = uverbs_response(attrs, &resp, sizeof(resp));
if (ret)
goto err_uobj;
ret = ib_init_ucontext(attrs);
if (ret)
goto err_uobj;
ib_uverbs_init_async_event_file(
container_of(uobj, struct ib_uverbs_async_event_file, uobj));
rdma_alloc_commit_uobject(uobj, attrs);
return 0;
err_uobj:
rdma_alloc_abort_uobject(uobj, attrs, false);
err_ucontext:
rdma_restrack_put(&attrs->context->res);
kfree(attrs->context);
attrs->context = NULL;
return ret;
}
static int ib_uverbs_add_one(struct ib_device *device)
{
int devnum;
dev_t base;
struct ib_uverbs_device *uverbs_dev;
int ret;
if (!device->ops.alloc_ucontext ||
device->type == RDMA_DEVICE_TYPE_SMI)
return -EOPNOTSUPP;
uverbs_dev = kzalloc(sizeof(*uverbs_dev), GFP_KERNEL);
if (!uverbs_dev)
return -ENOMEM;
ret = init_srcu_struct(&uverbs_dev->disassociate_srcu);
if (ret) {
kfree(uverbs_dev);
return -ENOMEM;
}
device_initialize(&uverbs_dev->dev);
uverbs_dev->dev.class = &uverbs_class;
uverbs_dev->dev.parent = device->dev.parent;
uverbs_dev->dev.release = ib_uverbs_release_dev;
uverbs_dev->groups[0] = &dev_attr_group;
uverbs_dev->dev.groups = uverbs_dev->groups;
refcount_set(&uverbs_dev->refcount, 1);
init_completion(&uverbs_dev->comp);
uverbs_dev->xrcd_tree = RB_ROOT;
mutex_init(&uverbs_dev->xrcd_tree_mutex);
mutex_init(&uverbs_dev->lists_mutex);
INIT_LIST_HEAD(&uverbs_dev->uverbs_file_list);
rcu_assign_pointer(uverbs_dev->ib_dev, device);
uverbs_dev->num_comp_vectors = device->num_comp_vectors; // when the ib_device is registered, its num_comp_vectors is copied onto uverbs_dev
devnum = ida_alloc_max(&uverbs_ida, IB_UVERBS_MAX_DEVICES - 1,
GFP_KERNEL);
if (devnum < 0) {
ret = -ENOMEM;
goto err;
}
uverbs_dev->devnum = devnum;
if (devnum >= IB_UVERBS_NUM_FIXED_MINOR)
base = dynamic_uverbs_dev + devnum - IB_UVERBS_NUM_FIXED_MINOR;
else
base = IB_UVERBS_BASE_DEV + devnum;
ret = ib_uverbs_create_uapi(device, uverbs_dev);
if (ret)
goto err_uapi;
uverbs_dev->dev.devt = base;
dev_set_name(&uverbs_dev->dev, "uverbs%d", uverbs_dev->devnum);
cdev_init(&uverbs_dev->cdev,
device->ops.mmap ? &uverbs_mmap_fops : &uverbs_fops);
uverbs_dev->cdev.owner = THIS_MODULE;
ret = cdev_device_add(&uverbs_dev->cdev, &uverbs_dev->dev);
if (ret)
goto err_uapi;
ib_set_client_data(device, &uverbs_client, uverbs_dev);
return 0;
err_uapi:
ida_free(&uverbs_ida, devnum);
err:
if (refcount_dec_and_test(&uverbs_dev->refcount))
ib_uverbs_comp_dev(uverbs_dev);
wait_for_completion(&uverbs_dev->comp);
put_device(&uverbs_dev->dev);
return ret;
}
struct irdma_sc_dev {
struct list_head cqp_cmd_head; /* head of the CQP command list */
spinlock_t cqp_lock; /* protect CQP list access */
bool stats_idx_array[IRDMA_MAX_STATS_COUNT_GEN_1];
struct irdma_dma_mem vf_fpm_query_buf[IRDMA_MAX_PE_ENA_VF_COUNT];
u64 fpm_query_buf_pa;
u64 fpm_commit_buf_pa;
__le64 *fpm_query_buf;
__le64 *fpm_commit_buf;
struct irdma_hw *hw;
u8 __iomem *db_addr;
u32 __iomem *wqe_alloc_db;
u32 __iomem *cq_arm_db;
u32 __iomem *aeq_alloc_db;
u32 __iomem *cqp_db;
u32 __iomem *cq_ack_db;
u32 __iomem *ceq_itr_mask_db;
u32 __iomem *aeq_itr_mask_db;
u32 __iomem *hw_regs[IRDMA_MAX_REGS];
u32 ceq_itr; /* Interrupt throttle, usecs between interrupts: 0 disabled. 2 - 8160 */
u64 hw_masks[IRDMA_MAX_MASKS];
u64 hw_shifts[IRDMA_MAX_SHIFTS];
const struct irdma_hw_stat_map *hw_stats_map;
u64 hw_stats_regs[IRDMA_HW_STAT_INDEX_MAX_GEN_1];
u64 feature_info[IRDMA_MAX_FEATURES];
u64 cqp_cmd_stats[IRDMA_MAX_CQP_OPS];
struct irdma_hw_attrs hw_attrs;
struct irdma_hmc_info *hmc_info;
struct irdma_sc_cqp *cqp;
struct irdma_sc_aeq *aeq;
struct irdma_sc_ceq *ceq[IRDMA_CEQ_MAX_COUNT]; // CEQ array, IRDMA_CEQ_MAX_COUNT = 1024
struct irdma_sc_cq *ccq;
const struct irdma_irq_ops *irq_ops;
struct irdma_hmc_fpm_misc hmc_fpm_misc;
struct irdma_ws_node *ws_tree_root;
struct mutex ws_mutex; /* ws tree mutex */
u16 num_vfs;
u8 hmc_fn_id;
u8 vf_id;
bool vchnl_up:1;
bool ceq_valid:1;
u8 pci_rev;
int (*ws_add)(struct irdma_sc_vsi *vsi, u8 user_pri);
void (*ws_remove)(struct irdma_sc_vsi *vsi, u8 user_pri);
void (*ws_reset)(struct irdma_sc_vsi *vsi);
};
static int irdma_setup_ceqs(struct irdma_pci_f *rf, struct irdma_sc_vsi *vsi)
{
u32 i;
u32 ceq_id;
struct irdma_ceq *iwceq;
struct irdma_msix_vector *msix_vec;
int status;
u32 num_ceqs;
num_ceqs = min(rf->msix_count, rf->sc_dev.hmc_fpm_misc.max_ceqs); // number of CEQs = min(MSI-X vector count, max_ceqs recorded in the HW HMC)
i = (rf->msix_shared) ? 1 : 2;
for (ceq_id = 1; i < num_ceqs; i++, ceq_id++) { // ceq_id increments each iteration
iwceq = &rf->ceqlist[ceq_id]; // this list is walked at processing time, see irdma_process_ceq(rf, rf->ceqlist)
status = irdma_create_ceq(rf, iwceq, ceq_id, vsi);
if (status) {
ibdev_dbg(&rf->iwdev->ibdev,
"ERR: create ceq status = %d\n", status);
goto del_ceqs;
}
spin_lock_init(&iwceq->ce_lock);
msix_vec = &rf->iw_msixtbl[i];
iwceq->irq = msix_vec->irq;
iwceq->msix_idx = msix_vec->idx;
status = irdma_cfg_ceq_vector(rf, iwceq, ceq_id, msix_vec);
if (status) {
irdma_destroy_ceq(rf, iwceq);
goto del_ceqs;
}
irdma_ena_intr(&rf->sc_dev, msix_vec->idx);
rf->ceqs_count++; // increment the CEQ counter
}
return 0;
del_ceqs:
irdma_del_ceqs(rf);
return status;
}
static void irdma_del_ceqs(struct irdma_pci_f *rf)
{
struct irdma_ceq *iwceq = &rf->ceqlist[1];
struct irdma_msix_vector *msix_vec;
u32 i = 0;
if (rf->msix_shared)
msix_vec = &rf->iw_msixtbl[1];
else
msix_vec = &rf->iw_msixtbl[2];
for (i = 1; i < rf->ceqs_count; i++, msix_vec++, iwceq++) {
rf->sc_dev.irq_ops->irdma_cfg_ceq(&rf->sc_dev, msix_vec->ceq_id,
msix_vec->idx, false);
irdma_destroy_irq(rf, msix_vec, iwceq);
irdma_cqp_ceq_cmd(&rf->sc_dev, &iwceq->sc_ceq,
IRDMA_OP_CEQ_DESTROY);
dma_free_coherent(rf->sc_dev.hw->device, iwceq->mem.size,
iwceq->mem.va, iwceq->mem.pa);
iwceq->mem.va = NULL;
}
rf->ceqs_count = 1; // after deleting all CEQs except ceq 0, the CEQ counter is set back to 1
}
static void irdma_del_ceq_0(struct irdma_pci_f *rf)
{
struct irdma_ceq *iwceq = rf->ceqlist;
struct irdma_msix_vector *msix_vec;
if (rf->msix_shared) {
msix_vec = &rf->iw_msixtbl[0];
rf->sc_dev.irq_ops->irdma_cfg_ceq(&rf->sc_dev,
msix_vec->ceq_id,
msix_vec->idx, false);
irdma_destroy_irq(rf, msix_vec, rf);
} else {
msix_vec = &rf->iw_msixtbl[1];
irdma_destroy_irq(rf, msix_vec, iwceq);
}
irdma_destroy_ceq(rf, iwceq);
rf->sc_dev.ceq_valid = false;
rf->ceqs_count = 0; // when ceq 0 is deleted, the CEQ counter is set to 0
}
irdma_create_cq
...
if (attr->comp_vector < rf->ceqs_count)
info.ceq_id = attr->comp_vector; // when creating a CQ, ceq_id is set to comp_vector
...
int irdma_sc_cq_init(struct irdma_sc_cq *cq, struct irdma_cq_init_info *info)
{
u32 pble_obj_cnt;
pble_obj_cnt = info->dev->hmc_info->hmc_obj[IRDMA_HMC_IW_PBLE].cnt;
if (info->virtual_map && info->first_pm_pbl_idx >= pble_obj_cnt)
return -EINVAL;
cq->cq_pa = info->cq_base_pa;
cq->dev = info->dev;
cq->ceq_id = info->ceq_id; // record ceq_id on the cq
info->cq_uk_init_info.cqe_alloc_db = cq->dev->cq_arm_db;
info->cq_uk_init_info.cq_ack_db = cq->dev->cq_ack_db;
irdma_uk_cq_init(&cq->cq_uk, &info->cq_uk_init_info);
cq->virtual_map = info->virtual_map;
cq->pbl_chunk_size = info->pbl_chunk_size;
cq->ceqe_mask = info->ceqe_mask;
cq->cq_type = (info->type) ? info->type : IRDMA_CQ_TYPE_IWARP;
cq->shadow_area_pa = info->shadow_area_pa;
cq->shadow_read_threshold = info->shadow_read_threshold;
cq->ceq_id_valid = info->ceq_id_valid;
cq->tph_en = info->tph_en;
cq->tph_val = info->tph_val;
cq->first_pm_pbl_idx = info->first_pm_pbl_idx;
cq->vsi = info->vsi;
return 0;
}
struct irdma_sc_ceq {
u32 size;
u64 ceq_elem_pa;
struct irdma_sc_dev *dev;
struct irdma_ceqe *ceqe_base;
void *pbl_list;
u32 ceq_id; // ceq_id recorded as the identifier of this CEQ
u32 elem_cnt;
struct irdma_ring ceq_ring;
u8 pbl_chunk_size;
u8 tph_val;
u32 first_pm_pbl_idx;
u8 polarity;
struct irdma_sc_vsi *vsi;
struct irdma_sc_cq **reg_cq;
u32 reg_cq_size;
spinlock_t req_cq_lock; /* protect access to reg_cq array */
bool virtual_map:1;
bool tph_en:1;
bool itr_no_expire:1;
};
static int irdma_setup_ceqs(struct irdma_pci_f *rf, struct irdma_sc_vsi *vsi)
{
u32 i;
u32 ceq_id;
struct irdma_ceq *iwceq;
struct irdma_msix_vector *msix_vec;
int status;
u32 num_ceqs;
num_ceqs = min(rf->msix_count, rf->sc_dev.hmc_fpm_misc.max_ceqs);
i = (rf->msix_shared) ? 1 : 2;
for (ceq_id = 1; i < num_ceqs; i++, ceq_id++) {
iwceq = &rf->ceqlist[ceq_id];
status = irdma_create_ceq(rf, iwceq, ceq_id, vsi); // ceq_id starts at 1 and increments
if (status) {
ibdev_dbg(&rf->iwdev->ibdev,
"ERR: create ceq status = %d\n", status);
goto del_ceqs;
}
spin_lock_init(&iwceq->ce_lock);
msix_vec = &rf->iw_msixtbl[i];
iwceq->irq = msix_vec->irq;
iwceq->msix_idx = msix_vec->idx;
status = irdma_cfg_ceq_vector(rf, iwceq, ceq_id, msix_vec);
if (status) {
irdma_destroy_ceq(rf, iwceq);
goto del_ceqs;
}
irdma_ena_intr(&rf->sc_dev, msix_vec->idx);
rf->ceqs_count++;
}
return 0;
del_ceqs:
irdma_del_ceqs(rf);
return status;
}
int irdma_sc_ceq_init(struct irdma_sc_ceq *ceq,
struct irdma_ceq_init_info *info)
{
u32 pble_obj_cnt;
if (info->elem_cnt < info->dev->hw_attrs.min_hw_ceq_size ||
info->elem_cnt > info->dev->hw_attrs.max_hw_ceq_size)
return -EINVAL;
if (info->ceq_id >= info->dev->hmc_fpm_misc.max_ceqs)
return -EINVAL;
pble_obj_cnt = info->dev->hmc_info->hmc_obj[IRDMA_HMC_IW_PBLE].cnt;
if (info->virtual_map && info->first_pm_pbl_idx >= pble_obj_cnt)
return -EINVAL;
ceq->size = sizeof(*ceq);
ceq->ceqe_base = (struct irdma_ceqe *)info->ceqe_base;
ceq->ceq_id = info->ceq_id;
ceq->dev = info->dev;
ceq->elem_cnt = info->elem_cnt;
ceq->ceq_elem_pa = info->ceqe_pa;
ceq->virtual_map = info->virtual_map;
ceq->itr_no_expire = info->itr_no_expire;
ceq->reg_cq = info->reg_cq;
ceq->reg_cq_size = 0;
spin_lock_init(&ceq->req_cq_lock);
ceq->pbl_chunk_size = (ceq->virtual_map ? info->pbl_chunk_size : 0);
ceq->first_pm_pbl_idx = (ceq->virtual_map ? info->first_pm_pbl_idx : 0);
ceq->pbl_list = (ceq->virtual_map ? info->pbl_list : NULL);
ceq->tph_en = info->tph_en;
ceq->tph_val = info->tph_val;
ceq->vsi = info->vsi;
ceq->polarity = 1;
IRDMA_RING_INIT(ceq->ceq_ring, ceq->elem_cnt);
ceq->dev->ceq[info->ceq_id] = ceq; // each ceq is recorded in the dev->ceq array, indexed by its ceq_id
return 0;
}
static int irdma_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
{
struct irdma_device *iwdev = to_iwdev(ib_cq->device);
struct irdma_cq *iwcq = to_iwcq(ib_cq);
struct irdma_sc_cq *cq = &iwcq->sc_cq;
struct irdma_sc_dev *dev = cq->dev;
struct irdma_sc_ceq *ceq = dev->ceq[cq->ceq_id]; // look up the ceq in the dev->ceq array using the ceq_id stored on the cq
struct irdma_ceq *iwceq = container_of(ceq, struct irdma_ceq, sc_ceq);
unsigned long flags;
spin_lock_irqsave(&iwcq->lock, flags);
if (!list_empty(&iwcq->cmpl_generated))
irdma_remove_cmpls_list(iwcq);
if (!list_empty(&iwcq->resize_list))
irdma_process_resize_list(iwcq, iwdev, NULL);
spin_unlock_irqrestore(&iwcq->lock, flags);
irdma_cq_rem_ref(ib_cq);
wait_for_completion(&iwcq->free_cq);
irdma_cq_wq_destroy(iwdev->rf, cq);
spin_lock_irqsave(&iwceq->ce_lock, flags);
irdma_sc_cleanup_ceqes(cq, ceq); // clean up CEQEs that still reference this CQ
spin_unlock_irqrestore(&iwceq->ce_lock, flags);
irdma_cq_free_rsrc(iwdev->rf, iwcq);
return 0;
}
Interface definition:
/**
* ibv_get_cq_event - Read next CQ event
* @channel: Channel to get next event from.
* @cq: Used to return pointer to CQ.
* @cq_context: Used to return consumer-supplied CQ context.
*
* All completion events returned by ibv_get_cq_event() must
* eventually be acknowledged with ibv_ack_cq_events().
*/
int ibv_get_cq_event(struct ibv_comp_channel *channel,
struct ibv_cq **cq, void **cq_context);
Implementation:
LATEST_SYMVER_FUNC(ibv_get_cq_event, 1_1, "IBVERBS_1.1",
int,
struct ibv_comp_channel *channel,
struct ibv_cq **cq, void **cq_context)
{
struct ib_uverbs_comp_event_desc ev;
if (read(channel->fd, &ev, sizeof ev) != sizeof ev)
return -1;
*cq = (struct ibv_cq *) (uintptr_t) ev.cq_handle;
*cq_context = (*cq)->cq_context;
get_ops((*cq)->context)->cq_event(*cq);
return 0;
}
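A typical consumer loop around these verbs, sketched with a hypothetical handle_wc() placeholder: arm the CQ with ibv_req_notify_cq, block in ibv_get_cq_event, acknowledge the event, re-arm, and only then drain the CQ with ibv_poll_cq (re-arming before polling avoids losing completions that land between the last poll and the next wait):

#include <infiniband/verbs.h>

static void handle_wc(struct ibv_wc *wc);   /* application-specific, placeholder */

static int cq_event_loop(struct ibv_comp_channel *channel, struct ibv_cq *cq)
{
    struct ibv_cq *ev_cq;
    void *ev_ctx;
    struct ibv_wc wc;
    int ne;

    /* Arm before waiting, otherwise no completion event is generated. */
    if (ibv_req_notify_cq(cq, 0 /* all completions, not solicited-only */))
        return -1;

    for (;;) {
        if (ibv_get_cq_event(channel, &ev_cq, &ev_ctx))   /* blocks on channel->fd */
            return -1;
        ibv_ack_cq_events(ev_cq, 1);    /* every returned event must eventually be acked */

        /* Re-arm before polling so completions that arrive between the last
         * poll and the next wait still raise an event. */
        if (ibv_req_notify_cq(ev_cq, 0))
            return -1;

        while ((ne = ibv_poll_cq(ev_cq, 1, &wc)) > 0)
            handle_wc(&wc);
        if (ne < 0)
            return -1;
    }
}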
https://github.com/linux-rdma/perftest/commit/d66d7888ec4513ca0fd826216c64e20129d0bef8
Functions involved: alloc_ctx, ctx_set_send_wqes, ctx_set_recv_wqes, run_iter_bw, run_iter_bw_server.
Still known issues:
int create_cqs(struct pingpong_context *ctx, struct perftest_parameters *user_param)
{
int ret;
int dct_only = 0, need_recv_cq = 0;
int tx_buffer_depth = user_param->tx_depth;
if (user_param->connection_type == DC) {
dct_only = (user_param->machine == SERVER && !(user_param->duplex || user_param->tst == LAT));
}
if (dct_only)
tx_buffer_depth = user_param->rx_depth;
if ((user_param->connection_type == DC && !dct_only) || (user_param->verb == SEND || user_param->verb == WRITE_IMM))
need_recv_cq = 1; // SEND and WRITE_IMM require a separate receive CQ
ret = create_reg_cqs(ctx, user_param, tx_buffer_depth, need_recv_cq);
return ret;
}
struct ibv_qp* ctx_qp_create(struct pingpong_context *ctx,
struct perftest_parameters *user_param, int qp_index)
{
struct ibv_qp* qp = NULL;
int dc_num_of_qps = user_param->num_of_qps / 2;
int is_dc_server_side = 0;
struct ibv_qp_init_attr attr;
memset(&attr, 0, sizeof(struct ibv_qp_init_attr));
struct ibv_qp_cap *qp_cap = &attr.cap;
#ifdef HAVE_IBV_WR_API
enum ibv_wr_opcode opcode;
struct ibv_qp_init_attr_ex attr_ex;
memset(&attr_ex, 0, sizeof(struct ibv_qp_init_attr_ex));
#ifdef HAVE_MLX5DV
struct mlx5dv_qp_init_attr attr_dv;
memset(&attr_dv, 0, sizeof(attr_dv));
#ifdef HAVE_OOO_RECV_WRS
struct mlx5dv_context ctx_dv;
memset(&ctx_dv, 0, sizeof(ctx_dv));
#endif
#endif
#ifdef HAVE_SRD
struct efadv_qp_init_attr efa_attr = {};
#endif
#endif
#ifdef HAVE_HNSDV
struct hnsdv_qp_init_attr hns_attr = {};
#endif
attr.send_cq = ctx->send_cq;
attr.recv_cq = (user_param->verb == SEND || user_param->verb == WRITE_IMM) ? ctx->recv_cq : ctx->send_cq; // when creating the QP, decide whether the recv CQ shares the send CQ or uses the separately created recv CQ
is_dc_server_side = ((!(user_param->duplex || user_param->tst == LAT) &&
(user_param->machine == SERVER)) ||
((user_param->duplex || user_param->tst == LAT) &&
(qp_index >= dc_num_of_qps)));
run_iter_bw
...
if (totccnt < tot_iters || (user_param->test_type == DURATION && totccnt < totscnt)) {
/* Make sure all completions from previous event were polled before waiting for another */
if (user_param->use_event && ne == 0) {
fprintf(stdout, "Client: ctx_notify_events send_channel %s(), %s:%d\n", __FUNCTION__, __FILE__, __LINE__);
if (ctx_notify_events(ctx->send_channel)) {
fprintf(stderr, "Couldn't request CQ notification\n");
return_value = FAILURE;
goto cleaning;
}
}
ne = ibv_poll_cq(ctx->send_cq, CTX_POLL_BATCH, wc);
if (ne > 0) {
for (i = 0; i < ne; i++) {
wc_id = (int)wc[i].wr_id;
if (wc[i].status != IBV_WC_SUCCESS) {
NOTIFY_COMP_ERROR_SEND(wc[i],totscnt,totccnt);
return_value = FAILURE;
goto cleaning;
}
int fill = user_param->cq_mod;
if (user_param->fill_count && ctx->ccnt[wc_id] + user_param->cq_mod > user_param->iters) {
fill = user_param->iters - ctx->ccnt[wc_id];
}
ctx->ccnt[wc_id] += fill;
totccnt += fill;
if (user_param->noPeak == OFF) {
if (totccnt > tot_iters)
user_param->tcompleted[user_param->iters*num_of_qps - 1] = get_cycles();
else
user_param->tcompleted[totccnt-1] = get_cycles();
}
cq_mod is set from the command line (the -Q option):
case 'Q': CHECK_VALUE(user_param->cq_mod,int,MIN_CQ_MOD,MAX_CQ_MOD,"CQ moderation");
user_param->req_cq_mod = 1;
break;
In ctx_set_send_reg_wqes, the send flags are set according to cq_mod:
void ctx_set_send_reg_wqes(struct pingpong_context *ctx,
struct perftest_parameters *user_param,
struct pingpong_dest *rem_dest)
{
int i,j;
...
if ((j + 1) % user_param->cq_mod == 0) { // if (number posted + 1) is a multiple of cq_mod, mark this WQE as signaled so it generates a CQE
ctx->wr[i*user_param->post_list + j].send_flags = IBV_SEND_SIGNALED;
} else {
ctx->wr[i*user_param->post_list + j].send_flags = 0;
}
...
Latest upstream implementation:
if (j == (user_param->post_list - 1)) {
ctx->wr[i*user_param->post_list + j].next = NULL;
} else {
ctx->wr[i*user_param->post_list + j].next = &ctx->wr[i*user_param->post_list+j+1];
}
if ((j + 1) % user_param->cq_mod == 0) {
ctx->wr[i*user_param->post_list + j].send_flags = IBV_SEND_SIGNALED;
#ifdef HAVE_IBV_WR_API
ctx->qpx[i]->wr_flags = IBV_SEND_SIGNALED;
#endif
} else {
...
If req_cq_mod was not set, cq_mod defaults to the post_list size (otherwise post_list must be a multiple of cq_mod):
if (user_param->post_list > 1) {
if (!user_param->req_cq_mod) {
user_param->cq_mod = user_param->post_list;
printf(RESULT_LINE);
printf("Post List requested - CQ moderation will be the size of the post list\n");
} else if ((user_param->post_list % user_param->cq_mod) != 0) {
printf(RESULT_LINE);
fprintf(stderr, " Post list size must be a multiple of CQ moderation\n");
exit(1);
}
}
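The same selective-signaling idea outside perftest, as a hedged sketch (the helper name is made up and sge/remote fields are assumed to be filled in elsewhere): with a QP created with sq_sig_all = 0, only every cq_mod-th WR carries IBV_SEND_SIGNALED, so one CQE covers a batch of cq_mod sends.

#include <infiniband/verbs.h>

/* Sketch: post `count` WRs as one linked list, signaling only every
 * cq_mod-th WR. Assumes the QP was created with sq_sig_all = 0 and that
 * the sge / rdma fields of each WR are already filled in. */
static int post_batched_sends(struct ibv_qp *qp, struct ibv_send_wr *wrs,
                              int count, int cq_mod)
{
    struct ibv_send_wr *bad_wr;
    int j;

    for (j = 0; j < count; j++) {
        wrs[j].wr_id = j;
        wrs[j].opcode = IBV_WR_RDMA_WRITE;
        wrs[j].send_flags = ((j + 1) % cq_mod == 0) ? IBV_SEND_SIGNALED : 0;
        wrs[j].next = (j == count - 1) ? NULL : &wrs[j + 1];
    }

    /* One doorbell for the whole list; at most count / cq_mod CQEs
     * will be generated for it. */
    return ibv_post_send(qp, wrs, &bad_wr);
}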
https://github.com/linux-rdma/perftest/commit/56d025e4f19a6ebe0aaf45bbc6abf2186aa85dfb
Allow overriding CQ moderation (which controls CQE generation) in post_list mode (https://github.com/linux-rdma/perftest/pull/58).
* Fix the reported peak BW with post_list and cq_mod: when post_list mode is enabled, a "posted" timestamp is taken only once per post_list iterations; likewise, when CQ moderation is enabled, a completion timestamp is taken only once per cq_mod iterations. The reported peak BW is fixed by taking this into account and skipping the empty slots in the tposted and tcompleted arrays. A bug in filling the tcompleted array is also fixed. Signed-off-by: Firas Jahjah
* Allow overriding CQ moderation in post_list mode: when posting a list of WQEs, let the user specify CQ moderation instead of having it overridden. To do so, request a SIGNAL once every X WQEs, and fix the BW tests to support cq_mod != post_list, including the last explicitly signaled CQE (where #tot_iterations % cq_mod != 0).
if (user_param->noPeak == OFF) {
/* Find the peak bandwidth unless asked not to in command line */
for (i = 0; i < num_of_calculated_iters * num_of_qps; i += user_param->post_list) {
for (j = ROUND_UP(i + 1, user_param->cq_mod) - 1; j < num_of_calculated_iters * num_of_qps;
j += user_param->cq_mod) {
t = (user_param->tcompleted[j] - user_param->tposted[i]) / (j - i + 1);
if (t < opt_delta)
opt_delta = t;
}
/* Handle case where CQE was explicitly signaled on last iteration. */
if ((num_of_calculated_iters * num_of_qps) % user_param->cq_mod) {
j = num_of_calculated_iters * num_of_qps - 1;
t = (user_param->tcompleted[j] - user_param->tposted[i]) / (j - i + 1);
if (t < opt_delta)
opt_delta = t;
}
}
}
int smc_ib_ready_link(struct smc_link *lnk)
{
struct smc_link_group *lgr = smc_get_lgr(lnk);
int rc = 0;
rc = smc_ib_modify_qp_init(lnk);
if (rc)
goto out;
rc = smc_ib_modify_qp_rtr(lnk);
if (rc)
goto out;
smc_wr_remember_qp_attr(lnk);
rc = ib_req_notify_cq(lnk->smcibdev->roce_cq_recv,
IB_CQ_SOLICITED_MASK); // kernel-side ARM: ask the HW to report via interrupt (completion event) when a CQE is produced
if (rc)
goto out;
rc = smc_wr_rx_post_init(lnk);
if (rc)
goto out;
smc_wr_remember_qp_attr(lnk);
if (lgr->role == SMC_SERV) {
rc = smc_ib_modify_qp_rts(lnk);
if (rc)
goto out;
smc_wr_remember_qp_attr(lnk);
}
out:
return rc;
}
static const struct ib_device_ops erdma_device_ops = {
.owner = THIS_MODULE,
.driver_id = RDMA_DRIVER_ERDMA,
.uverbs_abi_ver = ERDMA_ABI_VERSION,
.alloc_hw_port_stats = erdma_alloc_hw_port_stats,
.alloc_mr = erdma_ib_alloc_mr,
.alloc_pd = erdma_alloc_pd,
.alloc_ucontext = erdma_alloc_ucontext,
.create_cq = erdma_create_cq,
.create_qp = erdma_create_qp,
.dealloc_pd = erdma_dealloc_pd,
.dealloc_ucontext = erdma_dealloc_ucontext,
.dereg_mr = erdma_dereg_mr,
.destroy_cq = erdma_destroy_cq,
.destroy_qp = erdma_destroy_qp,
.disassociate_ucontext = erdma_disassociate_ucontext,
.get_dma_mr = erdma_get_dma_mr,
.get_hw_stats = erdma_get_hw_stats,
.get_port_immutable = erdma_get_port_immutable,
.map_mr_sg = erdma_map_mr_sg,
.mmap = erdma_mmap,
.mmap_free = erdma_mmap_free,
.post_recv = erdma_post_recv,
.post_send = erdma_post_send,
.poll_cq = erdma_poll_cq,
.query_device = erdma_query_device,
.query_gid = erdma_query_gid,
.query_port = erdma_query_port,
.query_qp = erdma_query_qp,
.req_notify_cq = erdma_req_notify_cq, // ARM (req_notify_cq) implementation
.reg_user_mr = erdma_reg_user_mr,
.modify_qp = erdma_modify_qp,
INIT_RDMA_OBJ_SIZE(ib_cq, erdma_cq, ibcq),
INIT_RDMA_OBJ_SIZE(ib_pd, erdma_pd, ibpd),
INIT_RDMA_OBJ_SIZE(ib_ucontext, erdma_ucontext, ibucontext),
INIT_RDMA_OBJ_SIZE(ib_qp, erdma_qp, ibqp),
};
The ARM implementation:
int erdma_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
{
struct erdma_cq *cq = to_ecq(ibcq);
unsigned long irq_flags;
int ret = 0;
spin_lock_irqsave(&cq->kern_cq.lock, irq_flags);
notify_cq(cq, (flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED);
if ((flags & IB_CQ_REPORT_MISSED_EVENTS) && get_next_valid_cqe(cq))
ret = 1;
cq->kern_cq.notify_cnt++;
spin_unlock_irqrestore(&cq->kern_cq.lock, irq_flags);
return ret;
}
static void notify_cq(struct erdma_cq *cq, u8 solcitied)
{
u64 db_data =
FIELD_PREP(ERDMA_CQDB_IDX_MASK, (cq->kern_cq.notify_cnt)) |
FIELD_PREP(ERDMA_CQDB_CQN_MASK, cq->cqn) |
FIELD_PREP(ERDMA_CQDB_ARM_MASK, 1) |
FIELD_PREP(ERDMA_CQDB_SOL_MASK, solcitied) |
FIELD_PREP(ERDMA_CQDB_CMDSN_MASK, cq->kern_cq.cmdsn) |
FIELD_PREP(ERDMA_CQDB_CI_MASK, cq->kern_cq.ci);
*cq->kern_cq.dbrec = db_data;
writeq(db_data, cq->kern_cq.db);
}
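erdma_req_notify_cq returns 1 when IB_CQ_REPORT_MISSED_EVENTS is requested and a valid CQE is already pending; kernel consumers typically use that return value to close the race between the last poll and the re-arm. A minimal sketch of that pattern (handle_wc() is a placeholder, not erdma code):

#include <rdma/ib_verbs.h>

static void handle_wc(struct ib_wc *wc);   /* consumer-specific, placeholder */

static void drain_and_rearm(struct ib_cq *cq)
{
    struct ib_wc wc;

    do {
        while (ib_poll_cq(cq, 1, &wc) > 0)
            handle_wc(&wc);
        /* Re-arm; a return value > 0 means a CQE slipped in after the last
         * poll, so loop and drain again instead of missing it. */
    } while (ib_req_notify_cq(cq, IB_CQ_NEXT_COMP |
                              IB_CQ_REPORT_MISSED_EVENTS) > 0);
}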
ucs_status_t uct_ib_iface_arm_cq(uct_ib_iface_t *iface,
uct_ib_dir_t dir,
int solicited_only)
{
int ret;
ret = ibv_req_notify_cq(iface->cq[dir], solicited_only); // solicited-only (SE) flag
if (ret != 0) {
ucs_error("ibv_req_notify_cq("UCT_IB_IFACE_FMT", %d, sol=%d) failed: %m",
UCT_IB_IFACE_ARG(iface), dir, solicited_only);
return UCS_ERR_IO_ERROR;
}
return UCS_OK;
}
/**
* IB device (corresponds to HCA)
*/
typedef struct uct_ib_device {
struct ibv_context *ibv_context; /* Verbs context */
uct_ib_device_attr dev_attr; /* Cached device attributes */
uint8_t first_port; /* Number of first port (usually 1) */
uint8_t num_ports; /* Amount of physical ports */
ucs_sys_cpuset_t local_cpus; /* CPUs local to device */
int async_events; /* Whether async events are handled */
int max_zcopy_log_sge; /* Maximum sges log for zcopy am */
UCS_STATS_NODE_DECLARE(stats)
struct ibv_port_attr port_attr[UCT_IB_DEV_MAX_PORTS]; /* Cached port attributes */
uct_ib_pci_id_t pci_id; /* PCI identifiers */
ucs_sys_device_t sys_dev; /* System device id */
double pci_bw; /* Supported PCI bandwidth */
unsigned flags;
uint8_t atomic_arg_sizes;
uint8_t atomic_arg_sizes_be;
uint8_t ext_atomic_arg_sizes;
uint8_t ext_atomic_arg_sizes_be;
uint8_t pci_fadd_arg_sizes;
uint8_t pci_cswap_arg_sizes;
uint8_t atomic_align;
uint8_t lag_level;
uint8_t req_notify_cq_support; /* Also indicates IBV_SEND_SOLICITED support */ /* author's note: does this flag mean CQ arming and the send-side SE bit are supported together? */
uint8_t ordered_send_comp;
uint64_t mr_access_flags;
uint32_t max_inline_data;
/* AH hash */
khash_t(uct_ib_ah) ah_hash;
ucs_recursive_spinlock_t ah_lock;
/* Async event subscribers */
ucs_spinlock_t async_event_lock;
khash_t(uct_ib_async_event) async_events_hash;
} uct_ib_device_t;
Description (Request Completion Notification): requests that the CQ event handler be called when the next completion entry of the specified type is added to the specified CQ. For a particular CQ, the handler is called at most once per Request Completion Notification call. CQ entries that existed before notification was enabled do not cause the handler to be called. There are two types of completion events: solicited and unsolicited. A Solicited Completion Event occurs when an incoming Send or RDMA Write with Immediate data message with the Solicited Event header bit set causes a successful Receive Work Completion to be added to the CQ, or when an unsuccessful Work Completion is added to the CQ. An Unsolicited Completion Event occurs when any other successful Receive Work Completion, or any successful Send Work Completion, is added to the CQ. C11-29: the CI shall support both the Solicited and the Unsolicited Completion Event types. When requesting completion notification, the Consumer must specify whether the notification callback is to be invoked for:
• the next Solicited Completion Event only, or
• the next Solicited or Unsolicited Completion Event.
C11-29.1.1: while a "next Solicited Completion Event only" request is outstanding, the CI shall invoke the notification callback when any of the following occurs:
• an incoming Send with the Solicited Event header bit set causes a successful Receive Work Completion to be added to the specified CQ;
• an incoming RDMA Write with Immediate Data with the Solicited Event header bit set causes a successful Receive Work Completion to be added to the specified CQ;
• an unsuccessful Send or Receive Work Completion is added to the specified CQ, i.e. a send or receive fails.
C11-29.1.2: while a "next Solicited or Unsolicited Completion Event" request is outstanding, the CI shall invoke the notification callback whenever any Work Completion is added to the specified CQ (every CQE produced triggers one invocation of the completion callback).

If a Request Completion Notification is outstanding, subsequent Request Completion Notification calls on the same CQ made before the completion event only take effect when the notification occurs (if a previous ARM is still pending, later ARMs take effect only once that callback has fired). A request for notification on the next completion event takes precedence over a request for notification on the next solicited event for the same CQ. If multiple Request Completion Notification calls are made for the same CQ and at least one of them set the type to "next completion", the CQ event handler is called when the next completion is added to that CQ. Even if multiple notification requests were made before a completion event on the specified CQ, the CQ event handler is called only once. Once the CQ event handler has been called, another completion notification request must be registered before the CQ event handler can be called again. C11-30: when a completion notification request of the solicited-completion type is outstanding on a CQ and another request specifying next-completion notification is made for that CQ, the CI shall change the outstanding notification type to next completion. C11-31: when a next-completion notification request is outstanding on a CQ and another notification request is made for that CQ, the CI shall not change the outstanding notification type.

A CQ event handler must be specified before this routine is called; if no CQ event handler has been registered at the time an event is generated, no handler invocation takes place. When the CQ event handler is called, it only indicates that a new entry has been added to the specified CQ (the HW first produces the CQE, then raises an interrupt so the driver processes the CEQ and invokes the completion callback). The HCA and CQ handles are passed to the CQ event handler, so it can determine which CQ caused it to be called. After the handler routine has run, the Consumer must call Request Completion Notification again in order to be notified when a new entry is added to that CQ. It is the Consumer's responsibility to call the Poll for Completion verb to retrieve the Work Completions. Note: if the Consumer requests completion notification on a CQ handle that has no CQ event handler identifier associated with it, the operation has no effect, i.e. no completion event will be generated.
Input modifiers: • HCA handle. • CQ handle. • Requested completion notification type: next completion, or when a solicited completion occurs.
Output modifiers: • Verb results: operation completed successfully; invalid HCA handle; invalid CQ handle; invalid completion notification type.
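Mapping this onto the verbs API, a hedged sketch of solicited-only notification: the receiver arms with solicited_only = 1, and the sender opts in per WR with IBV_SEND_SOLICITED so the Solicited Event header bit is set on the wire.

#include <infiniband/verbs.h>

/* Receiver side: arm for Solicited Completion Events only. Unsuccessful
 * completions still trigger the channel, per the spec text above. */
static int arm_solicited(struct ibv_cq *cq)
{
    return ibv_req_notify_cq(cq, 1 /* solicited_only */);
}

/* Sender side: mark a Send as solicited so the SE header bit is set and the
 * peer's solicited-only arm fires on the resulting receive completion. */
static int post_solicited_send(struct ibv_qp *qp, struct ibv_send_wr *wr)
{
    struct ibv_send_wr *bad_wr;

    wr->opcode = IBV_WR_SEND;
    wr->send_flags |= IBV_SEND_SIGNALED | IBV_SEND_SOLICITED;
    return ibv_post_send(qp, wr, &bad_wr);
}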
Description (Set Completion Event Handler): associates a Completion Handler Identifier with a completion event handler address. If the HCA supports the Base Queue Management extension, multiple CQ event handlers may be registered per HCA. For a given Completion Handler Identifier, additional calls to this verb overwrite the completion event handler address associated with that identifier. This call does not by itself request notification of completion events; the Request Completion Notification verb must be called to request notification. The arguments passed to the CQ event handler are: • HCA handle. • CQ handle.
Input modifiers: • HCA handle. • Completion event handler address. • Completion event handler identifier: if zero, the CI creates a Completion Handler Identifier and assigns the completion event handler address to it; if non-zero, the CI replaces the completion event handler address associated with the existing completion handler identified by that identifier; if the completion event handler address is zero, the address is cleared. Note: no completion events may be generated while a CQ is associated with a cleared completion event handler.
Output modifiers: • Completion event handler identifier, returned only when the input modifier "completion event handler identifier" was set to zero. • Verb results: operation completed successfully; invalid HCA handle; invalid completion event handler identifier; the HCA does not support the Base Queue Management extension; insufficient resources to complete the request.
Description (Set Asynchronous Event Handler): registers an asynchronous event handler. Only one asynchronous event handler can be registered per HCA. Additional calls to this verb overwrite the handler routine to be invoked; they do not cause additional handler routines to be invoked. C11-32: the CI shall use the asynchronous event handler specified by this verb even when an existing asynchronous event handler has already been registered. Once an asynchronous event handler has been registered, all subsequent asynchronous events cause that handler to be invoked. Asynchronous events that occur before an asynchronous event handler is registered are lost. The arguments passed to the asynchronous event handler when it is invoked are: • HCA handle. • Event record, which contains information indicating the resource type, the resource identifier, and which event occurred (see Asynchronous Events for details).
Input modifiers: • HCA handle. • Handler address.
Output modifiers: • Verb results: operation completed successfully; invalid HCA handle.