通过 SR-IOV capability,驱动获取 VF 的 BDF 号,并创建对应的 PCI 设备和 bus;同时,硬件的 SR-IOV engine 根据 capability 中的 First VF Offset 和 VF Stride 字段,在对应的 bus/BDF 上创建 VF 的硬件实例。这些实例包含配置空间:其中的 capability(MSI-X 等)一般通过地址映射的方式映射到 PF,以共享 PF 的资源;其他字段(例如 Vendor ID、Device ID 等)则由 SR-IOV engine 提供。也就是说,硬件在这里主要负责响应对 VF 的 PCI 配置空间的访问。
对于 VF 来说,它拥有独立的配置空间(配置信息);BAR 空间则不是独立分配的,而是从 PF 的 SR-IOV capability 中 VF BAR 所描述的那段资源里按 VF 序号切分出来的(即复用 PF 的 IOV resource)。
在整个创建vf的过程中,内核负责完成vf的内核对象创建,绑定驱动;
硬件则负责在pci物理总线上呈现对应的带有配置空间的vf实例,完成对vf的pci配置空间的访问;
| 偏移 (Offset) | 宏定义 | 字段名 | 长度 | 说明 |
|---|---|---|---|---|
| 0x04 | PCI_SRIOV_CAP | SR-IOV Capabilities | 32 位 | 是否支持 VF Migration、VF Migration 中断消息号等能力位 |
| 0x08 | PCI_SRIOV_CTRL | SR-IOV Control | 16 位 | 控制 VF Enable、VF Migration Enable、VF Memory Space Enable、ARI Capable Hierarchy 等 |
| 0x0A | PCI_SRIOV_STATUS | SR-IOV Status | 16 位 | VF Migration 状态等 |
| 0x0C | PCI_SRIOV_INITIAL_VF | Initial VFs | 16 位 | 初始可用的 VF 数(不支持 VF Migration 时必须等于 Total VFs) |
| 0x0E | PCI_SRIOV_TOTAL_VF | Total VFs | 16 位 | 设备支持的最大 VF 数 |
| 0x10 | PCI_SRIOV_NUM_VF | Number of VFs | 16 位 | 当前启用的 VF 数,由 PF 驱动写入 |
| 0x12 | PCI_SRIOV_FUNC_LINK | Function Dependency Link | 8 位 | 功能依赖链接,指向与本 PF 存在依赖关系的 Function |
| 0x14 | PCI_SRIOV_VF_OFFSET | First VF Offset | 16 位 | 第一个 VF 相对于 PF 的 Routing ID(BDF)偏移 |
| 0x16 | PCI_SRIOV_VF_STRIDE | Following VF Stride | 16 位 | 相邻 VF 之间 Routing ID(BDF)的跨度 |
| 0x1A | PCI_SRIOV_VF_DID | VF Device ID | 16 位 | VF 的 Device ID |
| 0x1C | PCI_SRIOV_SUP_PGSIZE | Supported Page Sizes | 32 位 | 支持的页大小掩码(bit n 置位表示支持 2^(n+12) 字节,即 bit 0 对应 4KB) |
| 0x20 | PCI_SRIOV_SYS_PGSIZE | System Page Size | 32 位 | 由 PF 驱动/内核写入,实际使用的页大小(编码方式同上) |
| 0x24 | PCI_SRIOV_BAR | VF BAR0 | 32 位 | VF BAR 资源基地址,最多 6 个 BAR,依次偏移 4 字节 |
函数调用链(自上而下逐层调用):
- pci_scan_child_bus_extend
  - pci_scan_slot
    - pci_scan_single_device
      - pci_device_add
        - pci_init_capabilities
          - pci_iov_init
            - sriov_init
sriov_init函数基本流程:
/*
 * sriov_init() - read the SR-IOV capability at @pos and attach a pci_sriov
 * descriptor to @dev.
 * @dev: device that carries the SR-IOV capability
 * @pos: config-space offset of that capability
 *
 * Clears VF Enable if it was left set, negotiates the system page size,
 * sizes the VF BARs and records the capability fields in a newly allocated
 * struct pci_sriov that is stored in dev->sriov.
 *
 * Returns 0 on success, and also 0 (without allocating anything) when the
 * device reports TotalVFs == 0.  Returns -EIO when no usable page size is
 * advertised or a VF BAR is not page aligned, -ENOMEM when the allocation
 * fails, or the error returned by compute_max_vf_buses().
 */
static int sriov_init(struct pci_dev *dev, int pos)
{
int i, bar64;
int rc;
int nres;
u32 pgsz;
u16 ctrl, total;
struct pci_sriov *iov;
struct resource *res;
const char *res_name;
struct pci_dev *pdev;
u32 sriovbars[PCI_SRIOV_NUM_BARS];
/* If VF Enable is still set, zero the control word and wait one second. */
pci_read_config_word(dev, pos + PCI_SRIOV_CTRL, &ctrl);
if (ctrl & PCI_SRIOV_CTRL_VFE) {
pci_write_config_word(dev, pos + PCI_SRIOV_CTRL, 0);
ssleep(1);
}
/*
 * Look for a function on the same bus that is already flagged as a PF.
 * If one is found, jump straight to "found" with pdev pointing at it and
 * ctrl left at 0.  Otherwise pdev is reset to NULL and the ARI bit is
 * requested when ARI is enabled on the bus.
 */
ctrl = 0;
list_for_each_entry(pdev, &dev->bus->devices, bus_list)
if (pdev->is_physfn)
goto found;
pdev = NULL;
if (pci_ari_enabled(dev->bus))
ctrl |= PCI_SRIOV_CTRL_ARI;
found:
pci_write_config_word(dev, pos + PCI_SRIOV_CTRL, ctrl);
pci_read_config_word(dev, pos + PCI_SRIOV_TOTAL_VF, &total);
/* No VFs supported: nothing to set up. */
if (!total)
return 0;
/*
 * Pick the page size used for VF resources: read the supported-size mask,
 * drop every size smaller than the kernel PAGE_SIZE (the shift is taken
 * relative to 12, so bit 0 stands for 4K), then keep only the lowest
 * remaining bit, i.e. the smallest acceptable size.
 */
pci_read_config_dword(dev, pos + PCI_SRIOV_SUP_PGSIZE, &pgsz);
i = PAGE_SHIFT > 12 ? PAGE_SHIFT - 12 : 0;
pgsz &= ~((1 << i) - 1);
if (!pgsz)
return -EIO;
pgsz &= ~(pgsz - 1);
/* Write the negotiated page size back to the device for later use. */
pci_write_config_dword(dev, pos + PCI_SRIOV_SYS_PGSIZE, pgsz);
iov = kzalloc(sizeof(*iov), GFP_KERNEL);
if (!iov)
return -ENOMEM;
/* Size the VF BARs. */
/* Sizing SR-IOV BARs with VF Enable cleared - no decode */
__pci_size_stdbars(dev, PCI_SRIOV_NUM_BARS,
pos + PCI_SRIOV_BAR, sriovbars);
nres = 0;
for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
res = &dev->resource[i + PCI_IOV_RESOURCES];
res_name = pci_resource_name(dev, i + PCI_IOV_RESOURCES);
/*
 * If it is already FIXED, don't change it, something
 * (perhaps EA or header fixups) wants it this way.
 */
if (res->flags & IORESOURCE_PCI_FIXED)
bar64 = (res->flags & IORESOURCE_MEM_64) ? 1 : 0;
else
/* Initialize the resource describing this VF BAR. */
bar64 = __pci_read_base(dev, pci_bar_unknown, res,
pos + PCI_SRIOV_BAR + i * 4,
&sriovbars[i]);
/* Unimplemented BAR: skip it. */
if (!res->flags)
continue;
/* A per-VF BAR must be a multiple of the kernel page size. */
if (resource_size(res) & (PAGE_SIZE - 1)) {
rc = -EIO;
goto failed;
}
/*
 * At this point resource_size(res) is the size of one VF's BAR.
 * Remember it, then grow the resource so it covers all `total` VFs.
 */
iov->barsz[i] = resource_size(res);
resource_set_size(res, resource_size(res) * total);
pci_info(dev, "%s %pR: contains BAR %d for %d VFs\n",
res_name, res, i, total);
/*
 * bar64 is 1 for a 64-bit BAR in the FIXED branch above; presumably
 * __pci_read_base() reports the same, so this skips the slot holding
 * the upper half of a 64-bit BAR.
 */
i += bar64;
nres++;
}
iov->pos = pos;
iov->nres = nres;
iov->ctrl = ctrl;
iov->total_VFs = total;
iov->driver_max_VFs = total;
/* Device ID that the VFs will report. */
pci_read_config_word(dev, pos + PCI_SRIOV_VF_DID, &iov->vf_device);
iov->pgsz = pgsz;
iov->self = dev;
iov->drivers_autoprobe = true;
pci_read_config_dword(dev, pos + PCI_SRIOV_CAP, &iov->cap);
pci_read_config_byte(dev, pos + PCI_SRIOV_FUNC_LINK, &iov->link);
/* For this device type the link value is combined with dev's own slot to form a devfn. */
if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_END)
iov->link = PCI_DEVFN(PCI_SLOT(dev->devfn), iov->link);
/*
 * Prefer the PF that was already found on this bus (if any) as iov->dev
 * and take a reference on it; otherwise this device is its own iov->dev.
 */
if (pdev)
iov->dev = pci_dev_get(pdev);
else
iov->dev = dev;
dev->sriov = iov;
dev->is_physfn = 1;
/*
 * Check whether the VFs of this device fit on the bus numbers available.
 * Per the original note, iov->stride and iov->offset are filled in by
 * this call.
 */
rc = compute_max_vf_buses(dev);
if (rc)
goto fail_max_buses;
return 0;
/*
 * NOTE(review): when pdev was non-NULL, the reference taken with
 * pci_dev_get() above is not dropped on this path - confirm whether a
 * matching pci_dev_put() is needed here.
 */
fail_max_buses:
dev->sriov = NULL;
dev->is_physfn = 0;
failed:
/* Invalidate every IOV resource slot and free the descriptor. */
for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
res = &dev->resource[i + PCI_IOV_RESOURCES];
res->flags = 0;
}
kfree(iov);
return rc;
}
以 virtio-pci 为例,PF 驱动的 sriov_configure 回调(用户向 sysfs 的 sriov_numvfs 写入 VF 数量时被调用)基本流程:
/*
 * virtio_pci_sriov_configure() - enable or disable VFs on a virtio PCI PF.
 * @pci_dev: the physical function
 * @num_vfs: number of VFs requested; 0 disables SR-IOV
 *
 * Returns @num_vfs when the VFs were enabled, 0 after disabling, -EBUSY if
 * the virtio device has not reached DRIVER_OK, -EINVAL if the device did not
 * negotiate VIRTIO_F_SR_IOV, -EPERM while any VF is still assigned, or the
 * negative error from pci_enable_sriov().
 */
static int virtio_pci_sriov_configure(struct pci_dev *pci_dev, int num_vfs)
{
	struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev);
	struct virtio_device *vdev = &vp_dev->vdev;
	int err;

	/* The virtio driver must be fully up before VFs can be touched. */
	if (!(vdev->config->get_status(vdev) & VIRTIO_CONFIG_S_DRIVER_OK))
		return -EBUSY;

	/* The SR-IOV feature bit must have been negotiated. */
	if (!__virtio_test_bit(vdev, VIRTIO_F_SR_IOV))
		return -EINVAL;

	/* Refuse to reconfigure while some VF is still assigned elsewhere. */
	if (pci_vfs_assigned(pci_dev))
		return -EPERM;

	if (!num_vfs) {
		pci_disable_sriov(pci_dev);
		return 0;
	}

	err = pci_enable_sriov(pci_dev, num_vfs);
	return err < 0 ? err : num_vfs;
}
关键函数调用:pci_enable_sriov → sriov_enable → sriov_add_vfs → pci_iov_add_virtfn
sriov_enable 基本流程:
/*
 * sriov_enable() - enable @nr_virtfn virtual functions on the PF @dev.
 * @dev: physical function; dev->sriov must already be set up
 * @nr_virtfn: number of VFs to enable; 0 is a no-op
 *
 * Validates the request against the capability, checks that the IOV BARs and
 * bus numbers are available, sets NumVFs and VF Enable / VF MSE in the
 * SR-IOV control register, then creates the VF devices.
 *
 * Returns 0 on success (or when @nr_virtfn is 0).  Returns -EINVAL if VFs are
 * already enabled or @nr_virtfn is out of range, -EIO if InitialVFs is
 * inconsistent with the capability, -ENOMEM if IOV resources or bus numbers
 * are insufficient, -ENODEV / -ENOSYS if the dependency-link function is
 * missing or is not a PF, or the error returned by sysfs_create_link(),
 * pcibios_sriov_enable() or sriov_add_vfs().
 */
static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
{
	int rc;
	int i;
	int nres;
	u16 initial;
	struct resource *res;
	struct pci_dev *pdev;
	struct pci_sriov *iov = dev->sriov;
	int bars = 0;
	int bus;

	if (!nr_virtfn)
		return 0;

	/* VFs are already enabled: changing the count in place is rejected. */
	if (iov->num_VFs)
		return -EINVAL;

	/*
	 * InitialVFs may not exceed TotalVFs, and without the VF Migration
	 * capability it must be equal to TotalVFs.
	 */
	pci_read_config_word(dev, iov->pos + PCI_SRIOV_INITIAL_VF, &initial);
	if (initial > iov->total_VFs ||
	    (!(iov->cap & PCI_SRIOV_CAP_VFM) && (initial != iov->total_VFs)))
		return -EIO;

	/*
	 * The requested count must lie in [0, TotalVFs], and without the VF
	 * Migration capability it may not exceed InitialVFs either.
	 */
	if (nr_virtfn < 0 || nr_virtfn > iov->total_VFs ||
	    (!(iov->cap & PCI_SRIOV_CAP_VFM) && (nr_virtfn > initial)))
		return -EINVAL;

	/*
	 * Build the mask of IOV resource indices and count how many of them
	 * have actually been assigned (have a parent resource).
	 */
	nres = 0;
	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
		bars |= (1 << (i + PCI_IOV_RESOURCES));
		res = &dev->resource[i + PCI_IOV_RESOURCES];
		if (res->parent)
			nres++;
	}
	if (nres != iov->nres) {
		pci_err(dev, "not enough MMIO resources for SR-IOV\n");
		return -ENOMEM;
	}

	/*
	 * Bus number on which the last requested VF would appear.  Per the
	 * original note this is predicted from the offset/stride read for the
	 * current configuration, so it may be off for devices whose
	 * offset/stride change with NumVFs.
	 */
	bus = pci_iov_virtfn_bus(dev, nr_virtfn - 1);
	/* The bus range of this device cannot hold the last VF. */
	if (bus > dev->bus->busn_res.end) {
		pci_err(dev, "can't enable %d VFs (bus %02x out of range of %pR)\n",
			nr_virtfn, bus, &dev->bus->busn_res);
		return -ENOMEM;
	}

	if (pci_enable_resources(dev, bars)) {
		pci_err(dev, "SR-IOV: IOV BARS not allocated\n");
		return -ENOMEM;
	}

	/*
	 * The dependency link names a different function than this PF: that
	 * function must exist and be a PF itself; expose it through a
	 * "dep_link" symlink in this device's sysfs directory.
	 */
	if (iov->link != dev->devfn) {
		pdev = pci_get_slot(dev->bus, iov->link);
		if (!pdev)
			return -ENODEV;

		if (!pdev->is_physfn) {
			pci_dev_put(pdev);
			return -ENOSYS;
		}

		rc = sysfs_create_link(&dev->dev.kobj,
				       &pdev->dev.kobj, "dep_link");
		pci_dev_put(pdev);
		if (rc)
			return rc;
	}

	/* From here on `initial` is the number of VF devices to create. */
	iov->initial_VFs = initial;
	if (nr_virtfn < initial)
		initial = nr_virtfn;

	/* Architecture hook; per the original note it does nothing on x86. */
	rc = pcibios_sriov_enable(dev, initial);
	if (rc) {
		pci_err(dev, "failure %d from pcibios_sriov_enable()\n", rc);
		goto err_pcibios;
	}

	/* Program NumVFs, then set VF Enable and VF Memory Space Enable. */
	pci_iov_set_numvfs(dev, nr_virtfn);
	iov->ctrl |= PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE;
	pci_cfg_access_lock(dev);
	pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
	/* Wait 100 ms with config access blocked before touching the VFs. */
	msleep(100);
	pci_cfg_access_unlock(dev);

	rc = sriov_add_vfs(dev, initial);
	if (rc)
		goto err_pcibios;

	kobject_uevent(&dev->dev.kobj, KOBJ_CHANGE);
	iov->num_VFs = nr_virtfn;

	return 0;

err_pcibios:
	/* Undo everything: clear VFE/MSE, drop the link, reset NumVFs. */
	iov->ctrl &= ~(PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE);
	pci_cfg_access_lock(dev);
	pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
	ssleep(1);
	pci_cfg_access_unlock(dev);

	pcibios_sriov_disable(dev);

	if (iov->link != dev->devfn)
		sysfs_remove_link(&dev->dev.kobj, "dep_link");

	pci_iov_set_numvfs(dev, 0);
	return rc;
}
/*
 * sriov_add_vfs() - create the first @num_vfs VF devices of the PF @dev.
 * @dev: physical function
 * @num_vfs: number of VFs to add, ids 0 .. @num_vfs - 1
 *
 * Does nothing when dev->no_vf_scan is set.  If adding any VF fails, every
 * VF added so far is removed again in reverse order.
 *
 * Returns 0 on success or the error from pci_iov_add_virtfn().
 */
static int sriov_add_vfs(struct pci_dev *dev, u16 num_vfs)
{
	unsigned int added;
	unsigned int undo;
	int err;

	if (dev->no_vf_scan)
		return 0;

	for (added = 0; added < num_vfs; added++) {
		err = pci_iov_add_virtfn(dev, added);
		if (!err)
			continue;

		/* Roll back the VFs created before the failing one. */
		for (undo = added; undo-- > 0; )
			pci_iov_remove_virtfn(dev, undo);
		return err;
	}

	return 0;
}
/*
 * pci_iov_add_virtfn() - create and register VF number @id of the PF @dev.
 * @dev: physical function
 * @id: zero-based index of the VF
 *
 * Finds or creates the bus the VF lives on, scans the VF device, carves the
 * VF's BAR resources out of the PF's IOV resources, then adds the device,
 * links it in sysfs and hands it to the driver core.
 *
 * Returns 0 on success, -ENOMEM if no bus could be obtained, or the error
 * from pci_iov_scan_device() / pci_iov_sysfs_link().
 */
int pci_iov_add_virtfn(struct pci_dev *dev, int id)
{
	struct pci_bus *vf_bus;
	struct pci_dev *virtfn;
	struct resource *pf_res;
	struct resource *vf_res;
	int err, bar;
	u64 bar_sz;

	/* Add a bus for the VF, or find the one that already exists. */
	vf_bus = virtfn_add_bus(dev->bus, pci_iov_virtfn_bus(dev, id));
	if (!vf_bus)
		return -ENOMEM;

	/* Create the VF's pci_dev on that bus. */
	virtfn = pci_iov_scan_device(dev, id, vf_bus);
	if (IS_ERR(virtfn)) {
		err = PTR_ERR(virtfn);
		goto out_remove_bus;
	}

	virtfn->dev.parent = dev->dev.parent;
	virtfn->multifunction = 0;

	/*
	 * Each assigned IOV resource of the PF is split into equal slices;
	 * slice @id becomes the matching BAR of this VF.
	 */
	for (bar = 0; bar < PCI_SRIOV_NUM_BARS; bar++) {
		pf_res = &dev->resource[bar + PCI_IOV_RESOURCES];
		if (pf_res->parent) {
			vf_res = &virtfn->resource[bar];
			vf_res->name = pci_name(virtfn);
			vf_res->flags = pf_res->flags;
			bar_sz = pci_iov_resource_size(dev,
						       bar + PCI_IOV_RESOURCES);
			resource_set_range(vf_res,
					   pf_res->start + bar_sz * id, bar_sz);
			err = request_resource(pf_res, vf_res);
			BUG_ON(err);
		}
	}

	/* Configure the device and add it to its bus. */
	pci_device_add(virtfn, virtfn->bus);

	err = pci_iov_sysfs_link(dev, virtfn, id);
	if (err)
		goto out_remove_dev;

	/* Let the driver core match a driver to the new VF. */
	pci_bus_add_device(virtfn);
	return 0;

out_remove_dev:
	pci_stop_and_remove_bus_device(virtfn);
	/* Drops a reference on the PF; presumably taken while scanning the VF. */
	pci_dev_put(dev);
out_remove_bus:
	virtfn_remove_bus(dev->bus, vf_bus);
	return err;
}
/*
 * pci_iov_add_virtfn() - create and register VF number @id of the PF @dev.
 * @dev: physical function
 * @id: zero-based index of the VF
 *
 * Finds or creates the bus the VF lives on, scans the VF device, carves the
 * VF's BAR resources out of the PF's IOV resources, then adds the device,
 * links it in sysfs and hands it to the driver core.
 *
 * Returns 0 on success, -ENOMEM if no bus could be obtained, or the error
 * from pci_iov_scan_device() / pci_iov_sysfs_link().
 */
int pci_iov_add_virtfn(struct pci_dev *dev, int id)
{
	struct pci_bus *vf_bus;
	struct pci_dev *virtfn;
	struct resource *pf_res;
	struct resource *vf_res;
	int err, bar;
	u64 bar_sz;

	/* Add a bus for the VF, or find the one that already exists. */
	vf_bus = virtfn_add_bus(dev->bus, pci_iov_virtfn_bus(dev, id));
	if (!vf_bus)
		return -ENOMEM;

	/* Create the VF's pci_dev on that bus. */
	virtfn = pci_iov_scan_device(dev, id, vf_bus);
	if (IS_ERR(virtfn)) {
		err = PTR_ERR(virtfn);
		goto out_remove_bus;
	}

	virtfn->dev.parent = dev->dev.parent;
	virtfn->multifunction = 0;

	/*
	 * Each assigned IOV resource of the PF is split into equal slices;
	 * slice @id becomes the matching BAR of this VF.
	 */
	for (bar = 0; bar < PCI_SRIOV_NUM_BARS; bar++) {
		pf_res = &dev->resource[bar + PCI_IOV_RESOURCES];
		if (pf_res->parent) {
			vf_res = &virtfn->resource[bar];
			vf_res->name = pci_name(virtfn);
			vf_res->flags = pf_res->flags;
			bar_sz = pci_iov_resource_size(dev,
						       bar + PCI_IOV_RESOURCES);
			resource_set_range(vf_res,
					   pf_res->start + bar_sz * id, bar_sz);
			err = request_resource(pf_res, vf_res);
			BUG_ON(err);
		}
	}

	/* Configure the device and add it to its bus. */
	pci_device_add(virtfn, virtfn->bus);

	err = pci_iov_sysfs_link(dev, virtfn, id);
	if (err)
		goto out_remove_dev;

	/* Let the driver core match a driver to the new VF. */
	pci_bus_add_device(virtfn);
	return 0;

out_remove_dev:
	pci_stop_and_remove_bus_device(virtfn);
	/* Drops a reference on the PF; presumably taken while scanning the VF. */
	pci_dev_put(dev);
out_remove_bus:
	virtfn_remove_bus(dev->bus, vf_bus);
	return err;
}
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。