for循环中含有大量调用GPU计算的命令,有没有办法将所有任务打包后一次性发送给GPU进行计算,以减少CPU逐个调用GPU造成的开销?
以下为举例(需要将for里执行的7000次循环任务,一次性发送给GPU计算):
import torch
def accumulate_steps(
    a1: torch.Tensor,
    a2: torch.Tensor,
    e: torch.Tensor,
    f: torch.Tensor,
    b1: torch.Tensor,
    b2: torch.Tensor,
    c1: torch.Tensor,
    c2: torch.Tensor,
    d: torch.Tensor,
) -> None:
    """Run the per-column update loop using only fixed-shape tensor ops.

    For every column ``i`` of ``a1``/``a2`` the following is applied to the
    state tensors, which are all modified in place:

    * ``b1`` is set where ``a1[:, i]`` is True; ``b2`` where ``a2[:, i]`` is.
    * ``c1`` grows by ``e[i]`` wherever ``b1`` is set.
    * ``c2`` grows by the updated ``c1`` wherever ``b2`` is set.
    * ``d`` is incremented wherever ``b2`` is set.
    * ``b1`` is cleared wherever ``b2`` is set, then ``b2`` is cleared
      wherever ``f`` is set.

    The original formulation used boolean-mask read-modify-write
    (``x[mask] += v``).  That gathers the selected elements first, so the
    result size depends on the mask contents and the host has to wait for
    the device on every such statement.  Here each statement is an
    element-wise op over the full vector instead, so the host can keep
    queuing work without reading anything back.  Untouched elements are
    passed through ``torch.where`` unchanged, so results match the masked
    form element for element.

    Args:
        a1: Bool tensor, shape ``(num, length)``; column ``i`` sets ``b1``.
        a2: Bool tensor, shape ``(num, length)``; column ``i`` sets ``b2``.
        e: 1-D tensor with ``length`` entries; ``e[i]`` is added to ``c1``.
        f: Bool tensor, shape ``(num,)``; selects which ``b2`` entries are
            cleared at the end of each step.
        b1: Bool state, shape ``(num,)``; updated in place.
        b2: Bool state, shape ``(num,)``; updated in place.
        c1: Float accumulator, shape ``(num,)``; updated in place.
        c2: Float accumulator, shape ``(num,)``; updated in place.
        d: Integer counter, shape ``(num,)``; updated in place.
    """
    # NOTE(review): the pasted source had lost its indentation; every
    # statement after the `for` line (including the final reset of b2) is
    # taken to be part of the loop body -- confirm against the original.
    keep_b2 = ~f  # loop-invariant: b2 entries that survive the reset
    for i in range(a1.shape[1]):
        b1 |= a1[:, i]
        b2 |= a2[:, i]
        # torch.where keeps unselected elements bit-for-bit unchanged.
        c1.copy_(torch.where(b1, c1 + e[i], c1))
        c2.copy_(torch.where(b2, c2 + c1, c2))
        d += b2.to(d.dtype)
        b1 &= ~b2
        b2 &= keep_b2
    # Every step above has a fixed shape and needs no host readback, so the
    # loop is a candidate for CUDA Graph capture (torch.cuda.CUDAGraph) if
    # kernel-launch overhead still dominates; that is not done here.


if __name__ == "__main__":
    length = 7000
    num = 500000
    device = "cuda:0"
    a1 = torch.randint(0, 2, (num, length), dtype=torch.bool, device=device)
    a2 = torch.randint(0, 2, (num, length), dtype=torch.bool, device=device)
    b1 = torch.zeros(num, dtype=torch.bool, device=device)
    b2 = torch.zeros(num, dtype=torch.bool, device=device)
    c1 = torch.zeros(num, dtype=torch.float32, device=device)
    c2 = torch.zeros(num, dtype=torch.float32, device=device)
    d = torch.zeros(num, dtype=torch.int32, device=device)
    e = torch.randint(0, 101, (length,), dtype=torch.float32, device=device)
    f = torch.ones(num, dtype=torch.bool, device=device)
    accumulate_steps(a1, a2, e, f, b1, b2, c1, c2, d)