apple / coremltools

Core ML tools contain supporting tools for Core ML model conversion, editing, and validation.
https://coremltools.readme.io
BSD 3-Clause "New" or "Revised" License

Enhance torch.zeros_like for GPU inference in dynamic shape #2369

Closed M-Quadra closed 3 weeks ago

M-Quadra commented 1 month ago

Add dtype support and a unit test. The old PR #2345 was an incorrect implementation.
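For reference, the two lowerings of torch.zeros_like compared below can be sketched with the MIL builder. This is a minimal sketch, not the converter's actual code; zeros_via_sub and zeros_via_mul are illustrative names:

from coremltools.converters.mil import Builder as mb

# zeros_like as x - x: the result tracks x's dynamic shape and dtype.
@mb.program(input_specs=[mb.TensorSpec(shape=(2,))])
def zeros_via_sub(x):
    return mb.sub(x=x, y=x)

# zeros_like as x * 0: also shape- and dtype-preserving.
@mb.program(input_specs=[mb.TensorSpec(shape=(2,))])
def zeros_via_mul(x):
    return mb.mul(x=x, y=0.0)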

M-Quadra commented 3 weeks ago

The model:

import torch
from torch import nn
from typing import Final
import coremltools as ct
from coremltools.converters.mil.mil import types

class Model(nn.Module):
    def forward(self, x):
        return torch.zeros_like(x)

model = Model().eval()
x = torch.randn(100)
traced_model = torch.jit.trace(model, (x,))

var_dim: Final[ct.RangeDim] = ct.RangeDim(1, 10_000)
mlmodel = ct.convert(
    traced_model,
    inputs=[ct.TensorType(name="x", shape=ct.Shape([var_dim]), dtype=types.fp16)],
    outputs=[ct.TensorType(name="y")],
    minimum_deployment_target=ct.target.iOS16,
)
mlmodel.save("Sub.mlpackage")
# mlmodel.save("Mul.mlpackage")

Then compile each mlpackage to an mlmodelc and generate the Swift model interfaces (the Sub and Mul classes used in the test below):

xcrun coremlcompiler compile Sub.mlpackage .
xcrun coremlcompiler generate Sub.mlpackage . --language Swift
xcrun coremlcompiler compile Mul.mlpackage .
xcrun coremlcompiler generate Mul.mlpackage . --language Swift

The test case:

func runTest() {
    struct Msg: Codable {
        let infers: [TimeInterval]
    }
    // Run each benchmark off the main thread and dump the per-inference timings to JSON.
    DispatchQueue.global().async {
        let infers = try! testSub(cnt: 10_000)
        let data = try! JSONEncoder().encode(Msg(infers: infers))
        let url = URL.documentsDirectory.appending(path: "./sub.json")
        try! data.write(to: url)
    }
    DispatchQueue.global().async {
        let infers = try! testMul(cnt: 10_000)
        let data = try! JSONEncoder().encode(Msg(infers: infers))
        let url = URL.documentsDirectory.appending(path: "./mul.json")
        try! data.write(to: url)
    }
}

func testMul(cnt: Int) throws -> [TimeInterval] {
    let cfg = MLModelConfiguration()
    cfg.computeUnits = .cpuAndGPU
    // Measure model load time.
    let st = Date()
    let model = try Mul(configuration: consume cfg)
    let load = Date().timeIntervalSince(consume st)
    print("mul:", load)

    // Time one prediction per dynamic shape [1] ... [cnt - 1].
    return try [TimeInterval](unsafeUninitializedCapacity: cnt) { buffer, initializedCount in
        buffer[0] = 1 // placeholder; the loop starts at shape [1]
        for i in 1..<cnt {
            let x = try MLMultiArray(shape: [i as NSNumber], dataType: .float16)
            let st = Date()
            _ = try model.prediction(x: consume x)
            buffer[i] = Date().timeIntervalSince(consume st)
        }
        initializedCount = cnt
    }
}

func testSub(cnt: Int) throws -> [TimeInterval] {
    let cfg = MLModelConfiguration()
    cfg.computeUnits = .cpuAndGPU
    // Measure model load time.
    let st = Date()
    let model = try Sub(configuration: consume cfg)
    let load = Date().timeIntervalSince(consume st)
    print("sub:", load)

    // Time one prediction per dynamic shape [1] ... [cnt - 1].
    return try [TimeInterval](unsafeUninitializedCapacity: cnt) { buffer, initializedCount in
        buffer[0] = 1 // placeholder; the loop starts at shape [1]
        for i in 1..<cnt {
            let x = try MLMultiArray(shape: [i as NSNumber], dataType: .float16)
            let st = Date()
            _ = try model.prediction(x: consume x)
            buffer[i] = Date().timeIntervalSince(consume st)
        }
        initializedCount = cnt
    }
}

The result (iPhone XR, iOS 18.0.1, Low Power Mode off):

import json
import numpy as np
import matplotlib.pyplot as plt

# Per-inference timings dumped by the Swift test (copied from the device's Documents directory).
mul_infers = json.load(open('mul.json'))['infers']
sub_infers = json.load(open('sub.json'))['infers']

avg_mul, std_mul = np.mean(mul_infers), np.std(mul_infers)
avg_sub, std_sub = np.mean(sub_infers), np.std(sub_infers)
print(f'mul, avg:{avg_mul: .6f}, std:{std_mul: .6f}')
print(f'sub, avg:{avg_sub: .6f}, std:{std_sub: .6f}')

avg_mul_5k, std_mul_5k = np.mean(mul_infers[5000:]), np.std(mul_infers[5000:])
avg_sub_5k, std_sub_5k = np.mean(sub_infers[5000:]), np.std(sub_infers[5000:])
print(f'mul_5k, avg:{avg_mul_5k: .6f}, std:{std_mul_5k: .6f}')
print(f'sub_5k, avg:{avg_sub_5k: .6f}, std:{std_sub_5k: .6f}')

Output:

mul, avg: 0.017661, std: 0.010109
sub, avg: 0.017655, std: 0.010110
mul_5k, avg: 0.019093, std: 0.001480
sub_5k, avg: 0.019091, std: 0.001475
# Model load times (seconds) printed by the load measurements, over 10 runs.
mul_loads = [
    0.5651620626449585,
    0.5372270345687866,
    0.5468639135360718,
    0.6307599544525146,
    0.5672019720077515,
    0.5513859987258911,
    0.564581036567688,
    0.5799169540405273,
    0.5120859146118164,
    0.6692310571670532,
]
sub_loads = [
    0.5627679824829102,
    0.5371979475021362,
    0.5452049970626831,
    0.6308039426803589,
    0.5673099756240845,
    0.5501949787139893,
    0.5646369457244873,
    0.5798110961914062,
    0.5121839046478271,
    0.6110190153121948,
]

avg_mul_loads, std_mul_loads = np.mean(mul_loads), np.std(mul_loads)
avg_sub_loads, std_sub_loads = np.mean(sub_loads), np.std(sub_loads)
print(f'mul_loads, avg:{avg_mul_loads: .6f}, std:{std_mul_loads: .6f}')
print(f'sub_loads, avg:{avg_sub_loads: .6f}, std:{std_sub_loads: .6f}')

Output:

mul_loads, avg: 0.572442, std: 0.043529
sub_loads, avg: 0.566113, std: 0.032922

# 100-sample rolling mean of the per-inference timings.
window_size = 100
avg_mul_infers = [np.mean(mul_infers[i-window_size:i]) for i in range(window_size, len(mul_infers) + 1)]
avg_sub_infers = [np.mean(sub_infers[i-window_size:i]) for i in range(window_size, len(sub_infers) + 1)]

x = range(window_size, window_size+len(avg_mul_infers))
plt.figure(figsize=(10, 6))
plt.plot(x, avg_sub_infers, label='sub-avg')
plt.plot(x, avg_mul_infers, label='mul-avg')
plt.legend()
plt.show()

[Figure 1: 100-sample rolling average of per-inference time, sub-avg vs mul-avg]

# 100-sample rolling standard deviation of the per-inference timings.
std_mul_infers = [np.std(mul_infers[i-window_size:i]) for i in range(window_size, len(mul_infers) + 1)]
std_sub_infers = [np.std(sub_infers[i-window_size:i]) for i in range(window_size, len(sub_infers) + 1)]

x = range(window_size, window_size+len(std_mul_infers))
plt.figure(figsize=(10, 6))
plt.plot(x, std_sub_infers, label='sub-std')
plt.plot(x, std_mul_infers, label='mul-std')
plt.legend()
plt.show()

[Figure 2: 100-sample rolling standard deviation of per-inference time, sub-std vs mul-std]

Although the performance difference is minimal, mb.sub demonstrates a slight edge over mb.mul.
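
For scale, continuing the analysis script above, the steady-state gap works out to roughly 0.01% in mean and 0.3% in standard deviation:

# Relative gap between the two lowerings on the steady-state window (last 5k inferences).
rel_avg = (avg_mul_5k - avg_sub_5k) / avg_sub_5k  # ≈ 0.01%
rel_std = (std_mul_5k - std_sub_5k) / std_sub_5k  # ≈ 0.3%
print(f'steady-state gap, avg:{rel_avg: .4%}, std:{rel_std: .4%}')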

YifanShenSZ commented 3 weeks ago

Sounds good. Will merge once CI is green.

https://gitlab.com/coremltools1/coremltools/-/commit/a1ce3f158bc872d516bb3abb02042fcea633d175/pipelines