emmansun / base64

Base64 with SIMD acceleration
https://godoc.org/github.com/emmansun/base64
BSD 3-Clause "New" or "Revised" License
3 stars 0 forks source link

About the performance of EncodeToString & DecodeString #5

Closed emmansun closed 11 months ago

emmansun commented 11 months ago

The test

// EncodeToString returns the base64 encoding of src.
//
// It allocates a scratch buffer of enc.EncodedLen(len(src)) bytes, encodes
// src into that buffer, and then converts the buffer to a string.
func (enc *Encoding) EncodeToString(src []byte) string {
    // A fresh output buffer is allocated on every call.
    buf := make([]byte, enc.EncodedLen(len(src)))
    enc.Encode(buf, src)
    // The string conversion copies the encoded bytes a second time.
    return string(buf)
}

// BenchmarkEncodeToString measures StdEncoding.EncodeToString on an 8192-byte
// zero-filled input and reports throughput relative to the input size.
func BenchmarkEncodeToString(b *testing.B) {
    const inputSize = 8192

    src := make([]byte, inputSize)
    b.SetBytes(inputSize)
    for iter := 0; iter < b.N; iter++ {
        StdEncoding.EncodeToString(src)
    }
}

AVX2:

goos: windows
goarch: amd64
pkg: github.com/emmansun/base64
cpu: Intel(R) Core(TM) i5-9500 CPU @ 3.00GHz
BenchmarkEncodeToString-6         252020          4846 ns/op    1690.43 MB/s
BenchmarkEncodeToString-6         283879          4747 ns/op    1725.75 MB/s
BenchmarkEncodeToString-6         235201          4922 ns/op    1664.32 MB/s
BenchmarkEncodeToString-6         258640          4806 ns/op    1704.61 MB/s
BenchmarkEncodeToString-6         308361          4792 ns/op    1709.40 MB/s
BenchmarkEncodeToString-6         261927          4644 ns/op    1764.12 MB/s
BenchmarkEncodeToString-6         290762          4841 ns/op    1692.12 MB/s
BenchmarkEncodeToString-6         307858          4885 ns/op    1677.03 MB/s
BenchmarkEncodeToString-6         246835          4612 ns/op    1776.05 MB/s
BenchmarkEncodeToString-6         251816          4965 ns/op    1649.94 MB/s

But if we benchmark encodeSIMD directly, with a preallocated destination buffer:

// BenchmarkEncodeSIMD measures encodeSIMD with the standard lookup table,
// writing into a destination buffer that is allocated once before the loop.
func BenchmarkEncodeSIMD(b *testing.B) {
    const inputSize = 8192

    var (
        src = make([]byte, inputSize)
        out = make([]byte, StdEncoding.EncodedLen(inputSize))
    )
    b.SetBytes(inputSize)
    for iter := 0; iter < b.N; iter++ {
        encodeSIMD(out, src, &encodeStdLut)
    }
}

The performance is as below:

goos: windows
goarch: amd64
pkg: github.com/emmansun/base64
cpu: Intel(R) Core(TM) i5-9500 CPU @ 3.00GHz
BenchmarkEncodeSIMD
BenchmarkEncodeSIMD-6
 2501653           486.4 ns/op  16842.78 MB/s          0 B/op          0 allocs/op
PASS
ok      github.com/emmansun/base64  2.044s
// BenchmarkEncode measures StdEncoding.Encode into a destination buffer that
// is allocated once, outside the timed loop.
func BenchmarkEncode(b *testing.B) {
    const inputSize = 8192

    var (
        src = make([]byte, inputSize)
        out = make([]byte, StdEncoding.EncodedLen(inputSize))
    )
    b.SetBytes(inputSize)
    for iter := 0; iter < b.N; iter++ {
        StdEncoding.Encode(out, src)
    }
}

The performance is as below:

goos: windows
goarch: amd64
pkg: github.com/emmansun/base64
cpu: Intel(R) Core(TM) i5-9500 CPU @ 3.00GHz
BenchmarkEncode
BenchmarkEncode-6
 2386078           497.4 ns/op  16471.10 MB/s          0 B/op          0 allocs/op
PASS
ok      github.com/emmansun/base64  2.757s

After moving the destination slice allocation (make) inside the loop:

// BenchmarkEncode measures StdEncoding.Encode when the destination buffer is
// allocated anew on every iteration, so the allocation cost is part of the
// timed work.
func BenchmarkEncode(b *testing.B) {
    const inputSize = 8192

    src := make([]byte, inputSize)
    b.SetBytes(inputSize)
    for iter := 0; iter < b.N; iter++ {
        // Deliberately inside the loop: this allocation is what is being measured.
        out := make([]byte, StdEncoding.EncodedLen(inputSize))
        StdEncoding.Encode(out, src)
    }
}

The performance becomes:

goos: windows
goarch: amd64
pkg: github.com/emmansun/base64
cpu: Intel(R) Core(TM) i5-9500 CPU @ 3.00GHz
BenchmarkEncode
BenchmarkEncode-6
  468992          2571 ns/op    3186.12 MB/s       12288 B/op          1 allocs/op
PASS
ok      github.com/emmansun/base64  2.344s

So, the main bottleneck is:

    buf := make([]byte, enc.EncodedLen(len(src)))
    return string(buf)
emmansun commented 11 months ago

Before go 1.20:

// ByteSlice2String reinterprets bs as a string without copying its contents.
//
// The result aliases the memory of bs, so the caller must not modify bs for
// as long as the returned string is in use.
func ByteSlice2String(bs []byte) string {
    // Read the slice variable's leading pointer and length words as a string.
    header := unsafe.Pointer(&bs)
    return *(*string)(header)
}

// String2ByteSlice reinterprets str as a []byte without copying its contents.
//
// It returns nil for the empty string. Otherwise the result has length and
// capacity len(str) and aliases the string's memory. Go strings are
// immutable, so the returned slice must never be written to.
func String2ByteSlice(str string) []byte {
    if str == "" {
        return nil
    }
    // Build a value whose memory layout matches a slice header: the string
    // contributes the data pointer and length, followed by the capacity.
    // Keeping the pointer inside a string-typed field, rather than copying it
    // into a uintptr, lets the garbage collector keep tracking it as a
    // reference.
    hdr := struct {
        data string
        cap  int
    }{str, len(str)}
    return *(*[]byte)(unsafe.Pointer(&hdr))
}

After go 1.20:

// ByteSlice2String returns a string that shares memory with bs instead of
// copying it.
//
// The caller must not modify bs for as long as the returned string is in use.
func ByteSlice2String(bs []byte) string {
    n := len(bs)
    first := unsafe.SliceData(bs)
    return unsafe.String(first, n)
}

// String2ByteSlice returns a []byte that shares memory with str instead of
// copying it, or nil when str is empty.
//
// Go strings are immutable, so the returned slice must never be written to.
func String2ByteSlice(str string) []byte {
    n := len(str)
    if n == 0 {
        return nil
    }
    return unsafe.Slice(unsafe.StringData(str), n)
}
emmansun commented 11 months ago

After change:

BenchmarkEncodeToString-6         499687          2492 ns/op    3287.55 MB/s
BenchmarkEncodeToString-6         394807          2598 ns/op    3153.17 MB/s
BenchmarkEncodeToString-6         625746          2550 ns/op    3212.68 MB/s
BenchmarkEncodeToString-6         567187          2700 ns/op    3034.48 MB/s
BenchmarkEncodeToString-6         592618          2601 ns/op    3150.00 MB/s
BenchmarkEncodeToString-6         596892          2574 ns/op    3182.74 MB/s
BenchmarkEncodeToString-6         593444          2496 ns/op    3282.64 MB/s
BenchmarkEncodeToString-6         595665          2563 ns/op    3195.92 MB/s
BenchmarkEncodeToString-6         594034          2582 ns/op    3173.25 MB/s
BenchmarkEncodeToString-6         584518          2651 ns/op    3089.97 MB/s
emmansun commented 11 months ago

string to []byte case:

// BenchmarkDecode measures StdEncoding.Decode on the encoding of 8192 zero
// bytes. The string-to-[]byte conversion and the destination buffer are
// prepared once, before the timer is reset.
func BenchmarkDecode(b *testing.B) {
    encoded := StdEncoding.EncodeToString(make([]byte, 8192))
    src := []byte(encoded)
    out := make([]byte, StdEncoding.DecodedLen(len(src)))

    b.SetBytes(int64(len(src)))
    b.ResetTimer()
    for iter := 0; iter < b.N; iter++ {
        StdEncoding.Decode(out, src)
    }
}

The performance is:

goos: windows
goarch: amd64
pkg: github.com/emmansun/base64
cpu: Intel(R) Core(TM) i5-9500 CPU @ 3.00GHz
BenchmarkDecode
BenchmarkDecode-6
 1903186           645.4 ns/op  16925.07 MB/s          0 B/op          0 allocs/op
func BenchmarkDecode(b *testing.B) {
    data := StdEncoding.EncodeToString(make([]byte, 8192)))
    dbuf := make([]byte, StdEncoding.DecodedLen(len(data)))
    b.SetBytes(int64(len(data)))
    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        StdEncoding.Decode(dbuf, []byte(data))
    }
}

The performance is:

goos: windows
goarch: amd64
pkg: github.com/emmansun/base64
cpu: Intel(R) Core(TM) i5-9500 CPU @ 3.00GHz
BenchmarkDecode
BenchmarkDecode-6
  358167          2975 ns/op    3671.33 MB/s       12288 B/op          1 allocs/op
emmansun commented 11 months ago

After optimizing the []byte-to-string and string-to-[]byte conversions, the performance is:

goos: windows
goarch: amd64
pkg: github.com/emmansun/base64
cpu: Intel(R) Core(TM) i5-9500 CPU @ 3.00GHz
BenchmarkEncode-6                2404890           498.6 ns/op  16428.37 MB/s
BenchmarkEncode-6                2429791           499.5 ns/op  16401.74 MB/s
BenchmarkEncode-6                2399318           497.7 ns/op  16459.59 MB/s
BenchmarkEncode-6                2404642           490.6 ns/op  16697.28 MB/s
BenchmarkEncode-6                2453055           500.5 ns/op  16367.42 MB/s
BenchmarkEncode-6                2424813           492.8 ns/op  16623.91 MB/s
BenchmarkEncode-6                2444425           490.6 ns/op  16696.35 MB/s
BenchmarkEncode-6                2445487           494.6 ns/op  16561.92 MB/s
BenchmarkEncode-6                2448139           495.5 ns/op  16534.40 MB/s
BenchmarkEncode-6                2411620           490.3 ns/op  16708.81 MB/s
BenchmarkDecode-6                1902096           628.0 ns/op  17395.13 MB/s
BenchmarkDecode-6                1834161           644.2 ns/op  16957.14 MB/s
BenchmarkDecode-6                1894376           638.4 ns/op  17111.31 MB/s
BenchmarkDecode-6                1897893           635.1 ns/op  17200.43 MB/s
BenchmarkDecode-6                1861186           642.2 ns/op  17011.18 MB/s
BenchmarkDecode-6                1907582           646.2 ns/op  16904.31 MB/s
BenchmarkDecode-6                1838038           634.4 ns/op  17219.81 MB/s
BenchmarkDecode-6                1894174           644.6 ns/op  16946.33 MB/s
BenchmarkDecode-6                1934805           639.3 ns/op  17087.41 MB/s
BenchmarkDecode-6                1919010           677.0 ns/op  16135.80 MB/s
BenchmarkEncodeToString-6         475642          2547 ns/op    3216.37 MB/s
BenchmarkEncodeToString-6         394857          2705 ns/op    3027.95 MB/s
BenchmarkEncodeToString-6         437206          2680 ns/op    3057.03 MB/s
BenchmarkEncodeToString-6         641666          2604 ns/op    3146.04 MB/s
BenchmarkEncodeToString-6         467473          2685 ns/op    3050.47 MB/s
BenchmarkEncodeToString-6         580461          2685 ns/op    3050.71 MB/s
BenchmarkEncodeToString-6         557205          2555 ns/op    3205.69 MB/s
BenchmarkEncodeToString-6         572431          2702 ns/op    3031.94 MB/s
BenchmarkEncodeToString-6         596762          2605 ns/op    3145.11 MB/s
BenchmarkEncodeToString-6         525586          2685 ns/op    3050.97 MB/s
BenchmarkDecodeString/2-6       41973010            27.84 ns/op  143.70 MB/s
BenchmarkDecodeString/2-6       59267454            27.79 ns/op  143.94 MB/s
BenchmarkDecodeString/2-6       39362974            27.96 ns/op  143.08 MB/s
BenchmarkDecodeString/2-6       43633254            27.65 ns/op  144.65 MB/s
BenchmarkDecodeString/2-6       51420931            28.03 ns/op  142.70 MB/s
BenchmarkDecodeString/2-6       48159697            27.76 ns/op  144.10 MB/s
BenchmarkDecodeString/2-6       59073433            28.23 ns/op  141.70 MB/s
BenchmarkDecodeString/2-6       39309334            27.70 ns/op  144.39 MB/s
BenchmarkDecodeString/2-6       38879744            27.87 ns/op  143.54 MB/s
BenchmarkDecodeString/2-6       45519030            27.80 ns/op  143.90 MB/s
BenchmarkDecodeString/4-6       38823140            32.33 ns/op  247.46 MB/s
BenchmarkDecodeString/4-6       39018553            32.70 ns/op  244.64 MB/s
BenchmarkDecodeString/4-6       38788881            32.35 ns/op  247.29 MB/s
BenchmarkDecodeString/4-6       38905333            32.06 ns/op  249.50 MB/s
BenchmarkDecodeString/4-6       32061386            32.29 ns/op  247.79 MB/s
BenchmarkDecodeString/4-6       39306373            32.31 ns/op  247.60 MB/s
BenchmarkDecodeString/4-6       37433871            32.27 ns/op  247.92 MB/s
BenchmarkDecodeString/4-6       37938576            32.17 ns/op  248.66 MB/s
BenchmarkDecodeString/4-6       39209150            31.84 ns/op  251.22 MB/s
BenchmarkDecodeString/4-6       39482125            32.16 ns/op  248.73 MB/s
BenchmarkDecodeString/8-6       29318920            39.44 ns/op  304.28 MB/s
BenchmarkDecodeString/8-6       30925682            39.36 ns/op  304.84 MB/s
BenchmarkDecodeString/8-6       34237389            39.58 ns/op  303.21 MB/s
BenchmarkDecodeString/8-6       31749223            38.97 ns/op  307.92 MB/s
BenchmarkDecodeString/8-6       29708264            39.44 ns/op  304.29 MB/s
BenchmarkDecodeString/8-6       29762199            39.47 ns/op  304.01 MB/s
BenchmarkDecodeString/8-6       39575746            39.46 ns/op  304.11 MB/s
BenchmarkDecodeString/8-6       29032491            40.03 ns/op  299.80 MB/s
BenchmarkDecodeString/8-6       29604630            38.56 ns/op  311.24 MB/s
BenchmarkDecodeString/8-6       29326730            40.38 ns/op  297.17 MB/s
BenchmarkDecodeString/64-6      18011283            67.68 ns/op 1300.14 MB/s
BenchmarkDecodeString/64-6      18115723            69.04 ns/op 1274.68 MB/s
BenchmarkDecodeString/64-6      19608483            67.40 ns/op 1305.58 MB/s
BenchmarkDecodeString/64-6      17424132            67.96 ns/op 1294.83 MB/s
BenchmarkDecodeString/64-6      17977824            68.02 ns/op 1293.76 MB/s
BenchmarkDecodeString/64-6      16754557            67.67 ns/op 1300.42 MB/s
BenchmarkDecodeString/64-6      15259099            67.71 ns/op 1299.69 MB/s
BenchmarkDecodeString/64-6      18463298            67.26 ns/op 1308.41 MB/s
BenchmarkDecodeString/64-6      17852200            66.17 ns/op 1329.91 MB/s
BenchmarkDecodeString/64-6      17805157            67.93 ns/op 1295.47 MB/s
BenchmarkDecodeString/8192-6              565189          2203 ns/op    4957.69 MB/s
BenchmarkDecodeString/8192-6              758221          2180 ns/op    5010.81 MB/s
BenchmarkDecodeString/8192-6              619866          2232 ns/op    4893.70 MB/s
BenchmarkDecodeString/8192-6              474676          2209 ns/op    4944.55 MB/s
BenchmarkDecodeString/8192-6              525614          2118 ns/op    5158.24 MB/s
BenchmarkDecodeString/8192-6              558560          2112 ns/op    5172.22 MB/s
BenchmarkDecodeString/8192-6              591589          2156 ns/op    5066.09 MB/s
BenchmarkDecodeString/8192-6              636424          2151 ns/op    5078.55 MB/s
BenchmarkDecodeString/8192-6              567975          2247 ns/op    4862.39 MB/s
BenchmarkDecodeString/8192-6             1000000          2193 ns/op    4981.74 MB/s
PASS
ok      github.com/emmansun/base64  121.747s

The main bottleneck is now the []byte allocation.

emmansun commented 11 months ago

avx vs avx2.

goos: windows
goarch: amd64
pkg: github.com/emmansun/base64
cpu: Intel(R) Core(TM) i5-9500 CPU @ 3.00GHz
                    │   avx.txt    │              avx2.txt               │
                    │    sec/op    │   sec/op     vs base                │
Encode-6              1002.5n ± 1%   495.1n ± 1%  -50.62% (p=0.000 n=10)
Decode-6              1546.5n ± 4%   640.7n ± 1%  -58.57% (p=0.000 n=10)
EncodeToString-6       3.201µ ± 2%   2.683µ ± 5%  -16.18% (p=0.000 n=10)
DecodeString/2-6       27.95n ± 1%   27.82n ± 1%        ~ (p=0.289 n=10)
DecodeString/4-6       32.34n ± 1%   32.28n ± 1%        ~ (p=0.494 n=10)
DecodeString/8-6       39.12n ± 0%   39.45n ± 1%   +0.84% (p=0.034 n=10)
DecodeString/64-6      69.97n ± 2%   67.69n ± 1%   -3.24% (p=0.000 n=10)
DecodeString/8192-6    3.227µ ± 2%   2.187µ ± 3%  -32.23% (p=0.000 n=10)
geomean                281.7n        214.4n       -23.89%

                    │   avx.txt    │                avx2.txt                │
                    │     B/s      │      B/s       vs base                 │
Encode-6              7.612Gi ± 1%   15.412Gi ± 1%  +102.46% (p=0.000 n=10)
Decode-6              6.578Gi ± 3%   15.878Gi ± 1%  +141.37% (p=0.000 n=10)
EncodeToString-6      2.384Gi ± 2%    2.844Gi ± 5%   +19.31% (p=0.000 n=10)
DecodeString/2-6      136.5Mi ± 1%    137.1Mi ± 1%         ~ (p=0.315 n=10)
DecodeString/4-6      235.9Mi ± 1%    236.4Mi ± 1%         ~ (p=0.529 n=10)
DecodeString/8-6      292.6Mi ± 0%    290.1Mi ± 1%    -0.84% (p=0.035 n=10)
DecodeString/64-6     1.171Gi ± 2%    1.211Gi ± 1%    +3.35% (p=0.000 n=10)
DecodeString/8192-6   3.153Gi ± 2%    4.653Gi ± 3%   +47.58% (p=0.000 n=10)
geomean               1.184Gi         1.556Gi        +31.38%
emmansun commented 11 months ago

golang sdk vs. purego:

goos: windows
goarch: amd64
pkg: github.com/emmansun/base64
cpu: Intel(R) Core(TM) i5-9500 CPU @ 3.00GHz
                    │   sdk.txt    │             purego.txt              │
                    │    sec/op    │   sec/op     vs base                │
EncodeToString-6      11.774µ ± 2%   9.267µ ± 1%  -21.29% (p=0.000 n=10)
DecodeString/2-6       31.80n ± 0%   28.63n ± 1%   -9.97% (p=0.000 n=10)
DecodeString/4-6       35.00n ± 1%   32.89n ± 2%   -6.02% (p=0.000 n=10)
DecodeString/8-6       41.85n ± 1%   39.97n ± 2%   -4.48% (p=0.000 n=10)
DecodeString/64-6      154.7n ± 1%   112.0n ± 2%  -27.61% (p=0.000 n=10)
DecodeString/8192-6   12.630µ ± 1%   9.836µ ± 1%  -22.12% (p=0.000 n=10)
geomean                319.9n        269.6n       -15.71%

                    │   sdk.txt    │              purego.txt               │
                    │     B/s      │      B/s       vs base                │
EncodeToString-6      663.6Mi ± 2%    843.1Mi ± 1%  +27.05% (p=0.000 n=10)
DecodeString/2-6      120.0Mi ± 0%    133.2Mi ± 1%  +11.06% (p=0.000 n=10)
DecodeString/4-6      218.0Mi ± 1%    232.0Mi ± 2%   +6.39% (p=0.000 n=10)
DecodeString/8-6      273.5Mi ± 1%    286.3Mi ± 2%   +4.69% (p=0.000 n=10)
DecodeString/64-6     542.8Mi ± 1%    749.7Mi ± 2%  +38.12% (p=0.000 n=10)
DecodeString/8192-6   824.8Mi ± 1%   1059.2Mi ± 1%  +28.41% (p=0.000 n=10)
geomean               358.6Mi         425.4Mi       +18.63%