Closed emmansun closed 11 months ago
Before Go 1.20:
// ByteSlice2String returns a string view of bs without copying the bytes.
//
// It reinterprets the slice header as a string header: the leading two words
// of a slice (data pointer and length) have the same layout as a string, and
// the trailing capacity word is simply ignored. The result aliases bs, so bs
// must not be modified while the returned string is in use.
func ByteSlice2String(bs []byte) string {
	hdr := unsafe.Pointer(&bs)
	return *(*string)(hdr)
}
// String2ByteSlice returns a []byte view of str without copying the bytes.
// An empty string yields a nil slice. The result has len and cap equal to
// len(str) and aliases the string's storage, so it must never be written to.
//
// The slice header is built from a struct whose first field is the string
// itself, so the data pointer stays a typed pointer for its whole lifetime.
// Copying the pointer through uintptr values instead would hide it from the
// garbage collector between the load and the conversion back, which the
// unsafe.Pointer rules do not permit.
func String2ByteSlice(str string) []byte {
	if str == "" {
		return nil
	}
	// Layout: {data pointer, len} from the string, followed by cap; this
	// matches the three-word layout of a slice header.
	hdr := struct {
		data string
		cap  int
	}{str, len(str)}
	return *(*[]byte)(unsafe.Pointer(&hdr))
}
Since Go 1.20:
// ByteSlice2String returns a string view of bs without copying the bytes,
// using the unsafe.SliceData / unsafe.String helpers. The result aliases bs,
// so bs must not be modified while the returned string is in use.
func ByteSlice2String(bs []byte) string {
	data, size := unsafe.SliceData(bs), len(bs)
	return unsafe.String(data, size)
}
// String2ByteSlice returns a []byte view of str without copying the bytes,
// using the unsafe.StringData / unsafe.Slice helpers. An empty string yields
// a nil slice. The result aliases the string's storage and must never be
// written to.
func String2ByteSlice(str string) []byte {
	n := len(str)
	if n == 0 {
		return nil
	}
	return unsafe.Slice(unsafe.StringData(str), n)
}
After change:
BenchmarkEncodeToString-6 499687 2492 ns/op 3287.55 MB/s
BenchmarkEncodeToString-6 394807 2598 ns/op 3153.17 MB/s
BenchmarkEncodeToString-6 625746 2550 ns/op 3212.68 MB/s
BenchmarkEncodeToString-6 567187 2700 ns/op 3034.48 MB/s
BenchmarkEncodeToString-6 592618 2601 ns/op 3150.00 MB/s
BenchmarkEncodeToString-6 596892 2574 ns/op 3182.74 MB/s
BenchmarkEncodeToString-6 593444 2496 ns/op 3282.64 MB/s
BenchmarkEncodeToString-6 595665 2563 ns/op 3195.92 MB/s
BenchmarkEncodeToString-6 594034 2582 ns/op 3173.25 MB/s
BenchmarkEncodeToString-6 584518 2651 ns/op 3089.97 MB/s
string to []byte case:
// BenchmarkDecode measures StdEncoding.Decode on the encoding of 8192 zero
// bytes. The encoded text is converted to []byte once, before the timer is
// reset, so the string-to-[]byte conversion is not part of the measurement.
func BenchmarkDecode(b *testing.B) {
	encoded := StdEncoding.EncodeToString(make([]byte, 8192))
	src := []byte(encoded)
	dst := make([]byte, StdEncoding.DecodedLen(len(src)))
	b.SetBytes(int64(len(src)))
	b.ResetTimer()
	for n := b.N; n > 0; n-- {
		StdEncoding.Decode(dst, src)
	}
}
The performance is:
goos: windows
goarch: amd64
pkg: github.com/emmansun/base64
cpu: Intel(R) Core(TM) i5-9500 CPU @ 3.00GHz
BenchmarkDecode
BenchmarkDecode-6
1903186 645.4 ns/op 16925.07 MB/s 0 B/op 0 allocs/op
func BenchmarkDecode(b *testing.B) {
data := StdEncoding.EncodeToString(make([]byte, 8192)))
dbuf := make([]byte, StdEncoding.DecodedLen(len(data)))
b.SetBytes(int64(len(data)))
b.ResetTimer()
for i := 0; i < b.N; i++ {
StdEncoding.Decode(dbuf, []byte(data))
}
}
The performance is:
goos: windows
goarch: amd64
pkg: github.com/emmansun/base64
cpu: Intel(R) Core(TM) i5-9500 CPU @ 3.00GHz
BenchmarkDecode
BenchmarkDecode-6
358167 2975 ns/op 3671.33 MB/s 12288 B/op 1 allocs/op
After optimizing the []byte-to-string and string-to-[]byte conversions, the performance is:
goos: windows
goarch: amd64
pkg: github.com/emmansun/base64
cpu: Intel(R) Core(TM) i5-9500 CPU @ 3.00GHz
BenchmarkEncode-6 2404890 498.6 ns/op 16428.37 MB/s
BenchmarkEncode-6 2429791 499.5 ns/op 16401.74 MB/s
BenchmarkEncode-6 2399318 497.7 ns/op 16459.59 MB/s
BenchmarkEncode-6 2404642 490.6 ns/op 16697.28 MB/s
BenchmarkEncode-6 2453055 500.5 ns/op 16367.42 MB/s
BenchmarkEncode-6 2424813 492.8 ns/op 16623.91 MB/s
BenchmarkEncode-6 2444425 490.6 ns/op 16696.35 MB/s
BenchmarkEncode-6 2445487 494.6 ns/op 16561.92 MB/s
BenchmarkEncode-6 2448139 495.5 ns/op 16534.40 MB/s
BenchmarkEncode-6 2411620 490.3 ns/op 16708.81 MB/s
BenchmarkDecode-6 1902096 628.0 ns/op 17395.13 MB/s
BenchmarkDecode-6 1834161 644.2 ns/op 16957.14 MB/s
BenchmarkDecode-6 1894376 638.4 ns/op 17111.31 MB/s
BenchmarkDecode-6 1897893 635.1 ns/op 17200.43 MB/s
BenchmarkDecode-6 1861186 642.2 ns/op 17011.18 MB/s
BenchmarkDecode-6 1907582 646.2 ns/op 16904.31 MB/s
BenchmarkDecode-6 1838038 634.4 ns/op 17219.81 MB/s
BenchmarkDecode-6 1894174 644.6 ns/op 16946.33 MB/s
BenchmarkDecode-6 1934805 639.3 ns/op 17087.41 MB/s
BenchmarkDecode-6 1919010 677.0 ns/op 16135.80 MB/s
BenchmarkEncodeToString-6 475642 2547 ns/op 3216.37 MB/s
BenchmarkEncodeToString-6 394857 2705 ns/op 3027.95 MB/s
BenchmarkEncodeToString-6 437206 2680 ns/op 3057.03 MB/s
BenchmarkEncodeToString-6 641666 2604 ns/op 3146.04 MB/s
BenchmarkEncodeToString-6 467473 2685 ns/op 3050.47 MB/s
BenchmarkEncodeToString-6 580461 2685 ns/op 3050.71 MB/s
BenchmarkEncodeToString-6 557205 2555 ns/op 3205.69 MB/s
BenchmarkEncodeToString-6 572431 2702 ns/op 3031.94 MB/s
BenchmarkEncodeToString-6 596762 2605 ns/op 3145.11 MB/s
BenchmarkEncodeToString-6 525586 2685 ns/op 3050.97 MB/s
BenchmarkDecodeString/2-6 41973010 27.84 ns/op 143.70 MB/s
BenchmarkDecodeString/2-6 59267454 27.79 ns/op 143.94 MB/s
BenchmarkDecodeString/2-6 39362974 27.96 ns/op 143.08 MB/s
BenchmarkDecodeString/2-6 43633254 27.65 ns/op 144.65 MB/s
BenchmarkDecodeString/2-6 51420931 28.03 ns/op 142.70 MB/s
BenchmarkDecodeString/2-6 48159697 27.76 ns/op 144.10 MB/s
BenchmarkDecodeString/2-6 59073433 28.23 ns/op 141.70 MB/s
BenchmarkDecodeString/2-6 39309334 27.70 ns/op 144.39 MB/s
BenchmarkDecodeString/2-6 38879744 27.87 ns/op 143.54 MB/s
BenchmarkDecodeString/2-6 45519030 27.80 ns/op 143.90 MB/s
BenchmarkDecodeString/4-6 38823140 32.33 ns/op 247.46 MB/s
BenchmarkDecodeString/4-6 39018553 32.70 ns/op 244.64 MB/s
BenchmarkDecodeString/4-6 38788881 32.35 ns/op 247.29 MB/s
BenchmarkDecodeString/4-6 38905333 32.06 ns/op 249.50 MB/s
BenchmarkDecodeString/4-6 32061386 32.29 ns/op 247.79 MB/s
BenchmarkDecodeString/4-6 39306373 32.31 ns/op 247.60 MB/s
BenchmarkDecodeString/4-6 37433871 32.27 ns/op 247.92 MB/s
BenchmarkDecodeString/4-6 37938576 32.17 ns/op 248.66 MB/s
BenchmarkDecodeString/4-6 39209150 31.84 ns/op 251.22 MB/s
BenchmarkDecodeString/4-6 39482125 32.16 ns/op 248.73 MB/s
BenchmarkDecodeString/8-6 29318920 39.44 ns/op 304.28 MB/s
BenchmarkDecodeString/8-6 30925682 39.36 ns/op 304.84 MB/s
BenchmarkDecodeString/8-6 34237389 39.58 ns/op 303.21 MB/s
BenchmarkDecodeString/8-6 31749223 38.97 ns/op 307.92 MB/s
BenchmarkDecodeString/8-6 29708264 39.44 ns/op 304.29 MB/s
BenchmarkDecodeString/8-6 29762199 39.47 ns/op 304.01 MB/s
BenchmarkDecodeString/8-6 39575746 39.46 ns/op 304.11 MB/s
BenchmarkDecodeString/8-6 29032491 40.03 ns/op 299.80 MB/s
BenchmarkDecodeString/8-6 29604630 38.56 ns/op 311.24 MB/s
BenchmarkDecodeString/8-6 29326730 40.38 ns/op 297.17 MB/s
BenchmarkDecodeString/64-6 18011283 67.68 ns/op 1300.14 MB/s
BenchmarkDecodeString/64-6 18115723 69.04 ns/op 1274.68 MB/s
BenchmarkDecodeString/64-6 19608483 67.40 ns/op 1305.58 MB/s
BenchmarkDecodeString/64-6 17424132 67.96 ns/op 1294.83 MB/s
BenchmarkDecodeString/64-6 17977824 68.02 ns/op 1293.76 MB/s
BenchmarkDecodeString/64-6 16754557 67.67 ns/op 1300.42 MB/s
BenchmarkDecodeString/64-6 15259099 67.71 ns/op 1299.69 MB/s
BenchmarkDecodeString/64-6 18463298 67.26 ns/op 1308.41 MB/s
BenchmarkDecodeString/64-6 17852200 66.17 ns/op 1329.91 MB/s
BenchmarkDecodeString/64-6 17805157 67.93 ns/op 1295.47 MB/s
BenchmarkDecodeString/8192-6 565189 2203 ns/op 4957.69 MB/s
BenchmarkDecodeString/8192-6 758221 2180 ns/op 5010.81 MB/s
BenchmarkDecodeString/8192-6 619866 2232 ns/op 4893.70 MB/s
BenchmarkDecodeString/8192-6 474676 2209 ns/op 4944.55 MB/s
BenchmarkDecodeString/8192-6 525614 2118 ns/op 5158.24 MB/s
BenchmarkDecodeString/8192-6 558560 2112 ns/op 5172.22 MB/s
BenchmarkDecodeString/8192-6 591589 2156 ns/op 5066.09 MB/s
BenchmarkDecodeString/8192-6 636424 2151 ns/op 5078.55 MB/s
BenchmarkDecodeString/8192-6 567975 2247 ns/op 4862.39 MB/s
BenchmarkDecodeString/8192-6 1000000 2193 ns/op 4981.74 MB/s
PASS
ok github.com/emmansun/base64 121.747s
The main bottleneck is []byte creation now.
avx vs avx2.
goos: windows
goarch: amd64
pkg: github.com/emmansun/base64
cpu: Intel(R) Core(TM) i5-9500 CPU @ 3.00GHz
│ avx.txt │ avx2.txt │
│ sec/op │ sec/op vs base │
Encode-6 1002.5n ± 1% 495.1n ± 1% -50.62% (p=0.000 n=10)
Decode-6 1546.5n ± 4% 640.7n ± 1% -58.57% (p=0.000 n=10)
EncodeToString-6 3.201µ ± 2% 2.683µ ± 5% -16.18% (p=0.000 n=10)
DecodeString/2-6 27.95n ± 1% 27.82n ± 1% ~ (p=0.289 n=10)
DecodeString/4-6 32.34n ± 1% 32.28n ± 1% ~ (p=0.494 n=10)
DecodeString/8-6 39.12n ± 0% 39.45n ± 1% +0.84% (p=0.034 n=10)
DecodeString/64-6 69.97n ± 2% 67.69n ± 1% -3.24% (p=0.000 n=10)
DecodeString/8192-6 3.227µ ± 2% 2.187µ ± 3% -32.23% (p=0.000 n=10)
geomean 281.7n 214.4n -23.89%
│ avx.txt │ avx2.txt │
│ B/s │ B/s vs base │
Encode-6 7.612Gi ± 1% 15.412Gi ± 1% +102.46% (p=0.000 n=10)
Decode-6 6.578Gi ± 3% 15.878Gi ± 1% +141.37% (p=0.000 n=10)
EncodeToString-6 2.384Gi ± 2% 2.844Gi ± 5% +19.31% (p=0.000 n=10)
DecodeString/2-6 136.5Mi ± 1% 137.1Mi ± 1% ~ (p=0.315 n=10)
DecodeString/4-6 235.9Mi ± 1% 236.4Mi ± 1% ~ (p=0.529 n=10)
DecodeString/8-6 292.6Mi ± 0% 290.1Mi ± 1% -0.84% (p=0.035 n=10)
DecodeString/64-6 1.171Gi ± 2% 1.211Gi ± 1% +3.35% (p=0.000 n=10)
DecodeString/8192-6 3.153Gi ± 2% 4.653Gi ± 3% +47.58% (p=0.000 n=10)
geomean 1.184Gi 1.556Gi +31.38%
golang sdk vs. purego:
goos: windows
goarch: amd64
pkg: github.com/emmansun/base64
cpu: Intel(R) Core(TM) i5-9500 CPU @ 3.00GHz
│ sdk.txt │ purego.txt │
│ sec/op │ sec/op vs base │
EncodeToString-6 11.774µ ± 2% 9.267µ ± 1% -21.29% (p=0.000 n=10)
DecodeString/2-6 31.80n ± 0% 28.63n ± 1% -9.97% (p=0.000 n=10)
DecodeString/4-6 35.00n ± 1% 32.89n ± 2% -6.02% (p=0.000 n=10)
DecodeString/8-6 41.85n ± 1% 39.97n ± 2% -4.48% (p=0.000 n=10)
DecodeString/64-6 154.7n ± 1% 112.0n ± 2% -27.61% (p=0.000 n=10)
DecodeString/8192-6 12.630µ ± 1% 9.836µ ± 1% -22.12% (p=0.000 n=10)
geomean 319.9n 269.6n -15.71%
│ sdk.txt │ purego.txt │
│ B/s │ B/s vs base │
EncodeToString-6 663.6Mi ± 2% 843.1Mi ± 1% +27.05% (p=0.000 n=10)
DecodeString/2-6 120.0Mi ± 0% 133.2Mi ± 1% +11.06% (p=0.000 n=10)
DecodeString/4-6 218.0Mi ± 1% 232.0Mi ± 2% +6.39% (p=0.000 n=10)
DecodeString/8-6 273.5Mi ± 1% 286.3Mi ± 2% +4.69% (p=0.000 n=10)
DecodeString/64-6 542.8Mi ± 1% 749.7Mi ± 2% +38.12% (p=0.000 n=10)
DecodeString/8192-6 824.8Mi ± 1% 1059.2Mi ± 1% +28.41% (p=0.000 n=10)
geomean 358.6Mi 425.4Mi +18.63%
The test
AVX2:
But if we test below:
The performance is as below:
The performance is as below:
After adding the slice make:
The performance becomes:
So, the main bottleneck is: